Model: fpadovani/hin-deva-100mb-after-ppt-shuff-dyck-100mb-ckpt500_seed3407 Source: Original Platform
90200 lines
2.4 MiB
90200 lines
2.4 MiB
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 4.322766570605188,
|
|
"eval_steps": 3000,
|
|
"global_step": 45000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 4.81198468208313,
|
|
"epoch": 0.0004803073967339097,
|
|
"grad_norm": 15.3125,
|
|
"learning_rate": 2e-06,
|
|
"loss": 14.3995,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 10855.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 4.828950214385986,
|
|
"epoch": 0.0009606147934678194,
|
|
"grad_norm": 16.0,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 14.4568,
|
|
"mean_token_accuracy": 6.361323175951838e-05,
|
|
"num_tokens": 24110.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 4.885565328598022,
|
|
"epoch": 0.001440922190201729,
|
|
"grad_norm": 18.375,
|
|
"learning_rate": 7e-06,
|
|
"loss": 14.1468,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 35984.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 5.113980484008789,
|
|
"epoch": 0.0019212295869356388,
|
|
"grad_norm": 25.5,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 13.5274,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 48152.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 7.0846137523651125,
|
|
"epoch": 0.0024015369836695487,
|
|
"grad_norm": 18.875,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 11.983,
|
|
"mean_token_accuracy": 5.9031875571236016e-05,
|
|
"num_tokens": 59810.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.311653995513916,
|
|
"epoch": 0.002881844380403458,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.8966,
|
|
"mean_token_accuracy": 0.0035814862465485932,
|
|
"num_tokens": 70852.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.698549842834472,
|
|
"epoch": 0.0033621517771373678,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 10.681,
|
|
"mean_token_accuracy": 0.012990868836641311,
|
|
"num_tokens": 83378.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.70135440826416,
|
|
"epoch": 0.0038424591738712775,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 10.3702,
|
|
"mean_token_accuracy": 0.015855902433395387,
|
|
"num_tokens": 95505.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.669420051574708,
|
|
"epoch": 0.004322766570605188,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 10.0399,
|
|
"mean_token_accuracy": 0.019150405284017326,
|
|
"num_tokens": 106812.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.626140022277832,
|
|
"epoch": 0.004803073967339097,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.8531,
|
|
"mean_token_accuracy": 0.030371082201600074,
|
|
"num_tokens": 118572.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.630718421936034,
|
|
"epoch": 0.005283381364073006,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.7085,
|
|
"mean_token_accuracy": 0.02918087989091873,
|
|
"num_tokens": 130051.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.632691478729248,
|
|
"epoch": 0.005763688760806916,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.6316,
|
|
"mean_token_accuracy": 0.033551334962248804,
|
|
"num_tokens": 141920.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.621756076812744,
|
|
"epoch": 0.006243996157540826,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.4968,
|
|
"mean_token_accuracy": 0.03377603869885206,
|
|
"num_tokens": 152706.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.59926996231079,
|
|
"epoch": 0.0067243035542747355,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4671,
|
|
"mean_token_accuracy": 0.030284658074378967,
|
|
"num_tokens": 165253.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.586241340637207,
|
|
"epoch": 0.007204610951008645,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.3528,
|
|
"mean_token_accuracy": 0.03066213186830282,
|
|
"num_tokens": 176708.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.572576808929444,
|
|
"epoch": 0.007684918347742555,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.3119,
|
|
"mean_token_accuracy": 0.02979854876175523,
|
|
"num_tokens": 188240.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.554954528808594,
|
|
"epoch": 0.008165225744476465,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.1145,
|
|
"mean_token_accuracy": 0.03125303704291582,
|
|
"num_tokens": 198355.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.53057928085327,
|
|
"epoch": 0.008645533141210375,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.0646,
|
|
"mean_token_accuracy": 0.02982727512717247,
|
|
"num_tokens": 209497.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.494773197174073,
|
|
"epoch": 0.009125840537944284,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.9936,
|
|
"mean_token_accuracy": 0.02780488096177578,
|
|
"num_tokens": 220859.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.448780918121338,
|
|
"epoch": 0.009606147934678195,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.9232,
|
|
"mean_token_accuracy": 0.030998879671096803,
|
|
"num_tokens": 231550.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.376792049407959,
|
|
"epoch": 0.010086455331412104,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.7452,
|
|
"mean_token_accuracy": 0.030790003202855586,
|
|
"num_tokens": 244210.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.282748031616212,
|
|
"epoch": 0.010566762728146013,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.6175,
|
|
"mean_token_accuracy": 0.040817446634173395,
|
|
"num_tokens": 255745.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 10.166150856018067,
|
|
"epoch": 0.011047070124879923,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.5074,
|
|
"mean_token_accuracy": 0.0365377115085721,
|
|
"num_tokens": 266180.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 10.028709888458252,
|
|
"epoch": 0.011527377521613832,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.3681,
|
|
"mean_token_accuracy": 0.03765994198620319,
|
|
"num_tokens": 277736.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.827960968017578,
|
|
"epoch": 0.012007684918347743,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.2429,
|
|
"mean_token_accuracy": 0.035723325610160825,
|
|
"num_tokens": 289069.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.59237585067749,
|
|
"epoch": 0.012487992315081652,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.0891,
|
|
"mean_token_accuracy": 0.04738196656107903,
|
|
"num_tokens": 300240.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.368733978271484,
|
|
"epoch": 0.012968299711815562,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.0332,
|
|
"mean_token_accuracy": 0.04018798861652613,
|
|
"num_tokens": 311698.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.110132884979247,
|
|
"epoch": 0.013448607108549471,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 7.9056,
|
|
"mean_token_accuracy": 0.0432288508862257,
|
|
"num_tokens": 322844.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 8.820003223419189,
|
|
"epoch": 0.013928914505283382,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 7.8235,
|
|
"mean_token_accuracy": 0.045638217404484746,
|
|
"num_tokens": 335092.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.585826587677001,
|
|
"epoch": 0.01440922190201729,
|
|
"grad_norm": 0.8359375,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.7332,
|
|
"mean_token_accuracy": 0.04667803719639778,
|
|
"num_tokens": 347033.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.385289859771728,
|
|
"epoch": 0.014889529298751201,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.6524,
|
|
"mean_token_accuracy": 0.05755673125386238,
|
|
"num_tokens": 358696.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.231111812591553,
|
|
"epoch": 0.01536983669548511,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.6369,
|
|
"mean_token_accuracy": 0.05747554413974285,
|
|
"num_tokens": 369390.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.13049030303955,
|
|
"epoch": 0.01585014409221902,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.573,
|
|
"mean_token_accuracy": 0.058345531672239305,
|
|
"num_tokens": 380540.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.037137985229492,
|
|
"epoch": 0.01633045148895293,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.5672,
|
|
"mean_token_accuracy": 0.05862935781478882,
|
|
"num_tokens": 391243.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 7.971378183364868,
|
|
"epoch": 0.01681075888568684,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.5403,
|
|
"mean_token_accuracy": 0.06493047513067722,
|
|
"num_tokens": 403336.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 7.996695470809937,
|
|
"epoch": 0.01729106628242075,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.4714,
|
|
"mean_token_accuracy": 0.06883232817053794,
|
|
"num_tokens": 413886.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 7.944087362289428,
|
|
"epoch": 0.01777137367915466,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.5072,
|
|
"mean_token_accuracy": 0.07003857865929604,
|
|
"num_tokens": 425277.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 7.903090763092041,
|
|
"epoch": 0.01825168107588857,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.5901,
|
|
"mean_token_accuracy": 0.07094852812588215,
|
|
"num_tokens": 436868.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.9524956226348875,
|
|
"epoch": 0.018731988472622477,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.3956,
|
|
"mean_token_accuracy": 0.0713607795536518,
|
|
"num_tokens": 448349.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 7.893163013458252,
|
|
"epoch": 0.01921229586935639,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.398,
|
|
"mean_token_accuracy": 0.07450502514839172,
|
|
"num_tokens": 459447.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.827638578414917,
|
|
"epoch": 0.0196926032660903,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.3545,
|
|
"mean_token_accuracy": 0.07836289256811142,
|
|
"num_tokens": 470734.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.920483875274658,
|
|
"epoch": 0.020172910662824207,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.3929,
|
|
"mean_token_accuracy": 0.07436848841607571,
|
|
"num_tokens": 482015.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.829608154296875,
|
|
"epoch": 0.020653218059558116,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.3388,
|
|
"mean_token_accuracy": 0.0812894694507122,
|
|
"num_tokens": 493339.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 7.832039451599121,
|
|
"epoch": 0.021133525456292025,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.2806,
|
|
"mean_token_accuracy": 0.08215347118675709,
|
|
"num_tokens": 504924.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.841120386123658,
|
|
"epoch": 0.021613832853025938,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.2586,
|
|
"mean_token_accuracy": 0.07783420942723751,
|
|
"num_tokens": 516603.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.667848110198975,
|
|
"epoch": 0.022094140249759846,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.1767,
|
|
"mean_token_accuracy": 0.0903685748577118,
|
|
"num_tokens": 528347.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.665532779693604,
|
|
"epoch": 0.022574447646493755,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.2657,
|
|
"mean_token_accuracy": 0.08881851136684418,
|
|
"num_tokens": 539328.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.787159252166748,
|
|
"epoch": 0.023054755043227664,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.2264,
|
|
"mean_token_accuracy": 0.09179538786411286,
|
|
"num_tokens": 549297.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.68054313659668,
|
|
"epoch": 0.023535062439961577,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.1925,
|
|
"mean_token_accuracy": 0.0870781309902668,
|
|
"num_tokens": 560306.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.722461795806884,
|
|
"epoch": 0.024015369836695485,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.2601,
|
|
"mean_token_accuracy": 0.08716249391436577,
|
|
"num_tokens": 571972.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.669500827789307,
|
|
"epoch": 0.024495677233429394,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.1479,
|
|
"mean_token_accuracy": 0.09271593019366264,
|
|
"num_tokens": 582962.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.6647216796875,
|
|
"epoch": 0.024975984630163303,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.1214,
|
|
"mean_token_accuracy": 0.09072922170162201,
|
|
"num_tokens": 597193.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.66283483505249,
|
|
"epoch": 0.025456292026897216,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.1819,
|
|
"mean_token_accuracy": 0.09304547160863877,
|
|
"num_tokens": 608982.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.661752843856812,
|
|
"epoch": 0.025936599423631124,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.2188,
|
|
"mean_token_accuracy": 0.08966975659132004,
|
|
"num_tokens": 619953.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.643835210800171,
|
|
"epoch": 0.026416906820365033,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.1751,
|
|
"mean_token_accuracy": 0.09371341913938522,
|
|
"num_tokens": 631039.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.632717418670654,
|
|
"epoch": 0.026897214217098942,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.1656,
|
|
"mean_token_accuracy": 0.09481634944677353,
|
|
"num_tokens": 642656.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.468483591079712,
|
|
"epoch": 0.027377521613832854,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.0285,
|
|
"mean_token_accuracy": 0.10727941244840622,
|
|
"num_tokens": 653748.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.516920471191407,
|
|
"epoch": 0.027857829010566763,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.0029,
|
|
"mean_token_accuracy": 0.09661566317081452,
|
|
"num_tokens": 665618.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.486124277114868,
|
|
"epoch": 0.028338136407300672,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.0287,
|
|
"mean_token_accuracy": 0.09913064762949944,
|
|
"num_tokens": 677329.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.49315767288208,
|
|
"epoch": 0.02881844380403458,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 6.9864,
|
|
"mean_token_accuracy": 0.1033214770257473,
|
|
"num_tokens": 688278.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.431641435623169,
|
|
"epoch": 0.029298751200768493,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.046,
|
|
"mean_token_accuracy": 0.10180941373109817,
|
|
"num_tokens": 700739.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.378959465026855,
|
|
"epoch": 0.029779058597502402,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 6.9858,
|
|
"mean_token_accuracy": 0.104751455783844,
|
|
"num_tokens": 712527.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.4179362773895265,
|
|
"epoch": 0.03025936599423631,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.0113,
|
|
"mean_token_accuracy": 0.09946026802062988,
|
|
"num_tokens": 724514.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.464642429351807,
|
|
"epoch": 0.03073967339097022,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 6.958,
|
|
"mean_token_accuracy": 0.10636739879846573,
|
|
"num_tokens": 735679.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.379268789291382,
|
|
"epoch": 0.03121998078770413,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 6.9502,
|
|
"mean_token_accuracy": 0.10707954466342925,
|
|
"num_tokens": 747896.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.4328147888183596,
|
|
"epoch": 0.03170028818443804,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.0008,
|
|
"mean_token_accuracy": 0.10451544597744941,
|
|
"num_tokens": 759081.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.373377466201783,
|
|
"epoch": 0.03218059558117195,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 6.9349,
|
|
"mean_token_accuracy": 0.10051383301615716,
|
|
"num_tokens": 770459.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.3182484149932865,
|
|
"epoch": 0.03266090297790586,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 6.9097,
|
|
"mean_token_accuracy": 0.10436427593231201,
|
|
"num_tokens": 783960.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.2723020076751705,
|
|
"epoch": 0.03314121037463977,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 6.9998,
|
|
"mean_token_accuracy": 0.1017355315387249,
|
|
"num_tokens": 795425.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.288401937484741,
|
|
"epoch": 0.03362151777137368,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 6.9466,
|
|
"mean_token_accuracy": 0.1032905712723732,
|
|
"num_tokens": 807536.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.429675006866455,
|
|
"epoch": 0.034101825168107586,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000177,
|
|
"loss": 6.9955,
|
|
"mean_token_accuracy": 0.09869879111647606,
|
|
"num_tokens": 818801.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.303883075714111,
|
|
"epoch": 0.0345821325648415,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 6.8664,
|
|
"mean_token_accuracy": 0.1042160525918007,
|
|
"num_tokens": 831497.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.275684547424317,
|
|
"epoch": 0.03506243996157541,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000182,
|
|
"loss": 6.8349,
|
|
"mean_token_accuracy": 0.10631057769060134,
|
|
"num_tokens": 842491.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.303065443038941,
|
|
"epoch": 0.03554274735830932,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 6.9059,
|
|
"mean_token_accuracy": 0.09917943850159645,
|
|
"num_tokens": 854560.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.275861215591431,
|
|
"epoch": 0.03602305475504323,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000187,
|
|
"loss": 6.8151,
|
|
"mean_token_accuracy": 0.11120132729411125,
|
|
"num_tokens": 866688.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.233143997192383,
|
|
"epoch": 0.03650336215177714,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 6.9205,
|
|
"mean_token_accuracy": 0.09971508085727691,
|
|
"num_tokens": 879484.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.290747499465942,
|
|
"epoch": 0.036983669548511046,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000192,
|
|
"loss": 6.9039,
|
|
"mean_token_accuracy": 0.10731675177812576,
|
|
"num_tokens": 890807.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.2609399318695065,
|
|
"epoch": 0.037463976945244955,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 6.854,
|
|
"mean_token_accuracy": 0.10835549905896187,
|
|
"num_tokens": 901759.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.174216985702515,
|
|
"epoch": 0.037944284341978864,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 6.7707,
|
|
"mean_token_accuracy": 0.1162538155913353,
|
|
"num_tokens": 912212.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.264402294158936,
|
|
"epoch": 0.03842459173871278,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 6.8764,
|
|
"mean_token_accuracy": 0.10775518119335174,
|
|
"num_tokens": 923947.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.194364166259765,
|
|
"epoch": 0.03890489913544669,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000202,
|
|
"loss": 6.8149,
|
|
"mean_token_accuracy": 0.1155998706817627,
|
|
"num_tokens": 935732.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.094007158279419,
|
|
"epoch": 0.0393852065321806,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 6.7534,
|
|
"mean_token_accuracy": 0.11219719424843788,
|
|
"num_tokens": 948261.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.198687505722046,
|
|
"epoch": 0.039865513928914506,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000207,
|
|
"loss": 6.8682,
|
|
"mean_token_accuracy": 0.11036199703812599,
|
|
"num_tokens": 959574.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.14764518737793,
|
|
"epoch": 0.040345821325648415,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 6.9302,
|
|
"mean_token_accuracy": 0.10567210242152214,
|
|
"num_tokens": 970329.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.284962558746338,
|
|
"epoch": 0.040826128722382324,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000212,
|
|
"loss": 6.7852,
|
|
"mean_token_accuracy": 0.11808342635631561,
|
|
"num_tokens": 982037.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 6.99963059425354,
|
|
"epoch": 0.04130643611911623,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.7507,
|
|
"mean_token_accuracy": 0.1121592566370964,
|
|
"num_tokens": 994612.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.1772722721099855,
|
|
"epoch": 0.04178674351585014,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.8563,
|
|
"mean_token_accuracy": 0.11890432462096215,
|
|
"num_tokens": 1005960.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.119032526016236,
|
|
"epoch": 0.04226705091258405,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 6.726,
|
|
"mean_token_accuracy": 0.11254842653870582,
|
|
"num_tokens": 1017618.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.120699787139893,
|
|
"epoch": 0.042747358309317966,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.7617,
|
|
"mean_token_accuracy": 0.11123086810112,
|
|
"num_tokens": 1029307.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.10453462600708,
|
|
"epoch": 0.043227665706051875,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.7794,
|
|
"mean_token_accuracy": 0.11213452070951462,
|
|
"num_tokens": 1042027.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.109935092926025,
|
|
"epoch": 0.043707973102785784,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.7726,
|
|
"mean_token_accuracy": 0.11005142331123352,
|
|
"num_tokens": 1053125.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.093224906921387,
|
|
"epoch": 0.04418828049951969,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.7646,
|
|
"mean_token_accuracy": 0.11863623559474945,
|
|
"num_tokens": 1064908.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.0393500328063965,
|
|
"epoch": 0.0446685878962536,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.6415,
|
|
"mean_token_accuracy": 0.12022090703248978,
|
|
"num_tokens": 1076328.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.159615230560303,
|
|
"epoch": 0.04514889529298751,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 6.8668,
|
|
"mean_token_accuracy": 0.10638144612312317,
|
|
"num_tokens": 1088469.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 6.9358738422393795,
|
|
"epoch": 0.04562920268972142,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.6608,
|
|
"mean_token_accuracy": 0.11796007007360458,
|
|
"num_tokens": 1099408.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 6.921041584014892,
|
|
"epoch": 0.04610951008645533,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 6.596,
|
|
"mean_token_accuracy": 0.12084084451198578,
|
|
"num_tokens": 1111101.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 6.980242967605591,
|
|
"epoch": 0.046589817483189244,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.6189,
|
|
"mean_token_accuracy": 0.11961494460701942,
|
|
"num_tokens": 1122877.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 6.998215103149414,
|
|
"epoch": 0.04707012487992315,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.7183,
|
|
"mean_token_accuracy": 0.1069619596004486,
|
|
"num_tokens": 1133956.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 6.955817556381225,
|
|
"epoch": 0.04755043227665706,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.6106,
|
|
"mean_token_accuracy": 0.12115221694111825,
|
|
"num_tokens": 1146101.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 6.991823005676269,
|
|
"epoch": 0.04803073967339097,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.704,
|
|
"mean_token_accuracy": 0.1240153320133686,
|
|
"num_tokens": 1157432.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 6.995119285583496,
|
|
"epoch": 0.04851104707012488,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000252,
|
|
"loss": 6.6931,
|
|
"mean_token_accuracy": 0.12121785953640937,
|
|
"num_tokens": 1167601.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 6.925166416168213,
|
|
"epoch": 0.04899135446685879,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 6.5948,
|
|
"mean_token_accuracy": 0.11933866590261459,
|
|
"num_tokens": 1178818.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.102405261993408,
|
|
"epoch": 0.0494716618635927,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000257,
|
|
"loss": 6.8296,
|
|
"mean_token_accuracy": 0.11879347264766693,
|
|
"num_tokens": 1189977.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 6.896050024032593,
|
|
"epoch": 0.049951969260326606,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 6.6543,
|
|
"mean_token_accuracy": 0.12233106046915054,
|
|
"num_tokens": 1201039.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.007365083694458,
|
|
"epoch": 0.05043227665706052,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000262,
|
|
"loss": 6.6791,
|
|
"mean_token_accuracy": 0.12215208187699318,
|
|
"num_tokens": 1212573.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.002063369750976,
|
|
"epoch": 0.05091258405379443,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 6.6208,
|
|
"mean_token_accuracy": 0.1271028608083725,
|
|
"num_tokens": 1223382.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 6.9438478469848635,
|
|
"epoch": 0.05139289145052834,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.6969,
|
|
"mean_token_accuracy": 0.12958464100956918,
|
|
"num_tokens": 1236501.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 6.931712675094604,
|
|
"epoch": 0.05187319884726225,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.687,
|
|
"mean_token_accuracy": 0.12256318107247352,
|
|
"num_tokens": 1246798.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 6.9002622127532955,
|
|
"epoch": 0.05235350624399616,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.6164,
|
|
"mean_token_accuracy": 0.12228193208575248,
|
|
"num_tokens": 1258182.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 6.873838090896607,
|
|
"epoch": 0.052833813640730067,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 6.5781,
|
|
"mean_token_accuracy": 0.11714496314525605,
|
|
"num_tokens": 1270273.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 6.869143629074097,
|
|
"epoch": 0.053314121037463975,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000277,
|
|
"loss": 6.6336,
|
|
"mean_token_accuracy": 0.11991709843277931,
|
|
"num_tokens": 1281136.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 6.914445209503174,
|
|
"epoch": 0.053794428434197884,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.6257,
|
|
"mean_token_accuracy": 0.12010404467582703,
|
|
"num_tokens": 1294488.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 6.732436418533325,
|
|
"epoch": 0.05427473583093179,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.5262,
|
|
"mean_token_accuracy": 0.12693093419075013,
|
|
"num_tokens": 1304113.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 6.927071809768677,
|
|
"epoch": 0.05475504322766571,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.5843,
|
|
"mean_token_accuracy": 0.12877818644046785,
|
|
"num_tokens": 1315417.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 6.783261919021607,
|
|
"epoch": 0.05523535062439962,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.5521,
|
|
"mean_token_accuracy": 0.1234595388174057,
|
|
"num_tokens": 1328084.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 6.8645414352417,
|
|
"epoch": 0.05571565802113353,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.6982,
|
|
"mean_token_accuracy": 0.1229254849255085,
|
|
"num_tokens": 1338696.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 6.887264966964722,
|
|
"epoch": 0.056195965417867436,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.6333,
|
|
"mean_token_accuracy": 0.12206205278635025,
|
|
"num_tokens": 1350240.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 6.901881551742553,
|
|
"epoch": 0.056676272814601344,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.5792,
|
|
"mean_token_accuracy": 0.12374859303236008,
|
|
"num_tokens": 1361720.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 6.646714115142823,
|
|
"epoch": 0.05715658021133525,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.5831,
|
|
"mean_token_accuracy": 0.12852583453059196,
|
|
"num_tokens": 1373286.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 6.89121675491333,
|
|
"epoch": 0.05763688760806916,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 6.5332,
|
|
"mean_token_accuracy": 0.12378557696938515,
|
|
"num_tokens": 1384274.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 6.707057476043701,
|
|
"epoch": 0.05811719500480307,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000302,
|
|
"loss": 6.5674,
|
|
"mean_token_accuracy": 0.1248041570186615,
|
|
"num_tokens": 1395355.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 6.787681436538696,
|
|
"epoch": 0.05859750240153699,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.5071,
|
|
"mean_token_accuracy": 0.1337241604924202,
|
|
"num_tokens": 1406664.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 6.907395648956299,
|
|
"epoch": 0.059077809798270896,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.6562,
|
|
"mean_token_accuracy": 0.12113718539476395,
|
|
"num_tokens": 1418450.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 6.8045419216156,
|
|
"epoch": 0.059558117195004805,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.5466,
|
|
"mean_token_accuracy": 0.12454390972852707,
|
|
"num_tokens": 1430048.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 6.808126592636109,
|
|
"epoch": 0.060038424591738714,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.5911,
|
|
"mean_token_accuracy": 0.12378140687942504,
|
|
"num_tokens": 1441820.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 6.753187370300293,
|
|
"epoch": 0.06051873198847262,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.445,
|
|
"mean_token_accuracy": 0.13010460510849953,
|
|
"num_tokens": 1453209.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 6.6527941703796385,
|
|
"epoch": 0.06099903938520653,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.4598,
|
|
"mean_token_accuracy": 0.12725651860237122,
|
|
"num_tokens": 1465423.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 6.711978006362915,
|
|
"epoch": 0.06147934678194044,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.4541,
|
|
"mean_token_accuracy": 0.13069155365228652,
|
|
"num_tokens": 1476575.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 6.659121417999268,
|
|
"epoch": 0.06195965417867435,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.4109,
|
|
"mean_token_accuracy": 0.12579366862773894,
|
|
"num_tokens": 1486932.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 6.691300868988037,
|
|
"epoch": 0.06243996157540826,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.4399,
|
|
"mean_token_accuracy": 0.12854820042848586,
|
|
"num_tokens": 1498494.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 6.7037928104400635,
|
|
"epoch": 0.06292026897214217,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.4936,
|
|
"mean_token_accuracy": 0.12374913021922111,
|
|
"num_tokens": 1509937.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 6.782931184768676,
|
|
"epoch": 0.06340057636887608,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.5147,
|
|
"mean_token_accuracy": 0.13380258977413179,
|
|
"num_tokens": 1519823.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 6.726450872421265,
|
|
"epoch": 0.06388088376560999,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.5528,
|
|
"mean_token_accuracy": 0.12575417309999465,
|
|
"num_tokens": 1529943.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 6.611954069137573,
|
|
"epoch": 0.0643611911623439,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.3767,
|
|
"mean_token_accuracy": 0.13369367122650147,
|
|
"num_tokens": 1540618.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 6.685780334472656,
|
|
"epoch": 0.06484149855907781,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.5048,
|
|
"mean_token_accuracy": 0.1227756217122078,
|
|
"num_tokens": 1553208.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 6.6764894962310795,
|
|
"epoch": 0.06532180595581172,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.4589,
|
|
"mean_token_accuracy": 0.1339925467967987,
|
|
"num_tokens": 1563975.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 6.717716455459595,
|
|
"epoch": 0.06580211335254563,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.5252,
|
|
"mean_token_accuracy": 0.12458744868636132,
|
|
"num_tokens": 1575998.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 6.6251349449157715,
|
|
"epoch": 0.06628242074927954,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.3994,
|
|
"mean_token_accuracy": 0.13568611592054367,
|
|
"num_tokens": 1586041.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 6.637330770492554,
|
|
"epoch": 0.06676272814601344,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.4796,
|
|
"mean_token_accuracy": 0.12872253656387328,
|
|
"num_tokens": 1597531.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.617096710205078,
|
|
"epoch": 0.06724303554274735,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.4549,
|
|
"mean_token_accuracy": 0.12859696000814438,
|
|
"num_tokens": 1609255.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 6.640483236312866,
|
|
"epoch": 0.06772334293948126,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.439,
|
|
"mean_token_accuracy": 0.13394341096282006,
|
|
"num_tokens": 1621098.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.601499080657959,
|
|
"epoch": 0.06820365033621517,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.3504,
|
|
"mean_token_accuracy": 0.14078185856342315,
|
|
"num_tokens": 1631941.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 6.551211166381836,
|
|
"epoch": 0.0686839577329491,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.3471,
|
|
"mean_token_accuracy": 0.13648251742124556,
|
|
"num_tokens": 1643117.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.5161905765533445,
|
|
"epoch": 0.069164265129683,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.3952,
|
|
"mean_token_accuracy": 0.13429828062653543,
|
|
"num_tokens": 1653595.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 6.614610481262207,
|
|
"epoch": 0.06964457252641691,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.4168,
|
|
"mean_token_accuracy": 0.13274685442447662,
|
|
"num_tokens": 1664495.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.5094832420349125,
|
|
"epoch": 0.07012487992315082,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.4047,
|
|
"mean_token_accuracy": 0.136563728004694,
|
|
"num_tokens": 1674923.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 6.602942371368409,
|
|
"epoch": 0.07060518731988473,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.3045,
|
|
"mean_token_accuracy": 0.13681301474571228,
|
|
"num_tokens": 1685904.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.596617603302002,
|
|
"epoch": 0.07108549471661864,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.5324,
|
|
"mean_token_accuracy": 0.12432878389954567,
|
|
"num_tokens": 1699133.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 6.504991292953491,
|
|
"epoch": 0.07156580211335255,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.342,
|
|
"mean_token_accuracy": 0.13271907046437265,
|
|
"num_tokens": 1711559.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.592547464370727,
|
|
"epoch": 0.07204610951008646,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.2575,
|
|
"mean_token_accuracy": 0.14460937380790712,
|
|
"num_tokens": 1722526.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.4313709259033205,
|
|
"epoch": 0.07252641690682037,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.3265,
|
|
"mean_token_accuracy": 0.1398925192654133,
|
|
"num_tokens": 1734261.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.5256377220153805,
|
|
"epoch": 0.07300672430355427,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.3105,
|
|
"mean_token_accuracy": 0.14366703033447265,
|
|
"num_tokens": 1745151.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.631883907318115,
|
|
"epoch": 0.07348703170028818,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.4547,
|
|
"mean_token_accuracy": 0.1341322012245655,
|
|
"num_tokens": 1755463.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.584089756011963,
|
|
"epoch": 0.07396733909702209,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.4178,
|
|
"mean_token_accuracy": 0.1315837398171425,
|
|
"num_tokens": 1767717.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.3859930515289305,
|
|
"epoch": 0.074447646493756,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.2619,
|
|
"mean_token_accuracy": 0.14160886630415917,
|
|
"num_tokens": 1779115.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.3998737812042235,
|
|
"epoch": 0.07492795389048991,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.213,
|
|
"mean_token_accuracy": 0.1398429863154888,
|
|
"num_tokens": 1789644.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.540688323974609,
|
|
"epoch": 0.07540826128722382,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.4251,
|
|
"mean_token_accuracy": 0.13578777611255646,
|
|
"num_tokens": 1800606.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.513448238372803,
|
|
"epoch": 0.07588856868395773,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.4264,
|
|
"mean_token_accuracy": 0.12942690253257752,
|
|
"num_tokens": 1812168.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.5457319736480715,
|
|
"epoch": 0.07636887608069164,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.3796,
|
|
"mean_token_accuracy": 0.1303087830543518,
|
|
"num_tokens": 1823830.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.495282316207886,
|
|
"epoch": 0.07684918347742556,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.3456,
|
|
"mean_token_accuracy": 0.13957973942160606,
|
|
"num_tokens": 1835611.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.467644214630127,
|
|
"epoch": 0.07732949087415947,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.4127,
|
|
"mean_token_accuracy": 0.1334280975162983,
|
|
"num_tokens": 1847036.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.464094591140747,
|
|
"epoch": 0.07780979827089338,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.3528,
|
|
"mean_token_accuracy": 0.13223012760281563,
|
|
"num_tokens": 1857476.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.50727949142456,
|
|
"epoch": 0.07829010566762729,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.3773,
|
|
"mean_token_accuracy": 0.1352442115545273,
|
|
"num_tokens": 1869073.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.384515810012817,
|
|
"epoch": 0.0787704130643612,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.2486,
|
|
"mean_token_accuracy": 0.14026699736714363,
|
|
"num_tokens": 1880439.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.561717510223389,
|
|
"epoch": 0.0792507204610951,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.4116,
|
|
"mean_token_accuracy": 0.134783523529768,
|
|
"num_tokens": 1891600.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.414502573013306,
|
|
"epoch": 0.07973102785782901,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.3783,
|
|
"mean_token_accuracy": 0.13531816452741624,
|
|
"num_tokens": 1903126.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.5730548858642575,
|
|
"epoch": 0.08021133525456292,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.3467,
|
|
"mean_token_accuracy": 0.14032403156161308,
|
|
"num_tokens": 1913913.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.344644355773926,
|
|
"epoch": 0.08069164265129683,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.2684,
|
|
"mean_token_accuracy": 0.1382530964910984,
|
|
"num_tokens": 1924961.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.523792457580567,
|
|
"epoch": 0.08117195004803074,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.3612,
|
|
"mean_token_accuracy": 0.12942377403378486,
|
|
"num_tokens": 1936773.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.355926513671875,
|
|
"epoch": 0.08165225744476465,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.2783,
|
|
"mean_token_accuracy": 0.13875910267233849,
|
|
"num_tokens": 1948190.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.331581449508667,
|
|
"epoch": 0.08213256484149856,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.2694,
|
|
"mean_token_accuracy": 0.14160780385136604,
|
|
"num_tokens": 1960038.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.557125091552734,
|
|
"epoch": 0.08261287223823247,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.3489,
|
|
"mean_token_accuracy": 0.14002878665924073,
|
|
"num_tokens": 1970535.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.411432456970215,
|
|
"epoch": 0.08309317963496637,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.3226,
|
|
"mean_token_accuracy": 0.13546231836080552,
|
|
"num_tokens": 1981386.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.337710332870484,
|
|
"epoch": 0.08357348703170028,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.2428,
|
|
"mean_token_accuracy": 0.1426716774702072,
|
|
"num_tokens": 1993196.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.432919025421143,
|
|
"epoch": 0.08405379442843419,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.2741,
|
|
"mean_token_accuracy": 0.14658503904938697,
|
|
"num_tokens": 2004756.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.315603113174438,
|
|
"epoch": 0.0845341018251681,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.2347,
|
|
"mean_token_accuracy": 0.14145326390862464,
|
|
"num_tokens": 2016020.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.380750274658203,
|
|
"epoch": 0.08501440922190202,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.2819,
|
|
"mean_token_accuracy": 0.14082487300038338,
|
|
"num_tokens": 2027747.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.4264098644256595,
|
|
"epoch": 0.08549471661863593,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.2553,
|
|
"mean_token_accuracy": 0.13818828240036965,
|
|
"num_tokens": 2038841.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.385887289047242,
|
|
"epoch": 0.08597502401536984,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.3043,
|
|
"mean_token_accuracy": 0.13402576446533204,
|
|
"num_tokens": 2049905.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.424469089508056,
|
|
"epoch": 0.08645533141210375,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.3803,
|
|
"mean_token_accuracy": 0.13485484719276428,
|
|
"num_tokens": 2062492.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.387258577346802,
|
|
"epoch": 0.08693563880883766,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.31,
|
|
"mean_token_accuracy": 0.1353304862976074,
|
|
"num_tokens": 2073840.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.3580629348754885,
|
|
"epoch": 0.08741594620557157,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.221,
|
|
"mean_token_accuracy": 0.14060378223657607,
|
|
"num_tokens": 2085720.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.353258228302002,
|
|
"epoch": 0.08789625360230548,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.3039,
|
|
"mean_token_accuracy": 0.1413162462413311,
|
|
"num_tokens": 2096649.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.436611890792847,
|
|
"epoch": 0.08837656099903939,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.3061,
|
|
"mean_token_accuracy": 0.14285610914230346,
|
|
"num_tokens": 2109030.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.35608320236206,
|
|
"epoch": 0.0888568683957733,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.2113,
|
|
"mean_token_accuracy": 0.14488047659397124,
|
|
"num_tokens": 2121384.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.269479846954345,
|
|
"epoch": 0.0893371757925072,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.1635,
|
|
"mean_token_accuracy": 0.147640460729599,
|
|
"num_tokens": 2131377.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.344134902954101,
|
|
"epoch": 0.08981748318924111,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.3531,
|
|
"mean_token_accuracy": 0.1383367098867893,
|
|
"num_tokens": 2142364.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.356987571716308,
|
|
"epoch": 0.09029779058597502,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.2296,
|
|
"mean_token_accuracy": 0.14149210676550866,
|
|
"num_tokens": 2153040.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.35843825340271,
|
|
"epoch": 0.09077809798270893,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.2728,
|
|
"mean_token_accuracy": 0.14314480721950532,
|
|
"num_tokens": 2165571.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.3020600318908695,
|
|
"epoch": 0.09125840537944284,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.2423,
|
|
"mean_token_accuracy": 0.14072795882821082,
|
|
"num_tokens": 2177241.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.329180097579956,
|
|
"epoch": 0.09173871277617675,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.2801,
|
|
"mean_token_accuracy": 0.1361616224050522,
|
|
"num_tokens": 2187475.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.315436792373657,
|
|
"epoch": 0.09221902017291066,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.3087,
|
|
"mean_token_accuracy": 0.14151085540652275,
|
|
"num_tokens": 2198185.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.303459358215332,
|
|
"epoch": 0.09269932756964457,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.2346,
|
|
"mean_token_accuracy": 0.14740882739424704,
|
|
"num_tokens": 2210404.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.370419549942016,
|
|
"epoch": 0.09317963496637849,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.2262,
|
|
"mean_token_accuracy": 0.144054813683033,
|
|
"num_tokens": 2222188.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.290718269348145,
|
|
"epoch": 0.0936599423631124,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.2775,
|
|
"mean_token_accuracy": 0.1421047918498516,
|
|
"num_tokens": 2233418.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.352431869506836,
|
|
"epoch": 0.0941402497598463,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.2415,
|
|
"mean_token_accuracy": 0.14807373881340027,
|
|
"num_tokens": 2245053.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.250268840789795,
|
|
"epoch": 0.09462055715658022,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.2715,
|
|
"mean_token_accuracy": 0.14363499581813813,
|
|
"num_tokens": 2256375.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.225133609771729,
|
|
"epoch": 0.09510086455331412,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.1142,
|
|
"mean_token_accuracy": 0.1477846160531044,
|
|
"num_tokens": 2267074.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.191523456573487,
|
|
"epoch": 0.09558117195004803,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.1547,
|
|
"mean_token_accuracy": 0.14838184416294098,
|
|
"num_tokens": 2277168.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.25091781616211,
|
|
"epoch": 0.09606147934678194,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.1381,
|
|
"mean_token_accuracy": 0.14807945489883423,
|
|
"num_tokens": 2288178.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 6.215264129638672,
|
|
"epoch": 0.09654178674351585,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999999983283737,
|
|
"loss": 6.1686,
|
|
"mean_token_accuracy": 0.1440332628786564,
|
|
"num_tokens": 2299765.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.3124645233154295,
|
|
"epoch": 0.09702209414024976,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999999915373924,
|
|
"loss": 6.2644,
|
|
"mean_token_accuracy": 0.13689299449324607,
|
|
"num_tokens": 2312047.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.30297064781189,
|
|
"epoch": 0.09750240153698367,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999999795225793,
|
|
"loss": 6.2563,
|
|
"mean_token_accuracy": 0.1363622300326824,
|
|
"num_tokens": 2324118.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 6.299112796783447,
|
|
"epoch": 0.09798270893371758,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004999999622839347,
|
|
"loss": 6.2494,
|
|
"mean_token_accuracy": 0.14326749965548516,
|
|
"num_tokens": 2335171.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.283253812789917,
|
|
"epoch": 0.09846301633045149,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999999398214593,
|
|
"loss": 6.1501,
|
|
"mean_token_accuracy": 0.14212532341480255,
|
|
"num_tokens": 2346338.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 6.212884902954102,
|
|
"epoch": 0.0989433237271854,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999999121351532,
|
|
"loss": 6.1934,
|
|
"mean_token_accuracy": 0.14963782876729964,
|
|
"num_tokens": 2357185.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.190281915664673,
|
|
"epoch": 0.0994236311239193,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999998792250173,
|
|
"loss": 6.1183,
|
|
"mean_token_accuracy": 0.15685753300786018,
|
|
"num_tokens": 2368494.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.289627552032471,
|
|
"epoch": 0.09990393852065321,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999998410910524,
|
|
"loss": 6.3364,
|
|
"mean_token_accuracy": 0.13329742476344109,
|
|
"num_tokens": 2380800.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 6.3118733882904055,
|
|
"epoch": 0.10038424591738712,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999997977332592,
|
|
"loss": 6.2551,
|
|
"mean_token_accuracy": 0.13934137374162675,
|
|
"num_tokens": 2391753.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.178606843948364,
|
|
"epoch": 0.10086455331412104,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999997491516389,
|
|
"loss": 6.1391,
|
|
"mean_token_accuracy": 0.1400229126214981,
|
|
"num_tokens": 2403324.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.235824918746948,
|
|
"epoch": 0.10134486071085495,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999996953461925,
|
|
"loss": 6.2482,
|
|
"mean_token_accuracy": 0.13423383459448815,
|
|
"num_tokens": 2414873.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.138184642791748,
|
|
"epoch": 0.10182516810758886,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999996363169212,
|
|
"loss": 6.0208,
|
|
"mean_token_accuracy": 0.15671658217906953,
|
|
"num_tokens": 2425308.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.144180011749268,
|
|
"epoch": 0.10230547550432277,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999995720638266,
|
|
"loss": 6.0654,
|
|
"mean_token_accuracy": 0.1525282308459282,
|
|
"num_tokens": 2436835.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.183439445495606,
|
|
"epoch": 0.10278578290105668,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00049999950258691,
|
|
"loss": 6.1921,
|
|
"mean_token_accuracy": 0.1451313279569149,
|
|
"num_tokens": 2446798.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 6.123720979690551,
|
|
"epoch": 0.10326609029779059,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999994278861731,
|
|
"loss": 6.0747,
|
|
"mean_token_accuracy": 0.15084402859210969,
|
|
"num_tokens": 2457308.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 6.215669107437134,
|
|
"epoch": 0.1037463976945245,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999993479616175,
|
|
"loss": 6.1309,
|
|
"mean_token_accuracy": 0.13830516785383223,
|
|
"num_tokens": 2468917.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.227848720550537,
|
|
"epoch": 0.1042267050912584,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999992628132451,
|
|
"loss": 6.1529,
|
|
"mean_token_accuracy": 0.14558819606900214,
|
|
"num_tokens": 2481363.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.175233983993531,
|
|
"epoch": 0.10470701248799232,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999991724410582,
|
|
"loss": 6.1551,
|
|
"mean_token_accuracy": 0.14347582682967186,
|
|
"num_tokens": 2493082.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 6.150361251831055,
|
|
"epoch": 0.10518731988472622,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999990768450583,
|
|
"loss": 6.106,
|
|
"mean_token_accuracy": 0.1499667778611183,
|
|
"num_tokens": 2503849.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.225272464752197,
|
|
"epoch": 0.10566762728146013,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999989760252482,
|
|
"loss": 6.1511,
|
|
"mean_token_accuracy": 0.14817013815045357,
|
|
"num_tokens": 2514528.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.097928714752197,
|
|
"epoch": 0.10614793467819404,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004999988699816299,
|
|
"loss": 6.1427,
|
|
"mean_token_accuracy": 0.14771459847688675,
|
|
"num_tokens": 2524971.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.153327941894531,
|
|
"epoch": 0.10662824207492795,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999987587142058,
|
|
"loss": 6.057,
|
|
"mean_token_accuracy": 0.14452041387557985,
|
|
"num_tokens": 2535674.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.2696786403656,
|
|
"epoch": 0.10710854947166186,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999986422229789,
|
|
"loss": 6.2903,
|
|
"mean_token_accuracy": 0.13996392711997033,
|
|
"num_tokens": 2547108.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 6.155757236480713,
|
|
"epoch": 0.10758885686839577,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999985205079514,
|
|
"loss": 6.1047,
|
|
"mean_token_accuracy": 0.1451355442404747,
|
|
"num_tokens": 2559474.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.012842035293579,
|
|
"epoch": 0.10806916426512968,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999983935691265,
|
|
"loss": 5.9441,
|
|
"mean_token_accuracy": 0.16244944632053376,
|
|
"num_tokens": 2571264.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.159362649917602,
|
|
"epoch": 0.10854947166186359,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499998261406507,
|
|
"loss": 6.1208,
|
|
"mean_token_accuracy": 0.1507526934146881,
|
|
"num_tokens": 2583731.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.268857860565186,
|
|
"epoch": 0.10902977905859751,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004999981240200958,
|
|
"loss": 6.1607,
|
|
"mean_token_accuracy": 0.14638862013816833,
|
|
"num_tokens": 2595497.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.053813219070435,
|
|
"epoch": 0.10951008645533142,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999979814098966,
|
|
"loss": 6.1148,
|
|
"mean_token_accuracy": 0.1516471363604069,
|
|
"num_tokens": 2607358.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.1449603080749515,
|
|
"epoch": 0.10999039385206533,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999978335759121,
|
|
"loss": 6.0354,
|
|
"mean_token_accuracy": 0.15392047837376593,
|
|
"num_tokens": 2618936.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.154958772659302,
|
|
"epoch": 0.11047070124879924,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999976805181461,
|
|
"loss": 6.1981,
|
|
"mean_token_accuracy": 0.14167412593960763,
|
|
"num_tokens": 2631840.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.140295743942261,
|
|
"epoch": 0.11095100864553314,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499997522236602,
|
|
"loss": 6.1443,
|
|
"mean_token_accuracy": 0.15361175835132598,
|
|
"num_tokens": 2642412.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.160842370986939,
|
|
"epoch": 0.11143131604226705,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999973587312837,
|
|
"loss": 6.1067,
|
|
"mean_token_accuracy": 0.14919153451919556,
|
|
"num_tokens": 2653890.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.146590614318848,
|
|
"epoch": 0.11191162343900096,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999971900021947,
|
|
"loss": 6.163,
|
|
"mean_token_accuracy": 0.15273661985993386,
|
|
"num_tokens": 2664888.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.159024953842163,
|
|
"epoch": 0.11239193083573487,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999970160493391,
|
|
"loss": 6.0579,
|
|
"mean_token_accuracy": 0.14569913148880004,
|
|
"num_tokens": 2675550.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.02392611503601,
|
|
"epoch": 0.11287223823246878,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999968368727209,
|
|
"loss": 6.0724,
|
|
"mean_token_accuracy": 0.15466973930597305,
|
|
"num_tokens": 2688022.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.1862691879272464,
|
|
"epoch": 0.11335254562920269,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999966524723442,
|
|
"loss": 6.0632,
|
|
"mean_token_accuracy": 0.14964798092842102,
|
|
"num_tokens": 2698737.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.077165365219116,
|
|
"epoch": 0.1138328530259366,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999964628482135,
|
|
"loss": 6.0344,
|
|
"mean_token_accuracy": 0.15742302685976028,
|
|
"num_tokens": 2709844.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.127112817764282,
|
|
"epoch": 0.1143131604226705,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999962680003328,
|
|
"loss": 6.1035,
|
|
"mean_token_accuracy": 0.1519095703959465,
|
|
"num_tokens": 2720273.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.1255943775177,
|
|
"epoch": 0.11479346781940442,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499996067928707,
|
|
"loss": 6.1124,
|
|
"mean_token_accuracy": 0.14679019302129745,
|
|
"num_tokens": 2731354.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.127178192138672,
|
|
"epoch": 0.11527377521613832,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999958626333406,
|
|
"loss": 6.1052,
|
|
"mean_token_accuracy": 0.1527300015091896,
|
|
"num_tokens": 2742966.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.03611798286438,
|
|
"epoch": 0.11575408261287223,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999956521142383,
|
|
"loss": 6.009,
|
|
"mean_token_accuracy": 0.1586822062730789,
|
|
"num_tokens": 2755010.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.0991308212280275,
|
|
"epoch": 0.11623439000960614,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999954363714051,
|
|
"loss": 6.0361,
|
|
"mean_token_accuracy": 0.14981242269277573,
|
|
"num_tokens": 2766176.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.185801792144775,
|
|
"epoch": 0.11671469740634005,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999952154048459,
|
|
"loss": 6.1829,
|
|
"mean_token_accuracy": 0.15044604614377022,
|
|
"num_tokens": 2777861.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.021704149246216,
|
|
"epoch": 0.11719500480307397,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499994989214566,
|
|
"loss": 5.9954,
|
|
"mean_token_accuracy": 0.1536705419421196,
|
|
"num_tokens": 2788725.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.0181561470031735,
|
|
"epoch": 0.11767531219980788,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999947578005705,
|
|
"loss": 6.0312,
|
|
"mean_token_accuracy": 0.15193646997213364,
|
|
"num_tokens": 2801613.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.218272686004639,
|
|
"epoch": 0.11815561959654179,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999945211628648,
|
|
"loss": 6.0986,
|
|
"mean_token_accuracy": 0.1493365317583084,
|
|
"num_tokens": 2812474.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 5.971197032928467,
|
|
"epoch": 0.1186359269932757,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999942793014544,
|
|
"loss": 6.0103,
|
|
"mean_token_accuracy": 0.15563429594039918,
|
|
"num_tokens": 2823178.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.045905733108521,
|
|
"epoch": 0.11911623439000961,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.000499994032216345,
|
|
"loss": 6.0211,
|
|
"mean_token_accuracy": 0.15064174830913543,
|
|
"num_tokens": 2836486.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.107371759414673,
|
|
"epoch": 0.11959654178674352,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999937799075422,
|
|
"loss": 6.0746,
|
|
"mean_token_accuracy": 0.1570821538567543,
|
|
"num_tokens": 2847902.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 5.903108596801758,
|
|
"epoch": 0.12007684918347743,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000499993522375052,
|
|
"loss": 5.9739,
|
|
"mean_token_accuracy": 0.15461545437574387,
|
|
"num_tokens": 2859991.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.248143100738526,
|
|
"epoch": 0.12055715658021134,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999932596188802,
|
|
"loss": 6.1545,
|
|
"mean_token_accuracy": 0.14593613222241403,
|
|
"num_tokens": 2870269.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.034249687194825,
|
|
"epoch": 0.12103746397694524,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999929916390331,
|
|
"loss": 6.0279,
|
|
"mean_token_accuracy": 0.14597706943750383,
|
|
"num_tokens": 2882191.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 5.966269588470459,
|
|
"epoch": 0.12151777137367915,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999927184355169,
|
|
"loss": 6.0372,
|
|
"mean_token_accuracy": 0.14836430177092552,
|
|
"num_tokens": 2892775.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.147925519943238,
|
|
"epoch": 0.12199807877041306,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999924400083377,
|
|
"loss": 6.0247,
|
|
"mean_token_accuracy": 0.15831544399261474,
|
|
"num_tokens": 2904750.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.081568050384521,
|
|
"epoch": 0.12247838616714697,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999921563575022,
|
|
"loss": 6.0988,
|
|
"mean_token_accuracy": 0.14920950308442116,
|
|
"num_tokens": 2916150.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.07696213722229,
|
|
"epoch": 0.12295869356388088,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999918674830169,
|
|
"loss": 6.0644,
|
|
"mean_token_accuracy": 0.1496642827987671,
|
|
"num_tokens": 2928452.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.035782670974731,
|
|
"epoch": 0.12343900096061479,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999915733848886,
|
|
"loss": 6.0442,
|
|
"mean_token_accuracy": 0.1454036220908165,
|
|
"num_tokens": 2940577.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.022758436203003,
|
|
"epoch": 0.1239193083573487,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499991274063124,
|
|
"loss": 6.0283,
|
|
"mean_token_accuracy": 0.15150520876049994,
|
|
"num_tokens": 2952302.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.0645428657531735,
|
|
"epoch": 0.12439961575408261,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999909695177301,
|
|
"loss": 6.0669,
|
|
"mean_token_accuracy": 0.15440516471862792,
|
|
"num_tokens": 2964611.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.0961566925048825,
|
|
"epoch": 0.12487992315081652,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499990659748714,
|
|
"loss": 6.05,
|
|
"mean_token_accuracy": 0.15006925463676452,
|
|
"num_tokens": 2975668.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.146146440505982,
|
|
"epoch": 0.12536023054755044,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999903447560828,
|
|
"loss": 6.1198,
|
|
"mean_token_accuracy": 0.14781473577022552,
|
|
"num_tokens": 2987303.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.117984342575073,
|
|
"epoch": 0.12584053794428435,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999900245398439,
|
|
"loss": 6.0166,
|
|
"mean_token_accuracy": 0.16036698669195176,
|
|
"num_tokens": 3000400.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.010946893692017,
|
|
"epoch": 0.12632084534101826,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999896991000047,
|
|
"loss": 5.9477,
|
|
"mean_token_accuracy": 0.1495976448059082,
|
|
"num_tokens": 3012336.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.054377698898316,
|
|
"epoch": 0.12680115273775217,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999893684365729,
|
|
"loss": 6.0047,
|
|
"mean_token_accuracy": 0.15137309059500695,
|
|
"num_tokens": 3023004.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.044629859924316,
|
|
"epoch": 0.12728146013448607,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999890325495559,
|
|
"loss": 6.0922,
|
|
"mean_token_accuracy": 0.147823116928339,
|
|
"num_tokens": 3035147.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.072157478332519,
|
|
"epoch": 0.12776176753121998,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999886914389617,
|
|
"loss": 5.9177,
|
|
"mean_token_accuracy": 0.1551705077290535,
|
|
"num_tokens": 3045611.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 5.916638660430908,
|
|
"epoch": 0.1282420749279539,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999883451047981,
|
|
"loss": 5.9296,
|
|
"mean_token_accuracy": 0.1561925306916237,
|
|
"num_tokens": 3056420.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 5.977782440185547,
|
|
"epoch": 0.1287223823246878,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999879935470733,
|
|
"loss": 5.9227,
|
|
"mean_token_accuracy": 0.15750788599252702,
|
|
"num_tokens": 3068770.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.05616979598999,
|
|
"epoch": 0.1292026897214217,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999876367657954,
|
|
"loss": 6.0521,
|
|
"mean_token_accuracy": 0.14580482840538025,
|
|
"num_tokens": 3080806.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.143747854232788,
|
|
"epoch": 0.12968299711815562,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999872747609725,
|
|
"loss": 6.0742,
|
|
"mean_token_accuracy": 0.1484417587518692,
|
|
"num_tokens": 3091769.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 5.9879156112670895,
|
|
"epoch": 0.13016330451488953,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999869075326132,
|
|
"loss": 5.9938,
|
|
"mean_token_accuracy": 0.15191702395677567,
|
|
"num_tokens": 3103121.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.010816240310669,
|
|
"epoch": 0.13064361191162344,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.000499986535080726,
|
|
"loss": 5.9724,
|
|
"mean_token_accuracy": 0.16233935654163362,
|
|
"num_tokens": 3115606.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.026129817962646,
|
|
"epoch": 0.13112391930835735,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999861574053196,
|
|
"loss": 5.8723,
|
|
"mean_token_accuracy": 0.16096271872520446,
|
|
"num_tokens": 3127961.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 5.87260947227478,
|
|
"epoch": 0.13160422670509125,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999857745064027,
|
|
"loss": 5.8905,
|
|
"mean_token_accuracy": 0.15895691215991975,
|
|
"num_tokens": 3138316.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 5.953699588775635,
|
|
"epoch": 0.13208453410182516,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.000499985386383984,
|
|
"loss": 5.8671,
|
|
"mean_token_accuracy": 0.15866711735725403,
|
|
"num_tokens": 3150818.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.006815195083618,
|
|
"epoch": 0.13256484149855907,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999849930380729,
|
|
"loss": 6.0195,
|
|
"mean_token_accuracy": 0.1508159779012203,
|
|
"num_tokens": 3162066.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 5.941660642623901,
|
|
"epoch": 0.13304514889529298,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999845944686781,
|
|
"loss": 5.9924,
|
|
"mean_token_accuracy": 0.1508888617157936,
|
|
"num_tokens": 3172209.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 5.954594707489013,
|
|
"epoch": 0.1335254562920269,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999841906758093,
|
|
"loss": 5.8218,
|
|
"mean_token_accuracy": 0.1675858825445175,
|
|
"num_tokens": 3183248.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 5.94215030670166,
|
|
"epoch": 0.1340057636887608,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999837816594757,
|
|
"loss": 5.9139,
|
|
"mean_token_accuracy": 0.15847276002168656,
|
|
"num_tokens": 3194748.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 5.930553770065307,
|
|
"epoch": 0.1344860710854947,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999833674196865,
|
|
"loss": 5.8849,
|
|
"mean_token_accuracy": 0.16950529664754868,
|
|
"num_tokens": 3205669.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 5.932918214797974,
|
|
"epoch": 0.13496637848222862,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999829479564518,
|
|
"loss": 5.9807,
|
|
"mean_token_accuracy": 0.14995542094111441,
|
|
"num_tokens": 3216035.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.064324188232422,
|
|
"epoch": 0.13544668587896252,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499982523269781,
|
|
"loss": 5.9647,
|
|
"mean_token_accuracy": 0.15931690335273743,
|
|
"num_tokens": 3227192.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 5.975619888305664,
|
|
"epoch": 0.13592699327569643,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999820933596842,
|
|
"loss": 5.9871,
|
|
"mean_token_accuracy": 0.15620121210813523,
|
|
"num_tokens": 3240237.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 5.962911701202392,
|
|
"epoch": 0.13640730067243034,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499981658226171,
|
|
"loss": 5.8734,
|
|
"mean_token_accuracy": 0.16469697579741477,
|
|
"num_tokens": 3251963.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 5.908741474151611,
|
|
"epoch": 0.13688760806916425,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499981217869252,
|
|
"loss": 5.9953,
|
|
"mean_token_accuracy": 0.15814436972141266,
|
|
"num_tokens": 3263101.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 5.985613679885864,
|
|
"epoch": 0.1373679154658982,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499980772288937,
|
|
"loss": 5.8679,
|
|
"mean_token_accuracy": 0.16649020761251448,
|
|
"num_tokens": 3275100.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 5.945235109329223,
|
|
"epoch": 0.1378482228626321,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004999803214852367,
|
|
"loss": 5.9638,
|
|
"mean_token_accuracy": 0.15565589517354966,
|
|
"num_tokens": 3287025.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.04934253692627,
|
|
"epoch": 0.138328530259366,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004999798654581613,
|
|
"loss": 5.9662,
|
|
"mean_token_accuracy": 0.15883919447660447,
|
|
"num_tokens": 3299867.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 5.918570852279663,
|
|
"epoch": 0.13880883765609991,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999794042077214,
|
|
"loss": 5.9038,
|
|
"mean_token_accuracy": 0.16191874593496322,
|
|
"num_tokens": 3311183.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 5.952925539016723,
|
|
"epoch": 0.13928914505283382,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999789377339279,
|
|
"loss": 5.9687,
|
|
"mean_token_accuracy": 0.15641413480043412,
|
|
"num_tokens": 3322247.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 5.962415742874145,
|
|
"epoch": 0.13976945244956773,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999784660367915,
|
|
"loss": 5.8826,
|
|
"mean_token_accuracy": 0.1588966131210327,
|
|
"num_tokens": 3333369.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 5.904612874984741,
|
|
"epoch": 0.14024975984630164,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999779891163231,
|
|
"loss": 5.9113,
|
|
"mean_token_accuracy": 0.16011089235544204,
|
|
"num_tokens": 3345876.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 5.91278772354126,
|
|
"epoch": 0.14073006724303555,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999775069725339,
|
|
"loss": 5.8124,
|
|
"mean_token_accuracy": 0.1629629462957382,
|
|
"num_tokens": 3357323.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 5.912459039688111,
|
|
"epoch": 0.14121037463976946,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499977019605435,
|
|
"loss": 5.897,
|
|
"mean_token_accuracy": 0.15947655588388443,
|
|
"num_tokens": 3367689.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 5.844752836227417,
|
|
"epoch": 0.14169068203650337,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999765270150378,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.15955205261707306,
|
|
"num_tokens": 3379472.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 5.996302938461303,
|
|
"epoch": 0.14217098943323728,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999760292013536,
|
|
"loss": 5.8922,
|
|
"mean_token_accuracy": 0.15859662368893623,
|
|
"num_tokens": 3390929.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 5.99014687538147,
|
|
"epoch": 0.14265129682997119,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999755261643941,
|
|
"loss": 5.8976,
|
|
"mean_token_accuracy": 0.16287715286016463,
|
|
"num_tokens": 3401242.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 5.869934892654419,
|
|
"epoch": 0.1431316042267051,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999750179041709,
|
|
"loss": 5.8878,
|
|
"mean_token_accuracy": 0.16124220937490463,
|
|
"num_tokens": 3411169.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 5.874157810211182,
|
|
"epoch": 0.143611911623439,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999745044206959,
|
|
"loss": 5.7279,
|
|
"mean_token_accuracy": 0.16647156924009324,
|
|
"num_tokens": 3423265.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 5.832660913467407,
|
|
"epoch": 0.1440922190201729,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999739857139809,
|
|
"loss": 5.8347,
|
|
"mean_token_accuracy": 0.16908216327428818,
|
|
"num_tokens": 3434793.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 5.757522106170654,
|
|
"epoch": 0.14457252641690682,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499973461784038,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.17928926199674605,
|
|
"num_tokens": 3445732.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 5.942258501052857,
|
|
"epoch": 0.14505283381364073,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999729326308792,
|
|
"loss": 5.9516,
|
|
"mean_token_accuracy": 0.15832037180662156,
|
|
"num_tokens": 3457090.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 5.99946174621582,
|
|
"epoch": 0.14553314121037464,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499972398254517,
|
|
"loss": 5.9388,
|
|
"mean_token_accuracy": 0.15340567082166673,
|
|
"num_tokens": 3468087.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 5.941799163818359,
|
|
"epoch": 0.14601344860710855,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499971858654964,
|
|
"loss": 5.8778,
|
|
"mean_token_accuracy": 0.1609287366271019,
|
|
"num_tokens": 3478820.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 5.859274196624756,
|
|
"epoch": 0.14649375600384246,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999713138322321,
|
|
"loss": 5.9021,
|
|
"mean_token_accuracy": 0.15754427909851074,
|
|
"num_tokens": 3489878.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 5.942076396942139,
|
|
"epoch": 0.14697406340057637,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999707637863346,
|
|
"loss": 5.8905,
|
|
"mean_token_accuracy": 0.1585473045706749,
|
|
"num_tokens": 3500944.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 5.8406360149383545,
|
|
"epoch": 0.14745437079731027,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999702085172838,
|
|
"loss": 5.8719,
|
|
"mean_token_accuracy": 0.16607238352298737,
|
|
"num_tokens": 3511383.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 5.969763612747192,
|
|
"epoch": 0.14793467819404418,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999696480250929,
|
|
"loss": 5.963,
|
|
"mean_token_accuracy": 0.15430965945124625,
|
|
"num_tokens": 3523300.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 5.970634698867798,
|
|
"epoch": 0.1484149855907781,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999690823097747,
|
|
"loss": 5.8799,
|
|
"mean_token_accuracy": 0.1521039791405201,
|
|
"num_tokens": 3534371.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 5.841155576705932,
|
|
"epoch": 0.148895292987512,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999685113713426,
|
|
"loss": 5.8552,
|
|
"mean_token_accuracy": 0.16120514869689942,
|
|
"num_tokens": 3544847.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 5.92685284614563,
|
|
"epoch": 0.1493756003842459,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999679352098096,
|
|
"loss": 5.8223,
|
|
"mean_token_accuracy": 0.16645588725805283,
|
|
"num_tokens": 3555859.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 5.8343531608581545,
|
|
"epoch": 0.14985590778097982,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999673538251891,
|
|
"loss": 5.8389,
|
|
"mean_token_accuracy": 0.15894080251455306,
|
|
"num_tokens": 3568283.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 5.834793663024902,
|
|
"epoch": 0.15033621517771373,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999667672174947,
|
|
"loss": 5.917,
|
|
"mean_token_accuracy": 0.1583700641989708,
|
|
"num_tokens": 3581442.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.0175745487213135,
|
|
"epoch": 0.15081652257444764,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00049996617538674,
|
|
"loss": 5.9571,
|
|
"mean_token_accuracy": 0.15496992468833923,
|
|
"num_tokens": 3594055.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 5.962413930892945,
|
|
"epoch": 0.15129682997118155,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999655783329386,
|
|
"loss": 5.9187,
|
|
"mean_token_accuracy": 0.15283605754375457,
|
|
"num_tokens": 3605952.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 5.910793209075928,
|
|
"epoch": 0.15177713736791545,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999649760561046,
|
|
"loss": 5.9577,
|
|
"mean_token_accuracy": 0.158383572101593,
|
|
"num_tokens": 3618544.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 5.908201408386231,
|
|
"epoch": 0.15225744476464936,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999643685562519,
|
|
"loss": 5.8929,
|
|
"mean_token_accuracy": 0.16440413743257523,
|
|
"num_tokens": 3630445.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 5.935053777694702,
|
|
"epoch": 0.15273775216138327,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999637558333945,
|
|
"loss": 5.8797,
|
|
"mean_token_accuracy": 0.16155748218297958,
|
|
"num_tokens": 3642516.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 5.843541431427002,
|
|
"epoch": 0.15321805955811718,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999631378875467,
|
|
"loss": 5.8175,
|
|
"mean_token_accuracy": 0.16581382006406784,
|
|
"num_tokens": 3654425.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 5.805763053894043,
|
|
"epoch": 0.15369836695485112,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999625147187228,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.16464165300130845,
|
|
"num_tokens": 3666521.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.019205856323242,
|
|
"epoch": 0.15417867435158503,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999618863269373,
|
|
"loss": 5.8806,
|
|
"mean_token_accuracy": 0.15575164407491685,
|
|
"num_tokens": 3679121.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 5.91282377243042,
|
|
"epoch": 0.15465898174831894,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999612527122049,
|
|
"loss": 5.8941,
|
|
"mean_token_accuracy": 0.15461272597312928,
|
|
"num_tokens": 3691095.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 5.826972103118896,
|
|
"epoch": 0.15513928914505284,
|
|
"grad_norm": 0.87109375,
|
|
"learning_rate": 0.0004999606138745402,
|
|
"loss": 5.8562,
|
|
"mean_token_accuracy": 0.16407538801431656,
|
|
"num_tokens": 3703426.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 5.967412042617798,
|
|
"epoch": 0.15561959654178675,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999599698139581,
|
|
"loss": 5.9309,
|
|
"mean_token_accuracy": 0.1637990355491638,
|
|
"num_tokens": 3715429.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 5.932253503799439,
|
|
"epoch": 0.15609990393852066,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999593205304734,
|
|
"loss": 5.909,
|
|
"mean_token_accuracy": 0.15584128946065903,
|
|
"num_tokens": 3726327.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 5.9037374496459964,
|
|
"epoch": 0.15658021133525457,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999586660241012,
|
|
"loss": 5.8582,
|
|
"mean_token_accuracy": 0.1553866222500801,
|
|
"num_tokens": 3736818.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 5.929326868057251,
|
|
"epoch": 0.15706051873198848,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999580062948569,
|
|
"loss": 5.8583,
|
|
"mean_token_accuracy": 0.16254822611808778,
|
|
"num_tokens": 3747776.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 5.7625970363616945,
|
|
"epoch": 0.1575408261287224,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999573413427556,
|
|
"loss": 5.7301,
|
|
"mean_token_accuracy": 0.164338056743145,
|
|
"num_tokens": 3758990.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 5.8398857593536375,
|
|
"epoch": 0.1580211335254563,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999566711678128,
|
|
"loss": 5.7961,
|
|
"mean_token_accuracy": 0.1605479434132576,
|
|
"num_tokens": 3769686.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 5.867894649505615,
|
|
"epoch": 0.1585014409221902,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999559957700442,
|
|
"loss": 5.8554,
|
|
"mean_token_accuracy": 0.16354380249977113,
|
|
"num_tokens": 3781815.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 5.88207426071167,
|
|
"epoch": 0.15898174831892412,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999553151494653,
|
|
"loss": 5.9139,
|
|
"mean_token_accuracy": 0.15942219495773316,
|
|
"num_tokens": 3793392.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 5.860579538345337,
|
|
"epoch": 0.15946205571565802,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999546293060919,
|
|
"loss": 5.8298,
|
|
"mean_token_accuracy": 0.16041782200336457,
|
|
"num_tokens": 3804974.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 5.799793004989624,
|
|
"epoch": 0.15994236311239193,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00049995393823994,
|
|
"loss": 5.7028,
|
|
"mean_token_accuracy": 0.17192372530698777,
|
|
"num_tokens": 3817166.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 5.849306297302246,
|
|
"epoch": 0.16042267050912584,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999532419510255,
|
|
"loss": 5.8307,
|
|
"mean_token_accuracy": 0.1580624461174011,
|
|
"num_tokens": 3828151.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 5.847281789779663,
|
|
"epoch": 0.16090297790585975,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499952540439365,
|
|
"loss": 5.8283,
|
|
"mean_token_accuracy": 0.16032543033361435,
|
|
"num_tokens": 3839439.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 5.906755828857422,
|
|
"epoch": 0.16138328530259366,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999518337049743,
|
|
"loss": 5.8813,
|
|
"mean_token_accuracy": 0.15963228195905685,
|
|
"num_tokens": 3851694.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 5.831542205810547,
|
|
"epoch": 0.16186359269932757,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.00049995112174787,
|
|
"loss": 5.8589,
|
|
"mean_token_accuracy": 0.15917099863290787,
|
|
"num_tokens": 3863593.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 5.811672306060791,
|
|
"epoch": 0.16234390009606148,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999504045680687,
|
|
"loss": 5.7935,
|
|
"mean_token_accuracy": 0.1701650395989418,
|
|
"num_tokens": 3874588.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 5.894420862197876,
|
|
"epoch": 0.1628242074927954,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999496821655869,
|
|
"loss": 5.8753,
|
|
"mean_token_accuracy": 0.16022350043058395,
|
|
"num_tokens": 3884662.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 5.956241655349731,
|
|
"epoch": 0.1633045148895293,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0004999489545404414,
|
|
"loss": 5.9739,
|
|
"mean_token_accuracy": 0.15092033073306083,
|
|
"num_tokens": 3896569.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 5.943658018112183,
|
|
"epoch": 0.1637848222862632,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0004999482216926493,
|
|
"loss": 5.8162,
|
|
"mean_token_accuracy": 0.1632000833749771,
|
|
"num_tokens": 3907691.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 5.843317651748658,
|
|
"epoch": 0.1642651296829971,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999474836222273,
|
|
"loss": 5.83,
|
|
"mean_token_accuracy": 0.1665841408073902,
|
|
"num_tokens": 3918794.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 5.834485340118408,
|
|
"epoch": 0.16474543707973102,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999467403291928,
|
|
"loss": 5.8301,
|
|
"mean_token_accuracy": 0.1692491739988327,
|
|
"num_tokens": 3929773.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 5.874946594238281,
|
|
"epoch": 0.16522574447646493,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999459918135628,
|
|
"loss": 5.8498,
|
|
"mean_token_accuracy": 0.16062923073768615,
|
|
"num_tokens": 3940264.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 5.791439247131348,
|
|
"epoch": 0.16570605187319884,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499945238075355,
|
|
"loss": 5.7456,
|
|
"mean_token_accuracy": 0.1693306788802147,
|
|
"num_tokens": 3951500.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 5.851829910278321,
|
|
"epoch": 0.16618635926993275,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999444791145865,
|
|
"loss": 5.8145,
|
|
"mean_token_accuracy": 0.16588351577520372,
|
|
"num_tokens": 3963580.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 5.804158353805542,
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999437149312754,
|
|
"loss": 5.7585,
|
|
"mean_token_accuracy": 0.17176578491926192,
|
|
"num_tokens": 3975994.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 5.836318635940552,
|
|
"epoch": 0.16714697406340057,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499942945525439,
|
|
"loss": 5.7658,
|
|
"mean_token_accuracy": 0.15896687656641006,
|
|
"num_tokens": 3987897.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 5.888211059570312,
|
|
"epoch": 0.16762728146013448,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999421708970954,
|
|
"loss": 5.93,
|
|
"mean_token_accuracy": 0.15537445321679116,
|
|
"num_tokens": 3999829.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 5.7658594131469725,
|
|
"epoch": 0.16810758885686838,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999413910462625,
|
|
"loss": 5.7591,
|
|
"mean_token_accuracy": 0.16620118021965027,
|
|
"num_tokens": 4010882.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 5.861884737014771,
|
|
"epoch": 0.1685878962536023,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999406059729586,
|
|
"loss": 5.7469,
|
|
"mean_token_accuracy": 0.17034892737865448,
|
|
"num_tokens": 4021423.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 5.888075494766236,
|
|
"epoch": 0.1690682036503362,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999398156772016,
|
|
"loss": 5.8931,
|
|
"mean_token_accuracy": 0.15374189764261245,
|
|
"num_tokens": 4033590.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 5.721970653533935,
|
|
"epoch": 0.16954851104707014,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00049993902015901,
|
|
"loss": 5.7562,
|
|
"mean_token_accuracy": 0.16655992865562438,
|
|
"num_tokens": 4043978.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 5.931190156936646,
|
|
"epoch": 0.17002881844380405,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999382194184023,
|
|
"loss": 5.8756,
|
|
"mean_token_accuracy": 0.16273052543401717,
|
|
"num_tokens": 4054513.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 5.857993745803833,
|
|
"epoch": 0.17050912584053796,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999374134553972,
|
|
"loss": 5.8367,
|
|
"mean_token_accuracy": 0.16276317089796066,
|
|
"num_tokens": 4066019.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 5.841061735153199,
|
|
"epoch": 0.17098943323727187,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999366022700131,
|
|
"loss": 5.7935,
|
|
"mean_token_accuracy": 0.1673088401556015,
|
|
"num_tokens": 4077688.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 5.860415935516357,
|
|
"epoch": 0.17146974063400577,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999357858622691,
|
|
"loss": 5.8573,
|
|
"mean_token_accuracy": 0.1664716601371765,
|
|
"num_tokens": 4089803.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 5.8289069652557375,
|
|
"epoch": 0.17195004803073968,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004999349642321842,
|
|
"loss": 5.8073,
|
|
"mean_token_accuracy": 0.16912547051906585,
|
|
"num_tokens": 4101969.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 5.799117517471314,
|
|
"epoch": 0.1724303554274736,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999341373797772,
|
|
"loss": 5.7955,
|
|
"mean_token_accuracy": 0.15957102179527283,
|
|
"num_tokens": 4113567.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 5.814974451065064,
|
|
"epoch": 0.1729106628242075,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999333053050675,
|
|
"loss": 5.7575,
|
|
"mean_token_accuracy": 0.1691056177020073,
|
|
"num_tokens": 4125191.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 5.827954626083374,
|
|
"epoch": 0.1733909702209414,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999324680080744,
|
|
"loss": 5.8004,
|
|
"mean_token_accuracy": 0.16687883883714677,
|
|
"num_tokens": 4135050.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 5.842863750457764,
|
|
"epoch": 0.17387127761767532,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999316254888172,
|
|
"loss": 5.8736,
|
|
"mean_token_accuracy": 0.1648238182067871,
|
|
"num_tokens": 4146874.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 5.857775688171387,
|
|
"epoch": 0.17435158501440923,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999307777473157,
|
|
"loss": 5.7974,
|
|
"mean_token_accuracy": 0.16151650995016098,
|
|
"num_tokens": 4158118.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 5.818978691101075,
|
|
"epoch": 0.17483189241114314,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999299247835893,
|
|
"loss": 5.7561,
|
|
"mean_token_accuracy": 0.17479462176561356,
|
|
"num_tokens": 4169035.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 5.738432455062866,
|
|
"epoch": 0.17531219980787704,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499929066597658,
|
|
"loss": 5.745,
|
|
"mean_token_accuracy": 0.17148349434137344,
|
|
"num_tokens": 4180314.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 5.883955717086792,
|
|
"epoch": 0.17579250720461095,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999282031895418,
|
|
"loss": 5.8239,
|
|
"mean_token_accuracy": 0.16614590883255004,
|
|
"num_tokens": 4192238.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 5.769097232818604,
|
|
"epoch": 0.17627281460134486,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999273345592604,
|
|
"loss": 5.756,
|
|
"mean_token_accuracy": 0.16652164459228516,
|
|
"num_tokens": 4203346.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 5.811061954498291,
|
|
"epoch": 0.17675312199807877,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999264607068343,
|
|
"loss": 5.8159,
|
|
"mean_token_accuracy": 0.17016567289829254,
|
|
"num_tokens": 4213763.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 5.781940555572509,
|
|
"epoch": 0.17723342939481268,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.0004999255816322837,
|
|
"loss": 5.7699,
|
|
"mean_token_accuracy": 0.16876950412988662,
|
|
"num_tokens": 4225553.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 5.857665061950684,
|
|
"epoch": 0.1777137367915466,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000499924697335629,
|
|
"loss": 5.702,
|
|
"mean_token_accuracy": 0.17350574135780333,
|
|
"num_tokens": 4236058.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 5.640166330337524,
|
|
"epoch": 0.1781940441882805,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999238078168906,
|
|
"loss": 5.7763,
|
|
"mean_token_accuracy": 0.17054813206195832,
|
|
"num_tokens": 4248299.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 5.8273721694946286,
|
|
"epoch": 0.1786743515850144,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999229130760894,
|
|
"loss": 5.7052,
|
|
"mean_token_accuracy": 0.17111807465553283,
|
|
"num_tokens": 4259704.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 5.691127586364746,
|
|
"epoch": 0.17915465898174832,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000499922013113246,
|
|
"loss": 5.587,
|
|
"mean_token_accuracy": 0.18398697525262833,
|
|
"num_tokens": 4270480.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 5.780127954483032,
|
|
"epoch": 0.17963496637848222,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999211079283814,
|
|
"loss": 5.8538,
|
|
"mean_token_accuracy": 0.16719998568296432,
|
|
"num_tokens": 4282104.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 5.849603605270386,
|
|
"epoch": 0.18011527377521613,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999201975215164,
|
|
"loss": 5.8172,
|
|
"mean_token_accuracy": 0.16666848957538605,
|
|
"num_tokens": 4294251.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 5.757232236862182,
|
|
"epoch": 0.18059558117195004,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999192818926725,
|
|
"loss": 5.7017,
|
|
"mean_token_accuracy": 0.16847867369651795,
|
|
"num_tokens": 4305569.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 5.859993028640747,
|
|
"epoch": 0.18107588856868395,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999183610418706,
|
|
"loss": 5.8283,
|
|
"mean_token_accuracy": 0.16413767859339715,
|
|
"num_tokens": 4317845.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 5.76594557762146,
|
|
"epoch": 0.18155619596541786,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004999174349691322,
|
|
"loss": 5.6959,
|
|
"mean_token_accuracy": 0.17179392874240876,
|
|
"num_tokens": 4329987.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 5.697657203674316,
|
|
"epoch": 0.18203650336215177,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 0.0004999165036744788,
|
|
"loss": 5.7257,
|
|
"mean_token_accuracy": 0.16847490072250365,
|
|
"num_tokens": 4341628.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 5.861244201660156,
|
|
"epoch": 0.18251681075888568,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999155671579322,
|
|
"loss": 5.7851,
|
|
"mean_token_accuracy": 0.1615397110581398,
|
|
"num_tokens": 4352379.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 5.6849024295806885,
|
|
"epoch": 0.1829971181556196,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499914625419514,
|
|
"loss": 5.7181,
|
|
"mean_token_accuracy": 0.171738800406456,
|
|
"num_tokens": 4364800.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 5.776795959472656,
|
|
"epoch": 0.1834774255523535,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999136784592459,
|
|
"loss": 5.7315,
|
|
"mean_token_accuracy": 0.16872817426919937,
|
|
"num_tokens": 4376048.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 5.730347061157227,
|
|
"epoch": 0.1839577329490874,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999127262771502,
|
|
"loss": 5.7297,
|
|
"mean_token_accuracy": 0.16825871765613556,
|
|
"num_tokens": 4388072.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 5.872533082962036,
|
|
"epoch": 0.1844380403458213,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999117688732487,
|
|
"loss": 5.8226,
|
|
"mean_token_accuracy": 0.16391085535287858,
|
|
"num_tokens": 4399843.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 5.713910245895386,
|
|
"epoch": 0.18491834774255522,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999108062475638,
|
|
"loss": 5.6757,
|
|
"mean_token_accuracy": 0.17384760677814484,
|
|
"num_tokens": 4411373.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 5.716005563735962,
|
|
"epoch": 0.18539865513928913,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499909838400118,
|
|
"loss": 5.6614,
|
|
"mean_token_accuracy": 0.173922398686409,
|
|
"num_tokens": 4421857.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 5.820113229751587,
|
|
"epoch": 0.18587896253602307,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999088653309334,
|
|
"loss": 5.7618,
|
|
"mean_token_accuracy": 0.1711716189980507,
|
|
"num_tokens": 4432728.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 5.708466053009033,
|
|
"epoch": 0.18635926993275698,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999078870400329,
|
|
"loss": 5.693,
|
|
"mean_token_accuracy": 0.17283684760332108,
|
|
"num_tokens": 4444683.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 5.8614743709564205,
|
|
"epoch": 0.18683957732949089,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999069035274391,
|
|
"loss": 5.8215,
|
|
"mean_token_accuracy": 0.16018551886081694,
|
|
"num_tokens": 4456961.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 5.694478511810303,
|
|
"epoch": 0.1873198847262248,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004999059147931747,
|
|
"loss": 5.665,
|
|
"mean_token_accuracy": 0.1762719616293907,
|
|
"num_tokens": 4468424.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 5.791493558883667,
|
|
"epoch": 0.1878001921229587,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999049208372629,
|
|
"loss": 5.8694,
|
|
"mean_token_accuracy": 0.15364666059613227,
|
|
"num_tokens": 4479813.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 5.952554082870483,
|
|
"epoch": 0.1882804995196926,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999039216597267,
|
|
"loss": 5.862,
|
|
"mean_token_accuracy": 0.16733278185129166,
|
|
"num_tokens": 4491172.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 5.706536293029785,
|
|
"epoch": 0.18876080691642652,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999029172605892,
|
|
"loss": 5.7439,
|
|
"mean_token_accuracy": 0.1704375624656677,
|
|
"num_tokens": 4503063.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 5.889812326431274,
|
|
"epoch": 0.18924111431316043,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004999019076398738,
|
|
"loss": 5.8177,
|
|
"mean_token_accuracy": 0.15313875377178193,
|
|
"num_tokens": 4514188.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 5.822384834289551,
|
|
"epoch": 0.18972142170989434,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000499900892797604,
|
|
"loss": 5.7258,
|
|
"mean_token_accuracy": 0.17310872822999954,
|
|
"num_tokens": 4525293.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 5.80044903755188,
|
|
"epoch": 0.19020172910662825,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998998727338031,
|
|
"loss": 5.8139,
|
|
"mean_token_accuracy": 0.1692732721567154,
|
|
"num_tokens": 4536589.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 5.689789342880249,
|
|
"epoch": 0.19068203650336216,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998988474484952,
|
|
"loss": 5.5648,
|
|
"mean_token_accuracy": 0.19031796902418135,
|
|
"num_tokens": 4547594.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 5.717133808135986,
|
|
"epoch": 0.19116234390009607,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004998978169417038,
|
|
"loss": 5.78,
|
|
"mean_token_accuracy": 0.1743384450674057,
|
|
"num_tokens": 4559850.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 5.791743421554566,
|
|
"epoch": 0.19164265129682997,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998967812134529,
|
|
"loss": 5.7138,
|
|
"mean_token_accuracy": 0.17110339552164078,
|
|
"num_tokens": 4570727.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 5.610540056228638,
|
|
"epoch": 0.19212295869356388,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998957402637664,
|
|
"loss": 5.6542,
|
|
"mean_token_accuracy": 0.17157155871391297,
|
|
"num_tokens": 4582248.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 5.801579093933105,
|
|
"epoch": 0.1926032660902978,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998946940926687,
|
|
"loss": 5.6973,
|
|
"mean_token_accuracy": 0.17121600955724717,
|
|
"num_tokens": 4592604.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 5.661766576766968,
|
|
"epoch": 0.1930835734870317,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499893642700184,
|
|
"loss": 5.7182,
|
|
"mean_token_accuracy": 0.17020188719034196,
|
|
"num_tokens": 4604398.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 5.790825366973877,
|
|
"epoch": 0.1935638808837656,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004998925860863368,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.1685462474822998,
|
|
"num_tokens": 4616434.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 5.820285224914551,
|
|
"epoch": 0.19404418828049952,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004998915242511516,
|
|
"loss": 5.7541,
|
|
"mean_token_accuracy": 0.17625110745429992,
|
|
"num_tokens": 4627577.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 5.7781401634216305,
|
|
"epoch": 0.19452449567723343,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998904571946528,
|
|
"loss": 5.817,
|
|
"mean_token_accuracy": 0.16743545606732368,
|
|
"num_tokens": 4639698.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 5.838766145706177,
|
|
"epoch": 0.19500480307396734,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998893849168655,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.16433341503143312,
|
|
"num_tokens": 4650643.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 5.762656116485596,
|
|
"epoch": 0.19548511047070125,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004998883074178144,
|
|
"loss": 5.7427,
|
|
"mean_token_accuracy": 0.16878412663936615,
|
|
"num_tokens": 4662897.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 5.818380117416382,
|
|
"epoch": 0.19596541786743515,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998872246975247,
|
|
"loss": 5.8217,
|
|
"mean_token_accuracy": 0.1706990644335747,
|
|
"num_tokens": 4673701.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 5.910197305679321,
|
|
"epoch": 0.19644572526416906,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998861367560213,
|
|
"loss": 5.7826,
|
|
"mean_token_accuracy": 0.16689348816871644,
|
|
"num_tokens": 4685873.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 5.714930677413941,
|
|
"epoch": 0.19692603266090297,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998850435933296,
|
|
"loss": 5.6724,
|
|
"mean_token_accuracy": 0.17364383190870286,
|
|
"num_tokens": 4697179.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 5.752671766281128,
|
|
"epoch": 0.19740634005763688,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998839452094749,
|
|
"loss": 5.7084,
|
|
"mean_token_accuracy": 0.17288116365671158,
|
|
"num_tokens": 4707752.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 5.625265073776245,
|
|
"epoch": 0.1978866474543708,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998828416044829,
|
|
"loss": 5.58,
|
|
"mean_token_accuracy": 0.17766032367944717,
|
|
"num_tokens": 4718413.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 5.750666522979737,
|
|
"epoch": 0.1983669548511047,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499881732778379,
|
|
"loss": 5.7696,
|
|
"mean_token_accuracy": 0.16185117661952972,
|
|
"num_tokens": 4730033.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 5.668474435806274,
|
|
"epoch": 0.1988472622478386,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.000499880618731189,
|
|
"loss": 5.6346,
|
|
"mean_token_accuracy": 0.17201206237077712,
|
|
"num_tokens": 4742084.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 5.801948118209839,
|
|
"epoch": 0.19932756964457252,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998794994629388,
|
|
"loss": 5.8485,
|
|
"mean_token_accuracy": 0.16415513008832933,
|
|
"num_tokens": 4753885.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 5.755141353607177,
|
|
"epoch": 0.19980787704130643,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998783749736545,
|
|
"loss": 5.6852,
|
|
"mean_token_accuracy": 0.17273288518190383,
|
|
"num_tokens": 4765686.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 5.7318039894104,
|
|
"epoch": 0.20028818443804033,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004998772452633619,
|
|
"loss": 5.7343,
|
|
"mean_token_accuracy": 0.1667577311396599,
|
|
"num_tokens": 4777157.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 5.734004545211792,
|
|
"epoch": 0.20076849183477424,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998761103320876,
|
|
"loss": 5.6803,
|
|
"mean_token_accuracy": 0.17569620162248611,
|
|
"num_tokens": 4788583.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 5.81385350227356,
|
|
"epoch": 0.20124879923150815,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004998749701798577,
|
|
"loss": 5.795,
|
|
"mean_token_accuracy": 0.164644692838192,
|
|
"num_tokens": 4800749.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 5.652225208282471,
|
|
"epoch": 0.2017291066282421,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004998738248066986,
|
|
"loss": 5.7001,
|
|
"mean_token_accuracy": 0.17118856757879258,
|
|
"num_tokens": 4812488.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 5.816308832168579,
|
|
"epoch": 0.202209414024976,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998726742126372,
|
|
"loss": 5.6902,
|
|
"mean_token_accuracy": 0.17228334546089172,
|
|
"num_tokens": 4823495.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 5.622010517120361,
|
|
"epoch": 0.2026897214217099,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998715183976999,
|
|
"loss": 5.726,
|
|
"mean_token_accuracy": 0.16997579634189605,
|
|
"num_tokens": 4834450.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 5.763468551635742,
|
|
"epoch": 0.20317002881844382,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004998703573619137,
|
|
"loss": 5.6443,
|
|
"mean_token_accuracy": 0.18120874017477034,
|
|
"num_tokens": 4846826.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 5.804740762710571,
|
|
"epoch": 0.20365033621517772,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004998691911053056,
|
|
"loss": 5.8366,
|
|
"mean_token_accuracy": 0.15913107842206956,
|
|
"num_tokens": 4859668.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 5.727064418792724,
|
|
"epoch": 0.20413064361191163,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998680196279026,
|
|
"loss": 5.7049,
|
|
"mean_token_accuracy": 0.17213667631149293,
|
|
"num_tokens": 4871727.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 5.794467830657959,
|
|
"epoch": 0.20461095100864554,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998668429297319,
|
|
"loss": 5.7674,
|
|
"mean_token_accuracy": 0.17240212336182595,
|
|
"num_tokens": 4882191.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 5.760322952270508,
|
|
"epoch": 0.20509125840537945,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998656610108208,
|
|
"loss": 5.6971,
|
|
"mean_token_accuracy": 0.1685373991727829,
|
|
"num_tokens": 4892416.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 5.694274854660034,
|
|
"epoch": 0.20557156580211336,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998644738711969,
|
|
"loss": 5.6674,
|
|
"mean_token_accuracy": 0.1685459852218628,
|
|
"num_tokens": 4903572.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 5.810105037689209,
|
|
"epoch": 0.20605187319884727,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 0.0004998632815108874,
|
|
"loss": 5.763,
|
|
"mean_token_accuracy": 0.16395961344242097,
|
|
"num_tokens": 4915417.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 5.73304591178894,
|
|
"epoch": 0.20653218059558118,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998620839299203,
|
|
"loss": 5.6495,
|
|
"mean_token_accuracy": 0.17259960770606994,
|
|
"num_tokens": 4926943.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 5.6710865020751955,
|
|
"epoch": 0.2070124879923151,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998608811283233,
|
|
"loss": 5.6095,
|
|
"mean_token_accuracy": 0.17803010493516921,
|
|
"num_tokens": 4937724.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 5.7808784484863285,
|
|
"epoch": 0.207492795389049,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998596731061244,
|
|
"loss": 5.7756,
|
|
"mean_token_accuracy": 0.16368448734283447,
|
|
"num_tokens": 4949970.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 5.784394645690918,
|
|
"epoch": 0.2079731027857829,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004998584598633516,
|
|
"loss": 5.774,
|
|
"mean_token_accuracy": 0.16977567672729493,
|
|
"num_tokens": 4961389.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 5.7822630405426025,
|
|
"epoch": 0.2084534101825168,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998572414000329,
|
|
"loss": 5.82,
|
|
"mean_token_accuracy": 0.16696709543466567,
|
|
"num_tokens": 4973888.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 5.75656681060791,
|
|
"epoch": 0.20893371757925072,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998560177161969,
|
|
"loss": 5.7667,
|
|
"mean_token_accuracy": 0.1604086473584175,
|
|
"num_tokens": 4985423.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 5.70469822883606,
|
|
"epoch": 0.20941402497598463,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004998547888118718,
|
|
"loss": 5.726,
|
|
"mean_token_accuracy": 0.16619897931814193,
|
|
"num_tokens": 4997711.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 5.7725687503814695,
|
|
"epoch": 0.20989433237271854,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998535546870862,
|
|
"loss": 5.7454,
|
|
"mean_token_accuracy": 0.1679087519645691,
|
|
"num_tokens": 5009633.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 5.739374876022339,
|
|
"epoch": 0.21037463976945245,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998523153418687,
|
|
"loss": 5.6759,
|
|
"mean_token_accuracy": 0.17375072985887527,
|
|
"num_tokens": 5021523.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 5.785361337661743,
|
|
"epoch": 0.21085494716618636,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998510707762481,
|
|
"loss": 5.7695,
|
|
"mean_token_accuracy": 0.1699072614312172,
|
|
"num_tokens": 5033513.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 5.7873194217681885,
|
|
"epoch": 0.21133525456292027,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004998498209902533,
|
|
"loss": 5.7758,
|
|
"mean_token_accuracy": 0.16922611892223358,
|
|
"num_tokens": 5047055.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 5.707646226882934,
|
|
"epoch": 0.21181556195965417,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998485659839134,
|
|
"loss": 5.6497,
|
|
"mean_token_accuracy": 0.17682456970214844,
|
|
"num_tokens": 5057613.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 5.753945970535279,
|
|
"epoch": 0.21229586935638808,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998473057572575,
|
|
"loss": 5.7615,
|
|
"mean_token_accuracy": 0.16833806186914443,
|
|
"num_tokens": 5068886.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 5.742906093597412,
|
|
"epoch": 0.212776176753122,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998460403103146,
|
|
"loss": 5.7494,
|
|
"mean_token_accuracy": 0.16465574279427528,
|
|
"num_tokens": 5079978.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 5.736083173751831,
|
|
"epoch": 0.2132564841498559,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998447696431146,
|
|
"loss": 5.7159,
|
|
"mean_token_accuracy": 0.17075446248054504,
|
|
"num_tokens": 5091021.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 5.6740076541900635,
|
|
"epoch": 0.2137367915465898,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998434937556865,
|
|
"loss": 5.5988,
|
|
"mean_token_accuracy": 0.181574647128582,
|
|
"num_tokens": 5101483.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 5.708674907684326,
|
|
"epoch": 0.21421709894332372,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998422126480602,
|
|
"loss": 5.7447,
|
|
"mean_token_accuracy": 0.16306292563676833,
|
|
"num_tokens": 5113116.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 5.82704176902771,
|
|
"epoch": 0.21469740634005763,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998409263202653,
|
|
"loss": 5.6819,
|
|
"mean_token_accuracy": 0.1686948984861374,
|
|
"num_tokens": 5124824.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 5.589908075332642,
|
|
"epoch": 0.21517771373679154,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998396347723318,
|
|
"loss": 5.6335,
|
|
"mean_token_accuracy": 0.16587817817926406,
|
|
"num_tokens": 5137567.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 5.72907018661499,
|
|
"epoch": 0.21565802113352545,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004998383380042895,
|
|
"loss": 5.6846,
|
|
"mean_token_accuracy": 0.16729460805654525,
|
|
"num_tokens": 5149016.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 5.6214783668518065,
|
|
"epoch": 0.21613832853025935,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998370360161688,
|
|
"loss": 5.5788,
|
|
"mean_token_accuracy": 0.17212725132703782,
|
|
"num_tokens": 5160356.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 5.79612250328064,
|
|
"epoch": 0.21661863592699326,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998357288079996,
|
|
"loss": 5.7818,
|
|
"mean_token_accuracy": 0.16184753328561782,
|
|
"num_tokens": 5172100.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 5.740008592605591,
|
|
"epoch": 0.21709894332372717,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998344163798125,
|
|
"loss": 5.7405,
|
|
"mean_token_accuracy": 0.16320510655641557,
|
|
"num_tokens": 5183984.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 5.707123565673828,
|
|
"epoch": 0.21757925072046108,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004998330987316379,
|
|
"loss": 5.7153,
|
|
"mean_token_accuracy": 0.167342671751976,
|
|
"num_tokens": 5195853.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 5.6320737361907955,
|
|
"epoch": 0.21805955811719502,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998317758635062,
|
|
"loss": 5.5593,
|
|
"mean_token_accuracy": 0.17451774328947067,
|
|
"num_tokens": 5206995.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 5.515458297729492,
|
|
"epoch": 0.21853986551392893,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998304477754484,
|
|
"loss": 5.5989,
|
|
"mean_token_accuracy": 0.17679600268602372,
|
|
"num_tokens": 5219291.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 5.740645408630371,
|
|
"epoch": 0.21902017291066284,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998291144674952,
|
|
"loss": 5.6885,
|
|
"mean_token_accuracy": 0.17223394364118577,
|
|
"num_tokens": 5230856.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 5.601490020751953,
|
|
"epoch": 0.21950048030739674,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998277759396776,
|
|
"loss": 5.5333,
|
|
"mean_token_accuracy": 0.1814967930316925,
|
|
"num_tokens": 5242871.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 5.656805944442749,
|
|
"epoch": 0.21998078770413065,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998264321920265,
|
|
"loss": 5.64,
|
|
"mean_token_accuracy": 0.17801354676485062,
|
|
"num_tokens": 5253835.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 5.676252794265747,
|
|
"epoch": 0.22046109510086456,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0004998250832245734,
|
|
"loss": 5.6181,
|
|
"mean_token_accuracy": 0.17702293545007705,
|
|
"num_tokens": 5266195.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 5.641697740554809,
|
|
"epoch": 0.22094140249759847,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998237290373494,
|
|
"loss": 5.6002,
|
|
"mean_token_accuracy": 0.1801271617412567,
|
|
"num_tokens": 5277499.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 5.739913368225098,
|
|
"epoch": 0.22142170989433238,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000499822369630386,
|
|
"loss": 5.7231,
|
|
"mean_token_accuracy": 0.1597047820687294,
|
|
"num_tokens": 5288622.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 5.738846015930176,
|
|
"epoch": 0.2219020172910663,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998210050037148,
|
|
"loss": 5.7816,
|
|
"mean_token_accuracy": 0.16195343434810638,
|
|
"num_tokens": 5299664.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 5.717037725448608,
|
|
"epoch": 0.2223823246878002,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998196351573674,
|
|
"loss": 5.6552,
|
|
"mean_token_accuracy": 0.17402878403663635,
|
|
"num_tokens": 5311627.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 5.5637411117553714,
|
|
"epoch": 0.2228626320845341,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998182600913757,
|
|
"loss": 5.5627,
|
|
"mean_token_accuracy": 0.17947529554367064,
|
|
"num_tokens": 5323000.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 5.704880237579346,
|
|
"epoch": 0.22334293948126802,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998168798057715,
|
|
"loss": 5.5992,
|
|
"mean_token_accuracy": 0.18110302537679673,
|
|
"num_tokens": 5333811.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 5.615099573135376,
|
|
"epoch": 0.22382324687800192,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499815494300587,
|
|
"loss": 5.5991,
|
|
"mean_token_accuracy": 0.17574110478162766,
|
|
"num_tokens": 5344762.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 5.721481513977051,
|
|
"epoch": 0.22430355427473583,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998141035758542,
|
|
"loss": 5.6195,
|
|
"mean_token_accuracy": 0.17343118488788606,
|
|
"num_tokens": 5356112.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 5.655849504470825,
|
|
"epoch": 0.22478386167146974,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004998127076316054,
|
|
"loss": 5.7311,
|
|
"mean_token_accuracy": 0.17190437763929367,
|
|
"num_tokens": 5367339.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 5.674526071548462,
|
|
"epoch": 0.22526416906820365,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998113064678734,
|
|
"loss": 5.6665,
|
|
"mean_token_accuracy": 0.17564141601324082,
|
|
"num_tokens": 5378627.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 5.726110649108887,
|
|
"epoch": 0.22574447646493756,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998099000846901,
|
|
"loss": 5.7012,
|
|
"mean_token_accuracy": 0.1681268870830536,
|
|
"num_tokens": 5390209.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 5.734390020370483,
|
|
"epoch": 0.22622478386167147,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998084884820887,
|
|
"loss": 5.6833,
|
|
"mean_token_accuracy": 0.17136491537094117,
|
|
"num_tokens": 5401578.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 5.615032052993774,
|
|
"epoch": 0.22670509125840538,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998070716601016,
|
|
"loss": 5.5881,
|
|
"mean_token_accuracy": 0.17977205514907837,
|
|
"num_tokens": 5413831.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 5.722073316574097,
|
|
"epoch": 0.2271853986551393,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998056496187618,
|
|
"loss": 5.6496,
|
|
"mean_token_accuracy": 0.1711253985762596,
|
|
"num_tokens": 5425430.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 5.49839334487915,
|
|
"epoch": 0.2276657060518732,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998042223581025,
|
|
"loss": 5.4985,
|
|
"mean_token_accuracy": 0.1870403528213501,
|
|
"num_tokens": 5435353.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 5.7514622688293455,
|
|
"epoch": 0.2281460134486071,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004998027898781565,
|
|
"loss": 5.6991,
|
|
"mean_token_accuracy": 0.17083023190498353,
|
|
"num_tokens": 5446925.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 5.589994049072265,
|
|
"epoch": 0.228626320845341,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998013521789574,
|
|
"loss": 5.5899,
|
|
"mean_token_accuracy": 0.1772562175989151,
|
|
"num_tokens": 5456613.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 5.697564649581909,
|
|
"epoch": 0.22910662824207492,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997999092605384,
|
|
"loss": 5.6209,
|
|
"mean_token_accuracy": 0.17314212173223495,
|
|
"num_tokens": 5467790.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 5.672542333602905,
|
|
"epoch": 0.22958693563880883,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000499798461122933,
|
|
"loss": 5.6065,
|
|
"mean_token_accuracy": 0.17598363608121873,
|
|
"num_tokens": 5479166.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 5.594286203384399,
|
|
"epoch": 0.23006724303554274,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997970077661748,
|
|
"loss": 5.5932,
|
|
"mean_token_accuracy": 0.18340873271226882,
|
|
"num_tokens": 5490186.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 5.690382814407348,
|
|
"epoch": 0.23054755043227665,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997955491902977,
|
|
"loss": 5.5575,
|
|
"mean_token_accuracy": 0.1718940794467926,
|
|
"num_tokens": 5500416.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 5.582558584213257,
|
|
"epoch": 0.23102785782901056,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997940853953354,
|
|
"loss": 5.6489,
|
|
"mean_token_accuracy": 0.17370383739471434,
|
|
"num_tokens": 5512189.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 5.628128719329834,
|
|
"epoch": 0.23150816522574447,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.000499792616381322,
|
|
"loss": 5.5142,
|
|
"mean_token_accuracy": 0.1828036591410637,
|
|
"num_tokens": 5523631.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 5.609222555160523,
|
|
"epoch": 0.23198847262247838,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004997911421482914,
|
|
"loss": 5.5763,
|
|
"mean_token_accuracy": 0.1823565348982811,
|
|
"num_tokens": 5535637.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 5.639013814926147,
|
|
"epoch": 0.23246878001921228,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499789662696278,
|
|
"loss": 5.5869,
|
|
"mean_token_accuracy": 0.18035637438297272,
|
|
"num_tokens": 5546470.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 5.694498586654663,
|
|
"epoch": 0.2329490874159462,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004997881780253162,
|
|
"loss": 5.7456,
|
|
"mean_token_accuracy": 0.1703657627105713,
|
|
"num_tokens": 5558633.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 5.6558629989624025,
|
|
"epoch": 0.2334293948126801,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 0.0004997866881354403,
|
|
"loss": 5.6547,
|
|
"mean_token_accuracy": 0.17033104449510575,
|
|
"num_tokens": 5570427.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 5.6951744556427,
|
|
"epoch": 0.23390970220941404,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000499785193026685,
|
|
"loss": 5.6383,
|
|
"mean_token_accuracy": 0.17484120875597,
|
|
"num_tokens": 5580991.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 5.701549911499024,
|
|
"epoch": 0.23439000960614795,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997836926990851,
|
|
"loss": 5.6816,
|
|
"mean_token_accuracy": 0.17114701271057128,
|
|
"num_tokens": 5592777.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 5.602617788314819,
|
|
"epoch": 0.23487031700288186,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997821871526752,
|
|
"loss": 5.5874,
|
|
"mean_token_accuracy": 0.17974285781383514,
|
|
"num_tokens": 5603326.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 5.631419324874878,
|
|
"epoch": 0.23535062439961577,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997806763874905,
|
|
"loss": 5.5697,
|
|
"mean_token_accuracy": 0.1791187435388565,
|
|
"num_tokens": 5614504.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 5.617094326019287,
|
|
"epoch": 0.23583093179634967,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004997791604035659,
|
|
"loss": 5.6264,
|
|
"mean_token_accuracy": 0.17776354700326918,
|
|
"num_tokens": 5625150.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 5.6507199764251705,
|
|
"epoch": 0.23631123919308358,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997776392009366,
|
|
"loss": 5.6458,
|
|
"mean_token_accuracy": 0.169050732254982,
|
|
"num_tokens": 5636815.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 5.706958866119384,
|
|
"epoch": 0.2367915465898175,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004997761127796381,
|
|
"loss": 5.6366,
|
|
"mean_token_accuracy": 0.17092559188604356,
|
|
"num_tokens": 5648272.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 5.628375577926636,
|
|
"epoch": 0.2372718539865514,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997745811397056,
|
|
"loss": 5.5463,
|
|
"mean_token_accuracy": 0.17801680713891982,
|
|
"num_tokens": 5659227.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 5.6414820671081545,
|
|
"epoch": 0.2377521613832853,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997730442811748,
|
|
"loss": 5.6796,
|
|
"mean_token_accuracy": 0.17399391829967498,
|
|
"num_tokens": 5670411.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 5.5770539283752445,
|
|
"epoch": 0.23823246878001922,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997715022040814,
|
|
"loss": 5.5182,
|
|
"mean_token_accuracy": 0.1782184734940529,
|
|
"num_tokens": 5681570.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 5.523485231399536,
|
|
"epoch": 0.23871277617675313,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000499769954908461,
|
|
"loss": 5.5022,
|
|
"mean_token_accuracy": 0.1887900114059448,
|
|
"num_tokens": 5693021.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 5.659896421432495,
|
|
"epoch": 0.23919308357348704,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004997684023943498,
|
|
"loss": 5.5883,
|
|
"mean_token_accuracy": 0.17428779155015944,
|
|
"num_tokens": 5704043.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 5.5805792808532715,
|
|
"epoch": 0.23967339097022095,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997668446617837,
|
|
"loss": 5.6675,
|
|
"mean_token_accuracy": 0.16685750484466552,
|
|
"num_tokens": 5715735.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 5.760880804061889,
|
|
"epoch": 0.24015369836695485,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997652817107989,
|
|
"loss": 5.6294,
|
|
"mean_token_accuracy": 0.17232899218797684,
|
|
"num_tokens": 5725778.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 5.601306343078614,
|
|
"epoch": 0.24063400576368876,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997637135414315,
|
|
"loss": 5.6628,
|
|
"mean_token_accuracy": 0.17220552116632462,
|
|
"num_tokens": 5737224.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 5.779234981536865,
|
|
"epoch": 0.24111431316042267,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004997621401537183,
|
|
"loss": 5.6855,
|
|
"mean_token_accuracy": 0.17120948135852815,
|
|
"num_tokens": 5749226.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 5.6741156578063965,
|
|
"epoch": 0.24159462055715658,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997605615476955,
|
|
"loss": 5.6578,
|
|
"mean_token_accuracy": 0.17114464193582535,
|
|
"num_tokens": 5760282.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 5.539696168899536,
|
|
"epoch": 0.2420749279538905,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004997589777234,
|
|
"loss": 5.5633,
|
|
"mean_token_accuracy": 0.181555312871933,
|
|
"num_tokens": 5771756.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 5.650804233551026,
|
|
"epoch": 0.2425552353506244,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997573886808684,
|
|
"loss": 5.5835,
|
|
"mean_token_accuracy": 0.16679947078227997,
|
|
"num_tokens": 5783237.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 5.646309852600098,
|
|
"epoch": 0.2430355427473583,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997557944201375,
|
|
"loss": 5.6814,
|
|
"mean_token_accuracy": 0.17147036045789718,
|
|
"num_tokens": 5794825.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 5.675209999084473,
|
|
"epoch": 0.24351585014409222,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997541949412445,
|
|
"loss": 5.5712,
|
|
"mean_token_accuracy": 0.18625136017799376,
|
|
"num_tokens": 5805578.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 5.649836206436158,
|
|
"epoch": 0.24399615754082613,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004997525902442266,
|
|
"loss": 5.6738,
|
|
"mean_token_accuracy": 0.16476511359214782,
|
|
"num_tokens": 5818201.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 5.602812147140503,
|
|
"epoch": 0.24447646493756003,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004997509803291207,
|
|
"loss": 5.5959,
|
|
"mean_token_accuracy": 0.17587143927812576,
|
|
"num_tokens": 5830319.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 5.5824614524841305,
|
|
"epoch": 0.24495677233429394,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997493651959647,
|
|
"loss": 5.5428,
|
|
"mean_token_accuracy": 0.17996817231178283,
|
|
"num_tokens": 5840638.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 5.66239709854126,
|
|
"epoch": 0.24543707973102785,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004997477448447955,
|
|
"loss": 5.5773,
|
|
"mean_token_accuracy": 0.17367178648710252,
|
|
"num_tokens": 5852472.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 5.678495073318482,
|
|
"epoch": 0.24591738712776176,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997461192756512,
|
|
"loss": 5.6133,
|
|
"mean_token_accuracy": 0.170744089782238,
|
|
"num_tokens": 5863455.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 5.512450170516968,
|
|
"epoch": 0.24639769452449567,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997444884885694,
|
|
"loss": 5.5251,
|
|
"mean_token_accuracy": 0.17817995101213455,
|
|
"num_tokens": 5873141.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 5.603986024856567,
|
|
"epoch": 0.24687800192122958,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997428524835879,
|
|
"loss": 5.6316,
|
|
"mean_token_accuracy": 0.17475323528051376,
|
|
"num_tokens": 5884363.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 5.740997219085694,
|
|
"epoch": 0.2473583093179635,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004997412112607446,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.17148932665586472,
|
|
"num_tokens": 5895856.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 5.542859792709351,
|
|
"epoch": 0.2478386167146974,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997395648200778,
|
|
"loss": 5.4922,
|
|
"mean_token_accuracy": 0.17950474172830583,
|
|
"num_tokens": 5906657.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 5.600370979309082,
|
|
"epoch": 0.2483189241114313,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0004997379131616257,
|
|
"loss": 5.6226,
|
|
"mean_token_accuracy": 0.1700095072388649,
|
|
"num_tokens": 5919496.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 5.690901279449463,
|
|
"epoch": 0.24879923150816521,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004997362562854266,
|
|
"loss": 5.6843,
|
|
"mean_token_accuracy": 0.16776154488325118,
|
|
"num_tokens": 5932593.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 5.619813919067383,
|
|
"epoch": 0.24927953890489912,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997345941915187,
|
|
"loss": 5.6128,
|
|
"mean_token_accuracy": 0.17226099967956543,
|
|
"num_tokens": 5944080.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 5.602241802215576,
|
|
"epoch": 0.24975984630163303,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997329268799412,
|
|
"loss": 5.5752,
|
|
"mean_token_accuracy": 0.18460023701190947,
|
|
"num_tokens": 5955703.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 5.62792739868164,
|
|
"epoch": 0.25024015369836694,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004997312543507322,
|
|
"loss": 5.6565,
|
|
"mean_token_accuracy": 0.1714890867471695,
|
|
"num_tokens": 5966979.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 5.672908306121826,
|
|
"epoch": 0.2507204610951009,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997295766039309,
|
|
"loss": 5.545,
|
|
"mean_token_accuracy": 0.17637500017881394,
|
|
"num_tokens": 5978808.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 5.6401097774505615,
|
|
"epoch": 0.25120076849183476,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004997278936395761,
|
|
"loss": 5.7288,
|
|
"mean_token_accuracy": 0.16584430038928985,
|
|
"num_tokens": 5992145.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 5.665263652801514,
|
|
"epoch": 0.2516810758885687,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004997262054577071,
|
|
"loss": 5.5694,
|
|
"mean_token_accuracy": 0.17564088106155396,
|
|
"num_tokens": 6003723.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 5.6567973613739015,
|
|
"epoch": 0.2521613832853026,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004997245120583627,
|
|
"loss": 5.6351,
|
|
"mean_token_accuracy": 0.1769047811627388,
|
|
"num_tokens": 6014064.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 5.53907151222229,
|
|
"epoch": 0.2526416906820365,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004997228134415825,
|
|
"loss": 5.5168,
|
|
"mean_token_accuracy": 0.1834915667772293,
|
|
"num_tokens": 6025455.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 5.6452476501464846,
|
|
"epoch": 0.2531219980787704,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997211096074059,
|
|
"loss": 5.6231,
|
|
"mean_token_accuracy": 0.16973316073417663,
|
|
"num_tokens": 6037347.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 5.600665187835693,
|
|
"epoch": 0.25360230547550433,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997194005558722,
|
|
"loss": 5.5304,
|
|
"mean_token_accuracy": 0.18019532412290573,
|
|
"num_tokens": 6049236.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 5.534391641616821,
|
|
"epoch": 0.2540826128722382,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004997176862870216,
|
|
"loss": 5.5339,
|
|
"mean_token_accuracy": 0.1798613414168358,
|
|
"num_tokens": 6060982.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 5.637931680679321,
|
|
"epoch": 0.25456292026897215,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004997159668008933,
|
|
"loss": 5.5514,
|
|
"mean_token_accuracy": 0.17985030263662338,
|
|
"num_tokens": 6070925.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 5.526381587982177,
|
|
"epoch": 0.25504322766570603,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997142420975277,
|
|
"loss": 5.514,
|
|
"mean_token_accuracy": 0.18175738006830217,
|
|
"num_tokens": 6081279.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 5.5633796691894535,
|
|
"epoch": 0.25552353506243997,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004997125121769647,
|
|
"loss": 5.6108,
|
|
"mean_token_accuracy": 0.17793446481227876,
|
|
"num_tokens": 6091797.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 5.687921333312988,
|
|
"epoch": 0.25600384245917385,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004997107770392444,
|
|
"loss": 5.6134,
|
|
"mean_token_accuracy": 0.1804993599653244,
|
|
"num_tokens": 6103435.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 5.648722791671753,
|
|
"epoch": 0.2564841498559078,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.000499709036684407,
|
|
"loss": 5.6751,
|
|
"mean_token_accuracy": 0.17587384432554246,
|
|
"num_tokens": 6114531.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 5.569314622879029,
|
|
"epoch": 0.25696445725264166,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997072911124932,
|
|
"loss": 5.5173,
|
|
"mean_token_accuracy": 0.17945850938558577,
|
|
"num_tokens": 6126110.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 5.670061159133911,
|
|
"epoch": 0.2574447646493756,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997055403235432,
|
|
"loss": 5.6187,
|
|
"mean_token_accuracy": 0.1766670301556587,
|
|
"num_tokens": 6137114.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 5.62683253288269,
|
|
"epoch": 0.2579250720461095,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004997037843175978,
|
|
"loss": 5.5718,
|
|
"mean_token_accuracy": 0.17658228576183319,
|
|
"num_tokens": 6148696.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 5.59165620803833,
|
|
"epoch": 0.2584053794428434,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004997020230946978,
|
|
"loss": 5.568,
|
|
"mean_token_accuracy": 0.1790614068508148,
|
|
"num_tokens": 6160235.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 5.629477691650391,
|
|
"epoch": 0.25888568683957736,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004997002566548841,
|
|
"loss": 5.5586,
|
|
"mean_token_accuracy": 0.17292713820934297,
|
|
"num_tokens": 6172031.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 5.48054838180542,
|
|
"epoch": 0.25936599423631124,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004996984849981976,
|
|
"loss": 5.4233,
|
|
"mean_token_accuracy": 0.1893267199397087,
|
|
"num_tokens": 6183547.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 5.619540548324585,
|
|
"epoch": 0.2598463016330452,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004996967081246794,
|
|
"loss": 5.632,
|
|
"mean_token_accuracy": 0.1678134724497795,
|
|
"num_tokens": 6194768.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 5.6499683380126955,
|
|
"epoch": 0.26032660902977905,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004996949260343711,
|
|
"loss": 5.6314,
|
|
"mean_token_accuracy": 0.1706198126077652,
|
|
"num_tokens": 6206099.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 5.624089670181275,
|
|
"epoch": 0.260806916426513,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996931387273137,
|
|
"loss": 5.6262,
|
|
"mean_token_accuracy": 0.17660144418478013,
|
|
"num_tokens": 6217530.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 5.713815212249756,
|
|
"epoch": 0.2612872238232469,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004996913462035487,
|
|
"loss": 5.6448,
|
|
"mean_token_accuracy": 0.1767139658331871,
|
|
"num_tokens": 6228564.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 5.539792156219482,
|
|
"epoch": 0.2617675312199808,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499689548463118,
|
|
"loss": 5.5174,
|
|
"mean_token_accuracy": 0.17854675203561782,
|
|
"num_tokens": 6239945.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 5.59919810295105,
|
|
"epoch": 0.2622478386167147,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004996877455060631,
|
|
"loss": 5.6312,
|
|
"mean_token_accuracy": 0.17017472237348558,
|
|
"num_tokens": 6251829.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 5.7330786228179935,
|
|
"epoch": 0.2627281460134486,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004996859373324259,
|
|
"loss": 5.7264,
|
|
"mean_token_accuracy": 0.16224824339151384,
|
|
"num_tokens": 6264823.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 5.5701476573944095,
|
|
"epoch": 0.2632084534101825,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996841239422485,
|
|
"loss": 5.4065,
|
|
"mean_token_accuracy": 0.18482713848352433,
|
|
"num_tokens": 6276247.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 5.470470857620239,
|
|
"epoch": 0.26368876080691644,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004996823053355729,
|
|
"loss": 5.5321,
|
|
"mean_token_accuracy": 0.18076382875442504,
|
|
"num_tokens": 6287593.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 5.685536909103393,
|
|
"epoch": 0.2641690682036503,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004996804815124413,
|
|
"loss": 5.6897,
|
|
"mean_token_accuracy": 0.16898608654737474,
|
|
"num_tokens": 6299918.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 5.568260049819946,
|
|
"epoch": 0.26464937560038426,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996786524728962,
|
|
"loss": 5.5287,
|
|
"mean_token_accuracy": 0.18196363002061844,
|
|
"num_tokens": 6311147.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 5.45229320526123,
|
|
"epoch": 0.26512968299711814,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004996768182169797,
|
|
"loss": 5.4564,
|
|
"mean_token_accuracy": 0.18652137070894242,
|
|
"num_tokens": 6323239.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 5.692247343063355,
|
|
"epoch": 0.2656099903938521,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996749787447349,
|
|
"loss": 5.5567,
|
|
"mean_token_accuracy": 0.17187336832284927,
|
|
"num_tokens": 6334625.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 5.545494651794433,
|
|
"epoch": 0.26609029779058596,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499673134056204,
|
|
"loss": 5.5938,
|
|
"mean_token_accuracy": 0.17517421692609786,
|
|
"num_tokens": 6346068.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 5.584152412414551,
|
|
"epoch": 0.2665706051873199,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996712841514303,
|
|
"loss": 5.5716,
|
|
"mean_token_accuracy": 0.17334717959165574,
|
|
"num_tokens": 6357097.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 5.656313180923462,
|
|
"epoch": 0.2670509125840538,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996694290304563,
|
|
"loss": 5.6313,
|
|
"mean_token_accuracy": 0.16709280461072923,
|
|
"num_tokens": 6367481.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 5.52793607711792,
|
|
"epoch": 0.2675312199807877,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004996675686933255,
|
|
"loss": 5.5381,
|
|
"mean_token_accuracy": 0.18144787847995758,
|
|
"num_tokens": 6378873.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 5.664049291610718,
|
|
"epoch": 0.2680115273775216,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004996657031400807,
|
|
"loss": 5.5768,
|
|
"mean_token_accuracy": 0.18006865531206132,
|
|
"num_tokens": 6390651.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 5.478256464004517,
|
|
"epoch": 0.26849183477425553,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996638323707655,
|
|
"loss": 5.446,
|
|
"mean_token_accuracy": 0.1820421040058136,
|
|
"num_tokens": 6401631.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 5.48651123046875,
|
|
"epoch": 0.2689721421709894,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004996619563854232,
|
|
"loss": 5.5308,
|
|
"mean_token_accuracy": 0.1832943469285965,
|
|
"num_tokens": 6413875.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 5.689049482345581,
|
|
"epoch": 0.26945244956772335,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996600751840974,
|
|
"loss": 5.5579,
|
|
"mean_token_accuracy": 0.1733505442738533,
|
|
"num_tokens": 6425764.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 5.478516244888306,
|
|
"epoch": 0.26993275696445723,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004996581887668317,
|
|
"loss": 5.494,
|
|
"mean_token_accuracy": 0.18221275955438615,
|
|
"num_tokens": 6437911.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 5.534301519393921,
|
|
"epoch": 0.27041306436119117,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00049965629713367,
|
|
"loss": 5.4961,
|
|
"mean_token_accuracy": 0.18141991049051284,
|
|
"num_tokens": 6449942.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 5.604593276977539,
|
|
"epoch": 0.27089337175792505,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004996544002846561,
|
|
"loss": 5.6208,
|
|
"mean_token_accuracy": 0.17682201713323592,
|
|
"num_tokens": 6461729.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 5.614752101898193,
|
|
"epoch": 0.271373679154659,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996524982198343,
|
|
"loss": 5.5988,
|
|
"mean_token_accuracy": 0.17795798033475876,
|
|
"num_tokens": 6472046.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 5.600375080108643,
|
|
"epoch": 0.27185398655139287,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996505909392485,
|
|
"loss": 5.5667,
|
|
"mean_token_accuracy": 0.17373612523078918,
|
|
"num_tokens": 6483308.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 5.429362010955811,
|
|
"epoch": 0.2723342939481268,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004996486784429429,
|
|
"loss": 5.4311,
|
|
"mean_token_accuracy": 0.18428465574979783,
|
|
"num_tokens": 6495093.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 5.5981306552886965,
|
|
"epoch": 0.2728146013448607,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996467607309622,
|
|
"loss": 5.5307,
|
|
"mean_token_accuracy": 0.17854470163583755,
|
|
"num_tokens": 6505933.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 5.626583003997803,
|
|
"epoch": 0.2732949087415946,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996448378033507,
|
|
"loss": 5.5893,
|
|
"mean_token_accuracy": 0.17490534335374833,
|
|
"num_tokens": 6517280.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 5.60156021118164,
|
|
"epoch": 0.2737752161383285,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996429096601532,
|
|
"loss": 5.6315,
|
|
"mean_token_accuracy": 0.17191672027111055,
|
|
"num_tokens": 6528980.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 5.601687097549439,
|
|
"epoch": 0.27425552353506244,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996409763014144,
|
|
"loss": 5.6235,
|
|
"mean_token_accuracy": 0.17743158787488938,
|
|
"num_tokens": 6540670.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 5.593181991577149,
|
|
"epoch": 0.2747358309317964,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996390377271791,
|
|
"loss": 5.5855,
|
|
"mean_token_accuracy": 0.18115401417016982,
|
|
"num_tokens": 6551302.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 5.5507872104644775,
|
|
"epoch": 0.27521613832853026,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996370939374924,
|
|
"loss": 5.5433,
|
|
"mean_token_accuracy": 0.1738438919186592,
|
|
"num_tokens": 6563177.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 5.72943229675293,
|
|
"epoch": 0.2756964457252642,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004996351449323994,
|
|
"loss": 5.6521,
|
|
"mean_token_accuracy": 0.17468605786561966,
|
|
"num_tokens": 6573323.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 5.5880653858184814,
|
|
"epoch": 0.2761767531219981,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996331907119455,
|
|
"loss": 5.591,
|
|
"mean_token_accuracy": 0.16756793707609177,
|
|
"num_tokens": 6585382.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 5.474012231826782,
|
|
"epoch": 0.276657060518732,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996312312761758,
|
|
"loss": 5.467,
|
|
"mean_token_accuracy": 0.1900227263569832,
|
|
"num_tokens": 6596629.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 5.6394744396209715,
|
|
"epoch": 0.2771373679154659,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499629266625136,
|
|
"loss": 5.5734,
|
|
"mean_token_accuracy": 0.17828488498926162,
|
|
"num_tokens": 6608408.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 5.638094282150268,
|
|
"epoch": 0.27761767531219983,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996272967588715,
|
|
"loss": 5.5989,
|
|
"mean_token_accuracy": 0.1704651966691017,
|
|
"num_tokens": 6619375.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 5.618940448760986,
|
|
"epoch": 0.2780979827089337,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996253216774283,
|
|
"loss": 5.6398,
|
|
"mean_token_accuracy": 0.17304042726755142,
|
|
"num_tokens": 6631317.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 5.576578378677368,
|
|
"epoch": 0.27857829010566765,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996233413808521,
|
|
"loss": 5.4904,
|
|
"mean_token_accuracy": 0.18116467744112014,
|
|
"num_tokens": 6642009.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 5.609902429580688,
|
|
"epoch": 0.27905859750240153,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996213558691889,
|
|
"loss": 5.6478,
|
|
"mean_token_accuracy": 0.1682332620024681,
|
|
"num_tokens": 6654713.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 5.651772451400757,
|
|
"epoch": 0.27953890489913547,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004996193651424848,
|
|
"loss": 5.6064,
|
|
"mean_token_accuracy": 0.17700932323932647,
|
|
"num_tokens": 6667157.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 5.575735330581665,
|
|
"epoch": 0.28001921229586935,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.000499617369200786,
|
|
"loss": 5.5599,
|
|
"mean_token_accuracy": 0.18871267586946489,
|
|
"num_tokens": 6679573.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 5.593114852905273,
|
|
"epoch": 0.2804995196926033,
|
|
"grad_norm": 0.859375,
|
|
"learning_rate": 0.0004996153680441389,
|
|
"loss": 5.624,
|
|
"mean_token_accuracy": 0.17413021624088287,
|
|
"num_tokens": 6691768.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 5.653490257263184,
|
|
"epoch": 0.28097982708933716,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00049961336167259,
|
|
"loss": 5.5864,
|
|
"mean_token_accuracy": 0.17438612282276153,
|
|
"num_tokens": 6701964.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 5.618965578079224,
|
|
"epoch": 0.2814601344860711,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996113500861857,
|
|
"loss": 5.5759,
|
|
"mean_token_accuracy": 0.1726679503917694,
|
|
"num_tokens": 6713506.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 5.581022930145264,
|
|
"epoch": 0.281940441882805,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996093332849729,
|
|
"loss": 5.593,
|
|
"mean_token_accuracy": 0.1725487932562828,
|
|
"num_tokens": 6724616.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 5.562248182296753,
|
|
"epoch": 0.2824207492795389,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996073112689983,
|
|
"loss": 5.5803,
|
|
"mean_token_accuracy": 0.17757243812084197,
|
|
"num_tokens": 6735054.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 5.616918420791626,
|
|
"epoch": 0.2829010566762728,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004996052840383088,
|
|
"loss": 5.6325,
|
|
"mean_token_accuracy": 0.17381539791822434,
|
|
"num_tokens": 6746756.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 5.603857469558716,
|
|
"epoch": 0.28338136407300674,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.0004996032515929516,
|
|
"loss": 5.4992,
|
|
"mean_token_accuracy": 0.1776091992855072,
|
|
"num_tokens": 6759566.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 5.573670148849487,
|
|
"epoch": 0.2838616714697406,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004996012139329738,
|
|
"loss": 5.5225,
|
|
"mean_token_accuracy": 0.17899418324232103,
|
|
"num_tokens": 6771375.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 5.619125080108643,
|
|
"epoch": 0.28434197886647455,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995991710584228,
|
|
"loss": 5.6311,
|
|
"mean_token_accuracy": 0.16734524071216583,
|
|
"num_tokens": 6783252.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 5.58878116607666,
|
|
"epoch": 0.28482228626320844,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004995971229693459,
|
|
"loss": 5.5941,
|
|
"mean_token_accuracy": 0.17340553402900696,
|
|
"num_tokens": 6795525.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 5.610876131057739,
|
|
"epoch": 0.28530259365994237,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004995950696657909,
|
|
"loss": 5.5353,
|
|
"mean_token_accuracy": 0.17990380227565766,
|
|
"num_tokens": 6807212.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 5.52398419380188,
|
|
"epoch": 0.28578290105667625,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995930111478051,
|
|
"loss": 5.4712,
|
|
"mean_token_accuracy": 0.1771505206823349,
|
|
"num_tokens": 6819367.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 5.5713125705719,
|
|
"epoch": 0.2862632084534102,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995909474154365,
|
|
"loss": 5.5531,
|
|
"mean_token_accuracy": 0.17791730761528016,
|
|
"num_tokens": 6830405.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 5.524326038360596,
|
|
"epoch": 0.28674351585014407,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004995888784687331,
|
|
"loss": 5.5413,
|
|
"mean_token_accuracy": 0.18089909702539445,
|
|
"num_tokens": 6841479.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 5.545838022232056,
|
|
"epoch": 0.287223823246878,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995868043077428,
|
|
"loss": 5.5784,
|
|
"mean_token_accuracy": 0.1739095240831375,
|
|
"num_tokens": 6851585.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 5.605233526229858,
|
|
"epoch": 0.2877041306436119,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004995847249325137,
|
|
"loss": 5.5488,
|
|
"mean_token_accuracy": 0.1776391088962555,
|
|
"num_tokens": 6863176.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 5.596064901351928,
|
|
"epoch": 0.2881844380403458,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995826403430942,
|
|
"loss": 5.595,
|
|
"mean_token_accuracy": 0.17474860548973084,
|
|
"num_tokens": 6874021.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.2881844380403458,
|
|
"eval_entropy": 5.440896103871502,
|
|
"eval_loss": 5.576871395111084,
|
|
"eval_mean_token_accuracy": 0.18414354559419172,
|
|
"eval_num_tokens": 6874021.0,
|
|
"eval_runtime": 26.9459,
|
|
"eval_samples_per_second": 1217.809,
|
|
"eval_steps_per_second": 152.231,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 5.6302040100097654,
|
|
"epoch": 0.2886647454370797,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995805505395328,
|
|
"loss": 5.5584,
|
|
"mean_token_accuracy": 0.17477040886878967,
|
|
"num_tokens": 6884999.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"entropy": 5.559301853179932,
|
|
"epoch": 0.28914505283381364,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995784555218778,
|
|
"loss": 5.548,
|
|
"mean_token_accuracy": 0.17850742042064666,
|
|
"num_tokens": 6897021.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 5.518660974502564,
|
|
"epoch": 0.2896253602305475,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995763552901779,
|
|
"loss": 5.5449,
|
|
"mean_token_accuracy": 0.17909058481454848,
|
|
"num_tokens": 6908320.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"entropy": 5.68627028465271,
|
|
"epoch": 0.29010566762728146,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995742498444818,
|
|
"loss": 5.5342,
|
|
"mean_token_accuracy": 0.18174685835838317,
|
|
"num_tokens": 6919957.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 5.529996299743653,
|
|
"epoch": 0.2905859750240154,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004995721391848387,
|
|
"loss": 5.4942,
|
|
"mean_token_accuracy": 0.17575003057718278,
|
|
"num_tokens": 6930531.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"entropy": 5.623160696029663,
|
|
"epoch": 0.2910662824207493,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995700233112972,
|
|
"loss": 5.6325,
|
|
"mean_token_accuracy": 0.17704310566186904,
|
|
"num_tokens": 6942556.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 5.583187103271484,
|
|
"epoch": 0.2915465898174832,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004995679022239066,
|
|
"loss": 5.5762,
|
|
"mean_token_accuracy": 0.17900587618350983,
|
|
"num_tokens": 6954410.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"entropy": 5.579293632507325,
|
|
"epoch": 0.2920268972142171,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004995657759227162,
|
|
"loss": 5.5857,
|
|
"mean_token_accuracy": 0.17669540643692017,
|
|
"num_tokens": 6964970.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 5.554018545150757,
|
|
"epoch": 0.29250720461095103,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004995636444077751,
|
|
"loss": 5.4673,
|
|
"mean_token_accuracy": 0.1851392537355423,
|
|
"num_tokens": 6976016.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"entropy": 5.490430164337158,
|
|
"epoch": 0.2929875120076849,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995615076791333,
|
|
"loss": 5.4999,
|
|
"mean_token_accuracy": 0.1816742718219757,
|
|
"num_tokens": 6987199.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 5.5644313335418705,
|
|
"epoch": 0.29346781940441885,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995593657368399,
|
|
"loss": 5.5218,
|
|
"mean_token_accuracy": 0.18650518208742142,
|
|
"num_tokens": 6999174.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"entropy": 5.557963037490845,
|
|
"epoch": 0.29394812680115273,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499557218580945,
|
|
"loss": 5.5884,
|
|
"mean_token_accuracy": 0.17525261044502258,
|
|
"num_tokens": 7012148.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 5.486077213287354,
|
|
"epoch": 0.29442843419788667,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995550662114981,
|
|
"loss": 5.4609,
|
|
"mean_token_accuracy": 0.18215615749359132,
|
|
"num_tokens": 7023238.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"entropy": 5.561151647567749,
|
|
"epoch": 0.29490874159462055,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995529086285495,
|
|
"loss": 5.5521,
|
|
"mean_token_accuracy": 0.17758539766073228,
|
|
"num_tokens": 7034944.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 5.563313627243042,
|
|
"epoch": 0.2953890489913545,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499550745832149,
|
|
"loss": 5.4154,
|
|
"mean_token_accuracy": 0.18512072116136552,
|
|
"num_tokens": 7046880.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"entropy": 5.486554431915283,
|
|
"epoch": 0.29586935638808837,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004995485778223471,
|
|
"loss": 5.4866,
|
|
"mean_token_accuracy": 0.1800946146249771,
|
|
"num_tokens": 7057678.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 5.4739940643310545,
|
|
"epoch": 0.2963496637848223,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995464045991939,
|
|
"loss": 5.4688,
|
|
"mean_token_accuracy": 0.18641662895679473,
|
|
"num_tokens": 7068336.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"entropy": 5.588371753692627,
|
|
"epoch": 0.2968299711815562,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00049954422616274,
|
|
"loss": 5.5343,
|
|
"mean_token_accuracy": 0.17594826519489287,
|
|
"num_tokens": 7080341.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 5.6965454578399655,
|
|
"epoch": 0.2973102785782901,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995420425130359,
|
|
"loss": 5.6866,
|
|
"mean_token_accuracy": 0.17018966376781464,
|
|
"num_tokens": 7090618.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"entropy": 5.499913692474365,
|
|
"epoch": 0.297790585975024,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995398536501324,
|
|
"loss": 5.4331,
|
|
"mean_token_accuracy": 0.18785624653100969,
|
|
"num_tokens": 7101843.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 5.4791899681091305,
|
|
"epoch": 0.29827089337175794,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004995376595740801,
|
|
"loss": 5.5056,
|
|
"mean_token_accuracy": 0.18063082695007324,
|
|
"num_tokens": 7112014.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"entropy": 5.632973289489746,
|
|
"epoch": 0.2987512007684918,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995354602849302,
|
|
"loss": 5.5822,
|
|
"mean_token_accuracy": 0.17074308097362517,
|
|
"num_tokens": 7123860.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 5.571376514434815,
|
|
"epoch": 0.29923150816522576,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995332557827337,
|
|
"loss": 5.5564,
|
|
"mean_token_accuracy": 0.17600722908973693,
|
|
"num_tokens": 7135901.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"entropy": 5.5778998851776125,
|
|
"epoch": 0.29971181556195964,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995310460675416,
|
|
"loss": 5.5339,
|
|
"mean_token_accuracy": 0.1845734417438507,
|
|
"num_tokens": 7148743.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 5.589261770248413,
|
|
"epoch": 0.3001921229586936,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004995288311394053,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.18021756410598755,
|
|
"num_tokens": 7160731.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"entropy": 5.574976587295533,
|
|
"epoch": 0.30067243035542746,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004995266109983764,
|
|
"loss": 5.5617,
|
|
"mean_token_accuracy": 0.17890461087226867,
|
|
"num_tokens": 7172861.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 5.5695881843566895,
|
|
"epoch": 0.3011527377521614,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995243856445062,
|
|
"loss": 5.5087,
|
|
"mean_token_accuracy": 0.17425711154937745,
|
|
"num_tokens": 7183954.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"entropy": 5.523225164413452,
|
|
"epoch": 0.3016330451488953,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995221550778466,
|
|
"loss": 5.4793,
|
|
"mean_token_accuracy": 0.1828732267022133,
|
|
"num_tokens": 7195466.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 5.535993862152099,
|
|
"epoch": 0.3021133525456292,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995199192984491,
|
|
"loss": 5.4733,
|
|
"mean_token_accuracy": 0.18358256071805953,
|
|
"num_tokens": 7207173.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"entropy": 5.601380920410156,
|
|
"epoch": 0.3025936599423631,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995176783063657,
|
|
"loss": 5.6094,
|
|
"mean_token_accuracy": 0.17880836874246597,
|
|
"num_tokens": 7220095.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 5.5713316917419435,
|
|
"epoch": 0.30307396733909703,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995154321016487,
|
|
"loss": 5.5217,
|
|
"mean_token_accuracy": 0.18463317751884462,
|
|
"num_tokens": 7230664.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"entropy": 5.5087896347045895,
|
|
"epoch": 0.3035542747358309,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995131806843499,
|
|
"loss": 5.4837,
|
|
"mean_token_accuracy": 0.18419086784124375,
|
|
"num_tokens": 7241278.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 5.4533278465271,
|
|
"epoch": 0.30403458213256485,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995109240545218,
|
|
"loss": 5.6281,
|
|
"mean_token_accuracy": 0.1725993424654007,
|
|
"num_tokens": 7252999.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"entropy": 5.589286613464355,
|
|
"epoch": 0.3045148895292987,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995086622122167,
|
|
"loss": 5.4738,
|
|
"mean_token_accuracy": 0.17775996774435043,
|
|
"num_tokens": 7263949.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 5.558937978744507,
|
|
"epoch": 0.30499519692603266,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004995063951574871,
|
|
"loss": 5.5219,
|
|
"mean_token_accuracy": 0.18208030313253404,
|
|
"num_tokens": 7275467.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"entropy": 5.563764429092407,
|
|
"epoch": 0.30547550432276654,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004995041228903856,
|
|
"loss": 5.4858,
|
|
"mean_token_accuracy": 0.18617523461580276,
|
|
"num_tokens": 7285534.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 5.614857864379883,
|
|
"epoch": 0.3059558117195005,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499501845410965,
|
|
"loss": 5.5985,
|
|
"mean_token_accuracy": 0.18059034049510955,
|
|
"num_tokens": 7297252.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"entropy": 5.526304435729981,
|
|
"epoch": 0.30643611911623436,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004994995627192781,
|
|
"loss": 5.4686,
|
|
"mean_token_accuracy": 0.18378556221723558,
|
|
"num_tokens": 7308492.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 5.5130932331085205,
|
|
"epoch": 0.3069164265129683,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004994972748153781,
|
|
"loss": 5.5122,
|
|
"mean_token_accuracy": 0.18087892532348632,
|
|
"num_tokens": 7319703.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"entropy": 5.598230838775635,
|
|
"epoch": 0.30739673390970224,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499494981699318,
|
|
"loss": 5.4766,
|
|
"mean_token_accuracy": 0.18629593551158904,
|
|
"num_tokens": 7331022.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 5.5110736846923825,
|
|
"epoch": 0.3078770413064361,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499492683371151,
|
|
"loss": 5.5125,
|
|
"mean_token_accuracy": 0.18337176293134688,
|
|
"num_tokens": 7342977.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"entropy": 5.602800512313843,
|
|
"epoch": 0.30835734870317005,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004994903798309306,
|
|
"loss": 5.5087,
|
|
"mean_token_accuracy": 0.17746395766735076,
|
|
"num_tokens": 7353227.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 5.563166570663452,
|
|
"epoch": 0.30883765609990393,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994880710787102,
|
|
"loss": 5.5743,
|
|
"mean_token_accuracy": 0.1642255187034607,
|
|
"num_tokens": 7364165.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"entropy": 5.544680643081665,
|
|
"epoch": 0.30931796349663787,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004994857571145432,
|
|
"loss": 5.5023,
|
|
"mean_token_accuracy": 0.18458254784345626,
|
|
"num_tokens": 7374800.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 5.425434350967407,
|
|
"epoch": 0.30979827089337175,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004994834379384837,
|
|
"loss": 5.4565,
|
|
"mean_token_accuracy": 0.18336665779352188,
|
|
"num_tokens": 7386360.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"entropy": 5.552868223190307,
|
|
"epoch": 0.3102785782901057,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994811135505851,
|
|
"loss": 5.4698,
|
|
"mean_token_accuracy": 0.18341365456581116,
|
|
"num_tokens": 7397066.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 5.558938503265381,
|
|
"epoch": 0.31075888568683957,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994787839509018,
|
|
"loss": 5.564,
|
|
"mean_token_accuracy": 0.1713826075196266,
|
|
"num_tokens": 7408349.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"entropy": 5.5813216209411625,
|
|
"epoch": 0.3112391930835735,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004994764491394876,
|
|
"loss": 5.5886,
|
|
"mean_token_accuracy": 0.17263369262218475,
|
|
"num_tokens": 7420343.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 5.624362230300903,
|
|
"epoch": 0.3117195004803074,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004994741091163969,
|
|
"loss": 5.4904,
|
|
"mean_token_accuracy": 0.18449428975582122,
|
|
"num_tokens": 7431683.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"entropy": 5.41058030128479,
|
|
"epoch": 0.3121998078770413,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499471763881684,
|
|
"loss": 5.4083,
|
|
"mean_token_accuracy": 0.18659997135400772,
|
|
"num_tokens": 7443327.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 5.545905923843383,
|
|
"epoch": 0.3126801152737752,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004994694134354031,
|
|
"loss": 5.517,
|
|
"mean_token_accuracy": 0.18232496678829194,
|
|
"num_tokens": 7454002.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"entropy": 5.49485216140747,
|
|
"epoch": 0.31316042267050914,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499467057777609,
|
|
"loss": 5.5092,
|
|
"mean_token_accuracy": 0.18318750262260436,
|
|
"num_tokens": 7464074.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 5.470322179794311,
|
|
"epoch": 0.313640730067243,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004994646969083565,
|
|
"loss": 5.434,
|
|
"mean_token_accuracy": 0.1871152251958847,
|
|
"num_tokens": 7475543.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"entropy": 5.583432674407959,
|
|
"epoch": 0.31412103746397696,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004994623308277002,
|
|
"loss": 5.4947,
|
|
"mean_token_accuracy": 0.18215811550617217,
|
|
"num_tokens": 7486818.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 5.5460193157196045,
|
|
"epoch": 0.31460134486071084,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499459959535695,
|
|
"loss": 5.5431,
|
|
"mean_token_accuracy": 0.17775923311710357,
|
|
"num_tokens": 7499046.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"entropy": 5.530418539047242,
|
|
"epoch": 0.3150816522574448,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004994575830323962,
|
|
"loss": 5.4758,
|
|
"mean_token_accuracy": 0.1772423878312111,
|
|
"num_tokens": 7509853.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 5.422787761688232,
|
|
"epoch": 0.31556195965417866,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004994552013178586,
|
|
"loss": 5.3345,
|
|
"mean_token_accuracy": 0.1908559814095497,
|
|
"num_tokens": 7521091.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"entropy": 5.470391035079956,
|
|
"epoch": 0.3160422670509126,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499452814392138,
|
|
"loss": 5.4638,
|
|
"mean_token_accuracy": 0.19296756088733674,
|
|
"num_tokens": 7531317.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 5.550863265991211,
|
|
"epoch": 0.3165225744476465,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004994504222552894,
|
|
"loss": 5.6115,
|
|
"mean_token_accuracy": 0.17447966411709787,
|
|
"num_tokens": 7542822.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"entropy": 5.679572725296021,
|
|
"epoch": 0.3170028818443804,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994480249073684,
|
|
"loss": 5.5371,
|
|
"mean_token_accuracy": 0.17899394482374192,
|
|
"num_tokens": 7552434.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 5.455837345123291,
|
|
"epoch": 0.3174831892411143,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004994456223484308,
|
|
"loss": 5.412,
|
|
"mean_token_accuracy": 0.1847301483154297,
|
|
"num_tokens": 7563895.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"entropy": 5.356154918670654,
|
|
"epoch": 0.31796349663784823,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994432145785323,
|
|
"loss": 5.4431,
|
|
"mean_token_accuracy": 0.1852705791592598,
|
|
"num_tokens": 7575391.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 5.603661298751831,
|
|
"epoch": 0.3184438040345821,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994408015977288,
|
|
"loss": 5.5895,
|
|
"mean_token_accuracy": 0.18396379053592682,
|
|
"num_tokens": 7587119.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"entropy": 5.5791820049285885,
|
|
"epoch": 0.31892411143131605,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004994383834060764,
|
|
"loss": 5.5529,
|
|
"mean_token_accuracy": 0.17733592242002488,
|
|
"num_tokens": 7598615.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 5.522308588027954,
|
|
"epoch": 0.31940441882804993,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004994359600036311,
|
|
"loss": 5.5022,
|
|
"mean_token_accuracy": 0.18452920615673066,
|
|
"num_tokens": 7610159.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"entropy": 5.598204278945923,
|
|
"epoch": 0.31988472622478387,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004994335313904493,
|
|
"loss": 5.4916,
|
|
"mean_token_accuracy": 0.18418505936861038,
|
|
"num_tokens": 7620922.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 5.45703272819519,
|
|
"epoch": 0.32036503362151775,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004994310975665873,
|
|
"loss": 5.4117,
|
|
"mean_token_accuracy": 0.18754592537879944,
|
|
"num_tokens": 7632343.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"entropy": 5.619206094741822,
|
|
"epoch": 0.3208453410182517,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004994286585321017,
|
|
"loss": 5.6097,
|
|
"mean_token_accuracy": 0.1694990485906601,
|
|
"num_tokens": 7644748.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 5.595988607406616,
|
|
"epoch": 0.32132564841498557,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499426214287049,
|
|
"loss": 5.5649,
|
|
"mean_token_accuracy": 0.18684215247631072,
|
|
"num_tokens": 7655449.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"entropy": 5.522005844116211,
|
|
"epoch": 0.3218059558117195,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004994237648314862,
|
|
"loss": 5.5274,
|
|
"mean_token_accuracy": 0.18205100297927856,
|
|
"num_tokens": 7665623.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 5.492083740234375,
|
|
"epoch": 0.3222862632084534,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994213101654697,
|
|
"loss": 5.4173,
|
|
"mean_token_accuracy": 0.18764639347791673,
|
|
"num_tokens": 7676860.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"entropy": 5.5761909008026125,
|
|
"epoch": 0.3227665706051873,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499418850289057,
|
|
"loss": 5.603,
|
|
"mean_token_accuracy": 0.1757027193903923,
|
|
"num_tokens": 7687778.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 5.565295886993408,
|
|
"epoch": 0.32324687800192126,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994163852023048,
|
|
"loss": 5.4981,
|
|
"mean_token_accuracy": 0.18085954636335372,
|
|
"num_tokens": 7699154.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"entropy": 5.525069093704223,
|
|
"epoch": 0.32372718539865514,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994139149052706,
|
|
"loss": 5.5175,
|
|
"mean_token_accuracy": 0.18480815589427949,
|
|
"num_tokens": 7711010.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 5.576666164398193,
|
|
"epoch": 0.3242074927953891,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004994114393980117,
|
|
"loss": 5.538,
|
|
"mean_token_accuracy": 0.17918068915605545,
|
|
"num_tokens": 7721969.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"entropy": 5.561730909347534,
|
|
"epoch": 0.32468780019212296,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004994089586805856,
|
|
"loss": 5.4863,
|
|
"mean_token_accuracy": 0.1827893927693367,
|
|
"num_tokens": 7733762.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 5.549566268920898,
|
|
"epoch": 0.3251681075888569,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004994064727530496,
|
|
"loss": 5.4963,
|
|
"mean_token_accuracy": 0.17758472561836242,
|
|
"num_tokens": 7744614.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"entropy": 5.498316717147827,
|
|
"epoch": 0.3256484149855908,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004994039816154618,
|
|
"loss": 5.4339,
|
|
"mean_token_accuracy": 0.18473347425460815,
|
|
"num_tokens": 7755799.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 5.455300903320312,
|
|
"epoch": 0.3261287223823247,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00049940148526788,
|
|
"loss": 5.4848,
|
|
"mean_token_accuracy": 0.18304541558027268,
|
|
"num_tokens": 7768140.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"entropy": 5.568225574493408,
|
|
"epoch": 0.3266090297790586,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993989837103618,
|
|
"loss": 5.4898,
|
|
"mean_token_accuracy": 0.1791609227657318,
|
|
"num_tokens": 7778494.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 5.607134199142456,
|
|
"epoch": 0.3270893371757925,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993964769429657,
|
|
"loss": 5.5675,
|
|
"mean_token_accuracy": 0.18318891525268555,
|
|
"num_tokens": 7789234.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"entropy": 5.541140413284301,
|
|
"epoch": 0.3275696445725264,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004993939649657498,
|
|
"loss": 5.548,
|
|
"mean_token_accuracy": 0.18319968730211258,
|
|
"num_tokens": 7800602.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 5.469655227661133,
|
|
"epoch": 0.32804995196926034,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004993914477787721,
|
|
"loss": 5.3674,
|
|
"mean_token_accuracy": 0.1912238970398903,
|
|
"num_tokens": 7812803.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"entropy": 5.625386571884155,
|
|
"epoch": 0.3285302593659942,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993889253820915,
|
|
"loss": 5.6669,
|
|
"mean_token_accuracy": 0.16849727183580399,
|
|
"num_tokens": 7825432.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 5.567583656311035,
|
|
"epoch": 0.32901056676272816,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993863977757663,
|
|
"loss": 5.4819,
|
|
"mean_token_accuracy": 0.18198901265859604,
|
|
"num_tokens": 7837258.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"entropy": 5.42762131690979,
|
|
"epoch": 0.32949087415946204,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993838649598552,
|
|
"loss": 5.3739,
|
|
"mean_token_accuracy": 0.1897459015250206,
|
|
"num_tokens": 7847573.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 5.551398038864136,
|
|
"epoch": 0.329971181556196,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993813269344171,
|
|
"loss": 5.4969,
|
|
"mean_token_accuracy": 0.17690201252698898,
|
|
"num_tokens": 7857957.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"entropy": 5.5013957023620605,
|
|
"epoch": 0.33045148895292986,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993787836995108,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.1926833838224411,
|
|
"num_tokens": 7867996.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 5.446499681472778,
|
|
"epoch": 0.3309317963496638,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993762352551954,
|
|
"loss": 5.4766,
|
|
"mean_token_accuracy": 0.1805843397974968,
|
|
"num_tokens": 7879245.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"entropy": 5.61943678855896,
|
|
"epoch": 0.3314121037463977,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004993736816015301,
|
|
"loss": 5.5669,
|
|
"mean_token_accuracy": 0.17582879960536957,
|
|
"num_tokens": 7891186.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 5.609936046600342,
|
|
"epoch": 0.3318924111431316,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004993711227385742,
|
|
"loss": 5.5802,
|
|
"mean_token_accuracy": 0.1823540985584259,
|
|
"num_tokens": 7902231.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"entropy": 5.523345851898194,
|
|
"epoch": 0.3323727185398655,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993685586663871,
|
|
"loss": 5.5412,
|
|
"mean_token_accuracy": 0.18139662891626357,
|
|
"num_tokens": 7913364.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 5.735165405273437,
|
|
"epoch": 0.33285302593659943,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993659893850281,
|
|
"loss": 5.7308,
|
|
"mean_token_accuracy": 0.16727230101823806,
|
|
"num_tokens": 7925217.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"entropy": 5.506084823608399,
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004993634148945573,
|
|
"loss": 5.4639,
|
|
"mean_token_accuracy": 0.17894653379917144,
|
|
"num_tokens": 7937636.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 5.5272363185882565,
|
|
"epoch": 0.33381364073006725,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993608351950341,
|
|
"loss": 5.4896,
|
|
"mean_token_accuracy": 0.17503666803240775,
|
|
"num_tokens": 7948958.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"entropy": 5.620566320419312,
|
|
"epoch": 0.33429394812680113,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004993582502865185,
|
|
"loss": 5.5323,
|
|
"mean_token_accuracy": 0.18402974754571916,
|
|
"num_tokens": 7960013.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 5.462809419631958,
|
|
"epoch": 0.33477425552353507,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993556601690706,
|
|
"loss": 5.5416,
|
|
"mean_token_accuracy": 0.17792800366878508,
|
|
"num_tokens": 7971041.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"entropy": 5.618744802474976,
|
|
"epoch": 0.33525456292026895,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993530648427505,
|
|
"loss": 5.576,
|
|
"mean_token_accuracy": 0.1723045140504837,
|
|
"num_tokens": 7982752.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 5.599891996383667,
|
|
"epoch": 0.3357348703170029,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004993504643076184,
|
|
"loss": 5.4278,
|
|
"mean_token_accuracy": 0.18250093311071397,
|
|
"num_tokens": 7993681.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"entropy": 5.470984411239624,
|
|
"epoch": 0.33621517771373677,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993478585637347,
|
|
"loss": 5.4781,
|
|
"mean_token_accuracy": 0.18258391320705414,
|
|
"num_tokens": 8004727.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 5.505999660491943,
|
|
"epoch": 0.3366954851104707,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004993452476111599,
|
|
"loss": 5.4797,
|
|
"mean_token_accuracy": 0.18967788219451903,
|
|
"num_tokens": 8015423.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"entropy": 5.512713193893433,
|
|
"epoch": 0.3371757925072046,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004993426314499546,
|
|
"loss": 5.4536,
|
|
"mean_token_accuracy": 0.18748492896556854,
|
|
"num_tokens": 8027911.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 5.572777605056762,
|
|
"epoch": 0.3376560999039385,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004993400100801796,
|
|
"loss": 5.4747,
|
|
"mean_token_accuracy": 0.1818804770708084,
|
|
"num_tokens": 8038831.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"entropy": 5.392134952545166,
|
|
"epoch": 0.3381364073006724,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993373835018956,
|
|
"loss": 5.3718,
|
|
"mean_token_accuracy": 0.18957587629556655,
|
|
"num_tokens": 8049906.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 5.393214273452759,
|
|
"epoch": 0.33861671469740634,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004993347517151638,
|
|
"loss": 5.469,
|
|
"mean_token_accuracy": 0.18386447727680205,
|
|
"num_tokens": 8061158.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"entropy": 5.6083544254302975,
|
|
"epoch": 0.3390970220941403,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993321147200452,
|
|
"loss": 5.4326,
|
|
"mean_token_accuracy": 0.181746444106102,
|
|
"num_tokens": 8071958.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 5.465584182739258,
|
|
"epoch": 0.33957732949087416,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000499329472516601,
|
|
"loss": 5.4294,
|
|
"mean_token_accuracy": 0.17608542144298553,
|
|
"num_tokens": 8084068.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"entropy": 5.410733461380005,
|
|
"epoch": 0.3400576368876081,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004993268251048925,
|
|
"loss": 5.3472,
|
|
"mean_token_accuracy": 0.19578494429588317,
|
|
"num_tokens": 8096132.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 5.503920364379883,
|
|
"epoch": 0.340537944284342,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004993241724849814,
|
|
"loss": 5.5102,
|
|
"mean_token_accuracy": 0.18362511545419694,
|
|
"num_tokens": 8107327.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"entropy": 5.497963953018188,
|
|
"epoch": 0.3410182516810759,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499321514656929,
|
|
"loss": 5.4779,
|
|
"mean_token_accuracy": 0.18374822586774825,
|
|
"num_tokens": 8118584.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 5.550964641571045,
|
|
"epoch": 0.3414985590778098,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993188516207972,
|
|
"loss": 5.5337,
|
|
"mean_token_accuracy": 0.1793607845902443,
|
|
"num_tokens": 8130081.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"entropy": 5.507245492935181,
|
|
"epoch": 0.34197886647454373,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004993161833766478,
|
|
"loss": 5.4932,
|
|
"mean_token_accuracy": 0.1838148668408394,
|
|
"num_tokens": 8141463.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 5.541257572174072,
|
|
"epoch": 0.3424591738712776,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993135099245426,
|
|
"loss": 5.5042,
|
|
"mean_token_accuracy": 0.17985130697488785,
|
|
"num_tokens": 8153863.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"entropy": 5.428792333602905,
|
|
"epoch": 0.34293948126801155,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993108312645438,
|
|
"loss": 5.463,
|
|
"mean_token_accuracy": 0.18102106750011443,
|
|
"num_tokens": 8165695.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 5.5374926090240475,
|
|
"epoch": 0.34341978866474543,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004993081473967135,
|
|
"loss": 5.5119,
|
|
"mean_token_accuracy": 0.18098655641078948,
|
|
"num_tokens": 8176456.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"entropy": 5.58543210029602,
|
|
"epoch": 0.34390009606147937,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004993054583211143,
|
|
"loss": 5.5092,
|
|
"mean_token_accuracy": 0.1822955548763275,
|
|
"num_tokens": 8189050.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 5.43015308380127,
|
|
"epoch": 0.34438040345821325,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004993027640378081,
|
|
"loss": 5.4081,
|
|
"mean_token_accuracy": 0.185765840113163,
|
|
"num_tokens": 8200011.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"entropy": 5.474026918411255,
|
|
"epoch": 0.3448607108549472,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499300064546858,
|
|
"loss": 5.4183,
|
|
"mean_token_accuracy": 0.1868817389011383,
|
|
"num_tokens": 8211770.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 5.55191330909729,
|
|
"epoch": 0.34534101825168106,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992973598483264,
|
|
"loss": 5.4638,
|
|
"mean_token_accuracy": 0.18688549250364303,
|
|
"num_tokens": 8223582.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"entropy": 5.575275611877442,
|
|
"epoch": 0.345821325648415,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499294649942276,
|
|
"loss": 5.5846,
|
|
"mean_token_accuracy": 0.1825041502714157,
|
|
"num_tokens": 8234336.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 5.547464847564697,
|
|
"epoch": 0.3463016330451489,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004992919348287699,
|
|
"loss": 5.4941,
|
|
"mean_token_accuracy": 0.18366153985261918,
|
|
"num_tokens": 8244605.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"entropy": 5.5259942531585695,
|
|
"epoch": 0.3467819404418828,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004992892145078711,
|
|
"loss": 5.5254,
|
|
"mean_token_accuracy": 0.17931086868047713,
|
|
"num_tokens": 8255876.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 5.4697678565979,
|
|
"epoch": 0.3472622478386167,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004992864889796427,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.18721913993358613,
|
|
"num_tokens": 8266602.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"entropy": 5.546818780899048,
|
|
"epoch": 0.34774255523535064,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004992837582441481,
|
|
"loss": 5.4216,
|
|
"mean_token_accuracy": 0.18347607105970382,
|
|
"num_tokens": 8279804.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 5.569514989852905,
|
|
"epoch": 0.3482228626320845,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004992810223014506,
|
|
"loss": 5.5242,
|
|
"mean_token_accuracy": 0.1833881989121437,
|
|
"num_tokens": 8291020.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"entropy": 5.5203827857971195,
|
|
"epoch": 0.34870317002881845,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004992782811516137,
|
|
"loss": 5.4727,
|
|
"mean_token_accuracy": 0.18729409873485564,
|
|
"num_tokens": 8302192.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 5.496627855300903,
|
|
"epoch": 0.34918347742555234,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004992755347947011,
|
|
"loss": 5.4324,
|
|
"mean_token_accuracy": 0.18265776634216307,
|
|
"num_tokens": 8313649.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"entropy": 5.44870662689209,
|
|
"epoch": 0.34966378482228627,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004992727832307766,
|
|
"loss": 5.4304,
|
|
"mean_token_accuracy": 0.18587879687547684,
|
|
"num_tokens": 8324694.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 5.604543972015381,
|
|
"epoch": 0.35014409221902015,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004992700264599039,
|
|
"loss": 5.594,
|
|
"mean_token_accuracy": 0.1727964922785759,
|
|
"num_tokens": 8336517.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"entropy": 5.540855789184571,
|
|
"epoch": 0.3506243996157541,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004992672644821473,
|
|
"loss": 5.5425,
|
|
"mean_token_accuracy": 0.1779757022857666,
|
|
"num_tokens": 8349001.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 5.5626523971557615,
|
|
"epoch": 0.35110470701248797,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004992644972975707,
|
|
"loss": 5.4537,
|
|
"mean_token_accuracy": 0.1864044651389122,
|
|
"num_tokens": 8361230.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"entropy": 5.394788694381714,
|
|
"epoch": 0.3515850144092219,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992617249062383,
|
|
"loss": 5.3924,
|
|
"mean_token_accuracy": 0.19216873198747636,
|
|
"num_tokens": 8372159.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 5.543751049041748,
|
|
"epoch": 0.3520653218059558,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004992589473082147,
|
|
"loss": 5.5214,
|
|
"mean_token_accuracy": 0.18608528524637222,
|
|
"num_tokens": 8383228.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"entropy": 5.509809923171997,
|
|
"epoch": 0.3525456292026897,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004992561645035641,
|
|
"loss": 5.4561,
|
|
"mean_token_accuracy": 0.18168068826198577,
|
|
"num_tokens": 8394582.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 5.514116191864014,
|
|
"epoch": 0.3530259365994236,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004992533764923515,
|
|
"loss": 5.4481,
|
|
"mean_token_accuracy": 0.18126334249973297,
|
|
"num_tokens": 8406784.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"entropy": 5.483726072311401,
|
|
"epoch": 0.35350624399615754,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992505832746412,
|
|
"loss": 5.4286,
|
|
"mean_token_accuracy": 0.19101243019104003,
|
|
"num_tokens": 8418405.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 5.5265562534332275,
|
|
"epoch": 0.3539865513928914,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004992477848504983,
|
|
"loss": 5.392,
|
|
"mean_token_accuracy": 0.18716304898262023,
|
|
"num_tokens": 8430432.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"entropy": 5.479315328598022,
|
|
"epoch": 0.35446685878962536,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004992449812199877,
|
|
"loss": 5.5635,
|
|
"mean_token_accuracy": 0.17799893915653228,
|
|
"num_tokens": 8442423.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 5.518668079376221,
|
|
"epoch": 0.3549471661863593,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004992421723831745,
|
|
"loss": 5.546,
|
|
"mean_token_accuracy": 0.1842621758580208,
|
|
"num_tokens": 8454951.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"entropy": 5.520323848724365,
|
|
"epoch": 0.3554274735830932,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004992393583401239,
|
|
"loss": 5.4033,
|
|
"mean_token_accuracy": 0.18851898312568666,
|
|
"num_tokens": 8467758.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 5.475191354751587,
|
|
"epoch": 0.3559077809798271,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004992365390909014,
|
|
"loss": 5.4854,
|
|
"mean_token_accuracy": 0.17992179691791535,
|
|
"num_tokens": 8479728.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"entropy": 5.535838651657104,
|
|
"epoch": 0.356388088376561,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004992337146355721,
|
|
"loss": 5.552,
|
|
"mean_token_accuracy": 0.17727553099393845,
|
|
"num_tokens": 8492099.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 5.610863542556762,
|
|
"epoch": 0.35686839577329493,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992308849742019,
|
|
"loss": 5.4819,
|
|
"mean_token_accuracy": 0.17355056405067443,
|
|
"num_tokens": 8504657.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"entropy": 5.48232364654541,
|
|
"epoch": 0.3573487031700288,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004992280501068563,
|
|
"loss": 5.4509,
|
|
"mean_token_accuracy": 0.18914830237627028,
|
|
"num_tokens": 8514728.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 5.528886175155639,
|
|
"epoch": 0.35782901056676275,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004992252100336012,
|
|
"loss": 5.581,
|
|
"mean_token_accuracy": 0.1833130970597267,
|
|
"num_tokens": 8525588.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"entropy": 5.540911626815796,
|
|
"epoch": 0.35830931796349663,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992223647545027,
|
|
"loss": 5.527,
|
|
"mean_token_accuracy": 0.18297800421714783,
|
|
"num_tokens": 8537468.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 5.5527503490448,
|
|
"epoch": 0.35878962536023057,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004992195142696266,
|
|
"loss": 5.438,
|
|
"mean_token_accuracy": 0.18914629518985748,
|
|
"num_tokens": 8548598.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"entropy": 5.33068585395813,
|
|
"epoch": 0.35926993275696445,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004992166585790391,
|
|
"loss": 5.3396,
|
|
"mean_token_accuracy": 0.19562919437885284,
|
|
"num_tokens": 8560301.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 5.483434391021729,
|
|
"epoch": 0.3597502401536984,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992137976828067,
|
|
"loss": 5.4516,
|
|
"mean_token_accuracy": 0.18603197634220123,
|
|
"num_tokens": 8571186.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"entropy": 5.484015607833863,
|
|
"epoch": 0.36023054755043227,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992109315809955,
|
|
"loss": 5.4383,
|
|
"mean_token_accuracy": 0.18905191421508788,
|
|
"num_tokens": 8580725.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 5.519361686706543,
|
|
"epoch": 0.3607108549471662,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004992080602736725,
|
|
"loss": 5.5532,
|
|
"mean_token_accuracy": 0.1773756206035614,
|
|
"num_tokens": 8594598.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"entropy": 5.643574905395508,
|
|
"epoch": 0.3611911623439001,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004992051837609039,
|
|
"loss": 5.5404,
|
|
"mean_token_accuracy": 0.17730522602796556,
|
|
"num_tokens": 8606733.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 5.508514451980591,
|
|
"epoch": 0.361671469740634,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004992023020427568,
|
|
"loss": 5.4788,
|
|
"mean_token_accuracy": 0.18672696501016617,
|
|
"num_tokens": 8618863.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"entropy": 5.3892511367797855,
|
|
"epoch": 0.3621517771373679,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004991994151192979,
|
|
"loss": 5.3304,
|
|
"mean_token_accuracy": 0.18849435597658157,
|
|
"num_tokens": 8629270.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 5.4767759323120115,
|
|
"epoch": 0.36263208453410184,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991965229905943,
|
|
"loss": 5.5364,
|
|
"mean_token_accuracy": 0.18494855612516403,
|
|
"num_tokens": 8641363.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"entropy": 5.6278270244598385,
|
|
"epoch": 0.3631123919308357,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004991936256567133,
|
|
"loss": 5.4992,
|
|
"mean_token_accuracy": 0.18451761305332184,
|
|
"num_tokens": 8653233.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 5.4851010799407955,
|
|
"epoch": 0.36359269932756966,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.000499190723117722,
|
|
"loss": 5.487,
|
|
"mean_token_accuracy": 0.17836329340934753,
|
|
"num_tokens": 8665192.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"entropy": 5.579302835464477,
|
|
"epoch": 0.36407300672430354,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991878153736877,
|
|
"loss": 5.5583,
|
|
"mean_token_accuracy": 0.17446503937244415,
|
|
"num_tokens": 8677669.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 5.419927787780762,
|
|
"epoch": 0.3645533141210375,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991849024246781,
|
|
"loss": 5.3676,
|
|
"mean_token_accuracy": 0.18973670154809952,
|
|
"num_tokens": 8688002.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"entropy": 5.438193988800049,
|
|
"epoch": 0.36503362151777136,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004991819842707608,
|
|
"loss": 5.4133,
|
|
"mean_token_accuracy": 0.18962489068508148,
|
|
"num_tokens": 8698396.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 5.543167686462402,
|
|
"epoch": 0.3655139289145053,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991790609120035,
|
|
"loss": 5.4297,
|
|
"mean_token_accuracy": 0.18700562715530394,
|
|
"num_tokens": 8711135.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"entropy": 5.469641494750976,
|
|
"epoch": 0.3659942363112392,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499176132348474,
|
|
"loss": 5.4735,
|
|
"mean_token_accuracy": 0.1897922232747078,
|
|
"num_tokens": 8723707.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 5.582857084274292,
|
|
"epoch": 0.3664745437079731,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004991731985802405,
|
|
"loss": 5.4338,
|
|
"mean_token_accuracy": 0.18693850934505463,
|
|
"num_tokens": 8734193.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"entropy": 5.444149160385132,
|
|
"epoch": 0.366954851104707,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991702596073708,
|
|
"loss": 5.4841,
|
|
"mean_token_accuracy": 0.18134361505508423,
|
|
"num_tokens": 8745619.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 5.426347923278809,
|
|
"epoch": 0.36743515850144093,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991673154299335,
|
|
"loss": 5.4231,
|
|
"mean_token_accuracy": 0.18122087568044662,
|
|
"num_tokens": 8757331.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"entropy": 5.515204238891601,
|
|
"epoch": 0.3679154658981748,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004991643660479967,
|
|
"loss": 5.428,
|
|
"mean_token_accuracy": 0.1868494287133217,
|
|
"num_tokens": 8768840.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 5.460073804855346,
|
|
"epoch": 0.36839577329490875,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004991614114616289,
|
|
"loss": 5.3818,
|
|
"mean_token_accuracy": 0.18779707103967666,
|
|
"num_tokens": 8781214.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"entropy": 5.510246324539184,
|
|
"epoch": 0.3688760806916426,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004991584516708988,
|
|
"loss": 5.4477,
|
|
"mean_token_accuracy": 0.18548956960439683,
|
|
"num_tokens": 8791645.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 5.5942995071411135,
|
|
"epoch": 0.36935638808837656,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004991554866758751,
|
|
"loss": 5.6333,
|
|
"mean_token_accuracy": 0.1739022307097912,
|
|
"num_tokens": 8803286.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"entropy": 5.493673467636109,
|
|
"epoch": 0.36983669548511044,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991525164766265,
|
|
"loss": 5.4163,
|
|
"mean_token_accuracy": 0.1872221603989601,
|
|
"num_tokens": 8814207.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 5.503255462646484,
|
|
"epoch": 0.3703170028818444,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004991495410732222,
|
|
"loss": 5.4683,
|
|
"mean_token_accuracy": 0.17725101560354234,
|
|
"num_tokens": 8825540.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"entropy": 5.5069482803344725,
|
|
"epoch": 0.37079731027857826,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991465604657311,
|
|
"loss": 5.5937,
|
|
"mean_token_accuracy": 0.17322031259536744,
|
|
"num_tokens": 8838182.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 5.526088094711303,
|
|
"epoch": 0.3712776176753122,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991435746542224,
|
|
"loss": 5.4654,
|
|
"mean_token_accuracy": 0.18988653868436814,
|
|
"num_tokens": 8850211.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"entropy": 5.439452648162842,
|
|
"epoch": 0.37175792507204614,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004991405836387655,
|
|
"loss": 5.5032,
|
|
"mean_token_accuracy": 0.18108827471733094,
|
|
"num_tokens": 8862804.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 5.529762125015258,
|
|
"epoch": 0.37223823246878,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004991375874194298,
|
|
"loss": 5.4602,
|
|
"mean_token_accuracy": 0.17960784435272217,
|
|
"num_tokens": 8874112.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"entropy": 5.469674205780029,
|
|
"epoch": 0.37271853986551395,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499134585996285,
|
|
"loss": 5.477,
|
|
"mean_token_accuracy": 0.18614101260900498,
|
|
"num_tokens": 8885114.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 5.554774141311645,
|
|
"epoch": 0.37319884726224783,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004991315793694004,
|
|
"loss": 5.3691,
|
|
"mean_token_accuracy": 0.18807282894849778,
|
|
"num_tokens": 8895555.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"entropy": 5.405085754394531,
|
|
"epoch": 0.37367915465898177,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004991285675388463,
|
|
"loss": 5.3765,
|
|
"mean_token_accuracy": 0.19634046405553818,
|
|
"num_tokens": 8906073.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 5.501630163192749,
|
|
"epoch": 0.37415946205571565,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004991255505046922,
|
|
"loss": 5.5188,
|
|
"mean_token_accuracy": 0.1789945885539055,
|
|
"num_tokens": 8916587.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"entropy": 5.550557231903076,
|
|
"epoch": 0.3746397694524496,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991225282670083,
|
|
"loss": 5.4113,
|
|
"mean_token_accuracy": 0.1861289381980896,
|
|
"num_tokens": 8927923.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 5.382868242263794,
|
|
"epoch": 0.37512007684918347,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499119500825865,
|
|
"loss": 5.4579,
|
|
"mean_token_accuracy": 0.18377629071474075,
|
|
"num_tokens": 8939939.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"entropy": 5.397466945648193,
|
|
"epoch": 0.3756003842459174,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004991164681813323,
|
|
"loss": 5.4378,
|
|
"mean_token_accuracy": 0.19209783971309663,
|
|
"num_tokens": 8951748.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 5.485667037963867,
|
|
"epoch": 0.3760806916426513,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004991134303334807,
|
|
"loss": 5.3588,
|
|
"mean_token_accuracy": 0.19007459729909898,
|
|
"num_tokens": 8962922.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"entropy": 5.372178030014038,
|
|
"epoch": 0.3765609990393852,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004991103872823807,
|
|
"loss": 5.3442,
|
|
"mean_token_accuracy": 0.19452154785394668,
|
|
"num_tokens": 8974013.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 5.436591958999633,
|
|
"epoch": 0.3770413064361191,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499107339028103,
|
|
"loss": 5.4262,
|
|
"mean_token_accuracy": 0.18169266134500503,
|
|
"num_tokens": 8986032.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"entropy": 5.542058515548706,
|
|
"epoch": 0.37752161383285304,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991042855707184,
|
|
"loss": 5.4187,
|
|
"mean_token_accuracy": 0.1796349912881851,
|
|
"num_tokens": 8996889.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 5.436617517471314,
|
|
"epoch": 0.3780019212295869,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991012269102977,
|
|
"loss": 5.3992,
|
|
"mean_token_accuracy": 0.18429471999406816,
|
|
"num_tokens": 9007594.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"entropy": 5.426474618911743,
|
|
"epoch": 0.37848222862632086,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004990981630469119,
|
|
"loss": 5.402,
|
|
"mean_token_accuracy": 0.18193352967500687,
|
|
"num_tokens": 9018097.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 5.5093968391418455,
|
|
"epoch": 0.37896253602305474,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004990950939806323,
|
|
"loss": 5.5113,
|
|
"mean_token_accuracy": 0.18117111474275588,
|
|
"num_tokens": 9029554.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"entropy": 5.489337825775147,
|
|
"epoch": 0.3794428434197887,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00049909201971153,
|
|
"loss": 5.3772,
|
|
"mean_token_accuracy": 0.1829820305109024,
|
|
"num_tokens": 9042518.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 5.421378660202026,
|
|
"epoch": 0.37992315081652256,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004990889402396763,
|
|
"loss": 5.4316,
|
|
"mean_token_accuracy": 0.18639881759881974,
|
|
"num_tokens": 9054524.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"entropy": 5.510490798950196,
|
|
"epoch": 0.3804034582132565,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004990858555651431,
|
|
"loss": 5.4016,
|
|
"mean_token_accuracy": 0.18468015938997268,
|
|
"num_tokens": 9065375.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 5.44808177947998,
|
|
"epoch": 0.3808837656099904,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990827656880015,
|
|
"loss": 5.3509,
|
|
"mean_token_accuracy": 0.1859322890639305,
|
|
"num_tokens": 9076338.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"entropy": 5.432799911499023,
|
|
"epoch": 0.3813640730067243,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004990796706083235,
|
|
"loss": 5.4011,
|
|
"mean_token_accuracy": 0.18659975230693818,
|
|
"num_tokens": 9088407.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 5.426470470428467,
|
|
"epoch": 0.3818443804034582,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004990765703261809,
|
|
"loss": 5.3649,
|
|
"mean_token_accuracy": 0.18807975053787232,
|
|
"num_tokens": 9099833.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"entropy": 5.350304222106933,
|
|
"epoch": 0.38232468780019213,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004990734648416458,
|
|
"loss": 5.3388,
|
|
"mean_token_accuracy": 0.189335997402668,
|
|
"num_tokens": 9111126.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 5.505539417266846,
|
|
"epoch": 0.382804995196926,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004990703541547901,
|
|
"loss": 5.4548,
|
|
"mean_token_accuracy": 0.1886373370885849,
|
|
"num_tokens": 9121979.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"entropy": 5.520917081832886,
|
|
"epoch": 0.38328530259365995,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990672382656863,
|
|
"loss": 5.4535,
|
|
"mean_token_accuracy": 0.18644375950098038,
|
|
"num_tokens": 9132929.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 5.485851383209228,
|
|
"epoch": 0.38376560999039383,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004990641171744064,
|
|
"loss": 5.4111,
|
|
"mean_token_accuracy": 0.1882080391049385,
|
|
"num_tokens": 9143903.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"entropy": 5.495297384262085,
|
|
"epoch": 0.38424591738712777,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004990609908810231,
|
|
"loss": 5.5045,
|
|
"mean_token_accuracy": 0.18192221075296403,
|
|
"num_tokens": 9154416.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 5.513756942749024,
|
|
"epoch": 0.38472622478386165,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004990578593856089,
|
|
"loss": 5.4805,
|
|
"mean_token_accuracy": 0.18242392241954802,
|
|
"num_tokens": 9165613.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"entropy": 5.4664655208587645,
|
|
"epoch": 0.3852065321805956,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004990547226882366,
|
|
"loss": 5.433,
|
|
"mean_token_accuracy": 0.18787842243909836,
|
|
"num_tokens": 9177884.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 5.5449103832244875,
|
|
"epoch": 0.38568683957732947,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004990515807889788,
|
|
"loss": 5.5669,
|
|
"mean_token_accuracy": 0.17467134743928908,
|
|
"num_tokens": 9190041.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"entropy": 5.556881046295166,
|
|
"epoch": 0.3861671469740634,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004990484336879087,
|
|
"loss": 5.4402,
|
|
"mean_token_accuracy": 0.18740091025829314,
|
|
"num_tokens": 9202390.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 5.409300327301025,
|
|
"epoch": 0.3866474543707973,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004990452813850992,
|
|
"loss": 5.4373,
|
|
"mean_token_accuracy": 0.18635576069355012,
|
|
"num_tokens": 9213437.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"entropy": 5.554971408843994,
|
|
"epoch": 0.3871277617675312,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004990421238806236,
|
|
"loss": 5.517,
|
|
"mean_token_accuracy": 0.17564513981342317,
|
|
"num_tokens": 9226310.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 5.530429458618164,
|
|
"epoch": 0.38760806916426516,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004990389611745551,
|
|
"loss": 5.4495,
|
|
"mean_token_accuracy": 0.1819504901766777,
|
|
"num_tokens": 9236271.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"entropy": 5.516104078292846,
|
|
"epoch": 0.38808837656099904,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004990357932669672,
|
|
"loss": 5.5245,
|
|
"mean_token_accuracy": 0.18500009030103684,
|
|
"num_tokens": 9247755.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 5.464123487472534,
|
|
"epoch": 0.388568683957733,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004990326201579335,
|
|
"loss": 5.361,
|
|
"mean_token_accuracy": 0.19129124879837037,
|
|
"num_tokens": 9259821.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"entropy": 5.4668073654174805,
|
|
"epoch": 0.38904899135446686,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004990294418475274,
|
|
"loss": 5.4631,
|
|
"mean_token_accuracy": 0.18641964942216874,
|
|
"num_tokens": 9270663.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 5.465627670288086,
|
|
"epoch": 0.3895292987512008,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004990262583358231,
|
|
"loss": 5.4879,
|
|
"mean_token_accuracy": 0.17998379915952684,
|
|
"num_tokens": 9282588.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"entropy": 5.510502290725708,
|
|
"epoch": 0.3900096061479347,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004990230696228943,
|
|
"loss": 5.4397,
|
|
"mean_token_accuracy": 0.17829088270664215,
|
|
"num_tokens": 9293368.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 5.477728748321534,
|
|
"epoch": 0.3904899135446686,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004990198757088149,
|
|
"loss": 5.5128,
|
|
"mean_token_accuracy": 0.1811017781496048,
|
|
"num_tokens": 9305962.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"entropy": 5.508330774307251,
|
|
"epoch": 0.3909702209414025,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004990166765936593,
|
|
"loss": 5.393,
|
|
"mean_token_accuracy": 0.19244694262742995,
|
|
"num_tokens": 9317955.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 5.450256824493408,
|
|
"epoch": 0.3914505283381364,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004990134722775016,
|
|
"loss": 5.3934,
|
|
"mean_token_accuracy": 0.19047792106866837,
|
|
"num_tokens": 9329491.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"entropy": 5.451663637161255,
|
|
"epoch": 0.3919308357348703,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004990102627604162,
|
|
"loss": 5.5273,
|
|
"mean_token_accuracy": 0.19028781056404115,
|
|
"num_tokens": 9341612.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 5.524235773086548,
|
|
"epoch": 0.39241114313160425,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004990070480424778,
|
|
"loss": 5.458,
|
|
"mean_token_accuracy": 0.18043633103370665,
|
|
"num_tokens": 9352302.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"entropy": 5.440912199020386,
|
|
"epoch": 0.3928914505283381,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004990038281237608,
|
|
"loss": 5.3919,
|
|
"mean_token_accuracy": 0.1852226436138153,
|
|
"num_tokens": 9363303.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 5.433840227127075,
|
|
"epoch": 0.39337175792507206,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004990006030043401,
|
|
"loss": 5.3732,
|
|
"mean_token_accuracy": 0.1849522888660431,
|
|
"num_tokens": 9375878.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"entropy": 5.470492124557495,
|
|
"epoch": 0.39385206532180594,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004989973726842906,
|
|
"loss": 5.4145,
|
|
"mean_token_accuracy": 0.18103147149086,
|
|
"num_tokens": 9388342.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 5.44459342956543,
|
|
"epoch": 0.3943323727185399,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004989941371636872,
|
|
"loss": 5.3549,
|
|
"mean_token_accuracy": 0.1901955187320709,
|
|
"num_tokens": 9399047.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"entropy": 5.449139881134033,
|
|
"epoch": 0.39481268011527376,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004989908964426051,
|
|
"loss": 5.4342,
|
|
"mean_token_accuracy": 0.18933464139699935,
|
|
"num_tokens": 9410172.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 5.547493505477905,
|
|
"epoch": 0.3952929875120077,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004989876505211194,
|
|
"loss": 5.5794,
|
|
"mean_token_accuracy": 0.17717085629701615,
|
|
"num_tokens": 9422287.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"entropy": 5.5754584789276125,
|
|
"epoch": 0.3957732949087416,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004989843993993056,
|
|
"loss": 5.44,
|
|
"mean_token_accuracy": 0.18759053498506545,
|
|
"num_tokens": 9433709.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 5.341240167617798,
|
|
"epoch": 0.3962536023054755,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004989811430772392,
|
|
"loss": 5.3199,
|
|
"mean_token_accuracy": 0.189169280230999,
|
|
"num_tokens": 9445138.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"entropy": 5.4137170791625975,
|
|
"epoch": 0.3967339097022094,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004989778815549957,
|
|
"loss": 5.4579,
|
|
"mean_token_accuracy": 0.1827932521700859,
|
|
"num_tokens": 9455263.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 5.533003664016723,
|
|
"epoch": 0.39721421709894333,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004989746148326508,
|
|
"loss": 5.4184,
|
|
"mean_token_accuracy": 0.18644048422574996,
|
|
"num_tokens": 9465491.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"entropy": 5.372505331039429,
|
|
"epoch": 0.3976945244956772,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004989713429102805,
|
|
"loss": 5.3821,
|
|
"mean_token_accuracy": 0.1837732046842575,
|
|
"num_tokens": 9477601.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 5.426533985137939,
|
|
"epoch": 0.39817483189241115,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004989680657879607,
|
|
"loss": 5.4426,
|
|
"mean_token_accuracy": 0.18387902528047562,
|
|
"num_tokens": 9489385.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"entropy": 5.473710680007935,
|
|
"epoch": 0.39865513928914503,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004989647834657675,
|
|
"loss": 5.3249,
|
|
"mean_token_accuracy": 0.19230013936758042,
|
|
"num_tokens": 9501131.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 5.420683908462524,
|
|
"epoch": 0.39913544668587897,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000498961495943777,
|
|
"loss": 5.4614,
|
|
"mean_token_accuracy": 0.18854968398809432,
|
|
"num_tokens": 9513094.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"entropy": 5.577786207199097,
|
|
"epoch": 0.39961575408261285,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004989582032220656,
|
|
"loss": 5.5832,
|
|
"mean_token_accuracy": 0.17526223361492158,
|
|
"num_tokens": 9524538.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 5.522935295104981,
|
|
"epoch": 0.4000960614793468,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004989549053007096,
|
|
"loss": 5.3961,
|
|
"mean_token_accuracy": 0.19305580705404282,
|
|
"num_tokens": 9535284.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"entropy": 5.462124681472778,
|
|
"epoch": 0.40057636887608067,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004989516021797858,
|
|
"loss": 5.471,
|
|
"mean_token_accuracy": 0.18390081077814102,
|
|
"num_tokens": 9546472.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 5.499347305297851,
|
|
"epoch": 0.4010566762728146,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000498948293859371,
|
|
"loss": 5.4605,
|
|
"mean_token_accuracy": 0.18212546557188034,
|
|
"num_tokens": 9558358.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"entropy": 5.496229076385498,
|
|
"epoch": 0.4015369836695485,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004989449803395415,
|
|
"loss": 5.4959,
|
|
"mean_token_accuracy": 0.18471186012029647,
|
|
"num_tokens": 9570653.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 5.556100845336914,
|
|
"epoch": 0.4020172910662824,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004989416616203747,
|
|
"loss": 5.4386,
|
|
"mean_token_accuracy": 0.18714374899864197,
|
|
"num_tokens": 9582150.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"entropy": 5.4823558807373045,
|
|
"epoch": 0.4024975984630163,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004989383377019476,
|
|
"loss": 5.38,
|
|
"mean_token_accuracy": 0.19184014648199083,
|
|
"num_tokens": 9592462.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 5.375227689743042,
|
|
"epoch": 0.40297790585975024,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004989350085843371,
|
|
"loss": 5.374,
|
|
"mean_token_accuracy": 0.18951477408409118,
|
|
"num_tokens": 9604027.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"entropy": 5.387249088287353,
|
|
"epoch": 0.4034582132564842,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004989316742676207,
|
|
"loss": 5.3733,
|
|
"mean_token_accuracy": 0.19109322130680084,
|
|
"num_tokens": 9616325.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 5.396379852294922,
|
|
"epoch": 0.40393852065321806,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004989283347518757,
|
|
"loss": 5.3338,
|
|
"mean_token_accuracy": 0.18609212040901185,
|
|
"num_tokens": 9628133.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"entropy": 5.579652786254883,
|
|
"epoch": 0.404418828049952,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004989249900371797,
|
|
"loss": 5.5629,
|
|
"mean_token_accuracy": 0.17861852645874024,
|
|
"num_tokens": 9639686.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 5.429533529281616,
|
|
"epoch": 0.4048991354466859,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004989216401236103,
|
|
"loss": 5.4184,
|
|
"mean_token_accuracy": 0.18496839255094527,
|
|
"num_tokens": 9650222.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"entropy": 5.367856836318969,
|
|
"epoch": 0.4053794428434198,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004989182850112455,
|
|
"loss": 5.3417,
|
|
"mean_token_accuracy": 0.1997272178530693,
|
|
"num_tokens": 9661792.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 5.516646957397461,
|
|
"epoch": 0.4058597502401537,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004989149247001629,
|
|
"loss": 5.4497,
|
|
"mean_token_accuracy": 0.18383817970752717,
|
|
"num_tokens": 9673000.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"entropy": 5.532714462280273,
|
|
"epoch": 0.40634005763688763,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004989115591904407,
|
|
"loss": 5.3975,
|
|
"mean_token_accuracy": 0.1901587262749672,
|
|
"num_tokens": 9685253.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 5.391170501708984,
|
|
"epoch": 0.4068203650336215,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004989081884821569,
|
|
"loss": 5.4004,
|
|
"mean_token_accuracy": 0.18320820480585098,
|
|
"num_tokens": 9697245.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"entropy": 5.450364589691162,
|
|
"epoch": 0.40730067243035545,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004989048125753899,
|
|
"loss": 5.4156,
|
|
"mean_token_accuracy": 0.18504445552825927,
|
|
"num_tokens": 9710095.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 5.407678937911987,
|
|
"epoch": 0.40778097982708933,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000498901431470218,
|
|
"loss": 5.2919,
|
|
"mean_token_accuracy": 0.19396644979715347,
|
|
"num_tokens": 9721488.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"entropy": 5.2491998195648195,
|
|
"epoch": 0.40826128722382327,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004988980451667198,
|
|
"loss": 5.255,
|
|
"mean_token_accuracy": 0.19170391261577607,
|
|
"num_tokens": 9733280.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 5.455927753448487,
|
|
"epoch": 0.40874159462055715,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004988946536649737,
|
|
"loss": 5.3863,
|
|
"mean_token_accuracy": 0.18661659061908722,
|
|
"num_tokens": 9744514.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"entropy": 5.413423871994018,
|
|
"epoch": 0.4092219020172911,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004988912569650585,
|
|
"loss": 5.3752,
|
|
"mean_token_accuracy": 0.19112140834331512,
|
|
"num_tokens": 9754931.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 5.389836359024048,
|
|
"epoch": 0.40970220941402496,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004988878550670533,
|
|
"loss": 5.3725,
|
|
"mean_token_accuracy": 0.19297343790531157,
|
|
"num_tokens": 9765635.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"entropy": 5.508016872406006,
|
|
"epoch": 0.4101825168107589,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004988844479710369,
|
|
"loss": 5.4792,
|
|
"mean_token_accuracy": 0.18072771430015563,
|
|
"num_tokens": 9777512.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 5.541130542755127,
|
|
"epoch": 0.4106628242074928,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004988810356770884,
|
|
"loss": 5.4764,
|
|
"mean_token_accuracy": 0.1744610548019409,
|
|
"num_tokens": 9790128.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"entropy": 5.451146841049194,
|
|
"epoch": 0.4111431316042267,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000498877618185287,
|
|
"loss": 5.4112,
|
|
"mean_token_accuracy": 0.19078320413827896,
|
|
"num_tokens": 9802549.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 5.365971374511719,
|
|
"epoch": 0.4116234390009606,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988741954957121,
|
|
"loss": 5.3574,
|
|
"mean_token_accuracy": 0.18884203881025313,
|
|
"num_tokens": 9813736.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"entropy": 5.380771827697754,
|
|
"epoch": 0.41210374639769454,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004988707676084432,
|
|
"loss": 5.3584,
|
|
"mean_token_accuracy": 0.19705824106931685,
|
|
"num_tokens": 9823785.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 5.432324981689453,
|
|
"epoch": 0.4125840537944284,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004988673345235597,
|
|
"loss": 5.3197,
|
|
"mean_token_accuracy": 0.1934140741825104,
|
|
"num_tokens": 9834910.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"entropy": 5.437625408172607,
|
|
"epoch": 0.41306436119116235,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004988638962411416,
|
|
"loss": 5.363,
|
|
"mean_token_accuracy": 0.18818716257810592,
|
|
"num_tokens": 9845593.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 5.392855072021485,
|
|
"epoch": 0.41354466858789624,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004988604527612685,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.2009762555360794,
|
|
"num_tokens": 9856763.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"entropy": 5.503190565109253,
|
|
"epoch": 0.4140249759846302,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004988570040840205,
|
|
"loss": 5.4945,
|
|
"mean_token_accuracy": 0.18051616251468658,
|
|
"num_tokens": 9869528.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 5.407845735549927,
|
|
"epoch": 0.41450528338136405,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004988535502094774,
|
|
"loss": 5.3958,
|
|
"mean_token_accuracy": 0.18804680705070495,
|
|
"num_tokens": 9881170.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"entropy": 5.461514711380005,
|
|
"epoch": 0.414985590778098,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988500911377198,
|
|
"loss": 5.4803,
|
|
"mean_token_accuracy": 0.18439086973667146,
|
|
"num_tokens": 9893119.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 5.368999385833741,
|
|
"epoch": 0.41546589817483187,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004988466268688276,
|
|
"loss": 5.3154,
|
|
"mean_token_accuracy": 0.19932861626148224,
|
|
"num_tokens": 9905339.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"entropy": 5.482837677001953,
|
|
"epoch": 0.4159462055715658,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004988431574028814,
|
|
"loss": 5.4002,
|
|
"mean_token_accuracy": 0.19202394932508468,
|
|
"num_tokens": 9917500.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 5.466025495529175,
|
|
"epoch": 0.4164265129682997,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004988396827399618,
|
|
"loss": 5.4808,
|
|
"mean_token_accuracy": 0.18326758295297624,
|
|
"num_tokens": 9929667.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"entropy": 5.48503007888794,
|
|
"epoch": 0.4169068203650336,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004988362028801495,
|
|
"loss": 5.4048,
|
|
"mean_token_accuracy": 0.18796583414077758,
|
|
"num_tokens": 9941102.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 5.412125444412231,
|
|
"epoch": 0.4173871277617675,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004988327178235253,
|
|
"loss": 5.3058,
|
|
"mean_token_accuracy": 0.1973835989832878,
|
|
"num_tokens": 9951986.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"entropy": 5.383547782897949,
|
|
"epoch": 0.41786743515850144,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004988292275701699,
|
|
"loss": 5.3119,
|
|
"mean_token_accuracy": 0.19086995273828505,
|
|
"num_tokens": 9964486.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 5.406881952285767,
|
|
"epoch": 0.4183477425552353,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004988257321201646,
|
|
"loss": 5.4094,
|
|
"mean_token_accuracy": 0.1860354095697403,
|
|
"num_tokens": 9975909.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"entropy": 5.473488092422485,
|
|
"epoch": 0.41882804995196926,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004988222314735902,
|
|
"loss": 5.4171,
|
|
"mean_token_accuracy": 0.18617332428693772,
|
|
"num_tokens": 9986951.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 5.517805814743042,
|
|
"epoch": 0.41930835734870314,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004988187256305284,
|
|
"loss": 5.5057,
|
|
"mean_token_accuracy": 0.1791812226176262,
|
|
"num_tokens": 9999234.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"entropy": 5.405948638916016,
|
|
"epoch": 0.4197886647454371,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004988152145910603,
|
|
"loss": 5.3792,
|
|
"mean_token_accuracy": 0.1959477871656418,
|
|
"num_tokens": 10010178.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 5.391415548324585,
|
|
"epoch": 0.420268972142171,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988116983552675,
|
|
"loss": 5.3218,
|
|
"mean_token_accuracy": 0.18838354647159578,
|
|
"num_tokens": 10021183.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"entropy": 5.590651321411133,
|
|
"epoch": 0.4207492795389049,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004988081769232317,
|
|
"loss": 5.6204,
|
|
"mean_token_accuracy": 0.17428677082061766,
|
|
"num_tokens": 10033686.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 5.384156322479248,
|
|
"epoch": 0.42122958693563883,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004988046502950346,
|
|
"loss": 5.3079,
|
|
"mean_token_accuracy": 0.187077134847641,
|
|
"num_tokens": 10045923.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"entropy": 5.270208120346069,
|
|
"epoch": 0.4217098943323727,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000498801118470758,
|
|
"loss": 5.2402,
|
|
"mean_token_accuracy": 0.19899773895740508,
|
|
"num_tokens": 10057196.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 5.409784030914307,
|
|
"epoch": 0.42219020172910665,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000498797581450484,
|
|
"loss": 5.4295,
|
|
"mean_token_accuracy": 0.18354050666093827,
|
|
"num_tokens": 10069655.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"entropy": 5.448616600036621,
|
|
"epoch": 0.42267050912584053,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004987940392342948,
|
|
"loss": 5.3095,
|
|
"mean_token_accuracy": 0.19377071112394334,
|
|
"num_tokens": 10080876.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 5.421027898788452,
|
|
"epoch": 0.42315081652257447,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004987904918222726,
|
|
"loss": 5.415,
|
|
"mean_token_accuracy": 0.18513490557670592,
|
|
"num_tokens": 10091986.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"entropy": 5.5097509860992435,
|
|
"epoch": 0.42363112391930835,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004987869392144996,
|
|
"loss": 5.499,
|
|
"mean_token_accuracy": 0.18492884635925294,
|
|
"num_tokens": 10104027.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 5.425499534606933,
|
|
"epoch": 0.4241114313160423,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004987833814110584,
|
|
"loss": 5.3567,
|
|
"mean_token_accuracy": 0.1865203857421875,
|
|
"num_tokens": 10114665.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"entropy": 5.385516119003296,
|
|
"epoch": 0.42459173871277617,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004987798184120316,
|
|
"loss": 5.3742,
|
|
"mean_token_accuracy": 0.19014959633350373,
|
|
"num_tokens": 10126032.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 5.512171411514283,
|
|
"epoch": 0.4250720461095101,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004987762502175018,
|
|
"loss": 5.4288,
|
|
"mean_token_accuracy": 0.1829407036304474,
|
|
"num_tokens": 10137256.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"entropy": 5.3579336643219,
|
|
"epoch": 0.425552353506244,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498772676827552,
|
|
"loss": 5.3117,
|
|
"mean_token_accuracy": 0.1916539713740349,
|
|
"num_tokens": 10149445.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 5.474416351318359,
|
|
"epoch": 0.4260326609029779,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004987690982422652,
|
|
"loss": 5.4495,
|
|
"mean_token_accuracy": 0.18037094324827194,
|
|
"num_tokens": 10161607.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"entropy": 5.448618030548095,
|
|
"epoch": 0.4265129682997118,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004987655144617243,
|
|
"loss": 5.4681,
|
|
"mean_token_accuracy": 0.18403236269950868,
|
|
"num_tokens": 10173184.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 5.4251587867736815,
|
|
"epoch": 0.42699327569644574,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004987619254860126,
|
|
"loss": 5.328,
|
|
"mean_token_accuracy": 0.19698531180620193,
|
|
"num_tokens": 10184617.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"entropy": 5.4672339916229244,
|
|
"epoch": 0.4274735830931796,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004987583313152134,
|
|
"loss": 5.3568,
|
|
"mean_token_accuracy": 0.18906597346067427,
|
|
"num_tokens": 10195608.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 5.386989736557007,
|
|
"epoch": 0.42795389048991356,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004987547319494104,
|
|
"loss": 5.4529,
|
|
"mean_token_accuracy": 0.18423379063606263,
|
|
"num_tokens": 10206763.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"entropy": 5.486404466629028,
|
|
"epoch": 0.42843419788664744,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004987511273886867,
|
|
"loss": 5.3933,
|
|
"mean_token_accuracy": 0.1908423647284508,
|
|
"num_tokens": 10218714.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 5.427644729614258,
|
|
"epoch": 0.4289145052833814,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004987475176331263,
|
|
"loss": 5.415,
|
|
"mean_token_accuracy": 0.18401106595993041,
|
|
"num_tokens": 10229902.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"entropy": 5.423227453231812,
|
|
"epoch": 0.42939481268011526,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004987439026828129,
|
|
"loss": 5.288,
|
|
"mean_token_accuracy": 0.19139131158590317,
|
|
"num_tokens": 10241578.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 5.324700498580933,
|
|
"epoch": 0.4298751200768492,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004987402825378305,
|
|
"loss": 5.2595,
|
|
"mean_token_accuracy": 0.19443607479333877,
|
|
"num_tokens": 10252109.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"entropy": 5.429213285446167,
|
|
"epoch": 0.4303554274735831,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004987366571982631,
|
|
"loss": 5.4252,
|
|
"mean_token_accuracy": 0.18883214443922042,
|
|
"num_tokens": 10263357.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 5.487810945510864,
|
|
"epoch": 0.430835734870317,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004987330266641948,
|
|
"loss": 5.4308,
|
|
"mean_token_accuracy": 0.18471152931451798,
|
|
"num_tokens": 10275536.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"entropy": 5.453687620162964,
|
|
"epoch": 0.4313160422670509,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004987293909357101,
|
|
"loss": 5.415,
|
|
"mean_token_accuracy": 0.19442622363567352,
|
|
"num_tokens": 10286901.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 5.365311050415039,
|
|
"epoch": 0.43179634966378483,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004987257500128933,
|
|
"loss": 5.3172,
|
|
"mean_token_accuracy": 0.18610639423131942,
|
|
"num_tokens": 10298961.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"entropy": 5.462113523483277,
|
|
"epoch": 0.4322766570605187,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004987221038958288,
|
|
"loss": 5.4543,
|
|
"mean_token_accuracy": 0.18748044222593307,
|
|
"num_tokens": 10310911.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 5.510283613204956,
|
|
"epoch": 0.43275696445725265,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004987184525846015,
|
|
"loss": 5.4389,
|
|
"mean_token_accuracy": 0.1841048017144203,
|
|
"num_tokens": 10322267.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"entropy": 5.411655378341675,
|
|
"epoch": 0.4332372718539865,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004987147960792958,
|
|
"loss": 5.459,
|
|
"mean_token_accuracy": 0.18804670721292496,
|
|
"num_tokens": 10335111.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 5.520284938812256,
|
|
"epoch": 0.43371757925072046,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004987111343799971,
|
|
"loss": 5.3974,
|
|
"mean_token_accuracy": 0.1907435804605484,
|
|
"num_tokens": 10345672.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"entropy": 5.501500225067138,
|
|
"epoch": 0.43419788664745435,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00049870746748679,
|
|
"loss": 5.3725,
|
|
"mean_token_accuracy": 0.1861974611878395,
|
|
"num_tokens": 10357369.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 5.38987283706665,
|
|
"epoch": 0.4346781940441883,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004987037953997598,
|
|
"loss": 5.3935,
|
|
"mean_token_accuracy": 0.18683493435382842,
|
|
"num_tokens": 10368842.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"entropy": 5.43892183303833,
|
|
"epoch": 0.43515850144092216,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004987001181189918,
|
|
"loss": 5.3539,
|
|
"mean_token_accuracy": 0.18663013726472855,
|
|
"num_tokens": 10380096.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 5.306481552124024,
|
|
"epoch": 0.4356388088376561,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004986964356445713,
|
|
"loss": 5.3772,
|
|
"mean_token_accuracy": 0.19005681425333024,
|
|
"num_tokens": 10391996.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"entropy": 5.48760027885437,
|
|
"epoch": 0.43611911623439004,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004986927479765837,
|
|
"loss": 5.3288,
|
|
"mean_token_accuracy": 0.18343985229730606,
|
|
"num_tokens": 10403607.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 5.396467876434326,
|
|
"epoch": 0.4365994236311239,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004986890551151148,
|
|
"loss": 5.3604,
|
|
"mean_token_accuracy": 0.184589384496212,
|
|
"num_tokens": 10413580.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"entropy": 5.349568462371826,
|
|
"epoch": 0.43707973102785785,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004986853570602503,
|
|
"loss": 5.3881,
|
|
"mean_token_accuracy": 0.18719975054264068,
|
|
"num_tokens": 10426456.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 5.520879220962525,
|
|
"epoch": 0.43756003842459174,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004986816538120758,
|
|
"loss": 5.4101,
|
|
"mean_token_accuracy": 0.18188669979572297,
|
|
"num_tokens": 10438869.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"entropy": 5.397240781784058,
|
|
"epoch": 0.43804034582132567,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004986779453706778,
|
|
"loss": 5.4142,
|
|
"mean_token_accuracy": 0.1816550999879837,
|
|
"num_tokens": 10450672.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 5.4152685642242435,
|
|
"epoch": 0.43852065321805955,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004986742317361419,
|
|
"loss": 5.3271,
|
|
"mean_token_accuracy": 0.19575155526399612,
|
|
"num_tokens": 10461890.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"entropy": 5.498744964599609,
|
|
"epoch": 0.4390009606147935,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004986705129085546,
|
|
"loss": 5.4613,
|
|
"mean_token_accuracy": 0.17549378722906112,
|
|
"num_tokens": 10473866.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 5.460689496994019,
|
|
"epoch": 0.43948126801152737,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004986667888880021,
|
|
"loss": 5.381,
|
|
"mean_token_accuracy": 0.18632390201091767,
|
|
"num_tokens": 10484889.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"entropy": 5.412662744522095,
|
|
"epoch": 0.4399615754082613,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004986630596745709,
|
|
"loss": 5.4207,
|
|
"mean_token_accuracy": 0.1880632683634758,
|
|
"num_tokens": 10496108.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 5.389367771148682,
|
|
"epoch": 0.4404418828049952,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986593252683477,
|
|
"loss": 5.363,
|
|
"mean_token_accuracy": 0.18732869774103164,
|
|
"num_tokens": 10505472.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"entropy": 5.307269144058227,
|
|
"epoch": 0.4409221902017291,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004986555856694191,
|
|
"loss": 5.2773,
|
|
"mean_token_accuracy": 0.19333918690681456,
|
|
"num_tokens": 10516954.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 5.524228239059449,
|
|
"epoch": 0.441402497598463,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004986518408778718,
|
|
"loss": 5.3859,
|
|
"mean_token_accuracy": 0.18945636600255966,
|
|
"num_tokens": 10528166.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"entropy": 5.38381519317627,
|
|
"epoch": 0.44188280499519694,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004986480908937929,
|
|
"loss": 5.3113,
|
|
"mean_token_accuracy": 0.18772315680980683,
|
|
"num_tokens": 10538112.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 5.444307518005371,
|
|
"epoch": 0.4423631123919308,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004986443357172695,
|
|
"loss": 5.4568,
|
|
"mean_token_accuracy": 0.18497458845376968,
|
|
"num_tokens": 10549888.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"entropy": 5.58274884223938,
|
|
"epoch": 0.44284341978866476,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004986405753483887,
|
|
"loss": 5.5294,
|
|
"mean_token_accuracy": 0.17502811402082444,
|
|
"num_tokens": 10561710.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 5.410598850250244,
|
|
"epoch": 0.44332372718539864,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004986368097872377,
|
|
"loss": 5.379,
|
|
"mean_token_accuracy": 0.18401092439889907,
|
|
"num_tokens": 10574564.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"entropy": 5.41968560218811,
|
|
"epoch": 0.4438040345821326,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004986330390339042,
|
|
"loss": 5.3586,
|
|
"mean_token_accuracy": 0.18878330439329147,
|
|
"num_tokens": 10586639.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 5.373893547058105,
|
|
"epoch": 0.44428434197886646,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004986292630884755,
|
|
"loss": 5.3645,
|
|
"mean_token_accuracy": 0.18980913162231444,
|
|
"num_tokens": 10598730.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"entropy": 5.395772886276245,
|
|
"epoch": 0.4447646493756004,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004986254819510393,
|
|
"loss": 5.2863,
|
|
"mean_token_accuracy": 0.2030077889561653,
|
|
"num_tokens": 10610352.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 5.410120058059692,
|
|
"epoch": 0.4452449567723343,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004986216956216835,
|
|
"loss": 5.3544,
|
|
"mean_token_accuracy": 0.18991922438144684,
|
|
"num_tokens": 10621951.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"entropy": 5.380520057678223,
|
|
"epoch": 0.4457252641690682,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000498617904100496,
|
|
"loss": 5.3114,
|
|
"mean_token_accuracy": 0.1913859009742737,
|
|
"num_tokens": 10633207.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 5.473378133773804,
|
|
"epoch": 0.4462055715658021,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004986141073875646,
|
|
"loss": 5.4035,
|
|
"mean_token_accuracy": 0.18385644257068634,
|
|
"num_tokens": 10645853.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"entropy": 5.330105209350586,
|
|
"epoch": 0.44668587896253603,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004986103054829779,
|
|
"loss": 5.3305,
|
|
"mean_token_accuracy": 0.18985379487276077,
|
|
"num_tokens": 10656892.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 5.424197340011597,
|
|
"epoch": 0.4471661863592699,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004986064983868237,
|
|
"loss": 5.3095,
|
|
"mean_token_accuracy": 0.18436852544546128,
|
|
"num_tokens": 10670110.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"entropy": 5.429648303985596,
|
|
"epoch": 0.44764649375600385,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004986026860991906,
|
|
"loss": 5.4385,
|
|
"mean_token_accuracy": 0.185771344602108,
|
|
"num_tokens": 10681255.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 5.471052789688111,
|
|
"epoch": 0.44812680115273773,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004985988686201672,
|
|
"loss": 5.5041,
|
|
"mean_token_accuracy": 0.1844386264681816,
|
|
"num_tokens": 10692631.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"entropy": 5.442734622955323,
|
|
"epoch": 0.44860710854947167,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985950459498419,
|
|
"loss": 5.3372,
|
|
"mean_token_accuracy": 0.19462240785360335,
|
|
"num_tokens": 10704880.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 5.390188550949096,
|
|
"epoch": 0.44908741594620555,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004985912180883037,
|
|
"loss": 5.3095,
|
|
"mean_token_accuracy": 0.19716786891222,
|
|
"num_tokens": 10715561.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"entropy": 5.376702499389649,
|
|
"epoch": 0.4495677233429395,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004985873850356411,
|
|
"loss": 5.3369,
|
|
"mean_token_accuracy": 0.19014816135168075,
|
|
"num_tokens": 10727232.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 5.387975978851318,
|
|
"epoch": 0.45004803073967337,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004985835467919436,
|
|
"loss": 5.3461,
|
|
"mean_token_accuracy": 0.19422013461589813,
|
|
"num_tokens": 10739404.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"entropy": 5.369897413253784,
|
|
"epoch": 0.4505283381364073,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004985797033572999,
|
|
"loss": 5.3767,
|
|
"mean_token_accuracy": 0.18446222841739654,
|
|
"num_tokens": 10751948.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 5.362226104736328,
|
|
"epoch": 0.4510086455331412,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004985758547317994,
|
|
"loss": 5.3363,
|
|
"mean_token_accuracy": 0.18433189690113067,
|
|
"num_tokens": 10764611.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"entropy": 5.447867727279663,
|
|
"epoch": 0.4514889529298751,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004985720009155315,
|
|
"loss": 5.3727,
|
|
"mean_token_accuracy": 0.1841047078371048,
|
|
"num_tokens": 10775954.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 5.409327983856201,
|
|
"epoch": 0.45196926032660906,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004985681419085856,
|
|
"loss": 5.3909,
|
|
"mean_token_accuracy": 0.18282371312379836,
|
|
"num_tokens": 10788723.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"entropy": 5.421317195892334,
|
|
"epoch": 0.45244956772334294,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004985642777110513,
|
|
"loss": 5.3841,
|
|
"mean_token_accuracy": 0.1885462448000908,
|
|
"num_tokens": 10799879.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 5.3301918506622314,
|
|
"epoch": 0.4529298751200769,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985604083230183,
|
|
"loss": 5.3231,
|
|
"mean_token_accuracy": 0.18998679518699646,
|
|
"num_tokens": 10811838.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"entropy": 5.428510332107544,
|
|
"epoch": 0.45341018251681076,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004985565337445765,
|
|
"loss": 5.3434,
|
|
"mean_token_accuracy": 0.19171882420778275,
|
|
"num_tokens": 10822910.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 5.471314573287964,
|
|
"epoch": 0.4538904899135447,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985526539758158,
|
|
"loss": 5.3992,
|
|
"mean_token_accuracy": 0.18527638167142868,
|
|
"num_tokens": 10835344.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"entropy": 5.375976181030273,
|
|
"epoch": 0.4543707973102786,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004985487690168263,
|
|
"loss": 5.4034,
|
|
"mean_token_accuracy": 0.19202104806900025,
|
|
"num_tokens": 10846043.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 5.380132484436035,
|
|
"epoch": 0.4548511047070125,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000498544878867698,
|
|
"loss": 5.298,
|
|
"mean_token_accuracy": 0.19829845130443574,
|
|
"num_tokens": 10857783.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"entropy": 5.434480476379394,
|
|
"epoch": 0.4553314121037464,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004985409835285215,
|
|
"loss": 5.373,
|
|
"mean_token_accuracy": 0.19089124351739883,
|
|
"num_tokens": 10870527.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 5.414768075942993,
|
|
"epoch": 0.45581171950048033,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004985370829993873,
|
|
"loss": 5.3646,
|
|
"mean_token_accuracy": 0.19075230062007903,
|
|
"num_tokens": 10882285.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"entropy": 5.423041200637817,
|
|
"epoch": 0.4562920268972142,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004985331772803857,
|
|
"loss": 5.3874,
|
|
"mean_token_accuracy": 0.19265468865633012,
|
|
"num_tokens": 10895319.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 5.484057378768921,
|
|
"epoch": 0.45677233429394815,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004985292663716074,
|
|
"loss": 5.382,
|
|
"mean_token_accuracy": 0.19183963984251023,
|
|
"num_tokens": 10906253.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"entropy": 5.229197072982788,
|
|
"epoch": 0.457252641690682,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004985253502731435,
|
|
"loss": 5.2575,
|
|
"mean_token_accuracy": 0.19930023998022078,
|
|
"num_tokens": 10918197.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 5.455323648452759,
|
|
"epoch": 0.45773294908741596,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004985214289850845,
|
|
"loss": 5.4579,
|
|
"mean_token_accuracy": 0.17997599244117737,
|
|
"num_tokens": 10930771.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"entropy": 5.443937206268311,
|
|
"epoch": 0.45821325648414984,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004985175025075217,
|
|
"loss": 5.3491,
|
|
"mean_token_accuracy": 0.18804308474063874,
|
|
"num_tokens": 10942759.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 5.591840028762817,
|
|
"epoch": 0.4586935638808838,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004985135708405462,
|
|
"loss": 5.5609,
|
|
"mean_token_accuracy": 0.17564835995435715,
|
|
"num_tokens": 10953557.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"entropy": 5.411443281173706,
|
|
"epoch": 0.45917387127761766,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004985096339842493,
|
|
"loss": 5.3321,
|
|
"mean_token_accuracy": 0.19676847159862518,
|
|
"num_tokens": 10963142.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 5.309838056564331,
|
|
"epoch": 0.4596541786743516,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004985056919387224,
|
|
"loss": 5.2856,
|
|
"mean_token_accuracy": 0.19894758760929107,
|
|
"num_tokens": 10974321.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"entropy": 5.502527189254761,
|
|
"epoch": 0.4601344860710855,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004985017447040569,
|
|
"loss": 5.4874,
|
|
"mean_token_accuracy": 0.18695860356092453,
|
|
"num_tokens": 10985524.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 5.457700490951538,
|
|
"epoch": 0.4606147934678194,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004984977922803447,
|
|
"loss": 5.3727,
|
|
"mean_token_accuracy": 0.1937094435095787,
|
|
"num_tokens": 10997606.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"entropy": 5.4323536396026615,
|
|
"epoch": 0.4610951008645533,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004984938346676772,
|
|
"loss": 5.3833,
|
|
"mean_token_accuracy": 0.18257274031639098,
|
|
"num_tokens": 11010692.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 5.40803747177124,
|
|
"epoch": 0.46157540826128723,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004984898718661468,
|
|
"loss": 5.3099,
|
|
"mean_token_accuracy": 0.19199058413505554,
|
|
"num_tokens": 11022517.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"entropy": 5.350576591491699,
|
|
"epoch": 0.4620557156580211,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004984859038758451,
|
|
"loss": 5.3253,
|
|
"mean_token_accuracy": 0.19188573807477952,
|
|
"num_tokens": 11033141.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 5.32304048538208,
|
|
"epoch": 0.46253602305475505,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004984819306968642,
|
|
"loss": 5.3173,
|
|
"mean_token_accuracy": 0.19185021072626113,
|
|
"num_tokens": 11044619.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"entropy": 5.495067167282104,
|
|
"epoch": 0.46301633045148893,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004984779523292966,
|
|
"loss": 5.3646,
|
|
"mean_token_accuracy": 0.18967657685279846,
|
|
"num_tokens": 11055934.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 5.383758926391602,
|
|
"epoch": 0.46349663784822287,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004984739687732345,
|
|
"loss": 5.2493,
|
|
"mean_token_accuracy": 0.19513811767101288,
|
|
"num_tokens": 11066203.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"entropy": 5.187354946136475,
|
|
"epoch": 0.46397694524495675,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004984699800287705,
|
|
"loss": 5.1973,
|
|
"mean_token_accuracy": 0.19977913796901703,
|
|
"num_tokens": 11079664.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 5.341605234146118,
|
|
"epoch": 0.4644572526416907,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000498465986095997,
|
|
"loss": 5.2652,
|
|
"mean_token_accuracy": 0.19821466654539108,
|
|
"num_tokens": 11091186.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"entropy": 5.42094578742981,
|
|
"epoch": 0.46493756003842457,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004984619869750069,
|
|
"loss": 5.383,
|
|
"mean_token_accuracy": 0.18526540249586104,
|
|
"num_tokens": 11102710.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 5.292195415496826,
|
|
"epoch": 0.4654178674351585,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000498457982665893,
|
|
"loss": 5.2795,
|
|
"mean_token_accuracy": 0.19302588403224946,
|
|
"num_tokens": 11114746.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"entropy": 5.397561931610108,
|
|
"epoch": 0.4658981748318924,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004984539731687483,
|
|
"loss": 5.3462,
|
|
"mean_token_accuracy": 0.18983854949474335,
|
|
"num_tokens": 11126572.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 5.380267095565796,
|
|
"epoch": 0.4663784822286263,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004984499584836659,
|
|
"loss": 5.2431,
|
|
"mean_token_accuracy": 0.19321491122245787,
|
|
"num_tokens": 11137830.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"entropy": 5.32379674911499,
|
|
"epoch": 0.4668587896253602,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498445938610739,
|
|
"loss": 5.281,
|
|
"mean_token_accuracy": 0.19294328689575196,
|
|
"num_tokens": 11148860.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 5.419743824005127,
|
|
"epoch": 0.46733909702209414,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004984419135500608,
|
|
"loss": 5.4081,
|
|
"mean_token_accuracy": 0.17859717160463334,
|
|
"num_tokens": 11161311.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"entropy": 5.430191612243652,
|
|
"epoch": 0.4678194044188281,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004984378833017249,
|
|
"loss": 5.2942,
|
|
"mean_token_accuracy": 0.19046030193567276,
|
|
"num_tokens": 11173124.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 5.344765472412109,
|
|
"epoch": 0.46829971181556196,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004984338478658248,
|
|
"loss": 5.3783,
|
|
"mean_token_accuracy": 0.19164984971284865,
|
|
"num_tokens": 11184879.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"entropy": 5.45609302520752,
|
|
"epoch": 0.4687800192122959,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004984298072424542,
|
|
"loss": 5.378,
|
|
"mean_token_accuracy": 0.1874854624271393,
|
|
"num_tokens": 11196243.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 5.339529609680175,
|
|
"epoch": 0.4692603266090298,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000498425761431707,
|
|
"loss": 5.2513,
|
|
"mean_token_accuracy": 0.20040780752897264,
|
|
"num_tokens": 11207485.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"entropy": 5.312271356582642,
|
|
"epoch": 0.4697406340057637,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000498421710433677,
|
|
"loss": 5.279,
|
|
"mean_token_accuracy": 0.19036460667848587,
|
|
"num_tokens": 11219891.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 5.4914182186126705,
|
|
"epoch": 0.4702209414024976,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004984176542484584,
|
|
"loss": 5.388,
|
|
"mean_token_accuracy": 0.18597144782543182,
|
|
"num_tokens": 11231329.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"entropy": 5.378525733947754,
|
|
"epoch": 0.47070124879923153,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004984135928761452,
|
|
"loss": 5.266,
|
|
"mean_token_accuracy": 0.1995886370539665,
|
|
"num_tokens": 11241367.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 5.358568334579468,
|
|
"epoch": 0.4711815561959654,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004984095263168317,
|
|
"loss": 5.3589,
|
|
"mean_token_accuracy": 0.18466073721647264,
|
|
"num_tokens": 11254532.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"entropy": 5.4979103088378904,
|
|
"epoch": 0.47166186359269935,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004984054545706124,
|
|
"loss": 5.4398,
|
|
"mean_token_accuracy": 0.18243181705474854,
|
|
"num_tokens": 11265223.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 5.3696846008300785,
|
|
"epoch": 0.47214217098943323,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000498401377637582,
|
|
"loss": 5.3635,
|
|
"mean_token_accuracy": 0.18885526210069656,
|
|
"num_tokens": 11278228.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"entropy": 5.484466791152954,
|
|
"epoch": 0.47262247838616717,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000498397295517835,
|
|
"loss": 5.4846,
|
|
"mean_token_accuracy": 0.1801117405295372,
|
|
"num_tokens": 11289654.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 5.394139242172241,
|
|
"epoch": 0.47310278578290105,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004983932082114659,
|
|
"loss": 5.2357,
|
|
"mean_token_accuracy": 0.19755308330059052,
|
|
"num_tokens": 11301911.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"entropy": 5.4873377799987795,
|
|
"epoch": 0.473583093179635,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004983891157185699,
|
|
"loss": 5.4364,
|
|
"mean_token_accuracy": 0.18308536261320113,
|
|
"num_tokens": 11312945.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 5.549541664123535,
|
|
"epoch": 0.47406340057636887,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004983850180392421,
|
|
"loss": 5.4774,
|
|
"mean_token_accuracy": 0.18022425770759581,
|
|
"num_tokens": 11324126.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"entropy": 5.402717351913452,
|
|
"epoch": 0.4745437079731028,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004983809151735775,
|
|
"loss": 5.4133,
|
|
"mean_token_accuracy": 0.18017226606607437,
|
|
"num_tokens": 11336395.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 5.403596019744873,
|
|
"epoch": 0.4750240153698367,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004983768071216713,
|
|
"loss": 5.3135,
|
|
"mean_token_accuracy": 0.1902969852089882,
|
|
"num_tokens": 11347387.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"entropy": 5.353836917877198,
|
|
"epoch": 0.4755043227665706,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004983726938836189,
|
|
"loss": 5.308,
|
|
"mean_token_accuracy": 0.19681546241044998,
|
|
"num_tokens": 11358467.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 5.486645841598511,
|
|
"epoch": 0.4759846301633045,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004983685754595159,
|
|
"loss": 5.4724,
|
|
"mean_token_accuracy": 0.18010423183441163,
|
|
"num_tokens": 11370322.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"entropy": 5.333859491348266,
|
|
"epoch": 0.47646493756003844,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004983644518494578,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.20096147507429124,
|
|
"num_tokens": 11381719.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 5.328320550918579,
|
|
"epoch": 0.4769452449567723,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004983603230535403,
|
|
"loss": 5.2895,
|
|
"mean_token_accuracy": 0.1948627695441246,
|
|
"num_tokens": 11393561.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"entropy": 5.460376167297364,
|
|
"epoch": 0.47742555235350626,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004983561890718594,
|
|
"loss": 5.3849,
|
|
"mean_token_accuracy": 0.18933912962675095,
|
|
"num_tokens": 11405411.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 5.5110303401947025,
|
|
"epoch": 0.47790585975024014,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000498352049904511,
|
|
"loss": 5.4771,
|
|
"mean_token_accuracy": 0.17981591820716858,
|
|
"num_tokens": 11417419.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"entropy": 5.429950714111328,
|
|
"epoch": 0.4783861671469741,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004983479055515914,
|
|
"loss": 5.2844,
|
|
"mean_token_accuracy": 0.18997065275907515,
|
|
"num_tokens": 11428145.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 5.290281534194946,
|
|
"epoch": 0.47886647454370795,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004983437560131964,
|
|
"loss": 5.2422,
|
|
"mean_token_accuracy": 0.1993091583251953,
|
|
"num_tokens": 11439224.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"entropy": 5.409195756912231,
|
|
"epoch": 0.4793467819404419,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004983396012894228,
|
|
"loss": 5.3477,
|
|
"mean_token_accuracy": 0.18979695290327073,
|
|
"num_tokens": 11451731.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 5.435146522521973,
|
|
"epoch": 0.47982708933717577,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004983354413803666,
|
|
"loss": 5.3375,
|
|
"mean_token_accuracy": 0.1958609476685524,
|
|
"num_tokens": 11463058.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"entropy": 5.473912382125855,
|
|
"epoch": 0.4803073967339097,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004983312762861248,
|
|
"loss": 5.4305,
|
|
"mean_token_accuracy": 0.18449530750513077,
|
|
"num_tokens": 11472618.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 5.364778709411621,
|
|
"epoch": 0.4807877041306436,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004983271060067939,
|
|
"loss": 5.3246,
|
|
"mean_token_accuracy": 0.18677808940410615,
|
|
"num_tokens": 11483114.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"entropy": 5.3417730808258055,
|
|
"epoch": 0.4812680115273775,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004983229305424707,
|
|
"loss": 5.2799,
|
|
"mean_token_accuracy": 0.19405496269464492,
|
|
"num_tokens": 11494281.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 5.351672601699829,
|
|
"epoch": 0.4817483189241114,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004983187498932522,
|
|
"loss": 5.3503,
|
|
"mean_token_accuracy": 0.18800514042377472,
|
|
"num_tokens": 11505962.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"entropy": 5.4874766826629635,
|
|
"epoch": 0.48222862632084534,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004983145640592354,
|
|
"loss": 5.4492,
|
|
"mean_token_accuracy": 0.18352760821580888,
|
|
"num_tokens": 11517558.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 5.448751974105835,
|
|
"epoch": 0.4827089337175792,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004983103730405176,
|
|
"loss": 5.4179,
|
|
"mean_token_accuracy": 0.18682138621807098,
|
|
"num_tokens": 11529184.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"entropy": 5.338459253311157,
|
|
"epoch": 0.48318924111431316,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498306176837196,
|
|
"loss": 5.3335,
|
|
"mean_token_accuracy": 0.18406548202037812,
|
|
"num_tokens": 11540727.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 5.360374689102173,
|
|
"epoch": 0.48366954851104704,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004983019754493681,
|
|
"loss": 5.261,
|
|
"mean_token_accuracy": 0.1907915487885475,
|
|
"num_tokens": 11551510.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"entropy": 5.47594895362854,
|
|
"epoch": 0.484149855907781,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004982977688771314,
|
|
"loss": 5.4187,
|
|
"mean_token_accuracy": 0.18854755759239197,
|
|
"num_tokens": 11563203.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 5.308377647399903,
|
|
"epoch": 0.4846301633045149,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004982935571205835,
|
|
"loss": 5.2718,
|
|
"mean_token_accuracy": 0.19544857442379,
|
|
"num_tokens": 11576013.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"entropy": 5.291185140609741,
|
|
"epoch": 0.4851104707012488,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004982893401798223,
|
|
"loss": 5.2498,
|
|
"mean_token_accuracy": 0.20830876976251603,
|
|
"num_tokens": 11587535.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 5.403550291061402,
|
|
"epoch": 0.48559077809798273,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004982851180549456,
|
|
"loss": 5.2771,
|
|
"mean_token_accuracy": 0.19294197112321854,
|
|
"num_tokens": 11598487.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"entropy": 5.25755033493042,
|
|
"epoch": 0.4860710854947166,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004982808907460515,
|
|
"loss": 5.1559,
|
|
"mean_token_accuracy": 0.20932556241750716,
|
|
"num_tokens": 11609457.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 5.265308237075805,
|
|
"epoch": 0.48655139289145055,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004982766582532382,
|
|
"loss": 5.2257,
|
|
"mean_token_accuracy": 0.19795275181531907,
|
|
"num_tokens": 11620251.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"entropy": 5.307956266403198,
|
|
"epoch": 0.48703170028818443,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004982724205766038,
|
|
"loss": 5.2262,
|
|
"mean_token_accuracy": 0.19880327582359314,
|
|
"num_tokens": 11630956.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 5.348564767837525,
|
|
"epoch": 0.48751200768491837,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004982681777162468,
|
|
"loss": 5.2773,
|
|
"mean_token_accuracy": 0.1949208691716194,
|
|
"num_tokens": 11642560.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"entropy": 5.300316572189331,
|
|
"epoch": 0.48799231508165225,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004982639296722657,
|
|
"loss": 5.2365,
|
|
"mean_token_accuracy": 0.19546635299921036,
|
|
"num_tokens": 11654050.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 5.333183813095093,
|
|
"epoch": 0.4884726224783862,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004982596764447591,
|
|
"loss": 5.4035,
|
|
"mean_token_accuracy": 0.19310665130615234,
|
|
"num_tokens": 11664947.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"entropy": 5.469000768661499,
|
|
"epoch": 0.48895292987512007,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004982554180338258,
|
|
"loss": 5.3106,
|
|
"mean_token_accuracy": 0.19500951319932938,
|
|
"num_tokens": 11676927.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 5.502379417419434,
|
|
"epoch": 0.489433237271854,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004982511544395646,
|
|
"loss": 5.4242,
|
|
"mean_token_accuracy": 0.18115128874778746,
|
|
"num_tokens": 11688573.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"entropy": 5.288805294036865,
|
|
"epoch": 0.4899135446685879,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004982468856620745,
|
|
"loss": 5.3128,
|
|
"mean_token_accuracy": 0.18783441036939622,
|
|
"num_tokens": 11698704.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 5.3273578643798825,
|
|
"epoch": 0.4903938520653218,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004982426117014545,
|
|
"loss": 5.2533,
|
|
"mean_token_accuracy": 0.19392533451318741,
|
|
"num_tokens": 11709466.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"entropy": 5.3791663646698,
|
|
"epoch": 0.4908741594620557,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004982383325578041,
|
|
"loss": 5.3413,
|
|
"mean_token_accuracy": 0.1898537114262581,
|
|
"num_tokens": 11721120.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 5.4256843566894535,
|
|
"epoch": 0.49135446685878964,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004982340482312226,
|
|
"loss": 5.3358,
|
|
"mean_token_accuracy": 0.18456312417984008,
|
|
"num_tokens": 11732120.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"entropy": 5.288364553451538,
|
|
"epoch": 0.4918347742555235,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004982297587218092,
|
|
"loss": 5.2294,
|
|
"mean_token_accuracy": 0.1978309139609337,
|
|
"num_tokens": 11743501.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 5.363348197937012,
|
|
"epoch": 0.49231508165225746,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004982254640296637,
|
|
"loss": 5.3152,
|
|
"mean_token_accuracy": 0.1956743210554123,
|
|
"num_tokens": 11755051.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"entropy": 5.436681079864502,
|
|
"epoch": 0.49279538904899134,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004982211641548857,
|
|
"loss": 5.4609,
|
|
"mean_token_accuracy": 0.1842927649617195,
|
|
"num_tokens": 11767663.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 5.419048309326172,
|
|
"epoch": 0.4932756964457253,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004982168590975752,
|
|
"loss": 5.3034,
|
|
"mean_token_accuracy": 0.19774986803531647,
|
|
"num_tokens": 11778828.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"entropy": 5.459513902664185,
|
|
"epoch": 0.49375600384245916,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004982125488578321,
|
|
"loss": 5.4794,
|
|
"mean_token_accuracy": 0.18496931344270706,
|
|
"num_tokens": 11790654.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 5.433895540237427,
|
|
"epoch": 0.4942363112391931,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004982082334357563,
|
|
"loss": 5.2837,
|
|
"mean_token_accuracy": 0.1902835488319397,
|
|
"num_tokens": 11801489.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"entropy": 5.311564207077026,
|
|
"epoch": 0.494716618635927,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004982039128314481,
|
|
"loss": 5.2873,
|
|
"mean_token_accuracy": 0.19224448949098588,
|
|
"num_tokens": 11813818.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 5.333755207061768,
|
|
"epoch": 0.4951969260326609,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004981995870450079,
|
|
"loss": 5.2929,
|
|
"mean_token_accuracy": 0.191859370470047,
|
|
"num_tokens": 11824814.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"entropy": 5.45896692276001,
|
|
"epoch": 0.4956772334293948,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004981952560765361,
|
|
"loss": 5.3373,
|
|
"mean_token_accuracy": 0.18679553270339966,
|
|
"num_tokens": 11836252.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 5.314207363128662,
|
|
"epoch": 0.49615754082612873,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004981909199261331,
|
|
"loss": 5.2629,
|
|
"mean_token_accuracy": 0.19086166322231293,
|
|
"num_tokens": 11847715.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"entropy": 5.273135042190551,
|
|
"epoch": 0.4966378482228626,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004981865785938998,
|
|
"loss": 5.2629,
|
|
"mean_token_accuracy": 0.19300127327442168,
|
|
"num_tokens": 11860309.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 5.348716497421265,
|
|
"epoch": 0.49711815561959655,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004981822320799367,
|
|
"loss": 5.2577,
|
|
"mean_token_accuracy": 0.1956932559609413,
|
|
"num_tokens": 11872569.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"entropy": 5.3287012577056885,
|
|
"epoch": 0.49759846301633043,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004981778803843449,
|
|
"loss": 5.2523,
|
|
"mean_token_accuracy": 0.19481286704540252,
|
|
"num_tokens": 11884778.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 5.390296173095703,
|
|
"epoch": 0.49807877041306436,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004981735235072256,
|
|
"loss": 5.3358,
|
|
"mean_token_accuracy": 0.1911753833293915,
|
|
"num_tokens": 11897324.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"entropy": 5.467144203186035,
|
|
"epoch": 0.49855907780979825,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004981691614486796,
|
|
"loss": 5.366,
|
|
"mean_token_accuracy": 0.18982964605093003,
|
|
"num_tokens": 11909145.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 5.322554683685302,
|
|
"epoch": 0.4990393852065322,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004981647942088084,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.20009808093309403,
|
|
"num_tokens": 11921021.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"entropy": 5.487699699401856,
|
|
"epoch": 0.49951969260326606,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004981604217877135,
|
|
"loss": 5.4279,
|
|
"mean_token_accuracy": 0.1888749822974205,
|
|
"num_tokens": 11932565.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 5.318529844284058,
|
|
"epoch": 0.5,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000498156044185496,
|
|
"loss": 5.3392,
|
|
"mean_token_accuracy": 0.19370948225259782,
|
|
"num_tokens": 11943225.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"entropy": 5.364103078842163,
|
|
"epoch": 0.5004803073967339,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004981516614022579,
|
|
"loss": 5.3219,
|
|
"mean_token_accuracy": 0.1932568922638893,
|
|
"num_tokens": 11954821.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 5.446450281143188,
|
|
"epoch": 0.5009606147934679,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004981472734381008,
|
|
"loss": 5.2738,
|
|
"mean_token_accuracy": 0.1951069414615631,
|
|
"num_tokens": 11966090.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"entropy": 5.353061962127685,
|
|
"epoch": 0.5014409221902018,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004981428802931267,
|
|
"loss": 5.3074,
|
|
"mean_token_accuracy": 0.1921882688999176,
|
|
"num_tokens": 11977410.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 5.339950656890869,
|
|
"epoch": 0.5019212295869356,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004981384819674375,
|
|
"loss": 5.2841,
|
|
"mean_token_accuracy": 0.19126271605491638,
|
|
"num_tokens": 11989119.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"entropy": 5.432912015914917,
|
|
"epoch": 0.5024015369836695,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004981340784611354,
|
|
"loss": 5.3942,
|
|
"mean_token_accuracy": 0.19018032401800156,
|
|
"num_tokens": 12000165.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 5.395741987228393,
|
|
"epoch": 0.5028818443804035,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004981296697743224,
|
|
"loss": 5.3475,
|
|
"mean_token_accuracy": 0.18768104463815688,
|
|
"num_tokens": 12012118.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"entropy": 5.430673694610595,
|
|
"epoch": 0.5033621517771374,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004981252559071012,
|
|
"loss": 5.4181,
|
|
"mean_token_accuracy": 0.1866712138056755,
|
|
"num_tokens": 12023432.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 5.427559089660645,
|
|
"epoch": 0.5038424591738713,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004981208368595739,
|
|
"loss": 5.2939,
|
|
"mean_token_accuracy": 0.1980261042714119,
|
|
"num_tokens": 12034323.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"entropy": 5.264776802062988,
|
|
"epoch": 0.5043227665706052,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004981164126318435,
|
|
"loss": 5.3022,
|
|
"mean_token_accuracy": 0.19116167575120926,
|
|
"num_tokens": 12045532.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 5.449652862548828,
|
|
"epoch": 0.5048030739673391,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004981119832240124,
|
|
"loss": 5.3111,
|
|
"mean_token_accuracy": 0.19520313441753387,
|
|
"num_tokens": 12057346.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"entropy": 5.301677227020264,
|
|
"epoch": 0.505283381364073,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004981075486361837,
|
|
"loss": 5.2825,
|
|
"mean_token_accuracy": 0.19872631430625914,
|
|
"num_tokens": 12068670.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 5.390146923065186,
|
|
"epoch": 0.5057636887608069,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004981031088684601,
|
|
"loss": 5.4028,
|
|
"mean_token_accuracy": 0.18470921665430068,
|
|
"num_tokens": 12079664.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"entropy": 5.474726438522339,
|
|
"epoch": 0.5062439961575408,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004980986639209448,
|
|
"loss": 5.3285,
|
|
"mean_token_accuracy": 0.1994831383228302,
|
|
"num_tokens": 12089984.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 5.29730339050293,
|
|
"epoch": 0.5067243035542748,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000498094213793741,
|
|
"loss": 5.2835,
|
|
"mean_token_accuracy": 0.1948940023779869,
|
|
"num_tokens": 12101182.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"entropy": 5.408280658721924,
|
|
"epoch": 0.5072046109510087,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000498089758486952,
|
|
"loss": 5.353,
|
|
"mean_token_accuracy": 0.18289182782173158,
|
|
"num_tokens": 12112002.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 5.495666790008545,
|
|
"epoch": 0.5076849183477425,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004980852980006812,
|
|
"loss": 5.4392,
|
|
"mean_token_accuracy": 0.1805154114961624,
|
|
"num_tokens": 12124194.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"entropy": 5.392632579803466,
|
|
"epoch": 0.5081652257444764,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004980808323350323,
|
|
"loss": 5.359,
|
|
"mean_token_accuracy": 0.1960368499159813,
|
|
"num_tokens": 12133966.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 5.391989612579346,
|
|
"epoch": 0.5086455331412104,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004980763614901089,
|
|
"loss": 5.2967,
|
|
"mean_token_accuracy": 0.19686038345098494,
|
|
"num_tokens": 12145643.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"entropy": 5.379247760772705,
|
|
"epoch": 0.5091258405379443,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004980718854660146,
|
|
"loss": 5.3464,
|
|
"mean_token_accuracy": 0.18789971768856048,
|
|
"num_tokens": 12156804.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 5.400803756713867,
|
|
"epoch": 0.5096061479346782,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004980674042628537,
|
|
"loss": 5.2967,
|
|
"mean_token_accuracy": 0.19052283465862274,
|
|
"num_tokens": 12168700.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"entropy": 5.401619243621826,
|
|
"epoch": 0.5100864553314121,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00049806291788073,
|
|
"loss": 5.3123,
|
|
"mean_token_accuracy": 0.18629832863807677,
|
|
"num_tokens": 12181050.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 5.469602966308594,
|
|
"epoch": 0.510566762728146,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004980584263197477,
|
|
"loss": 5.3949,
|
|
"mean_token_accuracy": 0.1858072027564049,
|
|
"num_tokens": 12192001.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"entropy": 5.508568143844604,
|
|
"epoch": 0.5110470701248799,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004980539295800111,
|
|
"loss": 5.509,
|
|
"mean_token_accuracy": 0.18043418526649474,
|
|
"num_tokens": 12202436.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 5.362590551376343,
|
|
"epoch": 0.5115273775216138,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004980494276616246,
|
|
"loss": 5.3016,
|
|
"mean_token_accuracy": 0.18966611623764038,
|
|
"num_tokens": 12214454.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"entropy": 5.349428033828735,
|
|
"epoch": 0.5120076849183477,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004980449205646926,
|
|
"loss": 5.3122,
|
|
"mean_token_accuracy": 0.19553214311599731,
|
|
"num_tokens": 12225924.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 5.415020084381103,
|
|
"epoch": 0.5124879923150817,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00049804040828932,
|
|
"loss": 5.3326,
|
|
"mean_token_accuracy": 0.19512139409780502,
|
|
"num_tokens": 12236456.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"entropy": 5.421989011764526,
|
|
"epoch": 0.5129682997118156,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004980358908356113,
|
|
"loss": 5.3535,
|
|
"mean_token_accuracy": 0.18762658089399337,
|
|
"num_tokens": 12247719.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 5.350346803665161,
|
|
"epoch": 0.5134486071085494,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004980313682036717,
|
|
"loss": 5.381,
|
|
"mean_token_accuracy": 0.1927213490009308,
|
|
"num_tokens": 12259141.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"entropy": 5.49134635925293,
|
|
"epoch": 0.5139289145052833,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004980268403936058,
|
|
"loss": 5.4456,
|
|
"mean_token_accuracy": 0.18453603684902192,
|
|
"num_tokens": 12269748.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 5.434391784667969,
|
|
"epoch": 0.5144092219020173,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004980223074055189,
|
|
"loss": 5.379,
|
|
"mean_token_accuracy": 0.1960138276219368,
|
|
"num_tokens": 12281456.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"entropy": 5.409012746810913,
|
|
"epoch": 0.5148895292987512,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004980177692395164,
|
|
"loss": 5.3518,
|
|
"mean_token_accuracy": 0.18338604271411896,
|
|
"num_tokens": 12293763.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 5.351993417739868,
|
|
"epoch": 0.5153698366954851,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004980132258957035,
|
|
"loss": 5.2808,
|
|
"mean_token_accuracy": 0.1969463735818863,
|
|
"num_tokens": 12305398.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"entropy": 5.274507617950439,
|
|
"epoch": 0.515850144092219,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004980086773741856,
|
|
"loss": 5.2796,
|
|
"mean_token_accuracy": 0.19121709913015367,
|
|
"num_tokens": 12316582.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 5.483122396469116,
|
|
"epoch": 0.516330451488953,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004980041236750685,
|
|
"loss": 5.3846,
|
|
"mean_token_accuracy": 0.18809578120708464,
|
|
"num_tokens": 12328463.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"entropy": 5.445298194885254,
|
|
"epoch": 0.5168107588856868,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004979995647984577,
|
|
"loss": 5.3698,
|
|
"mean_token_accuracy": 0.19524169117212295,
|
|
"num_tokens": 12341040.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 5.2983297348022464,
|
|
"epoch": 0.5172910662824207,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004979950007444593,
|
|
"loss": 5.261,
|
|
"mean_token_accuracy": 0.1934810236096382,
|
|
"num_tokens": 12353024.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"entropy": 5.358570623397827,
|
|
"epoch": 0.5177713736791547,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004979904315131792,
|
|
"loss": 5.2844,
|
|
"mean_token_accuracy": 0.19403222799301148,
|
|
"num_tokens": 12366100.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 5.293501186370849,
|
|
"epoch": 0.5182516810758886,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004979858571047233,
|
|
"loss": 5.2707,
|
|
"mean_token_accuracy": 0.19768950045108796,
|
|
"num_tokens": 12377829.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"entropy": 5.466844320297241,
|
|
"epoch": 0.5187319884726225,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004979812775191979,
|
|
"loss": 5.4031,
|
|
"mean_token_accuracy": 0.18979473859071733,
|
|
"num_tokens": 12390830.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 5.328051805496216,
|
|
"epoch": 0.5192122958693564,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004979766927567094,
|
|
"loss": 5.2545,
|
|
"mean_token_accuracy": 0.19470396041870117,
|
|
"num_tokens": 12401642.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"entropy": 5.3456236839294435,
|
|
"epoch": 0.5196926032660903,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004979721028173643,
|
|
"loss": 5.3476,
|
|
"mean_token_accuracy": 0.1877232700586319,
|
|
"num_tokens": 12411653.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 5.386164760589599,
|
|
"epoch": 0.5201729106628242,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000497967507701269,
|
|
"loss": 5.2486,
|
|
"mean_token_accuracy": 0.20038487911224365,
|
|
"num_tokens": 12422891.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"entropy": 5.397801113128662,
|
|
"epoch": 0.5206532180595581,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004979629074085303,
|
|
"loss": 5.3408,
|
|
"mean_token_accuracy": 0.19329493790864943,
|
|
"num_tokens": 12434190.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 5.424389457702636,
|
|
"epoch": 0.521133525456292,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004979583019392548,
|
|
"loss": 5.3974,
|
|
"mean_token_accuracy": 0.18989453911781312,
|
|
"num_tokens": 12445796.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"entropy": 5.483598613739014,
|
|
"epoch": 0.521613832853026,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004979536912935497,
|
|
"loss": 5.4639,
|
|
"mean_token_accuracy": 0.18501935750246049,
|
|
"num_tokens": 12456212.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 5.330318355560303,
|
|
"epoch": 0.5220941402497599,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000497949075471522,
|
|
"loss": 5.1899,
|
|
"mean_token_accuracy": 0.19820088148117065,
|
|
"num_tokens": 12467871.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"entropy": 5.372925519943237,
|
|
"epoch": 0.5225744476464937,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004979444544732786,
|
|
"loss": 5.2819,
|
|
"mean_token_accuracy": 0.1852207139134407,
|
|
"num_tokens": 12478626.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 5.313206958770752,
|
|
"epoch": 0.5230547550432276,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000497939828298927,
|
|
"loss": 5.3741,
|
|
"mean_token_accuracy": 0.19033849388360977,
|
|
"num_tokens": 12491487.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"entropy": 5.462804317474365,
|
|
"epoch": 0.5235350624399616,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004979351969485747,
|
|
"loss": 5.3383,
|
|
"mean_token_accuracy": 0.18805173933506011,
|
|
"num_tokens": 12503240.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 5.4243183612823485,
|
|
"epoch": 0.5240153698366955,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004979305604223291,
|
|
"loss": 5.2774,
|
|
"mean_token_accuracy": 0.1903422147035599,
|
|
"num_tokens": 12513860.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"entropy": 5.313809871673584,
|
|
"epoch": 0.5244956772334294,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004979259187202978,
|
|
"loss": 5.352,
|
|
"mean_token_accuracy": 0.1945337176322937,
|
|
"num_tokens": 12525884.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 5.442373895645142,
|
|
"epoch": 0.5249759846301633,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004979212718425887,
|
|
"loss": 5.2672,
|
|
"mean_token_accuracy": 0.1932208612561226,
|
|
"num_tokens": 12536709.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"entropy": 5.334468412399292,
|
|
"epoch": 0.5254562920268973,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004979166197893096,
|
|
"loss": 5.2663,
|
|
"mean_token_accuracy": 0.19677013605833055,
|
|
"num_tokens": 12549727.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 5.339883422851562,
|
|
"epoch": 0.5259365994236311,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004979119625605683,
|
|
"loss": 5.3345,
|
|
"mean_token_accuracy": 0.18942939788103103,
|
|
"num_tokens": 12562053.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"entropy": 5.287409067153931,
|
|
"epoch": 0.526416906820365,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004979073001564734,
|
|
"loss": 5.2257,
|
|
"mean_token_accuracy": 0.20170782059431075,
|
|
"num_tokens": 12574096.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 5.40628571510315,
|
|
"epoch": 0.5268972142170989,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004979026325771328,
|
|
"loss": 5.4013,
|
|
"mean_token_accuracy": 0.18865474164485932,
|
|
"num_tokens": 12585416.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"entropy": 5.369120025634766,
|
|
"epoch": 0.5273775216138329,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004978979598226549,
|
|
"loss": 5.2525,
|
|
"mean_token_accuracy": 0.1964880034327507,
|
|
"num_tokens": 12596861.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 5.307511520385742,
|
|
"epoch": 0.5278578290105668,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004978932818931483,
|
|
"loss": 5.2672,
|
|
"mean_token_accuracy": 0.19722044318914414,
|
|
"num_tokens": 12607761.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"entropy": 5.4275431632995605,
|
|
"epoch": 0.5283381364073007,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978885987887216,
|
|
"loss": 5.3898,
|
|
"mean_token_accuracy": 0.19588741660118103,
|
|
"num_tokens": 12619889.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 5.4371997833251955,
|
|
"epoch": 0.5288184438040345,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978839105094833,
|
|
"loss": 5.3606,
|
|
"mean_token_accuracy": 0.19224700778722764,
|
|
"num_tokens": 12630604.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"entropy": 5.222589921951294,
|
|
"epoch": 0.5292987512007685,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004978792170555426,
|
|
"loss": 5.2618,
|
|
"mean_token_accuracy": 0.19633477181196213,
|
|
"num_tokens": 12641172.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 5.292724561691284,
|
|
"epoch": 0.5297790585975024,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004978745184270083,
|
|
"loss": 5.1601,
|
|
"mean_token_accuracy": 0.20660953521728515,
|
|
"num_tokens": 12651731.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"entropy": 5.392834901809692,
|
|
"epoch": 0.5302593659942363,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004978698146239893,
|
|
"loss": 5.2978,
|
|
"mean_token_accuracy": 0.1936490774154663,
|
|
"num_tokens": 12663050.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 5.409347009658814,
|
|
"epoch": 0.5307396733909702,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004978651056465952,
|
|
"loss": 5.3862,
|
|
"mean_token_accuracy": 0.18999682515859603,
|
|
"num_tokens": 12674732.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"entropy": 5.332290983200073,
|
|
"epoch": 0.5312199807877042,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000497860391494935,
|
|
"loss": 5.2171,
|
|
"mean_token_accuracy": 0.19382983297109604,
|
|
"num_tokens": 12685981.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 5.412051010131836,
|
|
"epoch": 0.531700288184438,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004978556721691183,
|
|
"loss": 5.3525,
|
|
"mean_token_accuracy": 0.19065555483102797,
|
|
"num_tokens": 12697139.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"entropy": 5.317591810226441,
|
|
"epoch": 0.5321805955811719,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004978509476692547,
|
|
"loss": 5.2966,
|
|
"mean_token_accuracy": 0.18611351698637008,
|
|
"num_tokens": 12708268.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 5.375318956375122,
|
|
"epoch": 0.5326609029779059,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004978462179954538,
|
|
"loss": 5.2958,
|
|
"mean_token_accuracy": 0.18993753045797349,
|
|
"num_tokens": 12720715.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"entropy": 5.3367125511169435,
|
|
"epoch": 0.5331412103746398,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978414831478253,
|
|
"loss": 5.269,
|
|
"mean_token_accuracy": 0.19713337272405623,
|
|
"num_tokens": 12732409.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 5.323969554901123,
|
|
"epoch": 0.5336215177713737,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004978367431264794,
|
|
"loss": 5.397,
|
|
"mean_token_accuracy": 0.18209069669246675,
|
|
"num_tokens": 12745174.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"entropy": 5.410878992080688,
|
|
"epoch": 0.5341018251681076,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004978319979315261,
|
|
"loss": 5.3328,
|
|
"mean_token_accuracy": 0.19573558866977692,
|
|
"num_tokens": 12756116.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 5.376229763031006,
|
|
"epoch": 0.5345821325648416,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004978272475630752,
|
|
"loss": 5.2851,
|
|
"mean_token_accuracy": 0.1916971653699875,
|
|
"num_tokens": 12768183.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"entropy": 5.264455699920655,
|
|
"epoch": 0.5350624399615754,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004978224920212374,
|
|
"loss": 5.2931,
|
|
"mean_token_accuracy": 0.1934914067387581,
|
|
"num_tokens": 12778537.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 5.313297891616822,
|
|
"epoch": 0.5355427473583093,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004978177313061232,
|
|
"loss": 5.3228,
|
|
"mean_token_accuracy": 0.19088124930858613,
|
|
"num_tokens": 12789691.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"entropy": 5.473337554931641,
|
|
"epoch": 0.5360230547550432,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004978129654178426,
|
|
"loss": 5.3433,
|
|
"mean_token_accuracy": 0.18791570216417314,
|
|
"num_tokens": 12801438.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 5.4069455623626705,
|
|
"epoch": 0.5365033621517772,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004978081943565067,
|
|
"loss": 5.3061,
|
|
"mean_token_accuracy": 0.18656288981437683,
|
|
"num_tokens": 12812425.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"entropy": 5.307536172866821,
|
|
"epoch": 0.5369836695485111,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004978034181222261,
|
|
"loss": 5.2769,
|
|
"mean_token_accuracy": 0.18625542372465134,
|
|
"num_tokens": 12824735.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 5.430880117416382,
|
|
"epoch": 0.537463976945245,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004977986367151119,
|
|
"loss": 5.3688,
|
|
"mean_token_accuracy": 0.1952778786420822,
|
|
"num_tokens": 12835454.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"entropy": 5.434065580368042,
|
|
"epoch": 0.5379442843419788,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004977938501352747,
|
|
"loss": 5.4122,
|
|
"mean_token_accuracy": 0.18514797538518907,
|
|
"num_tokens": 12847086.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 5.385431623458862,
|
|
"epoch": 0.5384245917387128,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004977890583828259,
|
|
"loss": 5.3549,
|
|
"mean_token_accuracy": 0.1888865575194359,
|
|
"num_tokens": 12857713.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"entropy": 5.36136646270752,
|
|
"epoch": 0.5389048991354467,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004977842614578768,
|
|
"loss": 5.3356,
|
|
"mean_token_accuracy": 0.18914903849363326,
|
|
"num_tokens": 12869967.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 5.433460998535156,
|
|
"epoch": 0.5393852065321806,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004977794593605386,
|
|
"loss": 5.3684,
|
|
"mean_token_accuracy": 0.18960850983858107,
|
|
"num_tokens": 12881230.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"entropy": 5.352547121047974,
|
|
"epoch": 0.5398655139289145,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000497774652090923,
|
|
"loss": 5.3222,
|
|
"mean_token_accuracy": 0.18944347649812698,
|
|
"num_tokens": 12892376.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 5.436691570281982,
|
|
"epoch": 0.5403458213256485,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004977698396491414,
|
|
"loss": 5.3307,
|
|
"mean_token_accuracy": 0.19240753799676896,
|
|
"num_tokens": 12903709.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"entropy": 5.2928542137146,
|
|
"epoch": 0.5408261287223823,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004977650220353055,
|
|
"loss": 5.1629,
|
|
"mean_token_accuracy": 0.19530351608991622,
|
|
"num_tokens": 12914958.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 5.280749416351318,
|
|
"epoch": 0.5413064361191162,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004977601992495274,
|
|
"loss": 5.2875,
|
|
"mean_token_accuracy": 0.1923414632678032,
|
|
"num_tokens": 12927418.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"entropy": 5.413435602188111,
|
|
"epoch": 0.5417867435158501,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004977553712919189,
|
|
"loss": 5.3325,
|
|
"mean_token_accuracy": 0.1892315372824669,
|
|
"num_tokens": 12939874.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 5.463119792938232,
|
|
"epoch": 0.5422670509125841,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004977505381625921,
|
|
"loss": 5.3542,
|
|
"mean_token_accuracy": 0.18793897628784179,
|
|
"num_tokens": 12951113.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"entropy": 5.333239316940308,
|
|
"epoch": 0.542747358309318,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004977456998616593,
|
|
"loss": 5.247,
|
|
"mean_token_accuracy": 0.19487171471118928,
|
|
"num_tokens": 12961940.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 5.247047281265258,
|
|
"epoch": 0.5432276657060519,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004977408563892327,
|
|
"loss": 5.2389,
|
|
"mean_token_accuracy": 0.19528348445892335,
|
|
"num_tokens": 12973938.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"entropy": 5.355054330825806,
|
|
"epoch": 0.5437079731027857,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004977360077454249,
|
|
"loss": 5.2669,
|
|
"mean_token_accuracy": 0.19261687248945236,
|
|
"num_tokens": 12985400.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 5.381504774093628,
|
|
"epoch": 0.5441882804995197,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004977311539303483,
|
|
"loss": 5.2984,
|
|
"mean_token_accuracy": 0.202898870408535,
|
|
"num_tokens": 12996402.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"entropy": 5.339759063720703,
|
|
"epoch": 0.5446685878962536,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004977262949441158,
|
|
"loss": 5.1882,
|
|
"mean_token_accuracy": 0.20247950553894042,
|
|
"num_tokens": 13006991.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 5.329454803466797,
|
|
"epoch": 0.5451488952929875,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004977214307868399,
|
|
"loss": 5.2909,
|
|
"mean_token_accuracy": 0.19646303355693817,
|
|
"num_tokens": 13016969.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"entropy": 5.333616399765015,
|
|
"epoch": 0.5456292026897214,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000497716561458634,
|
|
"loss": 5.2395,
|
|
"mean_token_accuracy": 0.1989587128162384,
|
|
"num_tokens": 13027759.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 5.4932708740234375,
|
|
"epoch": 0.5461095100864554,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004977116869596107,
|
|
"loss": 5.4415,
|
|
"mean_token_accuracy": 0.1860479310154915,
|
|
"num_tokens": 13039881.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"entropy": 5.399776601791382,
|
|
"epoch": 0.5465898174831892,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004977068072898834,
|
|
"loss": 5.3041,
|
|
"mean_token_accuracy": 0.18947898745536804,
|
|
"num_tokens": 13051443.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 5.3822290897369385,
|
|
"epoch": 0.5470701248799231,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004977019224495652,
|
|
"loss": 5.3697,
|
|
"mean_token_accuracy": 0.18962922990322112,
|
|
"num_tokens": 13063474.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"entropy": 5.307476902008057,
|
|
"epoch": 0.547550432276657,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004976970324387698,
|
|
"loss": 5.234,
|
|
"mean_token_accuracy": 0.20077043473720552,
|
|
"num_tokens": 13074365.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 5.339881372451782,
|
|
"epoch": 0.548030739673391,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004976921372576104,
|
|
"loss": 5.3033,
|
|
"mean_token_accuracy": 0.19367703795433044,
|
|
"num_tokens": 13087354.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"entropy": 5.32935528755188,
|
|
"epoch": 0.5485110470701249,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004976872369062011,
|
|
"loss": 5.2787,
|
|
"mean_token_accuracy": 0.19071510583162307,
|
|
"num_tokens": 13099306.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 5.4302033424377445,
|
|
"epoch": 0.5489913544668588,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004976823313846552,
|
|
"loss": 5.4164,
|
|
"mean_token_accuracy": 0.19036435931921006,
|
|
"num_tokens": 13111259.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"entropy": 5.4693896770477295,
|
|
"epoch": 0.5494716618635928,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004976774206930869,
|
|
"loss": 5.3256,
|
|
"mean_token_accuracy": 0.18587163984775543,
|
|
"num_tokens": 13123589.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 5.253912925720215,
|
|
"epoch": 0.5499519692603266,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004976725048316101,
|
|
"loss": 5.322,
|
|
"mean_token_accuracy": 0.19089159667491912,
|
|
"num_tokens": 13136485.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"entropy": 5.40102801322937,
|
|
"epoch": 0.5504322766570605,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004976675838003388,
|
|
"loss": 5.2997,
|
|
"mean_token_accuracy": 0.19145811647176741,
|
|
"num_tokens": 13148067.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 5.367999935150147,
|
|
"epoch": 0.5509125840537944,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004976626575993877,
|
|
"loss": 5.2818,
|
|
"mean_token_accuracy": 0.18961854726076127,
|
|
"num_tokens": 13159813.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"entropy": 5.410087442398071,
|
|
"epoch": 0.5513928914505284,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004976577262288705,
|
|
"loss": 5.356,
|
|
"mean_token_accuracy": 0.18928916603326798,
|
|
"num_tokens": 13170828.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 5.265670728683472,
|
|
"epoch": 0.5518731988472623,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004976527896889023,
|
|
"loss": 5.181,
|
|
"mean_token_accuracy": 0.20403801798820495,
|
|
"num_tokens": 13181883.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"entropy": 5.295314884185791,
|
|
"epoch": 0.5523535062439962,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004976478479795974,
|
|
"loss": 5.2557,
|
|
"mean_token_accuracy": 0.1949864685535431,
|
|
"num_tokens": 13193530.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 5.484155082702637,
|
|
"epoch": 0.55283381364073,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004976429011010706,
|
|
"loss": 5.4823,
|
|
"mean_token_accuracy": 0.17912757843732835,
|
|
"num_tokens": 13205822.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"entropy": 5.3539347648620605,
|
|
"epoch": 0.553314121037464,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004976379490534366,
|
|
"loss": 5.2081,
|
|
"mean_token_accuracy": 0.19992550164461137,
|
|
"num_tokens": 13216698.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 5.291062736511231,
|
|
"epoch": 0.5537944284341979,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004976329918368107,
|
|
"loss": 5.2968,
|
|
"mean_token_accuracy": 0.19075367897748946,
|
|
"num_tokens": 13228389.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"entropy": 5.433424997329712,
|
|
"epoch": 0.5542747358309318,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004976280294513079,
|
|
"loss": 5.3505,
|
|
"mean_token_accuracy": 0.18287664502859116,
|
|
"num_tokens": 13239628.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 5.404953861236573,
|
|
"epoch": 0.5547550432276657,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004976230618970431,
|
|
"loss": 5.352,
|
|
"mean_token_accuracy": 0.19548004865646362,
|
|
"num_tokens": 13251149.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"entropy": 5.455016326904297,
|
|
"epoch": 0.5552353506243997,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000497618089174132,
|
|
"loss": 5.413,
|
|
"mean_token_accuracy": 0.18660195618867875,
|
|
"num_tokens": 13264846.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 5.248121690750122,
|
|
"epoch": 0.5557156580211335,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004976131112826898,
|
|
"loss": 5.1913,
|
|
"mean_token_accuracy": 0.2054605171084404,
|
|
"num_tokens": 13275409.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"entropy": 5.259016036987305,
|
|
"epoch": 0.5561959654178674,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004976081282228323,
|
|
"loss": 5.1657,
|
|
"mean_token_accuracy": 0.20358884781599046,
|
|
"num_tokens": 13287173.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 5.411679124832153,
|
|
"epoch": 0.5566762728146013,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000497603139994675,
|
|
"loss": 5.2377,
|
|
"mean_token_accuracy": 0.19680293649435043,
|
|
"num_tokens": 13298225.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"entropy": 5.2930761814117435,
|
|
"epoch": 0.5571565802113353,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004975981465983338,
|
|
"loss": 5.2468,
|
|
"mean_token_accuracy": 0.19053254425525665,
|
|
"num_tokens": 13309685.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 5.304633331298828,
|
|
"epoch": 0.5576368876080692,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004975931480339246,
|
|
"loss": 5.2554,
|
|
"mean_token_accuracy": 0.19651708900928497,
|
|
"num_tokens": 13320837.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"entropy": 5.383905267715454,
|
|
"epoch": 0.5581171950048031,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004975881443015635,
|
|
"loss": 5.3718,
|
|
"mean_token_accuracy": 0.19027461260557174,
|
|
"num_tokens": 13333512.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 5.465289068222046,
|
|
"epoch": 0.5585975024015369,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004975831354013667,
|
|
"loss": 5.3829,
|
|
"mean_token_accuracy": 0.19368760734796525,
|
|
"num_tokens": 13345189.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"entropy": 5.329316329956055,
|
|
"epoch": 0.5590778097982709,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004975781213334503,
|
|
"loss": 5.2472,
|
|
"mean_token_accuracy": 0.20152513086795806,
|
|
"num_tokens": 13356123.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 5.329442405700684,
|
|
"epoch": 0.5595581171950048,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004975731020979309,
|
|
"loss": 5.2949,
|
|
"mean_token_accuracy": 0.19351785629987717,
|
|
"num_tokens": 13366902.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"entropy": 5.4559613227844235,
|
|
"epoch": 0.5600384245917387,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004975680776949249,
|
|
"loss": 5.3542,
|
|
"mean_token_accuracy": 0.18989898711442948,
|
|
"num_tokens": 13377567.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 5.390386629104614,
|
|
"epoch": 0.5605187319884726,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004975630481245492,
|
|
"loss": 5.2869,
|
|
"mean_token_accuracy": 0.2009364992380142,
|
|
"num_tokens": 13387297.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"entropy": 5.348505544662475,
|
|
"epoch": 0.5609990393852066,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004975580133869202,
|
|
"loss": 5.3381,
|
|
"mean_token_accuracy": 0.1932346299290657,
|
|
"num_tokens": 13397723.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 5.408625984191895,
|
|
"epoch": 0.5614793467819404,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004975529734821552,
|
|
"loss": 5.3863,
|
|
"mean_token_accuracy": 0.18635910749435425,
|
|
"num_tokens": 13409875.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"entropy": 5.352054500579834,
|
|
"epoch": 0.5619596541786743,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004975479284103708,
|
|
"loss": 5.2921,
|
|
"mean_token_accuracy": 0.1954024314880371,
|
|
"num_tokens": 13421338.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 5.418287992477417,
|
|
"epoch": 0.5624399615754082,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004975428781716845,
|
|
"loss": 5.3258,
|
|
"mean_token_accuracy": 0.19152757823467254,
|
|
"num_tokens": 13431373.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"entropy": 5.360725784301758,
|
|
"epoch": 0.5629202689721422,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004975378227662134,
|
|
"loss": 5.3208,
|
|
"mean_token_accuracy": 0.19721843004226686,
|
|
"num_tokens": 13443158.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 5.44525113105774,
|
|
"epoch": 0.5634005763688761,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004975327621940746,
|
|
"loss": 5.3795,
|
|
"mean_token_accuracy": 0.18757863938808442,
|
|
"num_tokens": 13454559.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"entropy": 5.453475904464722,
|
|
"epoch": 0.56388088376561,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004975276964553861,
|
|
"loss": 5.4604,
|
|
"mean_token_accuracy": 0.1895272508263588,
|
|
"num_tokens": 13466934.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 5.349884796142578,
|
|
"epoch": 0.5643611911623438,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004975226255502651,
|
|
"loss": 5.2124,
|
|
"mean_token_accuracy": 0.20376883447170258,
|
|
"num_tokens": 13477770.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"entropy": 5.428862237930298,
|
|
"epoch": 0.5648414985590778,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004975175494788297,
|
|
"loss": 5.4214,
|
|
"mean_token_accuracy": 0.1833633303642273,
|
|
"num_tokens": 13490093.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 5.4273130893707275,
|
|
"epoch": 0.5653218059558117,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004975124682411974,
|
|
"loss": 5.2743,
|
|
"mean_token_accuracy": 0.19006698280572892,
|
|
"num_tokens": 13500663.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"entropy": 5.404650068283081,
|
|
"epoch": 0.5658021133525456,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004975073818374863,
|
|
"loss": 5.3747,
|
|
"mean_token_accuracy": 0.19194794446229935,
|
|
"num_tokens": 13512369.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 5.352162408828735,
|
|
"epoch": 0.5662824207492796,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004975022902678145,
|
|
"loss": 5.2518,
|
|
"mean_token_accuracy": 0.18981288820505143,
|
|
"num_tokens": 13523181.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"entropy": 5.307896852493286,
|
|
"epoch": 0.5667627281460135,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004974971935323003,
|
|
"loss": 5.2062,
|
|
"mean_token_accuracy": 0.19488532990217208,
|
|
"num_tokens": 13534113.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 5.3025891304016115,
|
|
"epoch": 0.5672430355427474,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004974920916310619,
|
|
"loss": 5.2425,
|
|
"mean_token_accuracy": 0.19460777193307877,
|
|
"num_tokens": 13545037.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"entropy": 5.368872261047363,
|
|
"epoch": 0.5677233429394812,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004974869845642178,
|
|
"loss": 5.2926,
|
|
"mean_token_accuracy": 0.19421349167823793,
|
|
"num_tokens": 13555541.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 5.389457654953003,
|
|
"epoch": 0.5682036503362152,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004974818723318866,
|
|
"loss": 5.2973,
|
|
"mean_token_accuracy": 0.19764145314693451,
|
|
"num_tokens": 13566951.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"entropy": 5.347638368606567,
|
|
"epoch": 0.5686839577329491,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004974767549341868,
|
|
"loss": 5.3505,
|
|
"mean_token_accuracy": 0.18888978958129882,
|
|
"num_tokens": 13578492.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 5.425949621200561,
|
|
"epoch": 0.569164265129683,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004974716323712376,
|
|
"loss": 5.2433,
|
|
"mean_token_accuracy": 0.20290264040231704,
|
|
"num_tokens": 13589183.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"entropy": 5.37887659072876,
|
|
"epoch": 0.5696445725264169,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004974665046431576,
|
|
"loss": 5.3868,
|
|
"mean_token_accuracy": 0.19258931577205657,
|
|
"num_tokens": 13600588.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 5.309185123443603,
|
|
"epoch": 0.5701248799231509,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004974613717500659,
|
|
"loss": 5.2605,
|
|
"mean_token_accuracy": 0.20295644104480742,
|
|
"num_tokens": 13612107.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"entropy": 5.485657453536987,
|
|
"epoch": 0.5706051873198847,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004974562336920818,
|
|
"loss": 5.4246,
|
|
"mean_token_accuracy": 0.18908909112215042,
|
|
"num_tokens": 13623973.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 5.3633698463439945,
|
|
"epoch": 0.5710854947166186,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004974510904693245,
|
|
"loss": 5.2372,
|
|
"mean_token_accuracy": 0.19648284167051316,
|
|
"num_tokens": 13634994.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"entropy": 5.412157249450684,
|
|
"epoch": 0.5715658021133525,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004974459420819134,
|
|
"loss": 5.3895,
|
|
"mean_token_accuracy": 0.19440043568611146,
|
|
"num_tokens": 13646361.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 5.36341814994812,
|
|
"epoch": 0.5720461095100865,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000497440788529968,
|
|
"loss": 5.2834,
|
|
"mean_token_accuracy": 0.19329349249601363,
|
|
"num_tokens": 13656975.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"entropy": 5.428890562057495,
|
|
"epoch": 0.5725264169068204,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004974356298136081,
|
|
"loss": 5.3207,
|
|
"mean_token_accuracy": 0.18961571753025055,
|
|
"num_tokens": 13668434.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 5.403112125396729,
|
|
"epoch": 0.5730067243035543,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004974304659329533,
|
|
"loss": 5.301,
|
|
"mean_token_accuracy": 0.1921529397368431,
|
|
"num_tokens": 13679266.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"entropy": 5.291449975967407,
|
|
"epoch": 0.5734870317002881,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004974252968881236,
|
|
"loss": 5.3247,
|
|
"mean_token_accuracy": 0.18704658299684523,
|
|
"num_tokens": 13690921.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 5.385117483139038,
|
|
"epoch": 0.5739673390970221,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000497420122679239,
|
|
"loss": 5.2579,
|
|
"mean_token_accuracy": 0.19390686601400375,
|
|
"num_tokens": 13702329.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"entropy": 5.317170143127441,
|
|
"epoch": 0.574447646493756,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004974149433064196,
|
|
"loss": 5.2295,
|
|
"mean_token_accuracy": 0.20150385797023773,
|
|
"num_tokens": 13713356.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 5.237676763534546,
|
|
"epoch": 0.5749279538904899,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004974097587697856,
|
|
"loss": 5.2294,
|
|
"mean_token_accuracy": 0.19473931789398194,
|
|
"num_tokens": 13724718.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"entropy": 5.28824028968811,
|
|
"epoch": 0.5754082612872238,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004974045690694575,
|
|
"loss": 5.2596,
|
|
"mean_token_accuracy": 0.196784345805645,
|
|
"num_tokens": 13736113.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 5.417406034469605,
|
|
"epoch": 0.5758885686839578,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004973993742055557,
|
|
"loss": 5.272,
|
|
"mean_token_accuracy": 0.19672393202781677,
|
|
"num_tokens": 13748322.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"entropy": 5.3009929180145265,
|
|
"epoch": 0.5763688760806917,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004973941741782007,
|
|
"loss": 5.2743,
|
|
"mean_token_accuracy": 0.18973211497068404,
|
|
"num_tokens": 13759433.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.5763688760806917,
|
|
"eval_entropy": 5.216975544093005,
|
|
"eval_loss": 5.320178508758545,
|
|
"eval_mean_token_accuracy": 0.1993778554485636,
|
|
"eval_num_tokens": 13759433.0,
|
|
"eval_runtime": 27.3927,
|
|
"eval_samples_per_second": 1197.949,
|
|
"eval_steps_per_second": 149.748,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 5.300173044204712,
|
|
"epoch": 0.5768491834774255,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004973889689875135,
|
|
"loss": 5.195,
|
|
"mean_token_accuracy": 0.1984873592853546,
|
|
"num_tokens": 13770181.0,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"entropy": 5.272101497650146,
|
|
"epoch": 0.5773294908741594,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004973837586336147,
|
|
"loss": 5.2443,
|
|
"mean_token_accuracy": 0.19233150780200958,
|
|
"num_tokens": 13781792.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 5.360331630706787,
|
|
"epoch": 0.5778097982708934,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004973785431166254,
|
|
"loss": 5.3034,
|
|
"mean_token_accuracy": 0.1883278176188469,
|
|
"num_tokens": 13792101.0,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"entropy": 5.372050094604492,
|
|
"epoch": 0.5782901056676273,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004973733224366666,
|
|
"loss": 5.2927,
|
|
"mean_token_accuracy": 0.19648923128843307,
|
|
"num_tokens": 13803640.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 5.3110956192016605,
|
|
"epoch": 0.5787704130643612,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004973680965938597,
|
|
"loss": 5.2097,
|
|
"mean_token_accuracy": 0.1993360698223114,
|
|
"num_tokens": 13815017.0,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"entropy": 5.383702278137207,
|
|
"epoch": 0.579250720461095,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004973628655883258,
|
|
"loss": 5.4354,
|
|
"mean_token_accuracy": 0.18790345638990402,
|
|
"num_tokens": 13826119.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 5.362691211700439,
|
|
"epoch": 0.579731027857829,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004973576294201865,
|
|
"loss": 5.2425,
|
|
"mean_token_accuracy": 0.193815678358078,
|
|
"num_tokens": 13837869.0,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"entropy": 5.532571697235108,
|
|
"epoch": 0.5802113352545629,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004973523880895633,
|
|
"loss": 5.4173,
|
|
"mean_token_accuracy": 0.1810302734375,
|
|
"num_tokens": 13849333.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 5.346681356430054,
|
|
"epoch": 0.5806916426512968,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004973471415965779,
|
|
"loss": 5.2732,
|
|
"mean_token_accuracy": 0.19453433007001877,
|
|
"num_tokens": 13860648.0,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"entropy": 5.426644325256348,
|
|
"epoch": 0.5811719500480308,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000497341889941352,
|
|
"loss": 5.4282,
|
|
"mean_token_accuracy": 0.18453214317560196,
|
|
"num_tokens": 13872979.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 5.398784351348877,
|
|
"epoch": 0.5816522574447647,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004973366331240078,
|
|
"loss": 5.2973,
|
|
"mean_token_accuracy": 0.19479347318410872,
|
|
"num_tokens": 13884363.0,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"entropy": 5.369167709350586,
|
|
"epoch": 0.5821325648414986,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000497331371144667,
|
|
"loss": 5.2897,
|
|
"mean_token_accuracy": 0.1914879024028778,
|
|
"num_tokens": 13895424.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 5.348582983016968,
|
|
"epoch": 0.5826128722382324,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004973261040034521,
|
|
"loss": 5.4136,
|
|
"mean_token_accuracy": 0.1861998423933983,
|
|
"num_tokens": 13907319.0,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"entropy": 5.298081350326538,
|
|
"epoch": 0.5830931796349664,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004973208317004852,
|
|
"loss": 5.2514,
|
|
"mean_token_accuracy": 0.19497257471084595,
|
|
"num_tokens": 13920013.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 5.347387409210205,
|
|
"epoch": 0.5835734870317003,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004973155542358889,
|
|
"loss": 5.2194,
|
|
"mean_token_accuracy": 0.19683697521686555,
|
|
"num_tokens": 13932033.0,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"entropy": 5.275155830383301,
|
|
"epoch": 0.5840537944284342,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004973102716097853,
|
|
"loss": 5.2393,
|
|
"mean_token_accuracy": 0.19724634289741516,
|
|
"num_tokens": 13943324.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 5.339147853851318,
|
|
"epoch": 0.5845341018251681,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004973049838222973,
|
|
"loss": 5.3067,
|
|
"mean_token_accuracy": 0.19921963214874266,
|
|
"num_tokens": 13954291.0,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"entropy": 5.465323781967163,
|
|
"epoch": 0.5850144092219021,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004972996908735479,
|
|
"loss": 5.414,
|
|
"mean_token_accuracy": 0.1826832190155983,
|
|
"num_tokens": 13966264.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 5.453038024902344,
|
|
"epoch": 0.585494716618636,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004972943927636597,
|
|
"loss": 5.2937,
|
|
"mean_token_accuracy": 0.18843238651752472,
|
|
"num_tokens": 13977785.0,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"entropy": 5.289157247543335,
|
|
"epoch": 0.5859750240153698,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004972890894927558,
|
|
"loss": 5.2774,
|
|
"mean_token_accuracy": 0.1957810938358307,
|
|
"num_tokens": 13989704.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 5.3781982421875,
|
|
"epoch": 0.5864553314121037,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004972837810609592,
|
|
"loss": 5.2735,
|
|
"mean_token_accuracy": 0.1950765624642372,
|
|
"num_tokens": 14000565.0,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"entropy": 5.25339150428772,
|
|
"epoch": 0.5869356388088377,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004972784674683933,
|
|
"loss": 5.1817,
|
|
"mean_token_accuracy": 0.20146536976099014,
|
|
"num_tokens": 14011521.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 5.334528207778931,
|
|
"epoch": 0.5874159462055716,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004972731487151815,
|
|
"loss": 5.2246,
|
|
"mean_token_accuracy": 0.20138936042785643,
|
|
"num_tokens": 14022966.0,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"entropy": 5.453242635726928,
|
|
"epoch": 0.5878962536023055,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004972678248014471,
|
|
"loss": 5.3627,
|
|
"mean_token_accuracy": 0.1907268077135086,
|
|
"num_tokens": 14034905.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 5.245919466018677,
|
|
"epoch": 0.5883765609990393,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004972624957273139,
|
|
"loss": 5.2067,
|
|
"mean_token_accuracy": 0.1968609645962715,
|
|
"num_tokens": 14045816.0,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"entropy": 5.300683164596558,
|
|
"epoch": 0.5888568683957733,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004972571614929055,
|
|
"loss": 5.1877,
|
|
"mean_token_accuracy": 0.1941026657819748,
|
|
"num_tokens": 14057316.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 5.375409936904907,
|
|
"epoch": 0.5893371757925072,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004972518220983457,
|
|
"loss": 5.308,
|
|
"mean_token_accuracy": 0.18491660058498383,
|
|
"num_tokens": 14067542.0,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"entropy": 5.255684518814087,
|
|
"epoch": 0.5898174831892411,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004972464775437586,
|
|
"loss": 5.1798,
|
|
"mean_token_accuracy": 0.19628757536411284,
|
|
"num_tokens": 14079467.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 5.297601890563965,
|
|
"epoch": 0.590297790585975,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004972411278292683,
|
|
"loss": 5.1377,
|
|
"mean_token_accuracy": 0.2022266536951065,
|
|
"num_tokens": 14090695.0,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"entropy": 5.310152339935303,
|
|
"epoch": 0.590778097982709,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004972357729549988,
|
|
"loss": 5.1883,
|
|
"mean_token_accuracy": 0.20686898082494737,
|
|
"num_tokens": 14101380.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 5.255633974075318,
|
|
"epoch": 0.5912584053794429,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004972304129210746,
|
|
"loss": 5.2317,
|
|
"mean_token_accuracy": 0.19752228856086732,
|
|
"num_tokens": 14112265.0,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"entropy": 5.3430475234985355,
|
|
"epoch": 0.5917387127761767,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004972250477276202,
|
|
"loss": 5.2263,
|
|
"mean_token_accuracy": 0.19471232295036317,
|
|
"num_tokens": 14124294.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 5.320528650283814,
|
|
"epoch": 0.5922190201729106,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004972196773747599,
|
|
"loss": 5.2509,
|
|
"mean_token_accuracy": 0.1957827016711235,
|
|
"num_tokens": 14135230.0,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"entropy": 5.363646554946899,
|
|
"epoch": 0.5926993275696446,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004972143018626186,
|
|
"loss": 5.3444,
|
|
"mean_token_accuracy": 0.19247063994407654,
|
|
"num_tokens": 14147576.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 5.2979090213775635,
|
|
"epoch": 0.5931796349663785,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004972089211913211,
|
|
"loss": 5.2239,
|
|
"mean_token_accuracy": 0.20307936817407607,
|
|
"num_tokens": 14158834.0,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"entropy": 5.386188983917236,
|
|
"epoch": 0.5936599423631124,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004972035353609923,
|
|
"loss": 5.3616,
|
|
"mean_token_accuracy": 0.1897743433713913,
|
|
"num_tokens": 14170694.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 5.350439167022705,
|
|
"epoch": 0.5941402497598463,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004971981443717572,
|
|
"loss": 5.275,
|
|
"mean_token_accuracy": 0.19451749473810195,
|
|
"num_tokens": 14183184.0,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"entropy": 5.413315868377685,
|
|
"epoch": 0.5946205571565802,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004971927482237409,
|
|
"loss": 5.3614,
|
|
"mean_token_accuracy": 0.18740272670984268,
|
|
"num_tokens": 14194761.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 5.309095239639282,
|
|
"epoch": 0.5951008645533141,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004971873469170689,
|
|
"loss": 5.1715,
|
|
"mean_token_accuracy": 0.19937018156051636,
|
|
"num_tokens": 14205820.0,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"entropy": 5.198407316207886,
|
|
"epoch": 0.595581171950048,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004971819404518664,
|
|
"loss": 5.1875,
|
|
"mean_token_accuracy": 0.20422977358102798,
|
|
"num_tokens": 14217826.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 5.258822679519653,
|
|
"epoch": 0.5960614793467819,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000497176528828259,
|
|
"loss": 5.1531,
|
|
"mean_token_accuracy": 0.20904283672571183,
|
|
"num_tokens": 14228632.0,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"entropy": 5.395007658004761,
|
|
"epoch": 0.5965417867435159,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004971711120463722,
|
|
"loss": 5.3763,
|
|
"mean_token_accuracy": 0.1843624085187912,
|
|
"num_tokens": 14240231.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 5.421968078613281,
|
|
"epoch": 0.5970220941402498,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000497165690106332,
|
|
"loss": 5.2243,
|
|
"mean_token_accuracy": 0.19937607198953627,
|
|
"num_tokens": 14251718.0,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"entropy": 5.301095485687256,
|
|
"epoch": 0.5975024015369836,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004971602630082642,
|
|
"loss": 5.2678,
|
|
"mean_token_accuracy": 0.20054133832454682,
|
|
"num_tokens": 14263129.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 5.272244215011597,
|
|
"epoch": 0.5979827089337176,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004971548307522947,
|
|
"loss": 5.1874,
|
|
"mean_token_accuracy": 0.195827853679657,
|
|
"num_tokens": 14274465.0,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"entropy": 5.465359544754028,
|
|
"epoch": 0.5984630163304515,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004971493933385498,
|
|
"loss": 5.3672,
|
|
"mean_token_accuracy": 0.19250321239233018,
|
|
"num_tokens": 14286190.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 5.372733306884766,
|
|
"epoch": 0.5989433237271854,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004971439507671556,
|
|
"loss": 5.3155,
|
|
"mean_token_accuracy": 0.1895249903202057,
|
|
"num_tokens": 14297638.0,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"entropy": 5.272776174545288,
|
|
"epoch": 0.5994236311239193,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004971385030382384,
|
|
"loss": 5.261,
|
|
"mean_token_accuracy": 0.19516938775777817,
|
|
"num_tokens": 14309773.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 5.37848744392395,
|
|
"epoch": 0.5999039385206533,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004971330501519248,
|
|
"loss": 5.2141,
|
|
"mean_token_accuracy": 0.1981222003698349,
|
|
"num_tokens": 14320543.0,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"entropy": 5.3494123935699465,
|
|
"epoch": 0.6003842459173871,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004971275921083414,
|
|
"loss": 5.2969,
|
|
"mean_token_accuracy": 0.1915585696697235,
|
|
"num_tokens": 14332076.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 5.2921350479125975,
|
|
"epoch": 0.600864553314121,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000497122128907615,
|
|
"loss": 5.2163,
|
|
"mean_token_accuracy": 0.19611912965774536,
|
|
"num_tokens": 14343768.0,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"entropy": 5.304634475708008,
|
|
"epoch": 0.6013448607108549,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004971166605498722,
|
|
"loss": 5.259,
|
|
"mean_token_accuracy": 0.1918262854218483,
|
|
"num_tokens": 14357051.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 5.361700868606567,
|
|
"epoch": 0.6018251681075889,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004971111870352401,
|
|
"loss": 5.2776,
|
|
"mean_token_accuracy": 0.19047526866197587,
|
|
"num_tokens": 14368800.0,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"entropy": 5.3359825134277346,
|
|
"epoch": 0.6023054755043228,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004971057083638458,
|
|
"loss": 5.2601,
|
|
"mean_token_accuracy": 0.19838642477989196,
|
|
"num_tokens": 14379617.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 5.304495000839234,
|
|
"epoch": 0.6027857829010567,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004971002245358166,
|
|
"loss": 5.2402,
|
|
"mean_token_accuracy": 0.19964935183525084,
|
|
"num_tokens": 14391454.0,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"entropy": 5.369532489776612,
|
|
"epoch": 0.6032660902977905,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004970947355512795,
|
|
"loss": 5.2397,
|
|
"mean_token_accuracy": 0.20377653539180757,
|
|
"num_tokens": 14402379.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 5.291622447967529,
|
|
"epoch": 0.6037463976945245,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004970892414103622,
|
|
"loss": 5.2178,
|
|
"mean_token_accuracy": 0.19688135832548143,
|
|
"num_tokens": 14415040.0,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"entropy": 5.331115865707398,
|
|
"epoch": 0.6042267050912584,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004970837421131921,
|
|
"loss": 5.3402,
|
|
"mean_token_accuracy": 0.18655368387699128,
|
|
"num_tokens": 14426677.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 5.365583896636963,
|
|
"epoch": 0.6047070124879923,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004970782376598972,
|
|
"loss": 5.2407,
|
|
"mean_token_accuracy": 0.19816339612007142,
|
|
"num_tokens": 14436676.0,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"entropy": 5.344175672531128,
|
|
"epoch": 0.6051873198847262,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004970727280506048,
|
|
"loss": 5.2782,
|
|
"mean_token_accuracy": 0.19335077702999115,
|
|
"num_tokens": 14448294.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 5.324075174331665,
|
|
"epoch": 0.6056676272814602,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004970672132854431,
|
|
"loss": 5.2542,
|
|
"mean_token_accuracy": 0.19286826699972154,
|
|
"num_tokens": 14460642.0,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"entropy": 5.305904293060303,
|
|
"epoch": 0.6061479346781941,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004970616933645403,
|
|
"loss": 5.1445,
|
|
"mean_token_accuracy": 0.20802572518587112,
|
|
"num_tokens": 14471370.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 5.404938650131226,
|
|
"epoch": 0.6066282420749279,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004970561682880242,
|
|
"loss": 5.3134,
|
|
"mean_token_accuracy": 0.19435038715600966,
|
|
"num_tokens": 14482358.0,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"entropy": 5.315574312210083,
|
|
"epoch": 0.6071085494716618,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004970506380560233,
|
|
"loss": 5.3291,
|
|
"mean_token_accuracy": 0.1953802764415741,
|
|
"num_tokens": 14494413.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 5.382768726348877,
|
|
"epoch": 0.6075888568683958,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004970451026686659,
|
|
"loss": 5.2398,
|
|
"mean_token_accuracy": 0.203892120718956,
|
|
"num_tokens": 14506370.0,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"entropy": 5.3352419376373295,
|
|
"epoch": 0.6080691642651297,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004970395621260806,
|
|
"loss": 5.2322,
|
|
"mean_token_accuracy": 0.19640335738658904,
|
|
"num_tokens": 14517198.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 5.415618896484375,
|
|
"epoch": 0.6085494716618636,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000497034016428396,
|
|
"loss": 5.3509,
|
|
"mean_token_accuracy": 0.18669991344213485,
|
|
"num_tokens": 14529434.0,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"entropy": 5.267712688446045,
|
|
"epoch": 0.6090297790585975,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004970284655757409,
|
|
"loss": 5.234,
|
|
"mean_token_accuracy": 0.1992996484041214,
|
|
"num_tokens": 14540857.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"entropy": 5.300823831558228,
|
|
"epoch": 0.6095100864553314,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004970229095682439,
|
|
"loss": 5.2642,
|
|
"mean_token_accuracy": 0.1999596104025841,
|
|
"num_tokens": 14552594.0,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"entropy": 5.428168153762817,
|
|
"epoch": 0.6099903938520653,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004970173484060344,
|
|
"loss": 5.3089,
|
|
"mean_token_accuracy": 0.18913527578115463,
|
|
"num_tokens": 14563599.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"entropy": 5.352987909317017,
|
|
"epoch": 0.6104707012487992,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004970117820892414,
|
|
"loss": 5.2652,
|
|
"mean_token_accuracy": 0.19545669108629227,
|
|
"num_tokens": 14575905.0,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"entropy": 5.381795263290405,
|
|
"epoch": 0.6109510086455331,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004970062106179939,
|
|
"loss": 5.281,
|
|
"mean_token_accuracy": 0.1997828796505928,
|
|
"num_tokens": 14587800.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"entropy": 5.3206565380096436,
|
|
"epoch": 0.6114313160422671,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004970006339924214,
|
|
"loss": 5.2193,
|
|
"mean_token_accuracy": 0.20122889876365663,
|
|
"num_tokens": 14600654.0,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"entropy": 5.323697805404663,
|
|
"epoch": 0.611911623439001,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004969950522126534,
|
|
"loss": 5.2165,
|
|
"mean_token_accuracy": 0.1956930086016655,
|
|
"num_tokens": 14611985.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"entropy": 5.253861331939698,
|
|
"epoch": 0.6123919308357348,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004969894652788196,
|
|
"loss": 5.2112,
|
|
"mean_token_accuracy": 0.19875800609588623,
|
|
"num_tokens": 14625004.0,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"entropy": 5.403214597702027,
|
|
"epoch": 0.6128722382324687,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004969838731910494,
|
|
"loss": 5.2834,
|
|
"mean_token_accuracy": 0.19371581822633743,
|
|
"num_tokens": 14635381.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"entropy": 5.254595232009888,
|
|
"epoch": 0.6133525456292027,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004969782759494729,
|
|
"loss": 5.2267,
|
|
"mean_token_accuracy": 0.2021428868174553,
|
|
"num_tokens": 14646582.0,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"entropy": 5.305480861663819,
|
|
"epoch": 0.6138328530259366,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00049697267355422,
|
|
"loss": 5.2809,
|
|
"mean_token_accuracy": 0.19987702816724778,
|
|
"num_tokens": 14658024.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"entropy": 5.461784887313843,
|
|
"epoch": 0.6143131604226705,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004969670660054208,
|
|
"loss": 5.3349,
|
|
"mean_token_accuracy": 0.18528691679239273,
|
|
"num_tokens": 14669933.0,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"entropy": 5.338407850265503,
|
|
"epoch": 0.6147934678194045,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004969614533032054,
|
|
"loss": 5.2732,
|
|
"mean_token_accuracy": 0.19340415596961974,
|
|
"num_tokens": 14681331.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"entropy": 5.253151512145996,
|
|
"epoch": 0.6152737752161384,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004969558354477041,
|
|
"loss": 5.1417,
|
|
"mean_token_accuracy": 0.20471659004688264,
|
|
"num_tokens": 14691361.0,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"entropy": 5.294320201873779,
|
|
"epoch": 0.6157540826128722,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004969502124390474,
|
|
"loss": 5.2437,
|
|
"mean_token_accuracy": 0.19678280949592591,
|
|
"num_tokens": 14701791.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 5.343457746505737,
|
|
"epoch": 0.6162343900096061,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004969445842773658,
|
|
"loss": 5.2944,
|
|
"mean_token_accuracy": 0.1905987396836281,
|
|
"num_tokens": 14713672.0,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"entropy": 5.280415821075439,
|
|
"epoch": 0.6167146974063401,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00049693895096279,
|
|
"loss": 5.134,
|
|
"mean_token_accuracy": 0.20532522946596146,
|
|
"num_tokens": 14724789.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 5.300199031829834,
|
|
"epoch": 0.617195004803074,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004969333124954508,
|
|
"loss": 5.2255,
|
|
"mean_token_accuracy": 0.19219568222761155,
|
|
"num_tokens": 14737212.0,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"entropy": 5.260709667205811,
|
|
"epoch": 0.6176753121998079,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004969276688754791,
|
|
"loss": 5.2243,
|
|
"mean_token_accuracy": 0.19760479480028154,
|
|
"num_tokens": 14748387.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 5.331292247772216,
|
|
"epoch": 0.6181556195965417,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004969220201030059,
|
|
"loss": 5.2515,
|
|
"mean_token_accuracy": 0.19721336513757706,
|
|
"num_tokens": 14758477.0,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"entropy": 5.393822145462036,
|
|
"epoch": 0.6186359269932757,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004969163661781624,
|
|
"loss": 5.3479,
|
|
"mean_token_accuracy": 0.19438280314207076,
|
|
"num_tokens": 14769650.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 5.244111680984497,
|
|
"epoch": 0.6191162343900096,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004969107071010798,
|
|
"loss": 5.1141,
|
|
"mean_token_accuracy": 0.2060042515397072,
|
|
"num_tokens": 14780988.0,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"entropy": 5.328417253494263,
|
|
"epoch": 0.6195965417867435,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004969050428718895,
|
|
"loss": 5.2141,
|
|
"mean_token_accuracy": 0.20170180201530458,
|
|
"num_tokens": 14792458.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 5.370841217041016,
|
|
"epoch": 0.6200768491834774,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000496899373490723,
|
|
"loss": 5.302,
|
|
"mean_token_accuracy": 0.19348903000354767,
|
|
"num_tokens": 14804772.0,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"entropy": 5.38532247543335,
|
|
"epoch": 0.6205571565802114,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000496893698957712,
|
|
"loss": 5.2386,
|
|
"mean_token_accuracy": 0.19131766855716706,
|
|
"num_tokens": 14816126.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 5.34750804901123,
|
|
"epoch": 0.6210374639769453,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004968880192729882,
|
|
"loss": 5.3551,
|
|
"mean_token_accuracy": 0.19073225259780885,
|
|
"num_tokens": 14829376.0,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"entropy": 5.3107569217681885,
|
|
"epoch": 0.6215177713736791,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004968823344366835,
|
|
"loss": 5.1864,
|
|
"mean_token_accuracy": 0.20422402322292327,
|
|
"num_tokens": 14841098.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 5.321711921691895,
|
|
"epoch": 0.621998078770413,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004968766444489298,
|
|
"loss": 5.1995,
|
|
"mean_token_accuracy": 0.19894031435251236,
|
|
"num_tokens": 14852690.0,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"entropy": 5.283687448501587,
|
|
"epoch": 0.622478386167147,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004968709493098593,
|
|
"loss": 5.1839,
|
|
"mean_token_accuracy": 0.199807707965374,
|
|
"num_tokens": 14863327.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 5.36080002784729,
|
|
"epoch": 0.6229586935638809,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004968652490196041,
|
|
"loss": 5.3413,
|
|
"mean_token_accuracy": 0.19556114822626114,
|
|
"num_tokens": 14875213.0,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"entropy": 5.360726022720337,
|
|
"epoch": 0.6234390009606148,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004968595435782967,
|
|
"loss": 5.26,
|
|
"mean_token_accuracy": 0.20531406849622727,
|
|
"num_tokens": 14886129.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 5.433628988265991,
|
|
"epoch": 0.6239193083573487,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004968538329860695,
|
|
"loss": 5.3958,
|
|
"mean_token_accuracy": 0.18427062630653382,
|
|
"num_tokens": 14897217.0,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"entropy": 5.34465217590332,
|
|
"epoch": 0.6243996157540826,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004968481172430549,
|
|
"loss": 5.304,
|
|
"mean_token_accuracy": 0.19546790570020675,
|
|
"num_tokens": 14908438.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 5.292671966552734,
|
|
"epoch": 0.6248799231508165,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000496842396349386,
|
|
"loss": 5.1776,
|
|
"mean_token_accuracy": 0.1998446449637413,
|
|
"num_tokens": 14920472.0,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"entropy": 5.301685905456543,
|
|
"epoch": 0.6253602305475504,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004968366703051952,
|
|
"loss": 5.2294,
|
|
"mean_token_accuracy": 0.1975826621055603,
|
|
"num_tokens": 14932075.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 5.327823829650879,
|
|
"epoch": 0.6258405379442843,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004968309391106157,
|
|
"loss": 5.2856,
|
|
"mean_token_accuracy": 0.1950155645608902,
|
|
"num_tokens": 14942971.0,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"entropy": 5.307694292068481,
|
|
"epoch": 0.6263208453410183,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004968252027657806,
|
|
"loss": 5.1169,
|
|
"mean_token_accuracy": 0.20416004061698914,
|
|
"num_tokens": 14954288.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 5.208348560333252,
|
|
"epoch": 0.6268011527377522,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004968194612708229,
|
|
"loss": 5.1838,
|
|
"mean_token_accuracy": 0.19983574450016023,
|
|
"num_tokens": 14966017.0,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"entropy": 5.391433906555176,
|
|
"epoch": 0.627281460134486,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004968137146258759,
|
|
"loss": 5.268,
|
|
"mean_token_accuracy": 0.19897303581237794,
|
|
"num_tokens": 14978022.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 5.215319442749023,
|
|
"epoch": 0.6277617675312199,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004968079628310732,
|
|
"loss": 5.178,
|
|
"mean_token_accuracy": 0.20022727847099303,
|
|
"num_tokens": 14990370.0,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"entropy": 5.197961759567261,
|
|
"epoch": 0.6282420749279539,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004968022058865482,
|
|
"loss": 5.2013,
|
|
"mean_token_accuracy": 0.20392390042543412,
|
|
"num_tokens": 15001535.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 5.426657629013062,
|
|
"epoch": 0.6287223823246878,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004967964437924346,
|
|
"loss": 5.2742,
|
|
"mean_token_accuracy": 0.19859713315963745,
|
|
"num_tokens": 15012560.0,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"entropy": 5.225249814987182,
|
|
"epoch": 0.6292026897214217,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004967906765488662,
|
|
"loss": 5.1805,
|
|
"mean_token_accuracy": 0.20398483723402022,
|
|
"num_tokens": 15024594.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 5.287734794616699,
|
|
"epoch": 0.6296829971181557,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004967849041559769,
|
|
"loss": 5.209,
|
|
"mean_token_accuracy": 0.19682117998600007,
|
|
"num_tokens": 15034735.0,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"entropy": 5.371293401718139,
|
|
"epoch": 0.6301633045148896,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004967791266139006,
|
|
"loss": 5.3218,
|
|
"mean_token_accuracy": 0.192607519030571,
|
|
"num_tokens": 15046955.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 5.280347061157227,
|
|
"epoch": 0.6306436119116234,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004967733439227716,
|
|
"loss": 5.181,
|
|
"mean_token_accuracy": 0.20368633568286895,
|
|
"num_tokens": 15058250.0,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"entropy": 5.304067659378052,
|
|
"epoch": 0.6311239193083573,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000496767556082724,
|
|
"loss": 5.3101,
|
|
"mean_token_accuracy": 0.19713823050260543,
|
|
"num_tokens": 15069451.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 5.3541919708251955,
|
|
"epoch": 0.6316042267050913,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004967617630938924,
|
|
"loss": 5.2624,
|
|
"mean_token_accuracy": 0.20069709718227385,
|
|
"num_tokens": 15080920.0,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"entropy": 5.265263462066651,
|
|
"epoch": 0.6320845341018252,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000496755964956411,
|
|
"loss": 5.1853,
|
|
"mean_token_accuracy": 0.195594023168087,
|
|
"num_tokens": 15091579.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 5.33905520439148,
|
|
"epoch": 0.6325648414985591,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004967501616704147,
|
|
"loss": 5.3058,
|
|
"mean_token_accuracy": 0.19578456729650498,
|
|
"num_tokens": 15103097.0,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"entropy": 5.338488435745239,
|
|
"epoch": 0.633045148895293,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000496744353236038,
|
|
"loss": 5.2011,
|
|
"mean_token_accuracy": 0.1994476333260536,
|
|
"num_tokens": 15114646.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 5.396579837799072,
|
|
"epoch": 0.633525456292027,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000496738539653416,
|
|
"loss": 5.2462,
|
|
"mean_token_accuracy": 0.19710550010204314,
|
|
"num_tokens": 15126019.0,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"entropy": 5.320884847640992,
|
|
"epoch": 0.6340057636887608,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004967327209226835,
|
|
"loss": 5.2417,
|
|
"mean_token_accuracy": 0.20047877579927445,
|
|
"num_tokens": 15136132.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 5.328518581390381,
|
|
"epoch": 0.6344860710854947,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004967268970439755,
|
|
"loss": 5.3295,
|
|
"mean_token_accuracy": 0.19120048433542253,
|
|
"num_tokens": 15149016.0,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"entropy": 5.390049648284912,
|
|
"epoch": 0.6349663784822286,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004967210680174274,
|
|
"loss": 5.2831,
|
|
"mean_token_accuracy": 0.1991571456193924,
|
|
"num_tokens": 15160302.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 5.326699829101562,
|
|
"epoch": 0.6354466858789626,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004967152338431745,
|
|
"loss": 5.266,
|
|
"mean_token_accuracy": 0.19298024773597716,
|
|
"num_tokens": 15171770.0,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"entropy": 5.307769346237182,
|
|
"epoch": 0.6359269932756965,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004967093945213523,
|
|
"loss": 5.1512,
|
|
"mean_token_accuracy": 0.20107036381959914,
|
|
"num_tokens": 15182283.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 5.251153898239136,
|
|
"epoch": 0.6364073006724303,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004967035500520962,
|
|
"loss": 5.2544,
|
|
"mean_token_accuracy": 0.19852742105722426,
|
|
"num_tokens": 15193917.0,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"entropy": 5.439336967468262,
|
|
"epoch": 0.6368876080691642,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004966977004355421,
|
|
"loss": 5.3349,
|
|
"mean_token_accuracy": 0.18941647708415985,
|
|
"num_tokens": 15205854.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 5.270390892028809,
|
|
"epoch": 0.6373679154658982,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004966918456718256,
|
|
"loss": 5.1945,
|
|
"mean_token_accuracy": 0.20114167034626007,
|
|
"num_tokens": 15217884.0,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"entropy": 5.367384433746338,
|
|
"epoch": 0.6378482228626321,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004966859857610828,
|
|
"loss": 5.3235,
|
|
"mean_token_accuracy": 0.19650316685438157,
|
|
"num_tokens": 15229493.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 5.309024906158447,
|
|
"epoch": 0.638328530259366,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004966801207034497,
|
|
"loss": 5.2748,
|
|
"mean_token_accuracy": 0.1954337552189827,
|
|
"num_tokens": 15241402.0,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"entropy": 5.396312856674195,
|
|
"epoch": 0.6388088376560999,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004966742504990624,
|
|
"loss": 5.2793,
|
|
"mean_token_accuracy": 0.19446442872285843,
|
|
"num_tokens": 15252981.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 5.396939516067505,
|
|
"epoch": 0.6392891450528339,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004966683751480572,
|
|
"loss": 5.2068,
|
|
"mean_token_accuracy": 0.20260323137044906,
|
|
"num_tokens": 15264171.0,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"entropy": 5.215060138702393,
|
|
"epoch": 0.6397694524495677,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004966624946505706,
|
|
"loss": 5.2122,
|
|
"mean_token_accuracy": 0.2042062520980835,
|
|
"num_tokens": 15275595.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 5.32531771659851,
|
|
"epoch": 0.6402497598463016,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004966566090067391,
|
|
"loss": 5.284,
|
|
"mean_token_accuracy": 0.19337738156318665,
|
|
"num_tokens": 15286074.0,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"entropy": 5.371769189834595,
|
|
"epoch": 0.6407300672430355,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004966507182166993,
|
|
"loss": 5.2436,
|
|
"mean_token_accuracy": 0.19362216591835021,
|
|
"num_tokens": 15296230.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 5.315968751907349,
|
|
"epoch": 0.6412103746397695,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000496644822280588,
|
|
"loss": 5.2326,
|
|
"mean_token_accuracy": 0.19759678691625596,
|
|
"num_tokens": 15307428.0,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"entropy": 5.394734859466553,
|
|
"epoch": 0.6416906820365034,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000496638921198542,
|
|
"loss": 5.3513,
|
|
"mean_token_accuracy": 0.18394145518541336,
|
|
"num_tokens": 15318796.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 5.314431810379029,
|
|
"epoch": 0.6421709894332372,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004966330149706984,
|
|
"loss": 5.2571,
|
|
"mean_token_accuracy": 0.19566282480955124,
|
|
"num_tokens": 15330914.0,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"entropy": 5.29867787361145,
|
|
"epoch": 0.6426512968299711,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004966271035971944,
|
|
"loss": 5.2047,
|
|
"mean_token_accuracy": 0.19110651910305024,
|
|
"num_tokens": 15341756.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 5.369540071487426,
|
|
"epoch": 0.6431316042267051,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004966211870781671,
|
|
"loss": 5.2993,
|
|
"mean_token_accuracy": 0.19377565532922744,
|
|
"num_tokens": 15353217.0,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"entropy": 5.348928213119507,
|
|
"epoch": 0.643611911623439,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000496615265413754,
|
|
"loss": 5.2096,
|
|
"mean_token_accuracy": 0.1918511837720871,
|
|
"num_tokens": 15363813.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 5.329305982589721,
|
|
"epoch": 0.6440922190201729,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004966093386040923,
|
|
"loss": 5.2085,
|
|
"mean_token_accuracy": 0.19547198563814164,
|
|
"num_tokens": 15375223.0,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"entropy": 5.3240186214447025,
|
|
"epoch": 0.6445725264169068,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00049660340664932,
|
|
"loss": 5.2076,
|
|
"mean_token_accuracy": 0.19727633148431778,
|
|
"num_tokens": 15385523.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 5.337905216217041,
|
|
"epoch": 0.6450528338136408,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004965974695495746,
|
|
"loss": 5.2756,
|
|
"mean_token_accuracy": 0.1926635518670082,
|
|
"num_tokens": 15397262.0,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"entropy": 5.240037250518799,
|
|
"epoch": 0.6455331412103746,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004965915273049938,
|
|
"loss": 5.2106,
|
|
"mean_token_accuracy": 0.19746471792459488,
|
|
"num_tokens": 15409043.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 5.357408618927002,
|
|
"epoch": 0.6460134486071085,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004965855799157158,
|
|
"loss": 5.2057,
|
|
"mean_token_accuracy": 0.19802628457546234,
|
|
"num_tokens": 15420859.0,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"entropy": 5.3712615966796875,
|
|
"epoch": 0.6464937560038425,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004965796273818787,
|
|
"loss": 5.2864,
|
|
"mean_token_accuracy": 0.189888796210289,
|
|
"num_tokens": 15433650.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 5.233816766738892,
|
|
"epoch": 0.6469740634005764,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004965736697036206,
|
|
"loss": 5.1634,
|
|
"mean_token_accuracy": 0.2005162462592125,
|
|
"num_tokens": 15445158.0,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"entropy": 5.397826766967773,
|
|
"epoch": 0.6474543707973103,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004965677068810798,
|
|
"loss": 5.4198,
|
|
"mean_token_accuracy": 0.18722799867391587,
|
|
"num_tokens": 15456974.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 5.41676893234253,
|
|
"epoch": 0.6479346781940442,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004965617389143947,
|
|
"loss": 5.2898,
|
|
"mean_token_accuracy": 0.19216358810663223,
|
|
"num_tokens": 15467395.0,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"entropy": 5.299721145629883,
|
|
"epoch": 0.6484149855907781,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000496555765803704,
|
|
"loss": 5.2777,
|
|
"mean_token_accuracy": 0.19596335887908936,
|
|
"num_tokens": 15479396.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 5.318911552429199,
|
|
"epoch": 0.648895292987512,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004965497875491462,
|
|
"loss": 5.2232,
|
|
"mean_token_accuracy": 0.20304810851812363,
|
|
"num_tokens": 15490992.0,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"entropy": 5.438581037521362,
|
|
"epoch": 0.6493756003842459,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004965438041508604,
|
|
"loss": 5.3009,
|
|
"mean_token_accuracy": 0.18606224209070205,
|
|
"num_tokens": 15502413.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 5.337257432937622,
|
|
"epoch": 0.6498559077809798,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004965378156089851,
|
|
"loss": 5.2402,
|
|
"mean_token_accuracy": 0.19228145480155945,
|
|
"num_tokens": 15512588.0,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"entropy": 5.320986652374268,
|
|
"epoch": 0.6503362151777138,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004965318219236597,
|
|
"loss": 5.2151,
|
|
"mean_token_accuracy": 0.19258550703525543,
|
|
"num_tokens": 15523252.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 5.275221490859986,
|
|
"epoch": 0.6508165225744477,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004965258230950232,
|
|
"loss": 5.3192,
|
|
"mean_token_accuracy": 0.19502570629119872,
|
|
"num_tokens": 15534975.0,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"entropy": 5.3065108299255375,
|
|
"epoch": 0.6512968299711815,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004965198191232148,
|
|
"loss": 5.1863,
|
|
"mean_token_accuracy": 0.2026347428560257,
|
|
"num_tokens": 15545709.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 5.357606697082519,
|
|
"epoch": 0.6517771373679154,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000496513810008374,
|
|
"loss": 5.2888,
|
|
"mean_token_accuracy": 0.2010987401008606,
|
|
"num_tokens": 15557270.0,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"entropy": 5.383016109466553,
|
|
"epoch": 0.6522574447646494,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004965077957506403,
|
|
"loss": 5.2384,
|
|
"mean_token_accuracy": 0.1945773482322693,
|
|
"num_tokens": 15569156.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 5.228867387771606,
|
|
"epoch": 0.6527377521613833,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004965017763501533,
|
|
"loss": 5.1821,
|
|
"mean_token_accuracy": 0.20229731500148773,
|
|
"num_tokens": 15579270.0,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"entropy": 5.204952526092529,
|
|
"epoch": 0.6532180595581172,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004964957518070529,
|
|
"loss": 5.1436,
|
|
"mean_token_accuracy": 0.2022804006934166,
|
|
"num_tokens": 15589912.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 5.351508712768554,
|
|
"epoch": 0.6536983669548511,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004964897221214788,
|
|
"loss": 5.2636,
|
|
"mean_token_accuracy": 0.1962052032351494,
|
|
"num_tokens": 15601088.0,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"entropy": 5.400100564956665,
|
|
"epoch": 0.654178674351585,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000496483687293571,
|
|
"loss": 5.2467,
|
|
"mean_token_accuracy": 0.19797869324684142,
|
|
"num_tokens": 15612520.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 5.225302648544312,
|
|
"epoch": 0.6546589817483189,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004964776473234696,
|
|
"loss": 5.1854,
|
|
"mean_token_accuracy": 0.20556757897138594,
|
|
"num_tokens": 15623991.0,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"entropy": 5.196185064315796,
|
|
"epoch": 0.6551392891450528,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000496471602211315,
|
|
"loss": 5.11,
|
|
"mean_token_accuracy": 0.2007066160440445,
|
|
"num_tokens": 15635389.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 5.309195470809937,
|
|
"epoch": 0.6556195965417867,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004964655519572475,
|
|
"loss": 5.2207,
|
|
"mean_token_accuracy": 0.20262846797704698,
|
|
"num_tokens": 15646427.0,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"entropy": 5.216340732574463,
|
|
"epoch": 0.6560999039385207,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004964594965614072,
|
|
"loss": 5.2179,
|
|
"mean_token_accuracy": 0.192514306306839,
|
|
"num_tokens": 15657518.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 5.357060527801513,
|
|
"epoch": 0.6565802113352546,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004964534360239353,
|
|
"loss": 5.2477,
|
|
"mean_token_accuracy": 0.19148818552494049,
|
|
"num_tokens": 15669775.0,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"entropy": 5.3501969337463375,
|
|
"epoch": 0.6570605187319885,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000496447370344972,
|
|
"loss": 5.236,
|
|
"mean_token_accuracy": 0.18657899051904678,
|
|
"num_tokens": 15682566.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 5.258563375473022,
|
|
"epoch": 0.6575408261287223,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004964412995246584,
|
|
"loss": 5.2279,
|
|
"mean_token_accuracy": 0.19911282062530516,
|
|
"num_tokens": 15693168.0,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"entropy": 5.332230424880981,
|
|
"epoch": 0.6580211335254563,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004964352235631354,
|
|
"loss": 5.2429,
|
|
"mean_token_accuracy": 0.19526638984680175,
|
|
"num_tokens": 15703879.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 5.370284652709961,
|
|
"epoch": 0.6585014409221902,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000496429142460544,
|
|
"loss": 5.2742,
|
|
"mean_token_accuracy": 0.19143834859132766,
|
|
"num_tokens": 15716015.0,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"entropy": 5.361678266525269,
|
|
"epoch": 0.6589817483189241,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004964230562170254,
|
|
"loss": 5.2845,
|
|
"mean_token_accuracy": 0.19869090169668197,
|
|
"num_tokens": 15728254.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 5.305975437164307,
|
|
"epoch": 0.659462055715658,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004964169648327209,
|
|
"loss": 5.2485,
|
|
"mean_token_accuracy": 0.1971095770597458,
|
|
"num_tokens": 15738778.0,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"entropy": 5.41188497543335,
|
|
"epoch": 0.659942363112392,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000496410868307772,
|
|
"loss": 5.2555,
|
|
"mean_token_accuracy": 0.194293774664402,
|
|
"num_tokens": 15750305.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 5.304081630706787,
|
|
"epoch": 0.6604226705091258,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004964047666423203,
|
|
"loss": 5.2242,
|
|
"mean_token_accuracy": 0.19772389829158782,
|
|
"num_tokens": 15761303.0,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"entropy": 5.241645288467407,
|
|
"epoch": 0.6609029779058597,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004963986598365072,
|
|
"loss": 5.1887,
|
|
"mean_token_accuracy": 0.20886375904083251,
|
|
"num_tokens": 15773095.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 5.377842140197754,
|
|
"epoch": 0.6613832853025937,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004963925478904746,
|
|
"loss": 5.2658,
|
|
"mean_token_accuracy": 0.19911141097545623,
|
|
"num_tokens": 15784405.0,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"entropy": 5.203935384750366,
|
|
"epoch": 0.6618635926993276,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004963864308043645,
|
|
"loss": 5.198,
|
|
"mean_token_accuracy": 0.19471461623907088,
|
|
"num_tokens": 15795178.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 5.240695381164551,
|
|
"epoch": 0.6623439000960615,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004963803085783189,
|
|
"loss": 5.1989,
|
|
"mean_token_accuracy": 0.20192865282297134,
|
|
"num_tokens": 15806205.0,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"entropy": 5.311961603164673,
|
|
"epoch": 0.6628242074927954,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004963741812124799,
|
|
"loss": 5.1274,
|
|
"mean_token_accuracy": 0.20400931388139726,
|
|
"num_tokens": 15817474.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 5.277816581726074,
|
|
"epoch": 0.6633045148895294,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004963680487069898,
|
|
"loss": 5.2417,
|
|
"mean_token_accuracy": 0.19115398675203324,
|
|
"num_tokens": 15829728.0,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"entropy": 5.356722640991211,
|
|
"epoch": 0.6637848222862632,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004963619110619908,
|
|
"loss": 5.3267,
|
|
"mean_token_accuracy": 0.19444778561592102,
|
|
"num_tokens": 15840082.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 5.332076549530029,
|
|
"epoch": 0.6642651296829971,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004963557682776256,
|
|
"loss": 5.2147,
|
|
"mean_token_accuracy": 0.19637496322393416,
|
|
"num_tokens": 15851450.0,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"entropy": 5.394643926620484,
|
|
"epoch": 0.664745437079731,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004963496203540368,
|
|
"loss": 5.3604,
|
|
"mean_token_accuracy": 0.18928576707839967,
|
|
"num_tokens": 15864168.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 5.382654428482056,
|
|
"epoch": 0.665225744476465,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004963434672913671,
|
|
"loss": 5.3005,
|
|
"mean_token_accuracy": 0.1974198803305626,
|
|
"num_tokens": 15875634.0,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"entropy": 5.406427907943725,
|
|
"epoch": 0.6657060518731989,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004963373090897592,
|
|
"loss": 5.304,
|
|
"mean_token_accuracy": 0.19297229051589965,
|
|
"num_tokens": 15888411.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 5.341877174377442,
|
|
"epoch": 0.6661863592699327,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004963311457493563,
|
|
"loss": 5.2933,
|
|
"mean_token_accuracy": 0.19015721529722213,
|
|
"num_tokens": 15901084.0,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"entropy": 5.429636812210083,
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004963249772703015,
|
|
"loss": 5.3105,
|
|
"mean_token_accuracy": 0.1917794793844223,
|
|
"num_tokens": 15912061.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 5.352426385879516,
|
|
"epoch": 0.6671469740634006,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004963188036527378,
|
|
"loss": 5.3612,
|
|
"mean_token_accuracy": 0.1935911074280739,
|
|
"num_tokens": 15925439.0,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"entropy": 5.256782197952271,
|
|
"epoch": 0.6676272814601345,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004963126248968087,
|
|
"loss": 5.0929,
|
|
"mean_token_accuracy": 0.2068867191672325,
|
|
"num_tokens": 15937762.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 5.249649286270142,
|
|
"epoch": 0.6681075888568684,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004963064410026577,
|
|
"loss": 5.1521,
|
|
"mean_token_accuracy": 0.20011164397001266,
|
|
"num_tokens": 15948656.0,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"entropy": 5.343927001953125,
|
|
"epoch": 0.6685878962536023,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004963002519704281,
|
|
"loss": 5.2221,
|
|
"mean_token_accuracy": 0.19271822571754454,
|
|
"num_tokens": 15960376.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 5.325286912918091,
|
|
"epoch": 0.6690682036503363,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000496294057800264,
|
|
"loss": 5.2315,
|
|
"mean_token_accuracy": 0.19581420868635177,
|
|
"num_tokens": 15971913.0,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"entropy": 5.3356259822845455,
|
|
"epoch": 0.6695485110470701,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004962878584923089,
|
|
"loss": 5.2895,
|
|
"mean_token_accuracy": 0.18817632645368576,
|
|
"num_tokens": 15984775.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 5.344109296798706,
|
|
"epoch": 0.670028818443804,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004962816540467068,
|
|
"loss": 5.2166,
|
|
"mean_token_accuracy": 0.19363451302051543,
|
|
"num_tokens": 15996717.0,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"entropy": 5.3354270458221436,
|
|
"epoch": 0.6705091258405379,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004962754444636017,
|
|
"loss": 5.2025,
|
|
"mean_token_accuracy": 0.20301510095596315,
|
|
"num_tokens": 16007964.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 5.232488775253296,
|
|
"epoch": 0.6709894332372719,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000496269229743138,
|
|
"loss": 5.2268,
|
|
"mean_token_accuracy": 0.19161542654037475,
|
|
"num_tokens": 16019185.0,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"entropy": 5.335939073562622,
|
|
"epoch": 0.6714697406340058,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004962630098854597,
|
|
"loss": 5.2341,
|
|
"mean_token_accuracy": 0.2045590490102768,
|
|
"num_tokens": 16029983.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 5.36190619468689,
|
|
"epoch": 0.6719500480307397,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004962567848907113,
|
|
"loss": 5.1731,
|
|
"mean_token_accuracy": 0.1986493170261383,
|
|
"num_tokens": 16040574.0,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"entropy": 5.364934206008911,
|
|
"epoch": 0.6724303554274735,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004962505547590374,
|
|
"loss": 5.3243,
|
|
"mean_token_accuracy": 0.19473587423563005,
|
|
"num_tokens": 16052037.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 5.329007911682129,
|
|
"epoch": 0.6729106628242075,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004962443194905826,
|
|
"loss": 5.2961,
|
|
"mean_token_accuracy": 0.19470866173505783,
|
|
"num_tokens": 16063413.0,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"entropy": 5.34464545249939,
|
|
"epoch": 0.6733909702209414,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004962380790854916,
|
|
"loss": 5.186,
|
|
"mean_token_accuracy": 0.1919792741537094,
|
|
"num_tokens": 16074373.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 5.395242977142334,
|
|
"epoch": 0.6738712776176753,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004962318335439094,
|
|
"loss": 5.3215,
|
|
"mean_token_accuracy": 0.1902454525232315,
|
|
"num_tokens": 16086575.0,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"entropy": 5.374180316925049,
|
|
"epoch": 0.6743515850144092,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004962255828659809,
|
|
"loss": 5.1814,
|
|
"mean_token_accuracy": 0.19680528789758683,
|
|
"num_tokens": 16098529.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 5.237930870056152,
|
|
"epoch": 0.6748318924111432,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004962193270518513,
|
|
"loss": 5.2299,
|
|
"mean_token_accuracy": 0.20323789566755296,
|
|
"num_tokens": 16110085.0,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"entropy": 5.248058891296386,
|
|
"epoch": 0.675312199807877,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004962130661016659,
|
|
"loss": 5.1142,
|
|
"mean_token_accuracy": 0.20192392021417618,
|
|
"num_tokens": 16120249.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 5.408255577087402,
|
|
"epoch": 0.6757925072046109,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004962068000155699,
|
|
"loss": 5.4028,
|
|
"mean_token_accuracy": 0.18510753959417342,
|
|
"num_tokens": 16132645.0,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"entropy": 5.360714340209961,
|
|
"epoch": 0.6762728146013448,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004962005287937088,
|
|
"loss": 5.2683,
|
|
"mean_token_accuracy": 0.19801645576953888,
|
|
"num_tokens": 16143808.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 5.2652663230896,
|
|
"epoch": 0.6767531219980788,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004961942524362283,
|
|
"loss": 5.2683,
|
|
"mean_token_accuracy": 0.199874410033226,
|
|
"num_tokens": 16154309.0,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"entropy": 5.331776762008667,
|
|
"epoch": 0.6772334293948127,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004961879709432741,
|
|
"loss": 5.2157,
|
|
"mean_token_accuracy": 0.19288192838430404,
|
|
"num_tokens": 16164654.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 5.368539190292358,
|
|
"epoch": 0.6777137367915466,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000496181684314992,
|
|
"loss": 5.2722,
|
|
"mean_token_accuracy": 0.1937314122915268,
|
|
"num_tokens": 16177837.0,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"entropy": 5.267057132720947,
|
|
"epoch": 0.6781940441882806,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004961753925515279,
|
|
"loss": 5.1551,
|
|
"mean_token_accuracy": 0.20363912582397461,
|
|
"num_tokens": 16189073.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 5.379831266403198,
|
|
"epoch": 0.6786743515850144,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000496169095653028,
|
|
"loss": 5.3474,
|
|
"mean_token_accuracy": 0.19800200462341308,
|
|
"num_tokens": 16200371.0,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"entropy": 5.303866577148438,
|
|
"epoch": 0.6791546589817483,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004961627936196384,
|
|
"loss": 5.1378,
|
|
"mean_token_accuracy": 0.19792077392339708,
|
|
"num_tokens": 16210526.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 5.324541854858398,
|
|
"epoch": 0.6796349663784822,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004961564864515055,
|
|
"loss": 5.2485,
|
|
"mean_token_accuracy": 0.19714123010635376,
|
|
"num_tokens": 16221687.0,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"entropy": 5.198392057418824,
|
|
"epoch": 0.6801152737752162,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004961501741487757,
|
|
"loss": 5.1228,
|
|
"mean_token_accuracy": 0.207232241332531,
|
|
"num_tokens": 16233828.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 5.265508413314819,
|
|
"epoch": 0.6805955811719501,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004961438567115955,
|
|
"loss": 5.1098,
|
|
"mean_token_accuracy": 0.20777646452188492,
|
|
"num_tokens": 16243900.0,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"entropy": 5.29700231552124,
|
|
"epoch": 0.681075888568684,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004961375341401116,
|
|
"loss": 5.2056,
|
|
"mean_token_accuracy": 0.19741868525743483,
|
|
"num_tokens": 16256347.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 5.292010688781739,
|
|
"epoch": 0.6815561959654178,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004961312064344708,
|
|
"loss": 5.1188,
|
|
"mean_token_accuracy": 0.20195768475532533,
|
|
"num_tokens": 16267743.0,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"entropy": 5.1783538341522215,
|
|
"epoch": 0.6820365033621518,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00049612487359482,
|
|
"loss": 5.1529,
|
|
"mean_token_accuracy": 0.20347070544958115,
|
|
"num_tokens": 16278280.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 5.204392337799073,
|
|
"epoch": 0.6825168107588857,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004961185356213062,
|
|
"loss": 5.1439,
|
|
"mean_token_accuracy": 0.20720461010932922,
|
|
"num_tokens": 16288568.0,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"entropy": 5.255360317230225,
|
|
"epoch": 0.6829971181556196,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004961121925140767,
|
|
"loss": 5.2192,
|
|
"mean_token_accuracy": 0.20132138431072236,
|
|
"num_tokens": 16300730.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 5.277251672744751,
|
|
"epoch": 0.6834774255523535,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004961058442732786,
|
|
"loss": 5.2216,
|
|
"mean_token_accuracy": 0.19964685887098313,
|
|
"num_tokens": 16311789.0,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"entropy": 5.32997989654541,
|
|
"epoch": 0.6839577329490875,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004960994908990594,
|
|
"loss": 5.1993,
|
|
"mean_token_accuracy": 0.1934235706925392,
|
|
"num_tokens": 16324439.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 5.3754744052886965,
|
|
"epoch": 0.6844380403458213,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004960931323915665,
|
|
"loss": 5.3411,
|
|
"mean_token_accuracy": 0.20248763710260392,
|
|
"num_tokens": 16335344.0,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"entropy": 5.296399450302124,
|
|
"epoch": 0.6849183477425552,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004960867687509475,
|
|
"loss": 5.1806,
|
|
"mean_token_accuracy": 0.2043926537036896,
|
|
"num_tokens": 16349018.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 5.313053369522095,
|
|
"epoch": 0.6853986551392891,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004960803999773504,
|
|
"loss": 5.2619,
|
|
"mean_token_accuracy": 0.1975033849477768,
|
|
"num_tokens": 16360137.0,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"entropy": 5.3848051071167,
|
|
"epoch": 0.6858789625360231,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004960740260709228,
|
|
"loss": 5.2692,
|
|
"mean_token_accuracy": 0.19346715658903121,
|
|
"num_tokens": 16372277.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 5.36100664138794,
|
|
"epoch": 0.686359269932757,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004960676470318128,
|
|
"loss": 5.2727,
|
|
"mean_token_accuracy": 0.19226655662059783,
|
|
"num_tokens": 16383440.0,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"entropy": 5.314020681381225,
|
|
"epoch": 0.6868395773294909,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004960612628601683,
|
|
"loss": 5.228,
|
|
"mean_token_accuracy": 0.19899825602769852,
|
|
"num_tokens": 16394330.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 5.215233564376831,
|
|
"epoch": 0.6873198847262247,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004960548735561379,
|
|
"loss": 5.15,
|
|
"mean_token_accuracy": 0.20503795742988587,
|
|
"num_tokens": 16405734.0,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"entropy": 5.318601846694946,
|
|
"epoch": 0.6878001921229587,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004960484791198697,
|
|
"loss": 5.1957,
|
|
"mean_token_accuracy": 0.1932004436850548,
|
|
"num_tokens": 16416025.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 5.244638299942016,
|
|
"epoch": 0.6882804995196926,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004960420795515121,
|
|
"loss": 5.1858,
|
|
"mean_token_accuracy": 0.19369462579488755,
|
|
"num_tokens": 16427416.0,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"entropy": 5.237499618530274,
|
|
"epoch": 0.6887608069164265,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004960356748512138,
|
|
"loss": 5.185,
|
|
"mean_token_accuracy": 0.20407173335552214,
|
|
"num_tokens": 16438073.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 5.274411201477051,
|
|
"epoch": 0.6892411143131604,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004960292650191236,
|
|
"loss": 5.1486,
|
|
"mean_token_accuracy": 0.1983790621161461,
|
|
"num_tokens": 16449994.0,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"entropy": 5.2747314929962155,
|
|
"epoch": 0.6897214217098944,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004960228500553899,
|
|
"loss": 5.1435,
|
|
"mean_token_accuracy": 0.21412984877824784,
|
|
"num_tokens": 16460355.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 5.401017570495606,
|
|
"epoch": 0.6902017291066282,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004960164299601623,
|
|
"loss": 5.2192,
|
|
"mean_token_accuracy": 0.19781634211540222,
|
|
"num_tokens": 16472274.0,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"entropy": 5.366089820861816,
|
|
"epoch": 0.6906820365033621,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004960100047335892,
|
|
"loss": 5.2644,
|
|
"mean_token_accuracy": 0.19702471643686295,
|
|
"num_tokens": 16482959.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 5.314202547073364,
|
|
"epoch": 0.691162343900096,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004960035743758202,
|
|
"loss": 5.2786,
|
|
"mean_token_accuracy": 0.1941748395562172,
|
|
"num_tokens": 16494533.0,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"entropy": 5.2420876026153564,
|
|
"epoch": 0.69164265129683,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004959971388870044,
|
|
"loss": 5.1539,
|
|
"mean_token_accuracy": 0.20189982801675796,
|
|
"num_tokens": 16506124.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 5.253983736038208,
|
|
"epoch": 0.6921229586935639,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004959906982672912,
|
|
"loss": 5.1791,
|
|
"mean_token_accuracy": 0.19119898676872255,
|
|
"num_tokens": 16517867.0,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"entropy": 5.217931318283081,
|
|
"epoch": 0.6926032660902978,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004959842525168302,
|
|
"loss": 5.11,
|
|
"mean_token_accuracy": 0.20074271708726882,
|
|
"num_tokens": 16529075.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 5.294466924667359,
|
|
"epoch": 0.6930835734870316,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004959778016357712,
|
|
"loss": 5.1452,
|
|
"mean_token_accuracy": 0.20483478754758835,
|
|
"num_tokens": 16540326.0,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"entropy": 5.280926847457886,
|
|
"epoch": 0.6935638808837656,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004959713456242637,
|
|
"loss": 5.2002,
|
|
"mean_token_accuracy": 0.1991882160305977,
|
|
"num_tokens": 16551570.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 5.373426103591919,
|
|
"epoch": 0.6940441882804995,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004959648844824576,
|
|
"loss": 5.2523,
|
|
"mean_token_accuracy": 0.19577774852514268,
|
|
"num_tokens": 16562636.0,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"entropy": 5.285389518737793,
|
|
"epoch": 0.6945244956772334,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004959584182105032,
|
|
"loss": 5.1307,
|
|
"mean_token_accuracy": 0.2037241354584694,
|
|
"num_tokens": 16573867.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 5.235301113128662,
|
|
"epoch": 0.6950048030739674,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004959519468085504,
|
|
"loss": 5.1263,
|
|
"mean_token_accuracy": 0.20582810789346695,
|
|
"num_tokens": 16584533.0,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"entropy": 5.211501741409302,
|
|
"epoch": 0.6954851104707013,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004959454702767494,
|
|
"loss": 5.3437,
|
|
"mean_token_accuracy": 0.20351918488740922,
|
|
"num_tokens": 16596562.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 5.31222095489502,
|
|
"epoch": 0.6959654178674352,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004959389886152507,
|
|
"loss": 5.1793,
|
|
"mean_token_accuracy": 0.21050842702388764,
|
|
"num_tokens": 16607508.0,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"entropy": 5.342617702484131,
|
|
"epoch": 0.696445725264169,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004959325018242048,
|
|
"loss": 5.2606,
|
|
"mean_token_accuracy": 0.19728082418441772,
|
|
"num_tokens": 16617737.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 5.327224397659302,
|
|
"epoch": 0.696926032660903,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004959260099037622,
|
|
"loss": 5.2593,
|
|
"mean_token_accuracy": 0.19428612291812897,
|
|
"num_tokens": 16628518.0,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"entropy": 5.318410634994507,
|
|
"epoch": 0.6974063400576369,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004959195128540737,
|
|
"loss": 5.2864,
|
|
"mean_token_accuracy": 0.18943870663642884,
|
|
"num_tokens": 16639644.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 5.433784484863281,
|
|
"epoch": 0.6978866474543708,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00049591301067529,
|
|
"loss": 5.3405,
|
|
"mean_token_accuracy": 0.18801091760396957,
|
|
"num_tokens": 16650882.0,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"entropy": 5.336638355255127,
|
|
"epoch": 0.6983669548511047,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004959065033675623,
|
|
"loss": 5.2265,
|
|
"mean_token_accuracy": 0.19460077136754989,
|
|
"num_tokens": 16662464.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 5.355379629135132,
|
|
"epoch": 0.6988472622478387,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004958999909310414,
|
|
"loss": 5.2623,
|
|
"mean_token_accuracy": 0.19179683029651642,
|
|
"num_tokens": 16673718.0,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"entropy": 5.290545892715454,
|
|
"epoch": 0.6993275696445725,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004958934733658788,
|
|
"loss": 5.1873,
|
|
"mean_token_accuracy": 0.19871881008148193,
|
|
"num_tokens": 16684957.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 5.306415462493897,
|
|
"epoch": 0.6998078770413064,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004958869506722256,
|
|
"loss": 5.2294,
|
|
"mean_token_accuracy": 0.19656572341918946,
|
|
"num_tokens": 16695782.0,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"entropy": 5.280803632736206,
|
|
"epoch": 0.7002881844380403,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004958804228502332,
|
|
"loss": 5.1363,
|
|
"mean_token_accuracy": 0.20514173954725265,
|
|
"num_tokens": 16707448.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 5.23841404914856,
|
|
"epoch": 0.7007684918347743,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004958738899000534,
|
|
"loss": 5.1653,
|
|
"mean_token_accuracy": 0.19584250301122666,
|
|
"num_tokens": 16718074.0,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"entropy": 5.29450945854187,
|
|
"epoch": 0.7012487992315082,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004958673518218377,
|
|
"loss": 5.2046,
|
|
"mean_token_accuracy": 0.20462729632854462,
|
|
"num_tokens": 16728656.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 5.307183456420899,
|
|
"epoch": 0.7017291066282421,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004958608086157379,
|
|
"loss": 5.2517,
|
|
"mean_token_accuracy": 0.19895842373371125,
|
|
"num_tokens": 16740703.0,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"entropy": 5.321242046356201,
|
|
"epoch": 0.7022094140249759,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000495854260281906,
|
|
"loss": 5.1298,
|
|
"mean_token_accuracy": 0.2080679327249527,
|
|
"num_tokens": 16753320.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 5.1749231815338135,
|
|
"epoch": 0.7026897214217099,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004958477068204941,
|
|
"loss": 5.2007,
|
|
"mean_token_accuracy": 0.20202370434999467,
|
|
"num_tokens": 16764889.0,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"entropy": 5.385407257080078,
|
|
"epoch": 0.7031700288184438,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000495841148231654,
|
|
"loss": 5.2381,
|
|
"mean_token_accuracy": 0.2000655323266983,
|
|
"num_tokens": 16776301.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 5.351953983306885,
|
|
"epoch": 0.7036503362151777,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004958345845155383,
|
|
"loss": 5.2239,
|
|
"mean_token_accuracy": 0.20341352075338365,
|
|
"num_tokens": 16786935.0,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"entropy": 5.241073179244995,
|
|
"epoch": 0.7041306436119116,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004958280156722992,
|
|
"loss": 5.2238,
|
|
"mean_token_accuracy": 0.1962451696395874,
|
|
"num_tokens": 16799335.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 5.23271107673645,
|
|
"epoch": 0.7046109510086456,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004958214417020894,
|
|
"loss": 5.1471,
|
|
"mean_token_accuracy": 0.20265070348978043,
|
|
"num_tokens": 16811376.0,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"entropy": 5.326413011550903,
|
|
"epoch": 0.7050912584053795,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004958148626050614,
|
|
"loss": 5.2133,
|
|
"mean_token_accuracy": 0.19387653321027756,
|
|
"num_tokens": 16822859.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 5.298766660690307,
|
|
"epoch": 0.7055715658021133,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000495808278381368,
|
|
"loss": 5.2019,
|
|
"mean_token_accuracy": 0.1966390699148178,
|
|
"num_tokens": 16833831.0,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"entropy": 5.3197509288787845,
|
|
"epoch": 0.7060518731988472,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000495801689031162,
|
|
"loss": 5.234,
|
|
"mean_token_accuracy": 0.2002715587615967,
|
|
"num_tokens": 16846827.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 5.338577556610107,
|
|
"epoch": 0.7065321805955812,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004957950945545965,
|
|
"loss": 5.2022,
|
|
"mean_token_accuracy": 0.20344888269901276,
|
|
"num_tokens": 16858166.0,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"entropy": 5.377301120758057,
|
|
"epoch": 0.7070124879923151,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004957884949518246,
|
|
"loss": 5.3351,
|
|
"mean_token_accuracy": 0.1826968491077423,
|
|
"num_tokens": 16870201.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 5.329647397994995,
|
|
"epoch": 0.707492795389049,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004957818902229992,
|
|
"loss": 5.2073,
|
|
"mean_token_accuracy": 0.19947872757911683,
|
|
"num_tokens": 16880891.0,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"entropy": 5.316723299026489,
|
|
"epoch": 0.7079731027857828,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004957752803682741,
|
|
"loss": 5.2801,
|
|
"mean_token_accuracy": 0.19825806319713593,
|
|
"num_tokens": 16893498.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 5.346575832366943,
|
|
"epoch": 0.7084534101825168,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004957686653878024,
|
|
"loss": 5.2538,
|
|
"mean_token_accuracy": 0.1996032789349556,
|
|
"num_tokens": 16904959.0,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"entropy": 5.391040182113647,
|
|
"epoch": 0.7089337175792507,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000495762045281738,
|
|
"loss": 5.2735,
|
|
"mean_token_accuracy": 0.19459239691495894,
|
|
"num_tokens": 16916547.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 5.243378686904907,
|
|
"epoch": 0.7094140249759846,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004957554200502344,
|
|
"loss": 5.2145,
|
|
"mean_token_accuracy": 0.1971280872821808,
|
|
"num_tokens": 16928580.0,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"entropy": 5.325099229812622,
|
|
"epoch": 0.7098943323727186,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004957487896934454,
|
|
"loss": 5.232,
|
|
"mean_token_accuracy": 0.19476460963487624,
|
|
"num_tokens": 16941247.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 5.422113227844238,
|
|
"epoch": 0.7103746397694525,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000495742154211525,
|
|
"loss": 5.2508,
|
|
"mean_token_accuracy": 0.19776693880558013,
|
|
"num_tokens": 16951119.0,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"entropy": 5.253897190093994,
|
|
"epoch": 0.7108549471661864,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004957355136046272,
|
|
"loss": 5.1788,
|
|
"mean_token_accuracy": 0.20065926164388656,
|
|
"num_tokens": 16962608.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 5.298086404800415,
|
|
"epoch": 0.7113352545629202,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004957288678729064,
|
|
"loss": 5.2149,
|
|
"mean_token_accuracy": 0.19710940271615982,
|
|
"num_tokens": 16973291.0,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"entropy": 5.391532135009766,
|
|
"epoch": 0.7118155619596542,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004957222170165166,
|
|
"loss": 5.2282,
|
|
"mean_token_accuracy": 0.1964917078614235,
|
|
"num_tokens": 16984895.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 5.302619600296021,
|
|
"epoch": 0.7122958693563881,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004957155610356124,
|
|
"loss": 5.2556,
|
|
"mean_token_accuracy": 0.19016777127981185,
|
|
"num_tokens": 16997109.0,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"entropy": 5.284333562850952,
|
|
"epoch": 0.712776176753122,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004957088999303481,
|
|
"loss": 5.1909,
|
|
"mean_token_accuracy": 0.19305464029312133,
|
|
"num_tokens": 17008640.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 5.3054125785827635,
|
|
"epoch": 0.7132564841498559,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004957022337008787,
|
|
"loss": 5.1643,
|
|
"mean_token_accuracy": 0.2041031762957573,
|
|
"num_tokens": 17018924.0,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"entropy": 5.2749724864959715,
|
|
"epoch": 0.7137367915465899,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004956955623473587,
|
|
"loss": 5.2059,
|
|
"mean_token_accuracy": 0.19430594891309738,
|
|
"num_tokens": 17029932.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 5.317395496368408,
|
|
"epoch": 0.7142170989433237,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000495688885869943,
|
|
"loss": 5.2942,
|
|
"mean_token_accuracy": 0.19469617754220964,
|
|
"num_tokens": 17042164.0,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"entropy": 5.349671697616577,
|
|
"epoch": 0.7146974063400576,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004956822042687868,
|
|
"loss": 5.2677,
|
|
"mean_token_accuracy": 0.1921522706747055,
|
|
"num_tokens": 17054729.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 5.250608634948731,
|
|
"epoch": 0.7151777137367915,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004956755175440451,
|
|
"loss": 5.1269,
|
|
"mean_token_accuracy": 0.21212284564971923,
|
|
"num_tokens": 17066537.0,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"entropy": 5.282503080368042,
|
|
"epoch": 0.7156580211335255,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004956688256958731,
|
|
"loss": 5.1974,
|
|
"mean_token_accuracy": 0.200366573035717,
|
|
"num_tokens": 17077787.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 5.301390218734741,
|
|
"epoch": 0.7161383285302594,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004956621287244262,
|
|
"loss": 5.2218,
|
|
"mean_token_accuracy": 0.1973409503698349,
|
|
"num_tokens": 17089555.0,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"entropy": 5.344415140151978,
|
|
"epoch": 0.7166186359269933,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004956554266298599,
|
|
"loss": 5.1796,
|
|
"mean_token_accuracy": 0.2040402039885521,
|
|
"num_tokens": 17099911.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 5.202849531173706,
|
|
"epoch": 0.7170989433237271,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004956487194123298,
|
|
"loss": 5.1746,
|
|
"mean_token_accuracy": 0.2012821167707443,
|
|
"num_tokens": 17110880.0,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"entropy": 5.285581064224243,
|
|
"epoch": 0.7175792507204611,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004956420070719918,
|
|
"loss": 5.1395,
|
|
"mean_token_accuracy": 0.20026769638061523,
|
|
"num_tokens": 17122272.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 5.362657308578491,
|
|
"epoch": 0.718059558117195,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004956352896090014,
|
|
"loss": 5.274,
|
|
"mean_token_accuracy": 0.19526351988315582,
|
|
"num_tokens": 17133995.0,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"entropy": 5.318215322494507,
|
|
"epoch": 0.7185398655139289,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004956285670235147,
|
|
"loss": 5.1494,
|
|
"mean_token_accuracy": 0.20162675380706788,
|
|
"num_tokens": 17145970.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 5.310476064682007,
|
|
"epoch": 0.7190201729106628,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004956218393156879,
|
|
"loss": 5.3381,
|
|
"mean_token_accuracy": 0.1912968397140503,
|
|
"num_tokens": 17157747.0,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"entropy": 5.311709976196289,
|
|
"epoch": 0.7195004803073968,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004956151064856772,
|
|
"loss": 5.2701,
|
|
"mean_token_accuracy": 0.20224885493516923,
|
|
"num_tokens": 17168357.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 5.349070930480957,
|
|
"epoch": 0.7199807877041307,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004956083685336386,
|
|
"loss": 5.2336,
|
|
"mean_token_accuracy": 0.20705265551805496,
|
|
"num_tokens": 17179871.0,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"entropy": 5.332374906539917,
|
|
"epoch": 0.7204610951008645,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004956016254597289,
|
|
"loss": 5.1764,
|
|
"mean_token_accuracy": 0.20444130003452302,
|
|
"num_tokens": 17190456.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 5.290010213851929,
|
|
"epoch": 0.7209414024975984,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004955948772641044,
|
|
"loss": 5.2627,
|
|
"mean_token_accuracy": 0.19260406792163848,
|
|
"num_tokens": 17201623.0,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"entropy": 5.2717503070831295,
|
|
"epoch": 0.7214217098943324,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000495588123946922,
|
|
"loss": 5.1584,
|
|
"mean_token_accuracy": 0.2030091643333435,
|
|
"num_tokens": 17212891.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 5.332910203933716,
|
|
"epoch": 0.7219020172910663,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004955813655083384,
|
|
"loss": 5.2124,
|
|
"mean_token_accuracy": 0.19683704823255538,
|
|
"num_tokens": 17223983.0,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"entropy": 5.344393396377564,
|
|
"epoch": 0.7223823246878002,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004955746019485104,
|
|
"loss": 5.2591,
|
|
"mean_token_accuracy": 0.20013708472251893,
|
|
"num_tokens": 17233701.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 5.320908880233764,
|
|
"epoch": 0.722862632084534,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000495567833267595,
|
|
"loss": 5.2505,
|
|
"mean_token_accuracy": 0.19587977081537247,
|
|
"num_tokens": 17245958.0,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"entropy": 5.409331274032593,
|
|
"epoch": 0.723342939481268,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004955610594657496,
|
|
"loss": 5.2579,
|
|
"mean_token_accuracy": 0.18650022745132447,
|
|
"num_tokens": 17257687.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 5.327173328399658,
|
|
"epoch": 0.7238232468780019,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004955542805431313,
|
|
"loss": 5.287,
|
|
"mean_token_accuracy": 0.19946179389953614,
|
|
"num_tokens": 17270247.0,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"entropy": 5.272875452041626,
|
|
"epoch": 0.7243035542747358,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004955474964998976,
|
|
"loss": 5.2117,
|
|
"mean_token_accuracy": 0.20549824982881545,
|
|
"num_tokens": 17281920.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 5.4193642139434814,
|
|
"epoch": 0.7247838616714697,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004955407073362058,
|
|
"loss": 5.3923,
|
|
"mean_token_accuracy": 0.1859695628285408,
|
|
"num_tokens": 17293602.0,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"entropy": 5.421948957443237,
|
|
"epoch": 0.7252641690682037,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004955339130522136,
|
|
"loss": 5.1722,
|
|
"mean_token_accuracy": 0.19999373257160186,
|
|
"num_tokens": 17304484.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 5.196074771881103,
|
|
"epoch": 0.7257444764649376,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000495527113648079,
|
|
"loss": 5.1102,
|
|
"mean_token_accuracy": 0.2052151992917061,
|
|
"num_tokens": 17314953.0,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"entropy": 5.263898038864136,
|
|
"epoch": 0.7262247838616714,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004955203091239596,
|
|
"loss": 5.2313,
|
|
"mean_token_accuracy": 0.19495706707239152,
|
|
"num_tokens": 17326813.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 5.3781215190887455,
|
|
"epoch": 0.7267050912584054,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004955134994800134,
|
|
"loss": 5.3161,
|
|
"mean_token_accuracy": 0.1874473437666893,
|
|
"num_tokens": 17337968.0,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"entropy": 5.309977293014526,
|
|
"epoch": 0.7271853986551393,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004955066847163984,
|
|
"loss": 5.2156,
|
|
"mean_token_accuracy": 0.1947050377726555,
|
|
"num_tokens": 17350406.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 5.217679643630982,
|
|
"epoch": 0.7276657060518732,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004954998648332731,
|
|
"loss": 5.1128,
|
|
"mean_token_accuracy": 0.20684792548418046,
|
|
"num_tokens": 17361888.0,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"entropy": 5.307045364379883,
|
|
"epoch": 0.7281460134486071,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004954930398307956,
|
|
"loss": 5.2392,
|
|
"mean_token_accuracy": 0.196373288333416,
|
|
"num_tokens": 17374047.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 5.35437798500061,
|
|
"epoch": 0.7286263208453411,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004954862097091245,
|
|
"loss": 5.2254,
|
|
"mean_token_accuracy": 0.18948904722929,
|
|
"num_tokens": 17386175.0,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"entropy": 5.318008661270142,
|
|
"epoch": 0.729106628242075,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004954793744684184,
|
|
"loss": 5.2641,
|
|
"mean_token_accuracy": 0.20655235201120375,
|
|
"num_tokens": 17398168.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 5.230118227005005,
|
|
"epoch": 0.7295869356388088,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004954725341088358,
|
|
"loss": 5.1178,
|
|
"mean_token_accuracy": 0.2040240153670311,
|
|
"num_tokens": 17408825.0,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"entropy": 5.2700879096984865,
|
|
"epoch": 0.7300672430355427,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004954656886305356,
|
|
"loss": 5.2158,
|
|
"mean_token_accuracy": 0.1991657391190529,
|
|
"num_tokens": 17419813.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 5.354073095321655,
|
|
"epoch": 0.7305475504322767,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004954588380336768,
|
|
"loss": 5.2669,
|
|
"mean_token_accuracy": 0.1913167342543602,
|
|
"num_tokens": 17431134.0,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"entropy": 5.251888942718506,
|
|
"epoch": 0.7310278578290106,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004954519823184184,
|
|
"loss": 5.1767,
|
|
"mean_token_accuracy": 0.200624917447567,
|
|
"num_tokens": 17442614.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 5.315543699264526,
|
|
"epoch": 0.7315081652257445,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004954451214849196,
|
|
"loss": 5.1714,
|
|
"mean_token_accuracy": 0.19947559833526612,
|
|
"num_tokens": 17454615.0,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"entropy": 5.376033115386963,
|
|
"epoch": 0.7319884726224783,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004954382555333397,
|
|
"loss": 5.2185,
|
|
"mean_token_accuracy": 0.1976114273071289,
|
|
"num_tokens": 17467025.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 5.226950979232788,
|
|
"epoch": 0.7324687800192123,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000495431384463838,
|
|
"loss": 5.1785,
|
|
"mean_token_accuracy": 0.20502081364393235,
|
|
"num_tokens": 17477681.0,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"entropy": 5.305552339553833,
|
|
"epoch": 0.7329490874159462,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004954245082765741,
|
|
"loss": 5.2359,
|
|
"mean_token_accuracy": 0.1983788013458252,
|
|
"num_tokens": 17489814.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 5.342312955856324,
|
|
"epoch": 0.7334293948126801,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004954176269717077,
|
|
"loss": 5.231,
|
|
"mean_token_accuracy": 0.19795534610748292,
|
|
"num_tokens": 17501701.0,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"entropy": 5.281050443649292,
|
|
"epoch": 0.733909702209414,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004954107405493984,
|
|
"loss": 5.1416,
|
|
"mean_token_accuracy": 0.203464774787426,
|
|
"num_tokens": 17513585.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 5.367125082015991,
|
|
"epoch": 0.734390009606148,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004954038490098064,
|
|
"loss": 5.2764,
|
|
"mean_token_accuracy": 0.20417422205209732,
|
|
"num_tokens": 17525557.0,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"entropy": 5.269024848937988,
|
|
"epoch": 0.7348703170028819,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004953969523530914,
|
|
"loss": 5.1457,
|
|
"mean_token_accuracy": 0.195808245241642,
|
|
"num_tokens": 17538312.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 5.354407835006714,
|
|
"epoch": 0.7353506243996157,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004953900505794136,
|
|
"loss": 5.2732,
|
|
"mean_token_accuracy": 0.1993303641676903,
|
|
"num_tokens": 17550248.0,
|
|
"step": 7655
|
|
},
|
|
{
|
|
"entropy": 5.282435178756714,
|
|
"epoch": 0.7358309317963496,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004953831436889332,
|
|
"loss": 5.1346,
|
|
"mean_token_accuracy": 0.19791701585054397,
|
|
"num_tokens": 17560624.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 5.303052854537964,
|
|
"epoch": 0.7363112391930836,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004953762316818106,
|
|
"loss": 5.1963,
|
|
"mean_token_accuracy": 0.19871004968881606,
|
|
"num_tokens": 17572439.0,
|
|
"step": 7665
|
|
},
|
|
{
|
|
"entropy": 5.351328325271607,
|
|
"epoch": 0.7367915465898175,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004953693145582064,
|
|
"loss": 5.2461,
|
|
"mean_token_accuracy": 0.18978616893291472,
|
|
"num_tokens": 17583120.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 5.191801738739014,
|
|
"epoch": 0.7372718539865514,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000495362392318281,
|
|
"loss": 5.0753,
|
|
"mean_token_accuracy": 0.2021123856306076,
|
|
"num_tokens": 17593677.0,
|
|
"step": 7675
|
|
},
|
|
{
|
|
"entropy": 5.268652153015137,
|
|
"epoch": 0.7377521613832853,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004953554649621951,
|
|
"loss": 5.2193,
|
|
"mean_token_accuracy": 0.19628288149833678,
|
|
"num_tokens": 17605180.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 5.3253484725952145,
|
|
"epoch": 0.7382324687800192,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004953485324901098,
|
|
"loss": 5.1844,
|
|
"mean_token_accuracy": 0.20035452097654344,
|
|
"num_tokens": 17617459.0,
|
|
"step": 7685
|
|
},
|
|
{
|
|
"entropy": 5.265099334716797,
|
|
"epoch": 0.7387127761767531,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004953415949021857,
|
|
"loss": 5.185,
|
|
"mean_token_accuracy": 0.19952280819416046,
|
|
"num_tokens": 17628024.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 5.240673971176148,
|
|
"epoch": 0.739193083573487,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004953346521985843,
|
|
"loss": 5.2044,
|
|
"mean_token_accuracy": 0.19829180389642714,
|
|
"num_tokens": 17639833.0,
|
|
"step": 7695
|
|
},
|
|
{
|
|
"entropy": 5.402639627456665,
|
|
"epoch": 0.7396733909702209,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004953277043794663,
|
|
"loss": 5.2946,
|
|
"mean_token_accuracy": 0.18773587048053741,
|
|
"num_tokens": 17651057.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 5.30786247253418,
|
|
"epoch": 0.7401536983669549,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004953207514449933,
|
|
"loss": 5.1709,
|
|
"mean_token_accuracy": 0.20614788979291915,
|
|
"num_tokens": 17662288.0,
|
|
"step": 7705
|
|
},
|
|
{
|
|
"entropy": 5.260968112945557,
|
|
"epoch": 0.7406340057636888,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004953137933953267,
|
|
"loss": 5.1842,
|
|
"mean_token_accuracy": 0.1984902873635292,
|
|
"num_tokens": 17673885.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 5.276099634170532,
|
|
"epoch": 0.7411143131604226,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000495306830230628,
|
|
"loss": 5.2038,
|
|
"mean_token_accuracy": 0.20187791883945466,
|
|
"num_tokens": 17685030.0,
|
|
"step": 7715
|
|
},
|
|
{
|
|
"entropy": 5.298851442337036,
|
|
"epoch": 0.7415946205571565,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004952998619510589,
|
|
"loss": 5.1554,
|
|
"mean_token_accuracy": 0.20665809214115144,
|
|
"num_tokens": 17696624.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 5.264416313171386,
|
|
"epoch": 0.7420749279538905,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004952928885567811,
|
|
"loss": 5.1403,
|
|
"mean_token_accuracy": 0.20183341205120087,
|
|
"num_tokens": 17707386.0,
|
|
"step": 7725
|
|
},
|
|
{
|
|
"entropy": 5.397011804580688,
|
|
"epoch": 0.7425552353506244,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004952859100479566,
|
|
"loss": 5.3804,
|
|
"mean_token_accuracy": 0.18951933234930038,
|
|
"num_tokens": 17718605.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 5.272250413894653,
|
|
"epoch": 0.7430355427473583,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004952789264247474,
|
|
"loss": 5.1757,
|
|
"mean_token_accuracy": 0.1989718645811081,
|
|
"num_tokens": 17730275.0,
|
|
"step": 7735
|
|
},
|
|
{
|
|
"entropy": 5.323815870285034,
|
|
"epoch": 0.7435158501440923,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004952719376873156,
|
|
"loss": 5.2921,
|
|
"mean_token_accuracy": 0.19190367460250854,
|
|
"num_tokens": 17741390.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 5.308934831619263,
|
|
"epoch": 0.7439961575408262,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004952649438358234,
|
|
"loss": 5.1731,
|
|
"mean_token_accuracy": 0.20842925161123277,
|
|
"num_tokens": 17752354.0,
|
|
"step": 7745
|
|
},
|
|
{
|
|
"entropy": 5.308169984817505,
|
|
"epoch": 0.74447646493756,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004952579448704334,
|
|
"loss": 5.1631,
|
|
"mean_token_accuracy": 0.20139929056167602,
|
|
"num_tokens": 17762839.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 5.350342273712158,
|
|
"epoch": 0.7449567723342939,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000495250940791308,
|
|
"loss": 5.2605,
|
|
"mean_token_accuracy": 0.19137903451919555,
|
|
"num_tokens": 17775800.0,
|
|
"step": 7755
|
|
},
|
|
{
|
|
"entropy": 5.306150579452515,
|
|
"epoch": 0.7454370797310279,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004952439315986096,
|
|
"loss": 5.1811,
|
|
"mean_token_accuracy": 0.19805798083543777,
|
|
"num_tokens": 17787804.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 5.341886854171753,
|
|
"epoch": 0.7459173871277618,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004952369172925012,
|
|
"loss": 5.2853,
|
|
"mean_token_accuracy": 0.1993953987956047,
|
|
"num_tokens": 17800291.0,
|
|
"step": 7765
|
|
},
|
|
{
|
|
"entropy": 5.292854881286621,
|
|
"epoch": 0.7463976945244957,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004952298978731454,
|
|
"loss": 5.147,
|
|
"mean_token_accuracy": 0.20547049790620803,
|
|
"num_tokens": 17811548.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 5.28916335105896,
|
|
"epoch": 0.7468780019212296,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004952228733407055,
|
|
"loss": 5.1011,
|
|
"mean_token_accuracy": 0.20431289821863174,
|
|
"num_tokens": 17822589.0,
|
|
"step": 7775
|
|
},
|
|
{
|
|
"entropy": 5.175790548324585,
|
|
"epoch": 0.7473583093179635,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004952158436953444,
|
|
"loss": 5.1236,
|
|
"mean_token_accuracy": 0.20223413705825805,
|
|
"num_tokens": 17834203.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 5.22423152923584,
|
|
"epoch": 0.7478386167146974,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004952088089372252,
|
|
"loss": 5.2105,
|
|
"mean_token_accuracy": 0.19397516399621964,
|
|
"num_tokens": 17846238.0,
|
|
"step": 7785
|
|
},
|
|
{
|
|
"entropy": 5.331250286102295,
|
|
"epoch": 0.7483189241114313,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004952017690665114,
|
|
"loss": 5.1324,
|
|
"mean_token_accuracy": 0.2026346132159233,
|
|
"num_tokens": 17857640.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 5.280352592468262,
|
|
"epoch": 0.7487992315081652,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004951947240833664,
|
|
"loss": 5.1374,
|
|
"mean_token_accuracy": 0.20421989113092423,
|
|
"num_tokens": 17868755.0,
|
|
"step": 7795
|
|
},
|
|
{
|
|
"entropy": 5.23347110748291,
|
|
"epoch": 0.7492795389048992,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004951876739879537,
|
|
"loss": 5.2158,
|
|
"mean_token_accuracy": 0.1939207211136818,
|
|
"num_tokens": 17881078.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 5.29048261642456,
|
|
"epoch": 0.7497598463016331,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004951806187804371,
|
|
"loss": 5.1629,
|
|
"mean_token_accuracy": 0.19929923862218857,
|
|
"num_tokens": 17893888.0,
|
|
"step": 7805
|
|
},
|
|
{
|
|
"entropy": 5.296859693527222,
|
|
"epoch": 0.7502401536983669,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004951735584609804,
|
|
"loss": 5.2196,
|
|
"mean_token_accuracy": 0.19920673221349716,
|
|
"num_tokens": 17904443.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 5.286273050308227,
|
|
"epoch": 0.7507204610951008,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004951664930297474,
|
|
"loss": 5.217,
|
|
"mean_token_accuracy": 0.20082310885190963,
|
|
"num_tokens": 17918090.0,
|
|
"step": 7815
|
|
},
|
|
{
|
|
"entropy": 5.227938318252564,
|
|
"epoch": 0.7512007684918348,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000495159422486902,
|
|
"loss": 5.161,
|
|
"mean_token_accuracy": 0.20609356909990312,
|
|
"num_tokens": 17929233.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 5.242263078689575,
|
|
"epoch": 0.7516810758885687,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004951523468326088,
|
|
"loss": 5.1965,
|
|
"mean_token_accuracy": 0.19512112885713578,
|
|
"num_tokens": 17940580.0,
|
|
"step": 7825
|
|
},
|
|
{
|
|
"entropy": 5.342494058609009,
|
|
"epoch": 0.7521613832853026,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004951452660670317,
|
|
"loss": 5.278,
|
|
"mean_token_accuracy": 0.18720510900020598,
|
|
"num_tokens": 17953993.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 5.285890769958496,
|
|
"epoch": 0.7526416906820365,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004951381801903352,
|
|
"loss": 5.11,
|
|
"mean_token_accuracy": 0.20024892687797546,
|
|
"num_tokens": 17966033.0,
|
|
"step": 7835
|
|
},
|
|
{
|
|
"entropy": 5.114803695678711,
|
|
"epoch": 0.7531219980787704,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004951310892026839,
|
|
"loss": 5.0968,
|
|
"mean_token_accuracy": 0.2095889687538147,
|
|
"num_tokens": 17977943.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 5.293044853210449,
|
|
"epoch": 0.7536023054755043,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004951239931042424,
|
|
"loss": 5.1698,
|
|
"mean_token_accuracy": 0.20413365513086318,
|
|
"num_tokens": 17990135.0,
|
|
"step": 7845
|
|
},
|
|
{
|
|
"entropy": 5.279680919647217,
|
|
"epoch": 0.7540826128722382,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004951168918951753,
|
|
"loss": 5.2056,
|
|
"mean_token_accuracy": 0.20387261509895324,
|
|
"num_tokens": 18002126.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 5.190751075744629,
|
|
"epoch": 0.7545629202689721,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004951097855756476,
|
|
"loss": 5.0763,
|
|
"mean_token_accuracy": 0.20258077830076218,
|
|
"num_tokens": 18013147.0,
|
|
"step": 7855
|
|
},
|
|
{
|
|
"entropy": 5.328785943984985,
|
|
"epoch": 0.7550432276657061,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004951026741458243,
|
|
"loss": 5.1906,
|
|
"mean_token_accuracy": 0.19995464086532594,
|
|
"num_tokens": 18025146.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 5.261568832397461,
|
|
"epoch": 0.75552353506244,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004950955576058705,
|
|
"loss": 5.1556,
|
|
"mean_token_accuracy": 0.19386046081781388,
|
|
"num_tokens": 18036412.0,
|
|
"step": 7865
|
|
},
|
|
{
|
|
"entropy": 5.259433937072754,
|
|
"epoch": 0.7560038424591738,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004950884359559513,
|
|
"loss": 5.2224,
|
|
"mean_token_accuracy": 0.20132519155740738,
|
|
"num_tokens": 18048041.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 5.339427757263183,
|
|
"epoch": 0.7564841498559077,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004950813091962324,
|
|
"loss": 5.277,
|
|
"mean_token_accuracy": 0.19186609387397766,
|
|
"num_tokens": 18060163.0,
|
|
"step": 7875
|
|
},
|
|
{
|
|
"entropy": 5.334090280532837,
|
|
"epoch": 0.7569644572526417,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004950741773268788,
|
|
"loss": 5.1936,
|
|
"mean_token_accuracy": 0.1961333230137825,
|
|
"num_tokens": 18071628.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 5.1901530742645265,
|
|
"epoch": 0.7574447646493756,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004950670403480562,
|
|
"loss": 5.0997,
|
|
"mean_token_accuracy": 0.20009191036224366,
|
|
"num_tokens": 18082979.0,
|
|
"step": 7885
|
|
},
|
|
{
|
|
"entropy": 5.252532863616944,
|
|
"epoch": 0.7579250720461095,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004950598982599306,
|
|
"loss": 5.1792,
|
|
"mean_token_accuracy": 0.20101021528244017,
|
|
"num_tokens": 18093889.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 5.262256002426147,
|
|
"epoch": 0.7584053794428435,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004950527510626675,
|
|
"loss": 5.2165,
|
|
"mean_token_accuracy": 0.19798852652311325,
|
|
"num_tokens": 18105559.0,
|
|
"step": 7895
|
|
},
|
|
{
|
|
"entropy": 5.3081278800964355,
|
|
"epoch": 0.7588856868395774,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004950455987564329,
|
|
"loss": 5.2965,
|
|
"mean_token_accuracy": 0.19645372927188873,
|
|
"num_tokens": 18116316.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 5.334239339828491,
|
|
"epoch": 0.7593659942363112,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004950384413413931,
|
|
"loss": 5.1851,
|
|
"mean_token_accuracy": 0.19774624109268188,
|
|
"num_tokens": 18126851.0,
|
|
"step": 7905
|
|
},
|
|
{
|
|
"entropy": 5.322545146942138,
|
|
"epoch": 0.7598463016330451,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004950312788177139,
|
|
"loss": 5.277,
|
|
"mean_token_accuracy": 0.19229816943407058,
|
|
"num_tokens": 18139571.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 5.347561597824097,
|
|
"epoch": 0.7603266090297791,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004950241111855618,
|
|
"loss": 5.2449,
|
|
"mean_token_accuracy": 0.19438967555761338,
|
|
"num_tokens": 18150680.0,
|
|
"step": 7915
|
|
},
|
|
{
|
|
"entropy": 5.285850143432617,
|
|
"epoch": 0.760806916426513,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004950169384451031,
|
|
"loss": 5.1977,
|
|
"mean_token_accuracy": 0.20325633138418198,
|
|
"num_tokens": 18161911.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 5.3055215835571286,
|
|
"epoch": 0.7612872238232469,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004950097605965045,
|
|
"loss": 5.1865,
|
|
"mean_token_accuracy": 0.20584756135940552,
|
|
"num_tokens": 18172714.0,
|
|
"step": 7925
|
|
},
|
|
{
|
|
"entropy": 5.3049579620361325,
|
|
"epoch": 0.7617675312199808,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004950025776399326,
|
|
"loss": 5.1777,
|
|
"mean_token_accuracy": 0.20212606489658355,
|
|
"num_tokens": 18184250.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 5.343746089935303,
|
|
"epoch": 0.7622478386167147,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000494995389575554,
|
|
"loss": 5.298,
|
|
"mean_token_accuracy": 0.19154924601316453,
|
|
"num_tokens": 18195299.0,
|
|
"step": 7935
|
|
},
|
|
{
|
|
"entropy": 5.377411794662476,
|
|
"epoch": 0.7627281460134486,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004949881964035357,
|
|
"loss": 5.3633,
|
|
"mean_token_accuracy": 0.1878654807806015,
|
|
"num_tokens": 18206863.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 5.3593430519104,
|
|
"epoch": 0.7632084534101825,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004949809981240448,
|
|
"loss": 5.2789,
|
|
"mean_token_accuracy": 0.19559144079685212,
|
|
"num_tokens": 18219643.0,
|
|
"step": 7945
|
|
},
|
|
{
|
|
"entropy": 5.277910900115967,
|
|
"epoch": 0.7636887608069164,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004949737947372483,
|
|
"loss": 5.1325,
|
|
"mean_token_accuracy": 0.20357694774866103,
|
|
"num_tokens": 18230461.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 5.320190668106079,
|
|
"epoch": 0.7641690682036504,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004949665862433134,
|
|
"loss": 5.2868,
|
|
"mean_token_accuracy": 0.19289156794548035,
|
|
"num_tokens": 18243768.0,
|
|
"step": 7955
|
|
},
|
|
{
|
|
"entropy": 5.296690511703491,
|
|
"epoch": 0.7646493756003843,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004949593726424077,
|
|
"loss": 5.1664,
|
|
"mean_token_accuracy": 0.2003849595785141,
|
|
"num_tokens": 18255322.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 5.291482019424438,
|
|
"epoch": 0.7651296829971181,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004949521539346985,
|
|
"loss": 5.2128,
|
|
"mean_token_accuracy": 0.20742505341768264,
|
|
"num_tokens": 18267262.0,
|
|
"step": 7965
|
|
},
|
|
{
|
|
"entropy": 5.294230127334595,
|
|
"epoch": 0.765609990393852,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004949449301203533,
|
|
"loss": 5.1096,
|
|
"mean_token_accuracy": 0.20100534409284593,
|
|
"num_tokens": 18277332.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 5.269995260238647,
|
|
"epoch": 0.766090297790586,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004949377011995399,
|
|
"loss": 5.2462,
|
|
"mean_token_accuracy": 0.20773502439260483,
|
|
"num_tokens": 18289547.0,
|
|
"step": 7975
|
|
},
|
|
{
|
|
"entropy": 5.273986148834228,
|
|
"epoch": 0.7665706051873199,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004949304671724263,
|
|
"loss": 5.1432,
|
|
"mean_token_accuracy": 0.20398979485034943,
|
|
"num_tokens": 18302097.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 5.271750497817993,
|
|
"epoch": 0.7670509125840538,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004949232280391802,
|
|
"loss": 5.2367,
|
|
"mean_token_accuracy": 0.19723534286022187,
|
|
"num_tokens": 18314151.0,
|
|
"step": 7985
|
|
},
|
|
{
|
|
"entropy": 5.288878488540649,
|
|
"epoch": 0.7675312199807877,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004949159837999698,
|
|
"loss": 5.17,
|
|
"mean_token_accuracy": 0.19713514000177385,
|
|
"num_tokens": 18326085.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 5.297585439682007,
|
|
"epoch": 0.7680115273775217,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004949087344549633,
|
|
"loss": 5.1623,
|
|
"mean_token_accuracy": 0.19341499507427215,
|
|
"num_tokens": 18338158.0,
|
|
"step": 7995
|
|
},
|
|
{
|
|
"entropy": 5.30235710144043,
|
|
"epoch": 0.7684918347742555,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000494901480004329,
|
|
"loss": 5.1809,
|
|
"mean_token_accuracy": 0.20551337599754332,
|
|
"num_tokens": 18349418.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 5.301124715805054,
|
|
"epoch": 0.7689721421709894,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004948942204482351,
|
|
"loss": 5.1761,
|
|
"mean_token_accuracy": 0.20214684456586837,
|
|
"num_tokens": 18361964.0,
|
|
"step": 8005
|
|
},
|
|
{
|
|
"entropy": 5.219300603866577,
|
|
"epoch": 0.7694524495677233,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004948869557868506,
|
|
"loss": 5.0979,
|
|
"mean_token_accuracy": 0.21326844096183778,
|
|
"num_tokens": 18373108.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 5.1787127494812015,
|
|
"epoch": 0.7699327569644573,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004948796860203439,
|
|
"loss": 5.123,
|
|
"mean_token_accuracy": 0.20177519619464873,
|
|
"num_tokens": 18385310.0,
|
|
"step": 8015
|
|
},
|
|
{
|
|
"entropy": 5.337287092208863,
|
|
"epoch": 0.7704130643611912,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004948724111488838,
|
|
"loss": 5.2967,
|
|
"mean_token_accuracy": 0.19337289929389953,
|
|
"num_tokens": 18396132.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 5.4783307075500485,
|
|
"epoch": 0.770893371757925,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004948651311726391,
|
|
"loss": 5.4042,
|
|
"mean_token_accuracy": 0.18754971623420716,
|
|
"num_tokens": 18409930.0,
|
|
"step": 8025
|
|
},
|
|
{
|
|
"entropy": 5.384222173690796,
|
|
"epoch": 0.7713736791546589,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004948578460917789,
|
|
"loss": 5.2773,
|
|
"mean_token_accuracy": 0.19422808140516282,
|
|
"num_tokens": 18421204.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 5.340588712692261,
|
|
"epoch": 0.7718539865513929,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004948505559064725,
|
|
"loss": 5.1639,
|
|
"mean_token_accuracy": 0.20545457750558854,
|
|
"num_tokens": 18433194.0,
|
|
"step": 8035
|
|
},
|
|
{
|
|
"entropy": 5.259980058670044,
|
|
"epoch": 0.7723342939481268,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004948432606168889,
|
|
"loss": 5.1246,
|
|
"mean_token_accuracy": 0.20445887744426727,
|
|
"num_tokens": 18445282.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 5.232318782806397,
|
|
"epoch": 0.7728146013448607,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004948359602231976,
|
|
"loss": 5.1841,
|
|
"mean_token_accuracy": 0.20268695801496506,
|
|
"num_tokens": 18456264.0,
|
|
"step": 8045
|
|
},
|
|
{
|
|
"entropy": 5.2557531833648685,
|
|
"epoch": 0.7732949087415946,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004948286547255681,
|
|
"loss": 5.12,
|
|
"mean_token_accuracy": 0.20428049117326735,
|
|
"num_tokens": 18467573.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 5.278136920928955,
|
|
"epoch": 0.7737752161383286,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00049482134412417,
|
|
"loss": 5.0755,
|
|
"mean_token_accuracy": 0.2096497043967247,
|
|
"num_tokens": 18478333.0,
|
|
"step": 8055
|
|
},
|
|
{
|
|
"entropy": 5.19740858078003,
|
|
"epoch": 0.7742555235350624,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000494814028419173,
|
|
"loss": 5.1055,
|
|
"mean_token_accuracy": 0.20423219799995423,
|
|
"num_tokens": 18489106.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 5.313218450546264,
|
|
"epoch": 0.7747358309317963,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000494806707610747,
|
|
"loss": 5.2388,
|
|
"mean_token_accuracy": 0.1978022873401642,
|
|
"num_tokens": 18500090.0,
|
|
"step": 8065
|
|
},
|
|
{
|
|
"entropy": 5.315935182571411,
|
|
"epoch": 0.7752161383285303,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000494799381699062,
|
|
"loss": 5.1933,
|
|
"mean_token_accuracy": 0.20687556862831116,
|
|
"num_tokens": 18510305.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 5.306492900848388,
|
|
"epoch": 0.7756964457252642,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004947920506842879,
|
|
"loss": 5.1982,
|
|
"mean_token_accuracy": 0.1990632399916649,
|
|
"num_tokens": 18523048.0,
|
|
"step": 8075
|
|
},
|
|
{
|
|
"entropy": 5.279481077194214,
|
|
"epoch": 0.7761767531219981,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004947847145665951,
|
|
"loss": 5.1368,
|
|
"mean_token_accuracy": 0.2043842852115631,
|
|
"num_tokens": 18534145.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 5.309163236618042,
|
|
"epoch": 0.776657060518732,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004947773733461539,
|
|
"loss": 5.18,
|
|
"mean_token_accuracy": 0.20123105943202974,
|
|
"num_tokens": 18545045.0,
|
|
"step": 8085
|
|
},
|
|
{
|
|
"entropy": 5.278306007385254,
|
|
"epoch": 0.777137367915466,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004947700270231347,
|
|
"loss": 5.1526,
|
|
"mean_token_accuracy": 0.2032118022441864,
|
|
"num_tokens": 18557531.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 5.23764853477478,
|
|
"epoch": 0.7776176753121998,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004947626755977079,
|
|
"loss": 5.1887,
|
|
"mean_token_accuracy": 0.19730121344327928,
|
|
"num_tokens": 18569127.0,
|
|
"step": 8095
|
|
},
|
|
{
|
|
"entropy": 5.3856611251831055,
|
|
"epoch": 0.7780979827089337,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004947553190700444,
|
|
"loss": 5.2255,
|
|
"mean_token_accuracy": 0.20432638376951218,
|
|
"num_tokens": 18580606.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 5.353847932815552,
|
|
"epoch": 0.7785782901056676,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000494747957440315,
|
|
"loss": 5.3448,
|
|
"mean_token_accuracy": 0.1883766993880272,
|
|
"num_tokens": 18592330.0,
|
|
"step": 8105
|
|
},
|
|
{
|
|
"entropy": 5.406799602508545,
|
|
"epoch": 0.7790585975024016,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004947405907086905,
|
|
"loss": 5.3101,
|
|
"mean_token_accuracy": 0.1930047556757927,
|
|
"num_tokens": 18604721.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 5.2951795101165775,
|
|
"epoch": 0.7795389048991355,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004947332188753419,
|
|
"loss": 5.1811,
|
|
"mean_token_accuracy": 0.2039830431342125,
|
|
"num_tokens": 18616814.0,
|
|
"step": 8115
|
|
},
|
|
{
|
|
"entropy": 5.2599467754364015,
|
|
"epoch": 0.7800192122958693,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004947258419404405,
|
|
"loss": 5.1832,
|
|
"mean_token_accuracy": 0.19927904903888702,
|
|
"num_tokens": 18628224.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 5.2949143409729,
|
|
"epoch": 0.7804995196926032,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004947184599041576,
|
|
"loss": 5.2286,
|
|
"mean_token_accuracy": 0.19865068048238754,
|
|
"num_tokens": 18639777.0,
|
|
"step": 8125
|
|
},
|
|
{
|
|
"entropy": 5.241250896453858,
|
|
"epoch": 0.7809798270893372,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004947110727666644,
|
|
"loss": 5.1412,
|
|
"mean_token_accuracy": 0.2019078940153122,
|
|
"num_tokens": 18651044.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 5.28899393081665,
|
|
"epoch": 0.7814601344860711,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004947036805281325,
|
|
"loss": 5.2099,
|
|
"mean_token_accuracy": 0.2031030997633934,
|
|
"num_tokens": 18663142.0,
|
|
"step": 8135
|
|
},
|
|
{
|
|
"entropy": 5.4257384777069095,
|
|
"epoch": 0.781940441882805,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004946962831887336,
|
|
"loss": 5.2878,
|
|
"mean_token_accuracy": 0.1936602771282196,
|
|
"num_tokens": 18674079.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 5.335578870773316,
|
|
"epoch": 0.7824207492795389,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004946888807486393,
|
|
"loss": 5.272,
|
|
"mean_token_accuracy": 0.1942149966955185,
|
|
"num_tokens": 18685744.0,
|
|
"step": 8145
|
|
},
|
|
{
|
|
"entropy": 5.229654836654663,
|
|
"epoch": 0.7829010566762729,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004946814732080214,
|
|
"loss": 5.1577,
|
|
"mean_token_accuracy": 0.19906039535999298,
|
|
"num_tokens": 18697049.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 5.2480401515960695,
|
|
"epoch": 0.7833813640730067,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004946740605670523,
|
|
"loss": 5.171,
|
|
"mean_token_accuracy": 0.20052818953990936,
|
|
"num_tokens": 18708765.0,
|
|
"step": 8155
|
|
},
|
|
{
|
|
"entropy": 5.28274884223938,
|
|
"epoch": 0.7838616714697406,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004946666428259037,
|
|
"loss": 5.1497,
|
|
"mean_token_accuracy": 0.20465652495622635,
|
|
"num_tokens": 18719819.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 5.361910057067871,
|
|
"epoch": 0.7843419788664745,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004946592199847478,
|
|
"loss": 5.2101,
|
|
"mean_token_accuracy": 0.19702319502830506,
|
|
"num_tokens": 18730668.0,
|
|
"step": 8165
|
|
},
|
|
{
|
|
"entropy": 5.166529130935669,
|
|
"epoch": 0.7848222862632085,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004946517920437571,
|
|
"loss": 5.0232,
|
|
"mean_token_accuracy": 0.205000402033329,
|
|
"num_tokens": 18741991.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 5.288968944549561,
|
|
"epoch": 0.7853025936599424,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004946443590031041,
|
|
"loss": 5.2402,
|
|
"mean_token_accuracy": 0.1971651256084442,
|
|
"num_tokens": 18753919.0,
|
|
"step": 8175
|
|
},
|
|
{
|
|
"entropy": 5.322829055786133,
|
|
"epoch": 0.7857829010566763,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004946369208629613,
|
|
"loss": 5.136,
|
|
"mean_token_accuracy": 0.20793365240097045,
|
|
"num_tokens": 18764342.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 5.140931224822998,
|
|
"epoch": 0.7862632084534101,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004946294776235013,
|
|
"loss": 5.1578,
|
|
"mean_token_accuracy": 0.2009105786681175,
|
|
"num_tokens": 18776044.0,
|
|
"step": 8185
|
|
},
|
|
{
|
|
"entropy": 5.405412912368774,
|
|
"epoch": 0.7867435158501441,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004946220292848971,
|
|
"loss": 5.2153,
|
|
"mean_token_accuracy": 0.20181388556957244,
|
|
"num_tokens": 18787354.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 5.33922643661499,
|
|
"epoch": 0.787223823246878,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004946145758473214,
|
|
"loss": 5.1366,
|
|
"mean_token_accuracy": 0.20076511055231094,
|
|
"num_tokens": 18797845.0,
|
|
"step": 8195
|
|
},
|
|
{
|
|
"entropy": 5.270247936248779,
|
|
"epoch": 0.7877041306436119,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004946071173109475,
|
|
"loss": 5.2099,
|
|
"mean_token_accuracy": 0.19680924713611603,
|
|
"num_tokens": 18809253.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 5.210458469390869,
|
|
"epoch": 0.7881844380403458,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004945996536759484,
|
|
"loss": 5.0893,
|
|
"mean_token_accuracy": 0.20848129391670228,
|
|
"num_tokens": 18819768.0,
|
|
"step": 8205
|
|
},
|
|
{
|
|
"entropy": 5.4275593757629395,
|
|
"epoch": 0.7886647454370798,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004945921849424974,
|
|
"loss": 5.3408,
|
|
"mean_token_accuracy": 0.19263991117477416,
|
|
"num_tokens": 18831151.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 5.344443464279175,
|
|
"epoch": 0.7891450528338136,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004945847111107679,
|
|
"loss": 5.113,
|
|
"mean_token_accuracy": 0.20607621520757674,
|
|
"num_tokens": 18842133.0,
|
|
"step": 8215
|
|
},
|
|
{
|
|
"entropy": 5.285537433624268,
|
|
"epoch": 0.7896253602305475,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004945772321809334,
|
|
"loss": 5.2747,
|
|
"mean_token_accuracy": 0.19406631737947463,
|
|
"num_tokens": 18853295.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 5.242657232284546,
|
|
"epoch": 0.7901056676272814,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004945697481531677,
|
|
"loss": 5.2147,
|
|
"mean_token_accuracy": 0.20358818471431733,
|
|
"num_tokens": 18865802.0,
|
|
"step": 8225
|
|
},
|
|
{
|
|
"entropy": 5.291993951797485,
|
|
"epoch": 0.7905859750240154,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004945622590276443,
|
|
"loss": 5.1269,
|
|
"mean_token_accuracy": 0.21141389459371568,
|
|
"num_tokens": 18877693.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 5.316649341583252,
|
|
"epoch": 0.7910662824207493,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004945547648045373,
|
|
"loss": 5.1811,
|
|
"mean_token_accuracy": 0.20542819797992706,
|
|
"num_tokens": 18888549.0,
|
|
"step": 8235
|
|
},
|
|
{
|
|
"entropy": 5.251237916946411,
|
|
"epoch": 0.7915465898174832,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004945472654840206,
|
|
"loss": 5.1278,
|
|
"mean_token_accuracy": 0.20496677309274675,
|
|
"num_tokens": 18899132.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 5.192249727249146,
|
|
"epoch": 0.7920268972142172,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004945397610662683,
|
|
"loss": 5.1362,
|
|
"mean_token_accuracy": 0.1992405891418457,
|
|
"num_tokens": 18911774.0,
|
|
"step": 8245
|
|
},
|
|
{
|
|
"entropy": 5.370453500747681,
|
|
"epoch": 0.792507204610951,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004945322515514547,
|
|
"loss": 5.2096,
|
|
"mean_token_accuracy": 0.20203327834606172,
|
|
"num_tokens": 18922806.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 5.284592056274414,
|
|
"epoch": 0.7929875120076849,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000494524736939754,
|
|
"loss": 5.1768,
|
|
"mean_token_accuracy": 0.20510386675596237,
|
|
"num_tokens": 18934861.0,
|
|
"step": 8255
|
|
},
|
|
{
|
|
"entropy": 5.287734031677246,
|
|
"epoch": 0.7934678194044188,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004945172172313408,
|
|
"loss": 5.1468,
|
|
"mean_token_accuracy": 0.20960791260004044,
|
|
"num_tokens": 18944269.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 5.277343654632569,
|
|
"epoch": 0.7939481268011528,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004945096924263896,
|
|
"loss": 5.2071,
|
|
"mean_token_accuracy": 0.19710262566804887,
|
|
"num_tokens": 18957197.0,
|
|
"step": 8265
|
|
},
|
|
{
|
|
"entropy": 5.340837478637695,
|
|
"epoch": 0.7944284341978867,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004945021625250753,
|
|
"loss": 5.1948,
|
|
"mean_token_accuracy": 0.19287520945072173,
|
|
"num_tokens": 18968254.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 5.223880767822266,
|
|
"epoch": 0.7949087415946205,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004944946275275724,
|
|
"loss": 5.115,
|
|
"mean_token_accuracy": 0.2050468847155571,
|
|
"num_tokens": 18979372.0,
|
|
"step": 8275
|
|
},
|
|
{
|
|
"entropy": 5.223621273040772,
|
|
"epoch": 0.7953890489913544,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004944870874340561,
|
|
"loss": 5.1239,
|
|
"mean_token_accuracy": 0.20524471253156662,
|
|
"num_tokens": 18991075.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 5.2544965744018555,
|
|
"epoch": 0.7958693563880884,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004944795422447013,
|
|
"loss": 5.0548,
|
|
"mean_token_accuracy": 0.20748359262943267,
|
|
"num_tokens": 19002324.0,
|
|
"step": 8285
|
|
},
|
|
{
|
|
"entropy": 5.34930009841919,
|
|
"epoch": 0.7963496637848223,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004944719919596835,
|
|
"loss": 5.2493,
|
|
"mean_token_accuracy": 0.1979260191321373,
|
|
"num_tokens": 19014406.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 5.200772380828857,
|
|
"epoch": 0.7968299711815562,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004944644365791776,
|
|
"loss": 5.1136,
|
|
"mean_token_accuracy": 0.20155889242887498,
|
|
"num_tokens": 19025984.0,
|
|
"step": 8295
|
|
},
|
|
{
|
|
"entropy": 5.297162580490112,
|
|
"epoch": 0.7973102785782901,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000494456876103359,
|
|
"loss": 5.2354,
|
|
"mean_token_accuracy": 0.20328541100025177,
|
|
"num_tokens": 19036189.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 5.296557950973511,
|
|
"epoch": 0.7977905859750241,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004944493105324035,
|
|
"loss": 5.207,
|
|
"mean_token_accuracy": 0.19707799553871155,
|
|
"num_tokens": 19047587.0,
|
|
"step": 8305
|
|
},
|
|
{
|
|
"entropy": 5.326691627502441,
|
|
"epoch": 0.7982708933717579,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004944417398664866,
|
|
"loss": 5.2014,
|
|
"mean_token_accuracy": 0.1997044637799263,
|
|
"num_tokens": 19058467.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 5.3209089756011965,
|
|
"epoch": 0.7987512007684918,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004944341641057843,
|
|
"loss": 5.2646,
|
|
"mean_token_accuracy": 0.19579226821660994,
|
|
"num_tokens": 19070235.0,
|
|
"step": 8315
|
|
},
|
|
{
|
|
"entropy": 5.347403049468994,
|
|
"epoch": 0.7992315081652257,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004944265832504721,
|
|
"loss": 5.2227,
|
|
"mean_token_accuracy": 0.19858405888080596,
|
|
"num_tokens": 19082005.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 5.348876476287842,
|
|
"epoch": 0.7997118155619597,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004944189973007262,
|
|
"loss": 5.2395,
|
|
"mean_token_accuracy": 0.1973268657922745,
|
|
"num_tokens": 19092922.0,
|
|
"step": 8325
|
|
},
|
|
{
|
|
"entropy": 5.313145542144776,
|
|
"epoch": 0.8001921229586936,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004944114062567229,
|
|
"loss": 5.2259,
|
|
"mean_token_accuracy": 0.19848893135786055,
|
|
"num_tokens": 19104832.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 5.300992155075074,
|
|
"epoch": 0.8006724303554275,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004944038101186381,
|
|
"loss": 5.2693,
|
|
"mean_token_accuracy": 0.20075047612190247,
|
|
"num_tokens": 19116261.0,
|
|
"step": 8335
|
|
},
|
|
{
|
|
"entropy": 5.354841804504394,
|
|
"epoch": 0.8011527377521613,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004943962088866483,
|
|
"loss": 5.2437,
|
|
"mean_token_accuracy": 0.19408100843429565,
|
|
"num_tokens": 19127195.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 5.398009014129639,
|
|
"epoch": 0.8016330451488953,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004943886025609301,
|
|
"loss": 5.2731,
|
|
"mean_token_accuracy": 0.19554793536663057,
|
|
"num_tokens": 19138164.0,
|
|
"step": 8345
|
|
},
|
|
{
|
|
"entropy": 5.338754367828369,
|
|
"epoch": 0.8021133525456292,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00049438099114166,
|
|
"loss": 5.2362,
|
|
"mean_token_accuracy": 0.19320564866065978,
|
|
"num_tokens": 19149153.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 5.311693477630615,
|
|
"epoch": 0.8025936599423631,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004943733746290147,
|
|
"loss": 5.2499,
|
|
"mean_token_accuracy": 0.19263479709625245,
|
|
"num_tokens": 19161023.0,
|
|
"step": 8355
|
|
},
|
|
{
|
|
"entropy": 5.233406496047974,
|
|
"epoch": 0.803073967339097,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000494365753023171,
|
|
"loss": 5.1045,
|
|
"mean_token_accuracy": 0.21303804814815522,
|
|
"num_tokens": 19172466.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 5.315703201293945,
|
|
"epoch": 0.803554274735831,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004943581263243059,
|
|
"loss": 5.1847,
|
|
"mean_token_accuracy": 0.20322347730398177,
|
|
"num_tokens": 19183684.0,
|
|
"step": 8365
|
|
},
|
|
{
|
|
"entropy": 5.203450679779053,
|
|
"epoch": 0.8040345821325648,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004943504945325965,
|
|
"loss": 5.0808,
|
|
"mean_token_accuracy": 0.20951220840215684,
|
|
"num_tokens": 19194479.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 5.310950231552124,
|
|
"epoch": 0.8045148895292987,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004943428576482198,
|
|
"loss": 5.1797,
|
|
"mean_token_accuracy": 0.19598036706447602,
|
|
"num_tokens": 19206323.0,
|
|
"step": 8375
|
|
},
|
|
{
|
|
"entropy": 5.409931755065918,
|
|
"epoch": 0.8049951969260326,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004943352156713535,
|
|
"loss": 5.2646,
|
|
"mean_token_accuracy": 0.19424921572208403,
|
|
"num_tokens": 19218849.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 5.2228189468383786,
|
|
"epoch": 0.8054755043227666,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004943275686021747,
|
|
"loss": 5.0933,
|
|
"mean_token_accuracy": 0.2045307993888855,
|
|
"num_tokens": 19229603.0,
|
|
"step": 8385
|
|
},
|
|
{
|
|
"entropy": 5.316561031341553,
|
|
"epoch": 0.8059558117195005,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000494319916440861,
|
|
"loss": 5.2698,
|
|
"mean_token_accuracy": 0.19353571087121962,
|
|
"num_tokens": 19241318.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 5.331172943115234,
|
|
"epoch": 0.8064361191162344,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004943122591875901,
|
|
"loss": 5.1608,
|
|
"mean_token_accuracy": 0.201753132045269,
|
|
"num_tokens": 19252640.0,
|
|
"step": 8395
|
|
},
|
|
{
|
|
"entropy": 5.2201464653015135,
|
|
"epoch": 0.8069164265129684,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004943045968425398,
|
|
"loss": 5.1455,
|
|
"mean_token_accuracy": 0.20201311111450196,
|
|
"num_tokens": 19262971.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 5.3145428657531735,
|
|
"epoch": 0.8073967339097022,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004942969294058878,
|
|
"loss": 5.2328,
|
|
"mean_token_accuracy": 0.1995360553264618,
|
|
"num_tokens": 19274426.0,
|
|
"step": 8405
|
|
},
|
|
{
|
|
"entropy": 5.307784461975098,
|
|
"epoch": 0.8078770413064361,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004942892568778125,
|
|
"loss": 5.1895,
|
|
"mean_token_accuracy": 0.20282406657934188,
|
|
"num_tokens": 19286806.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 5.257102823257446,
|
|
"epoch": 0.80835734870317,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004942815792584917,
|
|
"loss": 5.164,
|
|
"mean_token_accuracy": 0.20003714710474013,
|
|
"num_tokens": 19297997.0,
|
|
"step": 8415
|
|
},
|
|
{
|
|
"entropy": 5.2559874057769775,
|
|
"epoch": 0.808837656099904,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004942738965481038,
|
|
"loss": 5.1548,
|
|
"mean_token_accuracy": 0.2016161561012268,
|
|
"num_tokens": 19309789.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 5.270598459243774,
|
|
"epoch": 0.8093179634966379,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004942662087468272,
|
|
"loss": 5.1497,
|
|
"mean_token_accuracy": 0.2061583325266838,
|
|
"num_tokens": 19320688.0,
|
|
"step": 8425
|
|
},
|
|
{
|
|
"entropy": 5.307732200622558,
|
|
"epoch": 0.8097982708933718,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004942585158548402,
|
|
"loss": 5.1946,
|
|
"mean_token_accuracy": 0.2020473822951317,
|
|
"num_tokens": 19331670.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 5.252246427536011,
|
|
"epoch": 0.8102785782901056,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004942508178723214,
|
|
"loss": 5.1434,
|
|
"mean_token_accuracy": 0.19983597844839096,
|
|
"num_tokens": 19343578.0,
|
|
"step": 8435
|
|
},
|
|
{
|
|
"entropy": 5.280768489837646,
|
|
"epoch": 0.8107588856868396,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004942431147994499,
|
|
"loss": 5.1919,
|
|
"mean_token_accuracy": 0.20007235407829285,
|
|
"num_tokens": 19354875.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 5.264632368087769,
|
|
"epoch": 0.8112391930835735,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004942354066364042,
|
|
"loss": 5.1298,
|
|
"mean_token_accuracy": 0.20225782990455626,
|
|
"num_tokens": 19366353.0,
|
|
"step": 8445
|
|
},
|
|
{
|
|
"entropy": 5.154706716537476,
|
|
"epoch": 0.8117195004803074,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004942276933833634,
|
|
"loss": 5.0671,
|
|
"mean_token_accuracy": 0.2101285368204117,
|
|
"num_tokens": 19377534.0,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"entropy": 5.28000054359436,
|
|
"epoch": 0.8121998078770413,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004942199750405064,
|
|
"loss": 5.2018,
|
|
"mean_token_accuracy": 0.19314154237508774,
|
|
"num_tokens": 19388188.0,
|
|
"step": 8455
|
|
},
|
|
{
|
|
"entropy": 5.197183513641358,
|
|
"epoch": 0.8126801152737753,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004942122516080127,
|
|
"loss": 5.0585,
|
|
"mean_token_accuracy": 0.21394696682691575,
|
|
"num_tokens": 19399910.0,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"entropy": 5.321963691711426,
|
|
"epoch": 0.8131604226705091,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004942045230860614,
|
|
"loss": 5.2039,
|
|
"mean_token_accuracy": 0.20521147847175597,
|
|
"num_tokens": 19411715.0,
|
|
"step": 8465
|
|
},
|
|
{
|
|
"entropy": 5.375604724884033,
|
|
"epoch": 0.813640730067243,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004941967894748319,
|
|
"loss": 5.2942,
|
|
"mean_token_accuracy": 0.19328842014074327,
|
|
"num_tokens": 19423275.0,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"entropy": 5.287278127670288,
|
|
"epoch": 0.8141210374639769,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004941890507745039,
|
|
"loss": 5.221,
|
|
"mean_token_accuracy": 0.1964712470769882,
|
|
"num_tokens": 19436035.0,
|
|
"step": 8475
|
|
},
|
|
{
|
|
"entropy": 5.246680879592896,
|
|
"epoch": 0.8146013448607109,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004941813069852569,
|
|
"loss": 5.1855,
|
|
"mean_token_accuracy": 0.19806223958730698,
|
|
"num_tokens": 19447755.0,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"entropy": 5.337065172195435,
|
|
"epoch": 0.8150816522574448,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004941735581072708,
|
|
"loss": 5.1272,
|
|
"mean_token_accuracy": 0.20841425210237502,
|
|
"num_tokens": 19459044.0,
|
|
"step": 8485
|
|
},
|
|
{
|
|
"entropy": 5.253565120697021,
|
|
"epoch": 0.8155619596541787,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004941658041407255,
|
|
"loss": 5.1429,
|
|
"mean_token_accuracy": 0.20152915716171266,
|
|
"num_tokens": 19471486.0,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"entropy": 5.191393518447876,
|
|
"epoch": 0.8160422670509125,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000494158045085801,
|
|
"loss": 5.0798,
|
|
"mean_token_accuracy": 0.2094142973423004,
|
|
"num_tokens": 19482428.0,
|
|
"step": 8495
|
|
},
|
|
{
|
|
"entropy": 5.303775215148926,
|
|
"epoch": 0.8165225744476465,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004941502809426776,
|
|
"loss": 5.2344,
|
|
"mean_token_accuracy": 0.20312505811452866,
|
|
"num_tokens": 19494616.0,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"entropy": 5.311273241043091,
|
|
"epoch": 0.8170028818443804,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004941425117115354,
|
|
"loss": 5.1312,
|
|
"mean_token_accuracy": 0.19991155862808227,
|
|
"num_tokens": 19504953.0,
|
|
"step": 8505
|
|
},
|
|
{
|
|
"entropy": 5.307030820846558,
|
|
"epoch": 0.8174831892411143,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004941347373925547,
|
|
"loss": 5.2125,
|
|
"mean_token_accuracy": 0.20029536336660386,
|
|
"num_tokens": 19517408.0,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"entropy": 5.188129425048828,
|
|
"epoch": 0.8179634966378482,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004941269579859161,
|
|
"loss": 5.1463,
|
|
"mean_token_accuracy": 0.20623117536306382,
|
|
"num_tokens": 19529190.0,
|
|
"step": 8515
|
|
},
|
|
{
|
|
"entropy": 5.283451843261719,
|
|
"epoch": 0.8184438040345822,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004941191734918002,
|
|
"loss": 5.1822,
|
|
"mean_token_accuracy": 0.19969442188739778,
|
|
"num_tokens": 19540278.0,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"entropy": 5.270493650436402,
|
|
"epoch": 0.818924111431316,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000494111383910388,
|
|
"loss": 5.1628,
|
|
"mean_token_accuracy": 0.2021285906434059,
|
|
"num_tokens": 19551224.0,
|
|
"step": 8525
|
|
},
|
|
{
|
|
"entropy": 5.344288444519043,
|
|
"epoch": 0.8194044188280499,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004941035892418597,
|
|
"loss": 5.2,
|
|
"mean_token_accuracy": 0.19705056995153428,
|
|
"num_tokens": 19564386.0,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"entropy": 5.330119323730469,
|
|
"epoch": 0.8198847262247838,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004940957894863968,
|
|
"loss": 5.1846,
|
|
"mean_token_accuracy": 0.20080768764019014,
|
|
"num_tokens": 19576170.0,
|
|
"step": 8535
|
|
},
|
|
{
|
|
"entropy": 5.294156408309936,
|
|
"epoch": 0.8203650336215178,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004940879846441804,
|
|
"loss": 5.1453,
|
|
"mean_token_accuracy": 0.20027249306440353,
|
|
"num_tokens": 19587220.0,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"entropy": 5.254690933227539,
|
|
"epoch": 0.8208453410182517,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004940801747153914,
|
|
"loss": 5.152,
|
|
"mean_token_accuracy": 0.20080652981996536,
|
|
"num_tokens": 19598649.0,
|
|
"step": 8545
|
|
},
|
|
{
|
|
"entropy": 5.252722549438476,
|
|
"epoch": 0.8213256484149856,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004940723597002113,
|
|
"loss": 5.2088,
|
|
"mean_token_accuracy": 0.1964610293507576,
|
|
"num_tokens": 19610243.0,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"entropy": 5.271253156661987,
|
|
"epoch": 0.8218059558117194,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004940645395988216,
|
|
"loss": 5.2269,
|
|
"mean_token_accuracy": 0.1978047624230385,
|
|
"num_tokens": 19621467.0,
|
|
"step": 8555
|
|
},
|
|
{
|
|
"entropy": 5.397564172744751,
|
|
"epoch": 0.8222862632084534,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004940567144114036,
|
|
"loss": 5.3089,
|
|
"mean_token_accuracy": 0.18792566508054734,
|
|
"num_tokens": 19633367.0,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"entropy": 5.240535068511963,
|
|
"epoch": 0.8227665706051873,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004940488841381393,
|
|
"loss": 5.1488,
|
|
"mean_token_accuracy": 0.20554967522621154,
|
|
"num_tokens": 19643144.0,
|
|
"step": 8565
|
|
},
|
|
{
|
|
"entropy": 5.280447959899902,
|
|
"epoch": 0.8232468780019212,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004940410487792103,
|
|
"loss": 5.238,
|
|
"mean_token_accuracy": 0.19904158860445023,
|
|
"num_tokens": 19654501.0,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"entropy": 5.359654140472412,
|
|
"epoch": 0.8237271853986552,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004940332083347986,
|
|
"loss": 5.1943,
|
|
"mean_token_accuracy": 0.195090389251709,
|
|
"num_tokens": 19665382.0,
|
|
"step": 8575
|
|
},
|
|
{
|
|
"entropy": 5.263187026977539,
|
|
"epoch": 0.8242074927953891,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004940253628050861,
|
|
"loss": 5.0497,
|
|
"mean_token_accuracy": 0.2040240526199341,
|
|
"num_tokens": 19677222.0,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"entropy": 5.184061717987061,
|
|
"epoch": 0.824687800192123,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004940175121902552,
|
|
"loss": 5.1351,
|
|
"mean_token_accuracy": 0.2033315122127533,
|
|
"num_tokens": 19688550.0,
|
|
"step": 8585
|
|
},
|
|
{
|
|
"entropy": 5.364740371704102,
|
|
"epoch": 0.8251681075888568,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000494009656490488,
|
|
"loss": 5.2543,
|
|
"mean_token_accuracy": 0.19725525230169297,
|
|
"num_tokens": 19700125.0,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"entropy": 5.361681652069092,
|
|
"epoch": 0.8256484149855908,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004940017957059668,
|
|
"loss": 5.2424,
|
|
"mean_token_accuracy": 0.19780150651931763,
|
|
"num_tokens": 19711969.0,
|
|
"step": 8595
|
|
},
|
|
{
|
|
"entropy": 5.24332218170166,
|
|
"epoch": 0.8261287223823247,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004939939298368742,
|
|
"loss": 5.1943,
|
|
"mean_token_accuracy": 0.20086236745119096,
|
|
"num_tokens": 19723813.0,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"entropy": 5.300223016738892,
|
|
"epoch": 0.8266090297790586,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004939860588833929,
|
|
"loss": 5.2175,
|
|
"mean_token_accuracy": 0.1928972378373146,
|
|
"num_tokens": 19735539.0,
|
|
"step": 8605
|
|
},
|
|
{
|
|
"entropy": 5.409295749664307,
|
|
"epoch": 0.8270893371757925,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004939781828457055,
|
|
"loss": 5.3039,
|
|
"mean_token_accuracy": 0.19560184627771376,
|
|
"num_tokens": 19747106.0,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"entropy": 5.279783630371094,
|
|
"epoch": 0.8275696445725265,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000493970301723995,
|
|
"loss": 5.2012,
|
|
"mean_token_accuracy": 0.21216825842857362,
|
|
"num_tokens": 19760156.0,
|
|
"step": 8615
|
|
},
|
|
{
|
|
"entropy": 5.249637460708618,
|
|
"epoch": 0.8280499519692603,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004939624155184443,
|
|
"loss": 5.0916,
|
|
"mean_token_accuracy": 0.20376598685979844,
|
|
"num_tokens": 19771256.0,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"entropy": 5.247362232208252,
|
|
"epoch": 0.8285302593659942,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004939545242292365,
|
|
"loss": 5.0985,
|
|
"mean_token_accuracy": 0.20896128118038176,
|
|
"num_tokens": 19781332.0,
|
|
"step": 8625
|
|
},
|
|
{
|
|
"entropy": 5.298260116577149,
|
|
"epoch": 0.8290105667627281,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004939466278565547,
|
|
"loss": 5.2686,
|
|
"mean_token_accuracy": 0.19464389234781265,
|
|
"num_tokens": 19793573.0,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"entropy": 5.274344348907471,
|
|
"epoch": 0.8294908741594621,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004939387264005825,
|
|
"loss": 5.1697,
|
|
"mean_token_accuracy": 0.20536390393972398,
|
|
"num_tokens": 19803421.0,
|
|
"step": 8635
|
|
},
|
|
{
|
|
"entropy": 5.219406032562256,
|
|
"epoch": 0.829971181556196,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004939308198615031,
|
|
"loss": 5.0333,
|
|
"mean_token_accuracy": 0.21232510209083558,
|
|
"num_tokens": 19814440.0,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"entropy": 5.218847370147705,
|
|
"epoch": 0.8304514889529299,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004939229082395001,
|
|
"loss": 5.1656,
|
|
"mean_token_accuracy": 0.203065949678421,
|
|
"num_tokens": 19825721.0,
|
|
"step": 8645
|
|
},
|
|
{
|
|
"entropy": 5.23597731590271,
|
|
"epoch": 0.8309317963496637,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004939149915347573,
|
|
"loss": 5.118,
|
|
"mean_token_accuracy": 0.20436252951622008,
|
|
"num_tokens": 19837273.0,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"entropy": 5.303889560699463,
|
|
"epoch": 0.8314121037463977,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004939070697474585,
|
|
"loss": 5.1542,
|
|
"mean_token_accuracy": 0.20653810799121858,
|
|
"num_tokens": 19849436.0,
|
|
"step": 8655
|
|
},
|
|
{
|
|
"entropy": 5.191540098190307,
|
|
"epoch": 0.8318924111431316,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004938991428777875,
|
|
"loss": 5.024,
|
|
"mean_token_accuracy": 0.21526659429073333,
|
|
"num_tokens": 19860222.0,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"entropy": 5.258981561660766,
|
|
"epoch": 0.8323727185398655,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004938912109259284,
|
|
"loss": 5.1197,
|
|
"mean_token_accuracy": 0.20716548562049866,
|
|
"num_tokens": 19870934.0,
|
|
"step": 8665
|
|
},
|
|
{
|
|
"entropy": 5.302361869812012,
|
|
"epoch": 0.8328530259365994,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004938832738920654,
|
|
"loss": 5.1999,
|
|
"mean_token_accuracy": 0.19550460278987886,
|
|
"num_tokens": 19882149.0,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"entropy": 5.235181427001953,
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004938753317763826,
|
|
"loss": 5.1105,
|
|
"mean_token_accuracy": 0.20328403115272523,
|
|
"num_tokens": 19893276.0,
|
|
"step": 8675
|
|
},
|
|
{
|
|
"entropy": 5.312360382080078,
|
|
"epoch": 0.8338136407300673,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004938673845790646,
|
|
"loss": 5.285,
|
|
"mean_token_accuracy": 0.193190498650074,
|
|
"num_tokens": 19904723.0,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"entropy": 5.3193567276000975,
|
|
"epoch": 0.8342939481268011,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004938594323002957,
|
|
"loss": 5.1907,
|
|
"mean_token_accuracy": 0.1986311361193657,
|
|
"num_tokens": 19915226.0,
|
|
"step": 8685
|
|
},
|
|
{
|
|
"entropy": 5.285708665847778,
|
|
"epoch": 0.834774255523535,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004938514749402609,
|
|
"loss": 5.2077,
|
|
"mean_token_accuracy": 0.19915120750665666,
|
|
"num_tokens": 19927911.0,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"entropy": 5.2792503356933596,
|
|
"epoch": 0.835254562920269,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004938435124991447,
|
|
"loss": 5.2415,
|
|
"mean_token_accuracy": 0.2002886116504669,
|
|
"num_tokens": 19940356.0,
|
|
"step": 8695
|
|
},
|
|
{
|
|
"entropy": 5.316353034973145,
|
|
"epoch": 0.8357348703170029,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004938355449771318,
|
|
"loss": 5.2102,
|
|
"mean_token_accuracy": 0.1947380557656288,
|
|
"num_tokens": 19951108.0,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"entropy": 5.217819595336914,
|
|
"epoch": 0.8362151777137368,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004938275723744075,
|
|
"loss": 5.1384,
|
|
"mean_token_accuracy": 0.20654254257678986,
|
|
"num_tokens": 19962427.0,
|
|
"step": 8705
|
|
},
|
|
{
|
|
"entropy": 5.329318904876709,
|
|
"epoch": 0.8366954851104706,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004938195946911567,
|
|
"loss": 5.2467,
|
|
"mean_token_accuracy": 0.1941828101873398,
|
|
"num_tokens": 19973476.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 5.303026580810547,
|
|
"epoch": 0.8371757925072046,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004938116119275645,
|
|
"loss": 5.1976,
|
|
"mean_token_accuracy": 0.20169365853071214,
|
|
"num_tokens": 19984034.0,
|
|
"step": 8715
|
|
},
|
|
{
|
|
"entropy": 5.358089828491211,
|
|
"epoch": 0.8376560999039385,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004938036240838166,
|
|
"loss": 5.2273,
|
|
"mean_token_accuracy": 0.19537217020988465,
|
|
"num_tokens": 19996035.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 5.276670169830322,
|
|
"epoch": 0.8381364073006724,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004937956311600983,
|
|
"loss": 5.1475,
|
|
"mean_token_accuracy": 0.20285791158676147,
|
|
"num_tokens": 20007259.0,
|
|
"step": 8725
|
|
},
|
|
{
|
|
"entropy": 5.261865663528442,
|
|
"epoch": 0.8386167146974063,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004937876331565951,
|
|
"loss": 5.1463,
|
|
"mean_token_accuracy": 0.19915680289268495,
|
|
"num_tokens": 20018673.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 5.294298076629639,
|
|
"epoch": 0.8390970220941403,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004937796300734926,
|
|
"loss": 5.1975,
|
|
"mean_token_accuracy": 0.20387612730264665,
|
|
"num_tokens": 20030767.0,
|
|
"step": 8735
|
|
},
|
|
{
|
|
"entropy": 5.3252601146698,
|
|
"epoch": 0.8395773294908742,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004937716219109769,
|
|
"loss": 5.2188,
|
|
"mean_token_accuracy": 0.19471233934164048,
|
|
"num_tokens": 20041268.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 5.241558361053467,
|
|
"epoch": 0.840057636887608,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004937636086692336,
|
|
"loss": 5.0847,
|
|
"mean_token_accuracy": 0.21300528049468995,
|
|
"num_tokens": 20052285.0,
|
|
"step": 8745
|
|
},
|
|
{
|
|
"entropy": 5.35512056350708,
|
|
"epoch": 0.840537944284342,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000493755590348449,
|
|
"loss": 5.2941,
|
|
"mean_token_accuracy": 0.19627934098243713,
|
|
"num_tokens": 20065113.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 5.437167024612426,
|
|
"epoch": 0.8410182516810759,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004937475669488091,
|
|
"loss": 5.2709,
|
|
"mean_token_accuracy": 0.1977367326617241,
|
|
"num_tokens": 20076151.0,
|
|
"step": 8755
|
|
},
|
|
{
|
|
"entropy": 5.263547420501709,
|
|
"epoch": 0.8414985590778098,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004937395384705004,
|
|
"loss": 5.2141,
|
|
"mean_token_accuracy": 0.19887446761131286,
|
|
"num_tokens": 20088195.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 5.211323404312134,
|
|
"epoch": 0.8419788664745437,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004937315049137089,
|
|
"loss": 5.0603,
|
|
"mean_token_accuracy": 0.21020377576351165,
|
|
"num_tokens": 20098576.0,
|
|
"step": 8765
|
|
},
|
|
{
|
|
"entropy": 5.343924474716187,
|
|
"epoch": 0.8424591738712777,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004937234662786216,
|
|
"loss": 5.2761,
|
|
"mean_token_accuracy": 0.19456166923046112,
|
|
"num_tokens": 20110176.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 5.3267858028411865,
|
|
"epoch": 0.8429394812680115,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004937154225654246,
|
|
"loss": 5.18,
|
|
"mean_token_accuracy": 0.20727563351392747,
|
|
"num_tokens": 20121713.0,
|
|
"step": 8775
|
|
},
|
|
{
|
|
"entropy": 5.364311075210571,
|
|
"epoch": 0.8434197886647454,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004937073737743051,
|
|
"loss": 5.2787,
|
|
"mean_token_accuracy": 0.19634425789117813,
|
|
"num_tokens": 20134126.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 5.2509232521057125,
|
|
"epoch": 0.8439000960614793,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004936993199054499,
|
|
"loss": 5.1719,
|
|
"mean_token_accuracy": 0.20614974945783615,
|
|
"num_tokens": 20144791.0,
|
|
"step": 8785
|
|
},
|
|
{
|
|
"entropy": 5.353460693359375,
|
|
"epoch": 0.8443804034582133,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004936912609590458,
|
|
"loss": 5.3211,
|
|
"mean_token_accuracy": 0.19727177768945695,
|
|
"num_tokens": 20157214.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 5.415266323089599,
|
|
"epoch": 0.8448607108549472,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00049368319693528,
|
|
"loss": 5.3047,
|
|
"mean_token_accuracy": 0.18851037174463273,
|
|
"num_tokens": 20168952.0,
|
|
"step": 8795
|
|
},
|
|
{
|
|
"entropy": 5.313450288772583,
|
|
"epoch": 0.8453410182516811,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004936751278343397,
|
|
"loss": 5.1884,
|
|
"mean_token_accuracy": 0.1982526332139969,
|
|
"num_tokens": 20181829.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 5.392148017883301,
|
|
"epoch": 0.845821325648415,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004936670536564123,
|
|
"loss": 5.3346,
|
|
"mean_token_accuracy": 0.18645845502614974,
|
|
"num_tokens": 20193362.0,
|
|
"step": 8805
|
|
},
|
|
{
|
|
"entropy": 5.310501289367676,
|
|
"epoch": 0.8463016330451489,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004936589744016853,
|
|
"loss": 5.2495,
|
|
"mean_token_accuracy": 0.1979646310210228,
|
|
"num_tokens": 20205159.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 5.309202194213867,
|
|
"epoch": 0.8467819404418828,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000493650890070346,
|
|
"loss": 5.134,
|
|
"mean_token_accuracy": 0.21007836610078812,
|
|
"num_tokens": 20217075.0,
|
|
"step": 8815
|
|
},
|
|
{
|
|
"entropy": 5.284643697738647,
|
|
"epoch": 0.8472622478386167,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004936428006625824,
|
|
"loss": 5.145,
|
|
"mean_token_accuracy": 0.20609464943408967,
|
|
"num_tokens": 20227901.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 5.279466247558593,
|
|
"epoch": 0.8477425552353506,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004936347061785823,
|
|
"loss": 5.1655,
|
|
"mean_token_accuracy": 0.20799711346626282,
|
|
"num_tokens": 20239945.0,
|
|
"step": 8825
|
|
},
|
|
{
|
|
"entropy": 5.381019020080567,
|
|
"epoch": 0.8482228626320846,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004936266066185334,
|
|
"loss": 5.2448,
|
|
"mean_token_accuracy": 0.19857099950313567,
|
|
"num_tokens": 20251503.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 5.273257160186768,
|
|
"epoch": 0.8487031700288185,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004936185019826239,
|
|
"loss": 5.1679,
|
|
"mean_token_accuracy": 0.20055765956640242,
|
|
"num_tokens": 20263044.0,
|
|
"step": 8835
|
|
},
|
|
{
|
|
"entropy": 5.303754806518555,
|
|
"epoch": 0.8491834774255523,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004936103922710419,
|
|
"loss": 5.1429,
|
|
"mean_token_accuracy": 0.20382609218358994,
|
|
"num_tokens": 20273563.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 5.319860410690308,
|
|
"epoch": 0.8496637848222862,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004936022774839759,
|
|
"loss": 5.1983,
|
|
"mean_token_accuracy": 0.20010559260845184,
|
|
"num_tokens": 20285482.0,
|
|
"step": 8845
|
|
},
|
|
{
|
|
"entropy": 5.3607221126556395,
|
|
"epoch": 0.8501440922190202,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004935941576216141,
|
|
"loss": 5.4022,
|
|
"mean_token_accuracy": 0.19553606808185578,
|
|
"num_tokens": 20296666.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 5.242348289489746,
|
|
"epoch": 0.8506243996157541,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000493586032684145,
|
|
"loss": 5.0657,
|
|
"mean_token_accuracy": 0.20746321827173234,
|
|
"num_tokens": 20308978.0,
|
|
"step": 8855
|
|
},
|
|
{
|
|
"entropy": 5.249849462509156,
|
|
"epoch": 0.851104707012488,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004935779026717573,
|
|
"loss": 5.1432,
|
|
"mean_token_accuracy": 0.20203766077756882,
|
|
"num_tokens": 20321488.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 5.2123401165008545,
|
|
"epoch": 0.8515850144092219,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004935697675846396,
|
|
"loss": 4.9968,
|
|
"mean_token_accuracy": 0.21457493007183076,
|
|
"num_tokens": 20332938.0,
|
|
"step": 8865
|
|
},
|
|
{
|
|
"entropy": 5.208367538452149,
|
|
"epoch": 0.8520653218059558,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004935616274229811,
|
|
"loss": 5.0981,
|
|
"mean_token_accuracy": 0.21241182535886766,
|
|
"num_tokens": 20342986.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 5.260292434692383,
|
|
"epoch": 0.8525456292026897,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004935534821869705,
|
|
"loss": 5.1548,
|
|
"mean_token_accuracy": 0.19940277189016342,
|
|
"num_tokens": 20355791.0,
|
|
"step": 8875
|
|
},
|
|
{
|
|
"entropy": 5.18192982673645,
|
|
"epoch": 0.8530259365994236,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004935453318767971,
|
|
"loss": 5.0285,
|
|
"mean_token_accuracy": 0.21193305552005767,
|
|
"num_tokens": 20367080.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 5.33548674583435,
|
|
"epoch": 0.8535062439961575,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00049353717649265,
|
|
"loss": 5.1438,
|
|
"mean_token_accuracy": 0.2096237510442734,
|
|
"num_tokens": 20376697.0,
|
|
"step": 8885
|
|
},
|
|
{
|
|
"entropy": 5.283786773681641,
|
|
"epoch": 0.8539865513928915,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004935290160347185,
|
|
"loss": 5.1626,
|
|
"mean_token_accuracy": 0.20003535747528076,
|
|
"num_tokens": 20387430.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 5.23840708732605,
|
|
"epoch": 0.8544668587896254,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004935208505031922,
|
|
"loss": 5.1485,
|
|
"mean_token_accuracy": 0.20197722762823106,
|
|
"num_tokens": 20398488.0,
|
|
"step": 8895
|
|
},
|
|
{
|
|
"entropy": 5.174388408660889,
|
|
"epoch": 0.8549471661863592,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004935126798982606,
|
|
"loss": 5.1712,
|
|
"mean_token_accuracy": 0.19997829645872117,
|
|
"num_tokens": 20410316.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 5.2989085674285885,
|
|
"epoch": 0.8554274735830932,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004935045042201135,
|
|
"loss": 5.0965,
|
|
"mean_token_accuracy": 0.2099878177046776,
|
|
"num_tokens": 20421645.0,
|
|
"step": 8905
|
|
},
|
|
{
|
|
"entropy": 5.297290563583374,
|
|
"epoch": 0.8559077809798271,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004934963234689407,
|
|
"loss": 5.2136,
|
|
"mean_token_accuracy": 0.19259357154369355,
|
|
"num_tokens": 20433397.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 5.210119295120239,
|
|
"epoch": 0.856388088376561,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000493488137644932,
|
|
"loss": 5.1128,
|
|
"mean_token_accuracy": 0.2082274630665779,
|
|
"num_tokens": 20446218.0,
|
|
"step": 8915
|
|
},
|
|
{
|
|
"entropy": 5.327347612380981,
|
|
"epoch": 0.8568683957732949,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004934799467482774,
|
|
"loss": 5.2096,
|
|
"mean_token_accuracy": 0.2002415493130684,
|
|
"num_tokens": 20457265.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 5.317370796203614,
|
|
"epoch": 0.8573487031700289,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004934717507791673,
|
|
"loss": 5.2003,
|
|
"mean_token_accuracy": 0.19859042763710022,
|
|
"num_tokens": 20468748.0,
|
|
"step": 8925
|
|
},
|
|
{
|
|
"entropy": 5.320759439468384,
|
|
"epoch": 0.8578290105667628,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004934635497377919,
|
|
"loss": 5.2082,
|
|
"mean_token_accuracy": 0.19929444640874863,
|
|
"num_tokens": 20481416.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 5.236276292800904,
|
|
"epoch": 0.8583093179634966,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004934553436243415,
|
|
"loss": 5.1091,
|
|
"mean_token_accuracy": 0.20469743758440018,
|
|
"num_tokens": 20493063.0,
|
|
"step": 8935
|
|
},
|
|
{
|
|
"entropy": 5.221277904510498,
|
|
"epoch": 0.8587896253602305,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004934471324390067,
|
|
"loss": 5.1152,
|
|
"mean_token_accuracy": 0.2038355737924576,
|
|
"num_tokens": 20504000.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 5.267146587371826,
|
|
"epoch": 0.8592699327569645,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004934389161819783,
|
|
"loss": 5.2179,
|
|
"mean_token_accuracy": 0.20483950674533843,
|
|
"num_tokens": 20516419.0,
|
|
"step": 8945
|
|
},
|
|
{
|
|
"entropy": 5.273205709457398,
|
|
"epoch": 0.8597502401536984,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004934306948534467,
|
|
"loss": 5.1552,
|
|
"mean_token_accuracy": 0.19798202067613602,
|
|
"num_tokens": 20527385.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 5.291792201995849,
|
|
"epoch": 0.8602305475504323,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004934224684536031,
|
|
"loss": 5.1449,
|
|
"mean_token_accuracy": 0.2052535355091095,
|
|
"num_tokens": 20538051.0,
|
|
"step": 8955
|
|
},
|
|
{
|
|
"entropy": 5.321442127227783,
|
|
"epoch": 0.8607108549471661,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004934142369826382,
|
|
"loss": 5.1746,
|
|
"mean_token_accuracy": 0.19756327718496322,
|
|
"num_tokens": 20550321.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 5.309886932373047,
|
|
"epoch": 0.8611911623439001,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004934060004407434,
|
|
"loss": 5.1616,
|
|
"mean_token_accuracy": 0.2022399291396141,
|
|
"num_tokens": 20561229.0,
|
|
"step": 8965
|
|
},
|
|
{
|
|
"entropy": 5.2354882717132565,
|
|
"epoch": 0.861671469740634,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004933977588281099,
|
|
"loss": 5.1065,
|
|
"mean_token_accuracy": 0.20714430809020995,
|
|
"num_tokens": 20572040.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 5.225004816055298,
|
|
"epoch": 0.8621517771373679,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004933895121449288,
|
|
"loss": 5.1445,
|
|
"mean_token_accuracy": 0.20627815425395965,
|
|
"num_tokens": 20583110.0,
|
|
"step": 8975
|
|
},
|
|
{
|
|
"entropy": 5.25708327293396,
|
|
"epoch": 0.8626320845341018,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004933812603913917,
|
|
"loss": 5.1451,
|
|
"mean_token_accuracy": 0.20151159167289734,
|
|
"num_tokens": 20593647.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 5.185816431045533,
|
|
"epoch": 0.8631123919308358,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004933730035676903,
|
|
"loss": 5.0209,
|
|
"mean_token_accuracy": 0.21777433753013611,
|
|
"num_tokens": 20604428.0,
|
|
"step": 8985
|
|
},
|
|
{
|
|
"entropy": 5.165512609481811,
|
|
"epoch": 0.8635926993275697,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004933647416740161,
|
|
"loss": 5.0746,
|
|
"mean_token_accuracy": 0.21601256728172302,
|
|
"num_tokens": 20615811.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 5.30065655708313,
|
|
"epoch": 0.8640730067243035,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000493356474710561,
|
|
"loss": 5.1769,
|
|
"mean_token_accuracy": 0.20163519084453582,
|
|
"num_tokens": 20627679.0,
|
|
"step": 8995
|
|
},
|
|
{
|
|
"entropy": 5.325274658203125,
|
|
"epoch": 0.8645533141210374,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000493348202677517,
|
|
"loss": 5.2067,
|
|
"mean_token_accuracy": 0.20573359727859497,
|
|
"num_tokens": 20638879.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.8645533141210374,
|
|
"eval_entropy": 5.134741041033748,
|
|
"eval_loss": 5.199076175689697,
|
|
"eval_mean_token_accuracy": 0.20850473279537574,
|
|
"eval_num_tokens": 20638879.0,
|
|
"eval_runtime": 26.7295,
|
|
"eval_samples_per_second": 1227.669,
|
|
"eval_steps_per_second": 153.463,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"entropy": 5.26629228591919,
|
|
"epoch": 0.8650336215177714,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004933399255750761,
|
|
"loss": 5.1956,
|
|
"mean_token_accuracy": 0.20548682659864426,
|
|
"num_tokens": 20649729.0,
|
|
"step": 9005
|
|
},
|
|
{
|
|
"entropy": 5.366102600097657,
|
|
"epoch": 0.8655139289145053,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004933316434034304,
|
|
"loss": 5.2311,
|
|
"mean_token_accuracy": 0.20523984879255294,
|
|
"num_tokens": 20660473.0,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"entropy": 5.326435089111328,
|
|
"epoch": 0.8659942363112392,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004933233561627723,
|
|
"loss": 5.1972,
|
|
"mean_token_accuracy": 0.2020814150571823,
|
|
"num_tokens": 20671776.0,
|
|
"step": 9015
|
|
},
|
|
{
|
|
"entropy": 5.251844644546509,
|
|
"epoch": 0.866474543707973,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004933150638532942,
|
|
"loss": 5.1605,
|
|
"mean_token_accuracy": 0.2062242418527603,
|
|
"num_tokens": 20684147.0,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"entropy": 5.3002519607543945,
|
|
"epoch": 0.866954851104707,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004933067664751885,
|
|
"loss": 5.1469,
|
|
"mean_token_accuracy": 0.20623468309640886,
|
|
"num_tokens": 20695248.0,
|
|
"step": 9025
|
|
},
|
|
{
|
|
"entropy": 5.244437265396118,
|
|
"epoch": 0.8674351585014409,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000493298464028648,
|
|
"loss": 5.168,
|
|
"mean_token_accuracy": 0.2045750394463539,
|
|
"num_tokens": 20707772.0,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"entropy": 5.263054895401001,
|
|
"epoch": 0.8679154658981748,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004932901565138653,
|
|
"loss": 5.1264,
|
|
"mean_token_accuracy": 0.1987837016582489,
|
|
"num_tokens": 20718813.0,
|
|
"step": 9035
|
|
},
|
|
{
|
|
"entropy": 5.2314427375793455,
|
|
"epoch": 0.8683957732949087,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004932818439310334,
|
|
"loss": 5.1175,
|
|
"mean_token_accuracy": 0.2132244125008583,
|
|
"num_tokens": 20730939.0,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"entropy": 5.376590538024902,
|
|
"epoch": 0.8688760806916427,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004932735262803452,
|
|
"loss": 5.2384,
|
|
"mean_token_accuracy": 0.1961486503481865,
|
|
"num_tokens": 20742990.0,
|
|
"step": 9045
|
|
},
|
|
{
|
|
"entropy": 5.2218879699707035,
|
|
"epoch": 0.8693563880883766,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004932652035619939,
|
|
"loss": 5.0787,
|
|
"mean_token_accuracy": 0.2043047398328781,
|
|
"num_tokens": 20754076.0,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"entropy": 5.269600582122803,
|
|
"epoch": 0.8698366954851104,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004932568757761727,
|
|
"loss": 5.1352,
|
|
"mean_token_accuracy": 0.20455852448940276,
|
|
"num_tokens": 20765538.0,
|
|
"step": 9055
|
|
},
|
|
{
|
|
"entropy": 5.2922038555145265,
|
|
"epoch": 0.8703170028818443,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004932485429230748,
|
|
"loss": 5.1899,
|
|
"mean_token_accuracy": 0.19730205535888673,
|
|
"num_tokens": 20776359.0,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"entropy": 5.234628915786743,
|
|
"epoch": 0.8707973102785783,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000493240205002894,
|
|
"loss": 5.1682,
|
|
"mean_token_accuracy": 0.2084574043750763,
|
|
"num_tokens": 20787581.0,
|
|
"step": 9065
|
|
},
|
|
{
|
|
"entropy": 5.34184308052063,
|
|
"epoch": 0.8712776176753122,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004932318620158235,
|
|
"loss": 5.2041,
|
|
"mean_token_accuracy": 0.19864192605018616,
|
|
"num_tokens": 20799904.0,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"entropy": 5.319941759109497,
|
|
"epoch": 0.8717579250720461,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004932235139620574,
|
|
"loss": 5.1384,
|
|
"mean_token_accuracy": 0.20746065229177474,
|
|
"num_tokens": 20810238.0,
|
|
"step": 9075
|
|
},
|
|
{
|
|
"entropy": 5.2344482898712155,
|
|
"epoch": 0.8722382324687801,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004932151608417892,
|
|
"loss": 5.0957,
|
|
"mean_token_accuracy": 0.20455455929040908,
|
|
"num_tokens": 20821349.0,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"entropy": 5.202734899520874,
|
|
"epoch": 0.872718539865514,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004932068026552127,
|
|
"loss": 5.1513,
|
|
"mean_token_accuracy": 0.20545032173395156,
|
|
"num_tokens": 20834788.0,
|
|
"step": 9085
|
|
},
|
|
{
|
|
"entropy": 5.319971227645874,
|
|
"epoch": 0.8731988472622478,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004931984394025224,
|
|
"loss": 5.2178,
|
|
"mean_token_accuracy": 0.20217667371034623,
|
|
"num_tokens": 20845571.0,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"entropy": 5.311048793792724,
|
|
"epoch": 0.8736791546589817,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004931900710839123,
|
|
"loss": 5.1952,
|
|
"mean_token_accuracy": 0.19956784099340438,
|
|
"num_tokens": 20857209.0,
|
|
"step": 9095
|
|
},
|
|
{
|
|
"entropy": 5.329868745803833,
|
|
"epoch": 0.8741594620557157,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004931816976995766,
|
|
"loss": 5.2614,
|
|
"mean_token_accuracy": 0.19563933461904526,
|
|
"num_tokens": 20870624.0,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"entropy": 5.286147880554199,
|
|
"epoch": 0.8746397694524496,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004931733192497097,
|
|
"loss": 5.1638,
|
|
"mean_token_accuracy": 0.20552606284618377,
|
|
"num_tokens": 20881769.0,
|
|
"step": 9105
|
|
},
|
|
{
|
|
"entropy": 5.256227636337281,
|
|
"epoch": 0.8751200768491835,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004931649357345062,
|
|
"loss": 5.1336,
|
|
"mean_token_accuracy": 0.19931492060422898,
|
|
"num_tokens": 20892817.0,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"entropy": 5.266511297225952,
|
|
"epoch": 0.8756003842459174,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004931565471541606,
|
|
"loss": 5.0994,
|
|
"mean_token_accuracy": 0.20882656574249267,
|
|
"num_tokens": 20903042.0,
|
|
"step": 9115
|
|
},
|
|
{
|
|
"entropy": 5.197468280792236,
|
|
"epoch": 0.8760806916426513,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004931481535088679,
|
|
"loss": 5.0548,
|
|
"mean_token_accuracy": 0.2176084190607071,
|
|
"num_tokens": 20914684.0,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"entropy": 5.21255669593811,
|
|
"epoch": 0.8765609990393852,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004931397547988229,
|
|
"loss": 5.1169,
|
|
"mean_token_accuracy": 0.21571636497974395,
|
|
"num_tokens": 20926585.0,
|
|
"step": 9125
|
|
},
|
|
{
|
|
"entropy": 5.315608882904053,
|
|
"epoch": 0.8770413064361191,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004931313510242204,
|
|
"loss": 5.1677,
|
|
"mean_token_accuracy": 0.2050992101430893,
|
|
"num_tokens": 20939729.0,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"entropy": 5.231499481201172,
|
|
"epoch": 0.877521613832853,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004931229421852557,
|
|
"loss": 5.103,
|
|
"mean_token_accuracy": 0.2057361498475075,
|
|
"num_tokens": 20951697.0,
|
|
"step": 9135
|
|
},
|
|
{
|
|
"entropy": 5.321991300582885,
|
|
"epoch": 0.878001921229587,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000493114528282124,
|
|
"loss": 5.2363,
|
|
"mean_token_accuracy": 0.20253994166851044,
|
|
"num_tokens": 20962729.0,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"entropy": 5.2189311504364015,
|
|
"epoch": 0.8784822286263209,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004931061093150206,
|
|
"loss": 5.0919,
|
|
"mean_token_accuracy": 0.20677362531423568,
|
|
"num_tokens": 20973331.0,
|
|
"step": 9145
|
|
},
|
|
{
|
|
"entropy": 5.188636112213135,
|
|
"epoch": 0.8789625360230547,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004930976852841409,
|
|
"loss": 5.0942,
|
|
"mean_token_accuracy": 0.20331761091947556,
|
|
"num_tokens": 20985609.0,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"entropy": 5.212237691879272,
|
|
"epoch": 0.8794428434197886,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004930892561896806,
|
|
"loss": 5.1191,
|
|
"mean_token_accuracy": 0.19904601573944092,
|
|
"num_tokens": 20997231.0,
|
|
"step": 9155
|
|
},
|
|
{
|
|
"entropy": 5.302338361740112,
|
|
"epoch": 0.8799231508165226,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004930808220318354,
|
|
"loss": 5.1304,
|
|
"mean_token_accuracy": 0.20466675609350204,
|
|
"num_tokens": 21008511.0,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"entropy": 5.251391744613647,
|
|
"epoch": 0.8804034582132565,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004930723828108012,
|
|
"loss": 5.07,
|
|
"mean_token_accuracy": 0.20593566447496414,
|
|
"num_tokens": 21019108.0,
|
|
"step": 9165
|
|
},
|
|
{
|
|
"entropy": 5.1970141410827635,
|
|
"epoch": 0.8808837656099904,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004930639385267736,
|
|
"loss": 5.1312,
|
|
"mean_token_accuracy": 0.20703590363264085,
|
|
"num_tokens": 21030621.0,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"entropy": 5.254196977615356,
|
|
"epoch": 0.8813640730067243,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000493055489179949,
|
|
"loss": 5.106,
|
|
"mean_token_accuracy": 0.20593850463628768,
|
|
"num_tokens": 21041778.0,
|
|
"step": 9175
|
|
},
|
|
{
|
|
"entropy": 5.293916034698486,
|
|
"epoch": 0.8818443804034583,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004930470347705234,
|
|
"loss": 5.1213,
|
|
"mean_token_accuracy": 0.20545565485954284,
|
|
"num_tokens": 21054257.0,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"entropy": 5.25820026397705,
|
|
"epoch": 0.8823246878001921,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000493038575298693,
|
|
"loss": 5.1774,
|
|
"mean_token_accuracy": 0.1998462751507759,
|
|
"num_tokens": 21066378.0,
|
|
"step": 9185
|
|
},
|
|
{
|
|
"entropy": 5.217289543151855,
|
|
"epoch": 0.882804995196926,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004930301107646545,
|
|
"loss": 5.1022,
|
|
"mean_token_accuracy": 0.20249929428100585,
|
|
"num_tokens": 21078913.0,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"entropy": 5.315297651290893,
|
|
"epoch": 0.8832853025936599,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004930216411686042,
|
|
"loss": 5.1549,
|
|
"mean_token_accuracy": 0.1985946238040924,
|
|
"num_tokens": 21090500.0,
|
|
"step": 9195
|
|
},
|
|
{
|
|
"entropy": 5.2111443996429445,
|
|
"epoch": 0.8837656099903939,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004930131665107387,
|
|
"loss": 5.1334,
|
|
"mean_token_accuracy": 0.2010358154773712,
|
|
"num_tokens": 21102793.0,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"entropy": 5.382960557937622,
|
|
"epoch": 0.8842459173871278,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000493004686791255,
|
|
"loss": 5.2304,
|
|
"mean_token_accuracy": 0.19272204041481017,
|
|
"num_tokens": 21114504.0,
|
|
"step": 9205
|
|
},
|
|
{
|
|
"entropy": 5.27924222946167,
|
|
"epoch": 0.8847262247838616,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004929962020103496,
|
|
"loss": 5.1007,
|
|
"mean_token_accuracy": 0.20397736132144928,
|
|
"num_tokens": 21126733.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 5.19653902053833,
|
|
"epoch": 0.8852065321805955,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004929877121682198,
|
|
"loss": 5.0931,
|
|
"mean_token_accuracy": 0.20474224388599396,
|
|
"num_tokens": 21138045.0,
|
|
"step": 9215
|
|
},
|
|
{
|
|
"entropy": 5.260480785369873,
|
|
"epoch": 0.8856868395773295,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004929792172650627,
|
|
"loss": 5.1796,
|
|
"mean_token_accuracy": 0.19880712181329727,
|
|
"num_tokens": 21151562.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 5.315613460540772,
|
|
"epoch": 0.8861671469740634,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004929707173010753,
|
|
"loss": 5.1299,
|
|
"mean_token_accuracy": 0.2056412249803543,
|
|
"num_tokens": 21162943.0,
|
|
"step": 9225
|
|
},
|
|
{
|
|
"entropy": 5.241054391860962,
|
|
"epoch": 0.8866474543707973,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004929622122764552,
|
|
"loss": 5.1699,
|
|
"mean_token_accuracy": 0.2012902170419693,
|
|
"num_tokens": 21174392.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 5.1802393913269045,
|
|
"epoch": 0.8871277617675313,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004929537021913997,
|
|
"loss": 5.008,
|
|
"mean_token_accuracy": 0.21468252092599868,
|
|
"num_tokens": 21185372.0,
|
|
"step": 9235
|
|
},
|
|
{
|
|
"entropy": 5.236097574234009,
|
|
"epoch": 0.8876080691642652,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004929451870461064,
|
|
"loss": 5.1562,
|
|
"mean_token_accuracy": 0.20373494178056717,
|
|
"num_tokens": 21197044.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 5.191161966323852,
|
|
"epoch": 0.888088376560999,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004929366668407731,
|
|
"loss": 5.1047,
|
|
"mean_token_accuracy": 0.20978552401065825,
|
|
"num_tokens": 21207729.0,
|
|
"step": 9245
|
|
},
|
|
{
|
|
"entropy": 5.312554979324341,
|
|
"epoch": 0.8885686839577329,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004929281415755974,
|
|
"loss": 5.1436,
|
|
"mean_token_accuracy": 0.20457518696784974,
|
|
"num_tokens": 21218909.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 5.344946384429932,
|
|
"epoch": 0.8890489913544669,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004929196112507775,
|
|
"loss": 5.2498,
|
|
"mean_token_accuracy": 0.19993363320827484,
|
|
"num_tokens": 21230543.0,
|
|
"step": 9255
|
|
},
|
|
{
|
|
"entropy": 5.270783472061157,
|
|
"epoch": 0.8895292987512008,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004929110758665112,
|
|
"loss": 5.1876,
|
|
"mean_token_accuracy": 0.1981159120798111,
|
|
"num_tokens": 21242064.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 5.326478481292725,
|
|
"epoch": 0.8900096061479347,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004929025354229969,
|
|
"loss": 5.2097,
|
|
"mean_token_accuracy": 0.2005533829331398,
|
|
"num_tokens": 21254321.0,
|
|
"step": 9265
|
|
},
|
|
{
|
|
"entropy": 5.180127668380737,
|
|
"epoch": 0.8904899135446686,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004928939899204326,
|
|
"loss": 5.0312,
|
|
"mean_token_accuracy": 0.20711840987205504,
|
|
"num_tokens": 21264741.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 5.250730323791504,
|
|
"epoch": 0.8909702209414025,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000492885439359017,
|
|
"loss": 5.1329,
|
|
"mean_token_accuracy": 0.20080725252628326,
|
|
"num_tokens": 21276834.0,
|
|
"step": 9275
|
|
},
|
|
{
|
|
"entropy": 5.236090469360351,
|
|
"epoch": 0.8914505283381364,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004928768837389485,
|
|
"loss": 5.0918,
|
|
"mean_token_accuracy": 0.20890207290649415,
|
|
"num_tokens": 21287108.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 5.248825597763061,
|
|
"epoch": 0.8919308357348703,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004928683230604257,
|
|
"loss": 5.1298,
|
|
"mean_token_accuracy": 0.20136982649564744,
|
|
"num_tokens": 21299942.0,
|
|
"step": 9285
|
|
},
|
|
{
|
|
"entropy": 5.365978527069092,
|
|
"epoch": 0.8924111431316042,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004928597573236474,
|
|
"loss": 5.2691,
|
|
"mean_token_accuracy": 0.2037052556872368,
|
|
"num_tokens": 21311243.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 5.274964046478272,
|
|
"epoch": 0.8928914505283382,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004928511865288123,
|
|
"loss": 5.1035,
|
|
"mean_token_accuracy": 0.2068115308880806,
|
|
"num_tokens": 21322291.0,
|
|
"step": 9295
|
|
},
|
|
{
|
|
"entropy": 5.258368492126465,
|
|
"epoch": 0.8933717579250721,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004928426106761197,
|
|
"loss": 5.1982,
|
|
"mean_token_accuracy": 0.20522145330905914,
|
|
"num_tokens": 21333257.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 5.194037771224975,
|
|
"epoch": 0.8938520653218059,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004928340297657685,
|
|
"loss": 5.1119,
|
|
"mean_token_accuracy": 0.20364685207605362,
|
|
"num_tokens": 21345848.0,
|
|
"step": 9305
|
|
},
|
|
{
|
|
"entropy": 5.306222867965698,
|
|
"epoch": 0.8943323727185398,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004928254437979578,
|
|
"loss": 5.1371,
|
|
"mean_token_accuracy": 0.2047370731830597,
|
|
"num_tokens": 21357693.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 5.277711868286133,
|
|
"epoch": 0.8948126801152738,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004928168527728873,
|
|
"loss": 5.229,
|
|
"mean_token_accuracy": 0.20137819200754165,
|
|
"num_tokens": 21369653.0,
|
|
"step": 9315
|
|
},
|
|
{
|
|
"entropy": 5.314446830749512,
|
|
"epoch": 0.8952929875120077,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004928082566907562,
|
|
"loss": 5.1813,
|
|
"mean_token_accuracy": 0.20231199115514756,
|
|
"num_tokens": 21383924.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 5.318646097183228,
|
|
"epoch": 0.8957732949087416,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004927996555517642,
|
|
"loss": 5.156,
|
|
"mean_token_accuracy": 0.1997460052371025,
|
|
"num_tokens": 21395963.0,
|
|
"step": 9325
|
|
},
|
|
{
|
|
"entropy": 5.2909129619598385,
|
|
"epoch": 0.8962536023054755,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004927910493561109,
|
|
"loss": 5.1562,
|
|
"mean_token_accuracy": 0.20208995938301086,
|
|
"num_tokens": 21408200.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 5.299256086349487,
|
|
"epoch": 0.8967339097022095,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000492782438103996,
|
|
"loss": 5.26,
|
|
"mean_token_accuracy": 0.1976392984390259,
|
|
"num_tokens": 21419963.0,
|
|
"step": 9335
|
|
},
|
|
{
|
|
"entropy": 5.280540561676025,
|
|
"epoch": 0.8972142170989433,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004927738217956197,
|
|
"loss": 5.2154,
|
|
"mean_token_accuracy": 0.20124684274196625,
|
|
"num_tokens": 21431824.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 5.220011901855469,
|
|
"epoch": 0.8976945244956772,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004927652004311819,
|
|
"loss": 4.9671,
|
|
"mean_token_accuracy": 0.21261375546455383,
|
|
"num_tokens": 21442354.0,
|
|
"step": 9345
|
|
},
|
|
{
|
|
"entropy": 5.272494840621948,
|
|
"epoch": 0.8981748318924111,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004927565740108828,
|
|
"loss": 5.1539,
|
|
"mean_token_accuracy": 0.19939538985490798,
|
|
"num_tokens": 21453734.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 5.264281797409057,
|
|
"epoch": 0.8986551392891451,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004927479425349226,
|
|
"loss": 5.1664,
|
|
"mean_token_accuracy": 0.20830067843198777,
|
|
"num_tokens": 21465471.0,
|
|
"step": 9355
|
|
},
|
|
{
|
|
"entropy": 5.347072267532349,
|
|
"epoch": 0.899135446685879,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004927393060035018,
|
|
"loss": 5.3012,
|
|
"mean_token_accuracy": 0.19275195002555848,
|
|
"num_tokens": 21477775.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 5.256478118896484,
|
|
"epoch": 0.8996157540826129,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004927306644168207,
|
|
"loss": 5.0715,
|
|
"mean_token_accuracy": 0.2134536847472191,
|
|
"num_tokens": 21489319.0,
|
|
"step": 9365
|
|
},
|
|
{
|
|
"entropy": 5.3430397510528564,
|
|
"epoch": 0.9000960614793467,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004927220177750803,
|
|
"loss": 5.2993,
|
|
"mean_token_accuracy": 0.20141739547252654,
|
|
"num_tokens": 21499742.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 5.2615800380706785,
|
|
"epoch": 0.9005763688760807,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004927133660784811,
|
|
"loss": 5.0778,
|
|
"mean_token_accuracy": 0.20828621387481688,
|
|
"num_tokens": 21511063.0,
|
|
"step": 9375
|
|
},
|
|
{
|
|
"entropy": 5.2883483409881595,
|
|
"epoch": 0.9010566762728146,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004927047093272241,
|
|
"loss": 5.0993,
|
|
"mean_token_accuracy": 0.2080937907099724,
|
|
"num_tokens": 21522500.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 5.292206716537476,
|
|
"epoch": 0.9015369836695485,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00049269604752151,
|
|
"loss": 5.183,
|
|
"mean_token_accuracy": 0.19954841285943986,
|
|
"num_tokens": 21533578.0,
|
|
"step": 9385
|
|
},
|
|
{
|
|
"entropy": 5.2589469909667965,
|
|
"epoch": 0.9020172910662824,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004926873806615403,
|
|
"loss": 5.1761,
|
|
"mean_token_accuracy": 0.2022814229130745,
|
|
"num_tokens": 21544296.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 5.285726165771484,
|
|
"epoch": 0.9024975984630164,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004926787087475158,
|
|
"loss": 5.2485,
|
|
"mean_token_accuracy": 0.19234858453273773,
|
|
"num_tokens": 21555386.0,
|
|
"step": 9395
|
|
},
|
|
{
|
|
"entropy": 5.2691357135772705,
|
|
"epoch": 0.9029779058597502,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004926700317796382,
|
|
"loss": 5.0119,
|
|
"mean_token_accuracy": 0.2185451105237007,
|
|
"num_tokens": 21566527.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 5.316603708267212,
|
|
"epoch": 0.9034582132564841,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004926613497581088,
|
|
"loss": 5.1657,
|
|
"mean_token_accuracy": 0.19770514070987702,
|
|
"num_tokens": 21576870.0,
|
|
"step": 9405
|
|
},
|
|
{
|
|
"entropy": 5.246594953536987,
|
|
"epoch": 0.9039385206532181,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004926526626831292,
|
|
"loss": 5.1326,
|
|
"mean_token_accuracy": 0.20468196123838425,
|
|
"num_tokens": 21588113.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 5.279461526870728,
|
|
"epoch": 0.904418828049952,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004926439705549011,
|
|
"loss": 5.1535,
|
|
"mean_token_accuracy": 0.2016696736216545,
|
|
"num_tokens": 21599307.0,
|
|
"step": 9415
|
|
},
|
|
{
|
|
"entropy": 5.25780029296875,
|
|
"epoch": 0.9048991354466859,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004926352733736262,
|
|
"loss": 5.1166,
|
|
"mean_token_accuracy": 0.2065201461315155,
|
|
"num_tokens": 21609961.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 5.2240455627441404,
|
|
"epoch": 0.9053794428434198,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004926265711395065,
|
|
"loss": 5.1391,
|
|
"mean_token_accuracy": 0.2021078497171402,
|
|
"num_tokens": 21622222.0,
|
|
"step": 9425
|
|
},
|
|
{
|
|
"entropy": 5.2489923477172855,
|
|
"epoch": 0.9058597502401537,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000492617863852744,
|
|
"loss": 5.1228,
|
|
"mean_token_accuracy": 0.21097581535577775,
|
|
"num_tokens": 21632843.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 5.273687887191772,
|
|
"epoch": 0.9063400576368876,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004926091515135409,
|
|
"loss": 5.1694,
|
|
"mean_token_accuracy": 0.20245194882154466,
|
|
"num_tokens": 21645387.0,
|
|
"step": 9435
|
|
},
|
|
{
|
|
"entropy": 5.272740983963013,
|
|
"epoch": 0.9068203650336215,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004926004341220995,
|
|
"loss": 5.1151,
|
|
"mean_token_accuracy": 0.20472093671560287,
|
|
"num_tokens": 21656787.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 5.286762046813965,
|
|
"epoch": 0.9073006724303554,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004925917116786222,
|
|
"loss": 5.1467,
|
|
"mean_token_accuracy": 0.202509106695652,
|
|
"num_tokens": 21667800.0,
|
|
"step": 9445
|
|
},
|
|
{
|
|
"entropy": 5.323235177993775,
|
|
"epoch": 0.9077809798270894,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004925829841833114,
|
|
"loss": 5.2022,
|
|
"mean_token_accuracy": 0.1957914039492607,
|
|
"num_tokens": 21679297.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 5.2604146003723145,
|
|
"epoch": 0.9082612872238233,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004925742516363699,
|
|
"loss": 5.2104,
|
|
"mean_token_accuracy": 0.19677306711673737,
|
|
"num_tokens": 21692956.0,
|
|
"step": 9455
|
|
},
|
|
{
|
|
"entropy": 5.239795923233032,
|
|
"epoch": 0.9087415946205571,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004925655140380002,
|
|
"loss": 4.9955,
|
|
"mean_token_accuracy": 0.20952331125736237,
|
|
"num_tokens": 21704852.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 5.240779304504395,
|
|
"epoch": 0.909221902017291,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004925567713884054,
|
|
"loss": 5.1345,
|
|
"mean_token_accuracy": 0.20403669029474258,
|
|
"num_tokens": 21715886.0,
|
|
"step": 9465
|
|
},
|
|
{
|
|
"entropy": 5.256079244613647,
|
|
"epoch": 0.909702209414025,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004925480236877884,
|
|
"loss": 5.126,
|
|
"mean_token_accuracy": 0.20910231918096542,
|
|
"num_tokens": 21727157.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 5.261584663391114,
|
|
"epoch": 0.9101825168107589,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004925392709363522,
|
|
"loss": 5.1387,
|
|
"mean_token_accuracy": 0.20324090272188186,
|
|
"num_tokens": 21738232.0,
|
|
"step": 9475
|
|
},
|
|
{
|
|
"entropy": 5.268222141265869,
|
|
"epoch": 0.9106628242074928,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004925305131343001,
|
|
"loss": 5.1774,
|
|
"mean_token_accuracy": 0.20176736861467362,
|
|
"num_tokens": 21749029.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 5.357953786849976,
|
|
"epoch": 0.9111431316042267,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004925217502818355,
|
|
"loss": 5.1727,
|
|
"mean_token_accuracy": 0.1988372653722763,
|
|
"num_tokens": 21761243.0,
|
|
"step": 9485
|
|
},
|
|
{
|
|
"entropy": 5.250569820404053,
|
|
"epoch": 0.9116234390009607,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004925129823791616,
|
|
"loss": 5.1391,
|
|
"mean_token_accuracy": 0.20034718960523606,
|
|
"num_tokens": 21772623.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 5.282710075378418,
|
|
"epoch": 0.9121037463976945,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004925042094264822,
|
|
"loss": 5.1644,
|
|
"mean_token_accuracy": 0.20180542021989822,
|
|
"num_tokens": 21782611.0,
|
|
"step": 9495
|
|
},
|
|
{
|
|
"entropy": 5.176083374023437,
|
|
"epoch": 0.9125840537944284,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000492495431424001,
|
|
"loss": 5.0361,
|
|
"mean_token_accuracy": 0.210744047164917,
|
|
"num_tokens": 21793946.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 5.317784595489502,
|
|
"epoch": 0.9130643611911623,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004924866483719216,
|
|
"loss": 5.2217,
|
|
"mean_token_accuracy": 0.18893510699272156,
|
|
"num_tokens": 21803878.0,
|
|
"step": 9505
|
|
},
|
|
{
|
|
"entropy": 5.291093206405639,
|
|
"epoch": 0.9135446685878963,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004924778602704481,
|
|
"loss": 5.1693,
|
|
"mean_token_accuracy": 0.20558474063873292,
|
|
"num_tokens": 21815187.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 5.253582382202149,
|
|
"epoch": 0.9140249759846302,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004924690671197845,
|
|
"loss": 5.1219,
|
|
"mean_token_accuracy": 0.21276892423629762,
|
|
"num_tokens": 21825597.0,
|
|
"step": 9515
|
|
},
|
|
{
|
|
"entropy": 5.222238779067993,
|
|
"epoch": 0.914505283381364,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004924602689201348,
|
|
"loss": 5.1079,
|
|
"mean_token_accuracy": 0.2087915927171707,
|
|
"num_tokens": 21837110.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 5.478323316574096,
|
|
"epoch": 0.9149855907780979,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004924514656717034,
|
|
"loss": 5.3741,
|
|
"mean_token_accuracy": 0.19211723804473876,
|
|
"num_tokens": 21847754.0,
|
|
"step": 9525
|
|
},
|
|
{
|
|
"entropy": 5.294663047790527,
|
|
"epoch": 0.9154658981748319,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004924426573746948,
|
|
"loss": 5.1594,
|
|
"mean_token_accuracy": 0.20195448100566865,
|
|
"num_tokens": 21859162.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 5.239957857131958,
|
|
"epoch": 0.9159462055715658,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004924338440293131,
|
|
"loss": 5.1104,
|
|
"mean_token_accuracy": 0.20837367475032806,
|
|
"num_tokens": 21870826.0,
|
|
"step": 9535
|
|
},
|
|
{
|
|
"entropy": 5.256366109848022,
|
|
"epoch": 0.9164265129682997,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004924250256357635,
|
|
"loss": 5.1534,
|
|
"mean_token_accuracy": 0.19792882353067398,
|
|
"num_tokens": 21882003.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 5.215576648712158,
|
|
"epoch": 0.9169068203650336,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004924162021942502,
|
|
"loss": 5.097,
|
|
"mean_token_accuracy": 0.2071886330842972,
|
|
"num_tokens": 21894132.0,
|
|
"step": 9545
|
|
},
|
|
{
|
|
"entropy": 5.181234216690063,
|
|
"epoch": 0.9173871277617676,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004924073737049784,
|
|
"loss": 5.1089,
|
|
"mean_token_accuracy": 0.21016984134912492,
|
|
"num_tokens": 21904951.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 5.26510066986084,
|
|
"epoch": 0.9178674351585014,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004923985401681528,
|
|
"loss": 5.1376,
|
|
"mean_token_accuracy": 0.20959776937961577,
|
|
"num_tokens": 21917100.0,
|
|
"step": 9555
|
|
},
|
|
{
|
|
"entropy": 5.28123664855957,
|
|
"epoch": 0.9183477425552353,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004923897015839788,
|
|
"loss": 5.1579,
|
|
"mean_token_accuracy": 0.1991439864039421,
|
|
"num_tokens": 21927541.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 5.2830277442932125,
|
|
"epoch": 0.9188280499519692,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004923808579526613,
|
|
"loss": 5.0914,
|
|
"mean_token_accuracy": 0.20503710806369782,
|
|
"num_tokens": 21938723.0,
|
|
"step": 9565
|
|
},
|
|
{
|
|
"entropy": 5.24866738319397,
|
|
"epoch": 0.9193083573487032,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004923720092744059,
|
|
"loss": 5.0354,
|
|
"mean_token_accuracy": 0.21492197811603547,
|
|
"num_tokens": 21950424.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 5.25103907585144,
|
|
"epoch": 0.9197886647454371,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004923631555494179,
|
|
"loss": 5.1937,
|
|
"mean_token_accuracy": 0.2016189157962799,
|
|
"num_tokens": 21961030.0,
|
|
"step": 9575
|
|
},
|
|
{
|
|
"entropy": 5.231373453140259,
|
|
"epoch": 0.920268972142171,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004923542967779028,
|
|
"loss": 5.0957,
|
|
"mean_token_accuracy": 0.20799438655376434,
|
|
"num_tokens": 21971625.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 5.300740003585815,
|
|
"epoch": 0.920749279538905,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004923454329600664,
|
|
"loss": 5.1185,
|
|
"mean_token_accuracy": 0.20712572187185288,
|
|
"num_tokens": 21983733.0,
|
|
"step": 9585
|
|
},
|
|
{
|
|
"entropy": 5.19854097366333,
|
|
"epoch": 0.9212295869356388,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004923365640961143,
|
|
"loss": 5.0651,
|
|
"mean_token_accuracy": 0.21446898579597473,
|
|
"num_tokens": 21995621.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 5.249282026290894,
|
|
"epoch": 0.9217098943323727,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004923276901862526,
|
|
"loss": 5.1486,
|
|
"mean_token_accuracy": 0.20122848600149154,
|
|
"num_tokens": 22007325.0,
|
|
"step": 9595
|
|
},
|
|
{
|
|
"entropy": 5.247177934646606,
|
|
"epoch": 0.9221902017291066,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004923188112306874,
|
|
"loss": 5.1148,
|
|
"mean_token_accuracy": 0.2028706982731819,
|
|
"num_tokens": 22017733.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 5.297493267059326,
|
|
"epoch": 0.9226705091258406,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004923099272296246,
|
|
"loss": 5.227,
|
|
"mean_token_accuracy": 0.1984498158097267,
|
|
"num_tokens": 22030451.0,
|
|
"step": 9605
|
|
},
|
|
{
|
|
"entropy": 5.293121433258056,
|
|
"epoch": 0.9231508165225745,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004923010381832706,
|
|
"loss": 5.1655,
|
|
"mean_token_accuracy": 0.1920482635498047,
|
|
"num_tokens": 22042626.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 5.263902759552002,
|
|
"epoch": 0.9236311239193083,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004922921440918318,
|
|
"loss": 5.1479,
|
|
"mean_token_accuracy": 0.20066307634115219,
|
|
"num_tokens": 22053314.0,
|
|
"step": 9615
|
|
},
|
|
{
|
|
"entropy": 5.3540332317352295,
|
|
"epoch": 0.9241114313160422,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004922832449555144,
|
|
"loss": 5.2321,
|
|
"mean_token_accuracy": 0.19173655807971954,
|
|
"num_tokens": 22064395.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 5.229344749450684,
|
|
"epoch": 0.9245917387127762,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004922743407745255,
|
|
"loss": 5.123,
|
|
"mean_token_accuracy": 0.20057824850082398,
|
|
"num_tokens": 22075960.0,
|
|
"step": 9625
|
|
},
|
|
{
|
|
"entropy": 5.272555780410767,
|
|
"epoch": 0.9250720461095101,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004922654315490714,
|
|
"loss": 5.1871,
|
|
"mean_token_accuracy": 0.20159071534872056,
|
|
"num_tokens": 22086034.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 5.249064683914185,
|
|
"epoch": 0.925552353506244,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004922565172793593,
|
|
"loss": 5.2023,
|
|
"mean_token_accuracy": 0.20035000890493393,
|
|
"num_tokens": 22096184.0,
|
|
"step": 9635
|
|
},
|
|
{
|
|
"entropy": 5.262630033493042,
|
|
"epoch": 0.9260326609029779,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004922475979655958,
|
|
"loss": 5.1593,
|
|
"mean_token_accuracy": 0.2061972975730896,
|
|
"num_tokens": 22108795.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 5.240458583831787,
|
|
"epoch": 0.9265129682997119,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004922386736079883,
|
|
"loss": 5.1362,
|
|
"mean_token_accuracy": 0.20278566032648088,
|
|
"num_tokens": 22119608.0,
|
|
"step": 9645
|
|
},
|
|
{
|
|
"entropy": 5.239983415603637,
|
|
"epoch": 0.9269932756964457,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004922297442067438,
|
|
"loss": 5.1009,
|
|
"mean_token_accuracy": 0.21008216142654418,
|
|
"num_tokens": 22131621.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 5.303403711318969,
|
|
"epoch": 0.9274735830931796,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004922208097620697,
|
|
"loss": 5.0679,
|
|
"mean_token_accuracy": 0.20454230904579163,
|
|
"num_tokens": 22142745.0,
|
|
"step": 9655
|
|
},
|
|
{
|
|
"entropy": 5.2560042381286625,
|
|
"epoch": 0.9279538904899135,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004922118702741735,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.19514112025499344,
|
|
"num_tokens": 22155457.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 5.320225811004638,
|
|
"epoch": 0.9284341978866475,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004922029257432625,
|
|
"loss": 5.1395,
|
|
"mean_token_accuracy": 0.2117284744977951,
|
|
"num_tokens": 22165955.0,
|
|
"step": 9665
|
|
},
|
|
{
|
|
"entropy": 5.256221914291382,
|
|
"epoch": 0.9289145052833814,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004921939761695446,
|
|
"loss": 5.0865,
|
|
"mean_token_accuracy": 0.2021948665380478,
|
|
"num_tokens": 22178142.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 5.171184015274048,
|
|
"epoch": 0.9293948126801153,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004921850215532275,
|
|
"loss": 5.0653,
|
|
"mean_token_accuracy": 0.21012310534715653,
|
|
"num_tokens": 22190315.0,
|
|
"step": 9675
|
|
},
|
|
{
|
|
"entropy": 5.2543559074401855,
|
|
"epoch": 0.9298751200768491,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004921760618945192,
|
|
"loss": 5.1284,
|
|
"mean_token_accuracy": 0.20433304756879805,
|
|
"num_tokens": 22201785.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 5.295661354064942,
|
|
"epoch": 0.9303554274735831,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004921670971936276,
|
|
"loss": 5.0781,
|
|
"mean_token_accuracy": 0.20774878412485123,
|
|
"num_tokens": 22212471.0,
|
|
"step": 9685
|
|
},
|
|
{
|
|
"entropy": 5.277561855316162,
|
|
"epoch": 0.930835734870317,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004921581274507607,
|
|
"loss": 5.1692,
|
|
"mean_token_accuracy": 0.20450907647609712,
|
|
"num_tokens": 22223188.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 5.20819878578186,
|
|
"epoch": 0.9313160422670509,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000492149152666127,
|
|
"loss": 5.1741,
|
|
"mean_token_accuracy": 0.2045721873641014,
|
|
"num_tokens": 22233978.0,
|
|
"step": 9695
|
|
},
|
|
{
|
|
"entropy": 5.227841567993164,
|
|
"epoch": 0.9317963496637848,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004921401728399348,
|
|
"loss": 5.1147,
|
|
"mean_token_accuracy": 0.21509994715452194,
|
|
"num_tokens": 22244713.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 5.259960889816284,
|
|
"epoch": 0.9322766570605188,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004921311879723926,
|
|
"loss": 5.1705,
|
|
"mean_token_accuracy": 0.20220176130533218,
|
|
"num_tokens": 22256192.0,
|
|
"step": 9705
|
|
},
|
|
{
|
|
"entropy": 5.332875108718872,
|
|
"epoch": 0.9327569644572526,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004921221980637088,
|
|
"loss": 5.1401,
|
|
"mean_token_accuracy": 0.20141558051109315,
|
|
"num_tokens": 22268294.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 5.3014007091522215,
|
|
"epoch": 0.9332372718539865,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004921132031140925,
|
|
"loss": 5.1616,
|
|
"mean_token_accuracy": 0.20787308365106583,
|
|
"num_tokens": 22278952.0,
|
|
"step": 9715
|
|
},
|
|
{
|
|
"entropy": 5.249713897705078,
|
|
"epoch": 0.9337175792507204,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004921042031237521,
|
|
"loss": 5.1181,
|
|
"mean_token_accuracy": 0.1999865725636482,
|
|
"num_tokens": 22291057.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 5.333038187026977,
|
|
"epoch": 0.9341978866474544,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004920951980928969,
|
|
"loss": 5.2022,
|
|
"mean_token_accuracy": 0.20720864981412887,
|
|
"num_tokens": 22302479.0,
|
|
"step": 9725
|
|
},
|
|
{
|
|
"entropy": 5.381272459030152,
|
|
"epoch": 0.9346781940441883,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004920861880217359,
|
|
"loss": 5.27,
|
|
"mean_token_accuracy": 0.19498737156391144,
|
|
"num_tokens": 22315116.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 5.309507656097412,
|
|
"epoch": 0.9351585014409222,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004920771729104781,
|
|
"loss": 5.1831,
|
|
"mean_token_accuracy": 0.20069352984428407,
|
|
"num_tokens": 22327548.0,
|
|
"step": 9735
|
|
},
|
|
{
|
|
"entropy": 5.204008626937866,
|
|
"epoch": 0.9356388088376562,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004920681527593329,
|
|
"loss": 5.0612,
|
|
"mean_token_accuracy": 0.20920901447534562,
|
|
"num_tokens": 22339154.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 5.256301832199097,
|
|
"epoch": 0.93611911623439,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004920591275685098,
|
|
"loss": 5.1518,
|
|
"mean_token_accuracy": 0.20383056104183198,
|
|
"num_tokens": 22350781.0,
|
|
"step": 9745
|
|
},
|
|
{
|
|
"entropy": 5.336814022064209,
|
|
"epoch": 0.9365994236311239,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004920500973382184,
|
|
"loss": 5.1758,
|
|
"mean_token_accuracy": 0.20595642030239106,
|
|
"num_tokens": 22361990.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 5.223576879501342,
|
|
"epoch": 0.9370797310278578,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004920410620686682,
|
|
"loss": 5.0488,
|
|
"mean_token_accuracy": 0.21444960832595825,
|
|
"num_tokens": 22372973.0,
|
|
"step": 9755
|
|
},
|
|
{
|
|
"entropy": 5.18360276222229,
|
|
"epoch": 0.9375600384245918,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004920320217600689,
|
|
"loss": 5.0665,
|
|
"mean_token_accuracy": 0.21210620701313018,
|
|
"num_tokens": 22384369.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 5.320396280288696,
|
|
"epoch": 0.9380403458213257,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004920229764126306,
|
|
"loss": 5.1679,
|
|
"mean_token_accuracy": 0.2056802451610565,
|
|
"num_tokens": 22395792.0,
|
|
"step": 9765
|
|
},
|
|
{
|
|
"entropy": 5.2880340099334715,
|
|
"epoch": 0.9385206532180596,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004920139260265632,
|
|
"loss": 5.1575,
|
|
"mean_token_accuracy": 0.19827589392662048,
|
|
"num_tokens": 22408182.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 5.341842079162598,
|
|
"epoch": 0.9390009606147934,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004920048706020769,
|
|
"loss": 5.2885,
|
|
"mean_token_accuracy": 0.19330597370862962,
|
|
"num_tokens": 22419774.0,
|
|
"step": 9775
|
|
},
|
|
{
|
|
"entropy": 5.187641191482544,
|
|
"epoch": 0.9394812680115274,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004919958101393817,
|
|
"loss": 4.9989,
|
|
"mean_token_accuracy": 0.21211641579866408,
|
|
"num_tokens": 22430210.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 5.290931463241577,
|
|
"epoch": 0.9399615754082613,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004919867446386883,
|
|
"loss": 5.153,
|
|
"mean_token_accuracy": 0.20970916748046875,
|
|
"num_tokens": 22442444.0,
|
|
"step": 9785
|
|
},
|
|
{
|
|
"entropy": 5.184951877593994,
|
|
"epoch": 0.9404418828049952,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000491977674100207,
|
|
"loss": 5.1207,
|
|
"mean_token_accuracy": 0.2109922468662262,
|
|
"num_tokens": 22455521.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 5.312680387496949,
|
|
"epoch": 0.9409221902017291,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004919685985241483,
|
|
"loss": 5.1845,
|
|
"mean_token_accuracy": 0.20736344754695893,
|
|
"num_tokens": 22466997.0,
|
|
"step": 9795
|
|
},
|
|
{
|
|
"entropy": 5.291236543655396,
|
|
"epoch": 0.9414024975984631,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000491959517910723,
|
|
"loss": 5.0996,
|
|
"mean_token_accuracy": 0.21323858797550202,
|
|
"num_tokens": 22477851.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 5.2509393215179445,
|
|
"epoch": 0.9418828049951969,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004919504322601421,
|
|
"loss": 5.193,
|
|
"mean_token_accuracy": 0.205467090010643,
|
|
"num_tokens": 22489319.0,
|
|
"step": 9805
|
|
},
|
|
{
|
|
"entropy": 5.174720096588135,
|
|
"epoch": 0.9423631123919308,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004919413415726162,
|
|
"loss": 5.0491,
|
|
"mean_token_accuracy": 0.21085420697927476,
|
|
"num_tokens": 22500847.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 5.309349250793457,
|
|
"epoch": 0.9428434197886647,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004919322458483566,
|
|
"loss": 5.1415,
|
|
"mean_token_accuracy": 0.20241572856903076,
|
|
"num_tokens": 22512719.0,
|
|
"step": 9815
|
|
},
|
|
{
|
|
"entropy": 5.222389364242554,
|
|
"epoch": 0.9433237271853987,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004919231450875745,
|
|
"loss": 5.0661,
|
|
"mean_token_accuracy": 0.21022214293479918,
|
|
"num_tokens": 22522984.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 5.2375284194946286,
|
|
"epoch": 0.9438040345821326,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004919140392904809,
|
|
"loss": 5.1092,
|
|
"mean_token_accuracy": 0.21000211089849471,
|
|
"num_tokens": 22534816.0,
|
|
"step": 9825
|
|
},
|
|
{
|
|
"entropy": 5.230174970626831,
|
|
"epoch": 0.9442843419788665,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004919049284572875,
|
|
"loss": 5.0975,
|
|
"mean_token_accuracy": 0.20355214923620224,
|
|
"num_tokens": 22545753.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 5.301757907867431,
|
|
"epoch": 0.9447646493756003,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004918958125882058,
|
|
"loss": 5.1963,
|
|
"mean_token_accuracy": 0.1956578239798546,
|
|
"num_tokens": 22557237.0,
|
|
"step": 9835
|
|
},
|
|
{
|
|
"entropy": 5.310576248168945,
|
|
"epoch": 0.9452449567723343,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004918866916834474,
|
|
"loss": 5.1236,
|
|
"mean_token_accuracy": 0.20908855646848679,
|
|
"num_tokens": 22568909.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 5.315052127838134,
|
|
"epoch": 0.9457252641690682,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004918775657432239,
|
|
"loss": 5.2595,
|
|
"mean_token_accuracy": 0.19276428669691087,
|
|
"num_tokens": 22582162.0,
|
|
"step": 9845
|
|
},
|
|
{
|
|
"entropy": 5.258047676086425,
|
|
"epoch": 0.9462055715658021,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004918684347677474,
|
|
"loss": 5.0962,
|
|
"mean_token_accuracy": 0.20234745740890503,
|
|
"num_tokens": 22592405.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 5.273072290420532,
|
|
"epoch": 0.946685878962536,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004918592987572298,
|
|
"loss": 5.1377,
|
|
"mean_token_accuracy": 0.20278570502996446,
|
|
"num_tokens": 22603588.0,
|
|
"step": 9855
|
|
},
|
|
{
|
|
"entropy": 5.321579885482788,
|
|
"epoch": 0.94716618635927,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004918501577118832,
|
|
"loss": 5.2008,
|
|
"mean_token_accuracy": 0.2043844997882843,
|
|
"num_tokens": 22614995.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 5.288969469070435,
|
|
"epoch": 0.9476464937560038,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00049184101163192,
|
|
"loss": 5.1545,
|
|
"mean_token_accuracy": 0.20031799376010895,
|
|
"num_tokens": 22627556.0,
|
|
"step": 9865
|
|
},
|
|
{
|
|
"entropy": 5.311606693267822,
|
|
"epoch": 0.9481268011527377,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004918318605175522,
|
|
"loss": 5.1448,
|
|
"mean_token_accuracy": 0.20381494760513305,
|
|
"num_tokens": 22638339.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 5.309900140762329,
|
|
"epoch": 0.9486071085494716,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004918227043689924,
|
|
"loss": 5.1063,
|
|
"mean_token_accuracy": 0.2075771450996399,
|
|
"num_tokens": 22648922.0,
|
|
"step": 9875
|
|
},
|
|
{
|
|
"entropy": 5.18968620300293,
|
|
"epoch": 0.9490874159462056,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004918135431864534,
|
|
"loss": 5.1583,
|
|
"mean_token_accuracy": 0.20583543330430984,
|
|
"num_tokens": 22662006.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 5.232013797760009,
|
|
"epoch": 0.9495677233429395,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004918043769701478,
|
|
"loss": 5.0866,
|
|
"mean_token_accuracy": 0.2079631954431534,
|
|
"num_tokens": 22674649.0,
|
|
"step": 9885
|
|
},
|
|
{
|
|
"entropy": 5.360668706893921,
|
|
"epoch": 0.9500480307396734,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004917952057202882,
|
|
"loss": 5.2037,
|
|
"mean_token_accuracy": 0.1974567338824272,
|
|
"num_tokens": 22685971.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 5.22627215385437,
|
|
"epoch": 0.9505283381364072,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004917860294370877,
|
|
"loss": 5.0656,
|
|
"mean_token_accuracy": 0.2093571364879608,
|
|
"num_tokens": 22696174.0,
|
|
"step": 9895
|
|
},
|
|
{
|
|
"entropy": 5.31975827217102,
|
|
"epoch": 0.9510086455331412,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004917768481207593,
|
|
"loss": 5.1813,
|
|
"mean_token_accuracy": 0.20513910204172134,
|
|
"num_tokens": 22706983.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 5.248136568069458,
|
|
"epoch": 0.9514889529298751,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004917676617715162,
|
|
"loss": 5.1088,
|
|
"mean_token_accuracy": 0.206376151740551,
|
|
"num_tokens": 22718251.0,
|
|
"step": 9905
|
|
},
|
|
{
|
|
"entropy": 5.161273384094239,
|
|
"epoch": 0.951969260326609,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004917584703895717,
|
|
"loss": 5.0842,
|
|
"mean_token_accuracy": 0.2124750316143036,
|
|
"num_tokens": 22730071.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 5.238349151611328,
|
|
"epoch": 0.952449567723343,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004917492739751391,
|
|
"loss": 5.1473,
|
|
"mean_token_accuracy": 0.20351121425628663,
|
|
"num_tokens": 22742027.0,
|
|
"step": 9915
|
|
},
|
|
{
|
|
"entropy": 5.210487508773804,
|
|
"epoch": 0.9529298751200769,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000491740072528432,
|
|
"loss": 5.0373,
|
|
"mean_token_accuracy": 0.21686818301677704,
|
|
"num_tokens": 22752946.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 5.287168884277344,
|
|
"epoch": 0.9534101825168108,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000491730866049664,
|
|
"loss": 5.108,
|
|
"mean_token_accuracy": 0.20497333854436875,
|
|
"num_tokens": 22763944.0,
|
|
"step": 9925
|
|
},
|
|
{
|
|
"entropy": 5.225655937194825,
|
|
"epoch": 0.9538904899135446,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004917216545390489,
|
|
"loss": 5.0843,
|
|
"mean_token_accuracy": 0.20906523764133453,
|
|
"num_tokens": 22774414.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 5.186794948577881,
|
|
"epoch": 0.9543707973102786,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004917124379968004,
|
|
"loss": 5.0443,
|
|
"mean_token_accuracy": 0.21451948434114457,
|
|
"num_tokens": 22785533.0,
|
|
"step": 9935
|
|
},
|
|
{
|
|
"entropy": 5.153272867202759,
|
|
"epoch": 0.9548511047070125,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004917032164231327,
|
|
"loss": 4.9939,
|
|
"mean_token_accuracy": 0.21023591607809067,
|
|
"num_tokens": 22795809.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 5.244364500045776,
|
|
"epoch": 0.9553314121037464,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004916939898182598,
|
|
"loss": 5.2216,
|
|
"mean_token_accuracy": 0.20205324590206147,
|
|
"num_tokens": 22807705.0,
|
|
"step": 9945
|
|
},
|
|
{
|
|
"entropy": 5.34041018486023,
|
|
"epoch": 0.9558117195004803,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004916847581823958,
|
|
"loss": 5.1064,
|
|
"mean_token_accuracy": 0.20732269585132598,
|
|
"num_tokens": 22818852.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 5.187279415130615,
|
|
"epoch": 0.9562920268972143,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004916755215157552,
|
|
"loss": 5.0225,
|
|
"mean_token_accuracy": 0.21118980795145034,
|
|
"num_tokens": 22829146.0,
|
|
"step": 9955
|
|
},
|
|
{
|
|
"entropy": 5.152674341201783,
|
|
"epoch": 0.9567723342939481,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004916662798185524,
|
|
"loss": 5.107,
|
|
"mean_token_accuracy": 0.21148771941661834,
|
|
"num_tokens": 22840088.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 5.259473514556885,
|
|
"epoch": 0.957252641690682,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004916570330910019,
|
|
"loss": 5.1244,
|
|
"mean_token_accuracy": 0.20842421650886536,
|
|
"num_tokens": 22852470.0,
|
|
"step": 9965
|
|
},
|
|
{
|
|
"entropy": 5.29966549873352,
|
|
"epoch": 0.9577329490874159,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004916477813333185,
|
|
"loss": 5.1655,
|
|
"mean_token_accuracy": 0.19774912297725677,
|
|
"num_tokens": 22863673.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 5.227234315872193,
|
|
"epoch": 0.9582132564841499,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004916385245457168,
|
|
"loss": 5.1421,
|
|
"mean_token_accuracy": 0.2026590123772621,
|
|
"num_tokens": 22874888.0,
|
|
"step": 9975
|
|
},
|
|
{
|
|
"entropy": 5.2691041946411135,
|
|
"epoch": 0.9586935638808838,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000491629262728412,
|
|
"loss": 5.1591,
|
|
"mean_token_accuracy": 0.19835399985313415,
|
|
"num_tokens": 22886811.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 5.282389736175537,
|
|
"epoch": 0.9591738712776177,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004916199958816188,
|
|
"loss": 5.1101,
|
|
"mean_token_accuracy": 0.20272685140371322,
|
|
"num_tokens": 22898777.0,
|
|
"step": 9985
|
|
},
|
|
{
|
|
"entropy": 5.259513235092163,
|
|
"epoch": 0.9596541786743515,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004916107240055527,
|
|
"loss": 5.0984,
|
|
"mean_token_accuracy": 0.20606767982244492,
|
|
"num_tokens": 22910804.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 5.3379199504852295,
|
|
"epoch": 0.9601344860710855,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004916014471004287,
|
|
"loss": 5.2127,
|
|
"mean_token_accuracy": 0.20945288687944413,
|
|
"num_tokens": 22922002.0,
|
|
"step": 9995
|
|
},
|
|
{
|
|
"entropy": 5.268113040924073,
|
|
"epoch": 0.9606147934678194,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004915921651664622,
|
|
"loss": 5.1176,
|
|
"mean_token_accuracy": 0.20583815425634383,
|
|
"num_tokens": 22933471.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 5.167844009399414,
|
|
"epoch": 0.9610951008645533,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000491582878203869,
|
|
"loss": 5.0343,
|
|
"mean_token_accuracy": 0.21021876633167266,
|
|
"num_tokens": 22945303.0,
|
|
"step": 10005
|
|
},
|
|
{
|
|
"entropy": 5.2859704971313475,
|
|
"epoch": 0.9615754082612872,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004915735862128643,
|
|
"loss": 5.1734,
|
|
"mean_token_accuracy": 0.1960235893726349,
|
|
"num_tokens": 22956620.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 5.301449775695801,
|
|
"epoch": 0.9620557156580212,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004915642891936641,
|
|
"loss": 5.1695,
|
|
"mean_token_accuracy": 0.20270660370588303,
|
|
"num_tokens": 22968941.0,
|
|
"step": 10015
|
|
},
|
|
{
|
|
"entropy": 5.263174438476563,
|
|
"epoch": 0.962536023054755,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004915549871464841,
|
|
"loss": 5.1471,
|
|
"mean_token_accuracy": 0.2005195811390877,
|
|
"num_tokens": 22980222.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 5.377077054977417,
|
|
"epoch": 0.9630163304514889,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004915456800715403,
|
|
"loss": 5.1674,
|
|
"mean_token_accuracy": 0.19867794066667557,
|
|
"num_tokens": 22991156.0,
|
|
"step": 10025
|
|
},
|
|
{
|
|
"entropy": 5.341533660888672,
|
|
"epoch": 0.9634966378482228,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000491536367969049,
|
|
"loss": 5.2939,
|
|
"mean_token_accuracy": 0.19976369738578797,
|
|
"num_tokens": 23002939.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 5.351443099975586,
|
|
"epoch": 0.9639769452449568,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004915270508392261,
|
|
"loss": 5.1535,
|
|
"mean_token_accuracy": 0.20271336436271667,
|
|
"num_tokens": 23015590.0,
|
|
"step": 10035
|
|
},
|
|
{
|
|
"entropy": 5.186623668670654,
|
|
"epoch": 0.9644572526416907,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000491517728682288,
|
|
"loss": 5.0173,
|
|
"mean_token_accuracy": 0.21069204956293106,
|
|
"num_tokens": 23026387.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 5.1746241569519045,
|
|
"epoch": 0.9649375600384246,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004915084014984512,
|
|
"loss": 5.0882,
|
|
"mean_token_accuracy": 0.21177269369363785,
|
|
"num_tokens": 23037475.0,
|
|
"step": 10045
|
|
},
|
|
{
|
|
"entropy": 5.21121768951416,
|
|
"epoch": 0.9654178674351584,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004914990692879322,
|
|
"loss": 5.0636,
|
|
"mean_token_accuracy": 0.21139880418777465,
|
|
"num_tokens": 23049305.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 5.226834392547607,
|
|
"epoch": 0.9658981748318924,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004914897320509478,
|
|
"loss": 5.0927,
|
|
"mean_token_accuracy": 0.208532877266407,
|
|
"num_tokens": 23061765.0,
|
|
"step": 10055
|
|
},
|
|
{
|
|
"entropy": 5.257470321655274,
|
|
"epoch": 0.9663784822286263,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004914803897877146,
|
|
"loss": 5.0923,
|
|
"mean_token_accuracy": 0.20083025693893433,
|
|
"num_tokens": 23072355.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 5.321026134490967,
|
|
"epoch": 0.9668587896253602,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004914710424984495,
|
|
"loss": 5.1071,
|
|
"mean_token_accuracy": 0.19924261420965195,
|
|
"num_tokens": 23085583.0,
|
|
"step": 10065
|
|
},
|
|
{
|
|
"entropy": 5.320055437088013,
|
|
"epoch": 0.9673390970220941,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004914616901833696,
|
|
"loss": 5.1466,
|
|
"mean_token_accuracy": 0.2025774121284485,
|
|
"num_tokens": 23095942.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 5.203504228591919,
|
|
"epoch": 0.9678194044188281,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000491452332842692,
|
|
"loss": 5.0852,
|
|
"mean_token_accuracy": 0.21284036785364152,
|
|
"num_tokens": 23106540.0,
|
|
"step": 10075
|
|
},
|
|
{
|
|
"entropy": 5.239231157302856,
|
|
"epoch": 0.968299711815562,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000491442970476634,
|
|
"loss": 5.1603,
|
|
"mean_token_accuracy": 0.20696305185556413,
|
|
"num_tokens": 23118006.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 5.230704307556152,
|
|
"epoch": 0.9687800192122958,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004914336030854129,
|
|
"loss": 5.0625,
|
|
"mean_token_accuracy": 0.2126757651567459,
|
|
"num_tokens": 23129103.0,
|
|
"step": 10085
|
|
},
|
|
{
|
|
"entropy": 5.221098470687866,
|
|
"epoch": 0.9692603266090298,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004914242306692461,
|
|
"loss": 5.0595,
|
|
"mean_token_accuracy": 0.21233255714178084,
|
|
"num_tokens": 23140009.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 5.2262026309967045,
|
|
"epoch": 0.9697406340057637,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004914148532283516,
|
|
"loss": 5.123,
|
|
"mean_token_accuracy": 0.2098432034254074,
|
|
"num_tokens": 23150982.0,
|
|
"step": 10095
|
|
},
|
|
{
|
|
"entropy": 5.3084290504455565,
|
|
"epoch": 0.9702209414024976,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004914054707629466,
|
|
"loss": 5.1217,
|
|
"mean_token_accuracy": 0.203516785800457,
|
|
"num_tokens": 23161834.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 5.321819496154785,
|
|
"epoch": 0.9707012487992315,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004913960832732493,
|
|
"loss": 5.2516,
|
|
"mean_token_accuracy": 0.1973107188940048,
|
|
"num_tokens": 23173355.0,
|
|
"step": 10105
|
|
},
|
|
{
|
|
"entropy": 5.291294431686401,
|
|
"epoch": 0.9711815561959655,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004913866907594774,
|
|
"loss": 5.183,
|
|
"mean_token_accuracy": 0.20283153355121614,
|
|
"num_tokens": 23185075.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 5.235888957977295,
|
|
"epoch": 0.9716618635926993,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004913772932218491,
|
|
"loss": 5.1614,
|
|
"mean_token_accuracy": 0.21172062009572984,
|
|
"num_tokens": 23195590.0,
|
|
"step": 10115
|
|
},
|
|
{
|
|
"entropy": 5.287680578231812,
|
|
"epoch": 0.9721421709894332,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004913678906605825,
|
|
"loss": 5.1168,
|
|
"mean_token_accuracy": 0.20626269578933715,
|
|
"num_tokens": 23207668.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 5.210545921325684,
|
|
"epoch": 0.9726224783861671,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004913584830758961,
|
|
"loss": 5.1037,
|
|
"mean_token_accuracy": 0.2144807457923889,
|
|
"num_tokens": 23218497.0,
|
|
"step": 10125
|
|
},
|
|
{
|
|
"entropy": 5.295179796218872,
|
|
"epoch": 0.9731027857829011,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004913490704680081,
|
|
"loss": 5.1883,
|
|
"mean_token_accuracy": 0.19650790989398956,
|
|
"num_tokens": 23230575.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 5.3129924774169925,
|
|
"epoch": 0.973583093179635,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004913396528371371,
|
|
"loss": 5.1775,
|
|
"mean_token_accuracy": 0.2083025798201561,
|
|
"num_tokens": 23242348.0,
|
|
"step": 10135
|
|
},
|
|
{
|
|
"entropy": 5.318413543701172,
|
|
"epoch": 0.9740634005763689,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004913302301835018,
|
|
"loss": 5.1449,
|
|
"mean_token_accuracy": 0.20831867009401323,
|
|
"num_tokens": 23253297.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 5.209083795547485,
|
|
"epoch": 0.9745437079731027,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000491320802507321,
|
|
"loss": 5.0829,
|
|
"mean_token_accuracy": 0.2160535603761673,
|
|
"num_tokens": 23265830.0,
|
|
"step": 10145
|
|
},
|
|
{
|
|
"entropy": 5.241401433944702,
|
|
"epoch": 0.9750240153698367,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004913113698088133,
|
|
"loss": 5.1587,
|
|
"mean_token_accuracy": 0.2019126072525978,
|
|
"num_tokens": 23275591.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 5.27093539237976,
|
|
"epoch": 0.9755043227665706,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000491301932088198,
|
|
"loss": 5.0606,
|
|
"mean_token_accuracy": 0.20884221643209458,
|
|
"num_tokens": 23286685.0,
|
|
"step": 10155
|
|
},
|
|
{
|
|
"entropy": 5.236410522460938,
|
|
"epoch": 0.9759846301633045,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004912924893456942,
|
|
"loss": 5.0771,
|
|
"mean_token_accuracy": 0.21038100719451905,
|
|
"num_tokens": 23298776.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 5.189069700241089,
|
|
"epoch": 0.9764649375600384,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000491283041581521,
|
|
"loss": 5.0817,
|
|
"mean_token_accuracy": 0.2063506156206131,
|
|
"num_tokens": 23310498.0,
|
|
"step": 10165
|
|
},
|
|
{
|
|
"entropy": 5.217845678329468,
|
|
"epoch": 0.9769452449567724,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004912735887958978,
|
|
"loss": 5.1382,
|
|
"mean_token_accuracy": 0.20284378677606582,
|
|
"num_tokens": 23321089.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 5.288270330429077,
|
|
"epoch": 0.9774255523535063,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004912641309890441,
|
|
"loss": 5.1083,
|
|
"mean_token_accuracy": 0.20696865767240524,
|
|
"num_tokens": 23332142.0,
|
|
"step": 10175
|
|
},
|
|
{
|
|
"entropy": 5.252698373794556,
|
|
"epoch": 0.9779058597502401,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004912546681611794,
|
|
"loss": 5.0731,
|
|
"mean_token_accuracy": 0.21283762007951737,
|
|
"num_tokens": 23343014.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 5.207805871963501,
|
|
"epoch": 0.978386167146974,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004912452003125234,
|
|
"loss": 5.0497,
|
|
"mean_token_accuracy": 0.2128495082259178,
|
|
"num_tokens": 23354611.0,
|
|
"step": 10185
|
|
},
|
|
{
|
|
"entropy": 5.191194486618042,
|
|
"epoch": 0.978866474543708,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000491235727443296,
|
|
"loss": 5.0971,
|
|
"mean_token_accuracy": 0.200925113260746,
|
|
"num_tokens": 23365608.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 5.283109283447265,
|
|
"epoch": 0.9793467819404419,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004912262495537171,
|
|
"loss": 5.1403,
|
|
"mean_token_accuracy": 0.20711569488048553,
|
|
"num_tokens": 23377884.0,
|
|
"step": 10195
|
|
},
|
|
{
|
|
"entropy": 5.176312112808228,
|
|
"epoch": 0.9798270893371758,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.0004912167666440068,
|
|
"loss": 5.0456,
|
|
"mean_token_accuracy": 0.21011523604393006,
|
|
"num_tokens": 23389553.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 5.18413896560669,
|
|
"epoch": 0.9803073967339097,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004912072787143852,
|
|
"loss": 5.0395,
|
|
"mean_token_accuracy": 0.20854321867227554,
|
|
"num_tokens": 23401079.0,
|
|
"step": 10205
|
|
},
|
|
{
|
|
"entropy": 5.191148519515991,
|
|
"epoch": 0.9807877041306436,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004911977857650725,
|
|
"loss": 5.0952,
|
|
"mean_token_accuracy": 0.20658079236745835,
|
|
"num_tokens": 23412886.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 5.287184333801269,
|
|
"epoch": 0.9812680115273775,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004911882877962893,
|
|
"loss": 5.1568,
|
|
"mean_token_accuracy": 0.2016318693757057,
|
|
"num_tokens": 23424758.0,
|
|
"step": 10215
|
|
},
|
|
{
|
|
"entropy": 5.213660001754761,
|
|
"epoch": 0.9817483189241114,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004911787848082559,
|
|
"loss": 5.0263,
|
|
"mean_token_accuracy": 0.2168577641248703,
|
|
"num_tokens": 23435552.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 5.178222560882569,
|
|
"epoch": 0.9822286263208453,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004911692768011931,
|
|
"loss": 5.0387,
|
|
"mean_token_accuracy": 0.21100341975688935,
|
|
"num_tokens": 23446584.0,
|
|
"step": 10225
|
|
},
|
|
{
|
|
"entropy": 5.319640445709228,
|
|
"epoch": 0.9827089337175793,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004911597637753217,
|
|
"loss": 5.2566,
|
|
"mean_token_accuracy": 0.19432248920202255,
|
|
"num_tokens": 23458452.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 5.280249691009521,
|
|
"epoch": 0.9831892411143132,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004911502457308623,
|
|
"loss": 5.1235,
|
|
"mean_token_accuracy": 0.1981524184346199,
|
|
"num_tokens": 23470310.0,
|
|
"step": 10235
|
|
},
|
|
{
|
|
"entropy": 5.322625064849854,
|
|
"epoch": 0.983669548511047,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000491140722668036,
|
|
"loss": 5.1723,
|
|
"mean_token_accuracy": 0.20412614494562148,
|
|
"num_tokens": 23481166.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 5.271013641357422,
|
|
"epoch": 0.984149855907781,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000491131194587064,
|
|
"loss": 5.1469,
|
|
"mean_token_accuracy": 0.207887963950634,
|
|
"num_tokens": 23493134.0,
|
|
"step": 10245
|
|
},
|
|
{
|
|
"entropy": 5.145558023452759,
|
|
"epoch": 0.9846301633045149,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004911216614881675,
|
|
"loss": 5.0461,
|
|
"mean_token_accuracy": 0.21059294939041137,
|
|
"num_tokens": 23504983.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 5.310237264633178,
|
|
"epoch": 0.9851104707012488,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004911121233715677,
|
|
"loss": 5.1215,
|
|
"mean_token_accuracy": 0.2087342619895935,
|
|
"num_tokens": 23516119.0,
|
|
"step": 10255
|
|
},
|
|
{
|
|
"entropy": 5.301252555847168,
|
|
"epoch": 0.9855907780979827,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004911025802374861,
|
|
"loss": 5.1551,
|
|
"mean_token_accuracy": 0.2059706538915634,
|
|
"num_tokens": 23528242.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 5.182842016220093,
|
|
"epoch": 0.9860710854947167,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004910930320861442,
|
|
"loss": 5.0482,
|
|
"mean_token_accuracy": 0.21699930280447005,
|
|
"num_tokens": 23539738.0,
|
|
"step": 10265
|
|
},
|
|
{
|
|
"entropy": 5.0920305252075195,
|
|
"epoch": 0.9865513928914506,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004910834789177639,
|
|
"loss": 5.0687,
|
|
"mean_token_accuracy": 0.2103741407394409,
|
|
"num_tokens": 23551228.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 5.33592963218689,
|
|
"epoch": 0.9870317002881844,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004910739207325668,
|
|
"loss": 5.1207,
|
|
"mean_token_accuracy": 0.2097514569759369,
|
|
"num_tokens": 23563084.0,
|
|
"step": 10275
|
|
},
|
|
{
|
|
"entropy": 5.250389766693115,
|
|
"epoch": 0.9875120076849183,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004910643575307749,
|
|
"loss": 5.0891,
|
|
"mean_token_accuracy": 0.2097397819161415,
|
|
"num_tokens": 23574328.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 5.176170492172242,
|
|
"epoch": 0.9879923150816523,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004910547893126102,
|
|
"loss": 5.0627,
|
|
"mean_token_accuracy": 0.21138110905885696,
|
|
"num_tokens": 23585230.0,
|
|
"step": 10285
|
|
},
|
|
{
|
|
"entropy": 5.22738127708435,
|
|
"epoch": 0.9884726224783862,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004910452160782948,
|
|
"loss": 5.1049,
|
|
"mean_token_accuracy": 0.20212821811437606,
|
|
"num_tokens": 23596951.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 5.28731255531311,
|
|
"epoch": 0.9889529298751201,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000491035637828051,
|
|
"loss": 5.1244,
|
|
"mean_token_accuracy": 0.21019872575998305,
|
|
"num_tokens": 23607759.0,
|
|
"step": 10295
|
|
},
|
|
{
|
|
"entropy": 5.2878436088562015,
|
|
"epoch": 0.989433237271854,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004910260545621012,
|
|
"loss": 5.1489,
|
|
"mean_token_accuracy": 0.20213536471128463,
|
|
"num_tokens": 23619631.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 5.243245649337768,
|
|
"epoch": 0.9899135446685879,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004910164662806679,
|
|
"loss": 5.1312,
|
|
"mean_token_accuracy": 0.20988930463790895,
|
|
"num_tokens": 23630601.0,
|
|
"step": 10305
|
|
},
|
|
{
|
|
"entropy": 5.271698808670044,
|
|
"epoch": 0.9903938520653218,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004910068729839736,
|
|
"loss": 5.0656,
|
|
"mean_token_accuracy": 0.21258559077978134,
|
|
"num_tokens": 23641330.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 5.2771772861480715,
|
|
"epoch": 0.9908741594620557,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004909972746722413,
|
|
"loss": 5.1537,
|
|
"mean_token_accuracy": 0.2006964460015297,
|
|
"num_tokens": 23651492.0,
|
|
"step": 10315
|
|
},
|
|
{
|
|
"entropy": 5.199603843688965,
|
|
"epoch": 0.9913544668587896,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004909876713456935,
|
|
"loss": 5.0443,
|
|
"mean_token_accuracy": 0.2088362917304039,
|
|
"num_tokens": 23661773.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 5.239661455154419,
|
|
"epoch": 0.9918347742555236,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004909780630045534,
|
|
"loss": 5.0905,
|
|
"mean_token_accuracy": 0.20916487127542496,
|
|
"num_tokens": 23673534.0,
|
|
"step": 10325
|
|
},
|
|
{
|
|
"entropy": 5.266284799575805,
|
|
"epoch": 0.9923150816522575,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000490968449649044,
|
|
"loss": 5.1886,
|
|
"mean_token_accuracy": 0.20307926088571548,
|
|
"num_tokens": 23684892.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 5.284223937988282,
|
|
"epoch": 0.9927953890489913,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004909588312793884,
|
|
"loss": 5.1518,
|
|
"mean_token_accuracy": 0.2053588092327118,
|
|
"num_tokens": 23696076.0,
|
|
"step": 10335
|
|
},
|
|
{
|
|
"entropy": 5.283962202072144,
|
|
"epoch": 0.9932756964457252,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004909492078958101,
|
|
"loss": 5.1537,
|
|
"mean_token_accuracy": 0.20028176456689833,
|
|
"num_tokens": 23707795.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 5.249281978607177,
|
|
"epoch": 0.9937560038424592,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004909395794985324,
|
|
"loss": 5.057,
|
|
"mean_token_accuracy": 0.2048047587275505,
|
|
"num_tokens": 23720802.0,
|
|
"step": 10345
|
|
},
|
|
{
|
|
"entropy": 5.25334734916687,
|
|
"epoch": 0.9942363112391931,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004909299460877788,
|
|
"loss": 5.0896,
|
|
"mean_token_accuracy": 0.20352237075567245,
|
|
"num_tokens": 23732854.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 5.2786060810089115,
|
|
"epoch": 0.994716618635927,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004909203076637732,
|
|
"loss": 5.1659,
|
|
"mean_token_accuracy": 0.2006428435444832,
|
|
"num_tokens": 23743593.0,
|
|
"step": 10355
|
|
},
|
|
{
|
|
"entropy": 5.3259721279144285,
|
|
"epoch": 0.9951969260326609,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004909106642267392,
|
|
"loss": 5.1651,
|
|
"mean_token_accuracy": 0.20107742697000502,
|
|
"num_tokens": 23755447.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 5.273270559310913,
|
|
"epoch": 0.9956772334293948,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004909010157769006,
|
|
"loss": 5.1412,
|
|
"mean_token_accuracy": 0.20289405286312104,
|
|
"num_tokens": 23767181.0,
|
|
"step": 10365
|
|
},
|
|
{
|
|
"entropy": 5.343906021118164,
|
|
"epoch": 0.9961575408261287,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004908913623144814,
|
|
"loss": 5.2162,
|
|
"mean_token_accuracy": 0.1965470626950264,
|
|
"num_tokens": 23776356.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 5.270178937911988,
|
|
"epoch": 0.9966378482228626,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000490881703839706,
|
|
"loss": 5.1747,
|
|
"mean_token_accuracy": 0.20546858310699462,
|
|
"num_tokens": 23787965.0,
|
|
"step": 10375
|
|
},
|
|
{
|
|
"entropy": 5.23285551071167,
|
|
"epoch": 0.9971181556195965,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004908720403527984,
|
|
"loss": 5.0634,
|
|
"mean_token_accuracy": 0.21320411562919617,
|
|
"num_tokens": 23800327.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 5.2099464416503904,
|
|
"epoch": 0.9975984630163305,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000490862371853983,
|
|
"loss": 5.0535,
|
|
"mean_token_accuracy": 0.20708101391792297,
|
|
"num_tokens": 23812845.0,
|
|
"step": 10385
|
|
},
|
|
{
|
|
"entropy": 5.323228597640991,
|
|
"epoch": 0.9980787704130644,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004908526983434844,
|
|
"loss": 5.2069,
|
|
"mean_token_accuracy": 0.19844041019678116,
|
|
"num_tokens": 23824831.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 5.255801010131836,
|
|
"epoch": 0.9985590778097982,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000490843019821527,
|
|
"loss": 5.0732,
|
|
"mean_token_accuracy": 0.20827420055866241,
|
|
"num_tokens": 23836697.0,
|
|
"step": 10395
|
|
},
|
|
{
|
|
"entropy": 5.204647493362427,
|
|
"epoch": 0.9990393852065321,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004908333362883358,
|
|
"loss": 5.0994,
|
|
"mean_token_accuracy": 0.20774794071912767,
|
|
"num_tokens": 23847112.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 5.334963607788086,
|
|
"epoch": 0.9995196926032661,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004908236477441353,
|
|
"loss": 5.193,
|
|
"mean_token_accuracy": 0.2045993834733963,
|
|
"num_tokens": 23858185.0,
|
|
"step": 10405
|
|
},
|
|
{
|
|
"entropy": 5.197092485427857,
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004908139541891505,
|
|
"loss": 4.9697,
|
|
"mean_token_accuracy": 0.21775645166635513,
|
|
"num_tokens": 23868536.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 5.299159669876099,
|
|
"epoch": 1.0004803073967339,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004908042556236066,
|
|
"loss": 5.0114,
|
|
"mean_token_accuracy": 0.21747902780771255,
|
|
"num_tokens": 23880283.0,
|
|
"step": 10415
|
|
},
|
|
{
|
|
"entropy": 5.265295839309692,
|
|
"epoch": 1.0009606147934678,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004907945520477286,
|
|
"loss": 5.0792,
|
|
"mean_token_accuracy": 0.20754191726446153,
|
|
"num_tokens": 23892413.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 5.34681248664856,
|
|
"epoch": 1.0014409221902016,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004907848434617419,
|
|
"loss": 5.1832,
|
|
"mean_token_accuracy": 0.19456289261579512,
|
|
"num_tokens": 23903977.0,
|
|
"step": 10425
|
|
},
|
|
{
|
|
"entropy": 5.302938079833984,
|
|
"epoch": 1.0019212295869357,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000490775129865872,
|
|
"loss": 5.1463,
|
|
"mean_token_accuracy": 0.2010764569044113,
|
|
"num_tokens": 23915153.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 5.225161218643189,
|
|
"epoch": 1.0024015369836696,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004907654112603442,
|
|
"loss": 5.0186,
|
|
"mean_token_accuracy": 0.2120182618498802,
|
|
"num_tokens": 23926043.0,
|
|
"step": 10435
|
|
},
|
|
{
|
|
"entropy": 5.143458271026612,
|
|
"epoch": 1.0028818443804035,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004907556876453843,
|
|
"loss": 4.9208,
|
|
"mean_token_accuracy": 0.2206213116645813,
|
|
"num_tokens": 23936658.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 5.225710868835449,
|
|
"epoch": 1.0033621517771374,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000490745959021218,
|
|
"loss": 5.0434,
|
|
"mean_token_accuracy": 0.20257661491632462,
|
|
"num_tokens": 23947676.0,
|
|
"step": 10445
|
|
},
|
|
{
|
|
"entropy": 5.27921199798584,
|
|
"epoch": 1.0038424591738713,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004907362253880711,
|
|
"loss": 5.0296,
|
|
"mean_token_accuracy": 0.2058090642094612,
|
|
"num_tokens": 23959130.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 5.305146789550781,
|
|
"epoch": 1.0043227665706052,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004907264867461697,
|
|
"loss": 5.1168,
|
|
"mean_token_accuracy": 0.20227408558130264,
|
|
"num_tokens": 23969905.0,
|
|
"step": 10455
|
|
},
|
|
{
|
|
"entropy": 5.156929969787598,
|
|
"epoch": 1.004803073967339,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004907167430957399,
|
|
"loss": 5.005,
|
|
"mean_token_accuracy": 0.2087326243519783,
|
|
"num_tokens": 23982016.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 5.258314514160157,
|
|
"epoch": 1.005283381364073,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004907069944370077,
|
|
"loss": 5.1583,
|
|
"mean_token_accuracy": 0.20535677224397658,
|
|
"num_tokens": 23994200.0,
|
|
"step": 10465
|
|
},
|
|
{
|
|
"entropy": 5.350189876556397,
|
|
"epoch": 1.005763688760807,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004906972407701998,
|
|
"loss": 5.0445,
|
|
"mean_token_accuracy": 0.21789115369319917,
|
|
"num_tokens": 24004695.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 5.15443787574768,
|
|
"epoch": 1.006243996157541,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004906874820955423,
|
|
"loss": 4.9954,
|
|
"mean_token_accuracy": 0.21623784005641938,
|
|
"num_tokens": 24015922.0,
|
|
"step": 10475
|
|
},
|
|
{
|
|
"entropy": 5.146531486511231,
|
|
"epoch": 1.0067243035542748,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004906777184132621,
|
|
"loss": 4.9992,
|
|
"mean_token_accuracy": 0.21183741688728333,
|
|
"num_tokens": 24026759.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 5.300349760055542,
|
|
"epoch": 1.0072046109510087,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004906679497235856,
|
|
"loss": 5.0743,
|
|
"mean_token_accuracy": 0.19960159063339233,
|
|
"num_tokens": 24037988.0,
|
|
"step": 10485
|
|
},
|
|
{
|
|
"entropy": 5.296932697296143,
|
|
"epoch": 1.0076849183477425,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004906581760267397,
|
|
"loss": 5.1147,
|
|
"mean_token_accuracy": 0.2084190621972084,
|
|
"num_tokens": 24050837.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 5.258708572387695,
|
|
"epoch": 1.0081652257444764,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004906483973229513,
|
|
"loss": 4.9916,
|
|
"mean_token_accuracy": 0.2128538578748703,
|
|
"num_tokens": 24063085.0,
|
|
"step": 10495
|
|
},
|
|
{
|
|
"entropy": 5.162822246551514,
|
|
"epoch": 1.0086455331412103,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004906386136124476,
|
|
"loss": 5.0433,
|
|
"mean_token_accuracy": 0.21004260182380677,
|
|
"num_tokens": 24074082.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 5.3224996566772464,
|
|
"epoch": 1.0091258405379442,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004906288248954554,
|
|
"loss": 5.0928,
|
|
"mean_token_accuracy": 0.20969793498516082,
|
|
"num_tokens": 24085050.0,
|
|
"step": 10505
|
|
},
|
|
{
|
|
"entropy": 5.235025644302368,
|
|
"epoch": 1.0096061479346783,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004906190311722023,
|
|
"loss": 5.0543,
|
|
"mean_token_accuracy": 0.21070992648601533,
|
|
"num_tokens": 24095523.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 5.172921419143677,
|
|
"epoch": 1.0100864553314122,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004906092324429155,
|
|
"loss": 5.0162,
|
|
"mean_token_accuracy": 0.20338939726352692,
|
|
"num_tokens": 24107157.0,
|
|
"step": 10515
|
|
},
|
|
{
|
|
"entropy": 5.295492935180664,
|
|
"epoch": 1.010566762728146,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004905994287078227,
|
|
"loss": 5.0377,
|
|
"mean_token_accuracy": 0.21216456443071366,
|
|
"num_tokens": 24118668.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 5.215558815002441,
|
|
"epoch": 1.01104707012488,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004905896199671512,
|
|
"loss": 5.087,
|
|
"mean_token_accuracy": 0.20571289211511612,
|
|
"num_tokens": 24129563.0,
|
|
"step": 10525
|
|
},
|
|
{
|
|
"entropy": 5.31974778175354,
|
|
"epoch": 1.0115273775216138,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000490579806221129,
|
|
"loss": 5.0963,
|
|
"mean_token_accuracy": 0.20234022289514542,
|
|
"num_tokens": 24139965.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 5.339018249511719,
|
|
"epoch": 1.0120076849183477,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004905699874699838,
|
|
"loss": 5.0927,
|
|
"mean_token_accuracy": 0.2037150517106056,
|
|
"num_tokens": 24152109.0,
|
|
"step": 10535
|
|
},
|
|
{
|
|
"entropy": 5.20176329612732,
|
|
"epoch": 1.0124879923150816,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004905601637139436,
|
|
"loss": 5.0382,
|
|
"mean_token_accuracy": 0.2062271788716316,
|
|
"num_tokens": 24162558.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 5.153134155273437,
|
|
"epoch": 1.0129682997118155,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004905503349532365,
|
|
"loss": 4.951,
|
|
"mean_token_accuracy": 0.2199328899383545,
|
|
"num_tokens": 24173963.0,
|
|
"step": 10545
|
|
},
|
|
{
|
|
"entropy": 5.1954326152801515,
|
|
"epoch": 1.0134486071085496,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004905405011880906,
|
|
"loss": 4.9813,
|
|
"mean_token_accuracy": 0.2153230667114258,
|
|
"num_tokens": 24184369.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 5.316426420211792,
|
|
"epoch": 1.0139289145052834,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004905306624187343,
|
|
"loss": 5.1457,
|
|
"mean_token_accuracy": 0.20428456813097,
|
|
"num_tokens": 24194692.0,
|
|
"step": 10555
|
|
},
|
|
{
|
|
"entropy": 5.306823635101319,
|
|
"epoch": 1.0144092219020173,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004905208186453961,
|
|
"loss": 5.0775,
|
|
"mean_token_accuracy": 0.20443981587886811,
|
|
"num_tokens": 24205363.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 5.286128664016724,
|
|
"epoch": 1.0148895292987512,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004905109698683044,
|
|
"loss": 5.0921,
|
|
"mean_token_accuracy": 0.20271058976650239,
|
|
"num_tokens": 24216678.0,
|
|
"step": 10565
|
|
},
|
|
{
|
|
"entropy": 5.180634689331055,
|
|
"epoch": 1.015369836695485,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004905011160876878,
|
|
"loss": 4.9789,
|
|
"mean_token_accuracy": 0.20550620704889297,
|
|
"num_tokens": 24227541.0,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"entropy": 5.23009238243103,
|
|
"epoch": 1.015850144092219,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004904912573037753,
|
|
"loss": 4.9955,
|
|
"mean_token_accuracy": 0.2097481057047844,
|
|
"num_tokens": 24238118.0,
|
|
"step": 10575
|
|
},
|
|
{
|
|
"entropy": 5.300039005279541,
|
|
"epoch": 1.0163304514889528,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004904813935167957,
|
|
"loss": 5.124,
|
|
"mean_token_accuracy": 0.20036156624555587,
|
|
"num_tokens": 24250044.0,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"entropy": 5.260974168777466,
|
|
"epoch": 1.0168107588856867,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004904715247269779,
|
|
"loss": 5.1359,
|
|
"mean_token_accuracy": 0.19710972905158997,
|
|
"num_tokens": 24262805.0,
|
|
"step": 10585
|
|
},
|
|
{
|
|
"entropy": 5.25907940864563,
|
|
"epoch": 1.0172910662824208,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004904616509345514,
|
|
"loss": 5.0512,
|
|
"mean_token_accuracy": 0.19826420694589614,
|
|
"num_tokens": 24274287.0,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"entropy": 5.131495952606201,
|
|
"epoch": 1.0177713736791547,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004904517721397449,
|
|
"loss": 4.9545,
|
|
"mean_token_accuracy": 0.2095574140548706,
|
|
"num_tokens": 24284839.0,
|
|
"step": 10595
|
|
},
|
|
{
|
|
"entropy": 5.185259437561035,
|
|
"epoch": 1.0182516810758886,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004904418883427881,
|
|
"loss": 5.0363,
|
|
"mean_token_accuracy": 0.20771684646606445,
|
|
"num_tokens": 24295812.0,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"entropy": 5.253675317764282,
|
|
"epoch": 1.0187319884726225,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004904319995439104,
|
|
"loss": 5.0144,
|
|
"mean_token_accuracy": 0.21546317189931868,
|
|
"num_tokens": 24306365.0,
|
|
"step": 10605
|
|
},
|
|
{
|
|
"entropy": 5.241600894927979,
|
|
"epoch": 1.0192122958693564,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004904221057433412,
|
|
"loss": 5.0161,
|
|
"mean_token_accuracy": 0.20605212748050689,
|
|
"num_tokens": 24317557.0,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"entropy": 5.208569526672363,
|
|
"epoch": 1.0196926032660902,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004904122069413105,
|
|
"loss": 4.9752,
|
|
"mean_token_accuracy": 0.21478671878576278,
|
|
"num_tokens": 24328874.0,
|
|
"step": 10615
|
|
},
|
|
{
|
|
"entropy": 5.325365495681763,
|
|
"epoch": 1.0201729106628241,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000490402303138048,
|
|
"loss": 5.1671,
|
|
"mean_token_accuracy": 0.2036365568637848,
|
|
"num_tokens": 24340918.0,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"entropy": 5.296480655670166,
|
|
"epoch": 1.0206532180595582,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004903923943337836,
|
|
"loss": 5.09,
|
|
"mean_token_accuracy": 0.20379555076360703,
|
|
"num_tokens": 24352642.0,
|
|
"step": 10625
|
|
},
|
|
{
|
|
"entropy": 5.210208606719971,
|
|
"epoch": 1.021133525456292,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004903824805287475,
|
|
"loss": 4.9669,
|
|
"mean_token_accuracy": 0.21470995843410492,
|
|
"num_tokens": 24364874.0,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"entropy": 5.171673250198364,
|
|
"epoch": 1.021613832853026,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004903725617231696,
|
|
"loss": 5.0951,
|
|
"mean_token_accuracy": 0.21187301725149155,
|
|
"num_tokens": 24376566.0,
|
|
"step": 10635
|
|
},
|
|
{
|
|
"entropy": 5.203986120223999,
|
|
"epoch": 1.0220941402497599,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004903626379172805,
|
|
"loss": 4.9178,
|
|
"mean_token_accuracy": 0.21737915873527527,
|
|
"num_tokens": 24387283.0,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"entropy": 5.284794282913208,
|
|
"epoch": 1.0225744476464937,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004903527091113102,
|
|
"loss": 5.123,
|
|
"mean_token_accuracy": 0.2050114780664444,
|
|
"num_tokens": 24397970.0,
|
|
"step": 10645
|
|
},
|
|
{
|
|
"entropy": 5.227866220474243,
|
|
"epoch": 1.0230547550432276,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004903427753054897,
|
|
"loss": 5.0674,
|
|
"mean_token_accuracy": 0.2044738933444023,
|
|
"num_tokens": 24411223.0,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"entropy": 5.2655031204223635,
|
|
"epoch": 1.0235350624399615,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004903328365000492,
|
|
"loss": 5.0926,
|
|
"mean_token_accuracy": 0.20762381106615066,
|
|
"num_tokens": 24422117.0,
|
|
"step": 10655
|
|
},
|
|
{
|
|
"entropy": 5.236895370483398,
|
|
"epoch": 1.0240153698366954,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004903228926952199,
|
|
"loss": 5.0426,
|
|
"mean_token_accuracy": 0.20284196585416794,
|
|
"num_tokens": 24434142.0,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"entropy": 5.223333692550659,
|
|
"epoch": 1.0244956772334295,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004903129438912322,
|
|
"loss": 4.9533,
|
|
"mean_token_accuracy": 0.2077935144305229,
|
|
"num_tokens": 24445295.0,
|
|
"step": 10665
|
|
},
|
|
{
|
|
"entropy": 5.216018962860107,
|
|
"epoch": 1.0249759846301634,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004903029900883174,
|
|
"loss": 5.0294,
|
|
"mean_token_accuracy": 0.21322072446346282,
|
|
"num_tokens": 24456092.0,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"entropy": 5.246077156066894,
|
|
"epoch": 1.0254562920268973,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004902930312867063,
|
|
"loss": 5.1249,
|
|
"mean_token_accuracy": 0.20178017616271973,
|
|
"num_tokens": 24467653.0,
|
|
"step": 10675
|
|
},
|
|
{
|
|
"entropy": 5.290040493011475,
|
|
"epoch": 1.0259365994236311,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004902830674866306,
|
|
"loss": 5.0763,
|
|
"mean_token_accuracy": 0.20854619145393372,
|
|
"num_tokens": 24479164.0,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"entropy": 5.184268283843994,
|
|
"epoch": 1.026416906820365,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004902730986883211,
|
|
"loss": 4.9426,
|
|
"mean_token_accuracy": 0.21813494712114334,
|
|
"num_tokens": 24489785.0,
|
|
"step": 10685
|
|
},
|
|
{
|
|
"entropy": 5.223792457580567,
|
|
"epoch": 1.026897214217099,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004902631248920096,
|
|
"loss": 5.0399,
|
|
"mean_token_accuracy": 0.20972464382648467,
|
|
"num_tokens": 24500158.0,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"entropy": 5.308423471450806,
|
|
"epoch": 1.0273775216138328,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004902531460979274,
|
|
"loss": 5.1518,
|
|
"mean_token_accuracy": 0.20060888230800628,
|
|
"num_tokens": 24512851.0,
|
|
"step": 10695
|
|
},
|
|
{
|
|
"entropy": 5.2649911403656,
|
|
"epoch": 1.0278578290105667,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004902431623063065,
|
|
"loss": 5.0938,
|
|
"mean_token_accuracy": 0.20821331739425658,
|
|
"num_tokens": 24524016.0,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"entropy": 5.1589634895324705,
|
|
"epoch": 1.0283381364073008,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004902331735173785,
|
|
"loss": 4.979,
|
|
"mean_token_accuracy": 0.2058223605155945,
|
|
"num_tokens": 24536348.0,
|
|
"step": 10705
|
|
},
|
|
{
|
|
"entropy": 5.149962663650513,
|
|
"epoch": 1.0288184438040346,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004902231797313752,
|
|
"loss": 5.0329,
|
|
"mean_token_accuracy": 0.2095619484782219,
|
|
"num_tokens": 24548718.0,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"entropy": 5.180563497543335,
|
|
"epoch": 1.0292987512007685,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004902131809485288,
|
|
"loss": 4.9461,
|
|
"mean_token_accuracy": 0.21883852481842042,
|
|
"num_tokens": 24560567.0,
|
|
"step": 10715
|
|
},
|
|
{
|
|
"entropy": 5.216690635681152,
|
|
"epoch": 1.0297790585975024,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004902031771690713,
|
|
"loss": 4.973,
|
|
"mean_token_accuracy": 0.21082175374031067,
|
|
"num_tokens": 24572610.0,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"entropy": 5.22666974067688,
|
|
"epoch": 1.0302593659942363,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004901931683932352,
|
|
"loss": 5.0303,
|
|
"mean_token_accuracy": 0.21025995314121246,
|
|
"num_tokens": 24584738.0,
|
|
"step": 10725
|
|
},
|
|
{
|
|
"entropy": 5.256303358078003,
|
|
"epoch": 1.0307396733909702,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004901831546212526,
|
|
"loss": 5.081,
|
|
"mean_token_accuracy": 0.20822075754404068,
|
|
"num_tokens": 24596603.0,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"entropy": 5.166937351226807,
|
|
"epoch": 1.031219980787704,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004901731358533562,
|
|
"loss": 4.9585,
|
|
"mean_token_accuracy": 0.21120154708623887,
|
|
"num_tokens": 24607061.0,
|
|
"step": 10735
|
|
},
|
|
{
|
|
"entropy": 5.163893556594848,
|
|
"epoch": 1.031700288184438,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004901631120897785,
|
|
"loss": 5.0366,
|
|
"mean_token_accuracy": 0.21032600998878478,
|
|
"num_tokens": 24619177.0,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"entropy": 5.19158706665039,
|
|
"epoch": 1.032180595581172,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004901530833307522,
|
|
"loss": 4.9629,
|
|
"mean_token_accuracy": 0.2108414351940155,
|
|
"num_tokens": 24631336.0,
|
|
"step": 10745
|
|
},
|
|
{
|
|
"entropy": 5.147015523910523,
|
|
"epoch": 1.032660902977906,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004901430495765103,
|
|
"loss": 4.9281,
|
|
"mean_token_accuracy": 0.22001660764217376,
|
|
"num_tokens": 24641743.0,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"entropy": 5.218332290649414,
|
|
"epoch": 1.0331412103746398,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004901330108272855,
|
|
"loss": 4.9683,
|
|
"mean_token_accuracy": 0.21641426235437394,
|
|
"num_tokens": 24652318.0,
|
|
"step": 10755
|
|
},
|
|
{
|
|
"entropy": 5.181002473831176,
|
|
"epoch": 1.0336215177713737,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004901229670833111,
|
|
"loss": 4.9866,
|
|
"mean_token_accuracy": 0.21016779094934462,
|
|
"num_tokens": 24664129.0,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"entropy": 5.25865159034729,
|
|
"epoch": 1.0341018251681076,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004901129183448201,
|
|
"loss": 5.0585,
|
|
"mean_token_accuracy": 0.20536702275276184,
|
|
"num_tokens": 24674921.0,
|
|
"step": 10765
|
|
},
|
|
{
|
|
"entropy": 5.233693790435791,
|
|
"epoch": 1.0345821325648414,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004901028646120459,
|
|
"loss": 5.0129,
|
|
"mean_token_accuracy": 0.20729674845933915,
|
|
"num_tokens": 24686052.0,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"entropy": 5.181743049621582,
|
|
"epoch": 1.0350624399615753,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000490092805885222,
|
|
"loss": 5.0101,
|
|
"mean_token_accuracy": 0.20977197587490082,
|
|
"num_tokens": 24698625.0,
|
|
"step": 10775
|
|
},
|
|
{
|
|
"entropy": 5.177609062194824,
|
|
"epoch": 1.0355427473583094,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004900827421645816,
|
|
"loss": 4.9688,
|
|
"mean_token_accuracy": 0.2193769931793213,
|
|
"num_tokens": 24709322.0,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"entropy": 5.15111927986145,
|
|
"epoch": 1.0360230547550433,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004900726734503589,
|
|
"loss": 4.9438,
|
|
"mean_token_accuracy": 0.21662437915802002,
|
|
"num_tokens": 24719512.0,
|
|
"step": 10785
|
|
},
|
|
{
|
|
"entropy": 5.16123480796814,
|
|
"epoch": 1.0365033621517772,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004900625997427872,
|
|
"loss": 4.9806,
|
|
"mean_token_accuracy": 0.21338418126106262,
|
|
"num_tokens": 24729947.0,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"entropy": 5.135127162933349,
|
|
"epoch": 1.036983669548511,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004900525210421006,
|
|
"loss": 4.9767,
|
|
"mean_token_accuracy": 0.21927962452173233,
|
|
"num_tokens": 24741423.0,
|
|
"step": 10795
|
|
},
|
|
{
|
|
"entropy": 5.223546314239502,
|
|
"epoch": 1.037463976945245,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004900424373485329,
|
|
"loss": 5.014,
|
|
"mean_token_accuracy": 0.20575396567583085,
|
|
"num_tokens": 24753403.0,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"entropy": 5.266994619369507,
|
|
"epoch": 1.0379442843419788,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004900323486623185,
|
|
"loss": 5.1261,
|
|
"mean_token_accuracy": 0.20268698483705522,
|
|
"num_tokens": 24763660.0,
|
|
"step": 10805
|
|
},
|
|
{
|
|
"entropy": 5.23413200378418,
|
|
"epoch": 1.0384245917387127,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004900222549836914,
|
|
"loss": 5.0232,
|
|
"mean_token_accuracy": 0.21380564272403718,
|
|
"num_tokens": 24775061.0,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"entropy": 5.259318685531616,
|
|
"epoch": 1.0389048991354466,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000490012156312886,
|
|
"loss": 5.0158,
|
|
"mean_token_accuracy": 0.20518611520528793,
|
|
"num_tokens": 24785248.0,
|
|
"step": 10815
|
|
},
|
|
{
|
|
"entropy": 5.128819990158081,
|
|
"epoch": 1.0393852065321807,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004900020526501369,
|
|
"loss": 4.9191,
|
|
"mean_token_accuracy": 0.21387154012918472,
|
|
"num_tokens": 24797024.0,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"entropy": 5.312929439544678,
|
|
"epoch": 1.0398655139289146,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004899919439956785,
|
|
"loss": 5.1953,
|
|
"mean_token_accuracy": 0.20268491804599761,
|
|
"num_tokens": 24808500.0,
|
|
"step": 10825
|
|
},
|
|
{
|
|
"entropy": 5.34240870475769,
|
|
"epoch": 1.0403458213256485,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004899818303497455,
|
|
"loss": 5.1314,
|
|
"mean_token_accuracy": 0.2014186292886734,
|
|
"num_tokens": 24818805.0,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"entropy": 5.268064880371094,
|
|
"epoch": 1.0408261287223823,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004899717117125728,
|
|
"loss": 5.0649,
|
|
"mean_token_accuracy": 0.20589411109685898,
|
|
"num_tokens": 24829247.0,
|
|
"step": 10835
|
|
},
|
|
{
|
|
"entropy": 5.1017598628997805,
|
|
"epoch": 1.0413064361191162,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004899615880843953,
|
|
"loss": 5.0078,
|
|
"mean_token_accuracy": 0.21258261501789094,
|
|
"num_tokens": 24840139.0,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"entropy": 5.246176147460938,
|
|
"epoch": 1.04178674351585,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004899514594654481,
|
|
"loss": 5.1039,
|
|
"mean_token_accuracy": 0.20273203402757645,
|
|
"num_tokens": 24851734.0,
|
|
"step": 10845
|
|
},
|
|
{
|
|
"entropy": 5.264043140411377,
|
|
"epoch": 1.042267050912584,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004899413258559662,
|
|
"loss": 5.0466,
|
|
"mean_token_accuracy": 0.21014518439769744,
|
|
"num_tokens": 24863424.0,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"entropy": 5.277558660507202,
|
|
"epoch": 1.0427473583093179,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004899311872561849,
|
|
"loss": 5.0636,
|
|
"mean_token_accuracy": 0.20547475218772887,
|
|
"num_tokens": 24875086.0,
|
|
"step": 10855
|
|
},
|
|
{
|
|
"entropy": 5.196067905426025,
|
|
"epoch": 1.043227665706052,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004899210436663398,
|
|
"loss": 4.9935,
|
|
"mean_token_accuracy": 0.21280764788389206,
|
|
"num_tokens": 24888408.0,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"entropy": 5.165633726119995,
|
|
"epoch": 1.0437079731027858,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004899108950866661,
|
|
"loss": 4.9365,
|
|
"mean_token_accuracy": 0.21639619767665863,
|
|
"num_tokens": 24900357.0,
|
|
"step": 10865
|
|
},
|
|
{
|
|
"entropy": 5.252479410171508,
|
|
"epoch": 1.0441882804995197,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004899007415173997,
|
|
"loss": 5.0378,
|
|
"mean_token_accuracy": 0.2107843890786171,
|
|
"num_tokens": 24910790.0,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"entropy": 5.1988269805908205,
|
|
"epoch": 1.0446685878962536,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004898905829587762,
|
|
"loss": 5.013,
|
|
"mean_token_accuracy": 0.20995523184537887,
|
|
"num_tokens": 24922124.0,
|
|
"step": 10875
|
|
},
|
|
{
|
|
"entropy": 5.104007339477539,
|
|
"epoch": 1.0451488952929875,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004898804194110313,
|
|
"loss": 4.9304,
|
|
"mean_token_accuracy": 0.21980289071798326,
|
|
"num_tokens": 24933591.0,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"entropy": 5.195203590393066,
|
|
"epoch": 1.0456292026897214,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004898702508744012,
|
|
"loss": 4.9496,
|
|
"mean_token_accuracy": 0.21639021039009093,
|
|
"num_tokens": 24944708.0,
|
|
"step": 10885
|
|
},
|
|
{
|
|
"entropy": 5.157971286773682,
|
|
"epoch": 1.0461095100864553,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004898600773491221,
|
|
"loss": 4.9176,
|
|
"mean_token_accuracy": 0.21461566239595414,
|
|
"num_tokens": 24955966.0,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"entropy": 5.254655361175537,
|
|
"epoch": 1.0465898174831891,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004898498988354297,
|
|
"loss": 5.0325,
|
|
"mean_token_accuracy": 0.21173021644353868,
|
|
"num_tokens": 24967292.0,
|
|
"step": 10895
|
|
},
|
|
{
|
|
"entropy": 5.230365228652954,
|
|
"epoch": 1.0470701248799232,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004898397153335608,
|
|
"loss": 5.0959,
|
|
"mean_token_accuracy": 0.20530790984630584,
|
|
"num_tokens": 24977407.0,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"entropy": 5.294993305206299,
|
|
"epoch": 1.0475504322766571,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004898295268437517,
|
|
"loss": 5.1541,
|
|
"mean_token_accuracy": 0.20490354150533677,
|
|
"num_tokens": 24988804.0,
|
|
"step": 10905
|
|
},
|
|
{
|
|
"entropy": 5.229137849807739,
|
|
"epoch": 1.048030739673391,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004898193333662388,
|
|
"loss": 5.0796,
|
|
"mean_token_accuracy": 0.20612839758396148,
|
|
"num_tokens": 25000297.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 5.225936555862427,
|
|
"epoch": 1.0485110470701249,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004898091349012588,
|
|
"loss": 5.0167,
|
|
"mean_token_accuracy": 0.20787729918956757,
|
|
"num_tokens": 25012135.0,
|
|
"step": 10915
|
|
},
|
|
{
|
|
"entropy": 5.138573503494262,
|
|
"epoch": 1.0489913544668588,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004897989314490486,
|
|
"loss": 4.9946,
|
|
"mean_token_accuracy": 0.21607837826013565,
|
|
"num_tokens": 25023572.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 5.2005609512329105,
|
|
"epoch": 1.0494716618635926,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004897887230098451,
|
|
"loss": 5.0626,
|
|
"mean_token_accuracy": 0.20565639436244965,
|
|
"num_tokens": 25035015.0,
|
|
"step": 10925
|
|
},
|
|
{
|
|
"entropy": 5.209970331192016,
|
|
"epoch": 1.0499519692603265,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004897785095838852,
|
|
"loss": 4.9928,
|
|
"mean_token_accuracy": 0.2115662842988968,
|
|
"num_tokens": 25045931.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 5.248309993743897,
|
|
"epoch": 1.0504322766570606,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004897682911714061,
|
|
"loss": 5.0403,
|
|
"mean_token_accuracy": 0.2143391728401184,
|
|
"num_tokens": 25056767.0,
|
|
"step": 10935
|
|
},
|
|
{
|
|
"entropy": 5.159217071533203,
|
|
"epoch": 1.0509125840537945,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000489758067772645,
|
|
"loss": 4.9852,
|
|
"mean_token_accuracy": 0.21812189370393753,
|
|
"num_tokens": 25068731.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 5.239347696304321,
|
|
"epoch": 1.0513928914505284,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004897478393878392,
|
|
"loss": 5.0752,
|
|
"mean_token_accuracy": 0.20207206010818482,
|
|
"num_tokens": 25081268.0,
|
|
"step": 10945
|
|
},
|
|
{
|
|
"entropy": 5.1585955142974855,
|
|
"epoch": 1.0518731988472623,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004897376060172264,
|
|
"loss": 4.9696,
|
|
"mean_token_accuracy": 0.2180320918560028,
|
|
"num_tokens": 25093105.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 5.193137502670288,
|
|
"epoch": 1.0523535062439962,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004897273676610438,
|
|
"loss": 4.9759,
|
|
"mean_token_accuracy": 0.21481747329235076,
|
|
"num_tokens": 25103766.0,
|
|
"step": 10955
|
|
},
|
|
{
|
|
"entropy": 5.226558351516724,
|
|
"epoch": 1.05283381364073,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004897171243195295,
|
|
"loss": 5.1226,
|
|
"mean_token_accuracy": 0.21184030324220657,
|
|
"num_tokens": 25115675.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 5.209109592437744,
|
|
"epoch": 1.053314121037464,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000489706875992921,
|
|
"loss": 4.991,
|
|
"mean_token_accuracy": 0.21007043421268462,
|
|
"num_tokens": 25127907.0,
|
|
"step": 10965
|
|
},
|
|
{
|
|
"entropy": 5.1817710399627686,
|
|
"epoch": 1.0537944284341978,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004896966226814565,
|
|
"loss": 5.0141,
|
|
"mean_token_accuracy": 0.2095083549618721,
|
|
"num_tokens": 25139675.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 5.254951429367066,
|
|
"epoch": 1.054274735830932,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004896863643853739,
|
|
"loss": 5.1364,
|
|
"mean_token_accuracy": 0.20798720717430114,
|
|
"num_tokens": 25150960.0,
|
|
"step": 10975
|
|
},
|
|
{
|
|
"entropy": 5.297211503982544,
|
|
"epoch": 1.0547550432276658,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004896761011049114,
|
|
"loss": 5.1038,
|
|
"mean_token_accuracy": 0.20611060559749603,
|
|
"num_tokens": 25163676.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 5.181599044799805,
|
|
"epoch": 1.0552353506243997,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004896658328403074,
|
|
"loss": 5.0374,
|
|
"mean_token_accuracy": 0.20499148815870286,
|
|
"num_tokens": 25174317.0,
|
|
"step": 10985
|
|
},
|
|
{
|
|
"entropy": 5.152895545959472,
|
|
"epoch": 1.0557156580211335,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004896555595918001,
|
|
"loss": 5.0424,
|
|
"mean_token_accuracy": 0.20741064995527267,
|
|
"num_tokens": 25186585.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 5.254479122161865,
|
|
"epoch": 1.0561959654178674,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004896452813596281,
|
|
"loss": 5.064,
|
|
"mean_token_accuracy": 0.20580837428569793,
|
|
"num_tokens": 25199014.0,
|
|
"step": 10995
|
|
},
|
|
{
|
|
"entropy": 5.286199140548706,
|
|
"epoch": 1.0566762728146013,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004896349981440301,
|
|
"loss": 5.1538,
|
|
"mean_token_accuracy": 0.20260929614305495,
|
|
"num_tokens": 25210544.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 5.2207067012786865,
|
|
"epoch": 1.0571565802113352,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004896247099452447,
|
|
"loss": 5.025,
|
|
"mean_token_accuracy": 0.21664920300245286,
|
|
"num_tokens": 25221583.0,
|
|
"step": 11005
|
|
},
|
|
{
|
|
"entropy": 5.203857946395874,
|
|
"epoch": 1.057636887608069,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004896144167635108,
|
|
"loss": 5.0237,
|
|
"mean_token_accuracy": 0.21649594753980636,
|
|
"num_tokens": 25231724.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 5.265408086776733,
|
|
"epoch": 1.0581171950048032,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004896041185990675,
|
|
"loss": 5.1366,
|
|
"mean_token_accuracy": 0.2034787967801094,
|
|
"num_tokens": 25243021.0,
|
|
"step": 11015
|
|
},
|
|
{
|
|
"entropy": 5.302087926864624,
|
|
"epoch": 1.058597502401537,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004895938154521538,
|
|
"loss": 5.0813,
|
|
"mean_token_accuracy": 0.20792468786239623,
|
|
"num_tokens": 25254189.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 5.238738918304444,
|
|
"epoch": 1.059077809798271,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004895835073230089,
|
|
"loss": 5.1264,
|
|
"mean_token_accuracy": 0.20500208884477616,
|
|
"num_tokens": 25265556.0,
|
|
"step": 11025
|
|
},
|
|
{
|
|
"entropy": 5.12807183265686,
|
|
"epoch": 1.0595581171950048,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004895731942118722,
|
|
"loss": 4.9421,
|
|
"mean_token_accuracy": 0.2060550183057785,
|
|
"num_tokens": 25276789.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 5.241643381118775,
|
|
"epoch": 1.0600384245917387,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004895628761189829,
|
|
"loss": 5.1203,
|
|
"mean_token_accuracy": 0.2057103246450424,
|
|
"num_tokens": 25288505.0,
|
|
"step": 11035
|
|
},
|
|
{
|
|
"entropy": 5.179819774627686,
|
|
"epoch": 1.0605187319884726,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004895525530445809,
|
|
"loss": 4.9773,
|
|
"mean_token_accuracy": 0.22087481170892714,
|
|
"num_tokens": 25301490.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 5.209347820281982,
|
|
"epoch": 1.0609990393852065,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004895422249889057,
|
|
"loss": 5.0721,
|
|
"mean_token_accuracy": 0.20252202302217484,
|
|
"num_tokens": 25313303.0,
|
|
"step": 11045
|
|
},
|
|
{
|
|
"entropy": 5.27926664352417,
|
|
"epoch": 1.0614793467819403,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004895318919521971,
|
|
"loss": 5.1309,
|
|
"mean_token_accuracy": 0.19722591042518617,
|
|
"num_tokens": 25324379.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 5.209228229522705,
|
|
"epoch": 1.0619596541786744,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004895215539346949,
|
|
"loss": 4.9702,
|
|
"mean_token_accuracy": 0.21044884771108627,
|
|
"num_tokens": 25335834.0,
|
|
"step": 11055
|
|
},
|
|
{
|
|
"entropy": 5.195343494415283,
|
|
"epoch": 1.0624399615754083,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004895112109366393,
|
|
"loss": 5.0312,
|
|
"mean_token_accuracy": 0.2079668939113617,
|
|
"num_tokens": 25347591.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 5.194220972061157,
|
|
"epoch": 1.0629202689721422,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004895008629582703,
|
|
"loss": 5.0066,
|
|
"mean_token_accuracy": 0.21342374235391617,
|
|
"num_tokens": 25358483.0,
|
|
"step": 11065
|
|
},
|
|
{
|
|
"entropy": 5.19577956199646,
|
|
"epoch": 1.063400576368876,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004894905099998283,
|
|
"loss": 5.0158,
|
|
"mean_token_accuracy": 0.20696393847465516,
|
|
"num_tokens": 25369434.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 5.223496198654175,
|
|
"epoch": 1.06388088376561,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004894801520615535,
|
|
"loss": 5.0318,
|
|
"mean_token_accuracy": 0.21212296783924103,
|
|
"num_tokens": 25381007.0,
|
|
"step": 11075
|
|
},
|
|
{
|
|
"entropy": 5.193694734573365,
|
|
"epoch": 1.0643611911623438,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004894697891436863,
|
|
"loss": 5.0148,
|
|
"mean_token_accuracy": 0.20833683609962464,
|
|
"num_tokens": 25393809.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 5.198791122436523,
|
|
"epoch": 1.0648414985590777,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004894594212464676,
|
|
"loss": 5.0451,
|
|
"mean_token_accuracy": 0.21562531143426894,
|
|
"num_tokens": 25404967.0,
|
|
"step": 11085
|
|
},
|
|
{
|
|
"entropy": 5.2563148021698,
|
|
"epoch": 1.0653218059558118,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004894490483701381,
|
|
"loss": 5.0122,
|
|
"mean_token_accuracy": 0.21502433270215987,
|
|
"num_tokens": 25417092.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 5.2914710521698,
|
|
"epoch": 1.0658021133525457,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004894386705149382,
|
|
"loss": 5.1036,
|
|
"mean_token_accuracy": 0.2005739152431488,
|
|
"num_tokens": 25428425.0,
|
|
"step": 11095
|
|
},
|
|
{
|
|
"entropy": 5.151738977432251,
|
|
"epoch": 1.0662824207492796,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004894282876811093,
|
|
"loss": 4.9347,
|
|
"mean_token_accuracy": 0.21947899460792542,
|
|
"num_tokens": 25440134.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 5.143119049072266,
|
|
"epoch": 1.0667627281460135,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004894178998688921,
|
|
"loss": 5.0003,
|
|
"mean_token_accuracy": 0.21364154070615768,
|
|
"num_tokens": 25452222.0,
|
|
"step": 11105
|
|
},
|
|
{
|
|
"entropy": 5.19956374168396,
|
|
"epoch": 1.0672430355427474,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004894075070785281,
|
|
"loss": 5.0462,
|
|
"mean_token_accuracy": 0.2120614990592003,
|
|
"num_tokens": 25464541.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 5.293523740768433,
|
|
"epoch": 1.0677233429394812,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004893971093102585,
|
|
"loss": 5.0531,
|
|
"mean_token_accuracy": 0.19972920715808867,
|
|
"num_tokens": 25476537.0,
|
|
"step": 11115
|
|
},
|
|
{
|
|
"entropy": 5.360321044921875,
|
|
"epoch": 1.0682036503362151,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004893867065643245,
|
|
"loss": 5.1091,
|
|
"mean_token_accuracy": 0.20334839224815368,
|
|
"num_tokens": 25486737.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 5.192300510406494,
|
|
"epoch": 1.068683957732949,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004893762988409678,
|
|
"loss": 5.0534,
|
|
"mean_token_accuracy": 0.20364596098661422,
|
|
"num_tokens": 25497278.0,
|
|
"step": 11125
|
|
},
|
|
{
|
|
"entropy": 5.208005428314209,
|
|
"epoch": 1.069164265129683,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004893658861404301,
|
|
"loss": 5.0304,
|
|
"mean_token_accuracy": 0.2047014966607094,
|
|
"num_tokens": 25508716.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 5.282382202148438,
|
|
"epoch": 1.069644572526417,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004893554684629529,
|
|
"loss": 5.1053,
|
|
"mean_token_accuracy": 0.20216587483882903,
|
|
"num_tokens": 25519439.0,
|
|
"step": 11135
|
|
},
|
|
{
|
|
"entropy": 5.184592771530151,
|
|
"epoch": 1.0701248799231509,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004893450458087784,
|
|
"loss": 4.9136,
|
|
"mean_token_accuracy": 0.2212449848651886,
|
|
"num_tokens": 25530911.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 5.250302124023437,
|
|
"epoch": 1.0706051873198847,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004893346181781483,
|
|
"loss": 5.149,
|
|
"mean_token_accuracy": 0.1957184687256813,
|
|
"num_tokens": 25542452.0,
|
|
"step": 11145
|
|
},
|
|
{
|
|
"entropy": 5.269041585922241,
|
|
"epoch": 1.0710854947166186,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004893241855713048,
|
|
"loss": 5.0786,
|
|
"mean_token_accuracy": 0.19805409461259843,
|
|
"num_tokens": 25554105.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 5.312788391113282,
|
|
"epoch": 1.0715658021133525,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004893137479884903,
|
|
"loss": 5.1134,
|
|
"mean_token_accuracy": 0.20513837188482284,
|
|
"num_tokens": 25564806.0,
|
|
"step": 11155
|
|
},
|
|
{
|
|
"entropy": 5.186615085601806,
|
|
"epoch": 1.0720461095100864,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004893033054299468,
|
|
"loss": 5.0038,
|
|
"mean_token_accuracy": 0.21586932092905045,
|
|
"num_tokens": 25575664.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 5.158471345901489,
|
|
"epoch": 1.0725264169068203,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000489292857895917,
|
|
"loss": 4.9254,
|
|
"mean_token_accuracy": 0.21706438809633255,
|
|
"num_tokens": 25586227.0,
|
|
"step": 11165
|
|
},
|
|
{
|
|
"entropy": 5.24803352355957,
|
|
"epoch": 1.0730067243035544,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004892824053866432,
|
|
"loss": 5.1114,
|
|
"mean_token_accuracy": 0.20555976331233977,
|
|
"num_tokens": 25597475.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 5.169841670989991,
|
|
"epoch": 1.0734870317002883,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004892719479023683,
|
|
"loss": 4.9757,
|
|
"mean_token_accuracy": 0.220069320499897,
|
|
"num_tokens": 25608098.0,
|
|
"step": 11175
|
|
},
|
|
{
|
|
"entropy": 5.268892574310303,
|
|
"epoch": 1.0739673390970221,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000489261485443335,
|
|
"loss": 5.101,
|
|
"mean_token_accuracy": 0.20311392694711686,
|
|
"num_tokens": 25620053.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 5.293476009368897,
|
|
"epoch": 1.074447646493756,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004892510180097863,
|
|
"loss": 5.0365,
|
|
"mean_token_accuracy": 0.2041410133242607,
|
|
"num_tokens": 25630534.0,
|
|
"step": 11185
|
|
},
|
|
{
|
|
"entropy": 5.236781454086303,
|
|
"epoch": 1.07492795389049,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004892405456019651,
|
|
"loss": 5.0553,
|
|
"mean_token_accuracy": 0.20958582758903505,
|
|
"num_tokens": 25641413.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 5.12468638420105,
|
|
"epoch": 1.0754082612872238,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004892300682201147,
|
|
"loss": 4.9719,
|
|
"mean_token_accuracy": 0.21782579123973847,
|
|
"num_tokens": 25652081.0,
|
|
"step": 11195
|
|
},
|
|
{
|
|
"entropy": 5.232947635650635,
|
|
"epoch": 1.0758885686839577,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004892195858644782,
|
|
"loss": 5.0749,
|
|
"mean_token_accuracy": 0.21364531815052032,
|
|
"num_tokens": 25664282.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 5.29130368232727,
|
|
"epoch": 1.0763688760806915,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000489209098535299,
|
|
"loss": 5.1488,
|
|
"mean_token_accuracy": 0.2104579210281372,
|
|
"num_tokens": 25675310.0,
|
|
"step": 11205
|
|
},
|
|
{
|
|
"entropy": 5.364002227783203,
|
|
"epoch": 1.0768491834774256,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004891986062328205,
|
|
"loss": 5.125,
|
|
"mean_token_accuracy": 0.21008958518505097,
|
|
"num_tokens": 25686895.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 5.1909034729003904,
|
|
"epoch": 1.0773294908741595,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004891881089572865,
|
|
"loss": 4.9291,
|
|
"mean_token_accuracy": 0.21216631978750228,
|
|
"num_tokens": 25697778.0,
|
|
"step": 11215
|
|
},
|
|
{
|
|
"entropy": 5.194598817825318,
|
|
"epoch": 1.0778097982708934,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004891776067089406,
|
|
"loss": 5.0898,
|
|
"mean_token_accuracy": 0.20165782868862153,
|
|
"num_tokens": 25708602.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 5.248305320739746,
|
|
"epoch": 1.0782901056676273,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004891670994880266,
|
|
"loss": 4.9873,
|
|
"mean_token_accuracy": 0.21111140102148057,
|
|
"num_tokens": 25719671.0,
|
|
"step": 11225
|
|
},
|
|
{
|
|
"entropy": 5.291704702377319,
|
|
"epoch": 1.0787704130643612,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004891565872947888,
|
|
"loss": 5.1287,
|
|
"mean_token_accuracy": 0.2033605992794037,
|
|
"num_tokens": 25731797.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 5.131606006622315,
|
|
"epoch": 1.079250720461095,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004891460701294706,
|
|
"loss": 4.9989,
|
|
"mean_token_accuracy": 0.2117511048913002,
|
|
"num_tokens": 25743984.0,
|
|
"step": 11235
|
|
},
|
|
{
|
|
"entropy": 5.14411768913269,
|
|
"epoch": 1.079731027857829,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004891355479923167,
|
|
"loss": 5.0089,
|
|
"mean_token_accuracy": 0.21176680326461791,
|
|
"num_tokens": 25755252.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 5.19481086730957,
|
|
"epoch": 1.080211335254563,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004891250208835712,
|
|
"loss": 4.9358,
|
|
"mean_token_accuracy": 0.21151957362890245,
|
|
"num_tokens": 25765715.0,
|
|
"step": 11245
|
|
},
|
|
{
|
|
"entropy": 5.282035970687867,
|
|
"epoch": 1.080691642651297,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004891144888034784,
|
|
"loss": 5.062,
|
|
"mean_token_accuracy": 0.2050844192504883,
|
|
"num_tokens": 25777866.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 5.208021640777588,
|
|
"epoch": 1.0811719500480308,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004891039517522832,
|
|
"loss": 4.9757,
|
|
"mean_token_accuracy": 0.21298900246620178,
|
|
"num_tokens": 25791199.0,
|
|
"step": 11255
|
|
},
|
|
{
|
|
"entropy": 5.092755365371704,
|
|
"epoch": 1.0816522574447647,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004890934097302299,
|
|
"loss": 4.9385,
|
|
"mean_token_accuracy": 0.2256488636136055,
|
|
"num_tokens": 25802979.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 5.082042789459228,
|
|
"epoch": 1.0821325648414986,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004890828627375632,
|
|
"loss": 4.9459,
|
|
"mean_token_accuracy": 0.2113230675458908,
|
|
"num_tokens": 25814696.0,
|
|
"step": 11265
|
|
},
|
|
{
|
|
"entropy": 5.222199535369873,
|
|
"epoch": 1.0826128722382324,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004890723107745283,
|
|
"loss": 5.0634,
|
|
"mean_token_accuracy": 0.20086456686258317,
|
|
"num_tokens": 25825376.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 5.239181756973267,
|
|
"epoch": 1.0830931796349663,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004890617538413699,
|
|
"loss": 5.0172,
|
|
"mean_token_accuracy": 0.21430771350860595,
|
|
"num_tokens": 25835491.0,
|
|
"step": 11275
|
|
},
|
|
{
|
|
"entropy": 5.273404932022094,
|
|
"epoch": 1.0835734870317002,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004890511919383333,
|
|
"loss": 5.0797,
|
|
"mean_token_accuracy": 0.20752860009670257,
|
|
"num_tokens": 25848154.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 5.189536762237549,
|
|
"epoch": 1.084053794428434,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004890406250656636,
|
|
"loss": 5.0563,
|
|
"mean_token_accuracy": 0.20726050287485123,
|
|
"num_tokens": 25859471.0,
|
|
"step": 11285
|
|
},
|
|
{
|
|
"entropy": 5.1854105472564695,
|
|
"epoch": 1.0845341018251682,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004890300532236062,
|
|
"loss": 4.9667,
|
|
"mean_token_accuracy": 0.21461206972599028,
|
|
"num_tokens": 25869460.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 5.1921216487884525,
|
|
"epoch": 1.085014409221902,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004890194764124064,
|
|
"loss": 4.9847,
|
|
"mean_token_accuracy": 0.21348736435174942,
|
|
"num_tokens": 25881892.0,
|
|
"step": 11295
|
|
},
|
|
{
|
|
"entropy": 5.274989652633667,
|
|
"epoch": 1.085494716618636,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004890088946323099,
|
|
"loss": 5.1136,
|
|
"mean_token_accuracy": 0.20064806640148164,
|
|
"num_tokens": 25893774.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 5.141970014572143,
|
|
"epoch": 1.0859750240153698,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004889983078835623,
|
|
"loss": 4.924,
|
|
"mean_token_accuracy": 0.21487925201654434,
|
|
"num_tokens": 25904758.0,
|
|
"step": 11305
|
|
},
|
|
{
|
|
"entropy": 5.145872449874878,
|
|
"epoch": 1.0864553314121037,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004889877161664096,
|
|
"loss": 5.0486,
|
|
"mean_token_accuracy": 0.20480419993400573,
|
|
"num_tokens": 25917013.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 5.349794626235962,
|
|
"epoch": 1.0869356388088376,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004889771194810974,
|
|
"loss": 5.1048,
|
|
"mean_token_accuracy": 0.2101388841867447,
|
|
"num_tokens": 25927780.0,
|
|
"step": 11315
|
|
},
|
|
{
|
|
"entropy": 5.162079620361328,
|
|
"epoch": 1.0874159462055715,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004889665178278719,
|
|
"loss": 4.99,
|
|
"mean_token_accuracy": 0.21398296654224397,
|
|
"num_tokens": 25939339.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 5.184692430496216,
|
|
"epoch": 1.0878962536023056,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004889559112069792,
|
|
"loss": 4.9803,
|
|
"mean_token_accuracy": 0.21223179250955582,
|
|
"num_tokens": 25950440.0,
|
|
"step": 11325
|
|
},
|
|
{
|
|
"entropy": 5.23531174659729,
|
|
"epoch": 1.0883765609990395,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004889452996186657,
|
|
"loss": 5.0247,
|
|
"mean_token_accuracy": 0.20834243446588516,
|
|
"num_tokens": 25962849.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 5.221828603744507,
|
|
"epoch": 1.0888568683957733,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004889346830631774,
|
|
"loss": 5.0695,
|
|
"mean_token_accuracy": 0.20499206930398942,
|
|
"num_tokens": 25973616.0,
|
|
"step": 11335
|
|
},
|
|
{
|
|
"entropy": 5.175124979019165,
|
|
"epoch": 1.0893371757925072,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000488924061540761,
|
|
"loss": 4.9543,
|
|
"mean_token_accuracy": 0.21570177525281906,
|
|
"num_tokens": 25984727.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 5.218035411834717,
|
|
"epoch": 1.089817483189241,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004889134350516633,
|
|
"loss": 5.069,
|
|
"mean_token_accuracy": 0.20992496013641357,
|
|
"num_tokens": 25996431.0,
|
|
"step": 11345
|
|
},
|
|
{
|
|
"entropy": 5.199566984176636,
|
|
"epoch": 1.090297790585975,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004889028035961308,
|
|
"loss": 5.0212,
|
|
"mean_token_accuracy": 0.20875319093465805,
|
|
"num_tokens": 26008936.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 5.28378643989563,
|
|
"epoch": 1.0907780979827089,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004888921671744103,
|
|
"loss": 5.0843,
|
|
"mean_token_accuracy": 0.20148587226867676,
|
|
"num_tokens": 26019308.0,
|
|
"step": 11355
|
|
},
|
|
{
|
|
"entropy": 5.234175491333008,
|
|
"epoch": 1.0912584053794427,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004888815257867488,
|
|
"loss": 4.9794,
|
|
"mean_token_accuracy": 0.21648937463760376,
|
|
"num_tokens": 26030705.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 5.227200984954834,
|
|
"epoch": 1.0917387127761768,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004888708794333934,
|
|
"loss": 5.0079,
|
|
"mean_token_accuracy": 0.21071529090404512,
|
|
"num_tokens": 26042759.0,
|
|
"step": 11365
|
|
},
|
|
{
|
|
"entropy": 5.233187103271485,
|
|
"epoch": 1.0922190201729107,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004888602281145913,
|
|
"loss": 5.0673,
|
|
"mean_token_accuracy": 0.20930221676826477,
|
|
"num_tokens": 26054719.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 5.210502481460571,
|
|
"epoch": 1.0926993275696446,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004888495718305897,
|
|
"loss": 5.0531,
|
|
"mean_token_accuracy": 0.20732715278863906,
|
|
"num_tokens": 26065765.0,
|
|
"step": 11375
|
|
},
|
|
{
|
|
"entropy": 5.092276668548584,
|
|
"epoch": 1.0931796349663785,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000488838910581636,
|
|
"loss": 4.9681,
|
|
"mean_token_accuracy": 0.21723177582025527,
|
|
"num_tokens": 26077719.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 5.271825551986694,
|
|
"epoch": 1.0936599423631124,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004888282443679777,
|
|
"loss": 5.1021,
|
|
"mean_token_accuracy": 0.1978613868355751,
|
|
"num_tokens": 26089924.0,
|
|
"step": 11385
|
|
},
|
|
{
|
|
"entropy": 5.389468097686768,
|
|
"epoch": 1.0941402497598463,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004888175731898627,
|
|
"loss": 5.1585,
|
|
"mean_token_accuracy": 0.20117444396018982,
|
|
"num_tokens": 26100312.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 5.196537494659424,
|
|
"epoch": 1.0946205571565801,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004888068970475384,
|
|
"loss": 5.0671,
|
|
"mean_token_accuracy": 0.21175539195537568,
|
|
"num_tokens": 26111932.0,
|
|
"step": 11395
|
|
},
|
|
{
|
|
"entropy": 5.233985948562622,
|
|
"epoch": 1.0951008645533142,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004887962159412529,
|
|
"loss": 4.9669,
|
|
"mean_token_accuracy": 0.2145277202129364,
|
|
"num_tokens": 26123989.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 5.24011116027832,
|
|
"epoch": 1.0955811719500481,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004887855298712541,
|
|
"loss": 5.0822,
|
|
"mean_token_accuracy": 0.2078133523464203,
|
|
"num_tokens": 26135589.0,
|
|
"step": 11405
|
|
},
|
|
{
|
|
"entropy": 5.1291491985321045,
|
|
"epoch": 1.096061479346782,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00048877483883779,
|
|
"loss": 4.9785,
|
|
"mean_token_accuracy": 0.21442267745733262,
|
|
"num_tokens": 26147069.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 5.19802622795105,
|
|
"epoch": 1.0965417867435159,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004887641428411091,
|
|
"loss": 5.0536,
|
|
"mean_token_accuracy": 0.2031223937869072,
|
|
"num_tokens": 26159331.0,
|
|
"step": 11415
|
|
},
|
|
{
|
|
"entropy": 5.239735078811646,
|
|
"epoch": 1.0970220941402498,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004887534418814595,
|
|
"loss": 5.0489,
|
|
"mean_token_accuracy": 0.21215286552906037,
|
|
"num_tokens": 26169863.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 5.201609802246094,
|
|
"epoch": 1.0975024015369836,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004887427359590897,
|
|
"loss": 5.018,
|
|
"mean_token_accuracy": 0.20545354038476943,
|
|
"num_tokens": 26182888.0,
|
|
"step": 11425
|
|
},
|
|
{
|
|
"entropy": 5.196053218841553,
|
|
"epoch": 1.0979827089337175,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004887320250742482,
|
|
"loss": 5.0074,
|
|
"mean_token_accuracy": 0.21169717162847518,
|
|
"num_tokens": 26194979.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 5.163520240783692,
|
|
"epoch": 1.0984630163304514,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004887213092271838,
|
|
"loss": 5.0171,
|
|
"mean_token_accuracy": 0.218392214179039,
|
|
"num_tokens": 26207309.0,
|
|
"step": 11435
|
|
},
|
|
{
|
|
"entropy": 5.180823183059692,
|
|
"epoch": 1.0989433237271853,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004887105884181451,
|
|
"loss": 5.0562,
|
|
"mean_token_accuracy": 0.20584176182746888,
|
|
"num_tokens": 26219231.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 5.28138575553894,
|
|
"epoch": 1.0994236311239194,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004886998626473813,
|
|
"loss": 5.0914,
|
|
"mean_token_accuracy": 0.2082364484667778,
|
|
"num_tokens": 26229355.0,
|
|
"step": 11445
|
|
},
|
|
{
|
|
"entropy": 5.259874248504639,
|
|
"epoch": 1.0999039385206533,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004886891319151411,
|
|
"loss": 4.9917,
|
|
"mean_token_accuracy": 0.21067868769168854,
|
|
"num_tokens": 26239069.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 5.232890796661377,
|
|
"epoch": 1.1003842459173871,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004886783962216738,
|
|
"loss": 5.1051,
|
|
"mean_token_accuracy": 0.20159524232149123,
|
|
"num_tokens": 26250403.0,
|
|
"step": 11455
|
|
},
|
|
{
|
|
"entropy": 5.2542445182800295,
|
|
"epoch": 1.100864553314121,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004886676555672287,
|
|
"loss": 5.0155,
|
|
"mean_token_accuracy": 0.20787968933582307,
|
|
"num_tokens": 26262926.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 5.3055487155914305,
|
|
"epoch": 1.101344860710855,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004886569099520551,
|
|
"loss": 5.1355,
|
|
"mean_token_accuracy": 0.2017137423157692,
|
|
"num_tokens": 26274030.0,
|
|
"step": 11465
|
|
},
|
|
{
|
|
"entropy": 5.146465301513672,
|
|
"epoch": 1.1018251681075888,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004886461593764024,
|
|
"loss": 5.0465,
|
|
"mean_token_accuracy": 0.21580926030874253,
|
|
"num_tokens": 26284799.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 5.112147951126099,
|
|
"epoch": 1.1023054755043227,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004886354038405204,
|
|
"loss": 4.9238,
|
|
"mean_token_accuracy": 0.22144615203142165,
|
|
"num_tokens": 26295154.0,
|
|
"step": 11475
|
|
},
|
|
{
|
|
"entropy": 5.2879190921783445,
|
|
"epoch": 1.1027857829010568,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004886246433446586,
|
|
"loss": 5.0586,
|
|
"mean_token_accuracy": 0.20436291843652726,
|
|
"num_tokens": 26306181.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 5.296609544754029,
|
|
"epoch": 1.1032660902977907,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004886138778890669,
|
|
"loss": 5.1495,
|
|
"mean_token_accuracy": 0.19853242188692094,
|
|
"num_tokens": 26318674.0,
|
|
"step": 11485
|
|
},
|
|
{
|
|
"entropy": 5.262785196304321,
|
|
"epoch": 1.1037463976945245,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004886031074739953,
|
|
"loss": 5.0698,
|
|
"mean_token_accuracy": 0.2011367380619049,
|
|
"num_tokens": 26330257.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 5.268162727355957,
|
|
"epoch": 1.1042267050912584,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004885923320996938,
|
|
"loss": 5.0199,
|
|
"mean_token_accuracy": 0.20887107402086258,
|
|
"num_tokens": 26342035.0,
|
|
"step": 11495
|
|
},
|
|
{
|
|
"entropy": 5.2100663661956785,
|
|
"epoch": 1.1047070124879923,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004885815517664127,
|
|
"loss": 5.0453,
|
|
"mean_token_accuracy": 0.20797477215528487,
|
|
"num_tokens": 26352703.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 5.280159616470337,
|
|
"epoch": 1.1051873198847262,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000488570766474402,
|
|
"loss": 5.0525,
|
|
"mean_token_accuracy": 0.2123723268508911,
|
|
"num_tokens": 26362324.0,
|
|
"step": 11505
|
|
},
|
|
{
|
|
"entropy": 5.240962123870849,
|
|
"epoch": 1.10566762728146,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004885599762239124,
|
|
"loss": 5.0814,
|
|
"mean_token_accuracy": 0.20114895701408386,
|
|
"num_tokens": 26373540.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 5.198315954208374,
|
|
"epoch": 1.106147934678194,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004885491810151943,
|
|
"loss": 5.0311,
|
|
"mean_token_accuracy": 0.21315819025039673,
|
|
"num_tokens": 26384259.0,
|
|
"step": 11515
|
|
},
|
|
{
|
|
"entropy": 5.164407539367676,
|
|
"epoch": 1.106628242074928,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004885383808484982,
|
|
"loss": 5.0093,
|
|
"mean_token_accuracy": 0.2147809937596321,
|
|
"num_tokens": 26395193.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 5.166898584365844,
|
|
"epoch": 1.107108549471662,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004885275757240751,
|
|
"loss": 4.9888,
|
|
"mean_token_accuracy": 0.21306061148643493,
|
|
"num_tokens": 26408556.0,
|
|
"step": 11525
|
|
},
|
|
{
|
|
"entropy": 5.181211996078491,
|
|
"epoch": 1.1075888568683958,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004885167656421757,
|
|
"loss": 5.0399,
|
|
"mean_token_accuracy": 0.21158163398504257,
|
|
"num_tokens": 26420066.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 5.251511573791504,
|
|
"epoch": 1.1080691642651297,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000488505950603051,
|
|
"loss": 5.0044,
|
|
"mean_token_accuracy": 0.21201496720314025,
|
|
"num_tokens": 26432533.0,
|
|
"step": 11535
|
|
},
|
|
{
|
|
"entropy": 5.220172452926636,
|
|
"epoch": 1.1085494716618636,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000488495130606952,
|
|
"loss": 5.0101,
|
|
"mean_token_accuracy": 0.20595130324363708,
|
|
"num_tokens": 26443878.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 5.173053646087647,
|
|
"epoch": 1.1090297790585975,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004884843056541302,
|
|
"loss": 5.0053,
|
|
"mean_token_accuracy": 0.20983056724071503,
|
|
"num_tokens": 26455111.0,
|
|
"step": 11545
|
|
},
|
|
{
|
|
"entropy": 5.2253295421600345,
|
|
"epoch": 1.1095100864553313,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004884734757448367,
|
|
"loss": 5.0401,
|
|
"mean_token_accuracy": 0.2117287129163742,
|
|
"num_tokens": 26466577.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 5.212938976287842,
|
|
"epoch": 1.1099903938520654,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000488462640879323,
|
|
"loss": 4.9652,
|
|
"mean_token_accuracy": 0.21090029329061508,
|
|
"num_tokens": 26479637.0,
|
|
"step": 11555
|
|
},
|
|
{
|
|
"entropy": 5.146299743652344,
|
|
"epoch": 1.1104707012487993,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004884518010578405,
|
|
"loss": 5.0053,
|
|
"mean_token_accuracy": 0.2093895897269249,
|
|
"num_tokens": 26489923.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 5.262247848510742,
|
|
"epoch": 1.1109510086455332,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004884409562806411,
|
|
"loss": 5.107,
|
|
"mean_token_accuracy": 0.2048266798257828,
|
|
"num_tokens": 26501236.0,
|
|
"step": 11565
|
|
},
|
|
{
|
|
"entropy": 5.210489082336426,
|
|
"epoch": 1.111431316042267,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004884301065479765,
|
|
"loss": 4.9631,
|
|
"mean_token_accuracy": 0.2114759638905525,
|
|
"num_tokens": 26511509.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 5.148941612243652,
|
|
"epoch": 1.111911623439001,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004884192518600986,
|
|
"loss": 4.9407,
|
|
"mean_token_accuracy": 0.2189345121383667,
|
|
"num_tokens": 26520931.0,
|
|
"step": 11575
|
|
},
|
|
{
|
|
"entropy": 5.10898380279541,
|
|
"epoch": 1.1123919308357348,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004884083922172593,
|
|
"loss": 4.9487,
|
|
"mean_token_accuracy": 0.2212027356028557,
|
|
"num_tokens": 26531333.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 5.150489377975464,
|
|
"epoch": 1.1128722382324687,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004883975276197108,
|
|
"loss": 5.0245,
|
|
"mean_token_accuracy": 0.2141410857439041,
|
|
"num_tokens": 26543696.0,
|
|
"step": 11585
|
|
},
|
|
{
|
|
"entropy": 5.21362886428833,
|
|
"epoch": 1.1133525456292026,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004883866580677055,
|
|
"loss": 4.9893,
|
|
"mean_token_accuracy": 0.21139197200536727,
|
|
"num_tokens": 26556292.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 5.244434595108032,
|
|
"epoch": 1.1138328530259365,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004883757835614956,
|
|
"loss": 5.0254,
|
|
"mean_token_accuracy": 0.2002588540315628,
|
|
"num_tokens": 26568023.0,
|
|
"step": 11595
|
|
},
|
|
{
|
|
"entropy": 5.1676887512207035,
|
|
"epoch": 1.1143131604226706,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004883649041013335,
|
|
"loss": 5.0,
|
|
"mean_token_accuracy": 0.20850686728954315,
|
|
"num_tokens": 26579469.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 5.249563598632813,
|
|
"epoch": 1.1147934678194045,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000488354019687472,
|
|
"loss": 5.0175,
|
|
"mean_token_accuracy": 0.20751053243875503,
|
|
"num_tokens": 26591592.0,
|
|
"step": 11605
|
|
},
|
|
{
|
|
"entropy": 5.24787974357605,
|
|
"epoch": 1.1152737752161384,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004883431303201636,
|
|
"loss": 5.0213,
|
|
"mean_token_accuracy": 0.2119702085852623,
|
|
"num_tokens": 26604075.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 5.092301654815674,
|
|
"epoch": 1.1157540826128722,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004883322359996613,
|
|
"loss": 4.9402,
|
|
"mean_token_accuracy": 0.21203264445066453,
|
|
"num_tokens": 26616448.0,
|
|
"step": 11615
|
|
},
|
|
{
|
|
"entropy": 5.232432794570923,
|
|
"epoch": 1.1162343900096061,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004883213367262179,
|
|
"loss": 5.014,
|
|
"mean_token_accuracy": 0.21606809943914412,
|
|
"num_tokens": 26628749.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 5.1754385471344,
|
|
"epoch": 1.11671469740634,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004883104325000866,
|
|
"loss": 4.9661,
|
|
"mean_token_accuracy": 0.21770241409540175,
|
|
"num_tokens": 26640163.0,
|
|
"step": 11625
|
|
},
|
|
{
|
|
"entropy": 5.214389657974243,
|
|
"epoch": 1.1171950048030739,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004882995233215203,
|
|
"loss": 5.0279,
|
|
"mean_token_accuracy": 0.21030279397964477,
|
|
"num_tokens": 26650729.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 5.1765196323394775,
|
|
"epoch": 1.117675312199808,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004882886091907726,
|
|
"loss": 5.0681,
|
|
"mean_token_accuracy": 0.20886261761188507,
|
|
"num_tokens": 26661995.0,
|
|
"step": 11635
|
|
},
|
|
{
|
|
"entropy": 5.166051578521729,
|
|
"epoch": 1.1181556195965419,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048827769010809666,
|
|
"loss": 4.9625,
|
|
"mean_token_accuracy": 0.21373932361602782,
|
|
"num_tokens": 26673224.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 5.2252014636993405,
|
|
"epoch": 1.1186359269932757,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048826676607374606,
|
|
"loss": 4.9973,
|
|
"mean_token_accuracy": 0.20560778081417083,
|
|
"num_tokens": 26686331.0,
|
|
"step": 11645
|
|
},
|
|
{
|
|
"entropy": 5.253619718551636,
|
|
"epoch": 1.1191162343900096,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048825583708797434,
|
|
"loss": 5.0623,
|
|
"mean_token_accuracy": 0.2126183569431305,
|
|
"num_tokens": 26696816.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 5.149569368362426,
|
|
"epoch": 1.1195965417867435,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004882449031510354,
|
|
"loss": 5.0084,
|
|
"mean_token_accuracy": 0.21252903193235398,
|
|
"num_tokens": 26708126.0,
|
|
"step": 11655
|
|
},
|
|
{
|
|
"entropy": 5.2515003204345705,
|
|
"epoch": 1.1200768491834774,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000488233964263183,
|
|
"loss": 5.1267,
|
|
"mean_token_accuracy": 0.20479959100484849,
|
|
"num_tokens": 26718951.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 5.24708366394043,
|
|
"epoch": 1.1205571565802113,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048822302042467115,
|
|
"loss": 5.0769,
|
|
"mean_token_accuracy": 0.20536175221204758,
|
|
"num_tokens": 26730550.0,
|
|
"step": 11665
|
|
},
|
|
{
|
|
"entropy": 5.11295714378357,
|
|
"epoch": 1.1210374639769451,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004882120716357539,
|
|
"loss": 4.9113,
|
|
"mean_token_accuracy": 0.21777141392230986,
|
|
"num_tokens": 26741485.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 5.075635814666748,
|
|
"epoch": 1.1215177713736793,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004882011178966854,
|
|
"loss": 4.9079,
|
|
"mean_token_accuracy": 0.2191823497414589,
|
|
"num_tokens": 26753947.0,
|
|
"step": 11675
|
|
},
|
|
{
|
|
"entropy": 5.206958436965943,
|
|
"epoch": 1.1219980787704131,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004881901592077201,
|
|
"loss": 4.9939,
|
|
"mean_token_accuracy": 0.2158915787935257,
|
|
"num_tokens": 26764921.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 5.221724176406861,
|
|
"epoch": 1.122478386167147,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004881791955691123,
|
|
"loss": 4.9439,
|
|
"mean_token_accuracy": 0.21361148059368135,
|
|
"num_tokens": 26777384.0,
|
|
"step": 11685
|
|
},
|
|
{
|
|
"entropy": 5.2075098037719725,
|
|
"epoch": 1.122958693563881,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048816822698111655,
|
|
"loss": 5.07,
|
|
"mean_token_accuracy": 0.20504006147384643,
|
|
"num_tokens": 26789916.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 5.194676733016967,
|
|
"epoch": 1.1234390009606148,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048815725344398766,
|
|
"loss": 4.9768,
|
|
"mean_token_accuracy": 0.2126043662428856,
|
|
"num_tokens": 26801167.0,
|
|
"step": 11695
|
|
},
|
|
{
|
|
"entropy": 5.290380001068115,
|
|
"epoch": 1.1239193083573487,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00048814627495798017,
|
|
"loss": 5.0949,
|
|
"mean_token_accuracy": 0.20276835262775422,
|
|
"num_tokens": 26813235.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 5.283368635177612,
|
|
"epoch": 1.1243996157540825,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004881352915233492,
|
|
"loss": 5.0815,
|
|
"mean_token_accuracy": 0.2050356462597847,
|
|
"num_tokens": 26824758.0,
|
|
"step": 11705
|
|
},
|
|
{
|
|
"entropy": 5.222380495071411,
|
|
"epoch": 1.1248799231508164,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048812430314034956,
|
|
"loss": 5.0459,
|
|
"mean_token_accuracy": 0.20643949508666992,
|
|
"num_tokens": 26836877.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 5.146471929550171,
|
|
"epoch": 1.1253602305475505,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004881133098092365,
|
|
"loss": 4.8847,
|
|
"mean_token_accuracy": 0.22436288893222808,
|
|
"num_tokens": 26848394.0,
|
|
"step": 11715
|
|
},
|
|
{
|
|
"entropy": 5.169895315170288,
|
|
"epoch": 1.1258405379442844,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004881023115302652,
|
|
"loss": 5.0248,
|
|
"mean_token_accuracy": 0.20842950493097306,
|
|
"num_tokens": 26859064.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 5.259196662902832,
|
|
"epoch": 1.1263208453410183,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000488091308303691,
|
|
"loss": 5.1436,
|
|
"mean_token_accuracy": 0.19910948574543,
|
|
"num_tokens": 26870401.0,
|
|
"step": 11725
|
|
},
|
|
{
|
|
"entropy": 5.256836318969727,
|
|
"epoch": 1.1268011527377522,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004880803001297694,
|
|
"loss": 5.0458,
|
|
"mean_token_accuracy": 0.21228888928890227,
|
|
"num_tokens": 26881767.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 5.177135944366455,
|
|
"epoch": 1.127281460134486,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004880692870087559,
|
|
"loss": 4.9447,
|
|
"mean_token_accuracy": 0.21096309274435043,
|
|
"num_tokens": 26892854.0,
|
|
"step": 11735
|
|
},
|
|
{
|
|
"entropy": 5.290387105941773,
|
|
"epoch": 1.12776176753122,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00048805826894090626,
|
|
"loss": 5.1022,
|
|
"mean_token_accuracy": 0.2025263249874115,
|
|
"num_tokens": 26905079.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 5.267297887802124,
|
|
"epoch": 1.1282420749279538,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048804724592647626,
|
|
"loss": 5.1096,
|
|
"mean_token_accuracy": 0.205536325275898,
|
|
"num_tokens": 26915640.0,
|
|
"step": 11745
|
|
},
|
|
{
|
|
"entropy": 5.2140075206756595,
|
|
"epoch": 1.1287223823246877,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004880362179657218,
|
|
"loss": 5.0008,
|
|
"mean_token_accuracy": 0.2080523982644081,
|
|
"num_tokens": 26927039.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 5.185867214202881,
|
|
"epoch": 1.1292026897214218,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048802518505889904,
|
|
"loss": 4.9986,
|
|
"mean_token_accuracy": 0.21176180839538575,
|
|
"num_tokens": 26939368.0,
|
|
"step": 11755
|
|
},
|
|
{
|
|
"entropy": 5.182089567184448,
|
|
"epoch": 1.1296829971181557,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048801414720626404,
|
|
"loss": 5.1614,
|
|
"mean_token_accuracy": 0.19189264625310898,
|
|
"num_tokens": 26952051.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 5.198432683944702,
|
|
"epoch": 1.1301633045148896,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048800310440807294,
|
|
"loss": 4.9895,
|
|
"mean_token_accuracy": 0.20608253926038742,
|
|
"num_tokens": 26963680.0,
|
|
"step": 11765
|
|
},
|
|
{
|
|
"entropy": 5.098900985717774,
|
|
"epoch": 1.1306436119116234,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004879920566645823,
|
|
"loss": 4.8958,
|
|
"mean_token_accuracy": 0.22000947147607802,
|
|
"num_tokens": 26973899.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 5.164202928543091,
|
|
"epoch": 1.1311239193083573,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004879810039760486,
|
|
"loss": 5.0344,
|
|
"mean_token_accuracy": 0.20242914706468582,
|
|
"num_tokens": 26985692.0,
|
|
"step": 11775
|
|
},
|
|
{
|
|
"entropy": 5.224712562561035,
|
|
"epoch": 1.1316042267050912,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004879699463427284,
|
|
"loss": 4.9784,
|
|
"mean_token_accuracy": 0.21051635444164277,
|
|
"num_tokens": 26996354.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 5.236588096618652,
|
|
"epoch": 1.132084534101825,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004879588837648785,
|
|
"loss": 5.0068,
|
|
"mean_token_accuracy": 0.20998671054840087,
|
|
"num_tokens": 27007925.0,
|
|
"step": 11785
|
|
},
|
|
{
|
|
"entropy": 5.186428022384644,
|
|
"epoch": 1.1325648414985592,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048794781624275554,
|
|
"loss": 4.975,
|
|
"mean_token_accuracy": 0.2176157593727112,
|
|
"num_tokens": 27019222.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 5.125696468353271,
|
|
"epoch": 1.133045148895293,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00048793674377661664,
|
|
"loss": 4.9674,
|
|
"mean_token_accuracy": 0.21915102750062943,
|
|
"num_tokens": 27029973.0,
|
|
"step": 11795
|
|
},
|
|
{
|
|
"entropy": 5.233280372619629,
|
|
"epoch": 1.133525456292027,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00048792566636671886,
|
|
"loss": 5.1306,
|
|
"mean_token_accuracy": 0.2096991240978241,
|
|
"num_tokens": 27042002.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 5.178752517700195,
|
|
"epoch": 1.1340057636887608,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004879145840133194,
|
|
"loss": 5.0308,
|
|
"mean_token_accuracy": 0.21618867963552474,
|
|
"num_tokens": 27052770.0,
|
|
"step": 11805
|
|
},
|
|
{
|
|
"entropy": 5.270467710494995,
|
|
"epoch": 1.1344860710854947,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004879034967166755,
|
|
"loss": 5.0632,
|
|
"mean_token_accuracy": 0.20635210424661637,
|
|
"num_tokens": 27063375.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 5.201555490493774,
|
|
"epoch": 1.1349663784822286,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004878924044770446,
|
|
"loss": 5.0013,
|
|
"mean_token_accuracy": 0.20745259374380112,
|
|
"num_tokens": 27075908.0,
|
|
"step": 11815
|
|
},
|
|
{
|
|
"entropy": 5.1736366748809814,
|
|
"epoch": 1.1354466858789625,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004878813072946843,
|
|
"loss": 5.0447,
|
|
"mean_token_accuracy": 0.21043994426727294,
|
|
"num_tokens": 27087590.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 5.225687408447266,
|
|
"epoch": 1.1359269932756964,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048787020516985203,
|
|
"loss": 4.9593,
|
|
"mean_token_accuracy": 0.218149633705616,
|
|
"num_tokens": 27098076.0,
|
|
"step": 11825
|
|
},
|
|
{
|
|
"entropy": 5.2235795021057125,
|
|
"epoch": 1.1364073006724302,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00048785909810280576,
|
|
"loss": 4.9734,
|
|
"mean_token_accuracy": 0.2160875007510185,
|
|
"num_tokens": 27109851.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 5.222425508499145,
|
|
"epoch": 1.1368876080691643,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004878479860938033,
|
|
"loss": 4.9228,
|
|
"mean_token_accuracy": 0.21766222417354583,
|
|
"num_tokens": 27121288.0,
|
|
"step": 11835
|
|
},
|
|
{
|
|
"entropy": 5.195209598541259,
|
|
"epoch": 1.1373679154658982,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048783686914310266,
|
|
"loss": 5.0172,
|
|
"mean_token_accuracy": 0.21215075105428696,
|
|
"num_tokens": 27133877.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 5.063023233413697,
|
|
"epoch": 1.137848222862632,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004878257472509619,
|
|
"loss": 4.8351,
|
|
"mean_token_accuracy": 0.22125699520111083,
|
|
"num_tokens": 27145616.0,
|
|
"step": 11845
|
|
},
|
|
{
|
|
"entropy": 5.107744407653809,
|
|
"epoch": 1.138328530259366,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004878146204176392,
|
|
"loss": 4.9853,
|
|
"mean_token_accuracy": 0.20994766801595688,
|
|
"num_tokens": 27157182.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 5.2036010265350345,
|
|
"epoch": 1.1388088376560999,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000487803488643393,
|
|
"loss": 4.9386,
|
|
"mean_token_accuracy": 0.2160127192735672,
|
|
"num_tokens": 27167609.0,
|
|
"step": 11855
|
|
},
|
|
{
|
|
"entropy": 5.24319167137146,
|
|
"epoch": 1.1392891450528337,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048779235192848166,
|
|
"loss": 5.1012,
|
|
"mean_token_accuracy": 0.2044668361544609,
|
|
"num_tokens": 27180043.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 5.169292831420899,
|
|
"epoch": 1.1397694524495678,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004877812102731638,
|
|
"loss": 4.8945,
|
|
"mean_token_accuracy": 0.22032397091388703,
|
|
"num_tokens": 27191870.0,
|
|
"step": 11865
|
|
},
|
|
{
|
|
"entropy": 5.157991981506347,
|
|
"epoch": 1.1402497598463017,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048777006367769804,
|
|
"loss": 5.0222,
|
|
"mean_token_accuracy": 0.213824962079525,
|
|
"num_tokens": 27203291.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 5.2055253982543945,
|
|
"epoch": 1.1407300672430356,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004877589121423432,
|
|
"loss": 5.041,
|
|
"mean_token_accuracy": 0.21266603320837021,
|
|
"num_tokens": 27214607.0,
|
|
"step": 11875
|
|
},
|
|
{
|
|
"entropy": 5.185617160797119,
|
|
"epoch": 1.1412103746397695,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004877477556673582,
|
|
"loss": 4.9066,
|
|
"mean_token_accuracy": 0.21550966054201126,
|
|
"num_tokens": 27224658.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 5.1661797046661375,
|
|
"epoch": 1.1416906820365034,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000487736594253002,
|
|
"loss": 5.0204,
|
|
"mean_token_accuracy": 0.20654748678207396,
|
|
"num_tokens": 27235306.0,
|
|
"step": 11885
|
|
},
|
|
{
|
|
"entropy": 5.276842164993286,
|
|
"epoch": 1.1421709894332372,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00048772542789953384,
|
|
"loss": 5.0314,
|
|
"mean_token_accuracy": 0.20629957020282746,
|
|
"num_tokens": 27246679.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 5.28350601196289,
|
|
"epoch": 1.1426512968299711,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048771425660721284,
|
|
"loss": 5.065,
|
|
"mean_token_accuracy": 0.20193494856357574,
|
|
"num_tokens": 27257799.0,
|
|
"step": 11895
|
|
},
|
|
{
|
|
"entropy": 5.124622106552124,
|
|
"epoch": 1.143131604226705,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048770308037629853,
|
|
"loss": 5.0324,
|
|
"mean_token_accuracy": 0.2094832718372345,
|
|
"num_tokens": 27268651.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 5.176864957809448,
|
|
"epoch": 1.143611911623439,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004876918992070502,
|
|
"loss": 5.0004,
|
|
"mean_token_accuracy": 0.2177245572209358,
|
|
"num_tokens": 27280655.0,
|
|
"step": 11905
|
|
},
|
|
{
|
|
"entropy": 5.243468952178955,
|
|
"epoch": 1.144092219020173,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004876807130997276,
|
|
"loss": 5.0788,
|
|
"mean_token_accuracy": 0.20621824115514756,
|
|
"num_tokens": 27292664.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 5.224751567840576,
|
|
"epoch": 1.1445725264169069,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004876695220545903,
|
|
"loss": 5.0655,
|
|
"mean_token_accuracy": 0.20803812742233277,
|
|
"num_tokens": 27306143.0,
|
|
"step": 11915
|
|
},
|
|
{
|
|
"entropy": 5.201717710494995,
|
|
"epoch": 1.1450528338136408,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048765832607189824,
|
|
"loss": 4.9712,
|
|
"mean_token_accuracy": 0.2136443629860878,
|
|
"num_tokens": 27316926.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 5.28920431137085,
|
|
"epoch": 1.1455331412103746,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00048764712515191136,
|
|
"loss": 5.0737,
|
|
"mean_token_accuracy": 0.2188516676425934,
|
|
"num_tokens": 27327472.0,
|
|
"step": 11925
|
|
},
|
|
{
|
|
"entropy": 5.269708442687988,
|
|
"epoch": 1.1460134486071085,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048763591929488966,
|
|
"loss": 5.045,
|
|
"mean_token_accuracy": 0.21446569710969926,
|
|
"num_tokens": 27338388.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 5.159726858139038,
|
|
"epoch": 1.1464937560038424,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004876247085010933,
|
|
"loss": 4.9518,
|
|
"mean_token_accuracy": 0.2131276786327362,
|
|
"num_tokens": 27349879.0,
|
|
"step": 11935
|
|
},
|
|
{
|
|
"entropy": 5.11401858329773,
|
|
"epoch": 1.1469740634005763,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00048761349277078253,
|
|
"loss": 4.9533,
|
|
"mean_token_accuracy": 0.2188461974263191,
|
|
"num_tokens": 27361226.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 5.213291311264038,
|
|
"epoch": 1.1474543707973104,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00048760227210421775,
|
|
"loss": 4.9345,
|
|
"mean_token_accuracy": 0.2213941693305969,
|
|
"num_tokens": 27373512.0,
|
|
"step": 11945
|
|
},
|
|
{
|
|
"entropy": 5.145865774154663,
|
|
"epoch": 1.1479346781940443,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004875910465016596,
|
|
"loss": 5.0049,
|
|
"mean_token_accuracy": 0.21635116934776305,
|
|
"num_tokens": 27384950.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 5.295030307769776,
|
|
"epoch": 1.1484149855907781,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004875798159633686,
|
|
"loss": 5.0487,
|
|
"mean_token_accuracy": 0.20899975001811982,
|
|
"num_tokens": 27395877.0,
|
|
"step": 11955
|
|
},
|
|
{
|
|
"entropy": 5.202318477630615,
|
|
"epoch": 1.148895292987512,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004875685804896055,
|
|
"loss": 5.0986,
|
|
"mean_token_accuracy": 0.21304885745048524,
|
|
"num_tokens": 27407783.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 5.181471586227417,
|
|
"epoch": 1.149375600384246,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004875573400806312,
|
|
"loss": 4.9887,
|
|
"mean_token_accuracy": 0.21124700605869293,
|
|
"num_tokens": 27420598.0,
|
|
"step": 11965
|
|
},
|
|
{
|
|
"entropy": 5.237709093093872,
|
|
"epoch": 1.1498559077809798,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00048754609473670654,
|
|
"loss": 5.0651,
|
|
"mean_token_accuracy": 0.21090197712182998,
|
|
"num_tokens": 27431961.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 5.21086540222168,
|
|
"epoch": 1.1503362151777137,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004875348444580927,
|
|
"loss": 4.9156,
|
|
"mean_token_accuracy": 0.22040790617465972,
|
|
"num_tokens": 27444010.0,
|
|
"step": 11975
|
|
},
|
|
{
|
|
"entropy": 5.2328328609466555,
|
|
"epoch": 1.1508165225744476,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000487523589245051,
|
|
"loss": 4.9771,
|
|
"mean_token_accuracy": 0.2139630988240242,
|
|
"num_tokens": 27455576.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 5.184505844116211,
|
|
"epoch": 1.1512968299711814,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004875123290978425,
|
|
"loss": 5.0403,
|
|
"mean_token_accuracy": 0.20502331107854843,
|
|
"num_tokens": 27468957.0,
|
|
"step": 11985
|
|
},
|
|
{
|
|
"entropy": 5.2058931350708,
|
|
"epoch": 1.1517771373679155,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048750106401672876,
|
|
"loss": 4.9986,
|
|
"mean_token_accuracy": 0.21766173243522643,
|
|
"num_tokens": 27479992.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 5.238912153244018,
|
|
"epoch": 1.1522574447646494,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00048748979400197134,
|
|
"loss": 5.0967,
|
|
"mean_token_accuracy": 0.20631994754076005,
|
|
"num_tokens": 27490878.0,
|
|
"step": 11995
|
|
},
|
|
{
|
|
"entropy": 5.145661306381226,
|
|
"epoch": 1.1527377521613833,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00048747851905383183,
|
|
"loss": 4.9302,
|
|
"mean_token_accuracy": 0.21805770546197892,
|
|
"num_tokens": 27502009.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 1.1527377521613833,
|
|
"eval_entropy": 5.072871884969547,
|
|
"eval_loss": 5.132204532623291,
|
|
"eval_mean_token_accuracy": 0.21277229704311537,
|
|
"eval_num_tokens": 27502009.0,
|
|
"eval_runtime": 26.6332,
|
|
"eval_samples_per_second": 1232.108,
|
|
"eval_steps_per_second": 154.018,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"entropy": 5.252287817001343,
|
|
"epoch": 1.1532180595581172,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004874672391725721,
|
|
"loss": 5.1089,
|
|
"mean_token_accuracy": 0.20380218029022218,
|
|
"num_tokens": 27513376.0,
|
|
"step": 12005
|
|
},
|
|
{
|
|
"entropy": 5.117784070968628,
|
|
"epoch": 1.153698366954851,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004874559543584539,
|
|
"loss": 4.9715,
|
|
"mean_token_accuracy": 0.2118404433131218,
|
|
"num_tokens": 27525166.0,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"entropy": 5.092162704467773,
|
|
"epoch": 1.154178674351585,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004874446646117394,
|
|
"loss": 4.8865,
|
|
"mean_token_accuracy": 0.226571424305439,
|
|
"num_tokens": 27535994.0,
|
|
"step": 12015
|
|
},
|
|
{
|
|
"entropy": 5.154507493972778,
|
|
"epoch": 1.154658981748319,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004874333699326906,
|
|
"loss": 4.9842,
|
|
"mean_token_accuracy": 0.213858063519001,
|
|
"num_tokens": 27546883.0,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"entropy": 5.312671184539795,
|
|
"epoch": 1.155139289145053,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004874220703215697,
|
|
"loss": 5.0903,
|
|
"mean_token_accuracy": 0.20375553965568544,
|
|
"num_tokens": 27558840.0,
|
|
"step": 12025
|
|
},
|
|
{
|
|
"entropy": 5.188431692123413,
|
|
"epoch": 1.1556195965417868,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004874107657786391,
|
|
"loss": 5.0142,
|
|
"mean_token_accuracy": 0.2109085887670517,
|
|
"num_tokens": 27569776.0,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"entropy": 5.199657583236695,
|
|
"epoch": 1.1560999039385207,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00048739945630416124,
|
|
"loss": 4.9845,
|
|
"mean_token_accuracy": 0.20919086486101152,
|
|
"num_tokens": 27580097.0,
|
|
"step": 12035
|
|
},
|
|
{
|
|
"entropy": 5.2678807258605955,
|
|
"epoch": 1.1565802113352546,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004873881418983987,
|
|
"loss": 5.1066,
|
|
"mean_token_accuracy": 0.21011823117733003,
|
|
"num_tokens": 27592035.0,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"entropy": 5.2372087955474855,
|
|
"epoch": 1.1570605187319885,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004873768225616141,
|
|
"loss": 5.0383,
|
|
"mean_token_accuracy": 0.21346299797296525,
|
|
"num_tokens": 27604409.0,
|
|
"step": 12045
|
|
},
|
|
{
|
|
"entropy": 5.251517963409424,
|
|
"epoch": 1.1575408261287223,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048736549829407047,
|
|
"loss": 4.9845,
|
|
"mean_token_accuracy": 0.21415583789348602,
|
|
"num_tokens": 27615019.0,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"entropy": 5.244622087478637,
|
|
"epoch": 1.1580211335254562,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004873541690960305,
|
|
"loss": 5.1,
|
|
"mean_token_accuracy": 0.2044678211212158,
|
|
"num_tokens": 27627924.0,
|
|
"step": 12055
|
|
},
|
|
{
|
|
"entropy": 5.12835431098938,
|
|
"epoch": 1.15850144092219,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004873428349677573,
|
|
"loss": 4.925,
|
|
"mean_token_accuracy": 0.21388751715421678,
|
|
"num_tokens": 27639188.0,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"entropy": 5.273781251907349,
|
|
"epoch": 1.1589817483189242,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000487331495909514,
|
|
"loss": 5.1448,
|
|
"mean_token_accuracy": 0.2023579403758049,
|
|
"num_tokens": 27652621.0,
|
|
"step": 12065
|
|
},
|
|
{
|
|
"entropy": 5.239981460571289,
|
|
"epoch": 1.159462055715658,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00048732015192156383,
|
|
"loss": 5.0773,
|
|
"mean_token_accuracy": 0.20497591197490692,
|
|
"num_tokens": 27665726.0,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"entropy": 5.234362506866455,
|
|
"epoch": 1.159942363112392,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00048730880300417015,
|
|
"loss": 5.0152,
|
|
"mean_token_accuracy": 0.20888158231973647,
|
|
"num_tokens": 27676984.0,
|
|
"step": 12075
|
|
},
|
|
{
|
|
"entropy": 5.258095026016235,
|
|
"epoch": 1.1604226705091258,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048729744915759657,
|
|
"loss": 5.0926,
|
|
"mean_token_accuracy": 0.20860619992017745,
|
|
"num_tokens": 27688812.0,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"entropy": 5.133250331878662,
|
|
"epoch": 1.1609029779058597,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048728609038210655,
|
|
"loss": 4.8731,
|
|
"mean_token_accuracy": 0.21741154789924622,
|
|
"num_tokens": 27699152.0,
|
|
"step": 12085
|
|
},
|
|
{
|
|
"entropy": 5.181648588180542,
|
|
"epoch": 1.1613832853025936,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00048727472667796395,
|
|
"loss": 5.063,
|
|
"mean_token_accuracy": 0.20912941545248032,
|
|
"num_tokens": 27710347.0,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"entropy": 5.160825157165528,
|
|
"epoch": 1.1618635926993275,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004872633580454325,
|
|
"loss": 4.9654,
|
|
"mean_token_accuracy": 0.21775319874286653,
|
|
"num_tokens": 27721112.0,
|
|
"step": 12095
|
|
},
|
|
{
|
|
"entropy": 5.271908760070801,
|
|
"epoch": 1.1623439000960616,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048725198448477616,
|
|
"loss": 5.0553,
|
|
"mean_token_accuracy": 0.20950192213058472,
|
|
"num_tokens": 27731766.0,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"entropy": 5.173818635940552,
|
|
"epoch": 1.1628242074927955,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048724060599625893,
|
|
"loss": 4.9923,
|
|
"mean_token_accuracy": 0.2151247590780258,
|
|
"num_tokens": 27743718.0,
|
|
"step": 12105
|
|
},
|
|
{
|
|
"entropy": 5.192879867553711,
|
|
"epoch": 1.1633045148895294,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048722922258014506,
|
|
"loss": 5.0514,
|
|
"mean_token_accuracy": 0.20927377343177794,
|
|
"num_tokens": 27754999.0,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"entropy": 5.269402647018433,
|
|
"epoch": 1.1637848222862632,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004872178342366989,
|
|
"loss": 5.1583,
|
|
"mean_token_accuracy": 0.20009388625621796,
|
|
"num_tokens": 27767684.0,
|
|
"step": 12115
|
|
},
|
|
{
|
|
"entropy": 5.148327445983886,
|
|
"epoch": 1.1642651296829971,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048720644096618475,
|
|
"loss": 5.0272,
|
|
"mean_token_accuracy": 0.21265908777713777,
|
|
"num_tokens": 27779103.0,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"entropy": 5.183534049987793,
|
|
"epoch": 1.164745437079731,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004871950427688672,
|
|
"loss": 5.0281,
|
|
"mean_token_accuracy": 0.21695935279130935,
|
|
"num_tokens": 27791223.0,
|
|
"step": 12125
|
|
},
|
|
{
|
|
"entropy": 5.227220249176026,
|
|
"epoch": 1.1652257444764649,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048718363964501087,
|
|
"loss": 4.964,
|
|
"mean_token_accuracy": 0.2172775998711586,
|
|
"num_tokens": 27802660.0,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"entropy": 5.253418016433716,
|
|
"epoch": 1.1657060518731988,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004871722315948805,
|
|
"loss": 5.0586,
|
|
"mean_token_accuracy": 0.20542750507593155,
|
|
"num_tokens": 27814370.0,
|
|
"step": 12135
|
|
},
|
|
{
|
|
"entropy": 5.152675437927246,
|
|
"epoch": 1.1661863592699326,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004871608186187408,
|
|
"loss": 4.9636,
|
|
"mean_token_accuracy": 0.2136980563402176,
|
|
"num_tokens": 27825035.0,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"entropy": 5.11957426071167,
|
|
"epoch": 1.1666666666666667,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048714940071685703,
|
|
"loss": 4.8978,
|
|
"mean_token_accuracy": 0.21549834907054902,
|
|
"num_tokens": 27837276.0,
|
|
"step": 12145
|
|
},
|
|
{
|
|
"entropy": 5.30233063697815,
|
|
"epoch": 1.1671469740634006,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048713797788949405,
|
|
"loss": 5.0647,
|
|
"mean_token_accuracy": 0.2075889676809311,
|
|
"num_tokens": 27848618.0,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"entropy": 5.188276481628418,
|
|
"epoch": 1.1676272814601345,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048712655013691714,
|
|
"loss": 5.0438,
|
|
"mean_token_accuracy": 0.20078416913747787,
|
|
"num_tokens": 27861556.0,
|
|
"step": 12155
|
|
},
|
|
{
|
|
"entropy": 5.225617361068726,
|
|
"epoch": 1.1681075888568684,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048711511745939165,
|
|
"loss": 5.0115,
|
|
"mean_token_accuracy": 0.20684807151556014,
|
|
"num_tokens": 27873211.0,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"entropy": 5.213997268676758,
|
|
"epoch": 1.1685878962536023,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000487103679857183,
|
|
"loss": 5.0288,
|
|
"mean_token_accuracy": 0.20902891159057618,
|
|
"num_tokens": 27883365.0,
|
|
"step": 12165
|
|
},
|
|
{
|
|
"entropy": 5.114966011047363,
|
|
"epoch": 1.1690682036503361,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004870922373305567,
|
|
"loss": 4.9032,
|
|
"mean_token_accuracy": 0.21823573112487793,
|
|
"num_tokens": 27894669.0,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"entropy": 5.189022970199585,
|
|
"epoch": 1.1695485110470702,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048708078987977837,
|
|
"loss": 5.0406,
|
|
"mean_token_accuracy": 0.21348860412836074,
|
|
"num_tokens": 27906480.0,
|
|
"step": 12175
|
|
},
|
|
{
|
|
"entropy": 5.1677796840667725,
|
|
"epoch": 1.1700288184438041,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048706933750511394,
|
|
"loss": 4.9761,
|
|
"mean_token_accuracy": 0.21937694698572158,
|
|
"num_tokens": 27918383.0,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"entropy": 5.246155738830566,
|
|
"epoch": 1.170509125840538,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004870578802068292,
|
|
"loss": 5.0332,
|
|
"mean_token_accuracy": 0.2065381273627281,
|
|
"num_tokens": 27928944.0,
|
|
"step": 12185
|
|
},
|
|
{
|
|
"entropy": 5.170810222625732,
|
|
"epoch": 1.170989433237272,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048704641798519006,
|
|
"loss": 4.9503,
|
|
"mean_token_accuracy": 0.21335744559764863,
|
|
"num_tokens": 27941105.0,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"entropy": 5.2019225597381595,
|
|
"epoch": 1.1714697406340058,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048703495084046286,
|
|
"loss": 4.9969,
|
|
"mean_token_accuracy": 0.2134275645017624,
|
|
"num_tokens": 27952925.0,
|
|
"step": 12195
|
|
},
|
|
{
|
|
"entropy": 5.109914255142212,
|
|
"epoch": 1.1719500480307397,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004870234787729137,
|
|
"loss": 4.9838,
|
|
"mean_token_accuracy": 0.21968378871679306,
|
|
"num_tokens": 27965504.0,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"entropy": 5.158345079421997,
|
|
"epoch": 1.1724303554274735,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004870120017828089,
|
|
"loss": 4.9624,
|
|
"mean_token_accuracy": 0.21934993118047713,
|
|
"num_tokens": 27975985.0,
|
|
"step": 12205
|
|
},
|
|
{
|
|
"entropy": 5.340363693237305,
|
|
"epoch": 1.1729106628242074,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000487000519870415,
|
|
"loss": 5.0612,
|
|
"mean_token_accuracy": 0.20156388878822326,
|
|
"num_tokens": 27987784.0,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"entropy": 5.214973592758179,
|
|
"epoch": 1.1733909702209413,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004869890330359986,
|
|
"loss": 4.9831,
|
|
"mean_token_accuracy": 0.207270847260952,
|
|
"num_tokens": 27999623.0,
|
|
"step": 12215
|
|
},
|
|
{
|
|
"entropy": 5.146721315383911,
|
|
"epoch": 1.1738712776176754,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004869775412798262,
|
|
"loss": 4.9998,
|
|
"mean_token_accuracy": 0.20671399533748627,
|
|
"num_tokens": 28011561.0,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"entropy": 5.233490705490112,
|
|
"epoch": 1.1743515850144093,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048696604460216476,
|
|
"loss": 5.0422,
|
|
"mean_token_accuracy": 0.21980682760477066,
|
|
"num_tokens": 28022108.0,
|
|
"step": 12225
|
|
},
|
|
{
|
|
"entropy": 5.276953649520874,
|
|
"epoch": 1.1748318924111432,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048695454300328123,
|
|
"loss": 5.0175,
|
|
"mean_token_accuracy": 0.20736639499664306,
|
|
"num_tokens": 28033497.0,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"entropy": 5.237717533111573,
|
|
"epoch": 1.175312199807877,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048694303648344256,
|
|
"loss": 5.0456,
|
|
"mean_token_accuracy": 0.20967191308736802,
|
|
"num_tokens": 28044790.0,
|
|
"step": 12235
|
|
},
|
|
{
|
|
"entropy": 5.111946868896484,
|
|
"epoch": 1.175792507204611,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048693152504291595,
|
|
"loss": 4.9879,
|
|
"mean_token_accuracy": 0.21664219200611115,
|
|
"num_tokens": 28056000.0,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"entropy": 5.18005404472351,
|
|
"epoch": 1.1762728146013448,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004869200086819686,
|
|
"loss": 5.0213,
|
|
"mean_token_accuracy": 0.21006689369678497,
|
|
"num_tokens": 28068264.0,
|
|
"step": 12245
|
|
},
|
|
{
|
|
"entropy": 5.234067392349243,
|
|
"epoch": 1.1767531219980787,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048690848740086796,
|
|
"loss": 4.9192,
|
|
"mean_token_accuracy": 0.21703227013349533,
|
|
"num_tokens": 28080220.0,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"entropy": 5.251984262466431,
|
|
"epoch": 1.1772334293948128,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004868969611998814,
|
|
"loss": 4.9671,
|
|
"mean_token_accuracy": 0.21270408034324645,
|
|
"num_tokens": 28091531.0,
|
|
"step": 12255
|
|
},
|
|
{
|
|
"entropy": 5.154574251174926,
|
|
"epoch": 1.1777137367915467,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004868854300792767,
|
|
"loss": 4.9726,
|
|
"mean_token_accuracy": 0.20649342983961105,
|
|
"num_tokens": 28102982.0,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"entropy": 5.172384786605835,
|
|
"epoch": 1.1781940441882806,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048687389403932144,
|
|
"loss": 5.0406,
|
|
"mean_token_accuracy": 0.21188410818576814,
|
|
"num_tokens": 28114131.0,
|
|
"step": 12265
|
|
},
|
|
{
|
|
"entropy": 5.2116370677948,
|
|
"epoch": 1.1786743515850144,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004868623530802835,
|
|
"loss": 4.9634,
|
|
"mean_token_accuracy": 0.21050270646810532,
|
|
"num_tokens": 28125637.0,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"entropy": 5.274893808364868,
|
|
"epoch": 1.1791546589817483,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048685080720243086,
|
|
"loss": 5.0384,
|
|
"mean_token_accuracy": 0.21438082605600356,
|
|
"num_tokens": 28137284.0,
|
|
"step": 12275
|
|
},
|
|
{
|
|
"entropy": 5.185189199447632,
|
|
"epoch": 1.1796349663784822,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004868392564060315,
|
|
"loss": 5.0397,
|
|
"mean_token_accuracy": 0.21561664193868638,
|
|
"num_tokens": 28148909.0,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"entropy": 5.227141571044922,
|
|
"epoch": 1.180115273775216,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004868277006913537,
|
|
"loss": 5.0201,
|
|
"mean_token_accuracy": 0.21342774629592895,
|
|
"num_tokens": 28160392.0,
|
|
"step": 12285
|
|
},
|
|
{
|
|
"entropy": 5.2492955207824705,
|
|
"epoch": 1.18059558117195,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004868161400586656,
|
|
"loss": 5.0965,
|
|
"mean_token_accuracy": 0.20555120557546616,
|
|
"num_tokens": 28172957.0,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"entropy": 5.334953641891479,
|
|
"epoch": 1.1810758885686838,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004868045745082357,
|
|
"loss": 5.0748,
|
|
"mean_token_accuracy": 0.20318131595849992,
|
|
"num_tokens": 28183239.0,
|
|
"step": 12295
|
|
},
|
|
{
|
|
"entropy": 5.188865756988525,
|
|
"epoch": 1.181556195965418,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004867930040403326,
|
|
"loss": 4.9936,
|
|
"mean_token_accuracy": 0.21321745961904526,
|
|
"num_tokens": 28195382.0,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"entropy": 5.176113891601562,
|
|
"epoch": 1.1820365033621518,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00048678142865522475,
|
|
"loss": 5.0722,
|
|
"mean_token_accuracy": 0.2028682142496109,
|
|
"num_tokens": 28206645.0,
|
|
"step": 12305
|
|
},
|
|
{
|
|
"entropy": 5.261957263946533,
|
|
"epoch": 1.1825168107588857,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000486769848353181,
|
|
"loss": 5.0623,
|
|
"mean_token_accuracy": 0.21030078679323197,
|
|
"num_tokens": 28218420.0,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"entropy": 5.234085512161255,
|
|
"epoch": 1.1829971181556196,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048675826313447027,
|
|
"loss": 5.0647,
|
|
"mean_token_accuracy": 0.20770383477211,
|
|
"num_tokens": 28229458.0,
|
|
"step": 12315
|
|
},
|
|
{
|
|
"entropy": 5.270634365081787,
|
|
"epoch": 1.1834774255523535,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048674667299936135,
|
|
"loss": 5.0538,
|
|
"mean_token_accuracy": 0.2074426531791687,
|
|
"num_tokens": 28242240.0,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"entropy": 5.380625152587891,
|
|
"epoch": 1.1839577329490873,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048673507794812356,
|
|
"loss": 5.139,
|
|
"mean_token_accuracy": 0.20491064041852952,
|
|
"num_tokens": 28254597.0,
|
|
"step": 12325
|
|
},
|
|
{
|
|
"entropy": 5.217261886596679,
|
|
"epoch": 1.1844380403458212,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004867234779810259,
|
|
"loss": 5.0605,
|
|
"mean_token_accuracy": 0.20772763192653657,
|
|
"num_tokens": 28266674.0,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"entropy": 5.174903488159179,
|
|
"epoch": 1.1849183477425553,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004867118730983378,
|
|
"loss": 5.0235,
|
|
"mean_token_accuracy": 0.213454669713974,
|
|
"num_tokens": 28278671.0,
|
|
"step": 12335
|
|
},
|
|
{
|
|
"entropy": 5.22681565284729,
|
|
"epoch": 1.1853986551392892,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004867002633003286,
|
|
"loss": 4.9512,
|
|
"mean_token_accuracy": 0.21354973167181016,
|
|
"num_tokens": 28291085.0,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"entropy": 5.146066761016845,
|
|
"epoch": 1.185878962536023,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000486688648587268,
|
|
"loss": 5.0309,
|
|
"mean_token_accuracy": 0.215641950070858,
|
|
"num_tokens": 28303457.0,
|
|
"step": 12345
|
|
},
|
|
{
|
|
"entropy": 5.223130226135254,
|
|
"epoch": 1.186359269932757,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004866770289594256,
|
|
"loss": 5.0391,
|
|
"mean_token_accuracy": 0.20597289353609086,
|
|
"num_tokens": 28314599.0,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"entropy": 5.3448234558105465,
|
|
"epoch": 1.1868395773294909,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048666540441707107,
|
|
"loss": 5.1315,
|
|
"mean_token_accuracy": 0.2068374440073967,
|
|
"num_tokens": 28326266.0,
|
|
"step": 12355
|
|
},
|
|
{
|
|
"entropy": 5.251500129699707,
|
|
"epoch": 1.1873198847262247,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004866537749604744,
|
|
"loss": 4.9853,
|
|
"mean_token_accuracy": 0.207984322309494,
|
|
"num_tokens": 28337238.0,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"entropy": 5.157985210418701,
|
|
"epoch": 1.1878001921229586,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048664214058990546,
|
|
"loss": 4.9818,
|
|
"mean_token_accuracy": 0.21365345120429993,
|
|
"num_tokens": 28348060.0,
|
|
"step": 12365
|
|
},
|
|
{
|
|
"entropy": 5.145942068099975,
|
|
"epoch": 1.1882804995196925,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004866305013056346,
|
|
"loss": 5.0371,
|
|
"mean_token_accuracy": 0.2051353007555008,
|
|
"num_tokens": 28359442.0,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"entropy": 5.244502449035645,
|
|
"epoch": 1.1887608069164266,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004866188571079318,
|
|
"loss": 5.0663,
|
|
"mean_token_accuracy": 0.2088254600763321,
|
|
"num_tokens": 28370827.0,
|
|
"step": 12375
|
|
},
|
|
{
|
|
"entropy": 5.345438051223755,
|
|
"epoch": 1.1892411143131605,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004866072079970676,
|
|
"loss": 5.0994,
|
|
"mean_token_accuracy": 0.19838736355304717,
|
|
"num_tokens": 28383115.0,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"entropy": 5.179604244232178,
|
|
"epoch": 1.1897214217098944,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00048659555397331236,
|
|
"loss": 4.9712,
|
|
"mean_token_accuracy": 0.21606729328632354,
|
|
"num_tokens": 28394904.0,
|
|
"step": 12385
|
|
},
|
|
{
|
|
"entropy": 5.1870029926300045,
|
|
"epoch": 1.1902017291066282,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004865838950369366,
|
|
"loss": 5.0248,
|
|
"mean_token_accuracy": 0.2077416345477104,
|
|
"num_tokens": 28407357.0,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"entropy": 5.206893348693848,
|
|
"epoch": 1.1906820365033621,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00048657223118821116,
|
|
"loss": 4.9821,
|
|
"mean_token_accuracy": 0.21372554153203965,
|
|
"num_tokens": 28418088.0,
|
|
"step": 12395
|
|
},
|
|
{
|
|
"entropy": 5.062207841873169,
|
|
"epoch": 1.191162343900096,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048656056242740665,
|
|
"loss": 4.889,
|
|
"mean_token_accuracy": 0.21973242610692978,
|
|
"num_tokens": 28430022.0,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"entropy": 5.1975541591644285,
|
|
"epoch": 1.19164265129683,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004865488887547942,
|
|
"loss": 4.9833,
|
|
"mean_token_accuracy": 0.214840891957283,
|
|
"num_tokens": 28440530.0,
|
|
"step": 12405
|
|
},
|
|
{
|
|
"entropy": 5.258637619018555,
|
|
"epoch": 1.192122958693564,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004865372101706446,
|
|
"loss": 5.0893,
|
|
"mean_token_accuracy": 0.20531991273164749,
|
|
"num_tokens": 28452707.0,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"entropy": 5.119096803665161,
|
|
"epoch": 1.1926032660902979,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004865255266752292,
|
|
"loss": 4.8979,
|
|
"mean_token_accuracy": 0.22423699051141738,
|
|
"num_tokens": 28465131.0,
|
|
"step": 12415
|
|
},
|
|
{
|
|
"entropy": 5.142260789871216,
|
|
"epoch": 1.1930835734870318,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004865138382688191,
|
|
"loss": 5.0173,
|
|
"mean_token_accuracy": 0.2174185335636139,
|
|
"num_tokens": 28476136.0,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"entropy": 5.216041707992554,
|
|
"epoch": 1.1935638808837656,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004865021449516859,
|
|
"loss": 4.9275,
|
|
"mean_token_accuracy": 0.2106182113289833,
|
|
"num_tokens": 28488374.0,
|
|
"step": 12425
|
|
},
|
|
{
|
|
"entropy": 5.248443746566773,
|
|
"epoch": 1.1940441882804995,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004864904467241008,
|
|
"loss": 4.9863,
|
|
"mean_token_accuracy": 0.20798565447330475,
|
|
"num_tokens": 28499585.0,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"entropy": 5.132749748229981,
|
|
"epoch": 1.1945244956772334,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048647874358633556,
|
|
"loss": 4.9399,
|
|
"mean_token_accuracy": 0.2145738869905472,
|
|
"num_tokens": 28510707.0,
|
|
"step": 12435
|
|
},
|
|
{
|
|
"entropy": 5.247034311294556,
|
|
"epoch": 1.1950048030739673,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048646703553866183,
|
|
"loss": 5.0617,
|
|
"mean_token_accuracy": 0.20649414509534836,
|
|
"num_tokens": 28522398.0,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"entropy": 5.194453859329224,
|
|
"epoch": 1.1954851104707012,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004864553225813515,
|
|
"loss": 4.9577,
|
|
"mean_token_accuracy": 0.21434492319822313,
|
|
"num_tokens": 28532949.0,
|
|
"step": 12445
|
|
},
|
|
{
|
|
"entropy": 5.127518177032471,
|
|
"epoch": 1.195965417867435,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004864436047146765,
|
|
"loss": 4.9267,
|
|
"mean_token_accuracy": 0.22026402056217192,
|
|
"num_tokens": 28544292.0,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"entropy": 5.219339227676391,
|
|
"epoch": 1.1964457252641691,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048643188193890874,
|
|
"loss": 5.0951,
|
|
"mean_token_accuracy": 0.20473963618278504,
|
|
"num_tokens": 28556430.0,
|
|
"step": 12455
|
|
},
|
|
{
|
|
"entropy": 5.302267837524414,
|
|
"epoch": 1.196926032660903,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004864201542543206,
|
|
"loss": 5.0565,
|
|
"mean_token_accuracy": 0.20778754204511643,
|
|
"num_tokens": 28568475.0,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"entropy": 5.265069389343262,
|
|
"epoch": 1.197406340057637,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004864084216611843,
|
|
"loss": 5.0026,
|
|
"mean_token_accuracy": 0.20922221690416337,
|
|
"num_tokens": 28579653.0,
|
|
"step": 12465
|
|
},
|
|
{
|
|
"entropy": 5.153708600997925,
|
|
"epoch": 1.1978866474543708,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048639668415977207,
|
|
"loss": 4.9371,
|
|
"mean_token_accuracy": 0.21175346672534942,
|
|
"num_tokens": 28590108.0,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"entropy": 5.165795612335205,
|
|
"epoch": 1.1983669548511047,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00048638494175035665,
|
|
"loss": 4.9589,
|
|
"mean_token_accuracy": 0.2166977271437645,
|
|
"num_tokens": 28602008.0,
|
|
"step": 12475
|
|
},
|
|
{
|
|
"entropy": 5.141294145584107,
|
|
"epoch": 1.1988472622478386,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004863731944332105,
|
|
"loss": 4.8937,
|
|
"mean_token_accuracy": 0.22100506275892257,
|
|
"num_tokens": 28613286.0,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"entropy": 5.158958911895752,
|
|
"epoch": 1.1993275696445724,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004863614422086065,
|
|
"loss": 4.905,
|
|
"mean_token_accuracy": 0.21994735598564147,
|
|
"num_tokens": 28625497.0,
|
|
"step": 12485
|
|
},
|
|
{
|
|
"entropy": 5.1904459476470945,
|
|
"epoch": 1.1998078770413065,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004863496850768174,
|
|
"loss": 5.0029,
|
|
"mean_token_accuracy": 0.21328900158405303,
|
|
"num_tokens": 28638046.0,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"entropy": 5.097305774688721,
|
|
"epoch": 1.2002881844380404,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004863379230381162,
|
|
"loss": 4.9046,
|
|
"mean_token_accuracy": 0.2162349119782448,
|
|
"num_tokens": 28647923.0,
|
|
"step": 12495
|
|
},
|
|
{
|
|
"entropy": 5.14150071144104,
|
|
"epoch": 1.2007684918347743,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000486326156092776,
|
|
"loss": 4.9583,
|
|
"mean_token_accuracy": 0.21833768635988235,
|
|
"num_tokens": 28660023.0,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"entropy": 5.187930011749268,
|
|
"epoch": 1.2012487992315082,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048631438424106985,
|
|
"loss": 4.9908,
|
|
"mean_token_accuracy": 0.21353389471769332,
|
|
"num_tokens": 28671568.0,
|
|
"step": 12505
|
|
},
|
|
{
|
|
"entropy": 5.122547054290772,
|
|
"epoch": 1.201729106628242,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048630260748327124,
|
|
"loss": 4.9636,
|
|
"mean_token_accuracy": 0.21925023049116135,
|
|
"num_tokens": 28682897.0,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"entropy": 5.162188148498535,
|
|
"epoch": 1.202209414024976,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00048629082581965355,
|
|
"loss": 5.0342,
|
|
"mean_token_accuracy": 0.21101551204919816,
|
|
"num_tokens": 28694067.0,
|
|
"step": 12515
|
|
},
|
|
{
|
|
"entropy": 5.242506408691407,
|
|
"epoch": 1.2026897214217098,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00048627903925049033,
|
|
"loss": 4.9589,
|
|
"mean_token_accuracy": 0.21290026605129242,
|
|
"num_tokens": 28705738.0,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"entropy": 5.195635080337524,
|
|
"epoch": 1.2031700288184437,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048626724777605507,
|
|
"loss": 4.9092,
|
|
"mean_token_accuracy": 0.21891177147626878,
|
|
"num_tokens": 28717419.0,
|
|
"step": 12525
|
|
},
|
|
{
|
|
"entropy": 5.187272739410401,
|
|
"epoch": 1.2036503362151778,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004862554513966217,
|
|
"loss": 5.1105,
|
|
"mean_token_accuracy": 0.20062761902809143,
|
|
"num_tokens": 28728587.0,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"entropy": 5.23038272857666,
|
|
"epoch": 1.2041306436119117,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048624365011246405,
|
|
"loss": 5.0802,
|
|
"mean_token_accuracy": 0.20460240244865419,
|
|
"num_tokens": 28740818.0,
|
|
"step": 12535
|
|
},
|
|
{
|
|
"entropy": 5.1875214099884035,
|
|
"epoch": 1.2046109510086456,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004862318439238561,
|
|
"loss": 4.927,
|
|
"mean_token_accuracy": 0.224493670463562,
|
|
"num_tokens": 28751936.0,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"entropy": 5.185813951492309,
|
|
"epoch": 1.2050912584053795,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000486220032831072,
|
|
"loss": 4.9948,
|
|
"mean_token_accuracy": 0.2092664435505867,
|
|
"num_tokens": 28763271.0,
|
|
"step": 12545
|
|
},
|
|
{
|
|
"entropy": 5.225413799285889,
|
|
"epoch": 1.2055715658021133,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004862082168343859,
|
|
"loss": 5.0384,
|
|
"mean_token_accuracy": 0.21466702818870545,
|
|
"num_tokens": 28774282.0,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"entropy": 5.275173616409302,
|
|
"epoch": 1.2060518731988472,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004861963959340722,
|
|
"loss": 5.0968,
|
|
"mean_token_accuracy": 0.20915820598602294,
|
|
"num_tokens": 28785826.0,
|
|
"step": 12555
|
|
},
|
|
{
|
|
"entropy": 5.120945119857788,
|
|
"epoch": 1.206532180595581,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004861845701304053,
|
|
"loss": 4.9057,
|
|
"mean_token_accuracy": 0.21730198264122008,
|
|
"num_tokens": 28797669.0,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"entropy": 5.19795560836792,
|
|
"epoch": 1.2070124879923152,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048617273942365977,
|
|
"loss": 5.0742,
|
|
"mean_token_accuracy": 0.20622512996196746,
|
|
"num_tokens": 28808438.0,
|
|
"step": 12565
|
|
},
|
|
{
|
|
"entropy": 5.136143445968628,
|
|
"epoch": 1.207492795389049,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004861609038141103,
|
|
"loss": 4.9156,
|
|
"mean_token_accuracy": 0.2179456263780594,
|
|
"num_tokens": 28819707.0,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"entropy": 5.243184280395508,
|
|
"epoch": 1.207973102785783,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048614906330203165,
|
|
"loss": 5.02,
|
|
"mean_token_accuracy": 0.2118644818663597,
|
|
"num_tokens": 28831829.0,
|
|
"step": 12575
|
|
},
|
|
{
|
|
"entropy": 5.178156328201294,
|
|
"epoch": 1.2084534101825168,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004861372178876987,
|
|
"loss": 4.9879,
|
|
"mean_token_accuracy": 0.2112196832895279,
|
|
"num_tokens": 28843429.0,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"entropy": 5.162006902694702,
|
|
"epoch": 1.2089337175792507,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048612536757138653,
|
|
"loss": 4.9146,
|
|
"mean_token_accuracy": 0.21943530589342117,
|
|
"num_tokens": 28856239.0,
|
|
"step": 12585
|
|
},
|
|
{
|
|
"entropy": 5.224708223342896,
|
|
"epoch": 1.2094140249759846,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004861135123533702,
|
|
"loss": 5.0127,
|
|
"mean_token_accuracy": 0.21318120807409285,
|
|
"num_tokens": 28868599.0,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"entropy": 5.2225141525268555,
|
|
"epoch": 1.2098943323727185,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048610165223392503,
|
|
"loss": 5.035,
|
|
"mean_token_accuracy": 0.20697897523641587,
|
|
"num_tokens": 28880234.0,
|
|
"step": 12595
|
|
},
|
|
{
|
|
"entropy": 5.2457115173339846,
|
|
"epoch": 1.2103746397694524,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004860897872133263,
|
|
"loss": 5.0615,
|
|
"mean_token_accuracy": 0.2093821495771408,
|
|
"num_tokens": 28892198.0,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"entropy": 5.230489587783813,
|
|
"epoch": 1.2108549471661862,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004860779172918496,
|
|
"loss": 5.0893,
|
|
"mean_token_accuracy": 0.20883096009492874,
|
|
"num_tokens": 28904153.0,
|
|
"step": 12605
|
|
},
|
|
{
|
|
"entropy": 5.278970098495483,
|
|
"epoch": 1.2113352545629203,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004860660424697704,
|
|
"loss": 5.0579,
|
|
"mean_token_accuracy": 0.21126580536365508,
|
|
"num_tokens": 28915284.0,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"entropy": 5.21758451461792,
|
|
"epoch": 1.2118155619596542,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048605416274736434,
|
|
"loss": 5.0239,
|
|
"mean_token_accuracy": 0.20837975144386292,
|
|
"num_tokens": 28928168.0,
|
|
"step": 12615
|
|
},
|
|
{
|
|
"entropy": 5.152167892456054,
|
|
"epoch": 1.2122958693563881,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00048604227812490744,
|
|
"loss": 4.9032,
|
|
"mean_token_accuracy": 0.21995031386613845,
|
|
"num_tokens": 28938548.0,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"entropy": 5.1527406692504885,
|
|
"epoch": 1.212776176753122,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048603038860267546,
|
|
"loss": 4.9921,
|
|
"mean_token_accuracy": 0.21356878131628038,
|
|
"num_tokens": 28949252.0,
|
|
"step": 12625
|
|
},
|
|
{
|
|
"entropy": 5.230664396286011,
|
|
"epoch": 1.2132564841498559,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004860184941809445,
|
|
"loss": 5.0107,
|
|
"mean_token_accuracy": 0.21580713540315627,
|
|
"num_tokens": 28960758.0,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"entropy": 5.214424085617066,
|
|
"epoch": 1.2137367915465898,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048600659485999073,
|
|
"loss": 4.9823,
|
|
"mean_token_accuracy": 0.21401938945055007,
|
|
"num_tokens": 28972604.0,
|
|
"step": 12635
|
|
},
|
|
{
|
|
"entropy": 5.171445035934449,
|
|
"epoch": 1.2142170989433236,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048599469064009027,
|
|
"loss": 4.9781,
|
|
"mean_token_accuracy": 0.2194841518998146,
|
|
"num_tokens": 28983617.0,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"entropy": 5.171384048461914,
|
|
"epoch": 1.2146974063400577,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048598278152151974,
|
|
"loss": 5.0017,
|
|
"mean_token_accuracy": 0.21235128194093705,
|
|
"num_tokens": 28994277.0,
|
|
"step": 12645
|
|
},
|
|
{
|
|
"entropy": 5.2698729038238525,
|
|
"epoch": 1.2151777137367916,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004859708675045555,
|
|
"loss": 5.0415,
|
|
"mean_token_accuracy": 0.21211624890565872,
|
|
"num_tokens": 29004844.0,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"entropy": 5.067803430557251,
|
|
"epoch": 1.2156580211335255,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004859589485894741,
|
|
"loss": 4.9214,
|
|
"mean_token_accuracy": 0.21981519609689712,
|
|
"num_tokens": 29015978.0,
|
|
"step": 12655
|
|
},
|
|
{
|
|
"entropy": 5.245090198516846,
|
|
"epoch": 1.2161383285302594,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004859470247765524,
|
|
"loss": 5.0228,
|
|
"mean_token_accuracy": 0.21242079883813858,
|
|
"num_tokens": 29026966.0,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"entropy": 5.231347465515137,
|
|
"epoch": 1.2166186359269933,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004859350960660671,
|
|
"loss": 4.9512,
|
|
"mean_token_accuracy": 0.2151286020874977,
|
|
"num_tokens": 29037943.0,
|
|
"step": 12665
|
|
},
|
|
{
|
|
"entropy": 5.210123300552368,
|
|
"epoch": 1.2170989433237271,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004859231624582953,
|
|
"loss": 5.0191,
|
|
"mean_token_accuracy": 0.2085331290960312,
|
|
"num_tokens": 29049939.0,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"entropy": 5.1238236904144285,
|
|
"epoch": 1.217579250720461,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048591122395351394,
|
|
"loss": 5.0294,
|
|
"mean_token_accuracy": 0.21389884501695633,
|
|
"num_tokens": 29062442.0,
|
|
"step": 12675
|
|
},
|
|
{
|
|
"entropy": 5.19063811302185,
|
|
"epoch": 1.218059558117195,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004858992805520003,
|
|
"loss": 5.0303,
|
|
"mean_token_accuracy": 0.21078938841819764,
|
|
"num_tokens": 29074240.0,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"entropy": 5.28305025100708,
|
|
"epoch": 1.218539865513929,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048588733225403153,
|
|
"loss": 5.0248,
|
|
"mean_token_accuracy": 0.20675573348999024,
|
|
"num_tokens": 29085550.0,
|
|
"step": 12685
|
|
},
|
|
{
|
|
"entropy": 5.132738399505615,
|
|
"epoch": 1.219020172910663,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004858753790598851,
|
|
"loss": 4.9732,
|
|
"mean_token_accuracy": 0.21328083127737046,
|
|
"num_tokens": 29097092.0,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"entropy": 5.131064367294312,
|
|
"epoch": 1.2195004803073968,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004858634209698386,
|
|
"loss": 5.0119,
|
|
"mean_token_accuracy": 0.21203078627586364,
|
|
"num_tokens": 29108119.0,
|
|
"step": 12695
|
|
},
|
|
{
|
|
"entropy": 5.2007276058197025,
|
|
"epoch": 1.2199807877041307,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048585145798416956,
|
|
"loss": 4.9885,
|
|
"mean_token_accuracy": 0.20587167888879776,
|
|
"num_tokens": 29120605.0,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"entropy": 5.203617095947266,
|
|
"epoch": 1.2204610951008645,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004858394901031558,
|
|
"loss": 4.9715,
|
|
"mean_token_accuracy": 0.21140657663345336,
|
|
"num_tokens": 29131582.0,
|
|
"step": 12705
|
|
},
|
|
{
|
|
"entropy": 5.226724433898926,
|
|
"epoch": 1.2209414024975984,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004858275173270751,
|
|
"loss": 4.9982,
|
|
"mean_token_accuracy": 0.2063015416264534,
|
|
"num_tokens": 29143436.0,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"entropy": 5.195918226242066,
|
|
"epoch": 1.2214217098943323,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00048581553965620553,
|
|
"loss": 4.9219,
|
|
"mean_token_accuracy": 0.2154676854610443,
|
|
"num_tokens": 29154445.0,
|
|
"step": 12715
|
|
},
|
|
{
|
|
"entropy": 5.198876190185547,
|
|
"epoch": 1.2219020172910664,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048580355709082506,
|
|
"loss": 5.0403,
|
|
"mean_token_accuracy": 0.21526143848896026,
|
|
"num_tokens": 29164599.0,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"entropy": 5.141847133636475,
|
|
"epoch": 1.2223823246878003,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000485791569631212,
|
|
"loss": 4.9984,
|
|
"mean_token_accuracy": 0.20752517729997635,
|
|
"num_tokens": 29176275.0,
|
|
"step": 12725
|
|
},
|
|
{
|
|
"entropy": 5.235421657562256,
|
|
"epoch": 1.2228626320845342,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004857795772776446,
|
|
"loss": 5.0038,
|
|
"mean_token_accuracy": 0.20879749357700347,
|
|
"num_tokens": 29189102.0,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"entropy": 5.225815057754517,
|
|
"epoch": 1.223342939481268,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048576758003040127,
|
|
"loss": 5.0339,
|
|
"mean_token_accuracy": 0.2110910639166832,
|
|
"num_tokens": 29200953.0,
|
|
"step": 12735
|
|
},
|
|
{
|
|
"entropy": 5.213280534744262,
|
|
"epoch": 1.223823246878002,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00048575557788976066,
|
|
"loss": 5.0438,
|
|
"mean_token_accuracy": 0.20327647179365158,
|
|
"num_tokens": 29212942.0,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"entropy": 5.164249658584595,
|
|
"epoch": 1.2243035542747358,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004857435708560013,
|
|
"loss": 4.9348,
|
|
"mean_token_accuracy": 0.21420682966709137,
|
|
"num_tokens": 29224949.0,
|
|
"step": 12745
|
|
},
|
|
{
|
|
"entropy": 5.280761194229126,
|
|
"epoch": 1.2247838616714697,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048573155892940204,
|
|
"loss": 5.0932,
|
|
"mean_token_accuracy": 0.20139861702919007,
|
|
"num_tokens": 29236044.0,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"entropy": 5.2009320735931395,
|
|
"epoch": 1.2252641690682036,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00048571954211024164,
|
|
"loss": 4.9868,
|
|
"mean_token_accuracy": 0.21266197860240937,
|
|
"num_tokens": 29248084.0,
|
|
"step": 12755
|
|
},
|
|
{
|
|
"entropy": 5.135626983642578,
|
|
"epoch": 1.2257444764649374,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048570752039879924,
|
|
"loss": 4.873,
|
|
"mean_token_accuracy": 0.22127241939306258,
|
|
"num_tokens": 29258710.0,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"entropy": 5.214362525939942,
|
|
"epoch": 1.2262247838616716,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004856954937953539,
|
|
"loss": 5.0884,
|
|
"mean_token_accuracy": 0.20115942060947417,
|
|
"num_tokens": 29270173.0,
|
|
"step": 12765
|
|
},
|
|
{
|
|
"entropy": 5.278593635559082,
|
|
"epoch": 1.2267050912584054,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004856834623001848,
|
|
"loss": 5.0685,
|
|
"mean_token_accuracy": 0.20889558047056198,
|
|
"num_tokens": 29280407.0,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"entropy": 5.135749340057373,
|
|
"epoch": 1.2271853986551393,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004856714259135713,
|
|
"loss": 4.9413,
|
|
"mean_token_accuracy": 0.2196897506713867,
|
|
"num_tokens": 29292287.0,
|
|
"step": 12775
|
|
},
|
|
{
|
|
"entropy": 5.2047443866729735,
|
|
"epoch": 1.2276657060518732,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004856593846357929,
|
|
"loss": 5.0592,
|
|
"mean_token_accuracy": 0.20803760290145873,
|
|
"num_tokens": 29303099.0,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"entropy": 5.2750386714935305,
|
|
"epoch": 1.228146013448607,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004856473384671291,
|
|
"loss": 5.1128,
|
|
"mean_token_accuracy": 0.2065555065870285,
|
|
"num_tokens": 29314445.0,
|
|
"step": 12785
|
|
},
|
|
{
|
|
"entropy": 5.146969270706177,
|
|
"epoch": 1.228626320845341,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00048563528740785955,
|
|
"loss": 4.9752,
|
|
"mean_token_accuracy": 0.20982863157987594,
|
|
"num_tokens": 29325309.0,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"entropy": 5.152886533737183,
|
|
"epoch": 1.2291066282420748,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00048562323145826414,
|
|
"loss": 4.9259,
|
|
"mean_token_accuracy": 0.22105071544647217,
|
|
"num_tokens": 29338582.0,
|
|
"step": 12795
|
|
},
|
|
{
|
|
"entropy": 5.207387828826905,
|
|
"epoch": 1.229586935638809,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004856111706186227,
|
|
"loss": 4.9922,
|
|
"mean_token_accuracy": 0.21350787281990052,
|
|
"num_tokens": 29349875.0,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"entropy": 5.283040285110474,
|
|
"epoch": 1.2300672430355428,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00048559910488921534,
|
|
"loss": 5.1049,
|
|
"mean_token_accuracy": 0.2001900017261505,
|
|
"num_tokens": 29361800.0,
|
|
"step": 12805
|
|
},
|
|
{
|
|
"entropy": 5.165265846252441,
|
|
"epoch": 1.2305475504322767,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000485587034270322,
|
|
"loss": 4.9814,
|
|
"mean_token_accuracy": 0.21499158591032028,
|
|
"num_tokens": 29372795.0,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"entropy": 5.20758466720581,
|
|
"epoch": 1.2310278578290106,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000485574958762223,
|
|
"loss": 5.0659,
|
|
"mean_token_accuracy": 0.21106487214565278,
|
|
"num_tokens": 29385391.0,
|
|
"step": 12815
|
|
},
|
|
{
|
|
"entropy": 5.130280923843384,
|
|
"epoch": 1.2315081652257445,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048556287836519886,
|
|
"loss": 4.9144,
|
|
"mean_token_accuracy": 0.22030035853385926,
|
|
"num_tokens": 29397113.0,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"entropy": 5.277672386169433,
|
|
"epoch": 1.2319884726224783,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004855507930795299,
|
|
"loss": 5.097,
|
|
"mean_token_accuracy": 0.21232483088970183,
|
|
"num_tokens": 29407552.0,
|
|
"step": 12825
|
|
},
|
|
{
|
|
"entropy": 5.186304426193237,
|
|
"epoch": 1.2324687800192122,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048553870290549665,
|
|
"loss": 5.0012,
|
|
"mean_token_accuracy": 0.21472340673208237,
|
|
"num_tokens": 29418500.0,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"entropy": 5.229617691040039,
|
|
"epoch": 1.232949087415946,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048552660784338,
|
|
"loss": 5.0069,
|
|
"mean_token_accuracy": 0.21694095134735109,
|
|
"num_tokens": 29430335.0,
|
|
"step": 12835
|
|
},
|
|
{
|
|
"entropy": 5.128983354568481,
|
|
"epoch": 1.23342939481268,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004855145078934606,
|
|
"loss": 5.0013,
|
|
"mean_token_accuracy": 0.20726215988397598,
|
|
"num_tokens": 29441435.0,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"entropy": 5.22078046798706,
|
|
"epoch": 1.233909702209414,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004855024030560195,
|
|
"loss": 4.9875,
|
|
"mean_token_accuracy": 0.21403325647115706,
|
|
"num_tokens": 29453140.0,
|
|
"step": 12845
|
|
},
|
|
{
|
|
"entropy": 5.107886886596679,
|
|
"epoch": 1.234390009606148,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004854902933313376,
|
|
"loss": 4.9168,
|
|
"mean_token_accuracy": 0.2206453412771225,
|
|
"num_tokens": 29464572.0,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"entropy": 5.273893547058106,
|
|
"epoch": 1.2348703170028819,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048547817871969607,
|
|
"loss": 5.1412,
|
|
"mean_token_accuracy": 0.19843536466360093,
|
|
"num_tokens": 29477069.0,
|
|
"step": 12855
|
|
},
|
|
{
|
|
"entropy": 5.258860635757446,
|
|
"epoch": 1.2353506243996157,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048546605922137633,
|
|
"loss": 5.0259,
|
|
"mean_token_accuracy": 0.21055852621793747,
|
|
"num_tokens": 29486860.0,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"entropy": 5.241803359985352,
|
|
"epoch": 1.2358309317963496,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004854539348366596,
|
|
"loss": 5.0934,
|
|
"mean_token_accuracy": 0.20382969826459885,
|
|
"num_tokens": 29499129.0,
|
|
"step": 12865
|
|
},
|
|
{
|
|
"entropy": 5.1568663120269775,
|
|
"epoch": 1.2363112391930835,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004854418055658274,
|
|
"loss": 4.9683,
|
|
"mean_token_accuracy": 0.21473043411970139,
|
|
"num_tokens": 29510764.0,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"entropy": 5.179628610610962,
|
|
"epoch": 1.2367915465898176,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048542967140916134,
|
|
"loss": 5.0443,
|
|
"mean_token_accuracy": 0.2080310419201851,
|
|
"num_tokens": 29522882.0,
|
|
"step": 12875
|
|
},
|
|
{
|
|
"entropy": 5.231398963928223,
|
|
"epoch": 1.2372718539865515,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004854175323669432,
|
|
"loss": 4.9823,
|
|
"mean_token_accuracy": 0.20900345593690872,
|
|
"num_tokens": 29533348.0,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"entropy": 5.1414636135101315,
|
|
"epoch": 1.2377521613832854,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004854053884394547,
|
|
"loss": 4.9611,
|
|
"mean_token_accuracy": 0.2162790149450302,
|
|
"num_tokens": 29545649.0,
|
|
"step": 12885
|
|
},
|
|
{
|
|
"entropy": 5.209917974472046,
|
|
"epoch": 1.2382324687800192,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048539323962697796,
|
|
"loss": 5.0101,
|
|
"mean_token_accuracy": 0.20617685168981553,
|
|
"num_tokens": 29558252.0,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"entropy": 5.315207386016846,
|
|
"epoch": 1.2387127761767531,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004853810859297949,
|
|
"loss": 5.1166,
|
|
"mean_token_accuracy": 0.21191854476928712,
|
|
"num_tokens": 29569495.0,
|
|
"step": 12895
|
|
},
|
|
{
|
|
"entropy": 5.209201908111572,
|
|
"epoch": 1.239193083573487,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048536892734818773,
|
|
"loss": 4.9999,
|
|
"mean_token_accuracy": 0.211149762570858,
|
|
"num_tokens": 29582167.0,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"entropy": 5.23478512763977,
|
|
"epoch": 1.239673390970221,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004853567638824387,
|
|
"loss": 5.0181,
|
|
"mean_token_accuracy": 0.2111159771680832,
|
|
"num_tokens": 29593964.0,
|
|
"step": 12905
|
|
},
|
|
{
|
|
"entropy": 5.227269411087036,
|
|
"epoch": 1.2401536983669548,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048534459553283026,
|
|
"loss": 4.9755,
|
|
"mean_token_accuracy": 0.21934866458177565,
|
|
"num_tokens": 29604844.0,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"entropy": 5.219102811813355,
|
|
"epoch": 1.2406340057636887,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004853324222996449,
|
|
"loss": 5.0165,
|
|
"mean_token_accuracy": 0.21728123128414153,
|
|
"num_tokens": 29615219.0,
|
|
"step": 12915
|
|
},
|
|
{
|
|
"entropy": 5.303751039505005,
|
|
"epoch": 1.2411143131604228,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048532024418316525,
|
|
"loss": 5.1028,
|
|
"mean_token_accuracy": 0.2071371465921402,
|
|
"num_tokens": 29626472.0,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"entropy": 5.16833004951477,
|
|
"epoch": 1.2415946205571566,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004853080611836741,
|
|
"loss": 4.9767,
|
|
"mean_token_accuracy": 0.2196236953139305,
|
|
"num_tokens": 29637966.0,
|
|
"step": 12925
|
|
},
|
|
{
|
|
"entropy": 5.132480478286743,
|
|
"epoch": 1.2420749279538905,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048529587330145427,
|
|
"loss": 4.906,
|
|
"mean_token_accuracy": 0.2214494377374649,
|
|
"num_tokens": 29648730.0,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"entropy": 5.132258462905884,
|
|
"epoch": 1.2425552353506244,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048528368053678863,
|
|
"loss": 4.929,
|
|
"mean_token_accuracy": 0.2147599697113037,
|
|
"num_tokens": 29660576.0,
|
|
"step": 12935
|
|
},
|
|
{
|
|
"entropy": 5.175818014144897,
|
|
"epoch": 1.2430355427473583,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004852714828899604,
|
|
"loss": 4.9604,
|
|
"mean_token_accuracy": 0.2147279053926468,
|
|
"num_tokens": 29672906.0,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"entropy": 5.156858634948731,
|
|
"epoch": 1.2435158501440922,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00048525928036125264,
|
|
"loss": 4.9559,
|
|
"mean_token_accuracy": 0.21674090325832368,
|
|
"num_tokens": 29685360.0,
|
|
"step": 12945
|
|
},
|
|
{
|
|
"entropy": 5.184188318252564,
|
|
"epoch": 1.243996157540826,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048524707295094884,
|
|
"loss": 4.9588,
|
|
"mean_token_accuracy": 0.2069990187883377,
|
|
"num_tokens": 29697257.0,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"entropy": 5.153713750839233,
|
|
"epoch": 1.2444764649375601,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004852348606593322,
|
|
"loss": 4.9132,
|
|
"mean_token_accuracy": 0.220682792365551,
|
|
"num_tokens": 29707877.0,
|
|
"step": 12955
|
|
},
|
|
{
|
|
"entropy": 5.208719635009766,
|
|
"epoch": 1.244956772334294,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048522264348668646,
|
|
"loss": 4.9975,
|
|
"mean_token_accuracy": 0.21275103688240052,
|
|
"num_tokens": 29719358.0,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"entropy": 5.10908875465393,
|
|
"epoch": 1.245437079731028,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004852104214332951,
|
|
"loss": 4.8733,
|
|
"mean_token_accuracy": 0.22901579290628432,
|
|
"num_tokens": 29730383.0,
|
|
"step": 12965
|
|
},
|
|
{
|
|
"entropy": 5.184739780426026,
|
|
"epoch": 1.2459173871277618,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048519819449944205,
|
|
"loss": 4.9995,
|
|
"mean_token_accuracy": 0.21587478816509248,
|
|
"num_tokens": 29741142.0,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"entropy": 5.156636571884155,
|
|
"epoch": 1.2463976945244957,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000485185962685411,
|
|
"loss": 4.9697,
|
|
"mean_token_accuracy": 0.21590882092714309,
|
|
"num_tokens": 29754618.0,
|
|
"step": 12975
|
|
},
|
|
{
|
|
"entropy": 5.214703130722046,
|
|
"epoch": 1.2468780019212296,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000485173725991486,
|
|
"loss": 4.9845,
|
|
"mean_token_accuracy": 0.21426209211349487,
|
|
"num_tokens": 29767115.0,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"entropy": 5.226226806640625,
|
|
"epoch": 1.2473583093179634,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048516148441795124,
|
|
"loss": 5.055,
|
|
"mean_token_accuracy": 0.21063894778490067,
|
|
"num_tokens": 29778165.0,
|
|
"step": 12985
|
|
},
|
|
{
|
|
"entropy": 5.291137981414795,
|
|
"epoch": 1.2478386167146973,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004851492379650908,
|
|
"loss": 5.1231,
|
|
"mean_token_accuracy": 0.2022738501429558,
|
|
"num_tokens": 29790528.0,
|
|
"step": 12990
|
|
},
|
|
{
|
|
"entropy": 5.202538394927979,
|
|
"epoch": 1.2483189241114312,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004851369866331891,
|
|
"loss": 4.9323,
|
|
"mean_token_accuracy": 0.20517953634262084,
|
|
"num_tokens": 29801709.0,
|
|
"step": 12995
|
|
},
|
|
{
|
|
"entropy": 5.175222253799438,
|
|
"epoch": 1.2487992315081653,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004851247304225306,
|
|
"loss": 5.0036,
|
|
"mean_token_accuracy": 0.2128250151872635,
|
|
"num_tokens": 29812963.0,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"entropy": 5.2060657978057865,
|
|
"epoch": 1.2492795389048992,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004851124693333997,
|
|
"loss": 4.9772,
|
|
"mean_token_accuracy": 0.2129211023449898,
|
|
"num_tokens": 29823711.0,
|
|
"step": 13005
|
|
},
|
|
{
|
|
"entropy": 5.165619707107544,
|
|
"epoch": 1.249759846301633,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004851002033660812,
|
|
"loss": 4.9446,
|
|
"mean_token_accuracy": 0.21848293840885163,
|
|
"num_tokens": 29834038.0,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"entropy": 5.213901424407959,
|
|
"epoch": 1.250240153698367,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048508793252085994,
|
|
"loss": 4.9833,
|
|
"mean_token_accuracy": 0.21572160869836807,
|
|
"num_tokens": 29844759.0,
|
|
"step": 13015
|
|
},
|
|
{
|
|
"entropy": 5.092281866073608,
|
|
"epoch": 1.2507204610951008,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004850756567980206,
|
|
"loss": 4.8518,
|
|
"mean_token_accuracy": 0.21635698527097702,
|
|
"num_tokens": 29855643.0,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"entropy": 5.237672472000122,
|
|
"epoch": 1.2512007684918347,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00048506337619784836,
|
|
"loss": 5.0672,
|
|
"mean_token_accuracy": 0.20816876441240312,
|
|
"num_tokens": 29866917.0,
|
|
"step": 13025
|
|
},
|
|
{
|
|
"entropy": 5.285785484313965,
|
|
"epoch": 1.2516810758885688,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004850510907206283,
|
|
"loss": 5.1273,
|
|
"mean_token_accuracy": 0.20473649799823762,
|
|
"num_tokens": 29878937.0,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"entropy": 5.139752054214478,
|
|
"epoch": 1.2521613832853027,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00048503880036664555,
|
|
"loss": 4.9387,
|
|
"mean_token_accuracy": 0.2179243117570877,
|
|
"num_tokens": 29889544.0,
|
|
"step": 13035
|
|
},
|
|
{
|
|
"entropy": 5.133181190490722,
|
|
"epoch": 1.2526416906820366,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004850265051361857,
|
|
"loss": 4.9495,
|
|
"mean_token_accuracy": 0.21097120344638826,
|
|
"num_tokens": 29901919.0,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"entropy": 5.163506126403808,
|
|
"epoch": 1.2531219980787704,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004850142050295339,
|
|
"loss": 4.9949,
|
|
"mean_token_accuracy": 0.21175018399953843,
|
|
"num_tokens": 29913870.0,
|
|
"step": 13045
|
|
},
|
|
{
|
|
"entropy": 5.183034372329712,
|
|
"epoch": 1.2536023054755043,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00048500190004697595,
|
|
"loss": 4.9554,
|
|
"mean_token_accuracy": 0.21792073249816896,
|
|
"num_tokens": 29925180.0,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"entropy": 5.097564458847046,
|
|
"epoch": 1.2540826128722382,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004849895901887974,
|
|
"loss": 4.8784,
|
|
"mean_token_accuracy": 0.22044360488653184,
|
|
"num_tokens": 29936433.0,
|
|
"step": 13055
|
|
},
|
|
{
|
|
"entropy": 5.19610743522644,
|
|
"epoch": 1.254562920268972,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004849772754552842,
|
|
"loss": 5.074,
|
|
"mean_token_accuracy": 0.20816617459058762,
|
|
"num_tokens": 29948891.0,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"entropy": 5.186704921722412,
|
|
"epoch": 1.255043227665706,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048496495584672214,
|
|
"loss": 4.884,
|
|
"mean_token_accuracy": 0.21899646669626235,
|
|
"num_tokens": 29960113.0,
|
|
"step": 13065
|
|
},
|
|
{
|
|
"entropy": 5.193692255020141,
|
|
"epoch": 1.2555235350624399,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00048495263136339725,
|
|
"loss": 5.0114,
|
|
"mean_token_accuracy": 0.20587489008903503,
|
|
"num_tokens": 29972168.0,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"entropy": 5.169920969009399,
|
|
"epoch": 1.2560038424591737,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004849403020055956,
|
|
"loss": 5.021,
|
|
"mean_token_accuracy": 0.21360062509775163,
|
|
"num_tokens": 29982996.0,
|
|
"step": 13075
|
|
},
|
|
{
|
|
"entropy": 5.186982870101929,
|
|
"epoch": 1.2564841498559078,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00048492796777360373,
|
|
"loss": 5.0222,
|
|
"mean_token_accuracy": 0.20997272729873656,
|
|
"num_tokens": 29994088.0,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"entropy": 5.285563182830811,
|
|
"epoch": 1.2569644572526417,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048491562866770767,
|
|
"loss": 5.0864,
|
|
"mean_token_accuracy": 0.2084256410598755,
|
|
"num_tokens": 30005403.0,
|
|
"step": 13085
|
|
},
|
|
{
|
|
"entropy": 5.225534963607788,
|
|
"epoch": 1.2574447646493756,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048490328468819404,
|
|
"loss": 4.9807,
|
|
"mean_token_accuracy": 0.22016366571187973,
|
|
"num_tokens": 30015961.0,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"entropy": 5.1508348941802975,
|
|
"epoch": 1.2579250720461095,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048489093583534945,
|
|
"loss": 4.9337,
|
|
"mean_token_accuracy": 0.21542756259441376,
|
|
"num_tokens": 30026670.0,
|
|
"step": 13095
|
|
},
|
|
{
|
|
"entropy": 5.179332733154297,
|
|
"epoch": 1.2584053794428434,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004848785821094606,
|
|
"loss": 4.9697,
|
|
"mean_token_accuracy": 0.21637785881757737,
|
|
"num_tokens": 30036711.0,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"entropy": 5.204781723022461,
|
|
"epoch": 1.2588856868395775,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004848662235108142,
|
|
"loss": 5.0481,
|
|
"mean_token_accuracy": 0.20675273686647416,
|
|
"num_tokens": 30047587.0,
|
|
"step": 13105
|
|
},
|
|
{
|
|
"entropy": 5.196116733551025,
|
|
"epoch": 1.2593659942363113,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004848538600396973,
|
|
"loss": 4.982,
|
|
"mean_token_accuracy": 0.21352463364601135,
|
|
"num_tokens": 30059348.0,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"entropy": 5.233518457412719,
|
|
"epoch": 1.2598463016330452,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048484149169639694,
|
|
"loss": 4.9836,
|
|
"mean_token_accuracy": 0.21362563073635102,
|
|
"num_tokens": 30070485.0,
|
|
"step": 13115
|
|
},
|
|
{
|
|
"entropy": 5.15550799369812,
|
|
"epoch": 1.260326609029779,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004848291184812003,
|
|
"loss": 4.9135,
|
|
"mean_token_accuracy": 0.22237775921821595,
|
|
"num_tokens": 30081114.0,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"entropy": 5.186419725418091,
|
|
"epoch": 1.260806916426513,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004848167403943945,
|
|
"loss": 5.0575,
|
|
"mean_token_accuracy": 0.2090092420578003,
|
|
"num_tokens": 30092634.0,
|
|
"step": 13125
|
|
},
|
|
{
|
|
"entropy": 5.190171480178833,
|
|
"epoch": 1.2612872238232469,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00048480435743626703,
|
|
"loss": 4.9924,
|
|
"mean_token_accuracy": 0.21862466484308243,
|
|
"num_tokens": 30104205.0,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"entropy": 5.219733333587646,
|
|
"epoch": 1.2617675312199808,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004847919696071054,
|
|
"loss": 5.014,
|
|
"mean_token_accuracy": 0.21289038211107253,
|
|
"num_tokens": 30116978.0,
|
|
"step": 13135
|
|
},
|
|
{
|
|
"entropy": 5.1809934139251705,
|
|
"epoch": 1.2622478386167146,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00048477957690719716,
|
|
"loss": 4.9081,
|
|
"mean_token_accuracy": 0.21542966216802598,
|
|
"num_tokens": 30128549.0,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"entropy": 5.242063808441162,
|
|
"epoch": 1.2627281460134485,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004847671793368301,
|
|
"loss": 5.0544,
|
|
"mean_token_accuracy": 0.2094632938504219,
|
|
"num_tokens": 30139492.0,
|
|
"step": 13145
|
|
},
|
|
{
|
|
"entropy": 5.182856559753418,
|
|
"epoch": 1.2632084534101824,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000484754776896292,
|
|
"loss": 4.969,
|
|
"mean_token_accuracy": 0.21324526518583298,
|
|
"num_tokens": 30150450.0,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"entropy": 5.2239217281341555,
|
|
"epoch": 1.2636887608069165,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004847423695858708,
|
|
"loss": 5.0259,
|
|
"mean_token_accuracy": 0.21593111753463745,
|
|
"num_tokens": 30162204.0,
|
|
"step": 13155
|
|
},
|
|
{
|
|
"entropy": 5.2082499980926515,
|
|
"epoch": 1.2641690682036504,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048472995740585456,
|
|
"loss": 4.971,
|
|
"mean_token_accuracy": 0.210064397752285,
|
|
"num_tokens": 30172574.0,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"entropy": 5.113088941574096,
|
|
"epoch": 1.2646493756003843,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004847175403565316,
|
|
"loss": 4.9037,
|
|
"mean_token_accuracy": 0.21865027099847795,
|
|
"num_tokens": 30183957.0,
|
|
"step": 13165
|
|
},
|
|
{
|
|
"entropy": 5.228566980361938,
|
|
"epoch": 1.2651296829971181,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048470511843818996,
|
|
"loss": 4.9679,
|
|
"mean_token_accuracy": 0.2197330266237259,
|
|
"num_tokens": 30194207.0,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"entropy": 5.143984985351563,
|
|
"epoch": 1.265609990393852,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004846926916511182,
|
|
"loss": 4.9185,
|
|
"mean_token_accuracy": 0.22421342581510545,
|
|
"num_tokens": 30205180.0,
|
|
"step": 13175
|
|
},
|
|
{
|
|
"entropy": 5.170929908752441,
|
|
"epoch": 1.266090297790586,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004846802599956048,
|
|
"loss": 4.9828,
|
|
"mean_token_accuracy": 0.20965515226125717,
|
|
"num_tokens": 30217734.0,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"entropy": 5.29590859413147,
|
|
"epoch": 1.26657060518732,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048466782347193847,
|
|
"loss": 5.2001,
|
|
"mean_token_accuracy": 0.1999218687415123,
|
|
"num_tokens": 30229835.0,
|
|
"step": 13185
|
|
},
|
|
{
|
|
"entropy": 5.277102136611939,
|
|
"epoch": 1.267050912584054,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048465538208040775,
|
|
"loss": 5.0373,
|
|
"mean_token_accuracy": 0.20946380198001863,
|
|
"num_tokens": 30241932.0,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"entropy": 5.211878299713135,
|
|
"epoch": 1.2675312199807878,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048464293582130166,
|
|
"loss": 5.0248,
|
|
"mean_token_accuracy": 0.21039628088474274,
|
|
"num_tokens": 30253149.0,
|
|
"step": 13195
|
|
},
|
|
{
|
|
"entropy": 5.256510972976685,
|
|
"epoch": 1.2680115273775217,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004846304846949091,
|
|
"loss": 4.9547,
|
|
"mean_token_accuracy": 0.2133228898048401,
|
|
"num_tokens": 30264083.0,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"entropy": 5.156170415878296,
|
|
"epoch": 1.2684918347742555,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048461802870151916,
|
|
"loss": 4.9245,
|
|
"mean_token_accuracy": 0.22115042805671692,
|
|
"num_tokens": 30274832.0,
|
|
"step": 13205
|
|
},
|
|
{
|
|
"entropy": 5.104134511947632,
|
|
"epoch": 1.2689721421709894,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00048460556784142106,
|
|
"loss": 4.9446,
|
|
"mean_token_accuracy": 0.21649524569511414,
|
|
"num_tokens": 30284945.0,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"entropy": 5.27008090019226,
|
|
"epoch": 1.2694524495677233,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048459310211490406,
|
|
"loss": 4.9969,
|
|
"mean_token_accuracy": 0.2195618912577629,
|
|
"num_tokens": 30295133.0,
|
|
"step": 13215
|
|
},
|
|
{
|
|
"entropy": 5.191392421722412,
|
|
"epoch": 1.2699327569644572,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004845806315222576,
|
|
"loss": 5.0609,
|
|
"mean_token_accuracy": 0.20277179926633834,
|
|
"num_tokens": 30305268.0,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"entropy": 5.123500633239746,
|
|
"epoch": 1.270413064361191,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004845681560637711,
|
|
"loss": 4.9226,
|
|
"mean_token_accuracy": 0.21860510110855103,
|
|
"num_tokens": 30317118.0,
|
|
"step": 13225
|
|
},
|
|
{
|
|
"entropy": 5.243611288070679,
|
|
"epoch": 1.270893371757925,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004845556757397344,
|
|
"loss": 5.0812,
|
|
"mean_token_accuracy": 0.20786524415016175,
|
|
"num_tokens": 30328684.0,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"entropy": 5.27822527885437,
|
|
"epoch": 1.271373679154659,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004845431905504372,
|
|
"loss": 5.0788,
|
|
"mean_token_accuracy": 0.2057919830083847,
|
|
"num_tokens": 30341433.0,
|
|
"step": 13235
|
|
},
|
|
{
|
|
"entropy": 5.1837303161621096,
|
|
"epoch": 1.271853986551393,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048453070049616926,
|
|
"loss": 4.958,
|
|
"mean_token_accuracy": 0.2240109384059906,
|
|
"num_tokens": 30353159.0,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"entropy": 5.156756496429443,
|
|
"epoch": 1.2723342939481268,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048451820557722064,
|
|
"loss": 5.0083,
|
|
"mean_token_accuracy": 0.21551052629947662,
|
|
"num_tokens": 30363251.0,
|
|
"step": 13245
|
|
},
|
|
{
|
|
"entropy": 5.246157121658325,
|
|
"epoch": 1.2728146013448607,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004845057057938815,
|
|
"loss": 5.0621,
|
|
"mean_token_accuracy": 0.21401735842227937,
|
|
"num_tokens": 30375850.0,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"entropy": 5.240186405181885,
|
|
"epoch": 1.2732949087415946,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048449320114644185,
|
|
"loss": 5.0836,
|
|
"mean_token_accuracy": 0.20593365728855134,
|
|
"num_tokens": 30386839.0,
|
|
"step": 13255
|
|
},
|
|
{
|
|
"entropy": 5.22416672706604,
|
|
"epoch": 1.2737752161383284,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004844806916351922,
|
|
"loss": 5.052,
|
|
"mean_token_accuracy": 0.2087215930223465,
|
|
"num_tokens": 30398872.0,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"entropy": 5.206205415725708,
|
|
"epoch": 1.2742555235350626,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004844681772604229,
|
|
"loss": 4.962,
|
|
"mean_token_accuracy": 0.22111569941043854,
|
|
"num_tokens": 30409581.0,
|
|
"step": 13265
|
|
},
|
|
{
|
|
"entropy": 5.173876953125,
|
|
"epoch": 1.2747358309317964,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048445565802242454,
|
|
"loss": 4.9982,
|
|
"mean_token_accuracy": 0.214154152572155,
|
|
"num_tokens": 30420209.0,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"entropy": 5.225718021392822,
|
|
"epoch": 1.2752161383285303,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004844431339214878,
|
|
"loss": 5.0296,
|
|
"mean_token_accuracy": 0.21498659551143645,
|
|
"num_tokens": 30432093.0,
|
|
"step": 13275
|
|
},
|
|
{
|
|
"entropy": 5.192876482009888,
|
|
"epoch": 1.2756964457252642,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004844306049579034,
|
|
"loss": 4.9477,
|
|
"mean_token_accuracy": 0.21263082027435304,
|
|
"num_tokens": 30442796.0,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"entropy": 5.20331597328186,
|
|
"epoch": 1.276176753121998,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048441807113196216,
|
|
"loss": 4.9849,
|
|
"mean_token_accuracy": 0.20858001410961152,
|
|
"num_tokens": 30455226.0,
|
|
"step": 13285
|
|
},
|
|
{
|
|
"entropy": 5.262969589233398,
|
|
"epoch": 1.276657060518732,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00048440553244395517,
|
|
"loss": 5.0852,
|
|
"mean_token_accuracy": 0.20551335960626602,
|
|
"num_tokens": 30467082.0,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"entropy": 5.207232666015625,
|
|
"epoch": 1.2771373679154658,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048439298889417357,
|
|
"loss": 4.9857,
|
|
"mean_token_accuracy": 0.20911924540996552,
|
|
"num_tokens": 30479051.0,
|
|
"step": 13295
|
|
},
|
|
{
|
|
"entropy": 5.192299127578735,
|
|
"epoch": 1.2776176753121997,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048438044048290847,
|
|
"loss": 5.0429,
|
|
"mean_token_accuracy": 0.20957115888595582,
|
|
"num_tokens": 30489989.0,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"entropy": 5.226817989349366,
|
|
"epoch": 1.2780979827089336,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048436788721045135,
|
|
"loss": 4.9441,
|
|
"mean_token_accuracy": 0.21679565608501433,
|
|
"num_tokens": 30501533.0,
|
|
"step": 13305
|
|
},
|
|
{
|
|
"entropy": 5.203074645996094,
|
|
"epoch": 1.2785782901056677,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004843553290770935,
|
|
"loss": 5.029,
|
|
"mean_token_accuracy": 0.2081605538725853,
|
|
"num_tokens": 30512231.0,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"entropy": 5.154972076416016,
|
|
"epoch": 1.2790585975024016,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004843427660831266,
|
|
"loss": 5.0001,
|
|
"mean_token_accuracy": 0.21241324096918107,
|
|
"num_tokens": 30523204.0,
|
|
"step": 13315
|
|
},
|
|
{
|
|
"entropy": 5.282387590408325,
|
|
"epoch": 1.2795389048991355,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00048433019822884235,
|
|
"loss": 5.1216,
|
|
"mean_token_accuracy": 0.20325924307107926,
|
|
"num_tokens": 30534956.0,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"entropy": 5.182562732696534,
|
|
"epoch": 1.2800192122958693,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004843176255145325,
|
|
"loss": 4.9731,
|
|
"mean_token_accuracy": 0.21960055232048034,
|
|
"num_tokens": 30545938.0,
|
|
"step": 13325
|
|
},
|
|
{
|
|
"entropy": 5.180881690979004,
|
|
"epoch": 1.2804995196926032,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004843050479404888,
|
|
"loss": 4.915,
|
|
"mean_token_accuracy": 0.21800871789455414,
|
|
"num_tokens": 30557323.0,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"entropy": 5.227671194076538,
|
|
"epoch": 1.280979827089337,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048429246550700343,
|
|
"loss": 4.9882,
|
|
"mean_token_accuracy": 0.21517169177532197,
|
|
"num_tokens": 30569960.0,
|
|
"step": 13335
|
|
},
|
|
{
|
|
"entropy": 5.110123968124389,
|
|
"epoch": 1.2814601344860712,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004842798782143686,
|
|
"loss": 5.0236,
|
|
"mean_token_accuracy": 0.20901857316493988,
|
|
"num_tokens": 30581904.0,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"entropy": 5.196750164031982,
|
|
"epoch": 1.281940441882805,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048426728606287627,
|
|
"loss": 4.9905,
|
|
"mean_token_accuracy": 0.21553199142217636,
|
|
"num_tokens": 30592955.0,
|
|
"step": 13345
|
|
},
|
|
{
|
|
"entropy": 5.22657151222229,
|
|
"epoch": 1.282420749279539,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004842546890528191,
|
|
"loss": 5.0424,
|
|
"mean_token_accuracy": 0.21174602657556535,
|
|
"num_tokens": 30604020.0,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"entropy": 5.152847194671631,
|
|
"epoch": 1.2829010566762729,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004842420871844893,
|
|
"loss": 4.9739,
|
|
"mean_token_accuracy": 0.2086465060710907,
|
|
"num_tokens": 30615623.0,
|
|
"step": 13355
|
|
},
|
|
{
|
|
"entropy": 5.217724800109863,
|
|
"epoch": 1.2833813640730067,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004842294804581796,
|
|
"loss": 5.0257,
|
|
"mean_token_accuracy": 0.2142942488193512,
|
|
"num_tokens": 30626258.0,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"entropy": 5.231122970581055,
|
|
"epoch": 1.2838616714697406,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00048421686887418266,
|
|
"loss": 5.008,
|
|
"mean_token_accuracy": 0.20600861310958862,
|
|
"num_tokens": 30637861.0,
|
|
"step": 13365
|
|
},
|
|
{
|
|
"entropy": 5.25348687171936,
|
|
"epoch": 1.2843419788664745,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004842042524327912,
|
|
"loss": 5.0313,
|
|
"mean_token_accuracy": 0.2152662232518196,
|
|
"num_tokens": 30648835.0,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"entropy": 5.268816089630127,
|
|
"epoch": 1.2848222862632084,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004841916311342983,
|
|
"loss": 5.071,
|
|
"mean_token_accuracy": 0.20168877840042115,
|
|
"num_tokens": 30659117.0,
|
|
"step": 13375
|
|
},
|
|
{
|
|
"entropy": 5.167082214355469,
|
|
"epoch": 1.2853025936599423,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004841790049789969,
|
|
"loss": 4.99,
|
|
"mean_token_accuracy": 0.21684323698282243,
|
|
"num_tokens": 30670282.0,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"entropy": 5.192442560195923,
|
|
"epoch": 1.2857829010566761,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00048416637396718004,
|
|
"loss": 5.049,
|
|
"mean_token_accuracy": 0.20397165417671204,
|
|
"num_tokens": 30681967.0,
|
|
"step": 13385
|
|
},
|
|
{
|
|
"entropy": 5.288777303695679,
|
|
"epoch": 1.2862632084534102,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004841537380991411,
|
|
"loss": 4.9953,
|
|
"mean_token_accuracy": 0.2100960224866867,
|
|
"num_tokens": 30692803.0,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"entropy": 5.132681179046631,
|
|
"epoch": 1.2867435158501441,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048414109737517346,
|
|
"loss": 4.8827,
|
|
"mean_token_accuracy": 0.22248595058918,
|
|
"num_tokens": 30704097.0,
|
|
"step": 13395
|
|
},
|
|
{
|
|
"entropy": 5.161887550354004,
|
|
"epoch": 1.287223823246878,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004841284517955706,
|
|
"loss": 5.0591,
|
|
"mean_token_accuracy": 0.20569444447755814,
|
|
"num_tokens": 30717983.0,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"entropy": 5.150148868560791,
|
|
"epoch": 1.2877041306436119,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000484115801360626,
|
|
"loss": 4.8516,
|
|
"mean_token_accuracy": 0.22625237703323364,
|
|
"num_tokens": 30728446.0,
|
|
"step": 13405
|
|
},
|
|
{
|
|
"entropy": 5.112018346786499,
|
|
"epoch": 1.2881844380403458,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004841031460706335,
|
|
"loss": 4.8186,
|
|
"mean_token_accuracy": 0.22128551304340363,
|
|
"num_tokens": 30739587.0,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"entropy": 5.120370292663575,
|
|
"epoch": 1.2886647454370797,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00048409048592588683,
|
|
"loss": 4.9393,
|
|
"mean_token_accuracy": 0.21629711836576462,
|
|
"num_tokens": 30750093.0,
|
|
"step": 13415
|
|
},
|
|
{
|
|
"entropy": 5.221544075012207,
|
|
"epoch": 1.2891450528338138,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004840778209266799,
|
|
"loss": 5.0089,
|
|
"mean_token_accuracy": 0.21404524147510529,
|
|
"num_tokens": 30761692.0,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"entropy": 5.132301568984985,
|
|
"epoch": 1.2896253602305476,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048406515107330685,
|
|
"loss": 4.9333,
|
|
"mean_token_accuracy": 0.21460689157247542,
|
|
"num_tokens": 30773474.0,
|
|
"step": 13425
|
|
},
|
|
{
|
|
"entropy": 5.293475818634033,
|
|
"epoch": 1.2901056676272815,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048405247636606173,
|
|
"loss": 5.1002,
|
|
"mean_token_accuracy": 0.20041738152503968,
|
|
"num_tokens": 30785464.0,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"entropy": 5.225921392440796,
|
|
"epoch": 1.2905859750240154,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00048403979680523894,
|
|
"loss": 4.9796,
|
|
"mean_token_accuracy": 0.21184006035327912,
|
|
"num_tokens": 30796343.0,
|
|
"step": 13435
|
|
},
|
|
{
|
|
"entropy": 5.233290290832519,
|
|
"epoch": 1.2910662824207493,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004840271123911328,
|
|
"loss": 4.9907,
|
|
"mean_token_accuracy": 0.21301163733005524,
|
|
"num_tokens": 30807795.0,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"entropy": 5.200629138946534,
|
|
"epoch": 1.2915465898174832,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004840144231240377,
|
|
"loss": 4.9783,
|
|
"mean_token_accuracy": 0.20906727910041809,
|
|
"num_tokens": 30819629.0,
|
|
"step": 13445
|
|
},
|
|
{
|
|
"entropy": 5.146392774581909,
|
|
"epoch": 1.292026897214217,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004840017290042484,
|
|
"loss": 4.9387,
|
|
"mean_token_accuracy": 0.21682157814502717,
|
|
"num_tokens": 30831545.0,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"entropy": 5.180679416656494,
|
|
"epoch": 1.292507204610951,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00048398903003205957,
|
|
"loss": 4.9697,
|
|
"mean_token_accuracy": 0.21640813797712327,
|
|
"num_tokens": 30843614.0,
|
|
"step": 13455
|
|
},
|
|
{
|
|
"entropy": 5.144548463821411,
|
|
"epoch": 1.2929875120076848,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048397632620776604,
|
|
"loss": 4.9008,
|
|
"mean_token_accuracy": 0.21930991858243942,
|
|
"num_tokens": 30853749.0,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"entropy": 5.183133602142334,
|
|
"epoch": 1.293467819404419,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048396361753166276,
|
|
"loss": 4.9319,
|
|
"mean_token_accuracy": 0.21686296314001083,
|
|
"num_tokens": 30865065.0,
|
|
"step": 13465
|
|
},
|
|
{
|
|
"entropy": 5.188254976272583,
|
|
"epoch": 1.2939481268011528,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048395090400404466,
|
|
"loss": 5.0198,
|
|
"mean_token_accuracy": 0.209990793466568,
|
|
"num_tokens": 30876746.0,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"entropy": 5.24221830368042,
|
|
"epoch": 1.2944284341978867,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048393818562520715,
|
|
"loss": 5.0519,
|
|
"mean_token_accuracy": 0.21296084821224212,
|
|
"num_tokens": 30889543.0,
|
|
"step": 13475
|
|
},
|
|
{
|
|
"entropy": 5.221997165679932,
|
|
"epoch": 1.2949087415946205,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00048392546239544535,
|
|
"loss": 5.0196,
|
|
"mean_token_accuracy": 0.21187058687210084,
|
|
"num_tokens": 30901949.0,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"entropy": 5.177766799926758,
|
|
"epoch": 1.2953890489913544,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004839127343150547,
|
|
"loss": 5.0104,
|
|
"mean_token_accuracy": 0.2122276872396469,
|
|
"num_tokens": 30912660.0,
|
|
"step": 13485
|
|
},
|
|
{
|
|
"entropy": 5.1722331047058105,
|
|
"epoch": 1.2958693563880883,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004839000013843307,
|
|
"loss": 4.9911,
|
|
"mean_token_accuracy": 0.2134689912199974,
|
|
"num_tokens": 30924276.0,
|
|
"step": 13490
|
|
},
|
|
{
|
|
"entropy": 5.172789239883423,
|
|
"epoch": 1.2963496637848224,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048388726360356894,
|
|
"loss": 4.9235,
|
|
"mean_token_accuracy": 0.21602853089571,
|
|
"num_tokens": 30935169.0,
|
|
"step": 13495
|
|
},
|
|
{
|
|
"entropy": 5.292079591751099,
|
|
"epoch": 1.2968299711815563,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004838745209730653,
|
|
"loss": 5.1031,
|
|
"mean_token_accuracy": 0.20405390560626985,
|
|
"num_tokens": 30946116.0,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"entropy": 5.241908931732178,
|
|
"epoch": 1.2973102785782902,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00048386177349311535,
|
|
"loss": 5.0269,
|
|
"mean_token_accuracy": 0.20935841649770737,
|
|
"num_tokens": 30958509.0,
|
|
"step": 13505
|
|
},
|
|
{
|
|
"entropy": 5.223677349090576,
|
|
"epoch": 1.297790585975024,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004838490211640153,
|
|
"loss": 5.0269,
|
|
"mean_token_accuracy": 0.2150777280330658,
|
|
"num_tokens": 30969501.0,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"entropy": 5.135782957077026,
|
|
"epoch": 1.298270893371758,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004838362639860611,
|
|
"loss": 4.9413,
|
|
"mean_token_accuracy": 0.21526733487844468,
|
|
"num_tokens": 30979768.0,
|
|
"step": 13515
|
|
},
|
|
{
|
|
"entropy": 5.1186598777771,
|
|
"epoch": 1.2987512007684918,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000483823501959549,
|
|
"loss": 4.9318,
|
|
"mean_token_accuracy": 0.22160837799310684,
|
|
"num_tokens": 30992020.0,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"entropy": 5.242673254013061,
|
|
"epoch": 1.2992315081652257,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048381073508477527,
|
|
"loss": 5.0647,
|
|
"mean_token_accuracy": 0.21089437007904052,
|
|
"num_tokens": 31004081.0,
|
|
"step": 13525
|
|
},
|
|
{
|
|
"entropy": 5.293477010726929,
|
|
"epoch": 1.2997118155619596,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00048379796336203625,
|
|
"loss": 5.0783,
|
|
"mean_token_accuracy": 0.2032276600599289,
|
|
"num_tokens": 31015776.0,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"entropy": 5.198190069198608,
|
|
"epoch": 1.3001921229586935,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004837851867916286,
|
|
"loss": 4.9279,
|
|
"mean_token_accuracy": 0.21109964847564697,
|
|
"num_tokens": 31028066.0,
|
|
"step": 13535
|
|
},
|
|
{
|
|
"entropy": 5.151645803451538,
|
|
"epoch": 1.3006724303554273,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004837724053738488,
|
|
"loss": 4.9701,
|
|
"mean_token_accuracy": 0.21504315882921218,
|
|
"num_tokens": 31040234.0,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"entropy": 5.235762643814087,
|
|
"epoch": 1.3011527377521614,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00048375961910899373,
|
|
"loss": 5.02,
|
|
"mean_token_accuracy": 0.21187748908996581,
|
|
"num_tokens": 31051158.0,
|
|
"step": 13545
|
|
},
|
|
{
|
|
"entropy": 5.290946435928345,
|
|
"epoch": 1.3016330451488953,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004837468279973602,
|
|
"loss": 5.0187,
|
|
"mean_token_accuracy": 0.21141756772994996,
|
|
"num_tokens": 31063404.0,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"entropy": 5.1908276081085205,
|
|
"epoch": 1.3021133525456292,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004837340320392451,
|
|
"loss": 4.9814,
|
|
"mean_token_accuracy": 0.2137753427028656,
|
|
"num_tokens": 31073845.0,
|
|
"step": 13555
|
|
},
|
|
{
|
|
"entropy": 5.128818511962891,
|
|
"epoch": 1.302593659942363,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00048372123123494563,
|
|
"loss": 5.0075,
|
|
"mean_token_accuracy": 0.20479750484228135,
|
|
"num_tokens": 31086914.0,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"entropy": 5.209336137771606,
|
|
"epoch": 1.303073967339097,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000483708425584759,
|
|
"loss": 4.9457,
|
|
"mean_token_accuracy": 0.21852964758872986,
|
|
"num_tokens": 31098627.0,
|
|
"step": 13565
|
|
},
|
|
{
|
|
"entropy": 5.310657691955567,
|
|
"epoch": 1.3035542747358309,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004836956150889825,
|
|
"loss": 5.2078,
|
|
"mean_token_accuracy": 0.20212220698595046,
|
|
"num_tokens": 31110064.0,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"entropy": 5.232502269744873,
|
|
"epoch": 1.304034582132565,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004836827997479134,
|
|
"loss": 4.94,
|
|
"mean_token_accuracy": 0.22403647303581237,
|
|
"num_tokens": 31121572.0,
|
|
"step": 13575
|
|
},
|
|
{
|
|
"entropy": 5.2666408061981205,
|
|
"epoch": 1.3045148895292988,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004836699795618494,
|
|
"loss": 5.0574,
|
|
"mean_token_accuracy": 0.20667948424816132,
|
|
"num_tokens": 31133276.0,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"entropy": 5.261113119125366,
|
|
"epoch": 1.3049951969260327,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004836571545310881,
|
|
"loss": 5.0826,
|
|
"mean_token_accuracy": 0.20838973075151443,
|
|
"num_tokens": 31144524.0,
|
|
"step": 13585
|
|
},
|
|
{
|
|
"entropy": 5.195058012008667,
|
|
"epoch": 1.3054755043227666,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00048364432465592723,
|
|
"loss": 4.9389,
|
|
"mean_token_accuracy": 0.2192491337656975,
|
|
"num_tokens": 31155775.0,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"entropy": 5.19091010093689,
|
|
"epoch": 1.3059558117195005,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004836314899366647,
|
|
"loss": 4.9018,
|
|
"mean_token_accuracy": 0.21838261038064957,
|
|
"num_tokens": 31166176.0,
|
|
"step": 13595
|
|
},
|
|
{
|
|
"entropy": 5.180618000030518,
|
|
"epoch": 1.3064361191162344,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048361865037359846,
|
|
"loss": 4.9768,
|
|
"mean_token_accuracy": 0.21404249221086502,
|
|
"num_tokens": 31178568.0,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"entropy": 5.244294548034668,
|
|
"epoch": 1.3069164265129682,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00048360580596702664,
|
|
"loss": 5.0507,
|
|
"mean_token_accuracy": 0.21091840416193008,
|
|
"num_tokens": 31188897.0,
|
|
"step": 13605
|
|
},
|
|
{
|
|
"entropy": 5.194856262207031,
|
|
"epoch": 1.3073967339097021,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048359295671724744,
|
|
"loss": 5.0011,
|
|
"mean_token_accuracy": 0.21148939728736876,
|
|
"num_tokens": 31201370.0,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"entropy": 5.184963703155518,
|
|
"epoch": 1.307877041306436,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004835801026245592,
|
|
"loss": 5.0653,
|
|
"mean_token_accuracy": 0.20697593539953232,
|
|
"num_tokens": 31212704.0,
|
|
"step": 13615
|
|
},
|
|
{
|
|
"entropy": 5.2461165428161625,
|
|
"epoch": 1.30835734870317,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048356724368926035,
|
|
"loss": 5.0037,
|
|
"mean_token_accuracy": 0.21529320627450943,
|
|
"num_tokens": 31224447.0,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"entropy": 5.257749462127686,
|
|
"epoch": 1.308837656099904,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00048355437991164937,
|
|
"loss": 5.0519,
|
|
"mean_token_accuracy": 0.21352533251047134,
|
|
"num_tokens": 31235648.0,
|
|
"step": 13625
|
|
},
|
|
{
|
|
"entropy": 5.160556888580322,
|
|
"epoch": 1.3093179634966379,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000483541511292025,
|
|
"loss": 5.0468,
|
|
"mean_token_accuracy": 0.21006453037261963,
|
|
"num_tokens": 31248330.0,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"entropy": 5.208528804779053,
|
|
"epoch": 1.3097982708933718,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048352863783068594,
|
|
"loss": 5.0721,
|
|
"mean_token_accuracy": 0.21153536587953567,
|
|
"num_tokens": 31260036.0,
|
|
"step": 13635
|
|
},
|
|
{
|
|
"entropy": 5.1190698623657225,
|
|
"epoch": 1.3102785782901056,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00048351575952793117,
|
|
"loss": 4.8524,
|
|
"mean_token_accuracy": 0.21958549171686173,
|
|
"num_tokens": 31271473.0,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"entropy": 5.102574396133423,
|
|
"epoch": 1.3107588856868395,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004835028763840595,
|
|
"loss": 4.8655,
|
|
"mean_token_accuracy": 0.21855390667915345,
|
|
"num_tokens": 31282045.0,
|
|
"step": 13645
|
|
},
|
|
{
|
|
"entropy": 5.208409547805786,
|
|
"epoch": 1.3112391930835736,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004834899883993703,
|
|
"loss": 5.0836,
|
|
"mean_token_accuracy": 0.2093469277024269,
|
|
"num_tokens": 31292644.0,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"entropy": 5.228438043594361,
|
|
"epoch": 1.3117195004803075,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00048347709557416263,
|
|
"loss": 5.0165,
|
|
"mean_token_accuracy": 0.22113081067800522,
|
|
"num_tokens": 31304422.0,
|
|
"step": 13655
|
|
},
|
|
{
|
|
"entropy": 5.12980465888977,
|
|
"epoch": 1.3121998078770414,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004834641979087359,
|
|
"loss": 4.9284,
|
|
"mean_token_accuracy": 0.21452227383852004,
|
|
"num_tokens": 31314845.0,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"entropy": 5.248886060714722,
|
|
"epoch": 1.3126801152737753,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004834512954033894,
|
|
"loss": 5.0768,
|
|
"mean_token_accuracy": 0.20920535922050476,
|
|
"num_tokens": 31325669.0,
|
|
"step": 13665
|
|
},
|
|
{
|
|
"entropy": 5.186451864242554,
|
|
"epoch": 1.3131604226705091,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048343838805842284,
|
|
"loss": 4.9737,
|
|
"mean_token_accuracy": 0.22008894085884095,
|
|
"num_tokens": 31336023.0,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"entropy": 5.238216543197632,
|
|
"epoch": 1.313640730067243,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048342547587413583,
|
|
"loss": 5.0729,
|
|
"mean_token_accuracy": 0.20934326946735382,
|
|
"num_tokens": 31347146.0,
|
|
"step": 13675
|
|
},
|
|
{
|
|
"entropy": 5.273143291473389,
|
|
"epoch": 1.314121037463977,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004834125588508282,
|
|
"loss": 5.0544,
|
|
"mean_token_accuracy": 0.2122495487332344,
|
|
"num_tokens": 31357616.0,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"entropy": 5.262011289596558,
|
|
"epoch": 1.3146013448607108,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004833996369887998,
|
|
"loss": 5.0584,
|
|
"mean_token_accuracy": 0.2167961359024048,
|
|
"num_tokens": 31369189.0,
|
|
"step": 13685
|
|
},
|
|
{
|
|
"entropy": 5.159175825119019,
|
|
"epoch": 1.3150816522574447,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048338671028835063,
|
|
"loss": 4.9154,
|
|
"mean_token_accuracy": 0.22063409090042113,
|
|
"num_tokens": 31380841.0,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"entropy": 5.126403951644898,
|
|
"epoch": 1.3155619596541785,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048337377874978086,
|
|
"loss": 4.9398,
|
|
"mean_token_accuracy": 0.21937094777822494,
|
|
"num_tokens": 31393092.0,
|
|
"step": 13695
|
|
},
|
|
{
|
|
"entropy": 5.165436792373657,
|
|
"epoch": 1.3160422670509127,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00048336084237339067,
|
|
"loss": 4.9908,
|
|
"mean_token_accuracy": 0.21392715871334075,
|
|
"num_tokens": 31404228.0,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"entropy": 5.2636415481567385,
|
|
"epoch": 1.3165225744476465,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004833479011594804,
|
|
"loss": 5.0796,
|
|
"mean_token_accuracy": 0.20588037818670274,
|
|
"num_tokens": 31416233.0,
|
|
"step": 13705
|
|
},
|
|
{
|
|
"entropy": 5.302021265029907,
|
|
"epoch": 1.3170028818443804,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00048333495510835057,
|
|
"loss": 5.0554,
|
|
"mean_token_accuracy": 0.2089495837688446,
|
|
"num_tokens": 31427787.0,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"entropy": 5.034874057769775,
|
|
"epoch": 1.3174831892411143,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048332200422030163,
|
|
"loss": 4.8463,
|
|
"mean_token_accuracy": 0.21873539835214614,
|
|
"num_tokens": 31438736.0,
|
|
"step": 13715
|
|
},
|
|
{
|
|
"entropy": 5.0820282936096195,
|
|
"epoch": 1.3179634966378482,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004833090484956345,
|
|
"loss": 4.8609,
|
|
"mean_token_accuracy": 0.2156251400709152,
|
|
"num_tokens": 31449225.0,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"entropy": 5.3021155834198,
|
|
"epoch": 1.318443804034582,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00048329608793464966,
|
|
"loss": 5.0795,
|
|
"mean_token_accuracy": 0.21024447679519653,
|
|
"num_tokens": 31461223.0,
|
|
"step": 13725
|
|
},
|
|
{
|
|
"entropy": 5.292631769180298,
|
|
"epoch": 1.3189241114313162,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004832831225376482,
|
|
"loss": 5.0905,
|
|
"mean_token_accuracy": 0.2085978850722313,
|
|
"num_tokens": 31472397.0,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"entropy": 5.110633659362793,
|
|
"epoch": 1.31940441882805,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004832701523049312,
|
|
"loss": 4.9209,
|
|
"mean_token_accuracy": 0.22347624897956847,
|
|
"num_tokens": 31484079.0,
|
|
"step": 13735
|
|
},
|
|
{
|
|
"entropy": 5.168765449523926,
|
|
"epoch": 1.319884726224784,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004832571772367997,
|
|
"loss": 4.9554,
|
|
"mean_token_accuracy": 0.20989650189876558,
|
|
"num_tokens": 31496732.0,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"entropy": 5.222825241088867,
|
|
"epoch": 1.3203650336215178,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00048324419733355485,
|
|
"loss": 4.8996,
|
|
"mean_token_accuracy": 0.22089578211307526,
|
|
"num_tokens": 31507601.0,
|
|
"step": 13745
|
|
},
|
|
{
|
|
"entropy": 5.175820016860962,
|
|
"epoch": 1.3208453410182517,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00048323121259549805,
|
|
"loss": 4.9605,
|
|
"mean_token_accuracy": 0.21191650182008742,
|
|
"num_tokens": 31519551.0,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"entropy": 5.148926830291748,
|
|
"epoch": 1.3213256484149856,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048321822302293095,
|
|
"loss": 5.0144,
|
|
"mean_token_accuracy": 0.21526147425174713,
|
|
"num_tokens": 31531531.0,
|
|
"step": 13755
|
|
},
|
|
{
|
|
"entropy": 5.234309291839599,
|
|
"epoch": 1.3218059558117194,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004832052286161549,
|
|
"loss": 4.9451,
|
|
"mean_token_accuracy": 0.20788251608610153,
|
|
"num_tokens": 31542386.0,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"entropy": 5.1972698211669925,
|
|
"epoch": 1.3222862632084533,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00048319222937547176,
|
|
"loss": 4.9567,
|
|
"mean_token_accuracy": 0.21743978857994078,
|
|
"num_tokens": 31554142.0,
|
|
"step": 13765
|
|
},
|
|
{
|
|
"entropy": 5.197652673721313,
|
|
"epoch": 1.3227665706051872,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00048317922530118323,
|
|
"loss": 4.9909,
|
|
"mean_token_accuracy": 0.210670205950737,
|
|
"num_tokens": 31566687.0,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"entropy": 5.255802297592163,
|
|
"epoch": 1.3232468780019213,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004831662163935912,
|
|
"loss": 5.0836,
|
|
"mean_token_accuracy": 0.20739447325468063,
|
|
"num_tokens": 31577893.0,
|
|
"step": 13775
|
|
},
|
|
{
|
|
"entropy": 5.338723754882812,
|
|
"epoch": 1.3237271853986552,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048315320265299784,
|
|
"loss": 5.141,
|
|
"mean_token_accuracy": 0.20476285815238954,
|
|
"num_tokens": 31590483.0,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"entropy": 5.232583475112915,
|
|
"epoch": 1.324207492795389,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048314018407970516,
|
|
"loss": 5.0018,
|
|
"mean_token_accuracy": 0.21295965909957887,
|
|
"num_tokens": 31601373.0,
|
|
"step": 13785
|
|
},
|
|
{
|
|
"entropy": 5.163480615615844,
|
|
"epoch": 1.324687800192123,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048312716067401535,
|
|
"loss": 5.037,
|
|
"mean_token_accuracy": 0.20841159224510192,
|
|
"num_tokens": 31613981.0,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"entropy": 5.269176197052002,
|
|
"epoch": 1.3251681075888568,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000483114132436231,
|
|
"loss": 5.0406,
|
|
"mean_token_accuracy": 0.20180656611919404,
|
|
"num_tokens": 31626727.0,
|
|
"step": 13795
|
|
},
|
|
{
|
|
"entropy": 5.169664478302002,
|
|
"epoch": 1.3256484149855907,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004831010993666545,
|
|
"loss": 4.9317,
|
|
"mean_token_accuracy": 0.21578803807497024,
|
|
"num_tokens": 31638519.0,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"entropy": 5.129696464538574,
|
|
"epoch": 1.3261287223823248,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004830880614655884,
|
|
"loss": 4.9147,
|
|
"mean_token_accuracy": 0.21223049610853195,
|
|
"num_tokens": 31649184.0,
|
|
"step": 13805
|
|
},
|
|
{
|
|
"entropy": 5.234156227111816,
|
|
"epoch": 1.3266090297790587,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00048307501873333527,
|
|
"loss": 5.0198,
|
|
"mean_token_accuracy": 0.21354001611471177,
|
|
"num_tokens": 31660266.0,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"entropy": 5.198557376861572,
|
|
"epoch": 1.3270893371757926,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004830619711701982,
|
|
"loss": 4.978,
|
|
"mean_token_accuracy": 0.2158899873495102,
|
|
"num_tokens": 31670841.0,
|
|
"step": 13815
|
|
},
|
|
{
|
|
"entropy": 5.151759386062622,
|
|
"epoch": 1.3275696445725265,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004830489187764799,
|
|
"loss": 4.9399,
|
|
"mean_token_accuracy": 0.21373986601829528,
|
|
"num_tokens": 31680786.0,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"entropy": 5.0768732070922855,
|
|
"epoch": 1.3280499519692603,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004830358615524835,
|
|
"loss": 4.9565,
|
|
"mean_token_accuracy": 0.21601667404174804,
|
|
"num_tokens": 31692639.0,
|
|
"step": 13825
|
|
},
|
|
{
|
|
"entropy": 5.29799165725708,
|
|
"epoch": 1.3285302593659942,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048302279949851215,
|
|
"loss": 5.0705,
|
|
"mean_token_accuracy": 0.21043513119220733,
|
|
"num_tokens": 31703520.0,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"entropy": 5.207717561721802,
|
|
"epoch": 1.329010566762728,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048300973261486906,
|
|
"loss": 4.9361,
|
|
"mean_token_accuracy": 0.22781557142734526,
|
|
"num_tokens": 31715659.0,
|
|
"step": 13835
|
|
},
|
|
{
|
|
"entropy": 5.160664987564087,
|
|
"epoch": 1.329490874159462,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004829966609018577,
|
|
"loss": 5.009,
|
|
"mean_token_accuracy": 0.20956473052501678,
|
|
"num_tokens": 31727896.0,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"entropy": 5.167005348205566,
|
|
"epoch": 1.3299711815561959,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048298358435978146,
|
|
"loss": 4.9763,
|
|
"mean_token_accuracy": 0.21739626675844193,
|
|
"num_tokens": 31738618.0,
|
|
"step": 13845
|
|
},
|
|
{
|
|
"entropy": 5.205134677886963,
|
|
"epoch": 1.3304514889529298,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048297050298894394,
|
|
"loss": 5.0286,
|
|
"mean_token_accuracy": 0.20701279789209365,
|
|
"num_tokens": 31750306.0,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"entropy": 5.272959852218628,
|
|
"epoch": 1.3309317963496639,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004829574167896489,
|
|
"loss": 5.0978,
|
|
"mean_token_accuracy": 0.20369507372379303,
|
|
"num_tokens": 31762015.0,
|
|
"step": 13855
|
|
},
|
|
{
|
|
"entropy": 5.2029674530029295,
|
|
"epoch": 1.3314121037463977,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00048294432576220027,
|
|
"loss": 4.9805,
|
|
"mean_token_accuracy": 0.2133902981877327,
|
|
"num_tokens": 31774120.0,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"entropy": 5.171335029602051,
|
|
"epoch": 1.3318924111431316,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004829312299069017,
|
|
"loss": 4.9216,
|
|
"mean_token_accuracy": 0.22213226556777954,
|
|
"num_tokens": 31785485.0,
|
|
"step": 13865
|
|
},
|
|
{
|
|
"entropy": 5.200899696350097,
|
|
"epoch": 1.3323727185398655,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00048291812922405755,
|
|
"loss": 5.0193,
|
|
"mean_token_accuracy": 0.21298006922006607,
|
|
"num_tokens": 31795621.0,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"entropy": 5.163142156600952,
|
|
"epoch": 1.3328530259365994,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004829050237139717,
|
|
"loss": 4.908,
|
|
"mean_token_accuracy": 0.21594492197036744,
|
|
"num_tokens": 31806761.0,
|
|
"step": 13875
|
|
},
|
|
{
|
|
"entropy": 5.23978180885315,
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004828919133769486,
|
|
"loss": 4.9982,
|
|
"mean_token_accuracy": 0.2074069321155548,
|
|
"num_tokens": 31817996.0,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"entropy": 5.220539999008179,
|
|
"epoch": 1.3338136407300674,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004828787982132926,
|
|
"loss": 5.0618,
|
|
"mean_token_accuracy": 0.21009029895067216,
|
|
"num_tokens": 31829888.0,
|
|
"step": 13885
|
|
},
|
|
{
|
|
"entropy": 5.245324230194091,
|
|
"epoch": 1.3342939481268012,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048286567822330815,
|
|
"loss": 4.9951,
|
|
"mean_token_accuracy": 0.21646622121334075,
|
|
"num_tokens": 31842582.0,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"entropy": 5.1263108253479,
|
|
"epoch": 1.3347742555235351,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004828525534072999,
|
|
"loss": 4.9066,
|
|
"mean_token_accuracy": 0.2145892322063446,
|
|
"num_tokens": 31854765.0,
|
|
"step": 13895
|
|
},
|
|
{
|
|
"entropy": 5.185867691040039,
|
|
"epoch": 1.335254562920269,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048283942376557254,
|
|
"loss": 4.9543,
|
|
"mean_token_accuracy": 0.20899006426334382,
|
|
"num_tokens": 31865505.0,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"entropy": 5.1626379013061525,
|
|
"epoch": 1.3357348703170029,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048282628929843097,
|
|
"loss": 4.9562,
|
|
"mean_token_accuracy": 0.2139149159193039,
|
|
"num_tokens": 31876599.0,
|
|
"step": 13905
|
|
},
|
|
{
|
|
"entropy": 5.196528911590576,
|
|
"epoch": 1.3362151777137368,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00048281315000617996,
|
|
"loss": 4.9635,
|
|
"mean_token_accuracy": 0.21398892104625702,
|
|
"num_tokens": 31887292.0,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"entropy": 5.179883575439453,
|
|
"epoch": 1.3366954851104706,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004828000058891248,
|
|
"loss": 5.0204,
|
|
"mean_token_accuracy": 0.20837054550647735,
|
|
"num_tokens": 31899018.0,
|
|
"step": 13915
|
|
},
|
|
{
|
|
"entropy": 5.125789833068848,
|
|
"epoch": 1.3371757925072045,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004827868569475706,
|
|
"loss": 4.9423,
|
|
"mean_token_accuracy": 0.2189765304327011,
|
|
"num_tokens": 31910402.0,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"entropy": 5.2392956733703615,
|
|
"epoch": 1.3376560999039384,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048277370318182243,
|
|
"loss": 5.0742,
|
|
"mean_token_accuracy": 0.2051733672618866,
|
|
"num_tokens": 31921792.0,
|
|
"step": 13925
|
|
},
|
|
{
|
|
"entropy": 5.264586544036865,
|
|
"epoch": 1.3381364073006723,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048276054459218596,
|
|
"loss": 5.0715,
|
|
"mean_token_accuracy": 0.206744547188282,
|
|
"num_tokens": 31932786.0,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"entropy": 5.2243023872375485,
|
|
"epoch": 1.3386167146974064,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048274738117896643,
|
|
"loss": 5.0083,
|
|
"mean_token_accuracy": 0.21426367163658142,
|
|
"num_tokens": 31944286.0,
|
|
"step": 13935
|
|
},
|
|
{
|
|
"entropy": 5.166335105895996,
|
|
"epoch": 1.3390970220941403,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048273421294246966,
|
|
"loss": 5.069,
|
|
"mean_token_accuracy": 0.21005858927965165,
|
|
"num_tokens": 31956304.0,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"entropy": 5.209878873825073,
|
|
"epoch": 1.3395773294908742,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048272103988300134,
|
|
"loss": 5.0088,
|
|
"mean_token_accuracy": 0.21268565505743026,
|
|
"num_tokens": 31968978.0,
|
|
"step": 13945
|
|
},
|
|
{
|
|
"entropy": 5.243113946914673,
|
|
"epoch": 1.340057636887608,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004827078620008672,
|
|
"loss": 4.987,
|
|
"mean_token_accuracy": 0.20750254094600679,
|
|
"num_tokens": 31980133.0,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"entropy": 5.202300357818603,
|
|
"epoch": 1.340537944284342,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048269467929637337,
|
|
"loss": 4.9367,
|
|
"mean_token_accuracy": 0.22350060045719147,
|
|
"num_tokens": 31990331.0,
|
|
"step": 13955
|
|
},
|
|
{
|
|
"entropy": 5.211510848999024,
|
|
"epoch": 1.341018251681076,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048268149176982576,
|
|
"loss": 5.0098,
|
|
"mean_token_accuracy": 0.2139397069811821,
|
|
"num_tokens": 32001956.0,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"entropy": 5.2395045280456545,
|
|
"epoch": 1.34149855907781,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048266829942153055,
|
|
"loss": 5.054,
|
|
"mean_token_accuracy": 0.21206386983394623,
|
|
"num_tokens": 32013577.0,
|
|
"step": 13965
|
|
},
|
|
{
|
|
"entropy": 5.341452741622925,
|
|
"epoch": 1.3419788664745438,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048265510225179413,
|
|
"loss": 5.1105,
|
|
"mean_token_accuracy": 0.20956795960664748,
|
|
"num_tokens": 32025751.0,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"entropy": 5.188568878173828,
|
|
"epoch": 1.3424591738712777,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004826419002609229,
|
|
"loss": 5.0519,
|
|
"mean_token_accuracy": 0.20738618373870848,
|
|
"num_tokens": 32036592.0,
|
|
"step": 13975
|
|
},
|
|
{
|
|
"entropy": 5.274795866012573,
|
|
"epoch": 1.3429394812680115,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00048262869344922326,
|
|
"loss": 5.0194,
|
|
"mean_token_accuracy": 0.2111186280846596,
|
|
"num_tokens": 32048122.0,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"entropy": 5.239737606048584,
|
|
"epoch": 1.3434197886647454,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00048261548181700186,
|
|
"loss": 4.9765,
|
|
"mean_token_accuracy": 0.2105955883860588,
|
|
"num_tokens": 32058673.0,
|
|
"step": 13985
|
|
},
|
|
{
|
|
"entropy": 5.085059738159179,
|
|
"epoch": 1.3439000960614793,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004826022653645655,
|
|
"loss": 4.924,
|
|
"mean_token_accuracy": 0.2151069536805153,
|
|
"num_tokens": 32070219.0,
|
|
"step": 13990
|
|
},
|
|
{
|
|
"entropy": 5.24159984588623,
|
|
"epoch": 1.3443804034582132,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000482589044092221,
|
|
"loss": 5.0455,
|
|
"mean_token_accuracy": 0.20947272181510926,
|
|
"num_tokens": 32081883.0,
|
|
"step": 13995
|
|
},
|
|
{
|
|
"entropy": 5.30658369064331,
|
|
"epoch": 1.344860710854947,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048257581800027527,
|
|
"loss": 5.0672,
|
|
"mean_token_accuracy": 0.20932937860488893,
|
|
"num_tokens": 32094107.0,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"entropy": 5.196062517166138,
|
|
"epoch": 1.345341018251681,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004825625870890354,
|
|
"loss": 4.936,
|
|
"mean_token_accuracy": 0.21412646919488906,
|
|
"num_tokens": 32105242.0,
|
|
"step": 14005
|
|
},
|
|
{
|
|
"entropy": 5.214570760726929,
|
|
"epoch": 1.345821325648415,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004825493513588086,
|
|
"loss": 4.9387,
|
|
"mean_token_accuracy": 0.2163090154528618,
|
|
"num_tokens": 32117326.0,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"entropy": 5.2475536346435545,
|
|
"epoch": 1.346301633045149,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048253611080990226,
|
|
"loss": 5.0293,
|
|
"mean_token_accuracy": 0.21036939769983293,
|
|
"num_tokens": 32129002.0,
|
|
"step": 14015
|
|
},
|
|
{
|
|
"entropy": 5.212005138397217,
|
|
"epoch": 1.3467819404418828,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004825228654426236,
|
|
"loss": 5.0548,
|
|
"mean_token_accuracy": 0.2069345846772194,
|
|
"num_tokens": 32140380.0,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"entropy": 5.134690666198731,
|
|
"epoch": 1.3472622478386167,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004825096152572803,
|
|
"loss": 4.9054,
|
|
"mean_token_accuracy": 0.21957446187734603,
|
|
"num_tokens": 32151806.0,
|
|
"step": 14025
|
|
},
|
|
{
|
|
"entropy": 5.20372257232666,
|
|
"epoch": 1.3477425552353506,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048249636025417974,
|
|
"loss": 5.1018,
|
|
"mean_token_accuracy": 0.2116215631365776,
|
|
"num_tokens": 32163426.0,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"entropy": 5.213733768463134,
|
|
"epoch": 1.3482228626320845,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048248310043362997,
|
|
"loss": 5.0054,
|
|
"mean_token_accuracy": 0.21968272477388381,
|
|
"num_tokens": 32174349.0,
|
|
"step": 14035
|
|
},
|
|
{
|
|
"entropy": 5.264178276062012,
|
|
"epoch": 1.3487031700288186,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004824698357959386,
|
|
"loss": 5.0717,
|
|
"mean_token_accuracy": 0.2072421357035637,
|
|
"num_tokens": 32185382.0,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"entropy": 5.181209135055542,
|
|
"epoch": 1.3491834774255524,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048245656634141385,
|
|
"loss": 4.9607,
|
|
"mean_token_accuracy": 0.21428043842315675,
|
|
"num_tokens": 32195687.0,
|
|
"step": 14045
|
|
},
|
|
{
|
|
"entropy": 5.154476022720337,
|
|
"epoch": 1.3496637848222863,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048244329207036354,
|
|
"loss": 4.9185,
|
|
"mean_token_accuracy": 0.22052669078111647,
|
|
"num_tokens": 32205973.0,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"entropy": 5.170621156692505,
|
|
"epoch": 1.3501440922190202,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00048243001298309604,
|
|
"loss": 4.9966,
|
|
"mean_token_accuracy": 0.21300121247768403,
|
|
"num_tokens": 32217469.0,
|
|
"step": 14055
|
|
},
|
|
{
|
|
"entropy": 5.1875158786773685,
|
|
"epoch": 1.350624399615754,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048241672907991954,
|
|
"loss": 5.0227,
|
|
"mean_token_accuracy": 0.21119635105133056,
|
|
"num_tokens": 32228257.0,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"entropy": 5.206764030456543,
|
|
"epoch": 1.351104707012488,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004824034403611424,
|
|
"loss": 4.9875,
|
|
"mean_token_accuracy": 0.21840890049934386,
|
|
"num_tokens": 32239420.0,
|
|
"step": 14065
|
|
},
|
|
{
|
|
"entropy": 5.238004541397094,
|
|
"epoch": 1.3515850144092219,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004823901468270733,
|
|
"loss": 5.0209,
|
|
"mean_token_accuracy": 0.21211865544319153,
|
|
"num_tokens": 32250962.0,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"entropy": 5.234237623214722,
|
|
"epoch": 1.3520653218059557,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004823768484780209,
|
|
"loss": 5.0047,
|
|
"mean_token_accuracy": 0.21667125970125198,
|
|
"num_tokens": 32262310.0,
|
|
"step": 14075
|
|
},
|
|
{
|
|
"entropy": 5.1772243022918705,
|
|
"epoch": 1.3525456292026896,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00048236354531429375,
|
|
"loss": 4.903,
|
|
"mean_token_accuracy": 0.21903317421674728,
|
|
"num_tokens": 32273373.0,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"entropy": 5.260414171218872,
|
|
"epoch": 1.3530259365994235,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004823502373362009,
|
|
"loss": 5.0224,
|
|
"mean_token_accuracy": 0.21051734387874604,
|
|
"num_tokens": 32285020.0,
|
|
"step": 14085
|
|
},
|
|
{
|
|
"entropy": 5.182881259918213,
|
|
"epoch": 1.3535062439961576,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004823369245440512,
|
|
"loss": 5.0027,
|
|
"mean_token_accuracy": 0.21150606274604797,
|
|
"num_tokens": 32296224.0,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"entropy": 5.177131128311157,
|
|
"epoch": 1.3539865513928915,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048232360693815387,
|
|
"loss": 5.0028,
|
|
"mean_token_accuracy": 0.21548304408788682,
|
|
"num_tokens": 32306913.0,
|
|
"step": 14095
|
|
},
|
|
{
|
|
"entropy": 5.239502000808716,
|
|
"epoch": 1.3544668587896254,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048231028451881786,
|
|
"loss": 4.9757,
|
|
"mean_token_accuracy": 0.21516055166721343,
|
|
"num_tokens": 32317981.0,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"entropy": 5.112478399276734,
|
|
"epoch": 1.3549471661863592,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 0.0004822969572863527,
|
|
"loss": 4.8805,
|
|
"mean_token_accuracy": 0.22126417160034179,
|
|
"num_tokens": 32329656.0,
|
|
"step": 14105
|
|
},
|
|
{
|
|
"entropy": 5.247384786605835,
|
|
"epoch": 1.3554274735830931,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048228362524106776,
|
|
"loss": 5.0463,
|
|
"mean_token_accuracy": 0.2171102821826935,
|
|
"num_tokens": 32339460.0,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"entropy": 5.15761137008667,
|
|
"epoch": 1.3559077809798272,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048227028838327253,
|
|
"loss": 4.9853,
|
|
"mean_token_accuracy": 0.2139111652970314,
|
|
"num_tokens": 32351237.0,
|
|
"step": 14115
|
|
},
|
|
{
|
|
"entropy": 5.221787309646606,
|
|
"epoch": 1.356388088376561,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048225694671327665,
|
|
"loss": 4.9212,
|
|
"mean_token_accuracy": 0.21492843478918075,
|
|
"num_tokens": 32362368.0,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"entropy": 5.100544738769531,
|
|
"epoch": 1.356868395773295,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004822436002313899,
|
|
"loss": 4.9558,
|
|
"mean_token_accuracy": 0.22090719044208526,
|
|
"num_tokens": 32373738.0,
|
|
"step": 14125
|
|
},
|
|
{
|
|
"entropy": 5.202834510803223,
|
|
"epoch": 1.3573487031700289,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004822302489379222,
|
|
"loss": 5.0004,
|
|
"mean_token_accuracy": 0.21080951392650604,
|
|
"num_tokens": 32384274.0,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"entropy": 5.1527222156524655,
|
|
"epoch": 1.3578290105667628,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00048221689283318335,
|
|
"loss": 4.9162,
|
|
"mean_token_accuracy": 0.22189487069845198,
|
|
"num_tokens": 32395692.0,
|
|
"step": 14135
|
|
},
|
|
{
|
|
"entropy": 5.170470714569092,
|
|
"epoch": 1.3583093179634966,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004822035319174837,
|
|
"loss": 4.9993,
|
|
"mean_token_accuracy": 0.215805584192276,
|
|
"num_tokens": 32407367.0,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"entropy": 5.2223457336425785,
|
|
"epoch": 1.3587896253602305,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004821901661911332,
|
|
"loss": 4.9985,
|
|
"mean_token_accuracy": 0.21592912524938584,
|
|
"num_tokens": 32418753.0,
|
|
"step": 14145
|
|
},
|
|
{
|
|
"entropy": 5.224755716323853,
|
|
"epoch": 1.3592699327569644,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004821767956544423,
|
|
"loss": 4.9901,
|
|
"mean_token_accuracy": 0.2103568896651268,
|
|
"num_tokens": 32429499.0,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"entropy": 5.1020674228668215,
|
|
"epoch": 1.3597502401536983,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004821634203077214,
|
|
"loss": 4.9625,
|
|
"mean_token_accuracy": 0.2103301167488098,
|
|
"num_tokens": 32441246.0,
|
|
"step": 14155
|
|
},
|
|
{
|
|
"entropy": 5.2334638118743895,
|
|
"epoch": 1.3602305475504322,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000482150040151281,
|
|
"loss": 5.0333,
|
|
"mean_token_accuracy": 0.2047850400209427,
|
|
"num_tokens": 32452953.0,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"entropy": 5.247525882720947,
|
|
"epoch": 1.3607108549471663,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004821366551854318,
|
|
"loss": 4.9967,
|
|
"mean_token_accuracy": 0.2128307819366455,
|
|
"num_tokens": 32464030.0,
|
|
"step": 14165
|
|
},
|
|
{
|
|
"entropy": 5.110248804092407,
|
|
"epoch": 1.3611911623439001,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004821232654104845,
|
|
"loss": 4.811,
|
|
"mean_token_accuracy": 0.21935641169548034,
|
|
"num_tokens": 32475091.0,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"entropy": 5.127313280105591,
|
|
"epoch": 1.361671469740634,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048210987082675005,
|
|
"loss": 5.0527,
|
|
"mean_token_accuracy": 0.21497475653886794,
|
|
"num_tokens": 32486047.0,
|
|
"step": 14175
|
|
},
|
|
{
|
|
"entropy": 5.163631916046143,
|
|
"epoch": 1.362151777137368,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00048209647143453946,
|
|
"loss": 4.9198,
|
|
"mean_token_accuracy": 0.22030255049467087,
|
|
"num_tokens": 32497141.0,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"entropy": 5.1883574485778805,
|
|
"epoch": 1.3626320845341018,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00048208306723416356,
|
|
"loss": 4.9806,
|
|
"mean_token_accuracy": 0.2113000214099884,
|
|
"num_tokens": 32509282.0,
|
|
"step": 14185
|
|
},
|
|
{
|
|
"entropy": 5.247734832763672,
|
|
"epoch": 1.3631123919308357,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004820696582259339,
|
|
"loss": 5.0618,
|
|
"mean_token_accuracy": 0.20664857178926468,
|
|
"num_tokens": 32521383.0,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"entropy": 5.196439790725708,
|
|
"epoch": 1.3635926993275698,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004820562444101616,
|
|
"loss": 4.9404,
|
|
"mean_token_accuracy": 0.2142431080341339,
|
|
"num_tokens": 32533168.0,
|
|
"step": 14195
|
|
},
|
|
{
|
|
"entropy": 5.119819307327271,
|
|
"epoch": 1.3640730067243036,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004820428257871581,
|
|
"loss": 4.9734,
|
|
"mean_token_accuracy": 0.21469512581825256,
|
|
"num_tokens": 32545333.0,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"entropy": 5.115553903579712,
|
|
"epoch": 1.3645533141210375,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004820294023572351,
|
|
"loss": 4.9275,
|
|
"mean_token_accuracy": 0.2175430715084076,
|
|
"num_tokens": 32556665.0,
|
|
"step": 14205
|
|
},
|
|
{
|
|
"entropy": 5.278141355514526,
|
|
"epoch": 1.3650336215177714,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000482015974120704,
|
|
"loss": 5.1187,
|
|
"mean_token_accuracy": 0.2067723110318184,
|
|
"num_tokens": 32568318.0,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"entropy": 5.243190097808838,
|
|
"epoch": 1.3655139289145053,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048200254107787677,
|
|
"loss": 4.924,
|
|
"mean_token_accuracy": 0.21714796870946884,
|
|
"num_tokens": 32580010.0,
|
|
"step": 14215
|
|
},
|
|
{
|
|
"entropy": 5.200288200378418,
|
|
"epoch": 1.3659942363112392,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00048198910322906516,
|
|
"loss": 5.094,
|
|
"mean_token_accuracy": 0.201282075047493,
|
|
"num_tokens": 32592075.0,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"entropy": 5.279963874816895,
|
|
"epoch": 1.366474543707973,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00048197566057458125,
|
|
"loss": 5.1004,
|
|
"mean_token_accuracy": 0.21089850068092347,
|
|
"num_tokens": 32604548.0,
|
|
"step": 14225
|
|
},
|
|
{
|
|
"entropy": 5.305767488479614,
|
|
"epoch": 1.366954851104707,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000481962213114737,
|
|
"loss": 5.0238,
|
|
"mean_token_accuracy": 0.2131134197115898,
|
|
"num_tokens": 32616036.0,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"entropy": 5.124698162078857,
|
|
"epoch": 1.3674351585014408,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004819487608498448,
|
|
"loss": 4.9755,
|
|
"mean_token_accuracy": 0.21251588463783264,
|
|
"num_tokens": 32628157.0,
|
|
"step": 14235
|
|
},
|
|
{
|
|
"entropy": 5.2571838855743405,
|
|
"epoch": 1.3679154658981747,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00048193530378021687,
|
|
"loss": 5.0518,
|
|
"mean_token_accuracy": 0.20990225523710251,
|
|
"num_tokens": 32640571.0,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"entropy": 5.211359739303589,
|
|
"epoch": 1.3683957732949088,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00048192184190616567,
|
|
"loss": 5.0147,
|
|
"mean_token_accuracy": 0.20850124061107636,
|
|
"num_tokens": 32652005.0,
|
|
"step": 14245
|
|
},
|
|
{
|
|
"entropy": 5.182319211959839,
|
|
"epoch": 1.3688760806916427,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004819083752280037,
|
|
"loss": 4.953,
|
|
"mean_token_accuracy": 0.2171345219016075,
|
|
"num_tokens": 32663655.0,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"entropy": 5.153241872787476,
|
|
"epoch": 1.3693563880883766,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048189490374604373,
|
|
"loss": 5.0271,
|
|
"mean_token_accuracy": 0.20629312843084335,
|
|
"num_tokens": 32675419.0,
|
|
"step": 14255
|
|
},
|
|
{
|
|
"entropy": 5.223625183105469,
|
|
"epoch": 1.3698366954851104,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004818814274605983,
|
|
"loss": 4.9521,
|
|
"mean_token_accuracy": 0.21175645738840104,
|
|
"num_tokens": 32686680.0,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"entropy": 5.112087917327881,
|
|
"epoch": 1.3703170028818443,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004818679463719805,
|
|
"loss": 4.8616,
|
|
"mean_token_accuracy": 0.2263885572552681,
|
|
"num_tokens": 32697321.0,
|
|
"step": 14265
|
|
},
|
|
{
|
|
"entropy": 5.0614667415618895,
|
|
"epoch": 1.3707973102785782,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004818544604805033,
|
|
"loss": 4.9871,
|
|
"mean_token_accuracy": 0.2116144999861717,
|
|
"num_tokens": 32708885.0,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"entropy": 5.212231779098511,
|
|
"epoch": 1.3712776176753123,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004818409697864798,
|
|
"loss": 4.9688,
|
|
"mean_token_accuracy": 0.221114681661129,
|
|
"num_tokens": 32720517.0,
|
|
"step": 14275
|
|
},
|
|
{
|
|
"entropy": 5.181997632980346,
|
|
"epoch": 1.3717579250720462,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00048182747429022303,
|
|
"loss": 4.9662,
|
|
"mean_token_accuracy": 0.2124532178044319,
|
|
"num_tokens": 32731072.0,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"entropy": 5.273564004898072,
|
|
"epoch": 1.37223823246878,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004818139739920465,
|
|
"loss": 5.0366,
|
|
"mean_token_accuracy": 0.21512430757284165,
|
|
"num_tokens": 32742117.0,
|
|
"step": 14285
|
|
},
|
|
{
|
|
"entropy": 5.170454597473144,
|
|
"epoch": 1.372718539865514,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004818004688922637,
|
|
"loss": 5.0208,
|
|
"mean_token_accuracy": 0.2080516129732132,
|
|
"num_tokens": 32753754.0,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"entropy": 5.15729718208313,
|
|
"epoch": 1.3731988472622478,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000481786958991188,
|
|
"loss": 4.9537,
|
|
"mean_token_accuracy": 0.21438979208469391,
|
|
"num_tokens": 32766511.0,
|
|
"step": 14295
|
|
},
|
|
{
|
|
"entropy": 5.300592947006225,
|
|
"epoch": 1.3736791546589817,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048177344428913316,
|
|
"loss": 5.0434,
|
|
"mean_token_accuracy": 0.20874705910682678,
|
|
"num_tokens": 32777715.0,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"entropy": 5.202049112319946,
|
|
"epoch": 1.3741594620557156,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00048175992478641293,
|
|
"loss": 4.9841,
|
|
"mean_token_accuracy": 0.21299902647733687,
|
|
"num_tokens": 32789132.0,
|
|
"step": 14305
|
|
},
|
|
{
|
|
"entropy": 5.293251276016235,
|
|
"epoch": 1.3746397694524495,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004817464004833412,
|
|
"loss": 5.0638,
|
|
"mean_token_accuracy": 0.21248952597379683,
|
|
"num_tokens": 32800439.0,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"entropy": 5.183023118972779,
|
|
"epoch": 1.3751200768491834,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048173287138023204,
|
|
"loss": 4.9448,
|
|
"mean_token_accuracy": 0.2125942125916481,
|
|
"num_tokens": 32813605.0,
|
|
"step": 14315
|
|
},
|
|
{
|
|
"entropy": 5.2913895606994625,
|
|
"epoch": 1.3756003842459175,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004817193374773993,
|
|
"loss": 5.0649,
|
|
"mean_token_accuracy": 0.21377015858888626,
|
|
"num_tokens": 32824225.0,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"entropy": 5.224006319046021,
|
|
"epoch": 1.3760806916426513,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048170579877515753,
|
|
"loss": 5.0986,
|
|
"mean_token_accuracy": 0.1994484543800354,
|
|
"num_tokens": 32836917.0,
|
|
"step": 14325
|
|
},
|
|
{
|
|
"entropy": 5.171962022781372,
|
|
"epoch": 1.3765609990393852,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004816922552738209,
|
|
"loss": 4.9375,
|
|
"mean_token_accuracy": 0.2171504095196724,
|
|
"num_tokens": 32847972.0,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"entropy": 5.122869682312012,
|
|
"epoch": 1.377041306436119,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048167870697370373,
|
|
"loss": 4.9153,
|
|
"mean_token_accuracy": 0.2238215833902359,
|
|
"num_tokens": 32858922.0,
|
|
"step": 14335
|
|
},
|
|
{
|
|
"entropy": 5.170677995681762,
|
|
"epoch": 1.377521613832853,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004816651538751207,
|
|
"loss": 4.8897,
|
|
"mean_token_accuracy": 0.21658048182725906,
|
|
"num_tokens": 32869788.0,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"entropy": 5.328837919235229,
|
|
"epoch": 1.3780019212295869,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048165159597838664,
|
|
"loss": 5.1289,
|
|
"mean_token_accuracy": 0.1964000031352043,
|
|
"num_tokens": 32881678.0,
|
|
"step": 14345
|
|
},
|
|
{
|
|
"entropy": 5.239847898483276,
|
|
"epoch": 1.378482228626321,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000481638033283816,
|
|
"loss": 5.0056,
|
|
"mean_token_accuracy": 0.20893828570842743,
|
|
"num_tokens": 32894183.0,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"entropy": 5.181384801864624,
|
|
"epoch": 1.3789625360230549,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048162446579172387,
|
|
"loss": 4.9588,
|
|
"mean_token_accuracy": 0.21461172699928283,
|
|
"num_tokens": 32906001.0,
|
|
"step": 14355
|
|
},
|
|
{
|
|
"entropy": 5.219556427001953,
|
|
"epoch": 1.3794428434197887,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004816108935024252,
|
|
"loss": 4.9869,
|
|
"mean_token_accuracy": 0.21462354362010955,
|
|
"num_tokens": 32917236.0,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"entropy": 5.205921077728272,
|
|
"epoch": 1.3799231508165226,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048159731641623507,
|
|
"loss": 5.0295,
|
|
"mean_token_accuracy": 0.20555862188339233,
|
|
"num_tokens": 32929710.0,
|
|
"step": 14365
|
|
},
|
|
{
|
|
"entropy": 5.1836954116821286,
|
|
"epoch": 1.3804034582132565,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004815837345334687,
|
|
"loss": 5.0501,
|
|
"mean_token_accuracy": 0.21190683096647261,
|
|
"num_tokens": 32941565.0,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"entropy": 5.2973743915557865,
|
|
"epoch": 1.3808837656099904,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004815701478544415,
|
|
"loss": 5.1076,
|
|
"mean_token_accuracy": 0.20745208263397216,
|
|
"num_tokens": 32952730.0,
|
|
"step": 14375
|
|
},
|
|
{
|
|
"entropy": 5.153661918640137,
|
|
"epoch": 1.3813640730067243,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00048155655637946876,
|
|
"loss": 4.9323,
|
|
"mean_token_accuracy": 0.21619703769683837,
|
|
"num_tokens": 32963589.0,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"entropy": 5.156609296798706,
|
|
"epoch": 1.3818443804034581,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004815429601088662,
|
|
"loss": 4.8997,
|
|
"mean_token_accuracy": 0.21508285403251648,
|
|
"num_tokens": 32975008.0,
|
|
"step": 14385
|
|
},
|
|
{
|
|
"entropy": 5.220380783081055,
|
|
"epoch": 1.382324687800192,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004815293590429494,
|
|
"loss": 5.0953,
|
|
"mean_token_accuracy": 0.20399677157402038,
|
|
"num_tokens": 32986497.0,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"entropy": 5.126517963409424,
|
|
"epoch": 1.382804995196926,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048151575318203417,
|
|
"loss": 4.8398,
|
|
"mean_token_accuracy": 0.2227206841111183,
|
|
"num_tokens": 32998298.0,
|
|
"step": 14395
|
|
},
|
|
{
|
|
"entropy": 5.315741157531738,
|
|
"epoch": 1.38328530259366,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00048150214252643637,
|
|
"loss": 5.0991,
|
|
"mean_token_accuracy": 0.20775451213121415,
|
|
"num_tokens": 33010943.0,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"entropy": 5.2394379615783695,
|
|
"epoch": 1.3837656099903939,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000481488527076472,
|
|
"loss": 5.0057,
|
|
"mean_token_accuracy": 0.2137501820921898,
|
|
"num_tokens": 33022139.0,
|
|
"step": 14405
|
|
},
|
|
{
|
|
"entropy": 5.229398345947265,
|
|
"epoch": 1.3842459173871278,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004814749068324571,
|
|
"loss": 5.0743,
|
|
"mean_token_accuracy": 0.208684541285038,
|
|
"num_tokens": 33034653.0,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"entropy": 5.133923292160034,
|
|
"epoch": 1.3847262247838616,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00048146128179470804,
|
|
"loss": 4.7775,
|
|
"mean_token_accuracy": 0.2346142292022705,
|
|
"num_tokens": 33044339.0,
|
|
"step": 14415
|
|
},
|
|
{
|
|
"entropy": 5.1511882781982425,
|
|
"epoch": 1.3852065321805955,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004814476519635411,
|
|
"loss": 4.9889,
|
|
"mean_token_accuracy": 0.21854183077812195,
|
|
"num_tokens": 33055418.0,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"entropy": 5.2140251159667965,
|
|
"epoch": 1.3856868395773294,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00048143401733927274,
|
|
"loss": 5.0771,
|
|
"mean_token_accuracy": 0.21263548582792283,
|
|
"num_tokens": 33067239.0,
|
|
"step": 14425
|
|
},
|
|
{
|
|
"entropy": 5.169557380676269,
|
|
"epoch": 1.3861671469740635,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00048142037792221943,
|
|
"loss": 4.8765,
|
|
"mean_token_accuracy": 0.22270715832710267,
|
|
"num_tokens": 33079101.0,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"entropy": 5.172262811660767,
|
|
"epoch": 1.3866474543707974,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004814067337126978,
|
|
"loss": 4.9711,
|
|
"mean_token_accuracy": 0.21912187784910203,
|
|
"num_tokens": 33090265.0,
|
|
"step": 14435
|
|
},
|
|
{
|
|
"entropy": 5.1767114162445065,
|
|
"epoch": 1.3871277617675313,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004813930847110248,
|
|
"loss": 4.9278,
|
|
"mean_token_accuracy": 0.22050705552101135,
|
|
"num_tokens": 33101724.0,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"entropy": 5.140494298934937,
|
|
"epoch": 1.3876080691642652,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004813794309175173,
|
|
"loss": 4.9003,
|
|
"mean_token_accuracy": 0.2202922970056534,
|
|
"num_tokens": 33113111.0,
|
|
"step": 14445
|
|
},
|
|
{
|
|
"entropy": 5.224148082733154,
|
|
"epoch": 1.388088376560999,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048136577233249205,
|
|
"loss": 5.0668,
|
|
"mean_token_accuracy": 0.20422582030296327,
|
|
"num_tokens": 33123925.0,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"entropy": 5.21802864074707,
|
|
"epoch": 1.388568683957733,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004813521089562666,
|
|
"loss": 5.0279,
|
|
"mean_token_accuracy": 0.2057347998023033,
|
|
"num_tokens": 33137400.0,
|
|
"step": 14455
|
|
},
|
|
{
|
|
"entropy": 5.1206972122192385,
|
|
"epoch": 1.3890489913544668,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004813384407891577,
|
|
"loss": 4.8598,
|
|
"mean_token_accuracy": 0.225153611600399,
|
|
"num_tokens": 33149326.0,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"entropy": 5.25689435005188,
|
|
"epoch": 1.3895292987512007,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000481324767831483,
|
|
"loss": 5.0631,
|
|
"mean_token_accuracy": 0.20121145844459534,
|
|
"num_tokens": 33159904.0,
|
|
"step": 14465
|
|
},
|
|
{
|
|
"entropy": 5.1242194175720215,
|
|
"epoch": 1.3900096061479346,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004813110900835598,
|
|
"loss": 4.9674,
|
|
"mean_token_accuracy": 0.21700112670660018,
|
|
"num_tokens": 33171701.0,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"entropy": 5.188337993621826,
|
|
"epoch": 1.3904899135446687,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004812974075457058,
|
|
"loss": 4.8593,
|
|
"mean_token_accuracy": 0.21914471834897994,
|
|
"num_tokens": 33181598.0,
|
|
"step": 14475
|
|
},
|
|
{
|
|
"entropy": 5.158151054382325,
|
|
"epoch": 1.3909702209414025,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00048128372021823845,
|
|
"loss": 4.9018,
|
|
"mean_token_accuracy": 0.2141671285033226,
|
|
"num_tokens": 33192674.0,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"entropy": 5.147328567504883,
|
|
"epoch": 1.3914505283381364,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048127002810147574,
|
|
"loss": 4.9428,
|
|
"mean_token_accuracy": 0.21623384952545166,
|
|
"num_tokens": 33203356.0,
|
|
"step": 14485
|
|
},
|
|
{
|
|
"entropy": 5.203651762008667,
|
|
"epoch": 1.3919308357348703,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004812563311957355,
|
|
"loss": 5.0018,
|
|
"mean_token_accuracy": 0.20919703990221022,
|
|
"num_tokens": 33215471.0,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"entropy": 5.1700574398040775,
|
|
"epoch": 1.3924111431316042,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004812426295013356,
|
|
"loss": 4.9644,
|
|
"mean_token_accuracy": 0.2118792712688446,
|
|
"num_tokens": 33227656.0,
|
|
"step": 14495
|
|
},
|
|
{
|
|
"entropy": 5.211377429962158,
|
|
"epoch": 1.392891450528338,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048122892301859433,
|
|
"loss": 5.0935,
|
|
"mean_token_accuracy": 0.2065966710448265,
|
|
"num_tokens": 33239752.0,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"entropy": 5.261070919036865,
|
|
"epoch": 1.3933717579250722,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048121521174782983,
|
|
"loss": 5.024,
|
|
"mean_token_accuracy": 0.20616735219955445,
|
|
"num_tokens": 33251352.0,
|
|
"step": 14505
|
|
},
|
|
{
|
|
"entropy": 5.210687255859375,
|
|
"epoch": 1.393852065321806,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00048120149568936044,
|
|
"loss": 5.0163,
|
|
"mean_token_accuracy": 0.210965596139431,
|
|
"num_tokens": 33262276.0,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"entropy": 5.234826755523682,
|
|
"epoch": 1.39433237271854,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004811877748435046,
|
|
"loss": 5.0574,
|
|
"mean_token_accuracy": 0.20583246052265167,
|
|
"num_tokens": 33273615.0,
|
|
"step": 14515
|
|
},
|
|
{
|
|
"entropy": 5.220763158798218,
|
|
"epoch": 1.3948126801152738,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004811740492105809,
|
|
"loss": 4.9741,
|
|
"mean_token_accuracy": 0.2154085621237755,
|
|
"num_tokens": 33283990.0,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"entropy": 5.14139952659607,
|
|
"epoch": 1.3952929875120077,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000481160318790908,
|
|
"loss": 4.9668,
|
|
"mean_token_accuracy": 0.21832637190818788,
|
|
"num_tokens": 33295472.0,
|
|
"step": 14525
|
|
},
|
|
{
|
|
"entropy": 5.236377191543579,
|
|
"epoch": 1.3957732949087416,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048114658358480467,
|
|
"loss": 5.028,
|
|
"mean_token_accuracy": 0.21142471432685853,
|
|
"num_tokens": 33306742.0,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"entropy": 5.2647254943847654,
|
|
"epoch": 1.3962536023054755,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048113284359258977,
|
|
"loss": 5.0231,
|
|
"mean_token_accuracy": 0.20946406126022338,
|
|
"num_tokens": 33317737.0,
|
|
"step": 14535
|
|
},
|
|
{
|
|
"entropy": 5.16503872871399,
|
|
"epoch": 1.3967339097022093,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00048111909881458234,
|
|
"loss": 5.0284,
|
|
"mean_token_accuracy": 0.20923743396997452,
|
|
"num_tokens": 33329673.0,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"entropy": 5.086363649368286,
|
|
"epoch": 1.3972142170989432,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00048110534925110146,
|
|
"loss": 4.8421,
|
|
"mean_token_accuracy": 0.2240893319249153,
|
|
"num_tokens": 33342047.0,
|
|
"step": 14545
|
|
},
|
|
{
|
|
"entropy": 5.161404705047607,
|
|
"epoch": 1.397694524495677,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004810915949024664,
|
|
"loss": 4.9326,
|
|
"mean_token_accuracy": 0.21960555166006088,
|
|
"num_tokens": 33353287.0,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"entropy": 5.251311635971069,
|
|
"epoch": 1.3981748318924112,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004810778357689965,
|
|
"loss": 5.0174,
|
|
"mean_token_accuracy": 0.21918236762285231,
|
|
"num_tokens": 33365465.0,
|
|
"step": 14555
|
|
},
|
|
{
|
|
"entropy": 5.1449960231781,
|
|
"epoch": 1.398655139289145,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048106407185101116,
|
|
"loss": 4.9636,
|
|
"mean_token_accuracy": 0.21153138428926468,
|
|
"num_tokens": 33376680.0,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"entropy": 5.135947895050049,
|
|
"epoch": 1.399135446685879,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048105030314883,
|
|
"loss": 4.9717,
|
|
"mean_token_accuracy": 0.21088991016149522,
|
|
"num_tokens": 33387995.0,
|
|
"step": 14565
|
|
},
|
|
{
|
|
"entropy": 5.18218960762024,
|
|
"epoch": 1.3996157540826129,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004810365296627725,
|
|
"loss": 4.9477,
|
|
"mean_token_accuracy": 0.21563183516263962,
|
|
"num_tokens": 33400455.0,
|
|
"step": 14570
|
|
},
|
|
{
|
|
"entropy": 5.21840271949768,
|
|
"epoch": 1.4000960614793467,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004810227513931587,
|
|
"loss": 4.9926,
|
|
"mean_token_accuracy": 0.20900965332984925,
|
|
"num_tokens": 33413264.0,
|
|
"step": 14575
|
|
},
|
|
{
|
|
"entropy": 5.157942056655884,
|
|
"epoch": 1.4005763688760806,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004810089683403084,
|
|
"loss": 4.8773,
|
|
"mean_token_accuracy": 0.224508535861969,
|
|
"num_tokens": 33423516.0,
|
|
"step": 14580
|
|
},
|
|
{
|
|
"entropy": 5.199560356140137,
|
|
"epoch": 1.4010566762728147,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004809951805045415,
|
|
"loss": 5.0182,
|
|
"mean_token_accuracy": 0.21045213490724562,
|
|
"num_tokens": 33434952.0,
|
|
"step": 14585
|
|
},
|
|
{
|
|
"entropy": 5.167169284820557,
|
|
"epoch": 1.4015369836695486,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00048098138788617815,
|
|
"loss": 4.9808,
|
|
"mean_token_accuracy": 0.20959090143442155,
|
|
"num_tokens": 33447025.0,
|
|
"step": 14590
|
|
},
|
|
{
|
|
"entropy": 5.233838939666748,
|
|
"epoch": 1.4020172910662825,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004809675904855387,
|
|
"loss": 4.9454,
|
|
"mean_token_accuracy": 0.21511317044496536,
|
|
"num_tokens": 33459990.0,
|
|
"step": 14595
|
|
},
|
|
{
|
|
"entropy": 5.194369840621948,
|
|
"epoch": 1.4024975984630164,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00048095378830294343,
|
|
"loss": 4.92,
|
|
"mean_token_accuracy": 0.21844571679830552,
|
|
"num_tokens": 33471334.0,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"entropy": 5.197934675216675,
|
|
"epoch": 1.4029779058597502,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048093998133871276,
|
|
"loss": 5.0333,
|
|
"mean_token_accuracy": 0.21131069511175155,
|
|
"num_tokens": 33483744.0,
|
|
"step": 14605
|
|
},
|
|
{
|
|
"entropy": 5.198799562454224,
|
|
"epoch": 1.4034582132564841,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004809261695931671,
|
|
"loss": 5.0426,
|
|
"mean_token_accuracy": 0.21084193140268326,
|
|
"num_tokens": 33496532.0,
|
|
"step": 14610
|
|
},
|
|
{
|
|
"entropy": 5.159095096588135,
|
|
"epoch": 1.403938520653218,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004809123530666273,
|
|
"loss": 4.9746,
|
|
"mean_token_accuracy": 0.21460918039083482,
|
|
"num_tokens": 33507545.0,
|
|
"step": 14615
|
|
},
|
|
{
|
|
"entropy": 5.24304347038269,
|
|
"epoch": 1.4044188280499519,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004808985317594142,
|
|
"loss": 4.9747,
|
|
"mean_token_accuracy": 0.21796323955059052,
|
|
"num_tokens": 33519232.0,
|
|
"step": 14620
|
|
},
|
|
{
|
|
"entropy": 5.157637119293213,
|
|
"epoch": 1.4048991354466858,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00048088470567184854,
|
|
"loss": 4.9454,
|
|
"mean_token_accuracy": 0.2096275046467781,
|
|
"num_tokens": 33531088.0,
|
|
"step": 14625
|
|
},
|
|
{
|
|
"entropy": 5.225188112258911,
|
|
"epoch": 1.4053794428434199,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00048087087480425133,
|
|
"loss": 5.0125,
|
|
"mean_token_accuracy": 0.2120126485824585,
|
|
"num_tokens": 33543991.0,
|
|
"step": 14630
|
|
},
|
|
{
|
|
"entropy": 5.237149286270141,
|
|
"epoch": 1.4058597502401537,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004808570391569437,
|
|
"loss": 5.0633,
|
|
"mean_token_accuracy": 0.20491664558649064,
|
|
"num_tokens": 33555303.0,
|
|
"step": 14635
|
|
},
|
|
{
|
|
"entropy": 5.264308309555053,
|
|
"epoch": 1.4063400576368876,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048084319873024694,
|
|
"loss": 5.0398,
|
|
"mean_token_accuracy": 0.2091526836156845,
|
|
"num_tokens": 33565587.0,
|
|
"step": 14640
|
|
},
|
|
{
|
|
"entropy": 5.181278657913208,
|
|
"epoch": 1.4068203650336215,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004808293535244823,
|
|
"loss": 4.9055,
|
|
"mean_token_accuracy": 0.21529979556798934,
|
|
"num_tokens": 33576874.0,
|
|
"step": 14645
|
|
},
|
|
{
|
|
"entropy": 5.14690670967102,
|
|
"epoch": 1.4073006724303554,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004808155035399712,
|
|
"loss": 4.9121,
|
|
"mean_token_accuracy": 0.22220734357833863,
|
|
"num_tokens": 33587703.0,
|
|
"step": 14650
|
|
},
|
|
{
|
|
"entropy": 5.13805742263794,
|
|
"epoch": 1.4077809798270893,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004808016487770354,
|
|
"loss": 4.8853,
|
|
"mean_token_accuracy": 0.22091327458620072,
|
|
"num_tokens": 33598487.0,
|
|
"step": 14655
|
|
},
|
|
{
|
|
"entropy": 5.156050348281861,
|
|
"epoch": 1.4082612872238234,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00048078778923599637,
|
|
"loss": 5.0117,
|
|
"mean_token_accuracy": 0.20967613756656647,
|
|
"num_tokens": 33610838.0,
|
|
"step": 14660
|
|
},
|
|
{
|
|
"entropy": 5.176731777191162,
|
|
"epoch": 1.4087415946205573,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00048077392491717593,
|
|
"loss": 4.958,
|
|
"mean_token_accuracy": 0.21370896100997924,
|
|
"num_tokens": 33622726.0,
|
|
"step": 14665
|
|
},
|
|
{
|
|
"entropy": 5.12830867767334,
|
|
"epoch": 1.4092219020172911,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048076005582089597,
|
|
"loss": 4.935,
|
|
"mean_token_accuracy": 0.21416952610015869,
|
|
"num_tokens": 33635922.0,
|
|
"step": 14670
|
|
},
|
|
{
|
|
"entropy": 5.317528486251831,
|
|
"epoch": 1.409702209414025,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00048074618194747845,
|
|
"loss": 5.1027,
|
|
"mean_token_accuracy": 0.20418261289596557,
|
|
"num_tokens": 33648486.0,
|
|
"step": 14675
|
|
},
|
|
{
|
|
"entropy": 5.2534605979919435,
|
|
"epoch": 1.410182516810759,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004807323032972456,
|
|
"loss": 4.9975,
|
|
"mean_token_accuracy": 0.21309973299503326,
|
|
"num_tokens": 33659812.0,
|
|
"step": 14680
|
|
},
|
|
{
|
|
"entropy": 5.222612524032593,
|
|
"epoch": 1.4106628242074928,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004807184198705196,
|
|
"loss": 4.9752,
|
|
"mean_token_accuracy": 0.21646286994218827,
|
|
"num_tokens": 33671878.0,
|
|
"step": 14685
|
|
},
|
|
{
|
|
"entropy": 5.133413934707642,
|
|
"epoch": 1.4111431316042267,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004807045316676226,
|
|
"loss": 4.8809,
|
|
"mean_token_accuracy": 0.21957986503839494,
|
|
"num_tokens": 33683759.0,
|
|
"step": 14690
|
|
},
|
|
{
|
|
"entropy": 5.253868293762207,
|
|
"epoch": 1.4116234390009605,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004806906386888773,
|
|
"loss": 5.0728,
|
|
"mean_token_accuracy": 0.2058554098010063,
|
|
"num_tokens": 33694085.0,
|
|
"step": 14695
|
|
},
|
|
{
|
|
"entropy": 5.213767671585083,
|
|
"epoch": 1.4121037463976944,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048067674093460607,
|
|
"loss": 4.9295,
|
|
"mean_token_accuracy": 0.21423121094703673,
|
|
"num_tokens": 33705810.0,
|
|
"step": 14700
|
|
},
|
|
{
|
|
"entropy": 5.210308361053467,
|
|
"epoch": 1.4125840537944283,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048066283840513175,
|
|
"loss": 5.0187,
|
|
"mean_token_accuracy": 0.21089961528778076,
|
|
"num_tokens": 33716798.0,
|
|
"step": 14705
|
|
},
|
|
{
|
|
"entropy": 5.149886178970337,
|
|
"epoch": 1.4130643611911624,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004806489311007769,
|
|
"loss": 4.9737,
|
|
"mean_token_accuracy": 0.22218613475561141,
|
|
"num_tokens": 33728515.0,
|
|
"step": 14710
|
|
},
|
|
{
|
|
"entropy": 5.183704948425293,
|
|
"epoch": 1.4135446685878963,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00048063501902186463,
|
|
"loss": 4.9941,
|
|
"mean_token_accuracy": 0.21435530483722687,
|
|
"num_tokens": 33740684.0,
|
|
"step": 14715
|
|
},
|
|
{
|
|
"entropy": 5.210676050186157,
|
|
"epoch": 1.4140249759846302,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048062110216871775,
|
|
"loss": 4.9772,
|
|
"mean_token_accuracy": 0.2168477714061737,
|
|
"num_tokens": 33753017.0,
|
|
"step": 14720
|
|
},
|
|
{
|
|
"entropy": 5.175452709197998,
|
|
"epoch": 1.414505283381364,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048060718054165945,
|
|
"loss": 4.9873,
|
|
"mean_token_accuracy": 0.21028392165899276,
|
|
"num_tokens": 33764393.0,
|
|
"step": 14725
|
|
},
|
|
{
|
|
"entropy": 5.235194349288941,
|
|
"epoch": 1.414985590778098,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000480593254141013,
|
|
"loss": 5.0995,
|
|
"mean_token_accuracy": 0.2018577605485916,
|
|
"num_tokens": 33774941.0,
|
|
"step": 14730
|
|
},
|
|
{
|
|
"entropy": 5.101527976989746,
|
|
"epoch": 1.4154658981748318,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048057932296710165,
|
|
"loss": 4.8514,
|
|
"mean_token_accuracy": 0.22604466378688812,
|
|
"num_tokens": 33786534.0,
|
|
"step": 14735
|
|
},
|
|
{
|
|
"entropy": 5.284107494354248,
|
|
"epoch": 1.415946205571566,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004805653870202489,
|
|
"loss": 5.1043,
|
|
"mean_token_accuracy": 0.2047765925526619,
|
|
"num_tokens": 33798339.0,
|
|
"step": 14740
|
|
},
|
|
{
|
|
"entropy": 5.283356618881226,
|
|
"epoch": 1.4164265129682998,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00048055144630077825,
|
|
"loss": 5.1154,
|
|
"mean_token_accuracy": 0.2043526902794838,
|
|
"num_tokens": 33810368.0,
|
|
"step": 14745
|
|
},
|
|
{
|
|
"entropy": 5.194024896621704,
|
|
"epoch": 1.4169068203650337,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00048053750080901336,
|
|
"loss": 4.9659,
|
|
"mean_token_accuracy": 0.2081344470381737,
|
|
"num_tokens": 33821111.0,
|
|
"step": 14750
|
|
},
|
|
{
|
|
"entropy": 5.111878871917725,
|
|
"epoch": 1.4173871277617676,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00048052355054527794,
|
|
"loss": 4.9638,
|
|
"mean_token_accuracy": 0.21788180470466614,
|
|
"num_tokens": 33833629.0,
|
|
"step": 14755
|
|
},
|
|
{
|
|
"entropy": 5.268445110321045,
|
|
"epoch": 1.4178674351585014,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048050959550989606,
|
|
"loss": 5.063,
|
|
"mean_token_accuracy": 0.20761503428220748,
|
|
"num_tokens": 33846531.0,
|
|
"step": 14760
|
|
},
|
|
{
|
|
"entropy": 5.145606899261475,
|
|
"epoch": 1.4183477425552353,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004804956357031916,
|
|
"loss": 4.9377,
|
|
"mean_token_accuracy": 0.21531563848257065,
|
|
"num_tokens": 33857251.0,
|
|
"step": 14765
|
|
},
|
|
{
|
|
"entropy": 5.137730884552002,
|
|
"epoch": 1.4188280499519692,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00048048167112548873,
|
|
"loss": 4.9639,
|
|
"mean_token_accuracy": 0.21248998492956161,
|
|
"num_tokens": 33869314.0,
|
|
"step": 14770
|
|
},
|
|
{
|
|
"entropy": 5.285694265365601,
|
|
"epoch": 1.419308357348703,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048046770177711157,
|
|
"loss": 5.0916,
|
|
"mean_token_accuracy": 0.2074048936367035,
|
|
"num_tokens": 33880203.0,
|
|
"step": 14775
|
|
},
|
|
{
|
|
"entropy": 5.2045482158660885,
|
|
"epoch": 1.419788664745437,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004804537276583844,
|
|
"loss": 4.9006,
|
|
"mean_token_accuracy": 0.2209893763065338,
|
|
"num_tokens": 33891379.0,
|
|
"step": 14780
|
|
},
|
|
{
|
|
"entropy": 5.149256420135498,
|
|
"epoch": 1.420268972142171,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004804397487696319,
|
|
"loss": 4.9435,
|
|
"mean_token_accuracy": 0.21625811159610747,
|
|
"num_tokens": 33902788.0,
|
|
"step": 14785
|
|
},
|
|
{
|
|
"entropy": 5.160754537582397,
|
|
"epoch": 1.420749279538905,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004804257651111783,
|
|
"loss": 4.9719,
|
|
"mean_token_accuracy": 0.21887465864419936,
|
|
"num_tokens": 33913609.0,
|
|
"step": 14790
|
|
},
|
|
{
|
|
"entropy": 5.174741888046265,
|
|
"epoch": 1.4212295869356388,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00048041177668334853,
|
|
"loss": 4.8739,
|
|
"mean_token_accuracy": 0.21784851402044297,
|
|
"num_tokens": 33924379.0,
|
|
"step": 14795
|
|
},
|
|
{
|
|
"entropy": 5.137226343154907,
|
|
"epoch": 1.4217098943323727,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004803977834864672,
|
|
"loss": 5.009,
|
|
"mean_token_accuracy": 0.20945742577314377,
|
|
"num_tokens": 33936209.0,
|
|
"step": 14800
|
|
},
|
|
{
|
|
"entropy": 5.228136348724365,
|
|
"epoch": 1.4221902017291066,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00048038378552085927,
|
|
"loss": 4.9569,
|
|
"mean_token_accuracy": 0.2131284847855568,
|
|
"num_tokens": 33947679.0,
|
|
"step": 14805
|
|
},
|
|
{
|
|
"entropy": 5.164991664886474,
|
|
"epoch": 1.4226705091258405,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00048036978278684974,
|
|
"loss": 4.9628,
|
|
"mean_token_accuracy": 0.2154536247253418,
|
|
"num_tokens": 33959474.0,
|
|
"step": 14810
|
|
},
|
|
{
|
|
"entropy": 5.293703842163086,
|
|
"epoch": 1.4231508165225746,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004803557752847636,
|
|
"loss": 5.1033,
|
|
"mean_token_accuracy": 0.20608988404273987,
|
|
"num_tokens": 33970831.0,
|
|
"step": 14815
|
|
},
|
|
{
|
|
"entropy": 5.348191404342652,
|
|
"epoch": 1.4236311239193085,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00048034176301492616,
|
|
"loss": 5.0618,
|
|
"mean_token_accuracy": 0.2066340461373329,
|
|
"num_tokens": 33981765.0,
|
|
"step": 14820
|
|
},
|
|
{
|
|
"entropy": 5.168016386032105,
|
|
"epoch": 1.4241114313160423,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004803277459776628,
|
|
"loss": 4.9541,
|
|
"mean_token_accuracy": 0.21502473205327988,
|
|
"num_tokens": 33992435.0,
|
|
"step": 14825
|
|
},
|
|
{
|
|
"entropy": 5.09135160446167,
|
|
"epoch": 1.4245917387127762,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00048031372417329875,
|
|
"loss": 4.9171,
|
|
"mean_token_accuracy": 0.22110578566789627,
|
|
"num_tokens": 34004570.0,
|
|
"step": 14830
|
|
},
|
|
{
|
|
"entropy": 5.189966630935669,
|
|
"epoch": 1.42507204610951,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004802996976021598,
|
|
"loss": 4.9145,
|
|
"mean_token_accuracy": 0.22021586894989015,
|
|
"num_tokens": 34015494.0,
|
|
"step": 14835
|
|
},
|
|
{
|
|
"entropy": 5.226748323440551,
|
|
"epoch": 1.425552353506244,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00048028566626457145,
|
|
"loss": 4.9932,
|
|
"mean_token_accuracy": 0.20883565545082092,
|
|
"num_tokens": 34026684.0,
|
|
"step": 14840
|
|
},
|
|
{
|
|
"entropy": 5.173442220687866,
|
|
"epoch": 1.4260326609029779,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00048027163016085947,
|
|
"loss": 4.9726,
|
|
"mean_token_accuracy": 0.21021606177091598,
|
|
"num_tokens": 34038948.0,
|
|
"step": 14845
|
|
},
|
|
{
|
|
"entropy": 5.259505367279052,
|
|
"epoch": 1.4265129682997117,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00048025758929134976,
|
|
"loss": 5.1272,
|
|
"mean_token_accuracy": 0.20375512093305587,
|
|
"num_tokens": 34052216.0,
|
|
"step": 14850
|
|
},
|
|
{
|
|
"entropy": 5.240663814544678,
|
|
"epoch": 1.4269932756964456,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004802435436563684,
|
|
"loss": 4.9785,
|
|
"mean_token_accuracy": 0.22330356240272523,
|
|
"num_tokens": 34062602.0,
|
|
"step": 14855
|
|
},
|
|
{
|
|
"entropy": 5.1824178218841555,
|
|
"epoch": 1.4274735830931795,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00048022949325624134,
|
|
"loss": 4.9436,
|
|
"mean_token_accuracy": 0.21629261821508408,
|
|
"num_tokens": 34075049.0,
|
|
"step": 14860
|
|
},
|
|
{
|
|
"entropy": 5.177359342575073,
|
|
"epoch": 1.4279538904899136,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00048021543809129483,
|
|
"loss": 5.0492,
|
|
"mean_token_accuracy": 0.20940061509609223,
|
|
"num_tokens": 34086719.0,
|
|
"step": 14865
|
|
},
|
|
{
|
|
"entropy": 5.237700700759888,
|
|
"epoch": 1.4284341978866475,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004802013781618552,
|
|
"loss": 4.9707,
|
|
"mean_token_accuracy": 0.21425776779651642,
|
|
"num_tokens": 34098439.0,
|
|
"step": 14870
|
|
},
|
|
{
|
|
"entropy": 5.309500598907471,
|
|
"epoch": 1.4289145052833814,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00048018731346824895,
|
|
"loss": 4.9895,
|
|
"mean_token_accuracy": 0.21168100982904434,
|
|
"num_tokens": 34110711.0,
|
|
"step": 14875
|
|
},
|
|
{
|
|
"entropy": 5.141423988342285,
|
|
"epoch": 1.4293948126801153,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004801732440108026,
|
|
"loss": 4.9326,
|
|
"mean_token_accuracy": 0.21338745206594467,
|
|
"num_tokens": 34122191.0,
|
|
"step": 14880
|
|
},
|
|
{
|
|
"entropy": 5.094103765487671,
|
|
"epoch": 1.4298751200768491,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.0004801591697898427,
|
|
"loss": 4.8899,
|
|
"mean_token_accuracy": 0.21937906593084336,
|
|
"num_tokens": 34132838.0,
|
|
"step": 14885
|
|
},
|
|
{
|
|
"entropy": 5.210858488082886,
|
|
"epoch": 1.430355427473583,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004801450908056961,
|
|
"loss": 5.0114,
|
|
"mean_token_accuracy": 0.20959677100181578,
|
|
"num_tokens": 34143394.0,
|
|
"step": 14890
|
|
},
|
|
{
|
|
"entropy": 5.3004334449768065,
|
|
"epoch": 1.4308357348703171,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004801310070586896,
|
|
"loss": 5.0731,
|
|
"mean_token_accuracy": 0.20850346684455873,
|
|
"num_tokens": 34155934.0,
|
|
"step": 14895
|
|
},
|
|
{
|
|
"entropy": 5.077618026733399,
|
|
"epoch": 1.431316042267051,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004801169185491503,
|
|
"loss": 4.8665,
|
|
"mean_token_accuracy": 0.22559798061847686,
|
|
"num_tokens": 34167949.0,
|
|
"step": 14900
|
|
},
|
|
{
|
|
"entropy": 5.2307600498199465,
|
|
"epoch": 1.4317963496637849,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048010282527740516,
|
|
"loss": 5.1348,
|
|
"mean_token_accuracy": 0.20259464681148528,
|
|
"num_tokens": 34179733.0,
|
|
"step": 14905
|
|
},
|
|
{
|
|
"entropy": 5.207586050033569,
|
|
"epoch": 1.4322766570605188,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048008872724378146,
|
|
"loss": 4.9037,
|
|
"mean_token_accuracy": 0.2150167018175125,
|
|
"num_tokens": 34190513.0,
|
|
"step": 14910
|
|
},
|
|
{
|
|
"entropy": 5.152509164810181,
|
|
"epoch": 1.4327569644572526,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004800746244486065,
|
|
"loss": 4.9531,
|
|
"mean_token_accuracy": 0.21709322184324265,
|
|
"num_tokens": 34201562.0,
|
|
"step": 14915
|
|
},
|
|
{
|
|
"entropy": 5.062794637680054,
|
|
"epoch": 1.4332372718539865,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004800605168922077,
|
|
"loss": 4.9106,
|
|
"mean_token_accuracy": 0.21840206682682037,
|
|
"num_tokens": 34212637.0,
|
|
"step": 14920
|
|
},
|
|
{
|
|
"entropy": 5.150622749328614,
|
|
"epoch": 1.4337175792507204,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048004640457491267,
|
|
"loss": 4.9488,
|
|
"mean_token_accuracy": 0.21455983370542525,
|
|
"num_tokens": 34225394.0,
|
|
"step": 14925
|
|
},
|
|
{
|
|
"entropy": 5.278602600097656,
|
|
"epoch": 1.4341978866474543,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000480032287497049,
|
|
"loss": 4.9471,
|
|
"mean_token_accuracy": 0.2104356735944748,
|
|
"num_tokens": 34236977.0,
|
|
"step": 14930
|
|
},
|
|
{
|
|
"entropy": 5.212551403045654,
|
|
"epoch": 1.4346781940441882,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00048001816565894427,
|
|
"loss": 5.048,
|
|
"mean_token_accuracy": 0.21722146570682527,
|
|
"num_tokens": 34247486.0,
|
|
"step": 14935
|
|
},
|
|
{
|
|
"entropy": 5.173838663101196,
|
|
"epoch": 1.435158501440922,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004800040390609267,
|
|
"loss": 4.9366,
|
|
"mean_token_accuracy": 0.22200540751218795,
|
|
"num_tokens": 34259404.0,
|
|
"step": 14940
|
|
},
|
|
{
|
|
"entropy": 5.181051015853882,
|
|
"epoch": 1.4356388088376562,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047998990770332396,
|
|
"loss": 4.933,
|
|
"mean_token_accuracy": 0.22339427024126052,
|
|
"num_tokens": 34270388.0,
|
|
"step": 14945
|
|
},
|
|
{
|
|
"entropy": 5.264690160751343,
|
|
"epoch": 1.43611911623439,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004799757715864643,
|
|
"loss": 4.9981,
|
|
"mean_token_accuracy": 0.21528103947639465,
|
|
"num_tokens": 34281321.0,
|
|
"step": 14950
|
|
},
|
|
{
|
|
"entropy": 5.287259483337403,
|
|
"epoch": 1.436599423631124,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004799616307106759,
|
|
"loss": 5.0543,
|
|
"mean_token_accuracy": 0.20392760783433914,
|
|
"num_tokens": 34293177.0,
|
|
"step": 14955
|
|
},
|
|
{
|
|
"entropy": 5.23415994644165,
|
|
"epoch": 1.4370797310278578,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000479947485076287,
|
|
"loss": 5.0558,
|
|
"mean_token_accuracy": 0.21059397161006926,
|
|
"num_tokens": 34305175.0,
|
|
"step": 14960
|
|
},
|
|
{
|
|
"entropy": 5.194935846328735,
|
|
"epoch": 1.4375600384245917,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00047993333468362607,
|
|
"loss": 5.0247,
|
|
"mean_token_accuracy": 0.20465970337390899,
|
|
"num_tokens": 34317969.0,
|
|
"step": 14965
|
|
},
|
|
{
|
|
"entropy": 5.307715892791748,
|
|
"epoch": 1.4380403458213258,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047991917953302173,
|
|
"loss": 5.0479,
|
|
"mean_token_accuracy": 0.20630020052194595,
|
|
"num_tokens": 34329913.0,
|
|
"step": 14970
|
|
},
|
|
{
|
|
"entropy": 5.25423846244812,
|
|
"epoch": 1.4385206532180597,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047990501962480236,
|
|
"loss": 4.9951,
|
|
"mean_token_accuracy": 0.2149421378970146,
|
|
"num_tokens": 34341656.0,
|
|
"step": 14975
|
|
},
|
|
{
|
|
"entropy": 5.171209383010864,
|
|
"epoch": 1.4390009606147935,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000479890854959297,
|
|
"loss": 4.9685,
|
|
"mean_token_accuracy": 0.21098122894763946,
|
|
"num_tokens": 34351767.0,
|
|
"step": 14980
|
|
},
|
|
{
|
|
"entropy": 5.176991987228393,
|
|
"epoch": 1.4394812680115274,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004798766855368344,
|
|
"loss": 4.9592,
|
|
"mean_token_accuracy": 0.2132784456014633,
|
|
"num_tokens": 34363437.0,
|
|
"step": 14985
|
|
},
|
|
{
|
|
"entropy": 5.265459060668945,
|
|
"epoch": 1.4399615754082613,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047986251135774343,
|
|
"loss": 5.0465,
|
|
"mean_token_accuracy": 0.2133356049656868,
|
|
"num_tokens": 34374991.0,
|
|
"step": 14990
|
|
},
|
|
{
|
|
"entropy": 5.162412214279175,
|
|
"epoch": 1.4404418828049952,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004798483324223533,
|
|
"loss": 4.9692,
|
|
"mean_token_accuracy": 0.21427072286605836,
|
|
"num_tokens": 34386218.0,
|
|
"step": 14995
|
|
},
|
|
{
|
|
"entropy": 5.246811056137085,
|
|
"epoch": 1.440922190201729,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004798341487309932,
|
|
"loss": 5.0105,
|
|
"mean_token_accuracy": 0.2116893395781517,
|
|
"num_tokens": 34396287.0,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 1.440922190201729,
|
|
"eval_entropy": 5.037319316682671,
|
|
"eval_loss": 5.06929874420166,
|
|
"eval_mean_token_accuracy": 0.21831489476792584,
|
|
"eval_num_tokens": 34396287.0,
|
|
"eval_runtime": 26.5223,
|
|
"eval_samples_per_second": 1237.261,
|
|
"eval_steps_per_second": 154.662,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"entropy": 5.250750732421875,
|
|
"epoch": 1.441402497598463,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047981996028399233,
|
|
"loss": 5.0093,
|
|
"mean_token_accuracy": 0.20998309999704362,
|
|
"num_tokens": 34407251.0,
|
|
"step": 15005
|
|
},
|
|
{
|
|
"entropy": 5.326452255249023,
|
|
"epoch": 1.4418828049951968,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004798057670816802,
|
|
"loss": 5.1435,
|
|
"mean_token_accuracy": 0.20517653226852417,
|
|
"num_tokens": 34419185.0,
|
|
"step": 15010
|
|
},
|
|
{
|
|
"entropy": 5.208475351333618,
|
|
"epoch": 1.4423631123919307,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004797915691243863,
|
|
"loss": 4.9709,
|
|
"mean_token_accuracy": 0.21588644683361052,
|
|
"num_tokens": 34431159.0,
|
|
"step": 15015
|
|
},
|
|
{
|
|
"entropy": 5.154812479019165,
|
|
"epoch": 1.4428434197886648,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004797773664124403,
|
|
"loss": 4.9572,
|
|
"mean_token_accuracy": 0.21588555574417115,
|
|
"num_tokens": 34442614.0,
|
|
"step": 15020
|
|
},
|
|
{
|
|
"entropy": 5.13772292137146,
|
|
"epoch": 1.4433237271853987,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047976315894617195,
|
|
"loss": 4.9335,
|
|
"mean_token_accuracy": 0.21881027668714523,
|
|
"num_tokens": 34453984.0,
|
|
"step": 15025
|
|
},
|
|
{
|
|
"entropy": 5.1489208221435545,
|
|
"epoch": 1.4438040345821326,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000479748946725911,
|
|
"loss": 4.9391,
|
|
"mean_token_accuracy": 0.21688321828842164,
|
|
"num_tokens": 34466296.0,
|
|
"step": 15030
|
|
},
|
|
{
|
|
"entropy": 5.300703620910644,
|
|
"epoch": 1.4442843419788665,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004797347297519875,
|
|
"loss": 5.1447,
|
|
"mean_token_accuracy": 0.20751263648271562,
|
|
"num_tokens": 34478088.0,
|
|
"step": 15035
|
|
},
|
|
{
|
|
"entropy": 5.195109748840332,
|
|
"epoch": 1.4447646493756003,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047972050802473154,
|
|
"loss": 5.047,
|
|
"mean_token_accuracy": 0.20686309933662414,
|
|
"num_tokens": 34491664.0,
|
|
"step": 15040
|
|
},
|
|
{
|
|
"entropy": 5.13297290802002,
|
|
"epoch": 1.4452449567723342,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004797062815444733,
|
|
"loss": 4.9291,
|
|
"mean_token_accuracy": 0.21764905750751495,
|
|
"num_tokens": 34502977.0,
|
|
"step": 15045
|
|
},
|
|
{
|
|
"entropy": 5.207884359359741,
|
|
"epoch": 1.4457252641690683,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000479692050311543,
|
|
"loss": 4.8953,
|
|
"mean_token_accuracy": 0.2181214064359665,
|
|
"num_tokens": 34515067.0,
|
|
"step": 15050
|
|
},
|
|
{
|
|
"entropy": 5.223143815994263,
|
|
"epoch": 1.4462055715658022,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004796778143262711,
|
|
"loss": 5.0498,
|
|
"mean_token_accuracy": 0.2105468362569809,
|
|
"num_tokens": 34525012.0,
|
|
"step": 15055
|
|
},
|
|
{
|
|
"entropy": 5.092001056671142,
|
|
"epoch": 1.446685878962536,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004796635735889882,
|
|
"loss": 4.8538,
|
|
"mean_token_accuracy": 0.22900600135326385,
|
|
"num_tokens": 34535789.0,
|
|
"step": 15060
|
|
},
|
|
{
|
|
"entropy": 5.198391342163086,
|
|
"epoch": 1.44716618635927,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047964932810002476,
|
|
"loss": 4.9676,
|
|
"mean_token_accuracy": 0.21989088952541352,
|
|
"num_tokens": 34546276.0,
|
|
"step": 15065
|
|
},
|
|
{
|
|
"entropy": 5.198813486099243,
|
|
"epoch": 1.4476464937560038,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004796350778597117,
|
|
"loss": 4.9705,
|
|
"mean_token_accuracy": 0.21042503118515016,
|
|
"num_tokens": 34558361.0,
|
|
"step": 15070
|
|
},
|
|
{
|
|
"entropy": 5.154997491836548,
|
|
"epoch": 1.4481268011527377,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004796208228683796,
|
|
"loss": 4.9392,
|
|
"mean_token_accuracy": 0.21781230419874192,
|
|
"num_tokens": 34569482.0,
|
|
"step": 15075
|
|
},
|
|
{
|
|
"entropy": 5.262822818756104,
|
|
"epoch": 1.4486071085494716,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047960656312635977,
|
|
"loss": 5.0336,
|
|
"mean_token_accuracy": 0.2166367918252945,
|
|
"num_tokens": 34580128.0,
|
|
"step": 15080
|
|
},
|
|
{
|
|
"entropy": 5.16861662864685,
|
|
"epoch": 1.4490874159462055,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004795922986339831,
|
|
"loss": 4.9457,
|
|
"mean_token_accuracy": 0.21179744154214858,
|
|
"num_tokens": 34591105.0,
|
|
"step": 15085
|
|
},
|
|
{
|
|
"entropy": 5.203324699401856,
|
|
"epoch": 1.4495677233429394,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00047957802939158057,
|
|
"loss": 4.9878,
|
|
"mean_token_accuracy": 0.21329084187746047,
|
|
"num_tokens": 34602618.0,
|
|
"step": 15090
|
|
},
|
|
{
|
|
"entropy": 5.227561187744141,
|
|
"epoch": 1.4500480307396733,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004795637553994838,
|
|
"loss": 5.0729,
|
|
"mean_token_accuracy": 0.2020048052072525,
|
|
"num_tokens": 34614179.0,
|
|
"step": 15095
|
|
},
|
|
{
|
|
"entropy": 5.321483945846557,
|
|
"epoch": 1.4505283381364074,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00047954947665802404,
|
|
"loss": 5.0928,
|
|
"mean_token_accuracy": 0.2034539520740509,
|
|
"num_tokens": 34625456.0,
|
|
"step": 15100
|
|
},
|
|
{
|
|
"entropy": 5.186492490768432,
|
|
"epoch": 1.4510086455331412,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004795351931675329,
|
|
"loss": 4.9536,
|
|
"mean_token_accuracy": 0.22268653959035872,
|
|
"num_tokens": 34636268.0,
|
|
"step": 15105
|
|
},
|
|
{
|
|
"entropy": 5.138030385971069,
|
|
"epoch": 1.4514889529298751,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004795209049283419,
|
|
"loss": 4.9032,
|
|
"mean_token_accuracy": 0.22105590552091597,
|
|
"num_tokens": 34647665.0,
|
|
"step": 15110
|
|
},
|
|
{
|
|
"entropy": 5.2529072761535645,
|
|
"epoch": 1.451969260326609,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004795066119407827,
|
|
"loss": 5.0009,
|
|
"mean_token_accuracy": 0.21218062788248063,
|
|
"num_tokens": 34659965.0,
|
|
"step": 15115
|
|
},
|
|
{
|
|
"entropy": 5.214857578277588,
|
|
"epoch": 1.4524495677233429,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004794923142051873,
|
|
"loss": 5.052,
|
|
"mean_token_accuracy": 0.21157704889774323,
|
|
"num_tokens": 34671724.0,
|
|
"step": 15120
|
|
},
|
|
{
|
|
"entropy": 5.126708841323852,
|
|
"epoch": 1.452929875120077,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047947801172188755,
|
|
"loss": 4.9103,
|
|
"mean_token_accuracy": 0.22448884695768356,
|
|
"num_tokens": 34682445.0,
|
|
"step": 15125
|
|
},
|
|
{
|
|
"entropy": 5.107527399063111,
|
|
"epoch": 1.4534101825168109,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004794637044912155,
|
|
"loss": 4.8546,
|
|
"mean_token_accuracy": 0.22458722293376923,
|
|
"num_tokens": 34693621.0,
|
|
"step": 15130
|
|
},
|
|
{
|
|
"entropy": 5.1660699367523195,
|
|
"epoch": 1.4538904899135447,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004794493925135034,
|
|
"loss": 4.8351,
|
|
"mean_token_accuracy": 0.227722430229187,
|
|
"num_tokens": 34703978.0,
|
|
"step": 15135
|
|
},
|
|
{
|
|
"entropy": 5.199901390075683,
|
|
"epoch": 1.4543707973102786,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00047943507578908357,
|
|
"loss": 5.0363,
|
|
"mean_token_accuracy": 0.21987725645303727,
|
|
"num_tokens": 34715468.0,
|
|
"step": 15140
|
|
},
|
|
{
|
|
"entropy": 5.237486171722412,
|
|
"epoch": 1.4548511047070125,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004794207543182883,
|
|
"loss": 4.9965,
|
|
"mean_token_accuracy": 0.21251980364322662,
|
|
"num_tokens": 34726383.0,
|
|
"step": 15145
|
|
},
|
|
{
|
|
"entropy": 5.179723453521729,
|
|
"epoch": 1.4553314121037464,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047940642810145005,
|
|
"loss": 5.0156,
|
|
"mean_token_accuracy": 0.21433500498533248,
|
|
"num_tokens": 34737123.0,
|
|
"step": 15150
|
|
},
|
|
{
|
|
"entropy": 5.0879114151000975,
|
|
"epoch": 1.4558117195004803,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00047939209713890156,
|
|
"loss": 4.9252,
|
|
"mean_token_accuracy": 0.22000515311956406,
|
|
"num_tokens": 34749197.0,
|
|
"step": 15155
|
|
},
|
|
{
|
|
"entropy": 5.225097751617431,
|
|
"epoch": 1.4562920268972142,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00047937776143097547,
|
|
"loss": 4.9765,
|
|
"mean_token_accuracy": 0.2143160358071327,
|
|
"num_tokens": 34759785.0,
|
|
"step": 15160
|
|
},
|
|
{
|
|
"entropy": 5.112883234024048,
|
|
"epoch": 1.456772334293948,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004793634209780047,
|
|
"loss": 4.8375,
|
|
"mean_token_accuracy": 0.22548486590385436,
|
|
"num_tokens": 34770938.0,
|
|
"step": 15165
|
|
},
|
|
{
|
|
"entropy": 5.128720092773437,
|
|
"epoch": 1.457252641690682,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004793490757803221,
|
|
"loss": 4.8902,
|
|
"mean_token_accuracy": 0.22066261917352675,
|
|
"num_tokens": 34782126.0,
|
|
"step": 15170
|
|
},
|
|
{
|
|
"entropy": 5.119160270690918,
|
|
"epoch": 1.457732949087416,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047933472583826063,
|
|
"loss": 4.9154,
|
|
"mean_token_accuracy": 0.21358481496572496,
|
|
"num_tokens": 34793802.0,
|
|
"step": 15175
|
|
},
|
|
{
|
|
"entropy": 5.1480120658874515,
|
|
"epoch": 1.45821325648415,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004793203711521537,
|
|
"loss": 4.9813,
|
|
"mean_token_accuracy": 0.22073666751384735,
|
|
"num_tokens": 34805696.0,
|
|
"step": 15180
|
|
},
|
|
{
|
|
"entropy": 5.296039390563965,
|
|
"epoch": 1.4586935638808838,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047930601172233446,
|
|
"loss": 5.1314,
|
|
"mean_token_accuracy": 0.2063765347003937,
|
|
"num_tokens": 34818679.0,
|
|
"step": 15185
|
|
},
|
|
{
|
|
"entropy": 5.297084808349609,
|
|
"epoch": 1.4591738712776177,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00047929164754913624,
|
|
"loss": 4.9855,
|
|
"mean_token_accuracy": 0.21749197095632553,
|
|
"num_tokens": 34830528.0,
|
|
"step": 15190
|
|
},
|
|
{
|
|
"entropy": 5.171178531646729,
|
|
"epoch": 1.4596541786743515,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004792772786328926,
|
|
"loss": 4.8748,
|
|
"mean_token_accuracy": 0.22240075021982192,
|
|
"num_tokens": 34841621.0,
|
|
"step": 15195
|
|
},
|
|
{
|
|
"entropy": 5.087448406219482,
|
|
"epoch": 1.4601344860710854,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047926290497393714,
|
|
"loss": 4.8775,
|
|
"mean_token_accuracy": 0.2203219324350357,
|
|
"num_tokens": 34854448.0,
|
|
"step": 15200
|
|
},
|
|
{
|
|
"entropy": 5.195474147796631,
|
|
"epoch": 1.4606147934678195,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004792485265726036,
|
|
"loss": 4.996,
|
|
"mean_token_accuracy": 0.2099252760410309,
|
|
"num_tokens": 34866492.0,
|
|
"step": 15205
|
|
},
|
|
{
|
|
"entropy": 5.18867597579956,
|
|
"epoch": 1.4610951008645534,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004792341434292257,
|
|
"loss": 4.958,
|
|
"mean_token_accuracy": 0.21405645608901977,
|
|
"num_tokens": 34876869.0,
|
|
"step": 15210
|
|
},
|
|
{
|
|
"entropy": 5.122355937957764,
|
|
"epoch": 1.4615754082612873,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004792197555441374,
|
|
"loss": 4.8834,
|
|
"mean_token_accuracy": 0.22930939495563507,
|
|
"num_tokens": 34888449.0,
|
|
"step": 15215
|
|
},
|
|
{
|
|
"entropy": 5.125897169113159,
|
|
"epoch": 1.4620557156580212,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004792053629176729,
|
|
"loss": 4.9055,
|
|
"mean_token_accuracy": 0.2263544738292694,
|
|
"num_tokens": 34898124.0,
|
|
"step": 15220
|
|
},
|
|
{
|
|
"entropy": 5.202107191085815,
|
|
"epoch": 1.462536023054755,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004791909655501662,
|
|
"loss": 4.8921,
|
|
"mean_token_accuracy": 0.22310193479061127,
|
|
"num_tokens": 34909128.0,
|
|
"step": 15225
|
|
},
|
|
{
|
|
"entropy": 5.092991304397583,
|
|
"epoch": 1.463016330451489,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004791765634419516,
|
|
"loss": 4.9389,
|
|
"mean_token_accuracy": 0.2171325519680977,
|
|
"num_tokens": 34920541.0,
|
|
"step": 15230
|
|
},
|
|
{
|
|
"entropy": 5.047167062759399,
|
|
"epoch": 1.4634966378482228,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00047916215659336343,
|
|
"loss": 4.8782,
|
|
"mean_token_accuracy": 0.22050851583480835,
|
|
"num_tokens": 34931605.0,
|
|
"step": 15235
|
|
},
|
|
{
|
|
"entropy": 5.168670988082885,
|
|
"epoch": 1.4639769452449567,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004791477450047363,
|
|
"loss": 4.9172,
|
|
"mean_token_accuracy": 0.22224834561347961,
|
|
"num_tokens": 34943057.0,
|
|
"step": 15240
|
|
},
|
|
{
|
|
"entropy": 5.2066905975341795,
|
|
"epoch": 1.4644572526416906,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00047913332867640464,
|
|
"loss": 5.085,
|
|
"mean_token_accuracy": 0.20473618805408478,
|
|
"num_tokens": 34954386.0,
|
|
"step": 15245
|
|
},
|
|
{
|
|
"entropy": 5.141163301467896,
|
|
"epoch": 1.4649375600384245,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004791189076087033,
|
|
"loss": 4.9266,
|
|
"mean_token_accuracy": 0.21532471030950545,
|
|
"num_tokens": 34965874.0,
|
|
"step": 15250
|
|
},
|
|
{
|
|
"entropy": 5.232634353637695,
|
|
"epoch": 1.4654178674351586,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047910448180196703,
|
|
"loss": 5.0222,
|
|
"mean_token_accuracy": 0.2117237016558647,
|
|
"num_tokens": 34977408.0,
|
|
"step": 15255
|
|
},
|
|
{
|
|
"entropy": 5.1937174797058105,
|
|
"epoch": 1.4658981748318924,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004790900512565307,
|
|
"loss": 4.8537,
|
|
"mean_token_accuracy": 0.22348989248275758,
|
|
"num_tokens": 34987788.0,
|
|
"step": 15260
|
|
},
|
|
{
|
|
"entropy": 5.230491399765015,
|
|
"epoch": 1.4663784822286263,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004790756159727294,
|
|
"loss": 5.0276,
|
|
"mean_token_accuracy": 0.2132936492562294,
|
|
"num_tokens": 35001051.0,
|
|
"step": 15265
|
|
},
|
|
{
|
|
"entropy": 5.185359954833984,
|
|
"epoch": 1.4668587896253602,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047906117595089835,
|
|
"loss": 4.969,
|
|
"mean_token_accuracy": 0.21702387034893036,
|
|
"num_tokens": 35012621.0,
|
|
"step": 15270
|
|
},
|
|
{
|
|
"entropy": 5.178160524368286,
|
|
"epoch": 1.467339097022094,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004790467311913727,
|
|
"loss": 4.9673,
|
|
"mean_token_accuracy": 0.21435904800891875,
|
|
"num_tokens": 35023789.0,
|
|
"step": 15275
|
|
},
|
|
{
|
|
"entropy": 5.145422744750976,
|
|
"epoch": 1.4678194044188282,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004790322816944879,
|
|
"loss": 4.947,
|
|
"mean_token_accuracy": 0.22396451681852342,
|
|
"num_tokens": 35035839.0,
|
|
"step": 15280
|
|
},
|
|
{
|
|
"entropy": 5.2104826927185055,
|
|
"epoch": 1.468299711815562,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004790178274605793,
|
|
"loss": 5.0371,
|
|
"mean_token_accuracy": 0.20950869023799895,
|
|
"num_tokens": 35047823.0,
|
|
"step": 15285
|
|
},
|
|
{
|
|
"entropy": 5.244489479064941,
|
|
"epoch": 1.468780019212296,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00047900336848998254,
|
|
"loss": 5.0496,
|
|
"mean_token_accuracy": 0.2074924662709236,
|
|
"num_tokens": 35058597.0,
|
|
"step": 15290
|
|
},
|
|
{
|
|
"entropy": 5.127735662460327,
|
|
"epoch": 1.4692603266090298,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004789889047830334,
|
|
"loss": 4.9085,
|
|
"mean_token_accuracy": 0.22231007516384124,
|
|
"num_tokens": 35069822.0,
|
|
"step": 15295
|
|
},
|
|
{
|
|
"entropy": 5.178108882904053,
|
|
"epoch": 1.4697406340057637,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047897443634006766,
|
|
"loss": 4.9835,
|
|
"mean_token_accuracy": 0.21558043211698533,
|
|
"num_tokens": 35081423.0,
|
|
"step": 15300
|
|
},
|
|
{
|
|
"entropy": 5.246568965911865,
|
|
"epoch": 1.4702209414024976,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004789599631614211,
|
|
"loss": 5.0002,
|
|
"mean_token_accuracy": 0.21287845075130463,
|
|
"num_tokens": 35092565.0,
|
|
"step": 15305
|
|
},
|
|
{
|
|
"entropy": 5.18705940246582,
|
|
"epoch": 1.4707012487992315,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004789454852474298,
|
|
"loss": 4.941,
|
|
"mean_token_accuracy": 0.21825831830501558,
|
|
"num_tokens": 35103811.0,
|
|
"step": 15310
|
|
},
|
|
{
|
|
"entropy": 5.2444439888000485,
|
|
"epoch": 1.4711815561959654,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004789310025984299,
|
|
"loss": 5.07,
|
|
"mean_token_accuracy": 0.20772210359573365,
|
|
"num_tokens": 35115516.0,
|
|
"step": 15315
|
|
},
|
|
{
|
|
"entropy": 5.217182779312134,
|
|
"epoch": 1.4716618635926992,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047891651521475776,
|
|
"loss": 5.0205,
|
|
"mean_token_accuracy": 0.2142233058810234,
|
|
"num_tokens": 35127285.0,
|
|
"step": 15320
|
|
},
|
|
{
|
|
"entropy": 5.153268194198608,
|
|
"epoch": 1.4721421709894331,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00047890202309674963,
|
|
"loss": 4.9433,
|
|
"mean_token_accuracy": 0.21643016785383223,
|
|
"num_tokens": 35137884.0,
|
|
"step": 15325
|
|
},
|
|
{
|
|
"entropy": 5.276388359069824,
|
|
"epoch": 1.4726224783861672,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047888752624474195,
|
|
"loss": 5.1031,
|
|
"mean_token_accuracy": 0.20545923113822936,
|
|
"num_tokens": 35149935.0,
|
|
"step": 15330
|
|
},
|
|
{
|
|
"entropy": 5.2142415046691895,
|
|
"epoch": 1.473102785782901,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004788730246590714,
|
|
"loss": 5.0424,
|
|
"mean_token_accuracy": 0.21042255759239198,
|
|
"num_tokens": 35162610.0,
|
|
"step": 15335
|
|
},
|
|
{
|
|
"entropy": 5.180163049697876,
|
|
"epoch": 1.473583093179635,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047885851834007456,
|
|
"loss": 4.9073,
|
|
"mean_token_accuracy": 0.2148707166314125,
|
|
"num_tokens": 35174799.0,
|
|
"step": 15340
|
|
},
|
|
{
|
|
"entropy": 5.1315391063690186,
|
|
"epoch": 1.4740634005763689,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047884400728808824,
|
|
"loss": 4.9346,
|
|
"mean_token_accuracy": 0.2183023527264595,
|
|
"num_tokens": 35186004.0,
|
|
"step": 15345
|
|
},
|
|
{
|
|
"entropy": 5.175625896453857,
|
|
"epoch": 1.4745437079731027,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004788294915034494,
|
|
"loss": 4.9593,
|
|
"mean_token_accuracy": 0.2172788307070732,
|
|
"num_tokens": 35197310.0,
|
|
"step": 15350
|
|
},
|
|
{
|
|
"entropy": 5.120343971252441,
|
|
"epoch": 1.4750240153698366,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000478814970986495,
|
|
"loss": 4.909,
|
|
"mean_token_accuracy": 0.22186490893363953,
|
|
"num_tokens": 35208703.0,
|
|
"step": 15355
|
|
},
|
|
{
|
|
"entropy": 5.156807708740234,
|
|
"epoch": 1.4755043227665707,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047880044573756213,
|
|
"loss": 4.9205,
|
|
"mean_token_accuracy": 0.22117386311292647,
|
|
"num_tokens": 35219927.0,
|
|
"step": 15360
|
|
},
|
|
{
|
|
"entropy": 5.186833000183105,
|
|
"epoch": 1.4759846301633046,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047878591575698816,
|
|
"loss": 4.9142,
|
|
"mean_token_accuracy": 0.21543453335762025,
|
|
"num_tokens": 35231077.0,
|
|
"step": 15365
|
|
},
|
|
{
|
|
"entropy": 5.196823215484619,
|
|
"epoch": 1.4764649375600385,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004787713810451103,
|
|
"loss": 4.9946,
|
|
"mean_token_accuracy": 0.2103252202272415,
|
|
"num_tokens": 35241984.0,
|
|
"step": 15370
|
|
},
|
|
{
|
|
"entropy": 5.159264945983887,
|
|
"epoch": 1.4769452449567724,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00047875684160226606,
|
|
"loss": 4.9422,
|
|
"mean_token_accuracy": 0.2182306170463562,
|
|
"num_tokens": 35252717.0,
|
|
"step": 15375
|
|
},
|
|
{
|
|
"entropy": 5.179389905929566,
|
|
"epoch": 1.4774255523535063,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000478742297428793,
|
|
"loss": 5.0073,
|
|
"mean_token_accuracy": 0.21191850453615188,
|
|
"num_tokens": 35263916.0,
|
|
"step": 15380
|
|
},
|
|
{
|
|
"entropy": 5.164166069030761,
|
|
"epoch": 1.4779058597502401,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047872774852502877,
|
|
"loss": 4.9267,
|
|
"mean_token_accuracy": 0.21910004168748856,
|
|
"num_tokens": 35274772.0,
|
|
"step": 15385
|
|
},
|
|
{
|
|
"entropy": 5.11943564414978,
|
|
"epoch": 1.478386167146974,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004787131948913112,
|
|
"loss": 4.8781,
|
|
"mean_token_accuracy": 0.22653010189533235,
|
|
"num_tokens": 35287150.0,
|
|
"step": 15390
|
|
},
|
|
{
|
|
"entropy": 5.131641054153443,
|
|
"epoch": 1.478866474543708,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047869863652797806,
|
|
"loss": 4.8877,
|
|
"mean_token_accuracy": 0.2227863147854805,
|
|
"num_tokens": 35298538.0,
|
|
"step": 15395
|
|
},
|
|
{
|
|
"entropy": 5.107625436782837,
|
|
"epoch": 1.4793467819404418,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004786840734353675,
|
|
"loss": 4.886,
|
|
"mean_token_accuracy": 0.22072995603084564,
|
|
"num_tokens": 35309395.0,
|
|
"step": 15400
|
|
},
|
|
{
|
|
"entropy": 5.136013507843018,
|
|
"epoch": 1.4798270893371757,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047866950561381756,
|
|
"loss": 4.9366,
|
|
"mean_token_accuracy": 0.21733225584030152,
|
|
"num_tokens": 35320741.0,
|
|
"step": 15405
|
|
},
|
|
{
|
|
"entropy": 5.298266792297364,
|
|
"epoch": 1.4803073967339098,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004786549330636665,
|
|
"loss": 5.057,
|
|
"mean_token_accuracy": 0.20781148821115494,
|
|
"num_tokens": 35331895.0,
|
|
"step": 15410
|
|
},
|
|
{
|
|
"entropy": 5.117559576034546,
|
|
"epoch": 1.4807877041306436,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047864035578525256,
|
|
"loss": 4.8407,
|
|
"mean_token_accuracy": 0.23251519501209258,
|
|
"num_tokens": 35343775.0,
|
|
"step": 15415
|
|
},
|
|
{
|
|
"entropy": 5.1231804370880125,
|
|
"epoch": 1.4812680115273775,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004786257737789143,
|
|
"loss": 4.9988,
|
|
"mean_token_accuracy": 0.21539798378944397,
|
|
"num_tokens": 35355043.0,
|
|
"step": 15420
|
|
},
|
|
{
|
|
"entropy": 5.244394207000733,
|
|
"epoch": 1.4817483189241114,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004786111870449902,
|
|
"loss": 5.0196,
|
|
"mean_token_accuracy": 0.2094297468662262,
|
|
"num_tokens": 35365387.0,
|
|
"step": 15425
|
|
},
|
|
{
|
|
"entropy": 5.215351247787476,
|
|
"epoch": 1.4822286263208453,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047859659558381894,
|
|
"loss": 4.9363,
|
|
"mean_token_accuracy": 0.22851166874170303,
|
|
"num_tokens": 35376400.0,
|
|
"step": 15430
|
|
},
|
|
{
|
|
"entropy": 5.197208881378174,
|
|
"epoch": 1.4827089337175792,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047858199939573935,
|
|
"loss": 4.987,
|
|
"mean_token_accuracy": 0.21214037835597993,
|
|
"num_tokens": 35387315.0,
|
|
"step": 15435
|
|
},
|
|
{
|
|
"entropy": 5.180497694015503,
|
|
"epoch": 1.4831892411143133,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047856739848109014,
|
|
"loss": 4.981,
|
|
"mean_token_accuracy": 0.21736457496881484,
|
|
"num_tokens": 35398666.0,
|
|
"step": 15440
|
|
},
|
|
{
|
|
"entropy": 5.155378150939941,
|
|
"epoch": 1.4836695485110472,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047855279284021046,
|
|
"loss": 4.96,
|
|
"mean_token_accuracy": 0.22037553489208223,
|
|
"num_tokens": 35409192.0,
|
|
"step": 15445
|
|
},
|
|
{
|
|
"entropy": 5.189713001251221,
|
|
"epoch": 1.484149855907781,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047853818247343933,
|
|
"loss": 5.0013,
|
|
"mean_token_accuracy": 0.206741601228714,
|
|
"num_tokens": 35419812.0,
|
|
"step": 15450
|
|
},
|
|
{
|
|
"entropy": 5.255188465118408,
|
|
"epoch": 1.484630163304515,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047852356738111606,
|
|
"loss": 4.9344,
|
|
"mean_token_accuracy": 0.22674974501132966,
|
|
"num_tokens": 35430875.0,
|
|
"step": 15455
|
|
},
|
|
{
|
|
"entropy": 5.152353525161743,
|
|
"epoch": 1.4851104707012488,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004785089475635799,
|
|
"loss": 4.9248,
|
|
"mean_token_accuracy": 0.21964309960603715,
|
|
"num_tokens": 35441065.0,
|
|
"step": 15460
|
|
},
|
|
{
|
|
"entropy": 5.103597593307495,
|
|
"epoch": 1.4855907780979827,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047849432302117024,
|
|
"loss": 4.9745,
|
|
"mean_token_accuracy": 0.2140120819211006,
|
|
"num_tokens": 35452164.0,
|
|
"step": 15465
|
|
},
|
|
{
|
|
"entropy": 5.235323476791382,
|
|
"epoch": 1.4860710854947166,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047847969375422656,
|
|
"loss": 5.0663,
|
|
"mean_token_accuracy": 0.20626734495162963,
|
|
"num_tokens": 35463158.0,
|
|
"step": 15470
|
|
},
|
|
{
|
|
"entropy": 5.090017223358155,
|
|
"epoch": 1.4865513928914504,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004784650597630887,
|
|
"loss": 4.8789,
|
|
"mean_token_accuracy": 0.22733232527971267,
|
|
"num_tokens": 35474153.0,
|
|
"step": 15475
|
|
},
|
|
{
|
|
"entropy": 5.211921691894531,
|
|
"epoch": 1.4870317002881843,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047845042104809635,
|
|
"loss": 4.9649,
|
|
"mean_token_accuracy": 0.21242944300174713,
|
|
"num_tokens": 35485680.0,
|
|
"step": 15480
|
|
},
|
|
{
|
|
"entropy": 5.209507083892822,
|
|
"epoch": 1.4875120076849184,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004784357776095892,
|
|
"loss": 5.0195,
|
|
"mean_token_accuracy": 0.215239979326725,
|
|
"num_tokens": 35497271.0,
|
|
"step": 15485
|
|
},
|
|
{
|
|
"entropy": 5.190091228485107,
|
|
"epoch": 1.4879923150816523,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004784211294479075,
|
|
"loss": 4.9131,
|
|
"mean_token_accuracy": 0.22402856945991517,
|
|
"num_tokens": 35509166.0,
|
|
"step": 15490
|
|
},
|
|
{
|
|
"entropy": 5.165255784988403,
|
|
"epoch": 1.4884726224783862,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004784064765633912,
|
|
"loss": 4.9992,
|
|
"mean_token_accuracy": 0.2183140769600868,
|
|
"num_tokens": 35521289.0,
|
|
"step": 15495
|
|
},
|
|
{
|
|
"entropy": 5.160954904556275,
|
|
"epoch": 1.48895292987512,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047839181895638057,
|
|
"loss": 4.9491,
|
|
"mean_token_accuracy": 0.22083631306886672,
|
|
"num_tokens": 35532179.0,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"entropy": 5.308222866058349,
|
|
"epoch": 1.489433237271854,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047837715662721575,
|
|
"loss": 5.1324,
|
|
"mean_token_accuracy": 0.2071303442120552,
|
|
"num_tokens": 35544703.0,
|
|
"step": 15505
|
|
},
|
|
{
|
|
"entropy": 5.273142337799072,
|
|
"epoch": 1.4899135446685878,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004783624895762372,
|
|
"loss": 5.1179,
|
|
"mean_token_accuracy": 0.2036224529147148,
|
|
"num_tokens": 35557853.0,
|
|
"step": 15510
|
|
},
|
|
{
|
|
"entropy": 5.17064061164856,
|
|
"epoch": 1.490393852065322,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00047834781780378563,
|
|
"loss": 4.8318,
|
|
"mean_token_accuracy": 0.22622861266136168,
|
|
"num_tokens": 35570340.0,
|
|
"step": 15515
|
|
},
|
|
{
|
|
"entropy": 5.192807006835937,
|
|
"epoch": 1.4908741594620558,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004783331413102015,
|
|
"loss": 5.0187,
|
|
"mean_token_accuracy": 0.21527829617261887,
|
|
"num_tokens": 35582387.0,
|
|
"step": 15520
|
|
},
|
|
{
|
|
"entropy": 5.2773651599884035,
|
|
"epoch": 1.4913544668587897,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047831846009582557,
|
|
"loss": 5.07,
|
|
"mean_token_accuracy": 0.206882107257843,
|
|
"num_tokens": 35595105.0,
|
|
"step": 15525
|
|
},
|
|
{
|
|
"entropy": 5.235234880447388,
|
|
"epoch": 1.4918347742555236,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004783037741609988,
|
|
"loss": 5.0424,
|
|
"mean_token_accuracy": 0.2106972947716713,
|
|
"num_tokens": 35607160.0,
|
|
"step": 15530
|
|
},
|
|
{
|
|
"entropy": 5.196556234359742,
|
|
"epoch": 1.4923150816522575,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004782890835060621,
|
|
"loss": 4.9382,
|
|
"mean_token_accuracy": 0.21910466104745865,
|
|
"num_tokens": 35619097.0,
|
|
"step": 15535
|
|
},
|
|
{
|
|
"entropy": 5.073312139511108,
|
|
"epoch": 1.4927953890489913,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004782743881313564,
|
|
"loss": 4.8311,
|
|
"mean_token_accuracy": 0.22151407450437546,
|
|
"num_tokens": 35629868.0,
|
|
"step": 15540
|
|
},
|
|
{
|
|
"entropy": 5.171602392196656,
|
|
"epoch": 1.4932756964457252,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00047825968803722315,
|
|
"loss": 4.9882,
|
|
"mean_token_accuracy": 0.21382750123739241,
|
|
"num_tokens": 35640622.0,
|
|
"step": 15545
|
|
},
|
|
{
|
|
"entropy": 5.203648948669434,
|
|
"epoch": 1.493756003842459,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004782449832240035,
|
|
"loss": 4.9731,
|
|
"mean_token_accuracy": 0.21205914914608,
|
|
"num_tokens": 35652383.0,
|
|
"step": 15550
|
|
},
|
|
{
|
|
"entropy": 5.1007692337036135,
|
|
"epoch": 1.494236311239193,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004782302736920387,
|
|
"loss": 4.9049,
|
|
"mean_token_accuracy": 0.2167341247200966,
|
|
"num_tokens": 35663838.0,
|
|
"step": 15555
|
|
},
|
|
{
|
|
"entropy": 5.2010805130004885,
|
|
"epoch": 1.4947166186359269,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004782155594416705,
|
|
"loss": 4.9483,
|
|
"mean_token_accuracy": 0.2152695655822754,
|
|
"num_tokens": 35674564.0,
|
|
"step": 15560
|
|
},
|
|
{
|
|
"entropy": 5.242137813568116,
|
|
"epoch": 1.495196926032661,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047820084047324045,
|
|
"loss": 4.968,
|
|
"mean_token_accuracy": 0.21514491289854049,
|
|
"num_tokens": 35685518.0,
|
|
"step": 15565
|
|
},
|
|
{
|
|
"entropy": 5.261635780334473,
|
|
"epoch": 1.4956772334293948,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047818611678709027,
|
|
"loss": 4.9776,
|
|
"mean_token_accuracy": 0.215865059196949,
|
|
"num_tokens": 35696597.0,
|
|
"step": 15570
|
|
},
|
|
{
|
|
"entropy": 5.122075605392456,
|
|
"epoch": 1.4961575408261287,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004781713883835618,
|
|
"loss": 4.9016,
|
|
"mean_token_accuracy": 0.22335670590400697,
|
|
"num_tokens": 35707229.0,
|
|
"step": 15575
|
|
},
|
|
{
|
|
"entropy": 5.1522300243377686,
|
|
"epoch": 1.4966378482228626,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047815665526299695,
|
|
"loss": 4.9901,
|
|
"mean_token_accuracy": 0.21233994662761688,
|
|
"num_tokens": 35719440.0,
|
|
"step": 15580
|
|
},
|
|
{
|
|
"entropy": 5.230376529693603,
|
|
"epoch": 1.4971181556195965,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004781419174257378,
|
|
"loss": 4.9846,
|
|
"mean_token_accuracy": 0.20775482654571534,
|
|
"num_tokens": 35731611.0,
|
|
"step": 15585
|
|
},
|
|
{
|
|
"entropy": 5.1648476123809814,
|
|
"epoch": 1.4975984630163304,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004781271748721266,
|
|
"loss": 4.9783,
|
|
"mean_token_accuracy": 0.21286329627037048,
|
|
"num_tokens": 35743047.0,
|
|
"step": 15590
|
|
},
|
|
{
|
|
"entropy": 5.227255868911743,
|
|
"epoch": 1.4980787704130645,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004781124276025055,
|
|
"loss": 4.9184,
|
|
"mean_token_accuracy": 0.23288827687501906,
|
|
"num_tokens": 35753499.0,
|
|
"step": 15595
|
|
},
|
|
{
|
|
"entropy": 5.215896415710449,
|
|
"epoch": 1.4985590778097984,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000478097675617217,
|
|
"loss": 5.1346,
|
|
"mean_token_accuracy": 0.21066973358392715,
|
|
"num_tokens": 35764682.0,
|
|
"step": 15600
|
|
},
|
|
{
|
|
"entropy": 5.239116668701172,
|
|
"epoch": 1.4990393852065322,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047808291891660357,
|
|
"loss": 4.9312,
|
|
"mean_token_accuracy": 0.22250870913267135,
|
|
"num_tokens": 35775160.0,
|
|
"step": 15605
|
|
},
|
|
{
|
|
"entropy": 5.216899585723877,
|
|
"epoch": 1.4995196926032661,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047806815750100774,
|
|
"loss": 4.9735,
|
|
"mean_token_accuracy": 0.21689383089542388,
|
|
"num_tokens": 35786089.0,
|
|
"step": 15610
|
|
},
|
|
{
|
|
"entropy": 5.17706995010376,
|
|
"epoch": 1.5,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004780533913707723,
|
|
"loss": 4.9976,
|
|
"mean_token_accuracy": 0.21321234852075577,
|
|
"num_tokens": 35796851.0,
|
|
"step": 15615
|
|
},
|
|
{
|
|
"entropy": 5.229093360900879,
|
|
"epoch": 1.5004803073967339,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047803862052624006,
|
|
"loss": 5.0117,
|
|
"mean_token_accuracy": 0.20809693783521652,
|
|
"num_tokens": 35808553.0,
|
|
"step": 15620
|
|
},
|
|
{
|
|
"entropy": 5.275240278244018,
|
|
"epoch": 1.5009606147934678,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047802384496775397,
|
|
"loss": 5.1488,
|
|
"mean_token_accuracy": 0.21014924496412277,
|
|
"num_tokens": 35820108.0,
|
|
"step": 15625
|
|
},
|
|
{
|
|
"entropy": 5.187834882736206,
|
|
"epoch": 1.5014409221902016,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004780090646956571,
|
|
"loss": 4.8862,
|
|
"mean_token_accuracy": 0.22773226201534272,
|
|
"num_tokens": 35831672.0,
|
|
"step": 15630
|
|
},
|
|
{
|
|
"entropy": 5.280662202835083,
|
|
"epoch": 1.5019212295869355,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047799427971029245,
|
|
"loss": 5.0788,
|
|
"mean_token_accuracy": 0.21054953187704087,
|
|
"num_tokens": 35843164.0,
|
|
"step": 15635
|
|
},
|
|
{
|
|
"entropy": 5.225699520111084,
|
|
"epoch": 1.5024015369836694,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004779794900120034,
|
|
"loss": 5.0723,
|
|
"mean_token_accuracy": 0.21249438375234603,
|
|
"num_tokens": 35854677.0,
|
|
"step": 15640
|
|
},
|
|
{
|
|
"entropy": 5.19042010307312,
|
|
"epoch": 1.5028818443804035,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004779646956011334,
|
|
"loss": 4.9617,
|
|
"mean_token_accuracy": 0.22047783583402633,
|
|
"num_tokens": 35865956.0,
|
|
"step": 15645
|
|
},
|
|
{
|
|
"entropy": 5.176256704330444,
|
|
"epoch": 1.5033621517771374,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047794989647802574,
|
|
"loss": 4.9709,
|
|
"mean_token_accuracy": 0.2125203862786293,
|
|
"num_tokens": 35877451.0,
|
|
"step": 15650
|
|
},
|
|
{
|
|
"entropy": 5.194400215148926,
|
|
"epoch": 1.5038424591738713,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047793509264302424,
|
|
"loss": 4.9537,
|
|
"mean_token_accuracy": 0.21531011760234833,
|
|
"num_tokens": 35888436.0,
|
|
"step": 15655
|
|
},
|
|
{
|
|
"entropy": 5.187810611724854,
|
|
"epoch": 1.5043227665706052,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047792028409647237,
|
|
"loss": 4.9621,
|
|
"mean_token_accuracy": 0.2149658814072609,
|
|
"num_tokens": 35901010.0,
|
|
"step": 15660
|
|
},
|
|
{
|
|
"entropy": 5.213496112823487,
|
|
"epoch": 1.5048030739673393,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047790547083871414,
|
|
"loss": 4.9768,
|
|
"mean_token_accuracy": 0.21322050243616103,
|
|
"num_tokens": 35912697.0,
|
|
"step": 15665
|
|
},
|
|
{
|
|
"entropy": 5.153905248641967,
|
|
"epoch": 1.5052833813640731,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00047789065287009335,
|
|
"loss": 4.8969,
|
|
"mean_token_accuracy": 0.22069223672151567,
|
|
"num_tokens": 35924614.0,
|
|
"step": 15670
|
|
},
|
|
{
|
|
"entropy": 5.203504943847657,
|
|
"epoch": 1.505763688760807,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004778758301909542,
|
|
"loss": 4.9809,
|
|
"mean_token_accuracy": 0.2169592186808586,
|
|
"num_tokens": 35935492.0,
|
|
"step": 15675
|
|
},
|
|
{
|
|
"entropy": 5.098133087158203,
|
|
"epoch": 1.506243996157541,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004778610028016405,
|
|
"loss": 4.9889,
|
|
"mean_token_accuracy": 0.21588987559080125,
|
|
"num_tokens": 35947901.0,
|
|
"step": 15680
|
|
},
|
|
{
|
|
"entropy": 5.133358764648437,
|
|
"epoch": 1.5067243035542748,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004778461707024967,
|
|
"loss": 4.8208,
|
|
"mean_token_accuracy": 0.22946203052997588,
|
|
"num_tokens": 35959690.0,
|
|
"step": 15685
|
|
},
|
|
{
|
|
"entropy": 5.206504774093628,
|
|
"epoch": 1.5072046109510087,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004778313338938672,
|
|
"loss": 4.9199,
|
|
"mean_token_accuracy": 0.22398556172847747,
|
|
"num_tokens": 35971209.0,
|
|
"step": 15690
|
|
},
|
|
{
|
|
"entropy": 5.120486497879028,
|
|
"epoch": 1.5076849183477425,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047781649237609643,
|
|
"loss": 4.8075,
|
|
"mean_token_accuracy": 0.2294871136546135,
|
|
"num_tokens": 35981795.0,
|
|
"step": 15695
|
|
},
|
|
{
|
|
"entropy": 5.0691118240356445,
|
|
"epoch": 1.5081652257444764,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004778016461495289,
|
|
"loss": 4.9587,
|
|
"mean_token_accuracy": 0.21469815373420714,
|
|
"num_tokens": 35993358.0,
|
|
"step": 15700
|
|
},
|
|
{
|
|
"entropy": 5.153229188919068,
|
|
"epoch": 1.5086455331412103,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004777867952145094,
|
|
"loss": 4.9904,
|
|
"mean_token_accuracy": 0.22090202271938325,
|
|
"num_tokens": 36005353.0,
|
|
"step": 15705
|
|
},
|
|
{
|
|
"entropy": 5.269125986099243,
|
|
"epoch": 1.5091258405379442,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004777719395713826,
|
|
"loss": 4.9591,
|
|
"mean_token_accuracy": 0.21391933113336564,
|
|
"num_tokens": 36017510.0,
|
|
"step": 15710
|
|
},
|
|
{
|
|
"entropy": 5.196733570098877,
|
|
"epoch": 1.509606147934678,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047775707922049354,
|
|
"loss": 4.9517,
|
|
"mean_token_accuracy": 0.21790158450603486,
|
|
"num_tokens": 36028687.0,
|
|
"step": 15715
|
|
},
|
|
{
|
|
"entropy": 5.108423948287964,
|
|
"epoch": 1.510086455331412,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004777422141621871,
|
|
"loss": 4.8758,
|
|
"mean_token_accuracy": 0.21638176292181016,
|
|
"num_tokens": 36039851.0,
|
|
"step": 15720
|
|
},
|
|
{
|
|
"entropy": 5.204169845581054,
|
|
"epoch": 1.510566762728146,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004777273443968085,
|
|
"loss": 5.0531,
|
|
"mean_token_accuracy": 0.21372610628604888,
|
|
"num_tokens": 36050776.0,
|
|
"step": 15725
|
|
},
|
|
{
|
|
"entropy": 5.132400417327881,
|
|
"epoch": 1.51104707012488,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004777124699247029,
|
|
"loss": 4.8719,
|
|
"mean_token_accuracy": 0.21945572644472122,
|
|
"num_tokens": 36062151.0,
|
|
"step": 15730
|
|
},
|
|
{
|
|
"entropy": 5.187724494934082,
|
|
"epoch": 1.5115273775216138,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004776975907462157,
|
|
"loss": 4.9771,
|
|
"mean_token_accuracy": 0.21894902735948563,
|
|
"num_tokens": 36074601.0,
|
|
"step": 15735
|
|
},
|
|
{
|
|
"entropy": 5.149637079238891,
|
|
"epoch": 1.5120076849183477,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004776827068616924,
|
|
"loss": 4.8867,
|
|
"mean_token_accuracy": 0.22149800211191178,
|
|
"num_tokens": 36085436.0,
|
|
"step": 15740
|
|
},
|
|
{
|
|
"entropy": 5.249966144561768,
|
|
"epoch": 1.5124879923150818,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004776678182714785,
|
|
"loss": 5.0655,
|
|
"mean_token_accuracy": 0.20393786877393721,
|
|
"num_tokens": 36097589.0,
|
|
"step": 15745
|
|
},
|
|
{
|
|
"entropy": 5.1826738834381105,
|
|
"epoch": 1.5129682997118157,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047765292497591955,
|
|
"loss": 4.9858,
|
|
"mean_token_accuracy": 0.21298189610242843,
|
|
"num_tokens": 36109993.0,
|
|
"step": 15750
|
|
},
|
|
{
|
|
"entropy": 5.207590389251709,
|
|
"epoch": 1.5134486071085496,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047763802697536146,
|
|
"loss": 4.924,
|
|
"mean_token_accuracy": 0.21502208560705185,
|
|
"num_tokens": 36122439.0,
|
|
"step": 15755
|
|
},
|
|
{
|
|
"entropy": 5.2367846965789795,
|
|
"epoch": 1.5139289145052834,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00047762312427015015,
|
|
"loss": 4.9541,
|
|
"mean_token_accuracy": 0.21168054342269899,
|
|
"num_tokens": 36133867.0,
|
|
"step": 15760
|
|
},
|
|
{
|
|
"entropy": 5.259883260726928,
|
|
"epoch": 1.5144092219020173,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047760821686063153,
|
|
"loss": 5.053,
|
|
"mean_token_accuracy": 0.20947152823209764,
|
|
"num_tokens": 36144202.0,
|
|
"step": 15765
|
|
},
|
|
{
|
|
"entropy": 5.237866592407227,
|
|
"epoch": 1.5148895292987512,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00047759330474715173,
|
|
"loss": 4.9472,
|
|
"mean_token_accuracy": 0.22170411497354509,
|
|
"num_tokens": 36154863.0,
|
|
"step": 15770
|
|
},
|
|
{
|
|
"entropy": 5.215115213394165,
|
|
"epoch": 1.515369836695485,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047757838793005704,
|
|
"loss": 4.9774,
|
|
"mean_token_accuracy": 0.21179027259349822,
|
|
"num_tokens": 36166360.0,
|
|
"step": 15775
|
|
},
|
|
{
|
|
"entropy": 5.117362546920776,
|
|
"epoch": 1.515850144092219,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00047756346640969366,
|
|
"loss": 4.8669,
|
|
"mean_token_accuracy": 0.22667350769042968,
|
|
"num_tokens": 36177477.0,
|
|
"step": 15780
|
|
},
|
|
{
|
|
"entropy": 5.210712575912476,
|
|
"epoch": 1.5163304514889528,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047754854018640803,
|
|
"loss": 4.9971,
|
|
"mean_token_accuracy": 0.21386642158031463,
|
|
"num_tokens": 36188510.0,
|
|
"step": 15785
|
|
},
|
|
{
|
|
"entropy": 5.14784517288208,
|
|
"epoch": 1.5168107588856867,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00047753360926054684,
|
|
"loss": 4.8942,
|
|
"mean_token_accuracy": 0.22038694620132446,
|
|
"num_tokens": 36199084.0,
|
|
"step": 15790
|
|
},
|
|
{
|
|
"entropy": 5.111806440353393,
|
|
"epoch": 1.5172910662824206,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047751867363245653,
|
|
"loss": 4.9112,
|
|
"mean_token_accuracy": 0.21519201546907424,
|
|
"num_tokens": 36211265.0,
|
|
"step": 15795
|
|
},
|
|
{
|
|
"entropy": 5.292361068725586,
|
|
"epoch": 1.5177713736791547,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004775037333024841,
|
|
"loss": 5.0752,
|
|
"mean_token_accuracy": 0.20719213485717775,
|
|
"num_tokens": 36223070.0,
|
|
"step": 15800
|
|
},
|
|
{
|
|
"entropy": 5.223334217071534,
|
|
"epoch": 1.5182516810758886,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004774887882709762,
|
|
"loss": 5.0089,
|
|
"mean_token_accuracy": 0.21609985679388047,
|
|
"num_tokens": 36235021.0,
|
|
"step": 15805
|
|
},
|
|
{
|
|
"entropy": 5.188208532333374,
|
|
"epoch": 1.5187319884726225,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047747383853827995,
|
|
"loss": 4.9597,
|
|
"mean_token_accuracy": 0.21417346149682998,
|
|
"num_tokens": 36245647.0,
|
|
"step": 15810
|
|
},
|
|
{
|
|
"entropy": 5.244234657287597,
|
|
"epoch": 1.5192122958693564,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004774588841047424,
|
|
"loss": 5.0398,
|
|
"mean_token_accuracy": 0.21030631810426711,
|
|
"num_tokens": 36257470.0,
|
|
"step": 15815
|
|
},
|
|
{
|
|
"entropy": 5.176792812347412,
|
|
"epoch": 1.5196926032660905,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004774439249707108,
|
|
"loss": 4.8951,
|
|
"mean_token_accuracy": 0.21645855009555817,
|
|
"num_tokens": 36268839.0,
|
|
"step": 15820
|
|
},
|
|
{
|
|
"entropy": 5.1920037269592285,
|
|
"epoch": 1.5201729106628243,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004774289611365323,
|
|
"loss": 5.038,
|
|
"mean_token_accuracy": 0.20925631374120712,
|
|
"num_tokens": 36280624.0,
|
|
"step": 15825
|
|
},
|
|
{
|
|
"entropy": 5.211209154129028,
|
|
"epoch": 1.5206532180595582,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00047741399260255434,
|
|
"loss": 4.9448,
|
|
"mean_token_accuracy": 0.21624568998813629,
|
|
"num_tokens": 36292696.0,
|
|
"step": 15830
|
|
},
|
|
{
|
|
"entropy": 5.22960147857666,
|
|
"epoch": 1.521133525456292,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047739901936912467,
|
|
"loss": 4.9583,
|
|
"mean_token_accuracy": 0.21953330487012862,
|
|
"num_tokens": 36303612.0,
|
|
"step": 15835
|
|
},
|
|
{
|
|
"entropy": 5.146418714523316,
|
|
"epoch": 1.521613832853026,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004773840414365907,
|
|
"loss": 4.9303,
|
|
"mean_token_accuracy": 0.21528706550598145,
|
|
"num_tokens": 36314511.0,
|
|
"step": 15840
|
|
},
|
|
{
|
|
"entropy": 5.289036989212036,
|
|
"epoch": 1.5220941402497599,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047736905880530026,
|
|
"loss": 5.0616,
|
|
"mean_token_accuracy": 0.20672106891870498,
|
|
"num_tokens": 36327276.0,
|
|
"step": 15845
|
|
},
|
|
{
|
|
"entropy": 5.214120817184448,
|
|
"epoch": 1.5225744476464937,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004773540714756012,
|
|
"loss": 4.9296,
|
|
"mean_token_accuracy": 0.21783770322799684,
|
|
"num_tokens": 36339373.0,
|
|
"step": 15850
|
|
},
|
|
{
|
|
"entropy": 5.239572238922119,
|
|
"epoch": 1.5230547550432276,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00047733907944784144,
|
|
"loss": 5.0491,
|
|
"mean_token_accuracy": 0.20820102244615554,
|
|
"num_tokens": 36351863.0,
|
|
"step": 15855
|
|
},
|
|
{
|
|
"entropy": 5.161839580535888,
|
|
"epoch": 1.5235350624399615,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004773240827223691,
|
|
"loss": 4.9524,
|
|
"mean_token_accuracy": 0.22128331512212754,
|
|
"num_tokens": 36363961.0,
|
|
"step": 15860
|
|
},
|
|
{
|
|
"entropy": 5.185322666168213,
|
|
"epoch": 1.5240153698366954,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004773090812995323,
|
|
"loss": 4.9384,
|
|
"mean_token_accuracy": 0.2248750001192093,
|
|
"num_tokens": 36374738.0,
|
|
"step": 15865
|
|
},
|
|
{
|
|
"entropy": 5.185145139694214,
|
|
"epoch": 1.5244956772334293,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047729407517967945,
|
|
"loss": 4.8691,
|
|
"mean_token_accuracy": 0.22020863592624665,
|
|
"num_tokens": 36386472.0,
|
|
"step": 15870
|
|
},
|
|
{
|
|
"entropy": 5.170194339752197,
|
|
"epoch": 1.5249759846301632,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047727906436315884,
|
|
"loss": 4.956,
|
|
"mean_token_accuracy": 0.22778922319412231,
|
|
"num_tokens": 36397042.0,
|
|
"step": 15875
|
|
},
|
|
{
|
|
"entropy": 5.166629409790039,
|
|
"epoch": 1.5254562920268973,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047726404885031895,
|
|
"loss": 4.9269,
|
|
"mean_token_accuracy": 0.21769467294216155,
|
|
"num_tokens": 36408720.0,
|
|
"step": 15880
|
|
},
|
|
{
|
|
"entropy": 5.233714628219604,
|
|
"epoch": 1.5259365994236311,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047724902864150845,
|
|
"loss": 5.0013,
|
|
"mean_token_accuracy": 0.2127738893032074,
|
|
"num_tokens": 36420885.0,
|
|
"step": 15885
|
|
},
|
|
{
|
|
"entropy": 5.250354194641114,
|
|
"epoch": 1.526416906820365,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047723400373707607,
|
|
"loss": 5.0181,
|
|
"mean_token_accuracy": 0.20683068931102752,
|
|
"num_tokens": 36433678.0,
|
|
"step": 15890
|
|
},
|
|
{
|
|
"entropy": 5.2336162567138675,
|
|
"epoch": 1.526897214217099,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004772189741373707,
|
|
"loss": 5.0423,
|
|
"mean_token_accuracy": 0.21756250262260438,
|
|
"num_tokens": 36445143.0,
|
|
"step": 15895
|
|
},
|
|
{
|
|
"entropy": 5.102718687057495,
|
|
"epoch": 1.527377521613833,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047720393984274117,
|
|
"loss": 4.9456,
|
|
"mean_token_accuracy": 0.21202410906553268,
|
|
"num_tokens": 36456214.0,
|
|
"step": 15900
|
|
},
|
|
{
|
|
"entropy": 5.2174307346344,
|
|
"epoch": 1.5278578290105669,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047718890085353654,
|
|
"loss": 4.9635,
|
|
"mean_token_accuracy": 0.2163314238190651,
|
|
"num_tokens": 36466767.0,
|
|
"step": 15905
|
|
},
|
|
{
|
|
"entropy": 5.3040376663208,
|
|
"epoch": 1.5283381364073008,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000477173857170106,
|
|
"loss": 4.9536,
|
|
"mean_token_accuracy": 0.2135412722826004,
|
|
"num_tokens": 36478150.0,
|
|
"step": 15910
|
|
},
|
|
{
|
|
"entropy": 5.167091703414917,
|
|
"epoch": 1.5288184438040346,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047715880879279894,
|
|
"loss": 4.9546,
|
|
"mean_token_accuracy": 0.2112107068300247,
|
|
"num_tokens": 36488280.0,
|
|
"step": 15915
|
|
},
|
|
{
|
|
"entropy": 5.194524145126342,
|
|
"epoch": 1.5292987512007685,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004771437557219646,
|
|
"loss": 5.0048,
|
|
"mean_token_accuracy": 0.21566450595855713,
|
|
"num_tokens": 36498505.0,
|
|
"step": 15920
|
|
},
|
|
{
|
|
"entropy": 5.1693662166595455,
|
|
"epoch": 1.5297790585975024,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004771286979579524,
|
|
"loss": 4.9538,
|
|
"mean_token_accuracy": 0.2125968560576439,
|
|
"num_tokens": 36509271.0,
|
|
"step": 15925
|
|
},
|
|
{
|
|
"entropy": 5.234210538864136,
|
|
"epoch": 1.5302593659942363,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004771136355011121,
|
|
"loss": 5.0366,
|
|
"mean_token_accuracy": 0.20775097012519836,
|
|
"num_tokens": 36520022.0,
|
|
"step": 15930
|
|
},
|
|
{
|
|
"entropy": 5.213120317459106,
|
|
"epoch": 1.5307396733909702,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047709856835179333,
|
|
"loss": 4.9638,
|
|
"mean_token_accuracy": 0.21767441034317017,
|
|
"num_tokens": 36532769.0,
|
|
"step": 15935
|
|
},
|
|
{
|
|
"entropy": 5.239807176589966,
|
|
"epoch": 1.531219980787704,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047708349651034586,
|
|
"loss": 4.9947,
|
|
"mean_token_accuracy": 0.22149415910243989,
|
|
"num_tokens": 36544454.0,
|
|
"step": 15940
|
|
},
|
|
{
|
|
"entropy": 5.220270252227783,
|
|
"epoch": 1.531700288184438,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047706841997711974,
|
|
"loss": 4.9688,
|
|
"mean_token_accuracy": 0.20992875397205352,
|
|
"num_tokens": 36555916.0,
|
|
"step": 15945
|
|
},
|
|
{
|
|
"entropy": 5.154624176025391,
|
|
"epoch": 1.5321805955811718,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047705333875246495,
|
|
"loss": 4.9463,
|
|
"mean_token_accuracy": 0.21742784678936006,
|
|
"num_tokens": 36567829.0,
|
|
"step": 15950
|
|
},
|
|
{
|
|
"entropy": 5.20950345993042,
|
|
"epoch": 1.532660902977906,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047703825283673153,
|
|
"loss": 5.0589,
|
|
"mean_token_accuracy": 0.21559867709875108,
|
|
"num_tokens": 36578216.0,
|
|
"step": 15955
|
|
},
|
|
{
|
|
"entropy": 5.240422534942627,
|
|
"epoch": 1.5331412103746398,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004770231622302699,
|
|
"loss": 4.9537,
|
|
"mean_token_accuracy": 0.21943466514348983,
|
|
"num_tokens": 36589945.0,
|
|
"step": 15960
|
|
},
|
|
{
|
|
"entropy": 5.250634670257568,
|
|
"epoch": 1.5336215177713737,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00047700806693343016,
|
|
"loss": 4.9771,
|
|
"mean_token_accuracy": 0.21575426161289216,
|
|
"num_tokens": 36600724.0,
|
|
"step": 15965
|
|
},
|
|
{
|
|
"entropy": 5.060350513458252,
|
|
"epoch": 1.5341018251681076,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047699296694656316,
|
|
"loss": 4.8074,
|
|
"mean_token_accuracy": 0.22741931974887847,
|
|
"num_tokens": 36611154.0,
|
|
"step": 15970
|
|
},
|
|
{
|
|
"entropy": 5.169518995285034,
|
|
"epoch": 1.5345821325648417,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004769778622700192,
|
|
"loss": 4.9076,
|
|
"mean_token_accuracy": 0.21358391046524047,
|
|
"num_tokens": 36621750.0,
|
|
"step": 15975
|
|
},
|
|
{
|
|
"entropy": 5.205866909027099,
|
|
"epoch": 1.5350624399615755,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00047696275290414885,
|
|
"loss": 4.9543,
|
|
"mean_token_accuracy": 0.21394149214029312,
|
|
"num_tokens": 36633294.0,
|
|
"step": 15980
|
|
},
|
|
{
|
|
"entropy": 5.125529336929321,
|
|
"epoch": 1.5355427473583094,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047694763884930324,
|
|
"loss": 4.8902,
|
|
"mean_token_accuracy": 0.2196623682975769,
|
|
"num_tokens": 36646377.0,
|
|
"step": 15985
|
|
},
|
|
{
|
|
"entropy": 5.141584873199463,
|
|
"epoch": 1.5360230547550433,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047693252010583314,
|
|
"loss": 4.8424,
|
|
"mean_token_accuracy": 0.22554460167884827,
|
|
"num_tokens": 36656159.0,
|
|
"step": 15990
|
|
},
|
|
{
|
|
"entropy": 5.14939513206482,
|
|
"epoch": 1.5365033621517772,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004769173966740895,
|
|
"loss": 4.9904,
|
|
"mean_token_accuracy": 0.2079702839255333,
|
|
"num_tokens": 36667522.0,
|
|
"step": 15995
|
|
},
|
|
{
|
|
"entropy": 5.26382999420166,
|
|
"epoch": 1.536983669548511,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00047690226855442346,
|
|
"loss": 4.9977,
|
|
"mean_token_accuracy": 0.21323842704296112,
|
|
"num_tokens": 36678662.0,
|
|
"step": 16000
|
|
},
|
|
{
|
|
"entropy": 5.19386396408081,
|
|
"epoch": 1.537463976945245,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004768871357471863,
|
|
"loss": 4.9681,
|
|
"mean_token_accuracy": 0.21223879903554915,
|
|
"num_tokens": 36689876.0,
|
|
"step": 16005
|
|
},
|
|
{
|
|
"entropy": 5.211331748962403,
|
|
"epoch": 1.5379442843419788,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047687199825272936,
|
|
"loss": 4.9179,
|
|
"mean_token_accuracy": 0.22314226925373076,
|
|
"num_tokens": 36701140.0,
|
|
"step": 16010
|
|
},
|
|
{
|
|
"entropy": 5.2290960311889645,
|
|
"epoch": 1.5384245917387127,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047685685607140403,
|
|
"loss": 5.067,
|
|
"mean_token_accuracy": 0.20884881168603897,
|
|
"num_tokens": 36711077.0,
|
|
"step": 16015
|
|
},
|
|
{
|
|
"entropy": 5.214303207397461,
|
|
"epoch": 1.5389048991354466,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00047684170920356185,
|
|
"loss": 4.9477,
|
|
"mean_token_accuracy": 0.2187011405825615,
|
|
"num_tokens": 36722705.0,
|
|
"step": 16020
|
|
},
|
|
{
|
|
"entropy": 5.1465880393981935,
|
|
"epoch": 1.5393852065321805,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004768265576495546,
|
|
"loss": 4.9004,
|
|
"mean_token_accuracy": 0.2221095785498619,
|
|
"num_tokens": 36733446.0,
|
|
"step": 16025
|
|
},
|
|
{
|
|
"entropy": 5.114369249343872,
|
|
"epoch": 1.5398655139289144,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047681140140973396,
|
|
"loss": 4.9272,
|
|
"mean_token_accuracy": 0.2224670261144638,
|
|
"num_tokens": 36744529.0,
|
|
"step": 16030
|
|
},
|
|
{
|
|
"entropy": 5.095712089538575,
|
|
"epoch": 1.5403458213256485,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004767962404844517,
|
|
"loss": 4.9387,
|
|
"mean_token_accuracy": 0.21873563379049302,
|
|
"num_tokens": 36756248.0,
|
|
"step": 16035
|
|
},
|
|
{
|
|
"entropy": 5.287334442138672,
|
|
"epoch": 1.5408261287223823,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047678107487406015,
|
|
"loss": 5.0196,
|
|
"mean_token_accuracy": 0.21320114731788636,
|
|
"num_tokens": 36768097.0,
|
|
"step": 16040
|
|
},
|
|
{
|
|
"entropy": 5.204667949676514,
|
|
"epoch": 1.5413064361191162,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00047676590457891116,
|
|
"loss": 4.9646,
|
|
"mean_token_accuracy": 0.21519066542387008,
|
|
"num_tokens": 36780395.0,
|
|
"step": 16045
|
|
},
|
|
{
|
|
"entropy": 5.173113012313843,
|
|
"epoch": 1.54178674351585,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004767507295993569,
|
|
"loss": 4.9343,
|
|
"mean_token_accuracy": 0.211539426445961,
|
|
"num_tokens": 36791897.0,
|
|
"step": 16050
|
|
},
|
|
{
|
|
"entropy": 5.160366153717041,
|
|
"epoch": 1.5422670509125842,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004767355499357498,
|
|
"loss": 4.9177,
|
|
"mean_token_accuracy": 0.22163355052471162,
|
|
"num_tokens": 36802716.0,
|
|
"step": 16055
|
|
},
|
|
{
|
|
"entropy": 5.138392639160156,
|
|
"epoch": 1.542747358309318,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004767203655884423,
|
|
"loss": 4.9724,
|
|
"mean_token_accuracy": 0.21543466299772263,
|
|
"num_tokens": 36814471.0,
|
|
"step": 16060
|
|
},
|
|
{
|
|
"entropy": 5.1336760997772215,
|
|
"epoch": 1.543227665706052,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004767051765577869,
|
|
"loss": 4.9174,
|
|
"mean_token_accuracy": 0.2216467648744583,
|
|
"num_tokens": 36825975.0,
|
|
"step": 16065
|
|
},
|
|
{
|
|
"entropy": 5.2050800800323485,
|
|
"epoch": 1.5437079731027858,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047668998284413624,
|
|
"loss": 5.0166,
|
|
"mean_token_accuracy": 0.20977095812559127,
|
|
"num_tokens": 36837990.0,
|
|
"step": 16070
|
|
},
|
|
{
|
|
"entropy": 5.227380561828613,
|
|
"epoch": 1.5441882804995197,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00047667478444784306,
|
|
"loss": 5.0358,
|
|
"mean_token_accuracy": 0.21031395345926285,
|
|
"num_tokens": 36849115.0,
|
|
"step": 16075
|
|
},
|
|
{
|
|
"entropy": 5.22444372177124,
|
|
"epoch": 1.5446685878962536,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004766595813692602,
|
|
"loss": 4.8866,
|
|
"mean_token_accuracy": 0.21476306468248368,
|
|
"num_tokens": 36860626.0,
|
|
"step": 16080
|
|
},
|
|
{
|
|
"entropy": 5.200777339935303,
|
|
"epoch": 1.5451488952929875,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047664437360874076,
|
|
"loss": 5.0325,
|
|
"mean_token_accuracy": 0.20902796387672423,
|
|
"num_tokens": 36871926.0,
|
|
"step": 16085
|
|
},
|
|
{
|
|
"entropy": 5.180306005477905,
|
|
"epoch": 1.5456292026897214,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047662916116663766,
|
|
"loss": 4.949,
|
|
"mean_token_accuracy": 0.22320764660835266,
|
|
"num_tokens": 36883511.0,
|
|
"step": 16090
|
|
},
|
|
{
|
|
"entropy": 5.253410387039184,
|
|
"epoch": 1.5461095100864553,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047661394404330417,
|
|
"loss": 5.0173,
|
|
"mean_token_accuracy": 0.21642861217260362,
|
|
"num_tokens": 36895468.0,
|
|
"step": 16095
|
|
},
|
|
{
|
|
"entropy": 5.182759952545166,
|
|
"epoch": 1.5465898174831891,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047659872223909357,
|
|
"loss": 4.911,
|
|
"mean_token_accuracy": 0.21871508955955504,
|
|
"num_tokens": 36906957.0,
|
|
"step": 16100
|
|
},
|
|
{
|
|
"entropy": 5.156119346618652,
|
|
"epoch": 1.547070124879923,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004765834957543592,
|
|
"loss": 4.9463,
|
|
"mean_token_accuracy": 0.21861688941717147,
|
|
"num_tokens": 36916495.0,
|
|
"step": 16105
|
|
},
|
|
{
|
|
"entropy": 5.1166833400726315,
|
|
"epoch": 1.547550432276657,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047656826458945475,
|
|
"loss": 4.9244,
|
|
"mean_token_accuracy": 0.21573103368282318,
|
|
"num_tokens": 36927301.0,
|
|
"step": 16110
|
|
},
|
|
{
|
|
"entropy": 5.178887462615966,
|
|
"epoch": 1.548030739673391,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047655302874473365,
|
|
"loss": 4.8872,
|
|
"mean_token_accuracy": 0.22074204683303833,
|
|
"num_tokens": 36938116.0,
|
|
"step": 16115
|
|
},
|
|
{
|
|
"entropy": 5.199589490890503,
|
|
"epoch": 1.5485110470701249,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004765377882205498,
|
|
"loss": 4.9571,
|
|
"mean_token_accuracy": 0.21698470413684845,
|
|
"num_tokens": 36950292.0,
|
|
"step": 16120
|
|
},
|
|
{
|
|
"entropy": 5.1331400871276855,
|
|
"epoch": 1.5489913544668588,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004765225430172568,
|
|
"loss": 4.9833,
|
|
"mean_token_accuracy": 0.220487479865551,
|
|
"num_tokens": 36962017.0,
|
|
"step": 16125
|
|
},
|
|
{
|
|
"entropy": 5.2799131870269775,
|
|
"epoch": 1.5494716618635929,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004765072931352089,
|
|
"loss": 5.0086,
|
|
"mean_token_accuracy": 0.21049043387174607,
|
|
"num_tokens": 36972831.0,
|
|
"step": 16130
|
|
},
|
|
{
|
|
"entropy": 5.199352645874024,
|
|
"epoch": 1.5499519692603267,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00047649203857476,
|
|
"loss": 4.8725,
|
|
"mean_token_accuracy": 0.2225062444806099,
|
|
"num_tokens": 36983324.0,
|
|
"step": 16135
|
|
},
|
|
{
|
|
"entropy": 5.153161334991455,
|
|
"epoch": 1.5504322766570606,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047647677933626423,
|
|
"loss": 5.1157,
|
|
"mean_token_accuracy": 0.20301380157470703,
|
|
"num_tokens": 36996093.0,
|
|
"step": 16140
|
|
},
|
|
{
|
|
"entropy": 5.1830164909362795,
|
|
"epoch": 1.5509125840537945,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047646151542007583,
|
|
"loss": 4.9454,
|
|
"mean_token_accuracy": 0.2194085642695427,
|
|
"num_tokens": 37006548.0,
|
|
"step": 16145
|
|
},
|
|
{
|
|
"entropy": 5.234643363952637,
|
|
"epoch": 1.5513928914505284,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004764462468265494,
|
|
"loss": 4.946,
|
|
"mean_token_accuracy": 0.2132670909166336,
|
|
"num_tokens": 37019181.0,
|
|
"step": 16150
|
|
},
|
|
{
|
|
"entropy": 5.261085796356201,
|
|
"epoch": 1.5518731988472623,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00047643097355603913,
|
|
"loss": 4.985,
|
|
"mean_token_accuracy": 0.21415400505065918,
|
|
"num_tokens": 37029808.0,
|
|
"step": 16155
|
|
},
|
|
{
|
|
"entropy": 5.180344581604004,
|
|
"epoch": 1.5523535062439962,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004764156956088999,
|
|
"loss": 4.9579,
|
|
"mean_token_accuracy": 0.21201344579458237,
|
|
"num_tokens": 37040352.0,
|
|
"step": 16160
|
|
},
|
|
{
|
|
"entropy": 5.1573163032531735,
|
|
"epoch": 1.55283381364073,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004764004129854863,
|
|
"loss": 4.9926,
|
|
"mean_token_accuracy": 0.2195364996790886,
|
|
"num_tokens": 37050780.0,
|
|
"step": 16165
|
|
},
|
|
{
|
|
"entropy": 5.190805196762085,
|
|
"epoch": 1.553314121037464,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047638512568615307,
|
|
"loss": 4.9335,
|
|
"mean_token_accuracy": 0.21873989403247834,
|
|
"num_tokens": 37062608.0,
|
|
"step": 16170
|
|
},
|
|
{
|
|
"entropy": 5.184975290298462,
|
|
"epoch": 1.5537944284341978,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004763698337112553,
|
|
"loss": 5.0266,
|
|
"mean_token_accuracy": 0.2099449321627617,
|
|
"num_tokens": 37074578.0,
|
|
"step": 16175
|
|
},
|
|
{
|
|
"entropy": 5.243215465545655,
|
|
"epoch": 1.5542747358309317,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004763545370611479,
|
|
"loss": 5.0469,
|
|
"mean_token_accuracy": 0.2158788859844208,
|
|
"num_tokens": 37086487.0,
|
|
"step": 16180
|
|
},
|
|
{
|
|
"entropy": 5.2162518978118895,
|
|
"epoch": 1.5547550432276656,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047633923573618605,
|
|
"loss": 4.9287,
|
|
"mean_token_accuracy": 0.22581578940153121,
|
|
"num_tokens": 37097249.0,
|
|
"step": 16185
|
|
},
|
|
{
|
|
"entropy": 5.257421016693115,
|
|
"epoch": 1.5552353506243997,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000476323929736725,
|
|
"loss": 5.1366,
|
|
"mean_token_accuracy": 0.20327619165182115,
|
|
"num_tokens": 37109274.0,
|
|
"step": 16190
|
|
},
|
|
{
|
|
"entropy": 5.191748714447021,
|
|
"epoch": 1.5557156580211335,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00047630861906312004,
|
|
"loss": 4.9506,
|
|
"mean_token_accuracy": 0.21775271743535995,
|
|
"num_tokens": 37120100.0,
|
|
"step": 16195
|
|
},
|
|
{
|
|
"entropy": 5.18410849571228,
|
|
"epoch": 1.5561959654178674,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004762933037157268,
|
|
"loss": 4.9254,
|
|
"mean_token_accuracy": 0.21204567849636077,
|
|
"num_tokens": 37131889.0,
|
|
"step": 16200
|
|
},
|
|
{
|
|
"entropy": 5.114263343811035,
|
|
"epoch": 1.5566762728146013,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047627798369490076,
|
|
"loss": 4.8947,
|
|
"mean_token_accuracy": 0.22275954782962798,
|
|
"num_tokens": 37142538.0,
|
|
"step": 16205
|
|
},
|
|
{
|
|
"entropy": 5.235247564315796,
|
|
"epoch": 1.5571565802113354,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047626265900099757,
|
|
"loss": 5.0412,
|
|
"mean_token_accuracy": 0.2095339596271515,
|
|
"num_tokens": 37153639.0,
|
|
"step": 16210
|
|
},
|
|
{
|
|
"entropy": 5.216181850433349,
|
|
"epoch": 1.5576368876080693,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047624732963437314,
|
|
"loss": 4.9877,
|
|
"mean_token_accuracy": 0.21557213515043258,
|
|
"num_tokens": 37164029.0,
|
|
"step": 16215
|
|
},
|
|
{
|
|
"entropy": 5.211888360977173,
|
|
"epoch": 1.5581171950048032,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047623199559538324,
|
|
"loss": 5.032,
|
|
"mean_token_accuracy": 0.211012165248394,
|
|
"num_tokens": 37175880.0,
|
|
"step": 16220
|
|
},
|
|
{
|
|
"entropy": 5.221793174743652,
|
|
"epoch": 1.558597502401537,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.000476216656884384,
|
|
"loss": 4.9699,
|
|
"mean_token_accuracy": 0.21573802679777146,
|
|
"num_tokens": 37187483.0,
|
|
"step": 16225
|
|
},
|
|
{
|
|
"entropy": 5.2223461151123045,
|
|
"epoch": 1.559077809798271,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047620131350173135,
|
|
"loss": 4.9836,
|
|
"mean_token_accuracy": 0.21847614794969558,
|
|
"num_tokens": 37198978.0,
|
|
"step": 16230
|
|
},
|
|
{
|
|
"entropy": 5.254594516754151,
|
|
"epoch": 1.5595581171950048,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004761859654477817,
|
|
"loss": 5.0051,
|
|
"mean_token_accuracy": 0.20368780344724655,
|
|
"num_tokens": 37210054.0,
|
|
"step": 16235
|
|
},
|
|
{
|
|
"entropy": 5.1702179431915285,
|
|
"epoch": 1.5600384245917387,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004761706127228914,
|
|
"loss": 4.9572,
|
|
"mean_token_accuracy": 0.21917274296283723,
|
|
"num_tokens": 37221744.0,
|
|
"step": 16240
|
|
},
|
|
{
|
|
"entropy": 5.151251411437988,
|
|
"epoch": 1.5605187319884726,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004761552553274168,
|
|
"loss": 4.9985,
|
|
"mean_token_accuracy": 0.21715299040079117,
|
|
"num_tokens": 37234629.0,
|
|
"step": 16245
|
|
},
|
|
{
|
|
"entropy": 5.2933587551116945,
|
|
"epoch": 1.5609990393852065,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004761398932617144,
|
|
"loss": 5.0695,
|
|
"mean_token_accuracy": 0.21107089519500732,
|
|
"num_tokens": 37246960.0,
|
|
"step": 16250
|
|
},
|
|
{
|
|
"entropy": 5.151790046691895,
|
|
"epoch": 1.5614793467819403,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000476124526526141,
|
|
"loss": 4.9236,
|
|
"mean_token_accuracy": 0.2184425637125969,
|
|
"num_tokens": 37259326.0,
|
|
"step": 16255
|
|
},
|
|
{
|
|
"entropy": 5.091758918762207,
|
|
"epoch": 1.5619596541786742,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00047610915512105327,
|
|
"loss": 4.9732,
|
|
"mean_token_accuracy": 0.2134696900844574,
|
|
"num_tokens": 37271436.0,
|
|
"step": 16260
|
|
},
|
|
{
|
|
"entropy": 5.234701013565063,
|
|
"epoch": 1.562439961575408,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004760937790468082,
|
|
"loss": 5.038,
|
|
"mean_token_accuracy": 0.20630017220973967,
|
|
"num_tokens": 37283230.0,
|
|
"step": 16265
|
|
},
|
|
{
|
|
"entropy": 5.2092828273773195,
|
|
"epoch": 1.5629202689721422,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004760783983037627,
|
|
"loss": 4.9399,
|
|
"mean_token_accuracy": 0.21871796250343323,
|
|
"num_tokens": 37295185.0,
|
|
"step": 16270
|
|
},
|
|
{
|
|
"entropy": 5.198478031158447,
|
|
"epoch": 1.563400576368876,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004760630128922738,
|
|
"loss": 5.0027,
|
|
"mean_token_accuracy": 0.21369778662919997,
|
|
"num_tokens": 37307573.0,
|
|
"step": 16275
|
|
},
|
|
{
|
|
"entropy": 5.271113395690918,
|
|
"epoch": 1.56388088376561,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004760476228126989,
|
|
"loss": 5.0284,
|
|
"mean_token_accuracy": 0.21126459836959838,
|
|
"num_tokens": 37319084.0,
|
|
"step": 16280
|
|
},
|
|
{
|
|
"entropy": 5.120991039276123,
|
|
"epoch": 1.5643611911623438,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004760322280653951,
|
|
"loss": 4.9307,
|
|
"mean_token_accuracy": 0.21826708912849427,
|
|
"num_tokens": 37330447.0,
|
|
"step": 16285
|
|
},
|
|
{
|
|
"entropy": 5.164800930023193,
|
|
"epoch": 1.564841498559078,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004760168286507199,
|
|
"loss": 4.8447,
|
|
"mean_token_accuracy": 0.21647296249866485,
|
|
"num_tokens": 37341184.0,
|
|
"step": 16290
|
|
},
|
|
{
|
|
"entropy": 5.217311954498291,
|
|
"epoch": 1.5653218059558118,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047600142456903085,
|
|
"loss": 4.9875,
|
|
"mean_token_accuracy": 0.21339131146669388,
|
|
"num_tokens": 37352712.0,
|
|
"step": 16295
|
|
},
|
|
{
|
|
"entropy": 5.191257572174072,
|
|
"epoch": 1.5658021133525457,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047598601582068555,
|
|
"loss": 5.0424,
|
|
"mean_token_accuracy": 0.2111809030175209,
|
|
"num_tokens": 37365001.0,
|
|
"step": 16300
|
|
},
|
|
{
|
|
"entropy": 5.306282901763916,
|
|
"epoch": 1.5662824207492796,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004759706024060418,
|
|
"loss": 5.1008,
|
|
"mean_token_accuracy": 0.20626700818538665,
|
|
"num_tokens": 37377005.0,
|
|
"step": 16305
|
|
},
|
|
{
|
|
"entropy": 5.196134757995606,
|
|
"epoch": 1.5667627281460135,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004759551843254575,
|
|
"loss": 4.9281,
|
|
"mean_token_accuracy": 0.2204608216881752,
|
|
"num_tokens": 37387329.0,
|
|
"step": 16310
|
|
},
|
|
{
|
|
"entropy": 5.256574726104736,
|
|
"epoch": 1.5672430355427474,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047593976157929034,
|
|
"loss": 5.0302,
|
|
"mean_token_accuracy": 0.21475369334220887,
|
|
"num_tokens": 37398894.0,
|
|
"step": 16315
|
|
},
|
|
{
|
|
"entropy": 5.231758117675781,
|
|
"epoch": 1.5677233429394812,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004759243341678987,
|
|
"loss": 4.937,
|
|
"mean_token_accuracy": 0.21920875310897828,
|
|
"num_tokens": 37409913.0,
|
|
"step": 16320
|
|
},
|
|
{
|
|
"entropy": 5.217631864547729,
|
|
"epoch": 1.5682036503362151,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004759089020916407,
|
|
"loss": 4.899,
|
|
"mean_token_accuracy": 0.21592830419540404,
|
|
"num_tokens": 37421859.0,
|
|
"step": 16325
|
|
},
|
|
{
|
|
"entropy": 5.188553094863892,
|
|
"epoch": 1.568683957732949,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047589346535087444,
|
|
"loss": 4.9689,
|
|
"mean_token_accuracy": 0.21344497352838515,
|
|
"num_tokens": 37432827.0,
|
|
"step": 16330
|
|
},
|
|
{
|
|
"entropy": 5.171142482757569,
|
|
"epoch": 1.5691642651296829,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004758780239459586,
|
|
"loss": 4.9727,
|
|
"mean_token_accuracy": 0.22152907252311707,
|
|
"num_tokens": 37444171.0,
|
|
"step": 16335
|
|
},
|
|
{
|
|
"entropy": 5.194261264801026,
|
|
"epoch": 1.5696445725264168,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004758625778772514,
|
|
"loss": 4.9252,
|
|
"mean_token_accuracy": 0.2152291163802147,
|
|
"num_tokens": 37455194.0,
|
|
"step": 16340
|
|
},
|
|
{
|
|
"entropy": 5.0763763904571535,
|
|
"epoch": 1.5701248799231509,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00047584712714511166,
|
|
"loss": 4.868,
|
|
"mean_token_accuracy": 0.22146839201450347,
|
|
"num_tokens": 37465839.0,
|
|
"step": 16345
|
|
},
|
|
{
|
|
"entropy": 5.157925367355347,
|
|
"epoch": 1.5706051873198847,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00047583167174989797,
|
|
"loss": 4.9428,
|
|
"mean_token_accuracy": 0.22237591743469237,
|
|
"num_tokens": 37476948.0,
|
|
"step": 16350
|
|
},
|
|
{
|
|
"entropy": 5.198430585861206,
|
|
"epoch": 1.5710854947166186,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004758162116919692,
|
|
"loss": 4.9383,
|
|
"mean_token_accuracy": 0.21987285912036897,
|
|
"num_tokens": 37487382.0,
|
|
"step": 16355
|
|
},
|
|
{
|
|
"entropy": 5.106133604049683,
|
|
"epoch": 1.5715658021133525,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047580074697168434,
|
|
"loss": 4.9045,
|
|
"mean_token_accuracy": 0.22711621075868607,
|
|
"num_tokens": 37498347.0,
|
|
"step": 16360
|
|
},
|
|
{
|
|
"entropy": 5.189422130584717,
|
|
"epoch": 1.5720461095100866,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047578527758940236,
|
|
"loss": 5.0108,
|
|
"mean_token_accuracy": 0.20658042430877685,
|
|
"num_tokens": 37509620.0,
|
|
"step": 16365
|
|
},
|
|
{
|
|
"entropy": 5.200649690628052,
|
|
"epoch": 1.5725264169068205,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004757698035454825,
|
|
"loss": 4.8942,
|
|
"mean_token_accuracy": 0.22583490014076232,
|
|
"num_tokens": 37519768.0,
|
|
"step": 16370
|
|
},
|
|
{
|
|
"entropy": 5.118372964859009,
|
|
"epoch": 1.5730067243035544,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004757543248402839,
|
|
"loss": 4.8915,
|
|
"mean_token_accuracy": 0.21790195405483245,
|
|
"num_tokens": 37530476.0,
|
|
"step": 16375
|
|
},
|
|
{
|
|
"entropy": 5.189021921157837,
|
|
"epoch": 1.5734870317002883,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047573884147416597,
|
|
"loss": 5.0071,
|
|
"mean_token_accuracy": 0.21477911174297332,
|
|
"num_tokens": 37541702.0,
|
|
"step": 16380
|
|
},
|
|
{
|
|
"entropy": 5.234851312637329,
|
|
"epoch": 1.5739673390970221,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004757233534474883,
|
|
"loss": 4.9618,
|
|
"mean_token_accuracy": 0.21515962332487107,
|
|
"num_tokens": 37553323.0,
|
|
"step": 16385
|
|
},
|
|
{
|
|
"entropy": 5.239250135421753,
|
|
"epoch": 1.574447646493756,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004757078607606103,
|
|
"loss": 5.0426,
|
|
"mean_token_accuracy": 0.20883728861808776,
|
|
"num_tokens": 37564882.0,
|
|
"step": 16390
|
|
},
|
|
{
|
|
"entropy": 5.2088868618011475,
|
|
"epoch": 1.57492795389049,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004756923634138918,
|
|
"loss": 4.9551,
|
|
"mean_token_accuracy": 0.21754217594861985,
|
|
"num_tokens": 37575612.0,
|
|
"step": 16395
|
|
},
|
|
{
|
|
"entropy": 5.184630298614502,
|
|
"epoch": 1.5754082612872238,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047567686140769264,
|
|
"loss": 4.9768,
|
|
"mean_token_accuracy": 0.21821277886629104,
|
|
"num_tokens": 37587089.0,
|
|
"step": 16400
|
|
},
|
|
{
|
|
"entropy": 5.175845193862915,
|
|
"epoch": 1.5758885686839577,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047566135474237247,
|
|
"loss": 4.9184,
|
|
"mean_token_accuracy": 0.21809831261634827,
|
|
"num_tokens": 37598429.0,
|
|
"step": 16405
|
|
},
|
|
{
|
|
"entropy": 5.239535188674926,
|
|
"epoch": 1.5763688760806915,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047564584341829166,
|
|
"loss": 5.0381,
|
|
"mean_token_accuracy": 0.21282682567834854,
|
|
"num_tokens": 37609769.0,
|
|
"step": 16410
|
|
},
|
|
{
|
|
"entropy": 5.221446514129639,
|
|
"epoch": 1.5768491834774254,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00047563032743581,
|
|
"loss": 5.087,
|
|
"mean_token_accuracy": 0.20481704473495482,
|
|
"num_tokens": 37622425.0,
|
|
"step": 16415
|
|
},
|
|
{
|
|
"entropy": 5.187086963653565,
|
|
"epoch": 1.5773294908741593,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00047561480679528804,
|
|
"loss": 4.9807,
|
|
"mean_token_accuracy": 0.21880702823400497,
|
|
"num_tokens": 37634819.0,
|
|
"step": 16420
|
|
},
|
|
{
|
|
"entropy": 5.198013353347778,
|
|
"epoch": 1.5778097982708934,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004755992814970859,
|
|
"loss": 4.8433,
|
|
"mean_token_accuracy": 0.2195771813392639,
|
|
"num_tokens": 37645531.0,
|
|
"step": 16425
|
|
},
|
|
{
|
|
"entropy": 5.127077627182007,
|
|
"epoch": 1.5782901056676273,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000475583751541564,
|
|
"loss": 4.9271,
|
|
"mean_token_accuracy": 0.21498081386089324,
|
|
"num_tokens": 37657243.0,
|
|
"step": 16430
|
|
},
|
|
{
|
|
"entropy": 5.225577545166016,
|
|
"epoch": 1.5787704130643612,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047556821692908315,
|
|
"loss": 4.9839,
|
|
"mean_token_accuracy": 0.21665328592061997,
|
|
"num_tokens": 37668218.0,
|
|
"step": 16435
|
|
},
|
|
{
|
|
"entropy": 5.249110507965088,
|
|
"epoch": 1.579250720461095,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004755526776600038,
|
|
"loss": 4.9283,
|
|
"mean_token_accuracy": 0.21837957799434662,
|
|
"num_tokens": 37680173.0,
|
|
"step": 16440
|
|
},
|
|
{
|
|
"entropy": 5.161304950714111,
|
|
"epoch": 1.5797310278578292,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047553713373468684,
|
|
"loss": 4.9597,
|
|
"mean_token_accuracy": 0.21027158498764037,
|
|
"num_tokens": 37691281.0,
|
|
"step": 16445
|
|
},
|
|
{
|
|
"entropy": 5.209333801269532,
|
|
"epoch": 1.580211335254563,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047552158515349306,
|
|
"loss": 5.0076,
|
|
"mean_token_accuracy": 0.20411573201417924,
|
|
"num_tokens": 37702320.0,
|
|
"step": 16450
|
|
},
|
|
{
|
|
"entropy": 5.214414644241333,
|
|
"epoch": 1.580691642651297,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047550603191678356,
|
|
"loss": 4.9337,
|
|
"mean_token_accuracy": 0.21256616711616516,
|
|
"num_tokens": 37713724.0,
|
|
"step": 16455
|
|
},
|
|
{
|
|
"entropy": 5.193028688430786,
|
|
"epoch": 1.5811719500480308,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004754904740249194,
|
|
"loss": 4.9276,
|
|
"mean_token_accuracy": 0.21470242887735366,
|
|
"num_tokens": 37724532.0,
|
|
"step": 16460
|
|
},
|
|
{
|
|
"entropy": 5.131993532180786,
|
|
"epoch": 1.5816522574447647,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047547491147826156,
|
|
"loss": 4.8838,
|
|
"mean_token_accuracy": 0.22787191569805146,
|
|
"num_tokens": 37735174.0,
|
|
"step": 16465
|
|
},
|
|
{
|
|
"entropy": 5.177940845489502,
|
|
"epoch": 1.5821325648414986,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004754593442771718,
|
|
"loss": 5.0392,
|
|
"mean_token_accuracy": 0.21157672852277756,
|
|
"num_tokens": 37746981.0,
|
|
"step": 16470
|
|
},
|
|
{
|
|
"entropy": 5.217215061187744,
|
|
"epoch": 1.5826128722382324,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047544377242201115,
|
|
"loss": 4.989,
|
|
"mean_token_accuracy": 0.2165716901421547,
|
|
"num_tokens": 37757576.0,
|
|
"step": 16475
|
|
},
|
|
{
|
|
"entropy": 5.216026020050049,
|
|
"epoch": 1.5830931796349663,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047542819591314136,
|
|
"loss": 5.006,
|
|
"mean_token_accuracy": 0.2124703660607338,
|
|
"num_tokens": 37769053.0,
|
|
"step": 16480
|
|
},
|
|
{
|
|
"entropy": 5.248029994964599,
|
|
"epoch": 1.5835734870317002,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004754126147509241,
|
|
"loss": 4.9664,
|
|
"mean_token_accuracy": 0.2157026171684265,
|
|
"num_tokens": 37780517.0,
|
|
"step": 16485
|
|
},
|
|
{
|
|
"entropy": 5.269943332672119,
|
|
"epoch": 1.584053794428434,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047539702893572086,
|
|
"loss": 5.0513,
|
|
"mean_token_accuracy": 0.20178954899311066,
|
|
"num_tokens": 37791333.0,
|
|
"step": 16490
|
|
},
|
|
{
|
|
"entropy": 5.14649658203125,
|
|
"epoch": 1.584534101825168,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047538143846789376,
|
|
"loss": 4.9061,
|
|
"mean_token_accuracy": 0.22500312328338623,
|
|
"num_tokens": 37802635.0,
|
|
"step": 16495
|
|
},
|
|
{
|
|
"entropy": 5.0807657718658445,
|
|
"epoch": 1.585014409221902,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004753658433478047,
|
|
"loss": 4.8146,
|
|
"mean_token_accuracy": 0.22304627895355225,
|
|
"num_tokens": 37814788.0,
|
|
"step": 16500
|
|
},
|
|
{
|
|
"entropy": 5.194208145141602,
|
|
"epoch": 1.585494716618636,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047535024357581564,
|
|
"loss": 4.9254,
|
|
"mean_token_accuracy": 0.2166296660900116,
|
|
"num_tokens": 37826650.0,
|
|
"step": 16505
|
|
},
|
|
{
|
|
"entropy": 5.219819498062134,
|
|
"epoch": 1.5859750240153698,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004753346391522889,
|
|
"loss": 5.0167,
|
|
"mean_token_accuracy": 0.21236117631196977,
|
|
"num_tokens": 37839362.0,
|
|
"step": 16510
|
|
},
|
|
{
|
|
"entropy": 5.205073118209839,
|
|
"epoch": 1.5864553314121037,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047531903007758667,
|
|
"loss": 4.9954,
|
|
"mean_token_accuracy": 0.209988933801651,
|
|
"num_tokens": 37851057.0,
|
|
"step": 16515
|
|
},
|
|
{
|
|
"entropy": 5.181534385681152,
|
|
"epoch": 1.5869356388088378,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004753034163520714,
|
|
"loss": 4.8959,
|
|
"mean_token_accuracy": 0.22577075511217118,
|
|
"num_tokens": 37863507.0,
|
|
"step": 16520
|
|
},
|
|
{
|
|
"entropy": 5.188526678085327,
|
|
"epoch": 1.5874159462055717,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047528779797610557,
|
|
"loss": 4.9664,
|
|
"mean_token_accuracy": 0.2138543888926506,
|
|
"num_tokens": 37874439.0,
|
|
"step": 16525
|
|
},
|
|
{
|
|
"entropy": 5.17763729095459,
|
|
"epoch": 1.5878962536023056,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00047527217495005184,
|
|
"loss": 4.9292,
|
|
"mean_token_accuracy": 0.21783537715673446,
|
|
"num_tokens": 37886720.0,
|
|
"step": 16530
|
|
},
|
|
{
|
|
"entropy": 5.214607858657837,
|
|
"epoch": 1.5883765609990395,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047525654727427285,
|
|
"loss": 5.0085,
|
|
"mean_token_accuracy": 0.21364359855651854,
|
|
"num_tokens": 37897919.0,
|
|
"step": 16535
|
|
},
|
|
{
|
|
"entropy": 5.20990104675293,
|
|
"epoch": 1.5888568683957733,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004752409149491315,
|
|
"loss": 5.0482,
|
|
"mean_token_accuracy": 0.21286925077438354,
|
|
"num_tokens": 37911061.0,
|
|
"step": 16540
|
|
},
|
|
{
|
|
"entropy": 5.240172100067139,
|
|
"epoch": 1.5893371757925072,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047522527797499075,
|
|
"loss": 5.0121,
|
|
"mean_token_accuracy": 0.21240307092666627,
|
|
"num_tokens": 37922677.0,
|
|
"step": 16545
|
|
},
|
|
{
|
|
"entropy": 5.096889591217041,
|
|
"epoch": 1.589817483189241,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004752096363522135,
|
|
"loss": 4.8004,
|
|
"mean_token_accuracy": 0.2269800528883934,
|
|
"num_tokens": 37932802.0,
|
|
"step": 16550
|
|
},
|
|
{
|
|
"entropy": 5.175290727615357,
|
|
"epoch": 1.590297790585975,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047519399008116305,
|
|
"loss": 4.9299,
|
|
"mean_token_accuracy": 0.21652191430330275,
|
|
"num_tokens": 37944782.0,
|
|
"step": 16555
|
|
},
|
|
{
|
|
"entropy": 5.169613695144653,
|
|
"epoch": 1.5907780979827089,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004751783391622026,
|
|
"loss": 4.9413,
|
|
"mean_token_accuracy": 0.21697622388601304,
|
|
"num_tokens": 37956577.0,
|
|
"step": 16560
|
|
},
|
|
{
|
|
"entropy": 5.214099884033203,
|
|
"epoch": 1.5912584053794427,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004751626835956955,
|
|
"loss": 4.9855,
|
|
"mean_token_accuracy": 0.21233401596546173,
|
|
"num_tokens": 37967937.0,
|
|
"step": 16565
|
|
},
|
|
{
|
|
"entropy": 5.126390886306763,
|
|
"epoch": 1.5917387127761766,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004751470233820053,
|
|
"loss": 4.9299,
|
|
"mean_token_accuracy": 0.22897855043411255,
|
|
"num_tokens": 37978113.0,
|
|
"step": 16570
|
|
},
|
|
{
|
|
"entropy": 5.143404293060303,
|
|
"epoch": 1.5922190201729105,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004751313585214955,
|
|
"loss": 4.8748,
|
|
"mean_token_accuracy": 0.22316106110811235,
|
|
"num_tokens": 37988852.0,
|
|
"step": 16575
|
|
},
|
|
{
|
|
"entropy": 5.231024122238159,
|
|
"epoch": 1.5926993275696446,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004751156890145298,
|
|
"loss": 4.9576,
|
|
"mean_token_accuracy": 0.2123672142624855,
|
|
"num_tokens": 38000531.0,
|
|
"step": 16580
|
|
},
|
|
{
|
|
"entropy": 5.1332155704498295,
|
|
"epoch": 1.5931796349663785,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000475100014861472,
|
|
"loss": 4.932,
|
|
"mean_token_accuracy": 0.22418509423732758,
|
|
"num_tokens": 38011818.0,
|
|
"step": 16585
|
|
},
|
|
{
|
|
"entropy": 5.2205602645874025,
|
|
"epoch": 1.5936599423631124,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004750843360626861,
|
|
"loss": 4.9578,
|
|
"mean_token_accuracy": 0.22010722607374192,
|
|
"num_tokens": 38022455.0,
|
|
"step": 16590
|
|
},
|
|
{
|
|
"entropy": 5.125461912155151,
|
|
"epoch": 1.5941402497598463,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004750686526185359,
|
|
"loss": 4.9286,
|
|
"mean_token_accuracy": 0.22543989717960358,
|
|
"num_tokens": 38033126.0,
|
|
"step": 16595
|
|
},
|
|
{
|
|
"entropy": 5.033788013458252,
|
|
"epoch": 1.5946205571565804,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047505296452938584,
|
|
"loss": 4.8884,
|
|
"mean_token_accuracy": 0.21922594010829927,
|
|
"num_tokens": 38044935.0,
|
|
"step": 16600
|
|
},
|
|
{
|
|
"entropy": 5.212667560577392,
|
|
"epoch": 1.5951008645533142,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047503727179559995,
|
|
"loss": 4.9732,
|
|
"mean_token_accuracy": 0.21320051848888397,
|
|
"num_tokens": 38056080.0,
|
|
"step": 16605
|
|
},
|
|
{
|
|
"entropy": 5.208511447906494,
|
|
"epoch": 1.5955811719500481,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047502157441754256,
|
|
"loss": 4.9921,
|
|
"mean_token_accuracy": 0.2140924945473671,
|
|
"num_tokens": 38066788.0,
|
|
"step": 16610
|
|
},
|
|
{
|
|
"entropy": 5.246877956390381,
|
|
"epoch": 1.596061479346782,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004750058723955781,
|
|
"loss": 5.0093,
|
|
"mean_token_accuracy": 0.21564434170722963,
|
|
"num_tokens": 38079529.0,
|
|
"step": 16615
|
|
},
|
|
{
|
|
"entropy": 5.1925897121429445,
|
|
"epoch": 1.5965417867435159,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004749901657300713,
|
|
"loss": 4.9602,
|
|
"mean_token_accuracy": 0.21586501747369766,
|
|
"num_tokens": 38091484.0,
|
|
"step": 16620
|
|
},
|
|
{
|
|
"entropy": 5.188087844848633,
|
|
"epoch": 1.5970220941402498,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047497445442138667,
|
|
"loss": 4.9166,
|
|
"mean_token_accuracy": 0.21774567365646363,
|
|
"num_tokens": 38103102.0,
|
|
"step": 16625
|
|
},
|
|
{
|
|
"entropy": 5.196709108352661,
|
|
"epoch": 1.5975024015369836,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047495873846988896,
|
|
"loss": 5.0333,
|
|
"mean_token_accuracy": 0.2103504180908203,
|
|
"num_tokens": 38115565.0,
|
|
"step": 16630
|
|
},
|
|
{
|
|
"entropy": 5.170992374420166,
|
|
"epoch": 1.5979827089337175,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004749430178759431,
|
|
"loss": 4.9409,
|
|
"mean_token_accuracy": 0.217051962018013,
|
|
"num_tokens": 38126741.0,
|
|
"step": 16635
|
|
},
|
|
{
|
|
"entropy": 5.17347526550293,
|
|
"epoch": 1.5984630163304514,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00047492729263991413,
|
|
"loss": 4.9496,
|
|
"mean_token_accuracy": 0.2166967958211899,
|
|
"num_tokens": 38137149.0,
|
|
"step": 16640
|
|
},
|
|
{
|
|
"entropy": 5.188420677185059,
|
|
"epoch": 1.5989433237271853,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047491156276216695,
|
|
"loss": 4.8864,
|
|
"mean_token_accuracy": 0.2285622701048851,
|
|
"num_tokens": 38148537.0,
|
|
"step": 16645
|
|
},
|
|
{
|
|
"entropy": 5.168776988983154,
|
|
"epoch": 1.5994236311239192,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047489582824306704,
|
|
"loss": 4.9282,
|
|
"mean_token_accuracy": 0.21946836411952972,
|
|
"num_tokens": 38159097.0,
|
|
"step": 16650
|
|
},
|
|
{
|
|
"entropy": 5.191212558746338,
|
|
"epoch": 1.5999039385206533,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047488008908297955,
|
|
"loss": 4.9028,
|
|
"mean_token_accuracy": 0.22523313760757446,
|
|
"num_tokens": 38171400.0,
|
|
"step": 16655
|
|
},
|
|
{
|
|
"entropy": 5.212293004989624,
|
|
"epoch": 1.6003842459173871,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004748643452822699,
|
|
"loss": 4.955,
|
|
"mean_token_accuracy": 0.21418403089046478,
|
|
"num_tokens": 38181876.0,
|
|
"step": 16660
|
|
},
|
|
{
|
|
"entropy": 5.211285972595215,
|
|
"epoch": 1.600864553314121,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004748485968413036,
|
|
"loss": 4.9318,
|
|
"mean_token_accuracy": 0.22244168519973756,
|
|
"num_tokens": 38193680.0,
|
|
"step": 16665
|
|
},
|
|
{
|
|
"entropy": 5.134846878051758,
|
|
"epoch": 1.601344860710855,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047483284376044634,
|
|
"loss": 4.9106,
|
|
"mean_token_accuracy": 0.2231910213828087,
|
|
"num_tokens": 38204944.0,
|
|
"step": 16670
|
|
},
|
|
{
|
|
"entropy": 5.129866218566894,
|
|
"epoch": 1.601825168107589,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004748170860400638,
|
|
"loss": 4.8625,
|
|
"mean_token_accuracy": 0.2263529285788536,
|
|
"num_tokens": 38216381.0,
|
|
"step": 16675
|
|
},
|
|
{
|
|
"entropy": 5.1508636474609375,
|
|
"epoch": 1.602305475504323,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047480132368052185,
|
|
"loss": 4.8845,
|
|
"mean_token_accuracy": 0.22613088488578797,
|
|
"num_tokens": 38227420.0,
|
|
"step": 16680
|
|
},
|
|
{
|
|
"entropy": 5.208236646652222,
|
|
"epoch": 1.6027857829010568,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00047478555668218643,
|
|
"loss": 5.0062,
|
|
"mean_token_accuracy": 0.21762621849775315,
|
|
"num_tokens": 38237869.0,
|
|
"step": 16685
|
|
},
|
|
{
|
|
"entropy": 5.164920616149902,
|
|
"epoch": 1.6032660902977907,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004747697850454237,
|
|
"loss": 4.9765,
|
|
"mean_token_accuracy": 0.2174433395266533,
|
|
"num_tokens": 38250362.0,
|
|
"step": 16690
|
|
},
|
|
{
|
|
"entropy": 5.280116271972656,
|
|
"epoch": 1.6037463976945245,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004747540087705997,
|
|
"loss": 5.0409,
|
|
"mean_token_accuracy": 0.21639316529035568,
|
|
"num_tokens": 38262887.0,
|
|
"step": 16695
|
|
},
|
|
{
|
|
"entropy": 5.230034446716308,
|
|
"epoch": 1.6042267050912584,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004747382278580808,
|
|
"loss": 4.9034,
|
|
"mean_token_accuracy": 0.2294904425740242,
|
|
"num_tokens": 38273206.0,
|
|
"step": 16700
|
|
},
|
|
{
|
|
"entropy": 5.185104942321777,
|
|
"epoch": 1.6047070124879923,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004747224423082333,
|
|
"loss": 4.9173,
|
|
"mean_token_accuracy": 0.21850554943084716,
|
|
"num_tokens": 38283307.0,
|
|
"step": 16705
|
|
},
|
|
{
|
|
"entropy": 5.211791467666626,
|
|
"epoch": 1.6051873198847262,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047470665212142384,
|
|
"loss": 4.9883,
|
|
"mean_token_accuracy": 0.21574016958475112,
|
|
"num_tokens": 38293830.0,
|
|
"step": 16710
|
|
},
|
|
{
|
|
"entropy": 5.214894819259643,
|
|
"epoch": 1.60566762728146,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047469085729801887,
|
|
"loss": 4.9774,
|
|
"mean_token_accuracy": 0.219608137011528,
|
|
"num_tokens": 38306523.0,
|
|
"step": 16715
|
|
},
|
|
{
|
|
"entropy": 5.200537395477295,
|
|
"epoch": 1.606147934678194,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047467505783838515,
|
|
"loss": 4.9473,
|
|
"mean_token_accuracy": 0.2235700950026512,
|
|
"num_tokens": 38318468.0,
|
|
"step": 16720
|
|
},
|
|
{
|
|
"entropy": 5.2180544376373295,
|
|
"epoch": 1.6066282420749278,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004746592537428895,
|
|
"loss": 4.8867,
|
|
"mean_token_accuracy": 0.22414906024932862,
|
|
"num_tokens": 38329852.0,
|
|
"step": 16725
|
|
},
|
|
{
|
|
"entropy": 5.076629400253296,
|
|
"epoch": 1.6071085494716617,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047464344501189877,
|
|
"loss": 4.8605,
|
|
"mean_token_accuracy": 0.23030537664890288,
|
|
"num_tokens": 38340951.0,
|
|
"step": 16730
|
|
},
|
|
{
|
|
"entropy": 5.25096173286438,
|
|
"epoch": 1.6075888568683958,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047462763164578015,
|
|
"loss": 5.0228,
|
|
"mean_token_accuracy": 0.21357613205909728,
|
|
"num_tokens": 38351490.0,
|
|
"step": 16735
|
|
},
|
|
{
|
|
"entropy": 5.198782014846802,
|
|
"epoch": 1.6080691642651297,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004746118136449007,
|
|
"loss": 4.9292,
|
|
"mean_token_accuracy": 0.2180192857980728,
|
|
"num_tokens": 38362364.0,
|
|
"step": 16740
|
|
},
|
|
{
|
|
"entropy": 5.180115175247193,
|
|
"epoch": 1.6085494716618636,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004745959910096276,
|
|
"loss": 4.9898,
|
|
"mean_token_accuracy": 0.22322781383991241,
|
|
"num_tokens": 38374132.0,
|
|
"step": 16745
|
|
},
|
|
{
|
|
"entropy": 5.1993663787841795,
|
|
"epoch": 1.6090297790585975,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047458016374032837,
|
|
"loss": 4.9685,
|
|
"mean_token_accuracy": 0.2143290311098099,
|
|
"num_tokens": 38384608.0,
|
|
"step": 16750
|
|
},
|
|
{
|
|
"entropy": 5.17359938621521,
|
|
"epoch": 1.6095100864553316,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004745643318373703,
|
|
"loss": 4.9836,
|
|
"mean_token_accuracy": 0.21605729013681413,
|
|
"num_tokens": 38396639.0,
|
|
"step": 16755
|
|
},
|
|
{
|
|
"entropy": 5.18921127319336,
|
|
"epoch": 1.6099903938520654,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047454849530112106,
|
|
"loss": 4.9898,
|
|
"mean_token_accuracy": 0.2078189730644226,
|
|
"num_tokens": 38407954.0,
|
|
"step": 16760
|
|
},
|
|
{
|
|
"entropy": 5.28509259223938,
|
|
"epoch": 1.6104707012487993,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00047453265413194826,
|
|
"loss": 4.9396,
|
|
"mean_token_accuracy": 0.2211918741464615,
|
|
"num_tokens": 38418939.0,
|
|
"step": 16765
|
|
},
|
|
{
|
|
"entropy": 5.21851601600647,
|
|
"epoch": 1.6109510086455332,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047451680833021973,
|
|
"loss": 4.9328,
|
|
"mean_token_accuracy": 0.21415177136659622,
|
|
"num_tokens": 38429717.0,
|
|
"step": 16770
|
|
},
|
|
{
|
|
"entropy": 5.12396125793457,
|
|
"epoch": 1.611431316042267,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004745009578963034,
|
|
"loss": 4.8761,
|
|
"mean_token_accuracy": 0.22479525655508042,
|
|
"num_tokens": 38441054.0,
|
|
"step": 16775
|
|
},
|
|
{
|
|
"entropy": 5.115128135681152,
|
|
"epoch": 1.611911623439001,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00047448510283056716,
|
|
"loss": 4.9116,
|
|
"mean_token_accuracy": 0.22436713427305222,
|
|
"num_tokens": 38451329.0,
|
|
"step": 16780
|
|
},
|
|
{
|
|
"entropy": 5.247094392776489,
|
|
"epoch": 1.6123919308357348,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047446924313337925,
|
|
"loss": 4.9893,
|
|
"mean_token_accuracy": 0.21566009074449538,
|
|
"num_tokens": 38462118.0,
|
|
"step": 16785
|
|
},
|
|
{
|
|
"entropy": 5.180738306045532,
|
|
"epoch": 1.6128722382324687,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047445337880510773,
|
|
"loss": 4.9299,
|
|
"mean_token_accuracy": 0.22173037976026536,
|
|
"num_tokens": 38472642.0,
|
|
"step": 16790
|
|
},
|
|
{
|
|
"entropy": 5.087857055664062,
|
|
"epoch": 1.6133525456292026,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004744375098461211,
|
|
"loss": 4.8573,
|
|
"mean_token_accuracy": 0.22567004710435867,
|
|
"num_tokens": 38483866.0,
|
|
"step": 16795
|
|
},
|
|
{
|
|
"entropy": 5.136823225021362,
|
|
"epoch": 1.6138328530259365,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004744216362567876,
|
|
"loss": 4.856,
|
|
"mean_token_accuracy": 0.2275155559182167,
|
|
"num_tokens": 38494164.0,
|
|
"step": 16800
|
|
},
|
|
{
|
|
"entropy": 5.250975799560547,
|
|
"epoch": 1.6143131604226704,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047440575803747595,
|
|
"loss": 5.0822,
|
|
"mean_token_accuracy": 0.20916907489299774,
|
|
"num_tokens": 38505340.0,
|
|
"step": 16805
|
|
},
|
|
{
|
|
"entropy": 5.361870002746582,
|
|
"epoch": 1.6147934678194045,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047438987518855463,
|
|
"loss": 5.082,
|
|
"mean_token_accuracy": 0.20903967767953874,
|
|
"num_tokens": 38516164.0,
|
|
"step": 16810
|
|
},
|
|
{
|
|
"entropy": 5.22707691192627,
|
|
"epoch": 1.6152737752161384,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004743739877103926,
|
|
"loss": 4.9441,
|
|
"mean_token_accuracy": 0.21966830492019654,
|
|
"num_tokens": 38526545.0,
|
|
"step": 16815
|
|
},
|
|
{
|
|
"entropy": 5.091162443161011,
|
|
"epoch": 1.6157540826128722,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004743580956033585,
|
|
"loss": 4.9081,
|
|
"mean_token_accuracy": 0.2183246672153473,
|
|
"num_tokens": 38538102.0,
|
|
"step": 16820
|
|
},
|
|
{
|
|
"entropy": 5.256732702255249,
|
|
"epoch": 1.6162343900096061,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00047434219886782135,
|
|
"loss": 5.0344,
|
|
"mean_token_accuracy": 0.21077128499746323,
|
|
"num_tokens": 38550594.0,
|
|
"step": 16825
|
|
},
|
|
{
|
|
"entropy": 5.209319257736206,
|
|
"epoch": 1.6167146974063402,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004743262975041504,
|
|
"loss": 4.9029,
|
|
"mean_token_accuracy": 0.2212669938802719,
|
|
"num_tokens": 38561666.0,
|
|
"step": 16830
|
|
},
|
|
{
|
|
"entropy": 5.135405111312866,
|
|
"epoch": 1.617195004803074,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004743103915127146,
|
|
"loss": 4.8723,
|
|
"mean_token_accuracy": 0.22310091853141784,
|
|
"num_tokens": 38572923.0,
|
|
"step": 16835
|
|
},
|
|
{
|
|
"entropy": 5.170195627212524,
|
|
"epoch": 1.617675312199808,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00047429448089388336,
|
|
"loss": 4.9256,
|
|
"mean_token_accuracy": 0.22085566222667694,
|
|
"num_tokens": 38584108.0,
|
|
"step": 16840
|
|
},
|
|
{
|
|
"entropy": 5.3052619934082035,
|
|
"epoch": 1.6181556195965419,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047427856564802605,
|
|
"loss": 5.052,
|
|
"mean_token_accuracy": 0.20214477479457854,
|
|
"num_tokens": 38594974.0,
|
|
"step": 16845
|
|
},
|
|
{
|
|
"entropy": 5.246438884735108,
|
|
"epoch": 1.6186359269932757,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004742626457755122,
|
|
"loss": 5.0071,
|
|
"mean_token_accuracy": 0.2198152020573616,
|
|
"num_tokens": 38606436.0,
|
|
"step": 16850
|
|
},
|
|
{
|
|
"entropy": 5.152284097671509,
|
|
"epoch": 1.6191162343900096,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004742467212767114,
|
|
"loss": 4.8889,
|
|
"mean_token_accuracy": 0.2204935997724533,
|
|
"num_tokens": 38618760.0,
|
|
"step": 16855
|
|
},
|
|
{
|
|
"entropy": 5.138833665847779,
|
|
"epoch": 1.6195965417867435,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004742307921519933,
|
|
"loss": 4.8611,
|
|
"mean_token_accuracy": 0.2265857771039009,
|
|
"num_tokens": 38629901.0,
|
|
"step": 16860
|
|
},
|
|
{
|
|
"entropy": 5.268418407440185,
|
|
"epoch": 1.6200768491834774,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047421485840172794,
|
|
"loss": 5.0478,
|
|
"mean_token_accuracy": 0.20666339248418808,
|
|
"num_tokens": 38640798.0,
|
|
"step": 16865
|
|
},
|
|
{
|
|
"entropy": 5.245220804214478,
|
|
"epoch": 1.6205571565802113,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004741989200262851,
|
|
"loss": 4.9676,
|
|
"mean_token_accuracy": 0.21834530234336852,
|
|
"num_tokens": 38653048.0,
|
|
"step": 16870
|
|
},
|
|
{
|
|
"entropy": 5.228298997879028,
|
|
"epoch": 1.6210374639769451,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004741829770260347,
|
|
"loss": 5.0116,
|
|
"mean_token_accuracy": 0.21510974317789078,
|
|
"num_tokens": 38666219.0,
|
|
"step": 16875
|
|
},
|
|
{
|
|
"entropy": 5.2042152881622314,
|
|
"epoch": 1.621517771373679,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047416702940134714,
|
|
"loss": 4.909,
|
|
"mean_token_accuracy": 0.22228912860155106,
|
|
"num_tokens": 38678584.0,
|
|
"step": 16880
|
|
},
|
|
{
|
|
"entropy": 5.181402587890625,
|
|
"epoch": 1.621998078770413,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00047415107715259255,
|
|
"loss": 4.9377,
|
|
"mean_token_accuracy": 0.21005474478006364,
|
|
"num_tokens": 38690164.0,
|
|
"step": 16885
|
|
},
|
|
{
|
|
"entropy": 5.208297920227051,
|
|
"epoch": 1.622478386167147,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047413512028014125,
|
|
"loss": 5.0103,
|
|
"mean_token_accuracy": 0.21610539108514787,
|
|
"num_tokens": 38702035.0,
|
|
"step": 16890
|
|
},
|
|
{
|
|
"entropy": 5.2519388675689695,
|
|
"epoch": 1.622958693563881,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004741191587843638,
|
|
"loss": 5.0296,
|
|
"mean_token_accuracy": 0.21492973119020461,
|
|
"num_tokens": 38713802.0,
|
|
"step": 16895
|
|
},
|
|
{
|
|
"entropy": 5.190322732925415,
|
|
"epoch": 1.6234390009606148,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004741031926656308,
|
|
"loss": 4.8923,
|
|
"mean_token_accuracy": 0.22572966963052749,
|
|
"num_tokens": 38725682.0,
|
|
"step": 16900
|
|
},
|
|
{
|
|
"entropy": 5.178541278839111,
|
|
"epoch": 1.6239193083573487,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004740872219243128,
|
|
"loss": 4.9072,
|
|
"mean_token_accuracy": 0.21873684823513032,
|
|
"num_tokens": 38737897.0,
|
|
"step": 16905
|
|
},
|
|
{
|
|
"entropy": 5.229857921600342,
|
|
"epoch": 1.6243996157540828,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004740712465607807,
|
|
"loss": 5.018,
|
|
"mean_token_accuracy": 0.2116595149040222,
|
|
"num_tokens": 38749446.0,
|
|
"step": 16910
|
|
},
|
|
{
|
|
"entropy": 5.176464700698853,
|
|
"epoch": 1.6248799231508166,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004740552665754054,
|
|
"loss": 4.9725,
|
|
"mean_token_accuracy": 0.21386126130819322,
|
|
"num_tokens": 38761406.0,
|
|
"step": 16915
|
|
},
|
|
{
|
|
"entropy": 5.204869890213013,
|
|
"epoch": 1.6253602305475505,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047403928196855776,
|
|
"loss": 4.976,
|
|
"mean_token_accuracy": 0.22327034771442414,
|
|
"num_tokens": 38772574.0,
|
|
"step": 16920
|
|
},
|
|
{
|
|
"entropy": 5.201022005081176,
|
|
"epoch": 1.6258405379442844,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047402329274060916,
|
|
"loss": 4.9489,
|
|
"mean_token_accuracy": 0.22577953338623047,
|
|
"num_tokens": 38782932.0,
|
|
"step": 16925
|
|
},
|
|
{
|
|
"entropy": 5.209049367904663,
|
|
"epoch": 1.6263208453410183,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004740072988919306,
|
|
"loss": 5.009,
|
|
"mean_token_accuracy": 0.21278314143419266,
|
|
"num_tokens": 38793799.0,
|
|
"step": 16930
|
|
},
|
|
{
|
|
"entropy": 5.073943376541138,
|
|
"epoch": 1.6268011527377522,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004739913004228936,
|
|
"loss": 4.8296,
|
|
"mean_token_accuracy": 0.2272112175822258,
|
|
"num_tokens": 38804899.0,
|
|
"step": 16935
|
|
},
|
|
{
|
|
"entropy": 5.17745361328125,
|
|
"epoch": 1.627281460134486,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004739752973338694,
|
|
"loss": 5.0203,
|
|
"mean_token_accuracy": 0.20754062682390212,
|
|
"num_tokens": 38816455.0,
|
|
"step": 16940
|
|
},
|
|
{
|
|
"entropy": 5.232742404937744,
|
|
"epoch": 1.62776176753122,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047395928962522965,
|
|
"loss": 4.9653,
|
|
"mean_token_accuracy": 0.21127762645483017,
|
|
"num_tokens": 38827735.0,
|
|
"step": 16945
|
|
},
|
|
{
|
|
"entropy": 5.1770717144012455,
|
|
"epoch": 1.6282420749279538,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047394327729734595,
|
|
"loss": 4.9796,
|
|
"mean_token_accuracy": 0.2126038447022438,
|
|
"num_tokens": 38838536.0,
|
|
"step": 16950
|
|
},
|
|
{
|
|
"entropy": 5.273472642898559,
|
|
"epoch": 1.6287223823246877,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004739272603505901,
|
|
"loss": 5.0236,
|
|
"mean_token_accuracy": 0.21635421216487885,
|
|
"num_tokens": 38849577.0,
|
|
"step": 16955
|
|
},
|
|
{
|
|
"entropy": 5.277729892730713,
|
|
"epoch": 1.6292026897214216,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000473911238785334,
|
|
"loss": 4.981,
|
|
"mean_token_accuracy": 0.21457867622375487,
|
|
"num_tokens": 38861370.0,
|
|
"step": 16960
|
|
},
|
|
{
|
|
"entropy": 5.16580753326416,
|
|
"epoch": 1.6296829971181557,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004738952126019496,
|
|
"loss": 4.927,
|
|
"mean_token_accuracy": 0.21482086628675462,
|
|
"num_tokens": 38872885.0,
|
|
"step": 16965
|
|
},
|
|
{
|
|
"entropy": 5.1759748458862305,
|
|
"epoch": 1.6301633045148896,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004738791818008089,
|
|
"loss": 4.9798,
|
|
"mean_token_accuracy": 0.21594414860010147,
|
|
"num_tokens": 38885440.0,
|
|
"step": 16970
|
|
},
|
|
{
|
|
"entropy": 5.167402601242065,
|
|
"epoch": 1.6306436119116234,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004738631463822841,
|
|
"loss": 4.8927,
|
|
"mean_token_accuracy": 0.22061660438776015,
|
|
"num_tokens": 38897108.0,
|
|
"step": 16975
|
|
},
|
|
{
|
|
"entropy": 5.139171504974366,
|
|
"epoch": 1.6311239193083573,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047384710634674766,
|
|
"loss": 4.9746,
|
|
"mean_token_accuracy": 0.21460178643465042,
|
|
"num_tokens": 38908020.0,
|
|
"step": 16980
|
|
},
|
|
{
|
|
"entropy": 5.231308460235596,
|
|
"epoch": 1.6316042267050914,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00047383106169457184,
|
|
"loss": 4.982,
|
|
"mean_token_accuracy": 0.21495762020349501,
|
|
"num_tokens": 38919558.0,
|
|
"step": 16985
|
|
},
|
|
{
|
|
"entropy": 5.212464714050293,
|
|
"epoch": 1.6320845341018253,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004738150124261292,
|
|
"loss": 4.9595,
|
|
"mean_token_accuracy": 0.22091935575008392,
|
|
"num_tokens": 38931008.0,
|
|
"step": 16990
|
|
},
|
|
{
|
|
"entropy": 5.120176219940186,
|
|
"epoch": 1.6325648414985592,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047379895854179226,
|
|
"loss": 4.8583,
|
|
"mean_token_accuracy": 0.22508623749017714,
|
|
"num_tokens": 38943474.0,
|
|
"step": 16995
|
|
},
|
|
{
|
|
"entropy": 5.216991710662842,
|
|
"epoch": 1.633045148895293,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004737829000419338,
|
|
"loss": 5.051,
|
|
"mean_token_accuracy": 0.2126992627978325,
|
|
"num_tokens": 38954592.0,
|
|
"step": 17000
|
|
},
|
|
{
|
|
"entropy": 5.210864067077637,
|
|
"epoch": 1.633525456292027,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047376683692692666,
|
|
"loss": 4.9369,
|
|
"mean_token_accuracy": 0.21950011253356932,
|
|
"num_tokens": 38965627.0,
|
|
"step": 17005
|
|
},
|
|
{
|
|
"entropy": 5.096020603179932,
|
|
"epoch": 1.6340057636887608,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004737507691971439,
|
|
"loss": 4.9038,
|
|
"mean_token_accuracy": 0.2222321853041649,
|
|
"num_tokens": 38975731.0,
|
|
"step": 17010
|
|
},
|
|
{
|
|
"entropy": 5.133788967132569,
|
|
"epoch": 1.6344860710854947,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047373469685295833,
|
|
"loss": 4.9094,
|
|
"mean_token_accuracy": 0.22627927511930465,
|
|
"num_tokens": 38987101.0,
|
|
"step": 17015
|
|
},
|
|
{
|
|
"entropy": 5.1970940113067625,
|
|
"epoch": 1.6349663784822286,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047371861989474326,
|
|
"loss": 4.957,
|
|
"mean_token_accuracy": 0.22381552755832673,
|
|
"num_tokens": 38999584.0,
|
|
"step": 17020
|
|
},
|
|
{
|
|
"entropy": 5.131382656097412,
|
|
"epoch": 1.6354466858789625,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004737025383228719,
|
|
"loss": 4.9378,
|
|
"mean_token_accuracy": 0.2247656613588333,
|
|
"num_tokens": 39012025.0,
|
|
"step": 17025
|
|
},
|
|
{
|
|
"entropy": 5.053452491760254,
|
|
"epoch": 1.6359269932756964,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047368645213771764,
|
|
"loss": 4.8227,
|
|
"mean_token_accuracy": 0.2244688794016838,
|
|
"num_tokens": 39024102.0,
|
|
"step": 17030
|
|
},
|
|
{
|
|
"entropy": 5.165864324569702,
|
|
"epoch": 1.6364073006724302,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000473670361339654,
|
|
"loss": 4.957,
|
|
"mean_token_accuracy": 0.2133333921432495,
|
|
"num_tokens": 39035295.0,
|
|
"step": 17035
|
|
},
|
|
{
|
|
"entropy": 5.164595079421997,
|
|
"epoch": 1.6368876080691641,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004736542659290544,
|
|
"loss": 4.8585,
|
|
"mean_token_accuracy": 0.22263574600219727,
|
|
"num_tokens": 39046493.0,
|
|
"step": 17040
|
|
},
|
|
{
|
|
"entropy": 5.168615007400513,
|
|
"epoch": 1.6373679154658982,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004736381659062927,
|
|
"loss": 4.9059,
|
|
"mean_token_accuracy": 0.21746231317520143,
|
|
"num_tokens": 39056746.0,
|
|
"step": 17045
|
|
},
|
|
{
|
|
"entropy": 5.183399057388305,
|
|
"epoch": 1.637848222862632,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047362206127174255,
|
|
"loss": 4.9483,
|
|
"mean_token_accuracy": 0.2127215713262558,
|
|
"num_tokens": 39068263.0,
|
|
"step": 17050
|
|
},
|
|
{
|
|
"entropy": 5.211107921600342,
|
|
"epoch": 1.638328530259366,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047360595202577786,
|
|
"loss": 4.9733,
|
|
"mean_token_accuracy": 0.21741271317005156,
|
|
"num_tokens": 39080666.0,
|
|
"step": 17055
|
|
},
|
|
{
|
|
"entropy": 5.103889846801758,
|
|
"epoch": 1.6388088376560999,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047358983816877284,
|
|
"loss": 4.9431,
|
|
"mean_token_accuracy": 0.21217281520366668,
|
|
"num_tokens": 39091851.0,
|
|
"step": 17060
|
|
},
|
|
{
|
|
"entropy": 5.235210275650024,
|
|
"epoch": 1.639289145052834,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004735737197011015,
|
|
"loss": 4.9498,
|
|
"mean_token_accuracy": 0.2229301705956459,
|
|
"num_tokens": 39102893.0,
|
|
"step": 17065
|
|
},
|
|
{
|
|
"entropy": 5.150740432739258,
|
|
"epoch": 1.6397694524495678,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00047355759662313793,
|
|
"loss": 4.8814,
|
|
"mean_token_accuracy": 0.2147470995783806,
|
|
"num_tokens": 39113582.0,
|
|
"step": 17070
|
|
},
|
|
{
|
|
"entropy": 5.24199595451355,
|
|
"epoch": 1.6402497598463017,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004735414689352566,
|
|
"loss": 5.0776,
|
|
"mean_token_accuracy": 0.21010097116231918,
|
|
"num_tokens": 39124616.0,
|
|
"step": 17075
|
|
},
|
|
{
|
|
"entropy": 5.12250599861145,
|
|
"epoch": 1.6407300672430356,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004735253366378318,
|
|
"loss": 4.8727,
|
|
"mean_token_accuracy": 0.22840944528579712,
|
|
"num_tokens": 39135537.0,
|
|
"step": 17080
|
|
},
|
|
{
|
|
"entropy": 5.1794140338897705,
|
|
"epoch": 1.6412103746397695,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004735091997312383,
|
|
"loss": 4.9501,
|
|
"mean_token_accuracy": 0.22090162485837936,
|
|
"num_tokens": 39146363.0,
|
|
"step": 17085
|
|
},
|
|
{
|
|
"entropy": 5.157111310958863,
|
|
"epoch": 1.6416906820365034,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047349305821585067,
|
|
"loss": 4.91,
|
|
"mean_token_accuracy": 0.22369770109653472,
|
|
"num_tokens": 39157319.0,
|
|
"step": 17090
|
|
},
|
|
{
|
|
"entropy": 5.173314619064331,
|
|
"epoch": 1.6421709894332372,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004734769120920435,
|
|
"loss": 4.9141,
|
|
"mean_token_accuracy": 0.22113776504993438,
|
|
"num_tokens": 39169352.0,
|
|
"step": 17095
|
|
},
|
|
{
|
|
"entropy": 5.206317377090454,
|
|
"epoch": 1.6426512968299711,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004734607613601919,
|
|
"loss": 4.9054,
|
|
"mean_token_accuracy": 0.22213377356529235,
|
|
"num_tokens": 39180989.0,
|
|
"step": 17100
|
|
},
|
|
{
|
|
"entropy": 5.174010324478149,
|
|
"epoch": 1.643131604226705,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047344460602067077,
|
|
"loss": 4.9605,
|
|
"mean_token_accuracy": 0.21831177175045013,
|
|
"num_tokens": 39192309.0,
|
|
"step": 17105
|
|
},
|
|
{
|
|
"entropy": 5.108195209503174,
|
|
"epoch": 1.643611911623439,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000473428446073855,
|
|
"loss": 4.9048,
|
|
"mean_token_accuracy": 0.21822543889284135,
|
|
"num_tokens": 39203586.0,
|
|
"step": 17110
|
|
},
|
|
{
|
|
"entropy": 5.1728309154510494,
|
|
"epoch": 1.6440922190201728,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047341228152012003,
|
|
"loss": 4.9358,
|
|
"mean_token_accuracy": 0.2138598531484604,
|
|
"num_tokens": 39216248.0,
|
|
"step": 17115
|
|
},
|
|
{
|
|
"entropy": 5.217025470733643,
|
|
"epoch": 1.6445725264169067,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000473396112359841,
|
|
"loss": 4.9216,
|
|
"mean_token_accuracy": 0.21717059910297393,
|
|
"num_tokens": 39227244.0,
|
|
"step": 17120
|
|
},
|
|
{
|
|
"entropy": 5.171856927871704,
|
|
"epoch": 1.6450528338136408,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047337993859339334,
|
|
"loss": 4.8631,
|
|
"mean_token_accuracy": 0.22649723738431932,
|
|
"num_tokens": 39238769.0,
|
|
"step": 17125
|
|
},
|
|
{
|
|
"entropy": 5.140405130386353,
|
|
"epoch": 1.6455331412103746,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047336376022115255,
|
|
"loss": 4.9605,
|
|
"mean_token_accuracy": 0.21623745262622834,
|
|
"num_tokens": 39251424.0,
|
|
"step": 17130
|
|
},
|
|
{
|
|
"entropy": 5.150231647491455,
|
|
"epoch": 1.6460134486071085,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00047334757724349437,
|
|
"loss": 4.9106,
|
|
"mean_token_accuracy": 0.2248495638370514,
|
|
"num_tokens": 39262451.0,
|
|
"step": 17135
|
|
},
|
|
{
|
|
"entropy": 5.156459474563599,
|
|
"epoch": 1.6464937560038426,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004733313896607943,
|
|
"loss": 4.9223,
|
|
"mean_token_accuracy": 0.2204621374607086,
|
|
"num_tokens": 39275129.0,
|
|
"step": 17140
|
|
},
|
|
{
|
|
"entropy": 5.195129108428955,
|
|
"epoch": 1.6469740634005765,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004733151974734284,
|
|
"loss": 5.0455,
|
|
"mean_token_accuracy": 0.21298815310001373,
|
|
"num_tokens": 39287399.0,
|
|
"step": 17145
|
|
},
|
|
{
|
|
"entropy": 5.2080107688903805,
|
|
"epoch": 1.6474543707973104,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047329900068177245,
|
|
"loss": 4.8904,
|
|
"mean_token_accuracy": 0.22203465551137924,
|
|
"num_tokens": 39297755.0,
|
|
"step": 17150
|
|
},
|
|
{
|
|
"entropy": 5.27492470741272,
|
|
"epoch": 1.6479346781940443,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047328279928620244,
|
|
"loss": 5.0159,
|
|
"mean_token_accuracy": 0.20743546783924102,
|
|
"num_tokens": 39311012.0,
|
|
"step": 17155
|
|
},
|
|
{
|
|
"entropy": 5.191003847122192,
|
|
"epoch": 1.6484149855907781,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004732665932870947,
|
|
"loss": 4.916,
|
|
"mean_token_accuracy": 0.22007073909044267,
|
|
"num_tokens": 39321797.0,
|
|
"step": 17160
|
|
},
|
|
{
|
|
"entropy": 5.178836727142334,
|
|
"epoch": 1.648895292987512,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00047325038268482544,
|
|
"loss": 4.8881,
|
|
"mean_token_accuracy": 0.23106451481580734,
|
|
"num_tokens": 39333598.0,
|
|
"step": 17165
|
|
},
|
|
{
|
|
"entropy": 5.215980386734008,
|
|
"epoch": 1.649375600384246,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004732341674797709,
|
|
"loss": 4.9832,
|
|
"mean_token_accuracy": 0.21182733327150344,
|
|
"num_tokens": 39345578.0,
|
|
"step": 17170
|
|
},
|
|
{
|
|
"entropy": 5.085534715652466,
|
|
"epoch": 1.6498559077809798,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047321794767230766,
|
|
"loss": 4.849,
|
|
"mean_token_accuracy": 0.2237042009830475,
|
|
"num_tokens": 39356744.0,
|
|
"step": 17175
|
|
},
|
|
{
|
|
"entropy": 5.203284645080567,
|
|
"epoch": 1.6503362151777137,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047320172326281224,
|
|
"loss": 4.9813,
|
|
"mean_token_accuracy": 0.21088655143976212,
|
|
"num_tokens": 39368138.0,
|
|
"step": 17180
|
|
},
|
|
{
|
|
"entropy": 5.088240718841552,
|
|
"epoch": 1.6508165225744476,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047318549425166134,
|
|
"loss": 4.825,
|
|
"mean_token_accuracy": 0.2208867460489273,
|
|
"num_tokens": 39380117.0,
|
|
"step": 17185
|
|
},
|
|
{
|
|
"entropy": 5.144798612594604,
|
|
"epoch": 1.6512968299711814,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004731692606392318,
|
|
"loss": 4.9032,
|
|
"mean_token_accuracy": 0.21989178657531738,
|
|
"num_tokens": 39390913.0,
|
|
"step": 17190
|
|
},
|
|
{
|
|
"entropy": 5.259878635406494,
|
|
"epoch": 1.6517771373679153,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004731530224259004,
|
|
"loss": 5.0399,
|
|
"mean_token_accuracy": 0.209253753721714,
|
|
"num_tokens": 39401001.0,
|
|
"step": 17195
|
|
},
|
|
{
|
|
"entropy": 5.167004823684692,
|
|
"epoch": 1.6522574447646494,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004731367796120442,
|
|
"loss": 4.9199,
|
|
"mean_token_accuracy": 0.2191713660955429,
|
|
"num_tokens": 39411982.0,
|
|
"step": 17200
|
|
},
|
|
{
|
|
"entropy": 5.236986684799194,
|
|
"epoch": 1.6527377521613833,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004731205321980404,
|
|
"loss": 4.9493,
|
|
"mean_token_accuracy": 0.21665553301572799,
|
|
"num_tokens": 39423363.0,
|
|
"step": 17205
|
|
},
|
|
{
|
|
"entropy": 5.178996753692627,
|
|
"epoch": 1.6532180595581172,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047310428018426616,
|
|
"loss": 4.9931,
|
|
"mean_token_accuracy": 0.21703283488750458,
|
|
"num_tokens": 39434002.0,
|
|
"step": 17210
|
|
},
|
|
{
|
|
"entropy": 5.136433792114258,
|
|
"epoch": 1.653698366954851,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004730880235710987,
|
|
"loss": 4.8818,
|
|
"mean_token_accuracy": 0.2200036182999611,
|
|
"num_tokens": 39444629.0,
|
|
"step": 17215
|
|
},
|
|
{
|
|
"entropy": 5.1588235855102536,
|
|
"epoch": 1.6541786743515852,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004730717623589155,
|
|
"loss": 4.8502,
|
|
"mean_token_accuracy": 0.23025956898927688,
|
|
"num_tokens": 39454803.0,
|
|
"step": 17220
|
|
},
|
|
{
|
|
"entropy": 5.160949420928955,
|
|
"epoch": 1.654658981748319,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004730554965480942,
|
|
"loss": 4.9935,
|
|
"mean_token_accuracy": 0.2163141682744026,
|
|
"num_tokens": 39466931.0,
|
|
"step": 17225
|
|
},
|
|
{
|
|
"entropy": 5.18949818611145,
|
|
"epoch": 1.655139289145053,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004730392261390124,
|
|
"loss": 4.9649,
|
|
"mean_token_accuracy": 0.22342453449964522,
|
|
"num_tokens": 39478127.0,
|
|
"step": 17230
|
|
},
|
|
{
|
|
"entropy": 5.222326755523682,
|
|
"epoch": 1.6556195965417868,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004730229511320478,
|
|
"loss": 4.9707,
|
|
"mean_token_accuracy": 0.21511962711811067,
|
|
"num_tokens": 39488188.0,
|
|
"step": 17235
|
|
},
|
|
{
|
|
"entropy": 5.210654878616333,
|
|
"epoch": 1.6560999039385207,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047300667152757827,
|
|
"loss": 4.9296,
|
|
"mean_token_accuracy": 0.22033513486385345,
|
|
"num_tokens": 39500035.0,
|
|
"step": 17240
|
|
},
|
|
{
|
|
"entropy": 5.2070694923400875,
|
|
"epoch": 1.6565802113352546,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047299038732598184,
|
|
"loss": 4.999,
|
|
"mean_token_accuracy": 0.21806135773658752,
|
|
"num_tokens": 39510922.0,
|
|
"step": 17245
|
|
},
|
|
{
|
|
"entropy": 5.182500553131104,
|
|
"epoch": 1.6570605187319885,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047297409852763644,
|
|
"loss": 4.997,
|
|
"mean_token_accuracy": 0.22009943872690202,
|
|
"num_tokens": 39521319.0,
|
|
"step": 17250
|
|
},
|
|
{
|
|
"entropy": 5.2111945152282715,
|
|
"epoch": 1.6575408261287223,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004729578051329204,
|
|
"loss": 4.9534,
|
|
"mean_token_accuracy": 0.21612063497304917,
|
|
"num_tokens": 39532408.0,
|
|
"step": 17255
|
|
},
|
|
{
|
|
"entropy": 5.219672155380249,
|
|
"epoch": 1.6580211335254562,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047294150714221185,
|
|
"loss": 5.0105,
|
|
"mean_token_accuracy": 0.21210616379976271,
|
|
"num_tokens": 39543323.0,
|
|
"step": 17260
|
|
},
|
|
{
|
|
"entropy": 5.242244911193848,
|
|
"epoch": 1.65850144092219,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004729252045558894,
|
|
"loss": 4.9603,
|
|
"mean_token_accuracy": 0.21582386493682862,
|
|
"num_tokens": 39554133.0,
|
|
"step": 17265
|
|
},
|
|
{
|
|
"entropy": 5.154227447509766,
|
|
"epoch": 1.658981748318924,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047290889737433133,
|
|
"loss": 4.84,
|
|
"mean_token_accuracy": 0.22423603981733323,
|
|
"num_tokens": 39565990.0,
|
|
"step": 17270
|
|
},
|
|
{
|
|
"entropy": 5.206848955154419,
|
|
"epoch": 1.6594620557156579,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047289258559791633,
|
|
"loss": 4.9429,
|
|
"mean_token_accuracy": 0.2176191046833992,
|
|
"num_tokens": 39577591.0,
|
|
"step": 17275
|
|
},
|
|
{
|
|
"entropy": 5.2025104522705075,
|
|
"epoch": 1.659942363112392,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047287626922702317,
|
|
"loss": 4.9147,
|
|
"mean_token_accuracy": 0.21797798275947572,
|
|
"num_tokens": 39589922.0,
|
|
"step": 17280
|
|
},
|
|
{
|
|
"entropy": 5.126957130432129,
|
|
"epoch": 1.6604226705091258,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047285994826203054,
|
|
"loss": 4.9458,
|
|
"mean_token_accuracy": 0.22071049809455873,
|
|
"num_tokens": 39602183.0,
|
|
"step": 17285
|
|
},
|
|
{
|
|
"entropy": 5.20958571434021,
|
|
"epoch": 1.6609029779058597,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004728436227033175,
|
|
"loss": 4.9578,
|
|
"mean_token_accuracy": 0.21979390680789948,
|
|
"num_tokens": 39612922.0,
|
|
"step": 17290
|
|
},
|
|
{
|
|
"entropy": 5.232578420639038,
|
|
"epoch": 1.6613832853025938,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047282729255126294,
|
|
"loss": 4.9504,
|
|
"mean_token_accuracy": 0.2208220601081848,
|
|
"num_tokens": 39624475.0,
|
|
"step": 17295
|
|
},
|
|
{
|
|
"entropy": 5.023839282989502,
|
|
"epoch": 1.6618635926993277,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004728109578062461,
|
|
"loss": 4.8012,
|
|
"mean_token_accuracy": 0.2280938968062401,
|
|
"num_tokens": 39635230.0,
|
|
"step": 17300
|
|
},
|
|
{
|
|
"entropy": 5.088182401657105,
|
|
"epoch": 1.6623439000960616,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00047279461846864626,
|
|
"loss": 4.9072,
|
|
"mean_token_accuracy": 0.22009400725364686,
|
|
"num_tokens": 39646788.0,
|
|
"step": 17305
|
|
},
|
|
{
|
|
"entropy": 5.137494707107544,
|
|
"epoch": 1.6628242074927955,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00047277827453884265,
|
|
"loss": 4.8803,
|
|
"mean_token_accuracy": 0.2225083142518997,
|
|
"num_tokens": 39657531.0,
|
|
"step": 17310
|
|
},
|
|
{
|
|
"entropy": 5.292917203903198,
|
|
"epoch": 1.6633045148895294,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047276192601721477,
|
|
"loss": 5.0619,
|
|
"mean_token_accuracy": 0.20813206434249878,
|
|
"num_tokens": 39669008.0,
|
|
"step": 17315
|
|
},
|
|
{
|
|
"entropy": 5.207920122146606,
|
|
"epoch": 1.6637848222862632,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004727455729041422,
|
|
"loss": 4.9061,
|
|
"mean_token_accuracy": 0.22048480212688445,
|
|
"num_tokens": 39680772.0,
|
|
"step": 17320
|
|
},
|
|
{
|
|
"entropy": 5.23115291595459,
|
|
"epoch": 1.6642651296829971,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047272921520000465,
|
|
"loss": 5.0517,
|
|
"mean_token_accuracy": 0.21050333827733994,
|
|
"num_tokens": 39694107.0,
|
|
"step": 17325
|
|
},
|
|
{
|
|
"entropy": 5.18481068611145,
|
|
"epoch": 1.664745437079731,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004727128529051819,
|
|
"loss": 4.9103,
|
|
"mean_token_accuracy": 0.21743627935647963,
|
|
"num_tokens": 39705440.0,
|
|
"step": 17330
|
|
},
|
|
{
|
|
"entropy": 5.171319675445557,
|
|
"epoch": 1.6652257444764649,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004726964860200537,
|
|
"loss": 4.9121,
|
|
"mean_token_accuracy": 0.2184045359492302,
|
|
"num_tokens": 39716246.0,
|
|
"step": 17335
|
|
},
|
|
{
|
|
"entropy": 5.164891290664673,
|
|
"epoch": 1.6657060518731988,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004726801145450002,
|
|
"loss": 4.9312,
|
|
"mean_token_accuracy": 0.21594095528125762,
|
|
"num_tokens": 39727233.0,
|
|
"step": 17340
|
|
},
|
|
{
|
|
"entropy": 5.196539783477784,
|
|
"epoch": 1.6661863592699326,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004726637384804014,
|
|
"loss": 4.9045,
|
|
"mean_token_accuracy": 0.2156997725367546,
|
|
"num_tokens": 39738929.0,
|
|
"step": 17345
|
|
},
|
|
{
|
|
"entropy": 5.152793312072754,
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004726473578266375,
|
|
"loss": 4.9228,
|
|
"mean_token_accuracy": 0.21952597051858902,
|
|
"num_tokens": 39750714.0,
|
|
"step": 17350
|
|
},
|
|
{
|
|
"entropy": 5.183078670501709,
|
|
"epoch": 1.6671469740634006,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047263097258408893,
|
|
"loss": 4.983,
|
|
"mean_token_accuracy": 0.21496337354183198,
|
|
"num_tokens": 39762810.0,
|
|
"step": 17355
|
|
},
|
|
{
|
|
"entropy": 5.259543704986572,
|
|
"epoch": 1.6676272814601345,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004726145827531359,
|
|
"loss": 5.0234,
|
|
"mean_token_accuracy": 0.21382358223199843,
|
|
"num_tokens": 39774611.0,
|
|
"step": 17360
|
|
},
|
|
{
|
|
"entropy": 5.12020697593689,
|
|
"epoch": 1.6681075888568684,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047259818833415916,
|
|
"loss": 4.8302,
|
|
"mean_token_accuracy": 0.2306036338210106,
|
|
"num_tokens": 39784900.0,
|
|
"step": 17365
|
|
},
|
|
{
|
|
"entropy": 5.213765668869018,
|
|
"epoch": 1.6685878962536023,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00047258178932753917,
|
|
"loss": 4.9686,
|
|
"mean_token_accuracy": 0.2193788021802902,
|
|
"num_tokens": 39794805.0,
|
|
"step": 17370
|
|
},
|
|
{
|
|
"entropy": 5.206267833709717,
|
|
"epoch": 1.6690682036503364,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047256538573365675,
|
|
"loss": 4.9778,
|
|
"mean_token_accuracy": 0.212169349193573,
|
|
"num_tokens": 39806444.0,
|
|
"step": 17375
|
|
},
|
|
{
|
|
"entropy": 5.318323183059692,
|
|
"epoch": 1.6695485110470702,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004725489775528928,
|
|
"loss": 5.116,
|
|
"mean_token_accuracy": 0.20811543017625808,
|
|
"num_tokens": 39817724.0,
|
|
"step": 17380
|
|
},
|
|
{
|
|
"entropy": 5.151338577270508,
|
|
"epoch": 1.6700288184438041,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00047253256478562805,
|
|
"loss": 4.9153,
|
|
"mean_token_accuracy": 0.2115355148911476,
|
|
"num_tokens": 39828944.0,
|
|
"step": 17385
|
|
},
|
|
{
|
|
"entropy": 5.144230937957763,
|
|
"epoch": 1.670509125840538,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047251614743224374,
|
|
"loss": 4.9378,
|
|
"mean_token_accuracy": 0.2210545301437378,
|
|
"num_tokens": 39839978.0,
|
|
"step": 17390
|
|
},
|
|
{
|
|
"entropy": 5.192106771469116,
|
|
"epoch": 1.670989433237272,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047249972549312107,
|
|
"loss": 4.9251,
|
|
"mean_token_accuracy": 0.22135266661643982,
|
|
"num_tokens": 39850787.0,
|
|
"step": 17395
|
|
},
|
|
{
|
|
"entropy": 5.250539684295655,
|
|
"epoch": 1.6714697406340058,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004724832989686411,
|
|
"loss": 5.0037,
|
|
"mean_token_accuracy": 0.21468748897314072,
|
|
"num_tokens": 39862245.0,
|
|
"step": 17400
|
|
},
|
|
{
|
|
"entropy": 5.163530015945435,
|
|
"epoch": 1.6719500480307397,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00047246686785918545,
|
|
"loss": 4.9126,
|
|
"mean_token_accuracy": 0.22578096389770508,
|
|
"num_tokens": 39872295.0,
|
|
"step": 17405
|
|
},
|
|
{
|
|
"entropy": 5.088949918746948,
|
|
"epoch": 1.6724303554274735,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00047245043216513546,
|
|
"loss": 4.8559,
|
|
"mean_token_accuracy": 0.21463808417320251,
|
|
"num_tokens": 39882839.0,
|
|
"step": 17410
|
|
},
|
|
{
|
|
"entropy": 5.105159568786621,
|
|
"epoch": 1.6729106628242074,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004724339918868727,
|
|
"loss": 4.856,
|
|
"mean_token_accuracy": 0.221045646071434,
|
|
"num_tokens": 39894767.0,
|
|
"step": 17415
|
|
},
|
|
{
|
|
"entropy": 5.254346513748169,
|
|
"epoch": 1.6733909702209413,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000472417547024779,
|
|
"loss": 5.0189,
|
|
"mean_token_accuracy": 0.20958801060914994,
|
|
"num_tokens": 39907492.0,
|
|
"step": 17420
|
|
},
|
|
{
|
|
"entropy": 5.103402757644654,
|
|
"epoch": 1.6738712776176752,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047240109757923593,
|
|
"loss": 4.8019,
|
|
"mean_token_accuracy": 0.22363511472940445,
|
|
"num_tokens": 39919005.0,
|
|
"step": 17425
|
|
},
|
|
{
|
|
"entropy": 5.133781242370605,
|
|
"epoch": 1.674351585014409,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004723846435506256,
|
|
"loss": 4.9053,
|
|
"mean_token_accuracy": 0.22093903720378877,
|
|
"num_tokens": 39930417.0,
|
|
"step": 17430
|
|
},
|
|
{
|
|
"entropy": 5.136504316329956,
|
|
"epoch": 1.6748318924111432,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047236818493932994,
|
|
"loss": 4.9295,
|
|
"mean_token_accuracy": 0.21972116082906723,
|
|
"num_tokens": 39941873.0,
|
|
"step": 17435
|
|
},
|
|
{
|
|
"entropy": 5.218835639953613,
|
|
"epoch": 1.675312199807877,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004723517217457311,
|
|
"loss": 4.9594,
|
|
"mean_token_accuracy": 0.2127823770046234,
|
|
"num_tokens": 39953280.0,
|
|
"step": 17440
|
|
},
|
|
{
|
|
"entropy": 5.266405916213989,
|
|
"epoch": 1.675792507204611,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004723352539702113,
|
|
"loss": 5.033,
|
|
"mean_token_accuracy": 0.2113511174917221,
|
|
"num_tokens": 39964898.0,
|
|
"step": 17445
|
|
},
|
|
{
|
|
"entropy": 5.169444513320923,
|
|
"epoch": 1.6762728146013448,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004723187816131529,
|
|
"loss": 4.8971,
|
|
"mean_token_accuracy": 0.22143382728099822,
|
|
"num_tokens": 39976945.0,
|
|
"step": 17450
|
|
},
|
|
{
|
|
"entropy": 5.21337661743164,
|
|
"epoch": 1.676753121998079,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004723023046749383,
|
|
"loss": 5.0068,
|
|
"mean_token_accuracy": 0.21935284435749053,
|
|
"num_tokens": 39987760.0,
|
|
"step": 17455
|
|
},
|
|
{
|
|
"entropy": 5.109252262115478,
|
|
"epoch": 1.6772334293948128,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00047228582315595,
|
|
"loss": 4.7994,
|
|
"mean_token_accuracy": 0.2287563070654869,
|
|
"num_tokens": 39998171.0,
|
|
"step": 17460
|
|
},
|
|
{
|
|
"entropy": 5.1659423351287845,
|
|
"epoch": 1.6777137367915467,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004722693370565708,
|
|
"loss": 4.9056,
|
|
"mean_token_accuracy": 0.21699397414922714,
|
|
"num_tokens": 40010124.0,
|
|
"step": 17465
|
|
},
|
|
{
|
|
"entropy": 5.1289918422698975,
|
|
"epoch": 1.6781940441882806,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047225284637718323,
|
|
"loss": 4.9364,
|
|
"mean_token_accuracy": 0.21586138755083084,
|
|
"num_tokens": 40022599.0,
|
|
"step": 17470
|
|
},
|
|
{
|
|
"entropy": 5.150423860549926,
|
|
"epoch": 1.6786743515850144,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004722363511181703,
|
|
"loss": 4.8901,
|
|
"mean_token_accuracy": 0.2244936302304268,
|
|
"num_tokens": 40033738.0,
|
|
"step": 17475
|
|
},
|
|
{
|
|
"entropy": 5.188096809387207,
|
|
"epoch": 1.6791546589817483,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000472219851279915,
|
|
"loss": 4.9457,
|
|
"mean_token_accuracy": 0.2202860251069069,
|
|
"num_tokens": 40044148.0,
|
|
"step": 17480
|
|
},
|
|
{
|
|
"entropy": 5.223975753784179,
|
|
"epoch": 1.6796349663784822,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004722033468628004,
|
|
"loss": 4.9322,
|
|
"mean_token_accuracy": 0.22158615589141845,
|
|
"num_tokens": 40055430.0,
|
|
"step": 17485
|
|
},
|
|
{
|
|
"entropy": 5.1668178081512455,
|
|
"epoch": 1.680115273775216,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004721868378672098,
|
|
"loss": 4.9149,
|
|
"mean_token_accuracy": 0.2159278705716133,
|
|
"num_tokens": 40066147.0,
|
|
"step": 17490
|
|
},
|
|
{
|
|
"entropy": 5.178054475784302,
|
|
"epoch": 1.68059558117195,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004721703242935261,
|
|
"loss": 4.9165,
|
|
"mean_token_accuracy": 0.21332445442676545,
|
|
"num_tokens": 40077567.0,
|
|
"step": 17495
|
|
},
|
|
{
|
|
"entropy": 5.231701517105103,
|
|
"epoch": 1.6810758885686838,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004721538061421331,
|
|
"loss": 5.005,
|
|
"mean_token_accuracy": 0.21199633330106735,
|
|
"num_tokens": 40089544.0,
|
|
"step": 17500
|
|
},
|
|
{
|
|
"entropy": 5.082833242416382,
|
|
"epoch": 1.6815561959654177,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047213728341341407,
|
|
"loss": 4.8582,
|
|
"mean_token_accuracy": 0.22495235800743102,
|
|
"num_tokens": 40100557.0,
|
|
"step": 17505
|
|
},
|
|
{
|
|
"entropy": 5.198224449157715,
|
|
"epoch": 1.6820365033621518,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004721207561077527,
|
|
"loss": 4.9242,
|
|
"mean_token_accuracy": 0.21465859711170196,
|
|
"num_tokens": 40112052.0,
|
|
"step": 17510
|
|
},
|
|
{
|
|
"entropy": 5.35629358291626,
|
|
"epoch": 1.6825168107588857,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004721042242255327,
|
|
"loss": 5.1065,
|
|
"mean_token_accuracy": 0.20359711796045304,
|
|
"num_tokens": 40123830.0,
|
|
"step": 17515
|
|
},
|
|
{
|
|
"entropy": 5.199491548538208,
|
|
"epoch": 1.6829971181556196,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047208768776713805,
|
|
"loss": 4.9982,
|
|
"mean_token_accuracy": 0.22124958634376526,
|
|
"num_tokens": 40134222.0,
|
|
"step": 17520
|
|
},
|
|
{
|
|
"entropy": 5.147200632095337,
|
|
"epoch": 1.6834774255523535,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004720711467329523,
|
|
"loss": 4.9883,
|
|
"mean_token_accuracy": 0.21965805292129517,
|
|
"num_tokens": 40145552.0,
|
|
"step": 17525
|
|
},
|
|
{
|
|
"entropy": 5.155480766296387,
|
|
"epoch": 1.6839577329490876,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004720546011233599,
|
|
"loss": 4.8324,
|
|
"mean_token_accuracy": 0.2238232597708702,
|
|
"num_tokens": 40156615.0,
|
|
"step": 17530
|
|
},
|
|
{
|
|
"entropy": 5.144622659683227,
|
|
"epoch": 1.6844380403458215,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004720380509387446,
|
|
"loss": 4.932,
|
|
"mean_token_accuracy": 0.22399861961603165,
|
|
"num_tokens": 40168825.0,
|
|
"step": 17535
|
|
},
|
|
{
|
|
"entropy": 5.190985679626465,
|
|
"epoch": 1.6849183477425553,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000472021496179491,
|
|
"loss": 4.9261,
|
|
"mean_token_accuracy": 0.22324578315019608,
|
|
"num_tokens": 40179523.0,
|
|
"step": 17540
|
|
},
|
|
{
|
|
"entropy": 5.129342079162598,
|
|
"epoch": 1.6853986551392892,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00047200493684598316,
|
|
"loss": 4.8848,
|
|
"mean_token_accuracy": 0.22110755145549774,
|
|
"num_tokens": 40191362.0,
|
|
"step": 17545
|
|
},
|
|
{
|
|
"entropy": 5.168604230880737,
|
|
"epoch": 1.685878962536023,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047198837293860573,
|
|
"loss": 4.9654,
|
|
"mean_token_accuracy": 0.21929350346326829,
|
|
"num_tokens": 40202274.0,
|
|
"step": 17550
|
|
},
|
|
{
|
|
"entropy": 5.244691181182861,
|
|
"epoch": 1.686359269932757,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004719718044577432,
|
|
"loss": 4.9938,
|
|
"mean_token_accuracy": 0.20958704501390457,
|
|
"num_tokens": 40213907.0,
|
|
"step": 17555
|
|
},
|
|
{
|
|
"entropy": 5.152917718887329,
|
|
"epoch": 1.6868395773294909,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047195523140378034,
|
|
"loss": 4.9344,
|
|
"mean_token_accuracy": 0.21637397557497023,
|
|
"num_tokens": 40225300.0,
|
|
"step": 17560
|
|
},
|
|
{
|
|
"entropy": 5.109961271286011,
|
|
"epoch": 1.6873198847262247,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00047193865377710177,
|
|
"loss": 4.8457,
|
|
"mean_token_accuracy": 0.22791109681129457,
|
|
"num_tokens": 40236197.0,
|
|
"step": 17565
|
|
},
|
|
{
|
|
"entropy": 5.227808856964112,
|
|
"epoch": 1.6878001921229586,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047192207157809246,
|
|
"loss": 5.0596,
|
|
"mean_token_accuracy": 0.2109197899699211,
|
|
"num_tokens": 40247887.0,
|
|
"step": 17570
|
|
},
|
|
{
|
|
"entropy": 5.263893365859985,
|
|
"epoch": 1.6882804995196925,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047190548480713736,
|
|
"loss": 4.8982,
|
|
"mean_token_accuracy": 0.2192056208848953,
|
|
"num_tokens": 40258262.0,
|
|
"step": 17575
|
|
},
|
|
{
|
|
"entropy": 5.223889493942261,
|
|
"epoch": 1.6887608069164264,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047188889346462163,
|
|
"loss": 4.9589,
|
|
"mean_token_accuracy": 0.21418242901563644,
|
|
"num_tokens": 40268735.0,
|
|
"step": 17580
|
|
},
|
|
{
|
|
"entropy": 5.200381278991699,
|
|
"epoch": 1.6892411143131603,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00047187229755093037,
|
|
"loss": 5.0426,
|
|
"mean_token_accuracy": 0.20822969675064087,
|
|
"num_tokens": 40279905.0,
|
|
"step": 17585
|
|
},
|
|
{
|
|
"entropy": 5.207263040542602,
|
|
"epoch": 1.6897214217098944,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000471855697066449,
|
|
"loss": 4.914,
|
|
"mean_token_accuracy": 0.21707093566656113,
|
|
"num_tokens": 40290580.0,
|
|
"step": 17590
|
|
},
|
|
{
|
|
"entropy": 5.2004670143127445,
|
|
"epoch": 1.6902017291066282,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00047183909201156297,
|
|
"loss": 5.0006,
|
|
"mean_token_accuracy": 0.2152295872569084,
|
|
"num_tokens": 40302472.0,
|
|
"step": 17595
|
|
},
|
|
{
|
|
"entropy": 5.251315307617188,
|
|
"epoch": 1.6906820365033621,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004718224823866576,
|
|
"loss": 4.9872,
|
|
"mean_token_accuracy": 0.21021779626607895,
|
|
"num_tokens": 40314238.0,
|
|
"step": 17600
|
|
},
|
|
{
|
|
"entropy": 5.131517028808593,
|
|
"epoch": 1.691162343900096,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004718058681921186,
|
|
"loss": 4.9242,
|
|
"mean_token_accuracy": 0.22052521407604217,
|
|
"num_tokens": 40326147.0,
|
|
"step": 17605
|
|
},
|
|
{
|
|
"entropy": 5.177646541595459,
|
|
"epoch": 1.6916426512968301,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00047178924942833185,
|
|
"loss": 4.8935,
|
|
"mean_token_accuracy": 0.221788227558136,
|
|
"num_tokens": 40338210.0,
|
|
"step": 17610
|
|
},
|
|
{
|
|
"entropy": 5.096933364868164,
|
|
"epoch": 1.692122958693564,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004717726260956831,
|
|
"loss": 4.859,
|
|
"mean_token_accuracy": 0.22617415189743043,
|
|
"num_tokens": 40349293.0,
|
|
"step": 17615
|
|
},
|
|
{
|
|
"entropy": 5.1479727268219,
|
|
"epoch": 1.6926032660902979,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004717559981945581,
|
|
"loss": 4.8868,
|
|
"mean_token_accuracy": 0.21825706362724304,
|
|
"num_tokens": 40360916.0,
|
|
"step": 17620
|
|
},
|
|
{
|
|
"entropy": 5.165114021301269,
|
|
"epoch": 1.6930835734870318,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004717393657253432,
|
|
"loss": 4.9631,
|
|
"mean_token_accuracy": 0.215592922270298,
|
|
"num_tokens": 40373525.0,
|
|
"step": 17625
|
|
},
|
|
{
|
|
"entropy": 5.215253448486328,
|
|
"epoch": 1.6935638808837656,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004717227286884243,
|
|
"loss": 4.9601,
|
|
"mean_token_accuracy": 0.21485102623701097,
|
|
"num_tokens": 40385286.0,
|
|
"step": 17630
|
|
},
|
|
{
|
|
"entropy": 5.161527442932129,
|
|
"epoch": 1.6940441882804995,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004717060870841879,
|
|
"loss": 4.8639,
|
|
"mean_token_accuracy": 0.22210344523191453,
|
|
"num_tokens": 40396371.0,
|
|
"step": 17635
|
|
},
|
|
{
|
|
"entropy": 5.174149513244629,
|
|
"epoch": 1.6945244956772334,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004716894409130202,
|
|
"loss": 4.9296,
|
|
"mean_token_accuracy": 0.21754053086042405,
|
|
"num_tokens": 40407304.0,
|
|
"step": 17640
|
|
},
|
|
{
|
|
"entropy": 5.169687795639038,
|
|
"epoch": 1.6950048030739673,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004716727901753078,
|
|
"loss": 4.8853,
|
|
"mean_token_accuracy": 0.21763041615486145,
|
|
"num_tokens": 40418384.0,
|
|
"step": 17645
|
|
},
|
|
{
|
|
"entropy": 5.20654559135437,
|
|
"epoch": 1.6954851104707012,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004716561348714371,
|
|
"loss": 4.9963,
|
|
"mean_token_accuracy": 0.2188402831554413,
|
|
"num_tokens": 40430603.0,
|
|
"step": 17650
|
|
},
|
|
{
|
|
"entropy": 5.123339128494263,
|
|
"epoch": 1.695965417867435,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047163947500179494,
|
|
"loss": 4.8871,
|
|
"mean_token_accuracy": 0.22057809680700302,
|
|
"num_tokens": 40442597.0,
|
|
"step": 17655
|
|
},
|
|
{
|
|
"entropy": 5.097166204452515,
|
|
"epoch": 1.696445725264169,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004716228105667681,
|
|
"loss": 4.8132,
|
|
"mean_token_accuracy": 0.22695180177688598,
|
|
"num_tokens": 40454078.0,
|
|
"step": 17660
|
|
},
|
|
{
|
|
"entropy": 5.142997455596924,
|
|
"epoch": 1.696926032660903,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004716061415667435,
|
|
"loss": 4.8731,
|
|
"mean_token_accuracy": 0.2260493054986,
|
|
"num_tokens": 40465561.0,
|
|
"step": 17665
|
|
},
|
|
{
|
|
"entropy": 5.214082384109497,
|
|
"epoch": 1.697406340057637,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000471589468002108,
|
|
"loss": 4.9655,
|
|
"mean_token_accuracy": 0.2226713106036186,
|
|
"num_tokens": 40476883.0,
|
|
"step": 17670
|
|
},
|
|
{
|
|
"entropy": 5.1789998531341555,
|
|
"epoch": 1.6978866474543708,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004715727898732488,
|
|
"loss": 4.9351,
|
|
"mean_token_accuracy": 0.2229066714644432,
|
|
"num_tokens": 40488139.0,
|
|
"step": 17675
|
|
},
|
|
{
|
|
"entropy": 5.166921186447143,
|
|
"epoch": 1.6983669548511047,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047155610718055315,
|
|
"loss": 4.931,
|
|
"mean_token_accuracy": 0.22183982133865357,
|
|
"num_tokens": 40499367.0,
|
|
"step": 17680
|
|
},
|
|
{
|
|
"entropy": 5.078970956802368,
|
|
"epoch": 1.6988472622478388,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047153941992440833,
|
|
"loss": 4.8881,
|
|
"mean_token_accuracy": 0.2115646108984947,
|
|
"num_tokens": 40510628.0,
|
|
"step": 17685
|
|
},
|
|
{
|
|
"entropy": 5.121505689620972,
|
|
"epoch": 1.6993275696445727,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004715227281052018,
|
|
"loss": 4.8719,
|
|
"mean_token_accuracy": 0.2255760669708252,
|
|
"num_tokens": 40522680.0,
|
|
"step": 17690
|
|
},
|
|
{
|
|
"entropy": 5.144559717178344,
|
|
"epoch": 1.6998078770413065,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004715060317233211,
|
|
"loss": 4.8053,
|
|
"mean_token_accuracy": 0.23404240906238555,
|
|
"num_tokens": 40533139.0,
|
|
"step": 17695
|
|
},
|
|
{
|
|
"entropy": 5.200980138778687,
|
|
"epoch": 1.7002881844380404,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004714893307791538,
|
|
"loss": 5.0023,
|
|
"mean_token_accuracy": 0.21619048565626145,
|
|
"num_tokens": 40544578.0,
|
|
"step": 17700
|
|
},
|
|
{
|
|
"entropy": 5.229176378250122,
|
|
"epoch": 1.7007684918347743,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00047147262527308766,
|
|
"loss": 4.9251,
|
|
"mean_token_accuracy": 0.22029948830604554,
|
|
"num_tokens": 40555667.0,
|
|
"step": 17705
|
|
},
|
|
{
|
|
"entropy": 5.145949172973633,
|
|
"epoch": 1.7012487992315082,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004714559152055106,
|
|
"loss": 4.9556,
|
|
"mean_token_accuracy": 0.224330173432827,
|
|
"num_tokens": 40567123.0,
|
|
"step": 17710
|
|
},
|
|
{
|
|
"entropy": 5.191706800460816,
|
|
"epoch": 1.701729106628242,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004714392005768106,
|
|
"loss": 4.9387,
|
|
"mean_token_accuracy": 0.21762551963329316,
|
|
"num_tokens": 40579692.0,
|
|
"step": 17715
|
|
},
|
|
{
|
|
"entropy": 5.1885899066925045,
|
|
"epoch": 1.702209414024976,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004714224813873756,
|
|
"loss": 4.9025,
|
|
"mean_token_accuracy": 0.22344619333744048,
|
|
"num_tokens": 40590989.0,
|
|
"step": 17720
|
|
},
|
|
{
|
|
"entropy": 5.131214809417725,
|
|
"epoch": 1.7026897214217098,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00047140575763759393,
|
|
"loss": 4.9276,
|
|
"mean_token_accuracy": 0.22275308072566985,
|
|
"num_tokens": 40602050.0,
|
|
"step": 17725
|
|
},
|
|
{
|
|
"entropy": 5.1476117134094235,
|
|
"epoch": 1.7031700288184437,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00047138902932785363,
|
|
"loss": 4.9118,
|
|
"mean_token_accuracy": 0.22158618420362472,
|
|
"num_tokens": 40614552.0,
|
|
"step": 17730
|
|
},
|
|
{
|
|
"entropy": 5.210604238510132,
|
|
"epoch": 1.7036503362151776,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047137229645854333,
|
|
"loss": 4.8718,
|
|
"mean_token_accuracy": 0.2272538051009178,
|
|
"num_tokens": 40625903.0,
|
|
"step": 17735
|
|
},
|
|
{
|
|
"entropy": 5.17086706161499,
|
|
"epoch": 1.7041306436119115,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004713555590300513,
|
|
"loss": 4.9524,
|
|
"mean_token_accuracy": 0.21819649636745453,
|
|
"num_tokens": 40638634.0,
|
|
"step": 17740
|
|
},
|
|
{
|
|
"entropy": 5.219864559173584,
|
|
"epoch": 1.7046109510086456,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004713388170427664,
|
|
"loss": 4.9558,
|
|
"mean_token_accuracy": 0.21615031808614732,
|
|
"num_tokens": 40651279.0,
|
|
"step": 17745
|
|
},
|
|
{
|
|
"entropy": 5.184417057037353,
|
|
"epoch": 1.7050912584053795,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004713220704970771,
|
|
"loss": 4.9216,
|
|
"mean_token_accuracy": 0.22155367732048034,
|
|
"num_tokens": 40662306.0,
|
|
"step": 17750
|
|
},
|
|
{
|
|
"entropy": 5.0326759815216064,
|
|
"epoch": 1.7055715658021133,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047130531939337236,
|
|
"loss": 4.775,
|
|
"mean_token_accuracy": 0.22894255667924882,
|
|
"num_tokens": 40672290.0,
|
|
"step": 17755
|
|
},
|
|
{
|
|
"entropy": 5.187178373336792,
|
|
"epoch": 1.7060518731988472,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047128856373204086,
|
|
"loss": 4.9134,
|
|
"mean_token_accuracy": 0.22119101732969285,
|
|
"num_tokens": 40683447.0,
|
|
"step": 17760
|
|
},
|
|
{
|
|
"entropy": 5.292993640899658,
|
|
"epoch": 1.7065321805955813,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047127180351347184,
|
|
"loss": 5.0599,
|
|
"mean_token_accuracy": 0.21206379681825638,
|
|
"num_tokens": 40695230.0,
|
|
"step": 17765
|
|
},
|
|
{
|
|
"entropy": 5.193490266799927,
|
|
"epoch": 1.7070124879923152,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004712550387380544,
|
|
"loss": 4.9057,
|
|
"mean_token_accuracy": 0.21839701384305954,
|
|
"num_tokens": 40707311.0,
|
|
"step": 17770
|
|
},
|
|
{
|
|
"entropy": 5.201669216156006,
|
|
"epoch": 1.707492795389049,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004712382694061776,
|
|
"loss": 4.9201,
|
|
"mean_token_accuracy": 0.21827106177806854,
|
|
"num_tokens": 40717928.0,
|
|
"step": 17775
|
|
},
|
|
{
|
|
"entropy": 5.137501668930054,
|
|
"epoch": 1.707973102785783,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047122149551823096,
|
|
"loss": 4.93,
|
|
"mean_token_accuracy": 0.2145393192768097,
|
|
"num_tokens": 40730355.0,
|
|
"step": 17780
|
|
},
|
|
{
|
|
"entropy": 5.221040725708008,
|
|
"epoch": 1.7084534101825168,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004712047170746039,
|
|
"loss": 5.0179,
|
|
"mean_token_accuracy": 0.21589890420436858,
|
|
"num_tokens": 40741412.0,
|
|
"step": 17785
|
|
},
|
|
{
|
|
"entropy": 5.209814357757568,
|
|
"epoch": 1.7089337175792507,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047118793407568586,
|
|
"loss": 4.9045,
|
|
"mean_token_accuracy": 0.22416329383850098,
|
|
"num_tokens": 40753491.0,
|
|
"step": 17790
|
|
},
|
|
{
|
|
"entropy": 5.131305885314942,
|
|
"epoch": 1.7094140249759846,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00047117114652186657,
|
|
"loss": 4.8506,
|
|
"mean_token_accuracy": 0.22409170120954514,
|
|
"num_tokens": 40765209.0,
|
|
"step": 17795
|
|
},
|
|
{
|
|
"entropy": 5.116810369491577,
|
|
"epoch": 1.7098943323727185,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00047115435441353573,
|
|
"loss": 4.9496,
|
|
"mean_token_accuracy": 0.21688321977853775,
|
|
"num_tokens": 40778065.0,
|
|
"step": 17800
|
|
},
|
|
{
|
|
"entropy": 5.138747310638427,
|
|
"epoch": 1.7103746397694524,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047113755775108333,
|
|
"loss": 4.9235,
|
|
"mean_token_accuracy": 0.22174129486083985,
|
|
"num_tokens": 40789149.0,
|
|
"step": 17805
|
|
},
|
|
{
|
|
"entropy": 5.201202821731568,
|
|
"epoch": 1.7108549471661862,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00047112075653489913,
|
|
"loss": 4.9227,
|
|
"mean_token_accuracy": 0.22810300290584565,
|
|
"num_tokens": 40800340.0,
|
|
"step": 17810
|
|
},
|
|
{
|
|
"entropy": 5.178883695602417,
|
|
"epoch": 1.7113352545629201,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004711039507653734,
|
|
"loss": 4.9677,
|
|
"mean_token_accuracy": 0.22021741718053817,
|
|
"num_tokens": 40811866.0,
|
|
"step": 17815
|
|
},
|
|
{
|
|
"entropy": 5.158439302444458,
|
|
"epoch": 1.7118155619596542,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004710871404428961,
|
|
"loss": 4.9635,
|
|
"mean_token_accuracy": 0.21318840384483337,
|
|
"num_tokens": 40825850.0,
|
|
"step": 17820
|
|
},
|
|
{
|
|
"entropy": 5.2235781192779545,
|
|
"epoch": 1.7122958693563881,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00047107032556785786,
|
|
"loss": 4.9129,
|
|
"mean_token_accuracy": 0.22048740983009338,
|
|
"num_tokens": 40836688.0,
|
|
"step": 17825
|
|
},
|
|
{
|
|
"entropy": 5.180397367477417,
|
|
"epoch": 1.712776176753122,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047105350614064874,
|
|
"loss": 4.9461,
|
|
"mean_token_accuracy": 0.2169654995203018,
|
|
"num_tokens": 40847803.0,
|
|
"step": 17830
|
|
},
|
|
{
|
|
"entropy": 5.188759469985962,
|
|
"epoch": 1.7132564841498559,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047103668216165944,
|
|
"loss": 4.975,
|
|
"mean_token_accuracy": 0.2155713826417923,
|
|
"num_tokens": 40859099.0,
|
|
"step": 17835
|
|
},
|
|
{
|
|
"entropy": 5.144463443756104,
|
|
"epoch": 1.71373679154659,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00047101985363128045,
|
|
"loss": 4.8284,
|
|
"mean_token_accuracy": 0.23141866326332092,
|
|
"num_tokens": 40870440.0,
|
|
"step": 17840
|
|
},
|
|
{
|
|
"entropy": 5.26697793006897,
|
|
"epoch": 1.7142170989433239,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047100302054990255,
|
|
"loss": 5.0215,
|
|
"mean_token_accuracy": 0.2102995663881302,
|
|
"num_tokens": 40882329.0,
|
|
"step": 17845
|
|
},
|
|
{
|
|
"entropy": 5.247942304611206,
|
|
"epoch": 1.7146974063400577,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004709861829179165,
|
|
"loss": 5.0207,
|
|
"mean_token_accuracy": 0.21234164237976075,
|
|
"num_tokens": 40893458.0,
|
|
"step": 17850
|
|
},
|
|
{
|
|
"entropy": 5.1152942180633545,
|
|
"epoch": 1.7151777137367916,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00047096934073571325,
|
|
"loss": 4.843,
|
|
"mean_token_accuracy": 0.22821006327867507,
|
|
"num_tokens": 40904626.0,
|
|
"step": 17855
|
|
},
|
|
{
|
|
"entropy": 5.088890552520752,
|
|
"epoch": 1.7156580211335255,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047095249400368384,
|
|
"loss": 4.8521,
|
|
"mean_token_accuracy": 0.22707059532403945,
|
|
"num_tokens": 40916005.0,
|
|
"step": 17860
|
|
},
|
|
{
|
|
"entropy": 5.091427659988403,
|
|
"epoch": 1.7161383285302594,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047093564272221927,
|
|
"loss": 4.8326,
|
|
"mean_token_accuracy": 0.2260905146598816,
|
|
"num_tokens": 40927470.0,
|
|
"step": 17865
|
|
},
|
|
{
|
|
"entropy": 5.2040282726287845,
|
|
"epoch": 1.7166186359269933,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047091878689171105,
|
|
"loss": 4.981,
|
|
"mean_token_accuracy": 0.2205181822180748,
|
|
"num_tokens": 40938968.0,
|
|
"step": 17870
|
|
},
|
|
{
|
|
"entropy": 5.121591472625733,
|
|
"epoch": 1.7170989433237271,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004709019265125502,
|
|
"loss": 4.8641,
|
|
"mean_token_accuracy": 0.22871831506490709,
|
|
"num_tokens": 40952636.0,
|
|
"step": 17875
|
|
},
|
|
{
|
|
"entropy": 5.232312250137329,
|
|
"epoch": 1.717579250720461,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047088506158512837,
|
|
"loss": 4.9736,
|
|
"mean_token_accuracy": 0.2218574747443199,
|
|
"num_tokens": 40964816.0,
|
|
"step": 17880
|
|
},
|
|
{
|
|
"entropy": 5.210276556015015,
|
|
"epoch": 1.718059558117195,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047086819210983714,
|
|
"loss": 5.0101,
|
|
"mean_token_accuracy": 0.2185376450419426,
|
|
"num_tokens": 40977152.0,
|
|
"step": 17885
|
|
},
|
|
{
|
|
"entropy": 5.152036380767822,
|
|
"epoch": 1.7185398655139288,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047085131808706813,
|
|
"loss": 4.9234,
|
|
"mean_token_accuracy": 0.22430746555328368,
|
|
"num_tokens": 40987506.0,
|
|
"step": 17890
|
|
},
|
|
{
|
|
"entropy": 5.213549518585205,
|
|
"epoch": 1.7190201729106627,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004708344395172129,
|
|
"loss": 4.9523,
|
|
"mean_token_accuracy": 0.21810881644487382,
|
|
"num_tokens": 40998598.0,
|
|
"step": 17895
|
|
},
|
|
{
|
|
"entropy": 5.188005638122559,
|
|
"epoch": 1.7195004803073968,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004708175564006636,
|
|
"loss": 4.9545,
|
|
"mean_token_accuracy": 0.21332263350486755,
|
|
"num_tokens": 41009644.0,
|
|
"step": 17900
|
|
},
|
|
{
|
|
"entropy": 5.1379045963287355,
|
|
"epoch": 1.7199807877041307,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004708006687378121,
|
|
"loss": 4.8313,
|
|
"mean_token_accuracy": 0.22353375256061553,
|
|
"num_tokens": 41021816.0,
|
|
"step": 17905
|
|
},
|
|
{
|
|
"entropy": 5.118629121780396,
|
|
"epoch": 1.7204610951008645,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004707837765290505,
|
|
"loss": 4.8747,
|
|
"mean_token_accuracy": 0.22374353557825089,
|
|
"num_tokens": 41033482.0,
|
|
"step": 17910
|
|
},
|
|
{
|
|
"entropy": 5.131730937957764,
|
|
"epoch": 1.7209414024975984,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004707668797747709,
|
|
"loss": 4.8753,
|
|
"mean_token_accuracy": 0.2280108168721199,
|
|
"num_tokens": 41043854.0,
|
|
"step": 17915
|
|
},
|
|
{
|
|
"entropy": 5.200505256652832,
|
|
"epoch": 1.7214217098943325,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004707499784753657,
|
|
"loss": 5.002,
|
|
"mean_token_accuracy": 0.2113771140575409,
|
|
"num_tokens": 41056956.0,
|
|
"step": 17920
|
|
},
|
|
{
|
|
"entropy": 5.264665651321411,
|
|
"epoch": 1.7219020172910664,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004707330726312273,
|
|
"loss": 5.0186,
|
|
"mean_token_accuracy": 0.20672281384468078,
|
|
"num_tokens": 41068170.0,
|
|
"step": 17925
|
|
},
|
|
{
|
|
"entropy": 5.199624300003052,
|
|
"epoch": 1.7223823246878003,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047071616224274803,
|
|
"loss": 4.9427,
|
|
"mean_token_accuracy": 0.21977581828832626,
|
|
"num_tokens": 41079149.0,
|
|
"step": 17930
|
|
},
|
|
{
|
|
"entropy": 5.2467875480651855,
|
|
"epoch": 1.7228626320845342,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004706992473103207,
|
|
"loss": 4.9797,
|
|
"mean_token_accuracy": 0.21803479194641112,
|
|
"num_tokens": 41091039.0,
|
|
"step": 17935
|
|
},
|
|
{
|
|
"entropy": 5.147163963317871,
|
|
"epoch": 1.723342939481268,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00047068232783433806,
|
|
"loss": 4.9624,
|
|
"mean_token_accuracy": 0.221414914727211,
|
|
"num_tokens": 41103318.0,
|
|
"step": 17940
|
|
},
|
|
{
|
|
"entropy": 5.137469387054443,
|
|
"epoch": 1.723823246878002,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004706654038151927,
|
|
"loss": 4.8392,
|
|
"mean_token_accuracy": 0.22574034184217454,
|
|
"num_tokens": 41114235.0,
|
|
"step": 17945
|
|
},
|
|
{
|
|
"entropy": 5.139921188354492,
|
|
"epoch": 1.7243035542747358,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004706484752532777,
|
|
"loss": 4.9196,
|
|
"mean_token_accuracy": 0.22196324169635773,
|
|
"num_tokens": 41126008.0,
|
|
"step": 17950
|
|
},
|
|
{
|
|
"entropy": 5.084310674667359,
|
|
"epoch": 1.7247838616714697,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004706315421489861,
|
|
"loss": 4.9158,
|
|
"mean_token_accuracy": 0.22012482583522797,
|
|
"num_tokens": 41138819.0,
|
|
"step": 17955
|
|
},
|
|
{
|
|
"entropy": 5.177040863037109,
|
|
"epoch": 1.7252641690682036,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004706146045027109,
|
|
"loss": 4.8906,
|
|
"mean_token_accuracy": 0.2244026854634285,
|
|
"num_tokens": 41149389.0,
|
|
"step": 17960
|
|
},
|
|
{
|
|
"entropy": 5.206911277770996,
|
|
"epoch": 1.7257444764649374,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004705976623148455,
|
|
"loss": 4.8801,
|
|
"mean_token_accuracy": 0.2156105950474739,
|
|
"num_tokens": 41161810.0,
|
|
"step": 17965
|
|
},
|
|
{
|
|
"entropy": 5.18083701133728,
|
|
"epoch": 1.7262247838616713,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00047058071558578324,
|
|
"loss": 4.9052,
|
|
"mean_token_accuracy": 0.21902025789022445,
|
|
"num_tokens": 41172903.0,
|
|
"step": 17970
|
|
},
|
|
{
|
|
"entropy": 5.162399435043335,
|
|
"epoch": 1.7267050912584054,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004705637643159175,
|
|
"loss": 4.9808,
|
|
"mean_token_accuracy": 0.22272872775793076,
|
|
"num_tokens": 41183905.0,
|
|
"step": 17975
|
|
},
|
|
{
|
|
"entropy": 5.145203065872193,
|
|
"epoch": 1.7271853986551393,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047054680850564185,
|
|
"loss": 4.8865,
|
|
"mean_token_accuracy": 0.21936126351356505,
|
|
"num_tokens": 41195921.0,
|
|
"step": 17980
|
|
},
|
|
{
|
|
"entropy": 5.071988487243653,
|
|
"epoch": 1.7276657060518732,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004705298481553499,
|
|
"loss": 4.9223,
|
|
"mean_token_accuracy": 0.22444438189268112,
|
|
"num_tokens": 41208287.0,
|
|
"step": 17985
|
|
},
|
|
{
|
|
"entropy": 5.1723504066467285,
|
|
"epoch": 1.728146013448607,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00047051288326543553,
|
|
"loss": 4.9596,
|
|
"mean_token_accuracy": 0.21251793950796127,
|
|
"num_tokens": 41219864.0,
|
|
"step": 17990
|
|
},
|
|
{
|
|
"entropy": 5.212101888656616,
|
|
"epoch": 1.7286263208453412,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047049591383629247,
|
|
"loss": 4.8862,
|
|
"mean_token_accuracy": 0.2253048986196518,
|
|
"num_tokens": 41230640.0,
|
|
"step": 17995
|
|
},
|
|
{
|
|
"entropy": 5.166407299041748,
|
|
"epoch": 1.729106628242075,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047047893986831493,
|
|
"loss": 4.9322,
|
|
"mean_token_accuracy": 0.2219822809100151,
|
|
"num_tokens": 41242413.0,
|
|
"step": 18000
|
|
},
|
|
{
|
|
"epoch": 1.729106628242075,
|
|
"eval_entropy": 5.004426007965958,
|
|
"eval_loss": 5.013918876647949,
|
|
"eval_mean_token_accuracy": 0.22329990555514567,
|
|
"eval_num_tokens": 41242413.0,
|
|
"eval_runtime": 26.6347,
|
|
"eval_samples_per_second": 1232.042,
|
|
"eval_steps_per_second": 154.01,
|
|
"step": 18000
|
|
},
|
|
{
|
|
"entropy": 5.205334949493408,
|
|
"epoch": 1.729586935638809,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047046196136189686,
|
|
"loss": 4.9423,
|
|
"mean_token_accuracy": 0.21605349332094193,
|
|
"num_tokens": 41254400.0,
|
|
"step": 18005
|
|
},
|
|
{
|
|
"entropy": 5.1552910804748535,
|
|
"epoch": 1.7300672430355428,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004704449783174323,
|
|
"loss": 4.9063,
|
|
"mean_token_accuracy": 0.21709170937538147,
|
|
"num_tokens": 41266849.0,
|
|
"step": 18010
|
|
},
|
|
{
|
|
"entropy": 5.202658462524414,
|
|
"epoch": 1.7305475504322767,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004704279907353158,
|
|
"loss": 4.9789,
|
|
"mean_token_accuracy": 0.2149608999490738,
|
|
"num_tokens": 41278179.0,
|
|
"step": 18015
|
|
},
|
|
{
|
|
"entropy": 5.187939310073853,
|
|
"epoch": 1.7310278578290106,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00047041099861594167,
|
|
"loss": 4.9223,
|
|
"mean_token_accuracy": 0.22421054244041444,
|
|
"num_tokens": 41289904.0,
|
|
"step": 18020
|
|
},
|
|
{
|
|
"entropy": 5.29556884765625,
|
|
"epoch": 1.7315081652257445,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004703940019597044,
|
|
"loss": 5.0694,
|
|
"mean_token_accuracy": 0.21076448261737823,
|
|
"num_tokens": 41301861.0,
|
|
"step": 18025
|
|
},
|
|
{
|
|
"entropy": 5.164215087890625,
|
|
"epoch": 1.7319884726224783,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00047037700076699857,
|
|
"loss": 4.9078,
|
|
"mean_token_accuracy": 0.22254907786846162,
|
|
"num_tokens": 41312420.0,
|
|
"step": 18030
|
|
},
|
|
{
|
|
"entropy": 5.1473808765411375,
|
|
"epoch": 1.7324687800192122,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000470359995038219,
|
|
"loss": 4.9099,
|
|
"mean_token_accuracy": 0.22299474775791167,
|
|
"num_tokens": 41323264.0,
|
|
"step": 18035
|
|
},
|
|
{
|
|
"entropy": 5.2324567317962645,
|
|
"epoch": 1.732949087415946,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004703429847737604,
|
|
"loss": 5.0112,
|
|
"mean_token_accuracy": 0.21593133956193925,
|
|
"num_tokens": 41335438.0,
|
|
"step": 18040
|
|
},
|
|
{
|
|
"entropy": 5.196746826171875,
|
|
"epoch": 1.73342939481268,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004703259699740177,
|
|
"loss": 4.9825,
|
|
"mean_token_accuracy": 0.20940714031457902,
|
|
"num_tokens": 41347295.0,
|
|
"step": 18045
|
|
},
|
|
{
|
|
"entropy": 5.165305757522583,
|
|
"epoch": 1.7339097022094139,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00047030895063938607,
|
|
"loss": 4.9752,
|
|
"mean_token_accuracy": 0.21267576068639754,
|
|
"num_tokens": 41358886.0,
|
|
"step": 18050
|
|
},
|
|
{
|
|
"entropy": 5.155602979660034,
|
|
"epoch": 1.734390009606148,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00047029192677026043,
|
|
"loss": 4.8144,
|
|
"mean_token_accuracy": 0.22844478636980056,
|
|
"num_tokens": 41369402.0,
|
|
"step": 18055
|
|
},
|
|
{
|
|
"entropy": 5.24388952255249,
|
|
"epoch": 1.7348703170028819,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004702748983670363,
|
|
"loss": 5.0246,
|
|
"mean_token_accuracy": 0.21362147182226182,
|
|
"num_tokens": 41379772.0,
|
|
"step": 18060
|
|
},
|
|
{
|
|
"entropy": 5.218845558166504,
|
|
"epoch": 1.7353506243996157,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004702578654301088,
|
|
"loss": 4.9436,
|
|
"mean_token_accuracy": 0.2147745117545128,
|
|
"num_tokens": 41389776.0,
|
|
"step": 18065
|
|
},
|
|
{
|
|
"entropy": 5.055250024795532,
|
|
"epoch": 1.7358309317963496,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004702408279598734,
|
|
"loss": 4.8347,
|
|
"mean_token_accuracy": 0.2225829392671585,
|
|
"num_tokens": 41400640.0,
|
|
"step": 18070
|
|
},
|
|
{
|
|
"entropy": 5.136556816101074,
|
|
"epoch": 1.7363112391930837,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004702237859567258,
|
|
"loss": 4.87,
|
|
"mean_token_accuracy": 0.21867617815732956,
|
|
"num_tokens": 41412653.0,
|
|
"step": 18075
|
|
},
|
|
{
|
|
"entropy": 5.3015899658203125,
|
|
"epoch": 1.7367915465898176,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004702067394210616,
|
|
"loss": 5.0337,
|
|
"mean_token_accuracy": 0.20988011807203294,
|
|
"num_tokens": 41423815.0,
|
|
"step": 18080
|
|
},
|
|
{
|
|
"entropy": 5.196315050125122,
|
|
"epoch": 1.7372718539865515,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047018968835327643,
|
|
"loss": 4.9292,
|
|
"mean_token_accuracy": 0.21484117954969406,
|
|
"num_tokens": 41436016.0,
|
|
"step": 18085
|
|
},
|
|
{
|
|
"entropy": 5.100410509109497,
|
|
"epoch": 1.7377521613832854,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004701726327537664,
|
|
"loss": 4.884,
|
|
"mean_token_accuracy": 0.2229089468717575,
|
|
"num_tokens": 41447837.0,
|
|
"step": 18090
|
|
},
|
|
{
|
|
"entropy": 5.191250038146973,
|
|
"epoch": 1.7382324687800192,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004701555726229274,
|
|
"loss": 4.9544,
|
|
"mean_token_accuracy": 0.21401021778583526,
|
|
"num_tokens": 41459816.0,
|
|
"step": 18095
|
|
},
|
|
{
|
|
"entropy": 5.251665163040161,
|
|
"epoch": 1.7387127761767531,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004701385079611555,
|
|
"loss": 4.8759,
|
|
"mean_token_accuracy": 0.21814749985933304,
|
|
"num_tokens": 41471461.0,
|
|
"step": 18100
|
|
},
|
|
{
|
|
"entropy": 5.142739725112915,
|
|
"epoch": 1.739193083573487,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00047012143876884677,
|
|
"loss": 4.8584,
|
|
"mean_token_accuracy": 0.22734878063201905,
|
|
"num_tokens": 41481886.0,
|
|
"step": 18105
|
|
},
|
|
{
|
|
"entropy": 5.112404155731201,
|
|
"epoch": 1.739673390970221,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004701043650463977,
|
|
"loss": 4.909,
|
|
"mean_token_accuracy": 0.221170374751091,
|
|
"num_tokens": 41493698.0,
|
|
"step": 18110
|
|
},
|
|
{
|
|
"entropy": 5.152590131759643,
|
|
"epoch": 1.7401536983669548,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004700872867942046,
|
|
"loss": 4.8657,
|
|
"mean_token_accuracy": 0.22904037982225417,
|
|
"num_tokens": 41504744.0,
|
|
"step": 18115
|
|
},
|
|
{
|
|
"entropy": 5.22120509147644,
|
|
"epoch": 1.7406340057636887,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.000470070204012664,
|
|
"loss": 4.9395,
|
|
"mean_token_accuracy": 0.22211443781852722,
|
|
"num_tokens": 41517129.0,
|
|
"step": 18120
|
|
},
|
|
{
|
|
"entropy": 5.2185193538665775,
|
|
"epoch": 1.7411143131604225,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047005311670217256,
|
|
"loss": 4.9401,
|
|
"mean_token_accuracy": 0.21087878495454787,
|
|
"num_tokens": 41529969.0,
|
|
"step": 18125
|
|
},
|
|
{
|
|
"entropy": 5.125943803787232,
|
|
"epoch": 1.7415946205571564,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047003602486312687,
|
|
"loss": 4.8841,
|
|
"mean_token_accuracy": 0.22201181203126907,
|
|
"num_tokens": 41541515.0,
|
|
"step": 18130
|
|
},
|
|
{
|
|
"entropy": 5.139989805221558,
|
|
"epoch": 1.7420749279538905,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004700189284959238,
|
|
"loss": 4.949,
|
|
"mean_token_accuracy": 0.21680050939321518,
|
|
"num_tokens": 41553045.0,
|
|
"step": 18135
|
|
},
|
|
{
|
|
"entropy": 5.198971176147461,
|
|
"epoch": 1.7425552353506244,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00047000182760096037,
|
|
"loss": 4.9805,
|
|
"mean_token_accuracy": 0.2192812144756317,
|
|
"num_tokens": 41565376.0,
|
|
"step": 18140
|
|
},
|
|
{
|
|
"entropy": 5.197419261932373,
|
|
"epoch": 1.7430355427473583,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004699847221786335,
|
|
"loss": 4.9399,
|
|
"mean_token_accuracy": 0.21375814825296402,
|
|
"num_tokens": 41577515.0,
|
|
"step": 18145
|
|
},
|
|
{
|
|
"entropy": 5.120383930206299,
|
|
"epoch": 1.7435158501440924,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004699676122293403,
|
|
"loss": 4.8522,
|
|
"mean_token_accuracy": 0.22192323207855225,
|
|
"num_tokens": 41588589.0,
|
|
"step": 18150
|
|
},
|
|
{
|
|
"entropy": 5.194192266464233,
|
|
"epoch": 1.7439961575408263,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004699504977534782,
|
|
"loss": 4.9965,
|
|
"mean_token_accuracy": 0.21457838714122773,
|
|
"num_tokens": 41600727.0,
|
|
"step": 18155
|
|
},
|
|
{
|
|
"entropy": 5.205707025527954,
|
|
"epoch": 1.7444764649375601,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004699333787514444,
|
|
"loss": 4.9063,
|
|
"mean_token_accuracy": 0.2258935034275055,
|
|
"num_tokens": 41612177.0,
|
|
"step": 18160
|
|
},
|
|
{
|
|
"entropy": 5.194634437561035,
|
|
"epoch": 1.744956772334294,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004699162552236363,
|
|
"loss": 4.9316,
|
|
"mean_token_accuracy": 0.2233037084341049,
|
|
"num_tokens": 41623749.0,
|
|
"step": 18165
|
|
},
|
|
{
|
|
"entropy": 5.256892395019531,
|
|
"epoch": 1.745437079731028,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046989912717045165,
|
|
"loss": 4.9596,
|
|
"mean_token_accuracy": 0.21816200911998748,
|
|
"num_tokens": 41635549.0,
|
|
"step": 18170
|
|
},
|
|
{
|
|
"entropy": 5.236298227310181,
|
|
"epoch": 1.7459173871277618,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046988199459228793,
|
|
"loss": 5.0126,
|
|
"mean_token_accuracy": 0.21799473017454146,
|
|
"num_tokens": 41647219.0,
|
|
"step": 18175
|
|
},
|
|
{
|
|
"entropy": 5.22831883430481,
|
|
"epoch": 1.7463976945244957,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004698648574895429,
|
|
"loss": 4.9618,
|
|
"mean_token_accuracy": 0.21960075348615646,
|
|
"num_tokens": 41658850.0,
|
|
"step": 18180
|
|
},
|
|
{
|
|
"entropy": 5.239414978027344,
|
|
"epoch": 1.7468780019212296,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046984771586261465,
|
|
"loss": 4.9616,
|
|
"mean_token_accuracy": 0.2154267430305481,
|
|
"num_tokens": 41669304.0,
|
|
"step": 18185
|
|
},
|
|
{
|
|
"entropy": 5.238613176345825,
|
|
"epoch": 1.7473583093179634,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000469830569711901,
|
|
"loss": 4.9846,
|
|
"mean_token_accuracy": 0.22659707218408584,
|
|
"num_tokens": 41681652.0,
|
|
"step": 18190
|
|
},
|
|
{
|
|
"entropy": 5.21958441734314,
|
|
"epoch": 1.7478386167146973,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004698134190377999,
|
|
"loss": 4.9527,
|
|
"mean_token_accuracy": 0.21284282505512236,
|
|
"num_tokens": 41692548.0,
|
|
"step": 18195
|
|
},
|
|
{
|
|
"entropy": 5.184736871719361,
|
|
"epoch": 1.7483189241114312,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046979626384070983,
|
|
"loss": 4.9079,
|
|
"mean_token_accuracy": 0.21957321614027023,
|
|
"num_tokens": 41704528.0,
|
|
"step": 18200
|
|
},
|
|
{
|
|
"entropy": 5.1673534393310545,
|
|
"epoch": 1.748799231508165,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000469779104121029,
|
|
"loss": 4.9325,
|
|
"mean_token_accuracy": 0.2225162521004677,
|
|
"num_tokens": 41717314.0,
|
|
"step": 18205
|
|
},
|
|
{
|
|
"entropy": 5.260458278656006,
|
|
"epoch": 1.7492795389048992,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046976193987915553,
|
|
"loss": 4.9965,
|
|
"mean_token_accuracy": 0.21652406752109526,
|
|
"num_tokens": 41729193.0,
|
|
"step": 18210
|
|
},
|
|
{
|
|
"entropy": 5.199534273147583,
|
|
"epoch": 1.749759846301633,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004697447711154883,
|
|
"loss": 4.8802,
|
|
"mean_token_accuracy": 0.2234228655695915,
|
|
"num_tokens": 41741208.0,
|
|
"step": 18215
|
|
},
|
|
{
|
|
"entropy": 5.170067024230957,
|
|
"epoch": 1.750240153698367,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046972759783042576,
|
|
"loss": 4.8981,
|
|
"mean_token_accuracy": 0.22203250229358673,
|
|
"num_tokens": 41752792.0,
|
|
"step": 18220
|
|
},
|
|
{
|
|
"entropy": 5.188149499893188,
|
|
"epoch": 1.7507204610951008,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004697104200243666,
|
|
"loss": 4.903,
|
|
"mean_token_accuracy": 0.2209831014275551,
|
|
"num_tokens": 41763629.0,
|
|
"step": 18225
|
|
},
|
|
{
|
|
"entropy": 5.1191747188568115,
|
|
"epoch": 1.751200768491835,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004696932376977096,
|
|
"loss": 4.8712,
|
|
"mean_token_accuracy": 0.22894158214330673,
|
|
"num_tokens": 41774502.0,
|
|
"step": 18230
|
|
},
|
|
{
|
|
"entropy": 5.180744075775147,
|
|
"epoch": 1.7516810758885688,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004696760508508538,
|
|
"loss": 4.9337,
|
|
"mean_token_accuracy": 0.2156649187207222,
|
|
"num_tokens": 41785700.0,
|
|
"step": 18235
|
|
},
|
|
{
|
|
"entropy": 5.124039554595948,
|
|
"epoch": 1.7521613832853027,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046965885948419814,
|
|
"loss": 4.8808,
|
|
"mean_token_accuracy": 0.2154080703854561,
|
|
"num_tokens": 41797347.0,
|
|
"step": 18240
|
|
},
|
|
{
|
|
"entropy": 5.144995784759521,
|
|
"epoch": 1.7526416906820366,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004696416635981418,
|
|
"loss": 4.9778,
|
|
"mean_token_accuracy": 0.21650518029928206,
|
|
"num_tokens": 41810866.0,
|
|
"step": 18245
|
|
},
|
|
{
|
|
"entropy": 5.219710350036621,
|
|
"epoch": 1.7531219980787704,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000469624463193084,
|
|
"loss": 4.9528,
|
|
"mean_token_accuracy": 0.21880155354738234,
|
|
"num_tokens": 41821786.0,
|
|
"step": 18250
|
|
},
|
|
{
|
|
"entropy": 5.1789576530456545,
|
|
"epoch": 1.7536023054755043,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000469607258269424,
|
|
"loss": 4.9279,
|
|
"mean_token_accuracy": 0.2151848182082176,
|
|
"num_tokens": 41833591.0,
|
|
"step": 18255
|
|
},
|
|
{
|
|
"entropy": 5.148035192489624,
|
|
"epoch": 1.7540826128722382,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004695900488275614,
|
|
"loss": 4.8489,
|
|
"mean_token_accuracy": 0.22885044515132905,
|
|
"num_tokens": 41844659.0,
|
|
"step": 18260
|
|
},
|
|
{
|
|
"entropy": 5.111026906967163,
|
|
"epoch": 1.754562920268972,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004695728348678957,
|
|
"loss": 4.882,
|
|
"mean_token_accuracy": 0.22123366296291352,
|
|
"num_tokens": 41856378.0,
|
|
"step": 18265
|
|
},
|
|
{
|
|
"entropy": 5.129684257507324,
|
|
"epoch": 1.755043227665706,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004695556163908265,
|
|
"loss": 4.8941,
|
|
"mean_token_accuracy": 0.21303319483995437,
|
|
"num_tokens": 41867624.0,
|
|
"step": 18270
|
|
},
|
|
{
|
|
"entropy": 5.219369792938233,
|
|
"epoch": 1.7555235350624399,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004695383933967536,
|
|
"loss": 4.9177,
|
|
"mean_token_accuracy": 0.2267256498336792,
|
|
"num_tokens": 41879250.0,
|
|
"step": 18275
|
|
},
|
|
{
|
|
"entropy": 5.2192254066467285,
|
|
"epoch": 1.7560038424591737,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046952116588607694,
|
|
"loss": 4.9981,
|
|
"mean_token_accuracy": 0.21951815187931062,
|
|
"num_tokens": 41890713.0,
|
|
"step": 18280
|
|
},
|
|
{
|
|
"entropy": 5.2056262493133545,
|
|
"epoch": 1.7564841498559076,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004695039338591963,
|
|
"loss": 4.9828,
|
|
"mean_token_accuracy": 0.2174760267138481,
|
|
"num_tokens": 41902924.0,
|
|
"step": 18285
|
|
},
|
|
{
|
|
"entropy": 5.23514404296875,
|
|
"epoch": 1.7569644572526417,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000469486697316512,
|
|
"loss": 4.9828,
|
|
"mean_token_accuracy": 0.21504862755537033,
|
|
"num_tokens": 41914498.0,
|
|
"step": 18290
|
|
},
|
|
{
|
|
"entropy": 5.156249570846557,
|
|
"epoch": 1.7574447646493756,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000469469456258424,
|
|
"loss": 4.8904,
|
|
"mean_token_accuracy": 0.22041202187538148,
|
|
"num_tokens": 41925258.0,
|
|
"step": 18295
|
|
},
|
|
{
|
|
"entropy": 5.142882680892944,
|
|
"epoch": 1.7579250720461095,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004694522106853327,
|
|
"loss": 4.9446,
|
|
"mean_token_accuracy": 0.21827390491962434,
|
|
"num_tokens": 41938176.0,
|
|
"step": 18300
|
|
},
|
|
{
|
|
"entropy": 5.182098150253296,
|
|
"epoch": 1.7584053794428436,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046943496059763845,
|
|
"loss": 4.7877,
|
|
"mean_token_accuracy": 0.23012328147888184,
|
|
"num_tokens": 41949160.0,
|
|
"step": 18305
|
|
},
|
|
{
|
|
"entropy": 5.17469048500061,
|
|
"epoch": 1.7588856868395775,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046941770599574176,
|
|
"loss": 4.9748,
|
|
"mean_token_accuracy": 0.22138626724481583,
|
|
"num_tokens": 41960981.0,
|
|
"step": 18310
|
|
},
|
|
{
|
|
"entropy": 5.136798906326294,
|
|
"epoch": 1.7593659942363113,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004694004468800433,
|
|
"loss": 4.9173,
|
|
"mean_token_accuracy": 0.2229830577969551,
|
|
"num_tokens": 41971755.0,
|
|
"step": 18315
|
|
},
|
|
{
|
|
"entropy": 5.1424705505371096,
|
|
"epoch": 1.7598463016330452,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004693831832509437,
|
|
"loss": 4.9102,
|
|
"mean_token_accuracy": 0.22170276194810867,
|
|
"num_tokens": 41984337.0,
|
|
"step": 18320
|
|
},
|
|
{
|
|
"entropy": 5.138628149032593,
|
|
"epoch": 1.760326609029779,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046936591510884375,
|
|
"loss": 4.9208,
|
|
"mean_token_accuracy": 0.2221353381872177,
|
|
"num_tokens": 41995648.0,
|
|
"step": 18325
|
|
},
|
|
{
|
|
"entropy": 5.172077274322509,
|
|
"epoch": 1.760806916426513,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00046934864245414443,
|
|
"loss": 4.8644,
|
|
"mean_token_accuracy": 0.22787528187036515,
|
|
"num_tokens": 42006995.0,
|
|
"step": 18330
|
|
},
|
|
{
|
|
"entropy": 5.261659097671509,
|
|
"epoch": 1.7612872238232469,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046933136528724676,
|
|
"loss": 5.0065,
|
|
"mean_token_accuracy": 0.21258640736341478,
|
|
"num_tokens": 42020074.0,
|
|
"step": 18335
|
|
},
|
|
{
|
|
"entropy": 5.150201749801636,
|
|
"epoch": 1.7617675312199808,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004693140836085518,
|
|
"loss": 4.8376,
|
|
"mean_token_accuracy": 0.22797610759735107,
|
|
"num_tokens": 42032053.0,
|
|
"step": 18340
|
|
},
|
|
{
|
|
"entropy": 5.1066876411437985,
|
|
"epoch": 1.7622478386167146,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046929679741846076,
|
|
"loss": 4.8674,
|
|
"mean_token_accuracy": 0.22894890010356903,
|
|
"num_tokens": 42042223.0,
|
|
"step": 18345
|
|
},
|
|
{
|
|
"entropy": 5.165579700469971,
|
|
"epoch": 1.7627281460134485,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00046927950671737505,
|
|
"loss": 4.9458,
|
|
"mean_token_accuracy": 0.22484788596630095,
|
|
"num_tokens": 42053988.0,
|
|
"step": 18350
|
|
},
|
|
{
|
|
"entropy": 5.1154531955719,
|
|
"epoch": 1.7632084534101824,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046926221150569617,
|
|
"loss": 4.8331,
|
|
"mean_token_accuracy": 0.22157266587018967,
|
|
"num_tokens": 42064451.0,
|
|
"step": 18355
|
|
},
|
|
{
|
|
"entropy": 5.160877132415772,
|
|
"epoch": 1.7636887608069163,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004692449117838255,
|
|
"loss": 5.0066,
|
|
"mean_token_accuracy": 0.21049903929233552,
|
|
"num_tokens": 42076503.0,
|
|
"step": 18360
|
|
},
|
|
{
|
|
"entropy": 5.189673995971679,
|
|
"epoch": 1.7641690682036504,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004692276075521648,
|
|
"loss": 4.9317,
|
|
"mean_token_accuracy": 0.21206413209438324,
|
|
"num_tokens": 42088341.0,
|
|
"step": 18365
|
|
},
|
|
{
|
|
"entropy": 5.185642528533935,
|
|
"epoch": 1.7646493756003843,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004692102988111158,
|
|
"loss": 4.891,
|
|
"mean_token_accuracy": 0.2198364794254303,
|
|
"num_tokens": 42099862.0,
|
|
"step": 18370
|
|
},
|
|
{
|
|
"entropy": 5.2197236061096195,
|
|
"epoch": 1.7651296829971181,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046919298556108023,
|
|
"loss": 5.0118,
|
|
"mean_token_accuracy": 0.21568115353584288,
|
|
"num_tokens": 42112117.0,
|
|
"step": 18375
|
|
},
|
|
{
|
|
"entropy": 5.237560224533081,
|
|
"epoch": 1.765609990393852,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046917566780246036,
|
|
"loss": 4.9319,
|
|
"mean_token_accuracy": 0.2160506397485733,
|
|
"num_tokens": 42123093.0,
|
|
"step": 18380
|
|
},
|
|
{
|
|
"entropy": 5.233705615997314,
|
|
"epoch": 1.7660902977905861,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046915834553565793,
|
|
"loss": 5.0065,
|
|
"mean_token_accuracy": 0.218734946846962,
|
|
"num_tokens": 42135266.0,
|
|
"step": 18385
|
|
},
|
|
{
|
|
"entropy": 5.329632234573364,
|
|
"epoch": 1.76657060518732,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004691410187610753,
|
|
"loss": 5.0506,
|
|
"mean_token_accuracy": 0.21428396850824355,
|
|
"num_tokens": 42145743.0,
|
|
"step": 18390
|
|
},
|
|
{
|
|
"entropy": 5.15242190361023,
|
|
"epoch": 1.767050912584054,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046912368747911465,
|
|
"loss": 4.8459,
|
|
"mean_token_accuracy": 0.22501615881919862,
|
|
"num_tokens": 42157604.0,
|
|
"step": 18395
|
|
},
|
|
{
|
|
"entropy": 5.172465991973877,
|
|
"epoch": 1.7675312199807878,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00046910635169017845,
|
|
"loss": 4.8578,
|
|
"mean_token_accuracy": 0.2229616954922676,
|
|
"num_tokens": 42168738.0,
|
|
"step": 18400
|
|
},
|
|
{
|
|
"entropy": 5.202865123748779,
|
|
"epoch": 1.7680115273775217,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004690890113946691,
|
|
"loss": 4.9664,
|
|
"mean_token_accuracy": 0.21324502676725388,
|
|
"num_tokens": 42179699.0,
|
|
"step": 18405
|
|
},
|
|
{
|
|
"entropy": 5.189118957519531,
|
|
"epoch": 1.7684918347742555,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004690716665929893,
|
|
"loss": 4.9179,
|
|
"mean_token_accuracy": 0.2131508007645607,
|
|
"num_tokens": 42190512.0,
|
|
"step": 18410
|
|
},
|
|
{
|
|
"entropy": 5.244009923934937,
|
|
"epoch": 1.7689721421709894,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046905431728554164,
|
|
"loss": 4.968,
|
|
"mean_token_accuracy": 0.213258358836174,
|
|
"num_tokens": 42200996.0,
|
|
"step": 18415
|
|
},
|
|
{
|
|
"entropy": 5.138740158081054,
|
|
"epoch": 1.7694524495677233,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046903696347272894,
|
|
"loss": 4.8941,
|
|
"mean_token_accuracy": 0.22915413826704026,
|
|
"num_tokens": 42213032.0,
|
|
"step": 18420
|
|
},
|
|
{
|
|
"entropy": 5.151826953887939,
|
|
"epoch": 1.7699327569644572,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046901960515495413,
|
|
"loss": 4.9574,
|
|
"mean_token_accuracy": 0.213711653649807,
|
|
"num_tokens": 42224366.0,
|
|
"step": 18425
|
|
},
|
|
{
|
|
"entropy": 5.080650377273559,
|
|
"epoch": 1.770413064361191,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004690022423326202,
|
|
"loss": 4.8162,
|
|
"mean_token_accuracy": 0.2286729708313942,
|
|
"num_tokens": 42235540.0,
|
|
"step": 18430
|
|
},
|
|
{
|
|
"entropy": 5.1232880592346195,
|
|
"epoch": 1.770893371757925,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004689848750061303,
|
|
"loss": 4.8721,
|
|
"mean_token_accuracy": 0.22594636976718901,
|
|
"num_tokens": 42246755.0,
|
|
"step": 18435
|
|
},
|
|
{
|
|
"entropy": 5.144126129150391,
|
|
"epoch": 1.7713736791546588,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004689675031758876,
|
|
"loss": 4.8891,
|
|
"mean_token_accuracy": 0.22070587277412415,
|
|
"num_tokens": 42259536.0,
|
|
"step": 18440
|
|
},
|
|
{
|
|
"entropy": 5.184299182891846,
|
|
"epoch": 1.771853986551393,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004689501268422954,
|
|
"loss": 4.88,
|
|
"mean_token_accuracy": 0.2211346685886383,
|
|
"num_tokens": 42270813.0,
|
|
"step": 18445
|
|
},
|
|
{
|
|
"entropy": 5.265690517425537,
|
|
"epoch": 1.7723342939481268,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046893274600575725,
|
|
"loss": 5.0345,
|
|
"mean_token_accuracy": 0.21505656242370605,
|
|
"num_tokens": 42281953.0,
|
|
"step": 18450
|
|
},
|
|
{
|
|
"entropy": 5.187439250946045,
|
|
"epoch": 1.7728146013448607,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004689153606666765,
|
|
"loss": 5.0476,
|
|
"mean_token_accuracy": 0.20950031727552415,
|
|
"num_tokens": 42294041.0,
|
|
"step": 18455
|
|
},
|
|
{
|
|
"entropy": 5.180276155471802,
|
|
"epoch": 1.7732949087415946,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000468897970825457,
|
|
"loss": 4.8725,
|
|
"mean_token_accuracy": 0.22297078520059585,
|
|
"num_tokens": 42305812.0,
|
|
"step": 18460
|
|
},
|
|
{
|
|
"entropy": 5.162807846069336,
|
|
"epoch": 1.7737752161383287,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046888057648250233,
|
|
"loss": 4.8246,
|
|
"mean_token_accuracy": 0.22508549243211745,
|
|
"num_tokens": 42317585.0,
|
|
"step": 18465
|
|
},
|
|
{
|
|
"entropy": 5.214176034927368,
|
|
"epoch": 1.7742555235350626,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004688631776382164,
|
|
"loss": 4.9353,
|
|
"mean_token_accuracy": 0.2175424426794052,
|
|
"num_tokens": 42328723.0,
|
|
"step": 18470
|
|
},
|
|
{
|
|
"entropy": 5.142759513854981,
|
|
"epoch": 1.7747358309317964,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046884577429300305,
|
|
"loss": 4.9217,
|
|
"mean_token_accuracy": 0.22638531923294067,
|
|
"num_tokens": 42340741.0,
|
|
"step": 18475
|
|
},
|
|
{
|
|
"entropy": 5.177513170242309,
|
|
"epoch": 1.7752161383285303,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004688283664472665,
|
|
"loss": 4.9653,
|
|
"mean_token_accuracy": 0.22027941346168517,
|
|
"num_tokens": 42351504.0,
|
|
"step": 18480
|
|
},
|
|
{
|
|
"entropy": 5.203497743606567,
|
|
"epoch": 1.7756964457252642,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046881095410141084,
|
|
"loss": 4.9315,
|
|
"mean_token_accuracy": 0.2178051844239235,
|
|
"num_tokens": 42362920.0,
|
|
"step": 18485
|
|
},
|
|
{
|
|
"entropy": 5.117229461669922,
|
|
"epoch": 1.776176753121998,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046879353725584036,
|
|
"loss": 4.8404,
|
|
"mean_token_accuracy": 0.22348118722438812,
|
|
"num_tokens": 42374523.0,
|
|
"step": 18490
|
|
},
|
|
{
|
|
"entropy": 5.152104187011719,
|
|
"epoch": 1.776657060518732,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046877611591095923,
|
|
"loss": 4.9442,
|
|
"mean_token_accuracy": 0.21657546162605285,
|
|
"num_tokens": 42384988.0,
|
|
"step": 18495
|
|
},
|
|
{
|
|
"entropy": 5.063777208328247,
|
|
"epoch": 1.7771373679154658,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046875869006717224,
|
|
"loss": 4.8488,
|
|
"mean_token_accuracy": 0.22488499134778978,
|
|
"num_tokens": 42396503.0,
|
|
"step": 18500
|
|
},
|
|
{
|
|
"entropy": 5.220582485198975,
|
|
"epoch": 1.7776176753121997,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046874125972488375,
|
|
"loss": 4.9486,
|
|
"mean_token_accuracy": 0.2133714646100998,
|
|
"num_tokens": 42406877.0,
|
|
"step": 18505
|
|
},
|
|
{
|
|
"entropy": 5.186308240890503,
|
|
"epoch": 1.7780979827089336,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046872382488449853,
|
|
"loss": 4.89,
|
|
"mean_token_accuracy": 0.2266918882727623,
|
|
"num_tokens": 42418010.0,
|
|
"step": 18510
|
|
},
|
|
{
|
|
"entropy": 5.1456788063049315,
|
|
"epoch": 1.7785782901056675,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046870638554642133,
|
|
"loss": 4.8567,
|
|
"mean_token_accuracy": 0.22165956050157548,
|
|
"num_tokens": 42429056.0,
|
|
"step": 18515
|
|
},
|
|
{
|
|
"entropy": 5.1068305492401125,
|
|
"epoch": 1.7790585975024016,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046868894171105704,
|
|
"loss": 4.8752,
|
|
"mean_token_accuracy": 0.2224562093615532,
|
|
"num_tokens": 42442294.0,
|
|
"step": 18520
|
|
},
|
|
{
|
|
"entropy": 5.205835819244385,
|
|
"epoch": 1.7795389048991355,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004686714933788107,
|
|
"loss": 4.9486,
|
|
"mean_token_accuracy": 0.21565259397029876,
|
|
"num_tokens": 42454064.0,
|
|
"step": 18525
|
|
},
|
|
{
|
|
"entropy": 5.19152512550354,
|
|
"epoch": 1.7800192122958693,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004686540405500873,
|
|
"loss": 4.9295,
|
|
"mean_token_accuracy": 0.22629985958337784,
|
|
"num_tokens": 42465289.0,
|
|
"step": 18530
|
|
},
|
|
{
|
|
"entropy": 5.114503574371338,
|
|
"epoch": 1.7804995196926032,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004686365832252922,
|
|
"loss": 4.8308,
|
|
"mean_token_accuracy": 0.22398976534605025,
|
|
"num_tokens": 42476662.0,
|
|
"step": 18535
|
|
},
|
|
{
|
|
"entropy": 5.182208681106568,
|
|
"epoch": 1.7809798270893373,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046861912140483056,
|
|
"loss": 4.8944,
|
|
"mean_token_accuracy": 0.22320135533809662,
|
|
"num_tokens": 42486893.0,
|
|
"step": 18540
|
|
},
|
|
{
|
|
"entropy": 5.151525783538818,
|
|
"epoch": 1.7814601344860712,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046860165508910787,
|
|
"loss": 4.8671,
|
|
"mean_token_accuracy": 0.2246992588043213,
|
|
"num_tokens": 42498202.0,
|
|
"step": 18545
|
|
},
|
|
{
|
|
"entropy": 5.105840873718262,
|
|
"epoch": 1.781940441882805,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004685841842785296,
|
|
"loss": 4.7625,
|
|
"mean_token_accuracy": 0.22927410155534744,
|
|
"num_tokens": 42509378.0,
|
|
"step": 18550
|
|
},
|
|
{
|
|
"entropy": 5.1813897609710695,
|
|
"epoch": 1.782420749279539,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004685667089735014,
|
|
"loss": 4.9547,
|
|
"mean_token_accuracy": 0.22281541526317597,
|
|
"num_tokens": 42519955.0,
|
|
"step": 18555
|
|
},
|
|
{
|
|
"entropy": 5.25247893333435,
|
|
"epoch": 1.7829010566762729,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00046854922917442907,
|
|
"loss": 5.0055,
|
|
"mean_token_accuracy": 0.21443188935518265,
|
|
"num_tokens": 42531896.0,
|
|
"step": 18560
|
|
},
|
|
{
|
|
"entropy": 5.18243556022644,
|
|
"epoch": 1.7833813640730067,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004685317448817182,
|
|
"loss": 4.9004,
|
|
"mean_token_accuracy": 0.22561157047748565,
|
|
"num_tokens": 42542123.0,
|
|
"step": 18565
|
|
},
|
|
{
|
|
"entropy": 5.176688098907471,
|
|
"epoch": 1.7838616714697406,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004685142560957751,
|
|
"loss": 4.9098,
|
|
"mean_token_accuracy": 0.21898285746574403,
|
|
"num_tokens": 42554034.0,
|
|
"step": 18570
|
|
},
|
|
{
|
|
"entropy": 5.121625852584839,
|
|
"epoch": 1.7843419788664745,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004684967628170054,
|
|
"loss": 4.8628,
|
|
"mean_token_accuracy": 0.22410739213228226,
|
|
"num_tokens": 42564532.0,
|
|
"step": 18575
|
|
},
|
|
{
|
|
"entropy": 5.121355819702148,
|
|
"epoch": 1.7848222862632084,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00046847926504581553,
|
|
"loss": 4.8422,
|
|
"mean_token_accuracy": 0.21688616573810576,
|
|
"num_tokens": 42576864.0,
|
|
"step": 18580
|
|
},
|
|
{
|
|
"entropy": 5.106497955322266,
|
|
"epoch": 1.7853025936599423,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004684617627826116,
|
|
"loss": 4.8606,
|
|
"mean_token_accuracy": 0.22770557105541228,
|
|
"num_tokens": 42588571.0,
|
|
"step": 18585
|
|
},
|
|
{
|
|
"entropy": 5.218504762649536,
|
|
"epoch": 1.7857829010566761,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004684442560278001,
|
|
"loss": 4.9925,
|
|
"mean_token_accuracy": 0.20908130556344987,
|
|
"num_tokens": 42600912.0,
|
|
"step": 18590
|
|
},
|
|
{
|
|
"entropy": 5.226399898529053,
|
|
"epoch": 1.78626320845341,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046842674478178727,
|
|
"loss": 4.8523,
|
|
"mean_token_accuracy": 0.22524797022342682,
|
|
"num_tokens": 42612929.0,
|
|
"step": 18595
|
|
},
|
|
{
|
|
"entropy": 5.167687702178955,
|
|
"epoch": 1.7867435158501441,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004684092290449798,
|
|
"loss": 4.9383,
|
|
"mean_token_accuracy": 0.22270502150058746,
|
|
"num_tokens": 42623688.0,
|
|
"step": 18600
|
|
},
|
|
{
|
|
"entropy": 5.221596956253052,
|
|
"epoch": 1.787223823246878,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004683917088177844,
|
|
"loss": 5.0482,
|
|
"mean_token_accuracy": 0.20961541533470154,
|
|
"num_tokens": 42635773.0,
|
|
"step": 18605
|
|
},
|
|
{
|
|
"entropy": 5.200922203063965,
|
|
"epoch": 1.7877041306436119,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004683741841006077,
|
|
"loss": 4.8933,
|
|
"mean_token_accuracy": 0.2215485990047455,
|
|
"num_tokens": 42647210.0,
|
|
"step": 18610
|
|
},
|
|
{
|
|
"entropy": 5.1452563285827635,
|
|
"epoch": 1.7881844380403458,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004683566548938567,
|
|
"loss": 4.9286,
|
|
"mean_token_accuracy": 0.21941764205694197,
|
|
"num_tokens": 42658029.0,
|
|
"step": 18615
|
|
},
|
|
{
|
|
"entropy": 5.139180374145508,
|
|
"epoch": 1.7886647454370799,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004683391211979383,
|
|
"loss": 4.9548,
|
|
"mean_token_accuracy": 0.21801802963018418,
|
|
"num_tokens": 42669327.0,
|
|
"step": 18620
|
|
},
|
|
{
|
|
"entropy": 5.2122523307800295,
|
|
"epoch": 1.7891450528338138,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004683215830132597,
|
|
"loss": 4.8596,
|
|
"mean_token_accuracy": 0.22672210782766342,
|
|
"num_tokens": 42679794.0,
|
|
"step": 18625
|
|
},
|
|
{
|
|
"entropy": 5.142408990859986,
|
|
"epoch": 1.7896253602305476,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00046830404034022786,
|
|
"loss": 4.8864,
|
|
"mean_token_accuracy": 0.23015541732311248,
|
|
"num_tokens": 42689838.0,
|
|
"step": 18630
|
|
},
|
|
{
|
|
"entropy": 5.161951875686645,
|
|
"epoch": 1.7901056676272815,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004682864931792502,
|
|
"loss": 4.9635,
|
|
"mean_token_accuracy": 0.2196178674697876,
|
|
"num_tokens": 42700935.0,
|
|
"step": 18635
|
|
},
|
|
{
|
|
"entropy": 5.285586929321289,
|
|
"epoch": 1.7905859750240154,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004682689415307342,
|
|
"loss": 5.0287,
|
|
"mean_token_accuracy": 0.20989848375320436,
|
|
"num_tokens": 42712533.0,
|
|
"step": 18640
|
|
},
|
|
{
|
|
"entropy": 5.190979862213135,
|
|
"epoch": 1.7910662824207493,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004682513853950872,
|
|
"loss": 4.9081,
|
|
"mean_token_accuracy": 0.22632770985364914,
|
|
"num_tokens": 42723604.0,
|
|
"step": 18645
|
|
},
|
|
{
|
|
"entropy": 5.207202386856079,
|
|
"epoch": 1.7915465898174832,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000468233824772717,
|
|
"loss": 4.9157,
|
|
"mean_token_accuracy": 0.21897136121988298,
|
|
"num_tokens": 42735957.0,
|
|
"step": 18650
|
|
},
|
|
{
|
|
"entropy": 5.185697603225708,
|
|
"epoch": 1.792026897214217,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000468216259664031,
|
|
"loss": 5.0322,
|
|
"mean_token_accuracy": 0.2145911380648613,
|
|
"num_tokens": 42747287.0,
|
|
"step": 18655
|
|
},
|
|
{
|
|
"entropy": 5.173118257522583,
|
|
"epoch": 1.792507204610951,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046819869006943727,
|
|
"loss": 4.9094,
|
|
"mean_token_accuracy": 0.22061461806297303,
|
|
"num_tokens": 42759270.0,
|
|
"step": 18660
|
|
},
|
|
{
|
|
"entropy": 5.142519521713257,
|
|
"epoch": 1.7929875120076848,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004681811159893436,
|
|
"loss": 4.8533,
|
|
"mean_token_accuracy": 0.22513288110494614,
|
|
"num_tokens": 42771955.0,
|
|
"step": 18665
|
|
},
|
|
{
|
|
"entropy": 5.2744043350219725,
|
|
"epoch": 1.7934678194044187,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00046816353742415814,
|
|
"loss": 5.0293,
|
|
"mean_token_accuracy": 0.21436074376106262,
|
|
"num_tokens": 42784610.0,
|
|
"step": 18670
|
|
},
|
|
{
|
|
"entropy": 5.184707164764404,
|
|
"epoch": 1.7939481268011528,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046814595437428885,
|
|
"loss": 4.9121,
|
|
"mean_token_accuracy": 0.2221132293343544,
|
|
"num_tokens": 42795221.0,
|
|
"step": 18675
|
|
},
|
|
{
|
|
"entropy": 5.069939041137696,
|
|
"epoch": 1.7944284341978867,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000468128366840144,
|
|
"loss": 4.8812,
|
|
"mean_token_accuracy": 0.22387212961912156,
|
|
"num_tokens": 42807475.0,
|
|
"step": 18680
|
|
},
|
|
{
|
|
"entropy": 5.158438444137573,
|
|
"epoch": 1.7949087415946205,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000468110774822132,
|
|
"loss": 4.901,
|
|
"mean_token_accuracy": 0.22224035561084748,
|
|
"num_tokens": 42819193.0,
|
|
"step": 18685
|
|
},
|
|
{
|
|
"entropy": 5.226526880264283,
|
|
"epoch": 1.7953890489913544,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004680931783206612,
|
|
"loss": 4.9806,
|
|
"mean_token_accuracy": 0.22079339921474456,
|
|
"num_tokens": 42831267.0,
|
|
"step": 18690
|
|
},
|
|
{
|
|
"entropy": 5.199577331542969,
|
|
"epoch": 1.7958693563880885,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046807557733614014,
|
|
"loss": 4.994,
|
|
"mean_token_accuracy": 0.21518171280622483,
|
|
"num_tokens": 42843066.0,
|
|
"step": 18695
|
|
},
|
|
{
|
|
"entropy": 5.143007516860962,
|
|
"epoch": 1.7963496637848224,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046805797186897757,
|
|
"loss": 4.8699,
|
|
"mean_token_accuracy": 0.22779001146554947,
|
|
"num_tokens": 42854630.0,
|
|
"step": 18700
|
|
},
|
|
{
|
|
"entropy": 5.2355574607849125,
|
|
"epoch": 1.7968299711815563,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00046804036191958206,
|
|
"loss": 4.9618,
|
|
"mean_token_accuracy": 0.21646393537521363,
|
|
"num_tokens": 42865986.0,
|
|
"step": 18705
|
|
},
|
|
{
|
|
"entropy": 5.242122411727905,
|
|
"epoch": 1.7973102785782902,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046802274748836267,
|
|
"loss": 4.9731,
|
|
"mean_token_accuracy": 0.2180660679936409,
|
|
"num_tokens": 42877533.0,
|
|
"step": 18710
|
|
},
|
|
{
|
|
"entropy": 5.145714998245239,
|
|
"epoch": 1.797790585975024,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004680051285757281,
|
|
"loss": 4.8409,
|
|
"mean_token_accuracy": 0.2281106159090996,
|
|
"num_tokens": 42889114.0,
|
|
"step": 18715
|
|
},
|
|
{
|
|
"entropy": 5.230079174041748,
|
|
"epoch": 1.798270893371758,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004679875051820877,
|
|
"loss": 5.0519,
|
|
"mean_token_accuracy": 0.20809556990861894,
|
|
"num_tokens": 42899483.0,
|
|
"step": 18720
|
|
},
|
|
{
|
|
"entropy": 5.178544616699218,
|
|
"epoch": 1.7987512007684918,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004679698773078503,
|
|
"loss": 4.8805,
|
|
"mean_token_accuracy": 0.2276952013373375,
|
|
"num_tokens": 42910267.0,
|
|
"step": 18725
|
|
},
|
|
{
|
|
"entropy": 5.102669811248779,
|
|
"epoch": 1.7992315081652257,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046795224495342554,
|
|
"loss": 4.8994,
|
|
"mean_token_accuracy": 0.2255684345960617,
|
|
"num_tokens": 42922440.0,
|
|
"step": 18730
|
|
},
|
|
{
|
|
"entropy": 5.137301397323609,
|
|
"epoch": 1.7997118155619596,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046793460811922255,
|
|
"loss": 4.8559,
|
|
"mean_token_accuracy": 0.22756927013397216,
|
|
"num_tokens": 42933967.0,
|
|
"step": 18735
|
|
},
|
|
{
|
|
"entropy": 5.228575134277344,
|
|
"epoch": 1.8001921229586935,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046791696680565075,
|
|
"loss": 4.8842,
|
|
"mean_token_accuracy": 0.22466631084680558,
|
|
"num_tokens": 42945049.0,
|
|
"step": 18740
|
|
},
|
|
{
|
|
"entropy": 5.231136322021484,
|
|
"epoch": 1.8006724303554273,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046789932101312003,
|
|
"loss": 5.0187,
|
|
"mean_token_accuracy": 0.2174960657954216,
|
|
"num_tokens": 42956062.0,
|
|
"step": 18745
|
|
},
|
|
{
|
|
"entropy": 5.230274868011475,
|
|
"epoch": 1.8011527377521612,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004678816707420397,
|
|
"loss": 5.0032,
|
|
"mean_token_accuracy": 0.21903230547904967,
|
|
"num_tokens": 42967687.0,
|
|
"step": 18750
|
|
},
|
|
{
|
|
"entropy": 5.197688245773316,
|
|
"epoch": 1.8016330451488953,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004678640159928198,
|
|
"loss": 4.964,
|
|
"mean_token_accuracy": 0.21139907091856003,
|
|
"num_tokens": 42979916.0,
|
|
"step": 18755
|
|
},
|
|
{
|
|
"entropy": 5.182051801681519,
|
|
"epoch": 1.8021133525456292,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004678463567658701,
|
|
"loss": 4.957,
|
|
"mean_token_accuracy": 0.2161658376455307,
|
|
"num_tokens": 42991639.0,
|
|
"step": 18760
|
|
},
|
|
{
|
|
"entropy": 5.224612426757813,
|
|
"epoch": 1.802593659942363,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004678286930616006,
|
|
"loss": 4.9208,
|
|
"mean_token_accuracy": 0.2289966121315956,
|
|
"num_tokens": 43001843.0,
|
|
"step": 18765
|
|
},
|
|
{
|
|
"entropy": 5.08487868309021,
|
|
"epoch": 1.803073967339097,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004678110248804215,
|
|
"loss": 4.8759,
|
|
"mean_token_accuracy": 0.21512030959129333,
|
|
"num_tokens": 43013249.0,
|
|
"step": 18770
|
|
},
|
|
{
|
|
"entropy": 5.132952308654785,
|
|
"epoch": 1.803554274735831,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046779335222274293,
|
|
"loss": 4.9273,
|
|
"mean_token_accuracy": 0.22092564702033995,
|
|
"num_tokens": 43025867.0,
|
|
"step": 18775
|
|
},
|
|
{
|
|
"entropy": 5.224576234817505,
|
|
"epoch": 1.804034582132565,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046777567508897515,
|
|
"loss": 4.9973,
|
|
"mean_token_accuracy": 0.21442876756191254,
|
|
"num_tokens": 43037530.0,
|
|
"step": 18780
|
|
},
|
|
{
|
|
"entropy": 5.17865571975708,
|
|
"epoch": 1.8045148895292988,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046775799347952864,
|
|
"loss": 4.9551,
|
|
"mean_token_accuracy": 0.21771474480628966,
|
|
"num_tokens": 43048675.0,
|
|
"step": 18785
|
|
},
|
|
{
|
|
"entropy": 5.216196775436401,
|
|
"epoch": 1.8049951969260327,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004677403073948139,
|
|
"loss": 4.9717,
|
|
"mean_token_accuracy": 0.2206488221883774,
|
|
"num_tokens": 43059592.0,
|
|
"step": 18790
|
|
},
|
|
{
|
|
"entropy": 5.095714521408081,
|
|
"epoch": 1.8054755043227666,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004677226168352416,
|
|
"loss": 4.791,
|
|
"mean_token_accuracy": 0.22456269711256027,
|
|
"num_tokens": 43071755.0,
|
|
"step": 18795
|
|
},
|
|
{
|
|
"entropy": 5.183867025375366,
|
|
"epoch": 1.8059558117195005,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004677049218012223,
|
|
"loss": 4.9514,
|
|
"mean_token_accuracy": 0.22125904858112336,
|
|
"num_tokens": 43081801.0,
|
|
"step": 18800
|
|
},
|
|
{
|
|
"entropy": 5.173227787017822,
|
|
"epoch": 1.8064361191162344,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000467687222293167,
|
|
"loss": 4.8655,
|
|
"mean_token_accuracy": 0.22511634826660157,
|
|
"num_tokens": 43094160.0,
|
|
"step": 18805
|
|
},
|
|
{
|
|
"entropy": 5.154326152801514,
|
|
"epoch": 1.8069164265129682,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004676695183114866,
|
|
"loss": 4.9346,
|
|
"mean_token_accuracy": 0.22008128166198732,
|
|
"num_tokens": 43106082.0,
|
|
"step": 18810
|
|
},
|
|
{
|
|
"entropy": 5.140604019165039,
|
|
"epoch": 1.8073967339097021,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000467651809856592,
|
|
"loss": 4.9109,
|
|
"mean_token_accuracy": 0.21887325048446654,
|
|
"num_tokens": 43117215.0,
|
|
"step": 18815
|
|
},
|
|
{
|
|
"entropy": 5.256780433654785,
|
|
"epoch": 1.807877041306436,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046763409692889446,
|
|
"loss": 5.0994,
|
|
"mean_token_accuracy": 0.20832848697900772,
|
|
"num_tokens": 43127858.0,
|
|
"step": 18820
|
|
},
|
|
{
|
|
"entropy": 5.255016899108886,
|
|
"epoch": 1.8083573487031699,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046761637952880516,
|
|
"loss": 4.9017,
|
|
"mean_token_accuracy": 0.2215781033039093,
|
|
"num_tokens": 43139952.0,
|
|
"step": 18825
|
|
},
|
|
{
|
|
"entropy": 5.103308439254761,
|
|
"epoch": 1.808837656099904,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046759865765673555,
|
|
"loss": 4.8262,
|
|
"mean_token_accuracy": 0.22883502542972564,
|
|
"num_tokens": 43150900.0,
|
|
"step": 18830
|
|
},
|
|
{
|
|
"entropy": 5.1161055088043215,
|
|
"epoch": 1.8093179634966379,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000467580931313097,
|
|
"loss": 4.8714,
|
|
"mean_token_accuracy": 0.2249667078256607,
|
|
"num_tokens": 43163199.0,
|
|
"step": 18835
|
|
},
|
|
{
|
|
"entropy": 5.187123489379883,
|
|
"epoch": 1.8097982708933718,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046756320049830106,
|
|
"loss": 4.8581,
|
|
"mean_token_accuracy": 0.21825749725103377,
|
|
"num_tokens": 43174147.0,
|
|
"step": 18840
|
|
},
|
|
{
|
|
"entropy": 5.133212614059448,
|
|
"epoch": 1.8102785782901056,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004675454652127594,
|
|
"loss": 4.8336,
|
|
"mean_token_accuracy": 0.22265468090772628,
|
|
"num_tokens": 43186191.0,
|
|
"step": 18845
|
|
},
|
|
{
|
|
"entropy": 5.20296802520752,
|
|
"epoch": 1.8107588856868397,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00046752772545688377,
|
|
"loss": 5.0445,
|
|
"mean_token_accuracy": 0.2127169817686081,
|
|
"num_tokens": 43197541.0,
|
|
"step": 18850
|
|
},
|
|
{
|
|
"entropy": 5.243007707595825,
|
|
"epoch": 1.8112391930835736,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004675099812310861,
|
|
"loss": 4.9299,
|
|
"mean_token_accuracy": 0.2172359123826027,
|
|
"num_tokens": 43208792.0,
|
|
"step": 18855
|
|
},
|
|
{
|
|
"entropy": 5.255295085906982,
|
|
"epoch": 1.8117195004803075,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004674922325357782,
|
|
"loss": 5.0026,
|
|
"mean_token_accuracy": 0.21394715160131456,
|
|
"num_tokens": 43219914.0,
|
|
"step": 18860
|
|
},
|
|
{
|
|
"entropy": 5.067933750152588,
|
|
"epoch": 1.8121998078770414,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046747447937137235,
|
|
"loss": 4.8319,
|
|
"mean_token_accuracy": 0.22988979518413544,
|
|
"num_tokens": 43232179.0,
|
|
"step": 18865
|
|
},
|
|
{
|
|
"entropy": 5.12700834274292,
|
|
"epoch": 1.8126801152737753,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00046745672173828057,
|
|
"loss": 4.8656,
|
|
"mean_token_accuracy": 0.21539948731660843,
|
|
"num_tokens": 43243196.0,
|
|
"step": 18870
|
|
},
|
|
{
|
|
"entropy": 5.245073318481445,
|
|
"epoch": 1.8131604226705091,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004674389596369151,
|
|
"loss": 4.9478,
|
|
"mean_token_accuracy": 0.21452756077051163,
|
|
"num_tokens": 43255573.0,
|
|
"step": 18875
|
|
},
|
|
{
|
|
"entropy": 5.197284078598022,
|
|
"epoch": 1.813640730067243,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00046742119306768855,
|
|
"loss": 4.8845,
|
|
"mean_token_accuracy": 0.21725525557994843,
|
|
"num_tokens": 43267679.0,
|
|
"step": 18880
|
|
},
|
|
{
|
|
"entropy": 5.266736459732056,
|
|
"epoch": 1.814121037463977,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004674034220310132,
|
|
"loss": 5.02,
|
|
"mean_token_accuracy": 0.21002791672945023,
|
|
"num_tokens": 43278857.0,
|
|
"step": 18885
|
|
},
|
|
{
|
|
"entropy": 5.224592781066894,
|
|
"epoch": 1.8146013448607108,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00046738564652730176,
|
|
"loss": 4.9148,
|
|
"mean_token_accuracy": 0.2243320897221565,
|
|
"num_tokens": 43291099.0,
|
|
"step": 18890
|
|
},
|
|
{
|
|
"entropy": 5.180141496658325,
|
|
"epoch": 1.8150816522574447,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004673678665569669,
|
|
"loss": 4.9346,
|
|
"mean_token_accuracy": 0.21560515463352203,
|
|
"num_tokens": 43302863.0,
|
|
"step": 18895
|
|
},
|
|
{
|
|
"entropy": 5.17077054977417,
|
|
"epoch": 1.8155619596541785,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004673500821204213,
|
|
"loss": 4.8709,
|
|
"mean_token_accuracy": 0.22833613753318788,
|
|
"num_tokens": 43314673.0,
|
|
"step": 18900
|
|
},
|
|
{
|
|
"entropy": 5.162666893005371,
|
|
"epoch": 1.8160422670509124,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000467332293218078,
|
|
"loss": 4.8609,
|
|
"mean_token_accuracy": 0.22799091786146164,
|
|
"num_tokens": 43325927.0,
|
|
"step": 18905
|
|
},
|
|
{
|
|
"entropy": 5.155388593673706,
|
|
"epoch": 1.8165225744476465,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046731449985035,
|
|
"loss": 4.9259,
|
|
"mean_token_accuracy": 0.22295121848583221,
|
|
"num_tokens": 43337615.0,
|
|
"step": 18910
|
|
},
|
|
{
|
|
"entropy": 5.222216415405273,
|
|
"epoch": 1.8170028818443804,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046729670201765036,
|
|
"loss": 4.9945,
|
|
"mean_token_accuracy": 0.21877157241106032,
|
|
"num_tokens": 43349195.0,
|
|
"step": 18915
|
|
},
|
|
{
|
|
"entropy": 5.218001508712769,
|
|
"epoch": 1.8174831892411143,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046727889972039227,
|
|
"loss": 4.9506,
|
|
"mean_token_accuracy": 0.21841635107994078,
|
|
"num_tokens": 43361274.0,
|
|
"step": 18920
|
|
},
|
|
{
|
|
"entropy": 5.12210259437561,
|
|
"epoch": 1.8179634966378482,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046726109295898904,
|
|
"loss": 4.8873,
|
|
"mean_token_accuracy": 0.22357902377843858,
|
|
"num_tokens": 43372843.0,
|
|
"step": 18925
|
|
},
|
|
{
|
|
"entropy": 5.185367012023926,
|
|
"epoch": 1.8184438040345823,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004672432817338542,
|
|
"loss": 4.8715,
|
|
"mean_token_accuracy": 0.2250346526503563,
|
|
"num_tokens": 43383492.0,
|
|
"step": 18930
|
|
},
|
|
{
|
|
"entropy": 5.152243757247925,
|
|
"epoch": 1.8189241114313162,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046722546604540115,
|
|
"loss": 4.897,
|
|
"mean_token_accuracy": 0.2156965285539627,
|
|
"num_tokens": 43395669.0,
|
|
"step": 18935
|
|
},
|
|
{
|
|
"entropy": 5.071681165695191,
|
|
"epoch": 1.81940441882805,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004672076458940436,
|
|
"loss": 4.7988,
|
|
"mean_token_accuracy": 0.22689439207315446,
|
|
"num_tokens": 43407340.0,
|
|
"step": 18940
|
|
},
|
|
{
|
|
"entropy": 5.160733652114868,
|
|
"epoch": 1.819884726224784,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046718982128019534,
|
|
"loss": 4.9413,
|
|
"mean_token_accuracy": 0.21972607225179672,
|
|
"num_tokens": 43418607.0,
|
|
"step": 18945
|
|
},
|
|
{
|
|
"entropy": 5.311149311065674,
|
|
"epoch": 1.8203650336215178,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046717199220427003,
|
|
"loss": 5.0566,
|
|
"mean_token_accuracy": 0.21264605075120926,
|
|
"num_tokens": 43429286.0,
|
|
"step": 18950
|
|
},
|
|
{
|
|
"entropy": 5.150112533569336,
|
|
"epoch": 1.8208453410182517,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046715415866668163,
|
|
"loss": 4.8894,
|
|
"mean_token_accuracy": 0.22030486166477203,
|
|
"num_tokens": 43440794.0,
|
|
"step": 18955
|
|
},
|
|
{
|
|
"entropy": 5.130909872055054,
|
|
"epoch": 1.8213256484149856,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004671363206678443,
|
|
"loss": 4.9403,
|
|
"mean_token_accuracy": 0.2197520062327385,
|
|
"num_tokens": 43452184.0,
|
|
"step": 18960
|
|
},
|
|
{
|
|
"entropy": 5.096556758880615,
|
|
"epoch": 1.8218059558117194,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046711847820817215,
|
|
"loss": 4.8894,
|
|
"mean_token_accuracy": 0.22419148236513137,
|
|
"num_tokens": 43463361.0,
|
|
"step": 18965
|
|
},
|
|
{
|
|
"entropy": 5.205463838577271,
|
|
"epoch": 1.8222862632084533,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004671006312880794,
|
|
"loss": 4.9319,
|
|
"mean_token_accuracy": 0.21770550161600113,
|
|
"num_tokens": 43474802.0,
|
|
"step": 18970
|
|
},
|
|
{
|
|
"entropy": 5.156509208679199,
|
|
"epoch": 1.8227665706051872,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004670827799079805,
|
|
"loss": 4.8782,
|
|
"mean_token_accuracy": 0.221728877723217,
|
|
"num_tokens": 43486962.0,
|
|
"step": 18975
|
|
},
|
|
{
|
|
"entropy": 5.206051826477051,
|
|
"epoch": 1.823246878001921,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046706492406828966,
|
|
"loss": 4.9016,
|
|
"mean_token_accuracy": 0.223107148706913,
|
|
"num_tokens": 43498761.0,
|
|
"step": 18980
|
|
},
|
|
{
|
|
"entropy": 5.212453365325928,
|
|
"epoch": 1.8237271853986552,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004670470637694217,
|
|
"loss": 4.9724,
|
|
"mean_token_accuracy": 0.22523002177476883,
|
|
"num_tokens": 43511294.0,
|
|
"step": 18985
|
|
},
|
|
{
|
|
"entropy": 5.180626010894775,
|
|
"epoch": 1.824207492795389,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004670291990117912,
|
|
"loss": 4.8959,
|
|
"mean_token_accuracy": 0.222798952460289,
|
|
"num_tokens": 43522858.0,
|
|
"step": 18990
|
|
},
|
|
{
|
|
"entropy": 5.216363430023193,
|
|
"epoch": 1.824687800192123,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004670113297958128,
|
|
"loss": 4.8931,
|
|
"mean_token_accuracy": 0.21746156513690948,
|
|
"num_tokens": 43533453.0,
|
|
"step": 18995
|
|
},
|
|
{
|
|
"entropy": 5.1565718173980715,
|
|
"epoch": 1.8251681075888568,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046699345612190155,
|
|
"loss": 4.8594,
|
|
"mean_token_accuracy": 0.23229910880327226,
|
|
"num_tokens": 43543889.0,
|
|
"step": 19000
|
|
},
|
|
{
|
|
"entropy": 5.084559917449951,
|
|
"epoch": 1.825648414985591,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046697557799047233,
|
|
"loss": 4.8411,
|
|
"mean_token_accuracy": 0.23009685277938843,
|
|
"num_tokens": 43554815.0,
|
|
"step": 19005
|
|
},
|
|
{
|
|
"entropy": 5.076380491256714,
|
|
"epoch": 1.8261287223823248,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004669576954019403,
|
|
"loss": 4.779,
|
|
"mean_token_accuracy": 0.23019351810216904,
|
|
"num_tokens": 43565258.0,
|
|
"step": 19010
|
|
},
|
|
{
|
|
"entropy": 5.193867635726929,
|
|
"epoch": 1.8266090297790587,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004669398083567205,
|
|
"loss": 4.9275,
|
|
"mean_token_accuracy": 0.21558635979890822,
|
|
"num_tokens": 43576710.0,
|
|
"step": 19015
|
|
},
|
|
{
|
|
"entropy": 5.190999507904053,
|
|
"epoch": 1.8270893371757926,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004669219168552284,
|
|
"loss": 4.9226,
|
|
"mean_token_accuracy": 0.21981949657201766,
|
|
"num_tokens": 43587590.0,
|
|
"step": 19020
|
|
},
|
|
{
|
|
"entropy": 5.135986948013306,
|
|
"epoch": 1.8275696445725265,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046690402089787916,
|
|
"loss": 4.9395,
|
|
"mean_token_accuracy": 0.21903201937675476,
|
|
"num_tokens": 43599675.0,
|
|
"step": 19025
|
|
},
|
|
{
|
|
"entropy": 5.108835506439209,
|
|
"epoch": 1.8280499519692603,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004668861204850884,
|
|
"loss": 4.8008,
|
|
"mean_token_accuracy": 0.2312204658985138,
|
|
"num_tokens": 43612369.0,
|
|
"step": 19030
|
|
},
|
|
{
|
|
"entropy": 5.134114503860474,
|
|
"epoch": 1.8285302593659942,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046686821561727176,
|
|
"loss": 4.8287,
|
|
"mean_token_accuracy": 0.22977259159088134,
|
|
"num_tokens": 43624807.0,
|
|
"step": 19035
|
|
},
|
|
{
|
|
"entropy": 5.119396448135376,
|
|
"epoch": 1.829010566762728,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004668503062948449,
|
|
"loss": 4.9142,
|
|
"mean_token_accuracy": 0.22185174524784088,
|
|
"num_tokens": 43635389.0,
|
|
"step": 19040
|
|
},
|
|
{
|
|
"entropy": 5.198632860183716,
|
|
"epoch": 1.829490874159462,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004668323925182236,
|
|
"loss": 4.8978,
|
|
"mean_token_accuracy": 0.21805914491415024,
|
|
"num_tokens": 43646807.0,
|
|
"step": 19045
|
|
},
|
|
{
|
|
"entropy": 5.205413579940796,
|
|
"epoch": 1.8299711815561959,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046681447428782377,
|
|
"loss": 4.9393,
|
|
"mean_token_accuracy": 0.218313068151474,
|
|
"num_tokens": 43657910.0,
|
|
"step": 19050
|
|
},
|
|
{
|
|
"entropy": 5.148894500732422,
|
|
"epoch": 1.8304514889529298,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004667965516040613,
|
|
"loss": 4.8903,
|
|
"mean_token_accuracy": 0.2225254535675049,
|
|
"num_tokens": 43669376.0,
|
|
"step": 19055
|
|
},
|
|
{
|
|
"entropy": 5.132978916168213,
|
|
"epoch": 1.8309317963496636,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004667786244673526,
|
|
"loss": 4.8537,
|
|
"mean_token_accuracy": 0.2235651895403862,
|
|
"num_tokens": 43681702.0,
|
|
"step": 19060
|
|
},
|
|
{
|
|
"entropy": 5.235198020935059,
|
|
"epoch": 1.8314121037463977,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046676069287811365,
|
|
"loss": 4.9819,
|
|
"mean_token_accuracy": 0.2188320890069008,
|
|
"num_tokens": 43694795.0,
|
|
"step": 19065
|
|
},
|
|
{
|
|
"entropy": 5.248945569992065,
|
|
"epoch": 1.8318924111431316,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004667427568367607,
|
|
"loss": 4.9883,
|
|
"mean_token_accuracy": 0.22114041894674302,
|
|
"num_tokens": 43707197.0,
|
|
"step": 19070
|
|
},
|
|
{
|
|
"entropy": 5.220323514938355,
|
|
"epoch": 1.8323727185398655,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046672481634371047,
|
|
"loss": 4.9703,
|
|
"mean_token_accuracy": 0.21035372614860534,
|
|
"num_tokens": 43719066.0,
|
|
"step": 19075
|
|
},
|
|
{
|
|
"entropy": 5.185680723190307,
|
|
"epoch": 1.8328530259365994,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046670687139937925,
|
|
"loss": 4.9139,
|
|
"mean_token_accuracy": 0.22479754090309143,
|
|
"num_tokens": 43731080.0,
|
|
"step": 19080
|
|
},
|
|
{
|
|
"entropy": 5.197793245315552,
|
|
"epoch": 1.8333333333333335,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004666889220041837,
|
|
"loss": 4.9586,
|
|
"mean_token_accuracy": 0.22060257345438003,
|
|
"num_tokens": 43743048.0,
|
|
"step": 19085
|
|
},
|
|
{
|
|
"entropy": 5.127030611038208,
|
|
"epoch": 1.8338136407300674,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046667096815854056,
|
|
"loss": 4.9217,
|
|
"mean_token_accuracy": 0.21732288599014282,
|
|
"num_tokens": 43754701.0,
|
|
"step": 19090
|
|
},
|
|
{
|
|
"entropy": 5.1769365787506105,
|
|
"epoch": 1.8342939481268012,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004666530098628667,
|
|
"loss": 4.9569,
|
|
"mean_token_accuracy": 0.21874051839113234,
|
|
"num_tokens": 43765696.0,
|
|
"step": 19095
|
|
},
|
|
{
|
|
"entropy": 5.254151487350464,
|
|
"epoch": 1.8347742555235351,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004666350471175791,
|
|
"loss": 5.0066,
|
|
"mean_token_accuracy": 0.21649441719055176,
|
|
"num_tokens": 43777162.0,
|
|
"step": 19100
|
|
},
|
|
{
|
|
"entropy": 5.217176914215088,
|
|
"epoch": 1.835254562920269,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004666170799230947,
|
|
"loss": 4.9294,
|
|
"mean_token_accuracy": 0.21871206164360046,
|
|
"num_tokens": 43789340.0,
|
|
"step": 19105
|
|
},
|
|
{
|
|
"entropy": 5.128383445739746,
|
|
"epoch": 1.8357348703170029,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004665991082798307,
|
|
"loss": 4.8158,
|
|
"mean_token_accuracy": 0.22825666517019272,
|
|
"num_tokens": 43800098.0,
|
|
"step": 19110
|
|
},
|
|
{
|
|
"entropy": 5.125943803787232,
|
|
"epoch": 1.8362151777137368,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004665811321882043,
|
|
"loss": 4.9025,
|
|
"mean_token_accuracy": 0.22539056986570358,
|
|
"num_tokens": 43811403.0,
|
|
"step": 19115
|
|
},
|
|
{
|
|
"entropy": 5.256423711776733,
|
|
"epoch": 1.8366954851104706,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00046656315164863297,
|
|
"loss": 4.9268,
|
|
"mean_token_accuracy": 0.22319784462451936,
|
|
"num_tokens": 43821946.0,
|
|
"step": 19120
|
|
},
|
|
{
|
|
"entropy": 5.073716640472412,
|
|
"epoch": 1.8371757925072045,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046654516666153403,
|
|
"loss": 4.8323,
|
|
"mean_token_accuracy": 0.2236419141292572,
|
|
"num_tokens": 43832638.0,
|
|
"step": 19125
|
|
},
|
|
{
|
|
"entropy": 5.12318000793457,
|
|
"epoch": 1.8376560999039384,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004665271772273251,
|
|
"loss": 4.8481,
|
|
"mean_token_accuracy": 0.22164386957883836,
|
|
"num_tokens": 43843616.0,
|
|
"step": 19130
|
|
},
|
|
{
|
|
"entropy": 5.230293321609497,
|
|
"epoch": 1.8381364073006723,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004665091833464239,
|
|
"loss": 4.8637,
|
|
"mean_token_accuracy": 0.22107865214347838,
|
|
"num_tokens": 43854680.0,
|
|
"step": 19135
|
|
},
|
|
{
|
|
"entropy": 5.138217401504517,
|
|
"epoch": 1.8386167146974062,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046649118501924805,
|
|
"loss": 4.8908,
|
|
"mean_token_accuracy": 0.2224344864487648,
|
|
"num_tokens": 43866683.0,
|
|
"step": 19140
|
|
},
|
|
{
|
|
"entropy": 5.197470855712891,
|
|
"epoch": 1.8390970220941403,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004664731822462154,
|
|
"loss": 4.9705,
|
|
"mean_token_accuracy": 0.2186468482017517,
|
|
"num_tokens": 43878080.0,
|
|
"step": 19145
|
|
},
|
|
{
|
|
"entropy": 5.240995216369629,
|
|
"epoch": 1.8395773294908742,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046645517502774415,
|
|
"loss": 4.9098,
|
|
"mean_token_accuracy": 0.2190140336751938,
|
|
"num_tokens": 43888633.0,
|
|
"step": 19150
|
|
},
|
|
{
|
|
"entropy": 5.252007484436035,
|
|
"epoch": 1.840057636887608,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046643716336425224,
|
|
"loss": 4.9367,
|
|
"mean_token_accuracy": 0.2217061460018158,
|
|
"num_tokens": 43899678.0,
|
|
"step": 19155
|
|
},
|
|
{
|
|
"entropy": 5.145054817199707,
|
|
"epoch": 1.8405379442843421,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004664191472561578,
|
|
"loss": 4.8904,
|
|
"mean_token_accuracy": 0.22237008661031724,
|
|
"num_tokens": 43910744.0,
|
|
"step": 19160
|
|
},
|
|
{
|
|
"entropy": 5.220931482315064,
|
|
"epoch": 1.841018251681076,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004664011267038792,
|
|
"loss": 5.0016,
|
|
"mean_token_accuracy": 0.21928611844778062,
|
|
"num_tokens": 43922467.0,
|
|
"step": 19165
|
|
},
|
|
{
|
|
"entropy": 5.142969036102295,
|
|
"epoch": 1.84149855907781,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00046638310170783476,
|
|
"loss": 4.8598,
|
|
"mean_token_accuracy": 0.22431076914072037,
|
|
"num_tokens": 43934919.0,
|
|
"step": 19170
|
|
},
|
|
{
|
|
"entropy": 5.187114095687866,
|
|
"epoch": 1.8419788664745438,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000466365072268443,
|
|
"loss": 4.9115,
|
|
"mean_token_accuracy": 0.22165709882974624,
|
|
"num_tokens": 43947737.0,
|
|
"step": 19175
|
|
},
|
|
{
|
|
"entropy": 5.130555963516235,
|
|
"epoch": 1.8424591738712777,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004663470383861225,
|
|
"loss": 4.848,
|
|
"mean_token_accuracy": 0.22130993008613586,
|
|
"num_tokens": 43959656.0,
|
|
"step": 19180
|
|
},
|
|
{
|
|
"entropy": 5.1659932136535645,
|
|
"epoch": 1.8429394812680115,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000466329000061292,
|
|
"loss": 4.9018,
|
|
"mean_token_accuracy": 0.22112877368927003,
|
|
"num_tokens": 43970732.0,
|
|
"step": 19185
|
|
},
|
|
{
|
|
"entropy": 5.174559164047241,
|
|
"epoch": 1.8434197886647454,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004663109572943702,
|
|
"loss": 4.8847,
|
|
"mean_token_accuracy": 0.21701590120792388,
|
|
"num_tokens": 43982369.0,
|
|
"step": 19190
|
|
},
|
|
{
|
|
"entropy": 5.214370965957642,
|
|
"epoch": 1.8439000960614793,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000466292910085776,
|
|
"loss": 4.9312,
|
|
"mean_token_accuracy": 0.21655133664608,
|
|
"num_tokens": 43993751.0,
|
|
"step": 19195
|
|
},
|
|
{
|
|
"entropy": 5.124353647232056,
|
|
"epoch": 1.8443804034582132,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046627485843592854,
|
|
"loss": 4.8756,
|
|
"mean_token_accuracy": 0.2187720462679863,
|
|
"num_tokens": 44005021.0,
|
|
"step": 19200
|
|
},
|
|
{
|
|
"entropy": 5.108295059204101,
|
|
"epoch": 1.844860710854947,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046625680234524674,
|
|
"loss": 4.8729,
|
|
"mean_token_accuracy": 0.2216949701309204,
|
|
"num_tokens": 44015571.0,
|
|
"step": 19205
|
|
},
|
|
{
|
|
"entropy": 5.198014593124389,
|
|
"epoch": 1.845341018251681,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046623874181414993,
|
|
"loss": 4.9338,
|
|
"mean_token_accuracy": 0.21955927908420564,
|
|
"num_tokens": 44026363.0,
|
|
"step": 19210
|
|
},
|
|
{
|
|
"entropy": 5.117824935913086,
|
|
"epoch": 1.8458213256484148,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004662206768430574,
|
|
"loss": 4.8308,
|
|
"mean_token_accuracy": 0.23248151242733,
|
|
"num_tokens": 44038042.0,
|
|
"step": 19215
|
|
},
|
|
{
|
|
"entropy": 5.17954683303833,
|
|
"epoch": 1.846301633045149,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004662026074323886,
|
|
"loss": 4.9171,
|
|
"mean_token_accuracy": 0.21451413333415986,
|
|
"num_tokens": 44049086.0,
|
|
"step": 19220
|
|
},
|
|
{
|
|
"entropy": 5.197770690917968,
|
|
"epoch": 1.8467819404418828,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046618453358256303,
|
|
"loss": 4.927,
|
|
"mean_token_accuracy": 0.22353516668081283,
|
|
"num_tokens": 44060816.0,
|
|
"step": 19225
|
|
},
|
|
{
|
|
"entropy": 5.115513134002685,
|
|
"epoch": 1.8472622478386167,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046616645529400026,
|
|
"loss": 4.7785,
|
|
"mean_token_accuracy": 0.22394300550222396,
|
|
"num_tokens": 44071877.0,
|
|
"step": 19230
|
|
},
|
|
{
|
|
"entropy": 5.157533788681031,
|
|
"epoch": 1.8477425552353506,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004661483725671201,
|
|
"loss": 4.9054,
|
|
"mean_token_accuracy": 0.22069804072380067,
|
|
"num_tokens": 44082618.0,
|
|
"step": 19235
|
|
},
|
|
{
|
|
"entropy": 5.154093313217163,
|
|
"epoch": 1.8482228626320847,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046613028540234226,
|
|
"loss": 4.8624,
|
|
"mean_token_accuracy": 0.22440769523382187,
|
|
"num_tokens": 44093411.0,
|
|
"step": 19240
|
|
},
|
|
{
|
|
"entropy": 5.08169150352478,
|
|
"epoch": 1.8487031700288186,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004661121938000867,
|
|
"loss": 4.7732,
|
|
"mean_token_accuracy": 0.23668147772550582,
|
|
"num_tokens": 44103595.0,
|
|
"step": 19245
|
|
},
|
|
{
|
|
"entropy": 5.101725435256958,
|
|
"epoch": 1.8491834774255524,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004660940977607736,
|
|
"loss": 4.8774,
|
|
"mean_token_accuracy": 0.23302264213562013,
|
|
"num_tokens": 44114740.0,
|
|
"step": 19250
|
|
},
|
|
{
|
|
"entropy": 5.108340311050415,
|
|
"epoch": 1.8496637848222863,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046607599728482285,
|
|
"loss": 4.8246,
|
|
"mean_token_accuracy": 0.22913457453250885,
|
|
"num_tokens": 44125415.0,
|
|
"step": 19255
|
|
},
|
|
{
|
|
"entropy": 5.1172998428344725,
|
|
"epoch": 1.8501440922190202,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046605789237265496,
|
|
"loss": 4.8727,
|
|
"mean_token_accuracy": 0.22260245233774184,
|
|
"num_tokens": 44137399.0,
|
|
"step": 19260
|
|
},
|
|
{
|
|
"entropy": 5.148006868362427,
|
|
"epoch": 1.850624399615754,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046603978302469,
|
|
"loss": 4.8204,
|
|
"mean_token_accuracy": 0.21915482729673386,
|
|
"num_tokens": 44148354.0,
|
|
"step": 19265
|
|
},
|
|
{
|
|
"entropy": 5.109054517745972,
|
|
"epoch": 1.851104707012488,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004660216692413486,
|
|
"loss": 4.8581,
|
|
"mean_token_accuracy": 0.22766265720129014,
|
|
"num_tokens": 44160496.0,
|
|
"step": 19270
|
|
},
|
|
{
|
|
"entropy": 5.156183767318725,
|
|
"epoch": 1.8515850144092219,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004660035510230513,
|
|
"loss": 4.9271,
|
|
"mean_token_accuracy": 0.2220014289021492,
|
|
"num_tokens": 44171832.0,
|
|
"step": 19275
|
|
},
|
|
{
|
|
"entropy": 5.232931613922119,
|
|
"epoch": 1.8520653218059557,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004659854283702186,
|
|
"loss": 4.8992,
|
|
"mean_token_accuracy": 0.21975383013486863,
|
|
"num_tokens": 44182850.0,
|
|
"step": 19280
|
|
},
|
|
{
|
|
"entropy": 5.176450967788696,
|
|
"epoch": 1.8525456292026896,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004659673012832715,
|
|
"loss": 4.9137,
|
|
"mean_token_accuracy": 0.2231437310576439,
|
|
"num_tokens": 44194343.0,
|
|
"step": 19285
|
|
},
|
|
{
|
|
"entropy": 5.230204057693482,
|
|
"epoch": 1.8530259365994235,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004659491697626306,
|
|
"loss": 5.011,
|
|
"mean_token_accuracy": 0.21255818009376526,
|
|
"num_tokens": 44206294.0,
|
|
"step": 19290
|
|
},
|
|
{
|
|
"entropy": 5.189047527313233,
|
|
"epoch": 1.8535062439961574,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00046593103380871705,
|
|
"loss": 4.8994,
|
|
"mean_token_accuracy": 0.22048285007476806,
|
|
"num_tokens": 44218576.0,
|
|
"step": 19295
|
|
},
|
|
{
|
|
"entropy": 5.099210739135742,
|
|
"epoch": 1.8539865513928915,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046591289342195184,
|
|
"loss": 4.8999,
|
|
"mean_token_accuracy": 0.21878990679979324,
|
|
"num_tokens": 44230480.0,
|
|
"step": 19300
|
|
},
|
|
{
|
|
"entropy": 5.1187114238739015,
|
|
"epoch": 1.8544668587896254,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004658947486027562,
|
|
"loss": 4.7986,
|
|
"mean_token_accuracy": 0.2290068194270134,
|
|
"num_tokens": 44240907.0,
|
|
"step": 19305
|
|
},
|
|
{
|
|
"entropy": 5.228351020812989,
|
|
"epoch": 1.8549471661863592,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00046587659935155124,
|
|
"loss": 4.9873,
|
|
"mean_token_accuracy": 0.21489476263523102,
|
|
"num_tokens": 44251469.0,
|
|
"step": 19310
|
|
},
|
|
{
|
|
"entropy": 5.287296009063721,
|
|
"epoch": 1.8554274735830933,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046585844566875845,
|
|
"loss": 4.9903,
|
|
"mean_token_accuracy": 0.2172749251127243,
|
|
"num_tokens": 44263875.0,
|
|
"step": 19315
|
|
},
|
|
{
|
|
"entropy": 5.1942919254302975,
|
|
"epoch": 1.8559077809798272,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004658402875547993,
|
|
"loss": 4.9816,
|
|
"mean_token_accuracy": 0.2130007728934288,
|
|
"num_tokens": 44276163.0,
|
|
"step": 19320
|
|
},
|
|
{
|
|
"entropy": 5.15506649017334,
|
|
"epoch": 1.856388088376561,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046582212501009533,
|
|
"loss": 4.8997,
|
|
"mean_token_accuracy": 0.21825883835554122,
|
|
"num_tokens": 44287985.0,
|
|
"step": 19325
|
|
},
|
|
{
|
|
"entropy": 5.2520318031311035,
|
|
"epoch": 1.856868395773295,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046580395803506825,
|
|
"loss": 4.9908,
|
|
"mean_token_accuracy": 0.21863823086023332,
|
|
"num_tokens": 44299360.0,
|
|
"step": 19330
|
|
},
|
|
{
|
|
"entropy": 5.217044162750244,
|
|
"epoch": 1.8573487031700289,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004657857866301399,
|
|
"loss": 4.8603,
|
|
"mean_token_accuracy": 0.231880284845829,
|
|
"num_tokens": 44310459.0,
|
|
"step": 19335
|
|
},
|
|
{
|
|
"entropy": 5.279342746734619,
|
|
"epoch": 1.8578290105667628,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00046576761079573204,
|
|
"loss": 5.0231,
|
|
"mean_token_accuracy": 0.2130425050854683,
|
|
"num_tokens": 44321832.0,
|
|
"step": 19340
|
|
},
|
|
{
|
|
"entropy": 5.147693634033203,
|
|
"epoch": 1.8583093179634966,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004657494305322667,
|
|
"loss": 4.9793,
|
|
"mean_token_accuracy": 0.2211272895336151,
|
|
"num_tokens": 44332829.0,
|
|
"step": 19345
|
|
},
|
|
{
|
|
"entropy": 5.161108541488647,
|
|
"epoch": 1.8587896253602305,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004657312458401661,
|
|
"loss": 4.8809,
|
|
"mean_token_accuracy": 0.2182894691824913,
|
|
"num_tokens": 44344321.0,
|
|
"step": 19350
|
|
},
|
|
{
|
|
"entropy": 5.287922000885009,
|
|
"epoch": 1.8592699327569644,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004657130567198522,
|
|
"loss": 4.9975,
|
|
"mean_token_accuracy": 0.20884452909231185,
|
|
"num_tokens": 44357183.0,
|
|
"step": 19355
|
|
},
|
|
{
|
|
"entropy": 5.24537878036499,
|
|
"epoch": 1.8597502401536983,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046569486317174746,
|
|
"loss": 4.97,
|
|
"mean_token_accuracy": 0.2225006863474846,
|
|
"num_tokens": 44367118.0,
|
|
"step": 19360
|
|
},
|
|
{
|
|
"entropy": 5.225019359588623,
|
|
"epoch": 1.8602305475504322,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004656766651962742,
|
|
"loss": 4.9776,
|
|
"mean_token_accuracy": 0.22012574672698976,
|
|
"num_tokens": 44377699.0,
|
|
"step": 19365
|
|
},
|
|
{
|
|
"entropy": 5.138026428222656,
|
|
"epoch": 1.860710854947166,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004656584627938551,
|
|
"loss": 4.9285,
|
|
"mean_token_accuracy": 0.21567461490631104,
|
|
"num_tokens": 44389457.0,
|
|
"step": 19370
|
|
},
|
|
{
|
|
"entropy": 5.127941083908081,
|
|
"epoch": 1.8611911623439001,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046564025596491254,
|
|
"loss": 4.8416,
|
|
"mean_token_accuracy": 0.22531868368387223,
|
|
"num_tokens": 44400140.0,
|
|
"step": 19375
|
|
},
|
|
{
|
|
"entropy": 5.152888202667237,
|
|
"epoch": 1.861671469740634,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004656220447098693,
|
|
"loss": 4.8384,
|
|
"mean_token_accuracy": 0.2336040586233139,
|
|
"num_tokens": 44411088.0,
|
|
"step": 19380
|
|
},
|
|
{
|
|
"entropy": 5.154614400863648,
|
|
"epoch": 1.862151777137368,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004656038290291483,
|
|
"loss": 4.8516,
|
|
"mean_token_accuracy": 0.21973695307970048,
|
|
"num_tokens": 44422515.0,
|
|
"step": 19385
|
|
},
|
|
{
|
|
"entropy": 5.126431226730347,
|
|
"epoch": 1.8626320845341018,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004655856089231723,
|
|
"loss": 4.8983,
|
|
"mean_token_accuracy": 0.2270813450217247,
|
|
"num_tokens": 44433711.0,
|
|
"step": 19390
|
|
},
|
|
{
|
|
"entropy": 5.23093090057373,
|
|
"epoch": 1.8631123919308359,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004655673843923644,
|
|
"loss": 4.9393,
|
|
"mean_token_accuracy": 0.2212497591972351,
|
|
"num_tokens": 44446157.0,
|
|
"step": 19395
|
|
},
|
|
{
|
|
"entropy": 5.242627716064453,
|
|
"epoch": 1.8635926993275698,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004655491554371477,
|
|
"loss": 5.0405,
|
|
"mean_token_accuracy": 0.21081701517105103,
|
|
"num_tokens": 44457506.0,
|
|
"step": 19400
|
|
},
|
|
{
|
|
"entropy": 5.200697708129883,
|
|
"epoch": 1.8640730067243036,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046553092205794543,
|
|
"loss": 5.033,
|
|
"mean_token_accuracy": 0.21556743383407592,
|
|
"num_tokens": 44469572.0,
|
|
"step": 19405
|
|
},
|
|
{
|
|
"entropy": 5.1919454574584964,
|
|
"epoch": 1.8645533141210375,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00046551268425518096,
|
|
"loss": 4.9358,
|
|
"mean_token_accuracy": 0.22127741873264312,
|
|
"num_tokens": 44481932.0,
|
|
"step": 19410
|
|
},
|
|
{
|
|
"entropy": 5.158847665786743,
|
|
"epoch": 1.8650336215177714,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004654944420292776,
|
|
"loss": 4.8473,
|
|
"mean_token_accuracy": 0.2273672789335251,
|
|
"num_tokens": 44493081.0,
|
|
"step": 19415
|
|
},
|
|
{
|
|
"entropy": 5.1619750499725345,
|
|
"epoch": 1.8655139289145053,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000465476195380659,
|
|
"loss": 4.9037,
|
|
"mean_token_accuracy": 0.2256894126534462,
|
|
"num_tokens": 44503919.0,
|
|
"step": 19420
|
|
},
|
|
{
|
|
"entropy": 5.163426876068115,
|
|
"epoch": 1.8659942363112392,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004654579443097487,
|
|
"loss": 4.8935,
|
|
"mean_token_accuracy": 0.22945554107427596,
|
|
"num_tokens": 44515493.0,
|
|
"step": 19425
|
|
},
|
|
{
|
|
"entropy": 5.217961978912354,
|
|
"epoch": 1.866474543707973,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004654396888169705,
|
|
"loss": 4.9388,
|
|
"mean_token_accuracy": 0.21313630491495134,
|
|
"num_tokens": 44527432.0,
|
|
"step": 19430
|
|
},
|
|
{
|
|
"entropy": 5.127003288269043,
|
|
"epoch": 1.866954851104707,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046542142890274816,
|
|
"loss": 4.8843,
|
|
"mean_token_accuracy": 0.22532198578119278,
|
|
"num_tokens": 44537634.0,
|
|
"step": 19435
|
|
},
|
|
{
|
|
"entropy": 5.097131729125977,
|
|
"epoch": 1.8674351585014408,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004654031645675057,
|
|
"loss": 4.951,
|
|
"mean_token_accuracy": 0.22149149626493453,
|
|
"num_tokens": 44549856.0,
|
|
"step": 19440
|
|
},
|
|
{
|
|
"entropy": 5.095395898818969,
|
|
"epoch": 1.8679154658981747,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004653848958116672,
|
|
"loss": 4.8544,
|
|
"mean_token_accuracy": 0.22061508595943452,
|
|
"num_tokens": 44563214.0,
|
|
"step": 19445
|
|
},
|
|
{
|
|
"entropy": 5.1080015182495115,
|
|
"epoch": 1.8683957732949086,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046536662263565667,
|
|
"loss": 4.877,
|
|
"mean_token_accuracy": 0.21556110978126525,
|
|
"num_tokens": 44572982.0,
|
|
"step": 19450
|
|
},
|
|
{
|
|
"entropy": 5.159835386276245,
|
|
"epoch": 1.8688760806916427,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004653483450398985,
|
|
"loss": 4.8434,
|
|
"mean_token_accuracy": 0.22941485792398453,
|
|
"num_tokens": 44583664.0,
|
|
"step": 19455
|
|
},
|
|
{
|
|
"entropy": 5.096084451675415,
|
|
"epoch": 1.8693563880883766,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046533006302481694,
|
|
"loss": 4.75,
|
|
"mean_token_accuracy": 0.23159895092248917,
|
|
"num_tokens": 44594987.0,
|
|
"step": 19460
|
|
},
|
|
{
|
|
"entropy": 5.144198322296143,
|
|
"epoch": 1.8698366954851104,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004653117765908365,
|
|
"loss": 4.9304,
|
|
"mean_token_accuracy": 0.22142930179834366,
|
|
"num_tokens": 44607123.0,
|
|
"step": 19465
|
|
},
|
|
{
|
|
"entropy": 5.126646852493286,
|
|
"epoch": 1.8703170028818443,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004652934857383816,
|
|
"loss": 4.8467,
|
|
"mean_token_accuracy": 0.22166724801063536,
|
|
"num_tokens": 44618736.0,
|
|
"step": 19470
|
|
},
|
|
{
|
|
"entropy": 5.101489782333374,
|
|
"epoch": 1.8707973102785784,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004652751904678772,
|
|
"loss": 4.8425,
|
|
"mean_token_accuracy": 0.23013574928045272,
|
|
"num_tokens": 44630681.0,
|
|
"step": 19475
|
|
},
|
|
{
|
|
"entropy": 5.146808242797851,
|
|
"epoch": 1.8712776176753123,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046525689077974775,
|
|
"loss": 4.9051,
|
|
"mean_token_accuracy": 0.2216115802526474,
|
|
"num_tokens": 44641641.0,
|
|
"step": 19480
|
|
},
|
|
{
|
|
"entropy": 5.13797926902771,
|
|
"epoch": 1.8717579250720462,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046523858667441834,
|
|
"loss": 4.8518,
|
|
"mean_token_accuracy": 0.223634971678257,
|
|
"num_tokens": 44652489.0,
|
|
"step": 19485
|
|
},
|
|
{
|
|
"entropy": 5.1899360656738285,
|
|
"epoch": 1.87223823246878,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004652202781523138,
|
|
"loss": 4.941,
|
|
"mean_token_accuracy": 0.21827635020017624,
|
|
"num_tokens": 44665659.0,
|
|
"step": 19490
|
|
},
|
|
{
|
|
"entropy": 5.164613914489746,
|
|
"epoch": 1.872718539865514,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004652019652138592,
|
|
"loss": 4.8517,
|
|
"mean_token_accuracy": 0.22854416221380233,
|
|
"num_tokens": 44677300.0,
|
|
"step": 19495
|
|
},
|
|
{
|
|
"entropy": 5.118680858612061,
|
|
"epoch": 1.8731988472622478,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004651836478594798,
|
|
"loss": 4.919,
|
|
"mean_token_accuracy": 0.216032674908638,
|
|
"num_tokens": 44690084.0,
|
|
"step": 19500
|
|
},
|
|
{
|
|
"entropy": 5.161847019195557,
|
|
"epoch": 1.8736791546589817,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004651653260896008,
|
|
"loss": 4.8971,
|
|
"mean_token_accuracy": 0.22766951471567154,
|
|
"num_tokens": 44701208.0,
|
|
"step": 19505
|
|
},
|
|
{
|
|
"entropy": 5.2724034786224365,
|
|
"epoch": 1.8741594620557156,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046514699990464763,
|
|
"loss": 5.0115,
|
|
"mean_token_accuracy": 0.21198325604200363,
|
|
"num_tokens": 44713546.0,
|
|
"step": 19510
|
|
},
|
|
{
|
|
"entropy": 5.362866449356079,
|
|
"epoch": 1.8746397694524495,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004651286693050458,
|
|
"loss": 5.1058,
|
|
"mean_token_accuracy": 0.20230617225170136,
|
|
"num_tokens": 44724802.0,
|
|
"step": 19515
|
|
},
|
|
{
|
|
"entropy": 5.175036287307739,
|
|
"epoch": 1.8751200768491834,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004651103342912207,
|
|
"loss": 4.8891,
|
|
"mean_token_accuracy": 0.22242380380630494,
|
|
"num_tokens": 44736625.0,
|
|
"step": 19520
|
|
},
|
|
{
|
|
"entropy": 5.085053825378418,
|
|
"epoch": 1.8756003842459172,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046509199486359824,
|
|
"loss": 4.8602,
|
|
"mean_token_accuracy": 0.21328776627779006,
|
|
"num_tokens": 44750451.0,
|
|
"step": 19525
|
|
},
|
|
{
|
|
"entropy": 5.08069429397583,
|
|
"epoch": 1.8760806916426513,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00046507365102260403,
|
|
"loss": 4.8825,
|
|
"mean_token_accuracy": 0.22342679798603057,
|
|
"num_tokens": 44762043.0,
|
|
"step": 19530
|
|
},
|
|
{
|
|
"entropy": 5.201959800720215,
|
|
"epoch": 1.8765609990393852,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046505530276866417,
|
|
"loss": 4.9845,
|
|
"mean_token_accuracy": 0.21430910676717757,
|
|
"num_tokens": 44774329.0,
|
|
"step": 19535
|
|
},
|
|
{
|
|
"entropy": 5.232240343093872,
|
|
"epoch": 1.877041306436119,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00046503695010220443,
|
|
"loss": 4.9481,
|
|
"mean_token_accuracy": 0.2223116397857666,
|
|
"num_tokens": 44786352.0,
|
|
"step": 19540
|
|
},
|
|
{
|
|
"entropy": 5.121707820892334,
|
|
"epoch": 1.877521613832853,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000465018593023651,
|
|
"loss": 4.7645,
|
|
"mean_token_accuracy": 0.22961812168359758,
|
|
"num_tokens": 44796829.0,
|
|
"step": 19545
|
|
},
|
|
{
|
|
"entropy": 5.183013200759888,
|
|
"epoch": 1.878001921229587,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004650002315334302,
|
|
"loss": 4.9705,
|
|
"mean_token_accuracy": 0.21601766496896743,
|
|
"num_tokens": 44808900.0,
|
|
"step": 19550
|
|
},
|
|
{
|
|
"entropy": 5.1030841827392575,
|
|
"epoch": 1.878482228626321,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000464981865631968,
|
|
"loss": 4.8785,
|
|
"mean_token_accuracy": 0.22552091330289842,
|
|
"num_tokens": 44820638.0,
|
|
"step": 19555
|
|
},
|
|
{
|
|
"entropy": 5.112918424606323,
|
|
"epoch": 1.8789625360230549,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004649634953196912,
|
|
"loss": 4.8595,
|
|
"mean_token_accuracy": 0.22413474321365356,
|
|
"num_tokens": 44833370.0,
|
|
"step": 19560
|
|
},
|
|
{
|
|
"entropy": 5.111734342575073,
|
|
"epoch": 1.8794428434197887,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046494512059702605,
|
|
"loss": 4.9102,
|
|
"mean_token_accuracy": 0.22396851181983948,
|
|
"num_tokens": 44844291.0,
|
|
"step": 19565
|
|
},
|
|
{
|
|
"entropy": 5.137105417251587,
|
|
"epoch": 1.8799231508165226,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004649267414643992,
|
|
"loss": 4.8452,
|
|
"mean_token_accuracy": 0.22659626305103303,
|
|
"num_tokens": 44855303.0,
|
|
"step": 19570
|
|
},
|
|
{
|
|
"entropy": 5.119134569168091,
|
|
"epoch": 1.8804034582132565,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004649083579222374,
|
|
"loss": 4.8031,
|
|
"mean_token_accuracy": 0.2253907725214958,
|
|
"num_tokens": 44866593.0,
|
|
"step": 19575
|
|
},
|
|
{
|
|
"entropy": 5.115862894058227,
|
|
"epoch": 1.8808837656099904,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046488996997096744,
|
|
"loss": 4.9113,
|
|
"mean_token_accuracy": 0.22246016263961793,
|
|
"num_tokens": 44879124.0,
|
|
"step": 19580
|
|
},
|
|
{
|
|
"entropy": 5.138719701766968,
|
|
"epoch": 1.8813640730067243,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004648715776110162,
|
|
"loss": 4.8542,
|
|
"mean_token_accuracy": 0.23651068955659865,
|
|
"num_tokens": 44889942.0,
|
|
"step": 19585
|
|
},
|
|
{
|
|
"entropy": 5.114886236190796,
|
|
"epoch": 1.8818443804034581,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004648531808428108,
|
|
"loss": 4.8155,
|
|
"mean_token_accuracy": 0.23198310881853104,
|
|
"num_tokens": 44901002.0,
|
|
"step": 19590
|
|
},
|
|
{
|
|
"entropy": 5.169087839126587,
|
|
"epoch": 1.882324687800192,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004648347796667782,
|
|
"loss": 4.9631,
|
|
"mean_token_accuracy": 0.2160460963845253,
|
|
"num_tokens": 44913495.0,
|
|
"step": 19595
|
|
},
|
|
{
|
|
"entropy": 5.137406253814698,
|
|
"epoch": 1.882804995196926,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004648163740833458,
|
|
"loss": 4.8565,
|
|
"mean_token_accuracy": 0.21922577768564225,
|
|
"num_tokens": 44924624.0,
|
|
"step": 19600
|
|
},
|
|
{
|
|
"entropy": 5.1774355411529545,
|
|
"epoch": 1.8832853025936598,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046479796409294076,
|
|
"loss": 4.9293,
|
|
"mean_token_accuracy": 0.22170649766921996,
|
|
"num_tokens": 44935539.0,
|
|
"step": 19605
|
|
},
|
|
{
|
|
"entropy": 5.150081396102905,
|
|
"epoch": 1.8837656099903939,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004647795496959907,
|
|
"loss": 4.8425,
|
|
"mean_token_accuracy": 0.22837162464857103,
|
|
"num_tokens": 44945979.0,
|
|
"step": 19610
|
|
},
|
|
{
|
|
"entropy": 5.108224296569825,
|
|
"epoch": 1.8842459173871278,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046476113089292286,
|
|
"loss": 4.9229,
|
|
"mean_token_accuracy": 0.21530138850212097,
|
|
"num_tokens": 44958294.0,
|
|
"step": 19615
|
|
},
|
|
{
|
|
"entropy": 5.206214857101441,
|
|
"epoch": 1.8847262247838616,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004647427076841651,
|
|
"loss": 5.0064,
|
|
"mean_token_accuracy": 0.21917444765567778,
|
|
"num_tokens": 44970261.0,
|
|
"step": 19620
|
|
},
|
|
{
|
|
"entropy": 5.231733274459839,
|
|
"epoch": 1.8852065321805955,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046472428007014515,
|
|
"loss": 4.9209,
|
|
"mean_token_accuracy": 0.2197403684258461,
|
|
"num_tokens": 44981515.0,
|
|
"step": 19625
|
|
},
|
|
{
|
|
"entropy": 5.0884459018707275,
|
|
"epoch": 1.8856868395773296,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004647058480512907,
|
|
"loss": 4.8162,
|
|
"mean_token_accuracy": 0.23149994909763336,
|
|
"num_tokens": 44992473.0,
|
|
"step": 19630
|
|
},
|
|
{
|
|
"entropy": 5.197129726409912,
|
|
"epoch": 1.8861671469740635,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046468741162802987,
|
|
"loss": 4.9509,
|
|
"mean_token_accuracy": 0.2180525004863739,
|
|
"num_tokens": 45004264.0,
|
|
"step": 19635
|
|
},
|
|
{
|
|
"entropy": 5.163276147842407,
|
|
"epoch": 1.8866474543707974,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004646689708007905,
|
|
"loss": 4.8297,
|
|
"mean_token_accuracy": 0.22839758545160294,
|
|
"num_tokens": 45015601.0,
|
|
"step": 19640
|
|
},
|
|
{
|
|
"entropy": 5.204332256317139,
|
|
"epoch": 1.8871277617675313,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046465052557000087,
|
|
"loss": 5.0055,
|
|
"mean_token_accuracy": 0.22162444591522218,
|
|
"num_tokens": 45028044.0,
|
|
"step": 19645
|
|
},
|
|
{
|
|
"entropy": 5.240250301361084,
|
|
"epoch": 1.8876080691642652,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00046463207593608916,
|
|
"loss": 4.9686,
|
|
"mean_token_accuracy": 0.2230614274740219,
|
|
"num_tokens": 45040144.0,
|
|
"step": 19650
|
|
},
|
|
{
|
|
"entropy": 5.266805171966553,
|
|
"epoch": 1.888088376560999,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004646136218994837,
|
|
"loss": 5.0224,
|
|
"mean_token_accuracy": 0.21359916925430297,
|
|
"num_tokens": 45051844.0,
|
|
"step": 19655
|
|
},
|
|
{
|
|
"entropy": 5.174513053894043,
|
|
"epoch": 1.888568683957733,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046459516346061304,
|
|
"loss": 4.9785,
|
|
"mean_token_accuracy": 0.21726988703012468,
|
|
"num_tokens": 45062684.0,
|
|
"step": 19660
|
|
},
|
|
{
|
|
"entropy": 5.155341243743896,
|
|
"epoch": 1.8890489913544668,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046457670061990564,
|
|
"loss": 4.9112,
|
|
"mean_token_accuracy": 0.2235699400305748,
|
|
"num_tokens": 45073327.0,
|
|
"step": 19665
|
|
},
|
|
{
|
|
"entropy": 5.172566366195679,
|
|
"epoch": 1.8895292987512007,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046455823337779024,
|
|
"loss": 4.928,
|
|
"mean_token_accuracy": 0.2199169397354126,
|
|
"num_tokens": 45085309.0,
|
|
"step": 19670
|
|
},
|
|
{
|
|
"entropy": 5.26616792678833,
|
|
"epoch": 1.8900096061479346,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004645397617346954,
|
|
"loss": 4.981,
|
|
"mean_token_accuracy": 0.2198196455836296,
|
|
"num_tokens": 45095934.0,
|
|
"step": 19675
|
|
},
|
|
{
|
|
"entropy": 5.195982265472412,
|
|
"epoch": 1.8904899135446684,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004645212856910502,
|
|
"loss": 4.8771,
|
|
"mean_token_accuracy": 0.22742779403924943,
|
|
"num_tokens": 45106220.0,
|
|
"step": 19680
|
|
},
|
|
{
|
|
"entropy": 5.119170188903809,
|
|
"epoch": 1.8909702209414025,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004645028052472835,
|
|
"loss": 4.8547,
|
|
"mean_token_accuracy": 0.2206563949584961,
|
|
"num_tokens": 45117861.0,
|
|
"step": 19685
|
|
},
|
|
{
|
|
"entropy": 5.113675260543824,
|
|
"epoch": 1.8914505283381364,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046448432040382444,
|
|
"loss": 4.8622,
|
|
"mean_token_accuracy": 0.23194930851459503,
|
|
"num_tokens": 45129248.0,
|
|
"step": 19690
|
|
},
|
|
{
|
|
"entropy": 5.2112102031707765,
|
|
"epoch": 1.8919308357348703,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000464465831161102,
|
|
"loss": 4.8966,
|
|
"mean_token_accuracy": 0.21587489694356918,
|
|
"num_tokens": 45140610.0,
|
|
"step": 19695
|
|
},
|
|
{
|
|
"entropy": 5.147746658325195,
|
|
"epoch": 1.8924111431316042,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004644473375195456,
|
|
"loss": 4.8274,
|
|
"mean_token_accuracy": 0.22959212362766265,
|
|
"num_tokens": 45150697.0,
|
|
"step": 19700
|
|
},
|
|
{
|
|
"entropy": 5.177985334396363,
|
|
"epoch": 1.8928914505283383,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046442883947958466,
|
|
"loss": 4.9042,
|
|
"mean_token_accuracy": 0.22117964476346968,
|
|
"num_tokens": 45162627.0,
|
|
"step": 19705
|
|
},
|
|
{
|
|
"entropy": 5.038029146194458,
|
|
"epoch": 1.8933717579250722,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046441033704164845,
|
|
"loss": 4.6972,
|
|
"mean_token_accuracy": 0.2338120698928833,
|
|
"num_tokens": 45173030.0,
|
|
"step": 19710
|
|
},
|
|
{
|
|
"entropy": 5.172407436370849,
|
|
"epoch": 1.893852065321806,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004643918302061666,
|
|
"loss": 4.9106,
|
|
"mean_token_accuracy": 0.22116362750530244,
|
|
"num_tokens": 45183588.0,
|
|
"step": 19715
|
|
},
|
|
{
|
|
"entropy": 5.237579727172852,
|
|
"epoch": 1.89433237271854,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004643733189735689,
|
|
"loss": 4.9034,
|
|
"mean_token_accuracy": 0.2231490433216095,
|
|
"num_tokens": 45195289.0,
|
|
"step": 19720
|
|
},
|
|
{
|
|
"entropy": 5.165693855285644,
|
|
"epoch": 1.8948126801152738,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000464354803344285,
|
|
"loss": 4.8877,
|
|
"mean_token_accuracy": 0.22597122192382812,
|
|
"num_tokens": 45207001.0,
|
|
"step": 19725
|
|
},
|
|
{
|
|
"entropy": 5.0735241889953615,
|
|
"epoch": 1.8952929875120077,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046433628331874496,
|
|
"loss": 4.9137,
|
|
"mean_token_accuracy": 0.22631023377180098,
|
|
"num_tokens": 45217447.0,
|
|
"step": 19730
|
|
},
|
|
{
|
|
"entropy": 5.171666955947876,
|
|
"epoch": 1.8957732949087416,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004643177588973785,
|
|
"loss": 4.8659,
|
|
"mean_token_accuracy": 0.22453079670667647,
|
|
"num_tokens": 45227588.0,
|
|
"step": 19735
|
|
},
|
|
{
|
|
"entropy": 5.181704807281494,
|
|
"epoch": 1.8962536023054755,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004642992300806159,
|
|
"loss": 4.8899,
|
|
"mean_token_accuracy": 0.2267356261610985,
|
|
"num_tokens": 45239969.0,
|
|
"step": 19740
|
|
},
|
|
{
|
|
"entropy": 5.118777227401734,
|
|
"epoch": 1.8967339097022093,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004642806968688873,
|
|
"loss": 4.8654,
|
|
"mean_token_accuracy": 0.21952651739120482,
|
|
"num_tokens": 45250877.0,
|
|
"step": 19745
|
|
},
|
|
{
|
|
"entropy": 5.194926071166992,
|
|
"epoch": 1.8972142170989432,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046426215926262295,
|
|
"loss": 4.9448,
|
|
"mean_token_accuracy": 0.22451459765434265,
|
|
"num_tokens": 45262846.0,
|
|
"step": 19750
|
|
},
|
|
{
|
|
"entropy": 5.164976167678833,
|
|
"epoch": 1.897694524495677,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004642436172622532,
|
|
"loss": 4.8732,
|
|
"mean_token_accuracy": 0.22321307361125947,
|
|
"num_tokens": 45274198.0,
|
|
"step": 19755
|
|
},
|
|
{
|
|
"entropy": 5.107458639144897,
|
|
"epoch": 1.898174831892411,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004642250708682086,
|
|
"loss": 4.876,
|
|
"mean_token_accuracy": 0.22586560100317002,
|
|
"num_tokens": 45285803.0,
|
|
"step": 19760
|
|
},
|
|
{
|
|
"entropy": 5.195139932632446,
|
|
"epoch": 1.898655139289145,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046420652008091984,
|
|
"loss": 4.9029,
|
|
"mean_token_accuracy": 0.22238381803035737,
|
|
"num_tokens": 45298242.0,
|
|
"step": 19765
|
|
},
|
|
{
|
|
"entropy": 5.137690496444702,
|
|
"epoch": 1.899135446685879,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004641879649008174,
|
|
"loss": 4.8659,
|
|
"mean_token_accuracy": 0.22768788039684296,
|
|
"num_tokens": 45309567.0,
|
|
"step": 19770
|
|
},
|
|
{
|
|
"entropy": 5.216307497024536,
|
|
"epoch": 1.8996157540826129,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004641694053283323,
|
|
"loss": 4.9775,
|
|
"mean_token_accuracy": 0.22154446691274643,
|
|
"num_tokens": 45321693.0,
|
|
"step": 19775
|
|
},
|
|
{
|
|
"entropy": 5.222153615951538,
|
|
"epoch": 1.9000960614793467,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00046415084136389525,
|
|
"loss": 4.9213,
|
|
"mean_token_accuracy": 0.2227088153362274,
|
|
"num_tokens": 45333094.0,
|
|
"step": 19780
|
|
},
|
|
{
|
|
"entropy": 5.142860698699951,
|
|
"epoch": 1.9005763688760808,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004641322730079374,
|
|
"loss": 4.8623,
|
|
"mean_token_accuracy": 0.22830192744731903,
|
|
"num_tokens": 45344549.0,
|
|
"step": 19785
|
|
},
|
|
{
|
|
"entropy": 5.189712476730347,
|
|
"epoch": 1.9010566762728147,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004641137002608897,
|
|
"loss": 4.9291,
|
|
"mean_token_accuracy": 0.215300115942955,
|
|
"num_tokens": 45356433.0,
|
|
"step": 19790
|
|
},
|
|
{
|
|
"entropy": 5.16183123588562,
|
|
"epoch": 1.9015369836695486,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046409512312318345,
|
|
"loss": 4.875,
|
|
"mean_token_accuracy": 0.22419418394565582,
|
|
"num_tokens": 45368224.0,
|
|
"step": 19795
|
|
},
|
|
{
|
|
"entropy": 5.074509048461914,
|
|
"epoch": 1.9020172910662825,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046407654159524994,
|
|
"loss": 4.814,
|
|
"mean_token_accuracy": 0.22955633252859114,
|
|
"num_tokens": 45379673.0,
|
|
"step": 19800
|
|
},
|
|
{
|
|
"entropy": 5.221636247634888,
|
|
"epoch": 1.9024975984630164,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00046405795567752055,
|
|
"loss": 4.9505,
|
|
"mean_token_accuracy": 0.22039366215467454,
|
|
"num_tokens": 45391613.0,
|
|
"step": 19805
|
|
},
|
|
{
|
|
"entropy": 5.164040994644165,
|
|
"epoch": 1.9029779058597502,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00046403936537042686,
|
|
"loss": 4.9312,
|
|
"mean_token_accuracy": 0.21759382635354996,
|
|
"num_tokens": 45403693.0,
|
|
"step": 19810
|
|
},
|
|
{
|
|
"entropy": 5.153062391281128,
|
|
"epoch": 1.9034582132564841,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046402077067440043,
|
|
"loss": 4.9291,
|
|
"mean_token_accuracy": 0.22086333185434343,
|
|
"num_tokens": 45416151.0,
|
|
"step": 19815
|
|
},
|
|
{
|
|
"entropy": 5.180631017684936,
|
|
"epoch": 1.903938520653218,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046400217158987293,
|
|
"loss": 4.9654,
|
|
"mean_token_accuracy": 0.21587077677249908,
|
|
"num_tokens": 45428276.0,
|
|
"step": 19820
|
|
},
|
|
{
|
|
"entropy": 5.128487300872803,
|
|
"epoch": 1.9044188280499519,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046398356811727626,
|
|
"loss": 4.8451,
|
|
"mean_token_accuracy": 0.22816976755857468,
|
|
"num_tokens": 45439877.0,
|
|
"step": 19825
|
|
},
|
|
{
|
|
"entropy": 5.178048086166382,
|
|
"epoch": 1.9048991354466858,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004639649602570423,
|
|
"loss": 4.9352,
|
|
"mean_token_accuracy": 0.23003823608160018,
|
|
"num_tokens": 45451459.0,
|
|
"step": 19830
|
|
},
|
|
{
|
|
"entropy": 5.043431615829467,
|
|
"epoch": 1.9053794428434196,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046394634800960314,
|
|
"loss": 4.7977,
|
|
"mean_token_accuracy": 0.22761821001768112,
|
|
"num_tokens": 45463341.0,
|
|
"step": 19835
|
|
},
|
|
{
|
|
"entropy": 5.19926815032959,
|
|
"epoch": 1.9058597502401537,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046392773137539074,
|
|
"loss": 4.8938,
|
|
"mean_token_accuracy": 0.22187106013298036,
|
|
"num_tokens": 45475585.0,
|
|
"step": 19840
|
|
},
|
|
{
|
|
"entropy": 5.246723794937134,
|
|
"epoch": 1.9063400576368876,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00046390911035483744,
|
|
"loss": 4.8847,
|
|
"mean_token_accuracy": 0.22155932635068892,
|
|
"num_tokens": 45487178.0,
|
|
"step": 19845
|
|
},
|
|
{
|
|
"entropy": 5.1914482593536375,
|
|
"epoch": 1.9068203650336215,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004638904849483756,
|
|
"loss": 4.9501,
|
|
"mean_token_accuracy": 0.2210107535123825,
|
|
"num_tokens": 45498832.0,
|
|
"step": 19850
|
|
},
|
|
{
|
|
"entropy": 5.174285650253296,
|
|
"epoch": 1.9073006724303554,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046387185515643756,
|
|
"loss": 4.9425,
|
|
"mean_token_accuracy": 0.2178223967552185,
|
|
"num_tokens": 45511549.0,
|
|
"step": 19855
|
|
},
|
|
{
|
|
"entropy": 5.192152786254883,
|
|
"epoch": 1.9077809798270895,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004638532209794559,
|
|
"loss": 4.9193,
|
|
"mean_token_accuracy": 0.22249827682971954,
|
|
"num_tokens": 45523421.0,
|
|
"step": 19860
|
|
},
|
|
{
|
|
"entropy": 5.224132394790649,
|
|
"epoch": 1.9082612872238234,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004638345824178631,
|
|
"loss": 4.9337,
|
|
"mean_token_accuracy": 0.2218421757221222,
|
|
"num_tokens": 45535247.0,
|
|
"step": 19865
|
|
},
|
|
{
|
|
"entropy": 5.100824499130249,
|
|
"epoch": 1.9087415946205573,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046381593947209215,
|
|
"loss": 4.7771,
|
|
"mean_token_accuracy": 0.23252510279417038,
|
|
"num_tokens": 45546296.0,
|
|
"step": 19870
|
|
},
|
|
{
|
|
"entropy": 5.161165046691894,
|
|
"epoch": 1.9092219020172911,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004637972921425757,
|
|
"loss": 4.9398,
|
|
"mean_token_accuracy": 0.22113918364048005,
|
|
"num_tokens": 45557360.0,
|
|
"step": 19875
|
|
},
|
|
{
|
|
"entropy": 5.156943368911743,
|
|
"epoch": 1.909702209414025,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046377864042974675,
|
|
"loss": 4.9039,
|
|
"mean_token_accuracy": 0.22512820065021516,
|
|
"num_tokens": 45568500.0,
|
|
"step": 19880
|
|
},
|
|
{
|
|
"entropy": 5.203023481369018,
|
|
"epoch": 1.910182516810759,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004637599843340384,
|
|
"loss": 4.9682,
|
|
"mean_token_accuracy": 0.2143253818154335,
|
|
"num_tokens": 45579513.0,
|
|
"step": 19885
|
|
},
|
|
{
|
|
"entropy": 5.187750816345215,
|
|
"epoch": 1.9106628242074928,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046374132385588356,
|
|
"loss": 4.879,
|
|
"mean_token_accuracy": 0.22665640264749526,
|
|
"num_tokens": 45591144.0,
|
|
"step": 19890
|
|
},
|
|
{
|
|
"entropy": 5.166464710235596,
|
|
"epoch": 1.9111431316042267,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046372265899571576,
|
|
"loss": 4.9192,
|
|
"mean_token_accuracy": 0.22032105475664138,
|
|
"num_tokens": 45602487.0,
|
|
"step": 19895
|
|
},
|
|
{
|
|
"entropy": 5.213956642150879,
|
|
"epoch": 1.9116234390009605,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046370398975396817,
|
|
"loss": 4.9565,
|
|
"mean_token_accuracy": 0.2110735148191452,
|
|
"num_tokens": 45614448.0,
|
|
"step": 19900
|
|
},
|
|
{
|
|
"entropy": 5.190959644317627,
|
|
"epoch": 1.9121037463976944,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004636853161310743,
|
|
"loss": 4.9115,
|
|
"mean_token_accuracy": 0.21847577691078185,
|
|
"num_tokens": 45625660.0,
|
|
"step": 19905
|
|
},
|
|
{
|
|
"entropy": 5.06601505279541,
|
|
"epoch": 1.9125840537944283,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046366663812746764,
|
|
"loss": 4.7971,
|
|
"mean_token_accuracy": 0.23348991572856903,
|
|
"num_tokens": 45638602.0,
|
|
"step": 19910
|
|
},
|
|
{
|
|
"entropy": 5.091584253311157,
|
|
"epoch": 1.9130643611911622,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004636479557435818,
|
|
"loss": 4.7884,
|
|
"mean_token_accuracy": 0.2402698814868927,
|
|
"num_tokens": 45649460.0,
|
|
"step": 19915
|
|
},
|
|
{
|
|
"entropy": 5.059809160232544,
|
|
"epoch": 1.9135446685878963,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046362926897985067,
|
|
"loss": 4.7924,
|
|
"mean_token_accuracy": 0.23128978610038758,
|
|
"num_tokens": 45659995.0,
|
|
"step": 19920
|
|
},
|
|
{
|
|
"entropy": 5.077728748321533,
|
|
"epoch": 1.9140249759846302,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000463610577836708,
|
|
"loss": 4.8649,
|
|
"mean_token_accuracy": 0.2222018852829933,
|
|
"num_tokens": 45672262.0,
|
|
"step": 19925
|
|
},
|
|
{
|
|
"entropy": 5.184646224975586,
|
|
"epoch": 1.914505283381364,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046359188231458783,
|
|
"loss": 4.933,
|
|
"mean_token_accuracy": 0.2203192874789238,
|
|
"num_tokens": 45685210.0,
|
|
"step": 19930
|
|
},
|
|
{
|
|
"entropy": 5.165609121322632,
|
|
"epoch": 1.914985590778098,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046357318241392414,
|
|
"loss": 4.9574,
|
|
"mean_token_accuracy": 0.21842219084501266,
|
|
"num_tokens": 45696418.0,
|
|
"step": 19935
|
|
},
|
|
{
|
|
"entropy": 5.18068208694458,
|
|
"epoch": 1.915465898174832,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000463554478135151,
|
|
"loss": 4.9414,
|
|
"mean_token_accuracy": 0.21887536495923995,
|
|
"num_tokens": 45708393.0,
|
|
"step": 19940
|
|
},
|
|
{
|
|
"entropy": 5.234921455383301,
|
|
"epoch": 1.915946205571566,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004635357694787029,
|
|
"loss": 4.9527,
|
|
"mean_token_accuracy": 0.2121457889676094,
|
|
"num_tokens": 45721287.0,
|
|
"step": 19945
|
|
},
|
|
{
|
|
"entropy": 5.099327230453492,
|
|
"epoch": 1.9164265129682998,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000463517056445014,
|
|
"loss": 4.7687,
|
|
"mean_token_accuracy": 0.23359038829803466,
|
|
"num_tokens": 45732846.0,
|
|
"step": 19950
|
|
},
|
|
{
|
|
"entropy": 5.162627077102661,
|
|
"epoch": 1.9169068203650337,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046349833903451884,
|
|
"loss": 4.9632,
|
|
"mean_token_accuracy": 0.21162394881248475,
|
|
"num_tokens": 45743931.0,
|
|
"step": 19955
|
|
},
|
|
{
|
|
"entropy": 5.217878150939941,
|
|
"epoch": 1.9173871277617676,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046347961724765196,
|
|
"loss": 4.8885,
|
|
"mean_token_accuracy": 0.22479778081178664,
|
|
"num_tokens": 45755512.0,
|
|
"step": 19960
|
|
},
|
|
{
|
|
"entropy": 5.19102373123169,
|
|
"epoch": 1.9178674351585014,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046346089108484806,
|
|
"loss": 4.857,
|
|
"mean_token_accuracy": 0.22538246363401412,
|
|
"num_tokens": 45766793.0,
|
|
"step": 19965
|
|
},
|
|
{
|
|
"entropy": 5.146558666229248,
|
|
"epoch": 1.9183477425552353,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046344216054654193,
|
|
"loss": 4.9382,
|
|
"mean_token_accuracy": 0.2223276048898697,
|
|
"num_tokens": 45778214.0,
|
|
"step": 19970
|
|
},
|
|
{
|
|
"entropy": 5.272023105621338,
|
|
"epoch": 1.9188280499519692,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046342342563316833,
|
|
"loss": 5.1017,
|
|
"mean_token_accuracy": 0.20328507870435714,
|
|
"num_tokens": 45791255.0,
|
|
"step": 19975
|
|
},
|
|
{
|
|
"entropy": 5.253413915634155,
|
|
"epoch": 1.919308357348703,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046340468634516223,
|
|
"loss": 4.9458,
|
|
"mean_token_accuracy": 0.21117972880601882,
|
|
"num_tokens": 45803351.0,
|
|
"step": 19980
|
|
},
|
|
{
|
|
"entropy": 5.097596788406372,
|
|
"epoch": 1.919788664745437,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00046338594268295884,
|
|
"loss": 4.8919,
|
|
"mean_token_accuracy": 0.23192906975746155,
|
|
"num_tokens": 45814986.0,
|
|
"step": 19985
|
|
},
|
|
{
|
|
"entropy": 5.0635106563568115,
|
|
"epoch": 1.9202689721421708,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004633671946469933,
|
|
"loss": 4.8551,
|
|
"mean_token_accuracy": 0.22398319989442825,
|
|
"num_tokens": 45827073.0,
|
|
"step": 19990
|
|
},
|
|
{
|
|
"entropy": 5.206949281692505,
|
|
"epoch": 1.920749279538905,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00046334844223770076,
|
|
"loss": 4.917,
|
|
"mean_token_accuracy": 0.231214801967144,
|
|
"num_tokens": 45837808.0,
|
|
"step": 19995
|
|
},
|
|
{
|
|
"entropy": 5.1531201839447025,
|
|
"epoch": 1.9212295869356388,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00046332968545551674,
|
|
"loss": 4.9168,
|
|
"mean_token_accuracy": 0.22213385105133057,
|
|
"num_tokens": 45849932.0,
|
|
"step": 20000
|
|
},
|
|
{
|
|
"entropy": 5.133365106582642,
|
|
"epoch": 1.9217098943323727,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004633109243008765,
|
|
"loss": 4.8827,
|
|
"mean_token_accuracy": 0.22360755801200866,
|
|
"num_tokens": 45860961.0,
|
|
"step": 20005
|
|
},
|
|
{
|
|
"entropy": 5.0825090408325195,
|
|
"epoch": 1.9221902017291066,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004632921587742159,
|
|
"loss": 4.8216,
|
|
"mean_token_accuracy": 0.22799782902002336,
|
|
"num_tokens": 45872251.0,
|
|
"step": 20010
|
|
},
|
|
{
|
|
"entropy": 5.222860527038574,
|
|
"epoch": 1.9226705091258407,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046327338887597043,
|
|
"loss": 4.9614,
|
|
"mean_token_accuracy": 0.21876602619886398,
|
|
"num_tokens": 45883234.0,
|
|
"step": 20015
|
|
},
|
|
{
|
|
"entropy": 5.1399219036102295,
|
|
"epoch": 1.9231508165225746,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000463254614606576,
|
|
"loss": 4.8616,
|
|
"mean_token_accuracy": 0.22610373198986053,
|
|
"num_tokens": 45894003.0,
|
|
"step": 20020
|
|
},
|
|
{
|
|
"entropy": 5.038976621627808,
|
|
"epoch": 1.9236311239193085,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004632358359664683,
|
|
"loss": 4.8077,
|
|
"mean_token_accuracy": 0.22781697660684586,
|
|
"num_tokens": 45906109.0,
|
|
"step": 20025
|
|
},
|
|
{
|
|
"entropy": 5.154880475997925,
|
|
"epoch": 1.9241114313160423,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046321705295608356,
|
|
"loss": 4.9599,
|
|
"mean_token_accuracy": 0.21925552040338517,
|
|
"num_tokens": 45918372.0,
|
|
"step": 20030
|
|
},
|
|
{
|
|
"entropy": 5.219087791442871,
|
|
"epoch": 1.9245917387127762,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046319826557585764,
|
|
"loss": 4.9844,
|
|
"mean_token_accuracy": 0.21151201874017717,
|
|
"num_tokens": 45928750.0,
|
|
"step": 20035
|
|
},
|
|
{
|
|
"entropy": 5.254603242874145,
|
|
"epoch": 1.92507204610951,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004631794738262269,
|
|
"loss": 4.9662,
|
|
"mean_token_accuracy": 0.2226362034678459,
|
|
"num_tokens": 45939297.0,
|
|
"step": 20040
|
|
},
|
|
{
|
|
"entropy": 5.169375038146972,
|
|
"epoch": 1.925552353506244,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004631606777076275,
|
|
"loss": 4.8812,
|
|
"mean_token_accuracy": 0.22293281108140944,
|
|
"num_tokens": 45949986.0,
|
|
"step": 20045
|
|
},
|
|
{
|
|
"entropy": 5.1302508354187015,
|
|
"epoch": 1.9260326609029779,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046314187722049587,
|
|
"loss": 4.9219,
|
|
"mean_token_accuracy": 0.21803017556667328,
|
|
"num_tokens": 45960747.0,
|
|
"step": 20050
|
|
},
|
|
{
|
|
"entropy": 5.203114032745361,
|
|
"epoch": 1.9265129682997117,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00046312307236526863,
|
|
"loss": 4.9201,
|
|
"mean_token_accuracy": 0.22686970084905625,
|
|
"num_tokens": 45972260.0,
|
|
"step": 20055
|
|
},
|
|
{
|
|
"entropy": 5.126967477798462,
|
|
"epoch": 1.9269932756964456,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046310426314238217,
|
|
"loss": 4.8274,
|
|
"mean_token_accuracy": 0.22837868332862854,
|
|
"num_tokens": 45983589.0,
|
|
"step": 20060
|
|
},
|
|
{
|
|
"entropy": 5.058112907409668,
|
|
"epoch": 1.9274735830931795,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004630854495522733,
|
|
"loss": 4.8587,
|
|
"mean_token_accuracy": 0.22899621576070786,
|
|
"num_tokens": 45994776.0,
|
|
"step": 20065
|
|
},
|
|
{
|
|
"entropy": 5.203479146957397,
|
|
"epoch": 1.9279538904899134,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00046306663159537874,
|
|
"loss": 4.9497,
|
|
"mean_token_accuracy": 0.2172028511762619,
|
|
"num_tokens": 46005530.0,
|
|
"step": 20070
|
|
},
|
|
{
|
|
"entropy": 5.165763235092163,
|
|
"epoch": 1.9284341978866475,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046304780927213554,
|
|
"loss": 4.8475,
|
|
"mean_token_accuracy": 0.23035346120595931,
|
|
"num_tokens": 46017538.0,
|
|
"step": 20075
|
|
},
|
|
{
|
|
"entropy": 5.1419881820678714,
|
|
"epoch": 1.9289145052833814,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00046302898258298046,
|
|
"loss": 4.845,
|
|
"mean_token_accuracy": 0.2168242171406746,
|
|
"num_tokens": 46029011.0,
|
|
"step": 20080
|
|
},
|
|
{
|
|
"entropy": 5.077986097335815,
|
|
"epoch": 1.9293948126801153,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004630101515283509,
|
|
"loss": 4.8498,
|
|
"mean_token_accuracy": 0.22713633477687836,
|
|
"num_tokens": 46040354.0,
|
|
"step": 20085
|
|
},
|
|
{
|
|
"entropy": 5.12605299949646,
|
|
"epoch": 1.9298751200768491,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046299131610868377,
|
|
"loss": 4.8651,
|
|
"mean_token_accuracy": 0.22488067746162416,
|
|
"num_tokens": 46051127.0,
|
|
"step": 20090
|
|
},
|
|
{
|
|
"entropy": 5.192133140563965,
|
|
"epoch": 1.9303554274735832,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004629724763244165,
|
|
"loss": 4.8896,
|
|
"mean_token_accuracy": 0.22987643331289292,
|
|
"num_tokens": 46062343.0,
|
|
"step": 20095
|
|
},
|
|
{
|
|
"entropy": 5.099241828918457,
|
|
"epoch": 1.9308357348703171,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004629536321759866,
|
|
"loss": 4.8436,
|
|
"mean_token_accuracy": 0.22163994908332824,
|
|
"num_tokens": 46074193.0,
|
|
"step": 20100
|
|
},
|
|
{
|
|
"entropy": 5.14430193901062,
|
|
"epoch": 1.931316042267051,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046293478366383133,
|
|
"loss": 4.8598,
|
|
"mean_token_accuracy": 0.22932973504066467,
|
|
"num_tokens": 46085930.0,
|
|
"step": 20105
|
|
},
|
|
{
|
|
"entropy": 5.226075792312622,
|
|
"epoch": 1.9317963496637849,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004629159307883885,
|
|
"loss": 4.9729,
|
|
"mean_token_accuracy": 0.21434650868177413,
|
|
"num_tokens": 46099759.0,
|
|
"step": 20110
|
|
},
|
|
{
|
|
"entropy": 5.315010738372803,
|
|
"epoch": 1.9322766570605188,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004628970735500958,
|
|
"loss": 4.9977,
|
|
"mean_token_accuracy": 0.20691378712654113,
|
|
"num_tokens": 46110907.0,
|
|
"step": 20115
|
|
},
|
|
{
|
|
"entropy": 5.166486072540283,
|
|
"epoch": 1.9327569644572526,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046287821194939094,
|
|
"loss": 4.9242,
|
|
"mean_token_accuracy": 0.22367848455905914,
|
|
"num_tokens": 46122167.0,
|
|
"step": 20120
|
|
},
|
|
{
|
|
"entropy": 5.162136554718018,
|
|
"epoch": 1.9332372718539865,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046285934598671186,
|
|
"loss": 4.9868,
|
|
"mean_token_accuracy": 0.21337527185678482,
|
|
"num_tokens": 46132948.0,
|
|
"step": 20125
|
|
},
|
|
{
|
|
"entropy": 5.2009600639343265,
|
|
"epoch": 1.9337175792507204,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00046284047566249665,
|
|
"loss": 4.9534,
|
|
"mean_token_accuracy": 0.21956295073032378,
|
|
"num_tokens": 46144275.0,
|
|
"step": 20130
|
|
},
|
|
{
|
|
"entropy": 5.173071098327637,
|
|
"epoch": 1.9341978866474543,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00046282160097718336,
|
|
"loss": 4.9259,
|
|
"mean_token_accuracy": 0.22100828140974044,
|
|
"num_tokens": 46156138.0,
|
|
"step": 20135
|
|
},
|
|
{
|
|
"entropy": 5.237935400009155,
|
|
"epoch": 1.9346781940441882,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004628027219312102,
|
|
"loss": 4.9515,
|
|
"mean_token_accuracy": 0.2174043759703636,
|
|
"num_tokens": 46169437.0,
|
|
"step": 20140
|
|
},
|
|
{
|
|
"entropy": 5.155817985534668,
|
|
"epoch": 1.935158501440922,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004627838385250155,
|
|
"loss": 4.8613,
|
|
"mean_token_accuracy": 0.22143658697605134,
|
|
"num_tokens": 46182150.0,
|
|
"step": 20145
|
|
},
|
|
{
|
|
"entropy": 5.1375514507293705,
|
|
"epoch": 1.9356388088376562,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046276495075903764,
|
|
"loss": 4.8953,
|
|
"mean_token_accuracy": 0.227192685008049,
|
|
"num_tokens": 46193945.0,
|
|
"step": 20150
|
|
},
|
|
{
|
|
"entropy": 5.250063371658325,
|
|
"epoch": 1.93611911623439,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00046274605863371517,
|
|
"loss": 4.9954,
|
|
"mean_token_accuracy": 0.21560989171266556,
|
|
"num_tokens": 46206130.0,
|
|
"step": 20155
|
|
},
|
|
{
|
|
"entropy": 5.096949529647827,
|
|
"epoch": 1.936599423631124,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004627271621494868,
|
|
"loss": 4.8381,
|
|
"mean_token_accuracy": 0.2306928291916847,
|
|
"num_tokens": 46217284.0,
|
|
"step": 20160
|
|
},
|
|
{
|
|
"entropy": 5.12322769165039,
|
|
"epoch": 1.9370797310278578,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000462708261306791,
|
|
"loss": 4.877,
|
|
"mean_token_accuracy": 0.2295483872294426,
|
|
"num_tokens": 46227949.0,
|
|
"step": 20165
|
|
},
|
|
{
|
|
"entropy": 5.13996729850769,
|
|
"epoch": 1.937560038424592,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004626893561060669,
|
|
"loss": 4.8539,
|
|
"mean_token_accuracy": 0.21847130507230758,
|
|
"num_tokens": 46239101.0,
|
|
"step": 20170
|
|
},
|
|
{
|
|
"entropy": 5.225299978256226,
|
|
"epoch": 1.9380403458213258,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046267044654775324,
|
|
"loss": 5.0556,
|
|
"mean_token_accuracy": 0.2185836911201477,
|
|
"num_tokens": 46251272.0,
|
|
"step": 20175
|
|
},
|
|
{
|
|
"entropy": 5.208836698532105,
|
|
"epoch": 1.9385206532180597,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000462651532632289,
|
|
"loss": 4.9816,
|
|
"mean_token_accuracy": 0.2192826822400093,
|
|
"num_tokens": 46262603.0,
|
|
"step": 20180
|
|
},
|
|
{
|
|
"entropy": 5.184619903564453,
|
|
"epoch": 1.9390009606147935,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046263261436011344,
|
|
"loss": 4.8913,
|
|
"mean_token_accuracy": 0.22006487399339675,
|
|
"num_tokens": 46275142.0,
|
|
"step": 20185
|
|
},
|
|
{
|
|
"entropy": 5.155868911743164,
|
|
"epoch": 1.9394812680115274,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004626136917316657,
|
|
"loss": 4.9209,
|
|
"mean_token_accuracy": 0.2221606343984604,
|
|
"num_tokens": 46286339.0,
|
|
"step": 20190
|
|
},
|
|
{
|
|
"entropy": 5.146366453170776,
|
|
"epoch": 1.9399615754082613,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00046259476474738514,
|
|
"loss": 4.8876,
|
|
"mean_token_accuracy": 0.2205871284008026,
|
|
"num_tokens": 46297514.0,
|
|
"step": 20195
|
|
},
|
|
{
|
|
"entropy": 5.10622353553772,
|
|
"epoch": 1.9404418828049952,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046257583340771123,
|
|
"loss": 4.772,
|
|
"mean_token_accuracy": 0.23131893575191498,
|
|
"num_tokens": 46309249.0,
|
|
"step": 20200
|
|
},
|
|
{
|
|
"entropy": 5.167063570022583,
|
|
"epoch": 1.940922190201729,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004625568977130835,
|
|
"loss": 4.9408,
|
|
"mean_token_accuracy": 0.21829203367233277,
|
|
"num_tokens": 46320759.0,
|
|
"step": 20205
|
|
},
|
|
{
|
|
"entropy": 5.2694789409637455,
|
|
"epoch": 1.941402497598463,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004625379576639414,
|
|
"loss": 4.9495,
|
|
"mean_token_accuracy": 0.22133799344301225,
|
|
"num_tokens": 46332113.0,
|
|
"step": 20210
|
|
},
|
|
{
|
|
"entropy": 5.178436660766602,
|
|
"epoch": 1.9418828049951968,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046251901326072487,
|
|
"loss": 4.9182,
|
|
"mean_token_accuracy": 0.22542383521795273,
|
|
"num_tokens": 46342648.0,
|
|
"step": 20215
|
|
},
|
|
{
|
|
"entropy": 5.069062662124634,
|
|
"epoch": 1.9423631123919307,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046250006450387367,
|
|
"loss": 4.836,
|
|
"mean_token_accuracy": 0.2282964691519737,
|
|
"num_tokens": 46354066.0,
|
|
"step": 20220
|
|
},
|
|
{
|
|
"entropy": 5.241002178192138,
|
|
"epoch": 1.9428434197886646,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004624811113938277,
|
|
"loss": 4.9557,
|
|
"mean_token_accuracy": 0.21603974103927612,
|
|
"num_tokens": 46366586.0,
|
|
"step": 20225
|
|
},
|
|
{
|
|
"entropy": 5.172988367080689,
|
|
"epoch": 1.9433237271853987,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004624621539310271,
|
|
"loss": 4.8469,
|
|
"mean_token_accuracy": 0.22970083802938462,
|
|
"num_tokens": 46378155.0,
|
|
"step": 20230
|
|
},
|
|
{
|
|
"entropy": 5.133848571777344,
|
|
"epoch": 1.9438040345821326,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046244319211591193,
|
|
"loss": 4.9357,
|
|
"mean_token_accuracy": 0.21522724479436875,
|
|
"num_tokens": 46389631.0,
|
|
"step": 20235
|
|
},
|
|
{
|
|
"entropy": 5.231821441650391,
|
|
"epoch": 1.9442843419788665,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004624242259489223,
|
|
"loss": 4.9675,
|
|
"mean_token_accuracy": 0.21257461309432985,
|
|
"num_tokens": 46401915.0,
|
|
"step": 20240
|
|
},
|
|
{
|
|
"entropy": 5.232212734222412,
|
|
"epoch": 1.9447646493756003,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046240525543049884,
|
|
"loss": 4.9201,
|
|
"mean_token_accuracy": 0.2200825333595276,
|
|
"num_tokens": 46412396.0,
|
|
"step": 20245
|
|
},
|
|
{
|
|
"entropy": 5.070683145523072,
|
|
"epoch": 1.9452449567723344,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046238628056108176,
|
|
"loss": 4.83,
|
|
"mean_token_accuracy": 0.23035591542720796,
|
|
"num_tokens": 46423089.0,
|
|
"step": 20250
|
|
},
|
|
{
|
|
"entropy": 5.243849849700927,
|
|
"epoch": 1.9457252641690683,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046236730134111166,
|
|
"loss": 4.938,
|
|
"mean_token_accuracy": 0.21500004231929778,
|
|
"num_tokens": 46434858.0,
|
|
"step": 20255
|
|
},
|
|
{
|
|
"entropy": 5.151482677459716,
|
|
"epoch": 1.9462055715658022,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004623483177710291,
|
|
"loss": 4.8766,
|
|
"mean_token_accuracy": 0.23079614639282225,
|
|
"num_tokens": 46445628.0,
|
|
"step": 20260
|
|
},
|
|
{
|
|
"entropy": 5.125904130935669,
|
|
"epoch": 1.946685878962536,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004623293298512751,
|
|
"loss": 4.8532,
|
|
"mean_token_accuracy": 0.22425605952739716,
|
|
"num_tokens": 46457382.0,
|
|
"step": 20265
|
|
},
|
|
{
|
|
"entropy": 5.156096410751343,
|
|
"epoch": 1.94716618635927,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046231033758229026,
|
|
"loss": 4.9302,
|
|
"mean_token_accuracy": 0.21903278380632402,
|
|
"num_tokens": 46469765.0,
|
|
"step": 20270
|
|
},
|
|
{
|
|
"entropy": 5.3018150806427,
|
|
"epoch": 1.9476464937560038,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004622913409645154,
|
|
"loss": 5.006,
|
|
"mean_token_accuracy": 0.2137613371014595,
|
|
"num_tokens": 46481451.0,
|
|
"step": 20275
|
|
},
|
|
{
|
|
"entropy": 5.180299520492554,
|
|
"epoch": 1.9481268011527377,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004622723399983919,
|
|
"loss": 4.7839,
|
|
"mean_token_accuracy": 0.22959625124931335,
|
|
"num_tokens": 46491585.0,
|
|
"step": 20280
|
|
},
|
|
{
|
|
"entropy": 5.159808111190796,
|
|
"epoch": 1.9486071085494716,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046225333468436077,
|
|
"loss": 5.02,
|
|
"mean_token_accuracy": 0.21366383731365204,
|
|
"num_tokens": 46503789.0,
|
|
"step": 20285
|
|
},
|
|
{
|
|
"entropy": 5.278623628616333,
|
|
"epoch": 1.9490874159462055,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046223432502286323,
|
|
"loss": 4.9789,
|
|
"mean_token_accuracy": 0.2195223718881607,
|
|
"num_tokens": 46516647.0,
|
|
"step": 20290
|
|
},
|
|
{
|
|
"entropy": 5.185705709457397,
|
|
"epoch": 1.9495677233429394,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046221531101434056,
|
|
"loss": 4.9046,
|
|
"mean_token_accuracy": 0.21597968637943268,
|
|
"num_tokens": 46527432.0,
|
|
"step": 20295
|
|
},
|
|
{
|
|
"entropy": 5.209407901763916,
|
|
"epoch": 1.9500480307396733,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004621962926592343,
|
|
"loss": 4.9442,
|
|
"mean_token_accuracy": 0.2232288047671318,
|
|
"num_tokens": 46538990.0,
|
|
"step": 20300
|
|
},
|
|
{
|
|
"entropy": 5.1626382827758786,
|
|
"epoch": 1.9505283381364071,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000462177269957986,
|
|
"loss": 4.922,
|
|
"mean_token_accuracy": 0.2165737271308899,
|
|
"num_tokens": 46551291.0,
|
|
"step": 20305
|
|
},
|
|
{
|
|
"entropy": 5.165114831924439,
|
|
"epoch": 1.9510086455331412,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004621582429110373,
|
|
"loss": 4.8466,
|
|
"mean_token_accuracy": 0.2309819519519806,
|
|
"num_tokens": 46562415.0,
|
|
"step": 20310
|
|
},
|
|
{
|
|
"entropy": 5.232455444335938,
|
|
"epoch": 1.9514889529298751,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046213921151883,
|
|
"loss": 4.9708,
|
|
"mean_token_accuracy": 0.22538121789693832,
|
|
"num_tokens": 46572544.0,
|
|
"step": 20315
|
|
},
|
|
{
|
|
"entropy": 5.136361789703369,
|
|
"epoch": 1.951969260326609,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004621201757818059,
|
|
"loss": 4.8627,
|
|
"mean_token_accuracy": 0.2275417312979698,
|
|
"num_tokens": 46584694.0,
|
|
"step": 20320
|
|
},
|
|
{
|
|
"entropy": 5.199646091461181,
|
|
"epoch": 1.952449567723343,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046210113570040683,
|
|
"loss": 4.9058,
|
|
"mean_token_accuracy": 0.22562002390623093,
|
|
"num_tokens": 46595086.0,
|
|
"step": 20325
|
|
},
|
|
{
|
|
"entropy": 5.240507936477661,
|
|
"epoch": 1.952929875120077,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000462082091275075,
|
|
"loss": 4.9261,
|
|
"mean_token_accuracy": 0.2216821476817131,
|
|
"num_tokens": 46606627.0,
|
|
"step": 20330
|
|
},
|
|
{
|
|
"entropy": 5.210877895355225,
|
|
"epoch": 1.9534101825168109,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004620630425062526,
|
|
"loss": 4.9342,
|
|
"mean_token_accuracy": 0.2145678550004959,
|
|
"num_tokens": 46618560.0,
|
|
"step": 20335
|
|
},
|
|
{
|
|
"entropy": 5.178671979904175,
|
|
"epoch": 1.9538904899135447,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004620439893943817,
|
|
"loss": 4.89,
|
|
"mean_token_accuracy": 0.22025668919086455,
|
|
"num_tokens": 46630109.0,
|
|
"step": 20340
|
|
},
|
|
{
|
|
"entropy": 5.146572542190552,
|
|
"epoch": 1.9543707973102786,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004620249319399049,
|
|
"loss": 4.8815,
|
|
"mean_token_accuracy": 0.22778512686491012,
|
|
"num_tokens": 46642247.0,
|
|
"step": 20345
|
|
},
|
|
{
|
|
"entropy": 5.222317838668824,
|
|
"epoch": 1.9548511047070125,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046200587014326455,
|
|
"loss": 4.9358,
|
|
"mean_token_accuracy": 0.22239342778921128,
|
|
"num_tokens": 46653119.0,
|
|
"step": 20350
|
|
},
|
|
{
|
|
"entropy": 5.147661209106445,
|
|
"epoch": 1.9553314121037464,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004619868040049031,
|
|
"loss": 4.8709,
|
|
"mean_token_accuracy": 0.2261478677392006,
|
|
"num_tokens": 46664392.0,
|
|
"step": 20355
|
|
},
|
|
{
|
|
"entropy": 5.09419617652893,
|
|
"epoch": 1.9558117195004803,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004619677335252633,
|
|
"loss": 4.8705,
|
|
"mean_token_accuracy": 0.23083866387605667,
|
|
"num_tokens": 46674981.0,
|
|
"step": 20360
|
|
},
|
|
{
|
|
"entropy": 5.161249685287475,
|
|
"epoch": 1.9562920268972142,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00046194865870478793,
|
|
"loss": 4.9732,
|
|
"mean_token_accuracy": 0.22053551971912383,
|
|
"num_tokens": 46688692.0,
|
|
"step": 20365
|
|
},
|
|
{
|
|
"entropy": 5.051489639282226,
|
|
"epoch": 1.956772334293948,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046192957954391983,
|
|
"loss": 4.7946,
|
|
"mean_token_accuracy": 0.23118489980697632,
|
|
"num_tokens": 46699799.0,
|
|
"step": 20370
|
|
},
|
|
{
|
|
"entropy": 5.137979364395141,
|
|
"epoch": 1.957252641690682,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000461910496043102,
|
|
"loss": 4.9096,
|
|
"mean_token_accuracy": 0.22183059453964232,
|
|
"num_tokens": 46711994.0,
|
|
"step": 20375
|
|
},
|
|
{
|
|
"entropy": 5.189553737640381,
|
|
"epoch": 1.9577329490874158,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004618914082027773,
|
|
"loss": 4.9327,
|
|
"mean_token_accuracy": 0.22325861006975173,
|
|
"num_tokens": 46723247.0,
|
|
"step": 20380
|
|
},
|
|
{
|
|
"entropy": 5.254940795898437,
|
|
"epoch": 1.95821325648415,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046187231602338926,
|
|
"loss": 4.9589,
|
|
"mean_token_accuracy": 0.2160505548119545,
|
|
"num_tokens": 46732824.0,
|
|
"step": 20385
|
|
},
|
|
{
|
|
"entropy": 5.099370813369751,
|
|
"epoch": 1.9586935638808838,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046185321950538086,
|
|
"loss": 4.8828,
|
|
"mean_token_accuracy": 0.22503511756658554,
|
|
"num_tokens": 46744640.0,
|
|
"step": 20390
|
|
},
|
|
{
|
|
"entropy": 5.152339124679566,
|
|
"epoch": 1.9591738712776177,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004618341186491955,
|
|
"loss": 4.9108,
|
|
"mean_token_accuracy": 0.22266879081726074,
|
|
"num_tokens": 46757513.0,
|
|
"step": 20395
|
|
},
|
|
{
|
|
"entropy": 5.143527030944824,
|
|
"epoch": 1.9596541786743515,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004618150134552768,
|
|
"loss": 4.8554,
|
|
"mean_token_accuracy": 0.2254609391093254,
|
|
"num_tokens": 46769021.0,
|
|
"step": 20400
|
|
},
|
|
{
|
|
"entropy": 5.09747953414917,
|
|
"epoch": 1.9601344860710856,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000461795903924068,
|
|
"loss": 4.7974,
|
|
"mean_token_accuracy": 0.22561995834112167,
|
|
"num_tokens": 46781175.0,
|
|
"step": 20405
|
|
},
|
|
{
|
|
"entropy": 5.138916158676148,
|
|
"epoch": 1.9606147934678195,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046177679005601313,
|
|
"loss": 4.8219,
|
|
"mean_token_accuracy": 0.2253713935613632,
|
|
"num_tokens": 46792256.0,
|
|
"step": 20410
|
|
},
|
|
{
|
|
"entropy": 5.13484263420105,
|
|
"epoch": 1.9610951008645534,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004617576718515558,
|
|
"loss": 4.8069,
|
|
"mean_token_accuracy": 0.23279052674770356,
|
|
"num_tokens": 46803215.0,
|
|
"step": 20415
|
|
},
|
|
{
|
|
"entropy": 5.221542453765869,
|
|
"epoch": 1.9615754082612873,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004617385493111399,
|
|
"loss": 4.9427,
|
|
"mean_token_accuracy": 0.2263939142227173,
|
|
"num_tokens": 46813770.0,
|
|
"step": 20420
|
|
},
|
|
{
|
|
"entropy": 5.142890548706054,
|
|
"epoch": 1.9620557156580212,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004617194224352093,
|
|
"loss": 4.928,
|
|
"mean_token_accuracy": 0.22260897755622863,
|
|
"num_tokens": 46826923.0,
|
|
"step": 20425
|
|
},
|
|
{
|
|
"entropy": 5.1240606784820555,
|
|
"epoch": 1.962536023054755,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004617002912242083,
|
|
"loss": 4.8204,
|
|
"mean_token_accuracy": 0.23046000003814698,
|
|
"num_tokens": 46837761.0,
|
|
"step": 20430
|
|
},
|
|
{
|
|
"entropy": 5.143495988845825,
|
|
"epoch": 1.963016330451489,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00046168115567858084,
|
|
"loss": 4.9023,
|
|
"mean_token_accuracy": 0.2167316809296608,
|
|
"num_tokens": 46848190.0,
|
|
"step": 20435
|
|
},
|
|
{
|
|
"entropy": 5.088902616500855,
|
|
"epoch": 1.9634966378482228,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046166201579877125,
|
|
"loss": 4.7869,
|
|
"mean_token_accuracy": 0.23102391064167022,
|
|
"num_tokens": 46858643.0,
|
|
"step": 20440
|
|
},
|
|
{
|
|
"entropy": 5.146438980102539,
|
|
"epoch": 1.9639769452449567,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004616428715852239,
|
|
"loss": 4.8859,
|
|
"mean_token_accuracy": 0.22377959191799163,
|
|
"num_tokens": 46868711.0,
|
|
"step": 20445
|
|
},
|
|
{
|
|
"entropy": 5.192721700668335,
|
|
"epoch": 1.9644572526416906,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004616237230383833,
|
|
"loss": 4.9867,
|
|
"mean_token_accuracy": 0.2198183998465538,
|
|
"num_tokens": 46880045.0,
|
|
"step": 20450
|
|
},
|
|
{
|
|
"entropy": 5.1650725364685055,
|
|
"epoch": 1.9649375600384245,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046160457015869414,
|
|
"loss": 4.9148,
|
|
"mean_token_accuracy": 0.22214917838573456,
|
|
"num_tokens": 46890680.0,
|
|
"step": 20455
|
|
},
|
|
{
|
|
"entropy": 5.133912897109985,
|
|
"epoch": 1.9654178674351583,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046158541294660083,
|
|
"loss": 4.8629,
|
|
"mean_token_accuracy": 0.22117667347192765,
|
|
"num_tokens": 46904807.0,
|
|
"step": 20460
|
|
},
|
|
{
|
|
"entropy": 5.1891176223754885,
|
|
"epoch": 1.9658981748318924,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004615662514025483,
|
|
"loss": 4.8667,
|
|
"mean_token_accuracy": 0.22456077635288238,
|
|
"num_tokens": 46916039.0,
|
|
"step": 20465
|
|
},
|
|
{
|
|
"entropy": 5.184952592849731,
|
|
"epoch": 1.9663784822286263,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046154708552698147,
|
|
"loss": 4.8863,
|
|
"mean_token_accuracy": 0.2231151282787323,
|
|
"num_tokens": 46927175.0,
|
|
"step": 20470
|
|
},
|
|
{
|
|
"entropy": 5.097032785415649,
|
|
"epoch": 1.9668587896253602,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046152791532034517,
|
|
"loss": 4.7743,
|
|
"mean_token_accuracy": 0.23669862747192383,
|
|
"num_tokens": 46938711.0,
|
|
"step": 20475
|
|
},
|
|
{
|
|
"entropy": 5.152624464035034,
|
|
"epoch": 1.967339097022094,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00046150874078308463,
|
|
"loss": 4.8935,
|
|
"mean_token_accuracy": 0.21999486535787582,
|
|
"num_tokens": 46951002.0,
|
|
"step": 20480
|
|
},
|
|
{
|
|
"entropy": 5.23142032623291,
|
|
"epoch": 1.9678194044188282,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004614895619156449,
|
|
"loss": 4.9591,
|
|
"mean_token_accuracy": 0.22308919131755828,
|
|
"num_tokens": 46962673.0,
|
|
"step": 20485
|
|
},
|
|
{
|
|
"entropy": 5.1750617027282715,
|
|
"epoch": 1.968299711815562,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046147037871847125,
|
|
"loss": 4.8769,
|
|
"mean_token_accuracy": 0.22022126466035843,
|
|
"num_tokens": 46973955.0,
|
|
"step": 20490
|
|
},
|
|
{
|
|
"entropy": 5.196652221679687,
|
|
"epoch": 1.968780019212296,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004614511911920092,
|
|
"loss": 5.0136,
|
|
"mean_token_accuracy": 0.21228874027729033,
|
|
"num_tokens": 46985791.0,
|
|
"step": 20495
|
|
},
|
|
{
|
|
"entropy": 5.277962875366211,
|
|
"epoch": 1.9692603266090298,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046143199933670404,
|
|
"loss": 5.053,
|
|
"mean_token_accuracy": 0.21041589677333833,
|
|
"num_tokens": 46998437.0,
|
|
"step": 20500
|
|
},
|
|
{
|
|
"entropy": 5.223962783813477,
|
|
"epoch": 1.9697406340057637,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004614128031530015,
|
|
"loss": 4.8518,
|
|
"mean_token_accuracy": 0.21939769387245178,
|
|
"num_tokens": 47009666.0,
|
|
"step": 20505
|
|
},
|
|
{
|
|
"entropy": 5.241269969940186,
|
|
"epoch": 1.9702209414024976,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00046139360264134724,
|
|
"loss": 4.936,
|
|
"mean_token_accuracy": 0.21502902060747148,
|
|
"num_tokens": 47021170.0,
|
|
"step": 20510
|
|
},
|
|
{
|
|
"entropy": 5.196494102478027,
|
|
"epoch": 1.9707012487992315,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004613743978021869,
|
|
"loss": 4.8961,
|
|
"mean_token_accuracy": 0.22281893640756606,
|
|
"num_tokens": 47033077.0,
|
|
"step": 20515
|
|
},
|
|
{
|
|
"entropy": 5.200880479812622,
|
|
"epoch": 1.9711815561959654,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00046135518863596654,
|
|
"loss": 4.9337,
|
|
"mean_token_accuracy": 0.2225465178489685,
|
|
"num_tokens": 47044858.0,
|
|
"step": 20520
|
|
},
|
|
{
|
|
"entropy": 5.217056846618652,
|
|
"epoch": 1.9716618635926992,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046133597514313204,
|
|
"loss": 5.0065,
|
|
"mean_token_accuracy": 0.2121262475848198,
|
|
"num_tokens": 47057587.0,
|
|
"step": 20525
|
|
},
|
|
{
|
|
"entropy": 5.242043828964233,
|
|
"epoch": 1.9721421709894331,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004613167573241295,
|
|
"loss": 4.9383,
|
|
"mean_token_accuracy": 0.21319063603878022,
|
|
"num_tokens": 47068690.0,
|
|
"step": 20530
|
|
},
|
|
{
|
|
"entropy": 5.134704160690307,
|
|
"epoch": 1.972622478386167,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004612975351794051,
|
|
"loss": 4.9088,
|
|
"mean_token_accuracy": 0.22375574558973313,
|
|
"num_tokens": 47081279.0,
|
|
"step": 20535
|
|
},
|
|
{
|
|
"entropy": 5.199207830429077,
|
|
"epoch": 1.973102785782901,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004612783087094051,
|
|
"loss": 4.9617,
|
|
"mean_token_accuracy": 0.21864840090274812,
|
|
"num_tokens": 47093372.0,
|
|
"step": 20540
|
|
},
|
|
{
|
|
"entropy": 5.259644794464111,
|
|
"epoch": 1.973583093179635,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046125907791457594,
|
|
"loss": 5.0322,
|
|
"mean_token_accuracy": 0.21205034255981445,
|
|
"num_tokens": 47104332.0,
|
|
"step": 20545
|
|
},
|
|
{
|
|
"entropy": 5.201462650299073,
|
|
"epoch": 1.9740634005763689,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00046123984279536405,
|
|
"loss": 4.9045,
|
|
"mean_token_accuracy": 0.21743421554565429,
|
|
"num_tokens": 47116941.0,
|
|
"step": 20550
|
|
},
|
|
{
|
|
"entropy": 5.270235967636109,
|
|
"epoch": 1.9745437079731027,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046122060335221604,
|
|
"loss": 4.9614,
|
|
"mean_token_accuracy": 0.21613819301128387,
|
|
"num_tokens": 47128473.0,
|
|
"step": 20555
|
|
},
|
|
{
|
|
"entropy": 5.208339738845825,
|
|
"epoch": 1.9750240153698368,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046120135958557855,
|
|
"loss": 4.8999,
|
|
"mean_token_accuracy": 0.21493572890758514,
|
|
"num_tokens": 47140586.0,
|
|
"step": 20560
|
|
},
|
|
{
|
|
"entropy": 5.112038850784302,
|
|
"epoch": 1.9755043227665707,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046118211149589843,
|
|
"loss": 4.8393,
|
|
"mean_token_accuracy": 0.23156896084547043,
|
|
"num_tokens": 47150875.0,
|
|
"step": 20565
|
|
},
|
|
{
|
|
"entropy": 5.145128870010376,
|
|
"epoch": 1.9759846301633046,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004611628590836225,
|
|
"loss": 4.7982,
|
|
"mean_token_accuracy": 0.23350438922643663,
|
|
"num_tokens": 47162074.0,
|
|
"step": 20570
|
|
},
|
|
{
|
|
"entropy": 5.13048825263977,
|
|
"epoch": 1.9764649375600385,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004611436023491979,
|
|
"loss": 4.8319,
|
|
"mean_token_accuracy": 0.22945202738046647,
|
|
"num_tokens": 47174544.0,
|
|
"step": 20575
|
|
},
|
|
{
|
|
"entropy": 5.209021663665771,
|
|
"epoch": 1.9769452449567724,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004611243412930714,
|
|
"loss": 4.9348,
|
|
"mean_token_accuracy": 0.21808502674102784,
|
|
"num_tokens": 47186260.0,
|
|
"step": 20580
|
|
},
|
|
{
|
|
"entropy": 5.1533670902252195,
|
|
"epoch": 1.9774255523535063,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00046110507591569047,
|
|
"loss": 4.8547,
|
|
"mean_token_accuracy": 0.2219648018479347,
|
|
"num_tokens": 47198059.0,
|
|
"step": 20585
|
|
},
|
|
{
|
|
"entropy": 5.1115552425384525,
|
|
"epoch": 1.9779058597502401,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004610858062175023,
|
|
"loss": 4.8177,
|
|
"mean_token_accuracy": 0.23198753446340561,
|
|
"num_tokens": 47208300.0,
|
|
"step": 20590
|
|
},
|
|
{
|
|
"entropy": 5.135996913909912,
|
|
"epoch": 1.978386167146974,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00046106653219895417,
|
|
"loss": 4.8764,
|
|
"mean_token_accuracy": 0.23086352050304412,
|
|
"num_tokens": 47220135.0,
|
|
"step": 20595
|
|
},
|
|
{
|
|
"entropy": 5.274822854995728,
|
|
"epoch": 1.978866474543708,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004610472538604938,
|
|
"loss": 4.9403,
|
|
"mean_token_accuracy": 0.21703655570745467,
|
|
"num_tokens": 47230385.0,
|
|
"step": 20600
|
|
},
|
|
{
|
|
"entropy": 5.115185976028442,
|
|
"epoch": 1.9793467819404418,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046102797120256854,
|
|
"loss": 4.8057,
|
|
"mean_token_accuracy": 0.22790150493383407,
|
|
"num_tokens": 47242593.0,
|
|
"step": 20605
|
|
},
|
|
{
|
|
"entropy": 5.228035879135132,
|
|
"epoch": 1.9798270893371757,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004610086842256262,
|
|
"loss": 4.9663,
|
|
"mean_token_accuracy": 0.21830164194107055,
|
|
"num_tokens": 47254223.0,
|
|
"step": 20610
|
|
},
|
|
{
|
|
"entropy": 5.213165378570556,
|
|
"epoch": 1.9803073967339095,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004609893929301146,
|
|
"loss": 4.9534,
|
|
"mean_token_accuracy": 0.22537720054388047,
|
|
"num_tokens": 47266559.0,
|
|
"step": 20615
|
|
},
|
|
{
|
|
"entropy": 5.259535551071167,
|
|
"epoch": 1.9807877041306436,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004609700973164816,
|
|
"loss": 4.9239,
|
|
"mean_token_accuracy": 0.22068007290363312,
|
|
"num_tokens": 47276438.0,
|
|
"step": 20620
|
|
},
|
|
{
|
|
"entropy": 5.153802680969238,
|
|
"epoch": 1.9812680115273775,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004609507973851751,
|
|
"loss": 4.936,
|
|
"mean_token_accuracy": 0.2177566260099411,
|
|
"num_tokens": 47287791.0,
|
|
"step": 20625
|
|
},
|
|
{
|
|
"entropy": 5.166331243515015,
|
|
"epoch": 1.9817483189241114,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046093149313664316,
|
|
"loss": 4.8766,
|
|
"mean_token_accuracy": 0.22625285685062407,
|
|
"num_tokens": 47300282.0,
|
|
"step": 20630
|
|
},
|
|
{
|
|
"entropy": 5.292104244232178,
|
|
"epoch": 1.9822286263208453,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004609121845713342,
|
|
"loss": 4.9625,
|
|
"mean_token_accuracy": 0.22391965836286545,
|
|
"num_tokens": 47310473.0,
|
|
"step": 20635
|
|
},
|
|
{
|
|
"entropy": 5.091473150253296,
|
|
"epoch": 1.9827089337175794,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004608928716896963,
|
|
"loss": 4.8727,
|
|
"mean_token_accuracy": 0.22272167056798936,
|
|
"num_tokens": 47322245.0,
|
|
"step": 20640
|
|
},
|
|
{
|
|
"entropy": 5.215969657897949,
|
|
"epoch": 1.9831892411143133,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004608735544921778,
|
|
"loss": 4.8979,
|
|
"mean_token_accuracy": 0.22845628559589387,
|
|
"num_tokens": 47334316.0,
|
|
"step": 20645
|
|
},
|
|
{
|
|
"entropy": 5.24719614982605,
|
|
"epoch": 1.9836695485110472,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046085423297922745,
|
|
"loss": 4.9315,
|
|
"mean_token_accuracy": 0.2245178133249283,
|
|
"num_tokens": 47344909.0,
|
|
"step": 20650
|
|
},
|
|
{
|
|
"entropy": 5.075035095214844,
|
|
"epoch": 1.984149855907781,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046083490715129367,
|
|
"loss": 4.806,
|
|
"mean_token_accuracy": 0.23430878520011902,
|
|
"num_tokens": 47356682.0,
|
|
"step": 20655
|
|
},
|
|
{
|
|
"entropy": 5.1300498962402346,
|
|
"epoch": 1.984630163304515,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004608155770088251,
|
|
"loss": 4.7656,
|
|
"mean_token_accuracy": 0.2302561417222023,
|
|
"num_tokens": 47368362.0,
|
|
"step": 20660
|
|
},
|
|
{
|
|
"entropy": 5.2220391750335695,
|
|
"epoch": 1.9851104707012488,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00046079624255227066,
|
|
"loss": 4.8974,
|
|
"mean_token_accuracy": 0.22234228402376174,
|
|
"num_tokens": 47379932.0,
|
|
"step": 20665
|
|
},
|
|
{
|
|
"entropy": 5.178265142440796,
|
|
"epoch": 1.9855907780979827,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004607769037820791,
|
|
"loss": 5.0239,
|
|
"mean_token_accuracy": 0.2156722739338875,
|
|
"num_tokens": 47391244.0,
|
|
"step": 20670
|
|
},
|
|
{
|
|
"entropy": 5.200816106796265,
|
|
"epoch": 1.9860710854947166,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004607575606986995,
|
|
"loss": 4.9086,
|
|
"mean_token_accuracy": 0.221403868496418,
|
|
"num_tokens": 47404005.0,
|
|
"step": 20675
|
|
},
|
|
{
|
|
"entropy": 5.120086431503296,
|
|
"epoch": 1.9865513928914504,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004607382133025809,
|
|
"loss": 4.8111,
|
|
"mean_token_accuracy": 0.22623786628246306,
|
|
"num_tokens": 47414555.0,
|
|
"step": 20680
|
|
},
|
|
{
|
|
"entropy": 5.157238340377807,
|
|
"epoch": 1.9870317002881843,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00046071886159417257,
|
|
"loss": 4.9429,
|
|
"mean_token_accuracy": 0.22154468148946763,
|
|
"num_tokens": 47425902.0,
|
|
"step": 20685
|
|
},
|
|
{
|
|
"entropy": 5.058484172821045,
|
|
"epoch": 1.9875120076849182,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004606995055739238,
|
|
"loss": 4.8014,
|
|
"mean_token_accuracy": 0.22854122519493103,
|
|
"num_tokens": 47438068.0,
|
|
"step": 20690
|
|
},
|
|
{
|
|
"entropy": 5.215109252929688,
|
|
"epoch": 1.9879923150816523,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00046068014524228374,
|
|
"loss": 4.9531,
|
|
"mean_token_accuracy": 0.2229817181825638,
|
|
"num_tokens": 47450322.0,
|
|
"step": 20695
|
|
},
|
|
{
|
|
"entropy": 5.151501083374024,
|
|
"epoch": 1.9884726224783862,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046066078059970217,
|
|
"loss": 4.8988,
|
|
"mean_token_accuracy": 0.22711405605077745,
|
|
"num_tokens": 47462615.0,
|
|
"step": 20700
|
|
},
|
|
{
|
|
"entropy": 5.158354616165161,
|
|
"epoch": 1.98895292987512,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004606414116466286,
|
|
"loss": 4.8534,
|
|
"mean_token_accuracy": 0.2186825007200241,
|
|
"num_tokens": 47472530.0,
|
|
"step": 20705
|
|
},
|
|
{
|
|
"entropy": 5.094603776931763,
|
|
"epoch": 1.989433237271854,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046062203838351267,
|
|
"loss": 4.8324,
|
|
"mean_token_accuracy": 0.22771522551774978,
|
|
"num_tokens": 47484217.0,
|
|
"step": 20710
|
|
},
|
|
{
|
|
"entropy": 5.1945013999938965,
|
|
"epoch": 1.989913544668588,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046060266081080414,
|
|
"loss": 4.9128,
|
|
"mean_token_accuracy": 0.22298493534326552,
|
|
"num_tokens": 47496365.0,
|
|
"step": 20715
|
|
},
|
|
{
|
|
"entropy": 5.161307954788208,
|
|
"epoch": 1.990393852065322,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004605832789289531,
|
|
"loss": 4.8889,
|
|
"mean_token_accuracy": 0.23147787898778915,
|
|
"num_tokens": 47509437.0,
|
|
"step": 20720
|
|
},
|
|
{
|
|
"entropy": 5.249230623245239,
|
|
"epoch": 1.9908741594620558,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004605638927384093,
|
|
"loss": 4.9501,
|
|
"mean_token_accuracy": 0.21548073440790178,
|
|
"num_tokens": 47520913.0,
|
|
"step": 20725
|
|
},
|
|
{
|
|
"entropy": 5.247469091415406,
|
|
"epoch": 1.9913544668587897,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046054450223962284,
|
|
"loss": 4.9293,
|
|
"mean_token_accuracy": 0.21719965785741807,
|
|
"num_tokens": 47532015.0,
|
|
"step": 20730
|
|
},
|
|
{
|
|
"entropy": 5.224554872512817,
|
|
"epoch": 1.9918347742555236,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046052510743304405,
|
|
"loss": 5.011,
|
|
"mean_token_accuracy": 0.21654557585716247,
|
|
"num_tokens": 47543675.0,
|
|
"step": 20735
|
|
},
|
|
{
|
|
"entropy": 5.162395858764649,
|
|
"epoch": 1.9923150816522575,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004605057083191232,
|
|
"loss": 4.8745,
|
|
"mean_token_accuracy": 0.22160074561834336,
|
|
"num_tokens": 47555757.0,
|
|
"step": 20740
|
|
},
|
|
{
|
|
"entropy": 5.2275580883026125,
|
|
"epoch": 1.9927953890489913,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004604863048983106,
|
|
"loss": 5.0157,
|
|
"mean_token_accuracy": 0.21975622475147247,
|
|
"num_tokens": 47568043.0,
|
|
"step": 20745
|
|
},
|
|
{
|
|
"entropy": 5.137474584579468,
|
|
"epoch": 1.9932756964457252,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004604668971710568,
|
|
"loss": 4.8694,
|
|
"mean_token_accuracy": 0.22627678513526917,
|
|
"num_tokens": 47577811.0,
|
|
"step": 20750
|
|
},
|
|
{
|
|
"entropy": 5.268559169769287,
|
|
"epoch": 1.993756003842459,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004604474851378124,
|
|
"loss": 4.9726,
|
|
"mean_token_accuracy": 0.21728340685367584,
|
|
"num_tokens": 47588309.0,
|
|
"step": 20755
|
|
},
|
|
{
|
|
"entropy": 5.331043815612793,
|
|
"epoch": 1.994236311239193,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046042806879902803,
|
|
"loss": 5.0637,
|
|
"mean_token_accuracy": 0.20983712524175643,
|
|
"num_tokens": 47599814.0,
|
|
"step": 20760
|
|
},
|
|
{
|
|
"entropy": 5.226640224456787,
|
|
"epoch": 1.9947166186359269,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004604086481551546,
|
|
"loss": 4.9663,
|
|
"mean_token_accuracy": 0.2197731092572212,
|
|
"num_tokens": 47611851.0,
|
|
"step": 20765
|
|
},
|
|
{
|
|
"entropy": 5.173228645324707,
|
|
"epoch": 1.9951969260326607,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004603892232066428,
|
|
"loss": 4.8209,
|
|
"mean_token_accuracy": 0.22686802744865417,
|
|
"num_tokens": 47624056.0,
|
|
"step": 20770
|
|
},
|
|
{
|
|
"entropy": 5.171985721588134,
|
|
"epoch": 1.9956772334293948,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046036979395394374,
|
|
"loss": 4.9178,
|
|
"mean_token_accuracy": 0.22749678641557694,
|
|
"num_tokens": 47637214.0,
|
|
"step": 20775
|
|
},
|
|
{
|
|
"entropy": 5.163872909545899,
|
|
"epoch": 1.9961575408261287,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004603503603975085,
|
|
"loss": 4.8755,
|
|
"mean_token_accuracy": 0.2249054953455925,
|
|
"num_tokens": 47647498.0,
|
|
"step": 20780
|
|
},
|
|
{
|
|
"entropy": 5.165285444259643,
|
|
"epoch": 1.9966378482228626,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004603309225377883,
|
|
"loss": 4.8804,
|
|
"mean_token_accuracy": 0.2229431003332138,
|
|
"num_tokens": 47659576.0,
|
|
"step": 20785
|
|
},
|
|
{
|
|
"entropy": 5.208536052703858,
|
|
"epoch": 1.9971181556195965,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004603114803752344,
|
|
"loss": 4.8832,
|
|
"mean_token_accuracy": 0.21672031581401824,
|
|
"num_tokens": 47670631.0,
|
|
"step": 20790
|
|
},
|
|
{
|
|
"entropy": 5.1253608703613285,
|
|
"epoch": 1.9975984630163306,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00046029203391029813,
|
|
"loss": 4.8264,
|
|
"mean_token_accuracy": 0.22523149996995925,
|
|
"num_tokens": 47681647.0,
|
|
"step": 20795
|
|
},
|
|
{
|
|
"entropy": 5.123491811752319,
|
|
"epoch": 1.9980787704130645,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046027258314343107,
|
|
"loss": 4.8585,
|
|
"mean_token_accuracy": 0.22700741440057753,
|
|
"num_tokens": 47694386.0,
|
|
"step": 20800
|
|
},
|
|
{
|
|
"entropy": 5.194536399841309,
|
|
"epoch": 1.9985590778097984,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046025312807508487,
|
|
"loss": 4.902,
|
|
"mean_token_accuracy": 0.2210228532552719,
|
|
"num_tokens": 47704897.0,
|
|
"step": 20805
|
|
},
|
|
{
|
|
"entropy": 5.185521221160888,
|
|
"epoch": 1.9990393852065322,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046023366870571097,
|
|
"loss": 4.9504,
|
|
"mean_token_accuracy": 0.22025657594203948,
|
|
"num_tokens": 47715771.0,
|
|
"step": 20810
|
|
},
|
|
{
|
|
"entropy": 5.205077028274536,
|
|
"epoch": 1.9995196926032661,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00046021420503576145,
|
|
"loss": 4.832,
|
|
"mean_token_accuracy": 0.22895766347646712,
|
|
"num_tokens": 47726295.0,
|
|
"step": 20815
|
|
},
|
|
{
|
|
"entropy": 5.0892219066619875,
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004601947370656879,
|
|
"loss": 4.9096,
|
|
"mean_token_accuracy": 0.21949992924928666,
|
|
"num_tokens": 47737072.0,
|
|
"step": 20820
|
|
},
|
|
{
|
|
"entropy": 5.194896125793457,
|
|
"epoch": 2.000480307396734,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004601752647959426,
|
|
"loss": 4.8771,
|
|
"mean_token_accuracy": 0.2225003719329834,
|
|
"num_tokens": 47750379.0,
|
|
"step": 20825
|
|
},
|
|
{
|
|
"entropy": 5.293734169006347,
|
|
"epoch": 2.0009606147934678,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004601557882269775,
|
|
"loss": 4.9057,
|
|
"mean_token_accuracy": 0.21733027547597886,
|
|
"num_tokens": 47763059.0,
|
|
"step": 20830
|
|
},
|
|
{
|
|
"entropy": 5.149887371063232,
|
|
"epoch": 2.0014409221902016,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004601363073592447,
|
|
"loss": 4.8075,
|
|
"mean_token_accuracy": 0.22715968489646912,
|
|
"num_tokens": 47775513.0,
|
|
"step": 20835
|
|
},
|
|
{
|
|
"entropy": 5.059220695495606,
|
|
"epoch": 2.0019212295869355,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004601168221931967,
|
|
"loss": 4.7368,
|
|
"mean_token_accuracy": 0.2307532474398613,
|
|
"num_tokens": 47786215.0,
|
|
"step": 20840
|
|
},
|
|
{
|
|
"entropy": 5.1203209400177006,
|
|
"epoch": 2.0024015369836694,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004600973327292857,
|
|
"loss": 4.8167,
|
|
"mean_token_accuracy": 0.22654520273208617,
|
|
"num_tokens": 47796894.0,
|
|
"step": 20845
|
|
},
|
|
{
|
|
"entropy": 5.16290979385376,
|
|
"epoch": 2.0028818443804033,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00046007783896796436,
|
|
"loss": 4.8404,
|
|
"mean_token_accuracy": 0.22516684383153915,
|
|
"num_tokens": 47808505.0,
|
|
"step": 20850
|
|
},
|
|
{
|
|
"entropy": 5.139503717422485,
|
|
"epoch": 2.003362151777137,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004600583409096851,
|
|
"loss": 4.729,
|
|
"mean_token_accuracy": 0.23259917199611663,
|
|
"num_tokens": 47819617.0,
|
|
"step": 20855
|
|
},
|
|
{
|
|
"entropy": 5.103642559051513,
|
|
"epoch": 2.0038424591738715,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046003883855490066,
|
|
"loss": 4.7818,
|
|
"mean_token_accuracy": 0.23402840942144393,
|
|
"num_tokens": 47830655.0,
|
|
"step": 20860
|
|
},
|
|
{
|
|
"entropy": 5.106681871414184,
|
|
"epoch": 2.0043227665706054,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004600193319040638,
|
|
"loss": 4.7267,
|
|
"mean_token_accuracy": 0.23414026349782943,
|
|
"num_tokens": 47842439.0,
|
|
"step": 20865
|
|
},
|
|
{
|
|
"entropy": 5.0792402744293215,
|
|
"epoch": 2.0048030739673393,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045999982095762756,
|
|
"loss": 4.6937,
|
|
"mean_token_accuracy": 0.23838399052619935,
|
|
"num_tokens": 47853327.0,
|
|
"step": 20870
|
|
},
|
|
{
|
|
"entropy": 5.178643083572387,
|
|
"epoch": 2.005283381364073,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00045998030571604473,
|
|
"loss": 4.8768,
|
|
"mean_token_accuracy": 0.22799644619226456,
|
|
"num_tokens": 47867155.0,
|
|
"step": 20875
|
|
},
|
|
{
|
|
"entropy": 5.142761993408203,
|
|
"epoch": 2.005763688760807,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004599607861797685,
|
|
"loss": 4.8213,
|
|
"mean_token_accuracy": 0.22531704753637313,
|
|
"num_tokens": 47879641.0,
|
|
"step": 20880
|
|
},
|
|
{
|
|
"entropy": 5.163343143463135,
|
|
"epoch": 2.006243996157541,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00045994126234925203,
|
|
"loss": 4.8515,
|
|
"mean_token_accuracy": 0.22218380719423295,
|
|
"num_tokens": 47890512.0,
|
|
"step": 20885
|
|
},
|
|
{
|
|
"entropy": 5.1860779285430905,
|
|
"epoch": 2.006724303554275,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045992173422494865,
|
|
"loss": 4.8701,
|
|
"mean_token_accuracy": 0.21841327995061874,
|
|
"num_tokens": 47902575.0,
|
|
"step": 20890
|
|
},
|
|
{
|
|
"entropy": 5.16825647354126,
|
|
"epoch": 2.0072046109510087,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004599022018073117,
|
|
"loss": 4.7529,
|
|
"mean_token_accuracy": 0.2366969734430313,
|
|
"num_tokens": 47915447.0,
|
|
"step": 20895
|
|
},
|
|
{
|
|
"entropy": 5.090538883209229,
|
|
"epoch": 2.0076849183477425,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004598826650967946,
|
|
"loss": 4.8135,
|
|
"mean_token_accuracy": 0.2228596404194832,
|
|
"num_tokens": 47925702.0,
|
|
"step": 20900
|
|
},
|
|
{
|
|
"entropy": 5.163640880584717,
|
|
"epoch": 2.0081652257444764,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045986312409385105,
|
|
"loss": 4.8047,
|
|
"mean_token_accuracy": 0.22682830542325974,
|
|
"num_tokens": 47936621.0,
|
|
"step": 20905
|
|
},
|
|
{
|
|
"entropy": 5.110746955871582,
|
|
"epoch": 2.0086455331412103,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004598435787989347,
|
|
"loss": 4.7464,
|
|
"mean_token_accuracy": 0.23588199466466903,
|
|
"num_tokens": 47948026.0,
|
|
"step": 20910
|
|
},
|
|
{
|
|
"entropy": 5.19895453453064,
|
|
"epoch": 2.009125840537944,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00045982402921249934,
|
|
"loss": 4.8678,
|
|
"mean_token_accuracy": 0.22533251196146012,
|
|
"num_tokens": 47959976.0,
|
|
"step": 20915
|
|
},
|
|
{
|
|
"entropy": 5.225091600418091,
|
|
"epoch": 2.009606147934678,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004598044753349988,
|
|
"loss": 4.8494,
|
|
"mean_token_accuracy": 0.22337938696146012,
|
|
"num_tokens": 47972071.0,
|
|
"step": 20920
|
|
},
|
|
{
|
|
"entropy": 5.185384368896484,
|
|
"epoch": 2.010086455331412,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045978491716688706,
|
|
"loss": 4.7873,
|
|
"mean_token_accuracy": 0.2292557254433632,
|
|
"num_tokens": 47984789.0,
|
|
"step": 20925
|
|
},
|
|
{
|
|
"entropy": 5.139200592041016,
|
|
"epoch": 2.010566762728146,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004597653547086184,
|
|
"loss": 4.7904,
|
|
"mean_token_accuracy": 0.231342613697052,
|
|
"num_tokens": 47995661.0,
|
|
"step": 20930
|
|
},
|
|
{
|
|
"entropy": 5.208558177947998,
|
|
"epoch": 2.0110470701248797,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004597457879606467,
|
|
"loss": 4.8689,
|
|
"mean_token_accuracy": 0.22292304635047913,
|
|
"num_tokens": 48006043.0,
|
|
"step": 20935
|
|
},
|
|
{
|
|
"entropy": 5.165970993041992,
|
|
"epoch": 2.011527377521614,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00045972621692342636,
|
|
"loss": 4.9192,
|
|
"mean_token_accuracy": 0.22206807434558867,
|
|
"num_tokens": 48016904.0,
|
|
"step": 20940
|
|
},
|
|
{
|
|
"entropy": 5.181275033950806,
|
|
"epoch": 2.012007684918348,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00045970664159741186,
|
|
"loss": 4.8304,
|
|
"mean_token_accuracy": 0.23026852905750275,
|
|
"num_tokens": 48026533.0,
|
|
"step": 20945
|
|
},
|
|
{
|
|
"entropy": 5.155037689208984,
|
|
"epoch": 2.012487992315082,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045968706198305765,
|
|
"loss": 4.769,
|
|
"mean_token_accuracy": 0.23366015553474426,
|
|
"num_tokens": 48038280.0,
|
|
"step": 20950
|
|
},
|
|
{
|
|
"entropy": 5.073322820663452,
|
|
"epoch": 2.0129682997118157,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00045966747808081824,
|
|
"loss": 4.7476,
|
|
"mean_token_accuracy": 0.23707432001829148,
|
|
"num_tokens": 48049663.0,
|
|
"step": 20955
|
|
},
|
|
{
|
|
"entropy": 5.147378587722779,
|
|
"epoch": 2.0134486071085496,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004596478898911483,
|
|
"loss": 4.8248,
|
|
"mean_token_accuracy": 0.22687099874019623,
|
|
"num_tokens": 48059826.0,
|
|
"step": 20960
|
|
},
|
|
{
|
|
"entropy": 5.163741254806519,
|
|
"epoch": 2.0139289145052834,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045962829741450265,
|
|
"loss": 4.7405,
|
|
"mean_token_accuracy": 0.23313153833150863,
|
|
"num_tokens": 48071097.0,
|
|
"step": 20965
|
|
},
|
|
{
|
|
"entropy": 5.1321446895599365,
|
|
"epoch": 2.0144092219020173,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004596087006513361,
|
|
"loss": 4.7963,
|
|
"mean_token_accuracy": 0.22714407742023468,
|
|
"num_tokens": 48082297.0,
|
|
"step": 20970
|
|
},
|
|
{
|
|
"entropy": 5.073679256439209,
|
|
"epoch": 2.014889529298751,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00045958909960210385,
|
|
"loss": 4.7386,
|
|
"mean_token_accuracy": 0.22801399379968643,
|
|
"num_tokens": 48092946.0,
|
|
"step": 20975
|
|
},
|
|
{
|
|
"entropy": 5.0920733451843265,
|
|
"epoch": 2.015369836695485,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00045956949426726075,
|
|
"loss": 4.7988,
|
|
"mean_token_accuracy": 0.2351382240653038,
|
|
"num_tokens": 48105000.0,
|
|
"step": 20980
|
|
},
|
|
{
|
|
"entropy": 5.10857367515564,
|
|
"epoch": 2.015850144092219,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045954988464726203,
|
|
"loss": 4.7492,
|
|
"mean_token_accuracy": 0.22923020124435425,
|
|
"num_tokens": 48117513.0,
|
|
"step": 20985
|
|
},
|
|
{
|
|
"entropy": 5.245584154129029,
|
|
"epoch": 2.016330451488953,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000459530270742563,
|
|
"loss": 4.9732,
|
|
"mean_token_accuracy": 0.21183091551065444,
|
|
"num_tokens": 48130895.0,
|
|
"step": 20990
|
|
},
|
|
{
|
|
"entropy": 5.161238050460815,
|
|
"epoch": 2.0168107588856867,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00045951065255361905,
|
|
"loss": 4.7578,
|
|
"mean_token_accuracy": 0.22671116292476653,
|
|
"num_tokens": 48143348.0,
|
|
"step": 20995
|
|
},
|
|
{
|
|
"entropy": 5.128641080856323,
|
|
"epoch": 2.0172910662824206,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004594910300808856,
|
|
"loss": 4.7912,
|
|
"mean_token_accuracy": 0.23204947561025618,
|
|
"num_tokens": 48154959.0,
|
|
"step": 21000
|
|
},
|
|
{
|
|
"epoch": 2.0172910662824206,
|
|
"eval_entropy": 4.9797098409949365,
|
|
"eval_loss": 4.967945575714111,
|
|
"eval_mean_token_accuracy": 0.22761427323133668,
|
|
"eval_num_tokens": 48154959.0,
|
|
"eval_runtime": 26.6315,
|
|
"eval_samples_per_second": 1232.188,
|
|
"eval_steps_per_second": 154.028,
|
|
"step": 21000
|
|
},
|
|
{
|
|
"entropy": 5.058965253829956,
|
|
"epoch": 2.0177713736791545,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004594714033248183,
|
|
"loss": 4.6705,
|
|
"mean_token_accuracy": 0.23726486414670944,
|
|
"num_tokens": 48165227.0,
|
|
"step": 21005
|
|
},
|
|
{
|
|
"entropy": 5.185387229919433,
|
|
"epoch": 2.0182516810758884,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004594517722858728,
|
|
"loss": 4.8974,
|
|
"mean_token_accuracy": 0.22563222348690032,
|
|
"num_tokens": 48175133.0,
|
|
"step": 21010
|
|
},
|
|
{
|
|
"entropy": 5.174899244308472,
|
|
"epoch": 2.0187319884726227,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045943213696450475,
|
|
"loss": 4.8064,
|
|
"mean_token_accuracy": 0.227556312084198,
|
|
"num_tokens": 48187554.0,
|
|
"step": 21015
|
|
},
|
|
{
|
|
"entropy": 5.197635507583618,
|
|
"epoch": 2.0192122958693566,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00045941249736117023,
|
|
"loss": 4.8612,
|
|
"mean_token_accuracy": 0.22105547934770584,
|
|
"num_tokens": 48199410.0,
|
|
"step": 21020
|
|
},
|
|
{
|
|
"entropy": 5.099879741668701,
|
|
"epoch": 2.0196926032660905,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004593928534763251,
|
|
"loss": 4.778,
|
|
"mean_token_accuracy": 0.22858137935400008,
|
|
"num_tokens": 48211124.0,
|
|
"step": 21025
|
|
},
|
|
{
|
|
"entropy": 5.104985618591309,
|
|
"epoch": 2.0201729106628243,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004593732053104254,
|
|
"loss": 4.7302,
|
|
"mean_token_accuracy": 0.2319321408867836,
|
|
"num_tokens": 48222690.0,
|
|
"step": 21030
|
|
},
|
|
{
|
|
"entropy": 5.139070272445679,
|
|
"epoch": 2.020653218059558,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045935355286392735,
|
|
"loss": 4.7768,
|
|
"mean_token_accuracy": 0.22877870500087738,
|
|
"num_tokens": 48235054.0,
|
|
"step": 21035
|
|
},
|
|
{
|
|
"entropy": 5.045724630355835,
|
|
"epoch": 2.021133525456292,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004593338961372873,
|
|
"loss": 4.7274,
|
|
"mean_token_accuracy": 0.2347530335187912,
|
|
"num_tokens": 48246645.0,
|
|
"step": 21040
|
|
},
|
|
{
|
|
"entropy": 5.0374504089355465,
|
|
"epoch": 2.021613832853026,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004593142351309614,
|
|
"loss": 4.7458,
|
|
"mean_token_accuracy": 0.23782979398965837,
|
|
"num_tokens": 48258149.0,
|
|
"step": 21045
|
|
},
|
|
{
|
|
"entropy": 5.162465238571167,
|
|
"epoch": 2.02209414024976,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004592945698454064,
|
|
"loss": 4.8985,
|
|
"mean_token_accuracy": 0.2223748430609703,
|
|
"num_tokens": 48269892.0,
|
|
"step": 21050
|
|
},
|
|
{
|
|
"entropy": 5.217119455337524,
|
|
"epoch": 2.0225744476464937,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045927490028107866,
|
|
"loss": 4.8687,
|
|
"mean_token_accuracy": 0.22514340579509734,
|
|
"num_tokens": 48281087.0,
|
|
"step": 21055
|
|
},
|
|
{
|
|
"entropy": 5.139309453964233,
|
|
"epoch": 2.0230547550432276,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000459255226438435,
|
|
"loss": 4.7274,
|
|
"mean_token_accuracy": 0.23785278648138047,
|
|
"num_tokens": 48292177.0,
|
|
"step": 21060
|
|
},
|
|
{
|
|
"entropy": 5.058869981765747,
|
|
"epoch": 2.0235350624399615,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000459235548317932,
|
|
"loss": 4.7745,
|
|
"mean_token_accuracy": 0.2236475557088852,
|
|
"num_tokens": 48304281.0,
|
|
"step": 21065
|
|
},
|
|
{
|
|
"entropy": 5.087431287765503,
|
|
"epoch": 2.0240153698366954,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00045921586592002667,
|
|
"loss": 4.751,
|
|
"mean_token_accuracy": 0.23001312762498854,
|
|
"num_tokens": 48316758.0,
|
|
"step": 21070
|
|
},
|
|
{
|
|
"entropy": 5.2216087818145756,
|
|
"epoch": 2.0244956772334293,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000459196179245176,
|
|
"loss": 4.9064,
|
|
"mean_token_accuracy": 0.22211889773607255,
|
|
"num_tokens": 48327455.0,
|
|
"step": 21075
|
|
},
|
|
{
|
|
"entropy": 5.122845983505249,
|
|
"epoch": 2.024975984630163,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004591764882938369,
|
|
"loss": 4.8134,
|
|
"mean_token_accuracy": 0.2234889015555382,
|
|
"num_tokens": 48338105.0,
|
|
"step": 21080
|
|
},
|
|
{
|
|
"entropy": 5.1362800121307375,
|
|
"epoch": 2.025456292026897,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004591567930664667,
|
|
"loss": 4.8668,
|
|
"mean_token_accuracy": 0.22312761843204498,
|
|
"num_tokens": 48349599.0,
|
|
"step": 21085
|
|
},
|
|
{
|
|
"entropy": 5.173755311965943,
|
|
"epoch": 2.025936599423631,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004591370935635226,
|
|
"loss": 4.8962,
|
|
"mean_token_accuracy": 0.22018368989229203,
|
|
"num_tokens": 48361488.0,
|
|
"step": 21090
|
|
},
|
|
{
|
|
"entropy": 5.229612350463867,
|
|
"epoch": 2.0264169068203652,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004591173897854619,
|
|
"loss": 4.8223,
|
|
"mean_token_accuracy": 0.22342414259910584,
|
|
"num_tokens": 48372049.0,
|
|
"step": 21095
|
|
},
|
|
{
|
|
"entropy": 5.111845207214356,
|
|
"epoch": 2.026897214217099,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004590976817327422,
|
|
"loss": 4.7524,
|
|
"mean_token_accuracy": 0.23141625225543977,
|
|
"num_tokens": 48385599.0,
|
|
"step": 21100
|
|
},
|
|
{
|
|
"entropy": 5.187682151794434,
|
|
"epoch": 2.027377521613833,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004590779694058209,
|
|
"loss": 4.8404,
|
|
"mean_token_accuracy": 0.230179800093174,
|
|
"num_tokens": 48397390.0,
|
|
"step": 21105
|
|
},
|
|
{
|
|
"entropy": 5.152220296859741,
|
|
"epoch": 2.027857829010567,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00045905825280515586,
|
|
"loss": 4.8487,
|
|
"mean_token_accuracy": 0.22240738272666932,
|
|
"num_tokens": 48409837.0,
|
|
"step": 21110
|
|
},
|
|
{
|
|
"entropy": 5.095877361297608,
|
|
"epoch": 2.0283381364073008,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00045903853193120464,
|
|
"loss": 4.6985,
|
|
"mean_token_accuracy": 0.2371117353439331,
|
|
"num_tokens": 48420765.0,
|
|
"step": 21115
|
|
},
|
|
{
|
|
"entropy": 5.13346791267395,
|
|
"epoch": 2.0288184438040346,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045901880678442524,
|
|
"loss": 4.8352,
|
|
"mean_token_accuracy": 0.22843140363693237,
|
|
"num_tokens": 48431939.0,
|
|
"step": 21120
|
|
},
|
|
{
|
|
"entropy": 5.203976631164551,
|
|
"epoch": 2.0292987512007685,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045899907736527556,
|
|
"loss": 4.8942,
|
|
"mean_token_accuracy": 0.21955382823944092,
|
|
"num_tokens": 48444820.0,
|
|
"step": 21125
|
|
},
|
|
{
|
|
"entropy": 5.116818284988403,
|
|
"epoch": 2.0297790585975024,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045897934367421364,
|
|
"loss": 4.8264,
|
|
"mean_token_accuracy": 0.2259441375732422,
|
|
"num_tokens": 48457315.0,
|
|
"step": 21130
|
|
},
|
|
{
|
|
"entropy": 5.199477338790894,
|
|
"epoch": 2.0302593659942363,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004589596057116977,
|
|
"loss": 4.8979,
|
|
"mean_token_accuracy": 0.21822259724140167,
|
|
"num_tokens": 48468612.0,
|
|
"step": 21135
|
|
},
|
|
{
|
|
"entropy": 5.089222574234009,
|
|
"epoch": 2.03073967339097,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045893986347818593,
|
|
"loss": 4.7565,
|
|
"mean_token_accuracy": 0.22949785143136978,
|
|
"num_tokens": 48479453.0,
|
|
"step": 21140
|
|
},
|
|
{
|
|
"entropy": 5.087232303619385,
|
|
"epoch": 2.031219980787704,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004589201169741368,
|
|
"loss": 4.7952,
|
|
"mean_token_accuracy": 0.22973719537258147,
|
|
"num_tokens": 48491656.0,
|
|
"step": 21145
|
|
},
|
|
{
|
|
"entropy": 5.103940820693969,
|
|
"epoch": 2.031700288184438,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045890036620000856,
|
|
"loss": 4.7292,
|
|
"mean_token_accuracy": 0.2226880133152008,
|
|
"num_tokens": 48503431.0,
|
|
"step": 21150
|
|
},
|
|
{
|
|
"entropy": 5.117399597167969,
|
|
"epoch": 2.032180595581172,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045888061115626,
|
|
"loss": 4.7687,
|
|
"mean_token_accuracy": 0.22986358106136323,
|
|
"num_tokens": 48513922.0,
|
|
"step": 21155
|
|
},
|
|
{
|
|
"entropy": 5.036969900131226,
|
|
"epoch": 2.0326609029779057,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004588608518433496,
|
|
"loss": 4.7416,
|
|
"mean_token_accuracy": 0.2294941857457161,
|
|
"num_tokens": 48524715.0,
|
|
"step": 21160
|
|
},
|
|
{
|
|
"entropy": 5.133905744552612,
|
|
"epoch": 2.0331412103746396,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004588410882617362,
|
|
"loss": 4.884,
|
|
"mean_token_accuracy": 0.22323887348175048,
|
|
"num_tokens": 48536696.0,
|
|
"step": 21165
|
|
},
|
|
{
|
|
"entropy": 5.264414978027344,
|
|
"epoch": 2.0336215177713735,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004588213204118786,
|
|
"loss": 4.9228,
|
|
"mean_token_accuracy": 0.22241499423980712,
|
|
"num_tokens": 48550290.0,
|
|
"step": 21170
|
|
},
|
|
{
|
|
"entropy": 5.195666217803955,
|
|
"epoch": 2.034101825168108,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045880154829423586,
|
|
"loss": 4.8509,
|
|
"mean_token_accuracy": 0.22739229947328568,
|
|
"num_tokens": 48561047.0,
|
|
"step": 21175
|
|
},
|
|
{
|
|
"entropy": 5.034094524383545,
|
|
"epoch": 2.0345821325648417,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004587817719092668,
|
|
"loss": 4.636,
|
|
"mean_token_accuracy": 0.23849904984235765,
|
|
"num_tokens": 48571268.0,
|
|
"step": 21180
|
|
},
|
|
{
|
|
"entropy": 5.0633808135986325,
|
|
"epoch": 2.0350624399615755,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00045876199125743087,
|
|
"loss": 4.7628,
|
|
"mean_token_accuracy": 0.22590662389993668,
|
|
"num_tokens": 48583158.0,
|
|
"step": 21185
|
|
},
|
|
{
|
|
"entropy": 5.175520372390747,
|
|
"epoch": 2.0355427473583094,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004587422063391871,
|
|
"loss": 4.8415,
|
|
"mean_token_accuracy": 0.22876403331756592,
|
|
"num_tokens": 48593419.0,
|
|
"step": 21190
|
|
},
|
|
{
|
|
"entropy": 5.183585357666016,
|
|
"epoch": 2.0360230547550433,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004587224171549949,
|
|
"loss": 4.8184,
|
|
"mean_token_accuracy": 0.23225459605455398,
|
|
"num_tokens": 48605845.0,
|
|
"step": 21195
|
|
},
|
|
{
|
|
"entropy": 5.118991184234619,
|
|
"epoch": 2.036503362151777,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045870262370531376,
|
|
"loss": 4.7669,
|
|
"mean_token_accuracy": 0.23297881484031677,
|
|
"num_tokens": 48616503.0,
|
|
"step": 21200
|
|
},
|
|
{
|
|
"entropy": 5.188354969024658,
|
|
"epoch": 2.036983669548511,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00045868282599060314,
|
|
"loss": 4.8723,
|
|
"mean_token_accuracy": 0.22089865952730178,
|
|
"num_tokens": 48628410.0,
|
|
"step": 21205
|
|
},
|
|
{
|
|
"entropy": 5.106374835968017,
|
|
"epoch": 2.037463976945245,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004586630240113227,
|
|
"loss": 4.7639,
|
|
"mean_token_accuracy": 0.23033759295940398,
|
|
"num_tokens": 48639457.0,
|
|
"step": 21210
|
|
},
|
|
{
|
|
"entropy": 5.113154745101928,
|
|
"epoch": 2.037944284341979,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004586432177679322,
|
|
"loss": 4.81,
|
|
"mean_token_accuracy": 0.22925937473773955,
|
|
"num_tokens": 48652250.0,
|
|
"step": 21215
|
|
},
|
|
{
|
|
"entropy": 5.124432039260864,
|
|
"epoch": 2.0384245917387127,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045862340726089153,
|
|
"loss": 4.8795,
|
|
"mean_token_accuracy": 0.22818073034286498,
|
|
"num_tokens": 48663386.0,
|
|
"step": 21220
|
|
},
|
|
{
|
|
"entropy": 5.189832639694214,
|
|
"epoch": 2.0389048991354466,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004586035924906606,
|
|
"loss": 4.8491,
|
|
"mean_token_accuracy": 0.22109754979610444,
|
|
"num_tokens": 48675111.0,
|
|
"step": 21225
|
|
},
|
|
{
|
|
"entropy": 5.126422500610351,
|
|
"epoch": 2.0393852065321805,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00045858377345769946,
|
|
"loss": 4.7458,
|
|
"mean_token_accuracy": 0.22514686733484268,
|
|
"num_tokens": 48686264.0,
|
|
"step": 21230
|
|
},
|
|
{
|
|
"entropy": 5.126074361801147,
|
|
"epoch": 2.0398655139289144,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004585639501624682,
|
|
"loss": 4.7946,
|
|
"mean_token_accuracy": 0.22376175075769425,
|
|
"num_tokens": 48697249.0,
|
|
"step": 21235
|
|
},
|
|
{
|
|
"entropy": 5.101242637634277,
|
|
"epoch": 2.0403458213256482,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000458544122605427,
|
|
"loss": 4.7369,
|
|
"mean_token_accuracy": 0.23103302717208862,
|
|
"num_tokens": 48708808.0,
|
|
"step": 21240
|
|
},
|
|
{
|
|
"entropy": 5.159991884231568,
|
|
"epoch": 2.040826128722382,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045852429078703646,
|
|
"loss": 4.8868,
|
|
"mean_token_accuracy": 0.22002633064985275,
|
|
"num_tokens": 48719879.0,
|
|
"step": 21245
|
|
},
|
|
{
|
|
"entropy": 5.2014281272888185,
|
|
"epoch": 2.0413064361191164,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00045850445470775673,
|
|
"loss": 4.8326,
|
|
"mean_token_accuracy": 0.22262947410345077,
|
|
"num_tokens": 48731110.0,
|
|
"step": 21250
|
|
},
|
|
{
|
|
"entropy": 5.137082576751709,
|
|
"epoch": 2.0417867435158503,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004584846143680485,
|
|
"loss": 4.7875,
|
|
"mean_token_accuracy": 0.22981350421905516,
|
|
"num_tokens": 48742799.0,
|
|
"step": 21255
|
|
},
|
|
{
|
|
"entropy": 5.246884727478028,
|
|
"epoch": 2.042267050912584,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004584647697683723,
|
|
"loss": 4.9331,
|
|
"mean_token_accuracy": 0.2172001451253891,
|
|
"num_tokens": 48753773.0,
|
|
"step": 21260
|
|
},
|
|
{
|
|
"entropy": 5.141643142700195,
|
|
"epoch": 2.042747358309318,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00045844492090918904,
|
|
"loss": 4.7857,
|
|
"mean_token_accuracy": 0.23033898323774338,
|
|
"num_tokens": 48764561.0,
|
|
"step": 21265
|
|
},
|
|
{
|
|
"entropy": 5.087370872497559,
|
|
"epoch": 2.043227665706052,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045842506779095936,
|
|
"loss": 4.6829,
|
|
"mean_token_accuracy": 0.22804915010929108,
|
|
"num_tokens": 48775319.0,
|
|
"step": 21270
|
|
},
|
|
{
|
|
"entropy": 5.2386863231658936,
|
|
"epoch": 2.043707973102786,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004584052104141443,
|
|
"loss": 4.9516,
|
|
"mean_token_accuracy": 0.208183716237545,
|
|
"num_tokens": 48787595.0,
|
|
"step": 21275
|
|
},
|
|
{
|
|
"entropy": 5.194898891448974,
|
|
"epoch": 2.0441882804995197,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004583853487792048,
|
|
"loss": 4.8145,
|
|
"mean_token_accuracy": 0.2251271814107895,
|
|
"num_tokens": 48800015.0,
|
|
"step": 21280
|
|
},
|
|
{
|
|
"entropy": 5.183368492126465,
|
|
"epoch": 2.0446685878962536,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004583654828866021,
|
|
"loss": 4.836,
|
|
"mean_token_accuracy": 0.22027584314346313,
|
|
"num_tokens": 48811651.0,
|
|
"step": 21285
|
|
},
|
|
{
|
|
"entropy": 5.174183654785156,
|
|
"epoch": 2.0451488952929875,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004583456127367973,
|
|
"loss": 4.86,
|
|
"mean_token_accuracy": 0.22795891016721725,
|
|
"num_tokens": 48823561.0,
|
|
"step": 21290
|
|
},
|
|
{
|
|
"entropy": 5.126351070404053,
|
|
"epoch": 2.0456292026897214,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004583257383302519,
|
|
"loss": 4.8773,
|
|
"mean_token_accuracy": 0.2161845326423645,
|
|
"num_tokens": 48835385.0,
|
|
"step": 21295
|
|
},
|
|
{
|
|
"entropy": 5.1982903480529785,
|
|
"epoch": 2.0461095100864553,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004583058596674271,
|
|
"loss": 4.8693,
|
|
"mean_token_accuracy": 0.22223322540521623,
|
|
"num_tokens": 48846899.0,
|
|
"step": 21300
|
|
},
|
|
{
|
|
"entropy": 5.138012409210205,
|
|
"epoch": 2.046589817483189,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004582859767487846,
|
|
"loss": 4.8044,
|
|
"mean_token_accuracy": 0.22934290319681166,
|
|
"num_tokens": 48857850.0,
|
|
"step": 21305
|
|
},
|
|
{
|
|
"entropy": 5.0926210403442385,
|
|
"epoch": 2.047070124879923,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045826608957478604,
|
|
"loss": 4.8101,
|
|
"mean_token_accuracy": 0.226822829246521,
|
|
"num_tokens": 48869192.0,
|
|
"step": 21310
|
|
},
|
|
{
|
|
"entropy": 5.1192957878112795,
|
|
"epoch": 2.047550432276657,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00045824619814589297,
|
|
"loss": 4.782,
|
|
"mean_token_accuracy": 0.22894603312015532,
|
|
"num_tokens": 48881314.0,
|
|
"step": 21315
|
|
},
|
|
{
|
|
"entropy": 5.085238838195801,
|
|
"epoch": 2.0480307396733908,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004582263024625674,
|
|
"loss": 4.8282,
|
|
"mean_token_accuracy": 0.22456549853086472,
|
|
"num_tokens": 48893292.0,
|
|
"step": 21320
|
|
},
|
|
{
|
|
"entropy": 5.155279207229614,
|
|
"epoch": 2.048511047070125,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004582064025252711,
|
|
"loss": 4.8899,
|
|
"mean_token_accuracy": 0.22350900620222092,
|
|
"num_tokens": 48905607.0,
|
|
"step": 21325
|
|
},
|
|
{
|
|
"entropy": 5.1824195861816404,
|
|
"epoch": 2.048991354466859,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004581864983344661,
|
|
"loss": 4.7898,
|
|
"mean_token_accuracy": 0.2279469147324562,
|
|
"num_tokens": 48916194.0,
|
|
"step": 21330
|
|
},
|
|
{
|
|
"entropy": 5.151317453384399,
|
|
"epoch": 2.049471661863593,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004581665898906147,
|
|
"loss": 4.793,
|
|
"mean_token_accuracy": 0.22845213562250138,
|
|
"num_tokens": 48927560.0,
|
|
"step": 21335
|
|
},
|
|
{
|
|
"entropy": 5.038725471496582,
|
|
"epoch": 2.0499519692603267,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045814667719417887,
|
|
"loss": 4.7988,
|
|
"mean_token_accuracy": 0.23149679154157637,
|
|
"num_tokens": 48938284.0,
|
|
"step": 21340
|
|
},
|
|
{
|
|
"entropy": 5.110639238357544,
|
|
"epoch": 2.0504322766570606,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000458126760245621,
|
|
"loss": 4.789,
|
|
"mean_token_accuracy": 0.22833970189094543,
|
|
"num_tokens": 48948325.0,
|
|
"step": 21345
|
|
},
|
|
{
|
|
"entropy": 5.164987468719483,
|
|
"epoch": 2.0509125840537945,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004581068390454036,
|
|
"loss": 4.79,
|
|
"mean_token_accuracy": 0.23049985021352767,
|
|
"num_tokens": 48960020.0,
|
|
"step": 21350
|
|
},
|
|
{
|
|
"entropy": 5.129503297805786,
|
|
"epoch": 2.0513928914505284,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045808691359398905,
|
|
"loss": 4.7458,
|
|
"mean_token_accuracy": 0.2291984051465988,
|
|
"num_tokens": 48970833.0,
|
|
"step": 21355
|
|
},
|
|
{
|
|
"entropy": 5.103326511383057,
|
|
"epoch": 2.0518731988472623,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004580669838918401,
|
|
"loss": 4.8077,
|
|
"mean_token_accuracy": 0.23423267751932145,
|
|
"num_tokens": 48981821.0,
|
|
"step": 21360
|
|
},
|
|
{
|
|
"entropy": 5.16456298828125,
|
|
"epoch": 2.052353506243996,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045804704993941935,
|
|
"loss": 4.8188,
|
|
"mean_token_accuracy": 0.22716680616140367,
|
|
"num_tokens": 48992341.0,
|
|
"step": 21365
|
|
},
|
|
{
|
|
"entropy": 5.176882934570313,
|
|
"epoch": 2.05283381364073,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00045802711173718966,
|
|
"loss": 4.8875,
|
|
"mean_token_accuracy": 0.22166429013013839,
|
|
"num_tokens": 49003063.0,
|
|
"step": 21370
|
|
},
|
|
{
|
|
"entropy": 5.161245679855346,
|
|
"epoch": 2.053314121037464,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004580071692856138,
|
|
"loss": 4.8891,
|
|
"mean_token_accuracy": 0.2255357474088669,
|
|
"num_tokens": 49015353.0,
|
|
"step": 21375
|
|
},
|
|
{
|
|
"entropy": 5.0843805313110355,
|
|
"epoch": 2.053794428434198,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045798722258515504,
|
|
"loss": 4.7591,
|
|
"mean_token_accuracy": 0.23294135332107543,
|
|
"num_tokens": 49026405.0,
|
|
"step": 21380
|
|
},
|
|
{
|
|
"entropy": 5.180259132385254,
|
|
"epoch": 2.0542747358309317,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045796727163627623,
|
|
"loss": 4.8378,
|
|
"mean_token_accuracy": 0.22284193336963654,
|
|
"num_tokens": 49038433.0,
|
|
"step": 21385
|
|
},
|
|
{
|
|
"entropy": 5.066757488250732,
|
|
"epoch": 2.0547550432276656,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004579473164394408,
|
|
"loss": 4.7192,
|
|
"mean_token_accuracy": 0.2332998186349869,
|
|
"num_tokens": 49049463.0,
|
|
"step": 21390
|
|
},
|
|
{
|
|
"entropy": 5.036176252365112,
|
|
"epoch": 2.0552353506243994,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045792735699511176,
|
|
"loss": 4.7009,
|
|
"mean_token_accuracy": 0.2337260901927948,
|
|
"num_tokens": 49061607.0,
|
|
"step": 21395
|
|
},
|
|
{
|
|
"entropy": 5.158068370819092,
|
|
"epoch": 2.0557156580211333,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045790739330375276,
|
|
"loss": 4.8613,
|
|
"mean_token_accuracy": 0.2220068097114563,
|
|
"num_tokens": 49074687.0,
|
|
"step": 21400
|
|
},
|
|
{
|
|
"entropy": 5.102404451370239,
|
|
"epoch": 2.0561959654178676,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045788742536582717,
|
|
"loss": 4.7332,
|
|
"mean_token_accuracy": 0.23337887227535248,
|
|
"num_tokens": 49086151.0,
|
|
"step": 21405
|
|
},
|
|
{
|
|
"entropy": 5.126197957992554,
|
|
"epoch": 2.0566762728146015,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00045786745318179866,
|
|
"loss": 4.8397,
|
|
"mean_token_accuracy": 0.2232088029384613,
|
|
"num_tokens": 49097418.0,
|
|
"step": 21410
|
|
},
|
|
{
|
|
"entropy": 5.067611217498779,
|
|
"epoch": 2.0571565802113354,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004578474767521309,
|
|
"loss": 4.7617,
|
|
"mean_token_accuracy": 0.23839059025049208,
|
|
"num_tokens": 49108570.0,
|
|
"step": 21415
|
|
},
|
|
{
|
|
"entropy": 5.0432921886444095,
|
|
"epoch": 2.0576368876080693,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00045782749607728765,
|
|
"loss": 4.8426,
|
|
"mean_token_accuracy": 0.22151308357715607,
|
|
"num_tokens": 49120906.0,
|
|
"step": 21420
|
|
},
|
|
{
|
|
"entropy": 5.13737416267395,
|
|
"epoch": 2.058117195004803,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045780751115773286,
|
|
"loss": 4.7463,
|
|
"mean_token_accuracy": 0.2359360083937645,
|
|
"num_tokens": 49132040.0,
|
|
"step": 21425
|
|
},
|
|
{
|
|
"entropy": 5.133153438568115,
|
|
"epoch": 2.058597502401537,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004577875219939304,
|
|
"loss": 4.8587,
|
|
"mean_token_accuracy": 0.23113487362861634,
|
|
"num_tokens": 49143427.0,
|
|
"step": 21430
|
|
},
|
|
{
|
|
"entropy": 5.129691934585571,
|
|
"epoch": 2.059077809798271,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004577675285863446,
|
|
"loss": 4.8194,
|
|
"mean_token_accuracy": 0.22555259466171265,
|
|
"num_tokens": 49155351.0,
|
|
"step": 21435
|
|
},
|
|
{
|
|
"entropy": 5.158038377761841,
|
|
"epoch": 2.059558117195005,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045774753093543943,
|
|
"loss": 4.8271,
|
|
"mean_token_accuracy": 0.22593292891979216,
|
|
"num_tokens": 49167898.0,
|
|
"step": 21440
|
|
},
|
|
{
|
|
"entropy": 5.149058246612549,
|
|
"epoch": 2.0600384245917387,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004577275290416791,
|
|
"loss": 4.8146,
|
|
"mean_token_accuracy": 0.22242112308740616,
|
|
"num_tokens": 49179209.0,
|
|
"step": 21445
|
|
},
|
|
{
|
|
"entropy": 5.105331230163574,
|
|
"epoch": 2.0605187319884726,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004577075229055283,
|
|
"loss": 4.7651,
|
|
"mean_token_accuracy": 0.2339042067527771,
|
|
"num_tokens": 49190382.0,
|
|
"step": 21450
|
|
},
|
|
{
|
|
"entropy": 5.0633519172668455,
|
|
"epoch": 2.0609990393852065,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00045768751252745133,
|
|
"loss": 4.8038,
|
|
"mean_token_accuracy": 0.23498952239751816,
|
|
"num_tokens": 49203511.0,
|
|
"step": 21455
|
|
},
|
|
{
|
|
"entropy": 5.231168937683106,
|
|
"epoch": 2.0614793467819403,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00045766749790791274,
|
|
"loss": 4.8476,
|
|
"mean_token_accuracy": 0.22552503943443297,
|
|
"num_tokens": 49214276.0,
|
|
"step": 21460
|
|
},
|
|
{
|
|
"entropy": 5.007623672485352,
|
|
"epoch": 2.061959654178674,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004576474790473773,
|
|
"loss": 4.7003,
|
|
"mean_token_accuracy": 0.2353790283203125,
|
|
"num_tokens": 49225803.0,
|
|
"step": 21465
|
|
},
|
|
{
|
|
"entropy": 5.100078535079956,
|
|
"epoch": 2.062439961575408,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045762745594630973,
|
|
"loss": 4.8239,
|
|
"mean_token_accuracy": 0.22536432445049287,
|
|
"num_tokens": 49237187.0,
|
|
"step": 21470
|
|
},
|
|
{
|
|
"entropy": 5.23383059501648,
|
|
"epoch": 2.062920268972142,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004576074286051749,
|
|
"loss": 4.8905,
|
|
"mean_token_accuracy": 0.22259025722742082,
|
|
"num_tokens": 49250151.0,
|
|
"step": 21475
|
|
},
|
|
{
|
|
"entropy": 5.116559600830078,
|
|
"epoch": 2.063400576368876,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00045758739702443787,
|
|
"loss": 4.7177,
|
|
"mean_token_accuracy": 0.24015939831733704,
|
|
"num_tokens": 49261970.0,
|
|
"step": 21480
|
|
},
|
|
{
|
|
"entropy": 5.085567474365234,
|
|
"epoch": 2.06388088376561,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004575673612045636,
|
|
"loss": 4.7807,
|
|
"mean_token_accuracy": 0.23618687838315963,
|
|
"num_tokens": 49271427.0,
|
|
"step": 21485
|
|
},
|
|
{
|
|
"entropy": 5.068835973739624,
|
|
"epoch": 2.064361191162344,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004575473211460173,
|
|
"loss": 4.8175,
|
|
"mean_token_accuracy": 0.22350717782974244,
|
|
"num_tokens": 49285102.0,
|
|
"step": 21490
|
|
},
|
|
{
|
|
"entropy": 5.153846311569214,
|
|
"epoch": 2.064841498559078,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004575272768492644,
|
|
"loss": 4.815,
|
|
"mean_token_accuracy": 0.2282892346382141,
|
|
"num_tokens": 49297569.0,
|
|
"step": 21495
|
|
},
|
|
{
|
|
"entropy": 5.153155851364136,
|
|
"epoch": 2.065321805955812,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045750722831476993,
|
|
"loss": 4.7356,
|
|
"mean_token_accuracy": 0.2330961436033249,
|
|
"num_tokens": 49308795.0,
|
|
"step": 21500
|
|
},
|
|
{
|
|
"entropy": 5.150883054733276,
|
|
"epoch": 2.0658021133525457,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00045748717554299964,
|
|
"loss": 4.8588,
|
|
"mean_token_accuracy": 0.22739875316619873,
|
|
"num_tokens": 49320971.0,
|
|
"step": 21505
|
|
},
|
|
{
|
|
"entropy": 5.160495758056641,
|
|
"epoch": 2.0662824207492796,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000457467118534419,
|
|
"loss": 4.9338,
|
|
"mean_token_accuracy": 0.21607262045145034,
|
|
"num_tokens": 49333624.0,
|
|
"step": 21510
|
|
},
|
|
{
|
|
"entropy": 5.244060420989991,
|
|
"epoch": 2.0667627281460135,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004574470572894938,
|
|
"loss": 4.9336,
|
|
"mean_token_accuracy": 0.21895478069782257,
|
|
"num_tokens": 49344863.0,
|
|
"step": 21515
|
|
},
|
|
{
|
|
"entropy": 5.147519111633301,
|
|
"epoch": 2.0672430355427474,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004574269918086895,
|
|
"loss": 4.7947,
|
|
"mean_token_accuracy": 0.22707038521766662,
|
|
"num_tokens": 49355501.0,
|
|
"step": 21520
|
|
},
|
|
{
|
|
"entropy": 5.124402475357056,
|
|
"epoch": 2.0677233429394812,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004574069220924722,
|
|
"loss": 4.8635,
|
|
"mean_token_accuracy": 0.23370101898908616,
|
|
"num_tokens": 49366384.0,
|
|
"step": 21525
|
|
},
|
|
{
|
|
"entropy": 5.225724935531616,
|
|
"epoch": 2.068203650336215,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004573868481413079,
|
|
"loss": 4.9503,
|
|
"mean_token_accuracy": 0.21145387589931489,
|
|
"num_tokens": 49378670.0,
|
|
"step": 21530
|
|
},
|
|
{
|
|
"entropy": 5.120734691619873,
|
|
"epoch": 2.068683957732949,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00045736676995566244,
|
|
"loss": 4.7891,
|
|
"mean_token_accuracy": 0.22941143959760665,
|
|
"num_tokens": 49390273.0,
|
|
"step": 21535
|
|
},
|
|
{
|
|
"entropy": 5.159012079238892,
|
|
"epoch": 2.069164265129683,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045734668753600217,
|
|
"loss": 4.8598,
|
|
"mean_token_accuracy": 0.22018487006425858,
|
|
"num_tokens": 49402503.0,
|
|
"step": 21540
|
|
},
|
|
{
|
|
"entropy": 5.173748779296875,
|
|
"epoch": 2.0696445725264168,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045732660088279326,
|
|
"loss": 4.7962,
|
|
"mean_token_accuracy": 0.2246626928448677,
|
|
"num_tokens": 49413948.0,
|
|
"step": 21545
|
|
},
|
|
{
|
|
"entropy": 5.095520544052124,
|
|
"epoch": 2.0701248799231506,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045730650999650216,
|
|
"loss": 4.7589,
|
|
"mean_token_accuracy": 0.23621760606765746,
|
|
"num_tokens": 49424164.0,
|
|
"step": 21550
|
|
},
|
|
{
|
|
"entropy": 5.136573696136475,
|
|
"epoch": 2.0706051873198845,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00045728641487759506,
|
|
"loss": 4.9694,
|
|
"mean_token_accuracy": 0.21803467869758605,
|
|
"num_tokens": 49435102.0,
|
|
"step": 21555
|
|
},
|
|
{
|
|
"entropy": 5.109771871566773,
|
|
"epoch": 2.071085494716619,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004572663155265388,
|
|
"loss": 4.7947,
|
|
"mean_token_accuracy": 0.23301592767238616,
|
|
"num_tokens": 49445445.0,
|
|
"step": 21560
|
|
},
|
|
{
|
|
"entropy": 5.155466461181641,
|
|
"epoch": 2.0715658021133527,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004572462119437999,
|
|
"loss": 4.7728,
|
|
"mean_token_accuracy": 0.229949714243412,
|
|
"num_tokens": 49457578.0,
|
|
"step": 21565
|
|
},
|
|
{
|
|
"entropy": 5.060719394683838,
|
|
"epoch": 2.0720461095100866,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00045722610412984513,
|
|
"loss": 4.7735,
|
|
"mean_token_accuracy": 0.2294953465461731,
|
|
"num_tokens": 49470253.0,
|
|
"step": 21570
|
|
},
|
|
{
|
|
"entropy": 5.164776754379273,
|
|
"epoch": 2.0725264169068205,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004572059920851412,
|
|
"loss": 4.847,
|
|
"mean_token_accuracy": 0.22464604824781417,
|
|
"num_tokens": 49481854.0,
|
|
"step": 21575
|
|
},
|
|
{
|
|
"entropy": 5.15602068901062,
|
|
"epoch": 2.0730067243035544,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045718587581015534,
|
|
"loss": 4.8702,
|
|
"mean_token_accuracy": 0.2171690970659256,
|
|
"num_tokens": 49494311.0,
|
|
"step": 21580
|
|
},
|
|
{
|
|
"entropy": 5.178106212615967,
|
|
"epoch": 2.0734870317002883,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004571657553053544,
|
|
"loss": 4.8043,
|
|
"mean_token_accuracy": 0.23305046260356904,
|
|
"num_tokens": 49506186.0,
|
|
"step": 21585
|
|
},
|
|
{
|
|
"entropy": 5.129500436782837,
|
|
"epoch": 2.073967339097022,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004571456305712055,
|
|
"loss": 4.7951,
|
|
"mean_token_accuracy": 0.22732842117547988,
|
|
"num_tokens": 49517841.0,
|
|
"step": 21590
|
|
},
|
|
{
|
|
"entropy": 5.098537969589233,
|
|
"epoch": 2.074447646493756,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004571255016081759,
|
|
"loss": 4.7894,
|
|
"mean_token_accuracy": 0.2313293009996414,
|
|
"num_tokens": 49529260.0,
|
|
"step": 21595
|
|
},
|
|
{
|
|
"entropy": 5.134753608703614,
|
|
"epoch": 2.07492795389049,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000457105368416733,
|
|
"loss": 4.7866,
|
|
"mean_token_accuracy": 0.23531247079372405,
|
|
"num_tokens": 49540892.0,
|
|
"step": 21600
|
|
},
|
|
{
|
|
"entropy": 5.188248872756958,
|
|
"epoch": 2.0754082612872238,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00045708523099734417,
|
|
"loss": 4.8153,
|
|
"mean_token_accuracy": 0.22493548393249513,
|
|
"num_tokens": 49550924.0,
|
|
"step": 21605
|
|
},
|
|
{
|
|
"entropy": 5.116950845718383,
|
|
"epoch": 2.0758885686839577,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045706508935047693,
|
|
"loss": 4.8371,
|
|
"mean_token_accuracy": 0.23132913410663605,
|
|
"num_tokens": 49562424.0,
|
|
"step": 21610
|
|
},
|
|
{
|
|
"entropy": 5.108313131332397,
|
|
"epoch": 2.0763688760806915,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000457044943476599,
|
|
"loss": 4.7706,
|
|
"mean_token_accuracy": 0.23251599222421646,
|
|
"num_tokens": 49573341.0,
|
|
"step": 21615
|
|
},
|
|
{
|
|
"entropy": 5.13428955078125,
|
|
"epoch": 2.0768491834774254,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00045702479337617795,
|
|
"loss": 4.7639,
|
|
"mean_token_accuracy": 0.22919657826423645,
|
|
"num_tokens": 49584047.0,
|
|
"step": 21620
|
|
},
|
|
{
|
|
"entropy": 5.056707668304443,
|
|
"epoch": 2.0773294908741593,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004570046390496818,
|
|
"loss": 4.7705,
|
|
"mean_token_accuracy": 0.22819400131702422,
|
|
"num_tokens": 49595898.0,
|
|
"step": 21625
|
|
},
|
|
{
|
|
"entropy": 5.098710060119629,
|
|
"epoch": 2.077809798270893,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004569844804975783,
|
|
"loss": 4.8347,
|
|
"mean_token_accuracy": 0.22691741287708284,
|
|
"num_tokens": 49607865.0,
|
|
"step": 21630
|
|
},
|
|
{
|
|
"entropy": 5.2283299446105955,
|
|
"epoch": 2.0782901056676275,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004569643177203356,
|
|
"loss": 4.8626,
|
|
"mean_token_accuracy": 0.22496672421693803,
|
|
"num_tokens": 49619513.0,
|
|
"step": 21635
|
|
},
|
|
{
|
|
"entropy": 5.142520713806152,
|
|
"epoch": 2.0787704130643614,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004569441507184218,
|
|
"loss": 4.7594,
|
|
"mean_token_accuracy": 0.23643842935562134,
|
|
"num_tokens": 49631232.0,
|
|
"step": 21640
|
|
},
|
|
{
|
|
"entropy": 5.1284263134002686,
|
|
"epoch": 2.0792507204610953,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045692397949230495,
|
|
"loss": 4.7922,
|
|
"mean_token_accuracy": 0.22983661592006682,
|
|
"num_tokens": 49642653.0,
|
|
"step": 21645
|
|
},
|
|
{
|
|
"entropy": 5.0891499519348145,
|
|
"epoch": 2.079731027857829,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00045690380404245364,
|
|
"loss": 4.746,
|
|
"mean_token_accuracy": 0.2293478086590767,
|
|
"num_tokens": 49654479.0,
|
|
"step": 21650
|
|
},
|
|
{
|
|
"entropy": 5.111334705352784,
|
|
"epoch": 2.080211335254563,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00045688362436933607,
|
|
"loss": 4.8206,
|
|
"mean_token_accuracy": 0.2245978146791458,
|
|
"num_tokens": 49666461.0,
|
|
"step": 21655
|
|
},
|
|
{
|
|
"entropy": 5.1269042015075685,
|
|
"epoch": 2.080691642651297,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004568634404734208,
|
|
"loss": 4.8667,
|
|
"mean_token_accuracy": 0.23144145607948302,
|
|
"num_tokens": 49676602.0,
|
|
"step": 21660
|
|
},
|
|
{
|
|
"entropy": 5.069075059890747,
|
|
"epoch": 2.081171950048031,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004568432523551765,
|
|
"loss": 4.7765,
|
|
"mean_token_accuracy": 0.2366631269454956,
|
|
"num_tokens": 49689225.0,
|
|
"step": 21665
|
|
},
|
|
{
|
|
"entropy": 5.090816020965576,
|
|
"epoch": 2.0816522574447647,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004568230600150718,
|
|
"loss": 4.8148,
|
|
"mean_token_accuracy": 0.22771646976470947,
|
|
"num_tokens": 49700827.0,
|
|
"step": 21670
|
|
},
|
|
{
|
|
"entropy": 5.17385630607605,
|
|
"epoch": 2.0821325648414986,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004568028634535757,
|
|
"loss": 4.8407,
|
|
"mean_token_accuracy": 0.23026363849639891,
|
|
"num_tokens": 49712777.0,
|
|
"step": 21675
|
|
},
|
|
{
|
|
"entropy": 5.157884550094605,
|
|
"epoch": 2.0826128722382324,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004567826626711568,
|
|
"loss": 4.825,
|
|
"mean_token_accuracy": 0.23216718733310698,
|
|
"num_tokens": 49725234.0,
|
|
"step": 21680
|
|
},
|
|
{
|
|
"entropy": 5.0832091808319095,
|
|
"epoch": 2.0830931796349663,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004567624576682843,
|
|
"loss": 4.7638,
|
|
"mean_token_accuracy": 0.23463573008775712,
|
|
"num_tokens": 49736933.0,
|
|
"step": 21685
|
|
},
|
|
{
|
|
"entropy": 5.1686598777771,
|
|
"epoch": 2.0835734870317,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004567422484454273,
|
|
"loss": 4.8665,
|
|
"mean_token_accuracy": 0.22514163851737976,
|
|
"num_tokens": 49748279.0,
|
|
"step": 21690
|
|
},
|
|
{
|
|
"entropy": 5.040525722503662,
|
|
"epoch": 2.084053794428434,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00045672203500305493,
|
|
"loss": 4.7803,
|
|
"mean_token_accuracy": 0.23218834549188613,
|
|
"num_tokens": 49760874.0,
|
|
"step": 21695
|
|
},
|
|
{
|
|
"entropy": 5.063557481765747,
|
|
"epoch": 2.084534101825168,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045670181734163654,
|
|
"loss": 4.7046,
|
|
"mean_token_accuracy": 0.2362649843096733,
|
|
"num_tokens": 49772520.0,
|
|
"step": 21700
|
|
},
|
|
{
|
|
"entropy": 5.07045087814331,
|
|
"epoch": 2.085014409221902,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004566815954616414,
|
|
"loss": 4.7643,
|
|
"mean_token_accuracy": 0.2305879309773445,
|
|
"num_tokens": 49783019.0,
|
|
"step": 21705
|
|
},
|
|
{
|
|
"entropy": 5.038512563705444,
|
|
"epoch": 2.0854947166186357,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00045666136936353913,
|
|
"loss": 4.7758,
|
|
"mean_token_accuracy": 0.22893125116825103,
|
|
"num_tokens": 49795111.0,
|
|
"step": 21710
|
|
},
|
|
{
|
|
"entropy": 5.105225610733032,
|
|
"epoch": 2.08597502401537,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004566411390477993,
|
|
"loss": 4.7396,
|
|
"mean_token_accuracy": 0.2333238035440445,
|
|
"num_tokens": 49807022.0,
|
|
"step": 21715
|
|
},
|
|
{
|
|
"entropy": 5.177683877944946,
|
|
"epoch": 2.086455331412104,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00045662090451489156,
|
|
"loss": 4.838,
|
|
"mean_token_accuracy": 0.2248334839940071,
|
|
"num_tokens": 49819225.0,
|
|
"step": 21720
|
|
},
|
|
{
|
|
"entropy": 5.13505277633667,
|
|
"epoch": 2.086935638808838,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045660066576528577,
|
|
"loss": 4.7901,
|
|
"mean_token_accuracy": 0.2331462487578392,
|
|
"num_tokens": 49830989.0,
|
|
"step": 21725
|
|
},
|
|
{
|
|
"entropy": 5.134175109863281,
|
|
"epoch": 2.0874159462055717,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004565804227994518,
|
|
"loss": 4.7844,
|
|
"mean_token_accuracy": 0.23327937126159667,
|
|
"num_tokens": 49841618.0,
|
|
"step": 21730
|
|
},
|
|
{
|
|
"entropy": 5.0149389743804935,
|
|
"epoch": 2.0878962536023056,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004565601756178595,
|
|
"loss": 4.6533,
|
|
"mean_token_accuracy": 0.23993728905916215,
|
|
"num_tokens": 49851985.0,
|
|
"step": 21735
|
|
},
|
|
{
|
|
"entropy": 5.093489313125611,
|
|
"epoch": 2.0883765609990395,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004565399242209791,
|
|
"loss": 4.866,
|
|
"mean_token_accuracy": 0.22467585802078247,
|
|
"num_tokens": 49863383.0,
|
|
"step": 21740
|
|
},
|
|
{
|
|
"entropy": 5.182189702987671,
|
|
"epoch": 2.0888568683957733,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004565196686092807,
|
|
"loss": 4.8218,
|
|
"mean_token_accuracy": 0.23218757808208465,
|
|
"num_tokens": 49875757.0,
|
|
"step": 21745
|
|
},
|
|
{
|
|
"entropy": 5.112382221221924,
|
|
"epoch": 2.089337175792507,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004564994087832346,
|
|
"loss": 4.7662,
|
|
"mean_token_accuracy": 0.23386679738759994,
|
|
"num_tokens": 49888419.0,
|
|
"step": 21750
|
|
},
|
|
{
|
|
"entropy": 5.132735776901245,
|
|
"epoch": 2.089817483189241,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00045647914474331123,
|
|
"loss": 4.8519,
|
|
"mean_token_accuracy": 0.22820329815149307,
|
|
"num_tokens": 49899629.0,
|
|
"step": 21755
|
|
},
|
|
{
|
|
"entropy": 5.226576328277588,
|
|
"epoch": 2.090297790585975,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00045645887648998094,
|
|
"loss": 4.8461,
|
|
"mean_token_accuracy": 0.22697775065898895,
|
|
"num_tokens": 49909858.0,
|
|
"step": 21760
|
|
},
|
|
{
|
|
"entropy": 5.0463744640350345,
|
|
"epoch": 2.090778097982709,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045643860402371433,
|
|
"loss": 4.809,
|
|
"mean_token_accuracy": 0.2285969987511635,
|
|
"num_tokens": 49921305.0,
|
|
"step": 21765
|
|
},
|
|
{
|
|
"entropy": 5.13976731300354,
|
|
"epoch": 2.0912584053794427,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004564183273449821,
|
|
"loss": 4.8182,
|
|
"mean_token_accuracy": 0.22704905718564988,
|
|
"num_tokens": 49931966.0,
|
|
"step": 21770
|
|
},
|
|
{
|
|
"entropy": 5.118189096450806,
|
|
"epoch": 2.0917387127761766,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004563980464542551,
|
|
"loss": 4.7414,
|
|
"mean_token_accuracy": 0.22773682326078415,
|
|
"num_tokens": 49943781.0,
|
|
"step": 21775
|
|
},
|
|
{
|
|
"entropy": 5.141473865509033,
|
|
"epoch": 2.0922190201729105,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045637776135200406,
|
|
"loss": 4.7968,
|
|
"mean_token_accuracy": 0.23308294266462326,
|
|
"num_tokens": 49954149.0,
|
|
"step": 21780
|
|
},
|
|
{
|
|
"entropy": 5.181213665008545,
|
|
"epoch": 2.0926993275696444,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004563574720386999,
|
|
"loss": 4.8447,
|
|
"mean_token_accuracy": 0.2268218591809273,
|
|
"num_tokens": 49965542.0,
|
|
"step": 21785
|
|
},
|
|
{
|
|
"entropy": 5.112164497375488,
|
|
"epoch": 2.0931796349663783,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004563371785148139,
|
|
"loss": 4.7811,
|
|
"mean_token_accuracy": 0.225405690073967,
|
|
"num_tokens": 49977444.0,
|
|
"step": 21790
|
|
},
|
|
{
|
|
"entropy": 5.08275146484375,
|
|
"epoch": 2.0936599423631126,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045631688078081695,
|
|
"loss": 4.7678,
|
|
"mean_token_accuracy": 0.22847483456134796,
|
|
"num_tokens": 49988409.0,
|
|
"step": 21795
|
|
},
|
|
{
|
|
"entropy": 5.13219313621521,
|
|
"epoch": 2.0941402497598465,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004562965788371805,
|
|
"loss": 4.8285,
|
|
"mean_token_accuracy": 0.22423352152109147,
|
|
"num_tokens": 49999639.0,
|
|
"step": 21800
|
|
},
|
|
{
|
|
"entropy": 5.11357626914978,
|
|
"epoch": 2.0946205571565804,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004562762726843758,
|
|
"loss": 4.7884,
|
|
"mean_token_accuracy": 0.22604774087667465,
|
|
"num_tokens": 50010964.0,
|
|
"step": 21805
|
|
},
|
|
{
|
|
"entropy": 5.1882233142852785,
|
|
"epoch": 2.0951008645533142,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045625596232287436,
|
|
"loss": 4.9171,
|
|
"mean_token_accuracy": 0.21435359567403794,
|
|
"num_tokens": 50024487.0,
|
|
"step": 21810
|
|
},
|
|
{
|
|
"entropy": 5.275296449661255,
|
|
"epoch": 2.095581171950048,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004562356477531477,
|
|
"loss": 4.9596,
|
|
"mean_token_accuracy": 0.21690075546503068,
|
|
"num_tokens": 50037202.0,
|
|
"step": 21815
|
|
},
|
|
{
|
|
"entropy": 5.1064393520355225,
|
|
"epoch": 2.096061479346782,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004562153289756674,
|
|
"loss": 4.7115,
|
|
"mean_token_accuracy": 0.23600296229124068,
|
|
"num_tokens": 50048520.0,
|
|
"step": 21820
|
|
},
|
|
{
|
|
"entropy": 5.138644599914551,
|
|
"epoch": 2.096541786743516,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004561950059909053,
|
|
"loss": 4.8322,
|
|
"mean_token_accuracy": 0.22320448607206345,
|
|
"num_tokens": 50058523.0,
|
|
"step": 21825
|
|
},
|
|
{
|
|
"entropy": 5.159434032440186,
|
|
"epoch": 2.0970220941402498,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004561746787993332,
|
|
"loss": 4.8459,
|
|
"mean_token_accuracy": 0.22186700254678726,
|
|
"num_tokens": 50070241.0,
|
|
"step": 21830
|
|
},
|
|
{
|
|
"entropy": 5.162479114532471,
|
|
"epoch": 2.0975024015369836,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045615434740142307,
|
|
"loss": 4.8536,
|
|
"mean_token_accuracy": 0.22856236398220062,
|
|
"num_tokens": 50081326.0,
|
|
"step": 21835
|
|
},
|
|
{
|
|
"entropy": 5.051312732696533,
|
|
"epoch": 2.0979827089337175,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045613401179764686,
|
|
"loss": 4.7083,
|
|
"mean_token_accuracy": 0.23097764253616332,
|
|
"num_tokens": 50091767.0,
|
|
"step": 21840
|
|
},
|
|
{
|
|
"entropy": 5.1112017154693605,
|
|
"epoch": 2.0984630163304514,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045611367198847676,
|
|
"loss": 4.8492,
|
|
"mean_token_accuracy": 0.22863307744264602,
|
|
"num_tokens": 50103435.0,
|
|
"step": 21845
|
|
},
|
|
{
|
|
"entropy": 5.160291004180908,
|
|
"epoch": 2.0989433237271853,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000456093327974385,
|
|
"loss": 4.8023,
|
|
"mean_token_accuracy": 0.22593716233968736,
|
|
"num_tokens": 50114127.0,
|
|
"step": 21850
|
|
},
|
|
{
|
|
"entropy": 5.141428422927857,
|
|
"epoch": 2.099423631123919,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004560729797558438,
|
|
"loss": 4.8195,
|
|
"mean_token_accuracy": 0.23141436874866486,
|
|
"num_tokens": 50126328.0,
|
|
"step": 21855
|
|
},
|
|
{
|
|
"entropy": 5.143086194992065,
|
|
"epoch": 2.099903938520653,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004560526273333259,
|
|
"loss": 4.7925,
|
|
"mean_token_accuracy": 0.2326791599392891,
|
|
"num_tokens": 50137600.0,
|
|
"step": 21860
|
|
},
|
|
{
|
|
"entropy": 5.146692323684692,
|
|
"epoch": 2.100384245917387,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045603227070730346,
|
|
"loss": 4.861,
|
|
"mean_token_accuracy": 0.222840116918087,
|
|
"num_tokens": 50148711.0,
|
|
"step": 21865
|
|
},
|
|
{
|
|
"entropy": 5.104106807708741,
|
|
"epoch": 2.1008645533141213,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00045601190987824933,
|
|
"loss": 4.8259,
|
|
"mean_token_accuracy": 0.21673232913017274,
|
|
"num_tokens": 50161141.0,
|
|
"step": 21870
|
|
},
|
|
{
|
|
"entropy": 5.149733543395996,
|
|
"epoch": 2.101344860710855,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045599154484663606,
|
|
"loss": 4.8356,
|
|
"mean_token_accuracy": 0.22281887978315354,
|
|
"num_tokens": 50173145.0,
|
|
"step": 21875
|
|
},
|
|
{
|
|
"entropy": 5.15035605430603,
|
|
"epoch": 2.101825168107589,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045597117561293663,
|
|
"loss": 4.8121,
|
|
"mean_token_accuracy": 0.22988341897726058,
|
|
"num_tokens": 50184074.0,
|
|
"step": 21880
|
|
},
|
|
{
|
|
"entropy": 5.278981351852417,
|
|
"epoch": 2.102305475504323,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004559508021776238,
|
|
"loss": 5.0788,
|
|
"mean_token_accuracy": 0.20862277299165727,
|
|
"num_tokens": 50195004.0,
|
|
"step": 21885
|
|
},
|
|
{
|
|
"entropy": 5.090446996688843,
|
|
"epoch": 2.1027857829010568,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004559304245411707,
|
|
"loss": 4.7646,
|
|
"mean_token_accuracy": 0.23295176327228545,
|
|
"num_tokens": 50205329.0,
|
|
"step": 21890
|
|
},
|
|
{
|
|
"entropy": 5.181223297119141,
|
|
"epoch": 2.1032660902977907,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00045591004270405044,
|
|
"loss": 4.8294,
|
|
"mean_token_accuracy": 0.22295380681753157,
|
|
"num_tokens": 50218346.0,
|
|
"step": 21895
|
|
},
|
|
{
|
|
"entropy": 5.123899793624878,
|
|
"epoch": 2.1037463976945245,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004558896566667361,
|
|
"loss": 4.7986,
|
|
"mean_token_accuracy": 0.23100828528404235,
|
|
"num_tokens": 50229790.0,
|
|
"step": 21900
|
|
},
|
|
{
|
|
"entropy": 5.232024002075195,
|
|
"epoch": 2.1042267050912584,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045586926642970113,
|
|
"loss": 4.9623,
|
|
"mean_token_accuracy": 0.21864116042852402,
|
|
"num_tokens": 50240547.0,
|
|
"step": 21905
|
|
},
|
|
{
|
|
"entropy": 5.128908777236939,
|
|
"epoch": 2.1047070124879923,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004558488719934188,
|
|
"loss": 4.7698,
|
|
"mean_token_accuracy": 0.23230497986078263,
|
|
"num_tokens": 50251889.0,
|
|
"step": 21910
|
|
},
|
|
{
|
|
"entropy": 5.117974853515625,
|
|
"epoch": 2.105187319884726,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004558284733583627,
|
|
"loss": 4.8416,
|
|
"mean_token_accuracy": 0.2239149734377861,
|
|
"num_tokens": 50263599.0,
|
|
"step": 21915
|
|
},
|
|
{
|
|
"entropy": 5.144348382949829,
|
|
"epoch": 2.10566762728146,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045580807052500645,
|
|
"loss": 4.8838,
|
|
"mean_token_accuracy": 0.22369515597820283,
|
|
"num_tokens": 50275645.0,
|
|
"step": 21920
|
|
},
|
|
{
|
|
"entropy": 5.09762921333313,
|
|
"epoch": 2.106147934678194,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004557876634938236,
|
|
"loss": 4.7573,
|
|
"mean_token_accuracy": 0.23217075318098068,
|
|
"num_tokens": 50287640.0,
|
|
"step": 21925
|
|
},
|
|
{
|
|
"entropy": 5.126234912872315,
|
|
"epoch": 2.106628242074928,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004557672522652881,
|
|
"loss": 4.8627,
|
|
"mean_token_accuracy": 0.22643891870975494,
|
|
"num_tokens": 50299119.0,
|
|
"step": 21930
|
|
},
|
|
{
|
|
"entropy": 5.120940732955932,
|
|
"epoch": 2.1071085494716617,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004557468368398738,
|
|
"loss": 4.8007,
|
|
"mean_token_accuracy": 0.22507584393024443,
|
|
"num_tokens": 50311377.0,
|
|
"step": 21935
|
|
},
|
|
{
|
|
"entropy": 5.157046747207642,
|
|
"epoch": 2.1075888568683956,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004557264172180546,
|
|
"loss": 4.8418,
|
|
"mean_token_accuracy": 0.2272740438580513,
|
|
"num_tokens": 50323185.0,
|
|
"step": 21940
|
|
},
|
|
{
|
|
"entropy": 5.1661652565002445,
|
|
"epoch": 2.1080691642651295,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004557059934003046,
|
|
"loss": 4.9028,
|
|
"mean_token_accuracy": 0.22802554219961166,
|
|
"num_tokens": 50333992.0,
|
|
"step": 21945
|
|
},
|
|
{
|
|
"entropy": 5.253039216995239,
|
|
"epoch": 2.108549471661864,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004556855653870981,
|
|
"loss": 4.8791,
|
|
"mean_token_accuracy": 0.21829527467489243,
|
|
"num_tokens": 50346087.0,
|
|
"step": 21950
|
|
},
|
|
{
|
|
"entropy": 5.206522464752197,
|
|
"epoch": 2.1090297790585977,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004556651331789092,
|
|
"loss": 4.879,
|
|
"mean_token_accuracy": 0.22323226183652878,
|
|
"num_tokens": 50358274.0,
|
|
"step": 21955
|
|
},
|
|
{
|
|
"entropy": 5.15128870010376,
|
|
"epoch": 2.1095100864553316,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004556446967762125,
|
|
"loss": 4.7805,
|
|
"mean_token_accuracy": 0.2334059163928032,
|
|
"num_tokens": 50367784.0,
|
|
"step": 21960
|
|
},
|
|
{
|
|
"entropy": 5.178620862960815,
|
|
"epoch": 2.1099903938520654,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045562425617948226,
|
|
"loss": 4.9087,
|
|
"mean_token_accuracy": 0.2221836417913437,
|
|
"num_tokens": 50377877.0,
|
|
"step": 21965
|
|
},
|
|
{
|
|
"entropy": 5.145533895492553,
|
|
"epoch": 2.1104707012487993,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045560381138919315,
|
|
"loss": 4.8704,
|
|
"mean_token_accuracy": 0.22770289629697799,
|
|
"num_tokens": 50391023.0,
|
|
"step": 21970
|
|
},
|
|
{
|
|
"entropy": 5.199602508544922,
|
|
"epoch": 2.110951008645533,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045558336240581984,
|
|
"loss": 4.852,
|
|
"mean_token_accuracy": 0.2230261117219925,
|
|
"num_tokens": 50404417.0,
|
|
"step": 21975
|
|
},
|
|
{
|
|
"entropy": 5.131056404113769,
|
|
"epoch": 2.111431316042267,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00045556290922983705,
|
|
"loss": 4.8492,
|
|
"mean_token_accuracy": 0.22247645556926726,
|
|
"num_tokens": 50416153.0,
|
|
"step": 21980
|
|
},
|
|
{
|
|
"entropy": 5.08565092086792,
|
|
"epoch": 2.111911623439001,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004555424518617197,
|
|
"loss": 4.8037,
|
|
"mean_token_accuracy": 0.23319306671619416,
|
|
"num_tokens": 50426860.0,
|
|
"step": 21985
|
|
},
|
|
{
|
|
"entropy": 5.109177160263061,
|
|
"epoch": 2.112391930835735,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00045552199030194274,
|
|
"loss": 4.7496,
|
|
"mean_token_accuracy": 0.2333011209964752,
|
|
"num_tokens": 50437262.0,
|
|
"step": 21990
|
|
},
|
|
{
|
|
"entropy": 5.113007116317749,
|
|
"epoch": 2.1128722382324687,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045550152455098113,
|
|
"loss": 4.7431,
|
|
"mean_token_accuracy": 0.23262507021427153,
|
|
"num_tokens": 50448564.0,
|
|
"step": 21995
|
|
},
|
|
{
|
|
"entropy": 5.070738649368286,
|
|
"epoch": 2.1133525456292026,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004554810546093102,
|
|
"loss": 4.7446,
|
|
"mean_token_accuracy": 0.2425445109605789,
|
|
"num_tokens": 50459416.0,
|
|
"step": 22000
|
|
},
|
|
{
|
|
"entropy": 5.15314302444458,
|
|
"epoch": 2.1138328530259365,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000455460580477405,
|
|
"loss": 4.8402,
|
|
"mean_token_accuracy": 0.2306630253791809,
|
|
"num_tokens": 50470934.0,
|
|
"step": 22005
|
|
},
|
|
{
|
|
"entropy": 5.205999040603638,
|
|
"epoch": 2.1143131604226704,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000455440102155741,
|
|
"loss": 4.8454,
|
|
"mean_token_accuracy": 0.22617195397615433,
|
|
"num_tokens": 50483050.0,
|
|
"step": 22010
|
|
},
|
|
{
|
|
"entropy": 5.158279943466186,
|
|
"epoch": 2.1147934678194042,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004554196196447937,
|
|
"loss": 4.87,
|
|
"mean_token_accuracy": 0.21700138747692108,
|
|
"num_tokens": 50494655.0,
|
|
"step": 22015
|
|
},
|
|
{
|
|
"entropy": 5.155607461929321,
|
|
"epoch": 2.115273775216138,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004553991329450385,
|
|
"loss": 4.8006,
|
|
"mean_token_accuracy": 0.22141497135162352,
|
|
"num_tokens": 50506302.0,
|
|
"step": 22020
|
|
},
|
|
{
|
|
"entropy": 5.169172048568726,
|
|
"epoch": 2.115754082612872,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00045537864205695116,
|
|
"loss": 4.8307,
|
|
"mean_token_accuracy": 0.2262236014008522,
|
|
"num_tokens": 50517979.0,
|
|
"step": 22025
|
|
},
|
|
{
|
|
"entropy": 5.175928783416748,
|
|
"epoch": 2.1162343900096063,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004553581469810073,
|
|
"loss": 4.8986,
|
|
"mean_token_accuracy": 0.21773719787597656,
|
|
"num_tokens": 50530054.0,
|
|
"step": 22030
|
|
},
|
|
{
|
|
"entropy": 5.101979446411133,
|
|
"epoch": 2.11671469740634,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045533764771768287,
|
|
"loss": 4.7949,
|
|
"mean_token_accuracy": 0.22100536227226258,
|
|
"num_tokens": 50540664.0,
|
|
"step": 22035
|
|
},
|
|
{
|
|
"entropy": 5.193927669525147,
|
|
"epoch": 2.117195004803074,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045531714426745373,
|
|
"loss": 4.8578,
|
|
"mean_token_accuracy": 0.2234180748462677,
|
|
"num_tokens": 50551444.0,
|
|
"step": 22040
|
|
},
|
|
{
|
|
"entropy": 5.0862926006317135,
|
|
"epoch": 2.117675312199808,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004552966366307959,
|
|
"loss": 4.7849,
|
|
"mean_token_accuracy": 0.23260476291179658,
|
|
"num_tokens": 50562371.0,
|
|
"step": 22045
|
|
},
|
|
{
|
|
"entropy": 5.075074005126953,
|
|
"epoch": 2.118155619596542,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004552761248081856,
|
|
"loss": 4.7629,
|
|
"mean_token_accuracy": 0.22897567898035048,
|
|
"num_tokens": 50573545.0,
|
|
"step": 22050
|
|
},
|
|
{
|
|
"entropy": 5.192044448852539,
|
|
"epoch": 2.1186359269932757,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000455255608800099,
|
|
"loss": 4.9045,
|
|
"mean_token_accuracy": 0.21907887905836104,
|
|
"num_tokens": 50584799.0,
|
|
"step": 22055
|
|
},
|
|
{
|
|
"entropy": 5.175659608840943,
|
|
"epoch": 2.1191162343900096,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045523508860701237,
|
|
"loss": 4.8346,
|
|
"mean_token_accuracy": 0.22237366437911987,
|
|
"num_tokens": 50596435.0,
|
|
"step": 22060
|
|
},
|
|
{
|
|
"entropy": 5.097451782226562,
|
|
"epoch": 2.1195965417867435,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004552145642294021,
|
|
"loss": 4.7367,
|
|
"mean_token_accuracy": 0.2358742281794548,
|
|
"num_tokens": 50606921.0,
|
|
"step": 22065
|
|
},
|
|
{
|
|
"entropy": 5.129500150680542,
|
|
"epoch": 2.1200768491834774,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00045519403566774493,
|
|
"loss": 4.9222,
|
|
"mean_token_accuracy": 0.21600277125835418,
|
|
"num_tokens": 50618107.0,
|
|
"step": 22070
|
|
},
|
|
{
|
|
"entropy": 5.139066696166992,
|
|
"epoch": 2.1205571565802113,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004551735029225172,
|
|
"loss": 4.8115,
|
|
"mean_token_accuracy": 0.2253260374069214,
|
|
"num_tokens": 50628958.0,
|
|
"step": 22075
|
|
},
|
|
{
|
|
"entropy": 5.1106373310089115,
|
|
"epoch": 2.121037463976945,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00045515296599419583,
|
|
"loss": 4.79,
|
|
"mean_token_accuracy": 0.22253842353820802,
|
|
"num_tokens": 50640557.0,
|
|
"step": 22080
|
|
},
|
|
{
|
|
"entropy": 5.1263810157775875,
|
|
"epoch": 2.121517771373679,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004551324248832574,
|
|
"loss": 4.8494,
|
|
"mean_token_accuracy": 0.2251705527305603,
|
|
"num_tokens": 50652699.0,
|
|
"step": 22085
|
|
},
|
|
{
|
|
"entropy": 5.118668031692505,
|
|
"epoch": 2.121998078770413,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004551118795901791,
|
|
"loss": 4.7231,
|
|
"mean_token_accuracy": 0.2325097680091858,
|
|
"num_tokens": 50663735.0,
|
|
"step": 22090
|
|
},
|
|
{
|
|
"entropy": 5.100133562088013,
|
|
"epoch": 2.122478386167147,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004550913301154376,
|
|
"loss": 4.8066,
|
|
"mean_token_accuracy": 0.2283138006925583,
|
|
"num_tokens": 50676384.0,
|
|
"step": 22095
|
|
},
|
|
{
|
|
"entropy": 5.165930652618409,
|
|
"epoch": 2.1229586935638807,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004550707764595103,
|
|
"loss": 4.8467,
|
|
"mean_token_accuracy": 0.22226257771253585,
|
|
"num_tokens": 50687775.0,
|
|
"step": 22100
|
|
},
|
|
{
|
|
"entropy": 5.177178525924683,
|
|
"epoch": 2.123439000960615,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045505021862287434,
|
|
"loss": 4.8929,
|
|
"mean_token_accuracy": 0.2158343955874443,
|
|
"num_tokens": 50698522.0,
|
|
"step": 22105
|
|
},
|
|
{
|
|
"entropy": 5.11175765991211,
|
|
"epoch": 2.123919308357349,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045502965660600684,
|
|
"loss": 4.7975,
|
|
"mean_token_accuracy": 0.2320536717772484,
|
|
"num_tokens": 50709994.0,
|
|
"step": 22110
|
|
},
|
|
{
|
|
"entropy": 5.087244272232056,
|
|
"epoch": 2.1243996157540828,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004550090904093853,
|
|
"loss": 4.7694,
|
|
"mean_token_accuracy": 0.22834665477275848,
|
|
"num_tokens": 50720777.0,
|
|
"step": 22115
|
|
},
|
|
{
|
|
"entropy": 5.1074995517730715,
|
|
"epoch": 2.1248799231508166,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004549885200334872,
|
|
"loss": 4.7568,
|
|
"mean_token_accuracy": 0.23058853149414063,
|
|
"num_tokens": 50732836.0,
|
|
"step": 22120
|
|
},
|
|
{
|
|
"entropy": 5.137260675430298,
|
|
"epoch": 2.1253602305475505,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004549679454787901,
|
|
"loss": 4.8156,
|
|
"mean_token_accuracy": 0.2311272978782654,
|
|
"num_tokens": 50743854.0,
|
|
"step": 22125
|
|
},
|
|
{
|
|
"entropy": 5.062818193435669,
|
|
"epoch": 2.1258405379442844,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00045494736674577175,
|
|
"loss": 4.7465,
|
|
"mean_token_accuracy": 0.230141381919384,
|
|
"num_tokens": 50755524.0,
|
|
"step": 22130
|
|
},
|
|
{
|
|
"entropy": 5.133535289764405,
|
|
"epoch": 2.1263208453410183,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004549267838349099,
|
|
"loss": 4.7783,
|
|
"mean_token_accuracy": 0.23211053013801575,
|
|
"num_tokens": 50766909.0,
|
|
"step": 22135
|
|
},
|
|
{
|
|
"entropy": 5.0793849468231205,
|
|
"epoch": 2.126801152737752,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004549061967466823,
|
|
"loss": 4.7333,
|
|
"mean_token_accuracy": 0.23264259546995164,
|
|
"num_tokens": 50778065.0,
|
|
"step": 22140
|
|
},
|
|
{
|
|
"entropy": 5.14521746635437,
|
|
"epoch": 2.127281460134486,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004548856054815671,
|
|
"loss": 4.8307,
|
|
"mean_token_accuracy": 0.23031747192144394,
|
|
"num_tokens": 50790122.0,
|
|
"step": 22145
|
|
},
|
|
{
|
|
"entropy": 5.094038772583008,
|
|
"epoch": 2.12776176753122,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045486501004004225,
|
|
"loss": 4.8313,
|
|
"mean_token_accuracy": 0.2208801105618477,
|
|
"num_tokens": 50802661.0,
|
|
"step": 22150
|
|
},
|
|
{
|
|
"entropy": 5.153134393692016,
|
|
"epoch": 2.128242074927954,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000454844410422586,
|
|
"loss": 4.8698,
|
|
"mean_token_accuracy": 0.22763815373182297,
|
|
"num_tokens": 50814279.0,
|
|
"step": 22155
|
|
},
|
|
{
|
|
"entropy": 5.065900707244873,
|
|
"epoch": 2.1287223823246877,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00045482380662967655,
|
|
"loss": 4.7076,
|
|
"mean_token_accuracy": 0.23367461413145066,
|
|
"num_tokens": 50826186.0,
|
|
"step": 22160
|
|
},
|
|
{
|
|
"entropy": 5.157932567596435,
|
|
"epoch": 2.1292026897214216,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004548031986617923,
|
|
"loss": 4.8323,
|
|
"mean_token_accuracy": 0.21744155138731003,
|
|
"num_tokens": 50838185.0,
|
|
"step": 22165
|
|
},
|
|
{
|
|
"entropy": 5.084433650970459,
|
|
"epoch": 2.1296829971181555,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004547825865194117,
|
|
"loss": 4.7825,
|
|
"mean_token_accuracy": 0.23202351927757264,
|
|
"num_tokens": 50850461.0,
|
|
"step": 22170
|
|
},
|
|
{
|
|
"entropy": 5.170603656768799,
|
|
"epoch": 2.1301633045148893,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00045476197020301323,
|
|
"loss": 4.8724,
|
|
"mean_token_accuracy": 0.22438560724258422,
|
|
"num_tokens": 50861859.0,
|
|
"step": 22175
|
|
},
|
|
{
|
|
"entropy": 5.137531900405884,
|
|
"epoch": 2.1306436119116237,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045474134971307554,
|
|
"loss": 4.7638,
|
|
"mean_token_accuracy": 0.2304867058992386,
|
|
"num_tokens": 50872776.0,
|
|
"step": 22180
|
|
},
|
|
{
|
|
"entropy": 5.075492095947266,
|
|
"epoch": 2.1311239193083575,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004547207250500775,
|
|
"loss": 4.7869,
|
|
"mean_token_accuracy": 0.22913870215415955,
|
|
"num_tokens": 50885333.0,
|
|
"step": 22185
|
|
},
|
|
{
|
|
"entropy": 5.1370524883270265,
|
|
"epoch": 2.1316042267050914,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000454700096214498,
|
|
"loss": 4.7838,
|
|
"mean_token_accuracy": 0.22217728048563004,
|
|
"num_tokens": 50897256.0,
|
|
"step": 22190
|
|
},
|
|
{
|
|
"entropy": 5.099877500534058,
|
|
"epoch": 2.1320845341018253,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045467946320681567,
|
|
"loss": 4.7826,
|
|
"mean_token_accuracy": 0.22619348019361496,
|
|
"num_tokens": 50909558.0,
|
|
"step": 22195
|
|
},
|
|
{
|
|
"entropy": 5.117943239212036,
|
|
"epoch": 2.132564841498559,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004546588260275098,
|
|
"loss": 4.818,
|
|
"mean_token_accuracy": 0.2298060894012451,
|
|
"num_tokens": 50920617.0,
|
|
"step": 22200
|
|
},
|
|
{
|
|
"entropy": 5.107833623886108,
|
|
"epoch": 2.133045148895293,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045463818467705955,
|
|
"loss": 4.7844,
|
|
"mean_token_accuracy": 0.22848994582891463,
|
|
"num_tokens": 50931323.0,
|
|
"step": 22205
|
|
},
|
|
{
|
|
"entropy": 5.100920295715332,
|
|
"epoch": 2.133525456292027,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000454617539155944,
|
|
"loss": 4.6748,
|
|
"mean_token_accuracy": 0.23774074912071227,
|
|
"num_tokens": 50942798.0,
|
|
"step": 22210
|
|
},
|
|
{
|
|
"entropy": 5.047171878814697,
|
|
"epoch": 2.134005763688761,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045459688946464255,
|
|
"loss": 4.6721,
|
|
"mean_token_accuracy": 0.2314303919672966,
|
|
"num_tokens": 50954877.0,
|
|
"step": 22215
|
|
},
|
|
{
|
|
"entropy": 5.193651580810547,
|
|
"epoch": 2.1344860710854947,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004545762356036346,
|
|
"loss": 4.7615,
|
|
"mean_token_accuracy": 0.22914791703224183,
|
|
"num_tokens": 50966133.0,
|
|
"step": 22220
|
|
},
|
|
{
|
|
"entropy": 5.151372289657592,
|
|
"epoch": 2.1349663784822286,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004545555775733998,
|
|
"loss": 4.7618,
|
|
"mean_token_accuracy": 0.23861754089593887,
|
|
"num_tokens": 50977334.0,
|
|
"step": 22225
|
|
},
|
|
{
|
|
"entropy": 5.105147123336792,
|
|
"epoch": 2.1354466858789625,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045453491537441747,
|
|
"loss": 4.8307,
|
|
"mean_token_accuracy": 0.2180767059326172,
|
|
"num_tokens": 50988743.0,
|
|
"step": 22230
|
|
},
|
|
{
|
|
"entropy": 5.126622343063355,
|
|
"epoch": 2.1359269932756964,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045451424900716763,
|
|
"loss": 4.7816,
|
|
"mean_token_accuracy": 0.2273872137069702,
|
|
"num_tokens": 51000591.0,
|
|
"step": 22235
|
|
},
|
|
{
|
|
"entropy": 5.193524265289307,
|
|
"epoch": 2.1364073006724302,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045449357847212994,
|
|
"loss": 4.9755,
|
|
"mean_token_accuracy": 0.2240106552839279,
|
|
"num_tokens": 51011565.0,
|
|
"step": 22240
|
|
},
|
|
{
|
|
"entropy": 5.134286689758301,
|
|
"epoch": 2.136887608069164,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004544729037697844,
|
|
"loss": 4.8427,
|
|
"mean_token_accuracy": 0.22441424876451493,
|
|
"num_tokens": 51022755.0,
|
|
"step": 22245
|
|
},
|
|
{
|
|
"entropy": 5.170749378204346,
|
|
"epoch": 2.137367915465898,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045445222490061093,
|
|
"loss": 4.8751,
|
|
"mean_token_accuracy": 0.22253476232290267,
|
|
"num_tokens": 51032839.0,
|
|
"step": 22250
|
|
},
|
|
{
|
|
"entropy": 5.129157400131225,
|
|
"epoch": 2.1378482228626323,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004544315418650897,
|
|
"loss": 4.8233,
|
|
"mean_token_accuracy": 0.22642728239297866,
|
|
"num_tokens": 51045020.0,
|
|
"step": 22255
|
|
},
|
|
{
|
|
"entropy": 5.132080316543579,
|
|
"epoch": 2.138328530259366,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004544108546637008,
|
|
"loss": 4.738,
|
|
"mean_token_accuracy": 0.23691660314798355,
|
|
"num_tokens": 51056765.0,
|
|
"step": 22260
|
|
},
|
|
{
|
|
"entropy": 5.070805454254151,
|
|
"epoch": 2.1388088376561,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004543901632969247,
|
|
"loss": 4.7495,
|
|
"mean_token_accuracy": 0.2298893377184868,
|
|
"num_tokens": 51069559.0,
|
|
"step": 22265
|
|
},
|
|
{
|
|
"entropy": 5.124291658401489,
|
|
"epoch": 2.139289145052834,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045436946776524157,
|
|
"loss": 4.7954,
|
|
"mean_token_accuracy": 0.22728511691093445,
|
|
"num_tokens": 51080972.0,
|
|
"step": 22270
|
|
},
|
|
{
|
|
"entropy": 5.047629261016846,
|
|
"epoch": 2.139769452449568,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045434876806913204,
|
|
"loss": 4.7466,
|
|
"mean_token_accuracy": 0.2311826914548874,
|
|
"num_tokens": 51092101.0,
|
|
"step": 22275
|
|
},
|
|
{
|
|
"entropy": 5.180647182464599,
|
|
"epoch": 2.1402497598463017,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004543280642090767,
|
|
"loss": 4.9278,
|
|
"mean_token_accuracy": 0.2261410266160965,
|
|
"num_tokens": 51103372.0,
|
|
"step": 22280
|
|
},
|
|
{
|
|
"entropy": 5.1281579494476315,
|
|
"epoch": 2.1407300672430356,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004543073561855562,
|
|
"loss": 4.8322,
|
|
"mean_token_accuracy": 0.22201440036296843,
|
|
"num_tokens": 51114856.0,
|
|
"step": 22285
|
|
},
|
|
{
|
|
"entropy": 5.22743821144104,
|
|
"epoch": 2.1412103746397695,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004542866439990513,
|
|
"loss": 4.9103,
|
|
"mean_token_accuracy": 0.21878052353858948,
|
|
"num_tokens": 51126354.0,
|
|
"step": 22290
|
|
},
|
|
{
|
|
"entropy": 5.209173345565796,
|
|
"epoch": 2.1416906820365034,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004542659276500429,
|
|
"loss": 4.8647,
|
|
"mean_token_accuracy": 0.22599587440490723,
|
|
"num_tokens": 51138341.0,
|
|
"step": 22295
|
|
},
|
|
{
|
|
"entropy": 5.069614505767822,
|
|
"epoch": 2.1421709894332372,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045424520713901204,
|
|
"loss": 4.7487,
|
|
"mean_token_accuracy": 0.23675140142440795,
|
|
"num_tokens": 51150969.0,
|
|
"step": 22300
|
|
},
|
|
{
|
|
"entropy": 5.048203372955323,
|
|
"epoch": 2.142651296829971,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004542244824664396,
|
|
"loss": 4.7701,
|
|
"mean_token_accuracy": 0.22264178842306137,
|
|
"num_tokens": 51162199.0,
|
|
"step": 22305
|
|
},
|
|
{
|
|
"entropy": 5.094993686676025,
|
|
"epoch": 2.143131604226705,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00045420375363280696,
|
|
"loss": 4.8432,
|
|
"mean_token_accuracy": 0.220883372426033,
|
|
"num_tokens": 51173129.0,
|
|
"step": 22310
|
|
},
|
|
{
|
|
"entropy": 5.074619722366333,
|
|
"epoch": 2.143611911623439,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045418302063859526,
|
|
"loss": 4.7323,
|
|
"mean_token_accuracy": 0.23448609113693236,
|
|
"num_tokens": 51184593.0,
|
|
"step": 22315
|
|
},
|
|
{
|
|
"entropy": 5.133390474319458,
|
|
"epoch": 2.1440922190201728,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00045416228348428583,
|
|
"loss": 4.7951,
|
|
"mean_token_accuracy": 0.23045875877141953,
|
|
"num_tokens": 51196524.0,
|
|
"step": 22320
|
|
},
|
|
{
|
|
"entropy": 5.139809799194336,
|
|
"epoch": 2.1445725264169067,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00045414154217036023,
|
|
"loss": 4.8641,
|
|
"mean_token_accuracy": 0.2281458020210266,
|
|
"num_tokens": 51209307.0,
|
|
"step": 22325
|
|
},
|
|
{
|
|
"entropy": 5.067409896850586,
|
|
"epoch": 2.1450528338136405,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045412079669730006,
|
|
"loss": 4.7651,
|
|
"mean_token_accuracy": 0.23632195293903352,
|
|
"num_tokens": 51220919.0,
|
|
"step": 22330
|
|
},
|
|
{
|
|
"entropy": 5.133350276947022,
|
|
"epoch": 2.1455331412103744,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004541000470655867,
|
|
"loss": 4.7202,
|
|
"mean_token_accuracy": 0.23960795998573303,
|
|
"num_tokens": 51232930.0,
|
|
"step": 22335
|
|
},
|
|
{
|
|
"entropy": 5.068939113616944,
|
|
"epoch": 2.1460134486071087,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045407929327570215,
|
|
"loss": 4.8146,
|
|
"mean_token_accuracy": 0.23344950377941132,
|
|
"num_tokens": 51244697.0,
|
|
"step": 22340
|
|
},
|
|
{
|
|
"entropy": 5.100889825820923,
|
|
"epoch": 2.1464937560038426,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004540585353281282,
|
|
"loss": 4.8058,
|
|
"mean_token_accuracy": 0.23310038298368455,
|
|
"num_tokens": 51255333.0,
|
|
"step": 22345
|
|
},
|
|
{
|
|
"entropy": 5.168522882461548,
|
|
"epoch": 2.1469740634005765,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004540377732233467,
|
|
"loss": 4.8088,
|
|
"mean_token_accuracy": 0.22492084354162217,
|
|
"num_tokens": 51267649.0,
|
|
"step": 22350
|
|
},
|
|
{
|
|
"entropy": 5.118089246749878,
|
|
"epoch": 2.1474543707973104,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004540170069618397,
|
|
"loss": 4.7318,
|
|
"mean_token_accuracy": 0.22790632545948028,
|
|
"num_tokens": 51279576.0,
|
|
"step": 22355
|
|
},
|
|
{
|
|
"entropy": 5.149295806884766,
|
|
"epoch": 2.1479346781940443,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045399623654408946,
|
|
"loss": 4.8046,
|
|
"mean_token_accuracy": 0.22683072388172149,
|
|
"num_tokens": 51290974.0,
|
|
"step": 22360
|
|
},
|
|
{
|
|
"entropy": 5.23675651550293,
|
|
"epoch": 2.148414985590778,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000453975461970578,
|
|
"loss": 4.8662,
|
|
"mean_token_accuracy": 0.22433996796607972,
|
|
"num_tokens": 51301405.0,
|
|
"step": 22365
|
|
},
|
|
{
|
|
"entropy": 5.199128198623657,
|
|
"epoch": 2.148895292987512,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004539546832417879,
|
|
"loss": 4.9317,
|
|
"mean_token_accuracy": 0.21655915826559066,
|
|
"num_tokens": 51313158.0,
|
|
"step": 22370
|
|
},
|
|
{
|
|
"entropy": 5.159457111358643,
|
|
"epoch": 2.149375600384246,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045393390035820136,
|
|
"loss": 4.8091,
|
|
"mean_token_accuracy": 0.238103286921978,
|
|
"num_tokens": 51323789.0,
|
|
"step": 22375
|
|
},
|
|
{
|
|
"entropy": 5.081663799285889,
|
|
"epoch": 2.14985590778098,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000453913113320301,
|
|
"loss": 4.83,
|
|
"mean_token_accuracy": 0.22566504627466202,
|
|
"num_tokens": 51334864.0,
|
|
"step": 22380
|
|
},
|
|
{
|
|
"entropy": 5.1508691787719725,
|
|
"epoch": 2.1503362151777137,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004538923221285694,
|
|
"loss": 4.7237,
|
|
"mean_token_accuracy": 0.22757074534893035,
|
|
"num_tokens": 51345473.0,
|
|
"step": 22385
|
|
},
|
|
{
|
|
"entropy": 5.209955358505249,
|
|
"epoch": 2.1508165225744476,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004538715267834893,
|
|
"loss": 4.883,
|
|
"mean_token_accuracy": 0.22349812388420104,
|
|
"num_tokens": 51356755.0,
|
|
"step": 22390
|
|
},
|
|
{
|
|
"entropy": 5.011656999588013,
|
|
"epoch": 2.1512968299711814,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004538507272855434,
|
|
"loss": 4.6891,
|
|
"mean_token_accuracy": 0.2348538041114807,
|
|
"num_tokens": 51368351.0,
|
|
"step": 22395
|
|
},
|
|
{
|
|
"entropy": 5.162692880630493,
|
|
"epoch": 2.1517771373679153,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045382992363521486,
|
|
"loss": 4.9466,
|
|
"mean_token_accuracy": 0.22023718655109406,
|
|
"num_tokens": 51379144.0,
|
|
"step": 22400
|
|
},
|
|
{
|
|
"entropy": 5.127411413192749,
|
|
"epoch": 2.152257444764649,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00045380911583298633,
|
|
"loss": 4.7643,
|
|
"mean_token_accuracy": 0.23007805794477462,
|
|
"num_tokens": 51390732.0,
|
|
"step": 22405
|
|
},
|
|
{
|
|
"entropy": 5.080131578445434,
|
|
"epoch": 2.152737752161383,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00045378830387934123,
|
|
"loss": 4.7175,
|
|
"mean_token_accuracy": 0.2360814943909645,
|
|
"num_tokens": 51401185.0,
|
|
"step": 22410
|
|
},
|
|
{
|
|
"entropy": 5.140558958053589,
|
|
"epoch": 2.1532180595581174,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004537674877747626,
|
|
"loss": 4.8385,
|
|
"mean_token_accuracy": 0.22309084683656694,
|
|
"num_tokens": 51412598.0,
|
|
"step": 22415
|
|
},
|
|
{
|
|
"entropy": 5.099089097976685,
|
|
"epoch": 2.1536983669548513,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00045374666751973365,
|
|
"loss": 4.7378,
|
|
"mean_token_accuracy": 0.23137754797935486,
|
|
"num_tokens": 51423248.0,
|
|
"step": 22420
|
|
},
|
|
{
|
|
"entropy": 5.034801387786866,
|
|
"epoch": 2.154178674351585,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00045372584311473784,
|
|
"loss": 4.6803,
|
|
"mean_token_accuracy": 0.24379053264856337,
|
|
"num_tokens": 51433862.0,
|
|
"step": 22425
|
|
},
|
|
{
|
|
"entropy": 5.033099889755249,
|
|
"epoch": 2.154658981748319,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004537050145602587,
|
|
"loss": 4.7913,
|
|
"mean_token_accuracy": 0.23027238994836807,
|
|
"num_tokens": 51445311.0,
|
|
"step": 22430
|
|
},
|
|
{
|
|
"entropy": 5.064621496200561,
|
|
"epoch": 2.155139289145053,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004536841818567798,
|
|
"loss": 4.6955,
|
|
"mean_token_accuracy": 0.24127983748912812,
|
|
"num_tokens": 51456550.0,
|
|
"step": 22435
|
|
},
|
|
{
|
|
"entropy": 5.14633994102478,
|
|
"epoch": 2.155619596541787,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004536633450047847,
|
|
"loss": 4.8075,
|
|
"mean_token_accuracy": 0.22890851646661758,
|
|
"num_tokens": 51467645.0,
|
|
"step": 22440
|
|
},
|
|
{
|
|
"entropy": 5.107261705398559,
|
|
"epoch": 2.1560999039385207,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00045364250400475734,
|
|
"loss": 4.7765,
|
|
"mean_token_accuracy": 0.23856985867023467,
|
|
"num_tokens": 51478275.0,
|
|
"step": 22445
|
|
},
|
|
{
|
|
"entropy": 5.05497145652771,
|
|
"epoch": 2.1565802113352546,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004536216588571814,
|
|
"loss": 4.746,
|
|
"mean_token_accuracy": 0.23648817390203475,
|
|
"num_tokens": 51489634.0,
|
|
"step": 22450
|
|
},
|
|
{
|
|
"entropy": 5.086511611938477,
|
|
"epoch": 2.1570605187319885,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000453600809562541,
|
|
"loss": 4.7761,
|
|
"mean_token_accuracy": 0.2348331943154335,
|
|
"num_tokens": 51500245.0,
|
|
"step": 22455
|
|
},
|
|
{
|
|
"entropy": 5.160480165481568,
|
|
"epoch": 2.1575408261287223,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004535799561213202,
|
|
"loss": 4.7828,
|
|
"mean_token_accuracy": 0.23081537187099457,
|
|
"num_tokens": 51511996.0,
|
|
"step": 22460
|
|
},
|
|
{
|
|
"entropy": 5.074454116821289,
|
|
"epoch": 2.158021133525456,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000453559098534003,
|
|
"loss": 4.7398,
|
|
"mean_token_accuracy": 0.23446240425109863,
|
|
"num_tokens": 51523522.0,
|
|
"step": 22465
|
|
},
|
|
{
|
|
"entropy": 5.158469390869141,
|
|
"epoch": 2.15850144092219,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004535382368010738,
|
|
"loss": 4.7873,
|
|
"mean_token_accuracy": 0.23328575044870375,
|
|
"num_tokens": 51535261.0,
|
|
"step": 22470
|
|
},
|
|
{
|
|
"entropy": 5.031622505187988,
|
|
"epoch": 2.158981748318924,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045351737092301676,
|
|
"loss": 4.732,
|
|
"mean_token_accuracy": 0.23541125059127807,
|
|
"num_tokens": 51546965.0,
|
|
"step": 22475
|
|
},
|
|
{
|
|
"entropy": 5.092594337463379,
|
|
"epoch": 2.159462055715658,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004534965009003165,
|
|
"loss": 4.7498,
|
|
"mean_token_accuracy": 0.23316184133291246,
|
|
"num_tokens": 51558976.0,
|
|
"step": 22480
|
|
},
|
|
{
|
|
"entropy": 5.18133511543274,
|
|
"epoch": 2.1599423631123917,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004534756267334576,
|
|
"loss": 4.8635,
|
|
"mean_token_accuracy": 0.22220516204833984,
|
|
"num_tokens": 51571625.0,
|
|
"step": 22485
|
|
},
|
|
{
|
|
"entropy": 5.218547773361206,
|
|
"epoch": 2.160422670509126,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045345474842292455,
|
|
"loss": 4.8676,
|
|
"mean_token_accuracy": 0.2245472252368927,
|
|
"num_tokens": 51582732.0,
|
|
"step": 22490
|
|
},
|
|
{
|
|
"entropy": 5.110941934585571,
|
|
"epoch": 2.16090297790586,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004534338659692022,
|
|
"loss": 4.7882,
|
|
"mean_token_accuracy": 0.22675420343875885,
|
|
"num_tokens": 51594346.0,
|
|
"step": 22495
|
|
},
|
|
{
|
|
"entropy": 5.1038251399993895,
|
|
"epoch": 2.161383285302594,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004534129793727753,
|
|
"loss": 4.7964,
|
|
"mean_token_accuracy": 0.23224691152572632,
|
|
"num_tokens": 51606909.0,
|
|
"step": 22500
|
|
},
|
|
{
|
|
"entropy": 5.046190071105957,
|
|
"epoch": 2.1618635926993277,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004533920886341288,
|
|
"loss": 4.7774,
|
|
"mean_token_accuracy": 0.2357572838664055,
|
|
"num_tokens": 51617596.0,
|
|
"step": 22505
|
|
},
|
|
{
|
|
"entropy": 5.081631660461426,
|
|
"epoch": 2.1623439000960616,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004533711937537477,
|
|
"loss": 4.6884,
|
|
"mean_token_accuracy": 0.23987720012664795,
|
|
"num_tokens": 51627573.0,
|
|
"step": 22510
|
|
},
|
|
{
|
|
"entropy": 5.03942461013794,
|
|
"epoch": 2.1628242074927955,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004533502947321171,
|
|
"loss": 4.7468,
|
|
"mean_token_accuracy": 0.22461321353912353,
|
|
"num_tokens": 51640070.0,
|
|
"step": 22515
|
|
},
|
|
{
|
|
"entropy": 5.15150785446167,
|
|
"epoch": 2.1633045148895294,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004533293915697223,
|
|
"loss": 4.6963,
|
|
"mean_token_accuracy": 0.2359900563955307,
|
|
"num_tokens": 51651139.0,
|
|
"step": 22520
|
|
},
|
|
{
|
|
"entropy": 5.16046838760376,
|
|
"epoch": 2.1637848222862632,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00045330848426704853,
|
|
"loss": 4.8447,
|
|
"mean_token_accuracy": 0.2310606837272644,
|
|
"num_tokens": 51661893.0,
|
|
"step": 22525
|
|
},
|
|
{
|
|
"entropy": 5.088311004638672,
|
|
"epoch": 2.164265129682997,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004532875728245813,
|
|
"loss": 4.7932,
|
|
"mean_token_accuracy": 0.22725322842597961,
|
|
"num_tokens": 51674266.0,
|
|
"step": 22530
|
|
},
|
|
{
|
|
"entropy": 5.105494022369385,
|
|
"epoch": 2.164745437079731,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00045326665724280594,
|
|
"loss": 4.812,
|
|
"mean_token_accuracy": 0.23540204167366027,
|
|
"num_tokens": 51685450.0,
|
|
"step": 22535
|
|
},
|
|
{
|
|
"entropy": 5.1548017978668215,
|
|
"epoch": 2.165225744476465,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045324573752220814,
|
|
"loss": 4.8023,
|
|
"mean_token_accuracy": 0.2320707470178604,
|
|
"num_tokens": 51696779.0,
|
|
"step": 22540
|
|
},
|
|
{
|
|
"entropy": 5.172539901733399,
|
|
"epoch": 2.1657060518731988,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045322481366327365,
|
|
"loss": 4.8312,
|
|
"mean_token_accuracy": 0.2289508491754532,
|
|
"num_tokens": 51709387.0,
|
|
"step": 22545
|
|
},
|
|
{
|
|
"entropy": 5.096820926666259,
|
|
"epoch": 2.1661863592699326,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004532038856664882,
|
|
"loss": 4.7993,
|
|
"mean_token_accuracy": 0.2279914915561676,
|
|
"num_tokens": 51720978.0,
|
|
"step": 22550
|
|
},
|
|
{
|
|
"entropy": 5.249212408065796,
|
|
"epoch": 2.1666666666666665,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004531829535323376,
|
|
"loss": 4.9049,
|
|
"mean_token_accuracy": 0.220039102435112,
|
|
"num_tokens": 51733510.0,
|
|
"step": 22555
|
|
},
|
|
{
|
|
"entropy": 5.079677438735962,
|
|
"epoch": 2.1671469740634004,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000453162017261308,
|
|
"loss": 4.7079,
|
|
"mean_token_accuracy": 0.2436806619167328,
|
|
"num_tokens": 51743660.0,
|
|
"step": 22560
|
|
},
|
|
{
|
|
"entropy": 5.074145793914795,
|
|
"epoch": 2.1676272814601343,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004531410768538854,
|
|
"loss": 4.7453,
|
|
"mean_token_accuracy": 0.23445709496736528,
|
|
"num_tokens": 51754592.0,
|
|
"step": 22565
|
|
},
|
|
{
|
|
"entropy": 5.062536191940308,
|
|
"epoch": 2.168107588856868,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00045312013231055596,
|
|
"loss": 4.7404,
|
|
"mean_token_accuracy": 0.23631888031959533,
|
|
"num_tokens": 51766241.0,
|
|
"step": 22570
|
|
},
|
|
{
|
|
"entropy": 5.134042263031006,
|
|
"epoch": 2.1685878962536025,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045309918363180593,
|
|
"loss": 4.7631,
|
|
"mean_token_accuracy": 0.2316461443901062,
|
|
"num_tokens": 51777718.0,
|
|
"step": 22575
|
|
},
|
|
{
|
|
"entropy": 4.987406969070435,
|
|
"epoch": 2.1690682036503364,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045307823081812166,
|
|
"loss": 4.6724,
|
|
"mean_token_accuracy": 0.24229834973812103,
|
|
"num_tokens": 51788290.0,
|
|
"step": 22580
|
|
},
|
|
{
|
|
"entropy": 5.032321119308472,
|
|
"epoch": 2.1695485110470702,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00045305727386998977,
|
|
"loss": 4.7373,
|
|
"mean_token_accuracy": 0.22974208891391754,
|
|
"num_tokens": 51800080.0,
|
|
"step": 22585
|
|
},
|
|
{
|
|
"entropy": 5.1141222476959225,
|
|
"epoch": 2.170028818443804,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004530363127878966,
|
|
"loss": 4.7809,
|
|
"mean_token_accuracy": 0.2325947716832161,
|
|
"num_tokens": 51810155.0,
|
|
"step": 22590
|
|
},
|
|
{
|
|
"entropy": 5.146018838882446,
|
|
"epoch": 2.170509125840538,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045301534757232885,
|
|
"loss": 4.8255,
|
|
"mean_token_accuracy": 0.23102711737155915,
|
|
"num_tokens": 51823082.0,
|
|
"step": 22595
|
|
},
|
|
{
|
|
"entropy": 5.156763553619385,
|
|
"epoch": 2.170989433237272,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004529943782237735,
|
|
"loss": 4.8598,
|
|
"mean_token_accuracy": 0.22403838038444518,
|
|
"num_tokens": 51833280.0,
|
|
"step": 22600
|
|
},
|
|
{
|
|
"entropy": 5.114382219314575,
|
|
"epoch": 2.1714697406340058,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045297340474271717,
|
|
"loss": 4.7748,
|
|
"mean_token_accuracy": 0.22729934453964235,
|
|
"num_tokens": 51845105.0,
|
|
"step": 22605
|
|
},
|
|
{
|
|
"entropy": 5.13478045463562,
|
|
"epoch": 2.1719500480307397,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004529524271296468,
|
|
"loss": 4.758,
|
|
"mean_token_accuracy": 0.23644569963216783,
|
|
"num_tokens": 51856232.0,
|
|
"step": 22610
|
|
},
|
|
{
|
|
"entropy": 5.076405763626099,
|
|
"epoch": 2.1724303554274735,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045293144538504943,
|
|
"loss": 4.779,
|
|
"mean_token_accuracy": 0.23823365718126296,
|
|
"num_tokens": 51866218.0,
|
|
"step": 22615
|
|
},
|
|
{
|
|
"entropy": 5.115446138381958,
|
|
"epoch": 2.1729106628242074,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004529104595094124,
|
|
"loss": 4.7947,
|
|
"mean_token_accuracy": 0.22927693277597427,
|
|
"num_tokens": 51877425.0,
|
|
"step": 22620
|
|
},
|
|
{
|
|
"entropy": 5.124441528320313,
|
|
"epoch": 2.1733909702209413,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045288946950322264,
|
|
"loss": 4.7993,
|
|
"mean_token_accuracy": 0.2279620125889778,
|
|
"num_tokens": 51888818.0,
|
|
"step": 22625
|
|
},
|
|
{
|
|
"entropy": 5.106270027160645,
|
|
"epoch": 2.173871277617675,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004528684753669677,
|
|
"loss": 4.7799,
|
|
"mean_token_accuracy": 0.22617914527654648,
|
|
"num_tokens": 51899875.0,
|
|
"step": 22630
|
|
},
|
|
{
|
|
"entropy": 5.124859189987182,
|
|
"epoch": 2.174351585014409,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004528474771011349,
|
|
"loss": 4.7568,
|
|
"mean_token_accuracy": 0.2302141085267067,
|
|
"num_tokens": 51911084.0,
|
|
"step": 22635
|
|
},
|
|
{
|
|
"entropy": 5.080235481262207,
|
|
"epoch": 2.174831892411143,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045282647470621176,
|
|
"loss": 4.7936,
|
|
"mean_token_accuracy": 0.2341061756014824,
|
|
"num_tokens": 51922165.0,
|
|
"step": 22640
|
|
},
|
|
{
|
|
"entropy": 5.127397632598877,
|
|
"epoch": 2.175312199807877,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045280546818268595,
|
|
"loss": 4.8127,
|
|
"mean_token_accuracy": 0.2303827852010727,
|
|
"num_tokens": 51934827.0,
|
|
"step": 22645
|
|
},
|
|
{
|
|
"entropy": 5.158881187438965,
|
|
"epoch": 2.175792507204611,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004527844575310452,
|
|
"loss": 4.8226,
|
|
"mean_token_accuracy": 0.22183109372854232,
|
|
"num_tokens": 51947997.0,
|
|
"step": 22650
|
|
},
|
|
{
|
|
"entropy": 5.118029928207397,
|
|
"epoch": 2.176272814601345,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045276344275177715,
|
|
"loss": 4.842,
|
|
"mean_token_accuracy": 0.22817557156085969,
|
|
"num_tokens": 51959639.0,
|
|
"step": 22655
|
|
},
|
|
{
|
|
"entropy": 5.144334506988526,
|
|
"epoch": 2.176753121998079,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00045274242384536984,
|
|
"loss": 4.8592,
|
|
"mean_token_accuracy": 0.2271733269095421,
|
|
"num_tokens": 51970716.0,
|
|
"step": 22660
|
|
},
|
|
{
|
|
"entropy": 5.128339672088623,
|
|
"epoch": 2.177233429394813,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004527214008123113,
|
|
"loss": 4.7914,
|
|
"mean_token_accuracy": 0.23347052782773972,
|
|
"num_tokens": 51982556.0,
|
|
"step": 22665
|
|
},
|
|
{
|
|
"entropy": 5.134899616241455,
|
|
"epoch": 2.1777137367915467,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004527003736530895,
|
|
"loss": 4.7718,
|
|
"mean_token_accuracy": 0.2311878204345703,
|
|
"num_tokens": 51994894.0,
|
|
"step": 22670
|
|
},
|
|
{
|
|
"entropy": 5.13347806930542,
|
|
"epoch": 2.1781940441882806,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045267934236819265,
|
|
"loss": 4.8463,
|
|
"mean_token_accuracy": 0.21979733407497407,
|
|
"num_tokens": 52007128.0,
|
|
"step": 22675
|
|
},
|
|
{
|
|
"entropy": 5.114097976684571,
|
|
"epoch": 2.1786743515850144,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004526583069581091,
|
|
"loss": 4.9296,
|
|
"mean_token_accuracy": 0.21860868930816652,
|
|
"num_tokens": 52019598.0,
|
|
"step": 22680
|
|
},
|
|
{
|
|
"entropy": 5.116789150238037,
|
|
"epoch": 2.1791546589817483,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004526372674233272,
|
|
"loss": 4.8043,
|
|
"mean_token_accuracy": 0.23327067494392395,
|
|
"num_tokens": 52031445.0,
|
|
"step": 22685
|
|
},
|
|
{
|
|
"entropy": 5.212763261795044,
|
|
"epoch": 2.179634966378482,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045261622376433543,
|
|
"loss": 4.8476,
|
|
"mean_token_accuracy": 0.2272602990269661,
|
|
"num_tokens": 52042548.0,
|
|
"step": 22690
|
|
},
|
|
{
|
|
"entropy": 5.077626943588257,
|
|
"epoch": 2.180115273775216,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00045259517598162237,
|
|
"loss": 4.7724,
|
|
"mean_token_accuracy": 0.23077887147665024,
|
|
"num_tokens": 52054147.0,
|
|
"step": 22695
|
|
},
|
|
{
|
|
"entropy": 5.057162761688232,
|
|
"epoch": 2.18059558117195,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004525741240756766,
|
|
"loss": 4.7556,
|
|
"mean_token_accuracy": 0.23871446251869202,
|
|
"num_tokens": 52065556.0,
|
|
"step": 22700
|
|
},
|
|
{
|
|
"entropy": 5.189388418197632,
|
|
"epoch": 2.181075888568684,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004525530680469871,
|
|
"loss": 4.85,
|
|
"mean_token_accuracy": 0.23103740066289902,
|
|
"num_tokens": 52077543.0,
|
|
"step": 22705
|
|
},
|
|
{
|
|
"entropy": 5.084654235839844,
|
|
"epoch": 2.1815561959654177,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00045253200789604245,
|
|
"loss": 4.7745,
|
|
"mean_token_accuracy": 0.23199355304241182,
|
|
"num_tokens": 52088376.0,
|
|
"step": 22710
|
|
},
|
|
{
|
|
"entropy": 5.065414905548096,
|
|
"epoch": 2.1820365033621516,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045251094362333186,
|
|
"loss": 4.7859,
|
|
"mean_token_accuracy": 0.22604561150074004,
|
|
"num_tokens": 52100019.0,
|
|
"step": 22715
|
|
},
|
|
{
|
|
"entropy": 5.075776672363281,
|
|
"epoch": 2.1825168107588855,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004524898752293441,
|
|
"loss": 4.7523,
|
|
"mean_token_accuracy": 0.23174404501914977,
|
|
"num_tokens": 52110643.0,
|
|
"step": 22720
|
|
},
|
|
{
|
|
"entropy": 5.025632047653199,
|
|
"epoch": 2.18299711815562,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045246880271456857,
|
|
"loss": 4.7199,
|
|
"mean_token_accuracy": 0.235849928855896,
|
|
"num_tokens": 52123092.0,
|
|
"step": 22725
|
|
},
|
|
{
|
|
"entropy": 5.192577219009399,
|
|
"epoch": 2.1834774255523537,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004524477260794944,
|
|
"loss": 4.8897,
|
|
"mean_token_accuracy": 0.22173816263675689,
|
|
"num_tokens": 52135371.0,
|
|
"step": 22730
|
|
},
|
|
{
|
|
"entropy": 5.149078893661499,
|
|
"epoch": 2.1839577329490876,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045242664532461094,
|
|
"loss": 4.7368,
|
|
"mean_token_accuracy": 0.23526560813188552,
|
|
"num_tokens": 52145877.0,
|
|
"step": 22735
|
|
},
|
|
{
|
|
"entropy": 5.034790945053101,
|
|
"epoch": 2.1844380403458215,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045240556045040767,
|
|
"loss": 4.8035,
|
|
"mean_token_accuracy": 0.22363627403974534,
|
|
"num_tokens": 52158228.0,
|
|
"step": 22740
|
|
},
|
|
{
|
|
"entropy": 5.198207521438599,
|
|
"epoch": 2.1849183477425553,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00045238447145737397,
|
|
"loss": 4.9068,
|
|
"mean_token_accuracy": 0.2178325742483139,
|
|
"num_tokens": 52169515.0,
|
|
"step": 22745
|
|
},
|
|
{
|
|
"entropy": 5.153679275512696,
|
|
"epoch": 2.185398655139289,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045236337834599966,
|
|
"loss": 4.8384,
|
|
"mean_token_accuracy": 0.2262999877333641,
|
|
"num_tokens": 52179846.0,
|
|
"step": 22750
|
|
},
|
|
{
|
|
"entropy": 5.009027528762817,
|
|
"epoch": 2.185878962536023,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045234228111677434,
|
|
"loss": 4.6796,
|
|
"mean_token_accuracy": 0.23403288871049882,
|
|
"num_tokens": 52191095.0,
|
|
"step": 22755
|
|
},
|
|
{
|
|
"entropy": 5.104574346542359,
|
|
"epoch": 2.186359269932757,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004523211797701878,
|
|
"loss": 4.8442,
|
|
"mean_token_accuracy": 0.22490133345127106,
|
|
"num_tokens": 52202289.0,
|
|
"step": 22760
|
|
},
|
|
{
|
|
"entropy": 5.049128198623658,
|
|
"epoch": 2.186839577329491,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045230007430673014,
|
|
"loss": 4.7937,
|
|
"mean_token_accuracy": 0.23295473158359528,
|
|
"num_tokens": 52214424.0,
|
|
"step": 22765
|
|
},
|
|
{
|
|
"entropy": 5.171049499511719,
|
|
"epoch": 2.1873198847262247,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004522789647268911,
|
|
"loss": 4.8374,
|
|
"mean_token_accuracy": 0.2265281304717064,
|
|
"num_tokens": 52225446.0,
|
|
"step": 22770
|
|
},
|
|
{
|
|
"entropy": 5.092030334472656,
|
|
"epoch": 2.1878001921229586,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004522578510311609,
|
|
"loss": 4.7353,
|
|
"mean_token_accuracy": 0.2287060409784317,
|
|
"num_tokens": 52236063.0,
|
|
"step": 22775
|
|
},
|
|
{
|
|
"entropy": 5.024506282806397,
|
|
"epoch": 2.1882804995196925,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045223673322002984,
|
|
"loss": 4.7728,
|
|
"mean_token_accuracy": 0.2300436779856682,
|
|
"num_tokens": 52248618.0,
|
|
"step": 22780
|
|
},
|
|
{
|
|
"entropy": 5.129857873916626,
|
|
"epoch": 2.1887608069164264,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00045221561129398804,
|
|
"loss": 4.901,
|
|
"mean_token_accuracy": 0.2197287231683731,
|
|
"num_tokens": 52261913.0,
|
|
"step": 22785
|
|
},
|
|
{
|
|
"entropy": 5.165701150894165,
|
|
"epoch": 2.1892411143131603,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000452194485253526,
|
|
"loss": 4.7645,
|
|
"mean_token_accuracy": 0.23000348508358,
|
|
"num_tokens": 52273838.0,
|
|
"step": 22790
|
|
},
|
|
{
|
|
"entropy": 5.074887895584107,
|
|
"epoch": 2.189721421709894,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004521733550991342,
|
|
"loss": 4.8224,
|
|
"mean_token_accuracy": 0.2346891850233078,
|
|
"num_tokens": 52285464.0,
|
|
"step": 22795
|
|
},
|
|
{
|
|
"entropy": 5.058576774597168,
|
|
"epoch": 2.1902017291066285,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045215222083130316,
|
|
"loss": 4.765,
|
|
"mean_token_accuracy": 0.23533908724784852,
|
|
"num_tokens": 52295800.0,
|
|
"step": 22800
|
|
},
|
|
{
|
|
"entropy": 5.189198160171509,
|
|
"epoch": 2.1906820365033624,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004521310824505236,
|
|
"loss": 4.888,
|
|
"mean_token_accuracy": 0.22185751646757126,
|
|
"num_tokens": 52307243.0,
|
|
"step": 22805
|
|
},
|
|
{
|
|
"entropy": 5.105354499816895,
|
|
"epoch": 2.1911623439000962,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045210993995728623,
|
|
"loss": 4.7606,
|
|
"mean_token_accuracy": 0.23480006754398347,
|
|
"num_tokens": 52317065.0,
|
|
"step": 22810
|
|
},
|
|
{
|
|
"entropy": 5.121937274932861,
|
|
"epoch": 2.19164265129683,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000452088793352082,
|
|
"loss": 4.7907,
|
|
"mean_token_accuracy": 0.22747430503368377,
|
|
"num_tokens": 52328385.0,
|
|
"step": 22815
|
|
},
|
|
{
|
|
"entropy": 5.000249195098877,
|
|
"epoch": 2.192122958693564,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004520676426354018,
|
|
"loss": 4.7284,
|
|
"mean_token_accuracy": 0.23484476506710053,
|
|
"num_tokens": 52340046.0,
|
|
"step": 22820
|
|
},
|
|
{
|
|
"entropy": 5.029712343215943,
|
|
"epoch": 2.192603266090298,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004520464878077368,
|
|
"loss": 4.7419,
|
|
"mean_token_accuracy": 0.23461335599422456,
|
|
"num_tokens": 52352071.0,
|
|
"step": 22825
|
|
},
|
|
{
|
|
"entropy": 5.2037135601043705,
|
|
"epoch": 2.1930835734870318,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00045202532886957805,
|
|
"loss": 4.8642,
|
|
"mean_token_accuracy": 0.2289625346660614,
|
|
"num_tokens": 52364173.0,
|
|
"step": 22830
|
|
},
|
|
{
|
|
"entropy": 5.04330883026123,
|
|
"epoch": 2.1935638808837656,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00045200416582141676,
|
|
"loss": 4.689,
|
|
"mean_token_accuracy": 0.23917343467473984,
|
|
"num_tokens": 52374690.0,
|
|
"step": 22835
|
|
},
|
|
{
|
|
"entropy": 5.183155298233032,
|
|
"epoch": 2.1940441882804995,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004519829986637444,
|
|
"loss": 4.9656,
|
|
"mean_token_accuracy": 0.22324798554182052,
|
|
"num_tokens": 52385661.0,
|
|
"step": 22840
|
|
},
|
|
{
|
|
"entropy": 5.145434188842773,
|
|
"epoch": 2.1945244956772334,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004519618273970523,
|
|
"loss": 4.8464,
|
|
"mean_token_accuracy": 0.2259630724787712,
|
|
"num_tokens": 52396759.0,
|
|
"step": 22845
|
|
},
|
|
{
|
|
"entropy": 5.199276876449585,
|
|
"epoch": 2.1950048030739673,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00045194065202183205,
|
|
"loss": 4.9162,
|
|
"mean_token_accuracy": 0.21793469041585922,
|
|
"num_tokens": 52409122.0,
|
|
"step": 22850
|
|
},
|
|
{
|
|
"entropy": 5.250739097595215,
|
|
"epoch": 2.195485110470701,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004519194725385752,
|
|
"loss": 4.8925,
|
|
"mean_token_accuracy": 0.21719120740890502,
|
|
"num_tokens": 52421318.0,
|
|
"step": 22855
|
|
},
|
|
{
|
|
"entropy": 5.042360639572143,
|
|
"epoch": 2.195965417867435,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045189828894777364,
|
|
"loss": 4.746,
|
|
"mean_token_accuracy": 0.23408232480287552,
|
|
"num_tokens": 52432595.0,
|
|
"step": 22860
|
|
},
|
|
{
|
|
"entropy": 5.068605136871338,
|
|
"epoch": 2.196445725264169,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045187710124991904,
|
|
"loss": 4.7719,
|
|
"mean_token_accuracy": 0.2310498610138893,
|
|
"num_tokens": 52444423.0,
|
|
"step": 22865
|
|
},
|
|
{
|
|
"entropy": 5.16375584602356,
|
|
"epoch": 2.196926032660903,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004518559094455034,
|
|
"loss": 4.8846,
|
|
"mean_token_accuracy": 0.2241843119263649,
|
|
"num_tokens": 52457183.0,
|
|
"step": 22870
|
|
},
|
|
{
|
|
"entropy": 5.163765239715576,
|
|
"epoch": 2.1974063400576367,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004518347135350187,
|
|
"loss": 4.8152,
|
|
"mean_token_accuracy": 0.22464211732149125,
|
|
"num_tokens": 52469624.0,
|
|
"step": 22875
|
|
},
|
|
{
|
|
"entropy": 5.1679223537445065,
|
|
"epoch": 2.1978866474543706,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00045181351351895703,
|
|
"loss": 4.916,
|
|
"mean_token_accuracy": 0.21683948189020158,
|
|
"num_tokens": 52481369.0,
|
|
"step": 22880
|
|
},
|
|
{
|
|
"entropy": 5.0861540794372555,
|
|
"epoch": 2.198366954851105,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004517923093978106,
|
|
"loss": 4.7479,
|
|
"mean_token_accuracy": 0.23464581966400147,
|
|
"num_tokens": 52493315.0,
|
|
"step": 22885
|
|
},
|
|
{
|
|
"entropy": 5.099695634841919,
|
|
"epoch": 2.1988472622478388,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004517711011720717,
|
|
"loss": 4.8213,
|
|
"mean_token_accuracy": 0.23163434565067292,
|
|
"num_tokens": 52504929.0,
|
|
"step": 22890
|
|
},
|
|
{
|
|
"entropy": 5.067135286331177,
|
|
"epoch": 2.1993275696445727,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004517498888422327,
|
|
"loss": 4.7238,
|
|
"mean_token_accuracy": 0.2288893863558769,
|
|
"num_tokens": 52518364.0,
|
|
"step": 22895
|
|
},
|
|
{
|
|
"entropy": 5.086096143722534,
|
|
"epoch": 2.1998078770413065,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004517286724087862,
|
|
"loss": 4.7689,
|
|
"mean_token_accuracy": 0.23164378404617308,
|
|
"num_tokens": 52530359.0,
|
|
"step": 22900
|
|
},
|
|
{
|
|
"entropy": 5.117498397827148,
|
|
"epoch": 2.2002881844380404,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004517074518722247,
|
|
"loss": 4.7983,
|
|
"mean_token_accuracy": 0.23343625366687776,
|
|
"num_tokens": 52542242.0,
|
|
"step": 22905
|
|
},
|
|
{
|
|
"entropy": 5.153238010406494,
|
|
"epoch": 2.2007684918347743,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00045168622723304084,
|
|
"loss": 4.8348,
|
|
"mean_token_accuracy": 0.23051410913467407,
|
|
"num_tokens": 52552838.0,
|
|
"step": 22910
|
|
},
|
|
{
|
|
"entropy": 5.107896041870117,
|
|
"epoch": 2.201248799231508,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004516649984917274,
|
|
"loss": 4.7863,
|
|
"mean_token_accuracy": 0.22998663187026977,
|
|
"num_tokens": 52564568.0,
|
|
"step": 22915
|
|
},
|
|
{
|
|
"entropy": 5.170346450805664,
|
|
"epoch": 2.201729106628242,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00045164376564877734,
|
|
"loss": 4.8308,
|
|
"mean_token_accuracy": 0.22436516731977463,
|
|
"num_tokens": 52577563.0,
|
|
"step": 22920
|
|
},
|
|
{
|
|
"entropy": 5.161020660400391,
|
|
"epoch": 2.202209414024976,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045162252870468354,
|
|
"loss": 4.8411,
|
|
"mean_token_accuracy": 0.22507598847150803,
|
|
"num_tokens": 52590295.0,
|
|
"step": 22925
|
|
},
|
|
{
|
|
"entropy": 5.116861581802368,
|
|
"epoch": 2.20268972142171,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004516012876599391,
|
|
"loss": 4.7759,
|
|
"mean_token_accuracy": 0.23662513345479966,
|
|
"num_tokens": 52602727.0,
|
|
"step": 22930
|
|
},
|
|
{
|
|
"entropy": 5.012200355529785,
|
|
"epoch": 2.2031700288184437,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045158004251503715,
|
|
"loss": 4.6927,
|
|
"mean_token_accuracy": 0.2406057357788086,
|
|
"num_tokens": 52613422.0,
|
|
"step": 22935
|
|
},
|
|
{
|
|
"entropy": 4.982286357879639,
|
|
"epoch": 2.2036503362151776,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00045155879327047087,
|
|
"loss": 4.7271,
|
|
"mean_token_accuracy": 0.2413996621966362,
|
|
"num_tokens": 52624907.0,
|
|
"step": 22940
|
|
},
|
|
{
|
|
"entropy": 5.147170066833496,
|
|
"epoch": 2.2041306436119115,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004515375399267338,
|
|
"loss": 4.8421,
|
|
"mean_token_accuracy": 0.22663878351449968,
|
|
"num_tokens": 52635864.0,
|
|
"step": 22945
|
|
},
|
|
{
|
|
"entropy": 5.1448752880096436,
|
|
"epoch": 2.2046109510086453,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045151628248431925,
|
|
"loss": 4.8956,
|
|
"mean_token_accuracy": 0.22156819105148315,
|
|
"num_tokens": 52647023.0,
|
|
"step": 22950
|
|
},
|
|
{
|
|
"entropy": 5.151683616638183,
|
|
"epoch": 2.2050912584053792,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045149502094372077,
|
|
"loss": 4.8621,
|
|
"mean_token_accuracy": 0.22776360362768172,
|
|
"num_tokens": 52658846.0,
|
|
"step": 22955
|
|
},
|
|
{
|
|
"entropy": 5.090791654586792,
|
|
"epoch": 2.2055715658021136,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00045147375530543195,
|
|
"loss": 4.652,
|
|
"mean_token_accuracy": 0.2444545805454254,
|
|
"num_tokens": 52671363.0,
|
|
"step": 22960
|
|
},
|
|
{
|
|
"entropy": 5.138968563079834,
|
|
"epoch": 2.2060518731988474,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00045145248556994653,
|
|
"loss": 4.8832,
|
|
"mean_token_accuracy": 0.22517444640398027,
|
|
"num_tokens": 52682734.0,
|
|
"step": 22965
|
|
},
|
|
{
|
|
"entropy": 5.081032848358154,
|
|
"epoch": 2.2065321805955813,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004514312117377584,
|
|
"loss": 4.7326,
|
|
"mean_token_accuracy": 0.2331282079219818,
|
|
"num_tokens": 52693272.0,
|
|
"step": 22970
|
|
},
|
|
{
|
|
"entropy": 5.096043682098388,
|
|
"epoch": 2.207012487992315,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004514099338093613,
|
|
"loss": 4.8028,
|
|
"mean_token_accuracy": 0.2290824383497238,
|
|
"num_tokens": 52703395.0,
|
|
"step": 22975
|
|
},
|
|
{
|
|
"entropy": 5.066891288757324,
|
|
"epoch": 2.207492795389049,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004513886517852496,
|
|
"loss": 4.7096,
|
|
"mean_token_accuracy": 0.23548574298620223,
|
|
"num_tokens": 52715870.0,
|
|
"step": 22980
|
|
},
|
|
{
|
|
"entropy": 5.105998754501343,
|
|
"epoch": 2.207973102785783,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000451367365665917,
|
|
"loss": 4.8257,
|
|
"mean_token_accuracy": 0.23159873336553574,
|
|
"num_tokens": 52726922.0,
|
|
"step": 22985
|
|
},
|
|
{
|
|
"entropy": 5.127434110641479,
|
|
"epoch": 2.208453410182517,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045134607545185785,
|
|
"loss": 4.7961,
|
|
"mean_token_accuracy": 0.23253277987241744,
|
|
"num_tokens": 52738211.0,
|
|
"step": 22990
|
|
},
|
|
{
|
|
"entropy": 5.1540055751800535,
|
|
"epoch": 2.2089337175792507,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004513247811435666,
|
|
"loss": 4.7853,
|
|
"mean_token_accuracy": 0.230952388048172,
|
|
"num_tokens": 52749267.0,
|
|
"step": 22995
|
|
},
|
|
{
|
|
"entropy": 5.015000057220459,
|
|
"epoch": 2.2094140249759846,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045130348274153735,
|
|
"loss": 4.7892,
|
|
"mean_token_accuracy": 0.22779532968997956,
|
|
"num_tokens": 52760520.0,
|
|
"step": 23000
|
|
},
|
|
{
|
|
"entropy": 5.087809896469116,
|
|
"epoch": 2.2098943323727185,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00045128218024626486,
|
|
"loss": 4.7342,
|
|
"mean_token_accuracy": 0.24323214888572692,
|
|
"num_tokens": 52772865.0,
|
|
"step": 23005
|
|
},
|
|
{
|
|
"entropy": 5.111327123641968,
|
|
"epoch": 2.2103746397694524,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004512608736582436,
|
|
"loss": 4.7896,
|
|
"mean_token_accuracy": 0.23642992824316025,
|
|
"num_tokens": 52783935.0,
|
|
"step": 23010
|
|
},
|
|
{
|
|
"entropy": 4.973553323745728,
|
|
"epoch": 2.2108549471661862,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004512395629779682,
|
|
"loss": 4.7307,
|
|
"mean_token_accuracy": 0.2389068379998207,
|
|
"num_tokens": 52794980.0,
|
|
"step": 23015
|
|
},
|
|
{
|
|
"entropy": 5.190898704528808,
|
|
"epoch": 2.21133525456292,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004512182482059335,
|
|
"loss": 4.8966,
|
|
"mean_token_accuracy": 0.21514176428318024,
|
|
"num_tokens": 52806426.0,
|
|
"step": 23020
|
|
},
|
|
{
|
|
"entropy": 5.059572219848633,
|
|
"epoch": 2.211815561959654,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004511969293426343,
|
|
"loss": 4.7195,
|
|
"mean_token_accuracy": 0.2370661735534668,
|
|
"num_tokens": 52816832.0,
|
|
"step": 23025
|
|
},
|
|
{
|
|
"entropy": 5.115974092483521,
|
|
"epoch": 2.212295869356388,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00045117560638856567,
|
|
"loss": 4.7834,
|
|
"mean_token_accuracy": 0.2324156790971756,
|
|
"num_tokens": 52829913.0,
|
|
"step": 23030
|
|
},
|
|
{
|
|
"entropy": 5.101070880889893,
|
|
"epoch": 2.212776176753122,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004511542793442225,
|
|
"loss": 4.8067,
|
|
"mean_token_accuracy": 0.23004190921783446,
|
|
"num_tokens": 52841300.0,
|
|
"step": 23035
|
|
},
|
|
{
|
|
"entropy": 5.085496854782105,
|
|
"epoch": 2.213256484149856,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004511329482101001,
|
|
"loss": 4.7434,
|
|
"mean_token_accuracy": 0.23557529896497725,
|
|
"num_tokens": 52852862.0,
|
|
"step": 23040
|
|
},
|
|
{
|
|
"entropy": 5.185792589187622,
|
|
"epoch": 2.21373679154659,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004511116129866936,
|
|
"loss": 4.8093,
|
|
"mean_token_accuracy": 0.22784344851970673,
|
|
"num_tokens": 52863499.0,
|
|
"step": 23045
|
|
},
|
|
{
|
|
"entropy": 5.111684799194336,
|
|
"epoch": 2.214217098943324,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045109027367449845,
|
|
"loss": 4.8609,
|
|
"mean_token_accuracy": 0.2328698828816414,
|
|
"num_tokens": 52875459.0,
|
|
"step": 23050
|
|
},
|
|
{
|
|
"entropy": 5.057990264892578,
|
|
"epoch": 2.2146974063400577,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045106893027400995,
|
|
"loss": 4.7438,
|
|
"mean_token_accuracy": 0.22991834282875062,
|
|
"num_tokens": 52886758.0,
|
|
"step": 23055
|
|
},
|
|
{
|
|
"entropy": 5.102661371231079,
|
|
"epoch": 2.2151777137367916,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00045104758278572375,
|
|
"loss": 4.7455,
|
|
"mean_token_accuracy": 0.23097250014543533,
|
|
"num_tokens": 52898703.0,
|
|
"step": 23060
|
|
},
|
|
{
|
|
"entropy": 5.148905897140503,
|
|
"epoch": 2.2156580211335255,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004510262312101355,
|
|
"loss": 4.7997,
|
|
"mean_token_accuracy": 0.2216897800564766,
|
|
"num_tokens": 52910455.0,
|
|
"step": 23065
|
|
},
|
|
{
|
|
"entropy": 4.989671325683593,
|
|
"epoch": 2.2161383285302594,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004510048755477407,
|
|
"loss": 4.7085,
|
|
"mean_token_accuracy": 0.23190236687660218,
|
|
"num_tokens": 52921992.0,
|
|
"step": 23070
|
|
},
|
|
{
|
|
"entropy": 5.081862878799439,
|
|
"epoch": 2.2166186359269933,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004509835157990354,
|
|
"loss": 4.8352,
|
|
"mean_token_accuracy": 0.2297719180583954,
|
|
"num_tokens": 52934994.0,
|
|
"step": 23075
|
|
},
|
|
{
|
|
"entropy": 5.142678594589233,
|
|
"epoch": 2.217098943323727,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045096215196451547,
|
|
"loss": 4.895,
|
|
"mean_token_accuracy": 0.2210488885641098,
|
|
"num_tokens": 52945745.0,
|
|
"step": 23080
|
|
},
|
|
{
|
|
"entropy": 5.145126438140869,
|
|
"epoch": 2.217579250720461,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00045094078404467683,
|
|
"loss": 4.8814,
|
|
"mean_token_accuracy": 0.22445845007896423,
|
|
"num_tokens": 52957470.0,
|
|
"step": 23085
|
|
},
|
|
{
|
|
"entropy": 5.179237985610962,
|
|
"epoch": 2.218059558117195,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00045091941204001564,
|
|
"loss": 4.8069,
|
|
"mean_token_accuracy": 0.23417698442935944,
|
|
"num_tokens": 52968584.0,
|
|
"step": 23090
|
|
},
|
|
{
|
|
"entropy": 5.103171491622925,
|
|
"epoch": 2.218539865513929,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000450898035951028,
|
|
"loss": 4.8118,
|
|
"mean_token_accuracy": 0.22522962391376494,
|
|
"num_tokens": 52980187.0,
|
|
"step": 23095
|
|
},
|
|
{
|
|
"entropy": 5.015201663970947,
|
|
"epoch": 2.2190201729106627,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00045087665577821034,
|
|
"loss": 4.7333,
|
|
"mean_token_accuracy": 0.2403994545340538,
|
|
"num_tokens": 52990478.0,
|
|
"step": 23100
|
|
},
|
|
{
|
|
"entropy": 5.108901691436768,
|
|
"epoch": 2.2195004803073966,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000450855271522059,
|
|
"loss": 4.8785,
|
|
"mean_token_accuracy": 0.21689791679382325,
|
|
"num_tokens": 53001130.0,
|
|
"step": 23105
|
|
},
|
|
{
|
|
"entropy": 5.1791112422943115,
|
|
"epoch": 2.219980787704131,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00045083388318307044,
|
|
"loss": 4.8656,
|
|
"mean_token_accuracy": 0.22352044582366942,
|
|
"num_tokens": 53013057.0,
|
|
"step": 23110
|
|
},
|
|
{
|
|
"entropy": 5.1998707294464115,
|
|
"epoch": 2.2204610951008648,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004508124907617411,
|
|
"loss": 4.8818,
|
|
"mean_token_accuracy": 0.2214874416589737,
|
|
"num_tokens": 53024333.0,
|
|
"step": 23115
|
|
},
|
|
{
|
|
"entropy": 5.098461532592774,
|
|
"epoch": 2.2209414024975986,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004507910942585679,
|
|
"loss": 4.801,
|
|
"mean_token_accuracy": 0.22544850260019303,
|
|
"num_tokens": 53036723.0,
|
|
"step": 23120
|
|
},
|
|
{
|
|
"entropy": 5.149092721939087,
|
|
"epoch": 2.2214217098943325,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004507696936740475,
|
|
"loss": 4.8479,
|
|
"mean_token_accuracy": 0.23135247081518173,
|
|
"num_tokens": 53048183.0,
|
|
"step": 23125
|
|
},
|
|
{
|
|
"entropy": 5.089809226989746,
|
|
"epoch": 2.2219020172910664,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004507482890086767,
|
|
"loss": 4.8052,
|
|
"mean_token_accuracy": 0.23036097437143327,
|
|
"num_tokens": 53061467.0,
|
|
"step": 23130
|
|
},
|
|
{
|
|
"entropy": 5.070451068878174,
|
|
"epoch": 2.2223823246878003,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004507268802629525,
|
|
"loss": 4.8054,
|
|
"mean_token_accuracy": 0.23150533586740493,
|
|
"num_tokens": 53072833.0,
|
|
"step": 23135
|
|
},
|
|
{
|
|
"entropy": 5.148936319351196,
|
|
"epoch": 2.222862632084534,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004507054674373719,
|
|
"loss": 4.7687,
|
|
"mean_token_accuracy": 0.22679489552974702,
|
|
"num_tokens": 53084109.0,
|
|
"step": 23140
|
|
},
|
|
{
|
|
"entropy": 5.129045248031616,
|
|
"epoch": 2.223342939481268,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045068405053243216,
|
|
"loss": 4.8048,
|
|
"mean_token_accuracy": 0.2320725902915001,
|
|
"num_tokens": 53095997.0,
|
|
"step": 23145
|
|
},
|
|
{
|
|
"entropy": 5.0516626834869385,
|
|
"epoch": 2.223823246878002,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004506626295486304,
|
|
"loss": 4.7096,
|
|
"mean_token_accuracy": 0.2448731392621994,
|
|
"num_tokens": 53107518.0,
|
|
"step": 23150
|
|
},
|
|
{
|
|
"entropy": 5.1101579666137695,
|
|
"epoch": 2.224303554274736,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045064120448646405,
|
|
"loss": 4.7798,
|
|
"mean_token_accuracy": 0.2291765719652176,
|
|
"num_tokens": 53118908.0,
|
|
"step": 23155
|
|
},
|
|
{
|
|
"entropy": 5.082378721237182,
|
|
"epoch": 2.2247838616714697,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004506197753464304,
|
|
"loss": 4.7706,
|
|
"mean_token_accuracy": 0.23491850942373277,
|
|
"num_tokens": 53129404.0,
|
|
"step": 23160
|
|
},
|
|
{
|
|
"entropy": 5.156738233566284,
|
|
"epoch": 2.2252641690682036,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045059834212902707,
|
|
"loss": 4.8714,
|
|
"mean_token_accuracy": 0.22582435458898545,
|
|
"num_tokens": 53141364.0,
|
|
"step": 23165
|
|
},
|
|
{
|
|
"entropy": 5.176635265350342,
|
|
"epoch": 2.2257444764649374,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00045057690483475167,
|
|
"loss": 4.7811,
|
|
"mean_token_accuracy": 0.23388808816671372,
|
|
"num_tokens": 53153617.0,
|
|
"step": 23170
|
|
},
|
|
{
|
|
"entropy": 5.103824949264526,
|
|
"epoch": 2.2262247838616713,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004505554634641019,
|
|
"loss": 4.8897,
|
|
"mean_token_accuracy": 0.22136110216379165,
|
|
"num_tokens": 53166537.0,
|
|
"step": 23175
|
|
},
|
|
{
|
|
"entropy": 5.135892486572265,
|
|
"epoch": 2.226705091258405,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00045053401801757554,
|
|
"loss": 4.7672,
|
|
"mean_token_accuracy": 0.23385286778211595,
|
|
"num_tokens": 53178979.0,
|
|
"step": 23180
|
|
},
|
|
{
|
|
"entropy": 5.119091510772705,
|
|
"epoch": 2.227185398655139,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00045051256849567054,
|
|
"loss": 4.8117,
|
|
"mean_token_accuracy": 0.23698574751615525,
|
|
"num_tokens": 53190177.0,
|
|
"step": 23185
|
|
},
|
|
{
|
|
"entropy": 5.080839538574219,
|
|
"epoch": 2.227665706051873,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045049111489888486,
|
|
"loss": 4.7824,
|
|
"mean_token_accuracy": 0.2351214364171028,
|
|
"num_tokens": 53201398.0,
|
|
"step": 23190
|
|
},
|
|
{
|
|
"entropy": 5.166760444641113,
|
|
"epoch": 2.2281460134486073,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004504696572277165,
|
|
"loss": 4.8306,
|
|
"mean_token_accuracy": 0.22560259848833084,
|
|
"num_tokens": 53213795.0,
|
|
"step": 23195
|
|
},
|
|
{
|
|
"entropy": 5.140866422653199,
|
|
"epoch": 2.228626320845341,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00045044819548266385,
|
|
"loss": 4.861,
|
|
"mean_token_accuracy": 0.22336962670087815,
|
|
"num_tokens": 53226831.0,
|
|
"step": 23200
|
|
},
|
|
{
|
|
"entropy": 5.182083988189698,
|
|
"epoch": 2.229106628242075,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045042672966422506,
|
|
"loss": 4.872,
|
|
"mean_token_accuracy": 0.22910162359476088,
|
|
"num_tokens": 53238558.0,
|
|
"step": 23205
|
|
},
|
|
{
|
|
"entropy": 5.104340553283691,
|
|
"epoch": 2.229586935638809,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00045040525977289847,
|
|
"loss": 4.7725,
|
|
"mean_token_accuracy": 0.22808566242456435,
|
|
"num_tokens": 53250376.0,
|
|
"step": 23210
|
|
},
|
|
{
|
|
"entropy": 5.119502019882202,
|
|
"epoch": 2.230067243035543,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004503837858091826,
|
|
"loss": 4.7659,
|
|
"mean_token_accuracy": 0.23049113750457764,
|
|
"num_tokens": 53260872.0,
|
|
"step": 23215
|
|
},
|
|
{
|
|
"entropy": 5.084338140487671,
|
|
"epoch": 2.2305475504322767,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00045036230777357604,
|
|
"loss": 4.7652,
|
|
"mean_token_accuracy": 0.23104382902383805,
|
|
"num_tokens": 53271363.0,
|
|
"step": 23220
|
|
},
|
|
{
|
|
"entropy": 5.171426820755005,
|
|
"epoch": 2.2310278578290106,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004503408256665774,
|
|
"loss": 4.9026,
|
|
"mean_token_accuracy": 0.21932850778102875,
|
|
"num_tokens": 53282356.0,
|
|
"step": 23225
|
|
},
|
|
{
|
|
"entropy": 5.074285173416138,
|
|
"epoch": 2.2315081652257445,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00045031933948868545,
|
|
"loss": 4.8338,
|
|
"mean_token_accuracy": 0.2266443893313408,
|
|
"num_tokens": 53293675.0,
|
|
"step": 23230
|
|
},
|
|
{
|
|
"entropy": 5.120794582366943,
|
|
"epoch": 2.2319884726224783,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00045029784924039903,
|
|
"loss": 4.7167,
|
|
"mean_token_accuracy": 0.2397169515490532,
|
|
"num_tokens": 53305193.0,
|
|
"step": 23235
|
|
},
|
|
{
|
|
"entropy": 5.133155250549317,
|
|
"epoch": 2.2324687800192122,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00045027635492221716,
|
|
"loss": 4.7169,
|
|
"mean_token_accuracy": 0.24129508286714554,
|
|
"num_tokens": 53315729.0,
|
|
"step": 23240
|
|
},
|
|
{
|
|
"entropy": 5.038401126861572,
|
|
"epoch": 2.232949087415946,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00045025485653463866,
|
|
"loss": 4.794,
|
|
"mean_token_accuracy": 0.2298990473151207,
|
|
"num_tokens": 53327220.0,
|
|
"step": 23245
|
|
},
|
|
{
|
|
"entropy": 5.081314659118652,
|
|
"epoch": 2.23342939481268,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004502333540781628,
|
|
"loss": 4.8294,
|
|
"mean_token_accuracy": 0.2268001616001129,
|
|
"num_tokens": 53337565.0,
|
|
"step": 23250
|
|
},
|
|
{
|
|
"entropy": 5.1318401336669925,
|
|
"epoch": 2.233909702209414,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004502118475532888,
|
|
"loss": 4.8018,
|
|
"mean_token_accuracy": 0.23005928546190263,
|
|
"num_tokens": 53348791.0,
|
|
"step": 23255
|
|
},
|
|
{
|
|
"entropy": 5.074340200424194,
|
|
"epoch": 2.2343900096061478,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000450190336960516,
|
|
"loss": 4.7488,
|
|
"mean_token_accuracy": 0.23026071041822432,
|
|
"num_tokens": 53361341.0,
|
|
"step": 23260
|
|
},
|
|
{
|
|
"entropy": 5.094004583358765,
|
|
"epoch": 2.2348703170028816,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004501688223003438,
|
|
"loss": 4.828,
|
|
"mean_token_accuracy": 0.22650541067123414,
|
|
"num_tokens": 53372085.0,
|
|
"step": 23265
|
|
},
|
|
{
|
|
"entropy": 5.079095840454102,
|
|
"epoch": 2.235350624399616,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004501473035732717,
|
|
"loss": 4.7845,
|
|
"mean_token_accuracy": 0.2256512776017189,
|
|
"num_tokens": 53383665.0,
|
|
"step": 23270
|
|
},
|
|
{
|
|
"entropy": 5.098389530181885,
|
|
"epoch": 2.23583093179635,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004501257807797993,
|
|
"loss": 4.7929,
|
|
"mean_token_accuracy": 0.22722006291151048,
|
|
"num_tokens": 53394697.0,
|
|
"step": 23275
|
|
},
|
|
{
|
|
"entropy": 5.0698864459991455,
|
|
"epoch": 2.2363112391930837,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00045010425392042624,
|
|
"loss": 4.8061,
|
|
"mean_token_accuracy": 0.22553065419197083,
|
|
"num_tokens": 53405538.0,
|
|
"step": 23280
|
|
},
|
|
{
|
|
"entropy": 5.212454700469971,
|
|
"epoch": 2.2367915465898176,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004500827229956524,
|
|
"loss": 4.903,
|
|
"mean_token_accuracy": 0.22110868543386458,
|
|
"num_tokens": 53416515.0,
|
|
"step": 23285
|
|
},
|
|
{
|
|
"entropy": 5.109039974212647,
|
|
"epoch": 2.2372718539865515,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045006118800597757,
|
|
"loss": 4.7505,
|
|
"mean_token_accuracy": 0.2293446272611618,
|
|
"num_tokens": 53426530.0,
|
|
"step": 23290
|
|
},
|
|
{
|
|
"entropy": 5.094769811630249,
|
|
"epoch": 2.2377521613832854,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00045003964895190177,
|
|
"loss": 4.8575,
|
|
"mean_token_accuracy": 0.2245904505252838,
|
|
"num_tokens": 53438514.0,
|
|
"step": 23295
|
|
},
|
|
{
|
|
"entropy": 5.1014293193817135,
|
|
"epoch": 2.2382324687800192,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004500181058339251,
|
|
"loss": 4.7228,
|
|
"mean_token_accuracy": 0.23853760361671447,
|
|
"num_tokens": 53448845.0,
|
|
"step": 23300
|
|
},
|
|
{
|
|
"entropy": 5.093882274627686,
|
|
"epoch": 2.238712776176753,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004499965586525478,
|
|
"loss": 4.8414,
|
|
"mean_token_accuracy": 0.22516778856515884,
|
|
"num_tokens": 53460526.0,
|
|
"step": 23305
|
|
},
|
|
{
|
|
"entropy": 5.183574533462524,
|
|
"epoch": 2.239193083573487,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044997500740826993,
|
|
"loss": 4.8775,
|
|
"mean_token_accuracy": 0.22742329388856888,
|
|
"num_tokens": 53471997.0,
|
|
"step": 23310
|
|
},
|
|
{
|
|
"entropy": 5.103569459915161,
|
|
"epoch": 2.239673390970221,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000449953452101592,
|
|
"loss": 4.8401,
|
|
"mean_token_accuracy": 0.22768707275390626,
|
|
"num_tokens": 53483097.0,
|
|
"step": 23315
|
|
},
|
|
{
|
|
"entropy": 5.116818475723266,
|
|
"epoch": 2.2401536983669548,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00044993189273301445,
|
|
"loss": 4.7605,
|
|
"mean_token_accuracy": 0.23938217461109162,
|
|
"num_tokens": 53494877.0,
|
|
"step": 23320
|
|
},
|
|
{
|
|
"entropy": 5.0951494693756105,
|
|
"epoch": 2.2406340057636887,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004499103293030377,
|
|
"loss": 4.7768,
|
|
"mean_token_accuracy": 0.2359197422862053,
|
|
"num_tokens": 53507065.0,
|
|
"step": 23325
|
|
},
|
|
{
|
|
"entropy": 5.169115924835205,
|
|
"epoch": 2.2411143131604225,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004498887618121625,
|
|
"loss": 4.8494,
|
|
"mean_token_accuracy": 0.23051951229572296,
|
|
"num_tokens": 53518188.0,
|
|
"step": 23330
|
|
},
|
|
{
|
|
"entropy": 5.06104588508606,
|
|
"epoch": 2.2415946205571564,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004498671902608897,
|
|
"loss": 4.7008,
|
|
"mean_token_accuracy": 0.2351858526468277,
|
|
"num_tokens": 53529872.0,
|
|
"step": 23335
|
|
},
|
|
{
|
|
"entropy": 5.143391942977905,
|
|
"epoch": 2.2420749279538903,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004498456146497198,
|
|
"loss": 4.7767,
|
|
"mean_token_accuracy": 0.22685908675193786,
|
|
"num_tokens": 53540175.0,
|
|
"step": 23340
|
|
},
|
|
{
|
|
"entropy": 5.0613484382629395,
|
|
"epoch": 2.2425552353506246,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00044982403497915405,
|
|
"loss": 4.7051,
|
|
"mean_token_accuracy": 0.23086913526058198,
|
|
"num_tokens": 53552520.0,
|
|
"step": 23345
|
|
},
|
|
{
|
|
"entropy": 5.091251564025879,
|
|
"epoch": 2.2430355427473585,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044980245124969333,
|
|
"loss": 4.7983,
|
|
"mean_token_accuracy": 0.22655535042285918,
|
|
"num_tokens": 53563937.0,
|
|
"step": 23350
|
|
},
|
|
{
|
|
"entropy": 5.100781488418579,
|
|
"epoch": 2.2435158501440924,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004497808634618386,
|
|
"loss": 4.7599,
|
|
"mean_token_accuracy": 0.22517484724521636,
|
|
"num_tokens": 53574654.0,
|
|
"step": 23355
|
|
},
|
|
{
|
|
"entropy": 5.123874092102051,
|
|
"epoch": 2.2439961575408263,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004497592716160913,
|
|
"loss": 4.8158,
|
|
"mean_token_accuracy": 0.22852961719036102,
|
|
"num_tokens": 53586848.0,
|
|
"step": 23360
|
|
},
|
|
{
|
|
"entropy": 5.088836193084717,
|
|
"epoch": 2.24447646493756,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044973767571295273,
|
|
"loss": 4.796,
|
|
"mean_token_accuracy": 0.2306663915514946,
|
|
"num_tokens": 53598200.0,
|
|
"step": 23365
|
|
},
|
|
{
|
|
"entropy": 5.068975734710693,
|
|
"epoch": 2.244956772334294,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004497160757529241,
|
|
"loss": 4.7346,
|
|
"mean_token_accuracy": 0.2349630042910576,
|
|
"num_tokens": 53609957.0,
|
|
"step": 23370
|
|
},
|
|
{
|
|
"entropy": 5.128179311752319,
|
|
"epoch": 2.245437079731028,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044969447173650695,
|
|
"loss": 4.8083,
|
|
"mean_token_accuracy": 0.23668235242366792,
|
|
"num_tokens": 53621029.0,
|
|
"step": 23375
|
|
},
|
|
{
|
|
"entropy": 5.086098146438599,
|
|
"epoch": 2.245917387127762,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000449672863664203,
|
|
"loss": 4.7745,
|
|
"mean_token_accuracy": 0.22932577580213548,
|
|
"num_tokens": 53633279.0,
|
|
"step": 23380
|
|
},
|
|
{
|
|
"entropy": 5.0923463821411135,
|
|
"epoch": 2.2463976945244957,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00044965125153651375,
|
|
"loss": 4.793,
|
|
"mean_token_accuracy": 0.22933094650506974,
|
|
"num_tokens": 53643959.0,
|
|
"step": 23385
|
|
},
|
|
{
|
|
"entropy": 5.102643966674805,
|
|
"epoch": 2.2468780019212296,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004496296353539411,
|
|
"loss": 4.7822,
|
|
"mean_token_accuracy": 0.22918579429388047,
|
|
"num_tokens": 53655215.0,
|
|
"step": 23390
|
|
},
|
|
{
|
|
"entropy": 5.177783489227295,
|
|
"epoch": 2.2473583093179634,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004496080151169869,
|
|
"loss": 4.7781,
|
|
"mean_token_accuracy": 0.23483146280050277,
|
|
"num_tokens": 53666557.0,
|
|
"step": 23395
|
|
},
|
|
{
|
|
"entropy": 5.139918661117553,
|
|
"epoch": 2.2478386167146973,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044958639082615294,
|
|
"loss": 4.8471,
|
|
"mean_token_accuracy": 0.2253044903278351,
|
|
"num_tokens": 53679626.0,
|
|
"step": 23400
|
|
},
|
|
{
|
|
"entropy": 5.118360996246338,
|
|
"epoch": 2.248318924111431,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004495647624819415,
|
|
"loss": 4.7777,
|
|
"mean_token_accuracy": 0.2323940321803093,
|
|
"num_tokens": 53691681.0,
|
|
"step": 23405
|
|
},
|
|
{
|
|
"entropy": 5.150889730453491,
|
|
"epoch": 2.248799231508165,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044954313008485457,
|
|
"loss": 4.7347,
|
|
"mean_token_accuracy": 0.23787372261285783,
|
|
"num_tokens": 53703320.0,
|
|
"step": 23410
|
|
},
|
|
{
|
|
"entropy": 5.032420969009399,
|
|
"epoch": 2.249279538904899,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00044952149363539453,
|
|
"loss": 4.7102,
|
|
"mean_token_accuracy": 0.23317102640867232,
|
|
"num_tokens": 53714867.0,
|
|
"step": 23415
|
|
},
|
|
{
|
|
"entropy": 5.101826238632202,
|
|
"epoch": 2.249759846301633,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004494998531340635,
|
|
"loss": 4.8373,
|
|
"mean_token_accuracy": 0.2332456275820732,
|
|
"num_tokens": 53726121.0,
|
|
"step": 23420
|
|
},
|
|
{
|
|
"entropy": 5.173738813400268,
|
|
"epoch": 2.2502401536983667,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004494782085813641,
|
|
"loss": 4.8415,
|
|
"mean_token_accuracy": 0.22598159462213516,
|
|
"num_tokens": 53736966.0,
|
|
"step": 23425
|
|
},
|
|
{
|
|
"entropy": 5.074115467071533,
|
|
"epoch": 2.250720461095101,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004494565599777988,
|
|
"loss": 4.7125,
|
|
"mean_token_accuracy": 0.24225749671459199,
|
|
"num_tokens": 53747446.0,
|
|
"step": 23430
|
|
},
|
|
{
|
|
"entropy": 5.092739295959473,
|
|
"epoch": 2.251200768491835,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044943490732387025,
|
|
"loss": 4.8496,
|
|
"mean_token_accuracy": 0.2204935595393181,
|
|
"num_tokens": 53759588.0,
|
|
"step": 23435
|
|
},
|
|
{
|
|
"entropy": 5.169422292709351,
|
|
"epoch": 2.251681075888569,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004494132506200811,
|
|
"loss": 4.8258,
|
|
"mean_token_accuracy": 0.23271931260824202,
|
|
"num_tokens": 53770314.0,
|
|
"step": 23440
|
|
},
|
|
{
|
|
"entropy": 5.100150728225708,
|
|
"epoch": 2.2521613832853027,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004493915898669341,
|
|
"loss": 4.7553,
|
|
"mean_token_accuracy": 0.23632270991802215,
|
|
"num_tokens": 53781251.0,
|
|
"step": 23445
|
|
},
|
|
{
|
|
"entropy": 5.1533149719238285,
|
|
"epoch": 2.2526416906820366,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004493699250649323,
|
|
"loss": 4.8618,
|
|
"mean_token_accuracy": 0.2235618367791176,
|
|
"num_tokens": 53792676.0,
|
|
"step": 23450
|
|
},
|
|
{
|
|
"entropy": 5.1419731140136715,
|
|
"epoch": 2.2531219980787704,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004493482562145786,
|
|
"loss": 4.8624,
|
|
"mean_token_accuracy": 0.22552858293056488,
|
|
"num_tokens": 53802714.0,
|
|
"step": 23455
|
|
},
|
|
{
|
|
"entropy": 5.067761516571045,
|
|
"epoch": 2.2536023054755043,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044932658331637605,
|
|
"loss": 4.8241,
|
|
"mean_token_accuracy": 0.2310011625289917,
|
|
"num_tokens": 53815043.0,
|
|
"step": 23460
|
|
},
|
|
{
|
|
"entropy": 5.161076307296753,
|
|
"epoch": 2.254082612872238,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004493049063708279,
|
|
"loss": 4.8072,
|
|
"mean_token_accuracy": 0.22329722046852113,
|
|
"num_tokens": 53827903.0,
|
|
"step": 23465
|
|
},
|
|
{
|
|
"entropy": 5.223559808731079,
|
|
"epoch": 2.254562920268972,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044928322537843746,
|
|
"loss": 4.8834,
|
|
"mean_token_accuracy": 0.22471548467874528,
|
|
"num_tokens": 53839574.0,
|
|
"step": 23470
|
|
},
|
|
{
|
|
"entropy": 5.110046672821045,
|
|
"epoch": 2.255043227665706,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00044926154033970793,
|
|
"loss": 4.7297,
|
|
"mean_token_accuracy": 0.2362123802304268,
|
|
"num_tokens": 53850488.0,
|
|
"step": 23475
|
|
},
|
|
{
|
|
"entropy": 5.1028900146484375,
|
|
"epoch": 2.25552353506244,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000449239851255143,
|
|
"loss": 4.7653,
|
|
"mean_token_accuracy": 0.23202899992465972,
|
|
"num_tokens": 53860766.0,
|
|
"step": 23480
|
|
},
|
|
{
|
|
"entropy": 5.041772985458374,
|
|
"epoch": 2.2560038424591737,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044921815812524606,
|
|
"loss": 4.7769,
|
|
"mean_token_accuracy": 0.22772487998008728,
|
|
"num_tokens": 53872088.0,
|
|
"step": 23485
|
|
},
|
|
{
|
|
"entropy": 5.1281756401062015,
|
|
"epoch": 2.2564841498559076,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044919646095052077,
|
|
"loss": 4.8177,
|
|
"mean_token_accuracy": 0.22846906781196594,
|
|
"num_tokens": 53883302.0,
|
|
"step": 23490
|
|
},
|
|
{
|
|
"entropy": 5.061516618728637,
|
|
"epoch": 2.2569644572526415,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000449174759731471,
|
|
"loss": 4.7135,
|
|
"mean_token_accuracy": 0.2427988812327385,
|
|
"num_tokens": 53894310.0,
|
|
"step": 23495
|
|
},
|
|
{
|
|
"entropy": 5.1370398044586185,
|
|
"epoch": 2.2574447646493754,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00044915305446860046,
|
|
"loss": 4.8432,
|
|
"mean_token_accuracy": 0.22062044441699982,
|
|
"num_tokens": 53906760.0,
|
|
"step": 23500
|
|
},
|
|
{
|
|
"entropy": 5.098146247863769,
|
|
"epoch": 2.2579250720461097,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00044913134516241305,
|
|
"loss": 4.7964,
|
|
"mean_token_accuracy": 0.2295202597975731,
|
|
"num_tokens": 53918758.0,
|
|
"step": 23505
|
|
},
|
|
{
|
|
"entropy": 5.026399803161621,
|
|
"epoch": 2.2584053794428436,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004491096318134129,
|
|
"loss": 4.7536,
|
|
"mean_token_accuracy": 0.2342957004904747,
|
|
"num_tokens": 53932574.0,
|
|
"step": 23510
|
|
},
|
|
{
|
|
"entropy": 5.158463478088379,
|
|
"epoch": 2.2588856868395775,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004490879144221042,
|
|
"loss": 4.8358,
|
|
"mean_token_accuracy": 0.2323979079723358,
|
|
"num_tokens": 53943801.0,
|
|
"step": 23515
|
|
},
|
|
{
|
|
"entropy": 5.069459533691406,
|
|
"epoch": 2.2593659942363113,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00044906619298899097,
|
|
"loss": 4.7704,
|
|
"mean_token_accuracy": 0.24293200224637984,
|
|
"num_tokens": 53955458.0,
|
|
"step": 23520
|
|
},
|
|
{
|
|
"entropy": 5.143928813934326,
|
|
"epoch": 2.2598463016330452,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004490444675145777,
|
|
"loss": 4.8139,
|
|
"mean_token_accuracy": 0.2232506200671196,
|
|
"num_tokens": 53967242.0,
|
|
"step": 23525
|
|
},
|
|
{
|
|
"entropy": 5.126056480407715,
|
|
"epoch": 2.260326609029779,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004490227379993686,
|
|
"loss": 4.8327,
|
|
"mean_token_accuracy": 0.2304875761270523,
|
|
"num_tokens": 53978953.0,
|
|
"step": 23530
|
|
},
|
|
{
|
|
"entropy": 5.167446851730347,
|
|
"epoch": 2.260806916426513,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004490010044438683,
|
|
"loss": 4.8344,
|
|
"mean_token_accuracy": 0.22517274022102357,
|
|
"num_tokens": 53990553.0,
|
|
"step": 23535
|
|
},
|
|
{
|
|
"entropy": 5.10265703201294,
|
|
"epoch": 2.261287223823247,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044897926684858133,
|
|
"loss": 4.819,
|
|
"mean_token_accuracy": 0.23733688592910768,
|
|
"num_tokens": 54002187.0,
|
|
"step": 23540
|
|
},
|
|
{
|
|
"entropy": 5.160494375228882,
|
|
"epoch": 2.2617675312199808,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044895752521401246,
|
|
"loss": 4.7928,
|
|
"mean_token_accuracy": 0.22392333447933196,
|
|
"num_tokens": 54014202.0,
|
|
"step": 23545
|
|
},
|
|
{
|
|
"entropy": 5.08729305267334,
|
|
"epoch": 2.2622478386167146,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004489357795406663,
|
|
"loss": 4.7955,
|
|
"mean_token_accuracy": 0.2341983512043953,
|
|
"num_tokens": 54026286.0,
|
|
"step": 23550
|
|
},
|
|
{
|
|
"entropy": 5.094353675842285,
|
|
"epoch": 2.2627281460134485,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004489140298290479,
|
|
"loss": 4.8109,
|
|
"mean_token_accuracy": 0.22887043058872222,
|
|
"num_tokens": 54038845.0,
|
|
"step": 23555
|
|
},
|
|
{
|
|
"entropy": 5.211902523040772,
|
|
"epoch": 2.2632084534101824,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044889227607966217,
|
|
"loss": 4.8762,
|
|
"mean_token_accuracy": 0.22510544210672379,
|
|
"num_tokens": 54049991.0,
|
|
"step": 23560
|
|
},
|
|
{
|
|
"entropy": 5.100750732421875,
|
|
"epoch": 2.2636887608069163,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044887051829301406,
|
|
"loss": 4.7642,
|
|
"mean_token_accuracy": 0.2320538729429245,
|
|
"num_tokens": 54061408.0,
|
|
"step": 23565
|
|
},
|
|
{
|
|
"entropy": 5.088111400604248,
|
|
"epoch": 2.26416906820365,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044884875646960886,
|
|
"loss": 4.7678,
|
|
"mean_token_accuracy": 0.22809007316827773,
|
|
"num_tokens": 54072670.0,
|
|
"step": 23570
|
|
},
|
|
{
|
|
"entropy": 5.122489023208618,
|
|
"epoch": 2.264649375600384,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044882699060995175,
|
|
"loss": 4.7954,
|
|
"mean_token_accuracy": 0.2249316856265068,
|
|
"num_tokens": 54084555.0,
|
|
"step": 23575
|
|
},
|
|
{
|
|
"entropy": 5.021838665008545,
|
|
"epoch": 2.2651296829971184,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004488052207145481,
|
|
"loss": 4.7386,
|
|
"mean_token_accuracy": 0.23619322329759598,
|
|
"num_tokens": 54096764.0,
|
|
"step": 23580
|
|
},
|
|
{
|
|
"entropy": 5.170163249969482,
|
|
"epoch": 2.2656099903938522,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00044878344678390324,
|
|
"loss": 4.831,
|
|
"mean_token_accuracy": 0.2345489665865898,
|
|
"num_tokens": 54109101.0,
|
|
"step": 23585
|
|
},
|
|
{
|
|
"entropy": 5.196159839630127,
|
|
"epoch": 2.266090297790586,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044876166881852286,
|
|
"loss": 4.8298,
|
|
"mean_token_accuracy": 0.22761160433292388,
|
|
"num_tokens": 54120217.0,
|
|
"step": 23590
|
|
},
|
|
{
|
|
"entropy": 5.139959335327148,
|
|
"epoch": 2.26657060518732,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004487398868189125,
|
|
"loss": 4.8119,
|
|
"mean_token_accuracy": 0.2269844725728035,
|
|
"num_tokens": 54132059.0,
|
|
"step": 23595
|
|
},
|
|
{
|
|
"entropy": 5.175757598876953,
|
|
"epoch": 2.267050912584054,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044871810078557777,
|
|
"loss": 4.8604,
|
|
"mean_token_accuracy": 0.22221217453479766,
|
|
"num_tokens": 54141928.0,
|
|
"step": 23600
|
|
},
|
|
{
|
|
"entropy": 5.15883059501648,
|
|
"epoch": 2.2675312199807878,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004486963107190247,
|
|
"loss": 4.8823,
|
|
"mean_token_accuracy": 0.22258317917585374,
|
|
"num_tokens": 54153052.0,
|
|
"step": 23605
|
|
},
|
|
{
|
|
"entropy": 5.18836932182312,
|
|
"epoch": 2.2680115273775217,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044867451661975894,
|
|
"loss": 4.8245,
|
|
"mean_token_accuracy": 0.22355309426784514,
|
|
"num_tokens": 54163539.0,
|
|
"step": 23610
|
|
},
|
|
{
|
|
"entropy": 5.065151262283325,
|
|
"epoch": 2.2684918347742555,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044865271848828673,
|
|
"loss": 4.7707,
|
|
"mean_token_accuracy": 0.2353520065546036,
|
|
"num_tokens": 54176313.0,
|
|
"step": 23615
|
|
},
|
|
{
|
|
"entropy": 5.156122493743896,
|
|
"epoch": 2.2689721421709894,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000448630916325114,
|
|
"loss": 4.8559,
|
|
"mean_token_accuracy": 0.22095520347356795,
|
|
"num_tokens": 54187080.0,
|
|
"step": 23620
|
|
},
|
|
{
|
|
"entropy": 5.154760360717773,
|
|
"epoch": 2.2694524495677233,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000448609110130747,
|
|
"loss": 4.8292,
|
|
"mean_token_accuracy": 0.23027612417936325,
|
|
"num_tokens": 54197856.0,
|
|
"step": 23625
|
|
},
|
|
{
|
|
"entropy": 5.1559305667877195,
|
|
"epoch": 2.269932756964457,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00044858729990569193,
|
|
"loss": 4.8263,
|
|
"mean_token_accuracy": 0.22898895889520646,
|
|
"num_tokens": 54210865.0,
|
|
"step": 23630
|
|
},
|
|
{
|
|
"entropy": 5.0686627388000485,
|
|
"epoch": 2.270413064361191,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00044856548565045523,
|
|
"loss": 4.7199,
|
|
"mean_token_accuracy": 0.22956303358078003,
|
|
"num_tokens": 54222719.0,
|
|
"step": 23635
|
|
},
|
|
{
|
|
"entropy": 5.11489691734314,
|
|
"epoch": 2.270893371757925,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044854366736554323,
|
|
"loss": 4.7648,
|
|
"mean_token_accuracy": 0.23259628117084502,
|
|
"num_tokens": 54233874.0,
|
|
"step": 23640
|
|
},
|
|
{
|
|
"entropy": 5.0634620666503904,
|
|
"epoch": 2.271373679154659,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044852184505146274,
|
|
"loss": 4.6935,
|
|
"mean_token_accuracy": 0.2324426457285881,
|
|
"num_tokens": 54245966.0,
|
|
"step": 23645
|
|
},
|
|
{
|
|
"entropy": 5.093727016448975,
|
|
"epoch": 2.2718539865513927,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004485000187087202,
|
|
"loss": 4.7653,
|
|
"mean_token_accuracy": 0.23520055264234543,
|
|
"num_tokens": 54256468.0,
|
|
"step": 23650
|
|
},
|
|
{
|
|
"entropy": 5.049545001983643,
|
|
"epoch": 2.272334293948127,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004484781883378224,
|
|
"loss": 4.7044,
|
|
"mean_token_accuracy": 0.2259022116661072,
|
|
"num_tokens": 54266362.0,
|
|
"step": 23655
|
|
},
|
|
{
|
|
"entropy": 5.10016827583313,
|
|
"epoch": 2.2728146013448605,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044845635393927623,
|
|
"loss": 4.8972,
|
|
"mean_token_accuracy": 0.21926648616790773,
|
|
"num_tokens": 54278365.0,
|
|
"step": 23660
|
|
},
|
|
{
|
|
"entropy": 5.151840925216675,
|
|
"epoch": 2.273294908741595,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004484345155135886,
|
|
"loss": 4.8911,
|
|
"mean_token_accuracy": 0.22861198633909224,
|
|
"num_tokens": 54289310.0,
|
|
"step": 23665
|
|
},
|
|
{
|
|
"entropy": 5.096772718429565,
|
|
"epoch": 2.2737752161383287,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004484126730612664,
|
|
"loss": 4.7457,
|
|
"mean_token_accuracy": 0.2337497740983963,
|
|
"num_tokens": 54300421.0,
|
|
"step": 23670
|
|
},
|
|
{
|
|
"entropy": 5.0949928760528564,
|
|
"epoch": 2.2742555235350626,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000448390826582817,
|
|
"loss": 4.8159,
|
|
"mean_token_accuracy": 0.2337231382727623,
|
|
"num_tokens": 54312803.0,
|
|
"step": 23675
|
|
},
|
|
{
|
|
"entropy": 5.162675762176514,
|
|
"epoch": 2.2747358309317964,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00044836897607874744,
|
|
"loss": 4.8348,
|
|
"mean_token_accuracy": 0.23068183213472365,
|
|
"num_tokens": 54324651.0,
|
|
"step": 23680
|
|
},
|
|
{
|
|
"entropy": 5.091884756088257,
|
|
"epoch": 2.2752161383285303,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000448347121549565,
|
|
"loss": 4.7145,
|
|
"mean_token_accuracy": 0.24177918434143067,
|
|
"num_tokens": 54334841.0,
|
|
"step": 23685
|
|
},
|
|
{
|
|
"entropy": 5.127256727218628,
|
|
"epoch": 2.275696445725264,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004483252629957771,
|
|
"loss": 4.7806,
|
|
"mean_token_accuracy": 0.23010292500257493,
|
|
"num_tokens": 54346685.0,
|
|
"step": 23690
|
|
},
|
|
{
|
|
"entropy": 5.068298482894898,
|
|
"epoch": 2.276176753121998,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00044830340041789133,
|
|
"loss": 4.8236,
|
|
"mean_token_accuracy": 0.2222338065505028,
|
|
"num_tokens": 54358710.0,
|
|
"step": 23695
|
|
},
|
|
{
|
|
"entropy": 5.092076301574707,
|
|
"epoch": 2.276657060518732,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004482815338164152,
|
|
"loss": 4.8428,
|
|
"mean_token_accuracy": 0.23226935118436814,
|
|
"num_tokens": 54369695.0,
|
|
"step": 23700
|
|
},
|
|
{
|
|
"entropy": 5.104079532623291,
|
|
"epoch": 2.277137367915466,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004482596631918564,
|
|
"loss": 4.7529,
|
|
"mean_token_accuracy": 0.23143748939037323,
|
|
"num_tokens": 54381605.0,
|
|
"step": 23705
|
|
},
|
|
{
|
|
"entropy": 5.129797792434692,
|
|
"epoch": 2.2776176753121997,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044823778854472267,
|
|
"loss": 4.7956,
|
|
"mean_token_accuracy": 0.23360486328601837,
|
|
"num_tokens": 54392309.0,
|
|
"step": 23710
|
|
},
|
|
{
|
|
"entropy": 5.115341234207153,
|
|
"epoch": 2.2780979827089336,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004482159098755219,
|
|
"loss": 4.8227,
|
|
"mean_token_accuracy": 0.2333234429359436,
|
|
"num_tokens": 54403195.0,
|
|
"step": 23715
|
|
},
|
|
{
|
|
"entropy": 5.061098051071167,
|
|
"epoch": 2.2785782901056675,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004481940271847621,
|
|
"loss": 4.7589,
|
|
"mean_token_accuracy": 0.23784710019826888,
|
|
"num_tokens": 54415105.0,
|
|
"step": 23720
|
|
},
|
|
{
|
|
"entropy": 5.095363569259644,
|
|
"epoch": 2.2790585975024014,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004481721404729513,
|
|
"loss": 4.7834,
|
|
"mean_token_accuracy": 0.2298209086060524,
|
|
"num_tokens": 54425798.0,
|
|
"step": 23725
|
|
},
|
|
{
|
|
"entropy": 5.179401540756226,
|
|
"epoch": 2.2795389048991357,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004481502497405975,
|
|
"loss": 4.8219,
|
|
"mean_token_accuracy": 0.2277689814567566,
|
|
"num_tokens": 54437254.0,
|
|
"step": 23730
|
|
},
|
|
{
|
|
"entropy": 5.160029888153076,
|
|
"epoch": 2.280019212295869,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004481283549882091,
|
|
"loss": 4.7782,
|
|
"mean_token_accuracy": 0.2246842786669731,
|
|
"num_tokens": 54448152.0,
|
|
"step": 23735
|
|
},
|
|
{
|
|
"entropy": 5.111397838592529,
|
|
"epoch": 2.2804995196926034,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044810645621629443,
|
|
"loss": 4.8327,
|
|
"mean_token_accuracy": 0.22581015527248383,
|
|
"num_tokens": 54460066.0,
|
|
"step": 23740
|
|
},
|
|
{
|
|
"entropy": 5.1661521911621096,
|
|
"epoch": 2.2809798270893373,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044808455342536176,
|
|
"loss": 4.8551,
|
|
"mean_token_accuracy": 0.22923391908407212,
|
|
"num_tokens": 54471769.0,
|
|
"step": 23745
|
|
},
|
|
{
|
|
"entropy": 5.098841714859009,
|
|
"epoch": 2.281460134486071,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044806264661591976,
|
|
"loss": 4.6924,
|
|
"mean_token_accuracy": 0.24215531200170518,
|
|
"num_tokens": 54483630.0,
|
|
"step": 23750
|
|
},
|
|
{
|
|
"entropy": 5.086545419692993,
|
|
"epoch": 2.281940441882805,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004480407357884771,
|
|
"loss": 4.7871,
|
|
"mean_token_accuracy": 0.22741892635822297,
|
|
"num_tokens": 54495696.0,
|
|
"step": 23755
|
|
},
|
|
{
|
|
"entropy": 5.096376657485962,
|
|
"epoch": 2.282420749279539,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044801882094354226,
|
|
"loss": 4.7937,
|
|
"mean_token_accuracy": 0.22762938886880874,
|
|
"num_tokens": 54507153.0,
|
|
"step": 23760
|
|
},
|
|
{
|
|
"entropy": 5.191448068618774,
|
|
"epoch": 2.282901056676273,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004479969020816242,
|
|
"loss": 4.8577,
|
|
"mean_token_accuracy": 0.227578766644001,
|
|
"num_tokens": 54519922.0,
|
|
"step": 23765
|
|
},
|
|
{
|
|
"entropy": 5.086835479736328,
|
|
"epoch": 2.2833813640730067,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00044797497920323175,
|
|
"loss": 4.7584,
|
|
"mean_token_accuracy": 0.2371147409081459,
|
|
"num_tokens": 54531857.0,
|
|
"step": 23770
|
|
},
|
|
{
|
|
"entropy": 5.093832492828369,
|
|
"epoch": 2.2838616714697406,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004479530523088739,
|
|
"loss": 4.7632,
|
|
"mean_token_accuracy": 0.24112324118614198,
|
|
"num_tokens": 54542079.0,
|
|
"step": 23775
|
|
},
|
|
{
|
|
"entropy": 5.015680027008057,
|
|
"epoch": 2.2843419788664745,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004479311213990598,
|
|
"loss": 4.7534,
|
|
"mean_token_accuracy": 0.23263996243476867,
|
|
"num_tokens": 54552722.0,
|
|
"step": 23780
|
|
},
|
|
{
|
|
"entropy": 5.055504846572876,
|
|
"epoch": 2.2848222862632084,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044790918647429854,
|
|
"loss": 4.753,
|
|
"mean_token_accuracy": 0.23083293735980986,
|
|
"num_tokens": 54564224.0,
|
|
"step": 23785
|
|
},
|
|
{
|
|
"entropy": 5.142749786376953,
|
|
"epoch": 2.2853025936599423,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044788724753509935,
|
|
"loss": 4.731,
|
|
"mean_token_accuracy": 0.23213585019111632,
|
|
"num_tokens": 54575553.0,
|
|
"step": 23790
|
|
},
|
|
{
|
|
"entropy": 5.122098827362061,
|
|
"epoch": 2.285782901056676,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004478653045819717,
|
|
"loss": 4.9048,
|
|
"mean_token_accuracy": 0.22266788631677628,
|
|
"num_tokens": 54587315.0,
|
|
"step": 23795
|
|
},
|
|
{
|
|
"entropy": 5.168012189865112,
|
|
"epoch": 2.28626320845341,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004478433576154249,
|
|
"loss": 4.817,
|
|
"mean_token_accuracy": 0.23212073296308516,
|
|
"num_tokens": 54598776.0,
|
|
"step": 23800
|
|
},
|
|
{
|
|
"entropy": 5.197915554046631,
|
|
"epoch": 2.286743515850144,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004478214066359687,
|
|
"loss": 4.8842,
|
|
"mean_token_accuracy": 0.2148657962679863,
|
|
"num_tokens": 54610272.0,
|
|
"step": 23805
|
|
},
|
|
{
|
|
"entropy": 5.140944910049439,
|
|
"epoch": 2.287223823246878,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044779945164411254,
|
|
"loss": 4.8522,
|
|
"mean_token_accuracy": 0.22315765023231507,
|
|
"num_tokens": 54622105.0,
|
|
"step": 23810
|
|
},
|
|
{
|
|
"entropy": 5.1300232887268065,
|
|
"epoch": 2.287704130643612,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004477774926403662,
|
|
"loss": 4.8058,
|
|
"mean_token_accuracy": 0.23476217389106752,
|
|
"num_tokens": 54631771.0,
|
|
"step": 23815
|
|
},
|
|
{
|
|
"entropy": 5.058514976501465,
|
|
"epoch": 2.288184438040346,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004477555296252396,
|
|
"loss": 4.7358,
|
|
"mean_token_accuracy": 0.23375988900661468,
|
|
"num_tokens": 54642724.0,
|
|
"step": 23820
|
|
},
|
|
{
|
|
"entropy": 5.0570252418518065,
|
|
"epoch": 2.28866474543708,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044773356259924255,
|
|
"loss": 4.7505,
|
|
"mean_token_accuracy": 0.2327996626496315,
|
|
"num_tokens": 54655126.0,
|
|
"step": 23825
|
|
},
|
|
{
|
|
"entropy": 5.049631500244141,
|
|
"epoch": 2.2891450528338138,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00044771159156288505,
|
|
"loss": 4.7099,
|
|
"mean_token_accuracy": 0.23850573152303695,
|
|
"num_tokens": 54665334.0,
|
|
"step": 23830
|
|
},
|
|
{
|
|
"entropy": 5.051522731781006,
|
|
"epoch": 2.2896253602305476,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004476896165166772,
|
|
"loss": 4.7049,
|
|
"mean_token_accuracy": 0.23670212179422379,
|
|
"num_tokens": 54676205.0,
|
|
"step": 23835
|
|
},
|
|
{
|
|
"entropy": 5.1268744468688965,
|
|
"epoch": 2.2901056676272815,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044766763746112936,
|
|
"loss": 4.8099,
|
|
"mean_token_accuracy": 0.23005885928869246,
|
|
"num_tokens": 54686586.0,
|
|
"step": 23840
|
|
},
|
|
{
|
|
"entropy": 5.132952785491943,
|
|
"epoch": 2.2905859750240154,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004476456543967517,
|
|
"loss": 4.7902,
|
|
"mean_token_accuracy": 0.23009070456027986,
|
|
"num_tokens": 54698879.0,
|
|
"step": 23845
|
|
},
|
|
{
|
|
"entropy": 5.073030281066894,
|
|
"epoch": 2.2910662824207493,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00044762366732405454,
|
|
"loss": 4.765,
|
|
"mean_token_accuracy": 0.23329982608556749,
|
|
"num_tokens": 54710140.0,
|
|
"step": 23850
|
|
},
|
|
{
|
|
"entropy": 5.046080446243286,
|
|
"epoch": 2.291546589817483,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004476016762435484,
|
|
"loss": 4.6823,
|
|
"mean_token_accuracy": 0.23559171855449676,
|
|
"num_tokens": 54721317.0,
|
|
"step": 23855
|
|
},
|
|
{
|
|
"entropy": 5.207999563217163,
|
|
"epoch": 2.292026897214217,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004475796811557439,
|
|
"loss": 4.9528,
|
|
"mean_token_accuracy": 0.21821181774139403,
|
|
"num_tokens": 54733329.0,
|
|
"step": 23860
|
|
},
|
|
{
|
|
"entropy": 5.176177167892456,
|
|
"epoch": 2.292507204610951,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044755768206115155,
|
|
"loss": 4.8303,
|
|
"mean_token_accuracy": 0.220322947204113,
|
|
"num_tokens": 54745230.0,
|
|
"step": 23865
|
|
},
|
|
{
|
|
"entropy": 5.09772367477417,
|
|
"epoch": 2.292987512007685,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004475356789602823,
|
|
"loss": 4.8181,
|
|
"mean_token_accuracy": 0.2298712059855461,
|
|
"num_tokens": 54757428.0,
|
|
"step": 23870
|
|
},
|
|
{
|
|
"entropy": 5.073602104187012,
|
|
"epoch": 2.2934678194044187,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044751367185364696,
|
|
"loss": 4.6947,
|
|
"mean_token_accuracy": 0.2402465119957924,
|
|
"num_tokens": 54768886.0,
|
|
"step": 23875
|
|
},
|
|
{
|
|
"entropy": 5.1595619201660154,
|
|
"epoch": 2.2939481268011526,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044749166074175634,
|
|
"loss": 4.9136,
|
|
"mean_token_accuracy": 0.2256806194782257,
|
|
"num_tokens": 54780243.0,
|
|
"step": 23880
|
|
},
|
|
{
|
|
"entropy": 5.073761415481568,
|
|
"epoch": 2.2944284341978864,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044746964562512154,
|
|
"loss": 4.69,
|
|
"mean_token_accuracy": 0.2334916412830353,
|
|
"num_tokens": 54792462.0,
|
|
"step": 23885
|
|
},
|
|
{
|
|
"entropy": 5.176160907745361,
|
|
"epoch": 2.2949087415946208,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00044744762650425376,
|
|
"loss": 4.8299,
|
|
"mean_token_accuracy": 0.22740929424762726,
|
|
"num_tokens": 54804412.0,
|
|
"step": 23890
|
|
},
|
|
{
|
|
"entropy": 5.115084457397461,
|
|
"epoch": 2.2953890489913547,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00044742560337966415,
|
|
"loss": 4.7857,
|
|
"mean_token_accuracy": 0.23885392695665358,
|
|
"num_tokens": 54814910.0,
|
|
"step": 23895
|
|
},
|
|
{
|
|
"entropy": 5.088543176651001,
|
|
"epoch": 2.2958693563880885,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000447403576251864,
|
|
"loss": 4.7708,
|
|
"mean_token_accuracy": 0.23376950323581697,
|
|
"num_tokens": 54826500.0,
|
|
"step": 23900
|
|
},
|
|
{
|
|
"entropy": 5.084391689300537,
|
|
"epoch": 2.2963496637848224,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004473815451213648,
|
|
"loss": 4.777,
|
|
"mean_token_accuracy": 0.22784235030412675,
|
|
"num_tokens": 54837682.0,
|
|
"step": 23905
|
|
},
|
|
{
|
|
"entropy": 5.085320568084716,
|
|
"epoch": 2.2968299711815563,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004473595099886779,
|
|
"loss": 4.7319,
|
|
"mean_token_accuracy": 0.23291932046413422,
|
|
"num_tokens": 54848937.0,
|
|
"step": 23910
|
|
},
|
|
{
|
|
"entropy": 5.1188312530517575,
|
|
"epoch": 2.29731027857829,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004473374708543151,
|
|
"loss": 4.6921,
|
|
"mean_token_accuracy": 0.2435552567243576,
|
|
"num_tokens": 54859384.0,
|
|
"step": 23915
|
|
},
|
|
{
|
|
"entropy": 5.03520393371582,
|
|
"epoch": 2.297790585975024,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004473154277187878,
|
|
"loss": 4.7073,
|
|
"mean_token_accuracy": 0.24123682230710983,
|
|
"num_tokens": 54871280.0,
|
|
"step": 23920
|
|
},
|
|
{
|
|
"entropy": 5.054597043991089,
|
|
"epoch": 2.298270893371758,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044729338058260805,
|
|
"loss": 4.6948,
|
|
"mean_token_accuracy": 0.2395238071680069,
|
|
"num_tokens": 54882699.0,
|
|
"step": 23925
|
|
},
|
|
{
|
|
"entropy": 5.184109544754028,
|
|
"epoch": 2.298751200768492,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004472713294462876,
|
|
"loss": 4.841,
|
|
"mean_token_accuracy": 0.2287002757191658,
|
|
"num_tokens": 54894041.0,
|
|
"step": 23930
|
|
},
|
|
{
|
|
"entropy": 5.16756534576416,
|
|
"epoch": 2.2992315081652257,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044724927431033843,
|
|
"loss": 4.8109,
|
|
"mean_token_accuracy": 0.22440232038497926,
|
|
"num_tokens": 54905007.0,
|
|
"step": 23935
|
|
},
|
|
{
|
|
"entropy": 5.154220914840698,
|
|
"epoch": 2.2997118155619596,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004472272151752725,
|
|
"loss": 4.8183,
|
|
"mean_token_accuracy": 0.2311574250459671,
|
|
"num_tokens": 54915735.0,
|
|
"step": 23940
|
|
},
|
|
{
|
|
"entropy": 5.02349214553833,
|
|
"epoch": 2.3001921229586935,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004472051520416022,
|
|
"loss": 4.6565,
|
|
"mean_token_accuracy": 0.235342113673687,
|
|
"num_tokens": 54926350.0,
|
|
"step": 23945
|
|
},
|
|
{
|
|
"entropy": 5.0691643238067625,
|
|
"epoch": 2.3006724303554273,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004471830849098395,
|
|
"loss": 4.8175,
|
|
"mean_token_accuracy": 0.22720725387334822,
|
|
"num_tokens": 54938354.0,
|
|
"step": 23950
|
|
},
|
|
{
|
|
"entropy": 5.146879768371582,
|
|
"epoch": 2.3011527377521612,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044716101378049683,
|
|
"loss": 4.8074,
|
|
"mean_token_accuracy": 0.2259829819202423,
|
|
"num_tokens": 54949440.0,
|
|
"step": 23955
|
|
},
|
|
{
|
|
"entropy": 5.15836272239685,
|
|
"epoch": 2.301633045148895,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044713893865408667,
|
|
"loss": 4.7659,
|
|
"mean_token_accuracy": 0.2332813635468483,
|
|
"num_tokens": 54961751.0,
|
|
"step": 23960
|
|
},
|
|
{
|
|
"entropy": 5.211078453063965,
|
|
"epoch": 2.3021133525456294,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004471168595311215,
|
|
"loss": 4.9402,
|
|
"mean_token_accuracy": 0.21601903587579727,
|
|
"num_tokens": 54975415.0,
|
|
"step": 23965
|
|
},
|
|
{
|
|
"entropy": 5.160834455490113,
|
|
"epoch": 2.302593659942363,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044709477641211395,
|
|
"loss": 4.7611,
|
|
"mean_token_accuracy": 0.22833193838596344,
|
|
"num_tokens": 54986599.0,
|
|
"step": 23970
|
|
},
|
|
{
|
|
"entropy": 5.120666885375977,
|
|
"epoch": 2.303073967339097,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004470726892975767,
|
|
"loss": 4.7707,
|
|
"mean_token_accuracy": 0.23147224336862565,
|
|
"num_tokens": 54997428.0,
|
|
"step": 23975
|
|
},
|
|
{
|
|
"entropy": 5.139890432357788,
|
|
"epoch": 2.303554274735831,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00044705059818802255,
|
|
"loss": 4.8604,
|
|
"mean_token_accuracy": 0.21991382539272308,
|
|
"num_tokens": 55008434.0,
|
|
"step": 23980
|
|
},
|
|
{
|
|
"entropy": 5.044625473022461,
|
|
"epoch": 2.304034582132565,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004470285030839644,
|
|
"loss": 4.7534,
|
|
"mean_token_accuracy": 0.2294306129217148,
|
|
"num_tokens": 55019834.0,
|
|
"step": 23985
|
|
},
|
|
{
|
|
"entropy": 5.145654344558716,
|
|
"epoch": 2.304514889529299,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044700640398591526,
|
|
"loss": 4.7596,
|
|
"mean_token_accuracy": 0.23586667478084564,
|
|
"num_tokens": 55030432.0,
|
|
"step": 23990
|
|
},
|
|
{
|
|
"entropy": 5.18192458152771,
|
|
"epoch": 2.3049951969260327,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004469843008943881,
|
|
"loss": 4.8233,
|
|
"mean_token_accuracy": 0.2283098965883255,
|
|
"num_tokens": 55041482.0,
|
|
"step": 23995
|
|
},
|
|
{
|
|
"entropy": 5.060428190231323,
|
|
"epoch": 2.3054755043227666,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004469621938098962,
|
|
"loss": 4.683,
|
|
"mean_token_accuracy": 0.23807364255189895,
|
|
"num_tokens": 55051894.0,
|
|
"step": 24000
|
|
},
|
|
{
|
|
"epoch": 2.3054755043227666,
|
|
"eval_entropy": 4.882824030810598,
|
|
"eval_loss": 4.924381732940674,
|
|
"eval_mean_token_accuracy": 0.23270893815826754,
|
|
"eval_num_tokens": 55051894.0,
|
|
"eval_runtime": 26.627,
|
|
"eval_samples_per_second": 1232.394,
|
|
"eval_steps_per_second": 154.054,
|
|
"step": 24000
|
|
},
|
|
{
|
|
"entropy": 5.088639545440674,
|
|
"epoch": 2.3059558117195005,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004469400827329528,
|
|
"loss": 4.8239,
|
|
"mean_token_accuracy": 0.23225966691970826,
|
|
"num_tokens": 55063907.0,
|
|
"step": 24005
|
|
},
|
|
{
|
|
"entropy": 5.140566349029541,
|
|
"epoch": 2.3064361191162344,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004469179676640712,
|
|
"loss": 4.7989,
|
|
"mean_token_accuracy": 0.22386384904384612,
|
|
"num_tokens": 55076004.0,
|
|
"step": 24010
|
|
},
|
|
{
|
|
"entropy": 5.029444885253906,
|
|
"epoch": 2.3069164265129682,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004468958486037649,
|
|
"loss": 4.6965,
|
|
"mean_token_accuracy": 0.24416131228208543,
|
|
"num_tokens": 55087506.0,
|
|
"step": 24015
|
|
},
|
|
{
|
|
"entropy": 5.067629337310791,
|
|
"epoch": 2.307396733909702,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004468737255525474,
|
|
"loss": 4.7896,
|
|
"mean_token_accuracy": 0.22917503416538237,
|
|
"num_tokens": 55099406.0,
|
|
"step": 24020
|
|
},
|
|
{
|
|
"entropy": 5.218100070953369,
|
|
"epoch": 2.307877041306436,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004468515985109324,
|
|
"loss": 4.8918,
|
|
"mean_token_accuracy": 0.219009730219841,
|
|
"num_tokens": 55111733.0,
|
|
"step": 24025
|
|
},
|
|
{
|
|
"entropy": 5.145392227172851,
|
|
"epoch": 2.30835734870317,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004468294674794335,
|
|
"loss": 4.7831,
|
|
"mean_token_accuracy": 0.22559967041015624,
|
|
"num_tokens": 55123455.0,
|
|
"step": 24030
|
|
},
|
|
{
|
|
"entropy": 5.125941610336303,
|
|
"epoch": 2.3088376560999038,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004468073324585647,
|
|
"loss": 4.7914,
|
|
"mean_token_accuracy": 0.22934773564338684,
|
|
"num_tokens": 55134284.0,
|
|
"step": 24035
|
|
},
|
|
{
|
|
"entropy": 5.141989803314209,
|
|
"epoch": 2.309317963496638,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004467851934488398,
|
|
"loss": 4.8032,
|
|
"mean_token_accuracy": 0.22890194356441498,
|
|
"num_tokens": 55147224.0,
|
|
"step": 24040
|
|
},
|
|
{
|
|
"entropy": 5.089644145965576,
|
|
"epoch": 2.3097982708933715,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004467630504507727,
|
|
"loss": 4.7432,
|
|
"mean_token_accuracy": 0.2306762605905533,
|
|
"num_tokens": 55158144.0,
|
|
"step": 24045
|
|
},
|
|
{
|
|
"entropy": 5.090526151657104,
|
|
"epoch": 2.310278578290106,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004467409034648776,
|
|
"loss": 4.8517,
|
|
"mean_token_accuracy": 0.23368075489997864,
|
|
"num_tokens": 55171522.0,
|
|
"step": 24050
|
|
},
|
|
{
|
|
"entropy": 5.121523189544678,
|
|
"epoch": 2.3107588856868397,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004467187524916688,
|
|
"loss": 4.7947,
|
|
"mean_token_accuracy": 0.23201913088560105,
|
|
"num_tokens": 55182097.0,
|
|
"step": 24055
|
|
},
|
|
{
|
|
"entropy": 5.181788063049316,
|
|
"epoch": 2.3112391930835736,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004466965975316604,
|
|
"loss": 4.8179,
|
|
"mean_token_accuracy": 0.21985867619514465,
|
|
"num_tokens": 55192287.0,
|
|
"step": 24060
|
|
},
|
|
{
|
|
"entropy": 5.124038362503052,
|
|
"epoch": 2.3117195004803075,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044667443858536685,
|
|
"loss": 4.8373,
|
|
"mean_token_accuracy": 0.23194748163223267,
|
|
"num_tokens": 55204049.0,
|
|
"step": 24065
|
|
},
|
|
{
|
|
"entropy": 5.010155916213989,
|
|
"epoch": 2.3121998078770414,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004466522756533026,
|
|
"loss": 4.6349,
|
|
"mean_token_accuracy": 0.24204261302948,
|
|
"num_tokens": 55214969.0,
|
|
"step": 24070
|
|
},
|
|
{
|
|
"entropy": 5.138979721069336,
|
|
"epoch": 2.3126801152737753,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004466301087359822,
|
|
"loss": 4.8743,
|
|
"mean_token_accuracy": 0.2311398297548294,
|
|
"num_tokens": 55226362.0,
|
|
"step": 24075
|
|
},
|
|
{
|
|
"entropy": 5.224458885192871,
|
|
"epoch": 2.313160422670509,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00044660793783392035,
|
|
"loss": 4.8783,
|
|
"mean_token_accuracy": 0.225470569729805,
|
|
"num_tokens": 55237345.0,
|
|
"step": 24080
|
|
},
|
|
{
|
|
"entropy": 5.187859106063843,
|
|
"epoch": 2.313640730067243,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004465857629476317,
|
|
"loss": 4.8906,
|
|
"mean_token_accuracy": 0.22839065492153168,
|
|
"num_tokens": 55248426.0,
|
|
"step": 24085
|
|
},
|
|
{
|
|
"entropy": 5.087028169631958,
|
|
"epoch": 2.314121037463977,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004465635840776312,
|
|
"loss": 4.8218,
|
|
"mean_token_accuracy": 0.229251691699028,
|
|
"num_tokens": 55259017.0,
|
|
"step": 24090
|
|
},
|
|
{
|
|
"entropy": 5.154250049591065,
|
|
"epoch": 2.314601344860711,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044654140122443373,
|
|
"loss": 4.8562,
|
|
"mean_token_accuracy": 0.22954557836055756,
|
|
"num_tokens": 55270074.0,
|
|
"step": 24095
|
|
},
|
|
{
|
|
"entropy": 5.123119354248047,
|
|
"epoch": 2.3150816522574447,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004465192143885542,
|
|
"loss": 4.7823,
|
|
"mean_token_accuracy": 0.23124384135007858,
|
|
"num_tokens": 55282291.0,
|
|
"step": 24100
|
|
},
|
|
{
|
|
"entropy": 5.168704271316528,
|
|
"epoch": 2.3155619596541785,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044649702357050787,
|
|
"loss": 4.9123,
|
|
"mean_token_accuracy": 0.22692507654428482,
|
|
"num_tokens": 55293113.0,
|
|
"step": 24105
|
|
},
|
|
{
|
|
"entropy": 5.142979717254638,
|
|
"epoch": 2.3160422670509124,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004464748287708099,
|
|
"loss": 4.8226,
|
|
"mean_token_accuracy": 0.22785865962505342,
|
|
"num_tokens": 55304182.0,
|
|
"step": 24110
|
|
},
|
|
{
|
|
"entropy": 5.087536096572876,
|
|
"epoch": 2.3165225744476463,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00044645262998997557,
|
|
"loss": 4.7536,
|
|
"mean_token_accuracy": 0.2385023683309555,
|
|
"num_tokens": 55315292.0,
|
|
"step": 24115
|
|
},
|
|
{
|
|
"entropy": 5.09465913772583,
|
|
"epoch": 2.31700288184438,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044643042722852024,
|
|
"loss": 4.7532,
|
|
"mean_token_accuracy": 0.2335251748561859,
|
|
"num_tokens": 55327000.0,
|
|
"step": 24120
|
|
},
|
|
{
|
|
"entropy": 5.059976387023926,
|
|
"epoch": 2.3174831892411145,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004464082204869595,
|
|
"loss": 4.7052,
|
|
"mean_token_accuracy": 0.24020502716302872,
|
|
"num_tokens": 55337506.0,
|
|
"step": 24125
|
|
},
|
|
{
|
|
"entropy": 5.041950273513794,
|
|
"epoch": 2.3179634966378484,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004463860097658088,
|
|
"loss": 4.6782,
|
|
"mean_token_accuracy": 0.23825192600488662,
|
|
"num_tokens": 55348538.0,
|
|
"step": 24130
|
|
},
|
|
{
|
|
"entropy": 5.19771637916565,
|
|
"epoch": 2.3184438040345823,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004463637950655839,
|
|
"loss": 4.8208,
|
|
"mean_token_accuracy": 0.2281821385025978,
|
|
"num_tokens": 55360501.0,
|
|
"step": 24135
|
|
},
|
|
{
|
|
"entropy": 5.0973663330078125,
|
|
"epoch": 2.318924111431316,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00044634157638680054,
|
|
"loss": 4.7656,
|
|
"mean_token_accuracy": 0.2324952080845833,
|
|
"num_tokens": 55371616.0,
|
|
"step": 24140
|
|
},
|
|
{
|
|
"entropy": 5.11321268081665,
|
|
"epoch": 2.31940441882805,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044631935372997455,
|
|
"loss": 4.8792,
|
|
"mean_token_accuracy": 0.21913430839776993,
|
|
"num_tokens": 55382702.0,
|
|
"step": 24145
|
|
},
|
|
{
|
|
"entropy": 5.1108448028564455,
|
|
"epoch": 2.319884726224784,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004462971270956219,
|
|
"loss": 4.7452,
|
|
"mean_token_accuracy": 0.23138088285923003,
|
|
"num_tokens": 55394826.0,
|
|
"step": 24150
|
|
},
|
|
{
|
|
"entropy": 5.062004709243775,
|
|
"epoch": 2.320365033621518,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004462748964842586,
|
|
"loss": 4.7143,
|
|
"mean_token_accuracy": 0.2407672330737114,
|
|
"num_tokens": 55406584.0,
|
|
"step": 24155
|
|
},
|
|
{
|
|
"entropy": 5.088969039916992,
|
|
"epoch": 2.3208453410182517,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004462526618964008,
|
|
"loss": 4.8449,
|
|
"mean_token_accuracy": 0.22762546092271804,
|
|
"num_tokens": 55419640.0,
|
|
"step": 24160
|
|
},
|
|
{
|
|
"entropy": 5.054949760437012,
|
|
"epoch": 2.3213256484149856,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004462304233325647,
|
|
"loss": 4.783,
|
|
"mean_token_accuracy": 0.2326088547706604,
|
|
"num_tokens": 55430197.0,
|
|
"step": 24165
|
|
},
|
|
{
|
|
"entropy": 5.0983367443084715,
|
|
"epoch": 2.3218059558117194,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004462081807932666,
|
|
"loss": 4.8065,
|
|
"mean_token_accuracy": 0.23061417937278747,
|
|
"num_tokens": 55441832.0,
|
|
"step": 24170
|
|
},
|
|
{
|
|
"entropy": 5.076362228393554,
|
|
"epoch": 2.3222862632084533,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004461859342790229,
|
|
"loss": 4.6935,
|
|
"mean_token_accuracy": 0.2416341170668602,
|
|
"num_tokens": 55452656.0,
|
|
"step": 24175
|
|
},
|
|
{
|
|
"entropy": 4.971002101898193,
|
|
"epoch": 2.322766570605187,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004461636837903501,
|
|
"loss": 4.6705,
|
|
"mean_token_accuracy": 0.23961927890777587,
|
|
"num_tokens": 55463936.0,
|
|
"step": 24180
|
|
},
|
|
{
|
|
"entropy": 5.142798280715942,
|
|
"epoch": 2.323246878001921,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004461414293277649,
|
|
"loss": 4.8605,
|
|
"mean_token_accuracy": 0.22794176787137985,
|
|
"num_tokens": 55475885.0,
|
|
"step": 24185
|
|
},
|
|
{
|
|
"entropy": 5.1128401279449465,
|
|
"epoch": 2.323727185398655,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004461191708917839,
|
|
"loss": 4.7341,
|
|
"mean_token_accuracy": 0.2254903048276901,
|
|
"num_tokens": 55489022.0,
|
|
"step": 24190
|
|
},
|
|
{
|
|
"entropy": 5.06376371383667,
|
|
"epoch": 2.324207492795389,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00044609690848292376,
|
|
"loss": 4.7527,
|
|
"mean_token_accuracy": 0.2291923373937607,
|
|
"num_tokens": 55500008.0,
|
|
"step": 24195
|
|
},
|
|
{
|
|
"entropy": 5.1281633377075195,
|
|
"epoch": 2.324687800192123,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004460746421017016,
|
|
"loss": 4.8874,
|
|
"mean_token_accuracy": 0.2207140639424324,
|
|
"num_tokens": 55511024.0,
|
|
"step": 24200
|
|
},
|
|
{
|
|
"entropy": 5.200454950332642,
|
|
"epoch": 2.325168107588857,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044605237174863405,
|
|
"loss": 4.809,
|
|
"mean_token_accuracy": 0.23201199173927306,
|
|
"num_tokens": 55522199.0,
|
|
"step": 24205
|
|
},
|
|
{
|
|
"entropy": 5.098220157623291,
|
|
"epoch": 2.325648414985591,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004460300974242386,
|
|
"loss": 4.7585,
|
|
"mean_token_accuracy": 0.23540398478507996,
|
|
"num_tokens": 55533197.0,
|
|
"step": 24210
|
|
},
|
|
{
|
|
"entropy": 5.101766872406006,
|
|
"epoch": 2.326128722382325,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004460078191290319,
|
|
"loss": 4.7875,
|
|
"mean_token_accuracy": 0.2317366361618042,
|
|
"num_tokens": 55544317.0,
|
|
"step": 24215
|
|
},
|
|
{
|
|
"entropy": 5.124845695495606,
|
|
"epoch": 2.3266090297790587,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00044598553686353153,
|
|
"loss": 4.8222,
|
|
"mean_token_accuracy": 0.22728889137506486,
|
|
"num_tokens": 55555388.0,
|
|
"step": 24220
|
|
},
|
|
{
|
|
"entropy": 5.128252983093262,
|
|
"epoch": 2.3270893371757926,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00044596325062825476,
|
|
"loss": 4.7615,
|
|
"mean_token_accuracy": 0.22990836650133134,
|
|
"num_tokens": 55566189.0,
|
|
"step": 24225
|
|
},
|
|
{
|
|
"entropy": 5.047675561904907,
|
|
"epoch": 2.3275696445725265,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004459409604237189,
|
|
"loss": 4.7352,
|
|
"mean_token_accuracy": 0.23851094841957093,
|
|
"num_tokens": 55578235.0,
|
|
"step": 24230
|
|
},
|
|
{
|
|
"entropy": 4.9796771049499515,
|
|
"epoch": 2.3280499519692603,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044591866625044154,
|
|
"loss": 4.7211,
|
|
"mean_token_accuracy": 0.23492682725191116,
|
|
"num_tokens": 55590300.0,
|
|
"step": 24235
|
|
},
|
|
{
|
|
"entropy": 5.157256269454956,
|
|
"epoch": 2.3285302593659942,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004458963681089403,
|
|
"loss": 4.8027,
|
|
"mean_token_accuracy": 0.23687927275896073,
|
|
"num_tokens": 55601017.0,
|
|
"step": 24240
|
|
},
|
|
{
|
|
"entropy": 5.188142204284668,
|
|
"epoch": 2.329010566762728,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004458740659997328,
|
|
"loss": 4.8333,
|
|
"mean_token_accuracy": 0.23082994669675827,
|
|
"num_tokens": 55612411.0,
|
|
"step": 24245
|
|
},
|
|
{
|
|
"entropy": 5.150514221191406,
|
|
"epoch": 2.329490874159462,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004458517599233369,
|
|
"loss": 4.8387,
|
|
"mean_token_accuracy": 0.22728616893291473,
|
|
"num_tokens": 55623872.0,
|
|
"step": 24250
|
|
},
|
|
{
|
|
"entropy": 5.202387571334839,
|
|
"epoch": 2.329971181556196,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004458294498802706,
|
|
"loss": 4.8477,
|
|
"mean_token_accuracy": 0.21881984323263168,
|
|
"num_tokens": 55634887.0,
|
|
"step": 24255
|
|
},
|
|
{
|
|
"entropy": 5.094072580337524,
|
|
"epoch": 2.3304514889529298,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004458071358710516,
|
|
"loss": 4.7988,
|
|
"mean_token_accuracy": 0.23121218085289003,
|
|
"num_tokens": 55646286.0,
|
|
"step": 24260
|
|
},
|
|
{
|
|
"entropy": 5.139373064041138,
|
|
"epoch": 2.3309317963496636,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004457848178961981,
|
|
"loss": 4.8043,
|
|
"mean_token_accuracy": 0.22870695888996123,
|
|
"num_tokens": 55657647.0,
|
|
"step": 24265
|
|
},
|
|
{
|
|
"entropy": 5.083712530136109,
|
|
"epoch": 2.3314121037463975,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00044576249595622833,
|
|
"loss": 4.7764,
|
|
"mean_token_accuracy": 0.2308934897184372,
|
|
"num_tokens": 55669000.0,
|
|
"step": 24270
|
|
},
|
|
{
|
|
"entropy": 5.115909576416016,
|
|
"epoch": 2.331892411143132,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004457401700516603,
|
|
"loss": 4.7746,
|
|
"mean_token_accuracy": 0.22995221614837646,
|
|
"num_tokens": 55680678.0,
|
|
"step": 24275
|
|
},
|
|
{
|
|
"entropy": 5.134755945205688,
|
|
"epoch": 2.3323727185398653,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00044571784018301267,
|
|
"loss": 4.795,
|
|
"mean_token_accuracy": 0.23035201877355577,
|
|
"num_tokens": 55692429.0,
|
|
"step": 24280
|
|
},
|
|
{
|
|
"entropy": 5.1184654712677,
|
|
"epoch": 2.3328530259365996,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044569550635080365,
|
|
"loss": 4.8635,
|
|
"mean_token_accuracy": 0.22763997316360474,
|
|
"num_tokens": 55704659.0,
|
|
"step": 24285
|
|
},
|
|
{
|
|
"entropy": 5.0660813331604,
|
|
"epoch": 2.3333333333333335,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044567316855555184,
|
|
"loss": 4.6613,
|
|
"mean_token_accuracy": 0.23994368612766265,
|
|
"num_tokens": 55715584.0,
|
|
"step": 24290
|
|
},
|
|
{
|
|
"entropy": 5.162021112442017,
|
|
"epoch": 2.3338136407300674,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004456508267977759,
|
|
"loss": 4.8847,
|
|
"mean_token_accuracy": 0.22551312744617463,
|
|
"num_tokens": 55726715.0,
|
|
"step": 24295
|
|
},
|
|
{
|
|
"entropy": 5.064092540740967,
|
|
"epoch": 2.3342939481268012,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044562848107799444,
|
|
"loss": 4.7438,
|
|
"mean_token_accuracy": 0.2381514459848404,
|
|
"num_tokens": 55737069.0,
|
|
"step": 24300
|
|
},
|
|
{
|
|
"entropy": 5.0794895648956295,
|
|
"epoch": 2.334774255523535,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044560613139672627,
|
|
"loss": 4.7254,
|
|
"mean_token_accuracy": 0.2401443362236023,
|
|
"num_tokens": 55747774.0,
|
|
"step": 24305
|
|
},
|
|
{
|
|
"entropy": 5.0981381893157955,
|
|
"epoch": 2.335254562920269,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044558377775449036,
|
|
"loss": 4.775,
|
|
"mean_token_accuracy": 0.2361172690987587,
|
|
"num_tokens": 55758209.0,
|
|
"step": 24310
|
|
},
|
|
{
|
|
"entropy": 5.187936162948608,
|
|
"epoch": 2.335734870317003,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044556142015180573,
|
|
"loss": 4.7946,
|
|
"mean_token_accuracy": 0.22627351880073548,
|
|
"num_tokens": 55768668.0,
|
|
"step": 24315
|
|
},
|
|
{
|
|
"entropy": 5.277261114120483,
|
|
"epoch": 2.3362151777137368,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044553905858919134,
|
|
"loss": 4.9289,
|
|
"mean_token_accuracy": 0.22501867413520812,
|
|
"num_tokens": 55780254.0,
|
|
"step": 24320
|
|
},
|
|
{
|
|
"entropy": 5.141206741333008,
|
|
"epoch": 2.3366954851104706,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004455166930671664,
|
|
"loss": 4.7969,
|
|
"mean_token_accuracy": 0.2337252080440521,
|
|
"num_tokens": 55790241.0,
|
|
"step": 24325
|
|
},
|
|
{
|
|
"entropy": 5.156115913391114,
|
|
"epoch": 2.3371757925072045,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044549432358625014,
|
|
"loss": 4.8835,
|
|
"mean_token_accuracy": 0.22898427098989488,
|
|
"num_tokens": 55802040.0,
|
|
"step": 24330
|
|
},
|
|
{
|
|
"entropy": 5.128282070159912,
|
|
"epoch": 2.3376560999039384,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000445471950146962,
|
|
"loss": 4.7985,
|
|
"mean_token_accuracy": 0.23188591599464417,
|
|
"num_tokens": 55813926.0,
|
|
"step": 24335
|
|
},
|
|
{
|
|
"entropy": 5.106117057800293,
|
|
"epoch": 2.3381364073006723,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004454495727498214,
|
|
"loss": 4.7634,
|
|
"mean_token_accuracy": 0.23359781950712205,
|
|
"num_tokens": 55825281.0,
|
|
"step": 24340
|
|
},
|
|
{
|
|
"entropy": 5.084489583969116,
|
|
"epoch": 2.338616714697406,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004454271913953478,
|
|
"loss": 4.7665,
|
|
"mean_token_accuracy": 0.23407331258058547,
|
|
"num_tokens": 55836295.0,
|
|
"step": 24345
|
|
},
|
|
{
|
|
"entropy": 5.046988296508789,
|
|
"epoch": 2.3390970220941405,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044540480608406093,
|
|
"loss": 4.7572,
|
|
"mean_token_accuracy": 0.24078552424907684,
|
|
"num_tokens": 55847150.0,
|
|
"step": 24350
|
|
},
|
|
{
|
|
"entropy": 5.127749443054199,
|
|
"epoch": 2.339577329490874,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004453824168164804,
|
|
"loss": 4.84,
|
|
"mean_token_accuracy": 0.2280465394258499,
|
|
"num_tokens": 55858135.0,
|
|
"step": 24355
|
|
},
|
|
{
|
|
"entropy": 5.231775188446045,
|
|
"epoch": 2.3400576368876083,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004453600235931261,
|
|
"loss": 4.8786,
|
|
"mean_token_accuracy": 0.2284584030508995,
|
|
"num_tokens": 55869319.0,
|
|
"step": 24360
|
|
},
|
|
{
|
|
"entropy": 5.1386302471160885,
|
|
"epoch": 2.340537944284342,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000445337626414518,
|
|
"loss": 4.851,
|
|
"mean_token_accuracy": 0.2288869395852089,
|
|
"num_tokens": 55881383.0,
|
|
"step": 24365
|
|
},
|
|
{
|
|
"entropy": 5.093533611297607,
|
|
"epoch": 2.341018251681076,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044531522528117593,
|
|
"loss": 4.7657,
|
|
"mean_token_accuracy": 0.23314369469881058,
|
|
"num_tokens": 55893726.0,
|
|
"step": 24370
|
|
},
|
|
{
|
|
"entropy": 5.150975704193115,
|
|
"epoch": 2.34149855907781,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00044529282019362007,
|
|
"loss": 4.8339,
|
|
"mean_token_accuracy": 0.23597938418388367,
|
|
"num_tokens": 55905260.0,
|
|
"step": 24375
|
|
},
|
|
{
|
|
"entropy": 5.138191604614258,
|
|
"epoch": 2.341978866474544,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044527041115237056,
|
|
"loss": 4.8237,
|
|
"mean_token_accuracy": 0.23421450853347778,
|
|
"num_tokens": 55915404.0,
|
|
"step": 24380
|
|
},
|
|
{
|
|
"entropy": 5.051256847381592,
|
|
"epoch": 2.3424591738712777,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004452479981579477,
|
|
"loss": 4.7269,
|
|
"mean_token_accuracy": 0.23279765248298645,
|
|
"num_tokens": 55925972.0,
|
|
"step": 24385
|
|
},
|
|
{
|
|
"entropy": 5.081080436706543,
|
|
"epoch": 2.3429394812680115,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004452255812108719,
|
|
"loss": 4.7605,
|
|
"mean_token_accuracy": 0.22747449278831483,
|
|
"num_tokens": 55937545.0,
|
|
"step": 24390
|
|
},
|
|
{
|
|
"entropy": 5.030919933319092,
|
|
"epoch": 2.3434197886647454,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004452031603116636,
|
|
"loss": 4.7176,
|
|
"mean_token_accuracy": 0.2428242087364197,
|
|
"num_tokens": 55949934.0,
|
|
"step": 24395
|
|
},
|
|
{
|
|
"entropy": 5.092579746246338,
|
|
"epoch": 2.3439000960614793,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044518073546084326,
|
|
"loss": 4.81,
|
|
"mean_token_accuracy": 0.2370229333639145,
|
|
"num_tokens": 55960414.0,
|
|
"step": 24400
|
|
},
|
|
{
|
|
"entropy": 5.066870450973511,
|
|
"epoch": 2.344380403458213,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004451583066589316,
|
|
"loss": 4.7419,
|
|
"mean_token_accuracy": 0.22898071706295015,
|
|
"num_tokens": 55970999.0,
|
|
"step": 24405
|
|
},
|
|
{
|
|
"entropy": 5.101868677139282,
|
|
"epoch": 2.344860710854947,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00044513587390644925,
|
|
"loss": 4.7664,
|
|
"mean_token_accuracy": 0.23451221138238906,
|
|
"num_tokens": 55982801.0,
|
|
"step": 24410
|
|
},
|
|
{
|
|
"entropy": 5.186482763290405,
|
|
"epoch": 2.345341018251681,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00044511343720391724,
|
|
"loss": 4.8305,
|
|
"mean_token_accuracy": 0.22371030896902083,
|
|
"num_tokens": 55995143.0,
|
|
"step": 24415
|
|
},
|
|
{
|
|
"entropy": 5.081795644760132,
|
|
"epoch": 2.345821325648415,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004450909965518563,
|
|
"loss": 4.8215,
|
|
"mean_token_accuracy": 0.22740914970636367,
|
|
"num_tokens": 56006368.0,
|
|
"step": 24420
|
|
},
|
|
{
|
|
"entropy": 5.138456726074219,
|
|
"epoch": 2.3463016330451487,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00044506855195078755,
|
|
"loss": 4.8737,
|
|
"mean_token_accuracy": 0.22875082045793532,
|
|
"num_tokens": 56019245.0,
|
|
"step": 24425
|
|
},
|
|
{
|
|
"entropy": 5.166050815582276,
|
|
"epoch": 2.3467819404418826,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044504610340123185,
|
|
"loss": 4.8532,
|
|
"mean_token_accuracy": 0.21877577304840087,
|
|
"num_tokens": 56030982.0,
|
|
"step": 24430
|
|
},
|
|
{
|
|
"entropy": 5.135728549957276,
|
|
"epoch": 2.347262247838617,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00044502365090371066,
|
|
"loss": 4.8184,
|
|
"mean_token_accuracy": 0.22757762521505356,
|
|
"num_tokens": 56042871.0,
|
|
"step": 24435
|
|
},
|
|
{
|
|
"entropy": 5.1946179389953615,
|
|
"epoch": 2.347742555235351,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004450011944587452,
|
|
"loss": 4.8264,
|
|
"mean_token_accuracy": 0.22652811259031297,
|
|
"num_tokens": 56054099.0,
|
|
"step": 24440
|
|
},
|
|
{
|
|
"entropy": 5.047242975234985,
|
|
"epoch": 2.3482228626320847,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044497873406685673,
|
|
"loss": 4.767,
|
|
"mean_token_accuracy": 0.2269476056098938,
|
|
"num_tokens": 56064315.0,
|
|
"step": 24445
|
|
},
|
|
{
|
|
"entropy": 5.022474241256714,
|
|
"epoch": 2.3487031700288186,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004449562697285669,
|
|
"loss": 4.7471,
|
|
"mean_token_accuracy": 0.22951697558164597,
|
|
"num_tokens": 56076446.0,
|
|
"step": 24450
|
|
},
|
|
{
|
|
"entropy": 5.190836668014526,
|
|
"epoch": 2.3491834774255524,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044493380144439707,
|
|
"loss": 4.8706,
|
|
"mean_token_accuracy": 0.2270712062716484,
|
|
"num_tokens": 56089356.0,
|
|
"step": 24455
|
|
},
|
|
{
|
|
"entropy": 5.1657195568084715,
|
|
"epoch": 2.3496637848222863,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000444911329214869,
|
|
"loss": 4.8631,
|
|
"mean_token_accuracy": 0.22868295907974243,
|
|
"num_tokens": 56101245.0,
|
|
"step": 24460
|
|
},
|
|
{
|
|
"entropy": 5.008936548233033,
|
|
"epoch": 2.35014409221902,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044488885304050434,
|
|
"loss": 4.6899,
|
|
"mean_token_accuracy": 0.23889461904764175,
|
|
"num_tokens": 56112756.0,
|
|
"step": 24465
|
|
},
|
|
{
|
|
"entropy": 5.117912292480469,
|
|
"epoch": 2.350624399615754,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000444866372921825,
|
|
"loss": 4.8119,
|
|
"mean_token_accuracy": 0.22884083539247513,
|
|
"num_tokens": 56124316.0,
|
|
"step": 24470
|
|
},
|
|
{
|
|
"entropy": 5.052361536026001,
|
|
"epoch": 2.351104707012488,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044484388885935287,
|
|
"loss": 4.732,
|
|
"mean_token_accuracy": 0.24278282523155212,
|
|
"num_tokens": 56135972.0,
|
|
"step": 24475
|
|
},
|
|
{
|
|
"entropy": 5.156489706039428,
|
|
"epoch": 2.351585014409222,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044482140085361005,
|
|
"loss": 4.8235,
|
|
"mean_token_accuracy": 0.2255215510725975,
|
|
"num_tokens": 56147335.0,
|
|
"step": 24480
|
|
},
|
|
{
|
|
"entropy": 5.050457763671875,
|
|
"epoch": 2.3520653218059557,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00044479890890511853,
|
|
"loss": 4.7099,
|
|
"mean_token_accuracy": 0.23369956463575364,
|
|
"num_tokens": 56158294.0,
|
|
"step": 24485
|
|
},
|
|
{
|
|
"entropy": 5.035678100585938,
|
|
"epoch": 2.3525456292026896,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044477641301440054,
|
|
"loss": 4.6703,
|
|
"mean_token_accuracy": 0.2425915777683258,
|
|
"num_tokens": 56169468.0,
|
|
"step": 24490
|
|
},
|
|
{
|
|
"entropy": 5.119379329681396,
|
|
"epoch": 2.3530259365994235,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004447539131819784,
|
|
"loss": 4.7564,
|
|
"mean_token_accuracy": 0.22898904383182525,
|
|
"num_tokens": 56180112.0,
|
|
"step": 24495
|
|
},
|
|
{
|
|
"entropy": 5.158781814575195,
|
|
"epoch": 2.3535062439961574,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044473140940837436,
|
|
"loss": 4.7991,
|
|
"mean_token_accuracy": 0.23098965287208556,
|
|
"num_tokens": 56191290.0,
|
|
"step": 24500
|
|
},
|
|
{
|
|
"entropy": 5.157944536209106,
|
|
"epoch": 2.3539865513928913,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044470890169411107,
|
|
"loss": 4.859,
|
|
"mean_token_accuracy": 0.22956420928239823,
|
|
"num_tokens": 56202447.0,
|
|
"step": 24505
|
|
},
|
|
{
|
|
"entropy": 5.150196647644043,
|
|
"epoch": 2.3544668587896256,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000444686390039711,
|
|
"loss": 4.8725,
|
|
"mean_token_accuracy": 0.2220148727297783,
|
|
"num_tokens": 56213802.0,
|
|
"step": 24510
|
|
},
|
|
{
|
|
"entropy": 5.100703859329224,
|
|
"epoch": 2.3549471661863595,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004446638744456968,
|
|
"loss": 4.7799,
|
|
"mean_token_accuracy": 0.23059052973985672,
|
|
"num_tokens": 56225243.0,
|
|
"step": 24515
|
|
},
|
|
{
|
|
"entropy": 5.120981311798095,
|
|
"epoch": 2.3554274735830933,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044464135491259135,
|
|
"loss": 4.7158,
|
|
"mean_token_accuracy": 0.23078160136938095,
|
|
"num_tokens": 56235116.0,
|
|
"step": 24520
|
|
},
|
|
{
|
|
"entropy": 5.068836307525634,
|
|
"epoch": 2.3559077809798272,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004446188314409172,
|
|
"loss": 4.7882,
|
|
"mean_token_accuracy": 0.23106331676244735,
|
|
"num_tokens": 56246654.0,
|
|
"step": 24525
|
|
},
|
|
{
|
|
"entropy": 5.145525598526001,
|
|
"epoch": 2.356388088376561,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004445963040311975,
|
|
"loss": 4.7791,
|
|
"mean_token_accuracy": 0.2322086364030838,
|
|
"num_tokens": 56257787.0,
|
|
"step": 24530
|
|
},
|
|
{
|
|
"entropy": 5.057206106185913,
|
|
"epoch": 2.356868395773295,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00044457377268395526,
|
|
"loss": 4.6896,
|
|
"mean_token_accuracy": 0.2340852975845337,
|
|
"num_tokens": 56268881.0,
|
|
"step": 24535
|
|
},
|
|
{
|
|
"entropy": 4.9917152404785154,
|
|
"epoch": 2.357348703170029,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044455123739971355,
|
|
"loss": 4.7075,
|
|
"mean_token_accuracy": 0.2374115616083145,
|
|
"num_tokens": 56281515.0,
|
|
"step": 24540
|
|
},
|
|
{
|
|
"entropy": 4.995164155960083,
|
|
"epoch": 2.3578290105667628,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044452869817899554,
|
|
"loss": 4.6698,
|
|
"mean_token_accuracy": 0.2395367980003357,
|
|
"num_tokens": 56293226.0,
|
|
"step": 24545
|
|
},
|
|
{
|
|
"entropy": 5.118550491333008,
|
|
"epoch": 2.3583093179634966,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004445061550223246,
|
|
"loss": 4.7668,
|
|
"mean_token_accuracy": 0.2326399102807045,
|
|
"num_tokens": 56303408.0,
|
|
"step": 24550
|
|
},
|
|
{
|
|
"entropy": 5.112800025939942,
|
|
"epoch": 2.3587896253602305,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044448360793022403,
|
|
"loss": 4.7617,
|
|
"mean_token_accuracy": 0.2299429401755333,
|
|
"num_tokens": 56315379.0,
|
|
"step": 24555
|
|
},
|
|
{
|
|
"entropy": 5.091808128356933,
|
|
"epoch": 2.3592699327569644,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004444610569032174,
|
|
"loss": 4.7931,
|
|
"mean_token_accuracy": 0.23082323074340821,
|
|
"num_tokens": 56326496.0,
|
|
"step": 24560
|
|
},
|
|
{
|
|
"entropy": 5.028980159759522,
|
|
"epoch": 2.3597502401536983,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004444385019418281,
|
|
"loss": 4.731,
|
|
"mean_token_accuracy": 0.2351315811276436,
|
|
"num_tokens": 56339416.0,
|
|
"step": 24565
|
|
},
|
|
{
|
|
"entropy": 5.138836240768432,
|
|
"epoch": 2.360230547550432,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00044441594304658004,
|
|
"loss": 4.8032,
|
|
"mean_token_accuracy": 0.22849867790937423,
|
|
"num_tokens": 56352305.0,
|
|
"step": 24570
|
|
},
|
|
{
|
|
"entropy": 5.0952881336212155,
|
|
"epoch": 2.360710854947166,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004443933802179968,
|
|
"loss": 4.7269,
|
|
"mean_token_accuracy": 0.23680901676416397,
|
|
"num_tokens": 56364598.0,
|
|
"step": 24575
|
|
},
|
|
{
|
|
"entropy": 5.13135256767273,
|
|
"epoch": 2.3611911623439,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044437081345660224,
|
|
"loss": 4.8741,
|
|
"mean_token_accuracy": 0.2198468491435051,
|
|
"num_tokens": 56376651.0,
|
|
"step": 24580
|
|
},
|
|
{
|
|
"entropy": 5.090953397750854,
|
|
"epoch": 2.3616714697406342,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004443482427629204,
|
|
"loss": 4.7517,
|
|
"mean_token_accuracy": 0.23561461567878722,
|
|
"num_tokens": 56388038.0,
|
|
"step": 24585
|
|
},
|
|
{
|
|
"entropy": 5.10401463508606,
|
|
"epoch": 2.3621517771373677,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004443256681374751,
|
|
"loss": 4.7729,
|
|
"mean_token_accuracy": 0.23340231776237488,
|
|
"num_tokens": 56398889.0,
|
|
"step": 24590
|
|
},
|
|
{
|
|
"entropy": 5.080159282684326,
|
|
"epoch": 2.362632084534102,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004443030895807907,
|
|
"loss": 4.804,
|
|
"mean_token_accuracy": 0.2332967445254326,
|
|
"num_tokens": 56411548.0,
|
|
"step": 24595
|
|
},
|
|
{
|
|
"entropy": 5.1313213348388675,
|
|
"epoch": 2.363112391930836,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044428050709339117,
|
|
"loss": 4.6843,
|
|
"mean_token_accuracy": 0.23506833761930465,
|
|
"num_tokens": 56423292.0,
|
|
"step": 24600
|
|
},
|
|
{
|
|
"entropy": 5.024478149414063,
|
|
"epoch": 2.3635926993275698,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000444257920675801,
|
|
"loss": 4.7636,
|
|
"mean_token_accuracy": 0.23502004146575928,
|
|
"num_tokens": 56434834.0,
|
|
"step": 24605
|
|
},
|
|
{
|
|
"entropy": 5.061081123352051,
|
|
"epoch": 2.3640730067243036,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00044423533032854454,
|
|
"loss": 4.8158,
|
|
"mean_token_accuracy": 0.22861055731773378,
|
|
"num_tokens": 56445554.0,
|
|
"step": 24610
|
|
},
|
|
{
|
|
"entropy": 5.078392887115479,
|
|
"epoch": 2.3645533141210375,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004442127360521462,
|
|
"loss": 4.7748,
|
|
"mean_token_accuracy": 0.2413931518793106,
|
|
"num_tokens": 56457748.0,
|
|
"step": 24615
|
|
},
|
|
{
|
|
"entropy": 5.069872283935547,
|
|
"epoch": 2.3650336215177714,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004441901378471306,
|
|
"loss": 4.7002,
|
|
"mean_token_accuracy": 0.2420249953866005,
|
|
"num_tokens": 56469650.0,
|
|
"step": 24620
|
|
},
|
|
{
|
|
"entropy": 5.040999507904052,
|
|
"epoch": 2.3655139289145053,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044416753571402233,
|
|
"loss": 4.7438,
|
|
"mean_token_accuracy": 0.2333931416273117,
|
|
"num_tokens": 56482061.0,
|
|
"step": 24625
|
|
},
|
|
{
|
|
"entropy": 5.136098337173462,
|
|
"epoch": 2.365994236311239,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004441449296533462,
|
|
"loss": 4.8222,
|
|
"mean_token_accuracy": 0.22758017480373383,
|
|
"num_tokens": 56493599.0,
|
|
"step": 24630
|
|
},
|
|
{
|
|
"entropy": 5.165631055831909,
|
|
"epoch": 2.366474543707973,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044412231966562717,
|
|
"loss": 4.8648,
|
|
"mean_token_accuracy": 0.22515485137701036,
|
|
"num_tokens": 56505842.0,
|
|
"step": 24635
|
|
},
|
|
{
|
|
"entropy": 5.052130842208863,
|
|
"epoch": 2.366954851104707,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044409970575139,
|
|
"loss": 4.7477,
|
|
"mean_token_accuracy": 0.23796017169952394,
|
|
"num_tokens": 56517688.0,
|
|
"step": 24640
|
|
},
|
|
{
|
|
"entropy": 5.12772536277771,
|
|
"epoch": 2.367435158501441,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004440770879111598,
|
|
"loss": 4.7834,
|
|
"mean_token_accuracy": 0.23488107621669768,
|
|
"num_tokens": 56528665.0,
|
|
"step": 24645
|
|
},
|
|
{
|
|
"entropy": 4.977132081985474,
|
|
"epoch": 2.3679154658981747,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00044405446614546163,
|
|
"loss": 4.6826,
|
|
"mean_token_accuracy": 0.2373212084174156,
|
|
"num_tokens": 56539097.0,
|
|
"step": 24650
|
|
},
|
|
{
|
|
"entropy": 5.1884829044342045,
|
|
"epoch": 2.3683957732949086,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004440318404548208,
|
|
"loss": 4.8751,
|
|
"mean_token_accuracy": 0.23188122361898422,
|
|
"num_tokens": 56550373.0,
|
|
"step": 24655
|
|
},
|
|
{
|
|
"entropy": 5.152935409545899,
|
|
"epoch": 2.3688760806916425,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044400921083976246,
|
|
"loss": 4.8138,
|
|
"mean_token_accuracy": 0.22929610162973404,
|
|
"num_tokens": 56561289.0,
|
|
"step": 24660
|
|
},
|
|
{
|
|
"entropy": 5.109140205383301,
|
|
"epoch": 2.3693563880883763,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004439865773008122,
|
|
"loss": 4.8364,
|
|
"mean_token_accuracy": 0.23006531596183777,
|
|
"num_tokens": 56572331.0,
|
|
"step": 24665
|
|
},
|
|
{
|
|
"entropy": 5.076235580444336,
|
|
"epoch": 2.3698366954851107,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004439639398384953,
|
|
"loss": 4.7167,
|
|
"mean_token_accuracy": 0.23638332933187484,
|
|
"num_tokens": 56584375.0,
|
|
"step": 24670
|
|
},
|
|
{
|
|
"entropy": 5.078734445571899,
|
|
"epoch": 2.3703170028818445,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00044394129845333756,
|
|
"loss": 4.7333,
|
|
"mean_token_accuracy": 0.22900070548057555,
|
|
"num_tokens": 56595825.0,
|
|
"step": 24675
|
|
},
|
|
{
|
|
"entropy": 5.089966487884522,
|
|
"epoch": 2.3707973102785784,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004439186531458645,
|
|
"loss": 4.8249,
|
|
"mean_token_accuracy": 0.2364494889974594,
|
|
"num_tokens": 56607842.0,
|
|
"step": 24680
|
|
},
|
|
{
|
|
"entropy": 5.140970182418823,
|
|
"epoch": 2.3712776176753123,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00044389600391660185,
|
|
"loss": 4.8352,
|
|
"mean_token_accuracy": 0.23416105657815933,
|
|
"num_tokens": 56618509.0,
|
|
"step": 24685
|
|
},
|
|
{
|
|
"entropy": 5.044953441619873,
|
|
"epoch": 2.371757925072046,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044387335076607554,
|
|
"loss": 4.7198,
|
|
"mean_token_accuracy": 0.2367233455181122,
|
|
"num_tokens": 56630293.0,
|
|
"step": 24690
|
|
},
|
|
{
|
|
"entropy": 5.189683341979981,
|
|
"epoch": 2.37223823246878,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004438506936948115,
|
|
"loss": 4.8887,
|
|
"mean_token_accuracy": 0.22379377484321594,
|
|
"num_tokens": 56641852.0,
|
|
"step": 24695
|
|
},
|
|
{
|
|
"entropy": 5.250722169876099,
|
|
"epoch": 2.372718539865514,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044382803270333565,
|
|
"loss": 4.8611,
|
|
"mean_token_accuracy": 0.22589135318994522,
|
|
"num_tokens": 56654409.0,
|
|
"step": 24700
|
|
},
|
|
{
|
|
"entropy": 5.074213266372681,
|
|
"epoch": 2.373198847262248,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004438053677921743,
|
|
"loss": 4.7654,
|
|
"mean_token_accuracy": 0.2345658928155899,
|
|
"num_tokens": 56665227.0,
|
|
"step": 24705
|
|
},
|
|
{
|
|
"entropy": 5.101580572128296,
|
|
"epoch": 2.3736791546589817,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044378269896185344,
|
|
"loss": 4.8164,
|
|
"mean_token_accuracy": 0.23337904661893843,
|
|
"num_tokens": 56677924.0,
|
|
"step": 24710
|
|
},
|
|
{
|
|
"entropy": 5.117094945907593,
|
|
"epoch": 2.3741594620557156,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004437600262128996,
|
|
"loss": 4.7135,
|
|
"mean_token_accuracy": 0.23515274077653886,
|
|
"num_tokens": 56689789.0,
|
|
"step": 24715
|
|
},
|
|
{
|
|
"entropy": 5.135672187805175,
|
|
"epoch": 2.3746397694524495,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000443737349545839,
|
|
"loss": 4.8667,
|
|
"mean_token_accuracy": 0.2295097976922989,
|
|
"num_tokens": 56700595.0,
|
|
"step": 24720
|
|
},
|
|
{
|
|
"entropy": 5.15782356262207,
|
|
"epoch": 2.3751200768491834,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00044371466896119823,
|
|
"loss": 4.8917,
|
|
"mean_token_accuracy": 0.2281707540154457,
|
|
"num_tokens": 56712554.0,
|
|
"step": 24725
|
|
},
|
|
{
|
|
"entropy": 5.170312452316284,
|
|
"epoch": 2.3756003842459172,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044369198445950384,
|
|
"loss": 4.7679,
|
|
"mean_token_accuracy": 0.22951631993055344,
|
|
"num_tokens": 56723698.0,
|
|
"step": 24730
|
|
},
|
|
{
|
|
"entropy": 5.124538946151733,
|
|
"epoch": 2.376080691642651,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004436692960412824,
|
|
"loss": 4.7966,
|
|
"mean_token_accuracy": 0.2397472620010376,
|
|
"num_tokens": 56734660.0,
|
|
"step": 24735
|
|
},
|
|
{
|
|
"entropy": 5.09040675163269,
|
|
"epoch": 2.376560999039385,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004436466037070608,
|
|
"loss": 4.8102,
|
|
"mean_token_accuracy": 0.22980419397354127,
|
|
"num_tokens": 56746150.0,
|
|
"step": 24740
|
|
},
|
|
{
|
|
"entropy": 5.187903594970703,
|
|
"epoch": 2.3770413064361193,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044362390745736585,
|
|
"loss": 4.9064,
|
|
"mean_token_accuracy": 0.22432048469781876,
|
|
"num_tokens": 56758018.0,
|
|
"step": 24745
|
|
},
|
|
{
|
|
"entropy": 5.102304887771607,
|
|
"epoch": 2.377521613832853,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004436012072927245,
|
|
"loss": 4.7469,
|
|
"mean_token_accuracy": 0.23562378585338592,
|
|
"num_tokens": 56769129.0,
|
|
"step": 24750
|
|
},
|
|
{
|
|
"entropy": 5.136471176147461,
|
|
"epoch": 2.378001921229587,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00044357850321366375,
|
|
"loss": 4.8011,
|
|
"mean_token_accuracy": 0.23276554346084594,
|
|
"num_tokens": 56780445.0,
|
|
"step": 24755
|
|
},
|
|
{
|
|
"entropy": 5.076611423492432,
|
|
"epoch": 2.378482228626321,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004435557952207107,
|
|
"loss": 4.811,
|
|
"mean_token_accuracy": 0.23137751519680022,
|
|
"num_tokens": 56792338.0,
|
|
"step": 24760
|
|
},
|
|
{
|
|
"entropy": 5.113924932479859,
|
|
"epoch": 2.378962536023055,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044353308331439257,
|
|
"loss": 4.7922,
|
|
"mean_token_accuracy": 0.2352191373705864,
|
|
"num_tokens": 56802345.0,
|
|
"step": 24765
|
|
},
|
|
{
|
|
"entropy": 5.175993299484253,
|
|
"epoch": 2.3794428434197887,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004435103674952367,
|
|
"loss": 4.8465,
|
|
"mean_token_accuracy": 0.23275587558746338,
|
|
"num_tokens": 56813865.0,
|
|
"step": 24770
|
|
},
|
|
{
|
|
"entropy": 5.101814031600952,
|
|
"epoch": 2.3799231508165226,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00044348764776377047,
|
|
"loss": 4.7193,
|
|
"mean_token_accuracy": 0.23715719431638718,
|
|
"num_tokens": 56824135.0,
|
|
"step": 24775
|
|
},
|
|
{
|
|
"entropy": 5.057256031036377,
|
|
"epoch": 2.3804034582132565,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004434649241205214,
|
|
"loss": 4.7761,
|
|
"mean_token_accuracy": 0.23238783925771714,
|
|
"num_tokens": 56836104.0,
|
|
"step": 24780
|
|
},
|
|
{
|
|
"entropy": 5.077281188964844,
|
|
"epoch": 2.3808837656099904,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044344219656601704,
|
|
"loss": 4.7555,
|
|
"mean_token_accuracy": 0.23776894956827163,
|
|
"num_tokens": 56848308.0,
|
|
"step": 24785
|
|
},
|
|
{
|
|
"entropy": 5.078270006179809,
|
|
"epoch": 2.3813640730067243,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000443419465100785,
|
|
"loss": 4.7897,
|
|
"mean_token_accuracy": 0.23689354062080384,
|
|
"num_tokens": 56859351.0,
|
|
"step": 24790
|
|
},
|
|
{
|
|
"entropy": 5.093816423416138,
|
|
"epoch": 2.381844380403458,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004433967297253531,
|
|
"loss": 4.8158,
|
|
"mean_token_accuracy": 0.23118849694728852,
|
|
"num_tokens": 56870645.0,
|
|
"step": 24795
|
|
},
|
|
{
|
|
"entropy": 5.073959541320801,
|
|
"epoch": 2.382324687800192,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044337399044024924,
|
|
"loss": 4.7549,
|
|
"mean_token_accuracy": 0.2352516159415245,
|
|
"num_tokens": 56881246.0,
|
|
"step": 24800
|
|
},
|
|
{
|
|
"entropy": 5.077259492874146,
|
|
"epoch": 2.382804995196926,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004433512472460012,
|
|
"loss": 4.7521,
|
|
"mean_token_accuracy": 0.23205724507570266,
|
|
"num_tokens": 56893477.0,
|
|
"step": 24805
|
|
},
|
|
{
|
|
"entropy": 5.086012125015259,
|
|
"epoch": 2.38328530259366,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044332850014313713,
|
|
"loss": 4.7813,
|
|
"mean_token_accuracy": 0.22985356599092482,
|
|
"num_tokens": 56904813.0,
|
|
"step": 24810
|
|
},
|
|
{
|
|
"entropy": 5.07104344367981,
|
|
"epoch": 2.3837656099903937,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004433057491321851,
|
|
"loss": 4.759,
|
|
"mean_token_accuracy": 0.22867830097675323,
|
|
"num_tokens": 56916160.0,
|
|
"step": 24815
|
|
},
|
|
{
|
|
"entropy": 5.059772109985351,
|
|
"epoch": 2.384245917387128,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044328299421367333,
|
|
"loss": 4.7429,
|
|
"mean_token_accuracy": 0.2354225590825081,
|
|
"num_tokens": 56927791.0,
|
|
"step": 24820
|
|
},
|
|
{
|
|
"entropy": 5.142007541656494,
|
|
"epoch": 2.3847262247838614,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004432602353881302,
|
|
"loss": 4.8027,
|
|
"mean_token_accuracy": 0.23089745938777922,
|
|
"num_tokens": 56938848.0,
|
|
"step": 24825
|
|
},
|
|
{
|
|
"entropy": 5.1220542907714846,
|
|
"epoch": 2.3852065321805958,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00044323747265608395,
|
|
"loss": 4.7882,
|
|
"mean_token_accuracy": 0.23613769859075545,
|
|
"num_tokens": 56950116.0,
|
|
"step": 24830
|
|
},
|
|
{
|
|
"entropy": 5.070098209381103,
|
|
"epoch": 2.3856868395773296,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004432147060180632,
|
|
"loss": 4.7478,
|
|
"mean_token_accuracy": 0.23370686769485474,
|
|
"num_tokens": 56961911.0,
|
|
"step": 24835
|
|
},
|
|
{
|
|
"entropy": 5.168145179748535,
|
|
"epoch": 2.3861671469740635,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044319193547459645,
|
|
"loss": 4.8208,
|
|
"mean_token_accuracy": 0.2195179507136345,
|
|
"num_tokens": 56974105.0,
|
|
"step": 24840
|
|
},
|
|
{
|
|
"entropy": 5.104530096054077,
|
|
"epoch": 2.3866474543707974,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004431691610262124,
|
|
"loss": 4.7759,
|
|
"mean_token_accuracy": 0.23273722231388091,
|
|
"num_tokens": 56985564.0,
|
|
"step": 24845
|
|
},
|
|
{
|
|
"entropy": 5.078565549850464,
|
|
"epoch": 2.3871277617675313,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044314638267343976,
|
|
"loss": 4.7083,
|
|
"mean_token_accuracy": 0.2425155222415924,
|
|
"num_tokens": 56995835.0,
|
|
"step": 24850
|
|
},
|
|
{
|
|
"entropy": 4.960423183441162,
|
|
"epoch": 2.387608069164265,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004431236004168075,
|
|
"loss": 4.6584,
|
|
"mean_token_accuracy": 0.23421775847673415,
|
|
"num_tokens": 57008114.0,
|
|
"step": 24855
|
|
},
|
|
{
|
|
"entropy": 5.015463972091675,
|
|
"epoch": 2.388088376560999,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004431008142568444,
|
|
"loss": 4.7401,
|
|
"mean_token_accuracy": 0.2357071578502655,
|
|
"num_tokens": 57018968.0,
|
|
"step": 24860
|
|
},
|
|
{
|
|
"entropy": 5.0624267578125,
|
|
"epoch": 2.388568683957733,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044307802419407954,
|
|
"loss": 4.7803,
|
|
"mean_token_accuracy": 0.238409586250782,
|
|
"num_tokens": 57031297.0,
|
|
"step": 24865
|
|
},
|
|
{
|
|
"entropy": 5.12186484336853,
|
|
"epoch": 2.389048991354467,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004430552302290421,
|
|
"loss": 4.7949,
|
|
"mean_token_accuracy": 0.228073151409626,
|
|
"num_tokens": 57043457.0,
|
|
"step": 24870
|
|
},
|
|
{
|
|
"entropy": 5.0508099555969235,
|
|
"epoch": 2.3895292987512007,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004430324323622611,
|
|
"loss": 4.7178,
|
|
"mean_token_accuracy": 0.23357915431261062,
|
|
"num_tokens": 57054474.0,
|
|
"step": 24875
|
|
},
|
|
{
|
|
"entropy": 5.072915172576904,
|
|
"epoch": 2.3900096061479346,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044300963059426605,
|
|
"loss": 4.7986,
|
|
"mean_token_accuracy": 0.23037643283605574,
|
|
"num_tokens": 57065120.0,
|
|
"step": 24880
|
|
},
|
|
{
|
|
"entropy": 5.044278001785278,
|
|
"epoch": 2.3904899135446684,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044298682492558637,
|
|
"loss": 4.7529,
|
|
"mean_token_accuracy": 0.2331436961889267,
|
|
"num_tokens": 57076712.0,
|
|
"step": 24885
|
|
},
|
|
{
|
|
"entropy": 5.173734998703003,
|
|
"epoch": 2.3909702209414023,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044296401535675136,
|
|
"loss": 4.8632,
|
|
"mean_token_accuracy": 0.22800443917512894,
|
|
"num_tokens": 57087894.0,
|
|
"step": 24890
|
|
},
|
|
{
|
|
"entropy": 5.182175540924073,
|
|
"epoch": 2.3914505283381366,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00044294120188829056,
|
|
"loss": 4.9352,
|
|
"mean_token_accuracy": 0.2199179157614708,
|
|
"num_tokens": 57101029.0,
|
|
"step": 24895
|
|
},
|
|
{
|
|
"entropy": 5.202096891403198,
|
|
"epoch": 2.39193083573487,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004429183845207339,
|
|
"loss": 4.8049,
|
|
"mean_token_accuracy": 0.22310205698013305,
|
|
"num_tokens": 57112893.0,
|
|
"step": 24900
|
|
},
|
|
{
|
|
"entropy": 5.1200279712677,
|
|
"epoch": 2.3924111431316044,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004428955632546108,
|
|
"loss": 4.838,
|
|
"mean_token_accuracy": 0.23471231013536453,
|
|
"num_tokens": 57123899.0,
|
|
"step": 24905
|
|
},
|
|
{
|
|
"entropy": 5.041331338882446,
|
|
"epoch": 2.3928914505283383,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004428727380904514,
|
|
"loss": 4.8432,
|
|
"mean_token_accuracy": 0.23019375056028366,
|
|
"num_tokens": 57134896.0,
|
|
"step": 24910
|
|
},
|
|
{
|
|
"entropy": 4.98831615447998,
|
|
"epoch": 2.393371757925072,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00044284990902878545,
|
|
"loss": 4.6332,
|
|
"mean_token_accuracy": 0.2411554917693138,
|
|
"num_tokens": 57146278.0,
|
|
"step": 24915
|
|
},
|
|
{
|
|
"entropy": 5.117031049728394,
|
|
"epoch": 2.393852065321806,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044282707607014304,
|
|
"loss": 4.7572,
|
|
"mean_token_accuracy": 0.23582173883914948,
|
|
"num_tokens": 57157663.0,
|
|
"step": 24920
|
|
},
|
|
{
|
|
"entropy": 5.070637035369873,
|
|
"epoch": 2.39433237271854,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00044280423921505427,
|
|
"loss": 4.7232,
|
|
"mean_token_accuracy": 0.2379809319972992,
|
|
"num_tokens": 57169131.0,
|
|
"step": 24925
|
|
},
|
|
{
|
|
"entropy": 5.091719150543213,
|
|
"epoch": 2.394812680115274,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004427813984640493,
|
|
"loss": 4.7648,
|
|
"mean_token_accuracy": 0.22879979461431504,
|
|
"num_tokens": 57181653.0,
|
|
"step": 24930
|
|
},
|
|
{
|
|
"entropy": 5.034427452087402,
|
|
"epoch": 2.3952929875120077,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004427585538176585,
|
|
"loss": 4.752,
|
|
"mean_token_accuracy": 0.23759771287441253,
|
|
"num_tokens": 57192817.0,
|
|
"step": 24935
|
|
},
|
|
{
|
|
"entropy": 5.11380443572998,
|
|
"epoch": 2.3957732949087416,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00044273570527641223,
|
|
"loss": 4.8208,
|
|
"mean_token_accuracy": 0.228305621445179,
|
|
"num_tokens": 57203570.0,
|
|
"step": 24940
|
|
},
|
|
{
|
|
"entropy": 5.065595197677612,
|
|
"epoch": 2.3962536023054755,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044271285284084097,
|
|
"loss": 4.7001,
|
|
"mean_token_accuracy": 0.23602611124515532,
|
|
"num_tokens": 57214388.0,
|
|
"step": 24945
|
|
},
|
|
{
|
|
"entropy": 5.124724578857422,
|
|
"epoch": 2.3967339097022093,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004426899965114752,
|
|
"loss": 4.8045,
|
|
"mean_token_accuracy": 0.22568911015987397,
|
|
"num_tokens": 57225004.0,
|
|
"step": 24950
|
|
},
|
|
{
|
|
"entropy": 5.051713514328003,
|
|
"epoch": 2.3972142170989432,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044266713628884566,
|
|
"loss": 4.7179,
|
|
"mean_token_accuracy": 0.2410505473613739,
|
|
"num_tokens": 57236643.0,
|
|
"step": 24955
|
|
},
|
|
{
|
|
"entropy": 5.09238977432251,
|
|
"epoch": 2.397694524495677,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044264427217348315,
|
|
"loss": 4.7375,
|
|
"mean_token_accuracy": 0.2336786285042763,
|
|
"num_tokens": 57247502.0,
|
|
"step": 24960
|
|
},
|
|
{
|
|
"entropy": 5.191510391235352,
|
|
"epoch": 2.398174831892411,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004426214041659184,
|
|
"loss": 4.9253,
|
|
"mean_token_accuracy": 0.2243089497089386,
|
|
"num_tokens": 57259539.0,
|
|
"step": 24965
|
|
},
|
|
{
|
|
"entropy": 5.225309705734253,
|
|
"epoch": 2.398655139289145,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004425985322666824,
|
|
"loss": 4.8844,
|
|
"mean_token_accuracy": 0.22500480264425277,
|
|
"num_tokens": 57270946.0,
|
|
"step": 24970
|
|
},
|
|
{
|
|
"entropy": 5.12941083908081,
|
|
"epoch": 2.3991354466858787,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004425756564763061,
|
|
"loss": 4.8302,
|
|
"mean_token_accuracy": 0.22922021597623826,
|
|
"num_tokens": 57282855.0,
|
|
"step": 24975
|
|
},
|
|
{
|
|
"entropy": 5.125387191772461,
|
|
"epoch": 2.399615754082613,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00044255277679532075,
|
|
"loss": 4.827,
|
|
"mean_token_accuracy": 0.2260574668645859,
|
|
"num_tokens": 57292638.0,
|
|
"step": 24980
|
|
},
|
|
{
|
|
"entropy": 5.144128751754761,
|
|
"epoch": 2.400096061479347,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00044252989322425735,
|
|
"loss": 4.7697,
|
|
"mean_token_accuracy": 0.2287430688738823,
|
|
"num_tokens": 57304088.0,
|
|
"step": 24985
|
|
},
|
|
{
|
|
"entropy": 5.150278091430664,
|
|
"epoch": 2.400576368876081,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044250700576364734,
|
|
"loss": 4.7633,
|
|
"mean_token_accuracy": 0.23506819903850557,
|
|
"num_tokens": 57314534.0,
|
|
"step": 24990
|
|
},
|
|
{
|
|
"entropy": 5.115130281448364,
|
|
"epoch": 2.4010566762728147,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000442484114414022,
|
|
"loss": 4.8511,
|
|
"mean_token_accuracy": 0.2235242545604706,
|
|
"num_tokens": 57326787.0,
|
|
"step": 24995
|
|
},
|
|
{
|
|
"entropy": 5.050136089324951,
|
|
"epoch": 2.4015369836695486,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004424612191759129,
|
|
"loss": 4.7492,
|
|
"mean_token_accuracy": 0.22763265818357467,
|
|
"num_tokens": 57338935.0,
|
|
"step": 25000
|
|
},
|
|
{
|
|
"entropy": 5.17376184463501,
|
|
"epoch": 2.4020172910662825,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004424383200498515,
|
|
"loss": 4.7813,
|
|
"mean_token_accuracy": 0.2324720099568367,
|
|
"num_tokens": 57349963.0,
|
|
"step": 25005
|
|
},
|
|
{
|
|
"entropy": 5.074791526794433,
|
|
"epoch": 2.4024975984630164,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004424154170363696,
|
|
"loss": 4.7924,
|
|
"mean_token_accuracy": 0.23403090387582778,
|
|
"num_tokens": 57361305.0,
|
|
"step": 25010
|
|
},
|
|
{
|
|
"entropy": 5.015868616104126,
|
|
"epoch": 2.4029779058597502,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004423925101359987,
|
|
"loss": 4.7577,
|
|
"mean_token_accuracy": 0.22880287021398543,
|
|
"num_tokens": 57371582.0,
|
|
"step": 25015
|
|
},
|
|
{
|
|
"entropy": 5.091523838043213,
|
|
"epoch": 2.403458213256484,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004423695993492709,
|
|
"loss": 4.7413,
|
|
"mean_token_accuracy": 0.23197826892137527,
|
|
"num_tokens": 57382222.0,
|
|
"step": 25020
|
|
},
|
|
{
|
|
"entropy": 5.099827480316162,
|
|
"epoch": 2.403938520653218,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004423466846767179,
|
|
"loss": 4.7438,
|
|
"mean_token_accuracy": 0.23061240166425706,
|
|
"num_tokens": 57393939.0,
|
|
"step": 25025
|
|
},
|
|
{
|
|
"entropy": 5.127017211914063,
|
|
"epoch": 2.404418828049952,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00044232376611887185,
|
|
"loss": 4.7366,
|
|
"mean_token_accuracy": 0.23424165695905685,
|
|
"num_tokens": 57404889.0,
|
|
"step": 25030
|
|
},
|
|
{
|
|
"entropy": 5.120872449874878,
|
|
"epoch": 2.4048991354466858,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044230084367626477,
|
|
"loss": 4.834,
|
|
"mean_token_accuracy": 0.2335745483636856,
|
|
"num_tokens": 57416296.0,
|
|
"step": 25035
|
|
},
|
|
{
|
|
"entropy": 5.063665580749512,
|
|
"epoch": 2.4053794428434196,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004422779173494288,
|
|
"loss": 4.7934,
|
|
"mean_token_accuracy": 0.23263732492923736,
|
|
"num_tokens": 57427926.0,
|
|
"step": 25040
|
|
},
|
|
{
|
|
"entropy": 5.122995281219483,
|
|
"epoch": 2.4058597502401535,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004422549871388965,
|
|
"loss": 4.7749,
|
|
"mean_token_accuracy": 0.23277383893728257,
|
|
"num_tokens": 57439397.0,
|
|
"step": 25045
|
|
},
|
|
{
|
|
"entropy": 5.136260223388672,
|
|
"epoch": 2.4063400576368874,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00044223205304519994,
|
|
"loss": 4.8298,
|
|
"mean_token_accuracy": 0.2284989833831787,
|
|
"num_tokens": 57449976.0,
|
|
"step": 25050
|
|
},
|
|
{
|
|
"entropy": 5.135150289535522,
|
|
"epoch": 2.4068203650336217,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004422091150688717,
|
|
"loss": 4.8585,
|
|
"mean_token_accuracy": 0.22531607747077942,
|
|
"num_tokens": 57461578.0,
|
|
"step": 25055
|
|
},
|
|
{
|
|
"entropy": 5.116857385635376,
|
|
"epoch": 2.4073006724303556,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004421861732104443,
|
|
"loss": 4.7996,
|
|
"mean_token_accuracy": 0.22948758900165558,
|
|
"num_tokens": 57471832.0,
|
|
"step": 25060
|
|
},
|
|
{
|
|
"entropy": 5.1676966667175295,
|
|
"epoch": 2.4077809798270895,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004421632274704504,
|
|
"loss": 4.825,
|
|
"mean_token_accuracy": 0.22644471675157546,
|
|
"num_tokens": 57482854.0,
|
|
"step": 25065
|
|
},
|
|
{
|
|
"entropy": 5.165962553024292,
|
|
"epoch": 2.4082612872238234,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004421402778494227,
|
|
"loss": 4.8661,
|
|
"mean_token_accuracy": 0.2232041284441948,
|
|
"num_tokens": 57494243.0,
|
|
"step": 25070
|
|
},
|
|
{
|
|
"entropy": 5.136231994628906,
|
|
"epoch": 2.4087415946205573,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004421173243478941,
|
|
"loss": 4.7417,
|
|
"mean_token_accuracy": 0.23099250942468644,
|
|
"num_tokens": 57506185.0,
|
|
"step": 25075
|
|
},
|
|
{
|
|
"entropy": 5.1206300258636475,
|
|
"epoch": 2.409221902017291,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044209436696639745,
|
|
"loss": 4.7652,
|
|
"mean_token_accuracy": 0.23854574412107468,
|
|
"num_tokens": 57517980.0,
|
|
"step": 25080
|
|
},
|
|
{
|
|
"entropy": 5.012642097473145,
|
|
"epoch": 2.409702209414025,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044207140570546574,
|
|
"loss": 4.6386,
|
|
"mean_token_accuracy": 0.2411160036921501,
|
|
"num_tokens": 57529331.0,
|
|
"step": 25085
|
|
},
|
|
{
|
|
"entropy": 5.095395565032959,
|
|
"epoch": 2.410182516810759,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00044204844056563216,
|
|
"loss": 4.7346,
|
|
"mean_token_accuracy": 0.23751559257507324,
|
|
"num_tokens": 57540463.0,
|
|
"step": 25090
|
|
},
|
|
{
|
|
"entropy": 5.154835271835327,
|
|
"epoch": 2.410662824207493,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004420254715474297,
|
|
"loss": 4.8798,
|
|
"mean_token_accuracy": 0.2264431521296501,
|
|
"num_tokens": 57550896.0,
|
|
"step": 25095
|
|
},
|
|
{
|
|
"entropy": 5.081275081634521,
|
|
"epoch": 2.4111431316042267,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00044200249865139187,
|
|
"loss": 4.7653,
|
|
"mean_token_accuracy": 0.23305115401744841,
|
|
"num_tokens": 57562041.0,
|
|
"step": 25100
|
|
},
|
|
{
|
|
"entropy": 5.226250839233399,
|
|
"epoch": 2.4116234390009605,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044197952187805185,
|
|
"loss": 4.9266,
|
|
"mean_token_accuracy": 0.2227129802107811,
|
|
"num_tokens": 57573771.0,
|
|
"step": 25105
|
|
},
|
|
{
|
|
"entropy": 5.150281715393066,
|
|
"epoch": 2.4121037463976944,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00044195654122794324,
|
|
"loss": 4.7519,
|
|
"mean_token_accuracy": 0.23074692040681838,
|
|
"num_tokens": 57585651.0,
|
|
"step": 25110
|
|
},
|
|
{
|
|
"entropy": 5.072700929641724,
|
|
"epoch": 2.4125840537944283,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004419335567015994,
|
|
"loss": 4.7639,
|
|
"mean_token_accuracy": 0.23477055728435517,
|
|
"num_tokens": 57596328.0,
|
|
"step": 25115
|
|
},
|
|
{
|
|
"entropy": 5.054577207565307,
|
|
"epoch": 2.413064361191162,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004419105682995542,
|
|
"loss": 4.6737,
|
|
"mean_token_accuracy": 0.23865675032138825,
|
|
"num_tokens": 57608012.0,
|
|
"step": 25120
|
|
},
|
|
{
|
|
"entropy": 5.1676277160644535,
|
|
"epoch": 2.413544668587896,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004418875760223411,
|
|
"loss": 4.7777,
|
|
"mean_token_accuracy": 0.22592634558677674,
|
|
"num_tokens": 57619711.0,
|
|
"step": 25125
|
|
},
|
|
{
|
|
"entropy": 5.076362609863281,
|
|
"epoch": 2.4140249759846304,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00044186457987049405,
|
|
"loss": 4.755,
|
|
"mean_token_accuracy": 0.2357165664434433,
|
|
"num_tokens": 57630263.0,
|
|
"step": 25130
|
|
},
|
|
{
|
|
"entropy": 5.052877140045166,
|
|
"epoch": 2.414505283381364,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000441841579844547,
|
|
"loss": 4.7552,
|
|
"mean_token_accuracy": 0.23254823684692383,
|
|
"num_tokens": 57640973.0,
|
|
"step": 25135
|
|
},
|
|
{
|
|
"entropy": 5.1252655506134035,
|
|
"epoch": 2.414985590778098,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004418185759450338,
|
|
"loss": 4.7086,
|
|
"mean_token_accuracy": 0.232559834420681,
|
|
"num_tokens": 57651698.0,
|
|
"step": 25140
|
|
},
|
|
{
|
|
"entropy": 5.069209814071655,
|
|
"epoch": 2.415465898174832,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004417955681724887,
|
|
"loss": 4.7524,
|
|
"mean_token_accuracy": 0.23003358244895936,
|
|
"num_tokens": 57662547.0,
|
|
"step": 25145
|
|
},
|
|
{
|
|
"entropy": 5.152061462402344,
|
|
"epoch": 2.415946205571566,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00044177255652744576,
|
|
"loss": 4.8545,
|
|
"mean_token_accuracy": 0.22393652498722078,
|
|
"num_tokens": 57675665.0,
|
|
"step": 25150
|
|
},
|
|
{
|
|
"entropy": 5.0961161136627195,
|
|
"epoch": 2.4164265129683,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00044174954101043926,
|
|
"loss": 4.7158,
|
|
"mean_token_accuracy": 0.24122230857610702,
|
|
"num_tokens": 57687765.0,
|
|
"step": 25155
|
|
},
|
|
{
|
|
"entropy": 5.111287069320679,
|
|
"epoch": 2.4169068203650337,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044172652162200354,
|
|
"loss": 4.806,
|
|
"mean_token_accuracy": 0.22566018700599672,
|
|
"num_tokens": 57699575.0,
|
|
"step": 25160
|
|
},
|
|
{
|
|
"entropy": 5.1426841735839846,
|
|
"epoch": 2.4173871277617676,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004417034983626731,
|
|
"loss": 4.8203,
|
|
"mean_token_accuracy": 0.23345723748207092,
|
|
"num_tokens": 57710014.0,
|
|
"step": 25165
|
|
},
|
|
{
|
|
"entropy": 5.054921293258667,
|
|
"epoch": 2.4178674351585014,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004416804712329825,
|
|
"loss": 4.6687,
|
|
"mean_token_accuracy": 0.24694691896438598,
|
|
"num_tokens": 57720763.0,
|
|
"step": 25170
|
|
},
|
|
{
|
|
"entropy": 5.112240743637085,
|
|
"epoch": 2.4183477425552353,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00044165744023346614,
|
|
"loss": 4.8264,
|
|
"mean_token_accuracy": 0.22955900579690933,
|
|
"num_tokens": 57731581.0,
|
|
"step": 25175
|
|
},
|
|
{
|
|
"entropy": 5.151104831695557,
|
|
"epoch": 2.418828049951969,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00044163440536465904,
|
|
"loss": 4.8684,
|
|
"mean_token_accuracy": 0.23169025778770447,
|
|
"num_tokens": 57742473.0,
|
|
"step": 25180
|
|
},
|
|
{
|
|
"entropy": 5.178462362289428,
|
|
"epoch": 2.419308357348703,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044161136662709577,
|
|
"loss": 4.876,
|
|
"mean_token_accuracy": 0.22610796988010406,
|
|
"num_tokens": 57754317.0,
|
|
"step": 25185
|
|
},
|
|
{
|
|
"entropy": 5.075574684143066,
|
|
"epoch": 2.419788664745437,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00044158832402131133,
|
|
"loss": 4.7637,
|
|
"mean_token_accuracy": 0.2303366556763649,
|
|
"num_tokens": 57765882.0,
|
|
"step": 25190
|
|
},
|
|
{
|
|
"entropy": 5.116954517364502,
|
|
"epoch": 2.420268972142171,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00044156527754784066,
|
|
"loss": 4.7507,
|
|
"mean_token_accuracy": 0.23588968962430953,
|
|
"num_tokens": 57776977.0,
|
|
"step": 25195
|
|
},
|
|
{
|
|
"entropy": 5.129793405532837,
|
|
"epoch": 2.4207492795389047,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00044154222720721887,
|
|
"loss": 4.8295,
|
|
"mean_token_accuracy": 0.2288350611925125,
|
|
"num_tokens": 57788652.0,
|
|
"step": 25200
|
|
},
|
|
{
|
|
"entropy": 5.172764730453491,
|
|
"epoch": 2.421229586935639,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000441519172999981,
|
|
"loss": 4.8086,
|
|
"mean_token_accuracy": 0.22164968103170396,
|
|
"num_tokens": 57799660.0,
|
|
"step": 25205
|
|
},
|
|
{
|
|
"entropy": 5.151558542251587,
|
|
"epoch": 2.4217098943323725,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004414961149266625,
|
|
"loss": 4.8751,
|
|
"mean_token_accuracy": 0.22082775533199311,
|
|
"num_tokens": 57811754.0,
|
|
"step": 25210
|
|
},
|
|
{
|
|
"entropy": 5.142359209060669,
|
|
"epoch": 2.422190201729107,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044147305298779856,
|
|
"loss": 4.7975,
|
|
"mean_token_accuracy": 0.23699275106191636,
|
|
"num_tokens": 57823470.0,
|
|
"step": 25215
|
|
},
|
|
{
|
|
"entropy": 5.211223363876343,
|
|
"epoch": 2.4226705091258407,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004414499871839247,
|
|
"loss": 4.8946,
|
|
"mean_token_accuracy": 0.22818945497274398,
|
|
"num_tokens": 57834917.0,
|
|
"step": 25220
|
|
},
|
|
{
|
|
"entropy": 5.136872911453247,
|
|
"epoch": 2.4231508165225746,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004414269175155763,
|
|
"loss": 4.8005,
|
|
"mean_token_accuracy": 0.2274375304579735,
|
|
"num_tokens": 57844655.0,
|
|
"step": 25225
|
|
},
|
|
{
|
|
"entropy": 5.085866117477417,
|
|
"epoch": 2.4236311239193085,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004414038439832891,
|
|
"loss": 4.8088,
|
|
"mean_token_accuracy": 0.23270874172449113,
|
|
"num_tokens": 57856019.0,
|
|
"step": 25230
|
|
},
|
|
{
|
|
"entropy": 5.147326040267944,
|
|
"epoch": 2.4241114313160423,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004413807665875988,
|
|
"loss": 4.7681,
|
|
"mean_token_accuracy": 0.23173436522483826,
|
|
"num_tokens": 57867661.0,
|
|
"step": 25235
|
|
},
|
|
{
|
|
"entropy": 5.150324010848999,
|
|
"epoch": 2.4245917387127762,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044135768532904104,
|
|
"loss": 4.7815,
|
|
"mean_token_accuracy": 0.22542063891887665,
|
|
"num_tokens": 57879899.0,
|
|
"step": 25240
|
|
},
|
|
{
|
|
"entropy": 5.136937427520752,
|
|
"epoch": 2.42507204610951,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000441334600208152,
|
|
"loss": 4.8528,
|
|
"mean_token_accuracy": 0.22068443894386292,
|
|
"num_tokens": 57891168.0,
|
|
"step": 25245
|
|
},
|
|
{
|
|
"entropy": 5.083675765991211,
|
|
"epoch": 2.425552353506244,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00044131151122546724,
|
|
"loss": 4.7149,
|
|
"mean_token_accuracy": 0.23319132924079894,
|
|
"num_tokens": 57901997.0,
|
|
"step": 25250
|
|
},
|
|
{
|
|
"entropy": 5.115674448013306,
|
|
"epoch": 2.426032660902978,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044128841838152313,
|
|
"loss": 4.7814,
|
|
"mean_token_accuracy": 0.23135359287261964,
|
|
"num_tokens": 57913380.0,
|
|
"step": 25255
|
|
},
|
|
{
|
|
"entropy": 5.047495317459107,
|
|
"epoch": 2.4265129682997117,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004412653216768558,
|
|
"loss": 4.6851,
|
|
"mean_token_accuracy": 0.23554279059171676,
|
|
"num_tokens": 57923675.0,
|
|
"step": 25260
|
|
},
|
|
{
|
|
"entropy": 5.1381189823150635,
|
|
"epoch": 2.4269932756964456,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004412422211120013,
|
|
"loss": 4.8304,
|
|
"mean_token_accuracy": 0.2330308437347412,
|
|
"num_tokens": 57935885.0,
|
|
"step": 25265
|
|
},
|
|
{
|
|
"entropy": 5.08838529586792,
|
|
"epoch": 2.4274735830931795,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004412191166874961,
|
|
"loss": 4.7604,
|
|
"mean_token_accuracy": 0.23321136087179184,
|
|
"num_tokens": 57946551.0,
|
|
"step": 25270
|
|
},
|
|
{
|
|
"entropy": 5.0506280899047855,
|
|
"epoch": 2.4279538904899134,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004411960084038766,
|
|
"loss": 4.6814,
|
|
"mean_token_accuracy": 0.24472323954105377,
|
|
"num_tokens": 57957778.0,
|
|
"step": 25275
|
|
},
|
|
{
|
|
"entropy": 5.0799188137054445,
|
|
"epoch": 2.4284341978866473,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044117289626167917,
|
|
"loss": 4.7983,
|
|
"mean_token_accuracy": 0.2373049721121788,
|
|
"num_tokens": 57969256.0,
|
|
"step": 25280
|
|
},
|
|
{
|
|
"entropy": 5.145950269699097,
|
|
"epoch": 2.428914505283381,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004411497802614406,
|
|
"loss": 4.7595,
|
|
"mean_token_accuracy": 0.2394431099295616,
|
|
"num_tokens": 57981368.0,
|
|
"step": 25285
|
|
},
|
|
{
|
|
"entropy": 5.103071928024292,
|
|
"epoch": 2.4293948126801155,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004411266604036975,
|
|
"loss": 4.8485,
|
|
"mean_token_accuracy": 0.23120342344045638,
|
|
"num_tokens": 57992862.0,
|
|
"step": 25290
|
|
},
|
|
{
|
|
"entropy": 5.055536603927612,
|
|
"epoch": 2.4298751200768494,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044110353668898674,
|
|
"loss": 4.6424,
|
|
"mean_token_accuracy": 0.24648849815130233,
|
|
"num_tokens": 58003492.0,
|
|
"step": 25295
|
|
},
|
|
{
|
|
"entropy": 5.051392936706543,
|
|
"epoch": 2.4303554274735832,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004410804091178449,
|
|
"loss": 4.6773,
|
|
"mean_token_accuracy": 0.23831250369548798,
|
|
"num_tokens": 58014623.0,
|
|
"step": 25300
|
|
},
|
|
{
|
|
"entropy": 5.129943037033081,
|
|
"epoch": 2.430835734870317,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004410572776908092,
|
|
"loss": 4.8595,
|
|
"mean_token_accuracy": 0.22626224607229234,
|
|
"num_tokens": 58024886.0,
|
|
"step": 25305
|
|
},
|
|
{
|
|
"entropy": 5.164258241653442,
|
|
"epoch": 2.431316042267051,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044103414240841664,
|
|
"loss": 4.837,
|
|
"mean_token_accuracy": 0.23164584636688232,
|
|
"num_tokens": 58035998.0,
|
|
"step": 25310
|
|
},
|
|
{
|
|
"entropy": 5.151918363571167,
|
|
"epoch": 2.431796349663785,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004410110032712043,
|
|
"loss": 4.7838,
|
|
"mean_token_accuracy": 0.23683720380067824,
|
|
"num_tokens": 58046577.0,
|
|
"step": 25315
|
|
},
|
|
{
|
|
"entropy": 5.03354926109314,
|
|
"epoch": 2.4322766570605188,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004409878602797094,
|
|
"loss": 4.7273,
|
|
"mean_token_accuracy": 0.2367846444249153,
|
|
"num_tokens": 58057711.0,
|
|
"step": 25320
|
|
},
|
|
{
|
|
"entropy": 5.1005795955657955,
|
|
"epoch": 2.4327569644572526,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00044096471343446923,
|
|
"loss": 4.7228,
|
|
"mean_token_accuracy": 0.2318343847990036,
|
|
"num_tokens": 58069657.0,
|
|
"step": 25325
|
|
},
|
|
{
|
|
"entropy": 5.083721780776978,
|
|
"epoch": 2.4332372718539865,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004409415627360213,
|
|
"loss": 4.7779,
|
|
"mean_token_accuracy": 0.2350016176700592,
|
|
"num_tokens": 58081081.0,
|
|
"step": 25330
|
|
},
|
|
{
|
|
"entropy": 5.1245338916778564,
|
|
"epoch": 2.4337175792507204,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00044091840818490303,
|
|
"loss": 4.7303,
|
|
"mean_token_accuracy": 0.2299958735704422,
|
|
"num_tokens": 58091561.0,
|
|
"step": 25335
|
|
},
|
|
{
|
|
"entropy": 5.00467004776001,
|
|
"epoch": 2.4341978866474543,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00044089524978165197,
|
|
"loss": 4.7628,
|
|
"mean_token_accuracy": 0.23996685147285463,
|
|
"num_tokens": 58102640.0,
|
|
"step": 25340
|
|
},
|
|
{
|
|
"entropy": 5.1055091381072994,
|
|
"epoch": 2.434678194044188,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00044087208752680577,
|
|
"loss": 4.8772,
|
|
"mean_token_accuracy": 0.22194685488939286,
|
|
"num_tokens": 58113356.0,
|
|
"step": 25345
|
|
},
|
|
{
|
|
"entropy": 5.228708505630493,
|
|
"epoch": 2.435158501440922,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004408489214209023,
|
|
"loss": 4.8862,
|
|
"mean_token_accuracy": 0.22105642706155776,
|
|
"num_tokens": 58125437.0,
|
|
"step": 25350
|
|
},
|
|
{
|
|
"entropy": 5.217840385437012,
|
|
"epoch": 2.435638808837656,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004408257514644793,
|
|
"loss": 4.8646,
|
|
"mean_token_accuracy": 0.22782525420188904,
|
|
"num_tokens": 58138491.0,
|
|
"step": 25355
|
|
},
|
|
{
|
|
"entropy": 5.073573255538941,
|
|
"epoch": 2.43611911623439,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044080257765807476,
|
|
"loss": 4.7787,
|
|
"mean_token_accuracy": 0.22393606305122377,
|
|
"num_tokens": 58150200.0,
|
|
"step": 25360
|
|
},
|
|
{
|
|
"entropy": 5.111953592300415,
|
|
"epoch": 2.436599423631124,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004407794000022267,
|
|
"loss": 4.8759,
|
|
"mean_token_accuracy": 0.23408314138650893,
|
|
"num_tokens": 58162467.0,
|
|
"step": 25365
|
|
},
|
|
{
|
|
"entropy": 5.198561382293701,
|
|
"epoch": 2.437079731027858,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004407562184974732,
|
|
"loss": 4.9004,
|
|
"mean_token_accuracy": 0.22066261023283004,
|
|
"num_tokens": 58175052.0,
|
|
"step": 25370
|
|
},
|
|
{
|
|
"entropy": 5.134199714660644,
|
|
"epoch": 2.437560038424592,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004407330331443526,
|
|
"loss": 4.7854,
|
|
"mean_token_accuracy": 0.2339022383093834,
|
|
"num_tokens": 58186999.0,
|
|
"step": 25375
|
|
},
|
|
{
|
|
"entropy": 5.0688145637512205,
|
|
"epoch": 2.438040345821326,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000440709843943403,
|
|
"loss": 4.6838,
|
|
"mean_token_accuracy": 0.23762887567281724,
|
|
"num_tokens": 58197811.0,
|
|
"step": 25380
|
|
},
|
|
{
|
|
"entropy": 5.206927680969239,
|
|
"epoch": 2.4385206532180597,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000440686650895163,
|
|
"loss": 4.9039,
|
|
"mean_token_accuracy": 0.22684629559516906,
|
|
"num_tokens": 58209545.0,
|
|
"step": 25385
|
|
},
|
|
{
|
|
"entropy": 5.0824668407440186,
|
|
"epoch": 2.4390009606147935,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00044066345400017084,
|
|
"loss": 4.7038,
|
|
"mean_token_accuracy": 0.23633986413478852,
|
|
"num_tokens": 58220994.0,
|
|
"step": 25390
|
|
},
|
|
{
|
|
"entropy": 5.027984094619751,
|
|
"epoch": 2.4394812680115274,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00044064025325896524,
|
|
"loss": 4.6381,
|
|
"mean_token_accuracy": 0.24538477808237075,
|
|
"num_tokens": 58232778.0,
|
|
"step": 25395
|
|
},
|
|
{
|
|
"entropy": 5.163522481918335,
|
|
"epoch": 2.4399615754082613,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00044061704867208484,
|
|
"loss": 4.8416,
|
|
"mean_token_accuracy": 0.23376935720443726,
|
|
"num_tokens": 58243889.0,
|
|
"step": 25400
|
|
},
|
|
{
|
|
"entropy": 5.054913663864136,
|
|
"epoch": 2.440441882804995,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00044059384024006825,
|
|
"loss": 4.7418,
|
|
"mean_token_accuracy": 0.23509032428264617,
|
|
"num_tokens": 58256144.0,
|
|
"step": 25405
|
|
},
|
|
{
|
|
"entropy": 5.045040321350098,
|
|
"epoch": 2.440922190201729,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004405706279634545,
|
|
"loss": 4.7683,
|
|
"mean_token_accuracy": 0.2350820556282997,
|
|
"num_tokens": 58267509.0,
|
|
"step": 25410
|
|
},
|
|
{
|
|
"entropy": 5.059934377670288,
|
|
"epoch": 2.441402497598463,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00044054741184278243,
|
|
"loss": 4.6942,
|
|
"mean_token_accuracy": 0.24670542627573014,
|
|
"num_tokens": 58278880.0,
|
|
"step": 25415
|
|
},
|
|
{
|
|
"entropy": 5.173495578765869,
|
|
"epoch": 2.441882804995197,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00044052419187859095,
|
|
"loss": 4.7589,
|
|
"mean_token_accuracy": 0.2268371656537056,
|
|
"num_tokens": 58290046.0,
|
|
"step": 25420
|
|
},
|
|
{
|
|
"entropy": 5.016854190826416,
|
|
"epoch": 2.4423631123919307,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004405009680714193,
|
|
"loss": 4.6244,
|
|
"mean_token_accuracy": 0.2436349555850029,
|
|
"num_tokens": 58301406.0,
|
|
"step": 25425
|
|
},
|
|
{
|
|
"entropy": 5.0364861488342285,
|
|
"epoch": 2.4428434197886646,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004404777404218065,
|
|
"loss": 4.7178,
|
|
"mean_token_accuracy": 0.23964912444353104,
|
|
"num_tokens": 58313064.0,
|
|
"step": 25430
|
|
},
|
|
{
|
|
"entropy": 5.1077882766723635,
|
|
"epoch": 2.4433237271853985,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000440454508930292,
|
|
"loss": 4.8441,
|
|
"mean_token_accuracy": 0.22159260958433152,
|
|
"num_tokens": 58324779.0,
|
|
"step": 25435
|
|
},
|
|
{
|
|
"entropy": 5.182451915740967,
|
|
"epoch": 2.443804034582133,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004404312735974152,
|
|
"loss": 4.8368,
|
|
"mean_token_accuracy": 0.2312808156013489,
|
|
"num_tokens": 58336064.0,
|
|
"step": 25440
|
|
},
|
|
{
|
|
"entropy": 5.032112169265747,
|
|
"epoch": 2.4442843419788662,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044040803442371533,
|
|
"loss": 4.6852,
|
|
"mean_token_accuracy": 0.23814561814069748,
|
|
"num_tokens": 58347497.0,
|
|
"step": 25445
|
|
},
|
|
{
|
|
"entropy": 4.990532445907593,
|
|
"epoch": 2.4447646493756006,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004403847914097321,
|
|
"loss": 4.6822,
|
|
"mean_token_accuracy": 0.2447928160429001,
|
|
"num_tokens": 58357716.0,
|
|
"step": 25450
|
|
},
|
|
{
|
|
"entropy": 5.114711761474609,
|
|
"epoch": 2.4452449567723344,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00044036154455600517,
|
|
"loss": 4.8153,
|
|
"mean_token_accuracy": 0.23038498014211656,
|
|
"num_tokens": 58368451.0,
|
|
"step": 25455
|
|
},
|
|
{
|
|
"entropy": 5.123874950408935,
|
|
"epoch": 2.4457252641690683,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004403382938630741,
|
|
"loss": 4.7933,
|
|
"mean_token_accuracy": 0.22636810839176177,
|
|
"num_tokens": 58378915.0,
|
|
"step": 25460
|
|
},
|
|
{
|
|
"entropy": 5.079519605636596,
|
|
"epoch": 2.446205571565802,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00044031503933147887,
|
|
"loss": 4.7004,
|
|
"mean_token_accuracy": 0.24838587641716003,
|
|
"num_tokens": 58390251.0,
|
|
"step": 25465
|
|
},
|
|
{
|
|
"entropy": 5.059087228775025,
|
|
"epoch": 2.446685878962536,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044029178096175934,
|
|
"loss": 4.7154,
|
|
"mean_token_accuracy": 0.23726015239953996,
|
|
"num_tokens": 58401390.0,
|
|
"step": 25470
|
|
},
|
|
{
|
|
"entropy": 5.05997052192688,
|
|
"epoch": 2.44716618635927,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004402685187544554,
|
|
"loss": 4.7455,
|
|
"mean_token_accuracy": 0.24133506268262864,
|
|
"num_tokens": 58412530.0,
|
|
"step": 25475
|
|
},
|
|
{
|
|
"entropy": 5.161045837402344,
|
|
"epoch": 2.447646493756004,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004402452527101072,
|
|
"loss": 4.848,
|
|
"mean_token_accuracy": 0.22996888011693956,
|
|
"num_tokens": 58424437.0,
|
|
"step": 25480
|
|
},
|
|
{
|
|
"entropy": 5.153648948669433,
|
|
"epoch": 2.4481268011527377,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004402219828292549,
|
|
"loss": 4.8021,
|
|
"mean_token_accuracy": 0.22747268825769423,
|
|
"num_tokens": 58435661.0,
|
|
"step": 25485
|
|
},
|
|
{
|
|
"entropy": 5.1356048583984375,
|
|
"epoch": 2.4486071085494716,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004401987091124388,
|
|
"loss": 4.9027,
|
|
"mean_token_accuracy": 0.21983251720666885,
|
|
"num_tokens": 58445840.0,
|
|
"step": 25490
|
|
},
|
|
{
|
|
"entropy": 5.135798072814941,
|
|
"epoch": 2.4490874159462055,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004401754315601992,
|
|
"loss": 4.8442,
|
|
"mean_token_accuracy": 0.22019636034965515,
|
|
"num_tokens": 58460160.0,
|
|
"step": 25495
|
|
},
|
|
{
|
|
"entropy": 5.105370426177979,
|
|
"epoch": 2.4495677233429394,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004401521501730765,
|
|
"loss": 4.6796,
|
|
"mean_token_accuracy": 0.24194978177547455,
|
|
"num_tokens": 58471257.0,
|
|
"step": 25500
|
|
},
|
|
{
|
|
"entropy": 4.981129550933838,
|
|
"epoch": 2.4500480307396733,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00044012886495161144,
|
|
"loss": 4.7116,
|
|
"mean_token_accuracy": 0.23703904449939728,
|
|
"num_tokens": 58482830.0,
|
|
"step": 25505
|
|
},
|
|
{
|
|
"entropy": 5.105340099334716,
|
|
"epoch": 2.450528338136407,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004401055758963443,
|
|
"loss": 4.7269,
|
|
"mean_token_accuracy": 0.24424219131469727,
|
|
"num_tokens": 58494278.0,
|
|
"step": 25510
|
|
},
|
|
{
|
|
"entropy": 5.112192296981812,
|
|
"epoch": 2.451008645533141,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000440082283007816,
|
|
"loss": 4.7334,
|
|
"mean_token_accuracy": 0.22664321213960648,
|
|
"num_tokens": 58505653.0,
|
|
"step": 25515
|
|
},
|
|
{
|
|
"entropy": 5.135557985305786,
|
|
"epoch": 2.451488952929875,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00044005898628656734,
|
|
"loss": 4.7579,
|
|
"mean_token_accuracy": 0.23311397582292556,
|
|
"num_tokens": 58516906.0,
|
|
"step": 25520
|
|
},
|
|
{
|
|
"entropy": 5.103575658798218,
|
|
"epoch": 2.4519692603266092,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000440035685733139,
|
|
"loss": 4.7533,
|
|
"mean_token_accuracy": 0.23946669548749924,
|
|
"num_tokens": 58527326.0,
|
|
"step": 25525
|
|
},
|
|
{
|
|
"entropy": 5.078144502639771,
|
|
"epoch": 2.452449567723343,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004400123813480722,
|
|
"loss": 4.7108,
|
|
"mean_token_accuracy": 0.24113842248916625,
|
|
"num_tokens": 58537749.0,
|
|
"step": 25530
|
|
},
|
|
{
|
|
"entropy": 5.087492990493774,
|
|
"epoch": 2.452929875120077,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043998907313190787,
|
|
"loss": 4.7554,
|
|
"mean_token_accuracy": 0.2300448402762413,
|
|
"num_tokens": 58549479.0,
|
|
"step": 25535
|
|
},
|
|
{
|
|
"entropy": 5.093681192398071,
|
|
"epoch": 2.453410182516811,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004399657610851873,
|
|
"loss": 4.7711,
|
|
"mean_token_accuracy": 0.23146681785583495,
|
|
"num_tokens": 58561051.0,
|
|
"step": 25540
|
|
},
|
|
{
|
|
"entropy": 5.136078310012818,
|
|
"epoch": 2.4538904899135447,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043994244520845146,
|
|
"loss": 4.8589,
|
|
"mean_token_accuracy": 0.22926601022481918,
|
|
"num_tokens": 58571482.0,
|
|
"step": 25545
|
|
},
|
|
{
|
|
"entropy": 5.14628643989563,
|
|
"epoch": 2.4543707973102786,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004399191255022418,
|
|
"loss": 4.7427,
|
|
"mean_token_accuracy": 0.23322267979383468,
|
|
"num_tokens": 58584369.0,
|
|
"step": 25550
|
|
},
|
|
{
|
|
"entropy": 5.113161659240722,
|
|
"epoch": 2.4548511047070125,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004398958019670998,
|
|
"loss": 4.7312,
|
|
"mean_token_accuracy": 0.2340619221329689,
|
|
"num_tokens": 58595786.0,
|
|
"step": 25555
|
|
},
|
|
{
|
|
"entropy": 5.0093278884887695,
|
|
"epoch": 2.4553314121037464,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043987247460356696,
|
|
"loss": 4.7354,
|
|
"mean_token_accuracy": 0.233761328458786,
|
|
"num_tokens": 58608536.0,
|
|
"step": 25560
|
|
},
|
|
{
|
|
"entropy": 5.031365156173706,
|
|
"epoch": 2.4558117195004803,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004398491434121847,
|
|
"loss": 4.6684,
|
|
"mean_token_accuracy": 0.2444691464304924,
|
|
"num_tokens": 58620981.0,
|
|
"step": 25565
|
|
},
|
|
{
|
|
"entropy": 5.117167186737061,
|
|
"epoch": 2.456292026897214,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000439825808393495,
|
|
"loss": 4.7756,
|
|
"mean_token_accuracy": 0.2298066183924675,
|
|
"num_tokens": 58632802.0,
|
|
"step": 25570
|
|
},
|
|
{
|
|
"entropy": 5.1339428424835205,
|
|
"epoch": 2.456772334293948,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004398024695480394,
|
|
"loss": 4.819,
|
|
"mean_token_accuracy": 0.23125423789024352,
|
|
"num_tokens": 58644577.0,
|
|
"step": 25575
|
|
},
|
|
{
|
|
"entropy": 5.117880201339721,
|
|
"epoch": 2.457252641690682,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004397791268763598,
|
|
"loss": 4.7427,
|
|
"mean_token_accuracy": 0.2311472088098526,
|
|
"num_tokens": 58654970.0,
|
|
"step": 25580
|
|
},
|
|
{
|
|
"entropy": 5.114732074737549,
|
|
"epoch": 2.457732949087416,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043975578037899814,
|
|
"loss": 4.8628,
|
|
"mean_token_accuracy": 0.2292526826262474,
|
|
"num_tokens": 58666534.0,
|
|
"step": 25585
|
|
},
|
|
{
|
|
"entropy": 5.08922929763794,
|
|
"epoch": 2.4582132564841497,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004397324300564966,
|
|
"loss": 4.7486,
|
|
"mean_token_accuracy": 0.23776388466358184,
|
|
"num_tokens": 58677451.0,
|
|
"step": 25590
|
|
},
|
|
{
|
|
"entropy": 5.102073621749878,
|
|
"epoch": 2.4586935638808836,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004397090759093971,
|
|
"loss": 4.7518,
|
|
"mean_token_accuracy": 0.23649442344903945,
|
|
"num_tokens": 58689179.0,
|
|
"step": 25595
|
|
},
|
|
{
|
|
"entropy": 5.10036883354187,
|
|
"epoch": 2.459173871277618,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043968571793824194,
|
|
"loss": 4.8596,
|
|
"mean_token_accuracy": 0.22949687093496324,
|
|
"num_tokens": 58701939.0,
|
|
"step": 25600
|
|
},
|
|
{
|
|
"entropy": 5.045658206939697,
|
|
"epoch": 2.4596541786743518,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004396623561435734,
|
|
"loss": 4.6265,
|
|
"mean_token_accuracy": 0.24700823426246643,
|
|
"num_tokens": 58714302.0,
|
|
"step": 25605
|
|
},
|
|
{
|
|
"entropy": 5.052271366119385,
|
|
"epoch": 2.4601344860710856,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004396389905259339,
|
|
"loss": 4.7653,
|
|
"mean_token_accuracy": 0.22990579009056092,
|
|
"num_tokens": 58727619.0,
|
|
"step": 25610
|
|
},
|
|
{
|
|
"entropy": 5.145890998840332,
|
|
"epoch": 2.4606147934678195,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00043961562108586603,
|
|
"loss": 4.8212,
|
|
"mean_token_accuracy": 0.22512982934713363,
|
|
"num_tokens": 58738929.0,
|
|
"step": 25615
|
|
},
|
|
{
|
|
"entropy": 5.2230690002441404,
|
|
"epoch": 2.4610951008645534,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043959224782391215,
|
|
"loss": 4.8429,
|
|
"mean_token_accuracy": 0.23013273477554322,
|
|
"num_tokens": 58751771.0,
|
|
"step": 25620
|
|
},
|
|
{
|
|
"entropy": 5.004738235473633,
|
|
"epoch": 2.4615754082612873,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000439568870740615,
|
|
"loss": 4.6611,
|
|
"mean_token_accuracy": 0.2486381411552429,
|
|
"num_tokens": 58762304.0,
|
|
"step": 25625
|
|
},
|
|
{
|
|
"entropy": 5.052642393112182,
|
|
"epoch": 2.462055715658021,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004395454898365174,
|
|
"loss": 4.7486,
|
|
"mean_token_accuracy": 0.22977619022130966,
|
|
"num_tokens": 58775222.0,
|
|
"step": 25630
|
|
},
|
|
{
|
|
"entropy": 5.007144784927368,
|
|
"epoch": 2.462536023054755,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043952210511216205,
|
|
"loss": 4.7316,
|
|
"mean_token_accuracy": 0.23972392976284027,
|
|
"num_tokens": 58785904.0,
|
|
"step": 25635
|
|
},
|
|
{
|
|
"entropy": 5.0395995616912845,
|
|
"epoch": 2.463016330451489,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00043949871656809205,
|
|
"loss": 4.7426,
|
|
"mean_token_accuracy": 0.2365034982562065,
|
|
"num_tokens": 58796186.0,
|
|
"step": 25640
|
|
},
|
|
{
|
|
"entropy": 5.159918355941772,
|
|
"epoch": 2.463496637848223,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00043947532420485024,
|
|
"loss": 4.8109,
|
|
"mean_token_accuracy": 0.23558780550956726,
|
|
"num_tokens": 58807317.0,
|
|
"step": 25645
|
|
},
|
|
{
|
|
"entropy": 5.176284885406494,
|
|
"epoch": 2.4639769452449567,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004394519280229798,
|
|
"loss": 4.8786,
|
|
"mean_token_accuracy": 0.22497029304504396,
|
|
"num_tokens": 58819108.0,
|
|
"step": 25650
|
|
},
|
|
{
|
|
"entropy": 5.087363910675049,
|
|
"epoch": 2.4644572526416906,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043942852802302397,
|
|
"loss": 4.8114,
|
|
"mean_token_accuracy": 0.22998191565275192,
|
|
"num_tokens": 58830962.0,
|
|
"step": 25655
|
|
},
|
|
{
|
|
"entropy": 5.166617107391358,
|
|
"epoch": 2.4649375600384245,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004394051242055259,
|
|
"loss": 4.8907,
|
|
"mean_token_accuracy": 0.22262165695428848,
|
|
"num_tokens": 58843236.0,
|
|
"step": 25660
|
|
},
|
|
{
|
|
"entropy": 5.128620433807373,
|
|
"epoch": 2.4654178674351583,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004393817165710291,
|
|
"loss": 4.7161,
|
|
"mean_token_accuracy": 0.2342569798231125,
|
|
"num_tokens": 58853995.0,
|
|
"step": 25665
|
|
},
|
|
{
|
|
"entropy": 5.0780110359191895,
|
|
"epoch": 2.465898174831892,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043935830512007687,
|
|
"loss": 4.7098,
|
|
"mean_token_accuracy": 0.24568893015384674,
|
|
"num_tokens": 58866563.0,
|
|
"step": 25670
|
|
},
|
|
{
|
|
"entropy": 5.14299054145813,
|
|
"epoch": 2.4663784822286265,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00043933488985321286,
|
|
"loss": 4.8673,
|
|
"mean_token_accuracy": 0.22950955629348754,
|
|
"num_tokens": 58878660.0,
|
|
"step": 25675
|
|
},
|
|
{
|
|
"entropy": 5.0542590618133545,
|
|
"epoch": 2.46685878962536,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004393114707709807,
|
|
"loss": 4.7065,
|
|
"mean_token_accuracy": 0.2350746512413025,
|
|
"num_tokens": 58890332.0,
|
|
"step": 25680
|
|
},
|
|
{
|
|
"entropy": 5.098819255828857,
|
|
"epoch": 2.4673390970220943,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004392880478739241,
|
|
"loss": 4.7921,
|
|
"mean_token_accuracy": 0.2282637909054756,
|
|
"num_tokens": 58902228.0,
|
|
"step": 25685
|
|
},
|
|
{
|
|
"entropy": 5.1141167163848875,
|
|
"epoch": 2.467819404418828,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004392646211625869,
|
|
"loss": 4.7941,
|
|
"mean_token_accuracy": 0.23558991700410842,
|
|
"num_tokens": 58914452.0,
|
|
"step": 25690
|
|
},
|
|
{
|
|
"entropy": 5.116840028762818,
|
|
"epoch": 2.468299711815562,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004392411906375129,
|
|
"loss": 4.7676,
|
|
"mean_token_accuracy": 0.22971803247928618,
|
|
"num_tokens": 58926079.0,
|
|
"step": 25695
|
|
},
|
|
{
|
|
"entropy": 5.092357540130616,
|
|
"epoch": 2.468780019212296,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00043921775629924615,
|
|
"loss": 4.7433,
|
|
"mean_token_accuracy": 0.23688182830810547,
|
|
"num_tokens": 58937292.0,
|
|
"step": 25700
|
|
},
|
|
{
|
|
"entropy": 5.097441005706787,
|
|
"epoch": 2.46926032660903,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043919431814833077,
|
|
"loss": 4.8116,
|
|
"mean_token_accuracy": 0.23292779326438903,
|
|
"num_tokens": 58949425.0,
|
|
"step": 25705
|
|
},
|
|
{
|
|
"entropy": 5.1808027744293215,
|
|
"epoch": 2.4697406340057637,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043917087618531084,
|
|
"loss": 4.7755,
|
|
"mean_token_accuracy": 0.23483059853315352,
|
|
"num_tokens": 58961620.0,
|
|
"step": 25710
|
|
},
|
|
{
|
|
"entropy": 5.1284150123596195,
|
|
"epoch": 2.4702209414024976,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004391474304107307,
|
|
"loss": 4.7137,
|
|
"mean_token_accuracy": 0.2397887110710144,
|
|
"num_tokens": 58973043.0,
|
|
"step": 25715
|
|
},
|
|
{
|
|
"entropy": 5.0684206008911135,
|
|
"epoch": 2.4707012487992315,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043912398082513463,
|
|
"loss": 4.8725,
|
|
"mean_token_accuracy": 0.22367147654294967,
|
|
"num_tokens": 58984516.0,
|
|
"step": 25720
|
|
},
|
|
{
|
|
"entropy": 5.085205459594727,
|
|
"epoch": 2.4711815561959654,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000439100527429067,
|
|
"loss": 4.7149,
|
|
"mean_token_accuracy": 0.23901582807302474,
|
|
"num_tokens": 58996473.0,
|
|
"step": 25725
|
|
},
|
|
{
|
|
"entropy": 5.107437419891357,
|
|
"epoch": 2.4716618635926992,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043907707022307243,
|
|
"loss": 4.7239,
|
|
"mean_token_accuracy": 0.2298218384385109,
|
|
"num_tokens": 59009001.0,
|
|
"step": 25730
|
|
},
|
|
{
|
|
"entropy": 5.174347257614135,
|
|
"epoch": 2.472142170989433,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00043905360920769553,
|
|
"loss": 4.8174,
|
|
"mean_token_accuracy": 0.23512738794088364,
|
|
"num_tokens": 59021238.0,
|
|
"step": 25735
|
|
},
|
|
{
|
|
"entropy": 5.152868318557739,
|
|
"epoch": 2.472622478386167,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000439030144383481,
|
|
"loss": 4.8156,
|
|
"mean_token_accuracy": 0.23765633702278138,
|
|
"num_tokens": 59031311.0,
|
|
"step": 25740
|
|
},
|
|
{
|
|
"entropy": 5.0604266166687015,
|
|
"epoch": 2.473102785782901,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00043900667575097355,
|
|
"loss": 4.7536,
|
|
"mean_token_accuracy": 0.2329123303294182,
|
|
"num_tokens": 59043151.0,
|
|
"step": 25745
|
|
},
|
|
{
|
|
"entropy": 5.134230995178223,
|
|
"epoch": 2.473583093179635,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004389832033107181,
|
|
"loss": 4.8443,
|
|
"mean_token_accuracy": 0.2332065299153328,
|
|
"num_tokens": 59054513.0,
|
|
"step": 25750
|
|
},
|
|
{
|
|
"entropy": 5.129967975616455,
|
|
"epoch": 2.4740634005763686,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00043895972706325953,
|
|
"loss": 4.7195,
|
|
"mean_token_accuracy": 0.237641379237175,
|
|
"num_tokens": 59064909.0,
|
|
"step": 25755
|
|
},
|
|
{
|
|
"entropy": 5.071166515350342,
|
|
"epoch": 2.474543707973103,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000438936247009143,
|
|
"loss": 4.7855,
|
|
"mean_token_accuracy": 0.22626807391643525,
|
|
"num_tokens": 59077089.0,
|
|
"step": 25760
|
|
},
|
|
{
|
|
"entropy": 5.099506616592407,
|
|
"epoch": 2.475024015369837,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00043891276314891365,
|
|
"loss": 4.7394,
|
|
"mean_token_accuracy": 0.23274567127227783,
|
|
"num_tokens": 59089188.0,
|
|
"step": 25765
|
|
},
|
|
{
|
|
"entropy": 5.167963123321533,
|
|
"epoch": 2.4755043227665707,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004388892754831166,
|
|
"loss": 4.8073,
|
|
"mean_token_accuracy": 0.22330971658229828,
|
|
"num_tokens": 59101384.0,
|
|
"step": 25770
|
|
},
|
|
{
|
|
"entropy": 5.132992935180664,
|
|
"epoch": 2.4759846301633046,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004388657840122973,
|
|
"loss": 4.7529,
|
|
"mean_token_accuracy": 0.23194568306207658,
|
|
"num_tokens": 59112327.0,
|
|
"step": 25775
|
|
},
|
|
{
|
|
"entropy": 5.120018386840821,
|
|
"epoch": 2.4764649375600385,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000438842288737001,
|
|
"loss": 4.8109,
|
|
"mean_token_accuracy": 0.22848654985427858,
|
|
"num_tokens": 59123577.0,
|
|
"step": 25780
|
|
},
|
|
{
|
|
"entropy": 5.031744861602784,
|
|
"epoch": 2.4769452449567724,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00043881878965777325,
|
|
"loss": 4.6937,
|
|
"mean_token_accuracy": 0.24387203007936478,
|
|
"num_tokens": 59135359.0,
|
|
"step": 25785
|
|
},
|
|
{
|
|
"entropy": 5.032623624801635,
|
|
"epoch": 2.4774255523535063,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00043879528677515973,
|
|
"loss": 4.6516,
|
|
"mean_token_accuracy": 0.24132218658924104,
|
|
"num_tokens": 59146679.0,
|
|
"step": 25790
|
|
},
|
|
{
|
|
"entropy": 5.063320732116699,
|
|
"epoch": 2.47790585975024,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00043877178008970596,
|
|
"loss": 4.788,
|
|
"mean_token_accuracy": 0.232273106276989,
|
|
"num_tokens": 59159893.0,
|
|
"step": 25795
|
|
},
|
|
{
|
|
"entropy": 5.072348546981812,
|
|
"epoch": 2.478386167146974,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004387482696019578,
|
|
"loss": 4.7349,
|
|
"mean_token_accuracy": 0.2393971264362335,
|
|
"num_tokens": 59172592.0,
|
|
"step": 25800
|
|
},
|
|
{
|
|
"entropy": 5.076195192337036,
|
|
"epoch": 2.478866474543708,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043872475531246105,
|
|
"loss": 4.7512,
|
|
"mean_token_accuracy": 0.23732175081968307,
|
|
"num_tokens": 59183650.0,
|
|
"step": 25805
|
|
},
|
|
{
|
|
"entropy": 5.10286431312561,
|
|
"epoch": 2.479346781940442,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00043870123722176166,
|
|
"loss": 4.7915,
|
|
"mean_token_accuracy": 0.23892272710800172,
|
|
"num_tokens": 59195423.0,
|
|
"step": 25810
|
|
},
|
|
{
|
|
"entropy": 5.10826735496521,
|
|
"epoch": 2.4798270893371757,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004386777153304056,
|
|
"loss": 4.7901,
|
|
"mean_token_accuracy": 0.22936685085296632,
|
|
"num_tokens": 59208486.0,
|
|
"step": 25815
|
|
},
|
|
{
|
|
"entropy": 5.073663139343262,
|
|
"epoch": 2.4803073967339095,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00043865418963893896,
|
|
"loss": 4.7637,
|
|
"mean_token_accuracy": 0.23651630729436873,
|
|
"num_tokens": 59220230.0,
|
|
"step": 25820
|
|
},
|
|
{
|
|
"entropy": 5.084012842178344,
|
|
"epoch": 2.4807877041306434,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004386306601479081,
|
|
"loss": 4.7628,
|
|
"mean_token_accuracy": 0.2342966765165329,
|
|
"num_tokens": 59231524.0,
|
|
"step": 25825
|
|
},
|
|
{
|
|
"entropy": 5.099392652511597,
|
|
"epoch": 2.4812680115273773,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004386071268578591,
|
|
"loss": 4.8511,
|
|
"mean_token_accuracy": 0.22831531316041948,
|
|
"num_tokens": 59243525.0,
|
|
"step": 25830
|
|
},
|
|
{
|
|
"entropy": 5.136034917831421,
|
|
"epoch": 2.4817483189241116,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00043858358976933844,
|
|
"loss": 4.6946,
|
|
"mean_token_accuracy": 0.23946435898542404,
|
|
"num_tokens": 59253774.0,
|
|
"step": 25835
|
|
},
|
|
{
|
|
"entropy": 5.108441257476807,
|
|
"epoch": 2.4822286263208455,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043856004888289264,
|
|
"loss": 4.771,
|
|
"mean_token_accuracy": 0.23220686763525009,
|
|
"num_tokens": 59265640.0,
|
|
"step": 25840
|
|
},
|
|
{
|
|
"entropy": 5.0565966129302975,
|
|
"epoch": 2.4827089337175794,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004385365041990681,
|
|
"loss": 4.8102,
|
|
"mean_token_accuracy": 0.23166382163763047,
|
|
"num_tokens": 59277513.0,
|
|
"step": 25845
|
|
},
|
|
{
|
|
"entropy": 5.199204635620117,
|
|
"epoch": 2.4831892411143133,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004385129557184116,
|
|
"loss": 4.7662,
|
|
"mean_token_accuracy": 0.23623427748680115,
|
|
"num_tokens": 59289446.0,
|
|
"step": 25850
|
|
},
|
|
{
|
|
"entropy": 5.113775300979614,
|
|
"epoch": 2.483669548511047,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00043848940344146976,
|
|
"loss": 4.8157,
|
|
"mean_token_accuracy": 0.23519984036684036,
|
|
"num_tokens": 59300156.0,
|
|
"step": 25855
|
|
},
|
|
{
|
|
"entropy": 5.048851490020752,
|
|
"epoch": 2.484149855907781,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004384658473687894,
|
|
"loss": 4.7629,
|
|
"mean_token_accuracy": 0.23632101267576217,
|
|
"num_tokens": 59311500.0,
|
|
"step": 25860
|
|
},
|
|
{
|
|
"entropy": 5.210094690322876,
|
|
"epoch": 2.484630163304515,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004384422875009176,
|
|
"loss": 4.8639,
|
|
"mean_token_accuracy": 0.22739754617214203,
|
|
"num_tokens": 59323299.0,
|
|
"step": 25865
|
|
},
|
|
{
|
|
"entropy": 5.152326250076294,
|
|
"epoch": 2.485110470701249,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004384187238384011,
|
|
"loss": 4.8075,
|
|
"mean_token_accuracy": 0.2332732543349266,
|
|
"num_tokens": 59335955.0,
|
|
"step": 25870
|
|
},
|
|
{
|
|
"entropy": 5.007804298400879,
|
|
"epoch": 2.4855907780979827,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004383951563817871,
|
|
"loss": 4.6885,
|
|
"mean_token_accuracy": 0.23482925742864608,
|
|
"num_tokens": 59347571.0,
|
|
"step": 25875
|
|
},
|
|
{
|
|
"entropy": 5.103703784942627,
|
|
"epoch": 2.4860710854947166,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004383715851316227,
|
|
"loss": 4.7639,
|
|
"mean_token_accuracy": 0.23006821870803834,
|
|
"num_tokens": 59358932.0,
|
|
"step": 25880
|
|
},
|
|
{
|
|
"entropy": 5.0206766605377195,
|
|
"epoch": 2.4865513928914504,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043834801008845527,
|
|
"loss": 4.6671,
|
|
"mean_token_accuracy": 0.2419501304626465,
|
|
"num_tokens": 59371425.0,
|
|
"step": 25885
|
|
},
|
|
{
|
|
"entropy": 5.026921224594116,
|
|
"epoch": 2.4870317002881843,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004383244312528321,
|
|
"loss": 4.7173,
|
|
"mean_token_accuracy": 0.23288854360580444,
|
|
"num_tokens": 59382917.0,
|
|
"step": 25890
|
|
},
|
|
{
|
|
"entropy": 5.1574970245361325,
|
|
"epoch": 2.487512007684918,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004383008486253006,
|
|
"loss": 4.833,
|
|
"mean_token_accuracy": 0.22575733363628386,
|
|
"num_tokens": 59395014.0,
|
|
"step": 25895
|
|
},
|
|
{
|
|
"entropy": 5.147827816009522,
|
|
"epoch": 2.487992315081652,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00043827726220640827,
|
|
"loss": 4.8096,
|
|
"mean_token_accuracy": 0.22929120808839798,
|
|
"num_tokens": 59407115.0,
|
|
"step": 25900
|
|
},
|
|
{
|
|
"entropy": 5.234961748123169,
|
|
"epoch": 2.488472622478386,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00043825367199670274,
|
|
"loss": 4.8872,
|
|
"mean_token_accuracy": 0.22529138028621673,
|
|
"num_tokens": 59418744.0,
|
|
"step": 25905
|
|
},
|
|
{
|
|
"entropy": 5.047393321990967,
|
|
"epoch": 2.4889529298751203,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004382300779967318,
|
|
"loss": 4.6953,
|
|
"mean_token_accuracy": 0.24463900476694106,
|
|
"num_tokens": 59429950.0,
|
|
"step": 25910
|
|
},
|
|
{
|
|
"entropy": 5.096230125427246,
|
|
"epoch": 2.489433237271854,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00043820648020704303,
|
|
"loss": 4.8733,
|
|
"mean_token_accuracy": 0.23028983771800995,
|
|
"num_tokens": 59441335.0,
|
|
"step": 25915
|
|
},
|
|
{
|
|
"entropy": 5.147711133956909,
|
|
"epoch": 2.489913544668588,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00043818287862818444,
|
|
"loss": 4.7661,
|
|
"mean_token_accuracy": 0.23482993692159654,
|
|
"num_tokens": 59452325.0,
|
|
"step": 25920
|
|
},
|
|
{
|
|
"entropy": 5.134023904800415,
|
|
"epoch": 2.490393852065322,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000438159273260704,
|
|
"loss": 4.8731,
|
|
"mean_token_accuracy": 0.23030917197465897,
|
|
"num_tokens": 59464352.0,
|
|
"step": 25925
|
|
},
|
|
{
|
|
"entropy": 5.108134984970093,
|
|
"epoch": 2.490874159462056,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004381356641051497,
|
|
"loss": 4.7492,
|
|
"mean_token_accuracy": 0.23972258418798448,
|
|
"num_tokens": 59476048.0,
|
|
"step": 25930
|
|
},
|
|
{
|
|
"entropy": 5.152993965148926,
|
|
"epoch": 2.4913544668587897,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004381120511620697,
|
|
"loss": 4.8179,
|
|
"mean_token_accuracy": 0.2259794533252716,
|
|
"num_tokens": 59488139.0,
|
|
"step": 25935
|
|
},
|
|
{
|
|
"entropy": 5.102795743942261,
|
|
"epoch": 2.4918347742555236,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043808843443201217,
|
|
"loss": 4.8476,
|
|
"mean_token_accuracy": 0.22818621397018432,
|
|
"num_tokens": 59498279.0,
|
|
"step": 25940
|
|
},
|
|
{
|
|
"entropy": 5.15069465637207,
|
|
"epoch": 2.4923150816522575,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004380648139155255,
|
|
"loss": 4.7699,
|
|
"mean_token_accuracy": 0.23187788128852843,
|
|
"num_tokens": 59509914.0,
|
|
"step": 25945
|
|
},
|
|
{
|
|
"entropy": 5.200746536254883,
|
|
"epoch": 2.4927953890489913,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004380411896131581,
|
|
"loss": 4.9313,
|
|
"mean_token_accuracy": 0.2188297063112259,
|
|
"num_tokens": 59521441.0,
|
|
"step": 25950
|
|
},
|
|
{
|
|
"entropy": 5.111665678024292,
|
|
"epoch": 2.493275696445725,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043801756152545836,
|
|
"loss": 4.844,
|
|
"mean_token_accuracy": 0.22982463389635086,
|
|
"num_tokens": 59533086.0,
|
|
"step": 25955
|
|
},
|
|
{
|
|
"entropy": 5.176836347579956,
|
|
"epoch": 2.493756003842459,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00043799392965297496,
|
|
"loss": 4.8271,
|
|
"mean_token_accuracy": 0.22817039489746094,
|
|
"num_tokens": 59545165.0,
|
|
"step": 25960
|
|
},
|
|
{
|
|
"entropy": 5.115931177139283,
|
|
"epoch": 2.494236311239193,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004379702939962564,
|
|
"loss": 4.8065,
|
|
"mean_token_accuracy": 0.22424955815076827,
|
|
"num_tokens": 59556016.0,
|
|
"step": 25965
|
|
},
|
|
{
|
|
"entropy": 5.119119167327881,
|
|
"epoch": 2.494716618635927,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004379466545558516,
|
|
"loss": 4.7896,
|
|
"mean_token_accuracy": 0.22895194143056868,
|
|
"num_tokens": 59566431.0,
|
|
"step": 25970
|
|
},
|
|
{
|
|
"entropy": 5.152138757705688,
|
|
"epoch": 2.4951969260326607,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00043792301133230933,
|
|
"loss": 4.8451,
|
|
"mean_token_accuracy": 0.2285153165459633,
|
|
"num_tokens": 59579371.0,
|
|
"step": 25975
|
|
},
|
|
{
|
|
"entropy": 5.109066534042358,
|
|
"epoch": 2.4956772334293946,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004378993643261785,
|
|
"loss": 4.7384,
|
|
"mean_token_accuracy": 0.24127317667007447,
|
|
"num_tokens": 59590088.0,
|
|
"step": 25980
|
|
},
|
|
{
|
|
"entropy": 5.071279907226563,
|
|
"epoch": 2.496157540826129,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00043787571353800814,
|
|
"loss": 4.7119,
|
|
"mean_token_accuracy": 0.24527304023504257,
|
|
"num_tokens": 59600973.0,
|
|
"step": 25985
|
|
},
|
|
{
|
|
"entropy": 5.117540597915649,
|
|
"epoch": 2.4966378482228624,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004378520589683475,
|
|
"loss": 4.7536,
|
|
"mean_token_accuracy": 0.23426486998796464,
|
|
"num_tokens": 59613194.0,
|
|
"step": 25990
|
|
},
|
|
{
|
|
"entropy": 5.076265668869018,
|
|
"epoch": 2.4971181556195967,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043782840061774544,
|
|
"loss": 4.7693,
|
|
"mean_token_accuracy": 0.23121773898601533,
|
|
"num_tokens": 59623932.0,
|
|
"step": 25995
|
|
},
|
|
{
|
|
"entropy": 5.0959716796875,
|
|
"epoch": 2.4975984630163306,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043780473848675143,
|
|
"loss": 4.8548,
|
|
"mean_token_accuracy": 0.227916020154953,
|
|
"num_tokens": 59635095.0,
|
|
"step": 26000
|
|
},
|
|
{
|
|
"entropy": 5.101946401596069,
|
|
"epoch": 2.4980787704130645,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004377810725759149,
|
|
"loss": 4.7575,
|
|
"mean_token_accuracy": 0.2337815672159195,
|
|
"num_tokens": 59646930.0,
|
|
"step": 26005
|
|
},
|
|
{
|
|
"entropy": 5.095354413986206,
|
|
"epoch": 2.4985590778097984,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00043775740288578516,
|
|
"loss": 4.7307,
|
|
"mean_token_accuracy": 0.23744526356458664,
|
|
"num_tokens": 59658729.0,
|
|
"step": 26010
|
|
},
|
|
{
|
|
"entropy": 5.054209232330322,
|
|
"epoch": 2.4990393852065322,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004377337294169118,
|
|
"loss": 4.8073,
|
|
"mean_token_accuracy": 0.23229757994413375,
|
|
"num_tokens": 59670619.0,
|
|
"step": 26015
|
|
},
|
|
{
|
|
"entropy": 5.174720811843872,
|
|
"epoch": 2.499519692603266,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00043771005216984457,
|
|
"loss": 4.8931,
|
|
"mean_token_accuracy": 0.22655860781669618,
|
|
"num_tokens": 59682307.0,
|
|
"step": 26020
|
|
},
|
|
{
|
|
"entropy": 5.136973333358765,
|
|
"epoch": 2.5,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000437686371145133,
|
|
"loss": 4.7471,
|
|
"mean_token_accuracy": 0.23237128108739852,
|
|
"num_tokens": 59693515.0,
|
|
"step": 26025
|
|
},
|
|
{
|
|
"entropy": 5.1173011302948,
|
|
"epoch": 2.500480307396734,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000437662686343327,
|
|
"loss": 4.799,
|
|
"mean_token_accuracy": 0.22752099484205246,
|
|
"num_tokens": 59705012.0,
|
|
"step": 26030
|
|
},
|
|
{
|
|
"entropy": 5.171797037124634,
|
|
"epoch": 2.5009606147934678,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004376389977649764,
|
|
"loss": 4.8797,
|
|
"mean_token_accuracy": 0.23065385967493057,
|
|
"num_tokens": 59716776.0,
|
|
"step": 26035
|
|
},
|
|
{
|
|
"entropy": 5.087970304489136,
|
|
"epoch": 2.5014409221902016,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004376153054106313,
|
|
"loss": 4.7667,
|
|
"mean_token_accuracy": 0.23524891585111618,
|
|
"num_tokens": 59727526.0,
|
|
"step": 26040
|
|
},
|
|
{
|
|
"entropy": 5.122954797744751,
|
|
"epoch": 2.5019212295869355,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004375916092808416,
|
|
"loss": 4.7422,
|
|
"mean_token_accuracy": 0.23872457444667816,
|
|
"num_tokens": 59738861.0,
|
|
"step": 26045
|
|
},
|
|
{
|
|
"entropy": 5.249390029907227,
|
|
"epoch": 2.5024015369836694,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004375679093761575,
|
|
"loss": 4.9846,
|
|
"mean_token_accuracy": 0.2129766032099724,
|
|
"num_tokens": 59751019.0,
|
|
"step": 26050
|
|
},
|
|
{
|
|
"entropy": 5.14567813873291,
|
|
"epoch": 2.5028818443804033,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00043754420569712925,
|
|
"loss": 4.7858,
|
|
"mean_token_accuracy": 0.23589466214179994,
|
|
"num_tokens": 59762936.0,
|
|
"step": 26055
|
|
},
|
|
{
|
|
"entropy": 5.094907569885254,
|
|
"epoch": 2.5033621517771376,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00043752049824430736,
|
|
"loss": 4.7202,
|
|
"mean_token_accuracy": 0.23375146836042404,
|
|
"num_tokens": 59772909.0,
|
|
"step": 26060
|
|
},
|
|
{
|
|
"entropy": 5.0118269443511965,
|
|
"epoch": 2.503842459173871,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00043749678701824197,
|
|
"loss": 4.6903,
|
|
"mean_token_accuracy": 0.23832600712776184,
|
|
"num_tokens": 59784003.0,
|
|
"step": 26065
|
|
},
|
|
{
|
|
"entropy": 5.098705244064331,
|
|
"epoch": 2.5043227665706054,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004374730720194837,
|
|
"loss": 4.835,
|
|
"mean_token_accuracy": 0.2284349739551544,
|
|
"num_tokens": 59796200.0,
|
|
"step": 26070
|
|
},
|
|
{
|
|
"entropy": 5.090223264694214,
|
|
"epoch": 2.5048030739673393,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004374493532485832,
|
|
"loss": 4.7573,
|
|
"mean_token_accuracy": 0.23184773176908494,
|
|
"num_tokens": 59807559.0,
|
|
"step": 26075
|
|
},
|
|
{
|
|
"entropy": 5.136758613586426,
|
|
"epoch": 2.505283381364073,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000437425630706091,
|
|
"loss": 4.819,
|
|
"mean_token_accuracy": 0.22647526264190673,
|
|
"num_tokens": 59818213.0,
|
|
"step": 26080
|
|
},
|
|
{
|
|
"entropy": 5.130527019500732,
|
|
"epoch": 2.505763688760807,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000437401904392558,
|
|
"loss": 4.812,
|
|
"mean_token_accuracy": 0.2289934679865837,
|
|
"num_tokens": 59829485.0,
|
|
"step": 26085
|
|
},
|
|
{
|
|
"entropy": 5.078642749786377,
|
|
"epoch": 2.506243996157541,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043737817430853504,
|
|
"loss": 4.8662,
|
|
"mean_token_accuracy": 0.22903375029563905,
|
|
"num_tokens": 59842187.0,
|
|
"step": 26090
|
|
},
|
|
{
|
|
"entropy": 5.200610637664795,
|
|
"epoch": 2.506724303554275,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00043735444045457303,
|
|
"loss": 4.9089,
|
|
"mean_token_accuracy": 0.2234987273812294,
|
|
"num_tokens": 59852783.0,
|
|
"step": 26095
|
|
},
|
|
{
|
|
"entropy": 5.155863475799561,
|
|
"epoch": 2.5072046109510087,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043733070283122306,
|
|
"loss": 4.755,
|
|
"mean_token_accuracy": 0.23283324986696244,
|
|
"num_tokens": 59863661.0,
|
|
"step": 26100
|
|
},
|
|
{
|
|
"entropy": 5.051503658294678,
|
|
"epoch": 2.5076849183477425,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00043730696143903607,
|
|
"loss": 4.7325,
|
|
"mean_token_accuracy": 0.23420979529619218,
|
|
"num_tokens": 59876109.0,
|
|
"step": 26105
|
|
},
|
|
{
|
|
"entropy": 5.024322080612182,
|
|
"epoch": 2.5081652257444764,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004372832162785635,
|
|
"loss": 4.5933,
|
|
"mean_token_accuracy": 0.24501541554927825,
|
|
"num_tokens": 59886322.0,
|
|
"step": 26110
|
|
},
|
|
{
|
|
"entropy": 5.029271793365479,
|
|
"epoch": 2.5086455331412103,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004372594673503565,
|
|
"loss": 4.7192,
|
|
"mean_token_accuracy": 0.23893527537584305,
|
|
"num_tokens": 59897307.0,
|
|
"step": 26115
|
|
},
|
|
{
|
|
"entropy": 5.052051687240601,
|
|
"epoch": 2.509125840537944,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004372357146549664,
|
|
"loss": 4.6937,
|
|
"mean_token_accuracy": 0.2373049482703209,
|
|
"num_tokens": 59908709.0,
|
|
"step": 26120
|
|
},
|
|
{
|
|
"entropy": 5.067273283004761,
|
|
"epoch": 2.509606147934678,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043721195819294487,
|
|
"loss": 4.7122,
|
|
"mean_token_accuracy": 0.24751894921064377,
|
|
"num_tokens": 59920082.0,
|
|
"step": 26125
|
|
},
|
|
{
|
|
"entropy": 5.077537918090821,
|
|
"epoch": 2.510086455331412,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004371881979648433,
|
|
"loss": 4.7497,
|
|
"mean_token_accuracy": 0.2332884654402733,
|
|
"num_tokens": 59931632.0,
|
|
"step": 26130
|
|
},
|
|
{
|
|
"entropy": 5.0393143653869625,
|
|
"epoch": 2.5105667627281463,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004371644339712133,
|
|
"loss": 4.696,
|
|
"mean_token_accuracy": 0.2380808562040329,
|
|
"num_tokens": 59944093.0,
|
|
"step": 26135
|
|
},
|
|
{
|
|
"entropy": 5.0620362758636475,
|
|
"epoch": 2.5110470701248797,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004371406662126067,
|
|
"loss": 4.7571,
|
|
"mean_token_accuracy": 0.22995698899030687,
|
|
"num_tokens": 59954879.0,
|
|
"step": 26140
|
|
},
|
|
{
|
|
"entropy": 5.057656860351562,
|
|
"epoch": 2.511527377521614,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043711689468957534,
|
|
"loss": 4.7668,
|
|
"mean_token_accuracy": 0.23091911375522614,
|
|
"num_tokens": 59967074.0,
|
|
"step": 26145
|
|
},
|
|
{
|
|
"entropy": 5.194906091690063,
|
|
"epoch": 2.5120076849183475,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043709311940267107,
|
|
"loss": 4.8049,
|
|
"mean_token_accuracy": 0.23293969184160232,
|
|
"num_tokens": 59978939.0,
|
|
"step": 26150
|
|
},
|
|
{
|
|
"entropy": 5.159637403488159,
|
|
"epoch": 2.512487992315082,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004370693403524458,
|
|
"loss": 4.7721,
|
|
"mean_token_accuracy": 0.23222009539604188,
|
|
"num_tokens": 59989251.0,
|
|
"step": 26155
|
|
},
|
|
{
|
|
"entropy": 5.043718004226685,
|
|
"epoch": 2.5129682997118157,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004370455575394518,
|
|
"loss": 4.7492,
|
|
"mean_token_accuracy": 0.23366657197475432,
|
|
"num_tokens": 60000267.0,
|
|
"step": 26160
|
|
},
|
|
{
|
|
"entropy": 5.143046808242798,
|
|
"epoch": 2.5134486071085496,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004370217709642411,
|
|
"loss": 4.7776,
|
|
"mean_token_accuracy": 0.23386308401823044,
|
|
"num_tokens": 60011190.0,
|
|
"step": 26165
|
|
},
|
|
{
|
|
"entropy": 5.128286361694336,
|
|
"epoch": 2.5139289145052834,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000436997980627366,
|
|
"loss": 4.8399,
|
|
"mean_token_accuracy": 0.22566935122013093,
|
|
"num_tokens": 60023496.0,
|
|
"step": 26170
|
|
},
|
|
{
|
|
"entropy": 5.190862083435059,
|
|
"epoch": 2.5144092219020173,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00043697418652937877,
|
|
"loss": 4.8595,
|
|
"mean_token_accuracy": 0.22487357556819915,
|
|
"num_tokens": 60034585.0,
|
|
"step": 26175
|
|
},
|
|
{
|
|
"entropy": 5.187910270690918,
|
|
"epoch": 2.514889529298751,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004369503886708319,
|
|
"loss": 4.8479,
|
|
"mean_token_accuracy": 0.22261265069246292,
|
|
"num_tokens": 60046333.0,
|
|
"step": 26180
|
|
},
|
|
{
|
|
"entropy": 5.111104536056518,
|
|
"epoch": 2.515369836695485,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043692658705227796,
|
|
"loss": 4.8294,
|
|
"mean_token_accuracy": 0.22929517477750777,
|
|
"num_tokens": 60058744.0,
|
|
"step": 26185
|
|
},
|
|
{
|
|
"entropy": 5.182344436645508,
|
|
"epoch": 2.515850144092219,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00043690278167426945,
|
|
"loss": 4.87,
|
|
"mean_token_accuracy": 0.22710851728916168,
|
|
"num_tokens": 60069303.0,
|
|
"step": 26190
|
|
},
|
|
{
|
|
"entropy": 5.06790828704834,
|
|
"epoch": 2.516330451488953,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004368789725373591,
|
|
"loss": 4.7214,
|
|
"mean_token_accuracy": 0.2350516840815544,
|
|
"num_tokens": 60080604.0,
|
|
"step": 26195
|
|
},
|
|
{
|
|
"entropy": 5.1351783752441404,
|
|
"epoch": 2.5168107588856867,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00043685515964209977,
|
|
"loss": 4.7637,
|
|
"mean_token_accuracy": 0.23912479281425475,
|
|
"num_tokens": 60090151.0,
|
|
"step": 26200
|
|
},
|
|
{
|
|
"entropy": 5.189052581787109,
|
|
"epoch": 2.5172910662824206,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004368313429890441,
|
|
"loss": 4.8757,
|
|
"mean_token_accuracy": 0.22785695642232895,
|
|
"num_tokens": 60102528.0,
|
|
"step": 26205
|
|
},
|
|
{
|
|
"entropy": 5.059284973144531,
|
|
"epoch": 2.517771373679155,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004368075225787453,
|
|
"loss": 4.7217,
|
|
"mean_token_accuracy": 0.23929833620786667,
|
|
"num_tokens": 60113078.0,
|
|
"step": 26210
|
|
},
|
|
{
|
|
"entropy": 5.032156848907471,
|
|
"epoch": 2.5182516810758884,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004367836984117562,
|
|
"loss": 4.7139,
|
|
"mean_token_accuracy": 0.2332184448838234,
|
|
"num_tokens": 60124728.0,
|
|
"step": 26215
|
|
},
|
|
{
|
|
"entropy": 5.013386535644531,
|
|
"epoch": 2.5187319884726227,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043675987048863,
|
|
"loss": 4.6951,
|
|
"mean_token_accuracy": 0.23805242478847505,
|
|
"num_tokens": 60136224.0,
|
|
"step": 26220
|
|
},
|
|
{
|
|
"entropy": 5.131964063644409,
|
|
"epoch": 2.519212295869356,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004367360388099201,
|
|
"loss": 4.8383,
|
|
"mean_token_accuracy": 0.2287626013159752,
|
|
"num_tokens": 60148654.0,
|
|
"step": 26225
|
|
},
|
|
{
|
|
"entropy": 5.223377132415772,
|
|
"epoch": 2.5196926032660905,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004367122033761796,
|
|
"loss": 4.8275,
|
|
"mean_token_accuracy": 0.22268654704093932,
|
|
"num_tokens": 60159740.0,
|
|
"step": 26230
|
|
},
|
|
{
|
|
"entropy": 5.142727422714233,
|
|
"epoch": 2.5201729106628243,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004366883641879618,
|
|
"loss": 4.8051,
|
|
"mean_token_accuracy": 0.22963927835226058,
|
|
"num_tokens": 60171387.0,
|
|
"step": 26235
|
|
},
|
|
{
|
|
"entropy": 5.037440156936645,
|
|
"epoch": 2.520653218059558,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00043666452124582034,
|
|
"loss": 4.7227,
|
|
"mean_token_accuracy": 0.23929983526468276,
|
|
"num_tokens": 60181562.0,
|
|
"step": 26240
|
|
},
|
|
{
|
|
"entropy": 4.980976724624634,
|
|
"epoch": 2.521133525456292,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004366406745503088,
|
|
"loss": 4.6635,
|
|
"mean_token_accuracy": 0.2430158495903015,
|
|
"num_tokens": 60191187.0,
|
|
"step": 26245
|
|
},
|
|
{
|
|
"entropy": 5.0385167598724365,
|
|
"epoch": 2.521613832853026,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004366168241019807,
|
|
"loss": 4.714,
|
|
"mean_token_accuracy": 0.2383538380265236,
|
|
"num_tokens": 60203792.0,
|
|
"step": 26250
|
|
},
|
|
{
|
|
"entropy": 5.156890964508056,
|
|
"epoch": 2.52209414024976,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004365929699013899,
|
|
"loss": 4.7941,
|
|
"mean_token_accuracy": 0.23458444625139235,
|
|
"num_tokens": 60215523.0,
|
|
"step": 26255
|
|
},
|
|
{
|
|
"entropy": 5.120486068725586,
|
|
"epoch": 2.5225744476464937,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004365691119490902,
|
|
"loss": 4.831,
|
|
"mean_token_accuracy": 0.2312985271215439,
|
|
"num_tokens": 60226113.0,
|
|
"step": 26260
|
|
},
|
|
{
|
|
"entropy": 5.1754053115844725,
|
|
"epoch": 2.5230547550432276,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004365452502456354,
|
|
"loss": 4.8934,
|
|
"mean_token_accuracy": 0.2248413920402527,
|
|
"num_tokens": 60238521.0,
|
|
"step": 26265
|
|
},
|
|
{
|
|
"entropy": 5.02523455619812,
|
|
"epoch": 2.5235350624399615,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004365213847915796,
|
|
"loss": 4.6587,
|
|
"mean_token_accuracy": 0.24070182889699937,
|
|
"num_tokens": 60249717.0,
|
|
"step": 26270
|
|
},
|
|
{
|
|
"entropy": 5.152025556564331,
|
|
"epoch": 2.5240153698366954,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043649751558747695,
|
|
"loss": 4.8645,
|
|
"mean_token_accuracy": 0.22075033336877822,
|
|
"num_tokens": 60261749.0,
|
|
"step": 26275
|
|
},
|
|
{
|
|
"entropy": 5.1482549667358395,
|
|
"epoch": 2.5244956772334293,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00043647364263388143,
|
|
"loss": 4.8425,
|
|
"mean_token_accuracy": 0.22865833938121796,
|
|
"num_tokens": 60272925.0,
|
|
"step": 26280
|
|
},
|
|
{
|
|
"entropy": 5.139992952346802,
|
|
"epoch": 2.524975984630163,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004364497659313475,
|
|
"loss": 4.7977,
|
|
"mean_token_accuracy": 0.2323908507823944,
|
|
"num_tokens": 60285081.0,
|
|
"step": 26285
|
|
},
|
|
{
|
|
"entropy": 5.155612897872925,
|
|
"epoch": 2.525456292026897,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004364258854804294,
|
|
"loss": 4.7932,
|
|
"mean_token_accuracy": 0.237948477268219,
|
|
"num_tokens": 60296796.0,
|
|
"step": 26290
|
|
},
|
|
{
|
|
"entropy": 5.1394867420196535,
|
|
"epoch": 2.5259365994236314,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004364020012816815,
|
|
"loss": 4.8584,
|
|
"mean_token_accuracy": 0.22993704825639724,
|
|
"num_tokens": 60308600.0,
|
|
"step": 26295
|
|
},
|
|
{
|
|
"entropy": 5.062632894515991,
|
|
"epoch": 2.526416906820365,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004363781133356584,
|
|
"loss": 4.7279,
|
|
"mean_token_accuracy": 0.2410830244421959,
|
|
"num_tokens": 60319188.0,
|
|
"step": 26300
|
|
},
|
|
{
|
|
"entropy": 5.1485496997833256,
|
|
"epoch": 2.526897214217099,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004363542216429147,
|
|
"loss": 4.8249,
|
|
"mean_token_accuracy": 0.2361416146159172,
|
|
"num_tokens": 60329912.0,
|
|
"step": 26305
|
|
},
|
|
{
|
|
"entropy": 5.187692260742187,
|
|
"epoch": 2.527377521613833,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004363303262040051,
|
|
"loss": 4.796,
|
|
"mean_token_accuracy": 0.2305053174495697,
|
|
"num_tokens": 60339760.0,
|
|
"step": 26310
|
|
},
|
|
{
|
|
"entropy": 5.007799482345581,
|
|
"epoch": 2.527857829010567,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043630642701948446,
|
|
"loss": 4.6331,
|
|
"mean_token_accuracy": 0.2434243828058243,
|
|
"num_tokens": 60349947.0,
|
|
"step": 26315
|
|
},
|
|
{
|
|
"entropy": 5.076165342330933,
|
|
"epoch": 2.5283381364073008,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043628252408990756,
|
|
"loss": 4.8564,
|
|
"mean_token_accuracy": 0.22621034681797028,
|
|
"num_tokens": 60362620.0,
|
|
"step": 26320
|
|
},
|
|
{
|
|
"entropy": 5.204525279998779,
|
|
"epoch": 2.5288184438040346,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00043625861741582926,
|
|
"loss": 4.8204,
|
|
"mean_token_accuracy": 0.23000132739543916,
|
|
"num_tokens": 60373729.0,
|
|
"step": 26325
|
|
},
|
|
{
|
|
"entropy": 5.181565809249878,
|
|
"epoch": 2.5292987512007685,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043623470699780483,
|
|
"loss": 4.7641,
|
|
"mean_token_accuracy": 0.23317065536975862,
|
|
"num_tokens": 60386423.0,
|
|
"step": 26330
|
|
},
|
|
{
|
|
"entropy": 5.090573835372925,
|
|
"epoch": 2.5297790585975024,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004362107928363892,
|
|
"loss": 4.7975,
|
|
"mean_token_accuracy": 0.22879107743501664,
|
|
"num_tokens": 60398364.0,
|
|
"step": 26335
|
|
},
|
|
{
|
|
"entropy": 5.127240180969238,
|
|
"epoch": 2.5302593659942363,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004361868749321377,
|
|
"loss": 4.8279,
|
|
"mean_token_accuracy": 0.23116668611764907,
|
|
"num_tokens": 60408423.0,
|
|
"step": 26340
|
|
},
|
|
{
|
|
"entropy": 5.175243282318116,
|
|
"epoch": 2.53073967339097,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004361629532856055,
|
|
"loss": 4.8457,
|
|
"mean_token_accuracy": 0.2255500078201294,
|
|
"num_tokens": 60420229.0,
|
|
"step": 26345
|
|
},
|
|
{
|
|
"entropy": 5.2243876457214355,
|
|
"epoch": 2.531219980787704,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00043613902789734816,
|
|
"loss": 4.873,
|
|
"mean_token_accuracy": 0.2286778062582016,
|
|
"num_tokens": 60430711.0,
|
|
"step": 26350
|
|
},
|
|
{
|
|
"entropy": 5.182257080078125,
|
|
"epoch": 2.531700288184438,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000436115098767921,
|
|
"loss": 4.8509,
|
|
"mean_token_accuracy": 0.226052425801754,
|
|
"num_tokens": 60442293.0,
|
|
"step": 26355
|
|
},
|
|
{
|
|
"entropy": 5.0641368389129635,
|
|
"epoch": 2.532180595581172,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043609116589787974,
|
|
"loss": 4.7563,
|
|
"mean_token_accuracy": 0.23511586487293243,
|
|
"num_tokens": 60452616.0,
|
|
"step": 26360
|
|
},
|
|
{
|
|
"entropy": 5.088418579101562,
|
|
"epoch": 2.5326609029779057,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004360672292877799,
|
|
"loss": 4.7419,
|
|
"mean_token_accuracy": 0.23013172149658204,
|
|
"num_tokens": 60463254.0,
|
|
"step": 26365
|
|
},
|
|
{
|
|
"entropy": 5.086653232574463,
|
|
"epoch": 2.53314121037464,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00043604328893817726,
|
|
"loss": 4.7792,
|
|
"mean_token_accuracy": 0.23134986758232118,
|
|
"num_tokens": 60473725.0,
|
|
"step": 26370
|
|
},
|
|
{
|
|
"entropy": 5.126363468170166,
|
|
"epoch": 2.5336215177713735,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00043601934484962775,
|
|
"loss": 4.7947,
|
|
"mean_token_accuracy": 0.2362975835800171,
|
|
"num_tokens": 60484502.0,
|
|
"step": 26375
|
|
},
|
|
{
|
|
"entropy": 5.046994161605835,
|
|
"epoch": 2.534101825168108,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004359953970226871,
|
|
"loss": 4.7355,
|
|
"mean_token_accuracy": 0.2410357877612114,
|
|
"num_tokens": 60496134.0,
|
|
"step": 26380
|
|
},
|
|
{
|
|
"entropy": 5.132125663757324,
|
|
"epoch": 2.5345821325648417,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043597144545791134,
|
|
"loss": 4.8178,
|
|
"mean_token_accuracy": 0.23338208943605424,
|
|
"num_tokens": 60507236.0,
|
|
"step": 26385
|
|
},
|
|
{
|
|
"entropy": 5.057839679718017,
|
|
"epoch": 2.5350624399615755,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004359474901558567,
|
|
"loss": 4.7484,
|
|
"mean_token_accuracy": 0.24036937803030015,
|
|
"num_tokens": 60517499.0,
|
|
"step": 26390
|
|
},
|
|
{
|
|
"entropy": 5.162707138061523,
|
|
"epoch": 2.5355427473583094,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004359235311170792,
|
|
"loss": 4.8445,
|
|
"mean_token_accuracy": 0.2248290151357651,
|
|
"num_tokens": 60529434.0,
|
|
"step": 26395
|
|
},
|
|
{
|
|
"entropy": 5.166756963729858,
|
|
"epoch": 2.5360230547550433,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004358995683421352,
|
|
"loss": 4.8376,
|
|
"mean_token_accuracy": 0.22776952087879182,
|
|
"num_tokens": 60542571.0,
|
|
"step": 26400
|
|
},
|
|
{
|
|
"entropy": 5.0918957710266115,
|
|
"epoch": 2.536503362151777,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043587560183158095,
|
|
"loss": 4.7352,
|
|
"mean_token_accuracy": 0.2310478702187538,
|
|
"num_tokens": 60553826.0,
|
|
"step": 26405
|
|
},
|
|
{
|
|
"entropy": 5.049466800689697,
|
|
"epoch": 2.536983669548511,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000435851631585973,
|
|
"loss": 4.7643,
|
|
"mean_token_accuracy": 0.23094182163476945,
|
|
"num_tokens": 60565173.0,
|
|
"step": 26410
|
|
},
|
|
{
|
|
"entropy": 5.071749114990235,
|
|
"epoch": 2.537463976945245,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004358276576058677,
|
|
"loss": 4.7518,
|
|
"mean_token_accuracy": 0.24238502383232116,
|
|
"num_tokens": 60577267.0,
|
|
"step": 26415
|
|
},
|
|
{
|
|
"entropy": 5.134958410263062,
|
|
"epoch": 2.537944284341979,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004358036798918218,
|
|
"loss": 4.7841,
|
|
"mean_token_accuracy": 0.22933975905179976,
|
|
"num_tokens": 60587441.0,
|
|
"step": 26420
|
|
},
|
|
{
|
|
"entropy": 5.195329713821411,
|
|
"epoch": 2.5384245917387127,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004357796984443919,
|
|
"loss": 4.8349,
|
|
"mean_token_accuracy": 0.23350724577903748,
|
|
"num_tokens": 60597173.0,
|
|
"step": 26425
|
|
},
|
|
{
|
|
"entropy": 5.05770468711853,
|
|
"epoch": 2.5389048991354466,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043575571326413484,
|
|
"loss": 4.7077,
|
|
"mean_token_accuracy": 0.23081723600625992,
|
|
"num_tokens": 60608299.0,
|
|
"step": 26430
|
|
},
|
|
{
|
|
"entropy": 5.198756313323974,
|
|
"epoch": 2.5393852065321805,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004357317243516075,
|
|
"loss": 4.8884,
|
|
"mean_token_accuracy": 0.22811190336942672,
|
|
"num_tokens": 60619106.0,
|
|
"step": 26435
|
|
},
|
|
{
|
|
"entropy": 5.089833211898804,
|
|
"epoch": 2.5398655139289144,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00043570773170736676,
|
|
"loss": 4.6762,
|
|
"mean_token_accuracy": 0.23719825744628906,
|
|
"num_tokens": 60629339.0,
|
|
"step": 26440
|
|
},
|
|
{
|
|
"entropy": 5.109752893447876,
|
|
"epoch": 2.5403458213256487,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043568373533196976,
|
|
"loss": 4.716,
|
|
"mean_token_accuracy": 0.2442714810371399,
|
|
"num_tokens": 60640162.0,
|
|
"step": 26445
|
|
},
|
|
{
|
|
"entropy": 5.031629228591919,
|
|
"epoch": 2.540826128722382,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00043565973522597344,
|
|
"loss": 4.7499,
|
|
"mean_token_accuracy": 0.2293874904513359,
|
|
"num_tokens": 60652529.0,
|
|
"step": 26450
|
|
},
|
|
{
|
|
"entropy": 5.047484922409057,
|
|
"epoch": 2.5413064361191164,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043563573138993524,
|
|
"loss": 4.6916,
|
|
"mean_token_accuracy": 0.233546245098114,
|
|
"num_tokens": 60663035.0,
|
|
"step": 26455
|
|
},
|
|
{
|
|
"entropy": 5.098540163040161,
|
|
"epoch": 2.54178674351585,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004356117238244123,
|
|
"loss": 4.7277,
|
|
"mean_token_accuracy": 0.23866922706365584,
|
|
"num_tokens": 60674430.0,
|
|
"step": 26460
|
|
},
|
|
{
|
|
"entropy": 5.027708196640015,
|
|
"epoch": 2.542267050912584,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043558771252996204,
|
|
"loss": 4.7001,
|
|
"mean_token_accuracy": 0.23966480642557145,
|
|
"num_tokens": 60685923.0,
|
|
"step": 26465
|
|
},
|
|
{
|
|
"entropy": 5.07547516822815,
|
|
"epoch": 2.542747358309318,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000435563697507142,
|
|
"loss": 4.7643,
|
|
"mean_token_accuracy": 0.23587315380573273,
|
|
"num_tokens": 60696344.0,
|
|
"step": 26470
|
|
},
|
|
{
|
|
"entropy": 5.108949947357178,
|
|
"epoch": 2.543227665706052,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004355396787565096,
|
|
"loss": 4.7781,
|
|
"mean_token_accuracy": 0.23361520916223527,
|
|
"num_tokens": 60709941.0,
|
|
"step": 26475
|
|
},
|
|
{
|
|
"entropy": 5.096314716339111,
|
|
"epoch": 2.543707973102786,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00043551565627862257,
|
|
"loss": 4.7535,
|
|
"mean_token_accuracy": 0.2340693786740303,
|
|
"num_tokens": 60721245.0,
|
|
"step": 26480
|
|
},
|
|
{
|
|
"entropy": 5.066954851150513,
|
|
"epoch": 2.5441882804995197,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004354916300740387,
|
|
"loss": 4.7433,
|
|
"mean_token_accuracy": 0.23161177486181259,
|
|
"num_tokens": 60732763.0,
|
|
"step": 26485
|
|
},
|
|
{
|
|
"entropy": 5.016753816604615,
|
|
"epoch": 2.5446685878962536,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004354676001433157,
|
|
"loss": 4.7248,
|
|
"mean_token_accuracy": 0.24093882292509078,
|
|
"num_tokens": 60743192.0,
|
|
"step": 26490
|
|
},
|
|
{
|
|
"entropy": 4.984168529510498,
|
|
"epoch": 2.5451488952929875,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004354435664870116,
|
|
"loss": 4.6551,
|
|
"mean_token_accuracy": 0.23839119523763658,
|
|
"num_tokens": 60754115.0,
|
|
"step": 26495
|
|
},
|
|
{
|
|
"entropy": 5.142358589172363,
|
|
"epoch": 2.5456292026897214,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043541952910568417,
|
|
"loss": 4.8166,
|
|
"mean_token_accuracy": 0.23036390393972397,
|
|
"num_tokens": 60765516.0,
|
|
"step": 26500
|
|
},
|
|
{
|
|
"entropy": 5.143162775039673,
|
|
"epoch": 2.5461095100864553,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004353954879998916,
|
|
"loss": 4.7868,
|
|
"mean_token_accuracy": 0.23051732331514357,
|
|
"num_tokens": 60776206.0,
|
|
"step": 26505
|
|
},
|
|
{
|
|
"entropy": 5.050413990020752,
|
|
"epoch": 2.546589817483189,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004353714431701922,
|
|
"loss": 4.7071,
|
|
"mean_token_accuracy": 0.23476076275110244,
|
|
"num_tokens": 60786992.0,
|
|
"step": 26510
|
|
},
|
|
{
|
|
"entropy": 5.112949514389038,
|
|
"epoch": 2.547070124879923,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004353473946171441,
|
|
"loss": 4.798,
|
|
"mean_token_accuracy": 0.22756085842847823,
|
|
"num_tokens": 60797765.0,
|
|
"step": 26515
|
|
},
|
|
{
|
|
"entropy": 5.15986328125,
|
|
"epoch": 2.547550432276657,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043532334234130547,
|
|
"loss": 4.8078,
|
|
"mean_token_accuracy": 0.22541076242923735,
|
|
"num_tokens": 60808873.0,
|
|
"step": 26520
|
|
},
|
|
{
|
|
"entropy": 5.143631410598755,
|
|
"epoch": 2.5480307396733908,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043529928634323503,
|
|
"loss": 4.8367,
|
|
"mean_token_accuracy": 0.22586060762405397,
|
|
"num_tokens": 60820245.0,
|
|
"step": 26525
|
|
},
|
|
{
|
|
"entropy": 5.073045969009399,
|
|
"epoch": 2.548511047070125,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00043527522662349113,
|
|
"loss": 4.7861,
|
|
"mean_token_accuracy": 0.23567369878292083,
|
|
"num_tokens": 60832159.0,
|
|
"step": 26530
|
|
},
|
|
{
|
|
"entropy": 5.082254457473755,
|
|
"epoch": 2.5489913544668585,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004352511631826324,
|
|
"loss": 4.7432,
|
|
"mean_token_accuracy": 0.2327280730009079,
|
|
"num_tokens": 60842957.0,
|
|
"step": 26535
|
|
},
|
|
{
|
|
"entropy": 5.082061862945556,
|
|
"epoch": 2.549471661863593,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004352270960212175,
|
|
"loss": 4.7523,
|
|
"mean_token_accuracy": 0.2365626201033592,
|
|
"num_tokens": 60852537.0,
|
|
"step": 26540
|
|
},
|
|
{
|
|
"entropy": 5.237741470336914,
|
|
"epoch": 2.5499519692603267,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004352030251398052,
|
|
"loss": 4.9144,
|
|
"mean_token_accuracy": 0.2198659136891365,
|
|
"num_tokens": 60865761.0,
|
|
"step": 26545
|
|
},
|
|
{
|
|
"entropy": 5.127560138702393,
|
|
"epoch": 2.5504322766570606,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043517895053895434,
|
|
"loss": 4.77,
|
|
"mean_token_accuracy": 0.2361322596669197,
|
|
"num_tokens": 60877805.0,
|
|
"step": 26550
|
|
},
|
|
{
|
|
"entropy": 5.085631465911865,
|
|
"epoch": 2.5509125840537945,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004351548722192239,
|
|
"loss": 4.7979,
|
|
"mean_token_accuracy": 0.2329874664545059,
|
|
"num_tokens": 60889868.0,
|
|
"step": 26555
|
|
},
|
|
{
|
|
"entropy": 5.120602083206177,
|
|
"epoch": 2.5513928914505284,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004351307901811729,
|
|
"loss": 4.7695,
|
|
"mean_token_accuracy": 0.23612948954105378,
|
|
"num_tokens": 60902046.0,
|
|
"step": 26560
|
|
},
|
|
{
|
|
"entropy": 5.121815824508667,
|
|
"epoch": 2.5518731988472623,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004351067044253605,
|
|
"loss": 4.7806,
|
|
"mean_token_accuracy": 0.2298535168170929,
|
|
"num_tokens": 60913447.0,
|
|
"step": 26565
|
|
},
|
|
{
|
|
"entropy": 5.1224446296691895,
|
|
"epoch": 2.552353506243996,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043508261495234577,
|
|
"loss": 4.757,
|
|
"mean_token_accuracy": 0.2374941736459732,
|
|
"num_tokens": 60924920.0,
|
|
"step": 26570
|
|
},
|
|
{
|
|
"entropy": 5.0311089038848875,
|
|
"epoch": 2.55283381364073,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000435058521762688,
|
|
"loss": 4.6902,
|
|
"mean_token_accuracy": 0.23946462124586104,
|
|
"num_tokens": 60936937.0,
|
|
"step": 26575
|
|
},
|
|
{
|
|
"entropy": 5.035173082351685,
|
|
"epoch": 2.553314121037464,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004350344248569467,
|
|
"loss": 4.7392,
|
|
"mean_token_accuracy": 0.24265369772911072,
|
|
"num_tokens": 60947836.0,
|
|
"step": 26580
|
|
},
|
|
{
|
|
"entropy": 5.0782520294189455,
|
|
"epoch": 2.553794428434198,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004350103242356813,
|
|
"loss": 4.8239,
|
|
"mean_token_accuracy": 0.23157395124435426,
|
|
"num_tokens": 60959380.0,
|
|
"step": 26585
|
|
},
|
|
{
|
|
"entropy": 5.1494324684143065,
|
|
"epoch": 2.5542747358309317,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004349862198994513,
|
|
"loss": 4.8105,
|
|
"mean_token_accuracy": 0.2270434394478798,
|
|
"num_tokens": 60970148.0,
|
|
"step": 26590
|
|
},
|
|
{
|
|
"entropy": 5.107781076431275,
|
|
"epoch": 2.5547550432276656,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004349621118488163,
|
|
"loss": 4.7167,
|
|
"mean_token_accuracy": 0.2449997827410698,
|
|
"num_tokens": 60981139.0,
|
|
"step": 26595
|
|
},
|
|
{
|
|
"entropy": 4.997005796432495,
|
|
"epoch": 2.5552353506243994,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004349380000843361,
|
|
"loss": 4.7246,
|
|
"mean_token_accuracy": 0.23980281502008438,
|
|
"num_tokens": 60992204.0,
|
|
"step": 26600
|
|
},
|
|
{
|
|
"entropy": 5.0700541019439695,
|
|
"epoch": 2.5557156580211338,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004349138846065704,
|
|
"loss": 4.7374,
|
|
"mean_token_accuracy": 0.23640399128198625,
|
|
"num_tokens": 61002198.0,
|
|
"step": 26605
|
|
},
|
|
{
|
|
"entropy": 5.165579319000244,
|
|
"epoch": 2.556195965417867,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004348897654160791,
|
|
"loss": 4.8376,
|
|
"mean_token_accuracy": 0.2286193385720253,
|
|
"num_tokens": 61012987.0,
|
|
"step": 26610
|
|
},
|
|
{
|
|
"entropy": 5.0254762172698975,
|
|
"epoch": 2.5566762728146015,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004348656425134223,
|
|
"loss": 4.6344,
|
|
"mean_token_accuracy": 0.24285491108894347,
|
|
"num_tokens": 61023668.0,
|
|
"step": 26615
|
|
},
|
|
{
|
|
"entropy": 5.082610177993774,
|
|
"epoch": 2.5571565802113354,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043484151589916,
|
|
"loss": 4.6805,
|
|
"mean_token_accuracy": 0.24342512488365173,
|
|
"num_tokens": 61034375.0,
|
|
"step": 26620
|
|
},
|
|
{
|
|
"entropy": 5.10274305343628,
|
|
"epoch": 2.5576368876080693,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004348173855738523,
|
|
"loss": 4.775,
|
|
"mean_token_accuracy": 0.231153304874897,
|
|
"num_tokens": 61045625.0,
|
|
"step": 26625
|
|
},
|
|
{
|
|
"entropy": 5.083966779708862,
|
|
"epoch": 2.558117195004803,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004347932515380595,
|
|
"loss": 4.8058,
|
|
"mean_token_accuracy": 0.23799928277730942,
|
|
"num_tokens": 61056359.0,
|
|
"step": 26630
|
|
},
|
|
{
|
|
"entropy": 5.089589023590088,
|
|
"epoch": 2.558597502401537,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004347691137923418,
|
|
"loss": 4.7357,
|
|
"mean_token_accuracy": 0.23540275543928146,
|
|
"num_tokens": 61067194.0,
|
|
"step": 26635
|
|
},
|
|
{
|
|
"entropy": 5.176966857910156,
|
|
"epoch": 2.559077809798271,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004347449723372598,
|
|
"loss": 4.8407,
|
|
"mean_token_accuracy": 0.22330044209957123,
|
|
"num_tokens": 61079341.0,
|
|
"step": 26640
|
|
},
|
|
{
|
|
"entropy": 5.10914101600647,
|
|
"epoch": 2.559558117195005,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004347208271733738,
|
|
"loss": 4.824,
|
|
"mean_token_accuracy": 0.23296955972909927,
|
|
"num_tokens": 61090843.0,
|
|
"step": 26645
|
|
},
|
|
{
|
|
"entropy": 5.112281465530396,
|
|
"epoch": 2.5600384245917387,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004346966783012445,
|
|
"loss": 4.7308,
|
|
"mean_token_accuracy": 0.2376183584332466,
|
|
"num_tokens": 61101780.0,
|
|
"step": 26650
|
|
},
|
|
{
|
|
"entropy": 5.108506917953491,
|
|
"epoch": 2.5605187319884726,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00043467252572143247,
|
|
"loss": 4.8093,
|
|
"mean_token_accuracy": 0.23573538511991501,
|
|
"num_tokens": 61113026.0,
|
|
"step": 26655
|
|
},
|
|
{
|
|
"entropy": 5.07466197013855,
|
|
"epoch": 2.5609990393852065,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00043464836943449866,
|
|
"loss": 4.7777,
|
|
"mean_token_accuracy": 0.23345693796873093,
|
|
"num_tokens": 61124772.0,
|
|
"step": 26660
|
|
},
|
|
{
|
|
"entropy": 5.091155910491944,
|
|
"epoch": 2.5614793467819403,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004346242094410036,
|
|
"loss": 4.7689,
|
|
"mean_token_accuracy": 0.23651626110076904,
|
|
"num_tokens": 61136683.0,
|
|
"step": 26665
|
|
},
|
|
{
|
|
"entropy": 5.103244400024414,
|
|
"epoch": 2.561959654178674,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004346000457415085,
|
|
"loss": 4.7298,
|
|
"mean_token_accuracy": 0.24229811131954193,
|
|
"num_tokens": 61148047.0,
|
|
"step": 26670
|
|
},
|
|
{
|
|
"entropy": 5.061260938644409,
|
|
"epoch": 2.562439961575408,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043457587833657424,
|
|
"loss": 4.6959,
|
|
"mean_token_accuracy": 0.24003661572933196,
|
|
"num_tokens": 61160680.0,
|
|
"step": 26675
|
|
},
|
|
{
|
|
"entropy": 5.118103408813477,
|
|
"epoch": 2.5629202689721424,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00043455170722676194,
|
|
"loss": 4.7777,
|
|
"mean_token_accuracy": 0.23091289401054382,
|
|
"num_tokens": 61172624.0,
|
|
"step": 26680
|
|
},
|
|
{
|
|
"entropy": 4.997263336181641,
|
|
"epoch": 2.563400576368876,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004345275324126328,
|
|
"loss": 4.6791,
|
|
"mean_token_accuracy": 0.24024482667446137,
|
|
"num_tokens": 61183971.0,
|
|
"step": 26685
|
|
},
|
|
{
|
|
"entropy": 5.060653734207153,
|
|
"epoch": 2.56388088376561,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00043450335389474796,
|
|
"loss": 4.6853,
|
|
"mean_token_accuracy": 0.24575116485357285,
|
|
"num_tokens": 61194511.0,
|
|
"step": 26690
|
|
},
|
|
{
|
|
"entropy": 5.046016693115234,
|
|
"epoch": 2.5643611911623436,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004344791716736689,
|
|
"loss": 4.6947,
|
|
"mean_token_accuracy": 0.23781549483537673,
|
|
"num_tokens": 61205235.0,
|
|
"step": 26695
|
|
},
|
|
{
|
|
"entropy": 5.028386497497559,
|
|
"epoch": 2.564841498559078,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043445498574995705,
|
|
"loss": 4.7266,
|
|
"mean_token_accuracy": 0.23924607634544373,
|
|
"num_tokens": 61217460.0,
|
|
"step": 26700
|
|
},
|
|
{
|
|
"entropy": 5.087894010543823,
|
|
"epoch": 2.565321805955812,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00043443079612417394,
|
|
"loss": 4.7255,
|
|
"mean_token_accuracy": 0.24343910813331604,
|
|
"num_tokens": 61229917.0,
|
|
"step": 26705
|
|
},
|
|
{
|
|
"entropy": 5.15901665687561,
|
|
"epoch": 2.5658021133525457,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004344066027968812,
|
|
"loss": 4.8817,
|
|
"mean_token_accuracy": 0.22483388036489488,
|
|
"num_tokens": 61241826.0,
|
|
"step": 26710
|
|
},
|
|
{
|
|
"entropy": 5.1887860774993895,
|
|
"epoch": 2.5662824207492796,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043438240576864034,
|
|
"loss": 4.8689,
|
|
"mean_token_accuracy": 0.23257529586553574,
|
|
"num_tokens": 61254095.0,
|
|
"step": 26715
|
|
},
|
|
{
|
|
"entropy": 5.109234619140625,
|
|
"epoch": 2.5667627281460135,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004343582050400134,
|
|
"loss": 4.8407,
|
|
"mean_token_accuracy": 0.23535792976617814,
|
|
"num_tokens": 61266114.0,
|
|
"step": 26720
|
|
},
|
|
{
|
|
"entropy": 5.076544189453125,
|
|
"epoch": 2.5672430355427474,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004343340006115621,
|
|
"loss": 4.7512,
|
|
"mean_token_accuracy": 0.2435067653656006,
|
|
"num_tokens": 61277289.0,
|
|
"step": 26725
|
|
},
|
|
{
|
|
"entropy": 5.213899040222168,
|
|
"epoch": 2.5677233429394812,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004343097924838483,
|
|
"loss": 4.8736,
|
|
"mean_token_accuracy": 0.22794857025146484,
|
|
"num_tokens": 61289424.0,
|
|
"step": 26730
|
|
},
|
|
{
|
|
"entropy": 5.176094341278076,
|
|
"epoch": 2.568203650336215,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004342855806574344,
|
|
"loss": 4.7694,
|
|
"mean_token_accuracy": 0.23666035383939743,
|
|
"num_tokens": 61300821.0,
|
|
"step": 26735
|
|
},
|
|
{
|
|
"entropy": 5.061013555526733,
|
|
"epoch": 2.568683957732949,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004342613651328822,
|
|
"loss": 4.7296,
|
|
"mean_token_accuracy": 0.2397848516702652,
|
|
"num_tokens": 61312097.0,
|
|
"step": 26740
|
|
},
|
|
{
|
|
"entropy": 5.012730407714844,
|
|
"epoch": 2.569164265129683,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000434237145910754,
|
|
"loss": 4.7368,
|
|
"mean_token_accuracy": 0.24157094657421113,
|
|
"num_tokens": 61324144.0,
|
|
"step": 26745
|
|
},
|
|
{
|
|
"entropy": 5.09266152381897,
|
|
"epoch": 2.5696445725264168,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00043421292299161213,
|
|
"loss": 4.7273,
|
|
"mean_token_accuracy": 0.23389368057250975,
|
|
"num_tokens": 61335809.0,
|
|
"step": 26750
|
|
},
|
|
{
|
|
"entropy": 5.086174964904785,
|
|
"epoch": 2.570124879923151,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043418869637601887,
|
|
"loss": 4.7346,
|
|
"mean_token_accuracy": 0.23781654387712478,
|
|
"num_tokens": 61346980.0,
|
|
"step": 26755
|
|
},
|
|
{
|
|
"entropy": 5.129696369171143,
|
|
"epoch": 2.5706051873198845,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00043416446606453686,
|
|
"loss": 4.8818,
|
|
"mean_token_accuracy": 0.22965129613876342,
|
|
"num_tokens": 61358401.0,
|
|
"step": 26760
|
|
},
|
|
{
|
|
"entropy": 5.049072360992431,
|
|
"epoch": 2.571085494716619,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004341402320577285,
|
|
"loss": 4.733,
|
|
"mean_token_accuracy": 0.2349224328994751,
|
|
"num_tokens": 61369383.0,
|
|
"step": 26765
|
|
},
|
|
{
|
|
"entropy": 5.117078590393066,
|
|
"epoch": 2.5715658021133523,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004341159943561566,
|
|
"loss": 4.767,
|
|
"mean_token_accuracy": 0.23110248148441315,
|
|
"num_tokens": 61380439.0,
|
|
"step": 26770
|
|
},
|
|
{
|
|
"entropy": 5.0966477394104,
|
|
"epoch": 2.5720461095100866,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004340917529603837,
|
|
"loss": 4.7259,
|
|
"mean_token_accuracy": 0.2448524907231331,
|
|
"num_tokens": 61391218.0,
|
|
"step": 26775
|
|
},
|
|
{
|
|
"entropy": 5.062763261795044,
|
|
"epoch": 2.5725264169068205,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004340675078709728,
|
|
"loss": 4.7023,
|
|
"mean_token_accuracy": 0.23651713877916336,
|
|
"num_tokens": 61402715.0,
|
|
"step": 26780
|
|
},
|
|
{
|
|
"entropy": 5.110429859161377,
|
|
"epoch": 2.5730067243035544,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004340432590884866,
|
|
"loss": 4.8037,
|
|
"mean_token_accuracy": 0.23051778227090836,
|
|
"num_tokens": 61414361.0,
|
|
"step": 26785
|
|
},
|
|
{
|
|
"entropy": 5.071790027618408,
|
|
"epoch": 2.5734870317002883,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00043401900661348825,
|
|
"loss": 4.7136,
|
|
"mean_token_accuracy": 0.2407862976193428,
|
|
"num_tokens": 61426817.0,
|
|
"step": 26790
|
|
},
|
|
{
|
|
"entropy": 5.04749174118042,
|
|
"epoch": 2.573967339097022,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00043399475044654073,
|
|
"loss": 4.7295,
|
|
"mean_token_accuracy": 0.23616889715194703,
|
|
"num_tokens": 61438142.0,
|
|
"step": 26795
|
|
},
|
|
{
|
|
"entropy": 5.0938348293304445,
|
|
"epoch": 2.574447646493756,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00043397049058820726,
|
|
"loss": 4.7878,
|
|
"mean_token_accuracy": 0.2282659724354744,
|
|
"num_tokens": 61450708.0,
|
|
"step": 26800
|
|
},
|
|
{
|
|
"entropy": 5.106382703781128,
|
|
"epoch": 2.57492795389049,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000433946227039051,
|
|
"loss": 4.7747,
|
|
"mean_token_accuracy": 0.23768922835588455,
|
|
"num_tokens": 61462266.0,
|
|
"step": 26805
|
|
},
|
|
{
|
|
"entropy": 5.117368936538696,
|
|
"epoch": 2.5754082612872238,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004339219597996353,
|
|
"loss": 4.7742,
|
|
"mean_token_accuracy": 0.23543445467948915,
|
|
"num_tokens": 61473690.0,
|
|
"step": 26810
|
|
},
|
|
{
|
|
"entropy": 4.9948780059814455,
|
|
"epoch": 2.5758885686839577,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004338976888705236,
|
|
"loss": 4.6335,
|
|
"mean_token_accuracy": 0.23899848759174347,
|
|
"num_tokens": 61484997.0,
|
|
"step": 26815
|
|
},
|
|
{
|
|
"entropy": 4.981661653518676,
|
|
"epoch": 2.5763688760806915,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00043387341425227944,
|
|
"loss": 4.8124,
|
|
"mean_token_accuracy": 0.2320055529475212,
|
|
"num_tokens": 61497792.0,
|
|
"step": 26820
|
|
},
|
|
{
|
|
"entropy": 5.081102275848389,
|
|
"epoch": 2.5768491834774254,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004338491359454664,
|
|
"loss": 4.6597,
|
|
"mean_token_accuracy": 0.24134160727262496,
|
|
"num_tokens": 61509060.0,
|
|
"step": 26825
|
|
},
|
|
{
|
|
"entropy": 5.135309314727783,
|
|
"epoch": 2.5773294908741593,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043382485395064796,
|
|
"loss": 4.7687,
|
|
"mean_token_accuracy": 0.24514102190732956,
|
|
"num_tokens": 61519577.0,
|
|
"step": 26830
|
|
},
|
|
{
|
|
"entropy": 5.019241142272949,
|
|
"epoch": 2.577809798270893,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004338005682683881,
|
|
"loss": 4.7197,
|
|
"mean_token_accuracy": 0.2386346474289894,
|
|
"num_tokens": 61531956.0,
|
|
"step": 26835
|
|
},
|
|
{
|
|
"entropy": 5.139713191986084,
|
|
"epoch": 2.5782901056676275,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00043377627889925057,
|
|
"loss": 4.8069,
|
|
"mean_token_accuracy": 0.2325481116771698,
|
|
"num_tokens": 61543801.0,
|
|
"step": 26840
|
|
},
|
|
{
|
|
"entropy": 5.137574863433838,
|
|
"epoch": 2.578770413064361,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004337519858437994,
|
|
"loss": 4.8302,
|
|
"mean_token_accuracy": 0.2269312933087349,
|
|
"num_tokens": 61555899.0,
|
|
"step": 26845
|
|
},
|
|
{
|
|
"entropy": 5.0105894088745115,
|
|
"epoch": 2.5792507204610953,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004337276891025984,
|
|
"loss": 4.6782,
|
|
"mean_token_accuracy": 0.2456341192126274,
|
|
"num_tokens": 61567280.0,
|
|
"step": 26850
|
|
},
|
|
{
|
|
"entropy": 5.057559442520142,
|
|
"epoch": 2.579731027857829,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043370338867621184,
|
|
"loss": 4.7743,
|
|
"mean_token_accuracy": 0.23362419605255128,
|
|
"num_tokens": 61578026.0,
|
|
"step": 26855
|
|
},
|
|
{
|
|
"entropy": 5.123590898513794,
|
|
"epoch": 2.580211335254563,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00043367908456520387,
|
|
"loss": 4.7506,
|
|
"mean_token_accuracy": 0.23303580284118652,
|
|
"num_tokens": 61589399.0,
|
|
"step": 26860
|
|
},
|
|
{
|
|
"entropy": 5.094915103912354,
|
|
"epoch": 2.580691642651297,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004336547767701387,
|
|
"loss": 4.7913,
|
|
"mean_token_accuracy": 0.23051347732543945,
|
|
"num_tokens": 61601025.0,
|
|
"step": 26865
|
|
},
|
|
{
|
|
"entropy": 5.1751209735870365,
|
|
"epoch": 2.581171950048031,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043363046529158077,
|
|
"loss": 4.8907,
|
|
"mean_token_accuracy": 0.22715967744588852,
|
|
"num_tokens": 61612529.0,
|
|
"step": 26870
|
|
},
|
|
{
|
|
"entropy": 5.114480495452881,
|
|
"epoch": 2.5816522574447647,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004336061501300944,
|
|
"loss": 4.8035,
|
|
"mean_token_accuracy": 0.23027193993330003,
|
|
"num_tokens": 61624423.0,
|
|
"step": 26875
|
|
},
|
|
{
|
|
"entropy": 5.162697601318359,
|
|
"epoch": 2.5821325648414986,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004335818312862442,
|
|
"loss": 4.8424,
|
|
"mean_token_accuracy": 0.2312071889638901,
|
|
"num_tokens": 61635002.0,
|
|
"step": 26880
|
|
},
|
|
{
|
|
"entropy": 5.069757986068725,
|
|
"epoch": 2.5826128722382324,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043355750876059485,
|
|
"loss": 4.7046,
|
|
"mean_token_accuracy": 0.24180822521448136,
|
|
"num_tokens": 61645911.0,
|
|
"step": 26885
|
|
},
|
|
{
|
|
"entropy": 5.083767700195312,
|
|
"epoch": 2.5830931796349663,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043353318255371086,
|
|
"loss": 4.7485,
|
|
"mean_token_accuracy": 0.2376156985759735,
|
|
"num_tokens": 61657545.0,
|
|
"step": 26890
|
|
},
|
|
{
|
|
"entropy": 5.162339305877685,
|
|
"epoch": 2.5835734870317,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004335088526661571,
|
|
"loss": 4.8215,
|
|
"mean_token_accuracy": 0.2307661294937134,
|
|
"num_tokens": 61669567.0,
|
|
"step": 26895
|
|
},
|
|
{
|
|
"entropy": 5.1503373146057125,
|
|
"epoch": 2.584053794428434,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00043348451909849855,
|
|
"loss": 4.8068,
|
|
"mean_token_accuracy": 0.23385066986083985,
|
|
"num_tokens": 61681725.0,
|
|
"step": 26900
|
|
},
|
|
{
|
|
"entropy": 5.151612710952759,
|
|
"epoch": 2.584534101825168,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043346018185130006,
|
|
"loss": 4.836,
|
|
"mean_token_accuracy": 0.23845063894987106,
|
|
"num_tokens": 61692503.0,
|
|
"step": 26905
|
|
},
|
|
{
|
|
"entropy": 5.039955091476441,
|
|
"epoch": 2.585014409221902,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00043343584092512665,
|
|
"loss": 4.7092,
|
|
"mean_token_accuracy": 0.24210308790206908,
|
|
"num_tokens": 61704567.0,
|
|
"step": 26910
|
|
},
|
|
{
|
|
"entropy": 5.066004133224487,
|
|
"epoch": 2.585494716618636,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004334114963205435,
|
|
"loss": 4.7261,
|
|
"mean_token_accuracy": 0.2413608655333519,
|
|
"num_tokens": 61716155.0,
|
|
"step": 26915
|
|
},
|
|
{
|
|
"entropy": 5.088526725769043,
|
|
"epoch": 2.5859750240153696,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004333871480381158,
|
|
"loss": 4.8512,
|
|
"mean_token_accuracy": 0.2299770951271057,
|
|
"num_tokens": 61727489.0,
|
|
"step": 26920
|
|
},
|
|
{
|
|
"entropy": 5.126882791519165,
|
|
"epoch": 2.586455331412104,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004333627960784088,
|
|
"loss": 4.7744,
|
|
"mean_token_accuracy": 0.23732642978429794,
|
|
"num_tokens": 61739792.0,
|
|
"step": 26925
|
|
},
|
|
{
|
|
"entropy": 5.118010234832764,
|
|
"epoch": 2.586935638808838,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004333384404419879,
|
|
"loss": 4.7854,
|
|
"mean_token_accuracy": 0.23018379360437394,
|
|
"num_tokens": 61750891.0,
|
|
"step": 26930
|
|
},
|
|
{
|
|
"entropy": 5.066667318344116,
|
|
"epoch": 2.5874159462055717,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004333140811294186,
|
|
"loss": 4.7431,
|
|
"mean_token_accuracy": 0.24288289994001389,
|
|
"num_tokens": 61761691.0,
|
|
"step": 26935
|
|
},
|
|
{
|
|
"entropy": 5.106765794754028,
|
|
"epoch": 2.5878962536023056,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004332897181412664,
|
|
"loss": 4.7476,
|
|
"mean_token_accuracy": 0.23299417197704314,
|
|
"num_tokens": 61772371.0,
|
|
"step": 26940
|
|
},
|
|
{
|
|
"entropy": 5.064568710327149,
|
|
"epoch": 2.5883765609990395,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043326535147809696,
|
|
"loss": 4.7184,
|
|
"mean_token_accuracy": 0.23607280999422073,
|
|
"num_tokens": 61784908.0,
|
|
"step": 26945
|
|
},
|
|
{
|
|
"entropy": 5.105681419372559,
|
|
"epoch": 2.5888568683957733,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043324098114047604,
|
|
"loss": 4.8136,
|
|
"mean_token_accuracy": 0.23087596744298935,
|
|
"num_tokens": 61795948.0,
|
|
"step": 26950
|
|
},
|
|
{
|
|
"entropy": 5.14426064491272,
|
|
"epoch": 2.589337175792507,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004332166071289695,
|
|
"loss": 4.7804,
|
|
"mean_token_accuracy": 0.22809687554836272,
|
|
"num_tokens": 61807528.0,
|
|
"step": 26955
|
|
},
|
|
{
|
|
"entropy": 5.120312786102295,
|
|
"epoch": 2.589817483189241,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00043319222944414304,
|
|
"loss": 4.7465,
|
|
"mean_token_accuracy": 0.23673148155212403,
|
|
"num_tokens": 61819971.0,
|
|
"step": 26960
|
|
},
|
|
{
|
|
"entropy": 5.0299958229064945,
|
|
"epoch": 2.590297790585975,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043316784808656276,
|
|
"loss": 4.7058,
|
|
"mean_token_accuracy": 0.24276293814182281,
|
|
"num_tokens": 61832284.0,
|
|
"step": 26965
|
|
},
|
|
{
|
|
"entropy": 5.098874902725219,
|
|
"epoch": 2.590778097982709,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043314346305679477,
|
|
"loss": 4.7955,
|
|
"mean_token_accuracy": 0.2335294172167778,
|
|
"num_tokens": 61844350.0,
|
|
"step": 26970
|
|
},
|
|
{
|
|
"entropy": 5.099506902694702,
|
|
"epoch": 2.5912584053794427,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043311907435540517,
|
|
"loss": 4.7377,
|
|
"mean_token_accuracy": 0.24343179762363434,
|
|
"num_tokens": 61855180.0,
|
|
"step": 26975
|
|
},
|
|
{
|
|
"entropy": 5.059788990020752,
|
|
"epoch": 2.5917387127761766,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004330946819829601,
|
|
"loss": 4.7935,
|
|
"mean_token_accuracy": 0.2240220710635185,
|
|
"num_tokens": 61866963.0,
|
|
"step": 26980
|
|
},
|
|
{
|
|
"entropy": 5.133587837219238,
|
|
"epoch": 2.5922190201729105,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043307028594002597,
|
|
"loss": 4.831,
|
|
"mean_token_accuracy": 0.23132447004318238,
|
|
"num_tokens": 61877503.0,
|
|
"step": 26985
|
|
},
|
|
{
|
|
"entropy": 5.092684125900268,
|
|
"epoch": 2.592699327569645,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00043304588622716924,
|
|
"loss": 4.7193,
|
|
"mean_token_accuracy": 0.24377114772796632,
|
|
"num_tokens": 61888426.0,
|
|
"step": 26990
|
|
},
|
|
{
|
|
"entropy": 5.100298738479614,
|
|
"epoch": 2.5931796349663783,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004330214828449563,
|
|
"loss": 4.7229,
|
|
"mean_token_accuracy": 0.23494363129138945,
|
|
"num_tokens": 61899952.0,
|
|
"step": 26995
|
|
},
|
|
{
|
|
"entropy": 4.997558212280273,
|
|
"epoch": 2.5936599423631126,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043299707579395365,
|
|
"loss": 4.7283,
|
|
"mean_token_accuracy": 0.23868339210748674,
|
|
"num_tokens": 61911956.0,
|
|
"step": 27000
|
|
},
|
|
{
|
|
"epoch": 2.5936599423631126,
|
|
"eval_entropy": 4.852371316488518,
|
|
"eval_loss": 4.87723970413208,
|
|
"eval_mean_token_accuracy": 0.23725959622386983,
|
|
"eval_num_tokens": 61911956.0,
|
|
"eval_runtime": 26.6641,
|
|
"eval_samples_per_second": 1230.683,
|
|
"eval_steps_per_second": 153.84,
|
|
"step": 27000
|
|
},
|
|
{
|
|
"entropy": 5.029786920547485,
|
|
"epoch": 2.594140249759846,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004329726650747282,
|
|
"loss": 4.7416,
|
|
"mean_token_accuracy": 0.23479766696691512,
|
|
"num_tokens": 61922916.0,
|
|
"step": 27005
|
|
},
|
|
{
|
|
"entropy": 5.17045111656189,
|
|
"epoch": 2.5946205571565804,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004329482506878465,
|
|
"loss": 4.7953,
|
|
"mean_token_accuracy": 0.2329192042350769,
|
|
"num_tokens": 61933683.0,
|
|
"step": 27010
|
|
},
|
|
{
|
|
"entropy": 5.054750871658325,
|
|
"epoch": 2.5951008645533142,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00043292383263387536,
|
|
"loss": 4.8124,
|
|
"mean_token_accuracy": 0.2400738313794136,
|
|
"num_tokens": 61945063.0,
|
|
"step": 27015
|
|
},
|
|
{
|
|
"entropy": 5.075078678131104,
|
|
"epoch": 2.595581171950048,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043289941091338187,
|
|
"loss": 4.7742,
|
|
"mean_token_accuracy": 0.23035979121923447,
|
|
"num_tokens": 61955919.0,
|
|
"step": 27020
|
|
},
|
|
{
|
|
"entropy": 5.244627285003662,
|
|
"epoch": 2.596061479346782,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004328749855269329,
|
|
"loss": 4.9298,
|
|
"mean_token_accuracy": 0.22583782225847243,
|
|
"num_tokens": 61967083.0,
|
|
"step": 27025
|
|
},
|
|
{
|
|
"entropy": 5.064927673339843,
|
|
"epoch": 2.596541786743516,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004328505564750955,
|
|
"loss": 4.7086,
|
|
"mean_token_accuracy": 0.2429245576262474,
|
|
"num_tokens": 61978659.0,
|
|
"step": 27030
|
|
},
|
|
{
|
|
"entropy": 5.051366949081421,
|
|
"epoch": 2.5970220941402498,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000432826123758437,
|
|
"loss": 4.6966,
|
|
"mean_token_accuracy": 0.24067613929510118,
|
|
"num_tokens": 61990966.0,
|
|
"step": 27035
|
|
},
|
|
{
|
|
"entropy": 5.026900863647461,
|
|
"epoch": 2.5975024015369836,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004328016873775244,
|
|
"loss": 4.713,
|
|
"mean_token_accuracy": 0.23913576900959016,
|
|
"num_tokens": 62002417.0,
|
|
"step": 27040
|
|
},
|
|
{
|
|
"entropy": 5.1119975566864015,
|
|
"epoch": 2.5979827089337175,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00043277724733292527,
|
|
"loss": 4.8071,
|
|
"mean_token_accuracy": 0.22981019616127013,
|
|
"num_tokens": 62014698.0,
|
|
"step": 27045
|
|
},
|
|
{
|
|
"entropy": 5.070964431762695,
|
|
"epoch": 2.5984630163304514,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004327528036252071,
|
|
"loss": 4.6792,
|
|
"mean_token_accuracy": 0.243856018781662,
|
|
"num_tokens": 62025298.0,
|
|
"step": 27050
|
|
},
|
|
{
|
|
"entropy": 5.006353187561035,
|
|
"epoch": 2.5989433237271853,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004327283562549371,
|
|
"loss": 4.6973,
|
|
"mean_token_accuracy": 0.2428078219294548,
|
|
"num_tokens": 62036419.0,
|
|
"step": 27055
|
|
},
|
|
{
|
|
"entropy": 5.1200206756591795,
|
|
"epoch": 2.599423631123919,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000432703905222683,
|
|
"loss": 4.8084,
|
|
"mean_token_accuracy": 0.23436084985733033,
|
|
"num_tokens": 62046800.0,
|
|
"step": 27060
|
|
},
|
|
{
|
|
"entropy": 5.110630846023559,
|
|
"epoch": 2.5999039385206535,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043267945052901264,
|
|
"loss": 4.7779,
|
|
"mean_token_accuracy": 0.23510639518499374,
|
|
"num_tokens": 62058519.0,
|
|
"step": 27065
|
|
},
|
|
{
|
|
"entropy": 5.109530687332153,
|
|
"epoch": 2.600384245917387,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004326549921744935,
|
|
"loss": 4.757,
|
|
"mean_token_accuracy": 0.23558929562568665,
|
|
"num_tokens": 62070445.0,
|
|
"step": 27070
|
|
},
|
|
{
|
|
"entropy": 5.126020717620849,
|
|
"epoch": 2.6008645533141213,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004326305301596937,
|
|
"loss": 4.822,
|
|
"mean_token_accuracy": 0.22863227427005767,
|
|
"num_tokens": 62083084.0,
|
|
"step": 27075
|
|
},
|
|
{
|
|
"entropy": 5.143060207366943,
|
|
"epoch": 2.6013448607108547,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043260606448518096,
|
|
"loss": 4.763,
|
|
"mean_token_accuracy": 0.2291410893201828,
|
|
"num_tokens": 62094334.0,
|
|
"step": 27080
|
|
},
|
|
{
|
|
"entropy": 5.088135766983032,
|
|
"epoch": 2.601825168107589,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043258159515152347,
|
|
"loss": 4.7475,
|
|
"mean_token_accuracy": 0.23574207574129105,
|
|
"num_tokens": 62105065.0,
|
|
"step": 27085
|
|
},
|
|
{
|
|
"entropy": 5.125645399093628,
|
|
"epoch": 2.602305475504323,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004325571221592892,
|
|
"loss": 4.8434,
|
|
"mean_token_accuracy": 0.22585551887750627,
|
|
"num_tokens": 62116990.0,
|
|
"step": 27090
|
|
},
|
|
{
|
|
"entropy": 5.089757871627808,
|
|
"epoch": 2.6027857829010568,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00043253264550904646,
|
|
"loss": 4.6787,
|
|
"mean_token_accuracy": 0.2427651584148407,
|
|
"num_tokens": 62128096.0,
|
|
"step": 27095
|
|
},
|
|
{
|
|
"entropy": 5.205277824401856,
|
|
"epoch": 2.6032660902977907,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004325081652013635,
|
|
"loss": 4.8836,
|
|
"mean_token_accuracy": 0.22276324778795242,
|
|
"num_tokens": 62141139.0,
|
|
"step": 27100
|
|
},
|
|
{
|
|
"entropy": 5.055752944946289,
|
|
"epoch": 2.6037463976945245,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00043248368123680855,
|
|
"loss": 4.6158,
|
|
"mean_token_accuracy": 0.24648532420396804,
|
|
"num_tokens": 62152522.0,
|
|
"step": 27105
|
|
},
|
|
{
|
|
"entropy": 4.990445327758789,
|
|
"epoch": 2.6042267050912584,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00043245919361595026,
|
|
"loss": 4.6983,
|
|
"mean_token_accuracy": 0.24285367727279664,
|
|
"num_tokens": 62162919.0,
|
|
"step": 27110
|
|
},
|
|
{
|
|
"entropy": 5.088385057449341,
|
|
"epoch": 2.6047070124879923,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043243470233935696,
|
|
"loss": 4.736,
|
|
"mean_token_accuracy": 0.23736914843320847,
|
|
"num_tokens": 62174936.0,
|
|
"step": 27115
|
|
},
|
|
{
|
|
"entropy": 5.158363199234008,
|
|
"epoch": 2.605187319884726,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004324102074075973,
|
|
"loss": 4.7903,
|
|
"mean_token_accuracy": 0.23448725491762162,
|
|
"num_tokens": 62186260.0,
|
|
"step": 27120
|
|
},
|
|
{
|
|
"entropy": 5.0671289443969725,
|
|
"epoch": 2.60566762728146,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004323857088212402,
|
|
"loss": 4.6928,
|
|
"mean_token_accuracy": 0.2398076131939888,
|
|
"num_tokens": 62198616.0,
|
|
"step": 27125
|
|
},
|
|
{
|
|
"entropy": 5.043847560882568,
|
|
"epoch": 2.606147934678194,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004323612065808541,
|
|
"loss": 4.7053,
|
|
"mean_token_accuracy": 0.23870875984430312,
|
|
"num_tokens": 62209318.0,
|
|
"step": 27130
|
|
},
|
|
{
|
|
"entropy": 5.1781915664672855,
|
|
"epoch": 2.606628242074928,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00043233670068700827,
|
|
"loss": 4.8822,
|
|
"mean_token_accuracy": 0.22753420770168303,
|
|
"num_tokens": 62220510.0,
|
|
"step": 27135
|
|
},
|
|
{
|
|
"entropy": 5.175050258636475,
|
|
"epoch": 2.6071085494716617,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004323121911402713,
|
|
"loss": 4.8766,
|
|
"mean_token_accuracy": 0.22503857761621476,
|
|
"num_tokens": 62233676.0,
|
|
"step": 27140
|
|
},
|
|
{
|
|
"entropy": 5.11496376991272,
|
|
"epoch": 2.6075888568683956,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00043228767794121245,
|
|
"loss": 4.7784,
|
|
"mean_token_accuracy": 0.23106486797332765,
|
|
"num_tokens": 62244604.0,
|
|
"step": 27145
|
|
},
|
|
{
|
|
"entropy": 5.075182390213013,
|
|
"epoch": 2.60806916426513,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004322631610904006,
|
|
"loss": 4.792,
|
|
"mean_token_accuracy": 0.23115721493959426,
|
|
"num_tokens": 62255431.0,
|
|
"step": 27150
|
|
},
|
|
{
|
|
"entropy": 5.2251488208770756,
|
|
"epoch": 2.6085494716618634,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043223864058840534,
|
|
"loss": 4.9481,
|
|
"mean_token_accuracy": 0.22021759003400804,
|
|
"num_tokens": 62267050.0,
|
|
"step": 27155
|
|
},
|
|
{
|
|
"entropy": 5.163397312164307,
|
|
"epoch": 2.6090297790585977,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00043221411643579557,
|
|
"loss": 4.8037,
|
|
"mean_token_accuracy": 0.23190855830907822,
|
|
"num_tokens": 62278939.0,
|
|
"step": 27160
|
|
},
|
|
{
|
|
"entropy": 5.035414218902588,
|
|
"epoch": 2.6095100864553316,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00043218958863314096,
|
|
"loss": 4.764,
|
|
"mean_token_accuracy": 0.23242587298154832,
|
|
"num_tokens": 62291275.0,
|
|
"step": 27165
|
|
},
|
|
{
|
|
"entropy": 4.995542573928833,
|
|
"epoch": 2.6099903938520654,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004321650571810109,
|
|
"loss": 4.6767,
|
|
"mean_token_accuracy": 0.23926113694906234,
|
|
"num_tokens": 62302191.0,
|
|
"step": 27170
|
|
},
|
|
{
|
|
"entropy": 5.10365104675293,
|
|
"epoch": 2.6104707012487993,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004321405220799747,
|
|
"loss": 4.7078,
|
|
"mean_token_accuracy": 0.24113160371780396,
|
|
"num_tokens": 62313678.0,
|
|
"step": 27175
|
|
},
|
|
{
|
|
"entropy": 5.059454250335693,
|
|
"epoch": 2.610951008645533,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004321159833306024,
|
|
"loss": 4.723,
|
|
"mean_token_accuracy": 0.2401316285133362,
|
|
"num_tokens": 62324324.0,
|
|
"step": 27180
|
|
},
|
|
{
|
|
"entropy": 5.137457323074341,
|
|
"epoch": 2.611431316042267,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004320914409334634,
|
|
"loss": 4.8844,
|
|
"mean_token_accuracy": 0.2295592397451401,
|
|
"num_tokens": 62335385.0,
|
|
"step": 27185
|
|
},
|
|
{
|
|
"entropy": 5.141259002685547,
|
|
"epoch": 2.611911623439001,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004320668948891276,
|
|
"loss": 4.83,
|
|
"mean_token_accuracy": 0.22620759904384613,
|
|
"num_tokens": 62346733.0,
|
|
"step": 27190
|
|
},
|
|
{
|
|
"entropy": 5.138140726089477,
|
|
"epoch": 2.612391930835735,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043204234519816486,
|
|
"loss": 4.7799,
|
|
"mean_token_accuracy": 0.23260708153247833,
|
|
"num_tokens": 62358674.0,
|
|
"step": 27195
|
|
},
|
|
{
|
|
"entropy": 5.097907829284668,
|
|
"epoch": 2.6128722382324687,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004320177918611453,
|
|
"loss": 4.7617,
|
|
"mean_token_accuracy": 0.2355603665113449,
|
|
"num_tokens": 62370431.0,
|
|
"step": 27200
|
|
},
|
|
{
|
|
"entropy": 5.071603536605835,
|
|
"epoch": 2.6133525456292026,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00043199323487863876,
|
|
"loss": 4.7055,
|
|
"mean_token_accuracy": 0.23453706800937651,
|
|
"num_tokens": 62381098.0,
|
|
"step": 27205
|
|
},
|
|
{
|
|
"entropy": 5.105896091461181,
|
|
"epoch": 2.6138328530259365,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00043196867425121554,
|
|
"loss": 4.7289,
|
|
"mean_token_accuracy": 0.2379012182354927,
|
|
"num_tokens": 62393446.0,
|
|
"step": 27210
|
|
},
|
|
{
|
|
"entropy": 5.0916307926177975,
|
|
"epoch": 2.6143131604226704,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00043194410997944577,
|
|
"loss": 4.8031,
|
|
"mean_token_accuracy": 0.2336391270160675,
|
|
"num_tokens": 62406581.0,
|
|
"step": 27215
|
|
},
|
|
{
|
|
"entropy": 5.140932083129883,
|
|
"epoch": 2.6147934678194042,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043191954206389985,
|
|
"loss": 4.8223,
|
|
"mean_token_accuracy": 0.22878152132034302,
|
|
"num_tokens": 62419516.0,
|
|
"step": 27220
|
|
},
|
|
{
|
|
"entropy": 5.123697328567505,
|
|
"epoch": 2.6152737752161386,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004318949705051481,
|
|
"loss": 4.7438,
|
|
"mean_token_accuracy": 0.2326513096690178,
|
|
"num_tokens": 62431577.0,
|
|
"step": 27225
|
|
},
|
|
{
|
|
"entropy": 5.015741491317749,
|
|
"epoch": 2.615754082612872,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004318703953037609,
|
|
"loss": 4.7388,
|
|
"mean_token_accuracy": 0.23688763231039048,
|
|
"num_tokens": 62444507.0,
|
|
"step": 27230
|
|
},
|
|
{
|
|
"entropy": 5.019672060012818,
|
|
"epoch": 2.6162343900096063,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004318458164603091,
|
|
"loss": 4.7087,
|
|
"mean_token_accuracy": 0.2319497287273407,
|
|
"num_tokens": 62455692.0,
|
|
"step": 27235
|
|
},
|
|
{
|
|
"entropy": 5.1985191822052,
|
|
"epoch": 2.61671469740634,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004318212339753631,
|
|
"loss": 4.8151,
|
|
"mean_token_accuracy": 0.2287910521030426,
|
|
"num_tokens": 62467215.0,
|
|
"step": 27240
|
|
},
|
|
{
|
|
"entropy": 5.120895576477051,
|
|
"epoch": 2.617195004803074,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043179664784949375,
|
|
"loss": 4.8341,
|
|
"mean_token_accuracy": 0.230615296959877,
|
|
"num_tokens": 62480016.0,
|
|
"step": 27245
|
|
},
|
|
{
|
|
"entropy": 5.132107305526733,
|
|
"epoch": 2.617675312199808,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004317720580832717,
|
|
"loss": 4.8261,
|
|
"mean_token_accuracy": 0.23632377982139588,
|
|
"num_tokens": 62491088.0,
|
|
"step": 27250
|
|
},
|
|
{
|
|
"entropy": 5.117570686340332,
|
|
"epoch": 2.618155619596542,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004317474646772681,
|
|
"loss": 4.7609,
|
|
"mean_token_accuracy": 0.23309019654989244,
|
|
"num_tokens": 62502405.0,
|
|
"step": 27255
|
|
},
|
|
{
|
|
"entropy": 5.130024766921997,
|
|
"epoch": 2.6186359269932757,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004317228676320539,
|
|
"loss": 4.7606,
|
|
"mean_token_accuracy": 0.2346285194158554,
|
|
"num_tokens": 62512968.0,
|
|
"step": 27260
|
|
},
|
|
{
|
|
"entropy": 5.057028913497925,
|
|
"epoch": 2.6191162343900096,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00043169826694819987,
|
|
"loss": 4.6535,
|
|
"mean_token_accuracy": 0.24227974116802214,
|
|
"num_tokens": 62524242.0,
|
|
"step": 27265
|
|
},
|
|
{
|
|
"entropy": 5.120878648757935,
|
|
"epoch": 2.6195965417867435,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004316736626262775,
|
|
"loss": 4.8243,
|
|
"mean_token_accuracy": 0.23332867175340652,
|
|
"num_tokens": 62536556.0,
|
|
"step": 27270
|
|
},
|
|
{
|
|
"entropy": 5.16615777015686,
|
|
"epoch": 2.6200768491834774,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004316490546668579,
|
|
"loss": 4.7916,
|
|
"mean_token_accuracy": 0.2371727392077446,
|
|
"num_tokens": 62547897.0,
|
|
"step": 27275
|
|
},
|
|
{
|
|
"entropy": 5.110966777801513,
|
|
"epoch": 2.6205571565802113,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004316244430705124,
|
|
"loss": 4.7821,
|
|
"mean_token_accuracy": 0.2275903344154358,
|
|
"num_tokens": 62560190.0,
|
|
"step": 27280
|
|
},
|
|
{
|
|
"entropy": 5.09627046585083,
|
|
"epoch": 2.621037463976945,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004315998278378123,
|
|
"loss": 4.856,
|
|
"mean_token_accuracy": 0.22962083518505097,
|
|
"num_tokens": 62571885.0,
|
|
"step": 27285
|
|
},
|
|
{
|
|
"entropy": 5.106700801849366,
|
|
"epoch": 2.621517771373679,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043157520896932943,
|
|
"loss": 4.7795,
|
|
"mean_token_accuracy": 0.2367392286658287,
|
|
"num_tokens": 62583698.0,
|
|
"step": 27290
|
|
},
|
|
{
|
|
"entropy": 5.210479211807251,
|
|
"epoch": 2.621998078770413,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000431550586465635,
|
|
"loss": 4.8258,
|
|
"mean_token_accuracy": 0.2318419858813286,
|
|
"num_tokens": 62594813.0,
|
|
"step": 27295
|
|
},
|
|
{
|
|
"entropy": 5.029856395721436,
|
|
"epoch": 2.6224783861671472,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00043152596032730085,
|
|
"loss": 4.6127,
|
|
"mean_token_accuracy": 0.24580606669187546,
|
|
"num_tokens": 62604868.0,
|
|
"step": 27300
|
|
},
|
|
{
|
|
"entropy": 4.966013050079345,
|
|
"epoch": 2.6229586935638807,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043150133055489865,
|
|
"loss": 4.6922,
|
|
"mean_token_accuracy": 0.23778087943792342,
|
|
"num_tokens": 62616553.0,
|
|
"step": 27305
|
|
},
|
|
{
|
|
"entropy": 5.088241481781006,
|
|
"epoch": 2.623439000960615,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004314766971490003,
|
|
"loss": 4.6779,
|
|
"mean_token_accuracy": 0.23836185783147812,
|
|
"num_tokens": 62627468.0,
|
|
"step": 27310
|
|
},
|
|
{
|
|
"entropy": 5.1208264350891115,
|
|
"epoch": 2.6239193083573484,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004314520601101776,
|
|
"loss": 4.7641,
|
|
"mean_token_accuracy": 0.23261116296052933,
|
|
"num_tokens": 62639255.0,
|
|
"step": 27315
|
|
},
|
|
{
|
|
"entropy": 5.080009984970093,
|
|
"epoch": 2.6243996157540828,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00043142741943900275,
|
|
"loss": 4.7307,
|
|
"mean_token_accuracy": 0.2292654573917389,
|
|
"num_tokens": 62650553.0,
|
|
"step": 27320
|
|
},
|
|
{
|
|
"entropy": 5.138628101348877,
|
|
"epoch": 2.6248799231508166,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00043140277513604763,
|
|
"loss": 4.883,
|
|
"mean_token_accuracy": 0.22575445473194122,
|
|
"num_tokens": 62662653.0,
|
|
"step": 27325
|
|
},
|
|
{
|
|
"entropy": 5.092043256759643,
|
|
"epoch": 2.6253602305475505,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004313781272018845,
|
|
"loss": 4.7271,
|
|
"mean_token_accuracy": 0.23904713839292527,
|
|
"num_tokens": 62673618.0,
|
|
"step": 27330
|
|
},
|
|
{
|
|
"entropy": 5.065589809417725,
|
|
"epoch": 2.6258405379442844,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004313534756370857,
|
|
"loss": 4.6816,
|
|
"mean_token_accuracy": 0.23946530520915985,
|
|
"num_tokens": 62685221.0,
|
|
"step": 27335
|
|
},
|
|
{
|
|
"entropy": 5.036132669448852,
|
|
"epoch": 2.6263208453410183,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00043132882044222336,
|
|
"loss": 4.6175,
|
|
"mean_token_accuracy": 0.24858939349651338,
|
|
"num_tokens": 62695544.0,
|
|
"step": 27340
|
|
},
|
|
{
|
|
"entropy": 4.945739364624023,
|
|
"epoch": 2.626801152737752,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00043130416161787005,
|
|
"loss": 4.5768,
|
|
"mean_token_accuracy": 0.24817198663949966,
|
|
"num_tokens": 62706945.0,
|
|
"step": 27345
|
|
},
|
|
{
|
|
"entropy": 5.086688137054443,
|
|
"epoch": 2.627281460134486,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00043127949916459823,
|
|
"loss": 4.7471,
|
|
"mean_token_accuracy": 0.23233324140310288,
|
|
"num_tokens": 62717821.0,
|
|
"step": 27350
|
|
},
|
|
{
|
|
"entropy": 5.041226196289062,
|
|
"epoch": 2.62776176753122,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00043125483308298053,
|
|
"loss": 4.664,
|
|
"mean_token_accuracy": 0.2415490359067917,
|
|
"num_tokens": 62730477.0,
|
|
"step": 27355
|
|
},
|
|
{
|
|
"entropy": 5.064918756484985,
|
|
"epoch": 2.628242074927954,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004312301633735895,
|
|
"loss": 4.7282,
|
|
"mean_token_accuracy": 0.24113650470972062,
|
|
"num_tokens": 62741725.0,
|
|
"step": 27360
|
|
},
|
|
{
|
|
"entropy": 5.099993944168091,
|
|
"epoch": 2.6287223823246877,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004312054900369981,
|
|
"loss": 4.8248,
|
|
"mean_token_accuracy": 0.22712087631225586,
|
|
"num_tokens": 62753600.0,
|
|
"step": 27365
|
|
},
|
|
{
|
|
"entropy": 5.102942943572998,
|
|
"epoch": 2.6292026897214216,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000431180813073779,
|
|
"loss": 4.84,
|
|
"mean_token_accuracy": 0.23344789445400238,
|
|
"num_tokens": 62766218.0,
|
|
"step": 27370
|
|
},
|
|
{
|
|
"entropy": 5.087442827224732,
|
|
"epoch": 2.629682997118156,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004311561324845051,
|
|
"loss": 4.7105,
|
|
"mean_token_accuracy": 0.23643388450145722,
|
|
"num_tokens": 62777723.0,
|
|
"step": 27375
|
|
},
|
|
{
|
|
"entropy": 5.167104578018188,
|
|
"epoch": 2.6301633045148893,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004311314482697496,
|
|
"loss": 4.8839,
|
|
"mean_token_accuracy": 0.22849978357553483,
|
|
"num_tokens": 62789711.0,
|
|
"step": 27380
|
|
},
|
|
{
|
|
"entropy": 5.061525678634643,
|
|
"epoch": 2.6306436119116237,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004311067604300855,
|
|
"loss": 4.7134,
|
|
"mean_token_accuracy": 0.24252797961235045,
|
|
"num_tokens": 62800600.0,
|
|
"step": 27385
|
|
},
|
|
{
|
|
"entropy": 5.123406314849854,
|
|
"epoch": 2.631123919308357,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004310820689660859,
|
|
"loss": 4.7797,
|
|
"mean_token_accuracy": 0.2355131432414055,
|
|
"num_tokens": 62811872.0,
|
|
"step": 27390
|
|
},
|
|
{
|
|
"entropy": 5.110841369628906,
|
|
"epoch": 2.6316042267050914,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004310573738783242,
|
|
"loss": 4.7553,
|
|
"mean_token_accuracy": 0.24118882417678833,
|
|
"num_tokens": 62822886.0,
|
|
"step": 27395
|
|
},
|
|
{
|
|
"entropy": 5.041201734542847,
|
|
"epoch": 2.6320845341018253,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004310326751673736,
|
|
"loss": 4.6955,
|
|
"mean_token_accuracy": 0.23755438774824142,
|
|
"num_tokens": 62833911.0,
|
|
"step": 27400
|
|
},
|
|
{
|
|
"entropy": 5.047704887390137,
|
|
"epoch": 2.632564841498559,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043100797283380756,
|
|
"loss": 4.7228,
|
|
"mean_token_accuracy": 0.2393511191010475,
|
|
"num_tokens": 62844756.0,
|
|
"step": 27405
|
|
},
|
|
{
|
|
"entropy": 5.068274259567261,
|
|
"epoch": 2.633045148895293,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00043098326687819973,
|
|
"loss": 4.7178,
|
|
"mean_token_accuracy": 0.23599186539649963,
|
|
"num_tokens": 62856072.0,
|
|
"step": 27410
|
|
},
|
|
{
|
|
"entropy": 5.170894289016724,
|
|
"epoch": 2.633525456292027,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004309585573011236,
|
|
"loss": 4.8316,
|
|
"mean_token_accuracy": 0.22551288306713105,
|
|
"num_tokens": 62868334.0,
|
|
"step": 27415
|
|
},
|
|
{
|
|
"entropy": 5.042424583435059,
|
|
"epoch": 2.634005763688761,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004309338441031528,
|
|
"loss": 4.6388,
|
|
"mean_token_accuracy": 0.2423781707882881,
|
|
"num_tokens": 62879459.0,
|
|
"step": 27420
|
|
},
|
|
{
|
|
"entropy": 4.985827016830444,
|
|
"epoch": 2.6344860710854947,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043090912728486135,
|
|
"loss": 4.6274,
|
|
"mean_token_accuracy": 0.24838806539773942,
|
|
"num_tokens": 62890137.0,
|
|
"step": 27425
|
|
},
|
|
{
|
|
"entropy": 5.121232414245606,
|
|
"epoch": 2.6349663784822286,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004308844068468228,
|
|
"loss": 4.7832,
|
|
"mean_token_accuracy": 0.22631029188632965,
|
|
"num_tokens": 62902688.0,
|
|
"step": 27430
|
|
},
|
|
{
|
|
"entropy": 5.035251426696777,
|
|
"epoch": 2.6354466858789625,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00043085968278961116,
|
|
"loss": 4.748,
|
|
"mean_token_accuracy": 0.24102103114128112,
|
|
"num_tokens": 62913008.0,
|
|
"step": 27435
|
|
},
|
|
{
|
|
"entropy": 5.134324455261231,
|
|
"epoch": 2.6359269932756964,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00043083495511380055,
|
|
"loss": 4.7854,
|
|
"mean_token_accuracy": 0.2309037923812866,
|
|
"num_tokens": 62924894.0,
|
|
"step": 27440
|
|
},
|
|
{
|
|
"entropy": 5.075713205337524,
|
|
"epoch": 2.6364073006724302,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00043081022381996506,
|
|
"loss": 4.6053,
|
|
"mean_token_accuracy": 0.25516229718923567,
|
|
"num_tokens": 62935709.0,
|
|
"step": 27445
|
|
},
|
|
{
|
|
"entropy": 5.1133420944213865,
|
|
"epoch": 2.636887608069164,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004307854889086787,
|
|
"loss": 4.8046,
|
|
"mean_token_accuracy": 0.23403265476226806,
|
|
"num_tokens": 62947122.0,
|
|
"step": 27450
|
|
},
|
|
{
|
|
"entropy": 5.044565868377686,
|
|
"epoch": 2.637367915465898,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00043076075038051605,
|
|
"loss": 4.7006,
|
|
"mean_token_accuracy": 0.23542070537805557,
|
|
"num_tokens": 62957993.0,
|
|
"step": 27455
|
|
},
|
|
{
|
|
"entropy": 5.1721728324890135,
|
|
"epoch": 2.6378482228626323,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004307360082360512,
|
|
"loss": 4.8084,
|
|
"mean_token_accuracy": 0.23182854056358337,
|
|
"num_tokens": 62969645.0,
|
|
"step": 27460
|
|
},
|
|
{
|
|
"entropy": 5.086919450759888,
|
|
"epoch": 2.6383285302593658,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00043071126247585866,
|
|
"loss": 4.7242,
|
|
"mean_token_accuracy": 0.2411841481924057,
|
|
"num_tokens": 62980770.0,
|
|
"step": 27465
|
|
},
|
|
{
|
|
"entropy": 5.03817572593689,
|
|
"epoch": 2.6388088376561,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000430686513100513,
|
|
"loss": 4.6346,
|
|
"mean_token_accuracy": 0.24868223518133165,
|
|
"num_tokens": 62991418.0,
|
|
"step": 27470
|
|
},
|
|
{
|
|
"entropy": 5.026139974594116,
|
|
"epoch": 2.639289145052834,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00043066176011058877,
|
|
"loss": 4.6808,
|
|
"mean_token_accuracy": 0.24658098071813583,
|
|
"num_tokens": 63002228.0,
|
|
"step": 27475
|
|
},
|
|
{
|
|
"entropy": 5.067081069946289,
|
|
"epoch": 2.639769452449568,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00043063700350666066,
|
|
"loss": 4.7243,
|
|
"mean_token_accuracy": 0.23191245943307875,
|
|
"num_tokens": 63013502.0,
|
|
"step": 27480
|
|
},
|
|
{
|
|
"entropy": 5.13173246383667,
|
|
"epoch": 2.6402497598463017,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004306122432893036,
|
|
"loss": 4.8459,
|
|
"mean_token_accuracy": 0.2285308927297592,
|
|
"num_tokens": 63024530.0,
|
|
"step": 27485
|
|
},
|
|
{
|
|
"entropy": 5.074685430526733,
|
|
"epoch": 2.6407300672430356,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00043058747945909224,
|
|
"loss": 4.7229,
|
|
"mean_token_accuracy": 0.24568860083818436,
|
|
"num_tokens": 63035668.0,
|
|
"step": 27490
|
|
},
|
|
{
|
|
"entropy": 5.076900672912598,
|
|
"epoch": 2.6412103746397695,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00043056271201660166,
|
|
"loss": 4.7192,
|
|
"mean_token_accuracy": 0.240654818713665,
|
|
"num_tokens": 63048020.0,
|
|
"step": 27495
|
|
},
|
|
{
|
|
"entropy": 5.099683141708374,
|
|
"epoch": 2.6416906820365034,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004305379409624068,
|
|
"loss": 4.7328,
|
|
"mean_token_accuracy": 0.23687127530574797,
|
|
"num_tokens": 63060019.0,
|
|
"step": 27500
|
|
},
|
|
{
|
|
"entropy": 5.041384553909301,
|
|
"epoch": 2.6421709894332372,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004305131662970827,
|
|
"loss": 4.6942,
|
|
"mean_token_accuracy": 0.23739387094974518,
|
|
"num_tokens": 63070151.0,
|
|
"step": 27505
|
|
},
|
|
{
|
|
"entropy": 5.056379270553589,
|
|
"epoch": 2.642651296829971,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004304883880212048,
|
|
"loss": 4.7149,
|
|
"mean_token_accuracy": 0.2374277889728546,
|
|
"num_tokens": 63082174.0,
|
|
"step": 27510
|
|
},
|
|
{
|
|
"entropy": 5.059743070602417,
|
|
"epoch": 2.643131604226705,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004304636061353482,
|
|
"loss": 4.7311,
|
|
"mean_token_accuracy": 0.24360514134168626,
|
|
"num_tokens": 63093478.0,
|
|
"step": 27515
|
|
},
|
|
{
|
|
"entropy": 5.002159929275512,
|
|
"epoch": 2.643611911623439,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004304388206400883,
|
|
"loss": 4.6349,
|
|
"mean_token_accuracy": 0.2428338959813118,
|
|
"num_tokens": 63104153.0,
|
|
"step": 27520
|
|
},
|
|
{
|
|
"entropy": 4.995828104019165,
|
|
"epoch": 2.6440922190201728,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004304140315360005,
|
|
"loss": 4.6428,
|
|
"mean_token_accuracy": 0.24678767919540406,
|
|
"num_tokens": 63115469.0,
|
|
"step": 27525
|
|
},
|
|
{
|
|
"entropy": 5.022479581832886,
|
|
"epoch": 2.6445725264169067,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004303892388236604,
|
|
"loss": 4.7931,
|
|
"mean_token_accuracy": 0.22994169294834138,
|
|
"num_tokens": 63126917.0,
|
|
"step": 27530
|
|
},
|
|
{
|
|
"entropy": 5.1109912395477295,
|
|
"epoch": 2.645052833813641,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004303644425036436,
|
|
"loss": 4.7511,
|
|
"mean_token_accuracy": 0.23468243926763535,
|
|
"num_tokens": 63138865.0,
|
|
"step": 27535
|
|
},
|
|
{
|
|
"entropy": 5.048393392562867,
|
|
"epoch": 2.6455331412103744,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043033964257652575,
|
|
"loss": 4.573,
|
|
"mean_token_accuracy": 0.24706293046474456,
|
|
"num_tokens": 63148937.0,
|
|
"step": 27540
|
|
},
|
|
{
|
|
"entropy": 4.972245502471924,
|
|
"epoch": 2.6460134486071087,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004303148390428827,
|
|
"loss": 4.6733,
|
|
"mean_token_accuracy": 0.24274368435144425,
|
|
"num_tokens": 63160850.0,
|
|
"step": 27545
|
|
},
|
|
{
|
|
"entropy": 5.079576253890991,
|
|
"epoch": 2.6464937560038426,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00043029003190329023,
|
|
"loss": 4.779,
|
|
"mean_token_accuracy": 0.23910272717475892,
|
|
"num_tokens": 63171660.0,
|
|
"step": 27550
|
|
},
|
|
{
|
|
"entropy": 5.109176349639893,
|
|
"epoch": 2.6469740634005765,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004302652211583244,
|
|
"loss": 4.7526,
|
|
"mean_token_accuracy": 0.23146633058786392,
|
|
"num_tokens": 63182291.0,
|
|
"step": 27555
|
|
},
|
|
{
|
|
"entropy": 5.085505628585816,
|
|
"epoch": 2.6474543707973104,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004302404068085612,
|
|
"loss": 4.6465,
|
|
"mean_token_accuracy": 0.24001386463642121,
|
|
"num_tokens": 63193646.0,
|
|
"step": 27560
|
|
},
|
|
{
|
|
"entropy": 5.036965370178223,
|
|
"epoch": 2.6479346781940443,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004302155888545766,
|
|
"loss": 4.807,
|
|
"mean_token_accuracy": 0.23138225376605986,
|
|
"num_tokens": 63205021.0,
|
|
"step": 27565
|
|
},
|
|
{
|
|
"entropy": 5.088585996627808,
|
|
"epoch": 2.648414985590778,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000430190767296947,
|
|
"loss": 4.7587,
|
|
"mean_token_accuracy": 0.2371032327413559,
|
|
"num_tokens": 63217315.0,
|
|
"step": 27570
|
|
},
|
|
{
|
|
"entropy": 5.120937442779541,
|
|
"epoch": 2.648895292987512,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004301659421362486,
|
|
"loss": 4.7396,
|
|
"mean_token_accuracy": 0.23861674815416337,
|
|
"num_tokens": 63229082.0,
|
|
"step": 27575
|
|
},
|
|
{
|
|
"entropy": 5.21462049484253,
|
|
"epoch": 2.649375600384246,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004301411133730578,
|
|
"loss": 4.8436,
|
|
"mean_token_accuracy": 0.2371189922094345,
|
|
"num_tokens": 63240498.0,
|
|
"step": 27580
|
|
},
|
|
{
|
|
"entropy": 5.082724905014038,
|
|
"epoch": 2.64985590778098,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00043011628100795093,
|
|
"loss": 4.8199,
|
|
"mean_token_accuracy": 0.22867369055747985,
|
|
"num_tokens": 63251149.0,
|
|
"step": 27585
|
|
},
|
|
{
|
|
"entropy": 5.183015918731689,
|
|
"epoch": 2.6503362151777137,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004300914450415047,
|
|
"loss": 4.8088,
|
|
"mean_token_accuracy": 0.23819313049316407,
|
|
"num_tokens": 63261903.0,
|
|
"step": 27590
|
|
},
|
|
{
|
|
"entropy": 5.048046827316284,
|
|
"epoch": 2.6508165225744476,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00043006660547429565,
|
|
"loss": 4.7319,
|
|
"mean_token_accuracy": 0.23319058120250702,
|
|
"num_tokens": 63274281.0,
|
|
"step": 27595
|
|
},
|
|
{
|
|
"entropy": 4.994765424728394,
|
|
"epoch": 2.6512968299711814,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004300417623069005,
|
|
"loss": 4.6199,
|
|
"mean_token_accuracy": 0.24826875925064087,
|
|
"num_tokens": 63285421.0,
|
|
"step": 27600
|
|
},
|
|
{
|
|
"entropy": 5.075887060165405,
|
|
"epoch": 2.6517771373679153,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004300169155398959,
|
|
"loss": 4.721,
|
|
"mean_token_accuracy": 0.23933835029602052,
|
|
"num_tokens": 63296828.0,
|
|
"step": 27605
|
|
},
|
|
{
|
|
"entropy": 5.192564821243286,
|
|
"epoch": 2.6522574447646496,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00042999206517385885,
|
|
"loss": 4.8035,
|
|
"mean_token_accuracy": 0.2297218009829521,
|
|
"num_tokens": 63307132.0,
|
|
"step": 27610
|
|
},
|
|
{
|
|
"entropy": 5.140701389312744,
|
|
"epoch": 2.652737752161383,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004299672112093663,
|
|
"loss": 4.8137,
|
|
"mean_token_accuracy": 0.22922593206167222,
|
|
"num_tokens": 63318911.0,
|
|
"step": 27615
|
|
},
|
|
{
|
|
"entropy": 5.119142007827759,
|
|
"epoch": 2.6532180595581174,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00042994235364699526,
|
|
"loss": 4.8356,
|
|
"mean_token_accuracy": 0.22658012062311172,
|
|
"num_tokens": 63329973.0,
|
|
"step": 27620
|
|
},
|
|
{
|
|
"entropy": 5.096360969543457,
|
|
"epoch": 2.653698366954851,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004299174924873229,
|
|
"loss": 4.764,
|
|
"mean_token_accuracy": 0.23279329836368562,
|
|
"num_tokens": 63341576.0,
|
|
"step": 27625
|
|
},
|
|
{
|
|
"entropy": 5.0801787853240965,
|
|
"epoch": 2.654178674351585,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004298926277309263,
|
|
"loss": 4.7749,
|
|
"mean_token_accuracy": 0.23500386327505113,
|
|
"num_tokens": 63353988.0,
|
|
"step": 27630
|
|
},
|
|
{
|
|
"entropy": 5.150972509384156,
|
|
"epoch": 2.654658981748319,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00042986775937838283,
|
|
"loss": 4.771,
|
|
"mean_token_accuracy": 0.23352182358503343,
|
|
"num_tokens": 63364746.0,
|
|
"step": 27635
|
|
},
|
|
{
|
|
"entropy": 5.140335607528686,
|
|
"epoch": 2.655139289145053,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004298428874302699,
|
|
"loss": 4.786,
|
|
"mean_token_accuracy": 0.2273385837674141,
|
|
"num_tokens": 63375608.0,
|
|
"step": 27640
|
|
},
|
|
{
|
|
"entropy": 5.034888410568238,
|
|
"epoch": 2.655619596541787,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004298180118871649,
|
|
"loss": 4.6918,
|
|
"mean_token_accuracy": 0.2379670947790146,
|
|
"num_tokens": 63386581.0,
|
|
"step": 27645
|
|
},
|
|
{
|
|
"entropy": 5.061635303497314,
|
|
"epoch": 2.6560999039385207,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042979313274964535,
|
|
"loss": 4.7719,
|
|
"mean_token_accuracy": 0.23388247340917587,
|
|
"num_tokens": 63398844.0,
|
|
"step": 27650
|
|
},
|
|
{
|
|
"entropy": 5.147040510177613,
|
|
"epoch": 2.6565802113352546,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042976825001828897,
|
|
"loss": 4.7711,
|
|
"mean_token_accuracy": 0.240594382584095,
|
|
"num_tokens": 63409972.0,
|
|
"step": 27655
|
|
},
|
|
{
|
|
"entropy": 5.079216957092285,
|
|
"epoch": 2.6570605187319885,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042974336369367333,
|
|
"loss": 4.6831,
|
|
"mean_token_accuracy": 0.24222468584775925,
|
|
"num_tokens": 63420598.0,
|
|
"step": 27660
|
|
},
|
|
{
|
|
"entropy": 5.138257026672363,
|
|
"epoch": 2.6575408261287223,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004297184737763763,
|
|
"loss": 4.8325,
|
|
"mean_token_accuracy": 0.23077375292778016,
|
|
"num_tokens": 63431943.0,
|
|
"step": 27665
|
|
},
|
|
{
|
|
"entropy": 5.031608772277832,
|
|
"epoch": 2.658021133525456,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042969358026697567,
|
|
"loss": 4.6484,
|
|
"mean_token_accuracy": 0.24392486214637757,
|
|
"num_tokens": 63442539.0,
|
|
"step": 27670
|
|
},
|
|
{
|
|
"entropy": 5.033156442642212,
|
|
"epoch": 2.65850144092219,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004296686831660495,
|
|
"loss": 4.68,
|
|
"mean_token_accuracy": 0.23908789008855819,
|
|
"num_tokens": 63453100.0,
|
|
"step": 27675
|
|
},
|
|
{
|
|
"entropy": 5.078754043579101,
|
|
"epoch": 2.658981748318924,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004296437824741758,
|
|
"loss": 4.7895,
|
|
"mean_token_accuracy": 0.23553553819656373,
|
|
"num_tokens": 63464226.0,
|
|
"step": 27680
|
|
},
|
|
{
|
|
"entropy": 5.083715200424194,
|
|
"epoch": 2.659462055715658,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042961887819193263,
|
|
"loss": 4.6776,
|
|
"mean_token_accuracy": 0.2441350758075714,
|
|
"num_tokens": 63475159.0,
|
|
"step": 27685
|
|
},
|
|
{
|
|
"entropy": 5.163394260406494,
|
|
"epoch": 2.6599423631123917,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004295939703198983,
|
|
"loss": 4.9027,
|
|
"mean_token_accuracy": 0.2212497740983963,
|
|
"num_tokens": 63488015.0,
|
|
"step": 27690
|
|
},
|
|
{
|
|
"entropy": 5.097438478469849,
|
|
"epoch": 2.660422670509126,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000429569058858651,
|
|
"loss": 4.7295,
|
|
"mean_token_accuracy": 0.23866434544324874,
|
|
"num_tokens": 63501216.0,
|
|
"step": 27695
|
|
},
|
|
{
|
|
"entropy": 5.1523370265960695,
|
|
"epoch": 2.6609029779058595,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042954414380876906,
|
|
"loss": 4.813,
|
|
"mean_token_accuracy": 0.2316722974181175,
|
|
"num_tokens": 63511560.0,
|
|
"step": 27700
|
|
},
|
|
{
|
|
"entropy": 5.157254886627197,
|
|
"epoch": 2.661383285302594,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00042951922517083104,
|
|
"loss": 4.8948,
|
|
"mean_token_accuracy": 0.23377819657325744,
|
|
"num_tokens": 63522762.0,
|
|
"step": 27705
|
|
},
|
|
{
|
|
"entropy": 5.125922918319702,
|
|
"epoch": 2.6618635926993277,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004294943029454155,
|
|
"loss": 4.8184,
|
|
"mean_token_accuracy": 0.23747162520885468,
|
|
"num_tokens": 63535095.0,
|
|
"step": 27710
|
|
},
|
|
{
|
|
"entropy": 5.068044853210449,
|
|
"epoch": 2.6623439000960616,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042946937713310093,
|
|
"loss": 4.6503,
|
|
"mean_token_accuracy": 0.24541468024253846,
|
|
"num_tokens": 63545256.0,
|
|
"step": 27715
|
|
},
|
|
{
|
|
"entropy": 5.084882020950317,
|
|
"epoch": 2.6628242074927955,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004294444477344661,
|
|
"loss": 4.7642,
|
|
"mean_token_accuracy": 0.2323298290371895,
|
|
"num_tokens": 63557269.0,
|
|
"step": 27720
|
|
},
|
|
{
|
|
"entropy": 5.0054723739624025,
|
|
"epoch": 2.6633045148895294,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004294195147500898,
|
|
"loss": 4.6352,
|
|
"mean_token_accuracy": 0.24495234042406083,
|
|
"num_tokens": 63568452.0,
|
|
"step": 27725
|
|
},
|
|
{
|
|
"entropy": 5.117168378829956,
|
|
"epoch": 2.6637848222862632,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00042939457818055095,
|
|
"loss": 4.8288,
|
|
"mean_token_accuracy": 0.22925741076469422,
|
|
"num_tokens": 63581653.0,
|
|
"step": 27730
|
|
},
|
|
{
|
|
"entropy": 5.056406259536743,
|
|
"epoch": 2.664265129682997,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042936963802642843,
|
|
"loss": 4.6295,
|
|
"mean_token_accuracy": 0.23729730695486068,
|
|
"num_tokens": 63592862.0,
|
|
"step": 27735
|
|
},
|
|
{
|
|
"entropy": 5.104184293746949,
|
|
"epoch": 2.664745437079731,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004293446942883013,
|
|
"loss": 4.7992,
|
|
"mean_token_accuracy": 0.23112255334854126,
|
|
"num_tokens": 63603306.0,
|
|
"step": 27740
|
|
},
|
|
{
|
|
"entropy": 5.138091516494751,
|
|
"epoch": 2.665225744476465,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042931974696674866,
|
|
"loss": 4.7845,
|
|
"mean_token_accuracy": 0.23358572274446487,
|
|
"num_tokens": 63615031.0,
|
|
"step": 27745
|
|
},
|
|
{
|
|
"entropy": 5.125448942184448,
|
|
"epoch": 2.6657060518731988,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004292947960623497,
|
|
"loss": 4.8156,
|
|
"mean_token_accuracy": 0.2346802145242691,
|
|
"num_tokens": 63627004.0,
|
|
"step": 27750
|
|
},
|
|
{
|
|
"entropy": 5.10497636795044,
|
|
"epoch": 2.6661863592699326,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00042926984157568384,
|
|
"loss": 4.7762,
|
|
"mean_token_accuracy": 0.23420519083738328,
|
|
"num_tokens": 63637712.0,
|
|
"step": 27755
|
|
},
|
|
{
|
|
"entropy": 5.08453893661499,
|
|
"epoch": 2.6666666666666665,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00042924488350733024,
|
|
"loss": 4.787,
|
|
"mean_token_accuracy": 0.23482900261878967,
|
|
"num_tokens": 63649631.0,
|
|
"step": 27760
|
|
},
|
|
{
|
|
"entropy": 5.143103504180909,
|
|
"epoch": 2.6671469740634004,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00042921992185786847,
|
|
"loss": 4.7379,
|
|
"mean_token_accuracy": 0.23483039140701295,
|
|
"num_tokens": 63661929.0,
|
|
"step": 27765
|
|
},
|
|
{
|
|
"entropy": 5.112990140914917,
|
|
"epoch": 2.6676272814601347,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00042919495662787813,
|
|
"loss": 4.8094,
|
|
"mean_token_accuracy": 0.22674295753240586,
|
|
"num_tokens": 63674411.0,
|
|
"step": 27770
|
|
},
|
|
{
|
|
"entropy": 5.083460283279419,
|
|
"epoch": 2.668107588856868,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004291699878179387,
|
|
"loss": 4.677,
|
|
"mean_token_accuracy": 0.23690251559019088,
|
|
"num_tokens": 63685182.0,
|
|
"step": 27775
|
|
},
|
|
{
|
|
"entropy": 5.082758903503418,
|
|
"epoch": 2.6685878962536025,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004291450154286299,
|
|
"loss": 4.7682,
|
|
"mean_token_accuracy": 0.22938674837350845,
|
|
"num_tokens": 63696374.0,
|
|
"step": 27780
|
|
},
|
|
{
|
|
"entropy": 5.066847276687622,
|
|
"epoch": 2.6690682036503364,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004291200394605317,
|
|
"loss": 4.7926,
|
|
"mean_token_accuracy": 0.23720103353261948,
|
|
"num_tokens": 63708092.0,
|
|
"step": 27785
|
|
},
|
|
{
|
|
"entropy": 5.027221441268921,
|
|
"epoch": 2.6695485110470702,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004290950599142237,
|
|
"loss": 4.6912,
|
|
"mean_token_accuracy": 0.24183435142040252,
|
|
"num_tokens": 63721143.0,
|
|
"step": 27790
|
|
},
|
|
{
|
|
"entropy": 5.13440351486206,
|
|
"epoch": 2.670028818443804,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000429070076790286,
|
|
"loss": 4.7732,
|
|
"mean_token_accuracy": 0.23022121489048003,
|
|
"num_tokens": 63732346.0,
|
|
"step": 27795
|
|
},
|
|
{
|
|
"entropy": 5.032449340820312,
|
|
"epoch": 2.670509125840538,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042904509008929873,
|
|
"loss": 4.6695,
|
|
"mean_token_accuracy": 0.24931943714618682,
|
|
"num_tokens": 63744008.0,
|
|
"step": 27800
|
|
},
|
|
{
|
|
"entropy": 5.087543678283692,
|
|
"epoch": 2.670989433237272,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004290200998118417,
|
|
"loss": 4.7047,
|
|
"mean_token_accuracy": 0.24750794917345048,
|
|
"num_tokens": 63754792.0,
|
|
"step": 27805
|
|
},
|
|
{
|
|
"entropy": 5.059582996368408,
|
|
"epoch": 2.6714697406340058,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00042899510595849544,
|
|
"loss": 4.6903,
|
|
"mean_token_accuracy": 0.23685694187879563,
|
|
"num_tokens": 63767194.0,
|
|
"step": 27810
|
|
},
|
|
{
|
|
"entropy": 5.0132557392120365,
|
|
"epoch": 2.6719500480307397,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00042897010852984004,
|
|
"loss": 4.7079,
|
|
"mean_token_accuracy": 0.2424158573150635,
|
|
"num_tokens": 63778641.0,
|
|
"step": 27815
|
|
},
|
|
{
|
|
"entropy": 5.095559215545654,
|
|
"epoch": 2.6724303554274735,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042894510752645586,
|
|
"loss": 4.7444,
|
|
"mean_token_accuracy": 0.23031575381755828,
|
|
"num_tokens": 63791447.0,
|
|
"step": 27820
|
|
},
|
|
{
|
|
"entropy": 5.129614448547363,
|
|
"epoch": 2.6729106628242074,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004289201029489235,
|
|
"loss": 4.8211,
|
|
"mean_token_accuracy": 0.2294018790125847,
|
|
"num_tokens": 63802162.0,
|
|
"step": 27825
|
|
},
|
|
{
|
|
"entropy": 5.1430269241333,
|
|
"epoch": 2.6733909702209413,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004288950947978234,
|
|
"loss": 4.7005,
|
|
"mean_token_accuracy": 0.23975236117839813,
|
|
"num_tokens": 63812689.0,
|
|
"step": 27830
|
|
},
|
|
{
|
|
"entropy": 5.1405720710754395,
|
|
"epoch": 2.673871277617675,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004288700830737361,
|
|
"loss": 4.7763,
|
|
"mean_token_accuracy": 0.23490793853998185,
|
|
"num_tokens": 63822915.0,
|
|
"step": 27835
|
|
},
|
|
{
|
|
"entropy": 5.038186025619507,
|
|
"epoch": 2.674351585014409,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042884506777724244,
|
|
"loss": 4.7465,
|
|
"mean_token_accuracy": 0.23948826938867568,
|
|
"num_tokens": 63834350.0,
|
|
"step": 27840
|
|
},
|
|
{
|
|
"entropy": 4.974474334716797,
|
|
"epoch": 2.6748318924111434,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004288200489089231,
|
|
"loss": 4.667,
|
|
"mean_token_accuracy": 0.24383982121944428,
|
|
"num_tokens": 63846150.0,
|
|
"step": 27845
|
|
},
|
|
{
|
|
"entropy": 5.162803077697754,
|
|
"epoch": 2.675312199807877,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000428795026469359,
|
|
"loss": 4.7259,
|
|
"mean_token_accuracy": 0.23322818130254747,
|
|
"num_tokens": 63857675.0,
|
|
"step": 27850
|
|
},
|
|
{
|
|
"entropy": 5.184978771209717,
|
|
"epoch": 2.675792507204611,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000428770000459131,
|
|
"loss": 4.8741,
|
|
"mean_token_accuracy": 0.2263594910502434,
|
|
"num_tokens": 63868073.0,
|
|
"step": 27855
|
|
},
|
|
{
|
|
"entropy": 5.11458010673523,
|
|
"epoch": 2.6762728146013446,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004287449708788202,
|
|
"loss": 4.7497,
|
|
"mean_token_accuracy": 0.239044252038002,
|
|
"num_tokens": 63879085.0,
|
|
"step": 27860
|
|
},
|
|
{
|
|
"entropy": 5.038825511932373,
|
|
"epoch": 2.676753121998079,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004287199377290077,
|
|
"loss": 4.6583,
|
|
"mean_token_accuracy": 0.24389497488737105,
|
|
"num_tokens": 63890885.0,
|
|
"step": 27865
|
|
},
|
|
{
|
|
"entropy": 5.092413330078125,
|
|
"epoch": 2.677233429394813,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004286949010102748,
|
|
"loss": 4.7329,
|
|
"mean_token_accuracy": 0.23647015541791916,
|
|
"num_tokens": 63902421.0,
|
|
"step": 27870
|
|
},
|
|
{
|
|
"entropy": 5.077416276931762,
|
|
"epoch": 2.6777137367915467,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004286698607232026,
|
|
"loss": 4.7645,
|
|
"mean_token_accuracy": 0.24023524522781373,
|
|
"num_tokens": 63913845.0,
|
|
"step": 27875
|
|
},
|
|
{
|
|
"entropy": 4.969077301025391,
|
|
"epoch": 2.6781940441882806,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042864481686837253,
|
|
"loss": 4.6337,
|
|
"mean_token_accuracy": 0.2435666501522064,
|
|
"num_tokens": 63925429.0,
|
|
"step": 27880
|
|
},
|
|
{
|
|
"entropy": 5.053189754486084,
|
|
"epoch": 2.6786743515850144,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00042861976944636604,
|
|
"loss": 4.7372,
|
|
"mean_token_accuracy": 0.23916109502315522,
|
|
"num_tokens": 63939451.0,
|
|
"step": 27885
|
|
},
|
|
{
|
|
"entropy": 5.203045845031738,
|
|
"epoch": 2.6791546589817483,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004285947184577647,
|
|
"loss": 4.8214,
|
|
"mean_token_accuracy": 0.23106684535741806,
|
|
"num_tokens": 63949077.0,
|
|
"step": 27890
|
|
},
|
|
{
|
|
"entropy": 5.1415793895721436,
|
|
"epoch": 2.679634966378482,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042856966390315013,
|
|
"loss": 4.7746,
|
|
"mean_token_accuracy": 0.2313278779387474,
|
|
"num_tokens": 63960553.0,
|
|
"step": 27895
|
|
},
|
|
{
|
|
"entropy": 5.140982055664063,
|
|
"epoch": 2.680115273775216,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004285446057831039,
|
|
"loss": 4.8116,
|
|
"mean_token_accuracy": 0.22729451060295106,
|
|
"num_tokens": 63973086.0,
|
|
"step": 27900
|
|
},
|
|
{
|
|
"entropy": 5.056582450866699,
|
|
"epoch": 2.68059558117195,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004285195440982078,
|
|
"loss": 4.7607,
|
|
"mean_token_accuracy": 0.22835081964731216,
|
|
"num_tokens": 63983933.0,
|
|
"step": 27905
|
|
},
|
|
{
|
|
"entropy": 5.026354026794434,
|
|
"epoch": 2.681075888568684,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004284944788490439,
|
|
"loss": 4.636,
|
|
"mean_token_accuracy": 0.24929089844226837,
|
|
"num_tokens": 63994557.0,
|
|
"step": 27910
|
|
},
|
|
{
|
|
"entropy": 5.155221891403198,
|
|
"epoch": 2.6815561959654177,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004284694100361938,
|
|
"loss": 4.8504,
|
|
"mean_token_accuracy": 0.2283597931265831,
|
|
"num_tokens": 64006368.0,
|
|
"step": 27915
|
|
},
|
|
{
|
|
"entropy": 5.095969390869141,
|
|
"epoch": 2.682036503362152,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004284443376602398,
|
|
"loss": 4.6959,
|
|
"mean_token_accuracy": 0.2350688710808754,
|
|
"num_tokens": 64018336.0,
|
|
"step": 27920
|
|
},
|
|
{
|
|
"entropy": 5.120890522003174,
|
|
"epoch": 2.6825168107588855,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004284192617217639,
|
|
"loss": 4.811,
|
|
"mean_token_accuracy": 0.23156831711530684,
|
|
"num_tokens": 64030329.0,
|
|
"step": 27925
|
|
},
|
|
{
|
|
"entropy": 5.079585886001587,
|
|
"epoch": 2.68299711815562,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004283941822213484,
|
|
"loss": 4.7732,
|
|
"mean_token_accuracy": 0.23211552947759628,
|
|
"num_tokens": 64042502.0,
|
|
"step": 27930
|
|
},
|
|
{
|
|
"entropy": 5.088845252990723,
|
|
"epoch": 2.6834774255523532,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004283690991595754,
|
|
"loss": 4.7424,
|
|
"mean_token_accuracy": 0.23303174823522568,
|
|
"num_tokens": 64054217.0,
|
|
"step": 27935
|
|
},
|
|
{
|
|
"entropy": 5.216445446014404,
|
|
"epoch": 2.6839577329490876,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00042834401253702734,
|
|
"loss": 4.8074,
|
|
"mean_token_accuracy": 0.2379058927297592,
|
|
"num_tokens": 64065019.0,
|
|
"step": 27940
|
|
},
|
|
{
|
|
"entropy": 5.018643236160278,
|
|
"epoch": 2.6844380403458215,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004283189223542866,
|
|
"loss": 4.6702,
|
|
"mean_token_accuracy": 0.2423916980624199,
|
|
"num_tokens": 64076251.0,
|
|
"step": 27945
|
|
},
|
|
{
|
|
"entropy": 5.0703541278839115,
|
|
"epoch": 2.6849183477425553,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00042829382861193585,
|
|
"loss": 4.7322,
|
|
"mean_token_accuracy": 0.23972439169883727,
|
|
"num_tokens": 64087257.0,
|
|
"step": 27950
|
|
},
|
|
{
|
|
"entropy": 5.083153772354126,
|
|
"epoch": 2.685398655139289,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004282687313105575,
|
|
"loss": 4.7656,
|
|
"mean_token_accuracy": 0.22837662547826768,
|
|
"num_tokens": 64098304.0,
|
|
"step": 27955
|
|
},
|
|
{
|
|
"entropy": 5.092272567749023,
|
|
"epoch": 2.685878962536023,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042824363045073434,
|
|
"loss": 4.7113,
|
|
"mean_token_accuracy": 0.24303918182849885,
|
|
"num_tokens": 64108839.0,
|
|
"step": 27960
|
|
},
|
|
{
|
|
"entropy": 5.149320268630982,
|
|
"epoch": 2.686359269932757,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004282185260330491,
|
|
"loss": 4.821,
|
|
"mean_token_accuracy": 0.23023061752319335,
|
|
"num_tokens": 64121579.0,
|
|
"step": 27965
|
|
},
|
|
{
|
|
"entropy": 5.015567398071289,
|
|
"epoch": 2.686839577329491,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00042819341805808473,
|
|
"loss": 4.6742,
|
|
"mean_token_accuracy": 0.24303774684667587,
|
|
"num_tokens": 64132220.0,
|
|
"step": 27970
|
|
},
|
|
{
|
|
"entropy": 5.044483709335327,
|
|
"epoch": 2.6873198847262247,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00042816830652642396,
|
|
"loss": 4.7284,
|
|
"mean_token_accuracy": 0.2366749495267868,
|
|
"num_tokens": 64143580.0,
|
|
"step": 27975
|
|
},
|
|
{
|
|
"entropy": 5.081659555435181,
|
|
"epoch": 2.6878001921229586,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004281431914386501,
|
|
"loss": 4.7454,
|
|
"mean_token_accuracy": 0.23123384416103362,
|
|
"num_tokens": 64155042.0,
|
|
"step": 27980
|
|
},
|
|
{
|
|
"entropy": 5.1955053329467775,
|
|
"epoch": 2.6882804995196925,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000428118072795346,
|
|
"loss": 4.8289,
|
|
"mean_token_accuracy": 0.23210142105817794,
|
|
"num_tokens": 64167755.0,
|
|
"step": 27985
|
|
},
|
|
{
|
|
"entropy": 5.125277090072632,
|
|
"epoch": 2.6887608069164264,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00042809295059709483,
|
|
"loss": 4.7947,
|
|
"mean_token_accuracy": 0.23100631237030028,
|
|
"num_tokens": 64180930.0,
|
|
"step": 27990
|
|
},
|
|
{
|
|
"entropy": 5.084764528274536,
|
|
"epoch": 2.6892411143131603,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00042806782484448,
|
|
"loss": 4.7767,
|
|
"mean_token_accuracy": 0.2393814891576767,
|
|
"num_tokens": 64192994.0,
|
|
"step": 27995
|
|
},
|
|
{
|
|
"entropy": 5.138259649276733,
|
|
"epoch": 2.689721421709894,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004280426955380848,
|
|
"loss": 4.7912,
|
|
"mean_token_accuracy": 0.23672110140323638,
|
|
"num_tokens": 64204572.0,
|
|
"step": 28000
|
|
},
|
|
{
|
|
"entropy": 5.04312310218811,
|
|
"epoch": 2.6902017291066285,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00042801756267849266,
|
|
"loss": 4.7363,
|
|
"mean_token_accuracy": 0.23942153304815292,
|
|
"num_tokens": 64216361.0,
|
|
"step": 28005
|
|
},
|
|
{
|
|
"entropy": 5.029325532913208,
|
|
"epoch": 2.690682036503362,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004279924262662871,
|
|
"loss": 4.6864,
|
|
"mean_token_accuracy": 0.24289357662200928,
|
|
"num_tokens": 64227074.0,
|
|
"step": 28010
|
|
},
|
|
{
|
|
"entropy": 5.0360212326049805,
|
|
"epoch": 2.6911623439000962,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004279672863020517,
|
|
"loss": 4.6994,
|
|
"mean_token_accuracy": 0.2375594422221184,
|
|
"num_tokens": 64239507.0,
|
|
"step": 28015
|
|
},
|
|
{
|
|
"entropy": 5.150512218475342,
|
|
"epoch": 2.69164265129683,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00042794214278637013,
|
|
"loss": 4.8131,
|
|
"mean_token_accuracy": 0.23091583847999572,
|
|
"num_tokens": 64251307.0,
|
|
"step": 28020
|
|
},
|
|
{
|
|
"entropy": 5.080439472198487,
|
|
"epoch": 2.692122958693564,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00042791699571982606,
|
|
"loss": 4.7844,
|
|
"mean_token_accuracy": 0.23450076282024385,
|
|
"num_tokens": 64263988.0,
|
|
"step": 28025
|
|
},
|
|
{
|
|
"entropy": 5.104472923278808,
|
|
"epoch": 2.692603266090298,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004278918451030035,
|
|
"loss": 4.7919,
|
|
"mean_token_accuracy": 0.2293301820755005,
|
|
"num_tokens": 64275311.0,
|
|
"step": 28030
|
|
},
|
|
{
|
|
"entropy": 5.169442796707154,
|
|
"epoch": 2.6930835734870318,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004278666909364863,
|
|
"loss": 4.8172,
|
|
"mean_token_accuracy": 0.23217507153749467,
|
|
"num_tokens": 64286055.0,
|
|
"step": 28035
|
|
},
|
|
{
|
|
"entropy": 5.032286691665649,
|
|
"epoch": 2.6935638808837656,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004278415332208584,
|
|
"loss": 4.711,
|
|
"mean_token_accuracy": 0.24192306250333787,
|
|
"num_tokens": 64297286.0,
|
|
"step": 28040
|
|
},
|
|
{
|
|
"entropy": 5.09624924659729,
|
|
"epoch": 2.6940441882804995,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042781637195670396,
|
|
"loss": 4.7968,
|
|
"mean_token_accuracy": 0.23089852035045624,
|
|
"num_tokens": 64308277.0,
|
|
"step": 28045
|
|
},
|
|
{
|
|
"entropy": 5.267874479293823,
|
|
"epoch": 2.6945244956772334,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004277912071446072,
|
|
"loss": 4.9367,
|
|
"mean_token_accuracy": 0.21548387855291368,
|
|
"num_tokens": 64319260.0,
|
|
"step": 28050
|
|
},
|
|
{
|
|
"entropy": 5.188047170639038,
|
|
"epoch": 2.6950048030739673,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004277660387851522,
|
|
"loss": 4.8236,
|
|
"mean_token_accuracy": 0.2330697700381279,
|
|
"num_tokens": 64329840.0,
|
|
"step": 28055
|
|
},
|
|
{
|
|
"entropy": 5.199683332443238,
|
|
"epoch": 2.695485110470701,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004277408668789235,
|
|
"loss": 4.894,
|
|
"mean_token_accuracy": 0.22882702201604843,
|
|
"num_tokens": 64342375.0,
|
|
"step": 28060
|
|
},
|
|
{
|
|
"entropy": 4.991396713256836,
|
|
"epoch": 2.695965417867435,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004277156914265054,
|
|
"loss": 4.6266,
|
|
"mean_token_accuracy": 0.25021957606077194,
|
|
"num_tokens": 64353396.0,
|
|
"step": 28065
|
|
},
|
|
{
|
|
"entropy": 5.153245162963867,
|
|
"epoch": 2.696445725264169,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004276905124284824,
|
|
"loss": 4.809,
|
|
"mean_token_accuracy": 0.23273993134498597,
|
|
"num_tokens": 64364802.0,
|
|
"step": 28070
|
|
},
|
|
{
|
|
"entropy": 5.133913660049439,
|
|
"epoch": 2.696926032660903,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004276653298854391,
|
|
"loss": 4.7728,
|
|
"mean_token_accuracy": 0.2290853813290596,
|
|
"num_tokens": 64376379.0,
|
|
"step": 28075
|
|
},
|
|
{
|
|
"entropy": 5.084238815307617,
|
|
"epoch": 2.697406340057637,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004276401437979601,
|
|
"loss": 4.7893,
|
|
"mean_token_accuracy": 0.23213465213775636,
|
|
"num_tokens": 64388039.0,
|
|
"step": 28080
|
|
},
|
|
{
|
|
"entropy": 5.1134857654571535,
|
|
"epoch": 2.6978866474543706,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004276149541666303,
|
|
"loss": 4.7497,
|
|
"mean_token_accuracy": 0.23422105759382247,
|
|
"num_tokens": 64401288.0,
|
|
"step": 28085
|
|
},
|
|
{
|
|
"entropy": 5.13084306716919,
|
|
"epoch": 2.698366954851105,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042758976099203444,
|
|
"loss": 4.7295,
|
|
"mean_token_accuracy": 0.2404603809118271,
|
|
"num_tokens": 64411392.0,
|
|
"step": 28090
|
|
},
|
|
{
|
|
"entropy": 5.123020887374878,
|
|
"epoch": 2.6988472622478388,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00042756456427475736,
|
|
"loss": 4.8165,
|
|
"mean_token_accuracy": 0.23579190522432328,
|
|
"num_tokens": 64421955.0,
|
|
"step": 28095
|
|
},
|
|
{
|
|
"entropy": 5.074358367919922,
|
|
"epoch": 2.6993275696445727,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004275393640153842,
|
|
"loss": 4.7291,
|
|
"mean_token_accuracy": 0.23813998848199844,
|
|
"num_tokens": 64434365.0,
|
|
"step": 28100
|
|
},
|
|
{
|
|
"entropy": 5.056596994400024,
|
|
"epoch": 2.6998078770413065,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00042751416021449986,
|
|
"loss": 4.691,
|
|
"mean_token_accuracy": 0.2383261129260063,
|
|
"num_tokens": 64444846.0,
|
|
"step": 28105
|
|
},
|
|
{
|
|
"entropy": 5.061080837249756,
|
|
"epoch": 2.7002881844380404,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004274889528726896,
|
|
"loss": 4.6862,
|
|
"mean_token_accuracy": 0.23701339811086655,
|
|
"num_tokens": 64455215.0,
|
|
"step": 28110
|
|
},
|
|
{
|
|
"entropy": 5.132177495956421,
|
|
"epoch": 2.7007684918347743,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004274637419905388,
|
|
"loss": 4.724,
|
|
"mean_token_accuracy": 0.24133433252573014,
|
|
"num_tokens": 64465810.0,
|
|
"step": 28115
|
|
},
|
|
{
|
|
"entropy": 5.069816207885742,
|
|
"epoch": 2.701248799231508,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00042743852756863253,
|
|
"loss": 4.6915,
|
|
"mean_token_accuracy": 0.24327150732278824,
|
|
"num_tokens": 64476615.0,
|
|
"step": 28120
|
|
},
|
|
{
|
|
"entropy": 5.0869025707244875,
|
|
"epoch": 2.701729106628242,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004274133096075563,
|
|
"loss": 4.7701,
|
|
"mean_token_accuracy": 0.23665109276771545,
|
|
"num_tokens": 64490137.0,
|
|
"step": 28125
|
|
},
|
|
{
|
|
"entropy": 5.078602361679077,
|
|
"epoch": 2.702209414024976,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004273880881078956,
|
|
"loss": 4.7441,
|
|
"mean_token_accuracy": 0.24227222949266433,
|
|
"num_tokens": 64501357.0,
|
|
"step": 28130
|
|
},
|
|
{
|
|
"entropy": 4.969816112518311,
|
|
"epoch": 2.70268972142171,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004273628630702359,
|
|
"loss": 4.5719,
|
|
"mean_token_accuracy": 0.24865195155143738,
|
|
"num_tokens": 64512784.0,
|
|
"step": 28135
|
|
},
|
|
{
|
|
"entropy": 5.0676685810089115,
|
|
"epoch": 2.7031700288184437,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00042733763449516313,
|
|
"loss": 4.7317,
|
|
"mean_token_accuracy": 0.24380185604095458,
|
|
"num_tokens": 64523800.0,
|
|
"step": 28140
|
|
},
|
|
{
|
|
"entropy": 5.086323976516724,
|
|
"epoch": 2.7036503362151776,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00042731240238326273,
|
|
"loss": 4.7152,
|
|
"mean_token_accuracy": 0.23434408009052277,
|
|
"num_tokens": 64535551.0,
|
|
"step": 28145
|
|
},
|
|
{
|
|
"entropy": 5.077157020568848,
|
|
"epoch": 2.7041306436119115,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00042728716673512065,
|
|
"loss": 4.7252,
|
|
"mean_token_accuracy": 0.24621228575706483,
|
|
"num_tokens": 64547892.0,
|
|
"step": 28150
|
|
},
|
|
{
|
|
"entropy": 4.9974264144897464,
|
|
"epoch": 2.704610951008646,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00042726192755132276,
|
|
"loss": 4.6678,
|
|
"mean_token_accuracy": 0.24135030061006546,
|
|
"num_tokens": 64559687.0,
|
|
"step": 28155
|
|
},
|
|
{
|
|
"entropy": 5.046790599822998,
|
|
"epoch": 2.7050912584053792,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00042723668483245496,
|
|
"loss": 4.6399,
|
|
"mean_token_accuracy": 0.24230584502220154,
|
|
"num_tokens": 64571691.0,
|
|
"step": 28160
|
|
},
|
|
{
|
|
"entropy": 5.146127080917358,
|
|
"epoch": 2.7055715658021136,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004272114385791035,
|
|
"loss": 4.8018,
|
|
"mean_token_accuracy": 0.23526528775691985,
|
|
"num_tokens": 64582570.0,
|
|
"step": 28165
|
|
},
|
|
{
|
|
"entropy": 5.111129426956177,
|
|
"epoch": 2.706051873198847,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00042718618879185435,
|
|
"loss": 4.7435,
|
|
"mean_token_accuracy": 0.23832932561635972,
|
|
"num_tokens": 64593610.0,
|
|
"step": 28170
|
|
},
|
|
{
|
|
"entropy": 5.058528900146484,
|
|
"epoch": 2.7065321805955813,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004271609354712938,
|
|
"loss": 4.6835,
|
|
"mean_token_accuracy": 0.24811802953481674,
|
|
"num_tokens": 64604802.0,
|
|
"step": 28175
|
|
},
|
|
{
|
|
"entropy": 5.051104402542114,
|
|
"epoch": 2.707012487992315,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004271356786180082,
|
|
"loss": 4.7137,
|
|
"mean_token_accuracy": 0.2327448919415474,
|
|
"num_tokens": 64615816.0,
|
|
"step": 28180
|
|
},
|
|
{
|
|
"entropy": 5.00749945640564,
|
|
"epoch": 2.707492795389049,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004271104182325838,
|
|
"loss": 4.6554,
|
|
"mean_token_accuracy": 0.24093613475561143,
|
|
"num_tokens": 64628890.0,
|
|
"step": 28185
|
|
},
|
|
{
|
|
"entropy": 5.027282094955444,
|
|
"epoch": 2.707973102785783,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042708515431560723,
|
|
"loss": 4.6951,
|
|
"mean_token_accuracy": 0.24452045410871506,
|
|
"num_tokens": 64639494.0,
|
|
"step": 28190
|
|
},
|
|
{
|
|
"entropy": 5.180206060409546,
|
|
"epoch": 2.708453410182517,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000427059886867665,
|
|
"loss": 4.9158,
|
|
"mean_token_accuracy": 0.22348858714103698,
|
|
"num_tokens": 64651235.0,
|
|
"step": 28195
|
|
},
|
|
{
|
|
"entropy": 5.1558619976043705,
|
|
"epoch": 2.7089337175792507,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004270346158893436,
|
|
"loss": 4.7674,
|
|
"mean_token_accuracy": 0.23493716567754747,
|
|
"num_tokens": 64662517.0,
|
|
"step": 28200
|
|
},
|
|
{
|
|
"entropy": 5.112757205963135,
|
|
"epoch": 2.7094140249759846,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00042700934138123004,
|
|
"loss": 4.8213,
|
|
"mean_token_accuracy": 0.2212560459971428,
|
|
"num_tokens": 64674521.0,
|
|
"step": 28205
|
|
},
|
|
{
|
|
"entropy": 5.166494464874267,
|
|
"epoch": 2.7098943323727185,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00042698406334391084,
|
|
"loss": 4.9204,
|
|
"mean_token_accuracy": 0.22131302058696747,
|
|
"num_tokens": 64686894.0,
|
|
"step": 28210
|
|
},
|
|
{
|
|
"entropy": 5.172549533843994,
|
|
"epoch": 2.7103746397694524,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000426958781777973,
|
|
"loss": 4.8365,
|
|
"mean_token_accuracy": 0.23947456032037734,
|
|
"num_tokens": 64697948.0,
|
|
"step": 28215
|
|
},
|
|
{
|
|
"entropy": 5.080643033981323,
|
|
"epoch": 2.7108549471661862,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004269334966840035,
|
|
"loss": 4.7004,
|
|
"mean_token_accuracy": 0.24020812660455704,
|
|
"num_tokens": 64709429.0,
|
|
"step": 28220
|
|
},
|
|
{
|
|
"entropy": 5.072011661529541,
|
|
"epoch": 2.71133525456292,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042690820806258933,
|
|
"loss": 4.6912,
|
|
"mean_token_accuracy": 0.24333603233098983,
|
|
"num_tokens": 64720529.0,
|
|
"step": 28225
|
|
},
|
|
{
|
|
"entropy": 5.007713651657104,
|
|
"epoch": 2.7118155619596545,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004268829159143176,
|
|
"loss": 4.667,
|
|
"mean_token_accuracy": 0.2444024607539177,
|
|
"num_tokens": 64731602.0,
|
|
"step": 28230
|
|
},
|
|
{
|
|
"entropy": 5.138308429718018,
|
|
"epoch": 2.712295869356388,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004268576202397757,
|
|
"loss": 4.6776,
|
|
"mean_token_accuracy": 0.23795579075813295,
|
|
"num_tokens": 64742746.0,
|
|
"step": 28235
|
|
},
|
|
{
|
|
"entropy": 5.0846727848052975,
|
|
"epoch": 2.712776176753122,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004268323210395506,
|
|
"loss": 4.7464,
|
|
"mean_token_accuracy": 0.23865850120782853,
|
|
"num_tokens": 64754643.0,
|
|
"step": 28240
|
|
},
|
|
{
|
|
"entropy": 5.182872915267945,
|
|
"epoch": 2.7132564841498557,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00042680701831423004,
|
|
"loss": 4.8211,
|
|
"mean_token_accuracy": 0.230291485786438,
|
|
"num_tokens": 64766349.0,
|
|
"step": 28245
|
|
},
|
|
{
|
|
"entropy": 5.03706693649292,
|
|
"epoch": 2.71373679154659,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004267817120644012,
|
|
"loss": 4.7334,
|
|
"mean_token_accuracy": 0.23736280649900438,
|
|
"num_tokens": 64777645.0,
|
|
"step": 28250
|
|
},
|
|
{
|
|
"entropy": 5.090947532653809,
|
|
"epoch": 2.714217098943324,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042675640229065167,
|
|
"loss": 4.7662,
|
|
"mean_token_accuracy": 0.2323785498738289,
|
|
"num_tokens": 64788065.0,
|
|
"step": 28255
|
|
},
|
|
{
|
|
"entropy": 5.138057613372803,
|
|
"epoch": 2.7146974063400577,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00042673108899356915,
|
|
"loss": 4.7284,
|
|
"mean_token_accuracy": 0.24340671747922898,
|
|
"num_tokens": 64798938.0,
|
|
"step": 28260
|
|
},
|
|
{
|
|
"entropy": 5.0383988380432125,
|
|
"epoch": 2.7151777137367916,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004267057721737413,
|
|
"loss": 4.7627,
|
|
"mean_token_accuracy": 0.23495194613933562,
|
|
"num_tokens": 64810572.0,
|
|
"step": 28265
|
|
},
|
|
{
|
|
"entropy": 5.016577291488647,
|
|
"epoch": 2.7156580211335255,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004266804518317559,
|
|
"loss": 4.7593,
|
|
"mean_token_accuracy": 0.2415664240717888,
|
|
"num_tokens": 64821877.0,
|
|
"step": 28270
|
|
},
|
|
{
|
|
"entropy": 5.098750972747803,
|
|
"epoch": 2.7161383285302594,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004266551279682008,
|
|
"loss": 4.6987,
|
|
"mean_token_accuracy": 0.2440878689289093,
|
|
"num_tokens": 64832737.0,
|
|
"step": 28275
|
|
},
|
|
{
|
|
"entropy": 5.076100206375122,
|
|
"epoch": 2.7166186359269933,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004266298005836639,
|
|
"loss": 4.7607,
|
|
"mean_token_accuracy": 0.22922112345695494,
|
|
"num_tokens": 64844713.0,
|
|
"step": 28280
|
|
},
|
|
{
|
|
"entropy": 5.078899669647217,
|
|
"epoch": 2.717098943323727,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042660446967873327,
|
|
"loss": 4.7614,
|
|
"mean_token_accuracy": 0.23753941804170609,
|
|
"num_tokens": 64855896.0,
|
|
"step": 28285
|
|
},
|
|
{
|
|
"entropy": 5.141567993164062,
|
|
"epoch": 2.717579250720461,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00042657913525399703,
|
|
"loss": 4.8146,
|
|
"mean_token_accuracy": 0.23122312724590302,
|
|
"num_tokens": 64866529.0,
|
|
"step": 28290
|
|
},
|
|
{
|
|
"entropy": 5.0869077205657955,
|
|
"epoch": 2.718059558117195,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004265537973100435,
|
|
"loss": 4.7199,
|
|
"mean_token_accuracy": 0.2347149908542633,
|
|
"num_tokens": 64876691.0,
|
|
"step": 28295
|
|
},
|
|
{
|
|
"entropy": 5.036520099639892,
|
|
"epoch": 2.718539865513929,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004265284558474607,
|
|
"loss": 4.7108,
|
|
"mean_token_accuracy": 0.23933310508728028,
|
|
"num_tokens": 64887271.0,
|
|
"step": 28300
|
|
},
|
|
{
|
|
"entropy": 5.102957010269165,
|
|
"epoch": 2.7190201729106627,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00042650311086683715,
|
|
"loss": 4.8139,
|
|
"mean_token_accuracy": 0.22686078101396562,
|
|
"num_tokens": 64900698.0,
|
|
"step": 28305
|
|
},
|
|
{
|
|
"entropy": 5.1559515476226805,
|
|
"epoch": 2.7195004803073966,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004264777623687612,
|
|
"loss": 4.7448,
|
|
"mean_token_accuracy": 0.23295564502477645,
|
|
"num_tokens": 64911349.0,
|
|
"step": 28310
|
|
},
|
|
{
|
|
"entropy": 5.051888799667358,
|
|
"epoch": 2.719980787704131,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004264524103538214,
|
|
"loss": 4.7408,
|
|
"mean_token_accuracy": 0.24038063436746598,
|
|
"num_tokens": 64922522.0,
|
|
"step": 28315
|
|
},
|
|
{
|
|
"entropy": 5.099909830093384,
|
|
"epoch": 2.7204610951008643,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004264270548226064,
|
|
"loss": 4.766,
|
|
"mean_token_accuracy": 0.23822131007909775,
|
|
"num_tokens": 64934796.0,
|
|
"step": 28320
|
|
},
|
|
{
|
|
"entropy": 5.10898756980896,
|
|
"epoch": 2.7209414024975986,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004264016957757048,
|
|
"loss": 4.7571,
|
|
"mean_token_accuracy": 0.24004254788160323,
|
|
"num_tokens": 64946717.0,
|
|
"step": 28325
|
|
},
|
|
{
|
|
"entropy": 5.098352909088135,
|
|
"epoch": 2.7214217098943325,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042637633321370545,
|
|
"loss": 4.8395,
|
|
"mean_token_accuracy": 0.2257276654243469,
|
|
"num_tokens": 64958818.0,
|
|
"step": 28330
|
|
},
|
|
{
|
|
"entropy": 5.11536021232605,
|
|
"epoch": 2.7219020172910664,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004263509671371971,
|
|
"loss": 4.6868,
|
|
"mean_token_accuracy": 0.23188695609569548,
|
|
"num_tokens": 64968974.0,
|
|
"step": 28335
|
|
},
|
|
{
|
|
"entropy": 5.114422798156738,
|
|
"epoch": 2.7223823246878003,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00042632559754676865,
|
|
"loss": 4.7642,
|
|
"mean_token_accuracy": 0.23466452211141586,
|
|
"num_tokens": 64980347.0,
|
|
"step": 28340
|
|
},
|
|
{
|
|
"entropy": 5.088268184661866,
|
|
"epoch": 2.722862632084534,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004263002244430092,
|
|
"loss": 4.7339,
|
|
"mean_token_accuracy": 0.23719714879989623,
|
|
"num_tokens": 64992916.0,
|
|
"step": 28345
|
|
},
|
|
{
|
|
"entropy": 5.093293190002441,
|
|
"epoch": 2.723342939481268,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004262748478265078,
|
|
"loss": 4.81,
|
|
"mean_token_accuracy": 0.23484614342451096,
|
|
"num_tokens": 65003382.0,
|
|
"step": 28350
|
|
},
|
|
{
|
|
"entropy": 4.988045167922974,
|
|
"epoch": 2.723823246878002,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004262494676978537,
|
|
"loss": 4.5804,
|
|
"mean_token_accuracy": 0.25099532306194305,
|
|
"num_tokens": 65014064.0,
|
|
"step": 28355
|
|
},
|
|
{
|
|
"entropy": 5.103696966171265,
|
|
"epoch": 2.724303554274736,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00042622408405763607,
|
|
"loss": 4.7906,
|
|
"mean_token_accuracy": 0.22995427697896959,
|
|
"num_tokens": 65025701.0,
|
|
"step": 28360
|
|
},
|
|
{
|
|
"entropy": 5.142344427108765,
|
|
"epoch": 2.7247838616714697,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004261986969064442,
|
|
"loss": 4.7664,
|
|
"mean_token_accuracy": 0.23485698848962783,
|
|
"num_tokens": 65038270.0,
|
|
"step": 28365
|
|
},
|
|
{
|
|
"entropy": 5.065335035324097,
|
|
"epoch": 2.7252641690682036,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042617330624486753,
|
|
"loss": 4.7015,
|
|
"mean_token_accuracy": 0.24524183720350265,
|
|
"num_tokens": 65049453.0,
|
|
"step": 28370
|
|
},
|
|
{
|
|
"entropy": 4.921413230895996,
|
|
"epoch": 2.7257444764649374,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004261479120734956,
|
|
"loss": 4.5774,
|
|
"mean_token_accuracy": 0.2530855819582939,
|
|
"num_tokens": 65061211.0,
|
|
"step": 28375
|
|
},
|
|
{
|
|
"entropy": 5.010696601867676,
|
|
"epoch": 2.7262247838616713,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000426122514392918,
|
|
"loss": 4.649,
|
|
"mean_token_accuracy": 0.24367837458848954,
|
|
"num_tokens": 65073073.0,
|
|
"step": 28380
|
|
},
|
|
{
|
|
"entropy": 5.078413200378418,
|
|
"epoch": 2.726705091258405,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00042609711320372435,
|
|
"loss": 4.7231,
|
|
"mean_token_accuracy": 0.24840225130319596,
|
|
"num_tokens": 65084647.0,
|
|
"step": 28385
|
|
},
|
|
{
|
|
"entropy": 5.070221567153931,
|
|
"epoch": 2.7271853986551395,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004260717085065045,
|
|
"loss": 4.6986,
|
|
"mean_token_accuracy": 0.24333804994821548,
|
|
"num_tokens": 65095405.0,
|
|
"step": 28390
|
|
},
|
|
{
|
|
"entropy": 5.0989728450775145,
|
|
"epoch": 2.727665706051873,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00042604630030184797,
|
|
"loss": 4.7636,
|
|
"mean_token_accuracy": 0.23326624184846878,
|
|
"num_tokens": 65106981.0,
|
|
"step": 28395
|
|
},
|
|
{
|
|
"entropy": 5.173717212677002,
|
|
"epoch": 2.7281460134486073,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000426020888590345,
|
|
"loss": 4.8478,
|
|
"mean_token_accuracy": 0.22634569704532623,
|
|
"num_tokens": 65118957.0,
|
|
"step": 28400
|
|
},
|
|
{
|
|
"entropy": 5.059170246124268,
|
|
"epoch": 2.728626320845341,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00042599547337258536,
|
|
"loss": 4.7339,
|
|
"mean_token_accuracy": 0.2388172686100006,
|
|
"num_tokens": 65130897.0,
|
|
"step": 28405
|
|
},
|
|
{
|
|
"entropy": 4.973813772201538,
|
|
"epoch": 2.729106628242075,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00042597005464915924,
|
|
"loss": 4.5451,
|
|
"mean_token_accuracy": 0.2504707619547844,
|
|
"num_tokens": 65142828.0,
|
|
"step": 28410
|
|
},
|
|
{
|
|
"entropy": 4.978707313537598,
|
|
"epoch": 2.729586935638809,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00042594463242065674,
|
|
"loss": 4.6767,
|
|
"mean_token_accuracy": 0.24309882372617722,
|
|
"num_tokens": 65154787.0,
|
|
"step": 28415
|
|
},
|
|
{
|
|
"entropy": 5.0573992252349855,
|
|
"epoch": 2.730067243035543,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004259192066876681,
|
|
"loss": 4.7096,
|
|
"mean_token_accuracy": 0.2386387825012207,
|
|
"num_tokens": 65165530.0,
|
|
"step": 28420
|
|
},
|
|
{
|
|
"entropy": 5.140051460266113,
|
|
"epoch": 2.7305475504322767,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042589377745078354,
|
|
"loss": 4.829,
|
|
"mean_token_accuracy": 0.2319121852517128,
|
|
"num_tokens": 65176809.0,
|
|
"step": 28425
|
|
},
|
|
{
|
|
"entropy": 5.166042423248291,
|
|
"epoch": 2.7310278578290106,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00042586834471059366,
|
|
"loss": 4.8491,
|
|
"mean_token_accuracy": 0.2317554920911789,
|
|
"num_tokens": 65189251.0,
|
|
"step": 28430
|
|
},
|
|
{
|
|
"entropy": 5.004992914199829,
|
|
"epoch": 2.7315081652257445,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00042584290846768867,
|
|
"loss": 4.6268,
|
|
"mean_token_accuracy": 0.2488670364022255,
|
|
"num_tokens": 65200918.0,
|
|
"step": 28435
|
|
},
|
|
{
|
|
"entropy": 5.097946310043335,
|
|
"epoch": 2.7319884726224783,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004258174687226593,
|
|
"loss": 4.7161,
|
|
"mean_token_accuracy": 0.2372460260987282,
|
|
"num_tokens": 65213283.0,
|
|
"step": 28440
|
|
},
|
|
{
|
|
"entropy": 5.092672300338745,
|
|
"epoch": 2.7324687800192122,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004257920254760962,
|
|
"loss": 4.7477,
|
|
"mean_token_accuracy": 0.24076730161905288,
|
|
"num_tokens": 65223680.0,
|
|
"step": 28445
|
|
},
|
|
{
|
|
"entropy": 5.094247150421142,
|
|
"epoch": 2.732949087415946,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004257665787285899,
|
|
"loss": 4.7471,
|
|
"mean_token_accuracy": 0.23675734102725982,
|
|
"num_tokens": 65234570.0,
|
|
"step": 28450
|
|
},
|
|
{
|
|
"entropy": 5.12308988571167,
|
|
"epoch": 2.73342939481268,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042574112848073147,
|
|
"loss": 4.7203,
|
|
"mean_token_accuracy": 0.2399858608841896,
|
|
"num_tokens": 65246616.0,
|
|
"step": 28455
|
|
},
|
|
{
|
|
"entropy": 5.042817211151123,
|
|
"epoch": 2.733909702209414,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042571567473311157,
|
|
"loss": 4.7393,
|
|
"mean_token_accuracy": 0.23980943709611893,
|
|
"num_tokens": 65257853.0,
|
|
"step": 28460
|
|
},
|
|
{
|
|
"entropy": 5.070425033569336,
|
|
"epoch": 2.734390009606148,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004256902174863213,
|
|
"loss": 4.7108,
|
|
"mean_token_accuracy": 0.24079181402921676,
|
|
"num_tokens": 65268843.0,
|
|
"step": 28465
|
|
},
|
|
{
|
|
"entropy": 5.209898042678833,
|
|
"epoch": 2.7348703170028816,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042566475674095155,
|
|
"loss": 4.857,
|
|
"mean_token_accuracy": 0.225233294069767,
|
|
"num_tokens": 65279824.0,
|
|
"step": 28470
|
|
},
|
|
{
|
|
"entropy": 5.0555259704589846,
|
|
"epoch": 2.735350624399616,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004256392924975936,
|
|
"loss": 4.7751,
|
|
"mean_token_accuracy": 0.23760847896337509,
|
|
"num_tokens": 65291318.0,
|
|
"step": 28475
|
|
},
|
|
{
|
|
"entropy": 5.009477043151856,
|
|
"epoch": 2.7358309317963494,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00042561382475683854,
|
|
"loss": 4.7233,
|
|
"mean_token_accuracy": 0.23941340893507004,
|
|
"num_tokens": 65302997.0,
|
|
"step": 28480
|
|
},
|
|
{
|
|
"entropy": 5.039452934265137,
|
|
"epoch": 2.7363112391930837,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004255883535192777,
|
|
"loss": 4.6862,
|
|
"mean_token_accuracy": 0.2484783872961998,
|
|
"num_tokens": 65314336.0,
|
|
"step": 28485
|
|
},
|
|
{
|
|
"entropy": 5.114763593673706,
|
|
"epoch": 2.7367915465898176,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004255628787855025,
|
|
"loss": 4.7705,
|
|
"mean_token_accuracy": 0.23738002330064772,
|
|
"num_tokens": 65325138.0,
|
|
"step": 28490
|
|
},
|
|
{
|
|
"entropy": 5.137652969360351,
|
|
"epoch": 2.7372718539865515,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004255374005561043,
|
|
"loss": 4.7312,
|
|
"mean_token_accuracy": 0.23959817737340927,
|
|
"num_tokens": 65337067.0,
|
|
"step": 28495
|
|
},
|
|
{
|
|
"entropy": 5.104769468307495,
|
|
"epoch": 2.7377521613832854,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00042551191883167464,
|
|
"loss": 4.7638,
|
|
"mean_token_accuracy": 0.23433667719364165,
|
|
"num_tokens": 65349598.0,
|
|
"step": 28500
|
|
},
|
|
{
|
|
"entropy": 5.133802175521851,
|
|
"epoch": 2.7382324687800192,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004254864336128052,
|
|
"loss": 4.8134,
|
|
"mean_token_accuracy": 0.2284764528274536,
|
|
"num_tokens": 65361052.0,
|
|
"step": 28505
|
|
},
|
|
{
|
|
"entropy": 5.139010906219482,
|
|
"epoch": 2.738712776176753,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00042546094490008765,
|
|
"loss": 4.8087,
|
|
"mean_token_accuracy": 0.23835351914167405,
|
|
"num_tokens": 65373246.0,
|
|
"step": 28510
|
|
},
|
|
{
|
|
"entropy": 5.076985883712768,
|
|
"epoch": 2.739193083573487,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004254354526941136,
|
|
"loss": 4.6916,
|
|
"mean_token_accuracy": 0.24306633770465852,
|
|
"num_tokens": 65384349.0,
|
|
"step": 28515
|
|
},
|
|
{
|
|
"entropy": 4.981937980651855,
|
|
"epoch": 2.739673390970221,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004254099569954751,
|
|
"loss": 4.6957,
|
|
"mean_token_accuracy": 0.23665003925561906,
|
|
"num_tokens": 65396241.0,
|
|
"step": 28520
|
|
},
|
|
{
|
|
"entropy": 5.017926216125488,
|
|
"epoch": 2.7401536983669548,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004253844578047641,
|
|
"loss": 4.77,
|
|
"mean_token_accuracy": 0.23209561556577682,
|
|
"num_tokens": 65408583.0,
|
|
"step": 28525
|
|
},
|
|
{
|
|
"entropy": 5.111822986602784,
|
|
"epoch": 2.7406340057636887,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004253589551225725,
|
|
"loss": 4.6833,
|
|
"mean_token_accuracy": 0.2423287332057953,
|
|
"num_tokens": 65419845.0,
|
|
"step": 28530
|
|
},
|
|
{
|
|
"entropy": 5.136012268066406,
|
|
"epoch": 2.7411143131604225,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042533344894949245,
|
|
"loss": 4.7398,
|
|
"mean_token_accuracy": 0.230571748316288,
|
|
"num_tokens": 65433790.0,
|
|
"step": 28535
|
|
},
|
|
{
|
|
"entropy": 5.122740936279297,
|
|
"epoch": 2.7415946205571564,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00042530793928611605,
|
|
"loss": 4.801,
|
|
"mean_token_accuracy": 0.23554478138685225,
|
|
"num_tokens": 65445958.0,
|
|
"step": 28540
|
|
},
|
|
{
|
|
"entropy": 5.0635027408599855,
|
|
"epoch": 2.7420749279538903,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004252824261330357,
|
|
"loss": 4.7747,
|
|
"mean_token_accuracy": 0.23710028380155562,
|
|
"num_tokens": 65457973.0,
|
|
"step": 28545
|
|
},
|
|
{
|
|
"entropy": 5.090057420730591,
|
|
"epoch": 2.7425552353506246,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042525690949084364,
|
|
"loss": 4.7557,
|
|
"mean_token_accuracy": 0.2368753135204315,
|
|
"num_tokens": 65470245.0,
|
|
"step": 28550
|
|
},
|
|
{
|
|
"entropy": 5.017141246795655,
|
|
"epoch": 2.743035542747358,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042523138936013233,
|
|
"loss": 4.7262,
|
|
"mean_token_accuracy": 0.24332302957773208,
|
|
"num_tokens": 65482541.0,
|
|
"step": 28555
|
|
},
|
|
{
|
|
"entropy": 5.177900648117065,
|
|
"epoch": 2.7435158501440924,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042520586574149423,
|
|
"loss": 4.8372,
|
|
"mean_token_accuracy": 0.22966494411230087,
|
|
"num_tokens": 65494499.0,
|
|
"step": 28560
|
|
},
|
|
{
|
|
"entropy": 5.228618240356445,
|
|
"epoch": 2.7439961575408263,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00042518033863552185,
|
|
"loss": 4.8589,
|
|
"mean_token_accuracy": 0.22425288259983062,
|
|
"num_tokens": 65506377.0,
|
|
"step": 28565
|
|
},
|
|
{
|
|
"entropy": 5.14985466003418,
|
|
"epoch": 2.74447646493756,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004251548080428081,
|
|
"loss": 4.7947,
|
|
"mean_token_accuracy": 0.23412707149982454,
|
|
"num_tokens": 65517476.0,
|
|
"step": 28570
|
|
},
|
|
{
|
|
"entropy": 5.153133773803711,
|
|
"epoch": 2.744956772334294,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004251292739639455,
|
|
"loss": 4.7353,
|
|
"mean_token_accuracy": 0.23334655314683914,
|
|
"num_tokens": 65529829.0,
|
|
"step": 28575
|
|
},
|
|
{
|
|
"entropy": 4.9859757900238035,
|
|
"epoch": 2.745437079731028,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042510373639952694,
|
|
"loss": 4.6211,
|
|
"mean_token_accuracy": 0.24574377238750458,
|
|
"num_tokens": 65541030.0,
|
|
"step": 28580
|
|
},
|
|
{
|
|
"entropy": 5.0365828514099125,
|
|
"epoch": 2.745917387127762,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00042507819535014547,
|
|
"loss": 4.7371,
|
|
"mean_token_accuracy": 0.23719628006219864,
|
|
"num_tokens": 65551727.0,
|
|
"step": 28585
|
|
},
|
|
{
|
|
"entropy": 5.021198701858521,
|
|
"epoch": 2.7463976945244957,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00042505265081639376,
|
|
"loss": 4.7006,
|
|
"mean_token_accuracy": 0.24776863306760788,
|
|
"num_tokens": 65563085.0,
|
|
"step": 28590
|
|
},
|
|
{
|
|
"entropy": 5.107274675369263,
|
|
"epoch": 2.7468780019212296,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004250271027988652,
|
|
"loss": 4.7229,
|
|
"mean_token_accuracy": 0.23755284249782563,
|
|
"num_tokens": 65574909.0,
|
|
"step": 28595
|
|
},
|
|
{
|
|
"entropy": 5.060042953491211,
|
|
"epoch": 2.7473583093179634,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042500155129815274,
|
|
"loss": 4.7173,
|
|
"mean_token_accuracy": 0.24350056499242784,
|
|
"num_tokens": 65585515.0,
|
|
"step": 28600
|
|
},
|
|
{
|
|
"entropy": 5.024354028701782,
|
|
"epoch": 2.7478386167146973,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042497599631484965,
|
|
"loss": 4.6989,
|
|
"mean_token_accuracy": 0.23446216583251953,
|
|
"num_tokens": 65598160.0,
|
|
"step": 28605
|
|
},
|
|
{
|
|
"entropy": 5.121377420425415,
|
|
"epoch": 2.748318924111431,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00042495043784954926,
|
|
"loss": 4.7677,
|
|
"mean_token_accuracy": 0.23277996033430098,
|
|
"num_tokens": 65609886.0,
|
|
"step": 28610
|
|
},
|
|
{
|
|
"entropy": 5.088078260421753,
|
|
"epoch": 2.748799231508165,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000424924875902845,
|
|
"loss": 4.7246,
|
|
"mean_token_accuracy": 0.23480461686849594,
|
|
"num_tokens": 65620256.0,
|
|
"step": 28615
|
|
},
|
|
{
|
|
"entropy": 5.12593822479248,
|
|
"epoch": 2.749279538904899,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004248993104753303,
|
|
"loss": 4.7698,
|
|
"mean_token_accuracy": 0.23956041187047958,
|
|
"num_tokens": 65631108.0,
|
|
"step": 28620
|
|
},
|
|
{
|
|
"entropy": 5.117194652557373,
|
|
"epoch": 2.7497598463016333,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004248737415675987,
|
|
"loss": 4.7618,
|
|
"mean_token_accuracy": 0.23265804052352906,
|
|
"num_tokens": 65642423.0,
|
|
"step": 28625
|
|
},
|
|
{
|
|
"entropy": 5.039339065551758,
|
|
"epoch": 2.7502401536983667,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004248481691802439,
|
|
"loss": 4.7433,
|
|
"mean_token_accuracy": 0.23624941408634187,
|
|
"num_tokens": 65654105.0,
|
|
"step": 28630
|
|
},
|
|
{
|
|
"entropy": 5.145820808410645,
|
|
"epoch": 2.750720461095101,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004248225933138595,
|
|
"loss": 4.8094,
|
|
"mean_token_accuracy": 0.23825272619724275,
|
|
"num_tokens": 65664738.0,
|
|
"step": 28635
|
|
},
|
|
{
|
|
"entropy": 5.058314323425293,
|
|
"epoch": 2.751200768491835,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042479701396903945,
|
|
"loss": 4.6796,
|
|
"mean_token_accuracy": 0.2405511423945427,
|
|
"num_tokens": 65676815.0,
|
|
"step": 28640
|
|
},
|
|
{
|
|
"entropy": 5.125538015365601,
|
|
"epoch": 2.751681075888569,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004247714311463775,
|
|
"loss": 4.772,
|
|
"mean_token_accuracy": 0.23657451570034027,
|
|
"num_tokens": 65687234.0,
|
|
"step": 28645
|
|
},
|
|
{
|
|
"entropy": 5.122481727600098,
|
|
"epoch": 2.7521613832853027,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00042474584484646766,
|
|
"loss": 4.8587,
|
|
"mean_token_accuracy": 0.22422019988298417,
|
|
"num_tokens": 65700356.0,
|
|
"step": 28650
|
|
},
|
|
{
|
|
"entropy": 5.0451537609100345,
|
|
"epoch": 2.7526416906820366,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004247202550699039,
|
|
"loss": 4.6113,
|
|
"mean_token_accuracy": 0.24570255875587463,
|
|
"num_tokens": 65711320.0,
|
|
"step": 28655
|
|
},
|
|
{
|
|
"entropy": 5.073936176300049,
|
|
"epoch": 2.7531219980787704,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004246946618172805,
|
|
"loss": 4.7644,
|
|
"mean_token_accuracy": 0.23449680656194688,
|
|
"num_tokens": 65722461.0,
|
|
"step": 28660
|
|
},
|
|
{
|
|
"entropy": 4.981145191192627,
|
|
"epoch": 2.7536023054755043,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004246690650891915,
|
|
"loss": 4.6224,
|
|
"mean_token_accuracy": 0.24322707056999207,
|
|
"num_tokens": 65732311.0,
|
|
"step": 28665
|
|
},
|
|
{
|
|
"entropy": 5.082883024215699,
|
|
"epoch": 2.754082612872238,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004246434648862312,
|
|
"loss": 4.7577,
|
|
"mean_token_accuracy": 0.242487533390522,
|
|
"num_tokens": 65743830.0,
|
|
"step": 28670
|
|
},
|
|
{
|
|
"entropy": 5.124186706542969,
|
|
"epoch": 2.754562920268972,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004246178612089941,
|
|
"loss": 4.8113,
|
|
"mean_token_accuracy": 0.2254979908466339,
|
|
"num_tokens": 65754851.0,
|
|
"step": 28675
|
|
},
|
|
{
|
|
"entropy": 5.125278568267822,
|
|
"epoch": 2.755043227665706,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004245922540580744,
|
|
"loss": 4.787,
|
|
"mean_token_accuracy": 0.2342199668288231,
|
|
"num_tokens": 65765228.0,
|
|
"step": 28680
|
|
},
|
|
{
|
|
"entropy": 5.126844644546509,
|
|
"epoch": 2.75552353506244,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004245666434340668,
|
|
"loss": 4.8303,
|
|
"mean_token_accuracy": 0.22773058861494064,
|
|
"num_tokens": 65775334.0,
|
|
"step": 28685
|
|
},
|
|
{
|
|
"entropy": 4.998766899108887,
|
|
"epoch": 2.7560038424591737,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004245410293375659,
|
|
"loss": 4.6538,
|
|
"mean_token_accuracy": 0.2525763615965843,
|
|
"num_tokens": 65786256.0,
|
|
"step": 28690
|
|
},
|
|
{
|
|
"entropy": 5.094659471511841,
|
|
"epoch": 2.7564841498559076,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004245154117691664,
|
|
"loss": 4.7088,
|
|
"mean_token_accuracy": 0.24570492506027222,
|
|
"num_tokens": 65798831.0,
|
|
"step": 28695
|
|
},
|
|
{
|
|
"entropy": 5.041133260726928,
|
|
"epoch": 2.756964457252642,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004244897907294628,
|
|
"loss": 4.7536,
|
|
"mean_token_accuracy": 0.2359408512711525,
|
|
"num_tokens": 65810256.0,
|
|
"step": 28700
|
|
},
|
|
{
|
|
"entropy": 5.226623058319092,
|
|
"epoch": 2.7574447646493754,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004244641662190504,
|
|
"loss": 4.8904,
|
|
"mean_token_accuracy": 0.2273922637104988,
|
|
"num_tokens": 65821752.0,
|
|
"step": 28705
|
|
},
|
|
{
|
|
"entropy": 5.13972659111023,
|
|
"epoch": 2.7579250720461097,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00042443853823852376,
|
|
"loss": 4.793,
|
|
"mean_token_accuracy": 0.22990813702344895,
|
|
"num_tokens": 65833056.0,
|
|
"step": 28710
|
|
},
|
|
{
|
|
"entropy": 5.105608987808227,
|
|
"epoch": 2.7584053794428436,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000424412906788478,
|
|
"loss": 4.6967,
|
|
"mean_token_accuracy": 0.23675004094839097,
|
|
"num_tokens": 65845081.0,
|
|
"step": 28715
|
|
},
|
|
{
|
|
"entropy": 5.074363422393799,
|
|
"epoch": 2.7588856868395775,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004243872718695082,
|
|
"loss": 4.7417,
|
|
"mean_token_accuracy": 0.23619790077209474,
|
|
"num_tokens": 65856024.0,
|
|
"step": 28720
|
|
},
|
|
{
|
|
"entropy": 5.0468220710754395,
|
|
"epoch": 2.7593659942363113,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00042436163348220956,
|
|
"loss": 4.6461,
|
|
"mean_token_accuracy": 0.25140986293554307,
|
|
"num_tokens": 65868282.0,
|
|
"step": 28725
|
|
},
|
|
{
|
|
"entropy": 4.991868495941162,
|
|
"epoch": 2.7598463016330452,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004243359916271773,
|
|
"loss": 4.6913,
|
|
"mean_token_accuracy": 0.24277271181344987,
|
|
"num_tokens": 65879835.0,
|
|
"step": 28730
|
|
},
|
|
{
|
|
"entropy": 5.024890518188476,
|
|
"epoch": 2.760326609029779,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004243103463050067,
|
|
"loss": 4.6609,
|
|
"mean_token_accuracy": 0.2428322196006775,
|
|
"num_tokens": 65890916.0,
|
|
"step": 28735
|
|
},
|
|
{
|
|
"entropy": 5.154018449783325,
|
|
"epoch": 2.760806916426513,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004242846975162933,
|
|
"loss": 4.7635,
|
|
"mean_token_accuracy": 0.23299016058444977,
|
|
"num_tokens": 65901716.0,
|
|
"step": 28740
|
|
},
|
|
{
|
|
"entropy": 5.115095472335815,
|
|
"epoch": 2.761287223823247,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042425904526163246,
|
|
"loss": 4.8004,
|
|
"mean_token_accuracy": 0.23285145312547684,
|
|
"num_tokens": 65914366.0,
|
|
"step": 28745
|
|
},
|
|
{
|
|
"entropy": 5.145034217834473,
|
|
"epoch": 2.7617675312199808,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004242333895416198,
|
|
"loss": 4.9146,
|
|
"mean_token_accuracy": 0.2228606328368187,
|
|
"num_tokens": 65926396.0,
|
|
"step": 28750
|
|
},
|
|
{
|
|
"entropy": 5.1206944465637205,
|
|
"epoch": 2.7622478386167146,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000424207730356851,
|
|
"loss": 4.7526,
|
|
"mean_token_accuracy": 0.23485395759344102,
|
|
"num_tokens": 65939200.0,
|
|
"step": 28755
|
|
},
|
|
{
|
|
"entropy": 5.132875204086304,
|
|
"epoch": 2.7627281460134485,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004241820677079218,
|
|
"loss": 4.7511,
|
|
"mean_token_accuracy": 0.23293102085590361,
|
|
"num_tokens": 65948978.0,
|
|
"step": 28760
|
|
},
|
|
{
|
|
"entropy": 5.078152847290039,
|
|
"epoch": 2.7632084534101824,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042415640159542783,
|
|
"loss": 4.6865,
|
|
"mean_token_accuracy": 0.2410816565155983,
|
|
"num_tokens": 65959661.0,
|
|
"step": 28765
|
|
},
|
|
{
|
|
"entropy": 4.9896392822265625,
|
|
"epoch": 2.7636887608069163,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004241307320199652,
|
|
"loss": 4.6194,
|
|
"mean_token_accuracy": 0.2544292494654655,
|
|
"num_tokens": 65970208.0,
|
|
"step": 28770
|
|
},
|
|
{
|
|
"entropy": 5.10506763458252,
|
|
"epoch": 2.7641690682036506,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004241050589821298,
|
|
"loss": 4.7439,
|
|
"mean_token_accuracy": 0.23854973167181015,
|
|
"num_tokens": 65982606.0,
|
|
"step": 28775
|
|
},
|
|
{
|
|
"entropy": 5.0196840286254885,
|
|
"epoch": 2.764649375600384,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004240793824825177,
|
|
"loss": 4.6577,
|
|
"mean_token_accuracy": 0.2405321404337883,
|
|
"num_tokens": 65993984.0,
|
|
"step": 28780
|
|
},
|
|
{
|
|
"entropy": 5.08947868347168,
|
|
"epoch": 2.7651296829971184,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042405370252172496,
|
|
"loss": 4.7187,
|
|
"mean_token_accuracy": 0.23867221027612687,
|
|
"num_tokens": 66004740.0,
|
|
"step": 28785
|
|
},
|
|
{
|
|
"entropy": 5.052742671966553,
|
|
"epoch": 2.765609990393852,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004240280191003479,
|
|
"loss": 4.7952,
|
|
"mean_token_accuracy": 0.23609665632247925,
|
|
"num_tokens": 66019265.0,
|
|
"step": 28790
|
|
},
|
|
{
|
|
"entropy": 5.046192693710327,
|
|
"epoch": 2.766090297790586,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004240023322189828,
|
|
"loss": 4.705,
|
|
"mean_token_accuracy": 0.23975181132555007,
|
|
"num_tokens": 66030434.0,
|
|
"step": 28795
|
|
},
|
|
{
|
|
"entropy": 5.129871892929077,
|
|
"epoch": 2.76657060518732,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004239766418782258,
|
|
"loss": 4.7147,
|
|
"mean_token_accuracy": 0.2322618395090103,
|
|
"num_tokens": 66041100.0,
|
|
"step": 28800
|
|
},
|
|
{
|
|
"entropy": 5.0403828620910645,
|
|
"epoch": 2.767050912584054,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004239509480786737,
|
|
"loss": 4.7009,
|
|
"mean_token_accuracy": 0.2370862916111946,
|
|
"num_tokens": 66053046.0,
|
|
"step": 28805
|
|
},
|
|
{
|
|
"entropy": 5.147002124786377,
|
|
"epoch": 2.7675312199807878,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042392525082092286,
|
|
"loss": 4.8408,
|
|
"mean_token_accuracy": 0.22971219569444656,
|
|
"num_tokens": 66063508.0,
|
|
"step": 28810
|
|
},
|
|
{
|
|
"entropy": 5.126589155197143,
|
|
"epoch": 2.7680115273775217,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004238995501055699,
|
|
"loss": 4.845,
|
|
"mean_token_accuracy": 0.22856855392456055,
|
|
"num_tokens": 66076135.0,
|
|
"step": 28815
|
|
},
|
|
{
|
|
"entropy": 5.110952854156494,
|
|
"epoch": 2.7684918347742555,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004238738459332115,
|
|
"loss": 4.79,
|
|
"mean_token_accuracy": 0.23187243789434434,
|
|
"num_tokens": 66087565.0,
|
|
"step": 28820
|
|
},
|
|
{
|
|
"entropy": 5.137646675109863,
|
|
"epoch": 2.7689721421709894,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004238481383044445,
|
|
"loss": 4.7791,
|
|
"mean_token_accuracy": 0.2414294421672821,
|
|
"num_tokens": 66099430.0,
|
|
"step": 28825
|
|
},
|
|
{
|
|
"entropy": 5.0425450801849365,
|
|
"epoch": 2.7694524495677233,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00042382242721986573,
|
|
"loss": 4.6932,
|
|
"mean_token_accuracy": 0.23897880762815477,
|
|
"num_tokens": 66111060.0,
|
|
"step": 28830
|
|
},
|
|
{
|
|
"entropy": 4.9416584968566895,
|
|
"epoch": 2.769932756964457,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00042379671268007207,
|
|
"loss": 4.643,
|
|
"mean_token_accuracy": 0.2510156065225601,
|
|
"num_tokens": 66123367.0,
|
|
"step": 28835
|
|
},
|
|
{
|
|
"entropy": 5.090784740447998,
|
|
"epoch": 2.770413064361191,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004237709946856607,
|
|
"loss": 4.7071,
|
|
"mean_token_accuracy": 0.24626193791627884,
|
|
"num_tokens": 66134198.0,
|
|
"step": 28840
|
|
},
|
|
{
|
|
"entropy": 5.100472450256348,
|
|
"epoch": 2.770893371757925,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00042374527323722836,
|
|
"loss": 4.7161,
|
|
"mean_token_accuracy": 0.23950215280056,
|
|
"num_tokens": 66145991.0,
|
|
"step": 28845
|
|
},
|
|
{
|
|
"entropy": 5.112044715881348,
|
|
"epoch": 2.771373679154659,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042371954833537263,
|
|
"loss": 4.7642,
|
|
"mean_token_accuracy": 0.2365383803844452,
|
|
"num_tokens": 66157447.0,
|
|
"step": 28850
|
|
},
|
|
{
|
|
"entropy": 5.01291127204895,
|
|
"epoch": 2.7718539865513927,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00042369381998069055,
|
|
"loss": 4.7748,
|
|
"mean_token_accuracy": 0.24323766380548478,
|
|
"num_tokens": 66169259.0,
|
|
"step": 28855
|
|
},
|
|
{
|
|
"entropy": 5.067882633209228,
|
|
"epoch": 2.772334293948127,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004236680881737795,
|
|
"loss": 4.7304,
|
|
"mean_token_accuracy": 0.23722611963748932,
|
|
"num_tokens": 66181221.0,
|
|
"step": 28860
|
|
},
|
|
{
|
|
"entropy": 5.1237670421600345,
|
|
"epoch": 2.7728146013448605,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004236423529152369,
|
|
"loss": 4.7753,
|
|
"mean_token_accuracy": 0.22963873594999312,
|
|
"num_tokens": 66193513.0,
|
|
"step": 28865
|
|
},
|
|
{
|
|
"entropy": 5.064371585845947,
|
|
"epoch": 2.773294908741595,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004236166142056602,
|
|
"loss": 4.6319,
|
|
"mean_token_accuracy": 0.246799498796463,
|
|
"num_tokens": 66204951.0,
|
|
"step": 28870
|
|
},
|
|
{
|
|
"entropy": 4.997515344619751,
|
|
"epoch": 2.7737752161383287,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004235908720456471,
|
|
"loss": 4.7317,
|
|
"mean_token_accuracy": 0.2407101422548294,
|
|
"num_tokens": 66215845.0,
|
|
"step": 28875
|
|
},
|
|
{
|
|
"entropy": 5.101406002044678,
|
|
"epoch": 2.7742555235350626,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004235651264357951,
|
|
"loss": 4.8379,
|
|
"mean_token_accuracy": 0.23241375535726547,
|
|
"num_tokens": 66226510.0,
|
|
"step": 28880
|
|
},
|
|
{
|
|
"entropy": 5.141011476516724,
|
|
"epoch": 2.7747358309317964,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042353937737670206,
|
|
"loss": 4.7585,
|
|
"mean_token_accuracy": 0.2364454001188278,
|
|
"num_tokens": 66238134.0,
|
|
"step": 28885
|
|
},
|
|
{
|
|
"entropy": 5.10180025100708,
|
|
"epoch": 2.7752161383285303,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004235136248689658,
|
|
"loss": 4.7388,
|
|
"mean_token_accuracy": 0.23604300916194915,
|
|
"num_tokens": 66250526.0,
|
|
"step": 28890
|
|
},
|
|
{
|
|
"entropy": 5.133022356033325,
|
|
"epoch": 2.775696445725264,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004234878689131841,
|
|
"loss": 4.7886,
|
|
"mean_token_accuracy": 0.235670568048954,
|
|
"num_tokens": 66262144.0,
|
|
"step": 28895
|
|
},
|
|
{
|
|
"entropy": 5.164633417129517,
|
|
"epoch": 2.776176753121998,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000423462109509955,
|
|
"loss": 4.8015,
|
|
"mean_token_accuracy": 0.22956809103488923,
|
|
"num_tokens": 66273262.0,
|
|
"step": 28900
|
|
},
|
|
{
|
|
"entropy": 5.0345587730407715,
|
|
"epoch": 2.776657060518732,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004234363466598765,
|
|
"loss": 4.6986,
|
|
"mean_token_accuracy": 0.2430693671107292,
|
|
"num_tokens": 66284127.0,
|
|
"step": 28905
|
|
},
|
|
{
|
|
"entropy": 5.019112014770508,
|
|
"epoch": 2.777137367915466,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00042341058036354687,
|
|
"loss": 4.7241,
|
|
"mean_token_accuracy": 0.23520620614290239,
|
|
"num_tokens": 66295092.0,
|
|
"step": 28910
|
|
},
|
|
{
|
|
"entropy": 4.953162574768067,
|
|
"epoch": 2.7776176753121997,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042338481062156424,
|
|
"loss": 4.5774,
|
|
"mean_token_accuracy": 0.24913661181926727,
|
|
"num_tokens": 66306050.0,
|
|
"step": 28915
|
|
},
|
|
{
|
|
"entropy": 4.981451845169067,
|
|
"epoch": 2.7780979827089336,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042335903743452694,
|
|
"loss": 4.6705,
|
|
"mean_token_accuracy": 0.2409507527947426,
|
|
"num_tokens": 66317186.0,
|
|
"step": 28920
|
|
},
|
|
{
|
|
"entropy": 5.073168516159058,
|
|
"epoch": 2.7785782901056675,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004233332608030333,
|
|
"loss": 4.7515,
|
|
"mean_token_accuracy": 0.23477090448141097,
|
|
"num_tokens": 66328486.0,
|
|
"step": 28925
|
|
},
|
|
{
|
|
"entropy": 5.048687314987182,
|
|
"epoch": 2.7790585975024014,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00042330748072768183,
|
|
"loss": 4.6605,
|
|
"mean_token_accuracy": 0.24403852671384813,
|
|
"num_tokens": 66340649.0,
|
|
"step": 28930
|
|
},
|
|
{
|
|
"entropy": 5.113717126846313,
|
|
"epoch": 2.7795389048991357,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.000423281697209071,
|
|
"loss": 4.7913,
|
|
"mean_token_accuracy": 0.22693513035774232,
|
|
"num_tokens": 66353555.0,
|
|
"step": 28935
|
|
},
|
|
{
|
|
"entropy": 5.196904087066651,
|
|
"epoch": 2.780019212295869,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004232559102477995,
|
|
"loss": 4.8463,
|
|
"mean_token_accuracy": 0.22786442339420318,
|
|
"num_tokens": 66365445.0,
|
|
"step": 28940
|
|
},
|
|
{
|
|
"entropy": 5.024850273132325,
|
|
"epoch": 2.7804995196926034,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000423230119844466,
|
|
"loss": 4.6531,
|
|
"mean_token_accuracy": 0.24638040065765382,
|
|
"num_tokens": 66377203.0,
|
|
"step": 28945
|
|
},
|
|
{
|
|
"entropy": 5.088182783126831,
|
|
"epoch": 2.7809798270893373,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004232043259996692,
|
|
"loss": 4.7946,
|
|
"mean_token_accuracy": 0.2338147297501564,
|
|
"num_tokens": 66388637.0,
|
|
"step": 28950
|
|
},
|
|
{
|
|
"entropy": 5.0920398235321045,
|
|
"epoch": 2.781460134486071,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004231785287140081,
|
|
"loss": 4.7654,
|
|
"mean_token_accuracy": 0.2338176667690277,
|
|
"num_tokens": 66400470.0,
|
|
"step": 28955
|
|
},
|
|
{
|
|
"entropy": 5.170683860778809,
|
|
"epoch": 2.781940441882805,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004231527279880816,
|
|
"loss": 4.788,
|
|
"mean_token_accuracy": 0.2422279790043831,
|
|
"num_tokens": 66411079.0,
|
|
"step": 28960
|
|
},
|
|
{
|
|
"entropy": 5.074834871292114,
|
|
"epoch": 2.782420749279539,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004231269238224885,
|
|
"loss": 4.7482,
|
|
"mean_token_accuracy": 0.2362466499209404,
|
|
"num_tokens": 66423706.0,
|
|
"step": 28965
|
|
},
|
|
{
|
|
"entropy": 4.980147123336792,
|
|
"epoch": 2.782901056676273,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004231011162178282,
|
|
"loss": 4.6911,
|
|
"mean_token_accuracy": 0.2411831110715866,
|
|
"num_tokens": 66434994.0,
|
|
"step": 28970
|
|
},
|
|
{
|
|
"entropy": 5.05233211517334,
|
|
"epoch": 2.7833813640730067,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004230753051746998,
|
|
"loss": 4.6128,
|
|
"mean_token_accuracy": 0.24962817281484603,
|
|
"num_tokens": 66446474.0,
|
|
"step": 28975
|
|
},
|
|
{
|
|
"entropy": 5.07179479598999,
|
|
"epoch": 2.7838616714697406,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042304949069370246,
|
|
"loss": 4.7032,
|
|
"mean_token_accuracy": 0.2386288583278656,
|
|
"num_tokens": 66457633.0,
|
|
"step": 28980
|
|
},
|
|
{
|
|
"entropy": 5.0545814514160154,
|
|
"epoch": 2.7843419788664745,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00042302367277543553,
|
|
"loss": 4.7138,
|
|
"mean_token_accuracy": 0.2398442029953003,
|
|
"num_tokens": 66468160.0,
|
|
"step": 28985
|
|
},
|
|
{
|
|
"entropy": 5.078740501403809,
|
|
"epoch": 2.7848222862632084,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00042299785142049855,
|
|
"loss": 4.7339,
|
|
"mean_token_accuracy": 0.23528432250022888,
|
|
"num_tokens": 66478609.0,
|
|
"step": 28990
|
|
},
|
|
{
|
|
"entropy": 5.0767899513244625,
|
|
"epoch": 2.7853025936599423,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004229720266294908,
|
|
"loss": 4.7871,
|
|
"mean_token_accuracy": 0.23646434545516967,
|
|
"num_tokens": 66489992.0,
|
|
"step": 28995
|
|
},
|
|
{
|
|
"entropy": 5.17231707572937,
|
|
"epoch": 2.785782901056676,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000422946198403012,
|
|
"loss": 4.8007,
|
|
"mean_token_accuracy": 0.2307385966181755,
|
|
"num_tokens": 66501331.0,
|
|
"step": 29000
|
|
},
|
|
{
|
|
"entropy": 5.195744562149048,
|
|
"epoch": 2.78626320845341,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004229203667416619,
|
|
"loss": 4.7932,
|
|
"mean_token_accuracy": 0.23550139963626862,
|
|
"num_tokens": 66513336.0,
|
|
"step": 29005
|
|
},
|
|
{
|
|
"entropy": 5.071753883361817,
|
|
"epoch": 2.7867435158501443,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00042289453164604,
|
|
"loss": 4.6473,
|
|
"mean_token_accuracy": 0.249504953622818,
|
|
"num_tokens": 66523577.0,
|
|
"step": 29010
|
|
},
|
|
{
|
|
"entropy": 5.017402505874633,
|
|
"epoch": 2.787223823246878,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004228686931167463,
|
|
"loss": 4.7436,
|
|
"mean_token_accuracy": 0.2307626038789749,
|
|
"num_tokens": 66536181.0,
|
|
"step": 29015
|
|
},
|
|
{
|
|
"entropy": 5.065583562850952,
|
|
"epoch": 2.787704130643612,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004228428511543806,
|
|
"loss": 4.7102,
|
|
"mean_token_accuracy": 0.23388897031545638,
|
|
"num_tokens": 66546881.0,
|
|
"step": 29020
|
|
},
|
|
{
|
|
"entropy": 5.175033617019653,
|
|
"epoch": 2.7881844380403455,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00042281700575954283,
|
|
"loss": 4.8451,
|
|
"mean_token_accuracy": 0.2391469433903694,
|
|
"num_tokens": 66559091.0,
|
|
"step": 29025
|
|
},
|
|
{
|
|
"entropy": 5.142588186264038,
|
|
"epoch": 2.78866474543708,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004227911569328332,
|
|
"loss": 4.8233,
|
|
"mean_token_accuracy": 0.22944566160440444,
|
|
"num_tokens": 66570584.0,
|
|
"step": 29030
|
|
},
|
|
{
|
|
"entropy": 5.087484645843506,
|
|
"epoch": 2.7891450528338138,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004227653046748517,
|
|
"loss": 4.7233,
|
|
"mean_token_accuracy": 0.2394936978816986,
|
|
"num_tokens": 66581921.0,
|
|
"step": 29035
|
|
},
|
|
{
|
|
"entropy": 5.0868641376495365,
|
|
"epoch": 2.7896253602305476,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00042273944898619864,
|
|
"loss": 4.835,
|
|
"mean_token_accuracy": 0.23168757557868958,
|
|
"num_tokens": 66595132.0,
|
|
"step": 29040
|
|
},
|
|
{
|
|
"entropy": 5.173955488204956,
|
|
"epoch": 2.7901056676272815,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00042271358986747427,
|
|
"loss": 4.7861,
|
|
"mean_token_accuracy": 0.23523377031087875,
|
|
"num_tokens": 66607212.0,
|
|
"step": 29045
|
|
},
|
|
{
|
|
"entropy": 5.135601615905761,
|
|
"epoch": 2.7905859750240154,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00042268772731927895,
|
|
"loss": 4.748,
|
|
"mean_token_accuracy": 0.24023500680923462,
|
|
"num_tokens": 66619502.0,
|
|
"step": 29050
|
|
},
|
|
{
|
|
"entropy": 5.078439521789551,
|
|
"epoch": 2.7910662824207493,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042266186134221317,
|
|
"loss": 4.7932,
|
|
"mean_token_accuracy": 0.22847750633955002,
|
|
"num_tokens": 66631813.0,
|
|
"step": 29055
|
|
},
|
|
{
|
|
"entropy": 5.136983346939087,
|
|
"epoch": 2.791546589817483,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004226359919368774,
|
|
"loss": 4.8381,
|
|
"mean_token_accuracy": 0.22895507216453553,
|
|
"num_tokens": 66643879.0,
|
|
"step": 29060
|
|
},
|
|
{
|
|
"entropy": 5.174340677261353,
|
|
"epoch": 2.792026897214217,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042261011910387224,
|
|
"loss": 4.8108,
|
|
"mean_token_accuracy": 0.23358631283044815,
|
|
"num_tokens": 66656819.0,
|
|
"step": 29065
|
|
},
|
|
{
|
|
"entropy": 5.0694104671478275,
|
|
"epoch": 2.792507204610951,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004225842428437985,
|
|
"loss": 4.6674,
|
|
"mean_token_accuracy": 0.24253317564725876,
|
|
"num_tokens": 66668745.0,
|
|
"step": 29070
|
|
},
|
|
{
|
|
"entropy": 5.050758314132691,
|
|
"epoch": 2.792987512007685,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00042255836315725694,
|
|
"loss": 4.7219,
|
|
"mean_token_accuracy": 0.24155887365341186,
|
|
"num_tokens": 66681620.0,
|
|
"step": 29075
|
|
},
|
|
{
|
|
"entropy": 5.057884931564331,
|
|
"epoch": 2.7934678194044187,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004225324800448483,
|
|
"loss": 4.6954,
|
|
"mean_token_accuracy": 0.2354402020573616,
|
|
"num_tokens": 66692232.0,
|
|
"step": 29080
|
|
},
|
|
{
|
|
"entropy": 5.067146492004395,
|
|
"epoch": 2.793948126801153,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042250659350717343,
|
|
"loss": 4.6955,
|
|
"mean_token_accuracy": 0.24201476722955703,
|
|
"num_tokens": 66703401.0,
|
|
"step": 29085
|
|
},
|
|
{
|
|
"entropy": 4.958551979064941,
|
|
"epoch": 2.7944284341978864,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042248070354483354,
|
|
"loss": 4.6304,
|
|
"mean_token_accuracy": 0.24315544962882996,
|
|
"num_tokens": 66714066.0,
|
|
"step": 29090
|
|
},
|
|
{
|
|
"entropy": 4.9515398979187015,
|
|
"epoch": 2.7949087415946208,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004224548101584297,
|
|
"loss": 4.5899,
|
|
"mean_token_accuracy": 0.2479192927479744,
|
|
"num_tokens": 66724504.0,
|
|
"step": 29095
|
|
},
|
|
{
|
|
"entropy": 5.094303226470947,
|
|
"epoch": 2.795389048991354,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000422428913348563,
|
|
"loss": 4.7101,
|
|
"mean_token_accuracy": 0.23548691868782043,
|
|
"num_tokens": 66735772.0,
|
|
"step": 29100
|
|
},
|
|
{
|
|
"entropy": 5.090736150741577,
|
|
"epoch": 2.7958693563880885,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004224030131158346,
|
|
"loss": 4.7576,
|
|
"mean_token_accuracy": 0.2333929643034935,
|
|
"num_tokens": 66747956.0,
|
|
"step": 29105
|
|
},
|
|
{
|
|
"entropy": 5.064262390136719,
|
|
"epoch": 2.7963496637848224,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004223771094608461,
|
|
"loss": 4.7749,
|
|
"mean_token_accuracy": 0.23554529398679733,
|
|
"num_tokens": 66759112.0,
|
|
"step": 29110
|
|
},
|
|
{
|
|
"entropy": 5.056329822540283,
|
|
"epoch": 2.7968299711815563,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004223512023841986,
|
|
"loss": 4.7125,
|
|
"mean_token_accuracy": 0.2383354589343071,
|
|
"num_tokens": 66770927.0,
|
|
"step": 29115
|
|
},
|
|
{
|
|
"entropy": 5.102386045455932,
|
|
"epoch": 2.79731027857829,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042232529188649374,
|
|
"loss": 4.819,
|
|
"mean_token_accuracy": 0.2361222356557846,
|
|
"num_tokens": 66781809.0,
|
|
"step": 29120
|
|
},
|
|
{
|
|
"entropy": 5.017792701721191,
|
|
"epoch": 2.797790585975024,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004222993779683331,
|
|
"loss": 4.697,
|
|
"mean_token_accuracy": 0.23675900995731353,
|
|
"num_tokens": 66793495.0,
|
|
"step": 29125
|
|
},
|
|
{
|
|
"entropy": 5.044356393814087,
|
|
"epoch": 2.798270893371758,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042227346063031837,
|
|
"loss": 4.7142,
|
|
"mean_token_accuracy": 0.24150240570306777,
|
|
"num_tokens": 66804568.0,
|
|
"step": 29130
|
|
},
|
|
{
|
|
"entropy": 5.061636066436767,
|
|
"epoch": 2.798751200768492,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004222475398730511,
|
|
"loss": 4.6495,
|
|
"mean_token_accuracy": 0.24463913440704346,
|
|
"num_tokens": 66816698.0,
|
|
"step": 29135
|
|
},
|
|
{
|
|
"entropy": 5.104371786117554,
|
|
"epoch": 2.7992315081652257,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004222216156971332,
|
|
"loss": 4.7831,
|
|
"mean_token_accuracy": 0.23421378880739213,
|
|
"num_tokens": 66829299.0,
|
|
"step": 29140
|
|
},
|
|
{
|
|
"entropy": 4.958194351196289,
|
|
"epoch": 2.7997118155619596,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042219568810316656,
|
|
"loss": 4.589,
|
|
"mean_token_accuracy": 0.2420770525932312,
|
|
"num_tokens": 66839871.0,
|
|
"step": 29145
|
|
},
|
|
{
|
|
"entropy": 5.1890904903411865,
|
|
"epoch": 2.8001921229586935,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004221697570917531,
|
|
"loss": 4.7849,
|
|
"mean_token_accuracy": 0.2348181426525116,
|
|
"num_tokens": 66850368.0,
|
|
"step": 29150
|
|
},
|
|
{
|
|
"entropy": 5.111753463745117,
|
|
"epoch": 2.8006724303554273,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000422143822663495,
|
|
"loss": 4.7684,
|
|
"mean_token_accuracy": 0.235346058011055,
|
|
"num_tokens": 66861067.0,
|
|
"step": 29155
|
|
},
|
|
{
|
|
"entropy": 5.082866239547729,
|
|
"epoch": 2.8011527377521612,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004221178848189941,
|
|
"loss": 4.8492,
|
|
"mean_token_accuracy": 0.22439933717250823,
|
|
"num_tokens": 66873841.0,
|
|
"step": 29160
|
|
},
|
|
{
|
|
"entropy": 5.106626224517822,
|
|
"epoch": 2.801633045148895,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00042209194355885283,
|
|
"loss": 4.7841,
|
|
"mean_token_accuracy": 0.22784036695957183,
|
|
"num_tokens": 66885042.0,
|
|
"step": 29165
|
|
},
|
|
{
|
|
"entropy": 5.114370965957642,
|
|
"epoch": 2.8021133525456294,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004220659988836734,
|
|
"loss": 4.7594,
|
|
"mean_token_accuracy": 0.23598769903182984,
|
|
"num_tokens": 66897076.0,
|
|
"step": 29170
|
|
},
|
|
{
|
|
"entropy": 5.031212520599365,
|
|
"epoch": 2.802593659942363,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004220400507940582,
|
|
"loss": 4.6507,
|
|
"mean_token_accuracy": 0.23573495745658873,
|
|
"num_tokens": 66907879.0,
|
|
"step": 29175
|
|
},
|
|
{
|
|
"entropy": 5.082202768325805,
|
|
"epoch": 2.803073967339097,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00042201409929060955,
|
|
"loss": 4.8137,
|
|
"mean_token_accuracy": 0.23303966522216796,
|
|
"num_tokens": 66920801.0,
|
|
"step": 29180
|
|
},
|
|
{
|
|
"entropy": 5.096289825439453,
|
|
"epoch": 2.803554274735831,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004219881443739301,
|
|
"loss": 4.6695,
|
|
"mean_token_accuracy": 0.23883183002471925,
|
|
"num_tokens": 66931411.0,
|
|
"step": 29185
|
|
},
|
|
{
|
|
"entropy": 5.111446666717529,
|
|
"epoch": 2.804034582132565,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004219621860446225,
|
|
"loss": 4.7507,
|
|
"mean_token_accuracy": 0.23893794417381287,
|
|
"num_tokens": 66942122.0,
|
|
"step": 29190
|
|
},
|
|
{
|
|
"entropy": 5.04069766998291,
|
|
"epoch": 2.804514889529299,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004219362243032892,
|
|
"loss": 4.7822,
|
|
"mean_token_accuracy": 0.23491215258836745,
|
|
"num_tokens": 66954096.0,
|
|
"step": 29195
|
|
},
|
|
{
|
|
"entropy": 5.125007200241089,
|
|
"epoch": 2.8049951969260327,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00042191025915053323,
|
|
"loss": 4.8025,
|
|
"mean_token_accuracy": 0.23034960478544236,
|
|
"num_tokens": 66965729.0,
|
|
"step": 29200
|
|
},
|
|
{
|
|
"entropy": 5.113742399215698,
|
|
"epoch": 2.8054755043227666,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042188429058695714,
|
|
"loss": 4.768,
|
|
"mean_token_accuracy": 0.22859703600406647,
|
|
"num_tokens": 66975819.0,
|
|
"step": 29205
|
|
},
|
|
{
|
|
"entropy": 5.046423530578613,
|
|
"epoch": 2.8059558117195005,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00042185831861316406,
|
|
"loss": 4.7134,
|
|
"mean_token_accuracy": 0.24926585853099822,
|
|
"num_tokens": 66987150.0,
|
|
"step": 29210
|
|
},
|
|
{
|
|
"entropy": 5.07684817314148,
|
|
"epoch": 2.8064361191162344,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004218323432297568,
|
|
"loss": 4.6882,
|
|
"mean_token_accuracy": 0.23167644441127777,
|
|
"num_tokens": 66999449.0,
|
|
"step": 29215
|
|
},
|
|
{
|
|
"entropy": 5.0162135601043705,
|
|
"epoch": 2.8069164265129682,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042180636443733864,
|
|
"loss": 4.6862,
|
|
"mean_token_accuracy": 0.24328800439834594,
|
|
"num_tokens": 67011256.0,
|
|
"step": 29220
|
|
},
|
|
{
|
|
"entropy": 4.9040539264678955,
|
|
"epoch": 2.807396733909702,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042178038223651253,
|
|
"loss": 4.587,
|
|
"mean_token_accuracy": 0.24731760174036027,
|
|
"num_tokens": 67021878.0,
|
|
"step": 29225
|
|
},
|
|
{
|
|
"entropy": 5.091517114639283,
|
|
"epoch": 2.807877041306436,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042175439662788195,
|
|
"loss": 4.8519,
|
|
"mean_token_accuracy": 0.22310329526662825,
|
|
"num_tokens": 67032544.0,
|
|
"step": 29230
|
|
},
|
|
{
|
|
"entropy": 5.048261499404907,
|
|
"epoch": 2.80835734870317,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00042172840761204986,
|
|
"loss": 4.715,
|
|
"mean_token_accuracy": 0.24167356193065642,
|
|
"num_tokens": 67042119.0,
|
|
"step": 29235
|
|
},
|
|
{
|
|
"entropy": 5.085355520248413,
|
|
"epoch": 2.8088376560999038,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004217024151896199,
|
|
"loss": 4.7588,
|
|
"mean_token_accuracy": 0.2463191419839859,
|
|
"num_tokens": 67053126.0,
|
|
"step": 29240
|
|
},
|
|
{
|
|
"entropy": 5.1328675746917725,
|
|
"epoch": 2.809317963496638,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042167641936119557,
|
|
"loss": 4.8357,
|
|
"mean_token_accuracy": 0.2317552775144577,
|
|
"num_tokens": 67063818.0,
|
|
"step": 29245
|
|
},
|
|
{
|
|
"entropy": 5.040121793746948,
|
|
"epoch": 2.8097982708933715,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004216504201273802,
|
|
"loss": 4.736,
|
|
"mean_token_accuracy": 0.24150463789701462,
|
|
"num_tokens": 67074477.0,
|
|
"step": 29250
|
|
},
|
|
{
|
|
"entropy": 5.108019542694092,
|
|
"epoch": 2.810278578290106,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004216244174887776,
|
|
"loss": 4.7634,
|
|
"mean_token_accuracy": 0.23590658009052276,
|
|
"num_tokens": 67085920.0,
|
|
"step": 29255
|
|
},
|
|
{
|
|
"entropy": 5.0599141120910645,
|
|
"epoch": 2.8107588856868397,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00042159841144599145,
|
|
"loss": 4.7853,
|
|
"mean_token_accuracy": 0.2368880867958069,
|
|
"num_tokens": 67097542.0,
|
|
"step": 29260
|
|
},
|
|
{
|
|
"entropy": 5.1055501937866214,
|
|
"epoch": 2.8112391930835736,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042157240199962537,
|
|
"loss": 4.8005,
|
|
"mean_token_accuracy": 0.23813695907592775,
|
|
"num_tokens": 67108514.0,
|
|
"step": 29265
|
|
},
|
|
{
|
|
"entropy": 5.073172616958618,
|
|
"epoch": 2.8117195004803075,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004215463891502834,
|
|
"loss": 4.6163,
|
|
"mean_token_accuracy": 0.245599864423275,
|
|
"num_tokens": 67120263.0,
|
|
"step": 29270
|
|
},
|
|
{
|
|
"entropy": 5.074782133102417,
|
|
"epoch": 2.8121998078770414,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042152037289856954,
|
|
"loss": 4.709,
|
|
"mean_token_accuracy": 0.23398882746696473,
|
|
"num_tokens": 67133607.0,
|
|
"step": 29275
|
|
},
|
|
{
|
|
"entropy": 5.077461671829224,
|
|
"epoch": 2.8126801152737753,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042149435324508755,
|
|
"loss": 4.7064,
|
|
"mean_token_accuracy": 0.2428443506360054,
|
|
"num_tokens": 67145452.0,
|
|
"step": 29280
|
|
},
|
|
{
|
|
"entropy": 4.991470813751221,
|
|
"epoch": 2.813160422670509,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004214683301904417,
|
|
"loss": 4.7309,
|
|
"mean_token_accuracy": 0.24190180599689484,
|
|
"num_tokens": 67159178.0,
|
|
"step": 29285
|
|
},
|
|
{
|
|
"entropy": 5.179703235626221,
|
|
"epoch": 2.813640730067243,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00042144230373523624,
|
|
"loss": 4.8226,
|
|
"mean_token_accuracy": 0.2293036624789238,
|
|
"num_tokens": 67171652.0,
|
|
"step": 29290
|
|
},
|
|
{
|
|
"entropy": 5.142070007324219,
|
|
"epoch": 2.814121037463977,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004214162738800753,
|
|
"loss": 4.7704,
|
|
"mean_token_accuracy": 0.23842364102602004,
|
|
"num_tokens": 67181844.0,
|
|
"step": 29295
|
|
},
|
|
{
|
|
"entropy": 5.066088676452637,
|
|
"epoch": 2.814601344860711,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004213902406255632,
|
|
"loss": 4.7882,
|
|
"mean_token_accuracy": 0.2425612300634384,
|
|
"num_tokens": 67193156.0,
|
|
"step": 29300
|
|
},
|
|
{
|
|
"entropy": 5.030477046966553,
|
|
"epoch": 2.8150816522574447,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004213642039723044,
|
|
"loss": 4.7371,
|
|
"mean_token_accuracy": 0.23682460188865662,
|
|
"num_tokens": 67205447.0,
|
|
"step": 29305
|
|
},
|
|
{
|
|
"entropy": 5.194211769104004,
|
|
"epoch": 2.8155619596541785,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042133816392090343,
|
|
"loss": 4.8293,
|
|
"mean_token_accuracy": 0.23123976290225984,
|
|
"num_tokens": 67216101.0,
|
|
"step": 29310
|
|
},
|
|
{
|
|
"entropy": 5.037980031967163,
|
|
"epoch": 2.8160422670509124,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00042131212047196484,
|
|
"loss": 4.5674,
|
|
"mean_token_accuracy": 0.25225337147712706,
|
|
"num_tokens": 67226972.0,
|
|
"step": 29315
|
|
},
|
|
{
|
|
"entropy": 4.998749828338623,
|
|
"epoch": 2.8165225744476468,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042128607362609317,
|
|
"loss": 4.7108,
|
|
"mean_token_accuracy": 0.24861632883548737,
|
|
"num_tokens": 67237968.0,
|
|
"step": 29320
|
|
},
|
|
{
|
|
"entropy": 5.095122480392456,
|
|
"epoch": 2.81700288184438,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00042126002338389336,
|
|
"loss": 4.8231,
|
|
"mean_token_accuracy": 0.23481558561325072,
|
|
"num_tokens": 67250817.0,
|
|
"step": 29325
|
|
},
|
|
{
|
|
"entropy": 5.1704460144042965,
|
|
"epoch": 2.8174831892411145,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00042123396974597007,
|
|
"loss": 4.7091,
|
|
"mean_token_accuracy": 0.23950430005788803,
|
|
"num_tokens": 67261491.0,
|
|
"step": 29330
|
|
},
|
|
{
|
|
"entropy": 5.069094800949097,
|
|
"epoch": 2.817963496637848,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00042120791271292823,
|
|
"loss": 4.7515,
|
|
"mean_token_accuracy": 0.241105617582798,
|
|
"num_tokens": 67272684.0,
|
|
"step": 29335
|
|
},
|
|
{
|
|
"entropy": 5.080519819259644,
|
|
"epoch": 2.8184438040345823,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00042118185228537283,
|
|
"loss": 4.7718,
|
|
"mean_token_accuracy": 0.233907251060009,
|
|
"num_tokens": 67284070.0,
|
|
"step": 29340
|
|
},
|
|
{
|
|
"entropy": 5.135709619522094,
|
|
"epoch": 2.818924111431316,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00042115578846390884,
|
|
"loss": 4.8109,
|
|
"mean_token_accuracy": 0.23303520381450654,
|
|
"num_tokens": 67296302.0,
|
|
"step": 29345
|
|
},
|
|
{
|
|
"entropy": 5.1688251972198485,
|
|
"epoch": 2.81940441882805,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004211297212491414,
|
|
"loss": 4.759,
|
|
"mean_token_accuracy": 0.23793066143989564,
|
|
"num_tokens": 67307123.0,
|
|
"step": 29350
|
|
},
|
|
{
|
|
"entropy": 5.125912284851074,
|
|
"epoch": 2.819884726224784,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004211036506416759,
|
|
"loss": 4.742,
|
|
"mean_token_accuracy": 0.2363669753074646,
|
|
"num_tokens": 67318832.0,
|
|
"step": 29355
|
|
},
|
|
{
|
|
"entropy": 5.099205780029297,
|
|
"epoch": 2.820365033621518,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004210775766421173,
|
|
"loss": 4.8252,
|
|
"mean_token_accuracy": 0.2343818336725235,
|
|
"num_tokens": 67329821.0,
|
|
"step": 29360
|
|
},
|
|
{
|
|
"entropy": 5.085246849060058,
|
|
"epoch": 2.8208453410182517,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004210514992510713,
|
|
"loss": 4.7023,
|
|
"mean_token_accuracy": 0.2376452013850212,
|
|
"num_tokens": 67341569.0,
|
|
"step": 29365
|
|
},
|
|
{
|
|
"entropy": 5.1617189884185795,
|
|
"epoch": 2.8213256484149856,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004210254184691431,
|
|
"loss": 4.8206,
|
|
"mean_token_accuracy": 0.23409080356359482,
|
|
"num_tokens": 67352499.0,
|
|
"step": 29370
|
|
},
|
|
{
|
|
"entropy": 5.065718507766723,
|
|
"epoch": 2.8218059558117194,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042099933429693814,
|
|
"loss": 4.7227,
|
|
"mean_token_accuracy": 0.23651075959205628,
|
|
"num_tokens": 67363810.0,
|
|
"step": 29375
|
|
},
|
|
{
|
|
"entropy": 5.126027631759643,
|
|
"epoch": 2.8222862632084533,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004209732467350624,
|
|
"loss": 4.8102,
|
|
"mean_token_accuracy": 0.2424784705042839,
|
|
"num_tokens": 67374690.0,
|
|
"step": 29380
|
|
},
|
|
{
|
|
"entropy": 5.087665176391601,
|
|
"epoch": 2.822766570605187,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004209471557841212,
|
|
"loss": 4.7307,
|
|
"mean_token_accuracy": 0.23689046800136565,
|
|
"num_tokens": 67386297.0,
|
|
"step": 29385
|
|
},
|
|
{
|
|
"entropy": 5.081095743179321,
|
|
"epoch": 2.823246878001921,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004209210614447204,
|
|
"loss": 4.6725,
|
|
"mean_token_accuracy": 0.24220550954341888,
|
|
"num_tokens": 67397544.0,
|
|
"step": 29390
|
|
},
|
|
{
|
|
"entropy": 5.125997829437256,
|
|
"epoch": 2.8237271853986554,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000420894963717466,
|
|
"loss": 4.7593,
|
|
"mean_token_accuracy": 0.23870523571968078,
|
|
"num_tokens": 67408609.0,
|
|
"step": 29395
|
|
},
|
|
{
|
|
"entropy": 5.0870969772338865,
|
|
"epoch": 2.824207492795389,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004208688626029636,
|
|
"loss": 4.7304,
|
|
"mean_token_accuracy": 0.23566053956747054,
|
|
"num_tokens": 67421091.0,
|
|
"step": 29400
|
|
},
|
|
{
|
|
"entropy": 5.089567852020264,
|
|
"epoch": 2.824687800192123,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004208427581018194,
|
|
"loss": 4.7638,
|
|
"mean_token_accuracy": 0.23751177489757538,
|
|
"num_tokens": 67433141.0,
|
|
"step": 29405
|
|
},
|
|
{
|
|
"entropy": 5.149106502532959,
|
|
"epoch": 2.8251681075888566,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004208166502146394,
|
|
"loss": 4.8157,
|
|
"mean_token_accuracy": 0.2274396926164627,
|
|
"num_tokens": 67443646.0,
|
|
"step": 29410
|
|
},
|
|
{
|
|
"entropy": 5.127561283111572,
|
|
"epoch": 2.825648414985591,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00042079053894202977,
|
|
"loss": 4.755,
|
|
"mean_token_accuracy": 0.2354283645749092,
|
|
"num_tokens": 67455783.0,
|
|
"step": 29415
|
|
},
|
|
{
|
|
"entropy": 5.097440528869629,
|
|
"epoch": 2.826128722382325,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004207644242845968,
|
|
"loss": 4.712,
|
|
"mean_token_accuracy": 0.24154630899429322,
|
|
"num_tokens": 67467185.0,
|
|
"step": 29420
|
|
},
|
|
{
|
|
"entropy": 5.0761716842651365,
|
|
"epoch": 2.8266090297790587,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004207383062429467,
|
|
"loss": 4.7032,
|
|
"mean_token_accuracy": 0.23896313607692718,
|
|
"num_tokens": 67478740.0,
|
|
"step": 29425
|
|
},
|
|
{
|
|
"entropy": 5.02870888710022,
|
|
"epoch": 2.8270893371757926,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004207121848176858,
|
|
"loss": 4.6894,
|
|
"mean_token_accuracy": 0.24051135778427124,
|
|
"num_tokens": 67490492.0,
|
|
"step": 29430
|
|
},
|
|
{
|
|
"entropy": 4.981909322738647,
|
|
"epoch": 2.8275696445725265,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00042068606000942075,
|
|
"loss": 4.6571,
|
|
"mean_token_accuracy": 0.24181026667356492,
|
|
"num_tokens": 67500861.0,
|
|
"step": 29435
|
|
},
|
|
{
|
|
"entropy": 5.078646659851074,
|
|
"epoch": 2.8280499519692603,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00042065993181875794,
|
|
"loss": 4.7278,
|
|
"mean_token_accuracy": 0.2341497138142586,
|
|
"num_tokens": 67513577.0,
|
|
"step": 29440
|
|
},
|
|
{
|
|
"entropy": 5.077435159683228,
|
|
"epoch": 2.8285302593659942,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000420633800246304,
|
|
"loss": 4.7257,
|
|
"mean_token_accuracy": 0.2428266689181328,
|
|
"num_tokens": 67525147.0,
|
|
"step": 29445
|
|
},
|
|
{
|
|
"entropy": 5.069917392730713,
|
|
"epoch": 2.829010566762728,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00042060766529266577,
|
|
"loss": 4.7276,
|
|
"mean_token_accuracy": 0.23703081905841827,
|
|
"num_tokens": 67536183.0,
|
|
"step": 29450
|
|
},
|
|
{
|
|
"entropy": 5.130881977081299,
|
|
"epoch": 2.829490874159462,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00042058152695844986,
|
|
"loss": 4.7522,
|
|
"mean_token_accuracy": 0.2344597414135933,
|
|
"num_tokens": 67546868.0,
|
|
"step": 29455
|
|
},
|
|
{
|
|
"entropy": 5.037741565704346,
|
|
"epoch": 2.829971181556196,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00042055538524426317,
|
|
"loss": 4.6772,
|
|
"mean_token_accuracy": 0.24406108856201172,
|
|
"num_tokens": 67557243.0,
|
|
"step": 29460
|
|
},
|
|
{
|
|
"entropy": 5.034839630126953,
|
|
"epoch": 2.8304514889529298,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004205292401507127,
|
|
"loss": 4.7484,
|
|
"mean_token_accuracy": 0.2283693253993988,
|
|
"num_tokens": 67568977.0,
|
|
"step": 29465
|
|
},
|
|
{
|
|
"entropy": 5.0676023960113525,
|
|
"epoch": 2.8309317963496636,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004205030916784053,
|
|
"loss": 4.7028,
|
|
"mean_token_accuracy": 0.24158486872911453,
|
|
"num_tokens": 67581318.0,
|
|
"step": 29470
|
|
},
|
|
{
|
|
"entropy": 5.010189628601074,
|
|
"epoch": 2.8314121037463975,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00042047693982794824,
|
|
"loss": 4.6528,
|
|
"mean_token_accuracy": 0.2412082239985466,
|
|
"num_tokens": 67592707.0,
|
|
"step": 29475
|
|
},
|
|
{
|
|
"entropy": 5.0895514488220215,
|
|
"epoch": 2.831892411143132,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00042045078459994854,
|
|
"loss": 4.7254,
|
|
"mean_token_accuracy": 0.23643447011709212,
|
|
"num_tokens": 67602929.0,
|
|
"step": 29480
|
|
},
|
|
{
|
|
"entropy": 5.115679359436035,
|
|
"epoch": 2.8323727185398653,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004204246259950136,
|
|
"loss": 4.7374,
|
|
"mean_token_accuracy": 0.24406941384077072,
|
|
"num_tokens": 67613049.0,
|
|
"step": 29485
|
|
},
|
|
{
|
|
"entropy": 5.171131896972656,
|
|
"epoch": 2.8328530259365996,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00042039846401375065,
|
|
"loss": 4.8206,
|
|
"mean_token_accuracy": 0.2334555834531784,
|
|
"num_tokens": 67623355.0,
|
|
"step": 29490
|
|
},
|
|
{
|
|
"entropy": 5.166270017623901,
|
|
"epoch": 2.8333333333333335,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00042037229865676714,
|
|
"loss": 4.8016,
|
|
"mean_token_accuracy": 0.23626454472541808,
|
|
"num_tokens": 67634607.0,
|
|
"step": 29495
|
|
},
|
|
{
|
|
"entropy": 5.046123886108399,
|
|
"epoch": 2.8338136407300674,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00042034612992467046,
|
|
"loss": 4.6883,
|
|
"mean_token_accuracy": 0.2428459644317627,
|
|
"num_tokens": 67646549.0,
|
|
"step": 29500
|
|
},
|
|
{
|
|
"entropy": 4.962723445892334,
|
|
"epoch": 2.8342939481268012,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004203199578180683,
|
|
"loss": 4.619,
|
|
"mean_token_accuracy": 0.2527529805898666,
|
|
"num_tokens": 67657785.0,
|
|
"step": 29505
|
|
},
|
|
{
|
|
"entropy": 5.064220571517945,
|
|
"epoch": 2.834774255523535,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004202937823375682,
|
|
"loss": 4.715,
|
|
"mean_token_accuracy": 0.2429947003722191,
|
|
"num_tokens": 67670415.0,
|
|
"step": 29510
|
|
},
|
|
{
|
|
"entropy": 5.099671173095703,
|
|
"epoch": 2.835254562920269,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004202676034837779,
|
|
"loss": 4.7237,
|
|
"mean_token_accuracy": 0.23474127650260926,
|
|
"num_tokens": 67682815.0,
|
|
"step": 29515
|
|
},
|
|
{
|
|
"entropy": 5.086864423751831,
|
|
"epoch": 2.835734870317003,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004202414212573052,
|
|
"loss": 4.7447,
|
|
"mean_token_accuracy": 0.23662538975477218,
|
|
"num_tokens": 67693820.0,
|
|
"step": 29520
|
|
},
|
|
{
|
|
"entropy": 5.068673038482666,
|
|
"epoch": 2.8362151777137368,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00042021523565875796,
|
|
"loss": 4.7307,
|
|
"mean_token_accuracy": 0.23210255354642867,
|
|
"num_tokens": 67705375.0,
|
|
"step": 29525
|
|
},
|
|
{
|
|
"entropy": 5.118408823013306,
|
|
"epoch": 2.8366954851104706,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004201890466887442,
|
|
"loss": 4.7608,
|
|
"mean_token_accuracy": 0.23971533328294753,
|
|
"num_tokens": 67717549.0,
|
|
"step": 29530
|
|
},
|
|
{
|
|
"entropy": 5.064080238342285,
|
|
"epoch": 2.8371757925072045,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004201628543478718,
|
|
"loss": 4.719,
|
|
"mean_token_accuracy": 0.24084258824586868,
|
|
"num_tokens": 67727462.0,
|
|
"step": 29535
|
|
},
|
|
{
|
|
"entropy": 5.07521619796753,
|
|
"epoch": 2.8376560999039384,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000420136658636749,
|
|
"loss": 4.7104,
|
|
"mean_token_accuracy": 0.23347580134868623,
|
|
"num_tokens": 67740059.0,
|
|
"step": 29540
|
|
},
|
|
{
|
|
"entropy": 5.1352294921875,
|
|
"epoch": 2.8381364073006723,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004201104595559841,
|
|
"loss": 4.7545,
|
|
"mean_token_accuracy": 0.2352105587720871,
|
|
"num_tokens": 67751010.0,
|
|
"step": 29545
|
|
},
|
|
{
|
|
"entropy": 5.056938123703003,
|
|
"epoch": 2.838616714697406,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042008425710618507,
|
|
"loss": 4.727,
|
|
"mean_token_accuracy": 0.24354122281074525,
|
|
"num_tokens": 67762131.0,
|
|
"step": 29550
|
|
},
|
|
{
|
|
"entropy": 5.065296459197998,
|
|
"epoch": 2.8390970220941405,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00042005805128796043,
|
|
"loss": 4.7733,
|
|
"mean_token_accuracy": 0.2421492114663124,
|
|
"num_tokens": 67773759.0,
|
|
"step": 29555
|
|
},
|
|
{
|
|
"entropy": 5.059252643585205,
|
|
"epoch": 2.839577329490874,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004200318421019186,
|
|
"loss": 4.6424,
|
|
"mean_token_accuracy": 0.24407144635915756,
|
|
"num_tokens": 67785009.0,
|
|
"step": 29560
|
|
},
|
|
{
|
|
"entropy": 5.083651447296143,
|
|
"epoch": 2.8400576368876083,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000420005629548668,
|
|
"loss": 4.7388,
|
|
"mean_token_accuracy": 0.2332341268658638,
|
|
"num_tokens": 67796588.0,
|
|
"step": 29565
|
|
},
|
|
{
|
|
"entropy": 5.070565891265869,
|
|
"epoch": 2.840537944284342,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00041997941362881735,
|
|
"loss": 4.6984,
|
|
"mean_token_accuracy": 0.2380545437335968,
|
|
"num_tokens": 67808777.0,
|
|
"step": 29570
|
|
},
|
|
{
|
|
"entropy": 5.10891318321228,
|
|
"epoch": 2.841018251681076,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004199531943429752,
|
|
"loss": 4.748,
|
|
"mean_token_accuracy": 0.23936144560575484,
|
|
"num_tokens": 67818562.0,
|
|
"step": 29575
|
|
},
|
|
{
|
|
"entropy": 5.008887243270874,
|
|
"epoch": 2.84149855907781,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004199269716917502,
|
|
"loss": 4.691,
|
|
"mean_token_accuracy": 0.24895972162485122,
|
|
"num_tokens": 67829938.0,
|
|
"step": 29580
|
|
},
|
|
{
|
|
"entropy": 5.102813577651977,
|
|
"epoch": 2.841978866474544,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004199007456757513,
|
|
"loss": 4.7173,
|
|
"mean_token_accuracy": 0.23261762410402298,
|
|
"num_tokens": 67841735.0,
|
|
"step": 29585
|
|
},
|
|
{
|
|
"entropy": 5.013525819778442,
|
|
"epoch": 2.8424591738712777,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041987451629558743,
|
|
"loss": 4.6723,
|
|
"mean_token_accuracy": 0.23993164747953416,
|
|
"num_tokens": 67853545.0,
|
|
"step": 29590
|
|
},
|
|
{
|
|
"entropy": 5.048679161071777,
|
|
"epoch": 2.8429394812680115,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004198482835518674,
|
|
"loss": 4.6501,
|
|
"mean_token_accuracy": 0.247216035425663,
|
|
"num_tokens": 67864763.0,
|
|
"step": 29595
|
|
},
|
|
{
|
|
"entropy": 5.145204257965088,
|
|
"epoch": 2.8434197886647454,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004198220474452004,
|
|
"loss": 4.8172,
|
|
"mean_token_accuracy": 0.23050425350666046,
|
|
"num_tokens": 67875837.0,
|
|
"step": 29600
|
|
},
|
|
{
|
|
"entropy": 5.084896230697632,
|
|
"epoch": 2.8439000960614793,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004197958079761954,
|
|
"loss": 4.7623,
|
|
"mean_token_accuracy": 0.236508746445179,
|
|
"num_tokens": 67887623.0,
|
|
"step": 29605
|
|
},
|
|
{
|
|
"entropy": 5.0404211521148685,
|
|
"epoch": 2.844380403458213,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041976956514546185,
|
|
"loss": 4.7101,
|
|
"mean_token_accuracy": 0.2442680910229683,
|
|
"num_tokens": 67898883.0,
|
|
"step": 29610
|
|
},
|
|
{
|
|
"entropy": 5.066223621368408,
|
|
"epoch": 2.844860710854947,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041974331895360873,
|
|
"loss": 4.6971,
|
|
"mean_token_accuracy": 0.24751601368188858,
|
|
"num_tokens": 67908871.0,
|
|
"step": 29615
|
|
},
|
|
{
|
|
"entropy": 5.031841135025024,
|
|
"epoch": 2.845341018251681,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004197170694012456,
|
|
"loss": 4.7118,
|
|
"mean_token_accuracy": 0.23562415242195128,
|
|
"num_tokens": 67920812.0,
|
|
"step": 29620
|
|
},
|
|
{
|
|
"entropy": 5.023913812637329,
|
|
"epoch": 2.845821325648415,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004196908164889818,
|
|
"loss": 4.6841,
|
|
"mean_token_accuracy": 0.24350014626979827,
|
|
"num_tokens": 67931703.0,
|
|
"step": 29625
|
|
},
|
|
{
|
|
"entropy": 5.110364866256714,
|
|
"epoch": 2.846301633045149,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004196645602174269,
|
|
"loss": 4.7696,
|
|
"mean_token_accuracy": 0.23735318034887315,
|
|
"num_tokens": 67943423.0,
|
|
"step": 29630
|
|
},
|
|
{
|
|
"entropy": 5.149677801132202,
|
|
"epoch": 2.8467819404418826,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041963830058719046,
|
|
"loss": 4.7765,
|
|
"mean_token_accuracy": 0.23049827367067338,
|
|
"num_tokens": 67954047.0,
|
|
"step": 29635
|
|
},
|
|
{
|
|
"entropy": 5.070647478103638,
|
|
"epoch": 2.847262247838617,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004196120375988822,
|
|
"loss": 4.7384,
|
|
"mean_token_accuracy": 0.24005120545625686,
|
|
"num_tokens": 67964833.0,
|
|
"step": 29640
|
|
},
|
|
{
|
|
"entropy": 5.0713142395019535,
|
|
"epoch": 2.8477425552353504,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004195857712531119,
|
|
"loss": 4.7856,
|
|
"mean_token_accuracy": 0.23458235412836076,
|
|
"num_tokens": 67977370.0,
|
|
"step": 29645
|
|
},
|
|
{
|
|
"entropy": 5.132041311264038,
|
|
"epoch": 2.8482228626320847,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004195595015504892,
|
|
"loss": 4.6999,
|
|
"mean_token_accuracy": 0.24161131531000138,
|
|
"num_tokens": 67988680.0,
|
|
"step": 29650
|
|
},
|
|
{
|
|
"entropy": 5.133564519882202,
|
|
"epoch": 2.8487031700288186,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041953322849162415,
|
|
"loss": 4.8106,
|
|
"mean_token_accuracy": 0.23893430978059768,
|
|
"num_tokens": 67999927.0,
|
|
"step": 29655
|
|
},
|
|
{
|
|
"entropy": 5.0700671672821045,
|
|
"epoch": 2.8491834774255524,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004195069520771268,
|
|
"loss": 4.8085,
|
|
"mean_token_accuracy": 0.23393811881542206,
|
|
"num_tokens": 68012176.0,
|
|
"step": 29660
|
|
},
|
|
{
|
|
"entropy": 5.156960439682007,
|
|
"epoch": 2.8496637848222863,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00041948067230760706,
|
|
"loss": 4.7311,
|
|
"mean_token_accuracy": 0.23428400307893754,
|
|
"num_tokens": 68023561.0,
|
|
"step": 29665
|
|
},
|
|
{
|
|
"entropy": 5.136725044250488,
|
|
"epoch": 2.85014409221902,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041945438918367513,
|
|
"loss": 4.8433,
|
|
"mean_token_accuracy": 0.23022455126047134,
|
|
"num_tokens": 68035768.0,
|
|
"step": 29670
|
|
},
|
|
{
|
|
"entropy": 5.175477504730225,
|
|
"epoch": 2.850624399615754,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00041942810270594115,
|
|
"loss": 4.8345,
|
|
"mean_token_accuracy": 0.2284504994750023,
|
|
"num_tokens": 68049414.0,
|
|
"step": 29675
|
|
},
|
|
{
|
|
"entropy": 5.059452867507934,
|
|
"epoch": 2.851104707012488,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004194018128750157,
|
|
"loss": 4.6302,
|
|
"mean_token_accuracy": 0.2474249631166458,
|
|
"num_tokens": 68060743.0,
|
|
"step": 29680
|
|
},
|
|
{
|
|
"entropy": 5.07676739692688,
|
|
"epoch": 2.851585014409222,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00041937551969150873,
|
|
"loss": 4.6701,
|
|
"mean_token_accuracy": 0.2410699486732483,
|
|
"num_tokens": 68070525.0,
|
|
"step": 29685
|
|
},
|
|
{
|
|
"entropy": 5.086371564865113,
|
|
"epoch": 2.8520653218059557,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000419349223156031,
|
|
"loss": 4.7683,
|
|
"mean_token_accuracy": 0.23472922891378403,
|
|
"num_tokens": 68082160.0,
|
|
"step": 29690
|
|
},
|
|
{
|
|
"entropy": 5.043384456634522,
|
|
"epoch": 2.8525456292026896,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004193229232691929,
|
|
"loss": 4.676,
|
|
"mean_token_accuracy": 0.23813123106956482,
|
|
"num_tokens": 68092473.0,
|
|
"step": 29695
|
|
},
|
|
{
|
|
"entropy": 4.977475833892822,
|
|
"epoch": 2.8530259365994235,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00041929662003160504,
|
|
"loss": 4.6828,
|
|
"mean_token_accuracy": 0.24753793478012084,
|
|
"num_tokens": 68103820.0,
|
|
"step": 29700
|
|
},
|
|
{
|
|
"entropy": 4.998800277709961,
|
|
"epoch": 2.8535062439961574,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041927031344387824,
|
|
"loss": 4.5875,
|
|
"mean_token_accuracy": 0.2567884773015976,
|
|
"num_tokens": 68116364.0,
|
|
"step": 29705
|
|
},
|
|
{
|
|
"entropy": 4.9717323780059814,
|
|
"epoch": 2.8539865513928913,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00041924400350662304,
|
|
"loss": 4.5668,
|
|
"mean_token_accuracy": 0.2500680357217789,
|
|
"num_tokens": 68127970.0,
|
|
"step": 29710
|
|
},
|
|
{
|
|
"entropy": 5.091880464553833,
|
|
"epoch": 2.8544668587896256,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00041921769022045045,
|
|
"loss": 4.7633,
|
|
"mean_token_accuracy": 0.23118812441825867,
|
|
"num_tokens": 68139433.0,
|
|
"step": 29715
|
|
},
|
|
{
|
|
"entropy": 5.053804922103882,
|
|
"epoch": 2.854947166186359,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00041919137358597137,
|
|
"loss": 4.7022,
|
|
"mean_token_accuracy": 0.23204765170812608,
|
|
"num_tokens": 68153169.0,
|
|
"step": 29720
|
|
},
|
|
{
|
|
"entropy": 5.1572469711303714,
|
|
"epoch": 2.8554274735830933,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004191650536037967,
|
|
"loss": 4.8032,
|
|
"mean_token_accuracy": 0.22558769285678865,
|
|
"num_tokens": 68163899.0,
|
|
"step": 29725
|
|
},
|
|
{
|
|
"entropy": 5.027514362335205,
|
|
"epoch": 2.8559077809798272,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041913873027453756,
|
|
"loss": 4.6108,
|
|
"mean_token_accuracy": 0.24452922493219376,
|
|
"num_tokens": 68175342.0,
|
|
"step": 29730
|
|
},
|
|
{
|
|
"entropy": 5.049619913101196,
|
|
"epoch": 2.856388088376561,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00041911240359880517,
|
|
"loss": 4.7712,
|
|
"mean_token_accuracy": 0.23811262100934982,
|
|
"num_tokens": 68186646.0,
|
|
"step": 29735
|
|
},
|
|
{
|
|
"entropy": 5.008252954483032,
|
|
"epoch": 2.856868395773295,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041908607357721067,
|
|
"loss": 4.6464,
|
|
"mean_token_accuracy": 0.24562440663576127,
|
|
"num_tokens": 68197464.0,
|
|
"step": 29740
|
|
},
|
|
{
|
|
"entropy": 4.955460834503174,
|
|
"epoch": 2.857348703170029,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041905974021036533,
|
|
"loss": 4.5972,
|
|
"mean_token_accuracy": 0.2518482759594917,
|
|
"num_tokens": 68208846.0,
|
|
"step": 29745
|
|
},
|
|
{
|
|
"entropy": 5.135580253601074,
|
|
"epoch": 2.8578290105667628,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00041903340349888065,
|
|
"loss": 4.8025,
|
|
"mean_token_accuracy": 0.2359012097120285,
|
|
"num_tokens": 68220273.0,
|
|
"step": 29750
|
|
},
|
|
{
|
|
"entropy": 5.188638544082641,
|
|
"epoch": 2.8583093179634966,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000419007063443368,
|
|
"loss": 4.8926,
|
|
"mean_token_accuracy": 0.22688197046518327,
|
|
"num_tokens": 68233871.0,
|
|
"step": 29755
|
|
},
|
|
{
|
|
"entropy": 5.102911806106567,
|
|
"epoch": 2.8587896253602305,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041898072004443906,
|
|
"loss": 4.737,
|
|
"mean_token_accuracy": 0.2358393609523773,
|
|
"num_tokens": 68246155.0,
|
|
"step": 29760
|
|
},
|
|
{
|
|
"entropy": 5.095201587677002,
|
|
"epoch": 2.8592699327569644,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004189543733027052,
|
|
"loss": 4.6985,
|
|
"mean_token_accuracy": 0.24999535232782363,
|
|
"num_tokens": 68258201.0,
|
|
"step": 29765
|
|
},
|
|
{
|
|
"entropy": 5.004236316680908,
|
|
"epoch": 2.8597502401536983,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004189280232187783,
|
|
"loss": 4.6663,
|
|
"mean_token_accuracy": 0.24319817423820494,
|
|
"num_tokens": 68270106.0,
|
|
"step": 29770
|
|
},
|
|
{
|
|
"entropy": 5.041296529769897,
|
|
"epoch": 2.860230547550432,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004189016697932701,
|
|
"loss": 4.7071,
|
|
"mean_token_accuracy": 0.2420891910791397,
|
|
"num_tokens": 68282549.0,
|
|
"step": 29775
|
|
},
|
|
{
|
|
"entropy": 5.132345914840698,
|
|
"epoch": 2.860710854947166,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004188753130267924,
|
|
"loss": 4.7506,
|
|
"mean_token_accuracy": 0.2420486569404602,
|
|
"num_tokens": 68294141.0,
|
|
"step": 29780
|
|
},
|
|
{
|
|
"entropy": 5.106274557113648,
|
|
"epoch": 2.8611911623439,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004188489529199572,
|
|
"loss": 4.7665,
|
|
"mean_token_accuracy": 0.24300543516874312,
|
|
"num_tokens": 68305520.0,
|
|
"step": 29785
|
|
},
|
|
{
|
|
"entropy": 5.028087186813354,
|
|
"epoch": 2.8616714697406342,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041882258947337637,
|
|
"loss": 4.7375,
|
|
"mean_token_accuracy": 0.2376639112830162,
|
|
"num_tokens": 68317906.0,
|
|
"step": 29790
|
|
},
|
|
{
|
|
"entropy": 5.079030466079712,
|
|
"epoch": 2.8621517771373677,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00041879622268766207,
|
|
"loss": 4.7601,
|
|
"mean_token_accuracy": 0.23383686393499375,
|
|
"num_tokens": 68328998.0,
|
|
"step": 29795
|
|
},
|
|
{
|
|
"entropy": 5.046128749847412,
|
|
"epoch": 2.862632084534102,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004187698525634266,
|
|
"loss": 4.7362,
|
|
"mean_token_accuracy": 0.23082706332206726,
|
|
"num_tokens": 68341329.0,
|
|
"step": 29800
|
|
},
|
|
{
|
|
"entropy": 5.101367521286011,
|
|
"epoch": 2.863112391930836,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041874347910128193,
|
|
"loss": 4.7714,
|
|
"mean_token_accuracy": 0.23045611530542373,
|
|
"num_tokens": 68353791.0,
|
|
"step": 29805
|
|
},
|
|
{
|
|
"entropy": 5.05668420791626,
|
|
"epoch": 2.8635926993275698,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004187171023018406,
|
|
"loss": 4.7231,
|
|
"mean_token_accuracy": 0.24457271993160248,
|
|
"num_tokens": 68365493.0,
|
|
"step": 29810
|
|
},
|
|
{
|
|
"entropy": 5.002443075180054,
|
|
"epoch": 2.8640730067243036,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041869072216571486,
|
|
"loss": 4.6475,
|
|
"mean_token_accuracy": 0.24708616137504577,
|
|
"num_tokens": 68377658.0,
|
|
"step": 29815
|
|
},
|
|
{
|
|
"entropy": 5.060185384750366,
|
|
"epoch": 2.8645533141210375,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041866433869351715,
|
|
"loss": 4.6804,
|
|
"mean_token_accuracy": 0.2429857924580574,
|
|
"num_tokens": 68387710.0,
|
|
"step": 29820
|
|
},
|
|
{
|
|
"entropy": 5.112653636932373,
|
|
"epoch": 2.8650336215177714,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004186379518858602,
|
|
"loss": 4.8256,
|
|
"mean_token_accuracy": 0.2314462423324585,
|
|
"num_tokens": 68400726.0,
|
|
"step": 29825
|
|
},
|
|
{
|
|
"entropy": 5.214086675643921,
|
|
"epoch": 2.8655139289145053,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004186115617433565,
|
|
"loss": 4.9045,
|
|
"mean_token_accuracy": 0.223735249042511,
|
|
"num_tokens": 68413414.0,
|
|
"step": 29830
|
|
},
|
|
{
|
|
"entropy": 5.0527424812316895,
|
|
"epoch": 2.865994236311239,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041858516826661876,
|
|
"loss": 4.6331,
|
|
"mean_token_accuracy": 0.2486885368824005,
|
|
"num_tokens": 68424834.0,
|
|
"step": 29835
|
|
},
|
|
{
|
|
"entropy": 4.991256952285767,
|
|
"epoch": 2.866474543707973,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00041855877145625974,
|
|
"loss": 4.6051,
|
|
"mean_token_accuracy": 0.24843001514673232,
|
|
"num_tokens": 68437220.0,
|
|
"step": 29840
|
|
},
|
|
{
|
|
"entropy": 5.040426588058471,
|
|
"epoch": 2.866954851104707,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004185323713128924,
|
|
"loss": 4.7959,
|
|
"mean_token_accuracy": 0.23011162132024765,
|
|
"num_tokens": 68449571.0,
|
|
"step": 29845
|
|
},
|
|
{
|
|
"entropy": 5.065598964691162,
|
|
"epoch": 2.867435158501441,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041850596783712956,
|
|
"loss": 4.6656,
|
|
"mean_token_accuracy": 0.23874905556440354,
|
|
"num_tokens": 68460592.0,
|
|
"step": 29850
|
|
},
|
|
{
|
|
"entropy": 5.074351978302002,
|
|
"epoch": 2.8679154658981747,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004184795610295843,
|
|
"loss": 4.6854,
|
|
"mean_token_accuracy": 0.24854440093040467,
|
|
"num_tokens": 68472268.0,
|
|
"step": 29855
|
|
},
|
|
{
|
|
"entropy": 5.063524532318115,
|
|
"epoch": 2.8683957732949086,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004184531508908697,
|
|
"loss": 4.7075,
|
|
"mean_token_accuracy": 0.23350406885147096,
|
|
"num_tokens": 68483382.0,
|
|
"step": 29860
|
|
},
|
|
{
|
|
"entropy": 5.056384468078614,
|
|
"epoch": 2.868876080691643,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004184267374215989,
|
|
"loss": 4.7365,
|
|
"mean_token_accuracy": 0.24392684549093246,
|
|
"num_tokens": 68494421.0,
|
|
"step": 29865
|
|
},
|
|
{
|
|
"entropy": 5.107250928878784,
|
|
"epoch": 2.8693563880883763,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000418400320622385,
|
|
"loss": 4.7586,
|
|
"mean_token_accuracy": 0.23305828124284744,
|
|
"num_tokens": 68505810.0,
|
|
"step": 29870
|
|
},
|
|
{
|
|
"entropy": 4.997715044021606,
|
|
"epoch": 2.8698366954851107,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004183739004938416,
|
|
"loss": 4.5571,
|
|
"mean_token_accuracy": 0.2586457535624504,
|
|
"num_tokens": 68515632.0,
|
|
"step": 29875
|
|
},
|
|
{
|
|
"entropy": 5.050461959838867,
|
|
"epoch": 2.870317002881844,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000418347477036582,
|
|
"loss": 4.7899,
|
|
"mean_token_accuracy": 0.2352416917681694,
|
|
"num_tokens": 68526930.0,
|
|
"step": 29880
|
|
},
|
|
{
|
|
"entropy": 5.027235651016236,
|
|
"epoch": 2.8707973102785784,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00041832105025121956,
|
|
"loss": 4.6573,
|
|
"mean_token_accuracy": 0.24573568105697632,
|
|
"num_tokens": 68537381.0,
|
|
"step": 29885
|
|
},
|
|
{
|
|
"entropy": 4.983240127563477,
|
|
"epoch": 2.8712776176753123,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004182946201383679,
|
|
"loss": 4.5922,
|
|
"mean_token_accuracy": 0.24639476090669632,
|
|
"num_tokens": 68548982.0,
|
|
"step": 29890
|
|
},
|
|
{
|
|
"entropy": 4.999363040924072,
|
|
"epoch": 2.871757925072046,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004182681866986407,
|
|
"loss": 4.6704,
|
|
"mean_token_accuracy": 0.24055392146110535,
|
|
"num_tokens": 68560572.0,
|
|
"step": 29895
|
|
},
|
|
{
|
|
"entropy": 5.091953277587891,
|
|
"epoch": 2.87223823246878,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041824174993265165,
|
|
"loss": 4.7362,
|
|
"mean_token_accuracy": 0.23719154596328734,
|
|
"num_tokens": 68572080.0,
|
|
"step": 29900
|
|
},
|
|
{
|
|
"entropy": 5.0495476722717285,
|
|
"epoch": 2.872718539865514,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041821530984101444,
|
|
"loss": 4.6898,
|
|
"mean_token_accuracy": 0.2397722065448761,
|
|
"num_tokens": 68584481.0,
|
|
"step": 29905
|
|
},
|
|
{
|
|
"entropy": 5.023144483566284,
|
|
"epoch": 2.873198847262248,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000418188866424343,
|
|
"loss": 4.7048,
|
|
"mean_token_accuracy": 0.2494543418288231,
|
|
"num_tokens": 68595673.0,
|
|
"step": 29910
|
|
},
|
|
{
|
|
"entropy": 5.162940740585327,
|
|
"epoch": 2.8736791546589817,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004181624196832513,
|
|
"loss": 4.793,
|
|
"mean_token_accuracy": 0.237021242082119,
|
|
"num_tokens": 68607310.0,
|
|
"step": 29915
|
|
},
|
|
{
|
|
"entropy": 5.037101364135742,
|
|
"epoch": 2.8741594620557156,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00041813596961835336,
|
|
"loss": 4.7375,
|
|
"mean_token_accuracy": 0.238271826505661,
|
|
"num_tokens": 68618579.0,
|
|
"step": 29920
|
|
},
|
|
{
|
|
"entropy": 5.016074514389038,
|
|
"epoch": 2.8746397694524495,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041810951623026313,
|
|
"loss": 4.5767,
|
|
"mean_token_accuracy": 0.2503830775618553,
|
|
"num_tokens": 68629977.0,
|
|
"step": 29925
|
|
},
|
|
{
|
|
"entropy": 5.0946588039398195,
|
|
"epoch": 2.8751200768491834,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00041808305951959496,
|
|
"loss": 4.7958,
|
|
"mean_token_accuracy": 0.23696231693029404,
|
|
"num_tokens": 68641740.0,
|
|
"step": 29930
|
|
},
|
|
{
|
|
"entropy": 5.073351049423218,
|
|
"epoch": 2.8756003842459172,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000418056599486963,
|
|
"loss": 4.7563,
|
|
"mean_token_accuracy": 0.2414123848080635,
|
|
"num_tokens": 68653171.0,
|
|
"step": 29935
|
|
},
|
|
{
|
|
"entropy": 5.060429573059082,
|
|
"epoch": 2.8760806916426516,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004180301361329816,
|
|
"loss": 4.7642,
|
|
"mean_token_accuracy": 0.24232802242040635,
|
|
"num_tokens": 68664658.0,
|
|
"step": 29940
|
|
},
|
|
{
|
|
"entropy": 5.095524263381958,
|
|
"epoch": 2.876560999039385,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004180036694582651,
|
|
"loss": 4.7003,
|
|
"mean_token_accuracy": 0.24127245396375657,
|
|
"num_tokens": 68675912.0,
|
|
"step": 29945
|
|
},
|
|
{
|
|
"entropy": 5.06903920173645,
|
|
"epoch": 2.8770413064361193,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041797719946342813,
|
|
"loss": 4.6845,
|
|
"mean_token_accuracy": 0.24589995294809341,
|
|
"num_tokens": 68686586.0,
|
|
"step": 29950
|
|
},
|
|
{
|
|
"entropy": 5.057815790176392,
|
|
"epoch": 2.8775216138328528,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00041795072614908503,
|
|
"loss": 4.7498,
|
|
"mean_token_accuracy": 0.23042766749858856,
|
|
"num_tokens": 68698280.0,
|
|
"step": 29955
|
|
},
|
|
{
|
|
"entropy": 5.093736410140991,
|
|
"epoch": 2.878001921229587,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041792424951585055,
|
|
"loss": 4.7126,
|
|
"mean_token_accuracy": 0.24679953008890151,
|
|
"num_tokens": 68709424.0,
|
|
"step": 29960
|
|
},
|
|
{
|
|
"entropy": 5.128250551223755,
|
|
"epoch": 2.878482228626321,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041789776956433947,
|
|
"loss": 4.7641,
|
|
"mean_token_accuracy": 0.24207992255687713,
|
|
"num_tokens": 68720785.0,
|
|
"step": 29965
|
|
},
|
|
{
|
|
"entropy": 5.014354610443116,
|
|
"epoch": 2.878962536023055,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00041787128629516645,
|
|
"loss": 4.6963,
|
|
"mean_token_accuracy": 0.23785762786865233,
|
|
"num_tokens": 68732157.0,
|
|
"step": 29970
|
|
},
|
|
{
|
|
"entropy": 5.011946821212769,
|
|
"epoch": 2.8794428434197887,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004178447997089464,
|
|
"loss": 4.7272,
|
|
"mean_token_accuracy": 0.24545604437589646,
|
|
"num_tokens": 68743604.0,
|
|
"step": 29975
|
|
},
|
|
{
|
|
"entropy": 5.0876930236816404,
|
|
"epoch": 2.8799231508165226,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004178183098062943,
|
|
"loss": 4.7432,
|
|
"mean_token_accuracy": 0.23434943705797195,
|
|
"num_tokens": 68755056.0,
|
|
"step": 29980
|
|
},
|
|
{
|
|
"entropy": 5.081747007369995,
|
|
"epoch": 2.8804034582132565,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004177918165878251,
|
|
"loss": 4.749,
|
|
"mean_token_accuracy": 0.23806031793355942,
|
|
"num_tokens": 68766390.0,
|
|
"step": 29985
|
|
},
|
|
{
|
|
"entropy": 5.096911478042602,
|
|
"epoch": 2.8808837656099904,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004177653200541539,
|
|
"loss": 4.7508,
|
|
"mean_token_accuracy": 0.23880307376384735,
|
|
"num_tokens": 68778780.0,
|
|
"step": 29990
|
|
},
|
|
{
|
|
"entropy": 5.076303577423095,
|
|
"epoch": 2.8813640730067243,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000417738820205896,
|
|
"loss": 4.6311,
|
|
"mean_token_accuracy": 0.24493061006069183,
|
|
"num_tokens": 68789545.0,
|
|
"step": 29995
|
|
},
|
|
{
|
|
"entropy": 5.038773441314698,
|
|
"epoch": 2.881844380403458,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004177123170436665,
|
|
"loss": 4.6587,
|
|
"mean_token_accuracy": 0.24653734415769576,
|
|
"num_tokens": 68802231.0,
|
|
"step": 30000
|
|
},
|
|
{
|
|
"epoch": 2.881844380403458,
|
|
"eval_entropy": 4.925570929416617,
|
|
"eval_loss": 4.830386638641357,
|
|
"eval_mean_token_accuracy": 0.24216745788007177,
|
|
"eval_num_tokens": 68802231.0,
|
|
"eval_runtime": 26.6573,
|
|
"eval_samples_per_second": 1230.996,
|
|
"eval_steps_per_second": 153.879,
|
|
"step": 30000
|
|
},
|
|
{
|
|
"entropy": 5.199371862411499,
|
|
"epoch": 2.882324687800192,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004176858105680807,
|
|
"loss": 4.8046,
|
|
"mean_token_accuracy": 0.2428019016981125,
|
|
"num_tokens": 68812424.0,
|
|
"step": 30005
|
|
},
|
|
{
|
|
"entropy": 5.091816568374634,
|
|
"epoch": 2.882804995196926,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00041765930077975415,
|
|
"loss": 4.6796,
|
|
"mean_token_accuracy": 0.2414682224392891,
|
|
"num_tokens": 68823770.0,
|
|
"step": 30010
|
|
},
|
|
{
|
|
"entropy": 5.043640804290772,
|
|
"epoch": 2.88328530259366,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041763278767930213,
|
|
"loss": 4.6956,
|
|
"mean_token_accuracy": 0.23768699169158936,
|
|
"num_tokens": 68835007.0,
|
|
"step": 30015
|
|
},
|
|
{
|
|
"entropy": 4.995925331115723,
|
|
"epoch": 2.8837656099903937,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004176062712673404,
|
|
"loss": 4.6096,
|
|
"mean_token_accuracy": 0.2436898961663246,
|
|
"num_tokens": 68847070.0,
|
|
"step": 30020
|
|
},
|
|
{
|
|
"entropy": 5.06178035736084,
|
|
"epoch": 2.884245917387128,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004175797515444845,
|
|
"loss": 4.6785,
|
|
"mean_token_accuracy": 0.24139561355113984,
|
|
"num_tokens": 68857197.0,
|
|
"step": 30025
|
|
},
|
|
{
|
|
"entropy": 5.091560554504395,
|
|
"epoch": 2.8847262247838614,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004175532285113501,
|
|
"loss": 4.7369,
|
|
"mean_token_accuracy": 0.2393686965107918,
|
|
"num_tokens": 68869719.0,
|
|
"step": 30030
|
|
},
|
|
{
|
|
"entropy": 5.039063549041748,
|
|
"epoch": 2.8852065321805958,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004175267021685531,
|
|
"loss": 4.7125,
|
|
"mean_token_accuracy": 0.24368225634098054,
|
|
"num_tokens": 68880300.0,
|
|
"step": 30035
|
|
},
|
|
{
|
|
"entropy": 5.0167113780975345,
|
|
"epoch": 2.8856868395773296,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041750017251670926,
|
|
"loss": 4.6726,
|
|
"mean_token_accuracy": 0.24538416266441346,
|
|
"num_tokens": 68892616.0,
|
|
"step": 30040
|
|
},
|
|
{
|
|
"entropy": 5.151379156112671,
|
|
"epoch": 2.8861671469740635,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004174736395564345,
|
|
"loss": 4.8745,
|
|
"mean_token_accuracy": 0.2226516544818878,
|
|
"num_tokens": 68904595.0,
|
|
"step": 30045
|
|
},
|
|
{
|
|
"entropy": 5.084953117370605,
|
|
"epoch": 2.8866474543707974,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00041744710328834493,
|
|
"loss": 4.7178,
|
|
"mean_token_accuracy": 0.23825270533561707,
|
|
"num_tokens": 68917036.0,
|
|
"step": 30050
|
|
},
|
|
{
|
|
"entropy": 5.0371910572052006,
|
|
"epoch": 2.8871277617675313,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041742056371305665,
|
|
"loss": 4.6936,
|
|
"mean_token_accuracy": 0.23949169963598252,
|
|
"num_tokens": 68928216.0,
|
|
"step": 30055
|
|
},
|
|
{
|
|
"entropy": 5.035665798187256,
|
|
"epoch": 2.887608069164265,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041739402083118576,
|
|
"loss": 4.7231,
|
|
"mean_token_accuracy": 0.23918746560811996,
|
|
"num_tokens": 68939806.0,
|
|
"step": 30060
|
|
},
|
|
{
|
|
"entropy": 5.039865207672119,
|
|
"epoch": 2.888088376560999,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004173674746433485,
|
|
"loss": 4.6532,
|
|
"mean_token_accuracy": 0.24816398173570633,
|
|
"num_tokens": 68950402.0,
|
|
"step": 30065
|
|
},
|
|
{
|
|
"entropy": 5.079453849792481,
|
|
"epoch": 2.888568683957733,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00041734092515016127,
|
|
"loss": 4.7442,
|
|
"mean_token_accuracy": 0.23737122267484664,
|
|
"num_tokens": 68961472.0,
|
|
"step": 30070
|
|
},
|
|
{
|
|
"entropy": 5.094537591934204,
|
|
"epoch": 2.889048991354467,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041731437235224036,
|
|
"loss": 4.8154,
|
|
"mean_token_accuracy": 0.23389466851949692,
|
|
"num_tokens": 68972844.0,
|
|
"step": 30075
|
|
},
|
|
{
|
|
"entropy": 5.091789960861206,
|
|
"epoch": 2.8895292987512007,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004172878162502023,
|
|
"loss": 4.7876,
|
|
"mean_token_accuracy": 0.23112728744745253,
|
|
"num_tokens": 68985219.0,
|
|
"step": 30080
|
|
},
|
|
{
|
|
"entropy": 5.134024047851563,
|
|
"epoch": 2.8900096061479346,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00041726125684466374,
|
|
"loss": 4.7597,
|
|
"mean_token_accuracy": 0.24076538532972336,
|
|
"num_tokens": 68997243.0,
|
|
"step": 30085
|
|
},
|
|
{
|
|
"entropy": 5.191966152191162,
|
|
"epoch": 2.8904899135446684,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004172346941362412,
|
|
"loss": 4.8106,
|
|
"mean_token_accuracy": 0.22986829429864883,
|
|
"num_tokens": 69008325.0,
|
|
"step": 30090
|
|
},
|
|
{
|
|
"entropy": 5.186703205108643,
|
|
"epoch": 2.8909702209414023,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041720812812555137,
|
|
"loss": 4.8409,
|
|
"mean_token_accuracy": 0.2290444403886795,
|
|
"num_tokens": 69020203.0,
|
|
"step": 30095
|
|
},
|
|
{
|
|
"entropy": 5.029536485671997,
|
|
"epoch": 2.8914505283381366,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004171815588132111,
|
|
"loss": 4.699,
|
|
"mean_token_accuracy": 0.2431316554546356,
|
|
"num_tokens": 69032538.0,
|
|
"step": 30100
|
|
},
|
|
{
|
|
"entropy": 5.016508913040161,
|
|
"epoch": 2.89193083573487,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004171549861998372,
|
|
"loss": 4.6994,
|
|
"mean_token_accuracy": 0.2457281082868576,
|
|
"num_tokens": 69045106.0,
|
|
"step": 30105
|
|
},
|
|
{
|
|
"entropy": 5.064055061340332,
|
|
"epoch": 2.8924111431316044,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004171284102860467,
|
|
"loss": 4.727,
|
|
"mean_token_accuracy": 0.23846855461597444,
|
|
"num_tokens": 69056526.0,
|
|
"step": 30110
|
|
},
|
|
{
|
|
"entropy": 5.085626649856567,
|
|
"epoch": 2.8928914505283383,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004171018310724565,
|
|
"loss": 4.6923,
|
|
"mean_token_accuracy": 0.24671979546546935,
|
|
"num_tokens": 69067497.0,
|
|
"step": 30115
|
|
},
|
|
{
|
|
"entropy": 5.1378021240234375,
|
|
"epoch": 2.893371757925072,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004170752485596838,
|
|
"loss": 4.6944,
|
|
"mean_token_accuracy": 0.23808026313781738,
|
|
"num_tokens": 69078114.0,
|
|
"step": 30120
|
|
},
|
|
{
|
|
"entropy": 5.032099866867066,
|
|
"epoch": 2.893852065321806,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00041704866274834557,
|
|
"loss": 4.6707,
|
|
"mean_token_accuracy": 0.23985347300767898,
|
|
"num_tokens": 69089329.0,
|
|
"step": 30125
|
|
},
|
|
{
|
|
"entropy": 5.136480760574341,
|
|
"epoch": 2.89433237271854,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041702207363905933,
|
|
"loss": 4.8024,
|
|
"mean_token_accuracy": 0.22398771792650224,
|
|
"num_tokens": 69101487.0,
|
|
"step": 30130
|
|
},
|
|
{
|
|
"entropy": 5.101910257339478,
|
|
"epoch": 2.894812680115274,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041699548123244216,
|
|
"loss": 4.6678,
|
|
"mean_token_accuracy": 0.24428293704986573,
|
|
"num_tokens": 69112297.0,
|
|
"step": 30135
|
|
},
|
|
{
|
|
"entropy": 5.135105895996094,
|
|
"epoch": 2.8952929875120077,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004169688855291116,
|
|
"loss": 4.7799,
|
|
"mean_token_accuracy": 0.2393356144428253,
|
|
"num_tokens": 69123628.0,
|
|
"step": 30140
|
|
},
|
|
{
|
|
"entropy": 4.937651538848877,
|
|
"epoch": 2.8957732949087416,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004169422865296851,
|
|
"loss": 4.598,
|
|
"mean_token_accuracy": 0.24106810986995697,
|
|
"num_tokens": 69135040.0,
|
|
"step": 30145
|
|
},
|
|
{
|
|
"entropy": 5.136594533920288,
|
|
"epoch": 2.8962536023054755,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004169156842347802,
|
|
"loss": 4.8299,
|
|
"mean_token_accuracy": 0.22800944298505782,
|
|
"num_tokens": 69146251.0,
|
|
"step": 30150
|
|
},
|
|
{
|
|
"entropy": 5.176572513580322,
|
|
"epoch": 2.8967339097022093,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004168890786450144,
|
|
"loss": 4.825,
|
|
"mean_token_accuracy": 0.23472704142332076,
|
|
"num_tokens": 69157040.0,
|
|
"step": 30155
|
|
},
|
|
{
|
|
"entropy": 5.140036869049072,
|
|
"epoch": 2.8972142170989432,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004168624697610056,
|
|
"loss": 4.7024,
|
|
"mean_token_accuracy": 0.24542928189039231,
|
|
"num_tokens": 69167330.0,
|
|
"step": 30160
|
|
},
|
|
{
|
|
"entropy": 4.981908178329467,
|
|
"epoch": 2.897694524495677,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00041683585758337156,
|
|
"loss": 4.6426,
|
|
"mean_token_accuracy": 0.24661931693553923,
|
|
"num_tokens": 69177903.0,
|
|
"step": 30165
|
|
},
|
|
{
|
|
"entropy": 4.951627206802368,
|
|
"epoch": 2.898174831892411,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004168092421127299,
|
|
"loss": 4.6666,
|
|
"mean_token_accuracy": 0.2440599873661995,
|
|
"num_tokens": 69189946.0,
|
|
"step": 30170
|
|
},
|
|
{
|
|
"entropy": 4.968938875198364,
|
|
"epoch": 2.8986551392891453,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004167826233496989,
|
|
"loss": 4.6074,
|
|
"mean_token_accuracy": 0.25470257848501204,
|
|
"num_tokens": 69201393.0,
|
|
"step": 30175
|
|
},
|
|
{
|
|
"entropy": 5.121877193450928,
|
|
"epoch": 2.8991354466858787,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004167560012948963,
|
|
"loss": 4.768,
|
|
"mean_token_accuracy": 0.23285290002822875,
|
|
"num_tokens": 69213894.0,
|
|
"step": 30180
|
|
},
|
|
{
|
|
"entropy": 5.116933870315552,
|
|
"epoch": 2.899615754082613,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041672937594894034,
|
|
"loss": 4.7454,
|
|
"mean_token_accuracy": 0.2330809399485588,
|
|
"num_tokens": 69224798.0,
|
|
"step": 30185
|
|
},
|
|
{
|
|
"entropy": 5.108615732192993,
|
|
"epoch": 2.9000960614793465,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041670274731244903,
|
|
"loss": 4.7199,
|
|
"mean_token_accuracy": 0.23888099640607835,
|
|
"num_tokens": 69235893.0,
|
|
"step": 30190
|
|
},
|
|
{
|
|
"entropy": 4.994650602340698,
|
|
"epoch": 2.900576368876081,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004166761153860408,
|
|
"loss": 4.6267,
|
|
"mean_token_accuracy": 0.24582148790359498,
|
|
"num_tokens": 69246554.0,
|
|
"step": 30195
|
|
},
|
|
{
|
|
"entropy": 5.008595514297485,
|
|
"epoch": 2.9010566762728147,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00041664948017033383,
|
|
"loss": 4.6906,
|
|
"mean_token_accuracy": 0.24325044751167296,
|
|
"num_tokens": 69259232.0,
|
|
"step": 30200
|
|
},
|
|
{
|
|
"entropy": 5.028008651733399,
|
|
"epoch": 2.9015369836695486,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004166228416659465,
|
|
"loss": 4.6078,
|
|
"mean_token_accuracy": 0.24577678143978118,
|
|
"num_tokens": 69270143.0,
|
|
"step": 30205
|
|
},
|
|
{
|
|
"entropy": 5.063057708740234,
|
|
"epoch": 2.9020172910662825,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041659619987349734,
|
|
"loss": 4.7292,
|
|
"mean_token_accuracy": 0.24639766663312912,
|
|
"num_tokens": 69280702.0,
|
|
"step": 30210
|
|
},
|
|
{
|
|
"entropy": 5.100144577026367,
|
|
"epoch": 2.9024975984630164,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041656955479360487,
|
|
"loss": 4.7,
|
|
"mean_token_accuracy": 0.2368340790271759,
|
|
"num_tokens": 69292605.0,
|
|
"step": 30215
|
|
},
|
|
{
|
|
"entropy": 5.02829761505127,
|
|
"epoch": 2.9029779058597502,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004165429064268877,
|
|
"loss": 4.6562,
|
|
"mean_token_accuracy": 0.24210800975561142,
|
|
"num_tokens": 69303243.0,
|
|
"step": 30220
|
|
},
|
|
{
|
|
"entropy": 5.054769611358642,
|
|
"epoch": 2.903458213256484,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004165162547739646,
|
|
"loss": 4.7601,
|
|
"mean_token_accuracy": 0.23413576632738115,
|
|
"num_tokens": 69315059.0,
|
|
"step": 30225
|
|
},
|
|
{
|
|
"entropy": 5.101757431030274,
|
|
"epoch": 2.903938520653218,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004164895998354542,
|
|
"loss": 4.7719,
|
|
"mean_token_accuracy": 0.23380008190870286,
|
|
"num_tokens": 69327338.0,
|
|
"step": 30230
|
|
},
|
|
{
|
|
"entropy": 5.104634141921997,
|
|
"epoch": 2.904418828049952,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004164629416119755,
|
|
"loss": 4.707,
|
|
"mean_token_accuracy": 0.23432289361953734,
|
|
"num_tokens": 69339955.0,
|
|
"step": 30235
|
|
},
|
|
{
|
|
"entropy": 5.097865915298462,
|
|
"epoch": 2.9048991354466858,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00041643628010414735,
|
|
"loss": 4.7182,
|
|
"mean_token_accuracy": 0.23935549706220627,
|
|
"num_tokens": 69350847.0,
|
|
"step": 30240
|
|
},
|
|
{
|
|
"entropy": 5.073919820785522,
|
|
"epoch": 2.9053794428434196,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041640961531258877,
|
|
"loss": 4.6868,
|
|
"mean_token_accuracy": 0.23589784801006317,
|
|
"num_tokens": 69362066.0,
|
|
"step": 30245
|
|
},
|
|
{
|
|
"entropy": 5.023674488067627,
|
|
"epoch": 2.905859750240154,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004163829472379187,
|
|
"loss": 4.7058,
|
|
"mean_token_accuracy": 0.23785745352506638,
|
|
"num_tokens": 69373239.0,
|
|
"step": 30250
|
|
},
|
|
{
|
|
"entropy": 5.056734991073609,
|
|
"epoch": 2.9063400576368874,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00041635627588075655,
|
|
"loss": 4.6772,
|
|
"mean_token_accuracy": 0.23729420304298401,
|
|
"num_tokens": 69384094.0,
|
|
"step": 30255
|
|
},
|
|
{
|
|
"entropy": 5.007504224777222,
|
|
"epoch": 2.9068203650336217,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004163296012417213,
|
|
"loss": 4.6821,
|
|
"mean_token_accuracy": 0.24327708333730697,
|
|
"num_tokens": 69394667.0,
|
|
"step": 30260
|
|
},
|
|
{
|
|
"entropy": 5.083268547058106,
|
|
"epoch": 2.907300672430355,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041630292332143245,
|
|
"loss": 4.7385,
|
|
"mean_token_accuracy": 0.24209891855716706,
|
|
"num_tokens": 69405146.0,
|
|
"step": 30265
|
|
},
|
|
{
|
|
"entropy": 5.138149356842041,
|
|
"epoch": 2.9077809798270895,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004162762421205093,
|
|
"loss": 4.7803,
|
|
"mean_token_accuracy": 0.24056767225265502,
|
|
"num_tokens": 69416147.0,
|
|
"step": 30270
|
|
},
|
|
{
|
|
"entropy": 5.072920036315918,
|
|
"epoch": 2.9082612872238234,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00041624955763957134,
|
|
"loss": 4.7042,
|
|
"mean_token_accuracy": 0.24318288415670394,
|
|
"num_tokens": 69426065.0,
|
|
"step": 30275
|
|
},
|
|
{
|
|
"entropy": 5.004269218444824,
|
|
"epoch": 2.9087415946205573,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004162228698792381,
|
|
"loss": 4.6758,
|
|
"mean_token_accuracy": 0.24937786161899567,
|
|
"num_tokens": 69437266.0,
|
|
"step": 30280
|
|
},
|
|
{
|
|
"entropy": 5.061780214309692,
|
|
"epoch": 2.909221902017291,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041619617884012904,
|
|
"loss": 4.7524,
|
|
"mean_token_accuracy": 0.24395810514688493,
|
|
"num_tokens": 69449539.0,
|
|
"step": 30285
|
|
},
|
|
{
|
|
"entropy": 5.095350646972657,
|
|
"epoch": 2.909702209414025,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004161694845228641,
|
|
"loss": 4.7695,
|
|
"mean_token_accuracy": 0.23942003697156905,
|
|
"num_tokens": 69460515.0,
|
|
"step": 30290
|
|
},
|
|
{
|
|
"entropy": 5.1438220024108885,
|
|
"epoch": 2.910182516810759,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004161427869280628,
|
|
"loss": 4.8374,
|
|
"mean_token_accuracy": 0.2357712507247925,
|
|
"num_tokens": 69472997.0,
|
|
"step": 30295
|
|
},
|
|
{
|
|
"entropy": 5.122387933731079,
|
|
"epoch": 2.910662824207493,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041611608605634517,
|
|
"loss": 4.7841,
|
|
"mean_token_accuracy": 0.2316685900092125,
|
|
"num_tokens": 69485118.0,
|
|
"step": 30300
|
|
},
|
|
{
|
|
"entropy": 5.07972412109375,
|
|
"epoch": 2.9111431316042267,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000416089381908331,
|
|
"loss": 4.7233,
|
|
"mean_token_accuracy": 0.24024315625429155,
|
|
"num_tokens": 69495696.0,
|
|
"step": 30305
|
|
},
|
|
{
|
|
"entropy": 4.99944052696228,
|
|
"epoch": 2.9116234390009605,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004160626744846404,
|
|
"loss": 4.5839,
|
|
"mean_token_accuracy": 0.2539135843515396,
|
|
"num_tokens": 69507832.0,
|
|
"step": 30310
|
|
},
|
|
{
|
|
"entropy": 5.075605297088623,
|
|
"epoch": 2.9121037463976944,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004160359637858933,
|
|
"loss": 4.7946,
|
|
"mean_token_accuracy": 0.23319020718336106,
|
|
"num_tokens": 69520180.0,
|
|
"step": 30315
|
|
},
|
|
{
|
|
"entropy": 5.02910418510437,
|
|
"epoch": 2.9125840537944283,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00041600924981270997,
|
|
"loss": 4.6737,
|
|
"mean_token_accuracy": 0.24387964457273484,
|
|
"num_tokens": 69531201.0,
|
|
"step": 30320
|
|
},
|
|
{
|
|
"entropy": 5.127144908905029,
|
|
"epoch": 2.913064361191162,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041598253256571057,
|
|
"loss": 4.7544,
|
|
"mean_token_accuracy": 0.24092497825622558,
|
|
"num_tokens": 69542552.0,
|
|
"step": 30325
|
|
},
|
|
{
|
|
"entropy": 5.0829455852508545,
|
|
"epoch": 2.913544668587896,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004159558120455154,
|
|
"loss": 4.6802,
|
|
"mean_token_accuracy": 0.24230518639087678,
|
|
"num_tokens": 69554424.0,
|
|
"step": 30330
|
|
},
|
|
{
|
|
"entropy": 4.9297326564788815,
|
|
"epoch": 2.9140249759846304,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004159290882527448,
|
|
"loss": 4.5564,
|
|
"mean_token_accuracy": 0.2559991866350174,
|
|
"num_tokens": 69565043.0,
|
|
"step": 30335
|
|
},
|
|
{
|
|
"entropy": 5.063700532913208,
|
|
"epoch": 2.914505283381364,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004159023611880192,
|
|
"loss": 4.6943,
|
|
"mean_token_accuracy": 0.23595308661460876,
|
|
"num_tokens": 69576734.0,
|
|
"step": 30340
|
|
},
|
|
{
|
|
"entropy": 5.1198193550109865,
|
|
"epoch": 2.914985590778098,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004158756308519592,
|
|
"loss": 4.7153,
|
|
"mean_token_accuracy": 0.23583280593156813,
|
|
"num_tokens": 69588565.0,
|
|
"step": 30345
|
|
},
|
|
{
|
|
"entropy": 5.076626873016357,
|
|
"epoch": 2.915465898174832,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041584889724518545,
|
|
"loss": 4.7295,
|
|
"mean_token_accuracy": 0.24376252442598342,
|
|
"num_tokens": 69600595.0,
|
|
"step": 30350
|
|
},
|
|
{
|
|
"entropy": 5.013409376144409,
|
|
"epoch": 2.915946205571566,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00041582216036831844,
|
|
"loss": 4.6144,
|
|
"mean_token_accuracy": 0.2499750643968582,
|
|
"num_tokens": 69611091.0,
|
|
"step": 30355
|
|
},
|
|
{
|
|
"entropy": 5.137284612655639,
|
|
"epoch": 2.9164265129683,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000415795420221979,
|
|
"loss": 4.7759,
|
|
"mean_token_accuracy": 0.2369133248925209,
|
|
"num_tokens": 69621882.0,
|
|
"step": 30360
|
|
},
|
|
{
|
|
"entropy": 5.088633012771607,
|
|
"epoch": 2.9169068203650337,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041576867680678803,
|
|
"loss": 4.7074,
|
|
"mean_token_accuracy": 0.2345858931541443,
|
|
"num_tokens": 69634673.0,
|
|
"step": 30365
|
|
},
|
|
{
|
|
"entropy": 5.076688671112061,
|
|
"epoch": 2.9173871277617676,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004157419301233664,
|
|
"loss": 4.7224,
|
|
"mean_token_accuracy": 0.24228535294532777,
|
|
"num_tokens": 69644950.0,
|
|
"step": 30370
|
|
},
|
|
{
|
|
"entropy": 5.0582098960876465,
|
|
"epoch": 2.9178674351585014,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00041571518017233505,
|
|
"loss": 4.7228,
|
|
"mean_token_accuracy": 0.2429313540458679,
|
|
"num_tokens": 69656750.0,
|
|
"step": 30375
|
|
},
|
|
{
|
|
"entropy": 5.109919261932373,
|
|
"epoch": 2.9183477425552353,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004156884269543151,
|
|
"loss": 4.6876,
|
|
"mean_token_accuracy": 0.24676658511161803,
|
|
"num_tokens": 69667945.0,
|
|
"step": 30380
|
|
},
|
|
{
|
|
"entropy": 5.120557546615601,
|
|
"epoch": 2.918828049951969,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004156616704699275,
|
|
"loss": 4.7351,
|
|
"mean_token_accuracy": 0.24469823092222215,
|
|
"num_tokens": 69679995.0,
|
|
"step": 30385
|
|
},
|
|
{
|
|
"entropy": 5.047893619537353,
|
|
"epoch": 2.919308357348703,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041563491071979375,
|
|
"loss": 4.6986,
|
|
"mean_token_accuracy": 0.24317895323038102,
|
|
"num_tokens": 69691029.0,
|
|
"step": 30390
|
|
},
|
|
{
|
|
"entropy": 5.051618337631226,
|
|
"epoch": 2.919788664745437,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00041560814770453495,
|
|
"loss": 4.638,
|
|
"mean_token_accuracy": 0.24778493344783784,
|
|
"num_tokens": 69702497.0,
|
|
"step": 30395
|
|
},
|
|
{
|
|
"entropy": 5.079649162292481,
|
|
"epoch": 2.920268972142171,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00041558138142477235,
|
|
"loss": 4.7508,
|
|
"mean_token_accuracy": 0.23697479963302612,
|
|
"num_tokens": 69713127.0,
|
|
"step": 30400
|
|
},
|
|
{
|
|
"entropy": 5.047370290756225,
|
|
"epoch": 2.9207492795389047,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041555461188112763,
|
|
"loss": 4.7005,
|
|
"mean_token_accuracy": 0.24249713867902756,
|
|
"num_tokens": 69725314.0,
|
|
"step": 30405
|
|
},
|
|
{
|
|
"entropy": 4.973897171020508,
|
|
"epoch": 2.921229586935639,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00041552783907422217,
|
|
"loss": 4.6301,
|
|
"mean_token_accuracy": 0.24654280990362168,
|
|
"num_tokens": 69736696.0,
|
|
"step": 30410
|
|
},
|
|
{
|
|
"entropy": 5.017396688461304,
|
|
"epoch": 2.9217098943323725,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004155010630046775,
|
|
"loss": 4.681,
|
|
"mean_token_accuracy": 0.24379423260688782,
|
|
"num_tokens": 69748639.0,
|
|
"step": 30415
|
|
},
|
|
{
|
|
"entropy": 5.086289119720459,
|
|
"epoch": 2.922190201729107,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004154742836731155,
|
|
"loss": 4.7823,
|
|
"mean_token_accuracy": 0.23747721016407014,
|
|
"num_tokens": 69760124.0,
|
|
"step": 30420
|
|
},
|
|
{
|
|
"entropy": 5.0709563255310055,
|
|
"epoch": 2.9226705091258407,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004154475010801576,
|
|
"loss": 4.7064,
|
|
"mean_token_accuracy": 0.2438757374882698,
|
|
"num_tokens": 69772280.0,
|
|
"step": 30425
|
|
},
|
|
{
|
|
"entropy": 5.042429447174072,
|
|
"epoch": 2.9231508165225746,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041542071522642583,
|
|
"loss": 4.692,
|
|
"mean_token_accuracy": 0.24266358464956284,
|
|
"num_tokens": 69784308.0,
|
|
"step": 30430
|
|
},
|
|
{
|
|
"entropy": 5.0773824691772464,
|
|
"epoch": 2.9236311239193085,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004153939261125421,
|
|
"loss": 4.744,
|
|
"mean_token_accuracy": 0.23616353273391724,
|
|
"num_tokens": 69795761.0,
|
|
"step": 30435
|
|
},
|
|
{
|
|
"entropy": 5.0080304622650145,
|
|
"epoch": 2.9241114313160423,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004153671337391281,
|
|
"loss": 4.5837,
|
|
"mean_token_accuracy": 0.2585609257221222,
|
|
"num_tokens": 69805581.0,
|
|
"step": 30440
|
|
},
|
|
{
|
|
"entropy": 5.037991571426391,
|
|
"epoch": 2.9245917387127762,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004153403381068062,
|
|
"loss": 4.7104,
|
|
"mean_token_accuracy": 0.2361886367201805,
|
|
"num_tokens": 69816593.0,
|
|
"step": 30445
|
|
},
|
|
{
|
|
"entropy": 5.021226501464843,
|
|
"epoch": 2.92507204610951,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041531353921619833,
|
|
"loss": 4.7122,
|
|
"mean_token_accuracy": 0.24069665968418122,
|
|
"num_tokens": 69828424.0,
|
|
"step": 30450
|
|
},
|
|
{
|
|
"entropy": 5.104537153244019,
|
|
"epoch": 2.925552353506244,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004152867370679267,
|
|
"loss": 4.6893,
|
|
"mean_token_accuracy": 0.24043979048728942,
|
|
"num_tokens": 69839250.0,
|
|
"step": 30455
|
|
},
|
|
{
|
|
"entropy": 5.0309265613555905,
|
|
"epoch": 2.926032660902978,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041525993166261366,
|
|
"loss": 4.6449,
|
|
"mean_token_accuracy": 0.24812211096286774,
|
|
"num_tokens": 69850910.0,
|
|
"step": 30460
|
|
},
|
|
{
|
|
"entropy": 5.049241256713867,
|
|
"epoch": 2.9265129682997117,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004152331230008814,
|
|
"loss": 4.6886,
|
|
"mean_token_accuracy": 0.24405804723501207,
|
|
"num_tokens": 69863048.0,
|
|
"step": 30465
|
|
},
|
|
{
|
|
"entropy": 5.099570608139038,
|
|
"epoch": 2.9269932756964456,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041520631108335254,
|
|
"loss": 4.7521,
|
|
"mean_token_accuracy": 0.23641604334115982,
|
|
"num_tokens": 69873512.0,
|
|
"step": 30470
|
|
},
|
|
{
|
|
"entropy": 5.15320987701416,
|
|
"epoch": 2.9274735830931795,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004151794959106494,
|
|
"loss": 4.8425,
|
|
"mean_token_accuracy": 0.23239615708589553,
|
|
"num_tokens": 69884813.0,
|
|
"step": 30475
|
|
},
|
|
{
|
|
"entropy": 5.053913259506226,
|
|
"epoch": 2.9279538904899134,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004151526774833947,
|
|
"loss": 4.7175,
|
|
"mean_token_accuracy": 0.24499699771404265,
|
|
"num_tokens": 69896832.0,
|
|
"step": 30480
|
|
},
|
|
{
|
|
"entropy": 5.05629334449768,
|
|
"epoch": 2.9284341978866477,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00041512585580221086,
|
|
"loss": 4.6807,
|
|
"mean_token_accuracy": 0.24927377551794053,
|
|
"num_tokens": 69908181.0,
|
|
"step": 30485
|
|
},
|
|
{
|
|
"entropy": 5.101822566986084,
|
|
"epoch": 2.928914505283381,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004150990308677208,
|
|
"loss": 4.7579,
|
|
"mean_token_accuracy": 0.23209206461906434,
|
|
"num_tokens": 69918839.0,
|
|
"step": 30490
|
|
},
|
|
{
|
|
"entropy": 5.16028847694397,
|
|
"epoch": 2.9293948126801155,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041507220268054737,
|
|
"loss": 4.8004,
|
|
"mean_token_accuracy": 0.23220235705375672,
|
|
"num_tokens": 69929637.0,
|
|
"step": 30495
|
|
},
|
|
{
|
|
"entropy": 5.0922932624816895,
|
|
"epoch": 2.929875120076849,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004150453712413131,
|
|
"loss": 4.7395,
|
|
"mean_token_accuracy": 0.23931036442518233,
|
|
"num_tokens": 69940202.0,
|
|
"step": 30500
|
|
},
|
|
{
|
|
"entropy": 5.084680032730103,
|
|
"epoch": 2.9303554274735832,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00041501853655064134,
|
|
"loss": 4.6797,
|
|
"mean_token_accuracy": 0.24100466072559357,
|
|
"num_tokens": 69951049.0,
|
|
"step": 30505
|
|
},
|
|
{
|
|
"entropy": 5.038010835647583,
|
|
"epoch": 2.930835734870317,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004149916986091548,
|
|
"loss": 4.7313,
|
|
"mean_token_accuracy": 0.23879321068525314,
|
|
"num_tokens": 69962950.0,
|
|
"step": 30510
|
|
},
|
|
{
|
|
"entropy": 5.068257236480713,
|
|
"epoch": 2.931316042267051,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004149648574174768,
|
|
"loss": 4.7867,
|
|
"mean_token_accuracy": 0.23279785811901094,
|
|
"num_tokens": 69974961.0,
|
|
"step": 30515
|
|
},
|
|
{
|
|
"entropy": 5.1210166931152346,
|
|
"epoch": 2.931796349663785,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004149380129762304,
|
|
"loss": 4.7003,
|
|
"mean_token_accuracy": 0.24840328395366668,
|
|
"num_tokens": 69985184.0,
|
|
"step": 30520
|
|
},
|
|
{
|
|
"entropy": 4.960989856719971,
|
|
"epoch": 2.9322766570605188,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004149111652860389,
|
|
"loss": 4.6017,
|
|
"mean_token_accuracy": 0.2513887107372284,
|
|
"num_tokens": 69996773.0,
|
|
"step": 30525
|
|
},
|
|
{
|
|
"entropy": 5.100791120529175,
|
|
"epoch": 2.9327569644572526,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004148843143475255,
|
|
"loss": 4.72,
|
|
"mean_token_accuracy": 0.24242961257696152,
|
|
"num_tokens": 70008834.0,
|
|
"step": 30530
|
|
},
|
|
{
|
|
"entropy": 5.047762680053711,
|
|
"epoch": 2.9332372718539865,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004148574601613137,
|
|
"loss": 4.6659,
|
|
"mean_token_accuracy": 0.24508444368839263,
|
|
"num_tokens": 70019708.0,
|
|
"step": 30535
|
|
},
|
|
{
|
|
"entropy": 5.204137897491455,
|
|
"epoch": 2.9337175792507204,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004148306027280271,
|
|
"loss": 4.8914,
|
|
"mean_token_accuracy": 0.22802623510360717,
|
|
"num_tokens": 70029680.0,
|
|
"step": 30540
|
|
},
|
|
{
|
|
"entropy": 5.0172882080078125,
|
|
"epoch": 2.9341978866474543,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00041480374204828896,
|
|
"loss": 4.6428,
|
|
"mean_token_accuracy": 0.2494751915335655,
|
|
"num_tokens": 70041473.0,
|
|
"step": 30545
|
|
},
|
|
{
|
|
"entropy": 5.061795854568482,
|
|
"epoch": 2.934678194044188,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00041477687812272314,
|
|
"loss": 4.7419,
|
|
"mean_token_accuracy": 0.24007183611392974,
|
|
"num_tokens": 70051646.0,
|
|
"step": 30550
|
|
},
|
|
{
|
|
"entropy": 5.126728296279907,
|
|
"epoch": 2.935158501440922,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041475001095195324,
|
|
"loss": 4.7568,
|
|
"mean_token_accuracy": 0.23328060656785965,
|
|
"num_tokens": 70063795.0,
|
|
"step": 30555
|
|
},
|
|
{
|
|
"entropy": 5.114735889434814,
|
|
"epoch": 2.9356388088376564,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004147231405366031,
|
|
"loss": 4.692,
|
|
"mean_token_accuracy": 0.24236190021038057,
|
|
"num_tokens": 70074481.0,
|
|
"step": 30560
|
|
},
|
|
{
|
|
"entropy": 4.975852680206299,
|
|
"epoch": 2.93611911623439,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004146962668772965,
|
|
"loss": 4.6963,
|
|
"mean_token_accuracy": 0.24521246701478958,
|
|
"num_tokens": 70085939.0,
|
|
"step": 30565
|
|
},
|
|
{
|
|
"entropy": 5.053282451629639,
|
|
"epoch": 2.936599423631124,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041466938997465744,
|
|
"loss": 4.7489,
|
|
"mean_token_accuracy": 0.23896246999502183,
|
|
"num_tokens": 70097466.0,
|
|
"step": 30570
|
|
},
|
|
{
|
|
"entropy": 5.067724561691284,
|
|
"epoch": 2.9370797310278576,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041464250982930974,
|
|
"loss": 4.6551,
|
|
"mean_token_accuracy": 0.24233511537313462,
|
|
"num_tokens": 70109720.0,
|
|
"step": 30575
|
|
},
|
|
{
|
|
"entropy": 5.153060054779052,
|
|
"epoch": 2.937560038424592,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00041461562644187777,
|
|
"loss": 4.8261,
|
|
"mean_token_accuracy": 0.2364367201924324,
|
|
"num_tokens": 70121101.0,
|
|
"step": 30580
|
|
},
|
|
{
|
|
"entropy": 5.115277433395386,
|
|
"epoch": 2.938040345821326,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041458873981298547,
|
|
"loss": 4.7482,
|
|
"mean_token_accuracy": 0.24553181678056718,
|
|
"num_tokens": 70131512.0,
|
|
"step": 30585
|
|
},
|
|
{
|
|
"entropy": 5.103408145904541,
|
|
"epoch": 2.9385206532180597,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041456184994325714,
|
|
"loss": 4.7965,
|
|
"mean_token_accuracy": 0.23667109608650208,
|
|
"num_tokens": 70142873.0,
|
|
"step": 30590
|
|
},
|
|
{
|
|
"entropy": 5.090587425231933,
|
|
"epoch": 2.9390009606147935,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041453495683331694,
|
|
"loss": 4.7584,
|
|
"mean_token_accuracy": 0.2342955946922302,
|
|
"num_tokens": 70154476.0,
|
|
"step": 30595
|
|
},
|
|
{
|
|
"entropy": 5.174236869812011,
|
|
"epoch": 2.9394812680115274,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00041450806048378954,
|
|
"loss": 4.7685,
|
|
"mean_token_accuracy": 0.23725222200155258,
|
|
"num_tokens": 70165439.0,
|
|
"step": 30600
|
|
},
|
|
{
|
|
"entropy": 5.062460136413574,
|
|
"epoch": 2.9399615754082613,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004144811608952992,
|
|
"loss": 4.7285,
|
|
"mean_token_accuracy": 0.2378845065832138,
|
|
"num_tokens": 70176515.0,
|
|
"step": 30605
|
|
},
|
|
{
|
|
"entropy": 5.014520120620728,
|
|
"epoch": 2.940441882804995,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00041445425806847043,
|
|
"loss": 4.6886,
|
|
"mean_token_accuracy": 0.24345237016677856,
|
|
"num_tokens": 70189064.0,
|
|
"step": 30610
|
|
},
|
|
{
|
|
"entropy": 5.020877265930176,
|
|
"epoch": 2.940922190201729,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041442735200392783,
|
|
"loss": 4.7009,
|
|
"mean_token_accuracy": 0.23655428886413574,
|
|
"num_tokens": 70200984.0,
|
|
"step": 30615
|
|
},
|
|
{
|
|
"entropy": 5.021997499465942,
|
|
"epoch": 2.941402497598463,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004144004427022962,
|
|
"loss": 4.687,
|
|
"mean_token_accuracy": 0.23773080855607986,
|
|
"num_tokens": 70212071.0,
|
|
"step": 30620
|
|
},
|
|
{
|
|
"entropy": 5.130324935913086,
|
|
"epoch": 2.941882804995197,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041437353016420025,
|
|
"loss": 4.7864,
|
|
"mean_token_accuracy": 0.23806737065315248,
|
|
"num_tokens": 70223395.0,
|
|
"step": 30625
|
|
},
|
|
{
|
|
"entropy": 5.040337181091308,
|
|
"epoch": 2.9423631123919307,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004143466143902648,
|
|
"loss": 4.6063,
|
|
"mean_token_accuracy": 0.25084982961416247,
|
|
"num_tokens": 70233130.0,
|
|
"step": 30630
|
|
},
|
|
{
|
|
"entropy": 5.104285478591919,
|
|
"epoch": 2.9428434197886646,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041431969538111463,
|
|
"loss": 4.7762,
|
|
"mean_token_accuracy": 0.23282581716775894,
|
|
"num_tokens": 70245227.0,
|
|
"step": 30635
|
|
},
|
|
{
|
|
"entropy": 5.101534843444824,
|
|
"epoch": 2.9433237271853985,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004142927731373749,
|
|
"loss": 4.6754,
|
|
"mean_token_accuracy": 0.24160946905612946,
|
|
"num_tokens": 70256483.0,
|
|
"step": 30640
|
|
},
|
|
{
|
|
"entropy": 5.066981267929077,
|
|
"epoch": 2.943804034582133,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004142658476596706,
|
|
"loss": 4.7047,
|
|
"mean_token_accuracy": 0.24397629797458648,
|
|
"num_tokens": 70267484.0,
|
|
"step": 30645
|
|
},
|
|
{
|
|
"entropy": 5.141232967376709,
|
|
"epoch": 2.9442843419788662,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041423891894862687,
|
|
"loss": 4.8418,
|
|
"mean_token_accuracy": 0.23121191561222076,
|
|
"num_tokens": 70278137.0,
|
|
"step": 30650
|
|
},
|
|
{
|
|
"entropy": 5.089439582824707,
|
|
"epoch": 2.9447646493756006,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004142119870048689,
|
|
"loss": 4.7201,
|
|
"mean_token_accuracy": 0.2364410549402237,
|
|
"num_tokens": 70289255.0,
|
|
"step": 30655
|
|
},
|
|
{
|
|
"entropy": 5.108114242553711,
|
|
"epoch": 2.9452449567723344,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004141850518290219,
|
|
"loss": 4.7759,
|
|
"mean_token_accuracy": 0.24463301599025727,
|
|
"num_tokens": 70300672.0,
|
|
"step": 30660
|
|
},
|
|
{
|
|
"entropy": 5.047538185119629,
|
|
"epoch": 2.9457252641690683,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041415811342171134,
|
|
"loss": 4.6993,
|
|
"mean_token_accuracy": 0.2412917673587799,
|
|
"num_tokens": 70311964.0,
|
|
"step": 30665
|
|
},
|
|
{
|
|
"entropy": 5.017793035507202,
|
|
"epoch": 2.946205571565802,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004141311717835625,
|
|
"loss": 4.6647,
|
|
"mean_token_accuracy": 0.24118449687957763,
|
|
"num_tokens": 70323354.0,
|
|
"step": 30670
|
|
},
|
|
{
|
|
"entropy": 5.121945190429687,
|
|
"epoch": 2.946685878962536,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041410422691520114,
|
|
"loss": 4.8108,
|
|
"mean_token_accuracy": 0.22432501018047332,
|
|
"num_tokens": 70335432.0,
|
|
"step": 30675
|
|
},
|
|
{
|
|
"entropy": 5.122028827667236,
|
|
"epoch": 2.94716618635927,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00041407727881725265,
|
|
"loss": 4.7586,
|
|
"mean_token_accuracy": 0.2394048422574997,
|
|
"num_tokens": 70347012.0,
|
|
"step": 30680
|
|
},
|
|
{
|
|
"entropy": 5.01928391456604,
|
|
"epoch": 2.947646493756004,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004140503274903426,
|
|
"loss": 4.7025,
|
|
"mean_token_accuracy": 0.2462942734360695,
|
|
"num_tokens": 70358680.0,
|
|
"step": 30685
|
|
},
|
|
{
|
|
"entropy": 4.992564296722412,
|
|
"epoch": 2.9481268011527377,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000414023372935097,
|
|
"loss": 4.5839,
|
|
"mean_token_accuracy": 0.25254774689674375,
|
|
"num_tokens": 70369081.0,
|
|
"step": 30690
|
|
},
|
|
{
|
|
"entropy": 5.092398643493652,
|
|
"epoch": 2.9486071085494716,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041399641515214137,
|
|
"loss": 4.7644,
|
|
"mean_token_accuracy": 0.23195332586765288,
|
|
"num_tokens": 70380831.0,
|
|
"step": 30695
|
|
},
|
|
{
|
|
"entropy": 5.062392044067383,
|
|
"epoch": 2.9490874159462055,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004139694541421018,
|
|
"loss": 4.7069,
|
|
"mean_token_accuracy": 0.24111398011446,
|
|
"num_tokens": 70392403.0,
|
|
"step": 30700
|
|
},
|
|
{
|
|
"entropy": 5.103695678710937,
|
|
"epoch": 2.9495677233429394,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004139424899056042,
|
|
"loss": 4.7827,
|
|
"mean_token_accuracy": 0.23661188781261444,
|
|
"num_tokens": 70404332.0,
|
|
"step": 30705
|
|
},
|
|
{
|
|
"entropy": 5.041910696029663,
|
|
"epoch": 2.9500480307396733,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041391552244327446,
|
|
"loss": 4.6822,
|
|
"mean_token_accuracy": 0.24355427026748658,
|
|
"num_tokens": 70416119.0,
|
|
"step": 30710
|
|
},
|
|
{
|
|
"entropy": 5.151484203338623,
|
|
"epoch": 2.950528338136407,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004138885517557387,
|
|
"loss": 4.7472,
|
|
"mean_token_accuracy": 0.23242215514183046,
|
|
"num_tokens": 70425874.0,
|
|
"step": 30715
|
|
},
|
|
{
|
|
"entropy": 5.123206567764282,
|
|
"epoch": 2.9510086455331415,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004138615778436234,
|
|
"loss": 4.7905,
|
|
"mean_token_accuracy": 0.23205123245716094,
|
|
"num_tokens": 70435755.0,
|
|
"step": 30720
|
|
},
|
|
{
|
|
"entropy": 4.9721211910247805,
|
|
"epoch": 2.951488952929875,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00041383460070755447,
|
|
"loss": 4.5833,
|
|
"mean_token_accuracy": 0.25036960244178774,
|
|
"num_tokens": 70446764.0,
|
|
"step": 30725
|
|
},
|
|
{
|
|
"entropy": 5.033250236511231,
|
|
"epoch": 2.9519692603266092,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041380762034815834,
|
|
"loss": 4.6268,
|
|
"mean_token_accuracy": 0.25020085871219633,
|
|
"num_tokens": 70457635.0,
|
|
"step": 30730
|
|
},
|
|
{
|
|
"entropy": 5.122746801376342,
|
|
"epoch": 2.952449567723343,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00041378063676606147,
|
|
"loss": 4.7993,
|
|
"mean_token_accuracy": 0.23039833158254625,
|
|
"num_tokens": 70469082.0,
|
|
"step": 30735
|
|
},
|
|
{
|
|
"entropy": 5.082587003707886,
|
|
"epoch": 2.952929875120077,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00041375364996189035,
|
|
"loss": 4.7299,
|
|
"mean_token_accuracy": 0.24270468205213547,
|
|
"num_tokens": 70481634.0,
|
|
"step": 30740
|
|
},
|
|
{
|
|
"entropy": 5.108648157119751,
|
|
"epoch": 2.953410182516811,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041372665993627143,
|
|
"loss": 4.76,
|
|
"mean_token_accuracy": 0.24007008969783783,
|
|
"num_tokens": 70493320.0,
|
|
"step": 30745
|
|
},
|
|
{
|
|
"entropy": 5.084290456771851,
|
|
"epoch": 2.9538904899135447,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00041369966668983144,
|
|
"loss": 4.6791,
|
|
"mean_token_accuracy": 0.24068784862756729,
|
|
"num_tokens": 70504998.0,
|
|
"step": 30750
|
|
},
|
|
{
|
|
"entropy": 5.084942245483399,
|
|
"epoch": 2.9543707973102786,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041367267022319706,
|
|
"loss": 4.7026,
|
|
"mean_token_accuracy": 0.24692281186580659,
|
|
"num_tokens": 70516096.0,
|
|
"step": 30755
|
|
},
|
|
{
|
|
"entropy": 5.053485965728759,
|
|
"epoch": 2.9548511047070125,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000413645670536995,
|
|
"loss": 4.7645,
|
|
"mean_token_accuracy": 0.23790201544761658,
|
|
"num_tokens": 70527193.0,
|
|
"step": 30760
|
|
},
|
|
{
|
|
"entropy": 5.153659057617188,
|
|
"epoch": 2.9553314121037464,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004136186676318522,
|
|
"loss": 4.8127,
|
|
"mean_token_accuracy": 0.22736653536558152,
|
|
"num_tokens": 70538733.0,
|
|
"step": 30765
|
|
},
|
|
{
|
|
"entropy": 5.118077564239502,
|
|
"epoch": 2.9558117195004803,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004135916615083956,
|
|
"loss": 4.6531,
|
|
"mean_token_accuracy": 0.24210628718137742,
|
|
"num_tokens": 70549381.0,
|
|
"step": 30770
|
|
},
|
|
{
|
|
"entropy": 5.057273435592651,
|
|
"epoch": 2.956292026897214,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00041356465216725195,
|
|
"loss": 4.7175,
|
|
"mean_token_accuracy": 0.2406172752380371,
|
|
"num_tokens": 70560576.0,
|
|
"step": 30775
|
|
},
|
|
{
|
|
"entropy": 5.035550498962403,
|
|
"epoch": 2.956772334293948,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041353763960904873,
|
|
"loss": 4.703,
|
|
"mean_token_accuracy": 0.24104924649000167,
|
|
"num_tokens": 70572093.0,
|
|
"step": 30780
|
|
},
|
|
{
|
|
"entropy": 5.135087013244629,
|
|
"epoch": 2.957252641690682,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00041351062383441286,
|
|
"loss": 4.6771,
|
|
"mean_token_accuracy": 0.24282461851835252,
|
|
"num_tokens": 70581567.0,
|
|
"step": 30785
|
|
},
|
|
{
|
|
"entropy": 5.044927167892456,
|
|
"epoch": 2.957732949087416,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004134836048439716,
|
|
"loss": 4.7113,
|
|
"mean_token_accuracy": 0.2433431163430214,
|
|
"num_tokens": 70593729.0,
|
|
"step": 30790
|
|
},
|
|
{
|
|
"entropy": 5.036692953109741,
|
|
"epoch": 2.95821325648415,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041345658263835215,
|
|
"loss": 4.7166,
|
|
"mean_token_accuracy": 0.24356502592563628,
|
|
"num_tokens": 70605816.0,
|
|
"step": 30795
|
|
},
|
|
{
|
|
"entropy": 5.0685828685760494,
|
|
"epoch": 2.9586935638808836,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00041342955721818207,
|
|
"loss": 4.7186,
|
|
"mean_token_accuracy": 0.239371594786644,
|
|
"num_tokens": 70618310.0,
|
|
"step": 30800
|
|
},
|
|
{
|
|
"entropy": 5.079177808761597,
|
|
"epoch": 2.959173871277618,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00041340252858408866,
|
|
"loss": 4.7261,
|
|
"mean_token_accuracy": 0.23834066540002824,
|
|
"num_tokens": 70629842.0,
|
|
"step": 30805
|
|
},
|
|
{
|
|
"entropy": 5.098159217834473,
|
|
"epoch": 2.9596541786743513,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041337549673669963,
|
|
"loss": 4.8281,
|
|
"mean_token_accuracy": 0.2345459997653961,
|
|
"num_tokens": 70642564.0,
|
|
"step": 30810
|
|
},
|
|
{
|
|
"entropy": 5.130438327789307,
|
|
"epoch": 2.9601344860710856,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004133484616766423,
|
|
"loss": 4.7584,
|
|
"mean_token_accuracy": 0.2377777561545372,
|
|
"num_tokens": 70653225.0,
|
|
"step": 30815
|
|
},
|
|
{
|
|
"entropy": 5.1525421142578125,
|
|
"epoch": 2.9606147934678195,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041332142340454463,
|
|
"loss": 4.756,
|
|
"mean_token_accuracy": 0.2319066643714905,
|
|
"num_tokens": 70665428.0,
|
|
"step": 30820
|
|
},
|
|
{
|
|
"entropy": 5.115548801422119,
|
|
"epoch": 2.9610951008645534,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004132943819210342,
|
|
"loss": 4.8091,
|
|
"mean_token_accuracy": 0.23419805765151977,
|
|
"num_tokens": 70676441.0,
|
|
"step": 30825
|
|
},
|
|
{
|
|
"entropy": 4.977198314666748,
|
|
"epoch": 2.9615754082612873,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041326733722673876,
|
|
"loss": 4.5935,
|
|
"mean_token_accuracy": 0.25220203697681426,
|
|
"num_tokens": 70687561.0,
|
|
"step": 30830
|
|
},
|
|
{
|
|
"entropy": 5.093676567077637,
|
|
"epoch": 2.962055715658021,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041324028932228645,
|
|
"loss": 4.7668,
|
|
"mean_token_accuracy": 0.23979503959417342,
|
|
"num_tokens": 70699949.0,
|
|
"step": 30835
|
|
},
|
|
{
|
|
"entropy": 5.060846853256225,
|
|
"epoch": 2.962536023054755,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004132132382083051,
|
|
"loss": 4.6801,
|
|
"mean_token_accuracy": 0.24920041114091873,
|
|
"num_tokens": 70711176.0,
|
|
"step": 30840
|
|
},
|
|
{
|
|
"entropy": 5.1384134769439695,
|
|
"epoch": 2.963016330451489,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00041318618388542274,
|
|
"loss": 4.7248,
|
|
"mean_token_accuracy": 0.23304660767316818,
|
|
"num_tokens": 70722974.0,
|
|
"step": 30845
|
|
},
|
|
{
|
|
"entropy": 4.942797088623047,
|
|
"epoch": 2.963496637848223,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004131591263542675,
|
|
"loss": 4.5784,
|
|
"mean_token_accuracy": 0.2528442844748497,
|
|
"num_tokens": 70735158.0,
|
|
"step": 30850
|
|
},
|
|
{
|
|
"entropy": 5.0456760883331295,
|
|
"epoch": 2.9639769452449567,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004131320656154676,
|
|
"loss": 4.7117,
|
|
"mean_token_accuracy": 0.23798534274101257,
|
|
"num_tokens": 70746444.0,
|
|
"step": 30855
|
|
},
|
|
{
|
|
"entropy": 5.114153432846069,
|
|
"epoch": 2.9644572526416906,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004131050016696514,
|
|
"loss": 4.6967,
|
|
"mean_token_accuracy": 0.24223651736974716,
|
|
"num_tokens": 70757985.0,
|
|
"step": 30860
|
|
},
|
|
{
|
|
"entropy": 5.092724561691284,
|
|
"epoch": 2.9649375600384245,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000413077934517447,
|
|
"loss": 4.7346,
|
|
"mean_token_accuracy": 0.23988196402788162,
|
|
"num_tokens": 70769867.0,
|
|
"step": 30865
|
|
},
|
|
{
|
|
"entropy": 5.087793588638306,
|
|
"epoch": 2.9654178674351583,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004130508641594831,
|
|
"loss": 4.7746,
|
|
"mean_token_accuracy": 0.23190059661865234,
|
|
"num_tokens": 70781610.0,
|
|
"step": 30870
|
|
},
|
|
{
|
|
"entropy": 5.124959897994995,
|
|
"epoch": 2.965898174831892,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00041302379059638794,
|
|
"loss": 4.6942,
|
|
"mean_token_accuracy": 0.2405524343252182,
|
|
"num_tokens": 70793336.0,
|
|
"step": 30875
|
|
},
|
|
{
|
|
"entropy": 5.0523522853851315,
|
|
"epoch": 2.9663784822286265,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00041299671382879024,
|
|
"loss": 4.6792,
|
|
"mean_token_accuracy": 0.24535722136497498,
|
|
"num_tokens": 70802714.0,
|
|
"step": 30880
|
|
},
|
|
{
|
|
"entropy": 5.112238979339599,
|
|
"epoch": 2.96685878962536,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004129696338573187,
|
|
"loss": 4.8274,
|
|
"mean_token_accuracy": 0.23021909594535828,
|
|
"num_tokens": 70813300.0,
|
|
"step": 30885
|
|
},
|
|
{
|
|
"entropy": 5.15190544128418,
|
|
"epoch": 2.9673390970220943,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004129425506826019,
|
|
"loss": 4.852,
|
|
"mean_token_accuracy": 0.22498805224895477,
|
|
"num_tokens": 70824828.0,
|
|
"step": 30890
|
|
},
|
|
{
|
|
"entropy": 5.073496341705322,
|
|
"epoch": 2.967819404418828,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041291546430526863,
|
|
"loss": 4.7641,
|
|
"mean_token_accuracy": 0.2451646074652672,
|
|
"num_tokens": 70836788.0,
|
|
"step": 30895
|
|
},
|
|
{
|
|
"entropy": 5.084818410873413,
|
|
"epoch": 2.968299711815562,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004128883747259478,
|
|
"loss": 4.6997,
|
|
"mean_token_accuracy": 0.2437497243285179,
|
|
"num_tokens": 70849308.0,
|
|
"step": 30900
|
|
},
|
|
{
|
|
"entropy": 5.208883285522461,
|
|
"epoch": 2.968780019212296,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004128612819452684,
|
|
"loss": 4.8601,
|
|
"mean_token_accuracy": 0.22244778871536255,
|
|
"num_tokens": 70859987.0,
|
|
"step": 30905
|
|
},
|
|
{
|
|
"entropy": 5.175189065933227,
|
|
"epoch": 2.96926032660903,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00041283418596385944,
|
|
"loss": 4.7633,
|
|
"mean_token_accuracy": 0.24494120478630066,
|
|
"num_tokens": 70872096.0,
|
|
"step": 30910
|
|
},
|
|
{
|
|
"entropy": 5.02183575630188,
|
|
"epoch": 2.9697406340057637,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004128070867823499,
|
|
"loss": 4.6654,
|
|
"mean_token_accuracy": 0.24081777781248093,
|
|
"num_tokens": 70884336.0,
|
|
"step": 30915
|
|
},
|
|
{
|
|
"entropy": 4.991226148605347,
|
|
"epoch": 2.9702209414024976,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000412779984401369,
|
|
"loss": 4.656,
|
|
"mean_token_accuracy": 0.25205512493848803,
|
|
"num_tokens": 70894707.0,
|
|
"step": 30920
|
|
},
|
|
{
|
|
"entropy": 5.095764636993408,
|
|
"epoch": 2.9707012487992315,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000412752878821546,
|
|
"loss": 4.7567,
|
|
"mean_token_accuracy": 0.23996711522340775,
|
|
"num_tokens": 70906461.0,
|
|
"step": 30925
|
|
},
|
|
{
|
|
"entropy": 5.088328266143799,
|
|
"epoch": 2.9711815561959654,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041272577004351026,
|
|
"loss": 4.6148,
|
|
"mean_token_accuracy": 0.24836140722036362,
|
|
"num_tokens": 70917282.0,
|
|
"step": 30930
|
|
},
|
|
{
|
|
"entropy": 5.056382131576538,
|
|
"epoch": 2.9716618635926992,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041269865806789095,
|
|
"loss": 4.6492,
|
|
"mean_token_accuracy": 0.2463774561882019,
|
|
"num_tokens": 70928253.0,
|
|
"step": 30935
|
|
},
|
|
{
|
|
"entropy": 5.009007358551026,
|
|
"epoch": 2.972142170989433,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004126715428953178,
|
|
"loss": 4.6959,
|
|
"mean_token_accuracy": 0.23733440935611724,
|
|
"num_tokens": 70939089.0,
|
|
"step": 30940
|
|
},
|
|
{
|
|
"entropy": 5.088467168807983,
|
|
"epoch": 2.972622478386167,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004126444245264202,
|
|
"loss": 4.6494,
|
|
"mean_token_accuracy": 0.24308509677648543,
|
|
"num_tokens": 70949533.0,
|
|
"step": 30945
|
|
},
|
|
{
|
|
"entropy": 5.080672836303711,
|
|
"epoch": 2.973102785782901,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004126173029618278,
|
|
"loss": 4.6657,
|
|
"mean_token_accuracy": 0.2412404879927635,
|
|
"num_tokens": 70961449.0,
|
|
"step": 30950
|
|
},
|
|
{
|
|
"entropy": 5.045295476913452,
|
|
"epoch": 2.973583093179635,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004125901782021702,
|
|
"loss": 4.7037,
|
|
"mean_token_accuracy": 0.24060867428779603,
|
|
"num_tokens": 70973008.0,
|
|
"step": 30955
|
|
},
|
|
{
|
|
"entropy": 5.009149217605591,
|
|
"epoch": 2.9740634005763686,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004125630502480773,
|
|
"loss": 4.6909,
|
|
"mean_token_accuracy": 0.2449464187026024,
|
|
"num_tokens": 70984170.0,
|
|
"step": 30960
|
|
},
|
|
{
|
|
"entropy": 5.0899590969085695,
|
|
"epoch": 2.974543707973103,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004125359191001788,
|
|
"loss": 4.7568,
|
|
"mean_token_accuracy": 0.23973051458597183,
|
|
"num_tokens": 70994900.0,
|
|
"step": 30965
|
|
},
|
|
{
|
|
"entropy": 5.081603336334228,
|
|
"epoch": 2.975024015369837,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004125087847591047,
|
|
"loss": 4.694,
|
|
"mean_token_accuracy": 0.24175404906272888,
|
|
"num_tokens": 71007430.0,
|
|
"step": 30970
|
|
},
|
|
{
|
|
"entropy": 5.079427099227905,
|
|
"epoch": 2.9755043227665707,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041248164722548493,
|
|
"loss": 4.7742,
|
|
"mean_token_accuracy": 0.23830578327178956,
|
|
"num_tokens": 71020306.0,
|
|
"step": 30975
|
|
},
|
|
{
|
|
"entropy": 5.0432030200958256,
|
|
"epoch": 2.9759846301633046,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004124545064999495,
|
|
"loss": 4.6331,
|
|
"mean_token_accuracy": 0.24684105515480043,
|
|
"num_tokens": 71031114.0,
|
|
"step": 30980
|
|
},
|
|
{
|
|
"entropy": 4.98370327949524,
|
|
"epoch": 2.9764649375600385,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00041242736258312866,
|
|
"loss": 4.6545,
|
|
"mean_token_accuracy": 0.24585938602685928,
|
|
"num_tokens": 71043289.0,
|
|
"step": 30985
|
|
},
|
|
{
|
|
"entropy": 5.014498090744018,
|
|
"epoch": 2.9769452449567724,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004124002154756525,
|
|
"loss": 4.6412,
|
|
"mean_token_accuracy": 0.2520130693912506,
|
|
"num_tokens": 71055163.0,
|
|
"step": 30990
|
|
},
|
|
{
|
|
"entropy": 5.01092038154602,
|
|
"epoch": 2.9774255523535063,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041237306517815124,
|
|
"loss": 4.648,
|
|
"mean_token_accuracy": 0.24476557970046997,
|
|
"num_tokens": 71067830.0,
|
|
"step": 30995
|
|
},
|
|
{
|
|
"entropy": 5.04044828414917,
|
|
"epoch": 2.97790585975024,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004123459116912554,
|
|
"loss": 4.632,
|
|
"mean_token_accuracy": 0.25297962725162504,
|
|
"num_tokens": 71079242.0,
|
|
"step": 31000
|
|
},
|
|
{
|
|
"entropy": 5.093390607833863,
|
|
"epoch": 2.978386167146974,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00041231875501559535,
|
|
"loss": 4.7382,
|
|
"mean_token_accuracy": 0.23652055859565735,
|
|
"num_tokens": 71091076.0,
|
|
"step": 31005
|
|
},
|
|
{
|
|
"entropy": 5.1234206199646,
|
|
"epoch": 2.978866474543708,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00041229159515180155,
|
|
"loss": 4.6592,
|
|
"mean_token_accuracy": 0.2390012726187706,
|
|
"num_tokens": 71101226.0,
|
|
"step": 31010
|
|
},
|
|
{
|
|
"entropy": 5.055122804641724,
|
|
"epoch": 2.979346781940442,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004122644321005046,
|
|
"loss": 4.5985,
|
|
"mean_token_accuracy": 0.24532987773418427,
|
|
"num_tokens": 71112689.0,
|
|
"step": 31015
|
|
},
|
|
{
|
|
"entropy": 5.007474517822265,
|
|
"epoch": 2.9798270893371757,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00041223726586233505,
|
|
"loss": 4.7317,
|
|
"mean_token_accuracy": 0.23529644906520844,
|
|
"num_tokens": 71124256.0,
|
|
"step": 31020
|
|
},
|
|
{
|
|
"entropy": 5.05922384262085,
|
|
"epoch": 2.9803073967339095,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00041221009643792377,
|
|
"loss": 4.7223,
|
|
"mean_token_accuracy": 0.24540119916200637,
|
|
"num_tokens": 71134636.0,
|
|
"step": 31025
|
|
},
|
|
{
|
|
"entropy": 5.042840385437012,
|
|
"epoch": 2.980787704130644,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004121829238279014,
|
|
"loss": 4.6302,
|
|
"mean_token_accuracy": 0.2476649507880211,
|
|
"num_tokens": 71145969.0,
|
|
"step": 31030
|
|
},
|
|
{
|
|
"entropy": 5.046140527725219,
|
|
"epoch": 2.9812680115273773,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00041215574803289896,
|
|
"loss": 4.7102,
|
|
"mean_token_accuracy": 0.24303379356861116,
|
|
"num_tokens": 71157491.0,
|
|
"step": 31035
|
|
},
|
|
{
|
|
"entropy": 5.035579776763916,
|
|
"epoch": 2.9817483189241116,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004121285690535473,
|
|
"loss": 4.759,
|
|
"mean_token_accuracy": 0.23988830000162126,
|
|
"num_tokens": 71169325.0,
|
|
"step": 31040
|
|
},
|
|
{
|
|
"entropy": 5.052158689498901,
|
|
"epoch": 2.982228626320845,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00041210138689047745,
|
|
"loss": 4.7013,
|
|
"mean_token_accuracy": 0.24333547949790954,
|
|
"num_tokens": 71181179.0,
|
|
"step": 31045
|
|
},
|
|
{
|
|
"entropy": 5.0599668502807615,
|
|
"epoch": 2.9827089337175794,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004120742015443206,
|
|
"loss": 4.7178,
|
|
"mean_token_accuracy": 0.2443860277533531,
|
|
"num_tokens": 71193250.0,
|
|
"step": 31050
|
|
},
|
|
{
|
|
"entropy": 4.984365558624267,
|
|
"epoch": 2.9831892411143133,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004120470130157077,
|
|
"loss": 4.6182,
|
|
"mean_token_accuracy": 0.25142668187618256,
|
|
"num_tokens": 71207544.0,
|
|
"step": 31055
|
|
},
|
|
{
|
|
"entropy": 5.091627693176269,
|
|
"epoch": 2.983669548511047,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00041201982130527006,
|
|
"loss": 4.7437,
|
|
"mean_token_accuracy": 0.2435833767056465,
|
|
"num_tokens": 71219989.0,
|
|
"step": 31060
|
|
},
|
|
{
|
|
"entropy": 5.089881372451782,
|
|
"epoch": 2.984149855907781,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041199262641363914,
|
|
"loss": 4.7047,
|
|
"mean_token_accuracy": 0.23696542531251907,
|
|
"num_tokens": 71232455.0,
|
|
"step": 31065
|
|
},
|
|
{
|
|
"entropy": 5.069763135910034,
|
|
"epoch": 2.984630163304515,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00041196542834144617,
|
|
"loss": 4.6223,
|
|
"mean_token_accuracy": 0.24865677803754807,
|
|
"num_tokens": 71244179.0,
|
|
"step": 31070
|
|
},
|
|
{
|
|
"entropy": 5.01129002571106,
|
|
"epoch": 2.985110470701249,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00041193822708932265,
|
|
"loss": 4.7036,
|
|
"mean_token_accuracy": 0.24350427836179733,
|
|
"num_tokens": 71255468.0,
|
|
"step": 31075
|
|
},
|
|
{
|
|
"entropy": 5.0638265132904055,
|
|
"epoch": 2.9855907780979827,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004119110226579002,
|
|
"loss": 4.7421,
|
|
"mean_token_accuracy": 0.24235102981328965,
|
|
"num_tokens": 71266192.0,
|
|
"step": 31080
|
|
},
|
|
{
|
|
"entropy": 5.00823655128479,
|
|
"epoch": 2.9860710854947166,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041188381504781026,
|
|
"loss": 4.6432,
|
|
"mean_token_accuracy": 0.24189938753843307,
|
|
"num_tokens": 71277166.0,
|
|
"step": 31085
|
|
},
|
|
{
|
|
"entropy": 5.074084186553955,
|
|
"epoch": 2.9865513928914504,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004118566042596846,
|
|
"loss": 4.6538,
|
|
"mean_token_accuracy": 0.2429642543196678,
|
|
"num_tokens": 71288077.0,
|
|
"step": 31090
|
|
},
|
|
{
|
|
"entropy": 5.0013875484466555,
|
|
"epoch": 2.9870317002881843,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000411829390294155,
|
|
"loss": 4.6541,
|
|
"mean_token_accuracy": 0.2529631584882736,
|
|
"num_tokens": 71300423.0,
|
|
"step": 31095
|
|
},
|
|
{
|
|
"entropy": 5.069975185394287,
|
|
"epoch": 2.987512007684918,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00041180217315185333,
|
|
"loss": 4.6413,
|
|
"mean_token_accuracy": 0.24462012946605682,
|
|
"num_tokens": 71312265.0,
|
|
"step": 31100
|
|
},
|
|
{
|
|
"entropy": 4.979261779785157,
|
|
"epoch": 2.9879923150816525,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00041177495283341124,
|
|
"loss": 4.6162,
|
|
"mean_token_accuracy": 0.24670014828443526,
|
|
"num_tokens": 71323369.0,
|
|
"step": 31105
|
|
},
|
|
{
|
|
"entropy": 5.056342935562133,
|
|
"epoch": 2.988472622478386,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000411747729339461,
|
|
"loss": 4.6817,
|
|
"mean_token_accuracy": 0.24661661982536315,
|
|
"num_tokens": 71333977.0,
|
|
"step": 31110
|
|
},
|
|
{
|
|
"entropy": 4.998224973678589,
|
|
"epoch": 2.9889529298751203,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004117205026706345,
|
|
"loss": 4.6844,
|
|
"mean_token_accuracy": 0.24256878048181535,
|
|
"num_tokens": 71345176.0,
|
|
"step": 31115
|
|
},
|
|
{
|
|
"entropy": 5.1082902431488035,
|
|
"epoch": 2.9894332372718537,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00041169327282756396,
|
|
"loss": 4.7581,
|
|
"mean_token_accuracy": 0.2398600995540619,
|
|
"num_tokens": 71357310.0,
|
|
"step": 31120
|
|
},
|
|
{
|
|
"entropy": 5.189077949523925,
|
|
"epoch": 2.989913544668588,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004116660398108815,
|
|
"loss": 4.8598,
|
|
"mean_token_accuracy": 0.23253771811723709,
|
|
"num_tokens": 71369610.0,
|
|
"step": 31125
|
|
},
|
|
{
|
|
"entropy": 5.117611217498779,
|
|
"epoch": 2.990393852065322,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004116388036212194,
|
|
"loss": 4.8285,
|
|
"mean_token_accuracy": 0.2345252439379692,
|
|
"num_tokens": 71381973.0,
|
|
"step": 31130
|
|
},
|
|
{
|
|
"entropy": 5.016928100585938,
|
|
"epoch": 2.990874159462056,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041161156425921004,
|
|
"loss": 4.691,
|
|
"mean_token_accuracy": 0.24259027391672133,
|
|
"num_tokens": 71393739.0,
|
|
"step": 31135
|
|
},
|
|
{
|
|
"entropy": 5.103773975372315,
|
|
"epoch": 2.9913544668587897,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041158432172548577,
|
|
"loss": 4.7692,
|
|
"mean_token_accuracy": 0.23819297105073928,
|
|
"num_tokens": 71405205.0,
|
|
"step": 31140
|
|
},
|
|
{
|
|
"entropy": 5.047510766983033,
|
|
"epoch": 2.9918347742555236,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00041155707602067923,
|
|
"loss": 4.631,
|
|
"mean_token_accuracy": 0.24435337632894516,
|
|
"num_tokens": 71416645.0,
|
|
"step": 31145
|
|
},
|
|
{
|
|
"entropy": 5.104709720611572,
|
|
"epoch": 2.9923150816522575,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004115298271454227,
|
|
"loss": 4.7448,
|
|
"mean_token_accuracy": 0.23368489295244216,
|
|
"num_tokens": 71428854.0,
|
|
"step": 31150
|
|
},
|
|
{
|
|
"entropy": 5.063992261886597,
|
|
"epoch": 2.9927953890489913,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004115025751003491,
|
|
"loss": 4.6758,
|
|
"mean_token_accuracy": 0.24616134017705918,
|
|
"num_tokens": 71439950.0,
|
|
"step": 31155
|
|
},
|
|
{
|
|
"entropy": 5.045139503479004,
|
|
"epoch": 2.993275696445725,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000411475319886091,
|
|
"loss": 4.6799,
|
|
"mean_token_accuracy": 0.2455049678683281,
|
|
"num_tokens": 71451901.0,
|
|
"step": 31160
|
|
},
|
|
{
|
|
"entropy": 5.191359376907348,
|
|
"epoch": 2.993756003842459,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041144806150328117,
|
|
"loss": 4.8261,
|
|
"mean_token_accuracy": 0.24100210070610045,
|
|
"num_tokens": 71463688.0,
|
|
"step": 31165
|
|
},
|
|
{
|
|
"entropy": 5.174281358718872,
|
|
"epoch": 2.994236311239193,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004114207999525527,
|
|
"loss": 4.7989,
|
|
"mean_token_accuracy": 0.2344155117869377,
|
|
"num_tokens": 71475392.0,
|
|
"step": 31170
|
|
},
|
|
{
|
|
"entropy": 5.080008792877197,
|
|
"epoch": 2.994716618635927,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00041139353523453814,
|
|
"loss": 4.734,
|
|
"mean_token_accuracy": 0.24095366150140762,
|
|
"num_tokens": 71486004.0,
|
|
"step": 31175
|
|
},
|
|
{
|
|
"entropy": 5.104208135604859,
|
|
"epoch": 2.9951969260326607,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004113662673498708,
|
|
"loss": 4.8248,
|
|
"mean_token_accuracy": 0.23622356951236725,
|
|
"num_tokens": 71497850.0,
|
|
"step": 31180
|
|
},
|
|
{
|
|
"entropy": 5.087505769729614,
|
|
"epoch": 2.9956772334293946,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00041133899629918364,
|
|
"loss": 4.6633,
|
|
"mean_token_accuracy": 0.2459829866886139,
|
|
"num_tokens": 71507778.0,
|
|
"step": 31185
|
|
},
|
|
{
|
|
"entropy": 5.126943159103393,
|
|
"epoch": 2.996157540826129,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00041131172208310986,
|
|
"loss": 4.7444,
|
|
"mean_token_accuracy": 0.23904307037591935,
|
|
"num_tokens": 71519506.0,
|
|
"step": 31190
|
|
},
|
|
{
|
|
"entropy": 5.041216564178467,
|
|
"epoch": 2.9966378482228624,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00041128444470228253,
|
|
"loss": 4.6498,
|
|
"mean_token_accuracy": 0.24707060307264328,
|
|
"num_tokens": 71530523.0,
|
|
"step": 31195
|
|
},
|
|
{
|
|
"entropy": 5.0975563526153564,
|
|
"epoch": 2.9971181556195967,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041125716415733524,
|
|
"loss": 4.7557,
|
|
"mean_token_accuracy": 0.23416275084018706,
|
|
"num_tokens": 71541473.0,
|
|
"step": 31200
|
|
},
|
|
{
|
|
"entropy": 5.041246032714843,
|
|
"epoch": 2.9975984630163306,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004112298804489011,
|
|
"loss": 4.6961,
|
|
"mean_token_accuracy": 0.24679289758205414,
|
|
"num_tokens": 71552012.0,
|
|
"step": 31205
|
|
},
|
|
{
|
|
"entropy": 5.099580383300781,
|
|
"epoch": 2.9980787704130645,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004112025935776137,
|
|
"loss": 4.7124,
|
|
"mean_token_accuracy": 0.23619519770145417,
|
|
"num_tokens": 71562239.0,
|
|
"step": 31210
|
|
},
|
|
{
|
|
"entropy": 5.034806203842163,
|
|
"epoch": 2.9985590778097984,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00041117530354410647,
|
|
"loss": 4.6254,
|
|
"mean_token_accuracy": 0.2423456683754921,
|
|
"num_tokens": 71573296.0,
|
|
"step": 31215
|
|
},
|
|
{
|
|
"entropy": 5.062445402145386,
|
|
"epoch": 2.9990393852065322,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004111480103490131,
|
|
"loss": 4.7671,
|
|
"mean_token_accuracy": 0.2363523632287979,
|
|
"num_tokens": 71584659.0,
|
|
"step": 31220
|
|
},
|
|
{
|
|
"entropy": 5.051762437820434,
|
|
"epoch": 2.999519692603266,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00041112071399296724,
|
|
"loss": 4.6701,
|
|
"mean_token_accuracy": 0.24363380372524263,
|
|
"num_tokens": 71595672.0,
|
|
"step": 31225
|
|
},
|
|
{
|
|
"entropy": 5.046540021896362,
|
|
"epoch": 3.0,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004110934144766025,
|
|
"loss": 4.6338,
|
|
"mean_token_accuracy": 0.2554298684000969,
|
|
"num_tokens": 71605608.0,
|
|
"step": 31230
|
|
},
|
|
{
|
|
"entropy": 5.051793956756592,
|
|
"epoch": 3.000480307396734,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00041106611180055284,
|
|
"loss": 4.6857,
|
|
"mean_token_accuracy": 0.23950463086366652,
|
|
"num_tokens": 71617542.0,
|
|
"step": 31235
|
|
},
|
|
{
|
|
"entropy": 4.998350143432617,
|
|
"epoch": 3.0009606147934678,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00041103880596545206,
|
|
"loss": 4.5653,
|
|
"mean_token_accuracy": 0.2447400540113449,
|
|
"num_tokens": 71629087.0,
|
|
"step": 31240
|
|
},
|
|
{
|
|
"entropy": 5.108054494857788,
|
|
"epoch": 3.0014409221902016,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004110114969719342,
|
|
"loss": 4.6754,
|
|
"mean_token_accuracy": 0.24601958692073822,
|
|
"num_tokens": 71640929.0,
|
|
"step": 31245
|
|
},
|
|
{
|
|
"entropy": 5.113180303573609,
|
|
"epoch": 3.0019212295869355,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004109841848206333,
|
|
"loss": 4.6396,
|
|
"mean_token_accuracy": 0.24572105705738068,
|
|
"num_tokens": 71652641.0,
|
|
"step": 31250
|
|
},
|
|
{
|
|
"entropy": 5.016724395751953,
|
|
"epoch": 3.0024015369836694,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004109568695121833,
|
|
"loss": 4.5788,
|
|
"mean_token_accuracy": 0.24781061559915543,
|
|
"num_tokens": 71664482.0,
|
|
"step": 31255
|
|
},
|
|
{
|
|
"entropy": 4.972348403930664,
|
|
"epoch": 3.0028818443804033,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004109295510472186,
|
|
"loss": 4.5179,
|
|
"mean_token_accuracy": 0.25019769221544264,
|
|
"num_tokens": 71675568.0,
|
|
"step": 31260
|
|
},
|
|
{
|
|
"entropy": 5.152708053588867,
|
|
"epoch": 3.003362151777137,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041090222942637323,
|
|
"loss": 4.693,
|
|
"mean_token_accuracy": 0.23853697031736373,
|
|
"num_tokens": 71685731.0,
|
|
"step": 31265
|
|
},
|
|
{
|
|
"entropy": 5.118443155288697,
|
|
"epoch": 3.0038424591738715,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00041087490465028175,
|
|
"loss": 4.7673,
|
|
"mean_token_accuracy": 0.2384372115135193,
|
|
"num_tokens": 71697212.0,
|
|
"step": 31270
|
|
},
|
|
{
|
|
"entropy": 4.985861873626709,
|
|
"epoch": 3.0043227665706054,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00041084757671957844,
|
|
"loss": 4.5955,
|
|
"mean_token_accuracy": 0.24781534671783448,
|
|
"num_tokens": 71708444.0,
|
|
"step": 31275
|
|
},
|
|
{
|
|
"entropy": 5.059682369232178,
|
|
"epoch": 3.0048030739673393,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00041082024563489773,
|
|
"loss": 4.6412,
|
|
"mean_token_accuracy": 0.24888246655464172,
|
|
"num_tokens": 71718645.0,
|
|
"step": 31280
|
|
},
|
|
{
|
|
"entropy": 5.0275898456573485,
|
|
"epoch": 3.005283381364073,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004107929113968743,
|
|
"loss": 4.5796,
|
|
"mean_token_accuracy": 0.24466054141521454,
|
|
"num_tokens": 71728798.0,
|
|
"step": 31285
|
|
},
|
|
{
|
|
"entropy": 5.090812873840332,
|
|
"epoch": 3.005763688760807,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004107655740061427,
|
|
"loss": 4.7235,
|
|
"mean_token_accuracy": 0.23481216579675673,
|
|
"num_tokens": 71741237.0,
|
|
"step": 31290
|
|
},
|
|
{
|
|
"entropy": 5.096500730514526,
|
|
"epoch": 3.006243996157541,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004107382334633375,
|
|
"loss": 4.6834,
|
|
"mean_token_accuracy": 0.24862920194864274,
|
|
"num_tokens": 71753755.0,
|
|
"step": 31295
|
|
},
|
|
{
|
|
"entropy": 5.12779860496521,
|
|
"epoch": 3.006724303554275,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004107108897690936,
|
|
"loss": 4.6643,
|
|
"mean_token_accuracy": 0.24606042802333833,
|
|
"num_tokens": 71765447.0,
|
|
"step": 31300
|
|
},
|
|
{
|
|
"entropy": 5.089486980438233,
|
|
"epoch": 3.0072046109510087,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004106835429240458,
|
|
"loss": 4.6896,
|
|
"mean_token_accuracy": 0.24308264255523682,
|
|
"num_tokens": 71777106.0,
|
|
"step": 31305
|
|
},
|
|
{
|
|
"entropy": 5.017534351348877,
|
|
"epoch": 3.0076849183477425,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004106561929288292,
|
|
"loss": 4.5891,
|
|
"mean_token_accuracy": 0.24899385422468184,
|
|
"num_tokens": 71789226.0,
|
|
"step": 31310
|
|
},
|
|
{
|
|
"entropy": 5.0785496711730955,
|
|
"epoch": 3.0081652257444764,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00041062883978407844,
|
|
"loss": 4.6816,
|
|
"mean_token_accuracy": 0.24357341527938842,
|
|
"num_tokens": 71800754.0,
|
|
"step": 31315
|
|
},
|
|
{
|
|
"entropy": 5.0972143650054935,
|
|
"epoch": 3.0086455331412103,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00041060148349042876,
|
|
"loss": 4.7153,
|
|
"mean_token_accuracy": 0.24523738622665406,
|
|
"num_tokens": 71812107.0,
|
|
"step": 31320
|
|
},
|
|
{
|
|
"entropy": 5.081027412414551,
|
|
"epoch": 3.009125840537944,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00041057412404851536,
|
|
"loss": 4.6248,
|
|
"mean_token_accuracy": 0.24740613400936126,
|
|
"num_tokens": 71822031.0,
|
|
"step": 31325
|
|
},
|
|
{
|
|
"entropy": 5.061629343032837,
|
|
"epoch": 3.009606147934678,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004105467614589734,
|
|
"loss": 4.6452,
|
|
"mean_token_accuracy": 0.2387404829263687,
|
|
"num_tokens": 71833744.0,
|
|
"step": 31330
|
|
},
|
|
{
|
|
"entropy": 5.074277353286743,
|
|
"epoch": 3.010086455331412,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004105193957224381,
|
|
"loss": 4.5915,
|
|
"mean_token_accuracy": 0.25017276108264924,
|
|
"num_tokens": 71845385.0,
|
|
"step": 31335
|
|
},
|
|
{
|
|
"entropy": 5.0031942367553714,
|
|
"epoch": 3.010566762728146,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00041049202683954473,
|
|
"loss": 4.4989,
|
|
"mean_token_accuracy": 0.2636774554848671,
|
|
"num_tokens": 71858065.0,
|
|
"step": 31340
|
|
},
|
|
{
|
|
"entropy": 4.990669107437133,
|
|
"epoch": 3.0110470701248797,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00041046465481092893,
|
|
"loss": 4.5891,
|
|
"mean_token_accuracy": 0.24581137001514436,
|
|
"num_tokens": 71869553.0,
|
|
"step": 31345
|
|
},
|
|
{
|
|
"entropy": 5.045453214645386,
|
|
"epoch": 3.011527377521614,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00041043727963722607,
|
|
"loss": 4.6313,
|
|
"mean_token_accuracy": 0.24703803807497024,
|
|
"num_tokens": 71881196.0,
|
|
"step": 31350
|
|
},
|
|
{
|
|
"entropy": 5.046996307373047,
|
|
"epoch": 3.012007684918348,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004104099013190718,
|
|
"loss": 4.6513,
|
|
"mean_token_accuracy": 0.2366416335105896,
|
|
"num_tokens": 71893323.0,
|
|
"step": 31355
|
|
},
|
|
{
|
|
"entropy": 5.045062065124512,
|
|
"epoch": 3.012487992315082,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00041038251985710164,
|
|
"loss": 4.5923,
|
|
"mean_token_accuracy": 0.24852931946516038,
|
|
"num_tokens": 71904679.0,
|
|
"step": 31360
|
|
},
|
|
{
|
|
"entropy": 5.050914001464844,
|
|
"epoch": 3.0129682997118157,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004103551352519514,
|
|
"loss": 4.6881,
|
|
"mean_token_accuracy": 0.25243211090564727,
|
|
"num_tokens": 71916478.0,
|
|
"step": 31365
|
|
},
|
|
{
|
|
"entropy": 5.066346502304077,
|
|
"epoch": 3.0134486071085496,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00041032774750425683,
|
|
"loss": 4.6731,
|
|
"mean_token_accuracy": 0.24854901880025865,
|
|
"num_tokens": 71928203.0,
|
|
"step": 31370
|
|
},
|
|
{
|
|
"entropy": 5.051746463775634,
|
|
"epoch": 3.0139289145052834,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004103003566146538,
|
|
"loss": 4.5767,
|
|
"mean_token_accuracy": 0.24995588511228561,
|
|
"num_tokens": 71939118.0,
|
|
"step": 31375
|
|
},
|
|
{
|
|
"entropy": 5.079462766647339,
|
|
"epoch": 3.0144092219020173,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004102729625837782,
|
|
"loss": 4.7,
|
|
"mean_token_accuracy": 0.2432078868150711,
|
|
"num_tokens": 71950964.0,
|
|
"step": 31380
|
|
},
|
|
{
|
|
"entropy": 4.987093687057495,
|
|
"epoch": 3.014889529298751,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004102455654122662,
|
|
"loss": 4.5463,
|
|
"mean_token_accuracy": 0.25703095048666,
|
|
"num_tokens": 71960587.0,
|
|
"step": 31385
|
|
},
|
|
{
|
|
"entropy": 5.0261084079742435,
|
|
"epoch": 3.015369836695485,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041021816510075366,
|
|
"loss": 4.601,
|
|
"mean_token_accuracy": 0.24622438251972198,
|
|
"num_tokens": 71972014.0,
|
|
"step": 31390
|
|
},
|
|
{
|
|
"entropy": 5.0990345001220705,
|
|
"epoch": 3.015850144092219,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00041019076164987696,
|
|
"loss": 4.6506,
|
|
"mean_token_accuracy": 0.24761470407247543,
|
|
"num_tokens": 71982842.0,
|
|
"step": 31395
|
|
},
|
|
{
|
|
"entropy": 5.067615079879761,
|
|
"epoch": 3.016330451488953,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004101633550602721,
|
|
"loss": 4.5783,
|
|
"mean_token_accuracy": 0.24062599539756774,
|
|
"num_tokens": 71994450.0,
|
|
"step": 31400
|
|
},
|
|
{
|
|
"entropy": 4.987122678756714,
|
|
"epoch": 3.0168107588856867,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004101359453325755,
|
|
"loss": 4.533,
|
|
"mean_token_accuracy": 0.24856770038604736,
|
|
"num_tokens": 72005490.0,
|
|
"step": 31405
|
|
},
|
|
{
|
|
"entropy": 5.035937976837158,
|
|
"epoch": 3.0172910662824206,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00041010853246742357,
|
|
"loss": 4.6569,
|
|
"mean_token_accuracy": 0.24072497636079787,
|
|
"num_tokens": 72016723.0,
|
|
"step": 31410
|
|
},
|
|
{
|
|
"entropy": 4.9769618034362795,
|
|
"epoch": 3.0177713736791545,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004100811164654527,
|
|
"loss": 4.5524,
|
|
"mean_token_accuracy": 0.2532226011157036,
|
|
"num_tokens": 72027189.0,
|
|
"step": 31415
|
|
},
|
|
{
|
|
"entropy": 5.104083442687989,
|
|
"epoch": 3.0182516810758884,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004100536973272994,
|
|
"loss": 4.7135,
|
|
"mean_token_accuracy": 0.24044599682092666,
|
|
"num_tokens": 72039662.0,
|
|
"step": 31420
|
|
},
|
|
{
|
|
"entropy": 5.051034784317016,
|
|
"epoch": 3.0187319884726227,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004100262750536003,
|
|
"loss": 4.6436,
|
|
"mean_token_accuracy": 0.24606235325336456,
|
|
"num_tokens": 72050769.0,
|
|
"step": 31425
|
|
},
|
|
{
|
|
"entropy": 5.049038457870483,
|
|
"epoch": 3.0192122958693566,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040999884964499196,
|
|
"loss": 4.6587,
|
|
"mean_token_accuracy": 0.24487811475992202,
|
|
"num_tokens": 72062657.0,
|
|
"step": 31430
|
|
},
|
|
{
|
|
"entropy": 5.040445280075073,
|
|
"epoch": 3.0196926032660905,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00040997142110211127,
|
|
"loss": 4.6133,
|
|
"mean_token_accuracy": 0.25074315518140794,
|
|
"num_tokens": 72074805.0,
|
|
"step": 31435
|
|
},
|
|
{
|
|
"entropy": 5.018621397018433,
|
|
"epoch": 3.0201729106628243,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040994398942559496,
|
|
"loss": 4.6005,
|
|
"mean_token_accuracy": 0.25499855279922484,
|
|
"num_tokens": 72085842.0,
|
|
"step": 31440
|
|
},
|
|
{
|
|
"entropy": 5.127358055114746,
|
|
"epoch": 3.020653218059558,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004099165546160799,
|
|
"loss": 4.7126,
|
|
"mean_token_accuracy": 0.23831754177808762,
|
|
"num_tokens": 72096750.0,
|
|
"step": 31445
|
|
},
|
|
{
|
|
"entropy": 5.018946027755737,
|
|
"epoch": 3.021133525456292,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00040988911667420305,
|
|
"loss": 4.6262,
|
|
"mean_token_accuracy": 0.24294245690107347,
|
|
"num_tokens": 72108842.0,
|
|
"step": 31450
|
|
},
|
|
{
|
|
"entropy": 5.105809259414673,
|
|
"epoch": 3.021613832853026,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004098616756006015,
|
|
"loss": 4.6976,
|
|
"mean_token_accuracy": 0.23442134708166124,
|
|
"num_tokens": 72119582.0,
|
|
"step": 31455
|
|
},
|
|
{
|
|
"entropy": 5.097371196746826,
|
|
"epoch": 3.02209414024976,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004098342313959122,
|
|
"loss": 4.7045,
|
|
"mean_token_accuracy": 0.24634762108325958,
|
|
"num_tokens": 72129984.0,
|
|
"step": 31460
|
|
},
|
|
{
|
|
"entropy": 5.103570604324341,
|
|
"epoch": 3.0225744476464937,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004098067840607725,
|
|
"loss": 4.6752,
|
|
"mean_token_accuracy": 0.23882693648338318,
|
|
"num_tokens": 72142160.0,
|
|
"step": 31465
|
|
},
|
|
{
|
|
"entropy": 5.148630142211914,
|
|
"epoch": 3.0230547550432276,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004097793335958195,
|
|
"loss": 4.7492,
|
|
"mean_token_accuracy": 0.23690109103918075,
|
|
"num_tokens": 72153691.0,
|
|
"step": 31470
|
|
},
|
|
{
|
|
"entropy": 5.104357385635376,
|
|
"epoch": 3.0235350624399615,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040975188000169074,
|
|
"loss": 4.6469,
|
|
"mean_token_accuracy": 0.24874790906906127,
|
|
"num_tokens": 72165123.0,
|
|
"step": 31475
|
|
},
|
|
{
|
|
"entropy": 5.050551652908325,
|
|
"epoch": 3.0240153698366954,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00040972442327902325,
|
|
"loss": 4.622,
|
|
"mean_token_accuracy": 0.2462276890873909,
|
|
"num_tokens": 72177006.0,
|
|
"step": 31480
|
|
},
|
|
{
|
|
"entropy": 5.109850168228149,
|
|
"epoch": 3.0244956772334293,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004096969634284549,
|
|
"loss": 4.7331,
|
|
"mean_token_accuracy": 0.23722968250513077,
|
|
"num_tokens": 72189185.0,
|
|
"step": 31485
|
|
},
|
|
{
|
|
"entropy": 5.044874048233032,
|
|
"epoch": 3.024975984630163,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004096695004506229,
|
|
"loss": 4.6689,
|
|
"mean_token_accuracy": 0.24475235491991043,
|
|
"num_tokens": 72201530.0,
|
|
"step": 31490
|
|
},
|
|
{
|
|
"entropy": 5.059578609466553,
|
|
"epoch": 3.025456292026897,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00040964203434616496,
|
|
"loss": 4.6296,
|
|
"mean_token_accuracy": 0.24872735887765884,
|
|
"num_tokens": 72213514.0,
|
|
"step": 31495
|
|
},
|
|
{
|
|
"entropy": 5.025071525573731,
|
|
"epoch": 3.025936599423631,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004096145651157189,
|
|
"loss": 4.6059,
|
|
"mean_token_accuracy": 0.2459734320640564,
|
|
"num_tokens": 72224811.0,
|
|
"step": 31500
|
|
},
|
|
{
|
|
"entropy": 5.109280920028686,
|
|
"epoch": 3.0264169068203652,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004095870927599223,
|
|
"loss": 4.7321,
|
|
"mean_token_accuracy": 0.24028065651655198,
|
|
"num_tokens": 72236201.0,
|
|
"step": 31505
|
|
},
|
|
{
|
|
"entropy": 5.123455333709717,
|
|
"epoch": 3.026897214217099,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00040955961727941306,
|
|
"loss": 4.6981,
|
|
"mean_token_accuracy": 0.23767663836479186,
|
|
"num_tokens": 72248963.0,
|
|
"step": 31510
|
|
},
|
|
{
|
|
"entropy": 5.1077268600463865,
|
|
"epoch": 3.027377521613833,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004095321386748291,
|
|
"loss": 4.6995,
|
|
"mean_token_accuracy": 0.24405122101306914,
|
|
"num_tokens": 72259489.0,
|
|
"step": 31515
|
|
},
|
|
{
|
|
"entropy": 4.999540519714356,
|
|
"epoch": 3.027857829010567,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040950465694680825,
|
|
"loss": 4.6631,
|
|
"mean_token_accuracy": 0.2449243649840355,
|
|
"num_tokens": 72271124.0,
|
|
"step": 31520
|
|
},
|
|
{
|
|
"entropy": 5.109373617172241,
|
|
"epoch": 3.0283381364073008,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040947717209598877,
|
|
"loss": 4.6778,
|
|
"mean_token_accuracy": 0.24264902174472808,
|
|
"num_tokens": 72281144.0,
|
|
"step": 31525
|
|
},
|
|
{
|
|
"entropy": 5.048031806945801,
|
|
"epoch": 3.0288184438040346,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00040944968412300867,
|
|
"loss": 4.6363,
|
|
"mean_token_accuracy": 0.2484264850616455,
|
|
"num_tokens": 72293422.0,
|
|
"step": 31530
|
|
},
|
|
{
|
|
"entropy": 5.077163648605347,
|
|
"epoch": 3.0292987512007685,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040942219302850605,
|
|
"loss": 4.6672,
|
|
"mean_token_accuracy": 0.24520911127328873,
|
|
"num_tokens": 72304450.0,
|
|
"step": 31535
|
|
},
|
|
{
|
|
"entropy": 5.0657103061676025,
|
|
"epoch": 3.0297790585975024,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004093946988131193,
|
|
"loss": 4.593,
|
|
"mean_token_accuracy": 0.25350708812475203,
|
|
"num_tokens": 72315000.0,
|
|
"step": 31540
|
|
},
|
|
{
|
|
"entropy": 5.075184917449951,
|
|
"epoch": 3.0302593659942363,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004093672014774868,
|
|
"loss": 4.6842,
|
|
"mean_token_accuracy": 0.2380824714899063,
|
|
"num_tokens": 72327268.0,
|
|
"step": 31545
|
|
},
|
|
{
|
|
"entropy": 5.029738235473633,
|
|
"epoch": 3.03073967339097,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00040933970102224675,
|
|
"loss": 4.5874,
|
|
"mean_token_accuracy": 0.25333288311958313,
|
|
"num_tokens": 72338371.0,
|
|
"step": 31550
|
|
},
|
|
{
|
|
"entropy": 5.065733528137207,
|
|
"epoch": 3.031219980787704,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00040931219744803774,
|
|
"loss": 4.6277,
|
|
"mean_token_accuracy": 0.2469482719898224,
|
|
"num_tokens": 72349211.0,
|
|
"step": 31555
|
|
},
|
|
{
|
|
"entropy": 5.114510488510132,
|
|
"epoch": 3.031700288184438,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004092846907554985,
|
|
"loss": 4.6983,
|
|
"mean_token_accuracy": 0.24264060258865355,
|
|
"num_tokens": 72361634.0,
|
|
"step": 31560
|
|
},
|
|
{
|
|
"entropy": 5.035785341262818,
|
|
"epoch": 3.032180595581172,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040925718094526724,
|
|
"loss": 4.6533,
|
|
"mean_token_accuracy": 0.24493586719036103,
|
|
"num_tokens": 72374071.0,
|
|
"step": 31565
|
|
},
|
|
{
|
|
"entropy": 5.134601068496704,
|
|
"epoch": 3.0326609029779057,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00040922966801798305,
|
|
"loss": 4.763,
|
|
"mean_token_accuracy": 0.23960959017276764,
|
|
"num_tokens": 72386272.0,
|
|
"step": 31570
|
|
},
|
|
{
|
|
"entropy": 5.095665788650512,
|
|
"epoch": 3.0331412103746396,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00040920215197428456,
|
|
"loss": 4.6388,
|
|
"mean_token_accuracy": 0.24907959252595901,
|
|
"num_tokens": 72398832.0,
|
|
"step": 31575
|
|
},
|
|
{
|
|
"entropy": 5.14979248046875,
|
|
"epoch": 3.0336215177713735,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040917463281481053,
|
|
"loss": 4.7366,
|
|
"mean_token_accuracy": 0.2345603808760643,
|
|
"num_tokens": 72409970.0,
|
|
"step": 31580
|
|
},
|
|
{
|
|
"entropy": 5.038898181915283,
|
|
"epoch": 3.034101825168108,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004091471105402,
|
|
"loss": 4.6299,
|
|
"mean_token_accuracy": 0.2518488377332687,
|
|
"num_tokens": 72420640.0,
|
|
"step": 31585
|
|
},
|
|
{
|
|
"entropy": 5.0984334468841555,
|
|
"epoch": 3.0345821325648417,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004091195851510918,
|
|
"loss": 4.692,
|
|
"mean_token_accuracy": 0.24351897835731506,
|
|
"num_tokens": 72432111.0,
|
|
"step": 31590
|
|
},
|
|
{
|
|
"entropy": 5.010867547988892,
|
|
"epoch": 3.0350624399615755,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004090920566481252,
|
|
"loss": 4.5947,
|
|
"mean_token_accuracy": 0.25169342905282976,
|
|
"num_tokens": 72444427.0,
|
|
"step": 31595
|
|
},
|
|
{
|
|
"entropy": 5.098999500274658,
|
|
"epoch": 3.0355427473583094,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004090645250319392,
|
|
"loss": 4.7229,
|
|
"mean_token_accuracy": 0.2429332360625267,
|
|
"num_tokens": 72454955.0,
|
|
"step": 31600
|
|
},
|
|
{
|
|
"entropy": 5.021222496032715,
|
|
"epoch": 3.0360230547550433,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000409036990303173,
|
|
"loss": 4.5377,
|
|
"mean_token_accuracy": 0.25624181628227233,
|
|
"num_tokens": 72464914.0,
|
|
"step": 31605
|
|
},
|
|
{
|
|
"entropy": 4.999936819076538,
|
|
"epoch": 3.036503362151777,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004090094524624658,
|
|
"loss": 4.6868,
|
|
"mean_token_accuracy": 0.250473652780056,
|
|
"num_tokens": 72476379.0,
|
|
"step": 31610
|
|
},
|
|
{
|
|
"entropy": 5.097587633132934,
|
|
"epoch": 3.036983669548511,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040898191151045717,
|
|
"loss": 4.6602,
|
|
"mean_token_accuracy": 0.2441670000553131,
|
|
"num_tokens": 72488249.0,
|
|
"step": 31615
|
|
},
|
|
{
|
|
"entropy": 5.078186416625977,
|
|
"epoch": 3.037463976945245,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004089543674477864,
|
|
"loss": 4.6243,
|
|
"mean_token_accuracy": 0.24243185818195342,
|
|
"num_tokens": 72499365.0,
|
|
"step": 31620
|
|
},
|
|
{
|
|
"entropy": 5.031374263763428,
|
|
"epoch": 3.037944284341979,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004089268202750929,
|
|
"loss": 4.619,
|
|
"mean_token_accuracy": 0.245622855424881,
|
|
"num_tokens": 72511231.0,
|
|
"step": 31625
|
|
},
|
|
{
|
|
"entropy": 5.039086246490479,
|
|
"epoch": 3.0384245917387127,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00040889926999301634,
|
|
"loss": 4.625,
|
|
"mean_token_accuracy": 0.2482289418578148,
|
|
"num_tokens": 72524357.0,
|
|
"step": 31630
|
|
},
|
|
{
|
|
"entropy": 5.095204496383667,
|
|
"epoch": 3.0389048991354466,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004088717166021964,
|
|
"loss": 4.6793,
|
|
"mean_token_accuracy": 0.24549530297517777,
|
|
"num_tokens": 72537170.0,
|
|
"step": 31635
|
|
},
|
|
{
|
|
"entropy": 5.026371765136719,
|
|
"epoch": 3.0393852065321805,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004088441601032727,
|
|
"loss": 4.5856,
|
|
"mean_token_accuracy": 0.25135585814714434,
|
|
"num_tokens": 72548631.0,
|
|
"step": 31640
|
|
},
|
|
{
|
|
"entropy": 5.005275440216065,
|
|
"epoch": 3.0398655139289144,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004088166004968849,
|
|
"loss": 4.5858,
|
|
"mean_token_accuracy": 0.24760494828224183,
|
|
"num_tokens": 72561787.0,
|
|
"step": 31645
|
|
},
|
|
{
|
|
"entropy": 5.027503967285156,
|
|
"epoch": 3.0403458213256482,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00040878903778367317,
|
|
"loss": 4.6303,
|
|
"mean_token_accuracy": 0.2522509038448334,
|
|
"num_tokens": 72572480.0,
|
|
"step": 31650
|
|
},
|
|
{
|
|
"entropy": 5.0430741786956785,
|
|
"epoch": 3.040826128722382,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004087614719642772,
|
|
"loss": 4.6666,
|
|
"mean_token_accuracy": 0.2388513207435608,
|
|
"num_tokens": 72584636.0,
|
|
"step": 31655
|
|
},
|
|
{
|
|
"entropy": 5.107775402069092,
|
|
"epoch": 3.0413064361191164,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00040873390303933693,
|
|
"loss": 4.7112,
|
|
"mean_token_accuracy": 0.23879486471414565,
|
|
"num_tokens": 72595425.0,
|
|
"step": 31660
|
|
},
|
|
{
|
|
"entropy": 5.145121431350708,
|
|
"epoch": 3.0417867435158503,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00040870633100949266,
|
|
"loss": 4.6938,
|
|
"mean_token_accuracy": 0.24287839978933334,
|
|
"num_tokens": 72607344.0,
|
|
"step": 31665
|
|
},
|
|
{
|
|
"entropy": 5.0275026798248295,
|
|
"epoch": 3.042267050912584,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00040867875587538436,
|
|
"loss": 4.6932,
|
|
"mean_token_accuracy": 0.24585120677947997,
|
|
"num_tokens": 72619704.0,
|
|
"step": 31670
|
|
},
|
|
{
|
|
"entropy": 5.039684009552002,
|
|
"epoch": 3.042747358309318,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004086511776376523,
|
|
"loss": 4.5973,
|
|
"mean_token_accuracy": 0.254203824698925,
|
|
"num_tokens": 72632143.0,
|
|
"step": 31675
|
|
},
|
|
{
|
|
"entropy": 5.071047306060791,
|
|
"epoch": 3.043227665706052,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00040862359629693684,
|
|
"loss": 4.632,
|
|
"mean_token_accuracy": 0.245218189060688,
|
|
"num_tokens": 72644581.0,
|
|
"step": 31680
|
|
},
|
|
{
|
|
"entropy": 5.1263385772705075,
|
|
"epoch": 3.043707973102786,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004085960118538781,
|
|
"loss": 4.699,
|
|
"mean_token_accuracy": 0.24387965947389603,
|
|
"num_tokens": 72656309.0,
|
|
"step": 31685
|
|
},
|
|
{
|
|
"entropy": 5.067390775680542,
|
|
"epoch": 3.0441882804995197,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004085684243091168,
|
|
"loss": 4.6763,
|
|
"mean_token_accuracy": 0.2433240920305252,
|
|
"num_tokens": 72668347.0,
|
|
"step": 31690
|
|
},
|
|
{
|
|
"entropy": 4.992222261428833,
|
|
"epoch": 3.0446685878962536,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004085408336632933,
|
|
"loss": 4.5899,
|
|
"mean_token_accuracy": 0.2498211979866028,
|
|
"num_tokens": 72679375.0,
|
|
"step": 31695
|
|
},
|
|
{
|
|
"entropy": 5.048721837997436,
|
|
"epoch": 3.0451488952929875,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040851323991704803,
|
|
"loss": 4.6505,
|
|
"mean_token_accuracy": 0.2475501537322998,
|
|
"num_tokens": 72691020.0,
|
|
"step": 31700
|
|
},
|
|
{
|
|
"entropy": 5.035050344467163,
|
|
"epoch": 3.0456292026897214,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004084856430710219,
|
|
"loss": 4.6038,
|
|
"mean_token_accuracy": 0.2471386671066284,
|
|
"num_tokens": 72703215.0,
|
|
"step": 31705
|
|
},
|
|
{
|
|
"entropy": 5.072145318984985,
|
|
"epoch": 3.0461095100864553,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004084580431258555,
|
|
"loss": 4.6137,
|
|
"mean_token_accuracy": 0.24602895975112915,
|
|
"num_tokens": 72715107.0,
|
|
"step": 31710
|
|
},
|
|
{
|
|
"entropy": 5.084800767898559,
|
|
"epoch": 3.046589817483189,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004084304400821896,
|
|
"loss": 4.6965,
|
|
"mean_token_accuracy": 0.24280115067958832,
|
|
"num_tokens": 72726301.0,
|
|
"step": 31715
|
|
},
|
|
{
|
|
"entropy": 5.033972501754761,
|
|
"epoch": 3.047070124879923,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004084028339406651,
|
|
"loss": 4.6276,
|
|
"mean_token_accuracy": 0.24668311178684235,
|
|
"num_tokens": 72738102.0,
|
|
"step": 31720
|
|
},
|
|
{
|
|
"entropy": 5.022152471542358,
|
|
"epoch": 3.047550432276657,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040837522470192297,
|
|
"loss": 4.7009,
|
|
"mean_token_accuracy": 0.2479358971118927,
|
|
"num_tokens": 72748840.0,
|
|
"step": 31725
|
|
},
|
|
{
|
|
"entropy": 4.960737705230713,
|
|
"epoch": 3.0480307396733908,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004083476123666041,
|
|
"loss": 4.5325,
|
|
"mean_token_accuracy": 0.2612351909279823,
|
|
"num_tokens": 72760480.0,
|
|
"step": 31730
|
|
},
|
|
{
|
|
"entropy": 5.061456727981567,
|
|
"epoch": 3.048511047070125,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004083199969353496,
|
|
"loss": 4.5844,
|
|
"mean_token_accuracy": 0.2589627534151077,
|
|
"num_tokens": 72770446.0,
|
|
"step": 31735
|
|
},
|
|
{
|
|
"entropy": 4.988167381286621,
|
|
"epoch": 3.048991354466859,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040829237840880075,
|
|
"loss": 4.6273,
|
|
"mean_token_accuracy": 0.24723846316337586,
|
|
"num_tokens": 72782262.0,
|
|
"step": 31740
|
|
},
|
|
{
|
|
"entropy": 5.02937421798706,
|
|
"epoch": 3.049471661863593,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040826475678759855,
|
|
"loss": 4.6373,
|
|
"mean_token_accuracy": 0.2460236892104149,
|
|
"num_tokens": 72793857.0,
|
|
"step": 31745
|
|
},
|
|
{
|
|
"entropy": 5.058753204345703,
|
|
"epoch": 3.0499519692603267,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004082371320723845,
|
|
"loss": 4.6553,
|
|
"mean_token_accuracy": 0.2392961248755455,
|
|
"num_tokens": 72805396.0,
|
|
"step": 31750
|
|
},
|
|
{
|
|
"entropy": 5.1182475090026855,
|
|
"epoch": 3.0504322766570606,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00040820950426379986,
|
|
"loss": 4.6899,
|
|
"mean_token_accuracy": 0.24830501526594162,
|
|
"num_tokens": 72816946.0,
|
|
"step": 31755
|
|
},
|
|
{
|
|
"entropy": 5.026547241210937,
|
|
"epoch": 3.0509125840537945,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004081818733624861,
|
|
"loss": 4.5531,
|
|
"mean_token_accuracy": 0.25185046941041944,
|
|
"num_tokens": 72827589.0,
|
|
"step": 31760
|
|
},
|
|
{
|
|
"entropy": 5.005900955200195,
|
|
"epoch": 3.0513928914505284,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004081542393690847,
|
|
"loss": 4.6135,
|
|
"mean_token_accuracy": 0.24803243577480316,
|
|
"num_tokens": 72838333.0,
|
|
"step": 31765
|
|
},
|
|
{
|
|
"entropy": 5.0348255157470705,
|
|
"epoch": 3.0518731988472623,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004081266022842372,
|
|
"loss": 4.5967,
|
|
"mean_token_accuracy": 0.25163368284702303,
|
|
"num_tokens": 72850212.0,
|
|
"step": 31770
|
|
},
|
|
{
|
|
"entropy": 5.064818906784057,
|
|
"epoch": 3.052353506243996,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040809896210858537,
|
|
"loss": 4.5652,
|
|
"mean_token_accuracy": 0.2413250043988228,
|
|
"num_tokens": 72861406.0,
|
|
"step": 31775
|
|
},
|
|
{
|
|
"entropy": 5.032543516159057,
|
|
"epoch": 3.05283381364073,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040807131884277085,
|
|
"loss": 4.6498,
|
|
"mean_token_accuracy": 0.25353990495204926,
|
|
"num_tokens": 72872563.0,
|
|
"step": 31780
|
|
},
|
|
{
|
|
"entropy": 5.070755195617676,
|
|
"epoch": 3.053314121037464,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004080436724874354,
|
|
"loss": 4.6947,
|
|
"mean_token_accuracy": 0.23914626091718674,
|
|
"num_tokens": 72883718.0,
|
|
"step": 31785
|
|
},
|
|
{
|
|
"entropy": 5.148820590972901,
|
|
"epoch": 3.053794428434198,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00040801602304322095,
|
|
"loss": 4.7136,
|
|
"mean_token_accuracy": 0.24600790739059447,
|
|
"num_tokens": 72896423.0,
|
|
"step": 31790
|
|
},
|
|
{
|
|
"entropy": 5.160016441345215,
|
|
"epoch": 3.0542747358309317,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00040798837051076944,
|
|
"loss": 4.7666,
|
|
"mean_token_accuracy": 0.23096111565828323,
|
|
"num_tokens": 72907681.0,
|
|
"step": 31795
|
|
},
|
|
{
|
|
"entropy": 4.996777534484863,
|
|
"epoch": 3.0547550432276656,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040796071489072286,
|
|
"loss": 4.5336,
|
|
"mean_token_accuracy": 0.25322402119636533,
|
|
"num_tokens": 72919782.0,
|
|
"step": 31800
|
|
},
|
|
{
|
|
"entropy": 5.021294403076172,
|
|
"epoch": 3.0552353506243994,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004079330561837233,
|
|
"loss": 4.6145,
|
|
"mean_token_accuracy": 0.2486003264784813,
|
|
"num_tokens": 72931458.0,
|
|
"step": 31805
|
|
},
|
|
{
|
|
"entropy": 5.0960643768310545,
|
|
"epoch": 3.0557156580211333,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00040790539439041287,
|
|
"loss": 4.7014,
|
|
"mean_token_accuracy": 0.24244564771652222,
|
|
"num_tokens": 72942415.0,
|
|
"step": 31810
|
|
},
|
|
{
|
|
"entropy": 4.990510129928589,
|
|
"epoch": 3.0561959654178676,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00040787772951143386,
|
|
"loss": 4.4642,
|
|
"mean_token_accuracy": 0.2599411576986313,
|
|
"num_tokens": 72953134.0,
|
|
"step": 31815
|
|
},
|
|
{
|
|
"entropy": 4.966206312179565,
|
|
"epoch": 3.0566762728146015,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004078500615474285,
|
|
"loss": 4.5925,
|
|
"mean_token_accuracy": 0.24848204404115676,
|
|
"num_tokens": 72964390.0,
|
|
"step": 31820
|
|
},
|
|
{
|
|
"entropy": 4.9862120151519775,
|
|
"epoch": 3.0571565802113354,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00040782239049903926,
|
|
"loss": 4.6309,
|
|
"mean_token_accuracy": 0.24385963827371598,
|
|
"num_tokens": 72976899.0,
|
|
"step": 31825
|
|
},
|
|
{
|
|
"entropy": 5.071328496932983,
|
|
"epoch": 3.0576368876080693,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040779471636690845,
|
|
"loss": 4.711,
|
|
"mean_token_accuracy": 0.24568732529878617,
|
|
"num_tokens": 72989074.0,
|
|
"step": 31830
|
|
},
|
|
{
|
|
"entropy": 5.058893823623658,
|
|
"epoch": 3.058117195004803,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040776703915167866,
|
|
"loss": 4.6436,
|
|
"mean_token_accuracy": 0.2440729945898056,
|
|
"num_tokens": 73000434.0,
|
|
"step": 31835
|
|
},
|
|
{
|
|
"entropy": 5.039112997055054,
|
|
"epoch": 3.058597502401537,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00040773935885399254,
|
|
"loss": 4.6076,
|
|
"mean_token_accuracy": 0.24866542369127273,
|
|
"num_tokens": 73011902.0,
|
|
"step": 31840
|
|
},
|
|
{
|
|
"entropy": 5.0925158023834225,
|
|
"epoch": 3.059077809798271,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004077116754744926,
|
|
"loss": 4.6983,
|
|
"mean_token_accuracy": 0.24309301227331162,
|
|
"num_tokens": 73022349.0,
|
|
"step": 31845
|
|
},
|
|
{
|
|
"entropy": 5.008359956741333,
|
|
"epoch": 3.059558117195005,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040768398901382157,
|
|
"loss": 4.57,
|
|
"mean_token_accuracy": 0.25645024329423904,
|
|
"num_tokens": 73033958.0,
|
|
"step": 31850
|
|
},
|
|
{
|
|
"entropy": 5.018523263931274,
|
|
"epoch": 3.0600384245917387,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004076562994726223,
|
|
"loss": 4.5625,
|
|
"mean_token_accuracy": 0.25753410458564757,
|
|
"num_tokens": 73045807.0,
|
|
"step": 31855
|
|
},
|
|
{
|
|
"entropy": 5.078787231445313,
|
|
"epoch": 3.0605187319884726,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004076286068515378,
|
|
"loss": 4.5796,
|
|
"mean_token_accuracy": 0.24990254342556,
|
|
"num_tokens": 73056883.0,
|
|
"step": 31860
|
|
},
|
|
{
|
|
"entropy": 5.0730164527893065,
|
|
"epoch": 3.0609990393852065,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004076009111512108,
|
|
"loss": 4.6016,
|
|
"mean_token_accuracy": 0.2521222934126854,
|
|
"num_tokens": 73067924.0,
|
|
"step": 31865
|
|
},
|
|
{
|
|
"entropy": 5.054807090759278,
|
|
"epoch": 3.0614793467819403,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004075732123722844,
|
|
"loss": 4.6469,
|
|
"mean_token_accuracy": 0.24260211735963821,
|
|
"num_tokens": 73079965.0,
|
|
"step": 31870
|
|
},
|
|
{
|
|
"entropy": 5.052465772628784,
|
|
"epoch": 3.061959654178674,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004075455105154016,
|
|
"loss": 4.672,
|
|
"mean_token_accuracy": 0.24341681152582167,
|
|
"num_tokens": 73091986.0,
|
|
"step": 31875
|
|
},
|
|
{
|
|
"entropy": 5.114961290359497,
|
|
"epoch": 3.062439961575408,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040751780558120573,
|
|
"loss": 4.6762,
|
|
"mean_token_accuracy": 0.24254262447357178,
|
|
"num_tokens": 73102445.0,
|
|
"step": 31880
|
|
},
|
|
{
|
|
"entropy": 5.07858681678772,
|
|
"epoch": 3.062920268972142,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004074900975703398,
|
|
"loss": 4.6021,
|
|
"mean_token_accuracy": 0.2494074746966362,
|
|
"num_tokens": 73112921.0,
|
|
"step": 31885
|
|
},
|
|
{
|
|
"entropy": 5.085866212844849,
|
|
"epoch": 3.063400576368876,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004074623864834473,
|
|
"loss": 4.609,
|
|
"mean_token_accuracy": 0.24578240364789963,
|
|
"num_tokens": 73123653.0,
|
|
"step": 31890
|
|
},
|
|
{
|
|
"entropy": 5.074773836135864,
|
|
"epoch": 3.06388088376561,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004074346723211715,
|
|
"loss": 4.6255,
|
|
"mean_token_accuracy": 0.25057290941476823,
|
|
"num_tokens": 73135568.0,
|
|
"step": 31895
|
|
},
|
|
{
|
|
"entropy": 4.977847385406494,
|
|
"epoch": 3.064361191162344,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00040740695508415583,
|
|
"loss": 4.5762,
|
|
"mean_token_accuracy": 0.2518226861953735,
|
|
"num_tokens": 73145964.0,
|
|
"step": 31900
|
|
},
|
|
{
|
|
"entropy": 4.93131742477417,
|
|
"epoch": 3.064841498559078,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00040737923477304386,
|
|
"loss": 4.5278,
|
|
"mean_token_accuracy": 0.2546772018074989,
|
|
"num_tokens": 73156941.0,
|
|
"step": 31905
|
|
},
|
|
{
|
|
"entropy": 5.045769786834716,
|
|
"epoch": 3.065321805955812,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040735151138847917,
|
|
"loss": 4.555,
|
|
"mean_token_accuracy": 0.24858633130788804,
|
|
"num_tokens": 73167928.0,
|
|
"step": 31910
|
|
},
|
|
{
|
|
"entropy": 4.972653770446778,
|
|
"epoch": 3.0658021133525457,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004073237849311053,
|
|
"loss": 4.5367,
|
|
"mean_token_accuracy": 0.25381753146648406,
|
|
"num_tokens": 73179861.0,
|
|
"step": 31915
|
|
},
|
|
{
|
|
"entropy": 5.035164880752563,
|
|
"epoch": 3.0662824207492796,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004072960554015661,
|
|
"loss": 4.6648,
|
|
"mean_token_accuracy": 0.24390652775764465,
|
|
"num_tokens": 73191152.0,
|
|
"step": 31920
|
|
},
|
|
{
|
|
"entropy": 5.026795482635498,
|
|
"epoch": 3.0667627281460135,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004072683228005055,
|
|
"loss": 4.6204,
|
|
"mean_token_accuracy": 0.2502211079001427,
|
|
"num_tokens": 73202952.0,
|
|
"step": 31925
|
|
},
|
|
{
|
|
"entropy": 4.971695852279663,
|
|
"epoch": 3.0672430355427474,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00040724058712856697,
|
|
"loss": 4.5412,
|
|
"mean_token_accuracy": 0.250248646736145,
|
|
"num_tokens": 73214899.0,
|
|
"step": 31930
|
|
},
|
|
{
|
|
"entropy": 5.007768440246582,
|
|
"epoch": 3.0677233429394812,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004072128483863948,
|
|
"loss": 4.6875,
|
|
"mean_token_accuracy": 0.24407064020633698,
|
|
"num_tokens": 73226599.0,
|
|
"step": 31935
|
|
},
|
|
{
|
|
"entropy": 5.001120471954346,
|
|
"epoch": 3.068203650336215,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004071851065746328,
|
|
"loss": 4.5448,
|
|
"mean_token_accuracy": 0.24898416101932525,
|
|
"num_tokens": 73238588.0,
|
|
"step": 31940
|
|
},
|
|
{
|
|
"entropy": 5.078129243850708,
|
|
"epoch": 3.068683957732949,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004071573616939252,
|
|
"loss": 4.6786,
|
|
"mean_token_accuracy": 0.24310262948274614,
|
|
"num_tokens": 73250727.0,
|
|
"step": 31945
|
|
},
|
|
{
|
|
"entropy": 5.019961357116699,
|
|
"epoch": 3.069164265129683,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004071296137449161,
|
|
"loss": 4.5707,
|
|
"mean_token_accuracy": 0.2604907304048538,
|
|
"num_tokens": 73261937.0,
|
|
"step": 31950
|
|
},
|
|
{
|
|
"entropy": 4.974229001998902,
|
|
"epoch": 3.0696445725264168,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00040710186272824967,
|
|
"loss": 4.5917,
|
|
"mean_token_accuracy": 0.24307173639535903,
|
|
"num_tokens": 73274853.0,
|
|
"step": 31955
|
|
},
|
|
{
|
|
"entropy": 5.068817138671875,
|
|
"epoch": 3.0701248799231506,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004070741086445703,
|
|
"loss": 4.6664,
|
|
"mean_token_accuracy": 0.24610565453767777,
|
|
"num_tokens": 73284901.0,
|
|
"step": 31960
|
|
},
|
|
{
|
|
"entropy": 5.026659536361694,
|
|
"epoch": 3.0706051873198845,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040704635149452223,
|
|
"loss": 4.6434,
|
|
"mean_token_accuracy": 0.24464693963527678,
|
|
"num_tokens": 73295989.0,
|
|
"step": 31965
|
|
},
|
|
{
|
|
"entropy": 5.097943210601807,
|
|
"epoch": 3.071085494716619,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040701859127875,
|
|
"loss": 4.7203,
|
|
"mean_token_accuracy": 0.2406073048710823,
|
|
"num_tokens": 73306661.0,
|
|
"step": 31970
|
|
},
|
|
{
|
|
"entropy": 5.129525518417358,
|
|
"epoch": 3.0715658021133527,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040699082799789814,
|
|
"loss": 4.7095,
|
|
"mean_token_accuracy": 0.23878285884857178,
|
|
"num_tokens": 73318456.0,
|
|
"step": 31975
|
|
},
|
|
{
|
|
"entropy": 5.124619007110596,
|
|
"epoch": 3.0720461095100866,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00040696306165261117,
|
|
"loss": 4.7345,
|
|
"mean_token_accuracy": 0.24056761413812638,
|
|
"num_tokens": 73330577.0,
|
|
"step": 31980
|
|
},
|
|
{
|
|
"entropy": 5.060488891601563,
|
|
"epoch": 3.0725264169068205,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004069352922435337,
|
|
"loss": 4.6589,
|
|
"mean_token_accuracy": 0.24906503558158874,
|
|
"num_tokens": 73341682.0,
|
|
"step": 31985
|
|
},
|
|
{
|
|
"entropy": 5.021332120895385,
|
|
"epoch": 3.0730067243035544,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004069075197713106,
|
|
"loss": 4.6333,
|
|
"mean_token_accuracy": 0.24834639877080916,
|
|
"num_tokens": 73352924.0,
|
|
"step": 31990
|
|
},
|
|
{
|
|
"entropy": 5.041073036193848,
|
|
"epoch": 3.0734870317002883,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040687974423658655,
|
|
"loss": 4.6495,
|
|
"mean_token_accuracy": 0.250208979845047,
|
|
"num_tokens": 73364904.0,
|
|
"step": 31995
|
|
},
|
|
{
|
|
"entropy": 5.111618137359619,
|
|
"epoch": 3.073967339097022,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00040685196564000644,
|
|
"loss": 4.6803,
|
|
"mean_token_accuracy": 0.24720986187458038,
|
|
"num_tokens": 73376680.0,
|
|
"step": 32000
|
|
},
|
|
{
|
|
"entropy": 4.999582099914551,
|
|
"epoch": 3.074447646493756,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00040682418398221517,
|
|
"loss": 4.6003,
|
|
"mean_token_accuracy": 0.2523179829120636,
|
|
"num_tokens": 73388631.0,
|
|
"step": 32005
|
|
},
|
|
{
|
|
"entropy": 5.027138090133667,
|
|
"epoch": 3.07492795389049,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00040679639926385783,
|
|
"loss": 4.6205,
|
|
"mean_token_accuracy": 0.25160788297653197,
|
|
"num_tokens": 73400953.0,
|
|
"step": 32010
|
|
},
|
|
{
|
|
"entropy": 5.050076913833618,
|
|
"epoch": 3.0754082612872238,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004067686114855794,
|
|
"loss": 4.5331,
|
|
"mean_token_accuracy": 0.25700239688158033,
|
|
"num_tokens": 73411167.0,
|
|
"step": 32015
|
|
},
|
|
{
|
|
"entropy": 4.919393301010132,
|
|
"epoch": 3.0758885686839577,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040674082064802507,
|
|
"loss": 4.5571,
|
|
"mean_token_accuracy": 0.2541386589407921,
|
|
"num_tokens": 73422434.0,
|
|
"step": 32020
|
|
},
|
|
{
|
|
"entropy": 5.088863754272461,
|
|
"epoch": 3.0763688760806915,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004067130267518401,
|
|
"loss": 4.6668,
|
|
"mean_token_accuracy": 0.23891474008560182,
|
|
"num_tokens": 73433543.0,
|
|
"step": 32025
|
|
},
|
|
{
|
|
"entropy": 5.165063953399658,
|
|
"epoch": 3.0768491834774254,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004066852297976698,
|
|
"loss": 4.8349,
|
|
"mean_token_accuracy": 0.231815005838871,
|
|
"num_tokens": 73444624.0,
|
|
"step": 32030
|
|
},
|
|
{
|
|
"entropy": 5.046006107330323,
|
|
"epoch": 3.0773294908741593,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004066574297861595,
|
|
"loss": 4.6193,
|
|
"mean_token_accuracy": 0.24984999746084213,
|
|
"num_tokens": 73454446.0,
|
|
"step": 32035
|
|
},
|
|
{
|
|
"entropy": 5.038469409942627,
|
|
"epoch": 3.077809798270893,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00040662962671795454,
|
|
"loss": 4.6141,
|
|
"mean_token_accuracy": 0.23815218806266786,
|
|
"num_tokens": 73465557.0,
|
|
"step": 32040
|
|
},
|
|
{
|
|
"entropy": 5.092407846450806,
|
|
"epoch": 3.0782901056676275,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004066018205937006,
|
|
"loss": 4.6328,
|
|
"mean_token_accuracy": 0.24465030431747437,
|
|
"num_tokens": 73477939.0,
|
|
"step": 32045
|
|
},
|
|
{
|
|
"entropy": 5.025490808486938,
|
|
"epoch": 3.0787704130643614,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004065740114140431,
|
|
"loss": 4.6916,
|
|
"mean_token_accuracy": 0.2474071577191353,
|
|
"num_tokens": 73490715.0,
|
|
"step": 32050
|
|
},
|
|
{
|
|
"entropy": 5.011799049377442,
|
|
"epoch": 3.0792507204610953,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00040654619917962774,
|
|
"loss": 4.5714,
|
|
"mean_token_accuracy": 0.250951412320137,
|
|
"num_tokens": 73503147.0,
|
|
"step": 32055
|
|
},
|
|
{
|
|
"entropy": 5.1020965576171875,
|
|
"epoch": 3.079731027857829,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004065183838911003,
|
|
"loss": 4.6753,
|
|
"mean_token_accuracy": 0.2404956191778183,
|
|
"num_tokens": 73514750.0,
|
|
"step": 32060
|
|
},
|
|
{
|
|
"entropy": 5.023135662078857,
|
|
"epoch": 3.080211335254563,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004064905655491065,
|
|
"loss": 4.6466,
|
|
"mean_token_accuracy": 0.2515063464641571,
|
|
"num_tokens": 73525845.0,
|
|
"step": 32065
|
|
},
|
|
{
|
|
"entropy": 5.136309814453125,
|
|
"epoch": 3.080691642651297,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040646274415429224,
|
|
"loss": 4.6948,
|
|
"mean_token_accuracy": 0.23915023505687713,
|
|
"num_tokens": 73537086.0,
|
|
"step": 32070
|
|
},
|
|
{
|
|
"entropy": 5.008394193649292,
|
|
"epoch": 3.081171950048031,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004064349197073033,
|
|
"loss": 4.5785,
|
|
"mean_token_accuracy": 0.25241281688213346,
|
|
"num_tokens": 73548708.0,
|
|
"step": 32075
|
|
},
|
|
{
|
|
"entropy": 5.098199844360352,
|
|
"epoch": 3.0816522574447647,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004064070922087859,
|
|
"loss": 4.6855,
|
|
"mean_token_accuracy": 0.24001459777355194,
|
|
"num_tokens": 73559703.0,
|
|
"step": 32080
|
|
},
|
|
{
|
|
"entropy": 5.013723945617675,
|
|
"epoch": 3.0821325648414986,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00040637926165938606,
|
|
"loss": 4.5682,
|
|
"mean_token_accuracy": 0.2518329590559006,
|
|
"num_tokens": 73570858.0,
|
|
"step": 32085
|
|
},
|
|
{
|
|
"entropy": 5.0118945121765135,
|
|
"epoch": 3.0826128722382324,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00040635142805974986,
|
|
"loss": 4.6485,
|
|
"mean_token_accuracy": 0.2444664478302002,
|
|
"num_tokens": 73583442.0,
|
|
"step": 32090
|
|
},
|
|
{
|
|
"entropy": 5.038678121566773,
|
|
"epoch": 3.0830931796349663,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004063235914105235,
|
|
"loss": 4.6603,
|
|
"mean_token_accuracy": 0.24657151848077774,
|
|
"num_tokens": 73595336.0,
|
|
"step": 32095
|
|
},
|
|
{
|
|
"entropy": 5.009463834762573,
|
|
"epoch": 3.0835734870317,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00040629575171235327,
|
|
"loss": 4.6539,
|
|
"mean_token_accuracy": 0.2388184517621994,
|
|
"num_tokens": 73606582.0,
|
|
"step": 32100
|
|
},
|
|
{
|
|
"entropy": 5.074433517456055,
|
|
"epoch": 3.084053794428434,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004062679089658856,
|
|
"loss": 4.6588,
|
|
"mean_token_accuracy": 0.24748821556568146,
|
|
"num_tokens": 73618432.0,
|
|
"step": 32105
|
|
},
|
|
{
|
|
"entropy": 5.029176378250122,
|
|
"epoch": 3.084534101825168,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00040624006317176685,
|
|
"loss": 4.5241,
|
|
"mean_token_accuracy": 0.25378997027873995,
|
|
"num_tokens": 73628880.0,
|
|
"step": 32110
|
|
},
|
|
{
|
|
"entropy": 4.98155779838562,
|
|
"epoch": 3.085014409221902,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040621221433064354,
|
|
"loss": 4.6398,
|
|
"mean_token_accuracy": 0.24306266158819198,
|
|
"num_tokens": 73639813.0,
|
|
"step": 32115
|
|
},
|
|
{
|
|
"entropy": 5.170345878601074,
|
|
"epoch": 3.0854947166186357,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004061843624431623,
|
|
"loss": 4.8138,
|
|
"mean_token_accuracy": 0.23637549877166747,
|
|
"num_tokens": 73651276.0,
|
|
"step": 32120
|
|
},
|
|
{
|
|
"entropy": 5.044910049438476,
|
|
"epoch": 3.08597502401537,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00040615650750996956,
|
|
"loss": 4.5826,
|
|
"mean_token_accuracy": 0.2504544660449028,
|
|
"num_tokens": 73663869.0,
|
|
"step": 32125
|
|
},
|
|
{
|
|
"entropy": 5.04099407196045,
|
|
"epoch": 3.086455331412104,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00040612864953171223,
|
|
"loss": 4.6749,
|
|
"mean_token_accuracy": 0.23652782887220383,
|
|
"num_tokens": 73676089.0,
|
|
"step": 32130
|
|
},
|
|
{
|
|
"entropy": 5.044053459167481,
|
|
"epoch": 3.086935638808838,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040610078850903715,
|
|
"loss": 4.6424,
|
|
"mean_token_accuracy": 0.2413918137550354,
|
|
"num_tokens": 73688718.0,
|
|
"step": 32135
|
|
},
|
|
{
|
|
"entropy": 5.012424325942993,
|
|
"epoch": 3.0874159462055717,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00040607292444259094,
|
|
"loss": 4.5776,
|
|
"mean_token_accuracy": 0.24864151626825332,
|
|
"num_tokens": 73700363.0,
|
|
"step": 32140
|
|
},
|
|
{
|
|
"entropy": 4.92096848487854,
|
|
"epoch": 3.0878962536023056,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004060450573330206,
|
|
"loss": 4.4865,
|
|
"mean_token_accuracy": 0.24904475957155228,
|
|
"num_tokens": 73711873.0,
|
|
"step": 32145
|
|
},
|
|
{
|
|
"entropy": 5.008581447601318,
|
|
"epoch": 3.0883765609990395,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00040601718718097325,
|
|
"loss": 4.5498,
|
|
"mean_token_accuracy": 0.2478708654642105,
|
|
"num_tokens": 73722466.0,
|
|
"step": 32150
|
|
},
|
|
{
|
|
"entropy": 5.022934818267823,
|
|
"epoch": 3.0888568683957733,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040598931398709576,
|
|
"loss": 4.6124,
|
|
"mean_token_accuracy": 0.24530298858880997,
|
|
"num_tokens": 73734712.0,
|
|
"step": 32155
|
|
},
|
|
{
|
|
"entropy": 5.189019346237183,
|
|
"epoch": 3.089337175792507,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040596143775203534,
|
|
"loss": 4.8077,
|
|
"mean_token_accuracy": 0.2337539538741112,
|
|
"num_tokens": 73746367.0,
|
|
"step": 32160
|
|
},
|
|
{
|
|
"entropy": 5.022787284851074,
|
|
"epoch": 3.089817483189241,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00040593355847643933,
|
|
"loss": 4.5566,
|
|
"mean_token_accuracy": 0.26082643419504165,
|
|
"num_tokens": 73758223.0,
|
|
"step": 32165
|
|
},
|
|
{
|
|
"entropy": 5.062843608856201,
|
|
"epoch": 3.090297790585975,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004059056761609548,
|
|
"loss": 4.6652,
|
|
"mean_token_accuracy": 0.2431433767080307,
|
|
"num_tokens": 73769278.0,
|
|
"step": 32170
|
|
},
|
|
{
|
|
"entropy": 5.04819803237915,
|
|
"epoch": 3.090778097982709,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004058777908062292,
|
|
"loss": 4.6069,
|
|
"mean_token_accuracy": 0.24774401038885116,
|
|
"num_tokens": 73778999.0,
|
|
"step": 32175
|
|
},
|
|
{
|
|
"entropy": 5.053538417816162,
|
|
"epoch": 3.0912584053794427,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004058499024129099,
|
|
"loss": 4.6141,
|
|
"mean_token_accuracy": 0.24732532650232314,
|
|
"num_tokens": 73789556.0,
|
|
"step": 32180
|
|
},
|
|
{
|
|
"entropy": 5.054281997680664,
|
|
"epoch": 3.0917387127761766,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040582201098164443,
|
|
"loss": 4.6434,
|
|
"mean_token_accuracy": 0.25142176151275636,
|
|
"num_tokens": 73800691.0,
|
|
"step": 32185
|
|
},
|
|
{
|
|
"entropy": 5.051549243927002,
|
|
"epoch": 3.0922190201729105,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040579411651308034,
|
|
"loss": 4.6921,
|
|
"mean_token_accuracy": 0.24632177203893663,
|
|
"num_tokens": 73811353.0,
|
|
"step": 32190
|
|
},
|
|
{
|
|
"entropy": 5.004362773895264,
|
|
"epoch": 3.0926993275696444,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.00040576621900786523,
|
|
"loss": 4.6235,
|
|
"mean_token_accuracy": 0.24753634631633759,
|
|
"num_tokens": 73822531.0,
|
|
"step": 32195
|
|
},
|
|
{
|
|
"entropy": 5.035271883010864,
|
|
"epoch": 3.0931796349663783,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004057383184666468,
|
|
"loss": 4.6885,
|
|
"mean_token_accuracy": 0.2388680472970009,
|
|
"num_tokens": 73833857.0,
|
|
"step": 32200
|
|
},
|
|
{
|
|
"entropy": 5.075099754333496,
|
|
"epoch": 3.0936599423631126,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040571041489007286,
|
|
"loss": 4.6281,
|
|
"mean_token_accuracy": 0.25046379268169405,
|
|
"num_tokens": 73845664.0,
|
|
"step": 32205
|
|
},
|
|
{
|
|
"entropy": 5.013817644119262,
|
|
"epoch": 3.0941402497598465,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00040568250827879127,
|
|
"loss": 4.4909,
|
|
"mean_token_accuracy": 0.2567852586507797,
|
|
"num_tokens": 73856860.0,
|
|
"step": 32210
|
|
},
|
|
{
|
|
"entropy": 4.977983236312866,
|
|
"epoch": 3.0946205571565804,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004056545986334497,
|
|
"loss": 4.6433,
|
|
"mean_token_accuracy": 0.2497049480676651,
|
|
"num_tokens": 73867532.0,
|
|
"step": 32215
|
|
},
|
|
{
|
|
"entropy": 5.0502697944641115,
|
|
"epoch": 3.0951008645533142,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004056266859546965,
|
|
"loss": 4.683,
|
|
"mean_token_accuracy": 0.24047670662403106,
|
|
"num_tokens": 73880458.0,
|
|
"step": 32220
|
|
},
|
|
{
|
|
"entropy": 5.107214832305909,
|
|
"epoch": 3.095581171950048,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004055987702431795,
|
|
"loss": 4.6897,
|
|
"mean_token_accuracy": 0.24284666925668716,
|
|
"num_tokens": 73892842.0,
|
|
"step": 32225
|
|
},
|
|
{
|
|
"entropy": 5.085785531997681,
|
|
"epoch": 3.096061479346782,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040557085149954677,
|
|
"loss": 4.6671,
|
|
"mean_token_accuracy": 0.24765777289867402,
|
|
"num_tokens": 73903491.0,
|
|
"step": 32230
|
|
},
|
|
{
|
|
"entropy": 4.990330410003662,
|
|
"epoch": 3.096541786743516,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040554292972444663,
|
|
"loss": 4.6217,
|
|
"mean_token_accuracy": 0.24172378480434417,
|
|
"num_tokens": 73915255.0,
|
|
"step": 32235
|
|
},
|
|
{
|
|
"entropy": 4.974247837066651,
|
|
"epoch": 3.0970220941402498,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040551500491852735,
|
|
"loss": 4.5191,
|
|
"mean_token_accuracy": 0.26239279806613924,
|
|
"num_tokens": 73925571.0,
|
|
"step": 32240
|
|
},
|
|
{
|
|
"entropy": 5.119597434997559,
|
|
"epoch": 3.0975024015369836,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004054870770824371,
|
|
"loss": 4.6965,
|
|
"mean_token_accuracy": 0.24282235503196717,
|
|
"num_tokens": 73936957.0,
|
|
"step": 32245
|
|
},
|
|
{
|
|
"entropy": 5.1229418277740475,
|
|
"epoch": 3.0979827089337175,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040545914621682445,
|
|
"loss": 4.5914,
|
|
"mean_token_accuracy": 0.25307161509990694,
|
|
"num_tokens": 73948018.0,
|
|
"step": 32250
|
|
},
|
|
{
|
|
"entropy": 5.030294179916382,
|
|
"epoch": 3.0984630163304514,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004054312123223378,
|
|
"loss": 4.6265,
|
|
"mean_token_accuracy": 0.24262434989213943,
|
|
"num_tokens": 73958713.0,
|
|
"step": 32255
|
|
},
|
|
{
|
|
"entropy": 5.038964891433716,
|
|
"epoch": 3.0989433237271853,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040540327539962567,
|
|
"loss": 4.6892,
|
|
"mean_token_accuracy": 0.24435101002454757,
|
|
"num_tokens": 73969299.0,
|
|
"step": 32260
|
|
},
|
|
{
|
|
"entropy": 5.195756816864014,
|
|
"epoch": 3.099423631123919,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00040537533544933674,
|
|
"loss": 4.7559,
|
|
"mean_token_accuracy": 0.2307824045419693,
|
|
"num_tokens": 73980149.0,
|
|
"step": 32265
|
|
},
|
|
{
|
|
"entropy": 5.074833631515503,
|
|
"epoch": 3.099903938520653,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004053473924721197,
|
|
"loss": 4.5805,
|
|
"mean_token_accuracy": 0.24621020555496215,
|
|
"num_tokens": 73991462.0,
|
|
"step": 32270
|
|
},
|
|
{
|
|
"entropy": 5.0965595722198485,
|
|
"epoch": 3.100384245917387,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004053194464686232,
|
|
"loss": 4.6776,
|
|
"mean_token_accuracy": 0.24148496985435486,
|
|
"num_tokens": 74003442.0,
|
|
"step": 32275
|
|
},
|
|
{
|
|
"entropy": 4.947234678268432,
|
|
"epoch": 3.1008645533141213,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004052914974394961,
|
|
"loss": 4.4884,
|
|
"mean_token_accuracy": 0.2542691543698311,
|
|
"num_tokens": 74014394.0,
|
|
"step": 32280
|
|
},
|
|
{
|
|
"entropy": 5.109122323989868,
|
|
"epoch": 3.101344860710855,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00040526354538538735,
|
|
"loss": 4.7513,
|
|
"mean_token_accuracy": 0.24323177933692933,
|
|
"num_tokens": 74025377.0,
|
|
"step": 32285
|
|
},
|
|
{
|
|
"entropy": 5.013289499282837,
|
|
"epoch": 3.101825168107589,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004052355903069459,
|
|
"loss": 4.5951,
|
|
"mean_token_accuracy": 0.2537130072712898,
|
|
"num_tokens": 74037094.0,
|
|
"step": 32290
|
|
},
|
|
{
|
|
"entropy": 4.960607957839966,
|
|
"epoch": 3.102305475504323,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004052076322048207,
|
|
"loss": 4.4895,
|
|
"mean_token_accuracy": 0.2608163744211197,
|
|
"num_tokens": 74048043.0,
|
|
"step": 32295
|
|
},
|
|
{
|
|
"entropy": 4.994056320190429,
|
|
"epoch": 3.1027857829010568,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00040517967107966095,
|
|
"loss": 4.6733,
|
|
"mean_token_accuracy": 0.24547887295484544,
|
|
"num_tokens": 74061026.0,
|
|
"step": 32300
|
|
},
|
|
{
|
|
"entropy": 5.084767150878906,
|
|
"epoch": 3.1032660902977907,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040515170693211584,
|
|
"loss": 4.7038,
|
|
"mean_token_accuracy": 0.24629230946302413,
|
|
"num_tokens": 74072930.0,
|
|
"step": 32305
|
|
},
|
|
{
|
|
"entropy": 5.06509747505188,
|
|
"epoch": 3.1037463976945245,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004051237397628345,
|
|
"loss": 4.6509,
|
|
"mean_token_accuracy": 0.24190901219844818,
|
|
"num_tokens": 74083847.0,
|
|
"step": 32310
|
|
},
|
|
{
|
|
"entropy": 5.071443939208985,
|
|
"epoch": 3.1042267050912584,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004050957695724663,
|
|
"loss": 4.6169,
|
|
"mean_token_accuracy": 0.2452313095331192,
|
|
"num_tokens": 74096187.0,
|
|
"step": 32315
|
|
},
|
|
{
|
|
"entropy": 5.024712753295899,
|
|
"epoch": 3.1047070124879923,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004050677963616607,
|
|
"loss": 4.6233,
|
|
"mean_token_accuracy": 0.2480012759566307,
|
|
"num_tokens": 74108894.0,
|
|
"step": 32320
|
|
},
|
|
{
|
|
"entropy": 5.007587575912476,
|
|
"epoch": 3.105187319884726,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040503982013106706,
|
|
"loss": 4.7044,
|
|
"mean_token_accuracy": 0.24248451441526414,
|
|
"num_tokens": 74119941.0,
|
|
"step": 32325
|
|
},
|
|
{
|
|
"entropy": 5.037981605529785,
|
|
"epoch": 3.10566762728146,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000405011840881335,
|
|
"loss": 4.6051,
|
|
"mean_token_accuracy": 0.24886149317026138,
|
|
"num_tokens": 74131550.0,
|
|
"step": 32330
|
|
},
|
|
{
|
|
"entropy": 5.0070899486541744,
|
|
"epoch": 3.106147934678194,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004049838586131139,
|
|
"loss": 4.5378,
|
|
"mean_token_accuracy": 0.2561880812048912,
|
|
"num_tokens": 74142530.0,
|
|
"step": 32335
|
|
},
|
|
{
|
|
"entropy": 4.993922376632691,
|
|
"epoch": 3.106628242074928,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004049558733270537,
|
|
"loss": 4.6352,
|
|
"mean_token_accuracy": 0.24153310656547547,
|
|
"num_tokens": 74155117.0,
|
|
"step": 32340
|
|
},
|
|
{
|
|
"entropy": 5.09845290184021,
|
|
"epoch": 3.1071085494716617,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000404927885023804,
|
|
"loss": 4.7432,
|
|
"mean_token_accuracy": 0.23723605871200562,
|
|
"num_tokens": 74165880.0,
|
|
"step": 32345
|
|
},
|
|
{
|
|
"entropy": 4.944010925292969,
|
|
"epoch": 3.1075888568683956,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040489989370401456,
|
|
"loss": 4.4294,
|
|
"mean_token_accuracy": 0.2656581252813339,
|
|
"num_tokens": 74175777.0,
|
|
"step": 32350
|
|
},
|
|
{
|
|
"entropy": 5.002707529067993,
|
|
"epoch": 3.1080691642651295,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004048718993683353,
|
|
"loss": 4.6105,
|
|
"mean_token_accuracy": 0.24675467163324355,
|
|
"num_tokens": 74187745.0,
|
|
"step": 32355
|
|
},
|
|
{
|
|
"entropy": 4.991137838363647,
|
|
"epoch": 3.108549471661864,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00040484390201741627,
|
|
"loss": 4.6143,
|
|
"mean_token_accuracy": 0.24088763147592546,
|
|
"num_tokens": 74198606.0,
|
|
"step": 32360
|
|
},
|
|
{
|
|
"entropy": 5.1798583507537845,
|
|
"epoch": 3.1090297790585977,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004048159016519073,
|
|
"loss": 4.7346,
|
|
"mean_token_accuracy": 0.23228639662265776,
|
|
"num_tokens": 74210530.0,
|
|
"step": 32365
|
|
},
|
|
{
|
|
"entropy": 4.999586725234986,
|
|
"epoch": 3.1095100864553316,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004047878982724586,
|
|
"loss": 4.5747,
|
|
"mean_token_accuracy": 0.25134737193584444,
|
|
"num_tokens": 74222155.0,
|
|
"step": 32370
|
|
},
|
|
{
|
|
"entropy": 5.147057437896729,
|
|
"epoch": 3.1099903938520654,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00040475989187972034,
|
|
"loss": 4.7621,
|
|
"mean_token_accuracy": 0.23894109278917314,
|
|
"num_tokens": 74233003.0,
|
|
"step": 32375
|
|
},
|
|
{
|
|
"entropy": 5.058832311630249,
|
|
"epoch": 3.1104707012487993,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040473188247434265,
|
|
"loss": 4.5372,
|
|
"mean_token_accuracy": 0.25587199479341505,
|
|
"num_tokens": 74244013.0,
|
|
"step": 32380
|
|
},
|
|
{
|
|
"entropy": 5.03604474067688,
|
|
"epoch": 3.110951008645533,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040470387005697587,
|
|
"loss": 4.6202,
|
|
"mean_token_accuracy": 0.2468089148402214,
|
|
"num_tokens": 74256010.0,
|
|
"step": 32385
|
|
},
|
|
{
|
|
"entropy": 5.095707035064697,
|
|
"epoch": 3.111431316042267,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004046758546282704,
|
|
"loss": 4.6826,
|
|
"mean_token_accuracy": 0.23876849859952926,
|
|
"num_tokens": 74268680.0,
|
|
"step": 32390
|
|
},
|
|
{
|
|
"entropy": 5.128348350524902,
|
|
"epoch": 3.111911623439001,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004046478361888766,
|
|
"loss": 4.7623,
|
|
"mean_token_accuracy": 0.23709394335746764,
|
|
"num_tokens": 74280807.0,
|
|
"step": 32395
|
|
},
|
|
{
|
|
"entropy": 5.055472660064697,
|
|
"epoch": 3.112391930835735,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000404619814739445,
|
|
"loss": 4.6744,
|
|
"mean_token_accuracy": 0.24537662863731385,
|
|
"num_tokens": 74292076.0,
|
|
"step": 32400
|
|
},
|
|
{
|
|
"entropy": 5.100376605987549,
|
|
"epoch": 3.1128722382324687,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004045917902806263,
|
|
"loss": 4.665,
|
|
"mean_token_accuracy": 0.24423255324363707,
|
|
"num_tokens": 74303965.0,
|
|
"step": 32405
|
|
},
|
|
{
|
|
"entropy": 4.975015449523926,
|
|
"epoch": 3.1133525456292026,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000404563762813071,
|
|
"loss": 4.5322,
|
|
"mean_token_accuracy": 0.25500834733247757,
|
|
"num_tokens": 74315774.0,
|
|
"step": 32410
|
|
},
|
|
{
|
|
"entropy": 4.988288164138794,
|
|
"epoch": 3.1138328530259365,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004045357323374298,
|
|
"loss": 4.5869,
|
|
"mean_token_accuracy": 0.25064926892518996,
|
|
"num_tokens": 74326624.0,
|
|
"step": 32415
|
|
},
|
|
{
|
|
"entropy": 5.026540088653564,
|
|
"epoch": 3.1143131604226704,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00040450769885435364,
|
|
"loss": 4.6438,
|
|
"mean_token_accuracy": 0.24512701481580734,
|
|
"num_tokens": 74337595.0,
|
|
"step": 32420
|
|
},
|
|
{
|
|
"entropy": 5.070287609100342,
|
|
"epoch": 3.1147934678194042,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040447966236449313,
|
|
"loss": 4.6207,
|
|
"mean_token_accuracy": 0.2501947954297066,
|
|
"num_tokens": 74347695.0,
|
|
"step": 32425
|
|
},
|
|
{
|
|
"entropy": 4.9649329662323,
|
|
"epoch": 3.115273775216138,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00040445162286849935,
|
|
"loss": 4.5405,
|
|
"mean_token_accuracy": 0.2583227038383484,
|
|
"num_tokens": 74358491.0,
|
|
"step": 32430
|
|
},
|
|
{
|
|
"entropy": 5.066698455810547,
|
|
"epoch": 3.115754082612872,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00040442358036702343,
|
|
"loss": 4.7051,
|
|
"mean_token_accuracy": 0.24266576766967773,
|
|
"num_tokens": 74371038.0,
|
|
"step": 32435
|
|
},
|
|
{
|
|
"entropy": 5.048079109191894,
|
|
"epoch": 3.1162343900096063,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004043955348607161,
|
|
"loss": 4.6636,
|
|
"mean_token_accuracy": 0.24872395098209382,
|
|
"num_tokens": 74381449.0,
|
|
"step": 32440
|
|
},
|
|
{
|
|
"entropy": 5.04279842376709,
|
|
"epoch": 3.11671469740634,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004043674863502288,
|
|
"loss": 4.5907,
|
|
"mean_token_accuracy": 0.2526773661375046,
|
|
"num_tokens": 74392970.0,
|
|
"step": 32445
|
|
},
|
|
{
|
|
"entropy": 4.9978090763092045,
|
|
"epoch": 3.117195004803074,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00040433943483621253,
|
|
"loss": 4.5426,
|
|
"mean_token_accuracy": 0.2519802376627922,
|
|
"num_tokens": 74404243.0,
|
|
"step": 32450
|
|
},
|
|
{
|
|
"entropy": 5.077501392364502,
|
|
"epoch": 3.117675312199808,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004043113803193187,
|
|
"loss": 4.7123,
|
|
"mean_token_accuracy": 0.24618444442749024,
|
|
"num_tokens": 74415546.0,
|
|
"step": 32455
|
|
},
|
|
{
|
|
"entropy": 5.010422039031982,
|
|
"epoch": 3.118155619596542,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00040428332280019864,
|
|
"loss": 4.6038,
|
|
"mean_token_accuracy": 0.25266663581132887,
|
|
"num_tokens": 74427138.0,
|
|
"step": 32460
|
|
},
|
|
{
|
|
"entropy": 4.997100400924682,
|
|
"epoch": 3.1186359269932757,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004042552622795036,
|
|
"loss": 4.5335,
|
|
"mean_token_accuracy": 0.24878908544778824,
|
|
"num_tokens": 74438088.0,
|
|
"step": 32465
|
|
},
|
|
{
|
|
"entropy": 5.05629243850708,
|
|
"epoch": 3.1191162343900096,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004042271987578852,
|
|
"loss": 4.6924,
|
|
"mean_token_accuracy": 0.24206115007400514,
|
|
"num_tokens": 74450310.0,
|
|
"step": 32470
|
|
},
|
|
{
|
|
"entropy": 5.05663366317749,
|
|
"epoch": 3.1195965417867435,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040419913223599505,
|
|
"loss": 4.5353,
|
|
"mean_token_accuracy": 0.2491496294736862,
|
|
"num_tokens": 74461128.0,
|
|
"step": 32475
|
|
},
|
|
{
|
|
"entropy": 5.04512767791748,
|
|
"epoch": 3.1200768491834774,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00040417106271448464,
|
|
"loss": 4.66,
|
|
"mean_token_accuracy": 0.24802774637937547,
|
|
"num_tokens": 74472459.0,
|
|
"step": 32480
|
|
},
|
|
{
|
|
"entropy": 4.929509115219116,
|
|
"epoch": 3.1205571565802113,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004041429901940057,
|
|
"loss": 4.5578,
|
|
"mean_token_accuracy": 0.2539891391992569,
|
|
"num_tokens": 74485257.0,
|
|
"step": 32485
|
|
},
|
|
{
|
|
"entropy": 4.945969963073731,
|
|
"epoch": 3.121037463976945,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040411491467521,
|
|
"loss": 4.5456,
|
|
"mean_token_accuracy": 0.2536383971571922,
|
|
"num_tokens": 74496277.0,
|
|
"step": 32490
|
|
},
|
|
{
|
|
"entropy": 5.058421993255616,
|
|
"epoch": 3.121517771373679,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004040868361587494,
|
|
"loss": 4.6489,
|
|
"mean_token_accuracy": 0.24318694770336152,
|
|
"num_tokens": 74508083.0,
|
|
"step": 32495
|
|
},
|
|
{
|
|
"entropy": 5.097837829589844,
|
|
"epoch": 3.121998078770413,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004040587546452758,
|
|
"loss": 4.8113,
|
|
"mean_token_accuracy": 0.22928981035947799,
|
|
"num_tokens": 74520403.0,
|
|
"step": 32500
|
|
},
|
|
{
|
|
"entropy": 5.079450845718384,
|
|
"epoch": 3.122478386167147,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00040403067013544116,
|
|
"loss": 4.6485,
|
|
"mean_token_accuracy": 0.243613338470459,
|
|
"num_tokens": 74531657.0,
|
|
"step": 32505
|
|
},
|
|
{
|
|
"entropy": 5.081295108795166,
|
|
"epoch": 3.1229586935638807,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00040400258262989744,
|
|
"loss": 4.6174,
|
|
"mean_token_accuracy": 0.25055552572011947,
|
|
"num_tokens": 74541841.0,
|
|
"step": 32510
|
|
},
|
|
{
|
|
"entropy": 5.108765125274658,
|
|
"epoch": 3.123439000960615,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040397449212929676,
|
|
"loss": 4.6998,
|
|
"mean_token_accuracy": 0.24419642835855485,
|
|
"num_tokens": 74553312.0,
|
|
"step": 32515
|
|
},
|
|
{
|
|
"entropy": 5.089429330825806,
|
|
"epoch": 3.123919308357349,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004039463986342914,
|
|
"loss": 4.6735,
|
|
"mean_token_accuracy": 0.24255333691835404,
|
|
"num_tokens": 74564470.0,
|
|
"step": 32520
|
|
},
|
|
{
|
|
"entropy": 5.055105543136596,
|
|
"epoch": 3.1243996157540828,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040391830214553365,
|
|
"loss": 4.6643,
|
|
"mean_token_accuracy": 0.2442426785826683,
|
|
"num_tokens": 74575548.0,
|
|
"step": 32525
|
|
},
|
|
{
|
|
"entropy": 5.062867975234985,
|
|
"epoch": 3.1248799231508166,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004038902026636756,
|
|
"loss": 4.6639,
|
|
"mean_token_accuracy": 0.2471578299999237,
|
|
"num_tokens": 74587470.0,
|
|
"step": 32530
|
|
},
|
|
{
|
|
"entropy": 5.053456354141235,
|
|
"epoch": 3.1253602305475505,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004038621001893698,
|
|
"loss": 4.5787,
|
|
"mean_token_accuracy": 0.2550749212503433,
|
|
"num_tokens": 74598743.0,
|
|
"step": 32535
|
|
},
|
|
{
|
|
"entropy": 5.0474076747894285,
|
|
"epoch": 3.1258405379442844,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040383399472326874,
|
|
"loss": 4.6202,
|
|
"mean_token_accuracy": 0.24675973057746886,
|
|
"num_tokens": 74609057.0,
|
|
"step": 32540
|
|
},
|
|
{
|
|
"entropy": 5.058999300003052,
|
|
"epoch": 3.1263208453410183,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00040380588626602484,
|
|
"loss": 4.7125,
|
|
"mean_token_accuracy": 0.238733471930027,
|
|
"num_tokens": 74620938.0,
|
|
"step": 32545
|
|
},
|
|
{
|
|
"entropy": 5.054383659362793,
|
|
"epoch": 3.126801152737752,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004037777748182907,
|
|
"loss": 4.606,
|
|
"mean_token_accuracy": 0.24833089411258696,
|
|
"num_tokens": 74631877.0,
|
|
"step": 32550
|
|
},
|
|
{
|
|
"entropy": 5.107970142364502,
|
|
"epoch": 3.127281460134486,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004037496603807191,
|
|
"loss": 4.68,
|
|
"mean_token_accuracy": 0.24343400448560715,
|
|
"num_tokens": 74641921.0,
|
|
"step": 32555
|
|
},
|
|
{
|
|
"entropy": 4.992760705947876,
|
|
"epoch": 3.12776176753122,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004037215429539626,
|
|
"loss": 4.5844,
|
|
"mean_token_accuracy": 0.2517207324504852,
|
|
"num_tokens": 74652950.0,
|
|
"step": 32560
|
|
},
|
|
{
|
|
"entropy": 5.068590259552002,
|
|
"epoch": 3.128242074927954,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040369342253867413,
|
|
"loss": 4.6339,
|
|
"mean_token_accuracy": 0.25402029752731325,
|
|
"num_tokens": 74665512.0,
|
|
"step": 32565
|
|
},
|
|
{
|
|
"entropy": 5.074625062942505,
|
|
"epoch": 3.1287223823246877,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004036652991355066,
|
|
"loss": 4.6357,
|
|
"mean_token_accuracy": 0.25480311959981916,
|
|
"num_tokens": 74675833.0,
|
|
"step": 32570
|
|
},
|
|
{
|
|
"entropy": 4.91657395362854,
|
|
"epoch": 3.1292026897214216,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004036371727451128,
|
|
"loss": 4.556,
|
|
"mean_token_accuracy": 0.2552287966012955,
|
|
"num_tokens": 74688356.0,
|
|
"step": 32575
|
|
},
|
|
{
|
|
"entropy": 4.994753122329712,
|
|
"epoch": 3.1296829971181555,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040360904336814586,
|
|
"loss": 4.584,
|
|
"mean_token_accuracy": 0.25450084954500196,
|
|
"num_tokens": 74699617.0,
|
|
"step": 32580
|
|
},
|
|
{
|
|
"entropy": 4.967467546463013,
|
|
"epoch": 3.1301633045148893,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004035809110052588,
|
|
"loss": 4.5129,
|
|
"mean_token_accuracy": 0.2529334306716919,
|
|
"num_tokens": 74711319.0,
|
|
"step": 32585
|
|
},
|
|
{
|
|
"entropy": 4.942798805236817,
|
|
"epoch": 3.1306436119116237,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004035527756571048,
|
|
"loss": 4.5741,
|
|
"mean_token_accuracy": 0.2528795599937439,
|
|
"num_tokens": 74723378.0,
|
|
"step": 32590
|
|
},
|
|
{
|
|
"entropy": 5.046454286575317,
|
|
"epoch": 3.1311239193083575,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00040352463732433707,
|
|
"loss": 4.6337,
|
|
"mean_token_accuracy": 0.2438918486237526,
|
|
"num_tokens": 74734011.0,
|
|
"step": 32595
|
|
},
|
|
{
|
|
"entropy": 5.0227948188781735,
|
|
"epoch": 3.1316042267050914,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040349649600760894,
|
|
"loss": 4.6183,
|
|
"mean_token_accuracy": 0.25802345871925353,
|
|
"num_tokens": 74745534.0,
|
|
"step": 32600
|
|
},
|
|
{
|
|
"entropy": 5.0454918384552006,
|
|
"epoch": 3.1320845341018253,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004034683517075737,
|
|
"loss": 4.6479,
|
|
"mean_token_accuracy": 0.25018918663263323,
|
|
"num_tokens": 74757288.0,
|
|
"step": 32605
|
|
},
|
|
{
|
|
"entropy": 4.996994590759277,
|
|
"epoch": 3.132564841498559,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00040344020442488476,
|
|
"loss": 4.6325,
|
|
"mean_token_accuracy": 0.2472263753414154,
|
|
"num_tokens": 74770520.0,
|
|
"step": 32610
|
|
},
|
|
{
|
|
"entropy": 4.991967868804932,
|
|
"epoch": 3.133045148895293,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040341205416019577,
|
|
"loss": 4.6022,
|
|
"mean_token_accuracy": 0.25307370722293854,
|
|
"num_tokens": 74781967.0,
|
|
"step": 32615
|
|
},
|
|
{
|
|
"entropy": 5.0619368076324465,
|
|
"epoch": 3.133525456292027,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004033839009141601,
|
|
"loss": 4.6281,
|
|
"mean_token_accuracy": 0.24463185667991638,
|
|
"num_tokens": 74793022.0,
|
|
"step": 32620
|
|
},
|
|
{
|
|
"entropy": 5.021758079528809,
|
|
"epoch": 3.134005763688761,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040335574468743145,
|
|
"loss": 4.6094,
|
|
"mean_token_accuracy": 0.25471110343933107,
|
|
"num_tokens": 74805165.0,
|
|
"step": 32625
|
|
},
|
|
{
|
|
"entropy": 5.006101131439209,
|
|
"epoch": 3.1344860710854947,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004033275854806636,
|
|
"loss": 4.6742,
|
|
"mean_token_accuracy": 0.2463693603873253,
|
|
"num_tokens": 74816607.0,
|
|
"step": 32630
|
|
},
|
|
{
|
|
"entropy": 4.991056966781616,
|
|
"epoch": 3.1349663784822286,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004032994232945103,
|
|
"loss": 4.6592,
|
|
"mean_token_accuracy": 0.24665045738220215,
|
|
"num_tokens": 74827606.0,
|
|
"step": 32635
|
|
},
|
|
{
|
|
"entropy": 5.031443881988525,
|
|
"epoch": 3.1354466858789625,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004032712581296253,
|
|
"loss": 4.5666,
|
|
"mean_token_accuracy": 0.2526061311364174,
|
|
"num_tokens": 74837507.0,
|
|
"step": 32640
|
|
},
|
|
{
|
|
"entropy": 5.039136600494385,
|
|
"epoch": 3.1359269932756964,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00040324308998666267,
|
|
"loss": 4.6673,
|
|
"mean_token_accuracy": 0.23908869177103043,
|
|
"num_tokens": 74848164.0,
|
|
"step": 32645
|
|
},
|
|
{
|
|
"entropy": 5.0491025924682615,
|
|
"epoch": 3.1364073006724302,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00040321491886627614,
|
|
"loss": 4.6114,
|
|
"mean_token_accuracy": 0.2472015827894211,
|
|
"num_tokens": 74858010.0,
|
|
"step": 32650
|
|
},
|
|
{
|
|
"entropy": 5.024115800857544,
|
|
"epoch": 3.136887608069164,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040318674476912006,
|
|
"loss": 4.569,
|
|
"mean_token_accuracy": 0.24728527516126633,
|
|
"num_tokens": 74868654.0,
|
|
"step": 32655
|
|
},
|
|
{
|
|
"entropy": 5.099744749069214,
|
|
"epoch": 3.137367915465898,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004031585676958483,
|
|
"loss": 4.7081,
|
|
"mean_token_accuracy": 0.24407673478126526,
|
|
"num_tokens": 74879751.0,
|
|
"step": 32660
|
|
},
|
|
{
|
|
"entropy": 5.027320194244385,
|
|
"epoch": 3.1378482228626323,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040313038764711517,
|
|
"loss": 4.6401,
|
|
"mean_token_accuracy": 0.24803238958120347,
|
|
"num_tokens": 74890569.0,
|
|
"step": 32665
|
|
},
|
|
{
|
|
"entropy": 5.08983359336853,
|
|
"epoch": 3.138328530259366,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00040310220462357494,
|
|
"loss": 4.6672,
|
|
"mean_token_accuracy": 0.23627711683511735,
|
|
"num_tokens": 74902078.0,
|
|
"step": 32670
|
|
},
|
|
{
|
|
"entropy": 5.068285226821899,
|
|
"epoch": 3.1388088376561,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004030740186258819,
|
|
"loss": 4.6392,
|
|
"mean_token_accuracy": 0.24788211435079574,
|
|
"num_tokens": 74913228.0,
|
|
"step": 32675
|
|
},
|
|
{
|
|
"entropy": 5.011826419830323,
|
|
"epoch": 3.139289145052834,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004030458296546905,
|
|
"loss": 4.6343,
|
|
"mean_token_accuracy": 0.2474340334534645,
|
|
"num_tokens": 74925419.0,
|
|
"step": 32680
|
|
},
|
|
{
|
|
"entropy": 5.0667013168334964,
|
|
"epoch": 3.139769452449568,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00040301763771065504,
|
|
"loss": 4.6582,
|
|
"mean_token_accuracy": 0.24502648413181305,
|
|
"num_tokens": 74936219.0,
|
|
"step": 32685
|
|
},
|
|
{
|
|
"entropy": 5.016709995269776,
|
|
"epoch": 3.1402497598463017,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004029894427944302,
|
|
"loss": 4.6411,
|
|
"mean_token_accuracy": 0.25027802437543867,
|
|
"num_tokens": 74948015.0,
|
|
"step": 32690
|
|
},
|
|
{
|
|
"entropy": 5.030672311782837,
|
|
"epoch": 3.1407300672430356,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040296124490667065,
|
|
"loss": 4.6175,
|
|
"mean_token_accuracy": 0.2463405415415764,
|
|
"num_tokens": 74959753.0,
|
|
"step": 32695
|
|
},
|
|
{
|
|
"entropy": 4.999003887176514,
|
|
"epoch": 3.1412103746397695,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004029330440480308,
|
|
"loss": 4.5653,
|
|
"mean_token_accuracy": 0.2514141842722893,
|
|
"num_tokens": 74970696.0,
|
|
"step": 32700
|
|
},
|
|
{
|
|
"entropy": 5.0660217761993405,
|
|
"epoch": 3.1416906820365034,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004029048402191656,
|
|
"loss": 4.6656,
|
|
"mean_token_accuracy": 0.2468088760972023,
|
|
"num_tokens": 74982070.0,
|
|
"step": 32705
|
|
},
|
|
{
|
|
"entropy": 5.025319671630859,
|
|
"epoch": 3.1421709894332372,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004028766334207299,
|
|
"loss": 4.6414,
|
|
"mean_token_accuracy": 0.25440729707479476,
|
|
"num_tokens": 74992823.0,
|
|
"step": 32710
|
|
},
|
|
{
|
|
"entropy": 4.922139263153076,
|
|
"epoch": 3.142651296829971,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004028484236533784,
|
|
"loss": 4.5103,
|
|
"mean_token_accuracy": 0.25190082639455796,
|
|
"num_tokens": 75004456.0,
|
|
"step": 32715
|
|
},
|
|
{
|
|
"entropy": 5.1129429817199705,
|
|
"epoch": 3.143131604226705,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040282021091776624,
|
|
"loss": 4.7352,
|
|
"mean_token_accuracy": 0.23567797243595123,
|
|
"num_tokens": 75017039.0,
|
|
"step": 32720
|
|
},
|
|
{
|
|
"entropy": 5.042050075531006,
|
|
"epoch": 3.143611911623439,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004027919952145482,
|
|
"loss": 4.5894,
|
|
"mean_token_accuracy": 0.24799265563488007,
|
|
"num_tokens": 75028155.0,
|
|
"step": 32725
|
|
},
|
|
{
|
|
"entropy": 5.044452762603759,
|
|
"epoch": 3.1440922190201728,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004027637765443795,
|
|
"loss": 4.6271,
|
|
"mean_token_accuracy": 0.2433112919330597,
|
|
"num_tokens": 75039529.0,
|
|
"step": 32730
|
|
},
|
|
{
|
|
"entropy": 5.03828911781311,
|
|
"epoch": 3.1445725264169067,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040273555490791534,
|
|
"loss": 4.6377,
|
|
"mean_token_accuracy": 0.24551484733819962,
|
|
"num_tokens": 75051205.0,
|
|
"step": 32735
|
|
},
|
|
{
|
|
"entropy": 5.053845071792603,
|
|
"epoch": 3.1450528338136405,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004027073303058109,
|
|
"loss": 4.6596,
|
|
"mean_token_accuracy": 0.25166534036397936,
|
|
"num_tokens": 75063379.0,
|
|
"step": 32740
|
|
},
|
|
{
|
|
"entropy": 5.039496374130249,
|
|
"epoch": 3.1455331412103744,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004026791027387214,
|
|
"loss": 4.6273,
|
|
"mean_token_accuracy": 0.24387053847312928,
|
|
"num_tokens": 75074034.0,
|
|
"step": 32745
|
|
},
|
|
{
|
|
"entropy": 5.083844327926636,
|
|
"epoch": 3.1460134486071087,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004026508722073024,
|
|
"loss": 4.6436,
|
|
"mean_token_accuracy": 0.23956361413002014,
|
|
"num_tokens": 75087004.0,
|
|
"step": 32750
|
|
},
|
|
{
|
|
"entropy": 5.0490403175354,
|
|
"epoch": 3.1464937560038426,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040262263871220904,
|
|
"loss": 4.6842,
|
|
"mean_token_accuracy": 0.2463624134659767,
|
|
"num_tokens": 75100270.0,
|
|
"step": 32755
|
|
},
|
|
{
|
|
"entropy": 5.005458498001099,
|
|
"epoch": 3.1469740634005765,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004025944022540971,
|
|
"loss": 4.5903,
|
|
"mean_token_accuracy": 0.25398978739976885,
|
|
"num_tokens": 75113116.0,
|
|
"step": 32760
|
|
},
|
|
{
|
|
"entropy": 5.084502840042115,
|
|
"epoch": 3.1474543707973104,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00040256616283362195,
|
|
"loss": 4.7061,
|
|
"mean_token_accuracy": 0.24278795272111892,
|
|
"num_tokens": 75124225.0,
|
|
"step": 32765
|
|
},
|
|
{
|
|
"entropy": 5.027612638473511,
|
|
"epoch": 3.1479346781940443,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040253792045143926,
|
|
"loss": 4.5723,
|
|
"mean_token_accuracy": 0.2548971638083458,
|
|
"num_tokens": 75135859.0,
|
|
"step": 32770
|
|
},
|
|
{
|
|
"entropy": 5.14524474143982,
|
|
"epoch": 3.148414985590778,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004025096751082048,
|
|
"loss": 4.7552,
|
|
"mean_token_accuracy": 0.24182595908641816,
|
|
"num_tokens": 75148626.0,
|
|
"step": 32775
|
|
},
|
|
{
|
|
"entropy": 5.047990942001343,
|
|
"epoch": 3.148895292987512,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004024814268045743,
|
|
"loss": 4.6451,
|
|
"mean_token_accuracy": 0.2424531862139702,
|
|
"num_tokens": 75159539.0,
|
|
"step": 32780
|
|
},
|
|
{
|
|
"entropy": 5.037782907485962,
|
|
"epoch": 3.149375600384246,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040245317554120363,
|
|
"loss": 4.6381,
|
|
"mean_token_accuracy": 0.24708918929100038,
|
|
"num_tokens": 75172393.0,
|
|
"step": 32785
|
|
},
|
|
{
|
|
"entropy": 5.08271894454956,
|
|
"epoch": 3.14985590778098,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004024249213187487,
|
|
"loss": 4.6625,
|
|
"mean_token_accuracy": 0.24983277618885041,
|
|
"num_tokens": 75183777.0,
|
|
"step": 32790
|
|
},
|
|
{
|
|
"entropy": 4.977076625823974,
|
|
"epoch": 3.1503362151777137,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004023966641378655,
|
|
"loss": 4.5685,
|
|
"mean_token_accuracy": 0.2511023834347725,
|
|
"num_tokens": 75194976.0,
|
|
"step": 32795
|
|
},
|
|
{
|
|
"entropy": 4.969881248474121,
|
|
"epoch": 3.1508165225744476,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00040236840399920996,
|
|
"loss": 4.582,
|
|
"mean_token_accuracy": 0.24932862371206282,
|
|
"num_tokens": 75207709.0,
|
|
"step": 32800
|
|
},
|
|
{
|
|
"entropy": 4.944665145874024,
|
|
"epoch": 3.1512968299711814,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040234014090343833,
|
|
"loss": 4.5121,
|
|
"mean_token_accuracy": 0.2562754929065704,
|
|
"num_tokens": 75217848.0,
|
|
"step": 32805
|
|
},
|
|
{
|
|
"entropy": 5.089080810546875,
|
|
"epoch": 3.1517771373679153,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004023118748512068,
|
|
"loss": 4.6909,
|
|
"mean_token_accuracy": 0.24239312559366227,
|
|
"num_tokens": 75229670.0,
|
|
"step": 32810
|
|
},
|
|
{
|
|
"entropy": 4.988765907287598,
|
|
"epoch": 3.152257444764649,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004022836058431715,
|
|
"loss": 4.5864,
|
|
"mean_token_accuracy": 0.25130273699760436,
|
|
"num_tokens": 75241752.0,
|
|
"step": 32815
|
|
},
|
|
{
|
|
"entropy": 4.97612624168396,
|
|
"epoch": 3.152737752161383,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00040225533387998883,
|
|
"loss": 4.582,
|
|
"mean_token_accuracy": 0.2506746083498001,
|
|
"num_tokens": 75252818.0,
|
|
"step": 32820
|
|
},
|
|
{
|
|
"entropy": 5.016368865966797,
|
|
"epoch": 3.1532180595581174,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004022270589623152,
|
|
"loss": 4.6216,
|
|
"mean_token_accuracy": 0.24806735217571257,
|
|
"num_tokens": 75264076.0,
|
|
"step": 32825
|
|
},
|
|
{
|
|
"entropy": 5.069680118560791,
|
|
"epoch": 3.1536983669548513,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000402198781090807,
|
|
"loss": 4.646,
|
|
"mean_token_accuracy": 0.24499400407075883,
|
|
"num_tokens": 75275281.0,
|
|
"step": 32830
|
|
},
|
|
{
|
|
"entropy": 4.994699382781983,
|
|
"epoch": 3.154178674351585,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004021705002661208,
|
|
"loss": 4.6135,
|
|
"mean_token_accuracy": 0.24449268132448196,
|
|
"num_tokens": 75287443.0,
|
|
"step": 32835
|
|
},
|
|
{
|
|
"entropy": 4.988004922866821,
|
|
"epoch": 3.154658981748319,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004021422164889133,
|
|
"loss": 4.638,
|
|
"mean_token_accuracy": 0.24222270548343658,
|
|
"num_tokens": 75299256.0,
|
|
"step": 32840
|
|
},
|
|
{
|
|
"entropy": 5.096055030822754,
|
|
"epoch": 3.155139289145053,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000402113929759841,
|
|
"loss": 4.6486,
|
|
"mean_token_accuracy": 0.24704242646694183,
|
|
"num_tokens": 75310485.0,
|
|
"step": 32845
|
|
},
|
|
{
|
|
"entropy": 5.10506682395935,
|
|
"epoch": 3.155619596541787,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040208564007956075,
|
|
"loss": 4.6364,
|
|
"mean_token_accuracy": 0.2465073361992836,
|
|
"num_tokens": 75322826.0,
|
|
"step": 32850
|
|
},
|
|
{
|
|
"entropy": 5.00804877281189,
|
|
"epoch": 3.1560999039385207,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004020573474487293,
|
|
"loss": 4.6055,
|
|
"mean_token_accuracy": 0.25069845020771026,
|
|
"num_tokens": 75334826.0,
|
|
"step": 32855
|
|
},
|
|
{
|
|
"entropy": 4.932596969604492,
|
|
"epoch": 3.1565802113352546,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00040202905186800347,
|
|
"loss": 4.5534,
|
|
"mean_token_accuracy": 0.25578352212905886,
|
|
"num_tokens": 75345997.0,
|
|
"step": 32860
|
|
},
|
|
{
|
|
"entropy": 5.094801235198974,
|
|
"epoch": 3.1570605187319885,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004020007533380403,
|
|
"loss": 4.6379,
|
|
"mean_token_accuracy": 0.23823827058076857,
|
|
"num_tokens": 75357018.0,
|
|
"step": 32865
|
|
},
|
|
{
|
|
"entropy": 5.011976289749145,
|
|
"epoch": 3.1575408261287223,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004019724518594967,
|
|
"loss": 4.6183,
|
|
"mean_token_accuracy": 0.2500107690691948,
|
|
"num_tokens": 75368190.0,
|
|
"step": 32870
|
|
},
|
|
{
|
|
"entropy": 5.001093578338623,
|
|
"epoch": 3.158021133525456,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004019441474330298,
|
|
"loss": 4.5785,
|
|
"mean_token_accuracy": 0.25090626180171965,
|
|
"num_tokens": 75380284.0,
|
|
"step": 32875
|
|
},
|
|
{
|
|
"entropy": 4.977503919601441,
|
|
"epoch": 3.15850144092219,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00040191584005929684,
|
|
"loss": 4.6015,
|
|
"mean_token_accuracy": 0.2545105591416359,
|
|
"num_tokens": 75392543.0,
|
|
"step": 32880
|
|
},
|
|
{
|
|
"entropy": 5.009714555740357,
|
|
"epoch": 3.158981748318924,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004018875297389549,
|
|
"loss": 4.6082,
|
|
"mean_token_accuracy": 0.24547204971313477,
|
|
"num_tokens": 75404883.0,
|
|
"step": 32885
|
|
},
|
|
{
|
|
"entropy": 5.064972591400147,
|
|
"epoch": 3.159462055715658,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00040185921647266126,
|
|
"loss": 4.6607,
|
|
"mean_token_accuracy": 0.24727501720190048,
|
|
"num_tokens": 75416717.0,
|
|
"step": 32890
|
|
},
|
|
{
|
|
"entropy": 5.0171041011810305,
|
|
"epoch": 3.1599423631123917,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00040183090026107326,
|
|
"loss": 4.5748,
|
|
"mean_token_accuracy": 0.2567442923784256,
|
|
"num_tokens": 75428103.0,
|
|
"step": 32895
|
|
},
|
|
{
|
|
"entropy": 4.942748641967773,
|
|
"epoch": 3.160422670509126,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00040180258110484847,
|
|
"loss": 4.5397,
|
|
"mean_token_accuracy": 0.257702699303627,
|
|
"num_tokens": 75438333.0,
|
|
"step": 32900
|
|
},
|
|
{
|
|
"entropy": 5.08033013343811,
|
|
"epoch": 3.16090297790586,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004017742590046442,
|
|
"loss": 4.6372,
|
|
"mean_token_accuracy": 0.24954543262720108,
|
|
"num_tokens": 75450023.0,
|
|
"step": 32905
|
|
},
|
|
{
|
|
"entropy": 5.0422680377960205,
|
|
"epoch": 3.161383285302594,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00040174593396111814,
|
|
"loss": 4.6094,
|
|
"mean_token_accuracy": 0.251060651242733,
|
|
"num_tokens": 75460724.0,
|
|
"step": 32910
|
|
},
|
|
{
|
|
"entropy": 5.0254497051239015,
|
|
"epoch": 3.1618635926993277,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040171760597492785,
|
|
"loss": 4.6303,
|
|
"mean_token_accuracy": 0.2505000278353691,
|
|
"num_tokens": 75471752.0,
|
|
"step": 32915
|
|
},
|
|
{
|
|
"entropy": 4.954725503921509,
|
|
"epoch": 3.1623439000960616,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040168927504673094,
|
|
"loss": 4.5487,
|
|
"mean_token_accuracy": 0.2478194385766983,
|
|
"num_tokens": 75483748.0,
|
|
"step": 32920
|
|
},
|
|
{
|
|
"entropy": 5.0682861328125,
|
|
"epoch": 3.1628242074927955,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004016609411771853,
|
|
"loss": 4.7222,
|
|
"mean_token_accuracy": 0.24132005572319032,
|
|
"num_tokens": 75495497.0,
|
|
"step": 32925
|
|
},
|
|
{
|
|
"entropy": 5.077421712875366,
|
|
"epoch": 3.1633045148895294,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00040163260436694876,
|
|
"loss": 4.6752,
|
|
"mean_token_accuracy": 0.24498880207538604,
|
|
"num_tokens": 75506852.0,
|
|
"step": 32930
|
|
},
|
|
{
|
|
"entropy": 5.026989841461182,
|
|
"epoch": 3.1637848222862632,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004016042646166791,
|
|
"loss": 4.6044,
|
|
"mean_token_accuracy": 0.2555360347032547,
|
|
"num_tokens": 75518633.0,
|
|
"step": 32935
|
|
},
|
|
{
|
|
"entropy": 5.018486785888672,
|
|
"epoch": 3.164265129682997,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004015759219270344,
|
|
"loss": 4.5854,
|
|
"mean_token_accuracy": 0.2519551023840904,
|
|
"num_tokens": 75529668.0,
|
|
"step": 32940
|
|
},
|
|
{
|
|
"entropy": 5.044800519943237,
|
|
"epoch": 3.164745437079731,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004015475762986726,
|
|
"loss": 4.6172,
|
|
"mean_token_accuracy": 0.25533214062452314,
|
|
"num_tokens": 75541603.0,
|
|
"step": 32945
|
|
},
|
|
{
|
|
"entropy": 5.093109893798828,
|
|
"epoch": 3.165225744476465,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040151922773225187,
|
|
"loss": 4.6589,
|
|
"mean_token_accuracy": 0.24944701492786409,
|
|
"num_tokens": 75553489.0,
|
|
"step": 32950
|
|
},
|
|
{
|
|
"entropy": 5.018434381484985,
|
|
"epoch": 3.1657060518731988,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004014908762284303,
|
|
"loss": 4.6143,
|
|
"mean_token_accuracy": 0.24663615971803665,
|
|
"num_tokens": 75564809.0,
|
|
"step": 32955
|
|
},
|
|
{
|
|
"entropy": 5.057509565353394,
|
|
"epoch": 3.1661863592699326,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040146252178786633,
|
|
"loss": 4.6544,
|
|
"mean_token_accuracy": 0.24733265042304992,
|
|
"num_tokens": 75575886.0,
|
|
"step": 32960
|
|
},
|
|
{
|
|
"entropy": 5.050777339935303,
|
|
"epoch": 3.1666666666666665,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000401434164411218,
|
|
"loss": 4.6195,
|
|
"mean_token_accuracy": 0.24244523793458939,
|
|
"num_tokens": 75586518.0,
|
|
"step": 32965
|
|
},
|
|
{
|
|
"entropy": 4.947002935409546,
|
|
"epoch": 3.1671469740634004,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00040140580409914385,
|
|
"loss": 4.4814,
|
|
"mean_token_accuracy": 0.2595750898122787,
|
|
"num_tokens": 75598819.0,
|
|
"step": 32970
|
|
},
|
|
{
|
|
"entropy": 4.991409206390381,
|
|
"epoch": 3.1676272814601343,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040137744085230227,
|
|
"loss": 4.6828,
|
|
"mean_token_accuracy": 0.24561095386743545,
|
|
"num_tokens": 75610319.0,
|
|
"step": 32975
|
|
},
|
|
{
|
|
"entropy": 5.220904731750489,
|
|
"epoch": 3.168107588856868,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004013490746713518,
|
|
"loss": 4.7541,
|
|
"mean_token_accuracy": 0.2379745751619339,
|
|
"num_tokens": 75622479.0,
|
|
"step": 32980
|
|
},
|
|
{
|
|
"entropy": 5.052739048004151,
|
|
"epoch": 3.1685878962536025,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00040132070555695096,
|
|
"loss": 4.6099,
|
|
"mean_token_accuracy": 0.24859773218631745,
|
|
"num_tokens": 75634114.0,
|
|
"step": 32985
|
|
},
|
|
{
|
|
"entropy": 5.0500153541564945,
|
|
"epoch": 3.1690682036503364,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00040129233350975847,
|
|
"loss": 4.7329,
|
|
"mean_token_accuracy": 0.2398463323712349,
|
|
"num_tokens": 75645828.0,
|
|
"step": 32990
|
|
},
|
|
{
|
|
"entropy": 5.026799821853638,
|
|
"epoch": 3.1695485110470702,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040126395853043293,
|
|
"loss": 4.6365,
|
|
"mean_token_accuracy": 0.24603895097970963,
|
|
"num_tokens": 75657335.0,
|
|
"step": 32995
|
|
},
|
|
{
|
|
"entropy": 5.0823150157928465,
|
|
"epoch": 3.170028818443804,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004012355806196332,
|
|
"loss": 4.5871,
|
|
"mean_token_accuracy": 0.2491716131567955,
|
|
"num_tokens": 75668886.0,
|
|
"step": 33000
|
|
},
|
|
{
|
|
"epoch": 3.170028818443804,
|
|
"eval_entropy": 4.875704409261264,
|
|
"eval_loss": 4.792845249176025,
|
|
"eval_mean_token_accuracy": 0.24698377140447578,
|
|
"eval_num_tokens": 75668886.0,
|
|
"eval_runtime": 26.5561,
|
|
"eval_samples_per_second": 1235.688,
|
|
"eval_steps_per_second": 154.466,
|
|
"step": 33000
|
|
},
|
|
{
|
|
"entropy": 5.103356218338012,
|
|
"epoch": 3.170509125840538,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00040120719977801823,
|
|
"loss": 4.7241,
|
|
"mean_token_accuracy": 0.23807538598775863,
|
|
"num_tokens": 75679392.0,
|
|
"step": 33005
|
|
},
|
|
{
|
|
"entropy": 5.119944715499878,
|
|
"epoch": 3.170989433237272,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040117881600624676,
|
|
"loss": 4.7102,
|
|
"mean_token_accuracy": 0.24104578644037247,
|
|
"num_tokens": 75692028.0,
|
|
"step": 33010
|
|
},
|
|
{
|
|
"entropy": 5.0259864807128904,
|
|
"epoch": 3.1714697406340058,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00040115042930497787,
|
|
"loss": 4.6282,
|
|
"mean_token_accuracy": 0.24223802238702774,
|
|
"num_tokens": 75702879.0,
|
|
"step": 33015
|
|
},
|
|
{
|
|
"entropy": 5.081137800216675,
|
|
"epoch": 3.1719500480307397,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040112203967487066,
|
|
"loss": 4.5954,
|
|
"mean_token_accuracy": 0.24994781166315078,
|
|
"num_tokens": 75713564.0,
|
|
"step": 33020
|
|
},
|
|
{
|
|
"entropy": 4.96788535118103,
|
|
"epoch": 3.1724303554274735,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00040109364711658416,
|
|
"loss": 4.5737,
|
|
"mean_token_accuracy": 0.2551387965679169,
|
|
"num_tokens": 75724577.0,
|
|
"step": 33025
|
|
},
|
|
{
|
|
"entropy": 5.022297191619873,
|
|
"epoch": 3.1729106628242074,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040106525163077756,
|
|
"loss": 4.6216,
|
|
"mean_token_accuracy": 0.2573698371648788,
|
|
"num_tokens": 75736349.0,
|
|
"step": 33030
|
|
},
|
|
{
|
|
"entropy": 5.048596477508545,
|
|
"epoch": 3.1733909702209413,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004010368532181102,
|
|
"loss": 4.647,
|
|
"mean_token_accuracy": 0.24219596534967422,
|
|
"num_tokens": 75747465.0,
|
|
"step": 33035
|
|
},
|
|
{
|
|
"entropy": 5.19978289604187,
|
|
"epoch": 3.173871277617675,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004010084518792413,
|
|
"loss": 4.7431,
|
|
"mean_token_accuracy": 0.24264875501394273,
|
|
"num_tokens": 75759014.0,
|
|
"step": 33040
|
|
},
|
|
{
|
|
"entropy": 4.992469692230225,
|
|
"epoch": 3.174351585014409,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040098004761483037,
|
|
"loss": 4.5791,
|
|
"mean_token_accuracy": 0.2490800842642784,
|
|
"num_tokens": 75770354.0,
|
|
"step": 33045
|
|
},
|
|
{
|
|
"entropy": 5.090083980560303,
|
|
"epoch": 3.174831892411143,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004009516404255368,
|
|
"loss": 4.7128,
|
|
"mean_token_accuracy": 0.2398114785552025,
|
|
"num_tokens": 75781530.0,
|
|
"step": 33050
|
|
},
|
|
{
|
|
"entropy": 5.0209059715271,
|
|
"epoch": 3.175312199807877,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004009232303120202,
|
|
"loss": 4.6373,
|
|
"mean_token_accuracy": 0.24811351597309111,
|
|
"num_tokens": 75792751.0,
|
|
"step": 33055
|
|
},
|
|
{
|
|
"entropy": 5.025114583969116,
|
|
"epoch": 3.175792507204611,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00040089481727494,
|
|
"loss": 4.5463,
|
|
"mean_token_accuracy": 0.25323901772499086,
|
|
"num_tokens": 75804882.0,
|
|
"step": 33060
|
|
},
|
|
{
|
|
"entropy": 4.963725328445435,
|
|
"epoch": 3.176272814601345,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000400866401314956,
|
|
"loss": 4.5662,
|
|
"mean_token_accuracy": 0.2505721032619476,
|
|
"num_tokens": 75816207.0,
|
|
"step": 33065
|
|
},
|
|
{
|
|
"entropy": 5.0686869621276855,
|
|
"epoch": 3.176753121998079,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040083798243272797,
|
|
"loss": 4.7548,
|
|
"mean_token_accuracy": 0.24290491938591002,
|
|
"num_tokens": 75827957.0,
|
|
"step": 33070
|
|
},
|
|
{
|
|
"entropy": 5.128195428848267,
|
|
"epoch": 3.177233429394813,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00040080956062891554,
|
|
"loss": 4.6695,
|
|
"mean_token_accuracy": 0.2454632982611656,
|
|
"num_tokens": 75839227.0,
|
|
"step": 33075
|
|
},
|
|
{
|
|
"entropy": 5.098654842376709,
|
|
"epoch": 3.1777137367915467,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00040078113590417887,
|
|
"loss": 4.6647,
|
|
"mean_token_accuracy": 0.24950521141290666,
|
|
"num_tokens": 75849831.0,
|
|
"step": 33080
|
|
},
|
|
{
|
|
"entropy": 4.982477331161499,
|
|
"epoch": 3.1781940441882806,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040075270825917753,
|
|
"loss": 4.6105,
|
|
"mean_token_accuracy": 0.25387084633111956,
|
|
"num_tokens": 75860617.0,
|
|
"step": 33085
|
|
},
|
|
{
|
|
"entropy": 5.068709182739258,
|
|
"epoch": 3.1786743515850144,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004007242776945718,
|
|
"loss": 4.6763,
|
|
"mean_token_accuracy": 0.24255860596895218,
|
|
"num_tokens": 75872193.0,
|
|
"step": 33090
|
|
},
|
|
{
|
|
"entropy": 5.076613521575927,
|
|
"epoch": 3.1791546589817483,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00040069584421102174,
|
|
"loss": 4.6655,
|
|
"mean_token_accuracy": 0.2528792917728424,
|
|
"num_tokens": 75885536.0,
|
|
"step": 33095
|
|
},
|
|
{
|
|
"entropy": 5.132878303527832,
|
|
"epoch": 3.179634966378482,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00040066740780918725,
|
|
"loss": 4.6938,
|
|
"mean_token_accuracy": 0.242730313539505,
|
|
"num_tokens": 75897325.0,
|
|
"step": 33100
|
|
},
|
|
{
|
|
"entropy": 5.044506978988648,
|
|
"epoch": 3.180115273775216,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004006389684897288,
|
|
"loss": 4.6858,
|
|
"mean_token_accuracy": 0.24642085283994675,
|
|
"num_tokens": 75908353.0,
|
|
"step": 33105
|
|
},
|
|
{
|
|
"entropy": 5.045048189163208,
|
|
"epoch": 3.18059558117195,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004006105262533066,
|
|
"loss": 4.6808,
|
|
"mean_token_accuracy": 0.24400411993265153,
|
|
"num_tokens": 75919862.0,
|
|
"step": 33110
|
|
},
|
|
{
|
|
"entropy": 5.002927684783936,
|
|
"epoch": 3.181075888568684,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004005820811005809,
|
|
"loss": 4.5726,
|
|
"mean_token_accuracy": 0.25889708399772643,
|
|
"num_tokens": 75930971.0,
|
|
"step": 33115
|
|
},
|
|
{
|
|
"entropy": 5.06383581161499,
|
|
"epoch": 3.1815561959654177,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00040055363303221226,
|
|
"loss": 4.5474,
|
|
"mean_token_accuracy": 0.2644984617829323,
|
|
"num_tokens": 75941124.0,
|
|
"step": 33120
|
|
},
|
|
{
|
|
"entropy": 5.06303448677063,
|
|
"epoch": 3.1820365033621516,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000400525182048861,
|
|
"loss": 4.655,
|
|
"mean_token_accuracy": 0.24702179729938506,
|
|
"num_tokens": 75953040.0,
|
|
"step": 33125
|
|
},
|
|
{
|
|
"entropy": 5.041329669952392,
|
|
"epoch": 3.1825168107588855,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040049672815118775,
|
|
"loss": 4.6944,
|
|
"mean_token_accuracy": 0.23162348121404647,
|
|
"num_tokens": 75965122.0,
|
|
"step": 33130
|
|
},
|
|
{
|
|
"entropy": 5.072277927398682,
|
|
"epoch": 3.18299711815562,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040046827133985316,
|
|
"loss": 4.5845,
|
|
"mean_token_accuracy": 0.24891586154699324,
|
|
"num_tokens": 75975360.0,
|
|
"step": 33135
|
|
},
|
|
{
|
|
"entropy": 5.0704793453216555,
|
|
"epoch": 3.1834774255523537,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00040043981161551784,
|
|
"loss": 4.5532,
|
|
"mean_token_accuracy": 0.25152069330215454,
|
|
"num_tokens": 75985592.0,
|
|
"step": 33140
|
|
},
|
|
{
|
|
"entropy": 4.945167827606201,
|
|
"epoch": 3.1839577329490876,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004004113489788426,
|
|
"loss": 4.6335,
|
|
"mean_token_accuracy": 0.24698051065206528,
|
|
"num_tokens": 75997429.0,
|
|
"step": 33145
|
|
},
|
|
{
|
|
"entropy": 5.114994382858276,
|
|
"epoch": 3.1844380403458215,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00040038288343048823,
|
|
"loss": 4.7781,
|
|
"mean_token_accuracy": 0.2350222647190094,
|
|
"num_tokens": 76008785.0,
|
|
"step": 33150
|
|
},
|
|
{
|
|
"entropy": 5.170015287399292,
|
|
"epoch": 3.1849183477425553,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00040035441497111564,
|
|
"loss": 4.7362,
|
|
"mean_token_accuracy": 0.24190903902053834,
|
|
"num_tokens": 76020773.0,
|
|
"step": 33155
|
|
},
|
|
{
|
|
"entropy": 5.065093851089477,
|
|
"epoch": 3.185398655139289,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00040032594360138576,
|
|
"loss": 4.6355,
|
|
"mean_token_accuracy": 0.25197552144527435,
|
|
"num_tokens": 76033300.0,
|
|
"step": 33160
|
|
},
|
|
{
|
|
"entropy": 4.973330116271972,
|
|
"epoch": 3.185878962536023,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004002974693219595,
|
|
"loss": 4.5369,
|
|
"mean_token_accuracy": 0.24910655617713928,
|
|
"num_tokens": 76043443.0,
|
|
"step": 33165
|
|
},
|
|
{
|
|
"entropy": 5.027342748641968,
|
|
"epoch": 3.186359269932757,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00040026899213349814,
|
|
"loss": 4.5947,
|
|
"mean_token_accuracy": 0.25178042650222776,
|
|
"num_tokens": 76055417.0,
|
|
"step": 33170
|
|
},
|
|
{
|
|
"entropy": 5.066886329650879,
|
|
"epoch": 3.186839577329491,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004002405120366628,
|
|
"loss": 4.6239,
|
|
"mean_token_accuracy": 0.24546905755996704,
|
|
"num_tokens": 76067259.0,
|
|
"step": 33175
|
|
},
|
|
{
|
|
"entropy": 5.068389558792115,
|
|
"epoch": 3.1873198847262247,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00040021202903211454,
|
|
"loss": 4.6222,
|
|
"mean_token_accuracy": 0.24282134771347047,
|
|
"num_tokens": 76079827.0,
|
|
"step": 33180
|
|
},
|
|
{
|
|
"entropy": 5.074465370178222,
|
|
"epoch": 3.1878001921229586,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004001835431205149,
|
|
"loss": 4.6209,
|
|
"mean_token_accuracy": 0.2443382978439331,
|
|
"num_tokens": 76092068.0,
|
|
"step": 33185
|
|
},
|
|
{
|
|
"entropy": 5.103303289413452,
|
|
"epoch": 3.1882804995196925,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00040015505430252506,
|
|
"loss": 4.6673,
|
|
"mean_token_accuracy": 0.24165288209915162,
|
|
"num_tokens": 76102046.0,
|
|
"step": 33190
|
|
},
|
|
{
|
|
"entropy": 5.015643739700318,
|
|
"epoch": 3.1887608069164264,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00040012656257880645,
|
|
"loss": 4.5915,
|
|
"mean_token_accuracy": 0.2513815090060234,
|
|
"num_tokens": 76113629.0,
|
|
"step": 33195
|
|
},
|
|
{
|
|
"entropy": 4.982402324676514,
|
|
"epoch": 3.1892411143131603,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00040009806795002076,
|
|
"loss": 4.5417,
|
|
"mean_token_accuracy": 0.25977119952440264,
|
|
"num_tokens": 76124593.0,
|
|
"step": 33200
|
|
},
|
|
{
|
|
"entropy": 5.165917634963989,
|
|
"epoch": 3.189721421709894,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004000695704168292,
|
|
"loss": 4.8196,
|
|
"mean_token_accuracy": 0.23088828921318055,
|
|
"num_tokens": 76135177.0,
|
|
"step": 33205
|
|
},
|
|
{
|
|
"entropy": 5.04478006362915,
|
|
"epoch": 3.1902017291066285,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004000410699798937,
|
|
"loss": 4.5932,
|
|
"mean_token_accuracy": 0.24386686980724334,
|
|
"num_tokens": 76148998.0,
|
|
"step": 33210
|
|
},
|
|
{
|
|
"entropy": 4.967963171005249,
|
|
"epoch": 3.1906820365033624,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00040001256663987585,
|
|
"loss": 4.625,
|
|
"mean_token_accuracy": 0.2557815477252007,
|
|
"num_tokens": 76161224.0,
|
|
"step": 33215
|
|
},
|
|
{
|
|
"entropy": 4.964840984344482,
|
|
"epoch": 3.1911623439000962,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039998406039743736,
|
|
"loss": 4.6413,
|
|
"mean_token_accuracy": 0.24818596839904786,
|
|
"num_tokens": 76172577.0,
|
|
"step": 33220
|
|
},
|
|
{
|
|
"entropy": 5.035871458053589,
|
|
"epoch": 3.19164265129683,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003999555512532401,
|
|
"loss": 4.5843,
|
|
"mean_token_accuracy": 0.24981313645839692,
|
|
"num_tokens": 76184135.0,
|
|
"step": 33225
|
|
},
|
|
{
|
|
"entropy": 5.109456205368042,
|
|
"epoch": 3.192122958693564,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003999270392079461,
|
|
"loss": 4.6615,
|
|
"mean_token_accuracy": 0.24911601543426515,
|
|
"num_tokens": 76195831.0,
|
|
"step": 33230
|
|
},
|
|
{
|
|
"entropy": 5.100518894195557,
|
|
"epoch": 3.192603266090298,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003998985242622171,
|
|
"loss": 4.682,
|
|
"mean_token_accuracy": 0.2433509945869446,
|
|
"num_tokens": 76208914.0,
|
|
"step": 33235
|
|
},
|
|
{
|
|
"entropy": 5.048551940917969,
|
|
"epoch": 3.1930835734870318,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003998700064167153,
|
|
"loss": 4.6938,
|
|
"mean_token_accuracy": 0.238228178024292,
|
|
"num_tokens": 76220599.0,
|
|
"step": 33240
|
|
},
|
|
{
|
|
"entropy": 5.043987560272217,
|
|
"epoch": 3.1935638808837656,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003998414856721027,
|
|
"loss": 4.6417,
|
|
"mean_token_accuracy": 0.2440706819295883,
|
|
"num_tokens": 76232897.0,
|
|
"step": 33245
|
|
},
|
|
{
|
|
"entropy": 5.098506736755371,
|
|
"epoch": 3.1940441882804995,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003998129620290415,
|
|
"loss": 4.675,
|
|
"mean_token_accuracy": 0.24875639379024506,
|
|
"num_tokens": 76245948.0,
|
|
"step": 33250
|
|
},
|
|
{
|
|
"entropy": 5.020074701309204,
|
|
"epoch": 3.1945244956772334,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00039978443548819393,
|
|
"loss": 4.6321,
|
|
"mean_token_accuracy": 0.2490899845957756,
|
|
"num_tokens": 76257223.0,
|
|
"step": 33255
|
|
},
|
|
{
|
|
"entropy": 4.976610994338989,
|
|
"epoch": 3.1950048030739673,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003997559060502224,
|
|
"loss": 4.5476,
|
|
"mean_token_accuracy": 0.25438212752342226,
|
|
"num_tokens": 76267606.0,
|
|
"step": 33260
|
|
},
|
|
{
|
|
"entropy": 4.997032260894775,
|
|
"epoch": 3.195485110470701,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003997273737157891,
|
|
"loss": 4.626,
|
|
"mean_token_accuracy": 0.2469200849533081,
|
|
"num_tokens": 76280430.0,
|
|
"step": 33265
|
|
},
|
|
{
|
|
"entropy": 5.1250245571136475,
|
|
"epoch": 3.195965417867435,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00039969883848555647,
|
|
"loss": 4.6756,
|
|
"mean_token_accuracy": 0.24692625999450685,
|
|
"num_tokens": 76293089.0,
|
|
"step": 33270
|
|
},
|
|
{
|
|
"entropy": 5.040512371063232,
|
|
"epoch": 3.196445725264169,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003996703003601872,
|
|
"loss": 4.597,
|
|
"mean_token_accuracy": 0.2488690882921219,
|
|
"num_tokens": 76304936.0,
|
|
"step": 33275
|
|
},
|
|
{
|
|
"entropy": 4.960997009277344,
|
|
"epoch": 3.196926032660903,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00039964175934034375,
|
|
"loss": 4.4535,
|
|
"mean_token_accuracy": 0.2637205198407173,
|
|
"num_tokens": 76316195.0,
|
|
"step": 33280
|
|
},
|
|
{
|
|
"entropy": 4.97886323928833,
|
|
"epoch": 3.1974063400576367,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003996132154266887,
|
|
"loss": 4.5768,
|
|
"mean_token_accuracy": 0.24694945216178893,
|
|
"num_tokens": 76328128.0,
|
|
"step": 33285
|
|
},
|
|
{
|
|
"entropy": 4.969552946090698,
|
|
"epoch": 3.1978866474543706,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003995846686198849,
|
|
"loss": 4.5687,
|
|
"mean_token_accuracy": 0.2539353668689728,
|
|
"num_tokens": 76338596.0,
|
|
"step": 33290
|
|
},
|
|
{
|
|
"entropy": 5.039647817611694,
|
|
"epoch": 3.198366954851105,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003995561189205949,
|
|
"loss": 4.6371,
|
|
"mean_token_accuracy": 0.24575907737016678,
|
|
"num_tokens": 76350479.0,
|
|
"step": 33295
|
|
},
|
|
{
|
|
"entropy": 5.01677680015564,
|
|
"epoch": 3.1988472622478388,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003995275663294818,
|
|
"loss": 4.6604,
|
|
"mean_token_accuracy": 0.24575017541646957,
|
|
"num_tokens": 76363433.0,
|
|
"step": 33300
|
|
},
|
|
{
|
|
"entropy": 4.960451126098633,
|
|
"epoch": 3.1993275696445727,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003994990108472084,
|
|
"loss": 4.5468,
|
|
"mean_token_accuracy": 0.25467743426561357,
|
|
"num_tokens": 76374000.0,
|
|
"step": 33305
|
|
},
|
|
{
|
|
"entropy": 4.960294628143311,
|
|
"epoch": 3.1998078770413065,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039947045247443755,
|
|
"loss": 4.554,
|
|
"mean_token_accuracy": 0.24896431416273118,
|
|
"num_tokens": 76384192.0,
|
|
"step": 33310
|
|
},
|
|
{
|
|
"entropy": 5.095963668823242,
|
|
"epoch": 3.2002881844380404,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039944189121183247,
|
|
"loss": 4.7225,
|
|
"mean_token_accuracy": 0.23484220057725907,
|
|
"num_tokens": 76396641.0,
|
|
"step": 33315
|
|
},
|
|
{
|
|
"entropy": 4.964787197113037,
|
|
"epoch": 3.2007684918347743,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00039941332706005617,
|
|
"loss": 4.5098,
|
|
"mean_token_accuracy": 0.25350615531206133,
|
|
"num_tokens": 76407954.0,
|
|
"step": 33320
|
|
},
|
|
{
|
|
"entropy": 4.972380542755127,
|
|
"epoch": 3.201248799231508,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00039938476001977175,
|
|
"loss": 4.6157,
|
|
"mean_token_accuracy": 0.25204425007104875,
|
|
"num_tokens": 76419024.0,
|
|
"step": 33325
|
|
},
|
|
{
|
|
"entropy": 5.087919998168945,
|
|
"epoch": 3.201729106628242,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00039935619009164264,
|
|
"loss": 4.6424,
|
|
"mean_token_accuracy": 0.23842392563819886,
|
|
"num_tokens": 76429786.0,
|
|
"step": 33330
|
|
},
|
|
{
|
|
"entropy": 4.965538406372071,
|
|
"epoch": 3.202209414024976,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003993276172763321,
|
|
"loss": 4.5046,
|
|
"mean_token_accuracy": 0.2566877916455269,
|
|
"num_tokens": 76440439.0,
|
|
"step": 33335
|
|
},
|
|
{
|
|
"entropy": 5.017267942428589,
|
|
"epoch": 3.20268972142171,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00039929904157450343,
|
|
"loss": 4.6522,
|
|
"mean_token_accuracy": 0.2454281345009804,
|
|
"num_tokens": 76451855.0,
|
|
"step": 33340
|
|
},
|
|
{
|
|
"entropy": 4.982196807861328,
|
|
"epoch": 3.2031700288184437,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00039927046298682007,
|
|
"loss": 4.5607,
|
|
"mean_token_accuracy": 0.2517274335026741,
|
|
"num_tokens": 76463662.0,
|
|
"step": 33345
|
|
},
|
|
{
|
|
"entropy": 5.027151155471802,
|
|
"epoch": 3.2036503362151776,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003992418815139456,
|
|
"loss": 4.588,
|
|
"mean_token_accuracy": 0.24779824465513228,
|
|
"num_tokens": 76476273.0,
|
|
"step": 33350
|
|
},
|
|
{
|
|
"entropy": 5.155083417892456,
|
|
"epoch": 3.2041306436119115,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00039921329715654355,
|
|
"loss": 4.6733,
|
|
"mean_token_accuracy": 0.23885925114154816,
|
|
"num_tokens": 76488958.0,
|
|
"step": 33355
|
|
},
|
|
{
|
|
"entropy": 4.977566480636597,
|
|
"epoch": 3.2046109510086453,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003991847099152775,
|
|
"loss": 4.5679,
|
|
"mean_token_accuracy": 0.25699655115604403,
|
|
"num_tokens": 76499606.0,
|
|
"step": 33360
|
|
},
|
|
{
|
|
"entropy": 5.127652788162232,
|
|
"epoch": 3.2050912584053792,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003991561197908114,
|
|
"loss": 4.7337,
|
|
"mean_token_accuracy": 0.2374101310968399,
|
|
"num_tokens": 76510953.0,
|
|
"step": 33365
|
|
},
|
|
{
|
|
"entropy": 5.055814170837403,
|
|
"epoch": 3.2055715658021136,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003991275267838088,
|
|
"loss": 4.5916,
|
|
"mean_token_accuracy": 0.24716406613588332,
|
|
"num_tokens": 76522420.0,
|
|
"step": 33370
|
|
},
|
|
{
|
|
"entropy": 5.016264963150024,
|
|
"epoch": 3.2060518731988474,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039909893089493353,
|
|
"loss": 4.6248,
|
|
"mean_token_accuracy": 0.2561017364263535,
|
|
"num_tokens": 76533748.0,
|
|
"step": 33375
|
|
},
|
|
{
|
|
"entropy": 5.040944290161133,
|
|
"epoch": 3.2065321805955813,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00039907033212484966,
|
|
"loss": 4.5872,
|
|
"mean_token_accuracy": 0.248762047290802,
|
|
"num_tokens": 76544831.0,
|
|
"step": 33380
|
|
},
|
|
{
|
|
"entropy": 5.0381245613098145,
|
|
"epoch": 3.207012487992315,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003990417304742211,
|
|
"loss": 4.6528,
|
|
"mean_token_accuracy": 0.2455194041132927,
|
|
"num_tokens": 76558048.0,
|
|
"step": 33385
|
|
},
|
|
{
|
|
"entropy": 5.03350043296814,
|
|
"epoch": 3.207492795389049,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003990131259437119,
|
|
"loss": 4.6788,
|
|
"mean_token_accuracy": 0.2491478905081749,
|
|
"num_tokens": 76568461.0,
|
|
"step": 33390
|
|
},
|
|
{
|
|
"entropy": 5.018414163589478,
|
|
"epoch": 3.207973102785783,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003989845185339861,
|
|
"loss": 4.6172,
|
|
"mean_token_accuracy": 0.25062026232481005,
|
|
"num_tokens": 76580191.0,
|
|
"step": 33395
|
|
},
|
|
{
|
|
"entropy": 5.06202917098999,
|
|
"epoch": 3.208453410182517,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003989559082457079,
|
|
"loss": 4.5806,
|
|
"mean_token_accuracy": 0.24616572111845017,
|
|
"num_tokens": 76591846.0,
|
|
"step": 33400
|
|
},
|
|
{
|
|
"entropy": 5.032164669036865,
|
|
"epoch": 3.2089337175792507,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039892729507954173,
|
|
"loss": 4.5598,
|
|
"mean_token_accuracy": 0.25316447019577026,
|
|
"num_tokens": 76602301.0,
|
|
"step": 33405
|
|
},
|
|
{
|
|
"entropy": 4.994559240341187,
|
|
"epoch": 3.2094140249759846,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039889867903615165,
|
|
"loss": 4.6123,
|
|
"mean_token_accuracy": 0.2511447861790657,
|
|
"num_tokens": 76613299.0,
|
|
"step": 33410
|
|
},
|
|
{
|
|
"entropy": 5.055741643905639,
|
|
"epoch": 3.2098943323727185,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039887006011620217,
|
|
"loss": 4.6565,
|
|
"mean_token_accuracy": 0.24609391540288925,
|
|
"num_tokens": 76625022.0,
|
|
"step": 33415
|
|
},
|
|
{
|
|
"entropy": 5.023919916152954,
|
|
"epoch": 3.2103746397694524,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039884143832035775,
|
|
"loss": 4.5953,
|
|
"mean_token_accuracy": 0.2515264004468918,
|
|
"num_tokens": 76636393.0,
|
|
"step": 33420
|
|
},
|
|
{
|
|
"entropy": 5.065608930587769,
|
|
"epoch": 3.2108549471661862,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003988128136492828,
|
|
"loss": 4.6301,
|
|
"mean_token_accuracy": 0.24483353346586229,
|
|
"num_tokens": 76648953.0,
|
|
"step": 33425
|
|
},
|
|
{
|
|
"entropy": 5.008993434906006,
|
|
"epoch": 3.21133525456292,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000398784186103642,
|
|
"loss": 4.6321,
|
|
"mean_token_accuracy": 0.24404721707105637,
|
|
"num_tokens": 76660843.0,
|
|
"step": 33430
|
|
},
|
|
{
|
|
"entropy": 5.019329833984375,
|
|
"epoch": 3.211815561959654,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039875555568409996,
|
|
"loss": 4.6255,
|
|
"mean_token_accuracy": 0.25317404270172117,
|
|
"num_tokens": 76671745.0,
|
|
"step": 33435
|
|
},
|
|
{
|
|
"entropy": 5.012636709213257,
|
|
"epoch": 3.212295869356388,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003987269223913214,
|
|
"loss": 4.5873,
|
|
"mean_token_accuracy": 0.24610354006290436,
|
|
"num_tokens": 76684386.0,
|
|
"step": 33440
|
|
},
|
|
{
|
|
"entropy": 5.018772459030151,
|
|
"epoch": 3.212776176753122,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039869828622597105,
|
|
"loss": 4.5626,
|
|
"mean_token_accuracy": 0.25148770660161973,
|
|
"num_tokens": 76694670.0,
|
|
"step": 33445
|
|
},
|
|
{
|
|
"entropy": 4.9570159912109375,
|
|
"epoch": 3.213256484149856,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039866964718871385,
|
|
"loss": 4.6225,
|
|
"mean_token_accuracy": 0.2571745112538338,
|
|
"num_tokens": 76705567.0,
|
|
"step": 33450
|
|
},
|
|
{
|
|
"entropy": 4.97451639175415,
|
|
"epoch": 3.21373679154659,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003986410052802146,
|
|
"loss": 4.5817,
|
|
"mean_token_accuracy": 0.24855275601148605,
|
|
"num_tokens": 76717211.0,
|
|
"step": 33455
|
|
},
|
|
{
|
|
"entropy": 5.097514343261719,
|
|
"epoch": 3.214217098943324,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039861236050113845,
|
|
"loss": 4.6861,
|
|
"mean_token_accuracy": 0.24082629680633544,
|
|
"num_tokens": 76728414.0,
|
|
"step": 33460
|
|
},
|
|
{
|
|
"entropy": 4.929339838027954,
|
|
"epoch": 3.2146974063400577,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003985837128521503,
|
|
"loss": 4.522,
|
|
"mean_token_accuracy": 0.25955982208251954,
|
|
"num_tokens": 76739790.0,
|
|
"step": 33465
|
|
},
|
|
{
|
|
"entropy": 5.041322517395019,
|
|
"epoch": 3.2151777137367916,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003985550623339153,
|
|
"loss": 4.6336,
|
|
"mean_token_accuracy": 0.2502042159438133,
|
|
"num_tokens": 76751558.0,
|
|
"step": 33470
|
|
},
|
|
{
|
|
"entropy": 5.114486455917358,
|
|
"epoch": 3.2156580211335255,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003985264089470987,
|
|
"loss": 4.7915,
|
|
"mean_token_accuracy": 0.2358861654996872,
|
|
"num_tokens": 76763004.0,
|
|
"step": 33475
|
|
},
|
|
{
|
|
"entropy": 5.061826992034912,
|
|
"epoch": 3.2161383285302594,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00039849775269236556,
|
|
"loss": 4.6412,
|
|
"mean_token_accuracy": 0.2504911407828331,
|
|
"num_tokens": 76773625.0,
|
|
"step": 33480
|
|
},
|
|
{
|
|
"entropy": 5.07300214767456,
|
|
"epoch": 3.2166186359269933,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00039846909357038135,
|
|
"loss": 4.6786,
|
|
"mean_token_accuracy": 0.2430872544646263,
|
|
"num_tokens": 76786351.0,
|
|
"step": 33485
|
|
},
|
|
{
|
|
"entropy": 5.053680086135865,
|
|
"epoch": 3.217098943323727,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003984404315818115,
|
|
"loss": 4.5789,
|
|
"mean_token_accuracy": 0.24969908744096755,
|
|
"num_tokens": 76796997.0,
|
|
"step": 33490
|
|
},
|
|
{
|
|
"entropy": 5.041178035736084,
|
|
"epoch": 3.217579250720461,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00039841176672732127,
|
|
"loss": 4.6591,
|
|
"mean_token_accuracy": 0.24388981014490127,
|
|
"num_tokens": 76809690.0,
|
|
"step": 33495
|
|
},
|
|
{
|
|
"entropy": 5.029389333724976,
|
|
"epoch": 3.218059558117195,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003983830990075763,
|
|
"loss": 4.5578,
|
|
"mean_token_accuracy": 0.25537789016962054,
|
|
"num_tokens": 76820387.0,
|
|
"step": 33500
|
|
},
|
|
{
|
|
"entropy": 5.070287752151489,
|
|
"epoch": 3.218539865513929,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039835442842324216,
|
|
"loss": 4.7261,
|
|
"mean_token_accuracy": 0.24159679412841797,
|
|
"num_tokens": 76833261.0,
|
|
"step": 33505
|
|
},
|
|
{
|
|
"entropy": 4.961561965942383,
|
|
"epoch": 3.2190201729106627,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039832575497498454,
|
|
"loss": 4.5496,
|
|
"mean_token_accuracy": 0.24915057718753814,
|
|
"num_tokens": 76844952.0,
|
|
"step": 33510
|
|
},
|
|
{
|
|
"entropy": 5.020523405075073,
|
|
"epoch": 3.2195004803073966,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00039829707866346895,
|
|
"loss": 4.6026,
|
|
"mean_token_accuracy": 0.2461605966091156,
|
|
"num_tokens": 76856076.0,
|
|
"step": 33515
|
|
},
|
|
{
|
|
"entropy": 5.157642126083374,
|
|
"epoch": 3.219980787704131,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003982683994893614,
|
|
"loss": 4.8054,
|
|
"mean_token_accuracy": 0.23462713956832887,
|
|
"num_tokens": 76867510.0,
|
|
"step": 33520
|
|
},
|
|
{
|
|
"entropy": 5.093398809432983,
|
|
"epoch": 3.2204610951008648,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039823971745332764,
|
|
"loss": 4.7145,
|
|
"mean_token_accuracy": 0.2450822800397873,
|
|
"num_tokens": 76879919.0,
|
|
"step": 33525
|
|
},
|
|
{
|
|
"entropy": 5.00069637298584,
|
|
"epoch": 3.2209414024975986,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003982110325560336,
|
|
"loss": 4.6205,
|
|
"mean_token_accuracy": 0.24558212459087372,
|
|
"num_tokens": 76891402.0,
|
|
"step": 33530
|
|
},
|
|
{
|
|
"entropy": 5.091366624832153,
|
|
"epoch": 3.2214217098943325,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003981823447981453,
|
|
"loss": 4.6441,
|
|
"mean_token_accuracy": 0.24134110063314437,
|
|
"num_tokens": 76902703.0,
|
|
"step": 33535
|
|
},
|
|
{
|
|
"entropy": 5.07065863609314,
|
|
"epoch": 3.2219020172910664,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00039815365418032855,
|
|
"loss": 4.5796,
|
|
"mean_token_accuracy": 0.24963993430137635,
|
|
"num_tokens": 76913661.0,
|
|
"step": 33540
|
|
},
|
|
{
|
|
"entropy": 4.983402442932129,
|
|
"epoch": 3.2223823246878003,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039812496070324983,
|
|
"loss": 4.5664,
|
|
"mean_token_accuracy": 0.25056677460670473,
|
|
"num_tokens": 76924175.0,
|
|
"step": 33545
|
|
},
|
|
{
|
|
"entropy": 5.057305765151978,
|
|
"epoch": 3.222862632084534,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000398096264367575,
|
|
"loss": 4.6603,
|
|
"mean_token_accuracy": 0.24586112797260284,
|
|
"num_tokens": 76935158.0,
|
|
"step": 33550
|
|
},
|
|
{
|
|
"entropy": 4.988756513595581,
|
|
"epoch": 3.223342939481268,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003980675651739705,
|
|
"loss": 4.5192,
|
|
"mean_token_accuracy": 0.2597853600978851,
|
|
"num_tokens": 76944918.0,
|
|
"step": 33555
|
|
},
|
|
{
|
|
"entropy": 4.917987489700318,
|
|
"epoch": 3.223823246878002,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00039803886312310253,
|
|
"loss": 4.5161,
|
|
"mean_token_accuracy": 0.26421377807855606,
|
|
"num_tokens": 76955128.0,
|
|
"step": 33560
|
|
},
|
|
{
|
|
"entropy": 4.992546081542969,
|
|
"epoch": 3.224303554274736,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039801015821563755,
|
|
"loss": 4.672,
|
|
"mean_token_accuracy": 0.24415734857320787,
|
|
"num_tokens": 76966227.0,
|
|
"step": 33565
|
|
},
|
|
{
|
|
"entropy": 5.0746008396148685,
|
|
"epoch": 3.2247838616714697,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003979814504522419,
|
|
"loss": 4.5778,
|
|
"mean_token_accuracy": 0.25063045173883436,
|
|
"num_tokens": 76977059.0,
|
|
"step": 33570
|
|
},
|
|
{
|
|
"entropy": 4.96845383644104,
|
|
"epoch": 3.2252641690682036,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039795273983358223,
|
|
"loss": 4.5614,
|
|
"mean_token_accuracy": 0.25698710083961485,
|
|
"num_tokens": 76987447.0,
|
|
"step": 33575
|
|
},
|
|
{
|
|
"entropy": 5.023295307159424,
|
|
"epoch": 3.2257444764649374,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039792402636032497,
|
|
"loss": 4.6579,
|
|
"mean_token_accuracy": 0.2497084230184555,
|
|
"num_tokens": 76998960.0,
|
|
"step": 33580
|
|
},
|
|
{
|
|
"entropy": 5.049805212020874,
|
|
"epoch": 3.2262247838616713,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00039789531003313696,
|
|
"loss": 4.6694,
|
|
"mean_token_accuracy": 0.24890413135290146,
|
|
"num_tokens": 77011255.0,
|
|
"step": 33585
|
|
},
|
|
{
|
|
"entropy": 5.02531247138977,
|
|
"epoch": 3.226705091258405,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003978665908526846,
|
|
"loss": 4.6114,
|
|
"mean_token_accuracy": 0.2471139207482338,
|
|
"num_tokens": 77023061.0,
|
|
"step": 33590
|
|
},
|
|
{
|
|
"entropy": 5.026203250885009,
|
|
"epoch": 3.227185398655139,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003978378688196349,
|
|
"loss": 4.5959,
|
|
"mean_token_accuracy": 0.2514078453183174,
|
|
"num_tokens": 77033949.0,
|
|
"step": 33595
|
|
},
|
|
{
|
|
"entropy": 4.997019195556641,
|
|
"epoch": 3.227665706051873,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003978091439346546,
|
|
"loss": 4.5736,
|
|
"mean_token_accuracy": 0.2488286927342415,
|
|
"num_tokens": 77044990.0,
|
|
"step": 33600
|
|
},
|
|
{
|
|
"entropy": 5.097410869598389,
|
|
"epoch": 3.2281460134486073,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003977804161984108,
|
|
"loss": 4.6759,
|
|
"mean_token_accuracy": 0.2426385059952736,
|
|
"num_tokens": 77055099.0,
|
|
"step": 33605
|
|
},
|
|
{
|
|
"entropy": 5.047701454162597,
|
|
"epoch": 3.228626320845341,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003977516856115702,
|
|
"loss": 4.6047,
|
|
"mean_token_accuracy": 0.25579265505075455,
|
|
"num_tokens": 77065854.0,
|
|
"step": 33610
|
|
},
|
|
{
|
|
"entropy": 5.0332708835601805,
|
|
"epoch": 3.229106628242075,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00039772295217479993,
|
|
"loss": 4.6451,
|
|
"mean_token_accuracy": 0.25158894062042236,
|
|
"num_tokens": 77077531.0,
|
|
"step": 33615
|
|
},
|
|
{
|
|
"entropy": 5.085025262832642,
|
|
"epoch": 3.229586935638809,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003976942158887671,
|
|
"loss": 4.6101,
|
|
"mean_token_accuracy": 0.2541229441761971,
|
|
"num_tokens": 77088659.0,
|
|
"step": 33620
|
|
},
|
|
{
|
|
"entropy": 5.060108709335327,
|
|
"epoch": 3.230067243035543,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000397665476754139,
|
|
"loss": 4.7039,
|
|
"mean_token_accuracy": 0.23708791583776473,
|
|
"num_tokens": 77099868.0,
|
|
"step": 33625
|
|
},
|
|
{
|
|
"entropy": 5.044315433502197,
|
|
"epoch": 3.2305475504322767,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003976367347715828,
|
|
"loss": 4.6222,
|
|
"mean_token_accuracy": 0.25418720692396163,
|
|
"num_tokens": 77112415.0,
|
|
"step": 33630
|
|
},
|
|
{
|
|
"entropy": 5.066732931137085,
|
|
"epoch": 3.2310278578290106,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003976079899417657,
|
|
"loss": 4.7151,
|
|
"mean_token_accuracy": 0.23702074140310286,
|
|
"num_tokens": 77124978.0,
|
|
"step": 33635
|
|
},
|
|
{
|
|
"entropy": 5.072137689590454,
|
|
"epoch": 3.2315081652257445,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003975792422653552,
|
|
"loss": 4.6697,
|
|
"mean_token_accuracy": 0.24586405158042907,
|
|
"num_tokens": 77135800.0,
|
|
"step": 33640
|
|
},
|
|
{
|
|
"entropy": 5.057637071609497,
|
|
"epoch": 3.2319884726224783,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003975504917430186,
|
|
"loss": 4.6713,
|
|
"mean_token_accuracy": 0.24309516847133636,
|
|
"num_tokens": 77147269.0,
|
|
"step": 33645
|
|
},
|
|
{
|
|
"entropy": 5.039349699020386,
|
|
"epoch": 3.2324687800192122,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003975217383754235,
|
|
"loss": 4.6536,
|
|
"mean_token_accuracy": 0.24544923901557922,
|
|
"num_tokens": 77159522.0,
|
|
"step": 33650
|
|
},
|
|
{
|
|
"entropy": 5.0715264797210695,
|
|
"epoch": 3.232949087415946,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003974929821632375,
|
|
"loss": 4.6453,
|
|
"mean_token_accuracy": 0.24852334558963776,
|
|
"num_tokens": 77170884.0,
|
|
"step": 33655
|
|
},
|
|
{
|
|
"entropy": 4.9567262649536135,
|
|
"epoch": 3.23342939481268,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039746422310712814,
|
|
"loss": 4.4834,
|
|
"mean_token_accuracy": 0.2606669679284096,
|
|
"num_tokens": 77181554.0,
|
|
"step": 33660
|
|
},
|
|
{
|
|
"entropy": 5.01213231086731,
|
|
"epoch": 3.233909702209414,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003974354612077632,
|
|
"loss": 4.6844,
|
|
"mean_token_accuracy": 0.24595702141523362,
|
|
"num_tokens": 77192988.0,
|
|
"step": 33665
|
|
},
|
|
{
|
|
"entropy": 5.052438592910766,
|
|
"epoch": 3.2343900096061478,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003974066964658104,
|
|
"loss": 4.5873,
|
|
"mean_token_accuracy": 0.255949005484581,
|
|
"num_tokens": 77204107.0,
|
|
"step": 33670
|
|
},
|
|
{
|
|
"entropy": 5.1571298122406,
|
|
"epoch": 3.2348703170028816,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00039737792888193754,
|
|
"loss": 4.7468,
|
|
"mean_token_accuracy": 0.23783104568719865,
|
|
"num_tokens": 77217008.0,
|
|
"step": 33675
|
|
},
|
|
{
|
|
"entropy": 5.050239706039429,
|
|
"epoch": 3.235350624399616,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003973491584568126,
|
|
"loss": 4.6694,
|
|
"mean_token_accuracy": 0.24460101872682571,
|
|
"num_tokens": 77228651.0,
|
|
"step": 33680
|
|
},
|
|
{
|
|
"entropy": 4.946198034286499,
|
|
"epoch": 3.23583093179635,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003973203851911035,
|
|
"loss": 4.5344,
|
|
"mean_token_accuracy": 0.2570539727807045,
|
|
"num_tokens": 77240251.0,
|
|
"step": 33685
|
|
},
|
|
{
|
|
"entropy": 5.1295225620269775,
|
|
"epoch": 3.2363112391930837,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003972916090854782,
|
|
"loss": 4.6414,
|
|
"mean_token_accuracy": 0.2535032883286476,
|
|
"num_tokens": 77252160.0,
|
|
"step": 33690
|
|
},
|
|
{
|
|
"entropy": 5.016752290725708,
|
|
"epoch": 3.2367915465898176,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039726283014060497,
|
|
"loss": 4.5714,
|
|
"mean_token_accuracy": 0.2535124719142914,
|
|
"num_tokens": 77263678.0,
|
|
"step": 33695
|
|
},
|
|
{
|
|
"entropy": 5.015379238128662,
|
|
"epoch": 3.2372718539865515,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003972340483571519,
|
|
"loss": 4.6308,
|
|
"mean_token_accuracy": 0.2428615778684616,
|
|
"num_tokens": 77275180.0,
|
|
"step": 33700
|
|
},
|
|
{
|
|
"entropy": 5.073586702346802,
|
|
"epoch": 3.2377521613832854,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039720526373578704,
|
|
"loss": 4.646,
|
|
"mean_token_accuracy": 0.24470321238040924,
|
|
"num_tokens": 77286450.0,
|
|
"step": 33705
|
|
},
|
|
{
|
|
"entropy": 5.053950643539428,
|
|
"epoch": 3.2382324687800192,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00039717647627717894,
|
|
"loss": 4.5862,
|
|
"mean_token_accuracy": 0.2565572842955589,
|
|
"num_tokens": 77297011.0,
|
|
"step": 33710
|
|
},
|
|
{
|
|
"entropy": 4.971718645095825,
|
|
"epoch": 3.238712776176753,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003971476859819958,
|
|
"loss": 4.5017,
|
|
"mean_token_accuracy": 0.26188462525606154,
|
|
"num_tokens": 77307408.0,
|
|
"step": 33715
|
|
},
|
|
{
|
|
"entropy": 5.078515434265137,
|
|
"epoch": 3.239193083573487,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003971188928509062,
|
|
"loss": 4.655,
|
|
"mean_token_accuracy": 0.2519090369343758,
|
|
"num_tokens": 77316762.0,
|
|
"step": 33720
|
|
},
|
|
{
|
|
"entropy": 5.088728237152099,
|
|
"epoch": 3.239673390970221,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003970900968845784,
|
|
"loss": 4.647,
|
|
"mean_token_accuracy": 0.2538820832967758,
|
|
"num_tokens": 77327233.0,
|
|
"step": 33725
|
|
},
|
|
{
|
|
"entropy": 5.0935204982757565,
|
|
"epoch": 3.2401536983669548,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039706129808368115,
|
|
"loss": 4.6849,
|
|
"mean_token_accuracy": 0.24508253037929534,
|
|
"num_tokens": 77338001.0,
|
|
"step": 33730
|
|
},
|
|
{
|
|
"entropy": 5.001562309265137,
|
|
"epoch": 3.2406340057636887,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003970324964488829,
|
|
"loss": 4.6347,
|
|
"mean_token_accuracy": 0.24426246285438538,
|
|
"num_tokens": 77350243.0,
|
|
"step": 33735
|
|
},
|
|
{
|
|
"entropy": 5.025910663604736,
|
|
"epoch": 3.2411143131604225,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00039700369198085255,
|
|
"loss": 4.6393,
|
|
"mean_token_accuracy": 0.2468763843178749,
|
|
"num_tokens": 77360712.0,
|
|
"step": 33740
|
|
},
|
|
{
|
|
"entropy": 5.0112837791442875,
|
|
"epoch": 3.2415946205571564,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00039697488468025876,
|
|
"loss": 4.574,
|
|
"mean_token_accuracy": 0.24540430605411528,
|
|
"num_tokens": 77371399.0,
|
|
"step": 33745
|
|
},
|
|
{
|
|
"entropy": 5.097122049331665,
|
|
"epoch": 3.2420749279538903,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003969460745477703,
|
|
"loss": 4.6828,
|
|
"mean_token_accuracy": 0.24141585379838942,
|
|
"num_tokens": 77381652.0,
|
|
"step": 33750
|
|
},
|
|
{
|
|
"entropy": 5.0294352054595945,
|
|
"epoch": 3.2425552353506246,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00039691726158405606,
|
|
"loss": 4.615,
|
|
"mean_token_accuracy": 0.2474424034357071,
|
|
"num_tokens": 77393335.0,
|
|
"step": 33755
|
|
},
|
|
{
|
|
"entropy": 5.017794370651245,
|
|
"epoch": 3.2430355427473585,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039688844578978516,
|
|
"loss": 4.6233,
|
|
"mean_token_accuracy": 0.2538589760661125,
|
|
"num_tokens": 77404311.0,
|
|
"step": 33760
|
|
},
|
|
{
|
|
"entropy": 5.069286727905274,
|
|
"epoch": 3.2435158501440924,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003968596271656263,
|
|
"loss": 4.6646,
|
|
"mean_token_accuracy": 0.24574896097183227,
|
|
"num_tokens": 77415886.0,
|
|
"step": 33765
|
|
},
|
|
{
|
|
"entropy": 5.06607232093811,
|
|
"epoch": 3.2439961575408263,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039683080571224885,
|
|
"loss": 4.6343,
|
|
"mean_token_accuracy": 0.24921049177646637,
|
|
"num_tokens": 77426300.0,
|
|
"step": 33770
|
|
},
|
|
{
|
|
"entropy": 5.1001753330230715,
|
|
"epoch": 3.24447646493756,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003968019814303219,
|
|
"loss": 4.6652,
|
|
"mean_token_accuracy": 0.24348481893539428,
|
|
"num_tokens": 77437803.0,
|
|
"step": 33775
|
|
},
|
|
{
|
|
"entropy": 5.098292875289917,
|
|
"epoch": 3.244956772334294,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003967731543205145,
|
|
"loss": 4.6655,
|
|
"mean_token_accuracy": 0.250624942779541,
|
|
"num_tokens": 77448850.0,
|
|
"step": 33780
|
|
},
|
|
{
|
|
"entropy": 5.036126708984375,
|
|
"epoch": 3.245437079731028,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00039674432438349607,
|
|
"loss": 4.6083,
|
|
"mean_token_accuracy": 0.2461878776550293,
|
|
"num_tokens": 77458933.0,
|
|
"step": 33785
|
|
},
|
|
{
|
|
"entropy": 5.0860895156860355,
|
|
"epoch": 3.245917387127762,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000396715491619936,
|
|
"loss": 4.6225,
|
|
"mean_token_accuracy": 0.23742762207984924,
|
|
"num_tokens": 77469916.0,
|
|
"step": 33790
|
|
},
|
|
{
|
|
"entropy": 5.081142425537109,
|
|
"epoch": 3.2463976945244957,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003966866560305036,
|
|
"loss": 4.6019,
|
|
"mean_token_accuracy": 0.2540366739034653,
|
|
"num_tokens": 77480548.0,
|
|
"step": 33795
|
|
},
|
|
{
|
|
"entropy": 5.031909418106079,
|
|
"epoch": 3.2468780019212296,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039665781761586837,
|
|
"loss": 4.6331,
|
|
"mean_token_accuracy": 0.23783497661352157,
|
|
"num_tokens": 77490664.0,
|
|
"step": 33800
|
|
},
|
|
{
|
|
"entropy": 5.0877001762390135,
|
|
"epoch": 3.2473583093179634,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003966289763766999,
|
|
"loss": 4.5966,
|
|
"mean_token_accuracy": 0.25352431684732435,
|
|
"num_tokens": 77502010.0,
|
|
"step": 33805
|
|
},
|
|
{
|
|
"entropy": 5.030259513854981,
|
|
"epoch": 3.2478386167146973,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003966001323136678,
|
|
"loss": 4.5991,
|
|
"mean_token_accuracy": 0.25208001136779784,
|
|
"num_tokens": 77513453.0,
|
|
"step": 33810
|
|
},
|
|
{
|
|
"entropy": 5.0315773487091064,
|
|
"epoch": 3.248318924111431,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003965712854274416,
|
|
"loss": 4.7023,
|
|
"mean_token_accuracy": 0.2426896795630455,
|
|
"num_tokens": 77526294.0,
|
|
"step": 33815
|
|
},
|
|
{
|
|
"entropy": 5.095651865005493,
|
|
"epoch": 3.248799231508165,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003965424357186912,
|
|
"loss": 4.6988,
|
|
"mean_token_accuracy": 0.24423594325780867,
|
|
"num_tokens": 77537753.0,
|
|
"step": 33820
|
|
},
|
|
{
|
|
"entropy": 5.003453779220581,
|
|
"epoch": 3.249279538904899,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003965135831880864,
|
|
"loss": 4.5521,
|
|
"mean_token_accuracy": 0.2491741508245468,
|
|
"num_tokens": 77548931.0,
|
|
"step": 33825
|
|
},
|
|
{
|
|
"entropy": 4.9637964248657225,
|
|
"epoch": 3.249759846301633,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000396484727836297,
|
|
"loss": 4.5891,
|
|
"mean_token_accuracy": 0.2554382473230362,
|
|
"num_tokens": 77560259.0,
|
|
"step": 33830
|
|
},
|
|
{
|
|
"entropy": 5.063522148132324,
|
|
"epoch": 3.2502401536983667,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003964558696639928,
|
|
"loss": 4.6414,
|
|
"mean_token_accuracy": 0.2532750189304352,
|
|
"num_tokens": 77571688.0,
|
|
"step": 33835
|
|
},
|
|
{
|
|
"entropy": 5.131698942184448,
|
|
"epoch": 3.250720461095101,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003964270086718441,
|
|
"loss": 4.6939,
|
|
"mean_token_accuracy": 0.2505921542644501,
|
|
"num_tokens": 77583754.0,
|
|
"step": 33840
|
|
},
|
|
{
|
|
"entropy": 5.0321290493011475,
|
|
"epoch": 3.251200768491835,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00039639814486052083,
|
|
"loss": 4.5707,
|
|
"mean_token_accuracy": 0.2609803184866905,
|
|
"num_tokens": 77596013.0,
|
|
"step": 33845
|
|
},
|
|
{
|
|
"entropy": 5.047715425491333,
|
|
"epoch": 3.251681075888569,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003963692782306931,
|
|
"loss": 4.6828,
|
|
"mean_token_accuracy": 0.24805556684732438,
|
|
"num_tokens": 77607349.0,
|
|
"step": 33850
|
|
},
|
|
{
|
|
"entropy": 5.053295421600342,
|
|
"epoch": 3.2521613832853027,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003963404087830311,
|
|
"loss": 4.6281,
|
|
"mean_token_accuracy": 0.2486888661980629,
|
|
"num_tokens": 77618325.0,
|
|
"step": 33855
|
|
},
|
|
{
|
|
"entropy": 5.141686725616455,
|
|
"epoch": 3.2526416906820366,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003963115365182051,
|
|
"loss": 4.7248,
|
|
"mean_token_accuracy": 0.2364576131105423,
|
|
"num_tokens": 77629794.0,
|
|
"step": 33860
|
|
},
|
|
{
|
|
"entropy": 4.975665616989136,
|
|
"epoch": 3.2531219980787704,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00039628266143688554,
|
|
"loss": 4.5912,
|
|
"mean_token_accuracy": 0.25148352831602094,
|
|
"num_tokens": 77641800.0,
|
|
"step": 33865
|
|
},
|
|
{
|
|
"entropy": 4.997483825683593,
|
|
"epoch": 3.2536023054755043,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003962537835397426,
|
|
"loss": 4.5895,
|
|
"mean_token_accuracy": 0.24789944887161255,
|
|
"num_tokens": 77653533.0,
|
|
"step": 33870
|
|
},
|
|
{
|
|
"entropy": 5.035173130035401,
|
|
"epoch": 3.254082612872238,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039622490282744684,
|
|
"loss": 4.6184,
|
|
"mean_token_accuracy": 0.2500454217195511,
|
|
"num_tokens": 77663508.0,
|
|
"step": 33875
|
|
},
|
|
{
|
|
"entropy": 5.0597367763519285,
|
|
"epoch": 3.254562920268972,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003961960193006689,
|
|
"loss": 4.6473,
|
|
"mean_token_accuracy": 0.246576663851738,
|
|
"num_tokens": 77674956.0,
|
|
"step": 33880
|
|
},
|
|
{
|
|
"entropy": 5.108331680297852,
|
|
"epoch": 3.255043227665706,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003961671329600792,
|
|
"loss": 4.6458,
|
|
"mean_token_accuracy": 0.2456754356622696,
|
|
"num_tokens": 77686000.0,
|
|
"step": 33885
|
|
},
|
|
{
|
|
"entropy": 4.950703525543213,
|
|
"epoch": 3.25552353506244,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003961382438063485,
|
|
"loss": 4.5561,
|
|
"mean_token_accuracy": 0.24644300490617752,
|
|
"num_tokens": 77698979.0,
|
|
"step": 33890
|
|
},
|
|
{
|
|
"entropy": 4.954560852050781,
|
|
"epoch": 3.2560038424591737,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003961093518401475,
|
|
"loss": 4.5033,
|
|
"mean_token_accuracy": 0.25473318099975584,
|
|
"num_tokens": 77710397.0,
|
|
"step": 33895
|
|
},
|
|
{
|
|
"entropy": 5.02457480430603,
|
|
"epoch": 3.2564841498559076,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039608045706214696,
|
|
"loss": 4.6169,
|
|
"mean_token_accuracy": 0.24143614768981933,
|
|
"num_tokens": 77721665.0,
|
|
"step": 33900
|
|
},
|
|
{
|
|
"entropy": 5.0436742305755615,
|
|
"epoch": 3.2569644572526415,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003960515594730177,
|
|
"loss": 4.6195,
|
|
"mean_token_accuracy": 0.24937530755996704,
|
|
"num_tokens": 77733328.0,
|
|
"step": 33905
|
|
},
|
|
{
|
|
"entropy": 5.059139728546143,
|
|
"epoch": 3.2574447646493754,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003960226590734307,
|
|
"loss": 4.6327,
|
|
"mean_token_accuracy": 0.24265473634004592,
|
|
"num_tokens": 77745812.0,
|
|
"step": 33910
|
|
},
|
|
{
|
|
"entropy": 4.941529989242554,
|
|
"epoch": 3.2579250720461097,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000395993755864057,
|
|
"loss": 4.4927,
|
|
"mean_token_accuracy": 0.2593093618750572,
|
|
"num_tokens": 77757473.0,
|
|
"step": 33915
|
|
},
|
|
{
|
|
"entropy": 5.053327369689941,
|
|
"epoch": 3.2584053794428436,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003959648498455674,
|
|
"loss": 4.6265,
|
|
"mean_token_accuracy": 0.24523908346891404,
|
|
"num_tokens": 77769924.0,
|
|
"step": 33920
|
|
},
|
|
{
|
|
"entropy": 5.04257230758667,
|
|
"epoch": 3.2588856868395775,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00039593594101863333,
|
|
"loss": 4.6375,
|
|
"mean_token_accuracy": 0.2502197057008743,
|
|
"num_tokens": 77782048.0,
|
|
"step": 33925
|
|
},
|
|
{
|
|
"entropy": 4.931873321533203,
|
|
"epoch": 3.2593659942363113,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039590702938392577,
|
|
"loss": 4.4975,
|
|
"mean_token_accuracy": 0.26133894473314284,
|
|
"num_tokens": 77792371.0,
|
|
"step": 33930
|
|
},
|
|
{
|
|
"entropy": 5.0368239402771,
|
|
"epoch": 3.2598463016330452,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00039587811494211594,
|
|
"loss": 4.6242,
|
|
"mean_token_accuracy": 0.2455820083618164,
|
|
"num_tokens": 77804089.0,
|
|
"step": 33935
|
|
},
|
|
{
|
|
"entropy": 4.983446359634399,
|
|
"epoch": 3.260326609029779,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00039584919769387536,
|
|
"loss": 4.5917,
|
|
"mean_token_accuracy": 0.24377025067806243,
|
|
"num_tokens": 77815962.0,
|
|
"step": 33940
|
|
},
|
|
{
|
|
"entropy": 5.08806529045105,
|
|
"epoch": 3.260806916426513,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003958202776398751,
|
|
"loss": 4.7071,
|
|
"mean_token_accuracy": 0.23923599421977998,
|
|
"num_tokens": 77827396.0,
|
|
"step": 33945
|
|
},
|
|
{
|
|
"entropy": 5.032593154907227,
|
|
"epoch": 3.261287223823247,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003957913547807868,
|
|
"loss": 4.6387,
|
|
"mean_token_accuracy": 0.25274568498134614,
|
|
"num_tokens": 77838916.0,
|
|
"step": 33950
|
|
},
|
|
{
|
|
"entropy": 5.155986642837524,
|
|
"epoch": 3.2617675312199808,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000395762429117282,
|
|
"loss": 4.7851,
|
|
"mean_token_accuracy": 0.24382611513137817,
|
|
"num_tokens": 77851393.0,
|
|
"step": 33955
|
|
},
|
|
{
|
|
"entropy": 5.096858119964599,
|
|
"epoch": 3.2622478386167146,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003957335006500321,
|
|
"loss": 4.644,
|
|
"mean_token_accuracy": 0.24779659509658813,
|
|
"num_tokens": 77863124.0,
|
|
"step": 33960
|
|
},
|
|
{
|
|
"entropy": 4.9976495742797855,
|
|
"epoch": 3.2627281460134485,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00039570456937970883,
|
|
"loss": 4.5921,
|
|
"mean_token_accuracy": 0.2588670402765274,
|
|
"num_tokens": 77875568.0,
|
|
"step": 33965
|
|
},
|
|
{
|
|
"entropy": 5.091350555419922,
|
|
"epoch": 3.2632084534101824,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003956756353069839,
|
|
"loss": 4.6674,
|
|
"mean_token_accuracy": 0.24813520759344102,
|
|
"num_tokens": 77885747.0,
|
|
"step": 33970
|
|
},
|
|
{
|
|
"entropy": 4.9637257099151615,
|
|
"epoch": 3.2636887608069163,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003956466984325291,
|
|
"loss": 4.5257,
|
|
"mean_token_accuracy": 0.2556587189435959,
|
|
"num_tokens": 77897074.0,
|
|
"step": 33975
|
|
},
|
|
{
|
|
"entropy": 5.046815204620361,
|
|
"epoch": 3.26416906820365,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039561775875701616,
|
|
"loss": 4.6491,
|
|
"mean_token_accuracy": 0.24351091235876082,
|
|
"num_tokens": 77907718.0,
|
|
"step": 33980
|
|
},
|
|
{
|
|
"entropy": 5.042139005661011,
|
|
"epoch": 3.264649375600384,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.000395588816281117,
|
|
"loss": 4.6144,
|
|
"mean_token_accuracy": 0.24225043952465058,
|
|
"num_tokens": 77919768.0,
|
|
"step": 33985
|
|
},
|
|
{
|
|
"entropy": 5.126606607437134,
|
|
"epoch": 3.2651296829971184,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003955598710055036,
|
|
"loss": 4.6581,
|
|
"mean_token_accuracy": 0.2500751346349716,
|
|
"num_tokens": 77930927.0,
|
|
"step": 33990
|
|
},
|
|
{
|
|
"entropy": 5.079805898666382,
|
|
"epoch": 3.2656099903938522,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000395530922930848,
|
|
"loss": 4.6082,
|
|
"mean_token_accuracy": 0.24244888722896576,
|
|
"num_tokens": 77941907.0,
|
|
"step": 33995
|
|
},
|
|
{
|
|
"entropy": 4.993637466430664,
|
|
"epoch": 3.266090297790586,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003955019720578223,
|
|
"loss": 4.559,
|
|
"mean_token_accuracy": 0.2548254653811455,
|
|
"num_tokens": 77952800.0,
|
|
"step": 34000
|
|
},
|
|
{
|
|
"entropy": 5.073926544189453,
|
|
"epoch": 3.26657060518732,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003954730183870987,
|
|
"loss": 4.6828,
|
|
"mean_token_accuracy": 0.24283509850502014,
|
|
"num_tokens": 77963874.0,
|
|
"step": 34005
|
|
},
|
|
{
|
|
"entropy": 4.987660312652588,
|
|
"epoch": 3.267050912584054,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003954440619193491,
|
|
"loss": 4.4803,
|
|
"mean_token_accuracy": 0.2583694770932198,
|
|
"num_tokens": 77975282.0,
|
|
"step": 34010
|
|
},
|
|
{
|
|
"entropy": 4.964640045166016,
|
|
"epoch": 3.2675312199807878,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00039541510265524626,
|
|
"loss": 4.5422,
|
|
"mean_token_accuracy": 0.25089375078678133,
|
|
"num_tokens": 77987511.0,
|
|
"step": 34015
|
|
},
|
|
{
|
|
"entropy": 5.047157144546508,
|
|
"epoch": 3.2680115273775217,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003953861405954622,
|
|
"loss": 4.6001,
|
|
"mean_token_accuracy": 0.24573374539613724,
|
|
"num_tokens": 77997805.0,
|
|
"step": 34020
|
|
},
|
|
{
|
|
"entropy": 5.05724778175354,
|
|
"epoch": 3.2684918347742555,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003953571757406694,
|
|
"loss": 4.6789,
|
|
"mean_token_accuracy": 0.24250999987125396,
|
|
"num_tokens": 78010312.0,
|
|
"step": 34025
|
|
},
|
|
{
|
|
"entropy": 5.050488424301148,
|
|
"epoch": 3.2689721421709894,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00039532820809154044,
|
|
"loss": 4.5598,
|
|
"mean_token_accuracy": 0.2540115833282471,
|
|
"num_tokens": 78019934.0,
|
|
"step": 34030
|
|
},
|
|
{
|
|
"entropy": 5.0048281192779545,
|
|
"epoch": 3.2694524495677233,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039529923764874774,
|
|
"loss": 4.5606,
|
|
"mean_token_accuracy": 0.2609712705016136,
|
|
"num_tokens": 78031263.0,
|
|
"step": 34035
|
|
},
|
|
{
|
|
"entropy": 4.980857086181641,
|
|
"epoch": 3.269932756964457,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000395270264412964,
|
|
"loss": 4.614,
|
|
"mean_token_accuracy": 0.25289792418479917,
|
|
"num_tokens": 78043829.0,
|
|
"step": 34040
|
|
},
|
|
{
|
|
"entropy": 5.137817287445069,
|
|
"epoch": 3.270413064361191,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039524128838486184,
|
|
"loss": 4.7425,
|
|
"mean_token_accuracy": 0.23789486587047576,
|
|
"num_tokens": 78055292.0,
|
|
"step": 34045
|
|
},
|
|
{
|
|
"entropy": 5.0570290088653564,
|
|
"epoch": 3.270893371757925,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000395212309565114,
|
|
"loss": 4.6443,
|
|
"mean_token_accuracy": 0.24769087135791779,
|
|
"num_tokens": 78066969.0,
|
|
"step": 34050
|
|
},
|
|
{
|
|
"entropy": 5.138012361526489,
|
|
"epoch": 3.271373679154659,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039518332795439326,
|
|
"loss": 4.7234,
|
|
"mean_token_accuracy": 0.24245439916849137,
|
|
"num_tokens": 78077994.0,
|
|
"step": 34055
|
|
},
|
|
{
|
|
"entropy": 5.004557514190674,
|
|
"epoch": 3.2718539865513927,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039515434355337254,
|
|
"loss": 4.5828,
|
|
"mean_token_accuracy": 0.24898525774478913,
|
|
"num_tokens": 78089730.0,
|
|
"step": 34060
|
|
},
|
|
{
|
|
"entropy": 5.042746686935425,
|
|
"epoch": 3.272334293948127,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003951253563627248,
|
|
"loss": 4.6474,
|
|
"mean_token_accuracy": 0.24422961920499803,
|
|
"num_tokens": 78100125.0,
|
|
"step": 34065
|
|
},
|
|
{
|
|
"entropy": 5.118434238433838,
|
|
"epoch": 3.2728146013448605,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003950963663831229,
|
|
"loss": 4.6964,
|
|
"mean_token_accuracy": 0.24061804115772248,
|
|
"num_tokens": 78112170.0,
|
|
"step": 34070
|
|
},
|
|
{
|
|
"entropy": 5.119843292236328,
|
|
"epoch": 3.273294908741595,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039506737361524007,
|
|
"loss": 4.6803,
|
|
"mean_token_accuracy": 0.2418634071946144,
|
|
"num_tokens": 78123796.0,
|
|
"step": 34075
|
|
},
|
|
{
|
|
"entropy": 5.041185426712036,
|
|
"epoch": 3.2737752161383287,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00039503837805974926,
|
|
"loss": 4.5614,
|
|
"mean_token_accuracy": 0.2536353379487991,
|
|
"num_tokens": 78134170.0,
|
|
"step": 34080
|
|
},
|
|
{
|
|
"entropy": 4.980127477645874,
|
|
"epoch": 3.2742555235350626,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00039500937971732376,
|
|
"loss": 4.6221,
|
|
"mean_token_accuracy": 0.2498932659626007,
|
|
"num_tokens": 78145140.0,
|
|
"step": 34085
|
|
},
|
|
{
|
|
"entropy": 5.1225217342376705,
|
|
"epoch": 3.2747358309317964,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003949803785886369,
|
|
"loss": 4.7428,
|
|
"mean_token_accuracy": 0.24274052083492278,
|
|
"num_tokens": 78157627.0,
|
|
"step": 34090
|
|
},
|
|
{
|
|
"entropy": 5.093050003051758,
|
|
"epoch": 3.2752161383285303,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039495137467436184,
|
|
"loss": 4.6975,
|
|
"mean_token_accuracy": 0.24486079663038254,
|
|
"num_tokens": 78170343.0,
|
|
"step": 34095
|
|
},
|
|
{
|
|
"entropy": 5.0705037117004395,
|
|
"epoch": 3.275696445725264,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00039492236797517206,
|
|
"loss": 4.6008,
|
|
"mean_token_accuracy": 0.2509596854448318,
|
|
"num_tokens": 78181393.0,
|
|
"step": 34100
|
|
},
|
|
{
|
|
"entropy": 5.033651685714721,
|
|
"epoch": 3.276176753121998,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000394893358491741,
|
|
"loss": 4.6341,
|
|
"mean_token_accuracy": 0.24210257232189178,
|
|
"num_tokens": 78193048.0,
|
|
"step": 34105
|
|
},
|
|
{
|
|
"entropy": 5.0034088611602785,
|
|
"epoch": 3.276657060518732,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039486434622474216,
|
|
"loss": 4.5586,
|
|
"mean_token_accuracy": 0.2518891394138336,
|
|
"num_tokens": 78203738.0,
|
|
"step": 34110
|
|
},
|
|
{
|
|
"entropy": 5.088667583465576,
|
|
"epoch": 3.277137367915466,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039483533117484916,
|
|
"loss": 4.6619,
|
|
"mean_token_accuracy": 0.23626221120357513,
|
|
"num_tokens": 78215171.0,
|
|
"step": 34115
|
|
},
|
|
{
|
|
"entropy": 5.018340349197388,
|
|
"epoch": 3.2776176753121997,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003948063133427356,
|
|
"loss": 4.5536,
|
|
"mean_token_accuracy": 0.24994443356990814,
|
|
"num_tokens": 78225901.0,
|
|
"step": 34120
|
|
},
|
|
{
|
|
"entropy": 5.001010227203369,
|
|
"epoch": 3.2780979827089336,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003947772927290751,
|
|
"loss": 4.5854,
|
|
"mean_token_accuracy": 0.24815791249275207,
|
|
"num_tokens": 78237327.0,
|
|
"step": 34125
|
|
},
|
|
{
|
|
"entropy": 5.011168193817139,
|
|
"epoch": 3.2785782901056675,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003947482693345416,
|
|
"loss": 4.6219,
|
|
"mean_token_accuracy": 0.2456973373889923,
|
|
"num_tokens": 78248030.0,
|
|
"step": 34130
|
|
},
|
|
{
|
|
"entropy": 4.986574697494507,
|
|
"epoch": 3.2790585975024014,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039471924315980894,
|
|
"loss": 4.5148,
|
|
"mean_token_accuracy": 0.25550462305545807,
|
|
"num_tokens": 78259303.0,
|
|
"step": 34135
|
|
},
|
|
{
|
|
"entropy": 5.041993141174316,
|
|
"epoch": 3.2795389048991357,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039469021420555087,
|
|
"loss": 4.62,
|
|
"mean_token_accuracy": 0.24644032418727874,
|
|
"num_tokens": 78271284.0,
|
|
"step": 34140
|
|
},
|
|
{
|
|
"entropy": 5.01559419631958,
|
|
"epoch": 3.280019212295869,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039466118247244143,
|
|
"loss": 4.5618,
|
|
"mean_token_accuracy": 0.2596289172768593,
|
|
"num_tokens": 78282718.0,
|
|
"step": 34145
|
|
},
|
|
{
|
|
"entropy": 4.9947576999664305,
|
|
"epoch": 3.2804995196926034,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003946321479611547,
|
|
"loss": 4.6023,
|
|
"mean_token_accuracy": 0.2562656417489052,
|
|
"num_tokens": 78294292.0,
|
|
"step": 34150
|
|
},
|
|
{
|
|
"entropy": 4.983988332748413,
|
|
"epoch": 3.2809798270893373,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039460311067236465,
|
|
"loss": 4.6255,
|
|
"mean_token_accuracy": 0.24257488548755646,
|
|
"num_tokens": 78306834.0,
|
|
"step": 34155
|
|
},
|
|
{
|
|
"entropy": 5.084269332885742,
|
|
"epoch": 3.281460134486071,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00039457407060674557,
|
|
"loss": 4.6643,
|
|
"mean_token_accuracy": 0.24376922398805617,
|
|
"num_tokens": 78319866.0,
|
|
"step": 34160
|
|
},
|
|
{
|
|
"entropy": 5.0587766647338865,
|
|
"epoch": 3.281940441882805,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039454502776497163,
|
|
"loss": 4.6218,
|
|
"mean_token_accuracy": 0.24582590013742447,
|
|
"num_tokens": 78330933.0,
|
|
"step": 34165
|
|
},
|
|
{
|
|
"entropy": 5.02895336151123,
|
|
"epoch": 3.282420749279539,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039451598214771706,
|
|
"loss": 4.5921,
|
|
"mean_token_accuracy": 0.25370020866394044,
|
|
"num_tokens": 78342323.0,
|
|
"step": 34170
|
|
},
|
|
{
|
|
"entropy": 5.072585821151733,
|
|
"epoch": 3.282901056676273,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003944869337556563,
|
|
"loss": 4.6043,
|
|
"mean_token_accuracy": 0.2609324440360069,
|
|
"num_tokens": 78353090.0,
|
|
"step": 34175
|
|
},
|
|
{
|
|
"entropy": 5.058878993988037,
|
|
"epoch": 3.2833813640730067,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003944578825894639,
|
|
"loss": 4.6583,
|
|
"mean_token_accuracy": 0.2444702386856079,
|
|
"num_tokens": 78362679.0,
|
|
"step": 34180
|
|
},
|
|
{
|
|
"entropy": 5.04763503074646,
|
|
"epoch": 3.2838616714697406,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039442882864981397,
|
|
"loss": 4.6504,
|
|
"mean_token_accuracy": 0.24100523293018342,
|
|
"num_tokens": 78373592.0,
|
|
"step": 34185
|
|
},
|
|
{
|
|
"entropy": 4.972846364974975,
|
|
"epoch": 3.2843419788664745,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00039439977193738134,
|
|
"loss": 4.5258,
|
|
"mean_token_accuracy": 0.2608575657010078,
|
|
"num_tokens": 78384617.0,
|
|
"step": 34190
|
|
},
|
|
{
|
|
"entropy": 5.000351285934448,
|
|
"epoch": 3.2848222862632084,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039437071245284055,
|
|
"loss": 4.6439,
|
|
"mean_token_accuracy": 0.24550879001617432,
|
|
"num_tokens": 78396253.0,
|
|
"step": 34195
|
|
},
|
|
{
|
|
"entropy": 5.083364057540893,
|
|
"epoch": 3.2853025936599423,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003943416501968663,
|
|
"loss": 4.7062,
|
|
"mean_token_accuracy": 0.2408829912543297,
|
|
"num_tokens": 78407852.0,
|
|
"step": 34200
|
|
},
|
|
{
|
|
"entropy": 5.002320957183838,
|
|
"epoch": 3.285782901056676,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039431258517013323,
|
|
"loss": 4.5475,
|
|
"mean_token_accuracy": 0.2584647685289383,
|
|
"num_tokens": 78419434.0,
|
|
"step": 34205
|
|
},
|
|
{
|
|
"entropy": 5.072924947738647,
|
|
"epoch": 3.28626320845341,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003942835173733163,
|
|
"loss": 4.6388,
|
|
"mean_token_accuracy": 0.24000005573034286,
|
|
"num_tokens": 78431820.0,
|
|
"step": 34210
|
|
},
|
|
{
|
|
"entropy": 5.077604722976685,
|
|
"epoch": 3.286743515850144,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003942544468070903,
|
|
"loss": 4.6388,
|
|
"mean_token_accuracy": 0.24990027397871017,
|
|
"num_tokens": 78443062.0,
|
|
"step": 34215
|
|
},
|
|
{
|
|
"entropy": 5.091687536239624,
|
|
"epoch": 3.287223823246878,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003942253734721301,
|
|
"loss": 4.6612,
|
|
"mean_token_accuracy": 0.24888237565755844,
|
|
"num_tokens": 78454091.0,
|
|
"step": 34220
|
|
},
|
|
{
|
|
"entropy": 5.073592567443848,
|
|
"epoch": 3.287704130643612,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00039419629736911076,
|
|
"loss": 4.7042,
|
|
"mean_token_accuracy": 0.24581227451562881,
|
|
"num_tokens": 78466108.0,
|
|
"step": 34225
|
|
},
|
|
{
|
|
"entropy": 4.997543239593506,
|
|
"epoch": 3.288184438040346,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039416721849870736,
|
|
"loss": 4.5288,
|
|
"mean_token_accuracy": 0.250569124519825,
|
|
"num_tokens": 78476690.0,
|
|
"step": 34230
|
|
},
|
|
{
|
|
"entropy": 5.145573568344116,
|
|
"epoch": 3.28866474543708,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003941381368615951,
|
|
"loss": 4.7435,
|
|
"mean_token_accuracy": 0.23636368811130523,
|
|
"num_tokens": 78487157.0,
|
|
"step": 34235
|
|
},
|
|
{
|
|
"entropy": 5.09866304397583,
|
|
"epoch": 3.2891450528338138,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003941090524584489,
|
|
"loss": 4.6557,
|
|
"mean_token_accuracy": 0.24694444388151168,
|
|
"num_tokens": 78499034.0,
|
|
"step": 34240
|
|
},
|
|
{
|
|
"entropy": 5.00386004447937,
|
|
"epoch": 3.2896253602305476,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003940799652899442,
|
|
"loss": 4.5568,
|
|
"mean_token_accuracy": 0.25482958406209943,
|
|
"num_tokens": 78509720.0,
|
|
"step": 34245
|
|
},
|
|
{
|
|
"entropy": 5.045232772827148,
|
|
"epoch": 3.2901056676272815,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003940508753567564,
|
|
"loss": 4.6288,
|
|
"mean_token_accuracy": 0.24974631518125534,
|
|
"num_tokens": 78522126.0,
|
|
"step": 34250
|
|
},
|
|
{
|
|
"entropy": 5.113291025161743,
|
|
"epoch": 3.2905859750240154,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00039402178265956074,
|
|
"loss": 4.7041,
|
|
"mean_token_accuracy": 0.2427150070667267,
|
|
"num_tokens": 78533348.0,
|
|
"step": 34255
|
|
},
|
|
{
|
|
"entropy": 5.0764930725097654,
|
|
"epoch": 3.2910662824207493,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003939926871990328,
|
|
"loss": 4.6161,
|
|
"mean_token_accuracy": 0.2491741433739662,
|
|
"num_tokens": 78545889.0,
|
|
"step": 34260
|
|
},
|
|
{
|
|
"entropy": 5.077715158462524,
|
|
"epoch": 3.291546589817483,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003939635889758478,
|
|
"loss": 4.656,
|
|
"mean_token_accuracy": 0.24635598659515381,
|
|
"num_tokens": 78557429.0,
|
|
"step": 34265
|
|
},
|
|
{
|
|
"entropy": 4.983099269866943,
|
|
"epoch": 3.292026897214217,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039393448799068164,
|
|
"loss": 4.5366,
|
|
"mean_token_accuracy": 0.2637444019317627,
|
|
"num_tokens": 78567824.0,
|
|
"step": 34270
|
|
},
|
|
{
|
|
"entropy": 5.100458812713623,
|
|
"epoch": 3.292507204610951,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003939053842442098,
|
|
"loss": 4.6634,
|
|
"mean_token_accuracy": 0.2483208805322647,
|
|
"num_tokens": 78578229.0,
|
|
"step": 34275
|
|
},
|
|
{
|
|
"entropy": 5.04512939453125,
|
|
"epoch": 3.292987512007685,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00039387627773710803,
|
|
"loss": 4.6441,
|
|
"mean_token_accuracy": 0.2516984835267067,
|
|
"num_tokens": 78591619.0,
|
|
"step": 34280
|
|
},
|
|
{
|
|
"entropy": 4.999605941772461,
|
|
"epoch": 3.2934678194044187,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039384716847005216,
|
|
"loss": 4.5984,
|
|
"mean_token_accuracy": 0.251834337413311,
|
|
"num_tokens": 78602576.0,
|
|
"step": 34285
|
|
},
|
|
{
|
|
"entropy": 4.958413600921631,
|
|
"epoch": 3.2939481268011526,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039381805644371774,
|
|
"loss": 4.6043,
|
|
"mean_token_accuracy": 0.2595936581492424,
|
|
"num_tokens": 78613532.0,
|
|
"step": 34290
|
|
},
|
|
{
|
|
"entropy": 5.088353824615479,
|
|
"epoch": 3.2944284341978864,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000393788941658781,
|
|
"loss": 4.6476,
|
|
"mean_token_accuracy": 0.24660000056028367,
|
|
"num_tokens": 78626120.0,
|
|
"step": 34295
|
|
},
|
|
{
|
|
"entropy": 5.0401242733001705,
|
|
"epoch": 3.2949087415946208,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039375982411591774,
|
|
"loss": 4.6076,
|
|
"mean_token_accuracy": 0.24968953877687455,
|
|
"num_tokens": 78636175.0,
|
|
"step": 34300
|
|
},
|
|
{
|
|
"entropy": 5.1204423904418945,
|
|
"epoch": 3.2953890489913547,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039373070381580404,
|
|
"loss": 4.6746,
|
|
"mean_token_accuracy": 0.24043382555246354,
|
|
"num_tokens": 78647429.0,
|
|
"step": 34305
|
|
},
|
|
{
|
|
"entropy": 5.073682069778442,
|
|
"epoch": 3.2958693563880885,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003937015807591159,
|
|
"loss": 4.6841,
|
|
"mean_token_accuracy": 0.2399148389697075,
|
|
"num_tokens": 78659792.0,
|
|
"step": 34310
|
|
},
|
|
{
|
|
"entropy": 5.013111019134522,
|
|
"epoch": 3.2963496637848224,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039367245494652963,
|
|
"loss": 4.5591,
|
|
"mean_token_accuracy": 0.2582303687930107,
|
|
"num_tokens": 78670614.0,
|
|
"step": 34315
|
|
},
|
|
{
|
|
"entropy": 5.007979106903076,
|
|
"epoch": 3.2968299711815563,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00039364332637872125,
|
|
"loss": 4.5678,
|
|
"mean_token_accuracy": 0.2522773787379265,
|
|
"num_tokens": 78681475.0,
|
|
"step": 34320
|
|
},
|
|
{
|
|
"entropy": 5.03101658821106,
|
|
"epoch": 3.29731027857829,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00039361419505636714,
|
|
"loss": 4.6014,
|
|
"mean_token_accuracy": 0.2510263308882713,
|
|
"num_tokens": 78692251.0,
|
|
"step": 34325
|
|
},
|
|
{
|
|
"entropy": 5.033343839645386,
|
|
"epoch": 3.297790585975024,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00039358506098014363,
|
|
"loss": 4.623,
|
|
"mean_token_accuracy": 0.24910807013511657,
|
|
"num_tokens": 78704584.0,
|
|
"step": 34330
|
|
},
|
|
{
|
|
"entropy": 5.027398014068604,
|
|
"epoch": 3.298270893371758,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00039355592415072716,
|
|
"loss": 4.5556,
|
|
"mean_token_accuracy": 0.24752137809991837,
|
|
"num_tokens": 78715467.0,
|
|
"step": 34335
|
|
},
|
|
{
|
|
"entropy": 5.020139980316162,
|
|
"epoch": 3.298751200768492,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00039352678456879415,
|
|
"loss": 4.5599,
|
|
"mean_token_accuracy": 0.2560836523771286,
|
|
"num_tokens": 78725835.0,
|
|
"step": 34340
|
|
},
|
|
{
|
|
"entropy": 4.993885707855225,
|
|
"epoch": 3.2992315081652257,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003934976422350212,
|
|
"loss": 4.5578,
|
|
"mean_token_accuracy": 0.2558558017015457,
|
|
"num_tokens": 78737143.0,
|
|
"step": 34345
|
|
},
|
|
{
|
|
"entropy": 5.193957424163818,
|
|
"epoch": 3.2997118155619596,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003934684971500848,
|
|
"loss": 4.7854,
|
|
"mean_token_accuracy": 0.24060207307338716,
|
|
"num_tokens": 78748641.0,
|
|
"step": 34350
|
|
},
|
|
{
|
|
"entropy": 5.041642332077027,
|
|
"epoch": 3.3001921229586935,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00039343934931466165,
|
|
"loss": 4.605,
|
|
"mean_token_accuracy": 0.24833667576313018,
|
|
"num_tokens": 78760200.0,
|
|
"step": 34355
|
|
},
|
|
{
|
|
"entropy": 4.999837636947632,
|
|
"epoch": 3.3006724303554273,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039341019872942855,
|
|
"loss": 4.589,
|
|
"mean_token_accuracy": 0.25284974128007887,
|
|
"num_tokens": 78771347.0,
|
|
"step": 34360
|
|
},
|
|
{
|
|
"entropy": 5.003264427185059,
|
|
"epoch": 3.3011527377521612,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039338104539506227,
|
|
"loss": 4.5099,
|
|
"mean_token_accuracy": 0.2578217089176178,
|
|
"num_tokens": 78783514.0,
|
|
"step": 34365
|
|
},
|
|
{
|
|
"entropy": 5.04760332107544,
|
|
"epoch": 3.301633045148895,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003933518893122396,
|
|
"loss": 4.6619,
|
|
"mean_token_accuracy": 0.24556959122419358,
|
|
"num_tokens": 78794621.0,
|
|
"step": 34370
|
|
},
|
|
{
|
|
"entropy": 5.0478212356567385,
|
|
"epoch": 3.3021133525456294,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003933227304816375,
|
|
"loss": 4.6196,
|
|
"mean_token_accuracy": 0.2519634172320366,
|
|
"num_tokens": 78806293.0,
|
|
"step": 34375
|
|
},
|
|
{
|
|
"entropy": 5.144480228424072,
|
|
"epoch": 3.302593659942363,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000393293568903933,
|
|
"loss": 4.6596,
|
|
"mean_token_accuracy": 0.24188109934329988,
|
|
"num_tokens": 78816930.0,
|
|
"step": 34380
|
|
},
|
|
{
|
|
"entropy": 5.061930131912232,
|
|
"epoch": 3.303073967339097,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003932644045798029,
|
|
"loss": 4.7013,
|
|
"mean_token_accuracy": 0.23837460577487946,
|
|
"num_tokens": 78828321.0,
|
|
"step": 34385
|
|
},
|
|
{
|
|
"entropy": 5.046088790893554,
|
|
"epoch": 3.303554274735831,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003932352375099247,
|
|
"loss": 4.6959,
|
|
"mean_token_accuracy": 0.24086285680532454,
|
|
"num_tokens": 78840967.0,
|
|
"step": 34390
|
|
},
|
|
{
|
|
"entropy": 5.038535118103027,
|
|
"epoch": 3.304034582132565,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003932060676949753,
|
|
"loss": 4.6158,
|
|
"mean_token_accuracy": 0.24674846231937408,
|
|
"num_tokens": 78853037.0,
|
|
"step": 34395
|
|
},
|
|
{
|
|
"entropy": 5.139471483230591,
|
|
"epoch": 3.304514889529299,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003931768951356319,
|
|
"loss": 4.6476,
|
|
"mean_token_accuracy": 0.24751783162355423,
|
|
"num_tokens": 78863441.0,
|
|
"step": 34400
|
|
},
|
|
{
|
|
"entropy": 4.912761688232422,
|
|
"epoch": 3.3049951969260327,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003931477198325721,
|
|
"loss": 4.5402,
|
|
"mean_token_accuracy": 0.2573926866054535,
|
|
"num_tokens": 78875770.0,
|
|
"step": 34405
|
|
},
|
|
{
|
|
"entropy": 5.050485706329345,
|
|
"epoch": 3.3054755043227666,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000393118541786473,
|
|
"loss": 4.6137,
|
|
"mean_token_accuracy": 0.25367127656936644,
|
|
"num_tokens": 78886239.0,
|
|
"step": 34410
|
|
},
|
|
{
|
|
"entropy": 5.072754001617431,
|
|
"epoch": 3.3059558117195005,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00039308936099801203,
|
|
"loss": 4.6464,
|
|
"mean_token_accuracy": 0.24211475551128386,
|
|
"num_tokens": 78897497.0,
|
|
"step": 34415
|
|
},
|
|
{
|
|
"entropy": 5.091218328475952,
|
|
"epoch": 3.3064361191162344,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003930601774678669,
|
|
"loss": 4.6657,
|
|
"mean_token_accuracy": 0.244375142455101,
|
|
"num_tokens": 78908643.0,
|
|
"step": 34420
|
|
},
|
|
{
|
|
"entropy": 5.081040143966675,
|
|
"epoch": 3.3069164265129682,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039303099119671487,
|
|
"loss": 4.7168,
|
|
"mean_token_accuracy": 0.24277334958314895,
|
|
"num_tokens": 78921240.0,
|
|
"step": 34425
|
|
},
|
|
{
|
|
"entropy": 5.032566070556641,
|
|
"epoch": 3.307396733909702,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039300180218523374,
|
|
"loss": 4.6431,
|
|
"mean_token_accuracy": 0.255554161965847,
|
|
"num_tokens": 78933803.0,
|
|
"step": 34430
|
|
},
|
|
{
|
|
"entropy": 5.148969936370849,
|
|
"epoch": 3.307877041306436,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003929726104341013,
|
|
"loss": 4.7358,
|
|
"mean_token_accuracy": 0.2412612333893776,
|
|
"num_tokens": 78944876.0,
|
|
"step": 34435
|
|
},
|
|
{
|
|
"entropy": 5.021073818206787,
|
|
"epoch": 3.30835734870317,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039294341594399494,
|
|
"loss": 4.5694,
|
|
"mean_token_accuracy": 0.25430659353733065,
|
|
"num_tokens": 78957044.0,
|
|
"step": 34440
|
|
},
|
|
{
|
|
"entropy": 4.975513029098511,
|
|
"epoch": 3.3088376560999038,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039291421871559274,
|
|
"loss": 4.5802,
|
|
"mean_token_accuracy": 0.25888924300670624,
|
|
"num_tokens": 78968654.0,
|
|
"step": 34445
|
|
},
|
|
{
|
|
"entropy": 5.046443319320678,
|
|
"epoch": 3.309317963496638,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039288501874957263,
|
|
"loss": 4.6547,
|
|
"mean_token_accuracy": 0.2466696321964264,
|
|
"num_tokens": 78980201.0,
|
|
"step": 34450
|
|
},
|
|
{
|
|
"entropy": 5.0870613098144535,
|
|
"epoch": 3.3097982708933715,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003928558160466123,
|
|
"loss": 4.663,
|
|
"mean_token_accuracy": 0.2413409322500229,
|
|
"num_tokens": 78991172.0,
|
|
"step": 34455
|
|
},
|
|
{
|
|
"entropy": 5.024161911010742,
|
|
"epoch": 3.310278578290106,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003928266106073899,
|
|
"loss": 4.6257,
|
|
"mean_token_accuracy": 0.2498500108718872,
|
|
"num_tokens": 79001944.0,
|
|
"step": 34460
|
|
},
|
|
{
|
|
"entropy": 5.031950569152832,
|
|
"epoch": 3.3107588856868397,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00039279740243258353,
|
|
"loss": 4.6391,
|
|
"mean_token_accuracy": 0.2514120519161224,
|
|
"num_tokens": 79014715.0,
|
|
"step": 34465
|
|
},
|
|
{
|
|
"entropy": 5.045935392379761,
|
|
"epoch": 3.3112391930835736,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003927681915228712,
|
|
"loss": 4.5737,
|
|
"mean_token_accuracy": 0.2540336072444916,
|
|
"num_tokens": 79025499.0,
|
|
"step": 34470
|
|
},
|
|
{
|
|
"entropy": 5.014480066299439,
|
|
"epoch": 3.3117195004803075,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0003927389778789312,
|
|
"loss": 4.6461,
|
|
"mean_token_accuracy": 0.2476935252547264,
|
|
"num_tokens": 79037400.0,
|
|
"step": 34475
|
|
},
|
|
{
|
|
"entropy": 5.067327260971069,
|
|
"epoch": 3.3121998078770414,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003927097615014418,
|
|
"loss": 4.6303,
|
|
"mean_token_accuracy": 0.24671047925949097,
|
|
"num_tokens": 79048450.0,
|
|
"step": 34480
|
|
},
|
|
{
|
|
"entropy": 5.0652423858642575,
|
|
"epoch": 3.3126801152737753,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003926805423910811,
|
|
"loss": 4.6727,
|
|
"mean_token_accuracy": 0.2537225067615509,
|
|
"num_tokens": 79061579.0,
|
|
"step": 34485
|
|
},
|
|
{
|
|
"entropy": 5.02296199798584,
|
|
"epoch": 3.313160422670509,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003926513205485278,
|
|
"loss": 4.5953,
|
|
"mean_token_accuracy": 0.24416229724884034,
|
|
"num_tokens": 79073106.0,
|
|
"step": 34490
|
|
},
|
|
{
|
|
"entropy": 5.067980146408081,
|
|
"epoch": 3.313640730067243,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003926220959744602,
|
|
"loss": 4.6871,
|
|
"mean_token_accuracy": 0.24308695495128632,
|
|
"num_tokens": 79085528.0,
|
|
"step": 34495
|
|
},
|
|
{
|
|
"entropy": 5.017328786849975,
|
|
"epoch": 3.314121037463977,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003925928686695567,
|
|
"loss": 4.5148,
|
|
"mean_token_accuracy": 0.2620679005980492,
|
|
"num_tokens": 79096429.0,
|
|
"step": 34500
|
|
},
|
|
{
|
|
"entropy": 5.082864093780517,
|
|
"epoch": 3.314601344860711,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000392563638634496,
|
|
"loss": 4.6977,
|
|
"mean_token_accuracy": 0.2402432456612587,
|
|
"num_tokens": 79107688.0,
|
|
"step": 34505
|
|
},
|
|
{
|
|
"entropy": 5.049029350280762,
|
|
"epoch": 3.3150816522574447,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003925344058699567,
|
|
"loss": 4.5991,
|
|
"mean_token_accuracy": 0.2516641363501549,
|
|
"num_tokens": 79119549.0,
|
|
"step": 34510
|
|
},
|
|
{
|
|
"entropy": 4.996371936798096,
|
|
"epoch": 3.3155619596541785,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003925051703766175,
|
|
"loss": 4.5842,
|
|
"mean_token_accuracy": 0.25428757518529893,
|
|
"num_tokens": 79131193.0,
|
|
"step": 34515
|
|
},
|
|
{
|
|
"entropy": 5.008741044998169,
|
|
"epoch": 3.3160422670509124,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003924759321551572,
|
|
"loss": 4.6671,
|
|
"mean_token_accuracy": 0.238971708714962,
|
|
"num_tokens": 79142117.0,
|
|
"step": 34520
|
|
},
|
|
{
|
|
"entropy": 5.074768209457398,
|
|
"epoch": 3.3165225744476463,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003924466912062546,
|
|
"loss": 4.6178,
|
|
"mean_token_accuracy": 0.24548172652721406,
|
|
"num_tokens": 79154734.0,
|
|
"step": 34525
|
|
},
|
|
{
|
|
"entropy": 5.067018842697143,
|
|
"epoch": 3.31700288184438,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003924174475305885,
|
|
"loss": 4.6499,
|
|
"mean_token_accuracy": 0.24706732481718063,
|
|
"num_tokens": 79166077.0,
|
|
"step": 34530
|
|
},
|
|
{
|
|
"entropy": 5.053887891769409,
|
|
"epoch": 3.3174831892411145,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039238820112883795,
|
|
"loss": 4.6358,
|
|
"mean_token_accuracy": 0.2421278402209282,
|
|
"num_tokens": 79177544.0,
|
|
"step": 34535
|
|
},
|
|
{
|
|
"entropy": 5.022403430938721,
|
|
"epoch": 3.3179634966378484,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003923589520016819,
|
|
"loss": 4.6493,
|
|
"mean_token_accuracy": 0.2522857293486595,
|
|
"num_tokens": 79189606.0,
|
|
"step": 34540
|
|
},
|
|
{
|
|
"entropy": 4.980406093597412,
|
|
"epoch": 3.3184438040345823,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039232970014979965,
|
|
"loss": 4.5205,
|
|
"mean_token_accuracy": 0.257637657225132,
|
|
"num_tokens": 79201385.0,
|
|
"step": 34545
|
|
},
|
|
{
|
|
"entropy": 5.0712072372436525,
|
|
"epoch": 3.318924111431316,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003923004455738699,
|
|
"loss": 4.688,
|
|
"mean_token_accuracy": 0.23973239362239837,
|
|
"num_tokens": 79213481.0,
|
|
"step": 34550
|
|
},
|
|
{
|
|
"entropy": 5.036832189559936,
|
|
"epoch": 3.31940441882805,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00039227118827457234,
|
|
"loss": 4.5735,
|
|
"mean_token_accuracy": 0.2491331622004509,
|
|
"num_tokens": 79226369.0,
|
|
"step": 34555
|
|
},
|
|
{
|
|
"entropy": 5.086540699005127,
|
|
"epoch": 3.319884726224784,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039224192825258584,
|
|
"loss": 4.683,
|
|
"mean_token_accuracy": 0.24358577728271485,
|
|
"num_tokens": 79238236.0,
|
|
"step": 34560
|
|
},
|
|
{
|
|
"entropy": 5.090976619720459,
|
|
"epoch": 3.320365033621518,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00039221266550859004,
|
|
"loss": 4.6357,
|
|
"mean_token_accuracy": 0.24827074408531188,
|
|
"num_tokens": 79248536.0,
|
|
"step": 34565
|
|
},
|
|
{
|
|
"entropy": 5.061130809783935,
|
|
"epoch": 3.3208453410182517,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039218340004326414,
|
|
"loss": 4.6051,
|
|
"mean_token_accuracy": 0.2513726234436035,
|
|
"num_tokens": 79259273.0,
|
|
"step": 34570
|
|
},
|
|
{
|
|
"entropy": 4.9110987186431885,
|
|
"epoch": 3.3213256484149856,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003921541318572876,
|
|
"loss": 4.5244,
|
|
"mean_token_accuracy": 0.255599670112133,
|
|
"num_tokens": 79271986.0,
|
|
"step": 34575
|
|
},
|
|
{
|
|
"entropy": 5.030234861373901,
|
|
"epoch": 3.3218059558117194,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00039212486095134005,
|
|
"loss": 4.6012,
|
|
"mean_token_accuracy": 0.25354058742523194,
|
|
"num_tokens": 79282602.0,
|
|
"step": 34580
|
|
},
|
|
{
|
|
"entropy": 5.048261070251465,
|
|
"epoch": 3.3222862632084533,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003920955873261011,
|
|
"loss": 4.6313,
|
|
"mean_token_accuracy": 0.2526991873979568,
|
|
"num_tokens": 79295034.0,
|
|
"step": 34585
|
|
},
|
|
{
|
|
"entropy": 4.9824480533599855,
|
|
"epoch": 3.322766570605187,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0003920663109822502,
|
|
"loss": 4.6067,
|
|
"mean_token_accuracy": 0.24954349547624588,
|
|
"num_tokens": 79307434.0,
|
|
"step": 34590
|
|
},
|
|
{
|
|
"entropy": 5.1025842189788815,
|
|
"epoch": 3.323246878001921,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00039203703192046717,
|
|
"loss": 4.6699,
|
|
"mean_token_accuracy": 0.24924284517765044,
|
|
"num_tokens": 79318185.0,
|
|
"step": 34595
|
|
},
|
|
{
|
|
"entropy": 5.00308198928833,
|
|
"epoch": 3.323727185398655,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003920077501414318,
|
|
"loss": 4.5686,
|
|
"mean_token_accuracy": 0.25722116231918335,
|
|
"num_tokens": 79329800.0,
|
|
"step": 34600
|
|
},
|
|
{
|
|
"entropy": 4.989739322662354,
|
|
"epoch": 3.324207492795389,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039197846564582395,
|
|
"loss": 4.6425,
|
|
"mean_token_accuracy": 0.24360090494155884,
|
|
"num_tokens": 79342537.0,
|
|
"step": 34605
|
|
},
|
|
{
|
|
"entropy": 5.022253942489624,
|
|
"epoch": 3.324687800192123,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00039194917843432347,
|
|
"loss": 4.6199,
|
|
"mean_token_accuracy": 0.25004632622003553,
|
|
"num_tokens": 79353727.0,
|
|
"step": 34610
|
|
},
|
|
{
|
|
"entropy": 5.032826948165893,
|
|
"epoch": 3.325168107588857,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003919198885076103,
|
|
"loss": 4.5463,
|
|
"mean_token_accuracy": 0.25729864537715913,
|
|
"num_tokens": 79364337.0,
|
|
"step": 34615
|
|
},
|
|
{
|
|
"entropy": 5.034902667999267,
|
|
"epoch": 3.325648414985591,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039189059586636465,
|
|
"loss": 4.604,
|
|
"mean_token_accuracy": 0.25298081040382386,
|
|
"num_tokens": 79375890.0,
|
|
"step": 34620
|
|
},
|
|
{
|
|
"entropy": 5.0548238277435305,
|
|
"epoch": 3.326128722382325,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003918613005112663,
|
|
"loss": 4.6147,
|
|
"mean_token_accuracy": 0.2492382198572159,
|
|
"num_tokens": 79386686.0,
|
|
"step": 34625
|
|
},
|
|
{
|
|
"entropy": 5.050034379959106,
|
|
"epoch": 3.3266090297790587,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003918320024429956,
|
|
"loss": 4.6355,
|
|
"mean_token_accuracy": 0.24784999787807466,
|
|
"num_tokens": 79397852.0,
|
|
"step": 34630
|
|
},
|
|
{
|
|
"entropy": 5.052454900741577,
|
|
"epoch": 3.3270893371757926,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003918027016622328,
|
|
"loss": 4.6198,
|
|
"mean_token_accuracy": 0.2552115857601166,
|
|
"num_tokens": 79409547.0,
|
|
"step": 34635
|
|
},
|
|
{
|
|
"entropy": 4.92240686416626,
|
|
"epoch": 3.3275696445725265,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003917733981696579,
|
|
"loss": 4.5988,
|
|
"mean_token_accuracy": 0.2538344353437424,
|
|
"num_tokens": 79421906.0,
|
|
"step": 34640
|
|
},
|
|
{
|
|
"entropy": 5.0243449211120605,
|
|
"epoch": 3.3280499519692603,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003917440919659516,
|
|
"loss": 4.594,
|
|
"mean_token_accuracy": 0.25182389467954636,
|
|
"num_tokens": 79433767.0,
|
|
"step": 34645
|
|
},
|
|
{
|
|
"entropy": 5.105506849288941,
|
|
"epoch": 3.3285302593659942,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003917147830517941,
|
|
"loss": 4.643,
|
|
"mean_token_accuracy": 0.2416081815958023,
|
|
"num_tokens": 79445488.0,
|
|
"step": 34650
|
|
},
|
|
{
|
|
"entropy": 5.068530893325805,
|
|
"epoch": 3.329010566762728,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0003916854714278659,
|
|
"loss": 4.6105,
|
|
"mean_token_accuracy": 0.25305124223232267,
|
|
"num_tokens": 79457908.0,
|
|
"step": 34655
|
|
},
|
|
{
|
|
"entropy": 5.064067792892456,
|
|
"epoch": 3.329490874159462,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003916561570948476,
|
|
"loss": 4.6647,
|
|
"mean_token_accuracy": 0.2456543266773224,
|
|
"num_tokens": 79469876.0,
|
|
"step": 34660
|
|
},
|
|
{
|
|
"entropy": 5.001161003112793,
|
|
"epoch": 3.329971181556196,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003916268400534197,
|
|
"loss": 4.541,
|
|
"mean_token_accuracy": 0.25127813071012495,
|
|
"num_tokens": 79480532.0,
|
|
"step": 34665
|
|
},
|
|
{
|
|
"entropy": 5.0196448802948,
|
|
"epoch": 3.3304514889529298,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003915975203042628,
|
|
"loss": 4.6284,
|
|
"mean_token_accuracy": 0.25387539267539977,
|
|
"num_tokens": 79491298.0,
|
|
"step": 34670
|
|
},
|
|
{
|
|
"entropy": 5.026435327529907,
|
|
"epoch": 3.3309317963496636,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039156819784805783,
|
|
"loss": 4.6023,
|
|
"mean_token_accuracy": 0.24340915977954863,
|
|
"num_tokens": 79502794.0,
|
|
"step": 34675
|
|
},
|
|
{
|
|
"entropy": 5.02727255821228,
|
|
"epoch": 3.3314121037463975,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003915388726854854,
|
|
"loss": 4.6149,
|
|
"mean_token_accuracy": 0.24926782846450807,
|
|
"num_tokens": 79513104.0,
|
|
"step": 34680
|
|
},
|
|
{
|
|
"entropy": 5.1041487693786625,
|
|
"epoch": 3.331892411143132,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039150954481722634,
|
|
"loss": 4.7032,
|
|
"mean_token_accuracy": 0.23929551988840103,
|
|
"num_tokens": 79524660.0,
|
|
"step": 34685
|
|
},
|
|
{
|
|
"entropy": 5.1001204490661625,
|
|
"epoch": 3.3323727185398653,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003914802142439617,
|
|
"loss": 4.6225,
|
|
"mean_token_accuracy": 0.24540394246578218,
|
|
"num_tokens": 79538027.0,
|
|
"step": 34690
|
|
},
|
|
{
|
|
"entropy": 5.030107164382935,
|
|
"epoch": 3.3328530259365996,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003914508809663723,
|
|
"loss": 4.5675,
|
|
"mean_token_accuracy": 0.2557969391345978,
|
|
"num_tokens": 79548894.0,
|
|
"step": 34695
|
|
},
|
|
{
|
|
"entropy": 5.004552030563355,
|
|
"epoch": 3.3333333333333335,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039142154498513913,
|
|
"loss": 4.6753,
|
|
"mean_token_accuracy": 0.24684297144412995,
|
|
"num_tokens": 79560520.0,
|
|
"step": 34700
|
|
},
|
|
{
|
|
"entropy": 5.088085651397705,
|
|
"epoch": 3.3338136407300674,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00039139220630094357,
|
|
"loss": 4.6439,
|
|
"mean_token_accuracy": 0.25575283318758013,
|
|
"num_tokens": 79571440.0,
|
|
"step": 34705
|
|
},
|
|
{
|
|
"entropy": 5.060253047943116,
|
|
"epoch": 3.3342939481268012,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00039136286491446657,
|
|
"loss": 4.6559,
|
|
"mean_token_accuracy": 0.2452133461833,
|
|
"num_tokens": 79582046.0,
|
|
"step": 34710
|
|
},
|
|
{
|
|
"entropy": 5.013051509857178,
|
|
"epoch": 3.334774255523535,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00039133352082638923,
|
|
"loss": 4.606,
|
|
"mean_token_accuracy": 0.24551442116498948,
|
|
"num_tokens": 79593704.0,
|
|
"step": 34715
|
|
},
|
|
{
|
|
"entropy": 5.074276065826416,
|
|
"epoch": 3.335254562920269,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00039130417403739315,
|
|
"loss": 4.6681,
|
|
"mean_token_accuracy": 0.24499332159757614,
|
|
"num_tokens": 79605783.0,
|
|
"step": 34720
|
|
},
|
|
{
|
|
"entropy": 5.042751979827881,
|
|
"epoch": 3.335734870317003,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003912748245481594,
|
|
"loss": 4.5323,
|
|
"mean_token_accuracy": 0.25625754743814466,
|
|
"num_tokens": 79617140.0,
|
|
"step": 34725
|
|
},
|
|
{
|
|
"entropy": 5.0301860809326175,
|
|
"epoch": 3.3362151777137368,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039124547235936947,
|
|
"loss": 4.6171,
|
|
"mean_token_accuracy": 0.245346499979496,
|
|
"num_tokens": 79627763.0,
|
|
"step": 34730
|
|
},
|
|
{
|
|
"entropy": 5.1219123840332035,
|
|
"epoch": 3.3366954851104706,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039121611747170495,
|
|
"loss": 4.7187,
|
|
"mean_token_accuracy": 0.24241504669189454,
|
|
"num_tokens": 79640056.0,
|
|
"step": 34735
|
|
},
|
|
{
|
|
"entropy": 5.054620790481567,
|
|
"epoch": 3.3371757925072045,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00039118675988584724,
|
|
"loss": 4.6452,
|
|
"mean_token_accuracy": 0.24911766946315766,
|
|
"num_tokens": 79652795.0,
|
|
"step": 34740
|
|
},
|
|
{
|
|
"entropy": 5.165208911895752,
|
|
"epoch": 3.3376560999039384,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003911573996024779,
|
|
"loss": 4.7728,
|
|
"mean_token_accuracy": 0.23868272453546524,
|
|
"num_tokens": 79663734.0,
|
|
"step": 34745
|
|
},
|
|
{
|
|
"entropy": 5.031789350509643,
|
|
"epoch": 3.3381364073006723,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003911280366222787,
|
|
"loss": 4.5572,
|
|
"mean_token_accuracy": 0.25184957683086395,
|
|
"num_tokens": 79675092.0,
|
|
"step": 34750
|
|
},
|
|
{
|
|
"entropy": 5.066293573379516,
|
|
"epoch": 3.338616714697406,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039109867094593134,
|
|
"loss": 4.7148,
|
|
"mean_token_accuracy": 0.23990821242332458,
|
|
"num_tokens": 79687126.0,
|
|
"step": 34755
|
|
},
|
|
{
|
|
"entropy": 4.976014232635498,
|
|
"epoch": 3.3390970220941405,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003910693025741175,
|
|
"loss": 4.5393,
|
|
"mean_token_accuracy": 0.2583530515432358,
|
|
"num_tokens": 79698511.0,
|
|
"step": 34760
|
|
},
|
|
{
|
|
"entropy": 5.0544932842254635,
|
|
"epoch": 3.339577329490874,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00039103993150751916,
|
|
"loss": 4.6336,
|
|
"mean_token_accuracy": 0.2523597240447998,
|
|
"num_tokens": 79709407.0,
|
|
"step": 34765
|
|
},
|
|
{
|
|
"entropy": 5.0417557716369625,
|
|
"epoch": 3.3400576368876083,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039101055774681825,
|
|
"loss": 4.6133,
|
|
"mean_token_accuracy": 0.25056389570236204,
|
|
"num_tokens": 79720562.0,
|
|
"step": 34770
|
|
},
|
|
{
|
|
"entropy": 5.073037481307983,
|
|
"epoch": 3.340537944284342,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003909811812926966,
|
|
"loss": 4.7071,
|
|
"mean_token_accuracy": 0.23592607975006102,
|
|
"num_tokens": 79731687.0,
|
|
"step": 34775
|
|
},
|
|
{
|
|
"entropy": 5.069588565826416,
|
|
"epoch": 3.341018251681076,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003909518021458363,
|
|
"loss": 4.6447,
|
|
"mean_token_accuracy": 0.24849862605333328,
|
|
"num_tokens": 79742954.0,
|
|
"step": 34780
|
|
},
|
|
{
|
|
"entropy": 5.050691366195679,
|
|
"epoch": 3.34149855907781,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003909224203069195,
|
|
"loss": 4.6767,
|
|
"mean_token_accuracy": 0.24575020968914033,
|
|
"num_tokens": 79754233.0,
|
|
"step": 34785
|
|
},
|
|
{
|
|
"entropy": 5.048553848266602,
|
|
"epoch": 3.341978866474544,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003908930357766283,
|
|
"loss": 4.5237,
|
|
"mean_token_accuracy": 0.25873469561338425,
|
|
"num_tokens": 79765348.0,
|
|
"step": 34790
|
|
},
|
|
{
|
|
"entropy": 5.020631742477417,
|
|
"epoch": 3.3424591738712777,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003908636485556449,
|
|
"loss": 4.5937,
|
|
"mean_token_accuracy": 0.2509862929582596,
|
|
"num_tokens": 79776993.0,
|
|
"step": 34795
|
|
},
|
|
{
|
|
"entropy": 5.103504943847656,
|
|
"epoch": 3.3429394812680115,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00039083425864465165,
|
|
"loss": 4.7444,
|
|
"mean_token_accuracy": 0.2453736409544945,
|
|
"num_tokens": 79787321.0,
|
|
"step": 34800
|
|
},
|
|
{
|
|
"entropy": 5.048814916610718,
|
|
"epoch": 3.3434197886647454,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003908048660443309,
|
|
"loss": 4.5941,
|
|
"mean_token_accuracy": 0.24696359783411026,
|
|
"num_tokens": 79798130.0,
|
|
"step": 34805
|
|
},
|
|
{
|
|
"entropy": 5.0221131324768065,
|
|
"epoch": 3.3439000960614793,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003907754707553651,
|
|
"loss": 4.5643,
|
|
"mean_token_accuracy": 0.25141998529434206,
|
|
"num_tokens": 79809616.0,
|
|
"step": 34810
|
|
},
|
|
{
|
|
"entropy": 4.96633505821228,
|
|
"epoch": 3.344380403458213,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003907460727784365,
|
|
"loss": 4.5849,
|
|
"mean_token_accuracy": 0.25542705655097964,
|
|
"num_tokens": 79820355.0,
|
|
"step": 34815
|
|
},
|
|
{
|
|
"entropy": 5.065265417098999,
|
|
"epoch": 3.344860710854947,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00039071667211422787,
|
|
"loss": 4.6215,
|
|
"mean_token_accuracy": 0.2475579112768173,
|
|
"num_tokens": 79831733.0,
|
|
"step": 34820
|
|
},
|
|
{
|
|
"entropy": 5.066289615631104,
|
|
"epoch": 3.345341018251681,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003906872687634217,
|
|
"loss": 4.6637,
|
|
"mean_token_accuracy": 0.24272034019231797,
|
|
"num_tokens": 79842961.0,
|
|
"step": 34825
|
|
},
|
|
{
|
|
"entropy": 5.043630075454712,
|
|
"epoch": 3.345821325648415,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00039065786272670066,
|
|
"loss": 4.6594,
|
|
"mean_token_accuracy": 0.25113607496023177,
|
|
"num_tokens": 79855142.0,
|
|
"step": 34830
|
|
},
|
|
{
|
|
"entropy": 4.972777795791626,
|
|
"epoch": 3.3463016330451487,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003906284540047475,
|
|
"loss": 4.5554,
|
|
"mean_token_accuracy": 0.2545921131968498,
|
|
"num_tokens": 79865973.0,
|
|
"step": 34835
|
|
},
|
|
{
|
|
"entropy": 5.092799186706543,
|
|
"epoch": 3.3467819404418826,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00039059904259824507,
|
|
"loss": 4.6387,
|
|
"mean_token_accuracy": 0.2502507969737053,
|
|
"num_tokens": 79877424.0,
|
|
"step": 34840
|
|
},
|
|
{
|
|
"entropy": 5.109822702407837,
|
|
"epoch": 3.347262247838617,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000390569628507876,
|
|
"loss": 4.669,
|
|
"mean_token_accuracy": 0.24460556954145432,
|
|
"num_tokens": 79889956.0,
|
|
"step": 34845
|
|
},
|
|
{
|
|
"entropy": 5.022493124008179,
|
|
"epoch": 3.347742555235351,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00039054021173432336,
|
|
"loss": 4.6251,
|
|
"mean_token_accuracy": 0.24823231250047684,
|
|
"num_tokens": 79902454.0,
|
|
"step": 34850
|
|
},
|
|
{
|
|
"entropy": 5.035952091217041,
|
|
"epoch": 3.3482228626320847,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003905107922782701,
|
|
"loss": 4.6641,
|
|
"mean_token_accuracy": 0.24565812349319457,
|
|
"num_tokens": 79914425.0,
|
|
"step": 34855
|
|
},
|
|
{
|
|
"entropy": 4.987614679336548,
|
|
"epoch": 3.3487031700288186,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003904813701403993,
|
|
"loss": 4.5628,
|
|
"mean_token_accuracy": 0.25591775923967364,
|
|
"num_tokens": 79926225.0,
|
|
"step": 34860
|
|
},
|
|
{
|
|
"entropy": 4.974949789047241,
|
|
"epoch": 3.3491834774255524,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00039045194532139396,
|
|
"loss": 4.5591,
|
|
"mean_token_accuracy": 0.25687998682260516,
|
|
"num_tokens": 79937804.0,
|
|
"step": 34865
|
|
},
|
|
{
|
|
"entropy": 5.070349645614624,
|
|
"epoch": 3.3496637848222863,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003904225178219372,
|
|
"loss": 4.6242,
|
|
"mean_token_accuracy": 0.25052126199007035,
|
|
"num_tokens": 79949588.0,
|
|
"step": 34870
|
|
},
|
|
{
|
|
"entropy": 5.033775806427002,
|
|
"epoch": 3.35014409221902,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039039308764271237,
|
|
"loss": 4.5852,
|
|
"mean_token_accuracy": 0.24895876049995422,
|
|
"num_tokens": 79960243.0,
|
|
"step": 34875
|
|
},
|
|
{
|
|
"entropy": 4.979428672790528,
|
|
"epoch": 3.350624399615754,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003903636547844026,
|
|
"loss": 4.6057,
|
|
"mean_token_accuracy": 0.2489009216427803,
|
|
"num_tokens": 79971780.0,
|
|
"step": 34880
|
|
},
|
|
{
|
|
"entropy": 5.041569805145263,
|
|
"epoch": 3.351104707012488,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00039033421924769145,
|
|
"loss": 4.6335,
|
|
"mean_token_accuracy": 0.2484254464507103,
|
|
"num_tokens": 79982105.0,
|
|
"step": 34885
|
|
},
|
|
{
|
|
"entropy": 5.013979148864746,
|
|
"epoch": 3.351585014409222,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00039030478103326216,
|
|
"loss": 4.5643,
|
|
"mean_token_accuracy": 0.2592616483569145,
|
|
"num_tokens": 79993456.0,
|
|
"step": 34890
|
|
},
|
|
{
|
|
"entropy": 5.073239994049072,
|
|
"epoch": 3.3520653218059557,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00039027534014179823,
|
|
"loss": 4.6747,
|
|
"mean_token_accuracy": 0.2410885751247406,
|
|
"num_tokens": 80006335.0,
|
|
"step": 34895
|
|
},
|
|
{
|
|
"entropy": 5.026223230361938,
|
|
"epoch": 3.3525456292026896,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003902458965739832,
|
|
"loss": 4.6048,
|
|
"mean_token_accuracy": 0.25290912985801695,
|
|
"num_tokens": 80017574.0,
|
|
"step": 34900
|
|
},
|
|
{
|
|
"entropy": 5.0304210662841795,
|
|
"epoch": 3.3530259365994235,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003902164503305006,
|
|
"loss": 4.629,
|
|
"mean_token_accuracy": 0.2353252351284027,
|
|
"num_tokens": 80029470.0,
|
|
"step": 34905
|
|
},
|
|
{
|
|
"entropy": 5.000137281417847,
|
|
"epoch": 3.3535062439961574,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003901870014120343,
|
|
"loss": 4.5857,
|
|
"mean_token_accuracy": 0.2529801607131958,
|
|
"num_tokens": 80040599.0,
|
|
"step": 34910
|
|
},
|
|
{
|
|
"entropy": 5.060665082931519,
|
|
"epoch": 3.3539865513928913,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003901575498192678,
|
|
"loss": 4.631,
|
|
"mean_token_accuracy": 0.24895701557397842,
|
|
"num_tokens": 80052597.0,
|
|
"step": 34915
|
|
},
|
|
{
|
|
"entropy": 5.065235280990601,
|
|
"epoch": 3.3544668587896256,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003901280955528849,
|
|
"loss": 4.6645,
|
|
"mean_token_accuracy": 0.2418915808200836,
|
|
"num_tokens": 80063208.0,
|
|
"step": 34920
|
|
},
|
|
{
|
|
"entropy": 4.99900975227356,
|
|
"epoch": 3.3549471661863595,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003900986386135695,
|
|
"loss": 4.6406,
|
|
"mean_token_accuracy": 0.2570782914757729,
|
|
"num_tokens": 80074046.0,
|
|
"step": 34925
|
|
},
|
|
{
|
|
"entropy": 5.06900725364685,
|
|
"epoch": 3.3554274735830933,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00039006917900200543,
|
|
"loss": 4.6908,
|
|
"mean_token_accuracy": 0.24064703434705734,
|
|
"num_tokens": 80086054.0,
|
|
"step": 34930
|
|
},
|
|
{
|
|
"entropy": 5.1344099044799805,
|
|
"epoch": 3.3559077809798272,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00039003971671887675,
|
|
"loss": 4.5825,
|
|
"mean_token_accuracy": 0.25378605872392657,
|
|
"num_tokens": 80096030.0,
|
|
"step": 34935
|
|
},
|
|
{
|
|
"entropy": 4.9018505096435545,
|
|
"epoch": 3.356388088376561,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003900102517648674,
|
|
"loss": 4.4582,
|
|
"mean_token_accuracy": 0.26462458819150925,
|
|
"num_tokens": 80106412.0,
|
|
"step": 34940
|
|
},
|
|
{
|
|
"entropy": 5.00390625,
|
|
"epoch": 3.356868395773295,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003899807841406617,
|
|
"loss": 4.6295,
|
|
"mean_token_accuracy": 0.2515963226556778,
|
|
"num_tokens": 80116580.0,
|
|
"step": 34945
|
|
},
|
|
{
|
|
"entropy": 5.083790874481201,
|
|
"epoch": 3.357348703170029,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003899513138469434,
|
|
"loss": 4.6466,
|
|
"mean_token_accuracy": 0.2497406020760536,
|
|
"num_tokens": 80129055.0,
|
|
"step": 34950
|
|
},
|
|
{
|
|
"entropy": 5.04338812828064,
|
|
"epoch": 3.3578290105667628,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000389921840884397,
|
|
"loss": 4.7086,
|
|
"mean_token_accuracy": 0.24321456551551818,
|
|
"num_tokens": 80140626.0,
|
|
"step": 34955
|
|
},
|
|
{
|
|
"entropy": 5.090425682067871,
|
|
"epoch": 3.3583093179634966,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00038989236525370676,
|
|
"loss": 4.6687,
|
|
"mean_token_accuracy": 0.24711875915527343,
|
|
"num_tokens": 80151896.0,
|
|
"step": 34960
|
|
},
|
|
{
|
|
"entropy": 5.068356513977051,
|
|
"epoch": 3.3587896253602305,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003898628869555569,
|
|
"loss": 4.584,
|
|
"mean_token_accuracy": 0.25327493250370026,
|
|
"num_tokens": 80162801.0,
|
|
"step": 34965
|
|
},
|
|
{
|
|
"entropy": 5.058989238739014,
|
|
"epoch": 3.3592699327569644,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038983340599063187,
|
|
"loss": 4.6073,
|
|
"mean_token_accuracy": 0.24858204871416092,
|
|
"num_tokens": 80174998.0,
|
|
"step": 34970
|
|
},
|
|
{
|
|
"entropy": 5.136708736419678,
|
|
"epoch": 3.3597502401536983,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0003898039223596162,
|
|
"loss": 4.7701,
|
|
"mean_token_accuracy": 0.23823589086532593,
|
|
"num_tokens": 80188092.0,
|
|
"step": 34975
|
|
},
|
|
{
|
|
"entropy": 5.088174486160279,
|
|
"epoch": 3.360230547550432,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003897744360631943,
|
|
"loss": 4.6215,
|
|
"mean_token_accuracy": 0.25468567907810213,
|
|
"num_tokens": 80199749.0,
|
|
"step": 34980
|
|
},
|
|
{
|
|
"entropy": 4.988624811172485,
|
|
"epoch": 3.360710854947166,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038974494710205084,
|
|
"loss": 4.5906,
|
|
"mean_token_accuracy": 0.2534733057022095,
|
|
"num_tokens": 80211714.0,
|
|
"step": 34985
|
|
},
|
|
{
|
|
"entropy": 5.010880756378174,
|
|
"epoch": 3.3611911623439,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00038971545547687036,
|
|
"loss": 4.6184,
|
|
"mean_token_accuracy": 0.25120625346899034,
|
|
"num_tokens": 80222258.0,
|
|
"step": 34990
|
|
},
|
|
{
|
|
"entropy": 4.944441032409668,
|
|
"epoch": 3.3616714697406342,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00038968596118833766,
|
|
"loss": 4.5875,
|
|
"mean_token_accuracy": 0.25603134781122205,
|
|
"num_tokens": 80234532.0,
|
|
"step": 34995
|
|
},
|
|
{
|
|
"entropy": 5.029980945587158,
|
|
"epoch": 3.3621517771373677,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00038965646423713744,
|
|
"loss": 4.5566,
|
|
"mean_token_accuracy": 0.2543851360678673,
|
|
"num_tokens": 80245826.0,
|
|
"step": 35000
|
|
},
|
|
{
|
|
"entropy": 4.983625268936157,
|
|
"epoch": 3.362632084534102,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038962696462395473,
|
|
"loss": 4.5365,
|
|
"mean_token_accuracy": 0.24928556382656097,
|
|
"num_tokens": 80256715.0,
|
|
"step": 35005
|
|
},
|
|
{
|
|
"entropy": 5.06455135345459,
|
|
"epoch": 3.363112391930836,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003895974623494742,
|
|
"loss": 4.6382,
|
|
"mean_token_accuracy": 0.24848238825798036,
|
|
"num_tokens": 80267713.0,
|
|
"step": 35010
|
|
},
|
|
{
|
|
"entropy": 5.060695934295654,
|
|
"epoch": 3.3635926993275698,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038956795741438085,
|
|
"loss": 4.6585,
|
|
"mean_token_accuracy": 0.2485118180513382,
|
|
"num_tokens": 80278935.0,
|
|
"step": 35015
|
|
},
|
|
{
|
|
"entropy": 4.979933404922486,
|
|
"epoch": 3.3640730067243036,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00038953844981935975,
|
|
"loss": 4.5085,
|
|
"mean_token_accuracy": 0.2593720957636833,
|
|
"num_tokens": 80289735.0,
|
|
"step": 35020
|
|
},
|
|
{
|
|
"entropy": 5.018657970428467,
|
|
"epoch": 3.3645533141210375,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038950893956509597,
|
|
"loss": 4.684,
|
|
"mean_token_accuracy": 0.25095806568861007,
|
|
"num_tokens": 80301617.0,
|
|
"step": 35025
|
|
},
|
|
{
|
|
"entropy": 4.975236129760742,
|
|
"epoch": 3.3650336215177714,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003894794266522746,
|
|
"loss": 4.4971,
|
|
"mean_token_accuracy": 0.26460852175951005,
|
|
"num_tokens": 80312996.0,
|
|
"step": 35030
|
|
},
|
|
{
|
|
"entropy": 5.0634908199310305,
|
|
"epoch": 3.3655139289145053,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00038944991108158094,
|
|
"loss": 4.5849,
|
|
"mean_token_accuracy": 0.24807824194431305,
|
|
"num_tokens": 80324435.0,
|
|
"step": 35035
|
|
},
|
|
{
|
|
"entropy": 5.008642959594726,
|
|
"epoch": 3.365994236311239,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003894203928537001,
|
|
"loss": 4.6229,
|
|
"mean_token_accuracy": 0.25143705904483793,
|
|
"num_tokens": 80335922.0,
|
|
"step": 35040
|
|
},
|
|
{
|
|
"entropy": 5.046055936813355,
|
|
"epoch": 3.366474543707973,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038939087196931754,
|
|
"loss": 4.6381,
|
|
"mean_token_accuracy": 0.25367101579904555,
|
|
"num_tokens": 80348405.0,
|
|
"step": 35045
|
|
},
|
|
{
|
|
"entropy": 5.050200891494751,
|
|
"epoch": 3.366954851104707,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00038936134842911863,
|
|
"loss": 4.6002,
|
|
"mean_token_accuracy": 0.25341845452785494,
|
|
"num_tokens": 80361387.0,
|
|
"step": 35050
|
|
},
|
|
{
|
|
"entropy": 4.981326484680176,
|
|
"epoch": 3.367435158501441,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003893318222337888,
|
|
"loss": 4.642,
|
|
"mean_token_accuracy": 0.25004069954156877,
|
|
"num_tokens": 80372500.0,
|
|
"step": 35055
|
|
},
|
|
{
|
|
"entropy": 5.095010089874267,
|
|
"epoch": 3.3679154658981747,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00038930229338401354,
|
|
"loss": 4.7115,
|
|
"mean_token_accuracy": 0.24472733289003373,
|
|
"num_tokens": 80384311.0,
|
|
"step": 35060
|
|
},
|
|
{
|
|
"entropy": 5.059339284896851,
|
|
"epoch": 3.3683957732949086,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003892727618804783,
|
|
"loss": 4.5883,
|
|
"mean_token_accuracy": 0.25022012293338775,
|
|
"num_tokens": 80396259.0,
|
|
"step": 35065
|
|
},
|
|
{
|
|
"entropy": 5.0155357837677,
|
|
"epoch": 3.3688760806916425,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000389243227723869,
|
|
"loss": 4.6135,
|
|
"mean_token_accuracy": 0.24623016715049745,
|
|
"num_tokens": 80407420.0,
|
|
"step": 35070
|
|
},
|
|
{
|
|
"entropy": 4.988042116165161,
|
|
"epoch": 3.3693563880883763,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003892136909148711,
|
|
"loss": 4.5612,
|
|
"mean_token_accuracy": 0.2516023561358452,
|
|
"num_tokens": 80417130.0,
|
|
"step": 35075
|
|
},
|
|
{
|
|
"entropy": 5.065314388275146,
|
|
"epoch": 3.3698366954851107,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0003891841514541706,
|
|
"loss": 4.6198,
|
|
"mean_token_accuracy": 0.25408089309930804,
|
|
"num_tokens": 80429405.0,
|
|
"step": 35080
|
|
},
|
|
{
|
|
"entropy": 5.032301759719848,
|
|
"epoch": 3.3703170028818445,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000389154609342453,
|
|
"loss": 4.6282,
|
|
"mean_token_accuracy": 0.24592494666576387,
|
|
"num_tokens": 80439895.0,
|
|
"step": 35085
|
|
},
|
|
{
|
|
"entropy": 4.953702640533447,
|
|
"epoch": 3.3707973102785784,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003891250645804044,
|
|
"loss": 4.5914,
|
|
"mean_token_accuracy": 0.25820731818675996,
|
|
"num_tokens": 80451884.0,
|
|
"step": 35090
|
|
},
|
|
{
|
|
"entropy": 5.060756540298462,
|
|
"epoch": 3.3712776176753123,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00038909551716871074,
|
|
"loss": 4.6214,
|
|
"mean_token_accuracy": 0.2534745901823044,
|
|
"num_tokens": 80462629.0,
|
|
"step": 35095
|
|
},
|
|
{
|
|
"entropy": 5.088488388061523,
|
|
"epoch": 3.371757925072046,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003890659671080579,
|
|
"loss": 4.6689,
|
|
"mean_token_accuracy": 0.24508027881383895,
|
|
"num_tokens": 80472066.0,
|
|
"step": 35100
|
|
},
|
|
{
|
|
"entropy": 5.0153107166290285,
|
|
"epoch": 3.37223823246878,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.000389036414399132,
|
|
"loss": 4.5686,
|
|
"mean_token_accuracy": 0.25422897189855576,
|
|
"num_tokens": 80483687.0,
|
|
"step": 35105
|
|
},
|
|
{
|
|
"entropy": 5.004940223693848,
|
|
"epoch": 3.372718539865514,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003890068590426191,
|
|
"loss": 4.6007,
|
|
"mean_token_accuracy": 0.2536112517118454,
|
|
"num_tokens": 80493323.0,
|
|
"step": 35110
|
|
},
|
|
{
|
|
"entropy": 5.023319911956787,
|
|
"epoch": 3.373198847262248,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003889773010392056,
|
|
"loss": 4.6233,
|
|
"mean_token_accuracy": 0.24997997283935547,
|
|
"num_tokens": 80504454.0,
|
|
"step": 35115
|
|
},
|
|
{
|
|
"entropy": 4.969642305374146,
|
|
"epoch": 3.3736791546589817,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00038894774038957756,
|
|
"loss": 4.5349,
|
|
"mean_token_accuracy": 0.25590767413377763,
|
|
"num_tokens": 80516201.0,
|
|
"step": 35120
|
|
},
|
|
{
|
|
"entropy": 4.994466161727905,
|
|
"epoch": 3.3741594620557156,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038891817709442135,
|
|
"loss": 4.6321,
|
|
"mean_token_accuracy": 0.2489602282643318,
|
|
"num_tokens": 80528319.0,
|
|
"step": 35125
|
|
},
|
|
{
|
|
"entropy": 5.012839078903198,
|
|
"epoch": 3.3746397694524495,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038888861115442334,
|
|
"loss": 4.5588,
|
|
"mean_token_accuracy": 0.2572706416249275,
|
|
"num_tokens": 80539684.0,
|
|
"step": 35130
|
|
},
|
|
{
|
|
"entropy": 5.011568927764893,
|
|
"epoch": 3.3751200768491834,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003888590425702699,
|
|
"loss": 4.5827,
|
|
"mean_token_accuracy": 0.25684234499931335,
|
|
"num_tokens": 80550971.0,
|
|
"step": 35135
|
|
},
|
|
{
|
|
"entropy": 4.997000217437744,
|
|
"epoch": 3.3756003842459172,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003888294713426477,
|
|
"loss": 4.5924,
|
|
"mean_token_accuracy": 0.2533003658056259,
|
|
"num_tokens": 80562931.0,
|
|
"step": 35140
|
|
},
|
|
{
|
|
"entropy": 4.9594700813293455,
|
|
"epoch": 3.376080691642651,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038879989747224317,
|
|
"loss": 4.5819,
|
|
"mean_token_accuracy": 0.2596623405814171,
|
|
"num_tokens": 80573899.0,
|
|
"step": 35145
|
|
},
|
|
{
|
|
"entropy": 5.024944305419922,
|
|
"epoch": 3.376560999039385,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003887703209597428,
|
|
"loss": 4.5934,
|
|
"mean_token_accuracy": 0.25081279426813125,
|
|
"num_tokens": 80585453.0,
|
|
"step": 35150
|
|
},
|
|
{
|
|
"entropy": 5.05692572593689,
|
|
"epoch": 3.3770413064361193,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003887407418058335,
|
|
"loss": 4.6539,
|
|
"mean_token_accuracy": 0.2525654971599579,
|
|
"num_tokens": 80597911.0,
|
|
"step": 35155
|
|
},
|
|
{
|
|
"entropy": 5.046922397613526,
|
|
"epoch": 3.377521613832853,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00038871116001120196,
|
|
"loss": 4.6075,
|
|
"mean_token_accuracy": 0.25285103768110273,
|
|
"num_tokens": 80609350.0,
|
|
"step": 35160
|
|
},
|
|
{
|
|
"entropy": 5.025788450241089,
|
|
"epoch": 3.378001921229587,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0003886815755765348,
|
|
"loss": 4.6589,
|
|
"mean_token_accuracy": 0.246637824177742,
|
|
"num_tokens": 80620710.0,
|
|
"step": 35165
|
|
},
|
|
{
|
|
"entropy": 5.088965368270874,
|
|
"epoch": 3.378482228626321,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003886519885025191,
|
|
"loss": 4.737,
|
|
"mean_token_accuracy": 0.23663281500339509,
|
|
"num_tokens": 80633031.0,
|
|
"step": 35170
|
|
},
|
|
{
|
|
"entropy": 4.987149667739868,
|
|
"epoch": 3.378962536023055,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00038862239878984173,
|
|
"loss": 4.5205,
|
|
"mean_token_accuracy": 0.2592849716544151,
|
|
"num_tokens": 80644047.0,
|
|
"step": 35175
|
|
},
|
|
{
|
|
"entropy": 5.032893371582031,
|
|
"epoch": 3.3794428434197887,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003885928064391897,
|
|
"loss": 4.5971,
|
|
"mean_token_accuracy": 0.25226512998342515,
|
|
"num_tokens": 80655769.0,
|
|
"step": 35180
|
|
},
|
|
{
|
|
"entropy": 5.070116376876831,
|
|
"epoch": 3.3799231508165226,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003885632114512499,
|
|
"loss": 4.6411,
|
|
"mean_token_accuracy": 0.2474030002951622,
|
|
"num_tokens": 80667881.0,
|
|
"step": 35185
|
|
},
|
|
{
|
|
"entropy": 5.0757129192352295,
|
|
"epoch": 3.3804034582132565,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038853361382670956,
|
|
"loss": 4.6875,
|
|
"mean_token_accuracy": 0.24633048176765443,
|
|
"num_tokens": 80679021.0,
|
|
"step": 35190
|
|
},
|
|
{
|
|
"entropy": 5.026468753814697,
|
|
"epoch": 3.3808837656099904,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038850401356625583,
|
|
"loss": 4.641,
|
|
"mean_token_accuracy": 0.2502972841262817,
|
|
"num_tokens": 80689366.0,
|
|
"step": 35195
|
|
},
|
|
{
|
|
"entropy": 4.9525751113891605,
|
|
"epoch": 3.3813640730067243,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003884744106705759,
|
|
"loss": 4.5438,
|
|
"mean_token_accuracy": 0.25194079875946046,
|
|
"num_tokens": 80700755.0,
|
|
"step": 35200
|
|
},
|
|
{
|
|
"entropy": 5.034112548828125,
|
|
"epoch": 3.381844380403458,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038844480514035727,
|
|
"loss": 4.6111,
|
|
"mean_token_accuracy": 0.24931746870279312,
|
|
"num_tokens": 80712186.0,
|
|
"step": 35205
|
|
},
|
|
{
|
|
"entropy": 4.97149806022644,
|
|
"epoch": 3.382324687800192,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000388415196976287,
|
|
"loss": 4.5836,
|
|
"mean_token_accuracy": 0.2611155539751053,
|
|
"num_tokens": 80723865.0,
|
|
"step": 35210
|
|
},
|
|
{
|
|
"entropy": 5.002107858657837,
|
|
"epoch": 3.382804995196926,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003883855861790526,
|
|
"loss": 4.6256,
|
|
"mean_token_accuracy": 0.2487585127353668,
|
|
"num_tokens": 80735817.0,
|
|
"step": 35215
|
|
},
|
|
{
|
|
"entropy": 5.030579614639282,
|
|
"epoch": 3.38328530259366,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003883559727493417,
|
|
"loss": 4.6974,
|
|
"mean_token_accuracy": 0.24719827026128768,
|
|
"num_tokens": 80747782.0,
|
|
"step": 35220
|
|
},
|
|
{
|
|
"entropy": 5.0343286991119385,
|
|
"epoch": 3.3837656099903937,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003883263566878416,
|
|
"loss": 4.6278,
|
|
"mean_token_accuracy": 0.24029747098684312,
|
|
"num_tokens": 80759119.0,
|
|
"step": 35225
|
|
},
|
|
{
|
|
"entropy": 5.019806241989135,
|
|
"epoch": 3.384245917387128,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00038829673799524006,
|
|
"loss": 4.5461,
|
|
"mean_token_accuracy": 0.2609804138541222,
|
|
"num_tokens": 80769792.0,
|
|
"step": 35230
|
|
},
|
|
{
|
|
"entropy": 4.979036855697632,
|
|
"epoch": 3.3847262247838614,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00038826711667222464,
|
|
"loss": 4.5588,
|
|
"mean_token_accuracy": 0.2536883130669594,
|
|
"num_tokens": 80781685.0,
|
|
"step": 35235
|
|
},
|
|
{
|
|
"entropy": 4.916115140914917,
|
|
"epoch": 3.3852065321805958,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00038823749271948315,
|
|
"loss": 4.5189,
|
|
"mean_token_accuracy": 0.25899354815483094,
|
|
"num_tokens": 80792499.0,
|
|
"step": 35240
|
|
},
|
|
{
|
|
"entropy": 5.050924444198609,
|
|
"epoch": 3.3856868395773296,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038820786613770334,
|
|
"loss": 4.6005,
|
|
"mean_token_accuracy": 0.25700204372406005,
|
|
"num_tokens": 80803020.0,
|
|
"step": 35245
|
|
},
|
|
{
|
|
"entropy": 5.054552841186523,
|
|
"epoch": 3.3861671469740635,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00038817823692757303,
|
|
"loss": 4.6538,
|
|
"mean_token_accuracy": 0.2523463472723961,
|
|
"num_tokens": 80814543.0,
|
|
"step": 35250
|
|
},
|
|
{
|
|
"entropy": 4.916332340240478,
|
|
"epoch": 3.3866474543707974,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038814860508978004,
|
|
"loss": 4.5524,
|
|
"mean_token_accuracy": 0.25886755585670473,
|
|
"num_tokens": 80826686.0,
|
|
"step": 35255
|
|
},
|
|
{
|
|
"entropy": 5.032272148132324,
|
|
"epoch": 3.3871277617675313,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003881189706250125,
|
|
"loss": 4.6673,
|
|
"mean_token_accuracy": 0.24142957776784896,
|
|
"num_tokens": 80838372.0,
|
|
"step": 35260
|
|
},
|
|
{
|
|
"entropy": 5.123521566390991,
|
|
"epoch": 3.387608069164265,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038808933353395836,
|
|
"loss": 4.6314,
|
|
"mean_token_accuracy": 0.24835428595542908,
|
|
"num_tokens": 80849334.0,
|
|
"step": 35265
|
|
},
|
|
{
|
|
"entropy": 4.956131029129028,
|
|
"epoch": 3.388088376560999,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038805969381730564,
|
|
"loss": 4.4944,
|
|
"mean_token_accuracy": 0.25334072560071946,
|
|
"num_tokens": 80860682.0,
|
|
"step": 35270
|
|
},
|
|
{
|
|
"entropy": 5.04627799987793,
|
|
"epoch": 3.388568683957733,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00038803005147574265,
|
|
"loss": 4.6569,
|
|
"mean_token_accuracy": 0.24770759046077728,
|
|
"num_tokens": 80872026.0,
|
|
"step": 35275
|
|
},
|
|
{
|
|
"entropy": 5.063813591003418,
|
|
"epoch": 3.389048991354467,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003880004065099575,
|
|
"loss": 4.6519,
|
|
"mean_token_accuracy": 0.24535784721374512,
|
|
"num_tokens": 80884146.0,
|
|
"step": 35280
|
|
},
|
|
{
|
|
"entropy": 5.036695432662964,
|
|
"epoch": 3.3895292987512007,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003879707589206383,
|
|
"loss": 4.6238,
|
|
"mean_token_accuracy": 0.25011399686336516,
|
|
"num_tokens": 80895975.0,
|
|
"step": 35285
|
|
},
|
|
{
|
|
"entropy": 5.1202473640441895,
|
|
"epoch": 3.3900096061479346,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003879411087084736,
|
|
"loss": 4.7081,
|
|
"mean_token_accuracy": 0.241127410531044,
|
|
"num_tokens": 80908144.0,
|
|
"step": 35290
|
|
},
|
|
{
|
|
"entropy": 5.017182159423828,
|
|
"epoch": 3.3904899135446684,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00038791145587415186,
|
|
"loss": 4.6031,
|
|
"mean_token_accuracy": 0.2476770669221878,
|
|
"num_tokens": 80918616.0,
|
|
"step": 35295
|
|
},
|
|
{
|
|
"entropy": 5.067205572128296,
|
|
"epoch": 3.3909702209414023,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00038788180041836117,
|
|
"loss": 4.7103,
|
|
"mean_token_accuracy": 0.2398787707090378,
|
|
"num_tokens": 80929690.0,
|
|
"step": 35300
|
|
},
|
|
{
|
|
"entropy": 5.053402137756348,
|
|
"epoch": 3.3914505283381366,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038785214234179037,
|
|
"loss": 4.582,
|
|
"mean_token_accuracy": 0.248058520257473,
|
|
"num_tokens": 80940817.0,
|
|
"step": 35305
|
|
},
|
|
{
|
|
"entropy": 5.060543441772461,
|
|
"epoch": 3.39193083573487,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038782248164512804,
|
|
"loss": 4.5829,
|
|
"mean_token_accuracy": 0.2567798539996147,
|
|
"num_tokens": 80950951.0,
|
|
"step": 35310
|
|
},
|
|
{
|
|
"entropy": 5.011791658401489,
|
|
"epoch": 3.3924111431316044,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00038779281832906253,
|
|
"loss": 4.6077,
|
|
"mean_token_accuracy": 0.24900132268667222,
|
|
"num_tokens": 80963511.0,
|
|
"step": 35315
|
|
},
|
|
{
|
|
"entropy": 5.099618673324585,
|
|
"epoch": 3.3928914505283383,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00038776315239428275,
|
|
"loss": 4.7113,
|
|
"mean_token_accuracy": 0.24593275040388107,
|
|
"num_tokens": 80975905.0,
|
|
"step": 35320
|
|
},
|
|
{
|
|
"entropy": 5.089013481140137,
|
|
"epoch": 3.393371757925072,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00038773348384147743,
|
|
"loss": 4.6394,
|
|
"mean_token_accuracy": 0.2502724289894104,
|
|
"num_tokens": 80986850.0,
|
|
"step": 35325
|
|
},
|
|
{
|
|
"entropy": 4.917418622970581,
|
|
"epoch": 3.393852065321806,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003877038126713354,
|
|
"loss": 4.5039,
|
|
"mean_token_accuracy": 0.263748537003994,
|
|
"num_tokens": 80998190.0,
|
|
"step": 35330
|
|
},
|
|
{
|
|
"entropy": 4.979682922363281,
|
|
"epoch": 3.39433237271854,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00038767413888454537,
|
|
"loss": 4.6027,
|
|
"mean_token_accuracy": 0.24252035170793534,
|
|
"num_tokens": 81011086.0,
|
|
"step": 35335
|
|
},
|
|
{
|
|
"entropy": 5.177395629882812,
|
|
"epoch": 3.394812680115274,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00038764446248179665,
|
|
"loss": 4.6873,
|
|
"mean_token_accuracy": 0.2508353665471077,
|
|
"num_tokens": 81022074.0,
|
|
"step": 35340
|
|
},
|
|
{
|
|
"entropy": 5.008141040802002,
|
|
"epoch": 3.3952929875120077,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003876147834637778,
|
|
"loss": 4.5974,
|
|
"mean_token_accuracy": 0.24676503390073776,
|
|
"num_tokens": 81033019.0,
|
|
"step": 35345
|
|
},
|
|
{
|
|
"entropy": 5.081465482711792,
|
|
"epoch": 3.3957732949087416,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038758510183117806,
|
|
"loss": 4.6982,
|
|
"mean_token_accuracy": 0.24363914877176285,
|
|
"num_tokens": 81044955.0,
|
|
"step": 35350
|
|
},
|
|
{
|
|
"entropy": 5.02382526397705,
|
|
"epoch": 3.3962536023054755,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003875554175846866,
|
|
"loss": 4.5134,
|
|
"mean_token_accuracy": 0.26208958923816683,
|
|
"num_tokens": 81055980.0,
|
|
"step": 35355
|
|
},
|
|
{
|
|
"entropy": 4.9981273174285885,
|
|
"epoch": 3.3967339097022093,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038752573072499267,
|
|
"loss": 4.5599,
|
|
"mean_token_accuracy": 0.2592511162161827,
|
|
"num_tokens": 81067960.0,
|
|
"step": 35360
|
|
},
|
|
{
|
|
"entropy": 5.036674976348877,
|
|
"epoch": 3.3972142170989432,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00038749604125278524,
|
|
"loss": 4.6555,
|
|
"mean_token_accuracy": 0.2486070767045021,
|
|
"num_tokens": 81078084.0,
|
|
"step": 35365
|
|
},
|
|
{
|
|
"entropy": 5.049911880493164,
|
|
"epoch": 3.397694524495677,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003874663491687539,
|
|
"loss": 4.5872,
|
|
"mean_token_accuracy": 0.2517434969544411,
|
|
"num_tokens": 81089376.0,
|
|
"step": 35370
|
|
},
|
|
{
|
|
"entropy": 5.107871150970459,
|
|
"epoch": 3.398174831892411,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00038743665447358785,
|
|
"loss": 4.6663,
|
|
"mean_token_accuracy": 0.24245515316724778,
|
|
"num_tokens": 81100906.0,
|
|
"step": 35375
|
|
},
|
|
{
|
|
"entropy": 5.0644361019134525,
|
|
"epoch": 3.398655139289145,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003874069571679766,
|
|
"loss": 4.6551,
|
|
"mean_token_accuracy": 0.24924662858247756,
|
|
"num_tokens": 81112839.0,
|
|
"step": 35380
|
|
},
|
|
{
|
|
"entropy": 4.942523193359375,
|
|
"epoch": 3.3991354466858787,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038737725725260946,
|
|
"loss": 4.5748,
|
|
"mean_token_accuracy": 0.2515710085630417,
|
|
"num_tokens": 81126015.0,
|
|
"step": 35385
|
|
},
|
|
{
|
|
"entropy": 5.0529743194580075,
|
|
"epoch": 3.399615754082613,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00038734755472817617,
|
|
"loss": 4.6528,
|
|
"mean_token_accuracy": 0.24668528586626054,
|
|
"num_tokens": 81139526.0,
|
|
"step": 35390
|
|
},
|
|
{
|
|
"entropy": 5.015586805343628,
|
|
"epoch": 3.400096061479347,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00038731784959536626,
|
|
"loss": 4.5318,
|
|
"mean_token_accuracy": 0.25951134115457536,
|
|
"num_tokens": 81152340.0,
|
|
"step": 35395
|
|
},
|
|
{
|
|
"entropy": 5.102068090438843,
|
|
"epoch": 3.400576368876081,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038728814185486944,
|
|
"loss": 4.651,
|
|
"mean_token_accuracy": 0.2418101504445076,
|
|
"num_tokens": 81162724.0,
|
|
"step": 35400
|
|
},
|
|
{
|
|
"entropy": 5.047775077819824,
|
|
"epoch": 3.4010566762728147,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003872584315073753,
|
|
"loss": 4.612,
|
|
"mean_token_accuracy": 0.24746428579092025,
|
|
"num_tokens": 81172933.0,
|
|
"step": 35405
|
|
},
|
|
{
|
|
"entropy": 4.999932432174683,
|
|
"epoch": 3.4015369836695486,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003872287185535738,
|
|
"loss": 4.627,
|
|
"mean_token_accuracy": 0.24676198065280913,
|
|
"num_tokens": 81183774.0,
|
|
"step": 35410
|
|
},
|
|
{
|
|
"entropy": 4.921209859848022,
|
|
"epoch": 3.4020172910662825,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038719900299415475,
|
|
"loss": 4.4743,
|
|
"mean_token_accuracy": 0.2676608473062515,
|
|
"num_tokens": 81194896.0,
|
|
"step": 35415
|
|
},
|
|
{
|
|
"entropy": 5.025229454040527,
|
|
"epoch": 3.4024975984630164,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003871692848298079,
|
|
"loss": 4.5811,
|
|
"mean_token_accuracy": 0.2540947362780571,
|
|
"num_tokens": 81204889.0,
|
|
"step": 35420
|
|
},
|
|
{
|
|
"entropy": 5.062064266204834,
|
|
"epoch": 3.4029779058597502,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00038713956406122334,
|
|
"loss": 4.6621,
|
|
"mean_token_accuracy": 0.2442566990852356,
|
|
"num_tokens": 81217177.0,
|
|
"step": 35425
|
|
},
|
|
{
|
|
"entropy": 5.0922904968261715,
|
|
"epoch": 3.403458213256484,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038710984068909116,
|
|
"loss": 4.6769,
|
|
"mean_token_accuracy": 0.24345260560512544,
|
|
"num_tokens": 81228767.0,
|
|
"step": 35430
|
|
},
|
|
{
|
|
"entropy": 4.998703479766846,
|
|
"epoch": 3.403938520653218,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003870801147141014,
|
|
"loss": 4.5507,
|
|
"mean_token_accuracy": 0.24674364775419236,
|
|
"num_tokens": 81239599.0,
|
|
"step": 35435
|
|
},
|
|
{
|
|
"entropy": 5.003114843368531,
|
|
"epoch": 3.404418828049952,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003870503861369441,
|
|
"loss": 4.6116,
|
|
"mean_token_accuracy": 0.2476789563894272,
|
|
"num_tokens": 81251051.0,
|
|
"step": 35440
|
|
},
|
|
{
|
|
"entropy": 5.058178091049195,
|
|
"epoch": 3.4048991354466858,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00038702065495830956,
|
|
"loss": 4.6773,
|
|
"mean_token_accuracy": 0.24947068840265274,
|
|
"num_tokens": 81262484.0,
|
|
"step": 35445
|
|
},
|
|
{
|
|
"entropy": 5.028571701049804,
|
|
"epoch": 3.4053794428434196,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003869909211788881,
|
|
"loss": 4.5757,
|
|
"mean_token_accuracy": 0.26031079739332197,
|
|
"num_tokens": 81273068.0,
|
|
"step": 35450
|
|
},
|
|
{
|
|
"entropy": 5.029877090454102,
|
|
"epoch": 3.4058597502401535,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00038696118479936994,
|
|
"loss": 4.6287,
|
|
"mean_token_accuracy": 0.25055190920829773,
|
|
"num_tokens": 81285239.0,
|
|
"step": 35455
|
|
},
|
|
{
|
|
"entropy": 5.05253529548645,
|
|
"epoch": 3.4063400576368874,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00038693144582044553,
|
|
"loss": 4.6357,
|
|
"mean_token_accuracy": 0.2467782527208328,
|
|
"num_tokens": 81295998.0,
|
|
"step": 35460
|
|
},
|
|
{
|
|
"entropy": 4.9913591861724855,
|
|
"epoch": 3.4068203650336217,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038690170424280534,
|
|
"loss": 4.5207,
|
|
"mean_token_accuracy": 0.2551711842417717,
|
|
"num_tokens": 81307569.0,
|
|
"step": 35465
|
|
},
|
|
{
|
|
"entropy": 4.980480146408081,
|
|
"epoch": 3.4073006724303556,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003868719600671399,
|
|
"loss": 4.571,
|
|
"mean_token_accuracy": 0.24703271239995955,
|
|
"num_tokens": 81319023.0,
|
|
"step": 35470
|
|
},
|
|
{
|
|
"entropy": 4.9213526248931885,
|
|
"epoch": 3.4077809798270895,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00038684221329413965,
|
|
"loss": 4.5374,
|
|
"mean_token_accuracy": 0.26038162857294084,
|
|
"num_tokens": 81330087.0,
|
|
"step": 35475
|
|
},
|
|
{
|
|
"entropy": 4.984519338607788,
|
|
"epoch": 3.4082612872238234,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003868124639244954,
|
|
"loss": 4.4978,
|
|
"mean_token_accuracy": 0.26115120351314547,
|
|
"num_tokens": 81341533.0,
|
|
"step": 35480
|
|
},
|
|
{
|
|
"entropy": 5.065533256530761,
|
|
"epoch": 3.4087415946205573,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00038678271195889766,
|
|
"loss": 4.6757,
|
|
"mean_token_accuracy": 0.2376832216978073,
|
|
"num_tokens": 81353505.0,
|
|
"step": 35485
|
|
},
|
|
{
|
|
"entropy": 5.03993034362793,
|
|
"epoch": 3.409221902017291,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00038675295739803734,
|
|
"loss": 4.6213,
|
|
"mean_token_accuracy": 0.24922273308038712,
|
|
"num_tokens": 81365246.0,
|
|
"step": 35490
|
|
},
|
|
{
|
|
"entropy": 5.156140899658203,
|
|
"epoch": 3.409702209414025,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00038672320024260516,
|
|
"loss": 4.7174,
|
|
"mean_token_accuracy": 0.2432163506746292,
|
|
"num_tokens": 81376403.0,
|
|
"step": 35495
|
|
},
|
|
{
|
|
"entropy": 5.041756677627563,
|
|
"epoch": 3.410182516810759,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00038669344049329204,
|
|
"loss": 4.6031,
|
|
"mean_token_accuracy": 0.2519982814788818,
|
|
"num_tokens": 81386865.0,
|
|
"step": 35500
|
|
},
|
|
{
|
|
"entropy": 4.944682168960571,
|
|
"epoch": 3.410662824207493,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00038666367815078887,
|
|
"loss": 4.5717,
|
|
"mean_token_accuracy": 0.25449229329824447,
|
|
"num_tokens": 81398191.0,
|
|
"step": 35505
|
|
},
|
|
{
|
|
"entropy": 5.026098012924194,
|
|
"epoch": 3.4111431316042267,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003866339132157867,
|
|
"loss": 4.6578,
|
|
"mean_token_accuracy": 0.24305206537246704,
|
|
"num_tokens": 81410694.0,
|
|
"step": 35510
|
|
},
|
|
{
|
|
"entropy": 5.051671552658081,
|
|
"epoch": 3.4116234390009605,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003866041456889764,
|
|
"loss": 4.5805,
|
|
"mean_token_accuracy": 0.249052694439888,
|
|
"num_tokens": 81422195.0,
|
|
"step": 35515
|
|
},
|
|
{
|
|
"entropy": 4.9964416980743405,
|
|
"epoch": 3.4121037463976944,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00038657437557104946,
|
|
"loss": 4.6021,
|
|
"mean_token_accuracy": 0.25322929918766024,
|
|
"num_tokens": 81432979.0,
|
|
"step": 35520
|
|
},
|
|
{
|
|
"entropy": 4.972515535354614,
|
|
"epoch": 3.4125840537944283,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003865446028626967,
|
|
"loss": 4.5523,
|
|
"mean_token_accuracy": 0.25452432930469515,
|
|
"num_tokens": 81444328.0,
|
|
"step": 35525
|
|
},
|
|
{
|
|
"entropy": 5.15280499458313,
|
|
"epoch": 3.413064361191162,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038651482756460947,
|
|
"loss": 4.8092,
|
|
"mean_token_accuracy": 0.24050280153751374,
|
|
"num_tokens": 81455530.0,
|
|
"step": 35530
|
|
},
|
|
{
|
|
"entropy": 5.016593647003174,
|
|
"epoch": 3.413544668587896,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038648504967747914,
|
|
"loss": 4.5999,
|
|
"mean_token_accuracy": 0.25508580207824705,
|
|
"num_tokens": 81466462.0,
|
|
"step": 35535
|
|
},
|
|
{
|
|
"entropy": 5.0983155250549315,
|
|
"epoch": 3.4140249759846304,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038645526920199697,
|
|
"loss": 4.7034,
|
|
"mean_token_accuracy": 0.24511379301548003,
|
|
"num_tokens": 81479459.0,
|
|
"step": 35540
|
|
},
|
|
{
|
|
"entropy": 5.1104835033416744,
|
|
"epoch": 3.414505283381364,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003864254861388544,
|
|
"loss": 4.7303,
|
|
"mean_token_accuracy": 0.24671141505241395,
|
|
"num_tokens": 81491026.0,
|
|
"step": 35545
|
|
},
|
|
{
|
|
"entropy": 5.081034135818482,
|
|
"epoch": 3.414985590778098,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038639570048874295,
|
|
"loss": 4.6166,
|
|
"mean_token_accuracy": 0.2519968613982201,
|
|
"num_tokens": 81504469.0,
|
|
"step": 35550
|
|
},
|
|
{
|
|
"entropy": 4.983581924438477,
|
|
"epoch": 3.415465898174832,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00038636591225235407,
|
|
"loss": 4.5361,
|
|
"mean_token_accuracy": 0.256315740942955,
|
|
"num_tokens": 81515873.0,
|
|
"step": 35555
|
|
},
|
|
{
|
|
"entropy": 4.966502714157104,
|
|
"epoch": 3.415946205571566,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0003863361214303794,
|
|
"loss": 4.5935,
|
|
"mean_token_accuracy": 0.25549074858427046,
|
|
"num_tokens": 81528169.0,
|
|
"step": 35560
|
|
},
|
|
{
|
|
"entropy": 5.063381147384644,
|
|
"epoch": 3.4164265129683,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003863063280235106,
|
|
"loss": 4.5861,
|
|
"mean_token_accuracy": 0.24859169274568557,
|
|
"num_tokens": 81538831.0,
|
|
"step": 35565
|
|
},
|
|
{
|
|
"entropy": 5.129141902923584,
|
|
"epoch": 3.4169068203650337,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00038627653203243933,
|
|
"loss": 4.7333,
|
|
"mean_token_accuracy": 0.24121089577674865,
|
|
"num_tokens": 81549809.0,
|
|
"step": 35570
|
|
},
|
|
{
|
|
"entropy": 4.9427472114562985,
|
|
"epoch": 3.4173871277617676,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003862467334578574,
|
|
"loss": 4.5318,
|
|
"mean_token_accuracy": 0.25782768428325653,
|
|
"num_tokens": 81560537.0,
|
|
"step": 35575
|
|
},
|
|
{
|
|
"entropy": 5.009121417999268,
|
|
"epoch": 3.4178674351585014,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038621693230045677,
|
|
"loss": 4.5855,
|
|
"mean_token_accuracy": 0.24942596554756163,
|
|
"num_tokens": 81571983.0,
|
|
"step": 35580
|
|
},
|
|
{
|
|
"entropy": 4.954544734954834,
|
|
"epoch": 3.4183477425552353,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003861871285609291,
|
|
"loss": 4.5645,
|
|
"mean_token_accuracy": 0.2511437177658081,
|
|
"num_tokens": 81584783.0,
|
|
"step": 35585
|
|
},
|
|
{
|
|
"entropy": 4.972129917144775,
|
|
"epoch": 3.418828049951969,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0003861573222399665,
|
|
"loss": 4.5154,
|
|
"mean_token_accuracy": 0.2564325526356697,
|
|
"num_tokens": 81596619.0,
|
|
"step": 35590
|
|
},
|
|
{
|
|
"entropy": 5.14181661605835,
|
|
"epoch": 3.419308357348703,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000386127513338261,
|
|
"loss": 4.7134,
|
|
"mean_token_accuracy": 0.24326184689998626,
|
|
"num_tokens": 81607981.0,
|
|
"step": 35595
|
|
},
|
|
{
|
|
"entropy": 5.087239122390747,
|
|
"epoch": 3.419788664745437,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003860977018565046,
|
|
"loss": 4.6904,
|
|
"mean_token_accuracy": 0.2416967809200287,
|
|
"num_tokens": 81620444.0,
|
|
"step": 35600
|
|
},
|
|
{
|
|
"entropy": 5.025595569610596,
|
|
"epoch": 3.420268972142171,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003860678877953894,
|
|
"loss": 4.5763,
|
|
"mean_token_accuracy": 0.25276096612215043,
|
|
"num_tokens": 81631668.0,
|
|
"step": 35605
|
|
},
|
|
{
|
|
"entropy": 5.03992805480957,
|
|
"epoch": 3.4207492795389047,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003860380711556077,
|
|
"loss": 4.7201,
|
|
"mean_token_accuracy": 0.24342207759618759,
|
|
"num_tokens": 81643230.0,
|
|
"step": 35610
|
|
},
|
|
{
|
|
"entropy": 5.098343706130981,
|
|
"epoch": 3.421229586935639,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038600825193785173,
|
|
"loss": 4.6451,
|
|
"mean_token_accuracy": 0.24605976045131683,
|
|
"num_tokens": 81654217.0,
|
|
"step": 35615
|
|
},
|
|
{
|
|
"entropy": 5.0649824142456055,
|
|
"epoch": 3.4217098943323725,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003859784301428137,
|
|
"loss": 4.6915,
|
|
"mean_token_accuracy": 0.23920787870883942,
|
|
"num_tokens": 81666025.0,
|
|
"step": 35620
|
|
},
|
|
{
|
|
"entropy": 5.0398296356201175,
|
|
"epoch": 3.422190201729107,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003859486057711861,
|
|
"loss": 4.6418,
|
|
"mean_token_accuracy": 0.2450200706720352,
|
|
"num_tokens": 81677417.0,
|
|
"step": 35625
|
|
},
|
|
{
|
|
"entropy": 5.043654251098633,
|
|
"epoch": 3.4226705091258407,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003859187788236613,
|
|
"loss": 4.6808,
|
|
"mean_token_accuracy": 0.25091739892959597,
|
|
"num_tokens": 81687898.0,
|
|
"step": 35630
|
|
},
|
|
{
|
|
"entropy": 4.949628829956055,
|
|
"epoch": 3.4231508165225746,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038588894930093184,
|
|
"loss": 4.5476,
|
|
"mean_token_accuracy": 0.2605251118540764,
|
|
"num_tokens": 81699532.0,
|
|
"step": 35635
|
|
},
|
|
{
|
|
"entropy": 5.077291822433471,
|
|
"epoch": 3.4236311239193085,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00038585911720369023,
|
|
"loss": 4.6599,
|
|
"mean_token_accuracy": 0.24108761101961135,
|
|
"num_tokens": 81709164.0,
|
|
"step": 35640
|
|
},
|
|
{
|
|
"entropy": 5.066379022598267,
|
|
"epoch": 3.4241114313160423,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003858292825326291,
|
|
"loss": 4.6108,
|
|
"mean_token_accuracy": 0.25411655455827714,
|
|
"num_tokens": 81721354.0,
|
|
"step": 35645
|
|
},
|
|
{
|
|
"entropy": 4.982634782791138,
|
|
"epoch": 3.4245917387127762,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003857994452884412,
|
|
"loss": 4.529,
|
|
"mean_token_accuracy": 0.257805435359478,
|
|
"num_tokens": 81731793.0,
|
|
"step": 35650
|
|
},
|
|
{
|
|
"entropy": 5.05521674156189,
|
|
"epoch": 3.42507204610951,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003857696054718191,
|
|
"loss": 4.6328,
|
|
"mean_token_accuracy": 0.256782965362072,
|
|
"num_tokens": 81743254.0,
|
|
"step": 35655
|
|
},
|
|
{
|
|
"entropy": 5.1028913974761965,
|
|
"epoch": 3.425552353506244,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003857397630834557,
|
|
"loss": 4.697,
|
|
"mean_token_accuracy": 0.24332701712846755,
|
|
"num_tokens": 81755080.0,
|
|
"step": 35660
|
|
},
|
|
{
|
|
"entropy": 5.051576948165893,
|
|
"epoch": 3.426032660902978,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.00038570991812404384,
|
|
"loss": 4.6407,
|
|
"mean_token_accuracy": 0.24917400032281875,
|
|
"num_tokens": 81767367.0,
|
|
"step": 35665
|
|
},
|
|
{
|
|
"entropy": 5.072865724563599,
|
|
"epoch": 3.4265129682997117,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003856800705942764,
|
|
"loss": 4.7,
|
|
"mean_token_accuracy": 0.23929235339164734,
|
|
"num_tokens": 81778513.0,
|
|
"step": 35670
|
|
},
|
|
{
|
|
"entropy": 5.057154512405395,
|
|
"epoch": 3.4269932756964456,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038565022049484636,
|
|
"loss": 4.5665,
|
|
"mean_token_accuracy": 0.2571455791592598,
|
|
"num_tokens": 81789845.0,
|
|
"step": 35675
|
|
},
|
|
{
|
|
"entropy": 5.016182804107666,
|
|
"epoch": 3.4274735830931795,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00038562036782644675,
|
|
"loss": 4.6429,
|
|
"mean_token_accuracy": 0.24943196326494216,
|
|
"num_tokens": 81801287.0,
|
|
"step": 35680
|
|
},
|
|
{
|
|
"entropy": 5.031500720977784,
|
|
"epoch": 3.4279538904899134,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003855905125897708,
|
|
"loss": 4.5899,
|
|
"mean_token_accuracy": 0.2518752470612526,
|
|
"num_tokens": 81811462.0,
|
|
"step": 35685
|
|
},
|
|
{
|
|
"entropy": 4.98153395652771,
|
|
"epoch": 3.4284341978866473,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00038556065478551147,
|
|
"loss": 4.5334,
|
|
"mean_token_accuracy": 0.2516901955008507,
|
|
"num_tokens": 81822590.0,
|
|
"step": 35690
|
|
},
|
|
{
|
|
"entropy": 5.091107988357544,
|
|
"epoch": 3.428914505283381,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000385530794414362,
|
|
"loss": 4.6171,
|
|
"mean_token_accuracy": 0.259315188229084,
|
|
"num_tokens": 81834755.0,
|
|
"step": 35695
|
|
},
|
|
{
|
|
"entropy": 4.974374151229858,
|
|
"epoch": 3.4293948126801155,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003855009314770157,
|
|
"loss": 4.5024,
|
|
"mean_token_accuracy": 0.2566398024559021,
|
|
"num_tokens": 81846259.0,
|
|
"step": 35700
|
|
},
|
|
{
|
|
"entropy": 4.9552396774292,
|
|
"epoch": 3.4298751200768494,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00038547106597416593,
|
|
"loss": 4.5194,
|
|
"mean_token_accuracy": 0.2630326122045517,
|
|
"num_tokens": 81856262.0,
|
|
"step": 35705
|
|
},
|
|
{
|
|
"entropy": 5.03646993637085,
|
|
"epoch": 3.4303554274735832,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000385441197906506,
|
|
"loss": 4.6465,
|
|
"mean_token_accuracy": 0.24763473719358445,
|
|
"num_tokens": 81867201.0,
|
|
"step": 35710
|
|
},
|
|
{
|
|
"entropy": 5.095039701461792,
|
|
"epoch": 3.430835734870317,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00038541132727472945,
|
|
"loss": 4.8005,
|
|
"mean_token_accuracy": 0.2443981871008873,
|
|
"num_tokens": 81878654.0,
|
|
"step": 35715
|
|
},
|
|
{
|
|
"entropy": 5.058059453964233,
|
|
"epoch": 3.431316042267051,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038538145407952964,
|
|
"loss": 4.5626,
|
|
"mean_token_accuracy": 0.2526533126831055,
|
|
"num_tokens": 81889672.0,
|
|
"step": 35720
|
|
},
|
|
{
|
|
"entropy": 5.020740413665772,
|
|
"epoch": 3.431796349663785,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003853515783216003,
|
|
"loss": 4.5785,
|
|
"mean_token_accuracy": 0.2503949970006943,
|
|
"num_tokens": 81901072.0,
|
|
"step": 35725
|
|
},
|
|
{
|
|
"entropy": 5.039410972595215,
|
|
"epoch": 3.4322766570605188,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000385321700001635,
|
|
"loss": 4.5605,
|
|
"mean_token_accuracy": 0.2550284430384636,
|
|
"num_tokens": 81913146.0,
|
|
"step": 35730
|
|
},
|
|
{
|
|
"entropy": 5.088719511032105,
|
|
"epoch": 3.4327569644572526,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003852918191203274,
|
|
"loss": 4.6659,
|
|
"mean_token_accuracy": 0.24266094714403152,
|
|
"num_tokens": 81924648.0,
|
|
"step": 35735
|
|
},
|
|
{
|
|
"entropy": 5.051954650878907,
|
|
"epoch": 3.4332372718539865,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003852619356783712,
|
|
"loss": 4.6513,
|
|
"mean_token_accuracy": 0.24199773818254472,
|
|
"num_tokens": 81935744.0,
|
|
"step": 35740
|
|
},
|
|
{
|
|
"entropy": 5.077026414871216,
|
|
"epoch": 3.4337175792507204,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003852320496764603,
|
|
"loss": 4.599,
|
|
"mean_token_accuracy": 0.25022688806056975,
|
|
"num_tokens": 81946931.0,
|
|
"step": 35745
|
|
},
|
|
{
|
|
"entropy": 4.994713354110718,
|
|
"epoch": 3.4341978866474543,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00038520216111528855,
|
|
"loss": 4.57,
|
|
"mean_token_accuracy": 0.2605443805456161,
|
|
"num_tokens": 81958009.0,
|
|
"step": 35750
|
|
},
|
|
{
|
|
"entropy": 5.0040308952331545,
|
|
"epoch": 3.434678194044188,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003851722699955499,
|
|
"loss": 4.6106,
|
|
"mean_token_accuracy": 0.2508874759078026,
|
|
"num_tokens": 81970648.0,
|
|
"step": 35755
|
|
},
|
|
{
|
|
"entropy": 5.03471007347107,
|
|
"epoch": 3.435158501440922,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003851423763179382,
|
|
"loss": 4.5707,
|
|
"mean_token_accuracy": 0.2587558254599571,
|
|
"num_tokens": 81982293.0,
|
|
"step": 35760
|
|
},
|
|
{
|
|
"entropy": 4.992668104171753,
|
|
"epoch": 3.435638808837656,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038511248008314756,
|
|
"loss": 4.5886,
|
|
"mean_token_accuracy": 0.2558387294411659,
|
|
"num_tokens": 81993856.0,
|
|
"step": 35765
|
|
},
|
|
{
|
|
"entropy": 5.088662481307983,
|
|
"epoch": 3.43611911623439,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003850825812918722,
|
|
"loss": 4.6627,
|
|
"mean_token_accuracy": 0.24435512721538544,
|
|
"num_tokens": 82005359.0,
|
|
"step": 35770
|
|
},
|
|
{
|
|
"entropy": 5.056514310836792,
|
|
"epoch": 3.436599423631124,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003850526799448061,
|
|
"loss": 4.6276,
|
|
"mean_token_accuracy": 0.2558924823999405,
|
|
"num_tokens": 82017789.0,
|
|
"step": 35775
|
|
},
|
|
{
|
|
"entropy": 5.071243810653686,
|
|
"epoch": 3.437079731027858,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003850227760426436,
|
|
"loss": 4.6057,
|
|
"mean_token_accuracy": 0.2534796819090843,
|
|
"num_tokens": 82028147.0,
|
|
"step": 35780
|
|
},
|
|
{
|
|
"entropy": 5.021691226959229,
|
|
"epoch": 3.437560038424592,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038499286958607894,
|
|
"loss": 4.5866,
|
|
"mean_token_accuracy": 0.2504493460059166,
|
|
"num_tokens": 82039485.0,
|
|
"step": 35785
|
|
},
|
|
{
|
|
"entropy": 4.984779596328735,
|
|
"epoch": 3.438040345821326,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003849629605758065,
|
|
"loss": 4.5157,
|
|
"mean_token_accuracy": 0.26161112636327744,
|
|
"num_tokens": 82050004.0,
|
|
"step": 35790
|
|
},
|
|
{
|
|
"entropy": 5.006572675704956,
|
|
"epoch": 3.4385206532180597,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003849330490125205,
|
|
"loss": 4.5316,
|
|
"mean_token_accuracy": 0.25540418922901154,
|
|
"num_tokens": 82060621.0,
|
|
"step": 35795
|
|
},
|
|
{
|
|
"entropy": 5.030001497268676,
|
|
"epoch": 3.4390009606147935,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00038490313489691566,
|
|
"loss": 4.6048,
|
|
"mean_token_accuracy": 0.25173886865377426,
|
|
"num_tokens": 82070955.0,
|
|
"step": 35800
|
|
},
|
|
{
|
|
"entropy": 5.1286180973052975,
|
|
"epoch": 3.4394812680115274,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003848732182296863,
|
|
"loss": 4.8051,
|
|
"mean_token_accuracy": 0.23692064583301545,
|
|
"num_tokens": 82082623.0,
|
|
"step": 35805
|
|
},
|
|
{
|
|
"entropy": 5.04611382484436,
|
|
"epoch": 3.4399615754082613,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038484329901152713,
|
|
"loss": 4.5586,
|
|
"mean_token_accuracy": 0.25244507640600206,
|
|
"num_tokens": 82092707.0,
|
|
"step": 35810
|
|
},
|
|
{
|
|
"entropy": 5.013063287734985,
|
|
"epoch": 3.440441882804995,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00038481337724313264,
|
|
"loss": 4.5912,
|
|
"mean_token_accuracy": 0.2531083166599274,
|
|
"num_tokens": 82105114.0,
|
|
"step": 35815
|
|
},
|
|
{
|
|
"entropy": 4.991770172119141,
|
|
"epoch": 3.440922190201729,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0003847834529251977,
|
|
"loss": 4.5489,
|
|
"mean_token_accuracy": 0.26085243225097654,
|
|
"num_tokens": 82117638.0,
|
|
"step": 35820
|
|
},
|
|
{
|
|
"entropy": 5.081189155578613,
|
|
"epoch": 3.441402497598463,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038475352605841693,
|
|
"loss": 4.6518,
|
|
"mean_token_accuracy": 0.24742389023303984,
|
|
"num_tokens": 82129100.0,
|
|
"step": 35825
|
|
},
|
|
{
|
|
"entropy": 5.064033174514771,
|
|
"epoch": 3.441882804995197,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003847235966434852,
|
|
"loss": 4.6592,
|
|
"mean_token_accuracy": 0.2504685491323471,
|
|
"num_tokens": 82140776.0,
|
|
"step": 35830
|
|
},
|
|
{
|
|
"entropy": 5.111886405944825,
|
|
"epoch": 3.4423631123919307,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003846936646810974,
|
|
"loss": 4.6665,
|
|
"mean_token_accuracy": 0.24439497143030167,
|
|
"num_tokens": 82151569.0,
|
|
"step": 35835
|
|
},
|
|
{
|
|
"entropy": 5.114431619644165,
|
|
"epoch": 3.4428434197886646,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00038466373017194834,
|
|
"loss": 4.6824,
|
|
"mean_token_accuracy": 0.2462215691804886,
|
|
"num_tokens": 82162696.0,
|
|
"step": 35840
|
|
},
|
|
{
|
|
"entropy": 4.95943398475647,
|
|
"epoch": 3.4433237271853985,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003846337931167332,
|
|
"loss": 4.4898,
|
|
"mean_token_accuracy": 0.25996591746807096,
|
|
"num_tokens": 82173580.0,
|
|
"step": 35845
|
|
},
|
|
{
|
|
"entropy": 4.962374210357666,
|
|
"epoch": 3.443804034582133,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00038460385351614683,
|
|
"loss": 4.6017,
|
|
"mean_token_accuracy": 0.2535645171999931,
|
|
"num_tokens": 82185839.0,
|
|
"step": 35850
|
|
},
|
|
{
|
|
"entropy": 5.020360898971558,
|
|
"epoch": 3.4442843419788662,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00038457391137088455,
|
|
"loss": 4.6071,
|
|
"mean_token_accuracy": 0.24996693730354308,
|
|
"num_tokens": 82196682.0,
|
|
"step": 35855
|
|
},
|
|
{
|
|
"entropy": 5.0393565654754635,
|
|
"epoch": 3.4447646493756006,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038454396668164136,
|
|
"loss": 4.5911,
|
|
"mean_token_accuracy": 0.24866246283054352,
|
|
"num_tokens": 82207349.0,
|
|
"step": 35860
|
|
},
|
|
{
|
|
"entropy": 5.040912437438965,
|
|
"epoch": 3.4452449567723344,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003845140194491126,
|
|
"loss": 4.5791,
|
|
"mean_token_accuracy": 0.25327396392822266,
|
|
"num_tokens": 82218547.0,
|
|
"step": 35865
|
|
},
|
|
{
|
|
"entropy": 5.020535707473755,
|
|
"epoch": 3.4457252641690683,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00038448406967399334,
|
|
"loss": 4.5906,
|
|
"mean_token_accuracy": 0.2593914374709129,
|
|
"num_tokens": 82230246.0,
|
|
"step": 35870
|
|
},
|
|
{
|
|
"entropy": 4.9510817527771,
|
|
"epoch": 3.446205571565802,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00038445411735697917,
|
|
"loss": 4.5575,
|
|
"mean_token_accuracy": 0.2552237197756767,
|
|
"num_tokens": 82240924.0,
|
|
"step": 35875
|
|
},
|
|
{
|
|
"entropy": 5.022664403915405,
|
|
"epoch": 3.446685878962536,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003844241624987655,
|
|
"loss": 4.6331,
|
|
"mean_token_accuracy": 0.24676787704229355,
|
|
"num_tokens": 82252951.0,
|
|
"step": 35880
|
|
},
|
|
{
|
|
"entropy": 5.098922300338745,
|
|
"epoch": 3.44716618635927,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003843942051000476,
|
|
"loss": 4.6785,
|
|
"mean_token_accuracy": 0.24114069640636443,
|
|
"num_tokens": 82264377.0,
|
|
"step": 35885
|
|
},
|
|
{
|
|
"entropy": 5.125530767440796,
|
|
"epoch": 3.447646493756004,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.000384364245161521,
|
|
"loss": 4.7298,
|
|
"mean_token_accuracy": 0.23500104248523712,
|
|
"num_tokens": 82276736.0,
|
|
"step": 35890
|
|
},
|
|
{
|
|
"entropy": 5.0649346828460695,
|
|
"epoch": 3.4481268011527377,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003843342826838815,
|
|
"loss": 4.6294,
|
|
"mean_token_accuracy": 0.24611299782991408,
|
|
"num_tokens": 82288574.0,
|
|
"step": 35895
|
|
},
|
|
{
|
|
"entropy": 5.029484605789184,
|
|
"epoch": 3.4486071085494716,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00038430431766782463,
|
|
"loss": 4.5331,
|
|
"mean_token_accuracy": 0.265340293943882,
|
|
"num_tokens": 82299570.0,
|
|
"step": 35900
|
|
},
|
|
{
|
|
"entropy": 5.0758363723754885,
|
|
"epoch": 3.4490874159462055,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000384274350114046,
|
|
"loss": 4.6827,
|
|
"mean_token_accuracy": 0.23846648633480072,
|
|
"num_tokens": 82310449.0,
|
|
"step": 35905
|
|
},
|
|
{
|
|
"entropy": 5.053023433685302,
|
|
"epoch": 3.4495677233429394,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038424438002324145,
|
|
"loss": 4.643,
|
|
"mean_token_accuracy": 0.24154511839151382,
|
|
"num_tokens": 82321517.0,
|
|
"step": 35910
|
|
},
|
|
{
|
|
"entropy": 4.999557304382324,
|
|
"epoch": 3.4500480307396733,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00038421440739610683,
|
|
"loss": 4.6294,
|
|
"mean_token_accuracy": 0.24681228399276733,
|
|
"num_tokens": 82334555.0,
|
|
"step": 35915
|
|
},
|
|
{
|
|
"entropy": 4.973876285552978,
|
|
"epoch": 3.450528338136407,
|
|
"grad_norm": 0.86328125,
|
|
"learning_rate": 0.00038418443223333797,
|
|
"loss": 4.478,
|
|
"mean_token_accuracy": 0.26210538744926454,
|
|
"num_tokens": 82347166.0,
|
|
"step": 35920
|
|
},
|
|
{
|
|
"entropy": 4.95529408454895,
|
|
"epoch": 3.451008645533141,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003841544545356308,
|
|
"loss": 4.5209,
|
|
"mean_token_accuracy": 0.2557129502296448,
|
|
"num_tokens": 82358220.0,
|
|
"step": 35925
|
|
},
|
|
{
|
|
"entropy": 5.0186504364013675,
|
|
"epoch": 3.451488952929875,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038412447430368125,
|
|
"loss": 4.633,
|
|
"mean_token_accuracy": 0.24600803405046462,
|
|
"num_tokens": 82369154.0,
|
|
"step": 35930
|
|
},
|
|
{
|
|
"entropy": 5.007594299316406,
|
|
"epoch": 3.4519692603266092,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038409449153818556,
|
|
"loss": 4.5539,
|
|
"mean_token_accuracy": 0.2577602624893188,
|
|
"num_tokens": 82380311.0,
|
|
"step": 35935
|
|
},
|
|
{
|
|
"entropy": 5.017320871353149,
|
|
"epoch": 3.452449567723343,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038406450623983964,
|
|
"loss": 4.5599,
|
|
"mean_token_accuracy": 0.2534759595990181,
|
|
"num_tokens": 82390895.0,
|
|
"step": 35940
|
|
},
|
|
{
|
|
"entropy": 5.009060716629028,
|
|
"epoch": 3.452929875120077,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00038403451840933966,
|
|
"loss": 4.5409,
|
|
"mean_token_accuracy": 0.257034033536911,
|
|
"num_tokens": 82400880.0,
|
|
"step": 35945
|
|
},
|
|
{
|
|
"entropy": 4.976863765716553,
|
|
"epoch": 3.453410182516811,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00038400452804738204,
|
|
"loss": 4.6159,
|
|
"mean_token_accuracy": 0.2538122460246086,
|
|
"num_tokens": 82413566.0,
|
|
"step": 35950
|
|
},
|
|
{
|
|
"entropy": 4.963717746734619,
|
|
"epoch": 3.4538904899135447,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038397453515466297,
|
|
"loss": 4.5475,
|
|
"mean_token_accuracy": 0.2589743047952652,
|
|
"num_tokens": 82424434.0,
|
|
"step": 35955
|
|
},
|
|
{
|
|
"entropy": 5.089448308944702,
|
|
"epoch": 3.4543707973102786,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003839445397318787,
|
|
"loss": 4.6639,
|
|
"mean_token_accuracy": 0.24951853454113007,
|
|
"num_tokens": 82436310.0,
|
|
"step": 35960
|
|
},
|
|
{
|
|
"entropy": 4.98779330253601,
|
|
"epoch": 3.4548511047070125,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003839145417797258,
|
|
"loss": 4.5347,
|
|
"mean_token_accuracy": 0.2546408846974373,
|
|
"num_tokens": 82446484.0,
|
|
"step": 35965
|
|
},
|
|
{
|
|
"entropy": 5.090916872024536,
|
|
"epoch": 3.4553314121037464,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003838845412989006,
|
|
"loss": 4.6964,
|
|
"mean_token_accuracy": 0.23920599222183228,
|
|
"num_tokens": 82458169.0,
|
|
"step": 35970
|
|
},
|
|
{
|
|
"entropy": 4.982842636108399,
|
|
"epoch": 3.4558117195004803,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003838545382900997,
|
|
"loss": 4.5668,
|
|
"mean_token_accuracy": 0.2542534157633781,
|
|
"num_tokens": 82469407.0,
|
|
"step": 35975
|
|
},
|
|
{
|
|
"entropy": 4.986524534225464,
|
|
"epoch": 3.456292026897214,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003838245327540196,
|
|
"loss": 4.5392,
|
|
"mean_token_accuracy": 0.26070113480091095,
|
|
"num_tokens": 82480757.0,
|
|
"step": 35980
|
|
},
|
|
{
|
|
"entropy": 5.008253049850464,
|
|
"epoch": 3.456772334293948,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038379452469135706,
|
|
"loss": 4.6082,
|
|
"mean_token_accuracy": 0.25176827758550646,
|
|
"num_tokens": 82491073.0,
|
|
"step": 35985
|
|
},
|
|
{
|
|
"entropy": 5.055674934387207,
|
|
"epoch": 3.457252641690682,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00038376451410280864,
|
|
"loss": 4.6247,
|
|
"mean_token_accuracy": 0.25961445420980456,
|
|
"num_tokens": 82502243.0,
|
|
"step": 35990
|
|
},
|
|
{
|
|
"entropy": 5.063381910324097,
|
|
"epoch": 3.457732949087416,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00038373450098907124,
|
|
"loss": 4.6124,
|
|
"mean_token_accuracy": 0.2445521369576454,
|
|
"num_tokens": 82514831.0,
|
|
"step": 35995
|
|
},
|
|
{
|
|
"entropy": 5.02369704246521,
|
|
"epoch": 3.4582132564841497,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00038370448535084156,
|
|
"loss": 4.6043,
|
|
"mean_token_accuracy": 0.2521915763616562,
|
|
"num_tokens": 82525907.0,
|
|
"step": 36000
|
|
},
|
|
{
|
|
"epoch": 3.4582132564841497,
|
|
"eval_entropy": 4.835890457549948,
|
|
"eval_loss": 4.756885051727295,
|
|
"eval_mean_token_accuracy": 0.25125276272992053,
|
|
"eval_num_tokens": 82525907.0,
|
|
"eval_runtime": 26.6323,
|
|
"eval_samples_per_second": 1232.153,
|
|
"eval_steps_per_second": 154.024,
|
|
"step": 36000
|
|
},
|
|
{
|
|
"entropy": 5.056993579864502,
|
|
"epoch": 3.4586935638808836,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003836744671888165,
|
|
"loss": 4.6505,
|
|
"mean_token_accuracy": 0.24616679251194,
|
|
"num_tokens": 82537130.0,
|
|
"step": 36005
|
|
},
|
|
{
|
|
"entropy": 5.040928602218628,
|
|
"epoch": 3.459173871277618,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00038364444650369306,
|
|
"loss": 4.5651,
|
|
"mean_token_accuracy": 0.24673843681812285,
|
|
"num_tokens": 82548244.0,
|
|
"step": 36010
|
|
},
|
|
{
|
|
"entropy": 5.023439264297485,
|
|
"epoch": 3.4596541786743518,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038361442329616814,
|
|
"loss": 4.5532,
|
|
"mean_token_accuracy": 0.2544506788253784,
|
|
"num_tokens": 82559291.0,
|
|
"step": 36015
|
|
},
|
|
{
|
|
"entropy": 5.001073026657105,
|
|
"epoch": 3.4601344860710856,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00038358439756693886,
|
|
"loss": 4.5713,
|
|
"mean_token_accuracy": 0.25558389723300934,
|
|
"num_tokens": 82570929.0,
|
|
"step": 36020
|
|
},
|
|
{
|
|
"entropy": 5.005765199661255,
|
|
"epoch": 3.4606147934678195,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038355436931670225,
|
|
"loss": 4.5988,
|
|
"mean_token_accuracy": 0.2526944473385811,
|
|
"num_tokens": 82583016.0,
|
|
"step": 36025
|
|
},
|
|
{
|
|
"entropy": 5.058856105804443,
|
|
"epoch": 3.4610951008645534,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.00038352433854615557,
|
|
"loss": 4.6408,
|
|
"mean_token_accuracy": 0.2517577543854713,
|
|
"num_tokens": 82593950.0,
|
|
"step": 36030
|
|
},
|
|
{
|
|
"entropy": 4.948162460327149,
|
|
"epoch": 3.4615754082612873,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00038349430525599606,
|
|
"loss": 4.5083,
|
|
"mean_token_accuracy": 0.25558039397001264,
|
|
"num_tokens": 82605764.0,
|
|
"step": 36035
|
|
},
|
|
{
|
|
"entropy": 5.008602046966553,
|
|
"epoch": 3.462055715658021,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00038346426944692075,
|
|
"loss": 4.6097,
|
|
"mean_token_accuracy": 0.2569774121046066,
|
|
"num_tokens": 82617336.0,
|
|
"step": 36040
|
|
},
|
|
{
|
|
"entropy": 4.997391939163208,
|
|
"epoch": 3.462536023054755,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003834342311196274,
|
|
"loss": 4.5168,
|
|
"mean_token_accuracy": 0.2639403596520424,
|
|
"num_tokens": 82628999.0,
|
|
"step": 36045
|
|
},
|
|
{
|
|
"entropy": 4.983165693283081,
|
|
"epoch": 3.463016330451489,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038340419027481305,
|
|
"loss": 4.6244,
|
|
"mean_token_accuracy": 0.25390260964632033,
|
|
"num_tokens": 82640141.0,
|
|
"step": 36050
|
|
},
|
|
{
|
|
"entropy": 5.006143474578858,
|
|
"epoch": 3.463496637848223,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003833741469131753,
|
|
"loss": 4.621,
|
|
"mean_token_accuracy": 0.24634535163640975,
|
|
"num_tokens": 82651320.0,
|
|
"step": 36055
|
|
},
|
|
{
|
|
"entropy": 5.031508159637451,
|
|
"epoch": 3.4639769452449567,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003833441010354117,
|
|
"loss": 4.6023,
|
|
"mean_token_accuracy": 0.2525789126753807,
|
|
"num_tokens": 82662126.0,
|
|
"step": 36060
|
|
},
|
|
{
|
|
"entropy": 4.953914594650269,
|
|
"epoch": 3.4644572526416906,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038331405264221974,
|
|
"loss": 4.546,
|
|
"mean_token_accuracy": 0.25654407888650893,
|
|
"num_tokens": 82673266.0,
|
|
"step": 36065
|
|
},
|
|
{
|
|
"entropy": 5.106532764434815,
|
|
"epoch": 3.4649375600384245,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003832840017342972,
|
|
"loss": 4.6342,
|
|
"mean_token_accuracy": 0.245072540640831,
|
|
"num_tokens": 82684797.0,
|
|
"step": 36070
|
|
},
|
|
{
|
|
"entropy": 4.994842576980591,
|
|
"epoch": 3.4654178674351583,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00038325394831234164,
|
|
"loss": 4.5347,
|
|
"mean_token_accuracy": 0.2523220062255859,
|
|
"num_tokens": 82697531.0,
|
|
"step": 36075
|
|
},
|
|
{
|
|
"entropy": 5.092827701568604,
|
|
"epoch": 3.465898174831892,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003832238923770508,
|
|
"loss": 4.6317,
|
|
"mean_token_accuracy": 0.25392968505620955,
|
|
"num_tokens": 82708481.0,
|
|
"step": 36080
|
|
},
|
|
{
|
|
"entropy": 4.941552305221558,
|
|
"epoch": 3.4663784822286265,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003831938339291225,
|
|
"loss": 4.5919,
|
|
"mean_token_accuracy": 0.25105798691511155,
|
|
"num_tokens": 82719569.0,
|
|
"step": 36085
|
|
},
|
|
{
|
|
"entropy": 5.08570122718811,
|
|
"epoch": 3.46685878962536,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00038316377296925474,
|
|
"loss": 4.7007,
|
|
"mean_token_accuracy": 0.24717456400394439,
|
|
"num_tokens": 82730219.0,
|
|
"step": 36090
|
|
},
|
|
{
|
|
"entropy": 5.075848245620728,
|
|
"epoch": 3.4673390970220943,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00038313370949814537,
|
|
"loss": 4.6804,
|
|
"mean_token_accuracy": 0.24185362905263902,
|
|
"num_tokens": 82741865.0,
|
|
"step": 36095
|
|
},
|
|
{
|
|
"entropy": 5.037635278701782,
|
|
"epoch": 3.467819404418828,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003831036435164923,
|
|
"loss": 4.5716,
|
|
"mean_token_accuracy": 0.25491551160812376,
|
|
"num_tokens": 82752751.0,
|
|
"step": 36100
|
|
},
|
|
{
|
|
"entropy": 5.064966249465942,
|
|
"epoch": 3.468299711815562,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003830735750249936,
|
|
"loss": 4.6824,
|
|
"mean_token_accuracy": 0.2444985881447792,
|
|
"num_tokens": 82763946.0,
|
|
"step": 36105
|
|
},
|
|
{
|
|
"entropy": 4.991701745986939,
|
|
"epoch": 3.468780019212296,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003830435040243475,
|
|
"loss": 4.5988,
|
|
"mean_token_accuracy": 0.24963750094175338,
|
|
"num_tokens": 82775659.0,
|
|
"step": 36110
|
|
},
|
|
{
|
|
"entropy": 5.092870664596558,
|
|
"epoch": 3.46926032660903,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.000383013430515252,
|
|
"loss": 4.6029,
|
|
"mean_token_accuracy": 0.24469762146472931,
|
|
"num_tokens": 82787633.0,
|
|
"step": 36115
|
|
},
|
|
{
|
|
"entropy": 5.045297384262085,
|
|
"epoch": 3.4697406340057637,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003829833544984054,
|
|
"loss": 4.6681,
|
|
"mean_token_accuracy": 0.2515715181827545,
|
|
"num_tokens": 82799983.0,
|
|
"step": 36120
|
|
},
|
|
{
|
|
"entropy": 5.101945495605468,
|
|
"epoch": 3.4702209414024976,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003829532759745059,
|
|
"loss": 4.6563,
|
|
"mean_token_accuracy": 0.24534079134464265,
|
|
"num_tokens": 82811733.0,
|
|
"step": 36125
|
|
},
|
|
{
|
|
"entropy": 5.131472873687744,
|
|
"epoch": 3.4707012487992315,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00038292319494425195,
|
|
"loss": 4.6528,
|
|
"mean_token_accuracy": 0.2447332561016083,
|
|
"num_tokens": 82823555.0,
|
|
"step": 36130
|
|
},
|
|
{
|
|
"entropy": 5.065278148651123,
|
|
"epoch": 3.4711815561959654,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003828931114083419,
|
|
"loss": 4.6177,
|
|
"mean_token_accuracy": 0.254763600230217,
|
|
"num_tokens": 82835288.0,
|
|
"step": 36135
|
|
},
|
|
{
|
|
"entropy": 5.032763051986694,
|
|
"epoch": 3.4716618635926992,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000382863025367474,
|
|
"loss": 4.5918,
|
|
"mean_token_accuracy": 0.2621903494000435,
|
|
"num_tokens": 82846298.0,
|
|
"step": 36140
|
|
},
|
|
{
|
|
"entropy": 5.057696104049683,
|
|
"epoch": 3.472142170989433,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003828329368223471,
|
|
"loss": 4.6339,
|
|
"mean_token_accuracy": 0.24337695091962813,
|
|
"num_tokens": 82859265.0,
|
|
"step": 36145
|
|
},
|
|
{
|
|
"entropy": 5.013967418670655,
|
|
"epoch": 3.472622478386167,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003828028457736596,
|
|
"loss": 4.6159,
|
|
"mean_token_accuracy": 0.2510910123586655,
|
|
"num_tokens": 82870167.0,
|
|
"step": 36150
|
|
},
|
|
{
|
|
"entropy": 5.040460157394409,
|
|
"epoch": 3.473102785782901,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0003827727522221101,
|
|
"loss": 4.6581,
|
|
"mean_token_accuracy": 0.2511814534664154,
|
|
"num_tokens": 82883034.0,
|
|
"step": 36155
|
|
},
|
|
{
|
|
"entropy": 5.029521036148071,
|
|
"epoch": 3.473583093179635,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038274265616839733,
|
|
"loss": 4.5365,
|
|
"mean_token_accuracy": 0.2600481018424034,
|
|
"num_tokens": 82894080.0,
|
|
"step": 36160
|
|
},
|
|
{
|
|
"entropy": 5.048698377609253,
|
|
"epoch": 3.4740634005763686,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038271255761322,
|
|
"loss": 4.6105,
|
|
"mean_token_accuracy": 0.251488533616066,
|
|
"num_tokens": 82905158.0,
|
|
"step": 36165
|
|
},
|
|
{
|
|
"entropy": 5.000881671905518,
|
|
"epoch": 3.474543707973103,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003826824565572769,
|
|
"loss": 4.5572,
|
|
"mean_token_accuracy": 0.25473167896270754,
|
|
"num_tokens": 82917246.0,
|
|
"step": 36170
|
|
},
|
|
{
|
|
"entropy": 4.953558731079101,
|
|
"epoch": 3.475024015369837,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003826523530012669,
|
|
"loss": 4.551,
|
|
"mean_token_accuracy": 0.25246447920799253,
|
|
"num_tokens": 82928895.0,
|
|
"step": 36175
|
|
},
|
|
{
|
|
"entropy": 5.001086759567261,
|
|
"epoch": 3.4755043227665707,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000382622246945889,
|
|
"loss": 4.6017,
|
|
"mean_token_accuracy": 0.25646854341030123,
|
|
"num_tokens": 82939239.0,
|
|
"step": 36180
|
|
},
|
|
{
|
|
"entropy": 5.033108806610107,
|
|
"epoch": 3.4759846301633046,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038259213839184205,
|
|
"loss": 4.5888,
|
|
"mean_token_accuracy": 0.25895061790943147,
|
|
"num_tokens": 82950457.0,
|
|
"step": 36185
|
|
},
|
|
{
|
|
"entropy": 5.045436525344849,
|
|
"epoch": 3.4764649375600385,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00038256202733982515,
|
|
"loss": 4.5618,
|
|
"mean_token_accuracy": 0.2577621892094612,
|
|
"num_tokens": 82961589.0,
|
|
"step": 36190
|
|
},
|
|
{
|
|
"entropy": 4.956928873062134,
|
|
"epoch": 3.4769452449567724,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003825319137905373,
|
|
"loss": 4.5025,
|
|
"mean_token_accuracy": 0.257821062207222,
|
|
"num_tokens": 82972397.0,
|
|
"step": 36195
|
|
},
|
|
{
|
|
"entropy": 5.019803237915039,
|
|
"epoch": 3.4774255523535063,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003825017977446778,
|
|
"loss": 4.6031,
|
|
"mean_token_accuracy": 0.25139220952987673,
|
|
"num_tokens": 82984239.0,
|
|
"step": 36200
|
|
},
|
|
{
|
|
"entropy": 5.012581920623779,
|
|
"epoch": 3.47790585975024,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00038247167920294574,
|
|
"loss": 4.6295,
|
|
"mean_token_accuracy": 0.2531572192907333,
|
|
"num_tokens": 82996544.0,
|
|
"step": 36205
|
|
},
|
|
{
|
|
"entropy": 5.130231666564941,
|
|
"epoch": 3.478386167146974,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00038244155816604037,
|
|
"loss": 4.7642,
|
|
"mean_token_accuracy": 0.2391597643494606,
|
|
"num_tokens": 83009002.0,
|
|
"step": 36210
|
|
},
|
|
{
|
|
"entropy": 5.031131029129028,
|
|
"epoch": 3.478866474543708,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003824114346346611,
|
|
"loss": 4.5669,
|
|
"mean_token_accuracy": 0.2468014433979988,
|
|
"num_tokens": 83021225.0,
|
|
"step": 36215
|
|
},
|
|
{
|
|
"entropy": 5.032139873504638,
|
|
"epoch": 3.479346781940442,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003823813086095073,
|
|
"loss": 4.6589,
|
|
"mean_token_accuracy": 0.25498601496219636,
|
|
"num_tokens": 83032682.0,
|
|
"step": 36220
|
|
},
|
|
{
|
|
"entropy": 5.008218288421631,
|
|
"epoch": 3.4798270893371757,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00038235118009127833,
|
|
"loss": 4.5732,
|
|
"mean_token_accuracy": 0.26051925867795944,
|
|
"num_tokens": 83045035.0,
|
|
"step": 36225
|
|
},
|
|
{
|
|
"entropy": 5.114557218551636,
|
|
"epoch": 3.4803073967339095,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003823210490806737,
|
|
"loss": 4.7684,
|
|
"mean_token_accuracy": 0.2362096130847931,
|
|
"num_tokens": 83056724.0,
|
|
"step": 36230
|
|
},
|
|
{
|
|
"entropy": 5.076307010650635,
|
|
"epoch": 3.4807877041306434,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000382290915578393,
|
|
"loss": 4.6429,
|
|
"mean_token_accuracy": 0.2531363174319267,
|
|
"num_tokens": 83067736.0,
|
|
"step": 36235
|
|
},
|
|
{
|
|
"entropy": 5.025189018249511,
|
|
"epoch": 3.4812680115273773,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003822607795851359,
|
|
"loss": 4.5601,
|
|
"mean_token_accuracy": 0.26449885815382,
|
|
"num_tokens": 83079367.0,
|
|
"step": 36240
|
|
},
|
|
{
|
|
"entropy": 5.095752620697022,
|
|
"epoch": 3.4817483189241116,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003822306411016019,
|
|
"loss": 4.6455,
|
|
"mean_token_accuracy": 0.242302206158638,
|
|
"num_tokens": 83091138.0,
|
|
"step": 36245
|
|
},
|
|
{
|
|
"entropy": 5.002807331085205,
|
|
"epoch": 3.4822286263208455,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003822005001284909,
|
|
"loss": 4.5372,
|
|
"mean_token_accuracy": 0.2629186615347862,
|
|
"num_tokens": 83101627.0,
|
|
"step": 36250
|
|
},
|
|
{
|
|
"entropy": 5.035023641586304,
|
|
"epoch": 3.4827089337175794,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003821703566665026,
|
|
"loss": 4.5867,
|
|
"mean_token_accuracy": 0.253347373008728,
|
|
"num_tokens": 83112569.0,
|
|
"step": 36255
|
|
},
|
|
{
|
|
"entropy": 4.98093581199646,
|
|
"epoch": 3.4831892411143133,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003821402107163368,
|
|
"loss": 4.5069,
|
|
"mean_token_accuracy": 0.2610474839806557,
|
|
"num_tokens": 83124019.0,
|
|
"step": 36260
|
|
},
|
|
{
|
|
"entropy": 4.998417139053345,
|
|
"epoch": 3.483669548511047,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003821100622786935,
|
|
"loss": 4.5603,
|
|
"mean_token_accuracy": 0.26268344968557356,
|
|
"num_tokens": 83135032.0,
|
|
"step": 36265
|
|
},
|
|
{
|
|
"entropy": 4.903113174438476,
|
|
"epoch": 3.484149855907781,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00038207991135427255,
|
|
"loss": 4.4568,
|
|
"mean_token_accuracy": 0.2547492355108261,
|
|
"num_tokens": 83147362.0,
|
|
"step": 36270
|
|
},
|
|
{
|
|
"entropy": 5.017651176452636,
|
|
"epoch": 3.484630163304515,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000382049757943774,
|
|
"loss": 4.5895,
|
|
"mean_token_accuracy": 0.2553521737456322,
|
|
"num_tokens": 83158531.0,
|
|
"step": 36275
|
|
},
|
|
{
|
|
"entropy": 5.0827155113220215,
|
|
"epoch": 3.485110470701249,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00038201960204789796,
|
|
"loss": 4.7159,
|
|
"mean_token_accuracy": 0.24603585451841353,
|
|
"num_tokens": 83169565.0,
|
|
"step": 36280
|
|
},
|
|
{
|
|
"entropy": 5.0643932819366455,
|
|
"epoch": 3.4855907780979827,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003819894436673446,
|
|
"loss": 4.6519,
|
|
"mean_token_accuracy": 0.24814677238464355,
|
|
"num_tokens": 83181084.0,
|
|
"step": 36285
|
|
},
|
|
{
|
|
"entropy": 5.095578241348266,
|
|
"epoch": 3.4860710854947166,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038195928280281406,
|
|
"loss": 4.6345,
|
|
"mean_token_accuracy": 0.24124212861061095,
|
|
"num_tokens": 83193477.0,
|
|
"step": 36290
|
|
},
|
|
{
|
|
"entropy": 5.045779752731323,
|
|
"epoch": 3.4865513928914504,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003819291194550066,
|
|
"loss": 4.6237,
|
|
"mean_token_accuracy": 0.24994434863328935,
|
|
"num_tokens": 83204408.0,
|
|
"step": 36295
|
|
},
|
|
{
|
|
"entropy": 5.092052602767945,
|
|
"epoch": 3.4870317002881843,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003818989536246224,
|
|
"loss": 4.7038,
|
|
"mean_token_accuracy": 0.24879524856805801,
|
|
"num_tokens": 83215152.0,
|
|
"step": 36300
|
|
},
|
|
{
|
|
"entropy": 4.952845811843872,
|
|
"epoch": 3.487512007684918,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00038186878531236196,
|
|
"loss": 4.5343,
|
|
"mean_token_accuracy": 0.2582982614636421,
|
|
"num_tokens": 83227094.0,
|
|
"step": 36305
|
|
},
|
|
{
|
|
"entropy": 5.024833011627197,
|
|
"epoch": 3.487992315081652,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038183861451892567,
|
|
"loss": 4.6033,
|
|
"mean_token_accuracy": 0.25574894100427625,
|
|
"num_tokens": 83238149.0,
|
|
"step": 36310
|
|
},
|
|
{
|
|
"entropy": 5.0474512577056885,
|
|
"epoch": 3.488472622478386,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000381808441245014,
|
|
"loss": 4.6635,
|
|
"mean_token_accuracy": 0.24599166810512543,
|
|
"num_tokens": 83249991.0,
|
|
"step": 36315
|
|
},
|
|
{
|
|
"entropy": 5.034723520278931,
|
|
"epoch": 3.4889529298751203,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0003817782654913274,
|
|
"loss": 4.6114,
|
|
"mean_token_accuracy": 0.25361563712358476,
|
|
"num_tokens": 83262987.0,
|
|
"step": 36320
|
|
},
|
|
{
|
|
"entropy": 5.089780187606811,
|
|
"epoch": 3.489433237271854,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003817480872585667,
|
|
"loss": 4.6472,
|
|
"mean_token_accuracy": 0.25067704170942307,
|
|
"num_tokens": 83274155.0,
|
|
"step": 36325
|
|
},
|
|
{
|
|
"entropy": 5.1131970405578615,
|
|
"epoch": 3.489913544668588,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038171790654743234,
|
|
"loss": 4.616,
|
|
"mean_token_accuracy": 0.248615300655365,
|
|
"num_tokens": 83286314.0,
|
|
"step": 36330
|
|
},
|
|
{
|
|
"entropy": 5.0316308498382565,
|
|
"epoch": 3.490393852065322,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000381687723358625,
|
|
"loss": 4.6315,
|
|
"mean_token_accuracy": 0.2464334949851036,
|
|
"num_tokens": 83297435.0,
|
|
"step": 36335
|
|
},
|
|
{
|
|
"entropy": 4.961110067367554,
|
|
"epoch": 3.490874159462056,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003816575376928456,
|
|
"loss": 4.4689,
|
|
"mean_token_accuracy": 0.26051573008298873,
|
|
"num_tokens": 83308248.0,
|
|
"step": 36340
|
|
},
|
|
{
|
|
"entropy": 4.992400121688843,
|
|
"epoch": 3.4913544668587897,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00038162734955079483,
|
|
"loss": 4.5733,
|
|
"mean_token_accuracy": 0.2547511994838715,
|
|
"num_tokens": 83319446.0,
|
|
"step": 36345
|
|
},
|
|
{
|
|
"entropy": 5.12844877243042,
|
|
"epoch": 3.4918347742555236,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003815971589331736,
|
|
"loss": 4.6772,
|
|
"mean_token_accuracy": 0.24939944297075273,
|
|
"num_tokens": 83330340.0,
|
|
"step": 36350
|
|
},
|
|
{
|
|
"entropy": 5.010162782669068,
|
|
"epoch": 3.4923150816522575,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003815669658406829,
|
|
"loss": 4.5163,
|
|
"mean_token_accuracy": 0.25773070603609083,
|
|
"num_tokens": 83342338.0,
|
|
"step": 36355
|
|
},
|
|
{
|
|
"entropy": 4.977046346664428,
|
|
"epoch": 3.4927953890489913,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00038153677027402374,
|
|
"loss": 4.5705,
|
|
"mean_token_accuracy": 0.2573657900094986,
|
|
"num_tokens": 83353272.0,
|
|
"step": 36360
|
|
},
|
|
{
|
|
"entropy": 5.011315250396729,
|
|
"epoch": 3.493275696445725,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00038150657223389703,
|
|
"loss": 4.6643,
|
|
"mean_token_accuracy": 0.2504794612526894,
|
|
"num_tokens": 83364691.0,
|
|
"step": 36365
|
|
},
|
|
{
|
|
"entropy": 5.095743131637573,
|
|
"epoch": 3.493756003842459,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00038147637172100397,
|
|
"loss": 4.5873,
|
|
"mean_token_accuracy": 0.2563665583729744,
|
|
"num_tokens": 83375110.0,
|
|
"step": 36370
|
|
},
|
|
{
|
|
"entropy": 5.005822229385376,
|
|
"epoch": 3.494236311239193,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003814461687360459,
|
|
"loss": 4.583,
|
|
"mean_token_accuracy": 0.24570364952087403,
|
|
"num_tokens": 83387540.0,
|
|
"step": 36375
|
|
},
|
|
{
|
|
"entropy": 5.069326019287109,
|
|
"epoch": 3.494716618635927,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00038141596327972375,
|
|
"loss": 4.6766,
|
|
"mean_token_accuracy": 0.24246444404125214,
|
|
"num_tokens": 83398895.0,
|
|
"step": 36380
|
|
},
|
|
{
|
|
"entropy": 5.0073939800262455,
|
|
"epoch": 3.4951969260326607,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003813857553527389,
|
|
"loss": 4.5472,
|
|
"mean_token_accuracy": 0.25437297224998473,
|
|
"num_tokens": 83410712.0,
|
|
"step": 36385
|
|
},
|
|
{
|
|
"entropy": 5.06603970527649,
|
|
"epoch": 3.4956772334293946,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038135554495579273,
|
|
"loss": 4.6385,
|
|
"mean_token_accuracy": 0.2475124180316925,
|
|
"num_tokens": 83421168.0,
|
|
"step": 36390
|
|
},
|
|
{
|
|
"entropy": 5.06361198425293,
|
|
"epoch": 3.496157540826129,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003813253320895866,
|
|
"loss": 4.6498,
|
|
"mean_token_accuracy": 0.2458742380142212,
|
|
"num_tokens": 83432634.0,
|
|
"step": 36395
|
|
},
|
|
{
|
|
"entropy": 5.016599512100219,
|
|
"epoch": 3.4966378482228624,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000381295116754822,
|
|
"loss": 4.6215,
|
|
"mean_token_accuracy": 0.24506403356790543,
|
|
"num_tokens": 83444024.0,
|
|
"step": 36400
|
|
},
|
|
{
|
|
"entropy": 5.122844123840332,
|
|
"epoch": 3.4971181556195967,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038126489895220047,
|
|
"loss": 4.6744,
|
|
"mean_token_accuracy": 0.2429607018828392,
|
|
"num_tokens": 83455223.0,
|
|
"step": 36405
|
|
},
|
|
{
|
|
"entropy": 5.1787111282348635,
|
|
"epoch": 3.4975984630163306,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00038123467868242345,
|
|
"loss": 4.7132,
|
|
"mean_token_accuracy": 0.24054633677005768,
|
|
"num_tokens": 83467295.0,
|
|
"step": 36410
|
|
},
|
|
{
|
|
"entropy": 5.117932987213135,
|
|
"epoch": 3.4980787704130645,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00038120445594619264,
|
|
"loss": 4.7392,
|
|
"mean_token_accuracy": 0.24111962467432022,
|
|
"num_tokens": 83478445.0,
|
|
"step": 36415
|
|
},
|
|
{
|
|
"entropy": 5.029610872268677,
|
|
"epoch": 3.4985590778097984,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003811742307442098,
|
|
"loss": 4.6221,
|
|
"mean_token_accuracy": 0.24838598370552062,
|
|
"num_tokens": 83490257.0,
|
|
"step": 36420
|
|
},
|
|
{
|
|
"entropy": 5.012696647644043,
|
|
"epoch": 3.4990393852065322,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00038114400307717657,
|
|
"loss": 4.5787,
|
|
"mean_token_accuracy": 0.2519743382930756,
|
|
"num_tokens": 83501773.0,
|
|
"step": 36425
|
|
},
|
|
{
|
|
"entropy": 5.057143926620483,
|
|
"epoch": 3.499519692603266,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003811137729457947,
|
|
"loss": 4.6001,
|
|
"mean_token_accuracy": 0.25844192057847976,
|
|
"num_tokens": 83513654.0,
|
|
"step": 36430
|
|
},
|
|
{
|
|
"entropy": 5.022467184066772,
|
|
"epoch": 3.5,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003810835403507662,
|
|
"loss": 4.6032,
|
|
"mean_token_accuracy": 0.24511602967977525,
|
|
"num_tokens": 83525251.0,
|
|
"step": 36435
|
|
},
|
|
{
|
|
"entropy": 5.066669368743897,
|
|
"epoch": 3.500480307396734,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00038105330529279285,
|
|
"loss": 4.6708,
|
|
"mean_token_accuracy": 0.251289539039135,
|
|
"num_tokens": 83536937.0,
|
|
"step": 36440
|
|
},
|
|
{
|
|
"entropy": 5.0365759372711185,
|
|
"epoch": 3.5009606147934678,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003810230677725767,
|
|
"loss": 4.6063,
|
|
"mean_token_accuracy": 0.24983524084091185,
|
|
"num_tokens": 83549686.0,
|
|
"step": 36445
|
|
},
|
|
{
|
|
"entropy": 5.102137231826783,
|
|
"epoch": 3.5014409221902016,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038099282779081975,
|
|
"loss": 4.6271,
|
|
"mean_token_accuracy": 0.2479284256696701,
|
|
"num_tokens": 83561774.0,
|
|
"step": 36450
|
|
},
|
|
{
|
|
"entropy": 5.06001353263855,
|
|
"epoch": 3.5019212295869355,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00038096258534822404,
|
|
"loss": 4.6229,
|
|
"mean_token_accuracy": 0.24866744577884675,
|
|
"num_tokens": 83573236.0,
|
|
"step": 36455
|
|
},
|
|
{
|
|
"entropy": 5.03421311378479,
|
|
"epoch": 3.5024015369836694,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00038093234044549174,
|
|
"loss": 4.589,
|
|
"mean_token_accuracy": 0.254245400428772,
|
|
"num_tokens": 83584899.0,
|
|
"step": 36460
|
|
},
|
|
{
|
|
"entropy": 5.008595657348633,
|
|
"epoch": 3.5028818443804033,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00038090209308332503,
|
|
"loss": 4.6233,
|
|
"mean_token_accuracy": 0.25674950182437895,
|
|
"num_tokens": 83596880.0,
|
|
"step": 36465
|
|
},
|
|
{
|
|
"entropy": 5.007718896865844,
|
|
"epoch": 3.5033621517771376,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00038087184326242627,
|
|
"loss": 4.6351,
|
|
"mean_token_accuracy": 0.24479610323905945,
|
|
"num_tokens": 83608747.0,
|
|
"step": 36470
|
|
},
|
|
{
|
|
"entropy": 5.032431650161743,
|
|
"epoch": 3.503842459173871,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003808415909834976,
|
|
"loss": 4.6303,
|
|
"mean_token_accuracy": 0.25000889152288436,
|
|
"num_tokens": 83619971.0,
|
|
"step": 36475
|
|
},
|
|
{
|
|
"entropy": 5.1295552253723145,
|
|
"epoch": 3.5043227665706054,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00038081133624724147,
|
|
"loss": 4.7151,
|
|
"mean_token_accuracy": 0.24804797023534775,
|
|
"num_tokens": 83631030.0,
|
|
"step": 36480
|
|
},
|
|
{
|
|
"entropy": 5.063794088363648,
|
|
"epoch": 3.5048030739673393,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00038078107905436035,
|
|
"loss": 4.5632,
|
|
"mean_token_accuracy": 0.25694143772125244,
|
|
"num_tokens": 83642854.0,
|
|
"step": 36485
|
|
},
|
|
{
|
|
"entropy": 5.056593751907348,
|
|
"epoch": 3.505283381364073,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00038075081940555666,
|
|
"loss": 4.5588,
|
|
"mean_token_accuracy": 0.2528766915202141,
|
|
"num_tokens": 83653669.0,
|
|
"step": 36490
|
|
},
|
|
{
|
|
"entropy": 5.048584079742431,
|
|
"epoch": 3.505763688760807,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00038072055730153297,
|
|
"loss": 4.604,
|
|
"mean_token_accuracy": 0.2529552534222603,
|
|
"num_tokens": 83665098.0,
|
|
"step": 36495
|
|
},
|
|
{
|
|
"entropy": 5.013029670715332,
|
|
"epoch": 3.506243996157541,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00038069029274299184,
|
|
"loss": 4.6204,
|
|
"mean_token_accuracy": 0.25545528531074524,
|
|
"num_tokens": 83677494.0,
|
|
"step": 36500
|
|
},
|
|
{
|
|
"entropy": 5.019226741790772,
|
|
"epoch": 3.506724303554275,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003806600257306359,
|
|
"loss": 4.5269,
|
|
"mean_token_accuracy": 0.2605164647102356,
|
|
"num_tokens": 83689128.0,
|
|
"step": 36505
|
|
},
|
|
{
|
|
"entropy": 4.930389022827148,
|
|
"epoch": 3.5072046109510087,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00038062975626516806,
|
|
"loss": 4.5322,
|
|
"mean_token_accuracy": 0.25798874348402023,
|
|
"num_tokens": 83700113.0,
|
|
"step": 36510
|
|
},
|
|
{
|
|
"entropy": 5.062226438522339,
|
|
"epoch": 3.5076849183477425,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00038059948434729086,
|
|
"loss": 4.5736,
|
|
"mean_token_accuracy": 0.2530315175652504,
|
|
"num_tokens": 83709549.0,
|
|
"step": 36515
|
|
},
|
|
{
|
|
"entropy": 5.041370964050293,
|
|
"epoch": 3.5081652257444764,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038056920997770713,
|
|
"loss": 4.6109,
|
|
"mean_token_accuracy": 0.24801046997308732,
|
|
"num_tokens": 83720912.0,
|
|
"step": 36520
|
|
},
|
|
{
|
|
"entropy": 4.996396017074585,
|
|
"epoch": 3.5086455331412103,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00038053893315711997,
|
|
"loss": 4.6321,
|
|
"mean_token_accuracy": 0.24855329245328903,
|
|
"num_tokens": 83732885.0,
|
|
"step": 36525
|
|
},
|
|
{
|
|
"entropy": 5.103625011444092,
|
|
"epoch": 3.509125840537944,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000380508653886232,
|
|
"loss": 4.6328,
|
|
"mean_token_accuracy": 0.25259209871292115,
|
|
"num_tokens": 83744521.0,
|
|
"step": 36530
|
|
},
|
|
{
|
|
"entropy": 5.122366046905517,
|
|
"epoch": 3.509606147934678,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003804783721657465,
|
|
"loss": 4.6387,
|
|
"mean_token_accuracy": 0.2601527154445648,
|
|
"num_tokens": 83756184.0,
|
|
"step": 36535
|
|
},
|
|
{
|
|
"entropy": 4.958087062835693,
|
|
"epoch": 3.510086455331412,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00038044808799636635,
|
|
"loss": 4.5958,
|
|
"mean_token_accuracy": 0.25628584921360015,
|
|
"num_tokens": 83767643.0,
|
|
"step": 36540
|
|
},
|
|
{
|
|
"entropy": 4.9522013664245605,
|
|
"epoch": 3.5105667627281463,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003804178013787947,
|
|
"loss": 4.5011,
|
|
"mean_token_accuracy": 0.2627973765134811,
|
|
"num_tokens": 83778326.0,
|
|
"step": 36545
|
|
},
|
|
{
|
|
"entropy": 5.010363626480102,
|
|
"epoch": 3.5110470701248797,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00038038751231373477,
|
|
"loss": 4.6719,
|
|
"mean_token_accuracy": 0.2513371229171753,
|
|
"num_tokens": 83790396.0,
|
|
"step": 36550
|
|
},
|
|
{
|
|
"entropy": 5.080128145217896,
|
|
"epoch": 3.511527377521614,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00038035722080188974,
|
|
"loss": 4.6234,
|
|
"mean_token_accuracy": 0.2560823678970337,
|
|
"num_tokens": 83803199.0,
|
|
"step": 36555
|
|
},
|
|
{
|
|
"entropy": 5.0286026954650875,
|
|
"epoch": 3.5120076849183475,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038032692684396286,
|
|
"loss": 4.5922,
|
|
"mean_token_accuracy": 0.25335767716169355,
|
|
"num_tokens": 83815184.0,
|
|
"step": 36560
|
|
},
|
|
{
|
|
"entropy": 5.154217863082886,
|
|
"epoch": 3.512487992315082,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003802966304406575,
|
|
"loss": 4.6965,
|
|
"mean_token_accuracy": 0.24844575822353362,
|
|
"num_tokens": 83827627.0,
|
|
"step": 36565
|
|
},
|
|
{
|
|
"entropy": 5.063632535934448,
|
|
"epoch": 3.5129682997118157,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00038026633159267707,
|
|
"loss": 4.5885,
|
|
"mean_token_accuracy": 0.26016982942819594,
|
|
"num_tokens": 83840771.0,
|
|
"step": 36570
|
|
},
|
|
{
|
|
"entropy": 4.999858474731445,
|
|
"epoch": 3.5134486071085496,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00038023603030072503,
|
|
"loss": 4.5717,
|
|
"mean_token_accuracy": 0.25764906108379365,
|
|
"num_tokens": 83851311.0,
|
|
"step": 36575
|
|
},
|
|
{
|
|
"entropy": 5.013825607299805,
|
|
"epoch": 3.5139289145052834,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003802057265655048,
|
|
"loss": 4.6317,
|
|
"mean_token_accuracy": 0.2526480510830879,
|
|
"num_tokens": 83863887.0,
|
|
"step": 36580
|
|
},
|
|
{
|
|
"entropy": 5.081586933135986,
|
|
"epoch": 3.5144092219020173,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00038017542038772003,
|
|
"loss": 4.6128,
|
|
"mean_token_accuracy": 0.2460757315158844,
|
|
"num_tokens": 83874183.0,
|
|
"step": 36585
|
|
},
|
|
{
|
|
"entropy": 5.034558916091919,
|
|
"epoch": 3.514889529298751,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00038014511176807426,
|
|
"loss": 4.6382,
|
|
"mean_token_accuracy": 0.2458342880010605,
|
|
"num_tokens": 83885973.0,
|
|
"step": 36590
|
|
},
|
|
{
|
|
"entropy": 4.939073085784912,
|
|
"epoch": 3.515369836695485,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00038011480070727124,
|
|
"loss": 4.548,
|
|
"mean_token_accuracy": 0.2554843261837959,
|
|
"num_tokens": 83897808.0,
|
|
"step": 36595
|
|
},
|
|
{
|
|
"entropy": 5.016794300079345,
|
|
"epoch": 3.515850144092219,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003800844872060147,
|
|
"loss": 4.614,
|
|
"mean_token_accuracy": 0.2499987468123436,
|
|
"num_tokens": 83909876.0,
|
|
"step": 36600
|
|
},
|
|
{
|
|
"entropy": 4.963265705108642,
|
|
"epoch": 3.516330451488953,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00038005417126500836,
|
|
"loss": 4.5762,
|
|
"mean_token_accuracy": 0.2538855388760567,
|
|
"num_tokens": 83921181.0,
|
|
"step": 36605
|
|
},
|
|
{
|
|
"entropy": 5.086139774322509,
|
|
"epoch": 3.5168107588856867,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00038002385288495613,
|
|
"loss": 4.6026,
|
|
"mean_token_accuracy": 0.25069249272346494,
|
|
"num_tokens": 83932946.0,
|
|
"step": 36610
|
|
},
|
|
{
|
|
"entropy": 4.993628358840942,
|
|
"epoch": 3.5172910662824206,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003799935320665619,
|
|
"loss": 4.5873,
|
|
"mean_token_accuracy": 0.2561313584446907,
|
|
"num_tokens": 83945439.0,
|
|
"step": 36615
|
|
},
|
|
{
|
|
"entropy": 5.009245300292969,
|
|
"epoch": 3.517771373679155,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00037996320881052956,
|
|
"loss": 4.6517,
|
|
"mean_token_accuracy": 0.2523639664053917,
|
|
"num_tokens": 83956979.0,
|
|
"step": 36620
|
|
},
|
|
{
|
|
"entropy": 5.106911325454712,
|
|
"epoch": 3.5182516810758884,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00037993288311756327,
|
|
"loss": 4.6926,
|
|
"mean_token_accuracy": 0.2449944272637367,
|
|
"num_tokens": 83968371.0,
|
|
"step": 36625
|
|
},
|
|
{
|
|
"entropy": 5.090954875946045,
|
|
"epoch": 3.5187319884726227,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000379902554988367,
|
|
"loss": 4.5968,
|
|
"mean_token_accuracy": 0.2517275273799896,
|
|
"num_tokens": 83980815.0,
|
|
"step": 36630
|
|
},
|
|
{
|
|
"entropy": 4.968883085250854,
|
|
"epoch": 3.519212295869356,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003798722244236449,
|
|
"loss": 4.5654,
|
|
"mean_token_accuracy": 0.25360574573278427,
|
|
"num_tokens": 83991764.0,
|
|
"step": 36635
|
|
},
|
|
{
|
|
"entropy": 5.134685850143432,
|
|
"epoch": 3.5196926032660905,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003798418914241011,
|
|
"loss": 4.7369,
|
|
"mean_token_accuracy": 0.24188854694366455,
|
|
"num_tokens": 84003423.0,
|
|
"step": 36640
|
|
},
|
|
{
|
|
"entropy": 5.050960731506348,
|
|
"epoch": 3.5201729106628243,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00037981155599043997,
|
|
"loss": 4.5951,
|
|
"mean_token_accuracy": 0.2526869669556618,
|
|
"num_tokens": 84014431.0,
|
|
"step": 36645
|
|
},
|
|
{
|
|
"entropy": 5.020537710189819,
|
|
"epoch": 3.520653218059558,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003797812181233657,
|
|
"loss": 4.6004,
|
|
"mean_token_accuracy": 0.25224809944629667,
|
|
"num_tokens": 84026024.0,
|
|
"step": 36650
|
|
},
|
|
{
|
|
"entropy": 5.036512231826782,
|
|
"epoch": 3.521133525456292,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00037975087782358264,
|
|
"loss": 4.6183,
|
|
"mean_token_accuracy": 0.2524242863059044,
|
|
"num_tokens": 84037533.0,
|
|
"step": 36655
|
|
},
|
|
{
|
|
"entropy": 5.137826156616211,
|
|
"epoch": 3.521613832853026,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003797205350917953,
|
|
"loss": 4.6684,
|
|
"mean_token_accuracy": 0.24987543672323226,
|
|
"num_tokens": 84049065.0,
|
|
"step": 36660
|
|
},
|
|
{
|
|
"entropy": 5.028875827789307,
|
|
"epoch": 3.52209414024976,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037969018992870814,
|
|
"loss": 4.5778,
|
|
"mean_token_accuracy": 0.25814450681209566,
|
|
"num_tokens": 84060339.0,
|
|
"step": 36665
|
|
},
|
|
{
|
|
"entropy": 4.986513614654541,
|
|
"epoch": 3.5225744476464937,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00037965984233502553,
|
|
"loss": 4.6187,
|
|
"mean_token_accuracy": 0.2542843744158745,
|
|
"num_tokens": 84071470.0,
|
|
"step": 36670
|
|
},
|
|
{
|
|
"entropy": 5.0587029457092285,
|
|
"epoch": 3.5230547550432276,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0003796294923114522,
|
|
"loss": 4.6675,
|
|
"mean_token_accuracy": 0.24480811953544618,
|
|
"num_tokens": 84082831.0,
|
|
"step": 36675
|
|
},
|
|
{
|
|
"entropy": 5.03707799911499,
|
|
"epoch": 3.5235350624399615,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037959913985869276,
|
|
"loss": 4.5934,
|
|
"mean_token_accuracy": 0.25409266650676726,
|
|
"num_tokens": 84094547.0,
|
|
"step": 36680
|
|
},
|
|
{
|
|
"entropy": 5.046794700622558,
|
|
"epoch": 3.5240153698366954,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037956878497745185,
|
|
"loss": 4.5725,
|
|
"mean_token_accuracy": 0.2502446725964546,
|
|
"num_tokens": 84106117.0,
|
|
"step": 36685
|
|
},
|
|
{
|
|
"entropy": 4.97091121673584,
|
|
"epoch": 3.5244956772334293,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003795384276684343,
|
|
"loss": 4.5292,
|
|
"mean_token_accuracy": 0.2588286682963371,
|
|
"num_tokens": 84116654.0,
|
|
"step": 36690
|
|
},
|
|
{
|
|
"entropy": 5.002938985824585,
|
|
"epoch": 3.524975984630163,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003795080679323449,
|
|
"loss": 4.6478,
|
|
"mean_token_accuracy": 0.25334174931049347,
|
|
"num_tokens": 84128217.0,
|
|
"step": 36695
|
|
},
|
|
{
|
|
"entropy": 5.067176151275635,
|
|
"epoch": 3.525456292026897,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003794777057698884,
|
|
"loss": 4.5985,
|
|
"mean_token_accuracy": 0.24770137816667556,
|
|
"num_tokens": 84139638.0,
|
|
"step": 36700
|
|
},
|
|
{
|
|
"entropy": 5.1118241310119625,
|
|
"epoch": 3.5259365994236314,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003794473411817699,
|
|
"loss": 4.6681,
|
|
"mean_token_accuracy": 0.2485482022166252,
|
|
"num_tokens": 84150945.0,
|
|
"step": 36705
|
|
},
|
|
{
|
|
"entropy": 5.0139305114746096,
|
|
"epoch": 3.526416906820365,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003794169741686942,
|
|
"loss": 4.5229,
|
|
"mean_token_accuracy": 0.2580597519874573,
|
|
"num_tokens": 84163443.0,
|
|
"step": 36710
|
|
},
|
|
{
|
|
"entropy": 5.072852373123169,
|
|
"epoch": 3.526897214217099,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003793866047313665,
|
|
"loss": 4.7043,
|
|
"mean_token_accuracy": 0.24534917920827864,
|
|
"num_tokens": 84174291.0,
|
|
"step": 36715
|
|
},
|
|
{
|
|
"entropy": 4.9855207920074465,
|
|
"epoch": 3.527377521613833,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037935623287049173,
|
|
"loss": 4.5431,
|
|
"mean_token_accuracy": 0.2597886070609093,
|
|
"num_tokens": 84185331.0,
|
|
"step": 36720
|
|
},
|
|
{
|
|
"entropy": 4.9538086414337155,
|
|
"epoch": 3.527857829010567,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037932585858677515,
|
|
"loss": 4.5523,
|
|
"mean_token_accuracy": 0.25631087869405744,
|
|
"num_tokens": 84196930.0,
|
|
"step": 36725
|
|
},
|
|
{
|
|
"entropy": 4.931323623657226,
|
|
"epoch": 3.5283381364073008,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00037929548188092186,
|
|
"loss": 4.5388,
|
|
"mean_token_accuracy": 0.25825867205858233,
|
|
"num_tokens": 84208635.0,
|
|
"step": 36730
|
|
},
|
|
{
|
|
"entropy": 5.049507761001587,
|
|
"epoch": 3.5288184438040346,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00037926510275363725,
|
|
"loss": 4.5847,
|
|
"mean_token_accuracy": 0.2578253448009491,
|
|
"num_tokens": 84219392.0,
|
|
"step": 36735
|
|
},
|
|
{
|
|
"entropy": 5.012065982818603,
|
|
"epoch": 3.5292987512007685,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003792347212056265,
|
|
"loss": 4.5297,
|
|
"mean_token_accuracy": 0.25729946941137316,
|
|
"num_tokens": 84230540.0,
|
|
"step": 36740
|
|
},
|
|
{
|
|
"entropy": 5.062520503997803,
|
|
"epoch": 3.5297790585975024,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00037920433723759506,
|
|
"loss": 4.5931,
|
|
"mean_token_accuracy": 0.24887881577014923,
|
|
"num_tokens": 84241339.0,
|
|
"step": 36745
|
|
},
|
|
{
|
|
"entropy": 5.038359689712524,
|
|
"epoch": 3.5302593659942363,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003791739508502483,
|
|
"loss": 4.6166,
|
|
"mean_token_accuracy": 0.24784543365240097,
|
|
"num_tokens": 84252658.0,
|
|
"step": 36750
|
|
},
|
|
{
|
|
"entropy": 4.977032232284546,
|
|
"epoch": 3.53073967339097,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00037914356204429183,
|
|
"loss": 4.5898,
|
|
"mean_token_accuracy": 0.24956208914518357,
|
|
"num_tokens": 84264522.0,
|
|
"step": 36755
|
|
},
|
|
{
|
|
"entropy": 5.074520635604858,
|
|
"epoch": 3.531219980787704,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00037911317082043103,
|
|
"loss": 4.6627,
|
|
"mean_token_accuracy": 0.2505639612674713,
|
|
"num_tokens": 84276330.0,
|
|
"step": 36760
|
|
},
|
|
{
|
|
"entropy": 5.163361597061157,
|
|
"epoch": 3.531700288184438,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003790827771793715,
|
|
"loss": 4.6839,
|
|
"mean_token_accuracy": 0.24179596602916717,
|
|
"num_tokens": 84287288.0,
|
|
"step": 36765
|
|
},
|
|
{
|
|
"entropy": 4.988665628433227,
|
|
"epoch": 3.532180595581172,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000379052381121819,
|
|
"loss": 4.553,
|
|
"mean_token_accuracy": 0.25441337525844576,
|
|
"num_tokens": 84297472.0,
|
|
"step": 36770
|
|
},
|
|
{
|
|
"entropy": 4.942410516738891,
|
|
"epoch": 3.5326609029779057,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003790219826484792,
|
|
"loss": 4.5382,
|
|
"mean_token_accuracy": 0.26187110096216204,
|
|
"num_tokens": 84308726.0,
|
|
"step": 36775
|
|
},
|
|
{
|
|
"entropy": 5.0816575050354,
|
|
"epoch": 3.53314121037464,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037899158176005784,
|
|
"loss": 4.705,
|
|
"mean_token_accuracy": 0.2439972683787346,
|
|
"num_tokens": 84320439.0,
|
|
"step": 36780
|
|
},
|
|
{
|
|
"entropy": 5.086714601516723,
|
|
"epoch": 3.5336215177713735,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037896117845726067,
|
|
"loss": 4.676,
|
|
"mean_token_accuracy": 0.2541764095425606,
|
|
"num_tokens": 84331321.0,
|
|
"step": 36785
|
|
},
|
|
{
|
|
"entropy": 5.114646244049072,
|
|
"epoch": 3.534101825168108,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00037893077274079375,
|
|
"loss": 4.7193,
|
|
"mean_token_accuracy": 0.2458638995885849,
|
|
"num_tokens": 84344095.0,
|
|
"step": 36790
|
|
},
|
|
{
|
|
"entropy": 5.069796895980835,
|
|
"epoch": 3.5345821325648417,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00037890036461136285,
|
|
"loss": 4.6778,
|
|
"mean_token_accuracy": 0.24402085989713668,
|
|
"num_tokens": 84355254.0,
|
|
"step": 36795
|
|
},
|
|
{
|
|
"entropy": 5.002780771255493,
|
|
"epoch": 3.5350624399615755,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000378869954069674,
|
|
"loss": 4.4964,
|
|
"mean_token_accuracy": 0.263630835711956,
|
|
"num_tokens": 84365739.0,
|
|
"step": 36800
|
|
},
|
|
{
|
|
"entropy": 5.009192323684692,
|
|
"epoch": 3.5355427473583094,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003788395411164332,
|
|
"loss": 4.6645,
|
|
"mean_token_accuracy": 0.2528246596455574,
|
|
"num_tokens": 84377881.0,
|
|
"step": 36805
|
|
},
|
|
{
|
|
"entropy": 5.039995288848877,
|
|
"epoch": 3.5360230547550433,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00037880912575234666,
|
|
"loss": 4.6179,
|
|
"mean_token_accuracy": 0.2573153257369995,
|
|
"num_tokens": 84388748.0,
|
|
"step": 36810
|
|
},
|
|
{
|
|
"entropy": 5.046235084533691,
|
|
"epoch": 3.536503362151777,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003787787079781205,
|
|
"loss": 4.583,
|
|
"mean_token_accuracy": 0.25929058343172073,
|
|
"num_tokens": 84399056.0,
|
|
"step": 36815
|
|
},
|
|
{
|
|
"entropy": 5.120830726623535,
|
|
"epoch": 3.536983669548511,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003787482877944608,
|
|
"loss": 4.6303,
|
|
"mean_token_accuracy": 0.251022969186306,
|
|
"num_tokens": 84411441.0,
|
|
"step": 36820
|
|
},
|
|
{
|
|
"entropy": 5.04539909362793,
|
|
"epoch": 3.537463976945245,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000378717865202074,
|
|
"loss": 4.6904,
|
|
"mean_token_accuracy": 0.24597904533147813,
|
|
"num_tokens": 84423430.0,
|
|
"step": 36825
|
|
},
|
|
{
|
|
"entropy": 5.112783527374267,
|
|
"epoch": 3.537944284341979,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00037868744020166635,
|
|
"loss": 4.7274,
|
|
"mean_token_accuracy": 0.24161627292633056,
|
|
"num_tokens": 84435002.0,
|
|
"step": 36830
|
|
},
|
|
{
|
|
"entropy": 5.0408453941345215,
|
|
"epoch": 3.5384245917387127,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00037865701279394414,
|
|
"loss": 4.5827,
|
|
"mean_token_accuracy": 0.2553410977125168,
|
|
"num_tokens": 84446858.0,
|
|
"step": 36835
|
|
},
|
|
{
|
|
"entropy": 5.002702903747559,
|
|
"epoch": 3.5389048991354466,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037862658297961393,
|
|
"loss": 4.5933,
|
|
"mean_token_accuracy": 0.2602873742580414,
|
|
"num_tokens": 84457603.0,
|
|
"step": 36840
|
|
},
|
|
{
|
|
"entropy": 5.051372337341308,
|
|
"epoch": 3.5393852065321805,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003785961507593823,
|
|
"loss": 4.6416,
|
|
"mean_token_accuracy": 0.24970394372940063,
|
|
"num_tokens": 84468755.0,
|
|
"step": 36845
|
|
},
|
|
{
|
|
"entropy": 5.047720575332642,
|
|
"epoch": 3.5398655139289144,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003785657161339555,
|
|
"loss": 4.5913,
|
|
"mean_token_accuracy": 0.2545558258891106,
|
|
"num_tokens": 84480248.0,
|
|
"step": 36850
|
|
},
|
|
{
|
|
"entropy": 5.048943901062012,
|
|
"epoch": 3.5403458213256487,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00037853527910404033,
|
|
"loss": 4.6733,
|
|
"mean_token_accuracy": 0.24693433344364166,
|
|
"num_tokens": 84493290.0,
|
|
"step": 36855
|
|
},
|
|
{
|
|
"entropy": 5.025546646118164,
|
|
"epoch": 3.540826128722382,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003785048396703435,
|
|
"loss": 4.6092,
|
|
"mean_token_accuracy": 0.24675378650426866,
|
|
"num_tokens": 84505353.0,
|
|
"step": 36860
|
|
},
|
|
{
|
|
"entropy": 5.148855400085449,
|
|
"epoch": 3.5413064361191164,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00037847439783357165,
|
|
"loss": 4.7485,
|
|
"mean_token_accuracy": 0.24303271919488906,
|
|
"num_tokens": 84517871.0,
|
|
"step": 36865
|
|
},
|
|
{
|
|
"entropy": 4.984575271606445,
|
|
"epoch": 3.54178674351585,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003784439535944314,
|
|
"loss": 4.4944,
|
|
"mean_token_accuracy": 0.2602106288075447,
|
|
"num_tokens": 84528513.0,
|
|
"step": 36870
|
|
},
|
|
{
|
|
"entropy": 5.14154577255249,
|
|
"epoch": 3.542267050912584,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003784135069536298,
|
|
"loss": 4.7179,
|
|
"mean_token_accuracy": 0.2382369577884674,
|
|
"num_tokens": 84539990.0,
|
|
"step": 36875
|
|
},
|
|
{
|
|
"entropy": 5.039270544052124,
|
|
"epoch": 3.542747358309318,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00037838305791187363,
|
|
"loss": 4.5571,
|
|
"mean_token_accuracy": 0.2567957803606987,
|
|
"num_tokens": 84551218.0,
|
|
"step": 36880
|
|
},
|
|
{
|
|
"entropy": 4.988976240158081,
|
|
"epoch": 3.543227665706052,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00037835260646986986,
|
|
"loss": 4.5103,
|
|
"mean_token_accuracy": 0.25676298439502715,
|
|
"num_tokens": 84561672.0,
|
|
"step": 36885
|
|
},
|
|
{
|
|
"entropy": 4.927873373031616,
|
|
"epoch": 3.543707973102786,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00037832215262832545,
|
|
"loss": 4.5871,
|
|
"mean_token_accuracy": 0.2593684375286102,
|
|
"num_tokens": 84574115.0,
|
|
"step": 36890
|
|
},
|
|
{
|
|
"entropy": 5.036719226837159,
|
|
"epoch": 3.5441882804995197,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003782916963879474,
|
|
"loss": 4.6292,
|
|
"mean_token_accuracy": 0.24902212023735046,
|
|
"num_tokens": 84586014.0,
|
|
"step": 36895
|
|
},
|
|
{
|
|
"entropy": 5.020965814590454,
|
|
"epoch": 3.5446685878962536,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000378261237749443,
|
|
"loss": 4.5792,
|
|
"mean_token_accuracy": 0.2538844972848892,
|
|
"num_tokens": 84597720.0,
|
|
"step": 36900
|
|
},
|
|
{
|
|
"entropy": 4.953783655166626,
|
|
"epoch": 3.5451488952929875,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003782307767135192,
|
|
"loss": 4.5584,
|
|
"mean_token_accuracy": 0.25757319331169126,
|
|
"num_tokens": 84609610.0,
|
|
"step": 36905
|
|
},
|
|
{
|
|
"entropy": 5.0174788475036625,
|
|
"epoch": 3.5456292026897214,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037820031328088325,
|
|
"loss": 4.5866,
|
|
"mean_token_accuracy": 0.2585786610841751,
|
|
"num_tokens": 84619806.0,
|
|
"step": 36910
|
|
},
|
|
{
|
|
"entropy": 4.963759565353394,
|
|
"epoch": 3.5461095100864553,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003781698474522425,
|
|
"loss": 4.529,
|
|
"mean_token_accuracy": 0.26101961433887483,
|
|
"num_tokens": 84631030.0,
|
|
"step": 36915
|
|
},
|
|
{
|
|
"entropy": 5.012641477584839,
|
|
"epoch": 3.546589817483189,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00037813937922830435,
|
|
"loss": 4.6137,
|
|
"mean_token_accuracy": 0.24934932291507722,
|
|
"num_tokens": 84643914.0,
|
|
"step": 36920
|
|
},
|
|
{
|
|
"entropy": 5.026036262512207,
|
|
"epoch": 3.547070124879923,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00037810890860977595,
|
|
"loss": 4.5703,
|
|
"mean_token_accuracy": 0.24540866613388063,
|
|
"num_tokens": 84655055.0,
|
|
"step": 36925
|
|
},
|
|
{
|
|
"entropy": 5.028929090499878,
|
|
"epoch": 3.547550432276657,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00037807843559736494,
|
|
"loss": 4.6225,
|
|
"mean_token_accuracy": 0.25080355554819106,
|
|
"num_tokens": 84667840.0,
|
|
"step": 36930
|
|
},
|
|
{
|
|
"entropy": 5.007719802856445,
|
|
"epoch": 3.5480307396733908,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00037804796019177874,
|
|
"loss": 4.5854,
|
|
"mean_token_accuracy": 0.2512981817126274,
|
|
"num_tokens": 84678412.0,
|
|
"step": 36935
|
|
},
|
|
{
|
|
"entropy": 4.957797288894653,
|
|
"epoch": 3.548511047070125,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003780174823937248,
|
|
"loss": 4.51,
|
|
"mean_token_accuracy": 0.2647060438990593,
|
|
"num_tokens": 84688725.0,
|
|
"step": 36940
|
|
},
|
|
{
|
|
"entropy": 5.077688121795655,
|
|
"epoch": 3.5489913544668585,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 0.00037798700220391087,
|
|
"loss": 4.7102,
|
|
"mean_token_accuracy": 0.2515874519944191,
|
|
"num_tokens": 84702489.0,
|
|
"step": 36945
|
|
},
|
|
{
|
|
"entropy": 4.983448028564453,
|
|
"epoch": 3.549471661863593,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00037795651962304456,
|
|
"loss": 4.486,
|
|
"mean_token_accuracy": 0.26588965207338333,
|
|
"num_tokens": 84712933.0,
|
|
"step": 36950
|
|
},
|
|
{
|
|
"entropy": 4.982404184341431,
|
|
"epoch": 3.5499519692603267,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00037792603465183354,
|
|
"loss": 4.6035,
|
|
"mean_token_accuracy": 0.24843207001686096,
|
|
"num_tokens": 84723540.0,
|
|
"step": 36955
|
|
},
|
|
{
|
|
"entropy": 5.013006210327148,
|
|
"epoch": 3.5504322766570606,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003778955472909857,
|
|
"loss": 4.6814,
|
|
"mean_token_accuracy": 0.2424795523285866,
|
|
"num_tokens": 84735671.0,
|
|
"step": 36960
|
|
},
|
|
{
|
|
"entropy": 5.150403833389282,
|
|
"epoch": 3.5509125840537945,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00037786505754120874,
|
|
"loss": 4.7535,
|
|
"mean_token_accuracy": 0.23617349117994307,
|
|
"num_tokens": 84746719.0,
|
|
"step": 36965
|
|
},
|
|
{
|
|
"entropy": 5.069598388671875,
|
|
"epoch": 3.5513928914505284,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037783456540321065,
|
|
"loss": 4.683,
|
|
"mean_token_accuracy": 0.24379048496484756,
|
|
"num_tokens": 84758774.0,
|
|
"step": 36970
|
|
},
|
|
{
|
|
"entropy": 5.019801664352417,
|
|
"epoch": 3.5518731988472623,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.00037780407087769925,
|
|
"loss": 4.6429,
|
|
"mean_token_accuracy": 0.24591390192508697,
|
|
"num_tokens": 84771047.0,
|
|
"step": 36975
|
|
},
|
|
{
|
|
"entropy": 5.004588460922241,
|
|
"epoch": 3.552353506243996,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003777735739653826,
|
|
"loss": 4.5949,
|
|
"mean_token_accuracy": 0.25586841255426407,
|
|
"num_tokens": 84781249.0,
|
|
"step": 36980
|
|
},
|
|
{
|
|
"entropy": 4.941749191284179,
|
|
"epoch": 3.55283381364073,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003777430746669687,
|
|
"loss": 4.4876,
|
|
"mean_token_accuracy": 0.25815171003341675,
|
|
"num_tokens": 84791722.0,
|
|
"step": 36985
|
|
},
|
|
{
|
|
"entropy": 5.042249345779419,
|
|
"epoch": 3.553314121037464,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00037771257298316573,
|
|
"loss": 4.6573,
|
|
"mean_token_accuracy": 0.2506411150097847,
|
|
"num_tokens": 84804327.0,
|
|
"step": 36990
|
|
},
|
|
{
|
|
"entropy": 4.985011577606201,
|
|
"epoch": 3.553794428434198,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003776820689146818,
|
|
"loss": 4.5817,
|
|
"mean_token_accuracy": 0.2566316545009613,
|
|
"num_tokens": 84814381.0,
|
|
"step": 36995
|
|
},
|
|
{
|
|
"entropy": 5.001188325881958,
|
|
"epoch": 3.5542747358309317,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00037765156246222513,
|
|
"loss": 4.5749,
|
|
"mean_token_accuracy": 0.2628608211874962,
|
|
"num_tokens": 84826274.0,
|
|
"step": 37000
|
|
},
|
|
{
|
|
"entropy": 5.04213433265686,
|
|
"epoch": 3.5547550432276656,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00037762105362650403,
|
|
"loss": 4.575,
|
|
"mean_token_accuracy": 0.24804353713989258,
|
|
"num_tokens": 84838946.0,
|
|
"step": 37005
|
|
},
|
|
{
|
|
"entropy": 5.165134382247925,
|
|
"epoch": 3.5552353506243994,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.00037759054240822675,
|
|
"loss": 4.6802,
|
|
"mean_token_accuracy": 0.2384137764573097,
|
|
"num_tokens": 84851767.0,
|
|
"step": 37010
|
|
},
|
|
{
|
|
"entropy": 5.0057206630706785,
|
|
"epoch": 3.5557156580211338,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003775600288081018,
|
|
"loss": 4.5775,
|
|
"mean_token_accuracy": 0.2535111531615257,
|
|
"num_tokens": 84863712.0,
|
|
"step": 37015
|
|
},
|
|
{
|
|
"entropy": 4.980873346328735,
|
|
"epoch": 3.556195965417867,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003775295128268374,
|
|
"loss": 4.5377,
|
|
"mean_token_accuracy": 0.26317687928676603,
|
|
"num_tokens": 84875699.0,
|
|
"step": 37020
|
|
},
|
|
{
|
|
"entropy": 5.0328755378723145,
|
|
"epoch": 3.5566762728146015,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037749899446514216,
|
|
"loss": 4.5753,
|
|
"mean_token_accuracy": 0.25335094779729844,
|
|
"num_tokens": 84886772.0,
|
|
"step": 37025
|
|
},
|
|
{
|
|
"entropy": 4.988605213165283,
|
|
"epoch": 3.5571565802113354,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003774684737237247,
|
|
"loss": 4.6284,
|
|
"mean_token_accuracy": 0.2503588393330574,
|
|
"num_tokens": 84898501.0,
|
|
"step": 37030
|
|
},
|
|
{
|
|
"entropy": 5.084286451339722,
|
|
"epoch": 3.5576368876080693,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003774379506032935,
|
|
"loss": 4.6846,
|
|
"mean_token_accuracy": 0.24612823575735093,
|
|
"num_tokens": 84908418.0,
|
|
"step": 37035
|
|
},
|
|
{
|
|
"entropy": 5.067788887023926,
|
|
"epoch": 3.558117195004803,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037740742510455726,
|
|
"loss": 4.6521,
|
|
"mean_token_accuracy": 0.25088565796613693,
|
|
"num_tokens": 84919792.0,
|
|
"step": 37040
|
|
},
|
|
{
|
|
"entropy": 5.023893690109253,
|
|
"epoch": 3.558597502401537,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003773768972282248,
|
|
"loss": 4.6245,
|
|
"mean_token_accuracy": 0.25762687623500824,
|
|
"num_tokens": 84931606.0,
|
|
"step": 37045
|
|
},
|
|
{
|
|
"entropy": 4.990866374969483,
|
|
"epoch": 3.559077809798271,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003773463669750046,
|
|
"loss": 4.5781,
|
|
"mean_token_accuracy": 0.2607622638344765,
|
|
"num_tokens": 84943099.0,
|
|
"step": 37050
|
|
},
|
|
{
|
|
"entropy": 5.099212741851806,
|
|
"epoch": 3.559558117195005,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003773158343456058,
|
|
"loss": 4.7177,
|
|
"mean_token_accuracy": 0.23920221775770187,
|
|
"num_tokens": 84954444.0,
|
|
"step": 37055
|
|
},
|
|
{
|
|
"entropy": 5.040729522705078,
|
|
"epoch": 3.5600384245917387,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00037728529934073714,
|
|
"loss": 4.5348,
|
|
"mean_token_accuracy": 0.25703936964273455,
|
|
"num_tokens": 84965342.0,
|
|
"step": 37060
|
|
},
|
|
{
|
|
"entropy": 5.017491054534912,
|
|
"epoch": 3.5605187319884726,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003772547619611075,
|
|
"loss": 4.6071,
|
|
"mean_token_accuracy": 0.25171990394592286,
|
|
"num_tokens": 84977685.0,
|
|
"step": 37065
|
|
},
|
|
{
|
|
"entropy": 5.084903192520142,
|
|
"epoch": 3.5609990393852065,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003772242222074259,
|
|
"loss": 4.6878,
|
|
"mean_token_accuracy": 0.23777244836091996,
|
|
"num_tokens": 84990148.0,
|
|
"step": 37070
|
|
},
|
|
{
|
|
"entropy": 5.169915199279785,
|
|
"epoch": 3.5614793467819403,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037719368008040145,
|
|
"loss": 4.7272,
|
|
"mean_token_accuracy": 0.24931265711784362,
|
|
"num_tokens": 85001611.0,
|
|
"step": 37075
|
|
},
|
|
{
|
|
"entropy": 5.152602958679199,
|
|
"epoch": 3.561959654178674,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00037716313558074327,
|
|
"loss": 4.7025,
|
|
"mean_token_accuracy": 0.24466303288936614,
|
|
"num_tokens": 85012558.0,
|
|
"step": 37080
|
|
},
|
|
{
|
|
"entropy": 5.056896352767945,
|
|
"epoch": 3.562439961575408,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003771325887091603,
|
|
"loss": 4.5796,
|
|
"mean_token_accuracy": 0.2557868078351021,
|
|
"num_tokens": 85022102.0,
|
|
"step": 37085
|
|
},
|
|
{
|
|
"entropy": 4.984315061569214,
|
|
"epoch": 3.5629202689721424,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000377102039466362,
|
|
"loss": 4.5249,
|
|
"mean_token_accuracy": 0.2571453794836998,
|
|
"num_tokens": 85033497.0,
|
|
"step": 37090
|
|
},
|
|
{
|
|
"entropy": 5.009184837341309,
|
|
"epoch": 3.563400576368876,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003770714878530575,
|
|
"loss": 4.6023,
|
|
"mean_token_accuracy": 0.25531150698661803,
|
|
"num_tokens": 85044938.0,
|
|
"step": 37095
|
|
},
|
|
{
|
|
"entropy": 5.052514600753784,
|
|
"epoch": 3.56388088376561,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00037704093386995607,
|
|
"loss": 4.581,
|
|
"mean_token_accuracy": 0.25671431720256804,
|
|
"num_tokens": 85056545.0,
|
|
"step": 37100
|
|
},
|
|
{
|
|
"entropy": 4.987484169006348,
|
|
"epoch": 3.5643611911623436,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003770103775177673,
|
|
"loss": 4.5097,
|
|
"mean_token_accuracy": 0.2655107229948044,
|
|
"num_tokens": 85067660.0,
|
|
"step": 37105
|
|
},
|
|
{
|
|
"entropy": 4.979053592681884,
|
|
"epoch": 3.564841498559078,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00037697981879720034,
|
|
"loss": 4.5637,
|
|
"mean_token_accuracy": 0.2577657073736191,
|
|
"num_tokens": 85078813.0,
|
|
"step": 37110
|
|
},
|
|
{
|
|
"entropy": 5.094251823425293,
|
|
"epoch": 3.565321805955812,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037694925770896486,
|
|
"loss": 4.6309,
|
|
"mean_token_accuracy": 0.24665430635213853,
|
|
"num_tokens": 85089473.0,
|
|
"step": 37115
|
|
},
|
|
{
|
|
"entropy": 5.145241355895996,
|
|
"epoch": 3.5658021133525457,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00037691869425377037,
|
|
"loss": 4.7428,
|
|
"mean_token_accuracy": 0.24423769861459732,
|
|
"num_tokens": 85101325.0,
|
|
"step": 37120
|
|
},
|
|
{
|
|
"entropy": 5.082738590240479,
|
|
"epoch": 3.5662824207492796,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003768881284323263,
|
|
"loss": 4.6947,
|
|
"mean_token_accuracy": 0.24213242381811143,
|
|
"num_tokens": 85112601.0,
|
|
"step": 37125
|
|
},
|
|
{
|
|
"entropy": 4.952612400054932,
|
|
"epoch": 3.5667627281460135,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003768575602453426,
|
|
"loss": 4.5833,
|
|
"mean_token_accuracy": 0.2567975550889969,
|
|
"num_tokens": 85123189.0,
|
|
"step": 37130
|
|
},
|
|
{
|
|
"entropy": 5.044677114486694,
|
|
"epoch": 3.5672430355427474,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00037682698969352876,
|
|
"loss": 4.5931,
|
|
"mean_token_accuracy": 0.25257644802331924,
|
|
"num_tokens": 85133575.0,
|
|
"step": 37135
|
|
},
|
|
{
|
|
"entropy": 5.087544393539429,
|
|
"epoch": 3.5677233429394812,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003767964167775946,
|
|
"loss": 4.5899,
|
|
"mean_token_accuracy": 0.25823838710784913,
|
|
"num_tokens": 85144549.0,
|
|
"step": 37140
|
|
},
|
|
{
|
|
"entropy": 5.052699327468872,
|
|
"epoch": 3.568203650336215,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003767658414982498,
|
|
"loss": 4.6436,
|
|
"mean_token_accuracy": 0.2586609095335007,
|
|
"num_tokens": 85155715.0,
|
|
"step": 37145
|
|
},
|
|
{
|
|
"entropy": 5.058721923828125,
|
|
"epoch": 3.568683957732949,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003767352638562044,
|
|
"loss": 4.6452,
|
|
"mean_token_accuracy": 0.24414433240890504,
|
|
"num_tokens": 85168268.0,
|
|
"step": 37150
|
|
},
|
|
{
|
|
"entropy": 5.034001064300537,
|
|
"epoch": 3.569164265129683,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003767046838521683,
|
|
"loss": 4.6335,
|
|
"mean_token_accuracy": 0.25023345947265624,
|
|
"num_tokens": 85181326.0,
|
|
"step": 37155
|
|
},
|
|
{
|
|
"entropy": 4.992013835906983,
|
|
"epoch": 3.5696445725264168,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003766741014868515,
|
|
"loss": 4.5138,
|
|
"mean_token_accuracy": 0.2572570309042931,
|
|
"num_tokens": 85191640.0,
|
|
"step": 37160
|
|
},
|
|
{
|
|
"entropy": 5.016417551040649,
|
|
"epoch": 3.570124879923151,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00037664351676096385,
|
|
"loss": 4.5905,
|
|
"mean_token_accuracy": 0.2565168783068657,
|
|
"num_tokens": 85202801.0,
|
|
"step": 37165
|
|
},
|
|
{
|
|
"entropy": 5.0061256885528564,
|
|
"epoch": 3.5706051873198845,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037661292967521563,
|
|
"loss": 4.5282,
|
|
"mean_token_accuracy": 0.25236859917640686,
|
|
"num_tokens": 85213833.0,
|
|
"step": 37170
|
|
},
|
|
{
|
|
"entropy": 5.0527002811431885,
|
|
"epoch": 3.571085494716619,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00037658234023031694,
|
|
"loss": 4.5785,
|
|
"mean_token_accuracy": 0.26021457463502884,
|
|
"num_tokens": 85223856.0,
|
|
"step": 37175
|
|
},
|
|
{
|
|
"entropy": 5.051439571380615,
|
|
"epoch": 3.5715658021133523,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003765517484269779,
|
|
"loss": 4.5792,
|
|
"mean_token_accuracy": 0.2511450290679932,
|
|
"num_tokens": 85236282.0,
|
|
"step": 37180
|
|
},
|
|
{
|
|
"entropy": 4.985372638702392,
|
|
"epoch": 3.5720461095100866,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00037652115426590874,
|
|
"loss": 4.5982,
|
|
"mean_token_accuracy": 0.2509107679128647,
|
|
"num_tokens": 85247430.0,
|
|
"step": 37185
|
|
},
|
|
{
|
|
"entropy": 5.099769544601441,
|
|
"epoch": 3.5725264169068205,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037649055774782,
|
|
"loss": 4.6852,
|
|
"mean_token_accuracy": 0.24412306100130082,
|
|
"num_tokens": 85259091.0,
|
|
"step": 37190
|
|
},
|
|
{
|
|
"entropy": 5.016952276229858,
|
|
"epoch": 3.5730067243035544,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003764599588734218,
|
|
"loss": 4.5304,
|
|
"mean_token_accuracy": 0.2607806399464607,
|
|
"num_tokens": 85270686.0,
|
|
"step": 37195
|
|
},
|
|
{
|
|
"entropy": 4.988932466506958,
|
|
"epoch": 3.5734870317002883,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00037642935764342454,
|
|
"loss": 4.5981,
|
|
"mean_token_accuracy": 0.2493707537651062,
|
|
"num_tokens": 85281419.0,
|
|
"step": 37200
|
|
},
|
|
{
|
|
"entropy": 5.008523941040039,
|
|
"epoch": 3.573967339097022,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003763987540585389,
|
|
"loss": 4.6071,
|
|
"mean_token_accuracy": 0.25219342559576036,
|
|
"num_tokens": 85293116.0,
|
|
"step": 37205
|
|
},
|
|
{
|
|
"entropy": 5.140900945663452,
|
|
"epoch": 3.574447646493756,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003763681481194752,
|
|
"loss": 4.6536,
|
|
"mean_token_accuracy": 0.2501092880964279,
|
|
"num_tokens": 85304567.0,
|
|
"step": 37210
|
|
},
|
|
{
|
|
"entropy": 5.018571567535401,
|
|
"epoch": 3.57492795389049,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00037633753982694413,
|
|
"loss": 4.6198,
|
|
"mean_token_accuracy": 0.24950784295797349,
|
|
"num_tokens": 85315947.0,
|
|
"step": 37215
|
|
},
|
|
{
|
|
"entropy": 5.017896223068237,
|
|
"epoch": 3.5754082612872238,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037630692918165634,
|
|
"loss": 4.6509,
|
|
"mean_token_accuracy": 0.24630186408758165,
|
|
"num_tokens": 85327309.0,
|
|
"step": 37220
|
|
},
|
|
{
|
|
"entropy": 5.035592555999756,
|
|
"epoch": 3.5758885686839577,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003762763161843224,
|
|
"loss": 4.678,
|
|
"mean_token_accuracy": 0.25260578840970993,
|
|
"num_tokens": 85339034.0,
|
|
"step": 37225
|
|
},
|
|
{
|
|
"entropy": 5.059793186187744,
|
|
"epoch": 3.5763688760806915,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003762457008356531,
|
|
"loss": 4.6055,
|
|
"mean_token_accuracy": 0.25138003528118136,
|
|
"num_tokens": 85351764.0,
|
|
"step": 37230
|
|
},
|
|
{
|
|
"entropy": 4.986952495574951,
|
|
"epoch": 3.5768491834774254,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00037621508313635945,
|
|
"loss": 4.5218,
|
|
"mean_token_accuracy": 0.2570299327373505,
|
|
"num_tokens": 85363234.0,
|
|
"step": 37235
|
|
},
|
|
{
|
|
"entropy": 4.9970334529876705,
|
|
"epoch": 3.5773294908741593,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003761844630871519,
|
|
"loss": 4.5399,
|
|
"mean_token_accuracy": 0.25671071708202364,
|
|
"num_tokens": 85374148.0,
|
|
"step": 37240
|
|
},
|
|
{
|
|
"entropy": 4.96151876449585,
|
|
"epoch": 3.577809798270893,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003761538406887417,
|
|
"loss": 4.5458,
|
|
"mean_token_accuracy": 0.25735546499490736,
|
|
"num_tokens": 85385952.0,
|
|
"step": 37245
|
|
},
|
|
{
|
|
"entropy": 5.064855861663818,
|
|
"epoch": 3.5782901056676275,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003761232159418397,
|
|
"loss": 4.6195,
|
|
"mean_token_accuracy": 0.2500826954841614,
|
|
"num_tokens": 85396182.0,
|
|
"step": 37250
|
|
},
|
|
{
|
|
"entropy": 5.041394853591919,
|
|
"epoch": 3.578770413064361,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003760925888471568,
|
|
"loss": 4.5713,
|
|
"mean_token_accuracy": 0.2532426804304123,
|
|
"num_tokens": 85408109.0,
|
|
"step": 37255
|
|
},
|
|
{
|
|
"entropy": 5.085792541503906,
|
|
"epoch": 3.5792507204610953,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003760619594054042,
|
|
"loss": 4.759,
|
|
"mean_token_accuracy": 0.24117132723331453,
|
|
"num_tokens": 85420622.0,
|
|
"step": 37260
|
|
},
|
|
{
|
|
"entropy": 5.009101104736328,
|
|
"epoch": 3.579731027857829,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00037603132761729303,
|
|
"loss": 4.542,
|
|
"mean_token_accuracy": 0.2536076888442039,
|
|
"num_tokens": 85431890.0,
|
|
"step": 37265
|
|
},
|
|
{
|
|
"entropy": 4.990065002441407,
|
|
"epoch": 3.580211335254563,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.0003760006934835344,
|
|
"loss": 4.5957,
|
|
"mean_token_accuracy": 0.24594600796699523,
|
|
"num_tokens": 85442953.0,
|
|
"step": 37270
|
|
},
|
|
{
|
|
"entropy": 5.049779462814331,
|
|
"epoch": 3.580691642651297,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003759700570048396,
|
|
"loss": 4.6767,
|
|
"mean_token_accuracy": 0.2523428946733475,
|
|
"num_tokens": 85454125.0,
|
|
"step": 37275
|
|
},
|
|
{
|
|
"entropy": 5.146895170211792,
|
|
"epoch": 3.581171950048031,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003759394181819199,
|
|
"loss": 4.7278,
|
|
"mean_token_accuracy": 0.24198162257671357,
|
|
"num_tokens": 85465162.0,
|
|
"step": 37280
|
|
},
|
|
{
|
|
"entropy": 5.023277664184571,
|
|
"epoch": 3.5816522574447647,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00037590877701548667,
|
|
"loss": 4.5115,
|
|
"mean_token_accuracy": 0.26065645962953565,
|
|
"num_tokens": 85475316.0,
|
|
"step": 37285
|
|
},
|
|
{
|
|
"entropy": 5.072000026702881,
|
|
"epoch": 3.5821325648414986,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003758781335062512,
|
|
"loss": 4.6508,
|
|
"mean_token_accuracy": 0.24775562882423402,
|
|
"num_tokens": 85487138.0,
|
|
"step": 37290
|
|
},
|
|
{
|
|
"entropy": 5.0447369575500485,
|
|
"epoch": 3.5826128722382324,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003758474876549251,
|
|
"loss": 4.5767,
|
|
"mean_token_accuracy": 0.25230991542339326,
|
|
"num_tokens": 85498141.0,
|
|
"step": 37295
|
|
},
|
|
{
|
|
"entropy": 4.9458356380462645,
|
|
"epoch": 3.5830931796349663,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00037581683946221974,
|
|
"loss": 4.5813,
|
|
"mean_token_accuracy": 0.2564206689596176,
|
|
"num_tokens": 85510123.0,
|
|
"step": 37300
|
|
},
|
|
{
|
|
"entropy": 5.0365574836730955,
|
|
"epoch": 3.5835734870317,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003757861889288467,
|
|
"loss": 4.6841,
|
|
"mean_token_accuracy": 0.23422487527132035,
|
|
"num_tokens": 85522003.0,
|
|
"step": 37305
|
|
},
|
|
{
|
|
"entropy": 5.09527735710144,
|
|
"epoch": 3.584053794428434,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00037575553605551765,
|
|
"loss": 4.6308,
|
|
"mean_token_accuracy": 0.24219636470079423,
|
|
"num_tokens": 85533400.0,
|
|
"step": 37310
|
|
},
|
|
{
|
|
"entropy": 4.967593669891357,
|
|
"epoch": 3.584534101825168,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00037572488084294423,
|
|
"loss": 4.5072,
|
|
"mean_token_accuracy": 0.2608699142932892,
|
|
"num_tokens": 85543393.0,
|
|
"step": 37315
|
|
},
|
|
{
|
|
"entropy": 4.922746992111206,
|
|
"epoch": 3.585014409221902,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00037569422329183824,
|
|
"loss": 4.5912,
|
|
"mean_token_accuracy": 0.25594458281993865,
|
|
"num_tokens": 85554596.0,
|
|
"step": 37320
|
|
},
|
|
{
|
|
"entropy": 5.067403411865234,
|
|
"epoch": 3.585494716618636,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003756635634029112,
|
|
"loss": 4.664,
|
|
"mean_token_accuracy": 0.2414974570274353,
|
|
"num_tokens": 85566144.0,
|
|
"step": 37325
|
|
},
|
|
{
|
|
"entropy": 5.091717052459717,
|
|
"epoch": 3.5859750240153696,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00037563290117687535,
|
|
"loss": 4.6337,
|
|
"mean_token_accuracy": 0.25361588448286054,
|
|
"num_tokens": 85576824.0,
|
|
"step": 37330
|
|
},
|
|
{
|
|
"entropy": 5.126081705093384,
|
|
"epoch": 3.586455331412104,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003756022366144423,
|
|
"loss": 4.6705,
|
|
"mean_token_accuracy": 0.24626718312501908,
|
|
"num_tokens": 85588719.0,
|
|
"step": 37335
|
|
},
|
|
{
|
|
"entropy": 5.001444244384766,
|
|
"epoch": 3.586935638808838,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003755715697163239,
|
|
"loss": 4.5009,
|
|
"mean_token_accuracy": 0.258174666762352,
|
|
"num_tokens": 85598783.0,
|
|
"step": 37340
|
|
},
|
|
{
|
|
"entropy": 5.031303882598877,
|
|
"epoch": 3.5874159462055717,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00037554090048323247,
|
|
"loss": 4.6913,
|
|
"mean_token_accuracy": 0.24459472745656968,
|
|
"num_tokens": 85610222.0,
|
|
"step": 37345
|
|
},
|
|
{
|
|
"entropy": 5.0451497554779055,
|
|
"epoch": 3.5878962536023056,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00037551022891587976,
|
|
"loss": 4.5987,
|
|
"mean_token_accuracy": 0.25357581228017806,
|
|
"num_tokens": 85620660.0,
|
|
"step": 37350
|
|
},
|
|
{
|
|
"entropy": 5.1529090881347654,
|
|
"epoch": 3.5883765609990395,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00037547955501497806,
|
|
"loss": 4.7347,
|
|
"mean_token_accuracy": 0.24109016507863998,
|
|
"num_tokens": 85632466.0,
|
|
"step": 37355
|
|
},
|
|
{
|
|
"entropy": 5.172725582122803,
|
|
"epoch": 3.5888568683957733,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003754488787812394,
|
|
"loss": 4.7585,
|
|
"mean_token_accuracy": 0.24347807615995407,
|
|
"num_tokens": 85643343.0,
|
|
"step": 37360
|
|
},
|
|
{
|
|
"entropy": 4.970874691009522,
|
|
"epoch": 3.589337175792507,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0003754182002153761,
|
|
"loss": 4.5529,
|
|
"mean_token_accuracy": 0.25904516875743866,
|
|
"num_tokens": 85655864.0,
|
|
"step": 37365
|
|
},
|
|
{
|
|
"entropy": 5.056605577468872,
|
|
"epoch": 3.589817483189241,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003753875193181003,
|
|
"loss": 4.7003,
|
|
"mean_token_accuracy": 0.24913085401058196,
|
|
"num_tokens": 85667729.0,
|
|
"step": 37370
|
|
},
|
|
{
|
|
"entropy": 5.075022840499878,
|
|
"epoch": 3.590297790585975,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00037535683609012446,
|
|
"loss": 4.6811,
|
|
"mean_token_accuracy": 0.2455104559659958,
|
|
"num_tokens": 85679252.0,
|
|
"step": 37375
|
|
},
|
|
{
|
|
"entropy": 5.007112169265747,
|
|
"epoch": 3.590778097982709,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00037532615053216084,
|
|
"loss": 4.6059,
|
|
"mean_token_accuracy": 0.2625422149896622,
|
|
"num_tokens": 85690598.0,
|
|
"step": 37380
|
|
},
|
|
{
|
|
"entropy": 5.017372560501099,
|
|
"epoch": 3.5912584053794427,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000375295462644922,
|
|
"loss": 4.6098,
|
|
"mean_token_accuracy": 0.25939144790172575,
|
|
"num_tokens": 85703537.0,
|
|
"step": 37385
|
|
},
|
|
{
|
|
"entropy": 5.231055164337159,
|
|
"epoch": 3.5917387127761766,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003752647724291202,
|
|
"loss": 4.818,
|
|
"mean_token_accuracy": 0.23571971654891968,
|
|
"num_tokens": 85717163.0,
|
|
"step": 37390
|
|
},
|
|
{
|
|
"entropy": 5.125612592697143,
|
|
"epoch": 3.5922190201729105,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00037523407988546815,
|
|
"loss": 4.6665,
|
|
"mean_token_accuracy": 0.2451303482055664,
|
|
"num_tokens": 85728749.0,
|
|
"step": 37395
|
|
},
|
|
{
|
|
"entropy": 5.083791446685791,
|
|
"epoch": 3.592699327569645,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003752033850146785,
|
|
"loss": 4.707,
|
|
"mean_token_accuracy": 0.2447576865553856,
|
|
"num_tokens": 85738653.0,
|
|
"step": 37400
|
|
},
|
|
{
|
|
"entropy": 4.989290857315064,
|
|
"epoch": 3.5931796349663783,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003751726878174637,
|
|
"loss": 4.548,
|
|
"mean_token_accuracy": 0.2580173462629318,
|
|
"num_tokens": 85749244.0,
|
|
"step": 37405
|
|
},
|
|
{
|
|
"entropy": 5.035173225402832,
|
|
"epoch": 3.5936599423631126,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003751419882945365,
|
|
"loss": 4.5754,
|
|
"mean_token_accuracy": 0.26200775653123853,
|
|
"num_tokens": 85760314.0,
|
|
"step": 37410
|
|
},
|
|
{
|
|
"entropy": 5.009808874130249,
|
|
"epoch": 3.594140249759846,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003751112864466097,
|
|
"loss": 4.537,
|
|
"mean_token_accuracy": 0.25963049978017805,
|
|
"num_tokens": 85771319.0,
|
|
"step": 37415
|
|
},
|
|
{
|
|
"entropy": 5.088012552261352,
|
|
"epoch": 3.5946205571565804,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003750805822743962,
|
|
"loss": 4.6442,
|
|
"mean_token_accuracy": 0.2563477292656898,
|
|
"num_tokens": 85781687.0,
|
|
"step": 37420
|
|
},
|
|
{
|
|
"entropy": 5.027475261688233,
|
|
"epoch": 3.5951008645533142,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003750498757786087,
|
|
"loss": 4.582,
|
|
"mean_token_accuracy": 0.2522925540804863,
|
|
"num_tokens": 85793775.0,
|
|
"step": 37425
|
|
},
|
|
{
|
|
"entropy": 5.05867280960083,
|
|
"epoch": 3.595581171950048,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003750191669599602,
|
|
"loss": 4.6889,
|
|
"mean_token_accuracy": 0.2488551616668701,
|
|
"num_tokens": 85805404.0,
|
|
"step": 37430
|
|
},
|
|
{
|
|
"entropy": 5.079153394699096,
|
|
"epoch": 3.596061479346782,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003749884558191636,
|
|
"loss": 4.6581,
|
|
"mean_token_accuracy": 0.24934836328029633,
|
|
"num_tokens": 85818182.0,
|
|
"step": 37435
|
|
},
|
|
{
|
|
"entropy": 5.099080753326416,
|
|
"epoch": 3.596541786743516,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00037495774235693193,
|
|
"loss": 4.6165,
|
|
"mean_token_accuracy": 0.2474392905831337,
|
|
"num_tokens": 85828747.0,
|
|
"step": 37440
|
|
},
|
|
{
|
|
"entropy": 5.052905893325805,
|
|
"epoch": 3.5970220941402498,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037492702657397837,
|
|
"loss": 4.5916,
|
|
"mean_token_accuracy": 0.2510372817516327,
|
|
"num_tokens": 85839914.0,
|
|
"step": 37445
|
|
},
|
|
{
|
|
"entropy": 4.996705770492554,
|
|
"epoch": 3.5975024015369836,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000374896308471016,
|
|
"loss": 4.5325,
|
|
"mean_token_accuracy": 0.26017967611551285,
|
|
"num_tokens": 85851252.0,
|
|
"step": 37450
|
|
},
|
|
{
|
|
"entropy": 4.930719184875488,
|
|
"epoch": 3.5979827089337175,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00037486558804875785,
|
|
"loss": 4.5293,
|
|
"mean_token_accuracy": 0.2575101897120476,
|
|
"num_tokens": 85863287.0,
|
|
"step": 37455
|
|
},
|
|
{
|
|
"entropy": 5.076582527160644,
|
|
"epoch": 3.5984630163304514,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003748348653079174,
|
|
"loss": 4.7625,
|
|
"mean_token_accuracy": 0.23952684551477432,
|
|
"num_tokens": 85874575.0,
|
|
"step": 37460
|
|
},
|
|
{
|
|
"entropy": 5.04263072013855,
|
|
"epoch": 3.5989433237271853,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00037480414024920777,
|
|
"loss": 4.5697,
|
|
"mean_token_accuracy": 0.25318289548158646,
|
|
"num_tokens": 85886267.0,
|
|
"step": 37465
|
|
},
|
|
{
|
|
"entropy": 5.015277862548828,
|
|
"epoch": 3.599423631123919,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003747734128733424,
|
|
"loss": 4.6465,
|
|
"mean_token_accuracy": 0.24793727099895477,
|
|
"num_tokens": 85897149.0,
|
|
"step": 37470
|
|
},
|
|
{
|
|
"entropy": 4.91844596862793,
|
|
"epoch": 3.5999039385206535,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003747426831810346,
|
|
"loss": 4.462,
|
|
"mean_token_accuracy": 0.265723717212677,
|
|
"num_tokens": 85909045.0,
|
|
"step": 37475
|
|
},
|
|
{
|
|
"entropy": 5.013857984542847,
|
|
"epoch": 3.600384245917387,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00037471195117299796,
|
|
"loss": 4.5741,
|
|
"mean_token_accuracy": 0.2577277094125748,
|
|
"num_tokens": 85919795.0,
|
|
"step": 37480
|
|
},
|
|
{
|
|
"entropy": 5.048078632354736,
|
|
"epoch": 3.6008645533141213,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0003746812168499458,
|
|
"loss": 4.6244,
|
|
"mean_token_accuracy": 0.24621468186378478,
|
|
"num_tokens": 85930348.0,
|
|
"step": 37485
|
|
},
|
|
{
|
|
"entropy": 5.1199249744415285,
|
|
"epoch": 3.6013448607108547,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003746504802125918,
|
|
"loss": 4.7213,
|
|
"mean_token_accuracy": 0.24409927427768707,
|
|
"num_tokens": 85941956.0,
|
|
"step": 37490
|
|
},
|
|
{
|
|
"entropy": 5.048035573959351,
|
|
"epoch": 3.601825168107589,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00037461974126164953,
|
|
"loss": 4.5555,
|
|
"mean_token_accuracy": 0.25522642135620116,
|
|
"num_tokens": 85953208.0,
|
|
"step": 37495
|
|
},
|
|
{
|
|
"entropy": 5.076632356643676,
|
|
"epoch": 3.602305475504323,
|
|
"grad_norm": 0.91015625,
|
|
"learning_rate": 0.00037458899999783267,
|
|
"loss": 4.641,
|
|
"mean_token_accuracy": 0.24736995100975037,
|
|
"num_tokens": 85967527.0,
|
|
"step": 37500
|
|
},
|
|
{
|
|
"entropy": 5.055927753448486,
|
|
"epoch": 3.6027857829010568,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00037455825642185504,
|
|
"loss": 4.6468,
|
|
"mean_token_accuracy": 0.2466520696878433,
|
|
"num_tokens": 85979013.0,
|
|
"step": 37505
|
|
},
|
|
{
|
|
"entropy": 5.077972173690796,
|
|
"epoch": 3.6032660902977907,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003745275105344303,
|
|
"loss": 4.5983,
|
|
"mean_token_accuracy": 0.2505024611949921,
|
|
"num_tokens": 85990331.0,
|
|
"step": 37510
|
|
},
|
|
{
|
|
"entropy": 5.0534929752349855,
|
|
"epoch": 3.6037463976945245,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003744967623362723,
|
|
"loss": 4.6559,
|
|
"mean_token_accuracy": 0.24482654333114623,
|
|
"num_tokens": 86001883.0,
|
|
"step": 37515
|
|
},
|
|
{
|
|
"entropy": 5.03518009185791,
|
|
"epoch": 3.6042267050912584,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003744660118280949,
|
|
"loss": 4.6284,
|
|
"mean_token_accuracy": 0.25069948881864546,
|
|
"num_tokens": 86013069.0,
|
|
"step": 37520
|
|
},
|
|
{
|
|
"entropy": 5.118577575683593,
|
|
"epoch": 3.6047070124879923,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.000374435259010612,
|
|
"loss": 4.6837,
|
|
"mean_token_accuracy": 0.24625941067934037,
|
|
"num_tokens": 86026107.0,
|
|
"step": 37525
|
|
},
|
|
{
|
|
"entropy": 5.016831254959106,
|
|
"epoch": 3.605187319884726,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00037440450388453767,
|
|
"loss": 4.5195,
|
|
"mean_token_accuracy": 0.2651525393128395,
|
|
"num_tokens": 86036914.0,
|
|
"step": 37530
|
|
},
|
|
{
|
|
"entropy": 5.051985216140747,
|
|
"epoch": 3.60566762728146,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000374373746450586,
|
|
"loss": 4.6639,
|
|
"mean_token_accuracy": 0.24771412312984467,
|
|
"num_tokens": 86047922.0,
|
|
"step": 37535
|
|
},
|
|
{
|
|
"entropy": 5.030571746826172,
|
|
"epoch": 3.606147934678194,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037434298670947097,
|
|
"loss": 4.639,
|
|
"mean_token_accuracy": 0.2478707730770111,
|
|
"num_tokens": 86058843.0,
|
|
"step": 37540
|
|
},
|
|
{
|
|
"entropy": 5.026053619384766,
|
|
"epoch": 3.606628242074928,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00037431222466190677,
|
|
"loss": 4.5317,
|
|
"mean_token_accuracy": 0.2569324165582657,
|
|
"num_tokens": 86069212.0,
|
|
"step": 37545
|
|
},
|
|
{
|
|
"entropy": 4.994998550415039,
|
|
"epoch": 3.6071085494716617,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00037428146030860765,
|
|
"loss": 4.5715,
|
|
"mean_token_accuracy": 0.2556986689567566,
|
|
"num_tokens": 86080952.0,
|
|
"step": 37550
|
|
},
|
|
{
|
|
"entropy": 4.95955491065979,
|
|
"epoch": 3.6075888568683956,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003742506936502878,
|
|
"loss": 4.5813,
|
|
"mean_token_accuracy": 0.2594466909766197,
|
|
"num_tokens": 86091976.0,
|
|
"step": 37555
|
|
},
|
|
{
|
|
"entropy": 4.971715354919434,
|
|
"epoch": 3.60806916426513,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003742199246876616,
|
|
"loss": 4.5101,
|
|
"mean_token_accuracy": 0.2615057066082954,
|
|
"num_tokens": 86103135.0,
|
|
"step": 37560
|
|
},
|
|
{
|
|
"entropy": 5.115024566650391,
|
|
"epoch": 3.6085494716618634,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037418915342144333,
|
|
"loss": 4.6119,
|
|
"mean_token_accuracy": 0.25586079210042956,
|
|
"num_tokens": 86113495.0,
|
|
"step": 37565
|
|
},
|
|
{
|
|
"entropy": 5.088325691223145,
|
|
"epoch": 3.6090297790585977,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003741583798523474,
|
|
"loss": 4.6127,
|
|
"mean_token_accuracy": 0.24502974897623062,
|
|
"num_tokens": 86125489.0,
|
|
"step": 37570
|
|
},
|
|
{
|
|
"entropy": 5.057143306732177,
|
|
"epoch": 3.6095100864553316,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003741276039810884,
|
|
"loss": 4.5944,
|
|
"mean_token_accuracy": 0.250989094376564,
|
|
"num_tokens": 86136109.0,
|
|
"step": 37575
|
|
},
|
|
{
|
|
"entropy": 5.030374002456665,
|
|
"epoch": 3.6099903938520654,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003740968258083808,
|
|
"loss": 4.6303,
|
|
"mean_token_accuracy": 0.2544350877404213,
|
|
"num_tokens": 86147375.0,
|
|
"step": 37580
|
|
},
|
|
{
|
|
"entropy": 5.038099336624145,
|
|
"epoch": 3.6104707012487993,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003740660453349391,
|
|
"loss": 4.6176,
|
|
"mean_token_accuracy": 0.25156146138906477,
|
|
"num_tokens": 86158277.0,
|
|
"step": 37585
|
|
},
|
|
{
|
|
"entropy": 5.10015902519226,
|
|
"epoch": 3.610951008645533,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000374035262561478,
|
|
"loss": 4.6503,
|
|
"mean_token_accuracy": 0.24858063757419585,
|
|
"num_tokens": 86169280.0,
|
|
"step": 37590
|
|
},
|
|
{
|
|
"entropy": 5.066948986053466,
|
|
"epoch": 3.611431316042267,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00037400447748871224,
|
|
"loss": 4.5873,
|
|
"mean_token_accuracy": 0.2560041695833206,
|
|
"num_tokens": 86179918.0,
|
|
"step": 37595
|
|
},
|
|
{
|
|
"entropy": 4.9091442108154295,
|
|
"epoch": 3.611911623439001,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037397369011735643,
|
|
"loss": 4.4684,
|
|
"mean_token_accuracy": 0.2639839082956314,
|
|
"num_tokens": 86192616.0,
|
|
"step": 37600
|
|
},
|
|
{
|
|
"entropy": 5.006022548675537,
|
|
"epoch": 3.612391930835735,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0003739429004481254,
|
|
"loss": 4.5781,
|
|
"mean_token_accuracy": 0.24900186955928802,
|
|
"num_tokens": 86203797.0,
|
|
"step": 37605
|
|
},
|
|
{
|
|
"entropy": 5.003582859039307,
|
|
"epoch": 3.6128722382324687,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00037391210848173413,
|
|
"loss": 4.5487,
|
|
"mean_token_accuracy": 0.262343755364418,
|
|
"num_tokens": 86214408.0,
|
|
"step": 37610
|
|
},
|
|
{
|
|
"entropy": 5.037091684341431,
|
|
"epoch": 3.6133525456292026,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00037388131421889733,
|
|
"loss": 4.5819,
|
|
"mean_token_accuracy": 0.2504035636782646,
|
|
"num_tokens": 86226382.0,
|
|
"step": 37615
|
|
},
|
|
{
|
|
"entropy": 5.074613285064697,
|
|
"epoch": 3.6138328530259365,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003738505176603301,
|
|
"loss": 4.6076,
|
|
"mean_token_accuracy": 0.25608131289482117,
|
|
"num_tokens": 86236962.0,
|
|
"step": 37620
|
|
},
|
|
{
|
|
"entropy": 4.9912190437316895,
|
|
"epoch": 3.6143131604226704,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037381971880674726,
|
|
"loss": 4.5345,
|
|
"mean_token_accuracy": 0.26111999005079267,
|
|
"num_tokens": 86248867.0,
|
|
"step": 37625
|
|
},
|
|
{
|
|
"entropy": 5.0701704025268555,
|
|
"epoch": 3.6147934678194042,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00037378891765886405,
|
|
"loss": 4.6425,
|
|
"mean_token_accuracy": 0.24309103339910507,
|
|
"num_tokens": 86260264.0,
|
|
"step": 37630
|
|
},
|
|
{
|
|
"entropy": 5.049466466903686,
|
|
"epoch": 3.6152737752161386,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00037375811421739555,
|
|
"loss": 4.6581,
|
|
"mean_token_accuracy": 0.23960951566696168,
|
|
"num_tokens": 86271707.0,
|
|
"step": 37635
|
|
},
|
|
{
|
|
"entropy": 4.988983678817749,
|
|
"epoch": 3.615754082612872,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037372730848305683,
|
|
"loss": 4.5576,
|
|
"mean_token_accuracy": 0.25616964101791384,
|
|
"num_tokens": 86282532.0,
|
|
"step": 37640
|
|
},
|
|
{
|
|
"entropy": 5.091235589981079,
|
|
"epoch": 3.6162343900096063,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00037369650045656315,
|
|
"loss": 4.6747,
|
|
"mean_token_accuracy": 0.24786443561315535,
|
|
"num_tokens": 86295256.0,
|
|
"step": 37645
|
|
},
|
|
{
|
|
"entropy": 5.043572235107422,
|
|
"epoch": 3.61671469740634,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00037366569013862986,
|
|
"loss": 4.6011,
|
|
"mean_token_accuracy": 0.2533408299088478,
|
|
"num_tokens": 86306326.0,
|
|
"step": 37650
|
|
},
|
|
{
|
|
"entropy": 4.949450588226318,
|
|
"epoch": 3.617195004803074,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003736348775299722,
|
|
"loss": 4.5092,
|
|
"mean_token_accuracy": 0.2646250709891319,
|
|
"num_tokens": 86317313.0,
|
|
"step": 37655
|
|
},
|
|
{
|
|
"entropy": 5.042888212203979,
|
|
"epoch": 3.617675312199808,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00037360406263130556,
|
|
"loss": 4.5767,
|
|
"mean_token_accuracy": 0.2550095930695534,
|
|
"num_tokens": 86327139.0,
|
|
"step": 37660
|
|
},
|
|
{
|
|
"entropy": 5.012157583236695,
|
|
"epoch": 3.618155619596542,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037357324544334537,
|
|
"loss": 4.6076,
|
|
"mean_token_accuracy": 0.2519445866346359,
|
|
"num_tokens": 86338743.0,
|
|
"step": 37665
|
|
},
|
|
{
|
|
"entropy": 5.056576681137085,
|
|
"epoch": 3.6186359269932757,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003735424259668072,
|
|
"loss": 4.6558,
|
|
"mean_token_accuracy": 0.24874948859214782,
|
|
"num_tokens": 86349790.0,
|
|
"step": 37670
|
|
},
|
|
{
|
|
"entropy": 5.06267466545105,
|
|
"epoch": 3.6191162343900096,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003735116042024064,
|
|
"loss": 4.6154,
|
|
"mean_token_accuracy": 0.2493801236152649,
|
|
"num_tokens": 86361689.0,
|
|
"step": 37675
|
|
},
|
|
{
|
|
"entropy": 5.091046953201294,
|
|
"epoch": 3.6195965417867435,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003734807801508587,
|
|
"loss": 4.6179,
|
|
"mean_token_accuracy": 0.2550936371088028,
|
|
"num_tokens": 86372005.0,
|
|
"step": 37680
|
|
},
|
|
{
|
|
"entropy": 5.049282836914062,
|
|
"epoch": 3.6200768491834774,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00037344995381287984,
|
|
"loss": 4.7141,
|
|
"mean_token_accuracy": 0.24288854748010635,
|
|
"num_tokens": 86383789.0,
|
|
"step": 37685
|
|
},
|
|
{
|
|
"entropy": 4.968661069869995,
|
|
"epoch": 3.6205571565802113,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00037341912518918524,
|
|
"loss": 4.5941,
|
|
"mean_token_accuracy": 0.24938559979200364,
|
|
"num_tokens": 86395831.0,
|
|
"step": 37690
|
|
},
|
|
{
|
|
"entropy": 5.047573900222778,
|
|
"epoch": 3.621037463976945,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00037338829428049087,
|
|
"loss": 4.557,
|
|
"mean_token_accuracy": 0.2554023966193199,
|
|
"num_tokens": 86406807.0,
|
|
"step": 37695
|
|
},
|
|
{
|
|
"entropy": 5.0966907978057865,
|
|
"epoch": 3.621517771373679,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00037335746108751246,
|
|
"loss": 4.6497,
|
|
"mean_token_accuracy": 0.2491500347852707,
|
|
"num_tokens": 86418025.0,
|
|
"step": 37700
|
|
},
|
|
{
|
|
"entropy": 5.075774765014648,
|
|
"epoch": 3.621998078770413,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003733266256109658,
|
|
"loss": 4.6326,
|
|
"mean_token_accuracy": 0.2455480620265007,
|
|
"num_tokens": 86429019.0,
|
|
"step": 37705
|
|
},
|
|
{
|
|
"entropy": 5.137321043014526,
|
|
"epoch": 3.6224783861671472,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00037329578785156686,
|
|
"loss": 4.6946,
|
|
"mean_token_accuracy": 0.24371037632226944,
|
|
"num_tokens": 86439471.0,
|
|
"step": 37710
|
|
},
|
|
{
|
|
"entropy": 5.090334033966064,
|
|
"epoch": 3.6229586935638807,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003732649478100317,
|
|
"loss": 4.6534,
|
|
"mean_token_accuracy": 0.24862606078386307,
|
|
"num_tokens": 86450966.0,
|
|
"step": 37715
|
|
},
|
|
{
|
|
"entropy": 4.92968864440918,
|
|
"epoch": 3.623439000960615,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00037323410548707617,
|
|
"loss": 4.4728,
|
|
"mean_token_accuracy": 0.26175497174263,
|
|
"num_tokens": 86461467.0,
|
|
"step": 37720
|
|
},
|
|
{
|
|
"entropy": 4.994879817962646,
|
|
"epoch": 3.6239193083573484,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003732032608834164,
|
|
"loss": 4.6565,
|
|
"mean_token_accuracy": 0.2475918188691139,
|
|
"num_tokens": 86472743.0,
|
|
"step": 37725
|
|
},
|
|
{
|
|
"entropy": 5.069544315338135,
|
|
"epoch": 3.6243996157540828,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037317241399976856,
|
|
"loss": 4.6126,
|
|
"mean_token_accuracy": 0.25050148665905,
|
|
"num_tokens": 86484101.0,
|
|
"step": 37730
|
|
},
|
|
{
|
|
"entropy": 5.065858840942383,
|
|
"epoch": 3.6248799231508166,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00037314156483684875,
|
|
"loss": 4.6123,
|
|
"mean_token_accuracy": 0.2487994283437729,
|
|
"num_tokens": 86496017.0,
|
|
"step": 37735
|
|
},
|
|
{
|
|
"entropy": 5.003856468200683,
|
|
"epoch": 3.6253602305475505,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003731107133953732,
|
|
"loss": 4.6115,
|
|
"mean_token_accuracy": 0.24343004822731018,
|
|
"num_tokens": 86508198.0,
|
|
"step": 37740
|
|
},
|
|
{
|
|
"entropy": 5.173652935028076,
|
|
"epoch": 3.6258405379442844,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00037307985967605827,
|
|
"loss": 4.7588,
|
|
"mean_token_accuracy": 0.24183924943208696,
|
|
"num_tokens": 86518712.0,
|
|
"step": 37745
|
|
},
|
|
{
|
|
"entropy": 5.011025428771973,
|
|
"epoch": 3.6263208453410183,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00037304900367962017,
|
|
"loss": 4.5728,
|
|
"mean_token_accuracy": 0.25466890186071395,
|
|
"num_tokens": 86531340.0,
|
|
"step": 37750
|
|
},
|
|
{
|
|
"entropy": 5.000905466079712,
|
|
"epoch": 3.626801152737752,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003730181454067754,
|
|
"loss": 4.543,
|
|
"mean_token_accuracy": 0.258446104824543,
|
|
"num_tokens": 86541548.0,
|
|
"step": 37755
|
|
},
|
|
{
|
|
"entropy": 5.038048028945923,
|
|
"epoch": 3.627281460134486,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00037298728485824043,
|
|
"loss": 4.6723,
|
|
"mean_token_accuracy": 0.24935411363840104,
|
|
"num_tokens": 86553259.0,
|
|
"step": 37760
|
|
},
|
|
{
|
|
"entropy": 4.983464479446411,
|
|
"epoch": 3.62776176753122,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003729564220347315,
|
|
"loss": 4.5368,
|
|
"mean_token_accuracy": 0.2561137959361076,
|
|
"num_tokens": 86565199.0,
|
|
"step": 37765
|
|
},
|
|
{
|
|
"entropy": 4.942456912994385,
|
|
"epoch": 3.628242074927954,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003729255569369654,
|
|
"loss": 4.4545,
|
|
"mean_token_accuracy": 0.266314435005188,
|
|
"num_tokens": 86574949.0,
|
|
"step": 37770
|
|
},
|
|
{
|
|
"entropy": 4.962992906570435,
|
|
"epoch": 3.6287223823246877,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003728946895656586,
|
|
"loss": 4.5464,
|
|
"mean_token_accuracy": 0.2574627101421356,
|
|
"num_tokens": 86586039.0,
|
|
"step": 37775
|
|
},
|
|
{
|
|
"entropy": 5.041203022003174,
|
|
"epoch": 3.6292026897214216,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003728638199215279,
|
|
"loss": 4.6269,
|
|
"mean_token_accuracy": 0.2504082053899765,
|
|
"num_tokens": 86598365.0,
|
|
"step": 37780
|
|
},
|
|
{
|
|
"entropy": 5.032601451873779,
|
|
"epoch": 3.629682997118156,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037283294800528985,
|
|
"loss": 4.5206,
|
|
"mean_token_accuracy": 0.26252626776695254,
|
|
"num_tokens": 86609460.0,
|
|
"step": 37785
|
|
},
|
|
{
|
|
"entropy": 4.998996067047119,
|
|
"epoch": 3.6301633045148893,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003728020738176612,
|
|
"loss": 4.5886,
|
|
"mean_token_accuracy": 0.2590646743774414,
|
|
"num_tokens": 86621195.0,
|
|
"step": 37790
|
|
},
|
|
{
|
|
"entropy": 5.062126684188843,
|
|
"epoch": 3.6306436119116237,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003727711973593589,
|
|
"loss": 4.6201,
|
|
"mean_token_accuracy": 0.25227669775485995,
|
|
"num_tokens": 86632671.0,
|
|
"step": 37795
|
|
},
|
|
{
|
|
"entropy": 5.041585779190063,
|
|
"epoch": 3.631123919308357,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003727403186310996,
|
|
"loss": 4.6127,
|
|
"mean_token_accuracy": 0.24835167080163956,
|
|
"num_tokens": 86644224.0,
|
|
"step": 37800
|
|
},
|
|
{
|
|
"entropy": 5.0927361965179445,
|
|
"epoch": 3.6316042267050914,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037270943763360046,
|
|
"loss": 4.7177,
|
|
"mean_token_accuracy": 0.24759551733732224,
|
|
"num_tokens": 86655302.0,
|
|
"step": 37805
|
|
},
|
|
{
|
|
"entropy": 5.120924615859986,
|
|
"epoch": 3.6320845341018253,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003726785543675782,
|
|
"loss": 4.6921,
|
|
"mean_token_accuracy": 0.24950387179851533,
|
|
"num_tokens": 86665907.0,
|
|
"step": 37810
|
|
},
|
|
{
|
|
"entropy": 4.966438961029053,
|
|
"epoch": 3.632564841498559,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00037264766883375003,
|
|
"loss": 4.5229,
|
|
"mean_token_accuracy": 0.26634927839040756,
|
|
"num_tokens": 86676746.0,
|
|
"step": 37815
|
|
},
|
|
{
|
|
"entropy": 4.976216077804565,
|
|
"epoch": 3.633045148895293,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003726167810328329,
|
|
"loss": 4.6078,
|
|
"mean_token_accuracy": 0.251989284157753,
|
|
"num_tokens": 86689274.0,
|
|
"step": 37820
|
|
},
|
|
{
|
|
"entropy": 5.0314594268798825,
|
|
"epoch": 3.633525456292027,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000372585890965544,
|
|
"loss": 4.6113,
|
|
"mean_token_accuracy": 0.25584992170333865,
|
|
"num_tokens": 86700427.0,
|
|
"step": 37825
|
|
},
|
|
{
|
|
"entropy": 5.0255265712738035,
|
|
"epoch": 3.634005763688761,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003725549986326004,
|
|
"loss": 4.567,
|
|
"mean_token_accuracy": 0.25931842923164367,
|
|
"num_tokens": 86712780.0,
|
|
"step": 37830
|
|
},
|
|
{
|
|
"entropy": 5.021234941482544,
|
|
"epoch": 3.6344860710854947,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003725241040347196,
|
|
"loss": 4.6077,
|
|
"mean_token_accuracy": 0.25480413883924485,
|
|
"num_tokens": 86724359.0,
|
|
"step": 37835
|
|
},
|
|
{
|
|
"entropy": 5.026590013504029,
|
|
"epoch": 3.6349663784822286,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003724932071726186,
|
|
"loss": 4.6724,
|
|
"mean_token_accuracy": 0.24946674108505248,
|
|
"num_tokens": 86735540.0,
|
|
"step": 37840
|
|
},
|
|
{
|
|
"entropy": 5.008847904205322,
|
|
"epoch": 3.6354466858789625,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037246230804701476,
|
|
"loss": 4.5956,
|
|
"mean_token_accuracy": 0.25646268874406813,
|
|
"num_tokens": 86746879.0,
|
|
"step": 37845
|
|
},
|
|
{
|
|
"entropy": 5.076841592788696,
|
|
"epoch": 3.6359269932756964,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037243140665862556,
|
|
"loss": 4.6085,
|
|
"mean_token_accuracy": 0.25059174597263334,
|
|
"num_tokens": 86756783.0,
|
|
"step": 37850
|
|
},
|
|
{
|
|
"entropy": 5.096190595626831,
|
|
"epoch": 3.6364073006724302,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00037240050300816843,
|
|
"loss": 4.6109,
|
|
"mean_token_accuracy": 0.2587619319558144,
|
|
"num_tokens": 86768722.0,
|
|
"step": 37855
|
|
},
|
|
{
|
|
"entropy": 5.049224615097046,
|
|
"epoch": 3.636887608069164,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037236959709636085,
|
|
"loss": 4.6765,
|
|
"mean_token_accuracy": 0.25156443268060685,
|
|
"num_tokens": 86780386.0,
|
|
"step": 37860
|
|
},
|
|
{
|
|
"entropy": 4.996360826492309,
|
|
"epoch": 3.637367915465898,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00037233868892392035,
|
|
"loss": 4.5986,
|
|
"mean_token_accuracy": 0.25248226523399353,
|
|
"num_tokens": 86793008.0,
|
|
"step": 37865
|
|
},
|
|
{
|
|
"entropy": 5.071881628036499,
|
|
"epoch": 3.6378482228626323,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003723077784915645,
|
|
"loss": 4.6019,
|
|
"mean_token_accuracy": 0.2526800289750099,
|
|
"num_tokens": 86804451.0,
|
|
"step": 37870
|
|
},
|
|
{
|
|
"entropy": 5.061959314346313,
|
|
"epoch": 3.6383285302593658,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00037227686580001095,
|
|
"loss": 4.6663,
|
|
"mean_token_accuracy": 0.247287118434906,
|
|
"num_tokens": 86816391.0,
|
|
"step": 37875
|
|
},
|
|
{
|
|
"entropy": 5.1050133228302,
|
|
"epoch": 3.6388088376561,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003722459508499775,
|
|
"loss": 4.7046,
|
|
"mean_token_accuracy": 0.2457491472363472,
|
|
"num_tokens": 86828163.0,
|
|
"step": 37880
|
|
},
|
|
{
|
|
"entropy": 5.075819206237793,
|
|
"epoch": 3.639289145052834,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003722150336421818,
|
|
"loss": 4.5949,
|
|
"mean_token_accuracy": 0.2474388062953949,
|
|
"num_tokens": 86839311.0,
|
|
"step": 37885
|
|
},
|
|
{
|
|
"entropy": 5.114630508422851,
|
|
"epoch": 3.639769452449568,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003721841141773417,
|
|
"loss": 4.699,
|
|
"mean_token_accuracy": 0.2435017481446266,
|
|
"num_tokens": 86849921.0,
|
|
"step": 37890
|
|
},
|
|
{
|
|
"entropy": 4.9196069717407225,
|
|
"epoch": 3.6402497598463017,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00037215319245617503,
|
|
"loss": 4.5159,
|
|
"mean_token_accuracy": 0.25568731427192687,
|
|
"num_tokens": 86862858.0,
|
|
"step": 37895
|
|
},
|
|
{
|
|
"entropy": 5.028948640823364,
|
|
"epoch": 3.6407300672430356,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00037212226847939975,
|
|
"loss": 4.531,
|
|
"mean_token_accuracy": 0.26005073487758634,
|
|
"num_tokens": 86874152.0,
|
|
"step": 37900
|
|
},
|
|
{
|
|
"entropy": 5.046748065948487,
|
|
"epoch": 3.6412103746397695,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003720913422477338,
|
|
"loss": 4.5774,
|
|
"mean_token_accuracy": 0.2605211868882179,
|
|
"num_tokens": 86886126.0,
|
|
"step": 37905
|
|
},
|
|
{
|
|
"entropy": 5.188298749923706,
|
|
"epoch": 3.6416906820365034,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00037206041376189515,
|
|
"loss": 4.7635,
|
|
"mean_token_accuracy": 0.24313988238573075,
|
|
"num_tokens": 86896046.0,
|
|
"step": 37910
|
|
},
|
|
{
|
|
"entropy": 4.958950185775757,
|
|
"epoch": 3.6421709894332372,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003720294830226018,
|
|
"loss": 4.5482,
|
|
"mean_token_accuracy": 0.2515521302819252,
|
|
"num_tokens": 86907946.0,
|
|
"step": 37915
|
|
},
|
|
{
|
|
"entropy": 4.963725900650024,
|
|
"epoch": 3.642651296829971,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003719985500305721,
|
|
"loss": 4.5425,
|
|
"mean_token_accuracy": 0.2617046877741814,
|
|
"num_tokens": 86919849.0,
|
|
"step": 37920
|
|
},
|
|
{
|
|
"entropy": 5.079237413406372,
|
|
"epoch": 3.643131604226705,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003719676147865241,
|
|
"loss": 4.6785,
|
|
"mean_token_accuracy": 0.2507407948374748,
|
|
"num_tokens": 86930325.0,
|
|
"step": 37925
|
|
},
|
|
{
|
|
"entropy": 4.9769635677337645,
|
|
"epoch": 3.643611911623439,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.000371936677291176,
|
|
"loss": 4.5102,
|
|
"mean_token_accuracy": 0.2616623714566231,
|
|
"num_tokens": 86942076.0,
|
|
"step": 37930
|
|
},
|
|
{
|
|
"entropy": 4.939378929138184,
|
|
"epoch": 3.6440922190201728,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003719057375452461,
|
|
"loss": 4.4886,
|
|
"mean_token_accuracy": 0.26630276441574097,
|
|
"num_tokens": 86953226.0,
|
|
"step": 37935
|
|
},
|
|
{
|
|
"entropy": 5.098149156570434,
|
|
"epoch": 3.6445725264169067,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003718747955494527,
|
|
"loss": 4.6753,
|
|
"mean_token_accuracy": 0.251187826693058,
|
|
"num_tokens": 86964606.0,
|
|
"step": 37940
|
|
},
|
|
{
|
|
"entropy": 4.995272397994995,
|
|
"epoch": 3.645052833813641,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037184385130451425,
|
|
"loss": 4.5151,
|
|
"mean_token_accuracy": 0.25816057026386263,
|
|
"num_tokens": 86976738.0,
|
|
"step": 37945
|
|
},
|
|
{
|
|
"entropy": 5.045467758178711,
|
|
"epoch": 3.6455331412103744,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037181290481114903,
|
|
"loss": 4.7187,
|
|
"mean_token_accuracy": 0.25128786116838453,
|
|
"num_tokens": 86988445.0,
|
|
"step": 37950
|
|
},
|
|
{
|
|
"entropy": 5.029026460647583,
|
|
"epoch": 3.6460134486071087,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003717819560700758,
|
|
"loss": 4.5506,
|
|
"mean_token_accuracy": 0.2598196491599083,
|
|
"num_tokens": 87000818.0,
|
|
"step": 37955
|
|
},
|
|
{
|
|
"entropy": 5.051932430267334,
|
|
"epoch": 3.6464937560038426,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003717510050820128,
|
|
"loss": 4.6418,
|
|
"mean_token_accuracy": 0.2502064988017082,
|
|
"num_tokens": 87012398.0,
|
|
"step": 37960
|
|
},
|
|
{
|
|
"entropy": 5.076000690460205,
|
|
"epoch": 3.6469740634005765,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00037172005184767885,
|
|
"loss": 4.6732,
|
|
"mean_token_accuracy": 0.24682036638259888,
|
|
"num_tokens": 87022885.0,
|
|
"step": 37965
|
|
},
|
|
{
|
|
"entropy": 5.052633285522461,
|
|
"epoch": 3.6474543707973104,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003716890963677924,
|
|
"loss": 4.6052,
|
|
"mean_token_accuracy": 0.25425350219011306,
|
|
"num_tokens": 87033973.0,
|
|
"step": 37970
|
|
},
|
|
{
|
|
"entropy": 5.030282020568848,
|
|
"epoch": 3.6479346781940443,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003716581386430723,
|
|
"loss": 4.6013,
|
|
"mean_token_accuracy": 0.25372359454631804,
|
|
"num_tokens": 87046391.0,
|
|
"step": 37975
|
|
},
|
|
{
|
|
"entropy": 5.081363391876221,
|
|
"epoch": 3.648414985590778,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003716271786742372,
|
|
"loss": 4.6246,
|
|
"mean_token_accuracy": 0.25792051404714583,
|
|
"num_tokens": 87056913.0,
|
|
"step": 37980
|
|
},
|
|
{
|
|
"entropy": 5.09544005393982,
|
|
"epoch": 3.648895292987512,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00037159621646200595,
|
|
"loss": 4.5921,
|
|
"mean_token_accuracy": 0.25290613919496535,
|
|
"num_tokens": 87068905.0,
|
|
"step": 37985
|
|
},
|
|
{
|
|
"entropy": 5.052115726470947,
|
|
"epoch": 3.649375600384246,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003715652520070974,
|
|
"loss": 4.6805,
|
|
"mean_token_accuracy": 0.24706596583127977,
|
|
"num_tokens": 87080186.0,
|
|
"step": 37990
|
|
},
|
|
{
|
|
"entropy": 4.9819214820861815,
|
|
"epoch": 3.64985590778098,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003715342853102305,
|
|
"loss": 4.5736,
|
|
"mean_token_accuracy": 0.2543834999203682,
|
|
"num_tokens": 87091130.0,
|
|
"step": 37995
|
|
},
|
|
{
|
|
"entropy": 5.114356184005738,
|
|
"epoch": 3.6503362151777137,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003715033163721241,
|
|
"loss": 4.674,
|
|
"mean_token_accuracy": 0.24996891617774963,
|
|
"num_tokens": 87103482.0,
|
|
"step": 38000
|
|
},
|
|
{
|
|
"entropy": 5.093205213546753,
|
|
"epoch": 3.6508165225744476,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003714723451934972,
|
|
"loss": 4.6322,
|
|
"mean_token_accuracy": 0.24901641309261321,
|
|
"num_tokens": 87114850.0,
|
|
"step": 38005
|
|
},
|
|
{
|
|
"entropy": 5.051080274581909,
|
|
"epoch": 3.6512968299711814,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00037144137177506887,
|
|
"loss": 4.6483,
|
|
"mean_token_accuracy": 0.25503017008304596,
|
|
"num_tokens": 87125840.0,
|
|
"step": 38010
|
|
},
|
|
{
|
|
"entropy": 5.050291919708252,
|
|
"epoch": 3.6517771373679153,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003714103961175584,
|
|
"loss": 4.6456,
|
|
"mean_token_accuracy": 0.251423117518425,
|
|
"num_tokens": 87138216.0,
|
|
"step": 38015
|
|
},
|
|
{
|
|
"entropy": 5.07115740776062,
|
|
"epoch": 3.6522574447646496,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003713794182216847,
|
|
"loss": 4.5932,
|
|
"mean_token_accuracy": 0.25946237295866015,
|
|
"num_tokens": 87149071.0,
|
|
"step": 38020
|
|
},
|
|
{
|
|
"entropy": 5.005227518081665,
|
|
"epoch": 3.652737752161383,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00037134843808816717,
|
|
"loss": 4.5838,
|
|
"mean_token_accuracy": 0.2616302609443665,
|
|
"num_tokens": 87160564.0,
|
|
"step": 38025
|
|
},
|
|
{
|
|
"entropy": 4.8894960403442385,
|
|
"epoch": 3.6532180595581174,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00037131745571772497,
|
|
"loss": 4.5192,
|
|
"mean_token_accuracy": 0.2680000767111778,
|
|
"num_tokens": 87171536.0,
|
|
"step": 38030
|
|
},
|
|
{
|
|
"entropy": 5.145585393905639,
|
|
"epoch": 3.653698366954851,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00037128647111107736,
|
|
"loss": 4.7019,
|
|
"mean_token_accuracy": 0.24759867787361145,
|
|
"num_tokens": 87183064.0,
|
|
"step": 38035
|
|
},
|
|
{
|
|
"entropy": 5.123568534851074,
|
|
"epoch": 3.654178674351585,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000371255484268944,
|
|
"loss": 4.6015,
|
|
"mean_token_accuracy": 0.2525403812527657,
|
|
"num_tokens": 87194034.0,
|
|
"step": 38040
|
|
},
|
|
{
|
|
"entropy": 5.050307893753052,
|
|
"epoch": 3.654658981748319,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037122449519204397,
|
|
"loss": 4.6504,
|
|
"mean_token_accuracy": 0.2529598385095596,
|
|
"num_tokens": 87205083.0,
|
|
"step": 38045
|
|
},
|
|
{
|
|
"entropy": 5.102697229385376,
|
|
"epoch": 3.655139289145053,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00037119350388109697,
|
|
"loss": 4.634,
|
|
"mean_token_accuracy": 0.2509476840496063,
|
|
"num_tokens": 87216034.0,
|
|
"step": 38050
|
|
},
|
|
{
|
|
"entropy": 5.007811450958252,
|
|
"epoch": 3.655619596541787,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00037116251033682236,
|
|
"loss": 4.5701,
|
|
"mean_token_accuracy": 0.2543100148439407,
|
|
"num_tokens": 87227231.0,
|
|
"step": 38055
|
|
},
|
|
{
|
|
"entropy": 5.0713011741638185,
|
|
"epoch": 3.6560999039385207,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003711315145599398,
|
|
"loss": 4.6121,
|
|
"mean_token_accuracy": 0.2586121469736099,
|
|
"num_tokens": 87239630.0,
|
|
"step": 38060
|
|
},
|
|
{
|
|
"entropy": 5.063324594497681,
|
|
"epoch": 3.6565802113352546,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000371100516551169,
|
|
"loss": 4.5854,
|
|
"mean_token_accuracy": 0.25167838484048843,
|
|
"num_tokens": 87250008.0,
|
|
"step": 38065
|
|
},
|
|
{
|
|
"entropy": 5.030573081970215,
|
|
"epoch": 3.6570605187319885,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003710695163112295,
|
|
"loss": 4.5987,
|
|
"mean_token_accuracy": 0.25241353213787077,
|
|
"num_tokens": 87260802.0,
|
|
"step": 38070
|
|
},
|
|
{
|
|
"entropy": 5.059394645690918,
|
|
"epoch": 3.6575408261287223,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003710385138408412,
|
|
"loss": 4.7107,
|
|
"mean_token_accuracy": 0.24564984291791916,
|
|
"num_tokens": 87271960.0,
|
|
"step": 38075
|
|
},
|
|
{
|
|
"entropy": 4.963160419464112,
|
|
"epoch": 3.658021133525456,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003710075091407237,
|
|
"loss": 4.5324,
|
|
"mean_token_accuracy": 0.25224921107292175,
|
|
"num_tokens": 87285138.0,
|
|
"step": 38080
|
|
},
|
|
{
|
|
"entropy": 5.035084199905396,
|
|
"epoch": 3.65850144092219,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003709765022115969,
|
|
"loss": 4.5878,
|
|
"mean_token_accuracy": 0.25381687879562376,
|
|
"num_tokens": 87296650.0,
|
|
"step": 38085
|
|
},
|
|
{
|
|
"entropy": 5.029453706741333,
|
|
"epoch": 3.658981748318924,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003709454930541808,
|
|
"loss": 4.5799,
|
|
"mean_token_accuracy": 0.26030330955982206,
|
|
"num_tokens": 87307315.0,
|
|
"step": 38090
|
|
},
|
|
{
|
|
"entropy": 4.9756176471710205,
|
|
"epoch": 3.659462055715658,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00037091448166919516,
|
|
"loss": 4.454,
|
|
"mean_token_accuracy": 0.2635587081313133,
|
|
"num_tokens": 87318842.0,
|
|
"step": 38095
|
|
},
|
|
{
|
|
"entropy": 4.976641464233398,
|
|
"epoch": 3.6599423631123917,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00037088346805736014,
|
|
"loss": 4.512,
|
|
"mean_token_accuracy": 0.2627015203237534,
|
|
"num_tokens": 87329308.0,
|
|
"step": 38100
|
|
},
|
|
{
|
|
"entropy": 5.046340274810791,
|
|
"epoch": 3.660422670509126,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003708524522193957,
|
|
"loss": 4.6628,
|
|
"mean_token_accuracy": 0.2499004051089287,
|
|
"num_tokens": 87340186.0,
|
|
"step": 38105
|
|
},
|
|
{
|
|
"entropy": 5.101902437210083,
|
|
"epoch": 3.6609029779058595,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00037082143415602186,
|
|
"loss": 4.6972,
|
|
"mean_token_accuracy": 0.2431741774082184,
|
|
"num_tokens": 87352547.0,
|
|
"step": 38110
|
|
},
|
|
{
|
|
"entropy": 5.069129943847656,
|
|
"epoch": 3.661383285302594,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003707904138679589,
|
|
"loss": 4.6292,
|
|
"mean_token_accuracy": 0.2482501596212387,
|
|
"num_tokens": 87363602.0,
|
|
"step": 38115
|
|
},
|
|
{
|
|
"entropy": 5.055398654937744,
|
|
"epoch": 3.6618635926993277,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000370759391355927,
|
|
"loss": 4.6029,
|
|
"mean_token_accuracy": 0.25368872582912444,
|
|
"num_tokens": 87375127.0,
|
|
"step": 38120
|
|
},
|
|
{
|
|
"entropy": 5.07559027671814,
|
|
"epoch": 3.6623439000960616,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003707283666206464,
|
|
"loss": 4.6535,
|
|
"mean_token_accuracy": 0.2533215761184692,
|
|
"num_tokens": 87386436.0,
|
|
"step": 38125
|
|
},
|
|
{
|
|
"entropy": 5.006677293777466,
|
|
"epoch": 3.6628242074927955,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003706973396628373,
|
|
"loss": 4.5919,
|
|
"mean_token_accuracy": 0.2590599596500397,
|
|
"num_tokens": 87399584.0,
|
|
"step": 38130
|
|
},
|
|
{
|
|
"entropy": 5.066027879714966,
|
|
"epoch": 3.6633045148895294,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003706663104832202,
|
|
"loss": 4.6362,
|
|
"mean_token_accuracy": 0.2495490148663521,
|
|
"num_tokens": 87410244.0,
|
|
"step": 38135
|
|
},
|
|
{
|
|
"entropy": 5.006075859069824,
|
|
"epoch": 3.6637848222862632,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00037063527908251536,
|
|
"loss": 4.5228,
|
|
"mean_token_accuracy": 0.2595680743455887,
|
|
"num_tokens": 87421907.0,
|
|
"step": 38140
|
|
},
|
|
{
|
|
"entropy": 4.966068506240845,
|
|
"epoch": 3.664265129682997,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003706042454614434,
|
|
"loss": 4.4632,
|
|
"mean_token_accuracy": 0.2666603162884712,
|
|
"num_tokens": 87432202.0,
|
|
"step": 38145
|
|
},
|
|
{
|
|
"entropy": 4.918987035751343,
|
|
"epoch": 3.664745437079731,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003705732096207247,
|
|
"loss": 4.5025,
|
|
"mean_token_accuracy": 0.255435086786747,
|
|
"num_tokens": 87443460.0,
|
|
"step": 38150
|
|
},
|
|
{
|
|
"entropy": 4.957027578353882,
|
|
"epoch": 3.665225744476465,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00037054217156107987,
|
|
"loss": 4.5375,
|
|
"mean_token_accuracy": 0.2564763277769089,
|
|
"num_tokens": 87455900.0,
|
|
"step": 38155
|
|
},
|
|
{
|
|
"entropy": 5.076488351821899,
|
|
"epoch": 3.6657060518731988,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00037051113128322953,
|
|
"loss": 4.6059,
|
|
"mean_token_accuracy": 0.2509634405374527,
|
|
"num_tokens": 87467593.0,
|
|
"step": 38160
|
|
},
|
|
{
|
|
"entropy": 4.997862529754639,
|
|
"epoch": 3.6661863592699326,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00037048008878789437,
|
|
"loss": 4.575,
|
|
"mean_token_accuracy": 0.24982138872146606,
|
|
"num_tokens": 87479230.0,
|
|
"step": 38165
|
|
},
|
|
{
|
|
"entropy": 4.975278902053833,
|
|
"epoch": 3.6666666666666665,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00037044904407579506,
|
|
"loss": 4.5469,
|
|
"mean_token_accuracy": 0.2633537486195564,
|
|
"num_tokens": 87489777.0,
|
|
"step": 38170
|
|
},
|
|
{
|
|
"entropy": 5.042837619781494,
|
|
"epoch": 3.6671469740634004,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00037041799714765226,
|
|
"loss": 4.6771,
|
|
"mean_token_accuracy": 0.24492055028676987,
|
|
"num_tokens": 87501202.0,
|
|
"step": 38175
|
|
},
|
|
{
|
|
"entropy": 5.054458522796631,
|
|
"epoch": 3.6676272814601347,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000370386948004187,
|
|
"loss": 4.5852,
|
|
"mean_token_accuracy": 0.2559574767947197,
|
|
"num_tokens": 87512684.0,
|
|
"step": 38180
|
|
},
|
|
{
|
|
"entropy": 5.0857078552246096,
|
|
"epoch": 3.668107588856868,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00037035589664612004,
|
|
"loss": 4.5987,
|
|
"mean_token_accuracy": 0.25611245036125185,
|
|
"num_tokens": 87525905.0,
|
|
"step": 38185
|
|
},
|
|
{
|
|
"entropy": 4.999360609054565,
|
|
"epoch": 3.6685878962536025,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003703248430741723,
|
|
"loss": 4.5565,
|
|
"mean_token_accuracy": 0.25266881138086317,
|
|
"num_tokens": 87536764.0,
|
|
"step": 38190
|
|
},
|
|
{
|
|
"entropy": 5.061944913864136,
|
|
"epoch": 3.6690682036503364,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003702937872890647,
|
|
"loss": 4.6354,
|
|
"mean_token_accuracy": 0.24819146245718002,
|
|
"num_tokens": 87548807.0,
|
|
"step": 38195
|
|
},
|
|
{
|
|
"entropy": 4.8930505275726315,
|
|
"epoch": 3.6695485110470702,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003702627292915184,
|
|
"loss": 4.4641,
|
|
"mean_token_accuracy": 0.2618598282337189,
|
|
"num_tokens": 87559570.0,
|
|
"step": 38200
|
|
},
|
|
{
|
|
"entropy": 5.071514749526978,
|
|
"epoch": 3.670028818443804,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003702316690822544,
|
|
"loss": 4.6463,
|
|
"mean_token_accuracy": 0.25456818342208865,
|
|
"num_tokens": 87570758.0,
|
|
"step": 38205
|
|
},
|
|
{
|
|
"entropy": 4.977432918548584,
|
|
"epoch": 3.670509125840538,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00037020060666199386,
|
|
"loss": 4.4799,
|
|
"mean_token_accuracy": 0.2540786027908325,
|
|
"num_tokens": 87581985.0,
|
|
"step": 38210
|
|
},
|
|
{
|
|
"entropy": 4.961861658096313,
|
|
"epoch": 3.670989433237272,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0003701695420314578,
|
|
"loss": 4.579,
|
|
"mean_token_accuracy": 0.25066443383693693,
|
|
"num_tokens": 87595130.0,
|
|
"step": 38215
|
|
},
|
|
{
|
|
"entropy": 5.016462182998657,
|
|
"epoch": 3.6714697406340058,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00037013847519136765,
|
|
"loss": 4.6115,
|
|
"mean_token_accuracy": 0.2471124842762947,
|
|
"num_tokens": 87608073.0,
|
|
"step": 38220
|
|
},
|
|
{
|
|
"entropy": 4.917756986618042,
|
|
"epoch": 3.6719500480307397,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003701074061424446,
|
|
"loss": 4.4637,
|
|
"mean_token_accuracy": 0.26785528361797334,
|
|
"num_tokens": 87618889.0,
|
|
"step": 38225
|
|
},
|
|
{
|
|
"entropy": 4.983197259902954,
|
|
"epoch": 3.6724303554274735,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00037007633488541,
|
|
"loss": 4.5188,
|
|
"mean_token_accuracy": 0.2600109174847603,
|
|
"num_tokens": 87630386.0,
|
|
"step": 38230
|
|
},
|
|
{
|
|
"entropy": 5.058142185211182,
|
|
"epoch": 3.6729106628242074,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00037004526142098517,
|
|
"loss": 4.6558,
|
|
"mean_token_accuracy": 0.24589847922325134,
|
|
"num_tokens": 87642761.0,
|
|
"step": 38235
|
|
},
|
|
{
|
|
"entropy": 5.04865403175354,
|
|
"epoch": 3.6733909702209413,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00037001418574989165,
|
|
"loss": 4.6112,
|
|
"mean_token_accuracy": 0.24802527874708175,
|
|
"num_tokens": 87653661.0,
|
|
"step": 38240
|
|
},
|
|
{
|
|
"entropy": 4.989436912536621,
|
|
"epoch": 3.673871277617675,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036998310787285084,
|
|
"loss": 4.5763,
|
|
"mean_token_accuracy": 0.26005816608667376,
|
|
"num_tokens": 87665025.0,
|
|
"step": 38245
|
|
},
|
|
{
|
|
"entropy": 5.055676603317261,
|
|
"epoch": 3.674351585014409,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003699520277905843,
|
|
"loss": 4.5881,
|
|
"mean_token_accuracy": 0.2589236617088318,
|
|
"num_tokens": 87676349.0,
|
|
"step": 38250
|
|
},
|
|
{
|
|
"entropy": 4.956277847290039,
|
|
"epoch": 3.6748318924111434,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003699209455038137,
|
|
"loss": 4.4383,
|
|
"mean_token_accuracy": 0.26954206377267836,
|
|
"num_tokens": 87688149.0,
|
|
"step": 38255
|
|
},
|
|
{
|
|
"entropy": 4.994684839248658,
|
|
"epoch": 3.675312199807877,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00036988986101326056,
|
|
"loss": 4.5269,
|
|
"mean_token_accuracy": 0.2567762568593025,
|
|
"num_tokens": 87699323.0,
|
|
"step": 38260
|
|
},
|
|
{
|
|
"entropy": 5.040358161926269,
|
|
"epoch": 3.675792507204611,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003698587743196465,
|
|
"loss": 4.5978,
|
|
"mean_token_accuracy": 0.2527589723467827,
|
|
"num_tokens": 87709994.0,
|
|
"step": 38265
|
|
},
|
|
{
|
|
"entropy": 4.976486873626709,
|
|
"epoch": 3.6762728146013446,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00036982768542369353,
|
|
"loss": 4.6008,
|
|
"mean_token_accuracy": 0.255145925283432,
|
|
"num_tokens": 87721979.0,
|
|
"step": 38270
|
|
},
|
|
{
|
|
"entropy": 5.023662328720093,
|
|
"epoch": 3.676753121998079,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036979659432612324,
|
|
"loss": 4.5564,
|
|
"mean_token_accuracy": 0.25610930323600767,
|
|
"num_tokens": 87732194.0,
|
|
"step": 38275
|
|
},
|
|
{
|
|
"entropy": 4.937100028991699,
|
|
"epoch": 3.677233429394813,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003697655010276575,
|
|
"loss": 4.4535,
|
|
"mean_token_accuracy": 0.26346102058887483,
|
|
"num_tokens": 87742736.0,
|
|
"step": 38280
|
|
},
|
|
{
|
|
"entropy": 5.022895383834839,
|
|
"epoch": 3.6777137367915467,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003697344055290182,
|
|
"loss": 4.5766,
|
|
"mean_token_accuracy": 0.25482263416051865,
|
|
"num_tokens": 87753557.0,
|
|
"step": 38285
|
|
},
|
|
{
|
|
"entropy": 4.986739301681519,
|
|
"epoch": 3.6781940441882806,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0003697033078309274,
|
|
"loss": 4.6288,
|
|
"mean_token_accuracy": 0.25096924006938937,
|
|
"num_tokens": 87766585.0,
|
|
"step": 38290
|
|
},
|
|
{
|
|
"entropy": 5.01342658996582,
|
|
"epoch": 3.6786743515850144,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036967220793410687,
|
|
"loss": 4.6185,
|
|
"mean_token_accuracy": 0.24985367357730864,
|
|
"num_tokens": 87777679.0,
|
|
"step": 38295
|
|
},
|
|
{
|
|
"entropy": 4.968974685668945,
|
|
"epoch": 3.6791546589817483,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003696411058392789,
|
|
"loss": 4.5265,
|
|
"mean_token_accuracy": 0.26330266147851944,
|
|
"num_tokens": 87790083.0,
|
|
"step": 38300
|
|
},
|
|
{
|
|
"entropy": 4.982248306274414,
|
|
"epoch": 3.679634966378482,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003696100015471654,
|
|
"loss": 4.5182,
|
|
"mean_token_accuracy": 0.26220402419567107,
|
|
"num_tokens": 87802293.0,
|
|
"step": 38305
|
|
},
|
|
{
|
|
"entropy": 4.954427433013916,
|
|
"epoch": 3.680115273775216,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003695788950584886,
|
|
"loss": 4.5411,
|
|
"mean_token_accuracy": 0.2581960201263428,
|
|
"num_tokens": 87813367.0,
|
|
"step": 38310
|
|
},
|
|
{
|
|
"entropy": 4.995804500579834,
|
|
"epoch": 3.68059558117195,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036954778637397084,
|
|
"loss": 4.5702,
|
|
"mean_token_accuracy": 0.2558188125491142,
|
|
"num_tokens": 87824669.0,
|
|
"step": 38315
|
|
},
|
|
{
|
|
"entropy": 5.061767673492431,
|
|
"epoch": 3.681075888568684,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00036951667549433404,
|
|
"loss": 4.7132,
|
|
"mean_token_accuracy": 0.24947847723960875,
|
|
"num_tokens": 87836728.0,
|
|
"step": 38320
|
|
},
|
|
{
|
|
"entropy": 5.108877420425415,
|
|
"epoch": 3.6815561959654177,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00036948556242030083,
|
|
"loss": 4.6838,
|
|
"mean_token_accuracy": 0.24656410068273543,
|
|
"num_tokens": 87848391.0,
|
|
"step": 38325
|
|
},
|
|
{
|
|
"entropy": 5.110084104537964,
|
|
"epoch": 3.682036503362152,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00036945444715259334,
|
|
"loss": 4.6285,
|
|
"mean_token_accuracy": 0.26029346138238907,
|
|
"num_tokens": 87860837.0,
|
|
"step": 38330
|
|
},
|
|
{
|
|
"entropy": 5.031927967071534,
|
|
"epoch": 3.6825168107588855,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003694233296919341,
|
|
"loss": 4.6339,
|
|
"mean_token_accuracy": 0.2474737659096718,
|
|
"num_tokens": 87873766.0,
|
|
"step": 38335
|
|
},
|
|
{
|
|
"entropy": 5.0672790050506595,
|
|
"epoch": 3.68299711815562,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00036939221003904547,
|
|
"loss": 4.5962,
|
|
"mean_token_accuracy": 0.26070495545864103,
|
|
"num_tokens": 87885151.0,
|
|
"step": 38340
|
|
},
|
|
{
|
|
"entropy": 4.892748641967773,
|
|
"epoch": 3.6834774255523532,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00036936108819465006,
|
|
"loss": 4.4735,
|
|
"mean_token_accuracy": 0.2662238538265228,
|
|
"num_tokens": 87896967.0,
|
|
"step": 38345
|
|
},
|
|
{
|
|
"entropy": 5.003177261352539,
|
|
"epoch": 3.6839577329490876,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036932996415947047,
|
|
"loss": 4.5713,
|
|
"mean_token_accuracy": 0.24626848250627517,
|
|
"num_tokens": 87908392.0,
|
|
"step": 38350
|
|
},
|
|
{
|
|
"entropy": 5.121570825576782,
|
|
"epoch": 3.6844380403458215,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00036929883793422906,
|
|
"loss": 4.7075,
|
|
"mean_token_accuracy": 0.24963598996400832,
|
|
"num_tokens": 87920409.0,
|
|
"step": 38355
|
|
},
|
|
{
|
|
"entropy": 5.048176956176758,
|
|
"epoch": 3.6849183477425553,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003692677095196487,
|
|
"loss": 4.6221,
|
|
"mean_token_accuracy": 0.25305686742067335,
|
|
"num_tokens": 87931609.0,
|
|
"step": 38360
|
|
},
|
|
{
|
|
"entropy": 5.110896825790405,
|
|
"epoch": 3.685398655139289,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000369236578916452,
|
|
"loss": 4.7023,
|
|
"mean_token_accuracy": 0.23258274644613267,
|
|
"num_tokens": 87943599.0,
|
|
"step": 38365
|
|
},
|
|
{
|
|
"entropy": 5.087920618057251,
|
|
"epoch": 3.685878962536023,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00036920544612536186,
|
|
"loss": 4.6025,
|
|
"mean_token_accuracy": 0.2509204104542732,
|
|
"num_tokens": 87953659.0,
|
|
"step": 38370
|
|
},
|
|
{
|
|
"entropy": 5.070183134078979,
|
|
"epoch": 3.686359269932757,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003691743111471009,
|
|
"loss": 4.6772,
|
|
"mean_token_accuracy": 0.24249487668275832,
|
|
"num_tokens": 87966174.0,
|
|
"step": 38375
|
|
},
|
|
{
|
|
"entropy": 5.0342730522155765,
|
|
"epoch": 3.686839577329491,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003691431739823922,
|
|
"loss": 4.5876,
|
|
"mean_token_accuracy": 0.24868707358837128,
|
|
"num_tokens": 87978804.0,
|
|
"step": 38380
|
|
},
|
|
{
|
|
"entropy": 5.103684902191162,
|
|
"epoch": 3.6873198847262247,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00036911203463195843,
|
|
"loss": 4.612,
|
|
"mean_token_accuracy": 0.2533311083912849,
|
|
"num_tokens": 87989807.0,
|
|
"step": 38385
|
|
},
|
|
{
|
|
"entropy": 4.970625352859497,
|
|
"epoch": 3.6878001921229586,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00036908089309652266,
|
|
"loss": 4.5446,
|
|
"mean_token_accuracy": 0.25738618820905684,
|
|
"num_tokens": 88000978.0,
|
|
"step": 38390
|
|
},
|
|
{
|
|
"entropy": 5.055232810974121,
|
|
"epoch": 3.6882804995196925,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000369049749376808,
|
|
"loss": 4.6393,
|
|
"mean_token_accuracy": 0.25031792670488356,
|
|
"num_tokens": 88012421.0,
|
|
"step": 38395
|
|
},
|
|
{
|
|
"entropy": 5.079406118392944,
|
|
"epoch": 3.6887608069164264,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00036901860347353735,
|
|
"loss": 4.6322,
|
|
"mean_token_accuracy": 0.25274730622768404,
|
|
"num_tokens": 88023315.0,
|
|
"step": 38400
|
|
},
|
|
{
|
|
"entropy": 5.003723192214966,
|
|
"epoch": 3.6892411143131603,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000368987455387434,
|
|
"loss": 4.5931,
|
|
"mean_token_accuracy": 0.25960519164800644,
|
|
"num_tokens": 88035368.0,
|
|
"step": 38405
|
|
},
|
|
{
|
|
"entropy": 4.95863447189331,
|
|
"epoch": 3.689721421709894,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003689563051192209,
|
|
"loss": 4.5742,
|
|
"mean_token_accuracy": 0.2568479418754578,
|
|
"num_tokens": 88045809.0,
|
|
"step": 38410
|
|
},
|
|
{
|
|
"entropy": 5.092925882339477,
|
|
"epoch": 3.6902017291066285,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00036892515266962145,
|
|
"loss": 4.6028,
|
|
"mean_token_accuracy": 0.2560404971241951,
|
|
"num_tokens": 88056456.0,
|
|
"step": 38415
|
|
},
|
|
{
|
|
"entropy": 5.060942697525024,
|
|
"epoch": 3.690682036503362,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003688939980393588,
|
|
"loss": 4.6041,
|
|
"mean_token_accuracy": 0.25362819582223894,
|
|
"num_tokens": 88069387.0,
|
|
"step": 38420
|
|
},
|
|
{
|
|
"entropy": 5.079400587081909,
|
|
"epoch": 3.6911623439000962,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003688628412291564,
|
|
"loss": 4.5997,
|
|
"mean_token_accuracy": 0.24915609657764434,
|
|
"num_tokens": 88079393.0,
|
|
"step": 38425
|
|
},
|
|
{
|
|
"entropy": 5.087055206298828,
|
|
"epoch": 3.69164265129683,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00036883168223973754,
|
|
"loss": 4.6297,
|
|
"mean_token_accuracy": 0.24831784814596175,
|
|
"num_tokens": 88089151.0,
|
|
"step": 38430
|
|
},
|
|
{
|
|
"entropy": 5.059953784942627,
|
|
"epoch": 3.692122958693564,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003688005210718257,
|
|
"loss": 4.5773,
|
|
"mean_token_accuracy": 0.2521321356296539,
|
|
"num_tokens": 88100497.0,
|
|
"step": 38435
|
|
},
|
|
{
|
|
"entropy": 5.110360717773437,
|
|
"epoch": 3.692603266090298,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003687693577261442,
|
|
"loss": 4.6707,
|
|
"mean_token_accuracy": 0.24471357166767121,
|
|
"num_tokens": 88112818.0,
|
|
"step": 38440
|
|
},
|
|
{
|
|
"entropy": 5.094632291793824,
|
|
"epoch": 3.6930835734870318,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00036873819220341667,
|
|
"loss": 4.6449,
|
|
"mean_token_accuracy": 0.2566545203328133,
|
|
"num_tokens": 88123128.0,
|
|
"step": 38445
|
|
},
|
|
{
|
|
"entropy": 4.956731224060059,
|
|
"epoch": 3.6935638808837656,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00036870702450436655,
|
|
"loss": 4.5809,
|
|
"mean_token_accuracy": 0.2597784236073494,
|
|
"num_tokens": 88136062.0,
|
|
"step": 38450
|
|
},
|
|
{
|
|
"entropy": 5.070009374618531,
|
|
"epoch": 3.6940441882804995,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036867585462971775,
|
|
"loss": 4.6422,
|
|
"mean_token_accuracy": 0.2469232514500618,
|
|
"num_tokens": 88147234.0,
|
|
"step": 38455
|
|
},
|
|
{
|
|
"entropy": 4.998454618453979,
|
|
"epoch": 3.6945244956772334,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003686446825801937,
|
|
"loss": 4.5518,
|
|
"mean_token_accuracy": 0.25606195330619813,
|
|
"num_tokens": 88158537.0,
|
|
"step": 38460
|
|
},
|
|
{
|
|
"entropy": 5.047478055953979,
|
|
"epoch": 3.6950048030739673,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003686135083565181,
|
|
"loss": 4.557,
|
|
"mean_token_accuracy": 0.24938059896230697,
|
|
"num_tokens": 88169652.0,
|
|
"step": 38465
|
|
},
|
|
{
|
|
"entropy": 5.07486138343811,
|
|
"epoch": 3.695485110470701,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00036858233195941493,
|
|
"loss": 4.662,
|
|
"mean_token_accuracy": 0.24607723355293273,
|
|
"num_tokens": 88181919.0,
|
|
"step": 38470
|
|
},
|
|
{
|
|
"entropy": 4.98467984199524,
|
|
"epoch": 3.695965417867435,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003685511533896079,
|
|
"loss": 4.5346,
|
|
"mean_token_accuracy": 0.2617945775389671,
|
|
"num_tokens": 88194070.0,
|
|
"step": 38475
|
|
},
|
|
{
|
|
"entropy": 5.056523180007934,
|
|
"epoch": 3.696445725264169,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00036851997264782083,
|
|
"loss": 4.5857,
|
|
"mean_token_accuracy": 0.25054556131362915,
|
|
"num_tokens": 88206410.0,
|
|
"step": 38480
|
|
},
|
|
{
|
|
"entropy": 4.9747350215911865,
|
|
"epoch": 3.696926032660903,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00036848878973477767,
|
|
"loss": 4.6229,
|
|
"mean_token_accuracy": 0.2564759626984596,
|
|
"num_tokens": 88218921.0,
|
|
"step": 38485
|
|
},
|
|
{
|
|
"entropy": 5.0612266063690186,
|
|
"epoch": 3.697406340057637,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00036845760465120247,
|
|
"loss": 4.5776,
|
|
"mean_token_accuracy": 0.257583013176918,
|
|
"num_tokens": 88231026.0,
|
|
"step": 38490
|
|
},
|
|
{
|
|
"entropy": 5.165704584121704,
|
|
"epoch": 3.6978866474543706,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003684264173978192,
|
|
"loss": 4.6848,
|
|
"mean_token_accuracy": 0.2534632384777069,
|
|
"num_tokens": 88243731.0,
|
|
"step": 38495
|
|
},
|
|
{
|
|
"entropy": 4.987319850921631,
|
|
"epoch": 3.698366954851105,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003683952279753519,
|
|
"loss": 4.5296,
|
|
"mean_token_accuracy": 0.2613707512617111,
|
|
"num_tokens": 88254890.0,
|
|
"step": 38500
|
|
},
|
|
{
|
|
"entropy": 5.040547180175781,
|
|
"epoch": 3.6988472622478388,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00036836403638452477,
|
|
"loss": 4.6069,
|
|
"mean_token_accuracy": 0.244896000623703,
|
|
"num_tokens": 88267169.0,
|
|
"step": 38505
|
|
},
|
|
{
|
|
"entropy": 5.118547153472901,
|
|
"epoch": 3.6993275696445727,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003683328426260619,
|
|
"loss": 4.7233,
|
|
"mean_token_accuracy": 0.24412633180618287,
|
|
"num_tokens": 88278635.0,
|
|
"step": 38510
|
|
},
|
|
{
|
|
"entropy": 5.0479930400848385,
|
|
"epoch": 3.6998078770413065,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00036830164670068754,
|
|
"loss": 4.5571,
|
|
"mean_token_accuracy": 0.25922676771879194,
|
|
"num_tokens": 88290155.0,
|
|
"step": 38515
|
|
},
|
|
{
|
|
"entropy": 5.059586048126221,
|
|
"epoch": 3.7002881844380404,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036827044860912605,
|
|
"loss": 4.5959,
|
|
"mean_token_accuracy": 0.25159293711185454,
|
|
"num_tokens": 88301250.0,
|
|
"step": 38520
|
|
},
|
|
{
|
|
"entropy": 5.031854867935181,
|
|
"epoch": 3.7007684918347743,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003682392483521017,
|
|
"loss": 4.6109,
|
|
"mean_token_accuracy": 0.25665538012981415,
|
|
"num_tokens": 88312214.0,
|
|
"step": 38525
|
|
},
|
|
{
|
|
"entropy": 5.009376335144043,
|
|
"epoch": 3.701248799231508,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00036820804593033893,
|
|
"loss": 4.6344,
|
|
"mean_token_accuracy": 0.25211857408285143,
|
|
"num_tokens": 88322793.0,
|
|
"step": 38530
|
|
},
|
|
{
|
|
"entropy": 5.020911502838135,
|
|
"epoch": 3.701729106628242,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000368176841344562,
|
|
"loss": 4.5879,
|
|
"mean_token_accuracy": 0.25820344388484956,
|
|
"num_tokens": 88335032.0,
|
|
"step": 38535
|
|
},
|
|
{
|
|
"entropy": 5.041450357437133,
|
|
"epoch": 3.702209414024976,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00036814563459549557,
|
|
"loss": 4.5788,
|
|
"mean_token_accuracy": 0.2559799775481224,
|
|
"num_tokens": 88348186.0,
|
|
"step": 38540
|
|
},
|
|
{
|
|
"entropy": 5.144108247756958,
|
|
"epoch": 3.70268972142171,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00036811442568386403,
|
|
"loss": 4.643,
|
|
"mean_token_accuracy": 0.24660568535327912,
|
|
"num_tokens": 88358117.0,
|
|
"step": 38545
|
|
},
|
|
{
|
|
"entropy": 5.115303230285645,
|
|
"epoch": 3.7031700288184437,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00036808321461039207,
|
|
"loss": 4.7166,
|
|
"mean_token_accuracy": 0.24422077983617782,
|
|
"num_tokens": 88368606.0,
|
|
"step": 38550
|
|
},
|
|
{
|
|
"entropy": 5.057606983184814,
|
|
"epoch": 3.7036503362151776,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003680520013758042,
|
|
"loss": 4.6054,
|
|
"mean_token_accuracy": 0.24858625382184982,
|
|
"num_tokens": 88379676.0,
|
|
"step": 38555
|
|
},
|
|
{
|
|
"entropy": 5.067380380630493,
|
|
"epoch": 3.7041306436119115,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036802078598082523,
|
|
"loss": 4.6385,
|
|
"mean_token_accuracy": 0.24942210018634797,
|
|
"num_tokens": 88390824.0,
|
|
"step": 38560
|
|
},
|
|
{
|
|
"entropy": 5.07769250869751,
|
|
"epoch": 3.704610951008646,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003679895684261798,
|
|
"loss": 4.6184,
|
|
"mean_token_accuracy": 0.25372184365987777,
|
|
"num_tokens": 88401783.0,
|
|
"step": 38565
|
|
},
|
|
{
|
|
"entropy": 5.119526958465576,
|
|
"epoch": 3.7050912584053792,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003679583487125927,
|
|
"loss": 4.6783,
|
|
"mean_token_accuracy": 0.24981543719768523,
|
|
"num_tokens": 88414366.0,
|
|
"step": 38570
|
|
},
|
|
{
|
|
"entropy": 5.043867683410644,
|
|
"epoch": 3.7055715658021136,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003679271268407888,
|
|
"loss": 4.6853,
|
|
"mean_token_accuracy": 0.24115570336580278,
|
|
"num_tokens": 88425458.0,
|
|
"step": 38575
|
|
},
|
|
{
|
|
"entropy": 5.113823175430298,
|
|
"epoch": 3.706051873198847,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003678959028114929,
|
|
"loss": 4.6858,
|
|
"mean_token_accuracy": 0.24453675746917725,
|
|
"num_tokens": 88435766.0,
|
|
"step": 38580
|
|
},
|
|
{
|
|
"entropy": 5.060293769836425,
|
|
"epoch": 3.7065321805955813,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00036786467662543,
|
|
"loss": 4.6028,
|
|
"mean_token_accuracy": 0.2538406327366829,
|
|
"num_tokens": 88447807.0,
|
|
"step": 38585
|
|
},
|
|
{
|
|
"entropy": 5.03508415222168,
|
|
"epoch": 3.707012487992315,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003678334482833251,
|
|
"loss": 4.6066,
|
|
"mean_token_accuracy": 0.2579200237989426,
|
|
"num_tokens": 88459248.0,
|
|
"step": 38590
|
|
},
|
|
{
|
|
"entropy": 5.026467561721802,
|
|
"epoch": 3.707492795389049,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00036780221778590316,
|
|
"loss": 4.5098,
|
|
"mean_token_accuracy": 0.2620193034410477,
|
|
"num_tokens": 88471094.0,
|
|
"step": 38595
|
|
},
|
|
{
|
|
"entropy": 5.08675971031189,
|
|
"epoch": 3.707973102785783,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003677709851338893,
|
|
"loss": 4.6666,
|
|
"mean_token_accuracy": 0.2549469769001007,
|
|
"num_tokens": 88482154.0,
|
|
"step": 38600
|
|
},
|
|
{
|
|
"entropy": 5.055421972274781,
|
|
"epoch": 3.708453410182517,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036773975032800867,
|
|
"loss": 4.5705,
|
|
"mean_token_accuracy": 0.2574524343013763,
|
|
"num_tokens": 88493267.0,
|
|
"step": 38605
|
|
},
|
|
{
|
|
"entropy": 5.029069423675537,
|
|
"epoch": 3.7089337175792507,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003677085133689864,
|
|
"loss": 4.6161,
|
|
"mean_token_accuracy": 0.25548227280378344,
|
|
"num_tokens": 88505510.0,
|
|
"step": 38610
|
|
},
|
|
{
|
|
"entropy": 5.007560062408447,
|
|
"epoch": 3.7094140249759846,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00036767727425754767,
|
|
"loss": 4.568,
|
|
"mean_token_accuracy": 0.2545416682958603,
|
|
"num_tokens": 88516657.0,
|
|
"step": 38615
|
|
},
|
|
{
|
|
"entropy": 5.077832984924316,
|
|
"epoch": 3.7098943323727185,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003676460329944179,
|
|
"loss": 4.6387,
|
|
"mean_token_accuracy": 0.2518074452877045,
|
|
"num_tokens": 88527470.0,
|
|
"step": 38620
|
|
},
|
|
{
|
|
"entropy": 5.142323207855225,
|
|
"epoch": 3.7103746397694524,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003676147895803223,
|
|
"loss": 4.7035,
|
|
"mean_token_accuracy": 0.24838360995054246,
|
|
"num_tokens": 88539267.0,
|
|
"step": 38625
|
|
},
|
|
{
|
|
"entropy": 4.954944801330567,
|
|
"epoch": 3.7108549471661862,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003675835440159863,
|
|
"loss": 4.4998,
|
|
"mean_token_accuracy": 0.26381567269563677,
|
|
"num_tokens": 88551466.0,
|
|
"step": 38630
|
|
},
|
|
{
|
|
"entropy": 4.955819225311279,
|
|
"epoch": 3.71133525456292,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003675522963021354,
|
|
"loss": 4.5456,
|
|
"mean_token_accuracy": 0.26012705117464063,
|
|
"num_tokens": 88563991.0,
|
|
"step": 38635
|
|
},
|
|
{
|
|
"entropy": 5.109346008300781,
|
|
"epoch": 3.7118155619596545,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00036752104643949493,
|
|
"loss": 4.7278,
|
|
"mean_token_accuracy": 0.23841619938611985,
|
|
"num_tokens": 88574308.0,
|
|
"step": 38640
|
|
},
|
|
{
|
|
"entropy": 5.129747772216797,
|
|
"epoch": 3.712295869356388,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003674897944287904,
|
|
"loss": 4.6689,
|
|
"mean_token_accuracy": 0.24755637496709823,
|
|
"num_tokens": 88585510.0,
|
|
"step": 38645
|
|
},
|
|
{
|
|
"entropy": 4.982839488983155,
|
|
"epoch": 3.712776176753122,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003674585402707477,
|
|
"loss": 4.4918,
|
|
"mean_token_accuracy": 0.26452442854642866,
|
|
"num_tokens": 88596575.0,
|
|
"step": 38650
|
|
},
|
|
{
|
|
"entropy": 5.101427745819092,
|
|
"epoch": 3.7132564841498557,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003674272839660921,
|
|
"loss": 4.7279,
|
|
"mean_token_accuracy": 0.24416145533323289,
|
|
"num_tokens": 88608549.0,
|
|
"step": 38655
|
|
},
|
|
{
|
|
"entropy": 4.98401517868042,
|
|
"epoch": 3.71373679154659,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00036739602551554945,
|
|
"loss": 4.5391,
|
|
"mean_token_accuracy": 0.26456278562545776,
|
|
"num_tokens": 88620611.0,
|
|
"step": 38660
|
|
},
|
|
{
|
|
"entropy": 5.108675718307495,
|
|
"epoch": 3.714217098943324,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003673647649198454,
|
|
"loss": 4.6722,
|
|
"mean_token_accuracy": 0.2468302518129349,
|
|
"num_tokens": 88630600.0,
|
|
"step": 38665
|
|
},
|
|
{
|
|
"entropy": 5.006364917755127,
|
|
"epoch": 3.7146974063400577,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00036733350217970593,
|
|
"loss": 4.5275,
|
|
"mean_token_accuracy": 0.2586881920695305,
|
|
"num_tokens": 88642846.0,
|
|
"step": 38670
|
|
},
|
|
{
|
|
"entropy": 5.03388991355896,
|
|
"epoch": 3.7151777137367916,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00036730223729585654,
|
|
"loss": 4.6031,
|
|
"mean_token_accuracy": 0.2544480308890343,
|
|
"num_tokens": 88654981.0,
|
|
"step": 38675
|
|
},
|
|
{
|
|
"entropy": 5.127859306335449,
|
|
"epoch": 3.7156580211335255,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003672709702690234,
|
|
"loss": 4.6688,
|
|
"mean_token_accuracy": 0.24729857742786407,
|
|
"num_tokens": 88666737.0,
|
|
"step": 38680
|
|
},
|
|
{
|
|
"entropy": 5.038737154006958,
|
|
"epoch": 3.7161383285302594,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00036723970109993223,
|
|
"loss": 4.5561,
|
|
"mean_token_accuracy": 0.2608724147081375,
|
|
"num_tokens": 88678037.0,
|
|
"step": 38685
|
|
},
|
|
{
|
|
"entropy": 5.038786697387695,
|
|
"epoch": 3.7166186359269933,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003672084297893091,
|
|
"loss": 4.5677,
|
|
"mean_token_accuracy": 0.2536904290318489,
|
|
"num_tokens": 88689215.0,
|
|
"step": 38690
|
|
},
|
|
{
|
|
"entropy": 4.919533157348633,
|
|
"epoch": 3.717098943323727,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00036717715633788014,
|
|
"loss": 4.5526,
|
|
"mean_token_accuracy": 0.2595666363835335,
|
|
"num_tokens": 88701848.0,
|
|
"step": 38695
|
|
},
|
|
{
|
|
"entropy": 4.986253356933593,
|
|
"epoch": 3.717579250720461,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00036714588074637116,
|
|
"loss": 4.4944,
|
|
"mean_token_accuracy": 0.2631195545196533,
|
|
"num_tokens": 88712745.0,
|
|
"step": 38700
|
|
},
|
|
{
|
|
"entropy": 4.987126874923706,
|
|
"epoch": 3.718059558117195,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003671146030155086,
|
|
"loss": 4.4903,
|
|
"mean_token_accuracy": 0.2606608927249908,
|
|
"num_tokens": 88725048.0,
|
|
"step": 38705
|
|
},
|
|
{
|
|
"entropy": 4.962274074554443,
|
|
"epoch": 3.718539865513929,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003670833231460184,
|
|
"loss": 4.5496,
|
|
"mean_token_accuracy": 0.2572333350777626,
|
|
"num_tokens": 88736218.0,
|
|
"step": 38710
|
|
},
|
|
{
|
|
"entropy": 5.073811960220337,
|
|
"epoch": 3.7190201729106627,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00036705204113862683,
|
|
"loss": 4.7041,
|
|
"mean_token_accuracy": 0.24912975281476973,
|
|
"num_tokens": 88747967.0,
|
|
"step": 38715
|
|
},
|
|
{
|
|
"entropy": 5.066172122955322,
|
|
"epoch": 3.7195004803073966,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003670207569940603,
|
|
"loss": 4.566,
|
|
"mean_token_accuracy": 0.25792714953422546,
|
|
"num_tokens": 88759565.0,
|
|
"step": 38720
|
|
},
|
|
{
|
|
"entropy": 4.909289121627808,
|
|
"epoch": 3.719980787704131,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00036698947071304495,
|
|
"loss": 4.434,
|
|
"mean_token_accuracy": 0.2678888335824013,
|
|
"num_tokens": 88771153.0,
|
|
"step": 38725
|
|
},
|
|
{
|
|
"entropy": 5.043943977355957,
|
|
"epoch": 3.7204610951008643,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036695818229630735,
|
|
"loss": 4.6897,
|
|
"mean_token_accuracy": 0.25080784857273103,
|
|
"num_tokens": 88782615.0,
|
|
"step": 38730
|
|
},
|
|
{
|
|
"entropy": 5.042867183685303,
|
|
"epoch": 3.7209414024975986,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003669268917445737,
|
|
"loss": 4.6094,
|
|
"mean_token_accuracy": 0.2547831356525421,
|
|
"num_tokens": 88794054.0,
|
|
"step": 38735
|
|
},
|
|
{
|
|
"entropy": 5.024102640151978,
|
|
"epoch": 3.7214217098943325,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003668955990585706,
|
|
"loss": 4.5851,
|
|
"mean_token_accuracy": 0.2590148538351059,
|
|
"num_tokens": 88804094.0,
|
|
"step": 38740
|
|
},
|
|
{
|
|
"entropy": 5.017012405395508,
|
|
"epoch": 3.7219020172910664,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00036686430423902466,
|
|
"loss": 4.598,
|
|
"mean_token_accuracy": 0.25177251994609834,
|
|
"num_tokens": 88814661.0,
|
|
"step": 38745
|
|
},
|
|
{
|
|
"entropy": 4.981132745742798,
|
|
"epoch": 3.7223823246878003,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003668330072866623,
|
|
"loss": 4.6122,
|
|
"mean_token_accuracy": 0.249338598549366,
|
|
"num_tokens": 88825256.0,
|
|
"step": 38750
|
|
},
|
|
{
|
|
"entropy": 5.044579553604126,
|
|
"epoch": 3.722862632084534,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00036680170820221013,
|
|
"loss": 4.6026,
|
|
"mean_token_accuracy": 0.25479757189750674,
|
|
"num_tokens": 88835914.0,
|
|
"step": 38755
|
|
},
|
|
{
|
|
"entropy": 5.008833312988282,
|
|
"epoch": 3.723342939481268,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000366770406986395,
|
|
"loss": 4.5277,
|
|
"mean_token_accuracy": 0.26011229306459427,
|
|
"num_tokens": 88846847.0,
|
|
"step": 38760
|
|
},
|
|
{
|
|
"entropy": 5.0696423053741455,
|
|
"epoch": 3.723823246878002,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003667391036399435,
|
|
"loss": 4.652,
|
|
"mean_token_accuracy": 0.2519847691059113,
|
|
"num_tokens": 88857899.0,
|
|
"step": 38765
|
|
},
|
|
{
|
|
"entropy": 4.999344301223755,
|
|
"epoch": 3.724303554274736,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003667077981635824,
|
|
"loss": 4.5581,
|
|
"mean_token_accuracy": 0.25986300259828565,
|
|
"num_tokens": 88867972.0,
|
|
"step": 38770
|
|
},
|
|
{
|
|
"entropy": 5.057706212997436,
|
|
"epoch": 3.7247838616714697,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00036667649055803855,
|
|
"loss": 4.6611,
|
|
"mean_token_accuracy": 0.25224111080169676,
|
|
"num_tokens": 88880241.0,
|
|
"step": 38775
|
|
},
|
|
{
|
|
"entropy": 5.109183835983276,
|
|
"epoch": 3.7252641690682036,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00036664518082403876,
|
|
"loss": 4.6349,
|
|
"mean_token_accuracy": 0.25350831896066667,
|
|
"num_tokens": 88891833.0,
|
|
"step": 38780
|
|
},
|
|
{
|
|
"entropy": 4.971796894073487,
|
|
"epoch": 3.7257444764649374,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003666138689623101,
|
|
"loss": 4.5176,
|
|
"mean_token_accuracy": 0.2576854690909386,
|
|
"num_tokens": 88902537.0,
|
|
"step": 38785
|
|
},
|
|
{
|
|
"entropy": 5.00713882446289,
|
|
"epoch": 3.7262247838616713,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00036658255497357936,
|
|
"loss": 4.5958,
|
|
"mean_token_accuracy": 0.2545945480465889,
|
|
"num_tokens": 88914499.0,
|
|
"step": 38790
|
|
},
|
|
{
|
|
"entropy": 5.087353897094727,
|
|
"epoch": 3.726705091258405,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00036655123885857366,
|
|
"loss": 4.6476,
|
|
"mean_token_accuracy": 0.2529897794127464,
|
|
"num_tokens": 88927240.0,
|
|
"step": 38795
|
|
},
|
|
{
|
|
"entropy": 5.010086870193481,
|
|
"epoch": 3.7271853986551395,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00036651992061802007,
|
|
"loss": 4.5741,
|
|
"mean_token_accuracy": 0.25386611074209214,
|
|
"num_tokens": 88939877.0,
|
|
"step": 38800
|
|
},
|
|
{
|
|
"entropy": 4.963146734237671,
|
|
"epoch": 3.727665706051873,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003664886002526457,
|
|
"loss": 4.5386,
|
|
"mean_token_accuracy": 0.26079331934452055,
|
|
"num_tokens": 88952144.0,
|
|
"step": 38805
|
|
},
|
|
{
|
|
"entropy": 5.099475765228272,
|
|
"epoch": 3.7281460134486073,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00036645727776317763,
|
|
"loss": 4.666,
|
|
"mean_token_accuracy": 0.25431815087795256,
|
|
"num_tokens": 88963700.0,
|
|
"step": 38810
|
|
},
|
|
{
|
|
"entropy": 5.1559711456298825,
|
|
"epoch": 3.728626320845341,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003664259531503432,
|
|
"loss": 4.6749,
|
|
"mean_token_accuracy": 0.24484064877033235,
|
|
"num_tokens": 88973744.0,
|
|
"step": 38815
|
|
},
|
|
{
|
|
"entropy": 5.046345472335815,
|
|
"epoch": 3.729106628242075,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003663946264148696,
|
|
"loss": 4.5566,
|
|
"mean_token_accuracy": 0.25666549503803254,
|
|
"num_tokens": 88984506.0,
|
|
"step": 38820
|
|
},
|
|
{
|
|
"entropy": 5.080349397659302,
|
|
"epoch": 3.729586935638809,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003663632975574841,
|
|
"loss": 4.648,
|
|
"mean_token_accuracy": 0.250750894844532,
|
|
"num_tokens": 88995703.0,
|
|
"step": 38825
|
|
},
|
|
{
|
|
"entropy": 4.9521181106567385,
|
|
"epoch": 3.730067243035543,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003663319665789142,
|
|
"loss": 4.5345,
|
|
"mean_token_accuracy": 0.2652203574776649,
|
|
"num_tokens": 89006529.0,
|
|
"step": 38830
|
|
},
|
|
{
|
|
"entropy": 5.02937798500061,
|
|
"epoch": 3.7305475504322767,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003663006334798872,
|
|
"loss": 4.6627,
|
|
"mean_token_accuracy": 0.24915835559368132,
|
|
"num_tokens": 89018555.0,
|
|
"step": 38835
|
|
},
|
|
{
|
|
"entropy": 5.041017627716064,
|
|
"epoch": 3.7310278578290106,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003662692982611306,
|
|
"loss": 4.5399,
|
|
"mean_token_accuracy": 0.26199385672807696,
|
|
"num_tokens": 89029603.0,
|
|
"step": 38840
|
|
},
|
|
{
|
|
"entropy": 5.012571477890015,
|
|
"epoch": 3.7315081652257445,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00036623796092337196,
|
|
"loss": 4.6008,
|
|
"mean_token_accuracy": 0.2522846132516861,
|
|
"num_tokens": 89041560.0,
|
|
"step": 38845
|
|
},
|
|
{
|
|
"entropy": 5.0978460788726805,
|
|
"epoch": 3.7319884726224783,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003662066214673388,
|
|
"loss": 4.6756,
|
|
"mean_token_accuracy": 0.2539414331316948,
|
|
"num_tokens": 89052133.0,
|
|
"step": 38850
|
|
},
|
|
{
|
|
"entropy": 4.956137800216675,
|
|
"epoch": 3.7324687800192122,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003661752798937586,
|
|
"loss": 4.513,
|
|
"mean_token_accuracy": 0.255470035970211,
|
|
"num_tokens": 89064829.0,
|
|
"step": 38855
|
|
},
|
|
{
|
|
"entropy": 5.116791248321533,
|
|
"epoch": 3.732949087415946,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036614393620335923,
|
|
"loss": 4.7128,
|
|
"mean_token_accuracy": 0.24542356580495833,
|
|
"num_tokens": 89076310.0,
|
|
"step": 38860
|
|
},
|
|
{
|
|
"entropy": 5.069875240325928,
|
|
"epoch": 3.73342939481268,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003661125903968683,
|
|
"loss": 4.6283,
|
|
"mean_token_accuracy": 0.24544098526239394,
|
|
"num_tokens": 89088556.0,
|
|
"step": 38865
|
|
},
|
|
{
|
|
"entropy": 4.973018312454224,
|
|
"epoch": 3.733909702209414,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003660812424750135,
|
|
"loss": 4.4711,
|
|
"mean_token_accuracy": 0.2641628310084343,
|
|
"num_tokens": 89100020.0,
|
|
"step": 38870
|
|
},
|
|
{
|
|
"entropy": 4.9373517513275145,
|
|
"epoch": 3.734390009606148,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003660498924385228,
|
|
"loss": 4.4643,
|
|
"mean_token_accuracy": 0.27016140818595885,
|
|
"num_tokens": 89111538.0,
|
|
"step": 38875
|
|
},
|
|
{
|
|
"entropy": 5.026003503799439,
|
|
"epoch": 3.7348703170028816,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00036601854028812394,
|
|
"loss": 4.5994,
|
|
"mean_token_accuracy": 0.2533448651432991,
|
|
"num_tokens": 89122674.0,
|
|
"step": 38880
|
|
},
|
|
{
|
|
"entropy": 5.095046520233154,
|
|
"epoch": 3.735350624399616,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00036598718602454474,
|
|
"loss": 4.6161,
|
|
"mean_token_accuracy": 0.25749556720256805,
|
|
"num_tokens": 89135165.0,
|
|
"step": 38885
|
|
},
|
|
{
|
|
"entropy": 5.012445688247681,
|
|
"epoch": 3.7358309317963494,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003659558296485133,
|
|
"loss": 4.585,
|
|
"mean_token_accuracy": 0.2566659599542618,
|
|
"num_tokens": 89146353.0,
|
|
"step": 38890
|
|
},
|
|
{
|
|
"entropy": 4.99602575302124,
|
|
"epoch": 3.7363112391930837,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036592447116075765,
|
|
"loss": 4.5889,
|
|
"mean_token_accuracy": 0.24548017531633376,
|
|
"num_tokens": 89157589.0,
|
|
"step": 38895
|
|
},
|
|
{
|
|
"entropy": 5.009372282028198,
|
|
"epoch": 3.7367915465898176,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00036589311056200573,
|
|
"loss": 4.5446,
|
|
"mean_token_accuracy": 0.26005731523036957,
|
|
"num_tokens": 89169384.0,
|
|
"step": 38900
|
|
},
|
|
{
|
|
"entropy": 5.028673124313355,
|
|
"epoch": 3.7372718539865515,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003658617478529857,
|
|
"loss": 4.5692,
|
|
"mean_token_accuracy": 0.2518122747540474,
|
|
"num_tokens": 89180443.0,
|
|
"step": 38905
|
|
},
|
|
{
|
|
"entropy": 4.956254291534424,
|
|
"epoch": 3.7377521613832854,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00036583038303442567,
|
|
"loss": 4.5659,
|
|
"mean_token_accuracy": 0.2525232911109924,
|
|
"num_tokens": 89192434.0,
|
|
"step": 38910
|
|
},
|
|
{
|
|
"entropy": 5.064654684066772,
|
|
"epoch": 3.7382324687800192,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00036579901610705377,
|
|
"loss": 4.5812,
|
|
"mean_token_accuracy": 0.25280669033527375,
|
|
"num_tokens": 89203965.0,
|
|
"step": 38915
|
|
},
|
|
{
|
|
"entropy": 4.987227058410644,
|
|
"epoch": 3.738712776176753,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00036576764707159853,
|
|
"loss": 4.5266,
|
|
"mean_token_accuracy": 0.2570886567234993,
|
|
"num_tokens": 89214754.0,
|
|
"step": 38920
|
|
},
|
|
{
|
|
"entropy": 4.932294178009033,
|
|
"epoch": 3.739193083573487,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003657362759287879,
|
|
"loss": 4.504,
|
|
"mean_token_accuracy": 0.2607526838779449,
|
|
"num_tokens": 89226831.0,
|
|
"step": 38925
|
|
},
|
|
{
|
|
"entropy": 5.124674510955811,
|
|
"epoch": 3.739673390970221,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00036570490267935044,
|
|
"loss": 4.6998,
|
|
"mean_token_accuracy": 0.24927456974983214,
|
|
"num_tokens": 89237526.0,
|
|
"step": 38930
|
|
},
|
|
{
|
|
"entropy": 5.021957778930664,
|
|
"epoch": 3.7401536983669548,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036567352732401443,
|
|
"loss": 4.4439,
|
|
"mean_token_accuracy": 0.2707783177495003,
|
|
"num_tokens": 89248841.0,
|
|
"step": 38935
|
|
},
|
|
{
|
|
"entropy": 4.992293739318848,
|
|
"epoch": 3.7406340057636887,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00036564214986350844,
|
|
"loss": 4.5494,
|
|
"mean_token_accuracy": 0.2571530595421791,
|
|
"num_tokens": 89260208.0,
|
|
"step": 38940
|
|
},
|
|
{
|
|
"entropy": 4.974653959274292,
|
|
"epoch": 3.7411143131604225,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00036561077029856077,
|
|
"loss": 4.6028,
|
|
"mean_token_accuracy": 0.2642351359128952,
|
|
"num_tokens": 89270844.0,
|
|
"step": 38945
|
|
},
|
|
{
|
|
"entropy": 5.031313848495484,
|
|
"epoch": 3.7415946205571564,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00036557938862990017,
|
|
"loss": 4.6434,
|
|
"mean_token_accuracy": 0.2553083747625351,
|
|
"num_tokens": 89284301.0,
|
|
"step": 38950
|
|
},
|
|
{
|
|
"entropy": 5.050865650177002,
|
|
"epoch": 3.7420749279538903,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.00036554800485825506,
|
|
"loss": 4.5929,
|
|
"mean_token_accuracy": 0.2549210861325264,
|
|
"num_tokens": 89296753.0,
|
|
"step": 38955
|
|
},
|
|
{
|
|
"entropy": 5.093622398376465,
|
|
"epoch": 3.7425552353506246,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00036551661898435414,
|
|
"loss": 4.6739,
|
|
"mean_token_accuracy": 0.2430781751871109,
|
|
"num_tokens": 89308745.0,
|
|
"step": 38960
|
|
},
|
|
{
|
|
"entropy": 5.055368232727051,
|
|
"epoch": 3.743035542747358,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003654852310089262,
|
|
"loss": 4.6245,
|
|
"mean_token_accuracy": 0.2559262916445732,
|
|
"num_tokens": 89320273.0,
|
|
"step": 38965
|
|
},
|
|
{
|
|
"entropy": 5.120514392852783,
|
|
"epoch": 3.7435158501440924,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036545384093269984,
|
|
"loss": 4.7293,
|
|
"mean_token_accuracy": 0.24691716879606246,
|
|
"num_tokens": 89331657.0,
|
|
"step": 38970
|
|
},
|
|
{
|
|
"entropy": 5.01983232498169,
|
|
"epoch": 3.7439961575408263,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00036542244875640385,
|
|
"loss": 4.5535,
|
|
"mean_token_accuracy": 0.2588197708129883,
|
|
"num_tokens": 89342679.0,
|
|
"step": 38975
|
|
},
|
|
{
|
|
"entropy": 5.018749570846557,
|
|
"epoch": 3.74447646493756,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.00036539105448076713,
|
|
"loss": 4.499,
|
|
"mean_token_accuracy": 0.26584111601114274,
|
|
"num_tokens": 89353830.0,
|
|
"step": 38980
|
|
},
|
|
{
|
|
"entropy": 4.952632188796997,
|
|
"epoch": 3.744956772334294,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003653596581065185,
|
|
"loss": 4.4895,
|
|
"mean_token_accuracy": 0.2632727935910225,
|
|
"num_tokens": 89365397.0,
|
|
"step": 38985
|
|
},
|
|
{
|
|
"entropy": 4.962867069244385,
|
|
"epoch": 3.745437079731028,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0003653282596343869,
|
|
"loss": 4.5285,
|
|
"mean_token_accuracy": 0.25664910972118377,
|
|
"num_tokens": 89377255.0,
|
|
"step": 38990
|
|
},
|
|
{
|
|
"entropy": 4.908399629592895,
|
|
"epoch": 3.745917387127762,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003652968590651014,
|
|
"loss": 4.5163,
|
|
"mean_token_accuracy": 0.26733690351247785,
|
|
"num_tokens": 89388603.0,
|
|
"step": 38995
|
|
},
|
|
{
|
|
"entropy": 5.067599964141846,
|
|
"epoch": 3.7463976945244957,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00036526545639939097,
|
|
"loss": 4.5918,
|
|
"mean_token_accuracy": 0.25655138343572614,
|
|
"num_tokens": 89400510.0,
|
|
"step": 39000
|
|
},
|
|
{
|
|
"epoch": 3.7463976945244957,
|
|
"eval_entropy": 4.887235216080764,
|
|
"eval_loss": 4.720763683319092,
|
|
"eval_mean_token_accuracy": 0.2555414439537274,
|
|
"eval_num_tokens": 89400510.0,
|
|
"eval_runtime": 26.5301,
|
|
"eval_samples_per_second": 1236.896,
|
|
"eval_steps_per_second": 154.617,
|
|
"step": 39000
|
|
},
|
|
{
|
|
"entropy": 5.145085716247559,
|
|
"epoch": 3.7468780019212296,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003652340516379846,
|
|
"loss": 4.6864,
|
|
"mean_token_accuracy": 0.23945362269878387,
|
|
"num_tokens": 89411512.0,
|
|
"step": 39005
|
|
},
|
|
{
|
|
"entropy": 5.031566762924195,
|
|
"epoch": 3.7473583093179634,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036520264478161164,
|
|
"loss": 4.5744,
|
|
"mean_token_accuracy": 0.25761850029230116,
|
|
"num_tokens": 89422394.0,
|
|
"step": 39010
|
|
},
|
|
{
|
|
"entropy": 4.985092687606811,
|
|
"epoch": 3.7478386167146973,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0003651712358310009,
|
|
"loss": 4.5502,
|
|
"mean_token_accuracy": 0.2591294884681702,
|
|
"num_tokens": 89433420.0,
|
|
"step": 39015
|
|
},
|
|
{
|
|
"entropy": 4.992144680023193,
|
|
"epoch": 3.748318924111431,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036513982478688197,
|
|
"loss": 4.5629,
|
|
"mean_token_accuracy": 0.2575372144579887,
|
|
"num_tokens": 89444722.0,
|
|
"step": 39020
|
|
},
|
|
{
|
|
"entropy": 5.032183837890625,
|
|
"epoch": 3.748799231508165,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00036510841164998385,
|
|
"loss": 4.5926,
|
|
"mean_token_accuracy": 0.25613582581281663,
|
|
"num_tokens": 89456601.0,
|
|
"step": 39025
|
|
},
|
|
{
|
|
"entropy": 4.990504598617553,
|
|
"epoch": 3.749279538904899,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003650769964210361,
|
|
"loss": 4.5901,
|
|
"mean_token_accuracy": 0.25124999284744265,
|
|
"num_tokens": 89467753.0,
|
|
"step": 39030
|
|
},
|
|
{
|
|
"entropy": 5.092643070220947,
|
|
"epoch": 3.7497598463016333,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003650455791007679,
|
|
"loss": 4.7063,
|
|
"mean_token_accuracy": 0.24392189532518388,
|
|
"num_tokens": 89479303.0,
|
|
"step": 39035
|
|
},
|
|
{
|
|
"entropy": 5.098826742172241,
|
|
"epoch": 3.7502401536983667,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00036501415968990874,
|
|
"loss": 4.6199,
|
|
"mean_token_accuracy": 0.2503222689032555,
|
|
"num_tokens": 89491334.0,
|
|
"step": 39040
|
|
},
|
|
{
|
|
"entropy": 4.954686117172241,
|
|
"epoch": 3.750720461095101,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003649827381891881,
|
|
"loss": 4.4856,
|
|
"mean_token_accuracy": 0.26560203582048414,
|
|
"num_tokens": 89502788.0,
|
|
"step": 39045
|
|
},
|
|
{
|
|
"entropy": 4.9893101215362545,
|
|
"epoch": 3.751200768491835,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003649513145993355,
|
|
"loss": 4.5513,
|
|
"mean_token_accuracy": 0.25450958758592607,
|
|
"num_tokens": 89514479.0,
|
|
"step": 39050
|
|
},
|
|
{
|
|
"entropy": 5.034900665283203,
|
|
"epoch": 3.751681075888569,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003649198889210804,
|
|
"loss": 4.6153,
|
|
"mean_token_accuracy": 0.25059861689805984,
|
|
"num_tokens": 89526534.0,
|
|
"step": 39055
|
|
},
|
|
{
|
|
"entropy": 5.072126579284668,
|
|
"epoch": 3.7521613832853027,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003648884611551525,
|
|
"loss": 4.5943,
|
|
"mean_token_accuracy": 0.2599215433001518,
|
|
"num_tokens": 89537231.0,
|
|
"step": 39060
|
|
},
|
|
{
|
|
"entropy": 5.046180963516235,
|
|
"epoch": 3.7526416906820366,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00036485703130228147,
|
|
"loss": 4.5852,
|
|
"mean_token_accuracy": 0.24791938960552215,
|
|
"num_tokens": 89547374.0,
|
|
"step": 39065
|
|
},
|
|
{
|
|
"entropy": 4.93974199295044,
|
|
"epoch": 3.7531219980787704,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00036482559936319703,
|
|
"loss": 4.4965,
|
|
"mean_token_accuracy": 0.25770975053310397,
|
|
"num_tokens": 89557776.0,
|
|
"step": 39070
|
|
},
|
|
{
|
|
"entropy": 4.995207118988037,
|
|
"epoch": 3.7536023054755043,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00036479416533862885,
|
|
"loss": 4.5905,
|
|
"mean_token_accuracy": 0.2576539173722267,
|
|
"num_tokens": 89569181.0,
|
|
"step": 39075
|
|
},
|
|
{
|
|
"entropy": 5.084425401687622,
|
|
"epoch": 3.754082612872238,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00036476272922930674,
|
|
"loss": 4.5938,
|
|
"mean_token_accuracy": 0.25060887932777404,
|
|
"num_tokens": 89580375.0,
|
|
"step": 39080
|
|
},
|
|
{
|
|
"entropy": 5.148426485061646,
|
|
"epoch": 3.754562920268972,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00036473129103596077,
|
|
"loss": 4.6687,
|
|
"mean_token_accuracy": 0.24825157821178437,
|
|
"num_tokens": 89591787.0,
|
|
"step": 39085
|
|
},
|
|
{
|
|
"entropy": 4.993230676651001,
|
|
"epoch": 3.755043227665706,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003646998507593205,
|
|
"loss": 4.5738,
|
|
"mean_token_accuracy": 0.25399145632982256,
|
|
"num_tokens": 89604389.0,
|
|
"step": 39090
|
|
},
|
|
{
|
|
"entropy": 4.927833843231201,
|
|
"epoch": 3.75552353506244,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003646684084001161,
|
|
"loss": 4.4651,
|
|
"mean_token_accuracy": 0.26698571145534516,
|
|
"num_tokens": 89615900.0,
|
|
"step": 39095
|
|
},
|
|
{
|
|
"entropy": 4.9376842975616455,
|
|
"epoch": 3.7560038424591737,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003646369639590776,
|
|
"loss": 4.5116,
|
|
"mean_token_accuracy": 0.2544685333967209,
|
|
"num_tokens": 89627056.0,
|
|
"step": 39100
|
|
},
|
|
{
|
|
"entropy": 4.979838800430298,
|
|
"epoch": 3.7564841498559076,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003646055174369348,
|
|
"loss": 4.5185,
|
|
"mean_token_accuracy": 0.2630044177174568,
|
|
"num_tokens": 89637194.0,
|
|
"step": 39105
|
|
},
|
|
{
|
|
"entropy": 5.047923469543457,
|
|
"epoch": 3.756964457252642,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003645740688344181,
|
|
"loss": 4.5683,
|
|
"mean_token_accuracy": 0.258801731467247,
|
|
"num_tokens": 89648132.0,
|
|
"step": 39110
|
|
},
|
|
{
|
|
"entropy": 5.030591869354248,
|
|
"epoch": 3.7574447646493754,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003645426181522575,
|
|
"loss": 4.478,
|
|
"mean_token_accuracy": 0.2567203760147095,
|
|
"num_tokens": 89658236.0,
|
|
"step": 39115
|
|
},
|
|
{
|
|
"entropy": 5.0383047580719,
|
|
"epoch": 3.7579250720461097,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00036451116539118304,
|
|
"loss": 4.5726,
|
|
"mean_token_accuracy": 0.25294652581214905,
|
|
"num_tokens": 89669660.0,
|
|
"step": 39120
|
|
},
|
|
{
|
|
"entropy": 4.983749151229858,
|
|
"epoch": 3.7584053794428436,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00036447971055192526,
|
|
"loss": 4.5639,
|
|
"mean_token_accuracy": 0.2561119973659515,
|
|
"num_tokens": 89681667.0,
|
|
"step": 39125
|
|
},
|
|
{
|
|
"entropy": 4.886267280578613,
|
|
"epoch": 3.7588856868395775,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003644482536352142,
|
|
"loss": 4.434,
|
|
"mean_token_accuracy": 0.2698919475078583,
|
|
"num_tokens": 89694924.0,
|
|
"step": 39130
|
|
},
|
|
{
|
|
"entropy": 5.012494707107544,
|
|
"epoch": 3.7593659942363113,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036441679464178044,
|
|
"loss": 4.5796,
|
|
"mean_token_accuracy": 0.2570414930582047,
|
|
"num_tokens": 89705855.0,
|
|
"step": 39135
|
|
},
|
|
{
|
|
"entropy": 5.020628595352173,
|
|
"epoch": 3.7598463016330452,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003643853335723541,
|
|
"loss": 4.5699,
|
|
"mean_token_accuracy": 0.2542823225259781,
|
|
"num_tokens": 89716766.0,
|
|
"step": 39140
|
|
},
|
|
{
|
|
"entropy": 5.03744854927063,
|
|
"epoch": 3.760326609029779,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00036435387042766575,
|
|
"loss": 4.5959,
|
|
"mean_token_accuracy": 0.25213613063097,
|
|
"num_tokens": 89729595.0,
|
|
"step": 39145
|
|
},
|
|
{
|
|
"entropy": 5.021114730834961,
|
|
"epoch": 3.760806916426513,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003643224052084459,
|
|
"loss": 4.5546,
|
|
"mean_token_accuracy": 0.2572567090392113,
|
|
"num_tokens": 89740972.0,
|
|
"step": 39150
|
|
},
|
|
{
|
|
"entropy": 4.991435575485229,
|
|
"epoch": 3.761287223823247,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000364290937915425,
|
|
"loss": 4.5809,
|
|
"mean_token_accuracy": 0.2558510437607765,
|
|
"num_tokens": 89754660.0,
|
|
"step": 39155
|
|
},
|
|
{
|
|
"entropy": 5.127162933349609,
|
|
"epoch": 3.7617675312199808,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036425946854933374,
|
|
"loss": 4.6815,
|
|
"mean_token_accuracy": 0.24492383003234863,
|
|
"num_tokens": 89766298.0,
|
|
"step": 39160
|
|
},
|
|
{
|
|
"entropy": 5.0629524230957035,
|
|
"epoch": 3.7622478386167146,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00036422799711090253,
|
|
"loss": 4.6516,
|
|
"mean_token_accuracy": 0.24696831852197648,
|
|
"num_tokens": 89778133.0,
|
|
"step": 39165
|
|
},
|
|
{
|
|
"entropy": 4.980006122589112,
|
|
"epoch": 3.7627281460134485,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0003641965236008623,
|
|
"loss": 4.595,
|
|
"mean_token_accuracy": 0.25480067282915114,
|
|
"num_tokens": 89789408.0,
|
|
"step": 39170
|
|
},
|
|
{
|
|
"entropy": 5.01439414024353,
|
|
"epoch": 3.7632084534101824,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003641650480199436,
|
|
"loss": 4.5508,
|
|
"mean_token_accuracy": 0.2578478306531906,
|
|
"num_tokens": 89800287.0,
|
|
"step": 39175
|
|
},
|
|
{
|
|
"entropy": 5.0284778594970705,
|
|
"epoch": 3.7636887608069163,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00036413357036887723,
|
|
"loss": 4.5544,
|
|
"mean_token_accuracy": 0.24715973883867265,
|
|
"num_tokens": 89811639.0,
|
|
"step": 39180
|
|
},
|
|
{
|
|
"entropy": 5.034631061553955,
|
|
"epoch": 3.7641690682036506,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000364102090648394,
|
|
"loss": 4.5981,
|
|
"mean_token_accuracy": 0.2508717328310013,
|
|
"num_tokens": 89823075.0,
|
|
"step": 39185
|
|
},
|
|
{
|
|
"entropy": 4.937863492965699,
|
|
"epoch": 3.764649375600384,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003640706088592249,
|
|
"loss": 4.5417,
|
|
"mean_token_accuracy": 0.25667080879211424,
|
|
"num_tokens": 89835268.0,
|
|
"step": 39190
|
|
},
|
|
{
|
|
"entropy": 5.0749729633331295,
|
|
"epoch": 3.7651296829971184,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00036403912500210066,
|
|
"loss": 4.6207,
|
|
"mean_token_accuracy": 0.24908680319786072,
|
|
"num_tokens": 89846983.0,
|
|
"step": 39195
|
|
},
|
|
{
|
|
"entropy": 5.039799213409424,
|
|
"epoch": 3.765609990393852,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003640076390777524,
|
|
"loss": 4.556,
|
|
"mean_token_accuracy": 0.26075578927993776,
|
|
"num_tokens": 89857372.0,
|
|
"step": 39200
|
|
},
|
|
{
|
|
"entropy": 4.991559791564941,
|
|
"epoch": 3.766090297790586,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000363976151086911,
|
|
"loss": 4.5843,
|
|
"mean_token_accuracy": 0.25578423738479616,
|
|
"num_tokens": 89869286.0,
|
|
"step": 39205
|
|
},
|
|
{
|
|
"entropy": 5.03596248626709,
|
|
"epoch": 3.76657060518732,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003639446610303076,
|
|
"loss": 4.6061,
|
|
"mean_token_accuracy": 0.25037665814161303,
|
|
"num_tokens": 89879646.0,
|
|
"step": 39210
|
|
},
|
|
{
|
|
"entropy": 5.034532833099365,
|
|
"epoch": 3.767050912584054,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00036391316890867334,
|
|
"loss": 4.5562,
|
|
"mean_token_accuracy": 0.26272677779197695,
|
|
"num_tokens": 89891870.0,
|
|
"step": 39215
|
|
},
|
|
{
|
|
"entropy": 5.068647384643555,
|
|
"epoch": 3.7675312199807878,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00036388167472273925,
|
|
"loss": 4.5935,
|
|
"mean_token_accuracy": 0.2502538874745369,
|
|
"num_tokens": 89903383.0,
|
|
"step": 39220
|
|
},
|
|
{
|
|
"entropy": 5.1582865715026855,
|
|
"epoch": 3.7680115273775217,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003638501784732366,
|
|
"loss": 4.7936,
|
|
"mean_token_accuracy": 0.24288628548383712,
|
|
"num_tokens": 89913707.0,
|
|
"step": 39225
|
|
},
|
|
{
|
|
"entropy": 4.9772735118865965,
|
|
"epoch": 3.7684918347742555,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003638186801608966,
|
|
"loss": 4.5226,
|
|
"mean_token_accuracy": 0.26570557355880736,
|
|
"num_tokens": 89924983.0,
|
|
"step": 39230
|
|
},
|
|
{
|
|
"entropy": 4.960193109512329,
|
|
"epoch": 3.7689721421709894,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00036378717978645067,
|
|
"loss": 4.5892,
|
|
"mean_token_accuracy": 0.2522843316197395,
|
|
"num_tokens": 89936404.0,
|
|
"step": 39235
|
|
},
|
|
{
|
|
"entropy": 5.126767921447754,
|
|
"epoch": 3.7694524495677233,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036375567735063,
|
|
"loss": 4.6783,
|
|
"mean_token_accuracy": 0.2502130940556526,
|
|
"num_tokens": 89947916.0,
|
|
"step": 39240
|
|
},
|
|
{
|
|
"entropy": 5.087224864959717,
|
|
"epoch": 3.769932756964457,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003637241728541661,
|
|
"loss": 4.6466,
|
|
"mean_token_accuracy": 0.25392941385507584,
|
|
"num_tokens": 89959436.0,
|
|
"step": 39245
|
|
},
|
|
{
|
|
"entropy": 5.059451627731323,
|
|
"epoch": 3.770413064361191,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00036369266629779037,
|
|
"loss": 4.5926,
|
|
"mean_token_accuracy": 0.2531299561262131,
|
|
"num_tokens": 89969977.0,
|
|
"step": 39250
|
|
},
|
|
{
|
|
"entropy": 5.009925365447998,
|
|
"epoch": 3.770893371757925,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003636611576822342,
|
|
"loss": 4.6225,
|
|
"mean_token_accuracy": 0.24965404868125915,
|
|
"num_tokens": 89981189.0,
|
|
"step": 39255
|
|
},
|
|
{
|
|
"entropy": 4.984711647033691,
|
|
"epoch": 3.771373679154659,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0003636296470082293,
|
|
"loss": 4.635,
|
|
"mean_token_accuracy": 0.2500599637627602,
|
|
"num_tokens": 89994120.0,
|
|
"step": 39260
|
|
},
|
|
{
|
|
"entropy": 4.934603261947632,
|
|
"epoch": 3.7718539865513927,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00036359813427650726,
|
|
"loss": 4.4692,
|
|
"mean_token_accuracy": 0.2654679000377655,
|
|
"num_tokens": 90005953.0,
|
|
"step": 39265
|
|
},
|
|
{
|
|
"entropy": 4.992241477966308,
|
|
"epoch": 3.772334293948127,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00036356661948779945,
|
|
"loss": 4.5095,
|
|
"mean_token_accuracy": 0.25849722027778627,
|
|
"num_tokens": 90017875.0,
|
|
"step": 39270
|
|
},
|
|
{
|
|
"entropy": 5.04215440750122,
|
|
"epoch": 3.7728146013448605,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00036353510264283776,
|
|
"loss": 4.5733,
|
|
"mean_token_accuracy": 0.2559712499380112,
|
|
"num_tokens": 90028096.0,
|
|
"step": 39275
|
|
},
|
|
{
|
|
"entropy": 5.081985092163086,
|
|
"epoch": 3.773294908741595,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003635035837423539,
|
|
"loss": 4.6098,
|
|
"mean_token_accuracy": 0.25106541663408277,
|
|
"num_tokens": 90038834.0,
|
|
"step": 39280
|
|
},
|
|
{
|
|
"entropy": 5.069351434707642,
|
|
"epoch": 3.7737752161383287,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003634720627870797,
|
|
"loss": 4.6427,
|
|
"mean_token_accuracy": 0.2484325125813484,
|
|
"num_tokens": 90051429.0,
|
|
"step": 39285
|
|
},
|
|
{
|
|
"entropy": 5.022020959854126,
|
|
"epoch": 3.7742555235350626,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00036344053977774684,
|
|
"loss": 4.5393,
|
|
"mean_token_accuracy": 0.2589443206787109,
|
|
"num_tokens": 90062269.0,
|
|
"step": 39290
|
|
},
|
|
{
|
|
"entropy": 4.946408605575561,
|
|
"epoch": 3.7747358309317964,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003634090147150873,
|
|
"loss": 4.5332,
|
|
"mean_token_accuracy": 0.26142884492874147,
|
|
"num_tokens": 90073573.0,
|
|
"step": 39295
|
|
},
|
|
{
|
|
"entropy": 4.9522498607635494,
|
|
"epoch": 3.7752161383285303,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003633774875998329,
|
|
"loss": 4.4677,
|
|
"mean_token_accuracy": 0.2643534392118454,
|
|
"num_tokens": 90083543.0,
|
|
"step": 39300
|
|
},
|
|
{
|
|
"entropy": 5.004096412658692,
|
|
"epoch": 3.775696445725264,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003633459584327157,
|
|
"loss": 4.6166,
|
|
"mean_token_accuracy": 0.2502096191048622,
|
|
"num_tokens": 90095108.0,
|
|
"step": 39305
|
|
},
|
|
{
|
|
"entropy": 5.045248889923096,
|
|
"epoch": 3.776176753121998,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.00036331442721446763,
|
|
"loss": 4.692,
|
|
"mean_token_accuracy": 0.2475297197699547,
|
|
"num_tokens": 90107272.0,
|
|
"step": 39310
|
|
},
|
|
{
|
|
"entropy": 5.103474950790405,
|
|
"epoch": 3.776657060518732,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003632828939458209,
|
|
"loss": 4.6391,
|
|
"mean_token_accuracy": 0.24766982793807985,
|
|
"num_tokens": 90118167.0,
|
|
"step": 39315
|
|
},
|
|
{
|
|
"entropy": 5.080233383178711,
|
|
"epoch": 3.777137367915466,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036325135862750745,
|
|
"loss": 4.5928,
|
|
"mean_token_accuracy": 0.24868194460868837,
|
|
"num_tokens": 90129790.0,
|
|
"step": 39320
|
|
},
|
|
{
|
|
"entropy": 4.98678183555603,
|
|
"epoch": 3.7776176753121997,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00036321982126025947,
|
|
"loss": 4.5915,
|
|
"mean_token_accuracy": 0.25665484815835954,
|
|
"num_tokens": 90141367.0,
|
|
"step": 39325
|
|
},
|
|
{
|
|
"entropy": 4.996863460540771,
|
|
"epoch": 3.7780979827089336,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003631882818448093,
|
|
"loss": 4.5679,
|
|
"mean_token_accuracy": 0.25669820606708527,
|
|
"num_tokens": 90151755.0,
|
|
"step": 39330
|
|
},
|
|
{
|
|
"entropy": 5.002778577804565,
|
|
"epoch": 3.7785782901056675,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00036315674038188905,
|
|
"loss": 4.5777,
|
|
"mean_token_accuracy": 0.2563686802983284,
|
|
"num_tokens": 90163262.0,
|
|
"step": 39335
|
|
},
|
|
{
|
|
"entropy": 5.20625638961792,
|
|
"epoch": 3.7790585975024014,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000363125196872231,
|
|
"loss": 4.7363,
|
|
"mean_token_accuracy": 0.2398887410759926,
|
|
"num_tokens": 90174704.0,
|
|
"step": 39340
|
|
},
|
|
{
|
|
"entropy": 4.994236612319947,
|
|
"epoch": 3.7795389048991357,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003630936513165676,
|
|
"loss": 4.5028,
|
|
"mean_token_accuracy": 0.2601668983697891,
|
|
"num_tokens": 90186885.0,
|
|
"step": 39345
|
|
},
|
|
{
|
|
"entropy": 5.034438848495483,
|
|
"epoch": 3.780019212295869,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003630621037156312,
|
|
"loss": 4.5619,
|
|
"mean_token_accuracy": 0.2593349412083626,
|
|
"num_tokens": 90198213.0,
|
|
"step": 39350
|
|
},
|
|
{
|
|
"entropy": 5.071244764328003,
|
|
"epoch": 3.7804995196926034,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036303055407015437,
|
|
"loss": 4.6379,
|
|
"mean_token_accuracy": 0.2507008373737335,
|
|
"num_tokens": 90209623.0,
|
|
"step": 39355
|
|
},
|
|
{
|
|
"entropy": 5.053586721420288,
|
|
"epoch": 3.7809798270893373,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00036299900238086924,
|
|
"loss": 4.5907,
|
|
"mean_token_accuracy": 0.2519779786467552,
|
|
"num_tokens": 90221808.0,
|
|
"step": 39360
|
|
},
|
|
{
|
|
"entropy": 5.052425050735474,
|
|
"epoch": 3.781460134486071,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003629674486485087,
|
|
"loss": 4.5956,
|
|
"mean_token_accuracy": 0.2494976133108139,
|
|
"num_tokens": 90232894.0,
|
|
"step": 39365
|
|
},
|
|
{
|
|
"entropy": 4.969105005264282,
|
|
"epoch": 3.781940441882805,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003629358928738052,
|
|
"loss": 4.5136,
|
|
"mean_token_accuracy": 0.2590539261698723,
|
|
"num_tokens": 90243240.0,
|
|
"step": 39370
|
|
},
|
|
{
|
|
"entropy": 4.943216323852539,
|
|
"epoch": 3.782420749279539,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00036290433505749135,
|
|
"loss": 4.5002,
|
|
"mean_token_accuracy": 0.2636594161391258,
|
|
"num_tokens": 90255082.0,
|
|
"step": 39375
|
|
},
|
|
{
|
|
"entropy": 5.095638132095337,
|
|
"epoch": 3.782901056676273,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0003628727752002999,
|
|
"loss": 4.6525,
|
|
"mean_token_accuracy": 0.2462255135178566,
|
|
"num_tokens": 90266521.0,
|
|
"step": 39380
|
|
},
|
|
{
|
|
"entropy": 5.0100456237792965,
|
|
"epoch": 3.7833813640730067,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003628412133029636,
|
|
"loss": 4.5007,
|
|
"mean_token_accuracy": 0.260246704518795,
|
|
"num_tokens": 90277145.0,
|
|
"step": 39385
|
|
},
|
|
{
|
|
"entropy": 4.915523719787598,
|
|
"epoch": 3.7838616714697406,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003628096493662151,
|
|
"loss": 4.4792,
|
|
"mean_token_accuracy": 0.2607880368828773,
|
|
"num_tokens": 90289290.0,
|
|
"step": 39390
|
|
},
|
|
{
|
|
"entropy": 5.034378290176392,
|
|
"epoch": 3.7843419788664745,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003627780833907873,
|
|
"loss": 4.5778,
|
|
"mean_token_accuracy": 0.25366559624671936,
|
|
"num_tokens": 90299863.0,
|
|
"step": 39395
|
|
},
|
|
{
|
|
"entropy": 4.980896091461181,
|
|
"epoch": 3.7848222862632084,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00036274651537741305,
|
|
"loss": 4.6181,
|
|
"mean_token_accuracy": 0.25568258464336396,
|
|
"num_tokens": 90310905.0,
|
|
"step": 39400
|
|
},
|
|
{
|
|
"entropy": 5.008711957931519,
|
|
"epoch": 3.7853025936599423,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003627149453268252,
|
|
"loss": 4.6697,
|
|
"mean_token_accuracy": 0.2550575345754623,
|
|
"num_tokens": 90322971.0,
|
|
"step": 39405
|
|
},
|
|
{
|
|
"entropy": 5.107990884780884,
|
|
"epoch": 3.785782901056676,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00036268337323975685,
|
|
"loss": 4.584,
|
|
"mean_token_accuracy": 0.25555782318115233,
|
|
"num_tokens": 90333782.0,
|
|
"step": 39410
|
|
},
|
|
{
|
|
"entropy": 5.174541711807251,
|
|
"epoch": 3.78626320845341,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00036265179911694094,
|
|
"loss": 4.7058,
|
|
"mean_token_accuracy": 0.2430574879050255,
|
|
"num_tokens": 90343695.0,
|
|
"step": 39415
|
|
},
|
|
{
|
|
"entropy": 4.955031538009644,
|
|
"epoch": 3.7867435158501443,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003626202229591105,
|
|
"loss": 4.543,
|
|
"mean_token_accuracy": 0.2560980603098869,
|
|
"num_tokens": 90354932.0,
|
|
"step": 39420
|
|
},
|
|
{
|
|
"entropy": 4.992726898193359,
|
|
"epoch": 3.787223823246878,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00036258864476699864,
|
|
"loss": 4.5787,
|
|
"mean_token_accuracy": 0.2558345079421997,
|
|
"num_tokens": 90367449.0,
|
|
"step": 39425
|
|
},
|
|
{
|
|
"entropy": 4.955009126663208,
|
|
"epoch": 3.787704130643612,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003625570645413386,
|
|
"loss": 4.4808,
|
|
"mean_token_accuracy": 0.26710075736045835,
|
|
"num_tokens": 90378915.0,
|
|
"step": 39430
|
|
},
|
|
{
|
|
"entropy": 5.030855941772461,
|
|
"epoch": 3.7881844380403455,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003625254822828635,
|
|
"loss": 4.5494,
|
|
"mean_token_accuracy": 0.2552025139331818,
|
|
"num_tokens": 90391868.0,
|
|
"step": 39435
|
|
},
|
|
{
|
|
"entropy": 5.073581838607788,
|
|
"epoch": 3.78866474543708,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003624938979923065,
|
|
"loss": 4.6153,
|
|
"mean_token_accuracy": 0.24622104167938233,
|
|
"num_tokens": 90402520.0,
|
|
"step": 39440
|
|
},
|
|
{
|
|
"entropy": 5.117380571365357,
|
|
"epoch": 3.7891450528338138,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00036246231167040105,
|
|
"loss": 4.6764,
|
|
"mean_token_accuracy": 0.2491818532347679,
|
|
"num_tokens": 90413425.0,
|
|
"step": 39445
|
|
},
|
|
{
|
|
"entropy": 5.132263422012329,
|
|
"epoch": 3.7896253602305476,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003624307233178805,
|
|
"loss": 4.7262,
|
|
"mean_token_accuracy": 0.24619118869304657,
|
|
"num_tokens": 90424976.0,
|
|
"step": 39450
|
|
},
|
|
{
|
|
"entropy": 4.998862361907959,
|
|
"epoch": 3.7901056676272815,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00036239913293547807,
|
|
"loss": 4.5178,
|
|
"mean_token_accuracy": 0.26014573127031326,
|
|
"num_tokens": 90436177.0,
|
|
"step": 39455
|
|
},
|
|
{
|
|
"entropy": 5.050534057617187,
|
|
"epoch": 3.7905859750240154,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.00036236754052392737,
|
|
"loss": 4.6422,
|
|
"mean_token_accuracy": 0.257307243347168,
|
|
"num_tokens": 90448066.0,
|
|
"step": 39460
|
|
},
|
|
{
|
|
"entropy": 4.996550464630127,
|
|
"epoch": 3.7910662824207493,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003623359460839618,
|
|
"loss": 4.5346,
|
|
"mean_token_accuracy": 0.25996713489294054,
|
|
"num_tokens": 90457913.0,
|
|
"step": 39465
|
|
},
|
|
{
|
|
"entropy": 4.924204921722412,
|
|
"epoch": 3.791546589817483,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003623043496163149,
|
|
"loss": 4.521,
|
|
"mean_token_accuracy": 0.2647727459669113,
|
|
"num_tokens": 90469637.0,
|
|
"step": 39470
|
|
},
|
|
{
|
|
"entropy": 4.971824073791504,
|
|
"epoch": 3.792026897214217,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003622727511217202,
|
|
"loss": 4.5716,
|
|
"mean_token_accuracy": 0.255060039460659,
|
|
"num_tokens": 90481222.0,
|
|
"step": 39475
|
|
},
|
|
{
|
|
"entropy": 5.013655376434326,
|
|
"epoch": 3.792507204610951,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003622411506009113,
|
|
"loss": 4.5214,
|
|
"mean_token_accuracy": 0.25374408811330795,
|
|
"num_tokens": 90492911.0,
|
|
"step": 39480
|
|
},
|
|
{
|
|
"entropy": 5.014966058731079,
|
|
"epoch": 3.792987512007685,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003622095480546221,
|
|
"loss": 4.6017,
|
|
"mean_token_accuracy": 0.2502501279115677,
|
|
"num_tokens": 90504583.0,
|
|
"step": 39485
|
|
},
|
|
{
|
|
"entropy": 5.033026933670044,
|
|
"epoch": 3.7934678194044187,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00036217794348358606,
|
|
"loss": 4.6094,
|
|
"mean_token_accuracy": 0.2523016035556793,
|
|
"num_tokens": 90516006.0,
|
|
"step": 39490
|
|
},
|
|
{
|
|
"entropy": 4.910913181304932,
|
|
"epoch": 3.793948126801153,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000362146336888537,
|
|
"loss": 4.4595,
|
|
"mean_token_accuracy": 0.2613148346543312,
|
|
"num_tokens": 90527362.0,
|
|
"step": 39495
|
|
},
|
|
{
|
|
"entropy": 4.979953575134277,
|
|
"epoch": 3.7944284341978864,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00036211472827020886,
|
|
"loss": 4.553,
|
|
"mean_token_accuracy": 0.2569102063775063,
|
|
"num_tokens": 90538635.0,
|
|
"step": 39500
|
|
},
|
|
{
|
|
"entropy": 5.054415941238403,
|
|
"epoch": 3.7949087415946208,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003620831176293354,
|
|
"loss": 4.551,
|
|
"mean_token_accuracy": 0.2577499940991402,
|
|
"num_tokens": 90549996.0,
|
|
"step": 39505
|
|
},
|
|
{
|
|
"entropy": 5.095646524429322,
|
|
"epoch": 3.795389048991354,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00036205150496665053,
|
|
"loss": 4.6728,
|
|
"mean_token_accuracy": 0.25196476876735685,
|
|
"num_tokens": 90563360.0,
|
|
"step": 39510
|
|
},
|
|
{
|
|
"entropy": 5.063615894317627,
|
|
"epoch": 3.7958693563880885,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00036201989028288815,
|
|
"loss": 4.6696,
|
|
"mean_token_accuracy": 0.24965567588806153,
|
|
"num_tokens": 90574617.0,
|
|
"step": 39515
|
|
},
|
|
{
|
|
"entropy": 5.0583251953125,
|
|
"epoch": 3.7963496637848224,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00036198827357878234,
|
|
"loss": 4.6588,
|
|
"mean_token_accuracy": 0.24757576435804368,
|
|
"num_tokens": 90586209.0,
|
|
"step": 39520
|
|
},
|
|
{
|
|
"entropy": 5.101221704483033,
|
|
"epoch": 3.7968299711815563,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003619566548550671,
|
|
"loss": 4.6632,
|
|
"mean_token_accuracy": 0.24474813938140869,
|
|
"num_tokens": 90598409.0,
|
|
"step": 39525
|
|
},
|
|
{
|
|
"entropy": 5.054576683044433,
|
|
"epoch": 3.79731027857829,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003619250341124765,
|
|
"loss": 4.5616,
|
|
"mean_token_accuracy": 0.2580353066325188,
|
|
"num_tokens": 90610073.0,
|
|
"step": 39530
|
|
},
|
|
{
|
|
"entropy": 5.062566089630127,
|
|
"epoch": 3.797790585975024,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003618934113517448,
|
|
"loss": 4.6382,
|
|
"mean_token_accuracy": 0.2557277113199234,
|
|
"num_tokens": 90621782.0,
|
|
"step": 39535
|
|
},
|
|
{
|
|
"entropy": 5.002816152572632,
|
|
"epoch": 3.798270893371758,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003618617865736061,
|
|
"loss": 4.5877,
|
|
"mean_token_accuracy": 0.2524082362651825,
|
|
"num_tokens": 90633771.0,
|
|
"step": 39540
|
|
},
|
|
{
|
|
"entropy": 5.1221428394317625,
|
|
"epoch": 3.798751200768492,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036183015977879463,
|
|
"loss": 4.6935,
|
|
"mean_token_accuracy": 0.2457364931702614,
|
|
"num_tokens": 90644965.0,
|
|
"step": 39545
|
|
},
|
|
{
|
|
"entropy": 5.021115016937256,
|
|
"epoch": 3.7992315081652257,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00036179853096804474,
|
|
"loss": 4.5659,
|
|
"mean_token_accuracy": 0.25460646897554395,
|
|
"num_tokens": 90656691.0,
|
|
"step": 39550
|
|
},
|
|
{
|
|
"entropy": 5.071902656555176,
|
|
"epoch": 3.7997118155619596,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.00036176690014209055,
|
|
"loss": 4.621,
|
|
"mean_token_accuracy": 0.2554046303033829,
|
|
"num_tokens": 90667760.0,
|
|
"step": 39555
|
|
},
|
|
{
|
|
"entropy": 4.982256507873535,
|
|
"epoch": 3.8001921229586935,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003617352673016667,
|
|
"loss": 4.5968,
|
|
"mean_token_accuracy": 0.2507640853524208,
|
|
"num_tokens": 90679640.0,
|
|
"step": 39560
|
|
},
|
|
{
|
|
"entropy": 4.987677145004272,
|
|
"epoch": 3.8006724303554273,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00036170363244750756,
|
|
"loss": 4.5141,
|
|
"mean_token_accuracy": 0.2619085446000099,
|
|
"num_tokens": 90693187.0,
|
|
"step": 39565
|
|
},
|
|
{
|
|
"entropy": 5.10977520942688,
|
|
"epoch": 3.8011527377521612,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003616719955803474,
|
|
"loss": 4.7122,
|
|
"mean_token_accuracy": 0.24616941064596176,
|
|
"num_tokens": 90704540.0,
|
|
"step": 39570
|
|
},
|
|
{
|
|
"entropy": 5.015437078475952,
|
|
"epoch": 3.801633045148895,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003616403567009209,
|
|
"loss": 4.5706,
|
|
"mean_token_accuracy": 0.2623277544975281,
|
|
"num_tokens": 90716577.0,
|
|
"step": 39575
|
|
},
|
|
{
|
|
"entropy": 4.933183193206787,
|
|
"epoch": 3.8021133525456294,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003616087158099626,
|
|
"loss": 4.4455,
|
|
"mean_token_accuracy": 0.26722894608974457,
|
|
"num_tokens": 90727134.0,
|
|
"step": 39580
|
|
},
|
|
{
|
|
"entropy": 4.982299375534057,
|
|
"epoch": 3.802593659942363,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003615770729082071,
|
|
"loss": 4.5798,
|
|
"mean_token_accuracy": 0.2616988345980644,
|
|
"num_tokens": 90738844.0,
|
|
"step": 39585
|
|
},
|
|
{
|
|
"entropy": 4.963323497772217,
|
|
"epoch": 3.803073967339097,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000361545427996389,
|
|
"loss": 4.5413,
|
|
"mean_token_accuracy": 0.2523704722523689,
|
|
"num_tokens": 90749882.0,
|
|
"step": 39590
|
|
},
|
|
{
|
|
"entropy": 5.061871767044067,
|
|
"epoch": 3.803554274735831,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003615137810752431,
|
|
"loss": 4.5991,
|
|
"mean_token_accuracy": 0.25662742406129835,
|
|
"num_tokens": 90762244.0,
|
|
"step": 39595
|
|
},
|
|
{
|
|
"entropy": 5.111899328231812,
|
|
"epoch": 3.804034582132565,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036148213214550406,
|
|
"loss": 4.6747,
|
|
"mean_token_accuracy": 0.2617145240306854,
|
|
"num_tokens": 90774452.0,
|
|
"step": 39600
|
|
},
|
|
{
|
|
"entropy": 5.012771129608154,
|
|
"epoch": 3.804514889529299,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003614504812079068,
|
|
"loss": 4.5747,
|
|
"mean_token_accuracy": 0.25602411925792695,
|
|
"num_tokens": 90785506.0,
|
|
"step": 39605
|
|
},
|
|
{
|
|
"entropy": 5.028975009918213,
|
|
"epoch": 3.8049951969260327,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00036141882826318597,
|
|
"loss": 4.5005,
|
|
"mean_token_accuracy": 0.265478378534317,
|
|
"num_tokens": 90796606.0,
|
|
"step": 39610
|
|
},
|
|
{
|
|
"entropy": 5.02337327003479,
|
|
"epoch": 3.8054755043227666,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003613871733120766,
|
|
"loss": 4.6059,
|
|
"mean_token_accuracy": 0.2593324020504951,
|
|
"num_tokens": 90807949.0,
|
|
"step": 39615
|
|
},
|
|
{
|
|
"entropy": 5.042545366287231,
|
|
"epoch": 3.8059558117195005,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036135551635531354,
|
|
"loss": 4.6558,
|
|
"mean_token_accuracy": 0.2513423725962639,
|
|
"num_tokens": 90819206.0,
|
|
"step": 39620
|
|
},
|
|
{
|
|
"entropy": 5.145686960220337,
|
|
"epoch": 3.8064361191162344,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003613238573936319,
|
|
"loss": 4.731,
|
|
"mean_token_accuracy": 0.23988629579544068,
|
|
"num_tokens": 90830335.0,
|
|
"step": 39625
|
|
},
|
|
{
|
|
"entropy": 5.078374290466309,
|
|
"epoch": 3.8069164265129682,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003612921964277665,
|
|
"loss": 4.5438,
|
|
"mean_token_accuracy": 0.26632998138666153,
|
|
"num_tokens": 90841236.0,
|
|
"step": 39630
|
|
},
|
|
{
|
|
"entropy": 5.023057222366333,
|
|
"epoch": 3.807396733909702,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003612605334584526,
|
|
"loss": 4.5827,
|
|
"mean_token_accuracy": 0.2523267477750778,
|
|
"num_tokens": 90852772.0,
|
|
"step": 39635
|
|
},
|
|
{
|
|
"entropy": 4.954697227478027,
|
|
"epoch": 3.807877041306436,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00036122886848642536,
|
|
"loss": 4.4701,
|
|
"mean_token_accuracy": 0.26197142750024793,
|
|
"num_tokens": 90864692.0,
|
|
"step": 39640
|
|
},
|
|
{
|
|
"entropy": 5.002537107467651,
|
|
"epoch": 3.80835734870317,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003611972015124197,
|
|
"loss": 4.611,
|
|
"mean_token_accuracy": 0.2544981762766838,
|
|
"num_tokens": 90875398.0,
|
|
"step": 39645
|
|
},
|
|
{
|
|
"entropy": 4.941596794128418,
|
|
"epoch": 3.8088376560999038,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000361165532537171,
|
|
"loss": 4.4392,
|
|
"mean_token_accuracy": 0.2668768286705017,
|
|
"num_tokens": 90886997.0,
|
|
"step": 39650
|
|
},
|
|
{
|
|
"entropy": 5.101465463638306,
|
|
"epoch": 3.809317963496638,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003611338615614145,
|
|
"loss": 4.6308,
|
|
"mean_token_accuracy": 0.24852928817272185,
|
|
"num_tokens": 90898674.0,
|
|
"step": 39655
|
|
},
|
|
{
|
|
"entropy": 5.046799516677856,
|
|
"epoch": 3.8097982708933715,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00036110218858588554,
|
|
"loss": 4.6183,
|
|
"mean_token_accuracy": 0.24932535290718078,
|
|
"num_tokens": 90910165.0,
|
|
"step": 39660
|
|
},
|
|
{
|
|
"entropy": 5.039994478225708,
|
|
"epoch": 3.810278578290106,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00036107051361131934,
|
|
"loss": 4.5907,
|
|
"mean_token_accuracy": 0.2532161742448807,
|
|
"num_tokens": 90922380.0,
|
|
"step": 39665
|
|
},
|
|
{
|
|
"entropy": 5.0779726028442385,
|
|
"epoch": 3.8107588856868397,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00036103883663845155,
|
|
"loss": 4.5642,
|
|
"mean_token_accuracy": 0.25901419669389725,
|
|
"num_tokens": 90932913.0,
|
|
"step": 39670
|
|
},
|
|
{
|
|
"entropy": 4.975921964645385,
|
|
"epoch": 3.8112391930835736,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003610071576680173,
|
|
"loss": 4.603,
|
|
"mean_token_accuracy": 0.2501691058278084,
|
|
"num_tokens": 90945247.0,
|
|
"step": 39675
|
|
},
|
|
{
|
|
"entropy": 5.065691518783569,
|
|
"epoch": 3.8117195004803075,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00036097547670075226,
|
|
"loss": 4.6603,
|
|
"mean_token_accuracy": 0.24857104420661927,
|
|
"num_tokens": 90956064.0,
|
|
"step": 39680
|
|
},
|
|
{
|
|
"entropy": 5.116585397720337,
|
|
"epoch": 3.8121998078770414,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000360943793737392,
|
|
"loss": 4.7437,
|
|
"mean_token_accuracy": 0.23892671167850493,
|
|
"num_tokens": 90967636.0,
|
|
"step": 39685
|
|
},
|
|
{
|
|
"entropy": 5.057141256332398,
|
|
"epoch": 3.8126801152737753,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000360912108778672,
|
|
"loss": 4.6286,
|
|
"mean_token_accuracy": 0.2569655075669289,
|
|
"num_tokens": 90979128.0,
|
|
"step": 39690
|
|
},
|
|
{
|
|
"entropy": 5.044049215316773,
|
|
"epoch": 3.813160422670509,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000360880421825328,
|
|
"loss": 4.5895,
|
|
"mean_token_accuracy": 0.2475995808839798,
|
|
"num_tokens": 90990743.0,
|
|
"step": 39695
|
|
},
|
|
{
|
|
"entropy": 5.0520124435424805,
|
|
"epoch": 3.813640730067243,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003608487328780956,
|
|
"loss": 4.5675,
|
|
"mean_token_accuracy": 0.25309642404317856,
|
|
"num_tokens": 91001160.0,
|
|
"step": 39700
|
|
},
|
|
{
|
|
"entropy": 5.088762426376343,
|
|
"epoch": 3.814121037463977,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00036081704193771056,
|
|
"loss": 4.594,
|
|
"mean_token_accuracy": 0.2605500012636185,
|
|
"num_tokens": 91012125.0,
|
|
"step": 39705
|
|
},
|
|
{
|
|
"entropy": 5.006967353820801,
|
|
"epoch": 3.814601344860711,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003607853490049085,
|
|
"loss": 4.6003,
|
|
"mean_token_accuracy": 0.2604366824030876,
|
|
"num_tokens": 91023557.0,
|
|
"step": 39710
|
|
},
|
|
{
|
|
"entropy": 5.022932386398315,
|
|
"epoch": 3.8150816522574447,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003607536540804255,
|
|
"loss": 4.5747,
|
|
"mean_token_accuracy": 0.251143653690815,
|
|
"num_tokens": 91035274.0,
|
|
"step": 39715
|
|
},
|
|
{
|
|
"entropy": 5.1181622505187985,
|
|
"epoch": 3.8155619596541785,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00036072195716499726,
|
|
"loss": 4.5815,
|
|
"mean_token_accuracy": 0.25730409026145934,
|
|
"num_tokens": 91047516.0,
|
|
"step": 39720
|
|
},
|
|
{
|
|
"entropy": 4.8909070014953615,
|
|
"epoch": 3.8160422670509124,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00036069025825935974,
|
|
"loss": 4.4337,
|
|
"mean_token_accuracy": 0.2649819657206535,
|
|
"num_tokens": 91058529.0,
|
|
"step": 39725
|
|
},
|
|
{
|
|
"entropy": 4.995941638946533,
|
|
"epoch": 3.8165225744476468,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00036065855736424886,
|
|
"loss": 4.5734,
|
|
"mean_token_accuracy": 0.25700714290142057,
|
|
"num_tokens": 91071277.0,
|
|
"step": 39730
|
|
},
|
|
{
|
|
"entropy": 4.967751836776733,
|
|
"epoch": 3.81700288184438,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003606268544804006,
|
|
"loss": 4.5241,
|
|
"mean_token_accuracy": 0.26335603296756743,
|
|
"num_tokens": 91083157.0,
|
|
"step": 39735
|
|
},
|
|
{
|
|
"entropy": 5.102995204925537,
|
|
"epoch": 3.8174831892411145,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00036059514960855104,
|
|
"loss": 4.6588,
|
|
"mean_token_accuracy": 0.24670448750257493,
|
|
"num_tokens": 91095945.0,
|
|
"step": 39740
|
|
},
|
|
{
|
|
"entropy": 5.0695556640625,
|
|
"epoch": 3.817963496637848,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003605634427494363,
|
|
"loss": 4.6106,
|
|
"mean_token_accuracy": 0.2501012355089188,
|
|
"num_tokens": 91107210.0,
|
|
"step": 39745
|
|
},
|
|
{
|
|
"entropy": 5.015468263626099,
|
|
"epoch": 3.8184438040345823,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003605317339037925,
|
|
"loss": 4.6041,
|
|
"mean_token_accuracy": 0.24938620030879974,
|
|
"num_tokens": 91120000.0,
|
|
"step": 39750
|
|
},
|
|
{
|
|
"entropy": 5.009186935424805,
|
|
"epoch": 3.818924111431316,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00036050002307235575,
|
|
"loss": 4.6232,
|
|
"mean_token_accuracy": 0.24943447560071946,
|
|
"num_tokens": 91131794.0,
|
|
"step": 39755
|
|
},
|
|
{
|
|
"entropy": 5.107095527648926,
|
|
"epoch": 3.81940441882805,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003604683102558624,
|
|
"loss": 4.6609,
|
|
"mean_token_accuracy": 0.25223141312599184,
|
|
"num_tokens": 91143644.0,
|
|
"step": 39760
|
|
},
|
|
{
|
|
"entropy": 5.075406074523926,
|
|
"epoch": 3.819884726224784,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00036043659545504866,
|
|
"loss": 4.5672,
|
|
"mean_token_accuracy": 0.2586871713399887,
|
|
"num_tokens": 91153735.0,
|
|
"step": 39765
|
|
},
|
|
{
|
|
"entropy": 4.967553663253784,
|
|
"epoch": 3.820365033621518,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003604048786706509,
|
|
"loss": 4.5437,
|
|
"mean_token_accuracy": 0.2617221474647522,
|
|
"num_tokens": 91165436.0,
|
|
"step": 39770
|
|
},
|
|
{
|
|
"entropy": 5.01377387046814,
|
|
"epoch": 3.8208453410182517,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003603731599034054,
|
|
"loss": 4.641,
|
|
"mean_token_accuracy": 0.2528723806142807,
|
|
"num_tokens": 91177203.0,
|
|
"step": 39775
|
|
},
|
|
{
|
|
"entropy": 4.990478515625,
|
|
"epoch": 3.8213256484149856,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00036034143915404865,
|
|
"loss": 4.551,
|
|
"mean_token_accuracy": 0.2542358264327049,
|
|
"num_tokens": 91190486.0,
|
|
"step": 39780
|
|
},
|
|
{
|
|
"entropy": 5.096856880187988,
|
|
"epoch": 3.8218059558117194,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003603097164233171,
|
|
"loss": 4.6226,
|
|
"mean_token_accuracy": 0.24846416264772414,
|
|
"num_tokens": 91201859.0,
|
|
"step": 39785
|
|
},
|
|
{
|
|
"entropy": 5.105583381652832,
|
|
"epoch": 3.8222862632084533,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003602779917119473,
|
|
"loss": 4.6575,
|
|
"mean_token_accuracy": 0.24960521310567857,
|
|
"num_tokens": 91212743.0,
|
|
"step": 39790
|
|
},
|
|
{
|
|
"entropy": 5.048699474334716,
|
|
"epoch": 3.822766570605187,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00036024626502067565,
|
|
"loss": 4.6001,
|
|
"mean_token_accuracy": 0.25182534754276276,
|
|
"num_tokens": 91223890.0,
|
|
"step": 39795
|
|
},
|
|
{
|
|
"entropy": 4.958105754852295,
|
|
"epoch": 3.823246878001921,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00036021453635023894,
|
|
"loss": 4.5002,
|
|
"mean_token_accuracy": 0.2594543322920799,
|
|
"num_tokens": 91235821.0,
|
|
"step": 39800
|
|
},
|
|
{
|
|
"entropy": 4.999437379837036,
|
|
"epoch": 3.8237271853986554,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00036018280570137374,
|
|
"loss": 4.5903,
|
|
"mean_token_accuracy": 0.25204588323831556,
|
|
"num_tokens": 91246951.0,
|
|
"step": 39805
|
|
},
|
|
{
|
|
"entropy": 4.994590187072754,
|
|
"epoch": 3.824207492795389,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003601510730748167,
|
|
"loss": 4.5154,
|
|
"mean_token_accuracy": 0.26158244013786314,
|
|
"num_tokens": 91257093.0,
|
|
"step": 39810
|
|
},
|
|
{
|
|
"entropy": 5.074928855895996,
|
|
"epoch": 3.824687800192123,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003601193384713046,
|
|
"loss": 4.6508,
|
|
"mean_token_accuracy": 0.25128256529569626,
|
|
"num_tokens": 91269390.0,
|
|
"step": 39815
|
|
},
|
|
{
|
|
"entropy": 5.014584398269653,
|
|
"epoch": 3.8251681075888566,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003600876018915742,
|
|
"loss": 4.5652,
|
|
"mean_token_accuracy": 0.2503513008356094,
|
|
"num_tokens": 91280427.0,
|
|
"step": 39820
|
|
},
|
|
{
|
|
"entropy": 5.07983660697937,
|
|
"epoch": 3.825648414985591,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0003600558633363624,
|
|
"loss": 4.6541,
|
|
"mean_token_accuracy": 0.24744450002908708,
|
|
"num_tokens": 91292415.0,
|
|
"step": 39825
|
|
},
|
|
{
|
|
"entropy": 4.997643756866455,
|
|
"epoch": 3.826128722382325,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003600241228064059,
|
|
"loss": 4.5639,
|
|
"mean_token_accuracy": 0.2610961839556694,
|
|
"num_tokens": 91302750.0,
|
|
"step": 39830
|
|
},
|
|
{
|
|
"entropy": 5.009902858734131,
|
|
"epoch": 3.8266090297790587,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00035999238030244183,
|
|
"loss": 4.5007,
|
|
"mean_token_accuracy": 0.259285406768322,
|
|
"num_tokens": 91313889.0,
|
|
"step": 39835
|
|
},
|
|
{
|
|
"entropy": 5.051788949966431,
|
|
"epoch": 3.8270893371757926,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035996063582520704,
|
|
"loss": 4.614,
|
|
"mean_token_accuracy": 0.25129650384187696,
|
|
"num_tokens": 91324306.0,
|
|
"step": 39840
|
|
},
|
|
{
|
|
"entropy": 5.029631280899048,
|
|
"epoch": 3.8275696445725265,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00035992888937543856,
|
|
"loss": 4.6459,
|
|
"mean_token_accuracy": 0.2577662423253059,
|
|
"num_tokens": 91335317.0,
|
|
"step": 39845
|
|
},
|
|
{
|
|
"entropy": 5.0995752811431885,
|
|
"epoch": 3.8280499519692603,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035989714095387344,
|
|
"loss": 4.652,
|
|
"mean_token_accuracy": 0.25178166925907136,
|
|
"num_tokens": 91346582.0,
|
|
"step": 39850
|
|
},
|
|
{
|
|
"entropy": 4.989064168930054,
|
|
"epoch": 3.8285302593659942,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00035986539056124877,
|
|
"loss": 4.5311,
|
|
"mean_token_accuracy": 0.2612006485462189,
|
|
"num_tokens": 91359135.0,
|
|
"step": 39855
|
|
},
|
|
{
|
|
"entropy": 5.090524101257325,
|
|
"epoch": 3.829010566762728,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003598336381983018,
|
|
"loss": 4.6618,
|
|
"mean_token_accuracy": 0.25515215545892717,
|
|
"num_tokens": 91370309.0,
|
|
"step": 39860
|
|
},
|
|
{
|
|
"entropy": 4.982257747650147,
|
|
"epoch": 3.829490874159462,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035980188386576967,
|
|
"loss": 4.5358,
|
|
"mean_token_accuracy": 0.25170269757509234,
|
|
"num_tokens": 91381322.0,
|
|
"step": 39865
|
|
},
|
|
{
|
|
"entropy": 5.040973091125489,
|
|
"epoch": 3.829971181556196,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003597701275643896,
|
|
"loss": 4.6093,
|
|
"mean_token_accuracy": 0.2571053683757782,
|
|
"num_tokens": 91393466.0,
|
|
"step": 39870
|
|
},
|
|
{
|
|
"entropy": 5.035001945495606,
|
|
"epoch": 3.8304514889529298,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003597383692948988,
|
|
"loss": 4.5348,
|
|
"mean_token_accuracy": 0.26130527555942534,
|
|
"num_tokens": 91403530.0,
|
|
"step": 39875
|
|
},
|
|
{
|
|
"entropy": 5.001945161819458,
|
|
"epoch": 3.8309317963496636,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003597066090580347,
|
|
"loss": 4.5555,
|
|
"mean_token_accuracy": 0.2528704345226288,
|
|
"num_tokens": 91414563.0,
|
|
"step": 39880
|
|
},
|
|
{
|
|
"entropy": 5.04454493522644,
|
|
"epoch": 3.8314121037463975,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003596748468545348,
|
|
"loss": 4.6629,
|
|
"mean_token_accuracy": 0.24869759678840636,
|
|
"num_tokens": 91427125.0,
|
|
"step": 39885
|
|
},
|
|
{
|
|
"entropy": 5.053281641006469,
|
|
"epoch": 3.831892411143132,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00035964308268513623,
|
|
"loss": 4.6097,
|
|
"mean_token_accuracy": 0.25698488503694533,
|
|
"num_tokens": 91438987.0,
|
|
"step": 39890
|
|
},
|
|
{
|
|
"entropy": 4.958277416229248,
|
|
"epoch": 3.8323727185398653,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.00035961131655057674,
|
|
"loss": 4.4903,
|
|
"mean_token_accuracy": 0.26414335817098616,
|
|
"num_tokens": 91450986.0,
|
|
"step": 39895
|
|
},
|
|
{
|
|
"entropy": 4.988449001312256,
|
|
"epoch": 3.8328530259365996,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003595795484515936,
|
|
"loss": 4.5767,
|
|
"mean_token_accuracy": 0.2556659519672394,
|
|
"num_tokens": 91462590.0,
|
|
"step": 39900
|
|
},
|
|
{
|
|
"entropy": 5.060807609558106,
|
|
"epoch": 3.8333333333333335,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00035954777838892454,
|
|
"loss": 4.6169,
|
|
"mean_token_accuracy": 0.25365073084831236,
|
|
"num_tokens": 91472630.0,
|
|
"step": 39905
|
|
},
|
|
{
|
|
"entropy": 5.051064825057983,
|
|
"epoch": 3.8338136407300674,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003595160063633072,
|
|
"loss": 4.6247,
|
|
"mean_token_accuracy": 0.26000601053237915,
|
|
"num_tokens": 91483335.0,
|
|
"step": 39910
|
|
},
|
|
{
|
|
"entropy": 5.082629632949829,
|
|
"epoch": 3.8342939481268012,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003594842323754791,
|
|
"loss": 4.689,
|
|
"mean_token_accuracy": 0.2523458763957024,
|
|
"num_tokens": 91494395.0,
|
|
"step": 39915
|
|
},
|
|
{
|
|
"entropy": 5.03752818107605,
|
|
"epoch": 3.834774255523535,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00035945245642617795,
|
|
"loss": 4.5726,
|
|
"mean_token_accuracy": 0.26089007407426834,
|
|
"num_tokens": 91506412.0,
|
|
"step": 39920
|
|
},
|
|
{
|
|
"entropy": 5.019009208679199,
|
|
"epoch": 3.835254562920269,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035942067851614165,
|
|
"loss": 4.5534,
|
|
"mean_token_accuracy": 0.26383792608976364,
|
|
"num_tokens": 91517624.0,
|
|
"step": 39925
|
|
},
|
|
{
|
|
"entropy": 4.934719610214233,
|
|
"epoch": 3.835734870317003,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00035938889864610776,
|
|
"loss": 4.5594,
|
|
"mean_token_accuracy": 0.2638810306787491,
|
|
"num_tokens": 91529497.0,
|
|
"step": 39930
|
|
},
|
|
{
|
|
"entropy": 5.118938779830932,
|
|
"epoch": 3.8362151777137368,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00035935711681681423,
|
|
"loss": 4.6636,
|
|
"mean_token_accuracy": 0.24932977110147475,
|
|
"num_tokens": 91540454.0,
|
|
"step": 39935
|
|
},
|
|
{
|
|
"entropy": 5.01640830039978,
|
|
"epoch": 3.8366954851104706,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000359325333028999,
|
|
"loss": 4.5132,
|
|
"mean_token_accuracy": 0.26275868266820906,
|
|
"num_tokens": 91551153.0,
|
|
"step": 39940
|
|
},
|
|
{
|
|
"entropy": 5.0182092666625975,
|
|
"epoch": 3.8371757925072045,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00035929354728339986,
|
|
"loss": 4.6261,
|
|
"mean_token_accuracy": 0.25459996312856675,
|
|
"num_tokens": 91562108.0,
|
|
"step": 39945
|
|
},
|
|
{
|
|
"entropy": 5.000110292434693,
|
|
"epoch": 3.8376560999039384,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003592617595807549,
|
|
"loss": 4.4857,
|
|
"mean_token_accuracy": 0.25845351070165634,
|
|
"num_tokens": 91572803.0,
|
|
"step": 39950
|
|
},
|
|
{
|
|
"entropy": 5.074876213073731,
|
|
"epoch": 3.8381364073006723,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00035922996992180213,
|
|
"loss": 4.6174,
|
|
"mean_token_accuracy": 0.25357850939035415,
|
|
"num_tokens": 91584292.0,
|
|
"step": 39955
|
|
},
|
|
{
|
|
"entropy": 5.103971099853515,
|
|
"epoch": 3.838616714697406,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00035919817830727945,
|
|
"loss": 4.654,
|
|
"mean_token_accuracy": 0.2513559848070145,
|
|
"num_tokens": 91596396.0,
|
|
"step": 39960
|
|
},
|
|
{
|
|
"entropy": 5.107650852203369,
|
|
"epoch": 3.8390970220941405,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003591663847379252,
|
|
"loss": 4.6248,
|
|
"mean_token_accuracy": 0.2567043602466583,
|
|
"num_tokens": 91608278.0,
|
|
"step": 39965
|
|
},
|
|
{
|
|
"entropy": 4.987578296661377,
|
|
"epoch": 3.839577329490874,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003591345892144774,
|
|
"loss": 4.547,
|
|
"mean_token_accuracy": 0.26587310433387756,
|
|
"num_tokens": 91619579.0,
|
|
"step": 39970
|
|
},
|
|
{
|
|
"entropy": 4.995828771591187,
|
|
"epoch": 3.8400576368876083,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00035910279173767426,
|
|
"loss": 4.6072,
|
|
"mean_token_accuracy": 0.2611927896738052,
|
|
"num_tokens": 91631586.0,
|
|
"step": 39975
|
|
},
|
|
{
|
|
"entropy": 4.991000556945801,
|
|
"epoch": 3.840537944284342,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000359070992308254,
|
|
"loss": 4.5395,
|
|
"mean_token_accuracy": 0.2630821466445923,
|
|
"num_tokens": 91643164.0,
|
|
"step": 39980
|
|
},
|
|
{
|
|
"entropy": 5.031895637512207,
|
|
"epoch": 3.841018251681076,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000359039190926955,
|
|
"loss": 4.5573,
|
|
"mean_token_accuracy": 0.25454391092061995,
|
|
"num_tokens": 91654111.0,
|
|
"step": 39985
|
|
},
|
|
{
|
|
"entropy": 4.921099138259888,
|
|
"epoch": 3.84149855907781,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003590073875945154,
|
|
"loss": 4.3807,
|
|
"mean_token_accuracy": 0.2692526951432228,
|
|
"num_tokens": 91665474.0,
|
|
"step": 39990
|
|
},
|
|
{
|
|
"entropy": 5.033195829391479,
|
|
"epoch": 3.841978866474544,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035897558231167375,
|
|
"loss": 4.6354,
|
|
"mean_token_accuracy": 0.24294710755348206,
|
|
"num_tokens": 91676609.0,
|
|
"step": 39995
|
|
},
|
|
{
|
|
"entropy": 4.9567913055419925,
|
|
"epoch": 3.8424591738712777,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003589437750791685,
|
|
"loss": 4.5317,
|
|
"mean_token_accuracy": 0.2580776125192642,
|
|
"num_tokens": 91687213.0,
|
|
"step": 40000
|
|
},
|
|
{
|
|
"entropy": 4.980977869033813,
|
|
"epoch": 3.8429394812680115,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035891196589773805,
|
|
"loss": 4.5822,
|
|
"mean_token_accuracy": 0.25387428849935534,
|
|
"num_tokens": 91699220.0,
|
|
"step": 40005
|
|
},
|
|
{
|
|
"entropy": 4.955916023254394,
|
|
"epoch": 3.8434197886647454,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035888015476812085,
|
|
"loss": 4.4583,
|
|
"mean_token_accuracy": 0.26617265343666074,
|
|
"num_tokens": 91710020.0,
|
|
"step": 40010
|
|
},
|
|
{
|
|
"entropy": 4.968705177307129,
|
|
"epoch": 3.8439000960614793,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003588483416910557,
|
|
"loss": 4.5411,
|
|
"mean_token_accuracy": 0.26171448528766633,
|
|
"num_tokens": 91720708.0,
|
|
"step": 40015
|
|
},
|
|
{
|
|
"entropy": 5.007022714614868,
|
|
"epoch": 3.844380403458213,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003588165266672809,
|
|
"loss": 4.594,
|
|
"mean_token_accuracy": 0.2534105256199837,
|
|
"num_tokens": 91731545.0,
|
|
"step": 40020
|
|
},
|
|
{
|
|
"entropy": 5.022836780548095,
|
|
"epoch": 3.844860710854947,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00035878470969753524,
|
|
"loss": 4.5732,
|
|
"mean_token_accuracy": 0.2551906183362007,
|
|
"num_tokens": 91743848.0,
|
|
"step": 40025
|
|
},
|
|
{
|
|
"entropy": 5.068403673171997,
|
|
"epoch": 3.845341018251681,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035875289078255747,
|
|
"loss": 4.5678,
|
|
"mean_token_accuracy": 0.2623034417629242,
|
|
"num_tokens": 91754000.0,
|
|
"step": 40030
|
|
},
|
|
{
|
|
"entropy": 5.028774738311768,
|
|
"epoch": 3.845821325648415,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.00035872106992308614,
|
|
"loss": 4.5539,
|
|
"mean_token_accuracy": 0.2570123583078384,
|
|
"num_tokens": 91765563.0,
|
|
"step": 40035
|
|
},
|
|
{
|
|
"entropy": 5.025885343551636,
|
|
"epoch": 3.846301633045149,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003586892471198603,
|
|
"loss": 4.5666,
|
|
"mean_token_accuracy": 0.2550210729241371,
|
|
"num_tokens": 91777394.0,
|
|
"step": 40040
|
|
},
|
|
{
|
|
"entropy": 5.037660932540893,
|
|
"epoch": 3.8467819404418826,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003586574223736186,
|
|
"loss": 4.518,
|
|
"mean_token_accuracy": 0.26078993231058123,
|
|
"num_tokens": 91787161.0,
|
|
"step": 40045
|
|
},
|
|
{
|
|
"entropy": 5.044700956344604,
|
|
"epoch": 3.847262247838617,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035862559568509994,
|
|
"loss": 4.6193,
|
|
"mean_token_accuracy": 0.25049309730529784,
|
|
"num_tokens": 91797001.0,
|
|
"step": 40050
|
|
},
|
|
{
|
|
"entropy": 5.061703681945801,
|
|
"epoch": 3.8477425552353504,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003585937670550433,
|
|
"loss": 4.6166,
|
|
"mean_token_accuracy": 0.24490341544151306,
|
|
"num_tokens": 91808195.0,
|
|
"step": 40055
|
|
},
|
|
{
|
|
"entropy": 5.060799264907837,
|
|
"epoch": 3.8482228626320847,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00035856193648418753,
|
|
"loss": 4.5924,
|
|
"mean_token_accuracy": 0.25568001717329025,
|
|
"num_tokens": 91819907.0,
|
|
"step": 40060
|
|
},
|
|
{
|
|
"entropy": 5.048443460464478,
|
|
"epoch": 3.8487031700288186,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00035853010397327186,
|
|
"loss": 4.6248,
|
|
"mean_token_accuracy": 0.24797157049179078,
|
|
"num_tokens": 91831960.0,
|
|
"step": 40065
|
|
},
|
|
{
|
|
"entropy": 5.058000946044922,
|
|
"epoch": 3.8491834774255524,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003584982695230351,
|
|
"loss": 4.6637,
|
|
"mean_token_accuracy": 0.24729245752096177,
|
|
"num_tokens": 91844339.0,
|
|
"step": 40070
|
|
},
|
|
{
|
|
"entropy": 4.979017305374145,
|
|
"epoch": 3.8496637848222863,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003584664331342166,
|
|
"loss": 4.5561,
|
|
"mean_token_accuracy": 0.2598024055361748,
|
|
"num_tokens": 91856207.0,
|
|
"step": 40075
|
|
},
|
|
{
|
|
"entropy": 5.053192043304444,
|
|
"epoch": 3.85014409221902,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003584345948075552,
|
|
"loss": 4.6158,
|
|
"mean_token_accuracy": 0.25790252089500426,
|
|
"num_tokens": 91868307.0,
|
|
"step": 40080
|
|
},
|
|
{
|
|
"entropy": 5.0010346412658695,
|
|
"epoch": 3.850624399615754,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003584027545437903,
|
|
"loss": 4.5828,
|
|
"mean_token_accuracy": 0.2565372332930565,
|
|
"num_tokens": 91881164.0,
|
|
"step": 40085
|
|
},
|
|
{
|
|
"entropy": 5.021363210678101,
|
|
"epoch": 3.851104707012488,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035837091234366107,
|
|
"loss": 4.6184,
|
|
"mean_token_accuracy": 0.2601084515452385,
|
|
"num_tokens": 91893725.0,
|
|
"step": 40090
|
|
},
|
|
{
|
|
"entropy": 4.9291609764099125,
|
|
"epoch": 3.851585014409222,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.000358339068207907,
|
|
"loss": 4.4529,
|
|
"mean_token_accuracy": 0.26379876732826235,
|
|
"num_tokens": 91904646.0,
|
|
"step": 40095
|
|
},
|
|
{
|
|
"entropy": 4.995004606246948,
|
|
"epoch": 3.8520653218059557,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00035830722213726695,
|
|
"loss": 4.5477,
|
|
"mean_token_accuracy": 0.2660832405090332,
|
|
"num_tokens": 91916001.0,
|
|
"step": 40100
|
|
},
|
|
{
|
|
"entropy": 5.029636526107788,
|
|
"epoch": 3.8525456292026896,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00035827537413248077,
|
|
"loss": 4.6303,
|
|
"mean_token_accuracy": 0.24753239899873733,
|
|
"num_tokens": 91927247.0,
|
|
"step": 40105
|
|
},
|
|
{
|
|
"entropy": 5.068269920349121,
|
|
"epoch": 3.8530259365994235,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003582435241942877,
|
|
"loss": 4.6061,
|
|
"mean_token_accuracy": 0.2536361888051033,
|
|
"num_tokens": 91938081.0,
|
|
"step": 40110
|
|
},
|
|
{
|
|
"entropy": 5.072588062286377,
|
|
"epoch": 3.8535062439961574,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00035821167232342706,
|
|
"loss": 4.6418,
|
|
"mean_token_accuracy": 0.25105409473180773,
|
|
"num_tokens": 91949497.0,
|
|
"step": 40115
|
|
},
|
|
{
|
|
"entropy": 4.997418355941773,
|
|
"epoch": 3.8539865513928913,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003581798185206385,
|
|
"loss": 4.5792,
|
|
"mean_token_accuracy": 0.26007888168096543,
|
|
"num_tokens": 91960134.0,
|
|
"step": 40120
|
|
},
|
|
{
|
|
"entropy": 4.991606426239014,
|
|
"epoch": 3.8544668587896256,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00035814796278666166,
|
|
"loss": 4.5112,
|
|
"mean_token_accuracy": 0.2648820668458939,
|
|
"num_tokens": 91970402.0,
|
|
"step": 40125
|
|
},
|
|
{
|
|
"entropy": 4.950118112564087,
|
|
"epoch": 3.854947166186359,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00035811610512223585,
|
|
"loss": 4.5189,
|
|
"mean_token_accuracy": 0.26309927701950075,
|
|
"num_tokens": 91980542.0,
|
|
"step": 40130
|
|
},
|
|
{
|
|
"entropy": 4.972359371185303,
|
|
"epoch": 3.8554274735830933,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000358084245528101,
|
|
"loss": 4.4349,
|
|
"mean_token_accuracy": 0.2722579225897789,
|
|
"num_tokens": 91992078.0,
|
|
"step": 40135
|
|
},
|
|
{
|
|
"entropy": 5.022487640380859,
|
|
"epoch": 3.8559077809798272,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00035805238400499667,
|
|
"loss": 4.5101,
|
|
"mean_token_accuracy": 0.25768852680921556,
|
|
"num_tokens": 92002940.0,
|
|
"step": 40140
|
|
},
|
|
{
|
|
"entropy": 5.0083009719848635,
|
|
"epoch": 3.856388088376561,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003580205205536626,
|
|
"loss": 4.5663,
|
|
"mean_token_accuracy": 0.25933564603328707,
|
|
"num_tokens": 92015911.0,
|
|
"step": 40145
|
|
},
|
|
{
|
|
"entropy": 5.0195722579956055,
|
|
"epoch": 3.856868395773295,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003579886551748385,
|
|
"loss": 4.6635,
|
|
"mean_token_accuracy": 0.24959012418985366,
|
|
"num_tokens": 92028008.0,
|
|
"step": 40150
|
|
},
|
|
{
|
|
"entropy": 5.172059535980225,
|
|
"epoch": 3.857348703170029,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035795678786926434,
|
|
"loss": 4.7511,
|
|
"mean_token_accuracy": 0.24683846831321715,
|
|
"num_tokens": 92039833.0,
|
|
"step": 40155
|
|
},
|
|
{
|
|
"entropy": 5.0140434265136715,
|
|
"epoch": 3.8578290105667628,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00035792491863767984,
|
|
"loss": 4.4687,
|
|
"mean_token_accuracy": 0.2658264249563217,
|
|
"num_tokens": 92051929.0,
|
|
"step": 40160
|
|
},
|
|
{
|
|
"entropy": 5.119032955169677,
|
|
"epoch": 3.8583093179634966,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00035789304748082496,
|
|
"loss": 4.6065,
|
|
"mean_token_accuracy": 0.2566548839211464,
|
|
"num_tokens": 92062521.0,
|
|
"step": 40165
|
|
},
|
|
{
|
|
"entropy": 4.945581722259521,
|
|
"epoch": 3.8587896253602305,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00035786117439943967,
|
|
"loss": 4.5469,
|
|
"mean_token_accuracy": 0.25943452715873716,
|
|
"num_tokens": 92074120.0,
|
|
"step": 40170
|
|
},
|
|
{
|
|
"entropy": 5.014218282699585,
|
|
"epoch": 3.8592699327569644,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00035782929939426393,
|
|
"loss": 4.5602,
|
|
"mean_token_accuracy": 0.26194534450769424,
|
|
"num_tokens": 92085687.0,
|
|
"step": 40175
|
|
},
|
|
{
|
|
"entropy": 4.994827747344971,
|
|
"epoch": 3.8597502401536983,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003577974224660379,
|
|
"loss": 4.5433,
|
|
"mean_token_accuracy": 0.2633256301283836,
|
|
"num_tokens": 92098034.0,
|
|
"step": 40180
|
|
},
|
|
{
|
|
"entropy": 5.053682851791382,
|
|
"epoch": 3.860230547550432,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003577655436155015,
|
|
"loss": 4.6606,
|
|
"mean_token_accuracy": 0.25094130337238313,
|
|
"num_tokens": 92108220.0,
|
|
"step": 40185
|
|
},
|
|
{
|
|
"entropy": 5.003575181961059,
|
|
"epoch": 3.860710854947166,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003577336628433949,
|
|
"loss": 4.5313,
|
|
"mean_token_accuracy": 0.2605470433831215,
|
|
"num_tokens": 92120061.0,
|
|
"step": 40190
|
|
},
|
|
{
|
|
"entropy": 5.000588130950928,
|
|
"epoch": 3.8611911623439,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00035770178015045843,
|
|
"loss": 4.5656,
|
|
"mean_token_accuracy": 0.25295558869838713,
|
|
"num_tokens": 92131199.0,
|
|
"step": 40195
|
|
},
|
|
{
|
|
"entropy": 4.971785211563111,
|
|
"epoch": 3.8616714697406342,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003576698955374321,
|
|
"loss": 4.5599,
|
|
"mean_token_accuracy": 0.2541643589735031,
|
|
"num_tokens": 92142738.0,
|
|
"step": 40200
|
|
},
|
|
{
|
|
"entropy": 5.020510292053222,
|
|
"epoch": 3.8621517771373677,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003576380090050564,
|
|
"loss": 4.5925,
|
|
"mean_token_accuracy": 0.25685700327157973,
|
|
"num_tokens": 92154090.0,
|
|
"step": 40205
|
|
},
|
|
{
|
|
"entropy": 5.079177951812744,
|
|
"epoch": 3.862632084534102,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003576061205540714,
|
|
"loss": 4.5684,
|
|
"mean_token_accuracy": 0.2514734923839569,
|
|
"num_tokens": 92166100.0,
|
|
"step": 40210
|
|
},
|
|
{
|
|
"entropy": 5.007316970825196,
|
|
"epoch": 3.863112391930836,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003575742301852176,
|
|
"loss": 4.5968,
|
|
"mean_token_accuracy": 0.25327482670545576,
|
|
"num_tokens": 92178336.0,
|
|
"step": 40215
|
|
},
|
|
{
|
|
"entropy": 5.065899419784546,
|
|
"epoch": 3.8635926993275698,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003575423378992354,
|
|
"loss": 4.5641,
|
|
"mean_token_accuracy": 0.2494867518544197,
|
|
"num_tokens": 92189211.0,
|
|
"step": 40220
|
|
},
|
|
{
|
|
"entropy": 4.924892520904541,
|
|
"epoch": 3.8640730067243036,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003575104436968652,
|
|
"loss": 4.4862,
|
|
"mean_token_accuracy": 0.2656349092721939,
|
|
"num_tokens": 92200699.0,
|
|
"step": 40225
|
|
},
|
|
{
|
|
"entropy": 4.942655563354492,
|
|
"epoch": 3.8645533141210375,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00035747854757884744,
|
|
"loss": 4.476,
|
|
"mean_token_accuracy": 0.25916057378053664,
|
|
"num_tokens": 92212640.0,
|
|
"step": 40230
|
|
},
|
|
{
|
|
"entropy": 5.138998317718506,
|
|
"epoch": 3.8650336215177714,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.00035744664954592283,
|
|
"loss": 4.6725,
|
|
"mean_token_accuracy": 0.2513629883527756,
|
|
"num_tokens": 92225888.0,
|
|
"step": 40235
|
|
},
|
|
{
|
|
"entropy": 5.035348796844483,
|
|
"epoch": 3.8655139289145053,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00035741474959883175,
|
|
"loss": 4.5668,
|
|
"mean_token_accuracy": 0.25548707544803617,
|
|
"num_tokens": 92237842.0,
|
|
"step": 40240
|
|
},
|
|
{
|
|
"entropy": 5.056444454193115,
|
|
"epoch": 3.865994236311239,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.000357382847738315,
|
|
"loss": 4.6448,
|
|
"mean_token_accuracy": 0.2504651963710785,
|
|
"num_tokens": 92249986.0,
|
|
"step": 40245
|
|
},
|
|
{
|
|
"entropy": 5.054948329925537,
|
|
"epoch": 3.866474543707973,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00035735094396511304,
|
|
"loss": 4.5991,
|
|
"mean_token_accuracy": 0.2496475413441658,
|
|
"num_tokens": 92262682.0,
|
|
"step": 40250
|
|
},
|
|
{
|
|
"entropy": 5.056542491912841,
|
|
"epoch": 3.866954851104707,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003573190382799667,
|
|
"loss": 4.6023,
|
|
"mean_token_accuracy": 0.2562183827161789,
|
|
"num_tokens": 92274890.0,
|
|
"step": 40255
|
|
},
|
|
{
|
|
"entropy": 5.052334403991699,
|
|
"epoch": 3.867435158501441,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00035728713068361684,
|
|
"loss": 4.6475,
|
|
"mean_token_accuracy": 0.25369425117969513,
|
|
"num_tokens": 92284790.0,
|
|
"step": 40260
|
|
},
|
|
{
|
|
"entropy": 5.026362419128418,
|
|
"epoch": 3.8679154658981747,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035725522117680404,
|
|
"loss": 4.6319,
|
|
"mean_token_accuracy": 0.2510994240641594,
|
|
"num_tokens": 92296108.0,
|
|
"step": 40265
|
|
},
|
|
{
|
|
"entropy": 5.077983665466308,
|
|
"epoch": 3.8683957732949086,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003572233097602693,
|
|
"loss": 4.6178,
|
|
"mean_token_accuracy": 0.2468857929110527,
|
|
"num_tokens": 92307648.0,
|
|
"step": 40270
|
|
},
|
|
{
|
|
"entropy": 5.066457748413086,
|
|
"epoch": 3.868876080691643,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00035719139643475344,
|
|
"loss": 4.6481,
|
|
"mean_token_accuracy": 0.2497810408473015,
|
|
"num_tokens": 92320405.0,
|
|
"step": 40275
|
|
},
|
|
{
|
|
"entropy": 5.02928204536438,
|
|
"epoch": 3.8693563880883763,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00035715948120099747,
|
|
"loss": 4.5863,
|
|
"mean_token_accuracy": 0.2642838329076767,
|
|
"num_tokens": 92331903.0,
|
|
"step": 40280
|
|
},
|
|
{
|
|
"entropy": 5.074890565872193,
|
|
"epoch": 3.8698366954851107,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035712756405974216,
|
|
"loss": 4.6557,
|
|
"mean_token_accuracy": 0.24650092869997026,
|
|
"num_tokens": 92343336.0,
|
|
"step": 40285
|
|
},
|
|
{
|
|
"entropy": 5.055065298080445,
|
|
"epoch": 3.870317002881844,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003570956450117288,
|
|
"loss": 4.633,
|
|
"mean_token_accuracy": 0.24734124392271042,
|
|
"num_tokens": 92356283.0,
|
|
"step": 40290
|
|
},
|
|
{
|
|
"entropy": 5.058116102218628,
|
|
"epoch": 3.8707973102785784,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003570637240576983,
|
|
"loss": 4.6162,
|
|
"mean_token_accuracy": 0.25456833839416504,
|
|
"num_tokens": 92367893.0,
|
|
"step": 40295
|
|
},
|
|
{
|
|
"entropy": 5.004996109008789,
|
|
"epoch": 3.8712776176753123,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00035703180119839187,
|
|
"loss": 4.4979,
|
|
"mean_token_accuracy": 0.2566749170422554,
|
|
"num_tokens": 92379019.0,
|
|
"step": 40300
|
|
},
|
|
{
|
|
"entropy": 4.991738224029541,
|
|
"epoch": 3.871757925072046,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003569998764345506,
|
|
"loss": 4.5181,
|
|
"mean_token_accuracy": 0.26546483486890793,
|
|
"num_tokens": 92390082.0,
|
|
"step": 40305
|
|
},
|
|
{
|
|
"entropy": 4.96423373222351,
|
|
"epoch": 3.87223823246878,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00035696794976691564,
|
|
"loss": 4.5476,
|
|
"mean_token_accuracy": 0.26004464030265806,
|
|
"num_tokens": 92401323.0,
|
|
"step": 40310
|
|
},
|
|
{
|
|
"entropy": 4.975429821014404,
|
|
"epoch": 3.872718539865514,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00035693602119622823,
|
|
"loss": 4.5061,
|
|
"mean_token_accuracy": 0.2599553197622299,
|
|
"num_tokens": 92412329.0,
|
|
"step": 40315
|
|
},
|
|
{
|
|
"entropy": 5.186695146560669,
|
|
"epoch": 3.873198847262248,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003569040907232298,
|
|
"loss": 4.7025,
|
|
"mean_token_accuracy": 0.2507550254464149,
|
|
"num_tokens": 92423267.0,
|
|
"step": 40320
|
|
},
|
|
{
|
|
"entropy": 4.98967752456665,
|
|
"epoch": 3.8736791546589817,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00035687215834866154,
|
|
"loss": 4.5187,
|
|
"mean_token_accuracy": 0.2621413618326187,
|
|
"num_tokens": 92434519.0,
|
|
"step": 40325
|
|
},
|
|
{
|
|
"entropy": 4.975248765945435,
|
|
"epoch": 3.8741594620557156,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00035684022407326486,
|
|
"loss": 4.4972,
|
|
"mean_token_accuracy": 0.2658769950270653,
|
|
"num_tokens": 92445834.0,
|
|
"step": 40330
|
|
},
|
|
{
|
|
"entropy": 4.983595323562622,
|
|
"epoch": 3.8746397694524495,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00035680828789778126,
|
|
"loss": 4.5913,
|
|
"mean_token_accuracy": 0.2553894594311714,
|
|
"num_tokens": 92457648.0,
|
|
"step": 40335
|
|
},
|
|
{
|
|
"entropy": 5.012076759338379,
|
|
"epoch": 3.8751200768491834,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003567763498229521,
|
|
"loss": 4.5907,
|
|
"mean_token_accuracy": 0.26056757271289827,
|
|
"num_tokens": 92469080.0,
|
|
"step": 40340
|
|
},
|
|
{
|
|
"entropy": 5.0465068340301515,
|
|
"epoch": 3.8756003842459172,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003567444098495189,
|
|
"loss": 4.5919,
|
|
"mean_token_accuracy": 0.25375884771347046,
|
|
"num_tokens": 92481679.0,
|
|
"step": 40345
|
|
},
|
|
{
|
|
"entropy": 5.027431678771973,
|
|
"epoch": 3.8760806916426516,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035671246797822325,
|
|
"loss": 4.5633,
|
|
"mean_token_accuracy": 0.2586736559867859,
|
|
"num_tokens": 92493143.0,
|
|
"step": 40350
|
|
},
|
|
{
|
|
"entropy": 4.99344835281372,
|
|
"epoch": 3.876560999039385,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003566805242098067,
|
|
"loss": 4.5218,
|
|
"mean_token_accuracy": 0.25548364967107773,
|
|
"num_tokens": 92504302.0,
|
|
"step": 40355
|
|
},
|
|
{
|
|
"entropy": 5.049513864517212,
|
|
"epoch": 3.8770413064361193,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000356648578545011,
|
|
"loss": 4.5694,
|
|
"mean_token_accuracy": 0.24792903512716294,
|
|
"num_tokens": 92514797.0,
|
|
"step": 40360
|
|
},
|
|
{
|
|
"entropy": 5.096840286254883,
|
|
"epoch": 3.8775216138328528,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003566166309845777,
|
|
"loss": 4.6646,
|
|
"mean_token_accuracy": 0.24864757657051087,
|
|
"num_tokens": 92526492.0,
|
|
"step": 40365
|
|
},
|
|
{
|
|
"entropy": 5.011972665786743,
|
|
"epoch": 3.878001921229587,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003565846815292486,
|
|
"loss": 4.5989,
|
|
"mean_token_accuracy": 0.25683217495679855,
|
|
"num_tokens": 92536980.0,
|
|
"step": 40370
|
|
},
|
|
{
|
|
"entropy": 4.9599837303161625,
|
|
"epoch": 3.878482228626321,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003565527301797654,
|
|
"loss": 4.5479,
|
|
"mean_token_accuracy": 0.2626758188009262,
|
|
"num_tokens": 92548384.0,
|
|
"step": 40375
|
|
},
|
|
{
|
|
"entropy": 4.972018814086914,
|
|
"epoch": 3.878962536023055,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00035652077693687,
|
|
"loss": 4.5771,
|
|
"mean_token_accuracy": 0.2602414324879646,
|
|
"num_tokens": 92559853.0,
|
|
"step": 40380
|
|
},
|
|
{
|
|
"entropy": 5.073757457733154,
|
|
"epoch": 3.8794428434197887,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003564888218013042,
|
|
"loss": 4.6538,
|
|
"mean_token_accuracy": 0.24451006948947906,
|
|
"num_tokens": 92570740.0,
|
|
"step": 40385
|
|
},
|
|
{
|
|
"entropy": 4.905328321456909,
|
|
"epoch": 3.8799231508165226,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00035645686477380996,
|
|
"loss": 4.4553,
|
|
"mean_token_accuracy": 0.2660682380199432,
|
|
"num_tokens": 92583246.0,
|
|
"step": 40390
|
|
},
|
|
{
|
|
"entropy": 5.088708829879761,
|
|
"epoch": 3.8804034582132565,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003564249058551293,
|
|
"loss": 4.6643,
|
|
"mean_token_accuracy": 0.2513762578368187,
|
|
"num_tokens": 92595551.0,
|
|
"step": 40395
|
|
},
|
|
{
|
|
"entropy": 4.991014051437378,
|
|
"epoch": 3.8808837656099904,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003563929450460039,
|
|
"loss": 4.4839,
|
|
"mean_token_accuracy": 0.26631841659545896,
|
|
"num_tokens": 92608620.0,
|
|
"step": 40400
|
|
},
|
|
{
|
|
"entropy": 5.000298166275025,
|
|
"epoch": 3.8813640730067243,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003563609823471761,
|
|
"loss": 4.5495,
|
|
"mean_token_accuracy": 0.25492335110902786,
|
|
"num_tokens": 92620342.0,
|
|
"step": 40405
|
|
},
|
|
{
|
|
"entropy": 4.9241396427154545,
|
|
"epoch": 3.881844380403458,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035632901775938794,
|
|
"loss": 4.5128,
|
|
"mean_token_accuracy": 0.26748220026493075,
|
|
"num_tokens": 92630633.0,
|
|
"step": 40410
|
|
},
|
|
{
|
|
"entropy": 5.065708589553833,
|
|
"epoch": 3.882324687800192,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003562970512833814,
|
|
"loss": 4.7616,
|
|
"mean_token_accuracy": 0.23544372916221618,
|
|
"num_tokens": 92643134.0,
|
|
"step": 40415
|
|
},
|
|
{
|
|
"entropy": 5.044671964645386,
|
|
"epoch": 3.882804995196926,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0003562650829198988,
|
|
"loss": 4.496,
|
|
"mean_token_accuracy": 0.2599635273218155,
|
|
"num_tokens": 92654765.0,
|
|
"step": 40420
|
|
},
|
|
{
|
|
"entropy": 5.0457817077636715,
|
|
"epoch": 3.88328530259366,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00035623311266968225,
|
|
"loss": 4.6367,
|
|
"mean_token_accuracy": 0.26201675087213516,
|
|
"num_tokens": 92666321.0,
|
|
"step": 40425
|
|
},
|
|
{
|
|
"entropy": 5.008311605453491,
|
|
"epoch": 3.8837656099903937,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.000356201140533474,
|
|
"loss": 4.5524,
|
|
"mean_token_accuracy": 0.2565996006131172,
|
|
"num_tokens": 92679214.0,
|
|
"step": 40430
|
|
},
|
|
{
|
|
"entropy": 4.949558734893799,
|
|
"epoch": 3.884245917387128,
|
|
"grad_norm": 0.8671875,
|
|
"learning_rate": 0.0003561691665120164,
|
|
"loss": 4.5407,
|
|
"mean_token_accuracy": 0.2619334802031517,
|
|
"num_tokens": 92692064.0,
|
|
"step": 40435
|
|
},
|
|
{
|
|
"entropy": 5.042934322357178,
|
|
"epoch": 3.8847262247838614,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0003561371906060518,
|
|
"loss": 4.5877,
|
|
"mean_token_accuracy": 0.25791886895895005,
|
|
"num_tokens": 92704732.0,
|
|
"step": 40440
|
|
},
|
|
{
|
|
"entropy": 5.044079446792603,
|
|
"epoch": 3.8852065321805958,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035610521281632257,
|
|
"loss": 4.5936,
|
|
"mean_token_accuracy": 0.263380928337574,
|
|
"num_tokens": 92714176.0,
|
|
"step": 40445
|
|
},
|
|
{
|
|
"entropy": 5.0051257610321045,
|
|
"epoch": 3.8856868395773296,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003560732331435711,
|
|
"loss": 4.5348,
|
|
"mean_token_accuracy": 0.25943925976753235,
|
|
"num_tokens": 92725874.0,
|
|
"step": 40450
|
|
},
|
|
{
|
|
"entropy": 4.9321184158325195,
|
|
"epoch": 3.8861671469740635,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035604125158853996,
|
|
"loss": 4.4603,
|
|
"mean_token_accuracy": 0.2608669146895409,
|
|
"num_tokens": 92738040.0,
|
|
"step": 40455
|
|
},
|
|
{
|
|
"entropy": 5.064031553268433,
|
|
"epoch": 3.8866474543707974,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003560092681519715,
|
|
"loss": 4.5476,
|
|
"mean_token_accuracy": 0.26041978746652605,
|
|
"num_tokens": 92749818.0,
|
|
"step": 40460
|
|
},
|
|
{
|
|
"entropy": 5.014881896972656,
|
|
"epoch": 3.8871277617675313,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00035597728283460843,
|
|
"loss": 4.552,
|
|
"mean_token_accuracy": 0.2640788063406944,
|
|
"num_tokens": 92761209.0,
|
|
"step": 40465
|
|
},
|
|
{
|
|
"entropy": 4.96976261138916,
|
|
"epoch": 3.887608069164265,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00035594529563719336,
|
|
"loss": 4.4805,
|
|
"mean_token_accuracy": 0.2637583568692207,
|
|
"num_tokens": 92771698.0,
|
|
"step": 40470
|
|
},
|
|
{
|
|
"entropy": 5.025620555877685,
|
|
"epoch": 3.888088376560999,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003559133065604688,
|
|
"loss": 4.61,
|
|
"mean_token_accuracy": 0.26462485194206237,
|
|
"num_tokens": 92783753.0,
|
|
"step": 40475
|
|
},
|
|
{
|
|
"entropy": 5.0019388675689695,
|
|
"epoch": 3.888568683957733,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0003558813156051776,
|
|
"loss": 4.5363,
|
|
"mean_token_accuracy": 0.26133959740400314,
|
|
"num_tokens": 92796640.0,
|
|
"step": 40480
|
|
},
|
|
{
|
|
"entropy": 5.071239852905274,
|
|
"epoch": 3.889048991354467,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003558493227720624,
|
|
"loss": 4.6296,
|
|
"mean_token_accuracy": 0.2512615814805031,
|
|
"num_tokens": 92808163.0,
|
|
"step": 40485
|
|
},
|
|
{
|
|
"entropy": 4.970729207992553,
|
|
"epoch": 3.8895292987512007,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00035581732806186595,
|
|
"loss": 4.5056,
|
|
"mean_token_accuracy": 0.26607673466205595,
|
|
"num_tokens": 92818953.0,
|
|
"step": 40490
|
|
},
|
|
{
|
|
"entropy": 4.972127389907837,
|
|
"epoch": 3.8900096061479346,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0003557853314753311,
|
|
"loss": 4.5561,
|
|
"mean_token_accuracy": 0.25313073992729185,
|
|
"num_tokens": 92830555.0,
|
|
"step": 40495
|
|
},
|
|
{
|
|
"entropy": 5.105244302749634,
|
|
"epoch": 3.8904899135446684,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035575333301320083,
|
|
"loss": 4.6175,
|
|
"mean_token_accuracy": 0.25153629034757613,
|
|
"num_tokens": 92841491.0,
|
|
"step": 40500
|
|
},
|
|
{
|
|
"entropy": 5.062092065811157,
|
|
"epoch": 3.8909702209414023,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00035572133267621793,
|
|
"loss": 4.5981,
|
|
"mean_token_accuracy": 0.26078147292137144,
|
|
"num_tokens": 92853102.0,
|
|
"step": 40505
|
|
},
|
|
{
|
|
"entropy": 4.969434452056885,
|
|
"epoch": 3.8914505283381366,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003556893304651253,
|
|
"loss": 4.5527,
|
|
"mean_token_accuracy": 0.2627414897084236,
|
|
"num_tokens": 92863034.0,
|
|
"step": 40510
|
|
},
|
|
{
|
|
"entropy": 5.0124836444854735,
|
|
"epoch": 3.89193083573487,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003556573263806661,
|
|
"loss": 4.5944,
|
|
"mean_token_accuracy": 0.2548529148101807,
|
|
"num_tokens": 92875076.0,
|
|
"step": 40515
|
|
},
|
|
{
|
|
"entropy": 5.020214176177978,
|
|
"epoch": 3.8924111431316044,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003556253204235833,
|
|
"loss": 4.5798,
|
|
"mean_token_accuracy": 0.25366864949464796,
|
|
"num_tokens": 92886725.0,
|
|
"step": 40520
|
|
},
|
|
{
|
|
"entropy": 5.052117729187012,
|
|
"epoch": 3.8928914505283383,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003555933125946199,
|
|
"loss": 4.5626,
|
|
"mean_token_accuracy": 0.2524331137537956,
|
|
"num_tokens": 92897244.0,
|
|
"step": 40525
|
|
},
|
|
{
|
|
"entropy": 5.118389129638672,
|
|
"epoch": 3.893371757925072,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003555613028945191,
|
|
"loss": 4.6419,
|
|
"mean_token_accuracy": 0.24709675312042237,
|
|
"num_tokens": 92908161.0,
|
|
"step": 40530
|
|
},
|
|
{
|
|
"entropy": 4.991269731521607,
|
|
"epoch": 3.893852065321806,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035552929132402414,
|
|
"loss": 4.5247,
|
|
"mean_token_accuracy": 0.2612187057733536,
|
|
"num_tokens": 92919065.0,
|
|
"step": 40535
|
|
},
|
|
{
|
|
"entropy": 5.092784070968628,
|
|
"epoch": 3.89433237271854,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00035549727788387805,
|
|
"loss": 4.6633,
|
|
"mean_token_accuracy": 0.24890659153461456,
|
|
"num_tokens": 92930224.0,
|
|
"step": 40540
|
|
},
|
|
{
|
|
"entropy": 5.043989801406861,
|
|
"epoch": 3.894812680115274,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00035546526257482426,
|
|
"loss": 4.5954,
|
|
"mean_token_accuracy": 0.2656810373067856,
|
|
"num_tokens": 92942109.0,
|
|
"step": 40545
|
|
},
|
|
{
|
|
"entropy": 5.0112182140350345,
|
|
"epoch": 3.8952929875120077,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035543324539760604,
|
|
"loss": 4.5437,
|
|
"mean_token_accuracy": 0.2597032755613327,
|
|
"num_tokens": 92954984.0,
|
|
"step": 40550
|
|
},
|
|
{
|
|
"entropy": 5.0363400936126705,
|
|
"epoch": 3.8957732949087416,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003554012263529666,
|
|
"loss": 4.585,
|
|
"mean_token_accuracy": 0.2545877620577812,
|
|
"num_tokens": 92965946.0,
|
|
"step": 40555
|
|
},
|
|
{
|
|
"entropy": 5.017441368103027,
|
|
"epoch": 3.8962536023054755,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003553692054416495,
|
|
"loss": 4.565,
|
|
"mean_token_accuracy": 0.26096436381340027,
|
|
"num_tokens": 92978035.0,
|
|
"step": 40560
|
|
},
|
|
{
|
|
"entropy": 5.018599796295166,
|
|
"epoch": 3.8967339097022093,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003553371826643981,
|
|
"loss": 4.5928,
|
|
"mean_token_accuracy": 0.25130487233400345,
|
|
"num_tokens": 92989647.0,
|
|
"step": 40565
|
|
},
|
|
{
|
|
"entropy": 4.983860492706299,
|
|
"epoch": 3.8972142170989432,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003553051580219558,
|
|
"loss": 4.5443,
|
|
"mean_token_accuracy": 0.26015380024909973,
|
|
"num_tokens": 93000699.0,
|
|
"step": 40570
|
|
},
|
|
{
|
|
"entropy": 5.00658802986145,
|
|
"epoch": 3.897694524495677,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003552731315150662,
|
|
"loss": 4.4625,
|
|
"mean_token_accuracy": 0.2607811912894249,
|
|
"num_tokens": 93011463.0,
|
|
"step": 40575
|
|
},
|
|
{
|
|
"entropy": 5.052910804748535,
|
|
"epoch": 3.898174831892411,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00035524110314447295,
|
|
"loss": 4.5977,
|
|
"mean_token_accuracy": 0.25500153601169584,
|
|
"num_tokens": 93022147.0,
|
|
"step": 40580
|
|
},
|
|
{
|
|
"entropy": 4.894663095474243,
|
|
"epoch": 3.8986551392891453,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003552090729109195,
|
|
"loss": 4.4658,
|
|
"mean_token_accuracy": 0.264930659532547,
|
|
"num_tokens": 93035155.0,
|
|
"step": 40585
|
|
},
|
|
{
|
|
"entropy": 5.076678848266601,
|
|
"epoch": 3.8991354466858787,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003551770408151496,
|
|
"loss": 4.5661,
|
|
"mean_token_accuracy": 0.26107007414102557,
|
|
"num_tokens": 93045724.0,
|
|
"step": 40590
|
|
},
|
|
{
|
|
"entropy": 5.069697093963623,
|
|
"epoch": 3.899615754082613,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035514500685790673,
|
|
"loss": 4.6472,
|
|
"mean_token_accuracy": 0.25365510284900666,
|
|
"num_tokens": 93057629.0,
|
|
"step": 40595
|
|
},
|
|
{
|
|
"entropy": 4.997034406661987,
|
|
"epoch": 3.9000960614793465,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00035511297103993486,
|
|
"loss": 4.5902,
|
|
"mean_token_accuracy": 0.25952237844467163,
|
|
"num_tokens": 93068426.0,
|
|
"step": 40600
|
|
},
|
|
{
|
|
"entropy": 4.99796199798584,
|
|
"epoch": 3.900576368876081,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003550809333619778,
|
|
"loss": 4.5913,
|
|
"mean_token_accuracy": 0.2598285138607025,
|
|
"num_tokens": 93079344.0,
|
|
"step": 40605
|
|
},
|
|
{
|
|
"entropy": 4.976623296737671,
|
|
"epoch": 3.9010566762728147,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003550488938247791,
|
|
"loss": 4.5596,
|
|
"mean_token_accuracy": 0.2586202874779701,
|
|
"num_tokens": 93091111.0,
|
|
"step": 40610
|
|
},
|
|
{
|
|
"entropy": 5.062695121765136,
|
|
"epoch": 3.9015369836695486,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003550168524290828,
|
|
"loss": 4.6054,
|
|
"mean_token_accuracy": 0.2525621071457863,
|
|
"num_tokens": 93102202.0,
|
|
"step": 40615
|
|
},
|
|
{
|
|
"entropy": 4.979496574401855,
|
|
"epoch": 3.9020172910662825,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035498480917563294,
|
|
"loss": 4.5303,
|
|
"mean_token_accuracy": 0.2641531974077225,
|
|
"num_tokens": 93113575.0,
|
|
"step": 40620
|
|
},
|
|
{
|
|
"entropy": 5.041490840911865,
|
|
"epoch": 3.9024975984630164,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003549527640651732,
|
|
"loss": 4.5889,
|
|
"mean_token_accuracy": 0.25789677202701566,
|
|
"num_tokens": 93125402.0,
|
|
"step": 40625
|
|
},
|
|
{
|
|
"entropy": 4.9819316387176515,
|
|
"epoch": 3.9029779058597502,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003549207170984476,
|
|
"loss": 4.5502,
|
|
"mean_token_accuracy": 0.26411712169647217,
|
|
"num_tokens": 93138147.0,
|
|
"step": 40630
|
|
},
|
|
{
|
|
"entropy": 5.046895551681518,
|
|
"epoch": 3.903458213256484,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00035488866827620047,
|
|
"loss": 4.5586,
|
|
"mean_token_accuracy": 0.2583329573273659,
|
|
"num_tokens": 93150299.0,
|
|
"step": 40635
|
|
},
|
|
{
|
|
"entropy": 4.928134822845459,
|
|
"epoch": 3.903938520653218,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003548566175991756,
|
|
"loss": 4.4604,
|
|
"mean_token_accuracy": 0.2732646033167839,
|
|
"num_tokens": 93162224.0,
|
|
"step": 40640
|
|
},
|
|
{
|
|
"entropy": 5.016542720794678,
|
|
"epoch": 3.904418828049952,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00035482456506811714,
|
|
"loss": 4.5878,
|
|
"mean_token_accuracy": 0.2569653183221817,
|
|
"num_tokens": 93173144.0,
|
|
"step": 40645
|
|
},
|
|
{
|
|
"entropy": 5.168558788299561,
|
|
"epoch": 3.9048991354466858,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00035479251068376935,
|
|
"loss": 4.7406,
|
|
"mean_token_accuracy": 0.23780472129583358,
|
|
"num_tokens": 93183515.0,
|
|
"step": 40650
|
|
},
|
|
{
|
|
"entropy": 4.986981916427612,
|
|
"epoch": 3.9053794428434196,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00035476045444687637,
|
|
"loss": 4.4881,
|
|
"mean_token_accuracy": 0.2621208518743515,
|
|
"num_tokens": 93194184.0,
|
|
"step": 40655
|
|
},
|
|
{
|
|
"entropy": 5.14276385307312,
|
|
"epoch": 3.905859750240154,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003547283963581825,
|
|
"loss": 4.7494,
|
|
"mean_token_accuracy": 0.2461374044418335,
|
|
"num_tokens": 93205088.0,
|
|
"step": 40660
|
|
},
|
|
{
|
|
"entropy": 5.024680280685425,
|
|
"epoch": 3.9063400576368874,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000354696336418432,
|
|
"loss": 4.5426,
|
|
"mean_token_accuracy": 0.2620361417531967,
|
|
"num_tokens": 93216160.0,
|
|
"step": 40665
|
|
},
|
|
{
|
|
"entropy": 4.97866678237915,
|
|
"epoch": 3.9068203650336217,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00035466427462836923,
|
|
"loss": 4.5053,
|
|
"mean_token_accuracy": 0.2651329576969147,
|
|
"num_tokens": 93227658.0,
|
|
"step": 40670
|
|
},
|
|
{
|
|
"entropy": 5.033227825164795,
|
|
"epoch": 3.907300672430355,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00035463221098873854,
|
|
"loss": 4.5643,
|
|
"mean_token_accuracy": 0.2548597753047943,
|
|
"num_tokens": 93238684.0,
|
|
"step": 40675
|
|
},
|
|
{
|
|
"entropy": 5.089631271362305,
|
|
"epoch": 3.9077809798270895,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003546001455002844,
|
|
"loss": 4.7199,
|
|
"mean_token_accuracy": 0.24306940138339997,
|
|
"num_tokens": 93249772.0,
|
|
"step": 40680
|
|
},
|
|
{
|
|
"entropy": 5.054675102233887,
|
|
"epoch": 3.9082612872238234,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003545680781637512,
|
|
"loss": 4.5717,
|
|
"mean_token_accuracy": 0.25557660311460495,
|
|
"num_tokens": 93261642.0,
|
|
"step": 40685
|
|
},
|
|
{
|
|
"entropy": 5.043794918060303,
|
|
"epoch": 3.9087415946205573,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003545360089798836,
|
|
"loss": 4.5396,
|
|
"mean_token_accuracy": 0.2570954456925392,
|
|
"num_tokens": 93273704.0,
|
|
"step": 40690
|
|
},
|
|
{
|
|
"entropy": 4.995614528656006,
|
|
"epoch": 3.909221902017291,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003545039379494259,
|
|
"loss": 4.5702,
|
|
"mean_token_accuracy": 0.2582040175795555,
|
|
"num_tokens": 93285396.0,
|
|
"step": 40695
|
|
},
|
|
{
|
|
"entropy": 5.042558670043945,
|
|
"epoch": 3.909702209414025,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000354471865073123,
|
|
"loss": 4.6008,
|
|
"mean_token_accuracy": 0.2539967283606529,
|
|
"num_tokens": 93297411.0,
|
|
"step": 40700
|
|
},
|
|
{
|
|
"entropy": 4.9808141708374025,
|
|
"epoch": 3.910182516810759,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003544397903517192,
|
|
"loss": 4.5196,
|
|
"mean_token_accuracy": 0.2590713560581207,
|
|
"num_tokens": 93310545.0,
|
|
"step": 40705
|
|
},
|
|
{
|
|
"entropy": 4.991250991821289,
|
|
"epoch": 3.910662824207493,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003544077137859595,
|
|
"loss": 4.5166,
|
|
"mean_token_accuracy": 0.25909569561481477,
|
|
"num_tokens": 93321851.0,
|
|
"step": 40710
|
|
},
|
|
{
|
|
"entropy": 5.039916086196899,
|
|
"epoch": 3.9111431316042267,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035437563537658835,
|
|
"loss": 4.5457,
|
|
"mean_token_accuracy": 0.26155620366334914,
|
|
"num_tokens": 93332662.0,
|
|
"step": 40715
|
|
},
|
|
{
|
|
"entropy": 4.929525089263916,
|
|
"epoch": 3.9116234390009605,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00035434355512435075,
|
|
"loss": 4.4911,
|
|
"mean_token_accuracy": 0.26295938342809677,
|
|
"num_tokens": 93344979.0,
|
|
"step": 40720
|
|
},
|
|
{
|
|
"entropy": 5.005500793457031,
|
|
"epoch": 3.9121037463976944,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003543114730299913,
|
|
"loss": 4.5226,
|
|
"mean_token_accuracy": 0.2658160224556923,
|
|
"num_tokens": 93356427.0,
|
|
"step": 40725
|
|
},
|
|
{
|
|
"entropy": 5.068350076675415,
|
|
"epoch": 3.9125840537944283,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003542793890942551,
|
|
"loss": 4.5806,
|
|
"mean_token_accuracy": 0.25181279331445694,
|
|
"num_tokens": 93368377.0,
|
|
"step": 40730
|
|
},
|
|
{
|
|
"entropy": 5.010583782196045,
|
|
"epoch": 3.913064361191162,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003542473033178868,
|
|
"loss": 4.5818,
|
|
"mean_token_accuracy": 0.2549967184662819,
|
|
"num_tokens": 93378203.0,
|
|
"step": 40735
|
|
},
|
|
{
|
|
"entropy": 4.902292633056641,
|
|
"epoch": 3.913544668587896,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003542152157016315,
|
|
"loss": 4.4748,
|
|
"mean_token_accuracy": 0.26815159171819686,
|
|
"num_tokens": 93390326.0,
|
|
"step": 40740
|
|
},
|
|
{
|
|
"entropy": 4.976503133773804,
|
|
"epoch": 3.9140249759846304,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000354183126246234,
|
|
"loss": 4.5318,
|
|
"mean_token_accuracy": 0.2609789833426476,
|
|
"num_tokens": 93401364.0,
|
|
"step": 40745
|
|
},
|
|
{
|
|
"entropy": 5.009788227081299,
|
|
"epoch": 3.914505283381364,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003541510349524395,
|
|
"loss": 4.4667,
|
|
"mean_token_accuracy": 0.2640381783246994,
|
|
"num_tokens": 93412468.0,
|
|
"step": 40750
|
|
},
|
|
{
|
|
"entropy": 5.039696311950683,
|
|
"epoch": 3.914985590778098,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000354118941820993,
|
|
"loss": 4.6558,
|
|
"mean_token_accuracy": 0.25004173517227174,
|
|
"num_tokens": 93424605.0,
|
|
"step": 40755
|
|
},
|
|
{
|
|
"entropy": 4.959198617935181,
|
|
"epoch": 3.915465898174832,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035408684685263954,
|
|
"loss": 4.448,
|
|
"mean_token_accuracy": 0.2613840788602829,
|
|
"num_tokens": 93435624.0,
|
|
"step": 40760
|
|
},
|
|
{
|
|
"entropy": 5.115620374679565,
|
|
"epoch": 3.915946205571566,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003540547500481244,
|
|
"loss": 4.7303,
|
|
"mean_token_accuracy": 0.23652007281780243,
|
|
"num_tokens": 93447455.0,
|
|
"step": 40765
|
|
},
|
|
{
|
|
"entropy": 5.006617879867553,
|
|
"epoch": 3.9164265129683,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003540226514081927,
|
|
"loss": 4.5462,
|
|
"mean_token_accuracy": 0.256976281106472,
|
|
"num_tokens": 93459928.0,
|
|
"step": 40770
|
|
},
|
|
{
|
|
"entropy": 5.04410490989685,
|
|
"epoch": 3.9169068203650337,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00035399055093358965,
|
|
"loss": 4.6271,
|
|
"mean_token_accuracy": 0.25560567528009415,
|
|
"num_tokens": 93471277.0,
|
|
"step": 40775
|
|
},
|
|
{
|
|
"entropy": 5.061774063110351,
|
|
"epoch": 3.9173871277617676,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00035395844862506063,
|
|
"loss": 4.5706,
|
|
"mean_token_accuracy": 0.2573514088988304,
|
|
"num_tokens": 93483085.0,
|
|
"step": 40780
|
|
},
|
|
{
|
|
"entropy": 5.025407123565674,
|
|
"epoch": 3.9178674351585014,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003539263444833509,
|
|
"loss": 4.5713,
|
|
"mean_token_accuracy": 0.26146596372127534,
|
|
"num_tokens": 93494769.0,
|
|
"step": 40785
|
|
},
|
|
{
|
|
"entropy": 4.9588886260986325,
|
|
"epoch": 3.9183477425552353,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00035389423850920576,
|
|
"loss": 4.5458,
|
|
"mean_token_accuracy": 0.2514429584145546,
|
|
"num_tokens": 93506860.0,
|
|
"step": 40790
|
|
},
|
|
{
|
|
"entropy": 4.909133005142212,
|
|
"epoch": 3.918828049951969,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00035386213070337063,
|
|
"loss": 4.507,
|
|
"mean_token_accuracy": 0.26390181481838226,
|
|
"num_tokens": 93519260.0,
|
|
"step": 40795
|
|
},
|
|
{
|
|
"entropy": 5.025860834121704,
|
|
"epoch": 3.919308357348703,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.00035383002106659114,
|
|
"loss": 4.5098,
|
|
"mean_token_accuracy": 0.2566466674208641,
|
|
"num_tokens": 93530261.0,
|
|
"step": 40800
|
|
},
|
|
{
|
|
"entropy": 5.0611790180206295,
|
|
"epoch": 3.919788664745437,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00035379790959961247,
|
|
"loss": 4.6,
|
|
"mean_token_accuracy": 0.2593217626214027,
|
|
"num_tokens": 93542317.0,
|
|
"step": 40805
|
|
},
|
|
{
|
|
"entropy": 4.993230152130127,
|
|
"epoch": 3.920268972142171,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003537657963031804,
|
|
"loss": 4.5488,
|
|
"mean_token_accuracy": 0.26327076107263564,
|
|
"num_tokens": 93554038.0,
|
|
"step": 40810
|
|
},
|
|
{
|
|
"entropy": 5.02410740852356,
|
|
"epoch": 3.9207492795389047,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035373368117804047,
|
|
"loss": 4.5542,
|
|
"mean_token_accuracy": 0.25785010904073713,
|
|
"num_tokens": 93565263.0,
|
|
"step": 40815
|
|
},
|
|
{
|
|
"entropy": 5.103114128112793,
|
|
"epoch": 3.921229586935639,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00035370156422493814,
|
|
"loss": 4.6672,
|
|
"mean_token_accuracy": 0.2512001946568489,
|
|
"num_tokens": 93575724.0,
|
|
"step": 40820
|
|
},
|
|
{
|
|
"entropy": 5.037382793426514,
|
|
"epoch": 3.9217098943323725,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00035366944544461926,
|
|
"loss": 4.5393,
|
|
"mean_token_accuracy": 0.2616016209125519,
|
|
"num_tokens": 93587289.0,
|
|
"step": 40825
|
|
},
|
|
{
|
|
"entropy": 5.060944271087647,
|
|
"epoch": 3.922190201729107,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003536373248378295,
|
|
"loss": 4.6514,
|
|
"mean_token_accuracy": 0.2530637040734291,
|
|
"num_tokens": 93599993.0,
|
|
"step": 40830
|
|
},
|
|
{
|
|
"entropy": 5.006586503982544,
|
|
"epoch": 3.9226705091258407,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035360520240531446,
|
|
"loss": 4.5715,
|
|
"mean_token_accuracy": 0.2665791302919388,
|
|
"num_tokens": 93611542.0,
|
|
"step": 40835
|
|
},
|
|
{
|
|
"entropy": 4.968747472763061,
|
|
"epoch": 3.9231508165225746,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035357307814782007,
|
|
"loss": 4.4978,
|
|
"mean_token_accuracy": 0.26159274727106097,
|
|
"num_tokens": 93622650.0,
|
|
"step": 40840
|
|
},
|
|
{
|
|
"entropy": 5.030627298355102,
|
|
"epoch": 3.9236311239193085,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003535409520660922,
|
|
"loss": 4.479,
|
|
"mean_token_accuracy": 0.2641667783260345,
|
|
"num_tokens": 93634481.0,
|
|
"step": 40845
|
|
},
|
|
{
|
|
"entropy": 4.951187038421631,
|
|
"epoch": 3.9241114313160423,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003535088241608765,
|
|
"loss": 4.5723,
|
|
"mean_token_accuracy": 0.2547958791255951,
|
|
"num_tokens": 93647225.0,
|
|
"step": 40850
|
|
},
|
|
{
|
|
"entropy": 4.995564794540405,
|
|
"epoch": 3.9245917387127762,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003534766944329191,
|
|
"loss": 4.5499,
|
|
"mean_token_accuracy": 0.2617680624127388,
|
|
"num_tokens": 93657984.0,
|
|
"step": 40855
|
|
},
|
|
{
|
|
"entropy": 5.05954909324646,
|
|
"epoch": 3.92507204610951,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00035344456288296595,
|
|
"loss": 4.5644,
|
|
"mean_token_accuracy": 0.2548106536269188,
|
|
"num_tokens": 93669420.0,
|
|
"step": 40860
|
|
},
|
|
{
|
|
"entropy": 4.980149507522583,
|
|
"epoch": 3.925552353506244,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003534124295117629,
|
|
"loss": 4.509,
|
|
"mean_token_accuracy": 0.2665799781680107,
|
|
"num_tokens": 93680649.0,
|
|
"step": 40865
|
|
},
|
|
{
|
|
"entropy": 5.007740306854248,
|
|
"epoch": 3.926032660902978,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003533802943200561,
|
|
"loss": 4.6246,
|
|
"mean_token_accuracy": 0.250833123922348,
|
|
"num_tokens": 93693314.0,
|
|
"step": 40870
|
|
},
|
|
{
|
|
"entropy": 5.076145267486572,
|
|
"epoch": 3.9265129682997117,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003533481573085916,
|
|
"loss": 4.5373,
|
|
"mean_token_accuracy": 0.2592066913843155,
|
|
"num_tokens": 93704271.0,
|
|
"step": 40875
|
|
},
|
|
{
|
|
"entropy": 5.012905311584473,
|
|
"epoch": 3.9269932756964456,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003533160184781155,
|
|
"loss": 4.5243,
|
|
"mean_token_accuracy": 0.2644459009170532,
|
|
"num_tokens": 93715561.0,
|
|
"step": 40880
|
|
},
|
|
{
|
|
"entropy": 5.017126178741455,
|
|
"epoch": 3.9274735830931795,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.000353283877829374,
|
|
"loss": 4.5944,
|
|
"mean_token_accuracy": 0.2538530468940735,
|
|
"num_tokens": 93727115.0,
|
|
"step": 40885
|
|
},
|
|
{
|
|
"entropy": 5.035113430023193,
|
|
"epoch": 3.9279538904899134,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003532517353631134,
|
|
"loss": 4.5403,
|
|
"mean_token_accuracy": 0.2546320855617523,
|
|
"num_tokens": 93738439.0,
|
|
"step": 40890
|
|
},
|
|
{
|
|
"entropy": 5.0043834209442135,
|
|
"epoch": 3.9284341978866477,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003532195910800798,
|
|
"loss": 4.5391,
|
|
"mean_token_accuracy": 0.2625705346465111,
|
|
"num_tokens": 93749722.0,
|
|
"step": 40895
|
|
},
|
|
{
|
|
"entropy": 5.020870590209961,
|
|
"epoch": 3.928914505283381,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003531874449810195,
|
|
"loss": 4.5557,
|
|
"mean_token_accuracy": 0.2540207296609879,
|
|
"num_tokens": 93761122.0,
|
|
"step": 40900
|
|
},
|
|
{
|
|
"entropy": 5.050025749206543,
|
|
"epoch": 3.9293948126801155,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00035315529706667905,
|
|
"loss": 4.5907,
|
|
"mean_token_accuracy": 0.251235793530941,
|
|
"num_tokens": 93771369.0,
|
|
"step": 40905
|
|
},
|
|
{
|
|
"entropy": 5.05245451927185,
|
|
"epoch": 3.929875120076849,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003531231473378046,
|
|
"loss": 4.6087,
|
|
"mean_token_accuracy": 0.2507566183805466,
|
|
"num_tokens": 93782380.0,
|
|
"step": 40910
|
|
},
|
|
{
|
|
"entropy": 5.024741077423096,
|
|
"epoch": 3.9303554274735832,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035309099579514255,
|
|
"loss": 4.5348,
|
|
"mean_token_accuracy": 0.2547019883990288,
|
|
"num_tokens": 93793162.0,
|
|
"step": 40915
|
|
},
|
|
{
|
|
"entropy": 5.065177154541016,
|
|
"epoch": 3.930835734870317,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003530588424394396,
|
|
"loss": 4.6202,
|
|
"mean_token_accuracy": 0.2549705386161804,
|
|
"num_tokens": 93804441.0,
|
|
"step": 40920
|
|
},
|
|
{
|
|
"entropy": 5.071021366119385,
|
|
"epoch": 3.931316042267051,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00035302668727144206,
|
|
"loss": 4.5878,
|
|
"mean_token_accuracy": 0.2551292985677719,
|
|
"num_tokens": 93816080.0,
|
|
"step": 40925
|
|
},
|
|
{
|
|
"entropy": 5.007878112792969,
|
|
"epoch": 3.931796349663785,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035299453029189647,
|
|
"loss": 4.5925,
|
|
"mean_token_accuracy": 0.2495984748005867,
|
|
"num_tokens": 93828494.0,
|
|
"step": 40930
|
|
},
|
|
{
|
|
"entropy": 5.022972249984742,
|
|
"epoch": 3.9322766570605188,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.00035296237150154964,
|
|
"loss": 4.5951,
|
|
"mean_token_accuracy": 0.2574044778943062,
|
|
"num_tokens": 93839323.0,
|
|
"step": 40935
|
|
},
|
|
{
|
|
"entropy": 5.064226055145264,
|
|
"epoch": 3.9327569644572526,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003529302109011479,
|
|
"loss": 4.6282,
|
|
"mean_token_accuracy": 0.25686393678188324,
|
|
"num_tokens": 93851474.0,
|
|
"step": 40940
|
|
},
|
|
{
|
|
"entropy": 5.076386451721191,
|
|
"epoch": 3.9332372718539865,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00035289804849143806,
|
|
"loss": 4.6639,
|
|
"mean_token_accuracy": 0.2508619725704193,
|
|
"num_tokens": 93863484.0,
|
|
"step": 40945
|
|
},
|
|
{
|
|
"entropy": 5.020264911651611,
|
|
"epoch": 3.9337175792507204,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000352865884273167,
|
|
"loss": 4.541,
|
|
"mean_token_accuracy": 0.2621795490384102,
|
|
"num_tokens": 93874124.0,
|
|
"step": 40950
|
|
},
|
|
{
|
|
"entropy": 5.029136848449707,
|
|
"epoch": 3.9341978866474543,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003528337182470812,
|
|
"loss": 4.5966,
|
|
"mean_token_accuracy": 0.25266062915325166,
|
|
"num_tokens": 93884880.0,
|
|
"step": 40955
|
|
},
|
|
{
|
|
"entropy": 4.924902057647705,
|
|
"epoch": 3.934678194044188,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00035280155041392757,
|
|
"loss": 4.5025,
|
|
"mean_token_accuracy": 0.26597084701061247,
|
|
"num_tokens": 93895737.0,
|
|
"step": 40960
|
|
},
|
|
{
|
|
"entropy": 5.036904191970825,
|
|
"epoch": 3.935158501440922,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003527693807744531,
|
|
"loss": 4.581,
|
|
"mean_token_accuracy": 0.25939236879348754,
|
|
"num_tokens": 93906916.0,
|
|
"step": 40965
|
|
},
|
|
{
|
|
"entropy": 5.072819757461548,
|
|
"epoch": 3.9356388088376564,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003527372093294044,
|
|
"loss": 4.5886,
|
|
"mean_token_accuracy": 0.24882668554782866,
|
|
"num_tokens": 93918471.0,
|
|
"step": 40970
|
|
},
|
|
{
|
|
"entropy": 4.975347089767456,
|
|
"epoch": 3.93611911623439,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003527050360795285,
|
|
"loss": 4.4698,
|
|
"mean_token_accuracy": 0.26917734593153,
|
|
"num_tokens": 93929874.0,
|
|
"step": 40975
|
|
},
|
|
{
|
|
"entropy": 5.022579336166382,
|
|
"epoch": 3.936599423631124,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003526728610255725,
|
|
"loss": 4.6314,
|
|
"mean_token_accuracy": 0.24598250687122344,
|
|
"num_tokens": 93940785.0,
|
|
"step": 40980
|
|
},
|
|
{
|
|
"entropy": 5.032813549041748,
|
|
"epoch": 3.9370797310278576,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00035264068416828326,
|
|
"loss": 4.5539,
|
|
"mean_token_accuracy": 0.256144805252552,
|
|
"num_tokens": 93953255.0,
|
|
"step": 40985
|
|
},
|
|
{
|
|
"entropy": 4.960422420501709,
|
|
"epoch": 3.937560038424592,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003526085055084079,
|
|
"loss": 4.4986,
|
|
"mean_token_accuracy": 0.25669034421443937,
|
|
"num_tokens": 93964581.0,
|
|
"step": 40990
|
|
},
|
|
{
|
|
"entropy": 5.021775770187378,
|
|
"epoch": 3.938040345821326,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003525763250466935,
|
|
"loss": 4.6228,
|
|
"mean_token_accuracy": 0.25211969912052157,
|
|
"num_tokens": 93976222.0,
|
|
"step": 40995
|
|
},
|
|
{
|
|
"entropy": 5.055632209777832,
|
|
"epoch": 3.9385206532180597,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003525441427838872,
|
|
"loss": 4.6393,
|
|
"mean_token_accuracy": 0.2504964038729668,
|
|
"num_tokens": 93986936.0,
|
|
"step": 41000
|
|
},
|
|
{
|
|
"entropy": 5.0902759552001955,
|
|
"epoch": 3.9390009606147935,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003525119587207361,
|
|
"loss": 4.5973,
|
|
"mean_token_accuracy": 0.2560101792216301,
|
|
"num_tokens": 93999518.0,
|
|
"step": 41005
|
|
},
|
|
{
|
|
"entropy": 5.085747003555298,
|
|
"epoch": 3.9394812680115274,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003524797728579875,
|
|
"loss": 4.5833,
|
|
"mean_token_accuracy": 0.2599573597311974,
|
|
"num_tokens": 94010500.0,
|
|
"step": 41010
|
|
},
|
|
{
|
|
"entropy": 4.91188235282898,
|
|
"epoch": 3.9399615754082613,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003524475851963887,
|
|
"loss": 4.5132,
|
|
"mean_token_accuracy": 0.26271858662366865,
|
|
"num_tokens": 94020848.0,
|
|
"step": 41015
|
|
},
|
|
{
|
|
"entropy": 5.033004283905029,
|
|
"epoch": 3.940441882804995,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00035241539573668693,
|
|
"loss": 4.6348,
|
|
"mean_token_accuracy": 0.25278378576040267,
|
|
"num_tokens": 94033154.0,
|
|
"step": 41020
|
|
},
|
|
{
|
|
"entropy": 5.074343633651734,
|
|
"epoch": 3.940922190201729,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035238320447962947,
|
|
"loss": 4.5544,
|
|
"mean_token_accuracy": 0.25631219893693924,
|
|
"num_tokens": 94044131.0,
|
|
"step": 41025
|
|
},
|
|
{
|
|
"entropy": 5.082620620727539,
|
|
"epoch": 3.941402497598463,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003523510114259638,
|
|
"loss": 4.5442,
|
|
"mean_token_accuracy": 0.25107374489307405,
|
|
"num_tokens": 94055877.0,
|
|
"step": 41030
|
|
},
|
|
{
|
|
"entropy": 4.923304462432862,
|
|
"epoch": 3.941882804995197,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003523188165764374,
|
|
"loss": 4.5034,
|
|
"mean_token_accuracy": 0.26442785412073133,
|
|
"num_tokens": 94067341.0,
|
|
"step": 41035
|
|
},
|
|
{
|
|
"entropy": 4.9702917575836185,
|
|
"epoch": 3.9423631123919307,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00035228661993179757,
|
|
"loss": 4.4882,
|
|
"mean_token_accuracy": 0.2660094290971756,
|
|
"num_tokens": 94079618.0,
|
|
"step": 41040
|
|
},
|
|
{
|
|
"entropy": 5.093248891830444,
|
|
"epoch": 3.9428434197886646,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000352254421492792,
|
|
"loss": 4.6317,
|
|
"mean_token_accuracy": 0.25117712318897245,
|
|
"num_tokens": 94090607.0,
|
|
"step": 41045
|
|
},
|
|
{
|
|
"entropy": 5.055135345458984,
|
|
"epoch": 3.9433237271853985,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035222222126016814,
|
|
"loss": 4.5937,
|
|
"mean_token_accuracy": 0.2528764858841896,
|
|
"num_tokens": 94102078.0,
|
|
"step": 41050
|
|
},
|
|
{
|
|
"entropy": 4.987625694274902,
|
|
"epoch": 3.943804034582133,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003521900192346736,
|
|
"loss": 4.4477,
|
|
"mean_token_accuracy": 0.2633352503180504,
|
|
"num_tokens": 94113524.0,
|
|
"step": 41055
|
|
},
|
|
{
|
|
"entropy": 4.993733501434326,
|
|
"epoch": 3.9442843419788662,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000352157815417056,
|
|
"loss": 4.5869,
|
|
"mean_token_accuracy": 0.26502439826726915,
|
|
"num_tokens": 94125003.0,
|
|
"step": 41060
|
|
},
|
|
{
|
|
"entropy": 5.110818767547608,
|
|
"epoch": 3.9447646493756006,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035212560980806305,
|
|
"loss": 4.7227,
|
|
"mean_token_accuracy": 0.24820438027381897,
|
|
"num_tokens": 94136670.0,
|
|
"step": 41065
|
|
},
|
|
{
|
|
"entropy": 5.089823961257935,
|
|
"epoch": 3.9452449567723344,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003520934024084425,
|
|
"loss": 4.5712,
|
|
"mean_token_accuracy": 0.2575012996792793,
|
|
"num_tokens": 94148541.0,
|
|
"step": 41070
|
|
},
|
|
{
|
|
"entropy": 5.0775189876556395,
|
|
"epoch": 3.9457252641690683,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000352061193218942,
|
|
"loss": 4.6456,
|
|
"mean_token_accuracy": 0.25408685505390166,
|
|
"num_tokens": 94159936.0,
|
|
"step": 41075
|
|
},
|
|
{
|
|
"entropy": 4.988414621353149,
|
|
"epoch": 3.946205571565802,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003520289822403095,
|
|
"loss": 4.5528,
|
|
"mean_token_accuracy": 0.25643136650323867,
|
|
"num_tokens": 94171042.0,
|
|
"step": 41080
|
|
},
|
|
{
|
|
"entropy": 5.060313177108765,
|
|
"epoch": 3.946685878962536,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00035199676947329274,
|
|
"loss": 4.6263,
|
|
"mean_token_accuracy": 0.2555346071720123,
|
|
"num_tokens": 94181610.0,
|
|
"step": 41085
|
|
},
|
|
{
|
|
"entropy": 5.097196865081787,
|
|
"epoch": 3.94716618635927,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00035196455491863954,
|
|
"loss": 4.6551,
|
|
"mean_token_accuracy": 0.2461474522948265,
|
|
"num_tokens": 94192684.0,
|
|
"step": 41090
|
|
},
|
|
{
|
|
"entropy": 5.076653289794922,
|
|
"epoch": 3.947646493756004,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.000351932338577098,
|
|
"loss": 4.6815,
|
|
"mean_token_accuracy": 0.24574387520551683,
|
|
"num_tokens": 94204211.0,
|
|
"step": 41095
|
|
},
|
|
{
|
|
"entropy": 4.981122255325317,
|
|
"epoch": 3.9481268011527377,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00035190012044941605,
|
|
"loss": 4.5213,
|
|
"mean_token_accuracy": 0.2630945473909378,
|
|
"num_tokens": 94216175.0,
|
|
"step": 41100
|
|
},
|
|
{
|
|
"entropy": 5.01258807182312,
|
|
"epoch": 3.9486071085494716,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003518679005363415,
|
|
"loss": 4.5075,
|
|
"mean_token_accuracy": 0.2642325133085251,
|
|
"num_tokens": 94227735.0,
|
|
"step": 41105
|
|
},
|
|
{
|
|
"entropy": 5.005784273147583,
|
|
"epoch": 3.9490874159462055,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003518356788386226,
|
|
"loss": 4.5264,
|
|
"mean_token_accuracy": 0.2656796142458916,
|
|
"num_tokens": 94238247.0,
|
|
"step": 41110
|
|
},
|
|
{
|
|
"entropy": 5.050055503845215,
|
|
"epoch": 3.9495677233429394,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035180345535700745,
|
|
"loss": 4.6379,
|
|
"mean_token_accuracy": 0.25116476565599444,
|
|
"num_tokens": 94249839.0,
|
|
"step": 41115
|
|
},
|
|
{
|
|
"entropy": 5.104556751251221,
|
|
"epoch": 3.9500480307396733,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003517712300922441,
|
|
"loss": 4.654,
|
|
"mean_token_accuracy": 0.25254170447587965,
|
|
"num_tokens": 94261794.0,
|
|
"step": 41120
|
|
},
|
|
{
|
|
"entropy": 5.058972263336182,
|
|
"epoch": 3.950528338136407,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00035173900304508074,
|
|
"loss": 4.6409,
|
|
"mean_token_accuracy": 0.24703244268894195,
|
|
"num_tokens": 94276197.0,
|
|
"step": 41125
|
|
},
|
|
{
|
|
"entropy": 5.0600744724273685,
|
|
"epoch": 3.9510086455331415,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003517067742162656,
|
|
"loss": 4.5916,
|
|
"mean_token_accuracy": 0.25062993317842486,
|
|
"num_tokens": 94288419.0,
|
|
"step": 41130
|
|
},
|
|
{
|
|
"entropy": 5.031345558166504,
|
|
"epoch": 3.951488952929875,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000351674543606547,
|
|
"loss": 4.6059,
|
|
"mean_token_accuracy": 0.2525452867150307,
|
|
"num_tokens": 94300521.0,
|
|
"step": 41135
|
|
},
|
|
{
|
|
"entropy": 4.910266733169555,
|
|
"epoch": 3.9519692603266092,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003516423112166731,
|
|
"loss": 4.4305,
|
|
"mean_token_accuracy": 0.269549497961998,
|
|
"num_tokens": 94311349.0,
|
|
"step": 41140
|
|
},
|
|
{
|
|
"entropy": 4.9410828113555905,
|
|
"epoch": 3.952449567723343,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00035161007704739226,
|
|
"loss": 4.5023,
|
|
"mean_token_accuracy": 0.25759976655244826,
|
|
"num_tokens": 94322383.0,
|
|
"step": 41145
|
|
},
|
|
{
|
|
"entropy": 4.952279138565063,
|
|
"epoch": 3.952929875120077,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000351577841099453,
|
|
"loss": 4.5866,
|
|
"mean_token_accuracy": 0.24914229810237884,
|
|
"num_tokens": 94334807.0,
|
|
"step": 41150
|
|
},
|
|
{
|
|
"entropy": 5.0141016960144045,
|
|
"epoch": 3.953410182516811,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003515456033736036,
|
|
"loss": 4.4927,
|
|
"mean_token_accuracy": 0.2639600828289986,
|
|
"num_tokens": 94345444.0,
|
|
"step": 41155
|
|
},
|
|
{
|
|
"entropy": 5.0604331493377686,
|
|
"epoch": 3.9538904899135447,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00035151336387059275,
|
|
"loss": 4.6257,
|
|
"mean_token_accuracy": 0.25156708657741544,
|
|
"num_tokens": 94356802.0,
|
|
"step": 41160
|
|
},
|
|
{
|
|
"entropy": 4.996308898925781,
|
|
"epoch": 3.9543707973102786,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003514811225911686,
|
|
"loss": 4.5635,
|
|
"mean_token_accuracy": 0.2641602665185928,
|
|
"num_tokens": 94367694.0,
|
|
"step": 41165
|
|
},
|
|
{
|
|
"entropy": 5.040113687515259,
|
|
"epoch": 3.9548511047070125,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035144887953607995,
|
|
"loss": 4.577,
|
|
"mean_token_accuracy": 0.2641072317957878,
|
|
"num_tokens": 94379714.0,
|
|
"step": 41170
|
|
},
|
|
{
|
|
"entropy": 5.10051417350769,
|
|
"epoch": 3.9553314121037464,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003514166347060753,
|
|
"loss": 4.622,
|
|
"mean_token_accuracy": 0.25459016263484957,
|
|
"num_tokens": 94391405.0,
|
|
"step": 41175
|
|
},
|
|
{
|
|
"entropy": 4.944545745849609,
|
|
"epoch": 3.9558117195004803,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003513843881019033,
|
|
"loss": 4.4442,
|
|
"mean_token_accuracy": 0.27106780260801316,
|
|
"num_tokens": 94402785.0,
|
|
"step": 41180
|
|
},
|
|
{
|
|
"entropy": 4.992828035354615,
|
|
"epoch": 3.956292026897214,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00035135213972431267,
|
|
"loss": 4.5591,
|
|
"mean_token_accuracy": 0.2556390807032585,
|
|
"num_tokens": 94414373.0,
|
|
"step": 41185
|
|
},
|
|
{
|
|
"entropy": 4.963530158996582,
|
|
"epoch": 3.956772334293948,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00035131988957405206,
|
|
"loss": 4.4909,
|
|
"mean_token_accuracy": 0.2614923372864723,
|
|
"num_tokens": 94427285.0,
|
|
"step": 41190
|
|
},
|
|
{
|
|
"entropy": 5.065036201477051,
|
|
"epoch": 3.957252641690682,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00035128763765187026,
|
|
"loss": 4.595,
|
|
"mean_token_accuracy": 0.2541166961193085,
|
|
"num_tokens": 94439520.0,
|
|
"step": 41195
|
|
},
|
|
{
|
|
"entropy": 5.0078331470489506,
|
|
"epoch": 3.957732949087416,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000351255383958516,
|
|
"loss": 4.5227,
|
|
"mean_token_accuracy": 0.2663195699453354,
|
|
"num_tokens": 94450707.0,
|
|
"step": 41200
|
|
},
|
|
{
|
|
"entropy": 4.988126134872436,
|
|
"epoch": 3.95821325648415,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003512231284947382,
|
|
"loss": 4.5561,
|
|
"mean_token_accuracy": 0.2565922275185585,
|
|
"num_tokens": 94461941.0,
|
|
"step": 41205
|
|
},
|
|
{
|
|
"entropy": 5.0907402515411375,
|
|
"epoch": 3.9586935638808836,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003511908712612857,
|
|
"loss": 4.5988,
|
|
"mean_token_accuracy": 0.26092382073402404,
|
|
"num_tokens": 94471931.0,
|
|
"step": 41210
|
|
},
|
|
{
|
|
"entropy": 5.040230560302734,
|
|
"epoch": 3.959173871277618,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003511586122589073,
|
|
"loss": 4.5838,
|
|
"mean_token_accuracy": 0.25838238596916197,
|
|
"num_tokens": 94482692.0,
|
|
"step": 41215
|
|
},
|
|
{
|
|
"entropy": 4.9509352684021,
|
|
"epoch": 3.9596541786743513,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003511263514883522,
|
|
"loss": 4.4912,
|
|
"mean_token_accuracy": 0.2681387931108475,
|
|
"num_tokens": 94493650.0,
|
|
"step": 41220
|
|
},
|
|
{
|
|
"entropy": 4.974450016021729,
|
|
"epoch": 3.9601344860710856,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003510940889503692,
|
|
"loss": 4.4126,
|
|
"mean_token_accuracy": 0.27166741639375686,
|
|
"num_tokens": 94503878.0,
|
|
"step": 41225
|
|
},
|
|
{
|
|
"entropy": 5.017516422271728,
|
|
"epoch": 3.9606147934678195,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00035106182464570736,
|
|
"loss": 4.5715,
|
|
"mean_token_accuracy": 0.26330945640802383,
|
|
"num_tokens": 94516930.0,
|
|
"step": 41230
|
|
},
|
|
{
|
|
"entropy": 5.0018165588378904,
|
|
"epoch": 3.9610951008645534,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003510295585751159,
|
|
"loss": 4.5595,
|
|
"mean_token_accuracy": 0.25899527668952943,
|
|
"num_tokens": 94527523.0,
|
|
"step": 41235
|
|
},
|
|
{
|
|
"entropy": 5.0548319816589355,
|
|
"epoch": 3.9615754082612873,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00035099729073934376,
|
|
"loss": 4.572,
|
|
"mean_token_accuracy": 0.26314518600702286,
|
|
"num_tokens": 94537594.0,
|
|
"step": 41240
|
|
},
|
|
{
|
|
"entropy": 5.080398368835449,
|
|
"epoch": 3.962055715658021,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003509650211391403,
|
|
"loss": 4.6469,
|
|
"mean_token_accuracy": 0.24687463492155076,
|
|
"num_tokens": 94548293.0,
|
|
"step": 41245
|
|
},
|
|
{
|
|
"entropy": 5.024765872955323,
|
|
"epoch": 3.962536023054755,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00035093274977525456,
|
|
"loss": 4.6092,
|
|
"mean_token_accuracy": 0.25324745625257494,
|
|
"num_tokens": 94559116.0,
|
|
"step": 41250
|
|
},
|
|
{
|
|
"entropy": 5.0558693408966064,
|
|
"epoch": 3.963016330451489,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003509004766484359,
|
|
"loss": 4.662,
|
|
"mean_token_accuracy": 0.2417558714747429,
|
|
"num_tokens": 94570208.0,
|
|
"step": 41255
|
|
},
|
|
{
|
|
"entropy": 5.013394165039062,
|
|
"epoch": 3.963496637848223,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00035086820175943344,
|
|
"loss": 4.5153,
|
|
"mean_token_accuracy": 0.26516457200050353,
|
|
"num_tokens": 94580982.0,
|
|
"step": 41260
|
|
},
|
|
{
|
|
"entropy": 5.001540088653565,
|
|
"epoch": 3.9639769452449567,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003508359251089967,
|
|
"loss": 4.5645,
|
|
"mean_token_accuracy": 0.2577774852514267,
|
|
"num_tokens": 94593364.0,
|
|
"step": 41265
|
|
},
|
|
{
|
|
"entropy": 5.047458696365356,
|
|
"epoch": 3.9644572526416906,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00035080364669787495,
|
|
"loss": 4.5547,
|
|
"mean_token_accuracy": 0.26008584052324296,
|
|
"num_tokens": 94605072.0,
|
|
"step": 41270
|
|
},
|
|
{
|
|
"entropy": 5.037833547592163,
|
|
"epoch": 3.9649375600384245,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00035077136652681757,
|
|
"loss": 4.6595,
|
|
"mean_token_accuracy": 0.24958467483520508,
|
|
"num_tokens": 94617226.0,
|
|
"step": 41275
|
|
},
|
|
{
|
|
"entropy": 5.0503603458404545,
|
|
"epoch": 3.9654178674351583,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00035073908459657413,
|
|
"loss": 4.5391,
|
|
"mean_token_accuracy": 0.2640505313873291,
|
|
"num_tokens": 94629807.0,
|
|
"step": 41280
|
|
},
|
|
{
|
|
"entropy": 5.049329710006714,
|
|
"epoch": 3.965898174831892,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00035070680090789404,
|
|
"loss": 4.5191,
|
|
"mean_token_accuracy": 0.2600221365690231,
|
|
"num_tokens": 94640866.0,
|
|
"step": 41285
|
|
},
|
|
{
|
|
"entropy": 4.96960301399231,
|
|
"epoch": 3.9663784822286265,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0003506745154615268,
|
|
"loss": 4.535,
|
|
"mean_token_accuracy": 0.2561481386423111,
|
|
"num_tokens": 94651872.0,
|
|
"step": 41290
|
|
},
|
|
{
|
|
"entropy": 5.033800935745239,
|
|
"epoch": 3.96685878962536,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00035064222825822197,
|
|
"loss": 4.6367,
|
|
"mean_token_accuracy": 0.2553319737315178,
|
|
"num_tokens": 94664890.0,
|
|
"step": 41295
|
|
},
|
|
{
|
|
"entropy": 4.881068563461303,
|
|
"epoch": 3.9673390970220943,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.00035060993929872923,
|
|
"loss": 4.3959,
|
|
"mean_token_accuracy": 0.2683215975761414,
|
|
"num_tokens": 94677447.0,
|
|
"step": 41300
|
|
},
|
|
{
|
|
"entropy": 5.017708826065063,
|
|
"epoch": 3.967819404418828,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003505776485837983,
|
|
"loss": 4.6202,
|
|
"mean_token_accuracy": 0.2525124281644821,
|
|
"num_tokens": 94690432.0,
|
|
"step": 41305
|
|
},
|
|
{
|
|
"entropy": 5.005157327651977,
|
|
"epoch": 3.968299711815562,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003505453561141787,
|
|
"loss": 4.5129,
|
|
"mean_token_accuracy": 0.26284523904323576,
|
|
"num_tokens": 94703357.0,
|
|
"step": 41310
|
|
},
|
|
{
|
|
"entropy": 5.041296720504761,
|
|
"epoch": 3.968780019212296,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003505130618906203,
|
|
"loss": 4.5276,
|
|
"mean_token_accuracy": 0.2619605138897896,
|
|
"num_tokens": 94715276.0,
|
|
"step": 41315
|
|
},
|
|
{
|
|
"entropy": 5.020310640335083,
|
|
"epoch": 3.96926032660903,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003504807659138728,
|
|
"loss": 4.5382,
|
|
"mean_token_accuracy": 0.2620819479227066,
|
|
"num_tokens": 94725692.0,
|
|
"step": 41320
|
|
},
|
|
{
|
|
"entropy": 5.058972120285034,
|
|
"epoch": 3.9697406340057637,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003504484681846861,
|
|
"loss": 4.6624,
|
|
"mean_token_accuracy": 0.25147098749876023,
|
|
"num_tokens": 94737211.0,
|
|
"step": 41325
|
|
},
|
|
{
|
|
"entropy": 5.052793502807617,
|
|
"epoch": 3.9702209414024976,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00035041616870380997,
|
|
"loss": 4.6211,
|
|
"mean_token_accuracy": 0.25358060002326965,
|
|
"num_tokens": 94749140.0,
|
|
"step": 41330
|
|
},
|
|
{
|
|
"entropy": 5.073900175094605,
|
|
"epoch": 3.9707012487992315,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00035038386747199437,
|
|
"loss": 4.5344,
|
|
"mean_token_accuracy": 0.25543154776096344,
|
|
"num_tokens": 94759409.0,
|
|
"step": 41335
|
|
},
|
|
{
|
|
"entropy": 5.0065446376800535,
|
|
"epoch": 3.9711815561959654,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003503515644899892,
|
|
"loss": 4.5457,
|
|
"mean_token_accuracy": 0.25262449532747266,
|
|
"num_tokens": 94770267.0,
|
|
"step": 41340
|
|
},
|
|
{
|
|
"entropy": 5.009748554229736,
|
|
"epoch": 3.9716618635926992,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003503192597585444,
|
|
"loss": 4.5612,
|
|
"mean_token_accuracy": 0.2582801625132561,
|
|
"num_tokens": 94780479.0,
|
|
"step": 41345
|
|
},
|
|
{
|
|
"entropy": 4.961665487289428,
|
|
"epoch": 3.972142170989433,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003502869532784101,
|
|
"loss": 4.4895,
|
|
"mean_token_accuracy": 0.26846293807029725,
|
|
"num_tokens": 94791192.0,
|
|
"step": 41350
|
|
},
|
|
{
|
|
"entropy": 5.028629398345947,
|
|
"epoch": 3.972622478386167,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00035025464505033626,
|
|
"loss": 4.5448,
|
|
"mean_token_accuracy": 0.2535667210817337,
|
|
"num_tokens": 94802090.0,
|
|
"step": 41355
|
|
},
|
|
{
|
|
"entropy": 4.902774333953857,
|
|
"epoch": 3.973102785782901,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000350222335075073,
|
|
"loss": 4.4979,
|
|
"mean_token_accuracy": 0.2616896629333496,
|
|
"num_tokens": 94814246.0,
|
|
"step": 41360
|
|
},
|
|
{
|
|
"entropy": 5.040693187713623,
|
|
"epoch": 3.973583093179635,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00035019002335337057,
|
|
"loss": 4.5561,
|
|
"mean_token_accuracy": 0.2635308369994164,
|
|
"num_tokens": 94825754.0,
|
|
"step": 41365
|
|
},
|
|
{
|
|
"entropy": 5.044468402862549,
|
|
"epoch": 3.9740634005763686,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003501577098859791,
|
|
"loss": 4.6458,
|
|
"mean_token_accuracy": 0.25413116812705994,
|
|
"num_tokens": 94837790.0,
|
|
"step": 41370
|
|
},
|
|
{
|
|
"entropy": 4.984940385818481,
|
|
"epoch": 3.974543707973103,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00035012539467364865,
|
|
"loss": 4.5344,
|
|
"mean_token_accuracy": 0.2559735745191574,
|
|
"num_tokens": 94848694.0,
|
|
"step": 41375
|
|
},
|
|
{
|
|
"entropy": 5.011324119567871,
|
|
"epoch": 3.975024015369837,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003500930777171296,
|
|
"loss": 4.5766,
|
|
"mean_token_accuracy": 0.2569711714982986,
|
|
"num_tokens": 94861602.0,
|
|
"step": 41380
|
|
},
|
|
{
|
|
"entropy": 5.091028499603271,
|
|
"epoch": 3.9755043227665707,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003500607590171724,
|
|
"loss": 4.5769,
|
|
"mean_token_accuracy": 0.2563563659787178,
|
|
"num_tokens": 94872857.0,
|
|
"step": 41385
|
|
},
|
|
{
|
|
"entropy": 5.1262603282928465,
|
|
"epoch": 3.9759846301633046,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003500284385745272,
|
|
"loss": 4.6832,
|
|
"mean_token_accuracy": 0.2526934012770653,
|
|
"num_tokens": 94884716.0,
|
|
"step": 41390
|
|
},
|
|
{
|
|
"entropy": 5.0196826457977295,
|
|
"epoch": 3.9764649375600385,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.0003499961163899444,
|
|
"loss": 4.5643,
|
|
"mean_token_accuracy": 0.2603454619646072,
|
|
"num_tokens": 94897721.0,
|
|
"step": 41395
|
|
},
|
|
{
|
|
"entropy": 5.084643125534058,
|
|
"epoch": 3.9769452449567724,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00034996379246417453,
|
|
"loss": 4.5827,
|
|
"mean_token_accuracy": 0.25960332751274107,
|
|
"num_tokens": 94908099.0,
|
|
"step": 41400
|
|
},
|
|
{
|
|
"entropy": 5.041874170303345,
|
|
"epoch": 3.9774255523535063,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034993146679796795,
|
|
"loss": 4.565,
|
|
"mean_token_accuracy": 0.2578753411769867,
|
|
"num_tokens": 94919944.0,
|
|
"step": 41405
|
|
},
|
|
{
|
|
"entropy": 5.035422849655151,
|
|
"epoch": 3.97790585975024,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003498991393920752,
|
|
"loss": 4.5672,
|
|
"mean_token_accuracy": 0.26176036298274996,
|
|
"num_tokens": 94932104.0,
|
|
"step": 41410
|
|
},
|
|
{
|
|
"entropy": 5.059924602508545,
|
|
"epoch": 3.978386167146974,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034986681024724696,
|
|
"loss": 4.6401,
|
|
"mean_token_accuracy": 0.2541901707649231,
|
|
"num_tokens": 94943869.0,
|
|
"step": 41415
|
|
},
|
|
{
|
|
"entropy": 5.092303514480591,
|
|
"epoch": 3.978866474543708,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034983447936423357,
|
|
"loss": 4.6165,
|
|
"mean_token_accuracy": 0.24804800003767014,
|
|
"num_tokens": 94955812.0,
|
|
"step": 41420
|
|
},
|
|
{
|
|
"entropy": 4.967309951782227,
|
|
"epoch": 3.979346781940442,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0003498021467437858,
|
|
"loss": 4.5032,
|
|
"mean_token_accuracy": 0.2605097934603691,
|
|
"num_tokens": 94967320.0,
|
|
"step": 41425
|
|
},
|
|
{
|
|
"entropy": 5.0088409900665285,
|
|
"epoch": 3.9798270893371757,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003497698123866544,
|
|
"loss": 4.5652,
|
|
"mean_token_accuracy": 0.26068347990512847,
|
|
"num_tokens": 94978249.0,
|
|
"step": 41430
|
|
},
|
|
{
|
|
"entropy": 5.052569389343262,
|
|
"epoch": 3.9803073967339095,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00034973747629358983,
|
|
"loss": 4.6145,
|
|
"mean_token_accuracy": 0.25705351680517197,
|
|
"num_tokens": 94990776.0,
|
|
"step": 41435
|
|
},
|
|
{
|
|
"entropy": 5.0667211532592775,
|
|
"epoch": 3.980787704130644,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034970513846534305,
|
|
"loss": 4.5945,
|
|
"mean_token_accuracy": 0.25737289935350416,
|
|
"num_tokens": 95001361.0,
|
|
"step": 41440
|
|
},
|
|
{
|
|
"entropy": 5.092148876190185,
|
|
"epoch": 3.9812680115273773,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003496727989026648,
|
|
"loss": 4.6424,
|
|
"mean_token_accuracy": 0.2524492099881172,
|
|
"num_tokens": 95013748.0,
|
|
"step": 41445
|
|
},
|
|
{
|
|
"entropy": 4.956082487106324,
|
|
"epoch": 3.9817483189241116,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00034964045760630584,
|
|
"loss": 4.514,
|
|
"mean_token_accuracy": 0.2582827717065811,
|
|
"num_tokens": 95025700.0,
|
|
"step": 41450
|
|
},
|
|
{
|
|
"entropy": 5.088579177856445,
|
|
"epoch": 3.982228626320845,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00034960811457701715,
|
|
"loss": 4.6314,
|
|
"mean_token_accuracy": 0.2485818699002266,
|
|
"num_tokens": 95037653.0,
|
|
"step": 41455
|
|
},
|
|
{
|
|
"entropy": 4.982202863693237,
|
|
"epoch": 3.9827089337175794,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034957576981554955,
|
|
"loss": 4.5463,
|
|
"mean_token_accuracy": 0.2615138590335846,
|
|
"num_tokens": 95049859.0,
|
|
"step": 41460
|
|
},
|
|
{
|
|
"entropy": 4.935577011108398,
|
|
"epoch": 3.9831892411143133,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.00034954342332265407,
|
|
"loss": 4.4913,
|
|
"mean_token_accuracy": 0.2656488806009293,
|
|
"num_tokens": 95062862.0,
|
|
"step": 41465
|
|
},
|
|
{
|
|
"entropy": 5.016721725463867,
|
|
"epoch": 3.983669548511047,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00034951107509908157,
|
|
"loss": 4.5563,
|
|
"mean_token_accuracy": 0.25456272065639496,
|
|
"num_tokens": 95073555.0,
|
|
"step": 41470
|
|
},
|
|
{
|
|
"entropy": 5.04867467880249,
|
|
"epoch": 3.984149855907781,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003494787251455832,
|
|
"loss": 4.5404,
|
|
"mean_token_accuracy": 0.2618361756205559,
|
|
"num_tokens": 95086165.0,
|
|
"step": 41475
|
|
},
|
|
{
|
|
"entropy": 5.085599279403686,
|
|
"epoch": 3.984630163304515,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00034944637346291,
|
|
"loss": 4.6361,
|
|
"mean_token_accuracy": 0.2511801466345787,
|
|
"num_tokens": 95098475.0,
|
|
"step": 41480
|
|
},
|
|
{
|
|
"entropy": 5.0781059741973875,
|
|
"epoch": 3.985110470701249,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.000349414020051813,
|
|
"loss": 4.632,
|
|
"mean_token_accuracy": 0.25323943644762037,
|
|
"num_tokens": 95109972.0,
|
|
"step": 41485
|
|
},
|
|
{
|
|
"entropy": 5.052087116241455,
|
|
"epoch": 3.9855907780979827,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003493816649130435,
|
|
"loss": 4.6231,
|
|
"mean_token_accuracy": 0.2563024118542671,
|
|
"num_tokens": 95122043.0,
|
|
"step": 41490
|
|
},
|
|
{
|
|
"entropy": 5.006848621368408,
|
|
"epoch": 3.9860710854947166,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003493493080473526,
|
|
"loss": 4.4794,
|
|
"mean_token_accuracy": 0.2637880235910416,
|
|
"num_tokens": 95133685.0,
|
|
"step": 41495
|
|
},
|
|
{
|
|
"entropy": 5.0229145050048825,
|
|
"epoch": 3.9865513928914504,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034931694945549146,
|
|
"loss": 4.5715,
|
|
"mean_token_accuracy": 0.2517172321677208,
|
|
"num_tokens": 95146018.0,
|
|
"step": 41500
|
|
},
|
|
{
|
|
"entropy": 4.998893117904663,
|
|
"epoch": 3.9870317002881843,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003492845891382115,
|
|
"loss": 4.5983,
|
|
"mean_token_accuracy": 0.25293977558612823,
|
|
"num_tokens": 95158262.0,
|
|
"step": 41505
|
|
},
|
|
{
|
|
"entropy": 5.018766069412232,
|
|
"epoch": 3.987512007684918,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000349252227096264,
|
|
"loss": 4.5711,
|
|
"mean_token_accuracy": 0.2537033289670944,
|
|
"num_tokens": 95170162.0,
|
|
"step": 41510
|
|
},
|
|
{
|
|
"entropy": 5.02782883644104,
|
|
"epoch": 3.9879923150816525,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003492198633304002,
|
|
"loss": 4.5642,
|
|
"mean_token_accuracy": 0.26157948970794676,
|
|
"num_tokens": 95182574.0,
|
|
"step": 41515
|
|
},
|
|
{
|
|
"entropy": 5.094313907623291,
|
|
"epoch": 3.988472622478386,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003491874978413716,
|
|
"loss": 4.6315,
|
|
"mean_token_accuracy": 0.24799866676330568,
|
|
"num_tokens": 95194527.0,
|
|
"step": 41520
|
|
},
|
|
{
|
|
"entropy": 5.003177499771118,
|
|
"epoch": 3.9889529298751203,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003491551306299296,
|
|
"loss": 4.525,
|
|
"mean_token_accuracy": 0.25358218103647234,
|
|
"num_tokens": 95207842.0,
|
|
"step": 41525
|
|
},
|
|
{
|
|
"entropy": 5.075590801239014,
|
|
"epoch": 3.9894332372718537,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00034912276169682556,
|
|
"loss": 4.6119,
|
|
"mean_token_accuracy": 0.2582548275589943,
|
|
"num_tokens": 95219694.0,
|
|
"step": 41530
|
|
},
|
|
{
|
|
"entropy": 4.97084379196167,
|
|
"epoch": 3.989913544668588,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00034909039104281123,
|
|
"loss": 4.519,
|
|
"mean_token_accuracy": 0.26774066537618635,
|
|
"num_tokens": 95231397.0,
|
|
"step": 41535
|
|
},
|
|
{
|
|
"entropy": 4.952249526977539,
|
|
"epoch": 3.990393852065322,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003490580186686379,
|
|
"loss": 4.5234,
|
|
"mean_token_accuracy": 0.2602978691458702,
|
|
"num_tokens": 95243575.0,
|
|
"step": 41540
|
|
},
|
|
{
|
|
"entropy": 5.073389959335327,
|
|
"epoch": 3.990874159462056,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003490256445750573,
|
|
"loss": 4.6804,
|
|
"mean_token_accuracy": 0.25190021097660065,
|
|
"num_tokens": 95255866.0,
|
|
"step": 41545
|
|
},
|
|
{
|
|
"entropy": 5.088688373565674,
|
|
"epoch": 3.9913544668587897,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003489932687628211,
|
|
"loss": 4.5816,
|
|
"mean_token_accuracy": 0.252507820725441,
|
|
"num_tokens": 95266812.0,
|
|
"step": 41550
|
|
},
|
|
{
|
|
"entropy": 5.041789627075195,
|
|
"epoch": 3.9918347742555236,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00034896089123268096,
|
|
"loss": 4.5926,
|
|
"mean_token_accuracy": 0.2544868588447571,
|
|
"num_tokens": 95278482.0,
|
|
"step": 41555
|
|
},
|
|
{
|
|
"entropy": 4.956077671051025,
|
|
"epoch": 3.9923150816522575,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003489285119853884,
|
|
"loss": 4.5991,
|
|
"mean_token_accuracy": 0.26267356276512144,
|
|
"num_tokens": 95289374.0,
|
|
"step": 41560
|
|
},
|
|
{
|
|
"entropy": 5.0133076190948485,
|
|
"epoch": 3.9927953890489913,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003488961310216954,
|
|
"loss": 4.5409,
|
|
"mean_token_accuracy": 0.25626550018787386,
|
|
"num_tokens": 95301913.0,
|
|
"step": 41565
|
|
},
|
|
{
|
|
"entropy": 5.14767804145813,
|
|
"epoch": 3.993275696445725,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0003488637483423537,
|
|
"loss": 4.7062,
|
|
"mean_token_accuracy": 0.24548629224300383,
|
|
"num_tokens": 95314560.0,
|
|
"step": 41570
|
|
},
|
|
{
|
|
"entropy": 5.081859922409057,
|
|
"epoch": 3.993756003842459,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034883136394811505,
|
|
"loss": 4.617,
|
|
"mean_token_accuracy": 0.25024791359901427,
|
|
"num_tokens": 95326956.0,
|
|
"step": 41575
|
|
},
|
|
{
|
|
"entropy": 5.064084482192993,
|
|
"epoch": 3.994236311239193,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034879897783973143,
|
|
"loss": 4.5774,
|
|
"mean_token_accuracy": 0.25643798857927325,
|
|
"num_tokens": 95338405.0,
|
|
"step": 41580
|
|
},
|
|
{
|
|
"entropy": 5.02878623008728,
|
|
"epoch": 3.994716618635927,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003487665900179545,
|
|
"loss": 4.612,
|
|
"mean_token_accuracy": 0.2573488026857376,
|
|
"num_tokens": 95349269.0,
|
|
"step": 41585
|
|
},
|
|
{
|
|
"entropy": 5.106409120559692,
|
|
"epoch": 3.9951969260326607,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003487342004835366,
|
|
"loss": 4.6433,
|
|
"mean_token_accuracy": 0.25200989842414856,
|
|
"num_tokens": 95359940.0,
|
|
"step": 41590
|
|
},
|
|
{
|
|
"entropy": 5.071154975891114,
|
|
"epoch": 3.9956772334293946,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003487018092372294,
|
|
"loss": 4.6443,
|
|
"mean_token_accuracy": 0.25063207298517226,
|
|
"num_tokens": 95370830.0,
|
|
"step": 41595
|
|
},
|
|
{
|
|
"entropy": 5.108475542068481,
|
|
"epoch": 3.996157540826129,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000348669416279785,
|
|
"loss": 4.6707,
|
|
"mean_token_accuracy": 0.2491276502609253,
|
|
"num_tokens": 95382805.0,
|
|
"step": 41600
|
|
},
|
|
{
|
|
"entropy": 4.9815661907196045,
|
|
"epoch": 3.9966378482228624,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00034863702161195566,
|
|
"loss": 4.4523,
|
|
"mean_token_accuracy": 0.2702036157250404,
|
|
"num_tokens": 95394380.0,
|
|
"step": 41605
|
|
},
|
|
{
|
|
"entropy": 4.9308452129364015,
|
|
"epoch": 3.9971181556195967,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0003486046252344933,
|
|
"loss": 4.405,
|
|
"mean_token_accuracy": 0.2683298781514168,
|
|
"num_tokens": 95406240.0,
|
|
"step": 41610
|
|
},
|
|
{
|
|
"entropy": 5.09619722366333,
|
|
"epoch": 3.9975984630163306,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00034857222714814996,
|
|
"loss": 4.617,
|
|
"mean_token_accuracy": 0.24741874635219574,
|
|
"num_tokens": 95417021.0,
|
|
"step": 41615
|
|
},
|
|
{
|
|
"entropy": 5.028838968276977,
|
|
"epoch": 3.9980787704130645,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000348539827353678,
|
|
"loss": 4.647,
|
|
"mean_token_accuracy": 0.24397590458393098,
|
|
"num_tokens": 95429315.0,
|
|
"step": 41620
|
|
},
|
|
{
|
|
"entropy": 5.082918977737426,
|
|
"epoch": 3.9985590778097984,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003485074258518298,
|
|
"loss": 4.6065,
|
|
"mean_token_accuracy": 0.2597420305013657,
|
|
"num_tokens": 95441149.0,
|
|
"step": 41625
|
|
},
|
|
{
|
|
"entropy": 5.11497950553894,
|
|
"epoch": 3.9990393852065322,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003484750226433573,
|
|
"loss": 4.6672,
|
|
"mean_token_accuracy": 0.24913938641548156,
|
|
"num_tokens": 95452753.0,
|
|
"step": 41630
|
|
},
|
|
{
|
|
"entropy": 4.914012670516968,
|
|
"epoch": 3.999519692603266,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00034844261772901293,
|
|
"loss": 4.505,
|
|
"mean_token_accuracy": 0.2673894613981247,
|
|
"num_tokens": 95464901.0,
|
|
"step": 41635
|
|
},
|
|
{
|
|
"entropy": 5.109942483901977,
|
|
"epoch": 4.0,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003484102111095492,
|
|
"loss": 4.5751,
|
|
"mean_token_accuracy": 0.2586257174611092,
|
|
"num_tokens": 95474144.0,
|
|
"step": 41640
|
|
},
|
|
{
|
|
"entropy": 5.0987536907196045,
|
|
"epoch": 4.000480307396734,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00034837780278571814,
|
|
"loss": 4.5413,
|
|
"mean_token_accuracy": 0.2559258297085762,
|
|
"num_tokens": 95483973.0,
|
|
"step": 41645
|
|
},
|
|
{
|
|
"entropy": 5.030422687530518,
|
|
"epoch": 4.000960614793468,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003483453927582725,
|
|
"loss": 4.5754,
|
|
"mean_token_accuracy": 0.25695512294769285,
|
|
"num_tokens": 95495989.0,
|
|
"step": 41650
|
|
},
|
|
{
|
|
"entropy": 4.96630334854126,
|
|
"epoch": 4.001440922190202,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003483129810279647,
|
|
"loss": 4.4376,
|
|
"mean_token_accuracy": 0.2688439816236496,
|
|
"num_tokens": 95507286.0,
|
|
"step": 41655
|
|
},
|
|
{
|
|
"entropy": 5.056946611404419,
|
|
"epoch": 4.0019212295869355,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000348280567595547,
|
|
"loss": 4.5652,
|
|
"mean_token_accuracy": 0.261437414586544,
|
|
"num_tokens": 95519499.0,
|
|
"step": 41660
|
|
},
|
|
{
|
|
"entropy": 5.009938478469849,
|
|
"epoch": 4.00240153698367,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034824815246177227,
|
|
"loss": 4.4621,
|
|
"mean_token_accuracy": 0.2620555073022842,
|
|
"num_tokens": 95532134.0,
|
|
"step": 41665
|
|
},
|
|
{
|
|
"entropy": 5.01798300743103,
|
|
"epoch": 4.002881844380403,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003482157356273929,
|
|
"loss": 4.5197,
|
|
"mean_token_accuracy": 0.25777807980775835,
|
|
"num_tokens": 95542157.0,
|
|
"step": 41670
|
|
},
|
|
{
|
|
"entropy": 5.0512425899505615,
|
|
"epoch": 4.003362151777138,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003481833170931615,
|
|
"loss": 4.4533,
|
|
"mean_token_accuracy": 0.26709835082292555,
|
|
"num_tokens": 95553345.0,
|
|
"step": 41675
|
|
},
|
|
{
|
|
"entropy": 5.0012102127075195,
|
|
"epoch": 4.003842459173871,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034815089685983085,
|
|
"loss": 4.4585,
|
|
"mean_token_accuracy": 0.26358948051929476,
|
|
"num_tokens": 95564795.0,
|
|
"step": 41680
|
|
},
|
|
{
|
|
"entropy": 5.0275537967681885,
|
|
"epoch": 4.004322766570605,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003481184749281536,
|
|
"loss": 4.547,
|
|
"mean_token_accuracy": 0.2625957548618317,
|
|
"num_tokens": 95576777.0,
|
|
"step": 41685
|
|
},
|
|
{
|
|
"entropy": 5.053344058990478,
|
|
"epoch": 4.004803073967339,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00034808605129888246,
|
|
"loss": 4.5194,
|
|
"mean_token_accuracy": 0.268404945731163,
|
|
"num_tokens": 95588483.0,
|
|
"step": 41690
|
|
},
|
|
{
|
|
"entropy": 4.968467664718628,
|
|
"epoch": 4.005283381364073,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00034805362597277017,
|
|
"loss": 4.408,
|
|
"mean_token_accuracy": 0.2726629450917244,
|
|
"num_tokens": 95600300.0,
|
|
"step": 41695
|
|
},
|
|
{
|
|
"entropy": 5.008240270614624,
|
|
"epoch": 4.005763688760807,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00034802119895056973,
|
|
"loss": 4.4891,
|
|
"mean_token_accuracy": 0.2549231857061386,
|
|
"num_tokens": 95611596.0,
|
|
"step": 41700
|
|
},
|
|
{
|
|
"entropy": 4.973463916778565,
|
|
"epoch": 4.006243996157541,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034798877023303385,
|
|
"loss": 4.4785,
|
|
"mean_token_accuracy": 0.2653582811355591,
|
|
"num_tokens": 95622813.0,
|
|
"step": 41705
|
|
},
|
|
{
|
|
"entropy": 4.949472141265869,
|
|
"epoch": 4.006724303554274,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00034795633982091534,
|
|
"loss": 4.4275,
|
|
"mean_token_accuracy": 0.27165781855583193,
|
|
"num_tokens": 95635862.0,
|
|
"step": 41710
|
|
},
|
|
{
|
|
"entropy": 5.120182943344116,
|
|
"epoch": 4.007204610951009,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00034792390771496735,
|
|
"loss": 4.5981,
|
|
"mean_token_accuracy": 0.25693056732416153,
|
|
"num_tokens": 95647396.0,
|
|
"step": 41715
|
|
},
|
|
{
|
|
"entropy": 5.032498979568482,
|
|
"epoch": 4.007684918347743,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00034789147391594275,
|
|
"loss": 4.5282,
|
|
"mean_token_accuracy": 0.26402246952056885,
|
|
"num_tokens": 95657882.0,
|
|
"step": 41720
|
|
},
|
|
{
|
|
"entropy": 4.9984992980957035,
|
|
"epoch": 4.008165225744476,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.0003478590384245945,
|
|
"loss": 4.4152,
|
|
"mean_token_accuracy": 0.2693095371127129,
|
|
"num_tokens": 95669915.0,
|
|
"step": 41725
|
|
},
|
|
{
|
|
"entropy": 4.968413066864014,
|
|
"epoch": 4.008645533141211,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00034782660124167583,
|
|
"loss": 4.3986,
|
|
"mean_token_accuracy": 0.2741385281085968,
|
|
"num_tokens": 95680660.0,
|
|
"step": 41730
|
|
},
|
|
{
|
|
"entropy": 5.009887838363648,
|
|
"epoch": 4.009125840537944,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0003477941623679397,
|
|
"loss": 4.5469,
|
|
"mean_token_accuracy": 0.25134692192077634,
|
|
"num_tokens": 95692604.0,
|
|
"step": 41735
|
|
},
|
|
{
|
|
"entropy": 4.952689599990845,
|
|
"epoch": 4.0096061479346785,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003477617218041393,
|
|
"loss": 4.4727,
|
|
"mean_token_accuracy": 0.26311196237802503,
|
|
"num_tokens": 95703761.0,
|
|
"step": 41740
|
|
},
|
|
{
|
|
"entropy": 4.905816507339478,
|
|
"epoch": 4.010086455331412,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003477292795510277,
|
|
"loss": 4.3704,
|
|
"mean_token_accuracy": 0.2727318063378334,
|
|
"num_tokens": 95715776.0,
|
|
"step": 41745
|
|
},
|
|
{
|
|
"entropy": 5.042010879516601,
|
|
"epoch": 4.010566762728146,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003476968356093582,
|
|
"loss": 4.5679,
|
|
"mean_token_accuracy": 0.26142691522836686,
|
|
"num_tokens": 95727812.0,
|
|
"step": 41750
|
|
},
|
|
{
|
|
"entropy": 4.992965507507324,
|
|
"epoch": 4.01104707012488,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0003476643899798841,
|
|
"loss": 4.483,
|
|
"mean_token_accuracy": 0.2649794265627861,
|
|
"num_tokens": 95740453.0,
|
|
"step": 41755
|
|
},
|
|
{
|
|
"entropy": 4.923837375640869,
|
|
"epoch": 4.011527377521614,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.00034763194266335855,
|
|
"loss": 4.384,
|
|
"mean_token_accuracy": 0.2813527673482895,
|
|
"num_tokens": 95753436.0,
|
|
"step": 41760
|
|
},
|
|
{
|
|
"entropy": 5.057668542861938,
|
|
"epoch": 4.0120076849183475,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003475994936605351,
|
|
"loss": 4.5398,
|
|
"mean_token_accuracy": 0.25754400342702866,
|
|
"num_tokens": 95763936.0,
|
|
"step": 41765
|
|
},
|
|
{
|
|
"entropy": 5.0529945373535154,
|
|
"epoch": 4.012487992315082,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034756704297216686,
|
|
"loss": 4.5137,
|
|
"mean_token_accuracy": 0.26455884873867036,
|
|
"num_tokens": 95775535.0,
|
|
"step": 41770
|
|
},
|
|
{
|
|
"entropy": 5.030935525894165,
|
|
"epoch": 4.012968299711815,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00034753459059900745,
|
|
"loss": 4.5332,
|
|
"mean_token_accuracy": 0.26158440560102464,
|
|
"num_tokens": 95786814.0,
|
|
"step": 41775
|
|
},
|
|
{
|
|
"entropy": 5.085695028305054,
|
|
"epoch": 4.01344860710855,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034750213654181016,
|
|
"loss": 4.5259,
|
|
"mean_token_accuracy": 0.27036806046962736,
|
|
"num_tokens": 95798141.0,
|
|
"step": 41780
|
|
},
|
|
{
|
|
"entropy": 5.058136892318726,
|
|
"epoch": 4.013928914505283,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034746968080132855,
|
|
"loss": 4.5125,
|
|
"mean_token_accuracy": 0.2643093645572662,
|
|
"num_tokens": 95811244.0,
|
|
"step": 41785
|
|
},
|
|
{
|
|
"entropy": 5.026040029525757,
|
|
"epoch": 4.014409221902017,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003474372233783162,
|
|
"loss": 4.4189,
|
|
"mean_token_accuracy": 0.2694830983877182,
|
|
"num_tokens": 95821824.0,
|
|
"step": 41790
|
|
},
|
|
{
|
|
"entropy": 4.943759489059448,
|
|
"epoch": 4.014889529298751,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034740476427352657,
|
|
"loss": 4.3942,
|
|
"mean_token_accuracy": 0.2754273623228073,
|
|
"num_tokens": 95833122.0,
|
|
"step": 41795
|
|
},
|
|
{
|
|
"entropy": 4.916182994842529,
|
|
"epoch": 4.015369836695485,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003473723034877133,
|
|
"loss": 4.4477,
|
|
"mean_token_accuracy": 0.2646457701921463,
|
|
"num_tokens": 95844325.0,
|
|
"step": 41800
|
|
},
|
|
{
|
|
"entropy": 5.009830093383789,
|
|
"epoch": 4.015850144092219,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00034733984102163005,
|
|
"loss": 4.5463,
|
|
"mean_token_accuracy": 0.25758406072854995,
|
|
"num_tokens": 95855788.0,
|
|
"step": 41805
|
|
},
|
|
{
|
|
"entropy": 5.085469532012939,
|
|
"epoch": 4.016330451488953,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003473073768760306,
|
|
"loss": 4.529,
|
|
"mean_token_accuracy": 0.2561642035841942,
|
|
"num_tokens": 95867387.0,
|
|
"step": 41810
|
|
},
|
|
{
|
|
"entropy": 5.095583534240722,
|
|
"epoch": 4.016810758885687,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00034727491105166847,
|
|
"loss": 4.5631,
|
|
"mean_token_accuracy": 0.2577065333724022,
|
|
"num_tokens": 95878258.0,
|
|
"step": 41815
|
|
},
|
|
{
|
|
"entropy": 5.060622644424439,
|
|
"epoch": 4.017291066282421,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003472424435492975,
|
|
"loss": 4.4938,
|
|
"mean_token_accuracy": 0.26320756077766416,
|
|
"num_tokens": 95889312.0,
|
|
"step": 41820
|
|
},
|
|
{
|
|
"entropy": 5.098781681060791,
|
|
"epoch": 4.017771373679155,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003472099743696716,
|
|
"loss": 4.6341,
|
|
"mean_token_accuracy": 0.25594886392354965,
|
|
"num_tokens": 95901930.0,
|
|
"step": 41825
|
|
},
|
|
{
|
|
"entropy": 4.9547265529632565,
|
|
"epoch": 4.018251681075888,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003471775035135444,
|
|
"loss": 4.4042,
|
|
"mean_token_accuracy": 0.26451119780540466,
|
|
"num_tokens": 95913877.0,
|
|
"step": 41830
|
|
},
|
|
{
|
|
"entropy": 5.006029272079468,
|
|
"epoch": 4.018731988472623,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00034714503098166996,
|
|
"loss": 4.5272,
|
|
"mean_token_accuracy": 0.26310057640075685,
|
|
"num_tokens": 95925259.0,
|
|
"step": 41835
|
|
},
|
|
{
|
|
"entropy": 4.998060083389282,
|
|
"epoch": 4.019212295869356,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00034711255677480216,
|
|
"loss": 4.4567,
|
|
"mean_token_accuracy": 0.2699718713760376,
|
|
"num_tokens": 95936612.0,
|
|
"step": 41840
|
|
},
|
|
{
|
|
"entropy": 4.969458532333374,
|
|
"epoch": 4.0196926032660905,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003470800808936948,
|
|
"loss": 4.5038,
|
|
"mean_token_accuracy": 0.2616461098194122,
|
|
"num_tokens": 95948505.0,
|
|
"step": 41845
|
|
},
|
|
{
|
|
"entropy": 4.93705267906189,
|
|
"epoch": 4.020172910662824,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00034704760333910205,
|
|
"loss": 4.4022,
|
|
"mean_token_accuracy": 0.2775100916624069,
|
|
"num_tokens": 95959767.0,
|
|
"step": 41850
|
|
},
|
|
{
|
|
"entropy": 5.094399356842041,
|
|
"epoch": 4.020653218059558,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003470151241117779,
|
|
"loss": 4.6481,
|
|
"mean_token_accuracy": 0.25899033397436144,
|
|
"num_tokens": 95971470.0,
|
|
"step": 41855
|
|
},
|
|
{
|
|
"entropy": 5.009691572189331,
|
|
"epoch": 4.021133525456292,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003469826432124764,
|
|
"loss": 4.4478,
|
|
"mean_token_accuracy": 0.26643633395433425,
|
|
"num_tokens": 95983610.0,
|
|
"step": 41860
|
|
},
|
|
{
|
|
"entropy": 5.0232141494750975,
|
|
"epoch": 4.021613832853026,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003469501606419517,
|
|
"loss": 4.4796,
|
|
"mean_token_accuracy": 0.2666534692049026,
|
|
"num_tokens": 95995199.0,
|
|
"step": 41865
|
|
},
|
|
{
|
|
"entropy": 5.040107393264771,
|
|
"epoch": 4.022094140249759,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003469176764009579,
|
|
"loss": 4.5324,
|
|
"mean_token_accuracy": 0.2565169408917427,
|
|
"num_tokens": 96005881.0,
|
|
"step": 41870
|
|
},
|
|
{
|
|
"entropy": 5.026882123947144,
|
|
"epoch": 4.022574447646494,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00034688519049024924,
|
|
"loss": 4.4407,
|
|
"mean_token_accuracy": 0.26741924285888674,
|
|
"num_tokens": 96016973.0,
|
|
"step": 41875
|
|
},
|
|
{
|
|
"entropy": 5.016121292114258,
|
|
"epoch": 4.023054755043228,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034685270291057987,
|
|
"loss": 4.5613,
|
|
"mean_token_accuracy": 0.24538991451263428,
|
|
"num_tokens": 96029500.0,
|
|
"step": 41880
|
|
},
|
|
{
|
|
"entropy": 4.967826557159424,
|
|
"epoch": 4.0235350624399615,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034682021366270403,
|
|
"loss": 4.4141,
|
|
"mean_token_accuracy": 0.269784078001976,
|
|
"num_tokens": 96042239.0,
|
|
"step": 41885
|
|
},
|
|
{
|
|
"entropy": 5.12619047164917,
|
|
"epoch": 4.024015369836696,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00034678772274737626,
|
|
"loss": 4.5744,
|
|
"mean_token_accuracy": 0.2520944982767105,
|
|
"num_tokens": 96053385.0,
|
|
"step": 41890
|
|
},
|
|
{
|
|
"entropy": 5.003186368942261,
|
|
"epoch": 4.024495677233429,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00034675523016535067,
|
|
"loss": 4.4838,
|
|
"mean_token_accuracy": 0.2656889036297798,
|
|
"num_tokens": 96065600.0,
|
|
"step": 41895
|
|
},
|
|
{
|
|
"entropy": 5.021344137191773,
|
|
"epoch": 4.024975984630164,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034672273591738155,
|
|
"loss": 4.5283,
|
|
"mean_token_accuracy": 0.2588555857539177,
|
|
"num_tokens": 96076591.0,
|
|
"step": 41900
|
|
},
|
|
{
|
|
"entropy": 4.935583257675171,
|
|
"epoch": 4.025456292026897,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0003466902400042236,
|
|
"loss": 4.4449,
|
|
"mean_token_accuracy": 0.27475939095020296,
|
|
"num_tokens": 96088643.0,
|
|
"step": 41905
|
|
},
|
|
{
|
|
"entropy": 5.039946699142456,
|
|
"epoch": 4.025936599423631,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003466577424266312,
|
|
"loss": 4.5184,
|
|
"mean_token_accuracy": 0.2619390651583672,
|
|
"num_tokens": 96101592.0,
|
|
"step": 41910
|
|
},
|
|
{
|
|
"entropy": 4.974374914169312,
|
|
"epoch": 4.026416906820365,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00034662524318535877,
|
|
"loss": 4.4257,
|
|
"mean_token_accuracy": 0.2741742074489594,
|
|
"num_tokens": 96113434.0,
|
|
"step": 41915
|
|
},
|
|
{
|
|
"entropy": 5.025488328933716,
|
|
"epoch": 4.026897214217099,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00034659274228116086,
|
|
"loss": 4.5075,
|
|
"mean_token_accuracy": 0.2692012131214142,
|
|
"num_tokens": 96123576.0,
|
|
"step": 41920
|
|
},
|
|
{
|
|
"entropy": 5.025997447967529,
|
|
"epoch": 4.027377521613833,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00034656023971479206,
|
|
"loss": 4.602,
|
|
"mean_token_accuracy": 0.2541952520608902,
|
|
"num_tokens": 96135415.0,
|
|
"step": 41925
|
|
},
|
|
{
|
|
"entropy": 5.079021692276001,
|
|
"epoch": 4.027857829010567,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003465277354870069,
|
|
"loss": 4.5673,
|
|
"mean_token_accuracy": 0.26210458427667616,
|
|
"num_tokens": 96146740.0,
|
|
"step": 41930
|
|
},
|
|
{
|
|
"entropy": 4.939193820953369,
|
|
"epoch": 4.0283381364073,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034649522959856027,
|
|
"loss": 4.3957,
|
|
"mean_token_accuracy": 0.27863743156194687,
|
|
"num_tokens": 96158650.0,
|
|
"step": 41935
|
|
},
|
|
{
|
|
"entropy": 5.073782682418823,
|
|
"epoch": 4.028818443804035,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00034646272205020664,
|
|
"loss": 4.5489,
|
|
"mean_token_accuracy": 0.26301725655794145,
|
|
"num_tokens": 96169394.0,
|
|
"step": 41940
|
|
},
|
|
{
|
|
"entropy": 5.017283010482788,
|
|
"epoch": 4.029298751200768,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003464302128427008,
|
|
"loss": 4.533,
|
|
"mean_token_accuracy": 0.26658937335014343,
|
|
"num_tokens": 96180791.0,
|
|
"step": 41945
|
|
},
|
|
{
|
|
"entropy": 4.896490383148193,
|
|
"epoch": 4.029779058597502,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003463977019767975,
|
|
"loss": 4.3492,
|
|
"mean_token_accuracy": 0.2803980588912964,
|
|
"num_tokens": 96191705.0,
|
|
"step": 41950
|
|
},
|
|
{
|
|
"entropy": 5.061274003982544,
|
|
"epoch": 4.030259365994237,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003463651894532516,
|
|
"loss": 4.6331,
|
|
"mean_token_accuracy": 0.2575544148683548,
|
|
"num_tokens": 96203742.0,
|
|
"step": 41955
|
|
},
|
|
{
|
|
"entropy": 5.006491565704346,
|
|
"epoch": 4.03073967339097,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.000346332675272818,
|
|
"loss": 4.4872,
|
|
"mean_token_accuracy": 0.27147663086652757,
|
|
"num_tokens": 96214991.0,
|
|
"step": 41960
|
|
},
|
|
{
|
|
"entropy": 4.950018215179443,
|
|
"epoch": 4.0312199807877045,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003463001594362513,
|
|
"loss": 4.4433,
|
|
"mean_token_accuracy": 0.26847639083862307,
|
|
"num_tokens": 96227122.0,
|
|
"step": 41965
|
|
},
|
|
{
|
|
"entropy": 4.995484161376953,
|
|
"epoch": 4.031700288184438,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003462676419443067,
|
|
"loss": 4.4197,
|
|
"mean_token_accuracy": 0.2700593739748001,
|
|
"num_tokens": 96237508.0,
|
|
"step": 41970
|
|
},
|
|
{
|
|
"entropy": 4.964941930770874,
|
|
"epoch": 4.032180595581172,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003462351227977391,
|
|
"loss": 4.5534,
|
|
"mean_token_accuracy": 0.26120382100343703,
|
|
"num_tokens": 96250375.0,
|
|
"step": 41975
|
|
},
|
|
{
|
|
"entropy": 5.054831171035767,
|
|
"epoch": 4.032660902977906,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034620260199730345,
|
|
"loss": 4.5159,
|
|
"mean_token_accuracy": 0.26145862340927123,
|
|
"num_tokens": 96261199.0,
|
|
"step": 41980
|
|
},
|
|
{
|
|
"entropy": 5.0789141178131105,
|
|
"epoch": 4.03314121037464,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00034617007954375486,
|
|
"loss": 4.5418,
|
|
"mean_token_accuracy": 0.2561738058924675,
|
|
"num_tokens": 96272492.0,
|
|
"step": 41985
|
|
},
|
|
{
|
|
"entropy": 4.955269002914429,
|
|
"epoch": 4.0336215177713735,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003461375554378483,
|
|
"loss": 4.405,
|
|
"mean_token_accuracy": 0.26688730120658877,
|
|
"num_tokens": 96284657.0,
|
|
"step": 41990
|
|
},
|
|
{
|
|
"entropy": 5.113492584228515,
|
|
"epoch": 4.034101825168108,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034610502968033895,
|
|
"loss": 4.6104,
|
|
"mean_token_accuracy": 0.2497868597507477,
|
|
"num_tokens": 96296958.0,
|
|
"step": 41995
|
|
},
|
|
{
|
|
"entropy": 5.008513784408569,
|
|
"epoch": 4.034582132564841,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000346072502271982,
|
|
"loss": 4.446,
|
|
"mean_token_accuracy": 0.2774098917841911,
|
|
"num_tokens": 96307923.0,
|
|
"step": 42000
|
|
},
|
|
{
|
|
"epoch": 4.034582132564841,
|
|
"eval_entropy": 4.872048725621518,
|
|
"eval_loss": 4.691915035247803,
|
|
"eval_mean_token_accuracy": 0.2596119696556839,
|
|
"eval_num_tokens": 96307923.0,
|
|
"eval_runtime": 26.6483,
|
|
"eval_samples_per_second": 1231.41,
|
|
"eval_steps_per_second": 153.931,
|
|
"step": 42000
|
|
},
|
|
{
|
|
"entropy": 5.013257694244385,
|
|
"epoch": 4.0350624399615755,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0003460399732135326,
|
|
"loss": 4.5186,
|
|
"mean_token_accuracy": 0.26455856710672376,
|
|
"num_tokens": 96320177.0,
|
|
"step": 42005
|
|
},
|
|
{
|
|
"entropy": 5.051885080337525,
|
|
"epoch": 4.035542747358309,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003460074425057458,
|
|
"loss": 4.5556,
|
|
"mean_token_accuracy": 0.26109528839588164,
|
|
"num_tokens": 96331860.0,
|
|
"step": 42010
|
|
},
|
|
{
|
|
"entropy": 4.890182781219482,
|
|
"epoch": 4.036023054755043,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00034597491014937723,
|
|
"loss": 4.3453,
|
|
"mean_token_accuracy": 0.27454771399497985,
|
|
"num_tokens": 96342111.0,
|
|
"step": 42015
|
|
},
|
|
{
|
|
"entropy": 4.9661883354187015,
|
|
"epoch": 4.036503362151777,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003459423761451819,
|
|
"loss": 4.52,
|
|
"mean_token_accuracy": 0.26080658137798307,
|
|
"num_tokens": 96353930.0,
|
|
"step": 42020
|
|
},
|
|
{
|
|
"entropy": 5.0015472888946535,
|
|
"epoch": 4.036983669548511,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003459098404939153,
|
|
"loss": 4.5147,
|
|
"mean_token_accuracy": 0.2630936220288277,
|
|
"num_tokens": 96364924.0,
|
|
"step": 42025
|
|
},
|
|
{
|
|
"entropy": 5.076598882675171,
|
|
"epoch": 4.037463976945245,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034587730319633265,
|
|
"loss": 4.5389,
|
|
"mean_token_accuracy": 0.2577996075153351,
|
|
"num_tokens": 96377043.0,
|
|
"step": 42030
|
|
},
|
|
{
|
|
"entropy": 5.000241899490357,
|
|
"epoch": 4.037944284341979,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00034584476425318964,
|
|
"loss": 4.4482,
|
|
"mean_token_accuracy": 0.2739641353487968,
|
|
"num_tokens": 96388122.0,
|
|
"step": 42035
|
|
},
|
|
{
|
|
"entropy": 4.93699836730957,
|
|
"epoch": 4.038424591738713,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00034581222366524147,
|
|
"loss": 4.4572,
|
|
"mean_token_accuracy": 0.26637285202741623,
|
|
"num_tokens": 96399347.0,
|
|
"step": 42040
|
|
},
|
|
{
|
|
"entropy": 4.955125522613526,
|
|
"epoch": 4.038904899135447,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034577968143324365,
|
|
"loss": 4.4418,
|
|
"mean_token_accuracy": 0.26936611235141755,
|
|
"num_tokens": 96411249.0,
|
|
"step": 42045
|
|
},
|
|
{
|
|
"entropy": 5.060801887512207,
|
|
"epoch": 4.039385206532181,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000345747137557952,
|
|
"loss": 4.5847,
|
|
"mean_token_accuracy": 0.2579483792185783,
|
|
"num_tokens": 96423274.0,
|
|
"step": 42050
|
|
},
|
|
{
|
|
"entropy": 5.0815558433532715,
|
|
"epoch": 4.039865513928914,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003457145920401218,
|
|
"loss": 4.5373,
|
|
"mean_token_accuracy": 0.25582472831010816,
|
|
"num_tokens": 96434822.0,
|
|
"step": 42055
|
|
},
|
|
{
|
|
"entropy": 5.012590026855468,
|
|
"epoch": 4.040345821325649,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00034568204488050875,
|
|
"loss": 4.4709,
|
|
"mean_token_accuracy": 0.26436175405979156,
|
|
"num_tokens": 96446313.0,
|
|
"step": 42060
|
|
},
|
|
{
|
|
"entropy": 5.0681384086608885,
|
|
"epoch": 4.040826128722382,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003456494960798685,
|
|
"loss": 4.5662,
|
|
"mean_token_accuracy": 0.25745727866888046,
|
|
"num_tokens": 96457669.0,
|
|
"step": 42065
|
|
},
|
|
{
|
|
"entropy": 5.01466703414917,
|
|
"epoch": 4.041306436119116,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003456169456389567,
|
|
"loss": 4.515,
|
|
"mean_token_accuracy": 0.2642508387565613,
|
|
"num_tokens": 96469056.0,
|
|
"step": 42070
|
|
},
|
|
{
|
|
"entropy": 4.926809692382813,
|
|
"epoch": 4.04178674351585,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034558439355852915,
|
|
"loss": 4.3809,
|
|
"mean_token_accuracy": 0.27094448506832125,
|
|
"num_tokens": 96479783.0,
|
|
"step": 42075
|
|
},
|
|
{
|
|
"entropy": 4.963146591186524,
|
|
"epoch": 4.042267050912584,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003455518398393416,
|
|
"loss": 4.4846,
|
|
"mean_token_accuracy": 0.26219048649072646,
|
|
"num_tokens": 96491473.0,
|
|
"step": 42080
|
|
},
|
|
{
|
|
"entropy": 5.0328813076019285,
|
|
"epoch": 4.042747358309318,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00034551928448214983,
|
|
"loss": 4.5472,
|
|
"mean_token_accuracy": 0.2643993556499481,
|
|
"num_tokens": 96502380.0,
|
|
"step": 42085
|
|
},
|
|
{
|
|
"entropy": 5.166278314590454,
|
|
"epoch": 4.043227665706052,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003454867274877096,
|
|
"loss": 4.6314,
|
|
"mean_token_accuracy": 0.2528727948665619,
|
|
"num_tokens": 96514157.0,
|
|
"step": 42090
|
|
},
|
|
{
|
|
"entropy": 5.134456634521484,
|
|
"epoch": 4.043707973102785,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00034545416885677695,
|
|
"loss": 4.63,
|
|
"mean_token_accuracy": 0.25138918310403824,
|
|
"num_tokens": 96526549.0,
|
|
"step": 42095
|
|
},
|
|
{
|
|
"entropy": 4.992153739929199,
|
|
"epoch": 4.04418828049952,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003454216085901076,
|
|
"loss": 4.5156,
|
|
"mean_token_accuracy": 0.2657928645610809,
|
|
"num_tokens": 96539158.0,
|
|
"step": 42100
|
|
},
|
|
{
|
|
"entropy": 4.893184661865234,
|
|
"epoch": 4.044668587896253,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00034538904668845766,
|
|
"loss": 4.436,
|
|
"mean_token_accuracy": 0.27468478977680205,
|
|
"num_tokens": 96552604.0,
|
|
"step": 42105
|
|
},
|
|
{
|
|
"entropy": 4.934816789627075,
|
|
"epoch": 4.0451488952929875,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003453564831525831,
|
|
"loss": 4.4211,
|
|
"mean_token_accuracy": 0.2669402986764908,
|
|
"num_tokens": 96565457.0,
|
|
"step": 42110
|
|
},
|
|
{
|
|
"entropy": 5.048805809020996,
|
|
"epoch": 4.045629202689722,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003453239179832399,
|
|
"loss": 4.497,
|
|
"mean_token_accuracy": 0.26912936568260193,
|
|
"num_tokens": 96575605.0,
|
|
"step": 42115
|
|
},
|
|
{
|
|
"entropy": 5.023149156570435,
|
|
"epoch": 4.046109510086455,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003452913511811841,
|
|
"loss": 4.5101,
|
|
"mean_token_accuracy": 0.2611981302499771,
|
|
"num_tokens": 96586426.0,
|
|
"step": 42120
|
|
},
|
|
{
|
|
"entropy": 5.024201250076294,
|
|
"epoch": 4.04658981748319,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003452587827471719,
|
|
"loss": 4.4756,
|
|
"mean_token_accuracy": 0.2572803169488907,
|
|
"num_tokens": 96597955.0,
|
|
"step": 42125
|
|
},
|
|
{
|
|
"entropy": 4.928853607177734,
|
|
"epoch": 4.047070124879923,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00034522621268195936,
|
|
"loss": 4.4212,
|
|
"mean_token_accuracy": 0.27254017889499665,
|
|
"num_tokens": 96609148.0,
|
|
"step": 42130
|
|
},
|
|
{
|
|
"entropy": 4.9464679718017575,
|
|
"epoch": 4.047550432276657,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003451936409863027,
|
|
"loss": 4.4282,
|
|
"mean_token_accuracy": 0.26841907799243925,
|
|
"num_tokens": 96621221.0,
|
|
"step": 42135
|
|
},
|
|
{
|
|
"entropy": 5.033289575576783,
|
|
"epoch": 4.048030739673391,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00034516106766095813,
|
|
"loss": 4.5517,
|
|
"mean_token_accuracy": 0.2634765341877937,
|
|
"num_tokens": 96632567.0,
|
|
"step": 42140
|
|
},
|
|
{
|
|
"entropy": 5.017474508285522,
|
|
"epoch": 4.048511047070125,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003451284927066819,
|
|
"loss": 4.4207,
|
|
"mean_token_accuracy": 0.26263629347085954,
|
|
"num_tokens": 96643501.0,
|
|
"step": 42145
|
|
},
|
|
{
|
|
"entropy": 4.945333957672119,
|
|
"epoch": 4.0489913544668585,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003450959161242303,
|
|
"loss": 4.4538,
|
|
"mean_token_accuracy": 0.2676896691322327,
|
|
"num_tokens": 96654395.0,
|
|
"step": 42150
|
|
},
|
|
{
|
|
"entropy": 5.058342552185058,
|
|
"epoch": 4.049471661863593,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003450633379143597,
|
|
"loss": 4.6145,
|
|
"mean_token_accuracy": 0.2512867212295532,
|
|
"num_tokens": 96666135.0,
|
|
"step": 42155
|
|
},
|
|
{
|
|
"entropy": 4.972822475433349,
|
|
"epoch": 4.049951969260326,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00034503075807782634,
|
|
"loss": 4.4132,
|
|
"mean_token_accuracy": 0.2704052045941353,
|
|
"num_tokens": 96678149.0,
|
|
"step": 42160
|
|
},
|
|
{
|
|
"entropy": 4.977131938934326,
|
|
"epoch": 4.050432276657061,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003449981766153869,
|
|
"loss": 4.4604,
|
|
"mean_token_accuracy": 0.27504906356334685,
|
|
"num_tokens": 96688557.0,
|
|
"step": 42165
|
|
},
|
|
{
|
|
"entropy": 4.992603302001953,
|
|
"epoch": 4.050912584053794,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003449655935277976,
|
|
"loss": 4.4751,
|
|
"mean_token_accuracy": 0.26966538429260256,
|
|
"num_tokens": 96699404.0,
|
|
"step": 42170
|
|
},
|
|
{
|
|
"entropy": 4.981101131439209,
|
|
"epoch": 4.051392891450528,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034493300881581484,
|
|
"loss": 4.4398,
|
|
"mean_token_accuracy": 0.2660211235284805,
|
|
"num_tokens": 96711673.0,
|
|
"step": 42175
|
|
},
|
|
{
|
|
"entropy": 4.993382406234741,
|
|
"epoch": 4.051873198847262,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003449004224801954,
|
|
"loss": 4.4781,
|
|
"mean_token_accuracy": 0.27254432439804077,
|
|
"num_tokens": 96723113.0,
|
|
"step": 42180
|
|
},
|
|
{
|
|
"entropy": 5.098783874511719,
|
|
"epoch": 4.052353506243996,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0003448678345216957,
|
|
"loss": 4.627,
|
|
"mean_token_accuracy": 0.25553334653377535,
|
|
"num_tokens": 96733323.0,
|
|
"step": 42185
|
|
},
|
|
{
|
|
"entropy": 5.008471202850342,
|
|
"epoch": 4.0528338136407305,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00034483524494107235,
|
|
"loss": 4.4848,
|
|
"mean_token_accuracy": 0.2618389666080475,
|
|
"num_tokens": 96743524.0,
|
|
"step": 42190
|
|
},
|
|
{
|
|
"entropy": 5.081727361679077,
|
|
"epoch": 4.053314121037464,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034480265373908187,
|
|
"loss": 4.6249,
|
|
"mean_token_accuracy": 0.25214796513319016,
|
|
"num_tokens": 96756565.0,
|
|
"step": 42195
|
|
},
|
|
{
|
|
"entropy": 5.0010637760162355,
|
|
"epoch": 4.053794428434198,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00034477006091648116,
|
|
"loss": 4.4428,
|
|
"mean_token_accuracy": 0.2650091081857681,
|
|
"num_tokens": 96767391.0,
|
|
"step": 42200
|
|
},
|
|
{
|
|
"entropy": 4.954708290100098,
|
|
"epoch": 4.054274735830932,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034473746647402674,
|
|
"loss": 4.453,
|
|
"mean_token_accuracy": 0.27267134189605713,
|
|
"num_tokens": 96778074.0,
|
|
"step": 42205
|
|
},
|
|
{
|
|
"entropy": 4.900137710571289,
|
|
"epoch": 4.054755043227666,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003447048704124754,
|
|
"loss": 4.3598,
|
|
"mean_token_accuracy": 0.2792866200208664,
|
|
"num_tokens": 96789669.0,
|
|
"step": 42210
|
|
},
|
|
{
|
|
"entropy": 5.052953004837036,
|
|
"epoch": 4.055235350624399,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003446722727325841,
|
|
"loss": 4.5617,
|
|
"mean_token_accuracy": 0.2610149383544922,
|
|
"num_tokens": 96800089.0,
|
|
"step": 42215
|
|
},
|
|
{
|
|
"entropy": 5.0347206592559814,
|
|
"epoch": 4.055715658021134,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003446396734351094,
|
|
"loss": 4.5798,
|
|
"mean_token_accuracy": 0.25989185124635694,
|
|
"num_tokens": 96812248.0,
|
|
"step": 42220
|
|
},
|
|
{
|
|
"entropy": 5.072267007827759,
|
|
"epoch": 4.056195965417867,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003446070725208083,
|
|
"loss": 4.6057,
|
|
"mean_token_accuracy": 0.25491817146539686,
|
|
"num_tokens": 96824016.0,
|
|
"step": 42225
|
|
},
|
|
{
|
|
"entropy": 5.047389221191406,
|
|
"epoch": 4.0566762728146015,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003445744699904377,
|
|
"loss": 4.4601,
|
|
"mean_token_accuracy": 0.2620326355099678,
|
|
"num_tokens": 96835128.0,
|
|
"step": 42230
|
|
},
|
|
{
|
|
"entropy": 4.954657936096192,
|
|
"epoch": 4.057156580211335,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003445418658447544,
|
|
"loss": 4.4349,
|
|
"mean_token_accuracy": 0.2658333763480186,
|
|
"num_tokens": 96846755.0,
|
|
"step": 42235
|
|
},
|
|
{
|
|
"entropy": 4.953990507125854,
|
|
"epoch": 4.057636887608069,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00034450926008451556,
|
|
"loss": 4.4931,
|
|
"mean_token_accuracy": 0.2683817744255066,
|
|
"num_tokens": 96858262.0,
|
|
"step": 42240
|
|
},
|
|
{
|
|
"entropy": 5.014713096618652,
|
|
"epoch": 4.058117195004803,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003444766527104781,
|
|
"loss": 4.4855,
|
|
"mean_token_accuracy": 0.2639716535806656,
|
|
"num_tokens": 96871533.0,
|
|
"step": 42245
|
|
},
|
|
{
|
|
"entropy": 5.074160528182984,
|
|
"epoch": 4.058597502401537,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000344444043723399,
|
|
"loss": 4.499,
|
|
"mean_token_accuracy": 0.2633610859513283,
|
|
"num_tokens": 96883668.0,
|
|
"step": 42250
|
|
},
|
|
{
|
|
"entropy": 5.057752704620361,
|
|
"epoch": 4.0590778097982705,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003444114331240355,
|
|
"loss": 4.5551,
|
|
"mean_token_accuracy": 0.25410541892051697,
|
|
"num_tokens": 96894196.0,
|
|
"step": 42255
|
|
},
|
|
{
|
|
"entropy": 5.048542118072509,
|
|
"epoch": 4.059558117195005,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003443788209131446,
|
|
"loss": 4.5488,
|
|
"mean_token_accuracy": 0.26270691603422164,
|
|
"num_tokens": 96905570.0,
|
|
"step": 42260
|
|
},
|
|
{
|
|
"entropy": 5.051604795455932,
|
|
"epoch": 4.060038424591739,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034434620709148343,
|
|
"loss": 4.4991,
|
|
"mean_token_accuracy": 0.27238699644804,
|
|
"num_tokens": 96916852.0,
|
|
"step": 42265
|
|
},
|
|
{
|
|
"entropy": 4.989888381958008,
|
|
"epoch": 4.060518731988473,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003443135916598093,
|
|
"loss": 4.4733,
|
|
"mean_token_accuracy": 0.2585155010223389,
|
|
"num_tokens": 96927851.0,
|
|
"step": 42270
|
|
},
|
|
{
|
|
"entropy": 4.951393127441406,
|
|
"epoch": 4.060999039385207,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00034428097461887934,
|
|
"loss": 4.5132,
|
|
"mean_token_accuracy": 0.2627553030848503,
|
|
"num_tokens": 96939976.0,
|
|
"step": 42275
|
|
},
|
|
{
|
|
"entropy": 5.006790113449097,
|
|
"epoch": 4.06147934678194,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003442483559694509,
|
|
"loss": 4.5309,
|
|
"mean_token_accuracy": 0.2626191332936287,
|
|
"num_tokens": 96951920.0,
|
|
"step": 42280
|
|
},
|
|
{
|
|
"entropy": 5.01797308921814,
|
|
"epoch": 4.061959654178675,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00034421573571228134,
|
|
"loss": 4.5128,
|
|
"mean_token_accuracy": 0.2633462116122246,
|
|
"num_tokens": 96963389.0,
|
|
"step": 42285
|
|
},
|
|
{
|
|
"entropy": 4.997462177276612,
|
|
"epoch": 4.062439961575408,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003441831138481279,
|
|
"loss": 4.487,
|
|
"mean_token_accuracy": 0.270888976752758,
|
|
"num_tokens": 96975250.0,
|
|
"step": 42290
|
|
},
|
|
{
|
|
"entropy": 4.9318763256073,
|
|
"epoch": 4.062920268972142,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003441504903777479,
|
|
"loss": 4.4088,
|
|
"mean_token_accuracy": 0.2734381467103958,
|
|
"num_tokens": 96986116.0,
|
|
"step": 42295
|
|
},
|
|
{
|
|
"entropy": 4.9807751178741455,
|
|
"epoch": 4.063400576368876,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034411786530189894,
|
|
"loss": 4.4797,
|
|
"mean_token_accuracy": 0.2650988891720772,
|
|
"num_tokens": 96998370.0,
|
|
"step": 42300
|
|
},
|
|
{
|
|
"entropy": 4.964758586883545,
|
|
"epoch": 4.06388088376561,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034408523862133834,
|
|
"loss": 4.4293,
|
|
"mean_token_accuracy": 0.2626702830195427,
|
|
"num_tokens": 97010866.0,
|
|
"step": 42305
|
|
},
|
|
{
|
|
"entropy": 5.055146503448486,
|
|
"epoch": 4.064361191162344,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034405261033682363,
|
|
"loss": 4.5147,
|
|
"mean_token_accuracy": 0.263331463932991,
|
|
"num_tokens": 97023127.0,
|
|
"step": 42310
|
|
},
|
|
{
|
|
"entropy": 5.054312610626221,
|
|
"epoch": 4.064841498559078,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003440199804491123,
|
|
"loss": 4.5547,
|
|
"mean_token_accuracy": 0.2565387040376663,
|
|
"num_tokens": 97035117.0,
|
|
"step": 42315
|
|
},
|
|
{
|
|
"entropy": 5.03828797340393,
|
|
"epoch": 4.065321805955811,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003439873489589621,
|
|
"loss": 4.5596,
|
|
"mean_token_accuracy": 0.2579404428601265,
|
|
"num_tokens": 97046711.0,
|
|
"step": 42320
|
|
},
|
|
{
|
|
"entropy": 5.017680406570435,
|
|
"epoch": 4.065802113352546,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003439547158671304,
|
|
"loss": 4.4734,
|
|
"mean_token_accuracy": 0.26130240857601167,
|
|
"num_tokens": 97059120.0,
|
|
"step": 42325
|
|
},
|
|
{
|
|
"entropy": 4.967703723907471,
|
|
"epoch": 4.066282420749279,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00034392208117437503,
|
|
"loss": 4.4696,
|
|
"mean_token_accuracy": 0.26379658579826354,
|
|
"num_tokens": 97071879.0,
|
|
"step": 42330
|
|
},
|
|
{
|
|
"entropy": 5.016658544540405,
|
|
"epoch": 4.0667627281460135,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00034388944488145356,
|
|
"loss": 4.499,
|
|
"mean_token_accuracy": 0.26891441494226453,
|
|
"num_tokens": 97082883.0,
|
|
"step": 42335
|
|
},
|
|
{
|
|
"entropy": 4.988123941421509,
|
|
"epoch": 4.067243035542747,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00034385680698912377,
|
|
"loss": 4.4857,
|
|
"mean_token_accuracy": 0.2673839941620827,
|
|
"num_tokens": 97094174.0,
|
|
"step": 42340
|
|
},
|
|
{
|
|
"entropy": 5.041378211975098,
|
|
"epoch": 4.067723342939481,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00034382416749814335,
|
|
"loss": 4.528,
|
|
"mean_token_accuracy": 0.25783237665891645,
|
|
"num_tokens": 97104328.0,
|
|
"step": 42345
|
|
},
|
|
{
|
|
"entropy": 5.003281927108764,
|
|
"epoch": 4.068203650336216,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003437915264092701,
|
|
"loss": 4.4927,
|
|
"mean_token_accuracy": 0.2732357904314995,
|
|
"num_tokens": 97117533.0,
|
|
"step": 42350
|
|
},
|
|
{
|
|
"entropy": 5.045408248901367,
|
|
"epoch": 4.068683957732949,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003437588837232619,
|
|
"loss": 4.5343,
|
|
"mean_token_accuracy": 0.2612572282552719,
|
|
"num_tokens": 97129355.0,
|
|
"step": 42355
|
|
},
|
|
{
|
|
"entropy": 4.970353364944458,
|
|
"epoch": 4.069164265129683,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00034372623944087656,
|
|
"loss": 4.4607,
|
|
"mean_token_accuracy": 0.2668451279401779,
|
|
"num_tokens": 97141424.0,
|
|
"step": 42360
|
|
},
|
|
{
|
|
"entropy": 5.0063148021698,
|
|
"epoch": 4.069644572526417,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003436935935628721,
|
|
"loss": 4.5115,
|
|
"mean_token_accuracy": 0.2659687206149101,
|
|
"num_tokens": 97152900.0,
|
|
"step": 42365
|
|
},
|
|
{
|
|
"entropy": 5.077512884140015,
|
|
"epoch": 4.070124879923151,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034366094609000633,
|
|
"loss": 4.5503,
|
|
"mean_token_accuracy": 0.26175200045108793,
|
|
"num_tokens": 97163894.0,
|
|
"step": 42370
|
|
},
|
|
{
|
|
"entropy": 5.010024785995483,
|
|
"epoch": 4.0706051873198845,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003436282970230372,
|
|
"loss": 4.525,
|
|
"mean_token_accuracy": 0.2590723991394043,
|
|
"num_tokens": 97175385.0,
|
|
"step": 42375
|
|
},
|
|
{
|
|
"entropy": 5.020723867416382,
|
|
"epoch": 4.071085494716619,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00034359564636272287,
|
|
"loss": 4.4957,
|
|
"mean_token_accuracy": 0.26631955057382584,
|
|
"num_tokens": 97186830.0,
|
|
"step": 42380
|
|
},
|
|
{
|
|
"entropy": 5.075593328475952,
|
|
"epoch": 4.071565802113352,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003435629941098213,
|
|
"loss": 4.5481,
|
|
"mean_token_accuracy": 0.26057728826999665,
|
|
"num_tokens": 97197816.0,
|
|
"step": 42385
|
|
},
|
|
{
|
|
"entropy": 5.016417169570923,
|
|
"epoch": 4.072046109510087,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003435303402650906,
|
|
"loss": 4.4838,
|
|
"mean_token_accuracy": 0.2664680197834969,
|
|
"num_tokens": 97208721.0,
|
|
"step": 42390
|
|
},
|
|
{
|
|
"entropy": 5.039650821685791,
|
|
"epoch": 4.07252641690682,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000343497684829289,
|
|
"loss": 4.6121,
|
|
"mean_token_accuracy": 0.25309834331274034,
|
|
"num_tokens": 97220487.0,
|
|
"step": 42395
|
|
},
|
|
{
|
|
"entropy": 4.9389503479003904,
|
|
"epoch": 4.073006724303554,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003434650278031744,
|
|
"loss": 4.4593,
|
|
"mean_token_accuracy": 0.26995796263217925,
|
|
"num_tokens": 97231995.0,
|
|
"step": 42400
|
|
},
|
|
{
|
|
"entropy": 4.975099086761475,
|
|
"epoch": 4.073487031700288,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034343236918750523,
|
|
"loss": 4.4065,
|
|
"mean_token_accuracy": 0.2766061663627625,
|
|
"num_tokens": 97243168.0,
|
|
"step": 42405
|
|
},
|
|
{
|
|
"entropy": 4.9976222038269045,
|
|
"epoch": 4.073967339097022,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034339970898303963,
|
|
"loss": 4.5135,
|
|
"mean_token_accuracy": 0.26653551161289213,
|
|
"num_tokens": 97254361.0,
|
|
"step": 42410
|
|
},
|
|
{
|
|
"entropy": 5.017204332351684,
|
|
"epoch": 4.074447646493756,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00034336704719053584,
|
|
"loss": 4.446,
|
|
"mean_token_accuracy": 0.2676748514175415,
|
|
"num_tokens": 97265535.0,
|
|
"step": 42415
|
|
},
|
|
{
|
|
"entropy": 4.8847493648529055,
|
|
"epoch": 4.07492795389049,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003433343838107523,
|
|
"loss": 4.471,
|
|
"mean_token_accuracy": 0.2753725737333298,
|
|
"num_tokens": 97277426.0,
|
|
"step": 42420
|
|
},
|
|
{
|
|
"entropy": 4.905474424362183,
|
|
"epoch": 4.075408261287224,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034330171884444725,
|
|
"loss": 4.4349,
|
|
"mean_token_accuracy": 0.26689328253269196,
|
|
"num_tokens": 97288897.0,
|
|
"step": 42425
|
|
},
|
|
{
|
|
"entropy": 4.987169075012207,
|
|
"epoch": 4.075888568683958,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034326905229237914,
|
|
"loss": 4.4997,
|
|
"mean_token_accuracy": 0.26009591668844223,
|
|
"num_tokens": 97299288.0,
|
|
"step": 42430
|
|
},
|
|
{
|
|
"entropy": 5.088150072097778,
|
|
"epoch": 4.076368876080692,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00034323638415530627,
|
|
"loss": 4.5019,
|
|
"mean_token_accuracy": 0.26865658462047576,
|
|
"num_tokens": 97309122.0,
|
|
"step": 42435
|
|
},
|
|
{
|
|
"entropy": 4.9162839412689205,
|
|
"epoch": 4.076849183477425,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003432037144339872,
|
|
"loss": 4.4317,
|
|
"mean_token_accuracy": 0.26672160178422927,
|
|
"num_tokens": 97320390.0,
|
|
"step": 42440
|
|
},
|
|
{
|
|
"entropy": 4.978461360931396,
|
|
"epoch": 4.07732949087416,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00034317104312918054,
|
|
"loss": 4.5244,
|
|
"mean_token_accuracy": 0.25710040181875227,
|
|
"num_tokens": 97332062.0,
|
|
"step": 42445
|
|
},
|
|
{
|
|
"entropy": 4.984638977050781,
|
|
"epoch": 4.077809798270893,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003431383702416445,
|
|
"loss": 4.4261,
|
|
"mean_token_accuracy": 0.26824576407670975,
|
|
"num_tokens": 97343197.0,
|
|
"step": 42450
|
|
},
|
|
{
|
|
"entropy": 5.0364179611206055,
|
|
"epoch": 4.0782901056676275,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034310569577213795,
|
|
"loss": 4.4807,
|
|
"mean_token_accuracy": 0.2665139317512512,
|
|
"num_tokens": 97354667.0,
|
|
"step": 42455
|
|
},
|
|
{
|
|
"entropy": 5.110366535186768,
|
|
"epoch": 4.078770413064361,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003430730197214193,
|
|
"loss": 4.6359,
|
|
"mean_token_accuracy": 0.26057023257017137,
|
|
"num_tokens": 97365011.0,
|
|
"step": 42460
|
|
},
|
|
{
|
|
"entropy": 4.981812381744385,
|
|
"epoch": 4.079250720461095,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003430403420902473,
|
|
"loss": 4.5264,
|
|
"mean_token_accuracy": 0.2604891523718834,
|
|
"num_tokens": 97376285.0,
|
|
"step": 42465
|
|
},
|
|
{
|
|
"entropy": 4.9716273784637455,
|
|
"epoch": 4.079731027857829,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00034300766287938066,
|
|
"loss": 4.5282,
|
|
"mean_token_accuracy": 0.26457744687795637,
|
|
"num_tokens": 97387901.0,
|
|
"step": 42470
|
|
},
|
|
{
|
|
"entropy": 4.98234806060791,
|
|
"epoch": 4.080211335254563,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034297498208957797,
|
|
"loss": 4.4549,
|
|
"mean_token_accuracy": 0.27255129218101504,
|
|
"num_tokens": 97399854.0,
|
|
"step": 42475
|
|
},
|
|
{
|
|
"entropy": 4.967026615142823,
|
|
"epoch": 4.0806916426512965,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000342942299721598,
|
|
"loss": 4.4629,
|
|
"mean_token_accuracy": 0.26456455141305923,
|
|
"num_tokens": 97410867.0,
|
|
"step": 42480
|
|
},
|
|
{
|
|
"entropy": 4.963451862335205,
|
|
"epoch": 4.081171950048031,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0003429096157761997,
|
|
"loss": 4.5417,
|
|
"mean_token_accuracy": 0.2598129317164421,
|
|
"num_tokens": 97422890.0,
|
|
"step": 42485
|
|
},
|
|
{
|
|
"entropy": 4.949570178985596,
|
|
"epoch": 4.081652257444764,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003428769302541416,
|
|
"loss": 4.4494,
|
|
"mean_token_accuracy": 0.2680146858096123,
|
|
"num_tokens": 97434465.0,
|
|
"step": 42490
|
|
},
|
|
{
|
|
"entropy": 4.970086717605591,
|
|
"epoch": 4.082132564841499,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003428442431561828,
|
|
"loss": 4.4211,
|
|
"mean_token_accuracy": 0.27310545593500135,
|
|
"num_tokens": 97445358.0,
|
|
"step": 42495
|
|
},
|
|
{
|
|
"entropy": 4.969310903549195,
|
|
"epoch": 4.082612872238233,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003428115544830823,
|
|
"loss": 4.4439,
|
|
"mean_token_accuracy": 0.2645836561918259,
|
|
"num_tokens": 97456120.0,
|
|
"step": 42500
|
|
},
|
|
{
|
|
"entropy": 4.98270845413208,
|
|
"epoch": 4.083093179634966,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00034277886423559863,
|
|
"loss": 4.5092,
|
|
"mean_token_accuracy": 0.2639601990580559,
|
|
"num_tokens": 97467377.0,
|
|
"step": 42505
|
|
},
|
|
{
|
|
"entropy": 4.961084079742432,
|
|
"epoch": 4.083573487031701,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034274617241449105,
|
|
"loss": 4.5314,
|
|
"mean_token_accuracy": 0.2590774044394493,
|
|
"num_tokens": 97479250.0,
|
|
"step": 42510
|
|
},
|
|
{
|
|
"entropy": 5.060076904296875,
|
|
"epoch": 4.084053794428434,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00034271347902051864,
|
|
"loss": 4.5613,
|
|
"mean_token_accuracy": 0.2598003834486008,
|
|
"num_tokens": 97491157.0,
|
|
"step": 42515
|
|
},
|
|
{
|
|
"entropy": 4.921099996566772,
|
|
"epoch": 4.084534101825168,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003426807840544401,
|
|
"loss": 4.3959,
|
|
"mean_token_accuracy": 0.2677940919995308,
|
|
"num_tokens": 97503075.0,
|
|
"step": 42520
|
|
},
|
|
{
|
|
"entropy": 5.047155570983887,
|
|
"epoch": 4.085014409221902,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003426480875170149,
|
|
"loss": 4.6012,
|
|
"mean_token_accuracy": 0.2509763866662979,
|
|
"num_tokens": 97514026.0,
|
|
"step": 42525
|
|
},
|
|
{
|
|
"entropy": 5.0084693908691404,
|
|
"epoch": 4.085494716618636,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034261538940900195,
|
|
"loss": 4.4747,
|
|
"mean_token_accuracy": 0.26553067862987517,
|
|
"num_tokens": 97526866.0,
|
|
"step": 42530
|
|
},
|
|
{
|
|
"entropy": 5.028721332550049,
|
|
"epoch": 4.08597502401537,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003425826897311604,
|
|
"loss": 4.5849,
|
|
"mean_token_accuracy": 0.25570117235183715,
|
|
"num_tokens": 97538632.0,
|
|
"step": 42535
|
|
},
|
|
{
|
|
"entropy": 5.021993112564087,
|
|
"epoch": 4.086455331412104,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034254998848424947,
|
|
"loss": 4.4773,
|
|
"mean_token_accuracy": 0.2632177799940109,
|
|
"num_tokens": 97549288.0,
|
|
"step": 42540
|
|
},
|
|
{
|
|
"entropy": 5.009607219696045,
|
|
"epoch": 4.086935638808837,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003425172856690284,
|
|
"loss": 4.5613,
|
|
"mean_token_accuracy": 0.2592760235071182,
|
|
"num_tokens": 97560582.0,
|
|
"step": 42545
|
|
},
|
|
{
|
|
"entropy": 4.950411796569824,
|
|
"epoch": 4.087415946205572,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034248458128625643,
|
|
"loss": 4.4198,
|
|
"mean_token_accuracy": 0.265521377325058,
|
|
"num_tokens": 97571974.0,
|
|
"step": 42550
|
|
},
|
|
{
|
|
"entropy": 4.958393812179565,
|
|
"epoch": 4.087896253602305,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034245187533669285,
|
|
"loss": 4.4496,
|
|
"mean_token_accuracy": 0.2706846475601196,
|
|
"num_tokens": 97582456.0,
|
|
"step": 42555
|
|
},
|
|
{
|
|
"entropy": 5.05852632522583,
|
|
"epoch": 4.0883765609990395,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00034241916782109714,
|
|
"loss": 4.5595,
|
|
"mean_token_accuracy": 0.26386343240737914,
|
|
"num_tokens": 97593246.0,
|
|
"step": 42560
|
|
},
|
|
{
|
|
"entropy": 4.880468463897705,
|
|
"epoch": 4.088856868395773,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00034238645874022845,
|
|
"loss": 4.3539,
|
|
"mean_token_accuracy": 0.27736635953187944,
|
|
"num_tokens": 97604004.0,
|
|
"step": 42565
|
|
},
|
|
{
|
|
"entropy": 5.0087317943573,
|
|
"epoch": 4.089337175792507,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.00034235374809484626,
|
|
"loss": 4.5776,
|
|
"mean_token_accuracy": 0.2584461584687233,
|
|
"num_tokens": 97616122.0,
|
|
"step": 42570
|
|
},
|
|
{
|
|
"entropy": 4.98045802116394,
|
|
"epoch": 4.0898174831892415,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003423210358857101,
|
|
"loss": 4.4345,
|
|
"mean_token_accuracy": 0.26918732970952985,
|
|
"num_tokens": 97627550.0,
|
|
"step": 42575
|
|
},
|
|
{
|
|
"entropy": 5.074587726593018,
|
|
"epoch": 4.090297790585975,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.00034228832211357935,
|
|
"loss": 4.5808,
|
|
"mean_token_accuracy": 0.25080201476812364,
|
|
"num_tokens": 97639952.0,
|
|
"step": 42580
|
|
},
|
|
{
|
|
"entropy": 4.964069795608521,
|
|
"epoch": 4.090778097982709,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.00034225560677921354,
|
|
"loss": 4.4166,
|
|
"mean_token_accuracy": 0.26915917694568636,
|
|
"num_tokens": 97653158.0,
|
|
"step": 42585
|
|
},
|
|
{
|
|
"entropy": 5.050472068786621,
|
|
"epoch": 4.091258405379443,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00034222288988337233,
|
|
"loss": 4.5631,
|
|
"mean_token_accuracy": 0.26312260180711744,
|
|
"num_tokens": 97666768.0,
|
|
"step": 42590
|
|
},
|
|
{
|
|
"entropy": 5.036763334274292,
|
|
"epoch": 4.091738712776177,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003421901714268152,
|
|
"loss": 4.4931,
|
|
"mean_token_accuracy": 0.2641834497451782,
|
|
"num_tokens": 97678128.0,
|
|
"step": 42595
|
|
},
|
|
{
|
|
"entropy": 5.028664684295654,
|
|
"epoch": 4.0922190201729105,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00034215745141030177,
|
|
"loss": 4.5586,
|
|
"mean_token_accuracy": 0.2613097831606865,
|
|
"num_tokens": 97689565.0,
|
|
"step": 42600
|
|
},
|
|
{
|
|
"entropy": 4.951626491546631,
|
|
"epoch": 4.092699327569645,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034212472983459176,
|
|
"loss": 4.4551,
|
|
"mean_token_accuracy": 0.27016458064317705,
|
|
"num_tokens": 97701315.0,
|
|
"step": 42605
|
|
},
|
|
{
|
|
"entropy": 5.05564661026001,
|
|
"epoch": 4.093179634966378,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003420920067004448,
|
|
"loss": 4.5449,
|
|
"mean_token_accuracy": 0.2623819887638092,
|
|
"num_tokens": 97712404.0,
|
|
"step": 42610
|
|
},
|
|
{
|
|
"entropy": 4.987038850784302,
|
|
"epoch": 4.093659942363113,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003420592820086207,
|
|
"loss": 4.4491,
|
|
"mean_token_accuracy": 0.2659685224294662,
|
|
"num_tokens": 97723547.0,
|
|
"step": 42615
|
|
},
|
|
{
|
|
"entropy": 4.969598054885864,
|
|
"epoch": 4.094140249759846,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00034202655575987925,
|
|
"loss": 4.4594,
|
|
"mean_token_accuracy": 0.2739225745201111,
|
|
"num_tokens": 97734133.0,
|
|
"step": 42620
|
|
},
|
|
{
|
|
"entropy": 5.065438508987427,
|
|
"epoch": 4.09462055715658,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00034199382795498015,
|
|
"loss": 4.564,
|
|
"mean_token_accuracy": 0.2542939558625221,
|
|
"num_tokens": 97746479.0,
|
|
"step": 42625
|
|
},
|
|
{
|
|
"entropy": 5.025899887084961,
|
|
"epoch": 4.095100864553314,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00034196109859468325,
|
|
"loss": 4.5028,
|
|
"mean_token_accuracy": 0.2655098468065262,
|
|
"num_tokens": 97758042.0,
|
|
"step": 42630
|
|
},
|
|
{
|
|
"entropy": 4.984849071502685,
|
|
"epoch": 4.095581171950048,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003419283676797485,
|
|
"loss": 4.4975,
|
|
"mean_token_accuracy": 0.2723266795277596,
|
|
"num_tokens": 97770338.0,
|
|
"step": 42635
|
|
},
|
|
{
|
|
"entropy": 4.985719442367554,
|
|
"epoch": 4.0960614793467816,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00034189563521093587,
|
|
"loss": 4.4943,
|
|
"mean_token_accuracy": 0.2632278576493263,
|
|
"num_tokens": 97781274.0,
|
|
"step": 42640
|
|
},
|
|
{
|
|
"entropy": 5.09106125831604,
|
|
"epoch": 4.096541786743516,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003418629011890051,
|
|
"loss": 4.5796,
|
|
"mean_token_accuracy": 0.25866803973913194,
|
|
"num_tokens": 97792241.0,
|
|
"step": 42645
|
|
},
|
|
{
|
|
"entropy": 5.03477635383606,
|
|
"epoch": 4.09702209414025,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00034183016561471644,
|
|
"loss": 4.5372,
|
|
"mean_token_accuracy": 0.26360258758068084,
|
|
"num_tokens": 97803315.0,
|
|
"step": 42650
|
|
},
|
|
{
|
|
"entropy": 5.120706987380982,
|
|
"epoch": 4.097502401536984,
|
|
"grad_norm": 0.87109375,
|
|
"learning_rate": 0.00034179742848882967,
|
|
"loss": 4.6599,
|
|
"mean_token_accuracy": 0.2512038692831993,
|
|
"num_tokens": 97815138.0,
|
|
"step": 42655
|
|
},
|
|
{
|
|
"entropy": 4.998038339614868,
|
|
"epoch": 4.097982708933718,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00034176468981210494,
|
|
"loss": 4.4606,
|
|
"mean_token_accuracy": 0.2696879908442497,
|
|
"num_tokens": 97826846.0,
|
|
"step": 42660
|
|
},
|
|
{
|
|
"entropy": 5.040591621398926,
|
|
"epoch": 4.098463016330451,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003417319495853025,
|
|
"loss": 4.5252,
|
|
"mean_token_accuracy": 0.2614794492721558,
|
|
"num_tokens": 97838561.0,
|
|
"step": 42665
|
|
},
|
|
{
|
|
"entropy": 5.040173530578613,
|
|
"epoch": 4.098943323727186,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003416992078091823,
|
|
"loss": 4.5087,
|
|
"mean_token_accuracy": 0.2630004435777664,
|
|
"num_tokens": 97850306.0,
|
|
"step": 42670
|
|
},
|
|
{
|
|
"entropy": 5.012949419021607,
|
|
"epoch": 4.099423631123919,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003416664644845044,
|
|
"loss": 4.507,
|
|
"mean_token_accuracy": 0.2609300449490547,
|
|
"num_tokens": 97862193.0,
|
|
"step": 42675
|
|
},
|
|
{
|
|
"entropy": 5.0282610893249515,
|
|
"epoch": 4.0999039385206535,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00034163371961202935,
|
|
"loss": 4.5468,
|
|
"mean_token_accuracy": 0.2541579008102417,
|
|
"num_tokens": 97874557.0,
|
|
"step": 42680
|
|
},
|
|
{
|
|
"entropy": 5.027649974822998,
|
|
"epoch": 4.100384245917387,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034160097319251714,
|
|
"loss": 4.5496,
|
|
"mean_token_accuracy": 0.26270129680633547,
|
|
"num_tokens": 97885334.0,
|
|
"step": 42685
|
|
},
|
|
{
|
|
"entropy": 5.099428462982178,
|
|
"epoch": 4.100864553314121,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00034156822522672805,
|
|
"loss": 4.6047,
|
|
"mean_token_accuracy": 0.2521642237901688,
|
|
"num_tokens": 97896857.0,
|
|
"step": 42690
|
|
},
|
|
{
|
|
"entropy": 4.965286684036255,
|
|
"epoch": 4.101344860710855,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003415354757154225,
|
|
"loss": 4.445,
|
|
"mean_token_accuracy": 0.2625503957271576,
|
|
"num_tokens": 97908172.0,
|
|
"step": 42695
|
|
},
|
|
{
|
|
"entropy": 4.97789740562439,
|
|
"epoch": 4.101825168107589,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00034150272465936066,
|
|
"loss": 4.4419,
|
|
"mean_token_accuracy": 0.270804063975811,
|
|
"num_tokens": 97920176.0,
|
|
"step": 42700
|
|
},
|
|
{
|
|
"entropy": 4.960748815536499,
|
|
"epoch": 4.1023054755043225,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00034146997205930307,
|
|
"loss": 4.4859,
|
|
"mean_token_accuracy": 0.26222688555717466,
|
|
"num_tokens": 97932225.0,
|
|
"step": 42705
|
|
},
|
|
{
|
|
"entropy": 4.991958665847778,
|
|
"epoch": 4.102785782901057,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003414372179160101,
|
|
"loss": 4.4769,
|
|
"mean_token_accuracy": 0.2658188983798027,
|
|
"num_tokens": 97942457.0,
|
|
"step": 42710
|
|
},
|
|
{
|
|
"entropy": 5.072036266326904,
|
|
"epoch": 4.10326609029779,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003414044622302424,
|
|
"loss": 4.5914,
|
|
"mean_token_accuracy": 0.2653363674879074,
|
|
"num_tokens": 97954278.0,
|
|
"step": 42715
|
|
},
|
|
{
|
|
"entropy": 5.060891103744507,
|
|
"epoch": 4.1037463976945245,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00034137170500276004,
|
|
"loss": 4.5124,
|
|
"mean_token_accuracy": 0.25751264542341235,
|
|
"num_tokens": 97966441.0,
|
|
"step": 42720
|
|
},
|
|
{
|
|
"entropy": 5.075611352920532,
|
|
"epoch": 4.104226705091258,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003413389462343238,
|
|
"loss": 4.4753,
|
|
"mean_token_accuracy": 0.2691924929618835,
|
|
"num_tokens": 97976810.0,
|
|
"step": 42725
|
|
},
|
|
{
|
|
"entropy": 4.98665041923523,
|
|
"epoch": 4.104707012487992,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003413061859256943,
|
|
"loss": 4.4782,
|
|
"mean_token_accuracy": 0.26432646363973616,
|
|
"num_tokens": 97989012.0,
|
|
"step": 42730
|
|
},
|
|
{
|
|
"entropy": 5.112366676330566,
|
|
"epoch": 4.105187319884727,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00034127342407763196,
|
|
"loss": 4.6361,
|
|
"mean_token_accuracy": 0.2484620362520218,
|
|
"num_tokens": 98000280.0,
|
|
"step": 42735
|
|
},
|
|
{
|
|
"entropy": 5.115489101409912,
|
|
"epoch": 4.10566762728146,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003412406606908976,
|
|
"loss": 4.5699,
|
|
"mean_token_accuracy": 0.2582164376974106,
|
|
"num_tokens": 98012097.0,
|
|
"step": 42740
|
|
},
|
|
{
|
|
"entropy": 5.092605447769165,
|
|
"epoch": 4.106147934678194,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003412078957662517,
|
|
"loss": 4.5973,
|
|
"mean_token_accuracy": 0.2516476631164551,
|
|
"num_tokens": 98022602.0,
|
|
"step": 42745
|
|
},
|
|
{
|
|
"entropy": 5.048404741287231,
|
|
"epoch": 4.106628242074928,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00034117512930445504,
|
|
"loss": 4.5485,
|
|
"mean_token_accuracy": 0.26056815683841705,
|
|
"num_tokens": 98034535.0,
|
|
"step": 42750
|
|
},
|
|
{
|
|
"entropy": 4.966902589797973,
|
|
"epoch": 4.107108549471662,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00034114236130626855,
|
|
"loss": 4.4762,
|
|
"mean_token_accuracy": 0.264396433532238,
|
|
"num_tokens": 98046525.0,
|
|
"step": 42755
|
|
},
|
|
{
|
|
"entropy": 5.065855264663696,
|
|
"epoch": 4.107588856868396,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003411095917724527,
|
|
"loss": 4.5447,
|
|
"mean_token_accuracy": 0.2669515565037727,
|
|
"num_tokens": 98057540.0,
|
|
"step": 42760
|
|
},
|
|
{
|
|
"entropy": 5.03923487663269,
|
|
"epoch": 4.10806916426513,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00034107682070376845,
|
|
"loss": 4.5232,
|
|
"mean_token_accuracy": 0.2744559422135353,
|
|
"num_tokens": 98068578.0,
|
|
"step": 42765
|
|
},
|
|
{
|
|
"entropy": 4.991632175445557,
|
|
"epoch": 4.108549471661863,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00034104404810097663,
|
|
"loss": 4.4703,
|
|
"mean_token_accuracy": 0.2635844722390175,
|
|
"num_tokens": 98079981.0,
|
|
"step": 42770
|
|
},
|
|
{
|
|
"entropy": 5.0025221824646,
|
|
"epoch": 4.109029779058598,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003410112739648381,
|
|
"loss": 4.5093,
|
|
"mean_token_accuracy": 0.26801645904779436,
|
|
"num_tokens": 98091579.0,
|
|
"step": 42775
|
|
},
|
|
{
|
|
"entropy": 5.010376644134522,
|
|
"epoch": 4.109510086455331,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034097849829611387,
|
|
"loss": 4.5687,
|
|
"mean_token_accuracy": 0.2568126142024994,
|
|
"num_tokens": 98103239.0,
|
|
"step": 42780
|
|
},
|
|
{
|
|
"entropy": 5.013103675842285,
|
|
"epoch": 4.109990393852065,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00034094572109556477,
|
|
"loss": 4.4813,
|
|
"mean_token_accuracy": 0.26476102471351626,
|
|
"num_tokens": 98114865.0,
|
|
"step": 42785
|
|
},
|
|
{
|
|
"entropy": 4.988552522659302,
|
|
"epoch": 4.110470701248799,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00034091294236395186,
|
|
"loss": 4.4583,
|
|
"mean_token_accuracy": 0.2595075473189354,
|
|
"num_tokens": 98125831.0,
|
|
"step": 42790
|
|
},
|
|
{
|
|
"entropy": 4.968686485290528,
|
|
"epoch": 4.110951008645533,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00034088016210203617,
|
|
"loss": 4.4706,
|
|
"mean_token_accuracy": 0.2657591924071312,
|
|
"num_tokens": 98136129.0,
|
|
"step": 42795
|
|
},
|
|
{
|
|
"entropy": 5.052416515350342,
|
|
"epoch": 4.111431316042267,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003408473803105788,
|
|
"loss": 4.576,
|
|
"mean_token_accuracy": 0.2563589125871658,
|
|
"num_tokens": 98148022.0,
|
|
"step": 42800
|
|
},
|
|
{
|
|
"entropy": 5.021619367599487,
|
|
"epoch": 4.111911623439001,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003408145969903406,
|
|
"loss": 4.4337,
|
|
"mean_token_accuracy": 0.2629475861787796,
|
|
"num_tokens": 98159567.0,
|
|
"step": 42805
|
|
},
|
|
{
|
|
"entropy": 5.083532047271729,
|
|
"epoch": 4.112391930835735,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00034078181214208304,
|
|
"loss": 4.5956,
|
|
"mean_token_accuracy": 0.2597316011786461,
|
|
"num_tokens": 98171957.0,
|
|
"step": 42810
|
|
},
|
|
{
|
|
"entropy": 4.935460138320923,
|
|
"epoch": 4.112872238232469,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003407490257665671,
|
|
"loss": 4.4342,
|
|
"mean_token_accuracy": 0.27261184304952624,
|
|
"num_tokens": 98184553.0,
|
|
"step": 42815
|
|
},
|
|
{
|
|
"entropy": 5.064100742340088,
|
|
"epoch": 4.113352545629203,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000340716237864554,
|
|
"loss": 4.5769,
|
|
"mean_token_accuracy": 0.24941230267286302,
|
|
"num_tokens": 98196510.0,
|
|
"step": 42820
|
|
},
|
|
{
|
|
"entropy": 5.076871633529663,
|
|
"epoch": 4.1138328530259365,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00034068344843680493,
|
|
"loss": 4.6116,
|
|
"mean_token_accuracy": 0.2528106853365898,
|
|
"num_tokens": 98207234.0,
|
|
"step": 42825
|
|
},
|
|
{
|
|
"entropy": 4.9937238693237305,
|
|
"epoch": 4.114313160422671,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00034065065748408135,
|
|
"loss": 4.4372,
|
|
"mean_token_accuracy": 0.26532976478338244,
|
|
"num_tokens": 98218853.0,
|
|
"step": 42830
|
|
},
|
|
{
|
|
"entropy": 5.012453651428222,
|
|
"epoch": 4.114793467819404,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00034061786500714434,
|
|
"loss": 4.5213,
|
|
"mean_token_accuracy": 0.26423812061548235,
|
|
"num_tokens": 98230668.0,
|
|
"step": 42835
|
|
},
|
|
{
|
|
"entropy": 4.951230049133301,
|
|
"epoch": 4.115273775216139,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00034058507100675545,
|
|
"loss": 4.4251,
|
|
"mean_token_accuracy": 0.2697641968727112,
|
|
"num_tokens": 98241982.0,
|
|
"step": 42840
|
|
},
|
|
{
|
|
"entropy": 5.044630765914917,
|
|
"epoch": 4.115754082612872,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00034055227548367595,
|
|
"loss": 4.5508,
|
|
"mean_token_accuracy": 0.2572688519954681,
|
|
"num_tokens": 98253726.0,
|
|
"step": 42845
|
|
},
|
|
{
|
|
"entropy": 5.080479717254638,
|
|
"epoch": 4.116234390009606,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003405194784386672,
|
|
"loss": 4.4983,
|
|
"mean_token_accuracy": 0.26118388324975966,
|
|
"num_tokens": 98264290.0,
|
|
"step": 42850
|
|
},
|
|
{
|
|
"entropy": 4.997405624389648,
|
|
"epoch": 4.11671469740634,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003404866798724907,
|
|
"loss": 4.459,
|
|
"mean_token_accuracy": 0.26535205245018006,
|
|
"num_tokens": 98276175.0,
|
|
"step": 42855
|
|
},
|
|
{
|
|
"entropy": 4.885718107223511,
|
|
"epoch": 4.117195004803074,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000340453879785908,
|
|
"loss": 4.4256,
|
|
"mean_token_accuracy": 0.26740616410970686,
|
|
"num_tokens": 98287023.0,
|
|
"step": 42860
|
|
},
|
|
{
|
|
"entropy": 4.972117614746094,
|
|
"epoch": 4.1176753121998075,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00034042107817968054,
|
|
"loss": 4.4444,
|
|
"mean_token_accuracy": 0.27157861888408663,
|
|
"num_tokens": 98297576.0,
|
|
"step": 42865
|
|
},
|
|
{
|
|
"entropy": 4.958441495895386,
|
|
"epoch": 4.118155619596542,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034038827505456995,
|
|
"loss": 4.4963,
|
|
"mean_token_accuracy": 0.2617960095405579,
|
|
"num_tokens": 98308951.0,
|
|
"step": 42870
|
|
},
|
|
{
|
|
"entropy": 5.016416740417481,
|
|
"epoch": 4.118635926993275,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003403554704113378,
|
|
"loss": 4.5187,
|
|
"mean_token_accuracy": 0.2640099138021469,
|
|
"num_tokens": 98320696.0,
|
|
"step": 42875
|
|
},
|
|
{
|
|
"entropy": 5.057635402679443,
|
|
"epoch": 4.11911623439001,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003403226642507456,
|
|
"loss": 4.5112,
|
|
"mean_token_accuracy": 0.2616667687892914,
|
|
"num_tokens": 98331778.0,
|
|
"step": 42880
|
|
},
|
|
{
|
|
"entropy": 5.070875930786133,
|
|
"epoch": 4.119596541786744,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00034028985657355516,
|
|
"loss": 4.5157,
|
|
"mean_token_accuracy": 0.26513914465904237,
|
|
"num_tokens": 98343142.0,
|
|
"step": 42885
|
|
},
|
|
{
|
|
"entropy": 5.024390363693238,
|
|
"epoch": 4.120076849183477,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003402570473805281,
|
|
"loss": 4.599,
|
|
"mean_token_accuracy": 0.2581349164247513,
|
|
"num_tokens": 98354907.0,
|
|
"step": 42890
|
|
},
|
|
{
|
|
"entropy": 5.049481296539307,
|
|
"epoch": 4.120557156580212,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003402242366724262,
|
|
"loss": 4.6297,
|
|
"mean_token_accuracy": 0.2564326286315918,
|
|
"num_tokens": 98366846.0,
|
|
"step": 42895
|
|
},
|
|
{
|
|
"entropy": 5.0509930610656735,
|
|
"epoch": 4.121037463976945,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00034019142445001116,
|
|
"loss": 4.527,
|
|
"mean_token_accuracy": 0.265747132897377,
|
|
"num_tokens": 98379572.0,
|
|
"step": 42900
|
|
},
|
|
{
|
|
"entropy": 5.014222192764282,
|
|
"epoch": 4.1215177713736795,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00034015861071404484,
|
|
"loss": 4.5292,
|
|
"mean_token_accuracy": 0.26229298710823057,
|
|
"num_tokens": 98391151.0,
|
|
"step": 42905
|
|
},
|
|
{
|
|
"entropy": 4.992162275314331,
|
|
"epoch": 4.121998078770413,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003401257954652891,
|
|
"loss": 4.4706,
|
|
"mean_token_accuracy": 0.27066550850868226,
|
|
"num_tokens": 98402012.0,
|
|
"step": 42910
|
|
},
|
|
{
|
|
"entropy": 4.962578296661377,
|
|
"epoch": 4.122478386167147,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00034009297870450574,
|
|
"loss": 4.4297,
|
|
"mean_token_accuracy": 0.2641265198588371,
|
|
"num_tokens": 98413533.0,
|
|
"step": 42915
|
|
},
|
|
{
|
|
"entropy": 4.898211908340454,
|
|
"epoch": 4.122958693563881,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003400601604324567,
|
|
"loss": 4.3965,
|
|
"mean_token_accuracy": 0.27102688550949094,
|
|
"num_tokens": 98425056.0,
|
|
"step": 42920
|
|
},
|
|
{
|
|
"entropy": 5.057673597335816,
|
|
"epoch": 4.123439000960615,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00034002734064990386,
|
|
"loss": 4.5434,
|
|
"mean_token_accuracy": 0.2638516888022423,
|
|
"num_tokens": 98436768.0,
|
|
"step": 42925
|
|
},
|
|
{
|
|
"entropy": 5.094536066055298,
|
|
"epoch": 4.123919308357348,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003399945193576093,
|
|
"loss": 4.5831,
|
|
"mean_token_accuracy": 0.2572285458445549,
|
|
"num_tokens": 98449360.0,
|
|
"step": 42930
|
|
},
|
|
{
|
|
"entropy": 4.981645202636718,
|
|
"epoch": 4.124399615754083,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003399616965563351,
|
|
"loss": 4.4753,
|
|
"mean_token_accuracy": 0.26285948008298876,
|
|
"num_tokens": 98461717.0,
|
|
"step": 42935
|
|
},
|
|
{
|
|
"entropy": 4.993545055389404,
|
|
"epoch": 4.124879923150816,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003399288722468431,
|
|
"loss": 4.4745,
|
|
"mean_token_accuracy": 0.2665898233652115,
|
|
"num_tokens": 98472186.0,
|
|
"step": 42940
|
|
},
|
|
{
|
|
"entropy": 5.042765522003174,
|
|
"epoch": 4.1253602305475505,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003398960464298955,
|
|
"loss": 4.5119,
|
|
"mean_token_accuracy": 0.26460084319114685,
|
|
"num_tokens": 98483266.0,
|
|
"step": 42945
|
|
},
|
|
{
|
|
"entropy": 4.9583411693573,
|
|
"epoch": 4.125840537944284,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003398632191062544,
|
|
"loss": 4.4091,
|
|
"mean_token_accuracy": 0.27046269476413726,
|
|
"num_tokens": 98494984.0,
|
|
"step": 42950
|
|
},
|
|
{
|
|
"entropy": 4.968841648101806,
|
|
"epoch": 4.126320845341018,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000339830390276682,
|
|
"loss": 4.457,
|
|
"mean_token_accuracy": 0.27157214283943176,
|
|
"num_tokens": 98506564.0,
|
|
"step": 42955
|
|
},
|
|
{
|
|
"entropy": 4.946338367462158,
|
|
"epoch": 4.126801152737752,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.00033979755994194033,
|
|
"loss": 4.4603,
|
|
"mean_token_accuracy": 0.2674534648656845,
|
|
"num_tokens": 98519732.0,
|
|
"step": 42960
|
|
},
|
|
{
|
|
"entropy": 5.085115909576416,
|
|
"epoch": 4.127281460134486,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003397647281027919,
|
|
"loss": 4.574,
|
|
"mean_token_accuracy": 0.25395711362361906,
|
|
"num_tokens": 98530271.0,
|
|
"step": 42965
|
|
},
|
|
{
|
|
"entropy": 4.985479164123535,
|
|
"epoch": 4.12776176753122,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003397318947599987,
|
|
"loss": 4.4315,
|
|
"mean_token_accuracy": 0.2724211886525154,
|
|
"num_tokens": 98540178.0,
|
|
"step": 42970
|
|
},
|
|
{
|
|
"entropy": 5.020646333694458,
|
|
"epoch": 4.128242074927954,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00033969905991432307,
|
|
"loss": 4.5359,
|
|
"mean_token_accuracy": 0.26010872721672057,
|
|
"num_tokens": 98551513.0,
|
|
"step": 42975
|
|
},
|
|
{
|
|
"entropy": 5.07339997291565,
|
|
"epoch": 4.128722382324688,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00033966622356652746,
|
|
"loss": 4.6,
|
|
"mean_token_accuracy": 0.2589086890220642,
|
|
"num_tokens": 98563971.0,
|
|
"step": 42980
|
|
},
|
|
{
|
|
"entropy": 5.0909340381622314,
|
|
"epoch": 4.129202689721422,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003396333857173741,
|
|
"loss": 4.5263,
|
|
"mean_token_accuracy": 0.2637458696961403,
|
|
"num_tokens": 98575348.0,
|
|
"step": 42985
|
|
},
|
|
{
|
|
"entropy": 5.0135705947875975,
|
|
"epoch": 4.129682997118156,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003396005463676255,
|
|
"loss": 4.4978,
|
|
"mean_token_accuracy": 0.26002163290977476,
|
|
"num_tokens": 98586238.0,
|
|
"step": 42990
|
|
},
|
|
{
|
|
"entropy": 4.9424135208129885,
|
|
"epoch": 4.130163304514889,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000339567705518044,
|
|
"loss": 4.4741,
|
|
"mean_token_accuracy": 0.26413360238075256,
|
|
"num_tokens": 98599051.0,
|
|
"step": 42995
|
|
},
|
|
{
|
|
"entropy": 4.981280040740967,
|
|
"epoch": 4.130643611911624,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033953486316939214,
|
|
"loss": 4.4548,
|
|
"mean_token_accuracy": 0.26747416108846667,
|
|
"num_tokens": 98610526.0,
|
|
"step": 43000
|
|
},
|
|
{
|
|
"entropy": 4.930198431015015,
|
|
"epoch": 4.131123919308357,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00033950201932243237,
|
|
"loss": 4.3541,
|
|
"mean_token_accuracy": 0.27494517862796786,
|
|
"num_tokens": 98621720.0,
|
|
"step": 43005
|
|
},
|
|
{
|
|
"entropy": 4.952626371383667,
|
|
"epoch": 4.131604226705091,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00033946917397792724,
|
|
"loss": 4.499,
|
|
"mean_token_accuracy": 0.2688994541764259,
|
|
"num_tokens": 98632248.0,
|
|
"step": 43010
|
|
},
|
|
{
|
|
"entropy": 4.938734245300293,
|
|
"epoch": 4.132084534101825,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003394363271366393,
|
|
"loss": 4.4873,
|
|
"mean_token_accuracy": 0.2679453641176224,
|
|
"num_tokens": 98643097.0,
|
|
"step": 43015
|
|
},
|
|
{
|
|
"entropy": 4.967605543136597,
|
|
"epoch": 4.132564841498559,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003394034787993312,
|
|
"loss": 4.3826,
|
|
"mean_token_accuracy": 0.2739040642976761,
|
|
"num_tokens": 98653429.0,
|
|
"step": 43020
|
|
},
|
|
{
|
|
"entropy": 4.992958307266235,
|
|
"epoch": 4.133045148895293,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033937062896676563,
|
|
"loss": 4.4473,
|
|
"mean_token_accuracy": 0.26733019948005676,
|
|
"num_tokens": 98664972.0,
|
|
"step": 43025
|
|
},
|
|
{
|
|
"entropy": 5.027160024642944,
|
|
"epoch": 4.133525456292027,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00033933777763970506,
|
|
"loss": 4.5423,
|
|
"mean_token_accuracy": 0.26083060801029206,
|
|
"num_tokens": 98675075.0,
|
|
"step": 43030
|
|
},
|
|
{
|
|
"entropy": 5.0450530529022215,
|
|
"epoch": 4.13400576368876,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003393049248189124,
|
|
"loss": 4.5659,
|
|
"mean_token_accuracy": 0.26401142328977584,
|
|
"num_tokens": 98686749.0,
|
|
"step": 43035
|
|
},
|
|
{
|
|
"entropy": 5.05974326133728,
|
|
"epoch": 4.134486071085495,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003392720705051504,
|
|
"loss": 4.5745,
|
|
"mean_token_accuracy": 0.26091312766075136,
|
|
"num_tokens": 98698229.0,
|
|
"step": 43040
|
|
},
|
|
{
|
|
"entropy": 5.019454908370972,
|
|
"epoch": 4.134966378482229,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033923921469918174,
|
|
"loss": 4.5382,
|
|
"mean_token_accuracy": 0.2659340113401413,
|
|
"num_tokens": 98708302.0,
|
|
"step": 43045
|
|
},
|
|
{
|
|
"entropy": 5.010962677001953,
|
|
"epoch": 4.1354466858789625,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003392063574017692,
|
|
"loss": 4.4939,
|
|
"mean_token_accuracy": 0.270592688024044,
|
|
"num_tokens": 98719882.0,
|
|
"step": 43050
|
|
},
|
|
{
|
|
"entropy": 5.001523113250732,
|
|
"epoch": 4.135926993275697,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00033917349861367575,
|
|
"loss": 4.4987,
|
|
"mean_token_accuracy": 0.2680410325527191,
|
|
"num_tokens": 98730123.0,
|
|
"step": 43055
|
|
},
|
|
{
|
|
"entropy": 4.963587141036987,
|
|
"epoch": 4.13640730067243,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003391406383356642,
|
|
"loss": 4.481,
|
|
"mean_token_accuracy": 0.2647318497300148,
|
|
"num_tokens": 98739605.0,
|
|
"step": 43060
|
|
},
|
|
{
|
|
"entropy": 5.02074499130249,
|
|
"epoch": 4.136887608069165,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.00033910777656849745,
|
|
"loss": 4.5238,
|
|
"mean_token_accuracy": 0.2638397470116615,
|
|
"num_tokens": 98752474.0,
|
|
"step": 43065
|
|
},
|
|
{
|
|
"entropy": 5.1015783786773685,
|
|
"epoch": 4.137367915465898,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003390749133129386,
|
|
"loss": 4.5966,
|
|
"mean_token_accuracy": 0.2529545485973358,
|
|
"num_tokens": 98764982.0,
|
|
"step": 43070
|
|
},
|
|
{
|
|
"entropy": 4.9798502922058105,
|
|
"epoch": 4.137848222862632,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003390420485697504,
|
|
"loss": 4.47,
|
|
"mean_token_accuracy": 0.26502523124217986,
|
|
"num_tokens": 98777464.0,
|
|
"step": 43075
|
|
},
|
|
{
|
|
"entropy": 5.038424444198609,
|
|
"epoch": 4.138328530259366,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033900918233969606,
|
|
"loss": 4.492,
|
|
"mean_token_accuracy": 0.2674848258495331,
|
|
"num_tokens": 98788672.0,
|
|
"step": 43080
|
|
},
|
|
{
|
|
"entropy": 5.1506751537322994,
|
|
"epoch": 4.1388088376561,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00033897631462353854,
|
|
"loss": 4.6816,
|
|
"mean_token_accuracy": 0.24671790152788162,
|
|
"num_tokens": 98801531.0,
|
|
"step": 43085
|
|
},
|
|
{
|
|
"entropy": 4.984687042236328,
|
|
"epoch": 4.1392891450528335,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000338943445422041,
|
|
"loss": 4.4738,
|
|
"mean_token_accuracy": 0.26456863433122635,
|
|
"num_tokens": 98811534.0,
|
|
"step": 43090
|
|
},
|
|
{
|
|
"entropy": 5.1147078514099125,
|
|
"epoch": 4.139769452449568,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00033891057473596655,
|
|
"loss": 4.5599,
|
|
"mean_token_accuracy": 0.26323226690292356,
|
|
"num_tokens": 98823220.0,
|
|
"step": 43095
|
|
},
|
|
{
|
|
"entropy": 4.966423273086548,
|
|
"epoch": 4.140249759846301,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003388777025660783,
|
|
"loss": 4.4249,
|
|
"mean_token_accuracy": 0.2677594631910324,
|
|
"num_tokens": 98834389.0,
|
|
"step": 43100
|
|
},
|
|
{
|
|
"entropy": 5.00996618270874,
|
|
"epoch": 4.140730067243036,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0003388448289131395,
|
|
"loss": 4.5521,
|
|
"mean_token_accuracy": 0.2582373425364494,
|
|
"num_tokens": 98846399.0,
|
|
"step": 43105
|
|
},
|
|
{
|
|
"entropy": 5.088111209869385,
|
|
"epoch": 4.141210374639769,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00033881195377791337,
|
|
"loss": 4.619,
|
|
"mean_token_accuracy": 0.25431896448135377,
|
|
"num_tokens": 98858448.0,
|
|
"step": 43110
|
|
},
|
|
{
|
|
"entropy": 4.912242412567139,
|
|
"epoch": 4.141690682036503,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00033877907716116314,
|
|
"loss": 4.3542,
|
|
"mean_token_accuracy": 0.27669430822134017,
|
|
"num_tokens": 98871113.0,
|
|
"step": 43115
|
|
},
|
|
{
|
|
"entropy": 5.014299821853638,
|
|
"epoch": 4.142170989433238,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000338746199063652,
|
|
"loss": 4.5144,
|
|
"mean_token_accuracy": 0.2626959472894669,
|
|
"num_tokens": 98881779.0,
|
|
"step": 43120
|
|
},
|
|
{
|
|
"entropy": 5.020384645462036,
|
|
"epoch": 4.142651296829971,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0003387133194861435,
|
|
"loss": 4.4913,
|
|
"mean_token_accuracy": 0.26554492563009263,
|
|
"num_tokens": 98893490.0,
|
|
"step": 43125
|
|
},
|
|
{
|
|
"entropy": 5.075809669494629,
|
|
"epoch": 4.1431316042267055,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00033868043842940097,
|
|
"loss": 4.5678,
|
|
"mean_token_accuracy": 0.25994555205106734,
|
|
"num_tokens": 98904485.0,
|
|
"step": 43130
|
|
},
|
|
{
|
|
"entropy": 5.012277364730835,
|
|
"epoch": 4.143611911623439,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033864755589418764,
|
|
"loss": 4.4886,
|
|
"mean_token_accuracy": 0.2606645613908768,
|
|
"num_tokens": 98916276.0,
|
|
"step": 43135
|
|
},
|
|
{
|
|
"entropy": 4.9569167137146,
|
|
"epoch": 4.144092219020173,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033861467188126714,
|
|
"loss": 4.5106,
|
|
"mean_token_accuracy": 0.2702887073159218,
|
|
"num_tokens": 98927891.0,
|
|
"step": 43140
|
|
},
|
|
{
|
|
"entropy": 5.009888172149658,
|
|
"epoch": 4.144572526416907,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033858178639140286,
|
|
"loss": 4.4796,
|
|
"mean_token_accuracy": 0.2635997772216797,
|
|
"num_tokens": 98939396.0,
|
|
"step": 43145
|
|
},
|
|
{
|
|
"entropy": 4.996801376342773,
|
|
"epoch": 4.145052833813641,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003385488994253583,
|
|
"loss": 4.4661,
|
|
"mean_token_accuracy": 0.2708708569407463,
|
|
"num_tokens": 98950327.0,
|
|
"step": 43150
|
|
},
|
|
{
|
|
"entropy": 5.056891584396363,
|
|
"epoch": 4.145533141210374,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000338516010983897,
|
|
"loss": 4.5516,
|
|
"mean_token_accuracy": 0.2637521684169769,
|
|
"num_tokens": 98961867.0,
|
|
"step": 43155
|
|
},
|
|
{
|
|
"entropy": 5.053594160079956,
|
|
"epoch": 4.146013448607109,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00033848312106778247,
|
|
"loss": 4.5156,
|
|
"mean_token_accuracy": 0.2601000130176544,
|
|
"num_tokens": 98972998.0,
|
|
"step": 43160
|
|
},
|
|
{
|
|
"entropy": 5.068536996841431,
|
|
"epoch": 4.146493756003842,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00033845022967777833,
|
|
"loss": 4.5979,
|
|
"mean_token_accuracy": 0.25660706162452696,
|
|
"num_tokens": 98986898.0,
|
|
"step": 43165
|
|
},
|
|
{
|
|
"entropy": 5.088318872451782,
|
|
"epoch": 4.1469740634005765,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00033841733681464837,
|
|
"loss": 4.5155,
|
|
"mean_token_accuracy": 0.26747147738933563,
|
|
"num_tokens": 98997881.0,
|
|
"step": 43170
|
|
},
|
|
{
|
|
"entropy": 4.9747593879699705,
|
|
"epoch": 4.14745437079731,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00033838444247915613,
|
|
"loss": 4.461,
|
|
"mean_token_accuracy": 0.26701411306858064,
|
|
"num_tokens": 99012328.0,
|
|
"step": 43175
|
|
},
|
|
{
|
|
"entropy": 5.041670942306519,
|
|
"epoch": 4.147934678194044,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003383515466720653,
|
|
"loss": 4.5488,
|
|
"mean_token_accuracy": 0.2605984777212143,
|
|
"num_tokens": 99023383.0,
|
|
"step": 43180
|
|
},
|
|
{
|
|
"entropy": 5.032764625549317,
|
|
"epoch": 4.148414985590778,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00033831864939413975,
|
|
"loss": 4.495,
|
|
"mean_token_accuracy": 0.26852324306964875,
|
|
"num_tokens": 99034264.0,
|
|
"step": 43185
|
|
},
|
|
{
|
|
"entropy": 5.017843246459961,
|
|
"epoch": 4.148895292987512,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003382857506461432,
|
|
"loss": 4.4855,
|
|
"mean_token_accuracy": 0.26999926269054414,
|
|
"num_tokens": 99046593.0,
|
|
"step": 43190
|
|
},
|
|
{
|
|
"entropy": 4.951725721359253,
|
|
"epoch": 4.149375600384246,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033825285042883937,
|
|
"loss": 4.4953,
|
|
"mean_token_accuracy": 0.2642531231045723,
|
|
"num_tokens": 99059068.0,
|
|
"step": 43195
|
|
},
|
|
{
|
|
"entropy": 5.098302936553955,
|
|
"epoch": 4.14985590778098,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033821994874299216,
|
|
"loss": 4.6307,
|
|
"mean_token_accuracy": 0.2503357529640198,
|
|
"num_tokens": 99070547.0,
|
|
"step": 43200
|
|
},
|
|
{
|
|
"entropy": 5.055800104141236,
|
|
"epoch": 4.150336215177714,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003381870455893655,
|
|
"loss": 4.5315,
|
|
"mean_token_accuracy": 0.26682385206222536,
|
|
"num_tokens": 99082118.0,
|
|
"step": 43205
|
|
},
|
|
{
|
|
"entropy": 4.965729665756226,
|
|
"epoch": 4.1508165225744476,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.00033815414096872326,
|
|
"loss": 4.4291,
|
|
"mean_token_accuracy": 0.26022862643003464,
|
|
"num_tokens": 99093514.0,
|
|
"step": 43210
|
|
},
|
|
{
|
|
"entropy": 4.982050037384033,
|
|
"epoch": 4.151296829971182,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003381212348818294,
|
|
"loss": 4.4752,
|
|
"mean_token_accuracy": 0.2657942444086075,
|
|
"num_tokens": 99104309.0,
|
|
"step": 43215
|
|
},
|
|
{
|
|
"entropy": 4.970301342010498,
|
|
"epoch": 4.151777137367915,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003380883273294479,
|
|
"loss": 4.523,
|
|
"mean_token_accuracy": 0.2634188398718834,
|
|
"num_tokens": 99116503.0,
|
|
"step": 43220
|
|
},
|
|
{
|
|
"entropy": 5.019953966140747,
|
|
"epoch": 4.15225744476465,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00033805541831234275,
|
|
"loss": 4.4745,
|
|
"mean_token_accuracy": 0.267247186601162,
|
|
"num_tokens": 99127212.0,
|
|
"step": 43225
|
|
},
|
|
{
|
|
"entropy": 5.0334861278533936,
|
|
"epoch": 4.152737752161383,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000338022507831278,
|
|
"loss": 4.4569,
|
|
"mean_token_accuracy": 0.26712524592876435,
|
|
"num_tokens": 99137947.0,
|
|
"step": 43230
|
|
},
|
|
{
|
|
"entropy": 5.0314606666564945,
|
|
"epoch": 4.153218059558117,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00033798959588701786,
|
|
"loss": 4.5107,
|
|
"mean_token_accuracy": 0.2596985846757889,
|
|
"num_tokens": 99150084.0,
|
|
"step": 43235
|
|
},
|
|
{
|
|
"entropy": 5.0228253364562985,
|
|
"epoch": 4.153698366954851,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003379566824803263,
|
|
"loss": 4.5134,
|
|
"mean_token_accuracy": 0.2642874032258987,
|
|
"num_tokens": 99160126.0,
|
|
"step": 43240
|
|
},
|
|
{
|
|
"entropy": 5.018211412429809,
|
|
"epoch": 4.154178674351585,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003379237676119675,
|
|
"loss": 4.5312,
|
|
"mean_token_accuracy": 0.2587979480624199,
|
|
"num_tokens": 99172349.0,
|
|
"step": 43245
|
|
},
|
|
{
|
|
"entropy": 4.977391004562378,
|
|
"epoch": 4.154658981748319,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003378908512827057,
|
|
"loss": 4.4291,
|
|
"mean_token_accuracy": 0.2672926366329193,
|
|
"num_tokens": 99183826.0,
|
|
"step": 43250
|
|
},
|
|
{
|
|
"entropy": 4.996899604797363,
|
|
"epoch": 4.155139289145053,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00033785793349330495,
|
|
"loss": 4.4741,
|
|
"mean_token_accuracy": 0.27224272340536115,
|
|
"num_tokens": 99195713.0,
|
|
"step": 43255
|
|
},
|
|
{
|
|
"entropy": 5.0316441535949705,
|
|
"epoch": 4.155619596541786,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033782501424452975,
|
|
"loss": 4.5684,
|
|
"mean_token_accuracy": 0.26097533404827117,
|
|
"num_tokens": 99206544.0,
|
|
"step": 43260
|
|
},
|
|
{
|
|
"entropy": 4.911692762374878,
|
|
"epoch": 4.156099903938521,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00033779209353714424,
|
|
"loss": 4.3768,
|
|
"mean_token_accuracy": 0.2794877901673317,
|
|
"num_tokens": 99218073.0,
|
|
"step": 43265
|
|
},
|
|
{
|
|
"entropy": 4.958901739120483,
|
|
"epoch": 4.156580211335255,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003377591713719128,
|
|
"loss": 4.4924,
|
|
"mean_token_accuracy": 0.2623881667852402,
|
|
"num_tokens": 99228627.0,
|
|
"step": 43270
|
|
},
|
|
{
|
|
"entropy": 5.043507146835327,
|
|
"epoch": 4.1570605187319885,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003377262477495997,
|
|
"loss": 4.4812,
|
|
"mean_token_accuracy": 0.27323682606220245,
|
|
"num_tokens": 99240863.0,
|
|
"step": 43275
|
|
},
|
|
{
|
|
"entropy": 5.065694522857666,
|
|
"epoch": 4.157540826128723,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003376933226709694,
|
|
"loss": 4.5325,
|
|
"mean_token_accuracy": 0.2637602150440216,
|
|
"num_tokens": 99253255.0,
|
|
"step": 43280
|
|
},
|
|
{
|
|
"entropy": 5.027507543563843,
|
|
"epoch": 4.158021133525456,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033766039613678634,
|
|
"loss": 4.5138,
|
|
"mean_token_accuracy": 0.26502704322338105,
|
|
"num_tokens": 99264850.0,
|
|
"step": 43285
|
|
},
|
|
{
|
|
"entropy": 4.953792381286621,
|
|
"epoch": 4.1585014409221905,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033762746814781496,
|
|
"loss": 4.3951,
|
|
"mean_token_accuracy": 0.27330732345581055,
|
|
"num_tokens": 99275389.0,
|
|
"step": 43290
|
|
},
|
|
{
|
|
"entropy": 4.9573835849761965,
|
|
"epoch": 4.158981748318924,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003375945387048197,
|
|
"loss": 4.4894,
|
|
"mean_token_accuracy": 0.26031336933374405,
|
|
"num_tokens": 99286265.0,
|
|
"step": 43295
|
|
},
|
|
{
|
|
"entropy": 4.9885289669036865,
|
|
"epoch": 4.159462055715658,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003375616078085651,
|
|
"loss": 4.5217,
|
|
"mean_token_accuracy": 0.2609419196844101,
|
|
"num_tokens": 99298382.0,
|
|
"step": 43300
|
|
},
|
|
{
|
|
"entropy": 5.044124794006348,
|
|
"epoch": 4.159942363112392,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00033752867545981576,
|
|
"loss": 4.5326,
|
|
"mean_token_accuracy": 0.2612334921956062,
|
|
"num_tokens": 99308706.0,
|
|
"step": 43305
|
|
},
|
|
{
|
|
"entropy": 4.990158939361573,
|
|
"epoch": 4.160422670509126,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00033749574165933626,
|
|
"loss": 4.4608,
|
|
"mean_token_accuracy": 0.26846264600753783,
|
|
"num_tokens": 99318952.0,
|
|
"step": 43310
|
|
},
|
|
{
|
|
"entropy": 4.985485553741455,
|
|
"epoch": 4.1609029779058595,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003374628064078912,
|
|
"loss": 4.5386,
|
|
"mean_token_accuracy": 0.256170953810215,
|
|
"num_tokens": 99330857.0,
|
|
"step": 43315
|
|
},
|
|
{
|
|
"entropy": 4.913702487945557,
|
|
"epoch": 4.161383285302594,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00033742986970624526,
|
|
"loss": 4.3562,
|
|
"mean_token_accuracy": 0.2741501107811928,
|
|
"num_tokens": 99341214.0,
|
|
"step": 43320
|
|
},
|
|
{
|
|
"entropy": 4.989003372192383,
|
|
"epoch": 4.161863592699327,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033739693155516315,
|
|
"loss": 4.4763,
|
|
"mean_token_accuracy": 0.2615357369184494,
|
|
"num_tokens": 99353312.0,
|
|
"step": 43325
|
|
},
|
|
{
|
|
"entropy": 5.056026315689087,
|
|
"epoch": 4.162343900096062,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00033736399195540955,
|
|
"loss": 4.5056,
|
|
"mean_token_accuracy": 0.2666723668575287,
|
|
"num_tokens": 99363965.0,
|
|
"step": 43330
|
|
},
|
|
{
|
|
"entropy": 5.026014184951782,
|
|
"epoch": 4.162824207492795,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00033733105090774917,
|
|
"loss": 4.4743,
|
|
"mean_token_accuracy": 0.26856114715337753,
|
|
"num_tokens": 99375027.0,
|
|
"step": 43335
|
|
},
|
|
{
|
|
"entropy": 5.061128234863281,
|
|
"epoch": 4.163304514889529,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000337298108412947,
|
|
"loss": 4.6314,
|
|
"mean_token_accuracy": 0.24833226054906846,
|
|
"num_tokens": 99385998.0,
|
|
"step": 43340
|
|
},
|
|
{
|
|
"entropy": 4.970186996459961,
|
|
"epoch": 4.163784822286263,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033726516447176764,
|
|
"loss": 4.4205,
|
|
"mean_token_accuracy": 0.2670734107494354,
|
|
"num_tokens": 99396358.0,
|
|
"step": 43345
|
|
},
|
|
{
|
|
"entropy": 5.058982467651367,
|
|
"epoch": 4.164265129682997,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003372322190849761,
|
|
"loss": 4.5744,
|
|
"mean_token_accuracy": 0.26104184836149213,
|
|
"num_tokens": 99407289.0,
|
|
"step": 43350
|
|
},
|
|
{
|
|
"entropy": 4.995718145370484,
|
|
"epoch": 4.164745437079731,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003371992722533372,
|
|
"loss": 4.5681,
|
|
"mean_token_accuracy": 0.2578948050737381,
|
|
"num_tokens": 99419036.0,
|
|
"step": 43355
|
|
},
|
|
{
|
|
"entropy": 5.049226760864258,
|
|
"epoch": 4.165225744476465,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003371663239776159,
|
|
"loss": 4.4651,
|
|
"mean_token_accuracy": 0.2611623242497444,
|
|
"num_tokens": 99429589.0,
|
|
"step": 43360
|
|
},
|
|
{
|
|
"entropy": 4.979753541946411,
|
|
"epoch": 4.165706051873199,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033713337425857713,
|
|
"loss": 4.4455,
|
|
"mean_token_accuracy": 0.26525069326162337,
|
|
"num_tokens": 99440398.0,
|
|
"step": 43365
|
|
},
|
|
{
|
|
"entropy": 4.975549554824829,
|
|
"epoch": 4.166186359269933,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000337100423096986,
|
|
"loss": 4.4367,
|
|
"mean_token_accuracy": 0.26496130377054217,
|
|
"num_tokens": 99452000.0,
|
|
"step": 43370
|
|
},
|
|
{
|
|
"entropy": 5.045710706710816,
|
|
"epoch": 4.166666666666667,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003370674704936074,
|
|
"loss": 4.4993,
|
|
"mean_token_accuracy": 0.2579521179199219,
|
|
"num_tokens": 99462403.0,
|
|
"step": 43375
|
|
},
|
|
{
|
|
"entropy": 4.936083030700684,
|
|
"epoch": 4.1671469740634,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033703451644920637,
|
|
"loss": 4.3854,
|
|
"mean_token_accuracy": 0.2731154695153236,
|
|
"num_tokens": 99474284.0,
|
|
"step": 43380
|
|
},
|
|
{
|
|
"entropy": 5.0006309986114506,
|
|
"epoch": 4.167627281460135,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033700156096454816,
|
|
"loss": 4.5495,
|
|
"mean_token_accuracy": 0.2624314740300179,
|
|
"num_tokens": 99486189.0,
|
|
"step": 43385
|
|
},
|
|
{
|
|
"entropy": 5.059164571762085,
|
|
"epoch": 4.168107588856868,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00033696860404039777,
|
|
"loss": 4.5962,
|
|
"mean_token_accuracy": 0.2572004720568657,
|
|
"num_tokens": 99498047.0,
|
|
"step": 43390
|
|
},
|
|
{
|
|
"entropy": 4.976747512817383,
|
|
"epoch": 4.1685878962536025,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033693564567752036,
|
|
"loss": 4.4599,
|
|
"mean_token_accuracy": 0.2645559921860695,
|
|
"num_tokens": 99509523.0,
|
|
"step": 43395
|
|
},
|
|
{
|
|
"entropy": 4.9591728210449215,
|
|
"epoch": 4.169068203650336,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003369026858766813,
|
|
"loss": 4.4225,
|
|
"mean_token_accuracy": 0.26764871180057526,
|
|
"num_tokens": 99521357.0,
|
|
"step": 43400
|
|
},
|
|
{
|
|
"entropy": 5.001942873001099,
|
|
"epoch": 4.16954851104707,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033686972463864553,
|
|
"loss": 4.549,
|
|
"mean_token_accuracy": 0.25778824985027315,
|
|
"num_tokens": 99532484.0,
|
|
"step": 43405
|
|
},
|
|
{
|
|
"entropy": 5.0939521312713625,
|
|
"epoch": 4.170028818443804,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003368367619641785,
|
|
"loss": 4.4879,
|
|
"mean_token_accuracy": 0.26784124225378036,
|
|
"num_tokens": 99543082.0,
|
|
"step": 43410
|
|
},
|
|
{
|
|
"entropy": 5.0140636444091795,
|
|
"epoch": 4.170509125840538,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003368037978540455,
|
|
"loss": 4.552,
|
|
"mean_token_accuracy": 0.25841235667467116,
|
|
"num_tokens": 99555380.0,
|
|
"step": 43415
|
|
},
|
|
{
|
|
"entropy": 5.019193172454834,
|
|
"epoch": 4.1709894332372714,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003367708323090119,
|
|
"loss": 4.5643,
|
|
"mean_token_accuracy": 0.2635782673954964,
|
|
"num_tokens": 99566394.0,
|
|
"step": 43420
|
|
},
|
|
{
|
|
"entropy": 5.029716062545776,
|
|
"epoch": 4.171469740634006,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000336737865329843,
|
|
"loss": 4.4955,
|
|
"mean_token_accuracy": 0.2592089846730232,
|
|
"num_tokens": 99579156.0,
|
|
"step": 43425
|
|
},
|
|
{
|
|
"entropy": 5.017386531829834,
|
|
"epoch": 4.17195004803074,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00033670489691730405,
|
|
"loss": 4.517,
|
|
"mean_token_accuracy": 0.26219437420368197,
|
|
"num_tokens": 99589899.0,
|
|
"step": 43430
|
|
},
|
|
{
|
|
"entropy": 5.014989852905273,
|
|
"epoch": 4.1724303554274735,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003366719270721607,
|
|
"loss": 4.5657,
|
|
"mean_token_accuracy": 0.2594698294997215,
|
|
"num_tokens": 99602337.0,
|
|
"step": 43435
|
|
},
|
|
{
|
|
"entropy": 5.0708637714385985,
|
|
"epoch": 4.172910662824208,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00033663895579517835,
|
|
"loss": 4.5999,
|
|
"mean_token_accuracy": 0.2565570160746574,
|
|
"num_tokens": 99613375.0,
|
|
"step": 43440
|
|
},
|
|
{
|
|
"entropy": 4.99849796295166,
|
|
"epoch": 4.173390970220941,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00033660598308712247,
|
|
"loss": 4.4175,
|
|
"mean_token_accuracy": 0.2759502217173576,
|
|
"num_tokens": 99624156.0,
|
|
"step": 43445
|
|
},
|
|
{
|
|
"entropy": 4.900947093963623,
|
|
"epoch": 4.173871277617676,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033657300894875857,
|
|
"loss": 4.4123,
|
|
"mean_token_accuracy": 0.27073585242033005,
|
|
"num_tokens": 99635360.0,
|
|
"step": 43450
|
|
},
|
|
{
|
|
"entropy": 5.004535102844239,
|
|
"epoch": 4.174351585014409,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033654003338085225,
|
|
"loss": 4.4291,
|
|
"mean_token_accuracy": 0.27863129377365115,
|
|
"num_tokens": 99646479.0,
|
|
"step": 43455
|
|
},
|
|
{
|
|
"entropy": 5.0275803565979,
|
|
"epoch": 4.174831892411143,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00033650705638416914,
|
|
"loss": 4.498,
|
|
"mean_token_accuracy": 0.2613164022564888,
|
|
"num_tokens": 99656688.0,
|
|
"step": 43460
|
|
},
|
|
{
|
|
"entropy": 4.983251333236694,
|
|
"epoch": 4.175312199807877,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003364740779594747,
|
|
"loss": 4.4604,
|
|
"mean_token_accuracy": 0.26806287169456483,
|
|
"num_tokens": 99667859.0,
|
|
"step": 43465
|
|
},
|
|
{
|
|
"entropy": 5.0023510456085205,
|
|
"epoch": 4.175792507204611,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033644109810753483,
|
|
"loss": 4.4866,
|
|
"mean_token_accuracy": 0.2647450938820839,
|
|
"num_tokens": 99679012.0,
|
|
"step": 43470
|
|
},
|
|
{
|
|
"entropy": 4.995203733444214,
|
|
"epoch": 4.176272814601345,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000336408116829115,
|
|
"loss": 4.4815,
|
|
"mean_token_accuracy": 0.2605468764901161,
|
|
"num_tokens": 99689734.0,
|
|
"step": 43475
|
|
},
|
|
{
|
|
"entropy": 4.954161071777344,
|
|
"epoch": 4.176753121998079,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00033637513412498117,
|
|
"loss": 4.487,
|
|
"mean_token_accuracy": 0.2662731632590294,
|
|
"num_tokens": 99700445.0,
|
|
"step": 43480
|
|
},
|
|
{
|
|
"entropy": 5.007417106628418,
|
|
"epoch": 4.177233429394812,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003363421499958989,
|
|
"loss": 4.5425,
|
|
"mean_token_accuracy": 0.2614088043570518,
|
|
"num_tokens": 99711317.0,
|
|
"step": 43485
|
|
},
|
|
{
|
|
"entropy": 4.979241037368775,
|
|
"epoch": 4.177713736791547,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00033630916444263407,
|
|
"loss": 4.4208,
|
|
"mean_token_accuracy": 0.27129930406808855,
|
|
"num_tokens": 99722355.0,
|
|
"step": 43490
|
|
},
|
|
{
|
|
"entropy": 5.034742641448974,
|
|
"epoch": 4.17819404418828,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003362761774659525,
|
|
"loss": 4.4884,
|
|
"mean_token_accuracy": 0.2690602570772171,
|
|
"num_tokens": 99733010.0,
|
|
"step": 43495
|
|
},
|
|
{
|
|
"entropy": 5.058031892776489,
|
|
"epoch": 4.178674351585014,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033624318906662,
|
|
"loss": 4.5455,
|
|
"mean_token_accuracy": 0.2597561329603195,
|
|
"num_tokens": 99744724.0,
|
|
"step": 43500
|
|
},
|
|
{
|
|
"entropy": 5.0864275932312015,
|
|
"epoch": 4.179154658981748,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00033621019924540255,
|
|
"loss": 4.5974,
|
|
"mean_token_accuracy": 0.25134978592395785,
|
|
"num_tokens": 99756772.0,
|
|
"step": 43505
|
|
},
|
|
{
|
|
"entropy": 5.0570070266723635,
|
|
"epoch": 4.179634966378482,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000336177208003066,
|
|
"loss": 4.5829,
|
|
"mean_token_accuracy": 0.2601602911949158,
|
|
"num_tokens": 99768398.0,
|
|
"step": 43510
|
|
},
|
|
{
|
|
"entropy": 5.0874885559082035,
|
|
"epoch": 4.1801152737752165,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003361442153403763,
|
|
"loss": 4.6459,
|
|
"mean_token_accuracy": 0.2528345867991447,
|
|
"num_tokens": 99780110.0,
|
|
"step": 43515
|
|
},
|
|
{
|
|
"entropy": 4.991422843933106,
|
|
"epoch": 4.18059558117195,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003361112212580995,
|
|
"loss": 4.4694,
|
|
"mean_token_accuracy": 0.2680243939161301,
|
|
"num_tokens": 99791329.0,
|
|
"step": 43520
|
|
},
|
|
{
|
|
"entropy": 5.029519939422608,
|
|
"epoch": 4.181075888568684,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003360782257570017,
|
|
"loss": 4.5659,
|
|
"mean_token_accuracy": 0.2531138002872467,
|
|
"num_tokens": 99802890.0,
|
|
"step": 43525
|
|
},
|
|
{
|
|
"entropy": 5.02284574508667,
|
|
"epoch": 4.181556195965418,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003360452288378488,
|
|
"loss": 4.4625,
|
|
"mean_token_accuracy": 0.26419888287782667,
|
|
"num_tokens": 99814076.0,
|
|
"step": 43530
|
|
},
|
|
{
|
|
"entropy": 5.009711456298828,
|
|
"epoch": 4.182036503362152,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00033601223050140687,
|
|
"loss": 4.4829,
|
|
"mean_token_accuracy": 0.26574952751398084,
|
|
"num_tokens": 99825894.0,
|
|
"step": 43535
|
|
},
|
|
{
|
|
"entropy": 5.018270063400268,
|
|
"epoch": 4.1825168107588855,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003359792307484423,
|
|
"loss": 4.4917,
|
|
"mean_token_accuracy": 0.2681758016347885,
|
|
"num_tokens": 99837217.0,
|
|
"step": 43540
|
|
},
|
|
{
|
|
"entropy": 5.127636098861695,
|
|
"epoch": 4.18299711815562,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00033594622957972096,
|
|
"loss": 4.6563,
|
|
"mean_token_accuracy": 0.25301981419324876,
|
|
"num_tokens": 99848728.0,
|
|
"step": 43545
|
|
},
|
|
{
|
|
"entropy": 4.959552383422851,
|
|
"epoch": 4.183477425552353,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003359132269960091,
|
|
"loss": 4.4536,
|
|
"mean_token_accuracy": 0.2711904078722,
|
|
"num_tokens": 99860116.0,
|
|
"step": 43550
|
|
},
|
|
{
|
|
"entropy": 5.026011514663696,
|
|
"epoch": 4.183957732949088,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000335880222998073,
|
|
"loss": 4.4746,
|
|
"mean_token_accuracy": 0.26567276418209074,
|
|
"num_tokens": 99870720.0,
|
|
"step": 43555
|
|
},
|
|
{
|
|
"entropy": 4.95458459854126,
|
|
"epoch": 4.184438040345821,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003358472175866789,
|
|
"loss": 4.4575,
|
|
"mean_token_accuracy": 0.27128739953041076,
|
|
"num_tokens": 99881882.0,
|
|
"step": 43560
|
|
},
|
|
{
|
|
"entropy": 5.1160773754119875,
|
|
"epoch": 4.184918347742555,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003358142107625931,
|
|
"loss": 4.5811,
|
|
"mean_token_accuracy": 0.25837131291627885,
|
|
"num_tokens": 99892845.0,
|
|
"step": 43565
|
|
},
|
|
{
|
|
"entropy": 4.9950371265411375,
|
|
"epoch": 4.185398655139289,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003357812025265819,
|
|
"loss": 4.476,
|
|
"mean_token_accuracy": 0.267107880115509,
|
|
"num_tokens": 99905024.0,
|
|
"step": 43570
|
|
},
|
|
{
|
|
"entropy": 4.991855764389038,
|
|
"epoch": 4.185878962536023,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00033574819287941174,
|
|
"loss": 4.5261,
|
|
"mean_token_accuracy": 0.26617809683084487,
|
|
"num_tokens": 99917899.0,
|
|
"step": 43575
|
|
},
|
|
{
|
|
"entropy": 4.9651154518127445,
|
|
"epoch": 4.1863592699327565,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033571518182184886,
|
|
"loss": 4.5194,
|
|
"mean_token_accuracy": 0.25870402753353117,
|
|
"num_tokens": 99929245.0,
|
|
"step": 43580
|
|
},
|
|
{
|
|
"entropy": 5.088730382919311,
|
|
"epoch": 4.186839577329491,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00033568216935465976,
|
|
"loss": 4.5866,
|
|
"mean_token_accuracy": 0.25513799041509627,
|
|
"num_tokens": 99940806.0,
|
|
"step": 43585
|
|
},
|
|
{
|
|
"entropy": 4.987099123001099,
|
|
"epoch": 4.187319884726225,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003356491554786109,
|
|
"loss": 4.4787,
|
|
"mean_token_accuracy": 0.2660929724574089,
|
|
"num_tokens": 99953369.0,
|
|
"step": 43590
|
|
},
|
|
{
|
|
"entropy": 4.984381103515625,
|
|
"epoch": 4.187800192122959,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033561614019446867,
|
|
"loss": 4.5177,
|
|
"mean_token_accuracy": 0.2646194711327553,
|
|
"num_tokens": 99964563.0,
|
|
"step": 43595
|
|
},
|
|
{
|
|
"entropy": 5.0597480773925785,
|
|
"epoch": 4.188280499519693,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00033558312350299963,
|
|
"loss": 4.5732,
|
|
"mean_token_accuracy": 0.2610611006617546,
|
|
"num_tokens": 99975143.0,
|
|
"step": 43600
|
|
},
|
|
{
|
|
"entropy": 5.152379846572876,
|
|
"epoch": 4.188760806916426,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00033555010540497045,
|
|
"loss": 4.5733,
|
|
"mean_token_accuracy": 0.26072719395160676,
|
|
"num_tokens": 99987430.0,
|
|
"step": 43605
|
|
},
|
|
{
|
|
"entropy": 4.946639204025269,
|
|
"epoch": 4.189241114313161,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003355170859011476,
|
|
"loss": 4.428,
|
|
"mean_token_accuracy": 0.2794035241007805,
|
|
"num_tokens": 99998638.0,
|
|
"step": 43610
|
|
},
|
|
{
|
|
"entropy": 5.017384433746338,
|
|
"epoch": 4.189721421709894,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003354840649922977,
|
|
"loss": 4.5638,
|
|
"mean_token_accuracy": 0.2608239769935608,
|
|
"num_tokens": 100011054.0,
|
|
"step": 43615
|
|
},
|
|
{
|
|
"entropy": 5.136627912521362,
|
|
"epoch": 4.1902017291066285,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003354510426791874,
|
|
"loss": 4.621,
|
|
"mean_token_accuracy": 0.2578920230269432,
|
|
"num_tokens": 100022695.0,
|
|
"step": 43620
|
|
},
|
|
{
|
|
"entropy": 5.0481432437896725,
|
|
"epoch": 4.190682036503362,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.00033541801896258344,
|
|
"loss": 4.5154,
|
|
"mean_token_accuracy": 0.2626476764678955,
|
|
"num_tokens": 100034402.0,
|
|
"step": 43625
|
|
},
|
|
{
|
|
"entropy": 5.070928573608398,
|
|
"epoch": 4.191162343900096,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003353849938432524,
|
|
"loss": 4.617,
|
|
"mean_token_accuracy": 0.25445102602243425,
|
|
"num_tokens": 100046429.0,
|
|
"step": 43630
|
|
},
|
|
{
|
|
"entropy": 4.96663465499878,
|
|
"epoch": 4.19164265129683,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003353519673219612,
|
|
"loss": 4.4494,
|
|
"mean_token_accuracy": 0.2658099830150604,
|
|
"num_tokens": 100057019.0,
|
|
"step": 43635
|
|
},
|
|
{
|
|
"entropy": 5.079247570037841,
|
|
"epoch": 4.192122958693564,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003353189393994765,
|
|
"loss": 4.6021,
|
|
"mean_token_accuracy": 0.25950285643339155,
|
|
"num_tokens": 100069763.0,
|
|
"step": 43640
|
|
},
|
|
{
|
|
"entropy": 4.9571198463439945,
|
|
"epoch": 4.192603266090297,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033528591007656516,
|
|
"loss": 4.4473,
|
|
"mean_token_accuracy": 0.26529968827962874,
|
|
"num_tokens": 100082838.0,
|
|
"step": 43645
|
|
},
|
|
{
|
|
"entropy": 5.012531518936157,
|
|
"epoch": 4.193083573487032,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003352528793539939,
|
|
"loss": 4.538,
|
|
"mean_token_accuracy": 0.26442969143390654,
|
|
"num_tokens": 100095422.0,
|
|
"step": 43650
|
|
},
|
|
{
|
|
"entropy": 4.984251832962036,
|
|
"epoch": 4.193563880883765,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033521984723252973,
|
|
"loss": 4.4809,
|
|
"mean_token_accuracy": 0.2738280028104782,
|
|
"num_tokens": 100107000.0,
|
|
"step": 43655
|
|
},
|
|
{
|
|
"entropy": 5.0265878200531,
|
|
"epoch": 4.1940441882804995,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033518681371293965,
|
|
"loss": 4.5388,
|
|
"mean_token_accuracy": 0.2614258736371994,
|
|
"num_tokens": 100121086.0,
|
|
"step": 43660
|
|
},
|
|
{
|
|
"entropy": 4.928357791900635,
|
|
"epoch": 4.194524495677234,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00033515377879599035,
|
|
"loss": 4.413,
|
|
"mean_token_accuracy": 0.2672556295990944,
|
|
"num_tokens": 100131476.0,
|
|
"step": 43665
|
|
},
|
|
{
|
|
"entropy": 5.005016660690307,
|
|
"epoch": 4.195004803073967,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00033512074248244895,
|
|
"loss": 4.465,
|
|
"mean_token_accuracy": 0.26684005111455916,
|
|
"num_tokens": 100142461.0,
|
|
"step": 43670
|
|
},
|
|
{
|
|
"entropy": 4.914992713928223,
|
|
"epoch": 4.195485110470702,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003350877047730825,
|
|
"loss": 4.4092,
|
|
"mean_token_accuracy": 0.27074444442987444,
|
|
"num_tokens": 100154317.0,
|
|
"step": 43675
|
|
},
|
|
{
|
|
"entropy": 5.028559732437134,
|
|
"epoch": 4.195965417867435,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00033505466566865783,
|
|
"loss": 4.4581,
|
|
"mean_token_accuracy": 0.2736573353409767,
|
|
"num_tokens": 100165197.0,
|
|
"step": 43680
|
|
},
|
|
{
|
|
"entropy": 4.988343238830566,
|
|
"epoch": 4.196445725264169,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003350216251699422,
|
|
"loss": 4.4683,
|
|
"mean_token_accuracy": 0.2688291519880295,
|
|
"num_tokens": 100176745.0,
|
|
"step": 43685
|
|
},
|
|
{
|
|
"entropy": 4.941876316070557,
|
|
"epoch": 4.196926032660903,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00033498858327770276,
|
|
"loss": 4.4878,
|
|
"mean_token_accuracy": 0.26559937447309495,
|
|
"num_tokens": 100188202.0,
|
|
"step": 43690
|
|
},
|
|
{
|
|
"entropy": 5.027809143066406,
|
|
"epoch": 4.197406340057637,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00033495553999270643,
|
|
"loss": 4.5336,
|
|
"mean_token_accuracy": 0.26720512062311175,
|
|
"num_tokens": 100199044.0,
|
|
"step": 43695
|
|
},
|
|
{
|
|
"entropy": 4.999514484405518,
|
|
"epoch": 4.197886647454371,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00033492249531572054,
|
|
"loss": 4.4783,
|
|
"mean_token_accuracy": 0.2676470637321472,
|
|
"num_tokens": 100210272.0,
|
|
"step": 43700
|
|
},
|
|
{
|
|
"entropy": 4.933122158050537,
|
|
"epoch": 4.198366954851105,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0003348894492475123,
|
|
"loss": 4.4122,
|
|
"mean_token_accuracy": 0.2733786627650261,
|
|
"num_tokens": 100220737.0,
|
|
"step": 43705
|
|
},
|
|
{
|
|
"entropy": 5.088550758361817,
|
|
"epoch": 4.198847262247838,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003348564017888488,
|
|
"loss": 4.6049,
|
|
"mean_token_accuracy": 0.2647570729255676,
|
|
"num_tokens": 100232330.0,
|
|
"step": 43710
|
|
},
|
|
{
|
|
"entropy": 5.021116447448731,
|
|
"epoch": 4.199327569644573,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033482335294049735,
|
|
"loss": 4.4499,
|
|
"mean_token_accuracy": 0.2739663362503052,
|
|
"num_tokens": 100243092.0,
|
|
"step": 43715
|
|
},
|
|
{
|
|
"entropy": 5.06925253868103,
|
|
"epoch": 4.199807877041306,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003347903027032254,
|
|
"loss": 4.5901,
|
|
"mean_token_accuracy": 0.2565882682800293,
|
|
"num_tokens": 100254820.0,
|
|
"step": 43720
|
|
},
|
|
{
|
|
"entropy": 5.082266139984131,
|
|
"epoch": 4.20028818443804,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003347572510778001,
|
|
"loss": 4.5773,
|
|
"mean_token_accuracy": 0.2564161255955696,
|
|
"num_tokens": 100267272.0,
|
|
"step": 43725
|
|
},
|
|
{
|
|
"entropy": 4.967203664779663,
|
|
"epoch": 4.200768491834774,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003347241980649889,
|
|
"loss": 4.4538,
|
|
"mean_token_accuracy": 0.2701488628983498,
|
|
"num_tokens": 100279813.0,
|
|
"step": 43730
|
|
},
|
|
{
|
|
"entropy": 4.979673242568969,
|
|
"epoch": 4.201248799231508,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00033469114366555913,
|
|
"loss": 4.513,
|
|
"mean_token_accuracy": 0.25977902859449387,
|
|
"num_tokens": 100292062.0,
|
|
"step": 43735
|
|
},
|
|
{
|
|
"entropy": 5.041900873184204,
|
|
"epoch": 4.2017291066282425,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00033465808788027824,
|
|
"loss": 4.5306,
|
|
"mean_token_accuracy": 0.2687560975551605,
|
|
"num_tokens": 100303595.0,
|
|
"step": 43740
|
|
},
|
|
{
|
|
"entropy": 4.969011402130127,
|
|
"epoch": 4.202209414024976,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033462503070991374,
|
|
"loss": 4.468,
|
|
"mean_token_accuracy": 0.26936782598495485,
|
|
"num_tokens": 100315496.0,
|
|
"step": 43745
|
|
},
|
|
{
|
|
"entropy": 4.971264028549195,
|
|
"epoch": 4.20268972142171,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003345919721552331,
|
|
"loss": 4.4525,
|
|
"mean_token_accuracy": 0.2682551547884941,
|
|
"num_tokens": 100326999.0,
|
|
"step": 43750
|
|
},
|
|
{
|
|
"entropy": 4.9721637725830075,
|
|
"epoch": 4.203170028818444,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033455891221700375,
|
|
"loss": 4.4997,
|
|
"mean_token_accuracy": 0.270705471932888,
|
|
"num_tokens": 100339168.0,
|
|
"step": 43755
|
|
},
|
|
{
|
|
"entropy": 5.037137985229492,
|
|
"epoch": 4.203650336215178,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003345258508959933,
|
|
"loss": 4.4573,
|
|
"mean_token_accuracy": 0.26584478169679643,
|
|
"num_tokens": 100350522.0,
|
|
"step": 43760
|
|
},
|
|
{
|
|
"entropy": 4.964107275009155,
|
|
"epoch": 4.2041306436119115,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003344927881929695,
|
|
"loss": 4.451,
|
|
"mean_token_accuracy": 0.26616150736808775,
|
|
"num_tokens": 100361852.0,
|
|
"step": 43765
|
|
},
|
|
{
|
|
"entropy": 5.011275672912598,
|
|
"epoch": 4.204610951008646,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00033445972410869955,
|
|
"loss": 4.5373,
|
|
"mean_token_accuracy": 0.2612969115376472,
|
|
"num_tokens": 100372276.0,
|
|
"step": 43770
|
|
},
|
|
{
|
|
"entropy": 4.944443464279175,
|
|
"epoch": 4.205091258405379,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003344266586439516,
|
|
"loss": 4.4679,
|
|
"mean_token_accuracy": 0.269510792195797,
|
|
"num_tokens": 100383363.0,
|
|
"step": 43775
|
|
},
|
|
{
|
|
"entropy": 5.058288669586181,
|
|
"epoch": 4.2055715658021136,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00033439359179949303,
|
|
"loss": 4.5399,
|
|
"mean_token_accuracy": 0.261259526014328,
|
|
"num_tokens": 100395912.0,
|
|
"step": 43780
|
|
},
|
|
{
|
|
"entropy": 4.997809314727784,
|
|
"epoch": 4.206051873198847,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00033436052357609157,
|
|
"loss": 4.4485,
|
|
"mean_token_accuracy": 0.26646326184272767,
|
|
"num_tokens": 100405798.0,
|
|
"step": 43785
|
|
},
|
|
{
|
|
"entropy": 4.980653476715088,
|
|
"epoch": 4.206532180595581,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003343274539745151,
|
|
"loss": 4.4854,
|
|
"mean_token_accuracy": 0.2664491057395935,
|
|
"num_tokens": 100416457.0,
|
|
"step": 43790
|
|
},
|
|
{
|
|
"entropy": 4.975752305984497,
|
|
"epoch": 4.207012487992315,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003342943829955313,
|
|
"loss": 4.5624,
|
|
"mean_token_accuracy": 0.25628338754177094,
|
|
"num_tokens": 100427799.0,
|
|
"step": 43795
|
|
},
|
|
{
|
|
"entropy": 4.972875642776489,
|
|
"epoch": 4.207492795389049,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033426131063990787,
|
|
"loss": 4.4781,
|
|
"mean_token_accuracy": 0.26048422455787656,
|
|
"num_tokens": 100439035.0,
|
|
"step": 43800
|
|
},
|
|
{
|
|
"entropy": 4.978266811370849,
|
|
"epoch": 4.2079731027857825,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00033422823690841286,
|
|
"loss": 4.4553,
|
|
"mean_token_accuracy": 0.2703281372785568,
|
|
"num_tokens": 100450490.0,
|
|
"step": 43805
|
|
},
|
|
{
|
|
"entropy": 4.959077596664429,
|
|
"epoch": 4.208453410182517,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0003341951618018141,
|
|
"loss": 4.4503,
|
|
"mean_token_accuracy": 0.2680604815483093,
|
|
"num_tokens": 100461426.0,
|
|
"step": 43810
|
|
},
|
|
{
|
|
"entropy": 5.035855722427368,
|
|
"epoch": 4.208933717579251,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00033416208532087937,
|
|
"loss": 4.5251,
|
|
"mean_token_accuracy": 0.26254452764987946,
|
|
"num_tokens": 100473051.0,
|
|
"step": 43815
|
|
},
|
|
{
|
|
"entropy": 4.955068588256836,
|
|
"epoch": 4.209414024975985,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003341290074663768,
|
|
"loss": 4.4763,
|
|
"mean_token_accuracy": 0.27088096290826796,
|
|
"num_tokens": 100484269.0,
|
|
"step": 43820
|
|
},
|
|
{
|
|
"entropy": 5.005653715133667,
|
|
"epoch": 4.209894332372719,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0003340959282390742,
|
|
"loss": 4.5468,
|
|
"mean_token_accuracy": 0.2591328829526901,
|
|
"num_tokens": 100495928.0,
|
|
"step": 43825
|
|
},
|
|
{
|
|
"entropy": 5.014448118209839,
|
|
"epoch": 4.210374639769452,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00033406284763973964,
|
|
"loss": 4.4562,
|
|
"mean_token_accuracy": 0.2720640659332275,
|
|
"num_tokens": 100507132.0,
|
|
"step": 43830
|
|
},
|
|
{
|
|
"entropy": 5.031512498855591,
|
|
"epoch": 4.210854947166187,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003340297656691411,
|
|
"loss": 4.5729,
|
|
"mean_token_accuracy": 0.2648145005106926,
|
|
"num_tokens": 100518414.0,
|
|
"step": 43835
|
|
},
|
|
{
|
|
"entropy": 4.979145574569702,
|
|
"epoch": 4.21133525456292,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003339966823280467,
|
|
"loss": 4.4667,
|
|
"mean_token_accuracy": 0.2725910276174545,
|
|
"num_tokens": 100528809.0,
|
|
"step": 43840
|
|
},
|
|
{
|
|
"entropy": 5.050474500656128,
|
|
"epoch": 4.2118155619596545,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00033396359761722453,
|
|
"loss": 4.5144,
|
|
"mean_token_accuracy": 0.25912552028894426,
|
|
"num_tokens": 100539224.0,
|
|
"step": 43845
|
|
},
|
|
{
|
|
"entropy": 5.085442161560058,
|
|
"epoch": 4.212295869356388,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00033393051153744274,
|
|
"loss": 4.6249,
|
|
"mean_token_accuracy": 0.26011893004179,
|
|
"num_tokens": 100550984.0,
|
|
"step": 43850
|
|
},
|
|
{
|
|
"entropy": 5.060401248931885,
|
|
"epoch": 4.212776176753122,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00033389742408946937,
|
|
"loss": 4.5531,
|
|
"mean_token_accuracy": 0.2574040815234184,
|
|
"num_tokens": 100562786.0,
|
|
"step": 43855
|
|
},
|
|
{
|
|
"entropy": 4.9973304748535154,
|
|
"epoch": 4.213256484149856,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003338643352740728,
|
|
"loss": 4.451,
|
|
"mean_token_accuracy": 0.26485668271780016,
|
|
"num_tokens": 100573966.0,
|
|
"step": 43860
|
|
},
|
|
{
|
|
"entropy": 5.115858840942383,
|
|
"epoch": 4.21373679154659,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003338312450920211,
|
|
"loss": 4.6226,
|
|
"mean_token_accuracy": 0.2568540558218956,
|
|
"num_tokens": 100586451.0,
|
|
"step": 43865
|
|
},
|
|
{
|
|
"entropy": 5.087671756744385,
|
|
"epoch": 4.214217098943323,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0003337981535440826,
|
|
"loss": 4.6133,
|
|
"mean_token_accuracy": 0.2530244648456573,
|
|
"num_tokens": 100597820.0,
|
|
"step": 43870
|
|
},
|
|
{
|
|
"entropy": 5.021221780776978,
|
|
"epoch": 4.214697406340058,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.00033376506063102556,
|
|
"loss": 4.5619,
|
|
"mean_token_accuracy": 0.26166028529405594,
|
|
"num_tokens": 100611423.0,
|
|
"step": 43875
|
|
},
|
|
{
|
|
"entropy": 5.043032264709472,
|
|
"epoch": 4.215177713736791,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003337319663536183,
|
|
"loss": 4.5849,
|
|
"mean_token_accuracy": 0.25790592432022097,
|
|
"num_tokens": 100623220.0,
|
|
"step": 43880
|
|
},
|
|
{
|
|
"entropy": 5.005544853210449,
|
|
"epoch": 4.2156580211335255,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003336988707126293,
|
|
"loss": 4.479,
|
|
"mean_token_accuracy": 0.2700518235564232,
|
|
"num_tokens": 100634317.0,
|
|
"step": 43885
|
|
},
|
|
{
|
|
"entropy": 5.086635160446167,
|
|
"epoch": 4.216138328530259,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003336657737088267,
|
|
"loss": 4.591,
|
|
"mean_token_accuracy": 0.25752410739660264,
|
|
"num_tokens": 100645743.0,
|
|
"step": 43890
|
|
},
|
|
{
|
|
"entropy": 5.015795946121216,
|
|
"epoch": 4.216618635926993,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00033363267534297896,
|
|
"loss": 4.5602,
|
|
"mean_token_accuracy": 0.2661996051669121,
|
|
"num_tokens": 100658240.0,
|
|
"step": 43895
|
|
},
|
|
{
|
|
"entropy": 5.070752382278442,
|
|
"epoch": 4.217098943323728,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003335995756158548,
|
|
"loss": 4.5565,
|
|
"mean_token_accuracy": 0.2647637754678726,
|
|
"num_tokens": 100669096.0,
|
|
"step": 43900
|
|
},
|
|
{
|
|
"entropy": 5.038179349899292,
|
|
"epoch": 4.217579250720461,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00033356647452822236,
|
|
"loss": 4.4423,
|
|
"mean_token_accuracy": 0.27117386311292646,
|
|
"num_tokens": 100682389.0,
|
|
"step": 43905
|
|
},
|
|
{
|
|
"entropy": 5.0009886741638185,
|
|
"epoch": 4.218059558117195,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00033353337208085035,
|
|
"loss": 4.4697,
|
|
"mean_token_accuracy": 0.26984559893608095,
|
|
"num_tokens": 100693896.0,
|
|
"step": 43910
|
|
},
|
|
{
|
|
"entropy": 4.987539339065552,
|
|
"epoch": 4.218539865513929,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003335002682745072,
|
|
"loss": 4.5213,
|
|
"mean_token_accuracy": 0.2629322364926338,
|
|
"num_tokens": 100704997.0,
|
|
"step": 43915
|
|
},
|
|
{
|
|
"entropy": 5.00959792137146,
|
|
"epoch": 4.219020172910663,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003334671631099615,
|
|
"loss": 4.5118,
|
|
"mean_token_accuracy": 0.2583769604563713,
|
|
"num_tokens": 100716340.0,
|
|
"step": 43920
|
|
},
|
|
{
|
|
"entropy": 5.023182058334351,
|
|
"epoch": 4.2195004803073966,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000333434056587982,
|
|
"loss": 4.5091,
|
|
"mean_token_accuracy": 0.2640167221426964,
|
|
"num_tokens": 100727897.0,
|
|
"step": 43925
|
|
},
|
|
{
|
|
"entropy": 5.056469535827636,
|
|
"epoch": 4.219980787704131,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033340094870933714,
|
|
"loss": 4.5349,
|
|
"mean_token_accuracy": 0.26523641794919967,
|
|
"num_tokens": 100738848.0,
|
|
"step": 43930
|
|
},
|
|
{
|
|
"entropy": 4.97978401184082,
|
|
"epoch": 4.220461095100864,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.00033336783947479566,
|
|
"loss": 4.4958,
|
|
"mean_token_accuracy": 0.2744340434670448,
|
|
"num_tokens": 100750029.0,
|
|
"step": 43935
|
|
},
|
|
{
|
|
"entropy": 5.03196611404419,
|
|
"epoch": 4.220941402497599,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033333472888512626,
|
|
"loss": 4.5451,
|
|
"mean_token_accuracy": 0.2541408255696297,
|
|
"num_tokens": 100762037.0,
|
|
"step": 43940
|
|
},
|
|
{
|
|
"entropy": 5.148063135147095,
|
|
"epoch": 4.221421709894332,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003333016169410978,
|
|
"loss": 4.6326,
|
|
"mean_token_accuracy": 0.25712684988975526,
|
|
"num_tokens": 100774269.0,
|
|
"step": 43945
|
|
},
|
|
{
|
|
"entropy": 5.000710439682007,
|
|
"epoch": 4.221902017291066,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033326850364347876,
|
|
"loss": 4.4939,
|
|
"mean_token_accuracy": 0.26782526075839996,
|
|
"num_tokens": 100785940.0,
|
|
"step": 43950
|
|
},
|
|
{
|
|
"entropy": 5.028797626495361,
|
|
"epoch": 4.2223823246878,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003332353889930381,
|
|
"loss": 4.5323,
|
|
"mean_token_accuracy": 0.2600126340985298,
|
|
"num_tokens": 100797281.0,
|
|
"step": 43955
|
|
},
|
|
{
|
|
"entropy": 5.011584329605102,
|
|
"epoch": 4.222862632084534,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0003332022729905448,
|
|
"loss": 4.551,
|
|
"mean_token_accuracy": 0.2645274430513382,
|
|
"num_tokens": 100807585.0,
|
|
"step": 43960
|
|
},
|
|
{
|
|
"entropy": 4.9884919166564945,
|
|
"epoch": 4.223342939481268,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003331691556367674,
|
|
"loss": 4.4375,
|
|
"mean_token_accuracy": 0.271625342965126,
|
|
"num_tokens": 100818235.0,
|
|
"step": 43965
|
|
},
|
|
{
|
|
"entropy": 5.037183427810669,
|
|
"epoch": 4.223823246878002,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003331360369324749,
|
|
"loss": 4.5349,
|
|
"mean_token_accuracy": 0.26333259642124174,
|
|
"num_tokens": 100829844.0,
|
|
"step": 43970
|
|
},
|
|
{
|
|
"entropy": 4.953172636032105,
|
|
"epoch": 4.224303554274736,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003331029168784363,
|
|
"loss": 4.4585,
|
|
"mean_token_accuracy": 0.27197341471910474,
|
|
"num_tokens": 100841983.0,
|
|
"step": 43975
|
|
},
|
|
{
|
|
"entropy": 5.039448642730713,
|
|
"epoch": 4.22478386167147,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00033306979547542047,
|
|
"loss": 4.5066,
|
|
"mean_token_accuracy": 0.2661105811595917,
|
|
"num_tokens": 100853683.0,
|
|
"step": 43980
|
|
},
|
|
{
|
|
"entropy": 4.984122037887573,
|
|
"epoch": 4.225264169068204,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003330366727241965,
|
|
"loss": 4.4617,
|
|
"mean_token_accuracy": 0.26008470505476,
|
|
"num_tokens": 100865583.0,
|
|
"step": 43985
|
|
},
|
|
{
|
|
"entropy": 5.069153642654419,
|
|
"epoch": 4.2257444764649374,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003330035486255333,
|
|
"loss": 4.5539,
|
|
"mean_token_accuracy": 0.26321441531181333,
|
|
"num_tokens": 100876486.0,
|
|
"step": 43990
|
|
},
|
|
{
|
|
"entropy": 4.981710433959961,
|
|
"epoch": 4.226224783861672,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033297042318019995,
|
|
"loss": 4.4224,
|
|
"mean_token_accuracy": 0.2621622860431671,
|
|
"num_tokens": 100887630.0,
|
|
"step": 43995
|
|
},
|
|
{
|
|
"entropy": 5.004460287094116,
|
|
"epoch": 4.226705091258405,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003329372963889654,
|
|
"loss": 4.4997,
|
|
"mean_token_accuracy": 0.2597271665930748,
|
|
"num_tokens": 100899666.0,
|
|
"step": 44000
|
|
},
|
|
{
|
|
"entropy": 4.953942823410034,
|
|
"epoch": 4.2271853986551395,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000332904168252599,
|
|
"loss": 4.4441,
|
|
"mean_token_accuracy": 0.2711922526359558,
|
|
"num_tokens": 100911066.0,
|
|
"step": 44005
|
|
},
|
|
{
|
|
"entropy": 4.9501101016998295,
|
|
"epoch": 4.227665706051873,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00033287103877186966,
|
|
"loss": 4.4539,
|
|
"mean_token_accuracy": 0.2741879478096962,
|
|
"num_tokens": 100923241.0,
|
|
"step": 44010
|
|
},
|
|
{
|
|
"entropy": 5.074128770828247,
|
|
"epoch": 4.228146013448607,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003328379079475466,
|
|
"loss": 4.5729,
|
|
"mean_token_accuracy": 0.26063798666000365,
|
|
"num_tokens": 100935203.0,
|
|
"step": 44015
|
|
},
|
|
{
|
|
"entropy": 4.964849138259888,
|
|
"epoch": 4.228626320845341,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003328047757803992,
|
|
"loss": 4.4514,
|
|
"mean_token_accuracy": 0.2666185811161995,
|
|
"num_tokens": 100948133.0,
|
|
"step": 44020
|
|
},
|
|
{
|
|
"entropy": 5.145069742202759,
|
|
"epoch": 4.229106628242075,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003327716422711965,
|
|
"loss": 4.6026,
|
|
"mean_token_accuracy": 0.25516093224287034,
|
|
"num_tokens": 100960448.0,
|
|
"step": 44025
|
|
},
|
|
{
|
|
"entropy": 5.070009708404541,
|
|
"epoch": 4.2295869356388085,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00033273850742070784,
|
|
"loss": 4.5314,
|
|
"mean_token_accuracy": 0.2691802099347115,
|
|
"num_tokens": 100971372.0,
|
|
"step": 44030
|
|
},
|
|
{
|
|
"entropy": 5.060222959518432,
|
|
"epoch": 4.230067243035543,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003327053712297024,
|
|
"loss": 4.5633,
|
|
"mean_token_accuracy": 0.2575034111738205,
|
|
"num_tokens": 100982656.0,
|
|
"step": 44035
|
|
},
|
|
{
|
|
"entropy": 5.047744369506836,
|
|
"epoch": 4.230547550432276,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003326722336989497,
|
|
"loss": 4.5329,
|
|
"mean_token_accuracy": 0.25702244490385057,
|
|
"num_tokens": 100994410.0,
|
|
"step": 44040
|
|
},
|
|
{
|
|
"entropy": 5.0288866519927975,
|
|
"epoch": 4.231027857829011,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003326390948292189,
|
|
"loss": 4.5108,
|
|
"mean_token_accuracy": 0.2592564508318901,
|
|
"num_tokens": 101007345.0,
|
|
"step": 44045
|
|
},
|
|
{
|
|
"entropy": 5.150911855697632,
|
|
"epoch": 4.231508165225744,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003326059546212795,
|
|
"loss": 4.6529,
|
|
"mean_token_accuracy": 0.2516884163022041,
|
|
"num_tokens": 101018780.0,
|
|
"step": 44050
|
|
},
|
|
{
|
|
"entropy": 5.061563158035279,
|
|
"epoch": 4.231988472622478,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0003325728130759009,
|
|
"loss": 4.5224,
|
|
"mean_token_accuracy": 0.2664406314492226,
|
|
"num_tokens": 101030221.0,
|
|
"step": 44055
|
|
},
|
|
{
|
|
"entropy": 4.9098540306091305,
|
|
"epoch": 4.232468780019213,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003325396701938525,
|
|
"loss": 4.4498,
|
|
"mean_token_accuracy": 0.2729602977633476,
|
|
"num_tokens": 101041877.0,
|
|
"step": 44060
|
|
},
|
|
{
|
|
"entropy": 5.016185855865478,
|
|
"epoch": 4.232949087415946,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003325065259759039,
|
|
"loss": 4.454,
|
|
"mean_token_accuracy": 0.2690874129533768,
|
|
"num_tokens": 101052331.0,
|
|
"step": 44065
|
|
},
|
|
{
|
|
"entropy": 4.975475025177002,
|
|
"epoch": 4.23342939481268,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003324733804228244,
|
|
"loss": 4.4339,
|
|
"mean_token_accuracy": 0.2662567630410194,
|
|
"num_tokens": 101063211.0,
|
|
"step": 44070
|
|
},
|
|
{
|
|
"entropy": 4.972605037689209,
|
|
"epoch": 4.233909702209414,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00033244023353538367,
|
|
"loss": 4.5115,
|
|
"mean_token_accuracy": 0.26917947828769684,
|
|
"num_tokens": 101074245.0,
|
|
"step": 44075
|
|
},
|
|
{
|
|
"entropy": 4.968069362640381,
|
|
"epoch": 4.234390009606148,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003324070853143514,
|
|
"loss": 4.4166,
|
|
"mean_token_accuracy": 0.27236247062683105,
|
|
"num_tokens": 101084232.0,
|
|
"step": 44080
|
|
},
|
|
{
|
|
"entropy": 5.083905792236328,
|
|
"epoch": 4.234870317002882,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00033237393576049704,
|
|
"loss": 4.6428,
|
|
"mean_token_accuracy": 0.2502471849322319,
|
|
"num_tokens": 101094433.0,
|
|
"step": 44085
|
|
},
|
|
{
|
|
"entropy": 5.006226634979248,
|
|
"epoch": 4.235350624399616,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00033234078487459016,
|
|
"loss": 4.5226,
|
|
"mean_token_accuracy": 0.2697018668055534,
|
|
"num_tokens": 101106895.0,
|
|
"step": 44090
|
|
},
|
|
{
|
|
"entropy": 5.072209167480469,
|
|
"epoch": 4.235830931796349,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003323076326574006,
|
|
"loss": 4.6207,
|
|
"mean_token_accuracy": 0.260856369137764,
|
|
"num_tokens": 101118572.0,
|
|
"step": 44095
|
|
},
|
|
{
|
|
"entropy": 5.1125446319580075,
|
|
"epoch": 4.236311239193084,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000332274479109698,
|
|
"loss": 4.5639,
|
|
"mean_token_accuracy": 0.2524052709341049,
|
|
"num_tokens": 101129257.0,
|
|
"step": 44100
|
|
},
|
|
{
|
|
"entropy": 5.092693281173706,
|
|
"epoch": 4.236791546589817,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000332241324232252,
|
|
"loss": 4.6004,
|
|
"mean_token_accuracy": 0.2675477474927902,
|
|
"num_tokens": 101140025.0,
|
|
"step": 44105
|
|
},
|
|
{
|
|
"entropy": 5.014689540863037,
|
|
"epoch": 4.2372718539865515,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00033220816802583247,
|
|
"loss": 4.5061,
|
|
"mean_token_accuracy": 0.26499441266059875,
|
|
"num_tokens": 101151432.0,
|
|
"step": 44110
|
|
},
|
|
{
|
|
"entropy": 5.015462160110474,
|
|
"epoch": 4.237752161383285,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0003321750104912092,
|
|
"loss": 4.5166,
|
|
"mean_token_accuracy": 0.2603344634175301,
|
|
"num_tokens": 101162624.0,
|
|
"step": 44115
|
|
},
|
|
{
|
|
"entropy": 5.0037706851959225,
|
|
"epoch": 4.238232468780019,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033214185162915185,
|
|
"loss": 4.5682,
|
|
"mean_token_accuracy": 0.26119564175605775,
|
|
"num_tokens": 101175055.0,
|
|
"step": 44120
|
|
},
|
|
{
|
|
"entropy": 5.084257793426514,
|
|
"epoch": 4.238712776176753,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00033210869144043037,
|
|
"loss": 4.5863,
|
|
"mean_token_accuracy": 0.2575683623552322,
|
|
"num_tokens": 101186051.0,
|
|
"step": 44125
|
|
},
|
|
{
|
|
"entropy": 5.049083948135376,
|
|
"epoch": 4.239193083573487,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00033207552992581474,
|
|
"loss": 4.54,
|
|
"mean_token_accuracy": 0.2617225989699364,
|
|
"num_tokens": 101198009.0,
|
|
"step": 44130
|
|
},
|
|
{
|
|
"entropy": 4.944906949996948,
|
|
"epoch": 4.239673390970221,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033204236708607476,
|
|
"loss": 4.4715,
|
|
"mean_token_accuracy": 0.271551176905632,
|
|
"num_tokens": 101208950.0,
|
|
"step": 44135
|
|
},
|
|
{
|
|
"entropy": 5.014553451538086,
|
|
"epoch": 4.240153698366955,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003320092029219804,
|
|
"loss": 4.538,
|
|
"mean_token_accuracy": 0.26413826942443847,
|
|
"num_tokens": 101220673.0,
|
|
"step": 44140
|
|
},
|
|
{
|
|
"entropy": 5.0404878616333,
|
|
"epoch": 4.240634005763689,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003319760374343016,
|
|
"loss": 4.5323,
|
|
"mean_token_accuracy": 0.26352688670158386,
|
|
"num_tokens": 101233629.0,
|
|
"step": 44145
|
|
},
|
|
{
|
|
"entropy": 4.974649906158447,
|
|
"epoch": 4.2411143131604225,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003319428706238085,
|
|
"loss": 4.4295,
|
|
"mean_token_accuracy": 0.28265863060951235,
|
|
"num_tokens": 101245395.0,
|
|
"step": 44150
|
|
},
|
|
{
|
|
"entropy": 5.0080427646636965,
|
|
"epoch": 4.241594620557157,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003319097024912709,
|
|
"loss": 4.5107,
|
|
"mean_token_accuracy": 0.25940213799476625,
|
|
"num_tokens": 101257504.0,
|
|
"step": 44155
|
|
},
|
|
{
|
|
"entropy": 5.004666805267334,
|
|
"epoch": 4.24207492795389,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000331876533037459,
|
|
"loss": 4.593,
|
|
"mean_token_accuracy": 0.25335245579481125,
|
|
"num_tokens": 101268418.0,
|
|
"step": 44160
|
|
},
|
|
{
|
|
"entropy": 4.986159944534302,
|
|
"epoch": 4.242555235350625,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000331843362263143,
|
|
"loss": 4.4448,
|
|
"mean_token_accuracy": 0.27374649345874785,
|
|
"num_tokens": 101280509.0,
|
|
"step": 44165
|
|
},
|
|
{
|
|
"entropy": 5.068910741806031,
|
|
"epoch": 4.243035542747358,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00033181019016909286,
|
|
"loss": 4.5332,
|
|
"mean_token_accuracy": 0.26042267233133315,
|
|
"num_tokens": 101292869.0,
|
|
"step": 44170
|
|
},
|
|
{
|
|
"entropy": 5.066495323181153,
|
|
"epoch": 4.243515850144092,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003317770167560788,
|
|
"loss": 4.471,
|
|
"mean_token_accuracy": 0.27442467510700225,
|
|
"num_tokens": 101304772.0,
|
|
"step": 44175
|
|
},
|
|
{
|
|
"entropy": 4.98872709274292,
|
|
"epoch": 4.243996157540826,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00033174384202487096,
|
|
"loss": 4.5036,
|
|
"mean_token_accuracy": 0.26179519593715667,
|
|
"num_tokens": 101315445.0,
|
|
"step": 44180
|
|
},
|
|
{
|
|
"entropy": 4.935065984725952,
|
|
"epoch": 4.24447646493756,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0003317106659762397,
|
|
"loss": 4.4658,
|
|
"mean_token_accuracy": 0.267618328332901,
|
|
"num_tokens": 101326102.0,
|
|
"step": 44185
|
|
},
|
|
{
|
|
"entropy": 4.973066949844361,
|
|
"epoch": 4.244956772334294,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00033167748861095503,
|
|
"loss": 4.4714,
|
|
"mean_token_accuracy": 0.25955506414175034,
|
|
"num_tokens": 101337419.0,
|
|
"step": 44190
|
|
},
|
|
{
|
|
"entropy": 5.092371797561645,
|
|
"epoch": 4.245437079731028,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003316443099297874,
|
|
"loss": 4.5253,
|
|
"mean_token_accuracy": 0.26321011781692505,
|
|
"num_tokens": 101348630.0,
|
|
"step": 44195
|
|
},
|
|
{
|
|
"entropy": 4.964789628982544,
|
|
"epoch": 4.245917387127761,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033161112993350725,
|
|
"loss": 4.4398,
|
|
"mean_token_accuracy": 0.2646112382411957,
|
|
"num_tokens": 101361357.0,
|
|
"step": 44200
|
|
},
|
|
{
|
|
"entropy": 5.018583202362061,
|
|
"epoch": 4.246397694524496,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00033157794862288467,
|
|
"loss": 4.4576,
|
|
"mean_token_accuracy": 0.26919586658477784,
|
|
"num_tokens": 101372155.0,
|
|
"step": 44205
|
|
},
|
|
{
|
|
"entropy": 4.922289752960205,
|
|
"epoch": 4.24687800192123,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00033154476599869006,
|
|
"loss": 4.3823,
|
|
"mean_token_accuracy": 0.2747590884566307,
|
|
"num_tokens": 101383011.0,
|
|
"step": 44210
|
|
},
|
|
{
|
|
"entropy": 4.9450126647949215,
|
|
"epoch": 4.247358309317963,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.000331511582061694,
|
|
"loss": 4.4677,
|
|
"mean_token_accuracy": 0.2629583850502968,
|
|
"num_tokens": 101394741.0,
|
|
"step": 44215
|
|
},
|
|
{
|
|
"entropy": 5.059960794448853,
|
|
"epoch": 4.247838616714698,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003314783968126667,
|
|
"loss": 4.5805,
|
|
"mean_token_accuracy": 0.2596121415495872,
|
|
"num_tokens": 101405719.0,
|
|
"step": 44220
|
|
},
|
|
{
|
|
"entropy": 4.966331052780151,
|
|
"epoch": 4.248318924111431,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003314452102523788,
|
|
"loss": 4.4639,
|
|
"mean_token_accuracy": 0.26547603458166125,
|
|
"num_tokens": 101417073.0,
|
|
"step": 44225
|
|
},
|
|
{
|
|
"entropy": 5.045261096954346,
|
|
"epoch": 4.2487992315081655,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00033141202238160075,
|
|
"loss": 4.5774,
|
|
"mean_token_accuracy": 0.25980214178562167,
|
|
"num_tokens": 101429674.0,
|
|
"step": 44230
|
|
},
|
|
{
|
|
"entropy": 5.034459590911865,
|
|
"epoch": 4.249279538904899,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00033137883320110296,
|
|
"loss": 4.4843,
|
|
"mean_token_accuracy": 0.26229616403579714,
|
|
"num_tokens": 101441841.0,
|
|
"step": 44235
|
|
},
|
|
{
|
|
"entropy": 5.034803676605224,
|
|
"epoch": 4.249759846301633,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003313456427116561,
|
|
"loss": 4.5297,
|
|
"mean_token_accuracy": 0.2616858914494514,
|
|
"num_tokens": 101452946.0,
|
|
"step": 44240
|
|
},
|
|
{
|
|
"entropy": 5.074759483337402,
|
|
"epoch": 4.250240153698367,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003313124509140308,
|
|
"loss": 4.5489,
|
|
"mean_token_accuracy": 0.2627064362168312,
|
|
"num_tokens": 101462919.0,
|
|
"step": 44245
|
|
},
|
|
{
|
|
"entropy": 4.9132908344268795,
|
|
"epoch": 4.250720461095101,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00033127925780899754,
|
|
"loss": 4.39,
|
|
"mean_token_accuracy": 0.27239869087934493,
|
|
"num_tokens": 101474718.0,
|
|
"step": 44250
|
|
},
|
|
{
|
|
"entropy": 4.935622882843018,
|
|
"epoch": 4.2512007684918345,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000331246063397327,
|
|
"loss": 4.4785,
|
|
"mean_token_accuracy": 0.2608452528715134,
|
|
"num_tokens": 101487179.0,
|
|
"step": 44255
|
|
},
|
|
{
|
|
"entropy": 4.960782098770141,
|
|
"epoch": 4.251681075888569,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00033121286767979,
|
|
"loss": 4.4951,
|
|
"mean_token_accuracy": 0.26846073716878893,
|
|
"num_tokens": 101498816.0,
|
|
"step": 44260
|
|
},
|
|
{
|
|
"entropy": 4.995786237716675,
|
|
"epoch": 4.252161383285302,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000331179670657157,
|
|
"loss": 4.5202,
|
|
"mean_token_accuracy": 0.26631979942321776,
|
|
"num_tokens": 101510945.0,
|
|
"step": 44265
|
|
},
|
|
{
|
|
"entropy": 5.014561700820923,
|
|
"epoch": 4.252641690682037,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003311464723301989,
|
|
"loss": 4.4758,
|
|
"mean_token_accuracy": 0.26516310721635816,
|
|
"num_tokens": 101522344.0,
|
|
"step": 44270
|
|
},
|
|
{
|
|
"entropy": 4.988385772705078,
|
|
"epoch": 4.25312199807877,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003311132726996866,
|
|
"loss": 4.4943,
|
|
"mean_token_accuracy": 0.2668433874845505,
|
|
"num_tokens": 101534071.0,
|
|
"step": 44275
|
|
},
|
|
{
|
|
"entropy": 4.966341352462768,
|
|
"epoch": 4.253602305475504,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003310800717663905,
|
|
"loss": 4.4996,
|
|
"mean_token_accuracy": 0.2620332643389702,
|
|
"num_tokens": 101546418.0,
|
|
"step": 44280
|
|
},
|
|
{
|
|
"entropy": 4.965083026885987,
|
|
"epoch": 4.254082612872239,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033104686953108173,
|
|
"loss": 4.4914,
|
|
"mean_token_accuracy": 0.2686177998781204,
|
|
"num_tokens": 101559423.0,
|
|
"step": 44285
|
|
},
|
|
{
|
|
"entropy": 5.047569274902344,
|
|
"epoch": 4.254562920268972,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003310136659945311,
|
|
"loss": 4.5124,
|
|
"mean_token_accuracy": 0.2679473370313644,
|
|
"num_tokens": 101570308.0,
|
|
"step": 44290
|
|
},
|
|
{
|
|
"entropy": 4.991769075393677,
|
|
"epoch": 4.255043227665706,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00033098046115750946,
|
|
"loss": 4.4606,
|
|
"mean_token_accuracy": 0.2657899796962738,
|
|
"num_tokens": 101582689.0,
|
|
"step": 44295
|
|
},
|
|
{
|
|
"entropy": 4.935950040817261,
|
|
"epoch": 4.25552353506244,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003309472550207877,
|
|
"loss": 4.3978,
|
|
"mean_token_accuracy": 0.27999392449855803,
|
|
"num_tokens": 101593675.0,
|
|
"step": 44300
|
|
},
|
|
{
|
|
"entropy": 4.923684310913086,
|
|
"epoch": 4.256003842459174,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033091404758513686,
|
|
"loss": 4.4039,
|
|
"mean_token_accuracy": 0.28701927661895754,
|
|
"num_tokens": 101604057.0,
|
|
"step": 44305
|
|
},
|
|
{
|
|
"entropy": 5.00604133605957,
|
|
"epoch": 4.256484149855908,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00033088083885132784,
|
|
"loss": 4.4736,
|
|
"mean_token_accuracy": 0.26669894754886625,
|
|
"num_tokens": 101615063.0,
|
|
"step": 44310
|
|
},
|
|
{
|
|
"entropy": 4.936225557327271,
|
|
"epoch": 4.256964457252642,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00033084762882013176,
|
|
"loss": 4.4261,
|
|
"mean_token_accuracy": 0.2668800488114357,
|
|
"num_tokens": 101626059.0,
|
|
"step": 44315
|
|
},
|
|
{
|
|
"entropy": 5.019136095046997,
|
|
"epoch": 4.257444764649375,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00033081441749231944,
|
|
"loss": 4.5313,
|
|
"mean_token_accuracy": 0.2573040008544922,
|
|
"num_tokens": 101637799.0,
|
|
"step": 44320
|
|
},
|
|
{
|
|
"entropy": 4.957232284545898,
|
|
"epoch": 4.25792507204611,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003307812048686621,
|
|
"loss": 4.4396,
|
|
"mean_token_accuracy": 0.2749749764800072,
|
|
"num_tokens": 101648863.0,
|
|
"step": 44325
|
|
},
|
|
{
|
|
"entropy": 4.965475940704346,
|
|
"epoch": 4.258405379442843,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003307479909499309,
|
|
"loss": 4.4352,
|
|
"mean_token_accuracy": 0.2707765996456146,
|
|
"num_tokens": 101660144.0,
|
|
"step": 44330
|
|
},
|
|
{
|
|
"entropy": 5.04048900604248,
|
|
"epoch": 4.2588856868395775,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003307147757368969,
|
|
"loss": 4.5134,
|
|
"mean_token_accuracy": 0.25994552075862887,
|
|
"num_tokens": 101671388.0,
|
|
"step": 44335
|
|
},
|
|
{
|
|
"entropy": 4.989372968673706,
|
|
"epoch": 4.259365994236311,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00033068155923033114,
|
|
"loss": 4.4887,
|
|
"mean_token_accuracy": 0.2723979726433754,
|
|
"num_tokens": 101682542.0,
|
|
"step": 44340
|
|
},
|
|
{
|
|
"entropy": 5.0091626167297365,
|
|
"epoch": 4.259846301633045,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00033064834143100495,
|
|
"loss": 4.4923,
|
|
"mean_token_accuracy": 0.268682649731636,
|
|
"num_tokens": 101692683.0,
|
|
"step": 44345
|
|
},
|
|
{
|
|
"entropy": 5.025346755981445,
|
|
"epoch": 4.260326609029779,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003306151223396896,
|
|
"loss": 4.5451,
|
|
"mean_token_accuracy": 0.25677322298288346,
|
|
"num_tokens": 101704566.0,
|
|
"step": 44350
|
|
},
|
|
{
|
|
"entropy": 5.015668630599976,
|
|
"epoch": 4.260806916426513,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003305819019571562,
|
|
"loss": 4.5356,
|
|
"mean_token_accuracy": 0.25888221263885497,
|
|
"num_tokens": 101716751.0,
|
|
"step": 44355
|
|
},
|
|
{
|
|
"entropy": 5.067608594894409,
|
|
"epoch": 4.261287223823247,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033054868028417606,
|
|
"loss": 4.5761,
|
|
"mean_token_accuracy": 0.26392951011657717,
|
|
"num_tokens": 101728614.0,
|
|
"step": 44360
|
|
},
|
|
{
|
|
"entropy": 5.056732082366944,
|
|
"epoch": 4.261767531219981,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00033051545732152047,
|
|
"loss": 4.5461,
|
|
"mean_token_accuracy": 0.25912673473358155,
|
|
"num_tokens": 101739740.0,
|
|
"step": 44365
|
|
},
|
|
{
|
|
"entropy": 4.964679098129272,
|
|
"epoch": 4.262247838616715,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003304822330699609,
|
|
"loss": 4.4368,
|
|
"mean_token_accuracy": 0.27551603615283965,
|
|
"num_tokens": 101750971.0,
|
|
"step": 44370
|
|
},
|
|
{
|
|
"entropy": 4.949120664596558,
|
|
"epoch": 4.2627281460134485,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003304490075302686,
|
|
"loss": 4.4535,
|
|
"mean_token_accuracy": 0.2626165196299553,
|
|
"num_tokens": 101762955.0,
|
|
"step": 44375
|
|
},
|
|
{
|
|
"entropy": 4.9361804008483885,
|
|
"epoch": 4.263208453410183,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00033041578070321494,
|
|
"loss": 4.4149,
|
|
"mean_token_accuracy": 0.2755949765443802,
|
|
"num_tokens": 101773314.0,
|
|
"step": 44380
|
|
},
|
|
{
|
|
"entropy": 4.931208229064941,
|
|
"epoch": 4.263688760806916,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003303825525895714,
|
|
"loss": 4.4477,
|
|
"mean_token_accuracy": 0.26606224477291107,
|
|
"num_tokens": 101784006.0,
|
|
"step": 44385
|
|
},
|
|
{
|
|
"entropy": 5.018464708328247,
|
|
"epoch": 4.264169068203651,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.00033034932319010946,
|
|
"loss": 4.4623,
|
|
"mean_token_accuracy": 0.2694112777709961,
|
|
"num_tokens": 101795585.0,
|
|
"step": 44390
|
|
},
|
|
{
|
|
"entropy": 5.060614442825317,
|
|
"epoch": 4.264649375600384,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00033031609250560066,
|
|
"loss": 4.547,
|
|
"mean_token_accuracy": 0.2610408112406731,
|
|
"num_tokens": 101806672.0,
|
|
"step": 44395
|
|
},
|
|
{
|
|
"entropy": 5.086917734146118,
|
|
"epoch": 4.265129682997118,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003302828605368164,
|
|
"loss": 4.5676,
|
|
"mean_token_accuracy": 0.2532142326235771,
|
|
"num_tokens": 101819353.0,
|
|
"step": 44400
|
|
},
|
|
{
|
|
"entropy": 5.063417577743531,
|
|
"epoch": 4.265609990393852,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00033024962728452826,
|
|
"loss": 4.5151,
|
|
"mean_token_accuracy": 0.2607672572135925,
|
|
"num_tokens": 101830739.0,
|
|
"step": 44405
|
|
},
|
|
{
|
|
"entropy": 4.996416711807251,
|
|
"epoch": 4.266090297790586,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003302163927495078,
|
|
"loss": 4.541,
|
|
"mean_token_accuracy": 0.2604985013604164,
|
|
"num_tokens": 101842484.0,
|
|
"step": 44410
|
|
},
|
|
{
|
|
"entropy": 5.019021940231323,
|
|
"epoch": 4.26657060518732,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003301831569325267,
|
|
"loss": 4.5421,
|
|
"mean_token_accuracy": 0.2586344093084335,
|
|
"num_tokens": 101854393.0,
|
|
"step": 44415
|
|
},
|
|
{
|
|
"entropy": 4.960885381698608,
|
|
"epoch": 4.267050912584054,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.00033014991983435654,
|
|
"loss": 4.3923,
|
|
"mean_token_accuracy": 0.2705559849739075,
|
|
"num_tokens": 101866266.0,
|
|
"step": 44420
|
|
},
|
|
{
|
|
"entropy": 4.970264005661011,
|
|
"epoch": 4.267531219980787,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003301166814557691,
|
|
"loss": 4.4199,
|
|
"mean_token_accuracy": 0.2727414220571518,
|
|
"num_tokens": 101876727.0,
|
|
"step": 44425
|
|
},
|
|
{
|
|
"entropy": 5.067277908325195,
|
|
"epoch": 4.268011527377522,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00033008344179753584,
|
|
"loss": 4.5534,
|
|
"mean_token_accuracy": 0.25963193029165266,
|
|
"num_tokens": 101887996.0,
|
|
"step": 44430
|
|
},
|
|
{
|
|
"entropy": 4.99131212234497,
|
|
"epoch": 4.268491834774256,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033005020086042874,
|
|
"loss": 4.5081,
|
|
"mean_token_accuracy": 0.27187336087226865,
|
|
"num_tokens": 101899676.0,
|
|
"step": 44435
|
|
},
|
|
{
|
|
"entropy": 5.007819986343383,
|
|
"epoch": 4.268972142170989,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00033001695864521934,
|
|
"loss": 4.5107,
|
|
"mean_token_accuracy": 0.2647139310836792,
|
|
"num_tokens": 101910709.0,
|
|
"step": 44440
|
|
},
|
|
{
|
|
"entropy": 5.053316259384156,
|
|
"epoch": 4.269452449567724,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00032998371515267954,
|
|
"loss": 4.5426,
|
|
"mean_token_accuracy": 0.2617322266101837,
|
|
"num_tokens": 101921375.0,
|
|
"step": 44445
|
|
},
|
|
{
|
|
"entropy": 5.026913642883301,
|
|
"epoch": 4.269932756964457,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003299504703835812,
|
|
"loss": 4.462,
|
|
"mean_token_accuracy": 0.2657062903046608,
|
|
"num_tokens": 101932899.0,
|
|
"step": 44450
|
|
},
|
|
{
|
|
"entropy": 5.039828634262085,
|
|
"epoch": 4.2704130643611915,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003299172243386961,
|
|
"loss": 4.5504,
|
|
"mean_token_accuracy": 0.2637521132826805,
|
|
"num_tokens": 101943749.0,
|
|
"step": 44455
|
|
},
|
|
{
|
|
"entropy": 4.992611265182495,
|
|
"epoch": 4.270893371757925,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003298839770187961,
|
|
"loss": 4.4962,
|
|
"mean_token_accuracy": 0.26211894303560257,
|
|
"num_tokens": 101955166.0,
|
|
"step": 44460
|
|
},
|
|
{
|
|
"entropy": 5.017980623245239,
|
|
"epoch": 4.271373679154659,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00032985072842465314,
|
|
"loss": 4.5131,
|
|
"mean_token_accuracy": 0.2599871546030045,
|
|
"num_tokens": 101966875.0,
|
|
"step": 44465
|
|
},
|
|
{
|
|
"entropy": 5.111761379241943,
|
|
"epoch": 4.271853986551393,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00032981747855703915,
|
|
"loss": 4.5431,
|
|
"mean_token_accuracy": 0.2569878131151199,
|
|
"num_tokens": 101978137.0,
|
|
"step": 44470
|
|
},
|
|
{
|
|
"entropy": 4.94873685836792,
|
|
"epoch": 4.272334293948127,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00032978422741672603,
|
|
"loss": 4.4406,
|
|
"mean_token_accuracy": 0.26249594837427137,
|
|
"num_tokens": 101988953.0,
|
|
"step": 44475
|
|
},
|
|
{
|
|
"entropy": 5.089495372772217,
|
|
"epoch": 4.2728146013448605,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003297509750044858,
|
|
"loss": 4.5987,
|
|
"mean_token_accuracy": 0.2567941814661026,
|
|
"num_tokens": 102001298.0,
|
|
"step": 44480
|
|
},
|
|
{
|
|
"entropy": 5.032405805587769,
|
|
"epoch": 4.273294908741595,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003297177213210907,
|
|
"loss": 4.4722,
|
|
"mean_token_accuracy": 0.2636980086565018,
|
|
"num_tokens": 102012934.0,
|
|
"step": 44485
|
|
},
|
|
{
|
|
"entropy": 4.986426973342896,
|
|
"epoch": 4.273775216138328,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0003296844663673124,
|
|
"loss": 4.4436,
|
|
"mean_token_accuracy": 0.2743574947118759,
|
|
"num_tokens": 102023863.0,
|
|
"step": 44490
|
|
},
|
|
{
|
|
"entropy": 4.952045059204101,
|
|
"epoch": 4.2742555235350626,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003296512101439232,
|
|
"loss": 4.376,
|
|
"mean_token_accuracy": 0.2749819725751877,
|
|
"num_tokens": 102034586.0,
|
|
"step": 44495
|
|
},
|
|
{
|
|
"entropy": 4.905001449584961,
|
|
"epoch": 4.274735830931796,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00032961795265169525,
|
|
"loss": 4.4361,
|
|
"mean_token_accuracy": 0.2698443368077278,
|
|
"num_tokens": 102044933.0,
|
|
"step": 44500
|
|
},
|
|
{
|
|
"entropy": 5.054914474487305,
|
|
"epoch": 4.27521613832853,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003295846938914005,
|
|
"loss": 4.6201,
|
|
"mean_token_accuracy": 0.2584583401679993,
|
|
"num_tokens": 102056370.0,
|
|
"step": 44505
|
|
},
|
|
{
|
|
"entropy": 5.045637845993042,
|
|
"epoch": 4.275696445725265,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003295514338638114,
|
|
"loss": 4.5264,
|
|
"mean_token_accuracy": 0.25621298253536223,
|
|
"num_tokens": 102067428.0,
|
|
"step": 44510
|
|
},
|
|
{
|
|
"entropy": 4.973778629302979,
|
|
"epoch": 4.276176753121998,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003295181725696999,
|
|
"loss": 4.417,
|
|
"mean_token_accuracy": 0.274395252764225,
|
|
"num_tokens": 102079262.0,
|
|
"step": 44515
|
|
},
|
|
{
|
|
"entropy": 4.955228996276856,
|
|
"epoch": 4.276657060518732,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003294849100098383,
|
|
"loss": 4.4727,
|
|
"mean_token_accuracy": 0.27014304846525194,
|
|
"num_tokens": 102088835.0,
|
|
"step": 44520
|
|
},
|
|
{
|
|
"entropy": 5.061141300201416,
|
|
"epoch": 4.277137367915466,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003294516461849988,
|
|
"loss": 4.5753,
|
|
"mean_token_accuracy": 0.26167816370725633,
|
|
"num_tokens": 102100119.0,
|
|
"step": 44525
|
|
},
|
|
{
|
|
"entropy": 5.039008235931396,
|
|
"epoch": 4.2776176753122,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000329418381095954,
|
|
"loss": 4.5495,
|
|
"mean_token_accuracy": 0.2628825157880783,
|
|
"num_tokens": 102111279.0,
|
|
"step": 44530
|
|
},
|
|
{
|
|
"entropy": 4.959822511672973,
|
|
"epoch": 4.278097982708934,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003293851147434758,
|
|
"loss": 4.4776,
|
|
"mean_token_accuracy": 0.26174134612083433,
|
|
"num_tokens": 102122977.0,
|
|
"step": 44535
|
|
},
|
|
{
|
|
"entropy": 5.072029113769531,
|
|
"epoch": 4.278578290105668,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.00032935184712833674,
|
|
"loss": 4.5819,
|
|
"mean_token_accuracy": 0.26314348727464676,
|
|
"num_tokens": 102135343.0,
|
|
"step": 44540
|
|
},
|
|
{
|
|
"entropy": 5.048179244995117,
|
|
"epoch": 4.279058597502401,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0003293185782513093,
|
|
"loss": 4.4831,
|
|
"mean_token_accuracy": 0.2591738164424896,
|
|
"num_tokens": 102148331.0,
|
|
"step": 44545
|
|
},
|
|
{
|
|
"entropy": 4.994099521636963,
|
|
"epoch": 4.279538904899136,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00032928530811316564,
|
|
"loss": 4.4462,
|
|
"mean_token_accuracy": 0.2717635840177536,
|
|
"num_tokens": 102159720.0,
|
|
"step": 44550
|
|
},
|
|
{
|
|
"entropy": 5.078804969787598,
|
|
"epoch": 4.280019212295869,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003292520367146784,
|
|
"loss": 4.573,
|
|
"mean_token_accuracy": 0.2531072497367859,
|
|
"num_tokens": 102172323.0,
|
|
"step": 44555
|
|
},
|
|
{
|
|
"entropy": 5.004259347915649,
|
|
"epoch": 4.2804995196926034,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0003292187640566199,
|
|
"loss": 4.4387,
|
|
"mean_token_accuracy": 0.27144034653902055,
|
|
"num_tokens": 102183998.0,
|
|
"step": 44560
|
|
},
|
|
{
|
|
"entropy": 5.008700323104859,
|
|
"epoch": 4.280979827089337,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00032918549013976267,
|
|
"loss": 4.4597,
|
|
"mean_token_accuracy": 0.268440280854702,
|
|
"num_tokens": 102196193.0,
|
|
"step": 44565
|
|
},
|
|
{
|
|
"entropy": 4.980321788787842,
|
|
"epoch": 4.281460134486071,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003291522149648794,
|
|
"loss": 4.4182,
|
|
"mean_token_accuracy": 0.2709320679306984,
|
|
"num_tokens": 102208042.0,
|
|
"step": 44570
|
|
},
|
|
{
|
|
"entropy": 5.01020393371582,
|
|
"epoch": 4.281940441882805,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00032911893853274234,
|
|
"loss": 4.5101,
|
|
"mean_token_accuracy": 0.26827905774116517,
|
|
"num_tokens": 102219006.0,
|
|
"step": 44575
|
|
},
|
|
{
|
|
"entropy": 5.005380868911743,
|
|
"epoch": 4.282420749279539,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003290856608441243,
|
|
"loss": 4.4836,
|
|
"mean_token_accuracy": 0.2706980466842651,
|
|
"num_tokens": 102230733.0,
|
|
"step": 44580
|
|
},
|
|
{
|
|
"entropy": 5.027128601074219,
|
|
"epoch": 4.282901056676272,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003290523818997978,
|
|
"loss": 4.5323,
|
|
"mean_token_accuracy": 0.26816424280405043,
|
|
"num_tokens": 102240871.0,
|
|
"step": 44585
|
|
},
|
|
{
|
|
"entropy": 4.970370626449585,
|
|
"epoch": 4.283381364073007,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00032901910170053553,
|
|
"loss": 4.3957,
|
|
"mean_token_accuracy": 0.2709580063819885,
|
|
"num_tokens": 102252427.0,
|
|
"step": 44590
|
|
},
|
|
{
|
|
"entropy": 5.052786540985108,
|
|
"epoch": 4.28386167146974,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003289858202471101,
|
|
"loss": 4.5654,
|
|
"mean_token_accuracy": 0.26533797979354856,
|
|
"num_tokens": 102263576.0,
|
|
"step": 44595
|
|
},
|
|
{
|
|
"entropy": 5.065248727798462,
|
|
"epoch": 4.2843419788664745,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00032895253754029415,
|
|
"loss": 4.4629,
|
|
"mean_token_accuracy": 0.26973864883184434,
|
|
"num_tokens": 102275475.0,
|
|
"step": 44600
|
|
},
|
|
{
|
|
"entropy": 5.015602254867554,
|
|
"epoch": 4.284822286263209,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.00032891925358086063,
|
|
"loss": 4.523,
|
|
"mean_token_accuracy": 0.25974294990301133,
|
|
"num_tokens": 102287757.0,
|
|
"step": 44605
|
|
},
|
|
{
|
|
"entropy": 5.022265291213989,
|
|
"epoch": 4.285302593659942,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00032888596836958206,
|
|
"loss": 4.4428,
|
|
"mean_token_accuracy": 0.2685783445835114,
|
|
"num_tokens": 102298732.0,
|
|
"step": 44610
|
|
},
|
|
{
|
|
"entropy": 5.033925342559814,
|
|
"epoch": 4.285782901056677,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00032885268190723134,
|
|
"loss": 4.5295,
|
|
"mean_token_accuracy": 0.2688254788517952,
|
|
"num_tokens": 102310061.0,
|
|
"step": 44615
|
|
},
|
|
{
|
|
"entropy": 5.032386636734008,
|
|
"epoch": 4.28626320845341,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003288193941945812,
|
|
"loss": 4.5264,
|
|
"mean_token_accuracy": 0.2635366067290306,
|
|
"num_tokens": 102320937.0,
|
|
"step": 44620
|
|
},
|
|
{
|
|
"entropy": 5.047990608215332,
|
|
"epoch": 4.286743515850144,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003287861052324047,
|
|
"loss": 4.5973,
|
|
"mean_token_accuracy": 0.25576501786708833,
|
|
"num_tokens": 102331849.0,
|
|
"step": 44625
|
|
},
|
|
{
|
|
"entropy": 5.037664365768433,
|
|
"epoch": 4.287223823246878,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0003287528150214744,
|
|
"loss": 4.5291,
|
|
"mean_token_accuracy": 0.26406891644001007,
|
|
"num_tokens": 102343504.0,
|
|
"step": 44630
|
|
},
|
|
{
|
|
"entropy": 5.083225584030151,
|
|
"epoch": 4.287704130643612,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00032871952356256336,
|
|
"loss": 4.5428,
|
|
"mean_token_accuracy": 0.2591785669326782,
|
|
"num_tokens": 102354476.0,
|
|
"step": 44635
|
|
},
|
|
{
|
|
"entropy": 4.964792346954345,
|
|
"epoch": 4.2881844380403455,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003286862308564445,
|
|
"loss": 4.4104,
|
|
"mean_token_accuracy": 0.2705155685544014,
|
|
"num_tokens": 102365733.0,
|
|
"step": 44640
|
|
},
|
|
{
|
|
"entropy": 5.080849075317383,
|
|
"epoch": 4.28866474543708,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00032865293690389076,
|
|
"loss": 4.557,
|
|
"mean_token_accuracy": 0.2588011547923088,
|
|
"num_tokens": 102377954.0,
|
|
"step": 44645
|
|
},
|
|
{
|
|
"entropy": 5.108122873306274,
|
|
"epoch": 4.289145052833813,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003286196417056752,
|
|
"loss": 4.5536,
|
|
"mean_token_accuracy": 0.2632875621318817,
|
|
"num_tokens": 102388378.0,
|
|
"step": 44650
|
|
},
|
|
{
|
|
"entropy": 4.964575004577637,
|
|
"epoch": 4.289625360230548,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00032858634526257077,
|
|
"loss": 4.4845,
|
|
"mean_token_accuracy": 0.26559562534093856,
|
|
"num_tokens": 102399702.0,
|
|
"step": 44655
|
|
},
|
|
{
|
|
"entropy": 5.047686386108398,
|
|
"epoch": 4.290105667627281,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0003285530475753505,
|
|
"loss": 4.5024,
|
|
"mean_token_accuracy": 0.2619020029902458,
|
|
"num_tokens": 102410393.0,
|
|
"step": 44660
|
|
},
|
|
{
|
|
"entropy": 5.024641466140747,
|
|
"epoch": 4.290585975024015,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00032851974864478743,
|
|
"loss": 4.5597,
|
|
"mean_token_accuracy": 0.27020933479070663,
|
|
"num_tokens": 102423463.0,
|
|
"step": 44665
|
|
},
|
|
{
|
|
"entropy": 5.018656253814697,
|
|
"epoch": 4.291066282420749,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00032848644847165484,
|
|
"loss": 4.4777,
|
|
"mean_token_accuracy": 0.26871681213378906,
|
|
"num_tokens": 102434822.0,
|
|
"step": 44670
|
|
},
|
|
{
|
|
"entropy": 5.001106405258179,
|
|
"epoch": 4.291546589817483,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0003284531470567256,
|
|
"loss": 4.4439,
|
|
"mean_token_accuracy": 0.269171741604805,
|
|
"num_tokens": 102446302.0,
|
|
"step": 44675
|
|
},
|
|
{
|
|
"entropy": 4.978112077713012,
|
|
"epoch": 4.2920268972142175,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003284198444007731,
|
|
"loss": 4.4846,
|
|
"mean_token_accuracy": 0.26790024191141126,
|
|
"num_tokens": 102456866.0,
|
|
"step": 44680
|
|
},
|
|
{
|
|
"entropy": 5.021739387512207,
|
|
"epoch": 4.292507204610951,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00032838654050457047,
|
|
"loss": 4.5767,
|
|
"mean_token_accuracy": 0.26251809298992157,
|
|
"num_tokens": 102468498.0,
|
|
"step": 44685
|
|
},
|
|
{
|
|
"entropy": 5.001620483398438,
|
|
"epoch": 4.292987512007685,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003283532353688908,
|
|
"loss": 4.4798,
|
|
"mean_token_accuracy": 0.26698437333106995,
|
|
"num_tokens": 102481723.0,
|
|
"step": 44690
|
|
},
|
|
{
|
|
"entropy": 5.04767951965332,
|
|
"epoch": 4.293467819404419,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00032831992899450745,
|
|
"loss": 4.479,
|
|
"mean_token_accuracy": 0.26891845762729644,
|
|
"num_tokens": 102492828.0,
|
|
"step": 44695
|
|
},
|
|
{
|
|
"entropy": 4.984335231781006,
|
|
"epoch": 4.293948126801153,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00032828662138219375,
|
|
"loss": 4.4659,
|
|
"mean_token_accuracy": 0.26880336105823516,
|
|
"num_tokens": 102503261.0,
|
|
"step": 44700
|
|
},
|
|
{
|
|
"entropy": 4.8707643985748295,
|
|
"epoch": 4.2944284341978864,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0003282533125327228,
|
|
"loss": 4.3658,
|
|
"mean_token_accuracy": 0.2748368248343468,
|
|
"num_tokens": 102514383.0,
|
|
"step": 44705
|
|
},
|
|
{
|
|
"entropy": 4.949098014831543,
|
|
"epoch": 4.294908741594621,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003282200024468682,
|
|
"loss": 4.5238,
|
|
"mean_token_accuracy": 0.260147850215435,
|
|
"num_tokens": 102524441.0,
|
|
"step": 44710
|
|
},
|
|
{
|
|
"entropy": 5.055233764648437,
|
|
"epoch": 4.295389048991354,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0003281866911254031,
|
|
"loss": 4.4978,
|
|
"mean_token_accuracy": 0.2651292771100998,
|
|
"num_tokens": 102534785.0,
|
|
"step": 44715
|
|
},
|
|
{
|
|
"entropy": 5.007537460327148,
|
|
"epoch": 4.2958693563880885,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000328153378569101,
|
|
"loss": 4.4187,
|
|
"mean_token_accuracy": 0.27675105035305025,
|
|
"num_tokens": 102546596.0,
|
|
"step": 44720
|
|
},
|
|
{
|
|
"entropy": 4.971066284179687,
|
|
"epoch": 4.296349663784822,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003281200647787352,
|
|
"loss": 4.4418,
|
|
"mean_token_accuracy": 0.2704776033759117,
|
|
"num_tokens": 102557035.0,
|
|
"step": 44725
|
|
},
|
|
{
|
|
"entropy": 5.0283956050872805,
|
|
"epoch": 4.296829971181556,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00032808674975507937,
|
|
"loss": 4.5499,
|
|
"mean_token_accuracy": 0.2614870384335518,
|
|
"num_tokens": 102568580.0,
|
|
"step": 44730
|
|
},
|
|
{
|
|
"entropy": 5.128940916061401,
|
|
"epoch": 4.29731027857829,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003280534334989067,
|
|
"loss": 4.5683,
|
|
"mean_token_accuracy": 0.26175253838300705,
|
|
"num_tokens": 102578858.0,
|
|
"step": 44735
|
|
},
|
|
{
|
|
"entropy": 5.002647781372071,
|
|
"epoch": 4.297790585975024,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00032802011601099095,
|
|
"loss": 4.4951,
|
|
"mean_token_accuracy": 0.26391801089048383,
|
|
"num_tokens": 102590855.0,
|
|
"step": 44740
|
|
},
|
|
{
|
|
"entropy": 4.943107414245605,
|
|
"epoch": 4.2982708933717575,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00032798679729210555,
|
|
"loss": 4.4572,
|
|
"mean_token_accuracy": 0.2687337681651115,
|
|
"num_tokens": 102602548.0,
|
|
"step": 44745
|
|
},
|
|
{
|
|
"entropy": 5.042870283126831,
|
|
"epoch": 4.298751200768492,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000327953477343024,
|
|
"loss": 4.5618,
|
|
"mean_token_accuracy": 0.25561906546354296,
|
|
"num_tokens": 102613635.0,
|
|
"step": 44750
|
|
},
|
|
{
|
|
"entropy": 5.060275411605835,
|
|
"epoch": 4.299231508165226,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00032792015616452,
|
|
"loss": 4.5269,
|
|
"mean_token_accuracy": 0.25407602936029433,
|
|
"num_tokens": 102624958.0,
|
|
"step": 44755
|
|
},
|
|
{
|
|
"entropy": 4.977354192733765,
|
|
"epoch": 4.29971181556196,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.000327886833757367,
|
|
"loss": 4.4427,
|
|
"mean_token_accuracy": 0.26454340666532516,
|
|
"num_tokens": 102635616.0,
|
|
"step": 44760
|
|
},
|
|
{
|
|
"entropy": 4.989741849899292,
|
|
"epoch": 4.300192122958694,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003278535101223388,
|
|
"loss": 4.5115,
|
|
"mean_token_accuracy": 0.2625274360179901,
|
|
"num_tokens": 102646654.0,
|
|
"step": 44765
|
|
},
|
|
{
|
|
"entropy": 5.088791418075561,
|
|
"epoch": 4.300672430355427,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003278201852602091,
|
|
"loss": 4.4893,
|
|
"mean_token_accuracy": 0.2696481630206108,
|
|
"num_tokens": 102657624.0,
|
|
"step": 44770
|
|
},
|
|
{
|
|
"entropy": 5.002238416671753,
|
|
"epoch": 4.301152737752162,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.00032778685917175153,
|
|
"loss": 4.4639,
|
|
"mean_token_accuracy": 0.26547174006700514,
|
|
"num_tokens": 102668931.0,
|
|
"step": 44775
|
|
},
|
|
{
|
|
"entropy": 4.9706440448760985,
|
|
"epoch": 4.301633045148895,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00032775353185773983,
|
|
"loss": 4.4801,
|
|
"mean_token_accuracy": 0.2661172315478325,
|
|
"num_tokens": 102680680.0,
|
|
"step": 44780
|
|
},
|
|
{
|
|
"entropy": 4.990474557876587,
|
|
"epoch": 4.302113352545629,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00032772020331894776,
|
|
"loss": 4.5021,
|
|
"mean_token_accuracy": 0.266573803126812,
|
|
"num_tokens": 102692972.0,
|
|
"step": 44785
|
|
},
|
|
{
|
|
"entropy": 5.059896850585938,
|
|
"epoch": 4.302593659942363,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003276868735561491,
|
|
"loss": 4.4618,
|
|
"mean_token_accuracy": 0.25730509757995607,
|
|
"num_tokens": 102705739.0,
|
|
"step": 44790
|
|
},
|
|
{
|
|
"entropy": 4.996896123886108,
|
|
"epoch": 4.303073967339097,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00032765354257011765,
|
|
"loss": 4.4653,
|
|
"mean_token_accuracy": 0.27276801466941836,
|
|
"num_tokens": 102716368.0,
|
|
"step": 44795
|
|
},
|
|
{
|
|
"entropy": 4.966413450241089,
|
|
"epoch": 4.303554274735831,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.00032762021036162735,
|
|
"loss": 4.5792,
|
|
"mean_token_accuracy": 0.2577990874648094,
|
|
"num_tokens": 102729920.0,
|
|
"step": 44800
|
|
},
|
|
{
|
|
"entropy": 5.068599510192871,
|
|
"epoch": 4.304034582132565,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003275868769314521,
|
|
"loss": 4.5336,
|
|
"mean_token_accuracy": 0.25770464092493056,
|
|
"num_tokens": 102740374.0,
|
|
"step": 44805
|
|
},
|
|
{
|
|
"entropy": 5.062683820724487,
|
|
"epoch": 4.304514889529298,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00032755354228036554,
|
|
"loss": 4.555,
|
|
"mean_token_accuracy": 0.2546397089958191,
|
|
"num_tokens": 102750820.0,
|
|
"step": 44810
|
|
},
|
|
{
|
|
"entropy": 5.042868709564209,
|
|
"epoch": 4.304995196926033,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003275202064091419,
|
|
"loss": 4.5226,
|
|
"mean_token_accuracy": 0.26547911465168,
|
|
"num_tokens": 102761312.0,
|
|
"step": 44815
|
|
},
|
|
{
|
|
"entropy": 4.997518730163574,
|
|
"epoch": 4.305475504322766,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00032748686931855495,
|
|
"loss": 4.5005,
|
|
"mean_token_accuracy": 0.2666341751813889,
|
|
"num_tokens": 102772288.0,
|
|
"step": 44820
|
|
},
|
|
{
|
|
"entropy": 5.000406408309937,
|
|
"epoch": 4.3059558117195005,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0003274535310093788,
|
|
"loss": 4.4418,
|
|
"mean_token_accuracy": 0.26716314256191254,
|
|
"num_tokens": 102783533.0,
|
|
"step": 44825
|
|
},
|
|
{
|
|
"entropy": 5.024941635131836,
|
|
"epoch": 4.306436119116235,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003274201914823874,
|
|
"loss": 4.5178,
|
|
"mean_token_accuracy": 0.2684862017631531,
|
|
"num_tokens": 102795940.0,
|
|
"step": 44830
|
|
},
|
|
{
|
|
"entropy": 4.978396034240722,
|
|
"epoch": 4.306916426512968,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00032738685073835475,
|
|
"loss": 4.4986,
|
|
"mean_token_accuracy": 0.26720222681760786,
|
|
"num_tokens": 102807507.0,
|
|
"step": 44835
|
|
},
|
|
{
|
|
"entropy": 4.9635289192199705,
|
|
"epoch": 4.307396733909703,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000327353508778055,
|
|
"loss": 4.4137,
|
|
"mean_token_accuracy": 0.27262254655361173,
|
|
"num_tokens": 102817895.0,
|
|
"step": 44840
|
|
},
|
|
{
|
|
"entropy": 5.035701560974121,
|
|
"epoch": 4.307877041306436,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.00032732016560226235,
|
|
"loss": 4.5012,
|
|
"mean_token_accuracy": 0.25672981441020964,
|
|
"num_tokens": 102829141.0,
|
|
"step": 44845
|
|
},
|
|
{
|
|
"entropy": 4.993556547164917,
|
|
"epoch": 4.30835734870317,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00032728682121175077,
|
|
"loss": 4.5057,
|
|
"mean_token_accuracy": 0.26659169644117353,
|
|
"num_tokens": 102841488.0,
|
|
"step": 44850
|
|
},
|
|
{
|
|
"entropy": 4.9918036460876465,
|
|
"epoch": 4.308837656099904,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00032725347560729433,
|
|
"loss": 4.5211,
|
|
"mean_token_accuracy": 0.26454724818468095,
|
|
"num_tokens": 102853276.0,
|
|
"step": 44855
|
|
},
|
|
{
|
|
"entropy": 4.969988918304443,
|
|
"epoch": 4.309317963496638,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00032722012878966744,
|
|
"loss": 4.489,
|
|
"mean_token_accuracy": 0.2646226927638054,
|
|
"num_tokens": 102864850.0,
|
|
"step": 44860
|
|
},
|
|
{
|
|
"entropy": 5.005258178710937,
|
|
"epoch": 4.3097982708933715,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00032718678075964425,
|
|
"loss": 4.5261,
|
|
"mean_token_accuracy": 0.25914099514484407,
|
|
"num_tokens": 102876692.0,
|
|
"step": 44865
|
|
},
|
|
{
|
|
"entropy": 5.011108684539795,
|
|
"epoch": 4.310278578290106,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0003271534315179989,
|
|
"loss": 4.4802,
|
|
"mean_token_accuracy": 0.2673538625240326,
|
|
"num_tokens": 102887693.0,
|
|
"step": 44870
|
|
},
|
|
{
|
|
"entropy": 4.988899517059326,
|
|
"epoch": 4.310758885686839,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00032712008106550584,
|
|
"loss": 4.5178,
|
|
"mean_token_accuracy": 0.26537582129240034,
|
|
"num_tokens": 102900493.0,
|
|
"step": 44875
|
|
},
|
|
{
|
|
"entropy": 5.050619506835938,
|
|
"epoch": 4.311239193083574,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.00032708672940293923,
|
|
"loss": 4.5215,
|
|
"mean_token_accuracy": 0.2573821634054184,
|
|
"num_tokens": 102911983.0,
|
|
"step": 44880
|
|
},
|
|
{
|
|
"entropy": 4.990151309967041,
|
|
"epoch": 4.311719500480307,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00032705337653107336,
|
|
"loss": 4.4374,
|
|
"mean_token_accuracy": 0.2761127784848213,
|
|
"num_tokens": 102923204.0,
|
|
"step": 44885
|
|
},
|
|
{
|
|
"entropy": 5.017802953720093,
|
|
"epoch": 4.312199807877041,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00032702002245068267,
|
|
"loss": 4.551,
|
|
"mean_token_accuracy": 0.25716054886579515,
|
|
"num_tokens": 102934091.0,
|
|
"step": 44890
|
|
},
|
|
{
|
|
"entropy": 4.932206439971924,
|
|
"epoch": 4.312680115273775,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00032698666716254155,
|
|
"loss": 4.4045,
|
|
"mean_token_accuracy": 0.2706180945038795,
|
|
"num_tokens": 102946044.0,
|
|
"step": 44895
|
|
},
|
|
{
|
|
"entropy": 5.112581253051758,
|
|
"epoch": 4.313160422670509,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003269533106674244,
|
|
"loss": 4.6523,
|
|
"mean_token_accuracy": 0.25425046533346174,
|
|
"num_tokens": 102957399.0,
|
|
"step": 44900
|
|
},
|
|
{
|
|
"entropy": 5.008420944213867,
|
|
"epoch": 4.3136407300672435,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00032691995296610566,
|
|
"loss": 4.4652,
|
|
"mean_token_accuracy": 0.2707404553890228,
|
|
"num_tokens": 102968757.0,
|
|
"step": 44905
|
|
},
|
|
{
|
|
"entropy": 4.97296404838562,
|
|
"epoch": 4.314121037463977,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00032688659405935973,
|
|
"loss": 4.4214,
|
|
"mean_token_accuracy": 0.26737648248672485,
|
|
"num_tokens": 102980128.0,
|
|
"step": 44910
|
|
},
|
|
{
|
|
"entropy": 5.037271022796631,
|
|
"epoch": 4.314601344860711,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003268532339479612,
|
|
"loss": 4.5436,
|
|
"mean_token_accuracy": 0.26030019074678423,
|
|
"num_tokens": 102990951.0,
|
|
"step": 44915
|
|
},
|
|
{
|
|
"entropy": 4.963161611557007,
|
|
"epoch": 4.315081652257445,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0003268198726326846,
|
|
"loss": 4.4852,
|
|
"mean_token_accuracy": 0.2645331859588623,
|
|
"num_tokens": 103003911.0,
|
|
"step": 44920
|
|
},
|
|
{
|
|
"entropy": 5.040442037582397,
|
|
"epoch": 4.315561959654179,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003267865101143043,
|
|
"loss": 4.4924,
|
|
"mean_token_accuracy": 0.26624881476163864,
|
|
"num_tokens": 103015584.0,
|
|
"step": 44925
|
|
},
|
|
{
|
|
"entropy": 5.047384929656983,
|
|
"epoch": 4.316042267050912,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00032675314639359505,
|
|
"loss": 4.5186,
|
|
"mean_token_accuracy": 0.27332261204719543,
|
|
"num_tokens": 103027632.0,
|
|
"step": 44930
|
|
},
|
|
{
|
|
"entropy": 5.018669509887696,
|
|
"epoch": 4.316522574447647,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003267197814713315,
|
|
"loss": 4.4931,
|
|
"mean_token_accuracy": 0.2633676141500473,
|
|
"num_tokens": 103037784.0,
|
|
"step": 44935
|
|
},
|
|
{
|
|
"entropy": 4.975539875030518,
|
|
"epoch": 4.31700288184438,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00032668641534828807,
|
|
"loss": 4.4644,
|
|
"mean_token_accuracy": 0.26633958220481874,
|
|
"num_tokens": 103048753.0,
|
|
"step": 44940
|
|
},
|
|
{
|
|
"entropy": 5.001855897903442,
|
|
"epoch": 4.3174831892411145,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.00032665304802523965,
|
|
"loss": 4.5322,
|
|
"mean_token_accuracy": 0.2625934675335884,
|
|
"num_tokens": 103061372.0,
|
|
"step": 44945
|
|
},
|
|
{
|
|
"entropy": 5.000568008422851,
|
|
"epoch": 4.317963496637848,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00032661967950296084,
|
|
"loss": 4.4943,
|
|
"mean_token_accuracy": 0.26570951342582705,
|
|
"num_tokens": 103074791.0,
|
|
"step": 44950
|
|
},
|
|
{
|
|
"entropy": 5.022289419174195,
|
|
"epoch": 4.318443804034582,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0003265863097822263,
|
|
"loss": 4.5052,
|
|
"mean_token_accuracy": 0.26863654255867003,
|
|
"num_tokens": 103086790.0,
|
|
"step": 44955
|
|
},
|
|
{
|
|
"entropy": 5.006827354431152,
|
|
"epoch": 4.318924111431316,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00032655293886381083,
|
|
"loss": 4.4898,
|
|
"mean_token_accuracy": 0.26330197751522066,
|
|
"num_tokens": 103097819.0,
|
|
"step": 44960
|
|
},
|
|
{
|
|
"entropy": 5.018746328353882,
|
|
"epoch": 4.31940441882805,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0003265195667484892,
|
|
"loss": 4.5583,
|
|
"mean_token_accuracy": 0.26424901485443114,
|
|
"num_tokens": 103110071.0,
|
|
"step": 44965
|
|
},
|
|
{
|
|
"entropy": 4.987532901763916,
|
|
"epoch": 4.3198847262247835,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.00032648619343703626,
|
|
"loss": 4.4926,
|
|
"mean_token_accuracy": 0.2724263921380043,
|
|
"num_tokens": 103122586.0,
|
|
"step": 44970
|
|
},
|
|
{
|
|
"entropy": 5.029897928237915,
|
|
"epoch": 4.320365033621518,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0003264528189302267,
|
|
"loss": 4.4844,
|
|
"mean_token_accuracy": 0.265949210524559,
|
|
"num_tokens": 103133531.0,
|
|
"step": 44975
|
|
},
|
|
{
|
|
"entropy": 4.937599563598633,
|
|
"epoch": 4.320845341018252,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00032641944322883557,
|
|
"loss": 4.415,
|
|
"mean_token_accuracy": 0.2788311868906021,
|
|
"num_tokens": 103144099.0,
|
|
"step": 44980
|
|
},
|
|
{
|
|
"entropy": 4.934232425689697,
|
|
"epoch": 4.321325648414986,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0003263860663336376,
|
|
"loss": 4.4529,
|
|
"mean_token_accuracy": 0.2653230145573616,
|
|
"num_tokens": 103156221.0,
|
|
"step": 44985
|
|
},
|
|
{
|
|
"entropy": 4.975247812271118,
|
|
"epoch": 4.32180595581172,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00032635268824540774,
|
|
"loss": 4.414,
|
|
"mean_token_accuracy": 0.2749579444527626,
|
|
"num_tokens": 103167121.0,
|
|
"step": 44990
|
|
},
|
|
{
|
|
"entropy": 5.12546854019165,
|
|
"epoch": 4.322286263208453,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00032631930896492097,
|
|
"loss": 4.5867,
|
|
"mean_token_accuracy": 0.2545983135700226,
|
|
"num_tokens": 103178169.0,
|
|
"step": 44995
|
|
},
|
|
{
|
|
"entropy": 5.034695625305176,
|
|
"epoch": 4.322766570605188,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0003262859284929522,
|
|
"loss": 4.5359,
|
|
"mean_token_accuracy": 0.26198423504829405,
|
|
"num_tokens": 103190790.0,
|
|
"step": 45000
|
|
},
|
|
{
|
|
"epoch": 4.322766570605188,
|
|
"eval_entropy": 4.832867727402883,
|
|
"eval_loss": 4.6727705001831055,
|
|
"eval_mean_token_accuracy": 0.262444738599257,
|
|
"eval_num_tokens": 103190790.0,
|
|
"eval_runtime": 26.6519,
|
|
"eval_samples_per_second": 1231.243,
|
|
"eval_steps_per_second": 153.91,
|
|
"step": 45000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 104090,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 3000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.57713831212544e+17,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|