{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 378,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.5677337646484375,
      "epoch": 0.007936507936507936,
      "grad_norm": 5.825922321576571,
      "learning_rate": 0.0,
      "loss": 1.3956,
      "mean_token_accuracy": 0.6547382255084813,
      "num_tokens": 849869.0,
      "step": 1
    },
    {
      "entropy": 0.569549560546875,
      "epoch": 0.015873015873015872,
      "grad_norm": 5.801234157189965,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 1.4001,
      "mean_token_accuracy": 0.6515501267276704,
      "num_tokens": 1710146.0,
      "step": 2
    },
    {
      "entropy": 0.5735321044921875,
      "epoch": 0.023809523809523808,
      "grad_norm": 5.662678552390535,
      "learning_rate": 2.105263157894737e-06,
      "loss": 1.3808,
      "mean_token_accuracy": 0.6574624702334404,
      "num_tokens": 2560005.0,
      "step": 3
    },
    {
      "entropy": 0.5650634765625,
      "epoch": 0.031746031746031744,
      "grad_norm": 5.556776721246513,
      "learning_rate": 3.157894736842105e-06,
      "loss": 1.3916,
      "mean_token_accuracy": 0.6538396221585572,
      "num_tokens": 3457966.0,
      "step": 4
    },
    {
      "entropy": 0.57452392578125,
      "epoch": 0.03968253968253968,
      "grad_norm": 5.365978906848366,
      "learning_rate": 4.210526315789474e-06,
      "loss": 1.3703,
      "mean_token_accuracy": 0.6564755998551846,
      "num_tokens": 4321827.0,
      "step": 5
    },
    {
      "entropy": 0.5656585693359375,
      "epoch": 0.047619047619047616,
      "grad_norm": 4.396681219826731,
      "learning_rate": 5.263157894736842e-06,
      "loss": 1.293,
      "mean_token_accuracy": 0.6733334762975574,
      "num_tokens": 5188122.0,
      "step": 6
    },
    {
      "entropy": 0.5627288818359375,
      "epoch": 0.05555555555555555,
      "grad_norm": 3.422587251657164,
      "learning_rate": 6.31578947368421e-06,
      "loss": 1.2025,
      "mean_token_accuracy": 0.6850569075904787,
      "num_tokens": 6042413.0,
      "step": 7
    },
    {
      "entropy": 0.5742950439453125,
      "epoch": 0.06349206349206349,
      "grad_norm": 3.305712030477459,
      "learning_rate": 7.368421052631579e-06,
      "loss": 1.1674,
      "mean_token_accuracy": 0.6904466319829226,
      "num_tokens": 6898441.0,
      "step": 8
    },
    {
      "entropy": 0.5521697998046875,
      "epoch": 0.07142857142857142,
      "grad_norm": 3.0042279135772834,
      "learning_rate": 8.421052631578948e-06,
      "loss": 1.1149,
      "mean_token_accuracy": 0.700476243160665,
      "num_tokens": 7794638.0,
      "step": 9
    },
    {
      "entropy": 0.5288543701171875,
      "epoch": 0.07936507936507936,
      "grad_norm": 5.681857010760831,
      "learning_rate": 9.473684210526315e-06,
      "loss": 1.0692,
      "mean_token_accuracy": 0.7063358542509377,
      "num_tokens": 8673402.0,
      "step": 10
    },
    {
      "entropy": 0.536529541015625,
      "epoch": 0.0873015873015873,
      "grad_norm": 4.63333613929414,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 1.0074,
      "mean_token_accuracy": 0.7214855612255633,
      "num_tokens": 9525436.0,
      "step": 11
    },
    {
      "entropy": 0.549041748046875,
      "epoch": 0.09523809523809523,
      "grad_norm": 3.571284227430379,
      "learning_rate": 1.1578947368421053e-05,
      "loss": 0.9655,
      "mean_token_accuracy": 0.7288991836830974,
      "num_tokens": 10358777.0,
      "step": 12
    },
    {
      "entropy": 0.5587310791015625,
      "epoch": 0.10317460317460317,
      "grad_norm": 5.5475348990467035,
      "learning_rate": 1.263157894736842e-05,
      "loss": 0.976,
      "mean_token_accuracy": 0.7247777748852968,
      "num_tokens": 11211677.0,
      "step": 13
    },
    {
      "entropy": 0.5686798095703125,
      "epoch": 0.1111111111111111,
      "grad_norm": 4.953437827171886,
      "learning_rate": 1.3684210526315791e-05,
      "loss": 0.9599,
      "mean_token_accuracy": 0.7281729411333799,
      "num_tokens": 12067363.0,
      "step": 14
    },
    {
      "entropy": 0.5579376220703125,
      "epoch": 0.11904761904761904,
      "grad_norm": 3.345315151683028,
      "learning_rate": 1.4736842105263159e-05,
      "loss": 0.9403,
      "mean_token_accuracy": 0.7340105604380369,
      "num_tokens": 12945458.0,
      "step": 15
    },
    {
      "entropy": 0.564178466796875,
      "epoch": 0.12698412698412698,
      "grad_norm": 3.080260033569176,
      "learning_rate": 1.578947368421053e-05,
      "loss": 0.8993,
      "mean_token_accuracy": 0.7429266143590212,
      "num_tokens": 13815066.0,
      "step": 16
    },
    {
      "entropy": 0.55816650390625,
      "epoch": 0.1349206349206349,
      "grad_norm": 3.077777207658634,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 0.881,
      "mean_token_accuracy": 0.7466331031173468,
      "num_tokens": 14685173.0,
      "step": 17
    },
    {
      "entropy": 0.5535430908203125,
      "epoch": 0.14285714285714285,
      "grad_norm": 2.5640577389793164,
      "learning_rate": 1.7894736842105264e-05,
      "loss": 0.8439,
      "mean_token_accuracy": 0.7535499525256455,
      "num_tokens": 15522062.0,
      "step": 18
    },
    {
      "entropy": 0.5429534912109375,
      "epoch": 0.15079365079365079,
      "grad_norm": 2.4128378954630563,
      "learning_rate": 1.894736842105263e-05,
      "loss": 0.815,
      "mean_token_accuracy": 0.7603332437574863,
      "num_tokens": 16388252.0,
      "step": 19
    },
    {
      "entropy": 0.5367584228515625,
      "epoch": 0.15873015873015872,
      "grad_norm": 2.2585216462856423,
      "learning_rate": 2e-05,
      "loss": 0.7944,
      "mean_token_accuracy": 0.764781333040446,
      "num_tokens": 17235205.0,
      "step": 20
    },
    {
      "entropy": 0.531982421875,
      "epoch": 0.16666666666666666,
      "grad_norm": 2.174159674202976,
      "learning_rate": 1.999961710642308e-05,
      "loss": 0.7779,
      "mean_token_accuracy": 0.7687827018089592,
      "num_tokens": 18090069.0,
      "step": 21
    },
    {
      "entropy": 0.53106689453125,
      "epoch": 0.1746031746031746,
      "grad_norm": 1.99421990181621,
      "learning_rate": 1.9998468455013825e-05,
      "loss": 0.7668,
      "mean_token_accuracy": 0.7712001241743565,
      "num_tokens": 18955962.0,
      "step": 22
    },
    {
      "entropy": 0.5294189453125,
      "epoch": 0.18253968253968253,
      "grad_norm": 1.6261845638000727,
      "learning_rate": 1.9996554133734473e-05,
      "loss": 0.7311,
      "mean_token_accuracy": 0.7804098608903587,
      "num_tokens": 19812261.0,
      "step": 23
    },
    {
      "entropy": 0.531005859375,
      "epoch": 0.19047619047619047,
      "grad_norm": 2.0323147143629425,
      "learning_rate": 1.99938742891813e-05,
      "loss": 0.7211,
      "mean_token_accuracy": 0.7803159174509346,
      "num_tokens": 20647709.0,
      "step": 24
    },
    {
      "entropy": 0.52178955078125,
      "epoch": 0.1984126984126984,
      "grad_norm": 1.9552611093327645,
      "learning_rate": 1.9990429126573353e-05,
      "loss": 0.7179,
      "mean_token_accuracy": 0.7824284308589995,
      "num_tokens": 21486371.0,
      "step": 25
    },
    {
      "entropy": 0.54156494140625,
      "epoch": 0.20634920634920634,
      "grad_norm": 1.7653002599490228,
      "learning_rate": 1.9986218909736758e-05,
      "loss": 0.7017,
      "mean_token_accuracy": 0.782297340221703,
      "num_tokens": 22318414.0,
      "step": 26
    },
    {
      "entropy": 0.5255889892578125,
      "epoch": 0.21428571428571427,
      "grad_norm": 1.4137706795116607,
      "learning_rate": 1.9981243961084516e-05,
      "loss": 0.6751,
      "mean_token_accuracy": 0.789531962480396,
      "num_tokens": 23134604.0,
      "step": 27
    },
    {
      "entropy": 0.5037689208984375,
      "epoch": 0.2222222222222222,
      "grad_norm": 1.3650570766755068,
      "learning_rate": 1.99755046615918e-05,
      "loss": 0.6794,
      "mean_token_accuracy": 0.789926297031343,
      "num_tokens": 24009770.0,
      "step": 28
    },
    {
      "entropy": 0.509796142578125,
      "epoch": 0.23015873015873015,
      "grad_norm": 1.644136924578907,
      "learning_rate": 1.9969001450766795e-05,
      "loss": 0.6845,
      "mean_token_accuracy": 0.7879371428862214,
      "num_tokens": 24869806.0,
      "step": 29
    },
    {
      "entropy": 0.4919281005859375,
      "epoch": 0.23809523809523808,
      "grad_norm": 1.6577988199334892,
      "learning_rate": 1.9961734826617033e-05,
      "loss": 0.6806,
      "mean_token_accuracy": 0.7895815866068006,
      "num_tokens": 25762999.0,
      "step": 30
    },
    {
      "entropy": 0.4969482421875,
      "epoch": 0.24603174603174602,
      "grad_norm": 1.2984344326991837,
      "learning_rate": 1.995370534561125e-05,
      "loss": 0.6515,
      "mean_token_accuracy": 0.7961041065864265,
      "num_tokens": 26651412.0,
      "step": 31
    },
    {
      "entropy": 0.4962921142578125,
      "epoch": 0.25396825396825395,
      "grad_norm": 1.4360495249533112,
      "learning_rate": 1.9944913622636798e-05,
      "loss": 0.6523,
      "mean_token_accuracy": 0.7966451491229236,
      "num_tokens": 27520069.0,
      "step": 32
    },
    {
      "entropy": 0.4964752197265625,
      "epoch": 0.2619047619047619,
      "grad_norm": 1.5230137669636878,
      "learning_rate": 1.993536033095252e-05,
      "loss": 0.6461,
      "mean_token_accuracy": 0.7965357885695994,
      "num_tokens": 28367541.0,
      "step": 33
    },
    {
      "entropy": 0.4960174560546875,
      "epoch": 0.2698412698412698,
      "grad_norm": 1.3174931583623266,
      "learning_rate": 1.9925046202137215e-05,
      "loss": 0.6252,
      "mean_token_accuracy": 0.8011059854179621,
      "num_tokens": 29217570.0,
      "step": 34
    },
    {
      "entropy": 0.48931884765625,
      "epoch": 0.2777777777777778,
      "grad_norm": 1.5851822977243282,
      "learning_rate": 1.991397202603363e-05,
      "loss": 0.6297,
      "mean_token_accuracy": 0.8029140271246433,
      "num_tokens": 30088869.0,
      "step": 35
    },
    {
      "entropy": 0.4861297607421875,
      "epoch": 0.2857142857142857,
      "grad_norm": 1.1284230135515079,
      "learning_rate": 1.9902138650687943e-05,
      "loss": 0.6252,
      "mean_token_accuracy": 0.8036042922176421,
      "num_tokens": 30959821.0,
      "step": 36
    },
    {
      "entropy": 0.487884521484375,
      "epoch": 0.29365079365079366,
      "grad_norm": 1.4212003300454723,
      "learning_rate": 1.9889546982284833e-05,
      "loss": 0.6302,
      "mean_token_accuracy": 0.8021468138322234,
      "num_tokens": 31830767.0,
      "step": 37
    },
    {
      "entropy": 0.4900970458984375,
      "epoch": 0.30158730158730157,
      "grad_norm": 1.3590880297462014,
      "learning_rate": 1.987619798507809e-05,
      "loss": 0.6172,
      "mean_token_accuracy": 0.803361180704087,
      "num_tokens": 32692726.0,
      "step": 38
    },
    {
      "entropy": 0.4883575439453125,
      "epoch": 0.30952380952380953,
      "grad_norm": 1.0740519369196173,
      "learning_rate": 1.9862092681316774e-05,
      "loss": 0.5925,
      "mean_token_accuracy": 0.81046880222857,
      "num_tokens": 33543076.0,
      "step": 39
    },
    {
      "entropy": 0.487884521484375,
      "epoch": 0.31746031746031744,
      "grad_norm": 1.2402102211054793,
      "learning_rate": 1.984723215116693e-05,
      "loss": 0.6003,
      "mean_token_accuracy": 0.8089331048540771,
      "num_tokens": 34404352.0,
      "step": 40
    },
    {
      "entropy": 0.49627685546875,
      "epoch": 0.3253968253968254,
      "grad_norm": 1.2006299948645196,
      "learning_rate": 1.983161753262886e-05,
      "loss": 0.6025,
      "mean_token_accuracy": 0.8069734866730869,
      "num_tokens": 35236933.0,
      "step": 41
    },
    {
      "entropy": 0.485504150390625,
      "epoch": 0.3333333333333333,
      "grad_norm": 1.0380668548649223,
      "learning_rate": 1.9815250021449998e-05,
      "loss": 0.5956,
      "mean_token_accuracy": 0.8101956453174353,
      "num_tokens": 36088050.0,
      "step": 42
    },
    {
      "entropy": 0.4821624755859375,
      "epoch": 0.3412698412698413,
      "grad_norm": 1.2775189629937171,
      "learning_rate": 1.9798130871033322e-05,
      "loss": 0.5916,
      "mean_token_accuracy": 0.8094993168488145,
      "num_tokens": 36942407.0,
      "step": 43
    },
    {
      "entropy": 0.4796142578125,
      "epoch": 0.3492063492063492,
      "grad_norm": 1.403908049116242,
      "learning_rate": 1.9780261392341383e-05,
      "loss": 0.5945,
      "mean_token_accuracy": 0.8099073590710759,
      "num_tokens": 37803882.0,
      "step": 44
    },
    {
      "entropy": 0.4794769287109375,
      "epoch": 0.35714285714285715,
      "grad_norm": 1.28597257343405,
      "learning_rate": 1.9761642953795896e-05,
      "loss": 0.5943,
      "mean_token_accuracy": 0.809020611923188,
      "num_tokens": 38667329.0,
      "step": 45
    },
    {
      "entropy": 0.4782257080078125,
      "epoch": 0.36507936507936506,
      "grad_norm": 1.0785371718889085,
      "learning_rate": 1.9742276981172978e-05,
      "loss": 0.5797,
      "mean_token_accuracy": 0.8110241792164743,
      "num_tokens": 39524166.0,
      "step": 46
    },
    {
      "entropy": 0.473052978515625,
      "epoch": 0.373015873015873,
      "grad_norm": 1.0279641554931107,
      "learning_rate": 1.9722164957493925e-05,
      "loss": 0.5723,
      "mean_token_accuracy": 0.814803515560925,
      "num_tokens": 40389693.0,
      "step": 47
    },
    {
      "entropy": 0.4784088134765625,
      "epoch": 0.38095238095238093,
      "grad_norm": 0.9899645288539431,
      "learning_rate": 1.9701308422911674e-05,
      "loss": 0.5763,
      "mean_token_accuracy": 0.813154571224004,
      "num_tokens": 41231841.0,
      "step": 48
    },
    {
      "entropy": 0.4776763916015625,
      "epoch": 0.3888888888888889,
      "grad_norm": 1.0848064904660297,
      "learning_rate": 1.967970897459286e-05,
      "loss": 0.5785,
      "mean_token_accuracy": 0.8125878237187862,
      "num_tokens": 42082897.0,
      "step": 49
    },
    {
      "entropy": 0.47198486328125,
      "epoch": 0.3968253968253968,
      "grad_norm": 1.0914454254108568,
      "learning_rate": 1.9657368266595477e-05,
      "loss": 0.5584,
      "mean_token_accuracy": 0.8166225766763091,
      "num_tokens": 42941458.0,
      "step": 50
    },
{
|
||
|
|
"entropy": 0.469757080078125,
|
||
|
|
"epoch": 0.40476190476190477,
|
||
|
|
"grad_norm": 1.0811685110199658,
|
||
|
|
"learning_rate": 1.9634288009742254e-05,
|
||
|
|
"loss": 0.5613,
|
||
|
|
"mean_token_accuracy": 0.8169586607255042,
|
||
|
|
"num_tokens": 43801380.0,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.469635009765625,
|
||
|
|
"epoch": 0.4126984126984127,
|
||
|
|
"grad_norm": 1.0762575387658044,
|
||
|
|
"learning_rate": 1.961046997148961e-05,
|
||
|
|
"loss": 0.5745,
|
||
|
|
"mean_token_accuracy": 0.8137071407400072,
|
||
|
|
"num_tokens": 44671335.0,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47747802734375,
|
||
|
|
"epoch": 0.42063492063492064,
|
||
|
|
"grad_norm": 1.0897325457635905,
|
||
|
|
"learning_rate": 1.958591597579231e-05,
|
||
|
|
"loss": 0.5645,
|
||
|
|
"mean_token_accuracy": 0.814334771130234,
|
||
|
|
"num_tokens": 45508166.0,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4644012451171875,
|
||
|
|
"epoch": 0.42857142857142855,
|
||
|
|
"grad_norm": 1.0941123611019024,
|
||
|
|
"learning_rate": 1.9560627902963808e-05,
|
||
|
|
"loss": 0.5732,
|
||
|
|
"mean_token_accuracy": 0.8126041651703417,
|
||
|
|
"num_tokens": 46398974.0,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.461273193359375,
|
||
|
|
"epoch": 0.4365079365079365,
|
||
|
|
"grad_norm": 1.1949513128743081,
|
||
|
|
"learning_rate": 1.9534607689532236e-05,
|
||
|
|
"loss": 0.5746,
|
||
|
|
"mean_token_accuracy": 0.8108557453379035,
|
||
|
|
"num_tokens": 47311746.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4606475830078125,
|
||
|
|
"epoch": 0.4444444444444444,
|
||
|
|
"grad_norm": 1.0419520721819582,
|
||
|
|
"learning_rate": 1.950785732809211e-05,
|
||
|
|
"loss": 0.5489,
|
||
|
|
"mean_token_accuracy": 0.8197311628609896,
|
||
|
|
"num_tokens": 48177761.0,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4680938720703125,
|
||
|
|
"epoch": 0.4523809523809524,
|
||
|
|
"grad_norm": 1.2749921802895587,
|
||
|
|
"learning_rate": 1.9480378867151746e-05,
|
||
|
|
"loss": 0.5568,
|
||
|
|
"mean_token_accuracy": 0.8179731853306293,
|
||
|
|
"num_tokens": 49026375.0,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4620361328125,
|
||
|
|
"epoch": 0.4603174603174603,
|
||
|
|
"grad_norm": 1.0787103519924253,
|
||
|
|
"learning_rate": 1.9452174410976383e-05,
|
||
|
|
"loss": 0.5613,
|
||
|
|
"mean_token_accuracy": 0.8137976322323084,
|
||
|
|
"num_tokens": 49889163.0,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.467193603515625,
|
||
|
|
"epoch": 0.46825396825396826,
|
||
|
|
"grad_norm": 1.1592461490839407,
|
||
|
|
"learning_rate": 1.9423246119427044e-05,
|
||
|
|
"loss": 0.5544,
|
||
|
|
"mean_token_accuracy": 0.8175857574678957,
|
||
|
|
"num_tokens": 50735361.0,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47021484375,
|
||
|
|
"epoch": 0.47619047619047616,
|
||
|
|
"grad_norm": 1.0574063397814852,
|
||
|
|
"learning_rate": 1.9393596207795135e-05,
|
||
|
|
"loss": 0.5451,
|
||
|
|
"mean_token_accuracy": 0.8189192642457783,
|
||
|
|
"num_tokens": 51597577.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4606781005859375,
|
||
|
|
"epoch": 0.48412698412698413,
|
||
|
|
"grad_norm": 1.1516456097619052,
|
||
|
|
"learning_rate": 1.93632269466328e-05,
|
||
|
|
"loss": 0.5556,
|
||
|
|
"mean_token_accuracy": 0.8182462360709906,
|
||
|
|
"num_tokens": 52482382.0,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.458343505859375,
|
||
|
|
"epoch": 0.49206349206349204,
|
||
|
|
"grad_norm": 1.1488599206834784,
|
||
|
|
"learning_rate": 1.933214066157904e-05,
|
||
|
|
"loss": 0.5502,
|
||
|
|
"mean_token_accuracy": 0.8174868933856487,
|
||
|
|
"num_tokens": 53370750.0,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.462005615234375,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"grad_norm": 0.9651094359764429,
|
||
|
|
"learning_rate": 1.930033973318164e-05,
|
||
|
|
"loss": 0.5415,
|
||
|
|
"mean_token_accuracy": 0.821011008694768,
|
||
|
|
"num_tokens": 54235189.0,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4658966064453125,
|
||
|
|
"epoch": 0.5079365079365079,
|
||
|
|
"grad_norm": 1.0776190917138506,
|
||
|
|
"learning_rate": 1.926782659671484e-05,
|
||
|
|
"loss": 0.5378,
|
||
|
|
"mean_token_accuracy": 0.8214049334637821,
|
||
|
|
"num_tokens": 55066936.0,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4623565673828125,
|
||
|
|
"epoch": 0.5158730158730159,
|
||
|
|
"grad_norm": 1.0551561314848272,
|
||
|
|
"learning_rate": 1.9234603741992864e-05,
|
||
|
|
"loss": 0.5399,
|
||
|
|
"mean_token_accuracy": 0.8217946467921138,
|
||
|
|
"num_tokens": 55922405.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46429443359375,
|
||
|
|
"epoch": 0.5238095238095238,
|
||
|
|
"grad_norm": 1.061935522396689,
|
||
|
|
"learning_rate": 1.9200673713179245e-05,
|
||
|
|
"loss": 0.5368,
|
||
|
|
"mean_token_accuracy": 0.8207846856676042,
|
||
|
|
"num_tokens": 56770275.0,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.461700439453125,
|
||
|
|
"epoch": 0.5317460317460317,
|
||
|
|
"grad_norm": 1.1927343816090568,
|
||
|
|
"learning_rate": 1.9166039108592008e-05,
|
||
|
|
"loss": 0.5454,
|
||
|
|
"mean_token_accuracy": 0.8190617277286947,
|
||
|
|
"num_tokens": 57627870.0,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4640045166015625,
|
||
|
|
"epoch": 0.5396825396825397,
|
||
|
|
"grad_norm": 0.932865808441491,
|
||
|
|
"learning_rate": 1.9130702580504678e-05,
|
||
|
|
"loss": 0.5327,
|
||
|
|
"mean_token_accuracy": 0.8246302427724004,
|
||
|
|
"num_tokens": 58469884.0,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4622344970703125,
|
||
|
|
"epoch": 0.5476190476190477,
|
||
|
|
"grad_norm": 1.0722808809130155,
|
||
|
|
"learning_rate": 1.9094666834943177e-05,
|
||
|
|
"loss": 0.5312,
|
||
|
|
"mean_token_accuracy": 0.8219157354906201,
|
||
|
|
"num_tokens": 59323796.0,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4629058837890625,
|
||
|
|
"epoch": 0.5555555555555556,
|
||
|
|
"grad_norm": 1.0019623771507054,
|
||
|
|
"learning_rate": 1.9057934631478616e-05,
|
||
|
|
"loss": 0.5228,
|
||
|
|
"mean_token_accuracy": 0.8249138863757253,
|
||
|
|
"num_tokens": 60183841.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4584503173828125,
|
||
|
|
"epoch": 0.5634920634920635,
|
||
|
|
"grad_norm": 0.9985559966173079,
|
||
|
|
"learning_rate": 1.9020508783015942e-05,
|
||
|
|
"loss": 0.5351,
|
||
|
|
"mean_token_accuracy": 0.8224714086391032,
|
||
|
|
"num_tokens": 61048601.0,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4614410400390625,
|
||
|
|
"epoch": 0.5714285714285714,
|
||
|
|
"grad_norm": 0.9030749792761511,
|
||
|
|
"learning_rate": 1.898239215557856e-05,
|
||
|
|
"loss": 0.5361,
|
||
|
|
"mean_token_accuracy": 0.8228761674836278,
|
||
|
|
"num_tokens": 61887912.0,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4592132568359375,
|
||
|
|
"epoch": 0.5793650793650794,
|
||
|
|
"grad_norm": 1.0707804587432073,
|
||
|
|
"learning_rate": 1.894358766808883e-05,
|
||
|
|
"loss": 0.5324,
|
||
|
|
"mean_token_accuracy": 0.8242231444455683,
|
||
|
|
"num_tokens": 62741342.0,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4568939208984375,
|
||
|
|
"epoch": 0.5873015873015873,
|
||
|
|
"grad_norm": 0.9716132865694278,
|
||
|
|
"learning_rate": 1.8904098292144556e-05,
|
||
|
|
"loss": 0.5288,
|
||
|
|
"mean_token_accuracy": 0.8249963694252074,
|
||
|
|
"num_tokens": 63594617.0,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4536590576171875,
|
||
|
|
"epoch": 0.5952380952380952,
|
||
|
|
"grad_norm": 0.9051567770653394,
|
||
|
|
"learning_rate": 1.8863927051791418e-05,
|
||
|
|
"loss": 0.5255,
|
||
|
|
"mean_token_accuracy": 0.8258578674867749,
|
||
|
|
"num_tokens": 64467695.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45355224609375,
|
||
|
|
"epoch": 0.6031746031746031,
|
||
|
|
"grad_norm": 0.9565155259831919,
|
||
|
|
"learning_rate": 1.88230770232914e-05,
|
||
|
|
"loss": 0.5282,
|
||
|
|
"mean_token_accuracy": 0.8233450087718666,
|
||
|
|
"num_tokens": 65333788.0,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4597625732421875,
|
||
|
|
"epoch": 0.6111111111111112,
|
||
|
|
"grad_norm": 0.9745677952807633,
|
||
|
|
"learning_rate": 1.8781551334887204e-05,
|
||
|
|
"loss": 0.5202,
|
||
|
|
"mean_token_accuracy": 0.8260575924068689,
|
||
|
|
"num_tokens": 66178918.0,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45465087890625,
|
||
|
|
"epoch": 0.6190476190476191,
|
||
|
|
"grad_norm": 0.893135579624428,
|
||
|
|
"learning_rate": 1.87393531665627e-05,
|
||
|
|
"loss": 0.5315,
|
||
|
|
"mean_token_accuracy": 0.8249422176741064,
|
||
|
|
"num_tokens": 67052342.0,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4491424560546875,
|
||
|
|
"epoch": 0.626984126984127,
|
||
|
|
"grad_norm": 0.888144638384342,
|
||
|
|
"learning_rate": 1.869648574979942e-05,
|
||
|
|
"loss": 0.5236,
|
||
|
|
"mean_token_accuracy": 0.8263327423483133,
|
||
|
|
"num_tokens": 67913391.0,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.451751708984375,
|
||
|
|
"epoch": 0.6349206349206349,
|
||
|
|
"grad_norm": 0.9234771356349594,
|
||
|
|
"learning_rate": 1.865295236732907e-05,
|
||
|
|
"loss": 0.5229,
|
||
|
|
"mean_token_accuracy": 0.8255810001865029,
|
||
|
|
"num_tokens": 68772115.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4463348388671875,
|
||
|
|
"epoch": 0.6428571428571429,
|
||
|
|
"grad_norm": 1.0435810443945315,
|
||
|
|
"learning_rate": 1.8608756352882152e-05,
|
||
|
|
"loss": 0.5137,
|
||
|
|
"mean_token_accuracy": 0.828637046739459,
|
||
|
|
"num_tokens": 69611653.0,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4492645263671875,
|
||
|
|
"epoch": 0.6507936507936508,
|
||
|
|
"grad_norm": 1.050967823837876,
|
||
|
|
"learning_rate": 1.8563901090932673e-05,
|
||
|
|
"loss": 0.5331,
|
||
|
|
"mean_token_accuracy": 0.8240101523697376,
|
||
|
|
"num_tokens": 70496952.0,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4458160400390625,
|
||
|
|
"epoch": 0.6587301587301587,
|
||
|
|
"grad_norm": 0.9615124983710195,
|
||
|
|
"learning_rate": 1.851839001643898e-05,
|
||
|
|
"loss": 0.5149,
|
||
|
|
"mean_token_accuracy": 0.8283301163464785,
|
||
|
|
"num_tokens": 71343921.0,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4455108642578125,
|
||
|
|
"epoch": 0.6666666666666666,
|
||
|
|
"grad_norm": 0.9638388209458152,
|
||
|
|
"learning_rate": 1.847222661458069e-05,
|
||
|
|
"loss": 0.5292,
|
||
|
|
"mean_token_accuracy": 0.8246422847732902,
|
||
|
|
"num_tokens": 72240608.0,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.445037841796875,
|
||
|
|
"epoch": 0.6746031746031746,
|
||
|
|
"grad_norm": 1.1463109632434823,
|
||
|
|
"learning_rate": 1.8425414420491817e-05,
|
||
|
|
"loss": 0.5176,
|
||
|
|
"mean_token_accuracy": 0.826205097604543,
|
||
|
|
"num_tokens": 73118452.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4449615478515625,
|
||
|
|
"epoch": 0.6825396825396826,
|
||
|
|
"grad_norm": 0.9645173292489025,
|
||
|
|
"learning_rate": 1.8377957018990043e-05,
|
||
|
|
"loss": 0.522,
|
||
|
|
"mean_token_accuracy": 0.8236676808446646,
|
||
|
|
"num_tokens": 73991069.0,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4539337158203125,
|
||
|
|
"epoch": 0.6904761904761905,
|
||
|
|
"grad_norm": 1.0557439048120705,
|
||
|
|
"learning_rate": 1.8329858044302212e-05,
|
||
|
|
"loss": 0.5143,
|
||
|
|
"mean_token_accuracy": 0.8274678424932063,
|
||
|
|
"num_tokens": 74839901.0,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4486083984375,
|
||
|
|
"epoch": 0.6984126984126984,
|
||
|
|
"grad_norm": 0.9805168214479623,
|
||
|
|
"learning_rate": 1.8281121179786024e-05,
|
||
|
|
"loss": 0.5255,
|
||
|
|
"mean_token_accuracy": 0.8258101856335998,
|
||
|
|
"num_tokens": 75749725.0,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4547576904296875,
|
||
|
|
"epoch": 0.7063492063492064,
|
||
|
|
"grad_norm": 1.0335119460745013,
|
||
|
|
"learning_rate": 1.823175015764795e-05,
|
||
|
|
"loss": 0.52,
|
||
|
|
"mean_token_accuracy": 0.8265441199764609,
|
||
|
|
"num_tokens": 76593690.0,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4552154541015625,
|
||
|
|
"epoch": 0.7142857142857143,
|
||
|
|
"grad_norm": 1.0221663487735775,
|
||
|
|
"learning_rate": 1.818174875865744e-05,
|
||
|
|
"loss": 0.5127,
|
||
|
|
"mean_token_accuracy": 0.8279964146204293,
|
||
|
|
"num_tokens": 77431030.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4508514404296875,
|
||
|
|
"epoch": 0.7222222222222222,
|
||
|
|
"grad_norm": 1.0129218088142515,
|
||
|
|
"learning_rate": 1.8131120811857398e-05,
|
||
|
|
"loss": 0.5138,
|
||
|
|
"mean_token_accuracy": 0.827417416498065,
|
||
|
|
"num_tokens": 78278605.0,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4442596435546875,
|
||
|
|
"epoch": 0.7301587301587301,
|
||
|
|
"grad_norm": 0.862267079570144,
|
||
|
|
"learning_rate": 1.8079870194270958e-05,
|
||
|
|
"loss": 0.5086,
|
||
|
|
"mean_token_accuracy": 0.8285580319352448,
|
||
|
|
"num_tokens": 79154216.0,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4412384033203125,
|
||
|
|
"epoch": 0.7380952380952381,
|
||
|
|
"grad_norm": 1.0290357398317191,
|
||
|
|
"learning_rate": 1.802800083060457e-05,
|
||
|
|
"loss": 0.5196,
|
||
|
|
"mean_token_accuracy": 0.8259601076133549,
|
||
|
|
"num_tokens": 80039204.0,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.440216064453125,
|
||
|
|
"epoch": 0.746031746031746,
|
||
|
|
"grad_norm": 0.9200513017864748,
|
||
|
|
"learning_rate": 1.7975516692947478e-05,
|
||
|
|
"loss": 0.5122,
|
||
|
|
"mean_token_accuracy": 0.8288197009824216,
|
||
|
|
"num_tokens": 80910348.0,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441497802734375,
|
||
|
|
"epoch": 0.753968253968254,
|
||
|
|
"grad_norm": 0.9187678159469537,
|
||
|
|
"learning_rate": 1.7922421800467515e-05,
|
||
|
|
"loss": 0.51,
|
||
|
|
"mean_token_accuracy": 0.8292136248201132,
|
||
|
|
"num_tokens": 81765325.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448394775390625,
|
||
|
|
"epoch": 0.7619047619047619,
|
||
|
|
"grad_norm": 0.9243368764216906,
|
||
|
|
"learning_rate": 1.7868720219103343e-05,
|
||
|
|
"loss": 0.5021,
|
||
|
|
"mean_token_accuracy": 0.830197315197438,
|
||
|
|
"num_tokens": 82611125.0,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4393768310546875,
|
||
|
|
"epoch": 0.7698412698412699,
|
||
|
|
"grad_norm": 0.9416277689893134,
|
||
|
|
"learning_rate": 1.7814416061253076e-05,
|
||
|
|
"loss": 0.5129,
|
||
|
|
"mean_token_accuracy": 0.8264825385995209,
|
||
|
|
"num_tokens": 83488021.0,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4447784423828125,
|
||
|
|
"epoch": 0.7777777777777778,
|
||
|
|
"grad_norm": 0.9592607594441928,
|
||
|
|
"learning_rate": 1.7759513485459367e-05,
|
||
|
|
"loss": 0.5082,
|
||
|
|
"mean_token_accuracy": 0.828405749052763,
|
||
|
|
"num_tokens": 84321775.0,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4433746337890625,
|
||
|
|
"epoch": 0.7857142857142857,
|
||
|
|
"grad_norm": 1.0051516989789744,
|
||
|
|
"learning_rate": 1.7704016696090936e-05,
|
||
|
|
"loss": 0.4961,
|
||
|
|
"mean_token_accuracy": 0.8327284948900342,
|
||
|
|
"num_tokens": 85167087.0,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44500732421875,
|
||
|
|
"epoch": 0.7936507936507936,
|
||
|
|
"grad_norm": 0.9104747464714449,
|
||
|
|
"learning_rate": 1.7647929943020625e-05,
|
||
|
|
"loss": 0.5084,
|
||
|
|
"mean_token_accuracy": 0.8306850432418287,
|
||
|
|
"num_tokens": 86009383.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441436767578125,
|
||
|
|
"epoch": 0.8015873015873016,
|
||
|
|
"grad_norm": 0.9773768746481235,
|
||
|
|
"learning_rate": 1.759125752129993e-05,
|
||
|
|
"loss": 0.5015,
|
||
|
|
"mean_token_accuracy": 0.8314397023059428,
|
||
|
|
"num_tokens": 86862628.0,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4582061767578125,
|
||
|
|
"epoch": 0.8095238095238095,
|
||
|
|
"grad_norm": 1.059657795313606,
|
||
|
|
"learning_rate": 1.753400377083011e-05,
|
||
|
|
"loss": 0.505,
|
||
|
|
"mean_token_accuracy": 0.8305098316632211,
|
||
|
|
"num_tokens": 87713395.0,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5252227783203125,
|
||
|
|
"epoch": 0.8174603174603174,
|
||
|
|
"grad_norm": 0.9398994877728049,
|
||
|
|
"learning_rate": 1.747617307602982e-05,
|
||
|
|
"loss": 0.5165,
|
||
|
|
"mean_token_accuracy": 0.8260967722162604,
|
||
|
|
"num_tokens": 88602545.0,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.508758544921875,
|
||
|
|
"epoch": 0.8253968253968254,
|
||
|
|
"grad_norm": 0.9520709865588342,
|
||
|
|
"learning_rate": 1.741776986549938e-05,
|
||
|
|
"loss": 0.5006,
|
||
|
|
"mean_token_accuracy": 0.8329666894860566,
|
||
|
|
"num_tokens": 89444255.0,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.482879638671875,
|
||
|
|
"epoch": 0.8333333333333334,
|
||
|
|
"grad_norm": 0.8345183511029048,
|
||
|
|
"learning_rate": 1.735879861168163e-05,
|
||
|
|
"loss": 0.4944,
|
||
|
|
"mean_token_accuracy": 0.8349198163487017,
|
||
|
|
"num_tokens": 90312774.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4606170654296875,
|
||
|
|
"epoch": 0.8412698412698413,
|
||
|
|
"grad_norm": 0.8850629942716965,
|
||
|
|
"learning_rate": 1.729926383051943e-05,
|
||
|
|
"loss": 0.4939,
|
||
|
|
"mean_token_accuracy": 0.832167761400342,
|
||
|
|
"num_tokens": 91177447.0,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.443572998046875,
|
||
|
|
"epoch": 0.8492063492063492,
|
||
|
|
"grad_norm": 0.7755107549812108,
|
||
|
|
"learning_rate": 1.723917008110984e-05,
|
||
|
|
"loss": 0.4936,
|
||
|
|
"mean_token_accuracy": 0.8328760690055788,
|
||
|
|
"num_tokens": 92026164.0,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4337921142578125,
|
||
|
|
"epoch": 0.8571428571428571,
|
||
|
|
"grad_norm": 0.9590789688031214,
|
||
|
|
"learning_rate": 1.7178521965354992e-05,
|
||
|
|
"loss": 0.4946,
|
||
|
|
"mean_token_accuracy": 0.8331891498528421,
|
||
|
|
"num_tokens": 92891631.0,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4388275146484375,
|
||
|
|
"epoch": 0.8650793650793651,
|
||
|
|
"grad_norm": 0.7656212035221501,
|
||
|
|
"learning_rate": 1.7117324127609686e-05,
|
||
|
|
"loss": 0.4979,
|
||
|
|
"mean_token_accuracy": 0.8320917426608503,
|
||
|
|
"num_tokens": 93760535.0,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.436920166015625,
|
||
|
|
"epoch": 0.873015873015873,
|
||
|
|
"grad_norm": 0.8567777469026723,
|
||
|
|
"learning_rate": 1.7055581254325716e-05,
|
||
|
|
"loss": 0.4953,
|
||
|
|
"mean_token_accuracy": 0.832193429581821,
|
||
|
|
"num_tokens": 94599260.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4438934326171875,
|
||
|
|
"epoch": 0.8809523809523809,
|
||
|
|
"grad_norm": 0.8601918515641613,
|
||
|
|
"learning_rate": 1.6993298073693005e-05,
|
||
|
|
"loss": 0.4935,
|
||
|
|
"mean_token_accuracy": 0.8328238227404654,
|
||
|
|
"num_tokens": 95425799.0,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4399871826171875,
|
||
|
|
"epoch": 0.8888888888888888,
|
||
|
|
"grad_norm": 0.8064705741039472,
|
||
|
|
"learning_rate": 1.693047935527751e-05,
|
||
|
|
"loss": 0.4988,
|
||
|
|
"mean_token_accuracy": 0.833238854072988,
|
||
|
|
"num_tokens": 96260271.0,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4324188232421875,
|
||
|
|
"epoch": 0.8968253968253969,
|
||
|
|
"grad_norm": 0.8781501824333462,
|
||
|
|
"learning_rate": 1.6867129909656e-05,
|
||
|
|
"loss": 0.4972,
|
||
|
|
"mean_token_accuracy": 0.8320957766845822,
|
||
|
|
"num_tokens": 97135925.0,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43426513671875,
|
||
|
|
"epoch": 0.9047619047619048,
|
||
|
|
"grad_norm": 0.8118228734667666,
|
||
|
|
"learning_rate": 1.680325458804763e-05,
|
||
|
|
"loss": 0.4989,
|
||
|
|
"mean_token_accuracy": 0.8331179022789001,
|
||
|
|
"num_tokens": 98011108.0,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4383697509765625,
|
||
|
|
"epoch": 0.9126984126984127,
|
||
|
|
"grad_norm": 0.8371447521121621,
|
||
|
|
"learning_rate": 1.6738858281942477e-05,
|
||
|
|
"loss": 0.4878,
|
||
|
|
"mean_token_accuracy": 0.8329889746382833,
|
||
|
|
"num_tokens": 98873029.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437957763671875,
|
||
|
|
"epoch": 0.9206349206349206,
|
||
|
|
"grad_norm": 0.9317880439623095,
|
||
|
|
"learning_rate": 1.6673945922726945e-05,
|
||
|
|
"loss": 0.4956,
|
||
|
|
"mean_token_accuracy": 0.8307403367944062,
|
||
|
|
"num_tokens": 99734864.0,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437530517578125,
|
||
|
|
"epoch": 0.9285714285714286,
|
||
|
|
"grad_norm": 0.8078951251086017,
|
||
|
|
"learning_rate": 1.660852248130611e-05,
|
||
|
|
"loss": 0.4987,
|
||
|
|
"mean_token_accuracy": 0.8325757388956845,
|
||
|
|
"num_tokens": 100606926.0,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441864013671875,
|
||
|
|
"epoch": 0.9365079365079365,
|
||
|
|
"grad_norm": 0.8703434683361917,
|
||
|
|
"learning_rate": 1.6542592967723065e-05,
|
||
|
|
"loss": 0.4862,
|
||
|
|
"mean_token_accuracy": 0.8341412721201777,
|
||
|
|
"num_tokens": 101447599.0,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4399871826171875,
|
||
|
|
"epoch": 0.9444444444444444,
|
||
|
|
"grad_norm": 0.8653941545842383,
|
||
|
|
"learning_rate": 1.6476162430775278e-05,
|
||
|
|
"loss": 0.4875,
|
||
|
|
"mean_token_accuracy": 0.8343443763442338,
|
||
|
|
"num_tokens": 102275358.0,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.443206787109375,
|
||
|
|
"epoch": 0.9523809523809523,
|
||
|
|
"grad_norm": 0.8489052672431998,
|
||
|
|
"learning_rate": 1.6409235957627926e-05,
|
||
|
|
"loss": 0.487,
|
||
|
|
"mean_token_accuracy": 0.833757430780679,
|
||
|
|
"num_tokens": 103113293.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4330596923828125,
|
||
|
|
"epoch": 0.9603174603174603,
|
||
|
|
"grad_norm": 0.8343529697637952,
|
||
|
|
"learning_rate": 1.6341818673424342e-05,
|
||
|
|
"loss": 0.4927,
|
||
|
|
"mean_token_accuracy": 0.8324310649186373,
|
||
|
|
"num_tokens": 104000550.0,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4350738525390625,
|
||
|
|
"epoch": 0.9682539682539683,
|
||
|
|
"grad_norm": 0.8358525938559462,
|
||
|
|
"learning_rate": 1.6273915740893557e-05,
|
||
|
|
"loss": 0.491,
|
||
|
|
"mean_token_accuracy": 0.8333931621164083,
|
||
|
|
"num_tokens": 104859286.0,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4355621337890625,
|
||
|
|
"epoch": 0.9761904761904762,
|
||
|
|
"grad_norm": 0.8709315219677664,
|
||
|
|
"learning_rate": 1.6205532359954905e-05,
|
||
|
|
"loss": 0.4911,
|
||
|
|
"mean_token_accuracy": 0.8330642161890864,
|
||
|
|
"num_tokens": 105729675.0,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430023193359375,
|
||
|
|
"epoch": 0.9841269841269841,
|
||
|
|
"grad_norm": 0.7686865926441706,
|
||
|
|
"learning_rate": 1.6136673767319853e-05,
|
||
|
|
"loss": 0.4874,
|
||
|
|
"mean_token_accuracy": 0.8338018441572785,
|
||
|
|
"num_tokens": 106603968.0,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426727294921875,
|
||
|
|
"epoch": 0.9920634920634921,
|
||
|
|
"grad_norm": 0.8197400248642781,
|
||
|
|
"learning_rate": 1.606734523609097e-05,
|
||
|
|
"loss": 0.4839,
|
||
|
|
"mean_token_accuracy": 0.8362096287310123,
|
||
|
|
"num_tokens": 107495007.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4326019287109375,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 0.7907079441928082,
|
||
|
|
"learning_rate": 1.5997552075358122e-05,
|
||
|
|
"loss": 0.4943,
|
||
|
|
"mean_token_accuracy": 0.8323847940191627,
|
||
|
|
"num_tokens": 108364335.0,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4468841552734375,
|
||
|
|
"epoch": 1.007936507936508,
|
||
|
|
"grad_norm": 0.8800472941103096,
|
||
|
|
"learning_rate": 1.592729962979189e-05,
|
||
|
|
"loss": 0.4628,
|
||
|
|
"mean_token_accuracy": 0.8423365484923124,
|
||
|
|
"num_tokens": 109202665.0,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43121337890625,
|
||
|
|
"epoch": 1.0158730158730158,
|
||
|
|
"grad_norm": 0.7866269381892753,
|
||
|
|
"learning_rate": 1.585659327923432e-05,
|
||
|
|
"loss": 0.4648,
|
||
|
|
"mean_token_accuracy": 0.8404850475490093,
|
||
|
|
"num_tokens": 110061605.0,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.432586669921875,
|
||
|
|
"epoch": 1.0238095238095237,
|
||
|
|
"grad_norm": 0.8561544665592976,
|
||
|
|
"learning_rate": 1.5785438438286892e-05,
|
||
|
|
"loss": 0.4597,
|
||
|
|
"mean_token_accuracy": 0.8419345654547215,
|
||
|
|
"num_tokens": 110924491.0,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430633544921875,
|
||
|
|
"epoch": 1.0317460317460316,
|
||
|
|
"grad_norm": 0.9000097472640374,
|
||
|
|
"learning_rate": 1.5713840555895937e-05,
|
||
|
|
"loss": 0.4624,
|
||
|
|
"mean_token_accuracy": 0.8415501727722585,
|
||
|
|
"num_tokens": 111773832.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42864990234375,
|
||
|
|
"epoch": 1.0396825396825398,
|
||
|
|
"grad_norm": 0.7502414887434086,
|
||
|
|
"learning_rate": 1.5641805114935297e-05,
|
||
|
|
"loss": 0.4542,
|
||
|
|
"mean_token_accuracy": 0.8432249454781413,
|
||
|
|
"num_tokens": 112637470.0,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42877197265625,
|
||
|
|
"epoch": 1.0476190476190477,
|
||
|
|
"grad_norm": 0.7541703719215876,
|
||
|
|
"learning_rate": 1.556933763178651e-05,
|
||
|
|
"loss": 0.4638,
|
||
|
|
"mean_token_accuracy": 0.8415564014576375,
|
||
|
|
"num_tokens": 113501590.0,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.433380126953125,
|
||
|
|
"epoch": 1.0555555555555556,
|
||
|
|
"grad_norm": 0.7325102765605829,
|
||
|
|
"learning_rate": 1.5496443655916348e-05,
|
||
|
|
"loss": 0.4594,
|
||
|
|
"mean_token_accuracy": 0.8436301471665502,
|
||
|
|
"num_tokens": 114360533.0,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4308319091796875,
|
||
|
|
"epoch": 1.0634920634920635,
|
||
|
|
"grad_norm": 0.7230120782824351,
|
||
|
|
"learning_rate": 1.5423128769451832e-05,
|
||
|
|
"loss": 0.4612,
|
||
|
|
"mean_token_accuracy": 0.8420308292843401,
|
||
|
|
"num_tokens": 115231953.0,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4346466064453125,
|
||
|
|
"epoch": 1.0714285714285714,
|
||
|
|
"grad_norm": 0.7168197905226968,
|
||
|
|
"learning_rate": 1.5349398586752794e-05,
|
||
|
|
"loss": 0.4577,
|
||
|
|
"mean_token_accuracy": 0.8419447150081396,
|
||
|
|
"num_tokens": 116092221.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43701171875,
|
||
|
|
"epoch": 1.0793650793650793,
|
||
|
|
"grad_norm": 0.7872377130062906,
|
||
|
|
"learning_rate": 1.52752587539819e-05,
|
||
|
|
"loss": 0.462,
|
||
|
|
"mean_token_accuracy": 0.8413771693594754,
|
||
|
|
"num_tokens": 116984739.0,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.435943603515625,
|
||
|
|
"epoch": 1.0873015873015872,
|
||
|
|
"grad_norm": 0.7254403268334357,
|
||
|
|
"learning_rate": 1.5200714948672313e-05,
|
||
|
|
"loss": 0.4626,
|
||
|
|
"mean_token_accuracy": 0.8416628721170127,
|
||
|
|
"num_tokens": 117852991.0,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4440155029296875,
|
||
|
|
"epoch": 1.0952380952380953,
|
||
|
|
"grad_norm": 0.8923472179704972,
|
||
|
|
"learning_rate": 1.512577287929288e-05,
|
||
|
|
"loss": 0.46,
|
||
|
|
"mean_token_accuracy": 0.8432498262263834,
|
||
|
|
"num_tokens": 118696927.0,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4384002685546875,
|
||
|
|
"epoch": 1.1031746031746033,
|
||
|
|
"grad_norm": 0.8036615162498064,
|
||
|
|
"learning_rate": 1.5050438284811001e-05,
|
||
|
|
"loss": 0.4608,
|
||
|
|
"mean_token_accuracy": 0.8414666503667831,
|
||
|
|
"num_tokens": 119569613.0,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43438720703125,
|
||
|
|
"epoch": 1.1111111111111112,
|
||
|
|
"grad_norm": 0.8378698989178901,
|
||
|
|
"learning_rate": 1.4974716934253146e-05,
|
||
|
|
"loss": 0.4615,
|
||
|
|
"mean_token_accuracy": 0.8419166500680149,
|
||
|
|
"num_tokens": 120447089.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4336395263671875,
|
||
|
|
"epoch": 1.119047619047619,
|
||
|
|
"grad_norm": 0.7339153921994472,
|
||
|
|
"learning_rate": 1.4898614626263066e-05,
|
||
|
|
"loss": 0.452,
|
||
|
|
"mean_token_accuracy": 0.8448847294785082,
|
||
|
|
"num_tokens": 121314886.0,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.436614990234375,
|
||
|
|
"epoch": 1.126984126984127,
|
||
|
|
"grad_norm": 0.7413352124459704,
|
||
|
|
"learning_rate": 1.4822137188657752e-05,
|
||
|
|
"loss": 0.4498,
|
||
|
|
"mean_token_accuracy": 0.8447657427750528,
|
||
|
|
"num_tokens": 122167617.0,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4323883056640625,
|
||
|
|
"epoch": 1.1349206349206349,
|
||
|
|
"grad_norm": 0.7606295937517913,
|
||
|
|
"learning_rate": 1.474529047798112e-05,
|
||
|
|
"loss": 0.4538,
|
||
|
|
"mean_token_accuracy": 0.8420953522436321,
|
||
|
|
"num_tokens": 123013840.0,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4267730712890625,
|
||
|
|
"epoch": 1.1428571428571428,
|
||
|
|
"grad_norm": 0.7759729162933741,
|
||
|
|
"learning_rate": 1.4668080379055563e-05,
|
||
|
|
"loss": 0.4466,
|
||
|
|
"mean_token_accuracy": 0.8450842797756195,
|
||
|
|
"num_tokens": 123876490.0,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4368438720703125,
|
||
|
|
"epoch": 1.1507936507936507,
|
||
|
|
"grad_norm": 0.7751718151560868,
|
||
|
|
"learning_rate": 1.4590512804531272e-05,
|
||
|
|
"loss": 0.4541,
|
||
|
|
"mean_token_accuracy": 0.8422016915865242,
|
||
|
|
"num_tokens": 124713314.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437255859375,
|
||
|
|
"epoch": 1.1587301587301586,
|
||
|
|
"grad_norm": 0.8833882223237038,
|
||
|
|
"learning_rate": 1.4512593694433455e-05,
|
||
|
|
"loss": 0.4641,
|
||
|
|
"mean_token_accuracy": 0.8396992119960487,
|
||
|
|
"num_tokens": 125564746.0,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4323883056640625,
|
||
|
|
"epoch": 1.1666666666666667,
|
||
|
|
"grad_norm": 0.7186599445695117,
|
||
|
|
"learning_rate": 1.4434329015707468e-05,
|
||
|
|
"loss": 0.447,
|
||
|
|
"mean_token_accuracy": 0.8458566442131996,
|
||
|
|
"num_tokens": 126415997.0,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4389801025390625,
|
||
|
|
"epoch": 1.1746031746031746,
|
||
|
|
"grad_norm": 0.7870343406013125,
|
||
|
|
"learning_rate": 1.435572476176187e-05,
|
||
|
|
"loss": 0.4611,
|
||
|
|
"mean_token_accuracy": 0.8420486990362406,
|
||
|
|
"num_tokens": 127285760.0,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.445037841796875,
|
||
|
|
"epoch": 1.1825396825396826,
|
||
|
|
"grad_norm": 0.7564486218479161,
|
||
|
|
"learning_rate": 1.427678695200945e-05,
|
||
|
|
"loss": 0.4633,
|
||
|
|
"mean_token_accuracy": 0.8416752209886909,
|
||
|
|
"num_tokens": 128153685.0,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4512786865234375,
|
||
|
|
"epoch": 1.1904761904761905,
|
||
|
|
"grad_norm": 0.7524650004648994,
|
||
|
|
"learning_rate": 1.4197521631406279e-05,
|
||
|
|
"loss": 0.4477,
|
||
|
|
"mean_token_accuracy": 0.8451099991798401,
|
||
|
|
"num_tokens": 128990738.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4539337158203125,
|
||
|
|
"epoch": 1.1984126984126984,
|
||
|
|
"grad_norm": 0.8270324190177804,
|
||
|
|
"learning_rate": 1.4117934869988776e-05,
|
||
|
|
"loss": 0.4602,
|
||
|
|
"mean_token_accuracy": 0.8432164471596479,
|
||
|
|
"num_tokens": 129848900.0,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4565582275390625,
|
||
|
|
"epoch": 1.2063492063492063,
|
||
|
|
"grad_norm": 0.7668134474607962,
|
||
|
|
"learning_rate": 1.4038032762408897e-05,
|
||
|
|
"loss": 0.4588,
|
||
|
|
"mean_token_accuracy": 0.8423843700438738,
|
||
|
|
"num_tokens": 130724709.0,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.457763671875,
|
||
|
|
"epoch": 1.2142857142857142,
|
||
|
|
"grad_norm": 0.7234771900704076,
|
||
|
|
"learning_rate": 1.3957821427467392e-05,
|
||
|
|
"loss": 0.4448,
|
||
|
|
"mean_token_accuracy": 0.8460949282161891,
|
||
|
|
"num_tokens": 131582811.0,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4557952880859375,
|
||
|
|
"epoch": 1.2222222222222223,
|
||
|
|
"grad_norm": 0.7569275321904251,
|
||
|
|
"learning_rate": 1.3877307007645256e-05,
|
||
|
|
"loss": 0.4521,
|
||
|
|
"mean_token_accuracy": 0.8441468216478825,
|
||
|
|
"num_tokens": 132429743.0,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4557342529296875,
|
||
|
|
"epoch": 1.2301587301587302,
|
||
|
|
"grad_norm": 0.7661971933972097,
|
||
|
|
"learning_rate": 1.3796495668633325e-05,
|
||
|
|
"loss": 0.455,
|
||
|
|
"mean_token_accuracy": 0.8435652130283415,
|
||
|
|
"num_tokens": 133291943.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4580535888671875,
|
||
|
|
"epoch": 1.2380952380952381,
|
||
|
|
"grad_norm": 0.7878876156560011,
|
||
|
|
"learning_rate": 1.3715393598860129e-05,
|
||
|
|
"loss": 0.4515,
|
||
|
|
"mean_token_accuracy": 0.8445776179432869,
|
||
|
|
"num_tokens": 134149814.0,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45654296875,
|
||
|
|
"epoch": 1.246031746031746,
|
||
|
|
"grad_norm": 0.7496215665379915,
|
||
|
|
"learning_rate": 1.3634007009017986e-05,
|
||
|
|
"loss": 0.4435,
|
||
|
|
"mean_token_accuracy": 0.8470091614872217,
|
||
|
|
"num_tokens": 134989406.0,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.449371337890625,
|
||
|
|
"epoch": 1.253968253968254,
|
||
|
|
"grad_norm": 0.7747908901130726,
|
||
|
|
"learning_rate": 1.3552342131587399e-05,
|
||
|
|
"loss": 0.4398,
|
||
|
|
"mean_token_accuracy": 0.8483070912770927,
|
||
|
|
"num_tokens": 135832642.0,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44940185546875,
|
||
|
|
"epoch": 1.2619047619047619,
|
||
|
|
"grad_norm": 0.8393013667899509,
|
||
|
|
"learning_rate": 1.3470405220359773e-05,
|
||
|
|
"loss": 0.4493,
|
||
|
|
"mean_token_accuracy": 0.8453219896182418,
|
||
|
|
"num_tokens": 136724748.0,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4533843994140625,
|
||
|
|
"epoch": 1.2698412698412698,
|
||
|
|
"grad_norm": 0.736903810428085,
|
||
|
|
"learning_rate": 1.3388202549958507e-05,
|
||
|
|
"loss": 0.4487,
|
||
|
|
"mean_token_accuracy": 0.8448820817284286,
|
||
|
|
"num_tokens": 137570382.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.451629638671875,
|
||
|
|
"epoch": 1.2777777777777777,
|
||
|
|
"grad_norm": 0.7268998948956484,
|
||
|
|
"learning_rate": 1.3305740415358506e-05,
|
||
|
|
"loss": 0.4511,
|
||
|
|
"mean_token_accuracy": 0.8454865459352732,
|
||
|
|
"num_tokens": 138431194.0,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44989013671875,
|
||
|
|
"epoch": 1.2857142857142856,
|
||
|
|
"grad_norm": 0.7914645129000742,
|
||
|
|
"learning_rate": 1.3223025131404106e-05,
|
||
|
|
"loss": 0.4367,
|
||
|
|
"mean_token_accuracy": 0.8472912893630564,
|
||
|
|
"num_tokens": 139287890.0,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4466400146484375,
|
||
|
|
"epoch": 1.2936507936507937,
|
||
|
|
"grad_norm": 0.6999054520073498,
|
||
|
|
"learning_rate": 1.3140063032325491e-05,
|
||
|
|
"loss": 0.4509,
|
||
|
|
"mean_token_accuracy": 0.8445514859631658,
|
||
|
|
"num_tokens": 140160179.0,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44244384765625,
|
||
|
|
"epoch": 1.3015873015873016,
|
||
|
|
"grad_norm": 0.7926199960561141,
|
||
|
|
"learning_rate": 1.3056860471253639e-05,
|
||
|
|
"loss": 0.4512,
|
||
|
|
"mean_token_accuracy": 0.8441420421004295,
|
||
|
|
"num_tokens": 141002875.0,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4391632080078125,
|
||
|
|
"epoch": 1.3095238095238095,
|
||
|
|
"grad_norm": 0.7155247497693717,
|
||
|
|
"learning_rate": 1.297342381973379e-05,
|
||
|
|
"loss": 0.4405,
|
||
|
|
"mean_token_accuracy": 0.8467378858476877,
|
||
|
|
"num_tokens": 141858286.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442840576171875,
|
||
|
|
"epoch": 1.3174603174603174,
|
||
|
|
"grad_norm": 0.7604417217639244,
|
||
|
|
"learning_rate": 1.2889759467237532e-05,
|
||
|
|
"loss": 0.4424,
|
||
|
|
"mean_token_accuracy": 0.8471214440651238,
|
||
|
|
"num_tokens": 142698339.0,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44219970703125,
|
||
|
|
"epoch": 1.3253968253968254,
|
||
|
|
"grad_norm": 0.7842027331020274,
|
||
|
|
"learning_rate": 1.2805873820673509e-05,
|
||
|
|
"loss": 0.443,
|
||
|
|
"mean_token_accuracy": 0.84578693844378,
|
||
|
|
"num_tokens": 143561112.0,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4424896240234375,
|
||
|
|
"epoch": 1.3333333333333333,
|
||
|
|
"grad_norm": 0.6815401406283373,
|
||
|
|
"learning_rate": 1.2721773303896765e-05,
|
||
|
|
"loss": 0.4451,
|
||
|
|
"mean_token_accuracy": 0.8464445443823934,
|
||
|
|
"num_tokens": 144390223.0,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.440338134765625,
|
||
|
|
"epoch": 1.3412698412698414,
|
||
|
|
"grad_norm": 0.6935995098018823,
|
||
|
|
"learning_rate": 1.2637464357216847e-05,
|
||
|
|
"loss": 0.4565,
|
||
|
|
"mean_token_accuracy": 0.843538910150528,
|
||
|
|
"num_tokens": 145276276.0,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4456939697265625,
|
||
|
|
"epoch": 1.3492063492063493,
|
||
|
|
"grad_norm": 0.7555715111365282,
|
||
|
|
"learning_rate": 1.2552953436904578e-05,
|
||
|
|
"loss": 0.4464,
|
||
|
|
"mean_token_accuracy": 0.8468158571049571,
|
||
|
|
"num_tokens": 146148957.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45037841796875,
|
||
|
|
"epoch": 1.3571428571428572,
|
||
|
|
"grad_norm": 0.6578938329224823,
|
||
|
|
"learning_rate": 1.246824701469768e-05,
|
||
|
|
"loss": 0.4444,
|
||
|
|
"mean_token_accuracy": 0.8462554500438273,
|
||
|
|
"num_tokens": 146999892.0,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4438934326171875,
|
||
|
|
"epoch": 1.3650793650793651,
|
||
|
|
"grad_norm": 0.7952540493885925,
|
||
|
|
"learning_rate": 1.2383351577305148e-05,
|
||
|
|
"loss": 0.446,
|
||
|
|
"mean_token_accuracy": 0.845737649127841,
|
||
|
|
"num_tokens": 147888947.0,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.450836181640625,
|
||
|
|
"epoch": 1.373015873015873,
|
||
|
|
"grad_norm": 0.7074173689683401,
|
||
|
|
"learning_rate": 1.2298273625910512e-05,
|
||
|
|
"loss": 0.4568,
|
||
|
|
"mean_token_accuracy": 0.8422739477828145,
|
||
|
|
"num_tokens": 148771994.0,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4434661865234375,
|
||
|
|
"epoch": 1.380952380952381,
|
||
|
|
"grad_norm": 0.6710031250840192,
|
||
|
|
"learning_rate": 1.2213019675674008e-05,
|
||
|
|
"loss": 0.4433,
|
||
|
|
"mean_token_accuracy": 0.8467194638215005,
|
||
|
|
"num_tokens": 149626353.0,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4410858154296875,
|
||
|
|
"epoch": 1.3888888888888888,
|
||
|
|
"grad_norm": 0.6457827179824793,
|
||
|
|
"learning_rate": 1.2127596255233622e-05,
|
||
|
|
"loss": 0.4379,
|
||
|
|
"mean_token_accuracy": 0.8480355520732701,
|
||
|
|
"num_tokens": 150484433.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4429931640625,
|
||
|
|
"epoch": 1.3968253968253967,
|
||
|
|
"grad_norm": 0.6573907703715658,
|
||
|
|
"learning_rate": 1.2042009906205152e-05,
|
||
|
|
"loss": 0.4433,
|
||
|
|
"mean_token_accuracy": 0.8479111595079303,
|
||
|
|
"num_tokens": 151351171.0,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4395904541015625,
|
||
|
|
"epoch": 1.4047619047619047,
|
||
|
|
"grad_norm": 0.7949371909592465,
|
||
|
|
"learning_rate": 1.1956267182681265e-05,
|
||
|
|
"loss": 0.4504,
|
||
|
|
"mean_token_accuracy": 0.8442064803093672,
|
||
|
|
"num_tokens": 152198704.0,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442718505859375,
|
||
|
|
"epoch": 1.4126984126984126,
|
||
|
|
"grad_norm": 0.678353055772471,
|
||
|
|
"learning_rate": 1.1870374650729582e-05,
|
||
|
|
"loss": 0.4433,
|
||
|
|
"mean_token_accuracy": 0.8477690340951085,
|
||
|
|
"num_tokens": 153027562.0,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4433441162109375,
|
||
|
|
"epoch": 1.4206349206349207,
|
||
|
|
"grad_norm": 0.6780439570882703,
|
||
|
|
"learning_rate": 1.1784338887889858e-05,
|
||
|
|
"loss": 0.4385,
|
||
|
|
"mean_token_accuracy": 0.8462753728963435,
|
||
|
|
"num_tokens": 153863890.0,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442413330078125,
|
||
|
|
"epoch": 1.4285714285714286,
|
||
|
|
"grad_norm": 0.6587673965640602,
|
||
|
|
"learning_rate": 1.1698166482670293e-05,
|
||
|
|
"loss": 0.446,
|
||
|
|
"mean_token_accuracy": 0.8457558886148036,
|
||
|
|
"num_tokens": 154707913.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4350433349609375,
|
||
|
|
"epoch": 1.4365079365079365,
|
||
|
|
"grad_norm": 0.6816545482512325,
|
||
|
|
"learning_rate": 1.1611864034042972e-05,
|
||
|
|
"loss": 0.4438,
|
||
|
|
"mean_token_accuracy": 0.8467250894755125,
|
||
|
|
"num_tokens": 155590050.0,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4351959228515625,
|
||
|
|
"epoch": 1.4444444444444444,
|
||
|
|
"grad_norm": 0.6868757464599637,
|
||
|
|
"learning_rate": 1.1525438150938554e-05,
|
||
|
|
"loss": 0.4401,
|
||
|
|
"mean_token_accuracy": 0.84777757152915,
|
||
|
|
"num_tokens": 156449879.0,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4363861083984375,
|
||
|
|
"epoch": 1.4523809523809523,
|
||
|
|
"grad_norm": 0.6766249529263749,
|
||
|
|
"learning_rate": 1.1438895451740141e-05,
|
||
|
|
"loss": 0.4292,
|
||
|
|
"mean_token_accuracy": 0.8506109705194831,
|
||
|
|
"num_tokens": 157304143.0,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4354400634765625,
|
||
|
|
"epoch": 1.4603174603174602,
|
||
|
|
"grad_norm": 0.6379506488261414,
|
||
|
|
"learning_rate": 1.135224256377646e-05,
|
||
|
|
"loss": 0.4459,
|
||
|
|
"mean_token_accuracy": 0.8441523900255561,
|
||
|
|
"num_tokens": 158177988.0,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.431732177734375,
|
||
|
|
"epoch": 1.4682539682539684,
|
||
|
|
"grad_norm": 0.6506443662541825,
|
||
|
|
"learning_rate": 1.1265486122814359e-05,
|
||
|
|
"loss": 0.4468,
|
||
|
|
"mean_token_accuracy": 0.845328216906637,
|
||
|
|
"num_tokens": 159060066.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4365386962890625,
|
||
|
|
"epoch": 1.4761904761904763,
|
||
|
|
"grad_norm": 0.6750756914615313,
|
||
|
|
"learning_rate": 1.1178632772550636e-05,
|
||
|
|
"loss": 0.4416,
|
||
|
|
"mean_token_accuracy": 0.846350169274956,
|
||
|
|
"num_tokens": 159942986.0,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.438690185546875,
|
||
|
|
"epoch": 1.4841269841269842,
|
||
|
|
"grad_norm": 0.5998682565802522,
|
||
|
|
"learning_rate": 1.1091689164103281e-05,
|
||
|
|
"loss": 0.4338,
|
||
|
|
"mean_token_accuracy": 0.8496084534563124,
|
||
|
|
"num_tokens": 160782816.0,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4376220703125,
|
||
|
|
"epoch": 1.492063492063492,
|
||
|
|
"grad_norm": 0.7017393839562811,
|
||
|
|
"learning_rate": 1.1004661955502143e-05,
|
||
|
|
"loss": 0.4369,
|
||
|
|
"mean_token_accuracy": 0.8506594416685402,
|
||
|
|
"num_tokens": 161643512.0,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43133544921875,
|
||
|
|
"epoch": 1.5,
|
||
|
|
"grad_norm": 0.6526274910477081,
|
||
|
|
"learning_rate": 1.0917557811179057e-05,
|
||
|
|
"loss": 0.4308,
|
||
|
|
"mean_token_accuracy": 0.849564865231514,
|
||
|
|
"num_tokens": 162503001.0,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4289703369140625,
|
||
|
|
"epoch": 1.507936507936508,
|
||
|
|
"grad_norm": 0.6948150219846541,
|
||
|
|
"learning_rate": 1.0830383401457499e-05,
|
||
|
|
"loss": 0.4423,
|
||
|
|
"mean_token_accuracy": 0.8475161967799067,
|
||
|
|
"num_tokens": 163388010.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.424591064453125,
|
||
|
|
"epoch": 1.5158730158730158,
|
||
|
|
"grad_norm": 0.6197770130603713,
|
||
|
|
"learning_rate": 1.0743145402041781e-05,
|
||
|
|
"loss": 0.4356,
|
||
|
|
"mean_token_accuracy": 0.8487303233705461,
|
||
|
|
"num_tokens": 164270399.0,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4240570068359375,
|
||
|
|
"epoch": 1.5238095238095237,
|
||
|
|
"grad_norm": 0.6196952871398042,
|
||
|
|
"learning_rate": 1.0655850493505834e-05,
|
||
|
|
"loss": 0.4332,
|
||
|
|
"mean_token_accuracy": 0.8493564445525408,
|
||
|
|
"num_tokens": 165155523.0,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42626953125,
|
||
|
|
"epoch": 1.5317460317460316,
|
||
|
|
"grad_norm": 0.6835645143050197,
|
||
|
|
"learning_rate": 1.0568505360781606e-05,
|
||
|
|
"loss": 0.4323,
|
||
|
|
"mean_token_accuracy": 0.8495618836022913,
|
||
|
|
"num_tokens": 166004219.0,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4276275634765625,
|
||
|
|
"epoch": 1.5396825396825395,
|
||
|
|
"grad_norm": 0.6324264334244951,
|
||
|
|
"learning_rate": 1.0481116692647165e-05,
|
||
|
|
"loss": 0.433,
|
||
|
|
"mean_token_accuracy": 0.8492591748945415,
|
||
|
|
"num_tokens": 166887486.0,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.424072265625,
|
||
|
|
"epoch": 1.5476190476190477,
|
||
|
|
"grad_norm": 0.651624241869441,
|
||
|
|
"learning_rate": 1.039369118121445e-05,
|
||
|
|
"loss": 0.4353,
|
||
|
|
"mean_token_accuracy": 0.8506257832050323,
|
||
|
|
"num_tokens": 167743608.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4300537109375,
|
||
|
|
"epoch": 1.5555555555555556,
|
||
|
|
"grad_norm": 0.6114244627798591,
|
||
|
|
"learning_rate": 1.0306235521416822e-05,
|
||
|
|
"loss": 0.4327,
|
||
|
|
"mean_token_accuracy": 0.8502432033419609,
|
||
|
|
"num_tokens": 168602032.0,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42584228515625,
|
||
|
|
"epoch": 1.5634920634920635,
|
||
|
|
"grad_norm": 0.6819738923664077,
|
||
|
|
"learning_rate": 1.0218756410496353e-05,
|
||
|
|
"loss": 0.4399,
|
||
|
|
"mean_token_accuracy": 0.84707788284868,
|
||
|
|
"num_tokens": 169469975.0,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4196014404296875,
|
||
|
|
"epoch": 1.5714285714285714,
|
||
|
|
"grad_norm": 0.6181282041855998,
|
||
|
|
"learning_rate": 1.013126054749099e-05,
|
||
|
|
"loss": 0.4381,
|
||
|
|
"mean_token_accuracy": 0.8472674307413399,
|
||
|
|
"num_tokens": 170343282.0,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4282989501953125,
|
||
|
|
"epoch": 1.5793650793650795,
|
||
|
|
"grad_norm": 0.658618641759341,
|
||
|
|
"learning_rate": 1.0043754632721519e-05,
|
||
|
|
"loss": 0.4372,
|
||
|
|
"mean_token_accuracy": 0.8485192256048322,
|
||
|
|
"num_tokens": 171227432.0,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4275665283203125,
|
||
|
|
"epoch": 1.5873015873015874,
|
||
|
|
"grad_norm": 0.6304982479501472,
|
||
|
|
"learning_rate": 9.956245367278483e-06,
|
||
|
|
"loss": 0.4212,
|
||
|
|
"mean_token_accuracy": 0.8523962092585862,
|
||
|
|
"num_tokens": 172096305.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427978515625,
|
||
|
|
"epoch": 1.5952380952380953,
|
||
|
|
"grad_norm": 0.6371903010309696,
|
||
|
|
"learning_rate": 9.868739452509011e-06,
|
||
|
|
"loss": 0.4255,
|
||
|
|
"mean_token_accuracy": 0.8514103842899203,
|
||
|
|
"num_tokens": 172910673.0,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41900634765625,
|
||
|
|
"epoch": 1.6031746031746033,
|
||
|
|
"grad_norm": 0.6949109352240498,
|
||
|
|
"learning_rate": 9.78124358950365e-06,
|
||
|
|
"loss": 0.4312,
|
||
|
|
"mean_token_accuracy": 0.8511450518853962,
|
||
|
|
"num_tokens": 173775762.0,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4253692626953125,
|
||
|
|
"epoch": 1.6111111111111112,
|
||
|
|
"grad_norm": 0.6809037771433717,
|
||
|
|
"learning_rate": 9.693764478583185e-06,
|
||
|
|
"loss": 0.4341,
|
||
|
|
"mean_token_accuracy": 0.8501040656119585,
|
||
|
|
"num_tokens": 174651858.0,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.428009033203125,
|
||
|
|
"epoch": 1.619047619047619,
|
||
|
|
"grad_norm": 0.6543104002765698,
|
||
|
|
"learning_rate": 9.606308818785552e-06,
|
||
|
|
"loss": 0.425,
|
||
|
|
"mean_token_accuracy": 0.8514936515130103,
|
||
|
|
"num_tokens": 175519282.0,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.429931640625,
|
||
|
|
"epoch": 1.626984126984127,
|
||
|
|
"grad_norm": 0.6352549480190481,
|
||
|
|
"learning_rate": 9.518883307352839e-06,
|
||
|
|
"loss": 0.4405,
|
||
|
|
"mean_token_accuracy": 0.8475879756733775,
|
||
|
|
"num_tokens": 176387199.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4254150390625,
|
||
|
|
"epoch": 1.6349206349206349,
|
||
|
|
"grad_norm": 0.6497469357956864,
|
||
|
|
"learning_rate": 9.431494639218397e-06,
|
||
|
|
"loss": 0.4355,
|
||
|
|
"mean_token_accuracy": 0.8499571783468127,
|
||
|
|
"num_tokens": 177264131.0,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42901611328125,
|
||
|
|
"epoch": 1.6428571428571428,
|
||
|
|
"grad_norm": 0.6810764457880856,
|
||
|
|
"learning_rate": 9.344149506494169e-06,
|
||
|
|
"loss": 0.4281,
|
||
|
|
"mean_token_accuracy": 0.8508090190589428,
|
||
|
|
"num_tokens": 178114003.0,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4324951171875,
|
||
|
|
"epoch": 1.6507936507936507,
|
||
|
|
"grad_norm": 0.6629255607044134,
|
||
|
|
"learning_rate": 9.256854597958222e-06,
|
||
|
|
"loss": 0.4369,
|
||
|
|
"mean_token_accuracy": 0.8478540312498808,
|
||
|
|
"num_tokens": 178950487.0,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4268035888671875,
|
||
|
|
"epoch": 1.6587301587301586,
|
||
|
|
"grad_norm": 0.6985129275896726,
|
||
|
|
"learning_rate": 9.169616598542503e-06,
|
||
|
|
"loss": 0.4379,
|
||
|
|
"mean_token_accuracy": 0.8475267360918224,
|
||
|
|
"num_tokens": 179833212.0,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4263458251953125,
|
||
|
|
"epoch": 1.6666666666666665,
|
||
|
|
"grad_norm": 0.6679619087501362,
|
||
|
|
"learning_rate": 9.082442188820947e-06,
|
||
|
|
"loss": 0.4283,
|
||
|
|
"mean_token_accuracy": 0.8523209383711219,
|
||
|
|
"num_tokens": 180712234.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4289398193359375,
|
||
|
|
"epoch": 1.6746031746031746,
|
||
|
|
"grad_norm": 0.6473182358853972,
|
||
|
|
"learning_rate": 8.995338044497862e-06,
|
||
|
|
"loss": 0.4402,
|
||
|
|
"mean_token_accuracy": 0.8467970639467239,
|
||
|
|
"num_tokens": 181566490.0,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43121337890625,
|
||
|
|
"epoch": 1.6825396825396826,
|
||
|
|
"grad_norm": 0.6906263852183294,
|
||
|
|
"learning_rate": 8.90831083589672e-06,
|
||
|
|
"loss": 0.4335,
|
||
|
|
"mean_token_accuracy": 0.8494043787941337,
|
||
|
|
"num_tokens": 182413254.0,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4273223876953125,
|
||
|
|
"epoch": 1.6904761904761905,
|
||
|
|
"grad_norm": 0.6039213029248242,
|
||
|
|
"learning_rate": 8.821367227449368e-06,
|
||
|
|
"loss": 0.4276,
|
||
|
|
"mean_token_accuracy": 0.8509090105071664,
|
||
|
|
"num_tokens": 183269538.0,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.433563232421875,
|
||
|
|
"epoch": 1.6984126984126984,
|
||
|
|
"grad_norm": 0.7269407148981566,
|
||
|
|
"learning_rate": 8.734513877185644e-06,
|
||
|
|
"loss": 0.4245,
|
||
|
|
"mean_token_accuracy": 0.8521207985468209,
|
||
|
|
"num_tokens": 184109496.0,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4334564208984375,
|
||
|
|
"epoch": 1.7063492063492065,
|
||
|
|
"grad_norm": 0.5897073419234542,
|
||
|
|
"learning_rate": 8.647757436223543e-06,
|
||
|
|
"loss": 0.4383,
|
||
|
|
"mean_token_accuracy": 0.8479864248074591,
|
||
|
|
"num_tokens": 184968366.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4318084716796875,
|
||
|
|
"epoch": 1.7142857142857144,
|
||
|
|
"grad_norm": 0.6884447059302515,
|
||
|
|
"learning_rate": 8.561104548259864e-06,
|
||
|
|
"loss": 0.4376,
|
||
|
|
"mean_token_accuracy": 0.8475788393989205,
|
||
|
|
"num_tokens": 185857166.0,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43670654296875,
|
||
|
|
"epoch": 1.7222222222222223,
|
||
|
|
"grad_norm": 0.6407810130824321,
|
||
|
|
"learning_rate": 8.474561849061446e-06,
|
||
|
|
"loss": 0.4194,
|
||
|
|
"mean_token_accuracy": 0.8531361422501504,
|
||
|
|
"num_tokens": 186684767.0,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4307403564453125,
|
||
|
|
"epoch": 1.7301587301587302,
|
||
|
|
"grad_norm": 0.672082814857435,
|
||
|
|
"learning_rate": 8.388135965957031e-06,
|
||
|
|
"loss": 0.422,
|
||
|
|
"mean_token_accuracy": 0.8524083560332656,
|
||
|
|
"num_tokens": 187534641.0,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4279327392578125,
|
||
|
|
"epoch": 1.7380952380952381,
|
||
|
|
"grad_norm": 0.6434223057887511,
|
||
|
|
"learning_rate": 8.301833517329714e-06,
|
||
|
|
"loss": 0.4254,
|
||
|
|
"mean_token_accuracy": 0.8503254759125412,
|
||
|
|
"num_tokens": 188403747.0,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4255218505859375,
|
||
|
|
"epoch": 1.746031746031746,
|
||
|
|
"grad_norm": 0.660253597993015,
|
||
|
|
"learning_rate": 8.215661112110143e-06,
|
||
|
|
"loss": 0.43,
|
||
|
|
"mean_token_accuracy": 0.8505678987130523,
|
||
|
|
"num_tokens": 189286051.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4325103759765625,
|
||
|
|
"epoch": 1.753968253968254,
|
||
|
|
"grad_norm": 0.6268697653988943,
|
||
|
|
"learning_rate": 8.12962534927042e-06,
|
||
|
|
"loss": 0.4231,
|
||
|
|
"mean_token_accuracy": 0.851342577021569,
|
||
|
|
"num_tokens": 190135846.0,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4230194091796875,
|
||
|
|
"epoch": 1.7619047619047619,
|
||
|
|
"grad_norm": 0.6927249184398813,
|
||
|
|
"learning_rate": 8.043732817318736e-06,
|
||
|
|
"loss": 0.4331,
|
||
|
|
"mean_token_accuracy": 0.8501622658222914,
|
||
|
|
"num_tokens": 191023956.0,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4322509765625,
|
||
|
|
"epoch": 1.7698412698412698,
|
||
|
|
"grad_norm": 0.6104186946004958,
|
||
|
|
"learning_rate": 7.95799009379485e-06,
|
||
|
|
"loss": 0.4182,
|
||
|
|
"mean_token_accuracy": 0.8539444855414331,
|
||
|
|
"num_tokens": 191865715.0,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4320526123046875,
|
||
|
|
"epoch": 1.7777777777777777,
|
||
|
|
"grad_norm": 0.5976651871901888,
|
||
|
|
"learning_rate": 7.872403744766383e-06,
|
||
|
|
"loss": 0.4263,
|
||
|
|
"mean_token_accuracy": 0.8515215283259749,
|
||
|
|
"num_tokens": 192687536.0,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430633544921875,
|
||
|
|
"epoch": 1.7857142857142856,
|
||
|
|
"grad_norm": 0.6460900080888754,
|
||
|
|
"learning_rate": 7.786980324325994e-06,
|
||
|
|
"loss": 0.4199,
|
||
|
|
"mean_token_accuracy": 0.8545417245477438,
|
||
|
|
"num_tokens": 193514317.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4226226806640625,
|
||
|
|
"epoch": 1.7936507936507935,
|
||
|
|
"grad_norm": 0.5833584404095209,
|
||
|
|
"learning_rate": 7.70172637408949e-06,
|
||
|
|
"loss": 0.4224,
|
||
|
|
"mean_token_accuracy": 0.8528628125786781,
|
||
|
|
"num_tokens": 194368425.0,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42388916015625,
|
||
|
|
"epoch": 1.8015873015873016,
|
||
|
|
"grad_norm": 0.6154377441414096,
|
||
|
|
"learning_rate": 7.616648422694858e-06,
|
||
|
|
"loss": 0.4241,
|
||
|
|
"mean_token_accuracy": 0.851911770645529,
|
||
|
|
"num_tokens": 195229420.0,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42431640625,
|
||
|
|
"epoch": 1.8095238095238095,
|
||
|
|
"grad_norm": 0.6645832127632725,
|
||
|
|
"learning_rate": 7.531752985302323e-06,
|
||
|
|
"loss": 0.429,
|
||
|
|
"mean_token_accuracy": 0.8506735726259649,
|
||
|
|
"num_tokens": 196086060.0,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4266510009765625,
|
||
|
|
"epoch": 1.8174603174603174,
|
||
|
|
"grad_norm": 0.5959462674015885,
|
||
|
|
"learning_rate": 7.447046563095425e-06,
|
||
|
|
"loss": 0.4251,
|
||
|
|
"mean_token_accuracy": 0.8506376668810844,
|
||
|
|
"num_tokens": 196949752.0,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425537109375,
|
||
|
|
"epoch": 1.8253968253968254,
|
||
|
|
"grad_norm": 0.6534426125937292,
|
||
|
|
"learning_rate": 7.362535642783155e-06,
|
||
|
|
"loss": 0.4218,
|
||
|
|
"mean_token_accuracy": 0.8530319351702929,
|
||
|
|
"num_tokens": 197787383.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4254608154296875,
|
||
|
|
"epoch": 1.8333333333333335,
|
||
|
|
"grad_norm": 0.5921640707070417,
|
||
|
|
"learning_rate": 7.278226696103239e-06,
|
||
|
|
"loss": 0.4306,
|
||
|
|
"mean_token_accuracy": 0.8518684362061322,
|
||
|
|
"num_tokens": 198640204.0,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418487548828125,
|
||
|
|
"epoch": 1.8412698412698414,
|
||
|
|
"grad_norm": 0.6502344798493351,
|
||
|
|
"learning_rate": 7.194126179326497e-06,
|
||
|
|
"loss": 0.4293,
|
||
|
|
"mean_token_accuracy": 0.850899113342166,
|
||
|
|
"num_tokens": 199534945.0,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42578125,
|
||
|
|
"epoch": 1.8492063492063493,
|
||
|
|
"grad_norm": 0.6743484773379774,
|
||
|
|
"learning_rate": 7.110240532762469e-06,
|
||
|
|
"loss": 0.421,
|
||
|
|
"mean_token_accuracy": 0.8534788498654962,
|
||
|
|
"num_tokens": 200401566.0,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4231109619140625,
|
||
|
|
"epoch": 1.8571428571428572,
|
||
|
|
"grad_norm": 0.5734181994402893,
|
||
|
|
"learning_rate": 7.026576180266213e-06,
|
||
|
|
"loss": 0.4247,
|
||
|
|
"mean_token_accuracy": 0.8510698927566409,
|
||
|
|
"num_tokens": 201286569.0,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423370361328125,
|
||
|
|
"epoch": 1.8650793650793651,
|
||
|
|
"grad_norm": 0.6211303859995316,
|
||
|
|
"learning_rate": 6.9431395287463655e-06,
|
||
|
|
"loss": 0.4216,
|
||
|
|
"mean_token_accuracy": 0.8529601790942252,
|
||
|
|
"num_tokens": 202148785.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4267425537109375,
|
||
|
|
"epoch": 1.873015873015873,
|
||
|
|
"grad_norm": 0.6512618843621472,
|
||
|
|
"learning_rate": 6.859936967674509e-06,
|
||
|
|
"loss": 0.4179,
|
||
|
|
"mean_token_accuracy": 0.8532352782785892,
|
||
|
|
"num_tokens": 202969412.0,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4201812744140625,
|
||
|
|
"epoch": 1.880952380952381,
|
||
|
|
"grad_norm": 0.5685040340161264,
|
||
|
|
"learning_rate": 6.776974868595898e-06,
|
||
|
|
"loss": 0.4228,
|
||
|
|
"mean_token_accuracy": 0.852645758073777,
|
||
|
|
"num_tokens": 203845826.0,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42022705078125,
|
||
|
|
"epoch": 1.8888888888888888,
|
||
|
|
"grad_norm": 0.6361988218426201,
|
||
|
|
"learning_rate": 6.694259584641496e-06,
|
||
|
|
"loss": 0.4194,
|
||
|
|
"mean_token_accuracy": 0.853855645749718,
|
||
|
|
"num_tokens": 204713067.0,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426239013671875,
|
||
|
|
"epoch": 1.8968253968253967,
|
||
|
|
"grad_norm": 0.5758526586055672,
|
||
|
|
"learning_rate": 6.611797450041495e-06,
|
||
|
|
"loss": 0.4189,
|
||
|
|
"mean_token_accuracy": 0.8542146142572165,
|
||
|
|
"num_tokens": 205549482.0,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421630859375,
|
||
|
|
"epoch": 1.9047619047619047,
|
||
|
|
"grad_norm": 0.6073175159295213,
|
||
|
|
"learning_rate": 6.5295947796402315e-06,
|
||
|
|
"loss": 0.4202,
|
||
|
|
"mean_token_accuracy": 0.85362005000934,
|
||
|
|
"num_tokens": 206394578.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4246978759765625,
|
||
|
|
"epoch": 1.9126984126984126,
|
||
|
|
"grad_norm": 0.5769397482915233,
|
||
|
|
"learning_rate": 6.447657868412603e-06,
|
||
|
|
"loss": 0.426,
|
||
|
|
"mean_token_accuracy": 0.8522290964610875,
|
||
|
|
"num_tokens": 207233636.0,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4238433837890625,
|
||
|
|
"epoch": 1.9206349206349205,
|
||
|
|
"grad_norm": 0.6006836089858458,
|
||
|
|
"learning_rate": 6.365992990982015e-06,
|
||
|
|
"loss": 0.4268,
|
||
|
|
"mean_token_accuracy": 0.8512483732774854,
|
||
|
|
"num_tokens": 208074376.0,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.422637939453125,
|
||
|
|
"epoch": 1.9285714285714286,
|
||
|
|
"grad_norm": 0.5738787319949351,
|
||
|
|
"learning_rate": 6.284606401139875e-06,
|
||
|
|
"loss": 0.4262,
|
||
|
|
"mean_token_accuracy": 0.8505582748912275,
|
||
|
|
"num_tokens": 208947370.0,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4206695556640625,
|
||
|
|
"epoch": 1.9365079365079365,
|
||
|
|
"grad_norm": 0.5799710929690532,
|
||
|
|
"learning_rate": 6.203504331366677e-06,
|
||
|
|
"loss": 0.4155,
|
||
|
|
"mean_token_accuracy": 0.8547424203716218,
|
||
|
|
"num_tokens": 209798547.0,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425048828125,
|
||
|
|
"epoch": 1.9444444444444444,
|
||
|
|
"grad_norm": 0.6423312059049974,
|
||
|
|
"learning_rate": 6.122692992354748e-06,
|
||
|
|
"loss": 0.4229,
|
||
|
|
"mean_token_accuracy": 0.8520036181434989,
|
||
|
|
"num_tokens": 210661829.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4195709228515625,
|
||
|
|
"epoch": 1.9523809523809523,
|
||
|
|
"grad_norm": 0.5554620758818444,
|
||
|
|
"learning_rate": 6.0421785725326085e-06,
|
||
|
|
"loss": 0.4216,
|
||
|
|
"mean_token_accuracy": 0.8520813095383346,
|
||
|
|
"num_tokens": 211549046.0,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4189910888671875,
|
||
|
|
"epoch": 1.9603174603174605,
|
||
|
|
"grad_norm": 0.6117133326079128,
|
||
|
|
"learning_rate": 5.9619672375911065e-06,
|
||
|
|
"loss": 0.4148,
|
||
|
|
"mean_token_accuracy": 0.8543260907754302,
|
||
|
|
"num_tokens": 212447521.0,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4197235107421875,
|
||
|
|
"epoch": 1.9682539682539684,
|
||
|
|
"grad_norm": 0.6336191027571944,
|
||
|
|
"learning_rate": 5.882065130011226e-06,
|
||
|
|
"loss": 0.4209,
|
||
|
|
"mean_token_accuracy": 0.8533798800781369,
|
||
|
|
"num_tokens": 213310334.0,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4182586669921875,
|
||
|
|
"epoch": 1.9761904761904763,
|
||
|
|
"grad_norm": 0.7899010517492578,
|
||
|
|
"learning_rate": 5.80247836859372e-06,
|
||
|
|
"loss": 0.4207,
|
||
|
|
"mean_token_accuracy": 0.8531867042183876,
|
||
|
|
"num_tokens": 214169043.0,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4210205078125,
|
||
|
|
"epoch": 1.9841269841269842,
|
||
|
|
"grad_norm": 0.5663331071559328,
|
||
|
|
"learning_rate": 5.723213047990553e-06,
|
||
|
|
"loss": 0.4212,
|
||
|
|
"mean_token_accuracy": 0.8530774302780628,
|
||
|
|
"num_tokens": 215005057.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4158935546875,
|
||
|
|
"epoch": 1.992063492063492,
|
||
|
|
"grad_norm": 0.6025248790984578,
|
||
|
|
"learning_rate": 5.64427523823813e-06,
|
||
|
|
"loss": 0.4104,
|
||
|
|
"mean_token_accuracy": 0.8561887559480965,
|
||
|
|
"num_tokens": 215869766.0,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4177703857421875,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 0.5973147348731676,
|
||
|
|
"learning_rate": 5.5656709842925335e-06,
|
||
|
|
"loss": 0.4144,
|
||
|
|
"mean_token_accuracy": 0.8541272669099271,
|
||
|
|
"num_tokens": 216731206.0,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4171600341796875,
|
||
|
|
"epoch": 2.007936507936508,
|
||
|
|
"grad_norm": 0.6276654160889864,
|
||
|
|
"learning_rate": 5.4874063055665495e-06,
|
||
|
|
"loss": 0.3902,
|
||
|
|
"mean_token_accuracy": 0.8642581212334335,
|
||
|
|
"num_tokens": 217589905.0,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4161834716796875,
|
||
|
|
"epoch": 2.015873015873016,
|
||
|
|
"grad_norm": 0.6655836865810364,
|
||
|
|
"learning_rate": 5.40948719546873e-06,
|
||
|
|
"loss": 0.3946,
|
||
|
|
"mean_token_accuracy": 0.8615933828987181,
|
||
|
|
"num_tokens": 218446876.0,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4152069091796875,
|
||
|
|
"epoch": 2.0238095238095237,
|
||
|
|
"grad_norm": 0.5865307595034494,
|
||
|
|
"learning_rate": 5.331919620944438e-06,
|
||
|
|
"loss": 0.3954,
|
||
|
|
"mean_token_accuracy": 0.86083889240399,
|
||
|
|
"num_tokens": 219322571.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4158782958984375,
|
||
|
|
"epoch": 2.0317460317460316,
|
||
|
|
"grad_norm": 0.529665010763266,
|
||
|
|
"learning_rate": 5.2547095220188815e-06,
|
||
|
|
"loss": 0.3933,
|
||
|
|
"mean_token_accuracy": 0.8596651367843151,
|
||
|
|
"num_tokens": 220180160.0,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4136505126953125,
|
||
|
|
"epoch": 2.0396825396825395,
|
||
|
|
"grad_norm": 0.5832622748528953,
|
||
|
|
"learning_rate": 5.177862811342254e-06,
|
||
|
|
"loss": 0.3836,
|
||
|
|
"mean_token_accuracy": 0.8657393348403275,
|
||
|
|
"num_tokens": 221016578.0,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4110565185546875,
|
||
|
|
"epoch": 2.0476190476190474,
|
||
|
|
"grad_norm": 0.6002714768590333,
|
||
|
|
"learning_rate": 5.101385373736937e-06,
|
||
|
|
"loss": 0.3919,
|
||
|
|
"mean_token_accuracy": 0.8619846105575562,
|
||
|
|
"num_tokens": 221871968.0,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4073638916015625,
|
||
|
|
"epoch": 2.0555555555555554,
|
||
|
|
"grad_norm": 0.5644672942522324,
|
||
|
|
"learning_rate": 5.025283065746855e-06,
|
||
|
|
"loss": 0.3825,
|
||
|
|
"mean_token_accuracy": 0.8645827597938478,
|
||
|
|
"num_tokens": 222738323.0,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4102935791015625,
|
||
|
|
"epoch": 2.0634920634920633,
|
||
|
|
"grad_norm": 0.5904010681889716,
|
||
|
|
"learning_rate": 4.949561715189001e-06,
|
||
|
|
"loss": 0.388,
|
||
|
|
"mean_token_accuracy": 0.8645457159727812,
|
||
|
|
"num_tokens": 223584134.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40545654296875,
|
||
|
|
"epoch": 2.0714285714285716,
|
||
|
|
"grad_norm": 0.5790799071819784,
|
||
|
|
"learning_rate": 4.8742271207071226e-06,
|
||
|
|
"loss": 0.3877,
|
||
|
|
"mean_token_accuracy": 0.8634475646540523,
|
||
|
|
"num_tokens": 224461654.0,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4102325439453125,
|
||
|
|
"epoch": 2.0793650793650795,
|
||
|
|
"grad_norm": 0.5721734360627909,
|
||
|
|
"learning_rate": 4.799285051327686e-06,
|
||
|
|
"loss": 0.3938,
|
||
|
|
"mean_token_accuracy": 0.8604177525267005,
|
||
|
|
"num_tokens": 225327562.0,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4100799560546875,
|
||
|
|
"epoch": 2.0873015873015874,
|
||
|
|
"grad_norm": 0.6109882214140274,
|
||
|
|
"learning_rate": 4.724741246018103e-06,
|
||
|
|
"loss": 0.385,
|
||
|
|
"mean_token_accuracy": 0.8638843321241438,
|
||
|
|
"num_tokens": 226189031.0,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4103240966796875,
|
||
|
|
"epoch": 2.0952380952380953,
|
||
|
|
"grad_norm": 0.5597646515339337,
|
||
|
|
"learning_rate": 4.650601413247214e-06,
|
||
|
|
"loss": 0.3998,
|
||
|
|
"mean_token_accuracy": 0.8596187229268253,
|
||
|
|
"num_tokens": 227062309.0,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4138641357421875,
|
||
|
|
"epoch": 2.1031746031746033,
|
||
|
|
"grad_norm": 0.5631751720121708,
|
||
|
|
"learning_rate": 4.57687123054817e-06,
|
||
|
|
"loss": 0.391,
|
||
|
|
"mean_token_accuracy": 0.8622312569059432,
|
||
|
|
"num_tokens": 227920598.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4128875732421875,
|
||
|
|
"epoch": 2.111111111111111,
|
||
|
|
"grad_norm": 0.5735787131262603,
|
||
|
|
"learning_rate": 4.503556344083656e-06,
|
||
|
|
"loss": 0.3869,
|
||
|
|
"mean_token_accuracy": 0.8629599534906447,
|
||
|
|
"num_tokens": 228773818.0,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.412200927734375,
|
||
|
|
"epoch": 2.119047619047619,
|
||
|
|
"grad_norm": 0.5725239133889661,
|
||
|
|
"learning_rate": 4.4306623682134875e-06,
|
||
|
|
"loss": 0.3827,
|
||
|
|
"mean_token_accuracy": 0.8646458461880684,
|
||
|
|
"num_tokens": 229627711.0,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4097442626953125,
|
||
|
|
"epoch": 2.126984126984127,
|
||
|
|
"grad_norm": 0.5481999431523443,
|
||
|
|
"learning_rate": 4.358194885064704e-06,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"mean_token_accuracy": 0.8609438170678914,
|
||
|
|
"num_tokens": 230489032.0,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.409942626953125,
|
||
|
|
"epoch": 2.134920634920635,
|
||
|
|
"grad_norm": 0.5621397553390326,
|
||
|
|
"learning_rate": 4.286159444104068e-06,
|
||
|
|
"loss": 0.3943,
|
||
|
|
"mean_token_accuracy": 0.8611305872909725,
|
||
|
|
"num_tokens": 231339019.0,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41339111328125,
|
||
|
|
"epoch": 2.142857142857143,
|
||
|
|
"grad_norm": 0.7325265217894285,
|
||
|
|
"learning_rate": 4.2145615617131095e-06,
|
||
|
|
"loss": 0.3935,
|
||
|
|
"mean_token_accuracy": 0.8603947004303336,
|
||
|
|
"num_tokens": 232199672.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.415496826171875,
|
||
|
|
"epoch": 2.1507936507936507,
|
||
|
|
"grad_norm": 0.6471864956846543,
|
||
|
|
"learning_rate": 4.143406720765687e-06,
|
||
|
|
"loss": 0.3915,
|
||
|
|
"mean_token_accuracy": 0.8618022156879306,
|
||
|
|
"num_tokens": 233076007.0,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4146575927734375,
|
||
|
|
"epoch": 2.1587301587301586,
|
||
|
|
"grad_norm": 0.5471917966436562,
|
||
|
|
"learning_rate": 4.0727003702081146e-06,
|
||
|
|
"loss": 0.3896,
|
||
|
|
"mean_token_accuracy": 0.8616545354016125,
|
||
|
|
"num_tokens": 233942156.0,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4138031005859375,
|
||
|
|
"epoch": 2.1666666666666665,
|
||
|
|
"grad_norm": 0.5471689661185632,
|
||
|
|
"learning_rate": 4.002447924641882e-06,
|
||
|
|
"loss": 0.3912,
|
||
|
|
"mean_token_accuracy": 0.8624369469471276,
|
||
|
|
"num_tokens": 234844668.0,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4194183349609375,
|
||
|
|
"epoch": 2.1746031746031744,
|
||
|
|
"grad_norm": 0.5341897422332818,
|
||
|
|
"learning_rate": 3.9326547639090315e-06,
|
||
|
|
"loss": 0.3976,
|
||
|
|
"mean_token_accuracy": 0.8597730663605034,
|
||
|
|
"num_tokens": 235697877.0,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4160614013671875,
|
||
|
|
"epoch": 2.1825396825396823,
|
||
|
|
"grad_norm": 0.5650190225140705,
|
||
|
|
"learning_rate": 3.863326232680148e-06,
|
||
|
|
"loss": 0.3867,
|
||
|
|
"mean_token_accuracy": 0.8626719349995255,
|
||
|
|
"num_tokens": 236586699.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4175567626953125,
|
||
|
|
"epoch": 2.1904761904761907,
|
||
|
|
"grad_norm": 0.5875625634296703,
|
||
|
|
"learning_rate": 3.7944676400451017e-06,
|
||
|
|
"loss": 0.3871,
|
||
|
|
"mean_token_accuracy": 0.861822621896863,
|
||
|
|
"num_tokens": 237426378.0,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41094970703125,
|
||
|
|
"epoch": 2.1984126984126986,
|
||
|
|
"grad_norm": 0.6096935487800307,
|
||
|
|
"learning_rate": 3.7260842591064504e-06,
|
||
|
|
"loss": 0.3871,
|
||
|
|
"mean_token_accuracy": 0.8619845635257661,
|
||
|
|
"num_tokens": 238297987.0,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4205474853515625,
|
||
|
|
"epoch": 2.2063492063492065,
|
||
|
|
"grad_norm": 0.5794805245252707,
|
||
|
|
"learning_rate": 3.6581813265756595e-06,
|
||
|
|
"loss": 0.3988,
|
||
|
|
"mean_token_accuracy": 0.8605999210849404,
|
||
|
|
"num_tokens": 239171101.0,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4158172607421875,
|
||
|
|
"epoch": 2.2142857142857144,
|
||
|
|
"grad_norm": 0.5763915780264333,
|
||
|
|
"learning_rate": 3.590764042372079e-06,
|
||
|
|
"loss": 0.3844,
|
||
|
|
"mean_token_accuracy": 0.8631811602972448,
|
||
|
|
"num_tokens": 240034337.0,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4194793701171875,
|
||
|
|
"epoch": 2.2222222222222223,
|
||
|
|
"grad_norm": 0.5303818182249157,
|
||
|
|
"learning_rate": 3.523837569224725e-06,
|
||
|
|
"loss": 0.3792,
|
||
|
|
"mean_token_accuracy": 0.8658822155557573,
|
||
|
|
"num_tokens": 240860927.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41436767578125,
|
||
|
|
"epoch": 2.2301587301587302,
|
||
|
|
"grad_norm": 0.5581623734673887,
|
||
|
|
"learning_rate": 3.4574070322769347e-06,
|
||
|
|
"loss": 0.3896,
|
||
|
|
"mean_token_accuracy": 0.8626445569097996,
|
||
|
|
"num_tokens": 241739076.0,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.414520263671875,
|
||
|
|
"epoch": 2.238095238095238,
|
||
|
|
"grad_norm": 0.5494365768968688,
|
||
|
|
"learning_rate": 3.391477518693894e-06,
|
||
|
|
"loss": 0.3805,
|
||
|
|
"mean_token_accuracy": 0.8653818825259805,
|
||
|
|
"num_tokens": 242574011.0,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.415740966796875,
|
||
|
|
"epoch": 2.246031746031746,
|
||
|
|
"grad_norm": 0.5877235744523559,
|
||
|
|
"learning_rate": 3.3260540772730576e-06,
|
||
|
|
"loss": 0.3902,
|
||
|
|
"mean_token_accuracy": 0.8616071529686451,
|
||
|
|
"num_tokens": 243458878.0,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.412841796875,
|
||
|
|
"epoch": 2.253968253968254,
|
||
|
|
"grad_norm": 0.5569142251529777,
|
||
|
|
"learning_rate": 3.261141718057523e-06,
|
||
|
|
"loss": 0.3879,
|
||
|
|
"mean_token_accuracy": 0.8641184438019991,
|
||
|
|
"num_tokens": 244313964.0,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.414337158203125,
|
||
|
|
"epoch": 2.261904761904762,
|
||
|
|
"grad_norm": 0.579133790812722,
|
||
|
|
"learning_rate": 3.1967454119523745e-06,
|
||
|
|
"loss": 0.3827,
|
||
|
|
"mean_token_accuracy": 0.8644507811404765,
|
||
|
|
"num_tokens": 245200322.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41656494140625,
|
||
|
|
"epoch": 2.2698412698412698,
|
||
|
|
"grad_norm": 0.5714755304570499,
|
||
|
|
"learning_rate": 3.1328700903440045e-06,
|
||
|
|
"loss": 0.3867,
|
||
|
|
"mean_token_accuracy": 0.8641348239034414,
|
||
|
|
"num_tokens": 246083539.0,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41748046875,
|
||
|
|
"epoch": 2.2777777777777777,
|
||
|
|
"grad_norm": 0.5402817316233841,
|
||
|
|
"learning_rate": 3.0695206447224923e-06,
|
||
|
|
"loss": 0.3882,
|
||
|
|
"mean_token_accuracy": 0.8631519465707242,
|
||
|
|
"num_tokens": 246933619.0,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4218597412109375,
|
||
|
|
"epoch": 2.2857142857142856,
|
||
|
|
"grad_norm": 0.5351912335816236,
|
||
|
|
"learning_rate": 3.0067019263069973e-06,
|
||
|
|
"loss": 0.3797,
|
||
|
|
"mean_token_accuracy": 0.8656269912607968,
|
||
|
|
"num_tokens": 247765672.0,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4116973876953125,
|
||
|
|
"epoch": 2.2936507936507935,
|
||
|
|
"grad_norm": 0.5559360442278566,
|
||
|
|
"learning_rate": 2.9444187456742855e-06,
|
||
|
|
"loss": 0.3812,
|
||
|
|
"mean_token_accuracy": 0.8642131965607405,
|
||
|
|
"num_tokens": 248628378.0,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4143218994140625,
|
||
|
|
"epoch": 2.3015873015873014,
|
||
|
|
"grad_norm": 0.5890265994765199,
|
||
|
|
"learning_rate": 2.8826758723903192e-06,
|
||
|
|
"loss": 0.3895,
|
||
|
|
"mean_token_accuracy": 0.8638571444898844,
|
||
|
|
"num_tokens": 249501143.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4134368896484375,
|
||
|
|
"epoch": 2.3095238095238093,
|
||
|
|
"grad_norm": 0.5998092954683998,
|
||
|
|
"learning_rate": 2.821478034645009e-06,
|
||
|
|
"loss": 0.3842,
|
||
|
|
"mean_token_accuracy": 0.8642507120966911,
|
||
|
|
"num_tokens": 250356099.0,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4135284423828125,
|
||
|
|
"epoch": 2.317460317460317,
|
||
|
|
"grad_norm": 0.5510958860977246,
|
||
|
|
"learning_rate": 2.7608299188901632e-06,
|
||
|
|
"loss": 0.3861,
|
||
|
|
"mean_token_accuracy": 0.8630354404449463,
|
||
|
|
"num_tokens": 251219125.0,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.410308837890625,
|
||
|
|
"epoch": 2.3253968253968256,
|
||
|
|
"grad_norm": 0.5629966415686625,
|
||
|
|
"learning_rate": 2.7007361694805735e-06,
|
||
|
|
"loss": 0.3852,
|
||
|
|
"mean_token_accuracy": 0.8642032388597727,
|
||
|
|
"num_tokens": 252080434.0,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4159088134765625,
|
||
|
|
"epoch": 2.3333333333333335,
|
||
|
|
"grad_norm": 0.5390902225233088,
|
||
|
|
"learning_rate": 2.64120138831837e-06,
|
||
|
|
"loss": 0.3813,
|
||
|
|
"mean_token_accuracy": 0.8645626241341233,
|
||
|
|
"num_tokens": 252923218.0,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4195098876953125,
|
||
|
|
"epoch": 2.3412698412698414,
|
||
|
|
"grad_norm": 0.539915212152828,
|
||
|
|
"learning_rate": 2.5822301345006196e-06,
|
||
|
|
"loss": 0.3822,
|
||
|
|
"mean_token_accuracy": 0.8648238624446094,
|
||
|
|
"num_tokens": 253761289.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.412567138671875,
|
||
|
|
"epoch": 2.3492063492063493,
|
||
|
|
"grad_norm": 0.5605080176549344,
|
||
|
|
"learning_rate": 2.5238269239701816e-06,
|
||
|
|
"loss": 0.3883,
|
||
|
|
"mean_token_accuracy": 0.862140198238194,
|
||
|
|
"num_tokens": 254643716.0,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.416534423828125,
|
||
|
|
"epoch": 2.357142857142857,
|
||
|
|
"grad_norm": 0.5577110501244517,
|
||
|
|
"learning_rate": 2.4659962291698936e-06,
|
||
|
|
"loss": 0.3878,
|
||
|
|
"mean_token_accuracy": 0.862798870075494,
|
||
|
|
"num_tokens": 255486573.0,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.413330078125,
|
||
|
|
"epoch": 2.365079365079365,
|
||
|
|
"grad_norm": 0.564255039988231,
|
||
|
|
"learning_rate": 2.408742478700071e-06,
|
||
|
|
"loss": 0.3759,
|
||
|
|
"mean_token_accuracy": 0.8661748920567334,
|
||
|
|
"num_tokens": 256345326.0,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.411529541015625,
|
||
|
|
"epoch": 2.373015873015873,
|
||
|
|
"grad_norm": 0.5347008401050389,
|
||
|
|
"learning_rate": 2.352070056979375e-06,
|
||
|
|
"loss": 0.3762,
|
||
|
|
"mean_token_accuracy": 0.867778348736465,
|
||
|
|
"num_tokens": 257185152.0,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41009521484375,
|
||
|
|
"epoch": 2.380952380952381,
|
||
|
|
"grad_norm": 0.543409354225312,
|
||
|
|
"learning_rate": 2.295983303909065e-06,
|
||
|
|
"loss": 0.3821,
|
||
|
|
"mean_token_accuracy": 0.8653234201483428,
|
||
|
|
"num_tokens": 258067282.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4130859375,
|
||
|
|
"epoch": 2.388888888888889,
|
||
|
|
"grad_norm": 0.5346168976688244,
|
||
|
|
"learning_rate": 2.2404865145406353e-06,
|
||
|
|
"loss": 0.3852,
|
||
|
|
"mean_token_accuracy": 0.8636267627589405,
|
||
|
|
"num_tokens": 258944016.0,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4159393310546875,
|
||
|
|
"epoch": 2.3968253968253967,
|
||
|
|
"grad_norm": 0.5607670324824822,
|
||
|
|
"learning_rate": 2.1855839387469237e-06,
|
||
|
|
"loss": 0.3804,
|
||
|
|
"mean_token_accuracy": 0.8650295240804553,
|
||
|
|
"num_tokens": 259808268.0,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4105377197265625,
|
||
|
|
"epoch": 2.4047619047619047,
|
||
|
|
"grad_norm": 0.5763140805976009,
|
||
|
|
"learning_rate": 2.1312797808966625e-06,
|
||
|
|
"loss": 0.3903,
|
||
|
|
"mean_token_accuracy": 0.863428748678416,
|
||
|
|
"num_tokens": 260684292.0,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.414642333984375,
|
||
|
|
"epoch": 2.4126984126984126,
|
||
|
|
"grad_norm": 0.6375048447554551,
|
||
|
|
"learning_rate": 2.0775781995324886e-06,
|
||
|
|
"loss": 0.3858,
|
||
|
|
"mean_token_accuracy": 0.8646045490168035,
|
||
|
|
"num_tokens": 261555918.0,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.413177490234375,
|
||
|
|
"epoch": 2.4206349206349205,
|
||
|
|
"grad_norm": 0.5711513351236631,
|
||
|
|
"learning_rate": 2.024483307052526e-06,
|
||
|
|
"loss": 0.3816,
|
||
|
|
"mean_token_accuracy": 0.8652195022441447,
|
||
|
|
"num_tokens": 262425946.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4171905517578125,
|
||
|
|
"epoch": 2.4285714285714284,
|
||
|
|
"grad_norm": 0.5460960698149459,
|
||
|
|
"learning_rate": 1.971999169395432e-06,
|
||
|
|
"loss": 0.3764,
|
||
|
|
"mean_token_accuracy": 0.8664770247414708,
|
||
|
|
"num_tokens": 263268966.0,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41497802734375,
|
||
|
|
"epoch": 2.4365079365079367,
|
||
|
|
"grad_norm": 0.5256766338517593,
|
||
|
|
"learning_rate": 1.920129805729043e-06,
|
||
|
|
"loss": 0.3806,
|
||
|
|
"mean_token_accuracy": 0.8658863957971334,
|
||
|
|
"num_tokens": 264137961.0,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4152679443359375,
|
||
|
|
"epoch": 2.4444444444444446,
|
||
|
|
"grad_norm": 0.5266763600052715,
|
||
|
|
"learning_rate": 1.8688791881426017e-06,
|
||
|
|
"loss": 0.3805,
|
||
|
|
"mean_token_accuracy": 0.8645090684294701,
|
||
|
|
"num_tokens": 264995910.0,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41461181640625,
|
||
|
|
"epoch": 2.4523809523809526,
|
||
|
|
"grad_norm": 0.5434774314682953,
|
||
|
|
"learning_rate": 1.8182512413425624e-06,
|
||
|
|
"loss": 0.3799,
|
||
|
|
"mean_token_accuracy": 0.865009430795908,
|
||
|
|
"num_tokens": 265867518.0,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41693115234375,
|
||
|
|
"epoch": 2.4603174603174605,
|
||
|
|
"grad_norm": 0.5493374323066029,
|
||
|
|
"learning_rate": 1.7682498423520545e-06,
|
||
|
|
"loss": 0.3848,
|
||
|
|
"mean_token_accuracy": 0.8647962850518525,
|
||
|
|
"num_tokens": 266730039.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41461181640625,
|
||
|
|
"epoch": 2.4682539682539684,
|
||
|
|
"grad_norm": 0.5317398307781561,
|
||
|
|
"learning_rate": 1.7188788202139794e-06,
|
||
|
|
"loss": 0.3875,
|
||
|
|
"mean_token_accuracy": 0.8631189134903252,
|
||
|
|
"num_tokens": 267605673.0,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.415802001953125,
|
||
|
|
"epoch": 2.4761904761904763,
|
||
|
|
"grad_norm": 0.5266121906145388,
|
||
|
|
"learning_rate": 1.6701419556977882e-06,
|
||
|
|
"loss": 0.3886,
|
||
|
|
"mean_token_accuracy": 0.864013391546905,
|
||
|
|
"num_tokens": 268470812.0,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4138946533203125,
|
||
|
|
"epoch": 2.484126984126984,
|
||
|
|
"grad_norm": 0.5249027475139008,
|
||
|
|
"learning_rate": 1.6220429810099603e-06,
|
||
|
|
"loss": 0.3792,
|
||
|
|
"mean_token_accuracy": 0.8644625195302069,
|
||
|
|
"num_tokens": 269329768.0,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41455078125,
|
||
|
|
"epoch": 2.492063492063492,
|
||
|
|
"grad_norm": 0.5316446888258474,
|
||
|
|
"learning_rate": 1.5745855795081889e-06,
|
||
|
|
"loss": 0.386,
|
||
|
|
"mean_token_accuracy": 0.8626924455165863,
|
||
|
|
"num_tokens": 270192672.0,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.416595458984375,
|
||
|
|
"epoch": 2.5,
|
||
|
|
"grad_norm": 0.507973852043752,
|
||
|
|
"learning_rate": 1.527773385419311e-06,
|
||
|
|
"loss": 0.3878,
|
||
|
|
"mean_token_accuracy": 0.8625178756192327,
|
||
|
|
"num_tokens": 271053623.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4129638671875,
|
||
|
|
"epoch": 2.507936507936508,
|
||
|
|
"grad_norm": 0.5077791813190773,
|
||
|
|
"learning_rate": 1.4816099835610209e-06,
|
||
|
|
"loss": 0.3834,
|
||
|
|
"mean_token_accuracy": 0.8646475677378476,
|
||
|
|
"num_tokens": 271921985.0,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423919677734375,
|
||
|
|
"epoch": 2.515873015873016,
|
||
|
|
"grad_norm": 0.5348663606444165,
|
||
|
|
"learning_rate": 1.4360989090673284e-06,
|
||
|
|
"loss": 0.3838,
|
||
|
|
"mean_token_accuracy": 0.8645837544463575,
|
||
|
|
"num_tokens": 272757706.0,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.419281005859375,
|
||
|
|
"epoch": 2.5238095238095237,
|
||
|
|
"grad_norm": 0.5272426034742803,
|
||
|
|
"learning_rate": 1.3912436471178525e-06,
|
||
|
|
"loss": 0.3786,
|
||
|
|
"mean_token_accuracy": 0.8656437643803656,
|
||
|
|
"num_tokens": 273603268.0,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4167327880859375,
|
||
|
|
"epoch": 2.5317460317460316,
|
||
|
|
"grad_norm": 0.5340390184826158,
|
||
|
|
"learning_rate": 1.3470476326709337e-06,
|
||
|
|
"loss": 0.3788,
|
||
|
|
"mean_token_accuracy": 0.8665351970121264,
|
||
|
|
"num_tokens": 274458735.0,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41522216796875,
|
||
|
|
"epoch": 2.5396825396825395,
|
||
|
|
"grad_norm": 0.5104651040256136,
|
||
|
|
"learning_rate": 1.3035142502005792e-06,
|
||
|
|
"loss": 0.3821,
|
||
|
|
"mean_token_accuracy": 0.8639437765814364,
|
||
|
|
"num_tokens": 275320028.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4129486083984375,
|
||
|
|
"epoch": 2.5476190476190474,
|
||
|
|
"grad_norm": 0.5922952980019984,
|
||
|
|
"learning_rate": 1.2606468334373e-06,
|
||
|
|
"loss": 0.3774,
|
||
|
|
"mean_token_accuracy": 0.8665279163978994,
|
||
|
|
"num_tokens": 276151262.0,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4136962890625,
|
||
|
|
"epoch": 2.5555555555555554,
|
||
|
|
"grad_norm": 0.5357315155466328,
|
||
|
|
"learning_rate": 1.2184486651128014e-06,
|
||
|
|
"loss": 0.3817,
|
||
|
|
"mean_token_accuracy": 0.8641278254799545,
|
||
|
|
"num_tokens": 276997327.0,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4170989990234375,
|
||
|
|
"epoch": 2.5634920634920633,
|
||
|
|
"grad_norm": 0.5250006813841819,
|
||
|
|
"learning_rate": 1.1769229767086053e-06,
|
||
|
|
"loss": 0.3856,
|
||
|
|
"mean_token_accuracy": 0.8660658891312778,
|
||
|
|
"num_tokens": 277849848.0,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41107177734375,
|
||
|
|
"epoch": 2.571428571428571,
|
||
|
|
"grad_norm": 0.502170658695331,
|
||
|
|
"learning_rate": 1.1360729482085852e-06,
|
||
|
|
"loss": 0.3756,
|
||
|
|
"mean_token_accuracy": 0.8676945436745882,
|
||
|
|
"num_tokens": 278726761.0,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4149017333984375,
|
||
|
|
"epoch": 2.5793650793650795,
|
||
|
|
"grad_norm": 0.5457600582389259,
|
||
|
|
"learning_rate": 1.0959017078554458e-06,
|
||
|
|
"loss": 0.3762,
|
||
|
|
"mean_token_accuracy": 0.8675552252680063,
|
||
|
|
"num_tokens": 279572082.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4125823974609375,
|
||
|
|
"epoch": 2.5873015873015874,
|
||
|
|
"grad_norm": 0.5267025240131151,
|
||
|
|
"learning_rate": 1.0564123319111708e-06,
|
||
|
|
"loss": 0.3798,
|
||
|
|
"mean_token_accuracy": 0.8648343035019934,
|
||
|
|
"num_tokens": 280414468.0,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4168548583984375,
|
||
|
|
"epoch": 2.5952380952380953,
|
||
|
|
"grad_norm": 0.5099791881851006,
|
||
|
|
"learning_rate": 1.017607844421441e-06,
|
||
|
|
"loss": 0.3848,
|
||
|
|
"mean_token_accuracy": 0.864824591204524,
|
||
|
|
"num_tokens": 281237288.0,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41204833984375,
|
||
|
|
"epoch": 2.6031746031746033,
|
||
|
|
"grad_norm": 0.5664377723043557,
|
||
|
|
"learning_rate": 9.794912169840564e-07,
|
||
|
|
"loss": 0.372,
|
||
|
|
"mean_token_accuracy": 0.8691010950133204,
|
||
|
|
"num_tokens": 282118057.0,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4074859619140625,
|
||
|
|
"epoch": 2.611111111111111,
|
||
|
|
"grad_norm": 0.5020933620876052,
|
||
|
|
"learning_rate": 9.420653685213854e-07,
|
||
|
|
"loss": 0.3876,
|
||
|
|
"mean_token_accuracy": 0.8634662297554314,
|
||
|
|
"num_tokens": 283028330.0,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41888427734375,
|
||
|
|
"epoch": 2.619047619047619,
|
||
|
|
"grad_norm": 0.49659765769759895,
|
||
|
|
"learning_rate": 9.053331650568264e-07,
|
||
|
|
"loss": 0.3811,
|
||
|
|
"mean_token_accuracy": 0.8650768841616809,
|
||
|
|
"num_tokens": 283866607.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4181976318359375,
|
||
|
|
"epoch": 2.626984126984127,
|
||
|
|
"grad_norm": 0.5446277616026562,
|
||
|
|
"learning_rate": 8.692974194953263e-07,
|
||
|
|
"loss": 0.3839,
|
||
|
|
"mean_token_accuracy": 0.8648977861739695,
|
||
|
|
"num_tokens": 284705319.0,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4133758544921875,
|
||
|
|
"epoch": 2.634920634920635,
|
||
|
|
"grad_norm": 0.4865910494795602,
|
||
|
|
"learning_rate": 8.339608914079944e-07,
|
||
|
|
"loss": 0.3958,
|
||
|
|
"mean_token_accuracy": 0.8617808111011982,
|
||
|
|
"num_tokens": 285598883.0,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.415557861328125,
|
||
|
|
"epoch": 2.642857142857143,
|
||
|
|
"grad_norm": 0.5244468960355501,
|
||
|
|
"learning_rate": 7.993262868207552e-07,
|
||
|
|
"loss": 0.3853,
|
||
|
|
"mean_token_accuracy": 0.8644285243935883,
|
||
|
|
"num_tokens": 286461446.0,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4182281494140625,
|
||
|
|
"epoch": 2.6507936507936507,
|
||
|
|
"grad_norm": 0.7087612664979458,
|
||
|
|
"learning_rate": 7.653962580071384e-07,
|
||
|
|
"loss": 0.3808,
|
||
|
|
"mean_token_accuracy": 0.8640225417912006,
|
||
|
|
"num_tokens": 287308399.0,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40972900390625,
|
||
|
|
"epoch": 2.6587301587301586,
|
||
|
|
"grad_norm": 0.5050650123459585,
|
||
|
|
"learning_rate": 7.321734032851613e-07,
|
||
|
|
"loss": 0.3838,
|
||
|
|
"mean_token_accuracy": 0.8654659832827747,
|
||
|
|
"num_tokens": 288187832.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4111785888671875,
|
||
|
|
"epoch": 2.6666666666666665,
|
||
|
|
"grad_norm": 0.5138723208589175,
|
||
|
|
"learning_rate": 6.996602668183605e-07,
|
||
|
|
"loss": 0.3807,
|
||
|
|
"mean_token_accuracy": 0.8655836824327707,
|
||
|
|
"num_tokens": 289047051.0,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4153594970703125,
|
||
|
|
"epoch": 2.674603174603175,
|
||
|
|
"grad_norm": 0.5408275070681229,
|
||
|
|
"learning_rate": 6.678593384209597e-07,
|
||
|
|
"loss": 0.3884,
|
||
|
|
"mean_token_accuracy": 0.8643983146175742,
|
||
|
|
"num_tokens": 289920851.0,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4125823974609375,
|
||
|
|
"epoch": 2.682539682539683,
|
||
|
|
"grad_norm": 0.4759412131823873,
|
||
|
|
"learning_rate": 6.367730533672035e-07,
|
||
|
|
"loss": 0.3732,
|
||
|
|
"mean_token_accuracy": 0.8673816910013556,
|
||
|
|
"num_tokens": 290766931.0,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42041015625,
|
||
|
|
"epoch": 2.6904761904761907,
|
||
|
|
"grad_norm": 0.5004081975261553,
|
||
|
|
"learning_rate": 6.064037922048661e-07,
|
||
|
|
"loss": 0.3817,
|
||
|
|
"mean_token_accuracy": 0.8643794315867126,
|
||
|
|
"num_tokens": 291599836.0,
|
||
|
|
"step": 339
|
    },
    {
      "entropy": 0.4112701416015625,
      "epoch": 2.6984126984126986,
      "grad_norm": 0.48982483200118515,
      "learning_rate": 5.767538805729578e-07,
      "loss": 0.3856,
      "mean_token_accuracy": 0.8643845007754862,
      "num_tokens": 292454972.0,
      "step": 340
    },
    {
      "entropy": 0.418304443359375,
      "epoch": 2.7063492063492065,
      "grad_norm": 0.5174894409413827,
      "learning_rate": 5.478255890236184e-07,
      "loss": 0.3799,
      "mean_token_accuracy": 0.8644901039078832,
      "num_tokens": 293307675.0,
      "step": 341
    },
    {
      "entropy": 0.415863037109375,
      "epoch": 2.7142857142857144,
      "grad_norm": 0.49053909813768987,
      "learning_rate": 5.196211328482559e-07,
      "loss": 0.3817,
      "mean_token_accuracy": 0.8651680708862841,
      "num_tokens": 294149752.0,
      "step": 342
    },
    {
      "entropy": 0.419036865234375,
      "epoch": 2.7222222222222223,
      "grad_norm": 0.5048327044966334,
      "learning_rate": 4.921426719078948e-07,
      "loss": 0.3687,
      "mean_token_accuracy": 0.8694347636774182,
      "num_tokens": 294975614.0,
      "step": 343
    },
    {
      "entropy": 0.4116058349609375,
      "epoch": 2.7301587301587302,
      "grad_norm": 0.4966619149777404,
      "learning_rate": 4.653923104677671e-07,
      "loss": 0.3801,
      "mean_token_accuracy": 0.8655543397180736,
      "num_tokens": 295846874.0,
      "step": 344
    },
    {
      "entropy": 0.4119720458984375,
      "epoch": 2.738095238095238,
      "grad_norm": 0.5136264989197665,
      "learning_rate": 4.3937209703619476e-07,
      "loss": 0.3806,
      "mean_token_accuracy": 0.8657941850833595,
      "num_tokens": 296730922.0,
      "step": 345
    },
    {
      "entropy": 0.4112091064453125,
      "epoch": 2.746031746031746,
      "grad_norm": 0.48053531479565836,
      "learning_rate": 4.140840242076927e-07,
      "loss": 0.3741,
      "mean_token_accuracy": 0.8681624769233167,
      "num_tokens": 297610941.0,
      "step": 346
    },
    {
      "entropy": 0.415740966796875,
      "epoch": 2.753968253968254,
      "grad_norm": 0.5414004762026651,
      "learning_rate": 3.895300285103931e-07,
      "loss": 0.3755,
      "mean_token_accuracy": 0.864651458337903,
      "num_tokens": 298464464.0,
      "step": 347
    },
    {
      "entropy": 0.4115142822265625,
      "epoch": 2.761904761904762,
      "grad_norm": 0.5039880854406058,
      "learning_rate": 3.657119902577466e-07,
      "loss": 0.3722,
      "mean_token_accuracy": 0.8687906567938626,
      "num_tokens": 299315956.0,
      "step": 348
    },
    {
      "entropy": 0.416900634765625,
      "epoch": 2.7698412698412698,
      "grad_norm": 0.5361811952601372,
      "learning_rate": 3.426317334045226e-07,
      "loss": 0.3774,
      "mean_token_accuracy": 0.8656895193271339,
      "num_tokens": 300162540.0,
      "step": 349
    },
    {
      "entropy": 0.4165191650390625,
      "epoch": 2.7777777777777777,
      "grad_norm": 0.5147685427448626,
      "learning_rate": 3.202910254071434e-07,
      "loss": 0.3751,
      "mean_token_accuracy": 0.8678149809129536,
      "num_tokens": 301009855.0,
      "step": 350
    },
    {
      "entropy": 0.4177398681640625,
      "epoch": 2.7857142857142856,
      "grad_norm": 0.5293771485579281,
      "learning_rate": 2.9869157708832805e-07,
      "loss": 0.3697,
      "mean_token_accuracy": 0.8683305000886321,
      "num_tokens": 301852351.0,
      "step": 351
    },
    {
      "entropy": 0.412750244140625,
      "epoch": 2.7936507936507935,
      "grad_norm": 0.5809290124899761,
      "learning_rate": 2.778350425060794e-07,
      "loss": 0.3726,
      "mean_token_accuracy": 0.8671337850391865,
      "num_tokens": 302702189.0,
      "step": 352
    },
    {
      "entropy": 0.417388916015625,
      "epoch": 2.8015873015873014,
      "grad_norm": 0.4880249907803546,
      "learning_rate": 2.5772301882702634e-07,
      "loss": 0.3734,
      "mean_token_accuracy": 0.8674810263328254,
      "num_tokens": 303546117.0,
      "step": 353
    },
    {
      "entropy": 0.4113616943359375,
      "epoch": 2.8095238095238093,
      "grad_norm": 0.48636087376036335,
      "learning_rate": 2.3835704620410294e-07,
      "loss": 0.3778,
      "mean_token_accuracy": 0.8667405629530549,
      "num_tokens": 304424136.0,
      "step": 354
    },
    {
      "entropy": 0.4126739501953125,
      "epoch": 2.817460317460317,
      "grad_norm": 0.5409934504868933,
      "learning_rate": 2.1973860765861831e-07,
      "loss": 0.3845,
      "mean_token_accuracy": 0.8655604547820985,
      "num_tokens": 305299176.0,
      "step": 355
    },
    {
      "entropy": 0.4120635986328125,
      "epoch": 2.825396825396825,
      "grad_norm": 0.5115713722265198,
      "learning_rate": 2.0186912896667744e-07,
      "loss": 0.3773,
      "mean_token_accuracy": 0.8656438020989299,
      "num_tokens": 306180918.0,
      "step": 356
    },
    {
      "entropy": 0.4132080078125,
      "epoch": 2.8333333333333335,
      "grad_norm": 0.4843831982656416,
      "learning_rate": 1.8474997855000177e-07,
      "loss": 0.3843,
      "mean_token_accuracy": 0.8643954736180604,
      "num_tokens": 307053855.0,
      "step": 357
    },
    {
      "entropy": 0.416534423828125,
      "epoch": 2.8412698412698414,
      "grad_norm": 0.5147871761826365,
      "learning_rate": 1.6838246737113983e-07,
      "loss": 0.3835,
      "mean_token_accuracy": 0.8651239294558764,
      "num_tokens": 307908233.0,
      "step": 358
    },
    {
      "entropy": 0.4155731201171875,
      "epoch": 2.8492063492063493,
      "grad_norm": 0.530597613594654,
      "learning_rate": 1.5276784883307084e-07,
      "loss": 0.3697,
      "mean_token_accuracy": 0.869139929767698,
      "num_tokens": 308765267.0,
      "step": 359
    },
    {
      "entropy": 0.4214630126953125,
      "epoch": 2.857142857142857,
      "grad_norm": 0.501462039664022,
      "learning_rate": 1.3790731868322472e-07,
      "loss": 0.3763,
      "mean_token_accuracy": 0.8655449384823442,
      "num_tokens": 309586897.0,
      "step": 360
    },
    {
      "entropy": 0.41253662109375,
      "epoch": 2.865079365079365,
      "grad_norm": 0.4685559999749004,
      "learning_rate": 1.238020149219099e-07,
      "loss": 0.3751,
      "mean_token_accuracy": 0.8660591626539826,
      "num_tokens": 310440896.0,
      "step": 361
    },
    {
      "entropy": 0.4139404296875,
      "epoch": 2.873015873015873,
      "grad_norm": 0.4845099487055374,
      "learning_rate": 1.1045301771516748e-07,
      "loss": 0.3744,
      "mean_token_accuracy": 0.8673448082990944,
      "num_tokens": 311297562.0,
      "step": 362
    },
    {
      "entropy": 0.4100494384765625,
      "epoch": 2.880952380952381,
      "grad_norm": 0.48128143239397103,
      "learning_rate": 9.786134931205726e-08,
      "loss": 0.3875,
      "mean_token_accuracy": 0.8648535516113043,
      "num_tokens": 312189513.0,
      "step": 363
    },
    {
      "entropy": 0.4131317138671875,
      "epoch": 2.888888888888889,
      "grad_norm": 0.4783546055097021,
      "learning_rate": 8.602797396636941e-08,
      "loss": 0.3773,
      "mean_token_accuracy": 0.865596916526556,
      "num_tokens": 313055977.0,
      "step": 364
    },
    {
      "entropy": 0.41400146484375,
      "epoch": 2.8968253968253967,
      "grad_norm": 0.5064252562933648,
      "learning_rate": 7.495379786278456e-08,
      "loss": 0.3833,
      "mean_token_accuracy": 0.8648101268336177,
      "num_tokens": 313934488.0,
      "step": 365
    },
    {
      "entropy": 0.4145965576171875,
      "epoch": 2.9047619047619047,
      "grad_norm": 0.5132880237470535,
      "learning_rate": 6.463966904748487e-08,
      "loss": 0.3809,
      "mean_token_accuracy": 0.8653016709722579,
      "num_tokens": 314782888.0,
      "step": 366
    },
    {
      "entropy": 0.41058349609375,
      "epoch": 2.9126984126984126,
      "grad_norm": 0.4885061208644805,
      "learning_rate": 5.508637736320488e-08,
      "loss": 0.3704,
      "mean_token_accuracy": 0.8688876410014927,
      "num_tokens": 315667111.0,
      "step": 367
    },
    {
      "entropy": 0.4156036376953125,
      "epoch": 2.9206349206349205,
      "grad_norm": 0.4926976995984634,
      "learning_rate": 4.62946543887488e-08,
      "loss": 0.38,
      "mean_token_accuracy": 0.864195094909519,
      "num_tokens": 316519645.0,
      "step": 368
    },
    {
      "entropy": 0.4157562255859375,
      "epoch": 2.928571428571429,
      "grad_norm": 0.5012094010197111,
      "learning_rate": 3.826517338296865e-08,
      "loss": 0.3778,
      "mean_token_accuracy": 0.8676678575575352,
      "num_tokens": 317376914.0,
      "step": 369
    },
    {
      "entropy": 0.41656494140625,
      "epoch": 2.9365079365079367,
      "grad_norm": 0.48025666248124643,
      "learning_rate": 3.0998549233205446e-08,
      "loss": 0.3819,
      "mean_token_accuracy": 0.8646529386751354,
      "num_tokens": 318210944.0,
      "step": 370
    },
    {
      "entropy": 0.41094970703125,
      "epoch": 2.9444444444444446,
      "grad_norm": 0.4765329017560117,
      "learning_rate": 2.4495338408201397e-08,
      "loss": 0.3803,
      "mean_token_accuracy": 0.8647728296928108,
      "num_tokens": 319072977.0,
      "step": 371
    },
    {
      "entropy": 0.41326904296875,
      "epoch": 2.9523809523809526,
      "grad_norm": 0.4877083974615076,
      "learning_rate": 1.8756038915486165e-08,
      "loss": 0.3819,
      "mean_token_accuracy": 0.864876258186996,
      "num_tokens": 319936836.0,
      "step": 372
    },
    {
      "entropy": 0.4106903076171875,
      "epoch": 2.9603174603174605,
      "grad_norm": 0.7890691969320142,
      "learning_rate": 1.3781090263242924e-08,
      "loss": 0.3777,
      "mean_token_accuracy": 0.8663178561255336,
      "num_tokens": 320807541.0,
      "step": 373
    },
    {
      "entropy": 0.4145660400390625,
      "epoch": 2.9682539682539684,
      "grad_norm": 0.47432214070949463,
      "learning_rate": 9.570873426649752e-09,
      "loss": 0.3787,
      "mean_token_accuracy": 0.8662264375016093,
      "num_tokens": 321676330.0,
      "step": 374
    },
    {
      "entropy": 0.412261962890625,
      "epoch": 2.9761904761904763,
      "grad_norm": 0.4980955112880234,
      "learning_rate": 6.125710818701836e-09,
      "loss": 0.3743,
      "mean_token_accuracy": 0.8680550749413669,
      "num_tokens": 322526195.0,
      "step": 375
    },
    {
      "entropy": 0.41668701171875,
      "epoch": 2.984126984126984,
      "grad_norm": 0.5212561146099488,
      "learning_rate": 3.445866265526787e-09,
      "loss": 0.3829,
      "mean_token_accuracy": 0.8645712514407933,
      "num_tokens": 323373321.0,
      "step": 376
    },
    {
      "entropy": 0.412017822265625,
      "epoch": 2.992063492063492,
      "grad_norm": 0.46886859359122,
      "learning_rate": 1.531544986177469e-09,
      "loss": 0.3727,
      "mean_token_accuracy": 0.865846767090261,
      "num_tokens": 324241487.0,
      "step": 377
    },
    {
      "entropy": 0.4130401611328125,
      "epoch": 3.0,
      "grad_norm": 0.4965409602777678,
      "learning_rate": 3.8289357691900785e-10,
      "loss": 0.3831,
      "mean_token_accuracy": 0.8650536630302668,
      "num_tokens": 325114310.0,
      "step": 378
    },
    {
      "epoch": 3.0,
      "step": 378,
      "total_flos": 601237770600448.0,
      "train_loss": 0.4870561873786664,
      "train_runtime": 57802.2631,
      "train_samples_per_second": 1.274,
      "train_steps_per_second": 0.007
    }
  ],
  "logging_steps": 1,
  "max_steps": 378,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 32,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 601237770600448.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}