7604 lines
214 KiB
JSON
7604 lines
214 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 756,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.5635986328125,
|
|
"epoch": 0.003968253968253968,
|
|
"grad_norm": 5.862754983476997,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.3929,
|
|
"mean_token_accuracy": 0.6520986258983612,
|
|
"num_tokens": 436822.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 0.571868896484375,
|
|
"epoch": 0.007936507936507936,
|
|
"grad_norm": 5.942989842192001,
|
|
"learning_rate": 2.6315789473684213e-07,
|
|
"loss": 1.3984,
|
|
"mean_token_accuracy": 0.6573778251186013,
|
|
"num_tokens": 849869.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 0.571258544921875,
|
|
"epoch": 0.011904761904761904,
|
|
"grad_norm": 6.016113817261652,
|
|
"learning_rate": 5.263157894736843e-07,
|
|
"loss": 1.4022,
|
|
"mean_token_accuracy": 0.6534338416531682,
|
|
"num_tokens": 1257883.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 0.567626953125,
|
|
"epoch": 0.015873015873015872,
|
|
"grad_norm": 5.755030134936764,
|
|
"learning_rate": 7.894736842105263e-07,
|
|
"loss": 1.3977,
|
|
"mean_token_accuracy": 0.650267880409956,
|
|
"num_tokens": 1710146.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 0.563079833984375,
|
|
"epoch": 0.01984126984126984,
|
|
"grad_norm": 5.759147918749323,
|
|
"learning_rate": 1.0526315789473685e-06,
|
|
"loss": 1.38,
|
|
"mean_token_accuracy": 0.6574590215459466,
|
|
"num_tokens": 2138902.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 0.5838623046875,
|
|
"epoch": 0.023809523809523808,
|
|
"grad_norm": 5.5950056421057885,
|
|
"learning_rate": 1.3157894736842106e-06,
|
|
"loss": 1.3755,
|
|
"mean_token_accuracy": 0.6586260544136167,
|
|
"num_tokens": 2560005.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 0.5576171875,
|
|
"epoch": 0.027777777777777776,
|
|
"grad_norm": 5.621048765741401,
|
|
"learning_rate": 1.5789473684210526e-06,
|
|
"loss": 1.3926,
|
|
"mean_token_accuracy": 0.6518173962831497,
|
|
"num_tokens": 3004121.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 0.572418212890625,
|
|
"epoch": 0.031746031746031744,
|
|
"grad_norm": 5.175193000260237,
|
|
"learning_rate": 1.8421052631578948e-06,
|
|
"loss": 1.3638,
|
|
"mean_token_accuracy": 0.6598256900906563,
|
|
"num_tokens": 3457966.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 0.565643310546875,
|
|
"epoch": 0.03571428571428571,
|
|
"grad_norm": 5.286230249900356,
|
|
"learning_rate": 2.105263157894737e-06,
|
|
"loss": 1.3632,
|
|
"mean_token_accuracy": 0.660512862727046,
|
|
"num_tokens": 3902759.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 0.583984375,
|
|
"epoch": 0.03968253968253968,
|
|
"grad_norm": 4.61299440604949,
|
|
"learning_rate": 2.368421052631579e-06,
|
|
"loss": 1.3222,
|
|
"mean_token_accuracy": 0.6607110062614083,
|
|
"num_tokens": 4321827.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.56304931640625,
|
|
"epoch": 0.04365079365079365,
|
|
"grad_norm": 4.311573869738654,
|
|
"learning_rate": 2.631578947368421e-06,
|
|
"loss": 1.2723,
|
|
"mean_token_accuracy": 0.6778731746599078,
|
|
"num_tokens": 4748195.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 0.566680908203125,
|
|
"epoch": 0.047619047619047616,
|
|
"grad_norm": 4.260816735639912,
|
|
"learning_rate": 2.8947368421052634e-06,
|
|
"loss": 1.2866,
|
|
"mean_token_accuracy": 0.6733056111261249,
|
|
"num_tokens": 5188122.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 0.56494140625,
|
|
"epoch": 0.051587301587301584,
|
|
"grad_norm": 3.50291785633804,
|
|
"learning_rate": 3.157894736842105e-06,
|
|
"loss": 1.1797,
|
|
"mean_token_accuracy": 0.6891454020515084,
|
|
"num_tokens": 5615040.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 0.562591552734375,
|
|
"epoch": 0.05555555555555555,
|
|
"grad_norm": 3.4924438085560223,
|
|
"learning_rate": 3.421052631578948e-06,
|
|
"loss": 1.1726,
|
|
"mean_token_accuracy": 0.6897318931296468,
|
|
"num_tokens": 6042413.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 0.5677490234375,
|
|
"epoch": 0.05952380952380952,
|
|
"grad_norm": 3.1328341857078574,
|
|
"learning_rate": 3.6842105263157896e-06,
|
|
"loss": 1.1382,
|
|
"mean_token_accuracy": 0.6958816023543477,
|
|
"num_tokens": 6468019.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 0.56500244140625,
|
|
"epoch": 0.06349206349206349,
|
|
"grad_norm": 3.1264058914998327,
|
|
"learning_rate": 3.947368421052632e-06,
|
|
"loss": 1.1368,
|
|
"mean_token_accuracy": 0.6940081315115094,
|
|
"num_tokens": 6898441.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 0.537628173828125,
|
|
"epoch": 0.06746031746031746,
|
|
"grad_norm": 3.1935735446807105,
|
|
"learning_rate": 4.210526315789474e-06,
|
|
"loss": 1.042,
|
|
"mean_token_accuracy": 0.7142375819385052,
|
|
"num_tokens": 7333054.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 0.51806640625,
|
|
"epoch": 0.07142857142857142,
|
|
"grad_norm": 3.9117878805623816,
|
|
"learning_rate": 4.473684210526316e-06,
|
|
"loss": 1.0019,
|
|
"mean_token_accuracy": 0.72404795140028,
|
|
"num_tokens": 7794638.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 0.5318603515625,
|
|
"epoch": 0.07539682539682539,
|
|
"grad_norm": 4.074921603012236,
|
|
"learning_rate": 4.736842105263158e-06,
|
|
"loss": 1.0057,
|
|
"mean_token_accuracy": 0.7181824343279004,
|
|
"num_tokens": 8237624.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 0.536865234375,
|
|
"epoch": 0.07936507936507936,
|
|
"grad_norm": 3.555487851257003,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.9821,
|
|
"mean_token_accuracy": 0.723413173109293,
|
|
"num_tokens": 8673402.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.530609130859375,
|
|
"epoch": 0.08333333333333333,
|
|
"grad_norm": 2.955594263842942,
|
|
"learning_rate": 5.263157894736842e-06,
|
|
"loss": 0.9539,
|
|
"mean_token_accuracy": 0.7331287879496813,
|
|
"num_tokens": 9121387.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 0.548980712890625,
|
|
"epoch": 0.0873015873015873,
|
|
"grad_norm": 2.658685672377372,
|
|
"learning_rate": 5.526315789473685e-06,
|
|
"loss": 0.9027,
|
|
"mean_token_accuracy": 0.7447563670575619,
|
|
"num_tokens": 9525436.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 0.548858642578125,
|
|
"epoch": 0.09126984126984126,
|
|
"grad_norm": 2.169406925194856,
|
|
"learning_rate": 5.789473684210527e-06,
|
|
"loss": 0.8918,
|
|
"mean_token_accuracy": 0.7433666130527854,
|
|
"num_tokens": 9932011.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 0.532745361328125,
|
|
"epoch": 0.09523809523809523,
|
|
"grad_norm": 2.77364949088275,
|
|
"learning_rate": 6.0526315789473685e-06,
|
|
"loss": 0.8816,
|
|
"mean_token_accuracy": 0.7459379723295569,
|
|
"num_tokens": 10358777.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 0.535888671875,
|
|
"epoch": 0.0992063492063492,
|
|
"grad_norm": 3.016843229488076,
|
|
"learning_rate": 6.31578947368421e-06,
|
|
"loss": 0.8696,
|
|
"mean_token_accuracy": 0.7488466305658221,
|
|
"num_tokens": 10773051.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 0.526519775390625,
|
|
"epoch": 0.10317460317460317,
|
|
"grad_norm": 2.7257646532581203,
|
|
"learning_rate": 6.578947368421054e-06,
|
|
"loss": 0.8541,
|
|
"mean_token_accuracy": 0.7512839920818806,
|
|
"num_tokens": 11211677.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 0.533477783203125,
|
|
"epoch": 0.10714285714285714,
|
|
"grad_norm": 2.4870825582777543,
|
|
"learning_rate": 6.842105263157896e-06,
|
|
"loss": 0.81,
|
|
"mean_token_accuracy": 0.758179577998817,
|
|
"num_tokens": 11628779.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 0.524932861328125,
|
|
"epoch": 0.1111111111111111,
|
|
"grad_norm": 2.360523836168894,
|
|
"learning_rate": 7.1052631578947375e-06,
|
|
"loss": 0.8368,
|
|
"mean_token_accuracy": 0.756181831471622,
|
|
"num_tokens": 12067363.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 0.513580322265625,
|
|
"epoch": 0.11507936507936507,
|
|
"grad_norm": 2.340522449916669,
|
|
"learning_rate": 7.368421052631579e-06,
|
|
"loss": 0.8284,
|
|
"mean_token_accuracy": 0.7573374779894948,
|
|
"num_tokens": 12507159.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 0.509765625,
|
|
"epoch": 0.11904761904761904,
|
|
"grad_norm": 1.9560553431906509,
|
|
"learning_rate": 7.631578947368423e-06,
|
|
"loss": 0.802,
|
|
"mean_token_accuracy": 0.7627127859741449,
|
|
"num_tokens": 12945458.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.523956298828125,
|
|
"epoch": 0.12301587301587301,
|
|
"grad_norm": 2.040122129092966,
|
|
"learning_rate": 7.894736842105265e-06,
|
|
"loss": 0.7837,
|
|
"mean_token_accuracy": 0.7661685338243842,
|
|
"num_tokens": 13370514.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 0.508392333984375,
|
|
"epoch": 0.12698412698412698,
|
|
"grad_norm": 2.121102369191249,
|
|
"learning_rate": 8.157894736842106e-06,
|
|
"loss": 0.7771,
|
|
"mean_token_accuracy": 0.7685025054961443,
|
|
"num_tokens": 13815066.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 0.518341064453125,
|
|
"epoch": 0.13095238095238096,
|
|
"grad_norm": 1.935388895167446,
|
|
"learning_rate": 8.421052631578948e-06,
|
|
"loss": 0.7698,
|
|
"mean_token_accuracy": 0.7705040192231536,
|
|
"num_tokens": 14240312.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 0.511688232421875,
|
|
"epoch": 0.1349206349206349,
|
|
"grad_norm": 1.69571857189707,
|
|
"learning_rate": 8.68421052631579e-06,
|
|
"loss": 0.7653,
|
|
"mean_token_accuracy": 0.7728015650063753,
|
|
"num_tokens": 14685173.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 0.517608642578125,
|
|
"epoch": 0.1388888888888889,
|
|
"grad_norm": 1.7681452936301536,
|
|
"learning_rate": 8.947368421052632e-06,
|
|
"loss": 0.7449,
|
|
"mean_token_accuracy": 0.7759622316807508,
|
|
"num_tokens": 15099914.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 0.5123291015625,
|
|
"epoch": 0.14285714285714285,
|
|
"grad_norm": 1.6769532274067533,
|
|
"learning_rate": 9.210526315789474e-06,
|
|
"loss": 0.7344,
|
|
"mean_token_accuracy": 0.777042037807405,
|
|
"num_tokens": 15522062.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 0.5107421875,
|
|
"epoch": 0.14682539682539683,
|
|
"grad_norm": 1.5490568448744158,
|
|
"learning_rate": 9.473684210526315e-06,
|
|
"loss": 0.7237,
|
|
"mean_token_accuracy": 0.7827096851542592,
|
|
"num_tokens": 15955138.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 0.5093994140625,
|
|
"epoch": 0.15079365079365079,
|
|
"grad_norm": 1.6589040046358219,
|
|
"learning_rate": 9.736842105263159e-06,
|
|
"loss": 0.703,
|
|
"mean_token_accuracy": 0.7817074777558446,
|
|
"num_tokens": 16388252.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 0.5093994140625,
|
|
"epoch": 0.15476190476190477,
|
|
"grad_norm": 1.7103852217985493,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.7203,
|
|
"mean_token_accuracy": 0.7796014500781894,
|
|
"num_tokens": 16821287.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 0.5087890625,
|
|
"epoch": 0.15873015873015872,
|
|
"grad_norm": 1.8720911640791305,
|
|
"learning_rate": 9.99995213807381e-06,
|
|
"loss": 0.6741,
|
|
"mean_token_accuracy": 0.7901940597221255,
|
|
"num_tokens": 17235205.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.504638671875,
|
|
"epoch": 0.1626984126984127,
|
|
"grad_norm": 1.6006252706063373,
|
|
"learning_rate": 9.99980855321154e-06,
|
|
"loss": 0.6899,
|
|
"mean_token_accuracy": 0.7874280894175172,
|
|
"num_tokens": 17657156.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 0.503753662109375,
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 1.5184573996381632,
|
|
"learning_rate": 9.999569248162095e-06,
|
|
"loss": 0.6887,
|
|
"mean_token_accuracy": 0.7868739385157824,
|
|
"num_tokens": 18090069.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 0.497650146484375,
|
|
"epoch": 0.17063492063492064,
|
|
"grad_norm": 1.6431910052473107,
|
|
"learning_rate": 9.999234227506912e-06,
|
|
"loss": 0.6944,
|
|
"mean_token_accuracy": 0.7861603572964668,
|
|
"num_tokens": 18542578.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 0.50860595703125,
|
|
"epoch": 0.1746031746031746,
|
|
"grad_norm": 1.7897905880615403,
|
|
"learning_rate": 9.998803497659885e-06,
|
|
"loss": 0.669,
|
|
"mean_token_accuracy": 0.7912999261170626,
|
|
"num_tokens": 18955962.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 0.5076904296875,
|
|
"epoch": 0.17857142857142858,
|
|
"grad_norm": 1.5708540426011852,
|
|
"learning_rate": 9.998277066867236e-06,
|
|
"loss": 0.6583,
|
|
"mean_token_accuracy": 0.7945169908925891,
|
|
"num_tokens": 19379353.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 0.507659912109375,
|
|
"epoch": 0.18253968253968253,
|
|
"grad_norm": 1.452264150440713,
|
|
"learning_rate": 9.997654945207368e-06,
|
|
"loss": 0.6506,
|
|
"mean_token_accuracy": 0.7967926179990172,
|
|
"num_tokens": 19812261.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 0.524383544921875,
|
|
"epoch": 0.1865079365079365,
|
|
"grad_norm": 1.63583355617904,
|
|
"learning_rate": 9.99693714459065e-06,
|
|
"loss": 0.6458,
|
|
"mean_token_accuracy": 0.7976251384243369,
|
|
"num_tokens": 20210235.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 0.503173828125,
|
|
"epoch": 0.19047619047619047,
|
|
"grad_norm": 2.0289207035123002,
|
|
"learning_rate": 9.996123678759214e-06,
|
|
"loss": 0.65,
|
|
"mean_token_accuracy": 0.7951374817639589,
|
|
"num_tokens": 20647709.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 0.50360107421875,
|
|
"epoch": 0.19444444444444445,
|
|
"grad_norm": 1.5990576885906742,
|
|
"learning_rate": 9.995214563286677e-06,
|
|
"loss": 0.6434,
|
|
"mean_token_accuracy": 0.7995740966871381,
|
|
"num_tokens": 21065897.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 0.5130615234375,
|
|
"epoch": 0.1984126984126984,
|
|
"grad_norm": 1.9524666748685242,
|
|
"learning_rate": 9.994209815577843e-06,
|
|
"loss": 0.6555,
|
|
"mean_token_accuracy": 0.7948365742340684,
|
|
"num_tokens": 21486371.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.51611328125,
|
|
"epoch": 0.20238095238095238,
|
|
"grad_norm": 1.5386890414245815,
|
|
"learning_rate": 9.993109454868379e-06,
|
|
"loss": 0.6435,
|
|
"mean_token_accuracy": 0.796173213981092,
|
|
"num_tokens": 21909309.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 0.5155029296875,
|
|
"epoch": 0.20634920634920634,
|
|
"grad_norm": 1.532778324611743,
|
|
"learning_rate": 9.991913502224438e-06,
|
|
"loss": 0.6319,
|
|
"mean_token_accuracy": 0.7995416941121221,
|
|
"num_tokens": 22318414.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 0.50860595703125,
|
|
"epoch": 0.21031746031746032,
|
|
"grad_norm": 1.6657431535988216,
|
|
"learning_rate": 9.990621980542258e-06,
|
|
"loss": 0.6093,
|
|
"mean_token_accuracy": 0.8053860478103161,
|
|
"num_tokens": 22719471.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 0.504730224609375,
|
|
"epoch": 0.21428571428571427,
|
|
"grad_norm": 1.641225405902951,
|
|
"learning_rate": 9.989234914547725e-06,
|
|
"loss": 0.6216,
|
|
"mean_token_accuracy": 0.8012946872040629,
|
|
"num_tokens": 23134604.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 0.49383544921875,
|
|
"epoch": 0.21825396825396826,
|
|
"grad_norm": 1.4469976883176578,
|
|
"learning_rate": 9.9877523307959e-06,
|
|
"loss": 0.6264,
|
|
"mean_token_accuracy": 0.8012660220265388,
|
|
"num_tokens": 23571548.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 0.49749755859375,
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 1.6227889097439865,
|
|
"learning_rate": 9.986174257670509e-06,
|
|
"loss": 0.6246,
|
|
"mean_token_accuracy": 0.8050137888640165,
|
|
"num_tokens": 24009770.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 0.49603271484375,
|
|
"epoch": 0.2261904761904762,
|
|
"grad_norm": 1.4045322193755005,
|
|
"learning_rate": 9.984500725383397e-06,
|
|
"loss": 0.6324,
|
|
"mean_token_accuracy": 0.8019688781350851,
|
|
"num_tokens": 24447544.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 0.5068359375,
|
|
"epoch": 0.23015873015873015,
|
|
"grad_norm": 1.3881146015309058,
|
|
"learning_rate": 9.98273176597396e-06,
|
|
"loss": 0.6233,
|
|
"mean_token_accuracy": 0.802922697737813,
|
|
"num_tokens": 24869806.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 0.48602294921875,
|
|
"epoch": 0.23412698412698413,
|
|
"grad_norm": 1.4997301127686509,
|
|
"learning_rate": 9.980867413308516e-06,
|
|
"loss": 0.6298,
|
|
"mean_token_accuracy": 0.8009732821956277,
|
|
"num_tokens": 25337885.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 0.4898681640625,
|
|
"epoch": 0.23809523809523808,
|
|
"grad_norm": 1.4955906883793464,
|
|
"learning_rate": 9.978907703079672e-06,
|
|
"loss": 0.6112,
|
|
"mean_token_accuracy": 0.807507585734129,
|
|
"num_tokens": 25762999.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.495452880859375,
|
|
"epoch": 0.24206349206349206,
|
|
"grad_norm": 1.4495106936140836,
|
|
"learning_rate": 9.976852672805625e-06,
|
|
"loss": 0.6071,
|
|
"mean_token_accuracy": 0.8060804437845945,
|
|
"num_tokens": 26204122.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 0.48187255859375,
|
|
"epoch": 0.24603174603174602,
|
|
"grad_norm": 1.394817796737972,
|
|
"learning_rate": 9.974702361829465e-06,
|
|
"loss": 0.5934,
|
|
"mean_token_accuracy": 0.8098774421960115,
|
|
"num_tokens": 26651412.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 0.4937744140625,
|
|
"epoch": 0.25,
|
|
"grad_norm": 1.566600221250525,
|
|
"learning_rate": 9.972456811318399e-06,
|
|
"loss": 0.6075,
|
|
"mean_token_accuracy": 0.8056732397526503,
|
|
"num_tokens": 27080137.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 0.48150634765625,
|
|
"epoch": 0.25396825396825395,
|
|
"grad_norm": 1.4470780226444868,
|
|
"learning_rate": 9.970116064262975e-06,
|
|
"loss": 0.6025,
|
|
"mean_token_accuracy": 0.8087067836895585,
|
|
"num_tokens": 27520069.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 0.485382080078125,
|
|
"epoch": 0.25793650793650796,
|
|
"grad_norm": 1.5004810878183181,
|
|
"learning_rate": 9.96768016547626e-06,
|
|
"loss": 0.6011,
|
|
"mean_token_accuracy": 0.8066385835409164,
|
|
"num_tokens": 27954154.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 0.49444580078125,
|
|
"epoch": 0.2619047619047619,
|
|
"grad_norm": 1.618557504297728,
|
|
"learning_rate": 9.965149161592973e-06,
|
|
"loss": 0.6054,
|
|
"mean_token_accuracy": 0.8067285194993019,
|
|
"num_tokens": 28367541.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 0.4913330078125,
|
|
"epoch": 0.26587301587301587,
|
|
"grad_norm": 1.4570014991732672,
|
|
"learning_rate": 9.962523101068608e-06,
|
|
"loss": 0.573,
|
|
"mean_token_accuracy": 0.8120877193287015,
|
|
"num_tokens": 28779140.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 0.485076904296875,
|
|
"epoch": 0.2698412698412698,
|
|
"grad_norm": 1.5057997069476419,
|
|
"learning_rate": 9.959802034178489e-06,
|
|
"loss": 0.5966,
|
|
"mean_token_accuracy": 0.8073940826579928,
|
|
"num_tokens": 29217570.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 0.479888916015625,
|
|
"epoch": 0.27380952380952384,
|
|
"grad_norm": 1.4032340805398644,
|
|
"learning_rate": 9.956986013016816e-06,
|
|
"loss": 0.5767,
|
|
"mean_token_accuracy": 0.8149419017136097,
|
|
"num_tokens": 29656943.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 0.484710693359375,
|
|
"epoch": 0.2777777777777778,
|
|
"grad_norm": 1.4496205753720897,
|
|
"learning_rate": 9.954075091495669e-06,
|
|
"loss": 0.6001,
|
|
"mean_token_accuracy": 0.8093660045415163,
|
|
"num_tokens": 30088869.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.476165771484375,
|
|
"epoch": 0.28174603174603174,
|
|
"grad_norm": 1.4082524979942377,
|
|
"learning_rate": 9.951069325343972e-06,
|
|
"loss": 0.6016,
|
|
"mean_token_accuracy": 0.8054882632568479,
|
|
"num_tokens": 30550317.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 0.483856201171875,
|
|
"epoch": 0.2857142857142857,
|
|
"grad_norm": 1.3714127497612545,
|
|
"learning_rate": 9.947968772106428e-06,
|
|
"loss": 0.5748,
|
|
"mean_token_accuracy": 0.8156133992597461,
|
|
"num_tokens": 30959821.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 0.47869873046875,
|
|
"epoch": 0.2896825396825397,
|
|
"grad_norm": 1.6516883841068675,
|
|
"learning_rate": 9.944773491142416e-06,
|
|
"loss": 0.5997,
|
|
"mean_token_accuracy": 0.8074251553043723,
|
|
"num_tokens": 31412639.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 0.487518310546875,
|
|
"epoch": 0.29365079365079366,
|
|
"grad_norm": 1.5133792126136842,
|
|
"learning_rate": 9.94148354362486e-06,
|
|
"loss": 0.592,
|
|
"mean_token_accuracy": 0.8129479885101318,
|
|
"num_tokens": 31830767.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 0.482086181640625,
|
|
"epoch": 0.2976190476190476,
|
|
"grad_norm": 1.62731136956083,
|
|
"learning_rate": 9.938098992539045e-06,
|
|
"loss": 0.5835,
|
|
"mean_token_accuracy": 0.8082789676263928,
|
|
"num_tokens": 32267329.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 0.48516845703125,
|
|
"epoch": 0.30158730158730157,
|
|
"grad_norm": 1.4784416203962691,
|
|
"learning_rate": 9.93461990268143e-06,
|
|
"loss": 0.582,
|
|
"mean_token_accuracy": 0.8147872434929013,
|
|
"num_tokens": 32692726.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 0.48876953125,
|
|
"epoch": 0.3055555555555556,
|
|
"grad_norm": 1.5041413810038196,
|
|
"learning_rate": 9.931046340658387e-06,
|
|
"loss": 0.5617,
|
|
"mean_token_accuracy": 0.8183435359969735,
|
|
"num_tokens": 33108936.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 0.472503662109375,
|
|
"epoch": 0.30952380952380953,
|
|
"grad_norm": 1.6817980341644059,
|
|
"learning_rate": 9.927378374884947e-06,
|
|
"loss": 0.5655,
|
|
"mean_token_accuracy": 0.8146926909685135,
|
|
"num_tokens": 33543076.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 0.474945068359375,
|
|
"epoch": 0.3134920634920635,
|
|
"grad_norm": 1.3241417102653499,
|
|
"learning_rate": 9.923616075583465e-06,
|
|
"loss": 0.5738,
|
|
"mean_token_accuracy": 0.8142029214650393,
|
|
"num_tokens": 33980897.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 0.47528076171875,
|
|
"epoch": 0.31746031746031744,
|
|
"grad_norm": 1.4456909932877973,
|
|
"learning_rate": 9.919759514782304e-06,
|
|
"loss": 0.5725,
|
|
"mean_token_accuracy": 0.8150945641100407,
|
|
"num_tokens": 34404352.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.48504638671875,
|
|
"epoch": 0.32142857142857145,
|
|
"grad_norm": 1.2666611706175113,
|
|
"learning_rate": 9.91580876631443e-06,
|
|
"loss": 0.5728,
|
|
"mean_token_accuracy": 0.8147335788235068,
|
|
"num_tokens": 34815122.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 0.49102783203125,
|
|
"epoch": 0.3253968253968254,
|
|
"grad_norm": 1.4465210833568694,
|
|
"learning_rate": 9.91176390581602e-06,
|
|
"loss": 0.5759,
|
|
"mean_token_accuracy": 0.8133391635492444,
|
|
"num_tokens": 35236933.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 0.483489990234375,
|
|
"epoch": 0.32936507936507936,
|
|
"grad_norm": 1.3544114949996022,
|
|
"learning_rate": 9.907625010724999e-06,
|
|
"loss": 0.5724,
|
|
"mean_token_accuracy": 0.8148928321897984,
|
|
"num_tokens": 35664506.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 0.480560302734375,
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 1.3188860783644643,
|
|
"learning_rate": 9.903392160279564e-06,
|
|
"loss": 0.5666,
|
|
"mean_token_accuracy": 0.8133293204009533,
|
|
"num_tokens": 36088050.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 0.48748779296875,
|
|
"epoch": 0.3373015873015873,
|
|
"grad_norm": 1.4518030416485894,
|
|
"learning_rate": 9.899065435516661e-06,
|
|
"loss": 0.5664,
|
|
"mean_token_accuracy": 0.8148653889074922,
|
|
"num_tokens": 36501235.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 0.469940185546875,
|
|
"epoch": 0.3412698412698413,
|
|
"grad_norm": 1.394119633826569,
|
|
"learning_rate": 9.894644919270448e-06,
|
|
"loss": 0.5722,
|
|
"mean_token_accuracy": 0.814102666452527,
|
|
"num_tokens": 36942407.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 0.47369384765625,
|
|
"epoch": 0.34523809523809523,
|
|
"grad_norm": 1.4881887909837872,
|
|
"learning_rate": 9.890130696170691e-06,
|
|
"loss": 0.5714,
|
|
"mean_token_accuracy": 0.8154451455920935,
|
|
"num_tokens": 37381260.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 0.47998046875,
|
|
"epoch": 0.3492063492063492,
|
|
"grad_norm": 1.3217206972932933,
|
|
"learning_rate": 9.885522852641156e-06,
|
|
"loss": 0.5695,
|
|
"mean_token_accuracy": 0.814792038872838,
|
|
"num_tokens": 37803882.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 0.48052978515625,
|
|
"epoch": 0.3531746031746032,
|
|
"grad_norm": 1.5133757517932098,
|
|
"learning_rate": 9.880821476897948e-06,
|
|
"loss": 0.5628,
|
|
"mean_token_accuracy": 0.8151478515937924,
|
|
"num_tokens": 38227635.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 0.475738525390625,
|
|
"epoch": 0.35714285714285715,
|
|
"grad_norm": 1.5653342692191234,
|
|
"learning_rate": 9.87602665894783e-06,
|
|
"loss": 0.5828,
|
|
"mean_token_accuracy": 0.8125336300581694,
|
|
"num_tokens": 38667329.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.473876953125,
|
|
"epoch": 0.3611111111111111,
|
|
"grad_norm": 1.3382017413079235,
|
|
"learning_rate": 9.871138490586489e-06,
|
|
"loss": 0.57,
|
|
"mean_token_accuracy": 0.8121865503489971,
|
|
"num_tokens": 39107330.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 0.47998046875,
|
|
"epoch": 0.36507936507936506,
|
|
"grad_norm": 1.346784303133718,
|
|
"learning_rate": 9.866157065396784e-06,
|
|
"loss": 0.5503,
|
|
"mean_token_accuracy": 0.8177150310948491,
|
|
"num_tokens": 39524166.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 0.469207763671875,
|
|
"epoch": 0.36904761904761907,
|
|
"grad_norm": 1.4083288521133936,
|
|
"learning_rate": 9.861082478746962e-06,
|
|
"loss": 0.5508,
|
|
"mean_token_accuracy": 0.820819640532136,
|
|
"num_tokens": 39952174.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 0.465789794921875,
|
|
"epoch": 0.373015873015873,
|
|
"grad_norm": 1.4473119436564825,
|
|
"learning_rate": 9.855914827788814e-06,
|
|
"loss": 0.5596,
|
|
"mean_token_accuracy": 0.8184320721775293,
|
|
"num_tokens": 40389693.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 0.46807861328125,
|
|
"epoch": 0.376984126984127,
|
|
"grad_norm": 1.3763793812393954,
|
|
"learning_rate": 9.850654211455837e-06,
|
|
"loss": 0.5548,
|
|
"mean_token_accuracy": 0.8205192228779197,
|
|
"num_tokens": 40815730.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 0.484527587890625,
|
|
"epoch": 0.38095238095238093,
|
|
"grad_norm": 1.5969870094084369,
|
|
"learning_rate": 9.84530073046132e-06,
|
|
"loss": 0.564,
|
|
"mean_token_accuracy": 0.816374409943819,
|
|
"num_tokens": 41231841.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 0.492523193359375,
|
|
"epoch": 0.38492063492063494,
|
|
"grad_norm": 1.379364573057709,
|
|
"learning_rate": 9.83985448729643e-06,
|
|
"loss": 0.572,
|
|
"mean_token_accuracy": 0.8147962624207139,
|
|
"num_tokens": 41650119.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 0.4735107421875,
|
|
"epoch": 0.3888888888888889,
|
|
"grad_norm": 1.4022051638675177,
|
|
"learning_rate": 9.83431558622824e-06,
|
|
"loss": 0.5501,
|
|
"mean_token_accuracy": 0.8185382299125195,
|
|
"num_tokens": 42082897.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 0.47802734375,
|
|
"epoch": 0.39285714285714285,
|
|
"grad_norm": 1.3021150947814153,
|
|
"learning_rate": 9.828684133297738e-06,
|
|
"loss": 0.5475,
|
|
"mean_token_accuracy": 0.82077881321311,
|
|
"num_tokens": 42519361.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 0.47802734375,
|
|
"epoch": 0.3968253968253968,
|
|
"grad_norm": 1.3024753376267064,
|
|
"learning_rate": 9.822960236317804e-06,
|
|
"loss": 0.5436,
|
|
"mean_token_accuracy": 0.8204956650733948,
|
|
"num_tokens": 42941458.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.472930908203125,
|
|
"epoch": 0.4007936507936508,
|
|
"grad_norm": 1.4182047962742048,
|
|
"learning_rate": 9.817144004871127e-06,
|
|
"loss": 0.5442,
|
|
"mean_token_accuracy": 0.8214483223855495,
|
|
"num_tokens": 43370971.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 0.476470947265625,
|
|
"epoch": 0.40476190476190477,
|
|
"grad_norm": 1.3283608806953866,
|
|
"learning_rate": 9.811235550308127e-06,
|
|
"loss": 0.551,
|
|
"mean_token_accuracy": 0.8185345204547048,
|
|
"num_tokens": 43801380.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 0.46978759765625,
|
|
"epoch": 0.4087301587301587,
|
|
"grad_norm": 1.2924764394677166,
|
|
"learning_rate": 9.805234985744804e-06,
|
|
"loss": 0.5605,
|
|
"mean_token_accuracy": 0.8147126482799649,
|
|
"num_tokens": 44245066.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 0.485198974609375,
|
|
"epoch": 0.4126984126984127,
|
|
"grad_norm": 1.3073864707831366,
|
|
"learning_rate": 9.799142426060595e-06,
|
|
"loss": 0.5573,
|
|
"mean_token_accuracy": 0.8181026382371783,
|
|
"num_tokens": 44671335.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 0.498046875,
|
|
"epoch": 0.4166666666666667,
|
|
"grad_norm": 1.4213977867693426,
|
|
"learning_rate": 9.792957987896154e-06,
|
|
"loss": 0.5518,
|
|
"mean_token_accuracy": 0.8183343056589365,
|
|
"num_tokens": 45066930.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 0.47454833984375,
|
|
"epoch": 0.42063492063492064,
|
|
"grad_norm": 1.2495857267379333,
|
|
"learning_rate": 9.786681789651134e-06,
|
|
"loss": 0.5472,
|
|
"mean_token_accuracy": 0.8180114766582847,
|
|
"num_tokens": 45508166.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 0.47021484375,
|
|
"epoch": 0.4246031746031746,
|
|
"grad_norm": 1.238708297199452,
|
|
"learning_rate": 9.780313951481904e-06,
|
|
"loss": 0.5612,
|
|
"mean_token_accuracy": 0.8155703386291862,
|
|
"num_tokens": 45960298.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 0.473785400390625,
|
|
"epoch": 0.42857142857142855,
|
|
"grad_norm": 1.367804985831029,
|
|
"learning_rate": 9.773854595299269e-06,
|
|
"loss": 0.5518,
|
|
"mean_token_accuracy": 0.8167815553024411,
|
|
"num_tokens": 46398974.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 0.462677001953125,
|
|
"epoch": 0.43253968253968256,
|
|
"grad_norm": 1.3222167500569877,
|
|
"learning_rate": 9.767303844766118e-06,
|
|
"loss": 0.5548,
|
|
"mean_token_accuracy": 0.8168724188581109,
|
|
"num_tokens": 46837899.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 0.460693359375,
|
|
"epoch": 0.4365079365079365,
|
|
"grad_norm": 1.3681492766242778,
|
|
"learning_rate": 9.760661825295068e-06,
|
|
"loss": 0.5623,
|
|
"mean_token_accuracy": 0.8150366581976414,
|
|
"num_tokens": 47311746.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.466400146484375,
|
|
"epoch": 0.44047619047619047,
|
|
"grad_norm": 1.344685621979837,
|
|
"learning_rate": 9.753928664046055e-06,
|
|
"loss": 0.5392,
|
|
"mean_token_accuracy": 0.822113991715014,
|
|
"num_tokens": 47744340.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"entropy": 0.4608154296875,
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 1.3313641531925076,
|
|
"learning_rate": 9.747104489923907e-06,
|
|
"loss": 0.5335,
|
|
"mean_token_accuracy": 0.8225171025842428,
|
|
"num_tokens": 48177761.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"entropy": 0.4722900390625,
|
|
"epoch": 0.44841269841269843,
|
|
"grad_norm": 1.5485087292600126,
|
|
"learning_rate": 9.740189433575873e-06,
|
|
"loss": 0.5511,
|
|
"mean_token_accuracy": 0.8177419500425458,
|
|
"num_tokens": 48604700.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"entropy": 0.474884033203125,
|
|
"epoch": 0.4523809523809524,
|
|
"grad_norm": 1.3287727548949633,
|
|
"learning_rate": 9.733183627389117e-06,
|
|
"loss": 0.5349,
|
|
"mean_token_accuracy": 0.8249012846499681,
|
|
"num_tokens": 49026375.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"entropy": 0.461212158203125,
|
|
"epoch": 0.45634920634920634,
|
|
"grad_norm": 1.4235893278514111,
|
|
"learning_rate": 9.726087205488192e-06,
|
|
"loss": 0.5488,
|
|
"mean_token_accuracy": 0.8166424483060837,
|
|
"num_tokens": 49467267.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 0.47381591796875,
|
|
"epoch": 0.4603174603174603,
|
|
"grad_norm": 1.255433079679792,
|
|
"learning_rate": 9.718900303732465e-06,
|
|
"loss": 0.5467,
|
|
"mean_token_accuracy": 0.8177134236320853,
|
|
"num_tokens": 49889163.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"entropy": 0.476165771484375,
|
|
"epoch": 0.4642857142857143,
|
|
"grad_norm": 1.2666755263949114,
|
|
"learning_rate": 9.711623059713522e-06,
|
|
"loss": 0.5284,
|
|
"mean_token_accuracy": 0.82161083817482,
|
|
"num_tokens": 50300460.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"entropy": 0.470458984375,
|
|
"epoch": 0.46825396825396826,
|
|
"grad_norm": 1.7054143182470258,
|
|
"learning_rate": 9.70425561275253e-06,
|
|
"loss": 0.553,
|
|
"mean_token_accuracy": 0.8204147005453706,
|
|
"num_tokens": 50735361.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"entropy": 0.47528076171875,
|
|
"epoch": 0.4722222222222222,
|
|
"grad_norm": 1.2776820782333425,
|
|
"learning_rate": 9.696798103897567e-06,
|
|
"loss": 0.5344,
|
|
"mean_token_accuracy": 0.821893903426826,
|
|
"num_tokens": 51149122.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"entropy": 0.469268798828125,
|
|
"epoch": 0.47619047619047616,
|
|
"grad_norm": 1.1855022647806321,
|
|
"learning_rate": 9.689250675920932e-06,
|
|
"loss": 0.5371,
|
|
"mean_token_accuracy": 0.8207768378779292,
|
|
"num_tokens": 51597577.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.461181640625,
|
|
"epoch": 0.4801587301587302,
|
|
"grad_norm": 1.3061024406164452,
|
|
"learning_rate": 9.6816134733164e-06,
|
|
"loss": 0.5419,
|
|
"mean_token_accuracy": 0.8211635444313288,
|
|
"num_tokens": 52043666.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"entropy": 0.4639892578125,
|
|
"epoch": 0.48412698412698413,
|
|
"grad_norm": 1.278485982326175,
|
|
"learning_rate": 9.67388664229646e-06,
|
|
"loss": 0.5457,
|
|
"mean_token_accuracy": 0.8210693299770355,
|
|
"num_tokens": 52482382.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"entropy": 0.466400146484375,
|
|
"epoch": 0.4880952380952381,
|
|
"grad_norm": 1.3159497560386597,
|
|
"learning_rate": 9.66607033078952e-06,
|
|
"loss": 0.5399,
|
|
"mean_token_accuracy": 0.8193363519385457,
|
|
"num_tokens": 52931115.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"entropy": 0.462371826171875,
|
|
"epoch": 0.49206349206349204,
|
|
"grad_norm": 1.3013445808543571,
|
|
"learning_rate": 9.658164688437073e-06,
|
|
"loss": 0.5431,
|
|
"mean_token_accuracy": 0.8198595689609647,
|
|
"num_tokens": 53370750.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"entropy": 0.470245361328125,
|
|
"epoch": 0.49603174603174605,
|
|
"grad_norm": 1.2502745654553475,
|
|
"learning_rate": 9.65016986659082e-06,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.8216186631470919,
|
|
"num_tokens": 53798951.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 0.460723876953125,
|
|
"epoch": 0.5,
|
|
"grad_norm": 1.4425212147696118,
|
|
"learning_rate": 9.642086018309798e-06,
|
|
"loss": 0.528,
|
|
"mean_token_accuracy": 0.8253877777606249,
|
|
"num_tokens": 54235189.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"entropy": 0.463043212890625,
|
|
"epoch": 0.503968253968254,
|
|
"grad_norm": 1.190227347104015,
|
|
"learning_rate": 9.63391329835742e-06,
|
|
"loss": 0.5215,
|
|
"mean_token_accuracy": 0.825776319950819,
|
|
"num_tokens": 54642925.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"entropy": 0.470428466796875,
|
|
"epoch": 0.5079365079365079,
|
|
"grad_norm": 1.3119200133443487,
|
|
"learning_rate": 9.625651863198538e-06,
|
|
"loss": 0.5361,
|
|
"mean_token_accuracy": 0.8217763127759099,
|
|
"num_tokens": 55066936.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"entropy": 0.475128173828125,
|
|
"epoch": 0.5119047619047619,
|
|
"grad_norm": 1.2559808601225464,
|
|
"learning_rate": 9.617301870996432e-06,
|
|
"loss": 0.5271,
|
|
"mean_token_accuracy": 0.8248334173113108,
|
|
"num_tokens": 55484500.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"entropy": 0.45751953125,
|
|
"epoch": 0.5158730158730159,
|
|
"grad_norm": 1.2089833762472606,
|
|
"learning_rate": 9.608863481609784e-06,
|
|
"loss": 0.5333,
|
|
"mean_token_accuracy": 0.8226035898551345,
|
|
"num_tokens": 55922405.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.4698486328125,
|
|
"epoch": 0.5198412698412699,
|
|
"grad_norm": 1.311622726348439,
|
|
"learning_rate": 9.600336856589622e-06,
|
|
"loss": 0.542,
|
|
"mean_token_accuracy": 0.8179264310747385,
|
|
"num_tokens": 56355834.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"entropy": 0.469024658203125,
|
|
"epoch": 0.5238095238095238,
|
|
"grad_norm": 1.370201408190726,
|
|
"learning_rate": 9.591722159176229e-06,
|
|
"loss": 0.5209,
|
|
"mean_token_accuracy": 0.8256417205557227,
|
|
"num_tokens": 56770275.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"entropy": 0.467926025390625,
|
|
"epoch": 0.5277777777777778,
|
|
"grad_norm": 1.4107765499615386,
|
|
"learning_rate": 9.583019554296004e-06,
|
|
"loss": 0.54,
|
|
"mean_token_accuracy": 0.8201555293053389,
|
|
"num_tokens": 57203160.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"entropy": 0.469207763671875,
|
|
"epoch": 0.5317460317460317,
|
|
"grad_norm": 1.2667343919182794,
|
|
"learning_rate": 9.574229208558322e-06,
|
|
"loss": 0.535,
|
|
"mean_token_accuracy": 0.8202388240024447,
|
|
"num_tokens": 57627870.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"entropy": 0.46697998046875,
|
|
"epoch": 0.5357142857142857,
|
|
"grad_norm": 1.4012228207534334,
|
|
"learning_rate": 9.565351290252339e-06,
|
|
"loss": 0.5335,
|
|
"mean_token_accuracy": 0.8244267264381051,
|
|
"num_tokens": 58059792.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 0.4700927734375,
|
|
"epoch": 0.5396825396825397,
|
|
"grad_norm": 1.2541013251161421,
|
|
"learning_rate": 9.556385969343756e-06,
|
|
"loss": 0.5178,
|
|
"mean_token_accuracy": 0.8261177660897374,
|
|
"num_tokens": 58469884.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"entropy": 0.460784912109375,
|
|
"epoch": 0.5436507936507936,
|
|
"grad_norm": 1.266853697510061,
|
|
"learning_rate": 9.547333417471589e-06,
|
|
"loss": 0.5218,
|
|
"mean_token_accuracy": 0.824421800673008,
|
|
"num_tokens": 58908403.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"entropy": 0.467498779296875,
|
|
"epoch": 0.5476190476190477,
|
|
"grad_norm": 1.649666578399019,
|
|
"learning_rate": 9.538193807944864e-06,
|
|
"loss": 0.5251,
|
|
"mean_token_accuracy": 0.8241150714457035,
|
|
"num_tokens": 59323796.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"entropy": 0.461883544921875,
|
|
"epoch": 0.5515873015873016,
|
|
"grad_norm": 1.2782211754106552,
|
|
"learning_rate": 9.528967315739308e-06,
|
|
"loss": 0.5231,
|
|
"mean_token_accuracy": 0.8241786258295178,
|
|
"num_tokens": 59751885.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"entropy": 0.464080810546875,
|
|
"epoch": 0.5555555555555556,
|
|
"grad_norm": 1.1911969994875058,
|
|
"learning_rate": 9.519654117493996e-06,
|
|
"loss": 0.5093,
|
|
"mean_token_accuracy": 0.8299755034968257,
|
|
"num_tokens": 60183841.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 0.467681884765625,
|
|
"epoch": 0.5595238095238095,
|
|
"grad_norm": 1.21584451360531,
|
|
"learning_rate": 9.510254391507971e-06,
|
|
"loss": 0.5323,
|
|
"mean_token_accuracy": 0.8225418599322438,
|
|
"num_tokens": 60605801.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"entropy": 0.465789794921875,
|
|
"epoch": 0.5634920634920635,
|
|
"grad_norm": 1.1387453790165247,
|
|
"learning_rate": 9.500768317736832e-06,
|
|
"loss": 0.527,
|
|
"mean_token_accuracy": 0.8241681484505534,
|
|
"num_tokens": 61048601.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"entropy": 0.47747802734375,
|
|
"epoch": 0.5674603174603174,
|
|
"grad_norm": 1.1374751119159374,
|
|
"learning_rate": 9.49119607778928e-06,
|
|
"loss": 0.5235,
|
|
"mean_token_accuracy": 0.8259162092581391,
|
|
"num_tokens": 61446873.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"entropy": 0.4652099609375,
|
|
"epoch": 0.5714285714285714,
|
|
"grad_norm": 1.2648801316634823,
|
|
"learning_rate": 9.481537854923654e-06,
|
|
"loss": 0.5352,
|
|
"mean_token_accuracy": 0.8220484433695674,
|
|
"num_tokens": 61887912.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"entropy": 0.47418212890625,
|
|
"epoch": 0.5753968253968254,
|
|
"grad_norm": 1.113220988507023,
|
|
"learning_rate": 9.471793834044416e-06,
|
|
"loss": 0.5236,
|
|
"mean_token_accuracy": 0.8275265209376812,
|
|
"num_tokens": 62316051.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 0.459381103515625,
|
|
"epoch": 0.5793650793650794,
|
|
"grad_norm": 1.1782022702716075,
|
|
"learning_rate": 9.461964201698604e-06,
|
|
"loss": 0.5239,
|
|
"mean_token_accuracy": 0.8253972074016929,
|
|
"num_tokens": 62741342.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"entropy": 0.464813232421875,
|
|
"epoch": 0.5833333333333334,
|
|
"grad_norm": 1.3055908871865158,
|
|
"learning_rate": 9.452049146072278e-06,
|
|
"loss": 0.5217,
|
|
"mean_token_accuracy": 0.8288997933268547,
|
|
"num_tokens": 63164890.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"entropy": 0.4561767578125,
|
|
"epoch": 0.5873015873015873,
|
|
"grad_norm": 1.250402921918011,
|
|
"learning_rate": 9.442048856986899e-06,
|
|
"loss": 0.5244,
|
|
"mean_token_accuracy": 0.825376064516604,
|
|
"num_tokens": 63594617.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"entropy": 0.45916748046875,
|
|
"epoch": 0.5912698412698413,
|
|
"grad_norm": 1.2512378704930547,
|
|
"learning_rate": 9.431963525895709e-06,
|
|
"loss": 0.5332,
|
|
"mean_token_accuracy": 0.8236651951447129,
|
|
"num_tokens": 64050293.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"entropy": 0.45831298828125,
|
|
"epoch": 0.5952380952380952,
|
|
"grad_norm": 1.2800747002600605,
|
|
"learning_rate": 9.421793345880055e-06,
|
|
"loss": 0.508,
|
|
"mean_token_accuracy": 0.8307171342894435,
|
|
"num_tokens": 64467695.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 0.4619140625,
|
|
"epoch": 0.5992063492063492,
|
|
"grad_norm": 1.22106067792139,
|
|
"learning_rate": 9.4115385116457e-06,
|
|
"loss": 0.5273,
|
|
"mean_token_accuracy": 0.8228645129129291,
|
|
"num_tokens": 64908198.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"entropy": 0.465362548828125,
|
|
"epoch": 0.6031746031746031,
|
|
"grad_norm": 1.6011741702601825,
|
|
"learning_rate": 9.401199219519088e-06,
|
|
"loss": 0.5189,
|
|
"mean_token_accuracy": 0.8247488467022777,
|
|
"num_tokens": 65333788.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"entropy": 0.47772216796875,
|
|
"epoch": 0.6071428571428571,
|
|
"grad_norm": 1.289619416717165,
|
|
"learning_rate": 9.390775667443602e-06,
|
|
"loss": 0.5092,
|
|
"mean_token_accuracy": 0.8292458476498723,
|
|
"num_tokens": 65748782.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"entropy": 0.463470458984375,
|
|
"epoch": 0.6111111111111112,
|
|
"grad_norm": 1.3540556513064608,
|
|
"learning_rate": 9.380268054975745e-06,
|
|
"loss": 0.5249,
|
|
"mean_token_accuracy": 0.823799098841846,
|
|
"num_tokens": 66178918.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"entropy": 0.467132568359375,
|
|
"epoch": 0.6150793650793651,
|
|
"grad_norm": 1.441163655667528,
|
|
"learning_rate": 9.36967658328135e-06,
|
|
"loss": 0.5339,
|
|
"mean_token_accuracy": 0.825651915743947,
|
|
"num_tokens": 66603248.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 0.4588623046875,
|
|
"epoch": 0.6190476190476191,
|
|
"grad_norm": 1.2757900624701248,
|
|
"learning_rate": 9.359001455131713e-06,
|
|
"loss": 0.5205,
|
|
"mean_token_accuracy": 0.8264942672103643,
|
|
"num_tokens": 67052342.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"entropy": 0.457855224609375,
|
|
"epoch": 0.623015873015873,
|
|
"grad_norm": 1.3280329459811233,
|
|
"learning_rate": 9.34824287489971e-06,
|
|
"loss": 0.5167,
|
|
"mean_token_accuracy": 0.8265606937929988,
|
|
"num_tokens": 67476890.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"entropy": 0.4544677734375,
|
|
"epoch": 0.626984126984127,
|
|
"grad_norm": 1.4362643018863588,
|
|
"learning_rate": 9.337401048555892e-06,
|
|
"loss": 0.5184,
|
|
"mean_token_accuracy": 0.8287814203649759,
|
|
"num_tokens": 67913391.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"entropy": 0.4598388671875,
|
|
"epoch": 0.6309523809523809,
|
|
"grad_norm": 1.8377059083752896,
|
|
"learning_rate": 9.326476183664535e-06,
|
|
"loss": 0.5086,
|
|
"mean_token_accuracy": 0.8302426496520638,
|
|
"num_tokens": 68339443.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"entropy": 0.457611083984375,
|
|
"epoch": 0.6349206349206349,
|
|
"grad_norm": 1.2472914610462977,
|
|
"learning_rate": 9.315468489379668e-06,
|
|
"loss": 0.5242,
|
|
"mean_token_accuracy": 0.8252703994512558,
|
|
"num_tokens": 68772115.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 0.454376220703125,
|
|
"epoch": 0.6388888888888888,
|
|
"grad_norm": 1.0940363704932208,
|
|
"learning_rate": 9.304378176441076e-06,
|
|
"loss": 0.5094,
|
|
"mean_token_accuracy": 0.8273925203830004,
|
|
"num_tokens": 69198272.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"entropy": 0.456268310546875,
|
|
"epoch": 0.6428571428571429,
|
|
"grad_norm": 1.250494594040658,
|
|
"learning_rate": 9.29320545717025e-06,
|
|
"loss": 0.5044,
|
|
"mean_token_accuracy": 0.8318730751052499,
|
|
"num_tokens": 69611653.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"entropy": 0.4644775390625,
|
|
"epoch": 0.6468253968253969,
|
|
"grad_norm": 1.3758890462061453,
|
|
"learning_rate": 9.281950545466336e-06,
|
|
"loss": 0.5375,
|
|
"mean_token_accuracy": 0.8206725753843784,
|
|
"num_tokens": 70054917.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"entropy": 0.451385498046875,
|
|
"epoch": 0.6507936507936508,
|
|
"grad_norm": 1.2229845865238094,
|
|
"learning_rate": 9.27061365680204e-06,
|
|
"loss": 0.5148,
|
|
"mean_token_accuracy": 0.8290882222354412,
|
|
"num_tokens": 70496952.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"entropy": 0.452728271484375,
|
|
"epoch": 0.6547619047619048,
|
|
"grad_norm": 1.310715081152188,
|
|
"learning_rate": 9.25919500821949e-06,
|
|
"loss": 0.5108,
|
|
"mean_token_accuracy": 0.8279124954715371,
|
|
"num_tokens": 70919899.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 0.45574951171875,
|
|
"epoch": 0.6587301587301587,
|
|
"grad_norm": 1.2675730907362597,
|
|
"learning_rate": 9.247694818326092e-06,
|
|
"loss": 0.5111,
|
|
"mean_token_accuracy": 0.8315063090994954,
|
|
"num_tokens": 71343921.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"entropy": 0.44989013671875,
|
|
"epoch": 0.6626984126984127,
|
|
"grad_norm": 1.3386162279647864,
|
|
"learning_rate": 9.236113307290345e-06,
|
|
"loss": 0.5343,
|
|
"mean_token_accuracy": 0.821853213943541,
|
|
"num_tokens": 71808905.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"entropy": 0.45709228515625,
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 1.2417954424606619,
|
|
"learning_rate": 9.224450696837617e-06,
|
|
"loss": 0.5137,
|
|
"mean_token_accuracy": 0.8275673342868686,
|
|
"num_tokens": 72240608.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"entropy": 0.4530029296875,
|
|
"epoch": 0.6706349206349206,
|
|
"grad_norm": 1.2477554302346368,
|
|
"learning_rate": 9.212707210245908e-06,
|
|
"loss": 0.505,
|
|
"mean_token_accuracy": 0.8292029527947307,
|
|
"num_tokens": 72668688.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"entropy": 0.453826904296875,
|
|
"epoch": 0.6746031746031746,
|
|
"grad_norm": 1.2403145708249377,
|
|
"learning_rate": 9.200883072341573e-06,
|
|
"loss": 0.5194,
|
|
"mean_token_accuracy": 0.8281446853652596,
|
|
"num_tokens": 73118452.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 0.45068359375,
|
|
"epoch": 0.6785714285714286,
|
|
"grad_norm": 1.2242088741534112,
|
|
"learning_rate": 9.188978509495022e-06,
|
|
"loss": 0.5228,
|
|
"mean_token_accuracy": 0.8244192777201533,
|
|
"num_tokens": 73569120.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"entropy": 0.448516845703125,
|
|
"epoch": 0.6825396825396826,
|
|
"grad_norm": 1.4410441359720512,
|
|
"learning_rate": 9.176993749616374e-06,
|
|
"loss": 0.5148,
|
|
"mean_token_accuracy": 0.8254242306575179,
|
|
"num_tokens": 73991069.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"entropy": 0.457122802734375,
|
|
"epoch": 0.6865079365079365,
|
|
"grad_norm": 1.4617287104899703,
|
|
"learning_rate": 9.164929022151106e-06,
|
|
"loss": 0.506,
|
|
"mean_token_accuracy": 0.8297470537945628,
|
|
"num_tokens": 74406271.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"entropy": 0.457122802734375,
|
|
"epoch": 0.6904761904761905,
|
|
"grad_norm": 1.2946096899912363,
|
|
"learning_rate": 9.15278455807566e-06,
|
|
"loss": 0.5163,
|
|
"mean_token_accuracy": 0.8275650115683675,
|
|
"num_tokens": 74839901.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"entropy": 0.451202392578125,
|
|
"epoch": 0.6944444444444444,
|
|
"grad_norm": 1.2168830292282429,
|
|
"learning_rate": 9.140560589893012e-06,
|
|
"loss": 0.5088,
|
|
"mean_token_accuracy": 0.8290477497503161,
|
|
"num_tokens": 75280578.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 0.45111083984375,
|
|
"epoch": 0.6984126984126984,
|
|
"grad_norm": 1.1964525447125613,
|
|
"learning_rate": 9.128257351628224e-06,
|
|
"loss": 0.5346,
|
|
"mean_token_accuracy": 0.8231356684118509,
|
|
"num_tokens": 75749725.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"entropy": 0.456024169921875,
|
|
"epoch": 0.7023809523809523,
|
|
"grad_norm": 1.2104495744651753,
|
|
"learning_rate": 9.115875078823975e-06,
|
|
"loss": 0.5188,
|
|
"mean_token_accuracy": 0.8278255322948098,
|
|
"num_tokens": 76175668.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"entropy": 0.45965576171875,
|
|
"epoch": 0.7063492063492064,
|
|
"grad_norm": 1.1865163712517055,
|
|
"learning_rate": 9.103414008536029e-06,
|
|
"loss": 0.5111,
|
|
"mean_token_accuracy": 0.8277882896363735,
|
|
"num_tokens": 76593690.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"entropy": 0.458587646484375,
|
|
"epoch": 0.7103174603174603,
|
|
"grad_norm": 1.6965519987597353,
|
|
"learning_rate": 9.09087437932872e-06,
|
|
"loss": 0.5015,
|
|
"mean_token_accuracy": 0.8323444193229079,
|
|
"num_tokens": 77009261.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"entropy": 0.454925537109375,
|
|
"epoch": 0.7142857142857143,
|
|
"grad_norm": 1.2650031464495928,
|
|
"learning_rate": 9.07825643127037e-06,
|
|
"loss": 0.5157,
|
|
"mean_token_accuracy": 0.8258270686492324,
|
|
"num_tokens": 77431030.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 0.447906494140625,
|
|
"epoch": 0.7182539682539683,
|
|
"grad_norm": 1.1859012409189014,
|
|
"learning_rate": 9.065560405928699e-06,
|
|
"loss": 0.5023,
|
|
"mean_token_accuracy": 0.8294160980731249,
|
|
"num_tokens": 77852655.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"entropy": 0.45416259765625,
|
|
"epoch": 0.7222222222222222,
|
|
"grad_norm": 1.176919606678633,
|
|
"learning_rate": 9.0527865463662e-06,
|
|
"loss": 0.5162,
|
|
"mean_token_accuracy": 0.8275531772524118,
|
|
"num_tokens": 78278605.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"entropy": 0.4486083984375,
|
|
"epoch": 0.7261904761904762,
|
|
"grad_norm": 1.2918709531705708,
|
|
"learning_rate": 9.039935097135479e-06,
|
|
"loss": 0.5024,
|
|
"mean_token_accuracy": 0.8300044005736709,
|
|
"num_tokens": 78721098.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"entropy": 0.454345703125,
|
|
"epoch": 0.7301587301587301,
|
|
"grad_norm": 1.3064400710795658,
|
|
"learning_rate": 9.027006304274584e-06,
|
|
"loss": 0.5096,
|
|
"mean_token_accuracy": 0.8292623031884432,
|
|
"num_tokens": 79154216.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"entropy": 0.44927978515625,
|
|
"epoch": 0.7341269841269841,
|
|
"grad_norm": 1.2696774197334444,
|
|
"learning_rate": 9.014000415302286e-06,
|
|
"loss": 0.5139,
|
|
"mean_token_accuracy": 0.8276010407134891,
|
|
"num_tokens": 79599332.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 0.45220947265625,
|
|
"epoch": 0.7380952380952381,
|
|
"grad_norm": 1.2548327381579976,
|
|
"learning_rate": 9.000917679213344e-06,
|
|
"loss": 0.5196,
|
|
"mean_token_accuracy": 0.8274355586618185,
|
|
"num_tokens": 80039204.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"entropy": 0.4434814453125,
|
|
"epoch": 0.7420634920634921,
|
|
"grad_norm": 1.180213420756775,
|
|
"learning_rate": 8.987758346473739e-06,
|
|
"loss": 0.503,
|
|
"mean_token_accuracy": 0.8305716142058372,
|
|
"num_tokens": 80472128.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"entropy": 0.449005126953125,
|
|
"epoch": 0.746031746031746,
|
|
"grad_norm": 1.2928756233384209,
|
|
"learning_rate": 8.974522669015872e-06,
|
|
"loss": 0.5174,
|
|
"mean_token_accuracy": 0.8274647342041135,
|
|
"num_tokens": 80910348.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"entropy": 0.448822021484375,
|
|
"epoch": 0.75,
|
|
"grad_norm": 1.153866561909503,
|
|
"learning_rate": 8.961210900233757e-06,
|
|
"loss": 0.5101,
|
|
"mean_token_accuracy": 0.8277234118431807,
|
|
"num_tokens": 81336350.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"entropy": 0.44439697265625,
|
|
"epoch": 0.753968253968254,
|
|
"grad_norm": 1.215655128934687,
|
|
"learning_rate": 8.947823294978147e-06,
|
|
"loss": 0.509,
|
|
"mean_token_accuracy": 0.8286535432562232,
|
|
"num_tokens": 81765325.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 0.461395263671875,
|
|
"epoch": 0.7579365079365079,
|
|
"grad_norm": 1.4210713418222345,
|
|
"learning_rate": 8.934360109551671e-06,
|
|
"loss": 0.5106,
|
|
"mean_token_accuracy": 0.8299150029197335,
|
|
"num_tokens": 82191876.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"entropy": 0.4591064453125,
|
|
"epoch": 0.7619047619047619,
|
|
"grad_norm": 1.319721918446663,
|
|
"learning_rate": 8.920821601703927e-06,
|
|
"loss": 0.4913,
|
|
"mean_token_accuracy": 0.8329328633844852,
|
|
"num_tokens": 82611125.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"entropy": 0.453155517578125,
|
|
"epoch": 0.7658730158730159,
|
|
"grad_norm": 1.3201749647251046,
|
|
"learning_rate": 8.907208030626538e-06,
|
|
"loss": 0.5129,
|
|
"mean_token_accuracy": 0.8259176956489682,
|
|
"num_tokens": 83051815.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"entropy": 0.4512939453125,
|
|
"epoch": 0.7698412698412699,
|
|
"grad_norm": 1.1719138701614786,
|
|
"learning_rate": 8.8935196569482e-06,
|
|
"loss": 0.5079,
|
|
"mean_token_accuracy": 0.8282450577244163,
|
|
"num_tokens": 83488021.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"entropy": 0.456451416015625,
|
|
"epoch": 0.7738095238095238,
|
|
"grad_norm": 1.2391988296172292,
|
|
"learning_rate": 8.879756742729683e-06,
|
|
"loss": 0.5074,
|
|
"mean_token_accuracy": 0.827914453111589,
|
|
"num_tokens": 83902519.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 0.450653076171875,
|
|
"epoch": 0.7777777777777778,
|
|
"grad_norm": 1.2037962698085334,
|
|
"learning_rate": 8.865919551458823e-06,
|
|
"loss": 0.505,
|
|
"mean_token_accuracy": 0.8286258336156607,
|
|
"num_tokens": 84321775.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"entropy": 0.44927978515625,
|
|
"epoch": 0.7817460317460317,
|
|
"grad_norm": 1.1617039305620294,
|
|
"learning_rate": 8.852008348045468e-06,
|
|
"loss": 0.5019,
|
|
"mean_token_accuracy": 0.8323168307542801,
|
|
"num_tokens": 84745911.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"entropy": 0.451751708984375,
|
|
"epoch": 0.7857142857142857,
|
|
"grad_norm": 1.149795910244863,
|
|
"learning_rate": 8.838023398816417e-06,
|
|
"loss": 0.4857,
|
|
"mean_token_accuracy": 0.8362782001495361,
|
|
"num_tokens": 85167087.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"entropy": 0.4635009765625,
|
|
"epoch": 0.7896825396825397,
|
|
"grad_norm": 1.1483411264804027,
|
|
"learning_rate": 8.823964971510313e-06,
|
|
"loss": 0.5075,
|
|
"mean_token_accuracy": 0.8307431424036622,
|
|
"num_tokens": 85588482.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"entropy": 0.444122314453125,
|
|
"epoch": 0.7936507936507936,
|
|
"grad_norm": 1.0935254315768266,
|
|
"learning_rate": 8.809833335272517e-06,
|
|
"loss": 0.5054,
|
|
"mean_token_accuracy": 0.8298458913341165,
|
|
"num_tokens": 86009383.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 0.4493408203125,
|
|
"epoch": 0.7976190476190477,
|
|
"grad_norm": 1.1018546509681295,
|
|
"learning_rate": 8.795628760649965e-06,
|
|
"loss": 0.5106,
|
|
"mean_token_accuracy": 0.8295301357284188,
|
|
"num_tokens": 86449600.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"entropy": 0.450439453125,
|
|
"epoch": 0.8015873015873016,
|
|
"grad_norm": 1.306183682510968,
|
|
"learning_rate": 8.781351519585978e-06,
|
|
"loss": 0.4886,
|
|
"mean_token_accuracy": 0.8344141785055399,
|
|
"num_tokens": 86862628.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"entropy": 0.449676513671875,
|
|
"epoch": 0.8055555555555556,
|
|
"grad_norm": 1.0824265526588595,
|
|
"learning_rate": 8.767001885415055e-06,
|
|
"loss": 0.5054,
|
|
"mean_token_accuracy": 0.8296528598293662,
|
|
"num_tokens": 87295233.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"entropy": 0.449310302734375,
|
|
"epoch": 0.8095238095238095,
|
|
"grad_norm": 1.216483297181918,
|
|
"learning_rate": 8.752580132857652e-06,
|
|
"loss": 0.4987,
|
|
"mean_token_accuracy": 0.8328232821077108,
|
|
"num_tokens": 87713395.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"entropy": 0.4515380859375,
|
|
"epoch": 0.8134920634920635,
|
|
"grad_norm": 1.1371633597502904,
|
|
"learning_rate": 8.73808653801491e-06,
|
|
"loss": 0.5216,
|
|
"mean_token_accuracy": 0.8253697715699673,
|
|
"num_tokens": 88158822.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 0.44964599609375,
|
|
"epoch": 0.8174603174603174,
|
|
"grad_norm": 1.2076012965398912,
|
|
"learning_rate": 8.723521378363378e-06,
|
|
"loss": 0.5049,
|
|
"mean_token_accuracy": 0.8300966452807188,
|
|
"num_tokens": 88602545.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"entropy": 0.45513916015625,
|
|
"epoch": 0.8214285714285714,
|
|
"grad_norm": 1.1637271792413393,
|
|
"learning_rate": 8.70888493274969e-06,
|
|
"loss": 0.4854,
|
|
"mean_token_accuracy": 0.8374869581311941,
|
|
"num_tokens": 89025796.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"entropy": 0.44927978515625,
|
|
"epoch": 0.8253968253968254,
|
|
"grad_norm": 1.1305189795680015,
|
|
"learning_rate": 8.694177481385244e-06,
|
|
"loss": 0.5061,
|
|
"mean_token_accuracy": 0.8304181462153792,
|
|
"num_tokens": 89444255.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"entropy": 0.44769287109375,
|
|
"epoch": 0.8293650793650794,
|
|
"grad_norm": 1.065905888231706,
|
|
"learning_rate": 8.679399305840815e-06,
|
|
"loss": 0.511,
|
|
"mean_token_accuracy": 0.8329211305826902,
|
|
"num_tokens": 89894143.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"entropy": 0.448516845703125,
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 1.194800491826659,
|
|
"learning_rate": 8.664550689041187e-06,
|
|
"loss": 0.4704,
|
|
"mean_token_accuracy": 0.8389384057372808,
|
|
"num_tokens": 90312774.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 0.451995849609375,
|
|
"epoch": 0.8373015873015873,
|
|
"grad_norm": 1.1324678388489409,
|
|
"learning_rate": 8.649631915259716e-06,
|
|
"loss": 0.4959,
|
|
"mean_token_accuracy": 0.832505133934319,
|
|
"num_tokens": 90741787.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"entropy": 0.444610595703125,
|
|
"epoch": 0.8412698412698413,
|
|
"grad_norm": 1.0451373377494304,
|
|
"learning_rate": 8.634643270112903e-06,
|
|
"loss": 0.4874,
|
|
"mean_token_accuracy": 0.8343986244872212,
|
|
"num_tokens": 91177447.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"entropy": 0.448516845703125,
|
|
"epoch": 0.8452380952380952,
|
|
"grad_norm": 1.1350367484478692,
|
|
"learning_rate": 8.61958504055492e-06,
|
|
"loss": 0.4924,
|
|
"mean_token_accuracy": 0.8339378647506237,
|
|
"num_tokens": 91607165.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"entropy": 0.45574951171875,
|
|
"epoch": 0.8492063492063492,
|
|
"grad_norm": 1.1435711522188763,
|
|
"learning_rate": 8.604457514872115e-06,
|
|
"loss": 0.4934,
|
|
"mean_token_accuracy": 0.8312076451256871,
|
|
"num_tokens": 92026164.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"entropy": 0.448028564453125,
|
|
"epoch": 0.8531746031746031,
|
|
"grad_norm": 1.210433236941165,
|
|
"learning_rate": 8.589260982677496e-06,
|
|
"loss": 0.4936,
|
|
"mean_token_accuracy": 0.8334163334220648,
|
|
"num_tokens": 92463989.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 0.4459228515625,
|
|
"epoch": 0.8571428571428571,
|
|
"grad_norm": 1.2030101822851358,
|
|
"learning_rate": 8.573995734905185e-06,
|
|
"loss": 0.4917,
|
|
"mean_token_accuracy": 0.8336746180430055,
|
|
"num_tokens": 92891631.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"entropy": 0.4539794921875,
|
|
"epoch": 0.8611111111111112,
|
|
"grad_norm": 1.0466701342650107,
|
|
"learning_rate": 8.558662063804843e-06,
|
|
"loss": 0.5039,
|
|
"mean_token_accuracy": 0.8325941441580653,
|
|
"num_tokens": 93322969.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"entropy": 0.448883056640625,
|
|
"epoch": 0.8650793650793651,
|
|
"grad_norm": 1.3569379184552983,
|
|
"learning_rate": 8.543260262936087e-06,
|
|
"loss": 0.4942,
|
|
"mean_token_accuracy": 0.8330146428197622,
|
|
"num_tokens": 93760535.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"entropy": 0.445465087890625,
|
|
"epoch": 0.8690476190476191,
|
|
"grad_norm": 1.1285395121488393,
|
|
"learning_rate": 8.527790627162858e-06,
|
|
"loss": 0.485,
|
|
"mean_token_accuracy": 0.835063835605979,
|
|
"num_tokens": 94172398.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"entropy": 0.450775146484375,
|
|
"epoch": 0.873015873015873,
|
|
"grad_norm": 1.2538705581876535,
|
|
"learning_rate": 8.512253452647783e-06,
|
|
"loss": 0.502,
|
|
"mean_token_accuracy": 0.8306903587654233,
|
|
"num_tokens": 94599260.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 0.45660400390625,
|
|
"epoch": 0.876984126984127,
|
|
"grad_norm": 1.1551796563028132,
|
|
"learning_rate": 8.496649036846502e-06,
|
|
"loss": 0.4946,
|
|
"mean_token_accuracy": 0.8319389009848237,
|
|
"num_tokens": 95019433.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"entropy": 0.461669921875,
|
|
"epoch": 0.8809523809523809,
|
|
"grad_norm": 1.2009353491848689,
|
|
"learning_rate": 8.480977678501974e-06,
|
|
"loss": 0.4915,
|
|
"mean_token_accuracy": 0.8330316534265876,
|
|
"num_tokens": 95425799.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"entropy": 0.45465087890625,
|
|
"epoch": 0.8849206349206349,
|
|
"grad_norm": 1.0850199284929676,
|
|
"learning_rate": 8.465239677638755e-06,
|
|
"loss": 0.4919,
|
|
"mean_token_accuracy": 0.8328907387331128,
|
|
"num_tokens": 95822890.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"entropy": 0.45330810546875,
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 1.4803897939108124,
|
|
"learning_rate": 8.449435335557264e-06,
|
|
"loss": 0.5054,
|
|
"mean_token_accuracy": 0.8312137639150023,
|
|
"num_tokens": 96260271.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"entropy": 0.443267822265625,
|
|
"epoch": 0.8928571428571429,
|
|
"grad_norm": 2.1079096762406238,
|
|
"learning_rate": 8.433564954828e-06,
|
|
"loss": 0.4991,
|
|
"mean_token_accuracy": 0.8311476595699787,
|
|
"num_tokens": 96696652.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 0.450286865234375,
|
|
"epoch": 0.8968253968253969,
|
|
"grad_norm": 1.2706829768849834,
|
|
"learning_rate": 8.417628839285757e-06,
|
|
"loss": 0.4981,
|
|
"mean_token_accuracy": 0.8332603024318814,
|
|
"num_tokens": 97135925.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"entropy": 0.45703125,
|
|
"epoch": 0.9007936507936508,
|
|
"grad_norm": 1.8201254601577819,
|
|
"learning_rate": 8.401627294023815e-06,
|
|
"loss": 0.5142,
|
|
"mean_token_accuracy": 0.828549837693572,
|
|
"num_tokens": 97573810.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"entropy": 0.447784423828125,
|
|
"epoch": 0.9047619047619048,
|
|
"grad_norm": 1.1241933043727534,
|
|
"learning_rate": 8.385560625388081e-06,
|
|
"loss": 0.4831,
|
|
"mean_token_accuracy": 0.8362022209912539,
|
|
"num_tokens": 98011108.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"entropy": 0.454071044921875,
|
|
"epoch": 0.9087301587301587,
|
|
"grad_norm": 1.1121125737189776,
|
|
"learning_rate": 8.369429140971239e-06,
|
|
"loss": 0.4811,
|
|
"mean_token_accuracy": 0.8338787518441677,
|
|
"num_tokens": 98441631.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"entropy": 0.457305908203125,
|
|
"epoch": 0.9126984126984127,
|
|
"grad_norm": 1.0458991032894815,
|
|
"learning_rate": 8.353233149606859e-06,
|
|
"loss": 0.4924,
|
|
"mean_token_accuracy": 0.8308598725125194,
|
|
"num_tokens": 98873029.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 0.453765869140625,
|
|
"epoch": 0.9166666666666666,
|
|
"grad_norm": 1.2247678683157683,
|
|
"learning_rate": 8.336972961363472e-06,
|
|
"loss": 0.498,
|
|
"mean_token_accuracy": 0.8302338859066367,
|
|
"num_tokens": 99296106.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"entropy": 0.45703125,
|
|
"epoch": 0.9206349206349206,
|
|
"grad_norm": 1.2989134951116341,
|
|
"learning_rate": 8.320648887538657e-06,
|
|
"loss": 0.4957,
|
|
"mean_token_accuracy": 0.8315738271921873,
|
|
"num_tokens": 99734864.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"entropy": 0.45330810546875,
|
|
"epoch": 0.9246031746031746,
|
|
"grad_norm": 1.080222766722178,
|
|
"learning_rate": 8.304261240653054e-06,
|
|
"loss": 0.507,
|
|
"mean_token_accuracy": 0.8313342472538352,
|
|
"num_tokens": 100174517.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"entropy": 0.460418701171875,
|
|
"epoch": 0.9285714285714286,
|
|
"grad_norm": 1.1572509289153226,
|
|
"learning_rate": 8.287810334444406e-06,
|
|
"loss": 0.4926,
|
|
"mean_token_accuracy": 0.8337559709325433,
|
|
"num_tokens": 100606926.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"entropy": 0.458404541015625,
|
|
"epoch": 0.9325396825396826,
|
|
"grad_norm": 1.1066868483832115,
|
|
"learning_rate": 8.271296483861532e-06,
|
|
"loss": 0.4829,
|
|
"mean_token_accuracy": 0.835618756711483,
|
|
"num_tokens": 101020425.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 0.45831298828125,
|
|
"epoch": 0.9365079365079365,
|
|
"grad_norm": 1.060730775603579,
|
|
"learning_rate": 8.254720005058317e-06,
|
|
"loss": 0.4912,
|
|
"mean_token_accuracy": 0.8332764646038413,
|
|
"num_tokens": 101447599.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"entropy": 0.458953857421875,
|
|
"epoch": 0.9404761904761905,
|
|
"grad_norm": 1.1471857859225785,
|
|
"learning_rate": 8.238081215387639e-06,
|
|
"loss": 0.4843,
|
|
"mean_token_accuracy": 0.8348336489871144,
|
|
"num_tokens": 101851986.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"entropy": 0.449310302734375,
|
|
"epoch": 0.9444444444444444,
|
|
"grad_norm": 1.1375613016443888,
|
|
"learning_rate": 8.221380433395308e-06,
|
|
"loss": 0.4934,
|
|
"mean_token_accuracy": 0.8338221423327923,
|
|
"num_tokens": 102275358.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"entropy": 0.459075927734375,
|
|
"epoch": 0.9484126984126984,
|
|
"grad_norm": 1.0708255333770056,
|
|
"learning_rate": 8.204617978813963e-06,
|
|
"loss": 0.4838,
|
|
"mean_token_accuracy": 0.8348392806947231,
|
|
"num_tokens": 102688415.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"entropy": 0.457061767578125,
|
|
"epoch": 0.9523809523809523,
|
|
"grad_norm": 1.269015813917946,
|
|
"learning_rate": 8.187794172556947e-06,
|
|
"loss": 0.4901,
|
|
"mean_token_accuracy": 0.832873186096549,
|
|
"num_tokens": 103113293.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 0.447052001953125,
|
|
"epoch": 0.9563492063492064,
|
|
"grad_norm": 1.2172067541370395,
|
|
"learning_rate": 8.170909336712171e-06,
|
|
"loss": 0.4934,
|
|
"mean_token_accuracy": 0.8310654619708657,
|
|
"num_tokens": 103566779.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"entropy": 0.442840576171875,
|
|
"epoch": 0.9603174603174603,
|
|
"grad_norm": 1.9614491486328336,
|
|
"learning_rate": 8.153963794535945e-06,
|
|
"loss": 0.4967,
|
|
"mean_token_accuracy": 0.8313373932614923,
|
|
"num_tokens": 104000550.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"entropy": 0.45098876953125,
|
|
"epoch": 0.9642857142857143,
|
|
"grad_norm": 1.2204808359509163,
|
|
"learning_rate": 8.136957870446779e-06,
|
|
"loss": 0.4998,
|
|
"mean_token_accuracy": 0.830800985917449,
|
|
"num_tokens": 104429372.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"entropy": 0.443115234375,
|
|
"epoch": 0.9682539682539683,
|
|
"grad_norm": 1.1287254868438927,
|
|
"learning_rate": 8.119891890019187e-06,
|
|
"loss": 0.486,
|
|
"mean_token_accuracy": 0.8366484735161066,
|
|
"num_tokens": 104859286.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"entropy": 0.45599365234375,
|
|
"epoch": 0.9722222222222222,
|
|
"grad_norm": 1.1632405758479503,
|
|
"learning_rate": 8.102766179977452e-06,
|
|
"loss": 0.4954,
|
|
"mean_token_accuracy": 0.83047538343817,
|
|
"num_tokens": 105281017.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 0.44305419921875,
|
|
"epoch": 0.9761904761904762,
|
|
"grad_norm": 1.0531020537734286,
|
|
"learning_rate": 8.085581068189358e-06,
|
|
"loss": 0.4875,
|
|
"mean_token_accuracy": 0.83509177621454,
|
|
"num_tokens": 105729675.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"entropy": 0.444549560546875,
|
|
"epoch": 0.9801587301587301,
|
|
"grad_norm": 1.136500203665195,
|
|
"learning_rate": 8.068336883659926e-06,
|
|
"loss": 0.4926,
|
|
"mean_token_accuracy": 0.8322630152106285,
|
|
"num_tokens": 106168119.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"entropy": 0.442291259765625,
|
|
"epoch": 0.9841269841269841,
|
|
"grad_norm": 1.0192188396085724,
|
|
"learning_rate": 8.051033956525113e-06,
|
|
"loss": 0.484,
|
|
"mean_token_accuracy": 0.8352002650499344,
|
|
"num_tokens": 106603968.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"entropy": 0.439788818359375,
|
|
"epoch": 0.9880952380952381,
|
|
"grad_norm": 1.1049532946463114,
|
|
"learning_rate": 8.033672618045485e-06,
|
|
"loss": 0.492,
|
|
"mean_token_accuracy": 0.8354252576828003,
|
|
"num_tokens": 107054152.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"entropy": 0.440826416015625,
|
|
"epoch": 0.9920634920634921,
|
|
"grad_norm": 1.0599713800274446,
|
|
"learning_rate": 8.016253200599885e-06,
|
|
"loss": 0.4782,
|
|
"mean_token_accuracy": 0.8366458043456078,
|
|
"num_tokens": 107495007.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.447113037109375,
|
|
"epoch": 0.996031746031746,
|
|
"grad_norm": 1.1844331984330863,
|
|
"learning_rate": 7.998776037679061e-06,
|
|
"loss": 0.4986,
|
|
"mean_token_accuracy": 0.8293369021266699,
|
|
"num_tokens": 107928758.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"entropy": 0.441619873046875,
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.0144603078826888,
|
|
"learning_rate": 7.981241463879284e-06,
|
|
"loss": 0.4922,
|
|
"mean_token_accuracy": 0.8354968074709177,
|
|
"num_tokens": 108364335.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"entropy": 0.46148681640625,
|
|
"epoch": 1.003968253968254,
|
|
"grad_norm": 1.1133036368721527,
|
|
"learning_rate": 7.963649814895945e-06,
|
|
"loss": 0.4675,
|
|
"mean_token_accuracy": 0.8393758479505777,
|
|
"num_tokens": 108775586.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"entropy": 0.452392578125,
|
|
"epoch": 1.007936507936508,
|
|
"grad_norm": 1.0079553576284006,
|
|
"learning_rate": 7.94600142751713e-06,
|
|
"loss": 0.4619,
|
|
"mean_token_accuracy": 0.8416725508868694,
|
|
"num_tokens": 109202665.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"entropy": 0.4403076171875,
|
|
"epoch": 1.0119047619047619,
|
|
"grad_norm": 1.0665471851715955,
|
|
"learning_rate": 7.92829663961716e-06,
|
|
"loss": 0.4616,
|
|
"mean_token_accuracy": 0.843192096799612,
|
|
"num_tokens": 109629975.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 0.440765380859375,
|
|
"epoch": 1.0158730158730158,
|
|
"grad_norm": 1.0527949047806084,
|
|
"learning_rate": 7.910535790150135e-06,
|
|
"loss": 0.4684,
|
|
"mean_token_accuracy": 0.8393022352829576,
|
|
"num_tokens": 110061605.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"entropy": 0.443817138671875,
|
|
"epoch": 1.0198412698412698,
|
|
"grad_norm": 1.037337532935931,
|
|
"learning_rate": 7.892719219143446e-06,
|
|
"loss": 0.458,
|
|
"mean_token_accuracy": 0.842767583206296,
|
|
"num_tokens": 110487591.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"entropy": 0.4439697265625,
|
|
"epoch": 1.0238095238095237,
|
|
"grad_norm": 0.9282961355601993,
|
|
"learning_rate": 7.874847267691254e-06,
|
|
"loss": 0.4674,
|
|
"mean_token_accuracy": 0.8391132960096002,
|
|
"num_tokens": 110924491.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"entropy": 0.44256591796875,
|
|
"epoch": 1.0277777777777777,
|
|
"grad_norm": 1.0812964655312522,
|
|
"learning_rate": 7.856920277947969e-06,
|
|
"loss": 0.4666,
|
|
"mean_token_accuracy": 0.8417868306860328,
|
|
"num_tokens": 111351831.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"entropy": 0.442596435546875,
|
|
"epoch": 1.0317460317460316,
|
|
"grad_norm": 1.0004332183195612,
|
|
"learning_rate": 7.83893859312169e-06,
|
|
"loss": 0.4608,
|
|
"mean_token_accuracy": 0.840317826718092,
|
|
"num_tokens": 111773832.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.441650390625,
|
|
"epoch": 1.0357142857142858,
|
|
"grad_norm": 1.0083023934199706,
|
|
"learning_rate": 7.820902557467648e-06,
|
|
"loss": 0.4546,
|
|
"mean_token_accuracy": 0.8436138844117522,
|
|
"num_tokens": 112210334.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"entropy": 0.440338134765625,
|
|
"epoch": 1.0396825396825398,
|
|
"grad_norm": 1.0205115508469926,
|
|
"learning_rate": 7.80281251628161e-06,
|
|
"loss": 0.4617,
|
|
"mean_token_accuracy": 0.8404037207365036,
|
|
"num_tokens": 112637470.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"entropy": 0.43670654296875,
|
|
"epoch": 1.0436507936507937,
|
|
"grad_norm": 1.1861875486087046,
|
|
"learning_rate": 7.784668815893256e-06,
|
|
"loss": 0.465,
|
|
"mean_token_accuracy": 0.8401956735178828,
|
|
"num_tokens": 113069179.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"entropy": 0.44329833984375,
|
|
"epoch": 1.0476190476190477,
|
|
"grad_norm": 1.0426651868796517,
|
|
"learning_rate": 7.766471803659571e-06,
|
|
"loss": 0.4725,
|
|
"mean_token_accuracy": 0.8395506730303168,
|
|
"num_tokens": 113501590.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"entropy": 0.440948486328125,
|
|
"epoch": 1.0515873015873016,
|
|
"grad_norm": 1.0688154361912685,
|
|
"learning_rate": 7.748221827958174e-06,
|
|
"loss": 0.463,
|
|
"mean_token_accuracy": 0.8411337668076158,
|
|
"num_tokens": 113935337.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 0.44378662109375,
|
|
"epoch": 1.0555555555555556,
|
|
"grad_norm": 0.9973458392577903,
|
|
"learning_rate": 7.729919238180663e-06,
|
|
"loss": 0.4644,
|
|
"mean_token_accuracy": 0.8407721919938922,
|
|
"num_tokens": 114360533.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"entropy": 0.4375,
|
|
"epoch": 1.0595238095238095,
|
|
"grad_norm": 1.0128154565462195,
|
|
"learning_rate": 7.711564384725916e-06,
|
|
"loss": 0.456,
|
|
"mean_token_accuracy": 0.8427523402497172,
|
|
"num_tokens": 114792424.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"entropy": 0.43865966796875,
|
|
"epoch": 1.0634920634920635,
|
|
"grad_norm": 1.1237442568992235,
|
|
"learning_rate": 7.693157618993392e-06,
|
|
"loss": 0.4713,
|
|
"mean_token_accuracy": 0.8381293760612607,
|
|
"num_tokens": 115231953.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"entropy": 0.44390869140625,
|
|
"epoch": 1.0674603174603174,
|
|
"grad_norm": 0.9632813464843945,
|
|
"learning_rate": 7.674699293376397e-06,
|
|
"loss": 0.4606,
|
|
"mean_token_accuracy": 0.8414155915379524,
|
|
"num_tokens": 115664522.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"entropy": 0.439239501953125,
|
|
"epoch": 1.0714285714285714,
|
|
"grad_norm": 1.1143536721017135,
|
|
"learning_rate": 7.656189761255333e-06,
|
|
"loss": 0.4585,
|
|
"mean_token_accuracy": 0.8407229576259851,
|
|
"num_tokens": 116092221.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.4417724609375,
|
|
"epoch": 1.0753968253968254,
|
|
"grad_norm": 1.0175840618853507,
|
|
"learning_rate": 7.63762937699095e-06,
|
|
"loss": 0.4619,
|
|
"mean_token_accuracy": 0.8408547407016158,
|
|
"num_tokens": 116534679.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"entropy": 0.4439697265625,
|
|
"epoch": 1.0793650793650793,
|
|
"grad_norm": 1.0025546600901896,
|
|
"learning_rate": 7.619018495917543e-06,
|
|
"loss": 0.4696,
|
|
"mean_token_accuracy": 0.8394848993048072,
|
|
"num_tokens": 116984739.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"entropy": 0.44073486328125,
|
|
"epoch": 1.0833333333333333,
|
|
"grad_norm": 1.0897542155601712,
|
|
"learning_rate": 7.600357474336157e-06,
|
|
"loss": 0.4662,
|
|
"mean_token_accuracy": 0.8403668319806457,
|
|
"num_tokens": 117413323.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"entropy": 0.4364013671875,
|
|
"epoch": 1.0873015873015872,
|
|
"grad_norm": 1.026521342719511,
|
|
"learning_rate": 7.581646669507768e-06,
|
|
"loss": 0.4631,
|
|
"mean_token_accuracy": 0.8399766776710749,
|
|
"num_tokens": 117852991.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"entropy": 0.4500732421875,
|
|
"epoch": 1.0912698412698412,
|
|
"grad_norm": 1.1089611631121021,
|
|
"learning_rate": 7.56288643964644e-06,
|
|
"loss": 0.4686,
|
|
"mean_token_accuracy": 0.8402743814513087,
|
|
"num_tokens": 118264477.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 0.440032958984375,
|
|
"epoch": 1.0952380952380953,
|
|
"grad_norm": 1.1837449681611911,
|
|
"learning_rate": 7.544077143912467e-06,
|
|
"loss": 0.4596,
|
|
"mean_token_accuracy": 0.8443190716207027,
|
|
"num_tokens": 118696927.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"entropy": 0.43536376953125,
|
|
"epoch": 1.0992063492063493,
|
|
"grad_norm": 1.0567641917315522,
|
|
"learning_rate": 7.525219142405501e-06,
|
|
"loss": 0.4645,
|
|
"mean_token_accuracy": 0.8398779211565852,
|
|
"num_tokens": 119143061.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"entropy": 0.4447021484375,
|
|
"epoch": 1.1031746031746033,
|
|
"grad_norm": 1.0628873288461702,
|
|
"learning_rate": 7.506312796157649e-06,
|
|
"loss": 0.464,
|
|
"mean_token_accuracy": 0.8407185869291425,
|
|
"num_tokens": 119569613.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"entropy": 0.44366455078125,
|
|
"epoch": 1.1071428571428572,
|
|
"grad_norm": 1.3089788931081365,
|
|
"learning_rate": 7.487358467126573e-06,
|
|
"loss": 0.4666,
|
|
"mean_token_accuracy": 0.8411134304478765,
|
|
"num_tokens": 119990044.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"entropy": 0.4305419921875,
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 1.200277045741654,
|
|
"learning_rate": 7.468356518188551e-06,
|
|
"loss": 0.4687,
|
|
"mean_token_accuracy": 0.83890818990767,
|
|
"num_tokens": 120447089.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.435943603515625,
|
|
"epoch": 1.1150793650793651,
|
|
"grad_norm": 1.065088410503753,
|
|
"learning_rate": 7.449307313131533e-06,
|
|
"loss": 0.4481,
|
|
"mean_token_accuracy": 0.846671967767179,
|
|
"num_tokens": 120882118.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"entropy": 0.4400634765625,
|
|
"epoch": 1.119047619047619,
|
|
"grad_norm": 1.0435830370708483,
|
|
"learning_rate": 7.4302112166481814e-06,
|
|
"loss": 0.4653,
|
|
"mean_token_accuracy": 0.8401108030229807,
|
|
"num_tokens": 121314886.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"entropy": 0.444610595703125,
|
|
"epoch": 1.123015873015873,
|
|
"grad_norm": 1.1498875512493505,
|
|
"learning_rate": 7.411068594328876e-06,
|
|
"loss": 0.4506,
|
|
"mean_token_accuracy": 0.8450519479811192,
|
|
"num_tokens": 121731396.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"entropy": 0.441192626953125,
|
|
"epoch": 1.126984126984127,
|
|
"grad_norm": 1.1037530140349723,
|
|
"learning_rate": 7.391879812654727e-06,
|
|
"loss": 0.4573,
|
|
"mean_token_accuracy": 0.8432380286976695,
|
|
"num_tokens": 122167617.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"entropy": 0.436553955078125,
|
|
"epoch": 1.130952380952381,
|
|
"grad_norm": 1.2008296365359707,
|
|
"learning_rate": 7.37264523899056e-06,
|
|
"loss": 0.4564,
|
|
"mean_token_accuracy": 0.8409950910136104,
|
|
"num_tokens": 122593508.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 0.439788818359375,
|
|
"epoch": 1.1349206349206349,
|
|
"grad_norm": 1.1519884106136846,
|
|
"learning_rate": 7.353365241577869e-06,
|
|
"loss": 0.4606,
|
|
"mean_token_accuracy": 0.839851806871593,
|
|
"num_tokens": 123013840.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"entropy": 0.43341064453125,
|
|
"epoch": 1.1388888888888888,
|
|
"grad_norm": 1.0329372716274068,
|
|
"learning_rate": 7.3340401895277816e-06,
|
|
"loss": 0.4498,
|
|
"mean_token_accuracy": 0.8443695362657309,
|
|
"num_tokens": 123444043.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"entropy": 0.436676025390625,
|
|
"epoch": 1.1428571428571428,
|
|
"grad_norm": 1.0218663400951138,
|
|
"learning_rate": 7.314670452813982e-06,
|
|
"loss": 0.4503,
|
|
"mean_token_accuracy": 0.8440707307308912,
|
|
"num_tokens": 123876490.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"entropy": 0.44293212890625,
|
|
"epoch": 1.1468253968253967,
|
|
"grad_norm": 1.0595566545611714,
|
|
"learning_rate": 7.295256402265636e-06,
|
|
"loss": 0.4561,
|
|
"mean_token_accuracy": 0.841067879460752,
|
|
"num_tokens": 124297019.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"entropy": 0.44622802734375,
|
|
"epoch": 1.1507936507936507,
|
|
"grad_norm": 1.1333083345633674,
|
|
"learning_rate": 7.275798409560282e-06,
|
|
"loss": 0.4617,
|
|
"mean_token_accuracy": 0.8422295236960053,
|
|
"num_tokens": 124713314.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.44403076171875,
|
|
"epoch": 1.1547619047619047,
|
|
"grad_norm": 1.1923827872697734,
|
|
"learning_rate": 7.256296847216727e-06,
|
|
"loss": 0.4573,
|
|
"mean_token_accuracy": 0.8406451418995857,
|
|
"num_tokens": 125125061.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"entropy": 0.440155029296875,
|
|
"epoch": 1.1587301587301586,
|
|
"grad_norm": 1.1646433646945646,
|
|
"learning_rate": 7.236752088587905e-06,
|
|
"loss": 0.4735,
|
|
"mean_token_accuracy": 0.8386099971830845,
|
|
"num_tokens": 125564746.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"entropy": 0.435272216796875,
|
|
"epoch": 1.1626984126984128,
|
|
"grad_norm": 1.1116112176497874,
|
|
"learning_rate": 7.217164507853734e-06,
|
|
"loss": 0.4531,
|
|
"mean_token_accuracy": 0.8449215041473508,
|
|
"num_tokens": 125992351.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"entropy": 0.440032958984375,
|
|
"epoch": 1.1666666666666667,
|
|
"grad_norm": 1.0397784652565205,
|
|
"learning_rate": 7.197534480013951e-06,
|
|
"loss": 0.4515,
|
|
"mean_token_accuracy": 0.8436530968174338,
|
|
"num_tokens": 126415997.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"entropy": 0.44482421875,
|
|
"epoch": 1.1706349206349207,
|
|
"grad_norm": 1.129298764751686,
|
|
"learning_rate": 7.177862380880935e-06,
|
|
"loss": 0.4629,
|
|
"mean_token_accuracy": 0.841444781050086,
|
|
"num_tokens": 126851930.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 0.44580078125,
|
|
"epoch": 1.1746031746031746,
|
|
"grad_norm": 1.0985527605182936,
|
|
"learning_rate": 7.158148587072509e-06,
|
|
"loss": 0.467,
|
|
"mean_token_accuracy": 0.8395384335890412,
|
|
"num_tokens": 127285760.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"entropy": 0.455108642578125,
|
|
"epoch": 1.1785714285714286,
|
|
"grad_norm": 1.2001077801428681,
|
|
"learning_rate": 7.138393476004725e-06,
|
|
"loss": 0.4803,
|
|
"mean_token_accuracy": 0.8372842157259583,
|
|
"num_tokens": 127724762.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"entropy": 0.43841552734375,
|
|
"epoch": 1.1825396825396826,
|
|
"grad_norm": 1.054003052207074,
|
|
"learning_rate": 7.118597425884659e-06,
|
|
"loss": 0.4523,
|
|
"mean_token_accuracy": 0.8465767158195376,
|
|
"num_tokens": 128153685.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"entropy": 0.443328857421875,
|
|
"epoch": 1.1865079365079365,
|
|
"grad_norm": 1.0655995217798397,
|
|
"learning_rate": 7.098760815703139e-06,
|
|
"loss": 0.4531,
|
|
"mean_token_accuracy": 0.8448374746367335,
|
|
"num_tokens": 128574985.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"entropy": 0.452362060546875,
|
|
"epoch": 1.1904761904761905,
|
|
"grad_norm": 1.1076879019132861,
|
|
"learning_rate": 7.078884025227519e-06,
|
|
"loss": 0.4515,
|
|
"mean_token_accuracy": 0.8428602814674377,
|
|
"num_tokens": 128990738.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.4468994140625,
|
|
"epoch": 1.1944444444444444,
|
|
"grad_norm": 1.096401426354454,
|
|
"learning_rate": 7.058967434994388e-06,
|
|
"loss": 0.4526,
|
|
"mean_token_accuracy": 0.8467154111713171,
|
|
"num_tokens": 129413253.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"entropy": 0.444061279296875,
|
|
"epoch": 1.1984126984126984,
|
|
"grad_norm": 0.9851920784045842,
|
|
"learning_rate": 7.0390114263022955e-06,
|
|
"loss": 0.474,
|
|
"mean_token_accuracy": 0.8386435657739639,
|
|
"num_tokens": 129848900.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"entropy": 0.44317626953125,
|
|
"epoch": 1.2023809523809523,
|
|
"grad_norm": 1.112135152774716,
|
|
"learning_rate": 7.019016381204448e-06,
|
|
"loss": 0.4553,
|
|
"mean_token_accuracy": 0.8430305812507868,
|
|
"num_tokens": 130278951.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"entropy": 0.444427490234375,
|
|
"epoch": 1.2063492063492063,
|
|
"grad_norm": 1.1661189845303515,
|
|
"learning_rate": 6.998982682501394e-06,
|
|
"loss": 0.4629,
|
|
"mean_token_accuracy": 0.841990914195776,
|
|
"num_tokens": 130724709.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"entropy": 0.445404052734375,
|
|
"epoch": 1.2103174603174602,
|
|
"grad_norm": 0.9959690543341396,
|
|
"learning_rate": 6.978910713733696e-06,
|
|
"loss": 0.4429,
|
|
"mean_token_accuracy": 0.8485971093177795,
|
|
"num_tokens": 131151665.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 0.438751220703125,
|
|
"epoch": 1.2142857142857142,
|
|
"grad_norm": 0.9834937169980936,
|
|
"learning_rate": 6.958800859174591e-06,
|
|
"loss": 0.4491,
|
|
"mean_token_accuracy": 0.845764022320509,
|
|
"num_tokens": 131582811.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"entropy": 0.442840576171875,
|
|
"epoch": 1.2182539682539684,
|
|
"grad_norm": 1.0523226181088532,
|
|
"learning_rate": 6.938653503822628e-06,
|
|
"loss": 0.4574,
|
|
"mean_token_accuracy": 0.8434069091454148,
|
|
"num_tokens": 131998529.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"entropy": 0.4339599609375,
|
|
"epoch": 1.2222222222222223,
|
|
"grad_norm": 1.0371255492047888,
|
|
"learning_rate": 6.9184690333942995e-06,
|
|
"loss": 0.4517,
|
|
"mean_token_accuracy": 0.8438770910724998,
|
|
"num_tokens": 132429743.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"entropy": 0.439239501953125,
|
|
"epoch": 1.2261904761904763,
|
|
"grad_norm": 1.1404078217146265,
|
|
"learning_rate": 6.898247834316662e-06,
|
|
"loss": 0.4576,
|
|
"mean_token_accuracy": 0.8416583137586713,
|
|
"num_tokens": 132864811.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"entropy": 0.437103271484375,
|
|
"epoch": 1.2301587301587302,
|
|
"grad_norm": 1.0196151103714386,
|
|
"learning_rate": 6.877990293719928e-06,
|
|
"loss": 0.4611,
|
|
"mean_token_accuracy": 0.8426391445100307,
|
|
"num_tokens": 133291943.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.4429931640625,
|
|
"epoch": 1.2341269841269842,
|
|
"grad_norm": 1.1597754105733091,
|
|
"learning_rate": 6.857696799430064e-06,
|
|
"loss": 0.4594,
|
|
"mean_token_accuracy": 0.8428373141214252,
|
|
"num_tokens": 133728664.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"entropy": 0.442169189453125,
|
|
"epoch": 1.2380952380952381,
|
|
"grad_norm": 1.0933297455326956,
|
|
"learning_rate": 6.83736773996136e-06,
|
|
"loss": 0.4495,
|
|
"mean_token_accuracy": 0.8465461218729615,
|
|
"num_tokens": 134149814.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"entropy": 0.444610595703125,
|
|
"epoch": 1.242063492063492,
|
|
"grad_norm": 0.9545364491465045,
|
|
"learning_rate": 6.817003504508993e-06,
|
|
"loss": 0.4453,
|
|
"mean_token_accuracy": 0.8452331237494946,
|
|
"num_tokens": 134567037.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"entropy": 0.441436767578125,
|
|
"epoch": 1.246031746031746,
|
|
"grad_norm": 0.9909665760096847,
|
|
"learning_rate": 6.796604482941578e-06,
|
|
"loss": 0.4474,
|
|
"mean_token_accuracy": 0.8466871501877904,
|
|
"num_tokens": 134989406.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"entropy": 0.43414306640625,
|
|
"epoch": 1.25,
|
|
"grad_norm": 1.0159907252981955,
|
|
"learning_rate": 6.7761710657936995e-06,
|
|
"loss": 0.4361,
|
|
"mean_token_accuracy": 0.8494271822273731,
|
|
"num_tokens": 135405949.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 0.436004638671875,
|
|
"epoch": 1.253968253968254,
|
|
"grad_norm": 1.1634799840745833,
|
|
"learning_rate": 6.75570364425844e-06,
|
|
"loss": 0.4552,
|
|
"mean_token_accuracy": 0.8439184688031673,
|
|
"num_tokens": 135832642.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"entropy": 0.43035888671875,
|
|
"epoch": 1.257936507936508,
|
|
"grad_norm": 1.0848830841192156,
|
|
"learning_rate": 6.735202610179886e-06,
|
|
"loss": 0.4588,
|
|
"mean_token_accuracy": 0.8425602596253157,
|
|
"num_tokens": 136281104.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"entropy": 0.4400634765625,
|
|
"epoch": 1.2619047619047619,
|
|
"grad_norm": 1.1024831215933177,
|
|
"learning_rate": 6.714668356045629e-06,
|
|
"loss": 0.4459,
|
|
"mean_token_accuracy": 0.8458384843543172,
|
|
"num_tokens": 136724748.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"entropy": 0.437774658203125,
|
|
"epoch": 1.2658730158730158,
|
|
"grad_norm": 1.14453363380739,
|
|
"learning_rate": 6.694101274979253e-06,
|
|
"loss": 0.4484,
|
|
"mean_token_accuracy": 0.8426429070532322,
|
|
"num_tokens": 137144383.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"entropy": 0.44573974609375,
|
|
"epoch": 1.2698412698412698,
|
|
"grad_norm": 1.1202850192609648,
|
|
"learning_rate": 6.673501760732805e-06,
|
|
"loss": 0.4575,
|
|
"mean_token_accuracy": 0.8433046471327543,
|
|
"num_tokens": 137570382.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.439056396484375,
|
|
"epoch": 1.2738095238095237,
|
|
"grad_norm": 1.1686361321236263,
|
|
"learning_rate": 6.652870207679253e-06,
|
|
"loss": 0.4525,
|
|
"mean_token_accuracy": 0.8428729372099042,
|
|
"num_tokens": 138002323.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"entropy": 0.43701171875,
|
|
"epoch": 1.2777777777777777,
|
|
"grad_norm": 1.1692980704447018,
|
|
"learning_rate": 6.632207010804949e-06,
|
|
"loss": 0.4576,
|
|
"mean_token_accuracy": 0.8453587293624878,
|
|
"num_tokens": 138431194.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"entropy": 0.439239501953125,
|
|
"epoch": 1.2817460317460316,
|
|
"grad_norm": 1.0283968957929952,
|
|
"learning_rate": 6.611512565702053e-06,
|
|
"loss": 0.4494,
|
|
"mean_token_accuracy": 0.8435638211667538,
|
|
"num_tokens": 138863136.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"entropy": 0.43597412109375,
|
|
"epoch": 1.2857142857142856,
|
|
"grad_norm": 1.0723867427352887,
|
|
"learning_rate": 6.590787268560967e-06,
|
|
"loss": 0.4349,
|
|
"mean_token_accuracy": 0.8492929134517908,
|
|
"num_tokens": 139287890.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"entropy": 0.4398193359375,
|
|
"epoch": 1.2896825396825398,
|
|
"grad_norm": 1.0222079112541533,
|
|
"learning_rate": 6.570031516162746e-06,
|
|
"loss": 0.4585,
|
|
"mean_token_accuracy": 0.8433736823499203,
|
|
"num_tokens": 139730663.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 0.435150146484375,
|
|
"epoch": 1.2936507936507937,
|
|
"grad_norm": 0.9275873017340585,
|
|
"learning_rate": 6.549245705871507e-06,
|
|
"loss": 0.4499,
|
|
"mean_token_accuracy": 0.8432614449411631,
|
|
"num_tokens": 140160179.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"entropy": 0.43756103515625,
|
|
"epoch": 1.2976190476190477,
|
|
"grad_norm": 1.174514802084351,
|
|
"learning_rate": 6.528430235626819e-06,
|
|
"loss": 0.4463,
|
|
"mean_token_accuracy": 0.8453215239569545,
|
|
"num_tokens": 140577958.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"entropy": 0.433258056640625,
|
|
"epoch": 1.3015873015873016,
|
|
"grad_norm": 1.091000460313449,
|
|
"learning_rate": 6.5075855039360805e-06,
|
|
"loss": 0.4632,
|
|
"mean_token_accuracy": 0.8417082950472832,
|
|
"num_tokens": 141002875.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"entropy": 0.43377685546875,
|
|
"epoch": 1.3055555555555556,
|
|
"grad_norm": 0.9951305912978812,
|
|
"learning_rate": 6.486711909866895e-06,
|
|
"loss": 0.445,
|
|
"mean_token_accuracy": 0.8452390227466822,
|
|
"num_tokens": 141425392.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"entropy": 0.436004638671875,
|
|
"epoch": 1.3095238095238095,
|
|
"grad_norm": 0.9773602377225085,
|
|
"learning_rate": 6.465809853039431e-06,
|
|
"loss": 0.4429,
|
|
"mean_token_accuracy": 0.8470056857913733,
|
|
"num_tokens": 141858286.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.44110107421875,
|
|
"epoch": 1.3134920634920635,
|
|
"grad_norm": 1.0492801166182826,
|
|
"learning_rate": 6.444879733618766e-06,
|
|
"loss": 0.4432,
|
|
"mean_token_accuracy": 0.8470598505809903,
|
|
"num_tokens": 142279417.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"entropy": 0.439056396484375,
|
|
"epoch": 1.3174603174603174,
|
|
"grad_norm": 0.9459765835539803,
|
|
"learning_rate": 6.423921952307237e-06,
|
|
"loss": 0.4471,
|
|
"mean_token_accuracy": 0.8453462338075042,
|
|
"num_tokens": 142698339.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"entropy": 0.436981201171875,
|
|
"epoch": 1.3214285714285714,
|
|
"grad_norm": 1.075628581502009,
|
|
"learning_rate": 6.4029369103367545e-06,
|
|
"loss": 0.4424,
|
|
"mean_token_accuracy": 0.8465406149625778,
|
|
"num_tokens": 143128013.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"entropy": 0.43994140625,
|
|
"epoch": 1.3253968253968254,
|
|
"grad_norm": 1.0287829199461864,
|
|
"learning_rate": 6.381925009461128e-06,
|
|
"loss": 0.4456,
|
|
"mean_token_accuracy": 0.8456096695736051,
|
|
"num_tokens": 143561112.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"entropy": 0.441192626953125,
|
|
"epoch": 1.3293650793650793,
|
|
"grad_norm": 1.1380572333251808,
|
|
"learning_rate": 6.3608866519483825e-06,
|
|
"loss": 0.4498,
|
|
"mean_token_accuracy": 0.844082260504365,
|
|
"num_tokens": 143970890.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 0.435333251953125,
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 1.102203799703573,
|
|
"learning_rate": 6.339822240573041e-06,
|
|
"loss": 0.4476,
|
|
"mean_token_accuracy": 0.8457837710157037,
|
|
"num_tokens": 144390223.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"entropy": 0.43310546875,
|
|
"epoch": 1.3373015873015874,
|
|
"grad_norm": 1.0745564478696599,
|
|
"learning_rate": 6.3187321786084236e-06,
|
|
"loss": 0.4609,
|
|
"mean_token_accuracy": 0.8417782466858625,
|
|
"num_tokens": 144839957.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"entropy": 0.4366455078125,
|
|
"epoch": 1.3412698412698414,
|
|
"grad_norm": 1.1064436100800052,
|
|
"learning_rate": 6.297616869818926e-06,
|
|
"loss": 0.4627,
|
|
"mean_token_accuracy": 0.8423483874648809,
|
|
"num_tokens": 145276276.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"entropy": 0.43682861328125,
|
|
"epoch": 1.3452380952380953,
|
|
"grad_norm": 1.0627782502876304,
|
|
"learning_rate": 6.276476718452289e-06,
|
|
"loss": 0.4599,
|
|
"mean_token_accuracy": 0.8434413159266114,
|
|
"num_tokens": 145722320.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"entropy": 0.440948486328125,
|
|
"epoch": 1.3492063492063493,
|
|
"grad_norm": 1.0311065316684267,
|
|
"learning_rate": 6.2553121292318595e-06,
|
|
"loss": 0.4445,
|
|
"mean_token_accuracy": 0.8466370198875666,
|
|
"num_tokens": 146148957.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.44580078125,
|
|
"epoch": 1.3531746031746033,
|
|
"grad_norm": 0.9538685008179283,
|
|
"learning_rate": 6.23412350734884e-06,
|
|
"loss": 0.4571,
|
|
"mean_token_accuracy": 0.8417170522734523,
|
|
"num_tokens": 146580318.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"entropy": 0.441864013671875,
|
|
"epoch": 1.3571428571428572,
|
|
"grad_norm": 1.0998343004271525,
|
|
"learning_rate": 6.2129112584545325e-06,
|
|
"loss": 0.4437,
|
|
"mean_token_accuracy": 0.846907963976264,
|
|
"num_tokens": 146999892.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"entropy": 0.441070556640625,
|
|
"epoch": 1.3611111111111112,
|
|
"grad_norm": 1.0173140297071601,
|
|
"learning_rate": 6.191675788652574e-06,
|
|
"loss": 0.4461,
|
|
"mean_token_accuracy": 0.8460167152807117,
|
|
"num_tokens": 147436184.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"entropy": 0.4295654296875,
|
|
"epoch": 1.3650793650793651,
|
|
"grad_norm": 1.0290558215509458,
|
|
"learning_rate": 6.170417504491157e-06,
|
|
"loss": 0.4541,
|
|
"mean_token_accuracy": 0.8437853921204805,
|
|
"num_tokens": 147888947.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"entropy": 0.441253662109375,
|
|
"epoch": 1.369047619047619,
|
|
"grad_norm": 0.9977939913686099,
|
|
"learning_rate": 6.149136812955256e-06,
|
|
"loss": 0.4605,
|
|
"mean_token_accuracy": 0.8413437977433205,
|
|
"num_tokens": 148330624.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 0.44482421875,
|
|
"epoch": 1.373015873015873,
|
|
"grad_norm": 0.9862218483442303,
|
|
"learning_rate": 6.1278341214588255e-06,
|
|
"loss": 0.4608,
|
|
"mean_token_accuracy": 0.84361382573843,
|
|
"num_tokens": 148771994.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"entropy": 0.43817138671875,
|
|
"epoch": 1.376984126984127,
|
|
"grad_norm": 1.0974460607418992,
|
|
"learning_rate": 6.106509837837004e-06,
|
|
"loss": 0.4468,
|
|
"mean_token_accuracy": 0.8459707852452993,
|
|
"num_tokens": 149203608.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"entropy": 0.435211181640625,
|
|
"epoch": 1.380952380952381,
|
|
"grad_norm": 0.9546922816485226,
|
|
"learning_rate": 6.0851643703383066e-06,
|
|
"loss": 0.4456,
|
|
"mean_token_accuracy": 0.8459897711873055,
|
|
"num_tokens": 149626353.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"entropy": 0.43658447265625,
|
|
"epoch": 1.3849206349206349,
|
|
"grad_norm": 1.0823837316088047,
|
|
"learning_rate": 6.063798127616811e-06,
|
|
"loss": 0.4447,
|
|
"mean_token_accuracy": 0.8457578187808394,
|
|
"num_tokens": 150036189.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"entropy": 0.437774658203125,
|
|
"epoch": 1.3888888888888888,
|
|
"grad_norm": 1.1008934320855421,
|
|
"learning_rate": 6.042411518724327e-06,
|
|
"loss": 0.4402,
|
|
"mean_token_accuracy": 0.84851832408458,
|
|
"num_tokens": 150484433.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.441436767578125,
|
|
"epoch": 1.3928571428571428,
|
|
"grad_norm": 1.0679863117222357,
|
|
"learning_rate": 6.021004953102576e-06,
|
|
"loss": 0.4475,
|
|
"mean_token_accuracy": 0.8463964462280273,
|
|
"num_tokens": 150916869.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"entropy": 0.445831298828125,
|
|
"epoch": 1.3968253968253967,
|
|
"grad_norm": 1.0542706083048947,
|
|
"learning_rate": 5.999578840575342e-06,
|
|
"loss": 0.4504,
|
|
"mean_token_accuracy": 0.8455899534747005,
|
|
"num_tokens": 151351171.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"entropy": 0.438507080078125,
|
|
"epoch": 1.4007936507936507,
|
|
"grad_norm": 0.987561703544306,
|
|
"learning_rate": 5.978133591340633e-06,
|
|
"loss": 0.4494,
|
|
"mean_token_accuracy": 0.8452698877081275,
|
|
"num_tokens": 151779921.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"entropy": 0.435516357421875,
|
|
"epoch": 1.4047619047619047,
|
|
"grad_norm": 1.1262078381527667,
|
|
"learning_rate": 5.956669615962821e-06,
|
|
"loss": 0.4602,
|
|
"mean_token_accuracy": 0.8407345684245229,
|
|
"num_tokens": 152198704.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"entropy": 0.43695068359375,
|
|
"epoch": 1.4087301587301586,
|
|
"grad_norm": 1.0842467706193302,
|
|
"learning_rate": 5.935187325364791e-06,
|
|
"loss": 0.4504,
|
|
"mean_token_accuracy": 0.8444310743361712,
|
|
"num_tokens": 152607114.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 0.442291259765625,
|
|
"epoch": 1.4126984126984126,
|
|
"grad_norm": 0.8869868658428021,
|
|
"learning_rate": 5.913687130820064e-06,
|
|
"loss": 0.4441,
|
|
"mean_token_accuracy": 0.846776382997632,
|
|
"num_tokens": 153027562.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"entropy": 0.439239501953125,
|
|
"epoch": 1.4166666666666667,
|
|
"grad_norm": 1.0275339768252305,
|
|
"learning_rate": 5.892169443944929e-06,
|
|
"loss": 0.443,
|
|
"mean_token_accuracy": 0.84731434751302,
|
|
"num_tokens": 153449258.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"entropy": 0.4425048828125,
|
|
"epoch": 1.4206349206349207,
|
|
"grad_norm": 0.9646996873181736,
|
|
"learning_rate": 5.870634676690564e-06,
|
|
"loss": 0.4433,
|
|
"mean_token_accuracy": 0.8452265271916986,
|
|
"num_tokens": 153863890.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"entropy": 0.441680908203125,
|
|
"epoch": 1.4246031746031746,
|
|
"grad_norm": 1.0623013087943407,
|
|
"learning_rate": 5.8490832413351465e-06,
|
|
"loss": 0.4484,
|
|
"mean_token_accuracy": 0.8456388972699642,
|
|
"num_tokens": 154280797.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"entropy": 0.4415283203125,
|
|
"epoch": 1.4285714285714286,
|
|
"grad_norm": 0.9302981043880288,
|
|
"learning_rate": 5.827515550475955e-06,
|
|
"loss": 0.4499,
|
|
"mean_token_accuracy": 0.8448391910642385,
|
|
"num_tokens": 154707913.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.437255859375,
|
|
"epoch": 1.4325396825396826,
|
|
"grad_norm": 0.9416953081304574,
|
|
"learning_rate": 5.805932017021486e-06,
|
|
"loss": 0.4486,
|
|
"mean_token_accuracy": 0.8438430884853005,
|
|
"num_tokens": 155150096.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"entropy": 0.43389892578125,
|
|
"epoch": 1.4365079365079365,
|
|
"grad_norm": 0.9373065849374296,
|
|
"learning_rate": 5.784333054183533e-06,
|
|
"loss": 0.4449,
|
|
"mean_token_accuracy": 0.8454085243865848,
|
|
"num_tokens": 155590050.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"entropy": 0.437469482421875,
|
|
"epoch": 1.4404761904761905,
|
|
"grad_norm": 0.9209854720626441,
|
|
"learning_rate": 5.762719075469277e-06,
|
|
"loss": 0.4465,
|
|
"mean_token_accuracy": 0.846617016941309,
|
|
"num_tokens": 156016093.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"entropy": 0.436737060546875,
|
|
"epoch": 1.4444444444444444,
|
|
"grad_norm": 0.9861130639611431,
|
|
"learning_rate": 5.741090494673386e-06,
|
|
"loss": 0.443,
|
|
"mean_token_accuracy": 0.8471564138308167,
|
|
"num_tokens": 156449879.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"entropy": 0.441497802734375,
|
|
"epoch": 1.4484126984126984,
|
|
"grad_norm": 0.9886455980759782,
|
|
"learning_rate": 5.719447725870071e-06,
|
|
"loss": 0.4337,
|
|
"mean_token_accuracy": 0.849870765581727,
|
|
"num_tokens": 156866761.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 0.4375,
|
|
"epoch": 1.4523809523809523,
|
|
"grad_norm": 0.9241839444207883,
|
|
"learning_rate": 5.697791183405174e-06,
|
|
"loss": 0.4333,
|
|
"mean_token_accuracy": 0.8499069400131702,
|
|
"num_tokens": 157304143.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"entropy": 0.43499755859375,
|
|
"epoch": 1.4563492063492063,
|
|
"grad_norm": 0.9452367705103182,
|
|
"learning_rate": 5.67612128188823e-06,
|
|
"loss": 0.4617,
|
|
"mean_token_accuracy": 0.8407938601449132,
|
|
"num_tokens": 157758390.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"entropy": 0.440948486328125,
|
|
"epoch": 1.4603174603174602,
|
|
"grad_norm": 1.0583903459607955,
|
|
"learning_rate": 5.654438436184531e-06,
|
|
"loss": 0.4393,
|
|
"mean_token_accuracy": 0.845472626388073,
|
|
"num_tokens": 158177988.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"entropy": 0.427886962890625,
|
|
"epoch": 1.4642857142857144,
|
|
"grad_norm": 1.025858650536215,
|
|
"learning_rate": 5.6327430614071794e-06,
|
|
"loss": 0.4551,
|
|
"mean_token_accuracy": 0.843145564198494,
|
|
"num_tokens": 158634349.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"entropy": 0.43743896484375,
|
|
"epoch": 1.4682539682539684,
|
|
"grad_norm": 0.9049075091671224,
|
|
"learning_rate": 5.611035572909147e-06,
|
|
"loss": 0.4462,
|
|
"mean_token_accuracy": 0.8464264376088977,
|
|
"num_tokens": 159060066.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.440399169921875,
|
|
"epoch": 1.4722222222222223,
|
|
"grad_norm": 1.1144452604957675,
|
|
"learning_rate": 5.589316386275318e-06,
|
|
"loss": 0.4474,
|
|
"mean_token_accuracy": 0.8443919736891985,
|
|
"num_tokens": 159490031.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"entropy": 0.431396484375,
|
|
"epoch": 1.4761904761904763,
|
|
"grad_norm": 1.058425365791423,
|
|
"learning_rate": 5.567585917314535e-06,
|
|
"loss": 0.4494,
|
|
"mean_token_accuracy": 0.8443618472665548,
|
|
"num_tokens": 159942986.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"entropy": 0.44146728515625,
|
|
"epoch": 1.4801587301587302,
|
|
"grad_norm": 1.0282647508500027,
|
|
"learning_rate": 5.545844582051641e-06,
|
|
"loss": 0.4265,
|
|
"mean_token_accuracy": 0.8528409609571099,
|
|
"num_tokens": 160355322.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"entropy": 0.43463134765625,
|
|
"epoch": 1.4841269841269842,
|
|
"grad_norm": 1.019933540593848,
|
|
"learning_rate": 5.524092796719507e-06,
|
|
"loss": 0.4521,
|
|
"mean_token_accuracy": 0.8433405430987477,
|
|
"num_tokens": 160782816.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"entropy": 0.435699462890625,
|
|
"epoch": 1.4880952380952381,
|
|
"grad_norm": 0.9737519159679464,
|
|
"learning_rate": 5.502330977751072e-06,
|
|
"loss": 0.4462,
|
|
"mean_token_accuracy": 0.8467091489583254,
|
|
"num_tokens": 161216771.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 0.4390869140625,
|
|
"epoch": 1.492063492063492,
|
|
"grad_norm": 1.0788036881829521,
|
|
"learning_rate": 5.4805595417713634e-06,
|
|
"loss": 0.4353,
|
|
"mean_token_accuracy": 0.8512382041662931,
|
|
"num_tokens": 161643512.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"entropy": 0.43292236328125,
|
|
"epoch": 1.496031746031746,
|
|
"grad_norm": 1.1542521219056112,
|
|
"learning_rate": 5.458778905589528e-06,
|
|
"loss": 0.4366,
|
|
"mean_token_accuracy": 0.8494954742491245,
|
|
"num_tokens": 162077647.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"entropy": 0.4354248046875,
|
|
"epoch": 1.5,
|
|
"grad_norm": 1.072458349595571,
|
|
"learning_rate": 5.436989486190846e-06,
|
|
"loss": 0.4335,
|
|
"mean_token_accuracy": 0.8492478728294373,
|
|
"num_tokens": 162503001.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"entropy": 0.43646240234375,
|
|
"epoch": 1.503968253968254,
|
|
"grad_norm": 0.9350737480343512,
|
|
"learning_rate": 5.415191700728749e-06,
|
|
"loss": 0.4548,
|
|
"mean_token_accuracy": 0.8452032124623656,
|
|
"num_tokens": 162949686.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"entropy": 0.430450439453125,
|
|
"epoch": 1.507936507936508,
|
|
"grad_norm": 0.9306004319608026,
|
|
"learning_rate": 5.393385966516838e-06,
|
|
"loss": 0.4397,
|
|
"mean_token_accuracy": 0.8475316297262907,
|
|
"num_tokens": 163388010.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.4310302734375,
|
|
"epoch": 1.5119047619047619,
|
|
"grad_norm": 0.985227817450923,
|
|
"learning_rate": 5.371572701020891e-06,
|
|
"loss": 0.4341,
|
|
"mean_token_accuracy": 0.8477435661479831,
|
|
"num_tokens": 163816567.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"entropy": 0.4312744140625,
|
|
"epoch": 1.5158730158730158,
|
|
"grad_norm": 0.9677192291245226,
|
|
"learning_rate": 5.349752321850866e-06,
|
|
"loss": 0.448,
|
|
"mean_token_accuracy": 0.8447540532797575,
|
|
"num_tokens": 164270399.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"entropy": 0.429290771484375,
|
|
"epoch": 1.5198412698412698,
|
|
"grad_norm": 0.9069780240418857,
|
|
"learning_rate": 5.327925246752917e-06,
|
|
"loss": 0.4293,
|
|
"mean_token_accuracy": 0.8511379426345229,
|
|
"num_tokens": 164712402.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"entropy": 0.428375244140625,
|
|
"epoch": 1.5238095238095237,
|
|
"grad_norm": 0.9600603128750645,
|
|
"learning_rate": 5.306091893601384e-06,
|
|
"loss": 0.4487,
|
|
"mean_token_accuracy": 0.845952364616096,
|
|
"num_tokens": 165155523.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"entropy": 0.4332275390625,
|
|
"epoch": 1.5277777777777777,
|
|
"grad_norm": 0.9724692951976357,
|
|
"learning_rate": 5.284252680390803e-06,
|
|
"loss": 0.4269,
|
|
"mean_token_accuracy": 0.8531857188791037,
|
|
"num_tokens": 165575993.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 0.43194580078125,
|
|
"epoch": 1.5317460317460316,
|
|
"grad_norm": 0.9712295118336367,
|
|
"learning_rate": 5.2624080252279006e-06,
|
|
"loss": 0.4471,
|
|
"mean_token_accuracy": 0.845876133069396,
|
|
"num_tokens": 166004219.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"entropy": 0.4315185546875,
|
|
"epoch": 1.5357142857142856,
|
|
"grad_norm": 0.9443490875832254,
|
|
"learning_rate": 5.240558346323582e-06,
|
|
"loss": 0.437,
|
|
"mean_token_accuracy": 0.8483765926212072,
|
|
"num_tokens": 166459333.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"entropy": 0.434234619140625,
|
|
"epoch": 1.5396825396825395,
|
|
"grad_norm": 0.948734807560996,
|
|
"learning_rate": 5.218704061984938e-06,
|
|
"loss": 0.4387,
|
|
"mean_token_accuracy": 0.8489022571593523,
|
|
"num_tokens": 166887486.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"entropy": 0.433074951171875,
|
|
"epoch": 1.5436507936507935,
|
|
"grad_norm": 0.9920709984656828,
|
|
"learning_rate": 5.196845590607225e-06,
|
|
"loss": 0.444,
|
|
"mean_token_accuracy": 0.8482109969481826,
|
|
"num_tokens": 167305651.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"entropy": 0.427581787109375,
|
|
"epoch": 1.5476190476190477,
|
|
"grad_norm": 1.007023739541341,
|
|
"learning_rate": 5.174983350665861e-06,
|
|
"loss": 0.4355,
|
|
"mean_token_accuracy": 0.8507700897753239,
|
|
"num_tokens": 167743608.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.435516357421875,
|
|
"epoch": 1.5515873015873016,
|
|
"grad_norm": 0.9396600741753053,
|
|
"learning_rate": 5.153117760708411e-06,
|
|
"loss": 0.4387,
|
|
"mean_token_accuracy": 0.8479267274960876,
|
|
"num_tokens": 168189361.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"entropy": 0.440887451171875,
|
|
"epoch": 1.5555555555555556,
|
|
"grad_norm": 0.9532447871050252,
|
|
"learning_rate": 5.131249239346574e-06,
|
|
"loss": 0.4364,
|
|
"mean_token_accuracy": 0.8505636844784021,
|
|
"num_tokens": 168602032.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"entropy": 0.436492919921875,
|
|
"epoch": 1.5595238095238095,
|
|
"grad_norm": 0.9020737415284749,
|
|
"learning_rate": 5.109378205248177e-06,
|
|
"loss": 0.4426,
|
|
"mean_token_accuracy": 0.8446815246716142,
|
|
"num_tokens": 169036397.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"entropy": 0.43292236328125,
|
|
"epoch": 1.5634920634920635,
|
|
"grad_norm": 1.5261604480695485,
|
|
"learning_rate": 5.087505077129144e-06,
|
|
"loss": 0.4458,
|
|
"mean_token_accuracy": 0.8471705308184028,
|
|
"num_tokens": 169469975.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"entropy": 0.425628662109375,
|
|
"epoch": 1.5674603174603174,
|
|
"grad_norm": 1.0588587386866344,
|
|
"learning_rate": 5.065630273745495e-06,
|
|
"loss": 0.4463,
|
|
"mean_token_accuracy": 0.8460619812831283,
|
|
"num_tokens": 169905002.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 0.429779052734375,
|
|
"epoch": 1.5714285714285714,
|
|
"grad_norm": 1.017609763369074,
|
|
"learning_rate": 5.043754213885319e-06,
|
|
"loss": 0.4437,
|
|
"mean_token_accuracy": 0.8433791399002075,
|
|
"num_tokens": 170343282.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"entropy": 0.436981201171875,
|
|
"epoch": 1.5753968253968254,
|
|
"grad_norm": 0.9564026257150148,
|
|
"learning_rate": 5.021877316360759e-06,
|
|
"loss": 0.4354,
|
|
"mean_token_accuracy": 0.8478411976248026,
|
|
"num_tokens": 170783254.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"entropy": 0.43304443359375,
|
|
"epoch": 1.5793650793650795,
|
|
"grad_norm": 0.9585975685485587,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.4505,
|
|
"mean_token_accuracy": 0.8458553478121758,
|
|
"num_tokens": 171227432.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"entropy": 0.43023681640625,
|
|
"epoch": 1.5833333333333335,
|
|
"grad_norm": 1.0440501055720262,
|
|
"learning_rate": 4.978122683639241e-06,
|
|
"loss": 0.4275,
|
|
"mean_token_accuracy": 0.8501301733776927,
|
|
"num_tokens": 171673565.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"entropy": 0.436431884765625,
|
|
"epoch": 1.5873015873015874,
|
|
"grad_norm": 1.0933083501713738,
|
|
"learning_rate": 4.956245786114683e-06,
|
|
"loss": 0.4295,
|
|
"mean_token_accuracy": 0.8506188867613673,
|
|
"num_tokens": 172096305.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.434814453125,
|
|
"epoch": 1.5912698412698414,
|
|
"grad_norm": 1.1069832769815195,
|
|
"learning_rate": 4.934369726254506e-06,
|
|
"loss": 0.43,
|
|
"mean_token_accuracy": 0.8495042575523257,
|
|
"num_tokens": 172495298.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"entropy": 0.433929443359375,
|
|
"epoch": 1.5952380952380953,
|
|
"grad_norm": 1.120671038507196,
|
|
"learning_rate": 4.9124949228708566e-06,
|
|
"loss": 0.4334,
|
|
"mean_token_accuracy": 0.8499879157170653,
|
|
"num_tokens": 172910673.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"entropy": 0.42694091796875,
|
|
"epoch": 1.5992063492063493,
|
|
"grad_norm": 1.028931284451181,
|
|
"learning_rate": 4.890621794751825e-06,
|
|
"loss": 0.4319,
|
|
"mean_token_accuracy": 0.8494029613211751,
|
|
"num_tokens": 173326209.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"entropy": 0.426605224609375,
|
|
"epoch": 1.6031746031746033,
|
|
"grad_norm": 0.9118168079626323,
|
|
"learning_rate": 4.8687507606534274e-06,
|
|
"loss": 0.4372,
|
|
"mean_token_accuracy": 0.8469415912404656,
|
|
"num_tokens": 173775762.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"entropy": 0.43621826171875,
|
|
"epoch": 1.6071428571428572,
|
|
"grad_norm": 1.0102731648951273,
|
|
"learning_rate": 4.8468822392915925e-06,
|
|
"loss": 0.4367,
|
|
"mean_token_accuracy": 0.8488945597782731,
|
|
"num_tokens": 174200041.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 0.428955078125,
|
|
"epoch": 1.6111111111111112,
|
|
"grad_norm": 0.9690257742063463,
|
|
"learning_rate": 4.82501664933414e-06,
|
|
"loss": 0.4406,
|
|
"mean_token_accuracy": 0.8465389581397176,
|
|
"num_tokens": 174651858.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"entropy": 0.436920166015625,
|
|
"epoch": 1.6150793650793651,
|
|
"grad_norm": 0.8850222581892622,
|
|
"learning_rate": 4.803154409392776e-06,
|
|
"loss": 0.4324,
|
|
"mean_token_accuracy": 0.8495019385591149,
|
|
"num_tokens": 175081173.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"entropy": 0.430511474609375,
|
|
"epoch": 1.619047619047619,
|
|
"grad_norm": 0.95437734633981,
|
|
"learning_rate": 4.781295938015063e-06,
|
|
"loss": 0.4331,
|
|
"mean_token_accuracy": 0.8485972639173269,
|
|
"num_tokens": 175519282.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"entropy": 0.435028076171875,
|
|
"epoch": 1.623015873015873,
|
|
"grad_norm": 1.0123634812749625,
|
|
"learning_rate": 4.759441653676419e-06,
|
|
"loss": 0.4466,
|
|
"mean_token_accuracy": 0.848145549185574,
|
|
"num_tokens": 175965036.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"entropy": 0.431060791015625,
|
|
"epoch": 1.626984126984127,
|
|
"grad_norm": 0.909110311090521,
|
|
"learning_rate": 4.737591974772102e-06,
|
|
"loss": 0.4451,
|
|
"mean_token_accuracy": 0.8459606841206551,
|
|
"num_tokens": 176387199.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.4302978515625,
|
|
"epoch": 1.630952380952381,
|
|
"grad_norm": 0.964606274615154,
|
|
"learning_rate": 4.715747319609199e-06,
|
|
"loss": 0.4414,
|
|
"mean_token_accuracy": 0.8480783235281706,
|
|
"num_tokens": 176823428.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"entropy": 0.423431396484375,
|
|
"epoch": 1.6349206349206349,
|
|
"grad_norm": 0.9360221541198701,
|
|
"learning_rate": 4.693908106398617e-06,
|
|
"loss": 0.4393,
|
|
"mean_token_accuracy": 0.8489115545526147,
|
|
"num_tokens": 177264131.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"entropy": 0.4334716796875,
|
|
"epoch": 1.6388888888888888,
|
|
"grad_norm": 0.9818915467360069,
|
|
"learning_rate": 4.6720747532470845e-06,
|
|
"loss": 0.4294,
|
|
"mean_token_accuracy": 0.8496479475870728,
|
|
"num_tokens": 177680911.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"entropy": 0.432647705078125,
|
|
"epoch": 1.6428571428571428,
|
|
"grad_norm": 0.8978522056780484,
|
|
"learning_rate": 4.650247678149135e-06,
|
|
"loss": 0.4379,
|
|
"mean_token_accuracy": 0.8470958042889833,
|
|
"num_tokens": 178114003.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"entropy": 0.437652587890625,
|
|
"epoch": 1.6468253968253967,
|
|
"grad_norm": 0.9722385088780229,
|
|
"learning_rate": 4.628427298979111e-06,
|
|
"loss": 0.4514,
|
|
"mean_token_accuracy": 0.8430732255801558,
|
|
"num_tokens": 178533077.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 0.437347412109375,
|
|
"epoch": 1.6507936507936507,
|
|
"grad_norm": 1.0373796667738375,
|
|
"learning_rate": 4.606614033483164e-06,
|
|
"loss": 0.4326,
|
|
"mean_token_accuracy": 0.8507428057491779,
|
|
"num_tokens": 178950487.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"entropy": 0.4326171875,
|
|
"epoch": 1.6547619047619047,
|
|
"grad_norm": 1.010237913873583,
|
|
"learning_rate": 4.5848082992712516e-06,
|
|
"loss": 0.4377,
|
|
"mean_token_accuracy": 0.8486862545832992,
|
|
"num_tokens": 179384739.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"entropy": 0.426300048828125,
|
|
"epoch": 1.6587301587301586,
|
|
"grad_norm": 1.0263841694329876,
|
|
"learning_rate": 4.563010513809156e-06,
|
|
"loss": 0.4455,
|
|
"mean_token_accuracy": 0.8446431895717978,
|
|
"num_tokens": 179833212.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"entropy": 0.42828369140625,
|
|
"epoch": 1.6626984126984126,
|
|
"grad_norm": 0.9494913320869729,
|
|
"learning_rate": 4.541221094410473e-06,
|
|
"loss": 0.4306,
|
|
"mean_token_accuracy": 0.8516378318890929,
|
|
"num_tokens": 180259940.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"entropy": 0.42144775390625,
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 0.9739308463131585,
|
|
"learning_rate": 4.519440458228638e-06,
|
|
"loss": 0.4381,
|
|
"mean_token_accuracy": 0.8479503998532891,
|
|
"num_tokens": 180712234.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.4244384765625,
|
|
"epoch": 1.6706349206349205,
|
|
"grad_norm": 1.0181973094308832,
|
|
"learning_rate": 4.497669022248931e-06,
|
|
"loss": 0.4525,
|
|
"mean_token_accuracy": 0.843443606980145,
|
|
"num_tokens": 181151354.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"entropy": 0.430877685546875,
|
|
"epoch": 1.6746031746031746,
|
|
"grad_norm": 3.323978860931596,
|
|
"learning_rate": 4.475907203280494e-06,
|
|
"loss": 0.4383,
|
|
"mean_token_accuracy": 0.8451524330303073,
|
|
"num_tokens": 181566490.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"entropy": 0.428955078125,
|
|
"epoch": 1.6785714285714286,
|
|
"grad_norm": 1.2824867106826667,
|
|
"learning_rate": 4.45415541794836e-06,
|
|
"loss": 0.446,
|
|
"mean_token_accuracy": 0.8463947279378772,
|
|
"num_tokens": 181997420.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"entropy": 0.431793212890625,
|
|
"epoch": 1.6825396825396826,
|
|
"grad_norm": 1.0255881219333862,
|
|
"learning_rate": 4.432414082685466e-06,
|
|
"loss": 0.4358,
|
|
"mean_token_accuracy": 0.8490986367687583,
|
|
"num_tokens": 182413254.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"entropy": 0.42706298828125,
|
|
"epoch": 1.6865079365079365,
|
|
"grad_norm": 1.0665870604693903,
|
|
"learning_rate": 4.410683613724684e-06,
|
|
"loss": 0.4292,
|
|
"mean_token_accuracy": 0.8507826002314687,
|
|
"num_tokens": 182840621.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 0.427398681640625,
|
|
"epoch": 1.6904761904761905,
|
|
"grad_norm": 1.1351262001199722,
|
|
"learning_rate": 4.388964427090855e-06,
|
|
"loss": 0.4359,
|
|
"mean_token_accuracy": 0.846874114125967,
|
|
"num_tokens": 183269538.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"entropy": 0.43524169921875,
|
|
"epoch": 1.6944444444444444,
|
|
"grad_norm": 0.9895934977007657,
|
|
"learning_rate": 4.367256938592822e-06,
|
|
"loss": 0.4231,
|
|
"mean_token_accuracy": 0.8536219568923116,
|
|
"num_tokens": 183684845.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"entropy": 0.43170166015625,
|
|
"epoch": 1.6984126984126984,
|
|
"grad_norm": 1.1767949451847899,
|
|
"learning_rate": 4.345561563815471e-06,
|
|
"loss": 0.4337,
|
|
"mean_token_accuracy": 0.8503425857052207,
|
|
"num_tokens": 184109496.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"entropy": 0.433258056640625,
|
|
"epoch": 1.7023809523809523,
|
|
"grad_norm": 0.9787163441447944,
|
|
"learning_rate": 4.323878718111771e-06,
|
|
"loss": 0.4496,
|
|
"mean_token_accuracy": 0.8437537206336856,
|
|
"num_tokens": 184533568.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"entropy": 0.432220458984375,
|
|
"epoch": 1.7063492063492065,
|
|
"grad_norm": 0.9948605324632119,
|
|
"learning_rate": 4.302208816594829e-06,
|
|
"loss": 0.4387,
|
|
"mean_token_accuracy": 0.8475517062470317,
|
|
"num_tokens": 184968366.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.42999267578125,
|
|
"epoch": 1.7103174603174605,
|
|
"grad_norm": 0.9068147664673831,
|
|
"learning_rate": 4.280552274129932e-06,
|
|
"loss": 0.4376,
|
|
"mean_token_accuracy": 0.8486391613259912,
|
|
"num_tokens": 185404884.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"entropy": 0.427978515625,
|
|
"epoch": 1.7142857142857144,
|
|
"grad_norm": 0.9871014833586675,
|
|
"learning_rate": 4.258909505326617e-06,
|
|
"loss": 0.4451,
|
|
"mean_token_accuracy": 0.8455649884417653,
|
|
"num_tokens": 185857166.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"entropy": 0.432586669921875,
|
|
"epoch": 1.7182539682539684,
|
|
"grad_norm": 0.9995499236592311,
|
|
"learning_rate": 4.237280924530723e-06,
|
|
"loss": 0.425,
|
|
"mean_token_accuracy": 0.8507826123386621,
|
|
"num_tokens": 186278301.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"entropy": 0.43853759765625,
|
|
"epoch": 1.7222222222222223,
|
|
"grad_norm": 0.9796741726346321,
|
|
"learning_rate": 4.215666945816469e-06,
|
|
"loss": 0.4266,
|
|
"mean_token_accuracy": 0.850803654640913,
|
|
"num_tokens": 186684767.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"entropy": 0.4305419921875,
|
|
"epoch": 1.7261904761904763,
|
|
"grad_norm": 0.9307664459487662,
|
|
"learning_rate": 4.194067982978516e-06,
|
|
"loss": 0.4279,
|
|
"mean_token_accuracy": 0.8503124145790935,
|
|
"num_tokens": 187107470.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 0.425567626953125,
|
|
"epoch": 1.7301587301587302,
|
|
"grad_norm": 0.9496403248581704,
|
|
"learning_rate": 4.172484449524047e-06,
|
|
"loss": 0.428,
|
|
"mean_token_accuracy": 0.8510759947821498,
|
|
"num_tokens": 187534641.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"entropy": 0.42620849609375,
|
|
"epoch": 1.7341269841269842,
|
|
"grad_norm": 0.9874730817939584,
|
|
"learning_rate": 4.150916758664857e-06,
|
|
"loss": 0.4352,
|
|
"mean_token_accuracy": 0.848286903463304,
|
|
"num_tokens": 187972052.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"entropy": 0.424652099609375,
|
|
"epoch": 1.7380952380952381,
|
|
"grad_norm": 0.9625644757119309,
|
|
"learning_rate": 4.129365323309436e-06,
|
|
"loss": 0.4295,
|
|
"mean_token_accuracy": 0.8496120125055313,
|
|
"num_tokens": 188403747.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"entropy": 0.425537109375,
|
|
"epoch": 1.742063492063492,
|
|
"grad_norm": 0.9770323219075207,
|
|
"learning_rate": 4.107830556055072e-06,
|
|
"loss": 0.4363,
|
|
"mean_token_accuracy": 0.8482074243947864,
|
|
"num_tokens": 188833376.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"entropy": 0.420562744140625,
|
|
"epoch": 1.746031746031746,
|
|
"grad_norm": 0.9091458418004688,
|
|
"learning_rate": 4.086312869179938e-06,
|
|
"loss": 0.434,
|
|
"mean_token_accuracy": 0.8494348004460335,
|
|
"num_tokens": 189286051.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.4337158203125,
|
|
"epoch": 1.75,
|
|
"grad_norm": 0.9398983504232156,
|
|
"learning_rate": 4.06481267463521e-06,
|
|
"loss": 0.4233,
|
|
"mean_token_accuracy": 0.85198515933007,
|
|
"num_tokens": 189700932.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"entropy": 0.428436279296875,
|
|
"epoch": 1.753968253968254,
|
|
"grad_norm": 0.9954518019783384,
|
|
"learning_rate": 4.04333038403718e-06,
|
|
"loss": 0.4332,
|
|
"mean_token_accuracy": 0.8483901359140873,
|
|
"num_tokens": 190135846.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"entropy": 0.41839599609375,
|
|
"epoch": 1.757936507936508,
|
|
"grad_norm": 0.9235407840660959,
|
|
"learning_rate": 4.021866408659368e-06,
|
|
"loss": 0.4376,
|
|
"mean_token_accuracy": 0.8477007877081633,
|
|
"num_tokens": 190599539.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"entropy": 0.42510986328125,
|
|
"epoch": 1.7619047619047619,
|
|
"grad_norm": 0.9988254434360743,
|
|
"learning_rate": 4.000421159424658e-06,
|
|
"loss": 0.4381,
|
|
"mean_token_accuracy": 0.849124894477427,
|
|
"num_tokens": 191023956.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"entropy": 0.44061279296875,
|
|
"epoch": 1.7658730158730158,
|
|
"grad_norm": 0.9313679757350634,
|
|
"learning_rate": 3.978995046897425e-06,
|
|
"loss": 0.4111,
|
|
"mean_token_accuracy": 0.8550975983962417,
|
|
"num_tokens": 191419256.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 0.42877197265625,
|
|
"epoch": 1.7698412698412698,
|
|
"grad_norm": 0.9424190366763185,
|
|
"learning_rate": 3.957588481275674e-06,
|
|
"loss": 0.438,
|
|
"mean_token_accuracy": 0.848029020242393,
|
|
"num_tokens": 191865715.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"entropy": 0.437103271484375,
|
|
"epoch": 1.7738095238095237,
|
|
"grad_norm": 0.9089004430002622,
|
|
"learning_rate": 3.9362018723831915e-06,
|
|
"loss": 0.4417,
|
|
"mean_token_accuracy": 0.8482843916863203,
|
|
"num_tokens": 192279544.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"entropy": 0.43310546875,
|
|
"epoch": 1.7777777777777777,
|
|
"grad_norm": 1.682337538575509,
|
|
"learning_rate": 3.914835629661695e-06,
|
|
"loss": 0.4219,
|
|
"mean_token_accuracy": 0.8513781204819679,
|
|
"num_tokens": 192687536.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"entropy": 0.434417724609375,
|
|
"epoch": 1.7817460317460316,
|
|
"grad_norm": 1.0677243021549518,
|
|
"learning_rate": 3.893490162162997e-06,
|
|
"loss": 0.427,
|
|
"mean_token_accuracy": 0.8539638724178076,
|
|
"num_tokens": 193092369.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"entropy": 0.43597412109375,
|
|
"epoch": 1.7857142857142856,
|
|
"grad_norm": 0.9415863303290471,
|
|
"learning_rate": 3.872165878541175e-06,
|
|
"loss": 0.4249,
|
|
"mean_token_accuracy": 0.8508947864174843,
|
|
"num_tokens": 193514317.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.4267578125,
|
|
"epoch": 1.7896825396825395,
|
|
"grad_norm": 0.9325477755113131,
|
|
"learning_rate": 3.850863187044745e-06,
|
|
"loss": 0.4311,
|
|
"mean_token_accuracy": 0.8517430359497666,
|
|
"num_tokens": 193943892.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"entropy": 0.4212646484375,
|
|
"epoch": 1.7936507936507935,
|
|
"grad_norm": 1.0936536327558857,
|
|
"learning_rate": 3.829582495508844e-06,
|
|
"loss": 0.428,
|
|
"mean_token_accuracy": 0.8505398780107498,
|
|
"num_tokens": 194368425.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"entropy": 0.425689697265625,
|
|
"epoch": 1.7976190476190477,
|
|
"grad_norm": 0.913775614343544,
|
|
"learning_rate": 3.808324211347429e-06,
|
|
"loss": 0.4263,
|
|
"mean_token_accuracy": 0.8509924123063684,
|
|
"num_tokens": 194781122.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"entropy": 0.42474365234375,
|
|
"epoch": 1.8015873015873016,
|
|
"grad_norm": 0.8819652825019069,
|
|
"learning_rate": 3.7870887415454687e-06,
|
|
"loss": 0.4352,
|
|
"mean_token_accuracy": 0.8501952039077878,
|
|
"num_tokens": 195229420.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"entropy": 0.423248291015625,
|
|
"epoch": 1.8055555555555556,
|
|
"grad_norm": 0.9710832265661201,
|
|
"learning_rate": 3.7658764926511613e-06,
|
|
"loss": 0.4364,
|
|
"mean_token_accuracy": 0.8493523299694061,
|
|
"num_tokens": 195670858.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 0.429229736328125,
|
|
"epoch": 1.8095238095238095,
|
|
"grad_norm": 1.0034882334655617,
|
|
"learning_rate": 3.7446878707681413e-06,
|
|
"loss": 0.4312,
|
|
"mean_token_accuracy": 0.8488902822136879,
|
|
"num_tokens": 196086060.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"entropy": 0.42626953125,
|
|
"epoch": 1.8134920634920635,
|
|
"grad_norm": 0.8967060198023731,
|
|
"learning_rate": 3.7235232815477123e-06,
|
|
"loss": 0.4389,
|
|
"mean_token_accuracy": 0.8454429730772972,
|
|
"num_tokens": 196534067.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"entropy": 0.433380126953125,
|
|
"epoch": 1.8174603174603174,
|
|
"grad_norm": 1.0727361296036093,
|
|
"learning_rate": 3.7023831301810765e-06,
|
|
"loss": 0.4233,
|
|
"mean_token_accuracy": 0.852061620913446,
|
|
"num_tokens": 196949752.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"entropy": 0.4302978515625,
|
|
"epoch": 1.8214285714285714,
|
|
"grad_norm": 0.9533053527391133,
|
|
"learning_rate": 3.6812678213915777e-06,
|
|
"loss": 0.4274,
|
|
"mean_token_accuracy": 0.8499543191865087,
|
|
"num_tokens": 197361623.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"entropy": 0.428863525390625,
|
|
"epoch": 1.8253968253968254,
|
|
"grad_norm": 1.6646105544719645,
|
|
"learning_rate": 3.6601777594269605e-06,
|
|
"loss": 0.4275,
|
|
"mean_token_accuracy": 0.8524315897375345,
|
|
"num_tokens": 197787383.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.427886962890625,
|
|
"epoch": 1.8293650793650795,
|
|
"grad_norm": 0.918452931744825,
|
|
"learning_rate": 3.6391133480516196e-06,
|
|
"loss": 0.4351,
|
|
"mean_token_accuracy": 0.8494909154251218,
|
|
"num_tokens": 198214788.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"entropy": 0.433502197265625,
|
|
"epoch": 1.8333333333333335,
|
|
"grad_norm": 0.9250539034798784,
|
|
"learning_rate": 3.618074990538873e-06,
|
|
"loss": 0.44,
|
|
"mean_token_accuracy": 0.8496057353913784,
|
|
"num_tokens": 198640204.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"entropy": 0.4234619140625,
|
|
"epoch": 1.8373015873015874,
|
|
"grad_norm": 0.8926807300614167,
|
|
"learning_rate": 3.5970630896632485e-06,
|
|
"loss": 0.4373,
|
|
"mean_token_accuracy": 0.8482935605570674,
|
|
"num_tokens": 199086174.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"entropy": 0.423919677734375,
|
|
"epoch": 1.8412698412698414,
|
|
"grad_norm": 0.9317218135024461,
|
|
"learning_rate": 3.5760780476927637e-06,
|
|
"loss": 0.4342,
|
|
"mean_token_accuracy": 0.8504292815923691,
|
|
"num_tokens": 199534945.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"entropy": 0.43280029296875,
|
|
"epoch": 1.8452380952380953,
|
|
"grad_norm": 0.9327031690920736,
|
|
"learning_rate": 3.5551202663812344e-06,
|
|
"loss": 0.428,
|
|
"mean_token_accuracy": 0.851259358227253,
|
|
"num_tokens": 199970879.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 0.43359375,
|
|
"epoch": 1.8492063492063493,
|
|
"grad_norm": 0.9103535545774605,
|
|
"learning_rate": 3.534190146960571e-06,
|
|
"loss": 0.4254,
|
|
"mean_token_accuracy": 0.8511311411857605,
|
|
"num_tokens": 200401566.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"entropy": 0.43096923828125,
|
|
"epoch": 1.8531746031746033,
|
|
"grad_norm": 1.3202029413068583,
|
|
"learning_rate": 3.5132880901331067e-06,
|
|
"loss": 0.4244,
|
|
"mean_token_accuracy": 0.8484150217846036,
|
|
"num_tokens": 200819281.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"entropy": 0.42852783203125,
|
|
"epoch": 1.8571428571428572,
|
|
"grad_norm": 0.9663839835801094,
|
|
"learning_rate": 3.492414496063921e-06,
|
|
"loss": 0.4389,
|
|
"mean_token_accuracy": 0.8492425018921494,
|
|
"num_tokens": 201286569.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"entropy": 0.42816162109375,
|
|
"epoch": 1.8611111111111112,
|
|
"grad_norm": 0.922662186018523,
|
|
"learning_rate": 3.4715697643731828e-06,
|
|
"loss": 0.4286,
|
|
"mean_token_accuracy": 0.8502284437417984,
|
|
"num_tokens": 201729117.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"entropy": 0.4305419921875,
|
|
"epoch": 1.8650793650793651,
|
|
"grad_norm": 0.9615527156025448,
|
|
"learning_rate": 3.4507542941284933e-06,
|
|
"loss": 0.4251,
|
|
"mean_token_accuracy": 0.8521155146881938,
|
|
"num_tokens": 202148785.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.42950439453125,
|
|
"epoch": 1.869047619047619,
|
|
"grad_norm": 0.8896950243538952,
|
|
"learning_rate": 3.4299684838372547e-06,
|
|
"loss": 0.4209,
|
|
"mean_token_accuracy": 0.8519325880333781,
|
|
"num_tokens": 202562335.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"entropy": 0.438201904296875,
|
|
"epoch": 1.873015873015873,
|
|
"grad_norm": 0.896750571119777,
|
|
"learning_rate": 3.4092127314390354e-06,
|
|
"loss": 0.4241,
|
|
"mean_token_accuracy": 0.8511500097811222,
|
|
"num_tokens": 202969412.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"entropy": 0.424560546875,
|
|
"epoch": 1.876984126984127,
|
|
"grad_norm": 0.8342483785030218,
|
|
"learning_rate": 3.388487434297949e-06,
|
|
"loss": 0.4349,
|
|
"mean_token_accuracy": 0.8488007439300418,
|
|
"num_tokens": 203414579.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"entropy": 0.429595947265625,
|
|
"epoch": 1.880952380952381,
|
|
"grad_norm": 0.8918742155840607,
|
|
"learning_rate": 3.3677929891950527e-06,
|
|
"loss": 0.4247,
|
|
"mean_token_accuracy": 0.8510593473911285,
|
|
"num_tokens": 203845826.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"entropy": 0.43017578125,
|
|
"epoch": 1.8849206349206349,
|
|
"grad_norm": 0.9252775003902146,
|
|
"learning_rate": 3.347129792320748e-06,
|
|
"loss": 0.4272,
|
|
"mean_token_accuracy": 0.8510101838037372,
|
|
"num_tokens": 204272914.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 0.424591064453125,
|
|
"epoch": 1.8888888888888888,
|
|
"grad_norm": 0.9664584622314957,
|
|
"learning_rate": 3.3264982392671973e-06,
|
|
"loss": 0.4204,
|
|
"mean_token_accuracy": 0.8532195715233684,
|
|
"num_tokens": 204713067.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"entropy": 0.42791748046875,
|
|
"epoch": 1.8928571428571428,
|
|
"grad_norm": 0.9292473265869555,
|
|
"learning_rate": 3.3058987250207476e-06,
|
|
"loss": 0.4277,
|
|
"mean_token_accuracy": 0.8527126982808113,
|
|
"num_tokens": 205140799.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"entropy": 0.439788818359375,
|
|
"epoch": 1.8968253968253967,
|
|
"grad_norm": 0.9128528058058363,
|
|
"learning_rate": 3.285331643954372e-06,
|
|
"loss": 0.4234,
|
|
"mean_token_accuracy": 0.8513627136126161,
|
|
"num_tokens": 205549482.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"entropy": 0.428558349609375,
|
|
"epoch": 1.9007936507936507,
|
|
"grad_norm": 0.9344739197051096,
|
|
"learning_rate": 3.2647973898201157e-06,
|
|
"loss": 0.4269,
|
|
"mean_token_accuracy": 0.8505295282229781,
|
|
"num_tokens": 205957709.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"entropy": 0.428436279296875,
|
|
"epoch": 1.9047619047619047,
|
|
"grad_norm": 0.8831126126363492,
|
|
"learning_rate": 3.244296355741561e-06,
|
|
"loss": 0.426,
|
|
"mean_token_accuracy": 0.8514531748369336,
|
|
"num_tokens": 206394578.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.43328857421875,
|
|
"epoch": 1.9087301587301586,
|
|
"grad_norm": 0.8812462855968569,
|
|
"learning_rate": 3.2238289342063013e-06,
|
|
"loss": 0.429,
|
|
"mean_token_accuracy": 0.8510967614129186,
|
|
"num_tokens": 206810851.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"entropy": 0.428375244140625,
|
|
"epoch": 1.9126984126984126,
|
|
"grad_norm": 1.0106928205994128,
|
|
"learning_rate": 3.203395517058423e-06,
|
|
"loss": 0.432,
|
|
"mean_token_accuracy": 0.852095915004611,
|
|
"num_tokens": 207233636.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"entropy": 0.421112060546875,
|
|
"epoch": 1.9166666666666665,
|
|
"grad_norm": 0.9116927331499651,
|
|
"learning_rate": 3.1829964954910076e-06,
|
|
"loss": 0.4363,
|
|
"mean_token_accuracy": 0.8473147870972753,
|
|
"num_tokens": 207671663.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"entropy": 0.437652587890625,
|
|
"epoch": 1.9206349206349205,
|
|
"grad_norm": 0.9660485826307438,
|
|
"learning_rate": 3.1626322600386418e-06,
|
|
"loss": 0.4289,
|
|
"mean_token_accuracy": 0.8505426356568933,
|
|
"num_tokens": 208074376.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"entropy": 0.4241943359375,
|
|
"epoch": 1.9246031746031746,
|
|
"grad_norm": 0.9972216512477222,
|
|
"learning_rate": 3.1423032005699377e-06,
|
|
"loss": 0.4364,
|
|
"mean_token_accuracy": 0.8486529793590307,
|
|
"num_tokens": 208524843.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 0.4322509765625,
|
|
"epoch": 1.9285714285714286,
|
|
"grad_norm": 0.9283266129413389,
|
|
"learning_rate": 3.122009706280072e-06,
|
|
"loss": 0.4277,
|
|
"mean_token_accuracy": 0.8506509074941278,
|
|
"num_tokens": 208947370.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"entropy": 0.42724609375,
|
|
"epoch": 1.9325396825396826,
|
|
"grad_norm": 1.006394801232037,
|
|
"learning_rate": 3.1017521656833384e-06,
|
|
"loss": 0.4146,
|
|
"mean_token_accuracy": 0.8548265127465129,
|
|
"num_tokens": 209354451.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"entropy": 0.4229736328125,
|
|
"epoch": 1.9365079365079365,
|
|
"grad_norm": 0.8314414813893206,
|
|
"learning_rate": 3.0815309666057013e-06,
|
|
"loss": 0.428,
|
|
"mean_token_accuracy": 0.8494690489023924,
|
|
"num_tokens": 209798547.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"entropy": 0.425018310546875,
|
|
"epoch": 1.9404761904761905,
|
|
"grad_norm": 0.9234785434940929,
|
|
"learning_rate": 3.061346496177374e-06,
|
|
"loss": 0.421,
|
|
"mean_token_accuracy": 0.8528507072478533,
|
|
"num_tokens": 210233790.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"entropy": 0.43133544921875,
|
|
"epoch": 1.9444444444444444,
|
|
"grad_norm": 0.8757613774035661,
|
|
"learning_rate": 3.0411991408254116e-06,
|
|
"loss": 0.436,
|
|
"mean_token_accuracy": 0.8493619496002793,
|
|
"num_tokens": 210661829.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.420318603515625,
|
|
"epoch": 1.9484126984126984,
|
|
"grad_norm": 0.8668762253896259,
|
|
"learning_rate": 3.0210892862663043e-06,
|
|
"loss": 0.4267,
|
|
"mean_token_accuracy": 0.8510631760582328,
|
|
"num_tokens": 211113597.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"entropy": 0.4222412109375,
|
|
"epoch": 1.9523809523809523,
|
|
"grad_norm": 0.8822229179162288,
|
|
"learning_rate": 3.001017317498607e-06,
|
|
"loss": 0.4278,
|
|
"mean_token_accuracy": 0.8513042591512203,
|
|
"num_tokens": 211549046.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"entropy": 0.419830322265625,
|
|
"epoch": 1.9563492063492065,
|
|
"grad_norm": 0.9142830959986298,
|
|
"learning_rate": 2.9809836187955532e-06,
|
|
"loss": 0.4139,
|
|
"mean_token_accuracy": 0.8542308090254664,
|
|
"num_tokens": 212000519.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"entropy": 0.42449951171875,
|
|
"epoch": 1.9603174603174605,
|
|
"grad_norm": 0.8634339056465669,
|
|
"learning_rate": 2.960988573697705e-06,
|
|
"loss": 0.428,
|
|
"mean_token_accuracy": 0.8506795652210712,
|
|
"num_tokens": 212447521.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"entropy": 0.42681884765625,
|
|
"epoch": 1.9642857142857144,
|
|
"grad_norm": 0.8734416000621907,
|
|
"learning_rate": 2.941032565005613e-06,
|
|
"loss": 0.4262,
|
|
"mean_token_accuracy": 0.8521596789360046,
|
|
"num_tokens": 212865927.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 0.424072265625,
|
|
"epoch": 1.9682539682539684,
|
|
"grad_norm": 0.8877032051531498,
|
|
"learning_rate": 2.9211159747724813e-06,
|
|
"loss": 0.4264,
|
|
"mean_token_accuracy": 0.851787575520575,
|
|
"num_tokens": 213310334.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"entropy": 0.421661376953125,
|
|
"epoch": 1.9722222222222223,
|
|
"grad_norm": 0.9809567398581039,
|
|
"learning_rate": 2.90123918429686e-06,
|
|
"loss": 0.4246,
|
|
"mean_token_accuracy": 0.8516859589144588,
|
|
"num_tokens": 213742399.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"entropy": 0.42767333984375,
|
|
"epoch": 1.9761904761904763,
|
|
"grad_norm": 0.8738523997394374,
|
|
"learning_rate": 2.881402574115344e-06,
|
|
"loss": 0.4273,
|
|
"mean_token_accuracy": 0.8529170397669077,
|
|
"num_tokens": 214169043.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"entropy": 0.4276123046875,
|
|
"epoch": 1.9801587301587302,
|
|
"grad_norm": 0.9201362022804491,
|
|
"learning_rate": 2.8616065239952763e-06,
|
|
"loss": 0.424,
|
|
"mean_token_accuracy": 0.8526058839634061,
|
|
"num_tokens": 214572957.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"entropy": 0.430877685546875,
|
|
"epoch": 1.9841269841269842,
|
|
"grad_norm": 0.9306770950977414,
|
|
"learning_rate": 2.841851412927495e-06,
|
|
"loss": 0.4314,
|
|
"mean_token_accuracy": 0.8489747159183025,
|
|
"num_tokens": 215005057.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.4188232421875,
|
|
"epoch": 1.9880952380952381,
|
|
"grad_norm": 0.8357685751970109,
|
|
"learning_rate": 2.822137619119065e-06,
|
|
"loss": 0.42,
|
|
"mean_token_accuracy": 0.8517758399248123,
|
|
"num_tokens": 215449399.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"entropy": 0.426727294921875,
|
|
"epoch": 1.992063492063492,
|
|
"grad_norm": 1.1544716066903413,
|
|
"learning_rate": 2.8024655199860495e-06,
|
|
"loss": 0.4154,
|
|
"mean_token_accuracy": 0.8549016704782844,
|
|
"num_tokens": 215869766.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"entropy": 0.427978515625,
|
|
"epoch": 1.996031746031746,
|
|
"grad_norm": 0.8289572581024041,
|
|
"learning_rate": 2.7828354921462668e-06,
|
|
"loss": 0.4184,
|
|
"mean_token_accuracy": 0.8542971862480044,
|
|
"num_tokens": 216298988.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"entropy": 0.4202880859375,
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.8750452382881969,
|
|
"learning_rate": 2.7632479114120963e-06,
|
|
"loss": 0.4177,
|
|
"mean_token_accuracy": 0.8540928428992629,
|
|
"num_tokens": 216731206.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"entropy": 0.420989990234375,
|
|
"epoch": 2.003968253968254,
|
|
"grad_norm": 0.8871159450799843,
|
|
"learning_rate": 2.7437031527832747e-06,
|
|
"loss": 0.3994,
|
|
"mean_token_accuracy": 0.860961563885212,
|
|
"num_tokens": 217159781.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 0.425262451171875,
|
|
"epoch": 2.007936507936508,
|
|
"grad_norm": 0.9044028336131849,
|
|
"learning_rate": 2.72420159043972e-06,
|
|
"loss": 0.3935,
|
|
"mean_token_accuracy": 0.8634284269064665,
|
|
"num_tokens": 217589905.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"entropy": 0.42340087890625,
|
|
"epoch": 2.011904761904762,
|
|
"grad_norm": 0.8841207327958758,
|
|
"learning_rate": 2.704743597734365e-06,
|
|
"loss": 0.3933,
|
|
"mean_token_accuracy": 0.8630258431658149,
|
|
"num_tokens": 218017429.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"entropy": 0.42041015625,
|
|
"epoch": 2.015873015873016,
|
|
"grad_norm": 0.8980425705440174,
|
|
"learning_rate": 2.685329547186018e-06,
|
|
"loss": 0.4083,
|
|
"mean_token_accuracy": 0.8567906338721514,
|
|
"num_tokens": 218446876.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"entropy": 0.4229736328125,
|
|
"epoch": 2.0198412698412698,
|
|
"grad_norm": 0.909158252805293,
|
|
"learning_rate": 2.665959810472219e-06,
|
|
"loss": 0.4067,
|
|
"mean_token_accuracy": 0.8580641169101,
|
|
"num_tokens": 218885713.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"entropy": 0.41693115234375,
|
|
"epoch": 2.0238095238095237,
|
|
"grad_norm": 0.882075206716414,
|
|
"learning_rate": 2.6466347584221314e-06,
|
|
"loss": 0.3961,
|
|
"mean_token_accuracy": 0.861279109492898,
|
|
"num_tokens": 219322571.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.422607421875,
|
|
"epoch": 2.0277777777777777,
|
|
"grad_norm": 0.8895301340223191,
|
|
"learning_rate": 2.6273547610094408e-06,
|
|
"loss": 0.4007,
|
|
"mean_token_accuracy": 0.8570800367742777,
|
|
"num_tokens": 219748508.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"entropy": 0.420166015625,
|
|
"epoch": 2.0317460317460316,
|
|
"grad_norm": 0.908409070674735,
|
|
"learning_rate": 2.608120187345273e-06,
|
|
"loss": 0.3983,
|
|
"mean_token_accuracy": 0.8590443721041083,
|
|
"num_tokens": 220180160.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"entropy": 0.4185791015625,
|
|
"epoch": 2.0357142857142856,
|
|
"grad_norm": 1.034313453109704,
|
|
"learning_rate": 2.588931405671127e-06,
|
|
"loss": 0.3916,
|
|
"mean_token_accuracy": 0.8636050894856453,
|
|
"num_tokens": 220606565.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"entropy": 0.422393798828125,
|
|
"epoch": 2.0396825396825395,
|
|
"grad_norm": 0.8777983265834516,
|
|
"learning_rate": 2.5697887833518215e-06,
|
|
"loss": 0.3897,
|
|
"mean_token_accuracy": 0.8630373626947403,
|
|
"num_tokens": 221016578.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"entropy": 0.41497802734375,
|
|
"epoch": 2.0436507936507935,
|
|
"grad_norm": 0.9119000908237385,
|
|
"learning_rate": 2.5506926868684683e-06,
|
|
"loss": 0.3967,
|
|
"mean_token_accuracy": 0.8603310724720359,
|
|
"num_tokens": 221455851.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 0.424346923828125,
|
|
"epoch": 2.0476190476190474,
|
|
"grad_norm": 0.9104788824732245,
|
|
"learning_rate": 2.5316434818114517e-06,
|
|
"loss": 0.4009,
|
|
"mean_token_accuracy": 0.8583084382116795,
|
|
"num_tokens": 221871968.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"entropy": 0.41632080078125,
|
|
"epoch": 2.0515873015873014,
|
|
"grad_norm": 0.7974753175425153,
|
|
"learning_rate": 2.5126415328734275e-06,
|
|
"loss": 0.3875,
|
|
"mean_token_accuracy": 0.8620841084048152,
|
|
"num_tokens": 222303576.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"entropy": 0.41741943359375,
|
|
"epoch": 2.0555555555555554,
|
|
"grad_norm": 0.8523247821631298,
|
|
"learning_rate": 2.4936872038423516e-06,
|
|
"loss": 0.3935,
|
|
"mean_token_accuracy": 0.8615706618875265,
|
|
"num_tokens": 222738323.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"entropy": 0.416717529296875,
|
|
"epoch": 2.0595238095238093,
|
|
"grad_norm": 0.8420283553726328,
|
|
"learning_rate": 2.4747808575945006e-06,
|
|
"loss": 0.3942,
|
|
"mean_token_accuracy": 0.8623552098870277,
|
|
"num_tokens": 223168261.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"entropy": 0.421295166015625,
|
|
"epoch": 2.0634920634920633,
|
|
"grad_norm": 0.9269712393029744,
|
|
"learning_rate": 2.4559228560875336e-06,
|
|
"loss": 0.3983,
|
|
"mean_token_accuracy": 0.8609938519075513,
|
|
"num_tokens": 223584134.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.41546630859375,
|
|
"epoch": 2.0674603174603177,
|
|
"grad_norm": 0.7913231790323264,
|
|
"learning_rate": 2.4371135603535613e-06,
|
|
"loss": 0.3881,
|
|
"mean_token_accuracy": 0.8632083088159561,
|
|
"num_tokens": 224013215.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"entropy": 0.40972900390625,
|
|
"epoch": 2.0714285714285716,
|
|
"grad_norm": 0.8896009296171342,
|
|
"learning_rate": 2.4183533304922336e-06,
|
|
"loss": 0.4024,
|
|
"mean_token_accuracy": 0.8593400968238711,
|
|
"num_tokens": 224461654.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"entropy": 0.416046142578125,
|
|
"epoch": 2.0753968253968256,
|
|
"grad_norm": 0.8522563242461978,
|
|
"learning_rate": 2.399642525663843e-06,
|
|
"loss": 0.3968,
|
|
"mean_token_accuracy": 0.8609009999781847,
|
|
"num_tokens": 224885889.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"entropy": 0.41802978515625,
|
|
"epoch": 2.0793650793650795,
|
|
"grad_norm": 0.8436355578137702,
|
|
"learning_rate": 2.380981504082459e-06,
|
|
"loss": 0.4051,
|
|
"mean_token_accuracy": 0.8574947854503989,
|
|
"num_tokens": 225327562.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"entropy": 0.410980224609375,
|
|
"epoch": 2.0833333333333335,
|
|
"grad_norm": 0.9234046715388234,
|
|
"learning_rate": 2.3623706230090517e-06,
|
|
"loss": 0.3946,
|
|
"mean_token_accuracy": 0.860747816041112,
|
|
"num_tokens": 225767121.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 0.4215087890625,
|
|
"epoch": 2.0873015873015874,
|
|
"grad_norm": 0.886667769462096,
|
|
"learning_rate": 2.3438102387446686e-06,
|
|
"loss": 0.3887,
|
|
"mean_token_accuracy": 0.8633216423913836,
|
|
"num_tokens": 226189031.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"entropy": 0.41558837890625,
|
|
"epoch": 2.0912698412698414,
|
|
"grad_norm": 0.8295983883133476,
|
|
"learning_rate": 2.325300706623607e-06,
|
|
"loss": 0.4059,
|
|
"mean_token_accuracy": 0.8594214450567961,
|
|
"num_tokens": 226627902.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"entropy": 0.416168212890625,
|
|
"epoch": 2.0952380952380953,
|
|
"grad_norm": 0.8579824625414783,
|
|
"learning_rate": 2.3068423810066085e-06,
|
|
"loss": 0.4086,
|
|
"mean_token_accuracy": 0.8578107142820954,
|
|
"num_tokens": 227062309.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"entropy": 0.418792724609375,
|
|
"epoch": 2.0992063492063493,
|
|
"grad_norm": 0.8717081684901182,
|
|
"learning_rate": 2.288435615274085e-06,
|
|
"loss": 0.4026,
|
|
"mean_token_accuracy": 0.8583700396120548,
|
|
"num_tokens": 227485113.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"entropy": 0.418609619140625,
|
|
"epoch": 2.1031746031746033,
|
|
"grad_norm": 0.8671184672809995,
|
|
"learning_rate": 2.2700807618193393e-06,
|
|
"loss": 0.3945,
|
|
"mean_token_accuracy": 0.8610662836581469,
|
|
"num_tokens": 227920598.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 0.416961669921875,
|
|
"epoch": 2.107142857142857,
|
|
"grad_norm": 0.7659046613801866,
|
|
"learning_rate": 2.251778172041828e-06,
|
|
"loss": 0.391,
|
|
"mean_token_accuracy": 0.8613040810450912,
|
|
"num_tokens": 228346699.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"entropy": 0.41766357421875,
|
|
"epoch": 2.111111111111111,
|
|
"grad_norm": 0.8757955281793407,
|
|
"learning_rate": 2.2335281963404315e-06,
|
|
"loss": 0.3985,
|
|
"mean_token_accuracy": 0.86165143083781,
|
|
"num_tokens": 228773818.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"entropy": 0.41998291015625,
|
|
"epoch": 2.115079365079365,
|
|
"grad_norm": 0.9727283374741916,
|
|
"learning_rate": 2.2153311841067438e-06,
|
|
"loss": 0.3928,
|
|
"mean_token_accuracy": 0.8631924940273166,
|
|
"num_tokens": 229188623.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"entropy": 0.412200927734375,
|
|
"epoch": 2.119047619047619,
|
|
"grad_norm": 0.8392433239210284,
|
|
"learning_rate": 2.1971874837183914e-06,
|
|
"loss": 0.3869,
|
|
"mean_token_accuracy": 0.8635608870536089,
|
|
"num_tokens": 229627711.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"entropy": 0.415802001953125,
|
|
"epoch": 2.123015873015873,
|
|
"grad_norm": 0.9201827428240057,
|
|
"learning_rate": 2.179097442532352e-06,
|
|
"loss": 0.4088,
|
|
"mean_token_accuracy": 0.8568679317831993,
|
|
"num_tokens": 230054209.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 0.41278076171875,
|
|
"epoch": 2.126984126984127,
|
|
"grad_norm": 0.8066388305393899,
|
|
"learning_rate": 2.1610614068783112e-06,
|
|
"loss": 0.3981,
|
|
"mean_token_accuracy": 0.8601069571450353,
|
|
"num_tokens": 230489032.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"entropy": 0.411895751953125,
|
|
"epoch": 2.130952380952381,
|
|
"grad_norm": 0.8350937916956933,
|
|
"learning_rate": 2.143079722052034e-06,
|
|
"loss": 0.4015,
|
|
"mean_token_accuracy": 0.8587260395288467,
|
|
"num_tokens": 230910745.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"entropy": 0.417938232421875,
|
|
"epoch": 2.134920634920635,
|
|
"grad_norm": 0.791508989758568,
|
|
"learning_rate": 2.125152732308747e-06,
|
|
"loss": 0.4049,
|
|
"mean_token_accuracy": 0.8583241375163198,
|
|
"num_tokens": 231339019.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"entropy": 0.4166259765625,
|
|
"epoch": 2.138888888888889,
|
|
"grad_norm": 0.7979398132027408,
|
|
"learning_rate": 2.1072807808565547e-06,
|
|
"loss": 0.4084,
|
|
"mean_token_accuracy": 0.8571968795731664,
|
|
"num_tokens": 231777523.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"entropy": 0.420440673828125,
|
|
"epoch": 2.142857142857143,
|
|
"grad_norm": 0.8603306148484448,
|
|
"learning_rate": 2.0894642098498656e-06,
|
|
"loss": 0.3952,
|
|
"mean_token_accuracy": 0.859032517299056,
|
|
"num_tokens": 232199672.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 0.423187255859375,
|
|
"epoch": 2.1468253968253967,
|
|
"grad_norm": 0.9055074686631474,
|
|
"learning_rate": 2.0717033603828436e-06,
|
|
"loss": 0.3923,
|
|
"mean_token_accuracy": 0.8614393156021833,
|
|
"num_tokens": 232633797.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"entropy": 0.417877197265625,
|
|
"epoch": 2.1507936507936507,
|
|
"grad_norm": 0.8617856992329058,
|
|
"learning_rate": 2.0539985724828736e-06,
|
|
"loss": 0.4081,
|
|
"mean_token_accuracy": 0.8573337839916348,
|
|
"num_tokens": 233076007.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"entropy": 0.41546630859375,
|
|
"epoch": 2.1547619047619047,
|
|
"grad_norm": 0.8903667184752816,
|
|
"learning_rate": 2.0363501851040573e-06,
|
|
"loss": 0.3922,
|
|
"mean_token_accuracy": 0.861387861892581,
|
|
"num_tokens": 233509851.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"entropy": 0.4229736328125,
|
|
"epoch": 2.1587301587301586,
|
|
"grad_norm": 0.8398162712869015,
|
|
"learning_rate": 2.0187585361207174e-06,
|
|
"loss": 0.4043,
|
|
"mean_token_accuracy": 0.857014361768961,
|
|
"num_tokens": 233942156.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"entropy": 0.418701171875,
|
|
"epoch": 2.1626984126984126,
|
|
"grad_norm": 0.8309474925972752,
|
|
"learning_rate": 2.001223962320941e-06,
|
|
"loss": 0.3959,
|
|
"mean_token_accuracy": 0.8592708380892873,
|
|
"num_tokens": 234372963.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 0.414398193359375,
|
|
"epoch": 2.1666666666666665,
|
|
"grad_norm": 0.8088942738118841,
|
|
"learning_rate": 1.9837467994001165e-06,
|
|
"loss": 0.4048,
|
|
"mean_token_accuracy": 0.8613162385299802,
|
|
"num_tokens": 234844668.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"entropy": 0.429656982421875,
|
|
"epoch": 2.1706349206349205,
|
|
"grad_norm": 0.8900138868011044,
|
|
"learning_rate": 1.9663273819545157e-06,
|
|
"loss": 0.4117,
|
|
"mean_token_accuracy": 0.8555487683042884,
|
|
"num_tokens": 235271990.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"entropy": 0.416961669921875,
|
|
"epoch": 2.1746031746031744,
|
|
"grad_norm": 0.8125994478475848,
|
|
"learning_rate": 1.948966043474889e-06,
|
|
"loss": 0.3981,
|
|
"mean_token_accuracy": 0.8588608456775546,
|
|
"num_tokens": 235697877.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"entropy": 0.429046630859375,
|
|
"epoch": 2.1785714285714284,
|
|
"grad_norm": 0.9972924104553051,
|
|
"learning_rate": 1.931663116340074e-06,
|
|
"loss": 0.4049,
|
|
"mean_token_accuracy": 0.8577186185866594,
|
|
"num_tokens": 236134537.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"entropy": 0.410797119140625,
|
|
"epoch": 2.1825396825396823,
|
|
"grad_norm": 0.8632872657339906,
|
|
"learning_rate": 1.914418931810643e-06,
|
|
"loss": 0.3855,
|
|
"mean_token_accuracy": 0.8640564111992717,
|
|
"num_tokens": 236586699.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 0.424530029296875,
|
|
"epoch": 2.1865079365079367,
|
|
"grad_norm": 0.8870689635471863,
|
|
"learning_rate": 1.8972338200225509e-06,
|
|
"loss": 0.3984,
|
|
"mean_token_accuracy": 0.8577613439410925,
|
|
"num_tokens": 236995332.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"entropy": 0.418975830078125,
|
|
"epoch": 2.1904761904761907,
|
|
"grad_norm": 0.9628030178975229,
|
|
"learning_rate": 1.880108109980815e-06,
|
|
"loss": 0.3934,
|
|
"mean_token_accuracy": 0.861169021576643,
|
|
"num_tokens": 237426378.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"entropy": 0.411376953125,
|
|
"epoch": 2.1944444444444446,
|
|
"grad_norm": 0.933588404712383,
|
|
"learning_rate": 1.8630421295532252e-06,
|
|
"loss": 0.3905,
|
|
"mean_token_accuracy": 0.8604107396677136,
|
|
"num_tokens": 237866086.0,
|
|
"step": 553
|
|
},
|
|
{
|
|
"entropy": 0.41845703125,
|
|
"epoch": 2.1984126984126986,
|
|
"grad_norm": 1.0435808914840323,
|
|
"learning_rate": 1.8460362054640573e-06,
|
|
"loss": 0.4007,
|
|
"mean_token_accuracy": 0.8584116594865918,
|
|
"num_tokens": 238297987.0,
|
|
"step": 554
|
|
},
|
|
{
|
|
"entropy": 0.4312744140625,
|
|
"epoch": 2.2023809523809526,
|
|
"grad_norm": 0.9124011744416908,
|
|
"learning_rate": 1.8290906632878297e-06,
|
|
"loss": 0.4056,
|
|
"mean_token_accuracy": 0.8590257493779063,
|
|
"num_tokens": 238729296.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 0.41986083984375,
|
|
"epoch": 2.2063492063492065,
|
|
"grad_norm": 0.9196757371946168,
|
|
"learning_rate": 1.8122058274430542e-06,
|
|
"loss": 0.408,
|
|
"mean_token_accuracy": 0.8594406340271235,
|
|
"num_tokens": 239171101.0,
|
|
"step": 556
|
|
},
|
|
{
|
|
"entropy": 0.42120361328125,
|
|
"epoch": 2.2103174603174605,
|
|
"grad_norm": 0.8297358875305545,
|
|
"learning_rate": 1.7953820211860395e-06,
|
|
"loss": 0.3919,
|
|
"mean_token_accuracy": 0.8603522703051567,
|
|
"num_tokens": 239602299.0,
|
|
"step": 557
|
|
},
|
|
{
|
|
"entropy": 0.41949462890625,
|
|
"epoch": 2.2142857142857144,
|
|
"grad_norm": 1.6698534343246039,
|
|
"learning_rate": 1.7786195666046935e-06,
|
|
"loss": 0.3915,
|
|
"mean_token_accuracy": 0.8623024551197886,
|
|
"num_tokens": 240034337.0,
|
|
"step": 558
|
|
},
|
|
{
|
|
"entropy": 0.42144775390625,
|
|
"epoch": 2.2182539682539684,
|
|
"grad_norm": 0.8963232285622191,
|
|
"learning_rate": 1.7619187846123624e-06,
|
|
"loss": 0.3901,
|
|
"mean_token_accuracy": 0.8617757288739085,
|
|
"num_tokens": 240461291.0,
|
|
"step": 559
|
|
},
|
|
{
|
|
"entropy": 0.42474365234375,
|
|
"epoch": 2.2222222222222223,
|
|
"grad_norm": 0.9778763913057226,
|
|
"learning_rate": 1.7452799949416833e-06,
|
|
"loss": 0.384,
|
|
"mean_token_accuracy": 0.8640343863517046,
|
|
"num_tokens": 240860927.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 0.41705322265625,
|
|
"epoch": 2.2261904761904763,
|
|
"grad_norm": 0.8286270345827924,
|
|
"learning_rate": 1.7287035161384673e-06,
|
|
"loss": 0.3996,
|
|
"mean_token_accuracy": 0.8590253088623285,
|
|
"num_tokens": 241301179.0,
|
|
"step": 561
|
|
},
|
|
{
|
|
"entropy": 0.418853759765625,
|
|
"epoch": 2.2301587301587302,
|
|
"grad_norm": 0.8430918806162481,
|
|
"learning_rate": 1.7121896655555958e-06,
|
|
"loss": 0.396,
|
|
"mean_token_accuracy": 0.860031645745039,
|
|
"num_tokens": 241739076.0,
|
|
"step": 562
|
|
},
|
|
{
|
|
"entropy": 0.424774169921875,
|
|
"epoch": 2.234126984126984,
|
|
"grad_norm": 0.826236198905769,
|
|
"learning_rate": 1.695738759346947e-06,
|
|
"loss": 0.3891,
|
|
"mean_token_accuracy": 0.8625601828098297,
|
|
"num_tokens": 242150640.0,
|
|
"step": 563
|
|
},
|
|
{
|
|
"entropy": 0.412109375,
|
|
"epoch": 2.238095238095238,
|
|
"grad_norm": 0.8853893523977265,
|
|
"learning_rate": 1.6793511124613455e-06,
|
|
"loss": 0.3874,
|
|
"mean_token_accuracy": 0.8637553565204144,
|
|
"num_tokens": 242574011.0,
|
|
"step": 564
|
|
},
|
|
{
|
|
"entropy": 0.422393798828125,
|
|
"epoch": 2.242063492063492,
|
|
"grad_norm": 0.9075367727640452,
|
|
"learning_rate": 1.6630270386365288e-06,
|
|
"loss": 0.3989,
|
|
"mean_token_accuracy": 0.8571943752467632,
|
|
"num_tokens": 243005939.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 0.41766357421875,
|
|
"epoch": 2.246031746031746,
|
|
"grad_norm": 0.8448948319006312,
|
|
"learning_rate": 1.6467668503931432e-06,
|
|
"loss": 0.398,
|
|
"mean_token_accuracy": 0.861447062343359,
|
|
"num_tokens": 243458878.0,
|
|
"step": 566
|
|
},
|
|
{
|
|
"entropy": 0.418365478515625,
|
|
"epoch": 2.25,
|
|
"grad_norm": 0.9930222072087751,
|
|
"learning_rate": 1.6305708590287616e-06,
|
|
"loss": 0.3997,
|
|
"mean_token_accuracy": 0.8600739203393459,
|
|
"num_tokens": 243877438.0,
|
|
"step": 567
|
|
},
|
|
{
|
|
"entropy": 0.41552734375,
|
|
"epoch": 2.253968253968254,
|
|
"grad_norm": 0.8598361323835692,
|
|
"learning_rate": 1.6144393746119208e-06,
|
|
"loss": 0.3943,
|
|
"mean_token_accuracy": 0.8619920583441854,
|
|
"num_tokens": 244313964.0,
|
|
"step": 568
|
|
},
|
|
{
|
|
"entropy": 0.41705322265625,
|
|
"epoch": 2.257936507936508,
|
|
"grad_norm": 0.9059341355540655,
|
|
"learning_rate": 1.5983727059761873e-06,
|
|
"loss": 0.3981,
|
|
"mean_token_accuracy": 0.8603257145732641,
|
|
"num_tokens": 244761734.0,
|
|
"step": 569
|
|
},
|
|
{
|
|
"entropy": 0.417938232421875,
|
|
"epoch": 2.261904761904762,
|
|
"grad_norm": 0.8354660701028858,
|
|
"learning_rate": 1.5823711607142428e-06,
|
|
"loss": 0.3843,
|
|
"mean_token_accuracy": 0.863621992059052,
|
|
"num_tokens": 245200322.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 0.416839599609375,
|
|
"epoch": 2.265873015873016,
|
|
"grad_norm": 0.8345755216968843,
|
|
"learning_rate": 1.5664350451720022e-06,
|
|
"loss": 0.396,
|
|
"mean_token_accuracy": 0.8610862046480179,
|
|
"num_tokens": 245646233.0,
|
|
"step": 571
|
|
},
|
|
{
|
|
"entropy": 0.421661376953125,
|
|
"epoch": 2.2698412698412698,
|
|
"grad_norm": 0.8201081491300131,
|
|
"learning_rate": 1.5505646644427375e-06,
|
|
"loss": 0.395,
|
|
"mean_token_accuracy": 0.8609900875017047,
|
|
"num_tokens": 246083539.0,
|
|
"step": 572
|
|
},
|
|
{
|
|
"entropy": 0.421539306640625,
|
|
"epoch": 2.2738095238095237,
|
|
"grad_norm": 0.8429380051297379,
|
|
"learning_rate": 1.5347603223612462e-06,
|
|
"loss": 0.3963,
|
|
"mean_token_accuracy": 0.860317200422287,
|
|
"num_tokens": 246515677.0,
|
|
"step": 573
|
|
},
|
|
{
|
|
"entropy": 0.4217529296875,
|
|
"epoch": 2.2777777777777777,
|
|
"grad_norm": 0.868322359342986,
|
|
"learning_rate": 1.5190223214980286e-06,
|
|
"loss": 0.3976,
|
|
"mean_token_accuracy": 0.8608297156170011,
|
|
"num_tokens": 246933619.0,
|
|
"step": 574
|
|
},
|
|
{
|
|
"entropy": 0.43359375,
|
|
"epoch": 2.2817460317460316,
|
|
"grad_norm": 0.8952218666631779,
|
|
"learning_rate": 1.5033509631534986e-06,
|
|
"loss": 0.3966,
|
|
"mean_token_accuracy": 0.8629090571776032,
|
|
"num_tokens": 247344382.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 0.41790771484375,
|
|
"epoch": 2.2857142857142856,
|
|
"grad_norm": 0.9480496740892829,
|
|
"learning_rate": 1.4877465473522178e-06,
|
|
"loss": 0.3813,
|
|
"mean_token_accuracy": 0.8640672285109758,
|
|
"num_tokens": 247765672.0,
|
|
"step": 576
|
|
},
|
|
{
|
|
"entropy": 0.42218017578125,
|
|
"epoch": 2.2896825396825395,
|
|
"grad_norm": 0.9704838555740247,
|
|
"learning_rate": 1.4722093728371427e-06,
|
|
"loss": 0.3878,
|
|
"mean_token_accuracy": 0.8612747713923454,
|
|
"num_tokens": 248183306.0,
|
|
"step": 577
|
|
},
|
|
{
|
|
"entropy": 0.410430908203125,
|
|
"epoch": 2.2936507936507935,
|
|
"grad_norm": 0.8533419703585065,
|
|
"learning_rate": 1.4567397370639158e-06,
|
|
"loss": 0.3927,
|
|
"mean_token_accuracy": 0.8615565691143274,
|
|
"num_tokens": 248628378.0,
|
|
"step": 578
|
|
},
|
|
{
|
|
"entropy": 0.41888427734375,
|
|
"epoch": 2.2976190476190474,
|
|
"grad_norm": 0.818324266262677,
|
|
"learning_rate": 1.4413379361951596e-06,
|
|
"loss": 0.4009,
|
|
"mean_token_accuracy": 0.8598908875137568,
|
|
"num_tokens": 249071096.0,
|
|
"step": 579
|
|
},
|
|
{
|
|
"entropy": 0.41949462890625,
|
|
"epoch": 2.3015873015873014,
|
|
"grad_norm": 0.8157937775196074,
|
|
"learning_rate": 1.4260042650948187e-06,
|
|
"loss": 0.3959,
|
|
"mean_token_accuracy": 0.8613967839628458,
|
|
"num_tokens": 249501143.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 0.419769287109375,
|
|
"epoch": 2.3055555555555554,
|
|
"grad_norm": 0.948858831726886,
|
|
"learning_rate": 1.4107390173225045e-06,
|
|
"loss": 0.3945,
|
|
"mean_token_accuracy": 0.8604099499061704,
|
|
"num_tokens": 249948355.0,
|
|
"step": 581
|
|
},
|
|
{
|
|
"entropy": 0.42041015625,
|
|
"epoch": 2.3095238095238093,
|
|
"grad_norm": 0.8758102059030293,
|
|
"learning_rate": 1.395542485127886e-06,
|
|
"loss": 0.388,
|
|
"mean_token_accuracy": 0.8634849116206169,
|
|
"num_tokens": 250356099.0,
|
|
"step": 582
|
|
},
|
|
{
|
|
"entropy": 0.421234130859375,
|
|
"epoch": 2.3134920634920633,
|
|
"grad_norm": 0.8815188369640882,
|
|
"learning_rate": 1.3804149594450816e-06,
|
|
"loss": 0.3919,
|
|
"mean_token_accuracy": 0.8597034253180027,
|
|
"num_tokens": 250775592.0,
|
|
"step": 583
|
|
},
|
|
{
|
|
"entropy": 0.418121337890625,
|
|
"epoch": 2.317460317460317,
|
|
"grad_norm": 0.861023672134407,
|
|
"learning_rate": 1.365356729887099e-06,
|
|
"loss": 0.4,
|
|
"mean_token_accuracy": 0.8603812381625175,
|
|
"num_tokens": 251219125.0,
|
|
"step": 584
|
|
},
|
|
{
|
|
"entropy": 0.415496826171875,
|
|
"epoch": 2.3214285714285716,
|
|
"grad_norm": 0.8641123367226853,
|
|
"learning_rate": 1.3503680847402868e-06,
|
|
"loss": 0.3933,
|
|
"mean_token_accuracy": 0.8616957142949104,
|
|
"num_tokens": 251648861.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 0.41497802734375,
|
|
"epoch": 2.3253968253968256,
|
|
"grad_norm": 0.8154240634747612,
|
|
"learning_rate": 1.3354493109588145e-06,
|
|
"loss": 0.3926,
|
|
"mean_token_accuracy": 0.8618068303912878,
|
|
"num_tokens": 252080434.0,
|
|
"step": 586
|
|
},
|
|
{
|
|
"entropy": 0.417633056640625,
|
|
"epoch": 2.3293650793650795,
|
|
"grad_norm": 0.8354299632421693,
|
|
"learning_rate": 1.320600694159185e-06,
|
|
"loss": 0.3828,
|
|
"mean_token_accuracy": 0.8655170071870089,
|
|
"num_tokens": 252502018.0,
|
|
"step": 587
|
|
},
|
|
{
|
|
"entropy": 0.420166015625,
|
|
"epoch": 2.3333333333333335,
|
|
"grad_norm": 0.9436967025783154,
|
|
"learning_rate": 1.3058225186147572e-06,
|
|
"loss": 0.3957,
|
|
"mean_token_accuracy": 0.8595009902492166,
|
|
"num_tokens": 252923218.0,
|
|
"step": 588
|
|
},
|
|
{
|
|
"entropy": 0.419464111328125,
|
|
"epoch": 2.3373015873015874,
|
|
"grad_norm": 0.8818218399814328,
|
|
"learning_rate": 1.2911150672503098e-06,
|
|
"loss": 0.3867,
|
|
"mean_token_accuracy": 0.8642842434346676,
|
|
"num_tokens": 253337148.0,
|
|
"step": 589
|
|
},
|
|
{
|
|
"entropy": 0.426788330078125,
|
|
"epoch": 2.3412698412698414,
|
|
"grad_norm": 0.8980593730409643,
|
|
"learning_rate": 1.2764786216366236e-06,
|
|
"loss": 0.3988,
|
|
"mean_token_accuracy": 0.8595603117719293,
|
|
"num_tokens": 253761289.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 0.416748046875,
|
|
"epoch": 2.3452380952380953,
|
|
"grad_norm": 0.944966296741567,
|
|
"learning_rate": 1.2619134619850908e-06,
|
|
"loss": 0.3929,
|
|
"mean_token_accuracy": 0.8604479916393757,
|
|
"num_tokens": 254195017.0,
|
|
"step": 591
|
|
},
|
|
{
|
|
"entropy": 0.41595458984375,
|
|
"epoch": 2.3492063492063493,
|
|
"grad_norm": 1.0810430230383554,
|
|
"learning_rate": 1.2474198671423493e-06,
|
|
"loss": 0.3999,
|
|
"mean_token_accuracy": 0.8599454695358872,
|
|
"num_tokens": 254643716.0,
|
|
"step": 592
|
|
},
|
|
{
|
|
"entropy": 0.41754150390625,
|
|
"epoch": 2.3531746031746033,
|
|
"grad_norm": 0.812428846397806,
|
|
"learning_rate": 1.2329981145849468e-06,
|
|
"loss": 0.3977,
|
|
"mean_token_accuracy": 0.8586347484961152,
|
|
"num_tokens": 255069339.0,
|
|
"step": 593
|
|
},
|
|
{
|
|
"entropy": 0.42437744140625,
|
|
"epoch": 2.357142857142857,
|
|
"grad_norm": 0.8302059952828363,
|
|
"learning_rate": 1.2186484804140242e-06,
|
|
"loss": 0.3942,
|
|
"mean_token_accuracy": 0.8609241275116801,
|
|
"num_tokens": 255486573.0,
|
|
"step": 594
|
|
},
|
|
{
|
|
"entropy": 0.415771484375,
|
|
"epoch": 2.361111111111111,
|
|
"grad_norm": 0.8148827903906969,
|
|
"learning_rate": 1.2043712393500355e-06,
|
|
"loss": 0.3876,
|
|
"mean_token_accuracy": 0.8626940259709954,
|
|
"num_tokens": 255913426.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 0.42193603515625,
|
|
"epoch": 2.365079365079365,
|
|
"grad_norm": 1.2289420157864683,
|
|
"learning_rate": 1.1901666647274823e-06,
|
|
"loss": 0.3841,
|
|
"mean_token_accuracy": 0.8637949759140611,
|
|
"num_tokens": 256345326.0,
|
|
"step": 596
|
|
},
|
|
{
|
|
"entropy": 0.416656494140625,
|
|
"epoch": 2.369047619047619,
|
|
"grad_norm": 0.8492473570067233,
|
|
"learning_rate": 1.1760350284896876e-06,
|
|
"loss": 0.388,
|
|
"mean_token_accuracy": 0.864149815402925,
|
|
"num_tokens": 256765173.0,
|
|
"step": 597
|
|
},
|
|
{
|
|
"entropy": 0.418212890625,
|
|
"epoch": 2.373015873015873,
|
|
"grad_norm": 0.7898920278559984,
|
|
"learning_rate": 1.1619766011835832e-06,
|
|
"loss": 0.3797,
|
|
"mean_token_accuracy": 0.8674542000517249,
|
|
"num_tokens": 257185152.0,
|
|
"step": 598
|
|
},
|
|
{
|
|
"entropy": 0.415191650390625,
|
|
"epoch": 2.376984126984127,
|
|
"grad_norm": 0.8450780742867222,
|
|
"learning_rate": 1.1479916519545326e-06,
|
|
"loss": 0.3903,
|
|
"mean_token_accuracy": 0.8624427672475576,
|
|
"num_tokens": 257627732.0,
|
|
"step": 599
|
|
},
|
|
{
|
|
"entropy": 0.414825439453125,
|
|
"epoch": 2.380952380952381,
|
|
"grad_norm": 0.8928696413952878,
|
|
"learning_rate": 1.1340804485411783e-06,
|
|
"loss": 0.3917,
|
|
"mean_token_accuracy": 0.8615064565092325,
|
|
"num_tokens": 258067282.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 0.421417236328125,
|
|
"epoch": 2.384920634920635,
|
|
"grad_norm": 0.9596298099931699,
|
|
"learning_rate": 1.1202432572703176e-06,
|
|
"loss": 0.396,
|
|
"mean_token_accuracy": 0.8607813809067011,
|
|
"num_tokens": 258491168.0,
|
|
"step": 601
|
|
},
|
|
{
|
|
"entropy": 0.412139892578125,
|
|
"epoch": 2.388888888888889,
|
|
"grad_norm": 0.8015642815814561,
|
|
"learning_rate": 1.1064803430518002e-06,
|
|
"loss": 0.3919,
|
|
"mean_token_accuracy": 0.8602419178932905,
|
|
"num_tokens": 258944016.0,
|
|
"step": 602
|
|
},
|
|
{
|
|
"entropy": 0.419189453125,
|
|
"epoch": 2.392857142857143,
|
|
"grad_norm": 0.83893313171213,
|
|
"learning_rate": 1.0927919693734618e-06,
|
|
"loss": 0.3941,
|
|
"mean_token_accuracy": 0.8623963864520192,
|
|
"num_tokens": 259379493.0,
|
|
"step": 603
|
|
},
|
|
{
|
|
"entropy": 0.421142578125,
|
|
"epoch": 2.3968253968253967,
|
|
"grad_norm": 0.806191116058063,
|
|
"learning_rate": 1.0791783982960736e-06,
|
|
"loss": 0.3875,
|
|
"mean_token_accuracy": 0.8618775270879269,
|
|
"num_tokens": 259808268.0,
|
|
"step": 604
|
|
},
|
|
{
|
|
"entropy": 0.412872314453125,
|
|
"epoch": 2.4007936507936507,
|
|
"grad_norm": 0.8986481499489538,
|
|
"learning_rate": 1.0656398904483312e-06,
|
|
"loss": 0.395,
|
|
"mean_token_accuracy": 0.8624038007110357,
|
|
"num_tokens": 260247659.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 0.41680908203125,
|
|
"epoch": 2.4047619047619047,
|
|
"grad_norm": 0.9536388176335355,
|
|
"learning_rate": 1.0521767050218562e-06,
|
|
"loss": 0.4001,
|
|
"mean_token_accuracy": 0.860544073395431,
|
|
"num_tokens": 260684292.0,
|
|
"step": 606
|
|
},
|
|
{
|
|
"entropy": 0.416168212890625,
|
|
"epoch": 2.4087301587301586,
|
|
"grad_norm": 0.8770578300353563,
|
|
"learning_rate": 1.0387890997662443e-06,
|
|
"loss": 0.3945,
|
|
"mean_token_accuracy": 0.8609949657693505,
|
|
"num_tokens": 261121173.0,
|
|
"step": 607
|
|
},
|
|
{
|
|
"entropy": 0.42376708984375,
|
|
"epoch": 2.4126984126984126,
|
|
"grad_norm": 0.8910554686210177,
|
|
"learning_rate": 1.0254773309841277e-06,
|
|
"loss": 0.3967,
|
|
"mean_token_accuracy": 0.8618429079651833,
|
|
"num_tokens": 261555918.0,
|
|
"step": 608
|
|
},
|
|
{
|
|
"entropy": 0.417755126953125,
|
|
"epoch": 2.4166666666666665,
|
|
"grad_norm": 0.777450132911365,
|
|
"learning_rate": 1.012241653526263e-06,
|
|
"loss": 0.3946,
|
|
"mean_token_accuracy": 0.8610922154039145,
|
|
"num_tokens": 262000331.0,
|
|
"step": 609
|
|
},
|
|
{
|
|
"entropy": 0.42059326171875,
|
|
"epoch": 2.4206349206349205,
|
|
"grad_norm": 0.8219463383506274,
|
|
"learning_rate": 9.990823207866578e-07,
|
|
"loss": 0.386,
|
|
"mean_token_accuracy": 0.8632347630336881,
|
|
"num_tokens": 262425946.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 0.42413330078125,
|
|
"epoch": 2.4246031746031744,
|
|
"grad_norm": 0.9964330808029446,
|
|
"learning_rate": 9.85999584697716e-07,
|
|
"loss": 0.3892,
|
|
"mean_token_accuracy": 0.8625029819086194,
|
|
"num_tokens": 262853210.0,
|
|
"step": 611
|
|
},
|
|
{
|
|
"entropy": 0.42291259765625,
|
|
"epoch": 2.4285714285714284,
|
|
"grad_norm": 0.8278237327212594,
|
|
"learning_rate": 9.729936957254165e-07,
|
|
"loss": 0.3822,
|
|
"mean_token_accuracy": 0.864779950119555,
|
|
"num_tokens": 263268966.0,
|
|
"step": 612
|
|
},
|
|
{
|
|
"entropy": 0.41943359375,
|
|
"epoch": 2.432539682539683,
|
|
"grad_norm": 0.9884237647568829,
|
|
"learning_rate": 9.600649028645215e-07,
|
|
"loss": 0.3933,
|
|
"mean_token_accuracy": 0.8612792957574129,
|
|
"num_tokens": 263709583.0,
|
|
"step": 613
|
|
},
|
|
{
|
|
"entropy": 0.418853759765625,
|
|
"epoch": 2.4365079365079367,
|
|
"grad_norm": 0.9015756745222828,
|
|
"learning_rate": 9.472134536338007e-07,
|
|
"loss": 0.3859,
|
|
"mean_token_accuracy": 0.8643078990280628,
|
|
"num_tokens": 264137961.0,
|
|
"step": 614
|
|
},
|
|
{
|
|
"entropy": 0.411834716796875,
|
|
"epoch": 2.4404761904761907,
|
|
"grad_norm": 0.8083110921800731,
|
|
"learning_rate": 9.344395940713009e-07,
|
|
"loss": 0.3905,
|
|
"mean_token_accuracy": 0.8626386728137732,
|
|
"num_tokens": 264579578.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 0.42529296875,
|
|
"epoch": 2.4444444444444446,
|
|
"grad_norm": 0.8539196851499317,
|
|
"learning_rate": 9.217435687296305e-07,
|
|
"loss": 0.3889,
|
|
"mean_token_accuracy": 0.8617231827229261,
|
|
"num_tokens": 264995910.0,
|
|
"step": 616
|
|
},
|
|
{
|
|
"entropy": 0.4136962890625,
|
|
"epoch": 2.4484126984126986,
|
|
"grad_norm": 0.7995502674271355,
|
|
"learning_rate": 9.091256206712812e-07,
|
|
"loss": 0.3925,
|
|
"mean_token_accuracy": 0.8612663270905614,
|
|
"num_tokens": 265440836.0,
|
|
"step": 617
|
|
},
|
|
{
|
|
"entropy": 0.42041015625,
|
|
"epoch": 2.4523809523809526,
|
|
"grad_norm": 0.8157462797571775,
|
|
"learning_rate": 8.965859914639724e-07,
|
|
"loss": 0.3832,
|
|
"mean_token_accuracy": 0.8630803981795907,
|
|
"num_tokens": 265867518.0,
|
|
"step": 618
|
|
},
|
|
{
|
|
"entropy": 0.420501708984375,
|
|
"epoch": 2.4563492063492065,
|
|
"grad_norm": 0.9573151028277197,
|
|
"learning_rate": 8.841249211760272e-07,
|
|
"loss": 0.4006,
|
|
"mean_token_accuracy": 0.8605411788448691,
|
|
"num_tokens": 266304569.0,
|
|
"step": 619
|
|
},
|
|
{
|
|
"entropy": 0.416839599609375,
|
|
"epoch": 2.4603174603174605,
|
|
"grad_norm": 0.8389660650593388,
|
|
"learning_rate": 8.717426483717762e-07,
|
|
"loss": 0.3843,
|
|
"mean_token_accuracy": 0.8629998695105314,
|
|
"num_tokens": 266730039.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 0.4168701171875,
|
|
"epoch": 2.4642857142857144,
|
|
"grad_norm": 0.8408327374770237,
|
|
"learning_rate": 8.594394101069897e-07,
|
|
"loss": 0.4009,
|
|
"mean_token_accuracy": 0.8605172112584114,
|
|
"num_tokens": 267169815.0,
|
|
"step": 621
|
|
},
|
|
{
|
|
"entropy": 0.41571044921875,
|
|
"epoch": 2.4682539682539684,
|
|
"grad_norm": 0.8011141591258287,
|
|
"learning_rate": 8.472154419243411e-07,
|
|
"loss": 0.3918,
|
|
"mean_token_accuracy": 0.8619374986737967,
|
|
"num_tokens": 267605673.0,
|
|
"step": 622
|
|
},
|
|
{
|
|
"entropy": 0.41705322265625,
|
|
"epoch": 2.4722222222222223,
|
|
"grad_norm": 0.8434082500134104,
|
|
"learning_rate": 8.350709778488941e-07,
|
|
"loss": 0.4014,
|
|
"mean_token_accuracy": 0.8600445203483105,
|
|
"num_tokens": 268044360.0,
|
|
"step": 623
|
|
},
|
|
{
|
|
"entropy": 0.41815185546875,
|
|
"epoch": 2.4761904761904763,
|
|
"grad_norm": 0.8019659782743609,
|
|
"learning_rate": 8.230062503836278e-07,
|
|
"loss": 0.3937,
|
|
"mean_token_accuracy": 0.8604294890537858,
|
|
"num_tokens": 268470812.0,
|
|
"step": 624
|
|
},
|
|
{
|
|
"entropy": 0.417449951171875,
|
|
"epoch": 2.4801587301587302,
|
|
"grad_norm": 0.8264347297639569,
|
|
"learning_rate": 8.110214905049802e-07,
|
|
"loss": 0.3965,
|
|
"mean_token_accuracy": 0.8575309114530683,
|
|
"num_tokens": 268895281.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 0.414459228515625,
|
|
"epoch": 2.484126984126984,
|
|
"grad_norm": 0.7888506972306255,
|
|
"learning_rate": 7.991169276584281e-07,
|
|
"loss": 0.3807,
|
|
"mean_token_accuracy": 0.8645908059552312,
|
|
"num_tokens": 269329768.0,
|
|
"step": 626
|
|
},
|
|
{
|
|
"entropy": 0.41363525390625,
|
|
"epoch": 2.488095238095238,
|
|
"grad_norm": 0.8377743907107998,
|
|
"learning_rate": 7.872927897540944e-07,
|
|
"loss": 0.3948,
|
|
"mean_token_accuracy": 0.8611715780571103,
|
|
"num_tokens": 269763538.0,
|
|
"step": 627
|
|
},
|
|
{
|
|
"entropy": 0.419525146484375,
|
|
"epoch": 2.492063492063492,
|
|
"grad_norm": 0.7713110745405427,
|
|
"learning_rate": 7.75549303162384e-07,
|
|
"loss": 0.3945,
|
|
"mean_token_accuracy": 0.8595996387302876,
|
|
"num_tokens": 270192672.0,
|
|
"step": 628
|
|
},
|
|
{
|
|
"entropy": 0.418792724609375,
|
|
"epoch": 2.496031746031746,
|
|
"grad_norm": 0.8447629896166373,
|
|
"learning_rate": 7.638866927096555e-07,
|
|
"loss": 0.4074,
|
|
"mean_token_accuracy": 0.8587245307862759,
|
|
"num_tokens": 270633240.0,
|
|
"step": 629
|
|
},
|
|
{
|
|
"entropy": 0.419891357421875,
|
|
"epoch": 2.5,
|
|
"grad_norm": 0.7852993278058601,
|
|
"learning_rate": 7.523051816739074e-07,
|
|
"loss": 0.3859,
|
|
"mean_token_accuracy": 0.8630366576835513,
|
|
"num_tokens": 271053623.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 0.41510009765625,
|
|
"epoch": 2.503968253968254,
|
|
"grad_norm": 0.8426473805113363,
|
|
"learning_rate": 7.408049917805104e-07,
|
|
"loss": 0.3881,
|
|
"mean_token_accuracy": 0.8630319554358721,
|
|
"num_tokens": 271492583.0,
|
|
"step": 631
|
|
},
|
|
{
|
|
"entropy": 0.41632080078125,
|
|
"epoch": 2.507936507936508,
|
|
"grad_norm": 0.8529237472508443,
|
|
"learning_rate": 7.293863431979619e-07,
|
|
"loss": 0.395,
|
|
"mean_token_accuracy": 0.861218343488872,
|
|
"num_tokens": 271921985.0,
|
|
"step": 632
|
|
},
|
|
{
|
|
"entropy": 0.42822265625,
|
|
"epoch": 2.511904761904762,
|
|
"grad_norm": 0.7740038021053262,
|
|
"learning_rate": 7.180494545336642e-07,
|
|
"loss": 0.3874,
|
|
"mean_token_accuracy": 0.8652349133044481,
|
|
"num_tokens": 272349367.0,
|
|
"step": 633
|
|
},
|
|
{
|
|
"entropy": 0.426361083984375,
|
|
"epoch": 2.515873015873016,
|
|
"grad_norm": 0.9109105967855416,
|
|
"learning_rate": 7.067945428297524e-07,
|
|
"loss": 0.3976,
|
|
"mean_token_accuracy": 0.8593434160575271,
|
|
"num_tokens": 272757706.0,
|
|
"step": 634
|
|
},
|
|
{
|
|
"entropy": 0.424713134765625,
|
|
"epoch": 2.5198412698412698,
|
|
"grad_norm": 0.8510388770912337,
|
|
"learning_rate": 6.956218235589263e-07,
|
|
"loss": 0.3872,
|
|
"mean_token_accuracy": 0.8625729326158762,
|
|
"num_tokens": 273178323.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 0.420318603515625,
|
|
"epoch": 2.5238095238095237,
|
|
"grad_norm": 0.8277629227526272,
|
|
"learning_rate": 6.845315106203327e-07,
|
|
"loss": 0.3868,
|
|
"mean_token_accuracy": 0.8626482058316469,
|
|
"num_tokens": 273603268.0,
|
|
"step": 636
|
|
},
|
|
{
|
|
"entropy": 0.418853759765625,
|
|
"epoch": 2.5277777777777777,
|
|
"grad_norm": 0.8202191768752707,
|
|
"learning_rate": 6.735238163354669e-07,
|
|
"loss": 0.3847,
|
|
"mean_token_accuracy": 0.8641904015094042,
|
|
"num_tokens": 274036335.0,
|
|
"step": 637
|
|
},
|
|
{
|
|
"entropy": 0.418914794921875,
|
|
"epoch": 2.5317460317460316,
|
|
"grad_norm": 0.8647875520943077,
|
|
"learning_rate": 6.625989514441089e-07,
|
|
"loss": 0.3925,
|
|
"mean_token_accuracy": 0.8626054916530848,
|
|
"num_tokens": 274458735.0,
|
|
"step": 638
|
|
},
|
|
{
|
|
"entropy": 0.412353515625,
|
|
"epoch": 2.5357142857142856,
|
|
"grad_norm": 0.7982027347968378,
|
|
"learning_rate": 6.517571251002896e-07,
|
|
"loss": 0.393,
|
|
"mean_token_accuracy": 0.8614260852336884,
|
|
"num_tokens": 274909982.0,
|
|
"step": 639
|
|
},
|
|
{
|
|
"entropy": 0.42431640625,
|
|
"epoch": 2.5396825396825395,
|
|
"grad_norm": 0.8307645924294975,
|
|
"learning_rate": 6.40998544868287e-07,
|
|
"loss": 0.3889,
|
|
"mean_token_accuracy": 0.8601001044735312,
|
|
"num_tokens": 275320028.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 0.417816162109375,
|
|
"epoch": 2.5436507936507935,
|
|
"grad_norm": 0.8430698509853944,
|
|
"learning_rate": 6.3032341671865e-07,
|
|
"loss": 0.386,
|
|
"mean_token_accuracy": 0.8654862614348531,
|
|
"num_tokens": 275726848.0,
|
|
"step": 641
|
|
},
|
|
{
|
|
"entropy": 0.413848876953125,
|
|
"epoch": 2.5476190476190474,
|
|
"grad_norm": 0.8421768209102014,
|
|
"learning_rate": 6.197319450242562e-07,
|
|
"loss": 0.3867,
|
|
"mean_token_accuracy": 0.8631602311506867,
|
|
"num_tokens": 276151262.0,
|
|
"step": 642
|
|
},
|
|
{
|
|
"entropy": 0.417266845703125,
|
|
"epoch": 2.5515873015873014,
|
|
"grad_norm": 0.8929748589387052,
|
|
"learning_rate": 6.092243325564007e-07,
|
|
"loss": 0.3924,
|
|
"mean_token_accuracy": 0.8615100616589189,
|
|
"num_tokens": 276568860.0,
|
|
"step": 643
|
|
},
|
|
{
|
|
"entropy": 0.41387939453125,
|
|
"epoch": 2.5555555555555554,
|
|
"grad_norm": 0.8040672513690313,
|
|
"learning_rate": 5.98800780480912e-07,
|
|
"loss": 0.3858,
|
|
"mean_token_accuracy": 0.8625959139317274,
|
|
"num_tokens": 276997327.0,
|
|
"step": 644
|
|
},
|
|
{
|
|
"entropy": 0.421234130859375,
|
|
"epoch": 2.5595238095238093,
|
|
"grad_norm": 0.7855164537605119,
|
|
"learning_rate": 5.884614883543027e-07,
|
|
"loss": 0.394,
|
|
"mean_token_accuracy": 0.8626839118078351,
|
|
"num_tokens": 277426196.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 0.4188232421875,
|
|
"epoch": 2.5634920634920633,
|
|
"grad_norm": 0.7843681767955034,
|
|
"learning_rate": 5.782066541199471e-07,
|
|
"loss": 0.3946,
|
|
"mean_token_accuracy": 0.8629313539713621,
|
|
"num_tokens": 277849848.0,
|
|
"step": 646
|
|
},
|
|
{
|
|
"entropy": 0.412078857421875,
|
|
"epoch": 2.567460317460317,
|
|
"grad_norm": 0.8561623782562832,
|
|
"learning_rate": 5.680364741042926e-07,
|
|
"loss": 0.3811,
|
|
"mean_token_accuracy": 0.8668704703450203,
|
|
"num_tokens": 278289888.0,
|
|
"step": 647
|
|
},
|
|
{
|
|
"entropy": 0.414276123046875,
|
|
"epoch": 2.571428571428571,
|
|
"grad_norm": 0.8147935679041525,
|
|
"learning_rate": 5.579511430131018e-07,
|
|
"loss": 0.3872,
|
|
"mean_token_accuracy": 0.8630826137959957,
|
|
"num_tokens": 278726761.0,
|
|
"step": 648
|
|
},
|
|
{
|
|
"entropy": 0.418182373046875,
|
|
"epoch": 2.575396825396825,
|
|
"grad_norm": 0.796874369891308,
|
|
"learning_rate": 5.479508539277229e-07,
|
|
"loss": 0.3801,
|
|
"mean_token_accuracy": 0.8660026481375098,
|
|
"num_tokens": 279136759.0,
|
|
"step": 649
|
|
},
|
|
{
|
|
"entropy": 0.416351318359375,
|
|
"epoch": 2.5793650793650795,
|
|
"grad_norm": 0.8223574515325844,
|
|
"learning_rate": 5.380357983013962e-07,
|
|
"loss": 0.392,
|
|
"mean_token_accuracy": 0.8621972808614373,
|
|
"num_tokens": 279572082.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 0.415252685546875,
|
|
"epoch": 2.5833333333333335,
|
|
"grad_norm": 0.8419256563918806,
|
|
"learning_rate": 5.282061659555854e-07,
|
|
"loss": 0.3957,
|
|
"mean_token_accuracy": 0.8606690457090735,
|
|
"num_tokens": 279994957.0,
|
|
"step": 651
|
|
},
|
|
{
|
|
"entropy": 0.415008544921875,
|
|
"epoch": 2.5873015873015874,
|
|
"grad_norm": 0.8001543694338792,
|
|
"learning_rate": 5.184621450763455e-07,
|
|
"loss": 0.3819,
|
|
"mean_token_accuracy": 0.8638613997027278,
|
|
"num_tokens": 280414468.0,
|
|
"step": 652
|
|
},
|
|
{
|
|
"entropy": 0.41876220703125,
|
|
"epoch": 2.5912698412698414,
|
|
"grad_norm": 0.8281488407232048,
|
|
"learning_rate": 5.088039222107205e-07,
|
|
"loss": 0.405,
|
|
"mean_token_accuracy": 0.8599689844995737,
|
|
"num_tokens": 280832145.0,
|
|
"step": 653
|
|
},
|
|
{
|
|
"entropy": 0.420440673828125,
|
|
"epoch": 2.5952380952380953,
|
|
"grad_norm": 0.8401133410984405,
|
|
"learning_rate": 4.992316822631693e-07,
|
|
"loss": 0.3815,
|
|
"mean_token_accuracy": 0.8656142996624112,
|
|
"num_tokens": 281237288.0,
|
|
"step": 654
|
|
},
|
|
{
|
|
"entropy": 0.412689208984375,
|
|
"epoch": 2.5992063492063493,
|
|
"grad_norm": 0.806223122436009,
|
|
"learning_rate": 4.897456084920282e-07,
|
|
"loss": 0.3862,
|
|
"mean_token_accuracy": 0.8658296698704362,
|
|
"num_tokens": 281692258.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 0.416168212890625,
|
|
"epoch": 2.6031746031746033,
|
|
"grad_norm": 0.8396062477724346,
|
|
"learning_rate": 4.803458825060042e-07,
|
|
"loss": 0.3763,
|
|
"mean_token_accuracy": 0.8662013709545135,
|
|
"num_tokens": 282118057.0,
|
|
"step": 656
|
|
},
|
|
{
|
|
"entropy": 0.412261962890625,
|
|
"epoch": 2.607142857142857,
|
|
"grad_norm": 0.825509139511018,
|
|
"learning_rate": 4.710326842606927e-07,
|
|
"loss": 0.3987,
|
|
"mean_token_accuracy": 0.8584959087893367,
|
|
"num_tokens": 282582066.0,
|
|
"step": 657
|
|
},
|
|
{
|
|
"entropy": 0.40606689453125,
|
|
"epoch": 2.611111111111111,
|
|
"grad_norm": 1.080095799468803,
|
|
"learning_rate": 4.618061920551381e-07,
|
|
"loss": 0.3936,
|
|
"mean_token_accuracy": 0.8631810490041971,
|
|
"num_tokens": 283028330.0,
|
|
"step": 658
|
|
},
|
|
{
|
|
"entropy": 0.42547607421875,
|
|
"epoch": 2.615079365079365,
|
|
"grad_norm": 0.8441240019764062,
|
|
"learning_rate": 4.526665825284132e-07,
|
|
"loss": 0.3936,
|
|
"mean_token_accuracy": 0.8619779404252768,
|
|
"num_tokens": 283436768.0,
|
|
"step": 659
|
|
},
|
|
{
|
|
"entropy": 0.41748046875,
|
|
"epoch": 2.619047619047619,
|
|
"grad_norm": 0.8263929571280181,
|
|
"learning_rate": 4.4361403065624475e-07,
|
|
"loss": 0.3864,
|
|
"mean_token_accuracy": 0.8627992533147335,
|
|
"num_tokens": 283866607.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 0.4234619140625,
|
|
"epoch": 2.623015873015873,
|
|
"grad_norm": 0.844367472303199,
|
|
"learning_rate": 4.3464870974766314e-07,
|
|
"loss": 0.4004,
|
|
"mean_token_accuracy": 0.8607617728412151,
|
|
"num_tokens": 284281791.0,
|
|
"step": 661
|
|
},
|
|
{
|
|
"entropy": 0.419158935546875,
|
|
"epoch": 2.626984126984127,
|
|
"grad_norm": 0.8571993055017914,
|
|
"learning_rate": 4.257707914416781e-07,
|
|
"loss": 0.3874,
|
|
"mean_token_accuracy": 0.8635092154145241,
|
|
"num_tokens": 284705319.0,
|
|
"step": 662
|
|
},
|
|
{
|
|
"entropy": 0.417938232421875,
|
|
"epoch": 2.630952380952381,
|
|
"grad_norm": 0.7780232105885654,
|
|
"learning_rate": 4.169804457039972e-07,
|
|
"loss": 0.4086,
|
|
"mean_token_accuracy": 0.8589063184335828,
|
|
"num_tokens": 285154313.0,
|
|
"step": 663
|
|
},
|
|
{
|
|
"entropy": 0.413238525390625,
|
|
"epoch": 2.634920634920635,
|
|
"grad_norm": 0.850893830182736,
|
|
"learning_rate": 4.082778408237731e-07,
|
|
"loss": 0.4007,
|
|
"mean_token_accuracy": 0.8592528942972422,
|
|
"num_tokens": 285598883.0,
|
|
"step": 664
|
|
},
|
|
{
|
|
"entropy": 0.418487548828125,
|
|
"epoch": 2.638888888888889,
|
|
"grad_norm": 1.1283744707185912,
|
|
"learning_rate": 3.996631434103776e-07,
|
|
"loss": 0.3977,
|
|
"mean_token_accuracy": 0.860667590983212,
|
|
"num_tokens": 286037660.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 0.416961669921875,
|
|
"epoch": 2.642857142857143,
|
|
"grad_norm": 0.8944770514716363,
|
|
"learning_rate": 3.911365183902166e-07,
|
|
"loss": 0.3898,
|
|
"mean_token_accuracy": 0.8620567666366696,
|
|
"num_tokens": 286461446.0,
|
|
"step": 666
|
|
},
|
|
{
|
|
"entropy": 0.419219970703125,
|
|
"epoch": 2.6468253968253967,
|
|
"grad_norm": 0.845344585577405,
|
|
"learning_rate": 3.826981290035692e-07,
|
|
"loss": 0.3898,
|
|
"mean_token_accuracy": 0.860666748136282,
|
|
"num_tokens": 286877023.0,
|
|
"step": 667
|
|
},
|
|
{
|
|
"entropy": 0.422149658203125,
|
|
"epoch": 2.6507936507936507,
|
|
"grad_norm": 0.8457306735031688,
|
|
"learning_rate": 3.7434813680146234e-07,
|
|
"loss": 0.3895,
|
|
"mean_token_accuracy": 0.8613977544009686,
|
|
"num_tokens": 287308399.0,
|
|
"step": 668
|
|
},
|
|
{
|
|
"entropy": 0.412872314453125,
|
|
"epoch": 2.6547619047619047,
|
|
"grad_norm": 0.7957237567868245,
|
|
"learning_rate": 3.6608670164258065e-07,
|
|
"loss": 0.3906,
|
|
"mean_token_accuracy": 0.8631431749090552,
|
|
"num_tokens": 287728804.0,
|
|
"step": 669
|
|
},
|
|
{
|
|
"entropy": 0.411468505859375,
|
|
"epoch": 2.6587301587301586,
|
|
"grad_norm": 0.7621184623535802,
|
|
"learning_rate": 3.5791398169020384e-07,
|
|
"loss": 0.393,
|
|
"mean_token_accuracy": 0.8615291966125369,
|
|
"num_tokens": 288187832.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 0.417144775390625,
|
|
"epoch": 2.6626984126984126,
|
|
"grad_norm": 0.8055399962635597,
|
|
"learning_rate": 3.4983013340918024e-07,
|
|
"loss": 0.3834,
|
|
"mean_token_accuracy": 0.8645481085404754,
|
|
"num_tokens": 288600411.0,
|
|
"step": 671
|
|
},
|
|
{
|
|
"entropy": 0.410888671875,
|
|
"epoch": 2.6666666666666665,
|
|
"grad_norm": 0.8440468660994543,
|
|
"learning_rate": 3.4183531156292913e-07,
|
|
"loss": 0.394,
|
|
"mean_token_accuracy": 0.8628778494894505,
|
|
"num_tokens": 289047051.0,
|
|
"step": 672
|
|
},
|
|
{
|
|
"entropy": 0.417388916015625,
|
|
"epoch": 2.6706349206349205,
|
|
"grad_norm": 0.8448761260472664,
|
|
"learning_rate": 3.3392966921047984e-07,
|
|
"loss": 0.3932,
|
|
"mean_token_accuracy": 0.8621304808184505,
|
|
"num_tokens": 289478039.0,
|
|
"step": 673
|
|
},
|
|
{
|
|
"entropy": 0.4195556640625,
|
|
"epoch": 2.674603174603175,
|
|
"grad_norm": 0.826243737430181,
|
|
"learning_rate": 3.261133577035408e-07,
|
|
"loss": 0.3992,
|
|
"mean_token_accuracy": 0.8631375981494784,
|
|
"num_tokens": 289920851.0,
|
|
"step": 674
|
|
},
|
|
{
|
|
"entropy": 0.41644287109375,
|
|
"epoch": 2.678571428571429,
|
|
"grad_norm": 0.7480379642047055,
|
|
"learning_rate": 3.1838652668360173e-07,
|
|
"loss": 0.3834,
|
|
"mean_token_accuracy": 0.8634974956512451,
|
|
"num_tokens": 290351325.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 0.4146728515625,
|
|
"epoch": 2.682539682539683,
|
|
"grad_norm": 0.7830053460618754,
|
|
"learning_rate": 3.1074932407906823e-07,
|
|
"loss": 0.3785,
|
|
"mean_token_accuracy": 0.8657077318057418,
|
|
"num_tokens": 290766931.0,
|
|
"step": 676
|
|
},
|
|
{
|
|
"entropy": 0.423828125,
|
|
"epoch": 2.6865079365079367,
|
|
"grad_norm": 0.7864820739930504,
|
|
"learning_rate": 3.0320189610243303e-07,
|
|
"loss": 0.3935,
|
|
"mean_token_accuracy": 0.8595830434933305,
|
|
"num_tokens": 291185306.0,
|
|
"step": 677
|
|
},
|
|
{
|
|
"entropy": 0.422698974609375,
|
|
"epoch": 2.6904761904761907,
|
|
"grad_norm": 0.7974086017120517,
|
|
"learning_rate": 2.957443872474713e-07,
|
|
"loss": 0.3873,
|
|
"mean_token_accuracy": 0.8635625531896949,
|
|
"num_tokens": 291599836.0,
|
|
"step": 678
|
|
},
|
|
{
|
|
"entropy": 0.4146728515625,
|
|
"epoch": 2.6944444444444446,
|
|
"grad_norm": 0.9412910487910857,
|
|
"learning_rate": 2.883769402864789e-07,
|
|
"loss": 0.4001,
|
|
"mean_token_accuracy": 0.8598026670515537,
|
|
"num_tokens": 292026507.0,
|
|
"step": 679
|
|
},
|
|
{
|
|
"entropy": 0.41259765625,
|
|
"epoch": 2.6984126984126986,
|
|
"grad_norm": 0.763447905049642,
|
|
"learning_rate": 2.810996962675361e-07,
|
|
"loss": 0.3903,
|
|
"mean_token_accuracy": 0.8622291041538119,
|
|
"num_tokens": 292454972.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 0.419525146484375,
|
|
"epoch": 2.7023809523809526,
|
|
"grad_norm": 0.7897795759262028,
|
|
"learning_rate": 2.739127945118092e-07,
|
|
"loss": 0.3983,
|
|
"mean_token_accuracy": 0.8589327791705728,
|
|
"num_tokens": 292885705.0,
|
|
"step": 681
|
|
},
|
|
{
|
|
"entropy": 0.42181396484375,
|
|
"epoch": 2.7063492063492065,
|
|
"grad_norm": 0.7799046098775175,
|
|
"learning_rate": 2.668163726108841e-07,
|
|
"loss": 0.3786,
|
|
"mean_token_accuracy": 0.8630655352026224,
|
|
"num_tokens": 293307675.0,
|
|
"step": 682
|
|
},
|
|
{
|
|
"entropy": 0.418853759765625,
|
|
"epoch": 2.7103174603174605,
|
|
"grad_norm": 0.8041790844592746,
|
|
"learning_rate": 2.5981056642412796e-07,
|
|
"loss": 0.3934,
|
|
"mean_token_accuracy": 0.8626653142273426,
|
|
"num_tokens": 293722148.0,
|
|
"step": 683
|
|
},
|
|
{
|
|
"entropy": 0.41864013671875,
|
|
"epoch": 2.7142857142857144,
|
|
"grad_norm": 0.8076107515114371,
|
|
"learning_rate": 2.528955100760938e-07,
|
|
"loss": 0.3858,
|
|
"mean_token_accuracy": 0.863671412691474,
|
|
"num_tokens": 294149752.0,
|
|
"step": 684
|
|
},
|
|
{
|
|
"entropy": 0.422821044921875,
|
|
"epoch": 2.7182539682539684,
|
|
"grad_norm": 0.7969449363252592,
|
|
"learning_rate": 2.460713359539474e-07,
|
|
"loss": 0.3801,
|
|
"mean_token_accuracy": 0.8654317120090127,
|
|
"num_tokens": 294555288.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 0.419830322265625,
|
|
"epoch": 2.7222222222222223,
|
|
"grad_norm": 0.8753720898977475,
|
|
"learning_rate": 2.3933817470493445e-07,
|
|
"loss": 0.3767,
|
|
"mean_token_accuracy": 0.866040863096714,
|
|
"num_tokens": 294975614.0,
|
|
"step": 686
|
|
},
|
|
{
|
|
"entropy": 0.412994384765625,
|
|
"epoch": 2.7261904761904763,
|
|
"grad_norm": 0.8199718663106975,
|
|
"learning_rate": 2.3269615523388355e-07,
|
|
"loss": 0.3918,
|
|
"mean_token_accuracy": 0.860607554204762,
|
|
"num_tokens": 295422071.0,
|
|
"step": 687
|
|
},
|
|
{
|
|
"entropy": 0.41558837890625,
|
|
"epoch": 2.7301587301587302,
|
|
"grad_norm": 0.79423481793127,
|
|
"learning_rate": 2.2614540470073276e-07,
|
|
"loss": 0.3866,
|
|
"mean_token_accuracy": 0.8644118411466479,
|
|
"num_tokens": 295846874.0,
|
|
"step": 688
|
|
},
|
|
{
|
|
"entropy": 0.41741943359375,
|
|
"epoch": 2.734126984126984,
|
|
"grad_norm": 0.8822693061766851,
|
|
"learning_rate": 2.1968604851809738e-07,
|
|
"loss": 0.3866,
|
|
"mean_token_accuracy": 0.8631517272442579,
|
|
"num_tokens": 296288822.0,
|
|
"step": 689
|
|
},
|
|
{
|
|
"entropy": 0.410980224609375,
|
|
"epoch": 2.738095238095238,
|
|
"grad_norm": 0.7846498188049137,
|
|
"learning_rate": 2.1331821034886846e-07,
|
|
"loss": 0.3943,
|
|
"mean_token_accuracy": 0.8625091454014182,
|
|
"num_tokens": 296730922.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 0.41143798828125,
|
|
"epoch": 2.742063492063492,
|
|
"grad_norm": 0.8213197375371964,
|
|
"learning_rate": 2.0704201210384634e-07,
|
|
"loss": 0.3904,
|
|
"mean_token_accuracy": 0.864051777869463,
|
|
"num_tokens": 297169723.0,
|
|
"step": 691
|
|
},
|
|
{
|
|
"entropy": 0.414337158203125,
|
|
"epoch": 2.746031746031746,
|
|
"grad_norm": 1.2967707502021062,
|
|
"learning_rate": 2.0085757393940586e-07,
|
|
"loss": 0.3772,
|
|
"mean_token_accuracy": 0.8671101154759526,
|
|
"num_tokens": 297610941.0,
|
|
"step": 692
|
|
},
|
|
{
|
|
"entropy": 0.41839599609375,
|
|
"epoch": 2.75,
|
|
"grad_norm": 0.8133921789474006,
|
|
"learning_rate": 1.9476501425519656e-07,
|
|
"loss": 0.3833,
|
|
"mean_token_accuracy": 0.860652013681829,
|
|
"num_tokens": 298044879.0,
|
|
"step": 693
|
|
},
|
|
{
|
|
"entropy": 0.416778564453125,
|
|
"epoch": 2.753968253968254,
|
|
"grad_norm": 0.7897145964186679,
|
|
"learning_rate": 1.8876444969187557e-07,
|
|
"loss": 0.3857,
|
|
"mean_token_accuracy": 0.8620300153270364,
|
|
"num_tokens": 298464464.0,
|
|
"step": 694
|
|
},
|
|
{
|
|
"entropy": 0.40887451171875,
|
|
"epoch": 2.757936507936508,
|
|
"grad_norm": 0.9000724609100704,
|
|
"learning_rate": 1.828559951288733e-07,
|
|
"loss": 0.3831,
|
|
"mean_token_accuracy": 0.8644989216700196,
|
|
"num_tokens": 298903233.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 0.41595458984375,
|
|
"epoch": 2.761904761904762,
|
|
"grad_norm": 0.7955961913020823,
|
|
"learning_rate": 1.7703976368219633e-07,
|
|
"loss": 0.3797,
|
|
"mean_token_accuracy": 0.8666205117478967,
|
|
"num_tokens": 299315956.0,
|
|
"step": 696
|
|
},
|
|
{
|
|
"entropy": 0.42193603515625,
|
|
"epoch": 2.765873015873016,
|
|
"grad_norm": 0.8878478545913352,
|
|
"learning_rate": 1.713158667022613e-07,
|
|
"loss": 0.3812,
|
|
"mean_token_accuracy": 0.8661440145224333,
|
|
"num_tokens": 299732237.0,
|
|
"step": 697
|
|
},
|
|
{
|
|
"entropy": 0.4156494140625,
|
|
"epoch": 2.7698412698412698,
|
|
"grad_norm": 0.931222693724496,
|
|
"learning_rate": 1.656844137717617e-07,
|
|
"loss": 0.3924,
|
|
"mean_token_accuracy": 0.8617311324924231,
|
|
"num_tokens": 300162540.0,
|
|
"step": 698
|
|
},
|
|
{
|
|
"entropy": 0.42425537109375,
|
|
"epoch": 2.7738095238095237,
|
|
"grad_norm": 0.8389789359858054,
|
|
"learning_rate": 1.601455127035717e-07,
|
|
"loss": 0.3901,
|
|
"mean_token_accuracy": 0.8636501645669341,
|
|
"num_tokens": 300580263.0,
|
|
"step": 699
|
|
},
|
|
{
|
|
"entropy": 0.41375732421875,
|
|
"epoch": 2.7777777777777777,
|
|
"grad_norm": 0.8084246826822826,
|
|
"learning_rate": 1.5469926953868063e-07,
|
|
"loss": 0.3786,
|
|
"mean_token_accuracy": 0.8661916004493833,
|
|
"num_tokens": 301009855.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 0.421539306640625,
|
|
"epoch": 2.7817460317460316,
|
|
"grad_norm": 0.7920810255445628,
|
|
"learning_rate": 1.4934578854416403e-07,
|
|
"loss": 0.3793,
|
|
"mean_token_accuracy": 0.8652064045891166,
|
|
"num_tokens": 301429836.0,
|
|
"step": 701
|
|
},
|
|
{
|
|
"entropy": 0.417236328125,
|
|
"epoch": 2.7857142857142856,
|
|
"grad_norm": 0.7557065694262733,
|
|
"learning_rate": 1.440851722111858e-07,
|
|
"loss": 0.3775,
|
|
"mean_token_accuracy": 0.8666085209697485,
|
|
"num_tokens": 301852351.0,
|
|
"step": 702
|
|
},
|
|
{
|
|
"entropy": 0.418304443359375,
|
|
"epoch": 2.7896825396825395,
|
|
"grad_norm": 0.8074299266071946,
|
|
"learning_rate": 1.389175212530397e-07,
|
|
"loss": 0.3787,
|
|
"mean_token_accuracy": 0.8652766114100814,
|
|
"num_tokens": 302270241.0,
|
|
"step": 703
|
|
},
|
|
{
|
|
"entropy": 0.41082763671875,
|
|
"epoch": 2.7936507936507935,
|
|
"grad_norm": 0.8030020009685229,
|
|
"learning_rate": 1.3384293460321662e-07,
|
|
"loss": 0.3838,
|
|
"mean_token_accuracy": 0.8642606223002076,
|
|
"num_tokens": 302702189.0,
|
|
"step": 704
|
|
},
|
|
{
|
|
"entropy": 0.416748046875,
|
|
"epoch": 2.7976190476190474,
|
|
"grad_norm": 0.7647096183455645,
|
|
"learning_rate": 1.2886150941351317e-07,
|
|
"loss": 0.3778,
|
|
"mean_token_accuracy": 0.866962157189846,
|
|
"num_tokens": 303138996.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 0.422882080078125,
|
|
"epoch": 2.8015873015873014,
|
|
"grad_norm": 0.8421868959580608,
|
|
"learning_rate": 1.2397334105217097e-07,
|
|
"loss": 0.3868,
|
|
"mean_token_accuracy": 0.8634527139365673,
|
|
"num_tokens": 303546117.0,
|
|
"step": 706
|
|
},
|
|
{
|
|
"entropy": 0.4114990234375,
|
|
"epoch": 2.8055555555555554,
|
|
"grad_norm": 0.7836818266413413,
|
|
"learning_rate": 1.1917852310205147e-07,
|
|
"loss": 0.3866,
|
|
"mean_token_accuracy": 0.8666229834780097,
|
|
"num_tokens": 303985054.0,
|
|
"step": 707
|
|
},
|
|
{
|
|
"entropy": 0.413970947265625,
|
|
"epoch": 2.8095238095238093,
|
|
"grad_norm": 0.7941927914203842,
|
|
"learning_rate": 1.1447714735884463e-07,
|
|
"loss": 0.3854,
|
|
"mean_token_accuracy": 0.8626468563452363,
|
|
"num_tokens": 304424136.0,
|
|
"step": 708
|
|
},
|
|
{
|
|
"entropy": 0.41229248046875,
|
|
"epoch": 2.8134920634920633,
|
|
"grad_norm": 0.8361515314493303,
|
|
"learning_rate": 1.0986930382930916e-07,
|
|
"loss": 0.3881,
|
|
"mean_token_accuracy": 0.8630633186548948,
|
|
"num_tokens": 304862181.0,
|
|
"step": 709
|
|
},
|
|
{
|
|
"entropy": 0.415924072265625,
|
|
"epoch": 2.817460317460317,
|
|
"grad_norm": 0.8355933592816446,
|
|
"learning_rate": 1.0535508072955225e-07,
|
|
"loss": 0.3969,
|
|
"mean_token_accuracy": 0.8627164475619793,
|
|
"num_tokens": 305299176.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 0.412994384765625,
|
|
"epoch": 2.821428571428571,
|
|
"grad_norm": 0.7563718151503291,
|
|
"learning_rate": 1.0093456448333872e-07,
|
|
"loss": 0.3888,
|
|
"mean_token_accuracy": 0.8606778532266617,
|
|
"num_tokens": 305755604.0,
|
|
"step": 711
|
|
},
|
|
{
|
|
"entropy": 0.414825439453125,
|
|
"epoch": 2.825396825396825,
|
|
"grad_norm": 0.7951191207755064,
|
|
"learning_rate": 9.660783972043786e-08,
|
|
"loss": 0.3833,
|
|
"mean_token_accuracy": 0.862731215544045,
|
|
"num_tokens": 306180918.0,
|
|
"step": 712
|
|
},
|
|
{
|
|
"entropy": 0.416290283203125,
|
|
"epoch": 2.8293650793650795,
|
|
"grad_norm": 0.8170483155607833,
|
|
"learning_rate": 9.237498927500088e-08,
|
|
"loss": 0.3962,
|
|
"mean_token_accuracy": 0.861549130640924,
|
|
"num_tokens": 306601315.0,
|
|
"step": 713
|
|
},
|
|
{
|
|
"entropy": 0.413970947265625,
|
|
"epoch": 2.8333333333333335,
|
|
"grad_norm": 0.7633085069328812,
|
|
"learning_rate": 8.823609418397939e-08,
|
|
"loss": 0.3903,
|
|
"mean_token_accuracy": 0.861748369410634,
|
|
"num_tokens": 307053855.0,
|
|
"step": 714
|
|
},
|
|
{
|
|
"entropy": 0.4200439453125,
|
|
"epoch": 2.8373015873015874,
|
|
"grad_norm": 0.84331828046859,
|
|
"learning_rate": 8.419123368556991e-08,
|
|
"loss": 0.3889,
|
|
"mean_token_accuracy": 0.8638924788683653,
|
|
"num_tokens": 307466527.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 0.417083740234375,
|
|
"epoch": 2.8412698412698414,
|
|
"grad_norm": 1.136155627056708,
|
|
"learning_rate": 8.024048521769745e-08,
|
|
"loss": 0.393,
|
|
"mean_token_accuracy": 0.8619293784722686,
|
|
"num_tokens": 307908233.0,
|
|
"step": 716
|
|
},
|
|
{
|
|
"entropy": 0.41851806640625,
|
|
"epoch": 2.8452380952380953,
|
|
"grad_norm": 0.8624286819149608,
|
|
"learning_rate": 7.638392441653542e-08,
|
|
"loss": 0.3815,
|
|
"mean_token_accuracy": 0.8658701097592711,
|
|
"num_tokens": 308331740.0,
|
|
"step": 717
|
|
},
|
|
{
|
|
"entropy": 0.417083740234375,
|
|
"epoch": 2.8492063492063493,
|
|
"grad_norm": 0.7877113344572971,
|
|
"learning_rate": 7.262162511505466e-08,
|
|
"loss": 0.3766,
|
|
"mean_token_accuracy": 0.8655453082174063,
|
|
"num_tokens": 308765267.0,
|
|
"step": 718
|
|
},
|
|
{
|
|
"entropy": 0.419647216796875,
|
|
"epoch": 2.8531746031746033,
|
|
"grad_norm": 0.8003863832330088,
|
|
"learning_rate": 6.895365934161236e-08,
|
|
"loss": 0.3811,
|
|
"mean_token_accuracy": 0.8642518576234579,
|
|
"num_tokens": 309177878.0,
|
|
"step": 719
|
|
},
|
|
{
|
|
"entropy": 0.427581787109375,
|
|
"epoch": 2.857142857142857,
|
|
"grad_norm": 0.7682606770932064,
|
|
"learning_rate": 6.538009731857087e-08,
|
|
"loss": 0.3912,
|
|
"mean_token_accuracy": 0.8608730277046561,
|
|
"num_tokens": 309586897.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 0.41351318359375,
|
|
"epoch": 2.861111111111111,
|
|
"grad_norm": 0.7503328469812236,
|
|
"learning_rate": 6.190100746095495e-08,
|
|
"loss": 0.3831,
|
|
"mean_token_accuracy": 0.8634521188214421,
|
|
"num_tokens": 310011953.0,
|
|
"step": 721
|
|
},
|
|
{
|
|
"entropy": 0.416259765625,
|
|
"epoch": 2.865079365079365,
|
|
"grad_norm": 0.7649905566318169,
|
|
"learning_rate": 5.851645637514114e-08,
|
|
"loss": 0.3851,
|
|
"mean_token_accuracy": 0.8632787046954036,
|
|
"num_tokens": 310440896.0,
|
|
"step": 722
|
|
},
|
|
{
|
|
"entropy": 0.417572021484375,
|
|
"epoch": 2.869047619047619,
|
|
"grad_norm": 0.9526446373854853,
|
|
"learning_rate": 5.522650885758374e-08,
|
|
"loss": 0.3874,
|
|
"mean_token_accuracy": 0.8621506663039327,
|
|
"num_tokens": 310867155.0,
|
|
"step": 723
|
|
},
|
|
{
|
|
"entropy": 0.414215087890625,
|
|
"epoch": 2.873015873015873,
|
|
"grad_norm": 0.8196077414243489,
|
|
"learning_rate": 5.203122789357307e-08,
|
|
"loss": 0.3768,
|
|
"mean_token_accuracy": 0.8689231360331178,
|
|
"num_tokens": 311297562.0,
|
|
"step": 724
|
|
},
|
|
{
|
|
"entropy": 0.41143798828125,
|
|
"epoch": 2.876984126984127,
|
|
"grad_norm": 0.8133318950674621,
|
|
"learning_rate": 4.893067465602863e-08,
|
|
"loss": 0.397,
|
|
"mean_token_accuracy": 0.8604372851550579,
|
|
"num_tokens": 311750163.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 0.413116455078125,
|
|
"epoch": 2.880952380952381,
|
|
"grad_norm": 0.7955536166222298,
|
|
"learning_rate": 4.5924908504331735e-08,
|
|
"loss": 0.3949,
|
|
"mean_token_accuracy": 0.8633811613544822,
|
|
"num_tokens": 312189513.0,
|
|
"step": 726
|
|
},
|
|
{
|
|
"entropy": 0.41363525390625,
|
|
"epoch": 2.884920634920635,
|
|
"grad_norm": 0.7572897725302501,
|
|
"learning_rate": 4.3013986983184705e-08,
|
|
"loss": 0.3854,
|
|
"mean_token_accuracy": 0.8645169893279672,
|
|
"num_tokens": 312626002.0,
|
|
"step": 727
|
|
},
|
|
{
|
|
"entropy": 0.41668701171875,
|
|
"epoch": 2.888888888888889,
|
|
"grad_norm": 0.7716288372854705,
|
|
"learning_rate": 4.019796582151181e-08,
|
|
"loss": 0.3876,
|
|
"mean_token_accuracy": 0.862118998542428,
|
|
"num_tokens": 313055977.0,
|
|
"step": 728
|
|
},
|
|
{
|
|
"entropy": 0.415802001953125,
|
|
"epoch": 2.892857142857143,
|
|
"grad_norm": 0.7713232614880875,
|
|
"learning_rate": 3.747689893139228e-08,
|
|
"loss": 0.3854,
|
|
"mean_token_accuracy": 0.8622197173535824,
|
|
"num_tokens": 313491837.0,
|
|
"step": 729
|
|
},
|
|
{
|
|
"entropy": 0.416229248046875,
|
|
"epoch": 2.8968253968253967,
|
|
"grad_norm": 0.7395778636476764,
|
|
"learning_rate": 3.4850838407027297e-08,
|
|
"loss": 0.3979,
|
|
"mean_token_accuracy": 0.8619843171909451,
|
|
"num_tokens": 313934488.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 0.418701171875,
|
|
"epoch": 2.9007936507936507,
|
|
"grad_norm": 0.7612522506605197,
|
|
"learning_rate": 3.2319834523742435e-08,
|
|
"loss": 0.383,
|
|
"mean_token_accuracy": 0.8652700930833817,
|
|
"num_tokens": 314354323.0,
|
|
"step": 731
|
|
},
|
|
{
|
|
"entropy": 0.414703369140625,
|
|
"epoch": 2.9047619047619047,
|
|
"grad_norm": 0.7974226574359506,
|
|
"learning_rate": 2.988393573702675e-08,
|
|
"loss": 0.3946,
|
|
"mean_token_accuracy": 0.8615053938701749,
|
|
"num_tokens": 314782888.0,
|
|
"step": 732
|
|
},
|
|
{
|
|
"entropy": 0.41558837890625,
|
|
"epoch": 2.9087301587301586,
|
|
"grad_norm": 0.8043577847187172,
|
|
"learning_rate": 2.754318868160244e-08,
|
|
"loss": 0.3836,
|
|
"mean_token_accuracy": 0.8651288328692317,
|
|
"num_tokens": 315218545.0,
|
|
"step": 733
|
|
},
|
|
{
|
|
"entropy": 0.409698486328125,
|
|
"epoch": 2.9126984126984126,
|
|
"grad_norm": 0.7532207153914081,
|
|
"learning_rate": 2.5297638170535542e-08,
|
|
"loss": 0.3768,
|
|
"mean_token_accuracy": 0.8678219076246023,
|
|
"num_tokens": 315667111.0,
|
|
"step": 734
|
|
},
|
|
{
|
|
"entropy": 0.423553466796875,
|
|
"epoch": 2.9166666666666665,
|
|
"grad_norm": 0.836022190361025,
|
|
"learning_rate": 2.31473271943744e-08,
|
|
"loss": 0.3848,
|
|
"mean_token_accuracy": 0.8609532006084919,
|
|
"num_tokens": 316078411.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 0.4132080078125,
|
|
"epoch": 2.9206349206349205,
|
|
"grad_norm": 0.7845153141259297,
|
|
"learning_rate": 2.109229692032977e-08,
|
|
"loss": 0.3894,
|
|
"mean_token_accuracy": 0.8628736371174455,
|
|
"num_tokens": 316519645.0,
|
|
"step": 736
|
|
},
|
|
{
|
|
"entropy": 0.418060302734375,
|
|
"epoch": 2.924603174603175,
|
|
"grad_norm": 0.7817679617183846,
|
|
"learning_rate": 1.9132586691484323e-08,
|
|
"loss": 0.3889,
|
|
"mean_token_accuracy": 0.8628808334469795,
|
|
"num_tokens": 316948710.0,
|
|
"step": 737
|
|
},
|
|
{
|
|
"entropy": 0.417327880859375,
|
|
"epoch": 2.928571428571429,
|
|
"grad_norm": 0.7903748369093285,
|
|
"learning_rate": 1.7268234026041053e-08,
|
|
"loss": 0.3836,
|
|
"mean_token_accuracy": 0.8664052626118064,
|
|
"num_tokens": 317376914.0,
|
|
"step": 738
|
|
},
|
|
{
|
|
"entropy": 0.416656494140625,
|
|
"epoch": 2.932539682539683,
|
|
"grad_norm": 0.8013586144607203,
|
|
"learning_rate": 1.5499274616602723e-08,
|
|
"loss": 0.3819,
|
|
"mean_token_accuracy": 0.8635408999398351,
|
|
"num_tokens": 317783718.0,
|
|
"step": 739
|
|
},
|
|
{
|
|
"entropy": 0.4195556640625,
|
|
"epoch": 2.9365079365079367,
|
|
"grad_norm": 1.0799029570111842,
|
|
"learning_rate": 1.3825742329492408e-08,
|
|
"loss": 0.3976,
|
|
"mean_token_accuracy": 0.8611190365627408,
|
|
"num_tokens": 318210944.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 0.41424560546875,
|
|
"epoch": 2.9404761904761907,
|
|
"grad_norm": 0.8046949464266854,
|
|
"learning_rate": 1.2247669204100699e-08,
|
|
"loss": 0.3972,
|
|
"mean_token_accuracy": 0.8586803553625941,
|
|
"num_tokens": 318636758.0,
|
|
"step": 741
|
|
},
|
|
{
|
|
"entropy": 0.412384033203125,
|
|
"epoch": 2.9444444444444446,
|
|
"grad_norm": 0.7492264758448205,
|
|
"learning_rate": 1.0765085452275614e-08,
|
|
"loss": 0.381,
|
|
"mean_token_accuracy": 0.8640262456610799,
|
|
"num_tokens": 319072977.0,
|
|
"step": 742
|
|
},
|
|
{
|
|
"entropy": 0.41888427734375,
|
|
"epoch": 2.9484126984126986,
|
|
"grad_norm": 0.8576996327472883,
|
|
"learning_rate": 9.378019457743082e-09,
|
|
"loss": 0.3825,
|
|
"mean_token_accuracy": 0.8635032856836915,
|
|
"num_tokens": 319491098.0,
|
|
"step": 743
|
|
},
|
|
{
|
|
"entropy": 0.41156005859375,
|
|
"epoch": 2.9523809523809526,
|
|
"grad_norm": 0.8654625317173439,
|
|
"learning_rate": 8.086497775562918e-09,
|
|
"loss": 0.3974,
|
|
"mean_token_accuracy": 0.8603286230936646,
|
|
"num_tokens": 319936836.0,
|
|
"step": 744
|
|
},
|
|
{
|
|
"entropy": 0.4107666015625,
|
|
"epoch": 2.9563492063492065,
|
|
"grad_norm": 0.8355009320099618,
|
|
"learning_rate": 6.890545131621462e-09,
|
|
"loss": 0.3898,
|
|
"mean_token_accuracy": 0.8626588368788362,
|
|
"num_tokens": 320379711.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 0.415008544921875,
|
|
"epoch": 2.9603174603174605,
|
|
"grad_norm": 0.769717293692382,
|
|
"learning_rate": 5.790184422158063e-09,
|
|
"loss": 0.3848,
|
|
"mean_token_accuracy": 0.8654471961781383,
|
|
"num_tokens": 320807541.0,
|
|
"step": 746
|
|
},
|
|
{
|
|
"entropy": 0.41558837890625,
|
|
"epoch": 2.9642857142857144,
|
|
"grad_norm": 1.2627956434015182,
|
|
"learning_rate": 4.785436713324876e-09,
|
|
"loss": 0.3896,
|
|
"mean_token_accuracy": 0.8639781204983592,
|
|
"num_tokens": 321249293.0,
|
|
"step": 747
|
|
},
|
|
{
|
|
"entropy": 0.417449951171875,
|
|
"epoch": 2.9682539682539684,
|
|
"grad_norm": 0.7877216343272654,
|
|
"learning_rate": 3.876321240786629e-09,
|
|
"loss": 0.385,
|
|
"mean_token_accuracy": 0.8626599637791514,
|
|
"num_tokens": 321676330.0,
|
|
"step": 748
|
|
},
|
|
{
|
|
"entropy": 0.41473388671875,
|
|
"epoch": 2.9722222222222223,
|
|
"grad_norm": 0.8047350818501147,
|
|
"learning_rate": 3.062855409350918e-09,
|
|
"loss": 0.3786,
|
|
"mean_token_accuracy": 0.8666502619162202,
|
|
"num_tokens": 322088977.0,
|
|
"step": 749
|
|
},
|
|
{
|
|
"entropy": 0.413360595703125,
|
|
"epoch": 2.9761904761904763,
|
|
"grad_norm": 0.8214751044608514,
|
|
"learning_rate": 2.345054792634027e-09,
|
|
"loss": 0.3863,
|
|
"mean_token_accuracy": 0.8647614009678364,
|
|
"num_tokens": 322526195.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 0.42340087890625,
|
|
"epoch": 2.9801587301587302,
|
|
"grad_norm": 0.7827821829891451,
|
|
"learning_rate": 1.7229331327633935e-09,
|
|
"loss": 0.3884,
|
|
"mean_token_accuracy": 0.8638850962743163,
|
|
"num_tokens": 322944491.0,
|
|
"step": 751
|
|
},
|
|
{
|
|
"entropy": 0.415924072265625,
|
|
"epoch": 2.984126984126984,
|
|
"grad_norm": 0.8044240226452957,
|
|
"learning_rate": 1.1965023401161457e-09,
|
|
"loss": 0.3955,
|
|
"mean_token_accuracy": 0.860909391194582,
|
|
"num_tokens": 323373321.0,
|
|
"step": 752
|
|
},
|
|
{
|
|
"entropy": 0.415069580078125,
|
|
"epoch": 2.988095238095238,
|
|
"grad_norm": 0.7711137719538177,
|
|
"learning_rate": 7.657724930887344e-10,
|
|
"loss": 0.3878,
|
|
"mean_token_accuracy": 0.8620332898572087,
|
|
"num_tokens": 323803097.0,
|
|
"step": 753
|
|
},
|
|
{
|
|
"entropy": 0.413604736328125,
|
|
"epoch": 2.992063492063492,
|
|
"grad_norm": 0.7417880663145604,
|
|
"learning_rate": 4.3075183790541875e-10,
|
|
"loss": 0.3781,
|
|
"mean_token_accuracy": 0.865508021786809,
|
|
"num_tokens": 324241487.0,
|
|
"step": 754
|
|
},
|
|
{
|
|
"entropy": 0.412139892578125,
|
|
"epoch": 2.996031746031746,
|
|
"grad_norm": 0.8400409360541771,
|
|
"learning_rate": 1.9144678845950393e-10,
|
|
"loss": 0.3963,
|
|
"mean_token_accuracy": 0.8602159256115556,
|
|
"num_tokens": 324694260.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 0.41937255859375,
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.8242408910754981,
|
|
"learning_rate": 4.786192619121721e-11,
|
|
"loss": 0.3878,
|
|
"mean_token_accuracy": 0.863866476342082,
|
|
"num_tokens": 325114310.0,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 756,
|
|
"total_flos": 601237772369920.0,
|
|
"train_loss": 0.4835385761011845,
|
|
"train_runtime": 57894.1544,
|
|
"train_samples_per_second": 1.272,
|
|
"train_steps_per_second": 0.013
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 756,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 63,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 601237772369920.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|