Files
P2-split2_prob_Qwen3-8B-Bas…/trainer_state.json

7604 lines
214 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 756,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5635986328125,
"epoch": 0.003968253968253968,
"grad_norm": 5.862754983476997,
"learning_rate": 0.0,
"loss": 1.3929,
"mean_token_accuracy": 0.6520986258983612,
"num_tokens": 436822.0,
"step": 1
},
{
"entropy": 0.571868896484375,
"epoch": 0.007936507936507936,
"grad_norm": 5.942989842192001,
"learning_rate": 2.6315789473684213e-07,
"loss": 1.3984,
"mean_token_accuracy": 0.6573778251186013,
"num_tokens": 849869.0,
"step": 2
},
{
"entropy": 0.571258544921875,
"epoch": 0.011904761904761904,
"grad_norm": 6.016113817261652,
"learning_rate": 5.263157894736843e-07,
"loss": 1.4022,
"mean_token_accuracy": 0.6534338416531682,
"num_tokens": 1257883.0,
"step": 3
},
{
"entropy": 0.567626953125,
"epoch": 0.015873015873015872,
"grad_norm": 5.755030134936764,
"learning_rate": 7.894736842105263e-07,
"loss": 1.3977,
"mean_token_accuracy": 0.650267880409956,
"num_tokens": 1710146.0,
"step": 4
},
{
"entropy": 0.563079833984375,
"epoch": 0.01984126984126984,
"grad_norm": 5.759147918749323,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.38,
"mean_token_accuracy": 0.6574590215459466,
"num_tokens": 2138902.0,
"step": 5
},
{
"entropy": 0.5838623046875,
"epoch": 0.023809523809523808,
"grad_norm": 5.5950056421057885,
"learning_rate": 1.3157894736842106e-06,
"loss": 1.3755,
"mean_token_accuracy": 0.6586260544136167,
"num_tokens": 2560005.0,
"step": 6
},
{
"entropy": 0.5576171875,
"epoch": 0.027777777777777776,
"grad_norm": 5.621048765741401,
"learning_rate": 1.5789473684210526e-06,
"loss": 1.3926,
"mean_token_accuracy": 0.6518173962831497,
"num_tokens": 3004121.0,
"step": 7
},
{
"entropy": 0.572418212890625,
"epoch": 0.031746031746031744,
"grad_norm": 5.175193000260237,
"learning_rate": 1.8421052631578948e-06,
"loss": 1.3638,
"mean_token_accuracy": 0.6598256900906563,
"num_tokens": 3457966.0,
"step": 8
},
{
"entropy": 0.565643310546875,
"epoch": 0.03571428571428571,
"grad_norm": 5.286230249900356,
"learning_rate": 2.105263157894737e-06,
"loss": 1.3632,
"mean_token_accuracy": 0.660512862727046,
"num_tokens": 3902759.0,
"step": 9
},
{
"entropy": 0.583984375,
"epoch": 0.03968253968253968,
"grad_norm": 4.61299440604949,
"learning_rate": 2.368421052631579e-06,
"loss": 1.3222,
"mean_token_accuracy": 0.6607110062614083,
"num_tokens": 4321827.0,
"step": 10
},
{
"entropy": 0.56304931640625,
"epoch": 0.04365079365079365,
"grad_norm": 4.311573869738654,
"learning_rate": 2.631578947368421e-06,
"loss": 1.2723,
"mean_token_accuracy": 0.6778731746599078,
"num_tokens": 4748195.0,
"step": 11
},
{
"entropy": 0.566680908203125,
"epoch": 0.047619047619047616,
"grad_norm": 4.260816735639912,
"learning_rate": 2.8947368421052634e-06,
"loss": 1.2866,
"mean_token_accuracy": 0.6733056111261249,
"num_tokens": 5188122.0,
"step": 12
},
{
"entropy": 0.56494140625,
"epoch": 0.051587301587301584,
"grad_norm": 3.50291785633804,
"learning_rate": 3.157894736842105e-06,
"loss": 1.1797,
"mean_token_accuracy": 0.6891454020515084,
"num_tokens": 5615040.0,
"step": 13
},
{
"entropy": 0.562591552734375,
"epoch": 0.05555555555555555,
"grad_norm": 3.4924438085560223,
"learning_rate": 3.421052631578948e-06,
"loss": 1.1726,
"mean_token_accuracy": 0.6897318931296468,
"num_tokens": 6042413.0,
"step": 14
},
{
"entropy": 0.5677490234375,
"epoch": 0.05952380952380952,
"grad_norm": 3.1328341857078574,
"learning_rate": 3.6842105263157896e-06,
"loss": 1.1382,
"mean_token_accuracy": 0.6958816023543477,
"num_tokens": 6468019.0,
"step": 15
},
{
"entropy": 0.56500244140625,
"epoch": 0.06349206349206349,
"grad_norm": 3.1264058914998327,
"learning_rate": 3.947368421052632e-06,
"loss": 1.1368,
"mean_token_accuracy": 0.6940081315115094,
"num_tokens": 6898441.0,
"step": 16
},
{
"entropy": 0.537628173828125,
"epoch": 0.06746031746031746,
"grad_norm": 3.1935735446807105,
"learning_rate": 4.210526315789474e-06,
"loss": 1.042,
"mean_token_accuracy": 0.7142375819385052,
"num_tokens": 7333054.0,
"step": 17
},
{
"entropy": 0.51806640625,
"epoch": 0.07142857142857142,
"grad_norm": 3.9117878805623816,
"learning_rate": 4.473684210526316e-06,
"loss": 1.0019,
"mean_token_accuracy": 0.72404795140028,
"num_tokens": 7794638.0,
"step": 18
},
{
"entropy": 0.5318603515625,
"epoch": 0.07539682539682539,
"grad_norm": 4.074921603012236,
"learning_rate": 4.736842105263158e-06,
"loss": 1.0057,
"mean_token_accuracy": 0.7181824343279004,
"num_tokens": 8237624.0,
"step": 19
},
{
"entropy": 0.536865234375,
"epoch": 0.07936507936507936,
"grad_norm": 3.555487851257003,
"learning_rate": 5e-06,
"loss": 0.9821,
"mean_token_accuracy": 0.723413173109293,
"num_tokens": 8673402.0,
"step": 20
},
{
"entropy": 0.530609130859375,
"epoch": 0.08333333333333333,
"grad_norm": 2.955594263842942,
"learning_rate": 5.263157894736842e-06,
"loss": 0.9539,
"mean_token_accuracy": 0.7331287879496813,
"num_tokens": 9121387.0,
"step": 21
},
{
"entropy": 0.548980712890625,
"epoch": 0.0873015873015873,
"grad_norm": 2.658685672377372,
"learning_rate": 5.526315789473685e-06,
"loss": 0.9027,
"mean_token_accuracy": 0.7447563670575619,
"num_tokens": 9525436.0,
"step": 22
},
{
"entropy": 0.548858642578125,
"epoch": 0.09126984126984126,
"grad_norm": 2.169406925194856,
"learning_rate": 5.789473684210527e-06,
"loss": 0.8918,
"mean_token_accuracy": 0.7433666130527854,
"num_tokens": 9932011.0,
"step": 23
},
{
"entropy": 0.532745361328125,
"epoch": 0.09523809523809523,
"grad_norm": 2.77364949088275,
"learning_rate": 6.0526315789473685e-06,
"loss": 0.8816,
"mean_token_accuracy": 0.7459379723295569,
"num_tokens": 10358777.0,
"step": 24
},
{
"entropy": 0.535888671875,
"epoch": 0.0992063492063492,
"grad_norm": 3.016843229488076,
"learning_rate": 6.31578947368421e-06,
"loss": 0.8696,
"mean_token_accuracy": 0.7488466305658221,
"num_tokens": 10773051.0,
"step": 25
},
{
"entropy": 0.526519775390625,
"epoch": 0.10317460317460317,
"grad_norm": 2.7257646532581203,
"learning_rate": 6.578947368421054e-06,
"loss": 0.8541,
"mean_token_accuracy": 0.7512839920818806,
"num_tokens": 11211677.0,
"step": 26
},
{
"entropy": 0.533477783203125,
"epoch": 0.10714285714285714,
"grad_norm": 2.4870825582777543,
"learning_rate": 6.842105263157896e-06,
"loss": 0.81,
"mean_token_accuracy": 0.758179577998817,
"num_tokens": 11628779.0,
"step": 27
},
{
"entropy": 0.524932861328125,
"epoch": 0.1111111111111111,
"grad_norm": 2.360523836168894,
"learning_rate": 7.1052631578947375e-06,
"loss": 0.8368,
"mean_token_accuracy": 0.756181831471622,
"num_tokens": 12067363.0,
"step": 28
},
{
"entropy": 0.513580322265625,
"epoch": 0.11507936507936507,
"grad_norm": 2.340522449916669,
"learning_rate": 7.368421052631579e-06,
"loss": 0.8284,
"mean_token_accuracy": 0.7573374779894948,
"num_tokens": 12507159.0,
"step": 29
},
{
"entropy": 0.509765625,
"epoch": 0.11904761904761904,
"grad_norm": 1.9560553431906509,
"learning_rate": 7.631578947368423e-06,
"loss": 0.802,
"mean_token_accuracy": 0.7627127859741449,
"num_tokens": 12945458.0,
"step": 30
},
{
"entropy": 0.523956298828125,
"epoch": 0.12301587301587301,
"grad_norm": 2.040122129092966,
"learning_rate": 7.894736842105265e-06,
"loss": 0.7837,
"mean_token_accuracy": 0.7661685338243842,
"num_tokens": 13370514.0,
"step": 31
},
{
"entropy": 0.508392333984375,
"epoch": 0.12698412698412698,
"grad_norm": 2.121102369191249,
"learning_rate": 8.157894736842106e-06,
"loss": 0.7771,
"mean_token_accuracy": 0.7685025054961443,
"num_tokens": 13815066.0,
"step": 32
},
{
"entropy": 0.518341064453125,
"epoch": 0.13095238095238096,
"grad_norm": 1.935388895167446,
"learning_rate": 8.421052631578948e-06,
"loss": 0.7698,
"mean_token_accuracy": 0.7705040192231536,
"num_tokens": 14240312.0,
"step": 33
},
{
"entropy": 0.511688232421875,
"epoch": 0.1349206349206349,
"grad_norm": 1.69571857189707,
"learning_rate": 8.68421052631579e-06,
"loss": 0.7653,
"mean_token_accuracy": 0.7728015650063753,
"num_tokens": 14685173.0,
"step": 34
},
{
"entropy": 0.517608642578125,
"epoch": 0.1388888888888889,
"grad_norm": 1.7681452936301536,
"learning_rate": 8.947368421052632e-06,
"loss": 0.7449,
"mean_token_accuracy": 0.7759622316807508,
"num_tokens": 15099914.0,
"step": 35
},
{
"entropy": 0.5123291015625,
"epoch": 0.14285714285714285,
"grad_norm": 1.6769532274067533,
"learning_rate": 9.210526315789474e-06,
"loss": 0.7344,
"mean_token_accuracy": 0.777042037807405,
"num_tokens": 15522062.0,
"step": 36
},
{
"entropy": 0.5107421875,
"epoch": 0.14682539682539683,
"grad_norm": 1.5490568448744158,
"learning_rate": 9.473684210526315e-06,
"loss": 0.7237,
"mean_token_accuracy": 0.7827096851542592,
"num_tokens": 15955138.0,
"step": 37
},
{
"entropy": 0.5093994140625,
"epoch": 0.15079365079365079,
"grad_norm": 1.6589040046358219,
"learning_rate": 9.736842105263159e-06,
"loss": 0.703,
"mean_token_accuracy": 0.7817074777558446,
"num_tokens": 16388252.0,
"step": 38
},
{
"entropy": 0.5093994140625,
"epoch": 0.15476190476190477,
"grad_norm": 1.7103852217985493,
"learning_rate": 1e-05,
"loss": 0.7203,
"mean_token_accuracy": 0.7796014500781894,
"num_tokens": 16821287.0,
"step": 39
},
{
"entropy": 0.5087890625,
"epoch": 0.15873015873015872,
"grad_norm": 1.8720911640791305,
"learning_rate": 9.99995213807381e-06,
"loss": 0.6741,
"mean_token_accuracy": 0.7901940597221255,
"num_tokens": 17235205.0,
"step": 40
},
{
"entropy": 0.504638671875,
"epoch": 0.1626984126984127,
"grad_norm": 1.6006252706063373,
"learning_rate": 9.99980855321154e-06,
"loss": 0.6899,
"mean_token_accuracy": 0.7874280894175172,
"num_tokens": 17657156.0,
"step": 41
},
{
"entropy": 0.503753662109375,
"epoch": 0.16666666666666666,
"grad_norm": 1.5184573996381632,
"learning_rate": 9.999569248162095e-06,
"loss": 0.6887,
"mean_token_accuracy": 0.7868739385157824,
"num_tokens": 18090069.0,
"step": 42
},
{
"entropy": 0.497650146484375,
"epoch": 0.17063492063492064,
"grad_norm": 1.6431910052473107,
"learning_rate": 9.999234227506912e-06,
"loss": 0.6944,
"mean_token_accuracy": 0.7861603572964668,
"num_tokens": 18542578.0,
"step": 43
},
{
"entropy": 0.50860595703125,
"epoch": 0.1746031746031746,
"grad_norm": 1.7897905880615403,
"learning_rate": 9.998803497659885e-06,
"loss": 0.669,
"mean_token_accuracy": 0.7912999261170626,
"num_tokens": 18955962.0,
"step": 44
},
{
"entropy": 0.5076904296875,
"epoch": 0.17857142857142858,
"grad_norm": 1.5708540426011852,
"learning_rate": 9.998277066867236e-06,
"loss": 0.6583,
"mean_token_accuracy": 0.7945169908925891,
"num_tokens": 19379353.0,
"step": 45
},
{
"entropy": 0.507659912109375,
"epoch": 0.18253968253968253,
"grad_norm": 1.452264150440713,
"learning_rate": 9.997654945207368e-06,
"loss": 0.6506,
"mean_token_accuracy": 0.7967926179990172,
"num_tokens": 19812261.0,
"step": 46
},
{
"entropy": 0.524383544921875,
"epoch": 0.1865079365079365,
"grad_norm": 1.63583355617904,
"learning_rate": 9.99693714459065e-06,
"loss": 0.6458,
"mean_token_accuracy": 0.7976251384243369,
"num_tokens": 20210235.0,
"step": 47
},
{
"entropy": 0.503173828125,
"epoch": 0.19047619047619047,
"grad_norm": 2.0289207035123002,
"learning_rate": 9.996123678759214e-06,
"loss": 0.65,
"mean_token_accuracy": 0.7951374817639589,
"num_tokens": 20647709.0,
"step": 48
},
{
"entropy": 0.50360107421875,
"epoch": 0.19444444444444445,
"grad_norm": 1.5990576885906742,
"learning_rate": 9.995214563286677e-06,
"loss": 0.6434,
"mean_token_accuracy": 0.7995740966871381,
"num_tokens": 21065897.0,
"step": 49
},
{
"entropy": 0.5130615234375,
"epoch": 0.1984126984126984,
"grad_norm": 1.9524666748685242,
"learning_rate": 9.994209815577843e-06,
"loss": 0.6555,
"mean_token_accuracy": 0.7948365742340684,
"num_tokens": 21486371.0,
"step": 50
},
{
"entropy": 0.51611328125,
"epoch": 0.20238095238095238,
"grad_norm": 1.5386890414245815,
"learning_rate": 9.993109454868379e-06,
"loss": 0.6435,
"mean_token_accuracy": 0.796173213981092,
"num_tokens": 21909309.0,
"step": 51
},
{
"entropy": 0.5155029296875,
"epoch": 0.20634920634920634,
"grad_norm": 1.532778324611743,
"learning_rate": 9.991913502224438e-06,
"loss": 0.6319,
"mean_token_accuracy": 0.7995416941121221,
"num_tokens": 22318414.0,
"step": 52
},
{
"entropy": 0.50860595703125,
"epoch": 0.21031746031746032,
"grad_norm": 1.6657431535988216,
"learning_rate": 9.990621980542258e-06,
"loss": 0.6093,
"mean_token_accuracy": 0.8053860478103161,
"num_tokens": 22719471.0,
"step": 53
},
{
"entropy": 0.504730224609375,
"epoch": 0.21428571428571427,
"grad_norm": 1.641225405902951,
"learning_rate": 9.989234914547725e-06,
"loss": 0.6216,
"mean_token_accuracy": 0.8012946872040629,
"num_tokens": 23134604.0,
"step": 54
},
{
"entropy": 0.49383544921875,
"epoch": 0.21825396825396826,
"grad_norm": 1.4469976883176578,
"learning_rate": 9.9877523307959e-06,
"loss": 0.6264,
"mean_token_accuracy": 0.8012660220265388,
"num_tokens": 23571548.0,
"step": 55
},
{
"entropy": 0.49749755859375,
"epoch": 0.2222222222222222,
"grad_norm": 1.6227889097439865,
"learning_rate": 9.986174257670509e-06,
"loss": 0.6246,
"mean_token_accuracy": 0.8050137888640165,
"num_tokens": 24009770.0,
"step": 56
},
{
"entropy": 0.49603271484375,
"epoch": 0.2261904761904762,
"grad_norm": 1.4045322193755005,
"learning_rate": 9.984500725383397e-06,
"loss": 0.6324,
"mean_token_accuracy": 0.8019688781350851,
"num_tokens": 24447544.0,
"step": 57
},
{
"entropy": 0.5068359375,
"epoch": 0.23015873015873015,
"grad_norm": 1.3881146015309058,
"learning_rate": 9.98273176597396e-06,
"loss": 0.6233,
"mean_token_accuracy": 0.802922697737813,
"num_tokens": 24869806.0,
"step": 58
},
{
"entropy": 0.48602294921875,
"epoch": 0.23412698412698413,
"grad_norm": 1.4997301127686509,
"learning_rate": 9.980867413308516e-06,
"loss": 0.6298,
"mean_token_accuracy": 0.8009732821956277,
"num_tokens": 25337885.0,
"step": 59
},
{
"entropy": 0.4898681640625,
"epoch": 0.23809523809523808,
"grad_norm": 1.4955906883793464,
"learning_rate": 9.978907703079672e-06,
"loss": 0.6112,
"mean_token_accuracy": 0.807507585734129,
"num_tokens": 25762999.0,
"step": 60
},
{
"entropy": 0.495452880859375,
"epoch": 0.24206349206349206,
"grad_norm": 1.4495106936140836,
"learning_rate": 9.976852672805625e-06,
"loss": 0.6071,
"mean_token_accuracy": 0.8060804437845945,
"num_tokens": 26204122.0,
"step": 61
},
{
"entropy": 0.48187255859375,
"epoch": 0.24603174603174602,
"grad_norm": 1.394817796737972,
"learning_rate": 9.974702361829465e-06,
"loss": 0.5934,
"mean_token_accuracy": 0.8098774421960115,
"num_tokens": 26651412.0,
"step": 62
},
{
"entropy": 0.4937744140625,
"epoch": 0.25,
"grad_norm": 1.566600221250525,
"learning_rate": 9.972456811318399e-06,
"loss": 0.6075,
"mean_token_accuracy": 0.8056732397526503,
"num_tokens": 27080137.0,
"step": 63
},
{
"entropy": 0.48150634765625,
"epoch": 0.25396825396825395,
"grad_norm": 1.4470780226444868,
"learning_rate": 9.970116064262975e-06,
"loss": 0.6025,
"mean_token_accuracy": 0.8087067836895585,
"num_tokens": 27520069.0,
"step": 64
},
{
"entropy": 0.485382080078125,
"epoch": 0.25793650793650796,
"grad_norm": 1.5004810878183181,
"learning_rate": 9.96768016547626e-06,
"loss": 0.6011,
"mean_token_accuracy": 0.8066385835409164,
"num_tokens": 27954154.0,
"step": 65
},
{
"entropy": 0.49444580078125,
"epoch": 0.2619047619047619,
"grad_norm": 1.618557504297728,
"learning_rate": 9.965149161592973e-06,
"loss": 0.6054,
"mean_token_accuracy": 0.8067285194993019,
"num_tokens": 28367541.0,
"step": 66
},
{
"entropy": 0.4913330078125,
"epoch": 0.26587301587301587,
"grad_norm": 1.4570014991732672,
"learning_rate": 9.962523101068608e-06,
"loss": 0.573,
"mean_token_accuracy": 0.8120877193287015,
"num_tokens": 28779140.0,
"step": 67
},
{
"entropy": 0.485076904296875,
"epoch": 0.2698412698412698,
"grad_norm": 1.5057997069476419,
"learning_rate": 9.959802034178489e-06,
"loss": 0.5966,
"mean_token_accuracy": 0.8073940826579928,
"num_tokens": 29217570.0,
"step": 68
},
{
"entropy": 0.479888916015625,
"epoch": 0.27380952380952384,
"grad_norm": 1.4032340805398644,
"learning_rate": 9.956986013016816e-06,
"loss": 0.5767,
"mean_token_accuracy": 0.8149419017136097,
"num_tokens": 29656943.0,
"step": 69
},
{
"entropy": 0.484710693359375,
"epoch": 0.2777777777777778,
"grad_norm": 1.4496205753720897,
"learning_rate": 9.954075091495669e-06,
"loss": 0.6001,
"mean_token_accuracy": 0.8093660045415163,
"num_tokens": 30088869.0,
"step": 70
},
{
"entropy": 0.476165771484375,
"epoch": 0.28174603174603174,
"grad_norm": 1.4082524979942377,
"learning_rate": 9.951069325343972e-06,
"loss": 0.6016,
"mean_token_accuracy": 0.8054882632568479,
"num_tokens": 30550317.0,
"step": 71
},
{
"entropy": 0.483856201171875,
"epoch": 0.2857142857142857,
"grad_norm": 1.3714127497612545,
"learning_rate": 9.947968772106428e-06,
"loss": 0.5748,
"mean_token_accuracy": 0.8156133992597461,
"num_tokens": 30959821.0,
"step": 72
},
{
"entropy": 0.47869873046875,
"epoch": 0.2896825396825397,
"grad_norm": 1.6516883841068675,
"learning_rate": 9.944773491142416e-06,
"loss": 0.5997,
"mean_token_accuracy": 0.8074251553043723,
"num_tokens": 31412639.0,
"step": 73
},
{
"entropy": 0.487518310546875,
"epoch": 0.29365079365079366,
"grad_norm": 1.5133792126136842,
"learning_rate": 9.94148354362486e-06,
"loss": 0.592,
"mean_token_accuracy": 0.8129479885101318,
"num_tokens": 31830767.0,
"step": 74
},
{
"entropy": 0.482086181640625,
"epoch": 0.2976190476190476,
"grad_norm": 1.62731136956083,
"learning_rate": 9.938098992539045e-06,
"loss": 0.5835,
"mean_token_accuracy": 0.8082789676263928,
"num_tokens": 32267329.0,
"step": 75
},
{
"entropy": 0.48516845703125,
"epoch": 0.30158730158730157,
"grad_norm": 1.4784416203962691,
"learning_rate": 9.93461990268143e-06,
"loss": 0.582,
"mean_token_accuracy": 0.8147872434929013,
"num_tokens": 32692726.0,
"step": 76
},
{
"entropy": 0.48876953125,
"epoch": 0.3055555555555556,
"grad_norm": 1.5041413810038196,
"learning_rate": 9.931046340658387e-06,
"loss": 0.5617,
"mean_token_accuracy": 0.8183435359969735,
"num_tokens": 33108936.0,
"step": 77
},
{
"entropy": 0.472503662109375,
"epoch": 0.30952380952380953,
"grad_norm": 1.6817980341644059,
"learning_rate": 9.927378374884947e-06,
"loss": 0.5655,
"mean_token_accuracy": 0.8146926909685135,
"num_tokens": 33543076.0,
"step": 78
},
{
"entropy": 0.474945068359375,
"epoch": 0.3134920634920635,
"grad_norm": 1.3241417102653499,
"learning_rate": 9.923616075583465e-06,
"loss": 0.5738,
"mean_token_accuracy": 0.8142029214650393,
"num_tokens": 33980897.0,
"step": 79
},
{
"entropy": 0.47528076171875,
"epoch": 0.31746031746031744,
"grad_norm": 1.4456909932877973,
"learning_rate": 9.919759514782304e-06,
"loss": 0.5725,
"mean_token_accuracy": 0.8150945641100407,
"num_tokens": 34404352.0,
"step": 80
},
{
"entropy": 0.48504638671875,
"epoch": 0.32142857142857145,
"grad_norm": 1.2666611706175113,
"learning_rate": 9.91580876631443e-06,
"loss": 0.5728,
"mean_token_accuracy": 0.8147335788235068,
"num_tokens": 34815122.0,
"step": 81
},
{
"entropy": 0.49102783203125,
"epoch": 0.3253968253968254,
"grad_norm": 1.4465210833568694,
"learning_rate": 9.91176390581602e-06,
"loss": 0.5759,
"mean_token_accuracy": 0.8133391635492444,
"num_tokens": 35236933.0,
"step": 82
},
{
"entropy": 0.483489990234375,
"epoch": 0.32936507936507936,
"grad_norm": 1.3544114949996022,
"learning_rate": 9.907625010724999e-06,
"loss": 0.5724,
"mean_token_accuracy": 0.8148928321897984,
"num_tokens": 35664506.0,
"step": 83
},
{
"entropy": 0.480560302734375,
"epoch": 0.3333333333333333,
"grad_norm": 1.3188860783644643,
"learning_rate": 9.903392160279564e-06,
"loss": 0.5666,
"mean_token_accuracy": 0.8133293204009533,
"num_tokens": 36088050.0,
"step": 84
},
{
"entropy": 0.48748779296875,
"epoch": 0.3373015873015873,
"grad_norm": 1.4518030416485894,
"learning_rate": 9.899065435516661e-06,
"loss": 0.5664,
"mean_token_accuracy": 0.8148653889074922,
"num_tokens": 36501235.0,
"step": 85
},
{
"entropy": 0.469940185546875,
"epoch": 0.3412698412698413,
"grad_norm": 1.394119633826569,
"learning_rate": 9.894644919270448e-06,
"loss": 0.5722,
"mean_token_accuracy": 0.814102666452527,
"num_tokens": 36942407.0,
"step": 86
},
{
"entropy": 0.47369384765625,
"epoch": 0.34523809523809523,
"grad_norm": 1.4881887909837872,
"learning_rate": 9.890130696170691e-06,
"loss": 0.5714,
"mean_token_accuracy": 0.8154451455920935,
"num_tokens": 37381260.0,
"step": 87
},
{
"entropy": 0.47998046875,
"epoch": 0.3492063492063492,
"grad_norm": 1.3217206972932933,
"learning_rate": 9.885522852641156e-06,
"loss": 0.5695,
"mean_token_accuracy": 0.814792038872838,
"num_tokens": 37803882.0,
"step": 88
},
{
"entropy": 0.48052978515625,
"epoch": 0.3531746031746032,
"grad_norm": 1.5133757517932098,
"learning_rate": 9.880821476897948e-06,
"loss": 0.5628,
"mean_token_accuracy": 0.8151478515937924,
"num_tokens": 38227635.0,
"step": 89
},
{
"entropy": 0.475738525390625,
"epoch": 0.35714285714285715,
"grad_norm": 1.5653342692191234,
"learning_rate": 9.87602665894783e-06,
"loss": 0.5828,
"mean_token_accuracy": 0.8125336300581694,
"num_tokens": 38667329.0,
"step": 90
},
{
"entropy": 0.473876953125,
"epoch": 0.3611111111111111,
"grad_norm": 1.3382017413079235,
"learning_rate": 9.871138490586489e-06,
"loss": 0.57,
"mean_token_accuracy": 0.8121865503489971,
"num_tokens": 39107330.0,
"step": 91
},
{
"entropy": 0.47998046875,
"epoch": 0.36507936507936506,
"grad_norm": 1.346784303133718,
"learning_rate": 9.866157065396784e-06,
"loss": 0.5503,
"mean_token_accuracy": 0.8177150310948491,
"num_tokens": 39524166.0,
"step": 92
},
{
"entropy": 0.469207763671875,
"epoch": 0.36904761904761907,
"grad_norm": 1.4083288521133936,
"learning_rate": 9.861082478746962e-06,
"loss": 0.5508,
"mean_token_accuracy": 0.820819640532136,
"num_tokens": 39952174.0,
"step": 93
},
{
"entropy": 0.465789794921875,
"epoch": 0.373015873015873,
"grad_norm": 1.4473119436564825,
"learning_rate": 9.855914827788814e-06,
"loss": 0.5596,
"mean_token_accuracy": 0.8184320721775293,
"num_tokens": 40389693.0,
"step": 94
},
{
"entropy": 0.46807861328125,
"epoch": 0.376984126984127,
"grad_norm": 1.3763793812393954,
"learning_rate": 9.850654211455837e-06,
"loss": 0.5548,
"mean_token_accuracy": 0.8205192228779197,
"num_tokens": 40815730.0,
"step": 95
},
{
"entropy": 0.484527587890625,
"epoch": 0.38095238095238093,
"grad_norm": 1.5969870094084369,
"learning_rate": 9.84530073046132e-06,
"loss": 0.564,
"mean_token_accuracy": 0.816374409943819,
"num_tokens": 41231841.0,
"step": 96
},
{
"entropy": 0.492523193359375,
"epoch": 0.38492063492063494,
"grad_norm": 1.379364573057709,
"learning_rate": 9.83985448729643e-06,
"loss": 0.572,
"mean_token_accuracy": 0.8147962624207139,
"num_tokens": 41650119.0,
"step": 97
},
{
"entropy": 0.4735107421875,
"epoch": 0.3888888888888889,
"grad_norm": 1.4022051638675177,
"learning_rate": 9.83431558622824e-06,
"loss": 0.5501,
"mean_token_accuracy": 0.8185382299125195,
"num_tokens": 42082897.0,
"step": 98
},
{
"entropy": 0.47802734375,
"epoch": 0.39285714285714285,
"grad_norm": 1.3021150947814153,
"learning_rate": 9.828684133297738e-06,
"loss": 0.5475,
"mean_token_accuracy": 0.82077881321311,
"num_tokens": 42519361.0,
"step": 99
},
{
"entropy": 0.47802734375,
"epoch": 0.3968253968253968,
"grad_norm": 1.3024753376267064,
"learning_rate": 9.822960236317804e-06,
"loss": 0.5436,
"mean_token_accuracy": 0.8204956650733948,
"num_tokens": 42941458.0,
"step": 100
},
{
"entropy": 0.472930908203125,
"epoch": 0.4007936507936508,
"grad_norm": 1.4182047962742048,
"learning_rate": 9.817144004871127e-06,
"loss": 0.5442,
"mean_token_accuracy": 0.8214483223855495,
"num_tokens": 43370971.0,
"step": 101
},
{
"entropy": 0.476470947265625,
"epoch": 0.40476190476190477,
"grad_norm": 1.3283608806953866,
"learning_rate": 9.811235550308127e-06,
"loss": 0.551,
"mean_token_accuracy": 0.8185345204547048,
"num_tokens": 43801380.0,
"step": 102
},
{
"entropy": 0.46978759765625,
"epoch": 0.4087301587301587,
"grad_norm": 1.2924764394677166,
"learning_rate": 9.805234985744804e-06,
"loss": 0.5605,
"mean_token_accuracy": 0.8147126482799649,
"num_tokens": 44245066.0,
"step": 103
},
{
"entropy": 0.485198974609375,
"epoch": 0.4126984126984127,
"grad_norm": 1.3073864707831366,
"learning_rate": 9.799142426060595e-06,
"loss": 0.5573,
"mean_token_accuracy": 0.8181026382371783,
"num_tokens": 44671335.0,
"step": 104
},
{
"entropy": 0.498046875,
"epoch": 0.4166666666666667,
"grad_norm": 1.4213977867693426,
"learning_rate": 9.792957987896154e-06,
"loss": 0.5518,
"mean_token_accuracy": 0.8183343056589365,
"num_tokens": 45066930.0,
"step": 105
},
{
"entropy": 0.47454833984375,
"epoch": 0.42063492063492064,
"grad_norm": 1.2495857267379333,
"learning_rate": 9.786681789651134e-06,
"loss": 0.5472,
"mean_token_accuracy": 0.8180114766582847,
"num_tokens": 45508166.0,
"step": 106
},
{
"entropy": 0.47021484375,
"epoch": 0.4246031746031746,
"grad_norm": 1.238708297199452,
"learning_rate": 9.780313951481904e-06,
"loss": 0.5612,
"mean_token_accuracy": 0.8155703386291862,
"num_tokens": 45960298.0,
"step": 107
},
{
"entropy": 0.473785400390625,
"epoch": 0.42857142857142855,
"grad_norm": 1.367804985831029,
"learning_rate": 9.773854595299269e-06,
"loss": 0.5518,
"mean_token_accuracy": 0.8167815553024411,
"num_tokens": 46398974.0,
"step": 108
},
{
"entropy": 0.462677001953125,
"epoch": 0.43253968253968256,
"grad_norm": 1.3222167500569877,
"learning_rate": 9.767303844766118e-06,
"loss": 0.5548,
"mean_token_accuracy": 0.8168724188581109,
"num_tokens": 46837899.0,
"step": 109
},
{
"entropy": 0.460693359375,
"epoch": 0.4365079365079365,
"grad_norm": 1.3681492766242778,
"learning_rate": 9.760661825295068e-06,
"loss": 0.5623,
"mean_token_accuracy": 0.8150366581976414,
"num_tokens": 47311746.0,
"step": 110
},
{
"entropy": 0.466400146484375,
"epoch": 0.44047619047619047,
"grad_norm": 1.344685621979837,
"learning_rate": 9.753928664046055e-06,
"loss": 0.5392,
"mean_token_accuracy": 0.822113991715014,
"num_tokens": 47744340.0,
"step": 111
},
{
"entropy": 0.4608154296875,
"epoch": 0.4444444444444444,
"grad_norm": 1.3313641531925076,
"learning_rate": 9.747104489923907e-06,
"loss": 0.5335,
"mean_token_accuracy": 0.8225171025842428,
"num_tokens": 48177761.0,
"step": 112
},
{
"entropy": 0.4722900390625,
"epoch": 0.44841269841269843,
"grad_norm": 1.5485087292600126,
"learning_rate": 9.740189433575873e-06,
"loss": 0.5511,
"mean_token_accuracy": 0.8177419500425458,
"num_tokens": 48604700.0,
"step": 113
},
{
"entropy": 0.474884033203125,
"epoch": 0.4523809523809524,
"grad_norm": 1.3287727548949633,
"learning_rate": 9.733183627389117e-06,
"loss": 0.5349,
"mean_token_accuracy": 0.8249012846499681,
"num_tokens": 49026375.0,
"step": 114
},
{
"entropy": 0.461212158203125,
"epoch": 0.45634920634920634,
"grad_norm": 1.4235893278514111,
"learning_rate": 9.726087205488192e-06,
"loss": 0.5488,
"mean_token_accuracy": 0.8166424483060837,
"num_tokens": 49467267.0,
"step": 115
},
{
"entropy": 0.47381591796875,
"epoch": 0.4603174603174603,
"grad_norm": 1.255433079679792,
"learning_rate": 9.718900303732465e-06,
"loss": 0.5467,
"mean_token_accuracy": 0.8177134236320853,
"num_tokens": 49889163.0,
"step": 116
},
{
"entropy": 0.476165771484375,
"epoch": 0.4642857142857143,
"grad_norm": 1.2666755263949114,
"learning_rate": 9.711623059713522e-06,
"loss": 0.5284,
"mean_token_accuracy": 0.82161083817482,
"num_tokens": 50300460.0,
"step": 117
},
{
"entropy": 0.470458984375,
"epoch": 0.46825396825396826,
"grad_norm": 1.7054143182470258,
"learning_rate": 9.70425561275253e-06,
"loss": 0.553,
"mean_token_accuracy": 0.8204147005453706,
"num_tokens": 50735361.0,
"step": 118
},
{
"entropy": 0.47528076171875,
"epoch": 0.4722222222222222,
"grad_norm": 1.2776820782333425,
"learning_rate": 9.696798103897567e-06,
"loss": 0.5344,
"mean_token_accuracy": 0.821893903426826,
"num_tokens": 51149122.0,
"step": 119
},
{
"entropy": 0.469268798828125,
"epoch": 0.47619047619047616,
"grad_norm": 1.1855022647806321,
"learning_rate": 9.689250675920932e-06,
"loss": 0.5371,
"mean_token_accuracy": 0.8207768378779292,
"num_tokens": 51597577.0,
"step": 120
},
{
"entropy": 0.461181640625,
"epoch": 0.4801587301587302,
"grad_norm": 1.3061024406164452,
"learning_rate": 9.6816134733164e-06,
"loss": 0.5419,
"mean_token_accuracy": 0.8211635444313288,
"num_tokens": 52043666.0,
"step": 121
},
{
"entropy": 0.4639892578125,
"epoch": 0.48412698412698413,
"grad_norm": 1.278485982326175,
"learning_rate": 9.67388664229646e-06,
"loss": 0.5457,
"mean_token_accuracy": 0.8210693299770355,
"num_tokens": 52482382.0,
"step": 122
},
{
"entropy": 0.466400146484375,
"epoch": 0.4880952380952381,
"grad_norm": 1.3159497560386597,
"learning_rate": 9.66607033078952e-06,
"loss": 0.5399,
"mean_token_accuracy": 0.8193363519385457,
"num_tokens": 52931115.0,
"step": 123
},
{
"entropy": 0.462371826171875,
"epoch": 0.49206349206349204,
"grad_norm": 1.3013445808543571,
"learning_rate": 9.658164688437073e-06,
"loss": 0.5431,
"mean_token_accuracy": 0.8198595689609647,
"num_tokens": 53370750.0,
"step": 124
},
{
"entropy": 0.470245361328125,
"epoch": 0.49603174603174605,
"grad_norm": 1.2502745654553475,
"learning_rate": 9.65016986659082e-06,
"loss": 0.5352,
"mean_token_accuracy": 0.8216186631470919,
"num_tokens": 53798951.0,
"step": 125
},
{
"entropy": 0.460723876953125,
"epoch": 0.5,
"grad_norm": 1.4425212147696118,
"learning_rate": 9.642086018309798e-06,
"loss": 0.528,
"mean_token_accuracy": 0.8253877777606249,
"num_tokens": 54235189.0,
"step": 126
},
{
"entropy": 0.463043212890625,
"epoch": 0.503968253968254,
"grad_norm": 1.190227347104015,
"learning_rate": 9.63391329835742e-06,
"loss": 0.5215,
"mean_token_accuracy": 0.825776319950819,
"num_tokens": 54642925.0,
"step": 127
},
{
"entropy": 0.470428466796875,
"epoch": 0.5079365079365079,
"grad_norm": 1.3119200133443487,
"learning_rate": 9.625651863198538e-06,
"loss": 0.5361,
"mean_token_accuracy": 0.8217763127759099,
"num_tokens": 55066936.0,
"step": 128
},
{
"entropy": 0.475128173828125,
"epoch": 0.5119047619047619,
"grad_norm": 1.2559808601225464,
"learning_rate": 9.617301870996432e-06,
"loss": 0.5271,
"mean_token_accuracy": 0.8248334173113108,
"num_tokens": 55484500.0,
"step": 129
},
{
"entropy": 0.45751953125,
"epoch": 0.5158730158730159,
"grad_norm": 1.2089833762472606,
"learning_rate": 9.608863481609784e-06,
"loss": 0.5333,
"mean_token_accuracy": 0.8226035898551345,
"num_tokens": 55922405.0,
"step": 130
},
{
"entropy": 0.4698486328125,
"epoch": 0.5198412698412699,
"grad_norm": 1.311622726348439,
"learning_rate": 9.600336856589622e-06,
"loss": 0.542,
"mean_token_accuracy": 0.8179264310747385,
"num_tokens": 56355834.0,
"step": 131
},
{
"entropy": 0.469024658203125,
"epoch": 0.5238095238095238,
"grad_norm": 1.370201408190726,
"learning_rate": 9.591722159176229e-06,
"loss": 0.5209,
"mean_token_accuracy": 0.8256417205557227,
"num_tokens": 56770275.0,
"step": 132
},
{
"entropy": 0.467926025390625,
"epoch": 0.5277777777777778,
"grad_norm": 1.4107765499615386,
"learning_rate": 9.583019554296004e-06,
"loss": 0.54,
"mean_token_accuracy": 0.8201555293053389,
"num_tokens": 57203160.0,
"step": 133
},
{
"entropy": 0.469207763671875,
"epoch": 0.5317460317460317,
"grad_norm": 1.2667343919182794,
"learning_rate": 9.574229208558322e-06,
"loss": 0.535,
"mean_token_accuracy": 0.8202388240024447,
"num_tokens": 57627870.0,
"step": 134
},
{
"entropy": 0.46697998046875,
"epoch": 0.5357142857142857,
"grad_norm": 1.4012228207534334,
"learning_rate": 9.565351290252339e-06,
"loss": 0.5335,
"mean_token_accuracy": 0.8244267264381051,
"num_tokens": 58059792.0,
"step": 135
},
{
"entropy": 0.4700927734375,
"epoch": 0.5396825396825397,
"grad_norm": 1.2541013251161421,
"learning_rate": 9.556385969343756e-06,
"loss": 0.5178,
"mean_token_accuracy": 0.8261177660897374,
"num_tokens": 58469884.0,
"step": 136
},
{
"entropy": 0.460784912109375,
"epoch": 0.5436507936507936,
"grad_norm": 1.266853697510061,
"learning_rate": 9.547333417471589e-06,
"loss": 0.5218,
"mean_token_accuracy": 0.824421800673008,
"num_tokens": 58908403.0,
"step": 137
},
{
"entropy": 0.467498779296875,
"epoch": 0.5476190476190477,
"grad_norm": 1.649666578399019,
"learning_rate": 9.538193807944864e-06,
"loss": 0.5251,
"mean_token_accuracy": 0.8241150714457035,
"num_tokens": 59323796.0,
"step": 138
},
{
"entropy": 0.461883544921875,
"epoch": 0.5515873015873016,
"grad_norm": 1.2782211754106552,
"learning_rate": 9.528967315739308e-06,
"loss": 0.5231,
"mean_token_accuracy": 0.8241786258295178,
"num_tokens": 59751885.0,
"step": 139
},
{
"entropy": 0.464080810546875,
"epoch": 0.5555555555555556,
"grad_norm": 1.1911969994875058,
"learning_rate": 9.519654117493996e-06,
"loss": 0.5093,
"mean_token_accuracy": 0.8299755034968257,
"num_tokens": 60183841.0,
"step": 140
},
{
"entropy": 0.467681884765625,
"epoch": 0.5595238095238095,
"grad_norm": 1.21584451360531,
"learning_rate": 9.510254391507971e-06,
"loss": 0.5323,
"mean_token_accuracy": 0.8225418599322438,
"num_tokens": 60605801.0,
"step": 141
},
{
"entropy": 0.465789794921875,
"epoch": 0.5634920634920635,
"grad_norm": 1.1387453790165247,
"learning_rate": 9.500768317736832e-06,
"loss": 0.527,
"mean_token_accuracy": 0.8241681484505534,
"num_tokens": 61048601.0,
"step": 142
},
{
"entropy": 0.47747802734375,
"epoch": 0.5674603174603174,
"grad_norm": 1.1374751119159374,
"learning_rate": 9.49119607778928e-06,
"loss": 0.5235,
"mean_token_accuracy": 0.8259162092581391,
"num_tokens": 61446873.0,
"step": 143
},
{
"entropy": 0.4652099609375,
"epoch": 0.5714285714285714,
"grad_norm": 1.2648801316634823,
"learning_rate": 9.481537854923654e-06,
"loss": 0.5352,
"mean_token_accuracy": 0.8220484433695674,
"num_tokens": 61887912.0,
"step": 144
},
{
"entropy": 0.47418212890625,
"epoch": 0.5753968253968254,
"grad_norm": 1.113220988507023,
"learning_rate": 9.471793834044416e-06,
"loss": 0.5236,
"mean_token_accuracy": 0.8275265209376812,
"num_tokens": 62316051.0,
"step": 145
},
{
"entropy": 0.459381103515625,
"epoch": 0.5793650793650794,
"grad_norm": 1.1782022702716075,
"learning_rate": 9.461964201698604e-06,
"loss": 0.5239,
"mean_token_accuracy": 0.8253972074016929,
"num_tokens": 62741342.0,
"step": 146
},
{
"entropy": 0.464813232421875,
"epoch": 0.5833333333333334,
"grad_norm": 1.3055908871865158,
"learning_rate": 9.452049146072278e-06,
"loss": 0.5217,
"mean_token_accuracy": 0.8288997933268547,
"num_tokens": 63164890.0,
"step": 147
},
{
"entropy": 0.4561767578125,
"epoch": 0.5873015873015873,
"grad_norm": 1.250402921918011,
"learning_rate": 9.442048856986899e-06,
"loss": 0.5244,
"mean_token_accuracy": 0.825376064516604,
"num_tokens": 63594617.0,
"step": 148
},
{
"entropy": 0.45916748046875,
"epoch": 0.5912698412698413,
"grad_norm": 1.2512378704930547,
"learning_rate": 9.431963525895709e-06,
"loss": 0.5332,
"mean_token_accuracy": 0.8236651951447129,
"num_tokens": 64050293.0,
"step": 149
},
{
"entropy": 0.45831298828125,
"epoch": 0.5952380952380952,
"grad_norm": 1.2800747002600605,
"learning_rate": 9.421793345880055e-06,
"loss": 0.508,
"mean_token_accuracy": 0.8307171342894435,
"num_tokens": 64467695.0,
"step": 150
},
{
"entropy": 0.4619140625,
"epoch": 0.5992063492063492,
"grad_norm": 1.22106067792139,
"learning_rate": 9.4115385116457e-06,
"loss": 0.5273,
"mean_token_accuracy": 0.8228645129129291,
"num_tokens": 64908198.0,
"step": 151
},
{
"entropy": 0.465362548828125,
"epoch": 0.6031746031746031,
"grad_norm": 1.6011741702601825,
"learning_rate": 9.401199219519088e-06,
"loss": 0.5189,
"mean_token_accuracy": 0.8247488467022777,
"num_tokens": 65333788.0,
"step": 152
},
{
"entropy": 0.47772216796875,
"epoch": 0.6071428571428571,
"grad_norm": 1.289619416717165,
"learning_rate": 9.390775667443602e-06,
"loss": 0.5092,
"mean_token_accuracy": 0.8292458476498723,
"num_tokens": 65748782.0,
"step": 153
},
{
"entropy": 0.463470458984375,
"epoch": 0.6111111111111112,
"grad_norm": 1.3540556513064608,
"learning_rate": 9.380268054975745e-06,
"loss": 0.5249,
"mean_token_accuracy": 0.823799098841846,
"num_tokens": 66178918.0,
"step": 154
},
{
"entropy": 0.467132568359375,
"epoch": 0.6150793650793651,
"grad_norm": 1.441163655667528,
"learning_rate": 9.36967658328135e-06,
"loss": 0.5339,
"mean_token_accuracy": 0.825651915743947,
"num_tokens": 66603248.0,
"step": 155
},
{
"entropy": 0.4588623046875,
"epoch": 0.6190476190476191,
"grad_norm": 1.2757900624701248,
"learning_rate": 9.359001455131713e-06,
"loss": 0.5205,
"mean_token_accuracy": 0.8264942672103643,
"num_tokens": 67052342.0,
"step": 156
},
{
"entropy": 0.457855224609375,
"epoch": 0.623015873015873,
"grad_norm": 1.3280329459811233,
"learning_rate": 9.34824287489971e-06,
"loss": 0.5167,
"mean_token_accuracy": 0.8265606937929988,
"num_tokens": 67476890.0,
"step": 157
},
{
"entropy": 0.4544677734375,
"epoch": 0.626984126984127,
"grad_norm": 1.4362643018863588,
"learning_rate": 9.337401048555892e-06,
"loss": 0.5184,
"mean_token_accuracy": 0.8287814203649759,
"num_tokens": 67913391.0,
"step": 158
},
{
"entropy": 0.4598388671875,
"epoch": 0.6309523809523809,
"grad_norm": 1.8377059083752896,
"learning_rate": 9.326476183664535e-06,
"loss": 0.5086,
"mean_token_accuracy": 0.8302426496520638,
"num_tokens": 68339443.0,
"step": 159
},
{
"entropy": 0.457611083984375,
"epoch": 0.6349206349206349,
"grad_norm": 1.2472914610462977,
"learning_rate": 9.315468489379668e-06,
"loss": 0.5242,
"mean_token_accuracy": 0.8252703994512558,
"num_tokens": 68772115.0,
"step": 160
},
{
"entropy": 0.454376220703125,
"epoch": 0.6388888888888888,
"grad_norm": 1.0940363704932208,
"learning_rate": 9.304378176441076e-06,
"loss": 0.5094,
"mean_token_accuracy": 0.8273925203830004,
"num_tokens": 69198272.0,
"step": 161
},
{
"entropy": 0.456268310546875,
"epoch": 0.6428571428571429,
"grad_norm": 1.250494594040658,
"learning_rate": 9.29320545717025e-06,
"loss": 0.5044,
"mean_token_accuracy": 0.8318730751052499,
"num_tokens": 69611653.0,
"step": 162
},
{
"entropy": 0.4644775390625,
"epoch": 0.6468253968253969,
"grad_norm": 1.3758890462061453,
"learning_rate": 9.281950545466336e-06,
"loss": 0.5375,
"mean_token_accuracy": 0.8206725753843784,
"num_tokens": 70054917.0,
"step": 163
},
{
"entropy": 0.451385498046875,
"epoch": 0.6507936507936508,
"grad_norm": 1.2229845865238094,
"learning_rate": 9.27061365680204e-06,
"loss": 0.5148,
"mean_token_accuracy": 0.8290882222354412,
"num_tokens": 70496952.0,
"step": 164
},
{
"entropy": 0.452728271484375,
"epoch": 0.6547619047619048,
"grad_norm": 1.310715081152188,
"learning_rate": 9.25919500821949e-06,
"loss": 0.5108,
"mean_token_accuracy": 0.8279124954715371,
"num_tokens": 70919899.0,
"step": 165
},
{
"entropy": 0.45574951171875,
"epoch": 0.6587301587301587,
"grad_norm": 1.2675730907362597,
"learning_rate": 9.247694818326092e-06,
"loss": 0.5111,
"mean_token_accuracy": 0.8315063090994954,
"num_tokens": 71343921.0,
"step": 166
},
{
"entropy": 0.44989013671875,
"epoch": 0.6626984126984127,
"grad_norm": 1.3386162279647864,
"learning_rate": 9.236113307290345e-06,
"loss": 0.5343,
"mean_token_accuracy": 0.821853213943541,
"num_tokens": 71808905.0,
"step": 167
},
{
"entropy": 0.45709228515625,
"epoch": 0.6666666666666666,
"grad_norm": 1.2417954424606619,
"learning_rate": 9.224450696837617e-06,
"loss": 0.5137,
"mean_token_accuracy": 0.8275673342868686,
"num_tokens": 72240608.0,
"step": 168
},
{
"entropy": 0.4530029296875,
"epoch": 0.6706349206349206,
"grad_norm": 1.2477554302346368,
"learning_rate": 9.212707210245908e-06,
"loss": 0.505,
"mean_token_accuracy": 0.8292029527947307,
"num_tokens": 72668688.0,
"step": 169
},
{
"entropy": 0.453826904296875,
"epoch": 0.6746031746031746,
"grad_norm": 1.2403145708249377,
"learning_rate": 9.200883072341573e-06,
"loss": 0.5194,
"mean_token_accuracy": 0.8281446853652596,
"num_tokens": 73118452.0,
"step": 170
},
{
"entropy": 0.45068359375,
"epoch": 0.6785714285714286,
"grad_norm": 1.2242088741534112,
"learning_rate": 9.188978509495022e-06,
"loss": 0.5228,
"mean_token_accuracy": 0.8244192777201533,
"num_tokens": 73569120.0,
"step": 171
},
{
"entropy": 0.448516845703125,
"epoch": 0.6825396825396826,
"grad_norm": 1.4410441359720512,
"learning_rate": 9.176993749616374e-06,
"loss": 0.5148,
"mean_token_accuracy": 0.8254242306575179,
"num_tokens": 73991069.0,
"step": 172
},
{
"entropy": 0.457122802734375,
"epoch": 0.6865079365079365,
"grad_norm": 1.4617287104899703,
"learning_rate": 9.164929022151106e-06,
"loss": 0.506,
"mean_token_accuracy": 0.8297470537945628,
"num_tokens": 74406271.0,
"step": 173
},
{
"entropy": 0.457122802734375,
"epoch": 0.6904761904761905,
"grad_norm": 1.2946096899912363,
"learning_rate": 9.15278455807566e-06,
"loss": 0.5163,
"mean_token_accuracy": 0.8275650115683675,
"num_tokens": 74839901.0,
"step": 174
},
{
"entropy": 0.451202392578125,
"epoch": 0.6944444444444444,
"grad_norm": 1.2168830292282429,
"learning_rate": 9.140560589893012e-06,
"loss": 0.5088,
"mean_token_accuracy": 0.8290477497503161,
"num_tokens": 75280578.0,
"step": 175
},
{
"entropy": 0.45111083984375,
"epoch": 0.6984126984126984,
"grad_norm": 1.1964525447125613,
"learning_rate": 9.128257351628224e-06,
"loss": 0.5346,
"mean_token_accuracy": 0.8231356684118509,
"num_tokens": 75749725.0,
"step": 176
},
{
"entropy": 0.456024169921875,
"epoch": 0.7023809523809523,
"grad_norm": 1.2104495744651753,
"learning_rate": 9.115875078823975e-06,
"loss": 0.5188,
"mean_token_accuracy": 0.8278255322948098,
"num_tokens": 76175668.0,
"step": 177
},
{
"entropy": 0.45965576171875,
"epoch": 0.7063492063492064,
"grad_norm": 1.1865163712517055,
"learning_rate": 9.103414008536029e-06,
"loss": 0.5111,
"mean_token_accuracy": 0.8277882896363735,
"num_tokens": 76593690.0,
"step": 178
},
{
"entropy": 0.458587646484375,
"epoch": 0.7103174603174603,
"grad_norm": 1.6965519987597353,
"learning_rate": 9.09087437932872e-06,
"loss": 0.5015,
"mean_token_accuracy": 0.8323444193229079,
"num_tokens": 77009261.0,
"step": 179
},
{
"entropy": 0.454925537109375,
"epoch": 0.7142857142857143,
"grad_norm": 1.2650031464495928,
"learning_rate": 9.07825643127037e-06,
"loss": 0.5157,
"mean_token_accuracy": 0.8258270686492324,
"num_tokens": 77431030.0,
"step": 180
},
{
"entropy": 0.447906494140625,
"epoch": 0.7182539682539683,
"grad_norm": 1.1859012409189014,
"learning_rate": 9.065560405928699e-06,
"loss": 0.5023,
"mean_token_accuracy": 0.8294160980731249,
"num_tokens": 77852655.0,
"step": 181
},
{
"entropy": 0.45416259765625,
"epoch": 0.7222222222222222,
"grad_norm": 1.176919606678633,
"learning_rate": 9.0527865463662e-06,
"loss": 0.5162,
"mean_token_accuracy": 0.8275531772524118,
"num_tokens": 78278605.0,
"step": 182
},
{
"entropy": 0.4486083984375,
"epoch": 0.7261904761904762,
"grad_norm": 1.2918709531705708,
"learning_rate": 9.039935097135479e-06,
"loss": 0.5024,
"mean_token_accuracy": 0.8300044005736709,
"num_tokens": 78721098.0,
"step": 183
},
{
"entropy": 0.454345703125,
"epoch": 0.7301587301587301,
"grad_norm": 1.3064400710795658,
"learning_rate": 9.027006304274584e-06,
"loss": 0.5096,
"mean_token_accuracy": 0.8292623031884432,
"num_tokens": 79154216.0,
"step": 184
},
{
"entropy": 0.44927978515625,
"epoch": 0.7341269841269841,
"grad_norm": 1.2696774197334444,
"learning_rate": 9.014000415302286e-06,
"loss": 0.5139,
"mean_token_accuracy": 0.8276010407134891,
"num_tokens": 79599332.0,
"step": 185
},
{
"entropy": 0.45220947265625,
"epoch": 0.7380952380952381,
"grad_norm": 1.2548327381579976,
"learning_rate": 9.000917679213344e-06,
"loss": 0.5196,
"mean_token_accuracy": 0.8274355586618185,
"num_tokens": 80039204.0,
"step": 186
},
{
"entropy": 0.4434814453125,
"epoch": 0.7420634920634921,
"grad_norm": 1.180213420756775,
"learning_rate": 8.987758346473739e-06,
"loss": 0.503,
"mean_token_accuracy": 0.8305716142058372,
"num_tokens": 80472128.0,
"step": 187
},
{
"entropy": 0.449005126953125,
"epoch": 0.746031746031746,
"grad_norm": 1.2928756233384209,
"learning_rate": 8.974522669015872e-06,
"loss": 0.5174,
"mean_token_accuracy": 0.8274647342041135,
"num_tokens": 80910348.0,
"step": 188
},
{
"entropy": 0.448822021484375,
"epoch": 0.75,
"grad_norm": 1.153866561909503,
"learning_rate": 8.961210900233757e-06,
"loss": 0.5101,
"mean_token_accuracy": 0.8277234118431807,
"num_tokens": 81336350.0,
"step": 189
},
{
"entropy": 0.44439697265625,
"epoch": 0.753968253968254,
"grad_norm": 1.215655128934687,
"learning_rate": 8.947823294978147e-06,
"loss": 0.509,
"mean_token_accuracy": 0.8286535432562232,
"num_tokens": 81765325.0,
"step": 190
},
{
"entropy": 0.461395263671875,
"epoch": 0.7579365079365079,
"grad_norm": 1.4210713418222345,
"learning_rate": 8.934360109551671e-06,
"loss": 0.5106,
"mean_token_accuracy": 0.8299150029197335,
"num_tokens": 82191876.0,
"step": 191
},
{
"entropy": 0.4591064453125,
"epoch": 0.7619047619047619,
"grad_norm": 1.319721918446663,
"learning_rate": 8.920821601703927e-06,
"loss": 0.4913,
"mean_token_accuracy": 0.8329328633844852,
"num_tokens": 82611125.0,
"step": 192
},
{
"entropy": 0.453155517578125,
"epoch": 0.7658730158730159,
"grad_norm": 1.3201749647251046,
"learning_rate": 8.907208030626538e-06,
"loss": 0.5129,
"mean_token_accuracy": 0.8259176956489682,
"num_tokens": 83051815.0,
"step": 193
},
{
"entropy": 0.4512939453125,
"epoch": 0.7698412698412699,
"grad_norm": 1.1719138701614786,
"learning_rate": 8.8935196569482e-06,
"loss": 0.5079,
"mean_token_accuracy": 0.8282450577244163,
"num_tokens": 83488021.0,
"step": 194
},
{
"entropy": 0.456451416015625,
"epoch": 0.7738095238095238,
"grad_norm": 1.2391988296172292,
"learning_rate": 8.879756742729683e-06,
"loss": 0.5074,
"mean_token_accuracy": 0.827914453111589,
"num_tokens": 83902519.0,
"step": 195
},
{
"entropy": 0.450653076171875,
"epoch": 0.7777777777777778,
"grad_norm": 1.2037962698085334,
"learning_rate": 8.865919551458823e-06,
"loss": 0.505,
"mean_token_accuracy": 0.8286258336156607,
"num_tokens": 84321775.0,
"step": 196
},
{
"entropy": 0.44927978515625,
"epoch": 0.7817460317460317,
"grad_norm": 1.1617039305620294,
"learning_rate": 8.852008348045468e-06,
"loss": 0.5019,
"mean_token_accuracy": 0.8323168307542801,
"num_tokens": 84745911.0,
"step": 197
},
{
"entropy": 0.451751708984375,
"epoch": 0.7857142857142857,
"grad_norm": 1.149795910244863,
"learning_rate": 8.838023398816417e-06,
"loss": 0.4857,
"mean_token_accuracy": 0.8362782001495361,
"num_tokens": 85167087.0,
"step": 198
},
{
"entropy": 0.4635009765625,
"epoch": 0.7896825396825397,
"grad_norm": 1.1483411264804027,
"learning_rate": 8.823964971510313e-06,
"loss": 0.5075,
"mean_token_accuracy": 0.8307431424036622,
"num_tokens": 85588482.0,
"step": 199
},
{
"entropy": 0.444122314453125,
"epoch": 0.7936507936507936,
"grad_norm": 1.0935254315768266,
"learning_rate": 8.809833335272517e-06,
"loss": 0.5054,
"mean_token_accuracy": 0.8298458913341165,
"num_tokens": 86009383.0,
"step": 200
},
{
"entropy": 0.4493408203125,
"epoch": 0.7976190476190477,
"grad_norm": 1.1018546509681295,
"learning_rate": 8.795628760649965e-06,
"loss": 0.5106,
"mean_token_accuracy": 0.8295301357284188,
"num_tokens": 86449600.0,
"step": 201
},
{
"entropy": 0.450439453125,
"epoch": 0.8015873015873016,
"grad_norm": 1.306183682510968,
"learning_rate": 8.781351519585978e-06,
"loss": 0.4886,
"mean_token_accuracy": 0.8344141785055399,
"num_tokens": 86862628.0,
"step": 202
},
{
"entropy": 0.449676513671875,
"epoch": 0.8055555555555556,
"grad_norm": 1.0824265526588595,
"learning_rate": 8.767001885415055e-06,
"loss": 0.5054,
"mean_token_accuracy": 0.8296528598293662,
"num_tokens": 87295233.0,
"step": 203
},
{
"entropy": 0.449310302734375,
"epoch": 0.8095238095238095,
"grad_norm": 1.216483297181918,
"learning_rate": 8.752580132857652e-06,
"loss": 0.4987,
"mean_token_accuracy": 0.8328232821077108,
"num_tokens": 87713395.0,
"step": 204
},
{
"entropy": 0.4515380859375,
"epoch": 0.8134920634920635,
"grad_norm": 1.1371633597502904,
"learning_rate": 8.73808653801491e-06,
"loss": 0.5216,
"mean_token_accuracy": 0.8253697715699673,
"num_tokens": 88158822.0,
"step": 205
},
{
"entropy": 0.44964599609375,
"epoch": 0.8174603174603174,
"grad_norm": 1.2076012965398912,
"learning_rate": 8.723521378363378e-06,
"loss": 0.5049,
"mean_token_accuracy": 0.8300966452807188,
"num_tokens": 88602545.0,
"step": 206
},
{
"entropy": 0.45513916015625,
"epoch": 0.8214285714285714,
"grad_norm": 1.1637271792413393,
"learning_rate": 8.70888493274969e-06,
"loss": 0.4854,
"mean_token_accuracy": 0.8374869581311941,
"num_tokens": 89025796.0,
"step": 207
},
{
"entropy": 0.44927978515625,
"epoch": 0.8253968253968254,
"grad_norm": 1.1305189795680015,
"learning_rate": 8.694177481385244e-06,
"loss": 0.5061,
"mean_token_accuracy": 0.8304181462153792,
"num_tokens": 89444255.0,
"step": 208
},
{
"entropy": 0.44769287109375,
"epoch": 0.8293650793650794,
"grad_norm": 1.065905888231706,
"learning_rate": 8.679399305840815e-06,
"loss": 0.511,
"mean_token_accuracy": 0.8329211305826902,
"num_tokens": 89894143.0,
"step": 209
},
{
"entropy": 0.448516845703125,
"epoch": 0.8333333333333334,
"grad_norm": 1.194800491826659,
"learning_rate": 8.664550689041187e-06,
"loss": 0.4704,
"mean_token_accuracy": 0.8389384057372808,
"num_tokens": 90312774.0,
"step": 210
},
{
"entropy": 0.451995849609375,
"epoch": 0.8373015873015873,
"grad_norm": 1.1324678388489409,
"learning_rate": 8.649631915259716e-06,
"loss": 0.4959,
"mean_token_accuracy": 0.832505133934319,
"num_tokens": 90741787.0,
"step": 211
},
{
"entropy": 0.444610595703125,
"epoch": 0.8412698412698413,
"grad_norm": 1.0451373377494304,
"learning_rate": 8.634643270112903e-06,
"loss": 0.4874,
"mean_token_accuracy": 0.8343986244872212,
"num_tokens": 91177447.0,
"step": 212
},
{
"entropy": 0.448516845703125,
"epoch": 0.8452380952380952,
"grad_norm": 1.1350367484478692,
"learning_rate": 8.61958504055492e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8339378647506237,
"num_tokens": 91607165.0,
"step": 213
},
{
"entropy": 0.45574951171875,
"epoch": 0.8492063492063492,
"grad_norm": 1.1435711522188763,
"learning_rate": 8.604457514872115e-06,
"loss": 0.4934,
"mean_token_accuracy": 0.8312076451256871,
"num_tokens": 92026164.0,
"step": 214
},
{
"entropy": 0.448028564453125,
"epoch": 0.8531746031746031,
"grad_norm": 1.210433236941165,
"learning_rate": 8.589260982677496e-06,
"loss": 0.4936,
"mean_token_accuracy": 0.8334163334220648,
"num_tokens": 92463989.0,
"step": 215
},
{
"entropy": 0.4459228515625,
"epoch": 0.8571428571428571,
"grad_norm": 1.2030101822851358,
"learning_rate": 8.573995734905185e-06,
"loss": 0.4917,
"mean_token_accuracy": 0.8336746180430055,
"num_tokens": 92891631.0,
"step": 216
},
{
"entropy": 0.4539794921875,
"epoch": 0.8611111111111112,
"grad_norm": 1.0466701342650107,
"learning_rate": 8.558662063804843e-06,
"loss": 0.5039,
"mean_token_accuracy": 0.8325941441580653,
"num_tokens": 93322969.0,
"step": 217
},
{
"entropy": 0.448883056640625,
"epoch": 0.8650793650793651,
"grad_norm": 1.3569379184552983,
"learning_rate": 8.543260262936087e-06,
"loss": 0.4942,
"mean_token_accuracy": 0.8330146428197622,
"num_tokens": 93760535.0,
"step": 218
},
{
"entropy": 0.445465087890625,
"epoch": 0.8690476190476191,
"grad_norm": 1.1285395121488393,
"learning_rate": 8.527790627162858e-06,
"loss": 0.485,
"mean_token_accuracy": 0.835063835605979,
"num_tokens": 94172398.0,
"step": 219
},
{
"entropy": 0.450775146484375,
"epoch": 0.873015873015873,
"grad_norm": 1.2538705581876535,
"learning_rate": 8.512253452647783e-06,
"loss": 0.502,
"mean_token_accuracy": 0.8306903587654233,
"num_tokens": 94599260.0,
"step": 220
},
{
"entropy": 0.45660400390625,
"epoch": 0.876984126984127,
"grad_norm": 1.1551796563028132,
"learning_rate": 8.496649036846502e-06,
"loss": 0.4946,
"mean_token_accuracy": 0.8319389009848237,
"num_tokens": 95019433.0,
"step": 221
},
{
"entropy": 0.461669921875,
"epoch": 0.8809523809523809,
"grad_norm": 1.2009353491848689,
"learning_rate": 8.480977678501974e-06,
"loss": 0.4915,
"mean_token_accuracy": 0.8330316534265876,
"num_tokens": 95425799.0,
"step": 222
},
{
"entropy": 0.45465087890625,
"epoch": 0.8849206349206349,
"grad_norm": 1.0850199284929676,
"learning_rate": 8.465239677638755e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.8328907387331128,
"num_tokens": 95822890.0,
"step": 223
},
{
"entropy": 0.45330810546875,
"epoch": 0.8888888888888888,
"grad_norm": 1.4803897939108124,
"learning_rate": 8.449435335557264e-06,
"loss": 0.5054,
"mean_token_accuracy": 0.8312137639150023,
"num_tokens": 96260271.0,
"step": 224
},
{
"entropy": 0.443267822265625,
"epoch": 0.8928571428571429,
"grad_norm": 2.1079096762406238,
"learning_rate": 8.433564954828e-06,
"loss": 0.4991,
"mean_token_accuracy": 0.8311476595699787,
"num_tokens": 96696652.0,
"step": 225
},
{
"entropy": 0.450286865234375,
"epoch": 0.8968253968253969,
"grad_norm": 1.2706829768849834,
"learning_rate": 8.417628839285757e-06,
"loss": 0.4981,
"mean_token_accuracy": 0.8332603024318814,
"num_tokens": 97135925.0,
"step": 226
},
{
"entropy": 0.45703125,
"epoch": 0.9007936507936508,
"grad_norm": 1.8201254601577819,
"learning_rate": 8.401627294023815e-06,
"loss": 0.5142,
"mean_token_accuracy": 0.828549837693572,
"num_tokens": 97573810.0,
"step": 227
},
{
"entropy": 0.447784423828125,
"epoch": 0.9047619047619048,
"grad_norm": 1.1241933043727534,
"learning_rate": 8.385560625388081e-06,
"loss": 0.4831,
"mean_token_accuracy": 0.8362022209912539,
"num_tokens": 98011108.0,
"step": 228
},
{
"entropy": 0.454071044921875,
"epoch": 0.9087301587301587,
"grad_norm": 1.1121125737189776,
"learning_rate": 8.369429140971239e-06,
"loss": 0.4811,
"mean_token_accuracy": 0.8338787518441677,
"num_tokens": 98441631.0,
"step": 229
},
{
"entropy": 0.457305908203125,
"epoch": 0.9126984126984127,
"grad_norm": 1.0458991032894815,
"learning_rate": 8.353233149606859e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8308598725125194,
"num_tokens": 98873029.0,
"step": 230
},
{
"entropy": 0.453765869140625,
"epoch": 0.9166666666666666,
"grad_norm": 1.2247678683157683,
"learning_rate": 8.336972961363472e-06,
"loss": 0.498,
"mean_token_accuracy": 0.8302338859066367,
"num_tokens": 99296106.0,
"step": 231
},
{
"entropy": 0.45703125,
"epoch": 0.9206349206349206,
"grad_norm": 1.2989134951116341,
"learning_rate": 8.320648887538657e-06,
"loss": 0.4957,
"mean_token_accuracy": 0.8315738271921873,
"num_tokens": 99734864.0,
"step": 232
},
{
"entropy": 0.45330810546875,
"epoch": 0.9246031746031746,
"grad_norm": 1.080222766722178,
"learning_rate": 8.304261240653054e-06,
"loss": 0.507,
"mean_token_accuracy": 0.8313342472538352,
"num_tokens": 100174517.0,
"step": 233
},
{
"entropy": 0.460418701171875,
"epoch": 0.9285714285714286,
"grad_norm": 1.1572509289153226,
"learning_rate": 8.287810334444406e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.8337559709325433,
"num_tokens": 100606926.0,
"step": 234
},
{
"entropy": 0.458404541015625,
"epoch": 0.9325396825396826,
"grad_norm": 1.1066868483832115,
"learning_rate": 8.271296483861532e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.835618756711483,
"num_tokens": 101020425.0,
"step": 235
},
{
"entropy": 0.45831298828125,
"epoch": 0.9365079365079365,
"grad_norm": 1.060730775603579,
"learning_rate": 8.254720005058317e-06,
"loss": 0.4912,
"mean_token_accuracy": 0.8332764646038413,
"num_tokens": 101447599.0,
"step": 236
},
{
"entropy": 0.458953857421875,
"epoch": 0.9404761904761905,
"grad_norm": 1.1471857859225785,
"learning_rate": 8.238081215387639e-06,
"loss": 0.4843,
"mean_token_accuracy": 0.8348336489871144,
"num_tokens": 101851986.0,
"step": 237
},
{
"entropy": 0.449310302734375,
"epoch": 0.9444444444444444,
"grad_norm": 1.1375613016443888,
"learning_rate": 8.221380433395308e-06,
"loss": 0.4934,
"mean_token_accuracy": 0.8338221423327923,
"num_tokens": 102275358.0,
"step": 238
},
{
"entropy": 0.459075927734375,
"epoch": 0.9484126984126984,
"grad_norm": 1.0708255333770056,
"learning_rate": 8.204617978813963e-06,
"loss": 0.4838,
"mean_token_accuracy": 0.8348392806947231,
"num_tokens": 102688415.0,
"step": 239
},
{
"entropy": 0.457061767578125,
"epoch": 0.9523809523809523,
"grad_norm": 1.269015813917946,
"learning_rate": 8.187794172556947e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.832873186096549,
"num_tokens": 103113293.0,
"step": 240
},
{
"entropy": 0.447052001953125,
"epoch": 0.9563492063492064,
"grad_norm": 1.2172067541370395,
"learning_rate": 8.170909336712171e-06,
"loss": 0.4934,
"mean_token_accuracy": 0.8310654619708657,
"num_tokens": 103566779.0,
"step": 241
},
{
"entropy": 0.442840576171875,
"epoch": 0.9603174603174603,
"grad_norm": 1.9614491486328336,
"learning_rate": 8.153963794535945e-06,
"loss": 0.4967,
"mean_token_accuracy": 0.8313373932614923,
"num_tokens": 104000550.0,
"step": 242
},
{
"entropy": 0.45098876953125,
"epoch": 0.9642857142857143,
"grad_norm": 1.2204808359509163,
"learning_rate": 8.136957870446779e-06,
"loss": 0.4998,
"mean_token_accuracy": 0.830800985917449,
"num_tokens": 104429372.0,
"step": 243
},
{
"entropy": 0.443115234375,
"epoch": 0.9682539682539683,
"grad_norm": 1.1287254868438927,
"learning_rate": 8.119891890019187e-06,
"loss": 0.486,
"mean_token_accuracy": 0.8366484735161066,
"num_tokens": 104859286.0,
"step": 244
},
{
"entropy": 0.45599365234375,
"epoch": 0.9722222222222222,
"grad_norm": 1.1632405758479503,
"learning_rate": 8.102766179977452e-06,
"loss": 0.4954,
"mean_token_accuracy": 0.83047538343817,
"num_tokens": 105281017.0,
"step": 245
},
{
"entropy": 0.44305419921875,
"epoch": 0.9761904761904762,
"grad_norm": 1.0531020537734286,
"learning_rate": 8.085581068189358e-06,
"loss": 0.4875,
"mean_token_accuracy": 0.83509177621454,
"num_tokens": 105729675.0,
"step": 246
},
{
"entropy": 0.444549560546875,
"epoch": 0.9801587301587301,
"grad_norm": 1.136500203665195,
"learning_rate": 8.068336883659926e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.8322630152106285,
"num_tokens": 106168119.0,
"step": 247
},
{
"entropy": 0.442291259765625,
"epoch": 0.9841269841269841,
"grad_norm": 1.0192188396085724,
"learning_rate": 8.051033956525113e-06,
"loss": 0.484,
"mean_token_accuracy": 0.8352002650499344,
"num_tokens": 106603968.0,
"step": 248
},
{
"entropy": 0.439788818359375,
"epoch": 0.9880952380952381,
"grad_norm": 1.1049532946463114,
"learning_rate": 8.033672618045485e-06,
"loss": 0.492,
"mean_token_accuracy": 0.8354252576828003,
"num_tokens": 107054152.0,
"step": 249
},
{
"entropy": 0.440826416015625,
"epoch": 0.9920634920634921,
"grad_norm": 1.0599713800274446,
"learning_rate": 8.016253200599885e-06,
"loss": 0.4782,
"mean_token_accuracy": 0.8366458043456078,
"num_tokens": 107495007.0,
"step": 250
},
{
"entropy": 0.447113037109375,
"epoch": 0.996031746031746,
"grad_norm": 1.1844331984330863,
"learning_rate": 7.998776037679061e-06,
"loss": 0.4986,
"mean_token_accuracy": 0.8293369021266699,
"num_tokens": 107928758.0,
"step": 251
},
{
"entropy": 0.441619873046875,
"epoch": 1.0,
"grad_norm": 1.0144603078826888,
"learning_rate": 7.981241463879284e-06,
"loss": 0.4922,
"mean_token_accuracy": 0.8354968074709177,
"num_tokens": 108364335.0,
"step": 252
},
{
"entropy": 0.46148681640625,
"epoch": 1.003968253968254,
"grad_norm": 1.1133036368721527,
"learning_rate": 7.963649814895945e-06,
"loss": 0.4675,
"mean_token_accuracy": 0.8393758479505777,
"num_tokens": 108775586.0,
"step": 253
},
{
"entropy": 0.452392578125,
"epoch": 1.007936507936508,
"grad_norm": 1.0079553576284006,
"learning_rate": 7.94600142751713e-06,
"loss": 0.4619,
"mean_token_accuracy": 0.8416725508868694,
"num_tokens": 109202665.0,
"step": 254
},
{
"entropy": 0.4403076171875,
"epoch": 1.0119047619047619,
"grad_norm": 1.0665471851715955,
"learning_rate": 7.92829663961716e-06,
"loss": 0.4616,
"mean_token_accuracy": 0.843192096799612,
"num_tokens": 109629975.0,
"step": 255
},
{
"entropy": 0.440765380859375,
"epoch": 1.0158730158730158,
"grad_norm": 1.0527949047806084,
"learning_rate": 7.910535790150135e-06,
"loss": 0.4684,
"mean_token_accuracy": 0.8393022352829576,
"num_tokens": 110061605.0,
"step": 256
},
{
"entropy": 0.443817138671875,
"epoch": 1.0198412698412698,
"grad_norm": 1.037337532935931,
"learning_rate": 7.892719219143446e-06,
"loss": 0.458,
"mean_token_accuracy": 0.842767583206296,
"num_tokens": 110487591.0,
"step": 257
},
{
"entropy": 0.4439697265625,
"epoch": 1.0238095238095237,
"grad_norm": 0.9282961355601993,
"learning_rate": 7.874847267691254e-06,
"loss": 0.4674,
"mean_token_accuracy": 0.8391132960096002,
"num_tokens": 110924491.0,
"step": 258
},
{
"entropy": 0.44256591796875,
"epoch": 1.0277777777777777,
"grad_norm": 1.0812964655312522,
"learning_rate": 7.856920277947969e-06,
"loss": 0.4666,
"mean_token_accuracy": 0.8417868306860328,
"num_tokens": 111351831.0,
"step": 259
},
{
"entropy": 0.442596435546875,
"epoch": 1.0317460317460316,
"grad_norm": 1.0004332183195612,
"learning_rate": 7.83893859312169e-06,
"loss": 0.4608,
"mean_token_accuracy": 0.840317826718092,
"num_tokens": 111773832.0,
"step": 260
},
{
"entropy": 0.441650390625,
"epoch": 1.0357142857142858,
"grad_norm": 1.0083023934199706,
"learning_rate": 7.820902557467648e-06,
"loss": 0.4546,
"mean_token_accuracy": 0.8436138844117522,
"num_tokens": 112210334.0,
"step": 261
},
{
"entropy": 0.440338134765625,
"epoch": 1.0396825396825398,
"grad_norm": 1.0205115508469926,
"learning_rate": 7.80281251628161e-06,
"loss": 0.4617,
"mean_token_accuracy": 0.8404037207365036,
"num_tokens": 112637470.0,
"step": 262
},
{
"entropy": 0.43670654296875,
"epoch": 1.0436507936507937,
"grad_norm": 1.1861875486087046,
"learning_rate": 7.784668815893256e-06,
"loss": 0.465,
"mean_token_accuracy": 0.8401956735178828,
"num_tokens": 113069179.0,
"step": 263
},
{
"entropy": 0.44329833984375,
"epoch": 1.0476190476190477,
"grad_norm": 1.0426651868796517,
"learning_rate": 7.766471803659571e-06,
"loss": 0.4725,
"mean_token_accuracy": 0.8395506730303168,
"num_tokens": 113501590.0,
"step": 264
},
{
"entropy": 0.440948486328125,
"epoch": 1.0515873015873016,
"grad_norm": 1.0688154361912685,
"learning_rate": 7.748221827958174e-06,
"loss": 0.463,
"mean_token_accuracy": 0.8411337668076158,
"num_tokens": 113935337.0,
"step": 265
},
{
"entropy": 0.44378662109375,
"epoch": 1.0555555555555556,
"grad_norm": 0.9973458392577903,
"learning_rate": 7.729919238180663e-06,
"loss": 0.4644,
"mean_token_accuracy": 0.8407721919938922,
"num_tokens": 114360533.0,
"step": 266
},
{
"entropy": 0.4375,
"epoch": 1.0595238095238095,
"grad_norm": 1.0128154565462195,
"learning_rate": 7.711564384725916e-06,
"loss": 0.456,
"mean_token_accuracy": 0.8427523402497172,
"num_tokens": 114792424.0,
"step": 267
},
{
"entropy": 0.43865966796875,
"epoch": 1.0634920634920635,
"grad_norm": 1.1237442568992235,
"learning_rate": 7.693157618993392e-06,
"loss": 0.4713,
"mean_token_accuracy": 0.8381293760612607,
"num_tokens": 115231953.0,
"step": 268
},
{
"entropy": 0.44390869140625,
"epoch": 1.0674603174603174,
"grad_norm": 0.9632813464843945,
"learning_rate": 7.674699293376397e-06,
"loss": 0.4606,
"mean_token_accuracy": 0.8414155915379524,
"num_tokens": 115664522.0,
"step": 269
},
{
"entropy": 0.439239501953125,
"epoch": 1.0714285714285714,
"grad_norm": 1.1143536721017135,
"learning_rate": 7.656189761255333e-06,
"loss": 0.4585,
"mean_token_accuracy": 0.8407229576259851,
"num_tokens": 116092221.0,
"step": 270
},
{
"entropy": 0.4417724609375,
"epoch": 1.0753968253968254,
"grad_norm": 1.0175840618853507,
"learning_rate": 7.63762937699095e-06,
"loss": 0.4619,
"mean_token_accuracy": 0.8408547407016158,
"num_tokens": 116534679.0,
"step": 271
},
{
"entropy": 0.4439697265625,
"epoch": 1.0793650793650793,
"grad_norm": 1.0025546600901896,
"learning_rate": 7.619018495917543e-06,
"loss": 0.4696,
"mean_token_accuracy": 0.8394848993048072,
"num_tokens": 116984739.0,
"step": 272
},
{
"entropy": 0.44073486328125,
"epoch": 1.0833333333333333,
"grad_norm": 1.0897542155601712,
"learning_rate": 7.600357474336157e-06,
"loss": 0.4662,
"mean_token_accuracy": 0.8403668319806457,
"num_tokens": 117413323.0,
"step": 273
},
{
"entropy": 0.4364013671875,
"epoch": 1.0873015873015872,
"grad_norm": 1.026521342719511,
"learning_rate": 7.581646669507768e-06,
"loss": 0.4631,
"mean_token_accuracy": 0.8399766776710749,
"num_tokens": 117852991.0,
"step": 274
},
{
"entropy": 0.4500732421875,
"epoch": 1.0912698412698412,
"grad_norm": 1.1089611631121021,
"learning_rate": 7.56288643964644e-06,
"loss": 0.4686,
"mean_token_accuracy": 0.8402743814513087,
"num_tokens": 118264477.0,
"step": 275
},
{
"entropy": 0.440032958984375,
"epoch": 1.0952380952380953,
"grad_norm": 1.1837449681611911,
"learning_rate": 7.544077143912467e-06,
"loss": 0.4596,
"mean_token_accuracy": 0.8443190716207027,
"num_tokens": 118696927.0,
"step": 276
},
{
"entropy": 0.43536376953125,
"epoch": 1.0992063492063493,
"grad_norm": 1.0567641917315522,
"learning_rate": 7.525219142405501e-06,
"loss": 0.4645,
"mean_token_accuracy": 0.8398779211565852,
"num_tokens": 119143061.0,
"step": 277
},
{
"entropy": 0.4447021484375,
"epoch": 1.1031746031746033,
"grad_norm": 1.0628873288461702,
"learning_rate": 7.506312796157649e-06,
"loss": 0.464,
"mean_token_accuracy": 0.8407185869291425,
"num_tokens": 119569613.0,
"step": 278
},
{
"entropy": 0.44366455078125,
"epoch": 1.1071428571428572,
"grad_norm": 1.3089788931081365,
"learning_rate": 7.487358467126573e-06,
"loss": 0.4666,
"mean_token_accuracy": 0.8411134304478765,
"num_tokens": 119990044.0,
"step": 279
},
{
"entropy": 0.4305419921875,
"epoch": 1.1111111111111112,
"grad_norm": 1.200277045741654,
"learning_rate": 7.468356518188551e-06,
"loss": 0.4687,
"mean_token_accuracy": 0.83890818990767,
"num_tokens": 120447089.0,
"step": 280
},
{
"entropy": 0.435943603515625,
"epoch": 1.1150793650793651,
"grad_norm": 1.065088410503753,
"learning_rate": 7.449307313131533e-06,
"loss": 0.4481,
"mean_token_accuracy": 0.846671967767179,
"num_tokens": 120882118.0,
"step": 281
},
{
"entropy": 0.4400634765625,
"epoch": 1.119047619047619,
"grad_norm": 1.0435830370708483,
"learning_rate": 7.4302112166481814e-06,
"loss": 0.4653,
"mean_token_accuracy": 0.8401108030229807,
"num_tokens": 121314886.0,
"step": 282
},
{
"entropy": 0.444610595703125,
"epoch": 1.123015873015873,
"grad_norm": 1.1498875512493505,
"learning_rate": 7.411068594328876e-06,
"loss": 0.4506,
"mean_token_accuracy": 0.8450519479811192,
"num_tokens": 121731396.0,
"step": 283
},
{
"entropy": 0.441192626953125,
"epoch": 1.126984126984127,
"grad_norm": 1.1037530140349723,
"learning_rate": 7.391879812654727e-06,
"loss": 0.4573,
"mean_token_accuracy": 0.8432380286976695,
"num_tokens": 122167617.0,
"step": 284
},
{
"entropy": 0.436553955078125,
"epoch": 1.130952380952381,
"grad_norm": 1.2008296365359707,
"learning_rate": 7.37264523899056e-06,
"loss": 0.4564,
"mean_token_accuracy": 0.8409950910136104,
"num_tokens": 122593508.0,
"step": 285
},
{
"entropy": 0.439788818359375,
"epoch": 1.1349206349206349,
"grad_norm": 1.1519884106136846,
"learning_rate": 7.353365241577869e-06,
"loss": 0.4606,
"mean_token_accuracy": 0.839851806871593,
"num_tokens": 123013840.0,
"step": 286
},
{
"entropy": 0.43341064453125,
"epoch": 1.1388888888888888,
"grad_norm": 1.0329372716274068,
"learning_rate": 7.3340401895277816e-06,
"loss": 0.4498,
"mean_token_accuracy": 0.8443695362657309,
"num_tokens": 123444043.0,
"step": 287
},
{
"entropy": 0.436676025390625,
"epoch": 1.1428571428571428,
"grad_norm": 1.0218663400951138,
"learning_rate": 7.314670452813982e-06,
"loss": 0.4503,
"mean_token_accuracy": 0.8440707307308912,
"num_tokens": 123876490.0,
"step": 288
},
{
"entropy": 0.44293212890625,
"epoch": 1.1468253968253967,
"grad_norm": 1.0595566545611714,
"learning_rate": 7.295256402265636e-06,
"loss": 0.4561,
"mean_token_accuracy": 0.841067879460752,
"num_tokens": 124297019.0,
"step": 289
},
{
"entropy": 0.44622802734375,
"epoch": 1.1507936507936507,
"grad_norm": 1.1333083345633674,
"learning_rate": 7.275798409560282e-06,
"loss": 0.4617,
"mean_token_accuracy": 0.8422295236960053,
"num_tokens": 124713314.0,
"step": 290
},
{
"entropy": 0.44403076171875,
"epoch": 1.1547619047619047,
"grad_norm": 1.1923827872697734,
"learning_rate": 7.256296847216727e-06,
"loss": 0.4573,
"mean_token_accuracy": 0.8406451418995857,
"num_tokens": 125125061.0,
"step": 291
},
{
"entropy": 0.440155029296875,
"epoch": 1.1587301587301586,
"grad_norm": 1.1646433646945646,
"learning_rate": 7.236752088587905e-06,
"loss": 0.4735,
"mean_token_accuracy": 0.8386099971830845,
"num_tokens": 125564746.0,
"step": 292
},
{
"entropy": 0.435272216796875,
"epoch": 1.1626984126984128,
"grad_norm": 1.1116112176497874,
"learning_rate": 7.217164507853734e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.8449215041473508,
"num_tokens": 125992351.0,
"step": 293
},
{
"entropy": 0.440032958984375,
"epoch": 1.1666666666666667,
"grad_norm": 1.0397784652565205,
"learning_rate": 7.197534480013951e-06,
"loss": 0.4515,
"mean_token_accuracy": 0.8436530968174338,
"num_tokens": 126415997.0,
"step": 294
},
{
"entropy": 0.44482421875,
"epoch": 1.1706349206349207,
"grad_norm": 1.129298764751686,
"learning_rate": 7.177862380880935e-06,
"loss": 0.4629,
"mean_token_accuracy": 0.841444781050086,
"num_tokens": 126851930.0,
"step": 295
},
{
"entropy": 0.44580078125,
"epoch": 1.1746031746031746,
"grad_norm": 1.0985527605182936,
"learning_rate": 7.158148587072509e-06,
"loss": 0.467,
"mean_token_accuracy": 0.8395384335890412,
"num_tokens": 127285760.0,
"step": 296
},
{
"entropy": 0.455108642578125,
"epoch": 1.1785714285714286,
"grad_norm": 1.2001077801428681,
"learning_rate": 7.138393476004725e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8372842157259583,
"num_tokens": 127724762.0,
"step": 297
},
{
"entropy": 0.43841552734375,
"epoch": 1.1825396825396826,
"grad_norm": 1.054003052207074,
"learning_rate": 7.118597425884659e-06,
"loss": 0.4523,
"mean_token_accuracy": 0.8465767158195376,
"num_tokens": 128153685.0,
"step": 298
},
{
"entropy": 0.443328857421875,
"epoch": 1.1865079365079365,
"grad_norm": 1.0655995217798397,
"learning_rate": 7.098760815703139e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.8448374746367335,
"num_tokens": 128574985.0,
"step": 299
},
{
"entropy": 0.452362060546875,
"epoch": 1.1904761904761905,
"grad_norm": 1.1076879019132861,
"learning_rate": 7.078884025227519e-06,
"loss": 0.4515,
"mean_token_accuracy": 0.8428602814674377,
"num_tokens": 128990738.0,
"step": 300
},
{
"entropy": 0.4468994140625,
"epoch": 1.1944444444444444,
"grad_norm": 1.096401426354454,
"learning_rate": 7.058967434994388e-06,
"loss": 0.4526,
"mean_token_accuracy": 0.8467154111713171,
"num_tokens": 129413253.0,
"step": 301
},
{
"entropy": 0.444061279296875,
"epoch": 1.1984126984126984,
"grad_norm": 0.9851920784045842,
"learning_rate": 7.0390114263022955e-06,
"loss": 0.474,
"mean_token_accuracy": 0.8386435657739639,
"num_tokens": 129848900.0,
"step": 302
},
{
"entropy": 0.44317626953125,
"epoch": 1.2023809523809523,
"grad_norm": 1.112135152774716,
"learning_rate": 7.019016381204448e-06,
"loss": 0.4553,
"mean_token_accuracy": 0.8430305812507868,
"num_tokens": 130278951.0,
"step": 303
},
{
"entropy": 0.444427490234375,
"epoch": 1.2063492063492063,
"grad_norm": 1.1661189845303515,
"learning_rate": 6.998982682501394e-06,
"loss": 0.4629,
"mean_token_accuracy": 0.841990914195776,
"num_tokens": 130724709.0,
"step": 304
},
{
"entropy": 0.445404052734375,
"epoch": 1.2103174603174602,
"grad_norm": 0.9959690543341396,
"learning_rate": 6.978910713733696e-06,
"loss": 0.4429,
"mean_token_accuracy": 0.8485971093177795,
"num_tokens": 131151665.0,
"step": 305
},
{
"entropy": 0.438751220703125,
"epoch": 1.2142857142857142,
"grad_norm": 0.9834937169980936,
"learning_rate": 6.958800859174591e-06,
"loss": 0.4491,
"mean_token_accuracy": 0.845764022320509,
"num_tokens": 131582811.0,
"step": 306
},
{
"entropy": 0.442840576171875,
"epoch": 1.2182539682539684,
"grad_norm": 1.0523226181088532,
"learning_rate": 6.938653503822628e-06,
"loss": 0.4574,
"mean_token_accuracy": 0.8434069091454148,
"num_tokens": 131998529.0,
"step": 307
},
{
"entropy": 0.4339599609375,
"epoch": 1.2222222222222223,
"grad_norm": 1.0371255492047888,
"learning_rate": 6.9184690333942995e-06,
"loss": 0.4517,
"mean_token_accuracy": 0.8438770910724998,
"num_tokens": 132429743.0,
"step": 308
},
{
"entropy": 0.439239501953125,
"epoch": 1.2261904761904763,
"grad_norm": 1.1404078217146265,
"learning_rate": 6.898247834316662e-06,
"loss": 0.4576,
"mean_token_accuracy": 0.8416583137586713,
"num_tokens": 132864811.0,
"step": 309
},
{
"entropy": 0.437103271484375,
"epoch": 1.2301587301587302,
"grad_norm": 1.0196151103714386,
"learning_rate": 6.877990293719928e-06,
"loss": 0.4611,
"mean_token_accuracy": 0.8426391445100307,
"num_tokens": 133291943.0,
"step": 310
},
{
"entropy": 0.4429931640625,
"epoch": 1.2341269841269842,
"grad_norm": 1.1597754105733091,
"learning_rate": 6.857696799430064e-06,
"loss": 0.4594,
"mean_token_accuracy": 0.8428373141214252,
"num_tokens": 133728664.0,
"step": 311
},
{
"entropy": 0.442169189453125,
"epoch": 1.2380952380952381,
"grad_norm": 1.0933297455326956,
"learning_rate": 6.83736773996136e-06,
"loss": 0.4495,
"mean_token_accuracy": 0.8465461218729615,
"num_tokens": 134149814.0,
"step": 312
},
{
"entropy": 0.444610595703125,
"epoch": 1.242063492063492,
"grad_norm": 0.9545364491465045,
"learning_rate": 6.817003504508993e-06,
"loss": 0.4453,
"mean_token_accuracy": 0.8452331237494946,
"num_tokens": 134567037.0,
"step": 313
},
{
"entropy": 0.441436767578125,
"epoch": 1.246031746031746,
"grad_norm": 0.9909665760096847,
"learning_rate": 6.796604482941578e-06,
"loss": 0.4474,
"mean_token_accuracy": 0.8466871501877904,
"num_tokens": 134989406.0,
"step": 314
},
{
"entropy": 0.43414306640625,
"epoch": 1.25,
"grad_norm": 1.0159907252981955,
"learning_rate": 6.7761710657936995e-06,
"loss": 0.4361,
"mean_token_accuracy": 0.8494271822273731,
"num_tokens": 135405949.0,
"step": 315
},
{
"entropy": 0.436004638671875,
"epoch": 1.253968253968254,
"grad_norm": 1.1634799840745833,
"learning_rate": 6.75570364425844e-06,
"loss": 0.4552,
"mean_token_accuracy": 0.8439184688031673,
"num_tokens": 135832642.0,
"step": 316
},
{
"entropy": 0.43035888671875,
"epoch": 1.257936507936508,
"grad_norm": 1.0848830841192156,
"learning_rate": 6.735202610179886e-06,
"loss": 0.4588,
"mean_token_accuracy": 0.8425602596253157,
"num_tokens": 136281104.0,
"step": 317
},
{
"entropy": 0.4400634765625,
"epoch": 1.2619047619047619,
"grad_norm": 1.1024831215933177,
"learning_rate": 6.714668356045629e-06,
"loss": 0.4459,
"mean_token_accuracy": 0.8458384843543172,
"num_tokens": 136724748.0,
"step": 318
},
{
"entropy": 0.437774658203125,
"epoch": 1.2658730158730158,
"grad_norm": 1.14453363380739,
"learning_rate": 6.694101274979253e-06,
"loss": 0.4484,
"mean_token_accuracy": 0.8426429070532322,
"num_tokens": 137144383.0,
"step": 319
},
{
"entropy": 0.44573974609375,
"epoch": 1.2698412698412698,
"grad_norm": 1.1202850192609648,
"learning_rate": 6.673501760732805e-06,
"loss": 0.4575,
"mean_token_accuracy": 0.8433046471327543,
"num_tokens": 137570382.0,
"step": 320
},
{
"entropy": 0.439056396484375,
"epoch": 1.2738095238095237,
"grad_norm": 1.1686361321236263,
"learning_rate": 6.652870207679253e-06,
"loss": 0.4525,
"mean_token_accuracy": 0.8428729372099042,
"num_tokens": 138002323.0,
"step": 321
},
{
"entropy": 0.43701171875,
"epoch": 1.2777777777777777,
"grad_norm": 1.1692980704447018,
"learning_rate": 6.632207010804949e-06,
"loss": 0.4576,
"mean_token_accuracy": 0.8453587293624878,
"num_tokens": 138431194.0,
"step": 322
},
{
"entropy": 0.439239501953125,
"epoch": 1.2817460317460316,
"grad_norm": 1.0283968957929952,
"learning_rate": 6.611512565702053e-06,
"loss": 0.4494,
"mean_token_accuracy": 0.8435638211667538,
"num_tokens": 138863136.0,
"step": 323
},
{
"entropy": 0.43597412109375,
"epoch": 1.2857142857142856,
"grad_norm": 1.0723867427352887,
"learning_rate": 6.590787268560967e-06,
"loss": 0.4349,
"mean_token_accuracy": 0.8492929134517908,
"num_tokens": 139287890.0,
"step": 324
},
{
"entropy": 0.4398193359375,
"epoch": 1.2896825396825398,
"grad_norm": 1.0222079112541533,
"learning_rate": 6.570031516162746e-06,
"loss": 0.4585,
"mean_token_accuracy": 0.8433736823499203,
"num_tokens": 139730663.0,
"step": 325
},
{
"entropy": 0.435150146484375,
"epoch": 1.2936507936507937,
"grad_norm": 0.9275873017340585,
"learning_rate": 6.549245705871507e-06,
"loss": 0.4499,
"mean_token_accuracy": 0.8432614449411631,
"num_tokens": 140160179.0,
"step": 326
},
{
"entropy": 0.43756103515625,
"epoch": 1.2976190476190477,
"grad_norm": 1.174514802084351,
"learning_rate": 6.528430235626819e-06,
"loss": 0.4463,
"mean_token_accuracy": 0.8453215239569545,
"num_tokens": 140577958.0,
"step": 327
},
{
"entropy": 0.433258056640625,
"epoch": 1.3015873015873016,
"grad_norm": 1.091000460313449,
"learning_rate": 6.5075855039360805e-06,
"loss": 0.4632,
"mean_token_accuracy": 0.8417082950472832,
"num_tokens": 141002875.0,
"step": 328
},
{
"entropy": 0.43377685546875,
"epoch": 1.3055555555555556,
"grad_norm": 0.9951305912978812,
"learning_rate": 6.486711909866895e-06,
"loss": 0.445,
"mean_token_accuracy": 0.8452390227466822,
"num_tokens": 141425392.0,
"step": 329
},
{
"entropy": 0.436004638671875,
"epoch": 1.3095238095238095,
"grad_norm": 0.9773602377225085,
"learning_rate": 6.465809853039431e-06,
"loss": 0.4429,
"mean_token_accuracy": 0.8470056857913733,
"num_tokens": 141858286.0,
"step": 330
},
{
"entropy": 0.44110107421875,
"epoch": 1.3134920634920635,
"grad_norm": 1.0492801166182826,
"learning_rate": 6.444879733618766e-06,
"loss": 0.4432,
"mean_token_accuracy": 0.8470598505809903,
"num_tokens": 142279417.0,
"step": 331
},
{
"entropy": 0.439056396484375,
"epoch": 1.3174603174603174,
"grad_norm": 0.9459765835539803,
"learning_rate": 6.423921952307237e-06,
"loss": 0.4471,
"mean_token_accuracy": 0.8453462338075042,
"num_tokens": 142698339.0,
"step": 332
},
{
"entropy": 0.436981201171875,
"epoch": 1.3214285714285714,
"grad_norm": 1.075628581502009,
"learning_rate": 6.4029369103367545e-06,
"loss": 0.4424,
"mean_token_accuracy": 0.8465406149625778,
"num_tokens": 143128013.0,
"step": 333
},
{
"entropy": 0.43994140625,
"epoch": 1.3253968253968254,
"grad_norm": 1.0287829199461864,
"learning_rate": 6.381925009461128e-06,
"loss": 0.4456,
"mean_token_accuracy": 0.8456096695736051,
"num_tokens": 143561112.0,
"step": 334
},
{
"entropy": 0.441192626953125,
"epoch": 1.3293650793650793,
"grad_norm": 1.1380572333251808,
"learning_rate": 6.3608866519483825e-06,
"loss": 0.4498,
"mean_token_accuracy": 0.844082260504365,
"num_tokens": 143970890.0,
"step": 335
},
{
"entropy": 0.435333251953125,
"epoch": 1.3333333333333333,
"grad_norm": 1.102203799703573,
"learning_rate": 6.339822240573041e-06,
"loss": 0.4476,
"mean_token_accuracy": 0.8457837710157037,
"num_tokens": 144390223.0,
"step": 336
},
{
"entropy": 0.43310546875,
"epoch": 1.3373015873015874,
"grad_norm": 1.0745564478696599,
"learning_rate": 6.3187321786084236e-06,
"loss": 0.4609,
"mean_token_accuracy": 0.8417782466858625,
"num_tokens": 144839957.0,
"step": 337
},
{
"entropy": 0.4366455078125,
"epoch": 1.3412698412698414,
"grad_norm": 1.1064436100800052,
"learning_rate": 6.297616869818926e-06,
"loss": 0.4627,
"mean_token_accuracy": 0.8423483874648809,
"num_tokens": 145276276.0,
"step": 338
},
{
"entropy": 0.43682861328125,
"epoch": 1.3452380952380953,
"grad_norm": 1.0627782502876304,
"learning_rate": 6.276476718452289e-06,
"loss": 0.4599,
"mean_token_accuracy": 0.8434413159266114,
"num_tokens": 145722320.0,
"step": 339
},
{
"entropy": 0.440948486328125,
"epoch": 1.3492063492063493,
"grad_norm": 1.0311065316684267,
"learning_rate": 6.2553121292318595e-06,
"loss": 0.4445,
"mean_token_accuracy": 0.8466370198875666,
"num_tokens": 146148957.0,
"step": 340
},
{
"entropy": 0.44580078125,
"epoch": 1.3531746031746033,
"grad_norm": 0.9538685008179283,
"learning_rate": 6.23412350734884e-06,
"loss": 0.4571,
"mean_token_accuracy": 0.8417170522734523,
"num_tokens": 146580318.0,
"step": 341
},
{
"entropy": 0.441864013671875,
"epoch": 1.3571428571428572,
"grad_norm": 1.0998343004271525,
"learning_rate": 6.2129112584545325e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.846907963976264,
"num_tokens": 146999892.0,
"step": 342
},
{
"entropy": 0.441070556640625,
"epoch": 1.3611111111111112,
"grad_norm": 1.0173140297071601,
"learning_rate": 6.191675788652574e-06,
"loss": 0.4461,
"mean_token_accuracy": 0.8460167152807117,
"num_tokens": 147436184.0,
"step": 343
},
{
"entropy": 0.4295654296875,
"epoch": 1.3650793650793651,
"grad_norm": 1.0290558215509458,
"learning_rate": 6.170417504491157e-06,
"loss": 0.4541,
"mean_token_accuracy": 0.8437853921204805,
"num_tokens": 147888947.0,
"step": 344
},
{
"entropy": 0.441253662109375,
"epoch": 1.369047619047619,
"grad_norm": 0.9977939913686099,
"learning_rate": 6.149136812955256e-06,
"loss": 0.4605,
"mean_token_accuracy": 0.8413437977433205,
"num_tokens": 148330624.0,
"step": 345
},
{
"entropy": 0.44482421875,
"epoch": 1.373015873015873,
"grad_norm": 0.9862218483442303,
"learning_rate": 6.1278341214588255e-06,
"loss": 0.4608,
"mean_token_accuracy": 0.84361382573843,
"num_tokens": 148771994.0,
"step": 346
},
{
"entropy": 0.43817138671875,
"epoch": 1.376984126984127,
"grad_norm": 1.0974460607418992,
"learning_rate": 6.106509837837004e-06,
"loss": 0.4468,
"mean_token_accuracy": 0.8459707852452993,
"num_tokens": 149203608.0,
"step": 347
},
{
"entropy": 0.435211181640625,
"epoch": 1.380952380952381,
"grad_norm": 0.9546922816485226,
"learning_rate": 6.0851643703383066e-06,
"loss": 0.4456,
"mean_token_accuracy": 0.8459897711873055,
"num_tokens": 149626353.0,
"step": 348
},
{
"entropy": 0.43658447265625,
"epoch": 1.3849206349206349,
"grad_norm": 1.0823837316088047,
"learning_rate": 6.063798127616811e-06,
"loss": 0.4447,
"mean_token_accuracy": 0.8457578187808394,
"num_tokens": 150036189.0,
"step": 349
},
{
"entropy": 0.437774658203125,
"epoch": 1.3888888888888888,
"grad_norm": 1.1008934320855421,
"learning_rate": 6.042411518724327e-06,
"loss": 0.4402,
"mean_token_accuracy": 0.84851832408458,
"num_tokens": 150484433.0,
"step": 350
},
{
"entropy": 0.441436767578125,
"epoch": 1.3928571428571428,
"grad_norm": 1.0679863117222357,
"learning_rate": 6.021004953102576e-06,
"loss": 0.4475,
"mean_token_accuracy": 0.8463964462280273,
"num_tokens": 150916869.0,
"step": 351
},
{
"entropy": 0.445831298828125,
"epoch": 1.3968253968253967,
"grad_norm": 1.0542706083048947,
"learning_rate": 5.999578840575342e-06,
"loss": 0.4504,
"mean_token_accuracy": 0.8455899534747005,
"num_tokens": 151351171.0,
"step": 352
},
{
"entropy": 0.438507080078125,
"epoch": 1.4007936507936507,
"grad_norm": 0.987561703544306,
"learning_rate": 5.978133591340633e-06,
"loss": 0.4494,
"mean_token_accuracy": 0.8452698877081275,
"num_tokens": 151779921.0,
"step": 353
},
{
"entropy": 0.435516357421875,
"epoch": 1.4047619047619047,
"grad_norm": 1.1262078381527667,
"learning_rate": 5.956669615962821e-06,
"loss": 0.4602,
"mean_token_accuracy": 0.8407345684245229,
"num_tokens": 152198704.0,
"step": 354
},
{
"entropy": 0.43695068359375,
"epoch": 1.4087301587301586,
"grad_norm": 1.0842467706193302,
"learning_rate": 5.935187325364791e-06,
"loss": 0.4504,
"mean_token_accuracy": 0.8444310743361712,
"num_tokens": 152607114.0,
"step": 355
},
{
"entropy": 0.442291259765625,
"epoch": 1.4126984126984126,
"grad_norm": 0.8869868658428021,
"learning_rate": 5.913687130820064e-06,
"loss": 0.4441,
"mean_token_accuracy": 0.846776382997632,
"num_tokens": 153027562.0,
"step": 356
},
{
"entropy": 0.439239501953125,
"epoch": 1.4166666666666667,
"grad_norm": 1.0275339768252305,
"learning_rate": 5.892169443944929e-06,
"loss": 0.443,
"mean_token_accuracy": 0.84731434751302,
"num_tokens": 153449258.0,
"step": 357
},
{
"entropy": 0.4425048828125,
"epoch": 1.4206349206349207,
"grad_norm": 0.9646996873181736,
"learning_rate": 5.870634676690564e-06,
"loss": 0.4433,
"mean_token_accuracy": 0.8452265271916986,
"num_tokens": 153863890.0,
"step": 358
},
{
"entropy": 0.441680908203125,
"epoch": 1.4246031746031746,
"grad_norm": 1.0623013087943407,
"learning_rate": 5.8490832413351465e-06,
"loss": 0.4484,
"mean_token_accuracy": 0.8456388972699642,
"num_tokens": 154280797.0,
"step": 359
},
{
"entropy": 0.4415283203125,
"epoch": 1.4285714285714286,
"grad_norm": 0.9302981043880288,
"learning_rate": 5.827515550475955e-06,
"loss": 0.4499,
"mean_token_accuracy": 0.8448391910642385,
"num_tokens": 154707913.0,
"step": 360
},
{
"entropy": 0.437255859375,
"epoch": 1.4325396825396826,
"grad_norm": 0.9416953081304574,
"learning_rate": 5.805932017021486e-06,
"loss": 0.4486,
"mean_token_accuracy": 0.8438430884853005,
"num_tokens": 155150096.0,
"step": 361
},
{
"entropy": 0.43389892578125,
"epoch": 1.4365079365079365,
"grad_norm": 0.9373065849374296,
"learning_rate": 5.784333054183533e-06,
"loss": 0.4449,
"mean_token_accuracy": 0.8454085243865848,
"num_tokens": 155590050.0,
"step": 362
},
{
"entropy": 0.437469482421875,
"epoch": 1.4404761904761905,
"grad_norm": 0.9209854720626441,
"learning_rate": 5.762719075469277e-06,
"loss": 0.4465,
"mean_token_accuracy": 0.846617016941309,
"num_tokens": 156016093.0,
"step": 363
},
{
"entropy": 0.436737060546875,
"epoch": 1.4444444444444444,
"grad_norm": 0.9861130639611431,
"learning_rate": 5.741090494673386e-06,
"loss": 0.443,
"mean_token_accuracy": 0.8471564138308167,
"num_tokens": 156449879.0,
"step": 364
},
{
"entropy": 0.441497802734375,
"epoch": 1.4484126984126984,
"grad_norm": 0.9886455980759782,
"learning_rate": 5.719447725870071e-06,
"loss": 0.4337,
"mean_token_accuracy": 0.849870765581727,
"num_tokens": 156866761.0,
"step": 365
},
{
"entropy": 0.4375,
"epoch": 1.4523809523809523,
"grad_norm": 0.9241839444207883,
"learning_rate": 5.697791183405174e-06,
"loss": 0.4333,
"mean_token_accuracy": 0.8499069400131702,
"num_tokens": 157304143.0,
"step": 366
},
{
"entropy": 0.43499755859375,
"epoch": 1.4563492063492063,
"grad_norm": 0.9452367705103182,
"learning_rate": 5.67612128188823e-06,
"loss": 0.4617,
"mean_token_accuracy": 0.8407938601449132,
"num_tokens": 157758390.0,
"step": 367
},
{
"entropy": 0.440948486328125,
"epoch": 1.4603174603174602,
"grad_norm": 1.0583903459607955,
"learning_rate": 5.654438436184531e-06,
"loss": 0.4393,
"mean_token_accuracy": 0.845472626388073,
"num_tokens": 158177988.0,
"step": 368
},
{
"entropy": 0.427886962890625,
"epoch": 1.4642857142857144,
"grad_norm": 1.025858650536215,
"learning_rate": 5.6327430614071794e-06,
"loss": 0.4551,
"mean_token_accuracy": 0.843145564198494,
"num_tokens": 158634349.0,
"step": 369
},
{
"entropy": 0.43743896484375,
"epoch": 1.4682539682539684,
"grad_norm": 0.9049075091671224,
"learning_rate": 5.611035572909147e-06,
"loss": 0.4462,
"mean_token_accuracy": 0.8464264376088977,
"num_tokens": 159060066.0,
"step": 370
},
{
"entropy": 0.440399169921875,
"epoch": 1.4722222222222223,
"grad_norm": 1.1144452604957675,
"learning_rate": 5.589316386275318e-06,
"loss": 0.4474,
"mean_token_accuracy": 0.8443919736891985,
"num_tokens": 159490031.0,
"step": 371
},
{
"entropy": 0.431396484375,
"epoch": 1.4761904761904763,
"grad_norm": 1.058425365791423,
"learning_rate": 5.567585917314535e-06,
"loss": 0.4494,
"mean_token_accuracy": 0.8443618472665548,
"num_tokens": 159942986.0,
"step": 372
},
{
"entropy": 0.44146728515625,
"epoch": 1.4801587301587302,
"grad_norm": 1.0282647508500027,
"learning_rate": 5.545844582051641e-06,
"loss": 0.4265,
"mean_token_accuracy": 0.8528409609571099,
"num_tokens": 160355322.0,
"step": 373
},
{
"entropy": 0.43463134765625,
"epoch": 1.4841269841269842,
"grad_norm": 1.019933540593848,
"learning_rate": 5.524092796719507e-06,
"loss": 0.4521,
"mean_token_accuracy": 0.8433405430987477,
"num_tokens": 160782816.0,
"step": 374
},
{
"entropy": 0.435699462890625,
"epoch": 1.4880952380952381,
"grad_norm": 0.9737519159679464,
"learning_rate": 5.502330977751072e-06,
"loss": 0.4462,
"mean_token_accuracy": 0.8467091489583254,
"num_tokens": 161216771.0,
"step": 375
},
{
"entropy": 0.4390869140625,
"epoch": 1.492063492063492,
"grad_norm": 1.0788036881829521,
"learning_rate": 5.4805595417713634e-06,
"loss": 0.4353,
"mean_token_accuracy": 0.8512382041662931,
"num_tokens": 161643512.0,
"step": 376
},
{
"entropy": 0.43292236328125,
"epoch": 1.496031746031746,
"grad_norm": 1.1542521219056112,
"learning_rate": 5.458778905589528e-06,
"loss": 0.4366,
"mean_token_accuracy": 0.8494954742491245,
"num_tokens": 162077647.0,
"step": 377
},
{
"entropy": 0.4354248046875,
"epoch": 1.5,
"grad_norm": 1.072458349595571,
"learning_rate": 5.436989486190846e-06,
"loss": 0.4335,
"mean_token_accuracy": 0.8492478728294373,
"num_tokens": 162503001.0,
"step": 378
},
{
"entropy": 0.43646240234375,
"epoch": 1.503968253968254,
"grad_norm": 0.9350737480343512,
"learning_rate": 5.415191700728749e-06,
"loss": 0.4548,
"mean_token_accuracy": 0.8452032124623656,
"num_tokens": 162949686.0,
"step": 379
},
{
"entropy": 0.430450439453125,
"epoch": 1.507936507936508,
"grad_norm": 0.9306004319608026,
"learning_rate": 5.393385966516838e-06,
"loss": 0.4397,
"mean_token_accuracy": 0.8475316297262907,
"num_tokens": 163388010.0,
"step": 380
},
{
"entropy": 0.4310302734375,
"epoch": 1.5119047619047619,
"grad_norm": 0.985227817450923,
"learning_rate": 5.371572701020891e-06,
"loss": 0.4341,
"mean_token_accuracy": 0.8477435661479831,
"num_tokens": 163816567.0,
"step": 381
},
{
"entropy": 0.4312744140625,
"epoch": 1.5158730158730158,
"grad_norm": 0.9677192291245226,
"learning_rate": 5.349752321850866e-06,
"loss": 0.448,
"mean_token_accuracy": 0.8447540532797575,
"num_tokens": 164270399.0,
"step": 382
},
{
"entropy": 0.429290771484375,
"epoch": 1.5198412698412698,
"grad_norm": 0.9069780240418857,
"learning_rate": 5.327925246752917e-06,
"loss": 0.4293,
"mean_token_accuracy": 0.8511379426345229,
"num_tokens": 164712402.0,
"step": 383
},
{
"entropy": 0.428375244140625,
"epoch": 1.5238095238095237,
"grad_norm": 0.9600603128750645,
"learning_rate": 5.306091893601384e-06,
"loss": 0.4487,
"mean_token_accuracy": 0.845952364616096,
"num_tokens": 165155523.0,
"step": 384
},
{
"entropy": 0.4332275390625,
"epoch": 1.5277777777777777,
"grad_norm": 0.9724692951976357,
"learning_rate": 5.284252680390803e-06,
"loss": 0.4269,
"mean_token_accuracy": 0.8531857188791037,
"num_tokens": 165575993.0,
"step": 385
},
{
"entropy": 0.43194580078125,
"epoch": 1.5317460317460316,
"grad_norm": 0.9712295118336367,
"learning_rate": 5.2624080252279006e-06,
"loss": 0.4471,
"mean_token_accuracy": 0.845876133069396,
"num_tokens": 166004219.0,
"step": 386
},
{
"entropy": 0.4315185546875,
"epoch": 1.5357142857142856,
"grad_norm": 0.9443490875832254,
"learning_rate": 5.240558346323582e-06,
"loss": 0.437,
"mean_token_accuracy": 0.8483765926212072,
"num_tokens": 166459333.0,
"step": 387
},
{
"entropy": 0.434234619140625,
"epoch": 1.5396825396825395,
"grad_norm": 0.948734807560996,
"learning_rate": 5.218704061984938e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.8489022571593523,
"num_tokens": 166887486.0,
"step": 388
},
{
"entropy": 0.433074951171875,
"epoch": 1.5436507936507935,
"grad_norm": 0.9920709984656828,
"learning_rate": 5.196845590607225e-06,
"loss": 0.444,
"mean_token_accuracy": 0.8482109969481826,
"num_tokens": 167305651.0,
"step": 389
},
{
"entropy": 0.427581787109375,
"epoch": 1.5476190476190477,
"grad_norm": 1.007023739541341,
"learning_rate": 5.174983350665861e-06,
"loss": 0.4355,
"mean_token_accuracy": 0.8507700897753239,
"num_tokens": 167743608.0,
"step": 390
},
{
"entropy": 0.435516357421875,
"epoch": 1.5515873015873016,
"grad_norm": 0.9396600741753053,
"learning_rate": 5.153117760708411e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.8479267274960876,
"num_tokens": 168189361.0,
"step": 391
},
{
"entropy": 0.440887451171875,
"epoch": 1.5555555555555556,
"grad_norm": 0.9532447871050252,
"learning_rate": 5.131249239346574e-06,
"loss": 0.4364,
"mean_token_accuracy": 0.8505636844784021,
"num_tokens": 168602032.0,
"step": 392
},
{
"entropy": 0.436492919921875,
"epoch": 1.5595238095238095,
"grad_norm": 0.9020737415284749,
"learning_rate": 5.109378205248177e-06,
"loss": 0.4426,
"mean_token_accuracy": 0.8446815246716142,
"num_tokens": 169036397.0,
"step": 393
},
{
"entropy": 0.43292236328125,
"epoch": 1.5634920634920635,
"grad_norm": 1.5261604480695485,
"learning_rate": 5.087505077129144e-06,
"loss": 0.4458,
"mean_token_accuracy": 0.8471705308184028,
"num_tokens": 169469975.0,
"step": 394
},
{
"entropy": 0.425628662109375,
"epoch": 1.5674603174603174,
"grad_norm": 1.0588587386866344,
"learning_rate": 5.065630273745495e-06,
"loss": 0.4463,
"mean_token_accuracy": 0.8460619812831283,
"num_tokens": 169905002.0,
"step": 395
},
{
"entropy": 0.429779052734375,
"epoch": 1.5714285714285714,
"grad_norm": 1.017609763369074,
"learning_rate": 5.043754213885319e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.8433791399002075,
"num_tokens": 170343282.0,
"step": 396
},
{
"entropy": 0.436981201171875,
"epoch": 1.5753968253968254,
"grad_norm": 0.9564026257150148,
"learning_rate": 5.021877316360759e-06,
"loss": 0.4354,
"mean_token_accuracy": 0.8478411976248026,
"num_tokens": 170783254.0,
"step": 397
},
{
"entropy": 0.43304443359375,
"epoch": 1.5793650793650795,
"grad_norm": 0.9585975685485587,
"learning_rate": 5e-06,
"loss": 0.4505,
"mean_token_accuracy": 0.8458553478121758,
"num_tokens": 171227432.0,
"step": 398
},
{
"entropy": 0.43023681640625,
"epoch": 1.5833333333333335,
"grad_norm": 1.0440501055720262,
"learning_rate": 4.978122683639241e-06,
"loss": 0.4275,
"mean_token_accuracy": 0.8501301733776927,
"num_tokens": 171673565.0,
"step": 399
},
{
"entropy": 0.436431884765625,
"epoch": 1.5873015873015874,
"grad_norm": 1.0933083501713738,
"learning_rate": 4.956245786114683e-06,
"loss": 0.4295,
"mean_token_accuracy": 0.8506188867613673,
"num_tokens": 172096305.0,
"step": 400
},
{
"entropy": 0.434814453125,
"epoch": 1.5912698412698414,
"grad_norm": 1.1069832769815195,
"learning_rate": 4.934369726254506e-06,
"loss": 0.43,
"mean_token_accuracy": 0.8495042575523257,
"num_tokens": 172495298.0,
"step": 401
},
{
"entropy": 0.433929443359375,
"epoch": 1.5952380952380953,
"grad_norm": 1.120671038507196,
"learning_rate": 4.9124949228708566e-06,
"loss": 0.4334,
"mean_token_accuracy": 0.8499879157170653,
"num_tokens": 172910673.0,
"step": 402
},
{
"entropy": 0.42694091796875,
"epoch": 1.5992063492063493,
"grad_norm": 1.028931284451181,
"learning_rate": 4.890621794751825e-06,
"loss": 0.4319,
"mean_token_accuracy": 0.8494029613211751,
"num_tokens": 173326209.0,
"step": 403
},
{
"entropy": 0.426605224609375,
"epoch": 1.6031746031746033,
"grad_norm": 0.9118168079626323,
"learning_rate": 4.8687507606534274e-06,
"loss": 0.4372,
"mean_token_accuracy": 0.8469415912404656,
"num_tokens": 173775762.0,
"step": 404
},
{
"entropy": 0.43621826171875,
"epoch": 1.6071428571428572,
"grad_norm": 1.0102731648951273,
"learning_rate": 4.8468822392915925e-06,
"loss": 0.4367,
"mean_token_accuracy": 0.8488945597782731,
"num_tokens": 174200041.0,
"step": 405
},
{
"entropy": 0.428955078125,
"epoch": 1.6111111111111112,
"grad_norm": 0.9690257742063463,
"learning_rate": 4.82501664933414e-06,
"loss": 0.4406,
"mean_token_accuracy": 0.8465389581397176,
"num_tokens": 174651858.0,
"step": 406
},
{
"entropy": 0.436920166015625,
"epoch": 1.6150793650793651,
"grad_norm": 0.8850222581892622,
"learning_rate": 4.803154409392776e-06,
"loss": 0.4324,
"mean_token_accuracy": 0.8495019385591149,
"num_tokens": 175081173.0,
"step": 407
},
{
"entropy": 0.430511474609375,
"epoch": 1.619047619047619,
"grad_norm": 0.95437734633981,
"learning_rate": 4.781295938015063e-06,
"loss": 0.4331,
"mean_token_accuracy": 0.8485972639173269,
"num_tokens": 175519282.0,
"step": 408
},
{
"entropy": 0.435028076171875,
"epoch": 1.623015873015873,
"grad_norm": 1.0123634812749625,
"learning_rate": 4.759441653676419e-06,
"loss": 0.4466,
"mean_token_accuracy": 0.848145549185574,
"num_tokens": 175965036.0,
"step": 409
},
{
"entropy": 0.431060791015625,
"epoch": 1.626984126984127,
"grad_norm": 0.909110311090521,
"learning_rate": 4.737591974772102e-06,
"loss": 0.4451,
"mean_token_accuracy": 0.8459606841206551,
"num_tokens": 176387199.0,
"step": 410
},
{
"entropy": 0.4302978515625,
"epoch": 1.630952380952381,
"grad_norm": 0.964606274615154,
"learning_rate": 4.715747319609199e-06,
"loss": 0.4414,
"mean_token_accuracy": 0.8480783235281706,
"num_tokens": 176823428.0,
"step": 411
},
{
"entropy": 0.423431396484375,
"epoch": 1.6349206349206349,
"grad_norm": 0.9360221541198701,
"learning_rate": 4.693908106398617e-06,
"loss": 0.4393,
"mean_token_accuracy": 0.8489115545526147,
"num_tokens": 177264131.0,
"step": 412
},
{
"entropy": 0.4334716796875,
"epoch": 1.6388888888888888,
"grad_norm": 0.9818915467360069,
"learning_rate": 4.6720747532470845e-06,
"loss": 0.4294,
"mean_token_accuracy": 0.8496479475870728,
"num_tokens": 177680911.0,
"step": 413
},
{
"entropy": 0.432647705078125,
"epoch": 1.6428571428571428,
"grad_norm": 0.8978522056780484,
"learning_rate": 4.650247678149135e-06,
"loss": 0.4379,
"mean_token_accuracy": 0.8470958042889833,
"num_tokens": 178114003.0,
"step": 414
},
{
"entropy": 0.437652587890625,
"epoch": 1.6468253968253967,
"grad_norm": 0.9722385088780229,
"learning_rate": 4.628427298979111e-06,
"loss": 0.4514,
"mean_token_accuracy": 0.8430732255801558,
"num_tokens": 178533077.0,
"step": 415
},
{
"entropy": 0.437347412109375,
"epoch": 1.6507936507936507,
"grad_norm": 1.0373796667738375,
"learning_rate": 4.606614033483164e-06,
"loss": 0.4326,
"mean_token_accuracy": 0.8507428057491779,
"num_tokens": 178950487.0,
"step": 416
},
{
"entropy": 0.4326171875,
"epoch": 1.6547619047619047,
"grad_norm": 1.010237913873583,
"learning_rate": 4.5848082992712516e-06,
"loss": 0.4377,
"mean_token_accuracy": 0.8486862545832992,
"num_tokens": 179384739.0,
"step": 417
},
{
"entropy": 0.426300048828125,
"epoch": 1.6587301587301586,
"grad_norm": 1.0263841694329876,
"learning_rate": 4.563010513809156e-06,
"loss": 0.4455,
"mean_token_accuracy": 0.8446431895717978,
"num_tokens": 179833212.0,
"step": 418
},
{
"entropy": 0.42828369140625,
"epoch": 1.6626984126984126,
"grad_norm": 0.9494913320869729,
"learning_rate": 4.541221094410473e-06,
"loss": 0.4306,
"mean_token_accuracy": 0.8516378318890929,
"num_tokens": 180259940.0,
"step": 419
},
{
"entropy": 0.42144775390625,
"epoch": 1.6666666666666665,
"grad_norm": 0.9739308463131585,
"learning_rate": 4.519440458228638e-06,
"loss": 0.4381,
"mean_token_accuracy": 0.8479503998532891,
"num_tokens": 180712234.0,
"step": 420
},
{
"entropy": 0.4244384765625,
"epoch": 1.6706349206349205,
"grad_norm": 1.0181973094308832,
"learning_rate": 4.497669022248931e-06,
"loss": 0.4525,
"mean_token_accuracy": 0.843443606980145,
"num_tokens": 181151354.0,
"step": 421
},
{
"entropy": 0.430877685546875,
"epoch": 1.6746031746031746,
"grad_norm": 3.323978860931596,
"learning_rate": 4.475907203280494e-06,
"loss": 0.4383,
"mean_token_accuracy": 0.8451524330303073,
"num_tokens": 181566490.0,
"step": 422
},
{
"entropy": 0.428955078125,
"epoch": 1.6785714285714286,
"grad_norm": 1.2824867106826667,
"learning_rate": 4.45415541794836e-06,
"loss": 0.446,
"mean_token_accuracy": 0.8463947279378772,
"num_tokens": 181997420.0,
"step": 423
},
{
"entropy": 0.431793212890625,
"epoch": 1.6825396825396826,
"grad_norm": 1.0255881219333862,
"learning_rate": 4.432414082685466e-06,
"loss": 0.4358,
"mean_token_accuracy": 0.8490986367687583,
"num_tokens": 182413254.0,
"step": 424
},
{
"entropy": 0.42706298828125,
"epoch": 1.6865079365079365,
"grad_norm": 1.0665870604693903,
"learning_rate": 4.410683613724684e-06,
"loss": 0.4292,
"mean_token_accuracy": 0.8507826002314687,
"num_tokens": 182840621.0,
"step": 425
},
{
"entropy": 0.427398681640625,
"epoch": 1.6904761904761905,
"grad_norm": 1.1351262001199722,
"learning_rate": 4.388964427090855e-06,
"loss": 0.4359,
"mean_token_accuracy": 0.846874114125967,
"num_tokens": 183269538.0,
"step": 426
},
{
"entropy": 0.43524169921875,
"epoch": 1.6944444444444444,
"grad_norm": 0.9895934977007657,
"learning_rate": 4.367256938592822e-06,
"loss": 0.4231,
"mean_token_accuracy": 0.8536219568923116,
"num_tokens": 183684845.0,
"step": 427
},
{
"entropy": 0.43170166015625,
"epoch": 1.6984126984126984,
"grad_norm": 1.1767949451847899,
"learning_rate": 4.345561563815471e-06,
"loss": 0.4337,
"mean_token_accuracy": 0.8503425857052207,
"num_tokens": 184109496.0,
"step": 428
},
{
"entropy": 0.433258056640625,
"epoch": 1.7023809523809523,
"grad_norm": 0.9787163441447944,
"learning_rate": 4.323878718111771e-06,
"loss": 0.4496,
"mean_token_accuracy": 0.8437537206336856,
"num_tokens": 184533568.0,
"step": 429
},
{
"entropy": 0.432220458984375,
"epoch": 1.7063492063492065,
"grad_norm": 0.9948605324632119,
"learning_rate": 4.302208816594829e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.8475517062470317,
"num_tokens": 184968366.0,
"step": 430
},
{
"entropy": 0.42999267578125,
"epoch": 1.7103174603174605,
"grad_norm": 0.9068147664673831,
"learning_rate": 4.280552274129932e-06,
"loss": 0.4376,
"mean_token_accuracy": 0.8486391613259912,
"num_tokens": 185404884.0,
"step": 431
},
{
"entropy": 0.427978515625,
"epoch": 1.7142857142857144,
"grad_norm": 0.9871014833586675,
"learning_rate": 4.258909505326617e-06,
"loss": 0.4451,
"mean_token_accuracy": 0.8455649884417653,
"num_tokens": 185857166.0,
"step": 432
},
{
"entropy": 0.432586669921875,
"epoch": 1.7182539682539684,
"grad_norm": 0.9995499236592311,
"learning_rate": 4.237280924530723e-06,
"loss": 0.425,
"mean_token_accuracy": 0.8507826123386621,
"num_tokens": 186278301.0,
"step": 433
},
{
"entropy": 0.43853759765625,
"epoch": 1.7222222222222223,
"grad_norm": 0.9796741726346321,
"learning_rate": 4.215666945816469e-06,
"loss": 0.4266,
"mean_token_accuracy": 0.850803654640913,
"num_tokens": 186684767.0,
"step": 434
},
{
"entropy": 0.4305419921875,
"epoch": 1.7261904761904763,
"grad_norm": 0.9307664459487662,
"learning_rate": 4.194067982978516e-06,
"loss": 0.4279,
"mean_token_accuracy": 0.8503124145790935,
"num_tokens": 187107470.0,
"step": 435
},
{
"entropy": 0.425567626953125,
"epoch": 1.7301587301587302,
"grad_norm": 0.9496403248581704,
"learning_rate": 4.172484449524047e-06,
"loss": 0.428,
"mean_token_accuracy": 0.8510759947821498,
"num_tokens": 187534641.0,
"step": 436
},
{
"entropy": 0.42620849609375,
"epoch": 1.7341269841269842,
"grad_norm": 0.9874730817939584,
"learning_rate": 4.150916758664857e-06,
"loss": 0.4352,
"mean_token_accuracy": 0.848286903463304,
"num_tokens": 187972052.0,
"step": 437
},
{
"entropy": 0.424652099609375,
"epoch": 1.7380952380952381,
"grad_norm": 0.9625644757119309,
"learning_rate": 4.129365323309436e-06,
"loss": 0.4295,
"mean_token_accuracy": 0.8496120125055313,
"num_tokens": 188403747.0,
"step": 438
},
{
"entropy": 0.425537109375,
"epoch": 1.742063492063492,
"grad_norm": 0.9770323219075207,
"learning_rate": 4.107830556055072e-06,
"loss": 0.4363,
"mean_token_accuracy": 0.8482074243947864,
"num_tokens": 188833376.0,
"step": 439
},
{
"entropy": 0.420562744140625,
"epoch": 1.746031746031746,
"grad_norm": 0.9091458418004688,
"learning_rate": 4.086312869179938e-06,
"loss": 0.434,
"mean_token_accuracy": 0.8494348004460335,
"num_tokens": 189286051.0,
"step": 440
},
{
"entropy": 0.4337158203125,
"epoch": 1.75,
"grad_norm": 0.9398983504232156,
"learning_rate": 4.06481267463521e-06,
"loss": 0.4233,
"mean_token_accuracy": 0.85198515933007,
"num_tokens": 189700932.0,
"step": 441
},
{
"entropy": 0.428436279296875,
"epoch": 1.753968253968254,
"grad_norm": 0.9954518019783384,
"learning_rate": 4.04333038403718e-06,
"loss": 0.4332,
"mean_token_accuracy": 0.8483901359140873,
"num_tokens": 190135846.0,
"step": 442
},
{
"entropy": 0.41839599609375,
"epoch": 1.757936507936508,
"grad_norm": 0.9235407840660959,
"learning_rate": 4.021866408659368e-06,
"loss": 0.4376,
"mean_token_accuracy": 0.8477007877081633,
"num_tokens": 190599539.0,
"step": 443
},
{
"entropy": 0.42510986328125,
"epoch": 1.7619047619047619,
"grad_norm": 0.9988254434360743,
"learning_rate": 4.000421159424658e-06,
"loss": 0.4381,
"mean_token_accuracy": 0.849124894477427,
"num_tokens": 191023956.0,
"step": 444
},
{
"entropy": 0.44061279296875,
"epoch": 1.7658730158730158,
"grad_norm": 0.9313679757350634,
"learning_rate": 3.978995046897425e-06,
"loss": 0.4111,
"mean_token_accuracy": 0.8550975983962417,
"num_tokens": 191419256.0,
"step": 445
},
{
"entropy": 0.42877197265625,
"epoch": 1.7698412698412698,
"grad_norm": 0.9424190366763185,
"learning_rate": 3.957588481275674e-06,
"loss": 0.438,
"mean_token_accuracy": 0.848029020242393,
"num_tokens": 191865715.0,
"step": 446
},
{
"entropy": 0.437103271484375,
"epoch": 1.7738095238095237,
"grad_norm": 0.9089004430002622,
"learning_rate": 3.9362018723831915e-06,
"loss": 0.4417,
"mean_token_accuracy": 0.8482843916863203,
"num_tokens": 192279544.0,
"step": 447
},
{
"entropy": 0.43310546875,
"epoch": 1.7777777777777777,
"grad_norm": 1.682337538575509,
"learning_rate": 3.914835629661695e-06,
"loss": 0.4219,
"mean_token_accuracy": 0.8513781204819679,
"num_tokens": 192687536.0,
"step": 448
},
{
"entropy": 0.434417724609375,
"epoch": 1.7817460317460316,
"grad_norm": 1.0677243021549518,
"learning_rate": 3.893490162162997e-06,
"loss": 0.427,
"mean_token_accuracy": 0.8539638724178076,
"num_tokens": 193092369.0,
"step": 449
},
{
"entropy": 0.43597412109375,
"epoch": 1.7857142857142856,
"grad_norm": 0.9415863303290471,
"learning_rate": 3.872165878541175e-06,
"loss": 0.4249,
"mean_token_accuracy": 0.8508947864174843,
"num_tokens": 193514317.0,
"step": 450
},
{
"entropy": 0.4267578125,
"epoch": 1.7896825396825395,
"grad_norm": 0.9325477755113131,
"learning_rate": 3.850863187044745e-06,
"loss": 0.4311,
"mean_token_accuracy": 0.8517430359497666,
"num_tokens": 193943892.0,
"step": 451
},
{
"entropy": 0.4212646484375,
"epoch": 1.7936507936507935,
"grad_norm": 1.0936536327558857,
"learning_rate": 3.829582495508844e-06,
"loss": 0.428,
"mean_token_accuracy": 0.8505398780107498,
"num_tokens": 194368425.0,
"step": 452
},
{
"entropy": 0.425689697265625,
"epoch": 1.7976190476190477,
"grad_norm": 0.913775614343544,
"learning_rate": 3.808324211347429e-06,
"loss": 0.4263,
"mean_token_accuracy": 0.8509924123063684,
"num_tokens": 194781122.0,
"step": 453
},
{
"entropy": 0.42474365234375,
"epoch": 1.8015873015873016,
"grad_norm": 0.8819652825019069,
"learning_rate": 3.7870887415454687e-06,
"loss": 0.4352,
"mean_token_accuracy": 0.8501952039077878,
"num_tokens": 195229420.0,
"step": 454
},
{
"entropy": 0.423248291015625,
"epoch": 1.8055555555555556,
"grad_norm": 0.9710832265661201,
"learning_rate": 3.7658764926511613e-06,
"loss": 0.4364,
"mean_token_accuracy": 0.8493523299694061,
"num_tokens": 195670858.0,
"step": 455
},
{
"entropy": 0.429229736328125,
"epoch": 1.8095238095238095,
"grad_norm": 1.0034882334655617,
"learning_rate": 3.7446878707681413e-06,
"loss": 0.4312,
"mean_token_accuracy": 0.8488902822136879,
"num_tokens": 196086060.0,
"step": 456
},
{
"entropy": 0.42626953125,
"epoch": 1.8134920634920635,
"grad_norm": 0.8967060198023731,
"learning_rate": 3.7235232815477123e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.8454429730772972,
"num_tokens": 196534067.0,
"step": 457
},
{
"entropy": 0.433380126953125,
"epoch": 1.8174603174603174,
"grad_norm": 1.0727361296036093,
"learning_rate": 3.7023831301810765e-06,
"loss": 0.4233,
"mean_token_accuracy": 0.852061620913446,
"num_tokens": 196949752.0,
"step": 458
},
{
"entropy": 0.4302978515625,
"epoch": 1.8214285714285714,
"grad_norm": 0.9533053527391133,
"learning_rate": 3.6812678213915777e-06,
"loss": 0.4274,
"mean_token_accuracy": 0.8499543191865087,
"num_tokens": 197361623.0,
"step": 459
},
{
"entropy": 0.428863525390625,
"epoch": 1.8253968253968254,
"grad_norm": 1.6646105544719645,
"learning_rate": 3.6601777594269605e-06,
"loss": 0.4275,
"mean_token_accuracy": 0.8524315897375345,
"num_tokens": 197787383.0,
"step": 460
},
{
"entropy": 0.427886962890625,
"epoch": 1.8293650793650795,
"grad_norm": 0.918452931744825,
"learning_rate": 3.6391133480516196e-06,
"loss": 0.4351,
"mean_token_accuracy": 0.8494909154251218,
"num_tokens": 198214788.0,
"step": 461
},
{
"entropy": 0.433502197265625,
"epoch": 1.8333333333333335,
"grad_norm": 0.9250539034798784,
"learning_rate": 3.618074990538873e-06,
"loss": 0.44,
"mean_token_accuracy": 0.8496057353913784,
"num_tokens": 198640204.0,
"step": 462
},
{
"entropy": 0.4234619140625,
"epoch": 1.8373015873015874,
"grad_norm": 0.8926807300614167,
"learning_rate": 3.5970630896632485e-06,
"loss": 0.4373,
"mean_token_accuracy": 0.8482935605570674,
"num_tokens": 199086174.0,
"step": 463
},
{
"entropy": 0.423919677734375,
"epoch": 1.8412698412698414,
"grad_norm": 0.9317218135024461,
"learning_rate": 3.5760780476927637e-06,
"loss": 0.4342,
"mean_token_accuracy": 0.8504292815923691,
"num_tokens": 199534945.0,
"step": 464
},
{
"entropy": 0.43280029296875,
"epoch": 1.8452380952380953,
"grad_norm": 0.9327031690920736,
"learning_rate": 3.5551202663812344e-06,
"loss": 0.428,
"mean_token_accuracy": 0.851259358227253,
"num_tokens": 199970879.0,
"step": 465
},
{
"entropy": 0.43359375,
"epoch": 1.8492063492063493,
"grad_norm": 0.9103535545774605,
"learning_rate": 3.534190146960571e-06,
"loss": 0.4254,
"mean_token_accuracy": 0.8511311411857605,
"num_tokens": 200401566.0,
"step": 466
},
{
"entropy": 0.43096923828125,
"epoch": 1.8531746031746033,
"grad_norm": 1.3202029413068583,
"learning_rate": 3.5132880901331067e-06,
"loss": 0.4244,
"mean_token_accuracy": 0.8484150217846036,
"num_tokens": 200819281.0,
"step": 467
},
{
"entropy": 0.42852783203125,
"epoch": 1.8571428571428572,
"grad_norm": 0.9663839835801094,
"learning_rate": 3.492414496063921e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.8492425018921494,
"num_tokens": 201286569.0,
"step": 468
},
{
"entropy": 0.42816162109375,
"epoch": 1.8611111111111112,
"grad_norm": 0.922662186018523,
"learning_rate": 3.4715697643731828e-06,
"loss": 0.4286,
"mean_token_accuracy": 0.8502284437417984,
"num_tokens": 201729117.0,
"step": 469
},
{
"entropy": 0.4305419921875,
"epoch": 1.8650793650793651,
"grad_norm": 0.9615527156025448,
"learning_rate": 3.4507542941284933e-06,
"loss": 0.4251,
"mean_token_accuracy": 0.8521155146881938,
"num_tokens": 202148785.0,
"step": 470
},
{
"entropy": 0.42950439453125,
"epoch": 1.869047619047619,
"grad_norm": 0.8896950243538952,
"learning_rate": 3.4299684838372547e-06,
"loss": 0.4209,
"mean_token_accuracy": 0.8519325880333781,
"num_tokens": 202562335.0,
"step": 471
},
{
"entropy": 0.438201904296875,
"epoch": 1.873015873015873,
"grad_norm": 0.896750571119777,
"learning_rate": 3.4092127314390354e-06,
"loss": 0.4241,
"mean_token_accuracy": 0.8511500097811222,
"num_tokens": 202969412.0,
"step": 472
},
{
"entropy": 0.424560546875,
"epoch": 1.876984126984127,
"grad_norm": 0.8342483785030218,
"learning_rate": 3.388487434297949e-06,
"loss": 0.4349,
"mean_token_accuracy": 0.8488007439300418,
"num_tokens": 203414579.0,
"step": 473
},
{
"entropy": 0.429595947265625,
"epoch": 1.880952380952381,
"grad_norm": 0.8918742155840607,
"learning_rate": 3.3677929891950527e-06,
"loss": 0.4247,
"mean_token_accuracy": 0.8510593473911285,
"num_tokens": 203845826.0,
"step": 474
},
{
"entropy": 0.43017578125,
"epoch": 1.8849206349206349,
"grad_norm": 0.9252775003902146,
"learning_rate": 3.347129792320748e-06,
"loss": 0.4272,
"mean_token_accuracy": 0.8510101838037372,
"num_tokens": 204272914.0,
"step": 475
},
{
"entropy": 0.424591064453125,
"epoch": 1.8888888888888888,
"grad_norm": 0.9664584622314957,
"learning_rate": 3.3264982392671973e-06,
"loss": 0.4204,
"mean_token_accuracy": 0.8532195715233684,
"num_tokens": 204713067.0,
"step": 476
},
{
"entropy": 0.42791748046875,
"epoch": 1.8928571428571428,
"grad_norm": 0.9292473265869555,
"learning_rate": 3.3058987250207476e-06,
"loss": 0.4277,
"mean_token_accuracy": 0.8527126982808113,
"num_tokens": 205140799.0,
"step": 477
},
{
"entropy": 0.439788818359375,
"epoch": 1.8968253968253967,
"grad_norm": 0.9128528058058363,
"learning_rate": 3.285331643954372e-06,
"loss": 0.4234,
"mean_token_accuracy": 0.8513627136126161,
"num_tokens": 205549482.0,
"step": 478
},
{
"entropy": 0.428558349609375,
"epoch": 1.9007936507936507,
"grad_norm": 0.9344739197051096,
"learning_rate": 3.2647973898201157e-06,
"loss": 0.4269,
"mean_token_accuracy": 0.8505295282229781,
"num_tokens": 205957709.0,
"step": 479
},
{
"entropy": 0.428436279296875,
"epoch": 1.9047619047619047,
"grad_norm": 0.8831126126363492,
"learning_rate": 3.244296355741561e-06,
"loss": 0.426,
"mean_token_accuracy": 0.8514531748369336,
"num_tokens": 206394578.0,
"step": 480
},
{
"entropy": 0.43328857421875,
"epoch": 1.9087301587301586,
"grad_norm": 0.8812462855968569,
"learning_rate": 3.2238289342063013e-06,
"loss": 0.429,
"mean_token_accuracy": 0.8510967614129186,
"num_tokens": 206810851.0,
"step": 481
},
{
"entropy": 0.428375244140625,
"epoch": 1.9126984126984126,
"grad_norm": 1.0106928205994128,
"learning_rate": 3.203395517058423e-06,
"loss": 0.432,
"mean_token_accuracy": 0.852095915004611,
"num_tokens": 207233636.0,
"step": 482
},
{
"entropy": 0.421112060546875,
"epoch": 1.9166666666666665,
"grad_norm": 0.9116927331499651,
"learning_rate": 3.1829964954910076e-06,
"loss": 0.4363,
"mean_token_accuracy": 0.8473147870972753,
"num_tokens": 207671663.0,
"step": 483
},
{
"entropy": 0.437652587890625,
"epoch": 1.9206349206349205,
"grad_norm": 0.9660485826307438,
"learning_rate": 3.1626322600386418e-06,
"loss": 0.4289,
"mean_token_accuracy": 0.8505426356568933,
"num_tokens": 208074376.0,
"step": 484
},
{
"entropy": 0.4241943359375,
"epoch": 1.9246031746031746,
"grad_norm": 0.9972216512477222,
"learning_rate": 3.1423032005699377e-06,
"loss": 0.4364,
"mean_token_accuracy": 0.8486529793590307,
"num_tokens": 208524843.0,
"step": 485
},
{
"entropy": 0.4322509765625,
"epoch": 1.9285714285714286,
"grad_norm": 0.9283266129413389,
"learning_rate": 3.122009706280072e-06,
"loss": 0.4277,
"mean_token_accuracy": 0.8506509074941278,
"num_tokens": 208947370.0,
"step": 486
},
{
"entropy": 0.42724609375,
"epoch": 1.9325396825396826,
"grad_norm": 1.006394801232037,
"learning_rate": 3.1017521656833384e-06,
"loss": 0.4146,
"mean_token_accuracy": 0.8548265127465129,
"num_tokens": 209354451.0,
"step": 487
},
{
"entropy": 0.4229736328125,
"epoch": 1.9365079365079365,
"grad_norm": 0.8314414813893206,
"learning_rate": 3.0815309666057013e-06,
"loss": 0.428,
"mean_token_accuracy": 0.8494690489023924,
"num_tokens": 209798547.0,
"step": 488
},
{
"entropy": 0.425018310546875,
"epoch": 1.9404761904761905,
"grad_norm": 0.9234785434940929,
"learning_rate": 3.061346496177374e-06,
"loss": 0.421,
"mean_token_accuracy": 0.8528507072478533,
"num_tokens": 210233790.0,
"step": 489
},
{
"entropy": 0.43133544921875,
"epoch": 1.9444444444444444,
"grad_norm": 0.8757613774035661,
"learning_rate": 3.0411991408254116e-06,
"loss": 0.436,
"mean_token_accuracy": 0.8493619496002793,
"num_tokens": 210661829.0,
"step": 490
},
{
"entropy": 0.420318603515625,
"epoch": 1.9484126984126984,
"grad_norm": 0.8668762253896259,
"learning_rate": 3.0210892862663043e-06,
"loss": 0.4267,
"mean_token_accuracy": 0.8510631760582328,
"num_tokens": 211113597.0,
"step": 491
},
{
"entropy": 0.4222412109375,
"epoch": 1.9523809523809523,
"grad_norm": 0.8822229179162288,
"learning_rate": 3.001017317498607e-06,
"loss": 0.4278,
"mean_token_accuracy": 0.8513042591512203,
"num_tokens": 211549046.0,
"step": 492
},
{
"entropy": 0.419830322265625,
"epoch": 1.9563492063492065,
"grad_norm": 0.9142830959986298,
"learning_rate": 2.9809836187955532e-06,
"loss": 0.4139,
"mean_token_accuracy": 0.8542308090254664,
"num_tokens": 212000519.0,
"step": 493
},
{
"entropy": 0.42449951171875,
"epoch": 1.9603174603174605,
"grad_norm": 0.8634339056465669,
"learning_rate": 2.960988573697705e-06,
"loss": 0.428,
"mean_token_accuracy": 0.8506795652210712,
"num_tokens": 212447521.0,
"step": 494
},
{
"entropy": 0.42681884765625,
"epoch": 1.9642857142857144,
"grad_norm": 0.8734416000621907,
"learning_rate": 2.941032565005613e-06,
"loss": 0.4262,
"mean_token_accuracy": 0.8521596789360046,
"num_tokens": 212865927.0,
"step": 495
},
{
"entropy": 0.424072265625,
"epoch": 1.9682539682539684,
"grad_norm": 0.8877032051531498,
"learning_rate": 2.9211159747724813e-06,
"loss": 0.4264,
"mean_token_accuracy": 0.851787575520575,
"num_tokens": 213310334.0,
"step": 496
},
{
"entropy": 0.421661376953125,
"epoch": 1.9722222222222223,
"grad_norm": 0.9809567398581039,
"learning_rate": 2.90123918429686e-06,
"loss": 0.4246,
"mean_token_accuracy": 0.8516859589144588,
"num_tokens": 213742399.0,
"step": 497
},
{
"entropy": 0.42767333984375,
"epoch": 1.9761904761904763,
"grad_norm": 0.8738523997394374,
"learning_rate": 2.881402574115344e-06,
"loss": 0.4273,
"mean_token_accuracy": 0.8529170397669077,
"num_tokens": 214169043.0,
"step": 498
},
{
"entropy": 0.4276123046875,
"epoch": 1.9801587301587302,
"grad_norm": 0.9201362022804491,
"learning_rate": 2.8616065239952763e-06,
"loss": 0.424,
"mean_token_accuracy": 0.8526058839634061,
"num_tokens": 214572957.0,
"step": 499
},
{
"entropy": 0.430877685546875,
"epoch": 1.9841269841269842,
"grad_norm": 0.9306770950977414,
"learning_rate": 2.841851412927495e-06,
"loss": 0.4314,
"mean_token_accuracy": 0.8489747159183025,
"num_tokens": 215005057.0,
"step": 500
},
{
"entropy": 0.4188232421875,
"epoch": 1.9880952380952381,
"grad_norm": 0.8357685751970109,
"learning_rate": 2.822137619119065e-06,
"loss": 0.42,
"mean_token_accuracy": 0.8517758399248123,
"num_tokens": 215449399.0,
"step": 501
},
{
"entropy": 0.426727294921875,
"epoch": 1.992063492063492,
"grad_norm": 1.1544716066903413,
"learning_rate": 2.8024655199860495e-06,
"loss": 0.4154,
"mean_token_accuracy": 0.8549016704782844,
"num_tokens": 215869766.0,
"step": 502
},
{
"entropy": 0.427978515625,
"epoch": 1.996031746031746,
"grad_norm": 0.8289572581024041,
"learning_rate": 2.7828354921462668e-06,
"loss": 0.4184,
"mean_token_accuracy": 0.8542971862480044,
"num_tokens": 216298988.0,
"step": 503
},
{
"entropy": 0.4202880859375,
"epoch": 2.0,
"grad_norm": 0.8750452382881969,
"learning_rate": 2.7632479114120963e-06,
"loss": 0.4177,
"mean_token_accuracy": 0.8540928428992629,
"num_tokens": 216731206.0,
"step": 504
},
{
"entropy": 0.420989990234375,
"epoch": 2.003968253968254,
"grad_norm": 0.8871159450799843,
"learning_rate": 2.7437031527832747e-06,
"loss": 0.3994,
"mean_token_accuracy": 0.860961563885212,
"num_tokens": 217159781.0,
"step": 505
},
{
"entropy": 0.425262451171875,
"epoch": 2.007936507936508,
"grad_norm": 0.9044028336131849,
"learning_rate": 2.72420159043972e-06,
"loss": 0.3935,
"mean_token_accuracy": 0.8634284269064665,
"num_tokens": 217589905.0,
"step": 506
},
{
"entropy": 0.42340087890625,
"epoch": 2.011904761904762,
"grad_norm": 0.8841207327958758,
"learning_rate": 2.704743597734365e-06,
"loss": 0.3933,
"mean_token_accuracy": 0.8630258431658149,
"num_tokens": 218017429.0,
"step": 507
},
{
"entropy": 0.42041015625,
"epoch": 2.015873015873016,
"grad_norm": 0.8980425705440174,
"learning_rate": 2.685329547186018e-06,
"loss": 0.4083,
"mean_token_accuracy": 0.8567906338721514,
"num_tokens": 218446876.0,
"step": 508
},
{
"entropy": 0.4229736328125,
"epoch": 2.0198412698412698,
"grad_norm": 0.909158252805293,
"learning_rate": 2.665959810472219e-06,
"loss": 0.4067,
"mean_token_accuracy": 0.8580641169101,
"num_tokens": 218885713.0,
"step": 509
},
{
"entropy": 0.41693115234375,
"epoch": 2.0238095238095237,
"grad_norm": 0.882075206716414,
"learning_rate": 2.6466347584221314e-06,
"loss": 0.3961,
"mean_token_accuracy": 0.861279109492898,
"num_tokens": 219322571.0,
"step": 510
},
{
"entropy": 0.422607421875,
"epoch": 2.0277777777777777,
"grad_norm": 0.8895301340223191,
"learning_rate": 2.6273547610094408e-06,
"loss": 0.4007,
"mean_token_accuracy": 0.8570800367742777,
"num_tokens": 219748508.0,
"step": 511
},
{
"entropy": 0.420166015625,
"epoch": 2.0317460317460316,
"grad_norm": 0.908409070674735,
"learning_rate": 2.608120187345273e-06,
"loss": 0.3983,
"mean_token_accuracy": 0.8590443721041083,
"num_tokens": 220180160.0,
"step": 512
},
{
"entropy": 0.4185791015625,
"epoch": 2.0357142857142856,
"grad_norm": 1.034313453109704,
"learning_rate": 2.588931405671127e-06,
"loss": 0.3916,
"mean_token_accuracy": 0.8636050894856453,
"num_tokens": 220606565.0,
"step": 513
},
{
"entropy": 0.422393798828125,
"epoch": 2.0396825396825395,
"grad_norm": 0.8777983265834516,
"learning_rate": 2.5697887833518215e-06,
"loss": 0.3897,
"mean_token_accuracy": 0.8630373626947403,
"num_tokens": 221016578.0,
"step": 514
},
{
"entropy": 0.41497802734375,
"epoch": 2.0436507936507935,
"grad_norm": 0.9119000908237385,
"learning_rate": 2.5506926868684683e-06,
"loss": 0.3967,
"mean_token_accuracy": 0.8603310724720359,
"num_tokens": 221455851.0,
"step": 515
},
{
"entropy": 0.424346923828125,
"epoch": 2.0476190476190474,
"grad_norm": 0.9104788824732245,
"learning_rate": 2.5316434818114517e-06,
"loss": 0.4009,
"mean_token_accuracy": 0.8583084382116795,
"num_tokens": 221871968.0,
"step": 516
},
{
"entropy": 0.41632080078125,
"epoch": 2.0515873015873014,
"grad_norm": 0.7974753175425153,
"learning_rate": 2.5126415328734275e-06,
"loss": 0.3875,
"mean_token_accuracy": 0.8620841084048152,
"num_tokens": 222303576.0,
"step": 517
},
{
"entropy": 0.41741943359375,
"epoch": 2.0555555555555554,
"grad_norm": 0.8523247821631298,
"learning_rate": 2.4936872038423516e-06,
"loss": 0.3935,
"mean_token_accuracy": 0.8615706618875265,
"num_tokens": 222738323.0,
"step": 518
},
{
"entropy": 0.416717529296875,
"epoch": 2.0595238095238093,
"grad_norm": 0.8420283553726328,
"learning_rate": 2.4747808575945006e-06,
"loss": 0.3942,
"mean_token_accuracy": 0.8623552098870277,
"num_tokens": 223168261.0,
"step": 519
},
{
"entropy": 0.421295166015625,
"epoch": 2.0634920634920633,
"grad_norm": 0.9269712393029744,
"learning_rate": 2.4559228560875336e-06,
"loss": 0.3983,
"mean_token_accuracy": 0.8609938519075513,
"num_tokens": 223584134.0,
"step": 520
},
{
"entropy": 0.41546630859375,
"epoch": 2.0674603174603177,
"grad_norm": 0.7913231790323264,
"learning_rate": 2.4371135603535613e-06,
"loss": 0.3881,
"mean_token_accuracy": 0.8632083088159561,
"num_tokens": 224013215.0,
"step": 521
},
{
"entropy": 0.40972900390625,
"epoch": 2.0714285714285716,
"grad_norm": 0.8896009296171342,
"learning_rate": 2.4183533304922336e-06,
"loss": 0.4024,
"mean_token_accuracy": 0.8593400968238711,
"num_tokens": 224461654.0,
"step": 522
},
{
"entropy": 0.416046142578125,
"epoch": 2.0753968253968256,
"grad_norm": 0.8522563242461978,
"learning_rate": 2.399642525663843e-06,
"loss": 0.3968,
"mean_token_accuracy": 0.8609009999781847,
"num_tokens": 224885889.0,
"step": 523
},
{
"entropy": 0.41802978515625,
"epoch": 2.0793650793650795,
"grad_norm": 0.8436355578137702,
"learning_rate": 2.380981504082459e-06,
"loss": 0.4051,
"mean_token_accuracy": 0.8574947854503989,
"num_tokens": 225327562.0,
"step": 524
},
{
"entropy": 0.410980224609375,
"epoch": 2.0833333333333335,
"grad_norm": 0.9234046715388234,
"learning_rate": 2.3623706230090517e-06,
"loss": 0.3946,
"mean_token_accuracy": 0.860747816041112,
"num_tokens": 225767121.0,
"step": 525
},
{
"entropy": 0.4215087890625,
"epoch": 2.0873015873015874,
"grad_norm": 0.886667769462096,
"learning_rate": 2.3438102387446686e-06,
"loss": 0.3887,
"mean_token_accuracy": 0.8633216423913836,
"num_tokens": 226189031.0,
"step": 526
},
{
"entropy": 0.41558837890625,
"epoch": 2.0912698412698414,
"grad_norm": 0.8295983883133476,
"learning_rate": 2.325300706623607e-06,
"loss": 0.4059,
"mean_token_accuracy": 0.8594214450567961,
"num_tokens": 226627902.0,
"step": 527
},
{
"entropy": 0.416168212890625,
"epoch": 2.0952380952380953,
"grad_norm": 0.8579824625414783,
"learning_rate": 2.3068423810066085e-06,
"loss": 0.4086,
"mean_token_accuracy": 0.8578107142820954,
"num_tokens": 227062309.0,
"step": 528
},
{
"entropy": 0.418792724609375,
"epoch": 2.0992063492063493,
"grad_norm": 0.8717081684901182,
"learning_rate": 2.288435615274085e-06,
"loss": 0.4026,
"mean_token_accuracy": 0.8583700396120548,
"num_tokens": 227485113.0,
"step": 529
},
{
"entropy": 0.418609619140625,
"epoch": 2.1031746031746033,
"grad_norm": 0.8671184672809995,
"learning_rate": 2.2700807618193393e-06,
"loss": 0.3945,
"mean_token_accuracy": 0.8610662836581469,
"num_tokens": 227920598.0,
"step": 530
},
{
"entropy": 0.416961669921875,
"epoch": 2.107142857142857,
"grad_norm": 0.7659046613801866,
"learning_rate": 2.251778172041828e-06,
"loss": 0.391,
"mean_token_accuracy": 0.8613040810450912,
"num_tokens": 228346699.0,
"step": 531
},
{
"entropy": 0.41766357421875,
"epoch": 2.111111111111111,
"grad_norm": 0.8757955281793407,
"learning_rate": 2.2335281963404315e-06,
"loss": 0.3985,
"mean_token_accuracy": 0.86165143083781,
"num_tokens": 228773818.0,
"step": 532
},
{
"entropy": 0.41998291015625,
"epoch": 2.115079365079365,
"grad_norm": 0.9727283374741916,
"learning_rate": 2.2153311841067438e-06,
"loss": 0.3928,
"mean_token_accuracy": 0.8631924940273166,
"num_tokens": 229188623.0,
"step": 533
},
{
"entropy": 0.412200927734375,
"epoch": 2.119047619047619,
"grad_norm": 0.8392433239210284,
"learning_rate": 2.1971874837183914e-06,
"loss": 0.3869,
"mean_token_accuracy": 0.8635608870536089,
"num_tokens": 229627711.0,
"step": 534
},
{
"entropy": 0.415802001953125,
"epoch": 2.123015873015873,
"grad_norm": 0.9201827428240057,
"learning_rate": 2.179097442532352e-06,
"loss": 0.4088,
"mean_token_accuracy": 0.8568679317831993,
"num_tokens": 230054209.0,
"step": 535
},
{
"entropy": 0.41278076171875,
"epoch": 2.126984126984127,
"grad_norm": 0.8066388305393899,
"learning_rate": 2.1610614068783112e-06,
"loss": 0.3981,
"mean_token_accuracy": 0.8601069571450353,
"num_tokens": 230489032.0,
"step": 536
},
{
"entropy": 0.411895751953125,
"epoch": 2.130952380952381,
"grad_norm": 0.8350937916956933,
"learning_rate": 2.143079722052034e-06,
"loss": 0.4015,
"mean_token_accuracy": 0.8587260395288467,
"num_tokens": 230910745.0,
"step": 537
},
{
"entropy": 0.417938232421875,
"epoch": 2.134920634920635,
"grad_norm": 0.791508989758568,
"learning_rate": 2.125152732308747e-06,
"loss": 0.4049,
"mean_token_accuracy": 0.8583241375163198,
"num_tokens": 231339019.0,
"step": 538
},
{
"entropy": 0.4166259765625,
"epoch": 2.138888888888889,
"grad_norm": 0.7979398132027408,
"learning_rate": 2.1072807808565547e-06,
"loss": 0.4084,
"mean_token_accuracy": 0.8571968795731664,
"num_tokens": 231777523.0,
"step": 539
},
{
"entropy": 0.420440673828125,
"epoch": 2.142857142857143,
"grad_norm": 0.8603306148484448,
"learning_rate": 2.0894642098498656e-06,
"loss": 0.3952,
"mean_token_accuracy": 0.859032517299056,
"num_tokens": 232199672.0,
"step": 540
},
{
"entropy": 0.423187255859375,
"epoch": 2.1468253968253967,
"grad_norm": 0.9055074686631474,
"learning_rate": 2.0717033603828436e-06,
"loss": 0.3923,
"mean_token_accuracy": 0.8614393156021833,
"num_tokens": 232633797.0,
"step": 541
},
{
"entropy": 0.417877197265625,
"epoch": 2.1507936507936507,
"grad_norm": 0.8617856992329058,
"learning_rate": 2.0539985724828736e-06,
"loss": 0.4081,
"mean_token_accuracy": 0.8573337839916348,
"num_tokens": 233076007.0,
"step": 542
},
{
"entropy": 0.41546630859375,
"epoch": 2.1547619047619047,
"grad_norm": 0.8903667184752816,
"learning_rate": 2.0363501851040573e-06,
"loss": 0.3922,
"mean_token_accuracy": 0.861387861892581,
"num_tokens": 233509851.0,
"step": 543
},
{
"entropy": 0.4229736328125,
"epoch": 2.1587301587301586,
"grad_norm": 0.8398162712869015,
"learning_rate": 2.0187585361207174e-06,
"loss": 0.4043,
"mean_token_accuracy": 0.857014361768961,
"num_tokens": 233942156.0,
"step": 544
},
{
"entropy": 0.418701171875,
"epoch": 2.1626984126984126,
"grad_norm": 0.8309474925972752,
"learning_rate": 2.001223962320941e-06,
"loss": 0.3959,
"mean_token_accuracy": 0.8592708380892873,
"num_tokens": 234372963.0,
"step": 545
},
{
"entropy": 0.414398193359375,
"epoch": 2.1666666666666665,
"grad_norm": 0.8088942738118841,
"learning_rate": 1.9837467994001165e-06,
"loss": 0.4048,
"mean_token_accuracy": 0.8613162385299802,
"num_tokens": 234844668.0,
"step": 546
},
{
"entropy": 0.429656982421875,
"epoch": 2.1706349206349205,
"grad_norm": 0.8900138868011044,
"learning_rate": 1.9663273819545157e-06,
"loss": 0.4117,
"mean_token_accuracy": 0.8555487683042884,
"num_tokens": 235271990.0,
"step": 547
},
{
"entropy": 0.416961669921875,
"epoch": 2.1746031746031744,
"grad_norm": 0.8125994478475848,
"learning_rate": 1.948966043474889e-06,
"loss": 0.3981,
"mean_token_accuracy": 0.8588608456775546,
"num_tokens": 235697877.0,
"step": 548
},
{
"entropy": 0.429046630859375,
"epoch": 2.1785714285714284,
"grad_norm": 0.9972924104553051,
"learning_rate": 1.931663116340074e-06,
"loss": 0.4049,
"mean_token_accuracy": 0.8577186185866594,
"num_tokens": 236134537.0,
"step": 549
},
{
"entropy": 0.410797119140625,
"epoch": 2.1825396825396823,
"grad_norm": 0.8632872657339906,
"learning_rate": 1.914418931810643e-06,
"loss": 0.3855,
"mean_token_accuracy": 0.8640564111992717,
"num_tokens": 236586699.0,
"step": 550
},
{
"entropy": 0.424530029296875,
"epoch": 2.1865079365079367,
"grad_norm": 0.8870689635471863,
"learning_rate": 1.8972338200225509e-06,
"loss": 0.3984,
"mean_token_accuracy": 0.8577613439410925,
"num_tokens": 236995332.0,
"step": 551
},
{
"entropy": 0.418975830078125,
"epoch": 2.1904761904761907,
"grad_norm": 0.9628030178975229,
"learning_rate": 1.880108109980815e-06,
"loss": 0.3934,
"mean_token_accuracy": 0.861169021576643,
"num_tokens": 237426378.0,
"step": 552
},
{
"entropy": 0.411376953125,
"epoch": 2.1944444444444446,
"grad_norm": 0.933588404712383,
"learning_rate": 1.8630421295532252e-06,
"loss": 0.3905,
"mean_token_accuracy": 0.8604107396677136,
"num_tokens": 237866086.0,
"step": 553
},
{
"entropy": 0.41845703125,
"epoch": 2.1984126984126986,
"grad_norm": 1.0435808914840323,
"learning_rate": 1.8460362054640573e-06,
"loss": 0.4007,
"mean_token_accuracy": 0.8584116594865918,
"num_tokens": 238297987.0,
"step": 554
},
{
"entropy": 0.4312744140625,
"epoch": 2.2023809523809526,
"grad_norm": 0.9124011744416908,
"learning_rate": 1.8290906632878297e-06,
"loss": 0.4056,
"mean_token_accuracy": 0.8590257493779063,
"num_tokens": 238729296.0,
"step": 555
},
{
"entropy": 0.41986083984375,
"epoch": 2.2063492063492065,
"grad_norm": 0.9196757371946168,
"learning_rate": 1.8122058274430542e-06,
"loss": 0.408,
"mean_token_accuracy": 0.8594406340271235,
"num_tokens": 239171101.0,
"step": 556
},
{
"entropy": 0.42120361328125,
"epoch": 2.2103174603174605,
"grad_norm": 0.8297358875305545,
"learning_rate": 1.7953820211860395e-06,
"loss": 0.3919,
"mean_token_accuracy": 0.8603522703051567,
"num_tokens": 239602299.0,
"step": 557
},
{
"entropy": 0.41949462890625,
"epoch": 2.2142857142857144,
"grad_norm": 1.6698534343246039,
"learning_rate": 1.7786195666046935e-06,
"loss": 0.3915,
"mean_token_accuracy": 0.8623024551197886,
"num_tokens": 240034337.0,
"step": 558
},
{
"entropy": 0.42144775390625,
"epoch": 2.2182539682539684,
"grad_norm": 0.8963232285622191,
"learning_rate": 1.7619187846123624e-06,
"loss": 0.3901,
"mean_token_accuracy": 0.8617757288739085,
"num_tokens": 240461291.0,
"step": 559
},
{
"entropy": 0.42474365234375,
"epoch": 2.2222222222222223,
"grad_norm": 0.9778763913057226,
"learning_rate": 1.7452799949416833e-06,
"loss": 0.384,
"mean_token_accuracy": 0.8640343863517046,
"num_tokens": 240860927.0,
"step": 560
},
{
"entropy": 0.41705322265625,
"epoch": 2.2261904761904763,
"grad_norm": 0.8286270345827924,
"learning_rate": 1.7287035161384673e-06,
"loss": 0.3996,
"mean_token_accuracy": 0.8590253088623285,
"num_tokens": 241301179.0,
"step": 561
},
{
"entropy": 0.418853759765625,
"epoch": 2.2301587301587302,
"grad_norm": 0.8430918806162481,
"learning_rate": 1.7121896655555958e-06,
"loss": 0.396,
"mean_token_accuracy": 0.860031645745039,
"num_tokens": 241739076.0,
"step": 562
},
{
"entropy": 0.424774169921875,
"epoch": 2.234126984126984,
"grad_norm": 0.826236198905769,
"learning_rate": 1.695738759346947e-06,
"loss": 0.3891,
"mean_token_accuracy": 0.8625601828098297,
"num_tokens": 242150640.0,
"step": 563
},
{
"entropy": 0.412109375,
"epoch": 2.238095238095238,
"grad_norm": 0.8853893523977265,
"learning_rate": 1.6793511124613455e-06,
"loss": 0.3874,
"mean_token_accuracy": 0.8637553565204144,
"num_tokens": 242574011.0,
"step": 564
},
{
"entropy": 0.422393798828125,
"epoch": 2.242063492063492,
"grad_norm": 0.9075367727640452,
"learning_rate": 1.6630270386365288e-06,
"loss": 0.3989,
"mean_token_accuracy": 0.8571943752467632,
"num_tokens": 243005939.0,
"step": 565
},
{
"entropy": 0.41766357421875,
"epoch": 2.246031746031746,
"grad_norm": 0.8448948319006312,
"learning_rate": 1.6467668503931432e-06,
"loss": 0.398,
"mean_token_accuracy": 0.861447062343359,
"num_tokens": 243458878.0,
"step": 566
},
{
"entropy": 0.418365478515625,
"epoch": 2.25,
"grad_norm": 0.9930222072087751,
"learning_rate": 1.6305708590287616e-06,
"loss": 0.3997,
"mean_token_accuracy": 0.8600739203393459,
"num_tokens": 243877438.0,
"step": 567
},
{
"entropy": 0.41552734375,
"epoch": 2.253968253968254,
"grad_norm": 0.8598361323835692,
"learning_rate": 1.6144393746119208e-06,
"loss": 0.3943,
"mean_token_accuracy": 0.8619920583441854,
"num_tokens": 244313964.0,
"step": 568
},
{
"entropy": 0.41705322265625,
"epoch": 2.257936507936508,
"grad_norm": 0.9059341355540655,
"learning_rate": 1.5983727059761873e-06,
"loss": 0.3981,
"mean_token_accuracy": 0.8603257145732641,
"num_tokens": 244761734.0,
"step": 569
},
{
"entropy": 0.417938232421875,
"epoch": 2.261904761904762,
"grad_norm": 0.8354660701028858,
"learning_rate": 1.5823711607142428e-06,
"loss": 0.3843,
"mean_token_accuracy": 0.863621992059052,
"num_tokens": 245200322.0,
"step": 570
},
{
"entropy": 0.416839599609375,
"epoch": 2.265873015873016,
"grad_norm": 0.8345755216968843,
"learning_rate": 1.5664350451720022e-06,
"loss": 0.396,
"mean_token_accuracy": 0.8610862046480179,
"num_tokens": 245646233.0,
"step": 571
},
{
"entropy": 0.421661376953125,
"epoch": 2.2698412698412698,
"grad_norm": 0.8201081491300131,
"learning_rate": 1.5505646644427375e-06,
"loss": 0.395,
"mean_token_accuracy": 0.8609900875017047,
"num_tokens": 246083539.0,
"step": 572
},
{
"entropy": 0.421539306640625,
"epoch": 2.2738095238095237,
"grad_norm": 0.8429380051297379,
"learning_rate": 1.5347603223612462e-06,
"loss": 0.3963,
"mean_token_accuracy": 0.860317200422287,
"num_tokens": 246515677.0,
"step": 573
},
{
"entropy": 0.4217529296875,
"epoch": 2.2777777777777777,
"grad_norm": 0.868322359342986,
"learning_rate": 1.5190223214980286e-06,
"loss": 0.3976,
"mean_token_accuracy": 0.8608297156170011,
"num_tokens": 246933619.0,
"step": 574
},
{
"entropy": 0.43359375,
"epoch": 2.2817460317460316,
"grad_norm": 0.8952218666631779,
"learning_rate": 1.5033509631534986e-06,
"loss": 0.3966,
"mean_token_accuracy": 0.8629090571776032,
"num_tokens": 247344382.0,
"step": 575
},
{
"entropy": 0.41790771484375,
"epoch": 2.2857142857142856,
"grad_norm": 0.9480496740892829,
"learning_rate": 1.4877465473522178e-06,
"loss": 0.3813,
"mean_token_accuracy": 0.8640672285109758,
"num_tokens": 247765672.0,
"step": 576
},
{
"entropy": 0.42218017578125,
"epoch": 2.2896825396825395,
"grad_norm": 0.9704838555740247,
"learning_rate": 1.4722093728371427e-06,
"loss": 0.3878,
"mean_token_accuracy": 0.8612747713923454,
"num_tokens": 248183306.0,
"step": 577
},
{
"entropy": 0.410430908203125,
"epoch": 2.2936507936507935,
"grad_norm": 0.8533419703585065,
"learning_rate": 1.4567397370639158e-06,
"loss": 0.3927,
"mean_token_accuracy": 0.8615565691143274,
"num_tokens": 248628378.0,
"step": 578
},
{
"entropy": 0.41888427734375,
"epoch": 2.2976190476190474,
"grad_norm": 0.818324266262677,
"learning_rate": 1.4413379361951596e-06,
"loss": 0.4009,
"mean_token_accuracy": 0.8598908875137568,
"num_tokens": 249071096.0,
"step": 579
},
{
"entropy": 0.41949462890625,
"epoch": 2.3015873015873014,
"grad_norm": 0.8157937775196074,
"learning_rate": 1.4260042650948187e-06,
"loss": 0.3959,
"mean_token_accuracy": 0.8613967839628458,
"num_tokens": 249501143.0,
"step": 580
},
{
"entropy": 0.419769287109375,
"epoch": 2.3055555555555554,
"grad_norm": 0.948858831726886,
"learning_rate": 1.4107390173225045e-06,
"loss": 0.3945,
"mean_token_accuracy": 0.8604099499061704,
"num_tokens": 249948355.0,
"step": 581
},
{
"entropy": 0.42041015625,
"epoch": 2.3095238095238093,
"grad_norm": 0.8758102059030293,
"learning_rate": 1.395542485127886e-06,
"loss": 0.388,
"mean_token_accuracy": 0.8634849116206169,
"num_tokens": 250356099.0,
"step": 582
},
{
"entropy": 0.421234130859375,
"epoch": 2.3134920634920633,
"grad_norm": 0.8815188369640882,
"learning_rate": 1.3804149594450816e-06,
"loss": 0.3919,
"mean_token_accuracy": 0.8597034253180027,
"num_tokens": 250775592.0,
"step": 583
},
{
"entropy": 0.418121337890625,
"epoch": 2.317460317460317,
"grad_norm": 0.861023672134407,
"learning_rate": 1.365356729887099e-06,
"loss": 0.4,
"mean_token_accuracy": 0.8603812381625175,
"num_tokens": 251219125.0,
"step": 584
},
{
"entropy": 0.415496826171875,
"epoch": 2.3214285714285716,
"grad_norm": 0.8641123367226853,
"learning_rate": 1.3503680847402868e-06,
"loss": 0.3933,
"mean_token_accuracy": 0.8616957142949104,
"num_tokens": 251648861.0,
"step": 585
},
{
"entropy": 0.41497802734375,
"epoch": 2.3253968253968256,
"grad_norm": 0.8154240634747612,
"learning_rate": 1.3354493109588145e-06,
"loss": 0.3926,
"mean_token_accuracy": 0.8618068303912878,
"num_tokens": 252080434.0,
"step": 586
},
{
"entropy": 0.417633056640625,
"epoch": 2.3293650793650795,
"grad_norm": 0.8354299632421693,
"learning_rate": 1.320600694159185e-06,
"loss": 0.3828,
"mean_token_accuracy": 0.8655170071870089,
"num_tokens": 252502018.0,
"step": 587
},
{
"entropy": 0.420166015625,
"epoch": 2.3333333333333335,
"grad_norm": 0.9436967025783154,
"learning_rate": 1.3058225186147572e-06,
"loss": 0.3957,
"mean_token_accuracy": 0.8595009902492166,
"num_tokens": 252923218.0,
"step": 588
},
{
"entropy": 0.419464111328125,
"epoch": 2.3373015873015874,
"grad_norm": 0.8818218399814328,
"learning_rate": 1.2911150672503098e-06,
"loss": 0.3867,
"mean_token_accuracy": 0.8642842434346676,
"num_tokens": 253337148.0,
"step": 589
},
{
"entropy": 0.426788330078125,
"epoch": 2.3412698412698414,
"grad_norm": 0.8980593730409643,
"learning_rate": 1.2764786216366236e-06,
"loss": 0.3988,
"mean_token_accuracy": 0.8595603117719293,
"num_tokens": 253761289.0,
"step": 590
},
{
"entropy": 0.416748046875,
"epoch": 2.3452380952380953,
"grad_norm": 0.944966296741567,
"learning_rate": 1.2619134619850908e-06,
"loss": 0.3929,
"mean_token_accuracy": 0.8604479916393757,
"num_tokens": 254195017.0,
"step": 591
},
{
"entropy": 0.41595458984375,
"epoch": 2.3492063492063493,
"grad_norm": 1.0810430230383554,
"learning_rate": 1.2474198671423493e-06,
"loss": 0.3999,
"mean_token_accuracy": 0.8599454695358872,
"num_tokens": 254643716.0,
"step": 592
},
{
"entropy": 0.41754150390625,
"epoch": 2.3531746031746033,
"grad_norm": 0.812428846397806,
"learning_rate": 1.2329981145849468e-06,
"loss": 0.3977,
"mean_token_accuracy": 0.8586347484961152,
"num_tokens": 255069339.0,
"step": 593
},
{
"entropy": 0.42437744140625,
"epoch": 2.357142857142857,
"grad_norm": 0.8302059952828363,
"learning_rate": 1.2186484804140242e-06,
"loss": 0.3942,
"mean_token_accuracy": 0.8609241275116801,
"num_tokens": 255486573.0,
"step": 594
},
{
"entropy": 0.415771484375,
"epoch": 2.361111111111111,
"grad_norm": 0.8148827903906969,
"learning_rate": 1.2043712393500355e-06,
"loss": 0.3876,
"mean_token_accuracy": 0.8626940259709954,
"num_tokens": 255913426.0,
"step": 595
},
{
"entropy": 0.42193603515625,
"epoch": 2.365079365079365,
"grad_norm": 1.2289420157864683,
"learning_rate": 1.1901666647274823e-06,
"loss": 0.3841,
"mean_token_accuracy": 0.8637949759140611,
"num_tokens": 256345326.0,
"step": 596
},
{
"entropy": 0.416656494140625,
"epoch": 2.369047619047619,
"grad_norm": 0.8492473570067233,
"learning_rate": 1.1760350284896876e-06,
"loss": 0.388,
"mean_token_accuracy": 0.864149815402925,
"num_tokens": 256765173.0,
"step": 597
},
{
"entropy": 0.418212890625,
"epoch": 2.373015873015873,
"grad_norm": 0.7898920278559984,
"learning_rate": 1.1619766011835832e-06,
"loss": 0.3797,
"mean_token_accuracy": 0.8674542000517249,
"num_tokens": 257185152.0,
"step": 598
},
{
"entropy": 0.415191650390625,
"epoch": 2.376984126984127,
"grad_norm": 0.8450780742867222,
"learning_rate": 1.1479916519545326e-06,
"loss": 0.3903,
"mean_token_accuracy": 0.8624427672475576,
"num_tokens": 257627732.0,
"step": 599
},
{
"entropy": 0.414825439453125,
"epoch": 2.380952380952381,
"grad_norm": 0.8928696413952878,
"learning_rate": 1.1340804485411783e-06,
"loss": 0.3917,
"mean_token_accuracy": 0.8615064565092325,
"num_tokens": 258067282.0,
"step": 600
},
{
"entropy": 0.421417236328125,
"epoch": 2.384920634920635,
"grad_norm": 0.9596298099931699,
"learning_rate": 1.1202432572703176e-06,
"loss": 0.396,
"mean_token_accuracy": 0.8607813809067011,
"num_tokens": 258491168.0,
"step": 601
},
{
"entropy": 0.412139892578125,
"epoch": 2.388888888888889,
"grad_norm": 0.8015642815814561,
"learning_rate": 1.1064803430518002e-06,
"loss": 0.3919,
"mean_token_accuracy": 0.8602419178932905,
"num_tokens": 258944016.0,
"step": 602
},
{
"entropy": 0.419189453125,
"epoch": 2.392857142857143,
"grad_norm": 0.83893313171213,
"learning_rate": 1.0927919693734618e-06,
"loss": 0.3941,
"mean_token_accuracy": 0.8623963864520192,
"num_tokens": 259379493.0,
"step": 603
},
{
"entropy": 0.421142578125,
"epoch": 2.3968253968253967,
"grad_norm": 0.806191116058063,
"learning_rate": 1.0791783982960736e-06,
"loss": 0.3875,
"mean_token_accuracy": 0.8618775270879269,
"num_tokens": 259808268.0,
"step": 604
},
{
"entropy": 0.412872314453125,
"epoch": 2.4007936507936507,
"grad_norm": 0.8986481499489538,
"learning_rate": 1.0656398904483312e-06,
"loss": 0.395,
"mean_token_accuracy": 0.8624038007110357,
"num_tokens": 260247659.0,
"step": 605
},
{
"entropy": 0.41680908203125,
"epoch": 2.4047619047619047,
"grad_norm": 0.9536388176335355,
"learning_rate": 1.0521767050218562e-06,
"loss": 0.4001,
"mean_token_accuracy": 0.860544073395431,
"num_tokens": 260684292.0,
"step": 606
},
{
"entropy": 0.416168212890625,
"epoch": 2.4087301587301586,
"grad_norm": 0.8770578300353563,
"learning_rate": 1.0387890997662443e-06,
"loss": 0.3945,
"mean_token_accuracy": 0.8609949657693505,
"num_tokens": 261121173.0,
"step": 607
},
{
"entropy": 0.42376708984375,
"epoch": 2.4126984126984126,
"grad_norm": 0.8910554686210177,
"learning_rate": 1.0254773309841277e-06,
"loss": 0.3967,
"mean_token_accuracy": 0.8618429079651833,
"num_tokens": 261555918.0,
"step": 608
},
{
"entropy": 0.417755126953125,
"epoch": 2.4166666666666665,
"grad_norm": 0.777450132911365,
"learning_rate": 1.012241653526263e-06,
"loss": 0.3946,
"mean_token_accuracy": 0.8610922154039145,
"num_tokens": 262000331.0,
"step": 609
},
{
"entropy": 0.42059326171875,
"epoch": 2.4206349206349205,
"grad_norm": 0.8219463383506274,
"learning_rate": 9.990823207866578e-07,
"loss": 0.386,
"mean_token_accuracy": 0.8632347630336881,
"num_tokens": 262425946.0,
"step": 610
},
{
"entropy": 0.42413330078125,
"epoch": 2.4246031746031744,
"grad_norm": 0.9964330808029446,
"learning_rate": 9.85999584697716e-07,
"loss": 0.3892,
"mean_token_accuracy": 0.8625029819086194,
"num_tokens": 262853210.0,
"step": 611
},
{
"entropy": 0.42291259765625,
"epoch": 2.4285714285714284,
"grad_norm": 0.8278237327212594,
"learning_rate": 9.729936957254165e-07,
"loss": 0.3822,
"mean_token_accuracy": 0.864779950119555,
"num_tokens": 263268966.0,
"step": 612
},
{
"entropy": 0.41943359375,
"epoch": 2.432539682539683,
"grad_norm": 0.9884237647568829,
"learning_rate": 9.600649028645215e-07,
"loss": 0.3933,
"mean_token_accuracy": 0.8612792957574129,
"num_tokens": 263709583.0,
"step": 613
},
{
"entropy": 0.418853759765625,
"epoch": 2.4365079365079367,
"grad_norm": 0.9015756745222828,
"learning_rate": 9.472134536338007e-07,
"loss": 0.3859,
"mean_token_accuracy": 0.8643078990280628,
"num_tokens": 264137961.0,
"step": 614
},
{
"entropy": 0.411834716796875,
"epoch": 2.4404761904761907,
"grad_norm": 0.8083110921800731,
"learning_rate": 9.344395940713009e-07,
"loss": 0.3905,
"mean_token_accuracy": 0.8626386728137732,
"num_tokens": 264579578.0,
"step": 615
},
{
"entropy": 0.42529296875,
"epoch": 2.4444444444444446,
"grad_norm": 0.8539196851499317,
"learning_rate": 9.217435687296305e-07,
"loss": 0.3889,
"mean_token_accuracy": 0.8617231827229261,
"num_tokens": 264995910.0,
"step": 616
},
{
"entropy": 0.4136962890625,
"epoch": 2.4484126984126986,
"grad_norm": 0.7995502674271355,
"learning_rate": 9.091256206712812e-07,
"loss": 0.3925,
"mean_token_accuracy": 0.8612663270905614,
"num_tokens": 265440836.0,
"step": 617
},
{
"entropy": 0.42041015625,
"epoch": 2.4523809523809526,
"grad_norm": 0.8157462797571775,
"learning_rate": 8.965859914639724e-07,
"loss": 0.3832,
"mean_token_accuracy": 0.8630803981795907,
"num_tokens": 265867518.0,
"step": 618
},
{
"entropy": 0.420501708984375,
"epoch": 2.4563492063492065,
"grad_norm": 0.9573151028277197,
"learning_rate": 8.841249211760272e-07,
"loss": 0.4006,
"mean_token_accuracy": 0.8605411788448691,
"num_tokens": 266304569.0,
"step": 619
},
{
"entropy": 0.416839599609375,
"epoch": 2.4603174603174605,
"grad_norm": 0.8389660650593388,
"learning_rate": 8.717426483717762e-07,
"loss": 0.3843,
"mean_token_accuracy": 0.8629998695105314,
"num_tokens": 266730039.0,
"step": 620
},
{
"entropy": 0.4168701171875,
"epoch": 2.4642857142857144,
"grad_norm": 0.8408327374770237,
"learning_rate": 8.594394101069897e-07,
"loss": 0.4009,
"mean_token_accuracy": 0.8605172112584114,
"num_tokens": 267169815.0,
"step": 621
},
{
"entropy": 0.41571044921875,
"epoch": 2.4682539682539684,
"grad_norm": 0.8011141591258287,
"learning_rate": 8.472154419243411e-07,
"loss": 0.3918,
"mean_token_accuracy": 0.8619374986737967,
"num_tokens": 267605673.0,
"step": 622
},
{
"entropy": 0.41705322265625,
"epoch": 2.4722222222222223,
"grad_norm": 0.8434082500134104,
"learning_rate": 8.350709778488941e-07,
"loss": 0.4014,
"mean_token_accuracy": 0.8600445203483105,
"num_tokens": 268044360.0,
"step": 623
},
{
"entropy": 0.41815185546875,
"epoch": 2.4761904761904763,
"grad_norm": 0.8019659782743609,
"learning_rate": 8.230062503836278e-07,
"loss": 0.3937,
"mean_token_accuracy": 0.8604294890537858,
"num_tokens": 268470812.0,
"step": 624
},
{
"entropy": 0.417449951171875,
"epoch": 2.4801587301587302,
"grad_norm": 0.8264347297639569,
"learning_rate": 8.110214905049802e-07,
"loss": 0.3965,
"mean_token_accuracy": 0.8575309114530683,
"num_tokens": 268895281.0,
"step": 625
},
{
"entropy": 0.414459228515625,
"epoch": 2.484126984126984,
"grad_norm": 0.7888506972306255,
"learning_rate": 7.991169276584281e-07,
"loss": 0.3807,
"mean_token_accuracy": 0.8645908059552312,
"num_tokens": 269329768.0,
"step": 626
},
{
"entropy": 0.41363525390625,
"epoch": 2.488095238095238,
"grad_norm": 0.8377743907107998,
"learning_rate": 7.872927897540944e-07,
"loss": 0.3948,
"mean_token_accuracy": 0.8611715780571103,
"num_tokens": 269763538.0,
"step": 627
},
{
"entropy": 0.419525146484375,
"epoch": 2.492063492063492,
"grad_norm": 0.7713110745405427,
"learning_rate": 7.75549303162384e-07,
"loss": 0.3945,
"mean_token_accuracy": 0.8595996387302876,
"num_tokens": 270192672.0,
"step": 628
},
{
"entropy": 0.418792724609375,
"epoch": 2.496031746031746,
"grad_norm": 0.8447629896166373,
"learning_rate": 7.638866927096555e-07,
"loss": 0.4074,
"mean_token_accuracy": 0.8587245307862759,
"num_tokens": 270633240.0,
"step": 629
},
{
"entropy": 0.419891357421875,
"epoch": 2.5,
"grad_norm": 0.7852993278058601,
"learning_rate": 7.523051816739074e-07,
"loss": 0.3859,
"mean_token_accuracy": 0.8630366576835513,
"num_tokens": 271053623.0,
"step": 630
},
{
"entropy": 0.41510009765625,
"epoch": 2.503968253968254,
"grad_norm": 0.8426473805113363,
"learning_rate": 7.408049917805104e-07,
"loss": 0.3881,
"mean_token_accuracy": 0.8630319554358721,
"num_tokens": 271492583.0,
"step": 631
},
{
"entropy": 0.41632080078125,
"epoch": 2.507936507936508,
"grad_norm": 0.8529237472508443,
"learning_rate": 7.293863431979619e-07,
"loss": 0.395,
"mean_token_accuracy": 0.861218343488872,
"num_tokens": 271921985.0,
"step": 632
},
{
"entropy": 0.42822265625,
"epoch": 2.511904761904762,
"grad_norm": 0.7740038021053262,
"learning_rate": 7.180494545336642e-07,
"loss": 0.3874,
"mean_token_accuracy": 0.8652349133044481,
"num_tokens": 272349367.0,
"step": 633
},
{
"entropy": 0.426361083984375,
"epoch": 2.515873015873016,
"grad_norm": 0.9109105967855416,
"learning_rate": 7.067945428297524e-07,
"loss": 0.3976,
"mean_token_accuracy": 0.8593434160575271,
"num_tokens": 272757706.0,
"step": 634
},
{
"entropy": 0.424713134765625,
"epoch": 2.5198412698412698,
"grad_norm": 0.8510388770912337,
"learning_rate": 6.956218235589263e-07,
"loss": 0.3872,
"mean_token_accuracy": 0.8625729326158762,
"num_tokens": 273178323.0,
"step": 635
},
{
"entropy": 0.420318603515625,
"epoch": 2.5238095238095237,
"grad_norm": 0.8277629227526272,
"learning_rate": 6.845315106203327e-07,
"loss": 0.3868,
"mean_token_accuracy": 0.8626482058316469,
"num_tokens": 273603268.0,
"step": 636
},
{
"entropy": 0.418853759765625,
"epoch": 2.5277777777777777,
"grad_norm": 0.8202191768752707,
"learning_rate": 6.735238163354669e-07,
"loss": 0.3847,
"mean_token_accuracy": 0.8641904015094042,
"num_tokens": 274036335.0,
"step": 637
},
{
"entropy": 0.418914794921875,
"epoch": 2.5317460317460316,
"grad_norm": 0.8647875520943077,
"learning_rate": 6.625989514441089e-07,
"loss": 0.3925,
"mean_token_accuracy": 0.8626054916530848,
"num_tokens": 274458735.0,
"step": 638
},
{
"entropy": 0.412353515625,
"epoch": 2.5357142857142856,
"grad_norm": 0.7982027347968378,
"learning_rate": 6.517571251002896e-07,
"loss": 0.393,
"mean_token_accuracy": 0.8614260852336884,
"num_tokens": 274909982.0,
"step": 639
},
{
"entropy": 0.42431640625,
"epoch": 2.5396825396825395,
"grad_norm": 0.8307645924294975,
"learning_rate": 6.40998544868287e-07,
"loss": 0.3889,
"mean_token_accuracy": 0.8601001044735312,
"num_tokens": 275320028.0,
"step": 640
},
{
"entropy": 0.417816162109375,
"epoch": 2.5436507936507935,
"grad_norm": 0.8430698509853944,
"learning_rate": 6.3032341671865e-07,
"loss": 0.386,
"mean_token_accuracy": 0.8654862614348531,
"num_tokens": 275726848.0,
"step": 641
},
{
"entropy": 0.413848876953125,
"epoch": 2.5476190476190474,
"grad_norm": 0.8421768209102014,
"learning_rate": 6.197319450242562e-07,
"loss": 0.3867,
"mean_token_accuracy": 0.8631602311506867,
"num_tokens": 276151262.0,
"step": 642
},
{
"entropy": 0.417266845703125,
"epoch": 2.5515873015873014,
"grad_norm": 0.8929748589387052,
"learning_rate": 6.092243325564007e-07,
"loss": 0.3924,
"mean_token_accuracy": 0.8615100616589189,
"num_tokens": 276568860.0,
"step": 643
},
{
"entropy": 0.41387939453125,
"epoch": 2.5555555555555554,
"grad_norm": 0.8040672513690313,
"learning_rate": 5.98800780480912e-07,
"loss": 0.3858,
"mean_token_accuracy": 0.8625959139317274,
"num_tokens": 276997327.0,
"step": 644
},
{
"entropy": 0.421234130859375,
"epoch": 2.5595238095238093,
"grad_norm": 0.7855164537605119,
"learning_rate": 5.884614883543027e-07,
"loss": 0.394,
"mean_token_accuracy": 0.8626839118078351,
"num_tokens": 277426196.0,
"step": 645
},
{
"entropy": 0.4188232421875,
"epoch": 2.5634920634920633,
"grad_norm": 0.7843681767955034,
"learning_rate": 5.782066541199471e-07,
"loss": 0.3946,
"mean_token_accuracy": 0.8629313539713621,
"num_tokens": 277849848.0,
"step": 646
},
{
"entropy": 0.412078857421875,
"epoch": 2.567460317460317,
"grad_norm": 0.8561623782562832,
"learning_rate": 5.680364741042926e-07,
"loss": 0.3811,
"mean_token_accuracy": 0.8668704703450203,
"num_tokens": 278289888.0,
"step": 647
},
{
"entropy": 0.414276123046875,
"epoch": 2.571428571428571,
"grad_norm": 0.8147935679041525,
"learning_rate": 5.579511430131018e-07,
"loss": 0.3872,
"mean_token_accuracy": 0.8630826137959957,
"num_tokens": 278726761.0,
"step": 648
},
{
"entropy": 0.418182373046875,
"epoch": 2.575396825396825,
"grad_norm": 0.796874369891308,
"learning_rate": 5.479508539277229e-07,
"loss": 0.3801,
"mean_token_accuracy": 0.8660026481375098,
"num_tokens": 279136759.0,
"step": 649
},
{
"entropy": 0.416351318359375,
"epoch": 2.5793650793650795,
"grad_norm": 0.8223574515325844,
"learning_rate": 5.380357983013962e-07,
"loss": 0.392,
"mean_token_accuracy": 0.8621972808614373,
"num_tokens": 279572082.0,
"step": 650
},
{
"entropy": 0.415252685546875,
"epoch": 2.5833333333333335,
"grad_norm": 0.8419256563918806,
"learning_rate": 5.282061659555854e-07,
"loss": 0.3957,
"mean_token_accuracy": 0.8606690457090735,
"num_tokens": 279994957.0,
"step": 651
},
{
"entropy": 0.415008544921875,
"epoch": 2.5873015873015874,
"grad_norm": 0.8001543694338792,
"learning_rate": 5.184621450763455e-07,
"loss": 0.3819,
"mean_token_accuracy": 0.8638613997027278,
"num_tokens": 280414468.0,
"step": 652
},
{
"entropy": 0.41876220703125,
"epoch": 2.5912698412698414,
"grad_norm": 0.8281488407232048,
"learning_rate": 5.088039222107205e-07,
"loss": 0.405,
"mean_token_accuracy": 0.8599689844995737,
"num_tokens": 280832145.0,
"step": 653
},
{
"entropy": 0.420440673828125,
"epoch": 2.5952380952380953,
"grad_norm": 0.8401133410984405,
"learning_rate": 4.992316822631693e-07,
"loss": 0.3815,
"mean_token_accuracy": 0.8656142996624112,
"num_tokens": 281237288.0,
"step": 654
},
{
"entropy": 0.412689208984375,
"epoch": 2.5992063492063493,
"grad_norm": 0.806223122436009,
"learning_rate": 4.897456084920282e-07,
"loss": 0.3862,
"mean_token_accuracy": 0.8658296698704362,
"num_tokens": 281692258.0,
"step": 655
},
{
"entropy": 0.416168212890625,
"epoch": 2.6031746031746033,
"grad_norm": 0.8396062477724346,
"learning_rate": 4.803458825060042e-07,
"loss": 0.3763,
"mean_token_accuracy": 0.8662013709545135,
"num_tokens": 282118057.0,
"step": 656
},
{
"entropy": 0.412261962890625,
"epoch": 2.607142857142857,
"grad_norm": 0.825509139511018,
"learning_rate": 4.710326842606927e-07,
"loss": 0.3987,
"mean_token_accuracy": 0.8584959087893367,
"num_tokens": 282582066.0,
"step": 657
},
{
"entropy": 0.40606689453125,
"epoch": 2.611111111111111,
"grad_norm": 1.080095799468803,
"learning_rate": 4.618061920551381e-07,
"loss": 0.3936,
"mean_token_accuracy": 0.8631810490041971,
"num_tokens": 283028330.0,
"step": 658
},
{
"entropy": 0.42547607421875,
"epoch": 2.615079365079365,
"grad_norm": 0.8441240019764062,
"learning_rate": 4.526665825284132e-07,
"loss": 0.3936,
"mean_token_accuracy": 0.8619779404252768,
"num_tokens": 283436768.0,
"step": 659
},
{
"entropy": 0.41748046875,
"epoch": 2.619047619047619,
"grad_norm": 0.8263929571280181,
"learning_rate": 4.4361403065624475e-07,
"loss": 0.3864,
"mean_token_accuracy": 0.8627992533147335,
"num_tokens": 283866607.0,
"step": 660
},
{
"entropy": 0.4234619140625,
"epoch": 2.623015873015873,
"grad_norm": 0.844367472303199,
"learning_rate": 4.3464870974766314e-07,
"loss": 0.4004,
"mean_token_accuracy": 0.8607617728412151,
"num_tokens": 284281791.0,
"step": 661
},
{
"entropy": 0.419158935546875,
"epoch": 2.626984126984127,
"grad_norm": 0.8571993055017914,
"learning_rate": 4.257707914416781e-07,
"loss": 0.3874,
"mean_token_accuracy": 0.8635092154145241,
"num_tokens": 284705319.0,
"step": 662
},
{
"entropy": 0.417938232421875,
"epoch": 2.630952380952381,
"grad_norm": 0.7780232105885654,
"learning_rate": 4.169804457039972e-07,
"loss": 0.4086,
"mean_token_accuracy": 0.8589063184335828,
"num_tokens": 285154313.0,
"step": 663
},
{
"entropy": 0.413238525390625,
"epoch": 2.634920634920635,
"grad_norm": 0.850893830182736,
"learning_rate": 4.082778408237731e-07,
"loss": 0.4007,
"mean_token_accuracy": 0.8592528942972422,
"num_tokens": 285598883.0,
"step": 664
},
{
"entropy": 0.418487548828125,
"epoch": 2.638888888888889,
"grad_norm": 1.1283744707185912,
"learning_rate": 3.996631434103776e-07,
"loss": 0.3977,
"mean_token_accuracy": 0.860667590983212,
"num_tokens": 286037660.0,
"step": 665
},
{
"entropy": 0.416961669921875,
"epoch": 2.642857142857143,
"grad_norm": 0.8944770514716363,
"learning_rate": 3.911365183902166e-07,
"loss": 0.3898,
"mean_token_accuracy": 0.8620567666366696,
"num_tokens": 286461446.0,
"step": 666
},
{
"entropy": 0.419219970703125,
"epoch": 2.6468253968253967,
"grad_norm": 0.845344585577405,
"learning_rate": 3.826981290035692e-07,
"loss": 0.3898,
"mean_token_accuracy": 0.860666748136282,
"num_tokens": 286877023.0,
"step": 667
},
{
"entropy": 0.422149658203125,
"epoch": 2.6507936507936507,
"grad_norm": 0.8457306735031688,
"learning_rate": 3.7434813680146234e-07,
"loss": 0.3895,
"mean_token_accuracy": 0.8613977544009686,
"num_tokens": 287308399.0,
"step": 668
},
{
"entropy": 0.412872314453125,
"epoch": 2.6547619047619047,
"grad_norm": 0.7957237567868245,
"learning_rate": 3.6608670164258065e-07,
"loss": 0.3906,
"mean_token_accuracy": 0.8631431749090552,
"num_tokens": 287728804.0,
"step": 669
},
{
"entropy": 0.411468505859375,
"epoch": 2.6587301587301586,
"grad_norm": 0.7621184623535802,
"learning_rate": 3.5791398169020384e-07,
"loss": 0.393,
"mean_token_accuracy": 0.8615291966125369,
"num_tokens": 288187832.0,
"step": 670
},
{
"entropy": 0.417144775390625,
"epoch": 2.6626984126984126,
"grad_norm": 0.8055399962635597,
"learning_rate": 3.4983013340918024e-07,
"loss": 0.3834,
"mean_token_accuracy": 0.8645481085404754,
"num_tokens": 288600411.0,
"step": 671
},
{
"entropy": 0.410888671875,
"epoch": 2.6666666666666665,
"grad_norm": 0.8440468660994543,
"learning_rate": 3.4183531156292913e-07,
"loss": 0.394,
"mean_token_accuracy": 0.8628778494894505,
"num_tokens": 289047051.0,
"step": 672
},
{
"entropy": 0.417388916015625,
"epoch": 2.6706349206349205,
"grad_norm": 0.8448761260472664,
"learning_rate": 3.3392966921047984e-07,
"loss": 0.3932,
"mean_token_accuracy": 0.8621304808184505,
"num_tokens": 289478039.0,
"step": 673
},
{
"entropy": 0.4195556640625,
"epoch": 2.674603174603175,
"grad_norm": 0.826243737430181,
"learning_rate": 3.261133577035408e-07,
"loss": 0.3992,
"mean_token_accuracy": 0.8631375981494784,
"num_tokens": 289920851.0,
"step": 674
},
{
"entropy": 0.41644287109375,
"epoch": 2.678571428571429,
"grad_norm": 0.7480379642047055,
"learning_rate": 3.1838652668360173e-07,
"loss": 0.3834,
"mean_token_accuracy": 0.8634974956512451,
"num_tokens": 290351325.0,
"step": 675
},
{
"entropy": 0.4146728515625,
"epoch": 2.682539682539683,
"grad_norm": 0.7830053460618754,
"learning_rate": 3.1074932407906823e-07,
"loss": 0.3785,
"mean_token_accuracy": 0.8657077318057418,
"num_tokens": 290766931.0,
"step": 676
},
{
"entropy": 0.423828125,
"epoch": 2.6865079365079367,
"grad_norm": 0.7864820739930504,
"learning_rate": 3.0320189610243303e-07,
"loss": 0.3935,
"mean_token_accuracy": 0.8595830434933305,
"num_tokens": 291185306.0,
"step": 677
},
{
"entropy": 0.422698974609375,
"epoch": 2.6904761904761907,
"grad_norm": 0.7974086017120517,
"learning_rate": 2.957443872474713e-07,
"loss": 0.3873,
"mean_token_accuracy": 0.8635625531896949,
"num_tokens": 291599836.0,
"step": 678
},
{
"entropy": 0.4146728515625,
"epoch": 2.6944444444444446,
"grad_norm": 0.9412910487910857,
"learning_rate": 2.883769402864789e-07,
"loss": 0.4001,
"mean_token_accuracy": 0.8598026670515537,
"num_tokens": 292026507.0,
"step": 679
},
{
"entropy": 0.41259765625,
"epoch": 2.6984126984126986,
"grad_norm": 0.763447905049642,
"learning_rate": 2.810996962675361e-07,
"loss": 0.3903,
"mean_token_accuracy": 0.8622291041538119,
"num_tokens": 292454972.0,
"step": 680
},
{
"entropy": 0.419525146484375,
"epoch": 2.7023809523809526,
"grad_norm": 0.7897795759262028,
"learning_rate": 2.739127945118092e-07,
"loss": 0.3983,
"mean_token_accuracy": 0.8589327791705728,
"num_tokens": 292885705.0,
"step": 681
},
{
"entropy": 0.42181396484375,
"epoch": 2.7063492063492065,
"grad_norm": 0.7799046098775175,
"learning_rate": 2.668163726108841e-07,
"loss": 0.3786,
"mean_token_accuracy": 0.8630655352026224,
"num_tokens": 293307675.0,
"step": 682
},
{
"entropy": 0.418853759765625,
"epoch": 2.7103174603174605,
"grad_norm": 0.8041790844592746,
"learning_rate": 2.5981056642412796e-07,
"loss": 0.3934,
"mean_token_accuracy": 0.8626653142273426,
"num_tokens": 293722148.0,
"step": 683
},
{
"entropy": 0.41864013671875,
"epoch": 2.7142857142857144,
"grad_norm": 0.8076107515114371,
"learning_rate": 2.528955100760938e-07,
"loss": 0.3858,
"mean_token_accuracy": 0.863671412691474,
"num_tokens": 294149752.0,
"step": 684
},
{
"entropy": 0.422821044921875,
"epoch": 2.7182539682539684,
"grad_norm": 0.7969449363252592,
"learning_rate": 2.460713359539474e-07,
"loss": 0.3801,
"mean_token_accuracy": 0.8654317120090127,
"num_tokens": 294555288.0,
"step": 685
},
{
"entropy": 0.419830322265625,
"epoch": 2.7222222222222223,
"grad_norm": 0.8753720898977475,
"learning_rate": 2.3933817470493445e-07,
"loss": 0.3767,
"mean_token_accuracy": 0.866040863096714,
"num_tokens": 294975614.0,
"step": 686
},
{
"entropy": 0.412994384765625,
"epoch": 2.7261904761904763,
"grad_norm": 0.8199718663106975,
"learning_rate": 2.3269615523388355e-07,
"loss": 0.3918,
"mean_token_accuracy": 0.860607554204762,
"num_tokens": 295422071.0,
"step": 687
},
{
"entropy": 0.41558837890625,
"epoch": 2.7301587301587302,
"grad_norm": 0.79423481793127,
"learning_rate": 2.2614540470073276e-07,
"loss": 0.3866,
"mean_token_accuracy": 0.8644118411466479,
"num_tokens": 295846874.0,
"step": 688
},
{
"entropy": 0.41741943359375,
"epoch": 2.734126984126984,
"grad_norm": 0.8822693061766851,
"learning_rate": 2.1968604851809738e-07,
"loss": 0.3866,
"mean_token_accuracy": 0.8631517272442579,
"num_tokens": 296288822.0,
"step": 689
},
{
"entropy": 0.410980224609375,
"epoch": 2.738095238095238,
"grad_norm": 0.7846498188049137,
"learning_rate": 2.1331821034886846e-07,
"loss": 0.3943,
"mean_token_accuracy": 0.8625091454014182,
"num_tokens": 296730922.0,
"step": 690
},
{
"entropy": 0.41143798828125,
"epoch": 2.742063492063492,
"grad_norm": 0.8213197375371964,
"learning_rate": 2.0704201210384634e-07,
"loss": 0.3904,
"mean_token_accuracy": 0.864051777869463,
"num_tokens": 297169723.0,
"step": 691
},
{
"entropy": 0.414337158203125,
"epoch": 2.746031746031746,
"grad_norm": 1.2967707502021062,
"learning_rate": 2.0085757393940586e-07,
"loss": 0.3772,
"mean_token_accuracy": 0.8671101154759526,
"num_tokens": 297610941.0,
"step": 692
},
{
"entropy": 0.41839599609375,
"epoch": 2.75,
"grad_norm": 0.8133921789474006,
"learning_rate": 1.9476501425519656e-07,
"loss": 0.3833,
"mean_token_accuracy": 0.860652013681829,
"num_tokens": 298044879.0,
"step": 693
},
{
"entropy": 0.416778564453125,
"epoch": 2.753968253968254,
"grad_norm": 0.7897145964186679,
"learning_rate": 1.8876444969187557e-07,
"loss": 0.3857,
"mean_token_accuracy": 0.8620300153270364,
"num_tokens": 298464464.0,
"step": 694
},
{
"entropy": 0.40887451171875,
"epoch": 2.757936507936508,
"grad_norm": 0.9000724609100704,
"learning_rate": 1.828559951288733e-07,
"loss": 0.3831,
"mean_token_accuracy": 0.8644989216700196,
"num_tokens": 298903233.0,
"step": 695
},
{
"entropy": 0.41595458984375,
"epoch": 2.761904761904762,
"grad_norm": 0.7955961913020823,
"learning_rate": 1.7703976368219633e-07,
"loss": 0.3797,
"mean_token_accuracy": 0.8666205117478967,
"num_tokens": 299315956.0,
"step": 696
},
{
"entropy": 0.42193603515625,
"epoch": 2.765873015873016,
"grad_norm": 0.8878478545913352,
"learning_rate": 1.713158667022613e-07,
"loss": 0.3812,
"mean_token_accuracy": 0.8661440145224333,
"num_tokens": 299732237.0,
"step": 697
},
{
"entropy": 0.4156494140625,
"epoch": 2.7698412698412698,
"grad_norm": 0.931222693724496,
"learning_rate": 1.656844137717617e-07,
"loss": 0.3924,
"mean_token_accuracy": 0.8617311324924231,
"num_tokens": 300162540.0,
"step": 698
},
{
"entropy": 0.42425537109375,
"epoch": 2.7738095238095237,
"grad_norm": 0.8389789359858054,
"learning_rate": 1.601455127035717e-07,
"loss": 0.3901,
"mean_token_accuracy": 0.8636501645669341,
"num_tokens": 300580263.0,
"step": 699
},
{
"entropy": 0.41375732421875,
"epoch": 2.7777777777777777,
"grad_norm": 0.8084246826822826,
"learning_rate": 1.5469926953868063e-07,
"loss": 0.3786,
"mean_token_accuracy": 0.8661916004493833,
"num_tokens": 301009855.0,
"step": 700
},
{
"entropy": 0.421539306640625,
"epoch": 2.7817460317460316,
"grad_norm": 0.7920810255445628,
"learning_rate": 1.4934578854416403e-07,
"loss": 0.3793,
"mean_token_accuracy": 0.8652064045891166,
"num_tokens": 301429836.0,
"step": 701
},
{
"entropy": 0.417236328125,
"epoch": 2.7857142857142856,
"grad_norm": 0.7557065694262733,
"learning_rate": 1.440851722111858e-07,
"loss": 0.3775,
"mean_token_accuracy": 0.8666085209697485,
"num_tokens": 301852351.0,
"step": 702
},
{
"entropy": 0.418304443359375,
"epoch": 2.7896825396825395,
"grad_norm": 0.8074299266071946,
"learning_rate": 1.389175212530397e-07,
"loss": 0.3787,
"mean_token_accuracy": 0.8652766114100814,
"num_tokens": 302270241.0,
"step": 703
},
{
"entropy": 0.41082763671875,
"epoch": 2.7936507936507935,
"grad_norm": 0.8030020009685229,
"learning_rate": 1.3384293460321662e-07,
"loss": 0.3838,
"mean_token_accuracy": 0.8642606223002076,
"num_tokens": 302702189.0,
"step": 704
},
{
"entropy": 0.416748046875,
"epoch": 2.7976190476190474,
"grad_norm": 0.7647096183455645,
"learning_rate": 1.2886150941351317e-07,
"loss": 0.3778,
"mean_token_accuracy": 0.866962157189846,
"num_tokens": 303138996.0,
"step": 705
},
{
"entropy": 0.422882080078125,
"epoch": 2.8015873015873014,
"grad_norm": 0.8421868959580608,
"learning_rate": 1.2397334105217097e-07,
"loss": 0.3868,
"mean_token_accuracy": 0.8634527139365673,
"num_tokens": 303546117.0,
"step": 706
},
{
"entropy": 0.4114990234375,
"epoch": 2.8055555555555554,
"grad_norm": 0.7836818266413413,
"learning_rate": 1.1917852310205147e-07,
"loss": 0.3866,
"mean_token_accuracy": 0.8666229834780097,
"num_tokens": 303985054.0,
"step": 707
},
{
"entropy": 0.413970947265625,
"epoch": 2.8095238095238093,
"grad_norm": 0.7941927914203842,
"learning_rate": 1.1447714735884463e-07,
"loss": 0.3854,
"mean_token_accuracy": 0.8626468563452363,
"num_tokens": 304424136.0,
"step": 708
},
{
"entropy": 0.41229248046875,
"epoch": 2.8134920634920633,
"grad_norm": 0.8361515314493303,
"learning_rate": 1.0986930382930916e-07,
"loss": 0.3881,
"mean_token_accuracy": 0.8630633186548948,
"num_tokens": 304862181.0,
"step": 709
},
{
"entropy": 0.415924072265625,
"epoch": 2.817460317460317,
"grad_norm": 0.8355933592816446,
"learning_rate": 1.0535508072955225e-07,
"loss": 0.3969,
"mean_token_accuracy": 0.8627164475619793,
"num_tokens": 305299176.0,
"step": 710
},
{
"entropy": 0.412994384765625,
"epoch": 2.821428571428571,
"grad_norm": 0.7563718151503291,
"learning_rate": 1.0093456448333872e-07,
"loss": 0.3888,
"mean_token_accuracy": 0.8606778532266617,
"num_tokens": 305755604.0,
"step": 711
},
{
"entropy": 0.414825439453125,
"epoch": 2.825396825396825,
"grad_norm": 0.7951191207755064,
"learning_rate": 9.660783972043786e-08,
"loss": 0.3833,
"mean_token_accuracy": 0.862731215544045,
"num_tokens": 306180918.0,
"step": 712
},
{
"entropy": 0.416290283203125,
"epoch": 2.8293650793650795,
"grad_norm": 0.8170483155607833,
"learning_rate": 9.237498927500088e-08,
"loss": 0.3962,
"mean_token_accuracy": 0.861549130640924,
"num_tokens": 306601315.0,
"step": 713
},
{
"entropy": 0.413970947265625,
"epoch": 2.8333333333333335,
"grad_norm": 0.7633085069328812,
"learning_rate": 8.823609418397939e-08,
"loss": 0.3903,
"mean_token_accuracy": 0.861748369410634,
"num_tokens": 307053855.0,
"step": 714
},
{
"entropy": 0.4200439453125,
"epoch": 2.8373015873015874,
"grad_norm": 0.84331828046859,
"learning_rate": 8.419123368556991e-08,
"loss": 0.3889,
"mean_token_accuracy": 0.8638924788683653,
"num_tokens": 307466527.0,
"step": 715
},
{
"entropy": 0.417083740234375,
"epoch": 2.8412698412698414,
"grad_norm": 1.136155627056708,
"learning_rate": 8.024048521769745e-08,
"loss": 0.393,
"mean_token_accuracy": 0.8619293784722686,
"num_tokens": 307908233.0,
"step": 716
},
{
"entropy": 0.41851806640625,
"epoch": 2.8452380952380953,
"grad_norm": 0.8624286819149608,
"learning_rate": 7.638392441653542e-08,
"loss": 0.3815,
"mean_token_accuracy": 0.8658701097592711,
"num_tokens": 308331740.0,
"step": 717
},
{
"entropy": 0.417083740234375,
"epoch": 2.8492063492063493,
"grad_norm": 0.7877113344572971,
"learning_rate": 7.262162511505466e-08,
"loss": 0.3766,
"mean_token_accuracy": 0.8655453082174063,
"num_tokens": 308765267.0,
"step": 718
},
{
"entropy": 0.419647216796875,
"epoch": 2.8531746031746033,
"grad_norm": 0.8003863832330088,
"learning_rate": 6.895365934161236e-08,
"loss": 0.3811,
"mean_token_accuracy": 0.8642518576234579,
"num_tokens": 309177878.0,
"step": 719
},
{
"entropy": 0.427581787109375,
"epoch": 2.857142857142857,
"grad_norm": 0.7682606770932064,
"learning_rate": 6.538009731857087e-08,
"loss": 0.3912,
"mean_token_accuracy": 0.8608730277046561,
"num_tokens": 309586897.0,
"step": 720
},
{
"entropy": 0.41351318359375,
"epoch": 2.861111111111111,
"grad_norm": 0.7503328469812236,
"learning_rate": 6.190100746095495e-08,
"loss": 0.3831,
"mean_token_accuracy": 0.8634521188214421,
"num_tokens": 310011953.0,
"step": 721
},
{
"entropy": 0.416259765625,
"epoch": 2.865079365079365,
"grad_norm": 0.7649905566318169,
"learning_rate": 5.851645637514114e-08,
"loss": 0.3851,
"mean_token_accuracy": 0.8632787046954036,
"num_tokens": 310440896.0,
"step": 722
},
{
"entropy": 0.417572021484375,
"epoch": 2.869047619047619,
"grad_norm": 0.9526446373854853,
"learning_rate": 5.522650885758374e-08,
"loss": 0.3874,
"mean_token_accuracy": 0.8621506663039327,
"num_tokens": 310867155.0,
"step": 723
},
{
"entropy": 0.414215087890625,
"epoch": 2.873015873015873,
"grad_norm": 0.8196077414243489,
"learning_rate": 5.203122789357307e-08,
"loss": 0.3768,
"mean_token_accuracy": 0.8689231360331178,
"num_tokens": 311297562.0,
"step": 724
},
{
"entropy": 0.41143798828125,
"epoch": 2.876984126984127,
"grad_norm": 0.8133318950674621,
"learning_rate": 4.893067465602863e-08,
"loss": 0.397,
"mean_token_accuracy": 0.8604372851550579,
"num_tokens": 311750163.0,
"step": 725
},
{
"entropy": 0.413116455078125,
"epoch": 2.880952380952381,
"grad_norm": 0.7955536166222298,
"learning_rate": 4.5924908504331735e-08,
"loss": 0.3949,
"mean_token_accuracy": 0.8633811613544822,
"num_tokens": 312189513.0,
"step": 726
},
{
"entropy": 0.41363525390625,
"epoch": 2.884920634920635,
"grad_norm": 0.7572897725302501,
"learning_rate": 4.3013986983184705e-08,
"loss": 0.3854,
"mean_token_accuracy": 0.8645169893279672,
"num_tokens": 312626002.0,
"step": 727
},
{
"entropy": 0.41668701171875,
"epoch": 2.888888888888889,
"grad_norm": 0.7716288372854705,
"learning_rate": 4.019796582151181e-08,
"loss": 0.3876,
"mean_token_accuracy": 0.862118998542428,
"num_tokens": 313055977.0,
"step": 728
},
{
"entropy": 0.415802001953125,
"epoch": 2.892857142857143,
"grad_norm": 0.7713232614880875,
"learning_rate": 3.747689893139228e-08,
"loss": 0.3854,
"mean_token_accuracy": 0.8622197173535824,
"num_tokens": 313491837.0,
"step": 729
},
{
"entropy": 0.416229248046875,
"epoch": 2.8968253968253967,
"grad_norm": 0.7395778636476764,
"learning_rate": 3.4850838407027297e-08,
"loss": 0.3979,
"mean_token_accuracy": 0.8619843171909451,
"num_tokens": 313934488.0,
"step": 730
},
{
"entropy": 0.418701171875,
"epoch": 2.9007936507936507,
"grad_norm": 0.7612522506605197,
"learning_rate": 3.2319834523742435e-08,
"loss": 0.383,
"mean_token_accuracy": 0.8652700930833817,
"num_tokens": 314354323.0,
"step": 731
},
{
"entropy": 0.414703369140625,
"epoch": 2.9047619047619047,
"grad_norm": 0.7974226574359506,
"learning_rate": 2.988393573702675e-08,
"loss": 0.3946,
"mean_token_accuracy": 0.8615053938701749,
"num_tokens": 314782888.0,
"step": 732
},
{
"entropy": 0.41558837890625,
"epoch": 2.9087301587301586,
"grad_norm": 0.8043577847187172,
"learning_rate": 2.754318868160244e-08,
"loss": 0.3836,
"mean_token_accuracy": 0.8651288328692317,
"num_tokens": 315218545.0,
"step": 733
},
{
"entropy": 0.409698486328125,
"epoch": 2.9126984126984126,
"grad_norm": 0.7532207153914081,
"learning_rate": 2.5297638170535542e-08,
"loss": 0.3768,
"mean_token_accuracy": 0.8678219076246023,
"num_tokens": 315667111.0,
"step": 734
},
{
"entropy": 0.423553466796875,
"epoch": 2.9166666666666665,
"grad_norm": 0.836022190361025,
"learning_rate": 2.31473271943744e-08,
"loss": 0.3848,
"mean_token_accuracy": 0.8609532006084919,
"num_tokens": 316078411.0,
"step": 735
},
{
"entropy": 0.4132080078125,
"epoch": 2.9206349206349205,
"grad_norm": 0.7845153141259297,
"learning_rate": 2.109229692032977e-08,
"loss": 0.3894,
"mean_token_accuracy": 0.8628736371174455,
"num_tokens": 316519645.0,
"step": 736
},
{
"entropy": 0.418060302734375,
"epoch": 2.924603174603175,
"grad_norm": 0.7817679617183846,
"learning_rate": 1.9132586691484323e-08,
"loss": 0.3889,
"mean_token_accuracy": 0.8628808334469795,
"num_tokens": 316948710.0,
"step": 737
},
{
"entropy": 0.417327880859375,
"epoch": 2.928571428571429,
"grad_norm": 0.7903748369093285,
"learning_rate": 1.7268234026041053e-08,
"loss": 0.3836,
"mean_token_accuracy": 0.8664052626118064,
"num_tokens": 317376914.0,
"step": 738
},
{
"entropy": 0.416656494140625,
"epoch": 2.932539682539683,
"grad_norm": 0.8013586144607203,
"learning_rate": 1.5499274616602723e-08,
"loss": 0.3819,
"mean_token_accuracy": 0.8635408999398351,
"num_tokens": 317783718.0,
"step": 739
},
{
"entropy": 0.4195556640625,
"epoch": 2.9365079365079367,
"grad_norm": 1.0799029570111842,
"learning_rate": 1.3825742329492408e-08,
"loss": 0.3976,
"mean_token_accuracy": 0.8611190365627408,
"num_tokens": 318210944.0,
"step": 740
},
{
"entropy": 0.41424560546875,
"epoch": 2.9404761904761907,
"grad_norm": 0.8046949464266854,
"learning_rate": 1.2247669204100699e-08,
"loss": 0.3972,
"mean_token_accuracy": 0.8586803553625941,
"num_tokens": 318636758.0,
"step": 741
},
{
"entropy": 0.412384033203125,
"epoch": 2.9444444444444446,
"grad_norm": 0.7492264758448205,
"learning_rate": 1.0765085452275614e-08,
"loss": 0.381,
"mean_token_accuracy": 0.8640262456610799,
"num_tokens": 319072977.0,
"step": 742
},
{
"entropy": 0.41888427734375,
"epoch": 2.9484126984126986,
"grad_norm": 0.8576996327472883,
"learning_rate": 9.378019457743082e-09,
"loss": 0.3825,
"mean_token_accuracy": 0.8635032856836915,
"num_tokens": 319491098.0,
"step": 743
},
{
"entropy": 0.41156005859375,
"epoch": 2.9523809523809526,
"grad_norm": 0.8654625317173439,
"learning_rate": 8.086497775562918e-09,
"loss": 0.3974,
"mean_token_accuracy": 0.8603286230936646,
"num_tokens": 319936836.0,
"step": 744
},
{
"entropy": 0.4107666015625,
"epoch": 2.9563492063492065,
"grad_norm": 0.8355009320099618,
"learning_rate": 6.890545131621462e-09,
"loss": 0.3898,
"mean_token_accuracy": 0.8626588368788362,
"num_tokens": 320379711.0,
"step": 745
},
{
"entropy": 0.415008544921875,
"epoch": 2.9603174603174605,
"grad_norm": 0.769717293692382,
"learning_rate": 5.790184422158063e-09,
"loss": 0.3848,
"mean_token_accuracy": 0.8654471961781383,
"num_tokens": 320807541.0,
"step": 746
},
{
"entropy": 0.41558837890625,
"epoch": 2.9642857142857144,
"grad_norm": 1.2627956434015182,
"learning_rate": 4.785436713324876e-09,
"loss": 0.3896,
"mean_token_accuracy": 0.8639781204983592,
"num_tokens": 321249293.0,
"step": 747
},
{
"entropy": 0.417449951171875,
"epoch": 2.9682539682539684,
"grad_norm": 0.7877216343272654,
"learning_rate": 3.876321240786629e-09,
"loss": 0.385,
"mean_token_accuracy": 0.8626599637791514,
"num_tokens": 321676330.0,
"step": 748
},
{
"entropy": 0.41473388671875,
"epoch": 2.9722222222222223,
"grad_norm": 0.8047350818501147,
"learning_rate": 3.062855409350918e-09,
"loss": 0.3786,
"mean_token_accuracy": 0.8666502619162202,
"num_tokens": 322088977.0,
"step": 749
},
{
"entropy": 0.413360595703125,
"epoch": 2.9761904761904763,
"grad_norm": 0.8214751044608514,
"learning_rate": 2.345054792634027e-09,
"loss": 0.3863,
"mean_token_accuracy": 0.8647614009678364,
"num_tokens": 322526195.0,
"step": 750
},
{
"entropy": 0.42340087890625,
"epoch": 2.9801587301587302,
"grad_norm": 0.7827821829891451,
"learning_rate": 1.7229331327633935e-09,
"loss": 0.3884,
"mean_token_accuracy": 0.8638850962743163,
"num_tokens": 322944491.0,
"step": 751
},
{
"entropy": 0.415924072265625,
"epoch": 2.984126984126984,
"grad_norm": 0.8044240226452957,
"learning_rate": 1.1965023401161457e-09,
"loss": 0.3955,
"mean_token_accuracy": 0.860909391194582,
"num_tokens": 323373321.0,
"step": 752
},
{
"entropy": 0.415069580078125,
"epoch": 2.988095238095238,
"grad_norm": 0.7711137719538177,
"learning_rate": 7.657724930887344e-10,
"loss": 0.3878,
"mean_token_accuracy": 0.8620332898572087,
"num_tokens": 323803097.0,
"step": 753
},
{
"entropy": 0.413604736328125,
"epoch": 2.992063492063492,
"grad_norm": 0.7417880663145604,
"learning_rate": 4.3075183790541875e-10,
"loss": 0.3781,
"mean_token_accuracy": 0.865508021786809,
"num_tokens": 324241487.0,
"step": 754
},
{
"entropy": 0.412139892578125,
"epoch": 2.996031746031746,
"grad_norm": 0.8400409360541771,
"learning_rate": 1.9144678845950393e-10,
"loss": 0.3963,
"mean_token_accuracy": 0.8602159256115556,
"num_tokens": 324694260.0,
"step": 755
},
{
"entropy": 0.41937255859375,
"epoch": 3.0,
"grad_norm": 0.8242408910754981,
"learning_rate": 4.786192619121721e-11,
"loss": 0.3878,
"mean_token_accuracy": 0.863866476342082,
"num_tokens": 325114310.0,
"step": 756
},
{
"epoch": 3.0,
"step": 756,
"total_flos": 601237772369920.0,
"train_loss": 0.4835385761011845,
"train_runtime": 57894.1544,
"train_samples_per_second": 1.272,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 756,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 63,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 601237772369920.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}