{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 756, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5635986328125, "epoch": 0.003968253968253968, "grad_norm": 5.862754983476997, "learning_rate": 0.0, "loss": 1.3929, "mean_token_accuracy": 0.6520986258983612, "num_tokens": 436822.0, "step": 1 }, { "entropy": 0.571868896484375, "epoch": 0.007936507936507936, "grad_norm": 5.942989842192001, "learning_rate": 2.6315789473684213e-07, "loss": 1.3984, "mean_token_accuracy": 0.6573778251186013, "num_tokens": 849869.0, "step": 2 }, { "entropy": 0.571258544921875, "epoch": 0.011904761904761904, "grad_norm": 6.016113817261652, "learning_rate": 5.263157894736843e-07, "loss": 1.4022, "mean_token_accuracy": 0.6534338416531682, "num_tokens": 1257883.0, "step": 3 }, { "entropy": 0.567626953125, "epoch": 0.015873015873015872, "grad_norm": 5.755030134936764, "learning_rate": 7.894736842105263e-07, "loss": 1.3977, "mean_token_accuracy": 0.650267880409956, "num_tokens": 1710146.0, "step": 4 }, { "entropy": 0.563079833984375, "epoch": 0.01984126984126984, "grad_norm": 5.759147918749323, "learning_rate": 1.0526315789473685e-06, "loss": 1.38, "mean_token_accuracy": 0.6574590215459466, "num_tokens": 2138902.0, "step": 5 }, { "entropy": 0.5838623046875, "epoch": 0.023809523809523808, "grad_norm": 5.5950056421057885, "learning_rate": 1.3157894736842106e-06, "loss": 1.3755, "mean_token_accuracy": 0.6586260544136167, "num_tokens": 2560005.0, "step": 6 }, { "entropy": 0.5576171875, "epoch": 0.027777777777777776, "grad_norm": 5.621048765741401, "learning_rate": 1.5789473684210526e-06, "loss": 1.3926, "mean_token_accuracy": 0.6518173962831497, "num_tokens": 3004121.0, "step": 7 }, { "entropy": 0.572418212890625, "epoch": 0.031746031746031744, "grad_norm": 5.175193000260237, "learning_rate": 1.8421052631578948e-06, "loss": 1.3638, "mean_token_accuracy": 0.6598256900906563, "num_tokens": 3457966.0, "step": 8 }, { "entropy": 0.565643310546875, "epoch": 0.03571428571428571, "grad_norm": 5.286230249900356, "learning_rate": 2.105263157894737e-06, "loss": 1.3632, "mean_token_accuracy": 0.660512862727046, "num_tokens": 3902759.0, "step": 9 }, { "entropy": 0.583984375, "epoch": 0.03968253968253968, "grad_norm": 4.61299440604949, "learning_rate": 2.368421052631579e-06, "loss": 1.3222, "mean_token_accuracy": 0.6607110062614083, "num_tokens": 4321827.0, "step": 10 }, { "entropy": 0.56304931640625, "epoch": 0.04365079365079365, "grad_norm": 4.311573869738654, "learning_rate": 2.631578947368421e-06, "loss": 1.2723, "mean_token_accuracy": 0.6778731746599078, "num_tokens": 4748195.0, "step": 11 }, { "entropy": 0.566680908203125, "epoch": 0.047619047619047616, "grad_norm": 4.260816735639912, "learning_rate": 2.8947368421052634e-06, "loss": 1.2866, "mean_token_accuracy": 0.6733056111261249, "num_tokens": 5188122.0, "step": 12 }, { "entropy": 0.56494140625, "epoch": 0.051587301587301584, "grad_norm": 3.50291785633804, "learning_rate": 3.157894736842105e-06, "loss": 1.1797, "mean_token_accuracy": 0.6891454020515084, "num_tokens": 5615040.0, "step": 13 }, { "entropy": 0.562591552734375, "epoch": 0.05555555555555555, "grad_norm": 3.4924438085560223, "learning_rate": 3.421052631578948e-06, "loss": 1.1726, "mean_token_accuracy": 0.6897318931296468, "num_tokens": 6042413.0, "step": 14 }, { "entropy": 0.5677490234375, "epoch": 0.05952380952380952, "grad_norm": 3.1328341857078574, "learning_rate": 3.6842105263157896e-06, "loss": 1.1382, "mean_token_accuracy": 0.6958816023543477, "num_tokens": 6468019.0, "step": 15 }, { "entropy": 0.56500244140625, "epoch": 0.06349206349206349, "grad_norm": 3.1264058914998327, "learning_rate": 3.947368421052632e-06, "loss": 1.1368, "mean_token_accuracy": 0.6940081315115094, "num_tokens": 6898441.0, "step": 16 }, { "entropy": 0.537628173828125, "epoch": 0.06746031746031746, "grad_norm": 3.1935735446807105, "learning_rate": 4.210526315789474e-06, "loss": 1.042, "mean_token_accuracy": 0.7142375819385052, "num_tokens": 7333054.0, "step": 17 }, { "entropy": 0.51806640625, "epoch": 0.07142857142857142, "grad_norm": 3.9117878805623816, "learning_rate": 4.473684210526316e-06, "loss": 1.0019, "mean_token_accuracy": 0.72404795140028, "num_tokens": 7794638.0, "step": 18 }, { "entropy": 0.5318603515625, "epoch": 0.07539682539682539, "grad_norm": 4.074921603012236, "learning_rate": 4.736842105263158e-06, "loss": 1.0057, "mean_token_accuracy": 0.7181824343279004, "num_tokens": 8237624.0, "step": 19 }, { "entropy": 0.536865234375, "epoch": 0.07936507936507936, "grad_norm": 3.555487851257003, "learning_rate": 5e-06, "loss": 0.9821, "mean_token_accuracy": 0.723413173109293, "num_tokens": 8673402.0, "step": 20 }, { "entropy": 0.530609130859375, "epoch": 0.08333333333333333, "grad_norm": 2.955594263842942, "learning_rate": 5.263157894736842e-06, "loss": 0.9539, "mean_token_accuracy": 0.7331287879496813, "num_tokens": 9121387.0, "step": 21 }, { "entropy": 0.548980712890625, "epoch": 0.0873015873015873, "grad_norm": 2.658685672377372, "learning_rate": 5.526315789473685e-06, "loss": 0.9027, "mean_token_accuracy": 0.7447563670575619, "num_tokens": 9525436.0, "step": 22 }, { "entropy": 0.548858642578125, "epoch": 0.09126984126984126, "grad_norm": 2.169406925194856, "learning_rate": 5.789473684210527e-06, "loss": 0.8918, "mean_token_accuracy": 0.7433666130527854, "num_tokens": 9932011.0, "step": 23 }, { "entropy": 0.532745361328125, "epoch": 0.09523809523809523, "grad_norm": 2.77364949088275, "learning_rate": 6.0526315789473685e-06, "loss": 0.8816, "mean_token_accuracy": 0.7459379723295569, "num_tokens": 10358777.0, "step": 24 }, { "entropy": 0.535888671875, "epoch": 0.0992063492063492, "grad_norm": 3.016843229488076, "learning_rate": 6.31578947368421e-06, "loss": 0.8696, "mean_token_accuracy": 0.7488466305658221, "num_tokens": 10773051.0, "step": 25 }, { "entropy": 0.526519775390625, "epoch": 0.10317460317460317, "grad_norm": 2.7257646532581203, "learning_rate": 6.578947368421054e-06, "loss": 0.8541, "mean_token_accuracy": 0.7512839920818806, "num_tokens": 11211677.0, "step": 26 }, { "entropy": 0.533477783203125, "epoch": 0.10714285714285714, "grad_norm": 2.4870825582777543, "learning_rate": 6.842105263157896e-06, "loss": 0.81, "mean_token_accuracy": 0.758179577998817, "num_tokens": 11628779.0, "step": 27 }, { "entropy": 0.524932861328125, "epoch": 0.1111111111111111, "grad_norm": 2.360523836168894, "learning_rate": 7.1052631578947375e-06, "loss": 0.8368, "mean_token_accuracy": 0.756181831471622, "num_tokens": 12067363.0, "step": 28 }, { "entropy": 0.513580322265625, "epoch": 0.11507936507936507, "grad_norm": 2.340522449916669, "learning_rate": 7.368421052631579e-06, "loss": 0.8284, "mean_token_accuracy": 0.7573374779894948, "num_tokens": 12507159.0, "step": 29 }, { "entropy": 0.509765625, "epoch": 0.11904761904761904, "grad_norm": 1.9560553431906509, "learning_rate": 7.631578947368423e-06, "loss": 0.802, "mean_token_accuracy": 0.7627127859741449, "num_tokens": 12945458.0, "step": 30 }, { "entropy": 0.523956298828125, "epoch": 0.12301587301587301, "grad_norm": 2.040122129092966, "learning_rate": 7.894736842105265e-06, "loss": 0.7837, "mean_token_accuracy": 0.7661685338243842, "num_tokens": 13370514.0, "step": 31 }, { "entropy": 0.508392333984375, "epoch": 0.12698412698412698, "grad_norm": 2.121102369191249, "learning_rate": 8.157894736842106e-06, "loss": 0.7771, "mean_token_accuracy": 0.7685025054961443, "num_tokens": 13815066.0, "step": 32 }, { "entropy": 0.518341064453125, "epoch": 0.13095238095238096, "grad_norm": 1.935388895167446, "learning_rate": 8.421052631578948e-06, "loss": 0.7698, "mean_token_accuracy": 0.7705040192231536, "num_tokens": 14240312.0, "step": 33 }, { "entropy": 0.511688232421875, "epoch": 0.1349206349206349, "grad_norm": 1.69571857189707, "learning_rate": 8.68421052631579e-06, "loss": 0.7653, "mean_token_accuracy": 0.7728015650063753, "num_tokens": 14685173.0, "step": 34 }, { "entropy": 0.517608642578125, "epoch": 0.1388888888888889, "grad_norm": 1.7681452936301536, "learning_rate": 8.947368421052632e-06, "loss": 0.7449, "mean_token_accuracy": 0.7759622316807508, "num_tokens": 15099914.0, "step": 35 }, { "entropy": 0.5123291015625, "epoch": 0.14285714285714285, "grad_norm": 1.6769532274067533, "learning_rate": 9.210526315789474e-06, "loss": 0.7344, "mean_token_accuracy": 0.777042037807405, "num_tokens": 15522062.0, "step": 36 }, { "entropy": 0.5107421875, "epoch": 0.14682539682539683, "grad_norm": 1.5490568448744158, "learning_rate": 9.473684210526315e-06, "loss": 0.7237, "mean_token_accuracy": 0.7827096851542592, "num_tokens": 15955138.0, "step": 37 }, { "entropy": 0.5093994140625, "epoch": 0.15079365079365079, "grad_norm": 1.6589040046358219, "learning_rate": 9.736842105263159e-06, "loss": 0.703, "mean_token_accuracy": 0.7817074777558446, "num_tokens": 16388252.0, "step": 38 }, { "entropy": 0.5093994140625, "epoch": 0.15476190476190477, "grad_norm": 1.7103852217985493, "learning_rate": 1e-05, "loss": 0.7203, "mean_token_accuracy": 0.7796014500781894, "num_tokens": 16821287.0, "step": 39 }, { "entropy": 0.5087890625, "epoch": 0.15873015873015872, "grad_norm": 1.8720911640791305, "learning_rate": 9.99995213807381e-06, "loss": 0.6741, "mean_token_accuracy": 0.7901940597221255, "num_tokens": 17235205.0, "step": 40 }, { "entropy": 0.504638671875, "epoch": 0.1626984126984127, "grad_norm": 1.6006252706063373, "learning_rate": 9.99980855321154e-06, "loss": 0.6899, "mean_token_accuracy": 0.7874280894175172, "num_tokens": 17657156.0, "step": 41 }, { "entropy": 0.503753662109375, "epoch": 0.16666666666666666, "grad_norm": 1.5184573996381632, "learning_rate": 9.999569248162095e-06, "loss": 0.6887, "mean_token_accuracy": 0.7868739385157824, "num_tokens": 18090069.0, "step": 42 }, { "entropy": 0.497650146484375, "epoch": 0.17063492063492064, "grad_norm": 1.6431910052473107, "learning_rate": 9.999234227506912e-06, "loss": 0.6944, "mean_token_accuracy": 0.7861603572964668, "num_tokens": 18542578.0, "step": 43 }, { "entropy": 0.50860595703125, "epoch": 0.1746031746031746, "grad_norm": 1.7897905880615403, "learning_rate": 9.998803497659885e-06, "loss": 0.669, "mean_token_accuracy": 0.7912999261170626, "num_tokens": 18955962.0, "step": 44 }, { "entropy": 0.5076904296875, "epoch": 0.17857142857142858, "grad_norm": 1.5708540426011852, "learning_rate": 9.998277066867236e-06, "loss": 0.6583, "mean_token_accuracy": 0.7945169908925891, "num_tokens": 19379353.0, "step": 45 }, { "entropy": 0.507659912109375, "epoch": 0.18253968253968253, "grad_norm": 1.452264150440713, "learning_rate": 9.997654945207368e-06, "loss": 0.6506, "mean_token_accuracy": 0.7967926179990172, "num_tokens": 19812261.0, "step": 46 }, { "entropy": 0.524383544921875, "epoch": 0.1865079365079365, "grad_norm": 1.63583355617904, "learning_rate": 9.99693714459065e-06, "loss": 0.6458, "mean_token_accuracy": 0.7976251384243369, "num_tokens": 20210235.0, "step": 47 }, { "entropy": 0.503173828125, "epoch": 0.19047619047619047, "grad_norm": 2.0289207035123002, "learning_rate": 9.996123678759214e-06, "loss": 0.65, "mean_token_accuracy": 0.7951374817639589, "num_tokens": 20647709.0, "step": 48 }, { "entropy": 0.50360107421875, "epoch": 0.19444444444444445, "grad_norm": 1.5990576885906742, "learning_rate": 9.995214563286677e-06, "loss": 0.6434, "mean_token_accuracy": 0.7995740966871381, "num_tokens": 21065897.0, "step": 49 }, { "entropy": 0.5130615234375, "epoch": 0.1984126984126984, "grad_norm": 1.9524666748685242, "learning_rate": 9.994209815577843e-06, "loss": 0.6555, "mean_token_accuracy": 0.7948365742340684, "num_tokens": 21486371.0, "step": 50 }, { "entropy": 0.51611328125, "epoch": 0.20238095238095238, "grad_norm": 1.5386890414245815, "learning_rate": 9.993109454868379e-06, "loss": 0.6435, "mean_token_accuracy": 0.796173213981092, "num_tokens": 21909309.0, "step": 51 }, { "entropy": 0.5155029296875, "epoch": 0.20634920634920634, "grad_norm": 1.532778324611743, "learning_rate": 9.991913502224438e-06, "loss": 0.6319, "mean_token_accuracy": 0.7995416941121221, "num_tokens": 22318414.0, "step": 52 }, { "entropy": 0.50860595703125, "epoch": 0.21031746031746032, "grad_norm": 1.6657431535988216, "learning_rate": 9.990621980542258e-06, "loss": 0.6093, "mean_token_accuracy": 0.8053860478103161, "num_tokens": 22719471.0, "step": 53 }, { "entropy": 0.504730224609375, "epoch": 0.21428571428571427, "grad_norm": 1.641225405902951, "learning_rate": 9.989234914547725e-06, "loss": 0.6216, "mean_token_accuracy": 0.8012946872040629, "num_tokens": 23134604.0, "step": 54 }, { "entropy": 0.49383544921875, "epoch": 0.21825396825396826, "grad_norm": 1.4469976883176578, "learning_rate": 9.9877523307959e-06, "loss": 0.6264, "mean_token_accuracy": 0.8012660220265388, "num_tokens": 23571548.0, "step": 55 }, { "entropy": 0.49749755859375, "epoch": 0.2222222222222222, "grad_norm": 1.6227889097439865, "learning_rate": 9.986174257670509e-06, "loss": 0.6246, "mean_token_accuracy": 0.8050137888640165, "num_tokens": 24009770.0, "step": 56 }, { "entropy": 0.49603271484375, "epoch": 0.2261904761904762, "grad_norm": 1.4045322193755005, "learning_rate": 9.984500725383397e-06, "loss": 0.6324, "mean_token_accuracy": 0.8019688781350851, "num_tokens": 24447544.0, "step": 57 }, { "entropy": 0.5068359375, "epoch": 0.23015873015873015, "grad_norm": 1.3881146015309058, "learning_rate": 9.98273176597396e-06, "loss": 0.6233, "mean_token_accuracy": 0.802922697737813, "num_tokens": 24869806.0, "step": 58 }, { "entropy": 0.48602294921875, "epoch": 0.23412698412698413, "grad_norm": 1.4997301127686509, "learning_rate": 9.980867413308516e-06, "loss": 0.6298, "mean_token_accuracy": 0.8009732821956277, "num_tokens": 25337885.0, "step": 59 }, { "entropy": 0.4898681640625, "epoch": 0.23809523809523808, "grad_norm": 1.4955906883793464, "learning_rate": 9.978907703079672e-06, "loss": 0.6112, "mean_token_accuracy": 0.807507585734129, "num_tokens": 25762999.0, "step": 60 }, { "entropy": 0.495452880859375, "epoch": 0.24206349206349206, "grad_norm": 1.4495106936140836, "learning_rate": 9.976852672805625e-06, "loss": 0.6071, "mean_token_accuracy": 0.8060804437845945, "num_tokens": 26204122.0, "step": 61 }, { "entropy": 0.48187255859375, "epoch": 0.24603174603174602, "grad_norm": 1.394817796737972, "learning_rate": 9.974702361829465e-06, "loss": 0.5934, "mean_token_accuracy": 0.8098774421960115, "num_tokens": 26651412.0, "step": 62 }, { "entropy": 0.4937744140625, "epoch": 0.25, "grad_norm": 1.566600221250525, "learning_rate": 9.972456811318399e-06, "loss": 0.6075, "mean_token_accuracy": 0.8056732397526503, "num_tokens": 27080137.0, "step": 63 }, { "entropy": 0.48150634765625, "epoch": 0.25396825396825395, "grad_norm": 1.4470780226444868, "learning_rate": 9.970116064262975e-06, "loss": 0.6025, "mean_token_accuracy": 0.8087067836895585, "num_tokens": 27520069.0, "step": 64 }, { "entropy": 0.485382080078125, "epoch": 0.25793650793650796, "grad_norm": 1.5004810878183181, "learning_rate": 9.96768016547626e-06, "loss": 0.6011, "mean_token_accuracy": 0.8066385835409164, "num_tokens": 27954154.0, "step": 65 }, { "entropy": 0.49444580078125, "epoch": 0.2619047619047619, "grad_norm": 1.618557504297728, "learning_rate": 9.965149161592973e-06, "loss": 0.6054, "mean_token_accuracy": 0.8067285194993019, "num_tokens": 28367541.0, "step": 66 }, { "entropy": 0.4913330078125, "epoch": 0.26587301587301587, "grad_norm": 1.4570014991732672, "learning_rate": 9.962523101068608e-06, "loss": 0.573, "mean_token_accuracy": 0.8120877193287015, "num_tokens": 28779140.0, "step": 67 }, { "entropy": 0.485076904296875, "epoch": 0.2698412698412698, "grad_norm": 1.5057997069476419, "learning_rate": 9.959802034178489e-06, "loss": 0.5966, "mean_token_accuracy": 0.8073940826579928, "num_tokens": 29217570.0, "step": 68 }, { "entropy": 0.479888916015625, "epoch": 0.27380952380952384, "grad_norm": 1.4032340805398644, "learning_rate": 9.956986013016816e-06, "loss": 0.5767, "mean_token_accuracy": 0.8149419017136097, "num_tokens": 29656943.0, "step": 69 }, { "entropy": 0.484710693359375, "epoch": 0.2777777777777778, "grad_norm": 1.4496205753720897, "learning_rate": 9.954075091495669e-06, "loss": 0.6001, "mean_token_accuracy": 0.8093660045415163, "num_tokens": 30088869.0, "step": 70 }, { "entropy": 0.476165771484375, "epoch": 0.28174603174603174, "grad_norm": 1.4082524979942377, "learning_rate": 9.951069325343972e-06, "loss": 0.6016, "mean_token_accuracy": 0.8054882632568479, "num_tokens": 30550317.0, "step": 71 }, { "entropy": 0.483856201171875, "epoch": 0.2857142857142857, "grad_norm": 1.3714127497612545, "learning_rate": 9.947968772106428e-06, "loss": 0.5748, "mean_token_accuracy": 0.8156133992597461, "num_tokens": 30959821.0, "step": 72 }, { "entropy": 0.47869873046875, "epoch": 0.2896825396825397, "grad_norm": 1.6516883841068675, "learning_rate": 9.944773491142416e-06, "loss": 0.5997, "mean_token_accuracy": 0.8074251553043723, "num_tokens": 31412639.0, "step": 73 }, { "entropy": 0.487518310546875, "epoch": 0.29365079365079366, "grad_norm": 1.5133792126136842, "learning_rate": 9.94148354362486e-06, "loss": 0.592, "mean_token_accuracy": 0.8129479885101318, "num_tokens": 31830767.0, "step": 74 }, { "entropy": 0.482086181640625, "epoch": 0.2976190476190476, "grad_norm": 1.62731136956083, "learning_rate": 9.938098992539045e-06, "loss": 0.5835, "mean_token_accuracy": 0.8082789676263928, "num_tokens": 32267329.0, "step": 75 }, { "entropy": 0.48516845703125, "epoch": 0.30158730158730157, "grad_norm": 1.4784416203962691, "learning_rate": 9.93461990268143e-06, "loss": 0.582, "mean_token_accuracy": 0.8147872434929013, "num_tokens": 32692726.0, "step": 76 }, { "entropy": 0.48876953125, "epoch": 0.3055555555555556, "grad_norm": 1.5041413810038196, "learning_rate": 9.931046340658387e-06, "loss": 0.5617, "mean_token_accuracy": 0.8183435359969735, "num_tokens": 33108936.0, "step": 77 }, { "entropy": 0.472503662109375, "epoch": 0.30952380952380953, "grad_norm": 1.6817980341644059, "learning_rate": 9.927378374884947e-06, "loss": 0.5655, "mean_token_accuracy": 0.8146926909685135, "num_tokens": 33543076.0, "step": 78 }, { "entropy": 0.474945068359375, "epoch": 0.3134920634920635, "grad_norm": 1.3241417102653499, "learning_rate": 9.923616075583465e-06, "loss": 0.5738, "mean_token_accuracy": 0.8142029214650393, "num_tokens": 33980897.0, "step": 79 }, { "entropy": 0.47528076171875, "epoch": 0.31746031746031744, "grad_norm": 1.4456909932877973, "learning_rate": 9.919759514782304e-06, "loss": 0.5725, "mean_token_accuracy": 0.8150945641100407, "num_tokens": 34404352.0, "step": 80 }, { "entropy": 0.48504638671875, "epoch": 0.32142857142857145, "grad_norm": 1.2666611706175113, "learning_rate": 9.91580876631443e-06, "loss": 0.5728, "mean_token_accuracy": 0.8147335788235068, "num_tokens": 34815122.0, "step": 81 }, { "entropy": 0.49102783203125, "epoch": 0.3253968253968254, "grad_norm": 1.4465210833568694, "learning_rate": 9.91176390581602e-06, "loss": 0.5759, "mean_token_accuracy": 0.8133391635492444, "num_tokens": 35236933.0, "step": 82 }, { "entropy": 0.483489990234375, "epoch": 0.32936507936507936, "grad_norm": 1.3544114949996022, "learning_rate": 9.907625010724999e-06, "loss": 0.5724, "mean_token_accuracy": 0.8148928321897984, "num_tokens": 35664506.0, "step": 83 }, { "entropy": 0.480560302734375, "epoch": 0.3333333333333333, "grad_norm": 1.3188860783644643, "learning_rate": 9.903392160279564e-06, "loss": 0.5666, "mean_token_accuracy": 0.8133293204009533, "num_tokens": 36088050.0, "step": 84 }, { "entropy": 0.48748779296875, "epoch": 0.3373015873015873, "grad_norm": 1.4518030416485894, "learning_rate": 9.899065435516661e-06, "loss": 0.5664, "mean_token_accuracy": 0.8148653889074922, "num_tokens": 36501235.0, "step": 85 }, { "entropy": 0.469940185546875, "epoch": 0.3412698412698413, "grad_norm": 1.394119633826569, "learning_rate": 9.894644919270448e-06, "loss": 0.5722, "mean_token_accuracy": 0.814102666452527, "num_tokens": 36942407.0, "step": 86 }, { "entropy": 0.47369384765625, "epoch": 0.34523809523809523, "grad_norm": 1.4881887909837872, "learning_rate": 9.890130696170691e-06, "loss": 0.5714, "mean_token_accuracy": 0.8154451455920935, "num_tokens": 37381260.0, "step": 87 }, { "entropy": 0.47998046875, "epoch": 0.3492063492063492, "grad_norm": 1.3217206972932933, "learning_rate": 9.885522852641156e-06, "loss": 0.5695, "mean_token_accuracy": 0.814792038872838, "num_tokens": 37803882.0, "step": 88 }, { "entropy": 0.48052978515625, "epoch": 0.3531746031746032, "grad_norm": 1.5133757517932098, "learning_rate": 9.880821476897948e-06, "loss": 0.5628, "mean_token_accuracy": 0.8151478515937924, "num_tokens": 38227635.0, "step": 89 }, { "entropy": 0.475738525390625, "epoch": 0.35714285714285715, "grad_norm": 1.5653342692191234, "learning_rate": 9.87602665894783e-06, "loss": 0.5828, "mean_token_accuracy": 0.8125336300581694, "num_tokens": 38667329.0, "step": 90 }, { "entropy": 0.473876953125, "epoch": 0.3611111111111111, "grad_norm": 1.3382017413079235, "learning_rate": 9.871138490586489e-06, "loss": 0.57, "mean_token_accuracy": 0.8121865503489971, "num_tokens": 39107330.0, "step": 91 }, { "entropy": 0.47998046875, "epoch": 0.36507936507936506, "grad_norm": 1.346784303133718, "learning_rate": 9.866157065396784e-06, "loss": 0.5503, "mean_token_accuracy": 0.8177150310948491, "num_tokens": 39524166.0, "step": 92 }, { "entropy": 0.469207763671875, "epoch": 0.36904761904761907, "grad_norm": 1.4083288521133936, "learning_rate": 9.861082478746962e-06, "loss": 0.5508, "mean_token_accuracy": 0.820819640532136, "num_tokens": 39952174.0, "step": 93 }, { "entropy": 0.465789794921875, "epoch": 0.373015873015873, "grad_norm": 1.4473119436564825, "learning_rate": 9.855914827788814e-06, "loss": 0.5596, "mean_token_accuracy": 0.8184320721775293, "num_tokens": 40389693.0, "step": 94 }, { "entropy": 0.46807861328125, "epoch": 0.376984126984127, "grad_norm": 1.3763793812393954, "learning_rate": 9.850654211455837e-06, "loss": 0.5548, "mean_token_accuracy": 0.8205192228779197, "num_tokens": 40815730.0, "step": 95 }, { "entropy": 0.484527587890625, "epoch": 0.38095238095238093, "grad_norm": 1.5969870094084369, "learning_rate": 9.84530073046132e-06, "loss": 0.564, "mean_token_accuracy": 0.816374409943819, "num_tokens": 41231841.0, "step": 96 }, { "entropy": 0.492523193359375, "epoch": 0.38492063492063494, "grad_norm": 1.379364573057709, "learning_rate": 9.83985448729643e-06, "loss": 0.572, "mean_token_accuracy": 0.8147962624207139, "num_tokens": 41650119.0, "step": 97 }, { "entropy": 0.4735107421875, "epoch": 0.3888888888888889, "grad_norm": 1.4022051638675177, "learning_rate": 9.83431558622824e-06, "loss": 0.5501, "mean_token_accuracy": 0.8185382299125195, "num_tokens": 42082897.0, "step": 98 }, { "entropy": 0.47802734375, "epoch": 0.39285714285714285, "grad_norm": 1.3021150947814153, "learning_rate": 9.828684133297738e-06, "loss": 0.5475, "mean_token_accuracy": 0.82077881321311, "num_tokens": 42519361.0, "step": 99 }, { "entropy": 0.47802734375, "epoch": 0.3968253968253968, "grad_norm": 1.3024753376267064, "learning_rate": 9.822960236317804e-06, "loss": 0.5436, "mean_token_accuracy": 0.8204956650733948, "num_tokens": 42941458.0, "step": 100 }, { "entropy": 0.472930908203125, "epoch": 0.4007936507936508, "grad_norm": 1.4182047962742048, "learning_rate": 9.817144004871127e-06, "loss": 0.5442, "mean_token_accuracy": 0.8214483223855495, "num_tokens": 43370971.0, "step": 101 }, { "entropy": 0.476470947265625, "epoch": 0.40476190476190477, "grad_norm": 1.3283608806953866, "learning_rate": 9.811235550308127e-06, "loss": 0.551, "mean_token_accuracy": 0.8185345204547048, "num_tokens": 43801380.0, "step": 102 }, { "entropy": 0.46978759765625, "epoch": 0.4087301587301587, "grad_norm": 1.2924764394677166, "learning_rate": 9.805234985744804e-06, "loss": 0.5605, "mean_token_accuracy": 0.8147126482799649, "num_tokens": 44245066.0, "step": 103 }, { "entropy": 0.485198974609375, "epoch": 0.4126984126984127, "grad_norm": 1.3073864707831366, "learning_rate": 9.799142426060595e-06, "loss": 0.5573, "mean_token_accuracy": 0.8181026382371783, "num_tokens": 44671335.0, "step": 104 }, { "entropy": 0.498046875, "epoch": 0.4166666666666667, "grad_norm": 1.4213977867693426, "learning_rate": 9.792957987896154e-06, "loss": 0.5518, "mean_token_accuracy": 0.8183343056589365, "num_tokens": 45066930.0, "step": 105 }, { "entropy": 0.47454833984375, "epoch": 0.42063492063492064, "grad_norm": 1.2495857267379333, "learning_rate": 9.786681789651134e-06, "loss": 0.5472, "mean_token_accuracy": 0.8180114766582847, "num_tokens": 45508166.0, "step": 106 }, { "entropy": 0.47021484375, "epoch": 0.4246031746031746, "grad_norm": 1.238708297199452, "learning_rate": 9.780313951481904e-06, "loss": 0.5612, "mean_token_accuracy": 0.8155703386291862, "num_tokens": 45960298.0, "step": 107 }, { "entropy": 0.473785400390625, "epoch": 0.42857142857142855, "grad_norm": 1.367804985831029, "learning_rate": 9.773854595299269e-06, "loss": 0.5518, "mean_token_accuracy": 0.8167815553024411, "num_tokens": 46398974.0, "step": 108 }, { "entropy": 0.462677001953125, "epoch": 0.43253968253968256, "grad_norm": 1.3222167500569877, "learning_rate": 9.767303844766118e-06, "loss": 0.5548, "mean_token_accuracy": 0.8168724188581109, "num_tokens": 46837899.0, "step": 109 }, { "entropy": 0.460693359375, "epoch": 0.4365079365079365, "grad_norm": 1.3681492766242778, "learning_rate": 9.760661825295068e-06, "loss": 0.5623, "mean_token_accuracy": 0.8150366581976414, "num_tokens": 47311746.0, "step": 110 }, { "entropy": 0.466400146484375, "epoch": 0.44047619047619047, "grad_norm": 1.344685621979837, "learning_rate": 9.753928664046055e-06, "loss": 0.5392, "mean_token_accuracy": 0.822113991715014, "num_tokens": 47744340.0, "step": 111 }, { "entropy": 0.4608154296875, "epoch": 0.4444444444444444, "grad_norm": 1.3313641531925076, "learning_rate": 9.747104489923907e-06, "loss": 0.5335, "mean_token_accuracy": 0.8225171025842428, "num_tokens": 48177761.0, "step": 112 }, { "entropy": 0.4722900390625, "epoch": 0.44841269841269843, "grad_norm": 1.5485087292600126, "learning_rate": 9.740189433575873e-06, "loss": 0.5511, "mean_token_accuracy": 0.8177419500425458, "num_tokens": 48604700.0, "step": 113 }, { "entropy": 0.474884033203125, "epoch": 0.4523809523809524, "grad_norm": 1.3287727548949633, "learning_rate": 9.733183627389117e-06, "loss": 0.5349, "mean_token_accuracy": 0.8249012846499681, "num_tokens": 49026375.0, "step": 114 }, { "entropy": 0.461212158203125, "epoch": 0.45634920634920634, "grad_norm": 1.4235893278514111, "learning_rate": 9.726087205488192e-06, "loss": 0.5488, "mean_token_accuracy": 0.8166424483060837, "num_tokens": 49467267.0, "step": 115 }, { "entropy": 0.47381591796875, "epoch": 0.4603174603174603, "grad_norm": 1.255433079679792, "learning_rate": 9.718900303732465e-06, "loss": 0.5467, "mean_token_accuracy": 0.8177134236320853, "num_tokens": 49889163.0, "step": 116 }, { "entropy": 0.476165771484375, "epoch": 0.4642857142857143, "grad_norm": 1.2666755263949114, "learning_rate": 9.711623059713522e-06, "loss": 0.5284, "mean_token_accuracy": 0.82161083817482, "num_tokens": 50300460.0, "step": 117 }, { "entropy": 0.470458984375, "epoch": 0.46825396825396826, "grad_norm": 1.7054143182470258, "learning_rate": 9.70425561275253e-06, "loss": 0.553, "mean_token_accuracy": 0.8204147005453706, "num_tokens": 50735361.0, "step": 118 }, { "entropy": 0.47528076171875, "epoch": 0.4722222222222222, "grad_norm": 1.2776820782333425, "learning_rate": 9.696798103897567e-06, "loss": 0.5344, "mean_token_accuracy": 0.821893903426826, "num_tokens": 51149122.0, "step": 119 }, { "entropy": 0.469268798828125, "epoch": 0.47619047619047616, "grad_norm": 1.1855022647806321, "learning_rate": 9.689250675920932e-06, "loss": 0.5371, "mean_token_accuracy": 0.8207768378779292, "num_tokens": 51597577.0, "step": 120 }, { "entropy": 0.461181640625, "epoch": 0.4801587301587302, "grad_norm": 1.3061024406164452, "learning_rate": 9.6816134733164e-06, "loss": 0.5419, "mean_token_accuracy": 0.8211635444313288, "num_tokens": 52043666.0, "step": 121 }, { "entropy": 0.4639892578125, "epoch": 0.48412698412698413, "grad_norm": 1.278485982326175, "learning_rate": 9.67388664229646e-06, "loss": 0.5457, "mean_token_accuracy": 0.8210693299770355, "num_tokens": 52482382.0, "step": 122 }, { "entropy": 0.466400146484375, "epoch": 0.4880952380952381, "grad_norm": 1.3159497560386597, "learning_rate": 9.66607033078952e-06, "loss": 0.5399, "mean_token_accuracy": 0.8193363519385457, "num_tokens": 52931115.0, "step": 123 }, { "entropy": 0.462371826171875, "epoch": 0.49206349206349204, "grad_norm": 1.3013445808543571, "learning_rate": 9.658164688437073e-06, "loss": 0.5431, "mean_token_accuracy": 0.8198595689609647, "num_tokens": 53370750.0, "step": 124 }, { "entropy": 0.470245361328125, "epoch": 0.49603174603174605, "grad_norm": 1.2502745654553475, "learning_rate": 9.65016986659082e-06, "loss": 0.5352, "mean_token_accuracy": 0.8216186631470919, "num_tokens": 53798951.0, "step": 125 }, { "entropy": 0.460723876953125, "epoch": 0.5, "grad_norm": 1.4425212147696118, "learning_rate": 9.642086018309798e-06, "loss": 0.528, "mean_token_accuracy": 0.8253877777606249, "num_tokens": 54235189.0, "step": 126 }, { "entropy": 0.463043212890625, "epoch": 0.503968253968254, "grad_norm": 1.190227347104015, "learning_rate": 9.63391329835742e-06, "loss": 0.5215, "mean_token_accuracy": 0.825776319950819, "num_tokens": 54642925.0, "step": 127 }, { "entropy": 0.470428466796875, "epoch": 0.5079365079365079, "grad_norm": 1.3119200133443487, "learning_rate": 9.625651863198538e-06, "loss": 0.5361, "mean_token_accuracy": 0.8217763127759099, "num_tokens": 55066936.0, "step": 128 }, { "entropy": 0.475128173828125, "epoch": 0.5119047619047619, "grad_norm": 1.2559808601225464, "learning_rate": 9.617301870996432e-06, "loss": 0.5271, "mean_token_accuracy": 0.8248334173113108, "num_tokens": 55484500.0, "step": 129 }, { "entropy": 0.45751953125, "epoch": 0.5158730158730159, "grad_norm": 1.2089833762472606, "learning_rate": 9.608863481609784e-06, "loss": 0.5333, "mean_token_accuracy": 0.8226035898551345, "num_tokens": 55922405.0, "step": 130 }, { "entropy": 0.4698486328125, "epoch": 0.5198412698412699, "grad_norm": 1.311622726348439, "learning_rate": 9.600336856589622e-06, "loss": 0.542, "mean_token_accuracy": 0.8179264310747385, "num_tokens": 56355834.0, "step": 131 }, { "entropy": 0.469024658203125, "epoch": 0.5238095238095238, "grad_norm": 1.370201408190726, "learning_rate": 9.591722159176229e-06, "loss": 0.5209, "mean_token_accuracy": 0.8256417205557227, "num_tokens": 56770275.0, "step": 132 }, { "entropy": 0.467926025390625, "epoch": 0.5277777777777778, "grad_norm": 1.4107765499615386, "learning_rate": 9.583019554296004e-06, "loss": 0.54, "mean_token_accuracy": 0.8201555293053389, "num_tokens": 57203160.0, "step": 133 }, { "entropy": 0.469207763671875, "epoch": 0.5317460317460317, "grad_norm": 1.2667343919182794, "learning_rate": 9.574229208558322e-06, "loss": 0.535, "mean_token_accuracy": 0.8202388240024447, "num_tokens": 57627870.0, "step": 134 }, { "entropy": 0.46697998046875, "epoch": 0.5357142857142857, "grad_norm": 1.4012228207534334, "learning_rate": 9.565351290252339e-06, "loss": 0.5335, "mean_token_accuracy": 0.8244267264381051, "num_tokens": 58059792.0, "step": 135 }, { "entropy": 0.4700927734375, "epoch": 0.5396825396825397, "grad_norm": 1.2541013251161421, "learning_rate": 9.556385969343756e-06, "loss": 0.5178, "mean_token_accuracy": 0.8261177660897374, "num_tokens": 58469884.0, "step": 136 }, { "entropy": 0.460784912109375, "epoch": 0.5436507936507936, "grad_norm": 1.266853697510061, "learning_rate": 9.547333417471589e-06, "loss": 0.5218, "mean_token_accuracy": 0.824421800673008, "num_tokens": 58908403.0, "step": 137 }, { "entropy": 0.467498779296875, "epoch": 0.5476190476190477, "grad_norm": 1.649666578399019, "learning_rate": 9.538193807944864e-06, "loss": 0.5251, "mean_token_accuracy": 0.8241150714457035, "num_tokens": 59323796.0, "step": 138 }, { "entropy": 0.461883544921875, "epoch": 0.5515873015873016, "grad_norm": 1.2782211754106552, "learning_rate": 9.528967315739308e-06, "loss": 0.5231, "mean_token_accuracy": 0.8241786258295178, "num_tokens": 59751885.0, "step": 139 }, { "entropy": 0.464080810546875, "epoch": 0.5555555555555556, "grad_norm": 1.1911969994875058, "learning_rate": 9.519654117493996e-06, "loss": 0.5093, "mean_token_accuracy": 0.8299755034968257, "num_tokens": 60183841.0, "step": 140 }, { "entropy": 0.467681884765625, "epoch": 0.5595238095238095, "grad_norm": 1.21584451360531, "learning_rate": 9.510254391507971e-06, "loss": 0.5323, "mean_token_accuracy": 0.8225418599322438, "num_tokens": 60605801.0, "step": 141 }, { "entropy": 0.465789794921875, "epoch": 0.5634920634920635, "grad_norm": 1.1387453790165247, "learning_rate": 9.500768317736832e-06, "loss": 0.527, "mean_token_accuracy": 0.8241681484505534, "num_tokens": 61048601.0, "step": 142 }, { "entropy": 0.47747802734375, "epoch": 0.5674603174603174, "grad_norm": 1.1374751119159374, "learning_rate": 9.49119607778928e-06, "loss": 0.5235, "mean_token_accuracy": 0.8259162092581391, "num_tokens": 61446873.0, "step": 143 }, { "entropy": 0.4652099609375, "epoch": 0.5714285714285714, "grad_norm": 1.2648801316634823, "learning_rate": 9.481537854923654e-06, "loss": 0.5352, "mean_token_accuracy": 0.8220484433695674, "num_tokens": 61887912.0, "step": 144 }, { "entropy": 0.47418212890625, "epoch": 0.5753968253968254, "grad_norm": 1.113220988507023, "learning_rate": 9.471793834044416e-06, "loss": 0.5236, "mean_token_accuracy": 0.8275265209376812, "num_tokens": 62316051.0, "step": 145 }, { "entropy": 0.459381103515625, "epoch": 0.5793650793650794, "grad_norm": 1.1782022702716075, "learning_rate": 9.461964201698604e-06, "loss": 0.5239, "mean_token_accuracy": 0.8253972074016929, "num_tokens": 62741342.0, "step": 146 }, { "entropy": 0.464813232421875, "epoch": 0.5833333333333334, "grad_norm": 1.3055908871865158, "learning_rate": 9.452049146072278e-06, "loss": 0.5217, "mean_token_accuracy": 0.8288997933268547, "num_tokens": 63164890.0, "step": 147 }, { "entropy": 0.4561767578125, "epoch": 0.5873015873015873, "grad_norm": 1.250402921918011, "learning_rate": 9.442048856986899e-06, "loss": 0.5244, "mean_token_accuracy": 0.825376064516604, "num_tokens": 63594617.0, "step": 148 }, { "entropy": 0.45916748046875, "epoch": 0.5912698412698413, "grad_norm": 1.2512378704930547, "learning_rate": 9.431963525895709e-06, "loss": 0.5332, "mean_token_accuracy": 0.8236651951447129, "num_tokens": 64050293.0, "step": 149 }, { "entropy": 0.45831298828125, "epoch": 0.5952380952380952, "grad_norm": 1.2800747002600605, "learning_rate": 9.421793345880055e-06, "loss": 0.508, "mean_token_accuracy": 0.8307171342894435, "num_tokens": 64467695.0, "step": 150 }, { "entropy": 0.4619140625, "epoch": 0.5992063492063492, "grad_norm": 1.22106067792139, "learning_rate": 9.4115385116457e-06, "loss": 0.5273, "mean_token_accuracy": 0.8228645129129291, "num_tokens": 64908198.0, "step": 151 }, { "entropy": 0.465362548828125, "epoch": 0.6031746031746031, "grad_norm": 1.6011741702601825, "learning_rate": 9.401199219519088e-06, "loss": 0.5189, "mean_token_accuracy": 0.8247488467022777, "num_tokens": 65333788.0, "step": 152 }, { "entropy": 0.47772216796875, "epoch": 0.6071428571428571, "grad_norm": 1.289619416717165, "learning_rate": 9.390775667443602e-06, "loss": 0.5092, "mean_token_accuracy": 0.8292458476498723, "num_tokens": 65748782.0, "step": 153 }, { "entropy": 0.463470458984375, "epoch": 0.6111111111111112, "grad_norm": 1.3540556513064608, "learning_rate": 9.380268054975745e-06, "loss": 0.5249, "mean_token_accuracy": 0.823799098841846, "num_tokens": 66178918.0, "step": 154 }, { "entropy": 0.467132568359375, "epoch": 0.6150793650793651, "grad_norm": 1.441163655667528, "learning_rate": 9.36967658328135e-06, "loss": 0.5339, "mean_token_accuracy": 0.825651915743947, "num_tokens": 66603248.0, "step": 155 }, { "entropy": 0.4588623046875, "epoch": 0.6190476190476191, "grad_norm": 1.2757900624701248, "learning_rate": 9.359001455131713e-06, "loss": 0.5205, "mean_token_accuracy": 0.8264942672103643, "num_tokens": 67052342.0, "step": 156 }, { "entropy": 0.457855224609375, "epoch": 0.623015873015873, "grad_norm": 1.3280329459811233, "learning_rate": 9.34824287489971e-06, "loss": 0.5167, "mean_token_accuracy": 0.8265606937929988, "num_tokens": 67476890.0, "step": 157 }, { "entropy": 0.4544677734375, "epoch": 0.626984126984127, "grad_norm": 1.4362643018863588, "learning_rate": 9.337401048555892e-06, "loss": 0.5184, "mean_token_accuracy": 0.8287814203649759, "num_tokens": 67913391.0, "step": 158 }, { "entropy": 0.4598388671875, "epoch": 0.6309523809523809, "grad_norm": 1.8377059083752896, "learning_rate": 9.326476183664535e-06, "loss": 0.5086, "mean_token_accuracy": 0.8302426496520638, "num_tokens": 68339443.0, "step": 159 }, { "entropy": 0.457611083984375, "epoch": 0.6349206349206349, "grad_norm": 1.2472914610462977, "learning_rate": 9.315468489379668e-06, "loss": 0.5242, "mean_token_accuracy": 0.8252703994512558, "num_tokens": 68772115.0, "step": 160 }, { "entropy": 0.454376220703125, "epoch": 0.6388888888888888, "grad_norm": 1.0940363704932208, "learning_rate": 9.304378176441076e-06, "loss": 0.5094, "mean_token_accuracy": 0.8273925203830004, "num_tokens": 69198272.0, "step": 161 }, { "entropy": 0.456268310546875, "epoch": 0.6428571428571429, "grad_norm": 1.250494594040658, "learning_rate": 9.29320545717025e-06, "loss": 0.5044, "mean_token_accuracy": 0.8318730751052499, "num_tokens": 69611653.0, "step": 162 }, { "entropy": 0.4644775390625, "epoch": 0.6468253968253969, "grad_norm": 1.3758890462061453, "learning_rate": 9.281950545466336e-06, "loss": 0.5375, "mean_token_accuracy": 0.8206725753843784, "num_tokens": 70054917.0, "step": 163 }, { "entropy": 0.451385498046875, "epoch": 0.6507936507936508, "grad_norm": 1.2229845865238094, "learning_rate": 9.27061365680204e-06, "loss": 0.5148, "mean_token_accuracy": 0.8290882222354412, "num_tokens": 70496952.0, "step": 164 }, { "entropy": 0.452728271484375, "epoch": 0.6547619047619048, "grad_norm": 1.310715081152188, "learning_rate": 9.25919500821949e-06, "loss": 0.5108, "mean_token_accuracy": 0.8279124954715371, "num_tokens": 70919899.0, "step": 165 }, { "entropy": 0.45574951171875, "epoch": 0.6587301587301587, "grad_norm": 1.2675730907362597, "learning_rate": 9.247694818326092e-06, "loss": 0.5111, "mean_token_accuracy": 0.8315063090994954, "num_tokens": 71343921.0, "step": 166 }, { "entropy": 0.44989013671875, "epoch": 0.6626984126984127, "grad_norm": 1.3386162279647864, "learning_rate": 9.236113307290345e-06, "loss": 0.5343, "mean_token_accuracy": 0.821853213943541, "num_tokens": 71808905.0, "step": 167 }, { "entropy": 0.45709228515625, "epoch": 0.6666666666666666, "grad_norm": 1.2417954424606619, "learning_rate": 9.224450696837617e-06, "loss": 0.5137, "mean_token_accuracy": 0.8275673342868686, "num_tokens": 72240608.0, "step": 168 }, { "entropy": 0.4530029296875, "epoch": 0.6706349206349206, "grad_norm": 1.2477554302346368, "learning_rate": 9.212707210245908e-06, "loss": 0.505, "mean_token_accuracy": 0.8292029527947307, "num_tokens": 72668688.0, "step": 169 }, { "entropy": 0.453826904296875, "epoch": 0.6746031746031746, "grad_norm": 1.2403145708249377, "learning_rate": 9.200883072341573e-06, "loss": 0.5194, "mean_token_accuracy": 0.8281446853652596, "num_tokens": 73118452.0, "step": 170 }, { "entropy": 0.45068359375, "epoch": 0.6785714285714286, "grad_norm": 1.2242088741534112, "learning_rate": 9.188978509495022e-06, "loss": 0.5228, "mean_token_accuracy": 0.8244192777201533, "num_tokens": 73569120.0, "step": 171 }, { "entropy": 0.448516845703125, "epoch": 0.6825396825396826, "grad_norm": 1.4410441359720512, "learning_rate": 9.176993749616374e-06, "loss": 0.5148, "mean_token_accuracy": 0.8254242306575179, "num_tokens": 73991069.0, "step": 172 }, { "entropy": 0.457122802734375, "epoch": 0.6865079365079365, "grad_norm": 1.4617287104899703, "learning_rate": 9.164929022151106e-06, "loss": 0.506, "mean_token_accuracy": 0.8297470537945628, "num_tokens": 74406271.0, "step": 173 }, { "entropy": 0.457122802734375, "epoch": 0.6904761904761905, "grad_norm": 1.2946096899912363, "learning_rate": 9.15278455807566e-06, "loss": 0.5163, "mean_token_accuracy": 0.8275650115683675, "num_tokens": 74839901.0, "step": 174 }, { "entropy": 0.451202392578125, "epoch": 0.6944444444444444, "grad_norm": 1.2168830292282429, "learning_rate": 9.140560589893012e-06, "loss": 0.5088, "mean_token_accuracy": 0.8290477497503161, "num_tokens": 75280578.0, "step": 175 }, { "entropy": 0.45111083984375, "epoch": 0.6984126984126984, "grad_norm": 1.1964525447125613, "learning_rate": 9.128257351628224e-06, "loss": 0.5346, "mean_token_accuracy": 0.8231356684118509, "num_tokens": 75749725.0, "step": 176 }, { "entropy": 0.456024169921875, "epoch": 0.7023809523809523, "grad_norm": 1.2104495744651753, "learning_rate": 9.115875078823975e-06, "loss": 0.5188, "mean_token_accuracy": 0.8278255322948098, "num_tokens": 76175668.0, "step": 177 }, { "entropy": 0.45965576171875, "epoch": 0.7063492063492064, "grad_norm": 1.1865163712517055, "learning_rate": 9.103414008536029e-06, "loss": 0.5111, "mean_token_accuracy": 0.8277882896363735, "num_tokens": 76593690.0, "step": 178 }, { "entropy": 0.458587646484375, "epoch": 0.7103174603174603, "grad_norm": 1.6965519987597353, "learning_rate": 9.09087437932872e-06, "loss": 0.5015, "mean_token_accuracy": 0.8323444193229079, "num_tokens": 77009261.0, "step": 179 }, { "entropy": 0.454925537109375, "epoch": 0.7142857142857143, "grad_norm": 1.2650031464495928, "learning_rate": 9.07825643127037e-06, "loss": 0.5157, "mean_token_accuracy": 0.8258270686492324, "num_tokens": 77431030.0, "step": 180 }, { "entropy": 0.447906494140625, "epoch": 0.7182539682539683, "grad_norm": 1.1859012409189014, "learning_rate": 9.065560405928699e-06, "loss": 0.5023, "mean_token_accuracy": 0.8294160980731249, "num_tokens": 77852655.0, "step": 181 }, { "entropy": 0.45416259765625, "epoch": 0.7222222222222222, "grad_norm": 1.176919606678633, "learning_rate": 9.0527865463662e-06, "loss": 0.5162, "mean_token_accuracy": 0.8275531772524118, "num_tokens": 78278605.0, "step": 182 }, { "entropy": 0.4486083984375, "epoch": 0.7261904761904762, "grad_norm": 1.2918709531705708, "learning_rate": 9.039935097135479e-06, "loss": 0.5024, "mean_token_accuracy": 0.8300044005736709, "num_tokens": 78721098.0, "step": 183 }, { "entropy": 0.454345703125, "epoch": 0.7301587301587301, "grad_norm": 1.3064400710795658, "learning_rate": 9.027006304274584e-06, "loss": 0.5096, "mean_token_accuracy": 0.8292623031884432, "num_tokens": 79154216.0, "step": 184 }, { "entropy": 0.44927978515625, "epoch": 0.7341269841269841, "grad_norm": 1.2696774197334444, "learning_rate": 9.014000415302286e-06, "loss": 0.5139, "mean_token_accuracy": 0.8276010407134891, "num_tokens": 79599332.0, "step": 185 }, { "entropy": 0.45220947265625, "epoch": 0.7380952380952381, "grad_norm": 1.2548327381579976, "learning_rate": 9.000917679213344e-06, "loss": 0.5196, "mean_token_accuracy": 0.8274355586618185, "num_tokens": 80039204.0, "step": 186 }, { "entropy": 0.4434814453125, "epoch": 0.7420634920634921, "grad_norm": 1.180213420756775, "learning_rate": 8.987758346473739e-06, "loss": 0.503, "mean_token_accuracy": 0.8305716142058372, "num_tokens": 80472128.0, "step": 187 }, { "entropy": 0.449005126953125, "epoch": 0.746031746031746, "grad_norm": 1.2928756233384209, "learning_rate": 8.974522669015872e-06, "loss": 0.5174, "mean_token_accuracy": 0.8274647342041135, "num_tokens": 80910348.0, "step": 188 }, { "entropy": 0.448822021484375, "epoch": 0.75, "grad_norm": 1.153866561909503, "learning_rate": 8.961210900233757e-06, "loss": 0.5101, "mean_token_accuracy": 0.8277234118431807, "num_tokens": 81336350.0, "step": 189 }, { "entropy": 0.44439697265625, "epoch": 0.753968253968254, "grad_norm": 1.215655128934687, "learning_rate": 8.947823294978147e-06, "loss": 0.509, "mean_token_accuracy": 0.8286535432562232, "num_tokens": 81765325.0, "step": 190 }, { "entropy": 0.461395263671875, "epoch": 0.7579365079365079, "grad_norm": 1.4210713418222345, "learning_rate": 8.934360109551671e-06, "loss": 0.5106, "mean_token_accuracy": 0.8299150029197335, "num_tokens": 82191876.0, "step": 191 }, { "entropy": 0.4591064453125, "epoch": 0.7619047619047619, "grad_norm": 1.319721918446663, "learning_rate": 8.920821601703927e-06, "loss": 0.4913, "mean_token_accuracy": 0.8329328633844852, "num_tokens": 82611125.0, "step": 192 }, { "entropy": 0.453155517578125, "epoch": 0.7658730158730159, "grad_norm": 1.3201749647251046, "learning_rate": 8.907208030626538e-06, "loss": 0.5129, "mean_token_accuracy": 0.8259176956489682, "num_tokens": 83051815.0, "step": 193 }, { "entropy": 0.4512939453125, "epoch": 0.7698412698412699, "grad_norm": 1.1719138701614786, "learning_rate": 8.8935196569482e-06, "loss": 0.5079, "mean_token_accuracy": 0.8282450577244163, "num_tokens": 83488021.0, "step": 194 }, { "entropy": 0.456451416015625, "epoch": 0.7738095238095238, "grad_norm": 1.2391988296172292, "learning_rate": 8.879756742729683e-06, "loss": 0.5074, "mean_token_accuracy": 0.827914453111589, "num_tokens": 83902519.0, "step": 195 }, { "entropy": 0.450653076171875, "epoch": 0.7777777777777778, "grad_norm": 1.2037962698085334, "learning_rate": 8.865919551458823e-06, "loss": 0.505, "mean_token_accuracy": 0.8286258336156607, "num_tokens": 84321775.0, "step": 196 }, { "entropy": 0.44927978515625, "epoch": 0.7817460317460317, "grad_norm": 1.1617039305620294, "learning_rate": 8.852008348045468e-06, "loss": 0.5019, "mean_token_accuracy": 0.8323168307542801, "num_tokens": 84745911.0, "step": 197 }, { "entropy": 0.451751708984375, "epoch": 0.7857142857142857, "grad_norm": 1.149795910244863, "learning_rate": 8.838023398816417e-06, "loss": 0.4857, "mean_token_accuracy": 0.8362782001495361, "num_tokens": 85167087.0, "step": 198 }, { "entropy": 0.4635009765625, "epoch": 0.7896825396825397, "grad_norm": 1.1483411264804027, "learning_rate": 8.823964971510313e-06, "loss": 0.5075, "mean_token_accuracy": 0.8307431424036622, "num_tokens": 85588482.0, "step": 199 }, { "entropy": 0.444122314453125, "epoch": 0.7936507936507936, "grad_norm": 1.0935254315768266, "learning_rate": 8.809833335272517e-06, "loss": 0.5054, "mean_token_accuracy": 0.8298458913341165, "num_tokens": 86009383.0, "step": 200 }, { "entropy": 0.4493408203125, "epoch": 0.7976190476190477, "grad_norm": 1.1018546509681295, "learning_rate": 8.795628760649965e-06, "loss": 0.5106, "mean_token_accuracy": 0.8295301357284188, "num_tokens": 86449600.0, "step": 201 }, { "entropy": 0.450439453125, "epoch": 0.8015873015873016, "grad_norm": 1.306183682510968, "learning_rate": 8.781351519585978e-06, "loss": 0.4886, "mean_token_accuracy": 0.8344141785055399, "num_tokens": 86862628.0, "step": 202 }, { "entropy": 0.449676513671875, "epoch": 0.8055555555555556, "grad_norm": 1.0824265526588595, "learning_rate": 8.767001885415055e-06, "loss": 0.5054, "mean_token_accuracy": 0.8296528598293662, "num_tokens": 87295233.0, "step": 203 }, { "entropy": 0.449310302734375, "epoch": 0.8095238095238095, "grad_norm": 1.216483297181918, "learning_rate": 8.752580132857652e-06, "loss": 0.4987, "mean_token_accuracy": 0.8328232821077108, "num_tokens": 87713395.0, "step": 204 }, { "entropy": 0.4515380859375, "epoch": 0.8134920634920635, "grad_norm": 1.1371633597502904, "learning_rate": 8.73808653801491e-06, "loss": 0.5216, "mean_token_accuracy": 0.8253697715699673, "num_tokens": 88158822.0, "step": 205 }, { "entropy": 0.44964599609375, "epoch": 0.8174603174603174, "grad_norm": 1.2076012965398912, "learning_rate": 8.723521378363378e-06, "loss": 0.5049, "mean_token_accuracy": 0.8300966452807188, "num_tokens": 88602545.0, "step": 206 }, { "entropy": 0.45513916015625, "epoch": 0.8214285714285714, "grad_norm": 1.1637271792413393, "learning_rate": 8.70888493274969e-06, "loss": 0.4854, "mean_token_accuracy": 0.8374869581311941, "num_tokens": 89025796.0, "step": 207 }, { "entropy": 0.44927978515625, "epoch": 0.8253968253968254, "grad_norm": 1.1305189795680015, "learning_rate": 8.694177481385244e-06, "loss": 0.5061, "mean_token_accuracy": 0.8304181462153792, "num_tokens": 89444255.0, "step": 208 }, { "entropy": 0.44769287109375, "epoch": 0.8293650793650794, "grad_norm": 1.065905888231706, "learning_rate": 8.679399305840815e-06, "loss": 0.511, "mean_token_accuracy": 0.8329211305826902, "num_tokens": 89894143.0, "step": 209 }, { "entropy": 0.448516845703125, "epoch": 0.8333333333333334, "grad_norm": 1.194800491826659, "learning_rate": 8.664550689041187e-06, "loss": 0.4704, "mean_token_accuracy": 0.8389384057372808, "num_tokens": 90312774.0, "step": 210 }, { "entropy": 0.451995849609375, "epoch": 0.8373015873015873, "grad_norm": 1.1324678388489409, "learning_rate": 8.649631915259716e-06, "loss": 0.4959, "mean_token_accuracy": 0.832505133934319, "num_tokens": 90741787.0, "step": 211 }, { "entropy": 0.444610595703125, "epoch": 0.8412698412698413, "grad_norm": 1.0451373377494304, "learning_rate": 8.634643270112903e-06, "loss": 0.4874, "mean_token_accuracy": 0.8343986244872212, "num_tokens": 91177447.0, "step": 212 }, { "entropy": 0.448516845703125, "epoch": 0.8452380952380952, "grad_norm": 1.1350367484478692, "learning_rate": 8.61958504055492e-06, "loss": 0.4924, "mean_token_accuracy": 0.8339378647506237, "num_tokens": 91607165.0, "step": 213 }, { "entropy": 0.45574951171875, "epoch": 0.8492063492063492, "grad_norm": 1.1435711522188763, "learning_rate": 8.604457514872115e-06, "loss": 0.4934, "mean_token_accuracy": 0.8312076451256871, "num_tokens": 92026164.0, "step": 214 }, { "entropy": 0.448028564453125, "epoch": 0.8531746031746031, "grad_norm": 1.210433236941165, "learning_rate": 8.589260982677496e-06, "loss": 0.4936, "mean_token_accuracy": 0.8334163334220648, "num_tokens": 92463989.0, "step": 215 }, { "entropy": 0.4459228515625, "epoch": 0.8571428571428571, "grad_norm": 1.2030101822851358, "learning_rate": 8.573995734905185e-06, "loss": 0.4917, "mean_token_accuracy": 0.8336746180430055, "num_tokens": 92891631.0, "step": 216 }, { "entropy": 0.4539794921875, "epoch": 0.8611111111111112, "grad_norm": 1.0466701342650107, "learning_rate": 8.558662063804843e-06, "loss": 0.5039, "mean_token_accuracy": 0.8325941441580653, "num_tokens": 93322969.0, "step": 217 }, { "entropy": 0.448883056640625, "epoch": 0.8650793650793651, "grad_norm": 1.3569379184552983, "learning_rate": 8.543260262936087e-06, "loss": 0.4942, "mean_token_accuracy": 0.8330146428197622, "num_tokens": 93760535.0, "step": 218 }, { "entropy": 0.445465087890625, "epoch": 0.8690476190476191, "grad_norm": 1.1285395121488393, "learning_rate": 8.527790627162858e-06, "loss": 0.485, "mean_token_accuracy": 0.835063835605979, "num_tokens": 94172398.0, "step": 219 }, { "entropy": 0.450775146484375, "epoch": 0.873015873015873, "grad_norm": 1.2538705581876535, "learning_rate": 8.512253452647783e-06, "loss": 0.502, "mean_token_accuracy": 0.8306903587654233, "num_tokens": 94599260.0, "step": 220 }, { "entropy": 0.45660400390625, "epoch": 0.876984126984127, "grad_norm": 1.1551796563028132, "learning_rate": 8.496649036846502e-06, "loss": 0.4946, "mean_token_accuracy": 0.8319389009848237, "num_tokens": 95019433.0, "step": 221 }, { "entropy": 0.461669921875, "epoch": 0.8809523809523809, "grad_norm": 1.2009353491848689, "learning_rate": 8.480977678501974e-06, "loss": 0.4915, "mean_token_accuracy": 0.8330316534265876, "num_tokens": 95425799.0, "step": 222 }, { "entropy": 0.45465087890625, "epoch": 0.8849206349206349, "grad_norm": 1.0850199284929676, "learning_rate": 8.465239677638755e-06, "loss": 0.4919, "mean_token_accuracy": 0.8328907387331128, "num_tokens": 95822890.0, "step": 223 }, { "entropy": 0.45330810546875, "epoch": 0.8888888888888888, "grad_norm": 1.4803897939108124, "learning_rate": 8.449435335557264e-06, "loss": 0.5054, "mean_token_accuracy": 0.8312137639150023, "num_tokens": 96260271.0, "step": 224 }, { "entropy": 0.443267822265625, "epoch": 0.8928571428571429, "grad_norm": 2.1079096762406238, "learning_rate": 8.433564954828e-06, "loss": 0.4991, "mean_token_accuracy": 0.8311476595699787, "num_tokens": 96696652.0, "step": 225 }, { "entropy": 0.450286865234375, "epoch": 0.8968253968253969, "grad_norm": 1.2706829768849834, "learning_rate": 8.417628839285757e-06, "loss": 0.4981, "mean_token_accuracy": 0.8332603024318814, "num_tokens": 97135925.0, "step": 226 }, { "entropy": 0.45703125, "epoch": 0.9007936507936508, "grad_norm": 1.8201254601577819, "learning_rate": 8.401627294023815e-06, "loss": 0.5142, "mean_token_accuracy": 0.828549837693572, "num_tokens": 97573810.0, "step": 227 }, { "entropy": 0.447784423828125, "epoch": 0.9047619047619048, "grad_norm": 1.1241933043727534, "learning_rate": 8.385560625388081e-06, "loss": 0.4831, "mean_token_accuracy": 0.8362022209912539, "num_tokens": 98011108.0, "step": 228 }, { "entropy": 0.454071044921875, "epoch": 0.9087301587301587, "grad_norm": 1.1121125737189776, "learning_rate": 8.369429140971239e-06, "loss": 0.4811, "mean_token_accuracy": 0.8338787518441677, "num_tokens": 98441631.0, "step": 229 }, { "entropy": 0.457305908203125, "epoch": 0.9126984126984127, "grad_norm": 1.0458991032894815, "learning_rate": 8.353233149606859e-06, "loss": 0.4924, "mean_token_accuracy": 0.8308598725125194, "num_tokens": 98873029.0, "step": 230 }, { "entropy": 0.453765869140625, "epoch": 0.9166666666666666, "grad_norm": 1.2247678683157683, "learning_rate": 8.336972961363472e-06, "loss": 0.498, "mean_token_accuracy": 0.8302338859066367, "num_tokens": 99296106.0, "step": 231 }, { "entropy": 0.45703125, "epoch": 0.9206349206349206, "grad_norm": 1.2989134951116341, "learning_rate": 8.320648887538657e-06, "loss": 0.4957, "mean_token_accuracy": 0.8315738271921873, "num_tokens": 99734864.0, "step": 232 }, { "entropy": 0.45330810546875, "epoch": 0.9246031746031746, "grad_norm": 1.080222766722178, "learning_rate": 8.304261240653054e-06, "loss": 0.507, "mean_token_accuracy": 0.8313342472538352, "num_tokens": 100174517.0, "step": 233 }, { "entropy": 0.460418701171875, "epoch": 0.9285714285714286, "grad_norm": 1.1572509289153226, "learning_rate": 8.287810334444406e-06, "loss": 0.4926, "mean_token_accuracy": 0.8337559709325433, "num_tokens": 100606926.0, "step": 234 }, { "entropy": 0.458404541015625, "epoch": 0.9325396825396826, "grad_norm": 1.1066868483832115, "learning_rate": 8.271296483861532e-06, "loss": 0.4829, "mean_token_accuracy": 0.835618756711483, "num_tokens": 101020425.0, "step": 235 }, { "entropy": 0.45831298828125, "epoch": 0.9365079365079365, "grad_norm": 1.060730775603579, "learning_rate": 8.254720005058317e-06, "loss": 0.4912, "mean_token_accuracy": 0.8332764646038413, "num_tokens": 101447599.0, "step": 236 }, { "entropy": 0.458953857421875, "epoch": 0.9404761904761905, "grad_norm": 1.1471857859225785, "learning_rate": 8.238081215387639e-06, "loss": 0.4843, "mean_token_accuracy": 0.8348336489871144, "num_tokens": 101851986.0, "step": 237 }, { "entropy": 0.449310302734375, "epoch": 0.9444444444444444, "grad_norm": 1.1375613016443888, "learning_rate": 8.221380433395308e-06, "loss": 0.4934, "mean_token_accuracy": 0.8338221423327923, "num_tokens": 102275358.0, "step": 238 }, { "entropy": 0.459075927734375, "epoch": 0.9484126984126984, "grad_norm": 1.0708255333770056, "learning_rate": 8.204617978813963e-06, "loss": 0.4838, "mean_token_accuracy": 0.8348392806947231, "num_tokens": 102688415.0, "step": 239 }, { "entropy": 0.457061767578125, "epoch": 0.9523809523809523, "grad_norm": 1.269015813917946, "learning_rate": 8.187794172556947e-06, "loss": 0.4901, "mean_token_accuracy": 0.832873186096549, "num_tokens": 103113293.0, "step": 240 }, { "entropy": 0.447052001953125, "epoch": 0.9563492063492064, "grad_norm": 1.2172067541370395, "learning_rate": 8.170909336712171e-06, "loss": 0.4934, "mean_token_accuracy": 0.8310654619708657, "num_tokens": 103566779.0, "step": 241 }, { "entropy": 0.442840576171875, "epoch": 0.9603174603174603, "grad_norm": 1.9614491486328336, "learning_rate": 8.153963794535945e-06, "loss": 0.4967, "mean_token_accuracy": 0.8313373932614923, "num_tokens": 104000550.0, "step": 242 }, { "entropy": 0.45098876953125, "epoch": 0.9642857142857143, "grad_norm": 1.2204808359509163, "learning_rate": 8.136957870446779e-06, "loss": 0.4998, "mean_token_accuracy": 0.830800985917449, "num_tokens": 104429372.0, "step": 243 }, { "entropy": 0.443115234375, "epoch": 0.9682539682539683, "grad_norm": 1.1287254868438927, "learning_rate": 8.119891890019187e-06, "loss": 0.486, "mean_token_accuracy": 0.8366484735161066, "num_tokens": 104859286.0, "step": 244 }, { "entropy": 0.45599365234375, "epoch": 0.9722222222222222, "grad_norm": 1.1632405758479503, "learning_rate": 8.102766179977452e-06, "loss": 0.4954, "mean_token_accuracy": 0.83047538343817, "num_tokens": 105281017.0, "step": 245 }, { "entropy": 0.44305419921875, "epoch": 0.9761904761904762, "grad_norm": 1.0531020537734286, "learning_rate": 8.085581068189358e-06, "loss": 0.4875, "mean_token_accuracy": 0.83509177621454, "num_tokens": 105729675.0, "step": 246 }, { "entropy": 0.444549560546875, "epoch": 0.9801587301587301, "grad_norm": 1.136500203665195, "learning_rate": 8.068336883659926e-06, "loss": 0.4926, "mean_token_accuracy": 0.8322630152106285, "num_tokens": 106168119.0, "step": 247 }, { "entropy": 0.442291259765625, "epoch": 0.9841269841269841, "grad_norm": 1.0192188396085724, "learning_rate": 8.051033956525113e-06, "loss": 0.484, "mean_token_accuracy": 0.8352002650499344, "num_tokens": 106603968.0, "step": 248 }, { "entropy": 0.439788818359375, "epoch": 0.9880952380952381, "grad_norm": 1.1049532946463114, "learning_rate": 8.033672618045485e-06, "loss": 0.492, "mean_token_accuracy": 0.8354252576828003, "num_tokens": 107054152.0, "step": 249 }, { "entropy": 0.440826416015625, "epoch": 0.9920634920634921, "grad_norm": 1.0599713800274446, "learning_rate": 8.016253200599885e-06, "loss": 0.4782, "mean_token_accuracy": 0.8366458043456078, "num_tokens": 107495007.0, "step": 250 }, { "entropy": 0.447113037109375, "epoch": 0.996031746031746, "grad_norm": 1.1844331984330863, "learning_rate": 7.998776037679061e-06, "loss": 0.4986, "mean_token_accuracy": 0.8293369021266699, "num_tokens": 107928758.0, "step": 251 }, { "entropy": 0.441619873046875, "epoch": 1.0, "grad_norm": 1.0144603078826888, "learning_rate": 7.981241463879284e-06, "loss": 0.4922, "mean_token_accuracy": 0.8354968074709177, "num_tokens": 108364335.0, "step": 252 }, { "entropy": 0.46148681640625, "epoch": 1.003968253968254, "grad_norm": 1.1133036368721527, "learning_rate": 7.963649814895945e-06, "loss": 0.4675, "mean_token_accuracy": 0.8393758479505777, "num_tokens": 108775586.0, "step": 253 }, { "entropy": 0.452392578125, "epoch": 1.007936507936508, "grad_norm": 1.0079553576284006, "learning_rate": 7.94600142751713e-06, "loss": 0.4619, "mean_token_accuracy": 0.8416725508868694, "num_tokens": 109202665.0, "step": 254 }, { "entropy": 0.4403076171875, "epoch": 1.0119047619047619, "grad_norm": 1.0665471851715955, "learning_rate": 7.92829663961716e-06, "loss": 0.4616, "mean_token_accuracy": 0.843192096799612, "num_tokens": 109629975.0, "step": 255 }, { "entropy": 0.440765380859375, "epoch": 1.0158730158730158, "grad_norm": 1.0527949047806084, "learning_rate": 7.910535790150135e-06, "loss": 0.4684, "mean_token_accuracy": 0.8393022352829576, "num_tokens": 110061605.0, "step": 256 }, { "entropy": 0.443817138671875, "epoch": 1.0198412698412698, "grad_norm": 1.037337532935931, "learning_rate": 7.892719219143446e-06, "loss": 0.458, "mean_token_accuracy": 0.842767583206296, "num_tokens": 110487591.0, "step": 257 }, { "entropy": 0.4439697265625, "epoch": 1.0238095238095237, "grad_norm": 0.9282961355601993, "learning_rate": 7.874847267691254e-06, "loss": 0.4674, "mean_token_accuracy": 0.8391132960096002, "num_tokens": 110924491.0, "step": 258 }, { "entropy": 0.44256591796875, "epoch": 1.0277777777777777, "grad_norm": 1.0812964655312522, "learning_rate": 7.856920277947969e-06, "loss": 0.4666, "mean_token_accuracy": 0.8417868306860328, "num_tokens": 111351831.0, "step": 259 }, { "entropy": 0.442596435546875, "epoch": 1.0317460317460316, "grad_norm": 1.0004332183195612, "learning_rate": 7.83893859312169e-06, "loss": 0.4608, "mean_token_accuracy": 0.840317826718092, "num_tokens": 111773832.0, "step": 260 }, { "entropy": 0.441650390625, "epoch": 1.0357142857142858, "grad_norm": 1.0083023934199706, "learning_rate": 7.820902557467648e-06, "loss": 0.4546, "mean_token_accuracy": 0.8436138844117522, "num_tokens": 112210334.0, "step": 261 }, { "entropy": 0.440338134765625, "epoch": 1.0396825396825398, "grad_norm": 1.0205115508469926, "learning_rate": 7.80281251628161e-06, "loss": 0.4617, "mean_token_accuracy": 0.8404037207365036, "num_tokens": 112637470.0, "step": 262 }, { "entropy": 0.43670654296875, "epoch": 1.0436507936507937, "grad_norm": 1.1861875486087046, "learning_rate": 7.784668815893256e-06, "loss": 0.465, "mean_token_accuracy": 0.8401956735178828, "num_tokens": 113069179.0, "step": 263 }, { "entropy": 0.44329833984375, "epoch": 1.0476190476190477, "grad_norm": 1.0426651868796517, "learning_rate": 7.766471803659571e-06, "loss": 0.4725, "mean_token_accuracy": 0.8395506730303168, "num_tokens": 113501590.0, "step": 264 }, { "entropy": 0.440948486328125, "epoch": 1.0515873015873016, "grad_norm": 1.0688154361912685, "learning_rate": 7.748221827958174e-06, "loss": 0.463, "mean_token_accuracy": 0.8411337668076158, "num_tokens": 113935337.0, "step": 265 }, { "entropy": 0.44378662109375, "epoch": 1.0555555555555556, "grad_norm": 0.9973458392577903, "learning_rate": 7.729919238180663e-06, "loss": 0.4644, "mean_token_accuracy": 0.8407721919938922, "num_tokens": 114360533.0, "step": 266 }, { "entropy": 0.4375, "epoch": 1.0595238095238095, "grad_norm": 1.0128154565462195, "learning_rate": 7.711564384725916e-06, "loss": 0.456, "mean_token_accuracy": 0.8427523402497172, "num_tokens": 114792424.0, "step": 267 }, { "entropy": 0.43865966796875, "epoch": 1.0634920634920635, "grad_norm": 1.1237442568992235, "learning_rate": 7.693157618993392e-06, "loss": 0.4713, "mean_token_accuracy": 0.8381293760612607, "num_tokens": 115231953.0, "step": 268 }, { "entropy": 0.44390869140625, "epoch": 1.0674603174603174, "grad_norm": 0.9632813464843945, "learning_rate": 7.674699293376397e-06, "loss": 0.4606, "mean_token_accuracy": 0.8414155915379524, "num_tokens": 115664522.0, "step": 269 }, { "entropy": 0.439239501953125, "epoch": 1.0714285714285714, "grad_norm": 1.1143536721017135, "learning_rate": 7.656189761255333e-06, "loss": 0.4585, "mean_token_accuracy": 0.8407229576259851, "num_tokens": 116092221.0, "step": 270 }, { "entropy": 0.4417724609375, "epoch": 1.0753968253968254, "grad_norm": 1.0175840618853507, "learning_rate": 7.63762937699095e-06, "loss": 0.4619, "mean_token_accuracy": 0.8408547407016158, "num_tokens": 116534679.0, "step": 271 }, { "entropy": 0.4439697265625, "epoch": 1.0793650793650793, "grad_norm": 1.0025546600901896, "learning_rate": 7.619018495917543e-06, "loss": 0.4696, "mean_token_accuracy": 0.8394848993048072, "num_tokens": 116984739.0, "step": 272 }, { "entropy": 0.44073486328125, "epoch": 1.0833333333333333, "grad_norm": 1.0897542155601712, "learning_rate": 7.600357474336157e-06, "loss": 0.4662, "mean_token_accuracy": 0.8403668319806457, "num_tokens": 117413323.0, "step": 273 }, { "entropy": 0.4364013671875, "epoch": 1.0873015873015872, "grad_norm": 1.026521342719511, "learning_rate": 7.581646669507768e-06, "loss": 0.4631, "mean_token_accuracy": 0.8399766776710749, "num_tokens": 117852991.0, "step": 274 }, { "entropy": 0.4500732421875, "epoch": 1.0912698412698412, "grad_norm": 1.1089611631121021, "learning_rate": 7.56288643964644e-06, "loss": 0.4686, "mean_token_accuracy": 0.8402743814513087, "num_tokens": 118264477.0, "step": 275 }, { "entropy": 0.440032958984375, "epoch": 1.0952380952380953, "grad_norm": 1.1837449681611911, "learning_rate": 7.544077143912467e-06, "loss": 0.4596, "mean_token_accuracy": 0.8443190716207027, "num_tokens": 118696927.0, "step": 276 }, { "entropy": 0.43536376953125, "epoch": 1.0992063492063493, "grad_norm": 1.0567641917315522, "learning_rate": 7.525219142405501e-06, "loss": 0.4645, "mean_token_accuracy": 0.8398779211565852, "num_tokens": 119143061.0, "step": 277 }, { "entropy": 0.4447021484375, "epoch": 1.1031746031746033, "grad_norm": 1.0628873288461702, "learning_rate": 7.506312796157649e-06, "loss": 0.464, "mean_token_accuracy": 0.8407185869291425, "num_tokens": 119569613.0, "step": 278 }, { "entropy": 0.44366455078125, "epoch": 1.1071428571428572, "grad_norm": 1.3089788931081365, "learning_rate": 7.487358467126573e-06, "loss": 0.4666, "mean_token_accuracy": 0.8411134304478765, "num_tokens": 119990044.0, "step": 279 }, { "entropy": 0.4305419921875, "epoch": 1.1111111111111112, "grad_norm": 1.200277045741654, "learning_rate": 7.468356518188551e-06, "loss": 0.4687, "mean_token_accuracy": 0.83890818990767, "num_tokens": 120447089.0, "step": 280 }, { "entropy": 0.435943603515625, "epoch": 1.1150793650793651, "grad_norm": 1.065088410503753, "learning_rate": 7.449307313131533e-06, "loss": 0.4481, "mean_token_accuracy": 0.846671967767179, "num_tokens": 120882118.0, "step": 281 }, { "entropy": 0.4400634765625, "epoch": 1.119047619047619, "grad_norm": 1.0435830370708483, "learning_rate": 7.4302112166481814e-06, "loss": 0.4653, "mean_token_accuracy": 0.8401108030229807, "num_tokens": 121314886.0, "step": 282 }, { "entropy": 0.444610595703125, "epoch": 1.123015873015873, "grad_norm": 1.1498875512493505, "learning_rate": 7.411068594328876e-06, "loss": 0.4506, "mean_token_accuracy": 0.8450519479811192, "num_tokens": 121731396.0, "step": 283 }, { "entropy": 0.441192626953125, "epoch": 1.126984126984127, "grad_norm": 1.1037530140349723, "learning_rate": 7.391879812654727e-06, "loss": 0.4573, "mean_token_accuracy": 0.8432380286976695, "num_tokens": 122167617.0, "step": 284 }, { "entropy": 0.436553955078125, "epoch": 1.130952380952381, "grad_norm": 1.2008296365359707, "learning_rate": 7.37264523899056e-06, "loss": 0.4564, "mean_token_accuracy": 0.8409950910136104, "num_tokens": 122593508.0, "step": 285 }, { "entropy": 0.439788818359375, "epoch": 1.1349206349206349, "grad_norm": 1.1519884106136846, "learning_rate": 7.353365241577869e-06, "loss": 0.4606, "mean_token_accuracy": 0.839851806871593, "num_tokens": 123013840.0, "step": 286 }, { "entropy": 0.43341064453125, "epoch": 1.1388888888888888, "grad_norm": 1.0329372716274068, "learning_rate": 7.3340401895277816e-06, "loss": 0.4498, "mean_token_accuracy": 0.8443695362657309, "num_tokens": 123444043.0, "step": 287 }, { "entropy": 0.436676025390625, "epoch": 1.1428571428571428, "grad_norm": 1.0218663400951138, "learning_rate": 7.314670452813982e-06, "loss": 0.4503, "mean_token_accuracy": 0.8440707307308912, "num_tokens": 123876490.0, "step": 288 }, { "entropy": 0.44293212890625, "epoch": 1.1468253968253967, "grad_norm": 1.0595566545611714, "learning_rate": 7.295256402265636e-06, "loss": 0.4561, "mean_token_accuracy": 0.841067879460752, "num_tokens": 124297019.0, "step": 289 }, { "entropy": 0.44622802734375, "epoch": 1.1507936507936507, "grad_norm": 1.1333083345633674, "learning_rate": 7.275798409560282e-06, "loss": 0.4617, "mean_token_accuracy": 0.8422295236960053, "num_tokens": 124713314.0, "step": 290 }, { "entropy": 0.44403076171875, "epoch": 1.1547619047619047, "grad_norm": 1.1923827872697734, "learning_rate": 7.256296847216727e-06, "loss": 0.4573, "mean_token_accuracy": 0.8406451418995857, "num_tokens": 125125061.0, "step": 291 }, { "entropy": 0.440155029296875, "epoch": 1.1587301587301586, "grad_norm": 1.1646433646945646, "learning_rate": 7.236752088587905e-06, "loss": 0.4735, "mean_token_accuracy": 0.8386099971830845, "num_tokens": 125564746.0, "step": 292 }, { "entropy": 0.435272216796875, "epoch": 1.1626984126984128, "grad_norm": 1.1116112176497874, "learning_rate": 7.217164507853734e-06, "loss": 0.4531, "mean_token_accuracy": 0.8449215041473508, "num_tokens": 125992351.0, "step": 293 }, { "entropy": 0.440032958984375, "epoch": 1.1666666666666667, "grad_norm": 1.0397784652565205, "learning_rate": 7.197534480013951e-06, "loss": 0.4515, "mean_token_accuracy": 0.8436530968174338, "num_tokens": 126415997.0, "step": 294 }, { "entropy": 0.44482421875, "epoch": 1.1706349206349207, "grad_norm": 1.129298764751686, "learning_rate": 7.177862380880935e-06, "loss": 0.4629, "mean_token_accuracy": 0.841444781050086, "num_tokens": 126851930.0, "step": 295 }, { "entropy": 0.44580078125, "epoch": 1.1746031746031746, "grad_norm": 1.0985527605182936, "learning_rate": 7.158148587072509e-06, "loss": 0.467, "mean_token_accuracy": 0.8395384335890412, "num_tokens": 127285760.0, "step": 296 }, { "entropy": 0.455108642578125, "epoch": 1.1785714285714286, "grad_norm": 1.2001077801428681, "learning_rate": 7.138393476004725e-06, "loss": 0.4803, "mean_token_accuracy": 0.8372842157259583, "num_tokens": 127724762.0, "step": 297 }, { "entropy": 0.43841552734375, "epoch": 1.1825396825396826, "grad_norm": 1.054003052207074, "learning_rate": 7.118597425884659e-06, "loss": 0.4523, "mean_token_accuracy": 0.8465767158195376, "num_tokens": 128153685.0, "step": 298 }, { "entropy": 0.443328857421875, "epoch": 1.1865079365079365, "grad_norm": 1.0655995217798397, "learning_rate": 7.098760815703139e-06, "loss": 0.4531, "mean_token_accuracy": 0.8448374746367335, "num_tokens": 128574985.0, "step": 299 }, { "entropy": 0.452362060546875, "epoch": 1.1904761904761905, "grad_norm": 1.1076879019132861, "learning_rate": 7.078884025227519e-06, "loss": 0.4515, "mean_token_accuracy": 0.8428602814674377, "num_tokens": 128990738.0, "step": 300 }, { "entropy": 0.4468994140625, "epoch": 1.1944444444444444, "grad_norm": 1.096401426354454, "learning_rate": 7.058967434994388e-06, "loss": 0.4526, "mean_token_accuracy": 0.8467154111713171, "num_tokens": 129413253.0, "step": 301 }, { "entropy": 0.444061279296875, "epoch": 1.1984126984126984, "grad_norm": 0.9851920784045842, "learning_rate": 7.0390114263022955e-06, "loss": 0.474, "mean_token_accuracy": 0.8386435657739639, "num_tokens": 129848900.0, "step": 302 }, { "entropy": 0.44317626953125, "epoch": 1.2023809523809523, "grad_norm": 1.112135152774716, "learning_rate": 7.019016381204448e-06, "loss": 0.4553, "mean_token_accuracy": 0.8430305812507868, "num_tokens": 130278951.0, "step": 303 }, { "entropy": 0.444427490234375, "epoch": 1.2063492063492063, "grad_norm": 1.1661189845303515, "learning_rate": 6.998982682501394e-06, "loss": 0.4629, "mean_token_accuracy": 0.841990914195776, "num_tokens": 130724709.0, "step": 304 }, { "entropy": 0.445404052734375, "epoch": 1.2103174603174602, "grad_norm": 0.9959690543341396, "learning_rate": 6.978910713733696e-06, "loss": 0.4429, "mean_token_accuracy": 0.8485971093177795, "num_tokens": 131151665.0, "step": 305 }, { "entropy": 0.438751220703125, "epoch": 1.2142857142857142, "grad_norm": 0.9834937169980936, "learning_rate": 6.958800859174591e-06, "loss": 0.4491, "mean_token_accuracy": 0.845764022320509, "num_tokens": 131582811.0, "step": 306 }, { "entropy": 0.442840576171875, "epoch": 1.2182539682539684, "grad_norm": 1.0523226181088532, "learning_rate": 6.938653503822628e-06, "loss": 0.4574, "mean_token_accuracy": 0.8434069091454148, "num_tokens": 131998529.0, "step": 307 }, { "entropy": 0.4339599609375, "epoch": 1.2222222222222223, "grad_norm": 1.0371255492047888, "learning_rate": 6.9184690333942995e-06, "loss": 0.4517, "mean_token_accuracy": 0.8438770910724998, "num_tokens": 132429743.0, "step": 308 }, { "entropy": 0.439239501953125, "epoch": 1.2261904761904763, "grad_norm": 1.1404078217146265, "learning_rate": 6.898247834316662e-06, "loss": 0.4576, "mean_token_accuracy": 0.8416583137586713, "num_tokens": 132864811.0, "step": 309 }, { "entropy": 0.437103271484375, "epoch": 1.2301587301587302, "grad_norm": 1.0196151103714386, "learning_rate": 6.877990293719928e-06, "loss": 0.4611, "mean_token_accuracy": 0.8426391445100307, "num_tokens": 133291943.0, "step": 310 }, { "entropy": 0.4429931640625, "epoch": 1.2341269841269842, "grad_norm": 1.1597754105733091, "learning_rate": 6.857696799430064e-06, "loss": 0.4594, "mean_token_accuracy": 0.8428373141214252, "num_tokens": 133728664.0, "step": 311 }, { "entropy": 0.442169189453125, "epoch": 1.2380952380952381, "grad_norm": 1.0933297455326956, "learning_rate": 6.83736773996136e-06, "loss": 0.4495, "mean_token_accuracy": 0.8465461218729615, "num_tokens": 134149814.0, "step": 312 }, { "entropy": 0.444610595703125, "epoch": 1.242063492063492, "grad_norm": 0.9545364491465045, "learning_rate": 6.817003504508993e-06, "loss": 0.4453, "mean_token_accuracy": 0.8452331237494946, "num_tokens": 134567037.0, "step": 313 }, { "entropy": 0.441436767578125, "epoch": 1.246031746031746, "grad_norm": 0.9909665760096847, "learning_rate": 6.796604482941578e-06, "loss": 0.4474, "mean_token_accuracy": 0.8466871501877904, "num_tokens": 134989406.0, "step": 314 }, { "entropy": 0.43414306640625, "epoch": 1.25, "grad_norm": 1.0159907252981955, "learning_rate": 6.7761710657936995e-06, "loss": 0.4361, "mean_token_accuracy": 0.8494271822273731, "num_tokens": 135405949.0, "step": 315 }, { "entropy": 0.436004638671875, "epoch": 1.253968253968254, "grad_norm": 1.1634799840745833, "learning_rate": 6.75570364425844e-06, "loss": 0.4552, "mean_token_accuracy": 0.8439184688031673, "num_tokens": 135832642.0, "step": 316 }, { "entropy": 0.43035888671875, "epoch": 1.257936507936508, "grad_norm": 1.0848830841192156, "learning_rate": 6.735202610179886e-06, "loss": 0.4588, "mean_token_accuracy": 0.8425602596253157, "num_tokens": 136281104.0, "step": 317 }, { "entropy": 0.4400634765625, "epoch": 1.2619047619047619, "grad_norm": 1.1024831215933177, "learning_rate": 6.714668356045629e-06, "loss": 0.4459, "mean_token_accuracy": 0.8458384843543172, "num_tokens": 136724748.0, "step": 318 }, { "entropy": 0.437774658203125, "epoch": 1.2658730158730158, "grad_norm": 1.14453363380739, "learning_rate": 6.694101274979253e-06, "loss": 0.4484, "mean_token_accuracy": 0.8426429070532322, "num_tokens": 137144383.0, "step": 319 }, { "entropy": 0.44573974609375, "epoch": 1.2698412698412698, "grad_norm": 1.1202850192609648, "learning_rate": 6.673501760732805e-06, "loss": 0.4575, "mean_token_accuracy": 0.8433046471327543, "num_tokens": 137570382.0, "step": 320 }, { "entropy": 0.439056396484375, "epoch": 1.2738095238095237, "grad_norm": 1.1686361321236263, "learning_rate": 6.652870207679253e-06, "loss": 0.4525, "mean_token_accuracy": 0.8428729372099042, "num_tokens": 138002323.0, "step": 321 }, { "entropy": 0.43701171875, "epoch": 1.2777777777777777, "grad_norm": 1.1692980704447018, "learning_rate": 6.632207010804949e-06, "loss": 0.4576, "mean_token_accuracy": 0.8453587293624878, "num_tokens": 138431194.0, "step": 322 }, { "entropy": 0.439239501953125, "epoch": 1.2817460317460316, "grad_norm": 1.0283968957929952, "learning_rate": 6.611512565702053e-06, "loss": 0.4494, "mean_token_accuracy": 0.8435638211667538, "num_tokens": 138863136.0, "step": 323 }, { "entropy": 0.43597412109375, "epoch": 1.2857142857142856, "grad_norm": 1.0723867427352887, "learning_rate": 6.590787268560967e-06, "loss": 0.4349, "mean_token_accuracy": 0.8492929134517908, "num_tokens": 139287890.0, "step": 324 }, { "entropy": 0.4398193359375, "epoch": 1.2896825396825398, "grad_norm": 1.0222079112541533, "learning_rate": 6.570031516162746e-06, "loss": 0.4585, "mean_token_accuracy": 0.8433736823499203, "num_tokens": 139730663.0, "step": 325 }, { "entropy": 0.435150146484375, "epoch": 1.2936507936507937, "grad_norm": 0.9275873017340585, "learning_rate": 6.549245705871507e-06, "loss": 0.4499, "mean_token_accuracy": 0.8432614449411631, "num_tokens": 140160179.0, "step": 326 }, { "entropy": 0.43756103515625, "epoch": 1.2976190476190477, "grad_norm": 1.174514802084351, "learning_rate": 6.528430235626819e-06, "loss": 0.4463, "mean_token_accuracy": 0.8453215239569545, "num_tokens": 140577958.0, "step": 327 }, { "entropy": 0.433258056640625, "epoch": 1.3015873015873016, "grad_norm": 1.091000460313449, "learning_rate": 6.5075855039360805e-06, "loss": 0.4632, "mean_token_accuracy": 0.8417082950472832, "num_tokens": 141002875.0, "step": 328 }, { "entropy": 0.43377685546875, "epoch": 1.3055555555555556, "grad_norm": 0.9951305912978812, "learning_rate": 6.486711909866895e-06, "loss": 0.445, "mean_token_accuracy": 0.8452390227466822, "num_tokens": 141425392.0, "step": 329 }, { "entropy": 0.436004638671875, "epoch": 1.3095238095238095, "grad_norm": 0.9773602377225085, "learning_rate": 6.465809853039431e-06, "loss": 0.4429, "mean_token_accuracy": 0.8470056857913733, "num_tokens": 141858286.0, "step": 330 }, { "entropy": 0.44110107421875, "epoch": 1.3134920634920635, "grad_norm": 1.0492801166182826, "learning_rate": 6.444879733618766e-06, "loss": 0.4432, "mean_token_accuracy": 0.8470598505809903, "num_tokens": 142279417.0, "step": 331 }, { "entropy": 0.439056396484375, "epoch": 1.3174603174603174, "grad_norm": 0.9459765835539803, "learning_rate": 6.423921952307237e-06, "loss": 0.4471, "mean_token_accuracy": 0.8453462338075042, "num_tokens": 142698339.0, "step": 332 }, { "entropy": 0.436981201171875, "epoch": 1.3214285714285714, "grad_norm": 1.075628581502009, "learning_rate": 6.4029369103367545e-06, "loss": 0.4424, "mean_token_accuracy": 0.8465406149625778, "num_tokens": 143128013.0, "step": 333 }, { "entropy": 0.43994140625, "epoch": 1.3253968253968254, "grad_norm": 1.0287829199461864, "learning_rate": 6.381925009461128e-06, "loss": 0.4456, "mean_token_accuracy": 0.8456096695736051, "num_tokens": 143561112.0, "step": 334 }, { "entropy": 0.441192626953125, "epoch": 1.3293650793650793, "grad_norm": 1.1380572333251808, "learning_rate": 6.3608866519483825e-06, "loss": 0.4498, "mean_token_accuracy": 0.844082260504365, "num_tokens": 143970890.0, "step": 335 }, { "entropy": 0.435333251953125, "epoch": 1.3333333333333333, "grad_norm": 1.102203799703573, "learning_rate": 6.339822240573041e-06, "loss": 0.4476, "mean_token_accuracy": 0.8457837710157037, "num_tokens": 144390223.0, "step": 336 }, { "entropy": 0.43310546875, "epoch": 1.3373015873015874, "grad_norm": 1.0745564478696599, "learning_rate": 6.3187321786084236e-06, "loss": 0.4609, "mean_token_accuracy": 0.8417782466858625, "num_tokens": 144839957.0, "step": 337 }, { "entropy": 0.4366455078125, "epoch": 1.3412698412698414, "grad_norm": 1.1064436100800052, "learning_rate": 6.297616869818926e-06, "loss": 0.4627, "mean_token_accuracy": 0.8423483874648809, "num_tokens": 145276276.0, "step": 338 }, { "entropy": 0.43682861328125, "epoch": 1.3452380952380953, "grad_norm": 1.0627782502876304, "learning_rate": 6.276476718452289e-06, "loss": 0.4599, "mean_token_accuracy": 0.8434413159266114, "num_tokens": 145722320.0, "step": 339 }, { "entropy": 0.440948486328125, "epoch": 1.3492063492063493, "grad_norm": 1.0311065316684267, "learning_rate": 6.2553121292318595e-06, "loss": 0.4445, "mean_token_accuracy": 0.8466370198875666, "num_tokens": 146148957.0, "step": 340 }, { "entropy": 0.44580078125, "epoch": 1.3531746031746033, "grad_norm": 0.9538685008179283, "learning_rate": 6.23412350734884e-06, "loss": 0.4571, "mean_token_accuracy": 0.8417170522734523, "num_tokens": 146580318.0, "step": 341 }, { "entropy": 0.441864013671875, "epoch": 1.3571428571428572, "grad_norm": 1.0998343004271525, "learning_rate": 6.2129112584545325e-06, "loss": 0.4437, "mean_token_accuracy": 0.846907963976264, "num_tokens": 146999892.0, "step": 342 }, { "entropy": 0.441070556640625, "epoch": 1.3611111111111112, "grad_norm": 1.0173140297071601, "learning_rate": 6.191675788652574e-06, "loss": 0.4461, "mean_token_accuracy": 0.8460167152807117, "num_tokens": 147436184.0, "step": 343 }, { "entropy": 0.4295654296875, "epoch": 1.3650793650793651, "grad_norm": 1.0290558215509458, "learning_rate": 6.170417504491157e-06, "loss": 0.4541, "mean_token_accuracy": 0.8437853921204805, "num_tokens": 147888947.0, "step": 344 }, { "entropy": 0.441253662109375, "epoch": 1.369047619047619, "grad_norm": 0.9977939913686099, "learning_rate": 6.149136812955256e-06, "loss": 0.4605, "mean_token_accuracy": 0.8413437977433205, "num_tokens": 148330624.0, "step": 345 }, { "entropy": 0.44482421875, "epoch": 1.373015873015873, "grad_norm": 0.9862218483442303, "learning_rate": 6.1278341214588255e-06, "loss": 0.4608, "mean_token_accuracy": 0.84361382573843, "num_tokens": 148771994.0, "step": 346 }, { "entropy": 0.43817138671875, "epoch": 1.376984126984127, "grad_norm": 1.0974460607418992, "learning_rate": 6.106509837837004e-06, "loss": 0.4468, "mean_token_accuracy": 0.8459707852452993, "num_tokens": 149203608.0, "step": 347 }, { "entropy": 0.435211181640625, "epoch": 1.380952380952381, "grad_norm": 0.9546922816485226, "learning_rate": 6.0851643703383066e-06, "loss": 0.4456, "mean_token_accuracy": 0.8459897711873055, "num_tokens": 149626353.0, "step": 348 }, { "entropy": 0.43658447265625, "epoch": 1.3849206349206349, "grad_norm": 1.0823837316088047, "learning_rate": 6.063798127616811e-06, "loss": 0.4447, "mean_token_accuracy": 0.8457578187808394, "num_tokens": 150036189.0, "step": 349 }, { "entropy": 0.437774658203125, "epoch": 1.3888888888888888, "grad_norm": 1.1008934320855421, "learning_rate": 6.042411518724327e-06, "loss": 0.4402, "mean_token_accuracy": 0.84851832408458, "num_tokens": 150484433.0, "step": 350 }, { "entropy": 0.441436767578125, "epoch": 1.3928571428571428, "grad_norm": 1.0679863117222357, "learning_rate": 6.021004953102576e-06, "loss": 0.4475, "mean_token_accuracy": 0.8463964462280273, "num_tokens": 150916869.0, "step": 351 }, { "entropy": 0.445831298828125, "epoch": 1.3968253968253967, "grad_norm": 1.0542706083048947, "learning_rate": 5.999578840575342e-06, "loss": 0.4504, "mean_token_accuracy": 0.8455899534747005, "num_tokens": 151351171.0, "step": 352 }, { "entropy": 0.438507080078125, "epoch": 1.4007936507936507, "grad_norm": 0.987561703544306, "learning_rate": 5.978133591340633e-06, "loss": 0.4494, "mean_token_accuracy": 0.8452698877081275, "num_tokens": 151779921.0, "step": 353 }, { "entropy": 0.435516357421875, "epoch": 1.4047619047619047, "grad_norm": 1.1262078381527667, "learning_rate": 5.956669615962821e-06, "loss": 0.4602, "mean_token_accuracy": 0.8407345684245229, "num_tokens": 152198704.0, "step": 354 }, { "entropy": 0.43695068359375, "epoch": 1.4087301587301586, "grad_norm": 1.0842467706193302, "learning_rate": 5.935187325364791e-06, "loss": 0.4504, "mean_token_accuracy": 0.8444310743361712, "num_tokens": 152607114.0, "step": 355 }, { "entropy": 0.442291259765625, "epoch": 1.4126984126984126, "grad_norm": 0.8869868658428021, "learning_rate": 5.913687130820064e-06, "loss": 0.4441, "mean_token_accuracy": 0.846776382997632, "num_tokens": 153027562.0, "step": 356 }, { "entropy": 0.439239501953125, "epoch": 1.4166666666666667, "grad_norm": 1.0275339768252305, "learning_rate": 5.892169443944929e-06, "loss": 0.443, "mean_token_accuracy": 0.84731434751302, "num_tokens": 153449258.0, "step": 357 }, { "entropy": 0.4425048828125, "epoch": 1.4206349206349207, "grad_norm": 0.9646996873181736, "learning_rate": 5.870634676690564e-06, "loss": 0.4433, "mean_token_accuracy": 0.8452265271916986, "num_tokens": 153863890.0, "step": 358 }, { "entropy": 0.441680908203125, "epoch": 1.4246031746031746, "grad_norm": 1.0623013087943407, "learning_rate": 5.8490832413351465e-06, "loss": 0.4484, "mean_token_accuracy": 0.8456388972699642, "num_tokens": 154280797.0, "step": 359 }, { "entropy": 0.4415283203125, "epoch": 1.4285714285714286, "grad_norm": 0.9302981043880288, "learning_rate": 5.827515550475955e-06, "loss": 0.4499, "mean_token_accuracy": 0.8448391910642385, "num_tokens": 154707913.0, "step": 360 }, { "entropy": 0.437255859375, "epoch": 1.4325396825396826, "grad_norm": 0.9416953081304574, "learning_rate": 5.805932017021486e-06, "loss": 0.4486, "mean_token_accuracy": 0.8438430884853005, "num_tokens": 155150096.0, "step": 361 }, { "entropy": 0.43389892578125, "epoch": 1.4365079365079365, "grad_norm": 0.9373065849374296, "learning_rate": 5.784333054183533e-06, "loss": 0.4449, "mean_token_accuracy": 0.8454085243865848, "num_tokens": 155590050.0, "step": 362 }, { "entropy": 0.437469482421875, "epoch": 1.4404761904761905, "grad_norm": 0.9209854720626441, "learning_rate": 5.762719075469277e-06, "loss": 0.4465, "mean_token_accuracy": 0.846617016941309, "num_tokens": 156016093.0, "step": 363 }, { "entropy": 0.436737060546875, "epoch": 1.4444444444444444, "grad_norm": 0.9861130639611431, "learning_rate": 5.741090494673386e-06, "loss": 0.443, "mean_token_accuracy": 0.8471564138308167, "num_tokens": 156449879.0, "step": 364 }, { "entropy": 0.441497802734375, "epoch": 1.4484126984126984, "grad_norm": 0.9886455980759782, "learning_rate": 5.719447725870071e-06, "loss": 0.4337, "mean_token_accuracy": 0.849870765581727, "num_tokens": 156866761.0, "step": 365 }, { "entropy": 0.4375, "epoch": 1.4523809523809523, "grad_norm": 0.9241839444207883, "learning_rate": 5.697791183405174e-06, "loss": 0.4333, "mean_token_accuracy": 0.8499069400131702, "num_tokens": 157304143.0, "step": 366 }, { "entropy": 0.43499755859375, "epoch": 1.4563492063492063, "grad_norm": 0.9452367705103182, "learning_rate": 5.67612128188823e-06, "loss": 0.4617, "mean_token_accuracy": 0.8407938601449132, "num_tokens": 157758390.0, "step": 367 }, { "entropy": 0.440948486328125, "epoch": 1.4603174603174602, "grad_norm": 1.0583903459607955, "learning_rate": 5.654438436184531e-06, "loss": 0.4393, "mean_token_accuracy": 0.845472626388073, "num_tokens": 158177988.0, "step": 368 }, { "entropy": 0.427886962890625, "epoch": 1.4642857142857144, "grad_norm": 1.025858650536215, "learning_rate": 5.6327430614071794e-06, "loss": 0.4551, "mean_token_accuracy": 0.843145564198494, "num_tokens": 158634349.0, "step": 369 }, { "entropy": 0.43743896484375, "epoch": 1.4682539682539684, "grad_norm": 0.9049075091671224, "learning_rate": 5.611035572909147e-06, "loss": 0.4462, "mean_token_accuracy": 0.8464264376088977, "num_tokens": 159060066.0, "step": 370 }, { "entropy": 0.440399169921875, "epoch": 1.4722222222222223, "grad_norm": 1.1144452604957675, "learning_rate": 5.589316386275318e-06, "loss": 0.4474, "mean_token_accuracy": 0.8443919736891985, "num_tokens": 159490031.0, "step": 371 }, { "entropy": 0.431396484375, "epoch": 1.4761904761904763, "grad_norm": 1.058425365791423, "learning_rate": 5.567585917314535e-06, "loss": 0.4494, "mean_token_accuracy": 0.8443618472665548, "num_tokens": 159942986.0, "step": 372 }, { "entropy": 0.44146728515625, "epoch": 1.4801587301587302, "grad_norm": 1.0282647508500027, "learning_rate": 5.545844582051641e-06, "loss": 0.4265, "mean_token_accuracy": 0.8528409609571099, "num_tokens": 160355322.0, "step": 373 }, { "entropy": 0.43463134765625, "epoch": 1.4841269841269842, "grad_norm": 1.019933540593848, "learning_rate": 5.524092796719507e-06, "loss": 0.4521, "mean_token_accuracy": 0.8433405430987477, "num_tokens": 160782816.0, "step": 374 }, { "entropy": 0.435699462890625, "epoch": 1.4880952380952381, "grad_norm": 0.9737519159679464, "learning_rate": 5.502330977751072e-06, "loss": 0.4462, "mean_token_accuracy": 0.8467091489583254, "num_tokens": 161216771.0, "step": 375 }, { "entropy": 0.4390869140625, "epoch": 1.492063492063492, "grad_norm": 1.0788036881829521, "learning_rate": 5.4805595417713634e-06, "loss": 0.4353, "mean_token_accuracy": 0.8512382041662931, "num_tokens": 161643512.0, "step": 376 }, { "entropy": 0.43292236328125, "epoch": 1.496031746031746, "grad_norm": 1.1542521219056112, "learning_rate": 5.458778905589528e-06, "loss": 0.4366, "mean_token_accuracy": 0.8494954742491245, "num_tokens": 162077647.0, "step": 377 }, { "entropy": 0.4354248046875, "epoch": 1.5, "grad_norm": 1.072458349595571, "learning_rate": 5.436989486190846e-06, "loss": 0.4335, "mean_token_accuracy": 0.8492478728294373, "num_tokens": 162503001.0, "step": 378 }, { "entropy": 0.43646240234375, "epoch": 1.503968253968254, "grad_norm": 0.9350737480343512, "learning_rate": 5.415191700728749e-06, "loss": 0.4548, "mean_token_accuracy": 0.8452032124623656, "num_tokens": 162949686.0, "step": 379 }, { "entropy": 0.430450439453125, "epoch": 1.507936507936508, "grad_norm": 0.9306004319608026, "learning_rate": 5.393385966516838e-06, "loss": 0.4397, "mean_token_accuracy": 0.8475316297262907, "num_tokens": 163388010.0, "step": 380 }, { "entropy": 0.4310302734375, "epoch": 1.5119047619047619, "grad_norm": 0.985227817450923, "learning_rate": 5.371572701020891e-06, "loss": 0.4341, "mean_token_accuracy": 0.8477435661479831, "num_tokens": 163816567.0, "step": 381 }, { "entropy": 0.4312744140625, "epoch": 1.5158730158730158, "grad_norm": 0.9677192291245226, "learning_rate": 5.349752321850866e-06, "loss": 0.448, "mean_token_accuracy": 0.8447540532797575, "num_tokens": 164270399.0, "step": 382 }, { "entropy": 0.429290771484375, "epoch": 1.5198412698412698, "grad_norm": 0.9069780240418857, "learning_rate": 5.327925246752917e-06, "loss": 0.4293, "mean_token_accuracy": 0.8511379426345229, "num_tokens": 164712402.0, "step": 383 }, { "entropy": 0.428375244140625, "epoch": 1.5238095238095237, "grad_norm": 0.9600603128750645, "learning_rate": 5.306091893601384e-06, "loss": 0.4487, "mean_token_accuracy": 0.845952364616096, "num_tokens": 165155523.0, "step": 384 }, { "entropy": 0.4332275390625, "epoch": 1.5277777777777777, "grad_norm": 0.9724692951976357, "learning_rate": 5.284252680390803e-06, "loss": 0.4269, "mean_token_accuracy": 0.8531857188791037, "num_tokens": 165575993.0, "step": 385 }, { "entropy": 0.43194580078125, "epoch": 1.5317460317460316, "grad_norm": 0.9712295118336367, "learning_rate": 5.2624080252279006e-06, "loss": 0.4471, "mean_token_accuracy": 0.845876133069396, "num_tokens": 166004219.0, "step": 386 }, { "entropy": 0.4315185546875, "epoch": 1.5357142857142856, "grad_norm": 0.9443490875832254, "learning_rate": 5.240558346323582e-06, "loss": 0.437, "mean_token_accuracy": 0.8483765926212072, "num_tokens": 166459333.0, "step": 387 }, { "entropy": 0.434234619140625, "epoch": 1.5396825396825395, "grad_norm": 0.948734807560996, "learning_rate": 5.218704061984938e-06, "loss": 0.4387, "mean_token_accuracy": 0.8489022571593523, "num_tokens": 166887486.0, "step": 388 }, { "entropy": 0.433074951171875, "epoch": 1.5436507936507935, "grad_norm": 0.9920709984656828, "learning_rate": 5.196845590607225e-06, "loss": 0.444, "mean_token_accuracy": 0.8482109969481826, "num_tokens": 167305651.0, "step": 389 }, { "entropy": 0.427581787109375, "epoch": 1.5476190476190477, "grad_norm": 1.007023739541341, "learning_rate": 5.174983350665861e-06, "loss": 0.4355, "mean_token_accuracy": 0.8507700897753239, "num_tokens": 167743608.0, "step": 390 }, { "entropy": 0.435516357421875, "epoch": 1.5515873015873016, "grad_norm": 0.9396600741753053, "learning_rate": 5.153117760708411e-06, "loss": 0.4387, "mean_token_accuracy": 0.8479267274960876, "num_tokens": 168189361.0, "step": 391 }, { "entropy": 0.440887451171875, "epoch": 1.5555555555555556, "grad_norm": 0.9532447871050252, "learning_rate": 5.131249239346574e-06, "loss": 0.4364, "mean_token_accuracy": 0.8505636844784021, "num_tokens": 168602032.0, "step": 392 }, { "entropy": 0.436492919921875, "epoch": 1.5595238095238095, "grad_norm": 0.9020737415284749, "learning_rate": 5.109378205248177e-06, "loss": 0.4426, "mean_token_accuracy": 0.8446815246716142, "num_tokens": 169036397.0, "step": 393 }, { "entropy": 0.43292236328125, "epoch": 1.5634920634920635, "grad_norm": 1.5261604480695485, "learning_rate": 5.087505077129144e-06, "loss": 0.4458, "mean_token_accuracy": 0.8471705308184028, "num_tokens": 169469975.0, "step": 394 }, { "entropy": 0.425628662109375, "epoch": 1.5674603174603174, "grad_norm": 1.0588587386866344, "learning_rate": 5.065630273745495e-06, "loss": 0.4463, "mean_token_accuracy": 0.8460619812831283, "num_tokens": 169905002.0, "step": 395 }, { "entropy": 0.429779052734375, "epoch": 1.5714285714285714, "grad_norm": 1.017609763369074, "learning_rate": 5.043754213885319e-06, "loss": 0.4437, "mean_token_accuracy": 0.8433791399002075, "num_tokens": 170343282.0, "step": 396 }, { "entropy": 0.436981201171875, "epoch": 1.5753968253968254, "grad_norm": 0.9564026257150148, "learning_rate": 5.021877316360759e-06, "loss": 0.4354, "mean_token_accuracy": 0.8478411976248026, "num_tokens": 170783254.0, "step": 397 }, { "entropy": 0.43304443359375, "epoch": 1.5793650793650795, "grad_norm": 0.9585975685485587, "learning_rate": 5e-06, "loss": 0.4505, "mean_token_accuracy": 0.8458553478121758, "num_tokens": 171227432.0, "step": 398 }, { "entropy": 0.43023681640625, "epoch": 1.5833333333333335, "grad_norm": 1.0440501055720262, "learning_rate": 4.978122683639241e-06, "loss": 0.4275, "mean_token_accuracy": 0.8501301733776927, "num_tokens": 171673565.0, "step": 399 }, { "entropy": 0.436431884765625, "epoch": 1.5873015873015874, "grad_norm": 1.0933083501713738, "learning_rate": 4.956245786114683e-06, "loss": 0.4295, "mean_token_accuracy": 0.8506188867613673, "num_tokens": 172096305.0, "step": 400 }, { "entropy": 0.434814453125, "epoch": 1.5912698412698414, "grad_norm": 1.1069832769815195, "learning_rate": 4.934369726254506e-06, "loss": 0.43, "mean_token_accuracy": 0.8495042575523257, "num_tokens": 172495298.0, "step": 401 }, { "entropy": 0.433929443359375, "epoch": 1.5952380952380953, "grad_norm": 1.120671038507196, "learning_rate": 4.9124949228708566e-06, "loss": 0.4334, "mean_token_accuracy": 0.8499879157170653, "num_tokens": 172910673.0, "step": 402 }, { "entropy": 0.42694091796875, "epoch": 1.5992063492063493, "grad_norm": 1.028931284451181, "learning_rate": 4.890621794751825e-06, "loss": 0.4319, "mean_token_accuracy": 0.8494029613211751, "num_tokens": 173326209.0, "step": 403 }, { "entropy": 0.426605224609375, "epoch": 1.6031746031746033, "grad_norm": 0.9118168079626323, "learning_rate": 4.8687507606534274e-06, "loss": 0.4372, "mean_token_accuracy": 0.8469415912404656, "num_tokens": 173775762.0, "step": 404 }, { "entropy": 0.43621826171875, "epoch": 1.6071428571428572, "grad_norm": 1.0102731648951273, "learning_rate": 4.8468822392915925e-06, "loss": 0.4367, "mean_token_accuracy": 0.8488945597782731, "num_tokens": 174200041.0, "step": 405 }, { "entropy": 0.428955078125, "epoch": 1.6111111111111112, "grad_norm": 0.9690257742063463, "learning_rate": 4.82501664933414e-06, "loss": 0.4406, "mean_token_accuracy": 0.8465389581397176, "num_tokens": 174651858.0, "step": 406 }, { "entropy": 0.436920166015625, "epoch": 1.6150793650793651, "grad_norm": 0.8850222581892622, "learning_rate": 4.803154409392776e-06, "loss": 0.4324, "mean_token_accuracy": 0.8495019385591149, "num_tokens": 175081173.0, "step": 407 }, { "entropy": 0.430511474609375, "epoch": 1.619047619047619, "grad_norm": 0.95437734633981, "learning_rate": 4.781295938015063e-06, "loss": 0.4331, "mean_token_accuracy": 0.8485972639173269, "num_tokens": 175519282.0, "step": 408 }, { "entropy": 0.435028076171875, "epoch": 1.623015873015873, "grad_norm": 1.0123634812749625, "learning_rate": 4.759441653676419e-06, "loss": 0.4466, "mean_token_accuracy": 0.848145549185574, "num_tokens": 175965036.0, "step": 409 }, { "entropy": 0.431060791015625, "epoch": 1.626984126984127, "grad_norm": 0.909110311090521, "learning_rate": 4.737591974772102e-06, "loss": 0.4451, "mean_token_accuracy": 0.8459606841206551, "num_tokens": 176387199.0, "step": 410 }, { "entropy": 0.4302978515625, "epoch": 1.630952380952381, "grad_norm": 0.964606274615154, "learning_rate": 4.715747319609199e-06, "loss": 0.4414, "mean_token_accuracy": 0.8480783235281706, "num_tokens": 176823428.0, "step": 411 }, { "entropy": 0.423431396484375, "epoch": 1.6349206349206349, "grad_norm": 0.9360221541198701, "learning_rate": 4.693908106398617e-06, "loss": 0.4393, "mean_token_accuracy": 0.8489115545526147, "num_tokens": 177264131.0, "step": 412 }, { "entropy": 0.4334716796875, "epoch": 1.6388888888888888, "grad_norm": 0.9818915467360069, "learning_rate": 4.6720747532470845e-06, "loss": 0.4294, "mean_token_accuracy": 0.8496479475870728, "num_tokens": 177680911.0, "step": 413 }, { "entropy": 0.432647705078125, "epoch": 1.6428571428571428, "grad_norm": 0.8978522056780484, "learning_rate": 4.650247678149135e-06, "loss": 0.4379, "mean_token_accuracy": 0.8470958042889833, "num_tokens": 178114003.0, "step": 414 }, { "entropy": 0.437652587890625, "epoch": 1.6468253968253967, "grad_norm": 0.9722385088780229, "learning_rate": 4.628427298979111e-06, "loss": 0.4514, "mean_token_accuracy": 0.8430732255801558, "num_tokens": 178533077.0, "step": 415 }, { "entropy": 0.437347412109375, "epoch": 1.6507936507936507, "grad_norm": 1.0373796667738375, "learning_rate": 4.606614033483164e-06, "loss": 0.4326, "mean_token_accuracy": 0.8507428057491779, "num_tokens": 178950487.0, "step": 416 }, { "entropy": 0.4326171875, "epoch": 1.6547619047619047, "grad_norm": 1.010237913873583, "learning_rate": 4.5848082992712516e-06, "loss": 0.4377, "mean_token_accuracy": 0.8486862545832992, "num_tokens": 179384739.0, "step": 417 }, { "entropy": 0.426300048828125, "epoch": 1.6587301587301586, "grad_norm": 1.0263841694329876, "learning_rate": 4.563010513809156e-06, "loss": 0.4455, "mean_token_accuracy": 0.8446431895717978, "num_tokens": 179833212.0, "step": 418 }, { "entropy": 0.42828369140625, "epoch": 1.6626984126984126, "grad_norm": 0.9494913320869729, "learning_rate": 4.541221094410473e-06, "loss": 0.4306, "mean_token_accuracy": 0.8516378318890929, "num_tokens": 180259940.0, "step": 419 }, { "entropy": 0.42144775390625, "epoch": 1.6666666666666665, "grad_norm": 0.9739308463131585, "learning_rate": 4.519440458228638e-06, "loss": 0.4381, "mean_token_accuracy": 0.8479503998532891, "num_tokens": 180712234.0, "step": 420 }, { "entropy": 0.4244384765625, "epoch": 1.6706349206349205, "grad_norm": 1.0181973094308832, "learning_rate": 4.497669022248931e-06, "loss": 0.4525, "mean_token_accuracy": 0.843443606980145, "num_tokens": 181151354.0, "step": 421 }, { "entropy": 0.430877685546875, "epoch": 1.6746031746031746, "grad_norm": 3.323978860931596, "learning_rate": 4.475907203280494e-06, "loss": 0.4383, "mean_token_accuracy": 0.8451524330303073, "num_tokens": 181566490.0, "step": 422 }, { "entropy": 0.428955078125, "epoch": 1.6785714285714286, "grad_norm": 1.2824867106826667, "learning_rate": 4.45415541794836e-06, "loss": 0.446, "mean_token_accuracy": 0.8463947279378772, "num_tokens": 181997420.0, "step": 423 }, { "entropy": 0.431793212890625, "epoch": 1.6825396825396826, "grad_norm": 1.0255881219333862, "learning_rate": 4.432414082685466e-06, "loss": 0.4358, "mean_token_accuracy": 0.8490986367687583, "num_tokens": 182413254.0, "step": 424 }, { "entropy": 0.42706298828125, "epoch": 1.6865079365079365, "grad_norm": 1.0665870604693903, "learning_rate": 4.410683613724684e-06, "loss": 0.4292, "mean_token_accuracy": 0.8507826002314687, "num_tokens": 182840621.0, "step": 425 }, { "entropy": 0.427398681640625, "epoch": 1.6904761904761905, "grad_norm": 1.1351262001199722, "learning_rate": 4.388964427090855e-06, "loss": 0.4359, "mean_token_accuracy": 0.846874114125967, "num_tokens": 183269538.0, "step": 426 }, { "entropy": 0.43524169921875, "epoch": 1.6944444444444444, "grad_norm": 0.9895934977007657, "learning_rate": 4.367256938592822e-06, "loss": 0.4231, "mean_token_accuracy": 0.8536219568923116, "num_tokens": 183684845.0, "step": 427 }, { "entropy": 0.43170166015625, "epoch": 1.6984126984126984, "grad_norm": 1.1767949451847899, "learning_rate": 4.345561563815471e-06, "loss": 0.4337, "mean_token_accuracy": 0.8503425857052207, "num_tokens": 184109496.0, "step": 428 }, { "entropy": 0.433258056640625, "epoch": 1.7023809523809523, "grad_norm": 0.9787163441447944, "learning_rate": 4.323878718111771e-06, "loss": 0.4496, "mean_token_accuracy": 0.8437537206336856, "num_tokens": 184533568.0, "step": 429 }, { "entropy": 0.432220458984375, "epoch": 1.7063492063492065, "grad_norm": 0.9948605324632119, "learning_rate": 4.302208816594829e-06, "loss": 0.4387, "mean_token_accuracy": 0.8475517062470317, "num_tokens": 184968366.0, "step": 430 }, { "entropy": 0.42999267578125, "epoch": 1.7103174603174605, "grad_norm": 0.9068147664673831, "learning_rate": 4.280552274129932e-06, "loss": 0.4376, "mean_token_accuracy": 0.8486391613259912, "num_tokens": 185404884.0, "step": 431 }, { "entropy": 0.427978515625, "epoch": 1.7142857142857144, "grad_norm": 0.9871014833586675, "learning_rate": 4.258909505326617e-06, "loss": 0.4451, "mean_token_accuracy": 0.8455649884417653, "num_tokens": 185857166.0, "step": 432 }, { "entropy": 0.432586669921875, "epoch": 1.7182539682539684, "grad_norm": 0.9995499236592311, "learning_rate": 4.237280924530723e-06, "loss": 0.425, "mean_token_accuracy": 0.8507826123386621, "num_tokens": 186278301.0, "step": 433 }, { "entropy": 0.43853759765625, "epoch": 1.7222222222222223, "grad_norm": 0.9796741726346321, "learning_rate": 4.215666945816469e-06, "loss": 0.4266, "mean_token_accuracy": 0.850803654640913, "num_tokens": 186684767.0, "step": 434 }, { "entropy": 0.4305419921875, "epoch": 1.7261904761904763, "grad_norm": 0.9307664459487662, "learning_rate": 4.194067982978516e-06, "loss": 0.4279, "mean_token_accuracy": 0.8503124145790935, "num_tokens": 187107470.0, "step": 435 }, { "entropy": 0.425567626953125, "epoch": 1.7301587301587302, "grad_norm": 0.9496403248581704, "learning_rate": 4.172484449524047e-06, "loss": 0.428, "mean_token_accuracy": 0.8510759947821498, "num_tokens": 187534641.0, "step": 436 }, { "entropy": 0.42620849609375, "epoch": 1.7341269841269842, "grad_norm": 0.9874730817939584, "learning_rate": 4.150916758664857e-06, "loss": 0.4352, "mean_token_accuracy": 0.848286903463304, "num_tokens": 187972052.0, "step": 437 }, { "entropy": 0.424652099609375, "epoch": 1.7380952380952381, "grad_norm": 0.9625644757119309, "learning_rate": 4.129365323309436e-06, "loss": 0.4295, "mean_token_accuracy": 0.8496120125055313, "num_tokens": 188403747.0, "step": 438 }, { "entropy": 0.425537109375, "epoch": 1.742063492063492, "grad_norm": 0.9770323219075207, "learning_rate": 4.107830556055072e-06, "loss": 0.4363, "mean_token_accuracy": 0.8482074243947864, "num_tokens": 188833376.0, "step": 439 }, { "entropy": 0.420562744140625, "epoch": 1.746031746031746, "grad_norm": 0.9091458418004688, "learning_rate": 4.086312869179938e-06, "loss": 0.434, "mean_token_accuracy": 0.8494348004460335, "num_tokens": 189286051.0, "step": 440 }, { "entropy": 0.4337158203125, "epoch": 1.75, "grad_norm": 0.9398983504232156, "learning_rate": 4.06481267463521e-06, "loss": 0.4233, "mean_token_accuracy": 0.85198515933007, "num_tokens": 189700932.0, "step": 441 }, { "entropy": 0.428436279296875, "epoch": 1.753968253968254, "grad_norm": 0.9954518019783384, "learning_rate": 4.04333038403718e-06, "loss": 0.4332, "mean_token_accuracy": 0.8483901359140873, "num_tokens": 190135846.0, "step": 442 }, { "entropy": 0.41839599609375, "epoch": 1.757936507936508, "grad_norm": 0.9235407840660959, "learning_rate": 4.021866408659368e-06, "loss": 0.4376, "mean_token_accuracy": 0.8477007877081633, "num_tokens": 190599539.0, "step": 443 }, { "entropy": 0.42510986328125, "epoch": 1.7619047619047619, "grad_norm": 0.9988254434360743, "learning_rate": 4.000421159424658e-06, "loss": 0.4381, "mean_token_accuracy": 0.849124894477427, "num_tokens": 191023956.0, "step": 444 }, { "entropy": 0.44061279296875, "epoch": 1.7658730158730158, "grad_norm": 0.9313679757350634, "learning_rate": 3.978995046897425e-06, "loss": 0.4111, "mean_token_accuracy": 0.8550975983962417, "num_tokens": 191419256.0, "step": 445 }, { "entropy": 0.42877197265625, "epoch": 1.7698412698412698, "grad_norm": 0.9424190366763185, "learning_rate": 3.957588481275674e-06, "loss": 0.438, "mean_token_accuracy": 0.848029020242393, "num_tokens": 191865715.0, "step": 446 }, { "entropy": 0.437103271484375, "epoch": 1.7738095238095237, "grad_norm": 0.9089004430002622, "learning_rate": 3.9362018723831915e-06, "loss": 0.4417, "mean_token_accuracy": 0.8482843916863203, "num_tokens": 192279544.0, "step": 447 }, { "entropy": 0.43310546875, "epoch": 1.7777777777777777, "grad_norm": 1.682337538575509, "learning_rate": 3.914835629661695e-06, "loss": 0.4219, "mean_token_accuracy": 0.8513781204819679, "num_tokens": 192687536.0, "step": 448 }, { "entropy": 0.434417724609375, "epoch": 1.7817460317460316, "grad_norm": 1.0677243021549518, "learning_rate": 3.893490162162997e-06, "loss": 0.427, "mean_token_accuracy": 0.8539638724178076, "num_tokens": 193092369.0, "step": 449 }, { "entropy": 0.43597412109375, "epoch": 1.7857142857142856, "grad_norm": 0.9415863303290471, "learning_rate": 3.872165878541175e-06, "loss": 0.4249, "mean_token_accuracy": 0.8508947864174843, "num_tokens": 193514317.0, "step": 450 }, { "entropy": 0.4267578125, "epoch": 1.7896825396825395, "grad_norm": 0.9325477755113131, "learning_rate": 3.850863187044745e-06, "loss": 0.4311, "mean_token_accuracy": 0.8517430359497666, "num_tokens": 193943892.0, "step": 451 }, { "entropy": 0.4212646484375, "epoch": 1.7936507936507935, "grad_norm": 1.0936536327558857, "learning_rate": 3.829582495508844e-06, "loss": 0.428, "mean_token_accuracy": 0.8505398780107498, "num_tokens": 194368425.0, "step": 452 }, { "entropy": 0.425689697265625, "epoch": 1.7976190476190477, "grad_norm": 0.913775614343544, "learning_rate": 3.808324211347429e-06, "loss": 0.4263, "mean_token_accuracy": 0.8509924123063684, "num_tokens": 194781122.0, "step": 453 }, { "entropy": 0.42474365234375, "epoch": 1.8015873015873016, "grad_norm": 0.8819652825019069, "learning_rate": 3.7870887415454687e-06, "loss": 0.4352, "mean_token_accuracy": 0.8501952039077878, "num_tokens": 195229420.0, "step": 454 }, { "entropy": 0.423248291015625, "epoch": 1.8055555555555556, "grad_norm": 0.9710832265661201, "learning_rate": 3.7658764926511613e-06, "loss": 0.4364, "mean_token_accuracy": 0.8493523299694061, "num_tokens": 195670858.0, "step": 455 }, { "entropy": 0.429229736328125, "epoch": 1.8095238095238095, "grad_norm": 1.0034882334655617, "learning_rate": 3.7446878707681413e-06, "loss": 0.4312, "mean_token_accuracy": 0.8488902822136879, "num_tokens": 196086060.0, "step": 456 }, { "entropy": 0.42626953125, "epoch": 1.8134920634920635, "grad_norm": 0.8967060198023731, "learning_rate": 3.7235232815477123e-06, "loss": 0.4389, "mean_token_accuracy": 0.8454429730772972, "num_tokens": 196534067.0, "step": 457 }, { "entropy": 0.433380126953125, "epoch": 1.8174603174603174, "grad_norm": 1.0727361296036093, "learning_rate": 3.7023831301810765e-06, "loss": 0.4233, "mean_token_accuracy": 0.852061620913446, "num_tokens": 196949752.0, "step": 458 }, { "entropy": 0.4302978515625, "epoch": 1.8214285714285714, "grad_norm": 0.9533053527391133, "learning_rate": 3.6812678213915777e-06, "loss": 0.4274, "mean_token_accuracy": 0.8499543191865087, "num_tokens": 197361623.0, "step": 459 }, { "entropy": 0.428863525390625, "epoch": 1.8253968253968254, "grad_norm": 1.6646105544719645, "learning_rate": 3.6601777594269605e-06, "loss": 0.4275, "mean_token_accuracy": 0.8524315897375345, "num_tokens": 197787383.0, "step": 460 }, { "entropy": 0.427886962890625, "epoch": 1.8293650793650795, "grad_norm": 0.918452931744825, "learning_rate": 3.6391133480516196e-06, "loss": 0.4351, "mean_token_accuracy": 0.8494909154251218, "num_tokens": 198214788.0, "step": 461 }, { "entropy": 0.433502197265625, "epoch": 1.8333333333333335, "grad_norm": 0.9250539034798784, "learning_rate": 3.618074990538873e-06, "loss": 0.44, "mean_token_accuracy": 0.8496057353913784, "num_tokens": 198640204.0, "step": 462 }, { "entropy": 0.4234619140625, "epoch": 1.8373015873015874, "grad_norm": 0.8926807300614167, "learning_rate": 3.5970630896632485e-06, "loss": 0.4373, "mean_token_accuracy": 0.8482935605570674, "num_tokens": 199086174.0, "step": 463 }, { "entropy": 0.423919677734375, "epoch": 1.8412698412698414, "grad_norm": 0.9317218135024461, "learning_rate": 3.5760780476927637e-06, "loss": 0.4342, "mean_token_accuracy": 0.8504292815923691, "num_tokens": 199534945.0, "step": 464 }, { "entropy": 0.43280029296875, "epoch": 1.8452380952380953, "grad_norm": 0.9327031690920736, "learning_rate": 3.5551202663812344e-06, "loss": 0.428, "mean_token_accuracy": 0.851259358227253, "num_tokens": 199970879.0, "step": 465 }, { "entropy": 0.43359375, "epoch": 1.8492063492063493, "grad_norm": 0.9103535545774605, "learning_rate": 3.534190146960571e-06, "loss": 0.4254, "mean_token_accuracy": 0.8511311411857605, "num_tokens": 200401566.0, "step": 466 }, { "entropy": 0.43096923828125, "epoch": 1.8531746031746033, "grad_norm": 1.3202029413068583, "learning_rate": 3.5132880901331067e-06, "loss": 0.4244, "mean_token_accuracy": 0.8484150217846036, "num_tokens": 200819281.0, "step": 467 }, { "entropy": 0.42852783203125, "epoch": 1.8571428571428572, "grad_norm": 0.9663839835801094, "learning_rate": 3.492414496063921e-06, "loss": 0.4389, "mean_token_accuracy": 0.8492425018921494, "num_tokens": 201286569.0, "step": 468 }, { "entropy": 0.42816162109375, "epoch": 1.8611111111111112, "grad_norm": 0.922662186018523, "learning_rate": 3.4715697643731828e-06, "loss": 0.4286, "mean_token_accuracy": 0.8502284437417984, "num_tokens": 201729117.0, "step": 469 }, { "entropy": 0.4305419921875, "epoch": 1.8650793650793651, "grad_norm": 0.9615527156025448, "learning_rate": 3.4507542941284933e-06, "loss": 0.4251, "mean_token_accuracy": 0.8521155146881938, "num_tokens": 202148785.0, "step": 470 }, { "entropy": 0.42950439453125, "epoch": 1.869047619047619, "grad_norm": 0.8896950243538952, "learning_rate": 3.4299684838372547e-06, "loss": 0.4209, "mean_token_accuracy": 0.8519325880333781, "num_tokens": 202562335.0, "step": 471 }, { "entropy": 0.438201904296875, "epoch": 1.873015873015873, "grad_norm": 0.896750571119777, "learning_rate": 3.4092127314390354e-06, "loss": 0.4241, "mean_token_accuracy": 0.8511500097811222, "num_tokens": 202969412.0, "step": 472 }, { "entropy": 0.424560546875, "epoch": 1.876984126984127, "grad_norm": 0.8342483785030218, "learning_rate": 3.388487434297949e-06, "loss": 0.4349, "mean_token_accuracy": 0.8488007439300418, "num_tokens": 203414579.0, "step": 473 }, { "entropy": 0.429595947265625, "epoch": 1.880952380952381, "grad_norm": 0.8918742155840607, "learning_rate": 3.3677929891950527e-06, "loss": 0.4247, "mean_token_accuracy": 0.8510593473911285, "num_tokens": 203845826.0, "step": 474 }, { "entropy": 0.43017578125, "epoch": 1.8849206349206349, "grad_norm": 0.9252775003902146, "learning_rate": 3.347129792320748e-06, "loss": 0.4272, "mean_token_accuracy": 0.8510101838037372, "num_tokens": 204272914.0, "step": 475 }, { "entropy": 0.424591064453125, "epoch": 1.8888888888888888, "grad_norm": 0.9664584622314957, "learning_rate": 3.3264982392671973e-06, "loss": 0.4204, "mean_token_accuracy": 0.8532195715233684, "num_tokens": 204713067.0, "step": 476 }, { "entropy": 0.42791748046875, "epoch": 1.8928571428571428, "grad_norm": 0.9292473265869555, "learning_rate": 3.3058987250207476e-06, "loss": 0.4277, "mean_token_accuracy": 0.8527126982808113, "num_tokens": 205140799.0, "step": 477 }, { "entropy": 0.439788818359375, "epoch": 1.8968253968253967, "grad_norm": 0.9128528058058363, "learning_rate": 3.285331643954372e-06, "loss": 0.4234, "mean_token_accuracy": 0.8513627136126161, "num_tokens": 205549482.0, "step": 478 }, { "entropy": 0.428558349609375, "epoch": 1.9007936507936507, "grad_norm": 0.9344739197051096, "learning_rate": 3.2647973898201157e-06, "loss": 0.4269, "mean_token_accuracy": 0.8505295282229781, "num_tokens": 205957709.0, "step": 479 }, { "entropy": 0.428436279296875, "epoch": 1.9047619047619047, "grad_norm": 0.8831126126363492, "learning_rate": 3.244296355741561e-06, "loss": 0.426, "mean_token_accuracy": 0.8514531748369336, "num_tokens": 206394578.0, "step": 480 }, { "entropy": 0.43328857421875, "epoch": 1.9087301587301586, "grad_norm": 0.8812462855968569, "learning_rate": 3.2238289342063013e-06, "loss": 0.429, "mean_token_accuracy": 0.8510967614129186, "num_tokens": 206810851.0, "step": 481 }, { "entropy": 0.428375244140625, "epoch": 1.9126984126984126, "grad_norm": 1.0106928205994128, "learning_rate": 3.203395517058423e-06, "loss": 0.432, "mean_token_accuracy": 0.852095915004611, "num_tokens": 207233636.0, "step": 482 }, { "entropy": 0.421112060546875, "epoch": 1.9166666666666665, "grad_norm": 0.9116927331499651, "learning_rate": 3.1829964954910076e-06, "loss": 0.4363, "mean_token_accuracy": 0.8473147870972753, "num_tokens": 207671663.0, "step": 483 }, { "entropy": 0.437652587890625, "epoch": 1.9206349206349205, "grad_norm": 0.9660485826307438, "learning_rate": 3.1626322600386418e-06, "loss": 0.4289, "mean_token_accuracy": 0.8505426356568933, "num_tokens": 208074376.0, "step": 484 }, { "entropy": 0.4241943359375, "epoch": 1.9246031746031746, "grad_norm": 0.9972216512477222, "learning_rate": 3.1423032005699377e-06, "loss": 0.4364, "mean_token_accuracy": 0.8486529793590307, "num_tokens": 208524843.0, "step": 485 }, { "entropy": 0.4322509765625, "epoch": 1.9285714285714286, "grad_norm": 0.9283266129413389, "learning_rate": 3.122009706280072e-06, "loss": 0.4277, "mean_token_accuracy": 0.8506509074941278, "num_tokens": 208947370.0, "step": 486 }, { "entropy": 0.42724609375, "epoch": 1.9325396825396826, "grad_norm": 1.006394801232037, "learning_rate": 3.1017521656833384e-06, "loss": 0.4146, "mean_token_accuracy": 0.8548265127465129, "num_tokens": 209354451.0, "step": 487 }, { "entropy": 0.4229736328125, "epoch": 1.9365079365079365, "grad_norm": 0.8314414813893206, "learning_rate": 3.0815309666057013e-06, "loss": 0.428, "mean_token_accuracy": 0.8494690489023924, "num_tokens": 209798547.0, "step": 488 }, { "entropy": 0.425018310546875, "epoch": 1.9404761904761905, "grad_norm": 0.9234785434940929, "learning_rate": 3.061346496177374e-06, "loss": 0.421, "mean_token_accuracy": 0.8528507072478533, "num_tokens": 210233790.0, "step": 489 }, { "entropy": 0.43133544921875, "epoch": 1.9444444444444444, "grad_norm": 0.8757613774035661, "learning_rate": 3.0411991408254116e-06, "loss": 0.436, "mean_token_accuracy": 0.8493619496002793, "num_tokens": 210661829.0, "step": 490 }, { "entropy": 0.420318603515625, "epoch": 1.9484126984126984, "grad_norm": 0.8668762253896259, "learning_rate": 3.0210892862663043e-06, "loss": 0.4267, "mean_token_accuracy": 0.8510631760582328, "num_tokens": 211113597.0, "step": 491 }, { "entropy": 0.4222412109375, "epoch": 1.9523809523809523, "grad_norm": 0.8822229179162288, "learning_rate": 3.001017317498607e-06, "loss": 0.4278, "mean_token_accuracy": 0.8513042591512203, "num_tokens": 211549046.0, "step": 492 }, { "entropy": 0.419830322265625, "epoch": 1.9563492063492065, "grad_norm": 0.9142830959986298, "learning_rate": 2.9809836187955532e-06, "loss": 0.4139, "mean_token_accuracy": 0.8542308090254664, "num_tokens": 212000519.0, "step": 493 }, { "entropy": 0.42449951171875, "epoch": 1.9603174603174605, "grad_norm": 0.8634339056465669, "learning_rate": 2.960988573697705e-06, "loss": 0.428, "mean_token_accuracy": 0.8506795652210712, "num_tokens": 212447521.0, "step": 494 }, { "entropy": 0.42681884765625, "epoch": 1.9642857142857144, "grad_norm": 0.8734416000621907, "learning_rate": 2.941032565005613e-06, "loss": 0.4262, "mean_token_accuracy": 0.8521596789360046, "num_tokens": 212865927.0, "step": 495 }, { "entropy": 0.424072265625, "epoch": 1.9682539682539684, "grad_norm": 0.8877032051531498, "learning_rate": 2.9211159747724813e-06, "loss": 0.4264, "mean_token_accuracy": 0.851787575520575, "num_tokens": 213310334.0, "step": 496 }, { "entropy": 0.421661376953125, "epoch": 1.9722222222222223, "grad_norm": 0.9809567398581039, "learning_rate": 2.90123918429686e-06, "loss": 0.4246, "mean_token_accuracy": 0.8516859589144588, "num_tokens": 213742399.0, "step": 497 }, { "entropy": 0.42767333984375, "epoch": 1.9761904761904763, "grad_norm": 0.8738523997394374, "learning_rate": 2.881402574115344e-06, "loss": 0.4273, "mean_token_accuracy": 0.8529170397669077, "num_tokens": 214169043.0, "step": 498 }, { "entropy": 0.4276123046875, "epoch": 1.9801587301587302, "grad_norm": 0.9201362022804491, "learning_rate": 2.8616065239952763e-06, "loss": 0.424, "mean_token_accuracy": 0.8526058839634061, "num_tokens": 214572957.0, "step": 499 }, { "entropy": 0.430877685546875, "epoch": 1.9841269841269842, "grad_norm": 0.9306770950977414, "learning_rate": 2.841851412927495e-06, "loss": 0.4314, "mean_token_accuracy": 0.8489747159183025, "num_tokens": 215005057.0, "step": 500 }, { "entropy": 0.4188232421875, "epoch": 1.9880952380952381, "grad_norm": 0.8357685751970109, "learning_rate": 2.822137619119065e-06, "loss": 0.42, "mean_token_accuracy": 0.8517758399248123, "num_tokens": 215449399.0, "step": 501 }, { "entropy": 0.426727294921875, "epoch": 1.992063492063492, "grad_norm": 1.1544716066903413, "learning_rate": 2.8024655199860495e-06, "loss": 0.4154, "mean_token_accuracy": 0.8549016704782844, "num_tokens": 215869766.0, "step": 502 }, { "entropy": 0.427978515625, "epoch": 1.996031746031746, "grad_norm": 0.8289572581024041, "learning_rate": 2.7828354921462668e-06, "loss": 0.4184, "mean_token_accuracy": 0.8542971862480044, "num_tokens": 216298988.0, "step": 503 }, { "entropy": 0.4202880859375, "epoch": 2.0, "grad_norm": 0.8750452382881969, "learning_rate": 2.7632479114120963e-06, "loss": 0.4177, "mean_token_accuracy": 0.8540928428992629, "num_tokens": 216731206.0, "step": 504 }, { "entropy": 0.420989990234375, "epoch": 2.003968253968254, "grad_norm": 0.8871159450799843, "learning_rate": 2.7437031527832747e-06, "loss": 0.3994, "mean_token_accuracy": 0.860961563885212, "num_tokens": 217159781.0, "step": 505 }, { "entropy": 0.425262451171875, "epoch": 2.007936507936508, "grad_norm": 0.9044028336131849, "learning_rate": 2.72420159043972e-06, "loss": 0.3935, "mean_token_accuracy": 0.8634284269064665, "num_tokens": 217589905.0, "step": 506 }, { "entropy": 0.42340087890625, "epoch": 2.011904761904762, "grad_norm": 0.8841207327958758, "learning_rate": 2.704743597734365e-06, "loss": 0.3933, "mean_token_accuracy": 0.8630258431658149, "num_tokens": 218017429.0, "step": 507 }, { "entropy": 0.42041015625, "epoch": 2.015873015873016, "grad_norm": 0.8980425705440174, "learning_rate": 2.685329547186018e-06, "loss": 0.4083, "mean_token_accuracy": 0.8567906338721514, "num_tokens": 218446876.0, "step": 508 }, { "entropy": 0.4229736328125, "epoch": 2.0198412698412698, "grad_norm": 0.909158252805293, "learning_rate": 2.665959810472219e-06, "loss": 0.4067, "mean_token_accuracy": 0.8580641169101, "num_tokens": 218885713.0, "step": 509 }, { "entropy": 0.41693115234375, "epoch": 2.0238095238095237, "grad_norm": 0.882075206716414, "learning_rate": 2.6466347584221314e-06, "loss": 0.3961, "mean_token_accuracy": 0.861279109492898, "num_tokens": 219322571.0, "step": 510 }, { "entropy": 0.422607421875, "epoch": 2.0277777777777777, "grad_norm": 0.8895301340223191, "learning_rate": 2.6273547610094408e-06, "loss": 0.4007, "mean_token_accuracy": 0.8570800367742777, "num_tokens": 219748508.0, "step": 511 }, { "entropy": 0.420166015625, "epoch": 2.0317460317460316, "grad_norm": 0.908409070674735, "learning_rate": 2.608120187345273e-06, "loss": 0.3983, "mean_token_accuracy": 0.8590443721041083, "num_tokens": 220180160.0, "step": 512 }, { "entropy": 0.4185791015625, "epoch": 2.0357142857142856, "grad_norm": 1.034313453109704, "learning_rate": 2.588931405671127e-06, "loss": 0.3916, "mean_token_accuracy": 0.8636050894856453, "num_tokens": 220606565.0, "step": 513 }, { "entropy": 0.422393798828125, "epoch": 2.0396825396825395, "grad_norm": 0.8777983265834516, "learning_rate": 2.5697887833518215e-06, "loss": 0.3897, "mean_token_accuracy": 0.8630373626947403, "num_tokens": 221016578.0, "step": 514 }, { "entropy": 0.41497802734375, "epoch": 2.0436507936507935, "grad_norm": 0.9119000908237385, "learning_rate": 2.5506926868684683e-06, "loss": 0.3967, "mean_token_accuracy": 0.8603310724720359, "num_tokens": 221455851.0, "step": 515 }, { "entropy": 0.424346923828125, "epoch": 2.0476190476190474, "grad_norm": 0.9104788824732245, "learning_rate": 2.5316434818114517e-06, "loss": 0.4009, "mean_token_accuracy": 0.8583084382116795, "num_tokens": 221871968.0, "step": 516 }, { "entropy": 0.41632080078125, "epoch": 2.0515873015873014, "grad_norm": 0.7974753175425153, "learning_rate": 2.5126415328734275e-06, "loss": 0.3875, "mean_token_accuracy": 0.8620841084048152, "num_tokens": 222303576.0, "step": 517 }, { "entropy": 0.41741943359375, "epoch": 2.0555555555555554, "grad_norm": 0.8523247821631298, "learning_rate": 2.4936872038423516e-06, "loss": 0.3935, "mean_token_accuracy": 0.8615706618875265, "num_tokens": 222738323.0, "step": 518 }, { "entropy": 0.416717529296875, "epoch": 2.0595238095238093, "grad_norm": 0.8420283553726328, "learning_rate": 2.4747808575945006e-06, "loss": 0.3942, "mean_token_accuracy": 0.8623552098870277, "num_tokens": 223168261.0, "step": 519 }, { "entropy": 0.421295166015625, "epoch": 2.0634920634920633, "grad_norm": 0.9269712393029744, "learning_rate": 2.4559228560875336e-06, "loss": 0.3983, "mean_token_accuracy": 0.8609938519075513, "num_tokens": 223584134.0, "step": 520 }, { "entropy": 0.41546630859375, "epoch": 2.0674603174603177, "grad_norm": 0.7913231790323264, "learning_rate": 2.4371135603535613e-06, "loss": 0.3881, "mean_token_accuracy": 0.8632083088159561, "num_tokens": 224013215.0, "step": 521 }, { "entropy": 0.40972900390625, "epoch": 2.0714285714285716, "grad_norm": 0.8896009296171342, "learning_rate": 2.4183533304922336e-06, "loss": 0.4024, "mean_token_accuracy": 0.8593400968238711, "num_tokens": 224461654.0, "step": 522 }, { "entropy": 0.416046142578125, "epoch": 2.0753968253968256, "grad_norm": 0.8522563242461978, "learning_rate": 2.399642525663843e-06, "loss": 0.3968, "mean_token_accuracy": 0.8609009999781847, "num_tokens": 224885889.0, "step": 523 }, { "entropy": 0.41802978515625, "epoch": 2.0793650793650795, "grad_norm": 0.8436355578137702, "learning_rate": 2.380981504082459e-06, "loss": 0.4051, "mean_token_accuracy": 0.8574947854503989, "num_tokens": 225327562.0, "step": 524 }, { "entropy": 0.410980224609375, "epoch": 2.0833333333333335, "grad_norm": 0.9234046715388234, "learning_rate": 2.3623706230090517e-06, "loss": 0.3946, "mean_token_accuracy": 0.860747816041112, "num_tokens": 225767121.0, "step": 525 }, { "entropy": 0.4215087890625, "epoch": 2.0873015873015874, "grad_norm": 0.886667769462096, "learning_rate": 2.3438102387446686e-06, "loss": 0.3887, "mean_token_accuracy": 0.8633216423913836, "num_tokens": 226189031.0, "step": 526 }, { "entropy": 0.41558837890625, "epoch": 2.0912698412698414, "grad_norm": 0.8295983883133476, "learning_rate": 2.325300706623607e-06, "loss": 0.4059, "mean_token_accuracy": 0.8594214450567961, "num_tokens": 226627902.0, "step": 527 }, { "entropy": 0.416168212890625, "epoch": 2.0952380952380953, "grad_norm": 0.8579824625414783, "learning_rate": 2.3068423810066085e-06, "loss": 0.4086, "mean_token_accuracy": 0.8578107142820954, "num_tokens": 227062309.0, "step": 528 }, { "entropy": 0.418792724609375, "epoch": 2.0992063492063493, "grad_norm": 0.8717081684901182, "learning_rate": 2.288435615274085e-06, "loss": 0.4026, "mean_token_accuracy": 0.8583700396120548, "num_tokens": 227485113.0, "step": 529 }, { "entropy": 0.418609619140625, "epoch": 2.1031746031746033, "grad_norm": 0.8671184672809995, "learning_rate": 2.2700807618193393e-06, "loss": 0.3945, "mean_token_accuracy": 0.8610662836581469, "num_tokens": 227920598.0, "step": 530 }, { "entropy": 0.416961669921875, "epoch": 2.107142857142857, "grad_norm": 0.7659046613801866, "learning_rate": 2.251778172041828e-06, "loss": 0.391, "mean_token_accuracy": 0.8613040810450912, "num_tokens": 228346699.0, "step": 531 }, { "entropy": 0.41766357421875, "epoch": 2.111111111111111, "grad_norm": 0.8757955281793407, "learning_rate": 2.2335281963404315e-06, "loss": 0.3985, "mean_token_accuracy": 0.86165143083781, "num_tokens": 228773818.0, "step": 532 }, { "entropy": 0.41998291015625, "epoch": 2.115079365079365, "grad_norm": 0.9727283374741916, "learning_rate": 2.2153311841067438e-06, "loss": 0.3928, "mean_token_accuracy": 0.8631924940273166, "num_tokens": 229188623.0, "step": 533 }, { "entropy": 0.412200927734375, "epoch": 2.119047619047619, "grad_norm": 0.8392433239210284, "learning_rate": 2.1971874837183914e-06, "loss": 0.3869, "mean_token_accuracy": 0.8635608870536089, "num_tokens": 229627711.0, "step": 534 }, { "entropy": 0.415802001953125, "epoch": 2.123015873015873, "grad_norm": 0.9201827428240057, "learning_rate": 2.179097442532352e-06, "loss": 0.4088, "mean_token_accuracy": 0.8568679317831993, "num_tokens": 230054209.0, "step": 535 }, { "entropy": 0.41278076171875, "epoch": 2.126984126984127, "grad_norm": 0.8066388305393899, "learning_rate": 2.1610614068783112e-06, "loss": 0.3981, "mean_token_accuracy": 0.8601069571450353, "num_tokens": 230489032.0, "step": 536 }, { "entropy": 0.411895751953125, "epoch": 2.130952380952381, "grad_norm": 0.8350937916956933, "learning_rate": 2.143079722052034e-06, "loss": 0.4015, "mean_token_accuracy": 0.8587260395288467, "num_tokens": 230910745.0, "step": 537 }, { "entropy": 0.417938232421875, "epoch": 2.134920634920635, "grad_norm": 0.791508989758568, "learning_rate": 2.125152732308747e-06, "loss": 0.4049, "mean_token_accuracy": 0.8583241375163198, "num_tokens": 231339019.0, "step": 538 }, { "entropy": 0.4166259765625, "epoch": 2.138888888888889, "grad_norm": 0.7979398132027408, "learning_rate": 2.1072807808565547e-06, "loss": 0.4084, "mean_token_accuracy": 0.8571968795731664, "num_tokens": 231777523.0, "step": 539 }, { "entropy": 0.420440673828125, "epoch": 2.142857142857143, "grad_norm": 0.8603306148484448, "learning_rate": 2.0894642098498656e-06, "loss": 0.3952, "mean_token_accuracy": 0.859032517299056, "num_tokens": 232199672.0, "step": 540 }, { "entropy": 0.423187255859375, "epoch": 2.1468253968253967, "grad_norm": 0.9055074686631474, "learning_rate": 2.0717033603828436e-06, "loss": 0.3923, "mean_token_accuracy": 0.8614393156021833, "num_tokens": 232633797.0, "step": 541 }, { "entropy": 0.417877197265625, "epoch": 2.1507936507936507, "grad_norm": 0.8617856992329058, "learning_rate": 2.0539985724828736e-06, "loss": 0.4081, "mean_token_accuracy": 0.8573337839916348, "num_tokens": 233076007.0, "step": 542 }, { "entropy": 0.41546630859375, "epoch": 2.1547619047619047, "grad_norm": 0.8903667184752816, "learning_rate": 2.0363501851040573e-06, "loss": 0.3922, "mean_token_accuracy": 0.861387861892581, "num_tokens": 233509851.0, "step": 543 }, { "entropy": 0.4229736328125, "epoch": 2.1587301587301586, "grad_norm": 0.8398162712869015, "learning_rate": 2.0187585361207174e-06, "loss": 0.4043, "mean_token_accuracy": 0.857014361768961, "num_tokens": 233942156.0, "step": 544 }, { "entropy": 0.418701171875, "epoch": 2.1626984126984126, "grad_norm": 0.8309474925972752, "learning_rate": 2.001223962320941e-06, "loss": 0.3959, "mean_token_accuracy": 0.8592708380892873, "num_tokens": 234372963.0, "step": 545 }, { "entropy": 0.414398193359375, "epoch": 2.1666666666666665, "grad_norm": 0.8088942738118841, "learning_rate": 1.9837467994001165e-06, "loss": 0.4048, "mean_token_accuracy": 0.8613162385299802, "num_tokens": 234844668.0, "step": 546 }, { "entropy": 0.429656982421875, "epoch": 2.1706349206349205, "grad_norm": 0.8900138868011044, "learning_rate": 1.9663273819545157e-06, "loss": 0.4117, "mean_token_accuracy": 0.8555487683042884, "num_tokens": 235271990.0, "step": 547 }, { "entropy": 0.416961669921875, "epoch": 2.1746031746031744, "grad_norm": 0.8125994478475848, "learning_rate": 1.948966043474889e-06, "loss": 0.3981, "mean_token_accuracy": 0.8588608456775546, "num_tokens": 235697877.0, "step": 548 }, { "entropy": 0.429046630859375, "epoch": 2.1785714285714284, "grad_norm": 0.9972924104553051, "learning_rate": 1.931663116340074e-06, "loss": 0.4049, "mean_token_accuracy": 0.8577186185866594, "num_tokens": 236134537.0, "step": 549 }, { "entropy": 0.410797119140625, "epoch": 2.1825396825396823, "grad_norm": 0.8632872657339906, "learning_rate": 1.914418931810643e-06, "loss": 0.3855, "mean_token_accuracy": 0.8640564111992717, "num_tokens": 236586699.0, "step": 550 }, { "entropy": 0.424530029296875, "epoch": 2.1865079365079367, "grad_norm": 0.8870689635471863, "learning_rate": 1.8972338200225509e-06, "loss": 0.3984, "mean_token_accuracy": 0.8577613439410925, "num_tokens": 236995332.0, "step": 551 }, { "entropy": 0.418975830078125, "epoch": 2.1904761904761907, "grad_norm": 0.9628030178975229, "learning_rate": 1.880108109980815e-06, "loss": 0.3934, "mean_token_accuracy": 0.861169021576643, "num_tokens": 237426378.0, "step": 552 }, { "entropy": 0.411376953125, "epoch": 2.1944444444444446, "grad_norm": 0.933588404712383, "learning_rate": 1.8630421295532252e-06, "loss": 0.3905, "mean_token_accuracy": 0.8604107396677136, "num_tokens": 237866086.0, "step": 553 }, { "entropy": 0.41845703125, "epoch": 2.1984126984126986, "grad_norm": 1.0435808914840323, "learning_rate": 1.8460362054640573e-06, "loss": 0.4007, "mean_token_accuracy": 0.8584116594865918, "num_tokens": 238297987.0, "step": 554 }, { "entropy": 0.4312744140625, "epoch": 2.2023809523809526, "grad_norm": 0.9124011744416908, "learning_rate": 1.8290906632878297e-06, "loss": 0.4056, "mean_token_accuracy": 0.8590257493779063, "num_tokens": 238729296.0, "step": 555 }, { "entropy": 0.41986083984375, "epoch": 2.2063492063492065, "grad_norm": 0.9196757371946168, "learning_rate": 1.8122058274430542e-06, "loss": 0.408, "mean_token_accuracy": 0.8594406340271235, "num_tokens": 239171101.0, "step": 556 }, { "entropy": 0.42120361328125, "epoch": 2.2103174603174605, "grad_norm": 0.8297358875305545, "learning_rate": 1.7953820211860395e-06, "loss": 0.3919, "mean_token_accuracy": 0.8603522703051567, "num_tokens": 239602299.0, "step": 557 }, { "entropy": 0.41949462890625, "epoch": 2.2142857142857144, "grad_norm": 1.6698534343246039, "learning_rate": 1.7786195666046935e-06, "loss": 0.3915, "mean_token_accuracy": 0.8623024551197886, "num_tokens": 240034337.0, "step": 558 }, { "entropy": 0.42144775390625, "epoch": 2.2182539682539684, "grad_norm": 0.8963232285622191, "learning_rate": 1.7619187846123624e-06, "loss": 0.3901, "mean_token_accuracy": 0.8617757288739085, "num_tokens": 240461291.0, "step": 559 }, { "entropy": 0.42474365234375, "epoch": 2.2222222222222223, "grad_norm": 0.9778763913057226, "learning_rate": 1.7452799949416833e-06, "loss": 0.384, "mean_token_accuracy": 0.8640343863517046, "num_tokens": 240860927.0, "step": 560 }, { "entropy": 0.41705322265625, "epoch": 2.2261904761904763, "grad_norm": 0.8286270345827924, "learning_rate": 1.7287035161384673e-06, "loss": 0.3996, "mean_token_accuracy": 0.8590253088623285, "num_tokens": 241301179.0, "step": 561 }, { "entropy": 0.418853759765625, "epoch": 2.2301587301587302, "grad_norm": 0.8430918806162481, "learning_rate": 1.7121896655555958e-06, "loss": 0.396, "mean_token_accuracy": 0.860031645745039, "num_tokens": 241739076.0, "step": 562 }, { "entropy": 0.424774169921875, "epoch": 2.234126984126984, "grad_norm": 0.826236198905769, "learning_rate": 1.695738759346947e-06, "loss": 0.3891, "mean_token_accuracy": 0.8625601828098297, "num_tokens": 242150640.0, "step": 563 }, { "entropy": 0.412109375, "epoch": 2.238095238095238, "grad_norm": 0.8853893523977265, "learning_rate": 1.6793511124613455e-06, "loss": 0.3874, "mean_token_accuracy": 0.8637553565204144, "num_tokens": 242574011.0, "step": 564 }, { "entropy": 0.422393798828125, "epoch": 2.242063492063492, "grad_norm": 0.9075367727640452, "learning_rate": 1.6630270386365288e-06, "loss": 0.3989, "mean_token_accuracy": 0.8571943752467632, "num_tokens": 243005939.0, "step": 565 }, { "entropy": 0.41766357421875, "epoch": 2.246031746031746, "grad_norm": 0.8448948319006312, "learning_rate": 1.6467668503931432e-06, "loss": 0.398, "mean_token_accuracy": 0.861447062343359, "num_tokens": 243458878.0, "step": 566 }, { "entropy": 0.418365478515625, "epoch": 2.25, "grad_norm": 0.9930222072087751, "learning_rate": 1.6305708590287616e-06, "loss": 0.3997, "mean_token_accuracy": 0.8600739203393459, "num_tokens": 243877438.0, "step": 567 }, { "entropy": 0.41552734375, "epoch": 2.253968253968254, "grad_norm": 0.8598361323835692, "learning_rate": 1.6144393746119208e-06, "loss": 0.3943, "mean_token_accuracy": 0.8619920583441854, "num_tokens": 244313964.0, "step": 568 }, { "entropy": 0.41705322265625, "epoch": 2.257936507936508, "grad_norm": 0.9059341355540655, "learning_rate": 1.5983727059761873e-06, "loss": 0.3981, "mean_token_accuracy": 0.8603257145732641, "num_tokens": 244761734.0, "step": 569 }, { "entropy": 0.417938232421875, "epoch": 2.261904761904762, "grad_norm": 0.8354660701028858, "learning_rate": 1.5823711607142428e-06, "loss": 0.3843, "mean_token_accuracy": 0.863621992059052, "num_tokens": 245200322.0, "step": 570 }, { "entropy": 0.416839599609375, "epoch": 2.265873015873016, "grad_norm": 0.8345755216968843, "learning_rate": 1.5664350451720022e-06, "loss": 0.396, "mean_token_accuracy": 0.8610862046480179, "num_tokens": 245646233.0, "step": 571 }, { "entropy": 0.421661376953125, "epoch": 2.2698412698412698, "grad_norm": 0.8201081491300131, "learning_rate": 1.5505646644427375e-06, "loss": 0.395, "mean_token_accuracy": 0.8609900875017047, "num_tokens": 246083539.0, "step": 572 }, { "entropy": 0.421539306640625, "epoch": 2.2738095238095237, "grad_norm": 0.8429380051297379, "learning_rate": 1.5347603223612462e-06, "loss": 0.3963, "mean_token_accuracy": 0.860317200422287, "num_tokens": 246515677.0, "step": 573 }, { "entropy": 0.4217529296875, "epoch": 2.2777777777777777, "grad_norm": 0.868322359342986, "learning_rate": 1.5190223214980286e-06, "loss": 0.3976, "mean_token_accuracy": 0.8608297156170011, "num_tokens": 246933619.0, "step": 574 }, { "entropy": 0.43359375, "epoch": 2.2817460317460316, "grad_norm": 0.8952218666631779, "learning_rate": 1.5033509631534986e-06, "loss": 0.3966, "mean_token_accuracy": 0.8629090571776032, "num_tokens": 247344382.0, "step": 575 }, { "entropy": 0.41790771484375, "epoch": 2.2857142857142856, "grad_norm": 0.9480496740892829, "learning_rate": 1.4877465473522178e-06, "loss": 0.3813, "mean_token_accuracy": 0.8640672285109758, "num_tokens": 247765672.0, "step": 576 }, { "entropy": 0.42218017578125, "epoch": 2.2896825396825395, "grad_norm": 0.9704838555740247, "learning_rate": 1.4722093728371427e-06, "loss": 0.3878, "mean_token_accuracy": 0.8612747713923454, "num_tokens": 248183306.0, "step": 577 }, { "entropy": 0.410430908203125, "epoch": 2.2936507936507935, "grad_norm": 0.8533419703585065, "learning_rate": 1.4567397370639158e-06, "loss": 0.3927, "mean_token_accuracy": 0.8615565691143274, "num_tokens": 248628378.0, "step": 578 }, { "entropy": 0.41888427734375, "epoch": 2.2976190476190474, "grad_norm": 0.818324266262677, "learning_rate": 1.4413379361951596e-06, "loss": 0.4009, "mean_token_accuracy": 0.8598908875137568, "num_tokens": 249071096.0, "step": 579 }, { "entropy": 0.41949462890625, "epoch": 2.3015873015873014, "grad_norm": 0.8157937775196074, "learning_rate": 1.4260042650948187e-06, "loss": 0.3959, "mean_token_accuracy": 0.8613967839628458, "num_tokens": 249501143.0, "step": 580 }, { "entropy": 0.419769287109375, "epoch": 2.3055555555555554, "grad_norm": 0.948858831726886, "learning_rate": 1.4107390173225045e-06, "loss": 0.3945, "mean_token_accuracy": 0.8604099499061704, "num_tokens": 249948355.0, "step": 581 }, { "entropy": 0.42041015625, "epoch": 2.3095238095238093, "grad_norm": 0.8758102059030293, "learning_rate": 1.395542485127886e-06, "loss": 0.388, "mean_token_accuracy": 0.8634849116206169, "num_tokens": 250356099.0, "step": 582 }, { "entropy": 0.421234130859375, "epoch": 2.3134920634920633, "grad_norm": 0.8815188369640882, "learning_rate": 1.3804149594450816e-06, "loss": 0.3919, "mean_token_accuracy": 0.8597034253180027, "num_tokens": 250775592.0, "step": 583 }, { "entropy": 0.418121337890625, "epoch": 2.317460317460317, "grad_norm": 0.861023672134407, "learning_rate": 1.365356729887099e-06, "loss": 0.4, "mean_token_accuracy": 0.8603812381625175, "num_tokens": 251219125.0, "step": 584 }, { "entropy": 0.415496826171875, "epoch": 2.3214285714285716, "grad_norm": 0.8641123367226853, "learning_rate": 1.3503680847402868e-06, "loss": 0.3933, "mean_token_accuracy": 0.8616957142949104, "num_tokens": 251648861.0, "step": 585 }, { "entropy": 0.41497802734375, "epoch": 2.3253968253968256, "grad_norm": 0.8154240634747612, "learning_rate": 1.3354493109588145e-06, "loss": 0.3926, "mean_token_accuracy": 0.8618068303912878, "num_tokens": 252080434.0, "step": 586 }, { "entropy": 0.417633056640625, "epoch": 2.3293650793650795, "grad_norm": 0.8354299632421693, "learning_rate": 1.320600694159185e-06, "loss": 0.3828, "mean_token_accuracy": 0.8655170071870089, "num_tokens": 252502018.0, "step": 587 }, { "entropy": 0.420166015625, "epoch": 2.3333333333333335, "grad_norm": 0.9436967025783154, "learning_rate": 1.3058225186147572e-06, "loss": 0.3957, "mean_token_accuracy": 0.8595009902492166, "num_tokens": 252923218.0, "step": 588 }, { "entropy": 0.419464111328125, "epoch": 2.3373015873015874, "grad_norm": 0.8818218399814328, "learning_rate": 1.2911150672503098e-06, "loss": 0.3867, "mean_token_accuracy": 0.8642842434346676, "num_tokens": 253337148.0, "step": 589 }, { "entropy": 0.426788330078125, "epoch": 2.3412698412698414, "grad_norm": 0.8980593730409643, "learning_rate": 1.2764786216366236e-06, "loss": 0.3988, "mean_token_accuracy": 0.8595603117719293, "num_tokens": 253761289.0, "step": 590 }, { "entropy": 0.416748046875, "epoch": 2.3452380952380953, "grad_norm": 0.944966296741567, "learning_rate": 1.2619134619850908e-06, "loss": 0.3929, "mean_token_accuracy": 0.8604479916393757, "num_tokens": 254195017.0, "step": 591 }, { "entropy": 0.41595458984375, "epoch": 2.3492063492063493, "grad_norm": 1.0810430230383554, "learning_rate": 1.2474198671423493e-06, "loss": 0.3999, "mean_token_accuracy": 0.8599454695358872, "num_tokens": 254643716.0, "step": 592 }, { "entropy": 0.41754150390625, "epoch": 2.3531746031746033, "grad_norm": 0.812428846397806, "learning_rate": 1.2329981145849468e-06, "loss": 0.3977, "mean_token_accuracy": 0.8586347484961152, "num_tokens": 255069339.0, "step": 593 }, { "entropy": 0.42437744140625, "epoch": 2.357142857142857, "grad_norm": 0.8302059952828363, "learning_rate": 1.2186484804140242e-06, "loss": 0.3942, "mean_token_accuracy": 0.8609241275116801, "num_tokens": 255486573.0, "step": 594 }, { "entropy": 0.415771484375, "epoch": 2.361111111111111, "grad_norm": 0.8148827903906969, "learning_rate": 1.2043712393500355e-06, "loss": 0.3876, "mean_token_accuracy": 0.8626940259709954, "num_tokens": 255913426.0, "step": 595 }, { "entropy": 0.42193603515625, "epoch": 2.365079365079365, "grad_norm": 1.2289420157864683, "learning_rate": 1.1901666647274823e-06, "loss": 0.3841, "mean_token_accuracy": 0.8637949759140611, "num_tokens": 256345326.0, "step": 596 }, { "entropy": 0.416656494140625, "epoch": 2.369047619047619, "grad_norm": 0.8492473570067233, "learning_rate": 1.1760350284896876e-06, "loss": 0.388, "mean_token_accuracy": 0.864149815402925, "num_tokens": 256765173.0, "step": 597 }, { "entropy": 0.418212890625, "epoch": 2.373015873015873, "grad_norm": 0.7898920278559984, "learning_rate": 1.1619766011835832e-06, "loss": 0.3797, "mean_token_accuracy": 0.8674542000517249, "num_tokens": 257185152.0, "step": 598 }, { "entropy": 0.415191650390625, "epoch": 2.376984126984127, "grad_norm": 0.8450780742867222, "learning_rate": 1.1479916519545326e-06, "loss": 0.3903, "mean_token_accuracy": 0.8624427672475576, "num_tokens": 257627732.0, "step": 599 }, { "entropy": 0.414825439453125, "epoch": 2.380952380952381, "grad_norm": 0.8928696413952878, "learning_rate": 1.1340804485411783e-06, "loss": 0.3917, "mean_token_accuracy": 0.8615064565092325, "num_tokens": 258067282.0, "step": 600 }, { "entropy": 0.421417236328125, "epoch": 2.384920634920635, "grad_norm": 0.9596298099931699, "learning_rate": 1.1202432572703176e-06, "loss": 0.396, "mean_token_accuracy": 0.8607813809067011, "num_tokens": 258491168.0, "step": 601 }, { "entropy": 0.412139892578125, "epoch": 2.388888888888889, "grad_norm": 0.8015642815814561, "learning_rate": 1.1064803430518002e-06, "loss": 0.3919, "mean_token_accuracy": 0.8602419178932905, "num_tokens": 258944016.0, "step": 602 }, { "entropy": 0.419189453125, "epoch": 2.392857142857143, "grad_norm": 0.83893313171213, "learning_rate": 1.0927919693734618e-06, "loss": 0.3941, "mean_token_accuracy": 0.8623963864520192, "num_tokens": 259379493.0, "step": 603 }, { "entropy": 0.421142578125, "epoch": 2.3968253968253967, "grad_norm": 0.806191116058063, "learning_rate": 1.0791783982960736e-06, "loss": 0.3875, "mean_token_accuracy": 0.8618775270879269, "num_tokens": 259808268.0, "step": 604 }, { "entropy": 0.412872314453125, "epoch": 2.4007936507936507, "grad_norm": 0.8986481499489538, "learning_rate": 1.0656398904483312e-06, "loss": 0.395, "mean_token_accuracy": 0.8624038007110357, "num_tokens": 260247659.0, "step": 605 }, { "entropy": 0.41680908203125, "epoch": 2.4047619047619047, "grad_norm": 0.9536388176335355, "learning_rate": 1.0521767050218562e-06, "loss": 0.4001, "mean_token_accuracy": 0.860544073395431, "num_tokens": 260684292.0, "step": 606 }, { "entropy": 0.416168212890625, "epoch": 2.4087301587301586, "grad_norm": 0.8770578300353563, "learning_rate": 1.0387890997662443e-06, "loss": 0.3945, "mean_token_accuracy": 0.8609949657693505, "num_tokens": 261121173.0, "step": 607 }, { "entropy": 0.42376708984375, "epoch": 2.4126984126984126, "grad_norm": 0.8910554686210177, "learning_rate": 1.0254773309841277e-06, "loss": 0.3967, "mean_token_accuracy": 0.8618429079651833, "num_tokens": 261555918.0, "step": 608 }, { "entropy": 0.417755126953125, "epoch": 2.4166666666666665, "grad_norm": 0.777450132911365, "learning_rate": 1.012241653526263e-06, "loss": 0.3946, "mean_token_accuracy": 0.8610922154039145, "num_tokens": 262000331.0, "step": 609 }, { "entropy": 0.42059326171875, "epoch": 2.4206349206349205, "grad_norm": 0.8219463383506274, "learning_rate": 9.990823207866578e-07, "loss": 0.386, "mean_token_accuracy": 0.8632347630336881, "num_tokens": 262425946.0, "step": 610 }, { "entropy": 0.42413330078125, "epoch": 2.4246031746031744, "grad_norm": 0.9964330808029446, "learning_rate": 9.85999584697716e-07, "loss": 0.3892, "mean_token_accuracy": 0.8625029819086194, "num_tokens": 262853210.0, "step": 611 }, { "entropy": 0.42291259765625, "epoch": 2.4285714285714284, "grad_norm": 0.8278237327212594, "learning_rate": 9.729936957254165e-07, "loss": 0.3822, "mean_token_accuracy": 0.864779950119555, "num_tokens": 263268966.0, "step": 612 }, { "entropy": 0.41943359375, "epoch": 2.432539682539683, "grad_norm": 0.9884237647568829, "learning_rate": 9.600649028645215e-07, "loss": 0.3933, "mean_token_accuracy": 0.8612792957574129, "num_tokens": 263709583.0, "step": 613 }, { "entropy": 0.418853759765625, "epoch": 2.4365079365079367, "grad_norm": 0.9015756745222828, "learning_rate": 9.472134536338007e-07, "loss": 0.3859, "mean_token_accuracy": 0.8643078990280628, "num_tokens": 264137961.0, "step": 614 }, { "entropy": 0.411834716796875, "epoch": 2.4404761904761907, "grad_norm": 0.8083110921800731, "learning_rate": 9.344395940713009e-07, "loss": 0.3905, "mean_token_accuracy": 0.8626386728137732, "num_tokens": 264579578.0, "step": 615 }, { "entropy": 0.42529296875, "epoch": 2.4444444444444446, "grad_norm": 0.8539196851499317, "learning_rate": 9.217435687296305e-07, "loss": 0.3889, "mean_token_accuracy": 0.8617231827229261, "num_tokens": 264995910.0, "step": 616 }, { "entropy": 0.4136962890625, "epoch": 2.4484126984126986, "grad_norm": 0.7995502674271355, "learning_rate": 9.091256206712812e-07, "loss": 0.3925, "mean_token_accuracy": 0.8612663270905614, "num_tokens": 265440836.0, "step": 617 }, { "entropy": 0.42041015625, "epoch": 2.4523809523809526, "grad_norm": 0.8157462797571775, "learning_rate": 8.965859914639724e-07, "loss": 0.3832, "mean_token_accuracy": 0.8630803981795907, "num_tokens": 265867518.0, "step": 618 }, { "entropy": 0.420501708984375, "epoch": 2.4563492063492065, "grad_norm": 0.9573151028277197, "learning_rate": 8.841249211760272e-07, "loss": 0.4006, "mean_token_accuracy": 0.8605411788448691, "num_tokens": 266304569.0, "step": 619 }, { "entropy": 0.416839599609375, "epoch": 2.4603174603174605, "grad_norm": 0.8389660650593388, "learning_rate": 8.717426483717762e-07, "loss": 0.3843, "mean_token_accuracy": 0.8629998695105314, "num_tokens": 266730039.0, "step": 620 }, { "entropy": 0.4168701171875, "epoch": 2.4642857142857144, "grad_norm": 0.8408327374770237, "learning_rate": 8.594394101069897e-07, "loss": 0.4009, "mean_token_accuracy": 0.8605172112584114, "num_tokens": 267169815.0, "step": 621 }, { "entropy": 0.41571044921875, "epoch": 2.4682539682539684, "grad_norm": 0.8011141591258287, "learning_rate": 8.472154419243411e-07, "loss": 0.3918, "mean_token_accuracy": 0.8619374986737967, "num_tokens": 267605673.0, "step": 622 }, { "entropy": 0.41705322265625, "epoch": 2.4722222222222223, "grad_norm": 0.8434082500134104, "learning_rate": 8.350709778488941e-07, "loss": 0.4014, "mean_token_accuracy": 0.8600445203483105, "num_tokens": 268044360.0, "step": 623 }, { "entropy": 0.41815185546875, "epoch": 2.4761904761904763, "grad_norm": 0.8019659782743609, "learning_rate": 8.230062503836278e-07, "loss": 0.3937, "mean_token_accuracy": 0.8604294890537858, "num_tokens": 268470812.0, "step": 624 }, { "entropy": 0.417449951171875, "epoch": 2.4801587301587302, "grad_norm": 0.8264347297639569, "learning_rate": 8.110214905049802e-07, "loss": 0.3965, "mean_token_accuracy": 0.8575309114530683, "num_tokens": 268895281.0, "step": 625 }, { "entropy": 0.414459228515625, "epoch": 2.484126984126984, "grad_norm": 0.7888506972306255, "learning_rate": 7.991169276584281e-07, "loss": 0.3807, "mean_token_accuracy": 0.8645908059552312, "num_tokens": 269329768.0, "step": 626 }, { "entropy": 0.41363525390625, "epoch": 2.488095238095238, "grad_norm": 0.8377743907107998, "learning_rate": 7.872927897540944e-07, "loss": 0.3948, "mean_token_accuracy": 0.8611715780571103, "num_tokens": 269763538.0, "step": 627 }, { "entropy": 0.419525146484375, "epoch": 2.492063492063492, "grad_norm": 0.7713110745405427, "learning_rate": 7.75549303162384e-07, "loss": 0.3945, "mean_token_accuracy": 0.8595996387302876, "num_tokens": 270192672.0, "step": 628 }, { "entropy": 0.418792724609375, "epoch": 2.496031746031746, "grad_norm": 0.8447629896166373, "learning_rate": 7.638866927096555e-07, "loss": 0.4074, "mean_token_accuracy": 0.8587245307862759, "num_tokens": 270633240.0, "step": 629 }, { "entropy": 0.419891357421875, "epoch": 2.5, "grad_norm": 0.7852993278058601, "learning_rate": 7.523051816739074e-07, "loss": 0.3859, "mean_token_accuracy": 0.8630366576835513, "num_tokens": 271053623.0, "step": 630 }, { "entropy": 0.41510009765625, "epoch": 2.503968253968254, "grad_norm": 0.8426473805113363, "learning_rate": 7.408049917805104e-07, "loss": 0.3881, "mean_token_accuracy": 0.8630319554358721, "num_tokens": 271492583.0, "step": 631 }, { "entropy": 0.41632080078125, "epoch": 2.507936507936508, "grad_norm": 0.8529237472508443, "learning_rate": 7.293863431979619e-07, "loss": 0.395, "mean_token_accuracy": 0.861218343488872, "num_tokens": 271921985.0, "step": 632 }, { "entropy": 0.42822265625, "epoch": 2.511904761904762, "grad_norm": 0.7740038021053262, "learning_rate": 7.180494545336642e-07, "loss": 0.3874, "mean_token_accuracy": 0.8652349133044481, "num_tokens": 272349367.0, "step": 633 }, { "entropy": 0.426361083984375, "epoch": 2.515873015873016, "grad_norm": 0.9109105967855416, "learning_rate": 7.067945428297524e-07, "loss": 0.3976, "mean_token_accuracy": 0.8593434160575271, "num_tokens": 272757706.0, "step": 634 }, { "entropy": 0.424713134765625, "epoch": 2.5198412698412698, "grad_norm": 0.8510388770912337, "learning_rate": 6.956218235589263e-07, "loss": 0.3872, "mean_token_accuracy": 0.8625729326158762, "num_tokens": 273178323.0, "step": 635 }, { "entropy": 0.420318603515625, "epoch": 2.5238095238095237, "grad_norm": 0.8277629227526272, "learning_rate": 6.845315106203327e-07, "loss": 0.3868, "mean_token_accuracy": 0.8626482058316469, "num_tokens": 273603268.0, "step": 636 }, { "entropy": 0.418853759765625, "epoch": 2.5277777777777777, "grad_norm": 0.8202191768752707, "learning_rate": 6.735238163354669e-07, "loss": 0.3847, "mean_token_accuracy": 0.8641904015094042, "num_tokens": 274036335.0, "step": 637 }, { "entropy": 0.418914794921875, "epoch": 2.5317460317460316, "grad_norm": 0.8647875520943077, "learning_rate": 6.625989514441089e-07, "loss": 0.3925, "mean_token_accuracy": 0.8626054916530848, "num_tokens": 274458735.0, "step": 638 }, { "entropy": 0.412353515625, "epoch": 2.5357142857142856, "grad_norm": 0.7982027347968378, "learning_rate": 6.517571251002896e-07, "loss": 0.393, "mean_token_accuracy": 0.8614260852336884, "num_tokens": 274909982.0, "step": 639 }, { "entropy": 0.42431640625, "epoch": 2.5396825396825395, "grad_norm": 0.8307645924294975, "learning_rate": 6.40998544868287e-07, "loss": 0.3889, "mean_token_accuracy": 0.8601001044735312, "num_tokens": 275320028.0, "step": 640 }, { "entropy": 0.417816162109375, "epoch": 2.5436507936507935, "grad_norm": 0.8430698509853944, "learning_rate": 6.3032341671865e-07, "loss": 0.386, "mean_token_accuracy": 0.8654862614348531, "num_tokens": 275726848.0, "step": 641 }, { "entropy": 0.413848876953125, "epoch": 2.5476190476190474, "grad_norm": 0.8421768209102014, "learning_rate": 6.197319450242562e-07, "loss": 0.3867, "mean_token_accuracy": 0.8631602311506867, "num_tokens": 276151262.0, "step": 642 }, { "entropy": 0.417266845703125, "epoch": 2.5515873015873014, "grad_norm": 0.8929748589387052, "learning_rate": 6.092243325564007e-07, "loss": 0.3924, "mean_token_accuracy": 0.8615100616589189, "num_tokens": 276568860.0, "step": 643 }, { "entropy": 0.41387939453125, "epoch": 2.5555555555555554, "grad_norm": 0.8040672513690313, "learning_rate": 5.98800780480912e-07, "loss": 0.3858, "mean_token_accuracy": 0.8625959139317274, "num_tokens": 276997327.0, "step": 644 }, { "entropy": 0.421234130859375, "epoch": 2.5595238095238093, "grad_norm": 0.7855164537605119, "learning_rate": 5.884614883543027e-07, "loss": 0.394, "mean_token_accuracy": 0.8626839118078351, "num_tokens": 277426196.0, "step": 645 }, { "entropy": 0.4188232421875, "epoch": 2.5634920634920633, "grad_norm": 0.7843681767955034, "learning_rate": 5.782066541199471e-07, "loss": 0.3946, "mean_token_accuracy": 0.8629313539713621, "num_tokens": 277849848.0, "step": 646 }, { "entropy": 0.412078857421875, "epoch": 2.567460317460317, "grad_norm": 0.8561623782562832, "learning_rate": 5.680364741042926e-07, "loss": 0.3811, "mean_token_accuracy": 0.8668704703450203, "num_tokens": 278289888.0, "step": 647 }, { "entropy": 0.414276123046875, "epoch": 2.571428571428571, "grad_norm": 0.8147935679041525, "learning_rate": 5.579511430131018e-07, "loss": 0.3872, "mean_token_accuracy": 0.8630826137959957, "num_tokens": 278726761.0, "step": 648 }, { "entropy": 0.418182373046875, "epoch": 2.575396825396825, "grad_norm": 0.796874369891308, "learning_rate": 5.479508539277229e-07, "loss": 0.3801, "mean_token_accuracy": 0.8660026481375098, "num_tokens": 279136759.0, "step": 649 }, { "entropy": 0.416351318359375, "epoch": 2.5793650793650795, "grad_norm": 0.8223574515325844, "learning_rate": 5.380357983013962e-07, "loss": 0.392, "mean_token_accuracy": 0.8621972808614373, "num_tokens": 279572082.0, "step": 650 }, { "entropy": 0.415252685546875, "epoch": 2.5833333333333335, "grad_norm": 0.8419256563918806, "learning_rate": 5.282061659555854e-07, "loss": 0.3957, "mean_token_accuracy": 0.8606690457090735, "num_tokens": 279994957.0, "step": 651 }, { "entropy": 0.415008544921875, "epoch": 2.5873015873015874, "grad_norm": 0.8001543694338792, "learning_rate": 5.184621450763455e-07, "loss": 0.3819, "mean_token_accuracy": 0.8638613997027278, "num_tokens": 280414468.0, "step": 652 }, { "entropy": 0.41876220703125, "epoch": 2.5912698412698414, "grad_norm": 0.8281488407232048, "learning_rate": 5.088039222107205e-07, "loss": 0.405, "mean_token_accuracy": 0.8599689844995737, "num_tokens": 280832145.0, "step": 653 }, { "entropy": 0.420440673828125, "epoch": 2.5952380952380953, "grad_norm": 0.8401133410984405, "learning_rate": 4.992316822631693e-07, "loss": 0.3815, "mean_token_accuracy": 0.8656142996624112, "num_tokens": 281237288.0, "step": 654 }, { "entropy": 0.412689208984375, "epoch": 2.5992063492063493, "grad_norm": 0.806223122436009, "learning_rate": 4.897456084920282e-07, "loss": 0.3862, "mean_token_accuracy": 0.8658296698704362, "num_tokens": 281692258.0, "step": 655 }, { "entropy": 0.416168212890625, "epoch": 2.6031746031746033, "grad_norm": 0.8396062477724346, "learning_rate": 4.803458825060042e-07, "loss": 0.3763, "mean_token_accuracy": 0.8662013709545135, "num_tokens": 282118057.0, "step": 656 }, { "entropy": 0.412261962890625, "epoch": 2.607142857142857, "grad_norm": 0.825509139511018, "learning_rate": 4.710326842606927e-07, "loss": 0.3987, "mean_token_accuracy": 0.8584959087893367, "num_tokens": 282582066.0, "step": 657 }, { "entropy": 0.40606689453125, "epoch": 2.611111111111111, "grad_norm": 1.080095799468803, "learning_rate": 4.618061920551381e-07, "loss": 0.3936, "mean_token_accuracy": 0.8631810490041971, "num_tokens": 283028330.0, "step": 658 }, { "entropy": 0.42547607421875, "epoch": 2.615079365079365, "grad_norm": 0.8441240019764062, "learning_rate": 4.526665825284132e-07, "loss": 0.3936, "mean_token_accuracy": 0.8619779404252768, "num_tokens": 283436768.0, "step": 659 }, { "entropy": 0.41748046875, "epoch": 2.619047619047619, "grad_norm": 0.8263929571280181, "learning_rate": 4.4361403065624475e-07, "loss": 0.3864, "mean_token_accuracy": 0.8627992533147335, "num_tokens": 283866607.0, "step": 660 }, { "entropy": 0.4234619140625, "epoch": 2.623015873015873, "grad_norm": 0.844367472303199, "learning_rate": 4.3464870974766314e-07, "loss": 0.4004, "mean_token_accuracy": 0.8607617728412151, "num_tokens": 284281791.0, "step": 661 }, { "entropy": 0.419158935546875, "epoch": 2.626984126984127, "grad_norm": 0.8571993055017914, "learning_rate": 4.257707914416781e-07, "loss": 0.3874, "mean_token_accuracy": 0.8635092154145241, "num_tokens": 284705319.0, "step": 662 }, { "entropy": 0.417938232421875, "epoch": 2.630952380952381, "grad_norm": 0.7780232105885654, "learning_rate": 4.169804457039972e-07, "loss": 0.4086, "mean_token_accuracy": 0.8589063184335828, "num_tokens": 285154313.0, "step": 663 }, { "entropy": 0.413238525390625, "epoch": 2.634920634920635, "grad_norm": 0.850893830182736, "learning_rate": 4.082778408237731e-07, "loss": 0.4007, "mean_token_accuracy": 0.8592528942972422, "num_tokens": 285598883.0, "step": 664 }, { "entropy": 0.418487548828125, "epoch": 2.638888888888889, "grad_norm": 1.1283744707185912, "learning_rate": 3.996631434103776e-07, "loss": 0.3977, "mean_token_accuracy": 0.860667590983212, "num_tokens": 286037660.0, "step": 665 }, { "entropy": 0.416961669921875, "epoch": 2.642857142857143, "grad_norm": 0.8944770514716363, "learning_rate": 3.911365183902166e-07, "loss": 0.3898, "mean_token_accuracy": 0.8620567666366696, "num_tokens": 286461446.0, "step": 666 }, { "entropy": 0.419219970703125, "epoch": 2.6468253968253967, "grad_norm": 0.845344585577405, "learning_rate": 3.826981290035692e-07, "loss": 0.3898, "mean_token_accuracy": 0.860666748136282, "num_tokens": 286877023.0, "step": 667 }, { "entropy": 0.422149658203125, "epoch": 2.6507936507936507, "grad_norm": 0.8457306735031688, "learning_rate": 3.7434813680146234e-07, "loss": 0.3895, "mean_token_accuracy": 0.8613977544009686, "num_tokens": 287308399.0, "step": 668 }, { "entropy": 0.412872314453125, "epoch": 2.6547619047619047, "grad_norm": 0.7957237567868245, "learning_rate": 3.6608670164258065e-07, "loss": 0.3906, "mean_token_accuracy": 0.8631431749090552, "num_tokens": 287728804.0, "step": 669 }, { "entropy": 0.411468505859375, "epoch": 2.6587301587301586, "grad_norm": 0.7621184623535802, "learning_rate": 3.5791398169020384e-07, "loss": 0.393, "mean_token_accuracy": 0.8615291966125369, "num_tokens": 288187832.0, "step": 670 }, { "entropy": 0.417144775390625, "epoch": 2.6626984126984126, "grad_norm": 0.8055399962635597, "learning_rate": 3.4983013340918024e-07, "loss": 0.3834, "mean_token_accuracy": 0.8645481085404754, "num_tokens": 288600411.0, "step": 671 }, { "entropy": 0.410888671875, "epoch": 2.6666666666666665, "grad_norm": 0.8440468660994543, "learning_rate": 3.4183531156292913e-07, "loss": 0.394, "mean_token_accuracy": 0.8628778494894505, "num_tokens": 289047051.0, "step": 672 }, { "entropy": 0.417388916015625, "epoch": 2.6706349206349205, "grad_norm": 0.8448761260472664, "learning_rate": 3.3392966921047984e-07, "loss": 0.3932, "mean_token_accuracy": 0.8621304808184505, "num_tokens": 289478039.0, "step": 673 }, { "entropy": 0.4195556640625, "epoch": 2.674603174603175, "grad_norm": 0.826243737430181, "learning_rate": 3.261133577035408e-07, "loss": 0.3992, "mean_token_accuracy": 0.8631375981494784, "num_tokens": 289920851.0, "step": 674 }, { "entropy": 0.41644287109375, "epoch": 2.678571428571429, "grad_norm": 0.7480379642047055, "learning_rate": 3.1838652668360173e-07, "loss": 0.3834, "mean_token_accuracy": 0.8634974956512451, "num_tokens": 290351325.0, "step": 675 }, { "entropy": 0.4146728515625, "epoch": 2.682539682539683, "grad_norm": 0.7830053460618754, "learning_rate": 3.1074932407906823e-07, "loss": 0.3785, "mean_token_accuracy": 0.8657077318057418, "num_tokens": 290766931.0, "step": 676 }, { "entropy": 0.423828125, "epoch": 2.6865079365079367, "grad_norm": 0.7864820739930504, "learning_rate": 3.0320189610243303e-07, "loss": 0.3935, "mean_token_accuracy": 0.8595830434933305, "num_tokens": 291185306.0, "step": 677 }, { "entropy": 0.422698974609375, "epoch": 2.6904761904761907, "grad_norm": 0.7974086017120517, "learning_rate": 2.957443872474713e-07, "loss": 0.3873, "mean_token_accuracy": 0.8635625531896949, "num_tokens": 291599836.0, "step": 678 }, { "entropy": 0.4146728515625, "epoch": 2.6944444444444446, "grad_norm": 0.9412910487910857, "learning_rate": 2.883769402864789e-07, "loss": 0.4001, "mean_token_accuracy": 0.8598026670515537, "num_tokens": 292026507.0, "step": 679 }, { "entropy": 0.41259765625, "epoch": 2.6984126984126986, "grad_norm": 0.763447905049642, "learning_rate": 2.810996962675361e-07, "loss": 0.3903, "mean_token_accuracy": 0.8622291041538119, "num_tokens": 292454972.0, "step": 680 }, { "entropy": 0.419525146484375, "epoch": 2.7023809523809526, "grad_norm": 0.7897795759262028, "learning_rate": 2.739127945118092e-07, "loss": 0.3983, "mean_token_accuracy": 0.8589327791705728, "num_tokens": 292885705.0, "step": 681 }, { "entropy": 0.42181396484375, "epoch": 2.7063492063492065, "grad_norm": 0.7799046098775175, "learning_rate": 2.668163726108841e-07, "loss": 0.3786, "mean_token_accuracy": 0.8630655352026224, "num_tokens": 293307675.0, "step": 682 }, { "entropy": 0.418853759765625, "epoch": 2.7103174603174605, "grad_norm": 0.8041790844592746, "learning_rate": 2.5981056642412796e-07, "loss": 0.3934, "mean_token_accuracy": 0.8626653142273426, "num_tokens": 293722148.0, "step": 683 }, { "entropy": 0.41864013671875, "epoch": 2.7142857142857144, "grad_norm": 0.8076107515114371, "learning_rate": 2.528955100760938e-07, "loss": 0.3858, "mean_token_accuracy": 0.863671412691474, "num_tokens": 294149752.0, "step": 684 }, { "entropy": 0.422821044921875, "epoch": 2.7182539682539684, "grad_norm": 0.7969449363252592, "learning_rate": 2.460713359539474e-07, "loss": 0.3801, "mean_token_accuracy": 0.8654317120090127, "num_tokens": 294555288.0, "step": 685 }, { "entropy": 0.419830322265625, "epoch": 2.7222222222222223, "grad_norm": 0.8753720898977475, "learning_rate": 2.3933817470493445e-07, "loss": 0.3767, "mean_token_accuracy": 0.866040863096714, "num_tokens": 294975614.0, "step": 686 }, { "entropy": 0.412994384765625, "epoch": 2.7261904761904763, "grad_norm": 0.8199718663106975, "learning_rate": 2.3269615523388355e-07, "loss": 0.3918, "mean_token_accuracy": 0.860607554204762, "num_tokens": 295422071.0, "step": 687 }, { "entropy": 0.41558837890625, "epoch": 2.7301587301587302, "grad_norm": 0.79423481793127, "learning_rate": 2.2614540470073276e-07, "loss": 0.3866, "mean_token_accuracy": 0.8644118411466479, "num_tokens": 295846874.0, "step": 688 }, { "entropy": 0.41741943359375, "epoch": 2.734126984126984, "grad_norm": 0.8822693061766851, "learning_rate": 2.1968604851809738e-07, "loss": 0.3866, "mean_token_accuracy": 0.8631517272442579, "num_tokens": 296288822.0, "step": 689 }, { "entropy": 0.410980224609375, "epoch": 2.738095238095238, "grad_norm": 0.7846498188049137, "learning_rate": 2.1331821034886846e-07, "loss": 0.3943, "mean_token_accuracy": 0.8625091454014182, "num_tokens": 296730922.0, "step": 690 }, { "entropy": 0.41143798828125, "epoch": 2.742063492063492, "grad_norm": 0.8213197375371964, "learning_rate": 2.0704201210384634e-07, "loss": 0.3904, "mean_token_accuracy": 0.864051777869463, "num_tokens": 297169723.0, "step": 691 }, { "entropy": 0.414337158203125, "epoch": 2.746031746031746, "grad_norm": 1.2967707502021062, "learning_rate": 2.0085757393940586e-07, "loss": 0.3772, "mean_token_accuracy": 0.8671101154759526, "num_tokens": 297610941.0, "step": 692 }, { "entropy": 0.41839599609375, "epoch": 2.75, "grad_norm": 0.8133921789474006, "learning_rate": 1.9476501425519656e-07, "loss": 0.3833, "mean_token_accuracy": 0.860652013681829, "num_tokens": 298044879.0, "step": 693 }, { "entropy": 0.416778564453125, "epoch": 2.753968253968254, "grad_norm": 0.7897145964186679, "learning_rate": 1.8876444969187557e-07, "loss": 0.3857, "mean_token_accuracy": 0.8620300153270364, "num_tokens": 298464464.0, "step": 694 }, { "entropy": 0.40887451171875, "epoch": 2.757936507936508, "grad_norm": 0.9000724609100704, "learning_rate": 1.828559951288733e-07, "loss": 0.3831, "mean_token_accuracy": 0.8644989216700196, "num_tokens": 298903233.0, "step": 695 }, { "entropy": 0.41595458984375, "epoch": 2.761904761904762, "grad_norm": 0.7955961913020823, "learning_rate": 1.7703976368219633e-07, "loss": 0.3797, "mean_token_accuracy": 0.8666205117478967, "num_tokens": 299315956.0, "step": 696 }, { "entropy": 0.42193603515625, "epoch": 2.765873015873016, "grad_norm": 0.8878478545913352, "learning_rate": 1.713158667022613e-07, "loss": 0.3812, "mean_token_accuracy": 0.8661440145224333, "num_tokens": 299732237.0, "step": 697 }, { "entropy": 0.4156494140625, "epoch": 2.7698412698412698, "grad_norm": 0.931222693724496, "learning_rate": 1.656844137717617e-07, "loss": 0.3924, "mean_token_accuracy": 0.8617311324924231, "num_tokens": 300162540.0, "step": 698 }, { "entropy": 0.42425537109375, "epoch": 2.7738095238095237, "grad_norm": 0.8389789359858054, "learning_rate": 1.601455127035717e-07, "loss": 0.3901, "mean_token_accuracy": 0.8636501645669341, "num_tokens": 300580263.0, "step": 699 }, { "entropy": 0.41375732421875, "epoch": 2.7777777777777777, "grad_norm": 0.8084246826822826, "learning_rate": 1.5469926953868063e-07, "loss": 0.3786, "mean_token_accuracy": 0.8661916004493833, "num_tokens": 301009855.0, "step": 700 }, { "entropy": 0.421539306640625, "epoch": 2.7817460317460316, "grad_norm": 0.7920810255445628, "learning_rate": 1.4934578854416403e-07, "loss": 0.3793, "mean_token_accuracy": 0.8652064045891166, "num_tokens": 301429836.0, "step": 701 }, { "entropy": 0.417236328125, "epoch": 2.7857142857142856, "grad_norm": 0.7557065694262733, "learning_rate": 1.440851722111858e-07, "loss": 0.3775, "mean_token_accuracy": 0.8666085209697485, "num_tokens": 301852351.0, "step": 702 }, { "entropy": 0.418304443359375, "epoch": 2.7896825396825395, "grad_norm": 0.8074299266071946, "learning_rate": 1.389175212530397e-07, "loss": 0.3787, "mean_token_accuracy": 0.8652766114100814, "num_tokens": 302270241.0, "step": 703 }, { "entropy": 0.41082763671875, "epoch": 2.7936507936507935, "grad_norm": 0.8030020009685229, "learning_rate": 1.3384293460321662e-07, "loss": 0.3838, "mean_token_accuracy": 0.8642606223002076, "num_tokens": 302702189.0, "step": 704 }, { "entropy": 0.416748046875, "epoch": 2.7976190476190474, "grad_norm": 0.7647096183455645, "learning_rate": 1.2886150941351317e-07, "loss": 0.3778, "mean_token_accuracy": 0.866962157189846, "num_tokens": 303138996.0, "step": 705 }, { "entropy": 0.422882080078125, "epoch": 2.8015873015873014, "grad_norm": 0.8421868959580608, "learning_rate": 1.2397334105217097e-07, "loss": 0.3868, "mean_token_accuracy": 0.8634527139365673, "num_tokens": 303546117.0, "step": 706 }, { "entropy": 0.4114990234375, "epoch": 2.8055555555555554, "grad_norm": 0.7836818266413413, "learning_rate": 1.1917852310205147e-07, "loss": 0.3866, "mean_token_accuracy": 0.8666229834780097, "num_tokens": 303985054.0, "step": 707 }, { "entropy": 0.413970947265625, "epoch": 2.8095238095238093, "grad_norm": 0.7941927914203842, "learning_rate": 1.1447714735884463e-07, "loss": 0.3854, "mean_token_accuracy": 0.8626468563452363, "num_tokens": 304424136.0, "step": 708 }, { "entropy": 0.41229248046875, "epoch": 2.8134920634920633, "grad_norm": 0.8361515314493303, "learning_rate": 1.0986930382930916e-07, "loss": 0.3881, "mean_token_accuracy": 0.8630633186548948, "num_tokens": 304862181.0, "step": 709 }, { "entropy": 0.415924072265625, "epoch": 2.817460317460317, "grad_norm": 0.8355933592816446, "learning_rate": 1.0535508072955225e-07, "loss": 0.3969, "mean_token_accuracy": 0.8627164475619793, "num_tokens": 305299176.0, "step": 710 }, { "entropy": 0.412994384765625, "epoch": 2.821428571428571, "grad_norm": 0.7563718151503291, "learning_rate": 1.0093456448333872e-07, "loss": 0.3888, "mean_token_accuracy": 0.8606778532266617, "num_tokens": 305755604.0, "step": 711 }, { "entropy": 0.414825439453125, "epoch": 2.825396825396825, "grad_norm": 0.7951191207755064, "learning_rate": 9.660783972043786e-08, "loss": 0.3833, "mean_token_accuracy": 0.862731215544045, "num_tokens": 306180918.0, "step": 712 }, { "entropy": 0.416290283203125, "epoch": 2.8293650793650795, "grad_norm": 0.8170483155607833, "learning_rate": 9.237498927500088e-08, "loss": 0.3962, "mean_token_accuracy": 0.861549130640924, "num_tokens": 306601315.0, "step": 713 }, { "entropy": 0.413970947265625, "epoch": 2.8333333333333335, "grad_norm": 0.7633085069328812, "learning_rate": 8.823609418397939e-08, "loss": 0.3903, "mean_token_accuracy": 0.861748369410634, "num_tokens": 307053855.0, "step": 714 }, { "entropy": 0.4200439453125, "epoch": 2.8373015873015874, "grad_norm": 0.84331828046859, "learning_rate": 8.419123368556991e-08, "loss": 0.3889, "mean_token_accuracy": 0.8638924788683653, "num_tokens": 307466527.0, "step": 715 }, { "entropy": 0.417083740234375, "epoch": 2.8412698412698414, "grad_norm": 1.136155627056708, "learning_rate": 8.024048521769745e-08, "loss": 0.393, "mean_token_accuracy": 0.8619293784722686, "num_tokens": 307908233.0, "step": 716 }, { "entropy": 0.41851806640625, "epoch": 2.8452380952380953, "grad_norm": 0.8624286819149608, "learning_rate": 7.638392441653542e-08, "loss": 0.3815, "mean_token_accuracy": 0.8658701097592711, "num_tokens": 308331740.0, "step": 717 }, { "entropy": 0.417083740234375, "epoch": 2.8492063492063493, "grad_norm": 0.7877113344572971, "learning_rate": 7.262162511505466e-08, "loss": 0.3766, "mean_token_accuracy": 0.8655453082174063, "num_tokens": 308765267.0, "step": 718 }, { "entropy": 0.419647216796875, "epoch": 2.8531746031746033, "grad_norm": 0.8003863832330088, "learning_rate": 6.895365934161236e-08, "loss": 0.3811, "mean_token_accuracy": 0.8642518576234579, "num_tokens": 309177878.0, "step": 719 }, { "entropy": 0.427581787109375, "epoch": 2.857142857142857, "grad_norm": 0.7682606770932064, "learning_rate": 6.538009731857087e-08, "loss": 0.3912, "mean_token_accuracy": 0.8608730277046561, "num_tokens": 309586897.0, "step": 720 }, { "entropy": 0.41351318359375, "epoch": 2.861111111111111, "grad_norm": 0.7503328469812236, "learning_rate": 6.190100746095495e-08, "loss": 0.3831, "mean_token_accuracy": 0.8634521188214421, "num_tokens": 310011953.0, "step": 721 }, { "entropy": 0.416259765625, "epoch": 2.865079365079365, "grad_norm": 0.7649905566318169, "learning_rate": 5.851645637514114e-08, "loss": 0.3851, "mean_token_accuracy": 0.8632787046954036, "num_tokens": 310440896.0, "step": 722 }, { "entropy": 0.417572021484375, "epoch": 2.869047619047619, "grad_norm": 0.9526446373854853, "learning_rate": 5.522650885758374e-08, "loss": 0.3874, "mean_token_accuracy": 0.8621506663039327, "num_tokens": 310867155.0, "step": 723 }, { "entropy": 0.414215087890625, "epoch": 2.873015873015873, "grad_norm": 0.8196077414243489, "learning_rate": 5.203122789357307e-08, "loss": 0.3768, "mean_token_accuracy": 0.8689231360331178, "num_tokens": 311297562.0, "step": 724 }, { "entropy": 0.41143798828125, "epoch": 2.876984126984127, "grad_norm": 0.8133318950674621, "learning_rate": 4.893067465602863e-08, "loss": 0.397, "mean_token_accuracy": 0.8604372851550579, "num_tokens": 311750163.0, "step": 725 }, { "entropy": 0.413116455078125, "epoch": 2.880952380952381, "grad_norm": 0.7955536166222298, "learning_rate": 4.5924908504331735e-08, "loss": 0.3949, "mean_token_accuracy": 0.8633811613544822, "num_tokens": 312189513.0, "step": 726 }, { "entropy": 0.41363525390625, "epoch": 2.884920634920635, "grad_norm": 0.7572897725302501, "learning_rate": 4.3013986983184705e-08, "loss": 0.3854, "mean_token_accuracy": 0.8645169893279672, "num_tokens": 312626002.0, "step": 727 }, { "entropy": 0.41668701171875, "epoch": 2.888888888888889, "grad_norm": 0.7716288372854705, "learning_rate": 4.019796582151181e-08, "loss": 0.3876, "mean_token_accuracy": 0.862118998542428, "num_tokens": 313055977.0, "step": 728 }, { "entropy": 0.415802001953125, "epoch": 2.892857142857143, "grad_norm": 0.7713232614880875, "learning_rate": 3.747689893139228e-08, "loss": 0.3854, "mean_token_accuracy": 0.8622197173535824, "num_tokens": 313491837.0, "step": 729 }, { "entropy": 0.416229248046875, "epoch": 2.8968253968253967, "grad_norm": 0.7395778636476764, "learning_rate": 3.4850838407027297e-08, "loss": 0.3979, "mean_token_accuracy": 0.8619843171909451, "num_tokens": 313934488.0, "step": 730 }, { "entropy": 0.418701171875, "epoch": 2.9007936507936507, "grad_norm": 0.7612522506605197, "learning_rate": 3.2319834523742435e-08, "loss": 0.383, "mean_token_accuracy": 0.8652700930833817, "num_tokens": 314354323.0, "step": 731 }, { "entropy": 0.414703369140625, "epoch": 2.9047619047619047, "grad_norm": 0.7974226574359506, "learning_rate": 2.988393573702675e-08, "loss": 0.3946, "mean_token_accuracy": 0.8615053938701749, "num_tokens": 314782888.0, "step": 732 }, { "entropy": 0.41558837890625, "epoch": 2.9087301587301586, "grad_norm": 0.8043577847187172, "learning_rate": 2.754318868160244e-08, "loss": 0.3836, "mean_token_accuracy": 0.8651288328692317, "num_tokens": 315218545.0, "step": 733 }, { "entropy": 0.409698486328125, "epoch": 2.9126984126984126, "grad_norm": 0.7532207153914081, "learning_rate": 2.5297638170535542e-08, "loss": 0.3768, "mean_token_accuracy": 0.8678219076246023, "num_tokens": 315667111.0, "step": 734 }, { "entropy": 0.423553466796875, "epoch": 2.9166666666666665, "grad_norm": 0.836022190361025, "learning_rate": 2.31473271943744e-08, "loss": 0.3848, "mean_token_accuracy": 0.8609532006084919, "num_tokens": 316078411.0, "step": 735 }, { "entropy": 0.4132080078125, "epoch": 2.9206349206349205, "grad_norm": 0.7845153141259297, "learning_rate": 2.109229692032977e-08, "loss": 0.3894, "mean_token_accuracy": 0.8628736371174455, "num_tokens": 316519645.0, "step": 736 }, { "entropy": 0.418060302734375, "epoch": 2.924603174603175, "grad_norm": 0.7817679617183846, "learning_rate": 1.9132586691484323e-08, "loss": 0.3889, "mean_token_accuracy": 0.8628808334469795, "num_tokens": 316948710.0, "step": 737 }, { "entropy": 0.417327880859375, "epoch": 2.928571428571429, "grad_norm": 0.7903748369093285, "learning_rate": 1.7268234026041053e-08, "loss": 0.3836, "mean_token_accuracy": 0.8664052626118064, "num_tokens": 317376914.0, "step": 738 }, { "entropy": 0.416656494140625, "epoch": 2.932539682539683, "grad_norm": 0.8013586144607203, "learning_rate": 1.5499274616602723e-08, "loss": 0.3819, "mean_token_accuracy": 0.8635408999398351, "num_tokens": 317783718.0, "step": 739 }, { "entropy": 0.4195556640625, "epoch": 2.9365079365079367, "grad_norm": 1.0799029570111842, "learning_rate": 1.3825742329492408e-08, "loss": 0.3976, "mean_token_accuracy": 0.8611190365627408, "num_tokens": 318210944.0, "step": 740 }, { "entropy": 0.41424560546875, "epoch": 2.9404761904761907, "grad_norm": 0.8046949464266854, "learning_rate": 1.2247669204100699e-08, "loss": 0.3972, "mean_token_accuracy": 0.8586803553625941, "num_tokens": 318636758.0, "step": 741 }, { "entropy": 0.412384033203125, "epoch": 2.9444444444444446, "grad_norm": 0.7492264758448205, "learning_rate": 1.0765085452275614e-08, "loss": 0.381, "mean_token_accuracy": 0.8640262456610799, "num_tokens": 319072977.0, "step": 742 }, { "entropy": 0.41888427734375, "epoch": 2.9484126984126986, "grad_norm": 0.8576996327472883, "learning_rate": 9.378019457743082e-09, "loss": 0.3825, "mean_token_accuracy": 0.8635032856836915, "num_tokens": 319491098.0, "step": 743 }, { "entropy": 0.41156005859375, "epoch": 2.9523809523809526, "grad_norm": 0.8654625317173439, "learning_rate": 8.086497775562918e-09, "loss": 0.3974, "mean_token_accuracy": 0.8603286230936646, "num_tokens": 319936836.0, "step": 744 }, { "entropy": 0.4107666015625, "epoch": 2.9563492063492065, "grad_norm": 0.8355009320099618, "learning_rate": 6.890545131621462e-09, "loss": 0.3898, "mean_token_accuracy": 0.8626588368788362, "num_tokens": 320379711.0, "step": 745 }, { "entropy": 0.415008544921875, "epoch": 2.9603174603174605, "grad_norm": 0.769717293692382, "learning_rate": 5.790184422158063e-09, "loss": 0.3848, "mean_token_accuracy": 0.8654471961781383, "num_tokens": 320807541.0, "step": 746 }, { "entropy": 0.41558837890625, "epoch": 2.9642857142857144, "grad_norm": 1.2627956434015182, "learning_rate": 4.785436713324876e-09, "loss": 0.3896, "mean_token_accuracy": 0.8639781204983592, "num_tokens": 321249293.0, "step": 747 }, { "entropy": 0.417449951171875, "epoch": 2.9682539682539684, "grad_norm": 0.7877216343272654, "learning_rate": 3.876321240786629e-09, "loss": 0.385, "mean_token_accuracy": 0.8626599637791514, "num_tokens": 321676330.0, "step": 748 }, { "entropy": 0.41473388671875, "epoch": 2.9722222222222223, "grad_norm": 0.8047350818501147, "learning_rate": 3.062855409350918e-09, "loss": 0.3786, "mean_token_accuracy": 0.8666502619162202, "num_tokens": 322088977.0, "step": 749 }, { "entropy": 0.413360595703125, "epoch": 2.9761904761904763, "grad_norm": 0.8214751044608514, "learning_rate": 2.345054792634027e-09, "loss": 0.3863, "mean_token_accuracy": 0.8647614009678364, "num_tokens": 322526195.0, "step": 750 }, { "entropy": 0.42340087890625, "epoch": 2.9801587301587302, "grad_norm": 0.7827821829891451, "learning_rate": 1.7229331327633935e-09, "loss": 0.3884, "mean_token_accuracy": 0.8638850962743163, "num_tokens": 322944491.0, "step": 751 }, { "entropy": 0.415924072265625, "epoch": 2.984126984126984, "grad_norm": 0.8044240226452957, "learning_rate": 1.1965023401161457e-09, "loss": 0.3955, "mean_token_accuracy": 0.860909391194582, "num_tokens": 323373321.0, "step": 752 }, { "entropy": 0.415069580078125, "epoch": 2.988095238095238, "grad_norm": 0.7711137719538177, "learning_rate": 7.657724930887344e-10, "loss": 0.3878, "mean_token_accuracy": 0.8620332898572087, "num_tokens": 323803097.0, "step": 753 }, { "entropy": 0.413604736328125, "epoch": 2.992063492063492, "grad_norm": 0.7417880663145604, "learning_rate": 4.3075183790541875e-10, "loss": 0.3781, "mean_token_accuracy": 0.865508021786809, "num_tokens": 324241487.0, "step": 754 }, { "entropy": 0.412139892578125, "epoch": 2.996031746031746, "grad_norm": 0.8400409360541771, "learning_rate": 1.9144678845950393e-10, "loss": 0.3963, "mean_token_accuracy": 0.8602159256115556, "num_tokens": 324694260.0, "step": 755 }, { "entropy": 0.41937255859375, "epoch": 3.0, "grad_norm": 0.8242408910754981, "learning_rate": 4.786192619121721e-11, "loss": 0.3878, "mean_token_accuracy": 0.863866476342082, "num_tokens": 325114310.0, "step": 756 }, { "epoch": 3.0, "step": 756, "total_flos": 601237772369920.0, "train_loss": 0.4835385761011845, "train_runtime": 57894.1544, "train_samples_per_second": 1.272, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 756, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 601237772369920.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }