{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 762, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.57855224609375, "epoch": 0.003937007874015748, "grad_norm": 6.017421193838227, "learning_rate": 0.0, "loss": 1.3958, "mean_token_accuracy": 0.6527928654104471, "num_tokens": 405091.0, "step": 1 }, { "entropy": 0.57177734375, "epoch": 0.007874015748031496, "grad_norm": 5.993264916340701, "learning_rate": 5.128205128205128e-07, "loss": 1.4236, "mean_token_accuracy": 0.6467109536752105, "num_tokens": 829295.0, "step": 2 }, { "entropy": 0.586273193359375, "epoch": 0.011811023622047244, "grad_norm": 5.967015920534343, "learning_rate": 1.0256410256410257e-06, "loss": 1.4081, "mean_token_accuracy": 0.6507676001638174, "num_tokens": 1234884.0, "step": 3 }, { "entropy": 0.575836181640625, "epoch": 0.015748031496062992, "grad_norm": 5.845522648758348, "learning_rate": 1.5384615384615387e-06, "loss": 1.3926, "mean_token_accuracy": 0.6580017423257232, "num_tokens": 1629773.0, "step": 4 }, { "entropy": 0.56964111328125, "epoch": 0.01968503937007874, "grad_norm": 5.790573559158371, "learning_rate": 2.0512820512820513e-06, "loss": 1.3868, "mean_token_accuracy": 0.657335345633328, "num_tokens": 2041925.0, "step": 5 }, { "entropy": 0.56451416015625, "epoch": 0.023622047244094488, "grad_norm": 5.568170796627478, "learning_rate": 2.564102564102564e-06, "loss": 1.3857, "mean_token_accuracy": 0.6550841787829995, "num_tokens": 2489592.0, "step": 6 }, { "entropy": 0.568939208984375, "epoch": 0.027559055118110236, "grad_norm": 5.1196996584808865, "learning_rate": 3.0769230769230774e-06, "loss": 1.3463, "mean_token_accuracy": 0.6607445329427719, "num_tokens": 2911373.0, "step": 7 }, { "entropy": 0.573883056640625, "epoch": 0.031496062992125984, "grad_norm": 4.493954294279661, "learning_rate": 3.58974358974359e-06, "loss": 1.3163, "mean_token_accuracy": 0.6666090982034802, "num_tokens": 3345237.0, "step": 8 }, { "entropy": 0.582183837890625, "epoch": 0.03543307086614173, "grad_norm": 4.315469270405331, "learning_rate": 4.102564102564103e-06, "loss": 1.2769, "mean_token_accuracy": 0.6745583917945623, "num_tokens": 3754911.0, "step": 9 }, { "entropy": 0.564117431640625, "epoch": 0.03937007874015748, "grad_norm": 3.6044990712171083, "learning_rate": 4.615384615384616e-06, "loss": 1.1654, "mean_token_accuracy": 0.6969763962551951, "num_tokens": 4180069.0, "step": 10 }, { "entropy": 0.5650634765625, "epoch": 0.04330708661417323, "grad_norm": 3.34006760349023, "learning_rate": 5.128205128205128e-06, "loss": 1.1559, "mean_token_accuracy": 0.6937189754098654, "num_tokens": 4610750.0, "step": 11 }, { "entropy": 0.55633544921875, "epoch": 0.047244094488188976, "grad_norm": 3.2756727794133242, "learning_rate": 5.641025641025641e-06, "loss": 1.1369, "mean_token_accuracy": 0.6968221105635166, "num_tokens": 5055896.0, "step": 12 }, { "entropy": 0.53607177734375, "epoch": 0.051181102362204724, "grad_norm": 4.3300668215494555, "learning_rate": 6.153846153846155e-06, "loss": 1.0162, "mean_token_accuracy": 0.720275528728962, "num_tokens": 5468686.0, "step": 13 }, { "entropy": 0.54345703125, "epoch": 0.05511811023622047, "grad_norm": 4.5925042609040645, "learning_rate": 6.666666666666667e-06, "loss": 1.0286, "mean_token_accuracy": 0.7187379905954003, "num_tokens": 5886307.0, "step": 14 }, { "entropy": 0.539398193359375, "epoch": 0.05905511811023622, "grad_norm": 3.8670134026701626, "learning_rate": 7.17948717948718e-06, "loss": 0.9893, "mean_token_accuracy": 0.7269825097173452, "num_tokens": 6325637.0, "step": 15 }, { "entropy": 0.538116455078125, "epoch": 0.06299212598425197, "grad_norm": 3.3424168128794434, "learning_rate": 7.692307692307694e-06, "loss": 0.9672, "mean_token_accuracy": 0.7305122185498476, "num_tokens": 6766914.0, "step": 16 }, { "entropy": 0.55633544921875, "epoch": 0.06692913385826772, "grad_norm": 3.7699936932143077, "learning_rate": 8.205128205128205e-06, "loss": 0.9261, "mean_token_accuracy": 0.7375357151031494, "num_tokens": 7185216.0, "step": 17 }, { "entropy": 0.5638427734375, "epoch": 0.07086614173228346, "grad_norm": 4.148747463938917, "learning_rate": 8.717948717948719e-06, "loss": 0.9387, "mean_token_accuracy": 0.7338155778124928, "num_tokens": 7596226.0, "step": 18 }, { "entropy": 0.5389404296875, "epoch": 0.07480314960629922, "grad_norm": 4.006513472613672, "learning_rate": 9.230769230769232e-06, "loss": 0.907, "mean_token_accuracy": 0.7441706955432892, "num_tokens": 8031675.0, "step": 19 }, { "entropy": 0.550994873046875, "epoch": 0.07874015748031496, "grad_norm": 3.0586688382376694, "learning_rate": 9.743589743589744e-06, "loss": 0.8915, "mean_token_accuracy": 0.7450471529737115, "num_tokens": 8463875.0, "step": 20 }, { "entropy": 0.5362548828125, "epoch": 0.08267716535433071, "grad_norm": 3.010274397022863, "learning_rate": 1.0256410256410256e-05, "loss": 0.8649, "mean_token_accuracy": 0.749404520727694, "num_tokens": 8891456.0, "step": 21 }, { "entropy": 0.534088134765625, "epoch": 0.08661417322834646, "grad_norm": 2.5938467957096667, "learning_rate": 1.076923076923077e-05, "loss": 0.8574, "mean_token_accuracy": 0.7520366236567497, "num_tokens": 9346156.0, "step": 22 }, { "entropy": 0.542938232421875, "epoch": 0.09055118110236221, "grad_norm": 2.2126973604191456, "learning_rate": 1.1282051282051283e-05, "loss": 0.8397, "mean_token_accuracy": 0.7569433562457561, "num_tokens": 9774883.0, "step": 23 }, { "entropy": 0.53131103515625, "epoch": 0.09448818897637795, "grad_norm": 2.6706476920762214, "learning_rate": 1.1794871794871796e-05, "loss": 0.8108, "mean_token_accuracy": 0.7596273683011532, "num_tokens": 10201146.0, "step": 24 }, { "entropy": 0.537384033203125, "epoch": 0.0984251968503937, "grad_norm": 2.2269608876043154, "learning_rate": 1.230769230769231e-05, "loss": 0.7871, "mean_token_accuracy": 0.7649025870487094, "num_tokens": 10610203.0, "step": 25 }, { "entropy": 0.531219482421875, "epoch": 0.10236220472440945, "grad_norm": 2.170252611622614, "learning_rate": 1.2820512820512823e-05, "loss": 0.7906, "mean_token_accuracy": 0.7660603849217296, "num_tokens": 11068101.0, "step": 26 }, { "entropy": 0.52508544921875, "epoch": 0.1062992125984252, "grad_norm": 2.242670995581003, "learning_rate": 1.3333333333333333e-05, "loss": 0.7748, "mean_token_accuracy": 0.7676612958312035, "num_tokens": 11493402.0, "step": 27 }, { "entropy": 0.513397216796875, "epoch": 0.11023622047244094, "grad_norm": 1.7608050419499566, "learning_rate": 1.3846153846153847e-05, "loss": 0.7437, "mean_token_accuracy": 0.7780561083927751, "num_tokens": 11933204.0, "step": 28 }, { "entropy": 0.510833740234375, "epoch": 0.1141732283464567, "grad_norm": 1.6033916769131809, "learning_rate": 1.435897435897436e-05, "loss": 0.7334, "mean_token_accuracy": 0.7764604520052671, "num_tokens": 12380898.0, "step": 29 }, { "entropy": 0.52008056640625, "epoch": 0.11811023622047244, "grad_norm": 1.80791166592839, "learning_rate": 1.4871794871794874e-05, "loss": 0.7291, "mean_token_accuracy": 0.7800389584153891, "num_tokens": 12802897.0, "step": 30 }, { "entropy": 0.518524169921875, "epoch": 0.1220472440944882, "grad_norm": 1.6689890484137595, "learning_rate": 1.5384615384615387e-05, "loss": 0.7372, "mean_token_accuracy": 0.7804073309525847, "num_tokens": 13239071.0, "step": 31 }, { "entropy": 0.5186767578125, "epoch": 0.12598425196850394, "grad_norm": 1.71311066387976, "learning_rate": 1.5897435897435897e-05, "loss": 0.7011, "mean_token_accuracy": 0.7851214902475476, "num_tokens": 13671846.0, "step": 32 }, { "entropy": 0.517120361328125, "epoch": 0.12992125984251968, "grad_norm": 1.653752860519691, "learning_rate": 1.641025641025641e-05, "loss": 0.6941, "mean_token_accuracy": 0.7865419248118997, "num_tokens": 14096702.0, "step": 33 }, { "entropy": 0.51727294921875, "epoch": 0.13385826771653545, "grad_norm": 1.8994101238336887, "learning_rate": 1.6923076923076924e-05, "loss": 0.6762, "mean_token_accuracy": 0.787972204387188, "num_tokens": 14507141.0, "step": 34 }, { "entropy": 0.50054931640625, "epoch": 0.1377952755905512, "grad_norm": 1.8005803715283153, "learning_rate": 1.7435897435897438e-05, "loss": 0.6678, "mean_token_accuracy": 0.7918295972049236, "num_tokens": 14924605.0, "step": 35 }, { "entropy": 0.498931884765625, "epoch": 0.14173228346456693, "grad_norm": 1.7427723538583522, "learning_rate": 1.794871794871795e-05, "loss": 0.6568, "mean_token_accuracy": 0.7931245760992169, "num_tokens": 15340274.0, "step": 36 }, { "entropy": 0.50238037109375, "epoch": 0.14566929133858267, "grad_norm": 1.9022526038852137, "learning_rate": 1.8461538461538465e-05, "loss": 0.6598, "mean_token_accuracy": 0.7942408351227641, "num_tokens": 15760102.0, "step": 37 }, { "entropy": 0.505035400390625, "epoch": 0.14960629921259844, "grad_norm": 1.9173596829441977, "learning_rate": 1.8974358974358975e-05, "loss": 0.6603, "mean_token_accuracy": 0.7934675076976418, "num_tokens": 16168355.0, "step": 38 }, { "entropy": 0.50341796875, "epoch": 0.15354330708661418, "grad_norm": 1.737835763355968, "learning_rate": 1.9487179487179488e-05, "loss": 0.6678, "mean_token_accuracy": 0.7944545326754451, "num_tokens": 16616270.0, "step": 39 }, { "entropy": 0.504364013671875, "epoch": 0.15748031496062992, "grad_norm": 1.6371465461053467, "learning_rate": 2e-05, "loss": 0.6598, "mean_token_accuracy": 0.7918455330654979, "num_tokens": 17047860.0, "step": 40 }, { "entropy": 0.4927978515625, "epoch": 0.16141732283464566, "grad_norm": 1.6621682164769442, "learning_rate": 1.999990559554882e-05, "loss": 0.6623, "mean_token_accuracy": 0.792108066380024, "num_tokens": 17485177.0, "step": 41 }, { "entropy": 0.496307373046875, "epoch": 0.16535433070866143, "grad_norm": 1.8543495232878142, "learning_rate": 1.9999622383977725e-05, "loss": 0.6467, "mean_token_accuracy": 0.797334254719317, "num_tokens": 17910948.0, "step": 42 }, { "entropy": 0.490631103515625, "epoch": 0.16929133858267717, "grad_norm": 1.8574036554957438, "learning_rate": 1.9999150370633987e-05, "loss": 0.646, "mean_token_accuracy": 0.7992923380807042, "num_tokens": 18347007.0, "step": 43 }, { "entropy": 0.503265380859375, "epoch": 0.1732283464566929, "grad_norm": 1.4203675083785523, "learning_rate": 1.9998489564429656e-05, "loss": 0.6524, "mean_token_accuracy": 0.7955402638763189, "num_tokens": 18770125.0, "step": 44 }, { "entropy": 0.4832763671875, "epoch": 0.17716535433070865, "grad_norm": 1.699095417852413, "learning_rate": 1.999763997784133e-05, "loss": 0.6559, "mean_token_accuracy": 0.7955031348392367, "num_tokens": 19226466.0, "step": 45 }, { "entropy": 0.478118896484375, "epoch": 0.18110236220472442, "grad_norm": 1.5920566823332218, "learning_rate": 1.9996601626909962e-05, "loss": 0.6405, "mean_token_accuracy": 0.7994664330035448, "num_tokens": 19670619.0, "step": 46 }, { "entropy": 0.4974365234375, "epoch": 0.18503937007874016, "grad_norm": 1.332039937086849, "learning_rate": 1.999537453124055e-05, "loss": 0.6231, "mean_token_accuracy": 0.8019787659868598, "num_tokens": 20079141.0, "step": 47 }, { "entropy": 0.49603271484375, "epoch": 0.1889763779527559, "grad_norm": 1.5722338994254577, "learning_rate": 1.9993958714001738e-05, "loss": 0.6262, "mean_token_accuracy": 0.8007524479180574, "num_tokens": 20519574.0, "step": 48 }, { "entropy": 0.4832763671875, "epoch": 0.19291338582677164, "grad_norm": 1.4266430941019363, "learning_rate": 1.9992354201925427e-05, "loss": 0.6217, "mean_token_accuracy": 0.8023361451923847, "num_tokens": 20947519.0, "step": 49 }, { "entropy": 0.4827880859375, "epoch": 0.1968503937007874, "grad_norm": 1.7573722850889697, "learning_rate": 1.9990561025306232e-05, "loss": 0.6224, "mean_token_accuracy": 0.8040017504245043, "num_tokens": 21392560.0, "step": 50 }, { "entropy": 0.4984130859375, "epoch": 0.20078740157480315, "grad_norm": 1.3813190235222876, "learning_rate": 1.998857921800092e-05, "loss": 0.6275, "mean_token_accuracy": 0.80109344702214, "num_tokens": 21833277.0, "step": 51 }, { "entropy": 0.49365234375, "epoch": 0.2047244094488189, "grad_norm": 1.4982744826386218, "learning_rate": 1.998640881742778e-05, "loss": 0.6005, "mean_token_accuracy": 0.8037025630474091, "num_tokens": 22257152.0, "step": 52 }, { "entropy": 0.478729248046875, "epoch": 0.20866141732283464, "grad_norm": 1.4345392282851195, "learning_rate": 1.998404986456591e-05, "loss": 0.5978, "mean_token_accuracy": 0.8084583384916186, "num_tokens": 22686084.0, "step": 53 }, { "entropy": 0.49456787109375, "epoch": 0.2125984251968504, "grad_norm": 1.3302730274926833, "learning_rate": 1.9981502403954435e-05, "loss": 0.5817, "mean_token_accuracy": 0.8112034667283297, "num_tokens": 23097849.0, "step": 54 }, { "entropy": 0.49212646484375, "epoch": 0.21653543307086615, "grad_norm": 1.5182591089716235, "learning_rate": 1.997876648369168e-05, "loss": 0.6066, "mean_token_accuracy": 0.8056635642424226, "num_tokens": 23540073.0, "step": 55 }, { "entropy": 0.479339599609375, "epoch": 0.2204724409448819, "grad_norm": 1.471676822279833, "learning_rate": 1.9975842155434253e-05, "loss": 0.6004, "mean_token_accuracy": 0.806885845027864, "num_tokens": 23983334.0, "step": 56 }, { "entropy": 0.472015380859375, "epoch": 0.22440944881889763, "grad_norm": 1.7984365891799328, "learning_rate": 1.997272947439608e-05, "loss": 0.5887, "mean_token_accuracy": 0.8096871245652437, "num_tokens": 24429322.0, "step": 57 }, { "entropy": 0.4752197265625, "epoch": 0.2283464566929134, "grad_norm": 1.4009385017224354, "learning_rate": 1.996942849934735e-05, "loss": 0.6013, "mean_token_accuracy": 0.8048065342009068, "num_tokens": 24871384.0, "step": 58 }, { "entropy": 0.47406005859375, "epoch": 0.23228346456692914, "grad_norm": 1.4868806329344062, "learning_rate": 1.9965939292613408e-05, "loss": 0.5783, "mean_token_accuracy": 0.8113819938153028, "num_tokens": 25319061.0, "step": 59 }, { "entropy": 0.474151611328125, "epoch": 0.23622047244094488, "grad_norm": 1.3967298894617224, "learning_rate": 1.996226192007358e-05, "loss": 0.58, "mean_token_accuracy": 0.8119395393878222, "num_tokens": 25766012.0, "step": 60 }, { "entropy": 0.4749755859375, "epoch": 0.24015748031496062, "grad_norm": 1.4262112118632393, "learning_rate": 1.9958396451159937e-05, "loss": 0.5713, "mean_token_accuracy": 0.8120222119614482, "num_tokens": 26170408.0, "step": 61 }, { "entropy": 0.48187255859375, "epoch": 0.2440944881889764, "grad_norm": 1.505498450930936, "learning_rate": 1.995434295885598e-05, "loss": 0.5727, "mean_token_accuracy": 0.8128972761332989, "num_tokens": 26585896.0, "step": 62 }, { "entropy": 0.47503662109375, "epoch": 0.24803149606299213, "grad_norm": 1.457370555253117, "learning_rate": 1.995010151969524e-05, "loss": 0.5814, "mean_token_accuracy": 0.8106799507513642, "num_tokens": 27022289.0, "step": 63 }, { "entropy": 0.481109619140625, "epoch": 0.25196850393700787, "grad_norm": 1.3774207441827904, "learning_rate": 1.9945672213759872e-05, "loss": 0.5924, "mean_token_accuracy": 0.8066158397123218, "num_tokens": 27468846.0, "step": 64 }, { "entropy": 0.47772216796875, "epoch": 0.2559055118110236, "grad_norm": 1.6141808039785952, "learning_rate": 1.9941055124679108e-05, "loss": 0.5868, "mean_token_accuracy": 0.8115468313917518, "num_tokens": 27905936.0, "step": 65 }, { "entropy": 0.481658935546875, "epoch": 0.25984251968503935, "grad_norm": 1.4300524182611083, "learning_rate": 1.993625033962771e-05, "loss": 0.5921, "mean_token_accuracy": 0.8093285923823714, "num_tokens": 28353036.0, "step": 66 }, { "entropy": 0.4754638671875, "epoch": 0.2637795275590551, "grad_norm": 1.4415181715635927, "learning_rate": 1.993125794932429e-05, "loss": 0.5804, "mean_token_accuracy": 0.8094886504113674, "num_tokens": 28792395.0, "step": 67 }, { "entropy": 0.4808349609375, "epoch": 0.2677165354330709, "grad_norm": 1.5030670439825435, "learning_rate": 1.9926078048029623e-05, "loss": 0.5889, "mean_token_accuracy": 0.8070614328607917, "num_tokens": 29202926.0, "step": 68 }, { "entropy": 0.468170166015625, "epoch": 0.27165354330708663, "grad_norm": 1.463016395641609, "learning_rate": 1.992071073354486e-05, "loss": 0.5804, "mean_token_accuracy": 0.8089981079101562, "num_tokens": 29633215.0, "step": 69 }, { "entropy": 0.472808837890625, "epoch": 0.2755905511811024, "grad_norm": 1.2691410832084924, "learning_rate": 1.9915156107209673e-05, "loss": 0.5654, "mean_token_accuracy": 0.8132820166647434, "num_tokens": 30053469.0, "step": 70 }, { "entropy": 0.4683837890625, "epoch": 0.2795275590551181, "grad_norm": 1.35929946128328, "learning_rate": 1.9909414273900353e-05, "loss": 0.5827, "mean_token_accuracy": 0.8104306925088167, "num_tokens": 30508051.0, "step": 71 }, { "entropy": 0.469024658203125, "epoch": 0.28346456692913385, "grad_norm": 1.314977800443806, "learning_rate": 1.9903485342027827e-05, "loss": 0.5563, "mean_token_accuracy": 0.8167074229568243, "num_tokens": 30946214.0, "step": 72 }, { "entropy": 0.481658935546875, "epoch": 0.2874015748031496, "grad_norm": 1.2406296758259667, "learning_rate": 1.98973694235356e-05, "loss": 0.5642, "mean_token_accuracy": 0.8149285055696964, "num_tokens": 31363171.0, "step": 73 }, { "entropy": 0.467681884765625, "epoch": 0.29133858267716534, "grad_norm": 1.2216342233110984, "learning_rate": 1.9891066633897666e-05, "loss": 0.555, "mean_token_accuracy": 0.8174580186605453, "num_tokens": 31771585.0, "step": 74 }, { "entropy": 0.4698486328125, "epoch": 0.2952755905511811, "grad_norm": 1.2181141707893957, "learning_rate": 1.9884577092116296e-05, "loss": 0.5565, "mean_token_accuracy": 0.8176529305055737, "num_tokens": 32178646.0, "step": 75 }, { "entropy": 0.459503173828125, "epoch": 0.2992125984251969, "grad_norm": 1.3696165596387524, "learning_rate": 1.9877900920719825e-05, "loss": 0.5627, "mean_token_accuracy": 0.813455811701715, "num_tokens": 32609106.0, "step": 76 }, { "entropy": 0.468231201171875, "epoch": 0.3031496062992126, "grad_norm": 1.2372308799624376, "learning_rate": 1.9871038245760305e-05, "loss": 0.5708, "mean_token_accuracy": 0.8143888600170612, "num_tokens": 33037253.0, "step": 77 }, { "entropy": 0.47723388671875, "epoch": 0.30708661417322836, "grad_norm": 1.3187950742682812, "learning_rate": 1.9863989196811153e-05, "loss": 0.5671, "mean_token_accuracy": 0.8115543182939291, "num_tokens": 33447162.0, "step": 78 }, { "entropy": 0.463226318359375, "epoch": 0.3110236220472441, "grad_norm": 1.2233802262332847, "learning_rate": 1.9856753906964686e-05, "loss": 0.5571, "mean_token_accuracy": 0.8157372018322349, "num_tokens": 33878990.0, "step": 79 }, { "entropy": 0.469940185546875, "epoch": 0.31496062992125984, "grad_norm": 1.185901461351517, "learning_rate": 1.9849332512829624e-05, "loss": 0.5568, "mean_token_accuracy": 0.8197338776662946, "num_tokens": 34304647.0, "step": 80 }, { "entropy": 0.465545654296875, "epoch": 0.3188976377952756, "grad_norm": 1.4246612321853744, "learning_rate": 1.9841725154528485e-05, "loss": 0.5829, "mean_token_accuracy": 0.8095870474353433, "num_tokens": 34750816.0, "step": 81 }, { "entropy": 0.4696044921875, "epoch": 0.3228346456692913, "grad_norm": 1.2185392732273213, "learning_rate": 1.983393197569497e-05, "loss": 0.5585, "mean_token_accuracy": 0.8164394591003656, "num_tokens": 35180424.0, "step": 82 }, { "entropy": 0.4609375, "epoch": 0.32677165354330706, "grad_norm": 1.2799767664999213, "learning_rate": 1.9825953123471235e-05, "loss": 0.5514, "mean_token_accuracy": 0.8194698123261333, "num_tokens": 35639723.0, "step": 83 }, { "entropy": 0.472259521484375, "epoch": 0.33070866141732286, "grad_norm": 1.2501368772967398, "learning_rate": 1.981778874850511e-05, "loss": 0.5625, "mean_token_accuracy": 0.8149236952885985, "num_tokens": 36075331.0, "step": 84 }, { "entropy": 0.463348388671875, "epoch": 0.3346456692913386, "grad_norm": 1.3053960236087885, "learning_rate": 1.980943900494727e-05, "loss": 0.5476, "mean_token_accuracy": 0.8175922827795148, "num_tokens": 36513527.0, "step": 85 }, { "entropy": 0.4549560546875, "epoch": 0.33858267716535434, "grad_norm": 1.1850931627983934, "learning_rate": 1.9800904050448296e-05, "loss": 0.5608, "mean_token_accuracy": 0.8153009815141559, "num_tokens": 36982726.0, "step": 86 }, { "entropy": 0.457763671875, "epoch": 0.3425196850393701, "grad_norm": 1.3097335637467127, "learning_rate": 1.9792184046155733e-05, "loss": 0.5376, "mean_token_accuracy": 0.8229149403050542, "num_tokens": 37406085.0, "step": 87 }, { "entropy": 0.46282958984375, "epoch": 0.3464566929133858, "grad_norm": 1.3701986539966717, "learning_rate": 1.9783279156711022e-05, "loss": 0.5721, "mean_token_accuracy": 0.8138612108305097, "num_tokens": 37849707.0, "step": 88 }, { "entropy": 0.459503173828125, "epoch": 0.35039370078740156, "grad_norm": 1.324893149036643, "learning_rate": 1.9774189550246407e-05, "loss": 0.5421, "mean_token_accuracy": 0.8197554592043161, "num_tokens": 38283659.0, "step": 89 }, { "entropy": 0.46405029296875, "epoch": 0.3543307086614173, "grad_norm": 1.2919563457717147, "learning_rate": 1.976491539838175e-05, "loss": 0.5467, "mean_token_accuracy": 0.816823348402977, "num_tokens": 38709966.0, "step": 90 }, { "entropy": 0.462371826171875, "epoch": 0.35826771653543305, "grad_norm": 1.2797648374269206, "learning_rate": 1.975545687622129e-05, "loss": 0.5444, "mean_token_accuracy": 0.8186370227485895, "num_tokens": 39150013.0, "step": 91 }, { "entropy": 0.45489501953125, "epoch": 0.36220472440944884, "grad_norm": 1.4022337956026676, "learning_rate": 1.974581416235035e-05, "loss": 0.5517, "mean_token_accuracy": 0.8169004768133163, "num_tokens": 39576553.0, "step": 92 }, { "entropy": 0.45465087890625, "epoch": 0.3661417322834646, "grad_norm": 1.297870829963053, "learning_rate": 1.9735987438831947e-05, "loss": 0.5429, "mean_token_accuracy": 0.821047849021852, "num_tokens": 40010713.0, "step": 93 }, { "entropy": 0.450592041015625, "epoch": 0.3700787401574803, "grad_norm": 1.5793780108337674, "learning_rate": 1.972597689120338e-05, "loss": 0.5452, "mean_token_accuracy": 0.8186437683179975, "num_tokens": 40443614.0, "step": 94 }, { "entropy": 0.44610595703125, "epoch": 0.37401574803149606, "grad_norm": 1.2564999884723695, "learning_rate": 1.9715782708472685e-05, "loss": 0.5562, "mean_token_accuracy": 0.8183111995458603, "num_tokens": 40904703.0, "step": 95 }, { "entropy": 0.453094482421875, "epoch": 0.3779527559055118, "grad_norm": 1.2231168223051487, "learning_rate": 1.9705405083115118e-05, "loss": 0.5411, "mean_token_accuracy": 0.8229661779478192, "num_tokens": 41340079.0, "step": 96 }, { "entropy": 0.467529296875, "epoch": 0.38188976377952755, "grad_norm": 1.1470871497193091, "learning_rate": 1.9694844211069477e-05, "loss": 0.5267, "mean_token_accuracy": 0.8263011984527111, "num_tokens": 41744766.0, "step": 97 }, { "entropy": 0.45391845703125, "epoch": 0.3858267716535433, "grad_norm": 1.219783013561985, "learning_rate": 1.9684100291734437e-05, "loss": 0.5404, "mean_token_accuracy": 0.8202607650309801, "num_tokens": 42191460.0, "step": 98 }, { "entropy": 0.454803466796875, "epoch": 0.38976377952755903, "grad_norm": 1.5385898015342816, "learning_rate": 1.9673173527964753e-05, "loss": 0.5477, "mean_token_accuracy": 0.8171384297311306, "num_tokens": 42627016.0, "step": 99 }, { "entropy": 0.452423095703125, "epoch": 0.3937007874015748, "grad_norm": 1.061036138445293, "learning_rate": 1.966206412606745e-05, "loss": 0.5432, "mean_token_accuracy": 0.820900421589613, "num_tokens": 43064915.0, "step": 100 }, { "entropy": 0.45361328125, "epoch": 0.39763779527559057, "grad_norm": 1.169828618735002, "learning_rate": 1.9650772295797934e-05, "loss": 0.5481, "mean_token_accuracy": 0.8199361320585012, "num_tokens": 43503850.0, "step": 101 }, { "entropy": 0.449493408203125, "epoch": 0.4015748031496063, "grad_norm": 1.2711708641467119, "learning_rate": 1.963929825035601e-05, "loss": 0.5433, "mean_token_accuracy": 0.8212072784081101, "num_tokens": 43933193.0, "step": 102 }, { "entropy": 0.455078125, "epoch": 0.40551181102362205, "grad_norm": 1.2479358738950146, "learning_rate": 1.9627642206381864e-05, "loss": 0.543, "mean_token_accuracy": 0.8205253165215254, "num_tokens": 44356307.0, "step": 103 }, { "entropy": 0.46270751953125, "epoch": 0.4094488188976378, "grad_norm": 1.0684479566223315, "learning_rate": 1.9615804383951992e-05, "loss": 0.5313, "mean_token_accuracy": 0.8274707896634936, "num_tokens": 44764401.0, "step": 104 }, { "entropy": 0.45220947265625, "epoch": 0.41338582677165353, "grad_norm": 1.22972672302474, "learning_rate": 1.9603785006575015e-05, "loss": 0.5378, "mean_token_accuracy": 0.822184014134109, "num_tokens": 45203043.0, "step": 105 }, { "entropy": 0.46295166015625, "epoch": 0.41732283464566927, "grad_norm": 1.2066718330624946, "learning_rate": 1.9591584301187477e-05, "loss": 0.5529, "mean_token_accuracy": 0.8195339059457183, "num_tokens": 45618924.0, "step": 106 }, { "entropy": 0.456146240234375, "epoch": 0.421259842519685, "grad_norm": 1.1758858200925488, "learning_rate": 1.9579202498149562e-05, "loss": 0.5466, "mean_token_accuracy": 0.8180765705183148, "num_tokens": 46051522.0, "step": 107 }, { "entropy": 0.4459228515625, "epoch": 0.4251968503937008, "grad_norm": 1.3596788756711078, "learning_rate": 1.956663983124073e-05, "loss": 0.5337, "mean_token_accuracy": 0.8209798075258732, "num_tokens": 46497049.0, "step": 108 }, { "entropy": 0.447113037109375, "epoch": 0.42913385826771655, "grad_norm": 1.3368376077642434, "learning_rate": 1.9553896537655317e-05, "loss": 0.5195, "mean_token_accuracy": 0.8260382236912847, "num_tokens": 46930522.0, "step": 109 }, { "entropy": 0.4459228515625, "epoch": 0.4330708661417323, "grad_norm": 1.3620343015512997, "learning_rate": 1.954097285799805e-05, "loss": 0.5354, "mean_token_accuracy": 0.8241122309118509, "num_tokens": 47372874.0, "step": 110 }, { "entropy": 0.468780517578125, "epoch": 0.43700787401574803, "grad_norm": 1.2408939744606213, "learning_rate": 1.9527869036279507e-05, "loss": 0.5411, "mean_token_accuracy": 0.8190175397321582, "num_tokens": 47781013.0, "step": 111 }, { "entropy": 0.45635986328125, "epoch": 0.4409448818897638, "grad_norm": 1.1631375104825024, "learning_rate": 1.951458531991151e-05, "loss": 0.5488, "mean_token_accuracy": 0.8172104032710195, "num_tokens": 48226903.0, "step": 112 }, { "entropy": 0.459869384765625, "epoch": 0.4448818897637795, "grad_norm": 2.1105607272357965, "learning_rate": 1.9501121959702444e-05, "loss": 0.5223, "mean_token_accuracy": 0.8250879934057593, "num_tokens": 48666939.0, "step": 113 }, { "entropy": 0.467315673828125, "epoch": 0.44881889763779526, "grad_norm": 1.2466596421057292, "learning_rate": 1.9487479209852537e-05, "loss": 0.5223, "mean_token_accuracy": 0.8233406702056527, "num_tokens": 49095261.0, "step": 114 }, { "entropy": 0.462646484375, "epoch": 0.452755905511811, "grad_norm": 0.9969642868725791, "learning_rate": 1.9473657327949055e-05, "loss": 0.5118, "mean_token_accuracy": 0.829724857583642, "num_tokens": 49516172.0, "step": 115 }, { "entropy": 0.4603271484375, "epoch": 0.4566929133858268, "grad_norm": 1.1110224749457402, "learning_rate": 1.9459656574961427e-05, "loss": 0.5304, "mean_token_accuracy": 0.8251465121284127, "num_tokens": 49942159.0, "step": 116 }, { "entropy": 0.455718994140625, "epoch": 0.46062992125984253, "grad_norm": 1.1683751909145832, "learning_rate": 1.9445477215236343e-05, "loss": 0.5391, "mean_token_accuracy": 0.8204762721434236, "num_tokens": 50362851.0, "step": 117 }, { "entropy": 0.459686279296875, "epoch": 0.4645669291338583, "grad_norm": 1.1316191678427665, "learning_rate": 1.9431119516492725e-05, "loss": 0.5307, "mean_token_accuracy": 0.8236522153019905, "num_tokens": 50797404.0, "step": 118 }, { "entropy": 0.45440673828125, "epoch": 0.468503937007874, "grad_norm": 1.068136681966582, "learning_rate": 1.941658374981672e-05, "loss": 0.5181, "mean_token_accuracy": 0.8265996854752302, "num_tokens": 51236009.0, "step": 119 }, { "entropy": 0.456207275390625, "epoch": 0.47244094488188976, "grad_norm": 1.2446449512193436, "learning_rate": 1.9401870189656534e-05, "loss": 0.5218, "mean_token_accuracy": 0.8257655389606953, "num_tokens": 51653792.0, "step": 120 }, { "entropy": 0.45611572265625, "epoch": 0.4763779527559055, "grad_norm": 1.1566053130388678, "learning_rate": 1.9386979113817283e-05, "loss": 0.5246, "mean_token_accuracy": 0.8237393777817488, "num_tokens": 52081608.0, "step": 121 }, { "entropy": 0.46746826171875, "epoch": 0.48031496062992124, "grad_norm": 1.0908915988571897, "learning_rate": 1.937191080345574e-05, "loss": 0.5131, "mean_token_accuracy": 0.8298475751653314, "num_tokens": 52491118.0, "step": 122 }, { "entropy": 0.47027587890625, "epoch": 0.484251968503937, "grad_norm": 1.1892400483856065, "learning_rate": 1.9356665543075013e-05, "loss": 0.5208, "mean_token_accuracy": 0.8260363638401031, "num_tokens": 52914582.0, "step": 123 }, { "entropy": 0.458648681640625, "epoch": 0.4881889763779528, "grad_norm": 1.1290959732866495, "learning_rate": 1.934124362051919e-05, "loss": 0.5392, "mean_token_accuracy": 0.8196229068562388, "num_tokens": 53371925.0, "step": 124 }, { "entropy": 0.482818603515625, "epoch": 0.4921259842519685, "grad_norm": 1.2157854071703211, "learning_rate": 1.9325645326967904e-05, "loss": 0.5451, "mean_token_accuracy": 0.8218522099778056, "num_tokens": 53774738.0, "step": 125 }, { "entropy": 0.453887939453125, "epoch": 0.49606299212598426, "grad_norm": 1.1985989720544075, "learning_rate": 1.9309870956930818e-05, "loss": 0.5239, "mean_token_accuracy": 0.8237557969987392, "num_tokens": 54223418.0, "step": 126 }, { "entropy": 0.4586181640625, "epoch": 0.5, "grad_norm": 1.104716347359038, "learning_rate": 1.9293920808242084e-05, "loss": 0.5216, "mean_token_accuracy": 0.8251854004338384, "num_tokens": 54655762.0, "step": 127 }, { "entropy": 0.4671630859375, "epoch": 0.5039370078740157, "grad_norm": 1.1779574568986346, "learning_rate": 1.927779518205471e-05, "loss": 0.5229, "mean_token_accuracy": 0.8235677415505052, "num_tokens": 55081121.0, "step": 128 }, { "entropy": 0.4610595703125, "epoch": 0.5078740157480315, "grad_norm": 1.211241476354621, "learning_rate": 1.9261494382834866e-05, "loss": 0.5206, "mean_token_accuracy": 0.8260276168584824, "num_tokens": 55501868.0, "step": 129 }, { "entropy": 0.455474853515625, "epoch": 0.5118110236220472, "grad_norm": 1.2106744968057903, "learning_rate": 1.924501871835616e-05, "loss": 0.5363, "mean_token_accuracy": 0.8244349677115679, "num_tokens": 55942607.0, "step": 130 }, { "entropy": 0.45953369140625, "epoch": 0.515748031496063, "grad_norm": 1.1675102322000497, "learning_rate": 1.9228368499693805e-05, "loss": 0.5182, "mean_token_accuracy": 0.825249838642776, "num_tokens": 56371760.0, "step": 131 }, { "entropy": 0.458892822265625, "epoch": 0.5196850393700787, "grad_norm": 1.0672534247625016, "learning_rate": 1.9211544041218752e-05, "loss": 0.5127, "mean_token_accuracy": 0.8282053135335445, "num_tokens": 56805459.0, "step": 132 }, { "entropy": 0.461151123046875, "epoch": 0.5236220472440944, "grad_norm": 1.0870642318591395, "learning_rate": 1.9194545660591753e-05, "loss": 0.5412, "mean_token_accuracy": 0.8184263566508889, "num_tokens": 57252250.0, "step": 133 }, { "entropy": 0.454803466796875, "epoch": 0.5275590551181102, "grad_norm": 0.9981587768631067, "learning_rate": 1.917737367875736e-05, "loss": 0.5303, "mean_token_accuracy": 0.8198808785527945, "num_tokens": 57699541.0, "step": 134 }, { "entropy": 0.462493896484375, "epoch": 0.531496062992126, "grad_norm": 1.1368326435130822, "learning_rate": 1.916002841993789e-05, "loss": 0.5344, "mean_token_accuracy": 0.8233586028218269, "num_tokens": 58126669.0, "step": 135 }, { "entropy": 0.46136474609375, "epoch": 0.5354330708661418, "grad_norm": 1.022490800089311, "learning_rate": 1.9142510211627264e-05, "loss": 0.5189, "mean_token_accuracy": 0.8275930220261216, "num_tokens": 58554779.0, "step": 136 }, { "entropy": 0.471771240234375, "epoch": 0.5393700787401575, "grad_norm": 1.0787531090577647, "learning_rate": 1.912481938458485e-05, "loss": 0.5099, "mean_token_accuracy": 0.8287343252450228, "num_tokens": 58973692.0, "step": 137 }, { "entropy": 0.4613037109375, "epoch": 0.5433070866141733, "grad_norm": 0.9846465542742977, "learning_rate": 1.9106956272829212e-05, "loss": 0.5308, "mean_token_accuracy": 0.8231041831895709, "num_tokens": 59427304.0, "step": 138 }, { "entropy": 0.465423583984375, "epoch": 0.547244094488189, "grad_norm": 1.020134766573174, "learning_rate": 1.9088921213631803e-05, "loss": 0.5177, "mean_token_accuracy": 0.8275422500446439, "num_tokens": 59859256.0, "step": 139 }, { "entropy": 0.453399658203125, "epoch": 0.5511811023622047, "grad_norm": 1.07451870093493, "learning_rate": 1.9070714547510593e-05, "loss": 0.5157, "mean_token_accuracy": 0.8267948348075151, "num_tokens": 60291249.0, "step": 140 }, { "entropy": 0.467132568359375, "epoch": 0.5551181102362205, "grad_norm": 1.0197584285113064, "learning_rate": 1.9052336618223655e-05, "loss": 0.4942, "mean_token_accuracy": 0.8316316213458776, "num_tokens": 60700236.0, "step": 141 }, { "entropy": 0.455078125, "epoch": 0.5590551181102362, "grad_norm": 0.9730024523474553, "learning_rate": 1.9033787772762647e-05, "loss": 0.5184, "mean_token_accuracy": 0.8264486761763692, "num_tokens": 61135518.0, "step": 142 }, { "entropy": 0.458465576171875, "epoch": 0.562992125984252, "grad_norm": 1.4869151975995116, "learning_rate": 1.9015068361346284e-05, "loss": 0.5108, "mean_token_accuracy": 0.8289940198883414, "num_tokens": 61551878.0, "step": 143 }, { "entropy": 0.444732666015625, "epoch": 0.5669291338582677, "grad_norm": 1.066403284862127, "learning_rate": 1.8996178737413724e-05, "loss": 0.5103, "mean_token_accuracy": 0.8301606168970466, "num_tokens": 62004999.0, "step": 144 }, { "entropy": 0.463714599609375, "epoch": 0.5708661417322834, "grad_norm": 0.9982577993437047, "learning_rate": 1.8977119257617878e-05, "loss": 0.5149, "mean_token_accuracy": 0.8276943136006594, "num_tokens": 62421515.0, "step": 145 }, { "entropy": 0.45458984375, "epoch": 0.5748031496062992, "grad_norm": 1.0544064260214807, "learning_rate": 1.8957890281818693e-05, "loss": 0.5116, "mean_token_accuracy": 0.8283577999100089, "num_tokens": 62842401.0, "step": 146 }, { "entropy": 0.464019775390625, "epoch": 0.5787401574803149, "grad_norm": 1.0336300313219318, "learning_rate": 1.893849217307635e-05, "loss": 0.5168, "mean_token_accuracy": 0.8278819629922509, "num_tokens": 63269680.0, "step": 147 }, { "entropy": 0.4522705078125, "epoch": 0.5826771653543307, "grad_norm": 1.090759657848, "learning_rate": 1.8918925297644418e-05, "loss": 0.5079, "mean_token_accuracy": 0.8282352862879634, "num_tokens": 63710872.0, "step": 148 }, { "entropy": 0.455718994140625, "epoch": 0.5866141732283464, "grad_norm": 1.1258346244217974, "learning_rate": 1.889919002496291e-05, "loss": 0.516, "mean_token_accuracy": 0.8259612387046218, "num_tokens": 64150024.0, "step": 149 }, { "entropy": 0.46331787109375, "epoch": 0.5905511811023622, "grad_norm": 0.99965957473205, "learning_rate": 1.8879286727651357e-05, "loss": 0.5257, "mean_token_accuracy": 0.8258456196635962, "num_tokens": 64581389.0, "step": 150 }, { "entropy": 0.46087646484375, "epoch": 0.594488188976378, "grad_norm": 1.050563121756213, "learning_rate": 1.8859215781501727e-05, "loss": 0.5092, "mean_token_accuracy": 0.8281868807971478, "num_tokens": 65008879.0, "step": 151 }, { "entropy": 0.46453857421875, "epoch": 0.5984251968503937, "grad_norm": 0.9756638749951236, "learning_rate": 1.8838977565471343e-05, "loss": 0.5102, "mean_token_accuracy": 0.8313626917079091, "num_tokens": 65418174.0, "step": 152 }, { "entropy": 0.4627685546875, "epoch": 0.6023622047244095, "grad_norm": 0.9933688064734119, "learning_rate": 1.881857246167575e-05, "loss": 0.5034, "mean_token_accuracy": 0.8302843710407615, "num_tokens": 65832965.0, "step": 153 }, { "entropy": 0.461334228515625, "epoch": 0.6062992125984252, "grad_norm": 0.9166610309560314, "learning_rate": 1.8798000855381472e-05, "loss": 0.4962, "mean_token_accuracy": 0.8299053413793445, "num_tokens": 66238636.0, "step": 154 }, { "entropy": 0.45989990234375, "epoch": 0.610236220472441, "grad_norm": 1.0171788498896126, "learning_rate": 1.8777263134998745e-05, "loss": 0.5026, "mean_token_accuracy": 0.8297234000638127, "num_tokens": 66650377.0, "step": 155 }, { "entropy": 0.456085205078125, "epoch": 0.6141732283464567, "grad_norm": 0.9317200752741837, "learning_rate": 1.8756359692074192e-05, "loss": 0.5201, "mean_token_accuracy": 0.8240464190021157, "num_tokens": 67099891.0, "step": 156 }, { "entropy": 0.455780029296875, "epoch": 0.6181102362204725, "grad_norm": 1.0545205323300713, "learning_rate": 1.873529092128343e-05, "loss": 0.5135, "mean_token_accuracy": 0.8289516335353255, "num_tokens": 67526947.0, "step": 157 }, { "entropy": 0.45404052734375, "epoch": 0.6220472440944882, "grad_norm": 0.9464181426572896, "learning_rate": 1.8714057220423604e-05, "loss": 0.4911, "mean_token_accuracy": 0.8318861154839396, "num_tokens": 67941621.0, "step": 158 }, { "entropy": 0.45538330078125, "epoch": 0.6259842519685039, "grad_norm": 0.8719131150322863, "learning_rate": 1.8692658990405887e-05, "loss": 0.5166, "mean_token_accuracy": 0.8262722417712212, "num_tokens": 68373473.0, "step": 159 }, { "entropy": 0.452972412109375, "epoch": 0.6299212598425197, "grad_norm": 0.9697678103322591, "learning_rate": 1.8671096635247914e-05, "loss": 0.5089, "mean_token_accuracy": 0.8285679388791323, "num_tokens": 68787973.0, "step": 160 }, { "entropy": 0.4500732421875, "epoch": 0.6338582677165354, "grad_norm": 0.9035700093333056, "learning_rate": 1.8649370562066147e-05, "loss": 0.4953, "mean_token_accuracy": 0.8336192965507507, "num_tokens": 69220309.0, "step": 161 }, { "entropy": 0.456329345703125, "epoch": 0.6377952755905512, "grad_norm": 0.8949309294710832, "learning_rate": 1.8627481181068185e-05, "loss": 0.4982, "mean_token_accuracy": 0.8325092112645507, "num_tokens": 69626711.0, "step": 162 }, { "entropy": 0.4501953125, "epoch": 0.6417322834645669, "grad_norm": 0.9662881010100235, "learning_rate": 1.860542890554503e-05, "loss": 0.4961, "mean_token_accuracy": 0.8319829516112804, "num_tokens": 70049083.0, "step": 163 }, { "entropy": 0.452392578125, "epoch": 0.6456692913385826, "grad_norm": 0.9112855567573753, "learning_rate": 1.8583214151863277e-05, "loss": 0.5062, "mean_token_accuracy": 0.8289196388795972, "num_tokens": 70467285.0, "step": 164 }, { "entropy": 0.44775390625, "epoch": 0.6496062992125984, "grad_norm": 0.9403307323488074, "learning_rate": 1.856083733945725e-05, "loss": 0.5192, "mean_token_accuracy": 0.8249025819823146, "num_tokens": 70915085.0, "step": 165 }, { "entropy": 0.44525146484375, "epoch": 0.6535433070866141, "grad_norm": 0.8985452902281895, "learning_rate": 1.853829889082109e-05, "loss": 0.5034, "mean_token_accuracy": 0.830441677942872, "num_tokens": 71339757.0, "step": 166 }, { "entropy": 0.435821533203125, "epoch": 0.65748031496063, "grad_norm": 0.8822733945012894, "learning_rate": 1.851559923150077e-05, "loss": 0.5137, "mean_token_accuracy": 0.8269774587824941, "num_tokens": 71798915.0, "step": 167 }, { "entropy": 0.452911376953125, "epoch": 0.6614173228346457, "grad_norm": 0.8920203408684533, "learning_rate": 1.8492738790086066e-05, "loss": 0.509, "mean_token_accuracy": 0.8303144676610827, "num_tokens": 72213283.0, "step": 168 }, { "entropy": 0.4490966796875, "epoch": 0.6653543307086615, "grad_norm": 0.9204794974635663, "learning_rate": 1.8469717998202464e-05, "loss": 0.5014, "mean_token_accuracy": 0.8291385043412447, "num_tokens": 72640357.0, "step": 169 }, { "entropy": 0.44903564453125, "epoch": 0.6692913385826772, "grad_norm": 0.8803703609637004, "learning_rate": 1.844653729050301e-05, "loss": 0.5083, "mean_token_accuracy": 0.8296322766691446, "num_tokens": 73071318.0, "step": 170 }, { "entropy": 0.44970703125, "epoch": 0.6732283464566929, "grad_norm": 0.9449900174934937, "learning_rate": 1.8423197104660094e-05, "loss": 0.4944, "mean_token_accuracy": 0.8333448059856892, "num_tokens": 73496580.0, "step": 171 }, { "entropy": 0.45001220703125, "epoch": 0.6771653543307087, "grad_norm": 0.9485819451800577, "learning_rate": 1.8399697881357214e-05, "loss": 0.5093, "mean_token_accuracy": 0.8251696135848761, "num_tokens": 73911332.0, "step": 172 }, { "entropy": 0.4556884765625, "epoch": 0.6811023622047244, "grad_norm": 0.8788533882995756, "learning_rate": 1.8376040064280616e-05, "loss": 0.5064, "mean_token_accuracy": 0.8283712277188897, "num_tokens": 74343121.0, "step": 173 }, { "entropy": 0.446075439453125, "epoch": 0.6850393700787402, "grad_norm": 1.1568959205226408, "learning_rate": 1.835222410011096e-05, "loss": 0.4985, "mean_token_accuracy": 0.8313534203916788, "num_tokens": 74767266.0, "step": 174 }, { "entropy": 0.452178955078125, "epoch": 0.6889763779527559, "grad_norm": 1.025805343277443, "learning_rate": 1.8328250438514837e-05, "loss": 0.5061, "mean_token_accuracy": 0.8259405111894011, "num_tokens": 75183236.0, "step": 175 }, { "entropy": 0.45050048828125, "epoch": 0.6929133858267716, "grad_norm": 0.9573441596688619, "learning_rate": 1.8304119532136328e-05, "loss": 0.5113, "mean_token_accuracy": 0.8278427310287952, "num_tokens": 75607777.0, "step": 176 }, { "entropy": 0.453765869140625, "epoch": 0.6968503937007874, "grad_norm": 1.002212658374949, "learning_rate": 1.8279831836588427e-05, "loss": 0.5021, "mean_token_accuracy": 0.8309840820729733, "num_tokens": 76033909.0, "step": 177 }, { "entropy": 0.45233154296875, "epoch": 0.7007874015748031, "grad_norm": 1.0753125904594512, "learning_rate": 1.8255387810444447e-05, "loss": 0.4992, "mean_token_accuracy": 0.832436814904213, "num_tokens": 76455698.0, "step": 178 }, { "entropy": 0.44952392578125, "epoch": 0.7047244094488189, "grad_norm": 0.8922465191801553, "learning_rate": 1.8230787915229358e-05, "loss": 0.505, "mean_token_accuracy": 0.8306124126538634, "num_tokens": 76889182.0, "step": 179 }, { "entropy": 0.441192626953125, "epoch": 0.7086614173228346, "grad_norm": 1.0851600035028615, "learning_rate": 1.8206032615411092e-05, "loss": 0.5052, "mean_token_accuracy": 0.8298691343516111, "num_tokens": 77329833.0, "step": 180 }, { "entropy": 0.44793701171875, "epoch": 0.7125984251968503, "grad_norm": 0.9640089148569666, "learning_rate": 1.818112237839174e-05, "loss": 0.4972, "mean_token_accuracy": 0.8315172418951988, "num_tokens": 77743805.0, "step": 181 }, { "entropy": 0.45074462890625, "epoch": 0.7165354330708661, "grad_norm": 0.8688883877508005, "learning_rate": 1.8156057674498756e-05, "loss": 0.497, "mean_token_accuracy": 0.8296341849491, "num_tokens": 78174355.0, "step": 182 }, { "entropy": 0.437103271484375, "epoch": 0.7204724409448819, "grad_norm": 0.8482457279373461, "learning_rate": 1.8130838976976058e-05, "loss": 0.4947, "mean_token_accuracy": 0.8344184467568994, "num_tokens": 78613242.0, "step": 183 }, { "entropy": 0.44915771484375, "epoch": 0.7244094488188977, "grad_norm": 0.8888701349814255, "learning_rate": 1.810546676197511e-05, "loss": 0.4879, "mean_token_accuracy": 0.8355774069204926, "num_tokens": 79037611.0, "step": 184 }, { "entropy": 0.449859619140625, "epoch": 0.7283464566929134, "grad_norm": 0.8760275198824425, "learning_rate": 1.807994150854592e-05, "loss": 0.5002, "mean_token_accuracy": 0.8316125152632594, "num_tokens": 79453628.0, "step": 185 }, { "entropy": 0.44781494140625, "epoch": 0.7322834645669292, "grad_norm": 1.2974591275029912, "learning_rate": 1.805426369862799e-05, "loss": 0.5035, "mean_token_accuracy": 0.8302051173523068, "num_tokens": 79888974.0, "step": 186 }, { "entropy": 0.4373779296875, "epoch": 0.7362204724409449, "grad_norm": 0.8929755000611298, "learning_rate": 1.8028433817041237e-05, "loss": 0.4822, "mean_token_accuracy": 0.8354063536971807, "num_tokens": 80321324.0, "step": 187 }, { "entropy": 0.443572998046875, "epoch": 0.7401574803149606, "grad_norm": 0.8329491046833114, "learning_rate": 1.8002452351476817e-05, "loss": 0.5047, "mean_token_accuracy": 0.8291633054614067, "num_tokens": 80766297.0, "step": 188 }, { "entropy": 0.445404052734375, "epoch": 0.7440944881889764, "grad_norm": 0.8665441369913496, "learning_rate": 1.7976319792487933e-05, "loss": 0.4915, "mean_token_accuracy": 0.8326784670352936, "num_tokens": 81192416.0, "step": 189 }, { "entropy": 0.442962646484375, "epoch": 0.7480314960629921, "grad_norm": 0.8336203122290462, "learning_rate": 1.7950036633480557e-05, "loss": 0.4895, "mean_token_accuracy": 0.8337859380990267, "num_tokens": 81633131.0, "step": 190 }, { "entropy": 0.438995361328125, "epoch": 0.7519685039370079, "grad_norm": 0.8560354139325418, "learning_rate": 1.7923603370704136e-05, "loss": 0.4849, "mean_token_accuracy": 0.8339350931346416, "num_tokens": 82073657.0, "step": 191 }, { "entropy": 0.43817138671875, "epoch": 0.7559055118110236, "grad_norm": 0.8646497974459243, "learning_rate": 1.7897020503242192e-05, "loss": 0.4829, "mean_token_accuracy": 0.8361313687637448, "num_tokens": 82495559.0, "step": 192 }, { "entropy": 0.44427490234375, "epoch": 0.7598425196850394, "grad_norm": 0.8131289484833817, "learning_rate": 1.787028853300294e-05, "loss": 0.4807, "mean_token_accuracy": 0.8349549919366837, "num_tokens": 82909007.0, "step": 193 }, { "entropy": 0.4383544921875, "epoch": 0.7637795275590551, "grad_norm": 0.7972139228607141, "learning_rate": 1.7843407964709773e-05, "loss": 0.482, "mean_token_accuracy": 0.8354136543348432, "num_tokens": 83334137.0, "step": 194 }, { "entropy": 0.438873291015625, "epoch": 0.7677165354330708, "grad_norm": 0.882340235786015, "learning_rate": 1.7816379305891743e-05, "loss": 0.4882, "mean_token_accuracy": 0.8349952660501003, "num_tokens": 83761623.0, "step": 195 }, { "entropy": 0.4444580078125, "epoch": 0.7716535433070866, "grad_norm": 0.8653917946283164, "learning_rate": 1.7789203066874e-05, "loss": 0.4914, "mean_token_accuracy": 0.8350548483431339, "num_tokens": 84184837.0, "step": 196 }, { "entropy": 0.44403076171875, "epoch": 0.7755905511811023, "grad_norm": 0.9248212539704527, "learning_rate": 1.7761879760768123e-05, "loss": 0.4928, "mean_token_accuracy": 0.8332647895440459, "num_tokens": 84627232.0, "step": 197 }, { "entropy": 0.43585205078125, "epoch": 0.7795275590551181, "grad_norm": 0.8874785042910702, "learning_rate": 1.7734409903462454e-05, "loss": 0.4851, "mean_token_accuracy": 0.8350456738844514, "num_tokens": 85067016.0, "step": 198 }, { "entropy": 0.4476318359375, "epoch": 0.7834645669291339, "grad_norm": 0.8774288891360084, "learning_rate": 1.7706794013612367e-05, "loss": 0.497, "mean_token_accuracy": 0.8308952897787094, "num_tokens": 85474661.0, "step": 199 }, { "entropy": 0.440887451171875, "epoch": 0.7874015748031497, "grad_norm": 0.950578748327104, "learning_rate": 1.7679032612630432e-05, "loss": 0.4872, "mean_token_accuracy": 0.8345352541655302, "num_tokens": 85907009.0, "step": 200 }, { "entropy": 0.441009521484375, "epoch": 0.7913385826771654, "grad_norm": 0.8638996030741382, "learning_rate": 1.7651126224676616e-05, "loss": 0.4971, "mean_token_accuracy": 0.8309849118813872, "num_tokens": 86334007.0, "step": 201 }, { "entropy": 0.4449462890625, "epoch": 0.7952755905511811, "grad_norm": 0.9130367054606549, "learning_rate": 1.7623075376648374e-05, "loss": 0.4906, "mean_token_accuracy": 0.8348276494070888, "num_tokens": 86756048.0, "step": 202 }, { "entropy": 0.4334716796875, "epoch": 0.7992125984251969, "grad_norm": 0.8490365529225518, "learning_rate": 1.7594880598170688e-05, "loss": 0.4932, "mean_token_accuracy": 0.8335457816720009, "num_tokens": 87189461.0, "step": 203 }, { "entropy": 0.440887451171875, "epoch": 0.8031496062992126, "grad_norm": 0.8512282062983372, "learning_rate": 1.756654242158607e-05, "loss": 0.4802, "mean_token_accuracy": 0.8350167060270905, "num_tokens": 87612667.0, "step": 204 }, { "entropy": 0.446807861328125, "epoch": 0.8070866141732284, "grad_norm": 0.8346172013248041, "learning_rate": 1.7538061381944524e-05, "loss": 0.4828, "mean_token_accuracy": 0.8335348330438137, "num_tokens": 88032722.0, "step": 205 }, { "entropy": 0.436798095703125, "epoch": 0.8110236220472441, "grad_norm": 0.9406039912781453, "learning_rate": 1.7509438016993435e-05, "loss": 0.5043, "mean_token_accuracy": 0.828425613231957, "num_tokens": 88455185.0, "step": 206 }, { "entropy": 0.443450927734375, "epoch": 0.8149606299212598, "grad_norm": 0.8878870161898881, "learning_rate": 1.748067286716741e-05, "loss": 0.5053, "mean_token_accuracy": 0.8270987952128053, "num_tokens": 88887403.0, "step": 207 }, { "entropy": 0.43701171875, "epoch": 0.8188976377952756, "grad_norm": 0.833846984688481, "learning_rate": 1.745176647557809e-05, "loss": 0.4806, "mean_token_accuracy": 0.8361224737018347, "num_tokens": 89321230.0, "step": 208 }, { "entropy": 0.434295654296875, "epoch": 0.8228346456692913, "grad_norm": 0.8800807090551229, "learning_rate": 1.7422719388003882e-05, "loss": 0.4851, "mean_token_accuracy": 0.834100566804409, "num_tokens": 89752168.0, "step": 209 }, { "entropy": 0.4471435546875, "epoch": 0.8267716535433071, "grad_norm": 0.8536884570400775, "learning_rate": 1.739353215287965e-05, "loss": 0.5013, "mean_token_accuracy": 0.8307216819375753, "num_tokens": 90185259.0, "step": 210 }, { "entropy": 0.43634033203125, "epoch": 0.8307086614173228, "grad_norm": 0.838204501619093, "learning_rate": 1.7364205321286393e-05, "loss": 0.4741, "mean_token_accuracy": 0.8391950000077486, "num_tokens": 90618555.0, "step": 211 }, { "entropy": 0.438018798828125, "epoch": 0.8346456692913385, "grad_norm": 0.8339181556557149, "learning_rate": 1.7334739446940785e-05, "loss": 0.487, "mean_token_accuracy": 0.8343816567212343, "num_tokens": 91070952.0, "step": 212 }, { "entropy": 0.444671630859375, "epoch": 0.8385826771653543, "grad_norm": 0.9454821175064054, "learning_rate": 1.730513508618477e-05, "loss": 0.4879, "mean_token_accuracy": 0.8355796793475747, "num_tokens": 91492780.0, "step": 213 }, { "entropy": 0.443389892578125, "epoch": 0.84251968503937, "grad_norm": 0.9537363792825977, "learning_rate": 1.7275392797975034e-05, "loss": 0.4788, "mean_token_accuracy": 0.8368481360375881, "num_tokens": 91928189.0, "step": 214 }, { "entropy": 0.439117431640625, "epoch": 0.8464566929133859, "grad_norm": 0.8170845597708071, "learning_rate": 1.7245513143872458e-05, "loss": 0.4796, "mean_token_accuracy": 0.8352800803259015, "num_tokens": 92380202.0, "step": 215 }, { "entropy": 0.438140869140625, "epoch": 0.8503937007874016, "grad_norm": 0.8684828468772561, "learning_rate": 1.7215496688031504e-05, "loss": 0.4824, "mean_token_accuracy": 0.8344074506312609, "num_tokens": 92821942.0, "step": 216 }, { "entropy": 0.4383544921875, "epoch": 0.8543307086614174, "grad_norm": 0.9016894253389791, "learning_rate": 1.718534399718959e-05, "loss": 0.4843, "mean_token_accuracy": 0.8332484466955066, "num_tokens": 93256559.0, "step": 217 }, { "entropy": 0.448089599609375, "epoch": 0.8582677165354331, "grad_norm": 0.7926466666351812, "learning_rate": 1.7155055640656353e-05, "loss": 0.4779, "mean_token_accuracy": 0.8372692186385393, "num_tokens": 93678457.0, "step": 218 }, { "entropy": 0.453216552734375, "epoch": 0.8622047244094488, "grad_norm": 0.7983330308845654, "learning_rate": 1.7124632190302936e-05, "loss": 0.4632, "mean_token_accuracy": 0.8423045501112938, "num_tokens": 94088744.0, "step": 219 }, { "entropy": 0.445556640625, "epoch": 0.8661417322834646, "grad_norm": 0.9230403722775413, "learning_rate": 1.709407422055116e-05, "loss": 0.4675, "mean_token_accuracy": 0.8398523181676865, "num_tokens": 94509523.0, "step": 220 }, { "entropy": 0.449920654296875, "epoch": 0.8700787401574803, "grad_norm": 0.9307744641259067, "learning_rate": 1.70633823083627e-05, "loss": 0.4788, "mean_token_accuracy": 0.8371898429468274, "num_tokens": 94939487.0, "step": 221 }, { "entropy": 0.45404052734375, "epoch": 0.8740157480314961, "grad_norm": 0.8081498704504936, "learning_rate": 1.7032557033228184e-05, "loss": 0.4831, "mean_token_accuracy": 0.8366042831912637, "num_tokens": 95362681.0, "step": 222 }, { "entropy": 0.441864013671875, "epoch": 0.8779527559055118, "grad_norm": 0.7609168209994998, "learning_rate": 1.700159897715624e-05, "loss": 0.4823, "mean_token_accuracy": 0.836696463637054, "num_tokens": 95815007.0, "step": 223 }, { "entropy": 0.452392578125, "epoch": 0.8818897637795275, "grad_norm": 0.8691379438133096, "learning_rate": 1.6970508724662536e-05, "loss": 0.4847, "mean_token_accuracy": 0.8337752502411604, "num_tokens": 96216692.0, "step": 224 }, { "entropy": 0.453155517578125, "epoch": 0.8858267716535433, "grad_norm": 0.9417537213558551, "learning_rate": 1.693928686275871e-05, "loss": 0.4959, "mean_token_accuracy": 0.8307985952124, "num_tokens": 96638434.0, "step": 225 }, { "entropy": 0.45635986328125, "epoch": 0.889763779527559, "grad_norm": 0.8178474225248942, "learning_rate": 1.6907933980941312e-05, "loss": 0.4803, "mean_token_accuracy": 0.8372792527079582, "num_tokens": 97062650.0, "step": 226 }, { "entropy": 0.448638916015625, "epoch": 0.8937007874015748, "grad_norm": 0.8660818057993216, "learning_rate": 1.6876450671180667e-05, "loss": 0.4875, "mean_token_accuracy": 0.8336331462487578, "num_tokens": 97490818.0, "step": 227 }, { "entropy": 0.452239990234375, "epoch": 0.8976377952755905, "grad_norm": 0.8601663155835422, "learning_rate": 1.6844837527909682e-05, "loss": 0.4819, "mean_token_accuracy": 0.8355707786977291, "num_tokens": 97912123.0, "step": 228 }, { "entropy": 0.448455810546875, "epoch": 0.9015748031496063, "grad_norm": 0.8792189335920719, "learning_rate": 1.681309514801265e-05, "loss": 0.4795, "mean_token_accuracy": 0.8368852250277996, "num_tokens": 98346951.0, "step": 229 }, { "entropy": 0.446990966796875, "epoch": 0.905511811023622, "grad_norm": 0.8589439183505942, "learning_rate": 1.6781224130813966e-05, "loss": 0.4751, "mean_token_accuracy": 0.8384532378986478, "num_tokens": 98782373.0, "step": 230 }, { "entropy": 0.44207763671875, "epoch": 0.9094488188976378, "grad_norm": 0.9131583177729585, "learning_rate": 1.6749225078066796e-05, "loss": 0.4732, "mean_token_accuracy": 0.8387185446918011, "num_tokens": 99222036.0, "step": 231 }, { "entropy": 0.442413330078125, "epoch": 0.9133858267716536, "grad_norm": 0.8600516435524386, "learning_rate": 1.6717098593941753e-05, "loss": 0.4793, "mean_token_accuracy": 0.8373282151296735, "num_tokens": 99650218.0, "step": 232 }, { "entropy": 0.440216064453125, "epoch": 0.9173228346456693, "grad_norm": 0.866595517834408, "learning_rate": 1.6684845285015453e-05, "loss": 0.4706, "mean_token_accuracy": 0.8394803171977401, "num_tokens": 100066418.0, "step": 233 }, { "entropy": 0.4385986328125, "epoch": 0.9212598425196851, "grad_norm": 0.8506430586031988, "learning_rate": 1.665246576025908e-05, "loss": 0.4887, "mean_token_accuracy": 0.8329129619523883, "num_tokens": 100509512.0, "step": 234 }, { "entropy": 0.444915771484375, "epoch": 0.9251968503937008, "grad_norm": 0.8365324292407998, "learning_rate": 1.661996063102689e-05, "loss": 0.4746, "mean_token_accuracy": 0.8389232521876693, "num_tokens": 100925854.0, "step": 235 }, { "entropy": 0.4608154296875, "epoch": 0.9291338582677166, "grad_norm": 0.9178339434645408, "learning_rate": 1.658733051104466e-05, "loss": 0.4845, "mean_token_accuracy": 0.8356641856953502, "num_tokens": 101324784.0, "step": 236 }, { "entropy": 0.444549560546875, "epoch": 0.9330708661417323, "grad_norm": 0.8526785526614675, "learning_rate": 1.65545760163981e-05, "loss": 0.4686, "mean_token_accuracy": 0.8390642059966922, "num_tokens": 101737781.0, "step": 237 }, { "entropy": 0.442047119140625, "epoch": 0.937007874015748, "grad_norm": 0.8633913144106983, "learning_rate": 1.6521697765521232e-05, "loss": 0.4724, "mean_token_accuracy": 0.8398132715374231, "num_tokens": 102163549.0, "step": 238 }, { "entropy": 0.440887451171875, "epoch": 0.9409448818897638, "grad_norm": 0.9052461846043562, "learning_rate": 1.64886963791847e-05, "loss": 0.4707, "mean_token_accuracy": 0.8368164198473096, "num_tokens": 102610992.0, "step": 239 }, { "entropy": 0.4443359375, "epoch": 0.9448818897637795, "grad_norm": 0.7465118786788207, "learning_rate": 1.645557248048406e-05, "loss": 0.4678, "mean_token_accuracy": 0.8387394333258271, "num_tokens": 103035316.0, "step": 240 }, { "entropy": 0.44122314453125, "epoch": 0.9488188976377953, "grad_norm": 0.8866699103487138, "learning_rate": 1.642232669482801e-05, "loss": 0.4675, "mean_token_accuracy": 0.8410151610150933, "num_tokens": 103473084.0, "step": 241 }, { "entropy": 0.43780517578125, "epoch": 0.952755905511811, "grad_norm": 0.7937264780981268, "learning_rate": 1.6388959649926567e-05, "loss": 0.4732, "mean_token_accuracy": 0.8379040919244289, "num_tokens": 103902168.0, "step": 242 }, { "entropy": 0.44012451171875, "epoch": 0.9566929133858267, "grad_norm": 0.7697616947989054, "learning_rate": 1.6355471975779255e-05, "loss": 0.4592, "mean_token_accuracy": 0.8422065414488316, "num_tokens": 104326224.0, "step": 243 }, { "entropy": 0.439117431640625, "epoch": 0.9606299212598425, "grad_norm": 0.7598725141371296, "learning_rate": 1.6321864304663174e-05, "loss": 0.4707, "mean_token_accuracy": 0.8379590632393956, "num_tokens": 104764846.0, "step": 244 }, { "entropy": 0.441436767578125, "epoch": 0.9645669291338582, "grad_norm": 0.7779917047277318, "learning_rate": 1.6288137271121066e-05, "loss": 0.4792, "mean_token_accuracy": 0.8374456604942679, "num_tokens": 105184708.0, "step": 245 }, { "entropy": 0.439361572265625, "epoch": 0.968503937007874, "grad_norm": 0.8225151639761566, "learning_rate": 1.6254291511949353e-05, "loss": 0.472, "mean_token_accuracy": 0.8368659280240536, "num_tokens": 105616981.0, "step": 246 }, { "entropy": 0.458038330078125, "epoch": 0.9724409448818898, "grad_norm": 0.8481725978257536, "learning_rate": 1.62203276661861e-05, "loss": 0.4583, "mean_token_accuracy": 0.84222552459687, "num_tokens": 106023465.0, "step": 247 }, { "entropy": 0.4459228515625, "epoch": 0.9763779527559056, "grad_norm": 0.8204906148542874, "learning_rate": 1.618624637509895e-05, "loss": 0.4653, "mean_token_accuracy": 0.842322469688952, "num_tokens": 106446966.0, "step": 248 }, { "entropy": 0.449951171875, "epoch": 0.9803149606299213, "grad_norm": 0.7515720467344851, "learning_rate": 1.615204828217302e-05, "loss": 0.4818, "mean_token_accuracy": 0.8364261239767075, "num_tokens": 106898608.0, "step": 249 }, { "entropy": 0.448944091796875, "epoch": 0.984251968503937, "grad_norm": 0.8467560345004221, "learning_rate": 1.6117734033098744e-05, "loss": 0.4605, "mean_token_accuracy": 0.8424621764570475, "num_tokens": 107332517.0, "step": 250 }, { "entropy": 0.449981689453125, "epoch": 0.9881889763779528, "grad_norm": 0.7946487404168803, "learning_rate": 1.60833042757597e-05, "loss": 0.4609, "mean_token_accuracy": 0.8438995983451605, "num_tokens": 107756302.0, "step": 251 }, { "entropy": 0.453155517578125, "epoch": 0.9921259842519685, "grad_norm": 0.797337818033653, "learning_rate": 1.604875966022035e-05, "loss": 0.4702, "mean_token_accuracy": 0.8400178952142596, "num_tokens": 108189062.0, "step": 252 }, { "entropy": 0.468475341796875, "epoch": 0.9960629921259843, "grad_norm": 0.8572234329705702, "learning_rate": 1.6014100838713796e-05, "loss": 0.4686, "mean_token_accuracy": 0.8383585326373577, "num_tokens": 108597149.0, "step": 253 }, { "entropy": 0.4630126953125, "epoch": 1.0, "grad_norm": 0.8143644791410944, "learning_rate": 1.5979328465629435e-05, "loss": 0.4634, "mean_token_accuracy": 0.8411326240748167, "num_tokens": 109014820.0, "step": 254 }, { "entropy": 0.461944580078125, "epoch": 1.0039370078740157, "grad_norm": 0.7954464559368906, "learning_rate": 1.5944443197500633e-05, "loss": 0.4338, "mean_token_accuracy": 0.8487551026046276, "num_tokens": 109433802.0, "step": 255 }, { "entropy": 0.4547119140625, "epoch": 1.0078740157480315, "grad_norm": 0.8003325077804105, "learning_rate": 1.59094456929923e-05, "loss": 0.4411, "mean_token_accuracy": 0.8471461059525609, "num_tokens": 109861074.0, "step": 256 }, { "entropy": 0.444732666015625, "epoch": 1.0118110236220472, "grad_norm": 0.8193521250103798, "learning_rate": 1.5874336612888487e-05, "loss": 0.4529, "mean_token_accuracy": 0.8427032921463251, "num_tokens": 110311771.0, "step": 257 }, { "entropy": 0.445892333984375, "epoch": 1.015748031496063, "grad_norm": 0.8092675680016121, "learning_rate": 1.5839116620079874e-05, "loss": 0.4455, "mean_token_accuracy": 0.8459412306547165, "num_tokens": 110748553.0, "step": 258 }, { "entropy": 0.453948974609375, "epoch": 1.0196850393700787, "grad_norm": 0.832494554347309, "learning_rate": 1.580378637955128e-05, "loss": 0.4401, "mean_token_accuracy": 0.8475761925801635, "num_tokens": 111156318.0, "step": 259 }, { "entropy": 0.451507568359375, "epoch": 1.0236220472440944, "grad_norm": 0.773496689384206, "learning_rate": 1.5768346558369105e-05, "loss": 0.4306, "mean_token_accuracy": 0.8483728105202317, "num_tokens": 111582549.0, "step": 260 }, { "entropy": 0.44976806640625, "epoch": 1.0275590551181102, "grad_norm": 0.7754814952822502, "learning_rate": 1.5732797825668714e-05, "loss": 0.4365, "mean_token_accuracy": 0.8502898076549172, "num_tokens": 111998521.0, "step": 261 }, { "entropy": 0.448455810546875, "epoch": 1.031496062992126, "grad_norm": 0.8112260492156416, "learning_rate": 1.5697140852641835e-05, "loss": 0.4466, "mean_token_accuracy": 0.8444716399535537, "num_tokens": 112418889.0, "step": 262 }, { "entropy": 0.443206787109375, "epoch": 1.0354330708661417, "grad_norm": 0.792232440371198, "learning_rate": 1.5661376312523854e-05, "loss": 0.4454, "mean_token_accuracy": 0.8456090791150928, "num_tokens": 112847214.0, "step": 263 }, { "entropy": 0.44891357421875, "epoch": 1.0393700787401574, "grad_norm": 0.8601941273573265, "learning_rate": 1.5625504880581136e-05, "loss": 0.4437, "mean_token_accuracy": 0.8455395260825753, "num_tokens": 113289373.0, "step": 264 }, { "entropy": 0.44927978515625, "epoch": 1.0433070866141732, "grad_norm": 0.7714822661259353, "learning_rate": 1.5589527234098247e-05, "loss": 0.4456, "mean_token_accuracy": 0.8444314748048782, "num_tokens": 113723545.0, "step": 265 }, { "entropy": 0.442169189453125, "epoch": 1.047244094488189, "grad_norm": 0.8190193432749275, "learning_rate": 1.5553444052365176e-05, "loss": 0.4325, "mean_token_accuracy": 0.8491929210722446, "num_tokens": 114162192.0, "step": 266 }, { "entropy": 0.441619873046875, "epoch": 1.0511811023622046, "grad_norm": 0.7349029400939283, "learning_rate": 1.5517256016664524e-05, "loss": 0.4262, "mean_token_accuracy": 0.8510750867426395, "num_tokens": 114571282.0, "step": 267 }, { "entropy": 0.441680908203125, "epoch": 1.0551181102362204, "grad_norm": 0.7903067387402938, "learning_rate": 1.5480963810258614e-05, "loss": 0.4254, "mean_token_accuracy": 0.8503110585734248, "num_tokens": 115010760.0, "step": 268 }, { "entropy": 0.437774658203125, "epoch": 1.0590551181102361, "grad_norm": 0.8078264014108711, "learning_rate": 1.5444568118376615e-05, "loss": 0.4266, "mean_token_accuracy": 0.8510611765086651, "num_tokens": 115428842.0, "step": 269 }, { "entropy": 0.43804931640625, "epoch": 1.0629921259842519, "grad_norm": 0.7113878989173036, "learning_rate": 1.5408069628201597e-05, "loss": 0.4411, "mean_token_accuracy": 0.8480326728895307, "num_tokens": 115858145.0, "step": 270 }, { "entropy": 0.4405517578125, "epoch": 1.0669291338582678, "grad_norm": 0.7629159309151201, "learning_rate": 1.5371469028857534e-05, "loss": 0.44, "mean_token_accuracy": 0.8475739294663072, "num_tokens": 116286624.0, "step": 271 }, { "entropy": 0.43695068359375, "epoch": 1.0708661417322836, "grad_norm": 0.727617304934651, "learning_rate": 1.533476701139633e-05, "loss": 0.4364, "mean_token_accuracy": 0.8470908785238862, "num_tokens": 116713316.0, "step": 272 }, { "entropy": 0.427978515625, "epoch": 1.0748031496062993, "grad_norm": 0.6824270776024675, "learning_rate": 1.5297964268784757e-05, "loss": 0.445, "mean_token_accuracy": 0.8450411949306726, "num_tokens": 117167235.0, "step": 273 }, { "entropy": 0.432647705078125, "epoch": 1.078740157480315, "grad_norm": 0.7312979875445866, "learning_rate": 1.5261061495891345e-05, "loss": 0.4315, "mean_token_accuracy": 0.8500176034867764, "num_tokens": 117613634.0, "step": 274 }, { "entropy": 0.442596435546875, "epoch": 1.0826771653543308, "grad_norm": 0.8021478759232983, "learning_rate": 1.5224059389473305e-05, "loss": 0.4262, "mean_token_accuracy": 0.8529012883082032, "num_tokens": 118016077.0, "step": 275 }, { "entropy": 0.430023193359375, "epoch": 1.0866141732283465, "grad_norm": 0.7358571235840625, "learning_rate": 1.5186958648163344e-05, "loss": 0.4444, "mean_token_accuracy": 0.8468069788068533, "num_tokens": 118481042.0, "step": 276 }, { "entropy": 0.4302978515625, "epoch": 1.0905511811023623, "grad_norm": 0.7025837618217786, "learning_rate": 1.514975997245649e-05, "loss": 0.4256, "mean_token_accuracy": 0.8535407232120633, "num_tokens": 118916540.0, "step": 277 }, { "entropy": 0.427886962890625, "epoch": 1.094488188976378, "grad_norm": 0.7256948688401913, "learning_rate": 1.5112464064696857e-05, "loss": 0.4287, "mean_token_accuracy": 0.8516247281804681, "num_tokens": 119355413.0, "step": 278 }, { "entropy": 0.428436279296875, "epoch": 1.0984251968503937, "grad_norm": 0.7780711790160693, "learning_rate": 1.5075071629064381e-05, "loss": 0.4472, "mean_token_accuracy": 0.8435391476377845, "num_tokens": 119789004.0, "step": 279 }, { "entropy": 0.432952880859375, "epoch": 1.1023622047244095, "grad_norm": 0.7299240812653669, "learning_rate": 1.5037583371561538e-05, "loss": 0.4361, "mean_token_accuracy": 0.8491124296560884, "num_tokens": 120207170.0, "step": 280 }, { "entropy": 0.43359375, "epoch": 1.1062992125984252, "grad_norm": 0.7409579348895512, "learning_rate": 1.5000000000000002e-05, "loss": 0.4435, "mean_token_accuracy": 0.8463100017979741, "num_tokens": 120639252.0, "step": 281 }, { "entropy": 0.4307861328125, "epoch": 1.110236220472441, "grad_norm": 0.7628848757746238, "learning_rate": 1.4962322223987284e-05, "loss": 0.4293, "mean_token_accuracy": 0.8522774102166295, "num_tokens": 121066062.0, "step": 282 }, { "entropy": 0.433319091796875, "epoch": 1.1141732283464567, "grad_norm": 0.819524267296068, "learning_rate": 1.4924550754913341e-05, "loss": 0.4334, "mean_token_accuracy": 0.8487180238589644, "num_tokens": 121494245.0, "step": 283 }, { "entropy": 0.424285888671875, "epoch": 1.1181102362204725, "grad_norm": 0.7201240645893007, "learning_rate": 1.4886686305937133e-05, "loss": 0.4231, "mean_token_accuracy": 0.8517216173931956, "num_tokens": 121939800.0, "step": 284 }, { "entropy": 0.424102783203125, "epoch": 1.1220472440944882, "grad_norm": 0.7187534396317979, "learning_rate": 1.4848729591973165e-05, "loss": 0.4276, "mean_token_accuracy": 0.8503343118354678, "num_tokens": 122376313.0, "step": 285 }, { "entropy": 0.4246826171875, "epoch": 1.125984251968504, "grad_norm": 0.7377358272265221, "learning_rate": 1.4810681329677988e-05, "loss": 0.4319, "mean_token_accuracy": 0.8497972404584289, "num_tokens": 122805350.0, "step": 286 }, { "entropy": 0.42626953125, "epoch": 1.1299212598425197, "grad_norm": 1.0338835019493653, "learning_rate": 1.477254223743666e-05, "loss": 0.4286, "mean_token_accuracy": 0.8504892103374004, "num_tokens": 123229585.0, "step": 287 }, { "entropy": 0.430145263671875, "epoch": 1.1338582677165354, "grad_norm": 0.7549593653526081, "learning_rate": 1.4734313035349205e-05, "loss": 0.4157, "mean_token_accuracy": 0.8539745900779963, "num_tokens": 123667139.0, "step": 288 }, { "entropy": 0.42559814453125, "epoch": 1.1377952755905512, "grad_norm": 0.7134584103671152, "learning_rate": 1.4695994445216985e-05, "loss": 0.4429, "mean_token_accuracy": 0.8458532355725765, "num_tokens": 124098824.0, "step": 289 }, { "entropy": 0.433685302734375, "epoch": 1.141732283464567, "grad_norm": 0.7330700557433506, "learning_rate": 1.4657587190529099e-05, "loss": 0.4262, "mean_token_accuracy": 0.8504032399505377, "num_tokens": 124515502.0, "step": 290 }, { "entropy": 0.429229736328125, "epoch": 1.1456692913385826, "grad_norm": 0.7185743079970203, "learning_rate": 1.4619091996448703e-05, "loss": 0.4283, "mean_token_accuracy": 0.8502658074721694, "num_tokens": 124940837.0, "step": 291 }, { "entropy": 0.442840576171875, "epoch": 1.1496062992125984, "grad_norm": 0.7104408412257786, "learning_rate": 1.458050958979933e-05, "loss": 0.4236, "mean_token_accuracy": 0.8509505931288004, "num_tokens": 125365604.0, "step": 292 }, { "entropy": 0.4324951171875, "epoch": 1.1535433070866141, "grad_norm": 0.7048242096715929, "learning_rate": 1.4541840699051168e-05, "loss": 0.4348, "mean_token_accuracy": 0.8513169949874282, "num_tokens": 125806425.0, "step": 293 }, { "entropy": 0.439178466796875, "epoch": 1.1574803149606299, "grad_norm": 0.7362993621651959, "learning_rate": 1.4503086054307299e-05, "loss": 0.4319, "mean_token_accuracy": 0.8472118573263288, "num_tokens": 126232534.0, "step": 294 }, { "entropy": 0.437774658203125, "epoch": 1.1614173228346456, "grad_norm": 0.7408531510746268, "learning_rate": 1.4464246387289913e-05, "loss": 0.4302, "mean_token_accuracy": 0.8503606310114264, "num_tokens": 126682871.0, "step": 295 }, { "entropy": 0.44171142578125, "epoch": 1.1653543307086613, "grad_norm": 0.6950128099475602, "learning_rate": 1.4425322431326504e-05, "loss": 0.4388, "mean_token_accuracy": 0.8464201996102929, "num_tokens": 127113863.0, "step": 296 }, { "entropy": 0.44354248046875, "epoch": 1.169291338582677, "grad_norm": 0.7118074965909784, "learning_rate": 1.438631492133601e-05, "loss": 0.4127, "mean_token_accuracy": 0.8560283463448286, "num_tokens": 127533345.0, "step": 297 }, { "entropy": 0.44549560546875, "epoch": 1.1732283464566928, "grad_norm": 0.7536132862966228, "learning_rate": 1.4347224593814946e-05, "loss": 0.4319, "mean_token_accuracy": 0.8498925063759089, "num_tokens": 127948674.0, "step": 298 }, { "entropy": 0.448089599609375, "epoch": 1.1771653543307086, "grad_norm": 0.7828383574148693, "learning_rate": 1.4308052186823494e-05, "loss": 0.4221, "mean_token_accuracy": 0.8534400537610054, "num_tokens": 128362416.0, "step": 299 }, { "entropy": 0.43896484375, "epoch": 1.1811023622047245, "grad_norm": 0.7094320191090657, "learning_rate": 1.4268798439971572e-05, "loss": 0.4291, "mean_token_accuracy": 0.8505124570801854, "num_tokens": 128812262.0, "step": 300 }, { "entropy": 0.431182861328125, "epoch": 1.1850393700787403, "grad_norm": 0.7150260895657441, "learning_rate": 1.4229464094404866e-05, "loss": 0.4327, "mean_token_accuracy": 0.850852720439434, "num_tokens": 129267041.0, "step": 301 }, { "entropy": 0.437042236328125, "epoch": 1.188976377952756, "grad_norm": 0.7384859056056637, "learning_rate": 1.4190049892790838e-05, "loss": 0.4278, "mean_token_accuracy": 0.8514499422162771, "num_tokens": 129701771.0, "step": 302 }, { "entropy": 0.442169189453125, "epoch": 1.1929133858267718, "grad_norm": 0.7886399080694347, "learning_rate": 1.4150556579304699e-05, "loss": 0.442, "mean_token_accuracy": 0.8480188464745879, "num_tokens": 130124702.0, "step": 303 }, { "entropy": 0.441558837890625, "epoch": 1.1968503937007875, "grad_norm": 0.7466370557730762, "learning_rate": 1.4110984899615367e-05, "loss": 0.4191, "mean_token_accuracy": 0.8521094862371683, "num_tokens": 130538776.0, "step": 304 }, { "entropy": 0.436798095703125, "epoch": 1.2007874015748032, "grad_norm": 0.8104033828186294, "learning_rate": 1.4071335600871388e-05, "loss": 0.4228, "mean_token_accuracy": 0.8536786120384932, "num_tokens": 130979212.0, "step": 305 }, { "entropy": 0.443572998046875, "epoch": 1.204724409448819, "grad_norm": 0.7850350920921043, "learning_rate": 1.4031609431686809e-05, "loss": 0.4163, "mean_token_accuracy": 0.8537906985729933, "num_tokens": 131405428.0, "step": 306 }, { "entropy": 0.441162109375, "epoch": 1.2086614173228347, "grad_norm": 0.7762261376252096, "learning_rate": 1.3991807142127082e-05, "loss": 0.4339, "mean_token_accuracy": 0.8480509323999286, "num_tokens": 131837961.0, "step": 307 }, { "entropy": 0.4342041015625, "epoch": 1.2125984251968505, "grad_norm": 0.6805963341437536, "learning_rate": 1.3951929483694855e-05, "loss": 0.4219, "mean_token_accuracy": 0.8525160830467939, "num_tokens": 132267303.0, "step": 308 }, { "entropy": 0.431793212890625, "epoch": 1.2165354330708662, "grad_norm": 0.7879655600731412, "learning_rate": 1.3911977209315828e-05, "loss": 0.4412, "mean_token_accuracy": 0.8463876061141491, "num_tokens": 132723994.0, "step": 309 }, { "entropy": 0.43023681640625, "epoch": 1.220472440944882, "grad_norm": 0.7753424379604112, "learning_rate": 1.3871951073324508e-05, "loss": 0.4229, "mean_token_accuracy": 0.8535848595201969, "num_tokens": 133172478.0, "step": 310 }, { "entropy": 0.43377685546875, "epoch": 1.2244094488188977, "grad_norm": 0.7633773162167979, "learning_rate": 1.3831851831449973e-05, "loss": 0.4372, "mean_token_accuracy": 0.8473470462486148, "num_tokens": 133631582.0, "step": 311 }, { "entropy": 0.437347412109375, "epoch": 1.2283464566929134, "grad_norm": 0.7490203495280126, "learning_rate": 1.3791680240801608e-05, "loss": 0.4253, "mean_token_accuracy": 0.8518748395144939, "num_tokens": 134067377.0, "step": 312 }, { "entropy": 0.437225341796875, "epoch": 1.2322834645669292, "grad_norm": 0.7821105835443857, "learning_rate": 1.3751437059854809e-05, "loss": 0.43, "mean_token_accuracy": 0.8515894012525678, "num_tokens": 134491172.0, "step": 313 }, { "entropy": 0.440277099609375, "epoch": 1.236220472440945, "grad_norm": 0.7967419864814663, "learning_rate": 1.3711123048436652e-05, "loss": 0.4194, "mean_token_accuracy": 0.8529211021959782, "num_tokens": 134898327.0, "step": 314 }, { "entropy": 0.432159423828125, "epoch": 1.2401574803149606, "grad_norm": 0.7914552161401749, "learning_rate": 1.3670738967711566e-05, "loss": 0.421, "mean_token_accuracy": 0.8497829381376505, "num_tokens": 135323334.0, "step": 315 }, { "entropy": 0.43121337890625, "epoch": 1.2440944881889764, "grad_norm": 0.7866132156657194, "learning_rate": 1.3630285580166946e-05, "loss": 0.4255, "mean_token_accuracy": 0.8531960425898433, "num_tokens": 135755228.0, "step": 316 }, { "entropy": 0.426910400390625, "epoch": 1.2480314960629921, "grad_norm": 0.770938614701006, "learning_rate": 1.358976364959876e-05, "loss": 0.4332, "mean_token_accuracy": 0.8500475706532598, "num_tokens": 136203741.0, "step": 317 }, { "entropy": 0.41986083984375, "epoch": 1.2519685039370079, "grad_norm": 0.6865132743346913, "learning_rate": 1.3549173941097134e-05, "loss": 0.4131, "mean_token_accuracy": 0.8563643284142017, "num_tokens": 136627448.0, "step": 318 }, { "entropy": 0.42071533203125, "epoch": 1.2559055118110236, "grad_norm": 0.76446995637985, "learning_rate": 1.3508517221031898e-05, "loss": 0.4306, "mean_token_accuracy": 0.8519408302381635, "num_tokens": 137055328.0, "step": 319 }, { "entropy": 0.418701171875, "epoch": 1.2598425196850394, "grad_norm": 0.7091617898089253, "learning_rate": 1.346779425703812e-05, "loss": 0.4129, "mean_token_accuracy": 0.8539134385064244, "num_tokens": 137484547.0, "step": 320 }, { "entropy": 0.423858642578125, "epoch": 1.263779527559055, "grad_norm": 0.7176796997298532, "learning_rate": 1.3427005818001615e-05, "loss": 0.4299, "mean_token_accuracy": 0.8518371032550931, "num_tokens": 137902620.0, "step": 321 }, { "entropy": 0.423004150390625, "epoch": 1.2677165354330708, "grad_norm": 0.7495725876408128, "learning_rate": 1.3386152674044421e-05, "loss": 0.4316, "mean_token_accuracy": 0.8498124582692981, "num_tokens": 138337754.0, "step": 322 }, { "entropy": 0.41943359375, "epoch": 1.2716535433070866, "grad_norm": 0.7128450005810961, "learning_rate": 1.334523559651027e-05, "loss": 0.4182, "mean_token_accuracy": 0.8539979690685868, "num_tokens": 138765177.0, "step": 323 }, { "entropy": 0.429351806640625, "epoch": 1.2755905511811023, "grad_norm": 0.7646983127030109, "learning_rate": 1.3304255357950004e-05, "loss": 0.4144, "mean_token_accuracy": 0.8527556182816625, "num_tokens": 139191798.0, "step": 324 }, { "entropy": 0.419830322265625, "epoch": 1.279527559055118, "grad_norm": 0.720920386611709, "learning_rate": 1.3263212732107014e-05, "loss": 0.4307, "mean_token_accuracy": 0.8504047309979796, "num_tokens": 139628968.0, "step": 325 }, { "entropy": 0.42205810546875, "epoch": 1.2834645669291338, "grad_norm": 0.7638329733373392, "learning_rate": 1.3222108493902613e-05, "loss": 0.4227, "mean_token_accuracy": 0.8527109837159514, "num_tokens": 140048601.0, "step": 326 }, { "entropy": 0.422271728515625, "epoch": 1.2874015748031495, "grad_norm": 0.7211271314435947, "learning_rate": 1.3180943419421409e-05, "loss": 0.4166, "mean_token_accuracy": 0.8536871457472444, "num_tokens": 140465892.0, "step": 327 }, { "entropy": 0.415863037109375, "epoch": 1.2913385826771653, "grad_norm": 0.6870608297675647, "learning_rate": 1.3139718285896657e-05, "loss": 0.4196, "mean_token_accuracy": 0.854581861756742, "num_tokens": 140899011.0, "step": 328 }, { "entropy": 0.4163818359375, "epoch": 1.295275590551181, "grad_norm": 0.7349072604607171, "learning_rate": 1.3098433871695572e-05, "loss": 0.4247, "mean_token_accuracy": 0.8518269741907716, "num_tokens": 141327992.0, "step": 329 }, { "entropy": 0.413848876953125, "epoch": 1.2992125984251968, "grad_norm": 0.7355537771461196, "learning_rate": 1.305709095630466e-05, "loss": 0.4277, "mean_token_accuracy": 0.850621142424643, "num_tokens": 141766293.0, "step": 330 }, { "entropy": 0.420196533203125, "epoch": 1.3031496062992125, "grad_norm": 0.676410365989162, "learning_rate": 1.3015690320314952e-05, "loss": 0.4117, "mean_token_accuracy": 0.8557650512084365, "num_tokens": 142186747.0, "step": 331 }, { "entropy": 0.418426513671875, "epoch": 1.3070866141732282, "grad_norm": 0.7156171897978183, "learning_rate": 1.2974232745407326e-05, "loss": 0.4005, "mean_token_accuracy": 0.8567338529974222, "num_tokens": 142604960.0, "step": 332 }, { "entropy": 0.426544189453125, "epoch": 1.311023622047244, "grad_norm": 0.6649803220610805, "learning_rate": 1.2932719014337697e-05, "loss": 0.4207, "mean_token_accuracy": 0.8537461366504431, "num_tokens": 143019812.0, "step": 333 }, { "entropy": 0.4180908203125, "epoch": 1.3149606299212597, "grad_norm": 0.6991751691443175, "learning_rate": 1.2891149910922267e-05, "loss": 0.4184, "mean_token_accuracy": 0.8524497682228684, "num_tokens": 143476442.0, "step": 334 }, { "entropy": 0.417327880859375, "epoch": 1.3188976377952755, "grad_norm": 0.7211632281848889, "learning_rate": 1.2849526220022713e-05, "loss": 0.4192, "mean_token_accuracy": 0.8534535896033049, "num_tokens": 143905945.0, "step": 335 }, { "entropy": 0.42132568359375, "epoch": 1.3228346456692912, "grad_norm": 0.7616936561270807, "learning_rate": 1.2807848727531372e-05, "loss": 0.4269, "mean_token_accuracy": 0.8515564789995551, "num_tokens": 144328854.0, "step": 336 }, { "entropy": 0.4254150390625, "epoch": 1.326771653543307, "grad_norm": 0.6751865858547865, "learning_rate": 1.276611822035641e-05, "loss": 0.4183, "mean_token_accuracy": 0.8517815675586462, "num_tokens": 144767518.0, "step": 337 }, { "entropy": 0.42669677734375, "epoch": 1.330708661417323, "grad_norm": 0.7290990437060788, "learning_rate": 1.2724335486406947e-05, "loss": 0.4058, "mean_token_accuracy": 0.8585278857499361, "num_tokens": 145204450.0, "step": 338 }, { "entropy": 0.421051025390625, "epoch": 1.3346456692913387, "grad_norm": 0.687664495765818, "learning_rate": 1.26825013145782e-05, "loss": 0.4086, "mean_token_accuracy": 0.8543958617374301, "num_tokens": 145650203.0, "step": 339 }, { "entropy": 0.417572021484375, "epoch": 1.3385826771653544, "grad_norm": 0.746553500387901, "learning_rate": 1.264061649473657e-05, "loss": 0.4309, "mean_token_accuracy": 0.8497815914452076, "num_tokens": 146086535.0, "step": 340 }, { "entropy": 0.420928955078125, "epoch": 1.3425196850393701, "grad_norm": 0.642370025581711, "learning_rate": 1.2598681817704755e-05, "loss": 0.4232, "mean_token_accuracy": 0.8529811156913638, "num_tokens": 146525758.0, "step": 341 }, { "entropy": 0.43072509765625, "epoch": 1.3464566929133859, "grad_norm": 0.6619628860223508, "learning_rate": 1.2556698075246776e-05, "loss": 0.4163, "mean_token_accuracy": 0.853263552300632, "num_tokens": 146949125.0, "step": 342 }, { "entropy": 0.420806884765625, "epoch": 1.3503937007874016, "grad_norm": 0.7236645387749685, "learning_rate": 1.2514666060053075e-05, "loss": 0.426, "mean_token_accuracy": 0.8519914764910936, "num_tokens": 147387665.0, "step": 343 }, { "entropy": 0.42279052734375, "epoch": 1.3543307086614174, "grad_norm": 0.7020684815127656, "learning_rate": 1.2472586565725513e-05, "loss": 0.4075, "mean_token_accuracy": 0.8560398099943995, "num_tokens": 147814355.0, "step": 344 }, { "entropy": 0.41729736328125, "epoch": 1.358267716535433, "grad_norm": 0.7351973042533226, "learning_rate": 1.2430460386762406e-05, "loss": 0.4176, "mean_token_accuracy": 0.8536938540637493, "num_tokens": 148258418.0, "step": 345 }, { "entropy": 0.4271240234375, "epoch": 1.3622047244094488, "grad_norm": 0.7201567319054922, "learning_rate": 1.2388288318543513e-05, "loss": 0.4225, "mean_token_accuracy": 0.8535017529502511, "num_tokens": 148709126.0, "step": 346 }, { "entropy": 0.430023193359375, "epoch": 1.3661417322834646, "grad_norm": 0.7437528228507801, "learning_rate": 1.2346071157315026e-05, "loss": 0.4164, "mean_token_accuracy": 0.8525076750665903, "num_tokens": 149129878.0, "step": 347 }, { "entropy": 0.43548583984375, "epoch": 1.3700787401574803, "grad_norm": 0.7497143090625151, "learning_rate": 1.230380970017453e-05, "loss": 0.4273, "mean_token_accuracy": 0.85127994697541, "num_tokens": 149546730.0, "step": 348 }, { "entropy": 0.42578125, "epoch": 1.374015748031496, "grad_norm": 0.741499648414104, "learning_rate": 1.2261504745055963e-05, "loss": 0.4188, "mean_token_accuracy": 0.8544770767912269, "num_tokens": 149964392.0, "step": 349 }, { "entropy": 0.426513671875, "epoch": 1.3779527559055118, "grad_norm": 0.699150853468733, "learning_rate": 1.2219157090714536e-05, "loss": 0.4203, "mean_token_accuracy": 0.8539074826985598, "num_tokens": 150387746.0, "step": 350 }, { "entropy": 0.43170166015625, "epoch": 1.3818897637795275, "grad_norm": 0.7799990935872828, "learning_rate": 1.2176767536711658e-05, "loss": 0.4148, "mean_token_accuracy": 0.8564818482846022, "num_tokens": 150818559.0, "step": 351 }, { "entropy": 0.42596435546875, "epoch": 1.3858267716535433, "grad_norm": 0.680008208583702, "learning_rate": 1.2134336883399855e-05, "loss": 0.4068, "mean_token_accuracy": 0.8564103506505489, "num_tokens": 151239247.0, "step": 352 }, { "entropy": 0.422210693359375, "epoch": 1.389763779527559, "grad_norm": 0.7126291861119709, "learning_rate": 1.2091865931907627e-05, "loss": 0.4151, "mean_token_accuracy": 0.8551490902900696, "num_tokens": 151671201.0, "step": 353 }, { "entropy": 0.429656982421875, "epoch": 1.3937007874015748, "grad_norm": 0.7386831790798457, "learning_rate": 1.2049355484124351e-05, "loss": 0.4214, "mean_token_accuracy": 0.8525898391380906, "num_tokens": 152092308.0, "step": 354 }, { "entropy": 0.42755126953125, "epoch": 1.3976377952755905, "grad_norm": 0.6966152016027973, "learning_rate": 1.2006806342685127e-05, "loss": 0.4244, "mean_token_accuracy": 0.8513237368315458, "num_tokens": 152521483.0, "step": 355 }, { "entropy": 0.42974853515625, "epoch": 1.4015748031496063, "grad_norm": 0.6830368381973837, "learning_rate": 1.196421931095562e-05, "loss": 0.4075, "mean_token_accuracy": 0.8552977237850428, "num_tokens": 152951062.0, "step": 356 }, { "entropy": 0.4261474609375, "epoch": 1.405511811023622, "grad_norm": 0.7312814209734928, "learning_rate": 1.1921595193016905e-05, "loss": 0.4078, "mean_token_accuracy": 0.8583087539300323, "num_tokens": 153368483.0, "step": 357 }, { "entropy": 0.4244384765625, "epoch": 1.4094488188976377, "grad_norm": 0.6835796438866236, "learning_rate": 1.1878934793650273e-05, "loss": 0.4146, "mean_token_accuracy": 0.8542582355439663, "num_tokens": 153791350.0, "step": 358 }, { "entropy": 0.41558837890625, "epoch": 1.4133858267716535, "grad_norm": 0.6586015598075508, "learning_rate": 1.1836238918322041e-05, "loss": 0.4094, "mean_token_accuracy": 0.855911853723228, "num_tokens": 154243369.0, "step": 359 }, { "entropy": 0.425201416015625, "epoch": 1.4173228346456692, "grad_norm": 0.6850251308250546, "learning_rate": 1.1793508373168346e-05, "loss": 0.4108, "mean_token_accuracy": 0.8540153652429581, "num_tokens": 154653609.0, "step": 360 }, { "entropy": 0.427581787109375, "epoch": 1.421259842519685, "grad_norm": 0.6685207920133769, "learning_rate": 1.1750743964979919e-05, "loss": 0.4191, "mean_token_accuracy": 0.8522771028801799, "num_tokens": 155070913.0, "step": 361 }, { "entropy": 0.416900634765625, "epoch": 1.425196850393701, "grad_norm": 0.7561179827701838, "learning_rate": 1.1707946501186853e-05, "loss": 0.4167, "mean_token_accuracy": 0.8548763170838356, "num_tokens": 155529833.0, "step": 362 }, { "entropy": 0.422332763671875, "epoch": 1.4291338582677167, "grad_norm": 0.6786764165360469, "learning_rate": 1.1665116789843376e-05, "loss": 0.412, "mean_token_accuracy": 0.8565678047016263, "num_tokens": 155974428.0, "step": 363 }, { "entropy": 0.4224853515625, "epoch": 1.4330708661417324, "grad_norm": 0.6451845587584786, "learning_rate": 1.1622255639612553e-05, "loss": 0.4125, "mean_token_accuracy": 0.8562461519613862, "num_tokens": 156403714.0, "step": 364 }, { "entropy": 0.418853759765625, "epoch": 1.4370078740157481, "grad_norm": 0.6849351310551496, "learning_rate": 1.1579363859751069e-05, "loss": 0.4234, "mean_token_accuracy": 0.8534889034926891, "num_tokens": 156832734.0, "step": 365 }, { "entropy": 0.41510009765625, "epoch": 1.4409448818897639, "grad_norm": 0.7306208881773922, "learning_rate": 1.1536442260093908e-05, "loss": 0.4125, "mean_token_accuracy": 0.8564850222319365, "num_tokens": 157293224.0, "step": 366 }, { "entropy": 0.42584228515625, "epoch": 1.4448818897637796, "grad_norm": 0.6623778060860301, "learning_rate": 1.1493491651039077e-05, "loss": 0.4085, "mean_token_accuracy": 0.8566481098532677, "num_tokens": 157714322.0, "step": 367 }, { "entropy": 0.42413330078125, "epoch": 1.4488188976377954, "grad_norm": 0.6686515862412451, "learning_rate": 1.1450512843532315e-05, "loss": 0.4232, "mean_token_accuracy": 0.8522218987345695, "num_tokens": 158153426.0, "step": 368 }, { "entropy": 0.431060791015625, "epoch": 1.452755905511811, "grad_norm": 0.6883001908644174, "learning_rate": 1.140750664905177e-05, "loss": 0.4226, "mean_token_accuracy": 0.8529786402359605, "num_tokens": 158578195.0, "step": 369 }, { "entropy": 0.423797607421875, "epoch": 1.4566929133858268, "grad_norm": 0.6606135266079957, "learning_rate": 1.1364473879592674e-05, "loss": 0.413, "mean_token_accuracy": 0.8547450276091695, "num_tokens": 159000177.0, "step": 370 }, { "entropy": 0.4185791015625, "epoch": 1.4606299212598426, "grad_norm": 0.7149594414525444, "learning_rate": 1.1321415347652031e-05, "loss": 0.3968, "mean_token_accuracy": 0.8579149143770337, "num_tokens": 159434631.0, "step": 371 }, { "entropy": 0.421356201171875, "epoch": 1.4645669291338583, "grad_norm": 0.7071013927213717, "learning_rate": 1.1278331866213253e-05, "loss": 0.3968, "mean_token_accuracy": 0.8596271779388189, "num_tokens": 159850666.0, "step": 372 }, { "entropy": 0.420806884765625, "epoch": 1.468503937007874, "grad_norm": 0.8505216784835236, "learning_rate": 1.1235224248730821e-05, "loss": 0.4221, "mean_token_accuracy": 0.8532926142215729, "num_tokens": 160304984.0, "step": 373 }, { "entropy": 0.422149658203125, "epoch": 1.4724409448818898, "grad_norm": 0.7819391261612006, "learning_rate": 1.1192093309114933e-05, "loss": 0.4048, "mean_token_accuracy": 0.8586824173107743, "num_tokens": 160717222.0, "step": 374 }, { "entropy": 0.434478759765625, "epoch": 1.4763779527559056, "grad_norm": 0.6951485933891031, "learning_rate": 1.1148939861716124e-05, "loss": 0.3963, "mean_token_accuracy": 0.8589826161041856, "num_tokens": 161120940.0, "step": 375 }, { "entropy": 0.428924560546875, "epoch": 1.4803149606299213, "grad_norm": 0.7048568705927686, "learning_rate": 1.11057647213099e-05, "loss": 0.3902, "mean_token_accuracy": 0.8643534425646067, "num_tokens": 161534008.0, "step": 376 }, { "entropy": 0.43157958984375, "epoch": 1.484251968503937, "grad_norm": 0.6722355315921953, "learning_rate": 1.1062568703081345e-05, "loss": 0.4055, "mean_token_accuracy": 0.8573996061459184, "num_tokens": 161960631.0, "step": 377 }, { "entropy": 0.4361572265625, "epoch": 1.4881889763779528, "grad_norm": 0.7490074817348712, "learning_rate": 1.1019352622609739e-05, "loss": 0.4032, "mean_token_accuracy": 0.8587049478664994, "num_tokens": 162370841.0, "step": 378 }, { "entropy": 0.43182373046875, "epoch": 1.4921259842519685, "grad_norm": 0.7287285943859506, "learning_rate": 1.0976117295853155e-05, "loss": 0.4025, "mean_token_accuracy": 0.8602571506053209, "num_tokens": 162777887.0, "step": 379 }, { "entropy": 0.4241943359375, "epoch": 1.4960629921259843, "grad_norm": 0.7026020806956982, "learning_rate": 1.093286353913305e-05, "loss": 0.4161, "mean_token_accuracy": 0.8565910197794437, "num_tokens": 163218983.0, "step": 380 }, { "entropy": 0.433135986328125, "epoch": 1.5, "grad_norm": 0.7204054324234719, "learning_rate": 1.0889592169118857e-05, "loss": 0.3933, "mean_token_accuracy": 0.8597572650760412, "num_tokens": 163623329.0, "step": 381 }, { "entropy": 0.427337646484375, "epoch": 1.5039370078740157, "grad_norm": 0.6903686768507965, "learning_rate": 1.0846304002812564e-05, "loss": 0.4033, "mean_token_accuracy": 0.858160094358027, "num_tokens": 164045642.0, "step": 382 }, { "entropy": 0.421173095703125, "epoch": 1.5078740157480315, "grad_norm": 0.6980585646431972, "learning_rate": 1.0802999857533288e-05, "loss": 0.4081, "mean_token_accuracy": 0.8575689736753702, "num_tokens": 164478955.0, "step": 383 }, { "entropy": 0.42193603515625, "epoch": 1.5118110236220472, "grad_norm": 0.9737017821239141, "learning_rate": 1.0759680550901843e-05, "loss": 0.4136, "mean_token_accuracy": 0.8535346928983927, "num_tokens": 164909956.0, "step": 384 }, { "entropy": 0.430023193359375, "epoch": 1.515748031496063, "grad_norm": 0.7106007919465092, "learning_rate": 1.0716346900825298e-05, "loss": 0.3999, "mean_token_accuracy": 0.8596385335549712, "num_tokens": 165331198.0, "step": 385 }, { "entropy": 0.42340087890625, "epoch": 1.5196850393700787, "grad_norm": 0.6820846944502266, "learning_rate": 1.0672999725481549e-05, "loss": 0.4079, "mean_token_accuracy": 0.8579740738496184, "num_tokens": 165758143.0, "step": 386 }, { "entropy": 0.42694091796875, "epoch": 1.5236220472440944, "grad_norm": 0.7542107896226161, "learning_rate": 1.0629639843303857e-05, "loss": 0.4011, "mean_token_accuracy": 0.8585777133703232, "num_tokens": 166173128.0, "step": 387 }, { "entropy": 0.421539306640625, "epoch": 1.5275590551181102, "grad_norm": 0.6609543935957453, "learning_rate": 1.0586268072965395e-05, "loss": 0.4126, "mean_token_accuracy": 0.856118586845696, "num_tokens": 166625567.0, "step": 388 }, { "entropy": 0.426116943359375, "epoch": 1.531496062992126, "grad_norm": 0.6894576484238519, "learning_rate": 1.0542885233363797e-05, "loss": 0.4006, "mean_token_accuracy": 0.8593427939340472, "num_tokens": 167051168.0, "step": 389 }, { "entropy": 0.426666259765625, "epoch": 1.5354330708661417, "grad_norm": 0.6806968879939391, "learning_rate": 1.0499492143605698e-05, "loss": 0.4015, "mean_token_accuracy": 0.8581527229398489, "num_tokens": 167486244.0, "step": 390 }, { "entropy": 0.425384521484375, "epoch": 1.5393700787401574, "grad_norm": 0.7586136635597545, "learning_rate": 1.0456089622991264e-05, "loss": 0.4226, "mean_token_accuracy": 0.8518975591287017, "num_tokens": 167928276.0, "step": 391 }, { "entropy": 0.42388916015625, "epoch": 1.5433070866141732, "grad_norm": 0.6526633786320399, "learning_rate": 1.0412678490998717e-05, "loss": 0.4031, "mean_token_accuracy": 0.8584257867187262, "num_tokens": 168355746.0, "step": 392 }, { "entropy": 0.4183349609375, "epoch": 1.547244094488189, "grad_norm": 0.6770265837460369, "learning_rate": 1.0369259567268882e-05, "loss": 0.3949, "mean_token_accuracy": 0.8612643834203482, "num_tokens": 168798129.0, "step": 393 }, { "entropy": 0.42486572265625, "epoch": 1.5511811023622046, "grad_norm": 0.6920016700595446, "learning_rate": 1.0325833671589687e-05, "loss": 0.3995, "mean_token_accuracy": 0.8574335686862469, "num_tokens": 169217374.0, "step": 394 }, { "entropy": 0.42083740234375, "epoch": 1.5551181102362204, "grad_norm": 0.7056602515336996, "learning_rate": 1.0282401623880704e-05, "loss": 0.4057, "mean_token_accuracy": 0.8575547644868493, "num_tokens": 169656947.0, "step": 395 }, { "entropy": 0.42193603515625, "epoch": 1.5590551181102361, "grad_norm": 0.6909377706505936, "learning_rate": 1.0238964244177657e-05, "loss": 0.4048, "mean_token_accuracy": 0.858373093418777, "num_tokens": 170077729.0, "step": 396 }, { "entropy": 0.425323486328125, "epoch": 1.5629921259842519, "grad_norm": 0.7117437962313572, "learning_rate": 1.0195522352616942e-05, "loss": 0.4132, "mean_token_accuracy": 0.8565368922427297, "num_tokens": 170499116.0, "step": 397 }, { "entropy": 0.423797607421875, "epoch": 1.5669291338582676, "grad_norm": 0.7009070523233998, "learning_rate": 1.0152076769420153e-05, "loss": 0.3969, "mean_token_accuracy": 0.8602678831666708, "num_tokens": 170919221.0, "step": 398 }, { "entropy": 0.42755126953125, "epoch": 1.5708661417322833, "grad_norm": 0.6952782642754524, "learning_rate": 1.0108628314878572e-05, "loss": 0.4128, "mean_token_accuracy": 0.8561601033434272, "num_tokens": 171346417.0, "step": 399 }, { "entropy": 0.431640625, "epoch": 1.574803149606299, "grad_norm": 0.6787762500080776, "learning_rate": 1.0065177809337703e-05, "loss": 0.3997, "mean_token_accuracy": 0.8580770511180162, "num_tokens": 171771196.0, "step": 400 }, { "entropy": 0.42333984375, "epoch": 1.5787401574803148, "grad_norm": 0.6641496005676443, "learning_rate": 1.002172607318177e-05, "loss": 0.3952, "mean_token_accuracy": 0.8605634858831763, "num_tokens": 172208563.0, "step": 401 }, { "entropy": 0.42852783203125, "epoch": 1.5826771653543306, "grad_norm": 0.6711002341847971, "learning_rate": 9.978273926818233e-06, "loss": 0.4041, "mean_token_accuracy": 0.8582491222769022, "num_tokens": 172622056.0, "step": 402 }, { "entropy": 0.4249267578125, "epoch": 1.5866141732283463, "grad_norm": 0.6591812100767495, "learning_rate": 9.934822190662299e-06, "loss": 0.4133, "mean_token_accuracy": 0.8562856521457434, "num_tokens": 173072118.0, "step": 403 }, { "entropy": 0.424102783203125, "epoch": 1.590551181102362, "grad_norm": 0.670607342101713, "learning_rate": 9.89137168512143e-06, "loss": 0.4036, "mean_token_accuracy": 0.8581979488953948, "num_tokens": 173501197.0, "step": 404 }, { "entropy": 0.42413330078125, "epoch": 1.594488188976378, "grad_norm": 0.7082529364693488, "learning_rate": 9.847923230579848e-06, "loss": 0.4006, "mean_token_accuracy": 0.8585393913090229, "num_tokens": 173938951.0, "step": 405 }, { "entropy": 0.432098388671875, "epoch": 1.5984251968503937, "grad_norm": 0.7505153208274351, "learning_rate": 9.804477647383061e-06, "loss": 0.4051, "mean_token_accuracy": 0.8565549207851291, "num_tokens": 174344014.0, "step": 406 }, { "entropy": 0.425811767578125, "epoch": 1.6023622047244095, "grad_norm": 0.6819153612915562, "learning_rate": 9.761035755822347e-06, "loss": 0.3974, "mean_token_accuracy": 0.8594374163076282, "num_tokens": 174767953.0, "step": 407 }, { "entropy": 0.426727294921875, "epoch": 1.6062992125984252, "grad_norm": 0.661234527207505, "learning_rate": 9.717598376119301e-06, "loss": 0.4028, "mean_token_accuracy": 0.858024075627327, "num_tokens": 175190002.0, "step": 408 }, { "entropy": 0.42169189453125, "epoch": 1.610236220472441, "grad_norm": 0.678400946836127, "learning_rate": 9.674166328410318e-06, "loss": 0.4057, "mean_token_accuracy": 0.8575535602867603, "num_tokens": 175637779.0, "step": 409 }, { "entropy": 0.429595947265625, "epoch": 1.6141732283464567, "grad_norm": 0.6484799686944109, "learning_rate": 9.630740432731123e-06, "loss": 0.396, "mean_token_accuracy": 0.8635142697021365, "num_tokens": 176063629.0, "step": 410 }, { "entropy": 0.432220458984375, "epoch": 1.6181102362204725, "grad_norm": 0.6562134804867598, "learning_rate": 9.587321509001288e-06, "loss": 0.4129, "mean_token_accuracy": 0.8563750553876162, "num_tokens": 176489720.0, "step": 411 }, { "entropy": 0.429534912109375, "epoch": 1.6220472440944882, "grad_norm": 0.619336896640731, "learning_rate": 9.543910377008741e-06, "loss": 0.4094, "mean_token_accuracy": 0.859122664667666, "num_tokens": 176932692.0, "step": 412 }, { "entropy": 0.4376220703125, "epoch": 1.625984251968504, "grad_norm": 0.6608522160653064, "learning_rate": 9.5005078563943e-06, "loss": 0.3992, "mean_token_accuracy": 0.8599905716255307, "num_tokens": 177345218.0, "step": 413 }, { "entropy": 0.44317626953125, "epoch": 1.6299212598425197, "grad_norm": 0.6802186143674549, "learning_rate": 9.457114766636203e-06, "loss": 0.4074, "mean_token_accuracy": 0.8593994919210672, "num_tokens": 177769367.0, "step": 414 }, { "entropy": 0.42620849609375, "epoch": 1.6338582677165354, "grad_norm": 0.6424723588977458, "learning_rate": 9.413731927034607e-06, "loss": 0.3942, "mean_token_accuracy": 0.8619447741657495, "num_tokens": 178213481.0, "step": 415 }, { "entropy": 0.431243896484375, "epoch": 1.6377952755905512, "grad_norm": 0.6273568531212063, "learning_rate": 9.370360156696143e-06, "loss": 0.4009, "mean_token_accuracy": 0.8575205830857158, "num_tokens": 178649774.0, "step": 416 }, { "entropy": 0.43670654296875, "epoch": 1.641732283464567, "grad_norm": 0.6791296212575202, "learning_rate": 9.327000274518453e-06, "loss": 0.4069, "mean_token_accuracy": 0.8575573619455099, "num_tokens": 179067613.0, "step": 417 }, { "entropy": 0.433624267578125, "epoch": 1.6456692913385826, "grad_norm": 0.6728543531262471, "learning_rate": 9.283653099174704e-06, "loss": 0.4207, "mean_token_accuracy": 0.8519806191325188, "num_tokens": 179499205.0, "step": 418 }, { "entropy": 0.437164306640625, "epoch": 1.6496062992125984, "grad_norm": 0.649958590187722, "learning_rate": 9.24031944909816e-06, "loss": 0.3958, "mean_token_accuracy": 0.8601369233801961, "num_tokens": 179929093.0, "step": 419 }, { "entropy": 0.434326171875, "epoch": 1.6535433070866141, "grad_norm": 0.6737577071985799, "learning_rate": 9.197000142466715e-06, "loss": 0.4059, "mean_token_accuracy": 0.8580764941871166, "num_tokens": 180389799.0, "step": 420 }, { "entropy": 0.432830810546875, "epoch": 1.65748031496063, "grad_norm": 0.6492707624195736, "learning_rate": 9.15369599718744e-06, "loss": 0.3915, "mean_token_accuracy": 0.8608764903619885, "num_tokens": 180820852.0, "step": 421 }, { "entropy": 0.4285888671875, "epoch": 1.6614173228346458, "grad_norm": 0.6603995867326616, "learning_rate": 9.110407830881146e-06, "loss": 0.381, "mean_token_accuracy": 0.8649307256564498, "num_tokens": 181247065.0, "step": 422 }, { "entropy": 0.421783447265625, "epoch": 1.6653543307086616, "grad_norm": 0.6781308462951336, "learning_rate": 9.067136460866954e-06, "loss": 0.4085, "mean_token_accuracy": 0.8567010900005698, "num_tokens": 181685279.0, "step": 423 }, { "entropy": 0.423583984375, "epoch": 1.6692913385826773, "grad_norm": 0.6770679227525491, "learning_rate": 9.023882704146848e-06, "loss": 0.3951, "mean_token_accuracy": 0.8610436161980033, "num_tokens": 182096431.0, "step": 424 }, { "entropy": 0.427703857421875, "epoch": 1.673228346456693, "grad_norm": 0.7056744478166704, "learning_rate": 8.980647377390263e-06, "loss": 0.4031, "mean_token_accuracy": 0.8577226242050529, "num_tokens": 182526927.0, "step": 425 }, { "entropy": 0.423248291015625, "epoch": 1.6771653543307088, "grad_norm": 0.6929097942926685, "learning_rate": 8.937431296918658e-06, "loss": 0.3962, "mean_token_accuracy": 0.8589769685640931, "num_tokens": 182948540.0, "step": 426 }, { "entropy": 0.426116943359375, "epoch": 1.6811023622047245, "grad_norm": 0.6513465373807524, "learning_rate": 8.894235278690104e-06, "loss": 0.396, "mean_token_accuracy": 0.8591319024562836, "num_tokens": 183370596.0, "step": 427 }, { "entropy": 0.436279296875, "epoch": 1.6850393700787403, "grad_norm": 0.6697597022943024, "learning_rate": 8.85106013828388e-06, "loss": 0.3925, "mean_token_accuracy": 0.8593562422320247, "num_tokens": 183754398.0, "step": 428 }, { "entropy": 0.42852783203125, "epoch": 1.688976377952756, "grad_norm": 0.6752483884886014, "learning_rate": 8.80790669088507e-06, "loss": 0.3984, "mean_token_accuracy": 0.8606278160586953, "num_tokens": 184176837.0, "step": 429 }, { "entropy": 0.425018310546875, "epoch": 1.6929133858267718, "grad_norm": 0.666204242518818, "learning_rate": 8.764775751269184e-06, "loss": 0.3927, "mean_token_accuracy": 0.8630478298291564, "num_tokens": 184600270.0, "step": 430 }, { "entropy": 0.421630859375, "epoch": 1.6968503937007875, "grad_norm": 0.6686378991246827, "learning_rate": 8.721668133786752e-06, "loss": 0.3942, "mean_token_accuracy": 0.8612590469419956, "num_tokens": 185036728.0, "step": 431 }, { "entropy": 0.4161376953125, "epoch": 1.7007874015748032, "grad_norm": 0.7354074820468248, "learning_rate": 8.678584652347974e-06, "loss": 0.4132, "mean_token_accuracy": 0.8553070295602083, "num_tokens": 185474069.0, "step": 432 }, { "entropy": 0.426055908203125, "epoch": 1.704724409448819, "grad_norm": 0.7695082385707869, "learning_rate": 8.63552612040733e-06, "loss": 0.4045, "mean_token_accuracy": 0.8592064557597041, "num_tokens": 185892235.0, "step": 433 }, { "entropy": 0.42218017578125, "epoch": 1.7086614173228347, "grad_norm": 0.6611227329939291, "learning_rate": 8.592493350948237e-06, "loss": 0.3902, "mean_token_accuracy": 0.8618718609213829, "num_tokens": 186316312.0, "step": 434 }, { "entropy": 0.4195556640625, "epoch": 1.7125984251968505, "grad_norm": 0.7015841661068855, "learning_rate": 8.549487156467691e-06, "loss": 0.3939, "mean_token_accuracy": 0.8596938429400325, "num_tokens": 186742337.0, "step": 435 }, { "entropy": 0.420379638671875, "epoch": 1.7165354330708662, "grad_norm": 0.7060512471368613, "learning_rate": 8.506508348960924e-06, "loss": 0.3865, "mean_token_accuracy": 0.8619933761656284, "num_tokens": 187178664.0, "step": 436 }, { "entropy": 0.42059326171875, "epoch": 1.720472440944882, "grad_norm": 0.6387961887270812, "learning_rate": 8.463557739906094e-06, "loss": 0.3926, "mean_token_accuracy": 0.8623356893658638, "num_tokens": 187604003.0, "step": 437 }, { "entropy": 0.416229248046875, "epoch": 1.7244094488188977, "grad_norm": 0.672058128974811, "learning_rate": 8.42063614024893e-06, "loss": 0.3944, "mean_token_accuracy": 0.859029428102076, "num_tokens": 188050204.0, "step": 438 }, { "entropy": 0.417755126953125, "epoch": 1.7283464566929134, "grad_norm": 0.7264979233715146, "learning_rate": 8.377744360387447e-06, "loss": 0.396, "mean_token_accuracy": 0.8614224148914218, "num_tokens": 188482220.0, "step": 439 }, { "entropy": 0.41650390625, "epoch": 1.7322834645669292, "grad_norm": 0.673741771374478, "learning_rate": 8.334883210156629e-06, "loss": 0.3869, "mean_token_accuracy": 0.8622207688167691, "num_tokens": 188913952.0, "step": 440 }, { "entropy": 0.418426513671875, "epoch": 1.736220472440945, "grad_norm": 0.6980231663029676, "learning_rate": 8.292053498813149e-06, "loss": 0.3896, "mean_token_accuracy": 0.8624574858695269, "num_tokens": 189328708.0, "step": 441 }, { "entropy": 0.4112548828125, "epoch": 1.7401574803149606, "grad_norm": 0.7061589196986463, "learning_rate": 8.249256035020086e-06, "loss": 0.3915, "mean_token_accuracy": 0.8615807592868805, "num_tokens": 189795592.0, "step": 442 }, { "entropy": 0.418060302734375, "epoch": 1.7440944881889764, "grad_norm": 0.6641651313861893, "learning_rate": 8.20649162683166e-06, "loss": 0.3904, "mean_token_accuracy": 0.8617926817387342, "num_tokens": 190225732.0, "step": 443 }, { "entropy": 0.41802978515625, "epoch": 1.7480314960629921, "grad_norm": 0.877405172892866, "learning_rate": 8.163761081677962e-06, "loss": 0.4026, "mean_token_accuracy": 0.8586821621283889, "num_tokens": 190657380.0, "step": 444 }, { "entropy": 0.417144775390625, "epoch": 1.7519685039370079, "grad_norm": 0.7049436005394818, "learning_rate": 8.12106520634973e-06, "loss": 0.3976, "mean_token_accuracy": 0.8597701685503125, "num_tokens": 191094942.0, "step": 445 }, { "entropy": 0.430206298828125, "epoch": 1.7559055118110236, "grad_norm": 0.6749513439611118, "learning_rate": 8.078404806983096e-06, "loss": 0.3812, "mean_token_accuracy": 0.8642371194437146, "num_tokens": 191505327.0, "step": 446 }, { "entropy": 0.417510986328125, "epoch": 1.7598425196850394, "grad_norm": 0.7424818686985509, "learning_rate": 8.035780689044381e-06, "loss": 0.3866, "mean_token_accuracy": 0.8627121299505234, "num_tokens": 191944808.0, "step": 447 }, { "entropy": 0.420654296875, "epoch": 1.763779527559055, "grad_norm": 0.6958968245355566, "learning_rate": 7.993193657314874e-06, "loss": 0.3908, "mean_token_accuracy": 0.8628187980502844, "num_tokens": 192357839.0, "step": 448 }, { "entropy": 0.42572021484375, "epoch": 1.7677165354330708, "grad_norm": 0.6651030427108865, "learning_rate": 7.95064451587565e-06, "loss": 0.4052, "mean_token_accuracy": 0.8572182497009635, "num_tokens": 192784432.0, "step": 449 }, { "entropy": 0.41754150390625, "epoch": 1.7716535433070866, "grad_norm": 0.626078997321363, "learning_rate": 7.908134068092375e-06, "loss": 0.3913, "mean_token_accuracy": 0.8627305366098881, "num_tokens": 193232872.0, "step": 450 }, { "entropy": 0.42828369140625, "epoch": 1.7755905511811023, "grad_norm": 0.6801797942178992, "learning_rate": 7.865663116600149e-06, "loss": 0.3999, "mean_token_accuracy": 0.859979891218245, "num_tokens": 193673419.0, "step": 451 }, { "entropy": 0.42181396484375, "epoch": 1.779527559055118, "grad_norm": 0.6282853116668798, "learning_rate": 7.823232463288344e-06, "loss": 0.384, "mean_token_accuracy": 0.8639516020193696, "num_tokens": 194112394.0, "step": 452 }, { "entropy": 0.418121337890625, "epoch": 1.7834645669291338, "grad_norm": 0.6796178904809435, "learning_rate": 7.780842909285471e-06, "loss": 0.4193, "mean_token_accuracy": 0.8537283595651388, "num_tokens": 194562565.0, "step": 453 }, { "entropy": 0.42059326171875, "epoch": 1.7874015748031495, "grad_norm": 0.704596185547038, "learning_rate": 7.738495254944042e-06, "loss": 0.3904, "mean_token_accuracy": 0.8616365287452936, "num_tokens": 195006242.0, "step": 454 }, { "entropy": 0.42645263671875, "epoch": 1.7913385826771653, "grad_norm": 0.660325371978726, "learning_rate": 7.696190299825474e-06, "loss": 0.4005, "mean_token_accuracy": 0.8596520023420453, "num_tokens": 195428816.0, "step": 455 }, { "entropy": 0.423370361328125, "epoch": 1.795275590551181, "grad_norm": 0.6709634013713723, "learning_rate": 7.65392884268498e-06, "loss": 0.3912, "mean_token_accuracy": 0.8635533200576901, "num_tokens": 195853603.0, "step": 456 }, { "entropy": 0.426971435546875, "epoch": 1.7992125984251968, "grad_norm": 0.6881407153331328, "learning_rate": 7.611711681456493e-06, "loss": 0.401, "mean_token_accuracy": 0.8594867596402764, "num_tokens": 196275977.0, "step": 457 }, { "entropy": 0.420867919921875, "epoch": 1.8031496062992125, "grad_norm": 0.6417147807851576, "learning_rate": 7.569539613237595e-06, "loss": 0.3954, "mean_token_accuracy": 0.8610730767250061, "num_tokens": 196713889.0, "step": 458 }, { "entropy": 0.418243408203125, "epoch": 1.8070866141732282, "grad_norm": 0.6310110212469618, "learning_rate": 7.527413434274487e-06, "loss": 0.3885, "mean_token_accuracy": 0.8633995288982987, "num_tokens": 197144027.0, "step": 459 }, { "entropy": 0.417816162109375, "epoch": 1.811023622047244, "grad_norm": 0.6597677933522972, "learning_rate": 7.485333939946926e-06, "loss": 0.3949, "mean_token_accuracy": 0.8605340076610446, "num_tokens": 197580520.0, "step": 460 }, { "entropy": 0.4188232421875, "epoch": 1.8149606299212597, "grad_norm": 0.6723363401244932, "learning_rate": 7.443301924753224e-06, "loss": 0.3993, "mean_token_accuracy": 0.8572817407548428, "num_tokens": 198014473.0, "step": 461 }, { "entropy": 0.41595458984375, "epoch": 1.8188976377952755, "grad_norm": 0.6173635578239941, "learning_rate": 7.4013181822952484e-06, "loss": 0.3894, "mean_token_accuracy": 0.8615096509456635, "num_tokens": 198455804.0, "step": 462 }, { "entropy": 0.41363525390625, "epoch": 1.8228346456692912, "grad_norm": 0.6661812853397687, "learning_rate": 7.359383505263431e-06, "loss": 0.3856, "mean_token_accuracy": 0.8620446948334575, "num_tokens": 198870929.0, "step": 463 }, { "entropy": 0.4166259765625, "epoch": 1.826771653543307, "grad_norm": 0.6574118930371434, "learning_rate": 7.317498685421803e-06, "loss": 0.3879, "mean_token_accuracy": 0.8604755392298102, "num_tokens": 199305535.0, "step": 464 }, { "entropy": 0.4122314453125, "epoch": 1.8307086614173227, "grad_norm": 0.6434036995854752, "learning_rate": 7.275664513593057e-06, "loss": 0.3836, "mean_token_accuracy": 0.8634475152939558, "num_tokens": 199750969.0, "step": 465 }, { "entropy": 0.4150390625, "epoch": 1.8346456692913384, "grad_norm": 0.6764928500122582, "learning_rate": 7.233881779643595e-06, "loss": 0.3916, "mean_token_accuracy": 0.863021994009614, "num_tokens": 200195670.0, "step": 466 }, { "entropy": 0.4140625, "epoch": 1.8385826771653542, "grad_norm": 0.9197242588519385, "learning_rate": 7.19215127246863e-06, "loss": 0.3788, "mean_token_accuracy": 0.8645974956452847, "num_tokens": 200634020.0, "step": 467 }, { "entropy": 0.420623779296875, "epoch": 1.84251968503937, "grad_norm": 0.683308798654025, "learning_rate": 7.150473779977292e-06, "loss": 0.3927, "mean_token_accuracy": 0.8626933787018061, "num_tokens": 201068695.0, "step": 468 }, { "entropy": 0.420379638671875, "epoch": 1.8464566929133859, "grad_norm": 0.6859535560379857, "learning_rate": 7.108850089077736e-06, "loss": 0.3938, "mean_token_accuracy": 0.8602238912135363, "num_tokens": 201508090.0, "step": 469 }, { "entropy": 0.4178466796875, "epoch": 1.8503937007874016, "grad_norm": 0.6740037040573118, "learning_rate": 7.0672809856623036e-06, "loss": 0.3792, "mean_token_accuracy": 0.8625660231336951, "num_tokens": 201919084.0, "step": 470 }, { "entropy": 0.41619873046875, "epoch": 1.8543307086614174, "grad_norm": 0.6767452030032078, "learning_rate": 7.0257672545926755e-06, "loss": 0.3829, "mean_token_accuracy": 0.864514097571373, "num_tokens": 202352097.0, "step": 471 }, { "entropy": 0.4151611328125, "epoch": 1.858267716535433, "grad_norm": 0.6681058032124264, "learning_rate": 6.984309679685049e-06, "loss": 0.3896, "mean_token_accuracy": 0.8648245232179761, "num_tokens": 202784053.0, "step": 472 }, { "entropy": 0.420684814453125, "epoch": 1.8622047244094488, "grad_norm": 0.6281109991828322, "learning_rate": 6.942909043695345e-06, "loss": 0.394, "mean_token_accuracy": 0.8599920589476824, "num_tokens": 203218387.0, "step": 473 }, { "entropy": 0.420074462890625, "epoch": 1.8661417322834646, "grad_norm": 0.6650717181718757, "learning_rate": 6.901566128304429e-06, "loss": 0.3949, "mean_token_accuracy": 0.8606854053214192, "num_tokens": 203648613.0, "step": 474 }, { "entropy": 0.41729736328125, "epoch": 1.8700787401574803, "grad_norm": 0.688270984124111, "learning_rate": 6.86028171410335e-06, "loss": 0.3977, "mean_token_accuracy": 0.8606042871251702, "num_tokens": 204085760.0, "step": 475 }, { "entropy": 0.424652099609375, "epoch": 1.874015748031496, "grad_norm": 0.66428003722815, "learning_rate": 6.8190565805785965e-06, "loss": 0.3819, "mean_token_accuracy": 0.8656391901895404, "num_tokens": 204510920.0, "step": 476 }, { "entropy": 0.42144775390625, "epoch": 1.8779527559055118, "grad_norm": 0.7095703761600634, "learning_rate": 6.777891506097394e-06, "loss": 0.3817, "mean_token_accuracy": 0.8649838771671057, "num_tokens": 204928475.0, "step": 477 }, { "entropy": 0.41656494140625, "epoch": 1.8818897637795275, "grad_norm": 0.6557237079195142, "learning_rate": 6.736787267892991e-06, "loss": 0.3752, "mean_token_accuracy": 0.8673708308488131, "num_tokens": 205375452.0, "step": 478 }, { "entropy": 0.427093505859375, "epoch": 1.8858267716535433, "grad_norm": 0.6954548155651175, "learning_rate": 6.695744642050001e-06, "loss": 0.3928, "mean_token_accuracy": 0.8597937086597085, "num_tokens": 205791665.0, "step": 479 }, { "entropy": 0.418548583984375, "epoch": 1.889763779527559, "grad_norm": 0.6341218489395974, "learning_rate": 6.654764403489737e-06, "loss": 0.3775, "mean_token_accuracy": 0.8657321650534868, "num_tokens": 206213381.0, "step": 480 }, { "entropy": 0.424468994140625, "epoch": 1.8937007874015748, "grad_norm": 0.6614311846483276, "learning_rate": 6.613847325955578e-06, "loss": 0.3786, "mean_token_accuracy": 0.8649399066343904, "num_tokens": 206621419.0, "step": 481 }, { "entropy": 0.4228515625, "epoch": 1.8976377952755905, "grad_norm": 0.6827200000056922, "learning_rate": 6.572994181998385e-06, "loss": 0.3867, "mean_token_accuracy": 0.8616899996995926, "num_tokens": 207051076.0, "step": 482 }, { "entropy": 0.418701171875, "epoch": 1.9015748031496063, "grad_norm": 0.6787415604952725, "learning_rate": 6.532205742961881e-06, "loss": 0.3903, "mean_token_accuracy": 0.8626525811851025, "num_tokens": 207493089.0, "step": 483 }, { "entropy": 0.428619384765625, "epoch": 1.905511811023622, "grad_norm": 0.7016721452472603, "learning_rate": 6.491482778968103e-06, "loss": 0.3934, "mean_token_accuracy": 0.8631287338212132, "num_tokens": 207907472.0, "step": 484 }, { "entropy": 0.4241943359375, "epoch": 1.909448818897638, "grad_norm": 0.7185424042860358, "learning_rate": 6.450826058902868e-06, "loss": 0.4029, "mean_token_accuracy": 0.858769909478724, "num_tokens": 208333643.0, "step": 485 }, { "entropy": 0.426910400390625, "epoch": 1.9133858267716537, "grad_norm": 0.6673224326835845, "learning_rate": 6.41023635040124e-06, "loss": 0.3878, "mean_token_accuracy": 0.862129864282906, "num_tokens": 208770435.0, "step": 486 }, { "entropy": 0.430267333984375, "epoch": 1.9173228346456694, "grad_norm": 0.6437703200019689, "learning_rate": 6.369714419833056e-06, "loss": 0.3834, "mean_token_accuracy": 0.8656902518123388, "num_tokens": 209205852.0, "step": 487 }, { "entropy": 0.420867919921875, "epoch": 1.9212598425196852, "grad_norm": 0.6072676743596053, "learning_rate": 6.3292610322884365e-06, "loss": 0.3792, "mean_token_accuracy": 0.8643869431689382, "num_tokens": 209663707.0, "step": 488 }, { "entropy": 0.429290771484375, "epoch": 1.925196850393701, "grad_norm": 0.6705397286490944, "learning_rate": 6.288876951563352e-06, "loss": 0.3654, "mean_token_accuracy": 0.868832329288125, "num_tokens": 210064008.0, "step": 489 }, { "entropy": 0.428009033203125, "epoch": 1.9291338582677167, "grad_norm": 0.6636918090501086, "learning_rate": 6.2485629401451954e-06, "loss": 0.3944, "mean_token_accuracy": 0.8630169397220016, "num_tokens": 210492962.0, "step": 490 }, { "entropy": 0.42431640625, "epoch": 1.9330708661417324, "grad_norm": 0.6519084693087212, "learning_rate": 6.2083197591983935e-06, "loss": 0.3829, "mean_token_accuracy": 0.8642466831952333, "num_tokens": 210923592.0, "step": 491 }, { "entropy": 0.413330078125, "epoch": 1.9370078740157481, "grad_norm": 0.61204996961206, "learning_rate": 6.168148168550029e-06, "loss": 0.3808, "mean_token_accuracy": 0.8643651902675629, "num_tokens": 211364636.0, "step": 492 }, { "entropy": 0.4154052734375, "epoch": 1.9409448818897639, "grad_norm": 0.636254485190738, "learning_rate": 6.128048926675494e-06, "loss": 0.3759, "mean_token_accuracy": 0.8656032215803862, "num_tokens": 211799550.0, "step": 493 }, { "entropy": 0.41650390625, "epoch": 1.9448818897637796, "grad_norm": 0.6406331468098087, "learning_rate": 6.088022790684174e-06, "loss": 0.3794, "mean_token_accuracy": 0.8653112007305026, "num_tokens": 212222704.0, "step": 494 }, { "entropy": 0.418792724609375, "epoch": 1.9488188976377954, "grad_norm": 0.6499997573957319, "learning_rate": 6.048070516305147e-06, "loss": 0.3799, "mean_token_accuracy": 0.86427709646523, "num_tokens": 212669140.0, "step": 495 }, { "entropy": 0.425079345703125, "epoch": 1.952755905511811, "grad_norm": 0.665145279400102, "learning_rate": 6.0081928578729235e-06, "loss": 0.3802, "mean_token_accuracy": 0.8652894785627723, "num_tokens": 213102653.0, "step": 496 }, { "entropy": 0.4298095703125, "epoch": 1.9566929133858268, "grad_norm": 0.6567115782965014, "learning_rate": 5.968390568313194e-06, "loss": 0.4041, "mean_token_accuracy": 0.8583439188078046, "num_tokens": 213537439.0, "step": 497 }, { "entropy": 0.421966552734375, "epoch": 1.9606299212598426, "grad_norm": 0.6356796950899889, "learning_rate": 5.928664399128618e-06, "loss": 0.3956, "mean_token_accuracy": 0.8614333514124155, "num_tokens": 213977409.0, "step": 498 }, { "entropy": 0.427032470703125, "epoch": 1.9645669291338583, "grad_norm": 0.6258173971122529, "learning_rate": 5.889015100384636e-06, "loss": 0.3927, "mean_token_accuracy": 0.8633041819557548, "num_tokens": 214409581.0, "step": 499 }, { "entropy": 0.419891357421875, "epoch": 1.968503937007874, "grad_norm": 0.6365710396494279, "learning_rate": 5.8494434206953054e-06, "loss": 0.3745, "mean_token_accuracy": 0.8675504606217146, "num_tokens": 214842294.0, "step": 500 }, { "entropy": 0.419921875, "epoch": 1.9724409448818898, "grad_norm": 0.673936395220552, "learning_rate": 5.809950107209168e-06, "loss": 0.3825, "mean_token_accuracy": 0.863592054694891, "num_tokens": 215283386.0, "step": 501 }, { "entropy": 0.416656494140625, "epoch": 1.9763779527559056, "grad_norm": 0.6782512956847635, "learning_rate": 5.770535905595138e-06, "loss": 0.39, "mean_token_accuracy": 0.8610916286706924, "num_tokens": 215720079.0, "step": 502 }, { "entropy": 0.423675537109375, "epoch": 1.9803149606299213, "grad_norm": 0.6940032370900541, "learning_rate": 5.731201560028432e-06, "loss": 0.3809, "mean_token_accuracy": 0.8642973145470023, "num_tokens": 216131160.0, "step": 503 }, { "entropy": 0.422760009765625, "epoch": 1.984251968503937, "grad_norm": 0.6566881027504372, "learning_rate": 5.6919478131765075e-06, "loss": 0.3771, "mean_token_accuracy": 0.8662382122129202, "num_tokens": 216557614.0, "step": 504 }, { "entropy": 0.4281005859375, "epoch": 1.9881889763779528, "grad_norm": 0.6871982977212773, "learning_rate": 5.652775406185056e-06, "loss": 0.3787, "mean_token_accuracy": 0.8649186259135604, "num_tokens": 216973484.0, "step": 505 }, { "entropy": 0.420867919921875, "epoch": 1.9921259842519685, "grad_norm": 0.6620807044087577, "learning_rate": 5.613685078663993e-06, "loss": 0.3779, "mean_token_accuracy": 0.8658335618674755, "num_tokens": 217399571.0, "step": 506 }, { "entropy": 0.422088623046875, "epoch": 1.9960629921259843, "grad_norm": 0.6447316252113497, "learning_rate": 5.574677568673499e-06, "loss": 0.375, "mean_token_accuracy": 0.8671787939965725, "num_tokens": 217820936.0, "step": 507 }, { "entropy": 0.41986083984375, "epoch": 2.0, "grad_norm": 0.6671106585235214, "learning_rate": 5.535753612710091e-06, "loss": 0.3747, "mean_token_accuracy": 0.866083949804306, "num_tokens": 218248519.0, "step": 508 }, { "entropy": 0.4205322265625, "epoch": 2.0039370078740157, "grad_norm": 0.7691060576967338, "learning_rate": 5.496913945692706e-06, "loss": 0.3357, "mean_token_accuracy": 0.8782863048836589, "num_tokens": 218681722.0, "step": 509 }, { "entropy": 0.41546630859375, "epoch": 2.0078740157480315, "grad_norm": 0.706074532223459, "learning_rate": 5.458159300948837e-06, "loss": 0.3355, "mean_token_accuracy": 0.8784531345590949, "num_tokens": 219097671.0, "step": 510 }, { "entropy": 0.40863037109375, "epoch": 2.0118110236220472, "grad_norm": 0.6827314808658607, "learning_rate": 5.419490410200675e-06, "loss": 0.3501, "mean_token_accuracy": 0.8720254069194198, "num_tokens": 219548387.0, "step": 511 }, { "entropy": 0.399017333984375, "epoch": 2.015748031496063, "grad_norm": 0.7517593980159016, "learning_rate": 5.3809080035513e-06, "loss": 0.3217, "mean_token_accuracy": 0.883345877751708, "num_tokens": 219990743.0, "step": 512 }, { "entropy": 0.40924072265625, "epoch": 2.0196850393700787, "grad_norm": 0.9325438750744767, "learning_rate": 5.342412809470903e-06, "loss": 0.3426, "mean_token_accuracy": 0.87769855838269, "num_tokens": 220414422.0, "step": 513 }, { "entropy": 0.403961181640625, "epoch": 2.0236220472440944, "grad_norm": 0.6686083056431692, "learning_rate": 5.304005554783015e-06, "loss": 0.3353, "mean_token_accuracy": 0.8792219227179885, "num_tokens": 220847772.0, "step": 514 }, { "entropy": 0.4119873046875, "epoch": 2.02755905511811, "grad_norm": 0.6959836173942446, "learning_rate": 5.265686964650796e-06, "loss": 0.3315, "mean_token_accuracy": 0.8809208925813437, "num_tokens": 221279092.0, "step": 515 }, { "entropy": 0.4127197265625, "epoch": 2.031496062992126, "grad_norm": 0.705326284320785, "learning_rate": 5.227457762563339e-06, "loss": 0.3372, "mean_token_accuracy": 0.8781589884310961, "num_tokens": 221698923.0, "step": 516 }, { "entropy": 0.415802001953125, "epoch": 2.0354330708661417, "grad_norm": 0.6949441787042704, "learning_rate": 5.189318670322016e-06, "loss": 0.3398, "mean_token_accuracy": 0.8793862201273441, "num_tokens": 222134184.0, "step": 517 }, { "entropy": 0.408905029296875, "epoch": 2.0393700787401574, "grad_norm": 0.7195006498702371, "learning_rate": 5.151270408026839e-06, "loss": 0.3281, "mean_token_accuracy": 0.8829669477418065, "num_tokens": 222571682.0, "step": 518 }, { "entropy": 0.415283203125, "epoch": 2.043307086614173, "grad_norm": 0.6442176858506412, "learning_rate": 5.113313694062869e-06, "loss": 0.3345, "mean_token_accuracy": 0.8803455736488104, "num_tokens": 223006861.0, "step": 519 }, { "entropy": 0.40582275390625, "epoch": 2.047244094488189, "grad_norm": 0.7512390521563385, "learning_rate": 5.075449245086661e-06, "loss": 0.3332, "mean_token_accuracy": 0.8804942537099123, "num_tokens": 223430085.0, "step": 520 }, { "entropy": 0.40545654296875, "epoch": 2.0511811023622046, "grad_norm": 0.7818713463029238, "learning_rate": 5.037677776012719e-06, "loss": 0.3502, "mean_token_accuracy": 0.8745955023914576, "num_tokens": 223863341.0, "step": 521 }, { "entropy": 0.402984619140625, "epoch": 2.0551181102362204, "grad_norm": 0.6654412815987092, "learning_rate": 5.000000000000003e-06, "loss": 0.353, "mean_token_accuracy": 0.8740874016657472, "num_tokens": 224309404.0, "step": 522 }, { "entropy": 0.40625, "epoch": 2.059055118110236, "grad_norm": 0.6620816958634905, "learning_rate": 4.962416628438466e-06, "loss": 0.3322, "mean_token_accuracy": 0.8814442995935678, "num_tokens": 224731123.0, "step": 523 }, { "entropy": 0.404052734375, "epoch": 2.062992125984252, "grad_norm": 0.6684098815971495, "learning_rate": 4.924928370935622e-06, "loss": 0.3352, "mean_token_accuracy": 0.8791022077202797, "num_tokens": 225161374.0, "step": 524 }, { "entropy": 0.412261962890625, "epoch": 2.0669291338582676, "grad_norm": 0.6915973611592388, "learning_rate": 4.887535935303147e-06, "loss": 0.3306, "mean_token_accuracy": 0.880140382796526, "num_tokens": 225592845.0, "step": 525 }, { "entropy": 0.40618896484375, "epoch": 2.0708661417322833, "grad_norm": 0.6917073506681685, "learning_rate": 4.850240027543509e-06, "loss": 0.3411, "mean_token_accuracy": 0.8773935958743095, "num_tokens": 226031171.0, "step": 526 }, { "entropy": 0.407501220703125, "epoch": 2.074803149606299, "grad_norm": 0.6590869362543731, "learning_rate": 4.813041351836657e-06, "loss": 0.3253, "mean_token_accuracy": 0.8814301686361432, "num_tokens": 226484203.0, "step": 527 }, { "entropy": 0.4034423828125, "epoch": 2.078740157480315, "grad_norm": 0.660774132093501, "learning_rate": 4.775940610526698e-06, "loss": 0.3371, "mean_token_accuracy": 0.8811690313741565, "num_tokens": 226923813.0, "step": 528 }, { "entropy": 0.4007568359375, "epoch": 2.0826771653543306, "grad_norm": 0.7267964307643108, "learning_rate": 4.738938504108659e-06, "loss": 0.3287, "mean_token_accuracy": 0.8822143021970987, "num_tokens": 227364235.0, "step": 529 }, { "entropy": 0.401336669921875, "epoch": 2.0866141732283463, "grad_norm": 0.7011620090348606, "learning_rate": 4.702035731215249e-06, "loss": 0.3245, "mean_token_accuracy": 0.8842730978503823, "num_tokens": 227804681.0, "step": 530 }, { "entropy": 0.399322509765625, "epoch": 2.090551181102362, "grad_norm": 0.6994922281758611, "learning_rate": 4.665232988603671e-06, "loss": 0.3346, "mean_token_accuracy": 0.8798423083499074, "num_tokens": 228233941.0, "step": 531 }, { "entropy": 0.408050537109375, "epoch": 2.094488188976378, "grad_norm": 0.659969290721089, "learning_rate": 4.6285309711424706e-06, "loss": 0.3282, "mean_token_accuracy": 0.8802306912839413, "num_tokens": 228671574.0, "step": 532 }, { "entropy": 0.40264892578125, "epoch": 2.0984251968503935, "grad_norm": 0.7160845199984103, "learning_rate": 4.59193037179841e-06, "loss": 0.334, "mean_token_accuracy": 0.8797960076481104, "num_tokens": 229097422.0, "step": 533 }, { "entropy": 0.410980224609375, "epoch": 2.1023622047244093, "grad_norm": 0.6651663210919708, "learning_rate": 4.555431881623384e-06, "loss": 0.3324, "mean_token_accuracy": 0.8817898659035563, "num_tokens": 229532488.0, "step": 534 }, { "entropy": 0.4046630859375, "epoch": 2.106299212598425, "grad_norm": 0.6719211249811485, "learning_rate": 4.519036189741386e-06, "loss": 0.3378, "mean_token_accuracy": 0.8793261991813779, "num_tokens": 229955370.0, "step": 535 }, { "entropy": 0.4071044921875, "epoch": 2.1102362204724407, "grad_norm": 0.683765412515, "learning_rate": 4.482743983335478e-06, "loss": 0.3203, "mean_token_accuracy": 0.883794778957963, "num_tokens": 230366605.0, "step": 536 }, { "entropy": 0.403594970703125, "epoch": 2.1141732283464565, "grad_norm": 0.6938692767492354, "learning_rate": 4.446555947634825e-06, "loss": 0.3329, "mean_token_accuracy": 0.8809925802052021, "num_tokens": 230800239.0, "step": 537 }, { "entropy": 0.400390625, "epoch": 2.1181102362204722, "grad_norm": 0.6579903284426181, "learning_rate": 4.410472765901755e-06, "loss": 0.3368, "mean_token_accuracy": 0.8784988336265087, "num_tokens": 231220452.0, "step": 538 }, { "entropy": 0.402130126953125, "epoch": 2.122047244094488, "grad_norm": 0.6480009306742889, "learning_rate": 4.3744951194188645e-06, "loss": 0.3318, "mean_token_accuracy": 0.8805544385686517, "num_tokens": 231669259.0, "step": 539 }, { "entropy": 0.404144287109375, "epoch": 2.1259842519685037, "grad_norm": 0.6762874373310868, "learning_rate": 4.3386236874761455e-06, "loss": 0.3307, "mean_token_accuracy": 0.8821213049814105, "num_tokens": 232092788.0, "step": 540 }, { "entropy": 0.405059814453125, "epoch": 2.1299212598425195, "grad_norm": 0.7007715804079939, "learning_rate": 4.302859147358168e-06, "loss": 0.3285, "mean_token_accuracy": 0.8826727429404855, "num_tokens": 232515170.0, "step": 541 }, { "entropy": 0.402435302734375, "epoch": 2.1338582677165356, "grad_norm": 0.7058653274548208, "learning_rate": 4.267202174331288e-06, "loss": 0.3293, "mean_token_accuracy": 0.8815928604453802, "num_tokens": 232951104.0, "step": 542 }, { "entropy": 0.401519775390625, "epoch": 2.1377952755905514, "grad_norm": 0.6786820083449475, "learning_rate": 4.231653441630899e-06, "loss": 0.332, "mean_token_accuracy": 0.8781663812696934, "num_tokens": 233357394.0, "step": 543 }, { "entropy": 0.40057373046875, "epoch": 2.141732283464567, "grad_norm": 0.6791074299791867, "learning_rate": 4.196213620448724e-06, "loss": 0.3297, "mean_token_accuracy": 0.8811912108212709, "num_tokens": 233777649.0, "step": 544 }, { "entropy": 0.4022216796875, "epoch": 2.145669291338583, "grad_norm": 0.6865111051458076, "learning_rate": 4.160883379920132e-06, "loss": 0.3254, "mean_token_accuracy": 0.8841365138068795, "num_tokens": 234210947.0, "step": 545 }, { "entropy": 0.40753173828125, "epoch": 2.1496062992125986, "grad_norm": 0.6603880166281437, "learning_rate": 4.125663387111519e-06, "loss": 0.3354, "mean_token_accuracy": 0.8794803349301219, "num_tokens": 234642824.0, "step": 546 }, { "entropy": 0.402801513671875, "epoch": 2.1535433070866143, "grad_norm": 0.6910095210213717, "learning_rate": 4.0905543070077036e-06, "loss": 0.3411, "mean_token_accuracy": 0.8780597625300288, "num_tokens": 235081344.0, "step": 547 }, { "entropy": 0.40283203125, "epoch": 2.15748031496063, "grad_norm": 0.6916532852158944, "learning_rate": 4.055556802499373e-06, "loss": 0.3245, "mean_token_accuracy": 0.8826562752947211, "num_tokens": 235519773.0, "step": 548 }, { "entropy": 0.403900146484375, "epoch": 2.161417322834646, "grad_norm": 0.6605629307385316, "learning_rate": 4.020671534370566e-06, "loss": 0.3392, "mean_token_accuracy": 0.8770075533539057, "num_tokens": 235952995.0, "step": 549 }, { "entropy": 0.40240478515625, "epoch": 2.1653543307086616, "grad_norm": 0.6954491968470974, "learning_rate": 3.985899161286205e-06, "loss": 0.3411, "mean_token_accuracy": 0.8809783374890685, "num_tokens": 236377384.0, "step": 550 }, { "entropy": 0.400390625, "epoch": 2.1692913385826773, "grad_norm": 0.6955945501474788, "learning_rate": 3.951240339779649e-06, "loss": 0.3383, "mean_token_accuracy": 0.8795864386484027, "num_tokens": 236805272.0, "step": 551 }, { "entropy": 0.399261474609375, "epoch": 2.173228346456693, "grad_norm": 0.6811865815204354, "learning_rate": 3.916695724240302e-06, "loss": 0.3243, "mean_token_accuracy": 0.8818806270137429, "num_tokens": 237240034.0, "step": 552 }, { "entropy": 0.404266357421875, "epoch": 2.177165354330709, "grad_norm": 0.6604827060686905, "learning_rate": 3.882265966901257e-06, "loss": 0.3338, "mean_token_accuracy": 0.8813010770827532, "num_tokens": 237676815.0, "step": 553 }, { "entropy": 0.397491455078125, "epoch": 2.1811023622047245, "grad_norm": 0.6951568747277557, "learning_rate": 3.847951717826984e-06, "loss": 0.3196, "mean_token_accuracy": 0.8818888068199158, "num_tokens": 238121169.0, "step": 554 }, { "entropy": 0.3974609375, "epoch": 2.1850393700787403, "grad_norm": 0.6604458548896505, "learning_rate": 3.813753624901053e-06, "loss": 0.3235, "mean_token_accuracy": 0.883362052962184, "num_tokens": 238560402.0, "step": 555 }, { "entropy": 0.399688720703125, "epoch": 2.188976377952756, "grad_norm": 0.6595354267544808, "learning_rate": 3.7796723338138995e-06, "loss": 0.3425, "mean_token_accuracy": 0.876252400688827, "num_tokens": 238996471.0, "step": 556 }, { "entropy": 0.39691162109375, "epoch": 2.1929133858267718, "grad_norm": 0.6640704940514063, "learning_rate": 3.7457084880506465e-06, "loss": 0.3284, "mean_token_accuracy": 0.8791890293359756, "num_tokens": 239431088.0, "step": 557 }, { "entropy": 0.402496337890625, "epoch": 2.1968503937007875, "grad_norm": 0.6267558305460987, "learning_rate": 3.7118627288789355e-06, "loss": 0.3274, "mean_token_accuracy": 0.8825696604326367, "num_tokens": 239851102.0, "step": 558 }, { "entropy": 0.39837646484375, "epoch": 2.2007874015748032, "grad_norm": 0.650661471745278, "learning_rate": 3.6781356953368286e-06, "loss": 0.3253, "mean_token_accuracy": 0.8800821900367737, "num_tokens": 240286779.0, "step": 559 }, { "entropy": 0.4083251953125, "epoch": 2.204724409448819, "grad_norm": 0.6676395521640903, "learning_rate": 3.644528024220745e-06, "loss": 0.352, "mean_token_accuracy": 0.8772099521011114, "num_tokens": 240715293.0, "step": 560 }, { "entropy": 0.4019775390625, "epoch": 2.2086614173228347, "grad_norm": 0.6661145894616234, "learning_rate": 3.6110403500734325e-06, "loss": 0.3277, "mean_token_accuracy": 0.8814953323453665, "num_tokens": 241151984.0, "step": 561 }, { "entropy": 0.4093017578125, "epoch": 2.2125984251968505, "grad_norm": 0.6267579335510625, "learning_rate": 3.5776733051719935e-06, "loss": 0.3276, "mean_token_accuracy": 0.882298044860363, "num_tokens": 241582173.0, "step": 562 }, { "entropy": 0.396942138671875, "epoch": 2.216535433070866, "grad_norm": 1.2833446633947432, "learning_rate": 3.5444275195159395e-06, "loss": 0.3259, "mean_token_accuracy": 0.8836051663383842, "num_tokens": 242022323.0, "step": 563 }, { "entropy": 0.397369384765625, "epoch": 2.220472440944882, "grad_norm": 0.6483374974300078, "learning_rate": 3.5113036208152994e-06, "loss": 0.3138, "mean_token_accuracy": 0.8874066807329655, "num_tokens": 242449211.0, "step": 564 }, { "entropy": 0.39886474609375, "epoch": 2.2244094488188977, "grad_norm": 0.7449795107940003, "learning_rate": 3.4783022344787698e-06, "loss": 0.3416, "mean_token_accuracy": 0.8797894669696689, "num_tokens": 242868913.0, "step": 565 }, { "entropy": 0.39984130859375, "epoch": 2.2283464566929134, "grad_norm": 0.7384068020075348, "learning_rate": 3.4454239836019032e-06, "loss": 0.335, "mean_token_accuracy": 0.8808618625625968, "num_tokens": 243288176.0, "step": 566 }, { "entropy": 0.40191650390625, "epoch": 2.232283464566929, "grad_norm": 0.6138346261513192, "learning_rate": 3.412669488955346e-06, "loss": 0.3265, "mean_token_accuracy": 0.8829402485862374, "num_tokens": 243712004.0, "step": 567 }, { "entropy": 0.398529052734375, "epoch": 2.236220472440945, "grad_norm": 0.6826323150688026, "learning_rate": 3.380039368973115e-06, "loss": 0.3525, "mean_token_accuracy": 0.8730159010738134, "num_tokens": 244151217.0, "step": 568 }, { "entropy": 0.39910888671875, "epoch": 2.2401574803149606, "grad_norm": 0.7731434019517666, "learning_rate": 3.347534239740925e-06, "loss": 0.3299, "mean_token_accuracy": 0.8819707138463855, "num_tokens": 244570025.0, "step": 569 }, { "entropy": 0.40252685546875, "epoch": 2.2440944881889764, "grad_norm": 0.7958821930394692, "learning_rate": 3.315154714984554e-06, "loss": 0.3311, "mean_token_accuracy": 0.8825913481414318, "num_tokens": 244995470.0, "step": 570 }, { "entropy": 0.395721435546875, "epoch": 2.248031496062992, "grad_norm": 0.6803195610061324, "learning_rate": 3.2829014060582498e-06, "loss": 0.3174, "mean_token_accuracy": 0.8865452529862523, "num_tokens": 245441326.0, "step": 571 }, { "entropy": 0.396636962890625, "epoch": 2.251968503937008, "grad_norm": 0.6508860212504988, "learning_rate": 3.2507749219332065e-06, "loss": 0.3249, "mean_token_accuracy": 0.8850123547017574, "num_tokens": 245869452.0, "step": 572 }, { "entropy": 0.39361572265625, "epoch": 2.2559055118110236, "grad_norm": 0.6814325606200489, "learning_rate": 3.218775869186038e-06, "loss": 0.32, "mean_token_accuracy": 0.8850808152928948, "num_tokens": 246283765.0, "step": 573 }, { "entropy": 0.390350341796875, "epoch": 2.2598425196850394, "grad_norm": 0.7249147176255266, "learning_rate": 3.1869048519873514e-06, "loss": 0.3319, "mean_token_accuracy": 0.882703147828579, "num_tokens": 246712103.0, "step": 574 }, { "entropy": 0.397247314453125, "epoch": 2.263779527559055, "grad_norm": 0.7259919964209407, "learning_rate": 3.1551624720903197e-06, "loss": 0.3278, "mean_token_accuracy": 0.8822907945141196, "num_tokens": 247128803.0, "step": 575 }, { "entropy": 0.408172607421875, "epoch": 2.267716535433071, "grad_norm": 0.6430937463791561, "learning_rate": 3.1235493288193363e-06, "loss": 0.3154, "mean_token_accuracy": 0.8839439991861582, "num_tokens": 247530472.0, "step": 576 }, { "entropy": 0.3909912109375, "epoch": 2.2716535433070866, "grad_norm": 0.6538968560952247, "learning_rate": 3.0920660190586893e-06, "loss": 0.3227, "mean_token_accuracy": 0.8826503995805979, "num_tokens": 247965594.0, "step": 577 }, { "entropy": 0.392974853515625, "epoch": 2.2755905511811023, "grad_norm": 0.65502612474774, "learning_rate": 3.0607131372412903e-06, "loss": 0.3342, "mean_token_accuracy": 0.8804387804120779, "num_tokens": 248401499.0, "step": 578 }, { "entropy": 0.40069580078125, "epoch": 2.279527559055118, "grad_norm": 0.6717303153956555, "learning_rate": 3.029491275337466e-06, "loss": 0.3267, "mean_token_accuracy": 0.8830945594236255, "num_tokens": 248833275.0, "step": 579 }, { "entropy": 0.391693115234375, "epoch": 2.283464566929134, "grad_norm": 0.662496244283556, "learning_rate": 2.998401022843761e-06, "loss": 0.3285, "mean_token_accuracy": 0.8807523930445313, "num_tokens": 249261595.0, "step": 580 }, { "entropy": 0.3966064453125, "epoch": 2.2874015748031495, "grad_norm": 0.6439773446458666, "learning_rate": 2.9674429667718198e-06, "loss": 0.3233, "mean_token_accuracy": 0.8819929100573063, "num_tokens": 249679398.0, "step": 581 }, { "entropy": 0.394989013671875, "epoch": 2.2913385826771653, "grad_norm": 0.618547629691431, "learning_rate": 2.9366176916373024e-06, "loss": 0.3257, "mean_token_accuracy": 0.8828444425016642, "num_tokens": 250123653.0, "step": 582 }, { "entropy": 0.395904541015625, "epoch": 2.295275590551181, "grad_norm": 0.6734411095539774, "learning_rate": 2.9059257794488428e-06, "loss": 0.3253, "mean_token_accuracy": 0.8810829911381006, "num_tokens": 250553262.0, "step": 583 }, { "entropy": 0.395111083984375, "epoch": 2.2992125984251968, "grad_norm": 0.6609096857306654, "learning_rate": 2.875367809697067e-06, "loss": 0.3247, "mean_token_accuracy": 0.8829443035647273, "num_tokens": 250963697.0, "step": 584 }, { "entropy": 0.394805908203125, "epoch": 2.3031496062992125, "grad_norm": 0.6843839300325607, "learning_rate": 2.84494435934365e-06, "loss": 0.3174, "mean_token_accuracy": 0.8860497623682022, "num_tokens": 251383717.0, "step": 585 }, { "entropy": 0.4027099609375, "epoch": 2.3070866141732282, "grad_norm": 0.693475193681948, "learning_rate": 2.8146560028104155e-06, "loss": 0.3268, "mean_token_accuracy": 0.8848175024613738, "num_tokens": 251809123.0, "step": 586 }, { "entropy": 0.399444580078125, "epoch": 2.311023622047244, "grad_norm": 0.6482014364527828, "learning_rate": 2.7845033119684996e-06, "loss": 0.3249, "mean_token_accuracy": 0.8843581713736057, "num_tokens": 252232897.0, "step": 587 }, { "entropy": 0.394683837890625, "epoch": 2.3149606299212597, "grad_norm": 0.6511044909303425, "learning_rate": 2.7544868561275473e-06, "loss": 0.3033, "mean_token_accuracy": 0.8875886900350451, "num_tokens": 252652937.0, "step": 588 }, { "entropy": 0.393707275390625, "epoch": 2.3188976377952755, "grad_norm": 0.6526243088549442, "learning_rate": 2.724607202024969e-06, "loss": 0.3151, "mean_token_accuracy": 0.884684244170785, "num_tokens": 253092659.0, "step": 589 }, { "entropy": 0.400543212890625, "epoch": 2.322834645669291, "grad_norm": 0.7074127197129519, "learning_rate": 2.694864913815234e-06, "loss": 0.3318, "mean_token_accuracy": 0.8794507039710879, "num_tokens": 253513268.0, "step": 590 }, { "entropy": 0.3931884765625, "epoch": 2.326771653543307, "grad_norm": 0.6708725885175661, "learning_rate": 2.665260553059219e-06, "loss": 0.3079, "mean_token_accuracy": 0.8883746191859245, "num_tokens": 253932423.0, "step": 591 }, { "entropy": 0.391082763671875, "epoch": 2.3307086614173227, "grad_norm": 0.642752601240437, "learning_rate": 2.635794678713611e-06, "loss": 0.3335, "mean_token_accuracy": 0.8798313392326236, "num_tokens": 254366200.0, "step": 592 }, { "entropy": 0.39373779296875, "epoch": 2.3346456692913384, "grad_norm": 0.632491220317453, "learning_rate": 2.6064678471203497e-06, "loss": 0.3197, "mean_token_accuracy": 0.8848578063771129, "num_tokens": 254777165.0, "step": 593 }, { "entropy": 0.391265869140625, "epoch": 2.338582677165354, "grad_norm": 0.6657812420395008, "learning_rate": 2.5772806119961204e-06, "loss": 0.3273, "mean_token_accuracy": 0.8817225815728307, "num_tokens": 255230273.0, "step": 594 }, { "entropy": 0.393798828125, "epoch": 2.34251968503937, "grad_norm": 0.6631893071514698, "learning_rate": 2.5482335244219114e-06, "loss": 0.3415, "mean_token_accuracy": 0.8777058375999331, "num_tokens": 255677101.0, "step": 595 }, { "entropy": 0.394866943359375, "epoch": 2.3464566929133857, "grad_norm": 0.6801590026511766, "learning_rate": 2.519327132832592e-06, "loss": 0.3023, "mean_token_accuracy": 0.8904533553868532, "num_tokens": 256095773.0, "step": 596 }, { "entropy": 0.40203857421875, "epoch": 2.3503937007874014, "grad_norm": 0.6545771849436208, "learning_rate": 2.4905619830065685e-06, "loss": 0.3276, "mean_token_accuracy": 0.8830998111516237, "num_tokens": 256527168.0, "step": 597 }, { "entropy": 0.398193359375, "epoch": 2.354330708661417, "grad_norm": 0.639726060039768, "learning_rate": 2.4619386180554783e-06, "loss": 0.3177, "mean_token_accuracy": 0.8855596333742142, "num_tokens": 256941879.0, "step": 598 }, { "entropy": 0.391021728515625, "epoch": 2.358267716535433, "grad_norm": 0.673255368473798, "learning_rate": 2.4334575784139324e-06, "loss": 0.3216, "mean_token_accuracy": 0.8818031437695026, "num_tokens": 257380979.0, "step": 599 }, { "entropy": 0.3917236328125, "epoch": 2.362204724409449, "grad_norm": 0.6712526931241498, "learning_rate": 2.405119401829312e-06, "loss": 0.331, "mean_token_accuracy": 0.8820068500936031, "num_tokens": 257816910.0, "step": 600 }, { "entropy": 0.395477294921875, "epoch": 2.366141732283465, "grad_norm": 0.6286157138570574, "learning_rate": 2.3769246233516243e-06, "loss": 0.3185, "mean_token_accuracy": 0.8859440181404352, "num_tokens": 258256027.0, "step": 601 }, { "entropy": 0.398040771484375, "epoch": 2.3700787401574805, "grad_norm": 0.6470378030744797, "learning_rate": 2.3488737753233827e-06, "loss": 0.3177, "mean_token_accuracy": 0.883867921307683, "num_tokens": 258673460.0, "step": 602 }, { "entropy": 0.39007568359375, "epoch": 2.3740157480314963, "grad_norm": 0.6228299960277243, "learning_rate": 2.3209673873695705e-06, "loss": 0.3264, "mean_token_accuracy": 0.8804994663223624, "num_tokens": 259111848.0, "step": 603 }, { "entropy": 0.394622802734375, "epoch": 2.377952755905512, "grad_norm": 0.679880113137842, "learning_rate": 2.2932059863876364e-06, "loss": 0.3189, "mean_token_accuracy": 0.8832629825919867, "num_tokens": 259542914.0, "step": 604 }, { "entropy": 0.389068603515625, "epoch": 2.3818897637795278, "grad_norm": 0.6413194426701236, "learning_rate": 2.2655900965375454e-06, "loss": 0.3221, "mean_token_accuracy": 0.8834966970607638, "num_tokens": 259974950.0, "step": 605 }, { "entropy": 0.39306640625, "epoch": 2.3858267716535435, "grad_norm": 0.647333297597132, "learning_rate": 2.2381202392318813e-06, "loss": 0.3255, "mean_token_accuracy": 0.8841407634317875, "num_tokens": 260391029.0, "step": 606 }, { "entropy": 0.389190673828125, "epoch": 2.3897637795275593, "grad_norm": 0.6237674488982197, "learning_rate": 2.210796933126005e-06, "loss": 0.3207, "mean_token_accuracy": 0.8847579173743725, "num_tokens": 260813537.0, "step": 607 }, { "entropy": 0.399444580078125, "epoch": 2.393700787401575, "grad_norm": 0.6641198284295734, "learning_rate": 2.1836206941082593e-06, "loss": 0.3236, "mean_token_accuracy": 0.884418660774827, "num_tokens": 261224513.0, "step": 608 }, { "entropy": 0.39337158203125, "epoch": 2.3976377952755907, "grad_norm": 0.6284008254683265, "learning_rate": 2.1565920352902327e-06, "loss": 0.316, "mean_token_accuracy": 0.8880952065810561, "num_tokens": 261640630.0, "step": 609 }, { "entropy": 0.391754150390625, "epoch": 2.4015748031496065, "grad_norm": 0.63953172691108, "learning_rate": 2.129711466997062e-06, "loss": 0.3103, "mean_token_accuracy": 0.8891152497380972, "num_tokens": 262069836.0, "step": 610 }, { "entropy": 0.39288330078125, "epoch": 2.405511811023622, "grad_norm": 0.64156503400504, "learning_rate": 2.10297949675781e-06, "loss": 0.3337, "mean_token_accuracy": 0.8809280870482326, "num_tokens": 262485062.0, "step": 611 }, { "entropy": 0.391937255859375, "epoch": 2.409448818897638, "grad_norm": 0.7503194408517748, "learning_rate": 2.0763966292958704e-06, "loss": 0.3377, "mean_token_accuracy": 0.8801953559741378, "num_tokens": 262914987.0, "step": 612 }, { "entropy": 0.39837646484375, "epoch": 2.4133858267716537, "grad_norm": 0.6110565618144053, "learning_rate": 2.049963366519446e-06, "loss": 0.3221, "mean_token_accuracy": 0.8836126467213035, "num_tokens": 263333219.0, "step": 613 }, { "entropy": 0.402984619140625, "epoch": 2.4173228346456694, "grad_norm": 0.6475195761761684, "learning_rate": 2.023680207512071e-06, "loss": 0.3216, "mean_token_accuracy": 0.8832600386813283, "num_tokens": 263740746.0, "step": 614 }, { "entropy": 0.394622802734375, "epoch": 2.421259842519685, "grad_norm": 0.6209714529513158, "learning_rate": 1.9975476485231847e-06, "loss": 0.3309, "mean_token_accuracy": 0.8813778571784496, "num_tokens": 264164051.0, "step": 615 }, { "entropy": 0.395050048828125, "epoch": 2.425196850393701, "grad_norm": 0.6604985399806831, "learning_rate": 1.9715661829587653e-06, "loss": 0.3246, "mean_token_accuracy": 0.8825316475704312, "num_tokens": 264589410.0, "step": 616 }, { "entropy": 0.394378662109375, "epoch": 2.4291338582677167, "grad_norm": 0.64135319275765, "learning_rate": 1.94573630137201e-06, "loss": 0.3202, "mean_token_accuracy": 0.8855673084035516, "num_tokens": 265016663.0, "step": 617 }, { "entropy": 0.394073486328125, "epoch": 2.4330708661417324, "grad_norm": 0.6549755012645107, "learning_rate": 1.9200584914540833e-06, "loss": 0.3233, "mean_token_accuracy": 0.8843048512935638, "num_tokens": 265448816.0, "step": 618 }, { "entropy": 0.39239501953125, "epoch": 2.437007874015748, "grad_norm": 0.6460597097434501, "learning_rate": 1.8945332380248914e-06, "loss": 0.3162, "mean_token_accuracy": 0.8841556925326586, "num_tokens": 265877226.0, "step": 619 }, { "entropy": 0.39520263671875, "epoch": 2.440944881889764, "grad_norm": 0.6172518945931613, "learning_rate": 1.8691610230239443e-06, "loss": 0.3289, "mean_token_accuracy": 0.883539610542357, "num_tokens": 266327100.0, "step": 620 }, { "entropy": 0.39593505859375, "epoch": 2.4448818897637796, "grad_norm": 0.6661038350327722, "learning_rate": 1.8439423255012478e-06, "loss": 0.3242, "mean_token_accuracy": 0.8828055150806904, "num_tokens": 266753411.0, "step": 621 }, { "entropy": 0.391571044921875, "epoch": 2.4488188976377954, "grad_norm": 0.6803435522040893, "learning_rate": 1.8188776216082604e-06, "loss": 0.3167, "mean_token_accuracy": 0.8865473745390773, "num_tokens": 267178072.0, "step": 622 }, { "entropy": 0.394989013671875, "epoch": 2.452755905511811, "grad_norm": 0.6808244304054752, "learning_rate": 1.7939673845889072e-06, "loss": 0.3271, "mean_token_accuracy": 0.8820376275107265, "num_tokens": 267599228.0, "step": 623 }, { "entropy": 0.39794921875, "epoch": 2.456692913385827, "grad_norm": 0.6497891086437605, "learning_rate": 1.7692120847706396e-06, "loss": 0.3193, "mean_token_accuracy": 0.8842871803790331, "num_tokens": 268031239.0, "step": 624 }, { "entropy": 0.395477294921875, "epoch": 2.4606299212598426, "grad_norm": 0.6905683476666384, "learning_rate": 1.7446121895555556e-06, "loss": 0.319, "mean_token_accuracy": 0.8847667053341866, "num_tokens": 268447599.0, "step": 625 }, { "entropy": 0.395751953125, "epoch": 2.4645669291338583, "grad_norm": 0.6265658635480155, "learning_rate": 1.7201681634115753e-06, "loss": 0.3156, "mean_token_accuracy": 0.8880419284105301, "num_tokens": 268881017.0, "step": 626 }, { "entropy": 0.396240234375, "epoch": 2.468503937007874, "grad_norm": 0.6399557345959777, "learning_rate": 1.6958804678636743e-06, "loss": 0.3179, "mean_token_accuracy": 0.8842396680265665, "num_tokens": 269311901.0, "step": 627 }, { "entropy": 0.39349365234375, "epoch": 2.47244094488189, "grad_norm": 0.6593703467640568, "learning_rate": 1.6717495614851654e-06, "loss": 0.3247, "mean_token_accuracy": 0.883893528021872, "num_tokens": 269753510.0, "step": 628 }, { "entropy": 0.39080810546875, "epoch": 2.4763779527559056, "grad_norm": 0.6767235781794113, "learning_rate": 1.6477758998890448e-06, "loss": 0.3234, "mean_token_accuracy": 0.8831140054389834, "num_tokens": 270180847.0, "step": 629 }, { "entropy": 0.390716552734375, "epoch": 2.4803149606299213, "grad_norm": 0.628556080122311, "learning_rate": 1.6239599357193837e-06, "loss": 0.3222, "mean_token_accuracy": 0.8835263950750232, "num_tokens": 270614343.0, "step": 630 }, { "entropy": 0.389892578125, "epoch": 2.484251968503937, "grad_norm": 0.6910689957499726, "learning_rate": 1.6003021186427892e-06, "loss": 0.3109, "mean_token_accuracy": 0.8867247756570578, "num_tokens": 271063084.0, "step": 631 }, { "entropy": 0.3984375, "epoch": 2.4881889763779528, "grad_norm": 0.665136959096579, "learning_rate": 1.5768028953399083e-06, "loss": 0.3133, "mean_token_accuracy": 0.8876866241917014, "num_tokens": 271492678.0, "step": 632 }, { "entropy": 0.39459228515625, "epoch": 2.4921259842519685, "grad_norm": 0.6786290792116132, "learning_rate": 1.5534627094969957e-06, "loss": 0.3408, "mean_token_accuracy": 0.8770330473780632, "num_tokens": 271927919.0, "step": 633 }, { "entropy": 0.40264892578125, "epoch": 2.4960629921259843, "grad_norm": 0.6314184625745893, "learning_rate": 1.5302820017975396e-06, "loss": 0.3266, "mean_token_accuracy": 0.8817340964451432, "num_tokens": 272340839.0, "step": 634 }, { "entropy": 0.39813232421875, "epoch": 2.5, "grad_norm": 0.6330687294612445, "learning_rate": 1.5072612099139373e-06, "loss": 0.3285, "mean_token_accuracy": 0.8819912485778332, "num_tokens": 272758612.0, "step": 635 }, { "entropy": 0.39630126953125, "epoch": 2.5039370078740157, "grad_norm": 0.6669821941938795, "learning_rate": 1.4844007684992333e-06, "loss": 0.3138, "mean_token_accuracy": 0.886656578630209, "num_tokens": 273181297.0, "step": 636 }, { "entropy": 0.393096923828125, "epoch": 2.5078740157480315, "grad_norm": 0.6345157626325129, "learning_rate": 1.4617011091789135e-06, "loss": 0.3153, "mean_token_accuracy": 0.8863168517127633, "num_tokens": 273608125.0, "step": 637 }, { "entropy": 0.40460205078125, "epoch": 2.5118110236220472, "grad_norm": 0.6666402919975754, "learning_rate": 1.4391626605427522e-06, "loss": 0.3112, "mean_token_accuracy": 0.8885293649509549, "num_tokens": 274013088.0, "step": 638 }, { "entropy": 0.398956298828125, "epoch": 2.515748031496063, "grad_norm": 0.6605459057250379, "learning_rate": 1.4167858481367237e-06, "loss": 0.3177, "mean_token_accuracy": 0.8849203772842884, "num_tokens": 274443071.0, "step": 639 }, { "entropy": 0.39727783203125, "epoch": 2.5196850393700787, "grad_norm": 0.6263187778391787, "learning_rate": 1.3945710944549705e-06, "loss": 0.3252, "mean_token_accuracy": 0.8821171736344695, "num_tokens": 274872156.0, "step": 640 }, { "entropy": 0.39178466796875, "epoch": 2.5236220472440944, "grad_norm": 0.6356426496587686, "learning_rate": 1.3725188189318172e-06, "loss": 0.3124, "mean_token_accuracy": 0.8871043566614389, "num_tokens": 275329376.0, "step": 641 }, { "entropy": 0.397705078125, "epoch": 2.52755905511811, "grad_norm": 0.6682552356019673, "learning_rate": 1.3506294379338557e-06, "loss": 0.3337, "mean_token_accuracy": 0.877638204023242, "num_tokens": 275767280.0, "step": 642 }, { "entropy": 0.396026611328125, "epoch": 2.531496062992126, "grad_norm": 0.6390620503683951, "learning_rate": 1.3289033647520878e-06, "loss": 0.3122, "mean_token_accuracy": 0.8879886958748102, "num_tokens": 276195274.0, "step": 643 }, { "entropy": 0.398406982421875, "epoch": 2.5354330708661417, "grad_norm": 0.6441985779511971, "learning_rate": 1.307341009594113e-06, "loss": 0.3274, "mean_token_accuracy": 0.8814769377931952, "num_tokens": 276629597.0, "step": 644 }, { "entropy": 0.39813232421875, "epoch": 2.5393700787401574, "grad_norm": 0.6314415905953961, "learning_rate": 1.2859427795763967e-06, "loss": 0.3109, "mean_token_accuracy": 0.8865178329870105, "num_tokens": 277046245.0, "step": 645 }, { "entropy": 0.39410400390625, "epoch": 2.543307086614173, "grad_norm": 0.6215596209130474, "learning_rate": 1.2647090787165694e-06, "loss": 0.3256, "mean_token_accuracy": 0.8811031272634864, "num_tokens": 277475376.0, "step": 646 }, { "entropy": 0.3934326171875, "epoch": 2.547244094488189, "grad_norm": 0.6234194349157417, "learning_rate": 1.2436403079258064e-06, "loss": 0.3094, "mean_token_accuracy": 0.8892135825008154, "num_tokens": 277902729.0, "step": 647 }, { "entropy": 0.39385986328125, "epoch": 2.5511811023622046, "grad_norm": 0.7226064894835298, "learning_rate": 1.2227368650012572e-06, "loss": 0.3279, "mean_token_accuracy": 0.8839946733787656, "num_tokens": 278350458.0, "step": 648 }, { "entropy": 0.394744873046875, "epoch": 2.5551181102362204, "grad_norm": 0.6298237267921505, "learning_rate": 1.201999144618531e-06, "loss": 0.3196, "mean_token_accuracy": 0.8864294402301311, "num_tokens": 278781796.0, "step": 649 }, { "entropy": 0.402099609375, "epoch": 2.559055118110236, "grad_norm": 0.7016612151580089, "learning_rate": 1.1814275383242512e-06, "loss": 0.3317, "mean_token_accuracy": 0.8814294217154384, "num_tokens": 279203350.0, "step": 650 }, { "entropy": 0.396270751953125, "epoch": 2.562992125984252, "grad_norm": 0.6241050703271862, "learning_rate": 1.1610224345286591e-06, "loss": 0.3137, "mean_token_accuracy": 0.8861905531957746, "num_tokens": 279639552.0, "step": 651 }, { "entropy": 0.392730712890625, "epoch": 2.5669291338582676, "grad_norm": 0.659361603190389, "learning_rate": 1.1407842184982786e-06, "loss": 0.3058, "mean_token_accuracy": 0.8890553684905171, "num_tokens": 280082152.0, "step": 652 }, { "entropy": 0.393646240234375, "epoch": 2.5708661417322833, "grad_norm": 0.6373452639560493, "learning_rate": 1.1207132723486457e-06, "loss": 0.3193, "mean_token_accuracy": 0.883937232196331, "num_tokens": 280513039.0, "step": 653 }, { "entropy": 0.395751953125, "epoch": 2.574803149606299, "grad_norm": 0.6515723371089116, "learning_rate": 1.1008099750370916e-06, "loss": 0.3106, "mean_token_accuracy": 0.8856142768636346, "num_tokens": 280939007.0, "step": 654 }, { "entropy": 0.398284912109375, "epoch": 2.578740157480315, "grad_norm": 0.6514931861993788, "learning_rate": 1.0810747023555879e-06, "loss": 0.3011, "mean_token_accuracy": 0.8899843348190188, "num_tokens": 281341467.0, "step": 655 }, { "entropy": 0.401214599609375, "epoch": 2.5826771653543306, "grad_norm": 0.6165563490279274, "learning_rate": 1.0615078269236512e-06, "loss": 0.3268, "mean_token_accuracy": 0.8844519322738051, "num_tokens": 281763144.0, "step": 656 }, { "entropy": 0.3929443359375, "epoch": 2.5866141732283463, "grad_norm": 0.8159003176179213, "learning_rate": 1.04210971818131e-06, "loss": 0.3331, "mean_token_accuracy": 0.8811143329367042, "num_tokens": 282216187.0, "step": 657 }, { "entropy": 0.38885498046875, "epoch": 2.590551181102362, "grad_norm": 0.6293311703276512, "learning_rate": 1.0228807423821262e-06, "loss": 0.3184, "mean_token_accuracy": 0.8853057865053415, "num_tokens": 282677398.0, "step": 658 }, { "entropy": 0.40081787109375, "epoch": 2.594488188976378, "grad_norm": 0.6521783698950797, "learning_rate": 1.0038212625862799e-06, "loss": 0.3185, "mean_token_accuracy": 0.8854817440733314, "num_tokens": 283091191.0, "step": 659 }, { "entropy": 0.3934326171875, "epoch": 2.5984251968503935, "grad_norm": 0.615351062838791, "learning_rate": 9.84931638653719e-07, "loss": 0.3089, "mean_token_accuracy": 0.8892610957846045, "num_tokens": 283521595.0, "step": 660 }, { "entropy": 0.39398193359375, "epoch": 2.6023622047244093, "grad_norm": 0.6227949884838964, "learning_rate": 9.662122272373574e-07, "loss": 0.316, "mean_token_accuracy": 0.8863938516005874, "num_tokens": 283954903.0, "step": 661 }, { "entropy": 0.39398193359375, "epoch": 2.606299212598425, "grad_norm": 0.6245845638981473, "learning_rate": 9.476633817763481e-07, "loss": 0.3312, "mean_token_accuracy": 0.8837857628241181, "num_tokens": 284389477.0, "step": 662 }, { "entropy": 0.4013671875, "epoch": 2.6102362204724407, "grad_norm": 0.6468912877856337, "learning_rate": 9.292854524894068e-07, "loss": 0.3064, "mean_token_accuracy": 0.8889474645256996, "num_tokens": 284806724.0, "step": 663 }, { "entropy": 0.398712158203125, "epoch": 2.6141732283464565, "grad_norm": 0.6124857634152812, "learning_rate": 9.110787863682002e-07, "loss": 0.3194, "mean_token_accuracy": 0.8831356568261981, "num_tokens": 285235987.0, "step": 664 }, { "entropy": 0.393951416015625, "epoch": 2.6181102362204722, "grad_norm": 0.6135551165191099, "learning_rate": 8.930437271707915e-07, "loss": 0.3071, "mean_token_accuracy": 0.8895376035943627, "num_tokens": 285668668.0, "step": 665 }, { "entropy": 0.39849853515625, "epoch": 2.622047244094488, "grad_norm": 0.6279955512566158, "learning_rate": 8.751806154151521e-07, "loss": 0.3096, "mean_token_accuracy": 0.8884470723569393, "num_tokens": 286103078.0, "step": 666 }, { "entropy": 0.394317626953125, "epoch": 2.6259842519685037, "grad_norm": 0.738468612337961, "learning_rate": 8.574897883727384e-07, "loss": 0.3257, "mean_token_accuracy": 0.8832857329398394, "num_tokens": 286553130.0, "step": 667 }, { "entropy": 0.392608642578125, "epoch": 2.6299212598425195, "grad_norm": 1.2334069200957607, "learning_rate": 8.399715800621111e-07, "loss": 0.3111, "mean_token_accuracy": 0.8888679994270205, "num_tokens": 286979768.0, "step": 668 }, { "entropy": 0.386383056640625, "epoch": 2.633858267716535, "grad_norm": 0.6221501280148931, "learning_rate": 8.226263212426389e-07, "loss": 0.315, "mean_token_accuracy": 0.8863368751481175, "num_tokens": 287440144.0, "step": 669 }, { "entropy": 0.396026611328125, "epoch": 2.637795275590551, "grad_norm": 0.6190885324530173, "learning_rate": 8.054543394082503e-07, "loss": 0.3277, "mean_token_accuracy": 0.8823181875050068, "num_tokens": 287886492.0, "step": 670 }, { "entropy": 0.393524169921875, "epoch": 2.6417322834645667, "grad_norm": 0.7064829350170301, "learning_rate": 7.884559587812501e-07, "loss": 0.317, "mean_token_accuracy": 0.8856177758425474, "num_tokens": 288320175.0, "step": 671 }, { "entropy": 0.399383544921875, "epoch": 2.6456692913385824, "grad_norm": 0.7329568438915423, "learning_rate": 7.716315003061948e-07, "loss": 0.3193, "mean_token_accuracy": 0.8855800237506628, "num_tokens": 288747403.0, "step": 672 }, { "entropy": 0.39471435546875, "epoch": 2.649606299212598, "grad_norm": 0.61185048591417, "learning_rate": 7.549812816438395e-07, "loss": 0.3117, "mean_token_accuracy": 0.8875846909359097, "num_tokens": 289190749.0, "step": 673 }, { "entropy": 0.3912353515625, "epoch": 2.653543307086614, "grad_norm": 0.6087484451984008, "learning_rate": 7.38505617165135e-07, "loss": 0.315, "mean_token_accuracy": 0.88607323076576, "num_tokens": 289624045.0, "step": 674 }, { "entropy": 0.387939453125, "epoch": 2.65748031496063, "grad_norm": 0.6318153118085255, "learning_rate": 7.222048179452945e-07, "loss": 0.3175, "mean_token_accuracy": 0.8839446315541863, "num_tokens": 290070600.0, "step": 675 }, { "entropy": 0.390716552734375, "epoch": 2.661417322834646, "grad_norm": 0.6310528037803923, "learning_rate": 7.06079191757918e-07, "loss": 0.316, "mean_token_accuracy": 0.8856059042736888, "num_tokens": 290513687.0, "step": 676 }, { "entropy": 0.395843505859375, "epoch": 2.6653543307086616, "grad_norm": 0.6093719666352239, "learning_rate": 6.901290430691842e-07, "loss": 0.3252, "mean_token_accuracy": 0.8829275881871581, "num_tokens": 290928435.0, "step": 677 }, { "entropy": 0.3968505859375, "epoch": 2.6692913385826773, "grad_norm": 0.6849993862489645, "learning_rate": 6.743546730320993e-07, "loss": 0.3281, "mean_token_accuracy": 0.8848955575376749, "num_tokens": 291364474.0, "step": 678 }, { "entropy": 0.389373779296875, "epoch": 2.673228346456693, "grad_norm": 0.6129024908856187, "learning_rate": 6.587563794808127e-07, "loss": 0.3157, "mean_token_accuracy": 0.8874608399346471, "num_tokens": 291802575.0, "step": 679 }, { "entropy": 0.39959716796875, "epoch": 2.677165354330709, "grad_norm": 0.6279482452089803, "learning_rate": 6.433344569249922e-07, "loss": 0.312, "mean_token_accuracy": 0.8877925118431449, "num_tokens": 292217867.0, "step": 680 }, { "entropy": 0.394195556640625, "epoch": 2.6811023622047245, "grad_norm": 0.618547629928698, "learning_rate": 6.280891965442648e-07, "loss": 0.3039, "mean_token_accuracy": 0.8895618692040443, "num_tokens": 292638162.0, "step": 681 }, { "entropy": 0.392333984375, "epoch": 2.6850393700787403, "grad_norm": 0.609097120858831, "learning_rate": 6.130208861827203e-07, "loss": 0.3213, "mean_token_accuracy": 0.8848212473094463, "num_tokens": 293080208.0, "step": 682 }, { "entropy": 0.3900146484375, "epoch": 2.688976377952756, "grad_norm": 0.6103219460414863, "learning_rate": 5.981298103434696e-07, "loss": 0.314, "mean_token_accuracy": 0.885158559307456, "num_tokens": 293497642.0, "step": 683 }, { "entropy": 0.394195556640625, "epoch": 2.6929133858267718, "grad_norm": 0.6218219688195682, "learning_rate": 5.83416250183283e-07, "loss": 0.3215, "mean_token_accuracy": 0.8855108115822077, "num_tokens": 293933914.0, "step": 684 }, { "entropy": 0.39300537109375, "epoch": 2.6968503937007875, "grad_norm": 0.633185380507804, "learning_rate": 5.688804835072748e-07, "loss": 0.3008, "mean_token_accuracy": 0.8915995480492711, "num_tokens": 294354925.0, "step": 685 }, { "entropy": 0.39642333984375, "epoch": 2.7007874015748032, "grad_norm": 0.6221615782319272, "learning_rate": 5.545227847636602e-07, "loss": 0.3257, "mean_token_accuracy": 0.8829503497108817, "num_tokens": 294776144.0, "step": 686 }, { "entropy": 0.387420654296875, "epoch": 2.704724409448819, "grad_norm": 0.7005879338266516, "learning_rate": 5.40343425038573e-07, "loss": 0.3199, "mean_token_accuracy": 0.886635722592473, "num_tokens": 295217208.0, "step": 687 }, { "entropy": 0.39031982421875, "epoch": 2.7086614173228347, "grad_norm": 0.5926582054950879, "learning_rate": 5.263426720509469e-07, "loss": 0.3178, "mean_token_accuracy": 0.8865943877026439, "num_tokens": 295677828.0, "step": 688 }, { "entropy": 0.3997802734375, "epoch": 2.7125984251968505, "grad_norm": 0.6117770278180463, "learning_rate": 5.125207901474638e-07, "loss": 0.3104, "mean_token_accuracy": 0.8866582782939076, "num_tokens": 296105909.0, "step": 689 }, { "entropy": 0.392242431640625, "epoch": 2.716535433070866, "grad_norm": 0.6130135566792134, "learning_rate": 4.98878040297559e-07, "loss": 0.3213, "mean_token_accuracy": 0.883451035246253, "num_tokens": 296530077.0, "step": 690 }, { "entropy": 0.39068603515625, "epoch": 2.720472440944882, "grad_norm": 0.6247127760703306, "learning_rate": 4.854146800884929e-07, "loss": 0.3219, "mean_token_accuracy": 0.8842593487352133, "num_tokens": 296965385.0, "step": 691 }, { "entropy": 0.4078369140625, "epoch": 2.7244094488188977, "grad_norm": 0.6192979804386426, "learning_rate": 4.7213096372049404e-07, "loss": 0.3173, "mean_token_accuracy": 0.8843513103201985, "num_tokens": 297381008.0, "step": 692 }, { "entropy": 0.402801513671875, "epoch": 2.7283464566929134, "grad_norm": 0.6493879750559709, "learning_rate": 4.59027142001951e-07, "loss": 0.2997, "mean_token_accuracy": 0.8891726117581129, "num_tokens": 297761093.0, "step": 693 }, { "entropy": 0.3902587890625, "epoch": 2.732283464566929, "grad_norm": 0.5908426802287144, "learning_rate": 4.461034623446847e-07, "loss": 0.3104, "mean_token_accuracy": 0.8874157816171646, "num_tokens": 298211265.0, "step": 694 }, { "entropy": 0.398223876953125, "epoch": 2.736220472440945, "grad_norm": 0.6216414668892356, "learning_rate": 4.333601687592714e-07, "loss": 0.3073, "mean_token_accuracy": 0.8897824250161648, "num_tokens": 298633043.0, "step": 695 }, { "entropy": 0.391265869140625, "epoch": 2.7401574803149606, "grad_norm": 0.6174935182318937, "learning_rate": 4.2079750185043955e-07, "loss": 0.3194, "mean_token_accuracy": 0.8854984659701586, "num_tokens": 299078818.0, "step": 696 }, { "entropy": 0.39642333984375, "epoch": 2.7440944881889764, "grad_norm": 0.617257000557681, "learning_rate": 4.084156988125232e-07, "loss": 0.3398, "mean_token_accuracy": 0.8800790719687939, "num_tokens": 299506515.0, "step": 697 }, { "entropy": 0.39508056640625, "epoch": 2.748031496062992, "grad_norm": 0.6355414363515316, "learning_rate": 3.9621499342498706e-07, "loss": 0.2952, "mean_token_accuracy": 0.8933856235817075, "num_tokens": 299906938.0, "step": 698 }, { "entropy": 0.402252197265625, "epoch": 2.751968503937008, "grad_norm": 0.6480768361433473, "learning_rate": 3.841956160480098e-07, "loss": 0.3253, "mean_token_accuracy": 0.8832006398588419, "num_tokens": 300309118.0, "step": 699 }, { "entropy": 0.3931884765625, "epoch": 2.7559055118110236, "grad_norm": 0.6200677492498358, "learning_rate": 3.723577936181366e-07, "loss": 0.3143, "mean_token_accuracy": 0.8876901566982269, "num_tokens": 300730067.0, "step": 700 }, { "entropy": 0.39215087890625, "epoch": 2.7598425196850394, "grad_norm": 0.6282804561575106, "learning_rate": 3.607017496439935e-07, "loss": 0.3063, "mean_token_accuracy": 0.8888409864157438, "num_tokens": 301152430.0, "step": 701 }, { "entropy": 0.39166259765625, "epoch": 2.763779527559055, "grad_norm": 0.6167008846266515, "learning_rate": 3.4922770420206754e-07, "loss": 0.3138, "mean_token_accuracy": 0.887148299254477, "num_tokens": 301597234.0, "step": 702 }, { "entropy": 0.387939453125, "epoch": 2.767716535433071, "grad_norm": 0.6274220462057958, "learning_rate": 3.3793587393255e-07, "loss": 0.3082, "mean_token_accuracy": 0.889225204475224, "num_tokens": 302034846.0, "step": 703 }, { "entropy": 0.397613525390625, "epoch": 2.7716535433070866, "grad_norm": 0.6217284823956639, "learning_rate": 3.2682647203525095e-07, "loss": 0.3091, "mean_token_accuracy": 0.8873699698597193, "num_tokens": 302468902.0, "step": 704 }, { "entropy": 0.391632080078125, "epoch": 2.7755905511811023, "grad_norm": 0.6012726111153172, "learning_rate": 3.158997082655668e-07, "loss": 0.3058, "mean_token_accuracy": 0.8894424652680755, "num_tokens": 302896329.0, "step": 705 }, { "entropy": 0.389434814453125, "epoch": 2.779527559055118, "grad_norm": 0.6234688848554906, "learning_rate": 3.0515578893052343e-07, "loss": 0.318, "mean_token_accuracy": 0.8851411901414394, "num_tokens": 303323908.0, "step": 706 }, { "entropy": 0.397430419921875, "epoch": 2.783464566929134, "grad_norm": 0.6061624445272218, "learning_rate": 2.9459491688488604e-07, "loss": 0.3036, "mean_token_accuracy": 0.888351739384234, "num_tokens": 303745436.0, "step": 707 }, { "entropy": 0.399261474609375, "epoch": 2.7874015748031495, "grad_norm": 0.7479126909396716, "learning_rate": 2.8421729152731783e-07, "loss": 0.3268, "mean_token_accuracy": 0.883736445568502, "num_tokens": 304165087.0, "step": 708 }, { "entropy": 0.390350341796875, "epoch": 2.7913385826771653, "grad_norm": 0.6211293030549162, "learning_rate": 2.7402310879662497e-07, "loss": 0.3108, "mean_token_accuracy": 0.8857911806553602, "num_tokens": 304597934.0, "step": 709 }, { "entropy": 0.395965576171875, "epoch": 2.795275590551181, "grad_norm": 0.6111833255354941, "learning_rate": 2.640125611680522e-07, "loss": 0.307, "mean_token_accuracy": 0.8857949497178197, "num_tokens": 305018870.0, "step": 710 }, { "entropy": 0.394439697265625, "epoch": 2.7992125984251968, "grad_norm": 0.6245889179074243, "learning_rate": 2.54185837649652e-07, "loss": 0.3146, "mean_token_accuracy": 0.8863369012251496, "num_tokens": 305445367.0, "step": 711 }, { "entropy": 0.38995361328125, "epoch": 2.8031496062992125, "grad_norm": 0.6361336623622774, "learning_rate": 2.4454312377871105e-07, "loss": 0.3054, "mean_token_accuracy": 0.8899529185146093, "num_tokens": 305863242.0, "step": 712 }, { "entropy": 0.393829345703125, "epoch": 2.8070866141732282, "grad_norm": 0.628109002053711, "learning_rate": 2.3508460161825176e-07, "loss": 0.3193, "mean_token_accuracy": 0.8849193248897791, "num_tokens": 306299748.0, "step": 713 }, { "entropy": 0.393280029296875, "epoch": 2.811023622047244, "grad_norm": 0.6279036033740151, "learning_rate": 2.25810449753594e-07, "loss": 0.314, "mean_token_accuracy": 0.8862151158973575, "num_tokens": 306721284.0, "step": 714 }, { "entropy": 0.3931884765625, "epoch": 2.8149606299212597, "grad_norm": 0.6192907683317119, "learning_rate": 2.167208432889789e-07, "loss": 0.3112, "mean_token_accuracy": 0.886825337074697, "num_tokens": 307148450.0, "step": 715 }, { "entropy": 0.3929443359375, "epoch": 2.8188976377952755, "grad_norm": 0.6423049938131764, "learning_rate": 2.0781595384427032e-07, "loss": 0.3213, "mean_token_accuracy": 0.8844384793192148, "num_tokens": 307575103.0, "step": 716 }, { "entropy": 0.388519287109375, "epoch": 2.822834645669291, "grad_norm": 0.6895754865141671, "learning_rate": 1.9909594955170752e-07, "loss": 0.3166, "mean_token_accuracy": 0.8848179634660482, "num_tokens": 308030230.0, "step": 717 }, { "entropy": 0.39794921875, "epoch": 2.826771653543307, "grad_norm": 0.6307804441913263, "learning_rate": 1.9056099505273428e-07, "loss": 0.3054, "mean_token_accuracy": 0.8902622666209936, "num_tokens": 308450663.0, "step": 718 }, { "entropy": 0.389678955078125, "epoch": 2.8307086614173227, "grad_norm": 0.6146486060327043, "learning_rate": 1.8221125149489038e-07, "loss": 0.3064, "mean_token_accuracy": 0.8884834293276072, "num_tokens": 308894935.0, "step": 719 }, { "entropy": 0.390594482421875, "epoch": 2.8346456692913384, "grad_norm": 0.6205281415006133, "learning_rate": 1.7404687652876728e-07, "loss": 0.3069, "mean_token_accuracy": 0.8885259497910738, "num_tokens": 309328947.0, "step": 720 }, { "entropy": 0.38934326171875, "epoch": 2.838582677165354, "grad_norm": 0.6507938234415173, "learning_rate": 1.6606802430503166e-07, "loss": 0.3189, "mean_token_accuracy": 0.8856641910970211, "num_tokens": 309766264.0, "step": 721 }, { "entropy": 0.390716552734375, "epoch": 2.84251968503937, "grad_norm": 0.6272447591973911, "learning_rate": 1.5827484547151772e-07, "loss": 0.3177, "mean_token_accuracy": 0.8859880482777953, "num_tokens": 310197799.0, "step": 722 }, { "entropy": 0.390716552734375, "epoch": 2.846456692913386, "grad_norm": 0.6513111313353903, "learning_rate": 1.506674871703795e-07, "loss": 0.3109, "mean_token_accuracy": 0.8863828666508198, "num_tokens": 310620770.0, "step": 723 }, { "entropy": 0.393646240234375, "epoch": 2.850393700787402, "grad_norm": 0.6625745601868105, "learning_rate": 1.43246093035313e-07, "loss": 0.3184, "mean_token_accuracy": 0.8855500891804695, "num_tokens": 311036524.0, "step": 724 }, { "entropy": 0.396697998046875, "epoch": 2.8543307086614176, "grad_norm": 0.6566020287580173, "learning_rate": 1.360108031888474e-07, "loss": 0.3299, "mean_token_accuracy": 0.8796233041211963, "num_tokens": 311469275.0, "step": 725 }, { "entropy": 0.3905029296875, "epoch": 2.8582677165354333, "grad_norm": 0.6006475257455051, "learning_rate": 1.2896175423969592e-07, "loss": 0.3115, "mean_token_accuracy": 0.8865141673013568, "num_tokens": 311898980.0, "step": 726 }, { "entropy": 0.39263916015625, "epoch": 2.862204724409449, "grad_norm": 0.6191607551639687, "learning_rate": 1.2209907928017794e-07, "loss": 0.3289, "mean_token_accuracy": 0.8829834200441837, "num_tokens": 312348478.0, "step": 727 }, { "entropy": 0.38818359375, "epoch": 2.866141732283465, "grad_norm": 0.6791667510376468, "learning_rate": 1.1542290788370547e-07, "loss": 0.327, "mean_token_accuracy": 0.8845472950488329, "num_tokens": 312779963.0, "step": 728 }, { "entropy": 0.39227294921875, "epoch": 2.8700787401574805, "grad_norm": 0.6297603203621586, "learning_rate": 1.089333661023373e-07, "loss": 0.3175, "mean_token_accuracy": 0.8854846712201834, "num_tokens": 313230002.0, "step": 729 }, { "entropy": 0.393951416015625, "epoch": 2.8740157480314963, "grad_norm": 0.6307950928733317, "learning_rate": 1.02630576464402e-07, "loss": 0.3097, "mean_token_accuracy": 0.8888869564980268, "num_tokens": 313655734.0, "step": 730 }, { "entropy": 0.4013671875, "epoch": 2.877952755905512, "grad_norm": 0.6300374206111313, "learning_rate": 9.651465797217652e-08, "loss": 0.3016, "mean_token_accuracy": 0.8900213818997145, "num_tokens": 314053306.0, "step": 731 }, { "entropy": 0.397491455078125, "epoch": 2.8818897637795278, "grad_norm": 0.6472776766881085, "learning_rate": 9.058572609964788e-08, "loss": 0.3188, "mean_token_accuracy": 0.8872040212154388, "num_tokens": 314496320.0, "step": 732 }, { "entropy": 0.39404296875, "epoch": 2.8858267716535435, "grad_norm": 0.6313115545178322, "learning_rate": 8.484389279032835e-08, "loss": 0.3238, "mean_token_accuracy": 0.88473956938833, "num_tokens": 314930406.0, "step": 733 }, { "entropy": 0.388824462890625, "epoch": 2.8897637795275593, "grad_norm": 0.6382768601008938, "learning_rate": 7.928926645514034e-08, "loss": 0.3052, "mean_token_accuracy": 0.8889888888224959, "num_tokens": 315367644.0, "step": 734 }, { "entropy": 0.3909912109375, "epoch": 2.893700787401575, "grad_norm": 0.6284317329218896, "learning_rate": 7.39219519703771e-08, "loss": 0.3191, "mean_token_accuracy": 0.8852613195776939, "num_tokens": 315807014.0, "step": 735 }, { "entropy": 0.39532470703125, "epoch": 2.8976377952755907, "grad_norm": 0.6357283678636514, "learning_rate": 6.874205067571082e-08, "loss": 0.3036, "mean_token_accuracy": 0.8906446024775505, "num_tokens": 316233269.0, "step": 736 }, { "entropy": 0.389434814453125, "epoch": 2.9015748031496065, "grad_norm": 0.6074500598261228, "learning_rate": 6.374966037229202e-08, "loss": 0.3162, "mean_token_accuracy": 0.8877827478572726, "num_tokens": 316683927.0, "step": 737 }, { "entropy": 0.39202880859375, "epoch": 2.905511811023622, "grad_norm": 0.6689918803322398, "learning_rate": 5.894487532089321e-08, "loss": 0.3108, "mean_token_accuracy": 0.8866195920854807, "num_tokens": 317113161.0, "step": 738 }, { "entropy": 0.396728515625, "epoch": 2.909448818897638, "grad_norm": 0.6665066258545567, "learning_rate": 5.4327786240132576e-08, "loss": 0.3087, "mean_token_accuracy": 0.8856972297653556, "num_tokens": 317534548.0, "step": 739 }, { "entropy": 0.3934326171875, "epoch": 2.9133858267716537, "grad_norm": 0.6254976882222406, "learning_rate": 4.989848030476307e-08, "loss": 0.3152, "mean_token_accuracy": 0.888129792176187, "num_tokens": 317967477.0, "step": 740 }, { "entropy": 0.39166259765625, "epoch": 2.9173228346456694, "grad_norm": 0.6188976112537805, "learning_rate": 4.5657041144023804e-08, "loss": 0.3131, "mean_token_accuracy": 0.8876488180831075, "num_tokens": 318401623.0, "step": 741 }, { "entropy": 0.3941650390625, "epoch": 2.921259842519685, "grad_norm": 0.6224520183112306, "learning_rate": 4.1603548840062344e-08, "loss": 0.3144, "mean_token_accuracy": 0.8860314814373851, "num_tokens": 318829485.0, "step": 742 }, { "entropy": 0.389404296875, "epoch": 2.925196850393701, "grad_norm": 0.6621931157658465, "learning_rate": 3.773807992642153e-08, "loss": 0.3119, "mean_token_accuracy": 0.8872817764058709, "num_tokens": 319262158.0, "step": 743 }, { "entropy": 0.3955078125, "epoch": 2.9291338582677167, "grad_norm": 0.6448239265983623, "learning_rate": 3.406070738659617e-08, "loss": 0.3119, "mean_token_accuracy": 0.8864805707708001, "num_tokens": 319687504.0, "step": 744 }, { "entropy": 0.388275146484375, "epoch": 2.9330708661417324, "grad_norm": 0.6124809281758588, "learning_rate": 3.0571500652651906e-08, "loss": 0.3146, "mean_token_accuracy": 0.8860159404575825, "num_tokens": 320119340.0, "step": 745 }, { "entropy": 0.390960693359375, "epoch": 2.937007874015748, "grad_norm": 0.6988960170224446, "learning_rate": 2.7270525603920738e-08, "loss": 0.3181, "mean_token_accuracy": 0.8859586268663406, "num_tokens": 320552288.0, "step": 746 }, { "entropy": 0.395050048828125, "epoch": 2.940944881889764, "grad_norm": 1.1277672104662997, "learning_rate": 2.4157844565747546e-08, "loss": 0.3105, "mean_token_accuracy": 0.8864605631679296, "num_tokens": 320970311.0, "step": 747 }, { "entropy": 0.393951416015625, "epoch": 2.9448818897637796, "grad_norm": 0.606549790004319, "learning_rate": 2.1233516308323266e-08, "loss": 0.3196, "mean_token_accuracy": 0.8869588067755103, "num_tokens": 321394036.0, "step": 748 }, { "entropy": 0.38861083984375, "epoch": 2.9488188976377954, "grad_norm": 0.610184978078755, "learning_rate": 1.8497596045568002e-08, "loss": 0.3108, "mean_token_accuracy": 0.888460848480463, "num_tokens": 321823512.0, "step": 749 }, { "entropy": 0.39996337890625, "epoch": 2.952755905511811, "grad_norm": 0.6265535840932653, "learning_rate": 1.5950135434091853e-08, "loss": 0.3102, "mean_token_accuracy": 0.8862089244648814, "num_tokens": 322231250.0, "step": 750 }, { "entropy": 0.396820068359375, "epoch": 2.956692913385827, "grad_norm": 0.6406905383870145, "learning_rate": 1.3591182572219031e-08, "loss": 0.3111, "mean_token_accuracy": 0.8872844418510795, "num_tokens": 322651161.0, "step": 751 }, { "entropy": 0.39044189453125, "epoch": 2.9606299212598426, "grad_norm": 0.5965161904330141, "learning_rate": 1.14207819990797e-08, "loss": 0.3153, "mean_token_accuracy": 0.8871821071952581, "num_tokens": 323098909.0, "step": 752 }, { "entropy": 0.393218994140625, "epoch": 2.9645669291338583, "grad_norm": 0.6127435054185151, "learning_rate": 9.438974693768421e-09, "loss": 0.314, "mean_token_accuracy": 0.8866363558918238, "num_tokens": 323538525.0, "step": 753 }, { "entropy": 0.3946533203125, "epoch": 2.968503937007874, "grad_norm": 0.6146105288887688, "learning_rate": 7.645798074572552e-09, "loss": 0.3038, "mean_token_accuracy": 0.8898013606667519, "num_tokens": 323972527.0, "step": 754 }, { "entropy": 0.39141845703125, "epoch": 2.97244094488189, "grad_norm": 0.6236575631946764, "learning_rate": 6.0412859982628135e-09, "loss": 0.3336, "mean_token_accuracy": 0.8825578549876809, "num_tokens": 324414207.0, "step": 755 }, { "entropy": 0.38677978515625, "epoch": 2.9763779527559056, "grad_norm": 0.6194778300675513, "learning_rate": 4.6254687594538e-09, "loss": 0.3132, "mean_token_accuracy": 0.8866851180791855, "num_tokens": 324850535.0, "step": 756 }, { "entropy": 0.389617919921875, "epoch": 2.9803149606299213, "grad_norm": 0.622216336551404, "learning_rate": 3.3983730900377654e-09, "loss": 0.3062, "mean_token_accuracy": 0.8894290942698717, "num_tokens": 325286762.0, "step": 757 }, { "entropy": 0.389984130859375, "epoch": 2.984251968503937, "grad_norm": 0.5998108891950104, "learning_rate": 2.3600221586717043e-09, "loss": 0.3134, "mean_token_accuracy": 0.8875658120959997, "num_tokens": 325712765.0, "step": 758 }, { "entropy": 0.38818359375, "epoch": 2.9881889763779528, "grad_norm": 0.6160587396973891, "learning_rate": 1.5104355703465801e-09, "loss": 0.3021, "mean_token_accuracy": 0.8908078372478485, "num_tokens": 326158601.0, "step": 759 }, { "entropy": 0.3980712890625, "epoch": 2.9921259842519685, "grad_norm": 0.631334091047954, "learning_rate": 8.496293660120725e-10, "loss": 0.309, "mean_token_accuracy": 0.8900680867955089, "num_tokens": 326581952.0, "step": 760 }, { "entropy": 0.393585205078125, "epoch": 2.9960629921259843, "grad_norm": 0.6025923101996601, "learning_rate": 3.7761602227903705e-10, "loss": 0.3141, "mean_token_accuracy": 0.885806068778038, "num_tokens": 327014051.0, "step": 761 }, { "entropy": 0.39324951171875, "epoch": 3.0, "grad_norm": 0.646299642666546, "learning_rate": 9.44044511796971e-11, "loss": 0.3105, "mean_token_accuracy": 0.8872898044064641, "num_tokens": 327455853.0, "step": 762 }, { "epoch": 3.0, "step": 762, "total_flos": 605566838833152.0, "train_loss": 0.4404701322238902, "train_runtime": 58825.4442, "train_samples_per_second": 1.255, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 762, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 64, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 605566838833152.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }