Files
PS_only_answer_Qwen3-4B-Bas…/trainer_state.json

5564 lines
156 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 552,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.4132080078125,
"epoch": 0.010869565217391304,
"grad_norm": 311.4203607984754,
"learning_rate": 0.0,
"loss": 8.9732,
"mean_token_accuracy": 0.0026041667442768812,
"num_tokens": 736049.0,
"step": 1
},
{
"entropy": 0.3986663818359375,
"epoch": 0.021739130434782608,
"grad_norm": 320.59921807794416,
"learning_rate": 3.5714285714285716e-07,
"loss": 8.9248,
"mean_token_accuracy": 0.0,
"num_tokens": 1463777.0,
"step": 2
},
{
"entropy": 0.4055328369140625,
"epoch": 0.03260869565217391,
"grad_norm": 307.5231697291134,
"learning_rate": 7.142857142857143e-07,
"loss": 8.9711,
"mean_token_accuracy": 0.0,
"num_tokens": 2219300.0,
"step": 3
},
{
"entropy": 0.3943328857421875,
"epoch": 0.043478260869565216,
"grad_norm": 323.32099848994,
"learning_rate": 1.0714285714285714e-06,
"loss": 8.7592,
"mean_token_accuracy": 0.0,
"num_tokens": 2979743.0,
"step": 4
},
{
"entropy": 0.420135498046875,
"epoch": 0.05434782608695652,
"grad_norm": 340.3141229539087,
"learning_rate": 1.4285714285714286e-06,
"loss": 8.2909,
"mean_token_accuracy": 0.0,
"num_tokens": 3706150.0,
"step": 5
},
{
"entropy": 0.408721923828125,
"epoch": 0.06521739130434782,
"grad_norm": 344.270504919432,
"learning_rate": 1.7857142857142859e-06,
"loss": 8.0728,
"mean_token_accuracy": 0.0,
"num_tokens": 4445188.0,
"step": 6
},
{
"entropy": 0.39560699462890625,
"epoch": 0.07608695652173914,
"grad_norm": 302.2444503869232,
"learning_rate": 2.1428571428571427e-06,
"loss": 6.2297,
"mean_token_accuracy": 0.031250000931322575,
"num_tokens": 5187957.0,
"step": 7
},
{
"entropy": 0.4117279052734375,
"epoch": 0.08695652173913043,
"grad_norm": 219.04723364481447,
"learning_rate": 2.5e-06,
"loss": 5.4674,
"mean_token_accuracy": 0.08072916907258332,
"num_tokens": 5921002.0,
"step": 8
},
{
"entropy": 0.417694091796875,
"epoch": 0.09782608695652174,
"grad_norm": 98.1081259847685,
"learning_rate": 2.8571428571428573e-06,
"loss": 4.2197,
"mean_token_accuracy": 0.4973958481568843,
"num_tokens": 6652142.0,
"step": 9
},
{
"entropy": 0.40997314453125,
"epoch": 0.10869565217391304,
"grad_norm": 86.7533080434819,
"learning_rate": 3.2142857142857147e-06,
"loss": 4.0085,
"mean_token_accuracy": 0.49479168141260743,
"num_tokens": 7393898.0,
"step": 10
},
{
"entropy": 0.4106292724609375,
"epoch": 0.11956521739130435,
"grad_norm": 59.58522339179672,
"learning_rate": 3.5714285714285718e-06,
"loss": 3.3894,
"mean_token_accuracy": 0.505208348389715,
"num_tokens": 8102676.0,
"step": 11
},
{
"entropy": 0.40155029296875,
"epoch": 0.13043478260869565,
"grad_norm": 58.92890419627513,
"learning_rate": 3.928571428571429e-06,
"loss": 3.3107,
"mean_token_accuracy": 0.4843750144354999,
"num_tokens": 8809290.0,
"step": 12
},
{
"entropy": 0.393829345703125,
"epoch": 0.14130434782608695,
"grad_norm": 57.409462446242955,
"learning_rate": 4.2857142857142855e-06,
"loss": 3.2212,
"mean_token_accuracy": 0.5156250153668225,
"num_tokens": 9557288.0,
"step": 13
},
{
"entropy": 0.3956146240234375,
"epoch": 0.15217391304347827,
"grad_norm": 56.488189115901584,
"learning_rate": 4.642857142857144e-06,
"loss": 3.1327,
"mean_token_accuracy": 0.505208348389715,
"num_tokens": 10319598.0,
"step": 14
},
{
"entropy": 0.4060211181640625,
"epoch": 0.16304347826086957,
"grad_norm": 60.282711315492946,
"learning_rate": 5e-06,
"loss": 3.0223,
"mean_token_accuracy": 0.5026041816454381,
"num_tokens": 11024225.0,
"step": 15
},
{
"entropy": 0.3878936767578125,
"epoch": 0.17391304347826086,
"grad_norm": 55.83706199973999,
"learning_rate": 5.357142857142857e-06,
"loss": 2.9584,
"mean_token_accuracy": 0.52604168234393,
"num_tokens": 11782202.0,
"step": 16
},
{
"entropy": 0.4119720458984375,
"epoch": 0.18478260869565216,
"grad_norm": 56.515615405805015,
"learning_rate": 5.7142857142857145e-06,
"loss": 2.9469,
"mean_token_accuracy": 0.5026041816454381,
"num_tokens": 12501745.0,
"step": 17
},
{
"entropy": 0.4099273681640625,
"epoch": 0.1956521739130435,
"grad_norm": 56.196543894293534,
"learning_rate": 6.071428571428571e-06,
"loss": 2.9031,
"mean_token_accuracy": 0.52604168234393,
"num_tokens": 13238411.0,
"step": 18
},
{
"entropy": 0.3933258056640625,
"epoch": 0.20652173913043478,
"grad_norm": 55.19469257080713,
"learning_rate": 6.4285714285714295e-06,
"loss": 2.8638,
"mean_token_accuracy": 0.5546875165309757,
"num_tokens": 13976737.0,
"step": 19
},
{
"entropy": 0.4046630859375,
"epoch": 0.21739130434782608,
"grad_norm": 55.62375832255656,
"learning_rate": 6.785714285714287e-06,
"loss": 2.8226,
"mean_token_accuracy": 0.52604168234393,
"num_tokens": 14713129.0,
"step": 20
},
{
"entropy": 0.4174346923828125,
"epoch": 0.22826086956521738,
"grad_norm": 56.0776537005582,
"learning_rate": 7.1428571428571436e-06,
"loss": 2.7738,
"mean_token_accuracy": 0.5390625160653144,
"num_tokens": 15414280.0,
"step": 21
},
{
"entropy": 0.3958892822265625,
"epoch": 0.2391304347826087,
"grad_norm": 55.84684848536741,
"learning_rate": 7.500000000000001e-06,
"loss": 2.7211,
"mean_token_accuracy": 0.5598958500195295,
"num_tokens": 16146409.0,
"step": 22
},
{
"entropy": 0.394927978515625,
"epoch": 0.25,
"grad_norm": 56.70304819091595,
"learning_rate": 7.857142857142858e-06,
"loss": 2.6884,
"mean_token_accuracy": 0.5651041835080832,
"num_tokens": 16876424.0,
"step": 23
},
{
"entropy": 0.3933563232421875,
"epoch": 0.2608695652173913,
"grad_norm": 59.405424994443024,
"learning_rate": 8.214285714285714e-06,
"loss": 2.686,
"mean_token_accuracy": 0.5572916832752526,
"num_tokens": 17607756.0,
"step": 24
},
{
"entropy": 0.391387939453125,
"epoch": 0.2717391304347826,
"grad_norm": 57.418991339188196,
"learning_rate": 8.571428571428571e-06,
"loss": 2.6324,
"mean_token_accuracy": 0.5312500158324838,
"num_tokens": 18339588.0,
"step": 25
},
{
"entropy": 0.4117889404296875,
"epoch": 0.2826086956521739,
"grad_norm": 57.04690525036641,
"learning_rate": 8.92857142857143e-06,
"loss": 2.5908,
"mean_token_accuracy": 0.5781250172294676,
"num_tokens": 19052228.0,
"step": 26
},
{
"entropy": 0.413299560546875,
"epoch": 0.29347826086956524,
"grad_norm": 57.87482177137187,
"learning_rate": 9.285714285714288e-06,
"loss": 2.5745,
"mean_token_accuracy": 0.5546875165309757,
"num_tokens": 19761480.0,
"step": 27
},
{
"entropy": 0.3918609619140625,
"epoch": 0.30434782608695654,
"grad_norm": 57.6709336727282,
"learning_rate": 9.642857142857144e-06,
"loss": 2.5131,
"mean_token_accuracy": 0.5729166837409139,
"num_tokens": 20518198.0,
"step": 28
},
{
"entropy": 0.3912506103515625,
"epoch": 0.31521739130434784,
"grad_norm": 57.59250346538701,
"learning_rate": 1e-05,
"loss": 2.4797,
"mean_token_accuracy": 0.5390625160653144,
"num_tokens": 21233376.0,
"step": 29
},
{
"entropy": 0.3871612548828125,
"epoch": 0.32608695652173914,
"grad_norm": 57.36161657267134,
"learning_rate": 9.999910138041584e-06,
"loss": 2.4115,
"mean_token_accuracy": 0.5885416842065752,
"num_tokens": 21965608.0,
"step": 30
},
{
"entropy": 0.3805999755859375,
"epoch": 0.33695652173913043,
"grad_norm": 57.50995417467593,
"learning_rate": 9.999640555396404e-06,
"loss": 2.3874,
"mean_token_accuracy": 0.5598958500195295,
"num_tokens": 22688324.0,
"step": 31
},
{
"entropy": 0.3845367431640625,
"epoch": 0.34782608695652173,
"grad_norm": 58.39720778593916,
"learning_rate": 9.99919126175455e-06,
"loss": 2.3259,
"mean_token_accuracy": 0.5885416832752526,
"num_tokens": 23409211.0,
"step": 32
},
{
"entropy": 0.387359619140625,
"epoch": 0.358695652173913,
"grad_norm": 58.26716191906752,
"learning_rate": 9.998562273265786e-06,
"loss": 2.2932,
"mean_token_accuracy": 0.8880208395421505,
"num_tokens": 24134105.0,
"step": 33
},
{
"entropy": 0.4016571044921875,
"epoch": 0.3695652173913043,
"grad_norm": 58.24491582430228,
"learning_rate": 9.997753612538963e-06,
"loss": 2.2414,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 24858194.0,
"step": 34
},
{
"entropy": 0.3777618408203125,
"epoch": 0.3804347826086957,
"grad_norm": 58.651315915936905,
"learning_rate": 9.996765308641218e-06,
"loss": 2.1791,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 25608809.0,
"step": 35
},
{
"entropy": 0.390777587890625,
"epoch": 0.391304347826087,
"grad_norm": 59.35487193293602,
"learning_rate": 9.995597397096923e-06,
"loss": 2.1431,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 26351472.0,
"step": 36
},
{
"entropy": 0.410919189453125,
"epoch": 0.40217391304347827,
"grad_norm": 59.07734759750127,
"learning_rate": 9.994249919886402e-06,
"loss": 2.1013,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 27081828.0,
"step": 37
},
{
"entropy": 0.38934326171875,
"epoch": 0.41304347826086957,
"grad_norm": 59.15541447484555,
"learning_rate": 9.992722925444434e-06,
"loss": 2.0633,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 27798222.0,
"step": 38
},
{
"entropy": 0.3936767578125,
"epoch": 0.42391304347826086,
"grad_norm": 58.8800906283516,
"learning_rate": 9.9910164686585e-06,
"loss": 2.0148,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 28537568.0,
"step": 39
},
{
"entropy": 0.3974609375,
"epoch": 0.43478260869565216,
"grad_norm": 59.1178475676971,
"learning_rate": 9.989130610866822e-06,
"loss": 1.9566,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 29282405.0,
"step": 40
},
{
"entropy": 0.39190673828125,
"epoch": 0.44565217391304346,
"grad_norm": 58.455690329387025,
"learning_rate": 9.98706541985615e-06,
"loss": 1.8993,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 30041949.0,
"step": 41
},
{
"entropy": 0.38946533203125,
"epoch": 0.45652173913043476,
"grad_norm": 58.21255505443314,
"learning_rate": 9.984820969859326e-06,
"loss": 1.8697,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 30784326.0,
"step": 42
},
{
"entropy": 0.38653564453125,
"epoch": 0.4673913043478261,
"grad_norm": 61.152558796905076,
"learning_rate": 9.98239734155262e-06,
"loss": 1.8516,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 31494883.0,
"step": 43
},
{
"entropy": 0.3789520263671875,
"epoch": 0.4782608695652174,
"grad_norm": 64.01566395627057,
"learning_rate": 9.979794622052825e-06,
"loss": 1.8419,
"mean_token_accuracy": 0.8671875079162419,
"num_tokens": 32249196.0,
"step": 44
},
{
"entropy": 0.4061431884765625,
"epoch": 0.4891304347826087,
"grad_norm": 58.01942955061165,
"learning_rate": 9.977012904914133e-06,
"loss": 1.7189,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 32943506.0,
"step": 45
},
{
"entropy": 0.3825225830078125,
"epoch": 0.5,
"grad_norm": 57.61228110390323,
"learning_rate": 9.97405229012476e-06,
"loss": 1.6684,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 33693820.0,
"step": 46
},
{
"entropy": 0.395050048828125,
"epoch": 0.5108695652173914,
"grad_norm": 57.71747140390215,
"learning_rate": 9.970912884103365e-06,
"loss": 1.6254,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 34423797.0,
"step": 47
},
{
"entropy": 0.406280517578125,
"epoch": 0.5217391304347826,
"grad_norm": 57.7963582112258,
"learning_rate": 9.967594799695218e-06,
"loss": 1.5949,
"mean_token_accuracy": 0.890625006519258,
"num_tokens": 35134310.0,
"step": 48
},
{
"entropy": 0.4052734375,
"epoch": 0.532608695652174,
"grad_norm": 56.73762320496077,
"learning_rate": 9.964098156168143e-06,
"loss": 1.5258,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 35851216.0,
"step": 49
},
{
"entropy": 0.382781982421875,
"epoch": 0.5434782608695652,
"grad_norm": 56.57460935743424,
"learning_rate": 9.960423079208235e-06,
"loss": 1.461,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 36622748.0,
"step": 50
},
{
"entropy": 0.4014434814453125,
"epoch": 0.5543478260869565,
"grad_norm": 56.449396812129784,
"learning_rate": 9.956569700915338e-06,
"loss": 1.4112,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 37341849.0,
"step": 51
},
{
"entropy": 0.38623046875,
"epoch": 0.5652173913043478,
"grad_norm": 57.34475610674306,
"learning_rate": 9.9525381597983e-06,
"loss": 1.3685,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 38078370.0,
"step": 52
},
{
"entropy": 0.403656005859375,
"epoch": 0.5760869565217391,
"grad_norm": 56.628062051209035,
"learning_rate": 9.948328600769996e-06,
"loss": 1.3272,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 38799653.0,
"step": 53
},
{
"entropy": 0.3992919921875,
"epoch": 0.5869565217391305,
"grad_norm": 56.43643883036958,
"learning_rate": 9.943941175142109e-06,
"loss": 1.2695,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 39508420.0,
"step": 54
},
{
"entropy": 0.3996429443359375,
"epoch": 0.5978260869565217,
"grad_norm": 56.337771571623705,
"learning_rate": 9.939376040619707e-06,
"loss": 1.2108,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 40235911.0,
"step": 55
},
{
"entropy": 0.3965911865234375,
"epoch": 0.6086956521739131,
"grad_norm": 56.128155563321904,
"learning_rate": 9.934633361295558e-06,
"loss": 1.1563,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 40975042.0,
"step": 56
},
{
"entropy": 0.400970458984375,
"epoch": 0.6195652173913043,
"grad_norm": 55.26315542185204,
"learning_rate": 9.929713307644245e-06,
"loss": 1.1246,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 41696396.0,
"step": 57
},
{
"entropy": 0.39263916015625,
"epoch": 0.6304347826086957,
"grad_norm": 54.86254923259469,
"learning_rate": 9.924616056516027e-06,
"loss": 1.076,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 42440574.0,
"step": 58
},
{
"entropy": 0.3855133056640625,
"epoch": 0.6413043478260869,
"grad_norm": 54.37535851726419,
"learning_rate": 9.919341791130496e-06,
"loss": 0.9996,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 43201491.0,
"step": 59
},
{
"entropy": 0.3922271728515625,
"epoch": 0.6521739130434783,
"grad_norm": 53.843542444920786,
"learning_rate": 9.91389070106998e-06,
"loss": 0.939,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 43941990.0,
"step": 60
},
{
"entropy": 0.392181396484375,
"epoch": 0.6630434782608695,
"grad_norm": 53.193793655115314,
"learning_rate": 9.908262982272724e-06,
"loss": 0.9157,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 44683606.0,
"step": 61
},
{
"entropy": 0.4019012451171875,
"epoch": 0.6739130434782609,
"grad_norm": 52.50158074780275,
"learning_rate": 9.902458837025865e-06,
"loss": 0.9166,
"mean_token_accuracy": 0.8750000074505806,
"num_tokens": 45432434.0,
"step": 62
},
{
"entropy": 0.3917388916015625,
"epoch": 0.6847826086956522,
"grad_norm": 51.20620506081717,
"learning_rate": 9.896478473958147e-06,
"loss": 0.8248,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 46172808.0,
"step": 63
},
{
"entropy": 0.42156982421875,
"epoch": 0.6956521739130435,
"grad_norm": 49.71377838953342,
"learning_rate": 9.890322108032423e-06,
"loss": 0.7796,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 46887344.0,
"step": 64
},
{
"entropy": 0.412841796875,
"epoch": 0.7065217391304348,
"grad_norm": 48.79946579114015,
"learning_rate": 9.883989960537934e-06,
"loss": 0.7169,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 47598804.0,
"step": 65
},
{
"entropy": 0.4025421142578125,
"epoch": 0.717391304347826,
"grad_norm": 47.47246229291085,
"learning_rate": 9.87748225908235e-06,
"loss": 0.6599,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 48328626.0,
"step": 66
},
{
"entropy": 0.3900604248046875,
"epoch": 0.7282608695652174,
"grad_norm": 45.6630505460685,
"learning_rate": 9.870799237583586e-06,
"loss": 0.6299,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 49050818.0,
"step": 67
},
{
"entropy": 0.3900146484375,
"epoch": 0.7391304347826086,
"grad_norm": 43.671137236728555,
"learning_rate": 9.863941136261409e-06,
"loss": 0.5987,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 49821603.0,
"step": 68
},
{
"entropy": 0.3853912353515625,
"epoch": 0.75,
"grad_norm": 41.75463992853865,
"learning_rate": 9.85690820162878e-06,
"loss": 0.5554,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 50583851.0,
"step": 69
},
{
"entropy": 0.39080810546875,
"epoch": 0.7608695652173914,
"grad_norm": 39.63626833273986,
"learning_rate": 9.849700686483016e-06,
"loss": 0.5429,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 51340765.0,
"step": 70
},
{
"entropy": 0.3896942138671875,
"epoch": 0.7717391304347826,
"grad_norm": 37.2850766781592,
"learning_rate": 9.842318849896679e-06,
"loss": 0.5126,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 52062797.0,
"step": 71
},
{
"entropy": 0.4081268310546875,
"epoch": 0.782608695652174,
"grad_norm": 34.8524617938976,
"learning_rate": 9.834762957208293e-06,
"loss": 0.4817,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 52763819.0,
"step": 72
},
{
"entropy": 0.4026336669921875,
"epoch": 0.7934782608695652,
"grad_norm": 32.73006841336539,
"learning_rate": 9.827033280012783e-06,
"loss": 0.4532,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 53476825.0,
"step": 73
},
{
"entropy": 0.38232421875,
"epoch": 0.8043478260869565,
"grad_norm": 30.598447911376223,
"learning_rate": 9.819130096151718e-06,
"loss": 0.4288,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 54230469.0,
"step": 74
},
{
"entropy": 0.3890380859375,
"epoch": 0.8152173913043478,
"grad_norm": 28.510481522648345,
"learning_rate": 9.811053689703333e-06,
"loss": 0.3822,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 54968247.0,
"step": 75
},
{
"entropy": 0.3891143798828125,
"epoch": 0.8260869565217391,
"grad_norm": 26.69143261415315,
"learning_rate": 9.802804350972308e-06,
"loss": 0.3698,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 55729266.0,
"step": 76
},
{
"entropy": 0.4080963134765625,
"epoch": 0.8369565217391305,
"grad_norm": 24.853889219833917,
"learning_rate": 9.794382376479334e-06,
"loss": 0.3282,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 56439606.0,
"step": 77
},
{
"entropy": 0.39483642578125,
"epoch": 0.8478260869565217,
"grad_norm": 23.879103930139745,
"learning_rate": 9.785788068950463e-06,
"loss": 0.3492,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 57174042.0,
"step": 78
},
{
"entropy": 0.393463134765625,
"epoch": 0.8586956521739131,
"grad_norm": 29.502826535200015,
"learning_rate": 9.777021737306214e-06,
"loss": 0.3805,
"mean_token_accuracy": 0.890625006519258,
"num_tokens": 57922090.0,
"step": 79
},
{
"entropy": 0.39227294921875,
"epoch": 0.8695652173913043,
"grad_norm": 25.186278274805534,
"learning_rate": 9.768083696650481e-06,
"loss": 0.3131,
"mean_token_accuracy": 0.8828125069849193,
"num_tokens": 58657068.0,
"step": 80
},
{
"entropy": 0.388702392578125,
"epoch": 0.8804347826086957,
"grad_norm": 20.39179919664989,
"learning_rate": 9.7589742682592e-06,
"loss": 0.3105,
"mean_token_accuracy": 0.8541666753590107,
"num_tokens": 59371848.0,
"step": 81
},
{
"entropy": 0.39642333984375,
"epoch": 0.8913043478260869,
"grad_norm": 16.48573667654869,
"learning_rate": 9.749693779568799e-06,
"loss": 0.2881,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 60138700.0,
"step": 82
},
{
"entropy": 0.418212890625,
"epoch": 0.9021739130434783,
"grad_norm": 16.736139521937194,
"learning_rate": 9.740242564164433e-06,
"loss": 0.2842,
"mean_token_accuracy": 0.8880208400078118,
"num_tokens": 60857518.0,
"step": 83
},
{
"entropy": 0.406829833984375,
"epoch": 0.9130434782608695,
"grad_norm": 11.573091220404486,
"learning_rate": 9.730620961767996e-06,
"loss": 0.2459,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 61574756.0,
"step": 84
},
{
"entropy": 0.3925933837890625,
"epoch": 0.9239130434782609,
"grad_norm": 15.167601713145734,
"learning_rate": 9.720829318225897e-06,
"loss": 0.2541,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 62301917.0,
"step": 85
},
{
"entropy": 0.4003753662109375,
"epoch": 0.9347826086956522,
"grad_norm": 10.3890102579825,
"learning_rate": 9.710867985496644e-06,
"loss": 0.2287,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 63032620.0,
"step": 86
},
{
"entropy": 0.390167236328125,
"epoch": 0.9456521739130435,
"grad_norm": 8.81818629332175,
"learning_rate": 9.700737321638185e-06,
"loss": 0.254,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 63766906.0,
"step": 87
},
{
"entropy": 0.4013671875,
"epoch": 0.9565217391304348,
"grad_norm": 9.910698551626139,
"learning_rate": 9.690437690795038e-06,
"loss": 0.2311,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 64494352.0,
"step": 88
},
{
"entropy": 0.395233154296875,
"epoch": 0.967391304347826,
"grad_norm": 6.574734122970261,
"learning_rate": 9.6799694631852e-06,
"loss": 0.2114,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 65236044.0,
"step": 89
},
{
"entropy": 0.40948486328125,
"epoch": 0.9782608695652174,
"grad_norm": 6.510590706786288,
"learning_rate": 9.669333015086847e-06,
"loss": 0.2034,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 65983515.0,
"step": 90
},
{
"entropy": 0.3949737548828125,
"epoch": 0.9891304347826086,
"grad_norm": 5.9428204712433415,
"learning_rate": 9.658528728824799e-06,
"loss": 0.2004,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 66708318.0,
"step": 91
},
{
"entropy": 0.395599365234375,
"epoch": 1.0,
"grad_norm": 7.805501519637745,
"learning_rate": 9.647556992756789e-06,
"loss": 0.2292,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 67446881.0,
"step": 92
},
{
"entropy": 0.3989410400390625,
"epoch": 1.0108695652173914,
"grad_norm": 6.762899007449279,
"learning_rate": 9.63641820125949e-06,
"loss": 0.1931,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 68189298.0,
"step": 93
},
{
"entropy": 0.383544921875,
"epoch": 1.0217391304347827,
"grad_norm": 26.558510253726908,
"learning_rate": 9.62511275471435e-06,
"loss": 0.3214,
"mean_token_accuracy": 0.8854166734963655,
"num_tokens": 68923012.0,
"step": 94
},
{
"entropy": 0.4062652587890625,
"epoch": 1.0326086956521738,
"grad_norm": 17.66752124642183,
"learning_rate": 9.613641059493197e-06,
"loss": 0.2273,
"mean_token_accuracy": 0.8750000074505806,
"num_tokens": 69634729.0,
"step": 95
},
{
"entropy": 0.42718505859375,
"epoch": 1.0434782608695652,
"grad_norm": 6.440308115084329,
"learning_rate": 9.602003527943629e-06,
"loss": 0.2182,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 70360145.0,
"step": 96
},
{
"entropy": 0.419647216796875,
"epoch": 1.0543478260869565,
"grad_norm": 3.313729334755876,
"learning_rate": 9.590200578374198e-06,
"loss": 0.1965,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 71094229.0,
"step": 97
},
{
"entropy": 0.4211273193359375,
"epoch": 1.065217391304348,
"grad_norm": 6.472092255757511,
"learning_rate": 9.578232635039368e-06,
"loss": 0.1891,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 71837508.0,
"step": 98
},
{
"entropy": 0.4221038818359375,
"epoch": 1.0760869565217392,
"grad_norm": 5.65079339360605,
"learning_rate": 9.56610012812427e-06,
"loss": 0.1976,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 72578472.0,
"step": 99
},
{
"entropy": 0.4143218994140625,
"epoch": 1.0869565217391304,
"grad_norm": 10.526588616167475,
"learning_rate": 9.553803493729237e-06,
"loss": 0.2276,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 73309250.0,
"step": 100
},
{
"entropy": 0.4214324951171875,
"epoch": 1.0978260869565217,
"grad_norm": 6.76038126286873,
"learning_rate": 9.541343173854128e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 74039837.0,
"step": 101
},
{
"entropy": 0.426239013671875,
"epoch": 1.108695652173913,
"grad_norm": 2.550406593305114,
"learning_rate": 9.528719616382443e-06,
"loss": 0.194,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 74782288.0,
"step": 102
},
{
"entropy": 0.4285736083984375,
"epoch": 1.1195652173913044,
"grad_norm": 1.9861505083333468,
"learning_rate": 9.515933275065218e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 75514570.0,
"step": 103
},
{
"entropy": 0.4146728515625,
"epoch": 1.1304347826086956,
"grad_norm": 2.0937110928111204,
"learning_rate": 9.502984609504724e-06,
"loss": 0.1657,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 76263471.0,
"step": 104
},
{
"entropy": 0.409637451171875,
"epoch": 1.141304347826087,
"grad_norm": 4.0370749201612925,
"learning_rate": 9.48987408513794e-06,
"loss": 0.2,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 77032808.0,
"step": 105
},
{
"entropy": 0.43310546875,
"epoch": 1.1521739130434783,
"grad_norm": 6.171691874710987,
"learning_rate": 9.476602173219822e-06,
"loss": 0.1915,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 77755289.0,
"step": 106
},
{
"entropy": 0.421905517578125,
"epoch": 1.1630434782608696,
"grad_norm": 9.707071779843087,
"learning_rate": 9.463169350806369e-06,
"loss": 0.218,
"mean_token_accuracy": 0.8854166734963655,
"num_tokens": 78504728.0,
"step": 107
},
{
"entropy": 0.442291259765625,
"epoch": 1.1739130434782608,
"grad_norm": 4.492512568138839,
"learning_rate": 9.449576100737474e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 79234552.0,
"step": 108
},
{
"entropy": 0.42706298828125,
"epoch": 1.184782608695652,
"grad_norm": 7.98756060312019,
"learning_rate": 9.435822911619564e-06,
"loss": 0.2187,
"mean_token_accuracy": 0.8645833414047956,
"num_tokens": 79986236.0,
"step": 109
},
{
"entropy": 0.430419921875,
"epoch": 1.1956521739130435,
"grad_norm": 2.1129664499398926,
"learning_rate": 9.421910277808044e-06,
"loss": 0.1867,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 80716096.0,
"step": 110
},
{
"entropy": 0.4068450927734375,
"epoch": 1.2065217391304348,
"grad_norm": 16.963736777066014,
"learning_rate": 9.407838699389525e-06,
"loss": 0.3354,
"mean_token_accuracy": 0.8723958409391344,
"num_tokens": 81458849.0,
"step": 111
},
{
"entropy": 0.4076385498046875,
"epoch": 1.2173913043478262,
"grad_norm": 13.924252562663128,
"learning_rate": 9.39360868216384e-06,
"loss": 0.3034,
"mean_token_accuracy": 0.8776041739620268,
"num_tokens": 82211351.0,
"step": 112
},
{
"entropy": 0.4111328125,
"epoch": 1.2282608695652173,
"grad_norm": 1.7100595172279562,
"learning_rate": 9.379220737625877e-06,
"loss": 0.1878,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 82980102.0,
"step": 113
},
{
"entropy": 0.433319091796875,
"epoch": 1.2391304347826086,
"grad_norm": 5.846423629455087,
"learning_rate": 9.364675382947185e-06,
"loss": 0.2073,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 83711400.0,
"step": 114
},
{
"entropy": 0.445159912109375,
"epoch": 1.25,
"grad_norm": 6.451469321919068,
"learning_rate": 9.349973140957392e-06,
"loss": 0.2106,
"mean_token_accuracy": 0.8854166734963655,
"num_tokens": 84460019.0,
"step": 115
},
{
"entropy": 0.447662353515625,
"epoch": 1.2608695652173914,
"grad_norm": 2.7790448798410115,
"learning_rate": 9.335114540125393e-06,
"loss": 0.1996,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 85193928.0,
"step": 116
},
{
"entropy": 0.4636077880859375,
"epoch": 1.2717391304347827,
"grad_norm": 2.6482470281746333,
"learning_rate": 9.320100114540382e-06,
"loss": 0.1937,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 85934868.0,
"step": 117
},
{
"entropy": 0.4525604248046875,
"epoch": 1.2826086956521738,
"grad_norm": 3.315578614192242,
"learning_rate": 9.304930403892633e-06,
"loss": 0.1847,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 86646205.0,
"step": 118
},
{
"entropy": 0.4474029541015625,
"epoch": 1.2934782608695652,
"grad_norm": 3.26330878602713,
"learning_rate": 9.289605953454108e-06,
"loss": 0.1799,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 87395348.0,
"step": 119
},
{
"entropy": 0.451995849609375,
"epoch": 1.3043478260869565,
"grad_norm": 4.510835834017429,
"learning_rate": 9.274127314058857e-06,
"loss": 0.2158,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 88139881.0,
"step": 120
},
{
"entropy": 0.4410247802734375,
"epoch": 1.315217391304348,
"grad_norm": 3.053218547982469,
"learning_rate": 9.258495042083222e-06,
"loss": 0.1895,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 88852096.0,
"step": 121
},
{
"entropy": 0.434051513671875,
"epoch": 1.3260869565217392,
"grad_norm": 1.5033565251314938,
"learning_rate": 9.242709699425833e-06,
"loss": 0.1819,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 89626261.0,
"step": 122
},
{
"entropy": 0.4545745849609375,
"epoch": 1.3369565217391304,
"grad_norm": 3.0951856378919347,
"learning_rate": 9.226771853487411e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 90365741.0,
"step": 123
},
{
"entropy": 0.4754486083984375,
"epoch": 1.3478260869565217,
"grad_norm": 1.9859436895610123,
"learning_rate": 9.210682077150375e-06,
"loss": 0.1646,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 91057876.0,
"step": 124
},
{
"entropy": 0.487884521484375,
"epoch": 1.358695652173913,
"grad_norm": 8.27013486394484,
"learning_rate": 9.19444094875825e-06,
"loss": 0.1843,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 91761477.0,
"step": 125
},
{
"entropy": 0.4636993408203125,
"epoch": 1.3695652173913042,
"grad_norm": 2.2934540711997515,
"learning_rate": 9.178049052094881e-06,
"loss": 0.1618,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 92506148.0,
"step": 126
},
{
"entropy": 0.4479217529296875,
"epoch": 1.3804347826086958,
"grad_norm": 4.786209262389493,
"learning_rate": 9.161506976363438e-06,
"loss": 0.1983,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 93254912.0,
"step": 127
},
{
"entropy": 0.4602508544921875,
"epoch": 1.391304347826087,
"grad_norm": 5.7691216780542005,
"learning_rate": 9.144815316165251e-06,
"loss": 0.1935,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 93978264.0,
"step": 128
},
{
"entropy": 0.4593658447265625,
"epoch": 1.4021739130434783,
"grad_norm": 11.199636434276805,
"learning_rate": 9.127974671478432e-06,
"loss": 0.2107,
"mean_token_accuracy": 0.8776041739620268,
"num_tokens": 94710015.0,
"step": 129
},
{
"entropy": 0.446746826171875,
"epoch": 1.4130434782608696,
"grad_norm": 0.9923518223404385,
"learning_rate": 9.110985647636303e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 95446308.0,
"step": 130
},
{
"entropy": 0.476165771484375,
"epoch": 1.4239130434782608,
"grad_norm": 4.474677717637186,
"learning_rate": 9.09384885530565e-06,
"loss": 0.1851,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 96158637.0,
"step": 131
},
{
"entropy": 0.481658935546875,
"epoch": 1.434782608695652,
"grad_norm": 2.185059548670286,
"learning_rate": 9.076564910464753e-06,
"loss": 0.2109,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 96918874.0,
"step": 132
},
{
"entropy": 0.5084381103515625,
"epoch": 1.4456521739130435,
"grad_norm": 2.6152649554216567,
"learning_rate": 9.059134434381274e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 97644105.0,
"step": 133
},
{
"entropy": 0.5358123779296875,
"epoch": 1.4565217391304348,
"grad_norm": 3.5962048690843793,
"learning_rate": 9.041558053589894e-06,
"loss": 0.19,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 98355051.0,
"step": 134
},
{
"entropy": 0.5171661376953125,
"epoch": 1.4673913043478262,
"grad_norm": 1.2450335202012426,
"learning_rate": 9.023836399869814e-06,
"loss": 0.1829,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 99093771.0,
"step": 135
},
{
"entropy": 0.517242431640625,
"epoch": 1.4782608695652173,
"grad_norm": 2.8717902587276094,
"learning_rate": 9.00597011022204e-06,
"loss": 0.1513,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 99833460.0,
"step": 136
},
{
"entropy": 0.482940673828125,
"epoch": 1.4891304347826086,
"grad_norm": 1.364200004039475,
"learning_rate": 8.987959826846479e-06,
"loss": 0.2022,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 100568646.0,
"step": 137
},
{
"entropy": 0.4926300048828125,
"epoch": 1.5,
"grad_norm": 0.597811474604225,
"learning_rate": 8.96980619711887e-06,
"loss": 0.1535,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 101307254.0,
"step": 138
},
{
"entropy": 0.514373779296875,
"epoch": 1.5108695652173914,
"grad_norm": 0.7538518313390983,
"learning_rate": 8.951509873567498e-06,
"loss": 0.1703,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 102001272.0,
"step": 139
},
{
"entropy": 0.496429443359375,
"epoch": 1.5217391304347827,
"grad_norm": 6.358019373787709,
"learning_rate": 8.93307151384975e-06,
"loss": 0.2264,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 102695383.0,
"step": 140
},
{
"entropy": 0.46575927734375,
"epoch": 1.5326086956521738,
"grad_norm": 5.517487056748144,
"learning_rate": 8.914491780728471e-06,
"loss": 0.2207,
"mean_token_accuracy": 0.8776041739620268,
"num_tokens": 103429899.0,
"step": 141
},
{
"entropy": 0.4832611083984375,
"epoch": 1.5434782608695652,
"grad_norm": 2.4395442761334785,
"learning_rate": 8.895771342048145e-06,
"loss": 0.1596,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 104163455.0,
"step": 142
},
{
"entropy": 0.4860687255859375,
"epoch": 1.5543478260869565,
"grad_norm": 4.472158201631238,
"learning_rate": 8.876910870710885e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 104930091.0,
"step": 143
},
{
"entropy": 0.5011749267578125,
"epoch": 1.5652173913043477,
"grad_norm": 1.8149497345123533,
"learning_rate": 8.857911044652244e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 105653123.0,
"step": 144
},
{
"entropy": 0.5025482177734375,
"epoch": 1.5760869565217392,
"grad_norm": 3.8636950718069123,
"learning_rate": 8.838772546816857e-06,
"loss": 0.1808,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 106368006.0,
"step": 145
},
{
"entropy": 0.5029754638671875,
"epoch": 1.5869565217391304,
"grad_norm": 5.002115277057554,
"learning_rate": 8.819496065133879e-06,
"loss": 0.1834,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 107097616.0,
"step": 146
},
{
"entropy": 0.521240234375,
"epoch": 1.5978260869565217,
"grad_norm": 1.015730869884965,
"learning_rate": 8.800082292492274e-06,
"loss": 0.1662,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 107812186.0,
"step": 147
},
{
"entropy": 0.4935760498046875,
"epoch": 1.608695652173913,
"grad_norm": 1.0635747380852985,
"learning_rate": 8.780531926715888e-06,
"loss": 0.1704,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 108547506.0,
"step": 148
},
{
"entropy": 0.534637451171875,
"epoch": 1.6195652173913042,
"grad_norm": 1.5449003097461054,
"learning_rate": 8.760845670538387e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 109266322.0,
"step": 149
},
{
"entropy": 0.5189208984375,
"epoch": 1.6304347826086958,
"grad_norm": 2.0268875856085344,
"learning_rate": 8.741024231577983e-06,
"loss": 0.1517,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 110008922.0,
"step": 150
},
{
"entropy": 0.5161285400390625,
"epoch": 1.641304347826087,
"grad_norm": 1.0672579311719492,
"learning_rate": 8.721068322312007e-06,
"loss": 0.1531,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 110727558.0,
"step": 151
},
{
"entropy": 0.521392822265625,
"epoch": 1.6521739130434783,
"grad_norm": 5.213780878437388,
"learning_rate": 8.700978660051293e-06,
"loss": 0.195,
"mean_token_accuracy": 0.890625006519258,
"num_tokens": 111471527.0,
"step": 152
},
{
"entropy": 0.5205535888671875,
"epoch": 1.6630434782608696,
"grad_norm": 1.9517069380322867,
"learning_rate": 8.6807559669144e-06,
"loss": 0.1596,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 112239963.0,
"step": 153
},
{
"entropy": 0.561065673828125,
"epoch": 1.6739130434782608,
"grad_norm": 2.7705962729348363,
"learning_rate": 8.660400969801653e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 112976654.0,
"step": 154
},
{
"entropy": 0.5628204345703125,
"epoch": 1.6847826086956523,
"grad_norm": 4.898163032031099,
"learning_rate": 8.63991440036901e-06,
"loss": 0.1707,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 113718848.0,
"step": 155
},
{
"entropy": 0.5825958251953125,
"epoch": 1.6956521739130435,
"grad_norm": 2.833385371279203,
"learning_rate": 8.619296995001773e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 114442519.0,
"step": 156
},
{
"entropy": 0.57159423828125,
"epoch": 1.7065217391304348,
"grad_norm": 4.117296477591056,
"learning_rate": 8.598549494788111e-06,
"loss": 0.1806,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 115181598.0,
"step": 157
},
{
"entropy": 0.5811614990234375,
"epoch": 1.7173913043478262,
"grad_norm": 4.092040299967927,
"learning_rate": 8.577672645492426e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 115915785.0,
"step": 158
},
{
"entropy": 0.5871734619140625,
"epoch": 1.7282608695652173,
"grad_norm": 2.8908222719784127,
"learning_rate": 8.556667197528543e-06,
"loss": 0.1816,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 116621451.0,
"step": 159
},
{
"entropy": 0.5701141357421875,
"epoch": 1.7391304347826086,
"grad_norm": 3.6224602544096918,
"learning_rate": 8.535533905932739e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 117310119.0,
"step": 160
},
{
"entropy": 0.56756591796875,
"epoch": 1.75,
"grad_norm": 1.4631757964659389,
"learning_rate": 8.5142735303366e-06,
"loss": 0.1736,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 118041095.0,
"step": 161
},
{
"entropy": 0.5458984375,
"epoch": 1.7608695652173914,
"grad_norm": 2.3531502542663385,
"learning_rate": 8.492886834939722e-06,
"loss": 0.1568,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 118753230.0,
"step": 162
},
{
"entropy": 0.5509033203125,
"epoch": 1.7717391304347827,
"grad_norm": 5.023926546982622,
"learning_rate": 8.47137458848224e-06,
"loss": 0.1753,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 119489643.0,
"step": 163
},
{
"entropy": 0.53118896484375,
"epoch": 1.7826086956521738,
"grad_norm": 1.721920120015578,
"learning_rate": 8.44973756421719e-06,
"loss": 0.15,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 120231301.0,
"step": 164
},
{
"entropy": 0.52777099609375,
"epoch": 1.7934782608695652,
"grad_norm": 3.418076490433067,
"learning_rate": 8.427976539882725e-06,
"loss": 0.1613,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 120962416.0,
"step": 165
},
{
"entropy": 0.51080322265625,
"epoch": 1.8043478260869565,
"grad_norm": 3.4170913157737797,
"learning_rate": 8.406092297674146e-06,
"loss": 0.1552,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 121726443.0,
"step": 166
},
{
"entropy": 0.5138702392578125,
"epoch": 1.8152173913043477,
"grad_norm": 4.469668539839008,
"learning_rate": 8.384085624215801e-06,
"loss": 0.1692,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 122456880.0,
"step": 167
},
{
"entropy": 0.5077056884765625,
"epoch": 1.8260869565217392,
"grad_norm": 3.398224636055471,
"learning_rate": 8.3619573105328e-06,
"loss": 0.1976,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 123192926.0,
"step": 168
},
{
"entropy": 0.5028533935546875,
"epoch": 1.8369565217391304,
"grad_norm": 1.1058962935015557,
"learning_rate": 8.339708152022586e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 123947455.0,
"step": 169
},
{
"entropy": 0.5254058837890625,
"epoch": 1.8478260869565217,
"grad_norm": 5.167659625792978,
"learning_rate": 8.317338948426338e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 124663961.0,
"step": 170
},
{
"entropy": 0.5162200927734375,
"epoch": 1.858695652173913,
"grad_norm": 3.2546914206847983,
"learning_rate": 8.294850503800237e-06,
"loss": 0.1575,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 125403597.0,
"step": 171
},
{
"entropy": 0.536865234375,
"epoch": 1.8695652173913042,
"grad_norm": 2.121124553980237,
"learning_rate": 8.272243626486553e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 126132729.0,
"step": 172
},
{
"entropy": 0.511260986328125,
"epoch": 1.8804347826086958,
"grad_norm": 3.615045249735466,
"learning_rate": 8.24951912908459e-06,
"loss": 0.1607,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 126885495.0,
"step": 173
},
{
"entropy": 0.5345001220703125,
"epoch": 1.891304347826087,
"grad_norm": 2.959072727194171,
"learning_rate": 8.22667782842149e-06,
"loss": 0.1873,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 127611736.0,
"step": 174
},
{
"entropy": 0.544708251953125,
"epoch": 1.9021739130434783,
"grad_norm": 0.8408245157759806,
"learning_rate": 8.203720545522852e-06,
"loss": 0.1548,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 128331508.0,
"step": 175
},
{
"entropy": 0.542083740234375,
"epoch": 1.9130434782608696,
"grad_norm": 3.1936191732342483,
"learning_rate": 8.18064810558324e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 129056107.0,
"step": 176
},
{
"entropy": 0.52423095703125,
"epoch": 1.9239130434782608,
"grad_norm": 1.6394513978982945,
"learning_rate": 8.157461337936506e-06,
"loss": 0.15,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 129807571.0,
"step": 177
},
{
"entropy": 0.5415802001953125,
"epoch": 1.9347826086956523,
"grad_norm": 0.7311318246016283,
"learning_rate": 8.134161076025992e-06,
"loss": 0.1555,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 130558970.0,
"step": 178
},
{
"entropy": 0.5243377685546875,
"epoch": 1.9456521739130435,
"grad_norm": 8.45397210689231,
"learning_rate": 8.110748157374566e-06,
"loss": 0.1935,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 131313276.0,
"step": 179
},
{
"entropy": 0.5270843505859375,
"epoch": 1.9565217391304348,
"grad_norm": 4.680053604300379,
"learning_rate": 8.087223423554513e-06,
"loss": 0.1442,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 132053009.0,
"step": 180
},
{
"entropy": 0.5287017822265625,
"epoch": 1.9673913043478262,
"grad_norm": 1.4362435578406598,
"learning_rate": 8.063587720157298e-06,
"loss": 0.1577,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 132815611.0,
"step": 181
},
{
"entropy": 0.5521240234375,
"epoch": 1.9782608695652173,
"grad_norm": 4.592426202753763,
"learning_rate": 8.039841896763157e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 133527514.0,
"step": 182
},
{
"entropy": 0.562744140625,
"epoch": 1.9891304347826086,
"grad_norm": 8.804668623486542,
"learning_rate": 8.01598680691057e-06,
"loss": 0.1974,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 134233532.0,
"step": 183
},
{
"entropy": 0.52972412109375,
"epoch": 2.0,
"grad_norm": 5.442557880753617,
"learning_rate": 7.99202330806557e-06,
"loss": 0.1489,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 134970186.0,
"step": 184
},
{
"entropy": 0.5426025390625,
"epoch": 2.010869565217391,
"grad_norm": 3.8135433451565692,
"learning_rate": 7.967952261590936e-06,
"loss": 0.1616,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 135686807.0,
"step": 185
},
{
"entropy": 0.532379150390625,
"epoch": 2.0217391304347827,
"grad_norm": 5.581932015960598,
"learning_rate": 7.943774532715215e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 136422808.0,
"step": 186
},
{
"entropy": 0.515899658203125,
"epoch": 2.032608695652174,
"grad_norm": 6.432043736528405,
"learning_rate": 7.919490990501636e-06,
"loss": 0.1646,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 137151038.0,
"step": 187
},
{
"entropy": 0.51885986328125,
"epoch": 2.0434782608695654,
"grad_norm": 4.974490425422215,
"learning_rate": 7.895102507816866e-06,
"loss": 0.1638,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 137880949.0,
"step": 188
},
{
"entropy": 0.52191162109375,
"epoch": 2.0543478260869565,
"grad_norm": 3.2102438591558133,
"learning_rate": 7.870609961299627e-06,
"loss": 0.152,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 138598332.0,
"step": 189
},
{
"entropy": 0.517822265625,
"epoch": 2.0652173913043477,
"grad_norm": 1.129167074131451,
"learning_rate": 7.8460142313292e-06,
"loss": 0.1986,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 139306369.0,
"step": 190
},
{
"entropy": 0.4926605224609375,
"epoch": 2.0760869565217392,
"grad_norm": 5.511214416637161,
"learning_rate": 7.821316201993768e-06,
"loss": 0.191,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 140050111.0,
"step": 191
},
{
"entropy": 0.4912261962890625,
"epoch": 2.0869565217391304,
"grad_norm": 4.589563951441601,
"learning_rate": 7.796516761058649e-06,
"loss": 0.1709,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 140770436.0,
"step": 192
},
{
"entropy": 0.517242431640625,
"epoch": 2.097826086956522,
"grad_norm": 3.307436574573834,
"learning_rate": 7.771616799934372e-06,
"loss": 0.1747,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 141460692.0,
"step": 193
},
{
"entropy": 0.495513916015625,
"epoch": 2.108695652173913,
"grad_norm": 1.2213195030130277,
"learning_rate": 7.746617213644646e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 142200106.0,
"step": 194
},
{
"entropy": 0.530914306640625,
"epoch": 2.119565217391304,
"grad_norm": 2.175857735434003,
"learning_rate": 7.721518900794186e-06,
"loss": 0.1586,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 142904935.0,
"step": 195
},
{
"entropy": 0.492340087890625,
"epoch": 2.130434782608696,
"grad_norm": 1.3407787633418933,
"learning_rate": 7.696322763536408e-06,
"loss": 0.1657,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 143621023.0,
"step": 196
},
{
"entropy": 0.5035247802734375,
"epoch": 2.141304347826087,
"grad_norm": 2.742789829325077,
"learning_rate": 7.67102970754101e-06,
"loss": 0.1473,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 144348296.0,
"step": 197
},
{
"entropy": 0.4730987548828125,
"epoch": 2.1521739130434785,
"grad_norm": 2.911801084282523,
"learning_rate": 7.645640641961407e-06,
"loss": 0.1464,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 145094605.0,
"step": 198
},
{
"entropy": 0.4618377685546875,
"epoch": 2.1630434782608696,
"grad_norm": 3.6081222338142305,
"learning_rate": 7.620156479402066e-06,
"loss": 0.1526,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 145847120.0,
"step": 199
},
{
"entropy": 0.478057861328125,
"epoch": 2.1739130434782608,
"grad_norm": 1.0514452021998937,
"learning_rate": 7.594578135885684e-06,
"loss": 0.1418,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 146577419.0,
"step": 200
},
{
"entropy": 0.466522216796875,
"epoch": 2.1847826086956523,
"grad_norm": 1.7411693991777,
"learning_rate": 7.568906530820281e-06,
"loss": 0.1391,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 147309360.0,
"step": 201
},
{
"entropy": 0.466461181640625,
"epoch": 2.1956521739130435,
"grad_norm": 2.17620767082738,
"learning_rate": 7.543142586966139e-06,
"loss": 0.1429,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 148048145.0,
"step": 202
},
{
"entropy": 0.460052490234375,
"epoch": 2.2065217391304346,
"grad_norm": 1.1004916008068764,
"learning_rate": 7.517287230402639e-06,
"loss": 0.1792,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 148793377.0,
"step": 203
},
{
"entropy": 0.4695587158203125,
"epoch": 2.217391304347826,
"grad_norm": 3.228916035303672,
"learning_rate": 7.491341390494971e-06,
"loss": 0.2017,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 149529344.0,
"step": 204
},
{
"entropy": 0.495758056640625,
"epoch": 2.2282608695652173,
"grad_norm": 2.0917537496816916,
"learning_rate": 7.465305999860728e-06,
"loss": 0.1602,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 150239023.0,
"step": 205
},
{
"entropy": 0.5149688720703125,
"epoch": 2.239130434782609,
"grad_norm": 1.7658146369992271,
"learning_rate": 7.439181994336389e-06,
"loss": 0.1559,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 150959182.0,
"step": 206
},
{
"entropy": 0.491668701171875,
"epoch": 2.25,
"grad_norm": 1.842592163481376,
"learning_rate": 7.412970312943672e-06,
"loss": 0.1593,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 151696790.0,
"step": 207
},
{
"entropy": 0.5184173583984375,
"epoch": 2.260869565217391,
"grad_norm": 1.5342919929733787,
"learning_rate": 7.386671897855786e-06,
"loss": 0.146,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 152399305.0,
"step": 208
},
{
"entropy": 0.5020751953125,
"epoch": 2.2717391304347827,
"grad_norm": 1.285400661348482,
"learning_rate": 7.360287694363566e-06,
"loss": 0.1389,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 153144631.0,
"step": 209
},
{
"entropy": 0.4999237060546875,
"epoch": 2.282608695652174,
"grad_norm": 1.089626772132905,
"learning_rate": 7.333818650841489e-06,
"loss": 0.1728,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 153863039.0,
"step": 210
},
{
"entropy": 0.4629669189453125,
"epoch": 2.2934782608695654,
"grad_norm": 0.7958565648055749,
"learning_rate": 7.3072657187135895e-06,
"loss": 0.1365,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 154627008.0,
"step": 211
},
{
"entropy": 0.4714813232421875,
"epoch": 2.3043478260869565,
"grad_norm": 2.1294805846667435,
"learning_rate": 7.280629852419263e-06,
"loss": 0.1185,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 155336075.0,
"step": 212
},
{
"entropy": 0.4590911865234375,
"epoch": 2.3152173913043477,
"grad_norm": 2.2936903239742428,
"learning_rate": 7.253912009378953e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 156069482.0,
"step": 213
},
{
"entropy": 0.4477081298828125,
"epoch": 2.3260869565217392,
"grad_norm": 1.399883348847463,
"learning_rate": 7.227113149959738e-06,
"loss": 0.165,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 156810168.0,
"step": 214
},
{
"entropy": 0.4658660888671875,
"epoch": 2.3369565217391304,
"grad_norm": 1.1309340794568266,
"learning_rate": 7.200234237440815e-06,
"loss": 0.1495,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 157525831.0,
"step": 215
},
{
"entropy": 0.452239990234375,
"epoch": 2.3478260869565215,
"grad_norm": 3.438678019243807,
"learning_rate": 7.173276237978872e-06,
"loss": 0.1612,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 158256452.0,
"step": 216
},
{
"entropy": 0.4614105224609375,
"epoch": 2.358695652173913,
"grad_norm": 0.7820925016217779,
"learning_rate": 7.146240120573358e-06,
"loss": 0.1356,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 158989349.0,
"step": 217
},
{
"entropy": 0.4483489990234375,
"epoch": 2.369565217391304,
"grad_norm": 0.9729093397557323,
"learning_rate": 7.1191268570316575e-06,
"loss": 0.1493,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 159726218.0,
"step": 218
},
{
"entropy": 0.457611083984375,
"epoch": 2.380434782608696,
"grad_norm": 2.9206076228482334,
"learning_rate": 7.091937421934158e-06,
"loss": 0.1624,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 160463988.0,
"step": 219
},
{
"entropy": 0.4862823486328125,
"epoch": 2.391304347826087,
"grad_norm": 3.479020953108577,
"learning_rate": 7.064672792599208e-06,
"loss": 0.1396,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 161181564.0,
"step": 220
},
{
"entropy": 0.4723968505859375,
"epoch": 2.4021739130434785,
"grad_norm": 0.8546526492754443,
"learning_rate": 7.037333949048005e-06,
"loss": 0.1331,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 161913838.0,
"step": 221
},
{
"entropy": 0.4722442626953125,
"epoch": 2.4130434782608696,
"grad_norm": 0.8101386153737262,
"learning_rate": 7.009921873969359e-06,
"loss": 0.149,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 162640954.0,
"step": 222
},
{
"entropy": 0.48663330078125,
"epoch": 2.4239130434782608,
"grad_norm": 2.1383118886350183,
"learning_rate": 6.9824375526843705e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 163373073.0,
"step": 223
},
{
"entropy": 0.512115478515625,
"epoch": 2.4347826086956523,
"grad_norm": 0.8385687350546065,
"learning_rate": 6.954881973111013e-06,
"loss": 0.1554,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 164096548.0,
"step": 224
},
{
"entropy": 0.529052734375,
"epoch": 2.4456521739130435,
"grad_norm": 1.0514628608699,
"learning_rate": 6.927256125728624e-06,
"loss": 0.1373,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 164820809.0,
"step": 225
},
{
"entropy": 0.5160675048828125,
"epoch": 2.4565217391304346,
"grad_norm": 2.698419287754823,
"learning_rate": 6.8995610035423044e-06,
"loss": 0.153,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 165550845.0,
"step": 226
},
{
"entropy": 0.5378570556640625,
"epoch": 2.467391304347826,
"grad_norm": 0.7993047970571002,
"learning_rate": 6.871797602047221e-06,
"loss": 0.1372,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 166277643.0,
"step": 227
},
{
"entropy": 0.576873779296875,
"epoch": 2.4782608695652173,
"grad_norm": 1.5400335597456314,
"learning_rate": 6.843966919192827e-06,
"loss": 0.138,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 166987602.0,
"step": 228
},
{
"entropy": 0.544677734375,
"epoch": 2.489130434782609,
"grad_norm": 1.3501457655398248,
"learning_rate": 6.816069955346986e-06,
"loss": 0.1527,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 167701559.0,
"step": 229
},
{
"entropy": 0.5312347412109375,
"epoch": 2.5,
"grad_norm": 1.346411855493066,
"learning_rate": 6.788107713260023e-06,
"loss": 0.1398,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 168423290.0,
"step": 230
},
{
"entropy": 0.5373382568359375,
"epoch": 2.5108695652173916,
"grad_norm": 1.5812163663322734,
"learning_rate": 6.760081198028671e-06,
"loss": 0.1524,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 169153964.0,
"step": 231
},
{
"entropy": 0.5088958740234375,
"epoch": 2.5217391304347827,
"grad_norm": 1.7413265225657986,
"learning_rate": 6.731991417059947e-06,
"loss": 0.1376,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 169921938.0,
"step": 232
},
{
"entropy": 0.5161895751953125,
"epoch": 2.532608695652174,
"grad_norm": 1.3981089558799913,
"learning_rate": 6.703839380034945e-06,
"loss": 0.1301,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 170682692.0,
"step": 233
},
{
"entropy": 0.5208587646484375,
"epoch": 2.5434782608695654,
"grad_norm": 2.8149627673456936,
"learning_rate": 6.675626098872536e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 171432994.0,
"step": 234
},
{
"entropy": 0.51824951171875,
"epoch": 2.5543478260869565,
"grad_norm": 1.4684058065548915,
"learning_rate": 6.647352587693001e-06,
"loss": 0.1624,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 172141981.0,
"step": 235
},
{
"entropy": 0.4888153076171875,
"epoch": 2.5652173913043477,
"grad_norm": 1.4406536148696736,
"learning_rate": 6.619019862781571e-06,
"loss": 0.1396,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 172897864.0,
"step": 236
},
{
"entropy": 0.50592041015625,
"epoch": 2.5760869565217392,
"grad_norm": 2.7792231969842742,
"learning_rate": 6.590628942551909e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 173612172.0,
"step": 237
},
{
"entropy": 0.479766845703125,
"epoch": 2.5869565217391304,
"grad_norm": 0.8612062254275914,
"learning_rate": 6.5621808475094904e-06,
"loss": 0.1472,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 174336959.0,
"step": 238
},
{
"entropy": 0.463836669921875,
"epoch": 2.5978260869565215,
"grad_norm": 2.1652743091901083,
"learning_rate": 6.533676600214929e-06,
"loss": 0.1323,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 175095005.0,
"step": 239
},
{
"entropy": 0.4647979736328125,
"epoch": 2.608695652173913,
"grad_norm": 1.0538425000460028,
"learning_rate": 6.505117225247218e-06,
"loss": 0.1559,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 175821704.0,
"step": 240
},
{
"entropy": 0.471099853515625,
"epoch": 2.619565217391304,
"grad_norm": 0.9819078153379016,
"learning_rate": 6.476503749166903e-06,
"loss": 0.1457,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 176557432.0,
"step": 241
},
{
"entropy": 0.4735260009765625,
"epoch": 2.630434782608696,
"grad_norm": 1.778816402900594,
"learning_rate": 6.447837200479187e-06,
"loss": 0.1823,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 177267268.0,
"step": 242
},
{
"entropy": 0.476226806640625,
"epoch": 2.641304347826087,
"grad_norm": 0.7823399457915889,
"learning_rate": 6.419118609596948e-06,
"loss": 0.146,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 177988371.0,
"step": 243
},
{
"entropy": 0.4737396240234375,
"epoch": 2.6521739130434785,
"grad_norm": 2.300505507236317,
"learning_rate": 6.390349008803717e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 178721738.0,
"step": 244
},
{
"entropy": 0.4729461669921875,
"epoch": 2.6630434782608696,
"grad_norm": 0.9400854604905068,
"learning_rate": 6.36152943221656e-06,
"loss": 0.1878,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 179467726.0,
"step": 245
},
{
"entropy": 0.4758758544921875,
"epoch": 2.6739130434782608,
"grad_norm": 2.605291347050962,
"learning_rate": 6.332660915748915e-06,
"loss": 0.1686,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 180215932.0,
"step": 246
},
{
"entropy": 0.48907470703125,
"epoch": 2.6847826086956523,
"grad_norm": 2.962520848096924,
"learning_rate": 6.303744497073352e-06,
"loss": 0.1508,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 180961953.0,
"step": 247
},
{
"entropy": 0.493072509765625,
"epoch": 2.6956521739130435,
"grad_norm": 1.0342817962395638,
"learning_rate": 6.274781215584277e-06,
"loss": 0.1489,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 181653249.0,
"step": 248
},
{
"entropy": 0.5084991455078125,
"epoch": 2.7065217391304346,
"grad_norm": 0.6822142118394301,
"learning_rate": 6.245772112360568e-06,
"loss": 0.1564,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 182374091.0,
"step": 249
},
{
"entropy": 0.4775848388671875,
"epoch": 2.717391304347826,
"grad_norm": 2.62505064720756,
"learning_rate": 6.216718230128156e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 183112589.0,
"step": 250
},
{
"entropy": 0.4820556640625,
"epoch": 2.7282608695652173,
"grad_norm": 1.4778946605073953,
"learning_rate": 6.187620613222544e-06,
"loss": 0.1516,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 183857175.0,
"step": 251
},
{
"entropy": 0.4744110107421875,
"epoch": 2.7391304347826084,
"grad_norm": 1.6896217372430502,
"learning_rate": 6.158480307551269e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 184586476.0,
"step": 252
},
{
"entropy": 0.492889404296875,
"epoch": 2.75,
"grad_norm": 0.7824494663876816,
"learning_rate": 6.129298360556304e-06,
"loss": 0.1207,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 185333896.0,
"step": 253
},
{
"entropy": 0.48492431640625,
"epoch": 2.7608695652173916,
"grad_norm": 0.885817818754658,
"learning_rate": 6.100075821176412e-06,
"loss": 0.1531,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 186054040.0,
"step": 254
},
{
"entropy": 0.5035552978515625,
"epoch": 2.7717391304347827,
"grad_norm": 3.4109040646320348,
"learning_rate": 6.070813739809443e-06,
"loss": 0.1741,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 186783234.0,
"step": 255
},
{
"entropy": 0.52508544921875,
"epoch": 2.782608695652174,
"grad_norm": 3.8891039102099123,
"learning_rate": 6.041513168274568e-06,
"loss": 0.1891,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 187490178.0,
"step": 256
},
{
"entropy": 0.5135955810546875,
"epoch": 2.7934782608695654,
"grad_norm": 2.3436153874765773,
"learning_rate": 6.012175159774488e-06,
"loss": 0.1298,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 188244843.0,
"step": 257
},
{
"entropy": 0.5222015380859375,
"epoch": 2.8043478260869565,
"grad_norm": 3.255145078571251,
"learning_rate": 5.982800768857561e-06,
"loss": 0.1579,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 188985421.0,
"step": 258
},
{
"entropy": 0.5480804443359375,
"epoch": 2.8152173913043477,
"grad_norm": 2.7396951574762514,
"learning_rate": 5.953391051379904e-06,
"loss": 0.145,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 189704280.0,
"step": 259
},
{
"entropy": 0.55145263671875,
"epoch": 2.8260869565217392,
"grad_norm": 3.533652625022049,
"learning_rate": 5.9239470644674425e-06,
"loss": 0.1493,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 190430021.0,
"step": 260
},
{
"entropy": 0.5504913330078125,
"epoch": 2.8369565217391304,
"grad_norm": 2.3908884569654347,
"learning_rate": 5.894469866477905e-06,
"loss": 0.1492,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 191164968.0,
"step": 261
},
{
"entropy": 0.553436279296875,
"epoch": 2.8478260869565215,
"grad_norm": 1.8403956420887753,
"learning_rate": 5.864960516962791e-06,
"loss": 0.157,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 191897977.0,
"step": 262
},
{
"entropy": 0.5384979248046875,
"epoch": 2.858695652173913,
"grad_norm": 2.840652908639724,
"learning_rate": 5.835420076629273e-06,
"loss": 0.1421,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 192644903.0,
"step": 263
},
{
"entropy": 0.5287322998046875,
"epoch": 2.869565217391304,
"grad_norm": 1.0019073525611903,
"learning_rate": 5.805849607302081e-06,
"loss": 0.1327,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 193387031.0,
"step": 264
},
{
"entropy": 0.5142059326171875,
"epoch": 2.880434782608696,
"grad_norm": 1.0267626138197101,
"learning_rate": 5.776250171885329e-06,
"loss": 0.1677,
"mean_token_accuracy": 0.9062500055879354,
"num_tokens": 194117382.0,
"step": 265
},
{
"entropy": 0.5182952880859375,
"epoch": 2.891304347826087,
"grad_norm": 2.459122457455405,
"learning_rate": 5.74662283432431e-06,
"loss": 0.1402,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 194861082.0,
"step": 266
},
{
"entropy": 0.4894866943359375,
"epoch": 2.9021739130434785,
"grad_norm": 2.2801769816915796,
"learning_rate": 5.716968659567256e-06,
"loss": 0.1338,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 195621115.0,
"step": 267
},
{
"entropy": 0.4940338134765625,
"epoch": 2.9130434782608696,
"grad_norm": 0.76096101535856,
"learning_rate": 5.687288713527051e-06,
"loss": 0.1329,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 196360159.0,
"step": 268
},
{
"entropy": 0.4987640380859375,
"epoch": 2.9239130434782608,
"grad_norm": 1.1507883154034404,
"learning_rate": 5.6575840630429295e-06,
"loss": 0.161,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 197074701.0,
"step": 269
},
{
"entropy": 0.478546142578125,
"epoch": 2.9347826086956523,
"grad_norm": 4.258626023619959,
"learning_rate": 5.627855775842116e-06,
"loss": 0.1538,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 197809329.0,
"step": 270
},
{
"entropy": 0.5005645751953125,
"epoch": 2.9456521739130435,
"grad_norm": 3.415851630206049,
"learning_rate": 5.598104920501455e-06,
"loss": 0.1445,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 198531748.0,
"step": 271
},
{
"entropy": 0.483428955078125,
"epoch": 2.9565217391304346,
"grad_norm": 1.8451160224609997,
"learning_rate": 5.568332566408995e-06,
"loss": 0.1229,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 199273498.0,
"step": 272
},
{
"entropy": 0.4969635009765625,
"epoch": 2.967391304347826,
"grad_norm": 1.3096913208218943,
"learning_rate": 5.538539783725556e-06,
"loss": 0.124,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 200009778.0,
"step": 273
},
{
"entropy": 0.5011138916015625,
"epoch": 2.9782608695652173,
"grad_norm": 2.6192863885199102,
"learning_rate": 5.508727643346257e-06,
"loss": 0.1422,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 200752432.0,
"step": 274
},
{
"entropy": 0.5101165771484375,
"epoch": 2.9891304347826084,
"grad_norm": 1.9357570578321412,
"learning_rate": 5.478897216862026e-06,
"loss": 0.1396,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 201496295.0,
"step": 275
},
{
"entropy": 0.4974517822265625,
"epoch": 3.0,
"grad_norm": 2.658448706719779,
"learning_rate": 5.4490495765210795e-06,
"loss": 0.1312,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 202238152.0,
"step": 276
},
{
"entropy": 0.4904632568359375,
"epoch": 3.010869565217391,
"grad_norm": 0.7111278030242015,
"learning_rate": 5.4191857951903825e-06,
"loss": 0.1297,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 203018911.0,
"step": 277
},
{
"entropy": 0.4854278564453125,
"epoch": 3.0217391304347827,
"grad_norm": 3.685677125315361,
"learning_rate": 5.389306946317089e-06,
"loss": 0.1452,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 203769923.0,
"step": 278
},
{
"entropy": 0.5066680908203125,
"epoch": 3.032608695652174,
"grad_norm": 5.926720540282865,
"learning_rate": 5.359414103889947e-06,
"loss": 0.1639,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 204482884.0,
"step": 279
},
{
"entropy": 0.5011138916015625,
"epoch": 3.0434782608695654,
"grad_norm": 3.828517904681439,
"learning_rate": 5.329508342400702e-06,
"loss": 0.1307,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 205197104.0,
"step": 280
},
{
"entropy": 0.5212249755859375,
"epoch": 3.0543478260869565,
"grad_norm": 2.371018766229332,
"learning_rate": 5.29959073680547e-06,
"loss": 0.1074,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 205909556.0,
"step": 281
},
{
"entropy": 0.4830169677734375,
"epoch": 3.0652173913043477,
"grad_norm": 3.4453484681704536,
"learning_rate": 5.2696623624861065e-06,
"loss": 0.122,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 206648457.0,
"step": 282
},
{
"entropy": 0.4988861083984375,
"epoch": 3.0760869565217392,
"grad_norm": 4.521405572801564,
"learning_rate": 5.239724295211541e-06,
"loss": 0.1369,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 207369598.0,
"step": 283
},
{
"entropy": 0.4881439208984375,
"epoch": 3.0869565217391304,
"grad_norm": 5.674022612225397,
"learning_rate": 5.209777611099117e-06,
"loss": 0.1471,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 208096720.0,
"step": 284
},
{
"entropy": 0.470123291015625,
"epoch": 3.097826086956522,
"grad_norm": 4.813184131741794,
"learning_rate": 5.179823386575908e-06,
"loss": 0.1037,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 208844663.0,
"step": 285
},
{
"entropy": 0.4812469482421875,
"epoch": 3.108695652173913,
"grad_norm": 0.935293372415571,
"learning_rate": 5.1498626983400215e-06,
"loss": 0.1306,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 209580199.0,
"step": 286
},
{
"entropy": 0.487884521484375,
"epoch": 3.119565217391304,
"grad_norm": 1.853600268381173,
"learning_rate": 5.11989662332191e-06,
"loss": 0.1191,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 210310644.0,
"step": 287
},
{
"entropy": 0.478851318359375,
"epoch": 3.130434782608696,
"grad_norm": 2.6861430284164887,
"learning_rate": 5.089926238645645e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 211009831.0,
"step": 288
},
{
"entropy": 0.4602813720703125,
"epoch": 3.141304347826087,
"grad_norm": 0.9379319523834283,
"learning_rate": 5.059952621590216e-06,
"loss": 0.1145,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 211728283.0,
"step": 289
},
{
"entropy": 0.4534759521484375,
"epoch": 3.1521739130434785,
"grad_norm": 66.38606728912778,
"learning_rate": 5.029976849550789e-06,
"loss": 0.2548,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 212461065.0,
"step": 290
},
{
"entropy": 0.4560089111328125,
"epoch": 3.1630434782608696,
"grad_norm": 16.872257168957795,
"learning_rate": 5e-06,
"loss": 0.1922,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 213186327.0,
"step": 291
},
{
"entropy": 0.4451904296875,
"epoch": 3.1739130434782608,
"grad_norm": 1.5461776477660296,
"learning_rate": 4.970023150449212e-06,
"loss": 0.1435,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 213911814.0,
"step": 292
},
{
"entropy": 0.4291839599609375,
"epoch": 3.1847826086956523,
"grad_norm": 1.710426030936313,
"learning_rate": 4.940047378409786e-06,
"loss": 0.1205,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 214654874.0,
"step": 293
},
{
"entropy": 0.423095703125,
"epoch": 3.1956521739130435,
"grad_norm": 1.0827875337801205,
"learning_rate": 4.910073761354354e-06,
"loss": 0.1095,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 215389761.0,
"step": 294
},
{
"entropy": 0.4220123291015625,
"epoch": 3.2065217391304346,
"grad_norm": 5.1333586379045855,
"learning_rate": 4.880103376678092e-06,
"loss": 0.1316,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 216127217.0,
"step": 295
},
{
"entropy": 0.4204559326171875,
"epoch": 3.217391304347826,
"grad_norm": 1.9567851992549676,
"learning_rate": 4.85013730165998e-06,
"loss": 0.1523,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 216859096.0,
"step": 296
},
{
"entropy": 0.4506072998046875,
"epoch": 3.2282608695652173,
"grad_norm": 1.1177237092438557,
"learning_rate": 4.820176613424095e-06,
"loss": 0.1092,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 217571797.0,
"step": 297
},
{
"entropy": 0.4485321044921875,
"epoch": 3.239130434782609,
"grad_norm": 1.0790133746273967,
"learning_rate": 4.790222388900884e-06,
"loss": 0.1246,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 218283906.0,
"step": 298
},
{
"entropy": 0.434783935546875,
"epoch": 3.25,
"grad_norm": 1.023940043012299,
"learning_rate": 4.76027570478846e-06,
"loss": 0.1359,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 219029245.0,
"step": 299
},
{
"entropy": 0.424102783203125,
"epoch": 3.260869565217391,
"grad_norm": 1.4712175212615914,
"learning_rate": 4.730337637513895e-06,
"loss": 0.1248,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 219788983.0,
"step": 300
},
{
"entropy": 0.4358978271484375,
"epoch": 3.2717391304347827,
"grad_norm": 1.466288039664087,
"learning_rate": 4.7004092631945315e-06,
"loss": 0.1388,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 220519820.0,
"step": 301
},
{
"entropy": 0.465911865234375,
"epoch": 3.282608695652174,
"grad_norm": 1.3963153804305062,
"learning_rate": 4.6704916575993005e-06,
"loss": 0.118,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 221236275.0,
"step": 302
},
{
"entropy": 0.456451416015625,
"epoch": 3.2934782608695654,
"grad_norm": 1.2767429519721747,
"learning_rate": 4.640585896110054e-06,
"loss": 0.1136,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 221950844.0,
"step": 303
},
{
"entropy": 0.448486328125,
"epoch": 3.3043478260869565,
"grad_norm": 1.9283093557056667,
"learning_rate": 4.610693053682912e-06,
"loss": 0.1205,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 222698753.0,
"step": 304
},
{
"entropy": 0.460296630859375,
"epoch": 3.3152173913043477,
"grad_norm": 1.602506073665712,
"learning_rate": 4.580814204809618e-06,
"loss": 0.1194,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 223409864.0,
"step": 305
},
{
"entropy": 0.4370574951171875,
"epoch": 3.3260869565217392,
"grad_norm": 2.812987069937644,
"learning_rate": 4.550950423478923e-06,
"loss": 0.1207,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 224144482.0,
"step": 306
},
{
"entropy": 0.448516845703125,
"epoch": 3.3369565217391304,
"grad_norm": 2.7268302318992252,
"learning_rate": 4.521102783137976e-06,
"loss": 0.1197,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 224874129.0,
"step": 307
},
{
"entropy": 0.4384918212890625,
"epoch": 3.3478260869565215,
"grad_norm": 1.8209870494232845,
"learning_rate": 4.491272356653744e-06,
"loss": 0.1359,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 225607747.0,
"step": 308
},
{
"entropy": 0.4431610107421875,
"epoch": 3.358695652173913,
"grad_norm": 1.8893569417936162,
"learning_rate": 4.4614602162744455e-06,
"loss": 0.1397,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 226351883.0,
"step": 309
},
{
"entropy": 0.4349212646484375,
"epoch": 3.369565217391304,
"grad_norm": 2.7117060715938406,
"learning_rate": 4.431667433591006e-06,
"loss": 0.1505,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 227071902.0,
"step": 310
},
{
"entropy": 0.4327392578125,
"epoch": 3.380434782608696,
"grad_norm": 3.2489408268123703,
"learning_rate": 4.401895079498547e-06,
"loss": 0.1347,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 227818670.0,
"step": 311
},
{
"entropy": 0.4308319091796875,
"epoch": 3.391304347826087,
"grad_norm": 1.036775916385551,
"learning_rate": 4.372144224157886e-06,
"loss": 0.1275,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 228554871.0,
"step": 312
},
{
"entropy": 0.45361328125,
"epoch": 3.4021739130434785,
"grad_norm": 1.7332669275989192,
"learning_rate": 4.342415936957073e-06,
"loss": 0.0901,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 229248652.0,
"step": 313
},
{
"entropy": 0.4354248046875,
"epoch": 3.4130434782608696,
"grad_norm": 3.623367592607393,
"learning_rate": 4.312711286472951e-06,
"loss": 0.1053,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 230001926.0,
"step": 314
},
{
"entropy": 0.425079345703125,
"epoch": 3.4239130434782608,
"grad_norm": 1.241563582622002,
"learning_rate": 4.2830313404327475e-06,
"loss": 0.1394,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 230730268.0,
"step": 315
},
{
"entropy": 0.421630859375,
"epoch": 3.4347826086956523,
"grad_norm": 1.6961274867544411,
"learning_rate": 4.253377165675691e-06,
"loss": 0.1254,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 231470870.0,
"step": 316
},
{
"entropy": 0.449676513671875,
"epoch": 3.4456521739130435,
"grad_norm": 1.4182450270471312,
"learning_rate": 4.223749828114672e-06,
"loss": 0.1227,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 232205169.0,
"step": 317
},
{
"entropy": 0.42352294921875,
"epoch": 3.4565217391304346,
"grad_norm": 2.7039132599106854,
"learning_rate": 4.19415039269792e-06,
"loss": 0.1415,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 232956149.0,
"step": 318
},
{
"entropy": 0.4373626708984375,
"epoch": 3.467391304347826,
"grad_norm": 0.8829096146841485,
"learning_rate": 4.1645799233707286e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 233695610.0,
"step": 319
},
{
"entropy": 0.430999755859375,
"epoch": 3.4782608695652173,
"grad_norm": 2.6434624794405757,
"learning_rate": 4.1350394830372106e-06,
"loss": 0.1252,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 234423864.0,
"step": 320
},
{
"entropy": 0.4183349609375,
"epoch": 3.489130434782609,
"grad_norm": 1.4738342290133946,
"learning_rate": 4.105530133522096e-06,
"loss": 0.1179,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 235180826.0,
"step": 321
},
{
"entropy": 0.422515869140625,
"epoch": 3.5,
"grad_norm": 2.2621689843040866,
"learning_rate": 4.076052935532559e-06,
"loss": 0.1521,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 235940658.0,
"step": 322
},
{
"entropy": 0.422821044921875,
"epoch": 3.5108695652173916,
"grad_norm": 1.5419057612886151,
"learning_rate": 4.046608948620098e-06,
"loss": 0.1066,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 236655968.0,
"step": 323
},
{
"entropy": 0.4370269775390625,
"epoch": 3.5217391304347827,
"grad_norm": 1.9425609756123423,
"learning_rate": 4.017199231142441e-06,
"loss": 0.1161,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 237371740.0,
"step": 324
},
{
"entropy": 0.4402618408203125,
"epoch": 3.532608695652174,
"grad_norm": 2.2856146900763683,
"learning_rate": 3.987824840225512e-06,
"loss": 0.1788,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 238102988.0,
"step": 325
},
{
"entropy": 0.4551849365234375,
"epoch": 3.5434782608695654,
"grad_norm": 2.8367564342261473,
"learning_rate": 3.9584868317254325e-06,
"loss": 0.1046,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 238832212.0,
"step": 326
},
{
"entropy": 0.442779541015625,
"epoch": 3.5543478260869565,
"grad_norm": 1.041413390162045,
"learning_rate": 3.92918626019056e-06,
"loss": 0.0965,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 239560698.0,
"step": 327
},
{
"entropy": 0.441986083984375,
"epoch": 3.5652173913043477,
"grad_norm": 3.79920854717062,
"learning_rate": 3.8999241788235896e-06,
"loss": 0.1541,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 240284379.0,
"step": 328
},
{
"entropy": 0.430267333984375,
"epoch": 3.5760869565217392,
"grad_norm": 1.5276414681228787,
"learning_rate": 3.8707016394436985e-06,
"loss": 0.1275,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 241043511.0,
"step": 329
},
{
"entropy": 0.4509429931640625,
"epoch": 3.5869565217391304,
"grad_norm": 1.2736529719713083,
"learning_rate": 3.841519692448732e-06,
"loss": 0.1286,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 241759435.0,
"step": 330
},
{
"entropy": 0.45465087890625,
"epoch": 3.5978260869565215,
"grad_norm": 1.9141550160209704,
"learning_rate": 3.8123793867774573e-06,
"loss": 0.0899,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 242491369.0,
"step": 331
},
{
"entropy": 0.5021209716796875,
"epoch": 3.608695652173913,
"grad_norm": 1.232332216642473,
"learning_rate": 3.7832817698718456e-06,
"loss": 0.1205,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 243199357.0,
"step": 332
},
{
"entropy": 0.4455413818359375,
"epoch": 3.619565217391304,
"grad_norm": 2.1444573158673723,
"learning_rate": 3.754227887639434e-06,
"loss": 0.1158,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 243912435.0,
"step": 333
},
{
"entropy": 0.4324493408203125,
"epoch": 3.630434782608696,
"grad_norm": 1.617461677829012,
"learning_rate": 3.725218784415723e-06,
"loss": 0.1092,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 244654934.0,
"step": 334
},
{
"entropy": 0.444549560546875,
"epoch": 3.641304347826087,
"grad_norm": 1.377702218043193,
"learning_rate": 3.6962555029266488e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 245389036.0,
"step": 335
},
{
"entropy": 0.434906005859375,
"epoch": 3.6521739130434785,
"grad_norm": 1.727264760442294,
"learning_rate": 3.667339084251087e-06,
"loss": 0.1071,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 246134106.0,
"step": 336
},
{
"entropy": 0.43316650390625,
"epoch": 3.6630434782608696,
"grad_norm": 1.3954090035983964,
"learning_rate": 3.638470567783442e-06,
"loss": 0.1179,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 246849956.0,
"step": 337
},
{
"entropy": 0.453948974609375,
"epoch": 3.6739130434782608,
"grad_norm": 1.6447060709500603,
"learning_rate": 3.609650991196285e-06,
"loss": 0.0936,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 247577548.0,
"step": 338
},
{
"entropy": 0.4385833740234375,
"epoch": 3.6847826086956523,
"grad_norm": 1.887620262109647,
"learning_rate": 3.5808813904030517e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 248301732.0,
"step": 339
},
{
"entropy": 0.43109130859375,
"epoch": 3.6956521739130435,
"grad_norm": 1.5374227030971785,
"learning_rate": 3.5521627995208146e-06,
"loss": 0.088,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 249061110.0,
"step": 340
},
{
"entropy": 0.442901611328125,
"epoch": 3.7065217391304346,
"grad_norm": 3.639926746576837,
"learning_rate": 3.523496250833098e-06,
"loss": 0.0975,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 249771909.0,
"step": 341
},
{
"entropy": 0.437469482421875,
"epoch": 3.717391304347826,
"grad_norm": 2.4252733850331043,
"learning_rate": 3.4948827747527846e-06,
"loss": 0.1198,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 250508824.0,
"step": 342
},
{
"entropy": 0.43988037109375,
"epoch": 3.7282608695652173,
"grad_norm": 2.470431180963349,
"learning_rate": 3.466323399785072e-06,
"loss": 0.1032,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 251258370.0,
"step": 343
},
{
"entropy": 0.43865966796875,
"epoch": 3.7391304347826084,
"grad_norm": 3.615490506805586,
"learning_rate": 3.4378191524905104e-06,
"loss": 0.1176,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 251982193.0,
"step": 344
},
{
"entropy": 0.434356689453125,
"epoch": 3.75,
"grad_norm": 4.953188731098825,
"learning_rate": 3.4093710574480926e-06,
"loss": 0.0944,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 252725373.0,
"step": 345
},
{
"entropy": 0.441253662109375,
"epoch": 3.7608695652173916,
"grad_norm": 2.903524695818942,
"learning_rate": 3.3809801372184305e-06,
"loss": 0.1043,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 253444253.0,
"step": 346
},
{
"entropy": 0.440887451171875,
"epoch": 3.7717391304347827,
"grad_norm": 1.8484878167112617,
"learning_rate": 3.352647412307002e-06,
"loss": 0.0971,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 254196231.0,
"step": 347
},
{
"entropy": 0.4503631591796875,
"epoch": 3.782608695652174,
"grad_norm": 2.3324615390252603,
"learning_rate": 3.3243739011274645e-06,
"loss": 0.0915,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 254897104.0,
"step": 348
},
{
"entropy": 0.42547607421875,
"epoch": 3.7934782608695654,
"grad_norm": 1.8126624217835543,
"learning_rate": 3.296160619965056e-06,
"loss": 0.1015,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 255625847.0,
"step": 349
},
{
"entropy": 0.4334869384765625,
"epoch": 3.8043478260869565,
"grad_norm": 3.1451739862900654,
"learning_rate": 3.2680085829400553e-06,
"loss": 0.1178,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 256357804.0,
"step": 350
},
{
"entropy": 0.44500732421875,
"epoch": 3.8152173913043477,
"grad_norm": 1.366277116524626,
"learning_rate": 3.2399188019713325e-06,
"loss": 0.0686,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 257106678.0,
"step": 351
},
{
"entropy": 0.4403076171875,
"epoch": 3.8260869565217392,
"grad_norm": 1.8547202991721121,
"learning_rate": 3.2118922867399776e-06,
"loss": 0.0654,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 257830430.0,
"step": 352
},
{
"entropy": 0.4456939697265625,
"epoch": 3.8369565217391304,
"grad_norm": 2.5528798400662334,
"learning_rate": 3.183930044653014e-06,
"loss": 0.083,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 258564915.0,
"step": 353
},
{
"entropy": 0.4347991943359375,
"epoch": 3.8478260869565215,
"grad_norm": 2.7742470177999454,
"learning_rate": 3.156033080807175e-06,
"loss": 0.0773,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 259323546.0,
"step": 354
},
{
"entropy": 0.435638427734375,
"epoch": 3.858695652173913,
"grad_norm": 2.78613102328482,
"learning_rate": 3.128202397952781e-06,
"loss": 0.1138,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 260059087.0,
"step": 355
},
{
"entropy": 0.448394775390625,
"epoch": 3.869565217391304,
"grad_norm": 3.749640348606413,
"learning_rate": 3.1004389964576976e-06,
"loss": 0.1093,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 260766551.0,
"step": 356
},
{
"entropy": 0.4436798095703125,
"epoch": 3.880434782608696,
"grad_norm": 4.7477247915918195,
"learning_rate": 3.0727438742713766e-06,
"loss": 0.1368,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 261503811.0,
"step": 357
},
{
"entropy": 0.4672088623046875,
"epoch": 3.891304347826087,
"grad_norm": 3.833134489829662,
"learning_rate": 3.045118026888988e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 262230419.0,
"step": 358
},
{
"entropy": 0.464874267578125,
"epoch": 3.9021739130434785,
"grad_norm": 3.0516617638198302,
"learning_rate": 3.0175624473156315e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 262970363.0,
"step": 359
},
{
"entropy": 0.4435577392578125,
"epoch": 3.9130434782608696,
"grad_norm": 3.7678168274529997,
"learning_rate": 2.9900781260306427e-06,
"loss": 0.0866,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 263725373.0,
"step": 360
},
{
"entropy": 0.450714111328125,
"epoch": 3.9239130434782608,
"grad_norm": 2.043005160780546,
"learning_rate": 2.962666050951997e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 264460162.0,
"step": 361
},
{
"entropy": 0.4597625732421875,
"epoch": 3.9347826086956523,
"grad_norm": 2.2439286805770218,
"learning_rate": 2.9353272074007933e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 265193348.0,
"step": 362
},
{
"entropy": 0.4772186279296875,
"epoch": 3.9456521739130435,
"grad_norm": 3.008568678345949,
"learning_rate": 2.9080625780658455e-06,
"loss": 0.1165,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 265886823.0,
"step": 363
},
{
"entropy": 0.459228515625,
"epoch": 3.9565217391304346,
"grad_norm": 2.159391990479657,
"learning_rate": 2.8808731429683433e-06,
"loss": 0.0739,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 266623713.0,
"step": 364
},
{
"entropy": 0.466949462890625,
"epoch": 3.967391304347826,
"grad_norm": 2.8677224790187372,
"learning_rate": 2.853759879426644e-06,
"loss": 0.1018,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 267356085.0,
"step": 365
},
{
"entropy": 0.46905517578125,
"epoch": 3.9782608695652173,
"grad_norm": 3.763284574242391,
"learning_rate": 2.8267237620211296e-06,
"loss": 0.1038,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 268076004.0,
"step": 366
},
{
"entropy": 0.4678497314453125,
"epoch": 3.9891304347826084,
"grad_norm": 3.601468098277866,
"learning_rate": 2.7997657625591866e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 268830382.0,
"step": 367
},
{
"entropy": 0.468475341796875,
"epoch": 4.0,
"grad_norm": 2.611708119130719,
"learning_rate": 2.772886850040264e-06,
"loss": 0.0904,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 269574137.0,
"step": 368
},
{
"entropy": 0.4866485595703125,
"epoch": 4.010869565217392,
"grad_norm": 2.7300781208155427,
"learning_rate": 2.7460879906210485e-06,
"loss": 0.064,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 270306277.0,
"step": 369
},
{
"entropy": 0.4772186279296875,
"epoch": 4.021739130434782,
"grad_norm": 1.4961978907102276,
"learning_rate": 2.7193701475807376e-06,
"loss": 0.063,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 271027387.0,
"step": 370
},
{
"entropy": 0.4515380859375,
"epoch": 4.032608695652174,
"grad_norm": 1.7987674036485675,
"learning_rate": 2.6927342812864117e-06,
"loss": 0.0798,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 271778153.0,
"step": 371
},
{
"entropy": 0.45458984375,
"epoch": 4.043478260869565,
"grad_norm": 2.9488112617854667,
"learning_rate": 2.6661813491585133e-06,
"loss": 0.052,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 272503209.0,
"step": 372
},
{
"entropy": 0.4464874267578125,
"epoch": 4.054347826086956,
"grad_norm": 2.625401389327965,
"learning_rate": 2.6397123056364364e-06,
"loss": 0.0665,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 273237290.0,
"step": 373
},
{
"entropy": 0.4439697265625,
"epoch": 4.065217391304348,
"grad_norm": 1.4718508783115323,
"learning_rate": 2.613328102144216e-06,
"loss": 0.0606,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 273964978.0,
"step": 374
},
{
"entropy": 0.4301605224609375,
"epoch": 4.076086956521739,
"grad_norm": 3.2460386674373254,
"learning_rate": 2.5870296870563287e-06,
"loss": 0.0876,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 274693952.0,
"step": 375
},
{
"entropy": 0.418304443359375,
"epoch": 4.086956521739131,
"grad_norm": 2.35569042854768,
"learning_rate": 2.5608180056636123e-06,
"loss": 0.0904,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 275447596.0,
"step": 376
},
{
"entropy": 0.4223175048828125,
"epoch": 4.0978260869565215,
"grad_norm": 2.4698945876545335,
"learning_rate": 2.534694000139273e-06,
"loss": 0.0559,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 276183120.0,
"step": 377
},
{
"entropy": 0.4175567626953125,
"epoch": 4.108695652173913,
"grad_norm": 1.928510597054212,
"learning_rate": 2.5086586095050314e-06,
"loss": 0.0494,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 276914364.0,
"step": 378
},
{
"entropy": 0.428558349609375,
"epoch": 4.119565217391305,
"grad_norm": 3.457540026725172,
"learning_rate": 2.482712769597363e-06,
"loss": 0.039,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 277625330.0,
"step": 379
},
{
"entropy": 0.41448974609375,
"epoch": 4.130434782608695,
"grad_norm": 2.085040589673449,
"learning_rate": 2.4568574130338624e-06,
"loss": 0.046,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 278372481.0,
"step": 380
},
{
"entropy": 0.415679931640625,
"epoch": 4.141304347826087,
"grad_norm": 3.983539704513102,
"learning_rate": 2.4310934691797207e-06,
"loss": 0.0509,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 279098982.0,
"step": 381
},
{
"entropy": 0.4116668701171875,
"epoch": 4.1521739130434785,
"grad_norm": 1.9912291616847635,
"learning_rate": 2.405421864114318e-06,
"loss": 0.0526,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 279812107.0,
"step": 382
},
{
"entropy": 0.42218017578125,
"epoch": 4.163043478260869,
"grad_norm": 2.704559272932083,
"learning_rate": 2.379843520597937e-06,
"loss": 0.0681,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 280527610.0,
"step": 383
},
{
"entropy": 0.4107208251953125,
"epoch": 4.173913043478261,
"grad_norm": 3.5111658042208567,
"learning_rate": 2.3543593580385925e-06,
"loss": 0.0492,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 281253019.0,
"step": 384
},
{
"entropy": 0.4045867919921875,
"epoch": 4.184782608695652,
"grad_norm": 5.079290167429306,
"learning_rate": 2.3289702924589914e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 281998234.0,
"step": 385
},
{
"entropy": 0.4412689208984375,
"epoch": 4.195652173913044,
"grad_norm": 5.139561224962418,
"learning_rate": 2.303677236463593e-06,
"loss": 0.0898,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 282738530.0,
"step": 386
},
{
"entropy": 0.4209747314453125,
"epoch": 4.206521739130435,
"grad_norm": 4.810884443758544,
"learning_rate": 2.2784810992058155e-06,
"loss": 0.0537,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 283445582.0,
"step": 387
},
{
"entropy": 0.406951904296875,
"epoch": 4.217391304347826,
"grad_norm": 2.7111175696393426,
"learning_rate": 2.2533827863553552e-06,
"loss": 0.0662,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 284183621.0,
"step": 388
},
{
"entropy": 0.397796630859375,
"epoch": 4.228260869565218,
"grad_norm": 4.9399851145714075,
"learning_rate": 2.2283832000656304e-06,
"loss": 0.0619,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 284935356.0,
"step": 389
},
{
"entropy": 0.411773681640625,
"epoch": 4.239130434782608,
"grad_norm": 4.9335772334323025,
"learning_rate": 2.2034832389413536e-06,
"loss": 0.0565,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 285665897.0,
"step": 390
},
{
"entropy": 0.4003753662109375,
"epoch": 4.25,
"grad_norm": 2.312671320421061,
"learning_rate": 2.178683798006234e-06,
"loss": 0.0493,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 286411607.0,
"step": 391
},
{
"entropy": 0.4046630859375,
"epoch": 4.260869565217392,
"grad_norm": 2.319802222347104,
"learning_rate": 2.153985768670803e-06,
"loss": 0.0513,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 287131523.0,
"step": 392
},
{
"entropy": 0.4002532958984375,
"epoch": 4.271739130434782,
"grad_norm": 2.0631506419054917,
"learning_rate": 2.1293900387003742e-06,
"loss": 0.0504,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 287870432.0,
"step": 393
},
{
"entropy": 0.4129486083984375,
"epoch": 4.282608695652174,
"grad_norm": 1.6882906349576583,
"learning_rate": 2.104897492183135e-06,
"loss": 0.0339,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 288590314.0,
"step": 394
},
{
"entropy": 0.399658203125,
"epoch": 4.293478260869565,
"grad_norm": 3.6116780855643635,
"learning_rate": 2.080509009498364e-06,
"loss": 0.0766,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 289342128.0,
"step": 395
},
{
"entropy": 0.4263458251953125,
"epoch": 4.304347826086957,
"grad_norm": 1.9675619499566213,
"learning_rate": 2.056225467284786e-06,
"loss": 0.0489,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 290061861.0,
"step": 396
},
{
"entropy": 0.407196044921875,
"epoch": 4.315217391304348,
"grad_norm": 1.993285317102883,
"learning_rate": 2.0320477384090665e-06,
"loss": 0.0544,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 290800081.0,
"step": 397
},
{
"entropy": 0.4045867919921875,
"epoch": 4.326086956521739,
"grad_norm": 4.819721946991274,
"learning_rate": 2.007976691934432e-06,
"loss": 0.0495,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 291522494.0,
"step": 398
},
{
"entropy": 0.395233154296875,
"epoch": 4.336956521739131,
"grad_norm": 4.76315710555539,
"learning_rate": 1.9840131930894334e-06,
"loss": 0.0724,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 292257524.0,
"step": 399
},
{
"entropy": 0.408905029296875,
"epoch": 4.3478260869565215,
"grad_norm": 2.8754184768577162,
"learning_rate": 1.9601581032368457e-06,
"loss": 0.0358,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 293018088.0,
"step": 400
},
{
"entropy": 0.414581298828125,
"epoch": 4.358695652173913,
"grad_norm": 2.4081677258047987,
"learning_rate": 1.936412279842705e-06,
"loss": 0.0491,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 293750057.0,
"step": 401
},
{
"entropy": 0.402923583984375,
"epoch": 4.369565217391305,
"grad_norm": 3.0372455968061867,
"learning_rate": 1.912776576445488e-06,
"loss": 0.0473,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 294478098.0,
"step": 402
},
{
"entropy": 0.434967041015625,
"epoch": 4.380434782608695,
"grad_norm": 3.812490747787835,
"learning_rate": 1.8892518426254363e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 295180680.0,
"step": 403
},
{
"entropy": 0.412994384765625,
"epoch": 4.391304347826087,
"grad_norm": 3.585209828081671,
"learning_rate": 1.8658389239740094e-06,
"loss": 0.0546,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 295903805.0,
"step": 404
},
{
"entropy": 0.43218994140625,
"epoch": 4.4021739130434785,
"grad_norm": 2.812768680407068,
"learning_rate": 1.8425386620634961e-06,
"loss": 0.0612,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 296641989.0,
"step": 405
},
{
"entropy": 0.4144134521484375,
"epoch": 4.413043478260869,
"grad_norm": 2.821636952059335,
"learning_rate": 1.8193518944167625e-06,
"loss": 0.0411,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 297380486.0,
"step": 406
},
{
"entropy": 0.416015625,
"epoch": 4.423913043478261,
"grad_norm": 2.6578372050676173,
"learning_rate": 1.7962794544771477e-06,
"loss": 0.0331,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 298106103.0,
"step": 407
},
{
"entropy": 0.4125518798828125,
"epoch": 4.434782608695652,
"grad_norm": 4.680570450169205,
"learning_rate": 1.773322171578512e-06,
"loss": 0.0548,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 298864936.0,
"step": 408
},
{
"entropy": 0.43536376953125,
"epoch": 4.445652173913043,
"grad_norm": 3.200460191935974,
"learning_rate": 1.7504808709154104e-06,
"loss": 0.0725,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 299589040.0,
"step": 409
},
{
"entropy": 0.42657470703125,
"epoch": 4.456521739130435,
"grad_norm": 2.771274243214472,
"learning_rate": 1.727756373513449e-06,
"loss": 0.0776,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 300295523.0,
"step": 410
},
{
"entropy": 0.4076995849609375,
"epoch": 4.467391304347826,
"grad_norm": 2.548796401378428,
"learning_rate": 1.7051494961997623e-06,
"loss": 0.0582,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 301005765.0,
"step": 411
},
{
"entropy": 0.4084625244140625,
"epoch": 4.478260869565218,
"grad_norm": 1.9994386544286782,
"learning_rate": 1.6826610515736618e-06,
"loss": 0.0292,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 301748290.0,
"step": 412
},
{
"entropy": 0.4356689453125,
"epoch": 4.489130434782608,
"grad_norm": 2.2197819076897787,
"learning_rate": 1.660291847977415e-06,
"loss": 0.041,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 302451329.0,
"step": 413
},
{
"entropy": 0.41204833984375,
"epoch": 4.5,
"grad_norm": 2.455349815343268,
"learning_rate": 1.6380426894672003e-06,
"loss": 0.0457,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 303175092.0,
"step": 414
},
{
"entropy": 0.4294891357421875,
"epoch": 4.510869565217392,
"grad_norm": 2.710943705952911,
"learning_rate": 1.6159143757842005e-06,
"loss": 0.0454,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 303896428.0,
"step": 415
},
{
"entropy": 0.4051971435546875,
"epoch": 4.521739130434782,
"grad_norm": 3.0032821856789065,
"learning_rate": 1.5939077023258547e-06,
"loss": 0.0547,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 304634834.0,
"step": 416
},
{
"entropy": 0.445343017578125,
"epoch": 4.532608695652174,
"grad_norm": 2.2530273539388146,
"learning_rate": 1.5720234601172767e-06,
"loss": 0.0293,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 305325860.0,
"step": 417
},
{
"entropy": 0.4124908447265625,
"epoch": 4.543478260869565,
"grad_norm": 3.1330667538082126,
"learning_rate": 1.5502624357828118e-06,
"loss": 0.0802,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 306089245.0,
"step": 418
},
{
"entropy": 0.41497802734375,
"epoch": 4.554347826086957,
"grad_norm": 2.2467240345711645,
"learning_rate": 1.5286254115177623e-06,
"loss": 0.0373,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 306822344.0,
"step": 419
},
{
"entropy": 0.4322509765625,
"epoch": 4.565217391304348,
"grad_norm": 2.2227469183901736,
"learning_rate": 1.5071131650602782e-06,
"loss": 0.0372,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 307541285.0,
"step": 420
},
{
"entropy": 0.413787841796875,
"epoch": 4.576086956521739,
"grad_norm": 2.0709109351711836,
"learning_rate": 1.485726469663401e-06,
"loss": 0.0316,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 308256892.0,
"step": 421
},
{
"entropy": 0.422515869140625,
"epoch": 4.586956521739131,
"grad_norm": 1.5331678246800737,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0215,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 308999745.0,
"step": 422
},
{
"entropy": 0.4016265869140625,
"epoch": 4.5978260869565215,
"grad_norm": 2.5471266140170443,
"learning_rate": 1.4433328024714583e-06,
"loss": 0.069,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 309740799.0,
"step": 423
},
{
"entropy": 0.420562744140625,
"epoch": 4.608695652173913,
"grad_norm": 3.167188925399225,
"learning_rate": 1.422327354507575e-06,
"loss": 0.0406,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 310458494.0,
"step": 424
},
{
"entropy": 0.409942626953125,
"epoch": 4.619565217391305,
"grad_norm": 2.3943435455869926,
"learning_rate": 1.4014505052118893e-06,
"loss": 0.054,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 311194917.0,
"step": 425
},
{
"entropy": 0.4096221923828125,
"epoch": 4.630434782608695,
"grad_norm": 2.212368683866918,
"learning_rate": 1.3807030049982284e-06,
"loss": 0.0315,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 311938164.0,
"step": 426
},
{
"entropy": 0.39923095703125,
"epoch": 4.641304347826087,
"grad_norm": 2.5878663864858718,
"learning_rate": 1.3600855996309937e-06,
"loss": 0.0391,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 312684433.0,
"step": 427
},
{
"entropy": 0.41021728515625,
"epoch": 4.6521739130434785,
"grad_norm": 2.560421888081383,
"learning_rate": 1.339599030198351e-06,
"loss": 0.0572,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 313420352.0,
"step": 428
},
{
"entropy": 0.4083251953125,
"epoch": 4.663043478260869,
"grad_norm": 3.054237583231214,
"learning_rate": 1.3192440330856005e-06,
"loss": 0.048,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 314169602.0,
"step": 429
},
{
"entropy": 0.4042510986328125,
"epoch": 4.673913043478261,
"grad_norm": 1.916804219563132,
"learning_rate": 1.2990213399487078e-06,
"loss": 0.0457,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 314913664.0,
"step": 430
},
{
"entropy": 0.4028472900390625,
"epoch": 4.684782608695652,
"grad_norm": 2.0422782684611116,
"learning_rate": 1.278931677687994e-06,
"loss": 0.0424,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 315646144.0,
"step": 431
},
{
"entropy": 0.4032135009765625,
"epoch": 4.695652173913043,
"grad_norm": 1.4351345675904197,
"learning_rate": 1.2589757684220182e-06,
"loss": 0.023,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 316380571.0,
"step": 432
},
{
"entropy": 0.4063262939453125,
"epoch": 4.706521739130435,
"grad_norm": 1.7418995711205454,
"learning_rate": 1.239154329461615e-06,
"loss": 0.0203,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 317129506.0,
"step": 433
},
{
"entropy": 0.4080352783203125,
"epoch": 4.717391304347826,
"grad_norm": 3.0546055267777277,
"learning_rate": 1.2194680732841125e-06,
"loss": 0.0284,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 317878755.0,
"step": 434
},
{
"entropy": 0.4080810546875,
"epoch": 4.728260869565218,
"grad_norm": 1.2821499432415668,
"learning_rate": 1.1999177075077278e-06,
"loss": 0.0173,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 318614683.0,
"step": 435
},
{
"entropy": 0.4033203125,
"epoch": 4.739130434782608,
"grad_norm": 2.7222945012776996,
"learning_rate": 1.1805039348661213e-06,
"loss": 0.0246,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 319344907.0,
"step": 436
},
{
"entropy": 0.408966064453125,
"epoch": 4.75,
"grad_norm": 3.1591784602515274,
"learning_rate": 1.1612274531831463e-06,
"loss": 0.0525,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 320081214.0,
"step": 437
},
{
"entropy": 0.4129791259765625,
"epoch": 4.760869565217392,
"grad_norm": 2.484789091986584,
"learning_rate": 1.1420889553477577e-06,
"loss": 0.0689,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 320808939.0,
"step": 438
},
{
"entropy": 0.4048614501953125,
"epoch": 4.771739130434782,
"grad_norm": 2.419811107661141,
"learning_rate": 1.1230891292891173e-06,
"loss": 0.0379,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 321535295.0,
"step": 439
},
{
"entropy": 0.414337158203125,
"epoch": 4.782608695652174,
"grad_norm": 2.064500040614814,
"learning_rate": 1.1042286579518556e-06,
"loss": 0.0403,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 322261465.0,
"step": 440
},
{
"entropy": 0.4001312255859375,
"epoch": 4.793478260869565,
"grad_norm": 2.297195906490076,
"learning_rate": 1.0855082192715294e-06,
"loss": 0.0453,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 323010771.0,
"step": 441
},
{
"entropy": 0.4105072021484375,
"epoch": 4.804347826086957,
"grad_norm": 2.2277803267926277,
"learning_rate": 1.0669284861502517e-06,
"loss": 0.0285,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 323747839.0,
"step": 442
},
{
"entropy": 0.39654541015625,
"epoch": 4.815217391304348,
"grad_norm": 2.2167121974335333,
"learning_rate": 1.0484901264325026e-06,
"loss": 0.0253,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 324512017.0,
"step": 443
},
{
"entropy": 0.4126739501953125,
"epoch": 4.826086956521739,
"grad_norm": 1.131288128180686,
"learning_rate": 1.0301938028811303e-06,
"loss": 0.0149,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 325231917.0,
"step": 444
},
{
"entropy": 0.4065704345703125,
"epoch": 4.836956521739131,
"grad_norm": 3.562991857009424,
"learning_rate": 1.0120401731535213e-06,
"loss": 0.0437,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 325965200.0,
"step": 445
},
{
"entropy": 0.3999481201171875,
"epoch": 4.8478260869565215,
"grad_norm": 3.485697515835889,
"learning_rate": 9.940298897779615e-07,
"loss": 0.0414,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 326710482.0,
"step": 446
},
{
"entropy": 0.412750244140625,
"epoch": 4.858695652173913,
"grad_norm": 1.4653396598394415,
"learning_rate": 9.761636001301872e-07,
"loss": 0.0185,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 327427331.0,
"step": 447
},
{
"entropy": 0.4132080078125,
"epoch": 4.869565217391305,
"grad_norm": 3.621100589978088,
"learning_rate": 9.58441946410108e-07,
"loss": 0.037,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 328147723.0,
"step": 448
},
{
"entropy": 0.4010467529296875,
"epoch": 4.880434782608695,
"grad_norm": 3.171039122195853,
"learning_rate": 9.408655656187282e-07,
"loss": 0.0256,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 328924086.0,
"step": 449
},
{
"entropy": 0.4055328369140625,
"epoch": 4.891304347826087,
"grad_norm": 2.523795123674481,
"learning_rate": 9.234350895352479e-07,
"loss": 0.0243,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 329658771.0,
"step": 450
},
{
"entropy": 0.4088897705078125,
"epoch": 4.9021739130434785,
"grad_norm": 3.42519876063796,
"learning_rate": 9.061511446943533e-07,
"loss": 0.023,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 330385752.0,
"step": 451
},
{
"entropy": 0.4098968505859375,
"epoch": 4.913043478260869,
"grad_norm": 2.01468212891926,
"learning_rate": 8.890143523636968e-07,
"loss": 0.0199,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 331104115.0,
"step": 452
},
{
"entropy": 0.4082794189453125,
"epoch": 4.923913043478261,
"grad_norm": 3.1125668067551993,
"learning_rate": 8.720253285215685e-07,
"loss": 0.0414,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 331824536.0,
"step": 453
},
{
"entropy": 0.396453857421875,
"epoch": 4.934782608695652,
"grad_norm": 2.7597760592043388,
"learning_rate": 8.551846838347489e-07,
"loss": 0.019,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 332563726.0,
"step": 454
},
{
"entropy": 0.420623779296875,
"epoch": 4.945652173913043,
"grad_norm": 3.494544234545317,
"learning_rate": 8.384930236365629e-07,
"loss": 0.0441,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 333288671.0,
"step": 455
},
{
"entropy": 0.391998291015625,
"epoch": 4.956521739130435,
"grad_norm": 2.2247266199545424,
"learning_rate": 8.219509479051202e-07,
"loss": 0.0278,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 334073387.0,
"step": 456
},
{
"entropy": 0.4291839599609375,
"epoch": 4.967391304347826,
"grad_norm": 2.3531506297864713,
"learning_rate": 8.055590512417499e-07,
"loss": 0.0297,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 334816728.0,
"step": 457
},
{
"entropy": 0.404052734375,
"epoch": 4.978260869565218,
"grad_norm": 1.5945315425002895,
"learning_rate": 7.893179228496261e-07,
"loss": 0.0242,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 335532419.0,
"step": 458
},
{
"entropy": 0.390106201171875,
"epoch": 4.989130434782608,
"grad_norm": 1.797015867033134,
"learning_rate": 7.732281465125907e-07,
"loss": 0.0203,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 336287282.0,
"step": 459
},
{
"entropy": 0.3868865966796875,
"epoch": 5.0,
"grad_norm": 4.054309515163909,
"learning_rate": 7.572903005741689e-07,
"loss": 0.038,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 337040768.0,
"step": 460
},
{
"entropy": 0.4239654541015625,
"epoch": 5.010869565217392,
"grad_norm": 2.037867510102136,
"learning_rate": 7.415049579167783e-07,
"loss": 0.0085,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 337766168.0,
"step": 461
},
{
"entropy": 0.3897705078125,
"epoch": 5.021739130434782,
"grad_norm": 4.130199703611077,
"learning_rate": 7.258726859411435e-07,
"loss": 0.0486,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 338509308.0,
"step": 462
},
{
"entropy": 0.4114532470703125,
"epoch": 5.032608695652174,
"grad_norm": 1.4084584784753773,
"learning_rate": 7.103940465458936e-07,
"loss": 0.0164,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 339241775.0,
"step": 463
},
{
"entropy": 0.3938446044921875,
"epoch": 5.043478260869565,
"grad_norm": 4.223867668677694,
"learning_rate": 6.950695961073684e-07,
"loss": 0.0266,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 339977899.0,
"step": 464
},
{
"entropy": 0.405853271484375,
"epoch": 5.054347826086956,
"grad_norm": 3.1513324473644087,
"learning_rate": 6.79899885459619e-07,
"loss": 0.0278,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 340699822.0,
"step": 465
},
{
"entropy": 0.402862548828125,
"epoch": 5.065217391304348,
"grad_norm": 1.9272349862009421,
"learning_rate": 6.64885459874608e-07,
"loss": 0.0237,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 341438940.0,
"step": 466
},
{
"entropy": 0.3831787109375,
"epoch": 5.076086956521739,
"grad_norm": 1.5225484641534506,
"learning_rate": 6.500268590426107e-07,
"loss": 0.0159,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 342184972.0,
"step": 467
},
{
"entropy": 0.3907012939453125,
"epoch": 5.086956521739131,
"grad_norm": 3.9148783887118763,
"learning_rate": 6.353246170528149e-07,
"loss": 0.0225,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 342940915.0,
"step": 468
},
{
"entropy": 0.40972900390625,
"epoch": 5.0978260869565215,
"grad_norm": 1.4281558107061891,
"learning_rate": 6.207792623741249e-07,
"loss": 0.0142,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 343675628.0,
"step": 469
},
{
"entropy": 0.40313720703125,
"epoch": 5.108695652173913,
"grad_norm": 1.206776637290113,
"learning_rate": 6.063913178361614e-07,
"loss": 0.0076,
"mean_token_accuracy": 1.0,
"num_tokens": 344397191.0,
"step": 470
},
{
"entropy": 0.385772705078125,
"epoch": 5.119565217391305,
"grad_norm": 2.2509076542953297,
"learning_rate": 5.921613006104765e-07,
"loss": 0.0214,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 345139527.0,
"step": 471
},
{
"entropy": 0.4058380126953125,
"epoch": 5.130434782608695,
"grad_norm": 2.681777682259471,
"learning_rate": 5.780897221919551e-07,
"loss": 0.0232,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 345859581.0,
"step": 472
},
{
"entropy": 0.3974609375,
"epoch": 5.141304347826087,
"grad_norm": 2.562368235939727,
"learning_rate": 5.641770883804365e-07,
"loss": 0.0277,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 346594728.0,
"step": 473
},
{
"entropy": 0.3978271484375,
"epoch": 5.1521739130434785,
"grad_norm": 1.260409714507234,
"learning_rate": 5.504238992625277e-07,
"loss": 0.0107,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 347345412.0,
"step": 474
},
{
"entropy": 0.394134521484375,
"epoch": 5.163043478260869,
"grad_norm": 2.8754774462772286,
"learning_rate": 5.368306491936326e-07,
"loss": 0.0158,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 348061065.0,
"step": 475
},
{
"entropy": 0.40869140625,
"epoch": 5.173913043478261,
"grad_norm": 1.023086461493808,
"learning_rate": 5.233978267801798e-07,
"loss": 0.0073,
"mean_token_accuracy": 1.0,
"num_tokens": 348783776.0,
"step": 476
},
{
"entropy": 0.401458740234375,
"epoch": 5.184782608695652,
"grad_norm": 1.4929118650933983,
"learning_rate": 5.101259148620618e-07,
"loss": 0.0099,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 349514350.0,
"step": 477
},
{
"entropy": 0.3917999267578125,
"epoch": 5.195652173913044,
"grad_norm": 1.294188438794479,
"learning_rate": 4.970153904952768e-07,
"loss": 0.0107,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 350248076.0,
"step": 478
},
{
"entropy": 0.3802642822265625,
"epoch": 5.206521739130435,
"grad_norm": 1.5868086446888314,
"learning_rate": 4.840667249347824e-07,
"loss": 0.0294,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 350991865.0,
"step": 479
},
{
"entropy": 0.38580322265625,
"epoch": 5.217391304347826,
"grad_norm": 1.7262483127382957,
"learning_rate": 4.7128038361755836e-07,
"loss": 0.0239,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 351745006.0,
"step": 480
},
{
"entropy": 0.4080352783203125,
"epoch": 5.228260869565218,
"grad_norm": 5.0340551829352,
"learning_rate": 4.586568261458729e-07,
"loss": 0.0455,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 352474084.0,
"step": 481
},
{
"entropy": 0.4060821533203125,
"epoch": 5.239130434782608,
"grad_norm": 2.8279201983140814,
"learning_rate": 4.461965062707646e-07,
"loss": 0.0234,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 353211715.0,
"step": 482
},
{
"entropy": 0.392120361328125,
"epoch": 5.25,
"grad_norm": 2.1056087405556725,
"learning_rate": 4.338998718757315e-07,
"loss": 0.0087,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 353950858.0,
"step": 483
},
{
"entropy": 0.4037322998046875,
"epoch": 5.260869565217392,
"grad_norm": 1.9071121793728274,
"learning_rate": 4.2176736496063406e-07,
"loss": 0.0047,
"mean_token_accuracy": 1.0,
"num_tokens": 354656306.0,
"step": 484
},
{
"entropy": 0.4117431640625,
"epoch": 5.271739130434782,
"grad_norm": 2.1832059782760695,
"learning_rate": 4.0979942162580387e-07,
"loss": 0.011,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 355373127.0,
"step": 485
},
{
"entropy": 0.387969970703125,
"epoch": 5.282608695652174,
"grad_norm": 0.9338603119206235,
"learning_rate": 3.979964720563728e-07,
"loss": 0.015,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 356134595.0,
"step": 486
},
{
"entropy": 0.3863067626953125,
"epoch": 5.293478260869565,
"grad_norm": 0.8035643508790813,
"learning_rate": 3.863589405068047e-07,
"loss": 0.0033,
"mean_token_accuracy": 1.0,
"num_tokens": 356874140.0,
"step": 487
},
{
"entropy": 0.3829498291015625,
"epoch": 5.304347826086957,
"grad_norm": 2.330137070929337,
"learning_rate": 3.748872452856506e-07,
"loss": 0.0091,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 357620724.0,
"step": 488
},
{
"entropy": 0.3927459716796875,
"epoch": 5.315217391304348,
"grad_norm": 1.2276302828645993,
"learning_rate": 3.63581798740511e-07,
"loss": 0.0049,
"mean_token_accuracy": 1.0,
"num_tokens": 358355003.0,
"step": 489
},
{
"entropy": 0.3846282958984375,
"epoch": 5.326086956521739,
"grad_norm": 3.1122288247660985,
"learning_rate": 3.524430072432117e-07,
"loss": 0.0311,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 359088947.0,
"step": 490
},
{
"entropy": 0.3803253173828125,
"epoch": 5.336956521739131,
"grad_norm": 1.031743535694978,
"learning_rate": 3.414712711752011e-07,
"loss": 0.0033,
"mean_token_accuracy": 1.0,
"num_tokens": 359839271.0,
"step": 491
},
{
"entropy": 0.392791748046875,
"epoch": 5.3478260869565215,
"grad_norm": 2.4003564062037217,
"learning_rate": 3.306669849131544e-07,
"loss": 0.0222,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 360555118.0,
"step": 492
},
{
"entropy": 0.4043121337890625,
"epoch": 5.358695652173913,
"grad_norm": 1.5916000916175594,
"learning_rate": 3.20030536814801e-07,
"loss": 0.0138,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 361282594.0,
"step": 493
},
{
"entropy": 0.3998870849609375,
"epoch": 5.369565217391305,
"grad_norm": 5.135801576812412,
"learning_rate": 3.095623092049632e-07,
"loss": 0.0457,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 362002556.0,
"step": 494
},
{
"entropy": 0.3865966796875,
"epoch": 5.380434782608695,
"grad_norm": 3.090721052570619,
"learning_rate": 2.992626783618152e-07,
"loss": 0.031,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 362743968.0,
"step": 495
},
{
"entropy": 0.38818359375,
"epoch": 5.391304347826087,
"grad_norm": 2.1833786614192396,
"learning_rate": 2.891320145033566e-07,
"loss": 0.0093,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 363477550.0,
"step": 496
},
{
"entropy": 0.3900604248046875,
"epoch": 5.4021739130434785,
"grad_norm": 0.5228962640445148,
"learning_rate": 2.791706817741041e-07,
"loss": 0.0034,
"mean_token_accuracy": 1.0,
"num_tokens": 364226587.0,
"step": 497
},
{
"entropy": 0.3830413818359375,
"epoch": 5.413043478260869,
"grad_norm": 2.177304130904656,
"learning_rate": 2.693790382320055e-07,
"loss": 0.0175,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 364959239.0,
"step": 498
},
{
"entropy": 0.3940887451171875,
"epoch": 5.423913043478261,
"grad_norm": 2.7692226509966775,
"learning_rate": 2.59757435835567e-07,
"loss": 0.0152,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 365696800.0,
"step": 499
},
{
"entropy": 0.3874969482421875,
"epoch": 5.434782608695652,
"grad_norm": 0.6622227905804676,
"learning_rate": 2.5030622043120237e-07,
"loss": 0.0033,
"mean_token_accuracy": 1.0,
"num_tokens": 366426471.0,
"step": 500
},
{
"entropy": 0.3917999267578125,
"epoch": 5.445652173913043,
"grad_norm": 3.4285563843356326,
"learning_rate": 2.41025731740801e-07,
"loss": 0.0333,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 367174609.0,
"step": 501
},
{
"entropy": 0.3955535888671875,
"epoch": 5.456521739130435,
"grad_norm": 3.7100370301508296,
"learning_rate": 2.319163033495192e-07,
"loss": 0.0424,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 367898714.0,
"step": 502
},
{
"entropy": 0.380218505859375,
"epoch": 5.467391304347826,
"grad_norm": 1.7830513614583579,
"learning_rate": 2.2297826269378653e-07,
"loss": 0.013,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 368639031.0,
"step": 503
},
{
"entropy": 0.3790130615234375,
"epoch": 5.478260869565218,
"grad_norm": 2.2310214846672896,
"learning_rate": 2.142119310495383e-07,
"loss": 0.01,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 369397263.0,
"step": 504
},
{
"entropy": 0.3843231201171875,
"epoch": 5.489130434782608,
"grad_norm": 3.113797314828611,
"learning_rate": 2.0561762352066638e-07,
"loss": 0.0121,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 370138352.0,
"step": 505
},
{
"entropy": 0.3926849365234375,
"epoch": 5.5,
"grad_norm": 7.027103058482383,
"learning_rate": 1.9719564902769272e-07,
"loss": 0.0268,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 370867304.0,
"step": 506
},
{
"entropy": 0.385528564453125,
"epoch": 5.510869565217392,
"grad_norm": 3.158975096906896,
"learning_rate": 1.889463102966671e-07,
"loss": 0.0136,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 371612613.0,
"step": 507
},
{
"entropy": 0.40679931640625,
"epoch": 5.521739130434782,
"grad_norm": 3.7256140843342846,
"learning_rate": 1.8086990384828195e-07,
"loss": 0.0322,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 372320930.0,
"step": 508
},
{
"entropy": 0.3902587890625,
"epoch": 5.532608695652174,
"grad_norm": 3.640770823761304,
"learning_rate": 1.729667199872187e-07,
"loss": 0.0172,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 373025821.0,
"step": 509
},
{
"entropy": 0.386322021484375,
"epoch": 5.543478260869565,
"grad_norm": 4.3600778017735875,
"learning_rate": 1.6523704279170773e-07,
"loss": 0.0302,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 373758445.0,
"step": 510
},
{
"entropy": 0.3842010498046875,
"epoch": 5.554347826086957,
"grad_norm": 2.8904903079482365,
"learning_rate": 1.5768115010332207e-07,
"loss": 0.0394,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 374507004.0,
"step": 511
},
{
"entropy": 0.4094390869140625,
"epoch": 5.565217391304348,
"grad_norm": 1.5280072777152858,
"learning_rate": 1.5029931351698723e-07,
"loss": 0.0062,
"mean_token_accuracy": 1.0,
"num_tokens": 375216818.0,
"step": 512
},
{
"entropy": 0.3795166015625,
"epoch": 5.576086956521739,
"grad_norm": 2.9083912204527858,
"learning_rate": 1.4309179837122045e-07,
"loss": 0.0095,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 375963533.0,
"step": 513
},
{
"entropy": 0.388153076171875,
"epoch": 5.586956521739131,
"grad_norm": 2.494676483675169,
"learning_rate": 1.3605886373859234e-07,
"loss": 0.0121,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 376695849.0,
"step": 514
},
{
"entropy": 0.4017486572265625,
"epoch": 5.5978260869565215,
"grad_norm": 5.605449787153541,
"learning_rate": 1.2920076241641376e-07,
"loss": 0.0333,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 377408950.0,
"step": 515
},
{
"entropy": 0.3864898681640625,
"epoch": 5.608695652173913,
"grad_norm": 2.8048999910075283,
"learning_rate": 1.22517740917652e-07,
"loss": 0.0137,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 378154501.0,
"step": 516
},
{
"entropy": 0.397857666015625,
"epoch": 5.619565217391305,
"grad_norm": 1.585255503223806,
"learning_rate": 1.1601003946206723e-07,
"loss": 0.0124,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 378878351.0,
"step": 517
},
{
"entropy": 0.390869140625,
"epoch": 5.630434782608695,
"grad_norm": 1.6591111399858995,
"learning_rate": 1.0967789196757839e-07,
"loss": 0.0057,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 379589042.0,
"step": 518
},
{
"entropy": 0.3965606689453125,
"epoch": 5.641304347826087,
"grad_norm": 3.8513221893355163,
"learning_rate": 1.0352152604185429e-07,
"loss": 0.013,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 380316238.0,
"step": 519
},
{
"entropy": 0.3861236572265625,
"epoch": 5.6521739130434785,
"grad_norm": 1.2903428759830748,
"learning_rate": 9.754116297413574e-08,
"loss": 0.0164,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 381060518.0,
"step": 520
},
{
"entropy": 0.3924102783203125,
"epoch": 5.663043478260869,
"grad_norm": 2.3661355025868915,
"learning_rate": 9.17370177272775e-08,
"loss": 0.0135,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 381794517.0,
"step": 521
},
{
"entropy": 0.41534423828125,
"epoch": 5.673913043478261,
"grad_norm": 1.7159919960698389,
"learning_rate": 8.610929893002274e-08,
"loss": 0.0116,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 382509591.0,
"step": 522
},
{
"entropy": 0.388824462890625,
"epoch": 5.684782608695652,
"grad_norm": 3.1200032663581574,
"learning_rate": 8.065820886950404e-08,
"loss": 0.0316,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 383254229.0,
"step": 523
},
{
"entropy": 0.3953857421875,
"epoch": 5.695652173913043,
"grad_norm": 4.159894595972317,
"learning_rate": 7.538394348397316e-08,
"loss": 0.0659,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 383988955.0,
"step": 524
},
{
"entropy": 0.386077880859375,
"epoch": 5.706521739130435,
"grad_norm": 1.368823358453224,
"learning_rate": 7.028669235575714e-08,
"loss": 0.0053,
"mean_token_accuracy": 1.0,
"num_tokens": 384708327.0,
"step": 525
},
{
"entropy": 0.41278076171875,
"epoch": 5.717391304347826,
"grad_norm": 2.818039902986355,
"learning_rate": 6.536663870444382e-08,
"loss": 0.0107,
"mean_token_accuracy": 1.0,
"num_tokens": 385418118.0,
"step": 526
},
{
"entropy": 0.3767852783203125,
"epoch": 5.728260869565218,
"grad_norm": 3.546014549466884,
"learning_rate": 6.062395938029485e-08,
"loss": 0.0315,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 386175413.0,
"step": 527
},
{
"entropy": 0.401458740234375,
"epoch": 5.739130434782608,
"grad_norm": 2.389814493856653,
"learning_rate": 5.605882485789138e-08,
"loss": 0.0318,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 386897197.0,
"step": 528
},
{
"entropy": 0.3771820068359375,
"epoch": 5.75,
"grad_norm": 1.0744355499348452,
"learning_rate": 5.167139923000553e-08,
"loss": 0.0056,
"mean_token_accuracy": 1.0,
"num_tokens": 387648661.0,
"step": 529
},
{
"entropy": 0.380645751953125,
"epoch": 5.760869565217392,
"grad_norm": 0.9597020947603775,
"learning_rate": 4.746184020170019e-08,
"loss": 0.0053,
"mean_token_accuracy": 1.0,
"num_tokens": 388401188.0,
"step": 530
},
{
"entropy": 0.386199951171875,
"epoch": 5.771739130434782,
"grad_norm": 1.0214090880188462,
"learning_rate": 4.3430299084663006e-08,
"loss": 0.0052,
"mean_token_accuracy": 1.0,
"num_tokens": 389132125.0,
"step": 531
},
{
"entropy": 0.416839599609375,
"epoch": 5.782608695652174,
"grad_norm": 1.0946483645943863,
"learning_rate": 3.957692079176623e-08,
"loss": 0.0081,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 389862785.0,
"step": 532
},
{
"entropy": 0.390167236328125,
"epoch": 5.793478260869565,
"grad_norm": 1.6459688782297026,
"learning_rate": 3.590184383185758e-08,
"loss": 0.0062,
"mean_token_accuracy": 1.0,
"num_tokens": 390597870.0,
"step": 533
},
{
"entropy": 0.3839874267578125,
"epoch": 5.804347826086957,
"grad_norm": 2.0698702706686567,
"learning_rate": 3.240520030478256e-08,
"loss": 0.0288,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 391319503.0,
"step": 534
},
{
"entropy": 0.3899383544921875,
"epoch": 5.815217391304348,
"grad_norm": 1.8622051935678263,
"learning_rate": 2.9087115896635486e-08,
"loss": 0.0103,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 392069003.0,
"step": 535
},
{
"entropy": 0.4185791015625,
"epoch": 5.826086956521739,
"grad_norm": 0.9838782015391406,
"learning_rate": 2.5947709875240867e-08,
"loss": 0.0161,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 392791197.0,
"step": 536
},
{
"entropy": 0.4089813232421875,
"epoch": 5.836956521739131,
"grad_norm": 0.5779206893200703,
"learning_rate": 2.298709508586794e-08,
"loss": 0.0033,
"mean_token_accuracy": 1.0,
"num_tokens": 393512216.0,
"step": 537
},
{
"entropy": 0.391387939453125,
"epoch": 5.8478260869565215,
"grad_norm": 1.5997986902738377,
"learning_rate": 2.0205377947174475e-08,
"loss": 0.0254,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 394224676.0,
"step": 538
},
{
"entropy": 0.40155029296875,
"epoch": 5.858695652173913,
"grad_norm": 0.9538716174458627,
"learning_rate": 1.760265844738096e-08,
"loss": 0.0052,
"mean_token_accuracy": 1.0,
"num_tokens": 394946689.0,
"step": 539
},
{
"entropy": 0.41473388671875,
"epoch": 5.869565217391305,
"grad_norm": 3.1692455070380507,
"learning_rate": 1.5179030140675122e-08,
"loss": 0.006,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 395674368.0,
"step": 540
},
{
"entropy": 0.3944549560546875,
"epoch": 5.880434782608695,
"grad_norm": 3.822951895963981,
"learning_rate": 1.2934580143851294e-08,
"loss": 0.0268,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 396397890.0,
"step": 541
},
{
"entropy": 0.3854827880859375,
"epoch": 5.891304347826087,
"grad_norm": 1.7269765059882336,
"learning_rate": 1.0869389133178477e-08,
"loss": 0.0132,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 397122155.0,
"step": 542
},
{
"entropy": 0.383026123046875,
"epoch": 5.9021739130434785,
"grad_norm": 2.3437834826407107,
"learning_rate": 8.983531341500984e-09,
"loss": 0.0227,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 397856299.0,
"step": 543
},
{
"entropy": 0.393585205078125,
"epoch": 5.913043478260869,
"grad_norm": 2.8396564206572372,
"learning_rate": 7.277074555567809e-09,
"loss": 0.0068,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 398575680.0,
"step": 544
},
{
"entropy": 0.4019622802734375,
"epoch": 5.923913043478261,
"grad_norm": 2.1229188120999614,
"learning_rate": 5.750080113598455e-09,
"loss": 0.023,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 399294327.0,
"step": 545
},
{
"entropy": 0.3828125,
"epoch": 5.934782608695652,
"grad_norm": 2.7811799795374825,
"learning_rate": 4.40260290307748e-09,
"loss": 0.0162,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 400028937.0,
"step": 546
},
{
"entropy": 0.39453125,
"epoch": 5.945652173913043,
"grad_norm": 1.2135571583249902,
"learning_rate": 3.2346913587816275e-09,
"loss": 0.005,
"mean_token_accuracy": 1.0,
"num_tokens": 400760386.0,
"step": 547
},
{
"entropy": 0.383087158203125,
"epoch": 5.956521739130435,
"grad_norm": 2.037257238818469,
"learning_rate": 2.2463874610378912e-09,
"loss": 0.0149,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 401508363.0,
"step": 548
},
{
"entropy": 0.395111083984375,
"epoch": 5.967391304347826,
"grad_norm": 0.9899828962837534,
"learning_rate": 1.4377267342158274e-09,
"loss": 0.0094,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 402255596.0,
"step": 549
},
{
"entropy": 0.391082763671875,
"epoch": 5.978260869565218,
"grad_norm": 3.1153076835245432,
"learning_rate": 8.087382454502468e-10,
"loss": 0.0421,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 402993150.0,
"step": 550
},
{
"entropy": 0.3865966796875,
"epoch": 5.989130434782608,
"grad_norm": 1.329602352306281,
"learning_rate": 3.594446035964927e-10,
"loss": 0.0157,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 403711677.0,
"step": 551
},
{
"entropy": 0.397247314453125,
"epoch": 6.0,
"grad_norm": 1.779545542710601,
"learning_rate": 8.986195841609313e-11,
"loss": 0.0062,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 404454221.0,
"step": 552
},
{
"epoch": 6.0,
"step": 552,
"total_flos": 475865375768576.0,
"train_loss": 0.4369911788587384,
"train_runtime": 50253.5416,
"train_samples_per_second": 2.224,
"train_steps_per_second": 0.011
}
],
"logging_steps": 1,
"max_steps": 552,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 46,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 475865375768576.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}