{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 774, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.553863525390625, "epoch": 0.003875968992248062, "grad_norm": 5.757997452916937, "learning_rate": 0.0, "loss": 1.3887, "mean_token_accuracy": 0.6599368844181299, "num_tokens": 438497.0, "step": 1 }, { "entropy": 0.55859375, "epoch": 0.007751937984496124, "grad_norm": 5.784239047600006, "learning_rate": 5.128205128205128e-07, "loss": 1.3966, "mean_token_accuracy": 0.6559260198846459, "num_tokens": 881766.0, "step": 2 }, { "entropy": 0.5675048828125, "epoch": 0.011627906976744186, "grad_norm": 5.831349144495169, "learning_rate": 1.0256410256410257e-06, "loss": 1.4124, "mean_token_accuracy": 0.6495780032128096, "num_tokens": 1313728.0, "step": 3 }, { "entropy": 0.565948486328125, "epoch": 0.015503875968992248, "grad_norm": 5.999742316986466, "learning_rate": 1.5384615384615387e-06, "loss": 1.4314, "mean_token_accuracy": 0.6460442841053009, "num_tokens": 1736438.0, "step": 4 }, { "entropy": 0.5799560546875, "epoch": 0.01937984496124031, "grad_norm": 5.62840439540393, "learning_rate": 2.0512820512820513e-06, "loss": 1.3791, "mean_token_accuracy": 0.6597223430871964, "num_tokens": 2159510.0, "step": 5 }, { "entropy": 0.562835693359375, "epoch": 0.023255813953488372, "grad_norm": 5.559443063914328, "learning_rate": 2.564102564102564e-06, "loss": 1.3862, "mean_token_accuracy": 0.6581119578331709, "num_tokens": 2593106.0, "step": 6 }, { "entropy": 0.5673828125, "epoch": 0.027131782945736434, "grad_norm": 4.991039051490951, "learning_rate": 3.0769230769230774e-06, "loss": 1.3499, "mean_token_accuracy": 0.661266727373004, "num_tokens": 3009566.0, "step": 7 }, { "entropy": 0.57232666015625, "epoch": 0.031007751937984496, "grad_norm": 4.609727165215335, "learning_rate": 3.58974358974359e-06, "loss": 1.3037, "mean_token_accuracy": 0.666324052028358, "num_tokens": 3442267.0, "step": 8 }, { "entropy": 0.5614013671875, "epoch": 0.03488372093023256, "grad_norm": 4.395216373431362, "learning_rate": 4.102564102564103e-06, "loss": 1.3005, "mean_token_accuracy": 0.6676654135808349, "num_tokens": 3885637.0, "step": 9 }, { "entropy": 0.58343505859375, "epoch": 0.03875968992248062, "grad_norm": 3.6789554797458583, "learning_rate": 4.615384615384616e-06, "loss": 1.1796, "mean_token_accuracy": 0.6888375803828239, "num_tokens": 4289326.0, "step": 10 }, { "entropy": 0.5760498046875, "epoch": 0.04263565891472868, "grad_norm": 3.4456412878449334, "learning_rate": 5.128205128205128e-06, "loss": 1.1616, "mean_token_accuracy": 0.6923027131706476, "num_tokens": 4705258.0, "step": 11 }, { "entropy": 0.562744140625, "epoch": 0.046511627906976744, "grad_norm": 3.3438497846016206, "learning_rate": 5.641025641025641e-06, "loss": 1.1297, "mean_token_accuracy": 0.6954672196879983, "num_tokens": 5141166.0, "step": 12 }, { "entropy": 0.527740478515625, "epoch": 0.050387596899224806, "grad_norm": 4.642324495039121, "learning_rate": 6.153846153846155e-06, "loss": 1.0554, "mean_token_accuracy": 0.711206135340035, "num_tokens": 5583075.0, "step": 13 }, { "entropy": 0.549102783203125, "epoch": 0.05426356589147287, "grad_norm": 4.625817739349075, "learning_rate": 6.666666666666667e-06, "loss": 1.0342, "mean_token_accuracy": 0.7171442015096545, "num_tokens": 5997782.0, "step": 14 }, { "entropy": 0.549407958984375, "epoch": 0.05813953488372093, "grad_norm": 4.011955003610355, "learning_rate": 7.17948717948718e-06, "loss": 0.9871, "mean_token_accuracy": 0.7261330829933286, "num_tokens": 6431769.0, "step": 15 }, { "entropy": 0.5477294921875, "epoch": 0.06201550387596899, "grad_norm": 3.457808901288651, "learning_rate": 7.692307692307694e-06, "loss": 0.9568, "mean_token_accuracy": 0.7292733397334814, "num_tokens": 6854070.0, "step": 16 }, { "entropy": 0.54443359375, "epoch": 0.06589147286821706, "grad_norm": 2.6640028081114506, "learning_rate": 8.205128205128205e-06, "loss": 0.9393, "mean_token_accuracy": 0.7348427921533585, "num_tokens": 7300929.0, "step": 17 }, { "entropy": 0.541290283203125, "epoch": 0.06976744186046512, "grad_norm": 3.867353613777132, "learning_rate": 8.717948717948719e-06, "loss": 0.9207, "mean_token_accuracy": 0.7395377028733492, "num_tokens": 7743375.0, "step": 18 }, { "entropy": 0.542083740234375, "epoch": 0.07364341085271318, "grad_norm": 3.7108241527896415, "learning_rate": 9.230769230769232e-06, "loss": 0.9346, "mean_token_accuracy": 0.7370091788470745, "num_tokens": 8185557.0, "step": 19 }, { "entropy": 0.536529541015625, "epoch": 0.07751937984496124, "grad_norm": 3.052580376312922, "learning_rate": 9.743589743589744e-06, "loss": 0.889, "mean_token_accuracy": 0.7455960083752871, "num_tokens": 8632051.0, "step": 20 }, { "entropy": 0.542724609375, "epoch": 0.08139534883720931, "grad_norm": 2.773614625944707, "learning_rate": 1.0256410256410256e-05, "loss": 0.8748, "mean_token_accuracy": 0.7437297496944666, "num_tokens": 9049387.0, "step": 21 }, { "entropy": 0.5391845703125, "epoch": 0.08527131782945736, "grad_norm": 2.3103684483676905, "learning_rate": 1.076923076923077e-05, "loss": 0.8399, "mean_token_accuracy": 0.7560118213295937, "num_tokens": 9472374.0, "step": 22 }, { "entropy": 0.53289794921875, "epoch": 0.08914728682170543, "grad_norm": 2.423538463976918, "learning_rate": 1.1282051282051283e-05, "loss": 0.8428, "mean_token_accuracy": 0.7558635827153921, "num_tokens": 9904619.0, "step": 23 }, { "entropy": 0.53314208984375, "epoch": 0.09302325581395349, "grad_norm": 3.133872651991572, "learning_rate": 1.1794871794871796e-05, "loss": 0.8011, "mean_token_accuracy": 0.7630726611241698, "num_tokens": 10316799.0, "step": 24 }, { "entropy": 0.53680419921875, "epoch": 0.09689922480620156, "grad_norm": 2.307993317837642, "learning_rate": 1.230769230769231e-05, "loss": 0.7905, "mean_token_accuracy": 0.7649277085438371, "num_tokens": 10724410.0, "step": 25 }, { "entropy": 0.53424072265625, "epoch": 0.10077519379844961, "grad_norm": 2.3004423511325736, "learning_rate": 1.2820512820512823e-05, "loss": 0.784, "mean_token_accuracy": 0.7660581776872277, "num_tokens": 11157441.0, "step": 26 }, { "entropy": 0.519683837890625, "epoch": 0.10465116279069768, "grad_norm": 2.0605076377059737, "learning_rate": 1.3333333333333333e-05, "loss": 0.7622, "mean_token_accuracy": 0.7721455879509449, "num_tokens": 11583383.0, "step": 27 }, { "entropy": 0.530609130859375, "epoch": 0.10852713178294573, "grad_norm": 1.7051329435139446, "learning_rate": 1.3846153846153847e-05, "loss": 0.7549, "mean_token_accuracy": 0.7728732898831367, "num_tokens": 11996833.0, "step": 28 }, { "entropy": 0.50787353515625, "epoch": 0.1124031007751938, "grad_norm": 1.653966676070023, "learning_rate": 1.435897435897436e-05, "loss": 0.7424, "mean_token_accuracy": 0.7756998986005783, "num_tokens": 12434838.0, "step": 29 }, { "entropy": 0.50762939453125, "epoch": 0.11627906976744186, "grad_norm": 1.7404676617867714, "learning_rate": 1.4871794871794874e-05, "loss": 0.7389, "mean_token_accuracy": 0.7756242621690035, "num_tokens": 12880605.0, "step": 30 }, { "entropy": 0.53057861328125, "epoch": 0.12015503875968993, "grad_norm": 1.6941417095470896, "learning_rate": 1.5384615384615387e-05, "loss": 0.6965, "mean_token_accuracy": 0.7865999517962337, "num_tokens": 13286289.0, "step": 31 }, { "entropy": 0.513275146484375, "epoch": 0.12403100775193798, "grad_norm": 1.777615436740328, "learning_rate": 1.5897435897435897e-05, "loss": 0.7276, "mean_token_accuracy": 0.7772727366536856, "num_tokens": 13720208.0, "step": 32 }, { "entropy": 0.5091552734375, "epoch": 0.12790697674418605, "grad_norm": 2.169414280230603, "learning_rate": 1.641025641025641e-05, "loss": 0.7014, "mean_token_accuracy": 0.7825290272012353, "num_tokens": 14151016.0, "step": 33 }, { "entropy": 0.499481201171875, "epoch": 0.13178294573643412, "grad_norm": 1.6130944050268388, "learning_rate": 1.6923076923076924e-05, "loss": 0.7115, "mean_token_accuracy": 0.7820462808012962, "num_tokens": 14593303.0, "step": 34 }, { "entropy": 0.513458251953125, "epoch": 0.13565891472868216, "grad_norm": 1.6123542005336788, "learning_rate": 1.7435897435897438e-05, "loss": 0.6765, "mean_token_accuracy": 0.7885658349841833, "num_tokens": 15017503.0, "step": 35 }, { "entropy": 0.490997314453125, "epoch": 0.13953488372093023, "grad_norm": 2.012326490340564, "learning_rate": 1.794871794871795e-05, "loss": 0.6943, "mean_token_accuracy": 0.7859224667772651, "num_tokens": 15471309.0, "step": 36 }, { "entropy": 0.498046875, "epoch": 0.1434108527131783, "grad_norm": 1.7927014251735478, "learning_rate": 1.8461538461538465e-05, "loss": 0.6843, "mean_token_accuracy": 0.7870860742405057, "num_tokens": 15908325.0, "step": 37 }, { "entropy": 0.499267578125, "epoch": 0.14728682170542637, "grad_norm": 1.6105672802191198, "learning_rate": 1.8974358974358975e-05, "loss": 0.6589, "mean_token_accuracy": 0.7950629340484738, "num_tokens": 16345910.0, "step": 38 }, { "entropy": 0.49749755859375, "epoch": 0.1511627906976744, "grad_norm": 1.5453421039467747, "learning_rate": 1.9487179487179488e-05, "loss": 0.6827, "mean_token_accuracy": 0.7869697958230972, "num_tokens": 16794948.0, "step": 39 }, { "entropy": 0.49737548828125, "epoch": 0.15503875968992248, "grad_norm": 1.6951113924313514, "learning_rate": 2e-05, "loss": 0.6532, "mean_token_accuracy": 0.7926666866987944, "num_tokens": 17211373.0, "step": 40 }, { "entropy": 0.49774169921875, "epoch": 0.15891472868217055, "grad_norm": 1.5679650999550647, "learning_rate": 1.9999908652974457e-05, "loss": 0.6461, "mean_token_accuracy": 0.7951579857617617, "num_tokens": 17650141.0, "step": 41 }, { "entropy": 0.507293701171875, "epoch": 0.16279069767441862, "grad_norm": 1.868205623671047, "learning_rate": 1.9999634613566673e-05, "loss": 0.6458, "mean_token_accuracy": 0.7966311117634177, "num_tokens": 18079352.0, "step": 42 }, { "entropy": 0.497589111328125, "epoch": 0.16666666666666666, "grad_norm": 1.584702952530748, "learning_rate": 1.9999177886783194e-05, "loss": 0.6422, "mean_token_accuracy": 0.7978830458596349, "num_tokens": 18510478.0, "step": 43 }, { "entropy": 0.492156982421875, "epoch": 0.17054263565891473, "grad_norm": 1.6650565736181908, "learning_rate": 1.9998538480968142e-05, "loss": 0.6564, "mean_token_accuracy": 0.7950685685500503, "num_tokens": 18967235.0, "step": 44 }, { "entropy": 0.502593994140625, "epoch": 0.1744186046511628, "grad_norm": 1.7541270410806087, "learning_rate": 1.999771640780308e-05, "loss": 0.6327, "mean_token_accuracy": 0.7998538371175528, "num_tokens": 19402140.0, "step": 45 }, { "entropy": 0.4981689453125, "epoch": 0.17829457364341086, "grad_norm": 1.7630109068067972, "learning_rate": 1.99967116823068e-05, "loss": 0.6245, "mean_token_accuracy": 0.8009940264746547, "num_tokens": 19828968.0, "step": 46 }, { "entropy": 0.491119384765625, "epoch": 0.1821705426356589, "grad_norm": 1.6817263064733463, "learning_rate": 1.9995524322835035e-05, "loss": 0.6187, "mean_token_accuracy": 0.8020930821076035, "num_tokens": 20257306.0, "step": 47 }, { "entropy": 0.494384765625, "epoch": 0.18604651162790697, "grad_norm": 1.6183581152741309, "learning_rate": 1.9994154351080137e-05, "loss": 0.6278, "mean_token_accuracy": 0.801664580591023, "num_tokens": 20703796.0, "step": 48 }, { "entropy": 0.491363525390625, "epoch": 0.18992248062015504, "grad_norm": 1.6613810662187647, "learning_rate": 1.999260179207068e-05, "loss": 0.6129, "mean_token_accuracy": 0.8025477975606918, "num_tokens": 21131092.0, "step": 49 }, { "entropy": 0.4857177734375, "epoch": 0.1937984496124031, "grad_norm": 1.426212419722124, "learning_rate": 1.9990866674170984e-05, "loss": 0.6166, "mean_token_accuracy": 0.8041007313877344, "num_tokens": 21568909.0, "step": 50 }, { "entropy": 0.4915771484375, "epoch": 0.19767441860465115, "grad_norm": 1.3780333302834848, "learning_rate": 1.9988949029080625e-05, "loss": 0.599, "mean_token_accuracy": 0.8107355292886496, "num_tokens": 21994935.0, "step": 51 }, { "entropy": 0.5010986328125, "epoch": 0.20155038759689922, "grad_norm": 1.7509993242634168, "learning_rate": 1.9986848891833846e-05, "loss": 0.615, "mean_token_accuracy": 0.8016165606677532, "num_tokens": 22410374.0, "step": 52 }, { "entropy": 0.482513427734375, "epoch": 0.2054263565891473, "grad_norm": 1.3707684982728474, "learning_rate": 1.9984566300798895e-05, "loss": 0.6021, "mean_token_accuracy": 0.8062009122222662, "num_tokens": 22839536.0, "step": 53 }, { "entropy": 0.4803466796875, "epoch": 0.20930232558139536, "grad_norm": 1.4431367638619368, "learning_rate": 1.998210129767735e-05, "loss": 0.5923, "mean_token_accuracy": 0.8107659220695496, "num_tokens": 23270394.0, "step": 54 }, { "entropy": 0.4873046875, "epoch": 0.2131782945736434, "grad_norm": 1.6712200349650355, "learning_rate": 1.9979453927503366e-05, "loss": 0.626, "mean_token_accuracy": 0.802630621008575, "num_tokens": 23714867.0, "step": 55 }, { "entropy": 0.48211669921875, "epoch": 0.21705426356589147, "grad_norm": 1.416046601988453, "learning_rate": 1.997662423864281e-05, "loss": 0.6048, "mean_token_accuracy": 0.8021730659529567, "num_tokens": 24149194.0, "step": 56 }, { "entropy": 0.499420166015625, "epoch": 0.22093023255813954, "grad_norm": 1.3801609002151292, "learning_rate": 1.9973612282792413e-05, "loss": 0.5847, "mean_token_accuracy": 0.8101371973752975, "num_tokens": 24552337.0, "step": 57 }, { "entropy": 0.478302001953125, "epoch": 0.2248062015503876, "grad_norm": 1.5671567435801814, "learning_rate": 1.997041811497882e-05, "loss": 0.6018, "mean_token_accuracy": 0.8056067563593388, "num_tokens": 25009560.0, "step": 58 }, { "entropy": 0.486572265625, "epoch": 0.22868217054263565, "grad_norm": 1.7189084201411189, "learning_rate": 1.9967041793557578e-05, "loss": 0.5965, "mean_token_accuracy": 0.8089054571464658, "num_tokens": 25448410.0, "step": 59 }, { "entropy": 0.4847412109375, "epoch": 0.23255813953488372, "grad_norm": 1.7298449784625014, "learning_rate": 1.996348338021207e-05, "loss": 0.6061, "mean_token_accuracy": 0.8035977333784103, "num_tokens": 25869456.0, "step": 60 }, { "entropy": 0.47747802734375, "epoch": 0.2364341085271318, "grad_norm": 1.4378887326409655, "learning_rate": 1.9959742939952393e-05, "loss": 0.5958, "mean_token_accuracy": 0.8067287458106875, "num_tokens": 26304651.0, "step": 61 }, { "entropy": 0.470184326171875, "epoch": 0.24031007751937986, "grad_norm": 1.408145966867428, "learning_rate": 1.995582054111416e-05, "loss": 0.5639, "mean_token_accuracy": 0.8142334129661322, "num_tokens": 26717402.0, "step": 62 }, { "entropy": 0.477020263671875, "epoch": 0.2441860465116279, "grad_norm": 1.286721956523712, "learning_rate": 1.9951716255357267e-05, "loss": 0.5788, "mean_token_accuracy": 0.811956575140357, "num_tokens": 27150636.0, "step": 63 }, { "entropy": 0.47705078125, "epoch": 0.24806201550387597, "grad_norm": 1.5668383477560968, "learning_rate": 1.9947430157664575e-05, "loss": 0.587, "mean_token_accuracy": 0.8080101488158107, "num_tokens": 27591516.0, "step": 64 }, { "entropy": 0.465057373046875, "epoch": 0.25193798449612403, "grad_norm": 1.2590904210941338, "learning_rate": 1.994296232634054e-05, "loss": 0.5843, "mean_token_accuracy": 0.8114851666614413, "num_tokens": 28032955.0, "step": 65 }, { "entropy": 0.47210693359375, "epoch": 0.2558139534883721, "grad_norm": 1.3064580012149183, "learning_rate": 1.9938312843009776e-05, "loss": 0.581, "mean_token_accuracy": 0.8110779244452715, "num_tokens": 28461266.0, "step": 66 }, { "entropy": 0.46649169921875, "epoch": 0.2596899224806202, "grad_norm": 1.443208706347969, "learning_rate": 1.9933481792615583e-05, "loss": 0.5769, "mean_token_accuracy": 0.8113029273226857, "num_tokens": 28910287.0, "step": 67 }, { "entropy": 0.469696044921875, "epoch": 0.26356589147286824, "grad_norm": 1.3107422511431897, "learning_rate": 1.9928469263418376e-05, "loss": 0.5764, "mean_token_accuracy": 0.8129542609676719, "num_tokens": 29337607.0, "step": 68 }, { "entropy": 0.47607421875, "epoch": 0.26744186046511625, "grad_norm": 1.5954507940802074, "learning_rate": 1.992327534699408e-05, "loss": 0.5704, "mean_token_accuracy": 0.8138690665364265, "num_tokens": 29727407.0, "step": 69 }, { "entropy": 0.459442138671875, "epoch": 0.2713178294573643, "grad_norm": 1.2075602107711025, "learning_rate": 1.991790013823246e-05, "loss": 0.5662, "mean_token_accuracy": 0.8136689653620124, "num_tokens": 30159242.0, "step": 70 }, { "entropy": 0.462005615234375, "epoch": 0.2751937984496124, "grad_norm": 1.3818376110580417, "learning_rate": 1.991234373533539e-05, "loss": 0.5497, "mean_token_accuracy": 0.8170208567753434, "num_tokens": 30583266.0, "step": 71 }, { "entropy": 0.46826171875, "epoch": 0.27906976744186046, "grad_norm": 1.4506362747599748, "learning_rate": 1.990660623981503e-05, "loss": 0.5711, "mean_token_accuracy": 0.8127527991309762, "num_tokens": 31014469.0, "step": 72 }, { "entropy": 0.469696044921875, "epoch": 0.28294573643410853, "grad_norm": 1.4597438563853555, "learning_rate": 1.9900687756492022e-05, "loss": 0.5742, "mean_token_accuracy": 0.8152262289077044, "num_tokens": 31442182.0, "step": 73 }, { "entropy": 0.463592529296875, "epoch": 0.2868217054263566, "grad_norm": 1.587910669929024, "learning_rate": 1.9894588393493528e-05, "loss": 0.5704, "mean_token_accuracy": 0.8138687778264284, "num_tokens": 31881511.0, "step": 74 }, { "entropy": 0.462158203125, "epoch": 0.29069767441860467, "grad_norm": 1.3596755737390338, "learning_rate": 1.9888308262251286e-05, "loss": 0.5686, "mean_token_accuracy": 0.8140115709975362, "num_tokens": 32311809.0, "step": 75 }, { "entropy": 0.4627685546875, "epoch": 0.29457364341085274, "grad_norm": 1.5095142996250903, "learning_rate": 1.988184747749956e-05, "loss": 0.5612, "mean_token_accuracy": 0.8161912616342306, "num_tokens": 32730110.0, "step": 76 }, { "entropy": 0.455291748046875, "epoch": 0.29844961240310075, "grad_norm": 1.4739840663337003, "learning_rate": 1.9875206157273038e-05, "loss": 0.5626, "mean_token_accuracy": 0.814151655882597, "num_tokens": 33157816.0, "step": 77 }, { "entropy": 0.45819091796875, "epoch": 0.3023255813953488, "grad_norm": 1.312276706222236, "learning_rate": 1.9868384422904693e-05, "loss": 0.5811, "mean_token_accuracy": 0.8097445927560329, "num_tokens": 33594309.0, "step": 78 }, { "entropy": 0.461273193359375, "epoch": 0.3062015503875969, "grad_norm": 1.3662318896095438, "learning_rate": 1.986138239902355e-05, "loss": 0.5398, "mean_token_accuracy": 0.8207592982798815, "num_tokens": 34027149.0, "step": 79 }, { "entropy": 0.460845947265625, "epoch": 0.31007751937984496, "grad_norm": 1.114084887070516, "learning_rate": 1.9854200213552426e-05, "loss": 0.5491, "mean_token_accuracy": 0.8194785909727216, "num_tokens": 34443239.0, "step": 80 }, { "entropy": 0.455780029296875, "epoch": 0.313953488372093, "grad_norm": 1.3793377958773556, "learning_rate": 1.9846837997705576e-05, "loss": 0.5644, "mean_token_accuracy": 0.8135287668555975, "num_tokens": 34888207.0, "step": 81 }, { "entropy": 0.457061767578125, "epoch": 0.3178294573643411, "grad_norm": 1.2399000210904676, "learning_rate": 1.98392958859863e-05, "loss": 0.5535, "mean_token_accuracy": 0.8170906333252788, "num_tokens": 35326139.0, "step": 82 }, { "entropy": 0.45587158203125, "epoch": 0.32170542635658916, "grad_norm": 1.2255651250046555, "learning_rate": 1.9831574016184493e-05, "loss": 0.5619, "mean_token_accuracy": 0.8146381946280599, "num_tokens": 35772478.0, "step": 83 }, { "entropy": 0.460906982421875, "epoch": 0.32558139534883723, "grad_norm": 1.1547393809140933, "learning_rate": 1.9823672529374123e-05, "loss": 0.5442, "mean_token_accuracy": 0.8206607829779387, "num_tokens": 36195639.0, "step": 84 }, { "entropy": 0.464202880859375, "epoch": 0.32945736434108525, "grad_norm": 1.3004399814018137, "learning_rate": 1.9815591569910654e-05, "loss": 0.5507, "mean_token_accuracy": 0.8176483260467649, "num_tokens": 36620535.0, "step": 85 }, { "entropy": 0.4609375, "epoch": 0.3333333333333333, "grad_norm": 1.2762821791429464, "learning_rate": 1.980733128542841e-05, "loss": 0.5518, "mean_token_accuracy": 0.8188320798799396, "num_tokens": 37051548.0, "step": 86 }, { "entropy": 0.46221923828125, "epoch": 0.3372093023255814, "grad_norm": 1.2824385762190618, "learning_rate": 1.9798891826837872e-05, "loss": 0.5435, "mean_token_accuracy": 0.8216770840808749, "num_tokens": 37483698.0, "step": 87 }, { "entropy": 0.46661376953125, "epoch": 0.34108527131782945, "grad_norm": 1.3876212166283788, "learning_rate": 1.979027334832293e-05, "loss": 0.5626, "mean_token_accuracy": 0.8143940027803183, "num_tokens": 37910246.0, "step": 88 }, { "entropy": 0.46575927734375, "epoch": 0.3449612403100775, "grad_norm": 1.1660372191371922, "learning_rate": 1.9781476007338058e-05, "loss": 0.5428, "mean_token_accuracy": 0.820567911490798, "num_tokens": 38319746.0, "step": 89 }, { "entropy": 0.47100830078125, "epoch": 0.3488372093023256, "grad_norm": 1.1487166285802124, "learning_rate": 1.977249996460544e-05, "loss": 0.5604, "mean_token_accuracy": 0.816031564027071, "num_tokens": 38752631.0, "step": 90 }, { "entropy": 0.460296630859375, "epoch": 0.35271317829457366, "grad_norm": 1.1402604232378695, "learning_rate": 1.9763345384112044e-05, "loss": 0.5473, "mean_token_accuracy": 0.8196038343012333, "num_tokens": 39182910.0, "step": 91 }, { "entropy": 0.46826171875, "epoch": 0.35658914728682173, "grad_norm": 1.1923633175594284, "learning_rate": 1.97540124331066e-05, "loss": 0.5409, "mean_token_accuracy": 0.8211136739701033, "num_tokens": 39621180.0, "step": 92 }, { "entropy": 0.460357666015625, "epoch": 0.36046511627906974, "grad_norm": 1.2665373099510882, "learning_rate": 1.974450128209658e-05, "loss": 0.5531, "mean_token_accuracy": 0.8183085061609745, "num_tokens": 40050290.0, "step": 93 }, { "entropy": 0.46466064453125, "epoch": 0.3643410852713178, "grad_norm": 1.2259431636846179, "learning_rate": 1.973481210484505e-05, "loss": 0.5419, "mean_token_accuracy": 0.8202582132071257, "num_tokens": 40464004.0, "step": 94 }, { "entropy": 0.45953369140625, "epoch": 0.3682170542635659, "grad_norm": 1.2314434799144434, "learning_rate": 1.9724945078367513e-05, "loss": 0.5331, "mean_token_accuracy": 0.8198390873149037, "num_tokens": 40902322.0, "step": 95 }, { "entropy": 0.46234130859375, "epoch": 0.37209302325581395, "grad_norm": 1.149693863746958, "learning_rate": 1.9714900382928674e-05, "loss": 0.5322, "mean_token_accuracy": 0.8233453892171383, "num_tokens": 41302073.0, "step": 96 }, { "entropy": 0.463897705078125, "epoch": 0.375968992248062, "grad_norm": 1.2008159652924766, "learning_rate": 1.9704678202039148e-05, "loss": 0.5298, "mean_token_accuracy": 0.8223507273942232, "num_tokens": 41716373.0, "step": 97 }, { "entropy": 0.46221923828125, "epoch": 0.3798449612403101, "grad_norm": 1.1035610955872097, "learning_rate": 1.9694278722452092e-05, "loss": 0.556, "mean_token_accuracy": 0.8180048149079084, "num_tokens": 42144096.0, "step": 98 }, { "entropy": 0.4696044921875, "epoch": 0.38372093023255816, "grad_norm": 1.2329688382688684, "learning_rate": 1.9683702134159815e-05, "loss": 0.5375, "mean_token_accuracy": 0.8225236749276519, "num_tokens": 42563834.0, "step": 99 }, { "entropy": 0.46099853515625, "epoch": 0.3875968992248062, "grad_norm": 1.1709642889265268, "learning_rate": 1.9672948630390296e-05, "loss": 0.5384, "mean_token_accuracy": 0.819816923700273, "num_tokens": 42973627.0, "step": 100 }, { "entropy": 0.45849609375, "epoch": 0.39147286821705424, "grad_norm": 1.1856435417034512, "learning_rate": 1.9662018407603643e-05, "loss": 0.5435, "mean_token_accuracy": 0.8200850309804082, "num_tokens": 43404575.0, "step": 101 }, { "entropy": 0.4561767578125, "epoch": 0.3953488372093023, "grad_norm": 1.228277406849483, "learning_rate": 1.9650911665488533e-05, "loss": 0.5292, "mean_token_accuracy": 0.8224469656124711, "num_tokens": 43837685.0, "step": 102 }, { "entropy": 0.46295166015625, "epoch": 0.3992248062015504, "grad_norm": 1.0680952459464894, "learning_rate": 1.9639628606958535e-05, "loss": 0.5404, "mean_token_accuracy": 0.8188497675582767, "num_tokens": 44260189.0, "step": 103 }, { "entropy": 0.447418212890625, "epoch": 0.40310077519379844, "grad_norm": 1.1981648971283583, "learning_rate": 1.9628169438148414e-05, "loss": 0.544, "mean_token_accuracy": 0.8205827260389924, "num_tokens": 44688279.0, "step": 104 }, { "entropy": 0.45977783203125, "epoch": 0.4069767441860465, "grad_norm": 1.2193905149033133, "learning_rate": 1.9616534368410364e-05, "loss": 0.5429, "mean_token_accuracy": 0.8211770560592413, "num_tokens": 45096594.0, "step": 105 }, { "entropy": 0.45819091796875, "epoch": 0.4108527131782946, "grad_norm": 1.1549313358131803, "learning_rate": 1.9604723610310195e-05, "loss": 0.5442, "mean_token_accuracy": 0.8212394239380956, "num_tokens": 45512935.0, "step": 106 }, { "entropy": 0.45819091796875, "epoch": 0.41472868217054265, "grad_norm": 1.1221291833410496, "learning_rate": 1.9592737379623427e-05, "loss": 0.5384, "mean_token_accuracy": 0.8197078760713339, "num_tokens": 45942440.0, "step": 107 }, { "entropy": 0.45684814453125, "epoch": 0.4186046511627907, "grad_norm": 1.0352641413953396, "learning_rate": 1.9580575895331364e-05, "loss": 0.5403, "mean_token_accuracy": 0.8169021736830473, "num_tokens": 46386550.0, "step": 108 }, { "entropy": 0.459136962890625, "epoch": 0.42248062015503873, "grad_norm": 1.105470239941562, "learning_rate": 1.956823937961709e-05, "loss": 0.5533, "mean_token_accuracy": 0.81779002584517, "num_tokens": 46827445.0, "step": 109 }, { "entropy": 0.463409423828125, "epoch": 0.4263565891472868, "grad_norm": 1.179096360799542, "learning_rate": 1.955572805786141e-05, "loss": 0.5264, "mean_token_accuracy": 0.8238548217341304, "num_tokens": 47239972.0, "step": 110 }, { "entropy": 0.456756591796875, "epoch": 0.43023255813953487, "grad_norm": 1.1110689927671835, "learning_rate": 1.9543042158638728e-05, "loss": 0.5279, "mean_token_accuracy": 0.8251189421862364, "num_tokens": 47645121.0, "step": 111 }, { "entropy": 0.453582763671875, "epoch": 0.43410852713178294, "grad_norm": 1.2062009597372043, "learning_rate": 1.9530181913712875e-05, "loss": 0.5388, "mean_token_accuracy": 0.8221446331590414, "num_tokens": 48092214.0, "step": 112 }, { "entropy": 0.45489501953125, "epoch": 0.437984496124031, "grad_norm": 1.103293645738396, "learning_rate": 1.9517147558032877e-05, "loss": 0.5341, "mean_token_accuracy": 0.8221006505191326, "num_tokens": 48526733.0, "step": 113 }, { "entropy": 0.457305908203125, "epoch": 0.4418604651162791, "grad_norm": 1.0568614511684307, "learning_rate": 1.9503939329728657e-05, "loss": 0.5296, "mean_token_accuracy": 0.8257120624184608, "num_tokens": 48933549.0, "step": 114 }, { "entropy": 0.449493408203125, "epoch": 0.44573643410852715, "grad_norm": 1.0530308845913583, "learning_rate": 1.949055747010669e-05, "loss": 0.5476, "mean_token_accuracy": 0.8189520025625825, "num_tokens": 49385777.0, "step": 115 }, { "entropy": 0.451873779296875, "epoch": 0.4496124031007752, "grad_norm": 1.201764585546533, "learning_rate": 1.9477002223645587e-05, "loss": 0.518, "mean_token_accuracy": 0.8288652747869492, "num_tokens": 49815021.0, "step": 116 }, { "entropy": 0.4442138671875, "epoch": 0.45348837209302323, "grad_norm": 1.1614506102963222, "learning_rate": 1.9463273837991643e-05, "loss": 0.5252, "mean_token_accuracy": 0.8240685043856502, "num_tokens": 50253819.0, "step": 117 }, { "entropy": 0.441131591796875, "epoch": 0.4573643410852713, "grad_norm": 1.057659936365217, "learning_rate": 1.9449372563954293e-05, "loss": 0.5335, "mean_token_accuracy": 0.8224316090345383, "num_tokens": 50705508.0, "step": 118 }, { "entropy": 0.450897216796875, "epoch": 0.46124031007751937, "grad_norm": 1.1289057507904565, "learning_rate": 1.9435298655501547e-05, "loss": 0.5202, "mean_token_accuracy": 0.8252356611192226, "num_tokens": 51126464.0, "step": 119 }, { "entropy": 0.4454345703125, "epoch": 0.46511627906976744, "grad_norm": 1.1783048839311618, "learning_rate": 1.9421052369755335e-05, "loss": 0.5383, "mean_token_accuracy": 0.8221455095335841, "num_tokens": 51558968.0, "step": 120 }, { "entropy": 0.454193115234375, "epoch": 0.4689922480620155, "grad_norm": 1.1118532244729238, "learning_rate": 1.9406633966986828e-05, "loss": 0.5349, "mean_token_accuracy": 0.8201886266469955, "num_tokens": 51978742.0, "step": 121 }, { "entropy": 0.456939697265625, "epoch": 0.4728682170542636, "grad_norm": 1.3616417906601193, "learning_rate": 1.939204371061166e-05, "loss": 0.531, "mean_token_accuracy": 0.8195713134482503, "num_tokens": 52402161.0, "step": 122 }, { "entropy": 0.4530029296875, "epoch": 0.47674418604651164, "grad_norm": 1.0890779163412931, "learning_rate": 1.9377281867185145e-05, "loss": 0.5412, "mean_token_accuracy": 0.8204515632241964, "num_tokens": 52839250.0, "step": 123 }, { "entropy": 0.44537353515625, "epoch": 0.4806201550387597, "grad_norm": 1.3916228521360827, "learning_rate": 1.9362348706397374e-05, "loss": 0.5496, "mean_token_accuracy": 0.8194613959640265, "num_tokens": 53288733.0, "step": 124 }, { "entropy": 0.45404052734375, "epoch": 0.4844961240310077, "grad_norm": 1.1216245307718116, "learning_rate": 1.934724450106831e-05, "loss": 0.518, "mean_token_accuracy": 0.8268485888838768, "num_tokens": 53703856.0, "step": 125 }, { "entropy": 0.4644775390625, "epoch": 0.4883720930232558, "grad_norm": 1.292642203861098, "learning_rate": 1.9331969527142805e-05, "loss": 0.5248, "mean_token_accuracy": 0.8242862829938531, "num_tokens": 54125616.0, "step": 126 }, { "entropy": 0.457183837890625, "epoch": 0.49224806201550386, "grad_norm": 1.1577638631630167, "learning_rate": 1.9316524063685544e-05, "loss": 0.5329, "mean_token_accuracy": 0.8215478174388409, "num_tokens": 54559965.0, "step": 127 }, { "entropy": 0.463348388671875, "epoch": 0.49612403100775193, "grad_norm": 1.1108314222482614, "learning_rate": 1.930090839287595e-05, "loss": 0.5224, "mean_token_accuracy": 0.8258008388802409, "num_tokens": 54983536.0, "step": 128 }, { "entropy": 0.46112060546875, "epoch": 0.5, "grad_norm": 1.209480669363572, "learning_rate": 1.9285122800003045e-05, "loss": 0.5335, "mean_token_accuracy": 0.8223184822127223, "num_tokens": 55414016.0, "step": 129 }, { "entropy": 0.447906494140625, "epoch": 0.5038759689922481, "grad_norm": 1.1619619363279732, "learning_rate": 1.926916757346022e-05, "loss": 0.529, "mean_token_accuracy": 0.8222861513495445, "num_tokens": 55844385.0, "step": 130 }, { "entropy": 0.461822509765625, "epoch": 0.5077519379844961, "grad_norm": 1.1171998492566917, "learning_rate": 1.9253043004739967e-05, "loss": 0.5229, "mean_token_accuracy": 0.8233607662841678, "num_tokens": 56275861.0, "step": 131 }, { "entropy": 0.45672607421875, "epoch": 0.5116279069767442, "grad_norm": 1.0256784025552215, "learning_rate": 1.923674938842857e-05, "loss": 0.5292, "mean_token_accuracy": 0.8243364058434963, "num_tokens": 56732989.0, "step": 132 }, { "entropy": 0.464263916015625, "epoch": 0.5155038759689923, "grad_norm": 1.0143232022410413, "learning_rate": 1.9220287022200707e-05, "loss": 0.5018, "mean_token_accuracy": 0.8304745489731431, "num_tokens": 57152314.0, "step": 133 }, { "entropy": 0.4564208984375, "epoch": 0.5193798449612403, "grad_norm": 1.1661400246068792, "learning_rate": 1.920365620681401e-05, "loss": 0.5211, "mean_token_accuracy": 0.8252125987783074, "num_tokens": 57590357.0, "step": 134 }, { "entropy": 0.456329345703125, "epoch": 0.5232558139534884, "grad_norm": 0.9456257334190764, "learning_rate": 1.9186857246103586e-05, "loss": 0.5186, "mean_token_accuracy": 0.824681076221168, "num_tokens": 58041311.0, "step": 135 }, { "entropy": 0.4566650390625, "epoch": 0.5271317829457365, "grad_norm": 1.0414842833260838, "learning_rate": 1.9169890446976454e-05, "loss": 0.5184, "mean_token_accuracy": 0.8255736064165831, "num_tokens": 58469229.0, "step": 136 }, { "entropy": 0.46527099609375, "epoch": 0.5310077519379846, "grad_norm": 0.9659958295123616, "learning_rate": 1.9152756119405937e-05, "loss": 0.5214, "mean_token_accuracy": 0.8240566346794367, "num_tokens": 58871704.0, "step": 137 }, { "entropy": 0.465423583984375, "epoch": 0.5348837209302325, "grad_norm": 1.1234923378415513, "learning_rate": 1.913545457642601e-05, "loss": 0.5264, "mean_token_accuracy": 0.8233613995835185, "num_tokens": 59297215.0, "step": 138 }, { "entropy": 0.4613037109375, "epoch": 0.5387596899224806, "grad_norm": 0.9179083293680214, "learning_rate": 1.911798613412557e-05, "loss": 0.5181, "mean_token_accuracy": 0.8269952731207013, "num_tokens": 59730637.0, "step": 139 }, { "entropy": 0.46160888671875, "epoch": 0.5426356589147286, "grad_norm": 0.9491732293660206, "learning_rate": 1.9100351111642666e-05, "loss": 0.5289, "mean_token_accuracy": 0.8205523490905762, "num_tokens": 60156285.0, "step": 140 }, { "entropy": 0.453521728515625, "epoch": 0.5465116279069767, "grad_norm": 1.102929475751906, "learning_rate": 1.908254983115867e-05, "loss": 0.5133, "mean_token_accuracy": 0.8261066768318415, "num_tokens": 60596318.0, "step": 141 }, { "entropy": 0.455657958984375, "epoch": 0.5503875968992248, "grad_norm": 1.0744997289996236, "learning_rate": 1.9064582617892383e-05, "loss": 0.5054, "mean_token_accuracy": 0.8298247829079628, "num_tokens": 61012662.0, "step": 142 }, { "entropy": 0.44940185546875, "epoch": 0.5542635658914729, "grad_norm": 1.153734884287017, "learning_rate": 1.9046449800094103e-05, "loss": 0.5114, "mean_token_accuracy": 0.8287463616579771, "num_tokens": 61440619.0, "step": 143 }, { "entropy": 0.452392578125, "epoch": 0.5581395348837209, "grad_norm": 1.1039031001679573, "learning_rate": 1.902815170903963e-05, "loss": 0.5241, "mean_token_accuracy": 0.8248511329293251, "num_tokens": 61883413.0, "step": 144 }, { "entropy": 0.4530029296875, "epoch": 0.562015503875969, "grad_norm": 1.068677659477954, "learning_rate": 1.900968867902419e-05, "loss": 0.5244, "mean_token_accuracy": 0.8252197271212935, "num_tokens": 62329563.0, "step": 145 }, { "entropy": 0.4559326171875, "epoch": 0.5658914728682171, "grad_norm": 1.0529708726567546, "learning_rate": 1.8991061047356374e-05, "loss": 0.514, "mean_token_accuracy": 0.82704047113657, "num_tokens": 62770914.0, "step": 146 }, { "entropy": 0.4534912109375, "epoch": 0.5697674418604651, "grad_norm": 0.9988406542616296, "learning_rate": 1.8972269154351917e-05, "loss": 0.5222, "mean_token_accuracy": 0.8278125440701842, "num_tokens": 63212782.0, "step": 147 }, { "entropy": 0.4549560546875, "epoch": 0.5736434108527132, "grad_norm": 0.969074986348695, "learning_rate": 1.895331334332753e-05, "loss": 0.5268, "mean_token_accuracy": 0.8241426143795252, "num_tokens": 63652708.0, "step": 148 }, { "entropy": 0.45428466796875, "epoch": 0.5775193798449613, "grad_norm": 0.977410620254015, "learning_rate": 1.893419396059461e-05, "loss": 0.5115, "mean_token_accuracy": 0.8301441119983792, "num_tokens": 64074452.0, "step": 149 }, { "entropy": 0.453765869140625, "epoch": 0.5813953488372093, "grad_norm": 1.0071558774043947, "learning_rate": 1.8914911355452895e-05, "loss": 0.5063, "mean_token_accuracy": 0.8300048960372806, "num_tokens": 64484680.0, "step": 150 }, { "entropy": 0.44122314453125, "epoch": 0.5852713178294574, "grad_norm": 0.956524944955269, "learning_rate": 1.889546588018412e-05, "loss": 0.5159, "mean_token_accuracy": 0.8281158301979303, "num_tokens": 64924985.0, "step": 151 }, { "entropy": 0.4344482421875, "epoch": 0.5891472868217055, "grad_norm": 0.9899399473264823, "learning_rate": 1.8875857890045544e-05, "loss": 0.5146, "mean_token_accuracy": 0.8271653046831489, "num_tokens": 65366395.0, "step": 152 }, { "entropy": 0.4498291015625, "epoch": 0.5930232558139535, "grad_norm": 0.9972264103529548, "learning_rate": 1.885608774326348e-05, "loss": 0.5066, "mean_token_accuracy": 0.8289199098944664, "num_tokens": 65787947.0, "step": 153 }, { "entropy": 0.4351806640625, "epoch": 0.5968992248062015, "grad_norm": 0.948063517501392, "learning_rate": 1.8836155801026754e-05, "loss": 0.5193, "mean_token_accuracy": 0.8260297365486622, "num_tokens": 66248911.0, "step": 154 }, { "entropy": 0.449737548828125, "epoch": 0.6007751937984496, "grad_norm": 0.9561345172101009, "learning_rate": 1.881606242748009e-05, "loss": 0.4962, "mean_token_accuracy": 0.8330451222136617, "num_tokens": 66672337.0, "step": 155 }, { "entropy": 0.45330810546875, "epoch": 0.6046511627906976, "grad_norm": 1.0343090275971194, "learning_rate": 1.8795807989717473e-05, "loss": 0.5035, "mean_token_accuracy": 0.8295647175982594, "num_tokens": 67089851.0, "step": 156 }, { "entropy": 0.462554931640625, "epoch": 0.6085271317829457, "grad_norm": 0.8231981809906675, "learning_rate": 1.877539285777543e-05, "loss": 0.514, "mean_token_accuracy": 0.8288662061095238, "num_tokens": 67512314.0, "step": 157 }, { "entropy": 0.450164794921875, "epoch": 0.6124031007751938, "grad_norm": 1.0053067273936602, "learning_rate": 1.8754817404626275e-05, "loss": 0.5092, "mean_token_accuracy": 0.8304671561345458, "num_tokens": 67935246.0, "step": 158 }, { "entropy": 0.45330810546875, "epoch": 0.6162790697674418, "grad_norm": 1.0010500407032097, "learning_rate": 1.87340820061713e-05, "loss": 0.5173, "mean_token_accuracy": 0.8267777897417545, "num_tokens": 68349691.0, "step": 159 }, { "entropy": 0.449676513671875, "epoch": 0.6201550387596899, "grad_norm": 0.9522314009042264, "learning_rate": 1.8713187041233896e-05, "loss": 0.5141, "mean_token_accuracy": 0.8266640789806843, "num_tokens": 68790259.0, "step": 160 }, { "entropy": 0.451171875, "epoch": 0.624031007751938, "grad_norm": 1.0963807833125467, "learning_rate": 1.8692132891552644e-05, "loss": 0.5176, "mean_token_accuracy": 0.826544975861907, "num_tokens": 69228021.0, "step": 161 }, { "entropy": 0.458221435546875, "epoch": 0.627906976744186, "grad_norm": 1.0045498488288913, "learning_rate": 1.867091994177433e-05, "loss": 0.5075, "mean_token_accuracy": 0.8274988839402795, "num_tokens": 69643321.0, "step": 162 }, { "entropy": 0.44891357421875, "epoch": 0.6317829457364341, "grad_norm": 0.9622723880491996, "learning_rate": 1.8649548579446938e-05, "loss": 0.5072, "mean_token_accuracy": 0.8281608214601874, "num_tokens": 70091782.0, "step": 163 }, { "entropy": 0.450439453125, "epoch": 0.6356589147286822, "grad_norm": 0.9636809427862026, "learning_rate": 1.862801919501253e-05, "loss": 0.5006, "mean_token_accuracy": 0.8330097962170839, "num_tokens": 70515135.0, "step": 164 }, { "entropy": 0.445159912109375, "epoch": 0.6395348837209303, "grad_norm": 0.9632629012572823, "learning_rate": 1.8606332181800165e-05, "loss": 0.507, "mean_token_accuracy": 0.8288661614060402, "num_tokens": 70952827.0, "step": 165 }, { "entropy": 0.450958251953125, "epoch": 0.6434108527131783, "grad_norm": 0.9924678229906452, "learning_rate": 1.8584487936018663e-05, "loss": 0.5127, "mean_token_accuracy": 0.8285386795178056, "num_tokens": 71384968.0, "step": 166 }, { "entropy": 0.45819091796875, "epoch": 0.6472868217054264, "grad_norm": 0.9772765903059605, "learning_rate": 1.8562486856749403e-05, "loss": 0.5048, "mean_token_accuracy": 0.8292614417150617, "num_tokens": 71795936.0, "step": 167 }, { "entropy": 0.44146728515625, "epoch": 0.6511627906976745, "grad_norm": 0.9934626771494418, "learning_rate": 1.8540329345939015e-05, "loss": 0.4996, "mean_token_accuracy": 0.8288496835157275, "num_tokens": 72234579.0, "step": 168 }, { "entropy": 0.446746826171875, "epoch": 0.6550387596899225, "grad_norm": 0.9589171182214611, "learning_rate": 1.8518015808392045e-05, "loss": 0.5056, "mean_token_accuracy": 0.8287281664088368, "num_tokens": 72666870.0, "step": 169 }, { "entropy": 0.453857421875, "epoch": 0.6589147286821705, "grad_norm": 0.9087478052569928, "learning_rate": 1.849554665176354e-05, "loss": 0.5077, "mean_token_accuracy": 0.8301580296829343, "num_tokens": 73105141.0, "step": 170 }, { "entropy": 0.45330810546875, "epoch": 0.6627906976744186, "grad_norm": 1.0302730313362078, "learning_rate": 1.8472922286551633e-05, "loss": 0.5096, "mean_token_accuracy": 0.827417085878551, "num_tokens": 73535123.0, "step": 171 }, { "entropy": 0.4488525390625, "epoch": 0.6666666666666666, "grad_norm": 0.840700088225803, "learning_rate": 1.8450143126090015e-05, "loss": 0.4873, "mean_token_accuracy": 0.8343714782968163, "num_tokens": 73959285.0, "step": 172 }, { "entropy": 0.446319580078125, "epoch": 0.6705426356589147, "grad_norm": 0.9380020082651943, "learning_rate": 1.8427209586540392e-05, "loss": 0.5034, "mean_token_accuracy": 0.8296555746346712, "num_tokens": 74402105.0, "step": 173 }, { "entropy": 0.4476318359375, "epoch": 0.6744186046511628, "grad_norm": 0.9403613147062398, "learning_rate": 1.8404122086884898e-05, "loss": 0.5018, "mean_token_accuracy": 0.8316757902503014, "num_tokens": 74832515.0, "step": 174 }, { "entropy": 0.439788818359375, "epoch": 0.6782945736434108, "grad_norm": 0.8586381294647795, "learning_rate": 1.8380881048918406e-05, "loss": 0.4989, "mean_token_accuracy": 0.8328359462320805, "num_tokens": 75286480.0, "step": 175 }, { "entropy": 0.452789306640625, "epoch": 0.6821705426356589, "grad_norm": 0.8058022848926649, "learning_rate": 1.8357486897240866e-05, "loss": 0.5016, "mean_token_accuracy": 0.8294941317290068, "num_tokens": 75693389.0, "step": 176 }, { "entropy": 0.458770751953125, "epoch": 0.686046511627907, "grad_norm": 1.0045483433681783, "learning_rate": 1.83339400592495e-05, "loss": 0.4939, "mean_token_accuracy": 0.8322997633367777, "num_tokens": 76123537.0, "step": 177 }, { "entropy": 0.4451904296875, "epoch": 0.689922480620155, "grad_norm": 0.8651822682228671, "learning_rate": 1.831024096513104e-05, "loss": 0.4962, "mean_token_accuracy": 0.832309733144939, "num_tokens": 76575873.0, "step": 178 }, { "entropy": 0.45037841796875, "epoch": 0.6937984496124031, "grad_norm": 0.878788280336596, "learning_rate": 1.8286390047853835e-05, "loss": 0.4738, "mean_token_accuracy": 0.837097929790616, "num_tokens": 76979316.0, "step": 179 }, { "entropy": 0.45184326171875, "epoch": 0.6976744186046512, "grad_norm": 0.9694188623669399, "learning_rate": 1.826238774315995e-05, "loss": 0.4956, "mean_token_accuracy": 0.8334890305995941, "num_tokens": 77407125.0, "step": 180 }, { "entropy": 0.441192626953125, "epoch": 0.7015503875968992, "grad_norm": 0.8080436391493971, "learning_rate": 1.8238234489557217e-05, "loss": 0.4953, "mean_token_accuracy": 0.829826689325273, "num_tokens": 77849752.0, "step": 181 }, { "entropy": 0.446533203125, "epoch": 0.7054263565891473, "grad_norm": 1.0021744357127353, "learning_rate": 1.821393072831121e-05, "loss": 0.5097, "mean_token_accuracy": 0.8279161658138037, "num_tokens": 78281078.0, "step": 182 }, { "entropy": 0.45159912109375, "epoch": 0.7093023255813954, "grad_norm": 0.9280479231187806, "learning_rate": 1.818947690343719e-05, "loss": 0.4951, "mean_token_accuracy": 0.8309991173446178, "num_tokens": 78706365.0, "step": 183 }, { "entropy": 0.448455810546875, "epoch": 0.7131782945736435, "grad_norm": 0.9870261846497186, "learning_rate": 1.8164873461691987e-05, "loss": 0.4978, "mean_token_accuracy": 0.8303654547780752, "num_tokens": 79135421.0, "step": 184 }, { "entropy": 0.44256591796875, "epoch": 0.7170542635658915, "grad_norm": 0.9269194775003367, "learning_rate": 1.814012085256585e-05, "loss": 0.4831, "mean_token_accuracy": 0.8345851162448525, "num_tokens": 79566359.0, "step": 185 }, { "entropy": 0.450531005859375, "epoch": 0.7209302325581395, "grad_norm": 0.9698714267278364, "learning_rate": 1.811521952827422e-05, "loss": 0.4889, "mean_token_accuracy": 0.8325546151027083, "num_tokens": 79990413.0, "step": 186 }, { "entropy": 0.44677734375, "epoch": 0.7248062015503876, "grad_norm": 0.9371152240387562, "learning_rate": 1.8090169943749477e-05, "loss": 0.4935, "mean_token_accuracy": 0.8314268151298165, "num_tokens": 80408016.0, "step": 187 }, { "entropy": 0.448150634765625, "epoch": 0.7286821705426356, "grad_norm": 0.8702924632944642, "learning_rate": 1.806497255663263e-05, "loss": 0.4957, "mean_token_accuracy": 0.8298279447481036, "num_tokens": 80821220.0, "step": 188 }, { "entropy": 0.438232421875, "epoch": 0.7325581395348837, "grad_norm": 0.9905266797503475, "learning_rate": 1.8039627827264953e-05, "loss": 0.5055, "mean_token_accuracy": 0.829126013442874, "num_tokens": 81260674.0, "step": 189 }, { "entropy": 0.443389892578125, "epoch": 0.7364341085271318, "grad_norm": 0.8574532206726159, "learning_rate": 1.8014136218679566e-05, "loss": 0.4995, "mean_token_accuracy": 0.8337188037112355, "num_tokens": 81699370.0, "step": 190 }, { "entropy": 0.447723388671875, "epoch": 0.7403100775193798, "grad_norm": 0.8787803121596964, "learning_rate": 1.7988498196593007e-05, "loss": 0.5025, "mean_token_accuracy": 0.831497854553163, "num_tokens": 82126270.0, "step": 191 }, { "entropy": 0.439666748046875, "epoch": 0.7441860465116279, "grad_norm": 0.9001985159271618, "learning_rate": 1.796271422939668e-05, "loss": 0.4999, "mean_token_accuracy": 0.8317448329180479, "num_tokens": 82567340.0, "step": 192 }, { "entropy": 0.446136474609375, "epoch": 0.748062015503876, "grad_norm": 0.9164028392242967, "learning_rate": 1.793678478814833e-05, "loss": 0.5049, "mean_token_accuracy": 0.828097378835082, "num_tokens": 82992466.0, "step": 193 }, { "entropy": 0.4488525390625, "epoch": 0.751937984496124, "grad_norm": 0.8485676413423029, "learning_rate": 1.7910710346563417e-05, "loss": 0.4957, "mean_token_accuracy": 0.8310582870617509, "num_tokens": 83434886.0, "step": 194 }, { "entropy": 0.446441650390625, "epoch": 0.7558139534883721, "grad_norm": 0.9286191377343639, "learning_rate": 1.788449138100648e-05, "loss": 0.4932, "mean_token_accuracy": 0.8343960093334317, "num_tokens": 83880177.0, "step": 195 }, { "entropy": 0.4471435546875, "epoch": 0.7596899224806202, "grad_norm": 0.8796929401948946, "learning_rate": 1.7858128370482427e-05, "loss": 0.4784, "mean_token_accuracy": 0.8362671909853816, "num_tokens": 84287722.0, "step": 196 }, { "entropy": 0.435089111328125, "epoch": 0.7635658914728682, "grad_norm": 0.8294179065956871, "learning_rate": 1.7831621796627773e-05, "loss": 0.5043, "mean_token_accuracy": 0.8294558906927705, "num_tokens": 84744150.0, "step": 197 }, { "entropy": 0.4432373046875, "epoch": 0.7674418604651163, "grad_norm": 0.8675227812064625, "learning_rate": 1.7804972143701853e-05, "loss": 0.4927, "mean_token_accuracy": 0.8332074852660298, "num_tokens": 85180131.0, "step": 198 }, { "entropy": 0.44091796875, "epoch": 0.7713178294573644, "grad_norm": 0.8952478928442558, "learning_rate": 1.7778179898577973e-05, "loss": 0.4982, "mean_token_accuracy": 0.8292250717058778, "num_tokens": 85630008.0, "step": 199 }, { "entropy": 0.44366455078125, "epoch": 0.7751937984496124, "grad_norm": 0.8447485958176423, "learning_rate": 1.775124555073452e-05, "loss": 0.4868, "mean_token_accuracy": 0.8350116610527039, "num_tokens": 86037948.0, "step": 200 }, { "entropy": 0.4471435546875, "epoch": 0.7790697674418605, "grad_norm": 0.8111264675206218, "learning_rate": 1.7724169592245996e-05, "loss": 0.4847, "mean_token_accuracy": 0.836847304366529, "num_tokens": 86448906.0, "step": 201 }, { "entropy": 0.437042236328125, "epoch": 0.7829457364341085, "grad_norm": 0.887117371160646, "learning_rate": 1.769695251777406e-05, "loss": 0.4912, "mean_token_accuracy": 0.8314886456355453, "num_tokens": 86877191.0, "step": 202 }, { "entropy": 0.438079833984375, "epoch": 0.7868217054263565, "grad_norm": 0.8276242886119606, "learning_rate": 1.7669594824558474e-05, "loss": 0.4848, "mean_token_accuracy": 0.8351955693215132, "num_tokens": 87313657.0, "step": 203 }, { "entropy": 0.444000244140625, "epoch": 0.7906976744186046, "grad_norm": 0.7558616852027694, "learning_rate": 1.7642097012408013e-05, "loss": 0.4865, "mean_token_accuracy": 0.835452251136303, "num_tokens": 87736991.0, "step": 204 }, { "entropy": 0.4342041015625, "epoch": 0.7945736434108527, "grad_norm": 0.8020890652572583, "learning_rate": 1.7614459583691346e-05, "loss": 0.4813, "mean_token_accuracy": 0.8367800936102867, "num_tokens": 88175181.0, "step": 205 }, { "entropy": 0.44451904296875, "epoch": 0.7984496124031008, "grad_norm": 0.8402541788890805, "learning_rate": 1.758668304332786e-05, "loss": 0.4756, "mean_token_accuracy": 0.8371487222611904, "num_tokens": 88589103.0, "step": 206 }, { "entropy": 0.436370849609375, "epoch": 0.8023255813953488, "grad_norm": 0.7812249400484098, "learning_rate": 1.755876789877842e-05, "loss": 0.4844, "mean_token_accuracy": 0.8344348035752773, "num_tokens": 89015985.0, "step": 207 }, { "entropy": 0.44293212890625, "epoch": 0.8062015503875969, "grad_norm": 0.8424590194694084, "learning_rate": 1.7530714660036112e-05, "loss": 0.4936, "mean_token_accuracy": 0.8327551614493132, "num_tokens": 89446817.0, "step": 208 }, { "entropy": 0.444427490234375, "epoch": 0.810077519379845, "grad_norm": 0.7878206490332407, "learning_rate": 1.7502523839616916e-05, "loss": 0.5012, "mean_token_accuracy": 0.8302563494071364, "num_tokens": 89867870.0, "step": 209 }, { "entropy": 0.446258544921875, "epoch": 0.813953488372093, "grad_norm": 0.8461103209901029, "learning_rate": 1.7474195952550355e-05, "loss": 0.4935, "mean_token_accuracy": 0.8328652335330844, "num_tokens": 90321128.0, "step": 210 }, { "entropy": 0.45703125, "epoch": 0.8178294573643411, "grad_norm": 0.8655537216150097, "learning_rate": 1.744573151637007e-05, "loss": 0.4983, "mean_token_accuracy": 0.8308916166424751, "num_tokens": 90750470.0, "step": 211 }, { "entropy": 0.4500732421875, "epoch": 0.8217054263565892, "grad_norm": 0.8454143832435108, "learning_rate": 1.7417131051104382e-05, "loss": 0.475, "mean_token_accuracy": 0.8389784749597311, "num_tokens": 91185119.0, "step": 212 }, { "entropy": 0.449249267578125, "epoch": 0.8255813953488372, "grad_norm": 0.8064273191403369, "learning_rate": 1.738839507926677e-05, "loss": 0.4794, "mean_token_accuracy": 0.8374122427776456, "num_tokens": 91618372.0, "step": 213 }, { "entropy": 0.442901611328125, "epoch": 0.8294573643410853, "grad_norm": 0.8099550078779665, "learning_rate": 1.7359524125846353e-05, "loss": 0.4675, "mean_token_accuracy": 0.8417509058490396, "num_tokens": 92056592.0, "step": 214 }, { "entropy": 0.445098876953125, "epoch": 0.8333333333333334, "grad_norm": 0.871757253886241, "learning_rate": 1.7330518718298263e-05, "loss": 0.4946, "mean_token_accuracy": 0.8306501191109419, "num_tokens": 92492584.0, "step": 215 }, { "entropy": 0.437835693359375, "epoch": 0.8372093023255814, "grad_norm": 0.7581284379435382, "learning_rate": 1.7301379386534056e-05, "loss": 0.4727, "mean_token_accuracy": 0.8387518906965852, "num_tokens": 92924009.0, "step": 216 }, { "entropy": 0.438751220703125, "epoch": 0.8410852713178295, "grad_norm": 0.8993405013438669, "learning_rate": 1.7272106662911972e-05, "loss": 0.4799, "mean_token_accuracy": 0.8359887674450874, "num_tokens": 93371957.0, "step": 217 }, { "entropy": 0.452850341796875, "epoch": 0.8449612403100775, "grad_norm": 0.724479689279493, "learning_rate": 1.7242701082227275e-05, "loss": 0.4773, "mean_token_accuracy": 0.8368725245818496, "num_tokens": 93780438.0, "step": 218 }, { "entropy": 0.44256591796875, "epoch": 0.8488372093023255, "grad_norm": 0.8311485238950603, "learning_rate": 1.721316318170242e-05, "loss": 0.4933, "mean_token_accuracy": 0.8339337343350053, "num_tokens": 94225581.0, "step": 219 }, { "entropy": 0.43878173828125, "epoch": 0.8527131782945736, "grad_norm": 0.77900635570311, "learning_rate": 1.7183493500977277e-05, "loss": 0.4739, "mean_token_accuracy": 0.8379270052537322, "num_tokens": 94658592.0, "step": 220 }, { "entropy": 0.4503173828125, "epoch": 0.8565891472868217, "grad_norm": 0.7732389296731101, "learning_rate": 1.715369258209927e-05, "loss": 0.4706, "mean_token_accuracy": 0.8392182057723403, "num_tokens": 95083295.0, "step": 221 }, { "entropy": 0.445404052734375, "epoch": 0.8604651162790697, "grad_norm": 0.9115352102726256, "learning_rate": 1.712376096951345e-05, "loss": 0.4731, "mean_token_accuracy": 0.8384861033409834, "num_tokens": 95521036.0, "step": 222 }, { "entropy": 0.453369140625, "epoch": 0.8643410852713178, "grad_norm": 3.654039788029253, "learning_rate": 1.709369921005258e-05, "loss": 0.4775, "mean_token_accuracy": 0.8348365603014827, "num_tokens": 95942840.0, "step": 223 }, { "entropy": 0.445953369140625, "epoch": 0.8682170542635659, "grad_norm": 0.8876473841192518, "learning_rate": 1.7063507852927113e-05, "loss": 0.4853, "mean_token_accuracy": 0.8355412427335978, "num_tokens": 96379381.0, "step": 224 }, { "entropy": 0.44903564453125, "epoch": 0.872093023255814, "grad_norm": 0.8029328363551933, "learning_rate": 1.7033187449715195e-05, "loss": 0.491, "mean_token_accuracy": 0.8344325283542275, "num_tokens": 96810978.0, "step": 225 }, { "entropy": 0.443756103515625, "epoch": 0.875968992248062, "grad_norm": 0.7943668108623076, "learning_rate": 1.700273855435255e-05, "loss": 0.474, "mean_token_accuracy": 0.8393369819968939, "num_tokens": 97230264.0, "step": 226 }, { "entropy": 0.439453125, "epoch": 0.8798449612403101, "grad_norm": 0.8335982186316193, "learning_rate": 1.697216172312238e-05, "loss": 0.4757, "mean_token_accuracy": 0.8387380233034492, "num_tokens": 97669092.0, "step": 227 }, { "entropy": 0.438995361328125, "epoch": 0.8837209302325582, "grad_norm": 0.8313343101425054, "learning_rate": 1.6941457514645207e-05, "loss": 0.4709, "mean_token_accuracy": 0.8384785000234842, "num_tokens": 98083117.0, "step": 228 }, { "entropy": 0.440704345703125, "epoch": 0.8875968992248062, "grad_norm": 0.7332959697247916, "learning_rate": 1.691062648986865e-05, "loss": 0.4656, "mean_token_accuracy": 0.8408916248008609, "num_tokens": 98508532.0, "step": 229 }, { "entropy": 0.438232421875, "epoch": 0.8914728682170543, "grad_norm": 0.846162571369785, "learning_rate": 1.6879669212057187e-05, "loss": 0.4721, "mean_token_accuracy": 0.8412652369588614, "num_tokens": 98949168.0, "step": 230 }, { "entropy": 0.43841552734375, "epoch": 0.8953488372093024, "grad_norm": 0.7298784972709891, "learning_rate": 1.684858624678188e-05, "loss": 0.4722, "mean_token_accuracy": 0.8387791896238923, "num_tokens": 99376686.0, "step": 231 }, { "entropy": 0.44097900390625, "epoch": 0.8992248062015504, "grad_norm": 0.8091551233214312, "learning_rate": 1.6817378161909995e-05, "loss": 0.47, "mean_token_accuracy": 0.8381591122597456, "num_tokens": 99795406.0, "step": 232 }, { "entropy": 0.441375732421875, "epoch": 0.9031007751937985, "grad_norm": 0.7872364416163475, "learning_rate": 1.6786045527594693e-05, "loss": 0.4761, "mean_token_accuracy": 0.8357131062075496, "num_tokens": 100221423.0, "step": 233 }, { "entropy": 0.445556640625, "epoch": 0.9069767441860465, "grad_norm": 0.8082366968087289, "learning_rate": 1.6754588916264563e-05, "loss": 0.4641, "mean_token_accuracy": 0.840836713090539, "num_tokens": 100631261.0, "step": 234 }, { "entropy": 0.436309814453125, "epoch": 0.9108527131782945, "grad_norm": 0.7489337052822527, "learning_rate": 1.672300890261317e-05, "loss": 0.4821, "mean_token_accuracy": 0.8357611820101738, "num_tokens": 101073449.0, "step": 235 }, { "entropy": 0.44140625, "epoch": 0.9147286821705426, "grad_norm": 0.821237789529478, "learning_rate": 1.6691306063588583e-05, "loss": 0.4803, "mean_token_accuracy": 0.8347616344690323, "num_tokens": 101499902.0, "step": 236 }, { "entropy": 0.449005126953125, "epoch": 0.9186046511627907, "grad_norm": 0.8551074877141371, "learning_rate": 1.6659480978382815e-05, "loss": 0.4908, "mean_token_accuracy": 0.8317182743921876, "num_tokens": 101919468.0, "step": 237 }, { "entropy": 0.449981689453125, "epoch": 0.9224806201550387, "grad_norm": 0.783108156538897, "learning_rate": 1.662753422842123e-05, "loss": 0.4793, "mean_token_accuracy": 0.837918421253562, "num_tokens": 102351125.0, "step": 238 }, { "entropy": 0.44158935546875, "epoch": 0.9263565891472868, "grad_norm": 0.7892781334018207, "learning_rate": 1.6595466397351955e-05, "loss": 0.4738, "mean_token_accuracy": 0.8374282121658325, "num_tokens": 102794918.0, "step": 239 }, { "entropy": 0.4407958984375, "epoch": 0.9302325581395349, "grad_norm": 0.7954577060406584, "learning_rate": 1.6563278071035182e-05, "loss": 0.4771, "mean_token_accuracy": 0.8359366981312633, "num_tokens": 103244596.0, "step": 240 }, { "entropy": 0.45074462890625, "epoch": 0.9341085271317829, "grad_norm": 0.7600997638428164, "learning_rate": 1.6530969837532487e-05, "loss": 0.4725, "mean_token_accuracy": 0.8393061570823193, "num_tokens": 103694943.0, "step": 241 }, { "entropy": 0.43280029296875, "epoch": 0.937984496124031, "grad_norm": 0.798802849580159, "learning_rate": 1.6498542287096074e-05, "loss": 0.4745, "mean_token_accuracy": 0.8393577989190817, "num_tokens": 104139199.0, "step": 242 }, { "entropy": 0.439971923828125, "epoch": 0.9418604651162791, "grad_norm": 0.7448764201270436, "learning_rate": 1.6465996012157996e-05, "loss": 0.4581, "mean_token_accuracy": 0.8409141302108765, "num_tokens": 104560030.0, "step": 243 }, { "entropy": 0.4364013671875, "epoch": 0.9457364341085271, "grad_norm": 0.7890873960924092, "learning_rate": 1.6433331607319342e-05, "loss": 0.4707, "mean_token_accuracy": 0.8371709603816271, "num_tokens": 104979376.0, "step": 244 }, { "entropy": 0.44085693359375, "epoch": 0.9496124031007752, "grad_norm": 0.7542687515989888, "learning_rate": 1.640054966933935e-05, "loss": 0.4777, "mean_token_accuracy": 0.8365635378286242, "num_tokens": 105405940.0, "step": 245 }, { "entropy": 0.4351806640625, "epoch": 0.9534883720930233, "grad_norm": 0.8975716299466565, "learning_rate": 1.636765079712453e-05, "loss": 0.4617, "mean_token_accuracy": 0.8432385390624404, "num_tokens": 105819745.0, "step": 246 }, { "entropy": 0.44439697265625, "epoch": 0.9573643410852714, "grad_norm": 0.8433624087080338, "learning_rate": 1.63346355917177e-05, "loss": 0.4717, "mean_token_accuracy": 0.8381514484062791, "num_tokens": 106235149.0, "step": 247 }, { "entropy": 0.449920654296875, "epoch": 0.9612403100775194, "grad_norm": 0.791209417025907, "learning_rate": 1.6301504656287027e-05, "loss": 0.4661, "mean_token_accuracy": 0.8396106716245413, "num_tokens": 106650083.0, "step": 248 }, { "entropy": 0.4412841796875, "epoch": 0.9651162790697675, "grad_norm": 0.7804784526481979, "learning_rate": 1.626825859611499e-05, "loss": 0.4727, "mean_token_accuracy": 0.8388944864273071, "num_tokens": 107072567.0, "step": 249 }, { "entropy": 0.43536376953125, "epoch": 0.9689922480620154, "grad_norm": 0.8192426752598542, "learning_rate": 1.6234898018587336e-05, "loss": 0.4779, "mean_token_accuracy": 0.8374818284064531, "num_tokens": 107523545.0, "step": 250 }, { "entropy": 0.43695068359375, "epoch": 0.9728682170542635, "grad_norm": 0.8517630233628122, "learning_rate": 1.6201423533181965e-05, "loss": 0.4664, "mean_token_accuracy": 0.8392149573192, "num_tokens": 107958397.0, "step": 251 }, { "entropy": 0.43719482421875, "epoch": 0.9767441860465116, "grad_norm": 0.798869637694991, "learning_rate": 1.6167835751457812e-05, "loss": 0.4617, "mean_token_accuracy": 0.8421220034360886, "num_tokens": 108402895.0, "step": 252 }, { "entropy": 0.4276123046875, "epoch": 0.9806201550387597, "grad_norm": 0.7966618537088541, "learning_rate": 1.6134135287043668e-05, "loss": 0.4668, "mean_token_accuracy": 0.8377330722287297, "num_tokens": 108824927.0, "step": 253 }, { "entropy": 0.427734375, "epoch": 0.9844961240310077, "grad_norm": 0.7727490730992017, "learning_rate": 1.610032275562697e-05, "loss": 0.4765, "mean_token_accuracy": 0.8366042710840702, "num_tokens": 109261228.0, "step": 254 }, { "entropy": 0.430694580078125, "epoch": 0.9883720930232558, "grad_norm": 0.8579153905672905, "learning_rate": 1.6066398774942556e-05, "loss": 0.4736, "mean_token_accuracy": 0.8386273989453912, "num_tokens": 109702965.0, "step": 255 }, { "entropy": 0.43243408203125, "epoch": 0.9922480620155039, "grad_norm": 0.8076432536420048, "learning_rate": 1.6032363964761363e-05, "loss": 0.4821, "mean_token_accuracy": 0.8358757542446256, "num_tokens": 110136786.0, "step": 256 }, { "entropy": 0.432037353515625, "epoch": 0.9961240310077519, "grad_norm": 0.7604697931060647, "learning_rate": 1.599821894687914e-05, "loss": 0.4734, "mean_token_accuracy": 0.8369957143440843, "num_tokens": 110574894.0, "step": 257 }, { "entropy": 0.431243896484375, "epoch": 1.0, "grad_norm": 0.8716310508142131, "learning_rate": 1.5963964345105038e-05, "loss": 0.4667, "mean_token_accuracy": 0.8397450698539615, "num_tokens": 111005104.0, "step": 258 }, { "entropy": 0.440673828125, "epoch": 1.003875968992248, "grad_norm": 0.8235418343380135, "learning_rate": 1.592960078525026e-05, "loss": 0.4346, "mean_token_accuracy": 0.8489210112020373, "num_tokens": 111416103.0, "step": 259 }, { "entropy": 0.4296875, "epoch": 1.0077519379844961, "grad_norm": 0.7597015966730314, "learning_rate": 1.58951288951166e-05, "loss": 0.4534, "mean_token_accuracy": 0.8417296558618546, "num_tokens": 111861275.0, "step": 260 }, { "entropy": 0.43377685546875, "epoch": 1.0116279069767442, "grad_norm": 0.822573314981198, "learning_rate": 1.5860549304484986e-05, "loss": 0.4418, "mean_token_accuracy": 0.8477731151506305, "num_tokens": 112300406.0, "step": 261 }, { "entropy": 0.42333984375, "epoch": 1.0155038759689923, "grad_norm": 0.757456488834231, "learning_rate": 1.5825862645103962e-05, "loss": 0.4334, "mean_token_accuracy": 0.8519468028098345, "num_tokens": 112738254.0, "step": 262 }, { "entropy": 0.4271240234375, "epoch": 1.0193798449612403, "grad_norm": 0.7849441545164026, "learning_rate": 1.579106955067817e-05, "loss": 0.4239, "mean_token_accuracy": 0.8551452234387398, "num_tokens": 113165687.0, "step": 263 }, { "entropy": 0.415191650390625, "epoch": 1.0232558139534884, "grad_norm": 0.7790501279119509, "learning_rate": 1.575617065685674e-05, "loss": 0.449, "mean_token_accuracy": 0.8445991091430187, "num_tokens": 113612143.0, "step": 264 }, { "entropy": 0.415740966796875, "epoch": 1.0271317829457365, "grad_norm": 0.8396079499552707, "learning_rate": 1.5721166601221697e-05, "loss": 0.433, "mean_token_accuracy": 0.848967888392508, "num_tokens": 114055171.0, "step": 265 }, { "entropy": 0.4189453125, "epoch": 1.0310077519379846, "grad_norm": 0.7568866287481192, "learning_rate": 1.5686058023276324e-05, "loss": 0.4383, "mean_token_accuracy": 0.8488096483051777, "num_tokens": 114488971.0, "step": 266 }, { "entropy": 0.43145751953125, "epoch": 1.0348837209302326, "grad_norm": 0.7881343959390121, "learning_rate": 1.565084556443345e-05, "loss": 0.434, "mean_token_accuracy": 0.8480666261166334, "num_tokens": 114894217.0, "step": 267 }, { "entropy": 0.422027587890625, "epoch": 1.0387596899224807, "grad_norm": 0.8019993519205597, "learning_rate": 1.561552986800375e-05, "loss": 0.4333, "mean_token_accuracy": 0.8492436576634645, "num_tokens": 115316721.0, "step": 268 }, { "entropy": 0.4254150390625, "epoch": 1.0426356589147288, "grad_norm": 0.7433123503993253, "learning_rate": 1.558011157918399e-05, "loss": 0.4285, "mean_token_accuracy": 0.8507828311994672, "num_tokens": 115748926.0, "step": 269 }, { "entropy": 0.43426513671875, "epoch": 1.0465116279069768, "grad_norm": 0.7717661614873045, "learning_rate": 1.554459134504523e-05, "loss": 0.4321, "mean_token_accuracy": 0.8468701997771859, "num_tokens": 116164117.0, "step": 270 }, { "entropy": 0.425567626953125, "epoch": 1.050387596899225, "grad_norm": 0.7108056865458328, "learning_rate": 1.5508969814521026e-05, "loss": 0.4343, "mean_token_accuracy": 0.848011078312993, "num_tokens": 116600627.0, "step": 271 }, { "entropy": 0.42022705078125, "epoch": 1.054263565891473, "grad_norm": 0.7587295229376584, "learning_rate": 1.5473247638395547e-05, "loss": 0.4345, "mean_token_accuracy": 0.8474989645183086, "num_tokens": 117032055.0, "step": 272 }, { "entropy": 0.4144287109375, "epoch": 1.058139534883721, "grad_norm": 0.7514092686548347, "learning_rate": 1.54374254692917e-05, "loss": 0.4314, "mean_token_accuracy": 0.8491556569933891, "num_tokens": 117471178.0, "step": 273 }, { "entropy": 0.421173095703125, "epoch": 1.062015503875969, "grad_norm": 0.8051438236503431, "learning_rate": 1.5401503961659202e-05, "loss": 0.442, "mean_token_accuracy": 0.8469699621200562, "num_tokens": 117906370.0, "step": 274 }, { "entropy": 0.42413330078125, "epoch": 1.0658914728682172, "grad_norm": 0.7607882516236013, "learning_rate": 1.536548377176263e-05, "loss": 0.4375, "mean_token_accuracy": 0.8471976118162274, "num_tokens": 118336719.0, "step": 275 }, { "entropy": 0.427337646484375, "epoch": 1.069767441860465, "grad_norm": 0.7169629382848399, "learning_rate": 1.5329365557669427e-05, "loss": 0.4181, "mean_token_accuracy": 0.8526374585926533, "num_tokens": 118735169.0, "step": 276 }, { "entropy": 0.42352294921875, "epoch": 1.073643410852713, "grad_norm": 0.753541645601325, "learning_rate": 1.5293149979237875e-05, "loss": 0.4364, "mean_token_accuracy": 0.8475749678909779, "num_tokens": 119165398.0, "step": 277 }, { "entropy": 0.431304931640625, "epoch": 1.0775193798449612, "grad_norm": 0.7376279482610961, "learning_rate": 1.5256837698105047e-05, "loss": 0.4393, "mean_token_accuracy": 0.8488104958087206, "num_tokens": 119597939.0, "step": 278 }, { "entropy": 0.4210205078125, "epoch": 1.0813953488372092, "grad_norm": 0.707931634848308, "learning_rate": 1.5220429377674724e-05, "loss": 0.4258, "mean_token_accuracy": 0.8488586442545056, "num_tokens": 120043390.0, "step": 279 }, { "entropy": 0.426422119140625, "epoch": 1.0852713178294573, "grad_norm": 0.7629223961734812, "learning_rate": 1.5183925683105254e-05, "loss": 0.4269, "mean_token_accuracy": 0.8513823468238115, "num_tokens": 120458989.0, "step": 280 }, { "entropy": 0.426177978515625, "epoch": 1.0891472868217054, "grad_norm": 0.7895833351493825, "learning_rate": 1.5147327281297421e-05, "loss": 0.4273, "mean_token_accuracy": 0.8519961144775152, "num_tokens": 120880842.0, "step": 281 }, { "entropy": 0.4229736328125, "epoch": 1.0930232558139534, "grad_norm": 0.756105261455271, "learning_rate": 1.5110634840882258e-05, "loss": 0.4262, "mean_token_accuracy": 0.8532844102010131, "num_tokens": 121310709.0, "step": 282 }, { "entropy": 0.416046142578125, "epoch": 1.0968992248062015, "grad_norm": 0.7357203616832968, "learning_rate": 1.5073849032208823e-05, "loss": 0.4352, "mean_token_accuracy": 0.8488708259537816, "num_tokens": 121755899.0, "step": 283 }, { "entropy": 0.4189453125, "epoch": 1.1007751937984496, "grad_norm": 0.7677030953711179, "learning_rate": 1.5036970527331955e-05, "loss": 0.4194, "mean_token_accuracy": 0.8525788122788072, "num_tokens": 122197116.0, "step": 284 }, { "entropy": 0.430389404296875, "epoch": 1.1046511627906976, "grad_norm": 0.7113451501233499, "learning_rate": 1.5000000000000002e-05, "loss": 0.4326, "mean_token_accuracy": 0.8495708471164107, "num_tokens": 122612155.0, "step": 285 }, { "entropy": 0.41802978515625, "epoch": 1.1085271317829457, "grad_norm": 0.7164644043332025, "learning_rate": 1.4962938125642504e-05, "loss": 0.4309, "mean_token_accuracy": 0.851562624797225, "num_tokens": 123055956.0, "step": 286 }, { "entropy": 0.42437744140625, "epoch": 1.1124031007751938, "grad_norm": 0.78372528910054, "learning_rate": 1.4925785581357852e-05, "loss": 0.4279, "mean_token_accuracy": 0.8492834325879812, "num_tokens": 123478026.0, "step": 287 }, { "entropy": 0.421844482421875, "epoch": 1.1162790697674418, "grad_norm": 0.6861426810283858, "learning_rate": 1.4888543045900938e-05, "loss": 0.4465, "mean_token_accuracy": 0.8457562746480107, "num_tokens": 123922207.0, "step": 288 }, { "entropy": 0.427825927734375, "epoch": 1.12015503875969, "grad_norm": 0.6763098044720579, "learning_rate": 1.485121119967072e-05, "loss": 0.432, "mean_token_accuracy": 0.8471246156841516, "num_tokens": 124362582.0, "step": 289 }, { "entropy": 0.42303466796875, "epoch": 1.124031007751938, "grad_norm": 0.7290088686941348, "learning_rate": 1.4813790724697832e-05, "loss": 0.4495, "mean_token_accuracy": 0.845588430762291, "num_tokens": 124806716.0, "step": 290 }, { "entropy": 0.4210205078125, "epoch": 1.127906976744186, "grad_norm": 0.6888112492696885, "learning_rate": 1.4776282304632078e-05, "loss": 0.4378, "mean_token_accuracy": 0.8481010003015399, "num_tokens": 125233523.0, "step": 291 }, { "entropy": 0.42071533203125, "epoch": 1.1317829457364341, "grad_norm": 0.698153653677859, "learning_rate": 1.4738686624729987e-05, "loss": 0.4307, "mean_token_accuracy": 0.8496177345514297, "num_tokens": 125679583.0, "step": 292 }, { "entropy": 0.417327880859375, "epoch": 1.1356589147286822, "grad_norm": 0.7783000500288277, "learning_rate": 1.4701004371842264e-05, "loss": 0.4179, "mean_token_accuracy": 0.8545960262417793, "num_tokens": 126126243.0, "step": 293 }, { "entropy": 0.417816162109375, "epoch": 1.1395348837209303, "grad_norm": 0.7409730122451095, "learning_rate": 1.4663236234401253e-05, "loss": 0.4362, "mean_token_accuracy": 0.8477563932538033, "num_tokens": 126566551.0, "step": 294 }, { "entropy": 0.418304443359375, "epoch": 1.1434108527131783, "grad_norm": 0.7021480114616296, "learning_rate": 1.4625382902408356e-05, "loss": 0.4321, "mean_token_accuracy": 0.8495519608259201, "num_tokens": 127004749.0, "step": 295 }, { "entropy": 0.415771484375, "epoch": 1.1472868217054264, "grad_norm": 0.7175857859009526, "learning_rate": 1.4587445067421429e-05, "loss": 0.4342, "mean_token_accuracy": 0.8500601844862103, "num_tokens": 127436381.0, "step": 296 }, { "entropy": 0.420257568359375, "epoch": 1.1511627906976745, "grad_norm": 0.7229560006972757, "learning_rate": 1.4549423422542148e-05, "loss": 0.4157, "mean_token_accuracy": 0.8538505714386702, "num_tokens": 127851171.0, "step": 297 }, { "entropy": 0.421539306640625, "epoch": 1.1550387596899225, "grad_norm": 0.6985006915852251, "learning_rate": 1.4511318662403347e-05, "loss": 0.4274, "mean_token_accuracy": 0.8511299481615424, "num_tokens": 128268747.0, "step": 298 }, { "entropy": 0.417510986328125, "epoch": 1.1589147286821706, "grad_norm": 0.7072433737368383, "learning_rate": 1.4473131483156326e-05, "loss": 0.4361, "mean_token_accuracy": 0.8500646986067295, "num_tokens": 128707759.0, "step": 299 }, { "entropy": 0.423553466796875, "epoch": 1.1627906976744187, "grad_norm": 0.7755469923778686, "learning_rate": 1.4434862582458136e-05, "loss": 0.4416, "mean_token_accuracy": 0.8463389156386256, "num_tokens": 129142557.0, "step": 300 }, { "entropy": 0.421630859375, "epoch": 1.1666666666666667, "grad_norm": 0.7155794871533206, "learning_rate": 1.4396512659458824e-05, "loss": 0.4214, "mean_token_accuracy": 0.8518092129379511, "num_tokens": 129570205.0, "step": 301 }, { "entropy": 0.4237060546875, "epoch": 1.1705426356589148, "grad_norm": 0.7896631453072354, "learning_rate": 1.4358082414788666e-05, "loss": 0.4254, "mean_token_accuracy": 0.8514489009976387, "num_tokens": 129975368.0, "step": 302 }, { "entropy": 0.41925048828125, "epoch": 1.1744186046511629, "grad_norm": 0.7703803004356109, "learning_rate": 1.4319572550545374e-05, "loss": 0.4283, "mean_token_accuracy": 0.8518984178081155, "num_tokens": 130408915.0, "step": 303 }, { "entropy": 0.41986083984375, "epoch": 1.178294573643411, "grad_norm": 0.7534066489126126, "learning_rate": 1.4280983770281258e-05, "loss": 0.4161, "mean_token_accuracy": 0.8520697662606835, "num_tokens": 130831126.0, "step": 304 }, { "entropy": 0.410888671875, "epoch": 1.1821705426356588, "grad_norm": 0.781352178713833, "learning_rate": 1.4242316778990373e-05, "loss": 0.436, "mean_token_accuracy": 0.8507404867559671, "num_tokens": 131263278.0, "step": 305 }, { "entropy": 0.4180908203125, "epoch": 1.1860465116279069, "grad_norm": 0.8259033580922711, "learning_rate": 1.4203572283095657e-05, "loss": 0.4328, "mean_token_accuracy": 0.8494941433891654, "num_tokens": 131692010.0, "step": 306 }, { "entropy": 0.418975830078125, "epoch": 1.189922480620155, "grad_norm": 0.7865591711742697, "learning_rate": 1.4164750990435991e-05, "loss": 0.4349, "mean_token_accuracy": 0.8495086506009102, "num_tokens": 132134463.0, "step": 307 }, { "entropy": 0.4202880859375, "epoch": 1.193798449612403, "grad_norm": 0.731714021499699, "learning_rate": 1.4125853610253306e-05, "loss": 0.4295, "mean_token_accuracy": 0.850966832600534, "num_tokens": 132567775.0, "step": 308 }, { "entropy": 0.4229736328125, "epoch": 1.197674418604651, "grad_norm": 0.8413435296703591, "learning_rate": 1.4086880853179592e-05, "loss": 0.4306, "mean_token_accuracy": 0.8497058739885688, "num_tokens": 133000302.0, "step": 309 }, { "entropy": 0.426361083984375, "epoch": 1.2015503875968991, "grad_norm": 0.8120752444730676, "learning_rate": 1.4047833431223938e-05, "loss": 0.4334, "mean_token_accuracy": 0.8482353119179606, "num_tokens": 133422180.0, "step": 310 }, { "entropy": 0.416290283203125, "epoch": 1.2054263565891472, "grad_norm": 0.7944571026325243, "learning_rate": 1.4008712057759519e-05, "loss": 0.4295, "mean_token_accuracy": 0.8492019288241863, "num_tokens": 133866767.0, "step": 311 }, { "entropy": 0.416961669921875, "epoch": 1.2093023255813953, "grad_norm": 0.7212454482113841, "learning_rate": 1.3969517447510546e-05, "loss": 0.4333, "mean_token_accuracy": 0.847635168582201, "num_tokens": 134323563.0, "step": 312 }, { "entropy": 0.422698974609375, "epoch": 1.2131782945736433, "grad_norm": 0.8453314046957682, "learning_rate": 1.3930250316539237e-05, "loss": 0.4474, "mean_token_accuracy": 0.8459825245663524, "num_tokens": 134759823.0, "step": 313 }, { "entropy": 0.41668701171875, "epoch": 1.2170542635658914, "grad_norm": 0.7480620726932977, "learning_rate": 1.3890911382232717e-05, "loss": 0.4047, "mean_token_accuracy": 0.8580910852178931, "num_tokens": 135186114.0, "step": 314 }, { "entropy": 0.427459716796875, "epoch": 1.2209302325581395, "grad_norm": 0.7821854419992171, "learning_rate": 1.3851501363289907e-05, "loss": 0.4303, "mean_token_accuracy": 0.8487546825781465, "num_tokens": 135610201.0, "step": 315 }, { "entropy": 0.425079345703125, "epoch": 1.2248062015503876, "grad_norm": 0.7421312963343092, "learning_rate": 1.3812020979708418e-05, "loss": 0.4238, "mean_token_accuracy": 0.8516996335238218, "num_tokens": 136031193.0, "step": 316 }, { "entropy": 0.425933837890625, "epoch": 1.2286821705426356, "grad_norm": 0.6956434644749807, "learning_rate": 1.3772470952771364e-05, "loss": 0.4199, "mean_token_accuracy": 0.8497325489297509, "num_tokens": 136451557.0, "step": 317 }, { "entropy": 0.423492431640625, "epoch": 1.2325581395348837, "grad_norm": 0.7325561302834672, "learning_rate": 1.3732852005034212e-05, "loss": 0.4222, "mean_token_accuracy": 0.8517925990745425, "num_tokens": 136877337.0, "step": 318 }, { "entropy": 0.415557861328125, "epoch": 1.2364341085271318, "grad_norm": 0.7118813272913093, "learning_rate": 1.3693164860311565e-05, "loss": 0.4218, "mean_token_accuracy": 0.8540134867653251, "num_tokens": 137321216.0, "step": 319 }, { "entropy": 0.418212890625, "epoch": 1.2403100775193798, "grad_norm": 0.7988600048796505, "learning_rate": 1.3653410243663953e-05, "loss": 0.4321, "mean_token_accuracy": 0.8492146460339427, "num_tokens": 137772473.0, "step": 320 }, { "entropy": 0.423370361328125, "epoch": 1.244186046511628, "grad_norm": 0.7627685472813416, "learning_rate": 1.3613588881384565e-05, "loss": 0.4234, "mean_token_accuracy": 0.8511487627401948, "num_tokens": 138194574.0, "step": 321 }, { "entropy": 0.43115234375, "epoch": 1.248062015503876, "grad_norm": 0.7255243137072226, "learning_rate": 1.3573701500986012e-05, "loss": 0.4109, "mean_token_accuracy": 0.8555542044341564, "num_tokens": 138603729.0, "step": 322 }, { "entropy": 0.42938232421875, "epoch": 1.251937984496124, "grad_norm": 0.720487935434389, "learning_rate": 1.3533748831186992e-05, "loss": 0.4273, "mean_token_accuracy": 0.8499365914613008, "num_tokens": 139009030.0, "step": 323 }, { "entropy": 0.425750732421875, "epoch": 1.255813953488372, "grad_norm": 0.7319411749735341, "learning_rate": 1.3493731601899023e-05, "loss": 0.4205, "mean_token_accuracy": 0.8510079709812999, "num_tokens": 139433412.0, "step": 324 }, { "entropy": 0.42303466796875, "epoch": 1.2596899224806202, "grad_norm": 0.7403000017923702, "learning_rate": 1.3453650544213078e-05, "loss": 0.4102, "mean_token_accuracy": 0.8584857322275639, "num_tokens": 139845008.0, "step": 325 }, { "entropy": 0.428375244140625, "epoch": 1.2635658914728682, "grad_norm": 0.7414567866770928, "learning_rate": 1.3413506390386233e-05, "loss": 0.4238, "mean_token_accuracy": 0.8523871023207903, "num_tokens": 140279966.0, "step": 326 }, { "entropy": 0.42449951171875, "epoch": 1.2674418604651163, "grad_norm": 0.706884408701634, "learning_rate": 1.3373299873828303e-05, "loss": 0.4064, "mean_token_accuracy": 0.8562332447618246, "num_tokens": 140702025.0, "step": 327 }, { "entropy": 0.42193603515625, "epoch": 1.2713178294573644, "grad_norm": 0.7172204794469905, "learning_rate": 1.333303172908842e-05, "loss": 0.4188, "mean_token_accuracy": 0.8553838301450014, "num_tokens": 141123133.0, "step": 328 }, { "entropy": 0.42510986328125, "epoch": 1.2751937984496124, "grad_norm": 0.7709096687308097, "learning_rate": 1.3292702691841637e-05, "loss": 0.4302, "mean_token_accuracy": 0.8494652854278684, "num_tokens": 141553515.0, "step": 329 }, { "entropy": 0.430084228515625, "epoch": 1.2790697674418605, "grad_norm": 0.7368366945349477, "learning_rate": 1.3252313498875473e-05, "loss": 0.4287, "mean_token_accuracy": 0.8533294908702374, "num_tokens": 141974216.0, "step": 330 }, { "entropy": 0.4251708984375, "epoch": 1.2829457364341086, "grad_norm": 0.7185600809351362, "learning_rate": 1.3211864888076458e-05, "loss": 0.4353, "mean_token_accuracy": 0.849311051890254, "num_tokens": 142392154.0, "step": 331 }, { "entropy": 0.430389404296875, "epoch": 1.2868217054263567, "grad_norm": 0.7422111929493115, "learning_rate": 1.3171357598416642e-05, "loss": 0.4162, "mean_token_accuracy": 0.8547528050839901, "num_tokens": 142813018.0, "step": 332 }, { "entropy": 0.424835205078125, "epoch": 1.2906976744186047, "grad_norm": 0.7133086085038155, "learning_rate": 1.313079236994012e-05, "loss": 0.4262, "mean_token_accuracy": 0.8504309616982937, "num_tokens": 143242752.0, "step": 333 }, { "entropy": 0.423187255859375, "epoch": 1.2945736434108528, "grad_norm": 0.6789452214744285, "learning_rate": 1.3090169943749475e-05, "loss": 0.4086, "mean_token_accuracy": 0.8557021962478757, "num_tokens": 143688265.0, "step": 334 }, { "entropy": 0.424835205078125, "epoch": 1.2984496124031009, "grad_norm": 0.6735243128953131, "learning_rate": 1.3049491061992274e-05, "loss": 0.4102, "mean_token_accuracy": 0.8555810833349824, "num_tokens": 144113299.0, "step": 335 }, { "entropy": 0.425140380859375, "epoch": 1.302325581395349, "grad_norm": 0.6878761206422891, "learning_rate": 1.3008756467847486e-05, "loss": 0.4102, "mean_token_accuracy": 0.8577935267239809, "num_tokens": 144547082.0, "step": 336 }, { "entropy": 0.4267578125, "epoch": 1.306201550387597, "grad_norm": 0.735956052863927, "learning_rate": 1.2967966905511906e-05, "loss": 0.4171, "mean_token_accuracy": 0.8548558866605163, "num_tokens": 144962488.0, "step": 337 }, { "entropy": 0.416900634765625, "epoch": 1.310077519379845, "grad_norm": 0.6868378061745052, "learning_rate": 1.2927123120186584e-05, "loss": 0.4103, "mean_token_accuracy": 0.8534078542143106, "num_tokens": 145397748.0, "step": 338 }, { "entropy": 0.421356201171875, "epoch": 1.3139534883720931, "grad_norm": 0.7224003448379767, "learning_rate": 1.2886225858063175e-05, "loss": 0.4273, "mean_token_accuracy": 0.8498321361839771, "num_tokens": 145835597.0, "step": 339 }, { "entropy": 0.421783447265625, "epoch": 1.3178294573643412, "grad_norm": 0.6466091718297071, "learning_rate": 1.2845275866310325e-05, "loss": 0.417, "mean_token_accuracy": 0.8557677045464516, "num_tokens": 146284089.0, "step": 340 }, { "entropy": 0.42236328125, "epoch": 1.3217054263565893, "grad_norm": 0.7153551768847246, "learning_rate": 1.2804273893060028e-05, "loss": 0.4207, "mean_token_accuracy": 0.8512698579579592, "num_tokens": 146739438.0, "step": 341 }, { "entropy": 0.429473876953125, "epoch": 1.3255813953488373, "grad_norm": 0.6659058031600372, "learning_rate": 1.2763220687393942e-05, "loss": 0.417, "mean_token_accuracy": 0.8553942264989018, "num_tokens": 147142580.0, "step": 342 }, { "entropy": 0.424896240234375, "epoch": 1.3294573643410852, "grad_norm": 0.6617647509435118, "learning_rate": 1.2722116999329712e-05, "loss": 0.4164, "mean_token_accuracy": 0.8552069365978241, "num_tokens": 147565462.0, "step": 343 }, { "entropy": 0.42559814453125, "epoch": 1.3333333333333333, "grad_norm": 0.6976673140780362, "learning_rate": 1.2680963579807268e-05, "loss": 0.4194, "mean_token_accuracy": 0.8545995801687241, "num_tokens": 148000343.0, "step": 344 }, { "entropy": 0.426544189453125, "epoch": 1.3372093023255813, "grad_norm": 0.6976500715660131, "learning_rate": 1.2639761180675098e-05, "loss": 0.4193, "mean_token_accuracy": 0.8548473976552486, "num_tokens": 148450057.0, "step": 345 }, { "entropy": 0.4208984375, "epoch": 1.3410852713178294, "grad_norm": 0.6998064472558588, "learning_rate": 1.259851055467653e-05, "loss": 0.4253, "mean_token_accuracy": 0.8499715300276875, "num_tokens": 148898991.0, "step": 346 }, { "entropy": 0.4200439453125, "epoch": 1.3449612403100775, "grad_norm": 0.6567142053816986, "learning_rate": 1.2557212455435958e-05, "loss": 0.4107, "mean_token_accuracy": 0.8560676537454128, "num_tokens": 149332404.0, "step": 347 }, { "entropy": 0.428253173828125, "epoch": 1.3488372093023255, "grad_norm": 0.6246728679193004, "learning_rate": 1.2515867637445088e-05, "loss": 0.4091, "mean_token_accuracy": 0.855487022548914, "num_tokens": 149739226.0, "step": 348 }, { "entropy": 0.42034912109375, "epoch": 1.3527131782945736, "grad_norm": 0.6872162569734706, "learning_rate": 1.2474476856049145e-05, "loss": 0.4161, "mean_token_accuracy": 0.8547687204554677, "num_tokens": 150201075.0, "step": 349 }, { "entropy": 0.419647216796875, "epoch": 1.3565891472868217, "grad_norm": 0.6444775302707234, "learning_rate": 1.2433040867433087e-05, "loss": 0.4294, "mean_token_accuracy": 0.8485890505835414, "num_tokens": 150646168.0, "step": 350 }, { "entropy": 0.425750732421875, "epoch": 1.3604651162790697, "grad_norm": 0.6872503649025473, "learning_rate": 1.2391560428607776e-05, "loss": 0.4113, "mean_token_accuracy": 0.8564107986167073, "num_tokens": 151063971.0, "step": 351 }, { "entropy": 0.420989990234375, "epoch": 1.3643410852713178, "grad_norm": 0.7340706794906026, "learning_rate": 1.2350036297396153e-05, "loss": 0.4216, "mean_token_accuracy": 0.8534470964223146, "num_tokens": 151491366.0, "step": 352 }, { "entropy": 0.420074462890625, "epoch": 1.3682170542635659, "grad_norm": 0.6810651675173917, "learning_rate": 1.2308469232419387e-05, "loss": 0.4149, "mean_token_accuracy": 0.8558577708899975, "num_tokens": 151931328.0, "step": 353 }, { "entropy": 0.410614013671875, "epoch": 1.372093023255814, "grad_norm": 0.6681093420635835, "learning_rate": 1.2266859993083037e-05, "loss": 0.4104, "mean_token_accuracy": 0.8580976203083992, "num_tokens": 152372760.0, "step": 354 }, { "entropy": 0.419769287109375, "epoch": 1.375968992248062, "grad_norm": 0.7329839399462965, "learning_rate": 1.2225209339563144e-05, "loss": 0.4064, "mean_token_accuracy": 0.857388780452311, "num_tokens": 152786975.0, "step": 355 }, { "entropy": 0.426300048828125, "epoch": 1.37984496124031, "grad_norm": 0.7332044687155054, "learning_rate": 1.2183518032792376e-05, "loss": 0.41, "mean_token_accuracy": 0.8559472924098372, "num_tokens": 153199532.0, "step": 356 }, { "entropy": 0.42572021484375, "epoch": 1.3837209302325582, "grad_norm": 0.6799405186287775, "learning_rate": 1.2141786834446105e-05, "loss": 0.4188, "mean_token_accuracy": 0.8518085544928908, "num_tokens": 153632347.0, "step": 357 }, { "entropy": 0.426361083984375, "epoch": 1.3875968992248062, "grad_norm": 0.6449750458569437, "learning_rate": 1.2100016506928494e-05, "loss": 0.4199, "mean_token_accuracy": 0.8516499819234014, "num_tokens": 154065344.0, "step": 358 }, { "entropy": 0.42437744140625, "epoch": 1.3914728682170543, "grad_norm": 0.6763480475833509, "learning_rate": 1.2058207813358587e-05, "loss": 0.4083, "mean_token_accuracy": 0.8591054771095514, "num_tokens": 154482988.0, "step": 359 }, { "entropy": 0.41851806640625, "epoch": 1.3953488372093024, "grad_norm": 0.7346435605404793, "learning_rate": 1.2016361517556334e-05, "loss": 0.4244, "mean_token_accuracy": 0.8513177242130041, "num_tokens": 154920519.0, "step": 360 }, { "entropy": 0.42596435546875, "epoch": 1.3992248062015504, "grad_norm": 0.6715875218591029, "learning_rate": 1.1974478384028672e-05, "loss": 0.4163, "mean_token_accuracy": 0.8543539261445403, "num_tokens": 155349717.0, "step": 361 }, { "entropy": 0.4283447265625, "epoch": 1.4031007751937985, "grad_norm": 0.6724081825580828, "learning_rate": 1.1932559177955533e-05, "loss": 0.4079, "mean_token_accuracy": 0.8556110095232725, "num_tokens": 155763526.0, "step": 362 }, { "entropy": 0.420196533203125, "epoch": 1.4069767441860466, "grad_norm": 0.6752640946014242, "learning_rate": 1.1890604665175878e-05, "loss": 0.4217, "mean_token_accuracy": 0.8532299809157848, "num_tokens": 156181266.0, "step": 363 }, { "entropy": 0.421661376953125, "epoch": 1.4108527131782946, "grad_norm": 0.6672888444227608, "learning_rate": 1.1848615612173689e-05, "loss": 0.4116, "mean_token_accuracy": 0.8545421287417412, "num_tokens": 156600191.0, "step": 364 }, { "entropy": 0.42340087890625, "epoch": 1.4147286821705427, "grad_norm": 0.6532653684313114, "learning_rate": 1.1806592786063991e-05, "loss": 0.4064, "mean_token_accuracy": 0.8569002998992801, "num_tokens": 157038509.0, "step": 365 }, { "entropy": 0.412811279296875, "epoch": 1.4186046511627908, "grad_norm": 0.6533143053038322, "learning_rate": 1.1764536954578817e-05, "loss": 0.409, "mean_token_accuracy": 0.8555587902665138, "num_tokens": 157468885.0, "step": 366 }, { "entropy": 0.414794921875, "epoch": 1.4224806201550386, "grad_norm": 0.626757449703669, "learning_rate": 1.172244888605319e-05, "loss": 0.4028, "mean_token_accuracy": 0.8573278188705444, "num_tokens": 157897065.0, "step": 367 }, { "entropy": 0.41864013671875, "epoch": 1.4263565891472867, "grad_norm": 0.6747328666294428, "learning_rate": 1.1680329349411086e-05, "loss": 0.4079, "mean_token_accuracy": 0.8546053608879447, "num_tokens": 158329094.0, "step": 368 }, { "entropy": 0.418060302734375, "epoch": 1.4302325581395348, "grad_norm": 0.6724341191502806, "learning_rate": 1.1638179114151378e-05, "loss": 0.4238, "mean_token_accuracy": 0.8480729442089796, "num_tokens": 158773650.0, "step": 369 }, { "entropy": 0.41351318359375, "epoch": 1.4341085271317828, "grad_norm": 0.6191751963920147, "learning_rate": 1.1595998950333794e-05, "loss": 0.4078, "mean_token_accuracy": 0.8567248536273837, "num_tokens": 159212465.0, "step": 370 }, { "entropy": 0.422271728515625, "epoch": 1.437984496124031, "grad_norm": 0.662553873559576, "learning_rate": 1.1553789628564832e-05, "loss": 0.4066, "mean_token_accuracy": 0.8549947030842304, "num_tokens": 159612271.0, "step": 371 }, { "entropy": 0.415802001953125, "epoch": 1.441860465116279, "grad_norm": 0.6596280602581531, "learning_rate": 1.151155191998369e-05, "loss": 0.418, "mean_token_accuracy": 0.8543671853840351, "num_tokens": 160056420.0, "step": 372 }, { "entropy": 0.41851806640625, "epoch": 1.445736434108527, "grad_norm": 0.718746454796938, "learning_rate": 1.1469286596248181e-05, "loss": 0.4176, "mean_token_accuracy": 0.8542827153578401, "num_tokens": 160475081.0, "step": 373 }, { "entropy": 0.41009521484375, "epoch": 1.449612403100775, "grad_norm": 0.652059758876679, "learning_rate": 1.1426994429520622e-05, "loss": 0.4093, "mean_token_accuracy": 0.8568979185074568, "num_tokens": 160908789.0, "step": 374 }, { "entropy": 0.41717529296875, "epoch": 1.4534883720930232, "grad_norm": 0.6430873724176981, "learning_rate": 1.138467619245374e-05, "loss": 0.4092, "mean_token_accuracy": 0.8570778556168079, "num_tokens": 161339820.0, "step": 375 }, { "entropy": 0.413787841796875, "epoch": 1.4573643410852712, "grad_norm": 0.7222965583263963, "learning_rate": 1.1342332658176556e-05, "loss": 0.4184, "mean_token_accuracy": 0.8542719716206193, "num_tokens": 161777346.0, "step": 376 }, { "entropy": 0.419158935546875, "epoch": 1.4612403100775193, "grad_norm": 0.6609105273639142, "learning_rate": 1.1299964600280247e-05, "loss": 0.3931, "mean_token_accuracy": 0.861978692933917, "num_tokens": 162199267.0, "step": 377 }, { "entropy": 0.40478515625, "epoch": 1.4651162790697674, "grad_norm": 0.6412126135519434, "learning_rate": 1.1257572792804028e-05, "loss": 0.4063, "mean_token_accuracy": 0.8554513454437256, "num_tokens": 162633923.0, "step": 378 }, { "entropy": 0.41717529296875, "epoch": 1.4689922480620154, "grad_norm": 0.6876116007476988, "learning_rate": 1.1215158010221005e-05, "loss": 0.4069, "mean_token_accuracy": 0.8563102921471, "num_tokens": 163043237.0, "step": 379 }, { "entropy": 0.4061279296875, "epoch": 1.4728682170542635, "grad_norm": 0.6428579499395306, "learning_rate": 1.1172721027424021e-05, "loss": 0.4079, "mean_token_accuracy": 0.858618251979351, "num_tokens": 163486291.0, "step": 380 }, { "entropy": 0.419342041015625, "epoch": 1.4767441860465116, "grad_norm": 0.687187689490725, "learning_rate": 1.1130262619711505e-05, "loss": 0.4093, "mean_token_accuracy": 0.8567131292074919, "num_tokens": 163906930.0, "step": 381 }, { "entropy": 0.409820556640625, "epoch": 1.4806201550387597, "grad_norm": 0.6756942959039264, "learning_rate": 1.108778356277331e-05, "loss": 0.4137, "mean_token_accuracy": 0.8561842953786254, "num_tokens": 164346184.0, "step": 382 }, { "entropy": 0.416900634765625, "epoch": 1.4844961240310077, "grad_norm": 0.678878649649754, "learning_rate": 1.1045284632676535e-05, "loss": 0.4142, "mean_token_accuracy": 0.8553884662687778, "num_tokens": 164758508.0, "step": 383 }, { "entropy": 0.41632080078125, "epoch": 1.4883720930232558, "grad_norm": 0.697760524956994, "learning_rate": 1.1002766605851353e-05, "loss": 0.4044, "mean_token_accuracy": 0.8585758162662387, "num_tokens": 165177286.0, "step": 384 }, { "entropy": 0.41912841796875, "epoch": 1.4922480620155039, "grad_norm": 0.6519093686870057, "learning_rate": 1.0960230259076819e-05, "loss": 0.3998, "mean_token_accuracy": 0.859433357603848, "num_tokens": 165601852.0, "step": 385 }, { "entropy": 0.40447998046875, "epoch": 1.496124031007752, "grad_norm": 0.6543050312276183, "learning_rate": 1.0917676369466683e-05, "loss": 0.4067, "mean_token_accuracy": 0.8571837488561869, "num_tokens": 166030463.0, "step": 386 }, { "entropy": 0.41229248046875, "epoch": 1.5, "grad_norm": 0.7035556193327832, "learning_rate": 1.0875105714455193e-05, "loss": 0.4074, "mean_token_accuracy": 0.8557658046483994, "num_tokens": 166444670.0, "step": 387 }, { "entropy": 0.41241455078125, "epoch": 1.503875968992248, "grad_norm": 0.6563472796443234, "learning_rate": 1.0832519071782895e-05, "loss": 0.4055, "mean_token_accuracy": 0.8547043697908521, "num_tokens": 166872503.0, "step": 388 }, { "entropy": 0.401947021484375, "epoch": 1.5077519379844961, "grad_norm": 0.709679816736362, "learning_rate": 1.0789917219482413e-05, "loss": 0.4073, "mean_token_accuracy": 0.8544203815981746, "num_tokens": 167320746.0, "step": 389 }, { "entropy": 0.410614013671875, "epoch": 1.5116279069767442, "grad_norm": 0.6541389657979753, "learning_rate": 1.0747300935864245e-05, "loss": 0.4019, "mean_token_accuracy": 0.8587576989084482, "num_tokens": 167759677.0, "step": 390 }, { "entropy": 0.41009521484375, "epoch": 1.5155038759689923, "grad_norm": 0.667881183522671, "learning_rate": 1.070467099950254e-05, "loss": 0.4053, "mean_token_accuracy": 0.8573957877233624, "num_tokens": 168199322.0, "step": 391 }, { "entropy": 0.407257080078125, "epoch": 1.5193798449612403, "grad_norm": 0.6633613364086678, "learning_rate": 1.0662028189220876e-05, "loss": 0.4057, "mean_token_accuracy": 0.8579470701515675, "num_tokens": 168640945.0, "step": 392 }, { "entropy": 0.412139892578125, "epoch": 1.5232558139534884, "grad_norm": 0.6831773547333138, "learning_rate": 1.0619373284078032e-05, "loss": 0.4084, "mean_token_accuracy": 0.8582353731617332, "num_tokens": 169064555.0, "step": 393 }, { "entropy": 0.4107666015625, "epoch": 1.5271317829457365, "grad_norm": 0.6948469884270619, "learning_rate": 1.0576707063353745e-05, "loss": 0.4125, "mean_token_accuracy": 0.8543423097580671, "num_tokens": 169485377.0, "step": 394 }, { "entropy": 0.414642333984375, "epoch": 1.5310077519379846, "grad_norm": 0.6926922523877129, "learning_rate": 1.0534030306534491e-05, "loss": 0.4149, "mean_token_accuracy": 0.8544100457802415, "num_tokens": 169927287.0, "step": 395 }, { "entropy": 0.413848876953125, "epoch": 1.5348837209302326, "grad_norm": 0.6813641145842185, "learning_rate": 1.0491343793299225e-05, "loss": 0.4093, "mean_token_accuracy": 0.8580722212791443, "num_tokens": 170354986.0, "step": 396 }, { "entropy": 0.417449951171875, "epoch": 1.5387596899224807, "grad_norm": 0.6807712850448645, "learning_rate": 1.044864830350515e-05, "loss": 0.4133, "mean_token_accuracy": 0.8556122053414583, "num_tokens": 170779478.0, "step": 397 }, { "entropy": 0.4117431640625, "epoch": 1.5426356589147288, "grad_norm": 0.6651144480747045, "learning_rate": 1.040594461717347e-05, "loss": 0.4043, "mean_token_accuracy": 0.8557463986799121, "num_tokens": 171202689.0, "step": 398 }, { "entropy": 0.409576416015625, "epoch": 1.5465116279069768, "grad_norm": 0.680059415481525, "learning_rate": 1.0363233514475121e-05, "loss": 0.4093, "mean_token_accuracy": 0.8574331281706691, "num_tokens": 171652048.0, "step": 399 }, { "entropy": 0.418182373046875, "epoch": 1.550387596899225, "grad_norm": 0.7290148760854265, "learning_rate": 1.0320515775716556e-05, "loss": 0.3907, "mean_token_accuracy": 0.8639104012399912, "num_tokens": 172075445.0, "step": 400 }, { "entropy": 0.405548095703125, "epoch": 1.554263565891473, "grad_norm": 0.6573849244620844, "learning_rate": 1.027779218132543e-05, "loss": 0.4076, "mean_token_accuracy": 0.8560699671506882, "num_tokens": 172531164.0, "step": 401 }, { "entropy": 0.415771484375, "epoch": 1.558139534883721, "grad_norm": 0.6954008019614183, "learning_rate": 1.0235063511836416e-05, "loss": 0.4133, "mean_token_accuracy": 0.8537066699936986, "num_tokens": 172962665.0, "step": 402 }, { "entropy": 0.415863037109375, "epoch": 1.562015503875969, "grad_norm": 0.6562796726655113, "learning_rate": 1.0192330547876871e-05, "loss": 0.409, "mean_token_accuracy": 0.8552350932732224, "num_tokens": 173377559.0, "step": 403 }, { "entropy": 0.415191650390625, "epoch": 1.5658914728682172, "grad_norm": 0.667137112397098, "learning_rate": 1.0149594070152638e-05, "loss": 0.3962, "mean_token_accuracy": 0.8587388163432479, "num_tokens": 173786515.0, "step": 404 }, { "entropy": 0.407806396484375, "epoch": 1.5697674418604652, "grad_norm": 0.6768098234173051, "learning_rate": 1.0106854859433734e-05, "loss": 0.397, "mean_token_accuracy": 0.8586418740451336, "num_tokens": 174219900.0, "step": 405 }, { "entropy": 0.412872314453125, "epoch": 1.5736434108527133, "grad_norm": 0.6885176427252887, "learning_rate": 1.0064113696540112e-05, "loss": 0.3971, "mean_token_accuracy": 0.8602798972278833, "num_tokens": 174657405.0, "step": 406 }, { "entropy": 0.4066162109375, "epoch": 1.5775193798449614, "grad_norm": 0.631994956220823, "learning_rate": 1.0021371362327397e-05, "loss": 0.3829, "mean_token_accuracy": 0.8659420674666762, "num_tokens": 175080761.0, "step": 407 }, { "entropy": 0.398590087890625, "epoch": 1.5813953488372094, "grad_norm": 0.6912443194591774, "learning_rate": 9.978628637672604e-06, "loss": 0.411, "mean_token_accuracy": 0.853080808185041, "num_tokens": 175544696.0, "step": 408 }, { "entropy": 0.40594482421875, "epoch": 1.5852713178294575, "grad_norm": 0.708091166473127, "learning_rate": 9.93588630345989e-06, "loss": 0.4132, "mean_token_accuracy": 0.8563732989132404, "num_tokens": 175983787.0, "step": 409 }, { "entropy": 0.41259765625, "epoch": 1.5891472868217056, "grad_norm": 0.6387188583839009, "learning_rate": 9.89314514056627e-06, "loss": 0.4068, "mean_token_accuracy": 0.8602753495797515, "num_tokens": 176413686.0, "step": 410 }, { "entropy": 0.4127197265625, "epoch": 1.5930232558139537, "grad_norm": 0.6592710314757873, "learning_rate": 9.850405929847367e-06, "loss": 0.3979, "mean_token_accuracy": 0.8593348637223244, "num_tokens": 176834640.0, "step": 411 }, { "entropy": 0.411285400390625, "epoch": 1.5968992248062015, "grad_norm": 0.6738215140979368, "learning_rate": 9.80766945212313e-06, "loss": 0.4071, "mean_token_accuracy": 0.856034941971302, "num_tokens": 177250198.0, "step": 412 }, { "entropy": 0.410552978515625, "epoch": 1.6007751937984496, "grad_norm": 0.7220302341014333, "learning_rate": 9.764936488163585e-06, "loss": 0.3955, "mean_token_accuracy": 0.8606227496638894, "num_tokens": 177679620.0, "step": 413 }, { "entropy": 0.412078857421875, "epoch": 1.6046511627906976, "grad_norm": 0.6909184770326399, "learning_rate": 9.72220781867457e-06, "loss": 0.4066, "mean_token_accuracy": 0.8569380175322294, "num_tokens": 178105455.0, "step": 414 }, { "entropy": 0.41461181640625, "epoch": 1.6085271317829457, "grad_norm": 0.6798184746040906, "learning_rate": 9.67948422428345e-06, "loss": 0.3995, "mean_token_accuracy": 0.8596138171851635, "num_tokens": 178517293.0, "step": 415 }, { "entropy": 0.405731201171875, "epoch": 1.6124031007751938, "grad_norm": 0.6364483551849868, "learning_rate": 9.63676648552488e-06, "loss": 0.3909, "mean_token_accuracy": 0.8604221493005753, "num_tokens": 178951290.0, "step": 416 }, { "entropy": 0.41064453125, "epoch": 1.6162790697674418, "grad_norm": 0.6417949151025134, "learning_rate": 9.594055382826534e-06, "loss": 0.3842, "mean_token_accuracy": 0.8645245768129826, "num_tokens": 179368525.0, "step": 417 }, { "entropy": 0.409881591796875, "epoch": 1.62015503875969, "grad_norm": 0.6122486116049144, "learning_rate": 9.551351696494854e-06, "loss": 0.4031, "mean_token_accuracy": 0.859037296846509, "num_tokens": 179810881.0, "step": 418 }, { "entropy": 0.4073486328125, "epoch": 1.624031007751938, "grad_norm": 0.7193263550810518, "learning_rate": 9.508656206700778e-06, "loss": 0.423, "mean_token_accuracy": 0.8512195134535432, "num_tokens": 180256456.0, "step": 419 }, { "entropy": 0.4124755859375, "epoch": 1.627906976744186, "grad_norm": 0.6396498575313437, "learning_rate": 9.46596969346551e-06, "loss": 0.4117, "mean_token_accuracy": 0.8570121200755239, "num_tokens": 180695177.0, "step": 420 }, { "entropy": 0.41705322265625, "epoch": 1.6317829457364341, "grad_norm": 0.6192449327399918, "learning_rate": 9.423292936646258e-06, "loss": 0.3966, "mean_token_accuracy": 0.8602887643501163, "num_tokens": 181108644.0, "step": 421 }, { "entropy": 0.420989990234375, "epoch": 1.6356589147286822, "grad_norm": 0.632880768949324, "learning_rate": 9.380626715921972e-06, "loss": 0.3993, "mean_token_accuracy": 0.8594755912199616, "num_tokens": 181522964.0, "step": 422 }, { "entropy": 0.417572021484375, "epoch": 1.6395348837209303, "grad_norm": 0.6923163082611585, "learning_rate": 9.337971810779127e-06, "loss": 0.3936, "mean_token_accuracy": 0.861396661028266, "num_tokens": 181949804.0, "step": 423 }, { "entropy": 0.408599853515625, "epoch": 1.6434108527131783, "grad_norm": 0.6622239880569959, "learning_rate": 9.29532900049746e-06, "loss": 0.3942, "mean_token_accuracy": 0.8627764778211713, "num_tokens": 182378755.0, "step": 424 }, { "entropy": 0.412200927734375, "epoch": 1.6472868217054264, "grad_norm": 0.6279180554692346, "learning_rate": 9.252699064135759e-06, "loss": 0.3984, "mean_token_accuracy": 0.8595286570489407, "num_tokens": 182804060.0, "step": 425 }, { "entropy": 0.406951904296875, "epoch": 1.6511627906976745, "grad_norm": 0.6621427730372136, "learning_rate": 9.21008278051759e-06, "loss": 0.396, "mean_token_accuracy": 0.8606462860479951, "num_tokens": 183243736.0, "step": 426 }, { "entropy": 0.401947021484375, "epoch": 1.6550387596899225, "grad_norm": 0.6817892729294643, "learning_rate": 9.167480928217108e-06, "loss": 0.4068, "mean_token_accuracy": 0.8560972642153502, "num_tokens": 183696488.0, "step": 427 }, { "entropy": 0.40386962890625, "epoch": 1.6589147286821704, "grad_norm": 0.6817036559805504, "learning_rate": 9.124894285544808e-06, "loss": 0.405, "mean_token_accuracy": 0.8574656415730715, "num_tokens": 184134192.0, "step": 428 }, { "entropy": 0.412811279296875, "epoch": 1.6627906976744184, "grad_norm": 0.6761747313195076, "learning_rate": 9.082323630533317e-06, "loss": 0.3904, "mean_token_accuracy": 0.8623712658882141, "num_tokens": 184547689.0, "step": 429 }, { "entropy": 0.407196044921875, "epoch": 1.6666666666666665, "grad_norm": 0.6093614910819201, "learning_rate": 9.039769740923183e-06, "loss": 0.3954, "mean_token_accuracy": 0.8592645572498441, "num_tokens": 184980830.0, "step": 430 }, { "entropy": 0.4166259765625, "epoch": 1.6705426356589146, "grad_norm": 0.6332487484043614, "learning_rate": 8.997233394148648e-06, "loss": 0.3918, "mean_token_accuracy": 0.8626098716631532, "num_tokens": 185396383.0, "step": 431 }, { "entropy": 0.41619873046875, "epoch": 1.6744186046511627, "grad_norm": 0.6580432745200419, "learning_rate": 8.954715367323468e-06, "loss": 0.4013, "mean_token_accuracy": 0.8584313243627548, "num_tokens": 185821177.0, "step": 432 }, { "entropy": 0.414154052734375, "epoch": 1.6782945736434107, "grad_norm": 0.6851065385902475, "learning_rate": 8.912216437226692e-06, "loss": 0.396, "mean_token_accuracy": 0.860386623069644, "num_tokens": 186247442.0, "step": 433 }, { "entropy": 0.420745849609375, "epoch": 1.6821705426356588, "grad_norm": 0.6568781689919727, "learning_rate": 8.869737380288502e-06, "loss": 0.4014, "mean_token_accuracy": 0.8592943148687482, "num_tokens": 186670839.0, "step": 434 }, { "entropy": 0.41497802734375, "epoch": 1.6860465116279069, "grad_norm": 0.6319514560027869, "learning_rate": 8.827278972575984e-06, "loss": 0.3916, "mean_token_accuracy": 0.8617346873506904, "num_tokens": 187095317.0, "step": 435 }, { "entropy": 0.406524658203125, "epoch": 1.689922480620155, "grad_norm": 0.6682072796432892, "learning_rate": 8.784841989778997e-06, "loss": 0.3979, "mean_token_accuracy": 0.859723481349647, "num_tokens": 187535915.0, "step": 436 }, { "entropy": 0.41015625, "epoch": 1.693798449612403, "grad_norm": 0.6691193142851936, "learning_rate": 8.742427207195975e-06, "loss": 0.3883, "mean_token_accuracy": 0.8643459100276232, "num_tokens": 187962507.0, "step": 437 }, { "entropy": 0.4033203125, "epoch": 1.697674418604651, "grad_norm": 0.6803532551447259, "learning_rate": 8.700035399719754e-06, "loss": 0.4016, "mean_token_accuracy": 0.8600505525246263, "num_tokens": 188414361.0, "step": 438 }, { "entropy": 0.4100341796875, "epoch": 1.7015503875968991, "grad_norm": 0.6705454693190775, "learning_rate": 8.657667341823449e-06, "loss": 0.4131, "mean_token_accuracy": 0.8566976124420762, "num_tokens": 188861294.0, "step": 439 }, { "entropy": 0.416778564453125, "epoch": 1.7054263565891472, "grad_norm": 0.6425730442834899, "learning_rate": 8.615323807546258e-06, "loss": 0.4022, "mean_token_accuracy": 0.8590597696602345, "num_tokens": 189287748.0, "step": 440 }, { "entropy": 0.41192626953125, "epoch": 1.7093023255813953, "grad_norm": 0.6409295984728016, "learning_rate": 8.57300557047938e-06, "loss": 0.3931, "mean_token_accuracy": 0.8600128889083862, "num_tokens": 189737727.0, "step": 441 }, { "entropy": 0.414276123046875, "epoch": 1.7131782945736433, "grad_norm": 0.6433632025761097, "learning_rate": 8.530713403751822e-06, "loss": 0.396, "mean_token_accuracy": 0.8607654105871916, "num_tokens": 190166544.0, "step": 442 }, { "entropy": 0.41436767578125, "epoch": 1.7170542635658914, "grad_norm": 0.6296991719670527, "learning_rate": 8.488448080016312e-06, "loss": 0.3805, "mean_token_accuracy": 0.8660553842782974, "num_tokens": 190605188.0, "step": 443 }, { "entropy": 0.409088134765625, "epoch": 1.7209302325581395, "grad_norm": 0.659206038831299, "learning_rate": 8.446210371435172e-06, "loss": 0.3953, "mean_token_accuracy": 0.8605815563350916, "num_tokens": 191035664.0, "step": 444 }, { "entropy": 0.408050537109375, "epoch": 1.7248062015503876, "grad_norm": 0.652051845273745, "learning_rate": 8.404001049666211e-06, "loss": 0.3899, "mean_token_accuracy": 0.8624423686414957, "num_tokens": 191462754.0, "step": 445 }, { "entropy": 0.416046142578125, "epoch": 1.7286821705426356, "grad_norm": 0.6692499987903555, "learning_rate": 8.361820885848623e-06, "loss": 0.4025, "mean_token_accuracy": 0.8589138938114047, "num_tokens": 191882226.0, "step": 446 }, { "entropy": 0.4078369140625, "epoch": 1.7325581395348837, "grad_norm": 0.623322525199549, "learning_rate": 8.319670650588916e-06, "loss": 0.3786, "mean_token_accuracy": 0.8644889798015356, "num_tokens": 192307832.0, "step": 447 }, { "entropy": 0.4110107421875, "epoch": 1.7364341085271318, "grad_norm": 0.6189571086504357, "learning_rate": 8.277551113946812e-06, "loss": 0.3889, "mean_token_accuracy": 0.8638109732419252, "num_tokens": 192740453.0, "step": 448 }, { "entropy": 0.406951904296875, "epoch": 1.7403100775193798, "grad_norm": 0.6436896246044009, "learning_rate": 8.235463045421186e-06, "loss": 0.3797, "mean_token_accuracy": 0.8641934292390943, "num_tokens": 193161453.0, "step": 449 }, { "entropy": 0.405487060546875, "epoch": 1.744186046511628, "grad_norm": 0.7136051111650151, "learning_rate": 8.193407213936014e-06, "loss": 0.3923, "mean_token_accuracy": 0.8620244851335883, "num_tokens": 193581605.0, "step": 450 }, { "entropy": 0.41082763671875, "epoch": 1.748062015503876, "grad_norm": 0.6504324698248508, "learning_rate": 8.151384387826313e-06, "loss": 0.3867, "mean_token_accuracy": 0.8634668812155724, "num_tokens": 193986862.0, "step": 451 }, { "entropy": 0.410003662109375, "epoch": 1.751937984496124, "grad_norm": 0.6572442527472421, "learning_rate": 8.109395334824127e-06, "loss": 0.3986, "mean_token_accuracy": 0.8602865533903241, "num_tokens": 194402333.0, "step": 452 }, { "entropy": 0.409820556640625, "epoch": 1.755813953488372, "grad_norm": 0.6434483925139697, "learning_rate": 8.06744082204447e-06, "loss": 0.4005, "mean_token_accuracy": 0.8582842675969005, "num_tokens": 194857383.0, "step": 453 }, { "entropy": 0.408966064453125, "epoch": 1.7596899224806202, "grad_norm": 0.6680448763661996, "learning_rate": 8.02552161597133e-06, "loss": 0.3928, "mean_token_accuracy": 0.8633231353014708, "num_tokens": 195286940.0, "step": 454 }, { "entropy": 0.406494140625, "epoch": 1.7635658914728682, "grad_norm": 0.6369078203975965, "learning_rate": 7.983638482443671e-06, "loss": 0.3988, "mean_token_accuracy": 0.859028734266758, "num_tokens": 195736988.0, "step": 455 }, { "entropy": 0.40313720703125, "epoch": 1.7674418604651163, "grad_norm": 0.643861645368624, "learning_rate": 7.941792186641417e-06, "loss": 0.3833, "mean_token_accuracy": 0.864879741333425, "num_tokens": 196169764.0, "step": 456 }, { "entropy": 0.40716552734375, "epoch": 1.7713178294573644, "grad_norm": 0.6561302747521901, "learning_rate": 7.899983493071506e-06, "loss": 0.3871, "mean_token_accuracy": 0.8641843870282173, "num_tokens": 196598690.0, "step": 457 }, { "entropy": 0.409393310546875, "epoch": 1.7751937984496124, "grad_norm": 0.6594934710060417, "learning_rate": 7.858213165553897e-06, "loss": 0.3963, "mean_token_accuracy": 0.8609117828309536, "num_tokens": 197014178.0, "step": 458 }, { "entropy": 0.399658203125, "epoch": 1.7790697674418605, "grad_norm": 0.6478982839392694, "learning_rate": 7.816481967207627e-06, "loss": 0.3875, "mean_token_accuracy": 0.8637743312865496, "num_tokens": 197476046.0, "step": 459 }, { "entropy": 0.406494140625, "epoch": 1.7829457364341086, "grad_norm": 0.6157464144088126, "learning_rate": 7.774790660436857e-06, "loss": 0.3912, "mean_token_accuracy": 0.862570708617568, "num_tokens": 197900624.0, "step": 460 }, { "entropy": 0.41302490234375, "epoch": 1.7868217054263567, "grad_norm": 0.683013379784653, "learning_rate": 7.733140006916968e-06, "loss": 0.3799, "mean_token_accuracy": 0.8661030624061823, "num_tokens": 198310858.0, "step": 461 }, { "entropy": 0.41424560546875, "epoch": 1.7906976744186047, "grad_norm": 0.6367234615706063, "learning_rate": 7.691530767580613e-06, "loss": 0.3873, "mean_token_accuracy": 0.861991093493998, "num_tokens": 198746731.0, "step": 462 }, { "entropy": 0.410888671875, "epoch": 1.7945736434108528, "grad_norm": 0.6977685434199898, "learning_rate": 7.649963702603848e-06, "loss": 0.3856, "mean_token_accuracy": 0.863556420430541, "num_tokens": 199169885.0, "step": 463 }, { "entropy": 0.4053955078125, "epoch": 1.7984496124031009, "grad_norm": 0.6602781525851062, "learning_rate": 7.608439571392227e-06, "loss": 0.3939, "mean_token_accuracy": 0.861305077560246, "num_tokens": 199606209.0, "step": 464 }, { "entropy": 0.40625, "epoch": 1.802325581395349, "grad_norm": 0.6523264761673283, "learning_rate": 7.566959132566914e-06, "loss": 0.4128, "mean_token_accuracy": 0.8558422513306141, "num_tokens": 200040899.0, "step": 465 }, { "entropy": 0.404632568359375, "epoch": 1.806201550387597, "grad_norm": 0.6136108426271604, "learning_rate": 7.525523143950859e-06, "loss": 0.3943, "mean_token_accuracy": 0.8612562520429492, "num_tokens": 200483003.0, "step": 466 }, { "entropy": 0.404266357421875, "epoch": 1.810077519379845, "grad_norm": 0.6808527274202781, "learning_rate": 7.484132362554915e-06, "loss": 0.4035, "mean_token_accuracy": 0.8603929774835706, "num_tokens": 200918100.0, "step": 467 }, { "entropy": 0.415069580078125, "epoch": 1.8139534883720931, "grad_norm": 0.7268727125086556, "learning_rate": 7.442787544564044e-06, "loss": 0.3884, "mean_token_accuracy": 0.8616755288094282, "num_tokens": 201332208.0, "step": 468 }, { "entropy": 0.406463623046875, "epoch": 1.8178294573643412, "grad_norm": 0.6327268458389125, "learning_rate": 7.401489445323473e-06, "loss": 0.3825, "mean_token_accuracy": 0.8625819915905595, "num_tokens": 201748342.0, "step": 469 }, { "entropy": 0.406768798828125, "epoch": 1.8217054263565893, "grad_norm": 0.5843910418974224, "learning_rate": 7.360238819324903e-06, "loss": 0.3883, "mean_token_accuracy": 0.8621263904497027, "num_tokens": 202190718.0, "step": 470 }, { "entropy": 0.401763916015625, "epoch": 1.8255813953488373, "grad_norm": 0.6553658624886732, "learning_rate": 7.319036420192737e-06, "loss": 0.386, "mean_token_accuracy": 0.8664320418611169, "num_tokens": 202617317.0, "step": 471 }, { "entropy": 0.401702880859375, "epoch": 1.8294573643410854, "grad_norm": 0.6697797086612566, "learning_rate": 7.27788300067029e-06, "loss": 0.3898, "mean_token_accuracy": 0.8603641046211123, "num_tokens": 203052208.0, "step": 472 }, { "entropy": 0.40185546875, "epoch": 1.8333333333333335, "grad_norm": 0.6123304501658193, "learning_rate": 7.236779312606059e-06, "loss": 0.3825, "mean_token_accuracy": 0.8642279775813222, "num_tokens": 203497214.0, "step": 473 }, { "entropy": 0.41339111328125, "epoch": 1.8372093023255816, "grad_norm": 0.6202994666777202, "learning_rate": 7.1957261069399745e-06, "loss": 0.3801, "mean_token_accuracy": 0.8631016416475177, "num_tokens": 203906589.0, "step": 474 }, { "entropy": 0.41558837890625, "epoch": 1.8410852713178296, "grad_norm": 0.6791465966432566, "learning_rate": 7.154724133689677e-06, "loss": 0.3972, "mean_token_accuracy": 0.8600956695154309, "num_tokens": 204327623.0, "step": 475 }, { "entropy": 0.408416748046875, "epoch": 1.8449612403100775, "grad_norm": 0.6350010558970041, "learning_rate": 7.113774141936829e-06, "loss": 0.4041, "mean_token_accuracy": 0.8566409824416041, "num_tokens": 204761918.0, "step": 476 }, { "entropy": 0.41143798828125, "epoch": 1.8488372093023255, "grad_norm": 0.6313657105845258, "learning_rate": 7.0728768798134195e-06, "loss": 0.3909, "mean_token_accuracy": 0.8599191624671221, "num_tokens": 205174533.0, "step": 477 }, { "entropy": 0.41021728515625, "epoch": 1.8527131782945736, "grad_norm": 0.6378670434188185, "learning_rate": 7.032033094488094e-06, "loss": 0.3828, "mean_token_accuracy": 0.8645174792036414, "num_tokens": 205594485.0, "step": 478 }, { "entropy": 0.405364990234375, "epoch": 1.8565891472868217, "grad_norm": 0.6543240433781295, "learning_rate": 6.9912435321525185e-06, "loss": 0.3916, "mean_token_accuracy": 0.861815670505166, "num_tokens": 206032827.0, "step": 479 }, { "entropy": 0.402587890625, "epoch": 1.8604651162790697, "grad_norm": 0.6530799396059754, "learning_rate": 6.95050893800773e-06, "loss": 0.3705, "mean_token_accuracy": 0.8680164245888591, "num_tokens": 206464664.0, "step": 480 }, { "entropy": 0.40667724609375, "epoch": 1.8643410852713178, "grad_norm": 0.6166340670957584, "learning_rate": 6.909830056250527e-06, "loss": 0.3819, "mean_token_accuracy": 0.8618542673066258, "num_tokens": 206892855.0, "step": 481 }, { "entropy": 0.398773193359375, "epoch": 1.8682170542635659, "grad_norm": 0.6776441639714464, "learning_rate": 6.869207630059885e-06, "loss": 0.3851, "mean_token_accuracy": 0.8656446663662791, "num_tokens": 207340245.0, "step": 482 }, { "entropy": 0.40667724609375, "epoch": 1.872093023255814, "grad_norm": 0.701960556262414, "learning_rate": 6.8286424015833585e-06, "loss": 0.3945, "mean_token_accuracy": 0.8633942920714617, "num_tokens": 207781765.0, "step": 483 }, { "entropy": 0.401336669921875, "epoch": 1.875968992248062, "grad_norm": 0.6417634518224469, "learning_rate": 6.788135111923545e-06, "loss": 0.3715, "mean_token_accuracy": 0.8689904985949397, "num_tokens": 208199533.0, "step": 484 }, { "entropy": 0.403167724609375, "epoch": 1.87984496124031, "grad_norm": 0.6314848357880424, "learning_rate": 6.747686501124531e-06, "loss": 0.3956, "mean_token_accuracy": 0.8605082351714373, "num_tokens": 208637751.0, "step": 485 }, { "entropy": 0.4044189453125, "epoch": 1.8837209302325582, "grad_norm": 0.6526708888274303, "learning_rate": 6.707297308158366e-06, "loss": 0.3839, "mean_token_accuracy": 0.8627981888130307, "num_tokens": 209072826.0, "step": 486 }, { "entropy": 0.404571533203125, "epoch": 1.8875968992248062, "grad_norm": 0.6120925587753634, "learning_rate": 6.666968270911585e-06, "loss": 0.3865, "mean_token_accuracy": 0.8635251615196466, "num_tokens": 209523954.0, "step": 487 }, { "entropy": 0.40380859375, "epoch": 1.8914728682170543, "grad_norm": 0.6272847749525875, "learning_rate": 6.6267001261717015e-06, "loss": 0.3821, "mean_token_accuracy": 0.8653721883893013, "num_tokens": 209972017.0, "step": 488 }, { "entropy": 0.4151611328125, "epoch": 1.8953488372093024, "grad_norm": 0.6479005625946697, "learning_rate": 6.586493609613768e-06, "loss": 0.3881, "mean_token_accuracy": 0.86237673740834, "num_tokens": 210406408.0, "step": 489 }, { "entropy": 0.404541015625, "epoch": 1.8992248062015504, "grad_norm": 0.6180822520605622, "learning_rate": 6.546349455786926e-06, "loss": 0.366, "mean_token_accuracy": 0.8684000810608268, "num_tokens": 210815640.0, "step": 490 }, { "entropy": 0.39935302734375, "epoch": 1.9031007751937985, "grad_norm": 0.6425138033116063, "learning_rate": 6.506268398100979e-06, "loss": 0.3849, "mean_token_accuracy": 0.8652486437931657, "num_tokens": 211258730.0, "step": 491 }, { "entropy": 0.396697998046875, "epoch": 1.9069767441860463, "grad_norm": 0.6475606998124774, "learning_rate": 6.46625116881301e-06, "loss": 0.3881, "mean_token_accuracy": 0.8615107480436563, "num_tokens": 211691792.0, "step": 492 }, { "entropy": 0.4013671875, "epoch": 1.9108527131782944, "grad_norm": 0.676923862359068, "learning_rate": 6.426298499013994e-06, "loss": 0.3864, "mean_token_accuracy": 0.8626894094049931, "num_tokens": 212108212.0, "step": 493 }, { "entropy": 0.40142822265625, "epoch": 1.9147286821705425, "grad_norm": 0.6493841657255129, "learning_rate": 6.386411118615434e-06, "loss": 0.3841, "mean_token_accuracy": 0.8646282628178596, "num_tokens": 212524846.0, "step": 494 }, { "entropy": 0.407470703125, "epoch": 1.9186046511627906, "grad_norm": 0.6517307413767497, "learning_rate": 6.34658975633605e-06, "loss": 0.3862, "mean_token_accuracy": 0.866667116060853, "num_tokens": 212949216.0, "step": 495 }, { "entropy": 0.399688720703125, "epoch": 1.9224806201550386, "grad_norm": 0.639749877293263, "learning_rate": 6.306835139688439e-06, "loss": 0.374, "mean_token_accuracy": 0.8665135633200407, "num_tokens": 213375704.0, "step": 496 }, { "entropy": 0.404449462890625, "epoch": 1.9263565891472867, "grad_norm": 0.6396313970116991, "learning_rate": 6.267147994965792e-06, "loss": 0.3898, "mean_token_accuracy": 0.861801334656775, "num_tokens": 213818308.0, "step": 497 }, { "entropy": 0.40673828125, "epoch": 1.9302325581395348, "grad_norm": 0.6293975472846354, "learning_rate": 6.2275290472286406e-06, "loss": 0.3828, "mean_token_accuracy": 0.866341283544898, "num_tokens": 214250620.0, "step": 498 }, { "entropy": 0.400238037109375, "epoch": 1.9341085271317828, "grad_norm": 0.6307078281794433, "learning_rate": 6.187979020291584e-06, "loss": 0.3947, "mean_token_accuracy": 0.8613047506660223, "num_tokens": 214695002.0, "step": 499 }, { "entropy": 0.4093017578125, "epoch": 1.937984496124031, "grad_norm": 0.6607845873938842, "learning_rate": 6.148498636710092e-06, "loss": 0.3632, "mean_token_accuracy": 0.8689653361216187, "num_tokens": 215101633.0, "step": 500 }, { "entropy": 0.408416748046875, "epoch": 1.941860465116279, "grad_norm": 0.6961543921206979, "learning_rate": 6.109088617767287e-06, "loss": 0.3871, "mean_token_accuracy": 0.8612672435119748, "num_tokens": 215530624.0, "step": 501 }, { "entropy": 0.403350830078125, "epoch": 1.945736434108527, "grad_norm": 0.6886590872632472, "learning_rate": 6.069749683460765e-06, "loss": 0.3848, "mean_token_accuracy": 0.8641320113092661, "num_tokens": 215963019.0, "step": 502 }, { "entropy": 0.39825439453125, "epoch": 1.949612403100775, "grad_norm": 0.6433536008619647, "learning_rate": 6.030482552489458e-06, "loss": 0.3815, "mean_token_accuracy": 0.8654372049495578, "num_tokens": 216395090.0, "step": 503 }, { "entropy": 0.39599609375, "epoch": 1.9534883720930232, "grad_norm": 0.6124588094534525, "learning_rate": 5.9912879422404864e-06, "loss": 0.3775, "mean_token_accuracy": 0.8667404530569911, "num_tokens": 216852626.0, "step": 504 }, { "entropy": 0.4044189453125, "epoch": 1.9573643410852712, "grad_norm": 0.6969406927169308, "learning_rate": 5.952166568776062e-06, "loss": 0.3824, "mean_token_accuracy": 0.8645559353753924, "num_tokens": 217284606.0, "step": 505 }, { "entropy": 0.4014892578125, "epoch": 1.9612403100775193, "grad_norm": 0.6289776276400901, "learning_rate": 5.91311914682041e-06, "loss": 0.3855, "mean_token_accuracy": 0.8640624480322003, "num_tokens": 217702095.0, "step": 506 }, { "entropy": 0.40374755859375, "epoch": 1.9651162790697674, "grad_norm": 0.6292663972830993, "learning_rate": 5.874146389746697e-06, "loss": 0.3725, "mean_token_accuracy": 0.8682069405913353, "num_tokens": 218124430.0, "step": 507 }, { "entropy": 0.403778076171875, "epoch": 1.9689922480620154, "grad_norm": 0.6405088898702465, "learning_rate": 5.835249009564013e-06, "loss": 0.3819, "mean_token_accuracy": 0.8638218138366938, "num_tokens": 218559364.0, "step": 508 }, { "entropy": 0.40142822265625, "epoch": 1.9728682170542635, "grad_norm": 0.6594125717541489, "learning_rate": 5.796427716904347e-06, "loss": 0.3788, "mean_token_accuracy": 0.866894249804318, "num_tokens": 218987214.0, "step": 509 }, { "entropy": 0.4051513671875, "epoch": 1.9767441860465116, "grad_norm": 0.6394085248266738, "learning_rate": 5.757683221009625e-06, "loss": 0.3842, "mean_token_accuracy": 0.8652555495500565, "num_tokens": 219428184.0, "step": 510 }, { "entropy": 0.40252685546875, "epoch": 1.9806201550387597, "grad_norm": 0.7035221174607182, "learning_rate": 5.719016229718748e-06, "loss": 0.386, "mean_token_accuracy": 0.8648106651380658, "num_tokens": 219866920.0, "step": 511 }, { "entropy": 0.403076171875, "epoch": 1.9844961240310077, "grad_norm": 0.6765569141175194, "learning_rate": 5.680427449454631e-06, "loss": 0.3757, "mean_token_accuracy": 0.8669820064678788, "num_tokens": 220314515.0, "step": 512 }, { "entropy": 0.396575927734375, "epoch": 1.9883720930232558, "grad_norm": 0.6391561698642179, "learning_rate": 5.641917585211338e-06, "loss": 0.3824, "mean_token_accuracy": 0.8642657212913036, "num_tokens": 220750298.0, "step": 513 }, { "entropy": 0.398956298828125, "epoch": 1.9922480620155039, "grad_norm": 0.6432687978090328, "learning_rate": 5.60348734054118e-06, "loss": 0.3857, "mean_token_accuracy": 0.8652426460757852, "num_tokens": 221200699.0, "step": 514 }, { "entropy": 0.401336669921875, "epoch": 1.996124031007752, "grad_norm": 0.6660097790435096, "learning_rate": 5.565137417541866e-06, "loss": 0.3647, "mean_token_accuracy": 0.870651887729764, "num_tokens": 221631125.0, "step": 515 }, { "entropy": 0.4005126953125, "epoch": 2.0, "grad_norm": 0.654520060535491, "learning_rate": 5.526868516843673e-06, "loss": 0.3735, "mean_token_accuracy": 0.8669488895684481, "num_tokens": 222046610.0, "step": 516 }, { "entropy": 0.400299072265625, "epoch": 2.003875968992248, "grad_norm": 0.6936563180726831, "learning_rate": 5.488681337596653e-06, "loss": 0.3466, "mean_token_accuracy": 0.8784803748130798, "num_tokens": 222488581.0, "step": 517 }, { "entropy": 0.401611328125, "epoch": 2.007751937984496, "grad_norm": 0.6926888243105179, "learning_rate": 5.450576577457858e-06, "loss": 0.3293, "mean_token_accuracy": 0.8813051115721464, "num_tokens": 222901744.0, "step": 518 }, { "entropy": 0.39300537109375, "epoch": 2.011627906976744, "grad_norm": 0.6643375347233971, "learning_rate": 5.412554932578578e-06, "loss": 0.3349, "mean_token_accuracy": 0.8790784049779177, "num_tokens": 223337496.0, "step": 519 }, { "entropy": 0.392425537109375, "epoch": 2.0155038759689923, "grad_norm": 0.7298666897594812, "learning_rate": 5.37461709759165e-06, "loss": 0.3462, "mean_token_accuracy": 0.8755828496068716, "num_tokens": 223777243.0, "step": 520 }, { "entropy": 0.3936767578125, "epoch": 2.0193798449612403, "grad_norm": 0.7323552015632304, "learning_rate": 5.3367637655987515e-06, "loss": 0.3315, "mean_token_accuracy": 0.8803297802805901, "num_tokens": 224185656.0, "step": 521 }, { "entropy": 0.382598876953125, "epoch": 2.0232558139534884, "grad_norm": 0.7128553544647434, "learning_rate": 5.298995628157738e-06, "loss": 0.3201, "mean_token_accuracy": 0.8834712980315089, "num_tokens": 224616727.0, "step": 522 }, { "entropy": 0.387176513671875, "epoch": 2.0271317829457365, "grad_norm": 0.6357745281130776, "learning_rate": 5.2613133752700145e-06, "loss": 0.3315, "mean_token_accuracy": 0.8827424431219697, "num_tokens": 225029016.0, "step": 523 }, { "entropy": 0.394073486328125, "epoch": 2.0310077519379846, "grad_norm": 0.685138271938773, "learning_rate": 5.223717695367922e-06, "loss": 0.3343, "mean_token_accuracy": 0.8783880360424519, "num_tokens": 225455623.0, "step": 524 }, { "entropy": 0.403533935546875, "epoch": 2.0348837209302326, "grad_norm": 0.6897816458876816, "learning_rate": 5.186209275302175e-06, "loss": 0.3365, "mean_token_accuracy": 0.8792239772155881, "num_tokens": 225868920.0, "step": 525 }, { "entropy": 0.395294189453125, "epoch": 2.0387596899224807, "grad_norm": 0.7020774690634989, "learning_rate": 5.148788800329279e-06, "loss": 0.3366, "mean_token_accuracy": 0.8801201498135924, "num_tokens": 226290859.0, "step": 526 }, { "entropy": 0.39208984375, "epoch": 2.0426356589147288, "grad_norm": 0.6544070719356055, "learning_rate": 5.111456954099064e-06, "loss": 0.325, "mean_token_accuracy": 0.8827378544956446, "num_tokens": 226729135.0, "step": 527 }, { "entropy": 0.384185791015625, "epoch": 2.046511627906977, "grad_norm": 0.6922594514727605, "learning_rate": 5.0742144186421484e-06, "loss": 0.3341, "mean_token_accuracy": 0.8800540259107947, "num_tokens": 227155610.0, "step": 528 }, { "entropy": 0.389190673828125, "epoch": 2.050387596899225, "grad_norm": 0.6710910306877101, "learning_rate": 5.037061874357503e-06, "loss": 0.3391, "mean_token_accuracy": 0.8789013354107738, "num_tokens": 227602237.0, "step": 529 }, { "entropy": 0.385589599609375, "epoch": 2.054263565891473, "grad_norm": 0.6061828959415225, "learning_rate": 5.000000000000003e-06, "loss": 0.3346, "mean_token_accuracy": 0.8803443806245923, "num_tokens": 228063981.0, "step": 530 }, { "entropy": 0.39202880859375, "epoch": 2.058139534883721, "grad_norm": 0.6141481495965043, "learning_rate": 4.963029472668044e-06, "loss": 0.3302, "mean_token_accuracy": 0.8802597392350435, "num_tokens": 228496111.0, "step": 531 }, { "entropy": 0.385711669921875, "epoch": 2.062015503875969, "grad_norm": 0.6384598762874943, "learning_rate": 4.92615096779118e-06, "loss": 0.3338, "mean_token_accuracy": 0.880737591534853, "num_tokens": 228944550.0, "step": 532 }, { "entropy": 0.395233154296875, "epoch": 2.065891472868217, "grad_norm": 0.6698946719476137, "learning_rate": 4.889365159117744e-06, "loss": 0.3321, "mean_token_accuracy": 0.8801658367738128, "num_tokens": 229382741.0, "step": 533 }, { "entropy": 0.385955810546875, "epoch": 2.0697674418604652, "grad_norm": 0.6778064236402924, "learning_rate": 4.852672718702581e-06, "loss": 0.3299, "mean_token_accuracy": 0.8840415002778172, "num_tokens": 229821349.0, "step": 534 }, { "entropy": 0.387420654296875, "epoch": 2.0736434108527133, "grad_norm": 0.6328680945170063, "learning_rate": 4.81607431689475e-06, "loss": 0.3166, "mean_token_accuracy": 0.8840623144060373, "num_tokens": 230260834.0, "step": 535 }, { "entropy": 0.390625, "epoch": 2.0775193798449614, "grad_norm": 0.6408500370095382, "learning_rate": 4.779570622325284e-06, "loss": 0.3324, "mean_token_accuracy": 0.8803566815331578, "num_tokens": 230677028.0, "step": 536 }, { "entropy": 0.383026123046875, "epoch": 2.0813953488372094, "grad_norm": 0.6331145987896816, "learning_rate": 4.743162301894952e-06, "loss": 0.3314, "mean_token_accuracy": 0.8804466081783175, "num_tokens": 231123890.0, "step": 537 }, { "entropy": 0.3948974609375, "epoch": 2.0852713178294575, "grad_norm": 0.6360376709937053, "learning_rate": 4.706850020762126e-06, "loss": 0.3203, "mean_token_accuracy": 0.8849101848900318, "num_tokens": 231537700.0, "step": 538 }, { "entropy": 0.39019775390625, "epoch": 2.0891472868217056, "grad_norm": 0.6480759056424732, "learning_rate": 4.6706344423305775e-06, "loss": 0.3258, "mean_token_accuracy": 0.8819584492594004, "num_tokens": 231966260.0, "step": 539 }, { "entropy": 0.393280029296875, "epoch": 2.0930232558139537, "grad_norm": 0.6365457220306857, "learning_rate": 4.634516228237372e-06, "loss": 0.3328, "mean_token_accuracy": 0.882646357640624, "num_tokens": 232394010.0, "step": 540 }, { "entropy": 0.3883056640625, "epoch": 2.0968992248062017, "grad_norm": 0.6729577807372763, "learning_rate": 4.598496038340801e-06, "loss": 0.3312, "mean_token_accuracy": 0.8807763801887631, "num_tokens": 232833486.0, "step": 541 }, { "entropy": 0.386322021484375, "epoch": 2.10077519379845, "grad_norm": 0.6467656132694498, "learning_rate": 4.5625745307083e-06, "loss": 0.339, "mean_token_accuracy": 0.8785864366218448, "num_tokens": 233276645.0, "step": 542 }, { "entropy": 0.392120361328125, "epoch": 2.104651162790698, "grad_norm": 0.6341672757705268, "learning_rate": 4.526752361604455e-06, "loss": 0.3279, "mean_token_accuracy": 0.8815192077308893, "num_tokens": 233710256.0, "step": 543 }, { "entropy": 0.385162353515625, "epoch": 2.108527131782946, "grad_norm": 0.661558127233088, "learning_rate": 4.491030185478976e-06, "loss": 0.3181, "mean_token_accuracy": 0.8827753607183695, "num_tokens": 234144022.0, "step": 544 }, { "entropy": 0.3856201171875, "epoch": 2.112403100775194, "grad_norm": 0.638959340473472, "learning_rate": 4.455408654954771e-06, "loss": 0.3246, "mean_token_accuracy": 0.8830780945718288, "num_tokens": 234561879.0, "step": 545 }, { "entropy": 0.388946533203125, "epoch": 2.116279069767442, "grad_norm": 0.6321172574448592, "learning_rate": 4.419888420816015e-06, "loss": 0.3237, "mean_token_accuracy": 0.8818927984684706, "num_tokens": 234987166.0, "step": 546 }, { "entropy": 0.384979248046875, "epoch": 2.12015503875969, "grad_norm": 0.6645224096624154, "learning_rate": 4.3844701319962525e-06, "loss": 0.333, "mean_token_accuracy": 0.8798952642828226, "num_tokens": 235438001.0, "step": 547 }, { "entropy": 0.395172119140625, "epoch": 2.124031007751938, "grad_norm": 0.6734141584773501, "learning_rate": 4.349154435566551e-06, "loss": 0.3339, "mean_token_accuracy": 0.8803757233545184, "num_tokens": 235852297.0, "step": 548 }, { "entropy": 0.388427734375, "epoch": 2.1279069767441863, "grad_norm": 0.6638427414245535, "learning_rate": 4.313941976723677e-06, "loss": 0.3249, "mean_token_accuracy": 0.8819608362391591, "num_tokens": 236284675.0, "step": 549 }, { "entropy": 0.390655517578125, "epoch": 2.1317829457364343, "grad_norm": 0.6269884063395513, "learning_rate": 4.278833398778306e-06, "loss": 0.3227, "mean_token_accuracy": 0.8828015690669417, "num_tokens": 236717833.0, "step": 550 }, { "entropy": 0.39385986328125, "epoch": 2.135658914728682, "grad_norm": 0.6090452595776772, "learning_rate": 4.2438293431432665e-06, "loss": 0.3346, "mean_token_accuracy": 0.8799500595778227, "num_tokens": 237135700.0, "step": 551 }, { "entropy": 0.39410400390625, "epoch": 2.13953488372093, "grad_norm": 0.6332869380655064, "learning_rate": 4.2089304493218355e-06, "loss": 0.3325, "mean_token_accuracy": 0.8803482167422771, "num_tokens": 237556435.0, "step": 552 }, { "entropy": 0.39239501953125, "epoch": 2.143410852713178, "grad_norm": 0.6129101680068585, "learning_rate": 4.17413735489604e-06, "loss": 0.3364, "mean_token_accuracy": 0.8808656400069594, "num_tokens": 237993246.0, "step": 553 }, { "entropy": 0.394775390625, "epoch": 2.147286821705426, "grad_norm": 0.622513871237599, "learning_rate": 4.139450695515018e-06, "loss": 0.3177, "mean_token_accuracy": 0.8859911020845175, "num_tokens": 238416417.0, "step": 554 }, { "entropy": 0.385986328125, "epoch": 2.1511627906976742, "grad_norm": 0.6686843413819599, "learning_rate": 4.104871104883403e-06, "loss": 0.3476, "mean_token_accuracy": 0.8776693055406213, "num_tokens": 238872766.0, "step": 555 }, { "entropy": 0.392120361328125, "epoch": 2.1550387596899223, "grad_norm": 0.6426341081994442, "learning_rate": 4.070399214749743e-06, "loss": 0.3362, "mean_token_accuracy": 0.8793003624305129, "num_tokens": 239296976.0, "step": 556 }, { "entropy": 0.3946533203125, "epoch": 2.1589147286821704, "grad_norm": 0.6721747752054241, "learning_rate": 4.036035654894967e-06, "loss": 0.3176, "mean_token_accuracy": 0.8857940044254065, "num_tokens": 239703256.0, "step": 557 }, { "entropy": 0.38916015625, "epoch": 2.1627906976744184, "grad_norm": 0.6255029124954248, "learning_rate": 4.001781053120863e-06, "loss": 0.3407, "mean_token_accuracy": 0.8771230475977063, "num_tokens": 240138828.0, "step": 558 }, { "entropy": 0.39349365234375, "epoch": 2.1666666666666665, "grad_norm": 0.6818590678557627, "learning_rate": 3.967636035238636e-06, "loss": 0.341, "mean_token_accuracy": 0.8793875314295292, "num_tokens": 240559168.0, "step": 559 }, { "entropy": 0.38836669921875, "epoch": 2.1705426356589146, "grad_norm": 0.6518531386657375, "learning_rate": 3.933601225057446e-06, "loss": 0.3272, "mean_token_accuracy": 0.8832004126161337, "num_tokens": 240997393.0, "step": 560 }, { "entropy": 0.388427734375, "epoch": 2.1744186046511627, "grad_norm": 0.6162239541133624, "learning_rate": 3.8996772443730335e-06, "loss": 0.3289, "mean_token_accuracy": 0.8784168781712651, "num_tokens": 241432591.0, "step": 561 }, { "entropy": 0.388946533203125, "epoch": 2.1782945736434107, "grad_norm": 0.6484603141660229, "learning_rate": 3.865864712956336e-06, "loss": 0.3398, "mean_token_accuracy": 0.8791722999885678, "num_tokens": 241869468.0, "step": 562 }, { "entropy": 0.39080810546875, "epoch": 2.182170542635659, "grad_norm": 0.6345203708850213, "learning_rate": 3.832164248542192e-06, "loss": 0.3165, "mean_token_accuracy": 0.8831057138741016, "num_tokens": 242308134.0, "step": 563 }, { "entropy": 0.391815185546875, "epoch": 2.186046511627907, "grad_norm": 0.6250147866285306, "learning_rate": 3.798576466818038e-06, "loss": 0.3163, "mean_token_accuracy": 0.8841667361557484, "num_tokens": 242743846.0, "step": 564 }, { "entropy": 0.387176513671875, "epoch": 2.189922480620155, "grad_norm": 0.6587618073867574, "learning_rate": 3.7651019814126656e-06, "loss": 0.3372, "mean_token_accuracy": 0.879564318805933, "num_tokens": 243184279.0, "step": 565 }, { "entropy": 0.391998291015625, "epoch": 2.193798449612403, "grad_norm": 0.6231496336515444, "learning_rate": 3.7317414038850085e-06, "loss": 0.3349, "mean_token_accuracy": 0.8822369873523712, "num_tokens": 243603345.0, "step": 566 }, { "entropy": 0.38885498046875, "epoch": 2.197674418604651, "grad_norm": 0.6243098230227394, "learning_rate": 3.6984953437129734e-06, "loss": 0.328, "mean_token_accuracy": 0.8829399077221751, "num_tokens": 244029585.0, "step": 567 }, { "entropy": 0.390411376953125, "epoch": 2.201550387596899, "grad_norm": 0.6183649760265033, "learning_rate": 3.665364408282305e-06, "loss": 0.3335, "mean_token_accuracy": 0.8813704084604979, "num_tokens": 244452331.0, "step": 568 }, { "entropy": 0.39208984375, "epoch": 2.205426356589147, "grad_norm": 0.6835852376079943, "learning_rate": 3.6323492028754724e-06, "loss": 0.3263, "mean_token_accuracy": 0.881412522867322, "num_tokens": 244882973.0, "step": 569 }, { "entropy": 0.392364501953125, "epoch": 2.2093023255813953, "grad_norm": 0.6327714392921648, "learning_rate": 3.5994503306606497e-06, "loss": 0.3156, "mean_token_accuracy": 0.8864553738385439, "num_tokens": 245295491.0, "step": 570 }, { "entropy": 0.38726806640625, "epoch": 2.2131782945736433, "grad_norm": 0.6256797059109875, "learning_rate": 3.5666683926806623e-06, "loss": 0.3356, "mean_token_accuracy": 0.8783683739602566, "num_tokens": 245759364.0, "step": 571 }, { "entropy": 0.383056640625, "epoch": 2.2170542635658914, "grad_norm": 0.6399228477600465, "learning_rate": 3.534003987842005e-06, "loss": 0.3219, "mean_token_accuracy": 0.8829577537253499, "num_tokens": 246203758.0, "step": 572 }, { "entropy": 0.386688232421875, "epoch": 2.2209302325581395, "grad_norm": 0.6415173203869877, "learning_rate": 3.5014577129039296e-06, "loss": 0.3243, "mean_token_accuracy": 0.8814328899607062, "num_tokens": 246645472.0, "step": 573 }, { "entropy": 0.390869140625, "epoch": 2.2248062015503876, "grad_norm": 0.6580150116543236, "learning_rate": 3.4690301624675127e-06, "loss": 0.3251, "mean_token_accuracy": 0.8838908141478896, "num_tokens": 247064090.0, "step": 574 }, { "entropy": 0.385711669921875, "epoch": 2.2286821705426356, "grad_norm": 0.6616373941415779, "learning_rate": 3.4367219289648192e-06, "loss": 0.3406, "mean_token_accuracy": 0.8785260496661067, "num_tokens": 247508703.0, "step": 575 }, { "entropy": 0.38714599609375, "epoch": 2.2325581395348837, "grad_norm": 0.6720601378427568, "learning_rate": 3.4045336026480457e-06, "loss": 0.3338, "mean_token_accuracy": 0.8801887268200517, "num_tokens": 247934655.0, "step": 576 }, { "entropy": 0.3848876953125, "epoch": 2.2364341085271318, "grad_norm": 0.6255047863223557, "learning_rate": 3.372465771578771e-06, "loss": 0.3321, "mean_token_accuracy": 0.8830512659624219, "num_tokens": 248373031.0, "step": 577 }, { "entropy": 0.391143798828125, "epoch": 2.24031007751938, "grad_norm": 0.636261521458766, "learning_rate": 3.340519021617189e-06, "loss": 0.3368, "mean_token_accuracy": 0.877558303065598, "num_tokens": 248800011.0, "step": 578 }, { "entropy": 0.392242431640625, "epoch": 2.244186046511628, "grad_norm": 0.6811192022983874, "learning_rate": 3.308693936411421e-06, "loss": 0.3231, "mean_token_accuracy": 0.8823585864156485, "num_tokens": 249220660.0, "step": 579 }, { "entropy": 0.400482177734375, "epoch": 2.248062015503876, "grad_norm": 0.6678322815867844, "learning_rate": 3.2769910973868314e-06, "loss": 0.3111, "mean_token_accuracy": 0.8872639862820506, "num_tokens": 249612238.0, "step": 580 }, { "entropy": 0.392578125, "epoch": 2.251937984496124, "grad_norm": 0.6680878488658235, "learning_rate": 3.24541108373544e-06, "loss": 0.3221, "mean_token_accuracy": 0.8829946629703045, "num_tokens": 250039769.0, "step": 581 }, { "entropy": 0.386383056640625, "epoch": 2.255813953488372, "grad_norm": 0.658743704605255, "learning_rate": 3.2139544724053083e-06, "loss": 0.3112, "mean_token_accuracy": 0.8881248384714127, "num_tokens": 250468645.0, "step": 582 }, { "entropy": 0.38922119140625, "epoch": 2.25968992248062, "grad_norm": 0.7447959483419172, "learning_rate": 3.1826218380900066e-06, "loss": 0.3341, "mean_token_accuracy": 0.8805524576455355, "num_tokens": 250883417.0, "step": 583 }, { "entropy": 0.380645751953125, "epoch": 2.2635658914728682, "grad_norm": 0.6453212527580212, "learning_rate": 3.1514137532181265e-06, "loss": 0.3276, "mean_token_accuracy": 0.8826109319925308, "num_tokens": 251327846.0, "step": 584 }, { "entropy": 0.391693115234375, "epoch": 2.2674418604651163, "grad_norm": 0.657520936501664, "learning_rate": 3.1203307879428146e-06, "loss": 0.3215, "mean_token_accuracy": 0.8842367362231016, "num_tokens": 251752385.0, "step": 585 }, { "entropy": 0.390045166015625, "epoch": 2.2713178294573644, "grad_norm": 0.6650188059458049, "learning_rate": 3.089373510131354e-06, "loss": 0.3084, "mean_token_accuracy": 0.8864572271704674, "num_tokens": 252174547.0, "step": 586 }, { "entropy": 0.39080810546875, "epoch": 2.2751937984496124, "grad_norm": 0.6417234553292116, "learning_rate": 3.0585424853547953e-06, "loss": 0.3238, "mean_token_accuracy": 0.8842586716637015, "num_tokens": 252596335.0, "step": 587 }, { "entropy": 0.392608642578125, "epoch": 2.2790697674418605, "grad_norm": 0.6639151590319899, "learning_rate": 3.0278382768776193e-06, "loss": 0.3339, "mean_token_accuracy": 0.8815375939011574, "num_tokens": 253021440.0, "step": 588 }, { "entropy": 0.39007568359375, "epoch": 2.2829457364341086, "grad_norm": 0.6644409987626044, "learning_rate": 2.9972614456474537e-06, "loss": 0.3289, "mean_token_accuracy": 0.8809984363615513, "num_tokens": 253440932.0, "step": 589 }, { "entropy": 0.385986328125, "epoch": 2.2868217054263567, "grad_norm": 0.6481260918771182, "learning_rate": 2.9668125502848035e-06, "loss": 0.3184, "mean_token_accuracy": 0.8846102599054575, "num_tokens": 253865848.0, "step": 590 }, { "entropy": 0.384033203125, "epoch": 2.2906976744186047, "grad_norm": 0.6548821507339614, "learning_rate": 2.936492147072885e-06, "loss": 0.3212, "mean_token_accuracy": 0.8840930741280317, "num_tokens": 254319258.0, "step": 591 }, { "entropy": 0.3841552734375, "epoch": 2.294573643410853, "grad_norm": 0.6143947491644914, "learning_rate": 2.9063007899474214e-06, "loss": 0.3198, "mean_token_accuracy": 0.8845802173018456, "num_tokens": 254778429.0, "step": 592 }, { "entropy": 0.382781982421875, "epoch": 2.298449612403101, "grad_norm": 0.5994337091831552, "learning_rate": 2.876239030486554e-06, "loss": 0.333, "mean_token_accuracy": 0.8806400252506137, "num_tokens": 255246376.0, "step": 593 }, { "entropy": 0.39447021484375, "epoch": 2.302325581395349, "grad_norm": 0.6370982481961588, "learning_rate": 2.8463074179007356e-06, "loss": 0.3298, "mean_token_accuracy": 0.8816047692671418, "num_tokens": 255679298.0, "step": 594 }, { "entropy": 0.38818359375, "epoch": 2.306201550387597, "grad_norm": 0.6326100288460391, "learning_rate": 2.8165064990227255e-06, "loss": 0.3139, "mean_token_accuracy": 0.8859955314546824, "num_tokens": 256126964.0, "step": 595 }, { "entropy": 0.3873291015625, "epoch": 2.310077519379845, "grad_norm": 0.6281151901542636, "learning_rate": 2.7868368182975835e-06, "loss": 0.3328, "mean_token_accuracy": 0.8832689542323351, "num_tokens": 256574701.0, "step": 596 }, { "entropy": 0.391632080078125, "epoch": 2.313953488372093, "grad_norm": 0.6543928974172343, "learning_rate": 2.757298917772727e-06, "loss": 0.3294, "mean_token_accuracy": 0.8810861445963383, "num_tokens": 256980992.0, "step": 597 }, { "entropy": 0.3851318359375, "epoch": 2.317829457364341, "grad_norm": 0.615207001414363, "learning_rate": 2.7278933370880267e-06, "loss": 0.3109, "mean_token_accuracy": 0.887361123226583, "num_tokens": 257433482.0, "step": 598 }, { "entropy": 0.392578125, "epoch": 2.3217054263565893, "grad_norm": 0.622271406233036, "learning_rate": 2.6986206134659477e-06, "loss": 0.32, "mean_token_accuracy": 0.8841242687776685, "num_tokens": 257864399.0, "step": 599 }, { "entropy": 0.384429931640625, "epoch": 2.3255813953488373, "grad_norm": 0.6493402572209033, "learning_rate": 2.669481281701739e-06, "loss": 0.3159, "mean_token_accuracy": 0.8869122276082635, "num_tokens": 258323505.0, "step": 600 }, { "entropy": 0.385650634765625, "epoch": 2.3294573643410854, "grad_norm": 0.655441563972454, "learning_rate": 2.640475874153651e-06, "loss": 0.3273, "mean_token_accuracy": 0.8811930902302265, "num_tokens": 258755764.0, "step": 601 }, { "entropy": 0.386383056640625, "epoch": 2.3333333333333335, "grad_norm": 0.6241612484793835, "learning_rate": 2.6116049207332304e-06, "loss": 0.3169, "mean_token_accuracy": 0.8841598564758897, "num_tokens": 259174373.0, "step": 602 }, { "entropy": 0.38812255859375, "epoch": 2.3372093023255816, "grad_norm": 0.6869232640948452, "learning_rate": 2.582868948895623e-06, "loss": 0.3271, "mean_token_accuracy": 0.8831503242254257, "num_tokens": 259633845.0, "step": 603 }, { "entropy": 0.394287109375, "epoch": 2.3410852713178296, "grad_norm": 0.6445176893131164, "learning_rate": 2.5542684836299316e-06, "loss": 0.3232, "mean_token_accuracy": 0.8843172611668706, "num_tokens": 260062070.0, "step": 604 }, { "entropy": 0.388092041015625, "epoch": 2.3449612403100777, "grad_norm": 0.644553761738823, "learning_rate": 2.5258040474496483e-06, "loss": 0.3167, "mean_token_accuracy": 0.8840411538258195, "num_tokens": 260479228.0, "step": 605 }, { "entropy": 0.384735107421875, "epoch": 2.3488372093023258, "grad_norm": 0.6303072264697063, "learning_rate": 2.4974761603830865e-06, "loss": 0.3274, "mean_token_accuracy": 0.883203936740756, "num_tokens": 260915469.0, "step": 606 }, { "entropy": 0.387939453125, "epoch": 2.352713178294574, "grad_norm": 0.6726103111482618, "learning_rate": 2.469285339963892e-06, "loss": 0.3199, "mean_token_accuracy": 0.8849070286378264, "num_tokens": 261365417.0, "step": 607 }, { "entropy": 0.38800048828125, "epoch": 2.356589147286822, "grad_norm": 0.6399916127128915, "learning_rate": 2.4412321012215824e-06, "loss": 0.3317, "mean_token_accuracy": 0.880051271058619, "num_tokens": 261812841.0, "step": 608 }, { "entropy": 0.394989013671875, "epoch": 2.3604651162790695, "grad_norm": 0.613061425373263, "learning_rate": 2.4133169566721426e-06, "loss": 0.3278, "mean_token_accuracy": 0.883314672857523, "num_tokens": 262236708.0, "step": 609 }, { "entropy": 0.389251708984375, "epoch": 2.3643410852713176, "grad_norm": 0.6588087474437182, "learning_rate": 2.3855404163086558e-06, "loss": 0.3013, "mean_token_accuracy": 0.8911587707698345, "num_tokens": 262663914.0, "step": 610 }, { "entropy": 0.388580322265625, "epoch": 2.3682170542635657, "grad_norm": 0.6650335500867999, "learning_rate": 2.3579029875919933e-06, "loss": 0.3337, "mean_token_accuracy": 0.8805716382339597, "num_tokens": 263097308.0, "step": 611 }, { "entropy": 0.38568115234375, "epoch": 2.3720930232558137, "grad_norm": 0.6613953074032827, "learning_rate": 2.330405175441529e-06, "loss": 0.3222, "mean_token_accuracy": 0.8836323749274015, "num_tokens": 263521684.0, "step": 612 }, { "entropy": 0.38543701171875, "epoch": 2.375968992248062, "grad_norm": 0.6419694798675706, "learning_rate": 2.3030474822259396e-06, "loss": 0.3215, "mean_token_accuracy": 0.8862187461927533, "num_tokens": 263961105.0, "step": 613 }, { "entropy": 0.384033203125, "epoch": 2.37984496124031, "grad_norm": 0.639786053232097, "learning_rate": 2.275830407754006e-06, "loss": 0.3083, "mean_token_accuracy": 0.8866908960044384, "num_tokens": 264391888.0, "step": 614 }, { "entropy": 0.388092041015625, "epoch": 2.383720930232558, "grad_norm": 0.6195099013880934, "learning_rate": 2.2487544492654832e-06, "loss": 0.3269, "mean_token_accuracy": 0.8840560354292393, "num_tokens": 264826227.0, "step": 615 }, { "entropy": 0.387176513671875, "epoch": 2.387596899224806, "grad_norm": 0.6003192458281774, "learning_rate": 2.2218201014220266e-06, "loss": 0.3172, "mean_token_accuracy": 0.8835435407236218, "num_tokens": 265245659.0, "step": 616 }, { "entropy": 0.38287353515625, "epoch": 2.391472868217054, "grad_norm": 0.6213328046705171, "learning_rate": 2.1950278562981497e-06, "loss": 0.3127, "mean_token_accuracy": 0.8866761410608888, "num_tokens": 265682628.0, "step": 617 }, { "entropy": 0.392486572265625, "epoch": 2.395348837209302, "grad_norm": 0.6357426254067038, "learning_rate": 2.1683782033722313e-06, "loss": 0.3376, "mean_token_accuracy": 0.880131833255291, "num_tokens": 266095365.0, "step": 618 }, { "entropy": 0.3843994140625, "epoch": 2.39922480620155, "grad_norm": 0.6563518530775935, "learning_rate": 2.1418716295175766e-06, "loss": 0.3132, "mean_token_accuracy": 0.8866546172648668, "num_tokens": 266542811.0, "step": 619 }, { "entropy": 0.388702392578125, "epoch": 2.4031007751937983, "grad_norm": 0.6423172247359347, "learning_rate": 2.1155086189935227e-06, "loss": 0.3204, "mean_token_accuracy": 0.8823329349979758, "num_tokens": 266968847.0, "step": 620 }, { "entropy": 0.383148193359375, "epoch": 2.4069767441860463, "grad_norm": 0.6140724947507443, "learning_rate": 2.08928965343659e-06, "loss": 0.3114, "mean_token_accuracy": 0.8879816886037588, "num_tokens": 267405210.0, "step": 621 }, { "entropy": 0.3831787109375, "epoch": 2.4108527131782944, "grad_norm": 0.6120711901597875, "learning_rate": 2.063215211851678e-06, "loss": 0.3102, "mean_token_accuracy": 0.8873960571363568, "num_tokens": 267834286.0, "step": 622 }, { "entropy": 0.38360595703125, "epoch": 2.4147286821705425, "grad_norm": 0.6702572768032118, "learning_rate": 2.037285770603321e-06, "loss": 0.3297, "mean_token_accuracy": 0.882229084149003, "num_tokens": 268270543.0, "step": 623 }, { "entropy": 0.37982177734375, "epoch": 2.4186046511627906, "grad_norm": 0.6677987053876467, "learning_rate": 2.0115018034069955e-06, "loss": 0.3136, "mean_token_accuracy": 0.8866470847278833, "num_tokens": 268692883.0, "step": 624 }, { "entropy": 0.379608154296875, "epoch": 2.4224806201550386, "grad_norm": 1.0553125635780334, "learning_rate": 1.9858637813204352e-06, "loss": 0.3089, "mean_token_accuracy": 0.8852367643266916, "num_tokens": 269108495.0, "step": 625 }, { "entropy": 0.383544921875, "epoch": 2.4263565891472867, "grad_norm": 0.6385113376841698, "learning_rate": 1.9603721727350532e-06, "loss": 0.312, "mean_token_accuracy": 0.8891161847859621, "num_tokens": 269541818.0, "step": 626 }, { "entropy": 0.38232421875, "epoch": 2.4302325581395348, "grad_norm": 0.6093604650776818, "learning_rate": 1.9350274433673745e-06, "loss": 0.3065, "mean_token_accuracy": 0.8878201972693205, "num_tokens": 269944971.0, "step": 627 }, { "entropy": 0.388641357421875, "epoch": 2.434108527131783, "grad_norm": 0.6283112616971976, "learning_rate": 1.9098300562505266e-06, "loss": 0.314, "mean_token_accuracy": 0.8887106478214264, "num_tokens": 270357437.0, "step": 628 }, { "entropy": 0.384918212890625, "epoch": 2.437984496124031, "grad_norm": 0.6207773662097917, "learning_rate": 1.8847804717257833e-06, "loss": 0.3184, "mean_token_accuracy": 0.884671707637608, "num_tokens": 270804654.0, "step": 629 }, { "entropy": 0.38421630859375, "epoch": 2.441860465116279, "grad_norm": 0.6193418360544493, "learning_rate": 1.8598791474341516e-06, "loss": 0.3234, "mean_token_accuracy": 0.8826173283159733, "num_tokens": 271227560.0, "step": 630 }, { "entropy": 0.3800048828125, "epoch": 2.445736434108527, "grad_norm": 0.6173876400834213, "learning_rate": 1.835126538308013e-06, "loss": 0.3123, "mean_token_accuracy": 0.8874257709830999, "num_tokens": 271645073.0, "step": 631 }, { "entropy": 0.3846435546875, "epoch": 2.449612403100775, "grad_norm": 0.6232085631801352, "learning_rate": 1.810523096562814e-06, "loss": 0.3124, "mean_token_accuracy": 0.8858788376674056, "num_tokens": 272066074.0, "step": 632 }, { "entropy": 0.383758544921875, "epoch": 2.453488372093023, "grad_norm": 0.6681731482811958, "learning_rate": 1.7860692716887906e-06, "loss": 0.3101, "mean_token_accuracy": 0.8878279887139797, "num_tokens": 272500490.0, "step": 633 }, { "entropy": 0.38494873046875, "epoch": 2.4573643410852712, "grad_norm": 0.6414729428255811, "learning_rate": 1.7617655104427833e-06, "loss": 0.333, "mean_token_accuracy": 0.8813438573852181, "num_tokens": 272914398.0, "step": 634 }, { "entropy": 0.383514404296875, "epoch": 2.4612403100775193, "grad_norm": 0.6210433661294859, "learning_rate": 1.7376122568400533e-06, "loss": 0.3239, "mean_token_accuracy": 0.882821892388165, "num_tokens": 273359632.0, "step": 635 }, { "entropy": 0.377685546875, "epoch": 2.4651162790697674, "grad_norm": 0.6122133439124037, "learning_rate": 1.713609952146168e-06, "loss": 0.3229, "mean_token_accuracy": 0.8840822214260697, "num_tokens": 273808032.0, "step": 636 }, { "entropy": 0.3824462890625, "epoch": 2.4689922480620154, "grad_norm": 0.6021018597895732, "learning_rate": 1.6897590348689607e-06, "loss": 0.3068, "mean_token_accuracy": 0.8891383018344641, "num_tokens": 274244780.0, "step": 637 }, { "entropy": 0.3824462890625, "epoch": 2.4728682170542635, "grad_norm": 0.6348069079815698, "learning_rate": 1.6660599407504995e-06, "loss": 0.319, "mean_token_accuracy": 0.8837192356586456, "num_tokens": 274681493.0, "step": 638 }, { "entropy": 0.385833740234375, "epoch": 2.4767441860465116, "grad_norm": 0.6124007089537341, "learning_rate": 1.6425131027591368e-06, "loss": 0.3299, "mean_token_accuracy": 0.8837573220953345, "num_tokens": 275115770.0, "step": 639 }, { "entropy": 0.3885498046875, "epoch": 2.4806201550387597, "grad_norm": 0.6271542904897481, "learning_rate": 1.6191189510815942e-06, "loss": 0.323, "mean_token_accuracy": 0.8834406100213528, "num_tokens": 275540602.0, "step": 640 }, { "entropy": 0.3846435546875, "epoch": 2.4844961240310077, "grad_norm": 0.6598534403286116, "learning_rate": 1.5958779131151049e-06, "loss": 0.3317, "mean_token_accuracy": 0.8816557712852955, "num_tokens": 275960564.0, "step": 641 }, { "entropy": 0.38409423828125, "epoch": 2.488372093023256, "grad_norm": 0.616565215055412, "learning_rate": 1.5727904134596084e-06, "loss": 0.3083, "mean_token_accuracy": 0.8885947009548545, "num_tokens": 276395546.0, "step": 642 }, { "entropy": 0.38604736328125, "epoch": 2.492248062015504, "grad_norm": 0.6157486130398027, "learning_rate": 1.5498568739099907e-06, "loss": 0.3155, "mean_token_accuracy": 0.8881635349243879, "num_tokens": 276819951.0, "step": 643 }, { "entropy": 0.3807373046875, "epoch": 2.496124031007752, "grad_norm": 0.6303652006027609, "learning_rate": 1.5270777134483683e-06, "loss": 0.3351, "mean_token_accuracy": 0.8798212753608823, "num_tokens": 277257356.0, "step": 644 }, { "entropy": 0.388397216796875, "epoch": 2.5, "grad_norm": 0.642010581870401, "learning_rate": 1.504453348236461e-06, "loss": 0.3212, "mean_token_accuracy": 0.8849497428163886, "num_tokens": 277661317.0, "step": 645 }, { "entropy": 0.382965087890625, "epoch": 2.503875968992248, "grad_norm": 0.6252232844795501, "learning_rate": 1.481984191607959e-06, "loss": 0.3105, "mean_token_accuracy": 0.8894343795254827, "num_tokens": 278088072.0, "step": 646 }, { "entropy": 0.380126953125, "epoch": 2.507751937984496, "grad_norm": 0.582093369587659, "learning_rate": 1.4596706540609862e-06, "loss": 0.3238, "mean_token_accuracy": 0.8829025160521269, "num_tokens": 278525498.0, "step": 647 }, { "entropy": 0.393646240234375, "epoch": 2.511627906976744, "grad_norm": 0.6328882087488623, "learning_rate": 1.4375131432505984e-06, "loss": 0.317, "mean_token_accuracy": 0.8850847911089659, "num_tokens": 278934425.0, "step": 648 }, { "entropy": 0.3792724609375, "epoch": 2.5155038759689923, "grad_norm": 0.6316222487630614, "learning_rate": 1.4155120639813392e-06, "loss": 0.3147, "mean_token_accuracy": 0.8857432128861547, "num_tokens": 279368869.0, "step": 649 }, { "entropy": 0.384033203125, "epoch": 2.5193798449612403, "grad_norm": 0.6163425061600645, "learning_rate": 1.3936678181998376e-06, "loss": 0.3077, "mean_token_accuracy": 0.8878246061503887, "num_tokens": 279776778.0, "step": 650 }, { "entropy": 0.384521484375, "epoch": 2.5232558139534884, "grad_norm": 0.6010971074385706, "learning_rate": 1.3719808049874695e-06, "loss": 0.3182, "mean_token_accuracy": 0.885104707442224, "num_tokens": 280216614.0, "step": 651 }, { "entropy": 0.383270263671875, "epoch": 2.5271317829457365, "grad_norm": 0.6249118870033076, "learning_rate": 1.350451420553065e-06, "loss": 0.3095, "mean_token_accuracy": 0.8848751662299037, "num_tokens": 280634172.0, "step": 652 }, { "entropy": 0.385467529296875, "epoch": 2.5310077519379846, "grad_norm": 0.6762125774083043, "learning_rate": 1.3290800582256714e-06, "loss": 0.3141, "mean_token_accuracy": 0.8861252348870039, "num_tokens": 281056902.0, "step": 653 }, { "entropy": 0.37860107421875, "epoch": 2.5348837209302326, "grad_norm": 0.6074642502490406, "learning_rate": 1.3078671084473604e-06, "loss": 0.3079, "mean_token_accuracy": 0.8881129696965218, "num_tokens": 281497206.0, "step": 654 }, { "entropy": 0.380523681640625, "epoch": 2.5387596899224807, "grad_norm": 0.6209025013219114, "learning_rate": 1.286812958766106e-06, "loss": 0.3049, "mean_token_accuracy": 0.8877112930640578, "num_tokens": 281916304.0, "step": 655 }, { "entropy": 0.385955810546875, "epoch": 2.5426356589147288, "grad_norm": 0.6328788729805691, "learning_rate": 1.2659179938287035e-06, "loss": 0.3231, "mean_token_accuracy": 0.883956940844655, "num_tokens": 282338790.0, "step": 656 }, { "entropy": 0.387176513671875, "epoch": 2.546511627906977, "grad_norm": 0.6132138038624881, "learning_rate": 1.2451825953737273e-06, "loss": 0.3125, "mean_token_accuracy": 0.8869189685210586, "num_tokens": 282771871.0, "step": 657 }, { "entropy": 0.383087158203125, "epoch": 2.550387596899225, "grad_norm": 0.6100791024748512, "learning_rate": 1.224607142224572e-06, "loss": 0.3213, "mean_token_accuracy": 0.8853695271536708, "num_tokens": 283186245.0, "step": 658 }, { "entropy": 0.387054443359375, "epoch": 2.554263565891473, "grad_norm": 0.6239314174742299, "learning_rate": 1.2041920102825277e-06, "loss": 0.299, "mean_token_accuracy": 0.8931658444926143, "num_tokens": 283603528.0, "step": 659 }, { "entropy": 0.390350341796875, "epoch": 2.558139534883721, "grad_norm": 0.5964951578284012, "learning_rate": 1.1839375725199098e-06, "loss": 0.3206, "mean_token_accuracy": 0.8860745606943965, "num_tokens": 284012300.0, "step": 660 }, { "entropy": 0.37884521484375, "epoch": 2.562015503875969, "grad_norm": 0.6186039138525928, "learning_rate": 1.1638441989732474e-06, "loss": 0.305, "mean_token_accuracy": 0.8898253720253706, "num_tokens": 284456555.0, "step": 661 }, { "entropy": 0.38311767578125, "epoch": 2.565891472868217, "grad_norm": 0.6436702012478264, "learning_rate": 1.1439122567365214e-06, "loss": 0.3143, "mean_token_accuracy": 0.8866937700659037, "num_tokens": 284898106.0, "step": 662 }, { "entropy": 0.38592529296875, "epoch": 2.5697674418604652, "grad_norm": 0.6081422963421407, "learning_rate": 1.124142109954459e-06, "loss": 0.3092, "mean_token_accuracy": 0.8888078099116683, "num_tokens": 285322897.0, "step": 663 }, { "entropy": 0.3870849609375, "epoch": 2.5736434108527133, "grad_norm": 0.618254819328906, "learning_rate": 1.1045341198158833e-06, "loss": 0.2992, "mean_token_accuracy": 0.892044042237103, "num_tokens": 285743476.0, "step": 664 }, { "entropy": 0.382415771484375, "epoch": 2.5775193798449614, "grad_norm": 0.6382580440374259, "learning_rate": 1.0850886445471055e-06, "loss": 0.3199, "mean_token_accuracy": 0.8851598743349314, "num_tokens": 286167399.0, "step": 665 }, { "entropy": 0.38165283203125, "epoch": 2.5813953488372094, "grad_norm": 0.6249775328464647, "learning_rate": 1.0658060394053904e-06, "loss": 0.3105, "mean_token_accuracy": 0.8882809020578861, "num_tokens": 286601559.0, "step": 666 }, { "entropy": 0.385833740234375, "epoch": 2.5852713178294575, "grad_norm": 0.6355443861472679, "learning_rate": 1.0466866566724698e-06, "loss": 0.3164, "mean_token_accuracy": 0.8866121266037226, "num_tokens": 287036421.0, "step": 667 }, { "entropy": 0.38177490234375, "epoch": 2.5891472868217056, "grad_norm": 0.5788824075949772, "learning_rate": 1.027730845648085e-06, "loss": 0.3273, "mean_token_accuracy": 0.8835360938683152, "num_tokens": 287484834.0, "step": 668 }, { "entropy": 0.381439208984375, "epoch": 2.5930232558139537, "grad_norm": 0.6105981189688949, "learning_rate": 1.0089389526436299e-06, "loss": 0.3098, "mean_token_accuracy": 0.8879820080474019, "num_tokens": 287920778.0, "step": 669 }, { "entropy": 0.377593994140625, "epoch": 2.5968992248062017, "grad_norm": 0.6244913453198664, "learning_rate": 9.903113209758098e-07, "loss": 0.3051, "mean_token_accuracy": 0.8882512943819165, "num_tokens": 288354358.0, "step": 670 }, { "entropy": 0.37994384765625, "epoch": 2.60077519379845, "grad_norm": 0.6181953248203848, "learning_rate": 9.718482909603732e-07, "loss": 0.3117, "mean_token_accuracy": 0.8855423256754875, "num_tokens": 288793843.0, "step": 671 }, { "entropy": 0.37725830078125, "epoch": 2.604651162790698, "grad_norm": 0.6349924792877945, "learning_rate": 9.535501999058971e-07, "loss": 0.3168, "mean_token_accuracy": 0.8862435938790441, "num_tokens": 289225299.0, "step": 672 }, { "entropy": 0.387481689453125, "epoch": 2.608527131782946, "grad_norm": 0.6693755629098281, "learning_rate": 9.354173821076184e-07, "loss": 0.3234, "mean_token_accuracy": 0.8822949966415763, "num_tokens": 289644576.0, "step": 673 }, { "entropy": 0.386260986328125, "epoch": 2.612403100775194, "grad_norm": 0.6097539865340834, "learning_rate": 9.174501688413329e-07, "loss": 0.3205, "mean_token_accuracy": 0.886177402921021, "num_tokens": 290070186.0, "step": 674 }, { "entropy": 0.38409423828125, "epoch": 2.616279069767442, "grad_norm": 0.6152331705902413, "learning_rate": 8.996488883573351e-07, "loss": 0.3229, "mean_token_accuracy": 0.8854604810476303, "num_tokens": 290499509.0, "step": 675 }, { "entropy": 0.383544921875, "epoch": 2.62015503875969, "grad_norm": 0.6258210400477547, "learning_rate": 8.820138658744304e-07, "loss": 0.304, "mean_token_accuracy": 0.8898686449974775, "num_tokens": 290921470.0, "step": 676 }, { "entropy": 0.387298583984375, "epoch": 2.624031007751938, "grad_norm": 0.6137335164148037, "learning_rate": 8.645454235739903e-07, "loss": 0.325, "mean_token_accuracy": 0.8838518625125289, "num_tokens": 291348905.0, "step": 677 }, { "entropy": 0.386138916015625, "epoch": 2.6279069767441863, "grad_norm": 0.6432892349676601, "learning_rate": 8.472438805940652e-07, "loss": 0.332, "mean_token_accuracy": 0.8829165082424879, "num_tokens": 291774652.0, "step": 678 }, { "entropy": 0.3824462890625, "epoch": 2.6317829457364343, "grad_norm": 0.6291908471810097, "learning_rate": 8.301095530235492e-07, "loss": 0.338, "mean_token_accuracy": 0.882173334248364, "num_tokens": 292214413.0, "step": 679 }, { "entropy": 0.38372802734375, "epoch": 2.6356589147286824, "grad_norm": 0.6063757017276409, "learning_rate": 8.131427538964165e-07, "loss": 0.3184, "mean_token_accuracy": 0.8866541981697083, "num_tokens": 292653229.0, "step": 680 }, { "entropy": 0.38775634765625, "epoch": 2.6395348837209305, "grad_norm": 0.6573377080205017, "learning_rate": 7.963437931859919e-07, "loss": 0.297, "mean_token_accuracy": 0.8897636560723186, "num_tokens": 293062442.0, "step": 681 }, { "entropy": 0.3831787109375, "epoch": 2.6434108527131785, "grad_norm": 0.6270362942338372, "learning_rate": 7.797129777992951e-07, "loss": 0.3213, "mean_token_accuracy": 0.8836455037817359, "num_tokens": 293504484.0, "step": 682 }, { "entropy": 0.38446044921875, "epoch": 2.6472868217054266, "grad_norm": 0.6577635211374225, "learning_rate": 7.632506115714289e-07, "loss": 0.3089, "mean_token_accuracy": 0.8872796315699816, "num_tokens": 293948693.0, "step": 683 }, { "entropy": 0.38397216796875, "epoch": 2.6511627906976747, "grad_norm": 0.6293287670711796, "learning_rate": 7.46956995260033e-07, "loss": 0.317, "mean_token_accuracy": 0.8846086421981454, "num_tokens": 294376049.0, "step": 684 }, { "entropy": 0.382598876953125, "epoch": 2.6550387596899228, "grad_norm": 0.6101235033718403, "learning_rate": 7.308324265397837e-07, "loss": 0.3053, "mean_token_accuracy": 0.8887326065450907, "num_tokens": 294817272.0, "step": 685 }, { "entropy": 0.384246826171875, "epoch": 2.6589147286821704, "grad_norm": 0.603135068569057, "learning_rate": 7.148771999969573e-07, "loss": 0.3187, "mean_token_accuracy": 0.8846358032897115, "num_tokens": 295268820.0, "step": 686 }, { "entropy": 0.375823974609375, "epoch": 2.6627906976744184, "grad_norm": 0.6370470013882648, "learning_rate": 6.990916071240506e-07, "loss": 0.3162, "mean_token_accuracy": 0.8857185756787658, "num_tokens": 295704514.0, "step": 687 }, { "entropy": 0.3878173828125, "epoch": 2.6666666666666665, "grad_norm": 0.6307798739932974, "learning_rate": 6.834759363144595e-07, "loss": 0.321, "mean_token_accuracy": 0.884716515429318, "num_tokens": 296123518.0, "step": 688 }, { "entropy": 0.38555908203125, "epoch": 2.6705426356589146, "grad_norm": 0.6359467279336724, "learning_rate": 6.680304728571963e-07, "loss": 0.3115, "mean_token_accuracy": 0.887707345187664, "num_tokens": 296568970.0, "step": 689 }, { "entropy": 0.38232421875, "epoch": 2.6744186046511627, "grad_norm": 0.6594731380323239, "learning_rate": 6.527554989316898e-07, "loss": 0.3147, "mean_token_accuracy": 0.8869885383173823, "num_tokens": 297009689.0, "step": 690 }, { "entropy": 0.376495361328125, "epoch": 2.6782945736434107, "grad_norm": 0.6528547449153141, "learning_rate": 6.37651293602628e-07, "loss": 0.3056, "mean_token_accuracy": 0.8896724404767156, "num_tokens": 297466234.0, "step": 691 }, { "entropy": 0.380035400390625, "epoch": 2.682170542635659, "grad_norm": 0.5984715392007748, "learning_rate": 6.227181328148568e-07, "loss": 0.3074, "mean_token_accuracy": 0.8880112646147609, "num_tokens": 297913176.0, "step": 692 }, { "entropy": 0.382171630859375, "epoch": 2.686046511627907, "grad_norm": 0.6055523687350702, "learning_rate": 6.079562893883395e-07, "loss": 0.3136, "mean_token_accuracy": 0.8882074085995555, "num_tokens": 298336850.0, "step": 693 }, { "entropy": 0.380035400390625, "epoch": 2.689922480620155, "grad_norm": 0.6196322460239065, "learning_rate": 5.933660330131752e-07, "loss": 0.3129, "mean_token_accuracy": 0.8837150661274791, "num_tokens": 298778162.0, "step": 694 }, { "entropy": 0.382781982421875, "epoch": 2.693798449612403, "grad_norm": 0.6053882069820191, "learning_rate": 5.789476302446662e-07, "loss": 0.3206, "mean_token_accuracy": 0.8839719081297517, "num_tokens": 299213302.0, "step": 695 }, { "entropy": 0.37969970703125, "epoch": 2.697674418604651, "grad_norm": 0.6516034301333918, "learning_rate": 5.647013444984561e-07, "loss": 0.3086, "mean_token_accuracy": 0.8866365505382419, "num_tokens": 299633216.0, "step": 696 }, { "entropy": 0.378387451171875, "epoch": 2.701550387596899, "grad_norm": 0.6020971649541776, "learning_rate": 5.506274360457087e-07, "loss": 0.3094, "mean_token_accuracy": 0.8886294420808554, "num_tokens": 300056664.0, "step": 697 }, { "entropy": 0.388641357421875, "epoch": 2.705426356589147, "grad_norm": 0.699089060760847, "learning_rate": 5.367261620083575e-07, "loss": 0.3257, "mean_token_accuracy": 0.8841283833608031, "num_tokens": 300476483.0, "step": 698 }, { "entropy": 0.388946533203125, "epoch": 2.7093023255813953, "grad_norm": 0.5976278141664447, "learning_rate": 5.229977763544148e-07, "loss": 0.3267, "mean_token_accuracy": 0.8839065292850137, "num_tokens": 300896040.0, "step": 699 }, { "entropy": 0.38214111328125, "epoch": 2.7131782945736433, "grad_norm": 0.598960900544294, "learning_rate": 5.094425298933136e-07, "loss": 0.3161, "mean_token_accuracy": 0.8860001573339105, "num_tokens": 301328932.0, "step": 700 }, { "entropy": 0.378326416015625, "epoch": 2.7170542635658914, "grad_norm": 0.6121343549337338, "learning_rate": 4.960606702713466e-07, "loss": 0.3095, "mean_token_accuracy": 0.8888444481417537, "num_tokens": 301789862.0, "step": 701 }, { "entropy": 0.385162353515625, "epoch": 2.7209302325581395, "grad_norm": 0.5949190313734338, "learning_rate": 4.828524419671266e-07, "loss": 0.3166, "mean_token_accuracy": 0.8873936915770173, "num_tokens": 302211924.0, "step": 702 }, { "entropy": 0.387908935546875, "epoch": 2.7248062015503876, "grad_norm": 0.6231799085677647, "learning_rate": 4.6981808628712823e-07, "loss": 0.3226, "mean_token_accuracy": 0.8857519812881947, "num_tokens": 302632584.0, "step": 703 }, { "entropy": 0.381256103515625, "epoch": 2.7286821705426356, "grad_norm": 0.597154298203785, "learning_rate": 4.569578413612752e-07, "loss": 0.3246, "mean_token_accuracy": 0.8845319030806422, "num_tokens": 303079662.0, "step": 704 }, { "entropy": 0.381011962890625, "epoch": 2.7325581395348837, "grad_norm": 0.6097019666151339, "learning_rate": 4.4427194213859216e-07, "loss": 0.3055, "mean_token_accuracy": 0.8890186436474323, "num_tokens": 303513162.0, "step": 705 }, { "entropy": 0.377777099609375, "epoch": 2.7364341085271318, "grad_norm": 0.6097219538821098, "learning_rate": 4.3176062038291275e-07, "loss": 0.3084, "mean_token_accuracy": 0.8881680406630039, "num_tokens": 303965110.0, "step": 706 }, { "entropy": 0.3841552734375, "epoch": 2.74031007751938, "grad_norm": 0.5987018303788194, "learning_rate": 4.194241046686398e-07, "loss": 0.3233, "mean_token_accuracy": 0.8819524059072137, "num_tokens": 304417429.0, "step": 707 }, { "entropy": 0.379608154296875, "epoch": 2.744186046511628, "grad_norm": 0.6088852283958164, "learning_rate": 4.0726262037657506e-07, "loss": 0.3172, "mean_token_accuracy": 0.8878186987712979, "num_tokens": 304848098.0, "step": 708 }, { "entropy": 0.38323974609375, "epoch": 2.748062015503876, "grad_norm": 0.6247123304946086, "learning_rate": 3.9527638968980707e-07, "loss": 0.3051, "mean_token_accuracy": 0.8870606170967221, "num_tokens": 305267991.0, "step": 709 }, { "entropy": 0.3841552734375, "epoch": 2.751937984496124, "grad_norm": 0.6268733632031187, "learning_rate": 3.834656315896379e-07, "loss": 0.3134, "mean_token_accuracy": 0.8865107716992497, "num_tokens": 305695308.0, "step": 710 }, { "entropy": 0.3807373046875, "epoch": 2.755813953488372, "grad_norm": 0.6379178455584169, "learning_rate": 3.718305618515905e-07, "loss": 0.3188, "mean_token_accuracy": 0.8853443302214146, "num_tokens": 306143903.0, "step": 711 }, { "entropy": 0.380645751953125, "epoch": 2.75968992248062, "grad_norm": 0.6004645935396044, "learning_rate": 3.603713930414676e-07, "loss": 0.3128, "mean_token_accuracy": 0.8875391287729144, "num_tokens": 306583114.0, "step": 712 }, { "entropy": 0.377777099609375, "epoch": 2.7635658914728682, "grad_norm": 0.6013189306552227, "learning_rate": 3.490883345114671e-07, "loss": 0.3099, "mean_token_accuracy": 0.8893257696181536, "num_tokens": 307035468.0, "step": 713 }, { "entropy": 0.377227783203125, "epoch": 2.7674418604651163, "grad_norm": 0.6471701813407102, "learning_rate": 3.3798159239635585e-07, "loss": 0.3104, "mean_token_accuracy": 0.8878767946735024, "num_tokens": 307468508.0, "step": 714 }, { "entropy": 0.3798828125, "epoch": 2.7713178294573644, "grad_norm": 0.6053150592034582, "learning_rate": 3.2705136960970554e-07, "loss": 0.3051, "mean_token_accuracy": 0.8884820081293583, "num_tokens": 307897907.0, "step": 715 }, { "entropy": 0.386627197265625, "epoch": 2.7751937984496124, "grad_norm": 0.6168893552698425, "learning_rate": 3.1629786584018387e-07, "loss": 0.3104, "mean_token_accuracy": 0.8878091182559729, "num_tokens": 308327346.0, "step": 716 }, { "entropy": 0.383270263671875, "epoch": 2.7790697674418605, "grad_norm": 0.6368678138884837, "learning_rate": 3.05721277547909e-07, "loss": 0.3224, "mean_token_accuracy": 0.8836130304262042, "num_tokens": 308762802.0, "step": 717 }, { "entropy": 0.377960205078125, "epoch": 2.7829457364341086, "grad_norm": 0.6316411958735261, "learning_rate": 2.9532179796085356e-07, "loss": 0.2994, "mean_token_accuracy": 0.8904037978500128, "num_tokens": 309183208.0, "step": 718 }, { "entropy": 0.3790283203125, "epoch": 2.7868217054263567, "grad_norm": 0.6158546042861247, "learning_rate": 2.8509961707132496e-07, "loss": 0.3092, "mean_token_accuracy": 0.8881755471229553, "num_tokens": 309623208.0, "step": 719 }, { "entropy": 0.374969482421875, "epoch": 2.7906976744186047, "grad_norm": 0.6093210127446324, "learning_rate": 2.750549216324894e-07, "loss": 0.3, "mean_token_accuracy": 0.8928798316046596, "num_tokens": 310055523.0, "step": 720 }, { "entropy": 0.38287353515625, "epoch": 2.794573643410853, "grad_norm": 0.6559455125212247, "learning_rate": 2.6518789515495356e-07, "loss": 0.3148, "mean_token_accuracy": 0.8859463995322585, "num_tokens": 310485887.0, "step": 721 }, { "entropy": 0.38128662109375, "epoch": 2.798449612403101, "grad_norm": 0.6032446881532523, "learning_rate": 2.554987179034218e-07, "loss": 0.3221, "mean_token_accuracy": 0.8861085483804345, "num_tokens": 310926474.0, "step": 722 }, { "entropy": 0.385955810546875, "epoch": 2.802325581395349, "grad_norm": 0.6176670649999463, "learning_rate": 2.4598756689339975e-07, "loss": 0.3116, "mean_token_accuracy": 0.8888868298381567, "num_tokens": 311337806.0, "step": 723 }, { "entropy": 0.37939453125, "epoch": 2.806201550387597, "grad_norm": 0.6115489071383378, "learning_rate": 2.3665461588795902e-07, "loss": 0.3099, "mean_token_accuracy": 0.887202151119709, "num_tokens": 311771969.0, "step": 724 }, { "entropy": 0.380401611328125, "epoch": 2.810077519379845, "grad_norm": 0.6168345748959235, "learning_rate": 2.2750003539456e-07, "loss": 0.2989, "mean_token_accuracy": 0.8897026870399714, "num_tokens": 312196977.0, "step": 725 }, { "entropy": 0.378082275390625, "epoch": 2.813953488372093, "grad_norm": 0.5932331085634238, "learning_rate": 2.1852399266194312e-07, "loss": 0.2981, "mean_token_accuracy": 0.8931511901319027, "num_tokens": 312628560.0, "step": 726 }, { "entropy": 0.378814697265625, "epoch": 2.817829457364341, "grad_norm": 0.6263166553893282, "learning_rate": 2.097266516770713e-07, "loss": 0.2902, "mean_token_accuracy": 0.8911039233207703, "num_tokens": 313057357.0, "step": 727 }, { "entropy": 0.38458251953125, "epoch": 2.8217054263565893, "grad_norm": 0.6039916001081406, "learning_rate": 2.0110817316212893e-07, "loss": 0.3166, "mean_token_accuracy": 0.8877752646803856, "num_tokens": 313489145.0, "step": 728 }, { "entropy": 0.3828125, "epoch": 2.8255813953488373, "grad_norm": 0.5978445141112033, "learning_rate": 1.9266871457159108e-07, "loss": 0.3148, "mean_token_accuracy": 0.8861930128186941, "num_tokens": 313909863.0, "step": 729 }, { "entropy": 0.37738037109375, "epoch": 2.8294573643410854, "grad_norm": 0.6048382839853098, "learning_rate": 1.844084300893456e-07, "loss": 0.3046, "mean_token_accuracy": 0.8915831623598933, "num_tokens": 314341606.0, "step": 730 }, { "entropy": 0.380645751953125, "epoch": 2.8333333333333335, "grad_norm": 0.6482349068060914, "learning_rate": 1.7632747062587884e-07, "loss": 0.3061, "mean_token_accuracy": 0.8873017486184835, "num_tokens": 314769168.0, "step": 731 }, { "entropy": 0.377166748046875, "epoch": 2.8372093023255816, "grad_norm": 0.6049506747628145, "learning_rate": 1.6842598381551e-07, "loss": 0.2998, "mean_token_accuracy": 0.8923018351197243, "num_tokens": 315195435.0, "step": 732 }, { "entropy": 0.388031005859375, "epoch": 2.8410852713178296, "grad_norm": 0.5889538128743542, "learning_rate": 1.6070411401370335e-07, "loss": 0.305, "mean_token_accuracy": 0.8921971945092082, "num_tokens": 315604073.0, "step": 733 }, { "entropy": 0.37384033203125, "epoch": 2.8449612403100772, "grad_norm": 0.6226688966249363, "learning_rate": 1.531620022944269e-07, "loss": 0.3002, "mean_token_accuracy": 0.8922487432137132, "num_tokens": 316040545.0, "step": 734 }, { "entropy": 0.38275146484375, "epoch": 2.8488372093023253, "grad_norm": 0.668427970745727, "learning_rate": 1.4579978644757463e-07, "loss": 0.3181, "mean_token_accuracy": 0.8864294197410345, "num_tokens": 316468028.0, "step": 735 }, { "entropy": 0.378875732421875, "epoch": 2.8527131782945734, "grad_norm": 0.5924749050744685, "learning_rate": 1.3861760097645062e-07, "loss": 0.3031, "mean_token_accuracy": 0.8921865597367287, "num_tokens": 316904656.0, "step": 736 }, { "entropy": 0.375030517578125, "epoch": 2.8565891472868215, "grad_norm": 0.6041440396713224, "learning_rate": 1.3161557709530982e-07, "loss": 0.3117, "mean_token_accuracy": 0.8871205970644951, "num_tokens": 317339393.0, "step": 737 }, { "entropy": 0.385498046875, "epoch": 2.8604651162790695, "grad_norm": 0.6084600637159178, "learning_rate": 1.2479384272696572e-07, "loss": 0.3076, "mean_token_accuracy": 0.8861882789060473, "num_tokens": 317754313.0, "step": 738 }, { "entropy": 0.382904052734375, "epoch": 2.8643410852713176, "grad_norm": 0.5945461244400033, "learning_rate": 1.1815252250044318e-07, "loss": 0.3045, "mean_token_accuracy": 0.8900884315371513, "num_tokens": 318175206.0, "step": 739 }, { "entropy": 0.38031005859375, "epoch": 2.8682170542635657, "grad_norm": 0.5881787762100744, "learning_rate": 1.1169173774871478e-07, "loss": 0.2931, "mean_token_accuracy": 0.8941018283367157, "num_tokens": 318603349.0, "step": 740 }, { "entropy": 0.3812255859375, "epoch": 2.8720930232558137, "grad_norm": 0.5955703008177884, "learning_rate": 1.0541160650647364e-07, "loss": 0.3113, "mean_token_accuracy": 0.8901800969615579, "num_tokens": 319029522.0, "step": 741 }, { "entropy": 0.38336181640625, "epoch": 2.875968992248062, "grad_norm": 0.620592670160612, "learning_rate": 9.931224350798185e-08, "loss": 0.3163, "mean_token_accuracy": 0.889438440091908, "num_tokens": 319467084.0, "step": 742 }, { "entropy": 0.389007568359375, "epoch": 2.87984496124031, "grad_norm": 0.6263135619513145, "learning_rate": 9.339376018497216e-08, "loss": 0.3135, "mean_token_accuracy": 0.8862051470205188, "num_tokens": 319883624.0, "step": 743 }, { "entropy": 0.374481201171875, "epoch": 2.883720930232558, "grad_norm": 0.6146314660717794, "learning_rate": 8.765626466461397e-08, "loss": 0.3245, "mean_token_accuracy": 0.8832680499181151, "num_tokens": 320347517.0, "step": 744 }, { "entropy": 0.379791259765625, "epoch": 2.887596899224806, "grad_norm": 0.6482836636122309, "learning_rate": 8.209986176753947e-08, "loss": 0.316, "mean_token_accuracy": 0.885383871383965, "num_tokens": 320769928.0, "step": 745 }, { "entropy": 0.3804931640625, "epoch": 2.891472868217054, "grad_norm": 0.6068986326895162, "learning_rate": 7.672465300592069e-08, "loss": 0.3152, "mean_token_accuracy": 0.8881333563476801, "num_tokens": 321217963.0, "step": 746 }, { "entropy": 0.380157470703125, "epoch": 2.895348837209302, "grad_norm": 0.5942824706627966, "learning_rate": 7.153073658162646e-08, "loss": 0.3131, "mean_token_accuracy": 0.8865061281248927, "num_tokens": 321637092.0, "step": 747 }, { "entropy": 0.378143310546875, "epoch": 2.89922480620155, "grad_norm": 0.6413515101364472, "learning_rate": 6.65182073844195e-08, "loss": 0.3028, "mean_token_accuracy": 0.8900581542402506, "num_tokens": 322084312.0, "step": 748 }, { "entropy": 0.379058837890625, "epoch": 2.9031007751937983, "grad_norm": 0.5995579886871476, "learning_rate": 6.168715699022776e-08, "loss": 0.3201, "mean_token_accuracy": 0.8862557569518685, "num_tokens": 322517226.0, "step": 749 }, { "entropy": 0.379669189453125, "epoch": 2.9069767441860463, "grad_norm": 0.6139724741481611, "learning_rate": 5.7037673659464664e-08, "loss": 0.3062, "mean_token_accuracy": 0.8911184314638376, "num_tokens": 322943803.0, "step": 750 }, { "entropy": 0.38592529296875, "epoch": 2.9108527131782944, "grad_norm": 0.6371344577180248, "learning_rate": 5.256984233542595e-08, "loss": 0.3227, "mean_token_accuracy": 0.8849206436425447, "num_tokens": 323366694.0, "step": 751 }, { "entropy": 0.38250732421875, "epoch": 2.9147286821705425, "grad_norm": 0.595286110379896, "learning_rate": 4.828374464273422e-08, "loss": 0.3146, "mean_token_accuracy": 0.8888753112405539, "num_tokens": 323787284.0, "step": 752 }, { "entropy": 0.379486083984375, "epoch": 2.9186046511627906, "grad_norm": 0.608322323266727, "learning_rate": 4.417945888584241e-08, "loss": 0.3023, "mean_token_accuracy": 0.8914187019690871, "num_tokens": 324198852.0, "step": 753 }, { "entropy": 0.386444091796875, "epoch": 2.9224806201550386, "grad_norm": 0.6120063755796368, "learning_rate": 4.025706004760932e-08, "loss": 0.3149, "mean_token_accuracy": 0.8872729791328311, "num_tokens": 324621415.0, "step": 754 }, { "entropy": 0.387176513671875, "epoch": 2.9263565891472867, "grad_norm": 0.623477103826481, "learning_rate": 3.651661978793075e-08, "loss": 0.3043, "mean_token_accuracy": 0.8890358442440629, "num_tokens": 325023864.0, "step": 755 }, { "entropy": 0.3787841796875, "epoch": 2.9302325581395348, "grad_norm": 0.6148588161751789, "learning_rate": 3.2958206442422754e-08, "loss": 0.3102, "mean_token_accuracy": 0.8885551122948527, "num_tokens": 325444314.0, "step": 756 }, { "entropy": 0.377716064453125, "epoch": 2.934108527131783, "grad_norm": 0.6007577376002693, "learning_rate": 2.9581885021181534e-08, "loss": 0.3032, "mean_token_accuracy": 0.8900681380182505, "num_tokens": 325899500.0, "step": 757 }, { "entropy": 0.381317138671875, "epoch": 2.937984496124031, "grad_norm": 0.6031347151158447, "learning_rate": 2.6387717207589925e-08, "loss": 0.3206, "mean_token_accuracy": 0.8850037911906838, "num_tokens": 326331108.0, "step": 758 }, { "entropy": 0.385345458984375, "epoch": 2.941860465116279, "grad_norm": 0.601218932311134, "learning_rate": 2.3375761357193883e-08, "loss": 0.3109, "mean_token_accuracy": 0.8882812475785613, "num_tokens": 326742800.0, "step": 759 }, { "entropy": 0.37762451171875, "epoch": 2.945736434108527, "grad_norm": 0.6655915324031394, "learning_rate": 2.054607249663665e-08, "loss": 0.3276, "mean_token_accuracy": 0.88413349352777, "num_tokens": 327198682.0, "step": 760 }, { "entropy": 0.38250732421875, "epoch": 2.949612403100775, "grad_norm": 0.6064845718430714, "learning_rate": 1.7898702322648453e-08, "loss": 0.3008, "mean_token_accuracy": 0.8902482967823744, "num_tokens": 327619351.0, "step": 761 }, { "entropy": 0.379852294921875, "epoch": 2.953488372093023, "grad_norm": 0.6119773387004711, "learning_rate": 1.5433699201108377e-08, "loss": 0.3153, "mean_token_accuracy": 0.8852389631792903, "num_tokens": 328045535.0, "step": 762 }, { "entropy": 0.38037109375, "epoch": 2.9573643410852712, "grad_norm": 0.6198918451978505, "learning_rate": 1.3151108166156168e-08, "loss": 0.3096, "mean_token_accuracy": 0.8888922138139606, "num_tokens": 328473736.0, "step": 763 }, { "entropy": 0.378204345703125, "epoch": 2.9612403100775193, "grad_norm": 0.5998301701967552, "learning_rate": 1.1050970919374016e-08, "loss": 0.2997, "mean_token_accuracy": 0.8889357475563884, "num_tokens": 328890873.0, "step": 764 }, { "entropy": 0.378997802734375, "epoch": 2.9651162790697674, "grad_norm": 0.6177168341536942, "learning_rate": 9.13332582901716e-09, "loss": 0.3165, "mean_token_accuracy": 0.8886342123150826, "num_tokens": 329322746.0, "step": 765 }, { "entropy": 0.379241943359375, "epoch": 2.9689922480620154, "grad_norm": 0.592192707262181, "learning_rate": 7.3982079293233314e-09, "loss": 0.3056, "mean_token_accuracy": 0.8878153394907713, "num_tokens": 329742686.0, "step": 766 }, { "entropy": 0.3809814453125, "epoch": 2.9728682170542635, "grad_norm": 0.5800435523051897, "learning_rate": 5.845648919863278e-09, "loss": 0.3224, "mean_token_accuracy": 0.8846305012702942, "num_tokens": 330187335.0, "step": 767 }, { "entropy": 0.37921142578125, "epoch": 2.9767441860465116, "grad_norm": 0.6234929524950468, "learning_rate": 4.475677164966774e-09, "loss": 0.3253, "mean_token_accuracy": 0.8846317222341895, "num_tokens": 330621750.0, "step": 768 }, { "entropy": 0.383514404296875, "epoch": 2.9806201550387597, "grad_norm": 0.621643677788845, "learning_rate": 3.2883176932019255e-09, "loss": 0.3168, "mean_token_accuracy": 0.8865220500156283, "num_tokens": 331055767.0, "step": 769 }, { "entropy": 0.380706787109375, "epoch": 2.9844961240310077, "grad_norm": 0.6025789986792925, "learning_rate": 2.2835921969210917e-09, "loss": 0.2977, "mean_token_accuracy": 0.890127319842577, "num_tokens": 331478526.0, "step": 770 }, { "entropy": 0.38348388671875, "epoch": 2.988372093023256, "grad_norm": 0.621377310978937, "learning_rate": 1.4615190318600924e-09, "loss": 0.3107, "mean_token_accuracy": 0.8878284217789769, "num_tokens": 331903679.0, "step": 771 }, { "entropy": 0.382965087890625, "epoch": 2.992248062015504, "grad_norm": 0.6302553243798728, "learning_rate": 8.221132168073631e-10, "loss": 0.317, "mean_token_accuracy": 0.8872353015467525, "num_tokens": 332342987.0, "step": 772 }, { "entropy": 0.389373779296875, "epoch": 2.996124031007752, "grad_norm": 0.6125161547285307, "learning_rate": 3.653864333275081e-10, "loss": 0.3118, "mean_token_accuracy": 0.888038388453424, "num_tokens": 332763851.0, "step": 773 }, { "entropy": 0.37945556640625, "epoch": 3.0, "grad_norm": 0.600207904778677, "learning_rate": 9.134702554591812e-11, "loss": 0.3101, "mean_token_accuracy": 0.8894096985459328, "num_tokens": 333188605.0, "step": 774 }, { "epoch": 3.0, "step": 774, "total_flos": 616169609822208.0, "train_loss": 0.4388826183339422, "train_runtime": 59766.2651, "train_samples_per_second": 1.24, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 774, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 65, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 616169609822208.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }