{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 456, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0043859649122807015, "grad_norm": 100.72002410888672, "learning_rate": 2.173913043478261e-07, "loss": 2.8882, "mean_token_accuracy": 0.5581395626068115, "step": 1 }, { "epoch": 0.008771929824561403, "grad_norm": 102.30889892578125, "learning_rate": 4.347826086956522e-07, "loss": 2.7578, "mean_token_accuracy": 0.5808748602867126, "step": 2 }, { "epoch": 0.013157894736842105, "grad_norm": 95.59414672851562, "learning_rate": 6.521739130434783e-07, "loss": 2.8921, "mean_token_accuracy": 0.5645005106925964, "step": 3 }, { "epoch": 0.017543859649122806, "grad_norm": 104.23493957519531, "learning_rate": 8.695652173913044e-07, "loss": 2.8879, "mean_token_accuracy": 0.5589743852615356, "step": 4 }, { "epoch": 0.021929824561403508, "grad_norm": 104.50048065185547, "learning_rate": 1.0869565217391306e-06, "loss": 2.8202, "mean_token_accuracy": 0.5714285969734192, "step": 5 }, { "epoch": 0.02631578947368421, "grad_norm": 92.29496765136719, "learning_rate": 1.3043478260869566e-06, "loss": 2.5391, "mean_token_accuracy": 0.6177605986595154, "step": 6 }, { "epoch": 0.03070175438596491, "grad_norm": 97.3690414428711, "learning_rate": 1.521739130434783e-06, "loss": 2.5906, "mean_token_accuracy": 0.6210092902183533, "step": 7 }, { "epoch": 0.03508771929824561, "grad_norm": 104.06405639648438, "learning_rate": 1.7391304347826088e-06, "loss": 2.1757, "mean_token_accuracy": 0.7049999833106995, "step": 8 }, { "epoch": 0.039473684210526314, "grad_norm": 88.53056335449219, "learning_rate": 1.956521739130435e-06, "loss": 2.2199, "mean_token_accuracy": 0.6958661675453186, "step": 9 }, { "epoch": 0.043859649122807015, "grad_norm": 61.665733337402344, "learning_rate": 2.173913043478261e-06, "loss": 1.6644, "mean_token_accuracy": 0.7212614417076111, "step": 10 }, { "epoch": 0.04824561403508772, "grad_norm": 45.59661865234375, "learning_rate": 2.391304347826087e-06, "loss": 1.3645, "mean_token_accuracy": 0.7565470337867737, "step": 11 }, { "epoch": 0.05263157894736842, "grad_norm": 42.23125076293945, "learning_rate": 2.6086956521739132e-06, "loss": 1.3545, "mean_token_accuracy": 0.7615955471992493, "step": 12 }, { "epoch": 0.05701754385964912, "grad_norm": 41.3696403503418, "learning_rate": 2.8260869565217393e-06, "loss": 1.2655, "mean_token_accuracy": 0.7810526490211487, "step": 13 }, { "epoch": 0.06140350877192982, "grad_norm": 20.49009895324707, "learning_rate": 3.043478260869566e-06, "loss": 0.8974, "mean_token_accuracy": 0.8418847918510437, "step": 14 }, { "epoch": 0.06578947368421052, "grad_norm": 16.676067352294922, "learning_rate": 3.2608695652173914e-06, "loss": 0.7967, "mean_token_accuracy": 0.8536585569381714, "step": 15 }, { "epoch": 0.07017543859649122, "grad_norm": 14.577007293701172, "learning_rate": 3.4782608695652175e-06, "loss": 0.7143, "mean_token_accuracy": 0.8759920597076416, "step": 16 }, { "epoch": 0.07456140350877193, "grad_norm": 12.738288879394531, "learning_rate": 3.6956521739130436e-06, "loss": 0.628, "mean_token_accuracy": 0.8964803218841553, "step": 17 }, { "epoch": 0.07894736842105263, "grad_norm": 12.496009826660156, "learning_rate": 3.91304347826087e-06, "loss": 0.6106, "mean_token_accuracy": 0.902184247970581, "step": 18 }, { "epoch": 0.08333333333333333, "grad_norm": 12.505276679992676, "learning_rate": 4.130434782608696e-06, "loss": 0.5771, "mean_token_accuracy": 0.9018287062644958, "step": 19 }, { "epoch": 0.08771929824561403, "grad_norm": 14.074338912963867, "learning_rate": 4.347826086956522e-06, "loss": 0.5896, "mean_token_accuracy": 0.9002057909965515, "step": 20 }, { "epoch": 0.09210526315789473, "grad_norm": 11.973801612854004, "learning_rate": 4.565217391304348e-06, "loss": 0.5058, "mean_token_accuracy": 0.9128630757331848, "step": 21 }, { "epoch": 0.09649122807017543, "grad_norm": 12.036285400390625, "learning_rate": 4.782608695652174e-06, "loss": 0.445, "mean_token_accuracy": 0.9133782386779785, "step": 22 }, { "epoch": 0.10087719298245613, "grad_norm": 9.241310119628906, "learning_rate": 5e-06, "loss": 0.4018, "mean_token_accuracy": 0.9313346147537231, "step": 23 }, { "epoch": 0.10526315789473684, "grad_norm": 11.00639533996582, "learning_rate": 5.2173913043478265e-06, "loss": 0.4168, "mean_token_accuracy": 0.9190573692321777, "step": 24 }, { "epoch": 0.10964912280701754, "grad_norm": 10.37211799621582, "learning_rate": 5.4347826086956525e-06, "loss": 0.4274, "mean_token_accuracy": 0.9305699467658997, "step": 25 }, { "epoch": 0.11403508771929824, "grad_norm": 9.478952407836914, "learning_rate": 5.652173913043479e-06, "loss": 0.3642, "mean_token_accuracy": 0.9342916011810303, "step": 26 }, { "epoch": 0.11842105263157894, "grad_norm": 10.124141693115234, "learning_rate": 5.8695652173913055e-06, "loss": 0.4091, "mean_token_accuracy": 0.9287148714065552, "step": 27 }, { "epoch": 0.12280701754385964, "grad_norm": 9.518803596496582, "learning_rate": 6.086956521739132e-06, "loss": 0.3748, "mean_token_accuracy": 0.9275220632553101, "step": 28 }, { "epoch": 0.12719298245614036, "grad_norm": 9.434673309326172, "learning_rate": 6.304347826086958e-06, "loss": 0.3611, "mean_token_accuracy": 0.9374359250068665, "step": 29 }, { "epoch": 0.13157894736842105, "grad_norm": 11.588494300842285, "learning_rate": 6.521739130434783e-06, "loss": 0.426, "mean_token_accuracy": 0.9164086580276489, "step": 30 }, { "epoch": 0.13596491228070176, "grad_norm": 9.927175521850586, "learning_rate": 6.739130434782609e-06, "loss": 0.3641, "mean_token_accuracy": 0.9309309124946594, "step": 31 }, { "epoch": 0.14035087719298245, "grad_norm": 9.435439109802246, "learning_rate": 6.956521739130435e-06, "loss": 0.3461, "mean_token_accuracy": 0.9301868081092834, "step": 32 }, { "epoch": 0.14473684210526316, "grad_norm": 9.709479331970215, "learning_rate": 7.173913043478261e-06, "loss": 0.3551, "mean_token_accuracy": 0.9335317611694336, "step": 33 }, { "epoch": 0.14912280701754385, "grad_norm": 9.065229415893555, "learning_rate": 7.391304347826087e-06, "loss": 0.3177, "mean_token_accuracy": 0.9381918907165527, "step": 34 }, { "epoch": 0.15350877192982457, "grad_norm": 9.64979076385498, "learning_rate": 7.608695652173914e-06, "loss": 0.3644, "mean_token_accuracy": 0.9416754841804504, "step": 35 }, { "epoch": 0.15789473684210525, "grad_norm": 9.246652603149414, "learning_rate": 7.82608695652174e-06, "loss": 0.3028, "mean_token_accuracy": 0.9778012633323669, "step": 36 }, { "epoch": 0.16228070175438597, "grad_norm": 8.778101921081543, "learning_rate": 8.043478260869566e-06, "loss": 0.2831, "mean_token_accuracy": 0.9801587462425232, "step": 37 }, { "epoch": 0.16666666666666666, "grad_norm": 9.583342552185059, "learning_rate": 8.260869565217392e-06, "loss": 0.337, "mean_token_accuracy": 0.9642857313156128, "step": 38 }, { "epoch": 0.17105263157894737, "grad_norm": 9.742164611816406, "learning_rate": 8.478260869565218e-06, "loss": 0.292, "mean_token_accuracy": 0.9780701994895935, "step": 39 }, { "epoch": 0.17543859649122806, "grad_norm": 9.522716522216797, "learning_rate": 8.695652173913044e-06, "loss": 0.2891, "mean_token_accuracy": 0.9725118279457092, "step": 40 }, { "epoch": 0.17982456140350878, "grad_norm": 8.957447052001953, "learning_rate": 8.91304347826087e-06, "loss": 0.271, "mean_token_accuracy": 0.9785056114196777, "step": 41 }, { "epoch": 0.18421052631578946, "grad_norm": 8.779328346252441, "learning_rate": 9.130434782608697e-06, "loss": 0.2596, "mean_token_accuracy": 0.9759036302566528, "step": 42 }, { "epoch": 0.18859649122807018, "grad_norm": 8.714072227478027, "learning_rate": 9.347826086956523e-06, "loss": 0.2528, "mean_token_accuracy": 0.9750249981880188, "step": 43 }, { "epoch": 0.19298245614035087, "grad_norm": 9.57017707824707, "learning_rate": 9.565217391304349e-06, "loss": 0.2737, "mean_token_accuracy": 0.9671794772148132, "step": 44 }, { "epoch": 0.19736842105263158, "grad_norm": 8.689032554626465, "learning_rate": 9.782608695652175e-06, "loss": 0.2213, "mean_token_accuracy": 0.9827935099601746, "step": 45 }, { "epoch": 0.20175438596491227, "grad_norm": 8.658659934997559, "learning_rate": 1e-05, "loss": 0.2289, "mean_token_accuracy": 0.977505087852478, "step": 46 }, { "epoch": 0.20614035087719298, "grad_norm": 8.370903015136719, "learning_rate": 9.999867897077623e-06, "loss": 0.2087, "mean_token_accuracy": 0.9791460037231445, "step": 47 }, { "epoch": 0.21052631578947367, "grad_norm": 8.627614974975586, "learning_rate": 9.999471596066567e-06, "loss": 0.2199, "mean_token_accuracy": 0.9729458689689636, "step": 48 }, { "epoch": 0.2149122807017544, "grad_norm": 8.98874568939209, "learning_rate": 9.998811120234624e-06, "loss": 0.2204, "mean_token_accuracy": 0.9805327653884888, "step": 49 }, { "epoch": 0.21929824561403508, "grad_norm": 8.562795639038086, "learning_rate": 9.99788650835992e-06, "loss": 0.194, "mean_token_accuracy": 0.9865702390670776, "step": 50 }, { "epoch": 0.2236842105263158, "grad_norm": 8.354143142700195, "learning_rate": 9.996697814728646e-06, "loss": 0.2058, "mean_token_accuracy": 0.9749276638031006, "step": 51 }, { "epoch": 0.22807017543859648, "grad_norm": 8.861373901367188, "learning_rate": 9.99524510913187e-06, "loss": 0.1785, "mean_token_accuracy": 0.9775280952453613, "step": 52 }, { "epoch": 0.2324561403508772, "grad_norm": 8.467371940612793, "learning_rate": 9.99352847686144e-06, "loss": 0.1753, "mean_token_accuracy": 0.9784172773361206, "step": 53 }, { "epoch": 0.23684210526315788, "grad_norm": 9.050318717956543, "learning_rate": 9.991548018704971e-06, "loss": 0.181, "mean_token_accuracy": 0.9764453768730164, "step": 54 }, { "epoch": 0.2412280701754386, "grad_norm": 8.318038940429688, "learning_rate": 9.989303850939937e-06, "loss": 0.1424, "mean_token_accuracy": 0.9843096137046814, "step": 55 }, { "epoch": 0.24561403508771928, "grad_norm": 8.480911254882812, "learning_rate": 9.986796105326832e-06, "loss": 0.1465, "mean_token_accuracy": 0.9778246879577637, "step": 56 }, { "epoch": 0.25, "grad_norm": 7.978242874145508, "learning_rate": 9.98402492910145e-06, "loss": 0.1582, "mean_token_accuracy": 0.971230149269104, "step": 57 }, { "epoch": 0.2543859649122807, "grad_norm": 7.869690895080566, "learning_rate": 9.98099048496622e-06, "loss": 0.1625, "mean_token_accuracy": 0.9744094610214233, "step": 58 }, { "epoch": 0.25877192982456143, "grad_norm": 7.296746253967285, "learning_rate": 9.977692951080673e-06, "loss": 0.0961, "mean_token_accuracy": 0.9874759316444397, "step": 59 }, { "epoch": 0.2631578947368421, "grad_norm": 6.89448881149292, "learning_rate": 9.97413252105097e-06, "loss": 0.1193, "mean_token_accuracy": 0.9717624187469482, "step": 60 }, { "epoch": 0.2675438596491228, "grad_norm": 6.936161994934082, "learning_rate": 9.970309403918538e-06, "loss": 0.1294, "mean_token_accuracy": 0.9783037304878235, "step": 61 }, { "epoch": 0.2719298245614035, "grad_norm": 6.3501715660095215, "learning_rate": 9.966223824147798e-06, "loss": 0.1022, "mean_token_accuracy": 0.9817629456520081, "step": 62 }, { "epoch": 0.27631578947368424, "grad_norm": 6.021699905395508, "learning_rate": 9.961876021612984e-06, "loss": 0.0861, "mean_token_accuracy": 0.9848331809043884, "step": 63 }, { "epoch": 0.2807017543859649, "grad_norm": 5.777260780334473, "learning_rate": 9.957266251584061e-06, "loss": 0.0754, "mean_token_accuracy": 0.9884817004203796, "step": 64 }, { "epoch": 0.2850877192982456, "grad_norm": 5.9405131340026855, "learning_rate": 9.952394784711736e-06, "loss": 0.1364, "mean_token_accuracy": 0.9699481725692749, "step": 65 }, { "epoch": 0.2894736842105263, "grad_norm": 5.398240566253662, "learning_rate": 9.94726190701157e-06, "loss": 0.1021, "mean_token_accuracy": 0.9772727489471436, "step": 66 }, { "epoch": 0.29385964912280704, "grad_norm": 4.3773932456970215, "learning_rate": 9.94186791984718e-06, "loss": 0.0721, "mean_token_accuracy": 0.9819819927215576, "step": 67 }, { "epoch": 0.2982456140350877, "grad_norm": 7.893673896789551, "learning_rate": 9.936213139912555e-06, "loss": 0.0739, "mean_token_accuracy": 0.9837892651557922, "step": 68 }, { "epoch": 0.3026315789473684, "grad_norm": 3.6643731594085693, "learning_rate": 9.930297899213454e-06, "loss": 0.0751, "mean_token_accuracy": 0.9825462102890015, "step": 69 }, { "epoch": 0.30701754385964913, "grad_norm": 3.828979253768921, "learning_rate": 9.924122545047908e-06, "loss": 0.0995, "mean_token_accuracy": 0.9770458936691284, "step": 70 }, { "epoch": 0.31140350877192985, "grad_norm": 3.4599835872650146, "learning_rate": 9.917687439985848e-06, "loss": 0.0729, "mean_token_accuracy": 0.981500506401062, "step": 71 }, { "epoch": 0.3157894736842105, "grad_norm": 3.9461429119110107, "learning_rate": 9.910992961847798e-06, "loss": 0.0799, "mean_token_accuracy": 0.9785177111625671, "step": 72 }, { "epoch": 0.3201754385964912, "grad_norm": 2.959583044052124, "learning_rate": 9.904039503682701e-06, "loss": 0.0668, "mean_token_accuracy": 0.9801192879676819, "step": 73 }, { "epoch": 0.32456140350877194, "grad_norm": 3.5721518993377686, "learning_rate": 9.896827473744848e-06, "loss": 0.0691, "mean_token_accuracy": 0.9830328822135925, "step": 74 }, { "epoch": 0.32894736842105265, "grad_norm": 4.679792404174805, "learning_rate": 9.889357295469893e-06, "loss": 0.0941, "mean_token_accuracy": 0.9726997017860413, "step": 75 }, { "epoch": 0.3333333333333333, "grad_norm": 3.811170816421509, "learning_rate": 9.881629407450007e-06, "loss": 0.1065, "mean_token_accuracy": 0.9740259647369385, "step": 76 }, { "epoch": 0.33771929824561403, "grad_norm": 2.699470281600952, "learning_rate": 9.873644263408119e-06, "loss": 0.0468, "mean_token_accuracy": 0.9821228981018066, "step": 77 }, { "epoch": 0.34210526315789475, "grad_norm": 2.773603677749634, "learning_rate": 9.86540233217128e-06, "loss": 0.0804, "mean_token_accuracy": 0.9749518036842346, "step": 78 }, { "epoch": 0.34649122807017546, "grad_norm": 2.799196243286133, "learning_rate": 9.856904097643136e-06, "loss": 0.0758, "mean_token_accuracy": 0.9768844246864319, "step": 79 }, { "epoch": 0.3508771929824561, "grad_norm": 3.8493893146514893, "learning_rate": 9.848150058775514e-06, "loss": 0.0876, "mean_token_accuracy": 0.9722222089767456, "step": 80 }, { "epoch": 0.35526315789473684, "grad_norm": 2.733839273452759, "learning_rate": 9.839140729539135e-06, "loss": 0.0707, "mean_token_accuracy": 0.9825102686882019, "step": 81 }, { "epoch": 0.35964912280701755, "grad_norm": 2.1981351375579834, "learning_rate": 9.829876638893432e-06, "loss": 0.041, "mean_token_accuracy": 0.9840085506439209, "step": 82 }, { "epoch": 0.36403508771929827, "grad_norm": 2.9656527042388916, "learning_rate": 9.820358330755487e-06, "loss": 0.0606, "mean_token_accuracy": 0.97921222448349, "step": 83 }, { "epoch": 0.3684210526315789, "grad_norm": 4.108867645263672, "learning_rate": 9.810586363968115e-06, "loss": 0.084, "mean_token_accuracy": 0.9738767147064209, "step": 84 }, { "epoch": 0.37280701754385964, "grad_norm": 3.470015048980713, "learning_rate": 9.800561312267033e-06, "loss": 0.0822, "mean_token_accuracy": 0.9793713092803955, "step": 85 }, { "epoch": 0.37719298245614036, "grad_norm": 3.3155014514923096, "learning_rate": 9.790283764247188e-06, "loss": 0.0689, "mean_token_accuracy": 0.9789999723434448, "step": 86 }, { "epoch": 0.3815789473684211, "grad_norm": 3.8658089637756348, "learning_rate": 9.779754323328192e-06, "loss": 0.0872, "mean_token_accuracy": 0.9719334840774536, "step": 87 }, { "epoch": 0.38596491228070173, "grad_norm": 3.226743459701538, "learning_rate": 9.768973607718896e-06, "loss": 0.095, "mean_token_accuracy": 0.9710578918457031, "step": 88 }, { "epoch": 0.39035087719298245, "grad_norm": 3.8896260261535645, "learning_rate": 9.757942250381094e-06, "loss": 0.0723, "mean_token_accuracy": 0.977412760257721, "step": 89 }, { "epoch": 0.39473684210526316, "grad_norm": 3.5911498069763184, "learning_rate": 9.746660898992362e-06, "loss": 0.0869, "mean_token_accuracy": 0.9721115827560425, "step": 90 }, { "epoch": 0.3991228070175439, "grad_norm": 2.7276499271392822, "learning_rate": 9.735130215908027e-06, "loss": 0.0732, "mean_token_accuracy": 0.9744848012924194, "step": 91 }, { "epoch": 0.40350877192982454, "grad_norm": 3.3096349239349365, "learning_rate": 9.723350878122283e-06, "loss": 0.0701, "mean_token_accuracy": 0.9788944721221924, "step": 92 }, { "epoch": 0.40789473684210525, "grad_norm": 2.4994590282440186, "learning_rate": 9.711323577228433e-06, "loss": 0.0917, "mean_token_accuracy": 0.9757575988769531, "step": 93 }, { "epoch": 0.41228070175438597, "grad_norm": 3.2870211601257324, "learning_rate": 9.699049019378303e-06, "loss": 0.0891, "mean_token_accuracy": 0.9718719720840454, "step": 94 }, { "epoch": 0.4166666666666667, "grad_norm": 2.1593267917633057, "learning_rate": 9.686527925240763e-06, "loss": 0.0488, "mean_token_accuracy": 0.9822263717651367, "step": 95 }, { "epoch": 0.42105263157894735, "grad_norm": 2.629718542098999, "learning_rate": 9.673761029959427e-06, "loss": 0.0504, "mean_token_accuracy": 0.9824742078781128, "step": 96 }, { "epoch": 0.42543859649122806, "grad_norm": 1.9966559410095215, "learning_rate": 9.660749083109483e-06, "loss": 0.0637, "mean_token_accuracy": 0.976313054561615, "step": 97 }, { "epoch": 0.4298245614035088, "grad_norm": 2.178070545196533, "learning_rate": 9.647492848653689e-06, "loss": 0.0437, "mean_token_accuracy": 0.9812304377555847, "step": 98 }, { "epoch": 0.4342105263157895, "grad_norm": 2.226331949234009, "learning_rate": 9.633993104897516e-06, "loss": 0.0525, "mean_token_accuracy": 0.9882352948188782, "step": 99 }, { "epoch": 0.43859649122807015, "grad_norm": 3.1283533573150635, "learning_rate": 9.620250644443454e-06, "loss": 0.0688, "mean_token_accuracy": 0.9809809923171997, "step": 100 }, { "epoch": 0.44298245614035087, "grad_norm": 2.914991855621338, "learning_rate": 9.606266274144475e-06, "loss": 0.067, "mean_token_accuracy": 0.9824561476707458, "step": 101 }, { "epoch": 0.4473684210526316, "grad_norm": 2.58962082862854, "learning_rate": 9.592040815056662e-06, "loss": 0.0503, "mean_token_accuracy": 0.9818181991577148, "step": 102 }, { "epoch": 0.4517543859649123, "grad_norm": 2.361135959625244, "learning_rate": 9.577575102390999e-06, "loss": 0.0605, "mean_token_accuracy": 0.9817444086074829, "step": 103 }, { "epoch": 0.45614035087719296, "grad_norm": 2.864607334136963, "learning_rate": 9.562869985464341e-06, "loss": 0.0812, "mean_token_accuracy": 0.9810924530029297, "step": 104 }, { "epoch": 0.4605263157894737, "grad_norm": 3.783154249191284, "learning_rate": 9.547926327649535e-06, "loss": 0.1249, "mean_token_accuracy": 0.9709419012069702, "step": 105 }, { "epoch": 0.4649122807017544, "grad_norm": 2.7710580825805664, "learning_rate": 9.53274500632475e-06, "loss": 0.0613, "mean_token_accuracy": 0.98591548204422, "step": 106 }, { "epoch": 0.4692982456140351, "grad_norm": 2.278409004211426, "learning_rate": 9.517326912821948e-06, "loss": 0.0571, "mean_token_accuracy": 0.9854369163513184, "step": 107 }, { "epoch": 0.47368421052631576, "grad_norm": 2.6353845596313477, "learning_rate": 9.501672952374551e-06, "loss": 0.0494, "mean_token_accuracy": 0.9879253506660461, "step": 108 }, { "epoch": 0.4780701754385965, "grad_norm": 2.9735920429229736, "learning_rate": 9.485784044064305e-06, "loss": 0.0702, "mean_token_accuracy": 0.98103266954422, "step": 109 }, { "epoch": 0.4824561403508772, "grad_norm": 2.455169916152954, "learning_rate": 9.469661120767308e-06, "loss": 0.08, "mean_token_accuracy": 0.9808428883552551, "step": 110 }, { "epoch": 0.4868421052631579, "grad_norm": 2.6597843170166016, "learning_rate": 9.453305129099241e-06, "loss": 0.0754, "mean_token_accuracy": 0.9796791672706604, "step": 111 }, { "epoch": 0.49122807017543857, "grad_norm": 2.5103378295898438, "learning_rate": 9.436717029359794e-06, "loss": 0.0408, "mean_token_accuracy": 0.9863013625144958, "step": 112 }, { "epoch": 0.4956140350877193, "grad_norm": 2.8557910919189453, "learning_rate": 9.419897795476276e-06, "loss": 0.0682, "mean_token_accuracy": 0.9836065769195557, "step": 113 }, { "epoch": 0.5, "grad_norm": 2.57025146484375, "learning_rate": 9.402848414946445e-06, "loss": 0.0571, "mean_token_accuracy": 0.9806763529777527, "step": 114 }, { "epoch": 0.5043859649122807, "grad_norm": 2.4798214435577393, "learning_rate": 9.385569888780517e-06, "loss": 0.0725, "mean_token_accuracy": 0.9782823324203491, "step": 115 }, { "epoch": 0.5087719298245614, "grad_norm": 3.3407113552093506, "learning_rate": 9.368063231442406e-06, "loss": 0.0828, "mean_token_accuracy": 0.9743863344192505, "step": 116 }, { "epoch": 0.5131578947368421, "grad_norm": 2.729813814163208, "learning_rate": 9.350329470790153e-06, "loss": 0.0752, "mean_token_accuracy": 0.98037189245224, "step": 117 }, { "epoch": 0.5175438596491229, "grad_norm": 2.1768548488616943, "learning_rate": 9.332369648015583e-06, "loss": 0.0566, "mean_token_accuracy": 0.9804727435112, "step": 118 }, { "epoch": 0.5219298245614035, "grad_norm": 2.738077163696289, "learning_rate": 9.314184817583176e-06, "loss": 0.0873, "mean_token_accuracy": 0.9718456864356995, "step": 119 }, { "epoch": 0.5263157894736842, "grad_norm": 2.769399404525757, "learning_rate": 9.295776047168149e-06, "loss": 0.051, "mean_token_accuracy": 0.984329104423523, "step": 120 }, { "epoch": 0.5307017543859649, "grad_norm": 2.4384264945983887, "learning_rate": 9.277144417593777e-06, "loss": 0.056, "mean_token_accuracy": 0.985029935836792, "step": 121 }, { "epoch": 0.5350877192982456, "grad_norm": 1.7621722221374512, "learning_rate": 9.258291022767932e-06, "loss": 0.0445, "mean_token_accuracy": 0.9854318499565125, "step": 122 }, { "epoch": 0.5394736842105263, "grad_norm": 1.8469805717468262, "learning_rate": 9.239216969618862e-06, "loss": 0.0338, "mean_token_accuracy": 0.9893719553947449, "step": 123 }, { "epoch": 0.543859649122807, "grad_norm": 1.7815845012664795, "learning_rate": 9.219923378030197e-06, "loss": 0.0531, "mean_token_accuracy": 0.9894737005233765, "step": 124 }, { "epoch": 0.5482456140350878, "grad_norm": 1.8177460432052612, "learning_rate": 9.200411380775192e-06, "loss": 0.0463, "mean_token_accuracy": 0.9866529703140259, "step": 125 }, { "epoch": 0.5526315789473685, "grad_norm": 1.940299153327942, "learning_rate": 9.180682123450232e-06, "loss": 0.0294, "mean_token_accuracy": 0.9908722043037415, "step": 126 }, { "epoch": 0.5570175438596491, "grad_norm": 2.27999210357666, "learning_rate": 9.160736764407555e-06, "loss": 0.0511, "mean_token_accuracy": 0.9812623262405396, "step": 127 }, { "epoch": 0.5614035087719298, "grad_norm": 1.8023228645324707, "learning_rate": 9.140576474687263e-06, "loss": 0.0373, "mean_token_accuracy": 0.9895506501197815, "step": 128 }, { "epoch": 0.5657894736842105, "grad_norm": 2.3387715816497803, "learning_rate": 9.120202437948551e-06, "loss": 0.0406, "mean_token_accuracy": 0.9860514998435974, "step": 129 }, { "epoch": 0.5701754385964912, "grad_norm": 1.9008742570877075, "learning_rate": 9.099615850400214e-06, "loss": 0.0484, "mean_token_accuracy": 0.9894737005233765, "step": 130 }, { "epoch": 0.5745614035087719, "grad_norm": 2.219818353652954, "learning_rate": 9.078817920730421e-06, "loss": 0.0492, "mean_token_accuracy": 0.9812304377555847, "step": 131 }, { "epoch": 0.5789473684210527, "grad_norm": 2.4068856239318848, "learning_rate": 9.057809870035743e-06, "loss": 0.0622, "mean_token_accuracy": 0.9838362336158752, "step": 132 }, { "epoch": 0.5833333333333334, "grad_norm": 3.42411470413208, "learning_rate": 9.036592931749463e-06, "loss": 0.0811, "mean_token_accuracy": 0.9856262803077698, "step": 133 }, { "epoch": 0.5877192982456141, "grad_norm": 2.5055322647094727, "learning_rate": 9.015168351569165e-06, "loss": 0.0614, "mean_token_accuracy": 0.9802761077880859, "step": 134 }, { "epoch": 0.5921052631578947, "grad_norm": 2.5999653339385986, "learning_rate": 8.993537387383579e-06, "loss": 0.0571, "mean_token_accuracy": 0.9836065769195557, "step": 135 }, { "epoch": 0.5964912280701754, "grad_norm": 1.6160295009613037, "learning_rate": 8.971701309198744e-06, "loss": 0.0345, "mean_token_accuracy": 0.9874739050865173, "step": 136 }, { "epoch": 0.6008771929824561, "grad_norm": 1.483092188835144, "learning_rate": 8.949661399063432e-06, "loss": 0.026, "mean_token_accuracy": 0.9906445145606995, "step": 137 }, { "epoch": 0.6052631578947368, "grad_norm": 2.4867641925811768, "learning_rate": 8.927418950993885e-06, "loss": 0.0674, "mean_token_accuracy": 0.9795719981193542, "step": 138 }, { "epoch": 0.6096491228070176, "grad_norm": 2.1540322303771973, "learning_rate": 8.90497527089783e-06, "loss": 0.0647, "mean_token_accuracy": 0.980079710483551, "step": 139 }, { "epoch": 0.6140350877192983, "grad_norm": 2.6197245121002197, "learning_rate": 8.882331676497813e-06, "loss": 0.0803, "mean_token_accuracy": 0.9824561476707458, "step": 140 }, { "epoch": 0.618421052631579, "grad_norm": 3.1673412322998047, "learning_rate": 8.859489497253833e-06, "loss": 0.0944, "mean_token_accuracy": 0.9688473343849182, "step": 141 }, { "epoch": 0.6228070175438597, "grad_norm": 3.3862664699554443, "learning_rate": 8.83645007428528e-06, "loss": 0.0543, "mean_token_accuracy": 0.9792576432228088, "step": 142 }, { "epoch": 0.6271929824561403, "grad_norm": 2.722994327545166, "learning_rate": 8.813214760292202e-06, "loss": 0.0618, "mean_token_accuracy": 0.9790794849395752, "step": 143 }, { "epoch": 0.631578947368421, "grad_norm": 2.353830337524414, "learning_rate": 8.789784919475878e-06, "loss": 0.0628, "mean_token_accuracy": 0.9831013679504395, "step": 144 }, { "epoch": 0.6359649122807017, "grad_norm": 2.3788843154907227, "learning_rate": 8.766161927458726e-06, "loss": 0.0523, "mean_token_accuracy": 0.9803729057312012, "step": 145 }, { "epoch": 0.6403508771929824, "grad_norm": 2.782294750213623, "learning_rate": 8.742347171203542e-06, "loss": 0.0635, "mean_token_accuracy": 0.9749739170074463, "step": 146 }, { "epoch": 0.6447368421052632, "grad_norm": 1.9423679113388062, "learning_rate": 8.718342048932054e-06, "loss": 0.0422, "mean_token_accuracy": 0.9858012199401855, "step": 147 }, { "epoch": 0.6491228070175439, "grad_norm": 2.4177629947662354, "learning_rate": 8.694147970042842e-06, "loss": 0.0492, "mean_token_accuracy": 0.9847250580787659, "step": 148 }, { "epoch": 0.6535087719298246, "grad_norm": 3.1622345447540283, "learning_rate": 8.669766355028584e-06, "loss": 0.07, "mean_token_accuracy": 0.9742063283920288, "step": 149 }, { "epoch": 0.6578947368421053, "grad_norm": 2.079878330230713, "learning_rate": 8.645198635392659e-06, "loss": 0.0496, "mean_token_accuracy": 0.9880715608596802, "step": 150 }, { "epoch": 0.6622807017543859, "grad_norm": 1.5153844356536865, "learning_rate": 8.620446253565088e-06, "loss": 0.0359, "mean_token_accuracy": 0.9877049326896667, "step": 151 }, { "epoch": 0.6666666666666666, "grad_norm": 2.652635335922241, "learning_rate": 8.595510662817865e-06, "loss": 0.0743, "mean_token_accuracy": 0.9824368953704834, "step": 152 }, { "epoch": 0.6710526315789473, "grad_norm": 1.8495056629180908, "learning_rate": 8.570393327179614e-06, "loss": 0.0389, "mean_token_accuracy": 0.9873816967010498, "step": 153 }, { "epoch": 0.6754385964912281, "grad_norm": 2.0945048332214355, "learning_rate": 8.545095721349641e-06, "loss": 0.0464, "mean_token_accuracy": 0.9847763776779175, "step": 154 }, { "epoch": 0.6798245614035088, "grad_norm": 1.8139218091964722, "learning_rate": 8.519619330611353e-06, "loss": 0.055, "mean_token_accuracy": 0.9879153966903687, "step": 155 }, { "epoch": 0.6842105263157895, "grad_norm": 1.7457425594329834, "learning_rate": 8.493965650745043e-06, "loss": 0.0367, "mean_token_accuracy": 0.9838547110557556, "step": 156 }, { "epoch": 0.6885964912280702, "grad_norm": 2.1221344470977783, "learning_rate": 8.468136187940087e-06, "loss": 0.0479, "mean_token_accuracy": 0.9841938614845276, "step": 157 }, { "epoch": 0.6929824561403509, "grad_norm": 1.792470932006836, "learning_rate": 8.442132458706484e-06, "loss": 0.0418, "mean_token_accuracy": 0.9871794581413269, "step": 158 }, { "epoch": 0.6973684210526315, "grad_norm": 2.0108189582824707, "learning_rate": 8.415955989785852e-06, "loss": 0.0651, "mean_token_accuracy": 0.9837563633918762, "step": 159 }, { "epoch": 0.7017543859649122, "grad_norm": 3.3567967414855957, "learning_rate": 8.389608318061761e-06, "loss": 0.0594, "mean_token_accuracy": 0.9821615815162659, "step": 160 }, { "epoch": 0.706140350877193, "grad_norm": 1.998265027999878, "learning_rate": 8.36309099046952e-06, "loss": 0.0469, "mean_token_accuracy": 0.9834024906158447, "step": 161 }, { "epoch": 0.7105263157894737, "grad_norm": 1.778838038444519, "learning_rate": 8.336405563905333e-06, "loss": 0.0467, "mean_token_accuracy": 0.9862475395202637, "step": 162 }, { "epoch": 0.7149122807017544, "grad_norm": 2.1717565059661865, "learning_rate": 8.309553605134904e-06, "loss": 0.0599, "mean_token_accuracy": 0.9841112494468689, "step": 163 }, { "epoch": 0.7192982456140351, "grad_norm": 1.5761713981628418, "learning_rate": 8.282536690701446e-06, "loss": 0.0368, "mean_token_accuracy": 0.9864583611488342, "step": 164 }, { "epoch": 0.7236842105263158, "grad_norm": 2.007516384124756, "learning_rate": 8.25535640683311e-06, "loss": 0.0528, "mean_token_accuracy": 0.9837398529052734, "step": 165 }, { "epoch": 0.7280701754385965, "grad_norm": 2.010671615600586, "learning_rate": 8.228014349349872e-06, "loss": 0.0387, "mean_token_accuracy": 0.9889112710952759, "step": 166 }, { "epoch": 0.7324561403508771, "grad_norm": 2.147995710372925, "learning_rate": 8.200512123569817e-06, "loss": 0.0697, "mean_token_accuracy": 0.9813725352287292, "step": 167 }, { "epoch": 0.7368421052631579, "grad_norm": 1.8654723167419434, "learning_rate": 8.172851344214896e-06, "loss": 0.0355, "mean_token_accuracy": 0.9886947870254517, "step": 168 }, { "epoch": 0.7412280701754386, "grad_norm": 1.949415922164917, "learning_rate": 8.14503363531613e-06, "loss": 0.0425, "mean_token_accuracy": 0.9887295365333557, "step": 169 }, { "epoch": 0.7456140350877193, "grad_norm": 2.284515857696533, "learning_rate": 8.117060630118246e-06, "loss": 0.0708, "mean_token_accuracy": 0.981500506401062, "step": 170 }, { "epoch": 0.75, "grad_norm": 1.6988317966461182, "learning_rate": 8.088933970983793e-06, "loss": 0.0411, "mean_token_accuracy": 0.9900442361831665, "step": 171 }, { "epoch": 0.7543859649122807, "grad_norm": 2.115851640701294, "learning_rate": 8.060655309296712e-06, "loss": 0.0756, "mean_token_accuracy": 0.9807886481285095, "step": 172 }, { "epoch": 0.7587719298245614, "grad_norm": 1.499506950378418, "learning_rate": 8.032226305365383e-06, "loss": 0.0352, "mean_token_accuracy": 0.9842436909675598, "step": 173 }, { "epoch": 0.7631578947368421, "grad_norm": 1.6798099279403687, "learning_rate": 8.003648628325136e-06, "loss": 0.0349, "mean_token_accuracy": 0.988095223903656, "step": 174 }, { "epoch": 0.7675438596491229, "grad_norm": 2.1273345947265625, "learning_rate": 7.974923956040262e-06, "loss": 0.0552, "mean_token_accuracy": 0.9790836572647095, "step": 175 }, { "epoch": 0.7719298245614035, "grad_norm": 2.9694130420684814, "learning_rate": 7.946053975005495e-06, "loss": 0.0586, "mean_token_accuracy": 0.9815950989723206, "step": 176 }, { "epoch": 0.7763157894736842, "grad_norm": 1.9922927618026733, "learning_rate": 7.917040380247e-06, "loss": 0.0384, "mean_token_accuracy": 0.9907881021499634, "step": 177 }, { "epoch": 0.7807017543859649, "grad_norm": 1.1386786699295044, "learning_rate": 7.887884875222841e-06, "loss": 0.0269, "mean_token_accuracy": 0.9908906817436218, "step": 178 }, { "epoch": 0.7850877192982456, "grad_norm": 2.196960687637329, "learning_rate": 7.858589171722985e-06, "loss": 0.044, "mean_token_accuracy": 0.9834437370300293, "step": 179 }, { "epoch": 0.7894736842105263, "grad_norm": 1.7304738759994507, "learning_rate": 7.829154989768784e-06, "loss": 0.0344, "mean_token_accuracy": 0.987590491771698, "step": 180 }, { "epoch": 0.793859649122807, "grad_norm": 2.542288303375244, "learning_rate": 7.799584057511997e-06, "loss": 0.049, "mean_token_accuracy": 0.9837892651557922, "step": 181 }, { "epoch": 0.7982456140350878, "grad_norm": 1.137778639793396, "learning_rate": 7.76987811113332e-06, "loss": 0.0261, "mean_token_accuracy": 0.9931372404098511, "step": 182 }, { "epoch": 0.8026315789473685, "grad_norm": 1.5463237762451172, "learning_rate": 7.740038894740454e-06, "loss": 0.036, "mean_token_accuracy": 0.9913138151168823, "step": 183 }, { "epoch": 0.8070175438596491, "grad_norm": 2.4163358211517334, "learning_rate": 7.710068160265705e-06, "loss": 0.066, "mean_token_accuracy": 0.9861407279968262, "step": 184 }, { "epoch": 0.8114035087719298, "grad_norm": 1.2295856475830078, "learning_rate": 7.679967667363121e-06, "loss": 0.0263, "mean_token_accuracy": 0.9938461780548096, "step": 185 }, { "epoch": 0.8157894736842105, "grad_norm": 2.1350033283233643, "learning_rate": 7.649739183305184e-06, "loss": 0.0648, "mean_token_accuracy": 0.9821073412895203, "step": 186 }, { "epoch": 0.8201754385964912, "grad_norm": 1.8890544176101685, "learning_rate": 7.619384482879039e-06, "loss": 0.039, "mean_token_accuracy": 0.991769552230835, "step": 187 }, { "epoch": 0.8245614035087719, "grad_norm": 1.8316863775253296, "learning_rate": 7.5889053482823015e-06, "loss": 0.0364, "mean_token_accuracy": 0.9871931672096252, "step": 188 }, { "epoch": 0.8289473684210527, "grad_norm": 1.6888874769210815, "learning_rate": 7.558303569018417e-06, "loss": 0.0373, "mean_token_accuracy": 0.9896265268325806, "step": 189 }, { "epoch": 0.8333333333333334, "grad_norm": 2.052993059158325, "learning_rate": 7.527580941791595e-06, "loss": 0.0588, "mean_token_accuracy": 0.9858155846595764, "step": 190 }, { "epoch": 0.8377192982456141, "grad_norm": 1.7255383729934692, "learning_rate": 7.49673927040132e-06, "loss": 0.0362, "mean_token_accuracy": 0.9906250238418579, "step": 191 }, { "epoch": 0.8421052631578947, "grad_norm": 1.6616889238357544, "learning_rate": 7.465780365636445e-06, "loss": 0.0391, "mean_token_accuracy": 0.9904153347015381, "step": 192 }, { "epoch": 0.8464912280701754, "grad_norm": 1.6150552034378052, "learning_rate": 7.4347060451688805e-06, "loss": 0.0328, "mean_token_accuracy": 0.9899193644523621, "step": 193 }, { "epoch": 0.8508771929824561, "grad_norm": 1.7972464561462402, "learning_rate": 7.403518133446866e-06, "loss": 0.0631, "mean_token_accuracy": 0.9849849939346313, "step": 194 }, { "epoch": 0.8552631578947368, "grad_norm": 2.3736069202423096, "learning_rate": 7.37221846158786e-06, "loss": 0.0716, "mean_token_accuracy": 0.9841740727424622, "step": 195 }, { "epoch": 0.8596491228070176, "grad_norm": 1.8168158531188965, "learning_rate": 7.340808867271031e-06, "loss": 0.0572, "mean_token_accuracy": 0.9852786660194397, "step": 196 }, { "epoch": 0.8640350877192983, "grad_norm": 2.351414203643799, "learning_rate": 7.309291194629352e-06, "loss": 0.0585, "mean_token_accuracy": 0.9844617247581482, "step": 197 }, { "epoch": 0.868421052631579, "grad_norm": 1.9125292301177979, "learning_rate": 7.277667294141345e-06, "loss": 0.0449, "mean_token_accuracy": 0.9858443140983582, "step": 198 }, { "epoch": 0.8728070175438597, "grad_norm": 1.656516194343567, "learning_rate": 7.245939022522413e-06, "loss": 0.0377, "mean_token_accuracy": 0.9922394752502441, "step": 199 }, { "epoch": 0.8771929824561403, "grad_norm": 1.8018559217453003, "learning_rate": 7.214108242615852e-06, "loss": 0.0479, "mean_token_accuracy": 0.9854227304458618, "step": 200 }, { "epoch": 0.881578947368421, "grad_norm": 2.188966989517212, "learning_rate": 7.1821768232834595e-06, "loss": 0.0439, "mean_token_accuracy": 0.9846153855323792, "step": 201 }, { "epoch": 0.8859649122807017, "grad_norm": 2.883554697036743, "learning_rate": 7.150146639295816e-06, "loss": 0.0622, "mean_token_accuracy": 0.9843912720680237, "step": 202 }, { "epoch": 0.8903508771929824, "grad_norm": 2.633408546447754, "learning_rate": 7.118019571222216e-06, "loss": 0.0701, "mean_token_accuracy": 0.9815950989723206, "step": 203 }, { "epoch": 0.8947368421052632, "grad_norm": 2.7991299629211426, "learning_rate": 7.0857975053202485e-06, "loss": 0.0405, "mean_token_accuracy": 0.9858155846595764, "step": 204 }, { "epoch": 0.8991228070175439, "grad_norm": 1.856713056564331, "learning_rate": 7.053482333425057e-06, "loss": 0.0295, "mean_token_accuracy": 0.9893048405647278, "step": 205 }, { "epoch": 0.9035087719298246, "grad_norm": 2.318593740463257, "learning_rate": 7.021075952838262e-06, "loss": 0.0622, "mean_token_accuracy": 0.9828282594680786, "step": 206 }, { "epoch": 0.9078947368421053, "grad_norm": 1.599013090133667, "learning_rate": 6.988580266216566e-06, "loss": 0.0302, "mean_token_accuracy": 0.9893513917922974, "step": 207 }, { "epoch": 0.9122807017543859, "grad_norm": 3.1583027839660645, "learning_rate": 6.955997181460041e-06, "loss": 0.077, "mean_token_accuracy": 0.9788732528686523, "step": 208 }, { "epoch": 0.9166666666666666, "grad_norm": 2.7289540767669678, "learning_rate": 6.9233286116001194e-06, "loss": 0.0492, "mean_token_accuracy": 0.9878910183906555, "step": 209 }, { "epoch": 0.9210526315789473, "grad_norm": 3.2893893718719482, "learning_rate": 6.890576474687264e-06, "loss": 0.0353, "mean_token_accuracy": 0.9867549538612366, "step": 210 }, { "epoch": 0.9254385964912281, "grad_norm": 1.6859092712402344, "learning_rate": 6.857742693678367e-06, "loss": 0.0275, "mean_token_accuracy": 0.9899193644523621, "step": 211 }, { "epoch": 0.9298245614035088, "grad_norm": 4.221460342407227, "learning_rate": 6.824829196323836e-06, "loss": 0.0779, "mean_token_accuracy": 0.9783315062522888, "step": 212 }, { "epoch": 0.9342105263157895, "grad_norm": 2.7743587493896484, "learning_rate": 6.791837915054422e-06, "loss": 0.0534, "mean_token_accuracy": 0.9824561476707458, "step": 213 }, { "epoch": 0.9385964912280702, "grad_norm": 2.1949446201324463, "learning_rate": 6.7587707868677566e-06, "loss": 0.0397, "mean_token_accuracy": 0.9856997132301331, "step": 214 }, { "epoch": 0.9429824561403509, "grad_norm": 1.6603456735610962, "learning_rate": 6.725629753214624e-06, "loss": 0.0382, "mean_token_accuracy": 0.99170982837677, "step": 215 }, { "epoch": 0.9473684210526315, "grad_norm": 1.5715985298156738, "learning_rate": 6.692416759884978e-06, "loss": 0.0368, "mean_token_accuracy": 0.9817258715629578, "step": 216 }, { "epoch": 0.9517543859649122, "grad_norm": 1.9068866968154907, "learning_rate": 6.659133756893701e-06, "loss": 0.0418, "mean_token_accuracy": 0.9910537004470825, "step": 217 }, { "epoch": 0.956140350877193, "grad_norm": 1.595156192779541, "learning_rate": 6.6257826983661044e-06, "loss": 0.0243, "mean_token_accuracy": 0.99028080701828, "step": 218 }, { "epoch": 0.9605263157894737, "grad_norm": 2.257960557937622, "learning_rate": 6.592365542423213e-06, "loss": 0.0546, "mean_token_accuracy": 0.9854227304458618, "step": 219 }, { "epoch": 0.9649122807017544, "grad_norm": 1.503311276435852, "learning_rate": 6.558884251066784e-06, "loss": 0.0379, "mean_token_accuracy": 0.9878172874450684, "step": 220 }, { "epoch": 0.9692982456140351, "grad_norm": 1.5574872493743896, "learning_rate": 6.5253407900641195e-06, "loss": 0.0304, "mean_token_accuracy": 0.9918946027755737, "step": 221 }, { "epoch": 0.9736842105263158, "grad_norm": 1.161798357963562, "learning_rate": 6.4917371288326554e-06, "loss": 0.0259, "mean_token_accuracy": 0.9921962022781372, "step": 222 }, { "epoch": 0.9780701754385965, "grad_norm": 1.8784377574920654, "learning_rate": 6.458075240324324e-06, "loss": 0.0419, "mean_token_accuracy": 0.9917780160903931, "step": 223 }, { "epoch": 0.9824561403508771, "grad_norm": 1.6808606386184692, "learning_rate": 6.424357100909724e-06, "loss": 0.0448, "mean_token_accuracy": 0.988095223903656, "step": 224 }, { "epoch": 0.9868421052631579, "grad_norm": 1.553889513015747, "learning_rate": 6.390584690262079e-06, "loss": 0.0274, "mean_token_accuracy": 0.9935364723205566, "step": 225 }, { "epoch": 0.9912280701754386, "grad_norm": 1.9178175926208496, "learning_rate": 6.356759991241008e-06, "loss": 0.0525, "mean_token_accuracy": 0.9872298836708069, "step": 226 }, { "epoch": 0.9956140350877193, "grad_norm": 1.2284913063049316, "learning_rate": 6.3228849897761055e-06, "loss": 0.0245, "mean_token_accuracy": 0.989382266998291, "step": 227 }, { "epoch": 1.0, "grad_norm": 3.0383493900299072, "learning_rate": 6.288961674750346e-06, "loss": 0.0677, "mean_token_accuracy": 0.9824380278587341, "step": 228 }, { "epoch": 1.0043859649122806, "grad_norm": 1.9716832637786865, "learning_rate": 6.2549920378833055e-06, "loss": 0.0398, "mean_token_accuracy": 0.9869346618652344, "step": 229 }, { "epoch": 1.0087719298245614, "grad_norm": 1.5856348276138306, "learning_rate": 6.22097807361423e-06, "loss": 0.0206, "mean_token_accuracy": 0.9938837885856628, "step": 230 }, { "epoch": 1.013157894736842, "grad_norm": 1.0828473567962646, "learning_rate": 6.186921778984936e-06, "loss": 0.017, "mean_token_accuracy": 0.9967741966247559, "step": 231 }, { "epoch": 1.0175438596491229, "grad_norm": 1.3558406829833984, "learning_rate": 6.152825153522552e-06, "loss": 0.0331, "mean_token_accuracy": 0.9902912378311157, "step": 232 }, { "epoch": 1.0219298245614035, "grad_norm": 1.2174006700515747, "learning_rate": 6.118690199122133e-06, "loss": 0.0291, "mean_token_accuracy": 0.9949392676353455, "step": 233 }, { "epoch": 1.0263157894736843, "grad_norm": 1.1833233833312988, "learning_rate": 6.084518919929112e-06, "loss": 0.0186, "mean_token_accuracy": 0.9923830032348633, "step": 234 }, { "epoch": 1.030701754385965, "grad_norm": 1.1958109140396118, "learning_rate": 6.050313322221645e-06, "loss": 0.0238, "mean_token_accuracy": 0.9938016533851624, "step": 235 }, { "epoch": 1.0350877192982457, "grad_norm": 2.4705111980438232, "learning_rate": 6.016075414292804e-06, "loss": 0.0591, "mean_token_accuracy": 0.9824198484420776, "step": 236 }, { "epoch": 1.0394736842105263, "grad_norm": 0.6340071558952332, "learning_rate": 5.981807206332674e-06, "loss": 0.0089, "mean_token_accuracy": 0.9979423880577087, "step": 237 }, { "epoch": 1.043859649122807, "grad_norm": 1.5429046154022217, "learning_rate": 5.947510710310332e-06, "loss": 0.015, "mean_token_accuracy": 0.991919219493866, "step": 238 }, { "epoch": 1.0482456140350878, "grad_norm": 1.2308100461959839, "learning_rate": 5.9131879398557125e-06, "loss": 0.0154, "mean_token_accuracy": 0.9931707382202148, "step": 239 }, { "epoch": 1.0526315789473684, "grad_norm": 2.413168430328369, "learning_rate": 5.878840910141382e-06, "loss": 0.0452, "mean_token_accuracy": 0.989130437374115, "step": 240 }, { "epoch": 1.0570175438596492, "grad_norm": 1.1270220279693604, "learning_rate": 5.844471637764232e-06, "loss": 0.0154, "mean_token_accuracy": 0.9969262480735779, "step": 241 }, { "epoch": 1.0614035087719298, "grad_norm": 1.9341720342636108, "learning_rate": 5.810082140627069e-06, "loss": 0.0201, "mean_token_accuracy": 0.9940770268440247, "step": 242 }, { "epoch": 1.0657894736842106, "grad_norm": 0.9762948155403137, "learning_rate": 5.77567443782015e-06, "loss": 0.0115, "mean_token_accuracy": 0.9935622215270996, "step": 243 }, { "epoch": 1.0701754385964912, "grad_norm": 1.2798123359680176, "learning_rate": 5.7412505495026265e-06, "loss": 0.0204, "mean_token_accuracy": 0.9909182786941528, "step": 244 }, { "epoch": 1.0745614035087718, "grad_norm": 1.5217316150665283, "learning_rate": 5.70681249678394e-06, "loss": 0.0249, "mean_token_accuracy": 0.9922480583190918, "step": 245 }, { "epoch": 1.0789473684210527, "grad_norm": 1.015491008758545, "learning_rate": 5.67236230160516e-06, "loss": 0.0201, "mean_token_accuracy": 0.995854914188385, "step": 246 }, { "epoch": 1.0833333333333333, "grad_norm": 1.6943317651748657, "learning_rate": 5.63790198662027e-06, "loss": 0.0299, "mean_token_accuracy": 0.9893730282783508, "step": 247 }, { "epoch": 1.087719298245614, "grad_norm": 1.3557161092758179, "learning_rate": 5.6034335750774086e-06, "loss": 0.0187, "mean_token_accuracy": 0.9929292798042297, "step": 248 }, { "epoch": 1.0921052631578947, "grad_norm": 1.4643901586532593, "learning_rate": 5.568959090700085e-06, "loss": 0.0185, "mean_token_accuracy": 0.9926701784133911, "step": 249 }, { "epoch": 1.0964912280701755, "grad_norm": 1.6736350059509277, "learning_rate": 5.534480557568358e-06, "loss": 0.0438, "mean_token_accuracy": 0.9869608879089355, "step": 250 }, { "epoch": 1.1008771929824561, "grad_norm": 1.204957127571106, "learning_rate": 5.500000000000001e-06, "loss": 0.0214, "mean_token_accuracy": 0.9910891056060791, "step": 251 }, { "epoch": 1.1052631578947367, "grad_norm": 1.2613508701324463, "learning_rate": 5.465519442431644e-06, "loss": 0.0209, "mean_token_accuracy": 0.9940416812896729, "step": 252 }, { "epoch": 1.1096491228070176, "grad_norm": 1.2684029340744019, "learning_rate": 5.431040909299917e-06, "loss": 0.0141, "mean_token_accuracy": 0.9940357804298401, "step": 253 }, { "epoch": 1.1140350877192982, "grad_norm": 1.2532188892364502, "learning_rate": 5.3965664249225945e-06, "loss": 0.0256, "mean_token_accuracy": 0.9927404522895813, "step": 254 }, { "epoch": 1.118421052631579, "grad_norm": 1.1357946395874023, "learning_rate": 5.362098013379732e-06, "loss": 0.0198, "mean_token_accuracy": 0.9929078221321106, "step": 255 }, { "epoch": 1.1228070175438596, "grad_norm": 1.2526124715805054, "learning_rate": 5.327637698394842e-06, "loss": 0.0305, "mean_token_accuracy": 0.9918450713157654, "step": 256 }, { "epoch": 1.1271929824561404, "grad_norm": 1.2404338121414185, "learning_rate": 5.293187503216062e-06, "loss": 0.0221, "mean_token_accuracy": 0.9930555820465088, "step": 257 }, { "epoch": 1.131578947368421, "grad_norm": 1.745115041732788, "learning_rate": 5.258749450497376e-06, "loss": 0.0247, "mean_token_accuracy": 0.9915878176689148, "step": 258 }, { "epoch": 1.1359649122807018, "grad_norm": 1.7782152891159058, "learning_rate": 5.224325562179852e-06, "loss": 0.0403, "mean_token_accuracy": 0.9923737049102783, "step": 259 }, { "epoch": 1.1403508771929824, "grad_norm": 0.9483814835548401, "learning_rate": 5.189917859372933e-06, "loss": 0.0144, "mean_token_accuracy": 0.9959514141082764, "step": 260 }, { "epoch": 1.1447368421052633, "grad_norm": 0.9481673240661621, "learning_rate": 5.15552836223577e-06, "loss": 0.008, "mean_token_accuracy": 0.9958333373069763, "step": 261 }, { "epoch": 1.1491228070175439, "grad_norm": 1.904180884361267, "learning_rate": 5.121159089858619e-06, "loss": 0.0318, "mean_token_accuracy": 0.9882978796958923, "step": 262 }, { "epoch": 1.1535087719298245, "grad_norm": 1.5921107530593872, "learning_rate": 5.08681206014429e-06, "loss": 0.0271, "mean_token_accuracy": 0.9932692050933838, "step": 263 }, { "epoch": 1.1578947368421053, "grad_norm": 0.7929293513298035, "learning_rate": 5.0524892896896685e-06, "loss": 0.0077, "mean_token_accuracy": 0.9979695677757263, "step": 264 }, { "epoch": 1.162280701754386, "grad_norm": 0.8570319414138794, "learning_rate": 5.0181927936673265e-06, "loss": 0.0132, "mean_token_accuracy": 0.9949290156364441, "step": 265 }, { "epoch": 1.1666666666666667, "grad_norm": 1.7126463651657104, "learning_rate": 4.983924585707199e-06, "loss": 0.0254, "mean_token_accuracy": 0.9899899959564209, "step": 266 }, { "epoch": 1.1710526315789473, "grad_norm": 1.0722357034683228, "learning_rate": 4.949686677778357e-06, "loss": 0.0195, "mean_token_accuracy": 0.994908332824707, "step": 267 }, { "epoch": 1.1754385964912282, "grad_norm": 1.97144615650177, "learning_rate": 4.915481080070887e-06, "loss": 0.0277, "mean_token_accuracy": 0.9918864369392395, "step": 268 }, { "epoch": 1.1798245614035088, "grad_norm": 1.4325839281082153, "learning_rate": 4.8813098008778685e-06, "loss": 0.0249, "mean_token_accuracy": 0.9949135184288025, "step": 269 }, { "epoch": 1.1842105263157894, "grad_norm": 1.6865849494934082, "learning_rate": 4.847174846477448e-06, "loss": 0.024, "mean_token_accuracy": 0.9917440414428711, "step": 270 }, { "epoch": 1.1885964912280702, "grad_norm": 1.0741541385650635, "learning_rate": 4.813078221015065e-06, "loss": 0.011, "mean_token_accuracy": 0.9949698448181152, "step": 271 }, { "epoch": 1.1929824561403508, "grad_norm": 0.9860951900482178, "learning_rate": 4.779021926385771e-06, "loss": 0.0132, "mean_token_accuracy": 0.9968119263648987, "step": 272 }, { "epoch": 1.1973684210526316, "grad_norm": 0.8894346952438354, "learning_rate": 4.745007962116697e-06, "loss": 0.0098, "mean_token_accuracy": 0.996842086315155, "step": 273 }, { "epoch": 1.2017543859649122, "grad_norm": 1.0604538917541504, "learning_rate": 4.711038325249655e-06, "loss": 0.0187, "mean_token_accuracy": 0.9912366271018982, "step": 274 }, { "epoch": 1.206140350877193, "grad_norm": 1.4495034217834473, "learning_rate": 4.677115010223895e-06, "loss": 0.0242, "mean_token_accuracy": 0.9911330342292786, "step": 275 }, { "epoch": 1.2105263157894737, "grad_norm": 1.5287433862686157, "learning_rate": 4.6432400087589925e-06, "loss": 0.0239, "mean_token_accuracy": 0.9931237697601318, "step": 276 }, { "epoch": 1.2149122807017543, "grad_norm": 0.8271854519844055, "learning_rate": 4.609415309737922e-06, "loss": 0.0118, "mean_token_accuracy": 0.9979296326637268, "step": 277 }, { "epoch": 1.219298245614035, "grad_norm": 2.2274882793426514, "learning_rate": 4.5756428990902765e-06, "loss": 0.0347, "mean_token_accuracy": 0.9863445162773132, "step": 278 }, { "epoch": 1.2236842105263157, "grad_norm": 1.3514066934585571, "learning_rate": 4.541924759675677e-06, "loss": 0.028, "mean_token_accuracy": 0.9940179586410522, "step": 279 }, { "epoch": 1.2280701754385965, "grad_norm": 1.8070344924926758, "learning_rate": 4.508262871167347e-06, "loss": 0.0321, "mean_token_accuracy": 0.9906928539276123, "step": 280 }, { "epoch": 1.2324561403508771, "grad_norm": 0.6789629459381104, "learning_rate": 4.474659209935882e-06, "loss": 0.006, "mean_token_accuracy": 0.9977426528930664, "step": 281 }, { "epoch": 1.236842105263158, "grad_norm": 1.686585783958435, "learning_rate": 4.441115748933219e-06, "loss": 0.0238, "mean_token_accuracy": 0.9934210777282715, "step": 282 }, { "epoch": 1.2412280701754386, "grad_norm": 2.098029375076294, "learning_rate": 4.4076344575767895e-06, "loss": 0.018, "mean_token_accuracy": 0.9947643876075745, "step": 283 }, { "epoch": 1.2456140350877192, "grad_norm": 0.9982621669769287, "learning_rate": 4.374217301633897e-06, "loss": 0.0159, "mean_token_accuracy": 0.9956803321838379, "step": 284 }, { "epoch": 1.25, "grad_norm": 1.0374560356140137, "learning_rate": 4.340866243106302e-06, "loss": 0.0211, "mean_token_accuracy": 0.9930692911148071, "step": 285 }, { "epoch": 1.2543859649122808, "grad_norm": 1.416849970817566, "learning_rate": 4.307583240115024e-06, "loss": 0.0355, "mean_token_accuracy": 0.9919517040252686, "step": 286 }, { "epoch": 1.2587719298245614, "grad_norm": 1.1965305805206299, "learning_rate": 4.274370246785379e-06, "loss": 0.0094, "mean_token_accuracy": 0.9968051314353943, "step": 287 }, { "epoch": 1.263157894736842, "grad_norm": 0.9822368025779724, "learning_rate": 4.241229213132245e-06, "loss": 0.0133, "mean_token_accuracy": 0.995708167552948, "step": 288 }, { "epoch": 1.2675438596491229, "grad_norm": 1.1890642642974854, "learning_rate": 4.208162084945579e-06, "loss": 0.0237, "mean_token_accuracy": 0.994106113910675, "step": 289 }, { "epoch": 1.2719298245614035, "grad_norm": 0.9328476786613464, "learning_rate": 4.175170803676166e-06, "loss": 0.0177, "mean_token_accuracy": 0.9957805871963501, "step": 290 }, { "epoch": 1.2763157894736843, "grad_norm": 1.1378854513168335, "learning_rate": 4.142257306321635e-06, "loss": 0.0119, "mean_token_accuracy": 0.9957671761512756, "step": 291 }, { "epoch": 1.280701754385965, "grad_norm": 1.4593102931976318, "learning_rate": 4.109423525312738e-06, "loss": 0.0371, "mean_token_accuracy": 0.9897330403327942, "step": 292 }, { "epoch": 1.2850877192982457, "grad_norm": 1.4562864303588867, "learning_rate": 4.076671388399882e-06, "loss": 0.0268, "mean_token_accuracy": 0.9888211488723755, "step": 293 }, { "epoch": 1.2894736842105263, "grad_norm": 1.087637186050415, "learning_rate": 4.044002818539959e-06, "loss": 0.0126, "mean_token_accuracy": 0.9948612451553345, "step": 294 }, { "epoch": 1.293859649122807, "grad_norm": 1.2622966766357422, "learning_rate": 4.011419733783436e-06, "loss": 0.0174, "mean_token_accuracy": 0.9938587546348572, "step": 295 }, { "epoch": 1.2982456140350878, "grad_norm": 0.9577850103378296, "learning_rate": 3.978924047161738e-06, "loss": 0.0093, "mean_token_accuracy": 0.9958677887916565, "step": 296 }, { "epoch": 1.3026315789473684, "grad_norm": 1.082053780555725, "learning_rate": 3.946517666574944e-06, "loss": 0.0215, "mean_token_accuracy": 0.9957401752471924, "step": 297 }, { "epoch": 1.3070175438596492, "grad_norm": 1.4833756685256958, "learning_rate": 3.914202494679753e-06, "loss": 0.0253, "mean_token_accuracy": 0.9936237931251526, "step": 298 }, { "epoch": 1.3114035087719298, "grad_norm": 1.4708722829818726, "learning_rate": 3.8819804287777855e-06, "loss": 0.0256, "mean_token_accuracy": 0.9921104311943054, "step": 299 }, { "epoch": 1.3157894736842106, "grad_norm": 1.3255459070205688, "learning_rate": 3.849853360704185e-06, "loss": 0.0101, "mean_token_accuracy": 0.9958974123001099, "step": 300 }, { "epoch": 1.3201754385964912, "grad_norm": 1.0774561166763306, "learning_rate": 3.817823176716541e-06, "loss": 0.0208, "mean_token_accuracy": 0.997140109539032, "step": 301 }, { "epoch": 1.3245614035087718, "grad_norm": 0.8801477551460266, "learning_rate": 3.785891757384148e-06, "loss": 0.0129, "mean_token_accuracy": 0.9960474371910095, "step": 302 }, { "epoch": 1.3289473684210527, "grad_norm": 1.8041269779205322, "learning_rate": 3.7540609774775872e-06, "loss": 0.0319, "mean_token_accuracy": 0.9916054606437683, "step": 303 }, { "epoch": 1.3333333333333333, "grad_norm": 1.3232316970825195, "learning_rate": 3.7223327058586566e-06, "loss": 0.0249, "mean_token_accuracy": 0.9913700222969055, "step": 304 }, { "epoch": 1.337719298245614, "grad_norm": 1.796228289604187, "learning_rate": 3.6907088053706486e-06, "loss": 0.0267, "mean_token_accuracy": 0.9892683029174805, "step": 305 }, { "epoch": 1.3421052631578947, "grad_norm": 0.7775062918663025, "learning_rate": 3.659191132728971e-06, "loss": 0.0105, "mean_token_accuracy": 0.997863233089447, "step": 306 }, { "epoch": 1.3464912280701755, "grad_norm": 1.0526679754257202, "learning_rate": 3.6277815384121408e-06, "loss": 0.0108, "mean_token_accuracy": 0.9970443248748779, "step": 307 }, { "epoch": 1.3508771929824561, "grad_norm": 1.436594843864441, "learning_rate": 3.5964818665531365e-06, "loss": 0.0221, "mean_token_accuracy": 0.9921645522117615, "step": 308 }, { "epoch": 1.3552631578947367, "grad_norm": 1.1766964197158813, "learning_rate": 3.5652939548311217e-06, "loss": 0.0156, "mean_token_accuracy": 0.9943872690200806, "step": 309 }, { "epoch": 1.3596491228070176, "grad_norm": 1.0627126693725586, "learning_rate": 3.534219634363557e-06, "loss": 0.0187, "mean_token_accuracy": 0.9959142208099365, "step": 310 }, { "epoch": 1.3640350877192984, "grad_norm": 1.667644739151001, "learning_rate": 3.503260729598681e-06, "loss": 0.0326, "mean_token_accuracy": 0.9905857443809509, "step": 311 }, { "epoch": 1.368421052631579, "grad_norm": 2.0375561714172363, "learning_rate": 3.4724190582084073e-06, "loss": 0.0481, "mean_token_accuracy": 0.9823834300041199, "step": 312 }, { "epoch": 1.3728070175438596, "grad_norm": 1.3789243698120117, "learning_rate": 3.441696430981585e-06, "loss": 0.0201, "mean_token_accuracy": 0.9931034445762634, "step": 313 }, { "epoch": 1.3771929824561404, "grad_norm": 0.9940765500068665, "learning_rate": 3.4110946517176995e-06, "loss": 0.0178, "mean_token_accuracy": 0.9958890080451965, "step": 314 }, { "epoch": 1.381578947368421, "grad_norm": 1.2933381795883179, "learning_rate": 3.3806155171209632e-06, "loss": 0.0252, "mean_token_accuracy": 0.9917948842048645, "step": 315 }, { "epoch": 1.3859649122807016, "grad_norm": 1.69535493850708, "learning_rate": 3.3502608166948166e-06, "loss": 0.0376, "mean_token_accuracy": 0.991062581539154, "step": 316 }, { "epoch": 1.3903508771929824, "grad_norm": 0.9709140658378601, "learning_rate": 3.320032332636879e-06, "loss": 0.0115, "mean_token_accuracy": 0.9969103932380676, "step": 317 }, { "epoch": 1.3947368421052633, "grad_norm": 1.2243115901947021, "learning_rate": 3.2899318397342954e-06, "loss": 0.015, "mean_token_accuracy": 0.9939024448394775, "step": 318 }, { "epoch": 1.3991228070175439, "grad_norm": 2.733086347579956, "learning_rate": 3.2599611052595474e-06, "loss": 0.0081, "mean_token_accuracy": 0.9990205764770508, "step": 319 }, { "epoch": 1.4035087719298245, "grad_norm": 0.9694793820381165, "learning_rate": 3.2301218888666807e-06, "loss": 0.0129, "mean_token_accuracy": 0.9959142208099365, "step": 320 }, { "epoch": 1.4078947368421053, "grad_norm": 1.1064481735229492, "learning_rate": 3.200415942488003e-06, "loss": 0.0163, "mean_token_accuracy": 0.995854914188385, "step": 321 }, { "epoch": 1.412280701754386, "grad_norm": 1.0130923986434937, "learning_rate": 3.170845010231216e-06, "loss": 0.0142, "mean_token_accuracy": 0.9929789304733276, "step": 322 }, { "epoch": 1.4166666666666667, "grad_norm": 0.9644030928611755, "learning_rate": 3.141410828277015e-06, "loss": 0.0111, "mean_token_accuracy": 0.9979209899902344, "step": 323 }, { "epoch": 1.4210526315789473, "grad_norm": 0.9800613522529602, "learning_rate": 3.1121151247771595e-06, "loss": 0.014, "mean_token_accuracy": 0.9939637780189514, "step": 324 }, { "epoch": 1.4254385964912282, "grad_norm": 1.5895040035247803, "learning_rate": 3.082959619753001e-06, "loss": 0.0252, "mean_token_accuracy": 0.9931906461715698, "step": 325 }, { "epoch": 1.4298245614035088, "grad_norm": 1.449415683746338, "learning_rate": 3.053946024994506e-06, "loss": 0.0261, "mean_token_accuracy": 0.993062436580658, "step": 326 }, { "epoch": 1.4342105263157894, "grad_norm": 1.403334617614746, "learning_rate": 3.025076043959739e-06, "loss": 0.0311, "mean_token_accuracy": 0.9867346882820129, "step": 327 }, { "epoch": 1.4385964912280702, "grad_norm": 0.661690354347229, "learning_rate": 2.9963513716748656e-06, "loss": 0.006, "mean_token_accuracy": 0.9989304542541504, "step": 328 }, { "epoch": 1.4429824561403508, "grad_norm": 1.2454185485839844, "learning_rate": 2.96777369463462e-06, "loss": 0.0167, "mean_token_accuracy": 0.9941291809082031, "step": 329 }, { "epoch": 1.4473684210526316, "grad_norm": 1.5708200931549072, "learning_rate": 2.9393446907032886e-06, "loss": 0.0275, "mean_token_accuracy": 0.9918946027755737, "step": 330 }, { "epoch": 1.4517543859649122, "grad_norm": 1.4437129497528076, "learning_rate": 2.911066029016208e-06, "loss": 0.0185, "mean_token_accuracy": 0.9933142066001892, "step": 331 }, { "epoch": 1.456140350877193, "grad_norm": 1.310154914855957, "learning_rate": 2.8829393698817566e-06, "loss": 0.0146, "mean_token_accuracy": 0.9949392676353455, "step": 332 }, { "epoch": 1.4605263157894737, "grad_norm": 1.182435154914856, "learning_rate": 2.854966364683872e-06, "loss": 0.0145, "mean_token_accuracy": 0.9941691160202026, "step": 333 }, { "epoch": 1.4649122807017543, "grad_norm": 1.2059062719345093, "learning_rate": 2.827148655785107e-06, "loss": 0.0204, "mean_token_accuracy": 0.9931640625, "step": 334 }, { "epoch": 1.469298245614035, "grad_norm": 1.8456898927688599, "learning_rate": 2.7994878764301857e-06, "loss": 0.0249, "mean_token_accuracy": 0.9907975196838379, "step": 335 }, { "epoch": 1.4736842105263157, "grad_norm": 0.9849246740341187, "learning_rate": 2.771985650650131e-06, "loss": 0.0084, "mean_token_accuracy": 0.9968619346618652, "step": 336 }, { "epoch": 1.4780701754385965, "grad_norm": 0.9812407493591309, "learning_rate": 2.7446435931668913e-06, "loss": 0.0146, "mean_token_accuracy": 0.9926624894142151, "step": 337 }, { "epoch": 1.4824561403508771, "grad_norm": 1.353092908859253, "learning_rate": 2.717463309298557e-06, "loss": 0.0243, "mean_token_accuracy": 0.990981936454773, "step": 338 }, { "epoch": 1.486842105263158, "grad_norm": 1.1078283786773682, "learning_rate": 2.6904463948650994e-06, "loss": 0.0099, "mean_token_accuracy": 0.9957716464996338, "step": 339 }, { "epoch": 1.4912280701754386, "grad_norm": 1.0464521646499634, "learning_rate": 2.663594436094669e-06, "loss": 0.0116, "mean_token_accuracy": 0.9948132634162903, "step": 340 }, { "epoch": 1.4956140350877192, "grad_norm": 1.4984095096588135, "learning_rate": 2.6369090095304824e-06, "loss": 0.0168, "mean_token_accuracy": 0.9929577708244324, "step": 341 }, { "epoch": 1.5, "grad_norm": 1.4189928770065308, "learning_rate": 2.610391681938239e-06, "loss": 0.0192, "mean_token_accuracy": 0.9928789138793945, "step": 342 }, { "epoch": 1.5043859649122808, "grad_norm": 1.3250926733016968, "learning_rate": 2.5840440102141506e-06, "loss": 0.0171, "mean_token_accuracy": 0.9968684911727905, "step": 343 }, { "epoch": 1.5087719298245614, "grad_norm": 1.1156545877456665, "learning_rate": 2.5578675412935172e-06, "loss": 0.012, "mean_token_accuracy": 0.9957310557365417, "step": 344 }, { "epoch": 1.513157894736842, "grad_norm": 1.6553047895431519, "learning_rate": 2.531863812059916e-06, "loss": 0.0299, "mean_token_accuracy": 0.9908722043037415, "step": 345 }, { "epoch": 1.5175438596491229, "grad_norm": 1.5688374042510986, "learning_rate": 2.5060343492549567e-06, "loss": 0.0273, "mean_token_accuracy": 0.9896324276924133, "step": 346 }, { "epoch": 1.5219298245614035, "grad_norm": 0.9431729316711426, "learning_rate": 2.480380669388648e-06, "loss": 0.0113, "mean_token_accuracy": 0.9944953918457031, "step": 347 }, { "epoch": 1.526315789473684, "grad_norm": 1.5625497102737427, "learning_rate": 2.45490427865036e-06, "loss": 0.0251, "mean_token_accuracy": 0.991769552230835, "step": 348 }, { "epoch": 1.530701754385965, "grad_norm": 0.818818211555481, "learning_rate": 2.429606672820387e-06, "loss": 0.0083, "mean_token_accuracy": 0.998031497001648, "step": 349 }, { "epoch": 1.5350877192982457, "grad_norm": 1.1630852222442627, "learning_rate": 2.4044893371821373e-06, "loss": 0.0156, "mean_token_accuracy": 0.9918367266654968, "step": 350 }, { "epoch": 1.5394736842105263, "grad_norm": 1.1115548610687256, "learning_rate": 2.379553746434913e-06, "loss": 0.023, "mean_token_accuracy": 0.9939209818840027, "step": 351 }, { "epoch": 1.543859649122807, "grad_norm": 1.636430025100708, "learning_rate": 2.3548013646073427e-06, "loss": 0.0279, "mean_token_accuracy": 0.991623044013977, "step": 352 }, { "epoch": 1.5482456140350878, "grad_norm": 1.3159112930297852, "learning_rate": 2.3302336449714166e-06, "loss": 0.0231, "mean_token_accuracy": 0.9938587546348572, "step": 353 }, { "epoch": 1.5526315789473686, "grad_norm": 1.300433874130249, "learning_rate": 2.305852029957159e-06, "loss": 0.0177, "mean_token_accuracy": 0.9944812655448914, "step": 354 }, { "epoch": 1.557017543859649, "grad_norm": 0.9912800788879395, "learning_rate": 2.281657951067948e-06, "loss": 0.014, "mean_token_accuracy": 0.9920477271080017, "step": 355 }, { "epoch": 1.5614035087719298, "grad_norm": 1.0530201196670532, "learning_rate": 2.257652828796459e-06, "loss": 0.0199, "mean_token_accuracy": 0.9917184114456177, "step": 356 }, { "epoch": 1.5657894736842106, "grad_norm": 1.6638238430023193, "learning_rate": 2.233838072541273e-06, "loss": 0.0315, "mean_token_accuracy": 0.9910714030265808, "step": 357 }, { "epoch": 1.5701754385964912, "grad_norm": 0.8437574505805969, "learning_rate": 2.2102150805241233e-06, "loss": 0.0093, "mean_token_accuracy": 0.9969450235366821, "step": 358 }, { "epoch": 1.5745614035087718, "grad_norm": 0.6092004179954529, "learning_rate": 2.186785239707799e-06, "loss": 0.0062, "mean_token_accuracy": 0.998971164226532, "step": 359 }, { "epoch": 1.5789473684210527, "grad_norm": 1.3415589332580566, "learning_rate": 2.163549925714721e-06, "loss": 0.0254, "mean_token_accuracy": 0.9937824010848999, "step": 360 }, { "epoch": 1.5833333333333335, "grad_norm": 1.4913095235824585, "learning_rate": 2.140510502746168e-06, "loss": 0.0208, "mean_token_accuracy": 0.9936237931251526, "step": 361 }, { "epoch": 1.587719298245614, "grad_norm": 1.1327183246612549, "learning_rate": 2.1176683235021885e-06, "loss": 0.0154, "mean_token_accuracy": 0.9950690269470215, "step": 362 }, { "epoch": 1.5921052631578947, "grad_norm": 0.7053467631340027, "learning_rate": 2.0950247291021713e-06, "loss": 0.0081, "mean_token_accuracy": 0.996999979019165, "step": 363 }, { "epoch": 1.5964912280701755, "grad_norm": 0.7855185270309448, "learning_rate": 2.0725810490061156e-06, "loss": 0.0195, "mean_token_accuracy": 0.9970443248748779, "step": 364 }, { "epoch": 1.6008771929824561, "grad_norm": 1.0088672637939453, "learning_rate": 2.0503386009365685e-06, "loss": 0.0177, "mean_token_accuracy": 0.9940298795700073, "step": 365 }, { "epoch": 1.6052631578947367, "grad_norm": 1.4785208702087402, "learning_rate": 2.028298690801257e-06, "loss": 0.0254, "mean_token_accuracy": 0.9921645522117615, "step": 366 }, { "epoch": 1.6096491228070176, "grad_norm": 1.0246776342391968, "learning_rate": 2.006462612616422e-06, "loss": 0.0166, "mean_token_accuracy": 0.9929364323616028, "step": 367 }, { "epoch": 1.6140350877192984, "grad_norm": 1.642774224281311, "learning_rate": 1.984831648430836e-06, "loss": 0.0268, "mean_token_accuracy": 0.9900596141815186, "step": 368 }, { "epoch": 1.618421052631579, "grad_norm": 1.5654659271240234, "learning_rate": 1.963407068250538e-06, "loss": 0.0154, "mean_token_accuracy": 0.991623044013977, "step": 369 }, { "epoch": 1.6228070175438596, "grad_norm": 1.4609651565551758, "learning_rate": 1.9421901299642597e-06, "loss": 0.0242, "mean_token_accuracy": 0.9956569075584412, "step": 370 }, { "epoch": 1.6271929824561404, "grad_norm": 0.9023415446281433, "learning_rate": 1.9211820792695808e-06, "loss": 0.0117, "mean_token_accuracy": 0.9969040155410767, "step": 371 }, { "epoch": 1.631578947368421, "grad_norm": 1.6618549823760986, "learning_rate": 1.900384149599787e-06, "loss": 0.0204, "mean_token_accuracy": 0.9948979616165161, "step": 372 }, { "epoch": 1.6359649122807016, "grad_norm": 1.1040427684783936, "learning_rate": 1.8797975620514497e-06, "loss": 0.0156, "mean_token_accuracy": 0.9969199299812317, "step": 373 }, { "epoch": 1.6403508771929824, "grad_norm": 0.9365503191947937, "learning_rate": 1.8594235253127373e-06, "loss": 0.0138, "mean_token_accuracy": 0.9949596524238586, "step": 374 }, { "epoch": 1.6447368421052633, "grad_norm": 1.4119096994400024, "learning_rate": 1.8392632355924454e-06, "loss": 0.027, "mean_token_accuracy": 0.9888888597488403, "step": 375 }, { "epoch": 1.6491228070175439, "grad_norm": 1.5925312042236328, "learning_rate": 1.819317876549771e-06, "loss": 0.0221, "mean_token_accuracy": 0.9939698576927185, "step": 376 }, { "epoch": 1.6535087719298245, "grad_norm": 1.48525071144104, "learning_rate": 1.7995886192248091e-06, "loss": 0.0246, "mean_token_accuracy": 0.9900990128517151, "step": 377 }, { "epoch": 1.6578947368421053, "grad_norm": 1.229453682899475, "learning_rate": 1.7800766219698033e-06, "loss": 0.024, "mean_token_accuracy": 0.9931840300559998, "step": 378 }, { "epoch": 1.662280701754386, "grad_norm": 0.9059959053993225, "learning_rate": 1.760783030381138e-06, "loss": 0.0114, "mean_token_accuracy": 0.99798184633255, "step": 379 }, { "epoch": 1.6666666666666665, "grad_norm": 1.1679930686950684, "learning_rate": 1.74170897723207e-06, "loss": 0.0271, "mean_token_accuracy": 0.9939698576927185, "step": 380 }, { "epoch": 1.6710526315789473, "grad_norm": 1.0880180597305298, "learning_rate": 1.7228555824062254e-06, "loss": 0.0253, "mean_token_accuracy": 0.9911330342292786, "step": 381 }, { "epoch": 1.6754385964912282, "grad_norm": 1.014100432395935, "learning_rate": 1.7042239528318539e-06, "loss": 0.0176, "mean_token_accuracy": 0.9950099587440491, "step": 382 }, { "epoch": 1.6798245614035088, "grad_norm": 0.5912336707115173, "learning_rate": 1.6858151824168254e-06, "loss": 0.0094, "mean_token_accuracy": 0.9989626407623291, "step": 383 }, { "epoch": 1.6842105263157894, "grad_norm": 0.47988444566726685, "learning_rate": 1.6676303519844179e-06, "loss": 0.0045, "mean_token_accuracy": 0.9989888668060303, "step": 384 }, { "epoch": 1.6885964912280702, "grad_norm": 0.8239793181419373, "learning_rate": 1.649670529209848e-06, "loss": 0.0134, "mean_token_accuracy": 0.9968085289001465, "step": 385 }, { "epoch": 1.692982456140351, "grad_norm": 0.7349763512611389, "learning_rate": 1.631936768557596e-06, "loss": 0.0158, "mean_token_accuracy": 0.9979444742202759, "step": 386 }, { "epoch": 1.6973684210526314, "grad_norm": 0.9158388376235962, "learning_rate": 1.6144301112194843e-06, "loss": 0.018, "mean_token_accuracy": 0.9950787425041199, "step": 387 }, { "epoch": 1.7017543859649122, "grad_norm": 0.8673951029777527, "learning_rate": 1.5971515850535568e-06, "loss": 0.0143, "mean_token_accuracy": 0.9948024749755859, "step": 388 }, { "epoch": 1.706140350877193, "grad_norm": 1.3981205224990845, "learning_rate": 1.5801022045237252e-06, "loss": 0.0158, "mean_token_accuracy": 0.9948822855949402, "step": 389 }, { "epoch": 1.7105263157894737, "grad_norm": 1.0226213932037354, "learning_rate": 1.5632829706402076e-06, "loss": 0.0157, "mean_token_accuracy": 0.992790937423706, "step": 390 }, { "epoch": 1.7149122807017543, "grad_norm": 1.0367653369903564, "learning_rate": 1.5466948709007604e-06, "loss": 0.0128, "mean_token_accuracy": 0.9968944191932678, "step": 391 }, { "epoch": 1.719298245614035, "grad_norm": 0.9219287633895874, "learning_rate": 1.5303388792326934e-06, "loss": 0.0091, "mean_token_accuracy": 0.9959595799446106, "step": 392 }, { "epoch": 1.723684210526316, "grad_norm": 1.0634158849716187, "learning_rate": 1.5142159559356961e-06, "loss": 0.0217, "mean_token_accuracy": 0.9946178793907166, "step": 393 }, { "epoch": 1.7280701754385965, "grad_norm": 0.6948128938674927, "learning_rate": 1.4983270476254503e-06, "loss": 0.0101, "mean_token_accuracy": 0.9969666600227356, "step": 394 }, { "epoch": 1.7324561403508771, "grad_norm": 0.7936986088752747, "learning_rate": 1.4826730871780534e-06, "loss": 0.008, "mean_token_accuracy": 0.9979550242424011, "step": 395 }, { "epoch": 1.736842105263158, "grad_norm": 1.4505982398986816, "learning_rate": 1.4672549936752507e-06, "loss": 0.0238, "mean_token_accuracy": 0.9917269945144653, "step": 396 }, { "epoch": 1.7412280701754386, "grad_norm": 1.2325935363769531, "learning_rate": 1.4520736723504658e-06, "loss": 0.0295, "mean_token_accuracy": 0.992790937423706, "step": 397 }, { "epoch": 1.7456140350877192, "grad_norm": 0.6576675772666931, "learning_rate": 1.437130014535662e-06, "loss": 0.0054, "mean_token_accuracy": 0.9979166388511658, "step": 398 }, { "epoch": 1.75, "grad_norm": 1.0155889987945557, "learning_rate": 1.4224248976090016e-06, "loss": 0.017, "mean_token_accuracy": 0.9958974123001099, "step": 399 }, { "epoch": 1.7543859649122808, "grad_norm": 1.2243722677230835, "learning_rate": 1.4079591849433383e-06, "loss": 0.0179, "mean_token_accuracy": 0.9928352236747742, "step": 400 }, { "epoch": 1.7587719298245614, "grad_norm": 0.9892923831939697, "learning_rate": 1.3937337258555252e-06, "loss": 0.0109, "mean_token_accuracy": 0.9957761168479919, "step": 401 }, { "epoch": 1.763157894736842, "grad_norm": 0.8138551115989685, "learning_rate": 1.379749355556547e-06, "loss": 0.0151, "mean_token_accuracy": 0.9960435032844543, "step": 402 }, { "epoch": 1.7675438596491229, "grad_norm": 0.639743447303772, "learning_rate": 1.3660068951024857e-06, "loss": 0.0098, "mean_token_accuracy": 0.9989506602287292, "step": 403 }, { "epoch": 1.7719298245614035, "grad_norm": 1.450443148612976, "learning_rate": 1.3525071513463128e-06, "loss": 0.0236, "mean_token_accuracy": 0.9903948903083801, "step": 404 }, { "epoch": 1.776315789473684, "grad_norm": 1.0791845321655273, "learning_rate": 1.339250916890519e-06, "loss": 0.0215, "mean_token_accuracy": 0.992790937423706, "step": 405 }, { "epoch": 1.780701754385965, "grad_norm": 1.399596929550171, "learning_rate": 1.3262389700405746e-06, "loss": 0.0186, "mean_token_accuracy": 0.9940476417541504, "step": 406 }, { "epoch": 1.7850877192982457, "grad_norm": 0.9804695248603821, "learning_rate": 1.3134720747592373e-06, "loss": 0.0184, "mean_token_accuracy": 0.9948506951332092, "step": 407 }, { "epoch": 1.7894736842105263, "grad_norm": 0.8765610456466675, "learning_rate": 1.3009509806216986e-06, "loss": 0.0074, "mean_token_accuracy": 0.9968782663345337, "step": 408 }, { "epoch": 1.793859649122807, "grad_norm": 1.2848998308181763, "learning_rate": 1.2886764227715679e-06, "loss": 0.0178, "mean_token_accuracy": 0.9906736016273499, "step": 409 }, { "epoch": 1.7982456140350878, "grad_norm": 1.2028905153274536, "learning_rate": 1.2766491218777197e-06, "loss": 0.0173, "mean_token_accuracy": 0.9950835704803467, "step": 410 }, { "epoch": 1.8026315789473686, "grad_norm": 1.0921279191970825, "learning_rate": 1.2648697840919732e-06, "loss": 0.014, "mean_token_accuracy": 0.9947368502616882, "step": 411 }, { "epoch": 1.807017543859649, "grad_norm": 0.8034150004386902, "learning_rate": 1.2533391010076381e-06, "loss": 0.0115, "mean_token_accuracy": 0.9960317611694336, "step": 412 }, { "epoch": 1.8114035087719298, "grad_norm": 1.2984596490859985, "learning_rate": 1.2420577496189063e-06, "loss": 0.0372, "mean_token_accuracy": 0.9912366271018982, "step": 413 }, { "epoch": 1.8157894736842106, "grad_norm": 1.6242046356201172, "learning_rate": 1.2310263922811048e-06, "loss": 0.018, "mean_token_accuracy": 0.9937499761581421, "step": 414 }, { "epoch": 1.8201754385964912, "grad_norm": 0.7923622131347656, "learning_rate": 1.2202456766718092e-06, "loss": 0.0183, "mean_token_accuracy": 0.9956849813461304, "step": 415 }, { "epoch": 1.8245614035087718, "grad_norm": 0.9306057095527649, "learning_rate": 1.2097162357528128e-06, "loss": 0.0118, "mean_token_accuracy": 0.9959555268287659, "step": 416 }, { "epoch": 1.8289473684210527, "grad_norm": 1.0107431411743164, "learning_rate": 1.1994386877329678e-06, "loss": 0.0182, "mean_token_accuracy": 0.9948875308036804, "step": 417 }, { "epoch": 1.8333333333333335, "grad_norm": 1.1881065368652344, "learning_rate": 1.189413636031886e-06, "loss": 0.0172, "mean_token_accuracy": 0.9960707426071167, "step": 418 }, { "epoch": 1.837719298245614, "grad_norm": 1.2647409439086914, "learning_rate": 1.179641669244514e-06, "loss": 0.0264, "mean_token_accuracy": 0.990750253200531, "step": 419 }, { "epoch": 1.8421052631578947, "grad_norm": 1.1645160913467407, "learning_rate": 1.1701233611065705e-06, "loss": 0.0123, "mean_token_accuracy": 0.9961464405059814, "step": 420 }, { "epoch": 1.8464912280701755, "grad_norm": 1.337747573852539, "learning_rate": 1.1608592704608656e-06, "loss": 0.0087, "mean_token_accuracy": 0.995708167552948, "step": 421 }, { "epoch": 1.8508771929824561, "grad_norm": 1.1848918199539185, "learning_rate": 1.1518499412244872e-06, "loss": 0.0178, "mean_token_accuracy": 0.9929006099700928, "step": 422 }, { "epoch": 1.8552631578947367, "grad_norm": 1.308233380317688, "learning_rate": 1.1430959023568654e-06, "loss": 0.0227, "mean_token_accuracy": 0.9948186278343201, "step": 423 }, { "epoch": 1.8596491228070176, "grad_norm": 0.8815935254096985, "learning_rate": 1.1345976678287216e-06, "loss": 0.0095, "mean_token_accuracy": 0.9958115220069885, "step": 424 }, { "epoch": 1.8640350877192984, "grad_norm": 1.0849359035491943, "learning_rate": 1.126355736591882e-06, "loss": 0.0213, "mean_token_accuracy": 0.9948875308036804, "step": 425 }, { "epoch": 1.868421052631579, "grad_norm": 0.9567351937294006, "learning_rate": 1.1183705925499948e-06, "loss": 0.0096, "mean_token_accuracy": 0.9960707426071167, "step": 426 }, { "epoch": 1.8728070175438596, "grad_norm": 0.8804600834846497, "learning_rate": 1.1106427045301085e-06, "loss": 0.0111, "mean_token_accuracy": 0.99689120054245, "step": 427 }, { "epoch": 1.8771929824561404, "grad_norm": 1.2652560472488403, "learning_rate": 1.1031725262551536e-06, "loss": 0.0207, "mean_token_accuracy": 0.9969574213027954, "step": 428 }, { "epoch": 1.881578947368421, "grad_norm": 0.8557901978492737, "learning_rate": 1.0959604963172996e-06, "loss": 0.0101, "mean_token_accuracy": 0.9969879388809204, "step": 429 }, { "epoch": 1.8859649122807016, "grad_norm": 0.7391759753227234, "learning_rate": 1.0890070381522038e-06, "loss": 0.0109, "mean_token_accuracy": 0.998993992805481, "step": 430 }, { "epoch": 1.8903508771929824, "grad_norm": 1.2102177143096924, "learning_rate": 1.0823125600141529e-06, "loss": 0.0222, "mean_token_accuracy": 0.9951028227806091, "step": 431 }, { "epoch": 1.8947368421052633, "grad_norm": 1.0668492317199707, "learning_rate": 1.0758774549520922e-06, "loss": 0.0212, "mean_token_accuracy": 0.9962581992149353, "step": 432 }, { "epoch": 1.8991228070175439, "grad_norm": 0.9030259847640991, "learning_rate": 1.069702100786548e-06, "loss": 0.0115, "mean_token_accuracy": 0.9958376884460449, "step": 433 }, { "epoch": 1.9035087719298245, "grad_norm": 1.0430710315704346, "learning_rate": 1.0637868600874448e-06, "loss": 0.017, "mean_token_accuracy": 0.9969103932380676, "step": 434 }, { "epoch": 1.9078947368421053, "grad_norm": 1.3210341930389404, "learning_rate": 1.0581320801528202e-06, "loss": 0.0143, "mean_token_accuracy": 0.9938207864761353, "step": 435 }, { "epoch": 1.912280701754386, "grad_norm": 0.8668481111526489, "learning_rate": 1.0527380929884324e-06, "loss": 0.0096, "mean_token_accuracy": 0.9968119263648987, "step": 436 }, { "epoch": 1.9166666666666665, "grad_norm": 0.8239589333534241, "learning_rate": 1.0476052152882653e-06, "loss": 0.0103, "mean_token_accuracy": 0.9959636926651001, "step": 437 }, { "epoch": 1.9210526315789473, "grad_norm": 1.4379644393920898, "learning_rate": 1.0427337484159404e-06, "loss": 0.0382, "mean_token_accuracy": 0.9892367720603943, "step": 438 }, { "epoch": 1.9254385964912282, "grad_norm": 1.5320522785186768, "learning_rate": 1.0381239783870168e-06, "loss": 0.0294, "mean_token_accuracy": 0.9927158951759338, "step": 439 }, { "epoch": 1.9298245614035088, "grad_norm": 0.6085926294326782, "learning_rate": 1.0337761758522028e-06, "loss": 0.0043, "mean_token_accuracy": 0.9989583492279053, "step": 440 }, { "epoch": 1.9342105263157894, "grad_norm": 1.037218451499939, "learning_rate": 1.0296905960814626e-06, "loss": 0.0144, "mean_token_accuracy": 0.9921348094940186, "step": 441 }, { "epoch": 1.9385964912280702, "grad_norm": 0.5523264408111572, "learning_rate": 1.025867478949031e-06, "loss": 0.0044, "mean_token_accuracy": 0.9989429116249084, "step": 442 }, { "epoch": 1.942982456140351, "grad_norm": 0.9617688655853271, "learning_rate": 1.0223070489193277e-06, "loss": 0.0185, "mean_token_accuracy": 0.9959058165550232, "step": 443 }, { "epoch": 1.9473684210526314, "grad_norm": 0.8199120163917542, "learning_rate": 1.0190095150337812e-06, "loss": 0.0086, "mean_token_accuracy": 0.9969819188117981, "step": 444 }, { "epoch": 1.9517543859649122, "grad_norm": 0.6348384022712708, "learning_rate": 1.015975070898552e-06, "loss": 0.007, "mean_token_accuracy": 0.9978540539741516, "step": 445 }, { "epoch": 1.956140350877193, "grad_norm": 1.40470552444458, "learning_rate": 1.0132038946731682e-06, "loss": 0.0218, "mean_token_accuracy": 0.9930139780044556, "step": 446 }, { "epoch": 1.9605263157894737, "grad_norm": 1.2154960632324219, "learning_rate": 1.0106961490600648e-06, "loss": 0.0158, "mean_token_accuracy": 0.9927158951759338, "step": 447 }, { "epoch": 1.9649122807017543, "grad_norm": 1.1413288116455078, "learning_rate": 1.0084519812950302e-06, "loss": 0.0191, "mean_token_accuracy": 0.994301974773407, "step": 448 }, { "epoch": 1.969298245614035, "grad_norm": 0.8637524247169495, "learning_rate": 1.0064715231385614e-06, "loss": 0.0116, "mean_token_accuracy": 0.9956803321838379, "step": 449 }, { "epoch": 1.973684210526316, "grad_norm": 0.9894522428512573, "learning_rate": 1.0047548908681308e-06, "loss": 0.01, "mean_token_accuracy": 0.9977900385856628, "step": 450 }, { "epoch": 1.9780701754385965, "grad_norm": 0.6048401594161987, "learning_rate": 1.003302185271355e-06, "loss": 0.0048, "mean_token_accuracy": 0.998971164226532, "step": 451 }, { "epoch": 1.9824561403508771, "grad_norm": 0.8870363831520081, "learning_rate": 1.002113491640081e-06, "loss": 0.013, "mean_token_accuracy": 0.9958974123001099, "step": 452 }, { "epoch": 1.986842105263158, "grad_norm": 1.6314061880111694, "learning_rate": 1.001188879765377e-06, "loss": 0.0356, "mean_token_accuracy": 0.9894958138465881, "step": 453 }, { "epoch": 1.9912280701754386, "grad_norm": 1.25128173828125, "learning_rate": 1.000528403933433e-06, "loss": 0.0161, "mean_token_accuracy": 0.9958847761154175, "step": 454 }, { "epoch": 1.9956140350877192, "grad_norm": 1.4350489377975464, "learning_rate": 1.0001321029223788e-06, "loss": 0.0141, "mean_token_accuracy": 0.9948875308036804, "step": 455 }, { "epoch": 2.0, "grad_norm": 1.1879558563232422, "learning_rate": 1.0000000000000002e-06, "loss": 0.0127, "mean_token_accuracy": 0.9937952160835266, "step": 456 }, { "epoch": 2.0, "step": 456, "total_flos": 1.0952542154391552e+17, "train_loss": 0.13070940664814165, "train_runtime": 1765.7438, "train_samples_per_second": 8.246, "train_steps_per_second": 0.258 } ], "logging_steps": 1, "max_steps": 456, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0952542154391552e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }