{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 430, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004651162790697674, "grad_norm": 19.502025604248047, "learning_rate": 0.0, "loss": 0.842, "mean_token_accuracy": 0.8234314918518066, "step": 1 }, { "epoch": 0.009302325581395349, "grad_norm": 21.47846031188965, "learning_rate": 2.3255813953488374e-07, "loss": 0.8778, "mean_token_accuracy": 0.8233309388160706, "step": 2 }, { "epoch": 0.013953488372093023, "grad_norm": 19.13929557800293, "learning_rate": 4.651162790697675e-07, "loss": 0.8513, "mean_token_accuracy": 0.8330637216567993, "step": 3 }, { "epoch": 0.018604651162790697, "grad_norm": 18.666189193725586, "learning_rate": 6.976744186046513e-07, "loss": 0.8217, "mean_token_accuracy": 0.8303450345993042, "step": 4 }, { "epoch": 0.023255813953488372, "grad_norm": 22.670207977294922, "learning_rate": 9.30232558139535e-07, "loss": 0.9557, "mean_token_accuracy": 0.8066383600234985, "step": 5 }, { "epoch": 0.027906976744186046, "grad_norm": 21.466005325317383, "learning_rate": 1.1627906976744188e-06, "loss": 0.845, "mean_token_accuracy": 0.8213537931442261, "step": 6 }, { "epoch": 0.03255813953488372, "grad_norm": 20.89145851135254, "learning_rate": 1.3953488372093025e-06, "loss": 0.8985, "mean_token_accuracy": 0.8202102184295654, "step": 7 }, { "epoch": 0.037209302325581395, "grad_norm": 13.6134614944458, "learning_rate": 1.6279069767441862e-06, "loss": 0.755, "mean_token_accuracy": 0.8332804441452026, "step": 8 }, { "epoch": 0.04186046511627907, "grad_norm": 12.933883666992188, "learning_rate": 1.86046511627907e-06, "loss": 0.6942, "mean_token_accuracy": 0.8436604738235474, "step": 9 }, { "epoch": 0.046511627906976744, "grad_norm": 11.615815162658691, "learning_rate": 2.0930232558139536e-06, "loss": 0.645, "mean_token_accuracy": 0.8547393083572388, "step": 10 }, { "epoch": 0.05116279069767442, "grad_norm": 11.619194030761719, "learning_rate": 2.3255813953488376e-06, "loss": 0.5963, "mean_token_accuracy": 0.8689830303192139, "step": 11 }, { "epoch": 0.05581395348837209, "grad_norm": 9.622608184814453, "learning_rate": 2.558139534883721e-06, "loss": 0.6343, "mean_token_accuracy": 0.8541088104248047, "step": 12 }, { "epoch": 0.06046511627906977, "grad_norm": 8.54611873626709, "learning_rate": 2.790697674418605e-06, "loss": 0.5877, "mean_token_accuracy": 0.8466028571128845, "step": 13 }, { "epoch": 0.06511627906976744, "grad_norm": 9.963361740112305, "learning_rate": 3.0232558139534885e-06, "loss": 0.4568, "mean_token_accuracy": 0.8818826079368591, "step": 14 }, { "epoch": 0.06976744186046512, "grad_norm": 8.291352272033691, "learning_rate": 3.2558139534883724e-06, "loss": 0.4129, "mean_token_accuracy": 0.8876066207885742, "step": 15 }, { "epoch": 0.07441860465116279, "grad_norm": 7.158071994781494, "learning_rate": 3.4883720930232564e-06, "loss": 0.4261, "mean_token_accuracy": 0.8846276998519897, "step": 16 }, { "epoch": 0.07906976744186046, "grad_norm": 4.239832878112793, "learning_rate": 3.72093023255814e-06, "loss": 0.3677, "mean_token_accuracy": 0.8992332816123962, "step": 17 }, { "epoch": 0.08372093023255814, "grad_norm": 4.137484073638916, "learning_rate": 3.953488372093024e-06, "loss": 0.3967, "mean_token_accuracy": 0.8831896185874939, "step": 18 }, { "epoch": 0.08837209302325581, "grad_norm": 7.036076068878174, "learning_rate": 4.186046511627907e-06, "loss": 0.4065, "mean_token_accuracy": 0.8782775402069092, "step": 19 }, { "epoch": 0.09302325581395349, "grad_norm": 3.3408806324005127, "learning_rate": 4.418604651162791e-06, "loss": 0.3653, "mean_token_accuracy": 0.8928460478782654, "step": 20 }, { "epoch": 0.09767441860465116, "grad_norm": 3.5764951705932617, "learning_rate": 4.651162790697675e-06, "loss": 0.4158, "mean_token_accuracy": 0.8744533658027649, "step": 21 }, { "epoch": 0.10232558139534884, "grad_norm": 3.2680561542510986, "learning_rate": 4.883720930232559e-06, "loss": 0.3608, "mean_token_accuracy": 0.8843502998352051, "step": 22 }, { "epoch": 0.10697674418604651, "grad_norm": 3.0347847938537598, "learning_rate": 5.116279069767442e-06, "loss": 0.3442, "mean_token_accuracy": 0.8947029709815979, "step": 23 }, { "epoch": 0.11162790697674418, "grad_norm": 3.145249843597412, "learning_rate": 5.348837209302326e-06, "loss": 0.3726, "mean_token_accuracy": 0.882703423500061, "step": 24 }, { "epoch": 0.11627906976744186, "grad_norm": 3.2718427181243896, "learning_rate": 5.58139534883721e-06, "loss": 0.3467, "mean_token_accuracy": 0.8916211128234863, "step": 25 }, { "epoch": 0.12093023255813953, "grad_norm": 3.159402847290039, "learning_rate": 5.8139534883720935e-06, "loss": 0.3374, "mean_token_accuracy": 0.8960058689117432, "step": 26 }, { "epoch": 0.12558139534883722, "grad_norm": 3.0180864334106445, "learning_rate": 6.046511627906977e-06, "loss": 0.3579, "mean_token_accuracy": 0.8927838802337646, "step": 27 }, { "epoch": 0.13023255813953488, "grad_norm": 3.1647789478302, "learning_rate": 6.279069767441861e-06, "loss": 0.3569, "mean_token_accuracy": 0.8896810412406921, "step": 28 }, { "epoch": 0.13488372093023257, "grad_norm": 2.948369264602661, "learning_rate": 6.511627906976745e-06, "loss": 0.3782, "mean_token_accuracy": 0.8793545365333557, "step": 29 }, { "epoch": 0.13953488372093023, "grad_norm": 3.0937540531158447, "learning_rate": 6.744186046511628e-06, "loss": 0.3703, "mean_token_accuracy": 0.8813475966453552, "step": 30 }, { "epoch": 0.14418604651162792, "grad_norm": 3.3827579021453857, "learning_rate": 6.976744186046513e-06, "loss": 0.3959, "mean_token_accuracy": 0.8739346861839294, "step": 31 }, { "epoch": 0.14883720930232558, "grad_norm": 3.005185127258301, "learning_rate": 7.209302325581395e-06, "loss": 0.3191, "mean_token_accuracy": 0.9007467031478882, "step": 32 }, { "epoch": 0.15348837209302327, "grad_norm": 3.6860156059265137, "learning_rate": 7.44186046511628e-06, "loss": 0.3518, "mean_token_accuracy": 0.8902492523193359, "step": 33 }, { "epoch": 0.15813953488372093, "grad_norm": 2.850628137588501, "learning_rate": 7.674418604651164e-06, "loss": 0.3055, "mean_token_accuracy": 0.9046649932861328, "step": 34 }, { "epoch": 0.16279069767441862, "grad_norm": 3.145270347595215, "learning_rate": 7.906976744186048e-06, "loss": 0.3553, "mean_token_accuracy": 0.893163800239563, "step": 35 }, { "epoch": 0.16744186046511628, "grad_norm": 3.0683932304382324, "learning_rate": 8.139534883720931e-06, "loss": 0.3326, "mean_token_accuracy": 0.8977251648902893, "step": 36 }, { "epoch": 0.17209302325581396, "grad_norm": 3.071561336517334, "learning_rate": 8.372093023255815e-06, "loss": 0.3218, "mean_token_accuracy": 0.9021276831626892, "step": 37 }, { "epoch": 0.17674418604651163, "grad_norm": 3.128781318664551, "learning_rate": 8.604651162790698e-06, "loss": 0.3417, "mean_token_accuracy": 0.899016261100769, "step": 38 }, { "epoch": 0.1813953488372093, "grad_norm": 2.9364817142486572, "learning_rate": 8.837209302325582e-06, "loss": 0.3202, "mean_token_accuracy": 0.9026311039924622, "step": 39 }, { "epoch": 0.18604651162790697, "grad_norm": 2.9265284538269043, "learning_rate": 9.069767441860465e-06, "loss": 0.3426, "mean_token_accuracy": 0.8922522664070129, "step": 40 }, { "epoch": 0.19069767441860466, "grad_norm": 2.898973226547241, "learning_rate": 9.30232558139535e-06, "loss": 0.3363, "mean_token_accuracy": 0.8978930115699768, "step": 41 }, { "epoch": 0.19534883720930232, "grad_norm": 3.2877488136291504, "learning_rate": 9.534883720930234e-06, "loss": 0.3378, "mean_token_accuracy": 0.8954758048057556, "step": 42 }, { "epoch": 0.2, "grad_norm": 3.0985326766967773, "learning_rate": 9.767441860465117e-06, "loss": 0.3116, "mean_token_accuracy": 0.9019308090209961, "step": 43 }, { "epoch": 0.20465116279069767, "grad_norm": 2.912959575653076, "learning_rate": 1e-05, "loss": 0.3394, "mean_token_accuracy": 0.8979052901268005, "step": 44 }, { "epoch": 0.20930232558139536, "grad_norm": 3.3199048042297363, "learning_rate": 9.999851728408726e-06, "loss": 0.3074, "mean_token_accuracy": 0.9037721157073975, "step": 45 }, { "epoch": 0.21395348837209302, "grad_norm": 3.068047046661377, "learning_rate": 9.999406923405777e-06, "loss": 0.3081, "mean_token_accuracy": 0.9021856188774109, "step": 46 }, { "epoch": 0.2186046511627907, "grad_norm": 2.947935104370117, "learning_rate": 9.998665614303127e-06, "loss": 0.3174, "mean_token_accuracy": 0.900806725025177, "step": 47 }, { "epoch": 0.22325581395348837, "grad_norm": 2.833920478820801, "learning_rate": 9.997627849951926e-06, "loss": 0.3281, "mean_token_accuracy": 0.8946992754936218, "step": 48 }, { "epoch": 0.22790697674418606, "grad_norm": 3.0928986072540283, "learning_rate": 9.996293698739271e-06, "loss": 0.3201, "mean_token_accuracy": 0.8966386318206787, "step": 49 }, { "epoch": 0.23255813953488372, "grad_norm": 2.9233152866363525, "learning_rate": 9.994663248583704e-06, "loss": 0.3263, "mean_token_accuracy": 0.8985762000083923, "step": 50 }, { "epoch": 0.2372093023255814, "grad_norm": 3.122283697128296, "learning_rate": 9.992736606929422e-06, "loss": 0.3401, "mean_token_accuracy": 0.8825533390045166, "step": 51 }, { "epoch": 0.24186046511627907, "grad_norm": 2.8861806392669678, "learning_rate": 9.990513900739192e-06, "loss": 0.3384, "mean_token_accuracy": 0.8865090012550354, "step": 52 }, { "epoch": 0.24651162790697675, "grad_norm": 3.0938143730163574, "learning_rate": 9.987995276485984e-06, "loss": 0.3231, "mean_token_accuracy": 0.8968112468719482, "step": 53 }, { "epoch": 0.25116279069767444, "grad_norm": 2.87567400932312, "learning_rate": 9.985180900143318e-06, "loss": 0.3352, "mean_token_accuracy": 0.8892236948013306, "step": 54 }, { "epoch": 0.2558139534883721, "grad_norm": 2.760570764541626, "learning_rate": 9.982070957174334e-06, "loss": 0.3362, "mean_token_accuracy": 0.8886131048202515, "step": 55 }, { "epoch": 0.26046511627906976, "grad_norm": 2.875344753265381, "learning_rate": 9.978665652519562e-06, "loss": 0.3292, "mean_token_accuracy": 0.8967674374580383, "step": 56 }, { "epoch": 0.2651162790697674, "grad_norm": 2.552971124649048, "learning_rate": 9.97496521058342e-06, "loss": 0.2887, "mean_token_accuracy": 0.9075325727462769, "step": 57 }, { "epoch": 0.26976744186046514, "grad_norm": 2.6377763748168945, "learning_rate": 9.970969875219422e-06, "loss": 0.316, "mean_token_accuracy": 0.8951440453529358, "step": 58 }, { "epoch": 0.2744186046511628, "grad_norm": 2.8528380393981934, "learning_rate": 9.96667990971412e-06, "loss": 0.2988, "mean_token_accuracy": 0.9043198823928833, "step": 59 }, { "epoch": 0.27906976744186046, "grad_norm": 2.762105703353882, "learning_rate": 9.962095596769738e-06, "loss": 0.3143, "mean_token_accuracy": 0.894010066986084, "step": 60 }, { "epoch": 0.2837209302325581, "grad_norm": 2.994401216506958, "learning_rate": 9.957217238485557e-06, "loss": 0.3472, "mean_token_accuracy": 0.8801930546760559, "step": 61 }, { "epoch": 0.28837209302325584, "grad_norm": 2.898961067199707, "learning_rate": 9.952045156337998e-06, "loss": 0.345, "mean_token_accuracy": 0.8863232731819153, "step": 62 }, { "epoch": 0.2930232558139535, "grad_norm": 2.6734066009521484, "learning_rate": 9.946579691159434e-06, "loss": 0.3159, "mean_token_accuracy": 0.8985881805419922, "step": 63 }, { "epoch": 0.29767441860465116, "grad_norm": 2.6497905254364014, "learning_rate": 9.940821203115742e-06, "loss": 0.3156, "mean_token_accuracy": 0.8967551589012146, "step": 64 }, { "epoch": 0.3023255813953488, "grad_norm": 2.952893018722534, "learning_rate": 9.934770071682563e-06, "loss": 0.3379, "mean_token_accuracy": 0.8862144351005554, "step": 65 }, { "epoch": 0.30697674418604654, "grad_norm": 2.4500250816345215, "learning_rate": 9.928426695620288e-06, "loss": 0.3051, "mean_token_accuracy": 0.8942438960075378, "step": 66 }, { "epoch": 0.3116279069767442, "grad_norm": 2.6351726055145264, "learning_rate": 9.92179149294779e-06, "loss": 0.3226, "mean_token_accuracy": 0.8906832337379456, "step": 67 }, { "epoch": 0.31627906976744186, "grad_norm": 2.530116558074951, "learning_rate": 9.914864900914875e-06, "loss": 0.2882, "mean_token_accuracy": 0.899300217628479, "step": 68 }, { "epoch": 0.3209302325581395, "grad_norm": 2.615708827972412, "learning_rate": 9.907647375973461e-06, "loss": 0.309, "mean_token_accuracy": 0.8936994075775146, "step": 69 }, { "epoch": 0.32558139534883723, "grad_norm": 2.7337634563446045, "learning_rate": 9.90013939374751e-06, "loss": 0.3138, "mean_token_accuracy": 0.8906617164611816, "step": 70 }, { "epoch": 0.3302325581395349, "grad_norm": 2.6460962295532227, "learning_rate": 9.892341449001673e-06, "loss": 0.3118, "mean_token_accuracy": 0.8964554071426392, "step": 71 }, { "epoch": 0.33488372093023255, "grad_norm": 2.540764093399048, "learning_rate": 9.884254055608696e-06, "loss": 0.3142, "mean_token_accuracy": 0.8957387208938599, "step": 72 }, { "epoch": 0.3395348837209302, "grad_norm": 2.7574427127838135, "learning_rate": 9.875877746515556e-06, "loss": 0.3377, "mean_token_accuracy": 0.8824289441108704, "step": 73 }, { "epoch": 0.34418604651162793, "grad_norm": 2.4597768783569336, "learning_rate": 9.867213073708324e-06, "loss": 0.2907, "mean_token_accuracy": 0.8995174169540405, "step": 74 }, { "epoch": 0.3488372093023256, "grad_norm": 2.7972140312194824, "learning_rate": 9.858260608175816e-06, "loss": 0.3253, "mean_token_accuracy": 0.8898718357086182, "step": 75 }, { "epoch": 0.35348837209302325, "grad_norm": 2.4993696212768555, "learning_rate": 9.849020939871951e-06, "loss": 0.2904, "mean_token_accuracy": 0.9023594856262207, "step": 76 }, { "epoch": 0.3581395348837209, "grad_norm": 2.5091350078582764, "learning_rate": 9.839494677676865e-06, "loss": 0.3146, "mean_token_accuracy": 0.8932511210441589, "step": 77 }, { "epoch": 0.3627906976744186, "grad_norm": 2.4673449993133545, "learning_rate": 9.829682449356807e-06, "loss": 0.2777, "mean_token_accuracy": 0.9046984314918518, "step": 78 }, { "epoch": 0.3674418604651163, "grad_norm": 2.791132688522339, "learning_rate": 9.819584901522761e-06, "loss": 0.3151, "mean_token_accuracy": 0.8915500044822693, "step": 79 }, { "epoch": 0.37209302325581395, "grad_norm": 2.322134017944336, "learning_rate": 9.809202699587828e-06, "loss": 0.3315, "mean_token_accuracy": 0.8860378861427307, "step": 80 }, { "epoch": 0.3767441860465116, "grad_norm": 2.75315523147583, "learning_rate": 9.798536527723388e-06, "loss": 0.2886, "mean_token_accuracy": 0.8997381329536438, "step": 81 }, { "epoch": 0.3813953488372093, "grad_norm": 2.623223066329956, "learning_rate": 9.787587088814007e-06, "loss": 0.2831, "mean_token_accuracy": 0.902046799659729, "step": 82 }, { "epoch": 0.386046511627907, "grad_norm": 2.417131185531616, "learning_rate": 9.776355104411123e-06, "loss": 0.2905, "mean_token_accuracy": 0.902239978313446, "step": 83 }, { "epoch": 0.39069767441860465, "grad_norm": 2.4174351692199707, "learning_rate": 9.764841314685487e-06, "loss": 0.3314, "mean_token_accuracy": 0.8844476938247681, "step": 84 }, { "epoch": 0.3953488372093023, "grad_norm": 2.363462448120117, "learning_rate": 9.753046478378403e-06, "loss": 0.3021, "mean_token_accuracy": 0.8904982805252075, "step": 85 }, { "epoch": 0.4, "grad_norm": 2.6582350730895996, "learning_rate": 9.740971372751715e-06, "loss": 0.3318, "mean_token_accuracy": 0.8901240825653076, "step": 86 }, { "epoch": 0.4046511627906977, "grad_norm": 2.460733652114868, "learning_rate": 9.728616793536588e-06, "loss": 0.3071, "mean_token_accuracy": 0.8879771828651428, "step": 87 }, { "epoch": 0.40930232558139534, "grad_norm": 3.4467663764953613, "learning_rate": 9.715983554881077e-06, "loss": 0.3561, "mean_token_accuracy": 0.8794929385185242, "step": 88 }, { "epoch": 0.413953488372093, "grad_norm": 2.4044029712677, "learning_rate": 9.703072489296467e-06, "loss": 0.3067, "mean_token_accuracy": 0.9014084339141846, "step": 89 }, { "epoch": 0.4186046511627907, "grad_norm": 2.3434884548187256, "learning_rate": 9.689884447602423e-06, "loss": 0.2613, "mean_token_accuracy": 0.9113546013832092, "step": 90 }, { "epoch": 0.4232558139534884, "grad_norm": 2.498213529586792, "learning_rate": 9.67642029887091e-06, "loss": 0.2702, "mean_token_accuracy": 0.9063438177108765, "step": 91 }, { "epoch": 0.42790697674418604, "grad_norm": 2.4761126041412354, "learning_rate": 9.662680930368934e-06, "loss": 0.3227, "mean_token_accuracy": 0.8934637308120728, "step": 92 }, { "epoch": 0.4325581395348837, "grad_norm": 2.3710319995880127, "learning_rate": 9.648667247500065e-06, "loss": 0.2956, "mean_token_accuracy": 0.8998901844024658, "step": 93 }, { "epoch": 0.4372093023255814, "grad_norm": 2.3822901248931885, "learning_rate": 9.634380173744771e-06, "loss": 0.3107, "mean_token_accuracy": 0.8973243236541748, "step": 94 }, { "epoch": 0.4418604651162791, "grad_norm": 2.4040417671203613, "learning_rate": 9.619820650599568e-06, "loss": 0.305, "mean_token_accuracy": 0.8939420580863953, "step": 95 }, { "epoch": 0.44651162790697674, "grad_norm": 2.35666823387146, "learning_rate": 9.604989637514976e-06, "loss": 0.2911, "mean_token_accuracy": 0.9011731147766113, "step": 96 }, { "epoch": 0.4511627906976744, "grad_norm": 2.3194568157196045, "learning_rate": 9.589888111832284e-06, "loss": 0.2524, "mean_token_accuracy": 0.910751223564148, "step": 97 }, { "epoch": 0.4558139534883721, "grad_norm": 2.354339838027954, "learning_rate": 9.57451706871916e-06, "loss": 0.3063, "mean_token_accuracy": 0.8894273638725281, "step": 98 }, { "epoch": 0.4604651162790698, "grad_norm": 2.2645998001098633, "learning_rate": 9.558877521104059e-06, "loss": 0.2627, "mean_token_accuracy": 0.9070965647697449, "step": 99 }, { "epoch": 0.46511627906976744, "grad_norm": 2.35956072807312, "learning_rate": 9.54297049960947e-06, "loss": 0.3314, "mean_token_accuracy": 0.8871257901191711, "step": 100 }, { "epoch": 0.4697674418604651, "grad_norm": 2.3613924980163574, "learning_rate": 9.526797052484013e-06, "loss": 0.3145, "mean_token_accuracy": 0.8908148407936096, "step": 101 }, { "epoch": 0.4744186046511628, "grad_norm": 2.3541440963745117, "learning_rate": 9.510358245533355e-06, "loss": 0.3106, "mean_token_accuracy": 0.8964611291885376, "step": 102 }, { "epoch": 0.4790697674418605, "grad_norm": 2.2093734741210938, "learning_rate": 9.493655162049963e-06, "loss": 0.3013, "mean_token_accuracy": 0.8970993161201477, "step": 103 }, { "epoch": 0.48372093023255813, "grad_norm": 2.1879260540008545, "learning_rate": 9.476688902741737e-06, "loss": 0.2825, "mean_token_accuracy": 0.9041797518730164, "step": 104 }, { "epoch": 0.4883720930232558, "grad_norm": 2.3790950775146484, "learning_rate": 9.459460585659461e-06, "loss": 0.3142, "mean_token_accuracy": 0.8897626399993896, "step": 105 }, { "epoch": 0.4930232558139535, "grad_norm": 2.383648633956909, "learning_rate": 9.44197134612313e-06, "loss": 0.2819, "mean_token_accuracy": 0.9058862924575806, "step": 106 }, { "epoch": 0.49767441860465117, "grad_norm": 2.3105363845825195, "learning_rate": 9.424222336647135e-06, "loss": 0.3438, "mean_token_accuracy": 0.889424741268158, "step": 107 }, { "epoch": 0.5023255813953489, "grad_norm": 2.4852828979492188, "learning_rate": 9.406214726864308e-06, "loss": 0.3239, "mean_token_accuracy": 0.884105384349823, "step": 108 }, { "epoch": 0.5069767441860465, "grad_norm": 2.5007684230804443, "learning_rate": 9.387949703448855e-06, "loss": 0.2802, "mean_token_accuracy": 0.9021934270858765, "step": 109 }, { "epoch": 0.5116279069767442, "grad_norm": 2.4788923263549805, "learning_rate": 9.369428470038146e-06, "loss": 0.2924, "mean_token_accuracy": 0.8947466611862183, "step": 110 }, { "epoch": 0.5162790697674419, "grad_norm": 2.3672244548797607, "learning_rate": 9.350652247153405e-06, "loss": 0.317, "mean_token_accuracy": 0.8881039023399353, "step": 111 }, { "epoch": 0.5209302325581395, "grad_norm": 2.3644192218780518, "learning_rate": 9.331622272119272e-06, "loss": 0.2936, "mean_token_accuracy": 0.897639274597168, "step": 112 }, { "epoch": 0.5255813953488372, "grad_norm": 2.3581173419952393, "learning_rate": 9.312339798982271e-06, "loss": 0.3333, "mean_token_accuracy": 0.8910724520683289, "step": 113 }, { "epoch": 0.5302325581395348, "grad_norm": 2.267174005508423, "learning_rate": 9.292806098428174e-06, "loss": 0.3053, "mean_token_accuracy": 0.9002794027328491, "step": 114 }, { "epoch": 0.5348837209302325, "grad_norm": 2.5837225914001465, "learning_rate": 9.27302245769825e-06, "loss": 0.3058, "mean_token_accuracy": 0.8905591368675232, "step": 115 }, { "epoch": 0.5395348837209303, "grad_norm": 2.686472177505493, "learning_rate": 9.252990180504451e-06, "loss": 0.3117, "mean_token_accuracy": 0.89449542760849, "step": 116 }, { "epoch": 0.5441860465116279, "grad_norm": 2.654763698577881, "learning_rate": 9.232710586943498e-06, "loss": 0.3193, "mean_token_accuracy": 0.8888066411018372, "step": 117 }, { "epoch": 0.5488372093023256, "grad_norm": 2.2310616970062256, "learning_rate": 9.21218501340988e-06, "loss": 0.2677, "mean_token_accuracy": 0.9084425568580627, "step": 118 }, { "epoch": 0.5534883720930233, "grad_norm": 2.363785743713379, "learning_rate": 9.1914148125078e-06, "loss": 0.2578, "mean_token_accuracy": 0.9098054766654968, "step": 119 }, { "epoch": 0.5581395348837209, "grad_norm": 2.268791437149048, "learning_rate": 9.170401352962028e-06, "loss": 0.2621, "mean_token_accuracy": 0.9081822633743286, "step": 120 }, { "epoch": 0.5627906976744186, "grad_norm": 2.216200590133667, "learning_rate": 9.149146019527715e-06, "loss": 0.2358, "mean_token_accuracy": 0.9134396314620972, "step": 121 }, { "epoch": 0.5674418604651162, "grad_norm": 2.481264114379883, "learning_rate": 9.127650212899133e-06, "loss": 0.3148, "mean_token_accuracy": 0.8915533423423767, "step": 122 }, { "epoch": 0.5720930232558139, "grad_norm": 2.7739858627319336, "learning_rate": 9.105915349617372e-06, "loss": 0.3498, "mean_token_accuracy": 0.8751838207244873, "step": 123 }, { "epoch": 0.5767441860465117, "grad_norm": 2.3887064456939697, "learning_rate": 9.083942861976991e-06, "loss": 0.3148, "mean_token_accuracy": 0.8894332647323608, "step": 124 }, { "epoch": 0.5813953488372093, "grad_norm": 2.2876875400543213, "learning_rate": 9.061734197931645e-06, "loss": 0.2463, "mean_token_accuracy": 0.9108673334121704, "step": 125 }, { "epoch": 0.586046511627907, "grad_norm": 2.95443058013916, "learning_rate": 9.03929082099864e-06, "loss": 0.33, "mean_token_accuracy": 0.8911877274513245, "step": 126 }, { "epoch": 0.5906976744186047, "grad_norm": 2.2911572456359863, "learning_rate": 9.016614210162523e-06, "loss": 0.257, "mean_token_accuracy": 0.9141084551811218, "step": 127 }, { "epoch": 0.5953488372093023, "grad_norm": 2.452328681945801, "learning_rate": 8.993705859777587e-06, "loss": 0.3074, "mean_token_accuracy": 0.8989298343658447, "step": 128 }, { "epoch": 0.6, "grad_norm": 2.8409242630004883, "learning_rate": 8.970567279469417e-06, "loss": 0.3673, "mean_token_accuracy": 0.8829707503318787, "step": 129 }, { "epoch": 0.6046511627906976, "grad_norm": 2.5934958457946777, "learning_rate": 8.947199994035402e-06, "loss": 0.292, "mean_token_accuracy": 0.8922224640846252, "step": 130 }, { "epoch": 0.6093023255813953, "grad_norm": 2.518958330154419, "learning_rate": 8.923605543344252e-06, "loss": 0.2793, "mean_token_accuracy": 0.9032871127128601, "step": 131 }, { "epoch": 0.6139534883720931, "grad_norm": 2.331040620803833, "learning_rate": 8.89978548223452e-06, "loss": 0.2803, "mean_token_accuracy": 0.8997518420219421, "step": 132 }, { "epoch": 0.6186046511627907, "grad_norm": 2.249236583709717, "learning_rate": 8.875741380412149e-06, "loss": 0.3143, "mean_token_accuracy": 0.8874413967132568, "step": 133 }, { "epoch": 0.6232558139534884, "grad_norm": 2.3988537788391113, "learning_rate": 8.85147482234702e-06, "loss": 0.3187, "mean_token_accuracy": 0.8939599394798279, "step": 134 }, { "epoch": 0.627906976744186, "grad_norm": 2.3432514667510986, "learning_rate": 8.826987407168546e-06, "loss": 0.308, "mean_token_accuracy": 0.8958589434623718, "step": 135 }, { "epoch": 0.6325581395348837, "grad_norm": 2.433220386505127, "learning_rate": 8.80228074856029e-06, "loss": 0.3033, "mean_token_accuracy": 0.8961129784584045, "step": 136 }, { "epoch": 0.6372093023255814, "grad_norm": 2.2877557277679443, "learning_rate": 8.777356474653623e-06, "loss": 0.2938, "mean_token_accuracy": 0.9027256965637207, "step": 137 }, { "epoch": 0.641860465116279, "grad_norm": 2.2342700958251953, "learning_rate": 8.752216227920436e-06, "loss": 0.3012, "mean_token_accuracy": 0.8945578336715698, "step": 138 }, { "epoch": 0.6465116279069767, "grad_norm": 2.2655529975891113, "learning_rate": 8.726861665064903e-06, "loss": 0.2812, "mean_token_accuracy": 0.9012300372123718, "step": 139 }, { "epoch": 0.6511627906976745, "grad_norm": 2.375288248062134, "learning_rate": 8.701294456914301e-06, "loss": 0.3202, "mean_token_accuracy": 0.8921115398406982, "step": 140 }, { "epoch": 0.6558139534883721, "grad_norm": 2.3848204612731934, "learning_rate": 8.675516288308916e-06, "loss": 0.3188, "mean_token_accuracy": 0.891019344329834, "step": 141 }, { "epoch": 0.6604651162790698, "grad_norm": 2.4406163692474365, "learning_rate": 8.649528857991005e-06, "loss": 0.2917, "mean_token_accuracy": 0.8987188339233398, "step": 142 }, { "epoch": 0.6651162790697674, "grad_norm": 2.3761322498321533, "learning_rate": 8.623333878492853e-06, "loss": 0.2779, "mean_token_accuracy": 0.9028142094612122, "step": 143 }, { "epoch": 0.6697674418604651, "grad_norm": 2.5542666912078857, "learning_rate": 8.596933076023927e-06, "loss": 0.3606, "mean_token_accuracy": 0.8800684213638306, "step": 144 }, { "epoch": 0.6744186046511628, "grad_norm": 2.644892692565918, "learning_rate": 8.57032819035711e-06, "loss": 0.3358, "mean_token_accuracy": 0.8846637010574341, "step": 145 }, { "epoch": 0.6790697674418604, "grad_norm": 2.3639047145843506, "learning_rate": 8.543520974714062e-06, "loss": 0.3492, "mean_token_accuracy": 0.8764985203742981, "step": 146 }, { "epoch": 0.6837209302325581, "grad_norm": 2.4288532733917236, "learning_rate": 8.516513195649686e-06, "loss": 0.2936, "mean_token_accuracy": 0.8934412002563477, "step": 147 }, { "epoch": 0.6883720930232559, "grad_norm": 2.626235246658325, "learning_rate": 8.489306632935698e-06, "loss": 0.2826, "mean_token_accuracy": 0.9025585055351257, "step": 148 }, { "epoch": 0.6930232558139535, "grad_norm": 2.367561101913452, "learning_rate": 8.461903079443367e-06, "loss": 0.3243, "mean_token_accuracy": 0.8897247314453125, "step": 149 }, { "epoch": 0.6976744186046512, "grad_norm": 2.6119275093078613, "learning_rate": 8.434304341025352e-06, "loss": 0.2953, "mean_token_accuracy": 0.8976051211357117, "step": 150 }, { "epoch": 0.7023255813953488, "grad_norm": 2.146310329437256, "learning_rate": 8.406512236396705e-06, "loss": 0.2886, "mean_token_accuracy": 0.8988166451454163, "step": 151 }, { "epoch": 0.7069767441860465, "grad_norm": 2.6668293476104736, "learning_rate": 8.378528597015011e-06, "loss": 0.3446, "mean_token_accuracy": 0.8813462853431702, "step": 152 }, { "epoch": 0.7116279069767442, "grad_norm": 2.6185600757598877, "learning_rate": 8.350355266959715e-06, "loss": 0.3204, "mean_token_accuracy": 0.8888283371925354, "step": 153 }, { "epoch": 0.7162790697674418, "grad_norm": 2.2908973693847656, "learning_rate": 8.321994102810585e-06, "loss": 0.2912, "mean_token_accuracy": 0.8983022570610046, "step": 154 }, { "epoch": 0.7209302325581395, "grad_norm": 2.1010096073150635, "learning_rate": 8.293446973525368e-06, "loss": 0.2701, "mean_token_accuracy": 0.9079765677452087, "step": 155 }, { "epoch": 0.7255813953488373, "grad_norm": 2.3426928520202637, "learning_rate": 8.26471576031664e-06, "loss": 0.2493, "mean_token_accuracy": 0.9087682962417603, "step": 156 }, { "epoch": 0.7302325581395349, "grad_norm": 2.581718683242798, "learning_rate": 8.235802356527821e-06, "loss": 0.3091, "mean_token_accuracy": 0.8938986659049988, "step": 157 }, { "epoch": 0.7348837209302326, "grad_norm": 2.3071372509002686, "learning_rate": 8.206708667508418e-06, "loss": 0.2966, "mean_token_accuracy": 0.8992342352867126, "step": 158 }, { "epoch": 0.7395348837209302, "grad_norm": 2.2428340911865234, "learning_rate": 8.177436610488459e-06, "loss": 0.3142, "mean_token_accuracy": 0.8913880586624146, "step": 159 }, { "epoch": 0.7441860465116279, "grad_norm": 2.289311647415161, "learning_rate": 8.147988114452159e-06, "loss": 0.3302, "mean_token_accuracy": 0.885582685470581, "step": 160 }, { "epoch": 0.7488372093023256, "grad_norm": 2.49139666557312, "learning_rate": 8.11836512001079e-06, "loss": 0.2575, "mean_token_accuracy": 0.9106836318969727, "step": 161 }, { "epoch": 0.7534883720930232, "grad_norm": 2.2492876052856445, "learning_rate": 8.088569579274804e-06, "loss": 0.2915, "mean_token_accuracy": 0.9044934511184692, "step": 162 }, { "epoch": 0.7581395348837209, "grad_norm": 2.5655603408813477, "learning_rate": 8.058603455725202e-06, "loss": 0.3009, "mean_token_accuracy": 0.9001610279083252, "step": 163 }, { "epoch": 0.7627906976744186, "grad_norm": 2.376429319381714, "learning_rate": 8.028468724084121e-06, "loss": 0.3134, "mean_token_accuracy": 0.8932521343231201, "step": 164 }, { "epoch": 0.7674418604651163, "grad_norm": 2.4902546405792236, "learning_rate": 7.99816737018473e-06, "loss": 0.3132, "mean_token_accuracy": 0.8874028325080872, "step": 165 }, { "epoch": 0.772093023255814, "grad_norm": 2.286018133163452, "learning_rate": 7.967701390840339e-06, "loss": 0.3051, "mean_token_accuracy": 0.8954407572746277, "step": 166 }, { "epoch": 0.7767441860465116, "grad_norm": 2.1808741092681885, "learning_rate": 7.93707279371283e-06, "loss": 0.2888, "mean_token_accuracy": 0.9057142734527588, "step": 167 }, { "epoch": 0.7813953488372093, "grad_norm": 2.4743106365203857, "learning_rate": 7.906283597180357e-06, "loss": 0.3272, "mean_token_accuracy": 0.8907923102378845, "step": 168 }, { "epoch": 0.786046511627907, "grad_norm": 2.4288330078125, "learning_rate": 7.875335830204323e-06, "loss": 0.2612, "mean_token_accuracy": 0.9085434675216675, "step": 169 }, { "epoch": 0.7906976744186046, "grad_norm": 2.390599012374878, "learning_rate": 7.844231532195686e-06, "loss": 0.2717, "mean_token_accuracy": 0.9026373028755188, "step": 170 }, { "epoch": 0.7953488372093023, "grad_norm": 2.386613368988037, "learning_rate": 7.812972752880566e-06, "loss": 0.3131, "mean_token_accuracy": 0.8961803913116455, "step": 171 }, { "epoch": 0.8, "grad_norm": 2.338465929031372, "learning_rate": 7.781561552165156e-06, "loss": 0.2846, "mean_token_accuracy": 0.9064022302627563, "step": 172 }, { "epoch": 0.8046511627906977, "grad_norm": 2.4404520988464355, "learning_rate": 7.75e-06, "loss": 0.301, "mean_token_accuracy": 0.9013170003890991, "step": 173 }, { "epoch": 0.8093023255813954, "grad_norm": 2.441054582595825, "learning_rate": 7.71829017624357e-06, "loss": 0.2902, "mean_token_accuracy": 0.8965333104133606, "step": 174 }, { "epoch": 0.813953488372093, "grad_norm": 2.3165605068206787, "learning_rate": 7.686434170525213e-06, "loss": 0.2986, "mean_token_accuracy": 0.8884420394897461, "step": 175 }, { "epoch": 0.8186046511627907, "grad_norm": 2.340376377105713, "learning_rate": 7.654434082107442e-06, "loss": 0.2753, "mean_token_accuracy": 0.9065356254577637, "step": 176 }, { "epoch": 0.8232558139534883, "grad_norm": 5.1856608390808105, "learning_rate": 7.622292019747604e-06, "loss": 0.3261, "mean_token_accuracy": 0.887184739112854, "step": 177 }, { "epoch": 0.827906976744186, "grad_norm": 2.327040195465088, "learning_rate": 7.590010101558913e-06, "loss": 0.2815, "mean_token_accuracy": 0.9034022688865662, "step": 178 }, { "epoch": 0.8325581395348837, "grad_norm": 2.146498203277588, "learning_rate": 7.557590454870874e-06, "loss": 0.2995, "mean_token_accuracy": 0.8991208672523499, "step": 179 }, { "epoch": 0.8372093023255814, "grad_norm": 2.472043514251709, "learning_rate": 7.525035216089086e-06, "loss": 0.2884, "mean_token_accuracy": 0.900874137878418, "step": 180 }, { "epoch": 0.8418604651162791, "grad_norm": 2.309894561767578, "learning_rate": 7.492346530554463e-06, "loss": 0.2919, "mean_token_accuracy": 0.8961777091026306, "step": 181 }, { "epoch": 0.8465116279069768, "grad_norm": 2.3329412937164307, "learning_rate": 7.459526552401861e-06, "loss": 0.2993, "mean_token_accuracy": 0.9014893174171448, "step": 182 }, { "epoch": 0.8511627906976744, "grad_norm": 2.2241170406341553, "learning_rate": 7.4265774444181184e-06, "loss": 0.3036, "mean_token_accuracy": 0.8940504193305969, "step": 183 }, { "epoch": 0.8558139534883721, "grad_norm": 2.4457859992980957, "learning_rate": 7.39350137789953e-06, "loss": 0.3062, "mean_token_accuracy": 0.894417941570282, "step": 184 }, { "epoch": 0.8604651162790697, "grad_norm": 2.253269672393799, "learning_rate": 7.360300532508775e-06, "loss": 0.282, "mean_token_accuracy": 0.9023967385292053, "step": 185 }, { "epoch": 0.8651162790697674, "grad_norm": 2.2496144771575928, "learning_rate": 7.3269770961312616e-06, "loss": 0.3094, "mean_token_accuracy": 0.8949607610702515, "step": 186 }, { "epoch": 0.8697674418604651, "grad_norm": 2.237771511077881, "learning_rate": 7.2935332647309624e-06, "loss": 0.271, "mean_token_accuracy": 0.9009068608283997, "step": 187 }, { "epoch": 0.8744186046511628, "grad_norm": 2.1698238849639893, "learning_rate": 7.259971242205702e-06, "loss": 0.293, "mean_token_accuracy": 0.9026434421539307, "step": 188 }, { "epoch": 0.8790697674418605, "grad_norm": 2.1061506271362305, "learning_rate": 7.226293240241918e-06, "loss": 0.2693, "mean_token_accuracy": 0.9091058969497681, "step": 189 }, { "epoch": 0.8837209302325582, "grad_norm": 2.3756353855133057, "learning_rate": 7.1925014781689185e-06, "loss": 0.2892, "mean_token_accuracy": 0.8960843086242676, "step": 190 }, { "epoch": 0.8883720930232558, "grad_norm": 2.4632489681243896, "learning_rate": 7.158598182812628e-06, "loss": 0.337, "mean_token_accuracy": 0.8823441863059998, "step": 191 }, { "epoch": 0.8930232558139535, "grad_norm": 2.2445359230041504, "learning_rate": 7.12458558834885e-06, "loss": 0.2502, "mean_token_accuracy": 0.9130662083625793, "step": 192 }, { "epoch": 0.8976744186046511, "grad_norm": 2.2357635498046875, "learning_rate": 7.090465936156028e-06, "loss": 0.2964, "mean_token_accuracy": 0.8986391425132751, "step": 193 }, { "epoch": 0.9023255813953488, "grad_norm": 2.2954163551330566, "learning_rate": 7.056241474667552e-06, "loss": 0.2769, "mean_token_accuracy": 0.9078544974327087, "step": 194 }, { "epoch": 0.9069767441860465, "grad_norm": 2.27939510345459, "learning_rate": 7.021914459223586e-06, "loss": 0.2765, "mean_token_accuracy": 0.9080632925033569, "step": 195 }, { "epoch": 0.9116279069767442, "grad_norm": 2.234778881072998, "learning_rate": 6.987487151922439e-06, "loss": 0.3069, "mean_token_accuracy": 0.8933604955673218, "step": 196 }, { "epoch": 0.9162790697674419, "grad_norm": 2.3990750312805176, "learning_rate": 6.952961821471509e-06, "loss": 0.2639, "mean_token_accuracy": 0.9116538166999817, "step": 197 }, { "epoch": 0.9209302325581395, "grad_norm": 2.3618667125701904, "learning_rate": 6.9183407430377645e-06, "loss": 0.2734, "mean_token_accuracy": 0.9068870544433594, "step": 198 }, { "epoch": 0.9255813953488372, "grad_norm": 2.492704153060913, "learning_rate": 6.883626198097825e-06, "loss": 0.3028, "mean_token_accuracy": 0.8957169651985168, "step": 199 }, { "epoch": 0.9302325581395349, "grad_norm": 2.458460569381714, "learning_rate": 6.84882047428761e-06, "loss": 0.2783, "mean_token_accuracy": 0.906191349029541, "step": 200 }, { "epoch": 0.9348837209302325, "grad_norm": 2.3292219638824463, "learning_rate": 6.813925865251587e-06, "loss": 0.305, "mean_token_accuracy": 0.8985507488250732, "step": 201 }, { "epoch": 0.9395348837209302, "grad_norm": 2.1781818866729736, "learning_rate": 6.77894467049163e-06, "loss": 0.2698, "mean_token_accuracy": 0.9075833559036255, "step": 202 }, { "epoch": 0.9441860465116279, "grad_norm": 2.580416679382324, "learning_rate": 6.743879195215472e-06, "loss": 0.2944, "mean_token_accuracy": 0.8997113704681396, "step": 203 }, { "epoch": 0.9488372093023256, "grad_norm": 2.0918219089508057, "learning_rate": 6.708731750184815e-06, "loss": 0.2585, "mean_token_accuracy": 0.9011285305023193, "step": 204 }, { "epoch": 0.9534883720930233, "grad_norm": 2.086010456085205, "learning_rate": 6.673504651563035e-06, "loss": 0.2583, "mean_token_accuracy": 0.9062029719352722, "step": 205 }, { "epoch": 0.958139534883721, "grad_norm": 2.3076541423797607, "learning_rate": 6.638200220762563e-06, "loss": 0.2861, "mean_token_accuracy": 0.9016419649124146, "step": 206 }, { "epoch": 0.9627906976744186, "grad_norm": 2.5171306133270264, "learning_rate": 6.602820784291907e-06, "loss": 0.3105, "mean_token_accuracy": 0.8922398090362549, "step": 207 }, { "epoch": 0.9674418604651163, "grad_norm": 2.2755181789398193, "learning_rate": 6.5673686736023245e-06, "loss": 0.2753, "mean_token_accuracy": 0.9048746824264526, "step": 208 }, { "epoch": 0.9720930232558139, "grad_norm": 2.164433479309082, "learning_rate": 6.531846224934206e-06, "loss": 0.2596, "mean_token_accuracy": 0.9133898019790649, "step": 209 }, { "epoch": 0.9767441860465116, "grad_norm": 2.0087199211120605, "learning_rate": 6.4962557791631e-06, "loss": 0.2715, "mean_token_accuracy": 0.9074162840843201, "step": 210 }, { "epoch": 0.9813953488372092, "grad_norm": 2.2094898223876953, "learning_rate": 6.460599681645462e-06, "loss": 0.2664, "mean_token_accuracy": 0.907378077507019, "step": 211 }, { "epoch": 0.986046511627907, "grad_norm": 2.1591098308563232, "learning_rate": 6.424880282064103e-06, "loss": 0.2926, "mean_token_accuracy": 0.9012137055397034, "step": 212 }, { "epoch": 0.9906976744186047, "grad_norm": 2.1089112758636475, "learning_rate": 6.3890999342733396e-06, "loss": 0.2566, "mean_token_accuracy": 0.9117971062660217, "step": 213 }, { "epoch": 0.9953488372093023, "grad_norm": 2.244309186935425, "learning_rate": 6.353260996143884e-06, "loss": 0.2977, "mean_token_accuracy": 0.8972173929214478, "step": 214 }, { "epoch": 1.0, "grad_norm": 1.9823395013809204, "learning_rate": 6.317365829407465e-06, "loss": 0.2511, "mean_token_accuracy": 0.9041892886161804, "step": 215 }, { "epoch": 1.0046511627906978, "grad_norm": 3.1557061672210693, "learning_rate": 6.281416799501188e-06, "loss": 0.1454, "mean_token_accuracy": 0.9540554881095886, "step": 216 }, { "epoch": 1.0093023255813953, "grad_norm": 2.734393835067749, "learning_rate": 6.245416275411661e-06, "loss": 0.1622, "mean_token_accuracy": 0.9430840611457825, "step": 217 }, { "epoch": 1.013953488372093, "grad_norm": 2.5812294483184814, "learning_rate": 6.2093666295188816e-06, "loss": 0.1363, "mean_token_accuracy": 0.955654501914978, "step": 218 }, { "epoch": 1.0186046511627906, "grad_norm": 2.294768810272217, "learning_rate": 6.173270237439901e-06, "loss": 0.1409, "mean_token_accuracy": 0.9525974988937378, "step": 219 }, { "epoch": 1.0232558139534884, "grad_norm": 2.10762619972229, "learning_rate": 6.1371294778722705e-06, "loss": 0.1142, "mean_token_accuracy": 0.9630529284477234, "step": 220 }, { "epoch": 1.027906976744186, "grad_norm": 1.7485405206680298, "learning_rate": 6.100946732437291e-06, "loss": 0.1396, "mean_token_accuracy": 0.9498718976974487, "step": 221 }, { "epoch": 1.0325581395348837, "grad_norm": 1.6996605396270752, "learning_rate": 6.064724385523073e-06, "loss": 0.1211, "mean_token_accuracy": 0.9572222232818604, "step": 222 }, { "epoch": 1.0372093023255813, "grad_norm": 1.8055946826934814, "learning_rate": 6.028464824127399e-06, "loss": 0.1245, "mean_token_accuracy": 0.9569948315620422, "step": 223 }, { "epoch": 1.041860465116279, "grad_norm": 2.2612037658691406, "learning_rate": 5.992170437700436e-06, "loss": 0.1184, "mean_token_accuracy": 0.9553605318069458, "step": 224 }, { "epoch": 1.0465116279069768, "grad_norm": 2.0997769832611084, "learning_rate": 5.955843617987259e-06, "loss": 0.1302, "mean_token_accuracy": 0.9527406692504883, "step": 225 }, { "epoch": 1.0511627906976744, "grad_norm": 2.1007702350616455, "learning_rate": 5.919486758870257e-06, "loss": 0.118, "mean_token_accuracy": 0.9590943455696106, "step": 226 }, { "epoch": 1.0558139534883721, "grad_norm": 2.0043294429779053, "learning_rate": 5.883102256211361e-06, "loss": 0.1404, "mean_token_accuracy": 0.9549180269241333, "step": 227 }, { "epoch": 1.0604651162790697, "grad_norm": 2.222476005554199, "learning_rate": 5.8466925076941785e-06, "loss": 0.142, "mean_token_accuracy": 0.9515007734298706, "step": 228 }, { "epoch": 1.0651162790697675, "grad_norm": 2.368314504623413, "learning_rate": 5.810259912665973e-06, "loss": 0.1406, "mean_token_accuracy": 0.9539972543716431, "step": 229 }, { "epoch": 1.069767441860465, "grad_norm": 1.954593300819397, "learning_rate": 5.773806871979564e-06, "loss": 0.1184, "mean_token_accuracy": 0.958109974861145, "step": 230 }, { "epoch": 1.0744186046511628, "grad_norm": 1.8620705604553223, "learning_rate": 5.7373357878351055e-06, "loss": 0.1202, "mean_token_accuracy": 0.9591098427772522, "step": 231 }, { "epoch": 1.0790697674418606, "grad_norm": 1.718745231628418, "learning_rate": 5.700849063621789e-06, "loss": 0.1072, "mean_token_accuracy": 0.9625619053840637, "step": 232 }, { "epoch": 1.083720930232558, "grad_norm": 2.1164543628692627, "learning_rate": 5.664349103759467e-06, "loss": 0.1303, "mean_token_accuracy": 0.9507201313972473, "step": 233 }, { "epoch": 1.0883720930232559, "grad_norm": 1.8693439960479736, "learning_rate": 5.627838313540191e-06, "loss": 0.1475, "mean_token_accuracy": 0.9470409750938416, "step": 234 }, { "epoch": 1.0930232558139534, "grad_norm": 1.944366216659546, "learning_rate": 5.591319098969727e-06, "loss": 0.1476, "mean_token_accuracy": 0.9515094757080078, "step": 235 }, { "epoch": 1.0976744186046512, "grad_norm": 1.9349125623703003, "learning_rate": 5.55479386660899e-06, "loss": 0.1238, "mean_token_accuracy": 0.9569832682609558, "step": 236 }, { "epoch": 1.1023255813953488, "grad_norm": 1.7475922107696533, "learning_rate": 5.5182650234154544e-06, "loss": 0.1181, "mean_token_accuracy": 0.9598400592803955, "step": 237 }, { "epoch": 1.1069767441860465, "grad_norm": 1.7131977081298828, "learning_rate": 5.481734976584546e-06, "loss": 0.1207, "mean_token_accuracy": 0.955795168876648, "step": 238 }, { "epoch": 1.1116279069767443, "grad_norm": 1.8039817810058594, "learning_rate": 5.4452061333910125e-06, "loss": 0.1257, "mean_token_accuracy": 0.9571937918663025, "step": 239 }, { "epoch": 1.1162790697674418, "grad_norm": 1.6558111906051636, "learning_rate": 5.4086809010302734e-06, "loss": 0.1084, "mean_token_accuracy": 0.9620627164840698, "step": 240 }, { "epoch": 1.1209302325581396, "grad_norm": 1.880353331565857, "learning_rate": 5.3721616864598094e-06, "loss": 0.1209, "mean_token_accuracy": 0.95799320936203, "step": 241 }, { "epoch": 1.1255813953488372, "grad_norm": 1.9320411682128906, "learning_rate": 5.3356508962405355e-06, "loss": 0.1293, "mean_token_accuracy": 0.9533839821815491, "step": 242 }, { "epoch": 1.130232558139535, "grad_norm": 1.9183322191238403, "learning_rate": 5.299150936378212e-06, "loss": 0.1248, "mean_token_accuracy": 0.9582386016845703, "step": 243 }, { "epoch": 1.1348837209302325, "grad_norm": 3.065585136413574, "learning_rate": 5.262664212164898e-06, "loss": 0.1176, "mean_token_accuracy": 0.9561497569084167, "step": 244 }, { "epoch": 1.1395348837209303, "grad_norm": 1.8231462240219116, "learning_rate": 5.226193128020438e-06, "loss": 0.1244, "mean_token_accuracy": 0.9571564793586731, "step": 245 }, { "epoch": 1.1441860465116278, "grad_norm": 1.9364738464355469, "learning_rate": 5.189740087334029e-06, "loss": 0.1279, "mean_token_accuracy": 0.952070415019989, "step": 246 }, { "epoch": 1.1488372093023256, "grad_norm": 1.7137261629104614, "learning_rate": 5.153307492305824e-06, "loss": 0.1109, "mean_token_accuracy": 0.959574818611145, "step": 247 }, { "epoch": 1.1534883720930234, "grad_norm": 1.8068032264709473, "learning_rate": 5.116897743788639e-06, "loss": 0.1196, "mean_token_accuracy": 0.9579454064369202, "step": 248 }, { "epoch": 1.158139534883721, "grad_norm": 2.0845437049865723, "learning_rate": 5.080513241129745e-06, "loss": 0.1388, "mean_token_accuracy": 0.950688898563385, "step": 249 }, { "epoch": 1.1627906976744187, "grad_norm": 2.2837700843811035, "learning_rate": 5.044156382012742e-06, "loss": 0.1194, "mean_token_accuracy": 0.959080159664154, "step": 250 }, { "epoch": 1.1674418604651162, "grad_norm": 3.3996706008911133, "learning_rate": 5.007829562299567e-06, "loss": 0.1162, "mean_token_accuracy": 0.9592645764350891, "step": 251 }, { "epoch": 1.172093023255814, "grad_norm": 1.7496733665466309, "learning_rate": 4.9715351758726015e-06, "loss": 0.1233, "mean_token_accuracy": 0.957980215549469, "step": 252 }, { "epoch": 1.1767441860465115, "grad_norm": 2.1975224018096924, "learning_rate": 4.9352756144769285e-06, "loss": 0.1487, "mean_token_accuracy": 0.9537729620933533, "step": 253 }, { "epoch": 1.1813953488372093, "grad_norm": 2.0753679275512695, "learning_rate": 4.89905326756271e-06, "loss": 0.115, "mean_token_accuracy": 0.9599113464355469, "step": 254 }, { "epoch": 1.1860465116279069, "grad_norm": 2.137821912765503, "learning_rate": 4.862870522127731e-06, "loss": 0.1306, "mean_token_accuracy": 0.9522392749786377, "step": 255 }, { "epoch": 1.1906976744186046, "grad_norm": 1.751489520072937, "learning_rate": 4.8267297625601e-06, "loss": 0.1194, "mean_token_accuracy": 0.9590802192687988, "step": 256 }, { "epoch": 1.1953488372093024, "grad_norm": 1.7990120649337769, "learning_rate": 4.790633370481121e-06, "loss": 0.119, "mean_token_accuracy": 0.9594948291778564, "step": 257 }, { "epoch": 1.2, "grad_norm": 1.755265474319458, "learning_rate": 4.754583724588342e-06, "loss": 0.1143, "mean_token_accuracy": 0.9570131301879883, "step": 258 }, { "epoch": 1.2046511627906977, "grad_norm": 1.8024441003799438, "learning_rate": 4.718583200498814e-06, "loss": 0.1064, "mean_token_accuracy": 0.9623507857322693, "step": 259 }, { "epoch": 1.2093023255813953, "grad_norm": 1.8982021808624268, "learning_rate": 4.682634170592537e-06, "loss": 0.1362, "mean_token_accuracy": 0.9527772665023804, "step": 260 }, { "epoch": 1.213953488372093, "grad_norm": 1.9453184604644775, "learning_rate": 4.646739003856117e-06, "loss": 0.1338, "mean_token_accuracy": 0.9539026618003845, "step": 261 }, { "epoch": 1.2186046511627908, "grad_norm": 1.8251960277557373, "learning_rate": 4.610900065726661e-06, "loss": 0.1315, "mean_token_accuracy": 0.9544464349746704, "step": 262 }, { "epoch": 1.2232558139534884, "grad_norm": 1.9196908473968506, "learning_rate": 4.575119717935898e-06, "loss": 0.1272, "mean_token_accuracy": 0.9516383409500122, "step": 263 }, { "epoch": 1.2279069767441861, "grad_norm": 1.615856647491455, "learning_rate": 4.53940031835454e-06, "loss": 0.1187, "mean_token_accuracy": 0.9601250886917114, "step": 264 }, { "epoch": 1.2325581395348837, "grad_norm": 1.9295324087142944, "learning_rate": 4.503744220836902e-06, "loss": 0.1485, "mean_token_accuracy": 0.9450379610061646, "step": 265 }, { "epoch": 1.2372093023255815, "grad_norm": 2.03997802734375, "learning_rate": 4.468153775065795e-06, "loss": 0.1439, "mean_token_accuracy": 0.9504145383834839, "step": 266 }, { "epoch": 1.241860465116279, "grad_norm": 2.085965394973755, "learning_rate": 4.432631326397676e-06, "loss": 0.1598, "mean_token_accuracy": 0.9417144656181335, "step": 267 }, { "epoch": 1.2465116279069768, "grad_norm": 1.768497347831726, "learning_rate": 4.397179215708095e-06, "loss": 0.108, "mean_token_accuracy": 0.9582984447479248, "step": 268 }, { "epoch": 1.2511627906976743, "grad_norm": 1.8647841215133667, "learning_rate": 4.3617997792374365e-06, "loss": 0.1178, "mean_token_accuracy": 0.9578408598899841, "step": 269 }, { "epoch": 1.255813953488372, "grad_norm": 1.831526279449463, "learning_rate": 4.326495348436966e-06, "loss": 0.1085, "mean_token_accuracy": 0.9653323292732239, "step": 270 }, { "epoch": 1.2604651162790699, "grad_norm": 1.904144287109375, "learning_rate": 4.291268249815188e-06, "loss": 0.1176, "mean_token_accuracy": 0.9586750268936157, "step": 271 }, { "epoch": 1.2651162790697674, "grad_norm": 2.3330485820770264, "learning_rate": 4.256120804784528e-06, "loss": 0.1105, "mean_token_accuracy": 0.9589130878448486, "step": 272 }, { "epoch": 1.2697674418604652, "grad_norm": 1.7716329097747803, "learning_rate": 4.221055329508372e-06, "loss": 0.1061, "mean_token_accuracy": 0.9632218480110168, "step": 273 }, { "epoch": 1.2744186046511627, "grad_norm": 1.6046407222747803, "learning_rate": 4.186074134748414e-06, "loss": 0.1012, "mean_token_accuracy": 0.9630866646766663, "step": 274 }, { "epoch": 1.2790697674418605, "grad_norm": 1.8529785871505737, "learning_rate": 4.151179525712392e-06, "loss": 0.1206, "mean_token_accuracy": 0.9566202163696289, "step": 275 }, { "epoch": 1.283720930232558, "grad_norm": 1.8019949197769165, "learning_rate": 4.116373801902176e-06, "loss": 0.1035, "mean_token_accuracy": 0.9640029072761536, "step": 276 }, { "epoch": 1.2883720930232558, "grad_norm": 2.1905837059020996, "learning_rate": 4.081659256962237e-06, "loss": 0.1273, "mean_token_accuracy": 0.9552264213562012, "step": 277 }, { "epoch": 1.2930232558139534, "grad_norm": 1.8310658931732178, "learning_rate": 4.047038178528494e-06, "loss": 0.1202, "mean_token_accuracy": 0.9595242142677307, "step": 278 }, { "epoch": 1.2976744186046512, "grad_norm": 1.7499853372573853, "learning_rate": 4.012512848077562e-06, "loss": 0.1129, "mean_token_accuracy": 0.9607168436050415, "step": 279 }, { "epoch": 1.302325581395349, "grad_norm": 1.76249361038208, "learning_rate": 3.978085540776416e-06, "loss": 0.1158, "mean_token_accuracy": 0.9601153135299683, "step": 280 }, { "epoch": 1.3069767441860465, "grad_norm": 1.9077409505844116, "learning_rate": 3.94375852533245e-06, "loss": 0.1326, "mean_token_accuracy": 0.9523097276687622, "step": 281 }, { "epoch": 1.3116279069767443, "grad_norm": 2.2027199268341064, "learning_rate": 3.9095340638439735e-06, "loss": 0.1613, "mean_token_accuracy": 0.9432445764541626, "step": 282 }, { "epoch": 1.3162790697674418, "grad_norm": 1.6710178852081299, "learning_rate": 3.8754144116511516e-06, "loss": 0.106, "mean_token_accuracy": 0.9620450735092163, "step": 283 }, { "epoch": 1.3209302325581396, "grad_norm": 1.8264927864074707, "learning_rate": 3.8414018171873725e-06, "loss": 0.1296, "mean_token_accuracy": 0.9531369209289551, "step": 284 }, { "epoch": 1.3255813953488373, "grad_norm": 1.7415143251419067, "learning_rate": 3.8074985218310833e-06, "loss": 0.0949, "mean_token_accuracy": 0.9662027955055237, "step": 285 }, { "epoch": 1.330232558139535, "grad_norm": 1.7828929424285889, "learning_rate": 3.7737067597580822e-06, "loss": 0.1238, "mean_token_accuracy": 0.9578856825828552, "step": 286 }, { "epoch": 1.3348837209302324, "grad_norm": 1.7337418794631958, "learning_rate": 3.7400287577942994e-06, "loss": 0.1313, "mean_token_accuracy": 0.9521440863609314, "step": 287 }, { "epoch": 1.3395348837209302, "grad_norm": 2.2492868900299072, "learning_rate": 3.7064667352690386e-06, "loss": 0.1318, "mean_token_accuracy": 0.9546619653701782, "step": 288 }, { "epoch": 1.344186046511628, "grad_norm": 1.9296540021896362, "learning_rate": 3.6730229038687403e-06, "loss": 0.1205, "mean_token_accuracy": 0.9589089155197144, "step": 289 }, { "epoch": 1.3488372093023255, "grad_norm": 2.0041182041168213, "learning_rate": 3.639699467491228e-06, "loss": 0.1267, "mean_token_accuracy": 0.9551070332527161, "step": 290 }, { "epoch": 1.3534883720930233, "grad_norm": 2.0701072216033936, "learning_rate": 3.6064986221004704e-06, "loss": 0.1407, "mean_token_accuracy": 0.9523890614509583, "step": 291 }, { "epoch": 1.3581395348837209, "grad_norm": 1.9877480268478394, "learning_rate": 3.5734225555818847e-06, "loss": 0.1376, "mean_token_accuracy": 0.9489011168479919, "step": 292 }, { "epoch": 1.3627906976744186, "grad_norm": 1.8878151178359985, "learning_rate": 3.5404734475981405e-06, "loss": 0.1275, "mean_token_accuracy": 0.9542827010154724, "step": 293 }, { "epoch": 1.3674418604651164, "grad_norm": 2.095825672149658, "learning_rate": 3.5076534694455376e-06, "loss": 0.1026, "mean_token_accuracy": 0.9636394381523132, "step": 294 }, { "epoch": 1.372093023255814, "grad_norm": 1.8154828548431396, "learning_rate": 3.474964783910916e-06, "loss": 0.1295, "mean_token_accuracy": 0.9553645253181458, "step": 295 }, { "epoch": 1.3767441860465115, "grad_norm": 1.8968881368637085, "learning_rate": 3.4424095451291273e-06, "loss": 0.1288, "mean_token_accuracy": 0.9542672038078308, "step": 296 }, { "epoch": 1.3813953488372093, "grad_norm": 1.8607019186019897, "learning_rate": 3.409989898441086e-06, "loss": 0.1278, "mean_token_accuracy": 0.9535946846008301, "step": 297 }, { "epoch": 1.386046511627907, "grad_norm": 1.7035014629364014, "learning_rate": 3.3777079802523976e-06, "loss": 0.117, "mean_token_accuracy": 0.9564270377159119, "step": 298 }, { "epoch": 1.3906976744186046, "grad_norm": 1.8858115673065186, "learning_rate": 3.345565917892561e-06, "loss": 0.1174, "mean_token_accuracy": 0.9577510952949524, "step": 299 }, { "epoch": 1.3953488372093024, "grad_norm": 2.2695724964141846, "learning_rate": 3.3135658294747886e-06, "loss": 0.1232, "mean_token_accuracy": 0.9565290212631226, "step": 300 }, { "epoch": 1.4, "grad_norm": 1.6535850763320923, "learning_rate": 3.2817098237564292e-06, "loss": 0.1078, "mean_token_accuracy": 0.95890873670578, "step": 301 }, { "epoch": 1.4046511627906977, "grad_norm": 1.7052584886550903, "learning_rate": 3.2500000000000015e-06, "loss": 0.1302, "mean_token_accuracy": 0.9495029449462891, "step": 302 }, { "epoch": 1.4093023255813955, "grad_norm": 1.785459280014038, "learning_rate": 3.218438447834845e-06, "loss": 0.1311, "mean_token_accuracy": 0.9482329487800598, "step": 303 }, { "epoch": 1.413953488372093, "grad_norm": 1.7197625637054443, "learning_rate": 3.1870272471194363e-06, "loss": 0.1245, "mean_token_accuracy": 0.9550527930259705, "step": 304 }, { "epoch": 1.4186046511627908, "grad_norm": 1.738937497138977, "learning_rate": 3.1557684678043145e-06, "loss": 0.1393, "mean_token_accuracy": 0.9483538866043091, "step": 305 }, { "epoch": 1.4232558139534883, "grad_norm": 1.6273069381713867, "learning_rate": 3.124664169795677e-06, "loss": 0.1112, "mean_token_accuracy": 0.9574396014213562, "step": 306 }, { "epoch": 1.427906976744186, "grad_norm": 2.0904576778411865, "learning_rate": 3.0937164028196443e-06, "loss": 0.1204, "mean_token_accuracy": 0.958777666091919, "step": 307 }, { "epoch": 1.4325581395348836, "grad_norm": 1.9097553491592407, "learning_rate": 3.0629272062871697e-06, "loss": 0.1296, "mean_token_accuracy": 0.9571428298950195, "step": 308 }, { "epoch": 1.4372093023255814, "grad_norm": 1.8212406635284424, "learning_rate": 3.032298609159664e-06, "loss": 0.1247, "mean_token_accuracy": 0.9599398374557495, "step": 309 }, { "epoch": 1.441860465116279, "grad_norm": 2.2745423316955566, "learning_rate": 3.0018326298152716e-06, "loss": 0.1258, "mean_token_accuracy": 0.9553403854370117, "step": 310 }, { "epoch": 1.4465116279069767, "grad_norm": 1.7779735326766968, "learning_rate": 2.9715312759158776e-06, "loss": 0.1004, "mean_token_accuracy": 0.9625091552734375, "step": 311 }, { "epoch": 1.4511627906976745, "grad_norm": 1.987384557723999, "learning_rate": 2.9413965442748e-06, "loss": 0.1444, "mean_token_accuracy": 0.9496188759803772, "step": 312 }, { "epoch": 1.455813953488372, "grad_norm": 1.733598232269287, "learning_rate": 2.9114304207251966e-06, "loss": 0.1261, "mean_token_accuracy": 0.9566228985786438, "step": 313 }, { "epoch": 1.4604651162790698, "grad_norm": 1.6986511945724487, "learning_rate": 2.8816348799892134e-06, "loss": 0.115, "mean_token_accuracy": 0.9578772783279419, "step": 314 }, { "epoch": 1.4651162790697674, "grad_norm": 1.7694047689437866, "learning_rate": 2.8520118855478425e-06, "loss": 0.1163, "mean_token_accuracy": 0.9581395387649536, "step": 315 }, { "epoch": 1.4697674418604652, "grad_norm": 2.0580880641937256, "learning_rate": 2.822563389511542e-06, "loss": 0.1496, "mean_token_accuracy": 0.9496130347251892, "step": 316 }, { "epoch": 1.474418604651163, "grad_norm": 1.9992523193359375, "learning_rate": 2.793291332491584e-06, "loss": 0.1246, "mean_token_accuracy": 0.9544215202331543, "step": 317 }, { "epoch": 1.4790697674418605, "grad_norm": 1.7976362705230713, "learning_rate": 2.7641976434721795e-06, "loss": 0.1236, "mean_token_accuracy": 0.9601340293884277, "step": 318 }, { "epoch": 1.483720930232558, "grad_norm": 1.844316840171814, "learning_rate": 2.735284239683361e-06, "loss": 0.1216, "mean_token_accuracy": 0.9563649296760559, "step": 319 }, { "epoch": 1.4883720930232558, "grad_norm": 1.6500184535980225, "learning_rate": 2.706553026474632e-06, "loss": 0.1058, "mean_token_accuracy": 0.9624313116073608, "step": 320 }, { "epoch": 1.4930232558139536, "grad_norm": 1.7818459272384644, "learning_rate": 2.6780058971894175e-06, "loss": 0.1181, "mean_token_accuracy": 0.9556201696395874, "step": 321 }, { "epoch": 1.4976744186046511, "grad_norm": 1.8266305923461914, "learning_rate": 2.6496447330402857e-06, "loss": 0.1125, "mean_token_accuracy": 0.9616958498954773, "step": 322 }, { "epoch": 1.5023255813953489, "grad_norm": 1.7001821994781494, "learning_rate": 2.621471402984991e-06, "loss": 0.1056, "mean_token_accuracy": 0.9641386270523071, "step": 323 }, { "epoch": 1.5069767441860464, "grad_norm": 1.7136989831924438, "learning_rate": 2.5934877636032975e-06, "loss": 0.094, "mean_token_accuracy": 0.965951144695282, "step": 324 }, { "epoch": 1.5116279069767442, "grad_norm": 1.9149391651153564, "learning_rate": 2.5656956589746486e-06, "loss": 0.1203, "mean_token_accuracy": 0.9579139947891235, "step": 325 }, { "epoch": 1.516279069767442, "grad_norm": 1.828525185585022, "learning_rate": 2.538096920556635e-06, "loss": 0.1244, "mean_token_accuracy": 0.9578738212585449, "step": 326 }, { "epoch": 1.5209302325581395, "grad_norm": 1.8189513683319092, "learning_rate": 2.510693367064304e-06, "loss": 0.1187, "mean_token_accuracy": 0.9572099447250366, "step": 327 }, { "epoch": 1.525581395348837, "grad_norm": 2.128922939300537, "learning_rate": 2.4834868043503176e-06, "loss": 0.1543, "mean_token_accuracy": 0.9494983553886414, "step": 328 }, { "epoch": 1.5302325581395348, "grad_norm": 1.8340766429901123, "learning_rate": 2.4564790252859377e-06, "loss": 0.1143, "mean_token_accuracy": 0.9591530561447144, "step": 329 }, { "epoch": 1.5348837209302326, "grad_norm": 1.4942302703857422, "learning_rate": 2.4296718096428903e-06, "loss": 0.0954, "mean_token_accuracy": 0.9681605696678162, "step": 330 }, { "epoch": 1.5395348837209304, "grad_norm": 2.0481481552124023, "learning_rate": 2.403066923976075e-06, "loss": 0.1042, "mean_token_accuracy": 0.9619013071060181, "step": 331 }, { "epoch": 1.544186046511628, "grad_norm": 1.8036472797393799, "learning_rate": 2.3766661215071473e-06, "loss": 0.1153, "mean_token_accuracy": 0.9580026268959045, "step": 332 }, { "epoch": 1.5488372093023255, "grad_norm": 1.7446261644363403, "learning_rate": 2.3504711420089975e-06, "loss": 0.1212, "mean_token_accuracy": 0.957176148891449, "step": 333 }, { "epoch": 1.5534883720930233, "grad_norm": 1.9047147035598755, "learning_rate": 2.324483711691085e-06, "loss": 0.1185, "mean_token_accuracy": 0.9612976908683777, "step": 334 }, { "epoch": 1.558139534883721, "grad_norm": 1.7788504362106323, "learning_rate": 2.298705543085701e-06, "loss": 0.1367, "mean_token_accuracy": 0.9494468569755554, "step": 335 }, { "epoch": 1.5627906976744186, "grad_norm": 1.6594794988632202, "learning_rate": 2.273138334935099e-06, "loss": 0.1044, "mean_token_accuracy": 0.9624683260917664, "step": 336 }, { "epoch": 1.5674418604651161, "grad_norm": 1.673941969871521, "learning_rate": 2.2477837720795647e-06, "loss": 0.0971, "mean_token_accuracy": 0.965850293636322, "step": 337 }, { "epoch": 1.572093023255814, "grad_norm": 1.8094561100006104, "learning_rate": 2.222643525346379e-06, "loss": 0.1199, "mean_token_accuracy": 0.9571694731712341, "step": 338 }, { "epoch": 1.5767441860465117, "grad_norm": 1.9664356708526611, "learning_rate": 2.1977192514397115e-06, "loss": 0.1432, "mean_token_accuracy": 0.9504490494728088, "step": 339 }, { "epoch": 1.5813953488372094, "grad_norm": 1.7851786613464355, "learning_rate": 2.1730125928314566e-06, "loss": 0.1267, "mean_token_accuracy": 0.9532294273376465, "step": 340 }, { "epoch": 1.586046511627907, "grad_norm": 1.9649142026901245, "learning_rate": 2.148525177652982e-06, "loss": 0.1133, "mean_token_accuracy": 0.9610248804092407, "step": 341 }, { "epoch": 1.5906976744186045, "grad_norm": 1.8593871593475342, "learning_rate": 2.124258619587853e-06, "loss": 0.1147, "mean_token_accuracy": 0.9592034220695496, "step": 342 }, { "epoch": 1.5953488372093023, "grad_norm": 1.9876712560653687, "learning_rate": 2.100214517765481e-06, "loss": 0.1309, "mean_token_accuracy": 0.9497568607330322, "step": 343 }, { "epoch": 1.6, "grad_norm": 1.8158270120620728, "learning_rate": 2.076394456655749e-06, "loss": 0.1225, "mean_token_accuracy": 0.9589306116104126, "step": 344 }, { "epoch": 1.6046511627906976, "grad_norm": 1.985721468925476, "learning_rate": 2.0528000059646e-06, "loss": 0.1192, "mean_token_accuracy": 0.9585253596305847, "step": 345 }, { "epoch": 1.6093023255813952, "grad_norm": 1.5803555250167847, "learning_rate": 2.029432720530585e-06, "loss": 0.1024, "mean_token_accuracy": 0.9614368677139282, "step": 346 }, { "epoch": 1.613953488372093, "grad_norm": 1.97838294506073, "learning_rate": 2.006294140222416e-06, "loss": 0.1354, "mean_token_accuracy": 0.9507514238357544, "step": 347 }, { "epoch": 1.6186046511627907, "grad_norm": 1.9568642377853394, "learning_rate": 1.9833857898374796e-06, "loss": 0.129, "mean_token_accuracy": 0.9536830186843872, "step": 348 }, { "epoch": 1.6232558139534885, "grad_norm": 1.998573899269104, "learning_rate": 1.960709179001361e-06, "loss": 0.1113, "mean_token_accuracy": 0.9594219923019409, "step": 349 }, { "epoch": 1.627906976744186, "grad_norm": 1.615460991859436, "learning_rate": 1.9382658020683572e-06, "loss": 0.0981, "mean_token_accuracy": 0.9683917760848999, "step": 350 }, { "epoch": 1.6325581395348836, "grad_norm": 2.0428853034973145, "learning_rate": 1.9160571380230087e-06, "loss": 0.1365, "mean_token_accuracy": 0.9510607123374939, "step": 351 }, { "epoch": 1.6372093023255814, "grad_norm": 1.8019459247589111, "learning_rate": 1.8940846503826302e-06, "loss": 0.1203, "mean_token_accuracy": 0.9561994671821594, "step": 352 }, { "epoch": 1.6418604651162791, "grad_norm": 1.7695238590240479, "learning_rate": 1.8723497871008678e-06, "loss": 0.1181, "mean_token_accuracy": 0.9560151100158691, "step": 353 }, { "epoch": 1.6465116279069767, "grad_norm": 2.7435801029205322, "learning_rate": 1.8508539804722847e-06, "loss": 0.1274, "mean_token_accuracy": 0.9515418410301208, "step": 354 }, { "epoch": 1.6511627906976745, "grad_norm": 1.7049776315689087, "learning_rate": 1.8295986470379726e-06, "loss": 0.1002, "mean_token_accuracy": 0.9640846252441406, "step": 355 }, { "epoch": 1.655813953488372, "grad_norm": 1.8068418502807617, "learning_rate": 1.8085851874922012e-06, "loss": 0.1215, "mean_token_accuracy": 0.9560132622718811, "step": 356 }, { "epoch": 1.6604651162790698, "grad_norm": 1.7952479124069214, "learning_rate": 1.7878149865901207e-06, "loss": 0.1124, "mean_token_accuracy": 0.9578981399536133, "step": 357 }, { "epoch": 1.6651162790697676, "grad_norm": 1.6922228336334229, "learning_rate": 1.7672894130565033e-06, "loss": 0.1118, "mean_token_accuracy": 0.9599378705024719, "step": 358 }, { "epoch": 1.669767441860465, "grad_norm": 1.9545196294784546, "learning_rate": 1.7470098194955502e-06, "loss": 0.1497, "mean_token_accuracy": 0.9433708786964417, "step": 359 }, { "epoch": 1.6744186046511627, "grad_norm": 1.7300385236740112, "learning_rate": 1.7269775423017513e-06, "loss": 0.1078, "mean_token_accuracy": 0.9611701965332031, "step": 360 }, { "epoch": 1.6790697674418604, "grad_norm": 1.9443196058273315, "learning_rate": 1.7071939015718264e-06, "loss": 0.1382, "mean_token_accuracy": 0.9518130421638489, "step": 361 }, { "epoch": 1.6837209302325582, "grad_norm": 1.7919453382492065, "learning_rate": 1.687660201017729e-06, "loss": 0.1257, "mean_token_accuracy": 0.9533194899559021, "step": 362 }, { "epoch": 1.688372093023256, "grad_norm": 1.6777992248535156, "learning_rate": 1.6683777278807296e-06, "loss": 0.1083, "mean_token_accuracy": 0.9628692269325256, "step": 363 }, { "epoch": 1.6930232558139535, "grad_norm": 1.7619105577468872, "learning_rate": 1.6493477528465974e-06, "loss": 0.1051, "mean_token_accuracy": 0.9633173942565918, "step": 364 }, { "epoch": 1.697674418604651, "grad_norm": 1.6752265691757202, "learning_rate": 1.6305715299618547e-06, "loss": 0.1187, "mean_token_accuracy": 0.9575539827346802, "step": 365 }, { "epoch": 1.7023255813953488, "grad_norm": 1.687470555305481, "learning_rate": 1.6120502965511467e-06, "loss": 0.1188, "mean_token_accuracy": 0.9554470777511597, "step": 366 }, { "epoch": 1.7069767441860466, "grad_norm": 1.5344772338867188, "learning_rate": 1.5937852731356923e-06, "loss": 0.0966, "mean_token_accuracy": 0.9682371616363525, "step": 367 }, { "epoch": 1.7116279069767442, "grad_norm": 1.6204265356063843, "learning_rate": 1.5757776633528654e-06, "loss": 0.1017, "mean_token_accuracy": 0.9634027481079102, "step": 368 }, { "epoch": 1.7162790697674417, "grad_norm": 1.8507808446884155, "learning_rate": 1.5580286538768705e-06, "loss": 0.1104, "mean_token_accuracy": 0.9628064036369324, "step": 369 }, { "epoch": 1.7209302325581395, "grad_norm": 1.6015089750289917, "learning_rate": 1.5405394143405394e-06, "loss": 0.103, "mean_token_accuracy": 0.9659707546234131, "step": 370 }, { "epoch": 1.7255813953488373, "grad_norm": 1.7210160493850708, "learning_rate": 1.5233110972582646e-06, "loss": 0.13, "mean_token_accuracy": 0.9564670920372009, "step": 371 }, { "epoch": 1.730232558139535, "grad_norm": 1.7505390644073486, "learning_rate": 1.506344837950038e-06, "loss": 0.1151, "mean_token_accuracy": 0.959705650806427, "step": 372 }, { "epoch": 1.7348837209302326, "grad_norm": 1.6202950477600098, "learning_rate": 1.4896417544666476e-06, "loss": 0.1068, "mean_token_accuracy": 0.9642030000686646, "step": 373 }, { "epoch": 1.7395348837209301, "grad_norm": 1.9536902904510498, "learning_rate": 1.473202947515987e-06, "loss": 0.1329, "mean_token_accuracy": 0.9557550549507141, "step": 374 }, { "epoch": 1.744186046511628, "grad_norm": 1.72047758102417, "learning_rate": 1.4570295003905314e-06, "loss": 0.1082, "mean_token_accuracy": 0.9637593030929565, "step": 375 }, { "epoch": 1.7488372093023257, "grad_norm": 1.5606462955474854, "learning_rate": 1.4411224788959439e-06, "loss": 0.0997, "mean_token_accuracy": 0.9654178619384766, "step": 376 }, { "epoch": 1.7534883720930232, "grad_norm": 1.6534537076950073, "learning_rate": 1.4254829312808405e-06, "loss": 0.1086, "mean_token_accuracy": 0.9666008353233337, "step": 377 }, { "epoch": 1.7581395348837208, "grad_norm": 1.7377055883407593, "learning_rate": 1.4101118881677161e-06, "loss": 0.1235, "mean_token_accuracy": 0.9573296904563904, "step": 378 }, { "epoch": 1.7627906976744185, "grad_norm": 1.8244922161102295, "learning_rate": 1.3950103624850264e-06, "loss": 0.1085, "mean_token_accuracy": 0.9609296321868896, "step": 379 }, { "epoch": 1.7674418604651163, "grad_norm": 1.6825976371765137, "learning_rate": 1.3801793494004336e-06, "loss": 0.117, "mean_token_accuracy": 0.9583475589752197, "step": 380 }, { "epoch": 1.772093023255814, "grad_norm": 1.9061481952667236, "learning_rate": 1.365619826255231e-06, "loss": 0.1181, "mean_token_accuracy": 0.955722451210022, "step": 381 }, { "epoch": 1.7767441860465116, "grad_norm": 2.0009758472442627, "learning_rate": 1.351332752499936e-06, "loss": 0.1313, "mean_token_accuracy": 0.9538022875785828, "step": 382 }, { "epoch": 1.7813953488372092, "grad_norm": 1.6647913455963135, "learning_rate": 1.3373190696310664e-06, "loss": 0.0934, "mean_token_accuracy": 0.9652481079101562, "step": 383 }, { "epoch": 1.786046511627907, "grad_norm": 1.8045130968093872, "learning_rate": 1.3235797011290902e-06, "loss": 0.118, "mean_token_accuracy": 0.9571335911750793, "step": 384 }, { "epoch": 1.7906976744186047, "grad_norm": 1.8397167921066284, "learning_rate": 1.3101155523975787e-06, "loss": 0.1196, "mean_token_accuracy": 0.9594667553901672, "step": 385 }, { "epoch": 1.7953488372093023, "grad_norm": 2.044651508331299, "learning_rate": 1.2969275107035344e-06, "loss": 0.1493, "mean_token_accuracy": 0.949529767036438, "step": 386 }, { "epoch": 1.8, "grad_norm": 2.365513563156128, "learning_rate": 1.2840164451189253e-06, "loss": 0.1246, "mean_token_accuracy": 0.9561926126480103, "step": 387 }, { "epoch": 1.8046511627906976, "grad_norm": 1.9553759098052979, "learning_rate": 1.2713832064634127e-06, "loss": 0.1206, "mean_token_accuracy": 0.9597529172897339, "step": 388 }, { "epoch": 1.8093023255813954, "grad_norm": 1.8584918975830078, "learning_rate": 1.2590286272482852e-06, "loss": 0.1085, "mean_token_accuracy": 0.9604840278625488, "step": 389 }, { "epoch": 1.8139534883720931, "grad_norm": 1.8316679000854492, "learning_rate": 1.246953521621597e-06, "loss": 0.1202, "mean_token_accuracy": 0.957573652267456, "step": 390 }, { "epoch": 1.8186046511627907, "grad_norm": 1.8838489055633545, "learning_rate": 1.2351586853145135e-06, "loss": 0.1148, "mean_token_accuracy": 0.9598619937896729, "step": 391 }, { "epoch": 1.8232558139534882, "grad_norm": 1.7429184913635254, "learning_rate": 1.2236448955888793e-06, "loss": 0.1129, "mean_token_accuracy": 0.9605519771575928, "step": 392 }, { "epoch": 1.827906976744186, "grad_norm": 1.4080933332443237, "learning_rate": 1.212412911185994e-06, "loss": 0.0993, "mean_token_accuracy": 0.9594095945358276, "step": 393 }, { "epoch": 1.8325581395348838, "grad_norm": 1.9343010187149048, "learning_rate": 1.2014634722766138e-06, "loss": 0.1342, "mean_token_accuracy": 0.9440457224845886, "step": 394 }, { "epoch": 1.8372093023255816, "grad_norm": 1.7764984369277954, "learning_rate": 1.190797300412174e-06, "loss": 0.1123, "mean_token_accuracy": 0.9590568542480469, "step": 395 }, { "epoch": 1.841860465116279, "grad_norm": 1.8751682043075562, "learning_rate": 1.1804150984772405e-06, "loss": 0.1265, "mean_token_accuracy": 0.9582030773162842, "step": 396 }, { "epoch": 1.8465116279069766, "grad_norm": 1.5821163654327393, "learning_rate": 1.1703175506431936e-06, "loss": 0.1142, "mean_token_accuracy": 0.9597623348236084, "step": 397 }, { "epoch": 1.8511627906976744, "grad_norm": 1.7600274085998535, "learning_rate": 1.1605053223231367e-06, "loss": 0.1189, "mean_token_accuracy": 0.9547197222709656, "step": 398 }, { "epoch": 1.8558139534883722, "grad_norm": 1.8238446712493896, "learning_rate": 1.1509790601280508e-06, "loss": 0.1183, "mean_token_accuracy": 0.9584389328956604, "step": 399 }, { "epoch": 1.8604651162790697, "grad_norm": 1.825140357017517, "learning_rate": 1.1417393918241832e-06, "loss": 0.1149, "mean_token_accuracy": 0.9614356756210327, "step": 400 }, { "epoch": 1.8651162790697673, "grad_norm": 1.8221229314804077, "learning_rate": 1.1327869262916764e-06, "loss": 0.1177, "mean_token_accuracy": 0.954285740852356, "step": 401 }, { "epoch": 1.869767441860465, "grad_norm": 2.279554843902588, "learning_rate": 1.1241222534844456e-06, "loss": 0.1456, "mean_token_accuracy": 0.9513813853263855, "step": 402 }, { "epoch": 1.8744186046511628, "grad_norm": 1.7494654655456543, "learning_rate": 1.1157459443913036e-06, "loss": 0.1034, "mean_token_accuracy": 0.9605114459991455, "step": 403 }, { "epoch": 1.8790697674418606, "grad_norm": 1.9064695835113525, "learning_rate": 1.1076585509983285e-06, "loss": 0.1237, "mean_token_accuracy": 0.9582035541534424, "step": 404 }, { "epoch": 1.8837209302325582, "grad_norm": 1.8712602853775024, "learning_rate": 1.0998606062524917e-06, "loss": 0.121, "mean_token_accuracy": 0.9547223448753357, "step": 405 }, { "epoch": 1.8883720930232557, "grad_norm": 1.6676530838012695, "learning_rate": 1.0923526240265397e-06, "loss": 0.1131, "mean_token_accuracy": 0.9593326449394226, "step": 406 }, { "epoch": 1.8930232558139535, "grad_norm": 1.7135958671569824, "learning_rate": 1.085135099085126e-06, "loss": 0.102, "mean_token_accuracy": 0.961244523525238, "step": 407 }, { "epoch": 1.8976744186046512, "grad_norm": 2.4428012371063232, "learning_rate": 1.07820850705221e-06, "loss": 0.1375, "mean_token_accuracy": 0.9498010873794556, "step": 408 }, { "epoch": 1.9023255813953488, "grad_norm": 1.8211802244186401, "learning_rate": 1.0715733043797121e-06, "loss": 0.1109, "mean_token_accuracy": 0.9600291848182678, "step": 409 }, { "epoch": 1.9069767441860463, "grad_norm": 1.9840325117111206, "learning_rate": 1.065229928317438e-06, "loss": 0.1219, "mean_token_accuracy": 0.95467209815979, "step": 410 }, { "epoch": 1.9116279069767441, "grad_norm": 1.722679615020752, "learning_rate": 1.0591787968842587e-06, "loss": 0.1223, "mean_token_accuracy": 0.9608275294303894, "step": 411 }, { "epoch": 1.916279069767442, "grad_norm": 1.979507327079773, "learning_rate": 1.0534203088405679e-06, "loss": 0.1175, "mean_token_accuracy": 0.9590908885002136, "step": 412 }, { "epoch": 1.9209302325581397, "grad_norm": 1.834620714187622, "learning_rate": 1.047954843662004e-06, "loss": 0.1246, "mean_token_accuracy": 0.9556223750114441, "step": 413 }, { "epoch": 1.9255813953488372, "grad_norm": 1.8021700382232666, "learning_rate": 1.0427827615144432e-06, "loss": 0.1171, "mean_token_accuracy": 0.9610346555709839, "step": 414 }, { "epoch": 1.9302325581395348, "grad_norm": 1.7287272214889526, "learning_rate": 1.0379044032302621e-06, "loss": 0.1111, "mean_token_accuracy": 0.95878666639328, "step": 415 }, { "epoch": 1.9348837209302325, "grad_norm": 1.7230780124664307, "learning_rate": 1.0333200902858814e-06, "loss": 0.1162, "mean_token_accuracy": 0.9592291712760925, "step": 416 }, { "epoch": 1.9395348837209303, "grad_norm": 1.9648188352584839, "learning_rate": 1.0290301247805788e-06, "loss": 0.1217, "mean_token_accuracy": 0.9570950269699097, "step": 417 }, { "epoch": 1.9441860465116279, "grad_norm": 1.5934516191482544, "learning_rate": 1.0250347894165825e-06, "loss": 0.1006, "mean_token_accuracy": 0.9645111560821533, "step": 418 }, { "epoch": 1.9488372093023256, "grad_norm": 1.874199628829956, "learning_rate": 1.021334347480439e-06, "loss": 0.123, "mean_token_accuracy": 0.9575681686401367, "step": 419 }, { "epoch": 1.9534883720930232, "grad_norm": 1.9127286672592163, "learning_rate": 1.0179290428256663e-06, "loss": 0.1208, "mean_token_accuracy": 0.9570922255516052, "step": 420 }, { "epoch": 1.958139534883721, "grad_norm": 1.699216604232788, "learning_rate": 1.014819099856683e-06, "loss": 0.1182, "mean_token_accuracy": 0.9542940855026245, "step": 421 }, { "epoch": 1.9627906976744187, "grad_norm": 1.9871116876602173, "learning_rate": 1.0120047235140178e-06, "loss": 0.1239, "mean_token_accuracy": 0.9596614241600037, "step": 422 }, { "epoch": 1.9674418604651163, "grad_norm": 1.7161730527877808, "learning_rate": 1.0094860992608083e-06, "loss": 0.0969, "mean_token_accuracy": 0.9646767973899841, "step": 423 }, { "epoch": 1.9720930232558138, "grad_norm": 1.6434606313705444, "learning_rate": 1.0072633930705777e-06, "loss": 0.1096, "mean_token_accuracy": 0.9614861011505127, "step": 424 }, { "epoch": 1.9767441860465116, "grad_norm": 1.6597245931625366, "learning_rate": 1.0053367514162967e-06, "loss": 0.1082, "mean_token_accuracy": 0.963832676410675, "step": 425 }, { "epoch": 1.9813953488372094, "grad_norm": 1.8078020811080933, "learning_rate": 1.0037063012607302e-06, "loss": 0.1163, "mean_token_accuracy": 0.9587554335594177, "step": 426 }, { "epoch": 1.9860465116279071, "grad_norm": 1.8626351356506348, "learning_rate": 1.0023721500480747e-06, "loss": 0.1105, "mean_token_accuracy": 0.9619902968406677, "step": 427 }, { "epoch": 1.9906976744186047, "grad_norm": 1.7608290910720825, "learning_rate": 1.001334385696873e-06, "loss": 0.1287, "mean_token_accuracy": 0.9509281516075134, "step": 428 }, { "epoch": 1.9953488372093022, "grad_norm": 1.7384284734725952, "learning_rate": 1.0005930765942238e-06, "loss": 0.1167, "mean_token_accuracy": 0.9590829014778137, "step": 429 }, { "epoch": 2.0, "grad_norm": 1.298376202583313, "learning_rate": 1.0001482715912744e-06, "loss": 0.0638, "mean_token_accuracy": 0.9736953973770142, "step": 430 }, { "epoch": 2.0, "step": 430, "total_flos": 4.0376628208572826e+17, "train_loss": 0.22927807642276896, "train_runtime": 2180.6252, "train_samples_per_second": 6.284, "train_steps_per_second": 0.197 } ], "logging_steps": 1, "max_steps": 430, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.0376628208572826e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }