{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5898, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000508646998982706, "grad_norm": 32.115596771240234, "learning_rate": 0.0, "loss": 1.2961, "mean_token_accuracy": 0.7449109554290771, "num_tokens": 156200.0, "step": 1 }, { "epoch": 0.001017293997965412, "grad_norm": 31.279529571533203, "learning_rate": 1.6949152542372882e-08, "loss": 1.2669, "mean_token_accuracy": 0.7504401206970215, "num_tokens": 314183.0, "step": 2 }, { "epoch": 0.001525940996948118, "grad_norm": 30.887014389038086, "learning_rate": 3.3898305084745764e-08, "loss": 1.307, "mean_token_accuracy": 0.7392708659172058, "num_tokens": 470040.0, "step": 3 }, { "epoch": 0.002034587995930824, "grad_norm": 31.73535919189453, "learning_rate": 5.0847457627118645e-08, "loss": 1.3081, "mean_token_accuracy": 0.7396830916404724, "num_tokens": 627227.0, "step": 4 }, { "epoch": 0.00254323499491353, "grad_norm": 31.332460403442383, "learning_rate": 6.779661016949153e-08, "loss": 1.3231, "mean_token_accuracy": 0.7378069758415222, "num_tokens": 785176.0, "step": 5 }, { "epoch": 0.003051881993896236, "grad_norm": 32.63821029663086, "learning_rate": 8.474576271186442e-08, "loss": 1.3232, "mean_token_accuracy": 0.7402999401092529, "num_tokens": 926658.0, "step": 6 }, { "epoch": 0.003560528992878942, "grad_norm": 31.21712303161621, "learning_rate": 1.0169491525423729e-07, "loss": 1.334, "mean_token_accuracy": 0.7354116439819336, "num_tokens": 1089803.0, "step": 7 }, { "epoch": 0.004069175991861648, "grad_norm": 30.991884231567383, "learning_rate": 1.1864406779661018e-07, "loss": 1.2785, "mean_token_accuracy": 0.7471437454223633, "num_tokens": 1253116.0, "step": 8 }, { "epoch": 0.004577822990844354, "grad_norm": 29.952585220336914, "learning_rate": 1.3559322033898305e-07, "loss": 1.2702, "mean_token_accuracy": 0.7474691867828369, "num_tokens": 1419240.0, "step": 9 }, { "epoch": 0.00508646998982706, "grad_norm": 30.35393714904785, "learning_rate": 1.5254237288135596e-07, "loss": 1.2841, "mean_token_accuracy": 0.7456669211387634, "num_tokens": 1582136.0, "step": 10 }, { "epoch": 0.005595116988809766, "grad_norm": 31.37613868713379, "learning_rate": 1.6949152542372883e-07, "loss": 1.2995, "mean_token_accuracy": 0.7446110844612122, "num_tokens": 1738808.0, "step": 11 }, { "epoch": 0.006103763987792472, "grad_norm": 30.47648048400879, "learning_rate": 1.8644067796610173e-07, "loss": 1.2831, "mean_token_accuracy": 0.7443124055862427, "num_tokens": 1899325.0, "step": 12 }, { "epoch": 0.006612410986775178, "grad_norm": 30.103708267211914, "learning_rate": 2.0338983050847458e-07, "loss": 1.2404, "mean_token_accuracy": 0.7517591714859009, "num_tokens": 2067149.0, "step": 13 }, { "epoch": 0.007121057985757884, "grad_norm": 29.119670867919922, "learning_rate": 2.2033898305084748e-07, "loss": 1.3089, "mean_token_accuracy": 0.7355021834373474, "num_tokens": 2225992.0, "step": 14 }, { "epoch": 0.0076297049847405905, "grad_norm": 30.91548728942871, "learning_rate": 2.3728813559322036e-07, "loss": 1.288, "mean_token_accuracy": 0.7432087063789368, "num_tokens": 2391475.0, "step": 15 }, { "epoch": 0.008138351983723296, "grad_norm": 27.432802200317383, "learning_rate": 2.5423728813559323e-07, "loss": 1.2468, "mean_token_accuracy": 0.7456105947494507, "num_tokens": 2564876.0, "step": 16 }, { "epoch": 0.008646998982706003, "grad_norm": 26.89838218688965, "learning_rate": 2.711864406779661e-07, "loss": 1.2802, "mean_token_accuracy": 0.7338192462921143, "num_tokens": 2728912.0, "step": 17 }, { "epoch": 0.009155645981688708, "grad_norm": 28.779483795166016, "learning_rate": 2.88135593220339e-07, "loss": 1.2371, "mean_token_accuracy": 0.7503151297569275, "num_tokens": 2885094.0, "step": 18 }, { "epoch": 0.009664292980671414, "grad_norm": 28.094846725463867, "learning_rate": 3.050847457627119e-07, "loss": 1.2683, "mean_token_accuracy": 0.7387984991073608, "num_tokens": 3043505.0, "step": 19 }, { "epoch": 0.01017293997965412, "grad_norm": 28.859289169311523, "learning_rate": 3.2203389830508473e-07, "loss": 1.1843, "mean_token_accuracy": 0.7617287635803223, "num_tokens": 3200547.0, "step": 20 }, { "epoch": 0.010681586978636826, "grad_norm": 27.48524284362793, "learning_rate": 3.3898305084745766e-07, "loss": 1.2653, "mean_token_accuracy": 0.7411770224571228, "num_tokens": 3352567.0, "step": 21 }, { "epoch": 0.011190233977619531, "grad_norm": 26.313617706298828, "learning_rate": 3.5593220338983054e-07, "loss": 1.2687, "mean_token_accuracy": 0.7347060441970825, "num_tokens": 3522299.0, "step": 22 }, { "epoch": 0.011698880976602238, "grad_norm": 21.680171966552734, "learning_rate": 3.7288135593220347e-07, "loss": 1.2141, "mean_token_accuracy": 0.7363147735595703, "num_tokens": 3678029.0, "step": 23 }, { "epoch": 0.012207527975584944, "grad_norm": 19.086170196533203, "learning_rate": 3.8983050847457634e-07, "loss": 1.1484, "mean_token_accuracy": 0.7474343180656433, "num_tokens": 3843052.0, "step": 24 }, { "epoch": 0.01271617497456765, "grad_norm": 19.647796630859375, "learning_rate": 4.0677966101694916e-07, "loss": 1.1527, "mean_token_accuracy": 0.7474626302719116, "num_tokens": 4003590.0, "step": 25 }, { "epoch": 0.013224821973550356, "grad_norm": 18.843202590942383, "learning_rate": 4.2372881355932204e-07, "loss": 1.1387, "mean_token_accuracy": 0.748569130897522, "num_tokens": 4163923.0, "step": 26 }, { "epoch": 0.013733468972533061, "grad_norm": 17.923465728759766, "learning_rate": 4.4067796610169497e-07, "loss": 1.1809, "mean_token_accuracy": 0.7369387149810791, "num_tokens": 4329915.0, "step": 27 }, { "epoch": 0.014242115971515769, "grad_norm": 16.861265182495117, "learning_rate": 4.5762711864406784e-07, "loss": 1.1181, "mean_token_accuracy": 0.7488915324211121, "num_tokens": 4501785.0, "step": 28 }, { "epoch": 0.014750762970498474, "grad_norm": 17.190711975097656, "learning_rate": 4.745762711864407e-07, "loss": 1.1287, "mean_token_accuracy": 0.7477043867111206, "num_tokens": 4665203.0, "step": 29 }, { "epoch": 0.015259409969481181, "grad_norm": 17.02585220336914, "learning_rate": 4.915254237288136e-07, "loss": 1.1092, "mean_token_accuracy": 0.7519813776016235, "num_tokens": 4826248.0, "step": 30 }, { "epoch": 0.015768056968463885, "grad_norm": 15.061558723449707, "learning_rate": 5.084745762711865e-07, "loss": 1.0856, "mean_token_accuracy": 0.750836193561554, "num_tokens": 5002184.0, "step": 31 }, { "epoch": 0.01627670396744659, "grad_norm": 8.035051345825195, "learning_rate": 5.254237288135593e-07, "loss": 1.0415, "mean_token_accuracy": 0.7453987002372742, "num_tokens": 5159429.0, "step": 32 }, { "epoch": 0.0167853509664293, "grad_norm": 5.959025859832764, "learning_rate": 5.423728813559322e-07, "loss": 1.0122, "mean_token_accuracy": 0.752120852470398, "num_tokens": 5319981.0, "step": 33 }, { "epoch": 0.017293997965412006, "grad_norm": 5.894038677215576, "learning_rate": 5.593220338983051e-07, "loss": 1.0374, "mean_token_accuracy": 0.7448702454566956, "num_tokens": 5470920.0, "step": 34 }, { "epoch": 0.01780264496439471, "grad_norm": 5.831063270568848, "learning_rate": 5.76271186440678e-07, "loss": 1.0496, "mean_token_accuracy": 0.7414418458938599, "num_tokens": 5621279.0, "step": 35 }, { "epoch": 0.018311291963377416, "grad_norm": 5.664057731628418, "learning_rate": 5.93220338983051e-07, "loss": 1.002, "mean_token_accuracy": 0.7509836554527283, "num_tokens": 5783722.0, "step": 36 }, { "epoch": 0.018819938962360123, "grad_norm": 5.4525675773620605, "learning_rate": 6.101694915254238e-07, "loss": 1.0055, "mean_token_accuracy": 0.7488620281219482, "num_tokens": 5946247.0, "step": 37 }, { "epoch": 0.019328585961342827, "grad_norm": 5.1193952560424805, "learning_rate": 6.271186440677966e-07, "loss": 1.0142, "mean_token_accuracy": 0.7472719550132751, "num_tokens": 6105483.0, "step": 38 }, { "epoch": 0.019837232960325534, "grad_norm": 4.926507949829102, "learning_rate": 6.440677966101695e-07, "loss": 1.0268, "mean_token_accuracy": 0.7443497776985168, "num_tokens": 6267257.0, "step": 39 }, { "epoch": 0.02034587995930824, "grad_norm": 4.7316999435424805, "learning_rate": 6.610169491525425e-07, "loss": 0.9777, "mean_token_accuracy": 0.7549794316291809, "num_tokens": 6423575.0, "step": 40 }, { "epoch": 0.020854526958290945, "grad_norm": 4.418249607086182, "learning_rate": 6.779661016949153e-07, "loss": 0.9854, "mean_token_accuracy": 0.75412917137146, "num_tokens": 6580647.0, "step": 41 }, { "epoch": 0.021363173957273652, "grad_norm": 3.7720751762390137, "learning_rate": 6.949152542372882e-07, "loss": 0.956, "mean_token_accuracy": 0.7564523816108704, "num_tokens": 6740933.0, "step": 42 }, { "epoch": 0.02187182095625636, "grad_norm": 4.199706554412842, "learning_rate": 7.118644067796611e-07, "loss": 0.9435, "mean_token_accuracy": 0.7573504447937012, "num_tokens": 6888534.0, "step": 43 }, { "epoch": 0.022380467955239063, "grad_norm": 5.001425266265869, "learning_rate": 7.28813559322034e-07, "loss": 0.934, "mean_token_accuracy": 0.7598178386688232, "num_tokens": 7053795.0, "step": 44 }, { "epoch": 0.02288911495422177, "grad_norm": 4.949459552764893, "learning_rate": 7.457627118644069e-07, "loss": 0.9264, "mean_token_accuracy": 0.7590354681015015, "num_tokens": 7206610.0, "step": 45 }, { "epoch": 0.023397761953204477, "grad_norm": 5.248065948486328, "learning_rate": 7.627118644067798e-07, "loss": 0.981, "mean_token_accuracy": 0.7476550340652466, "num_tokens": 7375236.0, "step": 46 }, { "epoch": 0.023906408952187184, "grad_norm": 5.145699501037598, "learning_rate": 7.796610169491527e-07, "loss": 0.9164, "mean_token_accuracy": 0.7608495950698853, "num_tokens": 7533110.0, "step": 47 }, { "epoch": 0.024415055951169887, "grad_norm": 5.29788064956665, "learning_rate": 7.966101694915255e-07, "loss": 0.943, "mean_token_accuracy": 0.7549550533294678, "num_tokens": 7693804.0, "step": 48 }, { "epoch": 0.024923702950152594, "grad_norm": 4.676819324493408, "learning_rate": 8.135593220338983e-07, "loss": 0.8807, "mean_token_accuracy": 0.7704964280128479, "num_tokens": 7873397.0, "step": 49 }, { "epoch": 0.0254323499491353, "grad_norm": 4.2016096115112305, "learning_rate": 8.305084745762712e-07, "loss": 0.9193, "mean_token_accuracy": 0.7602928876876831, "num_tokens": 8024175.0, "step": 50 }, { "epoch": 0.025940996948118005, "grad_norm": 3.9279732704162598, "learning_rate": 8.474576271186441e-07, "loss": 0.897, "mean_token_accuracy": 0.7654056549072266, "num_tokens": 8177438.0, "step": 51 }, { "epoch": 0.026449643947100712, "grad_norm": 3.5190112590789795, "learning_rate": 8.64406779661017e-07, "loss": 0.8926, "mean_token_accuracy": 0.7674021124839783, "num_tokens": 8351946.0, "step": 52 }, { "epoch": 0.02695829094608342, "grad_norm": 3.196254253387451, "learning_rate": 8.813559322033899e-07, "loss": 0.899, "mean_token_accuracy": 0.7623940706253052, "num_tokens": 8504280.0, "step": 53 }, { "epoch": 0.027466937945066123, "grad_norm": 2.7295444011688232, "learning_rate": 8.983050847457628e-07, "loss": 0.8606, "mean_token_accuracy": 0.7741144895553589, "num_tokens": 8661017.0, "step": 54 }, { "epoch": 0.02797558494404883, "grad_norm": 2.655987024307251, "learning_rate": 9.152542372881357e-07, "loss": 0.8937, "mean_token_accuracy": 0.7659112811088562, "num_tokens": 8819796.0, "step": 55 }, { "epoch": 0.028484231943031537, "grad_norm": 2.4142653942108154, "learning_rate": 9.322033898305086e-07, "loss": 0.8583, "mean_token_accuracy": 0.7737700343132019, "num_tokens": 8975390.0, "step": 56 }, { "epoch": 0.02899287894201424, "grad_norm": 2.334871530532837, "learning_rate": 9.491525423728814e-07, "loss": 0.8747, "mean_token_accuracy": 0.7709072828292847, "num_tokens": 9128994.0, "step": 57 }, { "epoch": 0.029501525940996948, "grad_norm": 2.374896287918091, "learning_rate": 9.661016949152544e-07, "loss": 0.8484, "mean_token_accuracy": 0.7743436098098755, "num_tokens": 9280736.0, "step": 58 }, { "epoch": 0.030010172939979655, "grad_norm": 2.246040105819702, "learning_rate": 9.830508474576272e-07, "loss": 0.8892, "mean_token_accuracy": 0.7636919021606445, "num_tokens": 9431826.0, "step": 59 }, { "epoch": 0.030518819938962362, "grad_norm": 2.068854808807373, "learning_rate": 1.0000000000000002e-06, "loss": 0.8208, "mean_token_accuracy": 0.7815329432487488, "num_tokens": 9587259.0, "step": 60 }, { "epoch": 0.031027466937945065, "grad_norm": 1.8931329250335693, "learning_rate": 1.016949152542373e-06, "loss": 0.828, "mean_token_accuracy": 0.7761904001235962, "num_tokens": 9735856.0, "step": 61 }, { "epoch": 0.03153611393692777, "grad_norm": 1.6215320825576782, "learning_rate": 1.0338983050847457e-06, "loss": 0.8292, "mean_token_accuracy": 0.7751116752624512, "num_tokens": 9903156.0, "step": 62 }, { "epoch": 0.03204476093591048, "grad_norm": 1.7067248821258545, "learning_rate": 1.0508474576271187e-06, "loss": 0.8219, "mean_token_accuracy": 0.7756357192993164, "num_tokens": 10054899.0, "step": 63 }, { "epoch": 0.03255340793489318, "grad_norm": 1.7187061309814453, "learning_rate": 1.0677966101694917e-06, "loss": 0.7922, "mean_token_accuracy": 0.7845475673675537, "num_tokens": 10214147.0, "step": 64 }, { "epoch": 0.03306205493387589, "grad_norm": 1.8380779027938843, "learning_rate": 1.0847457627118644e-06, "loss": 0.7682, "mean_token_accuracy": 0.7888185977935791, "num_tokens": 10376357.0, "step": 65 }, { "epoch": 0.0335707019328586, "grad_norm": 1.7338484525680542, "learning_rate": 1.1016949152542374e-06, "loss": 0.8376, "mean_token_accuracy": 0.7710230350494385, "num_tokens": 10528001.0, "step": 66 }, { "epoch": 0.0340793489318413, "grad_norm": 1.5959792137145996, "learning_rate": 1.1186440677966102e-06, "loss": 0.8306, "mean_token_accuracy": 0.7736530303955078, "num_tokens": 10697458.0, "step": 67 }, { "epoch": 0.03458799593082401, "grad_norm": 1.5118943452835083, "learning_rate": 1.1355932203389832e-06, "loss": 0.8437, "mean_token_accuracy": 0.7709776759147644, "num_tokens": 10852213.0, "step": 68 }, { "epoch": 0.035096642929806715, "grad_norm": 1.425594449043274, "learning_rate": 1.152542372881356e-06, "loss": 0.803, "mean_token_accuracy": 0.7797463536262512, "num_tokens": 11018742.0, "step": 69 }, { "epoch": 0.03560528992878942, "grad_norm": 1.3931862115859985, "learning_rate": 1.169491525423729e-06, "loss": 0.8333, "mean_token_accuracy": 0.7734776735305786, "num_tokens": 11178241.0, "step": 70 }, { "epoch": 0.03611393692777213, "grad_norm": 1.4249283075332642, "learning_rate": 1.186440677966102e-06, "loss": 0.7853, "mean_token_accuracy": 0.7858003377914429, "num_tokens": 11339213.0, "step": 71 }, { "epoch": 0.03662258392675483, "grad_norm": 1.4698734283447266, "learning_rate": 1.2033898305084747e-06, "loss": 0.7764, "mean_token_accuracy": 0.7849376797676086, "num_tokens": 11489548.0, "step": 72 }, { "epoch": 0.037131230925737536, "grad_norm": 1.3906108140945435, "learning_rate": 1.2203389830508477e-06, "loss": 0.7814, "mean_token_accuracy": 0.785656213760376, "num_tokens": 11656772.0, "step": 73 }, { "epoch": 0.03763987792472025, "grad_norm": 1.2897934913635254, "learning_rate": 1.2372881355932204e-06, "loss": 0.7617, "mean_token_accuracy": 0.7886397838592529, "num_tokens": 11818390.0, "step": 74 }, { "epoch": 0.03814852492370295, "grad_norm": 1.343746304512024, "learning_rate": 1.2542372881355932e-06, "loss": 0.811, "mean_token_accuracy": 0.7766581773757935, "num_tokens": 11965717.0, "step": 75 }, { "epoch": 0.038657171922685654, "grad_norm": 1.2226605415344238, "learning_rate": 1.2711864406779662e-06, "loss": 0.7907, "mean_token_accuracy": 0.7826157808303833, "num_tokens": 12131732.0, "step": 76 }, { "epoch": 0.039165818921668365, "grad_norm": 1.3067526817321777, "learning_rate": 1.288135593220339e-06, "loss": 0.8071, "mean_token_accuracy": 0.777775764465332, "num_tokens": 12283927.0, "step": 77 }, { "epoch": 0.03967446592065107, "grad_norm": 1.277056336402893, "learning_rate": 1.305084745762712e-06, "loss": 0.7693, "mean_token_accuracy": 0.7876085042953491, "num_tokens": 12436906.0, "step": 78 }, { "epoch": 0.04018311291963377, "grad_norm": 1.2849011421203613, "learning_rate": 1.322033898305085e-06, "loss": 0.7794, "mean_token_accuracy": 0.7832804918289185, "num_tokens": 12600258.0, "step": 79 }, { "epoch": 0.04069175991861648, "grad_norm": 1.2474393844604492, "learning_rate": 1.3389830508474577e-06, "loss": 0.7745, "mean_token_accuracy": 0.7856120467185974, "num_tokens": 12747689.0, "step": 80 }, { "epoch": 0.041200406917599186, "grad_norm": 1.144174337387085, "learning_rate": 1.3559322033898307e-06, "loss": 0.7597, "mean_token_accuracy": 0.7897878885269165, "num_tokens": 12911145.0, "step": 81 }, { "epoch": 0.04170905391658189, "grad_norm": 1.2651646137237549, "learning_rate": 1.3728813559322034e-06, "loss": 0.7638, "mean_token_accuracy": 0.7870814800262451, "num_tokens": 13081387.0, "step": 82 }, { "epoch": 0.0422177009155646, "grad_norm": 1.167580485343933, "learning_rate": 1.3898305084745764e-06, "loss": 0.7787, "mean_token_accuracy": 0.7832891345024109, "num_tokens": 13243319.0, "step": 83 }, { "epoch": 0.042726347914547304, "grad_norm": 1.133839726448059, "learning_rate": 1.4067796610169494e-06, "loss": 0.7589, "mean_token_accuracy": 0.7909195423126221, "num_tokens": 13408622.0, "step": 84 }, { "epoch": 0.04323499491353001, "grad_norm": 1.2236430644989014, "learning_rate": 1.4237288135593222e-06, "loss": 0.7724, "mean_token_accuracy": 0.7842013835906982, "num_tokens": 13562201.0, "step": 85 }, { "epoch": 0.04374364191251272, "grad_norm": 1.2251627445220947, "learning_rate": 1.4406779661016951e-06, "loss": 0.76, "mean_token_accuracy": 0.7882030010223389, "num_tokens": 13706016.0, "step": 86 }, { "epoch": 0.04425228891149542, "grad_norm": 1.1422505378723145, "learning_rate": 1.457627118644068e-06, "loss": 0.7532, "mean_token_accuracy": 0.7909828424453735, "num_tokens": 13872736.0, "step": 87 }, { "epoch": 0.044760935910478125, "grad_norm": 1.164376139640808, "learning_rate": 1.4745762711864409e-06, "loss": 0.7102, "mean_token_accuracy": 0.8004215955734253, "num_tokens": 14029427.0, "step": 88 }, { "epoch": 0.045269582909460836, "grad_norm": 1.1551074981689453, "learning_rate": 1.4915254237288139e-06, "loss": 0.7426, "mean_token_accuracy": 0.791038990020752, "num_tokens": 14192123.0, "step": 89 }, { "epoch": 0.04577822990844354, "grad_norm": 1.1146512031555176, "learning_rate": 1.5084745762711866e-06, "loss": 0.7181, "mean_token_accuracy": 0.7969399690628052, "num_tokens": 14356828.0, "step": 90 }, { "epoch": 0.04628687690742624, "grad_norm": 1.0763921737670898, "learning_rate": 1.5254237288135596e-06, "loss": 0.6807, "mean_token_accuracy": 0.8067453503608704, "num_tokens": 14522417.0, "step": 91 }, { "epoch": 0.04679552390640895, "grad_norm": 1.1564586162567139, "learning_rate": 1.5423728813559324e-06, "loss": 0.7619, "mean_token_accuracy": 0.7866380214691162, "num_tokens": 14675485.0, "step": 92 }, { "epoch": 0.04730417090539166, "grad_norm": 1.138609528541565, "learning_rate": 1.5593220338983054e-06, "loss": 0.7253, "mean_token_accuracy": 0.7969965934753418, "num_tokens": 14831686.0, "step": 93 }, { "epoch": 0.04781281790437437, "grad_norm": 1.1271717548370361, "learning_rate": 1.5762711864406781e-06, "loss": 0.7507, "mean_token_accuracy": 0.7900179624557495, "num_tokens": 14983974.0, "step": 94 }, { "epoch": 0.04832146490335707, "grad_norm": 1.1798120737075806, "learning_rate": 1.593220338983051e-06, "loss": 0.7766, "mean_token_accuracy": 0.7809305191040039, "num_tokens": 15135323.0, "step": 95 }, { "epoch": 0.048830111902339775, "grad_norm": 1.2915998697280884, "learning_rate": 1.6101694915254237e-06, "loss": 0.7481, "mean_token_accuracy": 0.7903225421905518, "num_tokens": 15289033.0, "step": 96 }, { "epoch": 0.049338758901322485, "grad_norm": 1.1710273027420044, "learning_rate": 1.6271186440677967e-06, "loss": 0.7504, "mean_token_accuracy": 0.7876750826835632, "num_tokens": 15439451.0, "step": 97 }, { "epoch": 0.04984740590030519, "grad_norm": 1.1403133869171143, "learning_rate": 1.6440677966101694e-06, "loss": 0.7274, "mean_token_accuracy": 0.7948034405708313, "num_tokens": 15594811.0, "step": 98 }, { "epoch": 0.05035605289928789, "grad_norm": 1.1216124296188354, "learning_rate": 1.6610169491525424e-06, "loss": 0.7363, "mean_token_accuracy": 0.7924157381057739, "num_tokens": 15758944.0, "step": 99 }, { "epoch": 0.0508646998982706, "grad_norm": 1.1310335397720337, "learning_rate": 1.6779661016949154e-06, "loss": 0.6821, "mean_token_accuracy": 0.8055119514465332, "num_tokens": 15922340.0, "step": 100 }, { "epoch": 0.05137334689725331, "grad_norm": 1.1045717000961304, "learning_rate": 1.6949152542372882e-06, "loss": 0.701, "mean_token_accuracy": 0.8013116121292114, "num_tokens": 16074162.0, "step": 101 }, { "epoch": 0.05188199389623601, "grad_norm": 1.141825556755066, "learning_rate": 1.7118644067796611e-06, "loss": 0.7233, "mean_token_accuracy": 0.7949929237365723, "num_tokens": 16238658.0, "step": 102 }, { "epoch": 0.05239064089521872, "grad_norm": 1.1329325437545776, "learning_rate": 1.728813559322034e-06, "loss": 0.6977, "mean_token_accuracy": 0.8014352321624756, "num_tokens": 16396393.0, "step": 103 }, { "epoch": 0.052899287894201424, "grad_norm": 1.1402873992919922, "learning_rate": 1.7457627118644069e-06, "loss": 0.7219, "mean_token_accuracy": 0.7958400249481201, "num_tokens": 16546360.0, "step": 104 }, { "epoch": 0.05340793489318413, "grad_norm": 1.116026759147644, "learning_rate": 1.7627118644067799e-06, "loss": 0.7024, "mean_token_accuracy": 0.7998898029327393, "num_tokens": 16704979.0, "step": 105 }, { "epoch": 0.05391658189216684, "grad_norm": 1.0816538333892822, "learning_rate": 1.7796610169491526e-06, "loss": 0.6952, "mean_token_accuracy": 0.8007581233978271, "num_tokens": 16860358.0, "step": 106 }, { "epoch": 0.05442522889114954, "grad_norm": 1.114751935005188, "learning_rate": 1.7966101694915256e-06, "loss": 0.7154, "mean_token_accuracy": 0.797700047492981, "num_tokens": 17034067.0, "step": 107 }, { "epoch": 0.054933875890132246, "grad_norm": 1.2079203128814697, "learning_rate": 1.8135593220338984e-06, "loss": 0.747, "mean_token_accuracy": 0.7880150079727173, "num_tokens": 17181452.0, "step": 108 }, { "epoch": 0.055442522889114956, "grad_norm": 4.387100696563721, "learning_rate": 1.8305084745762714e-06, "loss": 0.6798, "mean_token_accuracy": 0.8064528107643127, "num_tokens": 17338963.0, "step": 109 }, { "epoch": 0.05595116988809766, "grad_norm": 1.2586617469787598, "learning_rate": 1.8474576271186441e-06, "loss": 0.7127, "mean_token_accuracy": 0.7975611686706543, "num_tokens": 17493379.0, "step": 110 }, { "epoch": 0.056459816887080364, "grad_norm": 1.1230461597442627, "learning_rate": 1.8644067796610171e-06, "loss": 0.6778, "mean_token_accuracy": 0.8056212663650513, "num_tokens": 17651741.0, "step": 111 }, { "epoch": 0.056968463886063074, "grad_norm": 1.1480046510696411, "learning_rate": 1.88135593220339e-06, "loss": 0.6992, "mean_token_accuracy": 0.7992340922355652, "num_tokens": 17810924.0, "step": 112 }, { "epoch": 0.05747711088504578, "grad_norm": 1.0441348552703857, "learning_rate": 1.8983050847457629e-06, "loss": 0.6995, "mean_token_accuracy": 0.8028820753097534, "num_tokens": 17976758.0, "step": 113 }, { "epoch": 0.05798575788402848, "grad_norm": 1.1486817598342896, "learning_rate": 1.9152542372881356e-06, "loss": 0.7392, "mean_token_accuracy": 0.7913587689399719, "num_tokens": 18134320.0, "step": 114 }, { "epoch": 0.05849440488301119, "grad_norm": 1.22743821144104, "learning_rate": 1.932203389830509e-06, "loss": 0.7303, "mean_token_accuracy": 0.7928489446640015, "num_tokens": 18276678.0, "step": 115 }, { "epoch": 0.059003051881993895, "grad_norm": 1.9819073677062988, "learning_rate": 1.9491525423728816e-06, "loss": 0.6889, "mean_token_accuracy": 0.8051313161849976, "num_tokens": 18426725.0, "step": 116 }, { "epoch": 0.0595116988809766, "grad_norm": 1.3526983261108398, "learning_rate": 1.9661016949152544e-06, "loss": 0.7195, "mean_token_accuracy": 0.7980197668075562, "num_tokens": 18587522.0, "step": 117 }, { "epoch": 0.06002034587995931, "grad_norm": 1.1162952184677124, "learning_rate": 1.9830508474576276e-06, "loss": 0.7087, "mean_token_accuracy": 0.7977943420410156, "num_tokens": 18749574.0, "step": 118 }, { "epoch": 0.06052899287894201, "grad_norm": 1.1709731817245483, "learning_rate": 2.0000000000000003e-06, "loss": 0.6925, "mean_token_accuracy": 0.8042123317718506, "num_tokens": 18906800.0, "step": 119 }, { "epoch": 0.061037639877924724, "grad_norm": 1.1323966979980469, "learning_rate": 2.016949152542373e-06, "loss": 0.7452, "mean_token_accuracy": 0.7882239818572998, "num_tokens": 19068637.0, "step": 120 }, { "epoch": 0.06154628687690743, "grad_norm": 1.1877466440200806, "learning_rate": 2.033898305084746e-06, "loss": 0.7036, "mean_token_accuracy": 0.7979512810707092, "num_tokens": 19219209.0, "step": 121 }, { "epoch": 0.06205493387589013, "grad_norm": 1.1348536014556885, "learning_rate": 2.0508474576271186e-06, "loss": 0.6824, "mean_token_accuracy": 0.8026146292686462, "num_tokens": 19367232.0, "step": 122 }, { "epoch": 0.06256358087487283, "grad_norm": 1.0699137449264526, "learning_rate": 2.0677966101694914e-06, "loss": 0.677, "mean_token_accuracy": 0.8075043559074402, "num_tokens": 19525992.0, "step": 123 }, { "epoch": 0.06307222787385554, "grad_norm": 1.1170971393585205, "learning_rate": 2.0847457627118646e-06, "loss": 0.6963, "mean_token_accuracy": 0.8016464710235596, "num_tokens": 19693698.0, "step": 124 }, { "epoch": 0.06358087487283826, "grad_norm": 1.1918888092041016, "learning_rate": 2.1016949152542374e-06, "loss": 0.683, "mean_token_accuracy": 0.8041555881500244, "num_tokens": 19841203.0, "step": 125 }, { "epoch": 0.06408952187182096, "grad_norm": 1.1442811489105225, "learning_rate": 2.11864406779661e-06, "loss": 0.7213, "mean_token_accuracy": 0.7939434051513672, "num_tokens": 20001680.0, "step": 126 }, { "epoch": 0.06459816887080366, "grad_norm": 1.1511716842651367, "learning_rate": 2.1355932203389833e-06, "loss": 0.6835, "mean_token_accuracy": 0.8039097785949707, "num_tokens": 20150787.0, "step": 127 }, { "epoch": 0.06510681586978637, "grad_norm": 1.0827454328536987, "learning_rate": 2.152542372881356e-06, "loss": 0.6907, "mean_token_accuracy": 0.8021340370178223, "num_tokens": 20326096.0, "step": 128 }, { "epoch": 0.06561546286876907, "grad_norm": 1.1005094051361084, "learning_rate": 2.169491525423729e-06, "loss": 0.7339, "mean_token_accuracy": 0.7913875579833984, "num_tokens": 20490433.0, "step": 129 }, { "epoch": 0.06612410986775177, "grad_norm": 1.0656397342681885, "learning_rate": 2.1864406779661016e-06, "loss": 0.7223, "mean_token_accuracy": 0.7938638925552368, "num_tokens": 20657596.0, "step": 130 }, { "epoch": 0.06663275686673449, "grad_norm": 1.0604357719421387, "learning_rate": 2.203389830508475e-06, "loss": 0.6711, "mean_token_accuracy": 0.8067337870597839, "num_tokens": 20827354.0, "step": 131 }, { "epoch": 0.0671414038657172, "grad_norm": 1.1440101861953735, "learning_rate": 2.2203389830508476e-06, "loss": 0.704, "mean_token_accuracy": 0.799992024898529, "num_tokens": 20978319.0, "step": 132 }, { "epoch": 0.0676500508646999, "grad_norm": 1.1271777153015137, "learning_rate": 2.2372881355932204e-06, "loss": 0.6845, "mean_token_accuracy": 0.8048183917999268, "num_tokens": 21124807.0, "step": 133 }, { "epoch": 0.0681586978636826, "grad_norm": 1.1711620092391968, "learning_rate": 2.2542372881355936e-06, "loss": 0.6677, "mean_token_accuracy": 0.8087946176528931, "num_tokens": 21283777.0, "step": 134 }, { "epoch": 0.0686673448626653, "grad_norm": 1.1323461532592773, "learning_rate": 2.2711864406779663e-06, "loss": 0.682, "mean_token_accuracy": 0.8036584258079529, "num_tokens": 21436297.0, "step": 135 }, { "epoch": 0.06917599186164802, "grad_norm": 1.1845351457595825, "learning_rate": 2.288135593220339e-06, "loss": 0.6991, "mean_token_accuracy": 0.7986134886741638, "num_tokens": 21590885.0, "step": 136 }, { "epoch": 0.06968463886063073, "grad_norm": 1.139819622039795, "learning_rate": 2.305084745762712e-06, "loss": 0.7272, "mean_token_accuracy": 0.7913652062416077, "num_tokens": 21746247.0, "step": 137 }, { "epoch": 0.07019328585961343, "grad_norm": 1.1518213748931885, "learning_rate": 2.322033898305085e-06, "loss": 0.6775, "mean_token_accuracy": 0.8056344985961914, "num_tokens": 21898426.0, "step": 138 }, { "epoch": 0.07070193285859613, "grad_norm": 1.1628955602645874, "learning_rate": 2.338983050847458e-06, "loss": 0.7437, "mean_token_accuracy": 0.7898159027099609, "num_tokens": 22054155.0, "step": 139 }, { "epoch": 0.07121057985757884, "grad_norm": 1.1349354982376099, "learning_rate": 2.3559322033898306e-06, "loss": 0.7113, "mean_token_accuracy": 0.7953236103057861, "num_tokens": 22215818.0, "step": 140 }, { "epoch": 0.07171922685656154, "grad_norm": 1.136148452758789, "learning_rate": 2.372881355932204e-06, "loss": 0.6908, "mean_token_accuracy": 0.8000525832176208, "num_tokens": 22367981.0, "step": 141 }, { "epoch": 0.07222787385554426, "grad_norm": 1.0790364742279053, "learning_rate": 2.3898305084745766e-06, "loss": 0.689, "mean_token_accuracy": 0.802665114402771, "num_tokens": 22537203.0, "step": 142 }, { "epoch": 0.07273652085452696, "grad_norm": 1.1381577253341675, "learning_rate": 2.4067796610169493e-06, "loss": 0.6895, "mean_token_accuracy": 0.801916241645813, "num_tokens": 22696776.0, "step": 143 }, { "epoch": 0.07324516785350967, "grad_norm": 1.1592439413070679, "learning_rate": 2.4237288135593225e-06, "loss": 0.6637, "mean_token_accuracy": 0.8090922236442566, "num_tokens": 22850630.0, "step": 144 }, { "epoch": 0.07375381485249237, "grad_norm": 1.1903307437896729, "learning_rate": 2.4406779661016953e-06, "loss": 0.7183, "mean_token_accuracy": 0.7970988750457764, "num_tokens": 23002043.0, "step": 145 }, { "epoch": 0.07426246185147507, "grad_norm": 1.135934591293335, "learning_rate": 2.457627118644068e-06, "loss": 0.6473, "mean_token_accuracy": 0.8129653930664062, "num_tokens": 23159234.0, "step": 146 }, { "epoch": 0.07477110885045778, "grad_norm": 1.1974645853042603, "learning_rate": 2.474576271186441e-06, "loss": 0.6752, "mean_token_accuracy": 0.8057656288146973, "num_tokens": 23310355.0, "step": 147 }, { "epoch": 0.0752797558494405, "grad_norm": 1.0872167348861694, "learning_rate": 2.491525423728814e-06, "loss": 0.664, "mean_token_accuracy": 0.8108680248260498, "num_tokens": 23473092.0, "step": 148 }, { "epoch": 0.0757884028484232, "grad_norm": 1.1531084775924683, "learning_rate": 2.5084745762711864e-06, "loss": 0.68, "mean_token_accuracy": 0.8035491704940796, "num_tokens": 23627766.0, "step": 149 }, { "epoch": 0.0762970498474059, "grad_norm": 1.096634864807129, "learning_rate": 2.5254237288135596e-06, "loss": 0.6537, "mean_token_accuracy": 0.812416672706604, "num_tokens": 23777307.0, "step": 150 }, { "epoch": 0.0768056968463886, "grad_norm": 1.1573667526245117, "learning_rate": 2.5423728813559323e-06, "loss": 0.6942, "mean_token_accuracy": 0.7996103763580322, "num_tokens": 23923715.0, "step": 151 }, { "epoch": 0.07731434384537131, "grad_norm": 1.1179906129837036, "learning_rate": 2.5593220338983055e-06, "loss": 0.6794, "mean_token_accuracy": 0.8048681020736694, "num_tokens": 24087522.0, "step": 152 }, { "epoch": 0.07782299084435401, "grad_norm": 1.2234212160110474, "learning_rate": 2.576271186440678e-06, "loss": 0.6577, "mean_token_accuracy": 0.8093551993370056, "num_tokens": 24231067.0, "step": 153 }, { "epoch": 0.07833163784333673, "grad_norm": 1.1112676858901978, "learning_rate": 2.593220338983051e-06, "loss": 0.6982, "mean_token_accuracy": 0.8004816770553589, "num_tokens": 24394212.0, "step": 154 }, { "epoch": 0.07884028484231943, "grad_norm": 1.2034029960632324, "learning_rate": 2.610169491525424e-06, "loss": 0.6691, "mean_token_accuracy": 0.8068128824234009, "num_tokens": 24539665.0, "step": 155 }, { "epoch": 0.07934893184130214, "grad_norm": 1.1976460218429565, "learning_rate": 2.627118644067797e-06, "loss": 0.7036, "mean_token_accuracy": 0.7969769239425659, "num_tokens": 24703099.0, "step": 156 }, { "epoch": 0.07985757884028484, "grad_norm": 4.0496087074279785, "learning_rate": 2.64406779661017e-06, "loss": 0.6776, "mean_token_accuracy": 0.8049623370170593, "num_tokens": 24851707.0, "step": 157 }, { "epoch": 0.08036622583926754, "grad_norm": 1.2107317447662354, "learning_rate": 2.661016949152543e-06, "loss": 0.6605, "mean_token_accuracy": 0.8087326288223267, "num_tokens": 25018302.0, "step": 158 }, { "epoch": 0.08087487283825025, "grad_norm": 1.1594712734222412, "learning_rate": 2.6779661016949153e-06, "loss": 0.6659, "mean_token_accuracy": 0.8069348335266113, "num_tokens": 25177030.0, "step": 159 }, { "epoch": 0.08138351983723296, "grad_norm": 1.0774612426757812, "learning_rate": 2.6949152542372885e-06, "loss": 0.6909, "mean_token_accuracy": 0.80242520570755, "num_tokens": 25345695.0, "step": 160 }, { "epoch": 0.08189216683621567, "grad_norm": 1.034545660018921, "learning_rate": 2.7118644067796613e-06, "loss": 0.6396, "mean_token_accuracy": 0.8151862025260925, "num_tokens": 25513624.0, "step": 161 }, { "epoch": 0.08240081383519837, "grad_norm": 1.1746925115585327, "learning_rate": 2.7288135593220336e-06, "loss": 0.681, "mean_token_accuracy": 0.8041950464248657, "num_tokens": 25674463.0, "step": 162 }, { "epoch": 0.08290946083418108, "grad_norm": 1.139805555343628, "learning_rate": 2.745762711864407e-06, "loss": 0.6566, "mean_token_accuracy": 0.8090215921401978, "num_tokens": 25834424.0, "step": 163 }, { "epoch": 0.08341810783316378, "grad_norm": 1.5135971307754517, "learning_rate": 2.7627118644067796e-06, "loss": 0.6966, "mean_token_accuracy": 0.7990592122077942, "num_tokens": 25991547.0, "step": 164 }, { "epoch": 0.0839267548321465, "grad_norm": 1.1292972564697266, "learning_rate": 2.779661016949153e-06, "loss": 0.6788, "mean_token_accuracy": 0.8033425807952881, "num_tokens": 26151034.0, "step": 165 }, { "epoch": 0.0844354018311292, "grad_norm": 1.1054205894470215, "learning_rate": 2.7966101694915256e-06, "loss": 0.6489, "mean_token_accuracy": 0.8113046288490295, "num_tokens": 26310637.0, "step": 166 }, { "epoch": 0.0849440488301119, "grad_norm": 1.0914794206619263, "learning_rate": 2.8135593220338988e-06, "loss": 0.6457, "mean_token_accuracy": 0.8134191632270813, "num_tokens": 26465678.0, "step": 167 }, { "epoch": 0.08545269582909461, "grad_norm": 1.0801182985305786, "learning_rate": 2.830508474576271e-06, "loss": 0.676, "mean_token_accuracy": 0.8042765855789185, "num_tokens": 26632464.0, "step": 168 }, { "epoch": 0.08596134282807731, "grad_norm": 1.1082203388214111, "learning_rate": 2.8474576271186443e-06, "loss": 0.6338, "mean_token_accuracy": 0.8155921697616577, "num_tokens": 26795346.0, "step": 169 }, { "epoch": 0.08646998982706001, "grad_norm": 1.0745552778244019, "learning_rate": 2.864406779661017e-06, "loss": 0.6937, "mean_token_accuracy": 0.7970446944236755, "num_tokens": 26951965.0, "step": 170 }, { "epoch": 0.08697863682604273, "grad_norm": 1.1002435684204102, "learning_rate": 2.8813559322033903e-06, "loss": 0.6389, "mean_token_accuracy": 0.8136718273162842, "num_tokens": 27113922.0, "step": 171 }, { "epoch": 0.08748728382502544, "grad_norm": 1.087834358215332, "learning_rate": 2.8983050847457626e-06, "loss": 0.6479, "mean_token_accuracy": 0.8113669157028198, "num_tokens": 27279172.0, "step": 172 }, { "epoch": 0.08799593082400814, "grad_norm": 1.1026259660720825, "learning_rate": 2.915254237288136e-06, "loss": 0.6726, "mean_token_accuracy": 0.805874764919281, "num_tokens": 27449975.0, "step": 173 }, { "epoch": 0.08850457782299084, "grad_norm": 1.106402039527893, "learning_rate": 2.9322033898305086e-06, "loss": 0.684, "mean_token_accuracy": 0.8012702465057373, "num_tokens": 27595872.0, "step": 174 }, { "epoch": 0.08901322482197355, "grad_norm": 1.1090171337127686, "learning_rate": 2.9491525423728818e-06, "loss": 0.6552, "mean_token_accuracy": 0.8108108043670654, "num_tokens": 27760835.0, "step": 175 }, { "epoch": 0.08952187182095625, "grad_norm": 1.0845763683319092, "learning_rate": 2.9661016949152545e-06, "loss": 0.6519, "mean_token_accuracy": 0.810937225818634, "num_tokens": 27923943.0, "step": 176 }, { "epoch": 0.09003051881993897, "grad_norm": 1.1143302917480469, "learning_rate": 2.9830508474576277e-06, "loss": 0.6752, "mean_token_accuracy": 0.8048059344291687, "num_tokens": 28085891.0, "step": 177 }, { "epoch": 0.09053916581892167, "grad_norm": 1.1437448263168335, "learning_rate": 3e-06, "loss": 0.6196, "mean_token_accuracy": 0.818387508392334, "num_tokens": 28243518.0, "step": 178 }, { "epoch": 0.09104781281790437, "grad_norm": 1.0629551410675049, "learning_rate": 3.0169491525423733e-06, "loss": 0.6494, "mean_token_accuracy": 0.8105939030647278, "num_tokens": 28405483.0, "step": 179 }, { "epoch": 0.09155645981688708, "grad_norm": 1.1684266328811646, "learning_rate": 3.033898305084746e-06, "loss": 0.6803, "mean_token_accuracy": 0.8017261028289795, "num_tokens": 28558441.0, "step": 180 }, { "epoch": 0.09206510681586978, "grad_norm": 1.1008362770080566, "learning_rate": 3.0508474576271192e-06, "loss": 0.6324, "mean_token_accuracy": 0.815455436706543, "num_tokens": 28707646.0, "step": 181 }, { "epoch": 0.09257375381485249, "grad_norm": 1.1428719758987427, "learning_rate": 3.0677966101694916e-06, "loss": 0.6594, "mean_token_accuracy": 0.8092727065086365, "num_tokens": 28877502.0, "step": 182 }, { "epoch": 0.0930824008138352, "grad_norm": 1.1864022016525269, "learning_rate": 3.0847457627118648e-06, "loss": 0.7025, "mean_token_accuracy": 0.7980859279632568, "num_tokens": 29040815.0, "step": 183 }, { "epoch": 0.0935910478128179, "grad_norm": 1.1755472421646118, "learning_rate": 3.1016949152542375e-06, "loss": 0.6417, "mean_token_accuracy": 0.8110471963882446, "num_tokens": 29189342.0, "step": 184 }, { "epoch": 0.09409969481180061, "grad_norm": 1.1948294639587402, "learning_rate": 3.1186440677966107e-06, "loss": 0.6632, "mean_token_accuracy": 0.809227466583252, "num_tokens": 29361787.0, "step": 185 }, { "epoch": 0.09460834181078331, "grad_norm": 1.1376807689666748, "learning_rate": 3.135593220338983e-06, "loss": 0.5833, "mean_token_accuracy": 0.8278385400772095, "num_tokens": 29523929.0, "step": 186 }, { "epoch": 0.09511698880976602, "grad_norm": 1.181722640991211, "learning_rate": 3.1525423728813563e-06, "loss": 0.6812, "mean_token_accuracy": 0.8037586212158203, "num_tokens": 29682593.0, "step": 187 }, { "epoch": 0.09562563580874874, "grad_norm": 1.120546817779541, "learning_rate": 3.169491525423729e-06, "loss": 0.6301, "mean_token_accuracy": 0.8173536062240601, "num_tokens": 29839653.0, "step": 188 }, { "epoch": 0.09613428280773144, "grad_norm": 1.2152618169784546, "learning_rate": 3.186440677966102e-06, "loss": 0.6717, "mean_token_accuracy": 0.8038546442985535, "num_tokens": 30001594.0, "step": 189 }, { "epoch": 0.09664292980671414, "grad_norm": 1.130861759185791, "learning_rate": 3.203389830508475e-06, "loss": 0.6589, "mean_token_accuracy": 0.8086852431297302, "num_tokens": 30163501.0, "step": 190 }, { "epoch": 0.09715157680569685, "grad_norm": 1.117408037185669, "learning_rate": 3.2203389830508473e-06, "loss": 0.642, "mean_token_accuracy": 0.8118177056312561, "num_tokens": 30324797.0, "step": 191 }, { "epoch": 0.09766022380467955, "grad_norm": 1.0812642574310303, "learning_rate": 3.2372881355932205e-06, "loss": 0.6553, "mean_token_accuracy": 0.809535801410675, "num_tokens": 30492907.0, "step": 192 }, { "epoch": 0.09816887080366225, "grad_norm": 1.1507676839828491, "learning_rate": 3.2542372881355933e-06, "loss": 0.6121, "mean_token_accuracy": 0.8196245431900024, "num_tokens": 30667482.0, "step": 193 }, { "epoch": 0.09867751780264497, "grad_norm": 1.0629740953445435, "learning_rate": 3.2711864406779665e-06, "loss": 0.6429, "mean_token_accuracy": 0.8122947216033936, "num_tokens": 30830951.0, "step": 194 }, { "epoch": 0.09918616480162767, "grad_norm": 1.255733609199524, "learning_rate": 3.288135593220339e-06, "loss": 0.6417, "mean_token_accuracy": 0.8119453191757202, "num_tokens": 30992078.0, "step": 195 }, { "epoch": 0.09969481180061038, "grad_norm": 1.2162625789642334, "learning_rate": 3.305084745762712e-06, "loss": 0.6353, "mean_token_accuracy": 0.8144551515579224, "num_tokens": 31153089.0, "step": 196 }, { "epoch": 0.10020345879959308, "grad_norm": 1.1391067504882812, "learning_rate": 3.322033898305085e-06, "loss": 0.6408, "mean_token_accuracy": 0.8137332201004028, "num_tokens": 31297334.0, "step": 197 }, { "epoch": 0.10071210579857579, "grad_norm": 1.3500770330429077, "learning_rate": 3.338983050847458e-06, "loss": 0.6841, "mean_token_accuracy": 0.803351640701294, "num_tokens": 31462790.0, "step": 198 }, { "epoch": 0.10122075279755849, "grad_norm": 1.2101603746414185, "learning_rate": 3.3559322033898308e-06, "loss": 0.6686, "mean_token_accuracy": 0.8038467168807983, "num_tokens": 31614333.0, "step": 199 }, { "epoch": 0.1017293997965412, "grad_norm": 1.2734079360961914, "learning_rate": 3.372881355932204e-06, "loss": 0.6503, "mean_token_accuracy": 0.8107385039329529, "num_tokens": 31757983.0, "step": 200 }, { "epoch": 0.10223804679552391, "grad_norm": 1.1623011827468872, "learning_rate": 3.3898305084745763e-06, "loss": 0.6641, "mean_token_accuracy": 0.8052271604537964, "num_tokens": 31902198.0, "step": 201 }, { "epoch": 0.10274669379450661, "grad_norm": 1.1549586057662964, "learning_rate": 3.4067796610169495e-06, "loss": 0.6403, "mean_token_accuracy": 0.8139992952346802, "num_tokens": 32072015.0, "step": 202 }, { "epoch": 0.10325534079348932, "grad_norm": 1.25985586643219, "learning_rate": 3.4237288135593223e-06, "loss": 0.6587, "mean_token_accuracy": 0.8067578077316284, "num_tokens": 32231536.0, "step": 203 }, { "epoch": 0.10376398779247202, "grad_norm": 1.0511059761047363, "learning_rate": 3.4406779661016955e-06, "loss": 0.6077, "mean_token_accuracy": 0.8200663924217224, "num_tokens": 32401421.0, "step": 204 }, { "epoch": 0.10427263479145472, "grad_norm": 1.1063411235809326, "learning_rate": 3.457627118644068e-06, "loss": 0.6417, "mean_token_accuracy": 0.8133328557014465, "num_tokens": 32559615.0, "step": 205 }, { "epoch": 0.10478128179043744, "grad_norm": 1.2056820392608643, "learning_rate": 3.474576271186441e-06, "loss": 0.6428, "mean_token_accuracy": 0.8104065656661987, "num_tokens": 32710194.0, "step": 206 }, { "epoch": 0.10528992878942015, "grad_norm": 1.1128931045532227, "learning_rate": 3.4915254237288138e-06, "loss": 0.6477, "mean_token_accuracy": 0.8102855682373047, "num_tokens": 32878068.0, "step": 207 }, { "epoch": 0.10579857578840285, "grad_norm": 1.1195472478866577, "learning_rate": 3.508474576271187e-06, "loss": 0.6779, "mean_token_accuracy": 0.8018409013748169, "num_tokens": 33048273.0, "step": 208 }, { "epoch": 0.10630722278738555, "grad_norm": 1.1822260618209839, "learning_rate": 3.5254237288135597e-06, "loss": 0.6465, "mean_token_accuracy": 0.8118051886558533, "num_tokens": 33209096.0, "step": 209 }, { "epoch": 0.10681586978636826, "grad_norm": 1.11614990234375, "learning_rate": 3.5423728813559325e-06, "loss": 0.6205, "mean_token_accuracy": 0.8186913132667542, "num_tokens": 33375803.0, "step": 210 }, { "epoch": 0.10732451678535096, "grad_norm": 1.171306848526001, "learning_rate": 3.5593220338983053e-06, "loss": 0.6644, "mean_token_accuracy": 0.8052268028259277, "num_tokens": 33519566.0, "step": 211 }, { "epoch": 0.10783316378433368, "grad_norm": 1.1052064895629883, "learning_rate": 3.5762711864406785e-06, "loss": 0.6734, "mean_token_accuracy": 0.8042608499526978, "num_tokens": 33675958.0, "step": 212 }, { "epoch": 0.10834181078331638, "grad_norm": 1.1608377695083618, "learning_rate": 3.5932203389830512e-06, "loss": 0.6456, "mean_token_accuracy": 0.8108775615692139, "num_tokens": 33848664.0, "step": 213 }, { "epoch": 0.10885045778229908, "grad_norm": 1.1084113121032715, "learning_rate": 3.6101694915254244e-06, "loss": 0.6494, "mean_token_accuracy": 0.8081250190734863, "num_tokens": 34013911.0, "step": 214 }, { "epoch": 0.10935910478128179, "grad_norm": 1.183755874633789, "learning_rate": 3.6271186440677968e-06, "loss": 0.6292, "mean_token_accuracy": 0.8147969245910645, "num_tokens": 34175183.0, "step": 215 }, { "epoch": 0.10986775178026449, "grad_norm": 1.4397618770599365, "learning_rate": 3.6440677966101695e-06, "loss": 0.6242, "mean_token_accuracy": 0.8160824179649353, "num_tokens": 34337717.0, "step": 216 }, { "epoch": 0.11037639877924721, "grad_norm": 1.154085636138916, "learning_rate": 3.6610169491525427e-06, "loss": 0.657, "mean_token_accuracy": 0.807664155960083, "num_tokens": 34491024.0, "step": 217 }, { "epoch": 0.11088504577822991, "grad_norm": 1.061474323272705, "learning_rate": 3.6779661016949155e-06, "loss": 0.6206, "mean_token_accuracy": 0.8178257942199707, "num_tokens": 34668424.0, "step": 218 }, { "epoch": 0.11139369277721262, "grad_norm": 1.098557949066162, "learning_rate": 3.6949152542372883e-06, "loss": 0.6584, "mean_token_accuracy": 0.8083437085151672, "num_tokens": 34823916.0, "step": 219 }, { "epoch": 0.11190233977619532, "grad_norm": 1.1224589347839355, "learning_rate": 3.711864406779661e-06, "loss": 0.6324, "mean_token_accuracy": 0.8144303560256958, "num_tokens": 34986133.0, "step": 220 }, { "epoch": 0.11241098677517802, "grad_norm": 1.1159178018569946, "learning_rate": 3.7288135593220342e-06, "loss": 0.6126, "mean_token_accuracy": 0.8220010995864868, "num_tokens": 35146182.0, "step": 221 }, { "epoch": 0.11291963377416073, "grad_norm": 1.103893756866455, "learning_rate": 3.745762711864407e-06, "loss": 0.6566, "mean_token_accuracy": 0.8071621060371399, "num_tokens": 35301888.0, "step": 222 }, { "epoch": 0.11342828077314344, "grad_norm": 1.0538965463638306, "learning_rate": 3.76271186440678e-06, "loss": 0.6183, "mean_token_accuracy": 0.8180220723152161, "num_tokens": 35461669.0, "step": 223 }, { "epoch": 0.11393692777212615, "grad_norm": 1.122003436088562, "learning_rate": 3.7796610169491525e-06, "loss": 0.6365, "mean_token_accuracy": 0.8118059635162354, "num_tokens": 35621625.0, "step": 224 }, { "epoch": 0.11444557477110885, "grad_norm": 1.1433473825454712, "learning_rate": 3.7966101694915257e-06, "loss": 0.6216, "mean_token_accuracy": 0.8183375000953674, "num_tokens": 35773479.0, "step": 225 }, { "epoch": 0.11495422177009156, "grad_norm": 1.0675594806671143, "learning_rate": 3.8135593220338985e-06, "loss": 0.6141, "mean_token_accuracy": 0.8205496072769165, "num_tokens": 35936577.0, "step": 226 }, { "epoch": 0.11546286876907426, "grad_norm": 1.119829535484314, "learning_rate": 3.830508474576271e-06, "loss": 0.661, "mean_token_accuracy": 0.8067289590835571, "num_tokens": 36094734.0, "step": 227 }, { "epoch": 0.11597151576805696, "grad_norm": 1.115922451019287, "learning_rate": 3.8474576271186445e-06, "loss": 0.616, "mean_token_accuracy": 0.8180555105209351, "num_tokens": 36258492.0, "step": 228 }, { "epoch": 0.11648016276703968, "grad_norm": 1.1563063859939575, "learning_rate": 3.864406779661018e-06, "loss": 0.6357, "mean_token_accuracy": 0.8144519329071045, "num_tokens": 36423218.0, "step": 229 }, { "epoch": 0.11698880976602238, "grad_norm": 1.1410115957260132, "learning_rate": 3.88135593220339e-06, "loss": 0.6051, "mean_token_accuracy": 0.819241464138031, "num_tokens": 36569958.0, "step": 230 }, { "epoch": 0.11749745676500509, "grad_norm": 1.147270917892456, "learning_rate": 3.898305084745763e-06, "loss": 0.6577, "mean_token_accuracy": 0.807749330997467, "num_tokens": 36735839.0, "step": 231 }, { "epoch": 0.11800610376398779, "grad_norm": 1.240912675857544, "learning_rate": 3.9152542372881355e-06, "loss": 0.6424, "mean_token_accuracy": 0.8123055696487427, "num_tokens": 36886881.0, "step": 232 }, { "epoch": 0.1185147507629705, "grad_norm": 1.1100605726242065, "learning_rate": 3.932203389830509e-06, "loss": 0.636, "mean_token_accuracy": 0.8126474618911743, "num_tokens": 37055602.0, "step": 233 }, { "epoch": 0.1190233977619532, "grad_norm": 1.1091521978378296, "learning_rate": 3.949152542372882e-06, "loss": 0.6136, "mean_token_accuracy": 0.822191596031189, "num_tokens": 37215889.0, "step": 234 }, { "epoch": 0.11953204476093592, "grad_norm": 1.1006335020065308, "learning_rate": 3.966101694915255e-06, "loss": 0.619, "mean_token_accuracy": 0.8180478811264038, "num_tokens": 37368358.0, "step": 235 }, { "epoch": 0.12004069175991862, "grad_norm": 1.2350478172302246, "learning_rate": 3.9830508474576275e-06, "loss": 0.6275, "mean_token_accuracy": 0.8164553642272949, "num_tokens": 37528382.0, "step": 236 }, { "epoch": 0.12054933875890132, "grad_norm": 1.1838717460632324, "learning_rate": 4.000000000000001e-06, "loss": 0.6435, "mean_token_accuracy": 0.811959981918335, "num_tokens": 37685103.0, "step": 237 }, { "epoch": 0.12105798575788403, "grad_norm": 1.1500831842422485, "learning_rate": 4.016949152542373e-06, "loss": 0.6558, "mean_token_accuracy": 0.8082505464553833, "num_tokens": 37848901.0, "step": 238 }, { "epoch": 0.12156663275686673, "grad_norm": 1.1872674226760864, "learning_rate": 4.033898305084746e-06, "loss": 0.6512, "mean_token_accuracy": 0.8086193799972534, "num_tokens": 38014704.0, "step": 239 }, { "epoch": 0.12207527975584945, "grad_norm": 1.246338129043579, "learning_rate": 4.0508474576271186e-06, "loss": 0.6506, "mean_token_accuracy": 0.8080781698226929, "num_tokens": 38166048.0, "step": 240 }, { "epoch": 0.12258392675483215, "grad_norm": 1.1303752660751343, "learning_rate": 4.067796610169492e-06, "loss": 0.5975, "mean_token_accuracy": 0.8239215016365051, "num_tokens": 38326365.0, "step": 241 }, { "epoch": 0.12309257375381485, "grad_norm": 1.2110315561294556, "learning_rate": 4.084745762711865e-06, "loss": 0.6211, "mean_token_accuracy": 0.8160551190376282, "num_tokens": 38487350.0, "step": 242 }, { "epoch": 0.12360122075279756, "grad_norm": 1.161855697631836, "learning_rate": 4.101694915254237e-06, "loss": 0.6647, "mean_token_accuracy": 0.807031512260437, "num_tokens": 38653367.0, "step": 243 }, { "epoch": 0.12410986775178026, "grad_norm": 1.153646469116211, "learning_rate": 4.1186440677966105e-06, "loss": 0.6446, "mean_token_accuracy": 0.8096780776977539, "num_tokens": 38803903.0, "step": 244 }, { "epoch": 0.12461851475076297, "grad_norm": 1.1783907413482666, "learning_rate": 4.135593220338983e-06, "loss": 0.6259, "mean_token_accuracy": 0.8163446187973022, "num_tokens": 38956439.0, "step": 245 }, { "epoch": 0.12512716174974567, "grad_norm": 1.1585876941680908, "learning_rate": 4.152542372881356e-06, "loss": 0.6092, "mean_token_accuracy": 0.819117546081543, "num_tokens": 39109262.0, "step": 246 }, { "epoch": 0.12563580874872837, "grad_norm": 1.1186983585357666, "learning_rate": 4.169491525423729e-06, "loss": 0.6135, "mean_token_accuracy": 0.8192994594573975, "num_tokens": 39271741.0, "step": 247 }, { "epoch": 0.12614445574771108, "grad_norm": 1.135287880897522, "learning_rate": 4.186440677966102e-06, "loss": 0.6397, "mean_token_accuracy": 0.8109389543533325, "num_tokens": 39438988.0, "step": 248 }, { "epoch": 0.1266531027466938, "grad_norm": 1.198437213897705, "learning_rate": 4.203389830508475e-06, "loss": 0.6526, "mean_token_accuracy": 0.809927225112915, "num_tokens": 39606007.0, "step": 249 }, { "epoch": 0.1271617497456765, "grad_norm": 1.1955232620239258, "learning_rate": 4.220338983050848e-06, "loss": 0.6858, "mean_token_accuracy": 0.7995947003364563, "num_tokens": 39780179.0, "step": 250 }, { "epoch": 0.12767039674465921, "grad_norm": 1.141065001487732, "learning_rate": 4.23728813559322e-06, "loss": 0.6405, "mean_token_accuracy": 0.8117692470550537, "num_tokens": 39939761.0, "step": 251 }, { "epoch": 0.12817904374364192, "grad_norm": 1.222945213317871, "learning_rate": 4.2542372881355935e-06, "loss": 0.6651, "mean_token_accuracy": 0.8068735003471375, "num_tokens": 40103824.0, "step": 252 }, { "epoch": 0.12868769074262462, "grad_norm": 1.299753189086914, "learning_rate": 4.271186440677967e-06, "loss": 0.6438, "mean_token_accuracy": 0.8107660412788391, "num_tokens": 40275322.0, "step": 253 }, { "epoch": 0.12919633774160733, "grad_norm": 1.1835240125656128, "learning_rate": 4.28813559322034e-06, "loss": 0.6179, "mean_token_accuracy": 0.8164398074150085, "num_tokens": 40433373.0, "step": 254 }, { "epoch": 0.12970498474059003, "grad_norm": 1.172080636024475, "learning_rate": 4.305084745762712e-06, "loss": 0.6467, "mean_token_accuracy": 0.809874415397644, "num_tokens": 40580246.0, "step": 255 }, { "epoch": 0.13021363173957273, "grad_norm": 1.3525550365447998, "learning_rate": 4.322033898305085e-06, "loss": 0.6176, "mean_token_accuracy": 0.8184188008308411, "num_tokens": 40743349.0, "step": 256 }, { "epoch": 0.13072227873855544, "grad_norm": 1.152124285697937, "learning_rate": 4.338983050847458e-06, "loss": 0.6069, "mean_token_accuracy": 0.8179450035095215, "num_tokens": 40909931.0, "step": 257 }, { "epoch": 0.13123092573753814, "grad_norm": 1.133946418762207, "learning_rate": 4.355932203389831e-06, "loss": 0.616, "mean_token_accuracy": 0.8172526359558105, "num_tokens": 41064135.0, "step": 258 }, { "epoch": 0.13173957273652084, "grad_norm": 1.3298335075378418, "learning_rate": 4.372881355932203e-06, "loss": 0.6107, "mean_token_accuracy": 0.8188907504081726, "num_tokens": 41215121.0, "step": 259 }, { "epoch": 0.13224821973550355, "grad_norm": 1.2448811531066895, "learning_rate": 4.3898305084745765e-06, "loss": 0.6229, "mean_token_accuracy": 0.817119836807251, "num_tokens": 41373300.0, "step": 260 }, { "epoch": 0.13275686673448628, "grad_norm": 1.1549959182739258, "learning_rate": 4.40677966101695e-06, "loss": 0.6281, "mean_token_accuracy": 0.8145886659622192, "num_tokens": 41538822.0, "step": 261 }, { "epoch": 0.13326551373346898, "grad_norm": 1.3290430307388306, "learning_rate": 4.423728813559323e-06, "loss": 0.6715, "mean_token_accuracy": 0.8023037910461426, "num_tokens": 41699217.0, "step": 262 }, { "epoch": 0.13377416073245169, "grad_norm": 1.1931039094924927, "learning_rate": 4.440677966101695e-06, "loss": 0.6279, "mean_token_accuracy": 0.8135525584220886, "num_tokens": 41870291.0, "step": 263 }, { "epoch": 0.1342828077314344, "grad_norm": 1.2231743335723877, "learning_rate": 4.457627118644068e-06, "loss": 0.6595, "mean_token_accuracy": 0.8069267272949219, "num_tokens": 42029303.0, "step": 264 }, { "epoch": 0.1347914547304171, "grad_norm": 1.1536788940429688, "learning_rate": 4.474576271186441e-06, "loss": 0.6467, "mean_token_accuracy": 0.8107583522796631, "num_tokens": 42197688.0, "step": 265 }, { "epoch": 0.1353001017293998, "grad_norm": 1.23910391330719, "learning_rate": 4.491525423728814e-06, "loss": 0.6301, "mean_token_accuracy": 0.8137621879577637, "num_tokens": 42349174.0, "step": 266 }, { "epoch": 0.1358087487283825, "grad_norm": 1.164609432220459, "learning_rate": 4.508474576271187e-06, "loss": 0.6095, "mean_token_accuracy": 0.8188919425010681, "num_tokens": 42506878.0, "step": 267 }, { "epoch": 0.1363173957273652, "grad_norm": 1.158076286315918, "learning_rate": 4.52542372881356e-06, "loss": 0.6489, "mean_token_accuracy": 0.8091750741004944, "num_tokens": 42676569.0, "step": 268 }, { "epoch": 0.1368260427263479, "grad_norm": 1.172566533088684, "learning_rate": 4.542372881355933e-06, "loss": 0.639, "mean_token_accuracy": 0.8108097910881042, "num_tokens": 42831149.0, "step": 269 }, { "epoch": 0.1373346897253306, "grad_norm": 1.0729190111160278, "learning_rate": 4.559322033898305e-06, "loss": 0.6385, "mean_token_accuracy": 0.8113300800323486, "num_tokens": 42992073.0, "step": 270 }, { "epoch": 0.13784333672431331, "grad_norm": 1.0952930450439453, "learning_rate": 4.576271186440678e-06, "loss": 0.6504, "mean_token_accuracy": 0.8107742071151733, "num_tokens": 43157608.0, "step": 271 }, { "epoch": 0.13835198372329605, "grad_norm": 1.1373825073242188, "learning_rate": 4.5932203389830506e-06, "loss": 0.6053, "mean_token_accuracy": 0.8202582597732544, "num_tokens": 43319905.0, "step": 272 }, { "epoch": 0.13886063072227875, "grad_norm": 1.1396281719207764, "learning_rate": 4.610169491525424e-06, "loss": 0.6442, "mean_token_accuracy": 0.81023108959198, "num_tokens": 43483878.0, "step": 273 }, { "epoch": 0.13936927772126145, "grad_norm": 1.0831595659255981, "learning_rate": 4.627118644067797e-06, "loss": 0.6014, "mean_token_accuracy": 0.8223749995231628, "num_tokens": 43642021.0, "step": 274 }, { "epoch": 0.13987792472024416, "grad_norm": 1.2875261306762695, "learning_rate": 4.64406779661017e-06, "loss": 0.6157, "mean_token_accuracy": 0.8163620233535767, "num_tokens": 43788563.0, "step": 275 }, { "epoch": 0.14038657171922686, "grad_norm": 1.1488323211669922, "learning_rate": 4.6610169491525425e-06, "loss": 0.6473, "mean_token_accuracy": 0.8100237846374512, "num_tokens": 43948725.0, "step": 276 }, { "epoch": 0.14089521871820956, "grad_norm": 1.088779091835022, "learning_rate": 4.677966101694916e-06, "loss": 0.6068, "mean_token_accuracy": 0.819377064704895, "num_tokens": 44107222.0, "step": 277 }, { "epoch": 0.14140386571719227, "grad_norm": 1.162422776222229, "learning_rate": 4.694915254237288e-06, "loss": 0.6296, "mean_token_accuracy": 0.8144094944000244, "num_tokens": 44265793.0, "step": 278 }, { "epoch": 0.14191251271617497, "grad_norm": 1.285162091255188, "learning_rate": 4.711864406779661e-06, "loss": 0.6396, "mean_token_accuracy": 0.8107025027275085, "num_tokens": 44416663.0, "step": 279 }, { "epoch": 0.14242115971515767, "grad_norm": 1.2526861429214478, "learning_rate": 4.728813559322034e-06, "loss": 0.629, "mean_token_accuracy": 0.8151564598083496, "num_tokens": 44585677.0, "step": 280 }, { "epoch": 0.14292980671414038, "grad_norm": 1.271353006362915, "learning_rate": 4.745762711864408e-06, "loss": 0.6325, "mean_token_accuracy": 0.8143658638000488, "num_tokens": 44733031.0, "step": 281 }, { "epoch": 0.14343845371312308, "grad_norm": 1.6910256147384644, "learning_rate": 4.76271186440678e-06, "loss": 0.6524, "mean_token_accuracy": 0.8086466789245605, "num_tokens": 44896422.0, "step": 282 }, { "epoch": 0.14394710071210579, "grad_norm": 1.168068766593933, "learning_rate": 4.779661016949153e-06, "loss": 0.6153, "mean_token_accuracy": 0.8179129958152771, "num_tokens": 45063428.0, "step": 283 }, { "epoch": 0.14445574771108852, "grad_norm": 1.2636305093765259, "learning_rate": 4.7966101694915255e-06, "loss": 0.6059, "mean_token_accuracy": 0.8190077543258667, "num_tokens": 45222963.0, "step": 284 }, { "epoch": 0.14496439471007122, "grad_norm": 1.121416687965393, "learning_rate": 4.813559322033899e-06, "loss": 0.6207, "mean_token_accuracy": 0.8159518241882324, "num_tokens": 45369420.0, "step": 285 }, { "epoch": 0.14547304170905392, "grad_norm": 1.2177066802978516, "learning_rate": 4.830508474576272e-06, "loss": 0.589, "mean_token_accuracy": 0.8277342319488525, "num_tokens": 45525565.0, "step": 286 }, { "epoch": 0.14598168870803663, "grad_norm": 1.2821828126907349, "learning_rate": 4.847457627118645e-06, "loss": 0.6338, "mean_token_accuracy": 0.8130690455436707, "num_tokens": 45688704.0, "step": 287 }, { "epoch": 0.14649033570701933, "grad_norm": 1.1307625770568848, "learning_rate": 4.864406779661017e-06, "loss": 0.6274, "mean_token_accuracy": 0.813611626625061, "num_tokens": 45852191.0, "step": 288 }, { "epoch": 0.14699898270600203, "grad_norm": 1.269042730331421, "learning_rate": 4.881355932203391e-06, "loss": 0.6412, "mean_token_accuracy": 0.8145066499710083, "num_tokens": 46009944.0, "step": 289 }, { "epoch": 0.14750762970498474, "grad_norm": 1.25006103515625, "learning_rate": 4.898305084745763e-06, "loss": 0.6723, "mean_token_accuracy": 0.8050320148468018, "num_tokens": 46163464.0, "step": 290 }, { "epoch": 0.14801627670396744, "grad_norm": 1.1930915117263794, "learning_rate": 4.915254237288136e-06, "loss": 0.6295, "mean_token_accuracy": 0.8156224489212036, "num_tokens": 46315469.0, "step": 291 }, { "epoch": 0.14852492370295015, "grad_norm": 1.1504136323928833, "learning_rate": 4.9322033898305085e-06, "loss": 0.6161, "mean_token_accuracy": 0.8182114958763123, "num_tokens": 46475098.0, "step": 292 }, { "epoch": 0.14903357070193285, "grad_norm": 1.1462980508804321, "learning_rate": 4.949152542372882e-06, "loss": 0.6266, "mean_token_accuracy": 0.8146628141403198, "num_tokens": 46645510.0, "step": 293 }, { "epoch": 0.14954221770091555, "grad_norm": 1.1933327913284302, "learning_rate": 4.966101694915255e-06, "loss": 0.612, "mean_token_accuracy": 0.8185099959373474, "num_tokens": 46799760.0, "step": 294 }, { "epoch": 0.15005086469989828, "grad_norm": 1.2216078042984009, "learning_rate": 4.983050847457628e-06, "loss": 0.615, "mean_token_accuracy": 0.8170180320739746, "num_tokens": 46965987.0, "step": 295 }, { "epoch": 0.150559511698881, "grad_norm": 1.2336102724075317, "learning_rate": 5e-06, "loss": 0.5949, "mean_token_accuracy": 0.8221638202667236, "num_tokens": 47128771.0, "step": 296 }, { "epoch": 0.1510681586978637, "grad_norm": 1.1848920583724976, "learning_rate": 5.016949152542373e-06, "loss": 0.6477, "mean_token_accuracy": 0.8114307522773743, "num_tokens": 47288316.0, "step": 297 }, { "epoch": 0.1515768056968464, "grad_norm": 1.2134908437728882, "learning_rate": 5.033898305084746e-06, "loss": 0.6137, "mean_token_accuracy": 0.8162137269973755, "num_tokens": 47455167.0, "step": 298 }, { "epoch": 0.1520854526958291, "grad_norm": 1.1854346990585327, "learning_rate": 5.050847457627119e-06, "loss": 0.6353, "mean_token_accuracy": 0.8133347034454346, "num_tokens": 47600859.0, "step": 299 }, { "epoch": 0.1525940996948118, "grad_norm": 1.2845247983932495, "learning_rate": 5.067796610169492e-06, "loss": 0.6107, "mean_token_accuracy": 0.8182547688484192, "num_tokens": 47768491.0, "step": 300 }, { "epoch": 0.1531027466937945, "grad_norm": 1.2308179140090942, "learning_rate": 5.084745762711865e-06, "loss": 0.6199, "mean_token_accuracy": 0.8154051303863525, "num_tokens": 47918622.0, "step": 301 }, { "epoch": 0.1536113936927772, "grad_norm": 1.2153412103652954, "learning_rate": 5.101694915254237e-06, "loss": 0.5681, "mean_token_accuracy": 0.8302782773971558, "num_tokens": 48076479.0, "step": 302 }, { "epoch": 0.1541200406917599, "grad_norm": 1.2243883609771729, "learning_rate": 5.118644067796611e-06, "loss": 0.6139, "mean_token_accuracy": 0.8183121681213379, "num_tokens": 48226249.0, "step": 303 }, { "epoch": 0.15462868769074262, "grad_norm": 1.0889983177185059, "learning_rate": 5.135593220338983e-06, "loss": 0.5715, "mean_token_accuracy": 0.8308095335960388, "num_tokens": 48390048.0, "step": 304 }, { "epoch": 0.15513733468972532, "grad_norm": 1.2139381170272827, "learning_rate": 5.152542372881356e-06, "loss": 0.5961, "mean_token_accuracy": 0.822356104850769, "num_tokens": 48558970.0, "step": 305 }, { "epoch": 0.15564598168870802, "grad_norm": 1.1127007007598877, "learning_rate": 5.169491525423729e-06, "loss": 0.6033, "mean_token_accuracy": 0.8220300674438477, "num_tokens": 48712282.0, "step": 306 }, { "epoch": 0.15615462868769076, "grad_norm": 1.232642412185669, "learning_rate": 5.186440677966102e-06, "loss": 0.6247, "mean_token_accuracy": 0.8168089985847473, "num_tokens": 48874280.0, "step": 307 }, { "epoch": 0.15666327568667346, "grad_norm": 1.2171428203582764, "learning_rate": 5.203389830508475e-06, "loss": 0.6643, "mean_token_accuracy": 0.8060603141784668, "num_tokens": 49032570.0, "step": 308 }, { "epoch": 0.15717192268565616, "grad_norm": 1.1634498834609985, "learning_rate": 5.220338983050848e-06, "loss": 0.6029, "mean_token_accuracy": 0.8210587501525879, "num_tokens": 49206160.0, "step": 309 }, { "epoch": 0.15768056968463887, "grad_norm": 1.1851032972335815, "learning_rate": 5.23728813559322e-06, "loss": 0.6534, "mean_token_accuracy": 0.8060614466667175, "num_tokens": 49357658.0, "step": 310 }, { "epoch": 0.15818921668362157, "grad_norm": 1.223835825920105, "learning_rate": 5.254237288135594e-06, "loss": 0.6275, "mean_token_accuracy": 0.8136088252067566, "num_tokens": 49513574.0, "step": 311 }, { "epoch": 0.15869786368260427, "grad_norm": 1.1291638612747192, "learning_rate": 5.271186440677966e-06, "loss": 0.5827, "mean_token_accuracy": 0.8268251419067383, "num_tokens": 49678022.0, "step": 312 }, { "epoch": 0.15920651068158698, "grad_norm": 1.2867546081542969, "learning_rate": 5.28813559322034e-06, "loss": 0.6355, "mean_token_accuracy": 0.8134222030639648, "num_tokens": 49844519.0, "step": 313 }, { "epoch": 0.15971515768056968, "grad_norm": 1.2789193391799927, "learning_rate": 5.305084745762712e-06, "loss": 0.5766, "mean_token_accuracy": 0.8284808397293091, "num_tokens": 50003658.0, "step": 314 }, { "epoch": 0.16022380467955238, "grad_norm": 1.2516961097717285, "learning_rate": 5.322033898305086e-06, "loss": 0.6287, "mean_token_accuracy": 0.8147469758987427, "num_tokens": 50169430.0, "step": 315 }, { "epoch": 0.1607324516785351, "grad_norm": 1.2042748928070068, "learning_rate": 5.338983050847458e-06, "loss": 0.6007, "mean_token_accuracy": 0.8214442729949951, "num_tokens": 50337188.0, "step": 316 }, { "epoch": 0.1612410986775178, "grad_norm": 1.3558809757232666, "learning_rate": 5.355932203389831e-06, "loss": 0.6417, "mean_token_accuracy": 0.8108523488044739, "num_tokens": 50488304.0, "step": 317 }, { "epoch": 0.1617497456765005, "grad_norm": 1.2530821561813354, "learning_rate": 5.372881355932204e-06, "loss": 0.6376, "mean_token_accuracy": 0.8133925199508667, "num_tokens": 50649690.0, "step": 318 }, { "epoch": 0.16225839267548323, "grad_norm": 1.2648253440856934, "learning_rate": 5.389830508474577e-06, "loss": 0.5934, "mean_token_accuracy": 0.8214468955993652, "num_tokens": 50809934.0, "step": 319 }, { "epoch": 0.16276703967446593, "grad_norm": 1.289933204650879, "learning_rate": 5.40677966101695e-06, "loss": 0.6457, "mean_token_accuracy": 0.8087766170501709, "num_tokens": 50973651.0, "step": 320 }, { "epoch": 0.16327568667344863, "grad_norm": 1.2105209827423096, "learning_rate": 5.423728813559323e-06, "loss": 0.6661, "mean_token_accuracy": 0.8031639456748962, "num_tokens": 51152576.0, "step": 321 }, { "epoch": 0.16378433367243134, "grad_norm": 3.8657143115997314, "learning_rate": 5.440677966101695e-06, "loss": 0.5896, "mean_token_accuracy": 0.8247578740119934, "num_tokens": 51302446.0, "step": 322 }, { "epoch": 0.16429298067141404, "grad_norm": 1.6237268447875977, "learning_rate": 5.457627118644067e-06, "loss": 0.626, "mean_token_accuracy": 0.8159285187721252, "num_tokens": 51443754.0, "step": 323 }, { "epoch": 0.16480162767039674, "grad_norm": 1.179121971130371, "learning_rate": 5.474576271186441e-06, "loss": 0.6163, "mean_token_accuracy": 0.8178367018699646, "num_tokens": 51613127.0, "step": 324 }, { "epoch": 0.16531027466937945, "grad_norm": 1.1720213890075684, "learning_rate": 5.491525423728814e-06, "loss": 0.6054, "mean_token_accuracy": 0.8200127482414246, "num_tokens": 51778162.0, "step": 325 }, { "epoch": 0.16581892166836215, "grad_norm": 1.1967053413391113, "learning_rate": 5.508474576271187e-06, "loss": 0.602, "mean_token_accuracy": 0.8214297890663147, "num_tokens": 51935798.0, "step": 326 }, { "epoch": 0.16632756866734486, "grad_norm": 1.2503341436386108, "learning_rate": 5.525423728813559e-06, "loss": 0.6247, "mean_token_accuracy": 0.8145067691802979, "num_tokens": 52089803.0, "step": 327 }, { "epoch": 0.16683621566632756, "grad_norm": 1.2478593587875366, "learning_rate": 5.542372881355933e-06, "loss": 0.6118, "mean_token_accuracy": 0.8154499530792236, "num_tokens": 52241386.0, "step": 328 }, { "epoch": 0.16734486266531026, "grad_norm": 1.168043613433838, "learning_rate": 5.559322033898306e-06, "loss": 0.6269, "mean_token_accuracy": 0.8142362236976624, "num_tokens": 52397569.0, "step": 329 }, { "epoch": 0.167853509664293, "grad_norm": 1.290001630783081, "learning_rate": 5.576271186440678e-06, "loss": 0.6198, "mean_token_accuracy": 0.8157522678375244, "num_tokens": 52550468.0, "step": 330 }, { "epoch": 0.1683621566632757, "grad_norm": 1.2450792789459229, "learning_rate": 5.593220338983051e-06, "loss": 0.6044, "mean_token_accuracy": 0.8214341402053833, "num_tokens": 52705147.0, "step": 331 }, { "epoch": 0.1688708036622584, "grad_norm": 1.2368236780166626, "learning_rate": 5.610169491525424e-06, "loss": 0.6489, "mean_token_accuracy": 0.8084084987640381, "num_tokens": 52870603.0, "step": 332 }, { "epoch": 0.1693794506612411, "grad_norm": 1.301377296447754, "learning_rate": 5.6271186440677975e-06, "loss": 0.6287, "mean_token_accuracy": 0.8143856525421143, "num_tokens": 53027492.0, "step": 333 }, { "epoch": 0.1698880976602238, "grad_norm": 1.2675710916519165, "learning_rate": 5.64406779661017e-06, "loss": 0.6272, "mean_token_accuracy": 0.8145819902420044, "num_tokens": 53186596.0, "step": 334 }, { "epoch": 0.1703967446592065, "grad_norm": 1.2474238872528076, "learning_rate": 5.661016949152542e-06, "loss": 0.6111, "mean_token_accuracy": 0.8180166482925415, "num_tokens": 53341286.0, "step": 335 }, { "epoch": 0.17090539165818922, "grad_norm": 1.1779944896697998, "learning_rate": 5.677966101694916e-06, "loss": 0.6307, "mean_token_accuracy": 0.8118512034416199, "num_tokens": 53511107.0, "step": 336 }, { "epoch": 0.17141403865717192, "grad_norm": 1.151164174079895, "learning_rate": 5.694915254237289e-06, "loss": 0.5874, "mean_token_accuracy": 0.8255366086959839, "num_tokens": 53673861.0, "step": 337 }, { "epoch": 0.17192268565615462, "grad_norm": 1.3543330430984497, "learning_rate": 5.711864406779661e-06, "loss": 0.6194, "mean_token_accuracy": 0.8178123831748962, "num_tokens": 53829950.0, "step": 338 }, { "epoch": 0.17243133265513733, "grad_norm": 1.2115367650985718, "learning_rate": 5.728813559322034e-06, "loss": 0.6358, "mean_token_accuracy": 0.8141074180603027, "num_tokens": 53992561.0, "step": 339 }, { "epoch": 0.17293997965412003, "grad_norm": 1.124480128288269, "learning_rate": 5.745762711864407e-06, "loss": 0.6296, "mean_token_accuracy": 0.8138951063156128, "num_tokens": 54156414.0, "step": 340 }, { "epoch": 0.17344862665310273, "grad_norm": 1.212111234664917, "learning_rate": 5.7627118644067805e-06, "loss": 0.6212, "mean_token_accuracy": 0.8161656856536865, "num_tokens": 54315479.0, "step": 341 }, { "epoch": 0.17395727365208546, "grad_norm": 1.1296113729476929, "learning_rate": 5.779661016949153e-06, "loss": 0.6132, "mean_token_accuracy": 0.8164753913879395, "num_tokens": 54487854.0, "step": 342 }, { "epoch": 0.17446592065106817, "grad_norm": 1.1760289669036865, "learning_rate": 5.796610169491525e-06, "loss": 0.6085, "mean_token_accuracy": 0.8185662031173706, "num_tokens": 54637940.0, "step": 343 }, { "epoch": 0.17497456765005087, "grad_norm": 1.222051978111267, "learning_rate": 5.813559322033899e-06, "loss": 0.6142, "mean_token_accuracy": 0.8181614279747009, "num_tokens": 54799217.0, "step": 344 }, { "epoch": 0.17548321464903358, "grad_norm": 1.1629087924957275, "learning_rate": 5.830508474576272e-06, "loss": 0.601, "mean_token_accuracy": 0.8221548795700073, "num_tokens": 54952384.0, "step": 345 }, { "epoch": 0.17599186164801628, "grad_norm": 1.1986356973648071, "learning_rate": 5.847457627118645e-06, "loss": 0.622, "mean_token_accuracy": 0.8151795864105225, "num_tokens": 55126417.0, "step": 346 }, { "epoch": 0.17650050864699898, "grad_norm": 1.1795095205307007, "learning_rate": 5.864406779661017e-06, "loss": 0.6564, "mean_token_accuracy": 0.8053359985351562, "num_tokens": 55296549.0, "step": 347 }, { "epoch": 0.1770091556459817, "grad_norm": 1.2172720432281494, "learning_rate": 5.881355932203391e-06, "loss": 0.5921, "mean_token_accuracy": 0.8246195316314697, "num_tokens": 55457557.0, "step": 348 }, { "epoch": 0.1775178026449644, "grad_norm": 1.1696581840515137, "learning_rate": 5.8983050847457635e-06, "loss": 0.6151, "mean_token_accuracy": 0.8157768249511719, "num_tokens": 55615322.0, "step": 349 }, { "epoch": 0.1780264496439471, "grad_norm": 1.2739365100860596, "learning_rate": 5.915254237288136e-06, "loss": 0.6278, "mean_token_accuracy": 0.8150952458381653, "num_tokens": 55780975.0, "step": 350 }, { "epoch": 0.1785350966429298, "grad_norm": 1.2735313177108765, "learning_rate": 5.932203389830509e-06, "loss": 0.6269, "mean_token_accuracy": 0.8135517835617065, "num_tokens": 55931561.0, "step": 351 }, { "epoch": 0.1790437436419125, "grad_norm": 1.2213759422302246, "learning_rate": 5.949152542372881e-06, "loss": 0.6508, "mean_token_accuracy": 0.8071233034133911, "num_tokens": 56097949.0, "step": 352 }, { "epoch": 0.17955239064089523, "grad_norm": 1.164584994316101, "learning_rate": 5.9661016949152555e-06, "loss": 0.6596, "mean_token_accuracy": 0.8046541213989258, "num_tokens": 56256104.0, "step": 353 }, { "epoch": 0.18006103763987794, "grad_norm": 1.2391185760498047, "learning_rate": 5.983050847457628e-06, "loss": 0.6413, "mean_token_accuracy": 0.8112727403640747, "num_tokens": 56427795.0, "step": 354 }, { "epoch": 0.18056968463886064, "grad_norm": 1.2065702676773071, "learning_rate": 6e-06, "loss": 0.6143, "mean_token_accuracy": 0.8167493343353271, "num_tokens": 56580755.0, "step": 355 }, { "epoch": 0.18107833163784334, "grad_norm": 1.212113380432129, "learning_rate": 6.0169491525423725e-06, "loss": 0.6219, "mean_token_accuracy": 0.8156697750091553, "num_tokens": 56725887.0, "step": 356 }, { "epoch": 0.18158697863682605, "grad_norm": 1.1100784540176392, "learning_rate": 6.0338983050847465e-06, "loss": 0.6086, "mean_token_accuracy": 0.8173103332519531, "num_tokens": 56893128.0, "step": 357 }, { "epoch": 0.18209562563580875, "grad_norm": 1.0563573837280273, "learning_rate": 6.050847457627119e-06, "loss": 0.6011, "mean_token_accuracy": 0.821357250213623, "num_tokens": 57066741.0, "step": 358 }, { "epoch": 0.18260427263479145, "grad_norm": 1.2157350778579712, "learning_rate": 6.067796610169492e-06, "loss": 0.5994, "mean_token_accuracy": 0.8220726847648621, "num_tokens": 57215386.0, "step": 359 }, { "epoch": 0.18311291963377416, "grad_norm": 1.2231732606887817, "learning_rate": 6.084745762711864e-06, "loss": 0.6222, "mean_token_accuracy": 0.8157521486282349, "num_tokens": 57382943.0, "step": 360 }, { "epoch": 0.18362156663275686, "grad_norm": 1.1992939710617065, "learning_rate": 6.1016949152542385e-06, "loss": 0.6004, "mean_token_accuracy": 0.8207837343215942, "num_tokens": 57533799.0, "step": 361 }, { "epoch": 0.18413021363173956, "grad_norm": 1.2414939403533936, "learning_rate": 6.118644067796611e-06, "loss": 0.6133, "mean_token_accuracy": 0.8167545199394226, "num_tokens": 57697387.0, "step": 362 }, { "epoch": 0.18463886063072227, "grad_norm": 1.1865259408950806, "learning_rate": 6.135593220338983e-06, "loss": 0.6199, "mean_token_accuracy": 0.8168951869010925, "num_tokens": 57855776.0, "step": 363 }, { "epoch": 0.18514750762970497, "grad_norm": 1.1268796920776367, "learning_rate": 6.152542372881356e-06, "loss": 0.6, "mean_token_accuracy": 0.8215888738632202, "num_tokens": 58024333.0, "step": 364 }, { "epoch": 0.1856561546286877, "grad_norm": 1.254247784614563, "learning_rate": 6.1694915254237295e-06, "loss": 0.6002, "mean_token_accuracy": 0.8212365508079529, "num_tokens": 58184182.0, "step": 365 }, { "epoch": 0.1861648016276704, "grad_norm": 1.1949880123138428, "learning_rate": 6.186440677966103e-06, "loss": 0.6216, "mean_token_accuracy": 0.8140779733657837, "num_tokens": 58338761.0, "step": 366 }, { "epoch": 0.1866734486266531, "grad_norm": 1.2143034934997559, "learning_rate": 6.203389830508475e-06, "loss": 0.621, "mean_token_accuracy": 0.8147321939468384, "num_tokens": 58485678.0, "step": 367 }, { "epoch": 0.1871820956256358, "grad_norm": 1.2522412538528442, "learning_rate": 6.2203389830508474e-06, "loss": 0.6444, "mean_token_accuracy": 0.8117758631706238, "num_tokens": 58643138.0, "step": 368 }, { "epoch": 0.18769074262461852, "grad_norm": 1.1688337326049805, "learning_rate": 6.2372881355932215e-06, "loss": 0.5978, "mean_token_accuracy": 0.821524441242218, "num_tokens": 58804789.0, "step": 369 }, { "epoch": 0.18819938962360122, "grad_norm": 1.1917411088943481, "learning_rate": 6.254237288135594e-06, "loss": 0.6014, "mean_token_accuracy": 0.8203130960464478, "num_tokens": 58978624.0, "step": 370 }, { "epoch": 0.18870803662258392, "grad_norm": 1.1307294368743896, "learning_rate": 6.271186440677966e-06, "loss": 0.5915, "mean_token_accuracy": 0.8264414668083191, "num_tokens": 59150156.0, "step": 371 }, { "epoch": 0.18921668362156663, "grad_norm": 1.1567331552505493, "learning_rate": 6.288135593220339e-06, "loss": 0.5645, "mean_token_accuracy": 0.8300540447235107, "num_tokens": 59307831.0, "step": 372 }, { "epoch": 0.18972533062054933, "grad_norm": 1.2923393249511719, "learning_rate": 6.3050847457627125e-06, "loss": 0.6242, "mean_token_accuracy": 0.8148703575134277, "num_tokens": 59458464.0, "step": 373 }, { "epoch": 0.19023397761953204, "grad_norm": 1.2552546262741089, "learning_rate": 6.322033898305086e-06, "loss": 0.6044, "mean_token_accuracy": 0.821365237236023, "num_tokens": 59612951.0, "step": 374 }, { "epoch": 0.19074262461851474, "grad_norm": 1.2645580768585205, "learning_rate": 6.338983050847458e-06, "loss": 0.5737, "mean_token_accuracy": 0.8263217806816101, "num_tokens": 59774372.0, "step": 375 }, { "epoch": 0.19125127161749747, "grad_norm": 1.2015972137451172, "learning_rate": 6.3559322033898304e-06, "loss": 0.5775, "mean_token_accuracy": 0.8266362547874451, "num_tokens": 59940444.0, "step": 376 }, { "epoch": 0.19175991861648017, "grad_norm": 1.3558928966522217, "learning_rate": 6.372881355932204e-06, "loss": 0.6211, "mean_token_accuracy": 0.8140040636062622, "num_tokens": 60097403.0, "step": 377 }, { "epoch": 0.19226856561546288, "grad_norm": 1.1792396306991577, "learning_rate": 6.389830508474577e-06, "loss": 0.5988, "mean_token_accuracy": 0.820154070854187, "num_tokens": 60270518.0, "step": 378 }, { "epoch": 0.19277721261444558, "grad_norm": 1.29557204246521, "learning_rate": 6.40677966101695e-06, "loss": 0.5914, "mean_token_accuracy": 0.8230335712432861, "num_tokens": 60434705.0, "step": 379 }, { "epoch": 0.19328585961342828, "grad_norm": 1.251295804977417, "learning_rate": 6.423728813559322e-06, "loss": 0.6274, "mean_token_accuracy": 0.814145028591156, "num_tokens": 60587162.0, "step": 380 }, { "epoch": 0.193794506612411, "grad_norm": 1.1563841104507446, "learning_rate": 6.440677966101695e-06, "loss": 0.5998, "mean_token_accuracy": 0.8219859600067139, "num_tokens": 60749142.0, "step": 381 }, { "epoch": 0.1943031536113937, "grad_norm": 1.303562879562378, "learning_rate": 6.457627118644069e-06, "loss": 0.5886, "mean_token_accuracy": 0.823674201965332, "num_tokens": 60917294.0, "step": 382 }, { "epoch": 0.1948118006103764, "grad_norm": 1.3838107585906982, "learning_rate": 6.474576271186441e-06, "loss": 0.5929, "mean_token_accuracy": 0.8225725889205933, "num_tokens": 61084543.0, "step": 383 }, { "epoch": 0.1953204476093591, "grad_norm": 1.2146414518356323, "learning_rate": 6.491525423728814e-06, "loss": 0.6255, "mean_token_accuracy": 0.8173696398735046, "num_tokens": 61248579.0, "step": 384 }, { "epoch": 0.1958290946083418, "grad_norm": 1.264757752418518, "learning_rate": 6.508474576271187e-06, "loss": 0.6135, "mean_token_accuracy": 0.8174546360969543, "num_tokens": 61403557.0, "step": 385 }, { "epoch": 0.1963377416073245, "grad_norm": 1.2781434059143066, "learning_rate": 6.52542372881356e-06, "loss": 0.6065, "mean_token_accuracy": 0.8191805481910706, "num_tokens": 61568050.0, "step": 386 }, { "epoch": 0.1968463886063072, "grad_norm": 1.1514405012130737, "learning_rate": 6.542372881355933e-06, "loss": 0.6281, "mean_token_accuracy": 0.8139955997467041, "num_tokens": 61735941.0, "step": 387 }, { "epoch": 0.19735503560528994, "grad_norm": 1.1844816207885742, "learning_rate": 6.559322033898305e-06, "loss": 0.5633, "mean_token_accuracy": 0.8317704200744629, "num_tokens": 61895500.0, "step": 388 }, { "epoch": 0.19786368260427264, "grad_norm": 1.1167271137237549, "learning_rate": 6.576271186440678e-06, "loss": 0.6092, "mean_token_accuracy": 0.8186742067337036, "num_tokens": 62062619.0, "step": 389 }, { "epoch": 0.19837232960325535, "grad_norm": 1.2830709218978882, "learning_rate": 6.593220338983052e-06, "loss": 0.6137, "mean_token_accuracy": 0.8206446766853333, "num_tokens": 62247115.0, "step": 390 }, { "epoch": 0.19888097660223805, "grad_norm": 1.105200171470642, "learning_rate": 6.610169491525424e-06, "loss": 0.6043, "mean_token_accuracy": 0.8199334740638733, "num_tokens": 62404051.0, "step": 391 }, { "epoch": 0.19938962360122076, "grad_norm": 4.479599952697754, "learning_rate": 6.627118644067797e-06, "loss": 0.6184, "mean_token_accuracy": 0.8156611919403076, "num_tokens": 62559441.0, "step": 392 }, { "epoch": 0.19989827060020346, "grad_norm": 1.32205069065094, "learning_rate": 6.64406779661017e-06, "loss": 0.6477, "mean_token_accuracy": 0.8093985915184021, "num_tokens": 62719791.0, "step": 393 }, { "epoch": 0.20040691759918616, "grad_norm": 1.2664779424667358, "learning_rate": 6.661016949152544e-06, "loss": 0.6088, "mean_token_accuracy": 0.818210780620575, "num_tokens": 62863338.0, "step": 394 }, { "epoch": 0.20091556459816887, "grad_norm": 1.157365083694458, "learning_rate": 6.677966101694916e-06, "loss": 0.5954, "mean_token_accuracy": 0.8220131397247314, "num_tokens": 63024566.0, "step": 395 }, { "epoch": 0.20142421159715157, "grad_norm": 1.1239126920700073, "learning_rate": 6.694915254237288e-06, "loss": 0.5648, "mean_token_accuracy": 0.8296006917953491, "num_tokens": 63189778.0, "step": 396 }, { "epoch": 0.20193285859613427, "grad_norm": 1.2238695621490479, "learning_rate": 6.7118644067796615e-06, "loss": 0.6229, "mean_token_accuracy": 0.8139328956604004, "num_tokens": 63347541.0, "step": 397 }, { "epoch": 0.20244150559511698, "grad_norm": 1.1842625141143799, "learning_rate": 6.728813559322035e-06, "loss": 0.6311, "mean_token_accuracy": 0.8118071556091309, "num_tokens": 63513614.0, "step": 398 }, { "epoch": 0.2029501525940997, "grad_norm": 1.3057259321212769, "learning_rate": 6.745762711864408e-06, "loss": 0.6071, "mean_token_accuracy": 0.8176935315132141, "num_tokens": 63657699.0, "step": 399 }, { "epoch": 0.2034587995930824, "grad_norm": 1.132493495941162, "learning_rate": 6.76271186440678e-06, "loss": 0.6029, "mean_token_accuracy": 0.8205714821815491, "num_tokens": 63810862.0, "step": 400 }, { "epoch": 0.20396744659206512, "grad_norm": 1.1689492464065552, "learning_rate": 6.779661016949153e-06, "loss": 0.6263, "mean_token_accuracy": 0.8131099939346313, "num_tokens": 63976337.0, "step": 401 }, { "epoch": 0.20447609359104782, "grad_norm": 1.246856927871704, "learning_rate": 6.796610169491527e-06, "loss": 0.6094, "mean_token_accuracy": 0.8199894428253174, "num_tokens": 64131809.0, "step": 402 }, { "epoch": 0.20498474059003052, "grad_norm": 1.1247804164886475, "learning_rate": 6.813559322033899e-06, "loss": 0.6244, "mean_token_accuracy": 0.8156639337539673, "num_tokens": 64301531.0, "step": 403 }, { "epoch": 0.20549338758901323, "grad_norm": 1.205808401107788, "learning_rate": 6.830508474576271e-06, "loss": 0.6042, "mean_token_accuracy": 0.8193472623825073, "num_tokens": 64453734.0, "step": 404 }, { "epoch": 0.20600203458799593, "grad_norm": 1.227185845375061, "learning_rate": 6.8474576271186445e-06, "loss": 0.6337, "mean_token_accuracy": 0.814788818359375, "num_tokens": 64614638.0, "step": 405 }, { "epoch": 0.20651068158697863, "grad_norm": 1.187564730644226, "learning_rate": 6.864406779661017e-06, "loss": 0.5787, "mean_token_accuracy": 0.8256047368049622, "num_tokens": 64783877.0, "step": 406 }, { "epoch": 0.20701932858596134, "grad_norm": 1.1679338216781616, "learning_rate": 6.881355932203391e-06, "loss": 0.6134, "mean_token_accuracy": 0.8183737993240356, "num_tokens": 64946232.0, "step": 407 }, { "epoch": 0.20752797558494404, "grad_norm": 1.1042274236679077, "learning_rate": 6.898305084745763e-06, "loss": 0.5988, "mean_token_accuracy": 0.821270227432251, "num_tokens": 65105547.0, "step": 408 }, { "epoch": 0.20803662258392674, "grad_norm": 1.2437533140182495, "learning_rate": 6.915254237288136e-06, "loss": 0.6193, "mean_token_accuracy": 0.8171321153640747, "num_tokens": 65262260.0, "step": 409 }, { "epoch": 0.20854526958290945, "grad_norm": 1.145785927772522, "learning_rate": 6.932203389830509e-06, "loss": 0.6059, "mean_token_accuracy": 0.8211365938186646, "num_tokens": 65425596.0, "step": 410 }, { "epoch": 0.20905391658189218, "grad_norm": 1.1632986068725586, "learning_rate": 6.949152542372882e-06, "loss": 0.6114, "mean_token_accuracy": 0.8181918263435364, "num_tokens": 65582379.0, "step": 411 }, { "epoch": 0.20956256358087488, "grad_norm": 1.200835108757019, "learning_rate": 6.966101694915255e-06, "loss": 0.6299, "mean_token_accuracy": 0.8122112154960632, "num_tokens": 65753543.0, "step": 412 }, { "epoch": 0.2100712105798576, "grad_norm": 1.144328236579895, "learning_rate": 6.9830508474576275e-06, "loss": 0.5904, "mean_token_accuracy": 0.8234543800354004, "num_tokens": 65920741.0, "step": 413 }, { "epoch": 0.2105798575788403, "grad_norm": 1.059288740158081, "learning_rate": 7e-06, "loss": 0.5766, "mean_token_accuracy": 0.8277970552444458, "num_tokens": 66079261.0, "step": 414 }, { "epoch": 0.211088504577823, "grad_norm": 1.083725929260254, "learning_rate": 7.016949152542374e-06, "loss": 0.6064, "mean_token_accuracy": 0.8182491064071655, "num_tokens": 66249455.0, "step": 415 }, { "epoch": 0.2115971515768057, "grad_norm": 1.195258617401123, "learning_rate": 7.033898305084746e-06, "loss": 0.6339, "mean_token_accuracy": 0.811680793762207, "num_tokens": 66410765.0, "step": 416 }, { "epoch": 0.2121057985757884, "grad_norm": 1.1450555324554443, "learning_rate": 7.0508474576271195e-06, "loss": 0.5983, "mean_token_accuracy": 0.8226056098937988, "num_tokens": 66564476.0, "step": 417 }, { "epoch": 0.2126144455747711, "grad_norm": 1.1363308429718018, "learning_rate": 7.067796610169492e-06, "loss": 0.6271, "mean_token_accuracy": 0.8152729272842407, "num_tokens": 66737804.0, "step": 418 }, { "epoch": 0.2131230925737538, "grad_norm": 1.1649593114852905, "learning_rate": 7.084745762711865e-06, "loss": 0.6233, "mean_token_accuracy": 0.8146108388900757, "num_tokens": 66895240.0, "step": 419 }, { "epoch": 0.2136317395727365, "grad_norm": 1.182517647743225, "learning_rate": 7.101694915254238e-06, "loss": 0.6034, "mean_token_accuracy": 0.8202614188194275, "num_tokens": 67056288.0, "step": 420 }, { "epoch": 0.21414038657171922, "grad_norm": 1.193202018737793, "learning_rate": 7.1186440677966106e-06, "loss": 0.6147, "mean_token_accuracy": 0.8172149658203125, "num_tokens": 67206207.0, "step": 421 }, { "epoch": 0.21464903357070192, "grad_norm": 1.1349166631698608, "learning_rate": 7.135593220338983e-06, "loss": 0.5734, "mean_token_accuracy": 0.8272489309310913, "num_tokens": 67369756.0, "step": 422 }, { "epoch": 0.21515768056968465, "grad_norm": 1.1669480800628662, "learning_rate": 7.152542372881357e-06, "loss": 0.586, "mean_token_accuracy": 0.822455883026123, "num_tokens": 67527516.0, "step": 423 }, { "epoch": 0.21566632756866735, "grad_norm": 1.1813911199569702, "learning_rate": 7.169491525423729e-06, "loss": 0.5945, "mean_token_accuracy": 0.8220915794372559, "num_tokens": 67688042.0, "step": 424 }, { "epoch": 0.21617497456765006, "grad_norm": 1.2895721197128296, "learning_rate": 7.1864406779661025e-06, "loss": 0.5985, "mean_token_accuracy": 0.8207123279571533, "num_tokens": 67854191.0, "step": 425 }, { "epoch": 0.21668362156663276, "grad_norm": 1.1939265727996826, "learning_rate": 7.203389830508475e-06, "loss": 0.5893, "mean_token_accuracy": 0.8222360610961914, "num_tokens": 68008533.0, "step": 426 }, { "epoch": 0.21719226856561547, "grad_norm": 1.2244884967803955, "learning_rate": 7.220338983050849e-06, "loss": 0.6094, "mean_token_accuracy": 0.8172198534011841, "num_tokens": 68162894.0, "step": 427 }, { "epoch": 0.21770091556459817, "grad_norm": 1.1649295091629028, "learning_rate": 7.237288135593221e-06, "loss": 0.5621, "mean_token_accuracy": 0.8311367630958557, "num_tokens": 68332401.0, "step": 428 }, { "epoch": 0.21820956256358087, "grad_norm": 1.2211418151855469, "learning_rate": 7.2542372881355936e-06, "loss": 0.5759, "mean_token_accuracy": 0.826878547668457, "num_tokens": 68489170.0, "step": 429 }, { "epoch": 0.21871820956256358, "grad_norm": 1.3775643110275269, "learning_rate": 7.271186440677967e-06, "loss": 0.6185, "mean_token_accuracy": 0.8159877061843872, "num_tokens": 68643444.0, "step": 430 }, { "epoch": 0.21922685656154628, "grad_norm": 1.2026543617248535, "learning_rate": 7.288135593220339e-06, "loss": 0.6023, "mean_token_accuracy": 0.8209705352783203, "num_tokens": 68812499.0, "step": 431 }, { "epoch": 0.21973550356052898, "grad_norm": 1.309005856513977, "learning_rate": 7.305084745762713e-06, "loss": 0.5912, "mean_token_accuracy": 0.8220794200897217, "num_tokens": 68976455.0, "step": 432 }, { "epoch": 0.2202441505595117, "grad_norm": 1.4292582273483276, "learning_rate": 7.3220338983050855e-06, "loss": 0.6372, "mean_token_accuracy": 0.8120876550674438, "num_tokens": 69142925.0, "step": 433 }, { "epoch": 0.22075279755849442, "grad_norm": 1.242922306060791, "learning_rate": 7.338983050847458e-06, "loss": 0.6253, "mean_token_accuracy": 0.8133541345596313, "num_tokens": 69307832.0, "step": 434 }, { "epoch": 0.22126144455747712, "grad_norm": 1.163408637046814, "learning_rate": 7.355932203389831e-06, "loss": 0.5749, "mean_token_accuracy": 0.8260928392410278, "num_tokens": 69466104.0, "step": 435 }, { "epoch": 0.22177009155645983, "grad_norm": 1.1773909330368042, "learning_rate": 7.372881355932204e-06, "loss": 0.5611, "mean_token_accuracy": 0.8318485021591187, "num_tokens": 69619859.0, "step": 436 }, { "epoch": 0.22227873855544253, "grad_norm": 1.2546722888946533, "learning_rate": 7.3898305084745766e-06, "loss": 0.5674, "mean_token_accuracy": 0.8298258781433105, "num_tokens": 69774687.0, "step": 437 }, { "epoch": 0.22278738555442523, "grad_norm": 1.3101048469543457, "learning_rate": 7.40677966101695e-06, "loss": 0.6228, "mean_token_accuracy": 0.8147264719009399, "num_tokens": 69924843.0, "step": 438 }, { "epoch": 0.22329603255340794, "grad_norm": 1.2783973217010498, "learning_rate": 7.423728813559322e-06, "loss": 0.6056, "mean_token_accuracy": 0.8204407691955566, "num_tokens": 70088441.0, "step": 439 }, { "epoch": 0.22380467955239064, "grad_norm": 1.2047439813613892, "learning_rate": 7.440677966101696e-06, "loss": 0.5955, "mean_token_accuracy": 0.8198369741439819, "num_tokens": 70244433.0, "step": 440 }, { "epoch": 0.22431332655137334, "grad_norm": 1.2373347282409668, "learning_rate": 7.4576271186440685e-06, "loss": 0.6113, "mean_token_accuracy": 0.8163229823112488, "num_tokens": 70412934.0, "step": 441 }, { "epoch": 0.22482197355035605, "grad_norm": 1.2173701524734497, "learning_rate": 7.474576271186441e-06, "loss": 0.6004, "mean_token_accuracy": 0.8211686015129089, "num_tokens": 70568120.0, "step": 442 }, { "epoch": 0.22533062054933875, "grad_norm": 1.2755416631698608, "learning_rate": 7.491525423728814e-06, "loss": 0.6173, "mean_token_accuracy": 0.8141643404960632, "num_tokens": 70725251.0, "step": 443 }, { "epoch": 0.22583926754832145, "grad_norm": 1.3401257991790771, "learning_rate": 7.508474576271187e-06, "loss": 0.613, "mean_token_accuracy": 0.8169766068458557, "num_tokens": 70881374.0, "step": 444 }, { "epoch": 0.22634791454730416, "grad_norm": 1.2987544536590576, "learning_rate": 7.52542372881356e-06, "loss": 0.6096, "mean_token_accuracy": 0.817240834236145, "num_tokens": 71029051.0, "step": 445 }, { "epoch": 0.2268565615462869, "grad_norm": 1.3045681715011597, "learning_rate": 7.542372881355933e-06, "loss": 0.5953, "mean_token_accuracy": 0.8227115869522095, "num_tokens": 71192310.0, "step": 446 }, { "epoch": 0.2273652085452696, "grad_norm": 1.2596161365509033, "learning_rate": 7.559322033898305e-06, "loss": 0.6205, "mean_token_accuracy": 0.8148484230041504, "num_tokens": 71355710.0, "step": 447 }, { "epoch": 0.2278738555442523, "grad_norm": 1.2501027584075928, "learning_rate": 7.576271186440679e-06, "loss": 0.624, "mean_token_accuracy": 0.8155069351196289, "num_tokens": 71509580.0, "step": 448 }, { "epoch": 0.228382502543235, "grad_norm": 1.1821587085723877, "learning_rate": 7.5932203389830515e-06, "loss": 0.5762, "mean_token_accuracy": 0.8275756239891052, "num_tokens": 71666995.0, "step": 449 }, { "epoch": 0.2288911495422177, "grad_norm": 1.2694047689437866, "learning_rate": 7.610169491525425e-06, "loss": 0.5938, "mean_token_accuracy": 0.8225254416465759, "num_tokens": 71825201.0, "step": 450 }, { "epoch": 0.2293997965412004, "grad_norm": 1.2232170104980469, "learning_rate": 7.627118644067797e-06, "loss": 0.5787, "mean_token_accuracy": 0.8244859576225281, "num_tokens": 71984961.0, "step": 451 }, { "epoch": 0.2299084435401831, "grad_norm": 1.261027455329895, "learning_rate": 7.64406779661017e-06, "loss": 0.6008, "mean_token_accuracy": 0.8211561441421509, "num_tokens": 72158603.0, "step": 452 }, { "epoch": 0.23041709053916581, "grad_norm": 1.2971307039260864, "learning_rate": 7.661016949152543e-06, "loss": 0.5777, "mean_token_accuracy": 0.8278082609176636, "num_tokens": 72313855.0, "step": 453 }, { "epoch": 0.23092573753814852, "grad_norm": 1.1784662008285522, "learning_rate": 7.677966101694917e-06, "loss": 0.5981, "mean_token_accuracy": 0.8198465704917908, "num_tokens": 72476307.0, "step": 454 }, { "epoch": 0.23143438453713122, "grad_norm": 1.2892508506774902, "learning_rate": 7.694915254237289e-06, "loss": 0.5903, "mean_token_accuracy": 0.8216632604598999, "num_tokens": 72630768.0, "step": 455 }, { "epoch": 0.23194303153611392, "grad_norm": 1.192068338394165, "learning_rate": 7.711864406779663e-06, "loss": 0.5911, "mean_token_accuracy": 0.8217214345932007, "num_tokens": 72805014.0, "step": 456 }, { "epoch": 0.23245167853509666, "grad_norm": 1.2026872634887695, "learning_rate": 7.728813559322035e-06, "loss": 0.5546, "mean_token_accuracy": 0.8341190218925476, "num_tokens": 72965078.0, "step": 457 }, { "epoch": 0.23296032553407936, "grad_norm": 1.1919342279434204, "learning_rate": 7.745762711864408e-06, "loss": 0.574, "mean_token_accuracy": 0.8285409808158875, "num_tokens": 73123811.0, "step": 458 }, { "epoch": 0.23346897253306206, "grad_norm": 1.2350062131881714, "learning_rate": 7.76271186440678e-06, "loss": 0.5785, "mean_token_accuracy": 0.8264639377593994, "num_tokens": 73286856.0, "step": 459 }, { "epoch": 0.23397761953204477, "grad_norm": 1.1893889904022217, "learning_rate": 7.779661016949152e-06, "loss": 0.6018, "mean_token_accuracy": 0.8182100057601929, "num_tokens": 73432474.0, "step": 460 }, { "epoch": 0.23448626653102747, "grad_norm": 1.2224371433258057, "learning_rate": 7.796610169491526e-06, "loss": 0.5944, "mean_token_accuracy": 0.8205611705780029, "num_tokens": 73589719.0, "step": 461 }, { "epoch": 0.23499491353001017, "grad_norm": 1.1163181066513062, "learning_rate": 7.813559322033899e-06, "loss": 0.5929, "mean_token_accuracy": 0.8211892247200012, "num_tokens": 73759617.0, "step": 462 }, { "epoch": 0.23550356052899288, "grad_norm": 1.317673683166504, "learning_rate": 7.830508474576271e-06, "loss": 0.6155, "mean_token_accuracy": 0.8157134056091309, "num_tokens": 73911743.0, "step": 463 }, { "epoch": 0.23601220752797558, "grad_norm": 1.2159312963485718, "learning_rate": 7.847457627118643e-06, "loss": 0.6012, "mean_token_accuracy": 0.8198647499084473, "num_tokens": 74082923.0, "step": 464 }, { "epoch": 0.23652085452695829, "grad_norm": 1.2829540967941284, "learning_rate": 7.864406779661017e-06, "loss": 0.5893, "mean_token_accuracy": 0.8235445022583008, "num_tokens": 74243087.0, "step": 465 }, { "epoch": 0.237029501525941, "grad_norm": 1.17428457736969, "learning_rate": 7.88135593220339e-06, "loss": 0.5769, "mean_token_accuracy": 0.8249427080154419, "num_tokens": 74410401.0, "step": 466 }, { "epoch": 0.2375381485249237, "grad_norm": 1.2670059204101562, "learning_rate": 7.898305084745764e-06, "loss": 0.5923, "mean_token_accuracy": 0.8222395777702332, "num_tokens": 74568603.0, "step": 467 }, { "epoch": 0.2380467955239064, "grad_norm": 1.3219184875488281, "learning_rate": 7.915254237288136e-06, "loss": 0.6009, "mean_token_accuracy": 0.8215954303741455, "num_tokens": 74727709.0, "step": 468 }, { "epoch": 0.23855544252288913, "grad_norm": 1.1674619913101196, "learning_rate": 7.93220338983051e-06, "loss": 0.5822, "mean_token_accuracy": 0.8243297338485718, "num_tokens": 74890980.0, "step": 469 }, { "epoch": 0.23906408952187183, "grad_norm": 1.2517329454421997, "learning_rate": 7.949152542372883e-06, "loss": 0.5495, "mean_token_accuracy": 0.8352354764938354, "num_tokens": 75056158.0, "step": 470 }, { "epoch": 0.23957273652085453, "grad_norm": 1.224470615386963, "learning_rate": 7.966101694915255e-06, "loss": 0.5858, "mean_token_accuracy": 0.8256843090057373, "num_tokens": 75217454.0, "step": 471 }, { "epoch": 0.24008138351983724, "grad_norm": 1.2624529600143433, "learning_rate": 7.983050847457627e-06, "loss": 0.5861, "mean_token_accuracy": 0.8228200674057007, "num_tokens": 75380129.0, "step": 472 }, { "epoch": 0.24059003051881994, "grad_norm": 1.237319827079773, "learning_rate": 8.000000000000001e-06, "loss": 0.5642, "mean_token_accuracy": 0.8284198045730591, "num_tokens": 75538207.0, "step": 473 }, { "epoch": 0.24109867751780265, "grad_norm": 1.1995705366134644, "learning_rate": 8.016949152542374e-06, "loss": 0.5607, "mean_token_accuracy": 0.8310052156448364, "num_tokens": 75692158.0, "step": 474 }, { "epoch": 0.24160732451678535, "grad_norm": 1.2999194860458374, "learning_rate": 8.033898305084746e-06, "loss": 0.5739, "mean_token_accuracy": 0.8259695768356323, "num_tokens": 75850325.0, "step": 475 }, { "epoch": 0.24211597151576805, "grad_norm": 1.207257628440857, "learning_rate": 8.050847457627118e-06, "loss": 0.5581, "mean_token_accuracy": 0.8317427635192871, "num_tokens": 76015367.0, "step": 476 }, { "epoch": 0.24262461851475076, "grad_norm": 1.1974592208862305, "learning_rate": 8.067796610169492e-06, "loss": 0.5692, "mean_token_accuracy": 0.8268818855285645, "num_tokens": 76165209.0, "step": 477 }, { "epoch": 0.24313326551373346, "grad_norm": 1.3411630392074585, "learning_rate": 8.084745762711865e-06, "loss": 0.6101, "mean_token_accuracy": 0.8171261548995972, "num_tokens": 76309618.0, "step": 478 }, { "epoch": 0.24364191251271616, "grad_norm": 1.2520160675048828, "learning_rate": 8.101694915254237e-06, "loss": 0.5645, "mean_token_accuracy": 0.8279448747634888, "num_tokens": 76468138.0, "step": 479 }, { "epoch": 0.2441505595116989, "grad_norm": 1.327293038368225, "learning_rate": 8.118644067796611e-06, "loss": 0.5847, "mean_token_accuracy": 0.8226466178894043, "num_tokens": 76638347.0, "step": 480 }, { "epoch": 0.2446592065106816, "grad_norm": 1.2002809047698975, "learning_rate": 8.135593220338983e-06, "loss": 0.6024, "mean_token_accuracy": 0.8184869289398193, "num_tokens": 76807912.0, "step": 481 }, { "epoch": 0.2451678535096643, "grad_norm": 1.269645094871521, "learning_rate": 8.152542372881358e-06, "loss": 0.5857, "mean_token_accuracy": 0.8238763809204102, "num_tokens": 76960377.0, "step": 482 }, { "epoch": 0.245676500508647, "grad_norm": 1.2257829904556274, "learning_rate": 8.16949152542373e-06, "loss": 0.644, "mean_token_accuracy": 0.8092539310455322, "num_tokens": 77135219.0, "step": 483 }, { "epoch": 0.2461851475076297, "grad_norm": 1.2210476398468018, "learning_rate": 8.186440677966102e-06, "loss": 0.6071, "mean_token_accuracy": 0.8186261653900146, "num_tokens": 77284277.0, "step": 484 }, { "epoch": 0.2466937945066124, "grad_norm": 1.1874899864196777, "learning_rate": 8.203389830508475e-06, "loss": 0.5955, "mean_token_accuracy": 0.8232449293136597, "num_tokens": 77463576.0, "step": 485 }, { "epoch": 0.24720244150559512, "grad_norm": 1.288148283958435, "learning_rate": 8.220338983050849e-06, "loss": 0.5746, "mean_token_accuracy": 0.8276445865631104, "num_tokens": 77624014.0, "step": 486 }, { "epoch": 0.24771108850457782, "grad_norm": 1.3062938451766968, "learning_rate": 8.237288135593221e-06, "loss": 0.5965, "mean_token_accuracy": 0.8204386234283447, "num_tokens": 77786805.0, "step": 487 }, { "epoch": 0.24821973550356052, "grad_norm": 1.3395079374313354, "learning_rate": 8.254237288135593e-06, "loss": 0.6057, "mean_token_accuracy": 0.8180563449859619, "num_tokens": 77955105.0, "step": 488 }, { "epoch": 0.24872838250254323, "grad_norm": 1.370811104774475, "learning_rate": 8.271186440677966e-06, "loss": 0.5758, "mean_token_accuracy": 0.8270718455314636, "num_tokens": 78103025.0, "step": 489 }, { "epoch": 0.24923702950152593, "grad_norm": 1.405150294303894, "learning_rate": 8.28813559322034e-06, "loss": 0.5757, "mean_token_accuracy": 0.826675534248352, "num_tokens": 78259934.0, "step": 490 }, { "epoch": 0.24974567650050863, "grad_norm": 1.173972249031067, "learning_rate": 8.305084745762712e-06, "loss": 0.5774, "mean_token_accuracy": 0.8275264501571655, "num_tokens": 78420450.0, "step": 491 }, { "epoch": 0.25025432349949134, "grad_norm": 1.3530043363571167, "learning_rate": 8.322033898305086e-06, "loss": 0.5926, "mean_token_accuracy": 0.8219928741455078, "num_tokens": 78575928.0, "step": 492 }, { "epoch": 0.25076297049847407, "grad_norm": 1.2971301078796387, "learning_rate": 8.338983050847458e-06, "loss": 0.5863, "mean_token_accuracy": 0.8221102356910706, "num_tokens": 78723238.0, "step": 493 }, { "epoch": 0.25127161749745675, "grad_norm": 1.2696784734725952, "learning_rate": 8.35593220338983e-06, "loss": 0.5965, "mean_token_accuracy": 0.8210808038711548, "num_tokens": 78897539.0, "step": 494 }, { "epoch": 0.2517802644964395, "grad_norm": 1.2298940420150757, "learning_rate": 8.372881355932205e-06, "loss": 0.5512, "mean_token_accuracy": 0.8342494964599609, "num_tokens": 79059834.0, "step": 495 }, { "epoch": 0.25228891149542215, "grad_norm": 1.2379639148712158, "learning_rate": 8.389830508474577e-06, "loss": 0.5971, "mean_token_accuracy": 0.8209534883499146, "num_tokens": 79206412.0, "step": 496 }, { "epoch": 0.2527975584944049, "grad_norm": 1.1856814622879028, "learning_rate": 8.40677966101695e-06, "loss": 0.5919, "mean_token_accuracy": 0.8220817446708679, "num_tokens": 79379375.0, "step": 497 }, { "epoch": 0.2533062054933876, "grad_norm": 5.187645435333252, "learning_rate": 8.423728813559324e-06, "loss": 0.5795, "mean_token_accuracy": 0.8255907297134399, "num_tokens": 79550225.0, "step": 498 }, { "epoch": 0.2538148524923703, "grad_norm": 1.6230169534683228, "learning_rate": 8.440677966101696e-06, "loss": 0.5654, "mean_token_accuracy": 0.8300318121910095, "num_tokens": 79705365.0, "step": 499 }, { "epoch": 0.254323499491353, "grad_norm": 1.5571225881576538, "learning_rate": 8.457627118644068e-06, "loss": 0.5913, "mean_token_accuracy": 0.8222099542617798, "num_tokens": 79861100.0, "step": 500 }, { "epoch": 0.2548321464903357, "grad_norm": 1.3249210119247437, "learning_rate": 8.47457627118644e-06, "loss": 0.5905, "mean_token_accuracy": 0.8226123452186584, "num_tokens": 80017875.0, "step": 501 }, { "epoch": 0.25534079348931843, "grad_norm": 1.379267930984497, "learning_rate": 8.491525423728815e-06, "loss": 0.5906, "mean_token_accuracy": 0.8216269016265869, "num_tokens": 80184953.0, "step": 502 }, { "epoch": 0.2558494404883011, "grad_norm": 1.2471497058868408, "learning_rate": 8.508474576271187e-06, "loss": 0.5887, "mean_token_accuracy": 0.8234388828277588, "num_tokens": 80342864.0, "step": 503 }, { "epoch": 0.25635808748728384, "grad_norm": 1.3340461254119873, "learning_rate": 8.52542372881356e-06, "loss": 0.5575, "mean_token_accuracy": 0.8304026126861572, "num_tokens": 80497644.0, "step": 504 }, { "epoch": 0.2568667344862665, "grad_norm": 1.4473865032196045, "learning_rate": 8.542372881355933e-06, "loss": 0.6242, "mean_token_accuracy": 0.8131608963012695, "num_tokens": 80652132.0, "step": 505 }, { "epoch": 0.25737538148524924, "grad_norm": 1.4180216789245605, "learning_rate": 8.559322033898306e-06, "loss": 0.584, "mean_token_accuracy": 0.8273070454597473, "num_tokens": 80815501.0, "step": 506 }, { "epoch": 0.2578840284842319, "grad_norm": 1.296979546546936, "learning_rate": 8.57627118644068e-06, "loss": 0.6033, "mean_token_accuracy": 0.818980872631073, "num_tokens": 80982538.0, "step": 507 }, { "epoch": 0.25839267548321465, "grad_norm": 1.4863985776901245, "learning_rate": 8.593220338983052e-06, "loss": 0.6042, "mean_token_accuracy": 0.8181418776512146, "num_tokens": 81137163.0, "step": 508 }, { "epoch": 0.2589013224821974, "grad_norm": 1.335146427154541, "learning_rate": 8.610169491525424e-06, "loss": 0.6109, "mean_token_accuracy": 0.8161569833755493, "num_tokens": 81304752.0, "step": 509 }, { "epoch": 0.25940996948118006, "grad_norm": 1.484490990638733, "learning_rate": 8.627118644067798e-06, "loss": 0.5753, "mean_token_accuracy": 0.8248833417892456, "num_tokens": 81448192.0, "step": 510 }, { "epoch": 0.2599186164801628, "grad_norm": 1.4455455541610718, "learning_rate": 8.64406779661017e-06, "loss": 0.6117, "mean_token_accuracy": 0.816500723361969, "num_tokens": 81608372.0, "step": 511 }, { "epoch": 0.26042726347914547, "grad_norm": 1.2930644750595093, "learning_rate": 8.661016949152543e-06, "loss": 0.5756, "mean_token_accuracy": 0.8255362510681152, "num_tokens": 81757788.0, "step": 512 }, { "epoch": 0.2609359104781282, "grad_norm": 1.2497493028640747, "learning_rate": 8.677966101694915e-06, "loss": 0.5735, "mean_token_accuracy": 0.8275926113128662, "num_tokens": 81917301.0, "step": 513 }, { "epoch": 0.2614445574771109, "grad_norm": 1.1094409227371216, "learning_rate": 8.694915254237288e-06, "loss": 0.5829, "mean_token_accuracy": 0.8236455917358398, "num_tokens": 82085431.0, "step": 514 }, { "epoch": 0.2619532044760936, "grad_norm": 1.2383760213851929, "learning_rate": 8.711864406779662e-06, "loss": 0.5753, "mean_token_accuracy": 0.8278086185455322, "num_tokens": 82238284.0, "step": 515 }, { "epoch": 0.2624618514750763, "grad_norm": 1.1112157106399536, "learning_rate": 8.728813559322034e-06, "loss": 0.6045, "mean_token_accuracy": 0.8183934688568115, "num_tokens": 82404647.0, "step": 516 }, { "epoch": 0.262970498474059, "grad_norm": 1.2201389074325562, "learning_rate": 8.745762711864407e-06, "loss": 0.5801, "mean_token_accuracy": 0.8245712518692017, "num_tokens": 82572960.0, "step": 517 }, { "epoch": 0.2634791454730417, "grad_norm": 1.177706003189087, "learning_rate": 8.76271186440678e-06, "loss": 0.5881, "mean_token_accuracy": 0.8239415884017944, "num_tokens": 82738083.0, "step": 518 }, { "epoch": 0.2639877924720244, "grad_norm": 1.3163994550704956, "learning_rate": 8.779661016949153e-06, "loss": 0.5782, "mean_token_accuracy": 0.8258814215660095, "num_tokens": 82887087.0, "step": 519 }, { "epoch": 0.2644964394710071, "grad_norm": 1.3369812965393066, "learning_rate": 8.796610169491527e-06, "loss": 0.5901, "mean_token_accuracy": 0.8220720291137695, "num_tokens": 83033584.0, "step": 520 }, { "epoch": 0.2650050864699898, "grad_norm": 1.3403503894805908, "learning_rate": 8.8135593220339e-06, "loss": 0.5989, "mean_token_accuracy": 0.8187283277511597, "num_tokens": 83195499.0, "step": 521 }, { "epoch": 0.26551373346897256, "grad_norm": 1.2609878778457642, "learning_rate": 8.830508474576272e-06, "loss": 0.5901, "mean_token_accuracy": 0.8238098621368408, "num_tokens": 83363928.0, "step": 522 }, { "epoch": 0.26602238046795523, "grad_norm": 1.2445399761199951, "learning_rate": 8.847457627118646e-06, "loss": 0.5802, "mean_token_accuracy": 0.8244575262069702, "num_tokens": 83520177.0, "step": 523 }, { "epoch": 0.26653102746693796, "grad_norm": 1.289551854133606, "learning_rate": 8.864406779661018e-06, "loss": 0.5654, "mean_token_accuracy": 0.8277870416641235, "num_tokens": 83680255.0, "step": 524 }, { "epoch": 0.26703967446592064, "grad_norm": 1.2121199369430542, "learning_rate": 8.88135593220339e-06, "loss": 0.5824, "mean_token_accuracy": 0.8237113952636719, "num_tokens": 83852400.0, "step": 525 }, { "epoch": 0.26754832146490337, "grad_norm": 1.316636562347412, "learning_rate": 8.898305084745763e-06, "loss": 0.611, "mean_token_accuracy": 0.8159763813018799, "num_tokens": 84014162.0, "step": 526 }, { "epoch": 0.26805696846388605, "grad_norm": 1.1623677015304565, "learning_rate": 8.915254237288137e-06, "loss": 0.5672, "mean_token_accuracy": 0.8281941413879395, "num_tokens": 84170100.0, "step": 527 }, { "epoch": 0.2685656154628688, "grad_norm": 1.1831893920898438, "learning_rate": 8.932203389830509e-06, "loss": 0.5499, "mean_token_accuracy": 0.833954930305481, "num_tokens": 84329460.0, "step": 528 }, { "epoch": 0.26907426246185145, "grad_norm": 1.1870083808898926, "learning_rate": 8.949152542372881e-06, "loss": 0.5441, "mean_token_accuracy": 0.8352469801902771, "num_tokens": 84486375.0, "step": 529 }, { "epoch": 0.2695829094608342, "grad_norm": 1.3223252296447754, "learning_rate": 8.966101694915254e-06, "loss": 0.5534, "mean_token_accuracy": 0.8336907625198364, "num_tokens": 84643661.0, "step": 530 }, { "epoch": 0.27009155645981686, "grad_norm": 1.1877634525299072, "learning_rate": 8.983050847457628e-06, "loss": 0.5679, "mean_token_accuracy": 0.829535186290741, "num_tokens": 84812504.0, "step": 531 }, { "epoch": 0.2706002034587996, "grad_norm": 1.2430506944656372, "learning_rate": 9e-06, "loss": 0.5689, "mean_token_accuracy": 0.8278781771659851, "num_tokens": 84968874.0, "step": 532 }, { "epoch": 0.2711088504577823, "grad_norm": 1.1894301176071167, "learning_rate": 9.016949152542374e-06, "loss": 0.5746, "mean_token_accuracy": 0.824907660484314, "num_tokens": 85129918.0, "step": 533 }, { "epoch": 0.271617497456765, "grad_norm": 1.2176803350448608, "learning_rate": 9.033898305084747e-06, "loss": 0.5669, "mean_token_accuracy": 0.8289027214050293, "num_tokens": 85292100.0, "step": 534 }, { "epoch": 0.27212614445574773, "grad_norm": 1.238106608390808, "learning_rate": 9.05084745762712e-06, "loss": 0.552, "mean_token_accuracy": 0.8348509073257446, "num_tokens": 85445236.0, "step": 535 }, { "epoch": 0.2726347914547304, "grad_norm": 1.2461129426956177, "learning_rate": 9.067796610169493e-06, "loss": 0.587, "mean_token_accuracy": 0.822391927242279, "num_tokens": 85605945.0, "step": 536 }, { "epoch": 0.27314343845371314, "grad_norm": 1.171604871749878, "learning_rate": 9.084745762711865e-06, "loss": 0.5675, "mean_token_accuracy": 0.8278486728668213, "num_tokens": 85760177.0, "step": 537 }, { "epoch": 0.2736520854526958, "grad_norm": 1.1491703987121582, "learning_rate": 9.101694915254238e-06, "loss": 0.5855, "mean_token_accuracy": 0.8246716260910034, "num_tokens": 85924331.0, "step": 538 }, { "epoch": 0.27416073245167855, "grad_norm": 1.1518174409866333, "learning_rate": 9.11864406779661e-06, "loss": 0.6048, "mean_token_accuracy": 0.8184250593185425, "num_tokens": 86087686.0, "step": 539 }, { "epoch": 0.2746693794506612, "grad_norm": 1.2020530700683594, "learning_rate": 9.135593220338984e-06, "loss": 0.6024, "mean_token_accuracy": 0.8186061978340149, "num_tokens": 86231923.0, "step": 540 }, { "epoch": 0.27517802644964395, "grad_norm": 1.246408462524414, "learning_rate": 9.152542372881356e-06, "loss": 0.5731, "mean_token_accuracy": 0.8254373669624329, "num_tokens": 86393599.0, "step": 541 }, { "epoch": 0.27568667344862663, "grad_norm": 1.2908697128295898, "learning_rate": 9.169491525423729e-06, "loss": 0.5437, "mean_token_accuracy": 0.8329447507858276, "num_tokens": 86546912.0, "step": 542 }, { "epoch": 0.27619532044760936, "grad_norm": 1.3099312782287598, "learning_rate": 9.186440677966101e-06, "loss": 0.6402, "mean_token_accuracy": 0.8103553056716919, "num_tokens": 86710627.0, "step": 543 }, { "epoch": 0.2767039674465921, "grad_norm": 1.2442010641098022, "learning_rate": 9.203389830508475e-06, "loss": 0.5928, "mean_token_accuracy": 0.8202736377716064, "num_tokens": 86872441.0, "step": 544 }, { "epoch": 0.27721261444557477, "grad_norm": 1.3643654584884644, "learning_rate": 9.220338983050847e-06, "loss": 0.5981, "mean_token_accuracy": 0.8186464309692383, "num_tokens": 87040426.0, "step": 545 }, { "epoch": 0.2777212614445575, "grad_norm": 1.2602033615112305, "learning_rate": 9.237288135593222e-06, "loss": 0.5838, "mean_token_accuracy": 0.8244240283966064, "num_tokens": 87204457.0, "step": 546 }, { "epoch": 0.2782299084435402, "grad_norm": 1.3273673057556152, "learning_rate": 9.254237288135594e-06, "loss": 0.5934, "mean_token_accuracy": 0.8224654197692871, "num_tokens": 87350790.0, "step": 547 }, { "epoch": 0.2787385554425229, "grad_norm": 1.3505678176879883, "learning_rate": 9.271186440677968e-06, "loss": 0.5992, "mean_token_accuracy": 0.8188058137893677, "num_tokens": 87501470.0, "step": 548 }, { "epoch": 0.2792472024415056, "grad_norm": 1.3277742862701416, "learning_rate": 9.28813559322034e-06, "loss": 0.5902, "mean_token_accuracy": 0.8227229118347168, "num_tokens": 87654280.0, "step": 549 }, { "epoch": 0.2797558494404883, "grad_norm": 1.2117854356765747, "learning_rate": 9.305084745762713e-06, "loss": 0.5726, "mean_token_accuracy": 0.8264171481132507, "num_tokens": 87816423.0, "step": 550 }, { "epoch": 0.280264496439471, "grad_norm": 1.4077842235565186, "learning_rate": 9.322033898305085e-06, "loss": 0.6106, "mean_token_accuracy": 0.8154207468032837, "num_tokens": 87960156.0, "step": 551 }, { "epoch": 0.2807731434384537, "grad_norm": 1.3092397451400757, "learning_rate": 9.338983050847459e-06, "loss": 0.5554, "mean_token_accuracy": 0.8319989442825317, "num_tokens": 88105701.0, "step": 552 }, { "epoch": 0.2812817904374364, "grad_norm": 1.2441811561584473, "learning_rate": 9.355932203389831e-06, "loss": 0.5804, "mean_token_accuracy": 0.8244675397872925, "num_tokens": 88261189.0, "step": 553 }, { "epoch": 0.28179043743641913, "grad_norm": 1.3278307914733887, "learning_rate": 9.372881355932204e-06, "loss": 0.592, "mean_token_accuracy": 0.8210278153419495, "num_tokens": 88429993.0, "step": 554 }, { "epoch": 0.2822990844354018, "grad_norm": 1.2581342458724976, "learning_rate": 9.389830508474576e-06, "loss": 0.578, "mean_token_accuracy": 0.8260815143585205, "num_tokens": 88598116.0, "step": 555 }, { "epoch": 0.28280773143438453, "grad_norm": 1.350409746170044, "learning_rate": 9.40677966101695e-06, "loss": 0.6042, "mean_token_accuracy": 0.8164364099502563, "num_tokens": 88754109.0, "step": 556 }, { "epoch": 0.28331637843336727, "grad_norm": 1.393012285232544, "learning_rate": 9.423728813559322e-06, "loss": 0.5888, "mean_token_accuracy": 0.8246204853057861, "num_tokens": 88911407.0, "step": 557 }, { "epoch": 0.28382502543234994, "grad_norm": 1.4434447288513184, "learning_rate": 9.440677966101696e-06, "loss": 0.6229, "mean_token_accuracy": 0.8114631175994873, "num_tokens": 89075556.0, "step": 558 }, { "epoch": 0.2843336724313327, "grad_norm": 1.2736214399337769, "learning_rate": 9.457627118644069e-06, "loss": 0.5838, "mean_token_accuracy": 0.8232846260070801, "num_tokens": 89234090.0, "step": 559 }, { "epoch": 0.28484231943031535, "grad_norm": 1.4164948463439941, "learning_rate": 9.474576271186441e-06, "loss": 0.596, "mean_token_accuracy": 0.8208360075950623, "num_tokens": 89394941.0, "step": 560 }, { "epoch": 0.2853509664292981, "grad_norm": 1.3688839673995972, "learning_rate": 9.491525423728815e-06, "loss": 0.5935, "mean_token_accuracy": 0.8203197717666626, "num_tokens": 89560585.0, "step": 561 }, { "epoch": 0.28585961342828076, "grad_norm": 1.2683659791946411, "learning_rate": 9.508474576271188e-06, "loss": 0.5913, "mean_token_accuracy": 0.8229166865348816, "num_tokens": 89726320.0, "step": 562 }, { "epoch": 0.2863682604272635, "grad_norm": 1.242645502090454, "learning_rate": 9.52542372881356e-06, "loss": 0.5569, "mean_token_accuracy": 0.8335822224617004, "num_tokens": 89880236.0, "step": 563 }, { "epoch": 0.28687690742624616, "grad_norm": 1.1361278295516968, "learning_rate": 9.542372881355934e-06, "loss": 0.6169, "mean_token_accuracy": 0.8147539496421814, "num_tokens": 90046963.0, "step": 564 }, { "epoch": 0.2873855544252289, "grad_norm": 1.174190640449524, "learning_rate": 9.559322033898306e-06, "loss": 0.5615, "mean_token_accuracy": 0.8308785557746887, "num_tokens": 90208654.0, "step": 565 }, { "epoch": 0.28789420142421157, "grad_norm": 1.269404649734497, "learning_rate": 9.576271186440679e-06, "loss": 0.5913, "mean_token_accuracy": 0.8214564323425293, "num_tokens": 90360145.0, "step": 566 }, { "epoch": 0.2884028484231943, "grad_norm": 1.4272724390029907, "learning_rate": 9.593220338983051e-06, "loss": 0.5825, "mean_token_accuracy": 0.8251073360443115, "num_tokens": 90517341.0, "step": 567 }, { "epoch": 0.28891149542217703, "grad_norm": 1.2705575227737427, "learning_rate": 9.610169491525423e-06, "loss": 0.5811, "mean_token_accuracy": 0.8249518275260925, "num_tokens": 90666248.0, "step": 568 }, { "epoch": 0.2894201424211597, "grad_norm": 1.2269147634506226, "learning_rate": 9.627118644067797e-06, "loss": 0.5657, "mean_token_accuracy": 0.8296713829040527, "num_tokens": 90824390.0, "step": 569 }, { "epoch": 0.28992878942014244, "grad_norm": 1.2506111860275269, "learning_rate": 9.64406779661017e-06, "loss": 0.5953, "mean_token_accuracy": 0.8209437727928162, "num_tokens": 90990168.0, "step": 570 }, { "epoch": 0.2904374364191251, "grad_norm": 1.2558051347732544, "learning_rate": 9.661016949152544e-06, "loss": 0.5779, "mean_token_accuracy": 0.8250951170921326, "num_tokens": 91130945.0, "step": 571 }, { "epoch": 0.29094608341810785, "grad_norm": 1.2120747566223145, "learning_rate": 9.677966101694916e-06, "loss": 0.5693, "mean_token_accuracy": 0.8284271955490112, "num_tokens": 91287120.0, "step": 572 }, { "epoch": 0.2914547304170905, "grad_norm": 1.3007855415344238, "learning_rate": 9.69491525423729e-06, "loss": 0.5856, "mean_token_accuracy": 0.82295161485672, "num_tokens": 91445888.0, "step": 573 }, { "epoch": 0.29196337741607326, "grad_norm": 1.2200978994369507, "learning_rate": 9.711864406779662e-06, "loss": 0.5471, "mean_token_accuracy": 0.8336622714996338, "num_tokens": 91605888.0, "step": 574 }, { "epoch": 0.29247202441505593, "grad_norm": 1.305680274963379, "learning_rate": 9.728813559322035e-06, "loss": 0.6212, "mean_token_accuracy": 0.8143027424812317, "num_tokens": 91766382.0, "step": 575 }, { "epoch": 0.29298067141403866, "grad_norm": 1.149032711982727, "learning_rate": 9.745762711864407e-06, "loss": 0.5802, "mean_token_accuracy": 0.8255360126495361, "num_tokens": 91923111.0, "step": 576 }, { "epoch": 0.29348931841302134, "grad_norm": 1.212319254875183, "learning_rate": 9.762711864406781e-06, "loss": 0.602, "mean_token_accuracy": 0.8194811344146729, "num_tokens": 92076217.0, "step": 577 }, { "epoch": 0.29399796541200407, "grad_norm": 1.2744840383529663, "learning_rate": 9.779661016949154e-06, "loss": 0.5347, "mean_token_accuracy": 0.836704432964325, "num_tokens": 92233100.0, "step": 578 }, { "epoch": 0.2945066124109868, "grad_norm": 1.250063419342041, "learning_rate": 9.796610169491526e-06, "loss": 0.6222, "mean_token_accuracy": 0.8125181198120117, "num_tokens": 92384392.0, "step": 579 }, { "epoch": 0.2950152594099695, "grad_norm": 1.2378636598587036, "learning_rate": 9.813559322033898e-06, "loss": 0.5395, "mean_token_accuracy": 0.8373066186904907, "num_tokens": 92538563.0, "step": 580 }, { "epoch": 0.2955239064089522, "grad_norm": 1.3510687351226807, "learning_rate": 9.830508474576272e-06, "loss": 0.5621, "mean_token_accuracy": 0.8300359845161438, "num_tokens": 92702514.0, "step": 581 }, { "epoch": 0.2960325534079349, "grad_norm": 1.1700639724731445, "learning_rate": 9.847457627118645e-06, "loss": 0.5268, "mean_token_accuracy": 0.8393378257751465, "num_tokens": 92861897.0, "step": 582 }, { "epoch": 0.2965412004069176, "grad_norm": 1.4047576189041138, "learning_rate": 9.864406779661017e-06, "loss": 0.5431, "mean_token_accuracy": 0.8350200653076172, "num_tokens": 93021485.0, "step": 583 }, { "epoch": 0.2970498474059003, "grad_norm": 1.2136011123657227, "learning_rate": 9.881355932203391e-06, "loss": 0.5115, "mean_token_accuracy": 0.8411509990692139, "num_tokens": 93169971.0, "step": 584 }, { "epoch": 0.297558494404883, "grad_norm": 1.3670368194580078, "learning_rate": 9.898305084745763e-06, "loss": 0.5697, "mean_token_accuracy": 0.8275981545448303, "num_tokens": 93321364.0, "step": 585 }, { "epoch": 0.2980671414038657, "grad_norm": 1.283612847328186, "learning_rate": 9.915254237288137e-06, "loss": 0.5931, "mean_token_accuracy": 0.8204793930053711, "num_tokens": 93476129.0, "step": 586 }, { "epoch": 0.29857578840284843, "grad_norm": 1.3070842027664185, "learning_rate": 9.93220338983051e-06, "loss": 0.5813, "mean_token_accuracy": 0.8225874304771423, "num_tokens": 93637944.0, "step": 587 }, { "epoch": 0.2990844354018311, "grad_norm": 1.1480412483215332, "learning_rate": 9.949152542372882e-06, "loss": 0.5669, "mean_token_accuracy": 0.8301283121109009, "num_tokens": 93795891.0, "step": 588 }, { "epoch": 0.29959308240081384, "grad_norm": 1.242099404335022, "learning_rate": 9.966101694915256e-06, "loss": 0.6161, "mean_token_accuracy": 0.8146437406539917, "num_tokens": 93949753.0, "step": 589 }, { "epoch": 0.30010172939979657, "grad_norm": 1.2934678792953491, "learning_rate": 9.983050847457628e-06, "loss": 0.5363, "mean_token_accuracy": 0.8385515213012695, "num_tokens": 94099353.0, "step": 590 }, { "epoch": 0.30061037639877924, "grad_norm": 1.4157675504684448, "learning_rate": 1e-05, "loss": 0.6421, "mean_token_accuracy": 0.808651328086853, "num_tokens": 94262648.0, "step": 591 }, { "epoch": 0.301119023397762, "grad_norm": 1.358555793762207, "learning_rate": 1e-05, "loss": 0.582, "mean_token_accuracy": 0.8241909742355347, "num_tokens": 94420660.0, "step": 592 }, { "epoch": 0.30162767039674465, "grad_norm": 1.3359708786010742, "learning_rate": 1e-05, "loss": 0.5721, "mean_token_accuracy": 0.8296599388122559, "num_tokens": 94580578.0, "step": 593 }, { "epoch": 0.3021363173957274, "grad_norm": 1.4047656059265137, "learning_rate": 1e-05, "loss": 0.5795, "mean_token_accuracy": 0.8244053721427917, "num_tokens": 94745810.0, "step": 594 }, { "epoch": 0.30264496439471006, "grad_norm": 1.289602518081665, "learning_rate": 1e-05, "loss": 0.5907, "mean_token_accuracy": 0.8213446140289307, "num_tokens": 94904762.0, "step": 595 }, { "epoch": 0.3031536113936928, "grad_norm": 1.376883625984192, "learning_rate": 1e-05, "loss": 0.5718, "mean_token_accuracy": 0.8260505199432373, "num_tokens": 95053065.0, "step": 596 }, { "epoch": 0.30366225839267547, "grad_norm": 1.2694602012634277, "learning_rate": 1e-05, "loss": 0.5914, "mean_token_accuracy": 0.8208502531051636, "num_tokens": 95207324.0, "step": 597 }, { "epoch": 0.3041709053916582, "grad_norm": 1.331419825553894, "learning_rate": 1e-05, "loss": 0.5597, "mean_token_accuracy": 0.8295787572860718, "num_tokens": 95365564.0, "step": 598 }, { "epoch": 0.3046795523906409, "grad_norm": 1.2676899433135986, "learning_rate": 1e-05, "loss": 0.5765, "mean_token_accuracy": 0.8234357833862305, "num_tokens": 95522523.0, "step": 599 }, { "epoch": 0.3051881993896236, "grad_norm": 1.3706655502319336, "learning_rate": 1e-05, "loss": 0.5494, "mean_token_accuracy": 0.8335732221603394, "num_tokens": 95677617.0, "step": 600 }, { "epoch": 0.3056968463886063, "grad_norm": 1.1500375270843506, "learning_rate": 1e-05, "loss": 0.581, "mean_token_accuracy": 0.8245093822479248, "num_tokens": 95855477.0, "step": 601 }, { "epoch": 0.306205493387589, "grad_norm": 1.2335515022277832, "learning_rate": 1e-05, "loss": 0.6158, "mean_token_accuracy": 0.8146917819976807, "num_tokens": 96017692.0, "step": 602 }, { "epoch": 0.30671414038657174, "grad_norm": 1.223585605621338, "learning_rate": 1e-05, "loss": 0.573, "mean_token_accuracy": 0.8267310857772827, "num_tokens": 96175479.0, "step": 603 }, { "epoch": 0.3072227873855544, "grad_norm": 1.2164987325668335, "learning_rate": 1e-05, "loss": 0.5855, "mean_token_accuracy": 0.8224807977676392, "num_tokens": 96337729.0, "step": 604 }, { "epoch": 0.30773143438453715, "grad_norm": 1.1218758821487427, "learning_rate": 1e-05, "loss": 0.5696, "mean_token_accuracy": 0.827846884727478, "num_tokens": 96492659.0, "step": 605 }, { "epoch": 0.3082400813835198, "grad_norm": 1.2200160026550293, "learning_rate": 1e-05, "loss": 0.5835, "mean_token_accuracy": 0.8219121694564819, "num_tokens": 96644965.0, "step": 606 }, { "epoch": 0.30874872838250256, "grad_norm": 1.1906354427337646, "learning_rate": 1e-05, "loss": 0.544, "mean_token_accuracy": 0.8338629007339478, "num_tokens": 96801670.0, "step": 607 }, { "epoch": 0.30925737538148523, "grad_norm": 1.2348442077636719, "learning_rate": 1e-05, "loss": 0.5751, "mean_token_accuracy": 0.8240618705749512, "num_tokens": 96966380.0, "step": 608 }, { "epoch": 0.30976602238046796, "grad_norm": 1.207759976387024, "learning_rate": 1e-05, "loss": 0.5759, "mean_token_accuracy": 0.8252097368240356, "num_tokens": 97116653.0, "step": 609 }, { "epoch": 0.31027466937945064, "grad_norm": 1.3898676633834839, "learning_rate": 1e-05, "loss": 0.5701, "mean_token_accuracy": 0.8274800777435303, "num_tokens": 97279451.0, "step": 610 }, { "epoch": 0.31078331637843337, "grad_norm": 1.2286509275436401, "learning_rate": 1e-05, "loss": 0.6084, "mean_token_accuracy": 0.8180156350135803, "num_tokens": 97439569.0, "step": 611 }, { "epoch": 0.31129196337741605, "grad_norm": 1.1610603332519531, "learning_rate": 1e-05, "loss": 0.5625, "mean_token_accuracy": 0.8294017910957336, "num_tokens": 97595648.0, "step": 612 }, { "epoch": 0.3118006103763988, "grad_norm": 1.1723705530166626, "learning_rate": 1e-05, "loss": 0.602, "mean_token_accuracy": 0.8190659284591675, "num_tokens": 97754844.0, "step": 613 }, { "epoch": 0.3123092573753815, "grad_norm": 1.1151776313781738, "learning_rate": 1e-05, "loss": 0.5651, "mean_token_accuracy": 0.8298476338386536, "num_tokens": 97916479.0, "step": 614 }, { "epoch": 0.3128179043743642, "grad_norm": 1.2659906148910522, "learning_rate": 1e-05, "loss": 0.5978, "mean_token_accuracy": 0.8206175565719604, "num_tokens": 98083229.0, "step": 615 }, { "epoch": 0.3133265513733469, "grad_norm": 1.1788604259490967, "learning_rate": 1e-05, "loss": 0.5743, "mean_token_accuracy": 0.8246487379074097, "num_tokens": 98234766.0, "step": 616 }, { "epoch": 0.3138351983723296, "grad_norm": 1.1908249855041504, "learning_rate": 1e-05, "loss": 0.564, "mean_token_accuracy": 0.8283237218856812, "num_tokens": 98401537.0, "step": 617 }, { "epoch": 0.3143438453713123, "grad_norm": 1.152085781097412, "learning_rate": 1e-05, "loss": 0.5548, "mean_token_accuracy": 0.8315668702125549, "num_tokens": 98567715.0, "step": 618 }, { "epoch": 0.314852492370295, "grad_norm": 1.1295671463012695, "learning_rate": 1e-05, "loss": 0.5742, "mean_token_accuracy": 0.8245363235473633, "num_tokens": 98735978.0, "step": 619 }, { "epoch": 0.31536113936927773, "grad_norm": 1.2672114372253418, "learning_rate": 1e-05, "loss": 0.6023, "mean_token_accuracy": 0.8201767206192017, "num_tokens": 98906165.0, "step": 620 }, { "epoch": 0.3158697863682604, "grad_norm": 1.220890760421753, "learning_rate": 1e-05, "loss": 0.5827, "mean_token_accuracy": 0.8241899013519287, "num_tokens": 99056653.0, "step": 621 }, { "epoch": 0.31637843336724314, "grad_norm": 1.1554323434829712, "learning_rate": 1e-05, "loss": 0.5379, "mean_token_accuracy": 0.835025429725647, "num_tokens": 99211579.0, "step": 622 }, { "epoch": 0.3168870803662258, "grad_norm": 1.2123159170150757, "learning_rate": 1e-05, "loss": 0.5864, "mean_token_accuracy": 0.8231475353240967, "num_tokens": 99361670.0, "step": 623 }, { "epoch": 0.31739572736520855, "grad_norm": 1.1268131732940674, "learning_rate": 1e-05, "loss": 0.6015, "mean_token_accuracy": 0.8182185888290405, "num_tokens": 99530349.0, "step": 624 }, { "epoch": 0.3179043743641913, "grad_norm": 1.1105821132659912, "learning_rate": 1e-05, "loss": 0.5594, "mean_token_accuracy": 0.8318790197372437, "num_tokens": 99693143.0, "step": 625 }, { "epoch": 0.31841302136317395, "grad_norm": 1.0727262496948242, "learning_rate": 1e-05, "loss": 0.6265, "mean_token_accuracy": 0.8118510246276855, "num_tokens": 99867681.0, "step": 626 }, { "epoch": 0.3189216683621567, "grad_norm": 1.245917558670044, "learning_rate": 1e-05, "loss": 0.5739, "mean_token_accuracy": 0.8248531222343445, "num_tokens": 100026848.0, "step": 627 }, { "epoch": 0.31943031536113936, "grad_norm": 1.0944061279296875, "learning_rate": 1e-05, "loss": 0.5631, "mean_token_accuracy": 0.8312759399414062, "num_tokens": 100198041.0, "step": 628 }, { "epoch": 0.3199389623601221, "grad_norm": 1.275220274925232, "learning_rate": 1e-05, "loss": 0.5433, "mean_token_accuracy": 0.8339123725891113, "num_tokens": 100352919.0, "step": 629 }, { "epoch": 0.32044760935910477, "grad_norm": 1.1218000650405884, "learning_rate": 1e-05, "loss": 0.5449, "mean_token_accuracy": 0.8353885412216187, "num_tokens": 100509537.0, "step": 630 }, { "epoch": 0.3209562563580875, "grad_norm": 1.2154606580734253, "learning_rate": 1e-05, "loss": 0.5481, "mean_token_accuracy": 0.8327236771583557, "num_tokens": 100663059.0, "step": 631 }, { "epoch": 0.3214649033570702, "grad_norm": 1.4552714824676514, "learning_rate": 1e-05, "loss": 0.6006, "mean_token_accuracy": 0.8186696767807007, "num_tokens": 100831452.0, "step": 632 }, { "epoch": 0.3219735503560529, "grad_norm": 1.2636407613754272, "learning_rate": 1e-05, "loss": 0.5707, "mean_token_accuracy": 0.8284827470779419, "num_tokens": 100984332.0, "step": 633 }, { "epoch": 0.3224821973550356, "grad_norm": 1.3425766229629517, "learning_rate": 1e-05, "loss": 0.5877, "mean_token_accuracy": 0.8216133117675781, "num_tokens": 101131963.0, "step": 634 }, { "epoch": 0.3229908443540183, "grad_norm": 1.1513028144836426, "learning_rate": 1e-05, "loss": 0.5675, "mean_token_accuracy": 0.8274757862091064, "num_tokens": 101289236.0, "step": 635 }, { "epoch": 0.323499491353001, "grad_norm": 1.3217164278030396, "learning_rate": 1e-05, "loss": 0.569, "mean_token_accuracy": 0.8281552791595459, "num_tokens": 101448301.0, "step": 636 }, { "epoch": 0.3240081383519837, "grad_norm": 1.212892770767212, "learning_rate": 1e-05, "loss": 0.5739, "mean_token_accuracy": 0.8250631093978882, "num_tokens": 101601568.0, "step": 637 }, { "epoch": 0.32451678535096645, "grad_norm": 1.3648805618286133, "learning_rate": 1e-05, "loss": 0.5734, "mean_token_accuracy": 0.824871301651001, "num_tokens": 101750506.0, "step": 638 }, { "epoch": 0.32502543234994913, "grad_norm": 1.2872990369796753, "learning_rate": 1e-05, "loss": 0.5893, "mean_token_accuracy": 0.8241720199584961, "num_tokens": 101915028.0, "step": 639 }, { "epoch": 0.32553407934893186, "grad_norm": 1.2380262613296509, "learning_rate": 1e-05, "loss": 0.5814, "mean_token_accuracy": 0.8232290744781494, "num_tokens": 102079659.0, "step": 640 }, { "epoch": 0.32604272634791454, "grad_norm": 1.1958699226379395, "learning_rate": 1e-05, "loss": 0.5496, "mean_token_accuracy": 0.8321324586868286, "num_tokens": 102247764.0, "step": 641 }, { "epoch": 0.32655137334689727, "grad_norm": 1.235268473625183, "learning_rate": 1e-05, "loss": 0.5493, "mean_token_accuracy": 0.8314043283462524, "num_tokens": 102405734.0, "step": 642 }, { "epoch": 0.32706002034587994, "grad_norm": 1.3108789920806885, "learning_rate": 1e-05, "loss": 0.5702, "mean_token_accuracy": 0.8258588314056396, "num_tokens": 102566525.0, "step": 643 }, { "epoch": 0.3275686673448627, "grad_norm": 1.1007673740386963, "learning_rate": 1e-05, "loss": 0.6158, "mean_token_accuracy": 0.8150623440742493, "num_tokens": 102739870.0, "step": 644 }, { "epoch": 0.32807731434384535, "grad_norm": 1.3075753450393677, "learning_rate": 1e-05, "loss": 0.5888, "mean_token_accuracy": 0.8222059011459351, "num_tokens": 102895661.0, "step": 645 }, { "epoch": 0.3285859613428281, "grad_norm": 1.282270073890686, "learning_rate": 1e-05, "loss": 0.5613, "mean_token_accuracy": 0.830429196357727, "num_tokens": 103055954.0, "step": 646 }, { "epoch": 0.32909460834181076, "grad_norm": 1.1636611223220825, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.8302597999572754, "num_tokens": 103214101.0, "step": 647 }, { "epoch": 0.3296032553407935, "grad_norm": 1.2605210542678833, "learning_rate": 1e-05, "loss": 0.5941, "mean_token_accuracy": 0.8216217756271362, "num_tokens": 103373341.0, "step": 648 }, { "epoch": 0.3301119023397762, "grad_norm": 1.1215887069702148, "learning_rate": 1e-05, "loss": 0.5567, "mean_token_accuracy": 0.8300506472587585, "num_tokens": 103546489.0, "step": 649 }, { "epoch": 0.3306205493387589, "grad_norm": 1.2471184730529785, "learning_rate": 1e-05, "loss": 0.5582, "mean_token_accuracy": 0.8286738991737366, "num_tokens": 103709672.0, "step": 650 }, { "epoch": 0.3311291963377416, "grad_norm": 1.2193257808685303, "learning_rate": 1e-05, "loss": 0.5431, "mean_token_accuracy": 0.8333966732025146, "num_tokens": 103875427.0, "step": 651 }, { "epoch": 0.3316378433367243, "grad_norm": 1.2777115106582642, "learning_rate": 1e-05, "loss": 0.568, "mean_token_accuracy": 0.8270902633666992, "num_tokens": 104027661.0, "step": 652 }, { "epoch": 0.33214649033570703, "grad_norm": 1.2929627895355225, "learning_rate": 1e-05, "loss": 0.6048, "mean_token_accuracy": 0.8180147409439087, "num_tokens": 104180735.0, "step": 653 }, { "epoch": 0.3326551373346897, "grad_norm": 1.2298574447631836, "learning_rate": 1e-05, "loss": 0.5642, "mean_token_accuracy": 0.8300544023513794, "num_tokens": 104353014.0, "step": 654 }, { "epoch": 0.33316378433367244, "grad_norm": 1.287480354309082, "learning_rate": 1e-05, "loss": 0.5738, "mean_token_accuracy": 0.8261777758598328, "num_tokens": 104517143.0, "step": 655 }, { "epoch": 0.3336724313326551, "grad_norm": 1.1733216047286987, "learning_rate": 1e-05, "loss": 0.6069, "mean_token_accuracy": 0.8148963451385498, "num_tokens": 104669203.0, "step": 656 }, { "epoch": 0.33418107833163785, "grad_norm": 1.364167332649231, "learning_rate": 1e-05, "loss": 0.5939, "mean_token_accuracy": 0.8200170993804932, "num_tokens": 104810466.0, "step": 657 }, { "epoch": 0.3346897253306205, "grad_norm": 1.1087124347686768, "learning_rate": 1e-05, "loss": 0.5942, "mean_token_accuracy": 0.8199101686477661, "num_tokens": 104980367.0, "step": 658 }, { "epoch": 0.33519837232960326, "grad_norm": 1.5046215057373047, "learning_rate": 1e-05, "loss": 0.5528, "mean_token_accuracy": 0.8326549530029297, "num_tokens": 105131160.0, "step": 659 }, { "epoch": 0.335707019328586, "grad_norm": 1.1873219013214111, "learning_rate": 1e-05, "loss": 0.6084, "mean_token_accuracy": 0.818149983882904, "num_tokens": 105298008.0, "step": 660 }, { "epoch": 0.33621566632756866, "grad_norm": 1.428198218345642, "learning_rate": 1e-05, "loss": 0.6123, "mean_token_accuracy": 0.8152676820755005, "num_tokens": 105452271.0, "step": 661 }, { "epoch": 0.3367243133265514, "grad_norm": 1.202708125114441, "learning_rate": 1e-05, "loss": 0.5469, "mean_token_accuracy": 0.8334332704544067, "num_tokens": 105619978.0, "step": 662 }, { "epoch": 0.33723296032553407, "grad_norm": 1.1747825145721436, "learning_rate": 1e-05, "loss": 0.5598, "mean_token_accuracy": 0.829487681388855, "num_tokens": 105786355.0, "step": 663 }, { "epoch": 0.3377416073245168, "grad_norm": 1.3457754850387573, "learning_rate": 1e-05, "loss": 0.5984, "mean_token_accuracy": 0.819837749004364, "num_tokens": 105942503.0, "step": 664 }, { "epoch": 0.3382502543234995, "grad_norm": 1.232384204864502, "learning_rate": 1e-05, "loss": 0.5735, "mean_token_accuracy": 0.8262651562690735, "num_tokens": 106088220.0, "step": 665 }, { "epoch": 0.3387589013224822, "grad_norm": 1.2785651683807373, "learning_rate": 1e-05, "loss": 0.5332, "mean_token_accuracy": 0.8363296985626221, "num_tokens": 106246812.0, "step": 666 }, { "epoch": 0.3392675483214649, "grad_norm": 1.1574139595031738, "learning_rate": 1e-05, "loss": 0.5324, "mean_token_accuracy": 0.8359112739562988, "num_tokens": 106405912.0, "step": 667 }, { "epoch": 0.3397761953204476, "grad_norm": 1.167323350906372, "learning_rate": 1e-05, "loss": 0.6198, "mean_token_accuracy": 0.8138099908828735, "num_tokens": 106556412.0, "step": 668 }, { "epoch": 0.3402848423194303, "grad_norm": 1.261059284210205, "learning_rate": 1e-05, "loss": 0.5882, "mean_token_accuracy": 0.8206124305725098, "num_tokens": 106712640.0, "step": 669 }, { "epoch": 0.340793489318413, "grad_norm": 1.237960696220398, "learning_rate": 1e-05, "loss": 0.5775, "mean_token_accuracy": 0.8242462873458862, "num_tokens": 106869710.0, "step": 670 }, { "epoch": 0.34130213631739575, "grad_norm": 1.2517688274383545, "learning_rate": 1e-05, "loss": 0.5506, "mean_token_accuracy": 0.8329342603683472, "num_tokens": 107034402.0, "step": 671 }, { "epoch": 0.34181078331637843, "grad_norm": 1.1442136764526367, "learning_rate": 1e-05, "loss": 0.5805, "mean_token_accuracy": 0.8242872357368469, "num_tokens": 107197492.0, "step": 672 }, { "epoch": 0.34231943031536116, "grad_norm": 1.2725573778152466, "learning_rate": 1e-05, "loss": 0.5624, "mean_token_accuracy": 0.8281676769256592, "num_tokens": 107355096.0, "step": 673 }, { "epoch": 0.34282807731434384, "grad_norm": 1.2090405225753784, "learning_rate": 1e-05, "loss": 0.5604, "mean_token_accuracy": 0.8302109241485596, "num_tokens": 107520176.0, "step": 674 }, { "epoch": 0.34333672431332657, "grad_norm": 1.1444567441940308, "learning_rate": 1e-05, "loss": 0.55, "mean_token_accuracy": 0.8324422836303711, "num_tokens": 107676895.0, "step": 675 }, { "epoch": 0.34384537131230924, "grad_norm": 1.2566196918487549, "learning_rate": 1e-05, "loss": 0.5666, "mean_token_accuracy": 0.8281424045562744, "num_tokens": 107829848.0, "step": 676 }, { "epoch": 0.344354018311292, "grad_norm": 1.143470048904419, "learning_rate": 1e-05, "loss": 0.6006, "mean_token_accuracy": 0.8164473176002502, "num_tokens": 108003143.0, "step": 677 }, { "epoch": 0.34486266531027465, "grad_norm": 1.2039549350738525, "learning_rate": 1e-05, "loss": 0.5863, "mean_token_accuracy": 0.823419451713562, "num_tokens": 108169162.0, "step": 678 }, { "epoch": 0.3453713123092574, "grad_norm": 1.162801742553711, "learning_rate": 1e-05, "loss": 0.548, "mean_token_accuracy": 0.8335537910461426, "num_tokens": 108326285.0, "step": 679 }, { "epoch": 0.34587995930824006, "grad_norm": 1.1526139974594116, "learning_rate": 1e-05, "loss": 0.5487, "mean_token_accuracy": 0.8326566219329834, "num_tokens": 108481647.0, "step": 680 }, { "epoch": 0.3463886063072228, "grad_norm": 1.1998027563095093, "learning_rate": 1e-05, "loss": 0.5556, "mean_token_accuracy": 0.8318803906440735, "num_tokens": 108632219.0, "step": 681 }, { "epoch": 0.34689725330620547, "grad_norm": 1.1241905689239502, "learning_rate": 1e-05, "loss": 0.5693, "mean_token_accuracy": 0.8260743618011475, "num_tokens": 108795594.0, "step": 682 }, { "epoch": 0.3474059003051882, "grad_norm": 1.1517409086227417, "learning_rate": 1e-05, "loss": 0.5577, "mean_token_accuracy": 0.8296923637390137, "num_tokens": 108947589.0, "step": 683 }, { "epoch": 0.34791454730417093, "grad_norm": 1.1438885927200317, "learning_rate": 1e-05, "loss": 0.5731, "mean_token_accuracy": 0.8259981274604797, "num_tokens": 109100848.0, "step": 684 }, { "epoch": 0.3484231943031536, "grad_norm": 1.2787206172943115, "learning_rate": 1e-05, "loss": 0.6061, "mean_token_accuracy": 0.8187931180000305, "num_tokens": 109268616.0, "step": 685 }, { "epoch": 0.34893184130213634, "grad_norm": 1.1225264072418213, "learning_rate": 1e-05, "loss": 0.5434, "mean_token_accuracy": 0.833125650882721, "num_tokens": 109425497.0, "step": 686 }, { "epoch": 0.349440488301119, "grad_norm": 1.3023948669433594, "learning_rate": 1e-05, "loss": 0.5646, "mean_token_accuracy": 0.82758629322052, "num_tokens": 109596743.0, "step": 687 }, { "epoch": 0.34994913530010174, "grad_norm": 1.1158123016357422, "learning_rate": 1e-05, "loss": 0.54, "mean_token_accuracy": 0.8356212377548218, "num_tokens": 109752861.0, "step": 688 }, { "epoch": 0.3504577822990844, "grad_norm": 1.1276473999023438, "learning_rate": 1e-05, "loss": 0.5434, "mean_token_accuracy": 0.8341058492660522, "num_tokens": 109912156.0, "step": 689 }, { "epoch": 0.35096642929806715, "grad_norm": 1.326568365097046, "learning_rate": 1e-05, "loss": 0.5503, "mean_token_accuracy": 0.8320469260215759, "num_tokens": 110075837.0, "step": 690 }, { "epoch": 0.3514750762970498, "grad_norm": 1.1527599096298218, "learning_rate": 1e-05, "loss": 0.587, "mean_token_accuracy": 0.8230042457580566, "num_tokens": 110230103.0, "step": 691 }, { "epoch": 0.35198372329603256, "grad_norm": 1.2510868310928345, "learning_rate": 1e-05, "loss": 0.5628, "mean_token_accuracy": 0.8295976519584656, "num_tokens": 110388178.0, "step": 692 }, { "epoch": 0.35249237029501523, "grad_norm": 1.2448580265045166, "learning_rate": 1e-05, "loss": 0.575, "mean_token_accuracy": 0.8250375390052795, "num_tokens": 110556832.0, "step": 693 }, { "epoch": 0.35300101729399797, "grad_norm": 1.3243716955184937, "learning_rate": 1e-05, "loss": 0.5814, "mean_token_accuracy": 0.8233970403671265, "num_tokens": 110712545.0, "step": 694 }, { "epoch": 0.3535096642929807, "grad_norm": 1.2904678583145142, "learning_rate": 1e-05, "loss": 0.547, "mean_token_accuracy": 0.8315588235855103, "num_tokens": 110855683.0, "step": 695 }, { "epoch": 0.3540183112919634, "grad_norm": 1.129303216934204, "learning_rate": 1e-05, "loss": 0.5724, "mean_token_accuracy": 0.8251450657844543, "num_tokens": 111014148.0, "step": 696 }, { "epoch": 0.3545269582909461, "grad_norm": 1.3024146556854248, "learning_rate": 1e-05, "loss": 0.5699, "mean_token_accuracy": 0.8282129764556885, "num_tokens": 111168794.0, "step": 697 }, { "epoch": 0.3550356052899288, "grad_norm": 1.2099299430847168, "learning_rate": 1e-05, "loss": 0.5728, "mean_token_accuracy": 0.8253839612007141, "num_tokens": 111318802.0, "step": 698 }, { "epoch": 0.3555442522889115, "grad_norm": 1.365613579750061, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.831680417060852, "num_tokens": 111482438.0, "step": 699 }, { "epoch": 0.3560528992878942, "grad_norm": 1.1956545114517212, "learning_rate": 1e-05, "loss": 0.5713, "mean_token_accuracy": 0.8248770833015442, "num_tokens": 111631108.0, "step": 700 }, { "epoch": 0.3565615462868769, "grad_norm": 1.1379332542419434, "learning_rate": 1e-05, "loss": 0.5813, "mean_token_accuracy": 0.8245474696159363, "num_tokens": 111792659.0, "step": 701 }, { "epoch": 0.3570701932858596, "grad_norm": 1.311396837234497, "learning_rate": 1e-05, "loss": 0.575, "mean_token_accuracy": 0.8252518177032471, "num_tokens": 111956492.0, "step": 702 }, { "epoch": 0.3575788402848423, "grad_norm": 1.1396161317825317, "learning_rate": 1e-05, "loss": 0.5696, "mean_token_accuracy": 0.82741379737854, "num_tokens": 112111925.0, "step": 703 }, { "epoch": 0.358087487283825, "grad_norm": 1.2385365962982178, "learning_rate": 1e-05, "loss": 0.5718, "mean_token_accuracy": 0.8264093399047852, "num_tokens": 112271716.0, "step": 704 }, { "epoch": 0.35859613428280773, "grad_norm": 1.1056485176086426, "learning_rate": 1e-05, "loss": 0.5399, "mean_token_accuracy": 0.833939790725708, "num_tokens": 112433508.0, "step": 705 }, { "epoch": 0.35910478128179046, "grad_norm": 1.142451524734497, "learning_rate": 1e-05, "loss": 0.5366, "mean_token_accuracy": 0.8368534445762634, "num_tokens": 112595775.0, "step": 706 }, { "epoch": 0.35961342828077314, "grad_norm": 1.3010401725769043, "learning_rate": 1e-05, "loss": 0.5498, "mean_token_accuracy": 0.8305310010910034, "num_tokens": 112747723.0, "step": 707 }, { "epoch": 0.36012207527975587, "grad_norm": 1.1268346309661865, "learning_rate": 1e-05, "loss": 0.5627, "mean_token_accuracy": 0.828546404838562, "num_tokens": 112906425.0, "step": 708 }, { "epoch": 0.36063072227873855, "grad_norm": 1.151298999786377, "learning_rate": 1e-05, "loss": 0.5531, "mean_token_accuracy": 0.8322466015815735, "num_tokens": 113062355.0, "step": 709 }, { "epoch": 0.3611393692777213, "grad_norm": 1.1866037845611572, "learning_rate": 1e-05, "loss": 0.5839, "mean_token_accuracy": 0.8216893672943115, "num_tokens": 113237249.0, "step": 710 }, { "epoch": 0.36164801627670395, "grad_norm": 1.164064884185791, "learning_rate": 1e-05, "loss": 0.5826, "mean_token_accuracy": 0.8220279216766357, "num_tokens": 113399443.0, "step": 711 }, { "epoch": 0.3621566632756867, "grad_norm": 1.190131425857544, "learning_rate": 1e-05, "loss": 0.5407, "mean_token_accuracy": 0.8338578343391418, "num_tokens": 113555518.0, "step": 712 }, { "epoch": 0.36266531027466936, "grad_norm": 1.2120224237442017, "learning_rate": 1e-05, "loss": 0.5716, "mean_token_accuracy": 0.8261467218399048, "num_tokens": 113698492.0, "step": 713 }, { "epoch": 0.3631739572736521, "grad_norm": 1.243842363357544, "learning_rate": 1e-05, "loss": 0.6163, "mean_token_accuracy": 0.8146119713783264, "num_tokens": 113869795.0, "step": 714 }, { "epoch": 0.36368260427263477, "grad_norm": 1.1908296346664429, "learning_rate": 1e-05, "loss": 0.5441, "mean_token_accuracy": 0.833182692527771, "num_tokens": 114035400.0, "step": 715 }, { "epoch": 0.3641912512716175, "grad_norm": 1.2101727724075317, "learning_rate": 1e-05, "loss": 0.5856, "mean_token_accuracy": 0.8229519128799438, "num_tokens": 114180922.0, "step": 716 }, { "epoch": 0.3646998982706002, "grad_norm": 1.2070703506469727, "learning_rate": 1e-05, "loss": 0.5583, "mean_token_accuracy": 0.8307386636734009, "num_tokens": 114351814.0, "step": 717 }, { "epoch": 0.3652085452695829, "grad_norm": 1.186836838722229, "learning_rate": 1e-05, "loss": 0.5417, "mean_token_accuracy": 0.832840085029602, "num_tokens": 114503125.0, "step": 718 }, { "epoch": 0.36571719226856564, "grad_norm": 1.173302173614502, "learning_rate": 1e-05, "loss": 0.5824, "mean_token_accuracy": 0.8242769241333008, "num_tokens": 114658004.0, "step": 719 }, { "epoch": 0.3662258392675483, "grad_norm": 1.3086676597595215, "learning_rate": 1e-05, "loss": 0.5966, "mean_token_accuracy": 0.8192692995071411, "num_tokens": 114829230.0, "step": 720 }, { "epoch": 0.36673448626653105, "grad_norm": 1.2513558864593506, "learning_rate": 1e-05, "loss": 0.5651, "mean_token_accuracy": 0.8275444507598877, "num_tokens": 114987847.0, "step": 721 }, { "epoch": 0.3672431332655137, "grad_norm": 1.1308586597442627, "learning_rate": 1e-05, "loss": 0.5543, "mean_token_accuracy": 0.8325884938240051, "num_tokens": 115154558.0, "step": 722 }, { "epoch": 0.36775178026449645, "grad_norm": 1.2177181243896484, "learning_rate": 1e-05, "loss": 0.5585, "mean_token_accuracy": 0.8294394612312317, "num_tokens": 115315702.0, "step": 723 }, { "epoch": 0.36826042726347913, "grad_norm": 1.2306625843048096, "learning_rate": 1e-05, "loss": 0.5576, "mean_token_accuracy": 0.8295246958732605, "num_tokens": 115471150.0, "step": 724 }, { "epoch": 0.36876907426246186, "grad_norm": 1.1928412914276123, "learning_rate": 1e-05, "loss": 0.5782, "mean_token_accuracy": 0.8248293399810791, "num_tokens": 115630359.0, "step": 725 }, { "epoch": 0.36927772126144454, "grad_norm": 1.2332453727722168, "learning_rate": 1e-05, "loss": 0.5759, "mean_token_accuracy": 0.8268725872039795, "num_tokens": 115802068.0, "step": 726 }, { "epoch": 0.36978636826042727, "grad_norm": 1.221351981163025, "learning_rate": 1e-05, "loss": 0.5523, "mean_token_accuracy": 0.831588625907898, "num_tokens": 115954597.0, "step": 727 }, { "epoch": 0.37029501525940994, "grad_norm": 1.178653359413147, "learning_rate": 1e-05, "loss": 0.5899, "mean_token_accuracy": 0.8207197189331055, "num_tokens": 116121996.0, "step": 728 }, { "epoch": 0.3708036622583927, "grad_norm": 1.2622578144073486, "learning_rate": 1e-05, "loss": 0.5705, "mean_token_accuracy": 0.8262423276901245, "num_tokens": 116286650.0, "step": 729 }, { "epoch": 0.3713123092573754, "grad_norm": 1.219468116760254, "learning_rate": 1e-05, "loss": 0.551, "mean_token_accuracy": 0.8329716324806213, "num_tokens": 116456648.0, "step": 730 }, { "epoch": 0.3718209562563581, "grad_norm": 1.1355637311935425, "learning_rate": 1e-05, "loss": 0.5541, "mean_token_accuracy": 0.8318668603897095, "num_tokens": 116621931.0, "step": 731 }, { "epoch": 0.3723296032553408, "grad_norm": 1.310593605041504, "learning_rate": 1e-05, "loss": 0.6152, "mean_token_accuracy": 0.8147826790809631, "num_tokens": 116787232.0, "step": 732 }, { "epoch": 0.3728382502543235, "grad_norm": 1.2487748861312866, "learning_rate": 1e-05, "loss": 0.6179, "mean_token_accuracy": 0.8136881589889526, "num_tokens": 116949019.0, "step": 733 }, { "epoch": 0.3733468972533062, "grad_norm": 1.1336181163787842, "learning_rate": 1e-05, "loss": 0.5902, "mean_token_accuracy": 0.8222196102142334, "num_tokens": 117122455.0, "step": 734 }, { "epoch": 0.3738555442522889, "grad_norm": 1.2856602668762207, "learning_rate": 1e-05, "loss": 0.5639, "mean_token_accuracy": 0.828788161277771, "num_tokens": 117267738.0, "step": 735 }, { "epoch": 0.3743641912512716, "grad_norm": 1.1545137166976929, "learning_rate": 1e-05, "loss": 0.5654, "mean_token_accuracy": 0.8256614208221436, "num_tokens": 117430802.0, "step": 736 }, { "epoch": 0.3748728382502543, "grad_norm": 1.143939733505249, "learning_rate": 1e-05, "loss": 0.5776, "mean_token_accuracy": 0.8231573104858398, "num_tokens": 117587032.0, "step": 737 }, { "epoch": 0.37538148524923703, "grad_norm": 1.060882568359375, "learning_rate": 1e-05, "loss": 0.5871, "mean_token_accuracy": 0.8218531608581543, "num_tokens": 117747701.0, "step": 738 }, { "epoch": 0.3758901322482197, "grad_norm": 1.1606186628341675, "learning_rate": 1e-05, "loss": 0.5624, "mean_token_accuracy": 0.8296399116516113, "num_tokens": 117912178.0, "step": 739 }, { "epoch": 0.37639877924720244, "grad_norm": 1.1886235475540161, "learning_rate": 1e-05, "loss": 0.5331, "mean_token_accuracy": 0.8355053663253784, "num_tokens": 118069631.0, "step": 740 }, { "epoch": 0.3769074262461852, "grad_norm": 1.2048909664154053, "learning_rate": 1e-05, "loss": 0.5472, "mean_token_accuracy": 0.832554817199707, "num_tokens": 118216192.0, "step": 741 }, { "epoch": 0.37741607324516785, "grad_norm": 1.2597253322601318, "learning_rate": 1e-05, "loss": 0.5484, "mean_token_accuracy": 0.8324501514434814, "num_tokens": 118364107.0, "step": 742 }, { "epoch": 0.3779247202441506, "grad_norm": 1.1237263679504395, "learning_rate": 1e-05, "loss": 0.5928, "mean_token_accuracy": 0.8186395168304443, "num_tokens": 118532657.0, "step": 743 }, { "epoch": 0.37843336724313326, "grad_norm": 1.2016260623931885, "learning_rate": 1e-05, "loss": 0.5557, "mean_token_accuracy": 0.8294884562492371, "num_tokens": 118689942.0, "step": 744 }, { "epoch": 0.378942014242116, "grad_norm": 1.329368233680725, "learning_rate": 1e-05, "loss": 0.5555, "mean_token_accuracy": 0.8295120596885681, "num_tokens": 118861717.0, "step": 745 }, { "epoch": 0.37945066124109866, "grad_norm": 1.2445459365844727, "learning_rate": 1e-05, "loss": 0.568, "mean_token_accuracy": 0.8270270228385925, "num_tokens": 119027905.0, "step": 746 }, { "epoch": 0.3799593082400814, "grad_norm": 1.2161781787872314, "learning_rate": 1e-05, "loss": 0.5114, "mean_token_accuracy": 0.8412526845932007, "num_tokens": 119173593.0, "step": 747 }, { "epoch": 0.38046795523906407, "grad_norm": 1.3197194337844849, "learning_rate": 1e-05, "loss": 0.6057, "mean_token_accuracy": 0.8171380162239075, "num_tokens": 119333549.0, "step": 748 }, { "epoch": 0.3809766022380468, "grad_norm": 1.2615206241607666, "learning_rate": 1e-05, "loss": 0.552, "mean_token_accuracy": 0.8305937647819519, "num_tokens": 119485196.0, "step": 749 }, { "epoch": 0.3814852492370295, "grad_norm": 1.241072416305542, "learning_rate": 1e-05, "loss": 0.5502, "mean_token_accuracy": 0.8316583633422852, "num_tokens": 119635566.0, "step": 750 }, { "epoch": 0.3819938962360122, "grad_norm": 1.3704606294631958, "learning_rate": 1e-05, "loss": 0.5569, "mean_token_accuracy": 0.8296416997909546, "num_tokens": 119796534.0, "step": 751 }, { "epoch": 0.38250254323499494, "grad_norm": 1.2328382730484009, "learning_rate": 1e-05, "loss": 0.6011, "mean_token_accuracy": 0.8162603378295898, "num_tokens": 119965071.0, "step": 752 }, { "epoch": 0.3830111902339776, "grad_norm": 1.320970892906189, "learning_rate": 1e-05, "loss": 0.5441, "mean_token_accuracy": 0.8339331746101379, "num_tokens": 120116111.0, "step": 753 }, { "epoch": 0.38351983723296035, "grad_norm": 1.234978199005127, "learning_rate": 1e-05, "loss": 0.585, "mean_token_accuracy": 0.8218483924865723, "num_tokens": 120275408.0, "step": 754 }, { "epoch": 0.384028484231943, "grad_norm": 1.2488471269607544, "learning_rate": 1e-05, "loss": 0.5602, "mean_token_accuracy": 0.8297809362411499, "num_tokens": 120432203.0, "step": 755 }, { "epoch": 0.38453713123092575, "grad_norm": 1.2408305406570435, "learning_rate": 1e-05, "loss": 0.5966, "mean_token_accuracy": 0.8207588791847229, "num_tokens": 120600556.0, "step": 756 }, { "epoch": 0.38504577822990843, "grad_norm": 1.1832354068756104, "learning_rate": 1e-05, "loss": 0.5647, "mean_token_accuracy": 0.8281243443489075, "num_tokens": 120758883.0, "step": 757 }, { "epoch": 0.38555442522889116, "grad_norm": 1.1637436151504517, "learning_rate": 1e-05, "loss": 0.5468, "mean_token_accuracy": 0.8347381949424744, "num_tokens": 120922100.0, "step": 758 }, { "epoch": 0.38606307222787384, "grad_norm": 1.2801555395126343, "learning_rate": 1e-05, "loss": 0.5651, "mean_token_accuracy": 0.8274706602096558, "num_tokens": 121075897.0, "step": 759 }, { "epoch": 0.38657171922685657, "grad_norm": 1.390296459197998, "learning_rate": 1e-05, "loss": 0.551, "mean_token_accuracy": 0.8321919441223145, "num_tokens": 121235661.0, "step": 760 }, { "epoch": 0.38708036622583925, "grad_norm": 1.1444047689437866, "learning_rate": 1e-05, "loss": 0.5786, "mean_token_accuracy": 0.8248952627182007, "num_tokens": 121393640.0, "step": 761 }, { "epoch": 0.387589013224822, "grad_norm": 1.3989663124084473, "learning_rate": 1e-05, "loss": 0.5751, "mean_token_accuracy": 0.8251594305038452, "num_tokens": 121539889.0, "step": 762 }, { "epoch": 0.38809766022380465, "grad_norm": 1.1752854585647583, "learning_rate": 1e-05, "loss": 0.5567, "mean_token_accuracy": 0.8292431235313416, "num_tokens": 121699739.0, "step": 763 }, { "epoch": 0.3886063072227874, "grad_norm": 1.4760440587997437, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8403042554855347, "num_tokens": 121844594.0, "step": 764 }, { "epoch": 0.3891149542217701, "grad_norm": 1.1970382928848267, "learning_rate": 1e-05, "loss": 0.5733, "mean_token_accuracy": 0.8265695571899414, "num_tokens": 122018838.0, "step": 765 }, { "epoch": 0.3896236012207528, "grad_norm": 1.3410618305206299, "learning_rate": 1e-05, "loss": 0.5922, "mean_token_accuracy": 0.8223105669021606, "num_tokens": 122181626.0, "step": 766 }, { "epoch": 0.3901322482197355, "grad_norm": 1.3157473802566528, "learning_rate": 1e-05, "loss": 0.5569, "mean_token_accuracy": 0.8304774761199951, "num_tokens": 122336319.0, "step": 767 }, { "epoch": 0.3906408952187182, "grad_norm": 1.3343852758407593, "learning_rate": 1e-05, "loss": 0.5262, "mean_token_accuracy": 0.8391714096069336, "num_tokens": 122486556.0, "step": 768 }, { "epoch": 0.39114954221770093, "grad_norm": 1.2602474689483643, "learning_rate": 1e-05, "loss": 0.5704, "mean_token_accuracy": 0.8253237009048462, "num_tokens": 122643871.0, "step": 769 }, { "epoch": 0.3916581892166836, "grad_norm": 1.2566087245941162, "learning_rate": 1e-05, "loss": 0.5638, "mean_token_accuracy": 0.8279736042022705, "num_tokens": 122806517.0, "step": 770 }, { "epoch": 0.39216683621566634, "grad_norm": 1.23412024974823, "learning_rate": 1e-05, "loss": 0.56, "mean_token_accuracy": 0.8286000490188599, "num_tokens": 122963330.0, "step": 771 }, { "epoch": 0.392675483214649, "grad_norm": 1.1621471643447876, "learning_rate": 1e-05, "loss": 0.5703, "mean_token_accuracy": 0.8259589076042175, "num_tokens": 123130884.0, "step": 772 }, { "epoch": 0.39318413021363174, "grad_norm": 1.1377054452896118, "learning_rate": 1e-05, "loss": 0.5535, "mean_token_accuracy": 0.8336734175682068, "num_tokens": 123291806.0, "step": 773 }, { "epoch": 0.3936927772126144, "grad_norm": 1.12781822681427, "learning_rate": 1e-05, "loss": 0.51, "mean_token_accuracy": 0.8437920212745667, "num_tokens": 123442859.0, "step": 774 }, { "epoch": 0.39420142421159715, "grad_norm": 1.2181215286254883, "learning_rate": 1e-05, "loss": 0.5546, "mean_token_accuracy": 0.8290634155273438, "num_tokens": 123592681.0, "step": 775 }, { "epoch": 0.3947100712105799, "grad_norm": 1.3110220432281494, "learning_rate": 1e-05, "loss": 0.5395, "mean_token_accuracy": 0.8334207534790039, "num_tokens": 123764752.0, "step": 776 }, { "epoch": 0.39521871820956256, "grad_norm": 1.2244796752929688, "learning_rate": 1e-05, "loss": 0.5725, "mean_token_accuracy": 0.8263367414474487, "num_tokens": 123922538.0, "step": 777 }, { "epoch": 0.3957273652085453, "grad_norm": 1.2441754341125488, "learning_rate": 1e-05, "loss": 0.543, "mean_token_accuracy": 0.8332741260528564, "num_tokens": 124083040.0, "step": 778 }, { "epoch": 0.39623601220752797, "grad_norm": 1.3164564371109009, "learning_rate": 1e-05, "loss": 0.5608, "mean_token_accuracy": 0.8291635513305664, "num_tokens": 124239220.0, "step": 779 }, { "epoch": 0.3967446592065107, "grad_norm": 1.1562912464141846, "learning_rate": 1e-05, "loss": 0.5583, "mean_token_accuracy": 0.8303772807121277, "num_tokens": 124398859.0, "step": 780 }, { "epoch": 0.3972533062054934, "grad_norm": 1.2362325191497803, "learning_rate": 1e-05, "loss": 0.582, "mean_token_accuracy": 0.8240430951118469, "num_tokens": 124549761.0, "step": 781 }, { "epoch": 0.3977619532044761, "grad_norm": 1.1560744047164917, "learning_rate": 1e-05, "loss": 0.5853, "mean_token_accuracy": 0.8228815793991089, "num_tokens": 124705300.0, "step": 782 }, { "epoch": 0.3982706002034588, "grad_norm": 1.2574455738067627, "learning_rate": 1e-05, "loss": 0.5631, "mean_token_accuracy": 0.8280406594276428, "num_tokens": 124863824.0, "step": 783 }, { "epoch": 0.3987792472024415, "grad_norm": 1.2234323024749756, "learning_rate": 1e-05, "loss": 0.5901, "mean_token_accuracy": 0.8210317492485046, "num_tokens": 125017794.0, "step": 784 }, { "epoch": 0.3992878942014242, "grad_norm": 1.2508187294006348, "learning_rate": 1e-05, "loss": 0.5789, "mean_token_accuracy": 0.8266098499298096, "num_tokens": 125178609.0, "step": 785 }, { "epoch": 0.3997965412004069, "grad_norm": 1.2754849195480347, "learning_rate": 1e-05, "loss": 0.5759, "mean_token_accuracy": 0.8228343725204468, "num_tokens": 125331610.0, "step": 786 }, { "epoch": 0.40030518819938965, "grad_norm": 1.1867464780807495, "learning_rate": 1e-05, "loss": 0.5324, "mean_token_accuracy": 0.8351470232009888, "num_tokens": 125481250.0, "step": 787 }, { "epoch": 0.4008138351983723, "grad_norm": 1.333287239074707, "learning_rate": 1e-05, "loss": 0.5767, "mean_token_accuracy": 0.825285017490387, "num_tokens": 125650464.0, "step": 788 }, { "epoch": 0.40132248219735506, "grad_norm": 1.4091664552688599, "learning_rate": 1e-05, "loss": 0.5532, "mean_token_accuracy": 0.8286569118499756, "num_tokens": 125817057.0, "step": 789 }, { "epoch": 0.40183112919633773, "grad_norm": 1.0249348878860474, "learning_rate": 1e-05, "loss": 0.5409, "mean_token_accuracy": 0.833737850189209, "num_tokens": 125964354.0, "step": 790 }, { "epoch": 0.40233977619532046, "grad_norm": 1.2201231718063354, "learning_rate": 1e-05, "loss": 0.5382, "mean_token_accuracy": 0.8345334529876709, "num_tokens": 126119274.0, "step": 791 }, { "epoch": 0.40284842319430314, "grad_norm": 1.3263883590698242, "learning_rate": 1e-05, "loss": 0.5606, "mean_token_accuracy": 0.8295636773109436, "num_tokens": 126266332.0, "step": 792 }, { "epoch": 0.40335707019328587, "grad_norm": 1.1370054483413696, "learning_rate": 1e-05, "loss": 0.5671, "mean_token_accuracy": 0.8283527493476868, "num_tokens": 126416815.0, "step": 793 }, { "epoch": 0.40386571719226855, "grad_norm": 1.232959508895874, "learning_rate": 1e-05, "loss": 0.5465, "mean_token_accuracy": 0.8347012400627136, "num_tokens": 126582617.0, "step": 794 }, { "epoch": 0.4043743641912513, "grad_norm": 1.158482551574707, "learning_rate": 1e-05, "loss": 0.5592, "mean_token_accuracy": 0.8307364583015442, "num_tokens": 126734456.0, "step": 795 }, { "epoch": 0.40488301119023395, "grad_norm": 1.0260847806930542, "learning_rate": 1e-05, "loss": 0.5504, "mean_token_accuracy": 0.832145631313324, "num_tokens": 126894951.0, "step": 796 }, { "epoch": 0.4053916581892167, "grad_norm": 1.1842215061187744, "learning_rate": 1e-05, "loss": 0.5751, "mean_token_accuracy": 0.8247040510177612, "num_tokens": 127040903.0, "step": 797 }, { "epoch": 0.4059003051881994, "grad_norm": 1.2335243225097656, "learning_rate": 1e-05, "loss": 0.5908, "mean_token_accuracy": 0.8203405737876892, "num_tokens": 127193010.0, "step": 798 }, { "epoch": 0.4064089521871821, "grad_norm": 1.0869059562683105, "learning_rate": 1e-05, "loss": 0.5698, "mean_token_accuracy": 0.8263587951660156, "num_tokens": 127356124.0, "step": 799 }, { "epoch": 0.4069175991861648, "grad_norm": 1.0540488958358765, "learning_rate": 1e-05, "loss": 0.5488, "mean_token_accuracy": 0.8332864046096802, "num_tokens": 127530354.0, "step": 800 }, { "epoch": 0.4074262461851475, "grad_norm": 1.14442777633667, "learning_rate": 1e-05, "loss": 0.556, "mean_token_accuracy": 0.8312243223190308, "num_tokens": 127674617.0, "step": 801 }, { "epoch": 0.40793489318413023, "grad_norm": 1.1911969184875488, "learning_rate": 1e-05, "loss": 0.5508, "mean_token_accuracy": 0.8314390182495117, "num_tokens": 127833308.0, "step": 802 }, { "epoch": 0.4084435401831129, "grad_norm": 1.2103899717330933, "learning_rate": 1e-05, "loss": 0.5811, "mean_token_accuracy": 0.8233762979507446, "num_tokens": 127994496.0, "step": 803 }, { "epoch": 0.40895218718209564, "grad_norm": 1.246382713317871, "learning_rate": 1e-05, "loss": 0.5577, "mean_token_accuracy": 0.8302004337310791, "num_tokens": 128161302.0, "step": 804 }, { "epoch": 0.4094608341810783, "grad_norm": 1.2232249975204468, "learning_rate": 1e-05, "loss": 0.5679, "mean_token_accuracy": 0.8264790773391724, "num_tokens": 128318321.0, "step": 805 }, { "epoch": 0.40996948118006105, "grad_norm": 1.2698410749435425, "learning_rate": 1e-05, "loss": 0.5626, "mean_token_accuracy": 0.8275660872459412, "num_tokens": 128485010.0, "step": 806 }, { "epoch": 0.4104781281790437, "grad_norm": 1.1866873502731323, "learning_rate": 1e-05, "loss": 0.5847, "mean_token_accuracy": 0.8216865658760071, "num_tokens": 128651307.0, "step": 807 }, { "epoch": 0.41098677517802645, "grad_norm": 1.2357579469680786, "learning_rate": 1e-05, "loss": 0.5459, "mean_token_accuracy": 0.8317310810089111, "num_tokens": 128794807.0, "step": 808 }, { "epoch": 0.41149542217700913, "grad_norm": 1.2447888851165771, "learning_rate": 1e-05, "loss": 0.5525, "mean_token_accuracy": 0.8310381770133972, "num_tokens": 128954027.0, "step": 809 }, { "epoch": 0.41200406917599186, "grad_norm": 1.1477376222610474, "learning_rate": 1e-05, "loss": 0.5514, "mean_token_accuracy": 0.8300490379333496, "num_tokens": 129117445.0, "step": 810 }, { "epoch": 0.4125127161749746, "grad_norm": 1.1172250509262085, "learning_rate": 1e-05, "loss": 0.5521, "mean_token_accuracy": 0.830201268196106, "num_tokens": 129285951.0, "step": 811 }, { "epoch": 0.41302136317395727, "grad_norm": 1.1050262451171875, "learning_rate": 1e-05, "loss": 0.5875, "mean_token_accuracy": 0.8233176469802856, "num_tokens": 129465561.0, "step": 812 }, { "epoch": 0.41353001017294, "grad_norm": 1.23516845703125, "learning_rate": 1e-05, "loss": 0.5097, "mean_token_accuracy": 0.8415330648422241, "num_tokens": 129614099.0, "step": 813 }, { "epoch": 0.4140386571719227, "grad_norm": 1.1005412340164185, "learning_rate": 1e-05, "loss": 0.55, "mean_token_accuracy": 0.8317654728889465, "num_tokens": 129789431.0, "step": 814 }, { "epoch": 0.4145473041709054, "grad_norm": 1.0260275602340698, "learning_rate": 1e-05, "loss": 0.5099, "mean_token_accuracy": 0.8431083559989929, "num_tokens": 129947839.0, "step": 815 }, { "epoch": 0.4150559511698881, "grad_norm": 1.2571784257888794, "learning_rate": 1e-05, "loss": 0.5925, "mean_token_accuracy": 0.8207975029945374, "num_tokens": 130099262.0, "step": 816 }, { "epoch": 0.4155645981688708, "grad_norm": 1.1722218990325928, "learning_rate": 1e-05, "loss": 0.5542, "mean_token_accuracy": 0.830123245716095, "num_tokens": 130262007.0, "step": 817 }, { "epoch": 0.4160732451678535, "grad_norm": 1.234281063079834, "learning_rate": 1e-05, "loss": 0.5842, "mean_token_accuracy": 0.8230294585227966, "num_tokens": 130412634.0, "step": 818 }, { "epoch": 0.4165818921668362, "grad_norm": 1.166526198387146, "learning_rate": 1e-05, "loss": 0.5665, "mean_token_accuracy": 0.8284645080566406, "num_tokens": 130576459.0, "step": 819 }, { "epoch": 0.4170905391658189, "grad_norm": 1.139403223991394, "learning_rate": 1e-05, "loss": 0.5654, "mean_token_accuracy": 0.8261489272117615, "num_tokens": 130738545.0, "step": 820 }, { "epoch": 0.41759918616480163, "grad_norm": 1.0868419408798218, "learning_rate": 1e-05, "loss": 0.5814, "mean_token_accuracy": 0.8246670961380005, "num_tokens": 130899383.0, "step": 821 }, { "epoch": 0.41810783316378436, "grad_norm": 1.1820749044418335, "learning_rate": 1e-05, "loss": 0.5486, "mean_token_accuracy": 0.8303079605102539, "num_tokens": 131069889.0, "step": 822 }, { "epoch": 0.41861648016276704, "grad_norm": 1.1270747184753418, "learning_rate": 1e-05, "loss": 0.5625, "mean_token_accuracy": 0.8289308547973633, "num_tokens": 131230218.0, "step": 823 }, { "epoch": 0.41912512716174977, "grad_norm": 1.3298698663711548, "learning_rate": 1e-05, "loss": 0.5661, "mean_token_accuracy": 0.826373815536499, "num_tokens": 131374265.0, "step": 824 }, { "epoch": 0.41963377416073244, "grad_norm": 1.1981703042984009, "learning_rate": 1e-05, "loss": 0.5483, "mean_token_accuracy": 0.8322027921676636, "num_tokens": 131543646.0, "step": 825 }, { "epoch": 0.4201424211597152, "grad_norm": 1.183638334274292, "learning_rate": 1e-05, "loss": 0.5913, "mean_token_accuracy": 0.8200170397758484, "num_tokens": 131702768.0, "step": 826 }, { "epoch": 0.42065106815869785, "grad_norm": 1.2016685009002686, "learning_rate": 1e-05, "loss": 0.5742, "mean_token_accuracy": 0.826677680015564, "num_tokens": 131860643.0, "step": 827 }, { "epoch": 0.4211597151576806, "grad_norm": 1.2117851972579956, "learning_rate": 1e-05, "loss": 0.5348, "mean_token_accuracy": 0.835245668888092, "num_tokens": 132005394.0, "step": 828 }, { "epoch": 0.42166836215666326, "grad_norm": 1.2130509614944458, "learning_rate": 1e-05, "loss": 0.5418, "mean_token_accuracy": 0.8323445320129395, "num_tokens": 132162321.0, "step": 829 }, { "epoch": 0.422177009155646, "grad_norm": 1.1499370336532593, "learning_rate": 1e-05, "loss": 0.5328, "mean_token_accuracy": 0.8365702629089355, "num_tokens": 132318929.0, "step": 830 }, { "epoch": 0.42268565615462866, "grad_norm": 1.1348990201950073, "learning_rate": 1e-05, "loss": 0.5476, "mean_token_accuracy": 0.8332790732383728, "num_tokens": 132479785.0, "step": 831 }, { "epoch": 0.4231943031536114, "grad_norm": 1.1613720655441284, "learning_rate": 1e-05, "loss": 0.5196, "mean_token_accuracy": 0.8395429253578186, "num_tokens": 132639135.0, "step": 832 }, { "epoch": 0.4237029501525941, "grad_norm": 1.314821720123291, "learning_rate": 1e-05, "loss": 0.6086, "mean_token_accuracy": 0.8147256970405579, "num_tokens": 132798835.0, "step": 833 }, { "epoch": 0.4242115971515768, "grad_norm": 1.0963644981384277, "learning_rate": 1e-05, "loss": 0.5403, "mean_token_accuracy": 0.8340123891830444, "num_tokens": 132955884.0, "step": 834 }, { "epoch": 0.42472024415055953, "grad_norm": 1.202355980873108, "learning_rate": 1e-05, "loss": 0.5384, "mean_token_accuracy": 0.834297776222229, "num_tokens": 133118521.0, "step": 835 }, { "epoch": 0.4252288911495422, "grad_norm": 1.1341054439544678, "learning_rate": 1e-05, "loss": 0.5146, "mean_token_accuracy": 0.842079758644104, "num_tokens": 133281242.0, "step": 836 }, { "epoch": 0.42573753814852494, "grad_norm": 1.160120964050293, "learning_rate": 1e-05, "loss": 0.5471, "mean_token_accuracy": 0.8323448896408081, "num_tokens": 133450654.0, "step": 837 }, { "epoch": 0.4262461851475076, "grad_norm": 1.1472262144088745, "learning_rate": 1e-05, "loss": 0.5742, "mean_token_accuracy": 0.8254837989807129, "num_tokens": 133620766.0, "step": 838 }, { "epoch": 0.42675483214649035, "grad_norm": 1.1196868419647217, "learning_rate": 1e-05, "loss": 0.5786, "mean_token_accuracy": 0.8239191174507141, "num_tokens": 133790293.0, "step": 839 }, { "epoch": 0.427263479145473, "grad_norm": 1.1362370252609253, "learning_rate": 1e-05, "loss": 0.536, "mean_token_accuracy": 0.8357857465744019, "num_tokens": 133962681.0, "step": 840 }, { "epoch": 0.42777212614445576, "grad_norm": 1.1518752574920654, "learning_rate": 1e-05, "loss": 0.57, "mean_token_accuracy": 0.8252599239349365, "num_tokens": 134128875.0, "step": 841 }, { "epoch": 0.42828077314343843, "grad_norm": 1.142612099647522, "learning_rate": 1e-05, "loss": 0.6033, "mean_token_accuracy": 0.8177984952926636, "num_tokens": 134299862.0, "step": 842 }, { "epoch": 0.42878942014242116, "grad_norm": 1.2787010669708252, "learning_rate": 1e-05, "loss": 0.535, "mean_token_accuracy": 0.8356817960739136, "num_tokens": 134447887.0, "step": 843 }, { "epoch": 0.42929806714140384, "grad_norm": 1.1462864875793457, "learning_rate": 1e-05, "loss": 0.5543, "mean_token_accuracy": 0.8310085535049438, "num_tokens": 134603411.0, "step": 844 }, { "epoch": 0.42980671414038657, "grad_norm": 1.2700620889663696, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8362004160881042, "num_tokens": 134753687.0, "step": 845 }, { "epoch": 0.4303153611393693, "grad_norm": 1.2534452676773071, "learning_rate": 1e-05, "loss": 0.5627, "mean_token_accuracy": 0.8288543224334717, "num_tokens": 134910477.0, "step": 846 }, { "epoch": 0.430824008138352, "grad_norm": 1.0986623764038086, "learning_rate": 1e-05, "loss": 0.569, "mean_token_accuracy": 0.8261380791664124, "num_tokens": 135065619.0, "step": 847 }, { "epoch": 0.4313326551373347, "grad_norm": 1.3233753442764282, "learning_rate": 1e-05, "loss": 0.5968, "mean_token_accuracy": 0.8202333450317383, "num_tokens": 135215377.0, "step": 848 }, { "epoch": 0.4318413021363174, "grad_norm": 1.2474511861801147, "learning_rate": 1e-05, "loss": 0.5717, "mean_token_accuracy": 0.8252692222595215, "num_tokens": 135371149.0, "step": 849 }, { "epoch": 0.4323499491353001, "grad_norm": 1.084205150604248, "learning_rate": 1e-05, "loss": 0.5536, "mean_token_accuracy": 0.8310810327529907, "num_tokens": 135530087.0, "step": 850 }, { "epoch": 0.4328585961342828, "grad_norm": 1.149458408355713, "learning_rate": 1e-05, "loss": 0.5481, "mean_token_accuracy": 0.8319704532623291, "num_tokens": 135689766.0, "step": 851 }, { "epoch": 0.4333672431332655, "grad_norm": 1.1447380781173706, "learning_rate": 1e-05, "loss": 0.5274, "mean_token_accuracy": 0.838167130947113, "num_tokens": 135852811.0, "step": 852 }, { "epoch": 0.4338758901322482, "grad_norm": 1.070308804512024, "learning_rate": 1e-05, "loss": 0.55, "mean_token_accuracy": 0.8330268859863281, "num_tokens": 136011756.0, "step": 853 }, { "epoch": 0.43438453713123093, "grad_norm": 1.2195512056350708, "learning_rate": 1e-05, "loss": 0.5499, "mean_token_accuracy": 0.8309155106544495, "num_tokens": 136177024.0, "step": 854 }, { "epoch": 0.4348931841302136, "grad_norm": 1.1714608669281006, "learning_rate": 1e-05, "loss": 0.523, "mean_token_accuracy": 0.8379561305046082, "num_tokens": 136325905.0, "step": 855 }, { "epoch": 0.43540183112919634, "grad_norm": 1.1079695224761963, "learning_rate": 1e-05, "loss": 0.5771, "mean_token_accuracy": 0.8236668109893799, "num_tokens": 136482739.0, "step": 856 }, { "epoch": 0.43591047812817907, "grad_norm": 1.145799160003662, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.8371036052703857, "num_tokens": 136652674.0, "step": 857 }, { "epoch": 0.43641912512716174, "grad_norm": 1.2814421653747559, "learning_rate": 1e-05, "loss": 0.5385, "mean_token_accuracy": 0.8340997695922852, "num_tokens": 136822688.0, "step": 858 }, { "epoch": 0.4369277721261445, "grad_norm": 1.0849554538726807, "learning_rate": 1e-05, "loss": 0.5552, "mean_token_accuracy": 0.8300553560256958, "num_tokens": 136980565.0, "step": 859 }, { "epoch": 0.43743641912512715, "grad_norm": 1.0674481391906738, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.8417307138442993, "num_tokens": 137137295.0, "step": 860 }, { "epoch": 0.4379450661241099, "grad_norm": 1.2410690784454346, "learning_rate": 1e-05, "loss": 0.5406, "mean_token_accuracy": 0.83237624168396, "num_tokens": 137301870.0, "step": 861 }, { "epoch": 0.43845371312309256, "grad_norm": 1.118271827697754, "learning_rate": 1e-05, "loss": 0.5551, "mean_token_accuracy": 0.830010175704956, "num_tokens": 137449013.0, "step": 862 }, { "epoch": 0.4389623601220753, "grad_norm": 1.2069499492645264, "learning_rate": 1e-05, "loss": 0.5572, "mean_token_accuracy": 0.8290837407112122, "num_tokens": 137604020.0, "step": 863 }, { "epoch": 0.43947100712105797, "grad_norm": 1.1198499202728271, "learning_rate": 1e-05, "loss": 0.537, "mean_token_accuracy": 0.8342312574386597, "num_tokens": 137767680.0, "step": 864 }, { "epoch": 0.4399796541200407, "grad_norm": 1.056345820426941, "learning_rate": 1e-05, "loss": 0.5484, "mean_token_accuracy": 0.8331097960472107, "num_tokens": 137912184.0, "step": 865 }, { "epoch": 0.4404883011190234, "grad_norm": 1.2465364933013916, "learning_rate": 1e-05, "loss": 0.5658, "mean_token_accuracy": 0.8300220966339111, "num_tokens": 138066689.0, "step": 866 }, { "epoch": 0.4409969481180061, "grad_norm": 1.1372525691986084, "learning_rate": 1e-05, "loss": 0.5661, "mean_token_accuracy": 0.8273263573646545, "num_tokens": 138235428.0, "step": 867 }, { "epoch": 0.44150559511698884, "grad_norm": 1.4288570880889893, "learning_rate": 1e-05, "loss": 0.6132, "mean_token_accuracy": 0.8172624111175537, "num_tokens": 138390463.0, "step": 868 }, { "epoch": 0.4420142421159715, "grad_norm": 1.215345025062561, "learning_rate": 1e-05, "loss": 0.5718, "mean_token_accuracy": 0.8258681297302246, "num_tokens": 138552942.0, "step": 869 }, { "epoch": 0.44252288911495424, "grad_norm": 1.1598032712936401, "learning_rate": 1e-05, "loss": 0.5485, "mean_token_accuracy": 0.8313640356063843, "num_tokens": 138712726.0, "step": 870 }, { "epoch": 0.4430315361139369, "grad_norm": 1.1277799606323242, "learning_rate": 1e-05, "loss": 0.5425, "mean_token_accuracy": 0.8344129323959351, "num_tokens": 138880829.0, "step": 871 }, { "epoch": 0.44354018311291965, "grad_norm": 1.2340621948242188, "learning_rate": 1e-05, "loss": 0.564, "mean_token_accuracy": 0.826789140701294, "num_tokens": 139056494.0, "step": 872 }, { "epoch": 0.4440488301119023, "grad_norm": 1.2002519369125366, "learning_rate": 1e-05, "loss": 0.6162, "mean_token_accuracy": 0.8133549690246582, "num_tokens": 139216913.0, "step": 873 }, { "epoch": 0.44455747711088506, "grad_norm": 1.2109215259552002, "learning_rate": 1e-05, "loss": 0.5111, "mean_token_accuracy": 0.8412950038909912, "num_tokens": 139362971.0, "step": 874 }, { "epoch": 0.44506612410986773, "grad_norm": 1.1629414558410645, "learning_rate": 1e-05, "loss": 0.5791, "mean_token_accuracy": 0.8220655918121338, "num_tokens": 139513998.0, "step": 875 }, { "epoch": 0.44557477110885046, "grad_norm": 1.2482560873031616, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.8395774960517883, "num_tokens": 139661330.0, "step": 876 }, { "epoch": 0.44608341810783314, "grad_norm": 1.1493937969207764, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.8332647681236267, "num_tokens": 139830108.0, "step": 877 }, { "epoch": 0.44659206510681587, "grad_norm": 1.1169462203979492, "learning_rate": 1e-05, "loss": 0.5895, "mean_token_accuracy": 0.8215635418891907, "num_tokens": 139996848.0, "step": 878 }, { "epoch": 0.4471007121057986, "grad_norm": 1.1568292379379272, "learning_rate": 1e-05, "loss": 0.5616, "mean_token_accuracy": 0.8280988335609436, "num_tokens": 140162511.0, "step": 879 }, { "epoch": 0.4476093591047813, "grad_norm": 1.1624438762664795, "learning_rate": 1e-05, "loss": 0.5412, "mean_token_accuracy": 0.8338565826416016, "num_tokens": 140315404.0, "step": 880 }, { "epoch": 0.448118006103764, "grad_norm": 1.1456266641616821, "learning_rate": 1e-05, "loss": 0.5844, "mean_token_accuracy": 0.8213433623313904, "num_tokens": 140482165.0, "step": 881 }, { "epoch": 0.4486266531027467, "grad_norm": 1.1150074005126953, "learning_rate": 1e-05, "loss": 0.5454, "mean_token_accuracy": 0.8319356441497803, "num_tokens": 140632686.0, "step": 882 }, { "epoch": 0.4491353001017294, "grad_norm": 1.145408272743225, "learning_rate": 1e-05, "loss": 0.5557, "mean_token_accuracy": 0.8295482397079468, "num_tokens": 140795616.0, "step": 883 }, { "epoch": 0.4496439471007121, "grad_norm": 1.180821180343628, "learning_rate": 1e-05, "loss": 0.5282, "mean_token_accuracy": 0.8360565900802612, "num_tokens": 140945277.0, "step": 884 }, { "epoch": 0.4501525940996948, "grad_norm": 1.1291463375091553, "learning_rate": 1e-05, "loss": 0.5532, "mean_token_accuracy": 0.8307245969772339, "num_tokens": 141105845.0, "step": 885 }, { "epoch": 0.4506612410986775, "grad_norm": 1.061436653137207, "learning_rate": 1e-05, "loss": 0.5555, "mean_token_accuracy": 0.829817533493042, "num_tokens": 141261262.0, "step": 886 }, { "epoch": 0.45116988809766023, "grad_norm": 1.2997404336929321, "learning_rate": 1e-05, "loss": 0.5312, "mean_token_accuracy": 0.8381103277206421, "num_tokens": 141419743.0, "step": 887 }, { "epoch": 0.4516785350966429, "grad_norm": 1.1619393825531006, "learning_rate": 1e-05, "loss": 0.5665, "mean_token_accuracy": 0.8256223201751709, "num_tokens": 141584264.0, "step": 888 }, { "epoch": 0.45218718209562564, "grad_norm": 1.069907784461975, "learning_rate": 1e-05, "loss": 0.5579, "mean_token_accuracy": 0.8292261362075806, "num_tokens": 141744206.0, "step": 889 }, { "epoch": 0.4526958290946083, "grad_norm": 1.191235065460205, "learning_rate": 1e-05, "loss": 0.5037, "mean_token_accuracy": 0.842210590839386, "num_tokens": 141902690.0, "step": 890 }, { "epoch": 0.45320447609359105, "grad_norm": 1.1242882013320923, "learning_rate": 1e-05, "loss": 0.5517, "mean_token_accuracy": 0.8317337036132812, "num_tokens": 142051810.0, "step": 891 }, { "epoch": 0.4537131230925738, "grad_norm": 1.2400939464569092, "learning_rate": 1e-05, "loss": 0.5613, "mean_token_accuracy": 0.8265682458877563, "num_tokens": 142212848.0, "step": 892 }, { "epoch": 0.45422177009155645, "grad_norm": 1.1740095615386963, "learning_rate": 1e-05, "loss": 0.5759, "mean_token_accuracy": 0.8247115612030029, "num_tokens": 142378279.0, "step": 893 }, { "epoch": 0.4547304170905392, "grad_norm": 1.1696137189865112, "learning_rate": 1e-05, "loss": 0.5672, "mean_token_accuracy": 0.8267513513565063, "num_tokens": 142533419.0, "step": 894 }, { "epoch": 0.45523906408952186, "grad_norm": 1.1340491771697998, "learning_rate": 1e-05, "loss": 0.5554, "mean_token_accuracy": 0.8306752443313599, "num_tokens": 142681407.0, "step": 895 }, { "epoch": 0.4557477110885046, "grad_norm": 1.1108263731002808, "learning_rate": 1e-05, "loss": 0.5576, "mean_token_accuracy": 0.8294530510902405, "num_tokens": 142834121.0, "step": 896 }, { "epoch": 0.45625635808748727, "grad_norm": 1.1582520008087158, "learning_rate": 1e-05, "loss": 0.552, "mean_token_accuracy": 0.8302773237228394, "num_tokens": 142995941.0, "step": 897 }, { "epoch": 0.45676500508647, "grad_norm": 1.0763006210327148, "learning_rate": 1e-05, "loss": 0.5422, "mean_token_accuracy": 0.8331145644187927, "num_tokens": 143165685.0, "step": 898 }, { "epoch": 0.4572736520854527, "grad_norm": 1.2574509382247925, "learning_rate": 1e-05, "loss": 0.5354, "mean_token_accuracy": 0.835533618927002, "num_tokens": 143324501.0, "step": 899 }, { "epoch": 0.4577822990844354, "grad_norm": 1.1457747220993042, "learning_rate": 1e-05, "loss": 0.5445, "mean_token_accuracy": 0.8330206274986267, "num_tokens": 143485496.0, "step": 900 }, { "epoch": 0.4582909460834181, "grad_norm": 1.1134295463562012, "learning_rate": 1e-05, "loss": 0.5779, "mean_token_accuracy": 0.8217025399208069, "num_tokens": 143640935.0, "step": 901 }, { "epoch": 0.4587995930824008, "grad_norm": 1.3248838186264038, "learning_rate": 1e-05, "loss": 0.5638, "mean_token_accuracy": 0.8270464539527893, "num_tokens": 143805905.0, "step": 902 }, { "epoch": 0.45930824008138355, "grad_norm": 1.1526035070419312, "learning_rate": 1e-05, "loss": 0.5757, "mean_token_accuracy": 0.8245916366577148, "num_tokens": 143960760.0, "step": 903 }, { "epoch": 0.4598168870803662, "grad_norm": 1.0641709566116333, "learning_rate": 1e-05, "loss": 0.5105, "mean_token_accuracy": 0.8424915671348572, "num_tokens": 144107657.0, "step": 904 }, { "epoch": 0.46032553407934895, "grad_norm": 1.5470635890960693, "learning_rate": 1e-05, "loss": 0.5587, "mean_token_accuracy": 0.8290267586708069, "num_tokens": 144261684.0, "step": 905 }, { "epoch": 0.46083418107833163, "grad_norm": 1.2497509717941284, "learning_rate": 1e-05, "loss": 0.571, "mean_token_accuracy": 0.8268980979919434, "num_tokens": 144426788.0, "step": 906 }, { "epoch": 0.46134282807731436, "grad_norm": 1.128784418106079, "learning_rate": 1e-05, "loss": 0.5252, "mean_token_accuracy": 0.8391796946525574, "num_tokens": 144588094.0, "step": 907 }, { "epoch": 0.46185147507629704, "grad_norm": 1.4454443454742432, "learning_rate": 1e-05, "loss": 0.5486, "mean_token_accuracy": 0.8304920196533203, "num_tokens": 144742302.0, "step": 908 }, { "epoch": 0.46236012207527977, "grad_norm": 1.1578365564346313, "learning_rate": 1e-05, "loss": 0.5661, "mean_token_accuracy": 0.8252307176589966, "num_tokens": 144889815.0, "step": 909 }, { "epoch": 0.46286876907426244, "grad_norm": 1.193178653717041, "learning_rate": 1e-05, "loss": 0.5428, "mean_token_accuracy": 0.8328741788864136, "num_tokens": 145041480.0, "step": 910 }, { "epoch": 0.4633774160732452, "grad_norm": 1.1511712074279785, "learning_rate": 1e-05, "loss": 0.532, "mean_token_accuracy": 0.8380119800567627, "num_tokens": 145207314.0, "step": 911 }, { "epoch": 0.46388606307222785, "grad_norm": 1.1427394151687622, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.8285709023475647, "num_tokens": 145365776.0, "step": 912 }, { "epoch": 0.4643947100712106, "grad_norm": 1.1380131244659424, "learning_rate": 1e-05, "loss": 0.5531, "mean_token_accuracy": 0.8311206102371216, "num_tokens": 145530569.0, "step": 913 }, { "epoch": 0.4649033570701933, "grad_norm": 1.1422157287597656, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8367600440979004, "num_tokens": 145686829.0, "step": 914 }, { "epoch": 0.465412004069176, "grad_norm": 1.2226670980453491, "learning_rate": 1e-05, "loss": 0.5495, "mean_token_accuracy": 0.82952880859375, "num_tokens": 145853471.0, "step": 915 }, { "epoch": 0.4659206510681587, "grad_norm": 1.1117241382598877, "learning_rate": 1e-05, "loss": 0.5677, "mean_token_accuracy": 0.8247131109237671, "num_tokens": 146031134.0, "step": 916 }, { "epoch": 0.4664292980671414, "grad_norm": 1.225894570350647, "learning_rate": 1e-05, "loss": 0.5986, "mean_token_accuracy": 0.8198223114013672, "num_tokens": 146186384.0, "step": 917 }, { "epoch": 0.4669379450661241, "grad_norm": 1.0776770114898682, "learning_rate": 1e-05, "loss": 0.5679, "mean_token_accuracy": 0.8272132277488708, "num_tokens": 146342004.0, "step": 918 }, { "epoch": 0.4674465920651068, "grad_norm": 1.0807486772537231, "learning_rate": 1e-05, "loss": 0.5441, "mean_token_accuracy": 0.8315936923027039, "num_tokens": 146496365.0, "step": 919 }, { "epoch": 0.46795523906408953, "grad_norm": 1.1356221437454224, "learning_rate": 1e-05, "loss": 0.5655, "mean_token_accuracy": 0.8282874822616577, "num_tokens": 146660600.0, "step": 920 }, { "epoch": 0.4684638860630722, "grad_norm": 1.1465460062026978, "learning_rate": 1e-05, "loss": 0.6184, "mean_token_accuracy": 0.8121355772018433, "num_tokens": 146823441.0, "step": 921 }, { "epoch": 0.46897253306205494, "grad_norm": 1.2211737632751465, "learning_rate": 1e-05, "loss": 0.5871, "mean_token_accuracy": 0.8233176469802856, "num_tokens": 146980645.0, "step": 922 }, { "epoch": 0.4694811800610376, "grad_norm": 1.1721731424331665, "learning_rate": 1e-05, "loss": 0.58, "mean_token_accuracy": 0.8220207095146179, "num_tokens": 147140337.0, "step": 923 }, { "epoch": 0.46998982706002035, "grad_norm": 1.1544034481048584, "learning_rate": 1e-05, "loss": 0.5585, "mean_token_accuracy": 0.8287423849105835, "num_tokens": 147298752.0, "step": 924 }, { "epoch": 0.470498474059003, "grad_norm": 1.0708504915237427, "learning_rate": 1e-05, "loss": 0.5061, "mean_token_accuracy": 0.8421027660369873, "num_tokens": 147452293.0, "step": 925 }, { "epoch": 0.47100712105798576, "grad_norm": 1.4411351680755615, "learning_rate": 1e-05, "loss": 0.5483, "mean_token_accuracy": 0.8321117162704468, "num_tokens": 147606035.0, "step": 926 }, { "epoch": 0.4715157680569685, "grad_norm": 1.2126202583312988, "learning_rate": 1e-05, "loss": 0.52, "mean_token_accuracy": 0.8392668962478638, "num_tokens": 147740900.0, "step": 927 }, { "epoch": 0.47202441505595116, "grad_norm": 1.2801289558410645, "learning_rate": 1e-05, "loss": 0.5198, "mean_token_accuracy": 0.8385640978813171, "num_tokens": 147900796.0, "step": 928 }, { "epoch": 0.4725330620549339, "grad_norm": 1.2138499021530151, "learning_rate": 1e-05, "loss": 0.5677, "mean_token_accuracy": 0.8254543542861938, "num_tokens": 148048395.0, "step": 929 }, { "epoch": 0.47304170905391657, "grad_norm": 1.1250996589660645, "learning_rate": 1e-05, "loss": 0.4999, "mean_token_accuracy": 0.8460116386413574, "num_tokens": 148213700.0, "step": 930 }, { "epoch": 0.4735503560528993, "grad_norm": 1.2026902437210083, "learning_rate": 1e-05, "loss": 0.5484, "mean_token_accuracy": 0.8299406170845032, "num_tokens": 148360212.0, "step": 931 }, { "epoch": 0.474059003051882, "grad_norm": 1.1241765022277832, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.8328520655632019, "num_tokens": 148519502.0, "step": 932 }, { "epoch": 0.4745676500508647, "grad_norm": 1.1928129196166992, "learning_rate": 1e-05, "loss": 0.5709, "mean_token_accuracy": 0.8253394365310669, "num_tokens": 148685441.0, "step": 933 }, { "epoch": 0.4750762970498474, "grad_norm": 1.168749451637268, "learning_rate": 1e-05, "loss": 0.5677, "mean_token_accuracy": 0.8280705809593201, "num_tokens": 148837050.0, "step": 934 }, { "epoch": 0.4755849440488301, "grad_norm": 1.1415495872497559, "learning_rate": 1e-05, "loss": 0.5312, "mean_token_accuracy": 0.8370417356491089, "num_tokens": 149001259.0, "step": 935 }, { "epoch": 0.4760935910478128, "grad_norm": 1.066994309425354, "learning_rate": 1e-05, "loss": 0.5791, "mean_token_accuracy": 0.8250643610954285, "num_tokens": 149158085.0, "step": 936 }, { "epoch": 0.4766022380467955, "grad_norm": 1.1611919403076172, "learning_rate": 1e-05, "loss": 0.5181, "mean_token_accuracy": 0.8408796787261963, "num_tokens": 149306579.0, "step": 937 }, { "epoch": 0.47711088504577825, "grad_norm": 1.1194415092468262, "learning_rate": 1e-05, "loss": 0.604, "mean_token_accuracy": 0.8182238340377808, "num_tokens": 149470715.0, "step": 938 }, { "epoch": 0.47761953204476093, "grad_norm": 1.149794578552246, "learning_rate": 1e-05, "loss": 0.5573, "mean_token_accuracy": 0.8303046226501465, "num_tokens": 149626418.0, "step": 939 }, { "epoch": 0.47812817904374366, "grad_norm": 1.0877587795257568, "learning_rate": 1e-05, "loss": 0.5724, "mean_token_accuracy": 0.8244882822036743, "num_tokens": 149779243.0, "step": 940 }, { "epoch": 0.47863682604272634, "grad_norm": 1.0702544450759888, "learning_rate": 1e-05, "loss": 0.5454, "mean_token_accuracy": 0.832623302936554, "num_tokens": 149928437.0, "step": 941 }, { "epoch": 0.47914547304170907, "grad_norm": 1.1327913999557495, "learning_rate": 1e-05, "loss": 0.6013, "mean_token_accuracy": 0.8169209957122803, "num_tokens": 150086890.0, "step": 942 }, { "epoch": 0.47965412004069174, "grad_norm": 0.9984365105628967, "learning_rate": 1e-05, "loss": 0.5139, "mean_token_accuracy": 0.8412238359451294, "num_tokens": 150246489.0, "step": 943 }, { "epoch": 0.4801627670396745, "grad_norm": 0.9913185834884644, "learning_rate": 1e-05, "loss": 0.5422, "mean_token_accuracy": 0.8335740566253662, "num_tokens": 150414091.0, "step": 944 }, { "epoch": 0.48067141403865715, "grad_norm": 1.057745337486267, "learning_rate": 1e-05, "loss": 0.5625, "mean_token_accuracy": 0.828140139579773, "num_tokens": 150581538.0, "step": 945 }, { "epoch": 0.4811800610376399, "grad_norm": 1.0825947523117065, "learning_rate": 1e-05, "loss": 0.5693, "mean_token_accuracy": 0.8258931636810303, "num_tokens": 150749759.0, "step": 946 }, { "epoch": 0.48168870803662256, "grad_norm": 1.0107381343841553, "learning_rate": 1e-05, "loss": 0.5458, "mean_token_accuracy": 0.8325010538101196, "num_tokens": 150911352.0, "step": 947 }, { "epoch": 0.4821973550356053, "grad_norm": 1.1812866926193237, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.8303778171539307, "num_tokens": 151078517.0, "step": 948 }, { "epoch": 0.482706002034588, "grad_norm": 1.0815045833587646, "learning_rate": 1e-05, "loss": 0.5745, "mean_token_accuracy": 0.8253252506256104, "num_tokens": 151244643.0, "step": 949 }, { "epoch": 0.4832146490335707, "grad_norm": 1.175293207168579, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.8352696895599365, "num_tokens": 151404760.0, "step": 950 }, { "epoch": 0.48372329603255343, "grad_norm": 1.2163329124450684, "learning_rate": 1e-05, "loss": 0.5421, "mean_token_accuracy": 0.8340187072753906, "num_tokens": 151555236.0, "step": 951 }, { "epoch": 0.4842319430315361, "grad_norm": 1.1777931451797485, "learning_rate": 1e-05, "loss": 0.5089, "mean_token_accuracy": 0.841942310333252, "num_tokens": 151714374.0, "step": 952 }, { "epoch": 0.48474059003051884, "grad_norm": 1.1555335521697998, "learning_rate": 1e-05, "loss": 0.5672, "mean_token_accuracy": 0.8266357183456421, "num_tokens": 151873694.0, "step": 953 }, { "epoch": 0.4852492370295015, "grad_norm": 1.3585796356201172, "learning_rate": 1e-05, "loss": 0.5114, "mean_token_accuracy": 0.8436131477355957, "num_tokens": 152034628.0, "step": 954 }, { "epoch": 0.48575788402848424, "grad_norm": 1.2470099925994873, "learning_rate": 1e-05, "loss": 0.5767, "mean_token_accuracy": 0.8253637552261353, "num_tokens": 152182009.0, "step": 955 }, { "epoch": 0.4862665310274669, "grad_norm": 1.1471227407455444, "learning_rate": 1e-05, "loss": 0.5231, "mean_token_accuracy": 0.8380463719367981, "num_tokens": 152336970.0, "step": 956 }, { "epoch": 0.48677517802644965, "grad_norm": 1.0701148509979248, "learning_rate": 1e-05, "loss": 0.567, "mean_token_accuracy": 0.8259918689727783, "num_tokens": 152512035.0, "step": 957 }, { "epoch": 0.4872838250254323, "grad_norm": 1.0370410680770874, "learning_rate": 1e-05, "loss": 0.5557, "mean_token_accuracy": 0.8298166990280151, "num_tokens": 152684663.0, "step": 958 }, { "epoch": 0.48779247202441506, "grad_norm": 1.0619418621063232, "learning_rate": 1e-05, "loss": 0.5838, "mean_token_accuracy": 0.8224383592605591, "num_tokens": 152850154.0, "step": 959 }, { "epoch": 0.4883011190233978, "grad_norm": 1.0469465255737305, "learning_rate": 1e-05, "loss": 0.5535, "mean_token_accuracy": 0.8312027454376221, "num_tokens": 153010247.0, "step": 960 }, { "epoch": 0.48880976602238047, "grad_norm": 1.0778266191482544, "learning_rate": 1e-05, "loss": 0.5189, "mean_token_accuracy": 0.8391969203948975, "num_tokens": 153174299.0, "step": 961 }, { "epoch": 0.4893184130213632, "grad_norm": 2.199824810028076, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.835580050945282, "num_tokens": 153337306.0, "step": 962 }, { "epoch": 0.4898270600203459, "grad_norm": 1.198591947555542, "learning_rate": 1e-05, "loss": 0.562, "mean_token_accuracy": 0.8278141617774963, "num_tokens": 153492940.0, "step": 963 }, { "epoch": 0.4903357070193286, "grad_norm": 1.6478595733642578, "learning_rate": 1e-05, "loss": 0.5354, "mean_token_accuracy": 0.8336241245269775, "num_tokens": 153652759.0, "step": 964 }, { "epoch": 0.4908443540183113, "grad_norm": 1.2517733573913574, "learning_rate": 1e-05, "loss": 0.5255, "mean_token_accuracy": 0.837712287902832, "num_tokens": 153818682.0, "step": 965 }, { "epoch": 0.491353001017294, "grad_norm": 1.17851984500885, "learning_rate": 1e-05, "loss": 0.538, "mean_token_accuracy": 0.8355500102043152, "num_tokens": 153982904.0, "step": 966 }, { "epoch": 0.4918616480162767, "grad_norm": 1.2655420303344727, "learning_rate": 1e-05, "loss": 0.5469, "mean_token_accuracy": 0.8304451704025269, "num_tokens": 154142701.0, "step": 967 }, { "epoch": 0.4923702950152594, "grad_norm": 1.3124351501464844, "learning_rate": 1e-05, "loss": 0.5952, "mean_token_accuracy": 0.8196591138839722, "num_tokens": 154318454.0, "step": 968 }, { "epoch": 0.4928789420142421, "grad_norm": 1.16322922706604, "learning_rate": 1e-05, "loss": 0.5998, "mean_token_accuracy": 0.8195618987083435, "num_tokens": 154485998.0, "step": 969 }, { "epoch": 0.4933875890132248, "grad_norm": 1.2704930305480957, "learning_rate": 1e-05, "loss": 0.5492, "mean_token_accuracy": 0.8322117328643799, "num_tokens": 154646655.0, "step": 970 }, { "epoch": 0.4938962360122075, "grad_norm": 1.1108856201171875, "learning_rate": 1e-05, "loss": 0.5453, "mean_token_accuracy": 0.8348992466926575, "num_tokens": 154807349.0, "step": 971 }, { "epoch": 0.49440488301119023, "grad_norm": 1.1756478548049927, "learning_rate": 1e-05, "loss": 0.5526, "mean_token_accuracy": 0.829748272895813, "num_tokens": 154961491.0, "step": 972 }, { "epoch": 0.49491353001017296, "grad_norm": 1.2276084423065186, "learning_rate": 1e-05, "loss": 0.5433, "mean_token_accuracy": 0.8331449031829834, "num_tokens": 155114869.0, "step": 973 }, { "epoch": 0.49542217700915564, "grad_norm": 1.1128617525100708, "learning_rate": 1e-05, "loss": 0.5419, "mean_token_accuracy": 0.8313051462173462, "num_tokens": 155273562.0, "step": 974 }, { "epoch": 0.49593082400813837, "grad_norm": 1.2433724403381348, "learning_rate": 1e-05, "loss": 0.483, "mean_token_accuracy": 0.84830641746521, "num_tokens": 155431483.0, "step": 975 }, { "epoch": 0.49643947100712105, "grad_norm": 1.2141236066818237, "learning_rate": 1e-05, "loss": 0.5588, "mean_token_accuracy": 0.8308509588241577, "num_tokens": 155577059.0, "step": 976 }, { "epoch": 0.4969481180061038, "grad_norm": 1.0577449798583984, "learning_rate": 1e-05, "loss": 0.5757, "mean_token_accuracy": 0.8266185522079468, "num_tokens": 155734946.0, "step": 977 }, { "epoch": 0.49745676500508645, "grad_norm": 1.3903453350067139, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.8309212923049927, "num_tokens": 155896969.0, "step": 978 }, { "epoch": 0.4979654120040692, "grad_norm": 1.0999048948287964, "learning_rate": 1e-05, "loss": 0.5681, "mean_token_accuracy": 0.827406644821167, "num_tokens": 156060582.0, "step": 979 }, { "epoch": 0.49847405900305186, "grad_norm": 1.138027310371399, "learning_rate": 1e-05, "loss": 0.5366, "mean_token_accuracy": 0.8354133367538452, "num_tokens": 156221661.0, "step": 980 }, { "epoch": 0.4989827060020346, "grad_norm": 1.1465777158737183, "learning_rate": 1e-05, "loss": 0.5373, "mean_token_accuracy": 0.8339407444000244, "num_tokens": 156378238.0, "step": 981 }, { "epoch": 0.49949135300101727, "grad_norm": 1.1058530807495117, "learning_rate": 1e-05, "loss": 0.5856, "mean_token_accuracy": 0.820149838924408, "num_tokens": 156538559.0, "step": 982 }, { "epoch": 0.5, "grad_norm": 1.1324735879898071, "learning_rate": 1e-05, "loss": 0.5597, "mean_token_accuracy": 0.8275347948074341, "num_tokens": 156700036.0, "step": 983 }, { "epoch": 0.5005086469989827, "grad_norm": 1.0057607889175415, "learning_rate": 1e-05, "loss": 0.5674, "mean_token_accuracy": 0.8275812864303589, "num_tokens": 156879118.0, "step": 984 }, { "epoch": 0.5010172939979655, "grad_norm": 1.0707874298095703, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.8437913656234741, "num_tokens": 157043565.0, "step": 985 }, { "epoch": 0.5015259409969481, "grad_norm": 1.1633177995681763, "learning_rate": 1e-05, "loss": 0.5465, "mean_token_accuracy": 0.8311898708343506, "num_tokens": 157198178.0, "step": 986 }, { "epoch": 0.5020345879959308, "grad_norm": 1.0955051183700562, "learning_rate": 1e-05, "loss": 0.5503, "mean_token_accuracy": 0.8318169116973877, "num_tokens": 157349621.0, "step": 987 }, { "epoch": 0.5025432349949135, "grad_norm": 1.1708135604858398, "learning_rate": 1e-05, "loss": 0.5367, "mean_token_accuracy": 0.835382878780365, "num_tokens": 157519093.0, "step": 988 }, { "epoch": 0.5030518819938963, "grad_norm": 1.075040340423584, "learning_rate": 1e-05, "loss": 0.5385, "mean_token_accuracy": 0.8336274027824402, "num_tokens": 157671835.0, "step": 989 }, { "epoch": 0.503560528992879, "grad_norm": 1.1236952543258667, "learning_rate": 1e-05, "loss": 0.5372, "mean_token_accuracy": 0.8344086408615112, "num_tokens": 157826508.0, "step": 990 }, { "epoch": 0.5040691759918616, "grad_norm": 1.1851485967636108, "learning_rate": 1e-05, "loss": 0.5642, "mean_token_accuracy": 0.8271092772483826, "num_tokens": 157983854.0, "step": 991 }, { "epoch": 0.5045778229908443, "grad_norm": 1.127675175666809, "learning_rate": 1e-05, "loss": 0.5536, "mean_token_accuracy": 0.8302778601646423, "num_tokens": 158153856.0, "step": 992 }, { "epoch": 0.5050864699898271, "grad_norm": 1.0169093608856201, "learning_rate": 1e-05, "loss": 0.5267, "mean_token_accuracy": 0.8384323120117188, "num_tokens": 158311723.0, "step": 993 }, { "epoch": 0.5055951169888098, "grad_norm": 1.11962890625, "learning_rate": 1e-05, "loss": 0.5822, "mean_token_accuracy": 0.825655460357666, "num_tokens": 158468860.0, "step": 994 }, { "epoch": 0.5061037639877924, "grad_norm": 1.0792301893234253, "learning_rate": 1e-05, "loss": 0.5576, "mean_token_accuracy": 0.8288288712501526, "num_tokens": 158623811.0, "step": 995 }, { "epoch": 0.5066124109867752, "grad_norm": 1.1109192371368408, "learning_rate": 1e-05, "loss": 0.5424, "mean_token_accuracy": 0.8330716490745544, "num_tokens": 158789067.0, "step": 996 }, { "epoch": 0.5071210579857579, "grad_norm": 1.1863211393356323, "learning_rate": 1e-05, "loss": 0.5679, "mean_token_accuracy": 0.8257003426551819, "num_tokens": 158950421.0, "step": 997 }, { "epoch": 0.5076297049847406, "grad_norm": 1.1955516338348389, "learning_rate": 1e-05, "loss": 0.5381, "mean_token_accuracy": 0.8345174789428711, "num_tokens": 159115006.0, "step": 998 }, { "epoch": 0.5081383519837233, "grad_norm": 1.2774521112442017, "learning_rate": 1e-05, "loss": 0.5341, "mean_token_accuracy": 0.8339723348617554, "num_tokens": 159267147.0, "step": 999 }, { "epoch": 0.508646998982706, "grad_norm": 1.023662805557251, "learning_rate": 1e-05, "loss": 0.5263, "mean_token_accuracy": 0.8373016119003296, "num_tokens": 159430845.0, "step": 1000 }, { "epoch": 0.5091556459816887, "grad_norm": 1.1863523721694946, "learning_rate": 1e-05, "loss": 0.5531, "mean_token_accuracy": 0.8315171003341675, "num_tokens": 159592673.0, "step": 1001 }, { "epoch": 0.5096642929806714, "grad_norm": 1.152750849723816, "learning_rate": 1e-05, "loss": 0.5563, "mean_token_accuracy": 0.8282335996627808, "num_tokens": 159766028.0, "step": 1002 }, { "epoch": 0.5101729399796541, "grad_norm": 1.1441240310668945, "learning_rate": 1e-05, "loss": 0.5446, "mean_token_accuracy": 0.8327241539955139, "num_tokens": 159928990.0, "step": 1003 }, { "epoch": 0.5106815869786369, "grad_norm": 1.1187294721603394, "learning_rate": 1e-05, "loss": 0.5088, "mean_token_accuracy": 0.8430362939834595, "num_tokens": 160093084.0, "step": 1004 }, { "epoch": 0.5111902339776195, "grad_norm": 1.1033687591552734, "learning_rate": 1e-05, "loss": 0.5685, "mean_token_accuracy": 0.8255316019058228, "num_tokens": 160253282.0, "step": 1005 }, { "epoch": 0.5116988809766022, "grad_norm": 1.1011110544204712, "learning_rate": 1e-05, "loss": 0.5266, "mean_token_accuracy": 0.839083731174469, "num_tokens": 160414551.0, "step": 1006 }, { "epoch": 0.512207527975585, "grad_norm": 1.2247616052627563, "learning_rate": 1e-05, "loss": 0.5724, "mean_token_accuracy": 0.8251423239707947, "num_tokens": 160566903.0, "step": 1007 }, { "epoch": 0.5127161749745677, "grad_norm": 0.9921314716339111, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8400283455848694, "num_tokens": 160732492.0, "step": 1008 }, { "epoch": 0.5132248219735503, "grad_norm": 1.0750997066497803, "learning_rate": 1e-05, "loss": 0.5712, "mean_token_accuracy": 0.825303852558136, "num_tokens": 160901405.0, "step": 1009 }, { "epoch": 0.513733468972533, "grad_norm": 1.1010903120040894, "learning_rate": 1e-05, "loss": 0.5304, "mean_token_accuracy": 0.8378999829292297, "num_tokens": 161063180.0, "step": 1010 }, { "epoch": 0.5142421159715158, "grad_norm": 1.110854983329773, "learning_rate": 1e-05, "loss": 0.5633, "mean_token_accuracy": 0.8287266492843628, "num_tokens": 161224667.0, "step": 1011 }, { "epoch": 0.5147507629704985, "grad_norm": 1.1269981861114502, "learning_rate": 1e-05, "loss": 0.5547, "mean_token_accuracy": 0.8304728865623474, "num_tokens": 161375840.0, "step": 1012 }, { "epoch": 0.5152594099694812, "grad_norm": 1.1674067974090576, "learning_rate": 1e-05, "loss": 0.5615, "mean_token_accuracy": 0.8291001319885254, "num_tokens": 161539381.0, "step": 1013 }, { "epoch": 0.5157680569684638, "grad_norm": 1.091300129890442, "learning_rate": 1e-05, "loss": 0.5591, "mean_token_accuracy": 0.8284760117530823, "num_tokens": 161712451.0, "step": 1014 }, { "epoch": 0.5162767039674466, "grad_norm": 1.140444278717041, "learning_rate": 1e-05, "loss": 0.559, "mean_token_accuracy": 0.8282883167266846, "num_tokens": 161891850.0, "step": 1015 }, { "epoch": 0.5167853509664293, "grad_norm": 1.1514443159103394, "learning_rate": 1e-05, "loss": 0.5263, "mean_token_accuracy": 0.8369109034538269, "num_tokens": 162053742.0, "step": 1016 }, { "epoch": 0.517293997965412, "grad_norm": 1.0525281429290771, "learning_rate": 1e-05, "loss": 0.5303, "mean_token_accuracy": 0.8366672992706299, "num_tokens": 162209650.0, "step": 1017 }, { "epoch": 0.5178026449643948, "grad_norm": 1.1147992610931396, "learning_rate": 1e-05, "loss": 0.58, "mean_token_accuracy": 0.8237215280532837, "num_tokens": 162383502.0, "step": 1018 }, { "epoch": 0.5183112919633774, "grad_norm": 1.1428780555725098, "learning_rate": 1e-05, "loss": 0.5154, "mean_token_accuracy": 0.8408278822898865, "num_tokens": 162544656.0, "step": 1019 }, { "epoch": 0.5188199389623601, "grad_norm": 1.012333631515503, "learning_rate": 1e-05, "loss": 0.556, "mean_token_accuracy": 0.8279523849487305, "num_tokens": 162721800.0, "step": 1020 }, { "epoch": 0.5193285859613428, "grad_norm": 1.2154921293258667, "learning_rate": 1e-05, "loss": 0.5456, "mean_token_accuracy": 0.8320083618164062, "num_tokens": 162876310.0, "step": 1021 }, { "epoch": 0.5198372329603256, "grad_norm": 1.0225021839141846, "learning_rate": 1e-05, "loss": 0.5454, "mean_token_accuracy": 0.8321446776390076, "num_tokens": 163040860.0, "step": 1022 }, { "epoch": 0.5203458799593083, "grad_norm": 1.1398831605911255, "learning_rate": 1e-05, "loss": 0.5557, "mean_token_accuracy": 0.8298647999763489, "num_tokens": 163207134.0, "step": 1023 }, { "epoch": 0.5208545269582909, "grad_norm": 1.1399364471435547, "learning_rate": 1e-05, "loss": 0.5316, "mean_token_accuracy": 0.8351213335990906, "num_tokens": 163354092.0, "step": 1024 }, { "epoch": 0.5213631739572736, "grad_norm": 1.1851685047149658, "learning_rate": 1e-05, "loss": 0.5525, "mean_token_accuracy": 0.8305678367614746, "num_tokens": 163506047.0, "step": 1025 }, { "epoch": 0.5218718209562564, "grad_norm": 1.1319615840911865, "learning_rate": 1e-05, "loss": 0.5456, "mean_token_accuracy": 0.832294762134552, "num_tokens": 163677302.0, "step": 1026 }, { "epoch": 0.5223804679552391, "grad_norm": 1.1624354124069214, "learning_rate": 1e-05, "loss": 0.5315, "mean_token_accuracy": 0.8342126607894897, "num_tokens": 163838092.0, "step": 1027 }, { "epoch": 0.5228891149542217, "grad_norm": 1.1127524375915527, "learning_rate": 1e-05, "loss": 0.5237, "mean_token_accuracy": 0.8383346796035767, "num_tokens": 163993198.0, "step": 1028 }, { "epoch": 0.5233977619532044, "grad_norm": 1.3149102926254272, "learning_rate": 1e-05, "loss": 0.5819, "mean_token_accuracy": 0.8223558068275452, "num_tokens": 164159543.0, "step": 1029 }, { "epoch": 0.5239064089521872, "grad_norm": 1.1892825365066528, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.8346065282821655, "num_tokens": 164325603.0, "step": 1030 }, { "epoch": 0.5244150559511699, "grad_norm": 1.3377810716629028, "learning_rate": 1e-05, "loss": 0.5318, "mean_token_accuracy": 0.8361243009567261, "num_tokens": 164481267.0, "step": 1031 }, { "epoch": 0.5249237029501526, "grad_norm": 1.2706295251846313, "learning_rate": 1e-05, "loss": 0.5183, "mean_token_accuracy": 0.8390676975250244, "num_tokens": 164634955.0, "step": 1032 }, { "epoch": 0.5254323499491353, "grad_norm": 1.0936428308486938, "learning_rate": 1e-05, "loss": 0.5247, "mean_token_accuracy": 0.8365492820739746, "num_tokens": 164788873.0, "step": 1033 }, { "epoch": 0.525940996948118, "grad_norm": 1.2996271848678589, "learning_rate": 1e-05, "loss": 0.5261, "mean_token_accuracy": 0.8357228636741638, "num_tokens": 164949736.0, "step": 1034 }, { "epoch": 0.5264496439471007, "grad_norm": 1.0801299810409546, "learning_rate": 1e-05, "loss": 0.5636, "mean_token_accuracy": 0.826639711856842, "num_tokens": 165111991.0, "step": 1035 }, { "epoch": 0.5269582909460834, "grad_norm": 1.163053274154663, "learning_rate": 1e-05, "loss": 0.5301, "mean_token_accuracy": 0.837220311164856, "num_tokens": 165268681.0, "step": 1036 }, { "epoch": 0.5274669379450662, "grad_norm": 1.0252866744995117, "learning_rate": 1e-05, "loss": 0.514, "mean_token_accuracy": 0.841158390045166, "num_tokens": 165439160.0, "step": 1037 }, { "epoch": 0.5279755849440488, "grad_norm": 1.0195773839950562, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8401865363121033, "num_tokens": 165611395.0, "step": 1038 }, { "epoch": 0.5284842319430315, "grad_norm": 1.4410414695739746, "learning_rate": 1e-05, "loss": 0.5624, "mean_token_accuracy": 0.8270487189292908, "num_tokens": 165770065.0, "step": 1039 }, { "epoch": 0.5289928789420142, "grad_norm": 1.1248342990875244, "learning_rate": 1e-05, "loss": 0.5825, "mean_token_accuracy": 0.8211767673492432, "num_tokens": 165930541.0, "step": 1040 }, { "epoch": 0.529501525940997, "grad_norm": 1.0635501146316528, "learning_rate": 1e-05, "loss": 0.5391, "mean_token_accuracy": 0.8336089849472046, "num_tokens": 166095798.0, "step": 1041 }, { "epoch": 0.5300101729399797, "grad_norm": 1.1478872299194336, "learning_rate": 1e-05, "loss": 0.5411, "mean_token_accuracy": 0.8351947665214539, "num_tokens": 166253912.0, "step": 1042 }, { "epoch": 0.5305188199389623, "grad_norm": 1.0534822940826416, "learning_rate": 1e-05, "loss": 0.5214, "mean_token_accuracy": 0.8404939770698547, "num_tokens": 166414865.0, "step": 1043 }, { "epoch": 0.5310274669379451, "grad_norm": 1.2754970788955688, "learning_rate": 1e-05, "loss": 0.5586, "mean_token_accuracy": 0.8279399871826172, "num_tokens": 166570026.0, "step": 1044 }, { "epoch": 0.5315361139369278, "grad_norm": 1.091422438621521, "learning_rate": 1e-05, "loss": 0.5383, "mean_token_accuracy": 0.8338953852653503, "num_tokens": 166723092.0, "step": 1045 }, { "epoch": 0.5320447609359105, "grad_norm": 1.176932454109192, "learning_rate": 1e-05, "loss": 0.5556, "mean_token_accuracy": 0.8305802345275879, "num_tokens": 166891414.0, "step": 1046 }, { "epoch": 0.5325534079348931, "grad_norm": 1.0924545526504517, "learning_rate": 1e-05, "loss": 0.5757, "mean_token_accuracy": 0.8248872756958008, "num_tokens": 167043099.0, "step": 1047 }, { "epoch": 0.5330620549338759, "grad_norm": 1.1945780515670776, "learning_rate": 1e-05, "loss": 0.5437, "mean_token_accuracy": 0.833451509475708, "num_tokens": 167199448.0, "step": 1048 }, { "epoch": 0.5335707019328586, "grad_norm": 1.0573724508285522, "learning_rate": 1e-05, "loss": 0.5551, "mean_token_accuracy": 0.8307552337646484, "num_tokens": 167356786.0, "step": 1049 }, { "epoch": 0.5340793489318413, "grad_norm": 1.07781982421875, "learning_rate": 1e-05, "loss": 0.5367, "mean_token_accuracy": 0.8337694406509399, "num_tokens": 167508836.0, "step": 1050 }, { "epoch": 0.534587995930824, "grad_norm": 1.102200984954834, "learning_rate": 1e-05, "loss": 0.5294, "mean_token_accuracy": 0.836653470993042, "num_tokens": 167670690.0, "step": 1051 }, { "epoch": 0.5350966429298067, "grad_norm": 1.1723092794418335, "learning_rate": 1e-05, "loss": 0.5686, "mean_token_accuracy": 0.8260093331336975, "num_tokens": 167821540.0, "step": 1052 }, { "epoch": 0.5356052899287894, "grad_norm": 1.1371201276779175, "learning_rate": 1e-05, "loss": 0.5225, "mean_token_accuracy": 0.8382167220115662, "num_tokens": 167998236.0, "step": 1053 }, { "epoch": 0.5361139369277721, "grad_norm": 1.0949405431747437, "learning_rate": 1e-05, "loss": 0.5429, "mean_token_accuracy": 0.8328778743743896, "num_tokens": 168163810.0, "step": 1054 }, { "epoch": 0.5366225839267549, "grad_norm": 1.0404272079467773, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8400763273239136, "num_tokens": 168312526.0, "step": 1055 }, { "epoch": 0.5371312309257376, "grad_norm": 1.4301345348358154, "learning_rate": 1e-05, "loss": 0.5279, "mean_token_accuracy": 0.8389478921890259, "num_tokens": 168465144.0, "step": 1056 }, { "epoch": 0.5376398779247202, "grad_norm": 1.148573875427246, "learning_rate": 1e-05, "loss": 0.5309, "mean_token_accuracy": 0.8361720442771912, "num_tokens": 168621602.0, "step": 1057 }, { "epoch": 0.5381485249237029, "grad_norm": 1.119145393371582, "learning_rate": 1e-05, "loss": 0.5687, "mean_token_accuracy": 0.8250982165336609, "num_tokens": 168772296.0, "step": 1058 }, { "epoch": 0.5386571719226857, "grad_norm": 1.237609624862671, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.835430383682251, "num_tokens": 168930909.0, "step": 1059 }, { "epoch": 0.5391658189216684, "grad_norm": 1.0655325651168823, "learning_rate": 1e-05, "loss": 0.5587, "mean_token_accuracy": 0.828170895576477, "num_tokens": 169084941.0, "step": 1060 }, { "epoch": 0.539674465920651, "grad_norm": 1.2636488676071167, "learning_rate": 1e-05, "loss": 0.5254, "mean_token_accuracy": 0.8372913599014282, "num_tokens": 169235523.0, "step": 1061 }, { "epoch": 0.5401831129196337, "grad_norm": 1.2027055025100708, "learning_rate": 1e-05, "loss": 0.526, "mean_token_accuracy": 0.8375190496444702, "num_tokens": 169398931.0, "step": 1062 }, { "epoch": 0.5406917599186165, "grad_norm": 1.132272720336914, "learning_rate": 1e-05, "loss": 0.5217, "mean_token_accuracy": 0.8366562724113464, "num_tokens": 169559733.0, "step": 1063 }, { "epoch": 0.5412004069175992, "grad_norm": 1.028488039970398, "learning_rate": 1e-05, "loss": 0.5385, "mean_token_accuracy": 0.8357600569725037, "num_tokens": 169713890.0, "step": 1064 }, { "epoch": 0.5417090539165819, "grad_norm": 1.1254281997680664, "learning_rate": 1e-05, "loss": 0.5288, "mean_token_accuracy": 0.8391003608703613, "num_tokens": 169872319.0, "step": 1065 }, { "epoch": 0.5422177009155646, "grad_norm": 1.1480300426483154, "learning_rate": 1e-05, "loss": 0.5769, "mean_token_accuracy": 0.8241935968399048, "num_tokens": 170042021.0, "step": 1066 }, { "epoch": 0.5427263479145473, "grad_norm": 1.2215930223464966, "learning_rate": 1e-05, "loss": 0.5465, "mean_token_accuracy": 0.8304744362831116, "num_tokens": 170190297.0, "step": 1067 }, { "epoch": 0.54323499491353, "grad_norm": 1.1113042831420898, "learning_rate": 1e-05, "loss": 0.5116, "mean_token_accuracy": 0.8419883251190186, "num_tokens": 170344681.0, "step": 1068 }, { "epoch": 0.5437436419125127, "grad_norm": 1.1927094459533691, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.8285712003707886, "num_tokens": 170503123.0, "step": 1069 }, { "epoch": 0.5442522889114955, "grad_norm": 1.1104354858398438, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8371394872665405, "num_tokens": 170662002.0, "step": 1070 }, { "epoch": 0.5447609359104781, "grad_norm": 1.0148671865463257, "learning_rate": 1e-05, "loss": 0.5888, "mean_token_accuracy": 0.8197829723358154, "num_tokens": 170819743.0, "step": 1071 }, { "epoch": 0.5452695829094608, "grad_norm": 1.0770317316055298, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8423032760620117, "num_tokens": 170965381.0, "step": 1072 }, { "epoch": 0.5457782299084435, "grad_norm": 1.5431606769561768, "learning_rate": 1e-05, "loss": 0.5538, "mean_token_accuracy": 0.8318929076194763, "num_tokens": 171118434.0, "step": 1073 }, { "epoch": 0.5462868769074263, "grad_norm": 1.0981653928756714, "learning_rate": 1e-05, "loss": 0.5933, "mean_token_accuracy": 0.8221914768218994, "num_tokens": 171297534.0, "step": 1074 }, { "epoch": 0.546795523906409, "grad_norm": 1.0307037830352783, "learning_rate": 1e-05, "loss": 0.527, "mean_token_accuracy": 0.8369515538215637, "num_tokens": 171454468.0, "step": 1075 }, { "epoch": 0.5473041709053916, "grad_norm": 1.146727442741394, "learning_rate": 1e-05, "loss": 0.558, "mean_token_accuracy": 0.8274430632591248, "num_tokens": 171615049.0, "step": 1076 }, { "epoch": 0.5478128179043744, "grad_norm": 1.060461401939392, "learning_rate": 1e-05, "loss": 0.5265, "mean_token_accuracy": 0.8362975120544434, "num_tokens": 171777974.0, "step": 1077 }, { "epoch": 0.5483214649033571, "grad_norm": 1.0394412279129028, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.8367418050765991, "num_tokens": 171931276.0, "step": 1078 }, { "epoch": 0.5488301119023398, "grad_norm": 1.0544158220291138, "learning_rate": 1e-05, "loss": 0.5323, "mean_token_accuracy": 0.8363548517227173, "num_tokens": 172092986.0, "step": 1079 }, { "epoch": 0.5493387589013224, "grad_norm": 1.0615379810333252, "learning_rate": 1e-05, "loss": 0.5448, "mean_token_accuracy": 0.8330628871917725, "num_tokens": 172259334.0, "step": 1080 }, { "epoch": 0.5498474059003052, "grad_norm": 1.0669916868209839, "learning_rate": 1e-05, "loss": 0.5323, "mean_token_accuracy": 0.8377177715301514, "num_tokens": 172432734.0, "step": 1081 }, { "epoch": 0.5503560528992879, "grad_norm": 1.0713269710540771, "learning_rate": 1e-05, "loss": 0.5222, "mean_token_accuracy": 0.8381586074829102, "num_tokens": 172580086.0, "step": 1082 }, { "epoch": 0.5508646998982706, "grad_norm": 1.1586099863052368, "learning_rate": 1e-05, "loss": 0.5418, "mean_token_accuracy": 0.8332674503326416, "num_tokens": 172733854.0, "step": 1083 }, { "epoch": 0.5513733468972533, "grad_norm": 1.0385310649871826, "learning_rate": 1e-05, "loss": 0.5574, "mean_token_accuracy": 0.8295480012893677, "num_tokens": 172895819.0, "step": 1084 }, { "epoch": 0.551881993896236, "grad_norm": 1.0283936262130737, "learning_rate": 1e-05, "loss": 0.5268, "mean_token_accuracy": 0.8370460867881775, "num_tokens": 173057284.0, "step": 1085 }, { "epoch": 0.5523906408952187, "grad_norm": 1.065392255783081, "learning_rate": 1e-05, "loss": 0.5244, "mean_token_accuracy": 0.8387396931648254, "num_tokens": 173207899.0, "step": 1086 }, { "epoch": 0.5528992878942014, "grad_norm": 1.185757040977478, "learning_rate": 1e-05, "loss": 0.5484, "mean_token_accuracy": 0.8312965631484985, "num_tokens": 173378839.0, "step": 1087 }, { "epoch": 0.5534079348931842, "grad_norm": 1.1472268104553223, "learning_rate": 1e-05, "loss": 0.5372, "mean_token_accuracy": 0.8326317667961121, "num_tokens": 173547290.0, "step": 1088 }, { "epoch": 0.5539165818921669, "grad_norm": 1.1030000448226929, "learning_rate": 1e-05, "loss": 0.5495, "mean_token_accuracy": 0.8318257331848145, "num_tokens": 173706562.0, "step": 1089 }, { "epoch": 0.5544252288911495, "grad_norm": 1.065309762954712, "learning_rate": 1e-05, "loss": 0.5729, "mean_token_accuracy": 0.8261637687683105, "num_tokens": 173872148.0, "step": 1090 }, { "epoch": 0.5549338758901322, "grad_norm": 1.1334059238433838, "learning_rate": 1e-05, "loss": 0.5297, "mean_token_accuracy": 0.8351319432258606, "num_tokens": 174021510.0, "step": 1091 }, { "epoch": 0.555442522889115, "grad_norm": 1.1944429874420166, "learning_rate": 1e-05, "loss": 0.5754, "mean_token_accuracy": 0.8241192102432251, "num_tokens": 174174594.0, "step": 1092 }, { "epoch": 0.5559511698880977, "grad_norm": 1.242767095565796, "learning_rate": 1e-05, "loss": 0.5618, "mean_token_accuracy": 0.8247935771942139, "num_tokens": 174327672.0, "step": 1093 }, { "epoch": 0.5564598168870803, "grad_norm": 1.1942962408065796, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8370338678359985, "num_tokens": 174483118.0, "step": 1094 }, { "epoch": 0.556968463886063, "grad_norm": 1.097080111503601, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.8347339630126953, "num_tokens": 174644489.0, "step": 1095 }, { "epoch": 0.5574771108850458, "grad_norm": 1.1676805019378662, "learning_rate": 1e-05, "loss": 0.5421, "mean_token_accuracy": 0.8333439826965332, "num_tokens": 174798555.0, "step": 1096 }, { "epoch": 0.5579857578840285, "grad_norm": 1.094206690788269, "learning_rate": 1e-05, "loss": 0.535, "mean_token_accuracy": 0.8350372910499573, "num_tokens": 174955732.0, "step": 1097 }, { "epoch": 0.5584944048830112, "grad_norm": 0.9773333668708801, "learning_rate": 1e-05, "loss": 0.5332, "mean_token_accuracy": 0.8351495862007141, "num_tokens": 175130207.0, "step": 1098 }, { "epoch": 0.559003051881994, "grad_norm": 1.1231311559677124, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.8395377397537231, "num_tokens": 175283656.0, "step": 1099 }, { "epoch": 0.5595116988809766, "grad_norm": 1.065348744392395, "learning_rate": 1e-05, "loss": 0.5255, "mean_token_accuracy": 0.8385817408561707, "num_tokens": 175445505.0, "step": 1100 }, { "epoch": 0.5600203458799593, "grad_norm": 1.126814603805542, "learning_rate": 1e-05, "loss": 0.5341, "mean_token_accuracy": 0.8354888558387756, "num_tokens": 175600838.0, "step": 1101 }, { "epoch": 0.560528992878942, "grad_norm": 1.0460002422332764, "learning_rate": 1e-05, "loss": 0.5612, "mean_token_accuracy": 0.827361524105072, "num_tokens": 175767713.0, "step": 1102 }, { "epoch": 0.5610376398779248, "grad_norm": 1.156819462776184, "learning_rate": 1e-05, "loss": 0.5619, "mean_token_accuracy": 0.8260256052017212, "num_tokens": 175927627.0, "step": 1103 }, { "epoch": 0.5615462868769074, "grad_norm": 1.053056001663208, "learning_rate": 1e-05, "loss": 0.5522, "mean_token_accuracy": 0.8309029340744019, "num_tokens": 176094548.0, "step": 1104 }, { "epoch": 0.5620549338758901, "grad_norm": 1.022141933441162, "learning_rate": 1e-05, "loss": 0.5339, "mean_token_accuracy": 0.8342273235321045, "num_tokens": 176249226.0, "step": 1105 }, { "epoch": 0.5625635808748728, "grad_norm": 1.0103915929794312, "learning_rate": 1e-05, "loss": 0.5394, "mean_token_accuracy": 0.8349782824516296, "num_tokens": 176419770.0, "step": 1106 }, { "epoch": 0.5630722278738556, "grad_norm": 1.1695556640625, "learning_rate": 1e-05, "loss": 0.5223, "mean_token_accuracy": 0.8384734988212585, "num_tokens": 176579372.0, "step": 1107 }, { "epoch": 0.5635808748728383, "grad_norm": 1.1732338666915894, "learning_rate": 1e-05, "loss": 0.5587, "mean_token_accuracy": 0.8287216424942017, "num_tokens": 176731375.0, "step": 1108 }, { "epoch": 0.5640895218718209, "grad_norm": 1.058566689491272, "learning_rate": 1e-05, "loss": 0.5262, "mean_token_accuracy": 0.8381431698799133, "num_tokens": 176882380.0, "step": 1109 }, { "epoch": 0.5645981688708036, "grad_norm": 1.0918478965759277, "learning_rate": 1e-05, "loss": 0.5236, "mean_token_accuracy": 0.8384039402008057, "num_tokens": 177037040.0, "step": 1110 }, { "epoch": 0.5651068158697864, "grad_norm": 1.040511965751648, "learning_rate": 1e-05, "loss": 0.563, "mean_token_accuracy": 0.8262896537780762, "num_tokens": 177193855.0, "step": 1111 }, { "epoch": 0.5656154628687691, "grad_norm": 1.1200957298278809, "learning_rate": 1e-05, "loss": 0.5427, "mean_token_accuracy": 0.8336125016212463, "num_tokens": 177353869.0, "step": 1112 }, { "epoch": 0.5661241098677517, "grad_norm": 1.0688713788986206, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.8350938558578491, "num_tokens": 177503862.0, "step": 1113 }, { "epoch": 0.5666327568667345, "grad_norm": 1.0373692512512207, "learning_rate": 1e-05, "loss": 0.5115, "mean_token_accuracy": 0.8404502272605896, "num_tokens": 177663239.0, "step": 1114 }, { "epoch": 0.5671414038657172, "grad_norm": 1.0870791673660278, "learning_rate": 1e-05, "loss": 0.5215, "mean_token_accuracy": 0.8382793068885803, "num_tokens": 177822738.0, "step": 1115 }, { "epoch": 0.5676500508646999, "grad_norm": 1.2546334266662598, "learning_rate": 1e-05, "loss": 0.5379, "mean_token_accuracy": 0.8341033458709717, "num_tokens": 177989713.0, "step": 1116 }, { "epoch": 0.5681586978636826, "grad_norm": 1.1117446422576904, "learning_rate": 1e-05, "loss": 0.5515, "mean_token_accuracy": 0.8312805891036987, "num_tokens": 178140529.0, "step": 1117 }, { "epoch": 0.5686673448626653, "grad_norm": 1.0564945936203003, "learning_rate": 1e-05, "loss": 0.5421, "mean_token_accuracy": 0.8333523273468018, "num_tokens": 178311252.0, "step": 1118 }, { "epoch": 0.569175991861648, "grad_norm": 1.1679905652999878, "learning_rate": 1e-05, "loss": 0.5638, "mean_token_accuracy": 0.8269079923629761, "num_tokens": 178475164.0, "step": 1119 }, { "epoch": 0.5696846388606307, "grad_norm": 1.115697979927063, "learning_rate": 1e-05, "loss": 0.5685, "mean_token_accuracy": 0.8261163830757141, "num_tokens": 178635634.0, "step": 1120 }, { "epoch": 0.5701932858596134, "grad_norm": 1.1107937097549438, "learning_rate": 1e-05, "loss": 0.5671, "mean_token_accuracy": 0.8257395625114441, "num_tokens": 178786348.0, "step": 1121 }, { "epoch": 0.5707019328585962, "grad_norm": 1.0822064876556396, "learning_rate": 1e-05, "loss": 0.5457, "mean_token_accuracy": 0.8325570225715637, "num_tokens": 178948373.0, "step": 1122 }, { "epoch": 0.5712105798575788, "grad_norm": 1.142685055732727, "learning_rate": 1e-05, "loss": 0.509, "mean_token_accuracy": 0.8404792547225952, "num_tokens": 179104122.0, "step": 1123 }, { "epoch": 0.5717192268565615, "grad_norm": 1.0351077318191528, "learning_rate": 1e-05, "loss": 0.5318, "mean_token_accuracy": 0.8346115350723267, "num_tokens": 179265598.0, "step": 1124 }, { "epoch": 0.5722278738555443, "grad_norm": 1.0350379943847656, "learning_rate": 1e-05, "loss": 0.5521, "mean_token_accuracy": 0.8314869999885559, "num_tokens": 179417035.0, "step": 1125 }, { "epoch": 0.572736520854527, "grad_norm": 1.0398163795471191, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8392678499221802, "num_tokens": 179577100.0, "step": 1126 }, { "epoch": 0.5732451678535097, "grad_norm": 1.0036226511001587, "learning_rate": 1e-05, "loss": 0.5552, "mean_token_accuracy": 0.8290267586708069, "num_tokens": 179744420.0, "step": 1127 }, { "epoch": 0.5737538148524923, "grad_norm": 1.1067705154418945, "learning_rate": 1e-05, "loss": 0.54, "mean_token_accuracy": 0.8334242701530457, "num_tokens": 179892505.0, "step": 1128 }, { "epoch": 0.5742624618514751, "grad_norm": 1.1229734420776367, "learning_rate": 1e-05, "loss": 0.5621, "mean_token_accuracy": 0.8270100355148315, "num_tokens": 180051796.0, "step": 1129 }, { "epoch": 0.5747711088504578, "grad_norm": 1.0072256326675415, "learning_rate": 1e-05, "loss": 0.5334, "mean_token_accuracy": 0.8366422653198242, "num_tokens": 180223161.0, "step": 1130 }, { "epoch": 0.5752797558494405, "grad_norm": 1.038099765777588, "learning_rate": 1e-05, "loss": 0.5532, "mean_token_accuracy": 0.8312008380889893, "num_tokens": 180381038.0, "step": 1131 }, { "epoch": 0.5757884028484231, "grad_norm": 1.06266450881958, "learning_rate": 1e-05, "loss": 0.5434, "mean_token_accuracy": 0.8321468234062195, "num_tokens": 180529292.0, "step": 1132 }, { "epoch": 0.5762970498474059, "grad_norm": 1.0975128412246704, "learning_rate": 1e-05, "loss": 0.5221, "mean_token_accuracy": 0.8384957313537598, "num_tokens": 180673192.0, "step": 1133 }, { "epoch": 0.5768056968463886, "grad_norm": 1.063176155090332, "learning_rate": 1e-05, "loss": 0.5348, "mean_token_accuracy": 0.8349170684814453, "num_tokens": 180831990.0, "step": 1134 }, { "epoch": 0.5773143438453713, "grad_norm": 1.0480600595474243, "learning_rate": 1e-05, "loss": 0.5492, "mean_token_accuracy": 0.830919623374939, "num_tokens": 180994135.0, "step": 1135 }, { "epoch": 0.5778229908443541, "grad_norm": 1.0666347742080688, "learning_rate": 1e-05, "loss": 0.5516, "mean_token_accuracy": 0.8308299779891968, "num_tokens": 181158179.0, "step": 1136 }, { "epoch": 0.5783316378433367, "grad_norm": 1.0088244676589966, "learning_rate": 1e-05, "loss": 0.5192, "mean_token_accuracy": 0.8389890193939209, "num_tokens": 181316469.0, "step": 1137 }, { "epoch": 0.5788402848423194, "grad_norm": 1.0174601078033447, "learning_rate": 1e-05, "loss": 0.518, "mean_token_accuracy": 0.8383738994598389, "num_tokens": 181465876.0, "step": 1138 }, { "epoch": 0.5793489318413021, "grad_norm": 1.0556000471115112, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8405688405036926, "num_tokens": 181626190.0, "step": 1139 }, { "epoch": 0.5798575788402849, "grad_norm": 1.082321047782898, "learning_rate": 1e-05, "loss": 0.586, "mean_token_accuracy": 0.8204494714736938, "num_tokens": 181772486.0, "step": 1140 }, { "epoch": 0.5803662258392676, "grad_norm": 1.0585923194885254, "learning_rate": 1e-05, "loss": 0.5468, "mean_token_accuracy": 0.8332372903823853, "num_tokens": 181930828.0, "step": 1141 }, { "epoch": 0.5808748728382502, "grad_norm": 1.0120269060134888, "learning_rate": 1e-05, "loss": 0.5399, "mean_token_accuracy": 0.8326765298843384, "num_tokens": 182092553.0, "step": 1142 }, { "epoch": 0.5813835198372329, "grad_norm": 1.0479254722595215, "learning_rate": 1e-05, "loss": 0.5443, "mean_token_accuracy": 0.8317548036575317, "num_tokens": 182242889.0, "step": 1143 }, { "epoch": 0.5818921668362157, "grad_norm": 1.0533218383789062, "learning_rate": 1e-05, "loss": 0.5567, "mean_token_accuracy": 0.8292123079299927, "num_tokens": 182399236.0, "step": 1144 }, { "epoch": 0.5824008138351984, "grad_norm": 1.0289981365203857, "learning_rate": 1e-05, "loss": 0.564, "mean_token_accuracy": 0.827014684677124, "num_tokens": 182555200.0, "step": 1145 }, { "epoch": 0.582909460834181, "grad_norm": 1.0775690078735352, "learning_rate": 1e-05, "loss": 0.5442, "mean_token_accuracy": 0.8320527076721191, "num_tokens": 182711095.0, "step": 1146 }, { "epoch": 0.5834181078331638, "grad_norm": 1.0060006380081177, "learning_rate": 1e-05, "loss": 0.564, "mean_token_accuracy": 0.8269708752632141, "num_tokens": 182884563.0, "step": 1147 }, { "epoch": 0.5839267548321465, "grad_norm": 1.0512561798095703, "learning_rate": 1e-05, "loss": 0.5789, "mean_token_accuracy": 0.8244001865386963, "num_tokens": 183045279.0, "step": 1148 }, { "epoch": 0.5844354018311292, "grad_norm": 1.0781697034835815, "learning_rate": 1e-05, "loss": 0.5444, "mean_token_accuracy": 0.8334587216377258, "num_tokens": 183209431.0, "step": 1149 }, { "epoch": 0.5849440488301119, "grad_norm": 1.094925045967102, "learning_rate": 1e-05, "loss": 0.5202, "mean_token_accuracy": 0.8398051261901855, "num_tokens": 183377751.0, "step": 1150 }, { "epoch": 0.5854526958290946, "grad_norm": 1.044399619102478, "learning_rate": 1e-05, "loss": 0.5227, "mean_token_accuracy": 0.8384654521942139, "num_tokens": 183530201.0, "step": 1151 }, { "epoch": 0.5859613428280773, "grad_norm": 1.09066641330719, "learning_rate": 1e-05, "loss": 0.5473, "mean_token_accuracy": 0.8314449191093445, "num_tokens": 183676743.0, "step": 1152 }, { "epoch": 0.58646998982706, "grad_norm": 1.0624159574508667, "learning_rate": 1e-05, "loss": 0.5284, "mean_token_accuracy": 0.8383419513702393, "num_tokens": 183838597.0, "step": 1153 }, { "epoch": 0.5869786368260427, "grad_norm": 1.0591371059417725, "learning_rate": 1e-05, "loss": 0.5352, "mean_token_accuracy": 0.8353383541107178, "num_tokens": 184006491.0, "step": 1154 }, { "epoch": 0.5874872838250255, "grad_norm": 1.000448226928711, "learning_rate": 1e-05, "loss": 0.5334, "mean_token_accuracy": 0.8357807993888855, "num_tokens": 184170537.0, "step": 1155 }, { "epoch": 0.5879959308240081, "grad_norm": 1.1702582836151123, "learning_rate": 1e-05, "loss": 0.523, "mean_token_accuracy": 0.8372135162353516, "num_tokens": 184334536.0, "step": 1156 }, { "epoch": 0.5885045778229908, "grad_norm": 1.1041123867034912, "learning_rate": 1e-05, "loss": 0.5201, "mean_token_accuracy": 0.835742712020874, "num_tokens": 184489719.0, "step": 1157 }, { "epoch": 0.5890132248219736, "grad_norm": 1.1316978931427002, "learning_rate": 1e-05, "loss": 0.5582, "mean_token_accuracy": 0.8292644619941711, "num_tokens": 184644555.0, "step": 1158 }, { "epoch": 0.5895218718209563, "grad_norm": 1.07127046585083, "learning_rate": 1e-05, "loss": 0.5291, "mean_token_accuracy": 0.8366115093231201, "num_tokens": 184803771.0, "step": 1159 }, { "epoch": 0.590030518819939, "grad_norm": 1.0761888027191162, "learning_rate": 1e-05, "loss": 0.5475, "mean_token_accuracy": 0.8302804231643677, "num_tokens": 184956764.0, "step": 1160 }, { "epoch": 0.5905391658189216, "grad_norm": 1.1104953289031982, "learning_rate": 1e-05, "loss": 0.5195, "mean_token_accuracy": 0.8388170599937439, "num_tokens": 185122654.0, "step": 1161 }, { "epoch": 0.5910478128179044, "grad_norm": 1.101953387260437, "learning_rate": 1e-05, "loss": 0.5535, "mean_token_accuracy": 0.8308421969413757, "num_tokens": 185278874.0, "step": 1162 }, { "epoch": 0.5915564598168871, "grad_norm": 1.1842914819717407, "learning_rate": 1e-05, "loss": 0.5308, "mean_token_accuracy": 0.8350153565406799, "num_tokens": 185438919.0, "step": 1163 }, { "epoch": 0.5920651068158698, "grad_norm": 1.2792435884475708, "learning_rate": 1e-05, "loss": 0.5258, "mean_token_accuracy": 0.8371274471282959, "num_tokens": 185603559.0, "step": 1164 }, { "epoch": 0.5925737538148524, "grad_norm": 1.0808415412902832, "learning_rate": 1e-05, "loss": 0.5209, "mean_token_accuracy": 0.8390897512435913, "num_tokens": 185745740.0, "step": 1165 }, { "epoch": 0.5930824008138352, "grad_norm": 1.0706919431686401, "learning_rate": 1e-05, "loss": 0.5521, "mean_token_accuracy": 0.8303017616271973, "num_tokens": 185904158.0, "step": 1166 }, { "epoch": 0.5935910478128179, "grad_norm": 1.181048035621643, "learning_rate": 1e-05, "loss": 0.5384, "mean_token_accuracy": 0.8349406123161316, "num_tokens": 186062692.0, "step": 1167 }, { "epoch": 0.5940996948118006, "grad_norm": 1.0889734029769897, "learning_rate": 1e-05, "loss": 0.5162, "mean_token_accuracy": 0.8389154672622681, "num_tokens": 186208990.0, "step": 1168 }, { "epoch": 0.5946083418107834, "grad_norm": 1.1245179176330566, "learning_rate": 1e-05, "loss": 0.531, "mean_token_accuracy": 0.8353163003921509, "num_tokens": 186353260.0, "step": 1169 }, { "epoch": 0.595116988809766, "grad_norm": 1.2074040174484253, "learning_rate": 1e-05, "loss": 0.5836, "mean_token_accuracy": 0.821225643157959, "num_tokens": 186527391.0, "step": 1170 }, { "epoch": 0.5956256358087487, "grad_norm": 1.1722588539123535, "learning_rate": 1e-05, "loss": 0.5531, "mean_token_accuracy": 0.8324877023696899, "num_tokens": 186690975.0, "step": 1171 }, { "epoch": 0.5961342828077314, "grad_norm": 1.001542568206787, "learning_rate": 1e-05, "loss": 0.4979, "mean_token_accuracy": 0.8467972874641418, "num_tokens": 186851711.0, "step": 1172 }, { "epoch": 0.5966429298067142, "grad_norm": 1.0898191928863525, "learning_rate": 1e-05, "loss": 0.5337, "mean_token_accuracy": 0.8319215178489685, "num_tokens": 187016950.0, "step": 1173 }, { "epoch": 0.5971515768056969, "grad_norm": 1.1995619535446167, "learning_rate": 1e-05, "loss": 0.55, "mean_token_accuracy": 0.8314080238342285, "num_tokens": 187158410.0, "step": 1174 }, { "epoch": 0.5976602238046795, "grad_norm": 1.0415512323379517, "learning_rate": 1e-05, "loss": 0.5048, "mean_token_accuracy": 0.843203067779541, "num_tokens": 187319518.0, "step": 1175 }, { "epoch": 0.5981688708036622, "grad_norm": 1.0477491617202759, "learning_rate": 1e-05, "loss": 0.5288, "mean_token_accuracy": 0.8374745845794678, "num_tokens": 187492387.0, "step": 1176 }, { "epoch": 0.598677517802645, "grad_norm": 1.053478479385376, "learning_rate": 1e-05, "loss": 0.5652, "mean_token_accuracy": 0.8271386623382568, "num_tokens": 187660820.0, "step": 1177 }, { "epoch": 0.5991861648016277, "grad_norm": 1.1635273694992065, "learning_rate": 1e-05, "loss": 0.5464, "mean_token_accuracy": 0.8312414884567261, "num_tokens": 187815200.0, "step": 1178 }, { "epoch": 0.5996948118006104, "grad_norm": 1.155785083770752, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8382295370101929, "num_tokens": 187976720.0, "step": 1179 }, { "epoch": 0.6002034587995931, "grad_norm": 1.0066823959350586, "learning_rate": 1e-05, "loss": 0.5092, "mean_token_accuracy": 0.841545581817627, "num_tokens": 188141681.0, "step": 1180 }, { "epoch": 0.6007121057985758, "grad_norm": 1.1968673467636108, "learning_rate": 1e-05, "loss": 0.5348, "mean_token_accuracy": 0.8361446857452393, "num_tokens": 188306729.0, "step": 1181 }, { "epoch": 0.6012207527975585, "grad_norm": 1.034803867340088, "learning_rate": 1e-05, "loss": 0.5675, "mean_token_accuracy": 0.8266303539276123, "num_tokens": 188459816.0, "step": 1182 }, { "epoch": 0.6017293997965412, "grad_norm": 1.1092537641525269, "learning_rate": 1e-05, "loss": 0.5556, "mean_token_accuracy": 0.8273969292640686, "num_tokens": 188624442.0, "step": 1183 }, { "epoch": 0.602238046795524, "grad_norm": 1.0602318048477173, "learning_rate": 1e-05, "loss": 0.5564, "mean_token_accuracy": 0.8292548060417175, "num_tokens": 188793110.0, "step": 1184 }, { "epoch": 0.6027466937945066, "grad_norm": 1.0197434425354004, "learning_rate": 1e-05, "loss": 0.51, "mean_token_accuracy": 0.839547872543335, "num_tokens": 188959411.0, "step": 1185 }, { "epoch": 0.6032553407934893, "grad_norm": 1.0977286100387573, "learning_rate": 1e-05, "loss": 0.5326, "mean_token_accuracy": 0.8351724147796631, "num_tokens": 189123138.0, "step": 1186 }, { "epoch": 0.603763987792472, "grad_norm": 1.0537692308425903, "learning_rate": 1e-05, "loss": 0.529, "mean_token_accuracy": 0.836279034614563, "num_tokens": 189288712.0, "step": 1187 }, { "epoch": 0.6042726347914548, "grad_norm": 1.103000283241272, "learning_rate": 1e-05, "loss": 0.5238, "mean_token_accuracy": 0.838005781173706, "num_tokens": 189448908.0, "step": 1188 }, { "epoch": 0.6047812817904374, "grad_norm": 1.1983293294906616, "learning_rate": 1e-05, "loss": 0.5514, "mean_token_accuracy": 0.8291587233543396, "num_tokens": 189601380.0, "step": 1189 }, { "epoch": 0.6052899287894201, "grad_norm": 1.2270739078521729, "learning_rate": 1e-05, "loss": 0.5507, "mean_token_accuracy": 0.8289862871170044, "num_tokens": 189760563.0, "step": 1190 }, { "epoch": 0.6057985757884028, "grad_norm": 1.155634880065918, "learning_rate": 1e-05, "loss": 0.5417, "mean_token_accuracy": 0.8324066400527954, "num_tokens": 189919046.0, "step": 1191 }, { "epoch": 0.6063072227873856, "grad_norm": 1.123648762702942, "learning_rate": 1e-05, "loss": 0.547, "mean_token_accuracy": 0.8326415419578552, "num_tokens": 190067307.0, "step": 1192 }, { "epoch": 0.6068158697863683, "grad_norm": 1.0511082410812378, "learning_rate": 1e-05, "loss": 0.5268, "mean_token_accuracy": 0.8373522758483887, "num_tokens": 190216290.0, "step": 1193 }, { "epoch": 0.6073245167853509, "grad_norm": 1.1812682151794434, "learning_rate": 1e-05, "loss": 0.5315, "mean_token_accuracy": 0.834931492805481, "num_tokens": 190372698.0, "step": 1194 }, { "epoch": 0.6078331637843337, "grad_norm": 1.1167006492614746, "learning_rate": 1e-05, "loss": 0.5295, "mean_token_accuracy": 0.8358839154243469, "num_tokens": 190528564.0, "step": 1195 }, { "epoch": 0.6083418107833164, "grad_norm": 1.1498664617538452, "learning_rate": 1e-05, "loss": 0.559, "mean_token_accuracy": 0.8274659514427185, "num_tokens": 190679521.0, "step": 1196 }, { "epoch": 0.6088504577822991, "grad_norm": 1.1137135028839111, "learning_rate": 1e-05, "loss": 0.5435, "mean_token_accuracy": 0.8332474827766418, "num_tokens": 190826754.0, "step": 1197 }, { "epoch": 0.6093591047812817, "grad_norm": 1.2709776163101196, "learning_rate": 1e-05, "loss": 0.5358, "mean_token_accuracy": 0.8357667326927185, "num_tokens": 190990350.0, "step": 1198 }, { "epoch": 0.6098677517802645, "grad_norm": 1.2216440439224243, "learning_rate": 1e-05, "loss": 0.5632, "mean_token_accuracy": 0.827555239200592, "num_tokens": 191150121.0, "step": 1199 }, { "epoch": 0.6103763987792472, "grad_norm": 1.1574770212173462, "learning_rate": 1e-05, "loss": 0.5383, "mean_token_accuracy": 0.8342841267585754, "num_tokens": 191298356.0, "step": 1200 }, { "epoch": 0.6108850457782299, "grad_norm": 1.108181357383728, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.8400946855545044, "num_tokens": 191454560.0, "step": 1201 }, { "epoch": 0.6113936927772126, "grad_norm": 1.1643569469451904, "learning_rate": 1e-05, "loss": 0.4993, "mean_token_accuracy": 0.8451148867607117, "num_tokens": 191607061.0, "step": 1202 }, { "epoch": 0.6119023397761953, "grad_norm": 1.0500165224075317, "learning_rate": 1e-05, "loss": 0.536, "mean_token_accuracy": 0.8340716361999512, "num_tokens": 191764683.0, "step": 1203 }, { "epoch": 0.612410986775178, "grad_norm": 1.1072869300842285, "learning_rate": 1e-05, "loss": 0.5273, "mean_token_accuracy": 0.8370859622955322, "num_tokens": 191920982.0, "step": 1204 }, { "epoch": 0.6129196337741607, "grad_norm": 1.034971833229065, "learning_rate": 1e-05, "loss": 0.5239, "mean_token_accuracy": 0.8371504545211792, "num_tokens": 192084464.0, "step": 1205 }, { "epoch": 0.6134282807731435, "grad_norm": 1.0584032535552979, "learning_rate": 1e-05, "loss": 0.5443, "mean_token_accuracy": 0.8337501883506775, "num_tokens": 192244544.0, "step": 1206 }, { "epoch": 0.6139369277721262, "grad_norm": 1.1733242273330688, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8383514881134033, "num_tokens": 192398895.0, "step": 1207 }, { "epoch": 0.6144455747711088, "grad_norm": 1.1394904851913452, "learning_rate": 1e-05, "loss": 0.5679, "mean_token_accuracy": 0.8283772468566895, "num_tokens": 192568952.0, "step": 1208 }, { "epoch": 0.6149542217700915, "grad_norm": 1.1953922510147095, "learning_rate": 1e-05, "loss": 0.5564, "mean_token_accuracy": 0.8289297819137573, "num_tokens": 192726852.0, "step": 1209 }, { "epoch": 0.6154628687690743, "grad_norm": 1.1235586404800415, "learning_rate": 1e-05, "loss": 0.4998, "mean_token_accuracy": 0.8443677425384521, "num_tokens": 192871340.0, "step": 1210 }, { "epoch": 0.615971515768057, "grad_norm": 1.1388280391693115, "learning_rate": 1e-05, "loss": 0.5144, "mean_token_accuracy": 0.8400600552558899, "num_tokens": 193042111.0, "step": 1211 }, { "epoch": 0.6164801627670397, "grad_norm": 1.1637824773788452, "learning_rate": 1e-05, "loss": 0.5454, "mean_token_accuracy": 0.8325772881507874, "num_tokens": 193194025.0, "step": 1212 }, { "epoch": 0.6169888097660223, "grad_norm": 1.2491023540496826, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.8449327945709229, "num_tokens": 193349718.0, "step": 1213 }, { "epoch": 0.6174974567650051, "grad_norm": 1.1179122924804688, "learning_rate": 1e-05, "loss": 0.5388, "mean_token_accuracy": 0.8341668844223022, "num_tokens": 193505599.0, "step": 1214 }, { "epoch": 0.6180061037639878, "grad_norm": 1.0245755910873413, "learning_rate": 1e-05, "loss": 0.5785, "mean_token_accuracy": 0.8252415060997009, "num_tokens": 193681994.0, "step": 1215 }, { "epoch": 0.6185147507629705, "grad_norm": 1.0279556512832642, "learning_rate": 1e-05, "loss": 0.5564, "mean_token_accuracy": 0.8292478919029236, "num_tokens": 193846107.0, "step": 1216 }, { "epoch": 0.6190233977619533, "grad_norm": 1.0754578113555908, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.835884690284729, "num_tokens": 194001101.0, "step": 1217 }, { "epoch": 0.6195320447609359, "grad_norm": 0.9642245173454285, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.8447218537330627, "num_tokens": 194162440.0, "step": 1218 }, { "epoch": 0.6200406917599186, "grad_norm": 1.0722028017044067, "learning_rate": 1e-05, "loss": 0.5374, "mean_token_accuracy": 0.8334408402442932, "num_tokens": 194316830.0, "step": 1219 }, { "epoch": 0.6205493387589013, "grad_norm": 0.9881809949874878, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8374440670013428, "num_tokens": 194471827.0, "step": 1220 }, { "epoch": 0.6210579857578841, "grad_norm": 1.063515305519104, "learning_rate": 1e-05, "loss": 0.5354, "mean_token_accuracy": 0.833842396736145, "num_tokens": 194631877.0, "step": 1221 }, { "epoch": 0.6215666327568667, "grad_norm": 0.9928879737854004, "learning_rate": 1e-05, "loss": 0.53, "mean_token_accuracy": 0.8351453542709351, "num_tokens": 194798176.0, "step": 1222 }, { "epoch": 0.6220752797558494, "grad_norm": 1.0375672578811646, "learning_rate": 1e-05, "loss": 0.5402, "mean_token_accuracy": 0.8339008688926697, "num_tokens": 194954556.0, "step": 1223 }, { "epoch": 0.6225839267548321, "grad_norm": 1.1741032600402832, "learning_rate": 1e-05, "loss": 0.5645, "mean_token_accuracy": 0.8256007432937622, "num_tokens": 195115010.0, "step": 1224 }, { "epoch": 0.6230925737538149, "grad_norm": 1.037577748298645, "learning_rate": 1e-05, "loss": 0.5128, "mean_token_accuracy": 0.8424239754676819, "num_tokens": 195277332.0, "step": 1225 }, { "epoch": 0.6236012207527976, "grad_norm": 1.1055760383605957, "learning_rate": 1e-05, "loss": 0.5151, "mean_token_accuracy": 0.8401551246643066, "num_tokens": 195442166.0, "step": 1226 }, { "epoch": 0.6241098677517802, "grad_norm": 0.9993090629577637, "learning_rate": 1e-05, "loss": 0.5111, "mean_token_accuracy": 0.8418375849723816, "num_tokens": 195598203.0, "step": 1227 }, { "epoch": 0.624618514750763, "grad_norm": 1.062853217124939, "learning_rate": 1e-05, "loss": 0.5156, "mean_token_accuracy": 0.8409103155136108, "num_tokens": 195755945.0, "step": 1228 }, { "epoch": 0.6251271617497457, "grad_norm": 0.9994860291481018, "learning_rate": 1e-05, "loss": 0.5165, "mean_token_accuracy": 0.8400459289550781, "num_tokens": 195923721.0, "step": 1229 }, { "epoch": 0.6256358087487284, "grad_norm": 1.10517418384552, "learning_rate": 1e-05, "loss": 0.5064, "mean_token_accuracy": 0.8433425426483154, "num_tokens": 196076642.0, "step": 1230 }, { "epoch": 0.626144455747711, "grad_norm": 1.0957393646240234, "learning_rate": 1e-05, "loss": 0.5347, "mean_token_accuracy": 0.8352852463722229, "num_tokens": 196232622.0, "step": 1231 }, { "epoch": 0.6266531027466938, "grad_norm": 1.0993536710739136, "learning_rate": 1e-05, "loss": 0.5545, "mean_token_accuracy": 0.8281955122947693, "num_tokens": 196384640.0, "step": 1232 }, { "epoch": 0.6271617497456765, "grad_norm": 1.0579915046691895, "learning_rate": 1e-05, "loss": 0.5706, "mean_token_accuracy": 0.8254827857017517, "num_tokens": 196546059.0, "step": 1233 }, { "epoch": 0.6276703967446592, "grad_norm": 1.0779695510864258, "learning_rate": 1e-05, "loss": 0.5603, "mean_token_accuracy": 0.8296513557434082, "num_tokens": 196709416.0, "step": 1234 }, { "epoch": 0.6281790437436419, "grad_norm": 0.9989771246910095, "learning_rate": 1e-05, "loss": 0.542, "mean_token_accuracy": 0.8338413834571838, "num_tokens": 196874595.0, "step": 1235 }, { "epoch": 0.6286876907426246, "grad_norm": 1.0544723272323608, "learning_rate": 1e-05, "loss": 0.5132, "mean_token_accuracy": 0.8396172523498535, "num_tokens": 197038688.0, "step": 1236 }, { "epoch": 0.6291963377416073, "grad_norm": 1.1366897821426392, "learning_rate": 1e-05, "loss": 0.5745, "mean_token_accuracy": 0.8230605125427246, "num_tokens": 197192465.0, "step": 1237 }, { "epoch": 0.62970498474059, "grad_norm": 1.061204433441162, "learning_rate": 1e-05, "loss": 0.5529, "mean_token_accuracy": 0.8329615592956543, "num_tokens": 197361082.0, "step": 1238 }, { "epoch": 0.6302136317395728, "grad_norm": 1.0565414428710938, "learning_rate": 1e-05, "loss": 0.4889, "mean_token_accuracy": 0.8487517833709717, "num_tokens": 197512368.0, "step": 1239 }, { "epoch": 0.6307222787385555, "grad_norm": 1.0725106000900269, "learning_rate": 1e-05, "loss": 0.5561, "mean_token_accuracy": 0.829352617263794, "num_tokens": 197654672.0, "step": 1240 }, { "epoch": 0.6312309257375381, "grad_norm": 1.1748433113098145, "learning_rate": 1e-05, "loss": 0.5574, "mean_token_accuracy": 0.8295996189117432, "num_tokens": 197816279.0, "step": 1241 }, { "epoch": 0.6317395727365208, "grad_norm": 1.0469516515731812, "learning_rate": 1e-05, "loss": 0.5124, "mean_token_accuracy": 0.8417311906814575, "num_tokens": 197972971.0, "step": 1242 }, { "epoch": 0.6322482197355036, "grad_norm": 1.1066620349884033, "learning_rate": 1e-05, "loss": 0.526, "mean_token_accuracy": 0.8364421129226685, "num_tokens": 198125745.0, "step": 1243 }, { "epoch": 0.6327568667344863, "grad_norm": 0.9559982419013977, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8472656607627869, "num_tokens": 198288270.0, "step": 1244 }, { "epoch": 0.633265513733469, "grad_norm": 1.037071943283081, "learning_rate": 1e-05, "loss": 0.5423, "mean_token_accuracy": 0.8322509527206421, "num_tokens": 198445798.0, "step": 1245 }, { "epoch": 0.6337741607324516, "grad_norm": 1.0956902503967285, "learning_rate": 1e-05, "loss": 0.5091, "mean_token_accuracy": 0.8409451246261597, "num_tokens": 198600564.0, "step": 1246 }, { "epoch": 0.6342828077314344, "grad_norm": 0.9643290638923645, "learning_rate": 1e-05, "loss": 0.507, "mean_token_accuracy": 0.843378484249115, "num_tokens": 198768145.0, "step": 1247 }, { "epoch": 0.6347914547304171, "grad_norm": 1.114691138267517, "learning_rate": 1e-05, "loss": 0.5142, "mean_token_accuracy": 0.8397768139839172, "num_tokens": 198919816.0, "step": 1248 }, { "epoch": 0.6353001017293998, "grad_norm": 1.0919520854949951, "learning_rate": 1e-05, "loss": 0.5617, "mean_token_accuracy": 0.8267562389373779, "num_tokens": 199084288.0, "step": 1249 }, { "epoch": 0.6358087487283826, "grad_norm": 1.0663131475448608, "learning_rate": 1e-05, "loss": 0.5508, "mean_token_accuracy": 0.830141544342041, "num_tokens": 199234621.0, "step": 1250 }, { "epoch": 0.6363173957273652, "grad_norm": 0.9842988848686218, "learning_rate": 1e-05, "loss": 0.5205, "mean_token_accuracy": 0.8383627533912659, "num_tokens": 199415992.0, "step": 1251 }, { "epoch": 0.6368260427263479, "grad_norm": 1.0851852893829346, "learning_rate": 1e-05, "loss": 0.5415, "mean_token_accuracy": 0.8332940340042114, "num_tokens": 199581880.0, "step": 1252 }, { "epoch": 0.6373346897253306, "grad_norm": 1.1240712404251099, "learning_rate": 1e-05, "loss": 0.5182, "mean_token_accuracy": 0.8378025889396667, "num_tokens": 199728425.0, "step": 1253 }, { "epoch": 0.6378433367243134, "grad_norm": 1.100454568862915, "learning_rate": 1e-05, "loss": 0.5766, "mean_token_accuracy": 0.8229030966758728, "num_tokens": 199887035.0, "step": 1254 }, { "epoch": 0.638351983723296, "grad_norm": 1.128618597984314, "learning_rate": 1e-05, "loss": 0.5494, "mean_token_accuracy": 0.8319924473762512, "num_tokens": 200042873.0, "step": 1255 }, { "epoch": 0.6388606307222787, "grad_norm": 1.069466471672058, "learning_rate": 1e-05, "loss": 0.5192, "mean_token_accuracy": 0.8399227857589722, "num_tokens": 200218153.0, "step": 1256 }, { "epoch": 0.6393692777212614, "grad_norm": 1.1159545183181763, "learning_rate": 1e-05, "loss": 0.5434, "mean_token_accuracy": 0.8340426683425903, "num_tokens": 200375972.0, "step": 1257 }, { "epoch": 0.6398779247202442, "grad_norm": 1.0914868116378784, "learning_rate": 1e-05, "loss": 0.5517, "mean_token_accuracy": 0.8304837942123413, "num_tokens": 200538925.0, "step": 1258 }, { "epoch": 0.6403865717192269, "grad_norm": 1.016735315322876, "learning_rate": 1e-05, "loss": 0.5222, "mean_token_accuracy": 0.8377134799957275, "num_tokens": 200702936.0, "step": 1259 }, { "epoch": 0.6408952187182095, "grad_norm": 1.1668028831481934, "learning_rate": 1e-05, "loss": 0.5199, "mean_token_accuracy": 0.8381738662719727, "num_tokens": 200852605.0, "step": 1260 }, { "epoch": 0.6414038657171923, "grad_norm": 1.106713891029358, "learning_rate": 1e-05, "loss": 0.5192, "mean_token_accuracy": 0.8384389877319336, "num_tokens": 201011038.0, "step": 1261 }, { "epoch": 0.641912512716175, "grad_norm": 1.081728458404541, "learning_rate": 1e-05, "loss": 0.5503, "mean_token_accuracy": 0.8293412327766418, "num_tokens": 201171353.0, "step": 1262 }, { "epoch": 0.6424211597151577, "grad_norm": 1.0619851350784302, "learning_rate": 1e-05, "loss": 0.5578, "mean_token_accuracy": 0.8283439874649048, "num_tokens": 201335064.0, "step": 1263 }, { "epoch": 0.6429298067141404, "grad_norm": 1.0915756225585938, "learning_rate": 1e-05, "loss": 0.5157, "mean_token_accuracy": 0.8394124507904053, "num_tokens": 201496911.0, "step": 1264 }, { "epoch": 0.6434384537131231, "grad_norm": 1.0689359903335571, "learning_rate": 1e-05, "loss": 0.5414, "mean_token_accuracy": 0.8329365253448486, "num_tokens": 201650881.0, "step": 1265 }, { "epoch": 0.6439471007121058, "grad_norm": 1.0703952312469482, "learning_rate": 1e-05, "loss": 0.5636, "mean_token_accuracy": 0.8266373872756958, "num_tokens": 201822744.0, "step": 1266 }, { "epoch": 0.6444557477110885, "grad_norm": 0.9987941980361938, "learning_rate": 1e-05, "loss": 0.52, "mean_token_accuracy": 0.8399134278297424, "num_tokens": 201988138.0, "step": 1267 }, { "epoch": 0.6449643947100712, "grad_norm": 0.9540848731994629, "learning_rate": 1e-05, "loss": 0.5188, "mean_token_accuracy": 0.8398693203926086, "num_tokens": 202163172.0, "step": 1268 }, { "epoch": 0.645473041709054, "grad_norm": 1.0938210487365723, "learning_rate": 1e-05, "loss": 0.5251, "mean_token_accuracy": 0.8377525806427002, "num_tokens": 202330354.0, "step": 1269 }, { "epoch": 0.6459816887080366, "grad_norm": 1.0494672060012817, "learning_rate": 1e-05, "loss": 0.5443, "mean_token_accuracy": 0.8317760229110718, "num_tokens": 202477704.0, "step": 1270 }, { "epoch": 0.6464903357070193, "grad_norm": 1.0470654964447021, "learning_rate": 1e-05, "loss": 0.5715, "mean_token_accuracy": 0.8244767785072327, "num_tokens": 202638535.0, "step": 1271 }, { "epoch": 0.646998982706002, "grad_norm": 1.0278775691986084, "learning_rate": 1e-05, "loss": 0.5526, "mean_token_accuracy": 0.8294790983200073, "num_tokens": 202799169.0, "step": 1272 }, { "epoch": 0.6475076297049848, "grad_norm": 1.0553829669952393, "learning_rate": 1e-05, "loss": 0.5316, "mean_token_accuracy": 0.8347764015197754, "num_tokens": 202948731.0, "step": 1273 }, { "epoch": 0.6480162767039674, "grad_norm": 1.0590755939483643, "learning_rate": 1e-05, "loss": 0.5301, "mean_token_accuracy": 0.8380409479141235, "num_tokens": 203103019.0, "step": 1274 }, { "epoch": 0.6485249237029501, "grad_norm": 1.1748765707015991, "learning_rate": 1e-05, "loss": 0.5641, "mean_token_accuracy": 0.8264802694320679, "num_tokens": 203258243.0, "step": 1275 }, { "epoch": 0.6490335707019329, "grad_norm": 0.9675571918487549, "learning_rate": 1e-05, "loss": 0.5274, "mean_token_accuracy": 0.8368325233459473, "num_tokens": 203417686.0, "step": 1276 }, { "epoch": 0.6495422177009156, "grad_norm": 1.0740697383880615, "learning_rate": 1e-05, "loss": 0.5338, "mean_token_accuracy": 0.8351479768753052, "num_tokens": 203581925.0, "step": 1277 }, { "epoch": 0.6500508646998983, "grad_norm": 1.079943299293518, "learning_rate": 1e-05, "loss": 0.5607, "mean_token_accuracy": 0.8276282548904419, "num_tokens": 203737547.0, "step": 1278 }, { "epoch": 0.6505595116988809, "grad_norm": 1.1141538619995117, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8363447189331055, "num_tokens": 203886501.0, "step": 1279 }, { "epoch": 0.6510681586978637, "grad_norm": 1.2464641332626343, "learning_rate": 1e-05, "loss": 0.5374, "mean_token_accuracy": 0.8330584764480591, "num_tokens": 204040525.0, "step": 1280 }, { "epoch": 0.6515768056968464, "grad_norm": 1.1051416397094727, "learning_rate": 1e-05, "loss": 0.5219, "mean_token_accuracy": 0.8374311923980713, "num_tokens": 204205921.0, "step": 1281 }, { "epoch": 0.6520854526958291, "grad_norm": 1.125686764717102, "learning_rate": 1e-05, "loss": 0.5151, "mean_token_accuracy": 0.8404279947280884, "num_tokens": 204371623.0, "step": 1282 }, { "epoch": 0.6525940996948117, "grad_norm": 1.036682367324829, "learning_rate": 1e-05, "loss": 0.5175, "mean_token_accuracy": 0.8396049737930298, "num_tokens": 204531555.0, "step": 1283 }, { "epoch": 0.6531027466937945, "grad_norm": 1.085217833518982, "learning_rate": 1e-05, "loss": 0.5249, "mean_token_accuracy": 0.8371258974075317, "num_tokens": 204695860.0, "step": 1284 }, { "epoch": 0.6536113936927772, "grad_norm": 1.0670760869979858, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8381375074386597, "num_tokens": 204849971.0, "step": 1285 }, { "epoch": 0.6541200406917599, "grad_norm": 0.9454727172851562, "learning_rate": 1e-05, "loss": 0.4977, "mean_token_accuracy": 0.8452643752098083, "num_tokens": 205022471.0, "step": 1286 }, { "epoch": 0.6546286876907427, "grad_norm": 1.0735646486282349, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8420166969299316, "num_tokens": 205191250.0, "step": 1287 }, { "epoch": 0.6551373346897253, "grad_norm": 1.1079990863800049, "learning_rate": 1e-05, "loss": 0.5513, "mean_token_accuracy": 0.828972578048706, "num_tokens": 205338248.0, "step": 1288 }, { "epoch": 0.655645981688708, "grad_norm": 1.0452680587768555, "learning_rate": 1e-05, "loss": 0.5566, "mean_token_accuracy": 0.8274991512298584, "num_tokens": 205510777.0, "step": 1289 }, { "epoch": 0.6561546286876907, "grad_norm": 1.0797021389007568, "learning_rate": 1e-05, "loss": 0.5465, "mean_token_accuracy": 0.8306173086166382, "num_tokens": 205654689.0, "step": 1290 }, { "epoch": 0.6566632756866735, "grad_norm": 0.9831796884536743, "learning_rate": 1e-05, "loss": 0.5565, "mean_token_accuracy": 0.8293613195419312, "num_tokens": 205818547.0, "step": 1291 }, { "epoch": 0.6571719226856562, "grad_norm": 1.095979928970337, "learning_rate": 1e-05, "loss": 0.5728, "mean_token_accuracy": 0.8247345685958862, "num_tokens": 205984399.0, "step": 1292 }, { "epoch": 0.6576805696846388, "grad_norm": 1.0510746240615845, "learning_rate": 1e-05, "loss": 0.5789, "mean_token_accuracy": 0.8231319189071655, "num_tokens": 206148673.0, "step": 1293 }, { "epoch": 0.6581892166836215, "grad_norm": 1.2273945808410645, "learning_rate": 1e-05, "loss": 0.5174, "mean_token_accuracy": 0.8394807577133179, "num_tokens": 206308245.0, "step": 1294 }, { "epoch": 0.6586978636826043, "grad_norm": 1.0817188024520874, "learning_rate": 1e-05, "loss": 0.4983, "mean_token_accuracy": 0.8441655039787292, "num_tokens": 206471247.0, "step": 1295 }, { "epoch": 0.659206510681587, "grad_norm": 1.0500448942184448, "learning_rate": 1e-05, "loss": 0.5385, "mean_token_accuracy": 0.8328456282615662, "num_tokens": 206613022.0, "step": 1296 }, { "epoch": 0.6597151576805697, "grad_norm": 1.0789843797683716, "learning_rate": 1e-05, "loss": 0.5426, "mean_token_accuracy": 0.8310835957527161, "num_tokens": 206781121.0, "step": 1297 }, { "epoch": 0.6602238046795524, "grad_norm": 1.0747429132461548, "learning_rate": 1e-05, "loss": 0.5464, "mean_token_accuracy": 0.8310299515724182, "num_tokens": 206935916.0, "step": 1298 }, { "epoch": 0.6607324516785351, "grad_norm": 1.1076595783233643, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8450763821601868, "num_tokens": 207089114.0, "step": 1299 }, { "epoch": 0.6612410986775178, "grad_norm": 1.0206778049468994, "learning_rate": 1e-05, "loss": 0.5213, "mean_token_accuracy": 0.8379412293434143, "num_tokens": 207249857.0, "step": 1300 }, { "epoch": 0.6617497456765005, "grad_norm": 1.0830273628234863, "learning_rate": 1e-05, "loss": 0.5693, "mean_token_accuracy": 0.8260539174079895, "num_tokens": 207415519.0, "step": 1301 }, { "epoch": 0.6622583926754833, "grad_norm": 1.032319188117981, "learning_rate": 1e-05, "loss": 0.5331, "mean_token_accuracy": 0.8347049951553345, "num_tokens": 207585946.0, "step": 1302 }, { "epoch": 0.6627670396744659, "grad_norm": 1.0059924125671387, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.8431366682052612, "num_tokens": 207742283.0, "step": 1303 }, { "epoch": 0.6632756866734486, "grad_norm": 1.0041545629501343, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.8438891768455505, "num_tokens": 207905907.0, "step": 1304 }, { "epoch": 0.6637843336724313, "grad_norm": 1.0187472105026245, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8417147397994995, "num_tokens": 208066296.0, "step": 1305 }, { "epoch": 0.6642929806714141, "grad_norm": 1.0259668827056885, "learning_rate": 1e-05, "loss": 0.5098, "mean_token_accuracy": 0.8407272100448608, "num_tokens": 208221915.0, "step": 1306 }, { "epoch": 0.6648016276703967, "grad_norm": 1.1533657312393188, "learning_rate": 1e-05, "loss": 0.4846, "mean_token_accuracy": 0.8486118316650391, "num_tokens": 208374510.0, "step": 1307 }, { "epoch": 0.6653102746693794, "grad_norm": 1.092882513999939, "learning_rate": 1e-05, "loss": 0.5408, "mean_token_accuracy": 0.8338409662246704, "num_tokens": 208541953.0, "step": 1308 }, { "epoch": 0.6658189216683622, "grad_norm": 1.1926003694534302, "learning_rate": 1e-05, "loss": 0.5219, "mean_token_accuracy": 0.8373793959617615, "num_tokens": 208698721.0, "step": 1309 }, { "epoch": 0.6663275686673449, "grad_norm": 1.087581992149353, "learning_rate": 1e-05, "loss": 0.5256, "mean_token_accuracy": 0.8379611968994141, "num_tokens": 208855549.0, "step": 1310 }, { "epoch": 0.6668362156663276, "grad_norm": 1.1358734369277954, "learning_rate": 1e-05, "loss": 0.5506, "mean_token_accuracy": 0.8312427997589111, "num_tokens": 209015229.0, "step": 1311 }, { "epoch": 0.6673448626653102, "grad_norm": 1.130253791809082, "learning_rate": 1e-05, "loss": 0.54, "mean_token_accuracy": 0.8337419033050537, "num_tokens": 209167815.0, "step": 1312 }, { "epoch": 0.667853509664293, "grad_norm": 0.9735279083251953, "learning_rate": 1e-05, "loss": 0.5478, "mean_token_accuracy": 0.8318684101104736, "num_tokens": 209339913.0, "step": 1313 }, { "epoch": 0.6683621566632757, "grad_norm": 1.1487743854522705, "learning_rate": 1e-05, "loss": 0.5725, "mean_token_accuracy": 0.8254092931747437, "num_tokens": 209506826.0, "step": 1314 }, { "epoch": 0.6688708036622584, "grad_norm": 1.1382986307144165, "learning_rate": 1e-05, "loss": 0.5195, "mean_token_accuracy": 0.8394043445587158, "num_tokens": 209658534.0, "step": 1315 }, { "epoch": 0.669379450661241, "grad_norm": 1.1165403127670288, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.8393932580947876, "num_tokens": 209810131.0, "step": 1316 }, { "epoch": 0.6698880976602238, "grad_norm": 1.0738933086395264, "learning_rate": 1e-05, "loss": 0.4967, "mean_token_accuracy": 0.8456311225891113, "num_tokens": 209972848.0, "step": 1317 }, { "epoch": 0.6703967446592065, "grad_norm": 1.0726091861724854, "learning_rate": 1e-05, "loss": 0.5678, "mean_token_accuracy": 0.8266811370849609, "num_tokens": 210123131.0, "step": 1318 }, { "epoch": 0.6709053916581892, "grad_norm": 1.0094369649887085, "learning_rate": 1e-05, "loss": 0.5373, "mean_token_accuracy": 0.835343599319458, "num_tokens": 210281302.0, "step": 1319 }, { "epoch": 0.671414038657172, "grad_norm": 1.08613920211792, "learning_rate": 1e-05, "loss": 0.5162, "mean_token_accuracy": 0.8415795564651489, "num_tokens": 210444300.0, "step": 1320 }, { "epoch": 0.6719226856561547, "grad_norm": 1.0181106328964233, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.8374539613723755, "num_tokens": 210610063.0, "step": 1321 }, { "epoch": 0.6724313326551373, "grad_norm": 1.0776914358139038, "learning_rate": 1e-05, "loss": 0.5356, "mean_token_accuracy": 0.8327101469039917, "num_tokens": 210771386.0, "step": 1322 }, { "epoch": 0.67293997965412, "grad_norm": 1.0810972452163696, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8414871692657471, "num_tokens": 210919989.0, "step": 1323 }, { "epoch": 0.6734486266531028, "grad_norm": 1.2551976442337036, "learning_rate": 1e-05, "loss": 0.5499, "mean_token_accuracy": 0.831946611404419, "num_tokens": 211073799.0, "step": 1324 }, { "epoch": 0.6739572736520855, "grad_norm": 1.0054110288619995, "learning_rate": 1e-05, "loss": 0.5276, "mean_token_accuracy": 0.8376283645629883, "num_tokens": 211241360.0, "step": 1325 }, { "epoch": 0.6744659206510681, "grad_norm": 1.1998662948608398, "learning_rate": 1e-05, "loss": 0.5513, "mean_token_accuracy": 0.8302354216575623, "num_tokens": 211407370.0, "step": 1326 }, { "epoch": 0.6749745676500508, "grad_norm": 0.9885632991790771, "learning_rate": 1e-05, "loss": 0.5349, "mean_token_accuracy": 0.8335657119750977, "num_tokens": 211581491.0, "step": 1327 }, { "epoch": 0.6754832146490336, "grad_norm": 1.1440314054489136, "learning_rate": 1e-05, "loss": 0.5158, "mean_token_accuracy": 0.8405625820159912, "num_tokens": 211739189.0, "step": 1328 }, { "epoch": 0.6759918616480163, "grad_norm": 1.19303297996521, "learning_rate": 1e-05, "loss": 0.5386, "mean_token_accuracy": 0.8338267803192139, "num_tokens": 211893052.0, "step": 1329 }, { "epoch": 0.676500508646999, "grad_norm": 0.9618560075759888, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.8445158004760742, "num_tokens": 212047699.0, "step": 1330 }, { "epoch": 0.6770091556459817, "grad_norm": 1.1449192762374878, "learning_rate": 1e-05, "loss": 0.5301, "mean_token_accuracy": 0.834658682346344, "num_tokens": 212189491.0, "step": 1331 }, { "epoch": 0.6775178026449644, "grad_norm": 1.1187528371810913, "learning_rate": 1e-05, "loss": 0.5411, "mean_token_accuracy": 0.8329986333847046, "num_tokens": 212350768.0, "step": 1332 }, { "epoch": 0.6780264496439471, "grad_norm": 1.0808460712432861, "learning_rate": 1e-05, "loss": 0.5457, "mean_token_accuracy": 0.8318155407905579, "num_tokens": 212505424.0, "step": 1333 }, { "epoch": 0.6785350966429298, "grad_norm": 1.0829100608825684, "learning_rate": 1e-05, "loss": 0.5315, "mean_token_accuracy": 0.8344807028770447, "num_tokens": 212656414.0, "step": 1334 }, { "epoch": 0.6790437436419126, "grad_norm": 1.0648113489151, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.8352001905441284, "num_tokens": 212814380.0, "step": 1335 }, { "epoch": 0.6795523906408952, "grad_norm": 0.9756075143814087, "learning_rate": 1e-05, "loss": 0.5248, "mean_token_accuracy": 0.8369483947753906, "num_tokens": 212970630.0, "step": 1336 }, { "epoch": 0.6800610376398779, "grad_norm": 1.0037658214569092, "learning_rate": 1e-05, "loss": 0.5312, "mean_token_accuracy": 0.8363174200057983, "num_tokens": 213135584.0, "step": 1337 }, { "epoch": 0.6805696846388606, "grad_norm": 1.008080005645752, "learning_rate": 1e-05, "loss": 0.5304, "mean_token_accuracy": 0.8350541591644287, "num_tokens": 213296625.0, "step": 1338 }, { "epoch": 0.6810783316378434, "grad_norm": 1.0014381408691406, "learning_rate": 1e-05, "loss": 0.526, "mean_token_accuracy": 0.8380976915359497, "num_tokens": 213466408.0, "step": 1339 }, { "epoch": 0.681586978636826, "grad_norm": 1.0413174629211426, "learning_rate": 1e-05, "loss": 0.4952, "mean_token_accuracy": 0.846105694770813, "num_tokens": 213608058.0, "step": 1340 }, { "epoch": 0.6820956256358087, "grad_norm": 0.990386962890625, "learning_rate": 1e-05, "loss": 0.5326, "mean_token_accuracy": 0.835658848285675, "num_tokens": 213757840.0, "step": 1341 }, { "epoch": 0.6826042726347915, "grad_norm": 1.2512933015823364, "learning_rate": 1e-05, "loss": 0.5094, "mean_token_accuracy": 0.8409282565116882, "num_tokens": 213919287.0, "step": 1342 }, { "epoch": 0.6831129196337742, "grad_norm": 0.9421449303627014, "learning_rate": 1e-05, "loss": 0.5271, "mean_token_accuracy": 0.8369660973548889, "num_tokens": 214089237.0, "step": 1343 }, { "epoch": 0.6836215666327569, "grad_norm": 1.0769293308258057, "learning_rate": 1e-05, "loss": 0.5197, "mean_token_accuracy": 0.8391597270965576, "num_tokens": 214248836.0, "step": 1344 }, { "epoch": 0.6841302136317395, "grad_norm": 0.948756754398346, "learning_rate": 1e-05, "loss": 0.5196, "mean_token_accuracy": 0.8406849503517151, "num_tokens": 214419340.0, "step": 1345 }, { "epoch": 0.6846388606307223, "grad_norm": 0.9942221641540527, "learning_rate": 1e-05, "loss": 0.5296, "mean_token_accuracy": 0.8333039879798889, "num_tokens": 214578559.0, "step": 1346 }, { "epoch": 0.685147507629705, "grad_norm": 1.0239410400390625, "learning_rate": 1e-05, "loss": 0.5269, "mean_token_accuracy": 0.83641517162323, "num_tokens": 214735210.0, "step": 1347 }, { "epoch": 0.6856561546286877, "grad_norm": 1.0552555322647095, "learning_rate": 1e-05, "loss": 0.5374, "mean_token_accuracy": 0.8334919214248657, "num_tokens": 214902909.0, "step": 1348 }, { "epoch": 0.6861648016276704, "grad_norm": 1.0437448024749756, "learning_rate": 1e-05, "loss": 0.5295, "mean_token_accuracy": 0.8360548615455627, "num_tokens": 215064535.0, "step": 1349 }, { "epoch": 0.6866734486266531, "grad_norm": 0.9955970048904419, "learning_rate": 1e-05, "loss": 0.5288, "mean_token_accuracy": 0.835818886756897, "num_tokens": 215220787.0, "step": 1350 }, { "epoch": 0.6871820956256358, "grad_norm": 1.1820982694625854, "learning_rate": 1e-05, "loss": 0.5509, "mean_token_accuracy": 0.8299815654754639, "num_tokens": 215375205.0, "step": 1351 }, { "epoch": 0.6876907426246185, "grad_norm": 1.0263277292251587, "learning_rate": 1e-05, "loss": 0.5418, "mean_token_accuracy": 0.8360893726348877, "num_tokens": 215532596.0, "step": 1352 }, { "epoch": 0.6881993896236012, "grad_norm": 0.972446084022522, "learning_rate": 1e-05, "loss": 0.54, "mean_token_accuracy": 0.8345771431922913, "num_tokens": 215686435.0, "step": 1353 }, { "epoch": 0.688708036622584, "grad_norm": 1.0651381015777588, "learning_rate": 1e-05, "loss": 0.5407, "mean_token_accuracy": 0.8325681686401367, "num_tokens": 215854411.0, "step": 1354 }, { "epoch": 0.6892166836215666, "grad_norm": 1.0745397806167603, "learning_rate": 1e-05, "loss": 0.5056, "mean_token_accuracy": 0.8420690298080444, "num_tokens": 216016683.0, "step": 1355 }, { "epoch": 0.6897253306205493, "grad_norm": 1.0858376026153564, "learning_rate": 1e-05, "loss": 0.5441, "mean_token_accuracy": 0.8298423886299133, "num_tokens": 216176025.0, "step": 1356 }, { "epoch": 0.6902339776195321, "grad_norm": 1.060082197189331, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.8398188352584839, "num_tokens": 216335904.0, "step": 1357 }, { "epoch": 0.6907426246185148, "grad_norm": 1.125820279121399, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.8364633321762085, "num_tokens": 216490560.0, "step": 1358 }, { "epoch": 0.6912512716174974, "grad_norm": 1.0891369581222534, "learning_rate": 1e-05, "loss": 0.542, "mean_token_accuracy": 0.8322654962539673, "num_tokens": 216636941.0, "step": 1359 }, { "epoch": 0.6917599186164801, "grad_norm": 1.0387393236160278, "learning_rate": 1e-05, "loss": 0.5247, "mean_token_accuracy": 0.8380802869796753, "num_tokens": 216788892.0, "step": 1360 }, { "epoch": 0.6922685656154629, "grad_norm": 1.1280726194381714, "learning_rate": 1e-05, "loss": 0.5075, "mean_token_accuracy": 0.8425440192222595, "num_tokens": 216933108.0, "step": 1361 }, { "epoch": 0.6927772126144456, "grad_norm": 1.1861090660095215, "learning_rate": 1e-05, "loss": 0.5248, "mean_token_accuracy": 0.8362356424331665, "num_tokens": 217091789.0, "step": 1362 }, { "epoch": 0.6932858596134283, "grad_norm": 1.0378928184509277, "learning_rate": 1e-05, "loss": 0.5254, "mean_token_accuracy": 0.8354607820510864, "num_tokens": 217237394.0, "step": 1363 }, { "epoch": 0.6937945066124109, "grad_norm": 1.079342007637024, "learning_rate": 1e-05, "loss": 0.5319, "mean_token_accuracy": 0.8338744044303894, "num_tokens": 217391649.0, "step": 1364 }, { "epoch": 0.6943031536113937, "grad_norm": 1.0112749338150024, "learning_rate": 1e-05, "loss": 0.5069, "mean_token_accuracy": 0.8414908051490784, "num_tokens": 217554324.0, "step": 1365 }, { "epoch": 0.6948118006103764, "grad_norm": 1.1308326721191406, "learning_rate": 1e-05, "loss": 0.5229, "mean_token_accuracy": 0.837146520614624, "num_tokens": 217705676.0, "step": 1366 }, { "epoch": 0.6953204476093591, "grad_norm": 1.031843662261963, "learning_rate": 1e-05, "loss": 0.544, "mean_token_accuracy": 0.8329341411590576, "num_tokens": 217870837.0, "step": 1367 }, { "epoch": 0.6958290946083419, "grad_norm": 1.030326247215271, "learning_rate": 1e-05, "loss": 0.5513, "mean_token_accuracy": 0.830209493637085, "num_tokens": 218034192.0, "step": 1368 }, { "epoch": 0.6963377416073245, "grad_norm": 1.058383822441101, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.8376389741897583, "num_tokens": 218201741.0, "step": 1369 }, { "epoch": 0.6968463886063072, "grad_norm": 1.0227019786834717, "learning_rate": 1e-05, "loss": 0.5384, "mean_token_accuracy": 0.8350127935409546, "num_tokens": 218361880.0, "step": 1370 }, { "epoch": 0.6973550356052899, "grad_norm": 1.0567728281021118, "learning_rate": 1e-05, "loss": 0.5002, "mean_token_accuracy": 0.8430039882659912, "num_tokens": 218507302.0, "step": 1371 }, { "epoch": 0.6978636826042727, "grad_norm": 0.9965653419494629, "learning_rate": 1e-05, "loss": 0.527, "mean_token_accuracy": 0.8355226516723633, "num_tokens": 218677170.0, "step": 1372 }, { "epoch": 0.6983723296032553, "grad_norm": 1.1475521326065063, "learning_rate": 1e-05, "loss": 0.5284, "mean_token_accuracy": 0.8367115259170532, "num_tokens": 218830583.0, "step": 1373 }, { "epoch": 0.698880976602238, "grad_norm": 1.0074726343154907, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8365511894226074, "num_tokens": 218990542.0, "step": 1374 }, { "epoch": 0.6993896236012207, "grad_norm": 1.1430093050003052, "learning_rate": 1e-05, "loss": 0.5373, "mean_token_accuracy": 0.834356427192688, "num_tokens": 219150118.0, "step": 1375 }, { "epoch": 0.6998982706002035, "grad_norm": 1.0113097429275513, "learning_rate": 1e-05, "loss": 0.512, "mean_token_accuracy": 0.8409308195114136, "num_tokens": 219320664.0, "step": 1376 }, { "epoch": 0.7004069175991862, "grad_norm": 1.0833232402801514, "learning_rate": 1e-05, "loss": 0.5412, "mean_token_accuracy": 0.8324387073516846, "num_tokens": 219483168.0, "step": 1377 }, { "epoch": 0.7009155645981688, "grad_norm": 1.008895993232727, "learning_rate": 1e-05, "loss": 0.5301, "mean_token_accuracy": 0.8351591229438782, "num_tokens": 219643460.0, "step": 1378 }, { "epoch": 0.7014242115971516, "grad_norm": 1.0583291053771973, "learning_rate": 1e-05, "loss": 0.5234, "mean_token_accuracy": 0.8379659056663513, "num_tokens": 219801607.0, "step": 1379 }, { "epoch": 0.7019328585961343, "grad_norm": 1.1074832677841187, "learning_rate": 1e-05, "loss": 0.5377, "mean_token_accuracy": 0.834449291229248, "num_tokens": 219953874.0, "step": 1380 }, { "epoch": 0.702441505595117, "grad_norm": 1.0328203439712524, "learning_rate": 1e-05, "loss": 0.4908, "mean_token_accuracy": 0.8464456796646118, "num_tokens": 220101993.0, "step": 1381 }, { "epoch": 0.7029501525940997, "grad_norm": 0.9966577887535095, "learning_rate": 1e-05, "loss": 0.5292, "mean_token_accuracy": 0.8362774848937988, "num_tokens": 220272230.0, "step": 1382 }, { "epoch": 0.7034587995930824, "grad_norm": 0.9913000464439392, "learning_rate": 1e-05, "loss": 0.5257, "mean_token_accuracy": 0.8375822305679321, "num_tokens": 220432579.0, "step": 1383 }, { "epoch": 0.7039674465920651, "grad_norm": 1.0375653505325317, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8354050517082214, "num_tokens": 220588939.0, "step": 1384 }, { "epoch": 0.7044760935910478, "grad_norm": 0.9509556293487549, "learning_rate": 1e-05, "loss": 0.5041, "mean_token_accuracy": 0.8444002270698547, "num_tokens": 220759155.0, "step": 1385 }, { "epoch": 0.7049847405900305, "grad_norm": 0.9617030620574951, "learning_rate": 1e-05, "loss": 0.5248, "mean_token_accuracy": 0.8367959260940552, "num_tokens": 220916843.0, "step": 1386 }, { "epoch": 0.7054933875890133, "grad_norm": 1.019019603729248, "learning_rate": 1e-05, "loss": 0.5339, "mean_token_accuracy": 0.8349019885063171, "num_tokens": 221083858.0, "step": 1387 }, { "epoch": 0.7060020345879959, "grad_norm": 1.065538763999939, "learning_rate": 1e-05, "loss": 0.5757, "mean_token_accuracy": 0.8243120908737183, "num_tokens": 221258649.0, "step": 1388 }, { "epoch": 0.7065106815869786, "grad_norm": 1.0140337944030762, "learning_rate": 1e-05, "loss": 0.5384, "mean_token_accuracy": 0.832329273223877, "num_tokens": 221425144.0, "step": 1389 }, { "epoch": 0.7070193285859614, "grad_norm": 1.085599660873413, "learning_rate": 1e-05, "loss": 0.5375, "mean_token_accuracy": 0.834128201007843, "num_tokens": 221583215.0, "step": 1390 }, { "epoch": 0.7075279755849441, "grad_norm": 0.9903988242149353, "learning_rate": 1e-05, "loss": 0.542, "mean_token_accuracy": 0.8328906893730164, "num_tokens": 221755255.0, "step": 1391 }, { "epoch": 0.7080366225839267, "grad_norm": 1.087711215019226, "learning_rate": 1e-05, "loss": 0.5549, "mean_token_accuracy": 0.8285048007965088, "num_tokens": 221923372.0, "step": 1392 }, { "epoch": 0.7085452695829094, "grad_norm": 0.9438535571098328, "learning_rate": 1e-05, "loss": 0.5308, "mean_token_accuracy": 0.8352342247962952, "num_tokens": 222080701.0, "step": 1393 }, { "epoch": 0.7090539165818922, "grad_norm": 0.9913089871406555, "learning_rate": 1e-05, "loss": 0.4932, "mean_token_accuracy": 0.8428090214729309, "num_tokens": 222233287.0, "step": 1394 }, { "epoch": 0.7095625635808749, "grad_norm": 1.000691294670105, "learning_rate": 1e-05, "loss": 0.5478, "mean_token_accuracy": 0.8302925825119019, "num_tokens": 222399597.0, "step": 1395 }, { "epoch": 0.7100712105798576, "grad_norm": 1.1176949739456177, "learning_rate": 1e-05, "loss": 0.5346, "mean_token_accuracy": 0.8353996276855469, "num_tokens": 222549595.0, "step": 1396 }, { "epoch": 0.7105798575788402, "grad_norm": 0.968652069568634, "learning_rate": 1e-05, "loss": 0.5561, "mean_token_accuracy": 0.8303560018539429, "num_tokens": 222711422.0, "step": 1397 }, { "epoch": 0.711088504577823, "grad_norm": 1.0685300827026367, "learning_rate": 1e-05, "loss": 0.5077, "mean_token_accuracy": 0.8400074243545532, "num_tokens": 222873104.0, "step": 1398 }, { "epoch": 0.7115971515768057, "grad_norm": 1.0692155361175537, "learning_rate": 1e-05, "loss": 0.5379, "mean_token_accuracy": 0.8347507119178772, "num_tokens": 223020625.0, "step": 1399 }, { "epoch": 0.7121057985757884, "grad_norm": 1.057916283607483, "learning_rate": 1e-05, "loss": 0.5137, "mean_token_accuracy": 0.8409354090690613, "num_tokens": 223175189.0, "step": 1400 }, { "epoch": 0.7126144455747712, "grad_norm": 1.0396839380264282, "learning_rate": 1e-05, "loss": 0.5184, "mean_token_accuracy": 0.836387038230896, "num_tokens": 223335073.0, "step": 1401 }, { "epoch": 0.7131230925737538, "grad_norm": 0.9966652393341064, "learning_rate": 1e-05, "loss": 0.5288, "mean_token_accuracy": 0.8362771272659302, "num_tokens": 223499553.0, "step": 1402 }, { "epoch": 0.7136317395727365, "grad_norm": 1.064444899559021, "learning_rate": 1e-05, "loss": 0.5123, "mean_token_accuracy": 0.8392417430877686, "num_tokens": 223657911.0, "step": 1403 }, { "epoch": 0.7141403865717192, "grad_norm": 1.1067264080047607, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8337592482566833, "num_tokens": 223808765.0, "step": 1404 }, { "epoch": 0.714649033570702, "grad_norm": 1.0045108795166016, "learning_rate": 1e-05, "loss": 0.5086, "mean_token_accuracy": 0.8400107622146606, "num_tokens": 223974533.0, "step": 1405 }, { "epoch": 0.7151576805696847, "grad_norm": 1.034972906112671, "learning_rate": 1e-05, "loss": 0.4909, "mean_token_accuracy": 0.8464032411575317, "num_tokens": 224133481.0, "step": 1406 }, { "epoch": 0.7156663275686673, "grad_norm": 1.1038155555725098, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8401011824607849, "num_tokens": 224282640.0, "step": 1407 }, { "epoch": 0.71617497456765, "grad_norm": 1.0537794828414917, "learning_rate": 1e-05, "loss": 0.5516, "mean_token_accuracy": 0.8289327621459961, "num_tokens": 224447298.0, "step": 1408 }, { "epoch": 0.7166836215666328, "grad_norm": 1.0741024017333984, "learning_rate": 1e-05, "loss": 0.5102, "mean_token_accuracy": 0.8410897254943848, "num_tokens": 224596045.0, "step": 1409 }, { "epoch": 0.7171922685656155, "grad_norm": 1.037316083908081, "learning_rate": 1e-05, "loss": 0.5357, "mean_token_accuracy": 0.8368381261825562, "num_tokens": 224757199.0, "step": 1410 }, { "epoch": 0.7177009155645981, "grad_norm": 1.006191372871399, "learning_rate": 1e-05, "loss": 0.5133, "mean_token_accuracy": 0.8390223383903503, "num_tokens": 224913566.0, "step": 1411 }, { "epoch": 0.7182095625635809, "grad_norm": 1.0501819849014282, "learning_rate": 1e-05, "loss": 0.5285, "mean_token_accuracy": 0.8382315039634705, "num_tokens": 225078638.0, "step": 1412 }, { "epoch": 0.7187182095625636, "grad_norm": 0.9292073845863342, "learning_rate": 1e-05, "loss": 0.5319, "mean_token_accuracy": 0.8357728123664856, "num_tokens": 225242585.0, "step": 1413 }, { "epoch": 0.7192268565615463, "grad_norm": 1.1309953927993774, "learning_rate": 1e-05, "loss": 0.503, "mean_token_accuracy": 0.8418674468994141, "num_tokens": 225402143.0, "step": 1414 }, { "epoch": 0.719735503560529, "grad_norm": 1.0340876579284668, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.83051598072052, "num_tokens": 225561481.0, "step": 1415 }, { "epoch": 0.7202441505595117, "grad_norm": 1.0265415906906128, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.8427028059959412, "num_tokens": 225720513.0, "step": 1416 }, { "epoch": 0.7207527975584944, "grad_norm": 1.1584059000015259, "learning_rate": 1e-05, "loss": 0.5592, "mean_token_accuracy": 0.8283109664916992, "num_tokens": 225873349.0, "step": 1417 }, { "epoch": 0.7212614445574771, "grad_norm": 1.0035206079483032, "learning_rate": 1e-05, "loss": 0.5073, "mean_token_accuracy": 0.8412970900535583, "num_tokens": 226039357.0, "step": 1418 }, { "epoch": 0.7217700915564598, "grad_norm": 1.0063248872756958, "learning_rate": 1e-05, "loss": 0.5033, "mean_token_accuracy": 0.8425672054290771, "num_tokens": 226189348.0, "step": 1419 }, { "epoch": 0.7222787385554426, "grad_norm": 1.0380918979644775, "learning_rate": 1e-05, "loss": 0.5352, "mean_token_accuracy": 0.8346319198608398, "num_tokens": 226335920.0, "step": 1420 }, { "epoch": 0.7227873855544252, "grad_norm": 1.1460973024368286, "learning_rate": 1e-05, "loss": 0.5388, "mean_token_accuracy": 0.8333281874656677, "num_tokens": 226495005.0, "step": 1421 }, { "epoch": 0.7232960325534079, "grad_norm": 1.0217316150665283, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8454707860946655, "num_tokens": 226657995.0, "step": 1422 }, { "epoch": 0.7238046795523907, "grad_norm": 1.2179800271987915, "learning_rate": 1e-05, "loss": 0.5315, "mean_token_accuracy": 0.836176335811615, "num_tokens": 226828202.0, "step": 1423 }, { "epoch": 0.7243133265513734, "grad_norm": 1.0598959922790527, "learning_rate": 1e-05, "loss": 0.542, "mean_token_accuracy": 0.8340860605239868, "num_tokens": 226982248.0, "step": 1424 }, { "epoch": 0.724821973550356, "grad_norm": 1.1656885147094727, "learning_rate": 1e-05, "loss": 0.5318, "mean_token_accuracy": 0.8348649144172668, "num_tokens": 227129474.0, "step": 1425 }, { "epoch": 0.7253306205493387, "grad_norm": 1.257928490638733, "learning_rate": 1e-05, "loss": 0.5091, "mean_token_accuracy": 0.8404781818389893, "num_tokens": 227274613.0, "step": 1426 }, { "epoch": 0.7258392675483215, "grad_norm": 1.0831841230392456, "learning_rate": 1e-05, "loss": 0.5461, "mean_token_accuracy": 0.8328593373298645, "num_tokens": 227440413.0, "step": 1427 }, { "epoch": 0.7263479145473042, "grad_norm": 1.2265989780426025, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.8368296027183533, "num_tokens": 227605134.0, "step": 1428 }, { "epoch": 0.7268565615462869, "grad_norm": 0.9749830365180969, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8399536609649658, "num_tokens": 227761956.0, "step": 1429 }, { "epoch": 0.7273652085452695, "grad_norm": 1.2150827646255493, "learning_rate": 1e-05, "loss": 0.5796, "mean_token_accuracy": 0.8240715265274048, "num_tokens": 227922627.0, "step": 1430 }, { "epoch": 0.7278738555442523, "grad_norm": 1.0878562927246094, "learning_rate": 1e-05, "loss": 0.5264, "mean_token_accuracy": 0.835365891456604, "num_tokens": 228080800.0, "step": 1431 }, { "epoch": 0.728382502543235, "grad_norm": 1.1435154676437378, "learning_rate": 1e-05, "loss": 0.5195, "mean_token_accuracy": 0.8392224311828613, "num_tokens": 228242141.0, "step": 1432 }, { "epoch": 0.7288911495422177, "grad_norm": 1.3589378595352173, "learning_rate": 1e-05, "loss": 0.5227, "mean_token_accuracy": 0.8367589712142944, "num_tokens": 228387653.0, "step": 1433 }, { "epoch": 0.7293997965412004, "grad_norm": 1.0286848545074463, "learning_rate": 1e-05, "loss": 0.5023, "mean_token_accuracy": 0.8427996635437012, "num_tokens": 228545473.0, "step": 1434 }, { "epoch": 0.7299084435401831, "grad_norm": 1.1867001056671143, "learning_rate": 1e-05, "loss": 0.5364, "mean_token_accuracy": 0.8338367938995361, "num_tokens": 228713308.0, "step": 1435 }, { "epoch": 0.7304170905391658, "grad_norm": 1.0601273775100708, "learning_rate": 1e-05, "loss": 0.5541, "mean_token_accuracy": 0.8306837677955627, "num_tokens": 228884216.0, "step": 1436 }, { "epoch": 0.7309257375381485, "grad_norm": 1.101950764656067, "learning_rate": 1e-05, "loss": 0.5667, "mean_token_accuracy": 0.8280445337295532, "num_tokens": 229029827.0, "step": 1437 }, { "epoch": 0.7314343845371313, "grad_norm": 1.1425142288208008, "learning_rate": 1e-05, "loss": 0.5437, "mean_token_accuracy": 0.8314026594161987, "num_tokens": 229181920.0, "step": 1438 }, { "epoch": 0.731943031536114, "grad_norm": 0.9540426731109619, "learning_rate": 1e-05, "loss": 0.5452, "mean_token_accuracy": 0.8314418792724609, "num_tokens": 229349488.0, "step": 1439 }, { "epoch": 0.7324516785350966, "grad_norm": 1.516485333442688, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8390694260597229, "num_tokens": 229505884.0, "step": 1440 }, { "epoch": 0.7329603255340793, "grad_norm": 1.3585577011108398, "learning_rate": 1e-05, "loss": 0.5283, "mean_token_accuracy": 0.8348572254180908, "num_tokens": 229673440.0, "step": 1441 }, { "epoch": 0.7334689725330621, "grad_norm": 1.0918457508087158, "learning_rate": 1e-05, "loss": 0.516, "mean_token_accuracy": 0.8398845195770264, "num_tokens": 229844345.0, "step": 1442 }, { "epoch": 0.7339776195320448, "grad_norm": 1.2803473472595215, "learning_rate": 1e-05, "loss": 0.5754, "mean_token_accuracy": 0.8247080445289612, "num_tokens": 230001344.0, "step": 1443 }, { "epoch": 0.7344862665310274, "grad_norm": 1.09883451461792, "learning_rate": 1e-05, "loss": 0.5085, "mean_token_accuracy": 0.8424191474914551, "num_tokens": 230160760.0, "step": 1444 }, { "epoch": 0.7349949135300101, "grad_norm": 1.2438838481903076, "learning_rate": 1e-05, "loss": 0.5198, "mean_token_accuracy": 0.8385148644447327, "num_tokens": 230318252.0, "step": 1445 }, { "epoch": 0.7355035605289929, "grad_norm": 1.1230305433273315, "learning_rate": 1e-05, "loss": 0.534, "mean_token_accuracy": 0.8361711502075195, "num_tokens": 230480542.0, "step": 1446 }, { "epoch": 0.7360122075279756, "grad_norm": 1.1084116697311401, "learning_rate": 1e-05, "loss": 0.5423, "mean_token_accuracy": 0.8342896103858948, "num_tokens": 230631226.0, "step": 1447 }, { "epoch": 0.7365208545269583, "grad_norm": 1.1472543478012085, "learning_rate": 1e-05, "loss": 0.5009, "mean_token_accuracy": 0.8427757024765015, "num_tokens": 230793950.0, "step": 1448 }, { "epoch": 0.737029501525941, "grad_norm": 1.0803556442260742, "learning_rate": 1e-05, "loss": 0.5749, "mean_token_accuracy": 0.8247697949409485, "num_tokens": 230949258.0, "step": 1449 }, { "epoch": 0.7375381485249237, "grad_norm": 1.0975403785705566, "learning_rate": 1e-05, "loss": 0.5154, "mean_token_accuracy": 0.8405905961990356, "num_tokens": 231116995.0, "step": 1450 }, { "epoch": 0.7380467955239064, "grad_norm": 0.9167547821998596, "learning_rate": 1e-05, "loss": 0.4993, "mean_token_accuracy": 0.8458133339881897, "num_tokens": 231286840.0, "step": 1451 }, { "epoch": 0.7385554425228891, "grad_norm": 1.0844895839691162, "learning_rate": 1e-05, "loss": 0.5208, "mean_token_accuracy": 0.8365471959114075, "num_tokens": 231446504.0, "step": 1452 }, { "epoch": 0.7390640895218719, "grad_norm": 0.9481300711631775, "learning_rate": 1e-05, "loss": 0.5508, "mean_token_accuracy": 0.8306936025619507, "num_tokens": 231614914.0, "step": 1453 }, { "epoch": 0.7395727365208545, "grad_norm": 1.0026079416275024, "learning_rate": 1e-05, "loss": 0.5209, "mean_token_accuracy": 0.8390685319900513, "num_tokens": 231772874.0, "step": 1454 }, { "epoch": 0.7400813835198372, "grad_norm": 0.9564797282218933, "learning_rate": 1e-05, "loss": 0.5323, "mean_token_accuracy": 0.834564208984375, "num_tokens": 231937051.0, "step": 1455 }, { "epoch": 0.7405900305188199, "grad_norm": 1.0185620784759521, "learning_rate": 1e-05, "loss": 0.517, "mean_token_accuracy": 0.8383815884590149, "num_tokens": 232087611.0, "step": 1456 }, { "epoch": 0.7410986775178027, "grad_norm": 1.0209577083587646, "learning_rate": 1e-05, "loss": 0.5688, "mean_token_accuracy": 0.8257862329483032, "num_tokens": 232246891.0, "step": 1457 }, { "epoch": 0.7416073245167853, "grad_norm": 0.9779701232910156, "learning_rate": 1e-05, "loss": 0.5092, "mean_token_accuracy": 0.8411809206008911, "num_tokens": 232405181.0, "step": 1458 }, { "epoch": 0.742115971515768, "grad_norm": 0.9819640517234802, "learning_rate": 1e-05, "loss": 0.5713, "mean_token_accuracy": 0.8215596675872803, "num_tokens": 232569215.0, "step": 1459 }, { "epoch": 0.7426246185147508, "grad_norm": 0.9781466126441956, "learning_rate": 1e-05, "loss": 0.5015, "mean_token_accuracy": 0.8428027629852295, "num_tokens": 232730814.0, "step": 1460 }, { "epoch": 0.7431332655137335, "grad_norm": 1.0607805252075195, "learning_rate": 1e-05, "loss": 0.5266, "mean_token_accuracy": 0.8380945324897766, "num_tokens": 232892427.0, "step": 1461 }, { "epoch": 0.7436419125127162, "grad_norm": 1.0117738246917725, "learning_rate": 1e-05, "loss": 0.5172, "mean_token_accuracy": 0.8377254009246826, "num_tokens": 233062413.0, "step": 1462 }, { "epoch": 0.7441505595116988, "grad_norm": 1.094952940940857, "learning_rate": 1e-05, "loss": 0.5061, "mean_token_accuracy": 0.8411229848861694, "num_tokens": 233223449.0, "step": 1463 }, { "epoch": 0.7446592065106816, "grad_norm": 1.0199710130691528, "learning_rate": 1e-05, "loss": 0.5188, "mean_token_accuracy": 0.8373159170150757, "num_tokens": 233382314.0, "step": 1464 }, { "epoch": 0.7451678535096643, "grad_norm": 0.9318074584007263, "learning_rate": 1e-05, "loss": 0.5111, "mean_token_accuracy": 0.8436965346336365, "num_tokens": 233549471.0, "step": 1465 }, { "epoch": 0.745676500508647, "grad_norm": 0.9865319728851318, "learning_rate": 1e-05, "loss": 0.5337, "mean_token_accuracy": 0.8344173431396484, "num_tokens": 233714244.0, "step": 1466 }, { "epoch": 0.7461851475076297, "grad_norm": 1.060027837753296, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8394288420677185, "num_tokens": 233873031.0, "step": 1467 }, { "epoch": 0.7466937945066124, "grad_norm": 0.9694737195968628, "learning_rate": 1e-05, "loss": 0.5047, "mean_token_accuracy": 0.8412796854972839, "num_tokens": 234028924.0, "step": 1468 }, { "epoch": 0.7472024415055951, "grad_norm": 1.07378089427948, "learning_rate": 1e-05, "loss": 0.5137, "mean_token_accuracy": 0.8402303457260132, "num_tokens": 234187316.0, "step": 1469 }, { "epoch": 0.7477110885045778, "grad_norm": 1.0076998472213745, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8435081243515015, "num_tokens": 234352052.0, "step": 1470 }, { "epoch": 0.7482197355035606, "grad_norm": 1.1248302459716797, "learning_rate": 1e-05, "loss": 0.5054, "mean_token_accuracy": 0.8418548703193665, "num_tokens": 234522725.0, "step": 1471 }, { "epoch": 0.7487283825025433, "grad_norm": 1.0523717403411865, "learning_rate": 1e-05, "loss": 0.5446, "mean_token_accuracy": 0.8295474052429199, "num_tokens": 234685164.0, "step": 1472 }, { "epoch": 0.7492370295015259, "grad_norm": 1.009028673171997, "learning_rate": 1e-05, "loss": 0.5101, "mean_token_accuracy": 0.8410037755966187, "num_tokens": 234844848.0, "step": 1473 }, { "epoch": 0.7497456765005086, "grad_norm": 1.1979455947875977, "learning_rate": 1e-05, "loss": 0.497, "mean_token_accuracy": 0.8434077501296997, "num_tokens": 234999779.0, "step": 1474 }, { "epoch": 0.7502543234994914, "grad_norm": 1.081596851348877, "learning_rate": 1e-05, "loss": 0.5912, "mean_token_accuracy": 0.8188192844390869, "num_tokens": 235162263.0, "step": 1475 }, { "epoch": 0.7507629704984741, "grad_norm": 1.0012668371200562, "learning_rate": 1e-05, "loss": 0.5698, "mean_token_accuracy": 0.8237411975860596, "num_tokens": 235319603.0, "step": 1476 }, { "epoch": 0.7512716174974567, "grad_norm": 1.0081019401550293, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.8393720984458923, "num_tokens": 235479108.0, "step": 1477 }, { "epoch": 0.7517802644964394, "grad_norm": 1.0606874227523804, "learning_rate": 1e-05, "loss": 0.5275, "mean_token_accuracy": 0.8372191786766052, "num_tokens": 235635662.0, "step": 1478 }, { "epoch": 0.7522889114954222, "grad_norm": 1.1336381435394287, "learning_rate": 1e-05, "loss": 0.5402, "mean_token_accuracy": 0.8327375054359436, "num_tokens": 235778848.0, "step": 1479 }, { "epoch": 0.7527975584944049, "grad_norm": 0.9920240044593811, "learning_rate": 1e-05, "loss": 0.5048, "mean_token_accuracy": 0.8434538841247559, "num_tokens": 235935563.0, "step": 1480 }, { "epoch": 0.7533062054933876, "grad_norm": 1.0288413763046265, "learning_rate": 1e-05, "loss": 0.5292, "mean_token_accuracy": 0.8377621173858643, "num_tokens": 236102712.0, "step": 1481 }, { "epoch": 0.7538148524923703, "grad_norm": 1.0564460754394531, "learning_rate": 1e-05, "loss": 0.5207, "mean_token_accuracy": 0.8376520872116089, "num_tokens": 236247865.0, "step": 1482 }, { "epoch": 0.754323499491353, "grad_norm": 1.0536723136901855, "learning_rate": 1e-05, "loss": 0.527, "mean_token_accuracy": 0.8366076350212097, "num_tokens": 236406459.0, "step": 1483 }, { "epoch": 0.7548321464903357, "grad_norm": 0.9918603301048279, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.8441921472549438, "num_tokens": 236548474.0, "step": 1484 }, { "epoch": 0.7553407934893184, "grad_norm": 1.125241994857788, "learning_rate": 1e-05, "loss": 0.5428, "mean_token_accuracy": 0.8297998905181885, "num_tokens": 236699987.0, "step": 1485 }, { "epoch": 0.7558494404883012, "grad_norm": 1.0547477006912231, "learning_rate": 1e-05, "loss": 0.4975, "mean_token_accuracy": 0.844157338142395, "num_tokens": 236859308.0, "step": 1486 }, { "epoch": 0.7563580874872838, "grad_norm": 1.0355929136276245, "learning_rate": 1e-05, "loss": 0.4996, "mean_token_accuracy": 0.843138575553894, "num_tokens": 237011851.0, "step": 1487 }, { "epoch": 0.7568667344862665, "grad_norm": 1.0818662643432617, "learning_rate": 1e-05, "loss": 0.5279, "mean_token_accuracy": 0.8360995650291443, "num_tokens": 237170473.0, "step": 1488 }, { "epoch": 0.7573753814852492, "grad_norm": 1.2234809398651123, "learning_rate": 1e-05, "loss": 0.5049, "mean_token_accuracy": 0.8414808511734009, "num_tokens": 237317583.0, "step": 1489 }, { "epoch": 0.757884028484232, "grad_norm": 1.0319559574127197, "learning_rate": 1e-05, "loss": 0.5018, "mean_token_accuracy": 0.8432955145835876, "num_tokens": 237483382.0, "step": 1490 }, { "epoch": 0.7583926754832147, "grad_norm": 1.0208332538604736, "learning_rate": 1e-05, "loss": 0.5151, "mean_token_accuracy": 0.8389483094215393, "num_tokens": 237642857.0, "step": 1491 }, { "epoch": 0.7589013224821973, "grad_norm": 1.0073045492172241, "learning_rate": 1e-05, "loss": 0.5174, "mean_token_accuracy": 0.8391669988632202, "num_tokens": 237809071.0, "step": 1492 }, { "epoch": 0.7594099694811801, "grad_norm": 1.1355136632919312, "learning_rate": 1e-05, "loss": 0.5396, "mean_token_accuracy": 0.8332517147064209, "num_tokens": 237977918.0, "step": 1493 }, { "epoch": 0.7599186164801628, "grad_norm": 1.0361236333847046, "learning_rate": 1e-05, "loss": 0.537, "mean_token_accuracy": 0.8351563215255737, "num_tokens": 238146507.0, "step": 1494 }, { "epoch": 0.7604272634791455, "grad_norm": 0.9729189872741699, "learning_rate": 1e-05, "loss": 0.5075, "mean_token_accuracy": 0.8422070741653442, "num_tokens": 238317537.0, "step": 1495 }, { "epoch": 0.7609359104781281, "grad_norm": 1.0099557638168335, "learning_rate": 1e-05, "loss": 0.5084, "mean_token_accuracy": 0.8419430255889893, "num_tokens": 238483591.0, "step": 1496 }, { "epoch": 0.7614445574771109, "grad_norm": 0.9618970155715942, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.8457642793655396, "num_tokens": 238641826.0, "step": 1497 }, { "epoch": 0.7619532044760936, "grad_norm": 1.0791717767715454, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8391422033309937, "num_tokens": 238793523.0, "step": 1498 }, { "epoch": 0.7624618514750763, "grad_norm": 1.0534913539886475, "learning_rate": 1e-05, "loss": 0.5243, "mean_token_accuracy": 0.8369989395141602, "num_tokens": 238956488.0, "step": 1499 }, { "epoch": 0.762970498474059, "grad_norm": 0.9715752601623535, "learning_rate": 1e-05, "loss": 0.4962, "mean_token_accuracy": 0.8451597094535828, "num_tokens": 239112968.0, "step": 1500 }, { "epoch": 0.7634791454730417, "grad_norm": 1.042768955230713, "learning_rate": 1e-05, "loss": 0.5518, "mean_token_accuracy": 0.83104407787323, "num_tokens": 239278918.0, "step": 1501 }, { "epoch": 0.7639877924720244, "grad_norm": 1.1038720607757568, "learning_rate": 1e-05, "loss": 0.5063, "mean_token_accuracy": 0.8409518003463745, "num_tokens": 239429305.0, "step": 1502 }, { "epoch": 0.7644964394710071, "grad_norm": 1.003100037574768, "learning_rate": 1e-05, "loss": 0.4916, "mean_token_accuracy": 0.8468797206878662, "num_tokens": 239592603.0, "step": 1503 }, { "epoch": 0.7650050864699899, "grad_norm": 1.1595673561096191, "learning_rate": 1e-05, "loss": 0.5495, "mean_token_accuracy": 0.8297867178916931, "num_tokens": 239745054.0, "step": 1504 }, { "epoch": 0.7655137334689726, "grad_norm": 1.01282799243927, "learning_rate": 1e-05, "loss": 0.5121, "mean_token_accuracy": 0.8400813341140747, "num_tokens": 239909253.0, "step": 1505 }, { "epoch": 0.7660223804679552, "grad_norm": 1.0849213600158691, "learning_rate": 1e-05, "loss": 0.513, "mean_token_accuracy": 0.8397663831710815, "num_tokens": 240051112.0, "step": 1506 }, { "epoch": 0.7665310274669379, "grad_norm": 1.0645639896392822, "learning_rate": 1e-05, "loss": 0.484, "mean_token_accuracy": 0.847365140914917, "num_tokens": 240211700.0, "step": 1507 }, { "epoch": 0.7670396744659207, "grad_norm": 1.019892930984497, "learning_rate": 1e-05, "loss": 0.5217, "mean_token_accuracy": 0.8375338912010193, "num_tokens": 240373170.0, "step": 1508 }, { "epoch": 0.7675483214649034, "grad_norm": 1.1592234373092651, "learning_rate": 1e-05, "loss": 0.5409, "mean_token_accuracy": 0.8331730365753174, "num_tokens": 240518936.0, "step": 1509 }, { "epoch": 0.768056968463886, "grad_norm": 1.040312647819519, "learning_rate": 1e-05, "loss": 0.5215, "mean_token_accuracy": 0.8367902636528015, "num_tokens": 240668912.0, "step": 1510 }, { "epoch": 0.7685656154628687, "grad_norm": 1.066318154335022, "learning_rate": 1e-05, "loss": 0.5073, "mean_token_accuracy": 0.8416951894760132, "num_tokens": 240828027.0, "step": 1511 }, { "epoch": 0.7690742624618515, "grad_norm": 0.9991273283958435, "learning_rate": 1e-05, "loss": 0.5108, "mean_token_accuracy": 0.840943455696106, "num_tokens": 240996115.0, "step": 1512 }, { "epoch": 0.7695829094608342, "grad_norm": 1.1857500076293945, "learning_rate": 1e-05, "loss": 0.5093, "mean_token_accuracy": 0.8417667746543884, "num_tokens": 241154020.0, "step": 1513 }, { "epoch": 0.7700915564598169, "grad_norm": 1.0184717178344727, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8427598476409912, "num_tokens": 241313175.0, "step": 1514 }, { "epoch": 0.7706002034587996, "grad_norm": 1.2196848392486572, "learning_rate": 1e-05, "loss": 0.5209, "mean_token_accuracy": 0.8368412852287292, "num_tokens": 241461371.0, "step": 1515 }, { "epoch": 0.7711088504577823, "grad_norm": 1.0729477405548096, "learning_rate": 1e-05, "loss": 0.5422, "mean_token_accuracy": 0.8319259881973267, "num_tokens": 241631176.0, "step": 1516 }, { "epoch": 0.771617497456765, "grad_norm": 1.0394853353500366, "learning_rate": 1e-05, "loss": 0.5506, "mean_token_accuracy": 0.829657793045044, "num_tokens": 241791904.0, "step": 1517 }, { "epoch": 0.7721261444557477, "grad_norm": 1.1062549352645874, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8426122069358826, "num_tokens": 241954029.0, "step": 1518 }, { "epoch": 0.7726347914547305, "grad_norm": 1.0023213624954224, "learning_rate": 1e-05, "loss": 0.573, "mean_token_accuracy": 0.8251248598098755, "num_tokens": 242106558.0, "step": 1519 }, { "epoch": 0.7731434384537131, "grad_norm": 1.079978585243225, "learning_rate": 1e-05, "loss": 0.536, "mean_token_accuracy": 0.8343393802642822, "num_tokens": 242276624.0, "step": 1520 }, { "epoch": 0.7736520854526958, "grad_norm": 1.0364209413528442, "learning_rate": 1e-05, "loss": 0.529, "mean_token_accuracy": 0.8353438973426819, "num_tokens": 242434754.0, "step": 1521 }, { "epoch": 0.7741607324516785, "grad_norm": 1.0392584800720215, "learning_rate": 1e-05, "loss": 0.4997, "mean_token_accuracy": 0.8424927592277527, "num_tokens": 242576931.0, "step": 1522 }, { "epoch": 0.7746693794506613, "grad_norm": 1.045095443725586, "learning_rate": 1e-05, "loss": 0.5205, "mean_token_accuracy": 0.8376889228820801, "num_tokens": 242733793.0, "step": 1523 }, { "epoch": 0.775178026449644, "grad_norm": 1.132960557937622, "learning_rate": 1e-05, "loss": 0.5181, "mean_token_accuracy": 0.838951587677002, "num_tokens": 242882978.0, "step": 1524 }, { "epoch": 0.7756866734486266, "grad_norm": 1.0570266246795654, "learning_rate": 1e-05, "loss": 0.5178, "mean_token_accuracy": 0.8384246826171875, "num_tokens": 243038462.0, "step": 1525 }, { "epoch": 0.7761953204476093, "grad_norm": 1.1150120496749878, "learning_rate": 1e-05, "loss": 0.5308, "mean_token_accuracy": 0.8351293802261353, "num_tokens": 243206785.0, "step": 1526 }, { "epoch": 0.7767039674465921, "grad_norm": 0.9758465886116028, "learning_rate": 1e-05, "loss": 0.5166, "mean_token_accuracy": 0.839024543762207, "num_tokens": 243357645.0, "step": 1527 }, { "epoch": 0.7772126144455748, "grad_norm": 1.1001802682876587, "learning_rate": 1e-05, "loss": 0.5088, "mean_token_accuracy": 0.8403192758560181, "num_tokens": 243512602.0, "step": 1528 }, { "epoch": 0.7777212614445574, "grad_norm": 1.087286353111267, "learning_rate": 1e-05, "loss": 0.5116, "mean_token_accuracy": 0.839819073677063, "num_tokens": 243687260.0, "step": 1529 }, { "epoch": 0.7782299084435402, "grad_norm": 1.0088173151016235, "learning_rate": 1e-05, "loss": 0.5192, "mean_token_accuracy": 0.8393971920013428, "num_tokens": 243845125.0, "step": 1530 }, { "epoch": 0.7787385554425229, "grad_norm": 1.0728702545166016, "learning_rate": 1e-05, "loss": 0.4801, "mean_token_accuracy": 0.8486957550048828, "num_tokens": 243985343.0, "step": 1531 }, { "epoch": 0.7792472024415056, "grad_norm": 1.0450102090835571, "learning_rate": 1e-05, "loss": 0.5358, "mean_token_accuracy": 0.8348969221115112, "num_tokens": 244139431.0, "step": 1532 }, { "epoch": 0.7797558494404883, "grad_norm": 1.116742730140686, "learning_rate": 1e-05, "loss": 0.5322, "mean_token_accuracy": 0.8358186483383179, "num_tokens": 244296684.0, "step": 1533 }, { "epoch": 0.780264496439471, "grad_norm": 1.0600138902664185, "learning_rate": 1e-05, "loss": 0.527, "mean_token_accuracy": 0.8371978402137756, "num_tokens": 244456341.0, "step": 1534 }, { "epoch": 0.7807731434384537, "grad_norm": 1.1324113607406616, "learning_rate": 1e-05, "loss": 0.4989, "mean_token_accuracy": 0.8429820537567139, "num_tokens": 244612234.0, "step": 1535 }, { "epoch": 0.7812817904374364, "grad_norm": 1.0579875707626343, "learning_rate": 1e-05, "loss": 0.5438, "mean_token_accuracy": 0.8308662176132202, "num_tokens": 244763392.0, "step": 1536 }, { "epoch": 0.7817904374364191, "grad_norm": 1.105968713760376, "learning_rate": 1e-05, "loss": 0.5399, "mean_token_accuracy": 0.8314764499664307, "num_tokens": 244921544.0, "step": 1537 }, { "epoch": 0.7822990844354019, "grad_norm": 1.1011391878128052, "learning_rate": 1e-05, "loss": 0.5108, "mean_token_accuracy": 0.8399180173873901, "num_tokens": 245082597.0, "step": 1538 }, { "epoch": 0.7828077314343845, "grad_norm": 1.0996878147125244, "learning_rate": 1e-05, "loss": 0.5323, "mean_token_accuracy": 0.8340091705322266, "num_tokens": 245229522.0, "step": 1539 }, { "epoch": 0.7833163784333672, "grad_norm": 1.0516421794891357, "learning_rate": 1e-05, "loss": 0.5278, "mean_token_accuracy": 0.835968554019928, "num_tokens": 245379953.0, "step": 1540 }, { "epoch": 0.78382502543235, "grad_norm": 1.0144932270050049, "learning_rate": 1e-05, "loss": 0.5131, "mean_token_accuracy": 0.839628279209137, "num_tokens": 245531845.0, "step": 1541 }, { "epoch": 0.7843336724313327, "grad_norm": 1.0560461282730103, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.8432492613792419, "num_tokens": 245696677.0, "step": 1542 }, { "epoch": 0.7848423194303153, "grad_norm": 1.0136598348617554, "learning_rate": 1e-05, "loss": 0.5166, "mean_token_accuracy": 0.8379706144332886, "num_tokens": 245866942.0, "step": 1543 }, { "epoch": 0.785350966429298, "grad_norm": 1.0351725816726685, "learning_rate": 1e-05, "loss": 0.5417, "mean_token_accuracy": 0.8329663276672363, "num_tokens": 246030786.0, "step": 1544 }, { "epoch": 0.7858596134282808, "grad_norm": 1.1024549007415771, "learning_rate": 1e-05, "loss": 0.5229, "mean_token_accuracy": 0.836799681186676, "num_tokens": 246181272.0, "step": 1545 }, { "epoch": 0.7863682604272635, "grad_norm": 1.1076233386993408, "learning_rate": 1e-05, "loss": 0.4818, "mean_token_accuracy": 0.8464345335960388, "num_tokens": 246332311.0, "step": 1546 }, { "epoch": 0.7868769074262462, "grad_norm": 1.084722876548767, "learning_rate": 1e-05, "loss": 0.5509, "mean_token_accuracy": 0.830653190612793, "num_tokens": 246491365.0, "step": 1547 }, { "epoch": 0.7873855544252288, "grad_norm": 1.0792385339736938, "learning_rate": 1e-05, "loss": 0.4855, "mean_token_accuracy": 0.8466219902038574, "num_tokens": 246646398.0, "step": 1548 }, { "epoch": 0.7878942014242116, "grad_norm": 1.161864995956421, "learning_rate": 1e-05, "loss": 0.5148, "mean_token_accuracy": 0.83869469165802, "num_tokens": 246806249.0, "step": 1549 }, { "epoch": 0.7884028484231943, "grad_norm": 1.0989301204681396, "learning_rate": 1e-05, "loss": 0.4808, "mean_token_accuracy": 0.8468911051750183, "num_tokens": 246957754.0, "step": 1550 }, { "epoch": 0.788911495422177, "grad_norm": 0.945274829864502, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8512423038482666, "num_tokens": 247122388.0, "step": 1551 }, { "epoch": 0.7894201424211598, "grad_norm": 1.0815494060516357, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8495436310768127, "num_tokens": 247284678.0, "step": 1552 }, { "epoch": 0.7899287894201424, "grad_norm": 1.0339261293411255, "learning_rate": 1e-05, "loss": 0.5423, "mean_token_accuracy": 0.8329569101333618, "num_tokens": 247442396.0, "step": 1553 }, { "epoch": 0.7904374364191251, "grad_norm": 3.8189172744750977, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.8405753970146179, "num_tokens": 247601438.0, "step": 1554 }, { "epoch": 0.7909460834181078, "grad_norm": 1.2736577987670898, "learning_rate": 1e-05, "loss": 0.5051, "mean_token_accuracy": 0.8410643339157104, "num_tokens": 247766929.0, "step": 1555 }, { "epoch": 0.7914547304170906, "grad_norm": 1.0385481119155884, "learning_rate": 1e-05, "loss": 0.533, "mean_token_accuracy": 0.835472583770752, "num_tokens": 247935788.0, "step": 1556 }, { "epoch": 0.7919633774160733, "grad_norm": 1.1767652034759521, "learning_rate": 1e-05, "loss": 0.499, "mean_token_accuracy": 0.845313310623169, "num_tokens": 248102197.0, "step": 1557 }, { "epoch": 0.7924720244150559, "grad_norm": 1.1459380388259888, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.8455806374549866, "num_tokens": 248252748.0, "step": 1558 }, { "epoch": 0.7929806714140386, "grad_norm": 1.030868649482727, "learning_rate": 1e-05, "loss": 0.5086, "mean_token_accuracy": 0.8415130376815796, "num_tokens": 248410690.0, "step": 1559 }, { "epoch": 0.7934893184130214, "grad_norm": 1.288917899131775, "learning_rate": 1e-05, "loss": 0.5049, "mean_token_accuracy": 0.8433955907821655, "num_tokens": 248562629.0, "step": 1560 }, { "epoch": 0.7939979654120041, "grad_norm": 1.0981899499893188, "learning_rate": 1e-05, "loss": 0.5357, "mean_token_accuracy": 0.8328940868377686, "num_tokens": 248724426.0, "step": 1561 }, { "epoch": 0.7945066124109867, "grad_norm": 1.112394094467163, "learning_rate": 1e-05, "loss": 0.533, "mean_token_accuracy": 0.8356070518493652, "num_tokens": 248884711.0, "step": 1562 }, { "epoch": 0.7950152594099695, "grad_norm": 1.0571964979171753, "learning_rate": 1e-05, "loss": 0.5299, "mean_token_accuracy": 0.8359705209732056, "num_tokens": 249039319.0, "step": 1563 }, { "epoch": 0.7955239064089522, "grad_norm": 1.1130516529083252, "learning_rate": 1e-05, "loss": 0.5171, "mean_token_accuracy": 0.8393397927284241, "num_tokens": 249198668.0, "step": 1564 }, { "epoch": 0.7960325534079349, "grad_norm": 1.1128276586532593, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.837859570980072, "num_tokens": 249355386.0, "step": 1565 }, { "epoch": 0.7965412004069176, "grad_norm": 1.033722162246704, "learning_rate": 1e-05, "loss": 0.5328, "mean_token_accuracy": 0.8348727226257324, "num_tokens": 249506726.0, "step": 1566 }, { "epoch": 0.7970498474059003, "grad_norm": 1.0181803703308105, "learning_rate": 1e-05, "loss": 0.5501, "mean_token_accuracy": 0.830032467842102, "num_tokens": 249668433.0, "step": 1567 }, { "epoch": 0.797558494404883, "grad_norm": 1.0626704692840576, "learning_rate": 1e-05, "loss": 0.4986, "mean_token_accuracy": 0.8434553146362305, "num_tokens": 249815595.0, "step": 1568 }, { "epoch": 0.7980671414038657, "grad_norm": 1.022221565246582, "learning_rate": 1e-05, "loss": 0.5164, "mean_token_accuracy": 0.837891697883606, "num_tokens": 249977103.0, "step": 1569 }, { "epoch": 0.7985757884028484, "grad_norm": 0.9750449657440186, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8367565870285034, "num_tokens": 250145992.0, "step": 1570 }, { "epoch": 0.7990844354018312, "grad_norm": 1.1764782667160034, "learning_rate": 1e-05, "loss": 0.5263, "mean_token_accuracy": 0.8365598320960999, "num_tokens": 250280246.0, "step": 1571 }, { "epoch": 0.7995930824008138, "grad_norm": 0.9848682880401611, "learning_rate": 1e-05, "loss": 0.5299, "mean_token_accuracy": 0.8356618881225586, "num_tokens": 250443792.0, "step": 1572 }, { "epoch": 0.8001017293997965, "grad_norm": 0.9653362035751343, "learning_rate": 1e-05, "loss": 0.5148, "mean_token_accuracy": 0.8378146290779114, "num_tokens": 250599631.0, "step": 1573 }, { "epoch": 0.8006103763987793, "grad_norm": 1.061545968055725, "learning_rate": 1e-05, "loss": 0.5423, "mean_token_accuracy": 0.8325547575950623, "num_tokens": 250747877.0, "step": 1574 }, { "epoch": 0.801119023397762, "grad_norm": 1.0658345222473145, "learning_rate": 1e-05, "loss": 0.5348, "mean_token_accuracy": 0.833031952381134, "num_tokens": 250900217.0, "step": 1575 }, { "epoch": 0.8016276703967447, "grad_norm": 0.9847412109375, "learning_rate": 1e-05, "loss": 0.4963, "mean_token_accuracy": 0.8432402610778809, "num_tokens": 251052502.0, "step": 1576 }, { "epoch": 0.8021363173957273, "grad_norm": 0.9033166170120239, "learning_rate": 1e-05, "loss": 0.4988, "mean_token_accuracy": 0.8441382050514221, "num_tokens": 251208564.0, "step": 1577 }, { "epoch": 0.8026449643947101, "grad_norm": 1.023217797279358, "learning_rate": 1e-05, "loss": 0.5155, "mean_token_accuracy": 0.8398065567016602, "num_tokens": 251352775.0, "step": 1578 }, { "epoch": 0.8031536113936928, "grad_norm": 0.9612480401992798, "learning_rate": 1e-05, "loss": 0.5119, "mean_token_accuracy": 0.8414446711540222, "num_tokens": 251518171.0, "step": 1579 }, { "epoch": 0.8036622583926755, "grad_norm": 0.9663166999816895, "learning_rate": 1e-05, "loss": 0.5203, "mean_token_accuracy": 0.8399222493171692, "num_tokens": 251677652.0, "step": 1580 }, { "epoch": 0.8041709053916581, "grad_norm": 1.036909580230713, "learning_rate": 1e-05, "loss": 0.5303, "mean_token_accuracy": 0.8345316648483276, "num_tokens": 251821333.0, "step": 1581 }, { "epoch": 0.8046795523906409, "grad_norm": 0.984411358833313, "learning_rate": 1e-05, "loss": 0.575, "mean_token_accuracy": 0.8244682550430298, "num_tokens": 251995232.0, "step": 1582 }, { "epoch": 0.8051881993896236, "grad_norm": 0.9926067590713501, "learning_rate": 1e-05, "loss": 0.5079, "mean_token_accuracy": 0.8403122425079346, "num_tokens": 252151382.0, "step": 1583 }, { "epoch": 0.8056968463886063, "grad_norm": 0.9969213008880615, "learning_rate": 1e-05, "loss": 0.5407, "mean_token_accuracy": 0.8337632417678833, "num_tokens": 252318139.0, "step": 1584 }, { "epoch": 0.8062054933875891, "grad_norm": 1.0000914335250854, "learning_rate": 1e-05, "loss": 0.5066, "mean_token_accuracy": 0.8416249752044678, "num_tokens": 252479916.0, "step": 1585 }, { "epoch": 0.8067141403865717, "grad_norm": 0.9684925079345703, "learning_rate": 1e-05, "loss": 0.5192, "mean_token_accuracy": 0.8398261070251465, "num_tokens": 252655015.0, "step": 1586 }, { "epoch": 0.8072227873855544, "grad_norm": 1.0065932273864746, "learning_rate": 1e-05, "loss": 0.5135, "mean_token_accuracy": 0.8411272168159485, "num_tokens": 252810669.0, "step": 1587 }, { "epoch": 0.8077314343845371, "grad_norm": 1.0063505172729492, "learning_rate": 1e-05, "loss": 0.5104, "mean_token_accuracy": 0.8413547873497009, "num_tokens": 252975872.0, "step": 1588 }, { "epoch": 0.8082400813835199, "grad_norm": 1.00690495967865, "learning_rate": 1e-05, "loss": 0.5231, "mean_token_accuracy": 0.8373821377754211, "num_tokens": 253140927.0, "step": 1589 }, { "epoch": 0.8087487283825026, "grad_norm": 1.0648139715194702, "learning_rate": 1e-05, "loss": 0.5302, "mean_token_accuracy": 0.8346395492553711, "num_tokens": 253298562.0, "step": 1590 }, { "epoch": 0.8092573753814852, "grad_norm": 0.9983528256416321, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.836251974105835, "num_tokens": 253469245.0, "step": 1591 }, { "epoch": 0.8097660223804679, "grad_norm": 0.987586259841919, "learning_rate": 1e-05, "loss": 0.4635, "mean_token_accuracy": 0.8542689085006714, "num_tokens": 253623711.0, "step": 1592 }, { "epoch": 0.8102746693794507, "grad_norm": 1.0418356657028198, "learning_rate": 1e-05, "loss": 0.5004, "mean_token_accuracy": 0.8439185619354248, "num_tokens": 253793013.0, "step": 1593 }, { "epoch": 0.8107833163784334, "grad_norm": 1.1564536094665527, "learning_rate": 1e-05, "loss": 0.556, "mean_token_accuracy": 0.827933669090271, "num_tokens": 253951728.0, "step": 1594 }, { "epoch": 0.811291963377416, "grad_norm": 1.0996955633163452, "learning_rate": 1e-05, "loss": 0.5163, "mean_token_accuracy": 0.8392601013183594, "num_tokens": 254107635.0, "step": 1595 }, { "epoch": 0.8118006103763988, "grad_norm": 1.0114479064941406, "learning_rate": 1e-05, "loss": 0.5291, "mean_token_accuracy": 0.8350151777267456, "num_tokens": 254256769.0, "step": 1596 }, { "epoch": 0.8123092573753815, "grad_norm": 1.1125953197479248, "learning_rate": 1e-05, "loss": 0.5573, "mean_token_accuracy": 0.8287286162376404, "num_tokens": 254426121.0, "step": 1597 }, { "epoch": 0.8128179043743642, "grad_norm": 0.9474158883094788, "learning_rate": 1e-05, "loss": 0.5213, "mean_token_accuracy": 0.8386734127998352, "num_tokens": 254592698.0, "step": 1598 }, { "epoch": 0.8133265513733469, "grad_norm": 0.9946539998054504, "learning_rate": 1e-05, "loss": 0.5251, "mean_token_accuracy": 0.8364081382751465, "num_tokens": 254757687.0, "step": 1599 }, { "epoch": 0.8138351983723296, "grad_norm": 1.0554031133651733, "learning_rate": 1e-05, "loss": 0.5254, "mean_token_accuracy": 0.8361053466796875, "num_tokens": 254907582.0, "step": 1600 }, { "epoch": 0.8143438453713123, "grad_norm": 1.0031455755233765, "learning_rate": 1e-05, "loss": 0.5018, "mean_token_accuracy": 0.8430084586143494, "num_tokens": 255060864.0, "step": 1601 }, { "epoch": 0.814852492370295, "grad_norm": 1.090097427368164, "learning_rate": 1e-05, "loss": 0.4965, "mean_token_accuracy": 0.8441165685653687, "num_tokens": 255220612.0, "step": 1602 }, { "epoch": 0.8153611393692777, "grad_norm": 1.1100863218307495, "learning_rate": 1e-05, "loss": 0.5731, "mean_token_accuracy": 0.8228561878204346, "num_tokens": 255381676.0, "step": 1603 }, { "epoch": 0.8158697863682605, "grad_norm": 1.1037805080413818, "learning_rate": 1e-05, "loss": 0.4941, "mean_token_accuracy": 0.8476219177246094, "num_tokens": 255530491.0, "step": 1604 }, { "epoch": 0.8163784333672431, "grad_norm": 1.0004202127456665, "learning_rate": 1e-05, "loss": 0.5148, "mean_token_accuracy": 0.840594470500946, "num_tokens": 255693669.0, "step": 1605 }, { "epoch": 0.8168870803662258, "grad_norm": 1.0348905324935913, "learning_rate": 1e-05, "loss": 0.5237, "mean_token_accuracy": 0.8360775709152222, "num_tokens": 255861693.0, "step": 1606 }, { "epoch": 0.8173957273652085, "grad_norm": 1.048621416091919, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8353121876716614, "num_tokens": 256028864.0, "step": 1607 }, { "epoch": 0.8179043743641913, "grad_norm": 1.1196389198303223, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8500303626060486, "num_tokens": 256184434.0, "step": 1608 }, { "epoch": 0.818413021363174, "grad_norm": 1.0022071599960327, "learning_rate": 1e-05, "loss": 0.484, "mean_token_accuracy": 0.847318708896637, "num_tokens": 256342000.0, "step": 1609 }, { "epoch": 0.8189216683621566, "grad_norm": 1.1261740922927856, "learning_rate": 1e-05, "loss": 0.5286, "mean_token_accuracy": 0.8349587917327881, "num_tokens": 256517073.0, "step": 1610 }, { "epoch": 0.8194303153611394, "grad_norm": 1.0318447351455688, "learning_rate": 1e-05, "loss": 0.4905, "mean_token_accuracy": 0.8450832366943359, "num_tokens": 256674055.0, "step": 1611 }, { "epoch": 0.8199389623601221, "grad_norm": 1.1506813764572144, "learning_rate": 1e-05, "loss": 0.5446, "mean_token_accuracy": 0.8306101560592651, "num_tokens": 256848111.0, "step": 1612 }, { "epoch": 0.8204476093591048, "grad_norm": 1.055451512336731, "learning_rate": 1e-05, "loss": 0.5006, "mean_token_accuracy": 0.8442692756652832, "num_tokens": 257016873.0, "step": 1613 }, { "epoch": 0.8209562563580874, "grad_norm": 1.036156415939331, "learning_rate": 1e-05, "loss": 0.5416, "mean_token_accuracy": 0.8340879678726196, "num_tokens": 257182035.0, "step": 1614 }, { "epoch": 0.8214649033570702, "grad_norm": 1.0551000833511353, "learning_rate": 1e-05, "loss": 0.5548, "mean_token_accuracy": 0.827684223651886, "num_tokens": 257351888.0, "step": 1615 }, { "epoch": 0.8219735503560529, "grad_norm": 0.9917570352554321, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8370511531829834, "num_tokens": 257508833.0, "step": 1616 }, { "epoch": 0.8224821973550356, "grad_norm": 1.1158199310302734, "learning_rate": 1e-05, "loss": 0.5041, "mean_token_accuracy": 0.8426960706710815, "num_tokens": 257666306.0, "step": 1617 }, { "epoch": 0.8229908443540183, "grad_norm": 0.9963831901550293, "learning_rate": 1e-05, "loss": 0.522, "mean_token_accuracy": 0.8374859690666199, "num_tokens": 257832236.0, "step": 1618 }, { "epoch": 0.823499491353001, "grad_norm": 1.141318917274475, "learning_rate": 1e-05, "loss": 0.4946, "mean_token_accuracy": 0.8444522619247437, "num_tokens": 257996091.0, "step": 1619 }, { "epoch": 0.8240081383519837, "grad_norm": 0.9614141583442688, "learning_rate": 1e-05, "loss": 0.5153, "mean_token_accuracy": 0.8377129435539246, "num_tokens": 258156268.0, "step": 1620 }, { "epoch": 0.8245167853509664, "grad_norm": 1.0855903625488281, "learning_rate": 1e-05, "loss": 0.5324, "mean_token_accuracy": 0.8347781300544739, "num_tokens": 258323374.0, "step": 1621 }, { "epoch": 0.8250254323499492, "grad_norm": 0.993126630783081, "learning_rate": 1e-05, "loss": 0.5211, "mean_token_accuracy": 0.8383423089981079, "num_tokens": 258495850.0, "step": 1622 }, { "epoch": 0.8255340793489319, "grad_norm": 1.0016838312149048, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.8375632166862488, "num_tokens": 258653458.0, "step": 1623 }, { "epoch": 0.8260427263479145, "grad_norm": 1.0147916078567505, "learning_rate": 1e-05, "loss": 0.5316, "mean_token_accuracy": 0.8373314738273621, "num_tokens": 258823978.0, "step": 1624 }, { "epoch": 0.8265513733468972, "grad_norm": 1.0405744314193726, "learning_rate": 1e-05, "loss": 0.5138, "mean_token_accuracy": 0.8393408060073853, "num_tokens": 258981414.0, "step": 1625 }, { "epoch": 0.82706002034588, "grad_norm": 0.9844928979873657, "learning_rate": 1e-05, "loss": 0.5174, "mean_token_accuracy": 0.837151288986206, "num_tokens": 259143758.0, "step": 1626 }, { "epoch": 0.8275686673448627, "grad_norm": 1.1062935590744019, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.834156334400177, "num_tokens": 259297261.0, "step": 1627 }, { "epoch": 0.8280773143438453, "grad_norm": 1.0649558305740356, "learning_rate": 1e-05, "loss": 0.5231, "mean_token_accuracy": 0.8374667167663574, "num_tokens": 259459034.0, "step": 1628 }, { "epoch": 0.828585961342828, "grad_norm": 0.9770439267158508, "learning_rate": 1e-05, "loss": 0.5036, "mean_token_accuracy": 0.8402736783027649, "num_tokens": 259614641.0, "step": 1629 }, { "epoch": 0.8290946083418108, "grad_norm": 1.07271146774292, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.8429797291755676, "num_tokens": 259769628.0, "step": 1630 }, { "epoch": 0.8296032553407935, "grad_norm": 1.238389253616333, "learning_rate": 1e-05, "loss": 0.5086, "mean_token_accuracy": 0.8421880006790161, "num_tokens": 259925515.0, "step": 1631 }, { "epoch": 0.8301119023397762, "grad_norm": 1.0082037448883057, "learning_rate": 1e-05, "loss": 0.5637, "mean_token_accuracy": 0.8264005184173584, "num_tokens": 260091045.0, "step": 1632 }, { "epoch": 0.830620549338759, "grad_norm": 1.0819079875946045, "learning_rate": 1e-05, "loss": 0.5048, "mean_token_accuracy": 0.8421937227249146, "num_tokens": 260243318.0, "step": 1633 }, { "epoch": 0.8311291963377416, "grad_norm": 1.0576622486114502, "learning_rate": 1e-05, "loss": 0.5265, "mean_token_accuracy": 0.8375747203826904, "num_tokens": 260414955.0, "step": 1634 }, { "epoch": 0.8316378433367243, "grad_norm": 1.120271921157837, "learning_rate": 1e-05, "loss": 0.515, "mean_token_accuracy": 0.8407070636749268, "num_tokens": 260573372.0, "step": 1635 }, { "epoch": 0.832146490335707, "grad_norm": 0.9665607213973999, "learning_rate": 1e-05, "loss": 0.4649, "mean_token_accuracy": 0.8517748117446899, "num_tokens": 260738547.0, "step": 1636 }, { "epoch": 0.8326551373346898, "grad_norm": 1.0370997190475464, "learning_rate": 1e-05, "loss": 0.5054, "mean_token_accuracy": 0.8430832624435425, "num_tokens": 260892133.0, "step": 1637 }, { "epoch": 0.8331637843336724, "grad_norm": 1.1478748321533203, "learning_rate": 1e-05, "loss": 0.5364, "mean_token_accuracy": 0.8316822052001953, "num_tokens": 261051308.0, "step": 1638 }, { "epoch": 0.8336724313326551, "grad_norm": 1.0233328342437744, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8349559307098389, "num_tokens": 261205776.0, "step": 1639 }, { "epoch": 0.8341810783316378, "grad_norm": 1.065025806427002, "learning_rate": 1e-05, "loss": 0.4871, "mean_token_accuracy": 0.8467060327529907, "num_tokens": 261358402.0, "step": 1640 }, { "epoch": 0.8346897253306206, "grad_norm": 1.1216325759887695, "learning_rate": 1e-05, "loss": 0.5393, "mean_token_accuracy": 0.8320754766464233, "num_tokens": 261524731.0, "step": 1641 }, { "epoch": 0.8351983723296033, "grad_norm": 0.9606276750564575, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.8425238132476807, "num_tokens": 261687233.0, "step": 1642 }, { "epoch": 0.8357070193285859, "grad_norm": 1.090203046798706, "learning_rate": 1e-05, "loss": 0.5293, "mean_token_accuracy": 0.8350657224655151, "num_tokens": 261854243.0, "step": 1643 }, { "epoch": 0.8362156663275687, "grad_norm": 0.9650431275367737, "learning_rate": 1e-05, "loss": 0.5342, "mean_token_accuracy": 0.8349401950836182, "num_tokens": 262026963.0, "step": 1644 }, { "epoch": 0.8367243133265514, "grad_norm": 0.9442290663719177, "learning_rate": 1e-05, "loss": 0.4959, "mean_token_accuracy": 0.8443392515182495, "num_tokens": 262196156.0, "step": 1645 }, { "epoch": 0.8372329603255341, "grad_norm": 1.0255963802337646, "learning_rate": 1e-05, "loss": 0.5325, "mean_token_accuracy": 0.8355637788772583, "num_tokens": 262365347.0, "step": 1646 }, { "epoch": 0.8377416073245167, "grad_norm": 1.0480962991714478, "learning_rate": 1e-05, "loss": 0.5263, "mean_token_accuracy": 0.837949812412262, "num_tokens": 262519307.0, "step": 1647 }, { "epoch": 0.8382502543234995, "grad_norm": 1.1212228536605835, "learning_rate": 1e-05, "loss": 0.5002, "mean_token_accuracy": 0.84267258644104, "num_tokens": 262673650.0, "step": 1648 }, { "epoch": 0.8387589013224822, "grad_norm": 1.1384960412979126, "learning_rate": 1e-05, "loss": 0.518, "mean_token_accuracy": 0.8377788066864014, "num_tokens": 262812546.0, "step": 1649 }, { "epoch": 0.8392675483214649, "grad_norm": 1.0220293998718262, "learning_rate": 1e-05, "loss": 0.5455, "mean_token_accuracy": 0.8309825658798218, "num_tokens": 262970304.0, "step": 1650 }, { "epoch": 0.8397761953204476, "grad_norm": 1.1437078714370728, "learning_rate": 1e-05, "loss": 0.5425, "mean_token_accuracy": 0.8323178887367249, "num_tokens": 263132275.0, "step": 1651 }, { "epoch": 0.8402848423194303, "grad_norm": 1.028165578842163, "learning_rate": 1e-05, "loss": 0.518, "mean_token_accuracy": 0.8380722403526306, "num_tokens": 263293537.0, "step": 1652 }, { "epoch": 0.840793489318413, "grad_norm": 0.9466443657875061, "learning_rate": 1e-05, "loss": 0.5183, "mean_token_accuracy": 0.8380733728408813, "num_tokens": 263455476.0, "step": 1653 }, { "epoch": 0.8413021363173957, "grad_norm": 1.03770911693573, "learning_rate": 1e-05, "loss": 0.5061, "mean_token_accuracy": 0.8408149480819702, "num_tokens": 263615516.0, "step": 1654 }, { "epoch": 0.8418107833163785, "grad_norm": 1.0772814750671387, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8437204360961914, "num_tokens": 263759112.0, "step": 1655 }, { "epoch": 0.8423194303153612, "grad_norm": 0.9608617424964905, "learning_rate": 1e-05, "loss": 0.5035, "mean_token_accuracy": 0.8425559401512146, "num_tokens": 263924035.0, "step": 1656 }, { "epoch": 0.8428280773143438, "grad_norm": 0.9779507517814636, "learning_rate": 1e-05, "loss": 0.5685, "mean_token_accuracy": 0.8232150077819824, "num_tokens": 264096716.0, "step": 1657 }, { "epoch": 0.8433367243133265, "grad_norm": 0.9801053404808044, "learning_rate": 1e-05, "loss": 0.5298, "mean_token_accuracy": 0.833958625793457, "num_tokens": 264253124.0, "step": 1658 }, { "epoch": 0.8438453713123093, "grad_norm": 0.9987591505050659, "learning_rate": 1e-05, "loss": 0.4906, "mean_token_accuracy": 0.8448872566223145, "num_tokens": 264412558.0, "step": 1659 }, { "epoch": 0.844354018311292, "grad_norm": 0.9199841022491455, "learning_rate": 1e-05, "loss": 0.5281, "mean_token_accuracy": 0.8362597227096558, "num_tokens": 264578027.0, "step": 1660 }, { "epoch": 0.8448626653102747, "grad_norm": 1.0133132934570312, "learning_rate": 1e-05, "loss": 0.5204, "mean_token_accuracy": 0.8379366397857666, "num_tokens": 264737779.0, "step": 1661 }, { "epoch": 0.8453713123092573, "grad_norm": 1.0267291069030762, "learning_rate": 1e-05, "loss": 0.5308, "mean_token_accuracy": 0.8346647024154663, "num_tokens": 264888054.0, "step": 1662 }, { "epoch": 0.8458799593082401, "grad_norm": 0.9384878873825073, "learning_rate": 1e-05, "loss": 0.5277, "mean_token_accuracy": 0.8353392481803894, "num_tokens": 265048672.0, "step": 1663 }, { "epoch": 0.8463886063072228, "grad_norm": 3.248255729675293, "learning_rate": 1e-05, "loss": 0.515, "mean_token_accuracy": 0.840934157371521, "num_tokens": 265191225.0, "step": 1664 }, { "epoch": 0.8468972533062055, "grad_norm": 1.18904447555542, "learning_rate": 1e-05, "loss": 0.5388, "mean_token_accuracy": 0.8337969183921814, "num_tokens": 265357998.0, "step": 1665 }, { "epoch": 0.8474059003051883, "grad_norm": 0.9972271919250488, "learning_rate": 1e-05, "loss": 0.5118, "mean_token_accuracy": 0.8408945202827454, "num_tokens": 265529823.0, "step": 1666 }, { "epoch": 0.8479145473041709, "grad_norm": 1.0963144302368164, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8357887864112854, "num_tokens": 265683573.0, "step": 1667 }, { "epoch": 0.8484231943031536, "grad_norm": 1.0239129066467285, "learning_rate": 1e-05, "loss": 0.4825, "mean_token_accuracy": 0.8485143184661865, "num_tokens": 265845965.0, "step": 1668 }, { "epoch": 0.8489318413021363, "grad_norm": 1.1555248498916626, "learning_rate": 1e-05, "loss": 0.5508, "mean_token_accuracy": 0.8298073410987854, "num_tokens": 265991385.0, "step": 1669 }, { "epoch": 0.8494404883011191, "grad_norm": 1.1584995985031128, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.8365756273269653, "num_tokens": 266161615.0, "step": 1670 }, { "epoch": 0.8499491353001017, "grad_norm": 1.168226957321167, "learning_rate": 1e-05, "loss": 0.5393, "mean_token_accuracy": 0.8331434726715088, "num_tokens": 266310118.0, "step": 1671 }, { "epoch": 0.8504577822990844, "grad_norm": 1.0551339387893677, "learning_rate": 1e-05, "loss": 0.5057, "mean_token_accuracy": 0.8417863845825195, "num_tokens": 266462436.0, "step": 1672 }, { "epoch": 0.8509664292980671, "grad_norm": 1.0659514665603638, "learning_rate": 1e-05, "loss": 0.5139, "mean_token_accuracy": 0.8392844200134277, "num_tokens": 266613253.0, "step": 1673 }, { "epoch": 0.8514750762970499, "grad_norm": 1.0820661783218384, "learning_rate": 1e-05, "loss": 0.52, "mean_token_accuracy": 0.8378826379776001, "num_tokens": 266773293.0, "step": 1674 }, { "epoch": 0.8519837232960326, "grad_norm": 1.0080983638763428, "learning_rate": 1e-05, "loss": 0.5274, "mean_token_accuracy": 0.835020899772644, "num_tokens": 266935261.0, "step": 1675 }, { "epoch": 0.8524923702950152, "grad_norm": 1.1683578491210938, "learning_rate": 1e-05, "loss": 0.5339, "mean_token_accuracy": 0.8347437977790833, "num_tokens": 267100828.0, "step": 1676 }, { "epoch": 0.853001017293998, "grad_norm": 1.112990140914917, "learning_rate": 1e-05, "loss": 0.5207, "mean_token_accuracy": 0.8374387621879578, "num_tokens": 267269826.0, "step": 1677 }, { "epoch": 0.8535096642929807, "grad_norm": 0.9589158296585083, "learning_rate": 1e-05, "loss": 0.5052, "mean_token_accuracy": 0.8411589860916138, "num_tokens": 267429972.0, "step": 1678 }, { "epoch": 0.8540183112919634, "grad_norm": 1.1416692733764648, "learning_rate": 1e-05, "loss": 0.5185, "mean_token_accuracy": 0.8388045430183411, "num_tokens": 267590240.0, "step": 1679 }, { "epoch": 0.854526958290946, "grad_norm": 0.9843428730964661, "learning_rate": 1e-05, "loss": 0.4795, "mean_token_accuracy": 0.8487104773521423, "num_tokens": 267748631.0, "step": 1680 }, { "epoch": 0.8550356052899288, "grad_norm": 0.9813622236251831, "learning_rate": 1e-05, "loss": 0.5237, "mean_token_accuracy": 0.8359060287475586, "num_tokens": 267906365.0, "step": 1681 }, { "epoch": 0.8555442522889115, "grad_norm": 1.0184063911437988, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.843595027923584, "num_tokens": 268058483.0, "step": 1682 }, { "epoch": 0.8560528992878942, "grad_norm": 1.0338000059127808, "learning_rate": 1e-05, "loss": 0.4977, "mean_token_accuracy": 0.8444024324417114, "num_tokens": 268217173.0, "step": 1683 }, { "epoch": 0.8565615462868769, "grad_norm": 1.0170055627822876, "learning_rate": 1e-05, "loss": 0.5311, "mean_token_accuracy": 0.8358046412467957, "num_tokens": 268382188.0, "step": 1684 }, { "epoch": 0.8570701932858596, "grad_norm": 1.0744831562042236, "learning_rate": 1e-05, "loss": 0.5146, "mean_token_accuracy": 0.8386661410331726, "num_tokens": 268543730.0, "step": 1685 }, { "epoch": 0.8575788402848423, "grad_norm": 1.038621187210083, "learning_rate": 1e-05, "loss": 0.5027, "mean_token_accuracy": 0.8423762321472168, "num_tokens": 268701052.0, "step": 1686 }, { "epoch": 0.858087487283825, "grad_norm": 1.0548534393310547, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8422161936759949, "num_tokens": 268852461.0, "step": 1687 }, { "epoch": 0.8585961342828077, "grad_norm": 1.250922441482544, "learning_rate": 1e-05, "loss": 0.4818, "mean_token_accuracy": 0.8479307889938354, "num_tokens": 268996708.0, "step": 1688 }, { "epoch": 0.8591047812817905, "grad_norm": 1.1026182174682617, "learning_rate": 1e-05, "loss": 0.5421, "mean_token_accuracy": 0.8324263095855713, "num_tokens": 269146990.0, "step": 1689 }, { "epoch": 0.8596134282807731, "grad_norm": 1.1113507747650146, "learning_rate": 1e-05, "loss": 0.4792, "mean_token_accuracy": 0.8493058681488037, "num_tokens": 269311951.0, "step": 1690 }, { "epoch": 0.8601220752797558, "grad_norm": 1.0716196298599243, "learning_rate": 1e-05, "loss": 0.522, "mean_token_accuracy": 0.8371376991271973, "num_tokens": 269450985.0, "step": 1691 }, { "epoch": 0.8606307222787386, "grad_norm": 1.0759919881820679, "learning_rate": 1e-05, "loss": 0.4785, "mean_token_accuracy": 0.8490256071090698, "num_tokens": 269598930.0, "step": 1692 }, { "epoch": 0.8611393692777213, "grad_norm": 1.248870849609375, "learning_rate": 1e-05, "loss": 0.5399, "mean_token_accuracy": 0.8343132734298706, "num_tokens": 269760034.0, "step": 1693 }, { "epoch": 0.861648016276704, "grad_norm": 1.086024522781372, "learning_rate": 1e-05, "loss": 0.5293, "mean_token_accuracy": 0.8356090188026428, "num_tokens": 269921692.0, "step": 1694 }, { "epoch": 0.8621566632756866, "grad_norm": 1.1465421915054321, "learning_rate": 1e-05, "loss": 0.4961, "mean_token_accuracy": 0.8453572392463684, "num_tokens": 270073024.0, "step": 1695 }, { "epoch": 0.8626653102746694, "grad_norm": 1.1328400373458862, "learning_rate": 1e-05, "loss": 0.5391, "mean_token_accuracy": 0.8332090377807617, "num_tokens": 270232620.0, "step": 1696 }, { "epoch": 0.8631739572736521, "grad_norm": 1.1193041801452637, "learning_rate": 1e-05, "loss": 0.5131, "mean_token_accuracy": 0.8391777276992798, "num_tokens": 270385773.0, "step": 1697 }, { "epoch": 0.8636826042726348, "grad_norm": 1.0104612112045288, "learning_rate": 1e-05, "loss": 0.5078, "mean_token_accuracy": 0.8431419134140015, "num_tokens": 270560619.0, "step": 1698 }, { "epoch": 0.8641912512716174, "grad_norm": 1.0650326013565063, "learning_rate": 1e-05, "loss": 0.5316, "mean_token_accuracy": 0.8359506130218506, "num_tokens": 270719944.0, "step": 1699 }, { "epoch": 0.8646998982706002, "grad_norm": 1.106897234916687, "learning_rate": 1e-05, "loss": 0.5024, "mean_token_accuracy": 0.8426070213317871, "num_tokens": 270874728.0, "step": 1700 }, { "epoch": 0.8652085452695829, "grad_norm": 0.972864031791687, "learning_rate": 1e-05, "loss": 0.514, "mean_token_accuracy": 0.8413708209991455, "num_tokens": 271036953.0, "step": 1701 }, { "epoch": 0.8657171922685656, "grad_norm": 1.0611705780029297, "learning_rate": 1e-05, "loss": 0.5096, "mean_token_accuracy": 0.8410873413085938, "num_tokens": 271202782.0, "step": 1702 }, { "epoch": 0.8662258392675484, "grad_norm": 1.021085262298584, "learning_rate": 1e-05, "loss": 0.5245, "mean_token_accuracy": 0.838525652885437, "num_tokens": 271367311.0, "step": 1703 }, { "epoch": 0.866734486266531, "grad_norm": 1.002583622932434, "learning_rate": 1e-05, "loss": 0.5201, "mean_token_accuracy": 0.838585615158081, "num_tokens": 271529123.0, "step": 1704 }, { "epoch": 0.8672431332655137, "grad_norm": 1.1310274600982666, "learning_rate": 1e-05, "loss": 0.5038, "mean_token_accuracy": 0.8432391881942749, "num_tokens": 271673733.0, "step": 1705 }, { "epoch": 0.8677517802644964, "grad_norm": 1.1639235019683838, "learning_rate": 1e-05, "loss": 0.4958, "mean_token_accuracy": 0.8447306156158447, "num_tokens": 271828281.0, "step": 1706 }, { "epoch": 0.8682604272634792, "grad_norm": 1.1261109113693237, "learning_rate": 1e-05, "loss": 0.5364, "mean_token_accuracy": 0.8337569236755371, "num_tokens": 271997170.0, "step": 1707 }, { "epoch": 0.8687690742624619, "grad_norm": 0.9615015387535095, "learning_rate": 1e-05, "loss": 0.4922, "mean_token_accuracy": 0.8460158109664917, "num_tokens": 272152291.0, "step": 1708 }, { "epoch": 0.8692777212614445, "grad_norm": 1.131570816040039, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.8301962018013, "num_tokens": 272307347.0, "step": 1709 }, { "epoch": 0.8697863682604272, "grad_norm": 1.0198192596435547, "learning_rate": 1e-05, "loss": 0.5348, "mean_token_accuracy": 0.8341028690338135, "num_tokens": 272469049.0, "step": 1710 }, { "epoch": 0.87029501525941, "grad_norm": 1.0531460046768188, "learning_rate": 1e-05, "loss": 0.533, "mean_token_accuracy": 0.8349130153656006, "num_tokens": 272612517.0, "step": 1711 }, { "epoch": 0.8708036622583927, "grad_norm": 1.121743083000183, "learning_rate": 1e-05, "loss": 0.5492, "mean_token_accuracy": 0.8294485211372375, "num_tokens": 272768408.0, "step": 1712 }, { "epoch": 0.8713123092573754, "grad_norm": 0.9979460835456848, "learning_rate": 1e-05, "loss": 0.5217, "mean_token_accuracy": 0.8372259140014648, "num_tokens": 272919347.0, "step": 1713 }, { "epoch": 0.8718209562563581, "grad_norm": 1.1718113422393799, "learning_rate": 1e-05, "loss": 0.5065, "mean_token_accuracy": 0.841956377029419, "num_tokens": 273089732.0, "step": 1714 }, { "epoch": 0.8723296032553408, "grad_norm": 1.0107232332229614, "learning_rate": 1e-05, "loss": 0.4993, "mean_token_accuracy": 0.8463701009750366, "num_tokens": 273242780.0, "step": 1715 }, { "epoch": 0.8728382502543235, "grad_norm": 0.9980534315109253, "learning_rate": 1e-05, "loss": 0.5054, "mean_token_accuracy": 0.8422236442565918, "num_tokens": 273407433.0, "step": 1716 }, { "epoch": 0.8733468972533062, "grad_norm": 2.6608331203460693, "learning_rate": 1e-05, "loss": 0.5206, "mean_token_accuracy": 0.8389767408370972, "num_tokens": 273570166.0, "step": 1717 }, { "epoch": 0.873855544252289, "grad_norm": 1.1380822658538818, "learning_rate": 1e-05, "loss": 0.5361, "mean_token_accuracy": 0.8322861194610596, "num_tokens": 273726324.0, "step": 1718 }, { "epoch": 0.8743641912512716, "grad_norm": 1.0491878986358643, "learning_rate": 1e-05, "loss": 0.5264, "mean_token_accuracy": 0.8350399732589722, "num_tokens": 273890563.0, "step": 1719 }, { "epoch": 0.8748728382502543, "grad_norm": 1.0042964220046997, "learning_rate": 1e-05, "loss": 0.484, "mean_token_accuracy": 0.8472247123718262, "num_tokens": 274048187.0, "step": 1720 }, { "epoch": 0.875381485249237, "grad_norm": 1.0009158849716187, "learning_rate": 1e-05, "loss": 0.4828, "mean_token_accuracy": 0.8478761911392212, "num_tokens": 274221855.0, "step": 1721 }, { "epoch": 0.8758901322482198, "grad_norm": 0.9791696667671204, "learning_rate": 1e-05, "loss": 0.5279, "mean_token_accuracy": 0.8349093198776245, "num_tokens": 274377833.0, "step": 1722 }, { "epoch": 0.8763987792472024, "grad_norm": 0.9939912557601929, "learning_rate": 1e-05, "loss": 0.5423, "mean_token_accuracy": 0.8309345245361328, "num_tokens": 274541936.0, "step": 1723 }, { "epoch": 0.8769074262461851, "grad_norm": 1.0004394054412842, "learning_rate": 1e-05, "loss": 0.5375, "mean_token_accuracy": 0.8317330479621887, "num_tokens": 274705006.0, "step": 1724 }, { "epoch": 0.8774160732451679, "grad_norm": 0.9913457632064819, "learning_rate": 1e-05, "loss": 0.5086, "mean_token_accuracy": 0.8392433524131775, "num_tokens": 274854136.0, "step": 1725 }, { "epoch": 0.8779247202441506, "grad_norm": 1.064914345741272, "learning_rate": 1e-05, "loss": 0.4971, "mean_token_accuracy": 0.8464230298995972, "num_tokens": 275014059.0, "step": 1726 }, { "epoch": 0.8784333672431333, "grad_norm": 1.2501319646835327, "learning_rate": 1e-05, "loss": 0.5594, "mean_token_accuracy": 0.8279653787612915, "num_tokens": 275187361.0, "step": 1727 }, { "epoch": 0.8789420142421159, "grad_norm": 1.013777732849121, "learning_rate": 1e-05, "loss": 0.5058, "mean_token_accuracy": 0.8425623178482056, "num_tokens": 275338869.0, "step": 1728 }, { "epoch": 0.8794506612410987, "grad_norm": 0.9364235997200012, "learning_rate": 1e-05, "loss": 0.511, "mean_token_accuracy": 0.8407465219497681, "num_tokens": 275496774.0, "step": 1729 }, { "epoch": 0.8799593082400814, "grad_norm": 1.1220306158065796, "learning_rate": 1e-05, "loss": 0.5401, "mean_token_accuracy": 0.8303672075271606, "num_tokens": 275653401.0, "step": 1730 }, { "epoch": 0.8804679552390641, "grad_norm": 1.0587559938430786, "learning_rate": 1e-05, "loss": 0.4986, "mean_token_accuracy": 0.8438327312469482, "num_tokens": 275806070.0, "step": 1731 }, { "epoch": 0.8809766022380467, "grad_norm": 1.1446789503097534, "learning_rate": 1e-05, "loss": 0.5118, "mean_token_accuracy": 0.8396477103233337, "num_tokens": 275962373.0, "step": 1732 }, { "epoch": 0.8814852492370295, "grad_norm": 1.0831222534179688, "learning_rate": 1e-05, "loss": 0.5185, "mean_token_accuracy": 0.8397834300994873, "num_tokens": 276129118.0, "step": 1733 }, { "epoch": 0.8819938962360122, "grad_norm": 1.0126874446868896, "learning_rate": 1e-05, "loss": 0.5259, "mean_token_accuracy": 0.8371516466140747, "num_tokens": 276287064.0, "step": 1734 }, { "epoch": 0.8825025432349949, "grad_norm": 1.0329630374908447, "learning_rate": 1e-05, "loss": 0.5033, "mean_token_accuracy": 0.8428022861480713, "num_tokens": 276457678.0, "step": 1735 }, { "epoch": 0.8830111902339777, "grad_norm": 1.0754644870758057, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.834283709526062, "num_tokens": 276610916.0, "step": 1736 }, { "epoch": 0.8835198372329603, "grad_norm": 0.9800335764884949, "learning_rate": 1e-05, "loss": 0.4946, "mean_token_accuracy": 0.8463819026947021, "num_tokens": 276764925.0, "step": 1737 }, { "epoch": 0.884028484231943, "grad_norm": 1.0709609985351562, "learning_rate": 1e-05, "loss": 0.574, "mean_token_accuracy": 0.8228265047073364, "num_tokens": 276928368.0, "step": 1738 }, { "epoch": 0.8845371312309257, "grad_norm": 1.0306966304779053, "learning_rate": 1e-05, "loss": 0.4964, "mean_token_accuracy": 0.8455370664596558, "num_tokens": 277094718.0, "step": 1739 }, { "epoch": 0.8850457782299085, "grad_norm": 1.026525855064392, "learning_rate": 1e-05, "loss": 0.5012, "mean_token_accuracy": 0.8419168591499329, "num_tokens": 277246859.0, "step": 1740 }, { "epoch": 0.8855544252288912, "grad_norm": 0.9500491619110107, "learning_rate": 1e-05, "loss": 0.528, "mean_token_accuracy": 0.83522629737854, "num_tokens": 277414068.0, "step": 1741 }, { "epoch": 0.8860630722278738, "grad_norm": 1.0432976484298706, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.840704083442688, "num_tokens": 277573295.0, "step": 1742 }, { "epoch": 0.8865717192268565, "grad_norm": 1.0290638208389282, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8381516933441162, "num_tokens": 277726913.0, "step": 1743 }, { "epoch": 0.8870803662258393, "grad_norm": 0.9862432479858398, "learning_rate": 1e-05, "loss": 0.4924, "mean_token_accuracy": 0.8451281785964966, "num_tokens": 277877252.0, "step": 1744 }, { "epoch": 0.887589013224822, "grad_norm": 1.0773284435272217, "learning_rate": 1e-05, "loss": 0.5378, "mean_token_accuracy": 0.8321132659912109, "num_tokens": 278034976.0, "step": 1745 }, { "epoch": 0.8880976602238047, "grad_norm": 0.9903329014778137, "learning_rate": 1e-05, "loss": 0.4778, "mean_token_accuracy": 0.8489717841148376, "num_tokens": 278194339.0, "step": 1746 }, { "epoch": 0.8886063072227874, "grad_norm": 0.9978926777839661, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.83891361951828, "num_tokens": 278352150.0, "step": 1747 }, { "epoch": 0.8891149542217701, "grad_norm": 1.0282410383224487, "learning_rate": 1e-05, "loss": 0.5348, "mean_token_accuracy": 0.832777738571167, "num_tokens": 278516575.0, "step": 1748 }, { "epoch": 0.8896236012207528, "grad_norm": 0.9718902111053467, "learning_rate": 1e-05, "loss": 0.5342, "mean_token_accuracy": 0.8338038921356201, "num_tokens": 278687157.0, "step": 1749 }, { "epoch": 0.8901322482197355, "grad_norm": 1.0574067831039429, "learning_rate": 1e-05, "loss": 0.5331, "mean_token_accuracy": 0.8335607051849365, "num_tokens": 278844290.0, "step": 1750 }, { "epoch": 0.8906408952187183, "grad_norm": 1.118514895439148, "learning_rate": 1e-05, "loss": 0.5178, "mean_token_accuracy": 0.8380013704299927, "num_tokens": 279002276.0, "step": 1751 }, { "epoch": 0.8911495422177009, "grad_norm": 1.017069935798645, "learning_rate": 1e-05, "loss": 0.5315, "mean_token_accuracy": 0.835065484046936, "num_tokens": 279161918.0, "step": 1752 }, { "epoch": 0.8916581892166836, "grad_norm": 1.1437948942184448, "learning_rate": 1e-05, "loss": 0.5303, "mean_token_accuracy": 0.8335355520248413, "num_tokens": 279321930.0, "step": 1753 }, { "epoch": 0.8921668362156663, "grad_norm": 1.0639396905899048, "learning_rate": 1e-05, "loss": 0.5353, "mean_token_accuracy": 0.8358509540557861, "num_tokens": 279481295.0, "step": 1754 }, { "epoch": 0.8926754832146491, "grad_norm": 0.9928602576255798, "learning_rate": 1e-05, "loss": 0.5236, "mean_token_accuracy": 0.8368161916732788, "num_tokens": 279633071.0, "step": 1755 }, { "epoch": 0.8931841302136317, "grad_norm": 1.0300835371017456, "learning_rate": 1e-05, "loss": 0.4902, "mean_token_accuracy": 0.8459980487823486, "num_tokens": 279800329.0, "step": 1756 }, { "epoch": 0.8936927772126144, "grad_norm": 1.0273916721343994, "learning_rate": 1e-05, "loss": 0.483, "mean_token_accuracy": 0.8479832410812378, "num_tokens": 279944707.0, "step": 1757 }, { "epoch": 0.8942014242115972, "grad_norm": 0.9553089737892151, "learning_rate": 1e-05, "loss": 0.5054, "mean_token_accuracy": 0.8420307636260986, "num_tokens": 280111037.0, "step": 1758 }, { "epoch": 0.8947100712105799, "grad_norm": 1.0497819185256958, "learning_rate": 1e-05, "loss": 0.5061, "mean_token_accuracy": 0.8428786993026733, "num_tokens": 280258749.0, "step": 1759 }, { "epoch": 0.8952187182095626, "grad_norm": 1.0025304555892944, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8372910618782043, "num_tokens": 280405852.0, "step": 1760 }, { "epoch": 0.8957273652085452, "grad_norm": 0.9646736979484558, "learning_rate": 1e-05, "loss": 0.5166, "mean_token_accuracy": 0.8398212194442749, "num_tokens": 280580217.0, "step": 1761 }, { "epoch": 0.896236012207528, "grad_norm": 1.0457273721694946, "learning_rate": 1e-05, "loss": 0.534, "mean_token_accuracy": 0.8361070156097412, "num_tokens": 280727104.0, "step": 1762 }, { "epoch": 0.8967446592065107, "grad_norm": 1.0271176099777222, "learning_rate": 1e-05, "loss": 0.4906, "mean_token_accuracy": 0.8460567593574524, "num_tokens": 280889043.0, "step": 1763 }, { "epoch": 0.8972533062054934, "grad_norm": 1.0649844408035278, "learning_rate": 1e-05, "loss": 0.5078, "mean_token_accuracy": 0.8414275646209717, "num_tokens": 281043942.0, "step": 1764 }, { "epoch": 0.897761953204476, "grad_norm": 1.0987054109573364, "learning_rate": 1e-05, "loss": 0.5214, "mean_token_accuracy": 0.8381832838058472, "num_tokens": 281216118.0, "step": 1765 }, { "epoch": 0.8982706002034588, "grad_norm": 1.0467629432678223, "learning_rate": 1e-05, "loss": 0.5184, "mean_token_accuracy": 0.839253842830658, "num_tokens": 281374849.0, "step": 1766 }, { "epoch": 0.8987792472024415, "grad_norm": 1.060920238494873, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.8411917090415955, "num_tokens": 281528478.0, "step": 1767 }, { "epoch": 0.8992878942014242, "grad_norm": 1.1491779088974, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8416895866394043, "num_tokens": 281681982.0, "step": 1768 }, { "epoch": 0.8997965412004069, "grad_norm": 1.028265357017517, "learning_rate": 1e-05, "loss": 0.4975, "mean_token_accuracy": 0.8440154790878296, "num_tokens": 281846049.0, "step": 1769 }, { "epoch": 0.9003051881993896, "grad_norm": 1.1693898439407349, "learning_rate": 1e-05, "loss": 0.5579, "mean_token_accuracy": 0.8275507688522339, "num_tokens": 282019096.0, "step": 1770 }, { "epoch": 0.9008138351983723, "grad_norm": 1.0625677108764648, "learning_rate": 1e-05, "loss": 0.4945, "mean_token_accuracy": 0.8449739217758179, "num_tokens": 282196142.0, "step": 1771 }, { "epoch": 0.901322482197355, "grad_norm": 0.9961878061294556, "learning_rate": 1e-05, "loss": 0.5269, "mean_token_accuracy": 0.8377020359039307, "num_tokens": 282360553.0, "step": 1772 }, { "epoch": 0.9018311291963378, "grad_norm": 1.1472853422164917, "learning_rate": 1e-05, "loss": 0.4972, "mean_token_accuracy": 0.8453328609466553, "num_tokens": 282527151.0, "step": 1773 }, { "epoch": 0.9023397761953205, "grad_norm": 1.0794323682785034, "learning_rate": 1e-05, "loss": 0.5417, "mean_token_accuracy": 0.8325939178466797, "num_tokens": 282674995.0, "step": 1774 }, { "epoch": 0.9028484231943031, "grad_norm": 1.19264554977417, "learning_rate": 1e-05, "loss": 0.4699, "mean_token_accuracy": 0.8525264859199524, "num_tokens": 282832990.0, "step": 1775 }, { "epoch": 0.9033570701932858, "grad_norm": 1.1174076795578003, "learning_rate": 1e-05, "loss": 0.5375, "mean_token_accuracy": 0.8341360688209534, "num_tokens": 282997592.0, "step": 1776 }, { "epoch": 0.9038657171922686, "grad_norm": 1.1486574411392212, "learning_rate": 1e-05, "loss": 0.5164, "mean_token_accuracy": 0.8390986919403076, "num_tokens": 283155838.0, "step": 1777 }, { "epoch": 0.9043743641912513, "grad_norm": 0.977383017539978, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8494383096694946, "num_tokens": 283307527.0, "step": 1778 }, { "epoch": 0.904883011190234, "grad_norm": 1.0261812210083008, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8334873914718628, "num_tokens": 283465966.0, "step": 1779 }, { "epoch": 0.9053916581892166, "grad_norm": 1.0674028396606445, "learning_rate": 1e-05, "loss": 0.5333, "mean_token_accuracy": 0.8340882062911987, "num_tokens": 283620377.0, "step": 1780 }, { "epoch": 0.9059003051881994, "grad_norm": 0.9716681241989136, "learning_rate": 1e-05, "loss": 0.4934, "mean_token_accuracy": 0.8443837761878967, "num_tokens": 283777695.0, "step": 1781 }, { "epoch": 0.9064089521871821, "grad_norm": 1.0851030349731445, "learning_rate": 1e-05, "loss": 0.5108, "mean_token_accuracy": 0.8417867422103882, "num_tokens": 283931044.0, "step": 1782 }, { "epoch": 0.9069175991861648, "grad_norm": 1.069290041923523, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.8441795110702515, "num_tokens": 284089124.0, "step": 1783 }, { "epoch": 0.9074262461851476, "grad_norm": 1.009878158569336, "learning_rate": 1e-05, "loss": 0.5346, "mean_token_accuracy": 0.8334824442863464, "num_tokens": 284248921.0, "step": 1784 }, { "epoch": 0.9079348931841302, "grad_norm": 1.088663935661316, "learning_rate": 1e-05, "loss": 0.501, "mean_token_accuracy": 0.8420149683952332, "num_tokens": 284406295.0, "step": 1785 }, { "epoch": 0.9084435401831129, "grad_norm": 1.072481393814087, "learning_rate": 1e-05, "loss": 0.4704, "mean_token_accuracy": 0.8499975800514221, "num_tokens": 284570062.0, "step": 1786 }, { "epoch": 0.9089521871820956, "grad_norm": 1.067689299583435, "learning_rate": 1e-05, "loss": 0.5181, "mean_token_accuracy": 0.8386754989624023, "num_tokens": 284731919.0, "step": 1787 }, { "epoch": 0.9094608341810784, "grad_norm": 1.002181053161621, "learning_rate": 1e-05, "loss": 0.5293, "mean_token_accuracy": 0.8357716798782349, "num_tokens": 284892788.0, "step": 1788 }, { "epoch": 0.909969481180061, "grad_norm": 1.1112078428268433, "learning_rate": 1e-05, "loss": 0.5359, "mean_token_accuracy": 0.8343470096588135, "num_tokens": 285053351.0, "step": 1789 }, { "epoch": 0.9104781281790437, "grad_norm": 1.0172306299209595, "learning_rate": 1e-05, "loss": 0.5216, "mean_token_accuracy": 0.8377166986465454, "num_tokens": 285203700.0, "step": 1790 }, { "epoch": 0.9109867751780264, "grad_norm": 1.0339511632919312, "learning_rate": 1e-05, "loss": 0.5193, "mean_token_accuracy": 0.8383980989456177, "num_tokens": 285375982.0, "step": 1791 }, { "epoch": 0.9114954221770092, "grad_norm": 0.9519206285476685, "learning_rate": 1e-05, "loss": 0.5129, "mean_token_accuracy": 0.8404895663261414, "num_tokens": 285550546.0, "step": 1792 }, { "epoch": 0.9120040691759919, "grad_norm": 1.0351858139038086, "learning_rate": 1e-05, "loss": 0.523, "mean_token_accuracy": 0.8380796909332275, "num_tokens": 285699042.0, "step": 1793 }, { "epoch": 0.9125127161749745, "grad_norm": 1.031059980392456, "learning_rate": 1e-05, "loss": 0.5172, "mean_token_accuracy": 0.8366599678993225, "num_tokens": 285845604.0, "step": 1794 }, { "epoch": 0.9130213631739573, "grad_norm": 1.0507643222808838, "learning_rate": 1e-05, "loss": 0.5195, "mean_token_accuracy": 0.838219165802002, "num_tokens": 285989582.0, "step": 1795 }, { "epoch": 0.91353001017294, "grad_norm": 1.0662038326263428, "learning_rate": 1e-05, "loss": 0.5288, "mean_token_accuracy": 0.8362869024276733, "num_tokens": 286144853.0, "step": 1796 }, { "epoch": 0.9140386571719227, "grad_norm": 1.100429892539978, "learning_rate": 1e-05, "loss": 0.5145, "mean_token_accuracy": 0.838462233543396, "num_tokens": 286306136.0, "step": 1797 }, { "epoch": 0.9145473041709054, "grad_norm": 1.0728235244750977, "learning_rate": 1e-05, "loss": 0.5172, "mean_token_accuracy": 0.8359116911888123, "num_tokens": 286458777.0, "step": 1798 }, { "epoch": 0.9150559511698881, "grad_norm": 0.9926060438156128, "learning_rate": 1e-05, "loss": 0.5225, "mean_token_accuracy": 0.8370160460472107, "num_tokens": 286618373.0, "step": 1799 }, { "epoch": 0.9155645981688708, "grad_norm": 1.0963010787963867, "learning_rate": 1e-05, "loss": 0.5096, "mean_token_accuracy": 0.8409956097602844, "num_tokens": 286774374.0, "step": 1800 }, { "epoch": 0.9160732451678535, "grad_norm": 1.0256036520004272, "learning_rate": 1e-05, "loss": 0.4933, "mean_token_accuracy": 0.8443099856376648, "num_tokens": 286926506.0, "step": 1801 }, { "epoch": 0.9165818921668362, "grad_norm": 1.0650215148925781, "learning_rate": 1e-05, "loss": 0.5427, "mean_token_accuracy": 0.8311566114425659, "num_tokens": 287085779.0, "step": 1802 }, { "epoch": 0.917090539165819, "grad_norm": 1.0663613080978394, "learning_rate": 1e-05, "loss": 0.5112, "mean_token_accuracy": 0.8394999504089355, "num_tokens": 287253165.0, "step": 1803 }, { "epoch": 0.9175991861648016, "grad_norm": 1.076520562171936, "learning_rate": 1e-05, "loss": 0.5377, "mean_token_accuracy": 0.832918643951416, "num_tokens": 287405700.0, "step": 1804 }, { "epoch": 0.9181078331637843, "grad_norm": 1.0021073818206787, "learning_rate": 1e-05, "loss": 0.477, "mean_token_accuracy": 0.849860668182373, "num_tokens": 287551897.0, "step": 1805 }, { "epoch": 0.9186164801627671, "grad_norm": 1.0759339332580566, "learning_rate": 1e-05, "loss": 0.5147, "mean_token_accuracy": 0.8378969430923462, "num_tokens": 287716716.0, "step": 1806 }, { "epoch": 0.9191251271617498, "grad_norm": 1.0549094676971436, "learning_rate": 1e-05, "loss": 0.5497, "mean_token_accuracy": 0.8297152519226074, "num_tokens": 287885440.0, "step": 1807 }, { "epoch": 0.9196337741607324, "grad_norm": 1.0533418655395508, "learning_rate": 1e-05, "loss": 0.4901, "mean_token_accuracy": 0.8451825380325317, "num_tokens": 288040043.0, "step": 1808 }, { "epoch": 0.9201424211597151, "grad_norm": 1.0616881847381592, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.8455978631973267, "num_tokens": 288190884.0, "step": 1809 }, { "epoch": 0.9206510681586979, "grad_norm": 1.0778988599777222, "learning_rate": 1e-05, "loss": 0.5411, "mean_token_accuracy": 0.8317572474479675, "num_tokens": 288352383.0, "step": 1810 }, { "epoch": 0.9211597151576806, "grad_norm": 0.968621015548706, "learning_rate": 1e-05, "loss": 0.475, "mean_token_accuracy": 0.8496113419532776, "num_tokens": 288513226.0, "step": 1811 }, { "epoch": 0.9216683621566633, "grad_norm": 1.3760226964950562, "learning_rate": 1e-05, "loss": 0.5276, "mean_token_accuracy": 0.8349040746688843, "num_tokens": 288681122.0, "step": 1812 }, { "epoch": 0.9221770091556459, "grad_norm": 1.0464659929275513, "learning_rate": 1e-05, "loss": 0.5393, "mean_token_accuracy": 0.8343937397003174, "num_tokens": 288840955.0, "step": 1813 }, { "epoch": 0.9226856561546287, "grad_norm": 0.9643142223358154, "learning_rate": 1e-05, "loss": 0.5309, "mean_token_accuracy": 0.8338766098022461, "num_tokens": 289007588.0, "step": 1814 }, { "epoch": 0.9231943031536114, "grad_norm": 1.0363613367080688, "learning_rate": 1e-05, "loss": 0.5117, "mean_token_accuracy": 0.840315043926239, "num_tokens": 289171963.0, "step": 1815 }, { "epoch": 0.9237029501525941, "grad_norm": 0.975771963596344, "learning_rate": 1e-05, "loss": 0.5043, "mean_token_accuracy": 0.8419756889343262, "num_tokens": 289338174.0, "step": 1816 }, { "epoch": 0.9242115971515769, "grad_norm": 0.9576640725135803, "learning_rate": 1e-05, "loss": 0.504, "mean_token_accuracy": 0.8424282670021057, "num_tokens": 289498369.0, "step": 1817 }, { "epoch": 0.9247202441505595, "grad_norm": 1.0540850162506104, "learning_rate": 1e-05, "loss": 0.5032, "mean_token_accuracy": 0.8419172763824463, "num_tokens": 289657315.0, "step": 1818 }, { "epoch": 0.9252288911495422, "grad_norm": 1.0213912725448608, "learning_rate": 1e-05, "loss": 0.4994, "mean_token_accuracy": 0.8438811302185059, "num_tokens": 289824552.0, "step": 1819 }, { "epoch": 0.9257375381485249, "grad_norm": 0.9739988446235657, "learning_rate": 1e-05, "loss": 0.5164, "mean_token_accuracy": 0.8370422124862671, "num_tokens": 289982984.0, "step": 1820 }, { "epoch": 0.9262461851475077, "grad_norm": 0.9719370007514954, "learning_rate": 1e-05, "loss": 0.5018, "mean_token_accuracy": 0.8428477644920349, "num_tokens": 290127147.0, "step": 1821 }, { "epoch": 0.9267548321464903, "grad_norm": 0.9787293672561646, "learning_rate": 1e-05, "loss": 0.5066, "mean_token_accuracy": 0.8425807952880859, "num_tokens": 290298649.0, "step": 1822 }, { "epoch": 0.927263479145473, "grad_norm": 1.0080795288085938, "learning_rate": 1e-05, "loss": 0.5337, "mean_token_accuracy": 0.8355758190155029, "num_tokens": 290460355.0, "step": 1823 }, { "epoch": 0.9277721261444557, "grad_norm": 0.9061076045036316, "learning_rate": 1e-05, "loss": 0.4846, "mean_token_accuracy": 0.8472140431404114, "num_tokens": 290617255.0, "step": 1824 }, { "epoch": 0.9282807731434385, "grad_norm": 1.1104837656021118, "learning_rate": 1e-05, "loss": 0.5201, "mean_token_accuracy": 0.8363896608352661, "num_tokens": 290773839.0, "step": 1825 }, { "epoch": 0.9287894201424212, "grad_norm": 0.9883270263671875, "learning_rate": 1e-05, "loss": 0.49, "mean_token_accuracy": 0.846264123916626, "num_tokens": 290937706.0, "step": 1826 }, { "epoch": 0.9292980671414038, "grad_norm": 0.9738947153091431, "learning_rate": 1e-05, "loss": 0.508, "mean_token_accuracy": 0.8415963649749756, "num_tokens": 291095257.0, "step": 1827 }, { "epoch": 0.9298067141403866, "grad_norm": 1.1056708097457886, "learning_rate": 1e-05, "loss": 0.5293, "mean_token_accuracy": 0.8362318277359009, "num_tokens": 291248241.0, "step": 1828 }, { "epoch": 0.9303153611393693, "grad_norm": 1.0047028064727783, "learning_rate": 1e-05, "loss": 0.5443, "mean_token_accuracy": 0.8309669494628906, "num_tokens": 291408732.0, "step": 1829 }, { "epoch": 0.930824008138352, "grad_norm": 0.943976104259491, "learning_rate": 1e-05, "loss": 0.5021, "mean_token_accuracy": 0.8414051532745361, "num_tokens": 291586275.0, "step": 1830 }, { "epoch": 0.9313326551373347, "grad_norm": 1.0755298137664795, "learning_rate": 1e-05, "loss": 0.4734, "mean_token_accuracy": 0.8507115840911865, "num_tokens": 291748574.0, "step": 1831 }, { "epoch": 0.9318413021363174, "grad_norm": 1.173866868019104, "learning_rate": 1e-05, "loss": 0.5208, "mean_token_accuracy": 0.8381999731063843, "num_tokens": 291904561.0, "step": 1832 }, { "epoch": 0.9323499491353001, "grad_norm": 3.691843271255493, "learning_rate": 1e-05, "loss": 0.5084, "mean_token_accuracy": 0.8408045768737793, "num_tokens": 292063897.0, "step": 1833 }, { "epoch": 0.9328585961342828, "grad_norm": 1.1650604009628296, "learning_rate": 1e-05, "loss": 0.5194, "mean_token_accuracy": 0.8382998704910278, "num_tokens": 292214056.0, "step": 1834 }, { "epoch": 0.9333672431332655, "grad_norm": 1.0645188093185425, "learning_rate": 1e-05, "loss": 0.5278, "mean_token_accuracy": 0.8340986371040344, "num_tokens": 292349124.0, "step": 1835 }, { "epoch": 0.9338758901322483, "grad_norm": 1.0586034059524536, "learning_rate": 1e-05, "loss": 0.5013, "mean_token_accuracy": 0.8424099087715149, "num_tokens": 292505385.0, "step": 1836 }, { "epoch": 0.9343845371312309, "grad_norm": 1.0245041847229004, "learning_rate": 1e-05, "loss": 0.5198, "mean_token_accuracy": 0.8401522636413574, "num_tokens": 292665526.0, "step": 1837 }, { "epoch": 0.9348931841302136, "grad_norm": 0.9853573441505432, "learning_rate": 1e-05, "loss": 0.5365, "mean_token_accuracy": 0.8344807624816895, "num_tokens": 292829795.0, "step": 1838 }, { "epoch": 0.9354018311291964, "grad_norm": 1.0970759391784668, "learning_rate": 1e-05, "loss": 0.5013, "mean_token_accuracy": 0.8453795313835144, "num_tokens": 292987496.0, "step": 1839 }, { "epoch": 0.9359104781281791, "grad_norm": 0.9906995892524719, "learning_rate": 1e-05, "loss": 0.4999, "mean_token_accuracy": 0.842957079410553, "num_tokens": 293144553.0, "step": 1840 }, { "epoch": 0.9364191251271617, "grad_norm": 1.069620966911316, "learning_rate": 1e-05, "loss": 0.5448, "mean_token_accuracy": 0.8327444791793823, "num_tokens": 293302965.0, "step": 1841 }, { "epoch": 0.9369277721261444, "grad_norm": 0.9693588614463806, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8441400527954102, "num_tokens": 293463953.0, "step": 1842 }, { "epoch": 0.9374364191251272, "grad_norm": 1.0056179761886597, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.842492401599884, "num_tokens": 293629039.0, "step": 1843 }, { "epoch": 0.9379450661241099, "grad_norm": 0.977562665939331, "learning_rate": 1e-05, "loss": 0.4593, "mean_token_accuracy": 0.8544231057167053, "num_tokens": 293773864.0, "step": 1844 }, { "epoch": 0.9384537131230926, "grad_norm": 1.0503692626953125, "learning_rate": 1e-05, "loss": 0.5065, "mean_token_accuracy": 0.8430094718933105, "num_tokens": 293937000.0, "step": 1845 }, { "epoch": 0.9389623601220752, "grad_norm": 0.9278028011322021, "learning_rate": 1e-05, "loss": 0.4832, "mean_token_accuracy": 0.8477224111557007, "num_tokens": 294103688.0, "step": 1846 }, { "epoch": 0.939471007121058, "grad_norm": 0.9603849053382874, "learning_rate": 1e-05, "loss": 0.4943, "mean_token_accuracy": 0.8472484350204468, "num_tokens": 294266886.0, "step": 1847 }, { "epoch": 0.9399796541200407, "grad_norm": 0.9893496036529541, "learning_rate": 1e-05, "loss": 0.4778, "mean_token_accuracy": 0.8487132787704468, "num_tokens": 294425536.0, "step": 1848 }, { "epoch": 0.9404883011190234, "grad_norm": 0.9785003662109375, "learning_rate": 1e-05, "loss": 0.5227, "mean_token_accuracy": 0.8382445573806763, "num_tokens": 294590856.0, "step": 1849 }, { "epoch": 0.940996948118006, "grad_norm": 1.0731697082519531, "learning_rate": 1e-05, "loss": 0.5099, "mean_token_accuracy": 0.841046929359436, "num_tokens": 294743889.0, "step": 1850 }, { "epoch": 0.9415055951169888, "grad_norm": 1.0036062002182007, "learning_rate": 1e-05, "loss": 0.5033, "mean_token_accuracy": 0.8415602445602417, "num_tokens": 294899666.0, "step": 1851 }, { "epoch": 0.9420142421159715, "grad_norm": 0.9595556855201721, "learning_rate": 1e-05, "loss": 0.5585, "mean_token_accuracy": 0.8298701047897339, "num_tokens": 295078964.0, "step": 1852 }, { "epoch": 0.9425228891149542, "grad_norm": 1.0190085172653198, "learning_rate": 1e-05, "loss": 0.4816, "mean_token_accuracy": 0.8475434184074402, "num_tokens": 295221135.0, "step": 1853 }, { "epoch": 0.943031536113937, "grad_norm": 1.0703707933425903, "learning_rate": 1e-05, "loss": 0.4964, "mean_token_accuracy": 0.8452032804489136, "num_tokens": 295382594.0, "step": 1854 }, { "epoch": 0.9435401831129197, "grad_norm": 1.0177465677261353, "learning_rate": 1e-05, "loss": 0.5324, "mean_token_accuracy": 0.8352153301239014, "num_tokens": 295541772.0, "step": 1855 }, { "epoch": 0.9440488301119023, "grad_norm": 1.011955738067627, "learning_rate": 1e-05, "loss": 0.4917, "mean_token_accuracy": 0.8448024988174438, "num_tokens": 295694723.0, "step": 1856 }, { "epoch": 0.944557477110885, "grad_norm": 0.9742183685302734, "learning_rate": 1e-05, "loss": 0.4873, "mean_token_accuracy": 0.8474919199943542, "num_tokens": 295848132.0, "step": 1857 }, { "epoch": 0.9450661241098678, "grad_norm": 1.0941163301467896, "learning_rate": 1e-05, "loss": 0.5045, "mean_token_accuracy": 0.8426496386528015, "num_tokens": 295998034.0, "step": 1858 }, { "epoch": 0.9455747711088505, "grad_norm": 0.9956144690513611, "learning_rate": 1e-05, "loss": 0.5207, "mean_token_accuracy": 0.8375461101531982, "num_tokens": 296142665.0, "step": 1859 }, { "epoch": 0.9460834181078331, "grad_norm": 0.9501311182975769, "learning_rate": 1e-05, "loss": 0.4997, "mean_token_accuracy": 0.8430957794189453, "num_tokens": 296290081.0, "step": 1860 }, { "epoch": 0.9465920651068158, "grad_norm": 1.0607683658599854, "learning_rate": 1e-05, "loss": 0.5181, "mean_token_accuracy": 0.8405077457427979, "num_tokens": 296439741.0, "step": 1861 }, { "epoch": 0.9471007121057986, "grad_norm": 0.9842624664306641, "learning_rate": 1e-05, "loss": 0.49, "mean_token_accuracy": 0.8464711904525757, "num_tokens": 296598043.0, "step": 1862 }, { "epoch": 0.9476093591047813, "grad_norm": 1.018387794494629, "learning_rate": 1e-05, "loss": 0.4958, "mean_token_accuracy": 0.8460543751716614, "num_tokens": 296748422.0, "step": 1863 }, { "epoch": 0.948118006103764, "grad_norm": 1.0162421464920044, "learning_rate": 1e-05, "loss": 0.5107, "mean_token_accuracy": 0.8393254280090332, "num_tokens": 296907598.0, "step": 1864 }, { "epoch": 0.9486266531027467, "grad_norm": 1.0726654529571533, "learning_rate": 1e-05, "loss": 0.5236, "mean_token_accuracy": 0.8362458348274231, "num_tokens": 297068522.0, "step": 1865 }, { "epoch": 0.9491353001017294, "grad_norm": 0.9776252508163452, "learning_rate": 1e-05, "loss": 0.4941, "mean_token_accuracy": 0.8452169895172119, "num_tokens": 297229538.0, "step": 1866 }, { "epoch": 0.9496439471007121, "grad_norm": 1.0472756624221802, "learning_rate": 1e-05, "loss": 0.5119, "mean_token_accuracy": 0.8398129940032959, "num_tokens": 297390837.0, "step": 1867 }, { "epoch": 0.9501525940996948, "grad_norm": 1.1011399030685425, "learning_rate": 1e-05, "loss": 0.5551, "mean_token_accuracy": 0.8309721946716309, "num_tokens": 297545849.0, "step": 1868 }, { "epoch": 0.9506612410986776, "grad_norm": 1.0296998023986816, "learning_rate": 1e-05, "loss": 0.4996, "mean_token_accuracy": 0.8422638177871704, "num_tokens": 297702474.0, "step": 1869 }, { "epoch": 0.9511698880976602, "grad_norm": 1.101548194885254, "learning_rate": 1e-05, "loss": 0.5465, "mean_token_accuracy": 0.831012487411499, "num_tokens": 297857385.0, "step": 1870 }, { "epoch": 0.9516785350966429, "grad_norm": 1.0518685579299927, "learning_rate": 1e-05, "loss": 0.4988, "mean_token_accuracy": 0.8414741158485413, "num_tokens": 298007755.0, "step": 1871 }, { "epoch": 0.9521871820956256, "grad_norm": 1.0469207763671875, "learning_rate": 1e-05, "loss": 0.5145, "mean_token_accuracy": 0.8405040502548218, "num_tokens": 298178437.0, "step": 1872 }, { "epoch": 0.9526958290946084, "grad_norm": 1.1140328645706177, "learning_rate": 1e-05, "loss": 0.5155, "mean_token_accuracy": 0.8406937122344971, "num_tokens": 298335117.0, "step": 1873 }, { "epoch": 0.953204476093591, "grad_norm": 1.0772130489349365, "learning_rate": 1e-05, "loss": 0.4882, "mean_token_accuracy": 0.8448014855384827, "num_tokens": 298502544.0, "step": 1874 }, { "epoch": 0.9537131230925737, "grad_norm": 1.6122941970825195, "learning_rate": 1e-05, "loss": 0.5114, "mean_token_accuracy": 0.8376173377037048, "num_tokens": 298660517.0, "step": 1875 }, { "epoch": 0.9542217700915565, "grad_norm": 1.0106393098831177, "learning_rate": 1e-05, "loss": 0.5264, "mean_token_accuracy": 0.8369991779327393, "num_tokens": 298813928.0, "step": 1876 }, { "epoch": 0.9547304170905392, "grad_norm": 0.9368502497673035, "learning_rate": 1e-05, "loss": 0.4926, "mean_token_accuracy": 0.8439090251922607, "num_tokens": 298982892.0, "step": 1877 }, { "epoch": 0.9552390640895219, "grad_norm": 0.9868039488792419, "learning_rate": 1e-05, "loss": 0.4975, "mean_token_accuracy": 0.8426105976104736, "num_tokens": 299145054.0, "step": 1878 }, { "epoch": 0.9557477110885045, "grad_norm": 0.9505625367164612, "learning_rate": 1e-05, "loss": 0.4993, "mean_token_accuracy": 0.8433289527893066, "num_tokens": 299310640.0, "step": 1879 }, { "epoch": 0.9562563580874873, "grad_norm": 1.135102391242981, "learning_rate": 1e-05, "loss": 0.5154, "mean_token_accuracy": 0.8381885290145874, "num_tokens": 299484449.0, "step": 1880 }, { "epoch": 0.95676500508647, "grad_norm": 1.0819809436798096, "learning_rate": 1e-05, "loss": 0.5274, "mean_token_accuracy": 0.8357242345809937, "num_tokens": 299657556.0, "step": 1881 }, { "epoch": 0.9572736520854527, "grad_norm": 1.0308226346969604, "learning_rate": 1e-05, "loss": 0.5189, "mean_token_accuracy": 0.8379658460617065, "num_tokens": 299821866.0, "step": 1882 }, { "epoch": 0.9577822990844354, "grad_norm": 1.0907171964645386, "learning_rate": 1e-05, "loss": 0.5252, "mean_token_accuracy": 0.8378491401672363, "num_tokens": 299975575.0, "step": 1883 }, { "epoch": 0.9582909460834181, "grad_norm": 1.1639477014541626, "learning_rate": 1e-05, "loss": 0.5145, "mean_token_accuracy": 0.8402724862098694, "num_tokens": 300122149.0, "step": 1884 }, { "epoch": 0.9587995930824008, "grad_norm": 0.9936185479164124, "learning_rate": 1e-05, "loss": 0.5247, "mean_token_accuracy": 0.8354524374008179, "num_tokens": 300275629.0, "step": 1885 }, { "epoch": 0.9593082400813835, "grad_norm": 1.031510829925537, "learning_rate": 1e-05, "loss": 0.5151, "mean_token_accuracy": 0.8391316533088684, "num_tokens": 300434761.0, "step": 1886 }, { "epoch": 0.9598168870803663, "grad_norm": 1.049988865852356, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8404117822647095, "num_tokens": 300595753.0, "step": 1887 }, { "epoch": 0.960325534079349, "grad_norm": 1.060204029083252, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8364053964614868, "num_tokens": 300761476.0, "step": 1888 }, { "epoch": 0.9608341810783316, "grad_norm": 1.0777702331542969, "learning_rate": 1e-05, "loss": 0.4798, "mean_token_accuracy": 0.8494613170623779, "num_tokens": 300907255.0, "step": 1889 }, { "epoch": 0.9613428280773143, "grad_norm": 0.9640669822692871, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.8406792283058167, "num_tokens": 301078714.0, "step": 1890 }, { "epoch": 0.9618514750762971, "grad_norm": 1.027019739151001, "learning_rate": 1e-05, "loss": 0.5121, "mean_token_accuracy": 0.8419984579086304, "num_tokens": 301235054.0, "step": 1891 }, { "epoch": 0.9623601220752798, "grad_norm": 1.0344923734664917, "learning_rate": 1e-05, "loss": 0.5137, "mean_token_accuracy": 0.8401339054107666, "num_tokens": 301390887.0, "step": 1892 }, { "epoch": 0.9628687690742624, "grad_norm": 0.9816353917121887, "learning_rate": 1e-05, "loss": 0.526, "mean_token_accuracy": 0.8368819952011108, "num_tokens": 301553051.0, "step": 1893 }, { "epoch": 0.9633774160732451, "grad_norm": 1.041377067565918, "learning_rate": 1e-05, "loss": 0.5005, "mean_token_accuracy": 0.8439712524414062, "num_tokens": 301709240.0, "step": 1894 }, { "epoch": 0.9638860630722279, "grad_norm": 0.9798972010612488, "learning_rate": 1e-05, "loss": 0.5152, "mean_token_accuracy": 0.8388742804527283, "num_tokens": 301873555.0, "step": 1895 }, { "epoch": 0.9643947100712106, "grad_norm": 1.0527175664901733, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.8380733728408813, "num_tokens": 302020968.0, "step": 1896 }, { "epoch": 0.9649033570701933, "grad_norm": 1.01810622215271, "learning_rate": 1e-05, "loss": 0.5486, "mean_token_accuracy": 0.8303346037864685, "num_tokens": 302178645.0, "step": 1897 }, { "epoch": 0.965412004069176, "grad_norm": 1.0569944381713867, "learning_rate": 1e-05, "loss": 0.5532, "mean_token_accuracy": 0.8283469676971436, "num_tokens": 302349196.0, "step": 1898 }, { "epoch": 0.9659206510681587, "grad_norm": 1.0374387502670288, "learning_rate": 1e-05, "loss": 0.5273, "mean_token_accuracy": 0.8351933360099792, "num_tokens": 302506292.0, "step": 1899 }, { "epoch": 0.9664292980671414, "grad_norm": 1.0482120513916016, "learning_rate": 1e-05, "loss": 0.5322, "mean_token_accuracy": 0.8335620164871216, "num_tokens": 302661848.0, "step": 1900 }, { "epoch": 0.9669379450661241, "grad_norm": 0.978890597820282, "learning_rate": 1e-05, "loss": 0.5473, "mean_token_accuracy": 0.8288494944572449, "num_tokens": 302822413.0, "step": 1901 }, { "epoch": 0.9674465920651069, "grad_norm": 0.9532626867294312, "learning_rate": 1e-05, "loss": 0.4979, "mean_token_accuracy": 0.8441656827926636, "num_tokens": 302985335.0, "step": 1902 }, { "epoch": 0.9679552390640895, "grad_norm": 1.0545532703399658, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.843116044998169, "num_tokens": 303149395.0, "step": 1903 }, { "epoch": 0.9684638860630722, "grad_norm": 1.0217972993850708, "learning_rate": 1e-05, "loss": 0.4876, "mean_token_accuracy": 0.8471866846084595, "num_tokens": 303312885.0, "step": 1904 }, { "epoch": 0.9689725330620549, "grad_norm": 0.9480841159820557, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8366263508796692, "num_tokens": 303478385.0, "step": 1905 }, { "epoch": 0.9694811800610377, "grad_norm": 1.0249661207199097, "learning_rate": 1e-05, "loss": 0.5149, "mean_token_accuracy": 0.8401471972465515, "num_tokens": 303632397.0, "step": 1906 }, { "epoch": 0.9699898270600203, "grad_norm": 0.9777758121490479, "learning_rate": 1e-05, "loss": 0.5155, "mean_token_accuracy": 0.8376993536949158, "num_tokens": 303793145.0, "step": 1907 }, { "epoch": 0.970498474059003, "grad_norm": 1.0128408670425415, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.833324670791626, "num_tokens": 303954402.0, "step": 1908 }, { "epoch": 0.9710071210579858, "grad_norm": 1.0025849342346191, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.837595522403717, "num_tokens": 304115485.0, "step": 1909 }, { "epoch": 0.9715157680569685, "grad_norm": 0.9397594332695007, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8387343287467957, "num_tokens": 304285608.0, "step": 1910 }, { "epoch": 0.9720244150559512, "grad_norm": 0.9547067284584045, "learning_rate": 1e-05, "loss": 0.4658, "mean_token_accuracy": 0.8515326976776123, "num_tokens": 304444429.0, "step": 1911 }, { "epoch": 0.9725330620549338, "grad_norm": 1.0898922681808472, "learning_rate": 1e-05, "loss": 0.4982, "mean_token_accuracy": 0.8440953493118286, "num_tokens": 304604968.0, "step": 1912 }, { "epoch": 0.9730417090539166, "grad_norm": 1.0754295587539673, "learning_rate": 1e-05, "loss": 0.4895, "mean_token_accuracy": 0.8468303084373474, "num_tokens": 304762895.0, "step": 1913 }, { "epoch": 0.9735503560528993, "grad_norm": 1.0889835357666016, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8393197059631348, "num_tokens": 304919739.0, "step": 1914 }, { "epoch": 0.974059003051882, "grad_norm": 0.9590617418289185, "learning_rate": 1e-05, "loss": 0.5234, "mean_token_accuracy": 0.8376962542533875, "num_tokens": 305084650.0, "step": 1915 }, { "epoch": 0.9745676500508647, "grad_norm": 0.9880885481834412, "learning_rate": 1e-05, "loss": 0.5042, "mean_token_accuracy": 0.8409290313720703, "num_tokens": 305245025.0, "step": 1916 }, { "epoch": 0.9750762970498474, "grad_norm": 0.996501088142395, "learning_rate": 1e-05, "loss": 0.5005, "mean_token_accuracy": 0.8432765007019043, "num_tokens": 305403475.0, "step": 1917 }, { "epoch": 0.9755849440488301, "grad_norm": 0.9220758080482483, "learning_rate": 1e-05, "loss": 0.5154, "mean_token_accuracy": 0.8385381102561951, "num_tokens": 305554045.0, "step": 1918 }, { "epoch": 0.9760935910478128, "grad_norm": 1.0082985162734985, "learning_rate": 1e-05, "loss": 0.4907, "mean_token_accuracy": 0.846260666847229, "num_tokens": 305712579.0, "step": 1919 }, { "epoch": 0.9766022380467956, "grad_norm": 0.987065851688385, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8460931777954102, "num_tokens": 305880937.0, "step": 1920 }, { "epoch": 0.9771108850457783, "grad_norm": 0.9880823493003845, "learning_rate": 1e-05, "loss": 0.4994, "mean_token_accuracy": 0.8439615964889526, "num_tokens": 306037487.0, "step": 1921 }, { "epoch": 0.9776195320447609, "grad_norm": 0.9675670862197876, "learning_rate": 1e-05, "loss": 0.5101, "mean_token_accuracy": 0.8417237401008606, "num_tokens": 306205585.0, "step": 1922 }, { "epoch": 0.9781281790437436, "grad_norm": 1.1072001457214355, "learning_rate": 1e-05, "loss": 0.5216, "mean_token_accuracy": 0.8380455374717712, "num_tokens": 306363764.0, "step": 1923 }, { "epoch": 0.9786368260427264, "grad_norm": 0.98995441198349, "learning_rate": 1e-05, "loss": 0.501, "mean_token_accuracy": 0.843612551689148, "num_tokens": 306521564.0, "step": 1924 }, { "epoch": 0.9791454730417091, "grad_norm": 1.060228705406189, "learning_rate": 1e-05, "loss": 0.5014, "mean_token_accuracy": 0.8430065512657166, "num_tokens": 306674429.0, "step": 1925 }, { "epoch": 0.9796541200406917, "grad_norm": 1.018702507019043, "learning_rate": 1e-05, "loss": 0.5409, "mean_token_accuracy": 0.8309513330459595, "num_tokens": 306832682.0, "step": 1926 }, { "epoch": 0.9801627670396744, "grad_norm": 1.13094961643219, "learning_rate": 1e-05, "loss": 0.481, "mean_token_accuracy": 0.8488995432853699, "num_tokens": 306990015.0, "step": 1927 }, { "epoch": 0.9806714140386572, "grad_norm": 0.9446847438812256, "learning_rate": 1e-05, "loss": 0.5033, "mean_token_accuracy": 0.8416002988815308, "num_tokens": 307155638.0, "step": 1928 }, { "epoch": 0.9811800610376399, "grad_norm": 1.0495901107788086, "learning_rate": 1e-05, "loss": 0.4828, "mean_token_accuracy": 0.8481249809265137, "num_tokens": 307317052.0, "step": 1929 }, { "epoch": 0.9816887080366226, "grad_norm": 0.997715950012207, "learning_rate": 1e-05, "loss": 0.5117, "mean_token_accuracy": 0.8404562473297119, "num_tokens": 307461975.0, "step": 1930 }, { "epoch": 0.9821973550356052, "grad_norm": 1.0938962697982788, "learning_rate": 1e-05, "loss": 0.5479, "mean_token_accuracy": 0.8287118673324585, "num_tokens": 307618995.0, "step": 1931 }, { "epoch": 0.982706002034588, "grad_norm": 1.0079737901687622, "learning_rate": 1e-05, "loss": 0.5047, "mean_token_accuracy": 0.8424063920974731, "num_tokens": 307787106.0, "step": 1932 }, { "epoch": 0.9832146490335707, "grad_norm": 0.9513643980026245, "learning_rate": 1e-05, "loss": 0.5141, "mean_token_accuracy": 0.8406564593315125, "num_tokens": 307957991.0, "step": 1933 }, { "epoch": 0.9837232960325534, "grad_norm": 1.1360417604446411, "learning_rate": 1e-05, "loss": 0.531, "mean_token_accuracy": 0.8353432416915894, "num_tokens": 308114670.0, "step": 1934 }, { "epoch": 0.9842319430315362, "grad_norm": 1.0051788091659546, "learning_rate": 1e-05, "loss": 0.5346, "mean_token_accuracy": 0.8331654071807861, "num_tokens": 308273482.0, "step": 1935 }, { "epoch": 0.9847405900305188, "grad_norm": 0.9172512292861938, "learning_rate": 1e-05, "loss": 0.5107, "mean_token_accuracy": 0.8405485153198242, "num_tokens": 308438717.0, "step": 1936 }, { "epoch": 0.9852492370295015, "grad_norm": 1.0240017175674438, "learning_rate": 1e-05, "loss": 0.4711, "mean_token_accuracy": 0.8513554334640503, "num_tokens": 308606239.0, "step": 1937 }, { "epoch": 0.9857578840284842, "grad_norm": 0.9732988476753235, "learning_rate": 1e-05, "loss": 0.513, "mean_token_accuracy": 0.8397548198699951, "num_tokens": 308770389.0, "step": 1938 }, { "epoch": 0.986266531027467, "grad_norm": 1.0476912260055542, "learning_rate": 1e-05, "loss": 0.5123, "mean_token_accuracy": 0.8402565121650696, "num_tokens": 308938986.0, "step": 1939 }, { "epoch": 0.9867751780264497, "grad_norm": 0.9261658787727356, "learning_rate": 1e-05, "loss": 0.4896, "mean_token_accuracy": 0.8465872406959534, "num_tokens": 309098467.0, "step": 1940 }, { "epoch": 0.9872838250254323, "grad_norm": 1.0257370471954346, "learning_rate": 1e-05, "loss": 0.5159, "mean_token_accuracy": 0.837329626083374, "num_tokens": 309271767.0, "step": 1941 }, { "epoch": 0.987792472024415, "grad_norm": 0.9949601292610168, "learning_rate": 1e-05, "loss": 0.5321, "mean_token_accuracy": 0.8342413902282715, "num_tokens": 309430767.0, "step": 1942 }, { "epoch": 0.9883011190233978, "grad_norm": 1.031502604484558, "learning_rate": 1e-05, "loss": 0.5133, "mean_token_accuracy": 0.8384732604026794, "num_tokens": 309588008.0, "step": 1943 }, { "epoch": 0.9888097660223805, "grad_norm": 0.976768434047699, "learning_rate": 1e-05, "loss": 0.5165, "mean_token_accuracy": 0.8383283615112305, "num_tokens": 309743337.0, "step": 1944 }, { "epoch": 0.9893184130213631, "grad_norm": 1.033813714981079, "learning_rate": 1e-05, "loss": 0.5188, "mean_token_accuracy": 0.8372408747673035, "num_tokens": 309895383.0, "step": 1945 }, { "epoch": 0.9898270600203459, "grad_norm": 1.203722596168518, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.8340418338775635, "num_tokens": 310051593.0, "step": 1946 }, { "epoch": 0.9903357070193286, "grad_norm": 1.0881632566452026, "learning_rate": 1e-05, "loss": 0.5241, "mean_token_accuracy": 0.8362948894500732, "num_tokens": 310222144.0, "step": 1947 }, { "epoch": 0.9908443540183113, "grad_norm": 0.967003345489502, "learning_rate": 1e-05, "loss": 0.487, "mean_token_accuracy": 0.8465834259986877, "num_tokens": 310376369.0, "step": 1948 }, { "epoch": 0.991353001017294, "grad_norm": 0.9813733696937561, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8346635103225708, "num_tokens": 310540773.0, "step": 1949 }, { "epoch": 0.9918616480162767, "grad_norm": 1.0921562910079956, "learning_rate": 1e-05, "loss": 0.5189, "mean_token_accuracy": 0.8375669121742249, "num_tokens": 310705795.0, "step": 1950 }, { "epoch": 0.9923702950152594, "grad_norm": 0.9453521966934204, "learning_rate": 1e-05, "loss": 0.4979, "mean_token_accuracy": 0.8441818356513977, "num_tokens": 310870435.0, "step": 1951 }, { "epoch": 0.9928789420142421, "grad_norm": 0.9923408031463623, "learning_rate": 1e-05, "loss": 0.5326, "mean_token_accuracy": 0.833539605140686, "num_tokens": 311023220.0, "step": 1952 }, { "epoch": 0.9933875890132248, "grad_norm": 0.971986711025238, "learning_rate": 1e-05, "loss": 0.5149, "mean_token_accuracy": 0.8401809334754944, "num_tokens": 311192559.0, "step": 1953 }, { "epoch": 0.9938962360122076, "grad_norm": 1.054033637046814, "learning_rate": 1e-05, "loss": 0.5368, "mean_token_accuracy": 0.8325077891349792, "num_tokens": 311361294.0, "step": 1954 }, { "epoch": 0.9944048830111902, "grad_norm": 1.0171631574630737, "learning_rate": 1e-05, "loss": 0.5425, "mean_token_accuracy": 0.830352783203125, "num_tokens": 311513624.0, "step": 1955 }, { "epoch": 0.9949135300101729, "grad_norm": 1.076941967010498, "learning_rate": 1e-05, "loss": 0.5534, "mean_token_accuracy": 0.8284592032432556, "num_tokens": 311682933.0, "step": 1956 }, { "epoch": 0.9954221770091557, "grad_norm": 1.0492786169052124, "learning_rate": 1e-05, "loss": 0.5291, "mean_token_accuracy": 0.8351242542266846, "num_tokens": 311836701.0, "step": 1957 }, { "epoch": 0.9959308240081384, "grad_norm": 0.9567775726318359, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.8453713655471802, "num_tokens": 311997757.0, "step": 1958 }, { "epoch": 0.996439471007121, "grad_norm": 1.0214710235595703, "learning_rate": 1e-05, "loss": 0.5105, "mean_token_accuracy": 0.840237021446228, "num_tokens": 312141716.0, "step": 1959 }, { "epoch": 0.9969481180061037, "grad_norm": 0.9785928130149841, "learning_rate": 1e-05, "loss": 0.5123, "mean_token_accuracy": 0.8398464918136597, "num_tokens": 312300232.0, "step": 1960 }, { "epoch": 0.9974567650050865, "grad_norm": 0.9771664142608643, "learning_rate": 1e-05, "loss": 0.5236, "mean_token_accuracy": 0.8375827670097351, "num_tokens": 312460325.0, "step": 1961 }, { "epoch": 0.9979654120040692, "grad_norm": 0.9888442754745483, "learning_rate": 1e-05, "loss": 0.5002, "mean_token_accuracy": 0.844666600227356, "num_tokens": 312617272.0, "step": 1962 }, { "epoch": 0.9984740590030519, "grad_norm": 0.9254450798034668, "learning_rate": 1e-05, "loss": 0.5047, "mean_token_accuracy": 0.8417128324508667, "num_tokens": 312782970.0, "step": 1963 }, { "epoch": 0.9989827060020345, "grad_norm": 1.0160105228424072, "learning_rate": 1e-05, "loss": 0.5225, "mean_token_accuracy": 0.8384478092193604, "num_tokens": 312940535.0, "step": 1964 }, { "epoch": 0.9994913530010173, "grad_norm": 0.9914231896400452, "learning_rate": 1e-05, "loss": 0.5035, "mean_token_accuracy": 0.8427067995071411, "num_tokens": 313094817.0, "step": 1965 }, { "epoch": 1.0, "grad_norm": 1.0151771306991577, "learning_rate": 1e-05, "loss": 0.5262, "mean_token_accuracy": 0.8351953625679016, "num_tokens": 313255150.0, "step": 1966 }, { "epoch": 1.0005086469989828, "grad_norm": 1.1044137477874756, "learning_rate": 1e-05, "loss": 0.4927, "mean_token_accuracy": 0.8446985483169556, "num_tokens": 313416474.0, "step": 1967 }, { "epoch": 1.0010172939979654, "grad_norm": 1.0383646488189697, "learning_rate": 1e-05, "loss": 0.4672, "mean_token_accuracy": 0.8528542518615723, "num_tokens": 313576480.0, "step": 1968 }, { "epoch": 1.0015259409969481, "grad_norm": 1.0455737113952637, "learning_rate": 1e-05, "loss": 0.5156, "mean_token_accuracy": 0.8373713493347168, "num_tokens": 313725157.0, "step": 1969 }, { "epoch": 1.002034587995931, "grad_norm": 1.0510598421096802, "learning_rate": 1e-05, "loss": 0.5082, "mean_token_accuracy": 0.8405560851097107, "num_tokens": 313894362.0, "step": 1970 }, { "epoch": 1.0025432349949135, "grad_norm": 1.1204897165298462, "learning_rate": 1e-05, "loss": 0.506, "mean_token_accuracy": 0.840548038482666, "num_tokens": 314046204.0, "step": 1971 }, { "epoch": 1.0030518819938963, "grad_norm": 1.0423732995986938, "learning_rate": 1e-05, "loss": 0.471, "mean_token_accuracy": 0.849092960357666, "num_tokens": 314199033.0, "step": 1972 }, { "epoch": 1.0035605289928788, "grad_norm": 0.9673753380775452, "learning_rate": 1e-05, "loss": 0.4707, "mean_token_accuracy": 0.8502670526504517, "num_tokens": 314357461.0, "step": 1973 }, { "epoch": 1.0040691759918616, "grad_norm": 1.0575178861618042, "learning_rate": 1e-05, "loss": 0.4874, "mean_token_accuracy": 0.8461350202560425, "num_tokens": 314529806.0, "step": 1974 }, { "epoch": 1.0045778229908444, "grad_norm": 1.0000046491622925, "learning_rate": 1e-05, "loss": 0.4759, "mean_token_accuracy": 0.8510136008262634, "num_tokens": 314685706.0, "step": 1975 }, { "epoch": 1.005086469989827, "grad_norm": 0.9816222786903381, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8437051773071289, "num_tokens": 314836980.0, "step": 1976 }, { "epoch": 1.0055951169888098, "grad_norm": 0.9830356240272522, "learning_rate": 1e-05, "loss": 0.507, "mean_token_accuracy": 0.8409254550933838, "num_tokens": 315004512.0, "step": 1977 }, { "epoch": 1.0061037639877926, "grad_norm": 1.0614138841629028, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8373826742172241, "num_tokens": 315155851.0, "step": 1978 }, { "epoch": 1.0066124109867751, "grad_norm": 1.0838987827301025, "learning_rate": 1e-05, "loss": 0.4765, "mean_token_accuracy": 0.8484848141670227, "num_tokens": 315323264.0, "step": 1979 }, { "epoch": 1.007121057985758, "grad_norm": 1.0375914573669434, "learning_rate": 1e-05, "loss": 0.4774, "mean_token_accuracy": 0.8493205904960632, "num_tokens": 315477553.0, "step": 1980 }, { "epoch": 1.0076297049847407, "grad_norm": 1.0500863790512085, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8408976793289185, "num_tokens": 315636869.0, "step": 1981 }, { "epoch": 1.0081383519837233, "grad_norm": 1.0597950220108032, "learning_rate": 1e-05, "loss": 0.4785, "mean_token_accuracy": 0.8486466407775879, "num_tokens": 315789205.0, "step": 1982 }, { "epoch": 1.008646998982706, "grad_norm": 1.0866880416870117, "learning_rate": 1e-05, "loss": 0.4856, "mean_token_accuracy": 0.8461682200431824, "num_tokens": 315949798.0, "step": 1983 }, { "epoch": 1.0091556459816886, "grad_norm": 1.0247963666915894, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.843558132648468, "num_tokens": 316108041.0, "step": 1984 }, { "epoch": 1.0096642929806714, "grad_norm": 1.1197168827056885, "learning_rate": 1e-05, "loss": 0.4658, "mean_token_accuracy": 0.850692629814148, "num_tokens": 316271100.0, "step": 1985 }, { "epoch": 1.0101729399796542, "grad_norm": 1.0855802297592163, "learning_rate": 1e-05, "loss": 0.51, "mean_token_accuracy": 0.840994656085968, "num_tokens": 316433915.0, "step": 1986 }, { "epoch": 1.0106815869786367, "grad_norm": 1.0583574771881104, "learning_rate": 1e-05, "loss": 0.5189, "mean_token_accuracy": 0.8364496231079102, "num_tokens": 316609023.0, "step": 1987 }, { "epoch": 1.0111902339776195, "grad_norm": 0.9981748461723328, "learning_rate": 1e-05, "loss": 0.4573, "mean_token_accuracy": 0.8547645211219788, "num_tokens": 316764032.0, "step": 1988 }, { "epoch": 1.0116988809766023, "grad_norm": 0.9980987906455994, "learning_rate": 1e-05, "loss": 0.5112, "mean_token_accuracy": 0.8400788307189941, "num_tokens": 316924785.0, "step": 1989 }, { "epoch": 1.0122075279755849, "grad_norm": 1.0092408657073975, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8480200171470642, "num_tokens": 317068693.0, "step": 1990 }, { "epoch": 1.0127161749745677, "grad_norm": 1.148630142211914, "learning_rate": 1e-05, "loss": 0.4868, "mean_token_accuracy": 0.846847653388977, "num_tokens": 317231900.0, "step": 1991 }, { "epoch": 1.0132248219735505, "grad_norm": 1.0388243198394775, "learning_rate": 1e-05, "loss": 0.4764, "mean_token_accuracy": 0.8494249582290649, "num_tokens": 317399513.0, "step": 1992 }, { "epoch": 1.013733468972533, "grad_norm": 0.9582164883613586, "learning_rate": 1e-05, "loss": 0.4889, "mean_token_accuracy": 0.8472070693969727, "num_tokens": 317561644.0, "step": 1993 }, { "epoch": 1.0142421159715158, "grad_norm": 0.9729287028312683, "learning_rate": 1e-05, "loss": 0.4728, "mean_token_accuracy": 0.8492978811264038, "num_tokens": 317722640.0, "step": 1994 }, { "epoch": 1.0147507629704984, "grad_norm": 1.0420944690704346, "learning_rate": 1e-05, "loss": 0.4729, "mean_token_accuracy": 0.8504926562309265, "num_tokens": 317881374.0, "step": 1995 }, { "epoch": 1.0152594099694812, "grad_norm": 1.0152589082717896, "learning_rate": 1e-05, "loss": 0.4817, "mean_token_accuracy": 0.8495712280273438, "num_tokens": 318033418.0, "step": 1996 }, { "epoch": 1.015768056968464, "grad_norm": 1.0669312477111816, "learning_rate": 1e-05, "loss": 0.5092, "mean_token_accuracy": 0.8412264585494995, "num_tokens": 318183443.0, "step": 1997 }, { "epoch": 1.0162767039674465, "grad_norm": 1.1268723011016846, "learning_rate": 1e-05, "loss": 0.4913, "mean_token_accuracy": 0.845348596572876, "num_tokens": 318339258.0, "step": 1998 }, { "epoch": 1.0167853509664293, "grad_norm": 1.0353676080703735, "learning_rate": 1e-05, "loss": 0.4586, "mean_token_accuracy": 0.8535833954811096, "num_tokens": 318495267.0, "step": 1999 }, { "epoch": 1.017293997965412, "grad_norm": 1.0712075233459473, "learning_rate": 1e-05, "loss": 0.4798, "mean_token_accuracy": 0.848924994468689, "num_tokens": 318648663.0, "step": 2000 }, { "epoch": 1.0178026449643947, "grad_norm": 1.197359323501587, "learning_rate": 1e-05, "loss": 0.4963, "mean_token_accuracy": 0.8441604375839233, "num_tokens": 318803431.0, "step": 2001 }, { "epoch": 1.0183112919633774, "grad_norm": 1.0463975667953491, "learning_rate": 1e-05, "loss": 0.4914, "mean_token_accuracy": 0.8444896936416626, "num_tokens": 318960576.0, "step": 2002 }, { "epoch": 1.0188199389623602, "grad_norm": 1.0998681783676147, "learning_rate": 1e-05, "loss": 0.469, "mean_token_accuracy": 0.8531366586685181, "num_tokens": 319111546.0, "step": 2003 }, { "epoch": 1.0193285859613428, "grad_norm": 1.059656023979187, "learning_rate": 1e-05, "loss": 0.4952, "mean_token_accuracy": 0.8430195450782776, "num_tokens": 319276087.0, "step": 2004 }, { "epoch": 1.0198372329603256, "grad_norm": 1.1603281497955322, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8404607772827148, "num_tokens": 319423273.0, "step": 2005 }, { "epoch": 1.0203458799593081, "grad_norm": 1.0294604301452637, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8559034466743469, "num_tokens": 319576787.0, "step": 2006 }, { "epoch": 1.020854526958291, "grad_norm": 1.0385469198226929, "learning_rate": 1e-05, "loss": 0.4849, "mean_token_accuracy": 0.8464286923408508, "num_tokens": 319740271.0, "step": 2007 }, { "epoch": 1.0213631739572737, "grad_norm": 1.0036464929580688, "learning_rate": 1e-05, "loss": 0.4896, "mean_token_accuracy": 0.8454139828681946, "num_tokens": 319902904.0, "step": 2008 }, { "epoch": 1.0218718209562563, "grad_norm": 1.1799579858779907, "learning_rate": 1e-05, "loss": 0.5066, "mean_token_accuracy": 0.8418562412261963, "num_tokens": 320057699.0, "step": 2009 }, { "epoch": 1.022380467955239, "grad_norm": 1.1496026515960693, "learning_rate": 1e-05, "loss": 0.4769, "mean_token_accuracy": 0.8499530553817749, "num_tokens": 320212284.0, "step": 2010 }, { "epoch": 1.0228891149542219, "grad_norm": 0.9787399768829346, "learning_rate": 1e-05, "loss": 0.4947, "mean_token_accuracy": 0.8445889949798584, "num_tokens": 320374261.0, "step": 2011 }, { "epoch": 1.0233977619532044, "grad_norm": 1.1572648286819458, "learning_rate": 1e-05, "loss": 0.5078, "mean_token_accuracy": 0.8388811349868774, "num_tokens": 320529125.0, "step": 2012 }, { "epoch": 1.0239064089521872, "grad_norm": 1.0809093713760376, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8499780893325806, "num_tokens": 320672706.0, "step": 2013 }, { "epoch": 1.02441505595117, "grad_norm": 1.005975604057312, "learning_rate": 1e-05, "loss": 0.486, "mean_token_accuracy": 0.8479955196380615, "num_tokens": 320834351.0, "step": 2014 }, { "epoch": 1.0249237029501526, "grad_norm": 1.150314450263977, "learning_rate": 1e-05, "loss": 0.4988, "mean_token_accuracy": 0.841154158115387, "num_tokens": 321001419.0, "step": 2015 }, { "epoch": 1.0254323499491353, "grad_norm": 1.0990020036697388, "learning_rate": 1e-05, "loss": 0.4776, "mean_token_accuracy": 0.8472510576248169, "num_tokens": 321155407.0, "step": 2016 }, { "epoch": 1.025940996948118, "grad_norm": 1.1457676887512207, "learning_rate": 1e-05, "loss": 0.4997, "mean_token_accuracy": 0.8403406739234924, "num_tokens": 321311640.0, "step": 2017 }, { "epoch": 1.0264496439471007, "grad_norm": 1.0399061441421509, "learning_rate": 1e-05, "loss": 0.5103, "mean_token_accuracy": 0.8391156792640686, "num_tokens": 321466786.0, "step": 2018 }, { "epoch": 1.0269582909460835, "grad_norm": 1.0857439041137695, "learning_rate": 1e-05, "loss": 0.4786, "mean_token_accuracy": 0.8480024933815002, "num_tokens": 321630638.0, "step": 2019 }, { "epoch": 1.027466937945066, "grad_norm": 1.072295069694519, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8517623543739319, "num_tokens": 321794738.0, "step": 2020 }, { "epoch": 1.0279755849440488, "grad_norm": 1.0555133819580078, "learning_rate": 1e-05, "loss": 0.5279, "mean_token_accuracy": 0.8357231616973877, "num_tokens": 321953968.0, "step": 2021 }, { "epoch": 1.0284842319430316, "grad_norm": 1.181735873222351, "learning_rate": 1e-05, "loss": 0.4526, "mean_token_accuracy": 0.8548371195793152, "num_tokens": 322105270.0, "step": 2022 }, { "epoch": 1.0289928789420142, "grad_norm": 0.9960180521011353, "learning_rate": 1e-05, "loss": 0.4867, "mean_token_accuracy": 0.8455308079719543, "num_tokens": 322267010.0, "step": 2023 }, { "epoch": 1.029501525940997, "grad_norm": 1.0796343088150024, "learning_rate": 1e-05, "loss": 0.4888, "mean_token_accuracy": 0.8429710865020752, "num_tokens": 322430631.0, "step": 2024 }, { "epoch": 1.0300101729399798, "grad_norm": 1.0349819660186768, "learning_rate": 1e-05, "loss": 0.4803, "mean_token_accuracy": 0.8490273952484131, "num_tokens": 322593551.0, "step": 2025 }, { "epoch": 1.0305188199389623, "grad_norm": 0.9780552387237549, "learning_rate": 1e-05, "loss": 0.4559, "mean_token_accuracy": 0.8544877171516418, "num_tokens": 322762321.0, "step": 2026 }, { "epoch": 1.0310274669379451, "grad_norm": 1.1404180526733398, "learning_rate": 1e-05, "loss": 0.4898, "mean_token_accuracy": 0.8458755016326904, "num_tokens": 322922120.0, "step": 2027 }, { "epoch": 1.0315361139369277, "grad_norm": 0.9819681644439697, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.8418173789978027, "num_tokens": 323079150.0, "step": 2028 }, { "epoch": 1.0320447609359105, "grad_norm": 0.976266622543335, "learning_rate": 1e-05, "loss": 0.4823, "mean_token_accuracy": 0.8466558456420898, "num_tokens": 323238505.0, "step": 2029 }, { "epoch": 1.0325534079348933, "grad_norm": 0.9896778464317322, "learning_rate": 1e-05, "loss": 0.5024, "mean_token_accuracy": 0.8411461710929871, "num_tokens": 323409267.0, "step": 2030 }, { "epoch": 1.0330620549338758, "grad_norm": 1.007405161857605, "learning_rate": 1e-05, "loss": 0.5161, "mean_token_accuracy": 0.8370198011398315, "num_tokens": 323574623.0, "step": 2031 }, { "epoch": 1.0335707019328586, "grad_norm": 1.0376770496368408, "learning_rate": 1e-05, "loss": 0.4955, "mean_token_accuracy": 0.8449573516845703, "num_tokens": 323717438.0, "step": 2032 }, { "epoch": 1.0340793489318414, "grad_norm": 0.9660647511482239, "learning_rate": 1e-05, "loss": 0.4681, "mean_token_accuracy": 0.851511538028717, "num_tokens": 323876742.0, "step": 2033 }, { "epoch": 1.034587995930824, "grad_norm": 1.0799311399459839, "learning_rate": 1e-05, "loss": 0.5258, "mean_token_accuracy": 0.835211455821991, "num_tokens": 324033814.0, "step": 2034 }, { "epoch": 1.0350966429298067, "grad_norm": 1.0707277059555054, "learning_rate": 1e-05, "loss": 0.511, "mean_token_accuracy": 0.8382892608642578, "num_tokens": 324183869.0, "step": 2035 }, { "epoch": 1.0356052899287893, "grad_norm": 1.0144026279449463, "learning_rate": 1e-05, "loss": 0.4757, "mean_token_accuracy": 0.8489046096801758, "num_tokens": 324339938.0, "step": 2036 }, { "epoch": 1.036113936927772, "grad_norm": 0.952110230922699, "learning_rate": 1e-05, "loss": 0.5004, "mean_token_accuracy": 0.8435638546943665, "num_tokens": 324495300.0, "step": 2037 }, { "epoch": 1.0366225839267549, "grad_norm": 1.0803865194320679, "learning_rate": 1e-05, "loss": 0.485, "mean_token_accuracy": 0.8461806774139404, "num_tokens": 324649242.0, "step": 2038 }, { "epoch": 1.0371312309257374, "grad_norm": 1.0200896263122559, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8410135507583618, "num_tokens": 324807416.0, "step": 2039 }, { "epoch": 1.0376398779247202, "grad_norm": 1.008554458618164, "learning_rate": 1e-05, "loss": 0.4786, "mean_token_accuracy": 0.8479388952255249, "num_tokens": 324973812.0, "step": 2040 }, { "epoch": 1.038148524923703, "grad_norm": 0.9462720155715942, "learning_rate": 1e-05, "loss": 0.4948, "mean_token_accuracy": 0.8447370529174805, "num_tokens": 325141500.0, "step": 2041 }, { "epoch": 1.0386571719226856, "grad_norm": 1.0549455881118774, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.8438306450843811, "num_tokens": 325293984.0, "step": 2042 }, { "epoch": 1.0391658189216684, "grad_norm": 1.0447968244552612, "learning_rate": 1e-05, "loss": 0.4903, "mean_token_accuracy": 0.8438676595687866, "num_tokens": 325446428.0, "step": 2043 }, { "epoch": 1.0396744659206512, "grad_norm": 1.0543104410171509, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.8362342119216919, "num_tokens": 325602970.0, "step": 2044 }, { "epoch": 1.0401831129196337, "grad_norm": 1.1367326974868774, "learning_rate": 1e-05, "loss": 0.5208, "mean_token_accuracy": 0.8370205760002136, "num_tokens": 325754688.0, "step": 2045 }, { "epoch": 1.0406917599186165, "grad_norm": 0.986159086227417, "learning_rate": 1e-05, "loss": 0.4598, "mean_token_accuracy": 0.8532018065452576, "num_tokens": 325898114.0, "step": 2046 }, { "epoch": 1.041200406917599, "grad_norm": 1.0895652770996094, "learning_rate": 1e-05, "loss": 0.5051, "mean_token_accuracy": 0.8412327766418457, "num_tokens": 326050746.0, "step": 2047 }, { "epoch": 1.0417090539165819, "grad_norm": 1.020145058631897, "learning_rate": 1e-05, "loss": 0.4913, "mean_token_accuracy": 0.8442976474761963, "num_tokens": 326210039.0, "step": 2048 }, { "epoch": 1.0422177009155646, "grad_norm": 1.036494493484497, "learning_rate": 1e-05, "loss": 0.5133, "mean_token_accuracy": 0.8386721014976501, "num_tokens": 326372235.0, "step": 2049 }, { "epoch": 1.0427263479145472, "grad_norm": 1.0419977903366089, "learning_rate": 1e-05, "loss": 0.5094, "mean_token_accuracy": 0.840260922908783, "num_tokens": 326539486.0, "step": 2050 }, { "epoch": 1.04323499491353, "grad_norm": 1.0354567766189575, "learning_rate": 1e-05, "loss": 0.4931, "mean_token_accuracy": 0.8432134389877319, "num_tokens": 326697650.0, "step": 2051 }, { "epoch": 1.0437436419125128, "grad_norm": 1.0431313514709473, "learning_rate": 1e-05, "loss": 0.4811, "mean_token_accuracy": 0.8483883142471313, "num_tokens": 326855696.0, "step": 2052 }, { "epoch": 1.0442522889114954, "grad_norm": 1.0206801891326904, "learning_rate": 1e-05, "loss": 0.5135, "mean_token_accuracy": 0.8393006324768066, "num_tokens": 327015415.0, "step": 2053 }, { "epoch": 1.0447609359104781, "grad_norm": 1.0079280138015747, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8453682065010071, "num_tokens": 327187580.0, "step": 2054 }, { "epoch": 1.045269582909461, "grad_norm": 1.0278279781341553, "learning_rate": 1e-05, "loss": 0.4992, "mean_token_accuracy": 0.8421951532363892, "num_tokens": 327357070.0, "step": 2055 }, { "epoch": 1.0457782299084435, "grad_norm": 1.018744707107544, "learning_rate": 1e-05, "loss": 0.5024, "mean_token_accuracy": 0.8416374921798706, "num_tokens": 327507342.0, "step": 2056 }, { "epoch": 1.0462868769074263, "grad_norm": 1.0134575366973877, "learning_rate": 1e-05, "loss": 0.4886, "mean_token_accuracy": 0.8452538251876831, "num_tokens": 327667011.0, "step": 2057 }, { "epoch": 1.0467955239064088, "grad_norm": 1.0265029668807983, "learning_rate": 1e-05, "loss": 0.508, "mean_token_accuracy": 0.8401978015899658, "num_tokens": 327836633.0, "step": 2058 }, { "epoch": 1.0473041709053916, "grad_norm": 1.0409969091415405, "learning_rate": 1e-05, "loss": 0.4697, "mean_token_accuracy": 0.8507318496704102, "num_tokens": 327973733.0, "step": 2059 }, { "epoch": 1.0478128179043744, "grad_norm": 1.0191627740859985, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8423019647598267, "num_tokens": 328133022.0, "step": 2060 }, { "epoch": 1.048321464903357, "grad_norm": 1.214523196220398, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8397959470748901, "num_tokens": 328277969.0, "step": 2061 }, { "epoch": 1.0488301119023398, "grad_norm": 1.123297095298767, "learning_rate": 1e-05, "loss": 0.4865, "mean_token_accuracy": 0.8451673984527588, "num_tokens": 328442910.0, "step": 2062 }, { "epoch": 1.0493387589013226, "grad_norm": 1.0316575765609741, "learning_rate": 1e-05, "loss": 0.4745, "mean_token_accuracy": 0.850106954574585, "num_tokens": 328593481.0, "step": 2063 }, { "epoch": 1.0498474059003051, "grad_norm": 1.0077111721038818, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8421992659568787, "num_tokens": 328766960.0, "step": 2064 }, { "epoch": 1.050356052899288, "grad_norm": 1.0166810750961304, "learning_rate": 1e-05, "loss": 0.5074, "mean_token_accuracy": 0.8405725955963135, "num_tokens": 328919144.0, "step": 2065 }, { "epoch": 1.0508646998982707, "grad_norm": 1.0643290281295776, "learning_rate": 1e-05, "loss": 0.5067, "mean_token_accuracy": 0.8404885530471802, "num_tokens": 329079757.0, "step": 2066 }, { "epoch": 1.0513733468972533, "grad_norm": 0.9662615656852722, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.8407886028289795, "num_tokens": 329239039.0, "step": 2067 }, { "epoch": 1.051881993896236, "grad_norm": 0.9852247834205627, "learning_rate": 1e-05, "loss": 0.5069, "mean_token_accuracy": 0.8418384194374084, "num_tokens": 329396255.0, "step": 2068 }, { "epoch": 1.0523906408952186, "grad_norm": 1.0212758779525757, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8585224151611328, "num_tokens": 329542301.0, "step": 2069 }, { "epoch": 1.0528992878942014, "grad_norm": 1.0381677150726318, "learning_rate": 1e-05, "loss": 0.4746, "mean_token_accuracy": 0.8479477763175964, "num_tokens": 329691299.0, "step": 2070 }, { "epoch": 1.0534079348931842, "grad_norm": 0.9904256463050842, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8424674272537231, "num_tokens": 329860490.0, "step": 2071 }, { "epoch": 1.0539165818921667, "grad_norm": 1.053727388381958, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.8522511720657349, "num_tokens": 330014371.0, "step": 2072 }, { "epoch": 1.0544252288911495, "grad_norm": 0.9747759699821472, "learning_rate": 1e-05, "loss": 0.4756, "mean_token_accuracy": 0.8489809036254883, "num_tokens": 330193372.0, "step": 2073 }, { "epoch": 1.0549338758901323, "grad_norm": 1.0386484861373901, "learning_rate": 1e-05, "loss": 0.5115, "mean_token_accuracy": 0.840887188911438, "num_tokens": 330363604.0, "step": 2074 }, { "epoch": 1.0554425228891149, "grad_norm": 1.0518840551376343, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8496643304824829, "num_tokens": 330524103.0, "step": 2075 }, { "epoch": 1.0559511698880977, "grad_norm": 1.0117826461791992, "learning_rate": 1e-05, "loss": 0.4649, "mean_token_accuracy": 0.8528888821601868, "num_tokens": 330698211.0, "step": 2076 }, { "epoch": 1.0564598168870805, "grad_norm": 1.1235733032226562, "learning_rate": 1e-05, "loss": 0.5136, "mean_token_accuracy": 0.8376957178115845, "num_tokens": 330868485.0, "step": 2077 }, { "epoch": 1.056968463886063, "grad_norm": 1.1107542514801025, "learning_rate": 1e-05, "loss": 0.4661, "mean_token_accuracy": 0.8518024682998657, "num_tokens": 331039546.0, "step": 2078 }, { "epoch": 1.0574771108850458, "grad_norm": 1.0027269124984741, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8440948724746704, "num_tokens": 331203155.0, "step": 2079 }, { "epoch": 1.0579857578840284, "grad_norm": 1.0469239950180054, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8530600070953369, "num_tokens": 331349908.0, "step": 2080 }, { "epoch": 1.0584944048830112, "grad_norm": 0.9495140910148621, "learning_rate": 1e-05, "loss": 0.4945, "mean_token_accuracy": 0.8430256843566895, "num_tokens": 331518516.0, "step": 2081 }, { "epoch": 1.059003051881994, "grad_norm": 0.972150444984436, "learning_rate": 1e-05, "loss": 0.494, "mean_token_accuracy": 0.8443804383277893, "num_tokens": 331685842.0, "step": 2082 }, { "epoch": 1.0595116988809765, "grad_norm": 1.0417505502700806, "learning_rate": 1e-05, "loss": 0.4905, "mean_token_accuracy": 0.8454884886741638, "num_tokens": 331844766.0, "step": 2083 }, { "epoch": 1.0600203458799593, "grad_norm": 1.0959299802780151, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.8524262309074402, "num_tokens": 332002207.0, "step": 2084 }, { "epoch": 1.060528992878942, "grad_norm": 0.9652280807495117, "learning_rate": 1e-05, "loss": 0.4804, "mean_token_accuracy": 0.8485095500946045, "num_tokens": 332156850.0, "step": 2085 }, { "epoch": 1.0610376398779247, "grad_norm": 1.0024477243423462, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8554891347885132, "num_tokens": 332313613.0, "step": 2086 }, { "epoch": 1.0615462868769074, "grad_norm": 1.0409293174743652, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.8442890644073486, "num_tokens": 332479167.0, "step": 2087 }, { "epoch": 1.0620549338758902, "grad_norm": 0.9680423140525818, "learning_rate": 1e-05, "loss": 0.4787, "mean_token_accuracy": 0.8484407663345337, "num_tokens": 332638608.0, "step": 2088 }, { "epoch": 1.0625635808748728, "grad_norm": 0.9663770794868469, "learning_rate": 1e-05, "loss": 0.4992, "mean_token_accuracy": 0.8420337438583374, "num_tokens": 332793827.0, "step": 2089 }, { "epoch": 1.0630722278738556, "grad_norm": 0.9979959726333618, "learning_rate": 1e-05, "loss": 0.4556, "mean_token_accuracy": 0.8552628755569458, "num_tokens": 332951297.0, "step": 2090 }, { "epoch": 1.0635808748728381, "grad_norm": 1.112915277481079, "learning_rate": 1e-05, "loss": 0.5213, "mean_token_accuracy": 0.8363105654716492, "num_tokens": 333110022.0, "step": 2091 }, { "epoch": 1.064089521871821, "grad_norm": 0.9754927158355713, "learning_rate": 1e-05, "loss": 0.4827, "mean_token_accuracy": 0.8483446836471558, "num_tokens": 333249289.0, "step": 2092 }, { "epoch": 1.0645981688708037, "grad_norm": 0.9905518889427185, "learning_rate": 1e-05, "loss": 0.4548, "mean_token_accuracy": 0.8558975458145142, "num_tokens": 333400414.0, "step": 2093 }, { "epoch": 1.0651068158697863, "grad_norm": 0.9375442266464233, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.856006920337677, "num_tokens": 333565896.0, "step": 2094 }, { "epoch": 1.065615462868769, "grad_norm": 0.9339344501495361, "learning_rate": 1e-05, "loss": 0.4827, "mean_token_accuracy": 0.848110020160675, "num_tokens": 333744235.0, "step": 2095 }, { "epoch": 1.0661241098677519, "grad_norm": 0.9189639091491699, "learning_rate": 1e-05, "loss": 0.5094, "mean_token_accuracy": 0.8405580520629883, "num_tokens": 333906023.0, "step": 2096 }, { "epoch": 1.0666327568667344, "grad_norm": 1.0412248373031616, "learning_rate": 1e-05, "loss": 0.4929, "mean_token_accuracy": 0.843265175819397, "num_tokens": 334070290.0, "step": 2097 }, { "epoch": 1.0671414038657172, "grad_norm": 0.9829212427139282, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.836217999458313, "num_tokens": 334238798.0, "step": 2098 }, { "epoch": 1.0676500508647, "grad_norm": 1.2226958274841309, "learning_rate": 1e-05, "loss": 0.4986, "mean_token_accuracy": 0.8441588878631592, "num_tokens": 334405320.0, "step": 2099 }, { "epoch": 1.0681586978636826, "grad_norm": 1.1169629096984863, "learning_rate": 1e-05, "loss": 0.5335, "mean_token_accuracy": 0.8330006003379822, "num_tokens": 334559804.0, "step": 2100 }, { "epoch": 1.0686673448626653, "grad_norm": 1.0243042707443237, "learning_rate": 1e-05, "loss": 0.4874, "mean_token_accuracy": 0.8441094756126404, "num_tokens": 334706374.0, "step": 2101 }, { "epoch": 1.069175991861648, "grad_norm": 0.9952787160873413, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8492131233215332, "num_tokens": 334867946.0, "step": 2102 }, { "epoch": 1.0696846388606307, "grad_norm": 1.0360610485076904, "learning_rate": 1e-05, "loss": 0.4586, "mean_token_accuracy": 0.8524994254112244, "num_tokens": 335013479.0, "step": 2103 }, { "epoch": 1.0701932858596135, "grad_norm": 1.0457143783569336, "learning_rate": 1e-05, "loss": 0.5183, "mean_token_accuracy": 0.8378509283065796, "num_tokens": 335181587.0, "step": 2104 }, { "epoch": 1.070701932858596, "grad_norm": 1.0518393516540527, "learning_rate": 1e-05, "loss": 0.4798, "mean_token_accuracy": 0.8486339449882507, "num_tokens": 335346515.0, "step": 2105 }, { "epoch": 1.0712105798575788, "grad_norm": 1.0435230731964111, "learning_rate": 1e-05, "loss": 0.5007, "mean_token_accuracy": 0.8433654308319092, "num_tokens": 335509021.0, "step": 2106 }, { "epoch": 1.0717192268565616, "grad_norm": 1.1404730081558228, "learning_rate": 1e-05, "loss": 0.5184, "mean_token_accuracy": 0.8369382619857788, "num_tokens": 335659431.0, "step": 2107 }, { "epoch": 1.0722278738555442, "grad_norm": 1.0483952760696411, "learning_rate": 1e-05, "loss": 0.4522, "mean_token_accuracy": 0.8552796840667725, "num_tokens": 335809158.0, "step": 2108 }, { "epoch": 1.072736520854527, "grad_norm": 1.113566279411316, "learning_rate": 1e-05, "loss": 0.5126, "mean_token_accuracy": 0.8386800289154053, "num_tokens": 335959650.0, "step": 2109 }, { "epoch": 1.0732451678535098, "grad_norm": 0.9882280826568604, "learning_rate": 1e-05, "loss": 0.472, "mean_token_accuracy": 0.8503919839859009, "num_tokens": 336119431.0, "step": 2110 }, { "epoch": 1.0737538148524923, "grad_norm": 0.9893407225608826, "learning_rate": 1e-05, "loss": 0.4846, "mean_token_accuracy": 0.8462191820144653, "num_tokens": 336280307.0, "step": 2111 }, { "epoch": 1.0742624618514751, "grad_norm": 1.079674482345581, "learning_rate": 1e-05, "loss": 0.5072, "mean_token_accuracy": 0.8399666547775269, "num_tokens": 336442944.0, "step": 2112 }, { "epoch": 1.0747711088504577, "grad_norm": 0.9803399443626404, "learning_rate": 1e-05, "loss": 0.4599, "mean_token_accuracy": 0.8556622862815857, "num_tokens": 336583703.0, "step": 2113 }, { "epoch": 1.0752797558494405, "grad_norm": 1.0098974704742432, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8502632975578308, "num_tokens": 336743733.0, "step": 2114 }, { "epoch": 1.0757884028484233, "grad_norm": 0.922788143157959, "learning_rate": 1e-05, "loss": 0.5002, "mean_token_accuracy": 0.8431379199028015, "num_tokens": 336923035.0, "step": 2115 }, { "epoch": 1.0762970498474058, "grad_norm": 1.0645747184753418, "learning_rate": 1e-05, "loss": 0.5146, "mean_token_accuracy": 0.8396292328834534, "num_tokens": 337080544.0, "step": 2116 }, { "epoch": 1.0768056968463886, "grad_norm": 1.0139567852020264, "learning_rate": 1e-05, "loss": 0.5178, "mean_token_accuracy": 0.837389349937439, "num_tokens": 337234518.0, "step": 2117 }, { "epoch": 1.0773143438453714, "grad_norm": 1.058477759361267, "learning_rate": 1e-05, "loss": 0.4748, "mean_token_accuracy": 0.848781406879425, "num_tokens": 337392278.0, "step": 2118 }, { "epoch": 1.077822990844354, "grad_norm": 0.9809451699256897, "learning_rate": 1e-05, "loss": 0.457, "mean_token_accuracy": 0.8538768291473389, "num_tokens": 337552580.0, "step": 2119 }, { "epoch": 1.0783316378433367, "grad_norm": 0.9701200723648071, "learning_rate": 1e-05, "loss": 0.4707, "mean_token_accuracy": 0.8502329587936401, "num_tokens": 337706603.0, "step": 2120 }, { "epoch": 1.0788402848423195, "grad_norm": 1.1232140064239502, "learning_rate": 1e-05, "loss": 0.4921, "mean_token_accuracy": 0.8441417813301086, "num_tokens": 337861542.0, "step": 2121 }, { "epoch": 1.079348931841302, "grad_norm": 0.986573338508606, "learning_rate": 1e-05, "loss": 0.4726, "mean_token_accuracy": 0.8506069183349609, "num_tokens": 338026298.0, "step": 2122 }, { "epoch": 1.0798575788402849, "grad_norm": 1.0073679685592651, "learning_rate": 1e-05, "loss": 0.4842, "mean_token_accuracy": 0.8481048345565796, "num_tokens": 338196691.0, "step": 2123 }, { "epoch": 1.0803662258392674, "grad_norm": 0.989485502243042, "learning_rate": 1e-05, "loss": 0.4596, "mean_token_accuracy": 0.8538649082183838, "num_tokens": 338365205.0, "step": 2124 }, { "epoch": 1.0808748728382502, "grad_norm": 0.9852368831634521, "learning_rate": 1e-05, "loss": 0.5001, "mean_token_accuracy": 0.8432480096817017, "num_tokens": 338521286.0, "step": 2125 }, { "epoch": 1.081383519837233, "grad_norm": 1.0921062231063843, "learning_rate": 1e-05, "loss": 0.4975, "mean_token_accuracy": 0.8422955274581909, "num_tokens": 338668185.0, "step": 2126 }, { "epoch": 1.0818921668362156, "grad_norm": 0.9980047941207886, "learning_rate": 1e-05, "loss": 0.4877, "mean_token_accuracy": 0.8452361822128296, "num_tokens": 338835626.0, "step": 2127 }, { "epoch": 1.0824008138351984, "grad_norm": 0.9752321839332581, "learning_rate": 1e-05, "loss": 0.4943, "mean_token_accuracy": 0.8446649312973022, "num_tokens": 338990507.0, "step": 2128 }, { "epoch": 1.0829094608341812, "grad_norm": 1.0377464294433594, "learning_rate": 1e-05, "loss": 0.4939, "mean_token_accuracy": 0.8445258140563965, "num_tokens": 339154124.0, "step": 2129 }, { "epoch": 1.0834181078331637, "grad_norm": 0.9970061779022217, "learning_rate": 1e-05, "loss": 0.4862, "mean_token_accuracy": 0.8468868136405945, "num_tokens": 339326912.0, "step": 2130 }, { "epoch": 1.0839267548321465, "grad_norm": 1.0732330083847046, "learning_rate": 1e-05, "loss": 0.4909, "mean_token_accuracy": 0.8445082306861877, "num_tokens": 339475749.0, "step": 2131 }, { "epoch": 1.0844354018311293, "grad_norm": 1.000402808189392, "learning_rate": 1e-05, "loss": 0.4994, "mean_token_accuracy": 0.8430907726287842, "num_tokens": 339637655.0, "step": 2132 }, { "epoch": 1.0849440488301119, "grad_norm": 1.0055508613586426, "learning_rate": 1e-05, "loss": 0.4603, "mean_token_accuracy": 0.8536675572395325, "num_tokens": 339791196.0, "step": 2133 }, { "epoch": 1.0854526958290946, "grad_norm": 0.9924202561378479, "learning_rate": 1e-05, "loss": 0.4646, "mean_token_accuracy": 0.8522224426269531, "num_tokens": 339944200.0, "step": 2134 }, { "epoch": 1.0859613428280772, "grad_norm": 1.0728756189346313, "learning_rate": 1e-05, "loss": 0.4872, "mean_token_accuracy": 0.846017599105835, "num_tokens": 340097746.0, "step": 2135 }, { "epoch": 1.08646998982706, "grad_norm": 0.9821423888206482, "learning_rate": 1e-05, "loss": 0.5064, "mean_token_accuracy": 0.8397558927536011, "num_tokens": 340252788.0, "step": 2136 }, { "epoch": 1.0869786368260428, "grad_norm": 0.962029755115509, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8433777093887329, "num_tokens": 340418368.0, "step": 2137 }, { "epoch": 1.0874872838250254, "grad_norm": 0.9940122365951538, "learning_rate": 1e-05, "loss": 0.481, "mean_token_accuracy": 0.8478418588638306, "num_tokens": 340571484.0, "step": 2138 }, { "epoch": 1.0879959308240081, "grad_norm": 1.0245951414108276, "learning_rate": 1e-05, "loss": 0.4901, "mean_token_accuracy": 0.8461571931838989, "num_tokens": 340720288.0, "step": 2139 }, { "epoch": 1.088504577822991, "grad_norm": 0.9604802131652832, "learning_rate": 1e-05, "loss": 0.4796, "mean_token_accuracy": 0.8480315208435059, "num_tokens": 340886244.0, "step": 2140 }, { "epoch": 1.0890132248219735, "grad_norm": 1.0930918455123901, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8429490327835083, "num_tokens": 341039977.0, "step": 2141 }, { "epoch": 1.0895218718209563, "grad_norm": 1.1034718751907349, "learning_rate": 1e-05, "loss": 0.4837, "mean_token_accuracy": 0.8489624261856079, "num_tokens": 341199202.0, "step": 2142 }, { "epoch": 1.090030518819939, "grad_norm": 0.9594838619232178, "learning_rate": 1e-05, "loss": 0.4877, "mean_token_accuracy": 0.8449561595916748, "num_tokens": 341365619.0, "step": 2143 }, { "epoch": 1.0905391658189216, "grad_norm": 0.9820337295532227, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.8415205478668213, "num_tokens": 341532738.0, "step": 2144 }, { "epoch": 1.0910478128179044, "grad_norm": 0.9692781567573547, "learning_rate": 1e-05, "loss": 0.49, "mean_token_accuracy": 0.8459674715995789, "num_tokens": 341683366.0, "step": 2145 }, { "epoch": 1.091556459816887, "grad_norm": 1.0075808763504028, "learning_rate": 1e-05, "loss": 0.4857, "mean_token_accuracy": 0.8459229469299316, "num_tokens": 341847416.0, "step": 2146 }, { "epoch": 1.0920651068158698, "grad_norm": 0.9658797383308411, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8468917608261108, "num_tokens": 342011022.0, "step": 2147 }, { "epoch": 1.0925737538148526, "grad_norm": 0.9919976592063904, "learning_rate": 1e-05, "loss": 0.4787, "mean_token_accuracy": 0.8484220504760742, "num_tokens": 342175374.0, "step": 2148 }, { "epoch": 1.0930824008138351, "grad_norm": 0.9361450672149658, "learning_rate": 1e-05, "loss": 0.5137, "mean_token_accuracy": 0.8397796750068665, "num_tokens": 342344134.0, "step": 2149 }, { "epoch": 1.093591047812818, "grad_norm": 0.961454451084137, "learning_rate": 1e-05, "loss": 0.5006, "mean_token_accuracy": 0.8421031832695007, "num_tokens": 342506392.0, "step": 2150 }, { "epoch": 1.0940996948118007, "grad_norm": 1.0065735578536987, "learning_rate": 1e-05, "loss": 0.4779, "mean_token_accuracy": 0.8483932018280029, "num_tokens": 342671330.0, "step": 2151 }, { "epoch": 1.0946083418107833, "grad_norm": 0.9278548955917358, "learning_rate": 1e-05, "loss": 0.5141, "mean_token_accuracy": 0.8382604718208313, "num_tokens": 342830993.0, "step": 2152 }, { "epoch": 1.095116988809766, "grad_norm": 1.0080245733261108, "learning_rate": 1e-05, "loss": 0.4944, "mean_token_accuracy": 0.843571126461029, "num_tokens": 342983161.0, "step": 2153 }, { "epoch": 1.0956256358087488, "grad_norm": 1.0011775493621826, "learning_rate": 1e-05, "loss": 0.4865, "mean_token_accuracy": 0.8465944528579712, "num_tokens": 343141065.0, "step": 2154 }, { "epoch": 1.0961342828077314, "grad_norm": 1.0971297025680542, "learning_rate": 1e-05, "loss": 0.5232, "mean_token_accuracy": 0.8360178470611572, "num_tokens": 343293439.0, "step": 2155 }, { "epoch": 1.0966429298067142, "grad_norm": 0.9907771944999695, "learning_rate": 1e-05, "loss": 0.5133, "mean_token_accuracy": 0.8384357690811157, "num_tokens": 343447392.0, "step": 2156 }, { "epoch": 1.0971515768056967, "grad_norm": 1.0306109189987183, "learning_rate": 1e-05, "loss": 0.4763, "mean_token_accuracy": 0.8493988513946533, "num_tokens": 343594086.0, "step": 2157 }, { "epoch": 1.0976602238046795, "grad_norm": 0.956362783908844, "learning_rate": 1e-05, "loss": 0.4872, "mean_token_accuracy": 0.8483714461326599, "num_tokens": 343757619.0, "step": 2158 }, { "epoch": 1.0981688708036623, "grad_norm": 0.9461917281150818, "learning_rate": 1e-05, "loss": 0.4671, "mean_token_accuracy": 0.852462887763977, "num_tokens": 343918481.0, "step": 2159 }, { "epoch": 1.0986775178026449, "grad_norm": 1.0135009288787842, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8400470018386841, "num_tokens": 344073814.0, "step": 2160 }, { "epoch": 1.0991861648016277, "grad_norm": 0.9641030430793762, "learning_rate": 1e-05, "loss": 0.4726, "mean_token_accuracy": 0.8498222231864929, "num_tokens": 344234173.0, "step": 2161 }, { "epoch": 1.0996948118006105, "grad_norm": 0.9917462468147278, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8430187702178955, "num_tokens": 344391441.0, "step": 2162 }, { "epoch": 1.100203458799593, "grad_norm": 0.9825915694236755, "learning_rate": 1e-05, "loss": 0.4846, "mean_token_accuracy": 0.8467402458190918, "num_tokens": 344548813.0, "step": 2163 }, { "epoch": 1.1007121057985758, "grad_norm": 1.006765365600586, "learning_rate": 1e-05, "loss": 0.5063, "mean_token_accuracy": 0.8426176309585571, "num_tokens": 344712984.0, "step": 2164 }, { "epoch": 1.1012207527975586, "grad_norm": 0.9520547389984131, "learning_rate": 1e-05, "loss": 0.4933, "mean_token_accuracy": 0.8418510556221008, "num_tokens": 344873127.0, "step": 2165 }, { "epoch": 1.1017293997965412, "grad_norm": 0.9368497133255005, "learning_rate": 1e-05, "loss": 0.5136, "mean_token_accuracy": 0.8412468433380127, "num_tokens": 345036641.0, "step": 2166 }, { "epoch": 1.102238046795524, "grad_norm": 1.083411693572998, "learning_rate": 1e-05, "loss": 0.4849, "mean_token_accuracy": 0.8463590145111084, "num_tokens": 345205810.0, "step": 2167 }, { "epoch": 1.1027466937945065, "grad_norm": 0.988730251789093, "learning_rate": 1e-05, "loss": 0.491, "mean_token_accuracy": 0.8430241942405701, "num_tokens": 345369577.0, "step": 2168 }, { "epoch": 1.1032553407934893, "grad_norm": 1.005668044090271, "learning_rate": 1e-05, "loss": 0.4628, "mean_token_accuracy": 0.8519200682640076, "num_tokens": 345522712.0, "step": 2169 }, { "epoch": 1.103763987792472, "grad_norm": 0.9839890599250793, "learning_rate": 1e-05, "loss": 0.4813, "mean_token_accuracy": 0.8477115631103516, "num_tokens": 345683367.0, "step": 2170 }, { "epoch": 1.1042726347914547, "grad_norm": 1.0047062635421753, "learning_rate": 1e-05, "loss": 0.4924, "mean_token_accuracy": 0.8448252081871033, "num_tokens": 345836341.0, "step": 2171 }, { "epoch": 1.1047812817904374, "grad_norm": 0.9978728294372559, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.8382081985473633, "num_tokens": 346003193.0, "step": 2172 }, { "epoch": 1.1052899287894202, "grad_norm": 0.9945022463798523, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8414233326911926, "num_tokens": 346173134.0, "step": 2173 }, { "epoch": 1.1057985757884028, "grad_norm": 0.9274640679359436, "learning_rate": 1e-05, "loss": 0.4744, "mean_token_accuracy": 0.8477193713188171, "num_tokens": 346330561.0, "step": 2174 }, { "epoch": 1.1063072227873856, "grad_norm": 0.9226838946342468, "learning_rate": 1e-05, "loss": 0.502, "mean_token_accuracy": 0.8429021239280701, "num_tokens": 346499119.0, "step": 2175 }, { "epoch": 1.1068158697863684, "grad_norm": 0.9797883033752441, "learning_rate": 1e-05, "loss": 0.4945, "mean_token_accuracy": 0.8447971940040588, "num_tokens": 346660957.0, "step": 2176 }, { "epoch": 1.107324516785351, "grad_norm": 0.9634705781936646, "learning_rate": 1e-05, "loss": 0.4471, "mean_token_accuracy": 0.85663241147995, "num_tokens": 346820626.0, "step": 2177 }, { "epoch": 1.1078331637843337, "grad_norm": 0.9565395712852478, "learning_rate": 1e-05, "loss": 0.4825, "mean_token_accuracy": 0.8473824858665466, "num_tokens": 346990623.0, "step": 2178 }, { "epoch": 1.1083418107833163, "grad_norm": 0.9224358797073364, "learning_rate": 1e-05, "loss": 0.4954, "mean_token_accuracy": 0.8449086546897888, "num_tokens": 347168826.0, "step": 2179 }, { "epoch": 1.108850457782299, "grad_norm": 0.9978235960006714, "learning_rate": 1e-05, "loss": 0.4954, "mean_token_accuracy": 0.8437415361404419, "num_tokens": 347324012.0, "step": 2180 }, { "epoch": 1.1093591047812819, "grad_norm": 1.0280753374099731, "learning_rate": 1e-05, "loss": 0.4929, "mean_token_accuracy": 0.8424971103668213, "num_tokens": 347478544.0, "step": 2181 }, { "epoch": 1.1098677517802644, "grad_norm": 0.9893693923950195, "learning_rate": 1e-05, "loss": 0.4984, "mean_token_accuracy": 0.8421764373779297, "num_tokens": 347632344.0, "step": 2182 }, { "epoch": 1.1103763987792472, "grad_norm": 1.0173699855804443, "learning_rate": 1e-05, "loss": 0.5012, "mean_token_accuracy": 0.8403029441833496, "num_tokens": 347788398.0, "step": 2183 }, { "epoch": 1.11088504577823, "grad_norm": 1.399958610534668, "learning_rate": 1e-05, "loss": 0.4959, "mean_token_accuracy": 0.840847909450531, "num_tokens": 347925739.0, "step": 2184 }, { "epoch": 1.1113936927772126, "grad_norm": 1.0244845151901245, "learning_rate": 1e-05, "loss": 0.4522, "mean_token_accuracy": 0.854158341884613, "num_tokens": 348085226.0, "step": 2185 }, { "epoch": 1.1119023397761953, "grad_norm": 0.9952741265296936, "learning_rate": 1e-05, "loss": 0.4837, "mean_token_accuracy": 0.8468140959739685, "num_tokens": 348240499.0, "step": 2186 }, { "epoch": 1.1124109867751781, "grad_norm": 1.0770624876022339, "learning_rate": 1e-05, "loss": 0.463, "mean_token_accuracy": 0.8523979783058167, "num_tokens": 348394436.0, "step": 2187 }, { "epoch": 1.1129196337741607, "grad_norm": 0.97931307554245, "learning_rate": 1e-05, "loss": 0.4806, "mean_token_accuracy": 0.8491225838661194, "num_tokens": 348555987.0, "step": 2188 }, { "epoch": 1.1134282807731435, "grad_norm": 1.1255407333374023, "learning_rate": 1e-05, "loss": 0.4873, "mean_token_accuracy": 0.8461816310882568, "num_tokens": 348708430.0, "step": 2189 }, { "epoch": 1.113936927772126, "grad_norm": 1.0181074142456055, "learning_rate": 1e-05, "loss": 0.4883, "mean_token_accuracy": 0.8455284833908081, "num_tokens": 348866347.0, "step": 2190 }, { "epoch": 1.1144455747711088, "grad_norm": 0.957888662815094, "learning_rate": 1e-05, "loss": 0.4735, "mean_token_accuracy": 0.8496738076210022, "num_tokens": 349035863.0, "step": 2191 }, { "epoch": 1.1149542217700916, "grad_norm": 1.0182090997695923, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.8527451753616333, "num_tokens": 349194754.0, "step": 2192 }, { "epoch": 1.1154628687690742, "grad_norm": 1.0608559846878052, "learning_rate": 1e-05, "loss": 0.4881, "mean_token_accuracy": 0.8453966975212097, "num_tokens": 349350492.0, "step": 2193 }, { "epoch": 1.115971515768057, "grad_norm": 1.0267398357391357, "learning_rate": 1e-05, "loss": 0.471, "mean_token_accuracy": 0.8491342067718506, "num_tokens": 349501086.0, "step": 2194 }, { "epoch": 1.1164801627670398, "grad_norm": 0.9610785245895386, "learning_rate": 1e-05, "loss": 0.4651, "mean_token_accuracy": 0.8535009622573853, "num_tokens": 349655643.0, "step": 2195 }, { "epoch": 1.1169888097660223, "grad_norm": 0.9949964880943298, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.844301164150238, "num_tokens": 349818869.0, "step": 2196 }, { "epoch": 1.1174974567650051, "grad_norm": 0.9660967588424683, "learning_rate": 1e-05, "loss": 0.468, "mean_token_accuracy": 0.850468635559082, "num_tokens": 349975870.0, "step": 2197 }, { "epoch": 1.118006103763988, "grad_norm": 1.0754655599594116, "learning_rate": 1e-05, "loss": 0.4859, "mean_token_accuracy": 0.8464107513427734, "num_tokens": 350142280.0, "step": 2198 }, { "epoch": 1.1185147507629705, "grad_norm": 1.0086798667907715, "learning_rate": 1e-05, "loss": 0.4557, "mean_token_accuracy": 0.855668842792511, "num_tokens": 350299480.0, "step": 2199 }, { "epoch": 1.1190233977619533, "grad_norm": 0.9509584307670593, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.8424690365791321, "num_tokens": 350469513.0, "step": 2200 }, { "epoch": 1.1195320447609358, "grad_norm": 1.24341881275177, "learning_rate": 1e-05, "loss": 0.4855, "mean_token_accuracy": 0.8451713919639587, "num_tokens": 350617118.0, "step": 2201 }, { "epoch": 1.1200406917599186, "grad_norm": 0.9689223766326904, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8523330688476562, "num_tokens": 350784696.0, "step": 2202 }, { "epoch": 1.1205493387589014, "grad_norm": 0.9255483746528625, "learning_rate": 1e-05, "loss": 0.4559, "mean_token_accuracy": 0.8544161915779114, "num_tokens": 350943249.0, "step": 2203 }, { "epoch": 1.121057985757884, "grad_norm": 1.0668855905532837, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8542884588241577, "num_tokens": 351109045.0, "step": 2204 }, { "epoch": 1.1215666327568667, "grad_norm": 0.9399425387382507, "learning_rate": 1e-05, "loss": 0.4764, "mean_token_accuracy": 0.8503850698471069, "num_tokens": 351282282.0, "step": 2205 }, { "epoch": 1.1220752797558495, "grad_norm": 1.0401631593704224, "learning_rate": 1e-05, "loss": 0.4641, "mean_token_accuracy": 0.852576494216919, "num_tokens": 351433256.0, "step": 2206 }, { "epoch": 1.122583926754832, "grad_norm": 1.0747698545455933, "learning_rate": 1e-05, "loss": 0.511, "mean_token_accuracy": 0.8402716517448425, "num_tokens": 351581777.0, "step": 2207 }, { "epoch": 1.1230925737538149, "grad_norm": 1.0110957622528076, "learning_rate": 1e-05, "loss": 0.4989, "mean_token_accuracy": 0.8414695262908936, "num_tokens": 351734672.0, "step": 2208 }, { "epoch": 1.1236012207527977, "grad_norm": 1.0141860246658325, "learning_rate": 1e-05, "loss": 0.4772, "mean_token_accuracy": 0.84971022605896, "num_tokens": 351891165.0, "step": 2209 }, { "epoch": 1.1241098677517802, "grad_norm": 0.9789010882377625, "learning_rate": 1e-05, "loss": 0.4811, "mean_token_accuracy": 0.8458177447319031, "num_tokens": 352042249.0, "step": 2210 }, { "epoch": 1.124618514750763, "grad_norm": 1.061063528060913, "learning_rate": 1e-05, "loss": 0.4866, "mean_token_accuracy": 0.844889760017395, "num_tokens": 352204407.0, "step": 2211 }, { "epoch": 1.1251271617497456, "grad_norm": 0.9890654683113098, "learning_rate": 1e-05, "loss": 0.4935, "mean_token_accuracy": 0.8418614268302917, "num_tokens": 352358733.0, "step": 2212 }, { "epoch": 1.1256358087487284, "grad_norm": 0.9955906867980957, "learning_rate": 1e-05, "loss": 0.4872, "mean_token_accuracy": 0.8459783792495728, "num_tokens": 352511222.0, "step": 2213 }, { "epoch": 1.1261444557477112, "grad_norm": 0.9302468299865723, "learning_rate": 1e-05, "loss": 0.493, "mean_token_accuracy": 0.8450722694396973, "num_tokens": 352667221.0, "step": 2214 }, { "epoch": 1.1266531027466937, "grad_norm": 0.9680992960929871, "learning_rate": 1e-05, "loss": 0.4665, "mean_token_accuracy": 0.8513346910476685, "num_tokens": 352830778.0, "step": 2215 }, { "epoch": 1.1271617497456765, "grad_norm": 0.9598748683929443, "learning_rate": 1e-05, "loss": 0.4831, "mean_token_accuracy": 0.8466798663139343, "num_tokens": 352982322.0, "step": 2216 }, { "epoch": 1.1276703967446593, "grad_norm": 0.9731796979904175, "learning_rate": 1e-05, "loss": 0.4825, "mean_token_accuracy": 0.8480831980705261, "num_tokens": 353143982.0, "step": 2217 }, { "epoch": 1.1281790437436419, "grad_norm": 0.9734607338905334, "learning_rate": 1e-05, "loss": 0.49, "mean_token_accuracy": 0.8462680578231812, "num_tokens": 353294238.0, "step": 2218 }, { "epoch": 1.1286876907426246, "grad_norm": 0.9904110431671143, "learning_rate": 1e-05, "loss": 0.4869, "mean_token_accuracy": 0.8465762138366699, "num_tokens": 353451803.0, "step": 2219 }, { "epoch": 1.1291963377416074, "grad_norm": 1.043080449104309, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.8410126566886902, "num_tokens": 353617488.0, "step": 2220 }, { "epoch": 1.12970498474059, "grad_norm": 0.9859865307807922, "learning_rate": 1e-05, "loss": 0.4842, "mean_token_accuracy": 0.8460755944252014, "num_tokens": 353774242.0, "step": 2221 }, { "epoch": 1.1302136317395728, "grad_norm": 1.063538908958435, "learning_rate": 1e-05, "loss": 0.4824, "mean_token_accuracy": 0.8463836908340454, "num_tokens": 353942973.0, "step": 2222 }, { "epoch": 1.1307222787385554, "grad_norm": 1.027944803237915, "learning_rate": 1e-05, "loss": 0.5363, "mean_token_accuracy": 0.8324622511863708, "num_tokens": 354091657.0, "step": 2223 }, { "epoch": 1.1312309257375381, "grad_norm": 1.0895392894744873, "learning_rate": 1e-05, "loss": 0.4823, "mean_token_accuracy": 0.8465158939361572, "num_tokens": 354241839.0, "step": 2224 }, { "epoch": 1.131739572736521, "grad_norm": 1.0151928663253784, "learning_rate": 1e-05, "loss": 0.4842, "mean_token_accuracy": 0.8473328351974487, "num_tokens": 354404721.0, "step": 2225 }, { "epoch": 1.1322482197355035, "grad_norm": 1.0319733619689941, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.8381280303001404, "num_tokens": 354575014.0, "step": 2226 }, { "epoch": 1.1327568667344863, "grad_norm": 1.0670835971832275, "learning_rate": 1e-05, "loss": 0.4944, "mean_token_accuracy": 0.8440436124801636, "num_tokens": 354737329.0, "step": 2227 }, { "epoch": 1.133265513733469, "grad_norm": 0.9908400774002075, "learning_rate": 1e-05, "loss": 0.4825, "mean_token_accuracy": 0.8480251431465149, "num_tokens": 354915712.0, "step": 2228 }, { "epoch": 1.1337741607324516, "grad_norm": 0.9610514640808105, "learning_rate": 1e-05, "loss": 0.5001, "mean_token_accuracy": 0.8409698605537415, "num_tokens": 355070597.0, "step": 2229 }, { "epoch": 1.1342828077314344, "grad_norm": 1.01914644241333, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8394330739974976, "num_tokens": 355237653.0, "step": 2230 }, { "epoch": 1.1347914547304172, "grad_norm": 0.9027879238128662, "learning_rate": 1e-05, "loss": 0.503, "mean_token_accuracy": 0.8422386050224304, "num_tokens": 355408779.0, "step": 2231 }, { "epoch": 1.1353001017293998, "grad_norm": 0.9325588941574097, "learning_rate": 1e-05, "loss": 0.4643, "mean_token_accuracy": 0.8536386489868164, "num_tokens": 355582832.0, "step": 2232 }, { "epoch": 1.1358087487283826, "grad_norm": 1.0098017454147339, "learning_rate": 1e-05, "loss": 0.4743, "mean_token_accuracy": 0.8489354252815247, "num_tokens": 355724880.0, "step": 2233 }, { "epoch": 1.1363173957273651, "grad_norm": 1.0128886699676514, "learning_rate": 1e-05, "loss": 0.5105, "mean_token_accuracy": 0.8378116488456726, "num_tokens": 355882190.0, "step": 2234 }, { "epoch": 1.136826042726348, "grad_norm": 0.9628869295120239, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8522527813911438, "num_tokens": 356027036.0, "step": 2235 }, { "epoch": 1.1373346897253307, "grad_norm": 0.9411493539810181, "learning_rate": 1e-05, "loss": 0.4819, "mean_token_accuracy": 0.8461790680885315, "num_tokens": 356192221.0, "step": 2236 }, { "epoch": 1.1378433367243133, "grad_norm": 0.9630653858184814, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8419363498687744, "num_tokens": 356354570.0, "step": 2237 }, { "epoch": 1.138351983723296, "grad_norm": 1.036780595779419, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8373992443084717, "num_tokens": 356515011.0, "step": 2238 }, { "epoch": 1.1388606307222788, "grad_norm": 1.0112197399139404, "learning_rate": 1e-05, "loss": 0.4697, "mean_token_accuracy": 0.8511573076248169, "num_tokens": 356676989.0, "step": 2239 }, { "epoch": 1.1393692777212614, "grad_norm": 1.0344760417938232, "learning_rate": 1e-05, "loss": 0.4934, "mean_token_accuracy": 0.8441804051399231, "num_tokens": 356840072.0, "step": 2240 }, { "epoch": 1.1398779247202442, "grad_norm": 0.972714364528656, "learning_rate": 1e-05, "loss": 0.4925, "mean_token_accuracy": 0.8463292121887207, "num_tokens": 357003414.0, "step": 2241 }, { "epoch": 1.140386571719227, "grad_norm": 0.9765809774398804, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.8423345685005188, "num_tokens": 357164387.0, "step": 2242 }, { "epoch": 1.1408952187182095, "grad_norm": 0.9447953104972839, "learning_rate": 1e-05, "loss": 0.4613, "mean_token_accuracy": 0.8525022864341736, "num_tokens": 357323400.0, "step": 2243 }, { "epoch": 1.1414038657171923, "grad_norm": 0.946564793586731, "learning_rate": 1e-05, "loss": 0.4763, "mean_token_accuracy": 0.8481206893920898, "num_tokens": 357480123.0, "step": 2244 }, { "epoch": 1.1419125127161749, "grad_norm": 1.0055655241012573, "learning_rate": 1e-05, "loss": 0.4601, "mean_token_accuracy": 0.8537294268608093, "num_tokens": 357642932.0, "step": 2245 }, { "epoch": 1.1424211597151577, "grad_norm": 0.9800072908401489, "learning_rate": 1e-05, "loss": 0.462, "mean_token_accuracy": 0.8519518375396729, "num_tokens": 357799511.0, "step": 2246 }, { "epoch": 1.1429298067141405, "grad_norm": 1.0585764646530151, "learning_rate": 1e-05, "loss": 0.4837, "mean_token_accuracy": 0.8486620187759399, "num_tokens": 357950084.0, "step": 2247 }, { "epoch": 1.143438453713123, "grad_norm": 1.0663877725601196, "learning_rate": 1e-05, "loss": 0.5124, "mean_token_accuracy": 0.8382563591003418, "num_tokens": 358095805.0, "step": 2248 }, { "epoch": 1.1439471007121058, "grad_norm": 1.0571013689041138, "learning_rate": 1e-05, "loss": 0.4997, "mean_token_accuracy": 0.8429567217826843, "num_tokens": 358247573.0, "step": 2249 }, { "epoch": 1.1444557477110886, "grad_norm": 0.9647451639175415, "learning_rate": 1e-05, "loss": 0.4571, "mean_token_accuracy": 0.8561835289001465, "num_tokens": 358405765.0, "step": 2250 }, { "epoch": 1.1449643947100712, "grad_norm": 0.9498294591903687, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.8418372869491577, "num_tokens": 358572605.0, "step": 2251 }, { "epoch": 1.145473041709054, "grad_norm": 1.0757380723953247, "learning_rate": 1e-05, "loss": 0.55, "mean_token_accuracy": 0.8287532925605774, "num_tokens": 358741998.0, "step": 2252 }, { "epoch": 1.1459816887080367, "grad_norm": 1.0514572858810425, "learning_rate": 1e-05, "loss": 0.4619, "mean_token_accuracy": 0.8518444299697876, "num_tokens": 358892427.0, "step": 2253 }, { "epoch": 1.1464903357070193, "grad_norm": 0.9669085741043091, "learning_rate": 1e-05, "loss": 0.4671, "mean_token_accuracy": 0.854065477848053, "num_tokens": 359053024.0, "step": 2254 }, { "epoch": 1.146998982706002, "grad_norm": 1.0323379039764404, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8478610515594482, "num_tokens": 359211001.0, "step": 2255 }, { "epoch": 1.1475076297049847, "grad_norm": 1.0277243852615356, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8460548520088196, "num_tokens": 359375453.0, "step": 2256 }, { "epoch": 1.1480162767039674, "grad_norm": 1.0732399225234985, "learning_rate": 1e-05, "loss": 0.461, "mean_token_accuracy": 0.8520042300224304, "num_tokens": 359521205.0, "step": 2257 }, { "epoch": 1.1485249237029502, "grad_norm": 1.0115880966186523, "learning_rate": 1e-05, "loss": 0.4612, "mean_token_accuracy": 0.8529631495475769, "num_tokens": 359692458.0, "step": 2258 }, { "epoch": 1.1490335707019328, "grad_norm": 1.0270670652389526, "learning_rate": 1e-05, "loss": 0.4832, "mean_token_accuracy": 0.8482770919799805, "num_tokens": 359854056.0, "step": 2259 }, { "epoch": 1.1495422177009156, "grad_norm": 1.1131222248077393, "learning_rate": 1e-05, "loss": 0.5044, "mean_token_accuracy": 0.8396096229553223, "num_tokens": 360008843.0, "step": 2260 }, { "epoch": 1.1500508646998984, "grad_norm": 1.0628095865249634, "learning_rate": 1e-05, "loss": 0.4878, "mean_token_accuracy": 0.8451195955276489, "num_tokens": 360171252.0, "step": 2261 }, { "epoch": 1.150559511698881, "grad_norm": 0.9063826203346252, "learning_rate": 1e-05, "loss": 0.4578, "mean_token_accuracy": 0.8525300025939941, "num_tokens": 360333592.0, "step": 2262 }, { "epoch": 1.1510681586978637, "grad_norm": 1.0526858568191528, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8553949594497681, "num_tokens": 360485356.0, "step": 2263 }, { "epoch": 1.1515768056968465, "grad_norm": 1.6245720386505127, "learning_rate": 1e-05, "loss": 0.4326, "mean_token_accuracy": 0.8621503114700317, "num_tokens": 360646033.0, "step": 2264 }, { "epoch": 1.152085452695829, "grad_norm": 1.6461989879608154, "learning_rate": 1e-05, "loss": 0.5018, "mean_token_accuracy": 0.8413547277450562, "num_tokens": 360822067.0, "step": 2265 }, { "epoch": 1.1525940996948119, "grad_norm": 1.0477701425552368, "learning_rate": 1e-05, "loss": 0.5035, "mean_token_accuracy": 0.8408612012863159, "num_tokens": 360980106.0, "step": 2266 }, { "epoch": 1.1531027466937944, "grad_norm": 0.9780399799346924, "learning_rate": 1e-05, "loss": 0.477, "mean_token_accuracy": 0.8492373824119568, "num_tokens": 361141106.0, "step": 2267 }, { "epoch": 1.1536113936927772, "grad_norm": 1.042273998260498, "learning_rate": 1e-05, "loss": 0.4797, "mean_token_accuracy": 0.8492919206619263, "num_tokens": 361296869.0, "step": 2268 }, { "epoch": 1.15412004069176, "grad_norm": 1.0089274644851685, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.840201735496521, "num_tokens": 361439766.0, "step": 2269 }, { "epoch": 1.1546286876907426, "grad_norm": 0.9834834933280945, "learning_rate": 1e-05, "loss": 0.4702, "mean_token_accuracy": 0.850745439529419, "num_tokens": 361597141.0, "step": 2270 }, { "epoch": 1.1551373346897253, "grad_norm": 1.0707170963287354, "learning_rate": 1e-05, "loss": 0.4817, "mean_token_accuracy": 0.846406102180481, "num_tokens": 361760028.0, "step": 2271 }, { "epoch": 1.155645981688708, "grad_norm": 1.0368983745574951, "learning_rate": 1e-05, "loss": 0.4804, "mean_token_accuracy": 0.8464101552963257, "num_tokens": 361931768.0, "step": 2272 }, { "epoch": 1.1561546286876907, "grad_norm": 1.189819574356079, "learning_rate": 1e-05, "loss": 0.4851, "mean_token_accuracy": 0.8495547771453857, "num_tokens": 362095781.0, "step": 2273 }, { "epoch": 1.1566632756866735, "grad_norm": 1.113652229309082, "learning_rate": 1e-05, "loss": 0.4277, "mean_token_accuracy": 0.8631555438041687, "num_tokens": 362246811.0, "step": 2274 }, { "epoch": 1.1571719226856563, "grad_norm": 1.0526518821716309, "learning_rate": 1e-05, "loss": 0.4915, "mean_token_accuracy": 0.8442506790161133, "num_tokens": 362394108.0, "step": 2275 }, { "epoch": 1.1576805696846388, "grad_norm": 1.0336838960647583, "learning_rate": 1e-05, "loss": 0.4785, "mean_token_accuracy": 0.8486695289611816, "num_tokens": 362555139.0, "step": 2276 }, { "epoch": 1.1581892166836216, "grad_norm": 1.0744441747665405, "learning_rate": 1e-05, "loss": 0.4495, "mean_token_accuracy": 0.8572794198989868, "num_tokens": 362718486.0, "step": 2277 }, { "epoch": 1.1586978636826042, "grad_norm": 1.03730309009552, "learning_rate": 1e-05, "loss": 0.4917, "mean_token_accuracy": 0.8441042304039001, "num_tokens": 362881363.0, "step": 2278 }, { "epoch": 1.159206510681587, "grad_norm": 1.021234154701233, "learning_rate": 1e-05, "loss": 0.5158, "mean_token_accuracy": 0.8376511335372925, "num_tokens": 363054375.0, "step": 2279 }, { "epoch": 1.1597151576805698, "grad_norm": 1.0153952836990356, "learning_rate": 1e-05, "loss": 0.4688, "mean_token_accuracy": 0.8510196805000305, "num_tokens": 363207013.0, "step": 2280 }, { "epoch": 1.1602238046795523, "grad_norm": 0.9547268748283386, "learning_rate": 1e-05, "loss": 0.4591, "mean_token_accuracy": 0.8534315228462219, "num_tokens": 363364310.0, "step": 2281 }, { "epoch": 1.1607324516785351, "grad_norm": 1.0788716077804565, "learning_rate": 1e-05, "loss": 0.4878, "mean_token_accuracy": 0.8448123931884766, "num_tokens": 363501174.0, "step": 2282 }, { "epoch": 1.1612410986775177, "grad_norm": 1.002040982246399, "learning_rate": 1e-05, "loss": 0.4881, "mean_token_accuracy": 0.8461840748786926, "num_tokens": 363667951.0, "step": 2283 }, { "epoch": 1.1617497456765005, "grad_norm": 0.9879819750785828, "learning_rate": 1e-05, "loss": 0.478, "mean_token_accuracy": 0.848838210105896, "num_tokens": 363831350.0, "step": 2284 }, { "epoch": 1.1622583926754833, "grad_norm": 0.9468021392822266, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8422623872756958, "num_tokens": 364006126.0, "step": 2285 }, { "epoch": 1.162767039674466, "grad_norm": 0.9665511846542358, "learning_rate": 1e-05, "loss": 0.4556, "mean_token_accuracy": 0.8548101186752319, "num_tokens": 364161537.0, "step": 2286 }, { "epoch": 1.1632756866734486, "grad_norm": 0.9804579019546509, "learning_rate": 1e-05, "loss": 0.4885, "mean_token_accuracy": 0.8468865156173706, "num_tokens": 364318922.0, "step": 2287 }, { "epoch": 1.1637843336724314, "grad_norm": 0.9549255967140198, "learning_rate": 1e-05, "loss": 0.4707, "mean_token_accuracy": 0.8510030508041382, "num_tokens": 364468701.0, "step": 2288 }, { "epoch": 1.164292980671414, "grad_norm": 0.9666069746017456, "learning_rate": 1e-05, "loss": 0.5088, "mean_token_accuracy": 0.8389880061149597, "num_tokens": 364633717.0, "step": 2289 }, { "epoch": 1.1648016276703967, "grad_norm": 1.091866135597229, "learning_rate": 1e-05, "loss": 0.4549, "mean_token_accuracy": 0.8548486232757568, "num_tokens": 364793011.0, "step": 2290 }, { "epoch": 1.1653102746693795, "grad_norm": 1.9256987571716309, "learning_rate": 1e-05, "loss": 0.4831, "mean_token_accuracy": 0.8470572233200073, "num_tokens": 364962011.0, "step": 2291 }, { "epoch": 1.165818921668362, "grad_norm": 1.0450749397277832, "learning_rate": 1e-05, "loss": 0.4711, "mean_token_accuracy": 0.8515802621841431, "num_tokens": 365125068.0, "step": 2292 }, { "epoch": 1.1663275686673449, "grad_norm": 0.9731523990631104, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8373537063598633, "num_tokens": 365284576.0, "step": 2293 }, { "epoch": 1.1668362156663274, "grad_norm": 1.0927114486694336, "learning_rate": 1e-05, "loss": 0.494, "mean_token_accuracy": 0.8448081612586975, "num_tokens": 365447819.0, "step": 2294 }, { "epoch": 1.1673448626653102, "grad_norm": 0.9757579565048218, "learning_rate": 1e-05, "loss": 0.478, "mean_token_accuracy": 0.8482175469398499, "num_tokens": 365608276.0, "step": 2295 }, { "epoch": 1.167853509664293, "grad_norm": 0.9663304686546326, "learning_rate": 1e-05, "loss": 0.5026, "mean_token_accuracy": 0.8430571556091309, "num_tokens": 365766838.0, "step": 2296 }, { "epoch": 1.1683621566632758, "grad_norm": 0.9581319689750671, "learning_rate": 1e-05, "loss": 0.4884, "mean_token_accuracy": 0.8461264371871948, "num_tokens": 365938182.0, "step": 2297 }, { "epoch": 1.1688708036622584, "grad_norm": 0.9772895574569702, "learning_rate": 1e-05, "loss": 0.4514, "mean_token_accuracy": 0.8557355403900146, "num_tokens": 366101878.0, "step": 2298 }, { "epoch": 1.1693794506612412, "grad_norm": 0.9259494543075562, "learning_rate": 1e-05, "loss": 0.4814, "mean_token_accuracy": 0.8471612930297852, "num_tokens": 366268381.0, "step": 2299 }, { "epoch": 1.1698880976602237, "grad_norm": 1.005831003189087, "learning_rate": 1e-05, "loss": 0.4718, "mean_token_accuracy": 0.85094153881073, "num_tokens": 366439188.0, "step": 2300 }, { "epoch": 1.1703967446592065, "grad_norm": 0.9507022500038147, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8511804938316345, "num_tokens": 366610850.0, "step": 2301 }, { "epoch": 1.1709053916581893, "grad_norm": 0.9647862315177917, "learning_rate": 1e-05, "loss": 0.531, "mean_token_accuracy": 0.8360495567321777, "num_tokens": 366769572.0, "step": 2302 }, { "epoch": 1.1714140386571719, "grad_norm": 0.9812116622924805, "learning_rate": 1e-05, "loss": 0.4471, "mean_token_accuracy": 0.8576605319976807, "num_tokens": 366937681.0, "step": 2303 }, { "epoch": 1.1719226856561547, "grad_norm": 1.0458122491836548, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8458260297775269, "num_tokens": 367097208.0, "step": 2304 }, { "epoch": 1.1724313326551372, "grad_norm": 1.0308116674423218, "learning_rate": 1e-05, "loss": 0.4944, "mean_token_accuracy": 0.8454901576042175, "num_tokens": 367263406.0, "step": 2305 }, { "epoch": 1.17293997965412, "grad_norm": 1.1648168563842773, "learning_rate": 1e-05, "loss": 0.5273, "mean_token_accuracy": 0.8344169855117798, "num_tokens": 367424929.0, "step": 2306 }, { "epoch": 1.1734486266531028, "grad_norm": 1.046728491783142, "learning_rate": 1e-05, "loss": 0.4756, "mean_token_accuracy": 0.8486387729644775, "num_tokens": 367591140.0, "step": 2307 }, { "epoch": 1.1739572736520856, "grad_norm": 0.9279728531837463, "learning_rate": 1e-05, "loss": 0.4859, "mean_token_accuracy": 0.8459349870681763, "num_tokens": 367753127.0, "step": 2308 }, { "epoch": 1.1744659206510681, "grad_norm": 1.1688655614852905, "learning_rate": 1e-05, "loss": 0.4931, "mean_token_accuracy": 0.8448759913444519, "num_tokens": 367915582.0, "step": 2309 }, { "epoch": 1.174974567650051, "grad_norm": 0.9621530175209045, "learning_rate": 1e-05, "loss": 0.4815, "mean_token_accuracy": 0.8467572331428528, "num_tokens": 368073652.0, "step": 2310 }, { "epoch": 1.1754832146490335, "grad_norm": 1.200062870979309, "learning_rate": 1e-05, "loss": 0.4631, "mean_token_accuracy": 0.851768970489502, "num_tokens": 368224644.0, "step": 2311 }, { "epoch": 1.1759918616480163, "grad_norm": 1.0809787511825562, "learning_rate": 1e-05, "loss": 0.4825, "mean_token_accuracy": 0.8461123704910278, "num_tokens": 368389034.0, "step": 2312 }, { "epoch": 1.176500508646999, "grad_norm": 1.009333848953247, "learning_rate": 1e-05, "loss": 0.4701, "mean_token_accuracy": 0.8512672185897827, "num_tokens": 368548528.0, "step": 2313 }, { "epoch": 1.1770091556459816, "grad_norm": 1.1124660968780518, "learning_rate": 1e-05, "loss": 0.5256, "mean_token_accuracy": 0.8362528085708618, "num_tokens": 368721238.0, "step": 2314 }, { "epoch": 1.1775178026449644, "grad_norm": 1.0473283529281616, "learning_rate": 1e-05, "loss": 0.4848, "mean_token_accuracy": 0.8465226888656616, "num_tokens": 368875782.0, "step": 2315 }, { "epoch": 1.178026449643947, "grad_norm": 1.037091612815857, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8469461798667908, "num_tokens": 369039106.0, "step": 2316 }, { "epoch": 1.1785350966429298, "grad_norm": 1.1117887496948242, "learning_rate": 1e-05, "loss": 0.4911, "mean_token_accuracy": 0.8467146754264832, "num_tokens": 369201771.0, "step": 2317 }, { "epoch": 1.1790437436419126, "grad_norm": 1.0993705987930298, "learning_rate": 1e-05, "loss": 0.4529, "mean_token_accuracy": 0.8551526069641113, "num_tokens": 369375622.0, "step": 2318 }, { "epoch": 1.1795523906408953, "grad_norm": 1.03167724609375, "learning_rate": 1e-05, "loss": 0.4946, "mean_token_accuracy": 0.8446077108383179, "num_tokens": 369535466.0, "step": 2319 }, { "epoch": 1.180061037639878, "grad_norm": 1.0695133209228516, "learning_rate": 1e-05, "loss": 0.5084, "mean_token_accuracy": 0.840103030204773, "num_tokens": 369688789.0, "step": 2320 }, { "epoch": 1.1805696846388607, "grad_norm": 0.9820632338523865, "learning_rate": 1e-05, "loss": 0.4818, "mean_token_accuracy": 0.8472625017166138, "num_tokens": 369844154.0, "step": 2321 }, { "epoch": 1.1810783316378433, "grad_norm": 1.0760818719863892, "learning_rate": 1e-05, "loss": 0.5241, "mean_token_accuracy": 0.8362928628921509, "num_tokens": 369998967.0, "step": 2322 }, { "epoch": 1.181586978636826, "grad_norm": 1.0259249210357666, "learning_rate": 1e-05, "loss": 0.499, "mean_token_accuracy": 0.8428303599357605, "num_tokens": 370168382.0, "step": 2323 }, { "epoch": 1.1820956256358088, "grad_norm": 0.956731379032135, "learning_rate": 1e-05, "loss": 0.4839, "mean_token_accuracy": 0.8463218212127686, "num_tokens": 370333473.0, "step": 2324 }, { "epoch": 1.1826042726347914, "grad_norm": 1.0034064054489136, "learning_rate": 1e-05, "loss": 0.4796, "mean_token_accuracy": 0.8488554954528809, "num_tokens": 370507572.0, "step": 2325 }, { "epoch": 1.1831129196337742, "grad_norm": 0.9154096841812134, "learning_rate": 1e-05, "loss": 0.4994, "mean_token_accuracy": 0.8431752920150757, "num_tokens": 370660466.0, "step": 2326 }, { "epoch": 1.1836215666327567, "grad_norm": 0.9893860220909119, "learning_rate": 1e-05, "loss": 0.5255, "mean_token_accuracy": 0.8330589532852173, "num_tokens": 370828832.0, "step": 2327 }, { "epoch": 1.1841302136317395, "grad_norm": 1.0146458148956299, "learning_rate": 1e-05, "loss": 0.4863, "mean_token_accuracy": 0.8479539155960083, "num_tokens": 370998386.0, "step": 2328 }, { "epoch": 1.1846388606307223, "grad_norm": 0.9473051428794861, "learning_rate": 1e-05, "loss": 0.4789, "mean_token_accuracy": 0.8470596075057983, "num_tokens": 371166920.0, "step": 2329 }, { "epoch": 1.1851475076297049, "grad_norm": 0.9573273062705994, "learning_rate": 1e-05, "loss": 0.4817, "mean_token_accuracy": 0.8494331240653992, "num_tokens": 371332751.0, "step": 2330 }, { "epoch": 1.1856561546286877, "grad_norm": 0.9692023992538452, "learning_rate": 1e-05, "loss": 0.5028, "mean_token_accuracy": 0.8427772521972656, "num_tokens": 371495502.0, "step": 2331 }, { "epoch": 1.1861648016276705, "grad_norm": 0.9364044666290283, "learning_rate": 1e-05, "loss": 0.4888, "mean_token_accuracy": 0.8459036350250244, "num_tokens": 371662573.0, "step": 2332 }, { "epoch": 1.186673448626653, "grad_norm": 0.9484080076217651, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8498533964157104, "num_tokens": 371825073.0, "step": 2333 }, { "epoch": 1.1871820956256358, "grad_norm": 0.9878451228141785, "learning_rate": 1e-05, "loss": 0.4483, "mean_token_accuracy": 0.856770932674408, "num_tokens": 371986831.0, "step": 2334 }, { "epoch": 1.1876907426246186, "grad_norm": 0.9890164732933044, "learning_rate": 1e-05, "loss": 0.5071, "mean_token_accuracy": 0.8417010307312012, "num_tokens": 372140500.0, "step": 2335 }, { "epoch": 1.1881993896236012, "grad_norm": 0.9335831999778748, "learning_rate": 1e-05, "loss": 0.5248, "mean_token_accuracy": 0.8363795876502991, "num_tokens": 372315615.0, "step": 2336 }, { "epoch": 1.188708036622584, "grad_norm": 1.0428636074066162, "learning_rate": 1e-05, "loss": 0.4701, "mean_token_accuracy": 0.8501526117324829, "num_tokens": 372464286.0, "step": 2337 }, { "epoch": 1.1892166836215665, "grad_norm": 0.9321058988571167, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.844826877117157, "num_tokens": 372638897.0, "step": 2338 }, { "epoch": 1.1897253306205493, "grad_norm": 1.0857921838760376, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8441271781921387, "num_tokens": 372795195.0, "step": 2339 }, { "epoch": 1.190233977619532, "grad_norm": 1.0063886642456055, "learning_rate": 1e-05, "loss": 0.4696, "mean_token_accuracy": 0.8519972562789917, "num_tokens": 372956645.0, "step": 2340 }, { "epoch": 1.1907426246185147, "grad_norm": 0.969150722026825, "learning_rate": 1e-05, "loss": 0.4692, "mean_token_accuracy": 0.8504284024238586, "num_tokens": 373118539.0, "step": 2341 }, { "epoch": 1.1912512716174974, "grad_norm": 0.9527782201766968, "learning_rate": 1e-05, "loss": 0.4593, "mean_token_accuracy": 0.8541048765182495, "num_tokens": 373265980.0, "step": 2342 }, { "epoch": 1.1917599186164802, "grad_norm": 0.9804020524024963, "learning_rate": 1e-05, "loss": 0.521, "mean_token_accuracy": 0.8361110687255859, "num_tokens": 373433225.0, "step": 2343 }, { "epoch": 1.1922685656154628, "grad_norm": 1.0810366868972778, "learning_rate": 1e-05, "loss": 0.4707, "mean_token_accuracy": 0.8493367433547974, "num_tokens": 373575839.0, "step": 2344 }, { "epoch": 1.1927772126144456, "grad_norm": 1.061769962310791, "learning_rate": 1e-05, "loss": 0.4745, "mean_token_accuracy": 0.8492453098297119, "num_tokens": 373725507.0, "step": 2345 }, { "epoch": 1.1932858596134284, "grad_norm": 1.0141011476516724, "learning_rate": 1e-05, "loss": 0.4809, "mean_token_accuracy": 0.8460336923599243, "num_tokens": 373896838.0, "step": 2346 }, { "epoch": 1.193794506612411, "grad_norm": 0.9880650639533997, "learning_rate": 1e-05, "loss": 0.4847, "mean_token_accuracy": 0.8462492227554321, "num_tokens": 374061213.0, "step": 2347 }, { "epoch": 1.1943031536113937, "grad_norm": 1.1626392602920532, "learning_rate": 1e-05, "loss": 0.4729, "mean_token_accuracy": 0.850334882736206, "num_tokens": 374210420.0, "step": 2348 }, { "epoch": 1.1948118006103763, "grad_norm": 1.1458953619003296, "learning_rate": 1e-05, "loss": 0.4472, "mean_token_accuracy": 0.8559433221817017, "num_tokens": 374372042.0, "step": 2349 }, { "epoch": 1.195320447609359, "grad_norm": 1.1105952262878418, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.8556099534034729, "num_tokens": 374539265.0, "step": 2350 }, { "epoch": 1.1958290946083419, "grad_norm": 1.0160725116729736, "learning_rate": 1e-05, "loss": 0.487, "mean_token_accuracy": 0.8456284999847412, "num_tokens": 374697019.0, "step": 2351 }, { "epoch": 1.1963377416073244, "grad_norm": 0.9500008821487427, "learning_rate": 1e-05, "loss": 0.4688, "mean_token_accuracy": 0.8500959873199463, "num_tokens": 374846358.0, "step": 2352 }, { "epoch": 1.1968463886063072, "grad_norm": 1.1950064897537231, "learning_rate": 1e-05, "loss": 0.4595, "mean_token_accuracy": 0.8534548282623291, "num_tokens": 375007223.0, "step": 2353 }, { "epoch": 1.19735503560529, "grad_norm": 1.174160122871399, "learning_rate": 1e-05, "loss": 0.4704, "mean_token_accuracy": 0.8509343266487122, "num_tokens": 375157542.0, "step": 2354 }, { "epoch": 1.1978636826042726, "grad_norm": 1.091294765472412, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8505253791809082, "num_tokens": 375315585.0, "step": 2355 }, { "epoch": 1.1983723296032553, "grad_norm": 0.9935697913169861, "learning_rate": 1e-05, "loss": 0.4855, "mean_token_accuracy": 0.8474482893943787, "num_tokens": 375472279.0, "step": 2356 }, { "epoch": 1.1988809766022381, "grad_norm": 1.0689009428024292, "learning_rate": 1e-05, "loss": 0.4832, "mean_token_accuracy": 0.8478238582611084, "num_tokens": 375630662.0, "step": 2357 }, { "epoch": 1.1993896236012207, "grad_norm": 1.1201637983322144, "learning_rate": 1e-05, "loss": 0.4646, "mean_token_accuracy": 0.8541030883789062, "num_tokens": 375785251.0, "step": 2358 }, { "epoch": 1.1998982706002035, "grad_norm": 0.9344379901885986, "learning_rate": 1e-05, "loss": 0.4822, "mean_token_accuracy": 0.8468241691589355, "num_tokens": 375945570.0, "step": 2359 }, { "epoch": 1.200406917599186, "grad_norm": 1.052104115486145, "learning_rate": 1e-05, "loss": 0.4458, "mean_token_accuracy": 0.857330858707428, "num_tokens": 376099013.0, "step": 2360 }, { "epoch": 1.2009155645981688, "grad_norm": 0.9729582071304321, "learning_rate": 1e-05, "loss": 0.4897, "mean_token_accuracy": 0.8475154638290405, "num_tokens": 376260578.0, "step": 2361 }, { "epoch": 1.2014242115971516, "grad_norm": 0.9846155047416687, "learning_rate": 1e-05, "loss": 0.4777, "mean_token_accuracy": 0.848920464515686, "num_tokens": 376414415.0, "step": 2362 }, { "epoch": 1.2019328585961342, "grad_norm": 0.9831588864326477, "learning_rate": 1e-05, "loss": 0.4809, "mean_token_accuracy": 0.8465712070465088, "num_tokens": 376570177.0, "step": 2363 }, { "epoch": 1.202441505595117, "grad_norm": 0.9749376773834229, "learning_rate": 1e-05, "loss": 0.5013, "mean_token_accuracy": 0.8418674468994141, "num_tokens": 376731071.0, "step": 2364 }, { "epoch": 1.2029501525940998, "grad_norm": 1.0017136335372925, "learning_rate": 1e-05, "loss": 0.4833, "mean_token_accuracy": 0.8458402752876282, "num_tokens": 376900019.0, "step": 2365 }, { "epoch": 1.2034587995930823, "grad_norm": 0.9792733788490295, "learning_rate": 1e-05, "loss": 0.5065, "mean_token_accuracy": 0.8399918079376221, "num_tokens": 377061505.0, "step": 2366 }, { "epoch": 1.2039674465920651, "grad_norm": 1.000549554824829, "learning_rate": 1e-05, "loss": 0.4922, "mean_token_accuracy": 0.8457726836204529, "num_tokens": 377219211.0, "step": 2367 }, { "epoch": 1.204476093591048, "grad_norm": 1.007737398147583, "learning_rate": 1e-05, "loss": 0.5006, "mean_token_accuracy": 0.8432072401046753, "num_tokens": 377377739.0, "step": 2368 }, { "epoch": 1.2049847405900305, "grad_norm": 0.9702564477920532, "learning_rate": 1e-05, "loss": 0.526, "mean_token_accuracy": 0.8351813554763794, "num_tokens": 377541932.0, "step": 2369 }, { "epoch": 1.2054933875890133, "grad_norm": 1.081968903541565, "learning_rate": 1e-05, "loss": 0.4676, "mean_token_accuracy": 0.8512855172157288, "num_tokens": 377694342.0, "step": 2370 }, { "epoch": 1.2060020345879958, "grad_norm": 0.9188628196716309, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.858360230922699, "num_tokens": 377858941.0, "step": 2371 }, { "epoch": 1.2065106815869786, "grad_norm": 1.0161525011062622, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8422462940216064, "num_tokens": 378013523.0, "step": 2372 }, { "epoch": 1.2070193285859614, "grad_norm": 1.1344497203826904, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.8525291681289673, "num_tokens": 378174442.0, "step": 2373 }, { "epoch": 1.207527975584944, "grad_norm": 1.032597303390503, "learning_rate": 1e-05, "loss": 0.481, "mean_token_accuracy": 0.8476738333702087, "num_tokens": 378315219.0, "step": 2374 }, { "epoch": 1.2080366225839267, "grad_norm": 1.0668175220489502, "learning_rate": 1e-05, "loss": 0.4805, "mean_token_accuracy": 0.8466565608978271, "num_tokens": 378458710.0, "step": 2375 }, { "epoch": 1.2085452695829095, "grad_norm": 0.9233808517456055, "learning_rate": 1e-05, "loss": 0.4621, "mean_token_accuracy": 0.855979859828949, "num_tokens": 378622003.0, "step": 2376 }, { "epoch": 1.209053916581892, "grad_norm": 1.073775053024292, "learning_rate": 1e-05, "loss": 0.4566, "mean_token_accuracy": 0.8557116985321045, "num_tokens": 378784837.0, "step": 2377 }, { "epoch": 1.2095625635808749, "grad_norm": 1.0039360523223877, "learning_rate": 1e-05, "loss": 0.4823, "mean_token_accuracy": 0.8465985059738159, "num_tokens": 378932580.0, "step": 2378 }, { "epoch": 1.2100712105798577, "grad_norm": 1.0067378282546997, "learning_rate": 1e-05, "loss": 0.4733, "mean_token_accuracy": 0.8507084846496582, "num_tokens": 379088195.0, "step": 2379 }, { "epoch": 1.2105798575788402, "grad_norm": 1.0960066318511963, "learning_rate": 1e-05, "loss": 0.523, "mean_token_accuracy": 0.8363871574401855, "num_tokens": 379261815.0, "step": 2380 }, { "epoch": 1.211088504577823, "grad_norm": 0.9723873734474182, "learning_rate": 1e-05, "loss": 0.4883, "mean_token_accuracy": 0.8440775871276855, "num_tokens": 379421440.0, "step": 2381 }, { "epoch": 1.2115971515768056, "grad_norm": 1.0995724201202393, "learning_rate": 1e-05, "loss": 0.4903, "mean_token_accuracy": 0.8438718318939209, "num_tokens": 379589270.0, "step": 2382 }, { "epoch": 1.2121057985757884, "grad_norm": 1.0866910219192505, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.8414846658706665, "num_tokens": 379744264.0, "step": 2383 }, { "epoch": 1.2126144455747712, "grad_norm": 1.0899282693862915, "learning_rate": 1e-05, "loss": 0.4717, "mean_token_accuracy": 0.8495341539382935, "num_tokens": 379905958.0, "step": 2384 }, { "epoch": 1.2131230925737537, "grad_norm": 1.0874824523925781, "learning_rate": 1e-05, "loss": 0.4888, "mean_token_accuracy": 0.8447736501693726, "num_tokens": 380056524.0, "step": 2385 }, { "epoch": 1.2136317395727365, "grad_norm": 1.0588302612304688, "learning_rate": 1e-05, "loss": 0.4844, "mean_token_accuracy": 0.8479455709457397, "num_tokens": 380215696.0, "step": 2386 }, { "epoch": 1.2141403865717193, "grad_norm": 1.008784294128418, "learning_rate": 1e-05, "loss": 0.4769, "mean_token_accuracy": 0.8480744957923889, "num_tokens": 380386760.0, "step": 2387 }, { "epoch": 1.2146490335707019, "grad_norm": 1.0145485401153564, "learning_rate": 1e-05, "loss": 0.4522, "mean_token_accuracy": 0.8556555509567261, "num_tokens": 380552451.0, "step": 2388 }, { "epoch": 1.2151576805696847, "grad_norm": 1.0312669277191162, "learning_rate": 1e-05, "loss": 0.4869, "mean_token_accuracy": 0.8460147380828857, "num_tokens": 380699985.0, "step": 2389 }, { "epoch": 1.2156663275686674, "grad_norm": 0.9727829694747925, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.8518386483192444, "num_tokens": 380862532.0, "step": 2390 }, { "epoch": 1.21617497456765, "grad_norm": 0.9736183881759644, "learning_rate": 1e-05, "loss": 0.4602, "mean_token_accuracy": 0.8531427383422852, "num_tokens": 381023201.0, "step": 2391 }, { "epoch": 1.2166836215666328, "grad_norm": 1.0878318548202515, "learning_rate": 1e-05, "loss": 0.4953, "mean_token_accuracy": 0.842427134513855, "num_tokens": 381181297.0, "step": 2392 }, { "epoch": 1.2171922685656154, "grad_norm": 0.9806477427482605, "learning_rate": 1e-05, "loss": 0.534, "mean_token_accuracy": 0.8337027430534363, "num_tokens": 381335556.0, "step": 2393 }, { "epoch": 1.2177009155645981, "grad_norm": 1.0319772958755493, "learning_rate": 1e-05, "loss": 0.5084, "mean_token_accuracy": 0.8385846614837646, "num_tokens": 381492918.0, "step": 2394 }, { "epoch": 1.218209562563581, "grad_norm": 1.013101577758789, "learning_rate": 1e-05, "loss": 0.4856, "mean_token_accuracy": 0.8462007641792297, "num_tokens": 381650827.0, "step": 2395 }, { "epoch": 1.2187182095625635, "grad_norm": 0.9341661334037781, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.8424181938171387, "num_tokens": 381810610.0, "step": 2396 }, { "epoch": 1.2192268565615463, "grad_norm": 1.0404284000396729, "learning_rate": 1e-05, "loss": 0.4435, "mean_token_accuracy": 0.8567433953285217, "num_tokens": 381974046.0, "step": 2397 }, { "epoch": 1.219735503560529, "grad_norm": 1.1205888986587524, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8454039096832275, "num_tokens": 382127627.0, "step": 2398 }, { "epoch": 1.2202441505595116, "grad_norm": 1.0638031959533691, "learning_rate": 1e-05, "loss": 0.4833, "mean_token_accuracy": 0.8463817238807678, "num_tokens": 382289172.0, "step": 2399 }, { "epoch": 1.2207527975584944, "grad_norm": 1.1557694673538208, "learning_rate": 1e-05, "loss": 0.4869, "mean_token_accuracy": 0.8461000323295593, "num_tokens": 382453657.0, "step": 2400 }, { "epoch": 1.2212614445574772, "grad_norm": 1.068440556526184, "learning_rate": 1e-05, "loss": 0.4782, "mean_token_accuracy": 0.8476709127426147, "num_tokens": 382617744.0, "step": 2401 }, { "epoch": 1.2217700915564598, "grad_norm": 0.9085849523544312, "learning_rate": 1e-05, "loss": 0.4676, "mean_token_accuracy": 0.851020336151123, "num_tokens": 382781691.0, "step": 2402 }, { "epoch": 1.2222787385554426, "grad_norm": 1.0901038646697998, "learning_rate": 1e-05, "loss": 0.4633, "mean_token_accuracy": 0.8521700501441956, "num_tokens": 382944190.0, "step": 2403 }, { "epoch": 1.2227873855544251, "grad_norm": 0.984150767326355, "learning_rate": 1e-05, "loss": 0.4909, "mean_token_accuracy": 0.8450653553009033, "num_tokens": 383107927.0, "step": 2404 }, { "epoch": 1.223296032553408, "grad_norm": 1.288863182067871, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8521753549575806, "num_tokens": 383270582.0, "step": 2405 }, { "epoch": 1.2238046795523907, "grad_norm": 1.1820831298828125, "learning_rate": 1e-05, "loss": 0.5097, "mean_token_accuracy": 0.8402947783470154, "num_tokens": 383429967.0, "step": 2406 }, { "epoch": 1.2243133265513733, "grad_norm": 1.016980528831482, "learning_rate": 1e-05, "loss": 0.4619, "mean_token_accuracy": 0.8534340858459473, "num_tokens": 383588086.0, "step": 2407 }, { "epoch": 1.224821973550356, "grad_norm": 1.2975318431854248, "learning_rate": 1e-05, "loss": 0.5041, "mean_token_accuracy": 0.8402259349822998, "num_tokens": 383744521.0, "step": 2408 }, { "epoch": 1.2253306205493388, "grad_norm": 1.1058008670806885, "learning_rate": 1e-05, "loss": 0.4759, "mean_token_accuracy": 0.8495995998382568, "num_tokens": 383912014.0, "step": 2409 }, { "epoch": 1.2258392675483214, "grad_norm": 1.1800510883331299, "learning_rate": 1e-05, "loss": 0.4881, "mean_token_accuracy": 0.8459175825119019, "num_tokens": 384070051.0, "step": 2410 }, { "epoch": 1.2263479145473042, "grad_norm": 1.0510597229003906, "learning_rate": 1e-05, "loss": 0.507, "mean_token_accuracy": 0.8406599760055542, "num_tokens": 384232662.0, "step": 2411 }, { "epoch": 1.226856561546287, "grad_norm": 0.9730938076972961, "learning_rate": 1e-05, "loss": 0.4872, "mean_token_accuracy": 0.8466057777404785, "num_tokens": 384381225.0, "step": 2412 }, { "epoch": 1.2273652085452695, "grad_norm": 0.9850808382034302, "learning_rate": 1e-05, "loss": 0.4859, "mean_token_accuracy": 0.8452000021934509, "num_tokens": 384542789.0, "step": 2413 }, { "epoch": 1.2278738555442523, "grad_norm": 0.9626375436782837, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.8400334119796753, "num_tokens": 384710145.0, "step": 2414 }, { "epoch": 1.2283825025432349, "grad_norm": 1.0044533014297485, "learning_rate": 1e-05, "loss": 0.5257, "mean_token_accuracy": 0.8348107933998108, "num_tokens": 384873444.0, "step": 2415 }, { "epoch": 1.2288911495422177, "grad_norm": 1.079222321510315, "learning_rate": 1e-05, "loss": 0.4924, "mean_token_accuracy": 0.8461150527000427, "num_tokens": 385030375.0, "step": 2416 }, { "epoch": 1.2293997965412005, "grad_norm": 1.0069643259048462, "learning_rate": 1e-05, "loss": 0.481, "mean_token_accuracy": 0.8482726216316223, "num_tokens": 385195808.0, "step": 2417 }, { "epoch": 1.229908443540183, "grad_norm": 1.000921368598938, "learning_rate": 1e-05, "loss": 0.5255, "mean_token_accuracy": 0.8357363939285278, "num_tokens": 385361429.0, "step": 2418 }, { "epoch": 1.2304170905391658, "grad_norm": 0.9638259410858154, "learning_rate": 1e-05, "loss": 0.4537, "mean_token_accuracy": 0.8551135063171387, "num_tokens": 385524948.0, "step": 2419 }, { "epoch": 1.2309257375381486, "grad_norm": 0.9751047492027283, "learning_rate": 1e-05, "loss": 0.525, "mean_token_accuracy": 0.834947407245636, "num_tokens": 385690394.0, "step": 2420 }, { "epoch": 1.2314343845371312, "grad_norm": 0.9985384941101074, "learning_rate": 1e-05, "loss": 0.4961, "mean_token_accuracy": 0.8441144227981567, "num_tokens": 385849268.0, "step": 2421 }, { "epoch": 1.231943031536114, "grad_norm": 0.9478381276130676, "learning_rate": 1e-05, "loss": 0.4534, "mean_token_accuracy": 0.8564085960388184, "num_tokens": 386012537.0, "step": 2422 }, { "epoch": 1.2324516785350967, "grad_norm": 1.10886812210083, "learning_rate": 1e-05, "loss": 0.5219, "mean_token_accuracy": 0.8358174562454224, "num_tokens": 386175987.0, "step": 2423 }, { "epoch": 1.2329603255340793, "grad_norm": 0.9503421187400818, "learning_rate": 1e-05, "loss": 0.4535, "mean_token_accuracy": 0.8555249571800232, "num_tokens": 386338087.0, "step": 2424 }, { "epoch": 1.233468972533062, "grad_norm": 0.9710291624069214, "learning_rate": 1e-05, "loss": 0.4705, "mean_token_accuracy": 0.8473029136657715, "num_tokens": 386495175.0, "step": 2425 }, { "epoch": 1.2339776195320447, "grad_norm": 1.0143202543258667, "learning_rate": 1e-05, "loss": 0.4872, "mean_token_accuracy": 0.8451850414276123, "num_tokens": 386663700.0, "step": 2426 }, { "epoch": 1.2344862665310274, "grad_norm": 0.9353373050689697, "learning_rate": 1e-05, "loss": 0.4714, "mean_token_accuracy": 0.852155327796936, "num_tokens": 386818763.0, "step": 2427 }, { "epoch": 1.2349949135300102, "grad_norm": 0.9928601980209351, "learning_rate": 1e-05, "loss": 0.499, "mean_token_accuracy": 0.842134952545166, "num_tokens": 386979867.0, "step": 2428 }, { "epoch": 1.2355035605289928, "grad_norm": 0.9530983567237854, "learning_rate": 1e-05, "loss": 0.4602, "mean_token_accuracy": 0.8531316518783569, "num_tokens": 387146689.0, "step": 2429 }, { "epoch": 1.2360122075279756, "grad_norm": 0.9834848642349243, "learning_rate": 1e-05, "loss": 0.4913, "mean_token_accuracy": 0.8447766304016113, "num_tokens": 387308229.0, "step": 2430 }, { "epoch": 1.2365208545269584, "grad_norm": 1.017050862312317, "learning_rate": 1e-05, "loss": 0.5054, "mean_token_accuracy": 0.839943528175354, "num_tokens": 387464799.0, "step": 2431 }, { "epoch": 1.237029501525941, "grad_norm": 1.0119413137435913, "learning_rate": 1e-05, "loss": 0.5069, "mean_token_accuracy": 0.8400273323059082, "num_tokens": 387633541.0, "step": 2432 }, { "epoch": 1.2375381485249237, "grad_norm": 0.9309226274490356, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8526896834373474, "num_tokens": 387791273.0, "step": 2433 }, { "epoch": 1.2380467955239065, "grad_norm": 1.1053496599197388, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8502742648124695, "num_tokens": 387945149.0, "step": 2434 }, { "epoch": 1.238555442522889, "grad_norm": 1.0243339538574219, "learning_rate": 1e-05, "loss": 0.4535, "mean_token_accuracy": 0.8547742962837219, "num_tokens": 388104340.0, "step": 2435 }, { "epoch": 1.2390640895218719, "grad_norm": 0.9826825857162476, "learning_rate": 1e-05, "loss": 0.4782, "mean_token_accuracy": 0.8481093049049377, "num_tokens": 388262139.0, "step": 2436 }, { "epoch": 1.2395727365208544, "grad_norm": 0.9026824235916138, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8482067584991455, "num_tokens": 388415175.0, "step": 2437 }, { "epoch": 1.2400813835198372, "grad_norm": 1.0203884840011597, "learning_rate": 1e-05, "loss": 0.4813, "mean_token_accuracy": 0.8465292453765869, "num_tokens": 388567660.0, "step": 2438 }, { "epoch": 1.24059003051882, "grad_norm": 1.0784242153167725, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.8525174856185913, "num_tokens": 388711146.0, "step": 2439 }, { "epoch": 1.2410986775178026, "grad_norm": 1.0541739463806152, "learning_rate": 1e-05, "loss": 0.4858, "mean_token_accuracy": 0.8467655181884766, "num_tokens": 388862732.0, "step": 2440 }, { "epoch": 1.2416073245167853, "grad_norm": 1.068041443824768, "learning_rate": 1e-05, "loss": 0.5087, "mean_token_accuracy": 0.8395960927009583, "num_tokens": 389018134.0, "step": 2441 }, { "epoch": 1.2421159715157681, "grad_norm": 0.9338946342468262, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.8567076921463013, "num_tokens": 389171597.0, "step": 2442 }, { "epoch": 1.2426246185147507, "grad_norm": 0.9714147448539734, "learning_rate": 1e-05, "loss": 0.5077, "mean_token_accuracy": 0.8383303880691528, "num_tokens": 389316922.0, "step": 2443 }, { "epoch": 1.2431332655137335, "grad_norm": 0.9373910427093506, "learning_rate": 1e-05, "loss": 0.4776, "mean_token_accuracy": 0.8496674299240112, "num_tokens": 389471070.0, "step": 2444 }, { "epoch": 1.2436419125127163, "grad_norm": 0.9623604416847229, "learning_rate": 1e-05, "loss": 0.4782, "mean_token_accuracy": 0.8476322889328003, "num_tokens": 389630233.0, "step": 2445 }, { "epoch": 1.2441505595116988, "grad_norm": 0.9489457607269287, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8417967557907104, "num_tokens": 389778197.0, "step": 2446 }, { "epoch": 1.2446592065106816, "grad_norm": 0.8921250700950623, "learning_rate": 1e-05, "loss": 0.507, "mean_token_accuracy": 0.8411307334899902, "num_tokens": 389946253.0, "step": 2447 }, { "epoch": 1.2451678535096642, "grad_norm": 0.9162742495536804, "learning_rate": 1e-05, "loss": 0.4587, "mean_token_accuracy": 0.8542463779449463, "num_tokens": 390102274.0, "step": 2448 }, { "epoch": 1.245676500508647, "grad_norm": 1.0814255475997925, "learning_rate": 1e-05, "loss": 0.4908, "mean_token_accuracy": 0.8456716537475586, "num_tokens": 390252711.0, "step": 2449 }, { "epoch": 1.2461851475076298, "grad_norm": 0.9779260754585266, "learning_rate": 1e-05, "loss": 0.5072, "mean_token_accuracy": 0.8394188284873962, "num_tokens": 390422450.0, "step": 2450 }, { "epoch": 1.2466937945066123, "grad_norm": 0.9985749125480652, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8499858975410461, "num_tokens": 390587459.0, "step": 2451 }, { "epoch": 1.2472024415055951, "grad_norm": 1.0046234130859375, "learning_rate": 1e-05, "loss": 0.47, "mean_token_accuracy": 0.8494718074798584, "num_tokens": 390752194.0, "step": 2452 }, { "epoch": 1.247711088504578, "grad_norm": 1.045386791229248, "learning_rate": 1e-05, "loss": 0.4826, "mean_token_accuracy": 0.8462944030761719, "num_tokens": 390918814.0, "step": 2453 }, { "epoch": 1.2482197355035605, "grad_norm": 0.9235852360725403, "learning_rate": 1e-05, "loss": 0.4547, "mean_token_accuracy": 0.8562573194503784, "num_tokens": 391076755.0, "step": 2454 }, { "epoch": 1.2487283825025433, "grad_norm": 0.9939376711845398, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8503568768501282, "num_tokens": 391234191.0, "step": 2455 }, { "epoch": 1.249237029501526, "grad_norm": 1.0112658739089966, "learning_rate": 1e-05, "loss": 0.4806, "mean_token_accuracy": 0.847571611404419, "num_tokens": 391386429.0, "step": 2456 }, { "epoch": 1.2497456765005086, "grad_norm": 0.9470198750495911, "learning_rate": 1e-05, "loss": 0.4651, "mean_token_accuracy": 0.849887490272522, "num_tokens": 391554073.0, "step": 2457 }, { "epoch": 1.2502543234994914, "grad_norm": 1.2515413761138916, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8373808860778809, "num_tokens": 391709893.0, "step": 2458 }, { "epoch": 1.250762970498474, "grad_norm": 1.092207908630371, "learning_rate": 1e-05, "loss": 0.5099, "mean_token_accuracy": 0.837600827217102, "num_tokens": 391870192.0, "step": 2459 }, { "epoch": 1.2512716174974567, "grad_norm": 0.9322696328163147, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.8519747257232666, "num_tokens": 392024517.0, "step": 2460 }, { "epoch": 1.2517802644964395, "grad_norm": 0.991862416267395, "learning_rate": 1e-05, "loss": 0.4607, "mean_token_accuracy": 0.851301908493042, "num_tokens": 392177494.0, "step": 2461 }, { "epoch": 1.252288911495422, "grad_norm": 1.0980491638183594, "learning_rate": 1e-05, "loss": 0.4835, "mean_token_accuracy": 0.8465778827667236, "num_tokens": 392336854.0, "step": 2462 }, { "epoch": 1.2527975584944049, "grad_norm": 0.9788417816162109, "learning_rate": 1e-05, "loss": 0.4925, "mean_token_accuracy": 0.8434962630271912, "num_tokens": 392496140.0, "step": 2463 }, { "epoch": 1.2533062054933877, "grad_norm": 0.9595546722412109, "learning_rate": 1e-05, "loss": 0.4907, "mean_token_accuracy": 0.8437904119491577, "num_tokens": 392656687.0, "step": 2464 }, { "epoch": 1.2538148524923702, "grad_norm": 0.9926989674568176, "learning_rate": 1e-05, "loss": 0.5189, "mean_token_accuracy": 0.836510419845581, "num_tokens": 392812396.0, "step": 2465 }, { "epoch": 1.254323499491353, "grad_norm": 1.0025798082351685, "learning_rate": 1e-05, "loss": 0.5382, "mean_token_accuracy": 0.8307243585586548, "num_tokens": 392972527.0, "step": 2466 }, { "epoch": 1.2548321464903358, "grad_norm": 0.9914138913154602, "learning_rate": 1e-05, "loss": 0.4762, "mean_token_accuracy": 0.8475267291069031, "num_tokens": 393135630.0, "step": 2467 }, { "epoch": 1.2553407934893184, "grad_norm": 0.9821697473526001, "learning_rate": 1e-05, "loss": 0.4857, "mean_token_accuracy": 0.8450480699539185, "num_tokens": 393296349.0, "step": 2468 }, { "epoch": 1.2558494404883012, "grad_norm": 0.9580907821655273, "learning_rate": 1e-05, "loss": 0.4362, "mean_token_accuracy": 0.8602911233901978, "num_tokens": 393459529.0, "step": 2469 }, { "epoch": 1.2563580874872837, "grad_norm": 0.9034733772277832, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8528581857681274, "num_tokens": 393622385.0, "step": 2470 }, { "epoch": 1.2568667344862665, "grad_norm": 1.0865966081619263, "learning_rate": 1e-05, "loss": 0.4819, "mean_token_accuracy": 0.8464517593383789, "num_tokens": 393786264.0, "step": 2471 }, { "epoch": 1.2573753814852493, "grad_norm": 0.9148275852203369, "learning_rate": 1e-05, "loss": 0.4678, "mean_token_accuracy": 0.8529778718948364, "num_tokens": 393940347.0, "step": 2472 }, { "epoch": 1.2578840284842319, "grad_norm": 0.9414737820625305, "learning_rate": 1e-05, "loss": 0.47, "mean_token_accuracy": 0.8517553210258484, "num_tokens": 394104484.0, "step": 2473 }, { "epoch": 1.2583926754832147, "grad_norm": 1.1321500539779663, "learning_rate": 1e-05, "loss": 0.5036, "mean_token_accuracy": 0.8403670191764832, "num_tokens": 394274909.0, "step": 2474 }, { "epoch": 1.2589013224821974, "grad_norm": 1.0167399644851685, "learning_rate": 1e-05, "loss": 0.4441, "mean_token_accuracy": 0.8573471307754517, "num_tokens": 394422663.0, "step": 2475 }, { "epoch": 1.25940996948118, "grad_norm": 1.066833257675171, "learning_rate": 1e-05, "loss": 0.4819, "mean_token_accuracy": 0.8465922474861145, "num_tokens": 394572404.0, "step": 2476 }, { "epoch": 1.2599186164801628, "grad_norm": 1.0820730924606323, "learning_rate": 1e-05, "loss": 0.5148, "mean_token_accuracy": 0.8378000855445862, "num_tokens": 394741473.0, "step": 2477 }, { "epoch": 1.2604272634791456, "grad_norm": 0.9718859791755676, "learning_rate": 1e-05, "loss": 0.5259, "mean_token_accuracy": 0.8359262943267822, "num_tokens": 394904157.0, "step": 2478 }, { "epoch": 1.2609359104781281, "grad_norm": 1.038017988204956, "learning_rate": 1e-05, "loss": 0.4661, "mean_token_accuracy": 0.8514251112937927, "num_tokens": 395046309.0, "step": 2479 }, { "epoch": 1.261444557477111, "grad_norm": 1.0430866479873657, "learning_rate": 1e-05, "loss": 0.4947, "mean_token_accuracy": 0.8451223373413086, "num_tokens": 395205744.0, "step": 2480 }, { "epoch": 1.2619532044760935, "grad_norm": 0.9635177850723267, "learning_rate": 1e-05, "loss": 0.4728, "mean_token_accuracy": 0.8498380184173584, "num_tokens": 395370214.0, "step": 2481 }, { "epoch": 1.2624618514750763, "grad_norm": 1.0214176177978516, "learning_rate": 1e-05, "loss": 0.4738, "mean_token_accuracy": 0.8482320308685303, "num_tokens": 395535591.0, "step": 2482 }, { "epoch": 1.262970498474059, "grad_norm": 0.9416934847831726, "learning_rate": 1e-05, "loss": 0.4763, "mean_token_accuracy": 0.8485772609710693, "num_tokens": 395720117.0, "step": 2483 }, { "epoch": 1.2634791454730416, "grad_norm": 1.0123696327209473, "learning_rate": 1e-05, "loss": 0.5005, "mean_token_accuracy": 0.8418780565261841, "num_tokens": 395887109.0, "step": 2484 }, { "epoch": 1.2639877924720244, "grad_norm": 0.9469811916351318, "learning_rate": 1e-05, "loss": 0.513, "mean_token_accuracy": 0.8381527662277222, "num_tokens": 396064517.0, "step": 2485 }, { "epoch": 1.264496439471007, "grad_norm": 1.1035939455032349, "learning_rate": 1e-05, "loss": 0.4926, "mean_token_accuracy": 0.843600869178772, "num_tokens": 396215248.0, "step": 2486 }, { "epoch": 1.2650050864699898, "grad_norm": 1.066406011581421, "learning_rate": 1e-05, "loss": 0.4698, "mean_token_accuracy": 0.8506541848182678, "num_tokens": 396380487.0, "step": 2487 }, { "epoch": 1.2655137334689726, "grad_norm": 0.9918091893196106, "learning_rate": 1e-05, "loss": 0.5373, "mean_token_accuracy": 0.8316949605941772, "num_tokens": 396538633.0, "step": 2488 }, { "epoch": 1.2660223804679553, "grad_norm": 1.0956774950027466, "learning_rate": 1e-05, "loss": 0.472, "mean_token_accuracy": 0.848578691482544, "num_tokens": 396707894.0, "step": 2489 }, { "epoch": 1.266531027466938, "grad_norm": 0.9060829877853394, "learning_rate": 1e-05, "loss": 0.4735, "mean_token_accuracy": 0.8491650819778442, "num_tokens": 396867308.0, "step": 2490 }, { "epoch": 1.2670396744659207, "grad_norm": 1.0590204000473022, "learning_rate": 1e-05, "loss": 0.5051, "mean_token_accuracy": 0.8396422863006592, "num_tokens": 397012467.0, "step": 2491 }, { "epoch": 1.2675483214649033, "grad_norm": 0.9565798044204712, "learning_rate": 1e-05, "loss": 0.4777, "mean_token_accuracy": 0.8488767743110657, "num_tokens": 397176821.0, "step": 2492 }, { "epoch": 1.268056968463886, "grad_norm": 0.990504801273346, "learning_rate": 1e-05, "loss": 0.4582, "mean_token_accuracy": 0.8547040820121765, "num_tokens": 397331212.0, "step": 2493 }, { "epoch": 1.2685656154628688, "grad_norm": 1.0066111087799072, "learning_rate": 1e-05, "loss": 0.4664, "mean_token_accuracy": 0.852090060710907, "num_tokens": 397507718.0, "step": 2494 }, { "epoch": 1.2690742624618514, "grad_norm": 0.9095807671546936, "learning_rate": 1e-05, "loss": 0.4625, "mean_token_accuracy": 0.8523800373077393, "num_tokens": 397664472.0, "step": 2495 }, { "epoch": 1.2695829094608342, "grad_norm": 1.028704047203064, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8492536544799805, "num_tokens": 397832455.0, "step": 2496 }, { "epoch": 1.2700915564598168, "grad_norm": 0.970306396484375, "learning_rate": 1e-05, "loss": 0.4922, "mean_token_accuracy": 0.8438721895217896, "num_tokens": 397993790.0, "step": 2497 }, { "epoch": 1.2706002034587995, "grad_norm": 1.730830192565918, "learning_rate": 1e-05, "loss": 0.4765, "mean_token_accuracy": 0.8479042053222656, "num_tokens": 398143376.0, "step": 2498 }, { "epoch": 1.2711088504577823, "grad_norm": 0.9698526859283447, "learning_rate": 1e-05, "loss": 0.4615, "mean_token_accuracy": 0.8523569703102112, "num_tokens": 398307832.0, "step": 2499 }, { "epoch": 1.2716174974567651, "grad_norm": 0.9873821139335632, "learning_rate": 1e-05, "loss": 0.479, "mean_token_accuracy": 0.84932541847229, "num_tokens": 398461934.0, "step": 2500 }, { "epoch": 1.2721261444557477, "grad_norm": 1.029078483581543, "learning_rate": 1e-05, "loss": 0.4955, "mean_token_accuracy": 0.8425160646438599, "num_tokens": 398614969.0, "step": 2501 }, { "epoch": 1.2726347914547305, "grad_norm": 0.9706425666809082, "learning_rate": 1e-05, "loss": 0.4734, "mean_token_accuracy": 0.8488206267356873, "num_tokens": 398775458.0, "step": 2502 }, { "epoch": 1.273143438453713, "grad_norm": 1.0119162797927856, "learning_rate": 1e-05, "loss": 0.5239, "mean_token_accuracy": 0.8351165652275085, "num_tokens": 398931322.0, "step": 2503 }, { "epoch": 1.2736520854526958, "grad_norm": 0.9316058158874512, "learning_rate": 1e-05, "loss": 0.4926, "mean_token_accuracy": 0.8441140055656433, "num_tokens": 399094210.0, "step": 2504 }, { "epoch": 1.2741607324516786, "grad_norm": 1.0092213153839111, "learning_rate": 1e-05, "loss": 0.4898, "mean_token_accuracy": 0.8453854322433472, "num_tokens": 399251980.0, "step": 2505 }, { "epoch": 1.2746693794506612, "grad_norm": 0.9519758820533752, "learning_rate": 1e-05, "loss": 0.4617, "mean_token_accuracy": 0.8530810475349426, "num_tokens": 399408637.0, "step": 2506 }, { "epoch": 1.275178026449644, "grad_norm": 1.0151798725128174, "learning_rate": 1e-05, "loss": 0.4917, "mean_token_accuracy": 0.8444891571998596, "num_tokens": 399567395.0, "step": 2507 }, { "epoch": 1.2756866734486265, "grad_norm": 1.0232245922088623, "learning_rate": 1e-05, "loss": 0.4792, "mean_token_accuracy": 0.848068356513977, "num_tokens": 399726159.0, "step": 2508 }, { "epoch": 1.2761953204476093, "grad_norm": 0.9277547597885132, "learning_rate": 1e-05, "loss": 0.4508, "mean_token_accuracy": 0.8560090065002441, "num_tokens": 399888825.0, "step": 2509 }, { "epoch": 1.276703967446592, "grad_norm": 0.9623938202857971, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8442527651786804, "num_tokens": 400058221.0, "step": 2510 }, { "epoch": 1.2772126144455749, "grad_norm": 0.9173216819763184, "learning_rate": 1e-05, "loss": 0.4642, "mean_token_accuracy": 0.8523637652397156, "num_tokens": 400223047.0, "step": 2511 }, { "epoch": 1.2777212614445574, "grad_norm": 0.9462375640869141, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8448326587677002, "num_tokens": 400381507.0, "step": 2512 }, { "epoch": 1.2782299084435402, "grad_norm": 0.9886204600334167, "learning_rate": 1e-05, "loss": 0.4859, "mean_token_accuracy": 0.846415102481842, "num_tokens": 400546494.0, "step": 2513 }, { "epoch": 1.2787385554425228, "grad_norm": 1.0517821311950684, "learning_rate": 1e-05, "loss": 0.4728, "mean_token_accuracy": 0.8493285179138184, "num_tokens": 400704230.0, "step": 2514 }, { "epoch": 1.2792472024415056, "grad_norm": 0.9828445911407471, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8424510359764099, "num_tokens": 400862305.0, "step": 2515 }, { "epoch": 1.2797558494404884, "grad_norm": 0.9465426802635193, "learning_rate": 1e-05, "loss": 0.4534, "mean_token_accuracy": 0.8549497127532959, "num_tokens": 401018958.0, "step": 2516 }, { "epoch": 1.280264496439471, "grad_norm": 0.9962822794914246, "learning_rate": 1e-05, "loss": 0.506, "mean_token_accuracy": 0.8403021097183228, "num_tokens": 401178895.0, "step": 2517 }, { "epoch": 1.2807731434384537, "grad_norm": 0.9522839784622192, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8494329452514648, "num_tokens": 401325913.0, "step": 2518 }, { "epoch": 1.2812817904374363, "grad_norm": 0.9846671223640442, "learning_rate": 1e-05, "loss": 0.4927, "mean_token_accuracy": 0.8454416394233704, "num_tokens": 401492299.0, "step": 2519 }, { "epoch": 1.281790437436419, "grad_norm": 0.9660159945487976, "learning_rate": 1e-05, "loss": 0.5084, "mean_token_accuracy": 0.838665246963501, "num_tokens": 401650213.0, "step": 2520 }, { "epoch": 1.2822990844354019, "grad_norm": 1.0401360988616943, "learning_rate": 1e-05, "loss": 0.4862, "mean_token_accuracy": 0.8451883792877197, "num_tokens": 401803969.0, "step": 2521 }, { "epoch": 1.2828077314343846, "grad_norm": 0.9664472937583923, "learning_rate": 1e-05, "loss": 0.4705, "mean_token_accuracy": 0.8487030267715454, "num_tokens": 401952668.0, "step": 2522 }, { "epoch": 1.2833163784333672, "grad_norm": 0.9471451640129089, "learning_rate": 1e-05, "loss": 0.4936, "mean_token_accuracy": 0.8442484736442566, "num_tokens": 402114443.0, "step": 2523 }, { "epoch": 1.28382502543235, "grad_norm": 0.9966514706611633, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.857598066329956, "num_tokens": 402266408.0, "step": 2524 }, { "epoch": 1.2843336724313326, "grad_norm": 0.9822807908058167, "learning_rate": 1e-05, "loss": 0.529, "mean_token_accuracy": 0.8337392807006836, "num_tokens": 402435949.0, "step": 2525 }, { "epoch": 1.2848423194303153, "grad_norm": 1.0041320323944092, "learning_rate": 1e-05, "loss": 0.4955, "mean_token_accuracy": 0.8431259989738464, "num_tokens": 402595965.0, "step": 2526 }, { "epoch": 1.2853509664292981, "grad_norm": 1.0624803304672241, "learning_rate": 1e-05, "loss": 0.5135, "mean_token_accuracy": 0.8371453285217285, "num_tokens": 402757273.0, "step": 2527 }, { "epoch": 1.2858596134282807, "grad_norm": 1.093480110168457, "learning_rate": 1e-05, "loss": 0.5032, "mean_token_accuracy": 0.8406103849411011, "num_tokens": 402919556.0, "step": 2528 }, { "epoch": 1.2863682604272635, "grad_norm": 0.9978317618370056, "learning_rate": 1e-05, "loss": 0.4968, "mean_token_accuracy": 0.8422409892082214, "num_tokens": 403073153.0, "step": 2529 }, { "epoch": 1.286876907426246, "grad_norm": 1.011997103691101, "learning_rate": 1e-05, "loss": 0.4662, "mean_token_accuracy": 0.8520162105560303, "num_tokens": 403234129.0, "step": 2530 }, { "epoch": 1.2873855544252288, "grad_norm": 0.9961965680122375, "learning_rate": 1e-05, "loss": 0.4843, "mean_token_accuracy": 0.8475301265716553, "num_tokens": 403389659.0, "step": 2531 }, { "epoch": 1.2878942014242116, "grad_norm": 0.9876997470855713, "learning_rate": 1e-05, "loss": 0.5039, "mean_token_accuracy": 0.8410624265670776, "num_tokens": 403545378.0, "step": 2532 }, { "epoch": 1.2884028484231944, "grad_norm": 0.9761861562728882, "learning_rate": 1e-05, "loss": 0.4943, "mean_token_accuracy": 0.8429926633834839, "num_tokens": 403721714.0, "step": 2533 }, { "epoch": 1.288911495422177, "grad_norm": 1.060185194015503, "learning_rate": 1e-05, "loss": 0.5029, "mean_token_accuracy": 0.8406179547309875, "num_tokens": 403879895.0, "step": 2534 }, { "epoch": 1.2894201424211598, "grad_norm": 0.9180830121040344, "learning_rate": 1e-05, "loss": 0.4445, "mean_token_accuracy": 0.857403039932251, "num_tokens": 404039422.0, "step": 2535 }, { "epoch": 1.2899287894201423, "grad_norm": 0.9527869820594788, "learning_rate": 1e-05, "loss": 0.4928, "mean_token_accuracy": 0.8432321548461914, "num_tokens": 404194940.0, "step": 2536 }, { "epoch": 1.2904374364191251, "grad_norm": 1.004622459411621, "learning_rate": 1e-05, "loss": 0.544, "mean_token_accuracy": 0.8304028511047363, "num_tokens": 404356690.0, "step": 2537 }, { "epoch": 1.290946083418108, "grad_norm": 0.9781545400619507, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.8512308597564697, "num_tokens": 404516058.0, "step": 2538 }, { "epoch": 1.2914547304170905, "grad_norm": 1.0009372234344482, "learning_rate": 1e-05, "loss": 0.5014, "mean_token_accuracy": 0.8395940065383911, "num_tokens": 404673804.0, "step": 2539 }, { "epoch": 1.2919633774160733, "grad_norm": 1.014769196510315, "learning_rate": 1e-05, "loss": 0.4988, "mean_token_accuracy": 0.8452779054641724, "num_tokens": 404823463.0, "step": 2540 }, { "epoch": 1.2924720244150558, "grad_norm": 1.0117167234420776, "learning_rate": 1e-05, "loss": 0.4734, "mean_token_accuracy": 0.8504893183708191, "num_tokens": 404978592.0, "step": 2541 }, { "epoch": 1.2929806714140386, "grad_norm": 1.0531048774719238, "learning_rate": 1e-05, "loss": 0.5031, "mean_token_accuracy": 0.8393880128860474, "num_tokens": 405133209.0, "step": 2542 }, { "epoch": 1.2934893184130214, "grad_norm": 0.986916184425354, "learning_rate": 1e-05, "loss": 0.4757, "mean_token_accuracy": 0.8475457429885864, "num_tokens": 405291663.0, "step": 2543 }, { "epoch": 1.2939979654120042, "grad_norm": 0.9645692706108093, "learning_rate": 1e-05, "loss": 0.4935, "mean_token_accuracy": 0.8424645662307739, "num_tokens": 405447583.0, "step": 2544 }, { "epoch": 1.2945066124109867, "grad_norm": 1.0199670791625977, "learning_rate": 1e-05, "loss": 0.48, "mean_token_accuracy": 0.8481231927871704, "num_tokens": 405609401.0, "step": 2545 }, { "epoch": 1.2950152594099695, "grad_norm": 1.0394608974456787, "learning_rate": 1e-05, "loss": 0.4514, "mean_token_accuracy": 0.8566803932189941, "num_tokens": 405772749.0, "step": 2546 }, { "epoch": 1.295523906408952, "grad_norm": 1.018204689025879, "learning_rate": 1e-05, "loss": 0.465, "mean_token_accuracy": 0.8515591025352478, "num_tokens": 405944255.0, "step": 2547 }, { "epoch": 1.2960325534079349, "grad_norm": 1.1013845205307007, "learning_rate": 1e-05, "loss": 0.4873, "mean_token_accuracy": 0.8453958034515381, "num_tokens": 406102126.0, "step": 2548 }, { "epoch": 1.2965412004069177, "grad_norm": 0.9380601644515991, "learning_rate": 1e-05, "loss": 0.4628, "mean_token_accuracy": 0.8526260256767273, "num_tokens": 406265194.0, "step": 2549 }, { "epoch": 1.2970498474059002, "grad_norm": 1.021324634552002, "learning_rate": 1e-05, "loss": 0.5045, "mean_token_accuracy": 0.8427411317825317, "num_tokens": 406423086.0, "step": 2550 }, { "epoch": 1.297558494404883, "grad_norm": 1.0104503631591797, "learning_rate": 1e-05, "loss": 0.4555, "mean_token_accuracy": 0.8549754619598389, "num_tokens": 406587567.0, "step": 2551 }, { "epoch": 1.2980671414038656, "grad_norm": 0.9233775734901428, "learning_rate": 1e-05, "loss": 0.4687, "mean_token_accuracy": 0.8530840873718262, "num_tokens": 406744290.0, "step": 2552 }, { "epoch": 1.2985757884028484, "grad_norm": 1.0981473922729492, "learning_rate": 1e-05, "loss": 0.4951, "mean_token_accuracy": 0.8443585634231567, "num_tokens": 406899607.0, "step": 2553 }, { "epoch": 1.2990844354018312, "grad_norm": 1.0113608837127686, "learning_rate": 1e-05, "loss": 0.4793, "mean_token_accuracy": 0.8468562960624695, "num_tokens": 407056075.0, "step": 2554 }, { "epoch": 1.299593082400814, "grad_norm": 0.9467877149581909, "learning_rate": 1e-05, "loss": 0.4814, "mean_token_accuracy": 0.8476945161819458, "num_tokens": 407213538.0, "step": 2555 }, { "epoch": 1.3001017293997965, "grad_norm": 1.080271601676941, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.841745138168335, "num_tokens": 407364405.0, "step": 2556 }, { "epoch": 1.3006103763987793, "grad_norm": 1.006981611251831, "learning_rate": 1e-05, "loss": 0.4803, "mean_token_accuracy": 0.8464992642402649, "num_tokens": 407530938.0, "step": 2557 }, { "epoch": 1.3011190233977619, "grad_norm": 0.956074595451355, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8508355021476746, "num_tokens": 407689878.0, "step": 2558 }, { "epoch": 1.3016276703967447, "grad_norm": 0.9157811999320984, "learning_rate": 1e-05, "loss": 0.4618, "mean_token_accuracy": 0.8531558513641357, "num_tokens": 407842592.0, "step": 2559 }, { "epoch": 1.3021363173957274, "grad_norm": 0.9051852822303772, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8445741534233093, "num_tokens": 408019653.0, "step": 2560 }, { "epoch": 1.30264496439471, "grad_norm": 0.8561477661132812, "learning_rate": 1e-05, "loss": 0.4602, "mean_token_accuracy": 0.8530067205429077, "num_tokens": 408182639.0, "step": 2561 }, { "epoch": 1.3031536113936928, "grad_norm": 0.9158396124839783, "learning_rate": 1e-05, "loss": 0.4943, "mean_token_accuracy": 0.8439619541168213, "num_tokens": 408343205.0, "step": 2562 }, { "epoch": 1.3036622583926754, "grad_norm": 0.940427839756012, "learning_rate": 1e-05, "loss": 0.4771, "mean_token_accuracy": 0.8503564596176147, "num_tokens": 408505934.0, "step": 2563 }, { "epoch": 1.3041709053916581, "grad_norm": 0.9462770223617554, "learning_rate": 1e-05, "loss": 0.4903, "mean_token_accuracy": 0.8440700173377991, "num_tokens": 408666686.0, "step": 2564 }, { "epoch": 1.304679552390641, "grad_norm": 0.9128457903862, "learning_rate": 1e-05, "loss": 0.4983, "mean_token_accuracy": 0.8417966365814209, "num_tokens": 408829260.0, "step": 2565 }, { "epoch": 1.3051881993896237, "grad_norm": 0.95364910364151, "learning_rate": 1e-05, "loss": 0.4976, "mean_token_accuracy": 0.8429286479949951, "num_tokens": 408989130.0, "step": 2566 }, { "epoch": 1.3056968463886063, "grad_norm": 1.0803797245025635, "learning_rate": 1e-05, "loss": 0.4891, "mean_token_accuracy": 0.8446954488754272, "num_tokens": 409160100.0, "step": 2567 }, { "epoch": 1.306205493387589, "grad_norm": 0.9539232850074768, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8547990918159485, "num_tokens": 409319697.0, "step": 2568 }, { "epoch": 1.3067141403865716, "grad_norm": 1.019270896911621, "learning_rate": 1e-05, "loss": 0.4858, "mean_token_accuracy": 0.8463223576545715, "num_tokens": 409473663.0, "step": 2569 }, { "epoch": 1.3072227873855544, "grad_norm": 0.9860494136810303, "learning_rate": 1e-05, "loss": 0.4802, "mean_token_accuracy": 0.8477706909179688, "num_tokens": 409638569.0, "step": 2570 }, { "epoch": 1.3077314343845372, "grad_norm": 0.9673585295677185, "learning_rate": 1e-05, "loss": 0.4917, "mean_token_accuracy": 0.8434401750564575, "num_tokens": 409802014.0, "step": 2571 }, { "epoch": 1.3082400813835198, "grad_norm": 1.1659724712371826, "learning_rate": 1e-05, "loss": 0.4864, "mean_token_accuracy": 0.8469688892364502, "num_tokens": 409963322.0, "step": 2572 }, { "epoch": 1.3087487283825026, "grad_norm": 0.9789167642593384, "learning_rate": 1e-05, "loss": 0.4718, "mean_token_accuracy": 0.8485277891159058, "num_tokens": 410124722.0, "step": 2573 }, { "epoch": 1.3092573753814851, "grad_norm": 0.8991252183914185, "learning_rate": 1e-05, "loss": 0.4293, "mean_token_accuracy": 0.8623579144477844, "num_tokens": 410289774.0, "step": 2574 }, { "epoch": 1.309766022380468, "grad_norm": 0.9171684980392456, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8557093739509583, "num_tokens": 410456209.0, "step": 2575 }, { "epoch": 1.3102746693794507, "grad_norm": 1.0491565465927124, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.8587504625320435, "num_tokens": 410603142.0, "step": 2576 }, { "epoch": 1.3107833163784335, "grad_norm": 1.041669487953186, "learning_rate": 1e-05, "loss": 0.466, "mean_token_accuracy": 0.852723240852356, "num_tokens": 410770784.0, "step": 2577 }, { "epoch": 1.311291963377416, "grad_norm": 0.9561178684234619, "learning_rate": 1e-05, "loss": 0.5213, "mean_token_accuracy": 0.8371676802635193, "num_tokens": 410934281.0, "step": 2578 }, { "epoch": 1.3118006103763988, "grad_norm": 0.9502045512199402, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.8432497382164001, "num_tokens": 411097001.0, "step": 2579 }, { "epoch": 1.3123092573753814, "grad_norm": 1.0522953271865845, "learning_rate": 1e-05, "loss": 0.4945, "mean_token_accuracy": 0.8432407975196838, "num_tokens": 411262888.0, "step": 2580 }, { "epoch": 1.3128179043743642, "grad_norm": 0.9856316447257996, "learning_rate": 1e-05, "loss": 0.4722, "mean_token_accuracy": 0.8499524593353271, "num_tokens": 411412576.0, "step": 2581 }, { "epoch": 1.313326551373347, "grad_norm": 0.9543809294700623, "learning_rate": 1e-05, "loss": 0.4711, "mean_token_accuracy": 0.8511854410171509, "num_tokens": 411567594.0, "step": 2582 }, { "epoch": 1.3138351983723295, "grad_norm": 0.9256858229637146, "learning_rate": 1e-05, "loss": 0.515, "mean_token_accuracy": 0.8376957178115845, "num_tokens": 411735468.0, "step": 2583 }, { "epoch": 1.3143438453713123, "grad_norm": 0.9677025675773621, "learning_rate": 1e-05, "loss": 0.4797, "mean_token_accuracy": 0.8493592143058777, "num_tokens": 411893042.0, "step": 2584 }, { "epoch": 1.314852492370295, "grad_norm": 0.9563512206077576, "learning_rate": 1e-05, "loss": 0.4854, "mean_token_accuracy": 0.846301794052124, "num_tokens": 412061278.0, "step": 2585 }, { "epoch": 1.3153611393692777, "grad_norm": 0.9341158270835876, "learning_rate": 1e-05, "loss": 0.4609, "mean_token_accuracy": 0.8514420390129089, "num_tokens": 412215931.0, "step": 2586 }, { "epoch": 1.3158697863682605, "grad_norm": 0.9857239723205566, "learning_rate": 1e-05, "loss": 0.485, "mean_token_accuracy": 0.8454148173332214, "num_tokens": 412390962.0, "step": 2587 }, { "epoch": 1.3163784333672433, "grad_norm": 0.9439259171485901, "learning_rate": 1e-05, "loss": 0.4777, "mean_token_accuracy": 0.8475058674812317, "num_tokens": 412552715.0, "step": 2588 }, { "epoch": 1.3168870803662258, "grad_norm": 0.9452398419380188, "learning_rate": 1e-05, "loss": 0.4911, "mean_token_accuracy": 0.8462110757827759, "num_tokens": 412719801.0, "step": 2589 }, { "epoch": 1.3173957273652086, "grad_norm": 0.9879827499389648, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.844982385635376, "num_tokens": 412869340.0, "step": 2590 }, { "epoch": 1.3179043743641912, "grad_norm": 0.9580371975898743, "learning_rate": 1e-05, "loss": 0.4622, "mean_token_accuracy": 0.8534672260284424, "num_tokens": 413025251.0, "step": 2591 }, { "epoch": 1.318413021363174, "grad_norm": 0.9868455529212952, "learning_rate": 1e-05, "loss": 0.4776, "mean_token_accuracy": 0.8489521741867065, "num_tokens": 413187434.0, "step": 2592 }, { "epoch": 1.3189216683621567, "grad_norm": 1.007317304611206, "learning_rate": 1e-05, "loss": 0.5096, "mean_token_accuracy": 0.8397385478019714, "num_tokens": 413359026.0, "step": 2593 }, { "epoch": 1.3194303153611393, "grad_norm": 0.9267711639404297, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.8407220244407654, "num_tokens": 413523255.0, "step": 2594 }, { "epoch": 1.319938962360122, "grad_norm": 0.9290441870689392, "learning_rate": 1e-05, "loss": 0.4554, "mean_token_accuracy": 0.8536738157272339, "num_tokens": 413677670.0, "step": 2595 }, { "epoch": 1.3204476093591047, "grad_norm": 0.9466893672943115, "learning_rate": 1e-05, "loss": 0.5042, "mean_token_accuracy": 0.8413321375846863, "num_tokens": 413836107.0, "step": 2596 }, { "epoch": 1.3209562563580874, "grad_norm": 0.8821899890899658, "learning_rate": 1e-05, "loss": 0.4428, "mean_token_accuracy": 0.8579069972038269, "num_tokens": 414000734.0, "step": 2597 }, { "epoch": 1.3214649033570702, "grad_norm": 0.9268909692764282, "learning_rate": 1e-05, "loss": 0.4694, "mean_token_accuracy": 0.8505789637565613, "num_tokens": 414156938.0, "step": 2598 }, { "epoch": 1.321973550356053, "grad_norm": 0.9249274730682373, "learning_rate": 1e-05, "loss": 0.4856, "mean_token_accuracy": 0.847085177898407, "num_tokens": 414318719.0, "step": 2599 }, { "epoch": 1.3224821973550356, "grad_norm": 0.9466115236282349, "learning_rate": 1e-05, "loss": 0.5139, "mean_token_accuracy": 0.8403315544128418, "num_tokens": 414482061.0, "step": 2600 }, { "epoch": 1.3229908443540184, "grad_norm": 0.9923340082168579, "learning_rate": 1e-05, "loss": 0.4805, "mean_token_accuracy": 0.8463633060455322, "num_tokens": 414637332.0, "step": 2601 }, { "epoch": 1.323499491353001, "grad_norm": 0.8582419753074646, "learning_rate": 1e-05, "loss": 0.4549, "mean_token_accuracy": 0.8543981313705444, "num_tokens": 414806350.0, "step": 2602 }, { "epoch": 1.3240081383519837, "grad_norm": 0.9188969135284424, "learning_rate": 1e-05, "loss": 0.5154, "mean_token_accuracy": 0.8376389145851135, "num_tokens": 414976420.0, "step": 2603 }, { "epoch": 1.3245167853509665, "grad_norm": 0.9199648499488831, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8413670659065247, "num_tokens": 415151776.0, "step": 2604 }, { "epoch": 1.325025432349949, "grad_norm": 0.9353446960449219, "learning_rate": 1e-05, "loss": 0.4777, "mean_token_accuracy": 0.8477828502655029, "num_tokens": 415305958.0, "step": 2605 }, { "epoch": 1.3255340793489319, "grad_norm": 0.9904804229736328, "learning_rate": 1e-05, "loss": 0.4977, "mean_token_accuracy": 0.8437171578407288, "num_tokens": 415475717.0, "step": 2606 }, { "epoch": 1.3260427263479144, "grad_norm": 0.9145898818969727, "learning_rate": 1e-05, "loss": 0.4738, "mean_token_accuracy": 0.8503988981246948, "num_tokens": 415628323.0, "step": 2607 }, { "epoch": 1.3265513733468972, "grad_norm": 0.9652713537216187, "learning_rate": 1e-05, "loss": 0.4594, "mean_token_accuracy": 0.854482889175415, "num_tokens": 415782339.0, "step": 2608 }, { "epoch": 1.32706002034588, "grad_norm": 0.9327253103256226, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8490644693374634, "num_tokens": 415936748.0, "step": 2609 }, { "epoch": 1.3275686673448628, "grad_norm": 0.9666473269462585, "learning_rate": 1e-05, "loss": 0.4866, "mean_token_accuracy": 0.8455109596252441, "num_tokens": 416091384.0, "step": 2610 }, { "epoch": 1.3280773143438453, "grad_norm": 0.9348263144493103, "learning_rate": 1e-05, "loss": 0.4822, "mean_token_accuracy": 0.8458163738250732, "num_tokens": 416245335.0, "step": 2611 }, { "epoch": 1.3285859613428281, "grad_norm": 0.9533799290657043, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8398832082748413, "num_tokens": 416408034.0, "step": 2612 }, { "epoch": 1.3290946083418107, "grad_norm": 0.962467610836029, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8395194411277771, "num_tokens": 416567114.0, "step": 2613 }, { "epoch": 1.3296032553407935, "grad_norm": 0.9966299533843994, "learning_rate": 1e-05, "loss": 0.4661, "mean_token_accuracy": 0.8517531156539917, "num_tokens": 416715317.0, "step": 2614 }, { "epoch": 1.3301119023397763, "grad_norm": 0.993205726146698, "learning_rate": 1e-05, "loss": 0.4863, "mean_token_accuracy": 0.8464823365211487, "num_tokens": 416862929.0, "step": 2615 }, { "epoch": 1.3306205493387588, "grad_norm": 1.0152370929718018, "learning_rate": 1e-05, "loss": 0.477, "mean_token_accuracy": 0.8483762145042419, "num_tokens": 417017357.0, "step": 2616 }, { "epoch": 1.3311291963377416, "grad_norm": 0.9989106059074402, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.8536456227302551, "num_tokens": 417166390.0, "step": 2617 }, { "epoch": 1.3316378433367242, "grad_norm": 0.9126229882240295, "learning_rate": 1e-05, "loss": 0.4658, "mean_token_accuracy": 0.8509222269058228, "num_tokens": 417324031.0, "step": 2618 }, { "epoch": 1.332146490335707, "grad_norm": 0.9760608673095703, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.842339813709259, "num_tokens": 417488486.0, "step": 2619 }, { "epoch": 1.3326551373346898, "grad_norm": 0.9777554273605347, "learning_rate": 1e-05, "loss": 0.4588, "mean_token_accuracy": 0.8518586754798889, "num_tokens": 417635848.0, "step": 2620 }, { "epoch": 1.3331637843336726, "grad_norm": 0.9069948196411133, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8558377027511597, "num_tokens": 417804152.0, "step": 2621 }, { "epoch": 1.3336724313326551, "grad_norm": 0.9307112693786621, "learning_rate": 1e-05, "loss": 0.4491, "mean_token_accuracy": 0.8567745685577393, "num_tokens": 417948063.0, "step": 2622 }, { "epoch": 1.334181078331638, "grad_norm": 0.938089907169342, "learning_rate": 1e-05, "loss": 0.5172, "mean_token_accuracy": 0.8359748721122742, "num_tokens": 418099041.0, "step": 2623 }, { "epoch": 1.3346897253306205, "grad_norm": 0.8834355473518372, "learning_rate": 1e-05, "loss": 0.5069, "mean_token_accuracy": 0.8408173322677612, "num_tokens": 418263839.0, "step": 2624 }, { "epoch": 1.3351983723296033, "grad_norm": 0.9467998743057251, "learning_rate": 1e-05, "loss": 0.4412, "mean_token_accuracy": 0.8598389625549316, "num_tokens": 418422529.0, "step": 2625 }, { "epoch": 1.335707019328586, "grad_norm": 0.9922293424606323, "learning_rate": 1e-05, "loss": 0.4842, "mean_token_accuracy": 0.8466914892196655, "num_tokens": 418582350.0, "step": 2626 }, { "epoch": 1.3362156663275686, "grad_norm": 0.8823071122169495, "learning_rate": 1e-05, "loss": 0.4842, "mean_token_accuracy": 0.8463080525398254, "num_tokens": 418747149.0, "step": 2627 }, { "epoch": 1.3367243133265514, "grad_norm": 0.908643364906311, "learning_rate": 1e-05, "loss": 0.4397, "mean_token_accuracy": 0.8584833741188049, "num_tokens": 418893429.0, "step": 2628 }, { "epoch": 1.337232960325534, "grad_norm": 0.9817284345626831, "learning_rate": 1e-05, "loss": 0.4935, "mean_token_accuracy": 0.8436906933784485, "num_tokens": 419062878.0, "step": 2629 }, { "epoch": 1.3377416073245167, "grad_norm": 0.9526888728141785, "learning_rate": 1e-05, "loss": 0.4648, "mean_token_accuracy": 0.8509718775749207, "num_tokens": 419228134.0, "step": 2630 }, { "epoch": 1.3382502543234995, "grad_norm": 1.013670563697815, "learning_rate": 1e-05, "loss": 0.5534, "mean_token_accuracy": 0.8273931741714478, "num_tokens": 419385610.0, "step": 2631 }, { "epoch": 1.3387589013224823, "grad_norm": 1.023470163345337, "learning_rate": 1e-05, "loss": 0.4808, "mean_token_accuracy": 0.8471496105194092, "num_tokens": 419546274.0, "step": 2632 }, { "epoch": 1.3392675483214649, "grad_norm": 1.1507434844970703, "learning_rate": 1e-05, "loss": 0.5087, "mean_token_accuracy": 0.8395289182662964, "num_tokens": 419698007.0, "step": 2633 }, { "epoch": 1.3397761953204477, "grad_norm": 0.9994651079177856, "learning_rate": 1e-05, "loss": 0.4856, "mean_token_accuracy": 0.8446428179740906, "num_tokens": 419861881.0, "step": 2634 }, { "epoch": 1.3402848423194302, "grad_norm": 0.9943557381629944, "learning_rate": 1e-05, "loss": 0.4999, "mean_token_accuracy": 0.8429102897644043, "num_tokens": 420021371.0, "step": 2635 }, { "epoch": 1.340793489318413, "grad_norm": 1.1529191732406616, "learning_rate": 1e-05, "loss": 0.4341, "mean_token_accuracy": 0.8602442741394043, "num_tokens": 420174883.0, "step": 2636 }, { "epoch": 1.3413021363173958, "grad_norm": 0.9898726344108582, "learning_rate": 1e-05, "loss": 0.5235, "mean_token_accuracy": 0.8371015787124634, "num_tokens": 420329703.0, "step": 2637 }, { "epoch": 1.3418107833163784, "grad_norm": 0.980955958366394, "learning_rate": 1e-05, "loss": 0.4659, "mean_token_accuracy": 0.851169764995575, "num_tokens": 420490175.0, "step": 2638 }, { "epoch": 1.3423194303153612, "grad_norm": 0.8918364644050598, "learning_rate": 1e-05, "loss": 0.467, "mean_token_accuracy": 0.8519745469093323, "num_tokens": 420654883.0, "step": 2639 }, { "epoch": 1.3428280773143437, "grad_norm": 1.001033902168274, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8565718531608582, "num_tokens": 420818659.0, "step": 2640 }, { "epoch": 1.3433367243133265, "grad_norm": 0.9805684089660645, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8446347713470459, "num_tokens": 420990326.0, "step": 2641 }, { "epoch": 1.3438453713123093, "grad_norm": 0.966640055179596, "learning_rate": 1e-05, "loss": 0.4882, "mean_token_accuracy": 0.8449985980987549, "num_tokens": 421147515.0, "step": 2642 }, { "epoch": 1.344354018311292, "grad_norm": 1.0281821489334106, "learning_rate": 1e-05, "loss": 0.4496, "mean_token_accuracy": 0.8562248945236206, "num_tokens": 421305867.0, "step": 2643 }, { "epoch": 1.3448626653102747, "grad_norm": 1.0086641311645508, "learning_rate": 1e-05, "loss": 0.5143, "mean_token_accuracy": 0.8380690813064575, "num_tokens": 421474359.0, "step": 2644 }, { "epoch": 1.3453713123092574, "grad_norm": 0.9449248313903809, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8509700298309326, "num_tokens": 421633879.0, "step": 2645 }, { "epoch": 1.34587995930824, "grad_norm": 1.2007900476455688, "learning_rate": 1e-05, "loss": 0.4892, "mean_token_accuracy": 0.8451941013336182, "num_tokens": 421795778.0, "step": 2646 }, { "epoch": 1.3463886063072228, "grad_norm": 1.054432988166809, "learning_rate": 1e-05, "loss": 0.4985, "mean_token_accuracy": 0.8436516523361206, "num_tokens": 421962592.0, "step": 2647 }, { "epoch": 1.3468972533062056, "grad_norm": 0.9219520092010498, "learning_rate": 1e-05, "loss": 0.4862, "mean_token_accuracy": 0.8458356857299805, "num_tokens": 422127600.0, "step": 2648 }, { "epoch": 1.3474059003051881, "grad_norm": 1.0039230585098267, "learning_rate": 1e-05, "loss": 0.4762, "mean_token_accuracy": 0.8490136861801147, "num_tokens": 422286922.0, "step": 2649 }, { "epoch": 1.347914547304171, "grad_norm": 1.0406755208969116, "learning_rate": 1e-05, "loss": 0.5171, "mean_token_accuracy": 0.8369194269180298, "num_tokens": 422441520.0, "step": 2650 }, { "epoch": 1.3484231943031535, "grad_norm": 0.985836386680603, "learning_rate": 1e-05, "loss": 0.5037, "mean_token_accuracy": 0.8412511348724365, "num_tokens": 422613305.0, "step": 2651 }, { "epoch": 1.3489318413021363, "grad_norm": 1.060807466506958, "learning_rate": 1e-05, "loss": 0.5082, "mean_token_accuracy": 0.8395723104476929, "num_tokens": 422758073.0, "step": 2652 }, { "epoch": 1.349440488301119, "grad_norm": 1.0247995853424072, "learning_rate": 1e-05, "loss": 0.4683, "mean_token_accuracy": 0.8498069643974304, "num_tokens": 422916224.0, "step": 2653 }, { "epoch": 1.3499491353001019, "grad_norm": 0.9895526170730591, "learning_rate": 1e-05, "loss": 0.4935, "mean_token_accuracy": 0.8443385362625122, "num_tokens": 423072900.0, "step": 2654 }, { "epoch": 1.3504577822990844, "grad_norm": 0.9098997116088867, "learning_rate": 1e-05, "loss": 0.4692, "mean_token_accuracy": 0.8502326607704163, "num_tokens": 423239477.0, "step": 2655 }, { "epoch": 1.3509664292980672, "grad_norm": 1.019323706626892, "learning_rate": 1e-05, "loss": 0.4586, "mean_token_accuracy": 0.851577639579773, "num_tokens": 423399159.0, "step": 2656 }, { "epoch": 1.3514750762970498, "grad_norm": 1.067026138305664, "learning_rate": 1e-05, "loss": 0.511, "mean_token_accuracy": 0.8384853601455688, "num_tokens": 423550338.0, "step": 2657 }, { "epoch": 1.3519837232960326, "grad_norm": 0.9990204572677612, "learning_rate": 1e-05, "loss": 0.4742, "mean_token_accuracy": 0.8494395613670349, "num_tokens": 423723160.0, "step": 2658 }, { "epoch": 1.3524923702950153, "grad_norm": 0.9186148047447205, "learning_rate": 1e-05, "loss": 0.449, "mean_token_accuracy": 0.8554156422615051, "num_tokens": 423884871.0, "step": 2659 }, { "epoch": 1.353001017293998, "grad_norm": 0.946681022644043, "learning_rate": 1e-05, "loss": 0.4214, "mean_token_accuracy": 0.8641282916069031, "num_tokens": 424042895.0, "step": 2660 }, { "epoch": 1.3535096642929807, "grad_norm": 0.9847366809844971, "learning_rate": 1e-05, "loss": 0.4319, "mean_token_accuracy": 0.8606440424919128, "num_tokens": 424204670.0, "step": 2661 }, { "epoch": 1.3540183112919633, "grad_norm": 1.00178062915802, "learning_rate": 1e-05, "loss": 0.5076, "mean_token_accuracy": 0.8411481976509094, "num_tokens": 424359175.0, "step": 2662 }, { "epoch": 1.354526958290946, "grad_norm": 1.051777720451355, "learning_rate": 1e-05, "loss": 0.4765, "mean_token_accuracy": 0.8494985103607178, "num_tokens": 424520295.0, "step": 2663 }, { "epoch": 1.3550356052899288, "grad_norm": 1.0117286443710327, "learning_rate": 1e-05, "loss": 0.4905, "mean_token_accuracy": 0.8441912531852722, "num_tokens": 424688740.0, "step": 2664 }, { "epoch": 1.3555442522889116, "grad_norm": 0.9656341075897217, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.853062629699707, "num_tokens": 424850384.0, "step": 2665 }, { "epoch": 1.3560528992878942, "grad_norm": 1.0329508781433105, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8456158638000488, "num_tokens": 425017780.0, "step": 2666 }, { "epoch": 1.356561546286877, "grad_norm": 0.9582757949829102, "learning_rate": 1e-05, "loss": 0.486, "mean_token_accuracy": 0.8466944098472595, "num_tokens": 425188533.0, "step": 2667 }, { "epoch": 1.3570701932858595, "grad_norm": 0.9417458772659302, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8438390493392944, "num_tokens": 425345218.0, "step": 2668 }, { "epoch": 1.3575788402848423, "grad_norm": 0.9999868273735046, "learning_rate": 1e-05, "loss": 0.4624, "mean_token_accuracy": 0.852497935295105, "num_tokens": 425502223.0, "step": 2669 }, { "epoch": 1.3580874872838251, "grad_norm": 1.0143228769302368, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.841201901435852, "num_tokens": 425660450.0, "step": 2670 }, { "epoch": 1.3585961342828077, "grad_norm": 0.9608351588249207, "learning_rate": 1e-05, "loss": 0.4843, "mean_token_accuracy": 0.8468529582023621, "num_tokens": 425816822.0, "step": 2671 }, { "epoch": 1.3591047812817905, "grad_norm": 1.0515697002410889, "learning_rate": 1e-05, "loss": 0.4861, "mean_token_accuracy": 0.8458720445632935, "num_tokens": 425961495.0, "step": 2672 }, { "epoch": 1.359613428280773, "grad_norm": 1.0757752656936646, "learning_rate": 1e-05, "loss": 0.4844, "mean_token_accuracy": 0.8457462787628174, "num_tokens": 426120184.0, "step": 2673 }, { "epoch": 1.3601220752797558, "grad_norm": 0.9009885787963867, "learning_rate": 1e-05, "loss": 0.4612, "mean_token_accuracy": 0.8526752591133118, "num_tokens": 426290019.0, "step": 2674 }, { "epoch": 1.3606307222787386, "grad_norm": 0.9713742733001709, "learning_rate": 1e-05, "loss": 0.4482, "mean_token_accuracy": 0.8567054271697998, "num_tokens": 426453175.0, "step": 2675 }, { "epoch": 1.3611393692777214, "grad_norm": 1.000468134880066, "learning_rate": 1e-05, "loss": 0.4714, "mean_token_accuracy": 0.8515945672988892, "num_tokens": 426600712.0, "step": 2676 }, { "epoch": 1.361648016276704, "grad_norm": 1.0043721199035645, "learning_rate": 1e-05, "loss": 0.4928, "mean_token_accuracy": 0.8436155915260315, "num_tokens": 426771838.0, "step": 2677 }, { "epoch": 1.3621566632756867, "grad_norm": 0.937410295009613, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8478071093559265, "num_tokens": 426930227.0, "step": 2678 }, { "epoch": 1.3626653102746693, "grad_norm": 0.9257480502128601, "learning_rate": 1e-05, "loss": 0.4607, "mean_token_accuracy": 0.8536421656608582, "num_tokens": 427089271.0, "step": 2679 }, { "epoch": 1.363173957273652, "grad_norm": 0.9284461140632629, "learning_rate": 1e-05, "loss": 0.4707, "mean_token_accuracy": 0.8504741191864014, "num_tokens": 427263929.0, "step": 2680 }, { "epoch": 1.3636826042726349, "grad_norm": 0.8936075568199158, "learning_rate": 1e-05, "loss": 0.5087, "mean_token_accuracy": 0.8401069045066833, "num_tokens": 427428756.0, "step": 2681 }, { "epoch": 1.3641912512716174, "grad_norm": 0.9413860440254211, "learning_rate": 1e-05, "loss": 0.4682, "mean_token_accuracy": 0.8501747250556946, "num_tokens": 427581870.0, "step": 2682 }, { "epoch": 1.3646998982706002, "grad_norm": 0.9177972078323364, "learning_rate": 1e-05, "loss": 0.4617, "mean_token_accuracy": 0.8546660542488098, "num_tokens": 427739317.0, "step": 2683 }, { "epoch": 1.3652085452695828, "grad_norm": 0.9576936364173889, "learning_rate": 1e-05, "loss": 0.4697, "mean_token_accuracy": 0.8502979874610901, "num_tokens": 427890146.0, "step": 2684 }, { "epoch": 1.3657171922685656, "grad_norm": 0.8964239954948425, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.8439350128173828, "num_tokens": 428056020.0, "step": 2685 }, { "epoch": 1.3662258392675484, "grad_norm": 0.9161334037780762, "learning_rate": 1e-05, "loss": 0.4696, "mean_token_accuracy": 0.8516874313354492, "num_tokens": 428210521.0, "step": 2686 }, { "epoch": 1.3667344862665312, "grad_norm": 0.9134512543678284, "learning_rate": 1e-05, "loss": 0.4618, "mean_token_accuracy": 0.8511298298835754, "num_tokens": 428375323.0, "step": 2687 }, { "epoch": 1.3672431332655137, "grad_norm": 1.0040498971939087, "learning_rate": 1e-05, "loss": 0.5006, "mean_token_accuracy": 0.8420723676681519, "num_tokens": 428539565.0, "step": 2688 }, { "epoch": 1.3677517802644965, "grad_norm": 0.935520589351654, "learning_rate": 1e-05, "loss": 0.4939, "mean_token_accuracy": 0.8450379371643066, "num_tokens": 428720975.0, "step": 2689 }, { "epoch": 1.368260427263479, "grad_norm": 0.9524109363555908, "learning_rate": 1e-05, "loss": 0.4965, "mean_token_accuracy": 0.8424445986747742, "num_tokens": 428868507.0, "step": 2690 }, { "epoch": 1.3687690742624619, "grad_norm": 0.9552338123321533, "learning_rate": 1e-05, "loss": 0.5028, "mean_token_accuracy": 0.8432607650756836, "num_tokens": 429029080.0, "step": 2691 }, { "epoch": 1.3692777212614446, "grad_norm": 0.9479795098304749, "learning_rate": 1e-05, "loss": 0.4696, "mean_token_accuracy": 0.8508580327033997, "num_tokens": 429192002.0, "step": 2692 }, { "epoch": 1.3697863682604272, "grad_norm": 0.9779879450798035, "learning_rate": 1e-05, "loss": 0.4772, "mean_token_accuracy": 0.8503663539886475, "num_tokens": 429340295.0, "step": 2693 }, { "epoch": 1.37029501525941, "grad_norm": 0.9469121098518372, "learning_rate": 1e-05, "loss": 0.4849, "mean_token_accuracy": 0.8442733883857727, "num_tokens": 429499970.0, "step": 2694 }, { "epoch": 1.3708036622583926, "grad_norm": 0.9048606157302856, "learning_rate": 1e-05, "loss": 0.4583, "mean_token_accuracy": 0.8535619378089905, "num_tokens": 429658951.0, "step": 2695 }, { "epoch": 1.3713123092573754, "grad_norm": 0.9049401879310608, "learning_rate": 1e-05, "loss": 0.4954, "mean_token_accuracy": 0.8439792990684509, "num_tokens": 429822566.0, "step": 2696 }, { "epoch": 1.3718209562563581, "grad_norm": 0.9330436587333679, "learning_rate": 1e-05, "loss": 0.4538, "mean_token_accuracy": 0.8555499315261841, "num_tokens": 429982638.0, "step": 2697 }, { "epoch": 1.372329603255341, "grad_norm": 0.9345403909683228, "learning_rate": 1e-05, "loss": 0.459, "mean_token_accuracy": 0.8530346155166626, "num_tokens": 430138798.0, "step": 2698 }, { "epoch": 1.3728382502543235, "grad_norm": 0.9769986867904663, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.849714994430542, "num_tokens": 430294611.0, "step": 2699 }, { "epoch": 1.3733468972533063, "grad_norm": 0.9289859533309937, "learning_rate": 1e-05, "loss": 0.4339, "mean_token_accuracy": 0.8610995411872864, "num_tokens": 430457184.0, "step": 2700 }, { "epoch": 1.3738555442522888, "grad_norm": 0.959317684173584, "learning_rate": 1e-05, "loss": 0.4856, "mean_token_accuracy": 0.8475931882858276, "num_tokens": 430616726.0, "step": 2701 }, { "epoch": 1.3743641912512716, "grad_norm": 0.9652572870254517, "learning_rate": 1e-05, "loss": 0.4958, "mean_token_accuracy": 0.8435888290405273, "num_tokens": 430782101.0, "step": 2702 }, { "epoch": 1.3748728382502544, "grad_norm": 0.9083567261695862, "learning_rate": 1e-05, "loss": 0.4564, "mean_token_accuracy": 0.8545889854431152, "num_tokens": 430931165.0, "step": 2703 }, { "epoch": 1.375381485249237, "grad_norm": 1.0234804153442383, "learning_rate": 1e-05, "loss": 0.4629, "mean_token_accuracy": 0.85181725025177, "num_tokens": 431093719.0, "step": 2704 }, { "epoch": 1.3758901322482198, "grad_norm": 0.920213520526886, "learning_rate": 1e-05, "loss": 0.4506, "mean_token_accuracy": 0.8569689989089966, "num_tokens": 431256289.0, "step": 2705 }, { "epoch": 1.3763987792472023, "grad_norm": 0.9725545644760132, "learning_rate": 1e-05, "loss": 0.5236, "mean_token_accuracy": 0.836646318435669, "num_tokens": 431420888.0, "step": 2706 }, { "epoch": 1.3769074262461851, "grad_norm": 0.9515104293823242, "learning_rate": 1e-05, "loss": 0.4903, "mean_token_accuracy": 0.8439248204231262, "num_tokens": 431587261.0, "step": 2707 }, { "epoch": 1.377416073245168, "grad_norm": 0.9942727088928223, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8490264415740967, "num_tokens": 431747572.0, "step": 2708 }, { "epoch": 1.3779247202441507, "grad_norm": 0.9128475785255432, "learning_rate": 1e-05, "loss": 0.4742, "mean_token_accuracy": 0.8506051301956177, "num_tokens": 431901013.0, "step": 2709 }, { "epoch": 1.3784333672431333, "grad_norm": 1.0360462665557861, "learning_rate": 1e-05, "loss": 0.5207, "mean_token_accuracy": 0.8344942331314087, "num_tokens": 432053072.0, "step": 2710 }, { "epoch": 1.378942014242116, "grad_norm": 1.0364712476730347, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8600242137908936, "num_tokens": 432218359.0, "step": 2711 }, { "epoch": 1.3794506612410986, "grad_norm": 1.0053229331970215, "learning_rate": 1e-05, "loss": 0.4712, "mean_token_accuracy": 0.8485782742500305, "num_tokens": 432356553.0, "step": 2712 }, { "epoch": 1.3799593082400814, "grad_norm": 0.9933727383613586, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.841928243637085, "num_tokens": 432507222.0, "step": 2713 }, { "epoch": 1.3804679552390642, "grad_norm": 0.9446233510971069, "learning_rate": 1e-05, "loss": 0.4859, "mean_token_accuracy": 0.8459019660949707, "num_tokens": 432670085.0, "step": 2714 }, { "epoch": 1.3809766022380467, "grad_norm": 0.9640635848045349, "learning_rate": 1e-05, "loss": 0.5024, "mean_token_accuracy": 0.8412888050079346, "num_tokens": 432833746.0, "step": 2715 }, { "epoch": 1.3814852492370295, "grad_norm": 0.9870038032531738, "learning_rate": 1e-05, "loss": 0.5188, "mean_token_accuracy": 0.8366783261299133, "num_tokens": 433004996.0, "step": 2716 }, { "epoch": 1.381993896236012, "grad_norm": 0.9928244948387146, "learning_rate": 1e-05, "loss": 0.4501, "mean_token_accuracy": 0.8561815023422241, "num_tokens": 433157665.0, "step": 2717 }, { "epoch": 1.3825025432349949, "grad_norm": 0.9694227576255798, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8482884764671326, "num_tokens": 433309340.0, "step": 2718 }, { "epoch": 1.3830111902339777, "grad_norm": 0.9639489054679871, "learning_rate": 1e-05, "loss": 0.4527, "mean_token_accuracy": 0.8555192947387695, "num_tokens": 433484664.0, "step": 2719 }, { "epoch": 1.3835198372329605, "grad_norm": 0.9023261666297913, "learning_rate": 1e-05, "loss": 0.4643, "mean_token_accuracy": 0.8527742028236389, "num_tokens": 433655158.0, "step": 2720 }, { "epoch": 1.384028484231943, "grad_norm": 1.0446405410766602, "learning_rate": 1e-05, "loss": 0.4675, "mean_token_accuracy": 0.851974368095398, "num_tokens": 433811771.0, "step": 2721 }, { "epoch": 1.3845371312309258, "grad_norm": 0.9548596143722534, "learning_rate": 1e-05, "loss": 0.4499, "mean_token_accuracy": 0.8571314811706543, "num_tokens": 433968624.0, "step": 2722 }, { "epoch": 1.3850457782299084, "grad_norm": 0.9368325471878052, "learning_rate": 1e-05, "loss": 0.488, "mean_token_accuracy": 0.8435842990875244, "num_tokens": 434132817.0, "step": 2723 }, { "epoch": 1.3855544252288912, "grad_norm": 0.9053055644035339, "learning_rate": 1e-05, "loss": 0.4657, "mean_token_accuracy": 0.8512722253799438, "num_tokens": 434295363.0, "step": 2724 }, { "epoch": 1.386063072227874, "grad_norm": 0.9808530807495117, "learning_rate": 1e-05, "loss": 0.4855, "mean_token_accuracy": 0.8470052480697632, "num_tokens": 434461451.0, "step": 2725 }, { "epoch": 1.3865717192268565, "grad_norm": 0.9754900932312012, "learning_rate": 1e-05, "loss": 0.4883, "mean_token_accuracy": 0.8470308184623718, "num_tokens": 434613231.0, "step": 2726 }, { "epoch": 1.3870803662258393, "grad_norm": 0.975768506526947, "learning_rate": 1e-05, "loss": 0.4967, "mean_token_accuracy": 0.842704713344574, "num_tokens": 434761743.0, "step": 2727 }, { "epoch": 1.3875890132248219, "grad_norm": 1.0329039096832275, "learning_rate": 1e-05, "loss": 0.4867, "mean_token_accuracy": 0.8458406329154968, "num_tokens": 434917053.0, "step": 2728 }, { "epoch": 1.3880976602238047, "grad_norm": 1.0342756509780884, "learning_rate": 1e-05, "loss": 0.5157, "mean_token_accuracy": 0.8370108008384705, "num_tokens": 435071996.0, "step": 2729 }, { "epoch": 1.3886063072227874, "grad_norm": 1.015746831893921, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.8457257747650146, "num_tokens": 435226320.0, "step": 2730 }, { "epoch": 1.3891149542217702, "grad_norm": 1.011082410812378, "learning_rate": 1e-05, "loss": 0.4614, "mean_token_accuracy": 0.8516829013824463, "num_tokens": 435383004.0, "step": 2731 }, { "epoch": 1.3896236012207528, "grad_norm": 1.0925073623657227, "learning_rate": 1e-05, "loss": 0.4571, "mean_token_accuracy": 0.8538006544113159, "num_tokens": 435536990.0, "step": 2732 }, { "epoch": 1.3901322482197356, "grad_norm": 0.9984236359596252, "learning_rate": 1e-05, "loss": 0.4619, "mean_token_accuracy": 0.8531023859977722, "num_tokens": 435694177.0, "step": 2733 }, { "epoch": 1.3906408952187181, "grad_norm": 1.0924962759017944, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.8423185348510742, "num_tokens": 435847048.0, "step": 2734 }, { "epoch": 1.391149542217701, "grad_norm": 1.1045939922332764, "learning_rate": 1e-05, "loss": 0.5205, "mean_token_accuracy": 0.8350080251693726, "num_tokens": 436003472.0, "step": 2735 }, { "epoch": 1.3916581892166837, "grad_norm": 1.0720640420913696, "learning_rate": 1e-05, "loss": 0.4872, "mean_token_accuracy": 0.8450804352760315, "num_tokens": 436161792.0, "step": 2736 }, { "epoch": 1.3921668362156663, "grad_norm": 0.997451663017273, "learning_rate": 1e-05, "loss": 0.5235, "mean_token_accuracy": 0.8371497988700867, "num_tokens": 436330865.0, "step": 2737 }, { "epoch": 1.392675483214649, "grad_norm": 1.0413810014724731, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.8415367007255554, "num_tokens": 436497752.0, "step": 2738 }, { "epoch": 1.3931841302136316, "grad_norm": 1.0180333852767944, "learning_rate": 1e-05, "loss": 0.4683, "mean_token_accuracy": 0.8512299656867981, "num_tokens": 436671181.0, "step": 2739 }, { "epoch": 1.3936927772126144, "grad_norm": 0.9152499437332153, "learning_rate": 1e-05, "loss": 0.4608, "mean_token_accuracy": 0.8541569113731384, "num_tokens": 436827912.0, "step": 2740 }, { "epoch": 1.3942014242115972, "grad_norm": 0.9704434871673584, "learning_rate": 1e-05, "loss": 0.4664, "mean_token_accuracy": 0.8514987230300903, "num_tokens": 436994067.0, "step": 2741 }, { "epoch": 1.39471007121058, "grad_norm": 1.0042692422866821, "learning_rate": 1e-05, "loss": 0.4946, "mean_token_accuracy": 0.8427151441574097, "num_tokens": 437161113.0, "step": 2742 }, { "epoch": 1.3952187182095626, "grad_norm": 1.05972158908844, "learning_rate": 1e-05, "loss": 0.4934, "mean_token_accuracy": 0.8417543172836304, "num_tokens": 437304648.0, "step": 2743 }, { "epoch": 1.3957273652085453, "grad_norm": 0.996813952922821, "learning_rate": 1e-05, "loss": 0.5008, "mean_token_accuracy": 0.8440243005752563, "num_tokens": 437453477.0, "step": 2744 }, { "epoch": 1.396236012207528, "grad_norm": 0.9492610692977905, "learning_rate": 1e-05, "loss": 0.492, "mean_token_accuracy": 0.8438138961791992, "num_tokens": 437593153.0, "step": 2745 }, { "epoch": 1.3967446592065107, "grad_norm": 1.0397744178771973, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.842868447303772, "num_tokens": 437749330.0, "step": 2746 }, { "epoch": 1.3972533062054935, "grad_norm": 0.9676275253295898, "learning_rate": 1e-05, "loss": 0.4434, "mean_token_accuracy": 0.8574696779251099, "num_tokens": 437902460.0, "step": 2747 }, { "epoch": 1.397761953204476, "grad_norm": 0.9755284190177917, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.850979745388031, "num_tokens": 438064905.0, "step": 2748 }, { "epoch": 1.3982706002034588, "grad_norm": 0.9887830018997192, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8454263210296631, "num_tokens": 438217621.0, "step": 2749 }, { "epoch": 1.3987792472024414, "grad_norm": 0.9100257754325867, "learning_rate": 1e-05, "loss": 0.45, "mean_token_accuracy": 0.8556119203567505, "num_tokens": 438374191.0, "step": 2750 }, { "epoch": 1.3992878942014242, "grad_norm": 1.0211602449417114, "learning_rate": 1e-05, "loss": 0.475, "mean_token_accuracy": 0.847649097442627, "num_tokens": 438520298.0, "step": 2751 }, { "epoch": 1.399796541200407, "grad_norm": 0.9627428650856018, "learning_rate": 1e-05, "loss": 0.4474, "mean_token_accuracy": 0.8568810224533081, "num_tokens": 438672284.0, "step": 2752 }, { "epoch": 1.4003051881993898, "grad_norm": 0.966880738735199, "learning_rate": 1e-05, "loss": 0.5125, "mean_token_accuracy": 0.8379854559898376, "num_tokens": 438822201.0, "step": 2753 }, { "epoch": 1.4008138351983723, "grad_norm": 0.9599084854125977, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8557698726654053, "num_tokens": 438980241.0, "step": 2754 }, { "epoch": 1.4013224821973551, "grad_norm": 0.9639296531677246, "learning_rate": 1e-05, "loss": 0.447, "mean_token_accuracy": 0.8570913076400757, "num_tokens": 439138528.0, "step": 2755 }, { "epoch": 1.4018311291963377, "grad_norm": 1.021277666091919, "learning_rate": 1e-05, "loss": 0.4915, "mean_token_accuracy": 0.8457634449005127, "num_tokens": 439300154.0, "step": 2756 }, { "epoch": 1.4023397761953205, "grad_norm": 0.9952744245529175, "learning_rate": 1e-05, "loss": 0.4269, "mean_token_accuracy": 0.8626008033752441, "num_tokens": 439457699.0, "step": 2757 }, { "epoch": 1.4028484231943033, "grad_norm": 0.9347396492958069, "learning_rate": 1e-05, "loss": 0.4666, "mean_token_accuracy": 0.851100742816925, "num_tokens": 439612805.0, "step": 2758 }, { "epoch": 1.4033570701932858, "grad_norm": 0.960252583026886, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8494982719421387, "num_tokens": 439772835.0, "step": 2759 }, { "epoch": 1.4038657171922686, "grad_norm": 1.1808850765228271, "learning_rate": 1e-05, "loss": 0.4788, "mean_token_accuracy": 0.8483887910842896, "num_tokens": 439927068.0, "step": 2760 }, { "epoch": 1.4043743641912512, "grad_norm": 1.0060442686080933, "learning_rate": 1e-05, "loss": 0.4624, "mean_token_accuracy": 0.8521533608436584, "num_tokens": 440082717.0, "step": 2761 }, { "epoch": 1.404883011190234, "grad_norm": 1.1049740314483643, "learning_rate": 1e-05, "loss": 0.5135, "mean_token_accuracy": 0.8387951850891113, "num_tokens": 440231502.0, "step": 2762 }, { "epoch": 1.4053916581892167, "grad_norm": 1.0528883934020996, "learning_rate": 1e-05, "loss": 0.4577, "mean_token_accuracy": 0.8528305888175964, "num_tokens": 440385997.0, "step": 2763 }, { "epoch": 1.4059003051881995, "grad_norm": 0.94994056224823, "learning_rate": 1e-05, "loss": 0.5004, "mean_token_accuracy": 0.8418850302696228, "num_tokens": 440550613.0, "step": 2764 }, { "epoch": 1.406408952187182, "grad_norm": 1.0399938821792603, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.8524987697601318, "num_tokens": 440724447.0, "step": 2765 }, { "epoch": 1.4069175991861649, "grad_norm": 1.128157138824463, "learning_rate": 1e-05, "loss": 0.4896, "mean_token_accuracy": 0.8465457558631897, "num_tokens": 440878494.0, "step": 2766 }, { "epoch": 1.4074262461851474, "grad_norm": 1.0129573345184326, "learning_rate": 1e-05, "loss": 0.4749, "mean_token_accuracy": 0.8479048609733582, "num_tokens": 441034220.0, "step": 2767 }, { "epoch": 1.4079348931841302, "grad_norm": 1.0731834173202515, "learning_rate": 1e-05, "loss": 0.4724, "mean_token_accuracy": 0.8481513857841492, "num_tokens": 441175256.0, "step": 2768 }, { "epoch": 1.408443540183113, "grad_norm": 1.1229848861694336, "learning_rate": 1e-05, "loss": 0.4729, "mean_token_accuracy": 0.8501735925674438, "num_tokens": 441330143.0, "step": 2769 }, { "epoch": 1.4089521871820956, "grad_norm": 0.996660053730011, "learning_rate": 1e-05, "loss": 0.4831, "mean_token_accuracy": 0.8471207618713379, "num_tokens": 441490539.0, "step": 2770 }, { "epoch": 1.4094608341810784, "grad_norm": 1.021069049835205, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8487145900726318, "num_tokens": 441650753.0, "step": 2771 }, { "epoch": 1.409969481180061, "grad_norm": 1.0124939680099487, "learning_rate": 1e-05, "loss": 0.478, "mean_token_accuracy": 0.8463742136955261, "num_tokens": 441811314.0, "step": 2772 }, { "epoch": 1.4104781281790437, "grad_norm": 0.939225971698761, "learning_rate": 1e-05, "loss": 0.4578, "mean_token_accuracy": 0.8548750281333923, "num_tokens": 441966161.0, "step": 2773 }, { "epoch": 1.4109867751780265, "grad_norm": 1.051579475402832, "learning_rate": 1e-05, "loss": 0.4782, "mean_token_accuracy": 0.847815752029419, "num_tokens": 442127917.0, "step": 2774 }, { "epoch": 1.411495422177009, "grad_norm": 1.3779577016830444, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8557332754135132, "num_tokens": 442276671.0, "step": 2775 }, { "epoch": 1.4120040691759919, "grad_norm": 1.0016024112701416, "learning_rate": 1e-05, "loss": 0.5314, "mean_token_accuracy": 0.8357734680175781, "num_tokens": 442435182.0, "step": 2776 }, { "epoch": 1.4125127161749746, "grad_norm": 0.9864020347595215, "learning_rate": 1e-05, "loss": 0.4863, "mean_token_accuracy": 0.8437013030052185, "num_tokens": 442605940.0, "step": 2777 }, { "epoch": 1.4130213631739572, "grad_norm": 1.0168684720993042, "learning_rate": 1e-05, "loss": 0.4304, "mean_token_accuracy": 0.8620567321777344, "num_tokens": 442769476.0, "step": 2778 }, { "epoch": 1.41353001017294, "grad_norm": 0.9997568130493164, "learning_rate": 1e-05, "loss": 0.4958, "mean_token_accuracy": 0.8428398966789246, "num_tokens": 442932405.0, "step": 2779 }, { "epoch": 1.4140386571719228, "grad_norm": 1.027672290802002, "learning_rate": 1e-05, "loss": 0.4648, "mean_token_accuracy": 0.8519032001495361, "num_tokens": 443108364.0, "step": 2780 }, { "epoch": 1.4145473041709054, "grad_norm": 1.060289740562439, "learning_rate": 1e-05, "loss": 0.4696, "mean_token_accuracy": 0.8508599996566772, "num_tokens": 443270874.0, "step": 2781 }, { "epoch": 1.4150559511698881, "grad_norm": 0.9960584044456482, "learning_rate": 1e-05, "loss": 0.4639, "mean_token_accuracy": 0.8511028289794922, "num_tokens": 443434228.0, "step": 2782 }, { "epoch": 1.4155645981688707, "grad_norm": 1.0055328607559204, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8413923382759094, "num_tokens": 443602526.0, "step": 2783 }, { "epoch": 1.4160732451678535, "grad_norm": 1.0092580318450928, "learning_rate": 1e-05, "loss": 0.4724, "mean_token_accuracy": 0.8493560552597046, "num_tokens": 443753663.0, "step": 2784 }, { "epoch": 1.4165818921668363, "grad_norm": 0.9981493353843689, "learning_rate": 1e-05, "loss": 0.4607, "mean_token_accuracy": 0.8523511290550232, "num_tokens": 443913758.0, "step": 2785 }, { "epoch": 1.4170905391658188, "grad_norm": 0.988707959651947, "learning_rate": 1e-05, "loss": 0.4502, "mean_token_accuracy": 0.8555818796157837, "num_tokens": 444064404.0, "step": 2786 }, { "epoch": 1.4175991861648016, "grad_norm": 0.8756354451179504, "learning_rate": 1e-05, "loss": 0.463, "mean_token_accuracy": 0.8522771596908569, "num_tokens": 444225462.0, "step": 2787 }, { "epoch": 1.4181078331637844, "grad_norm": 1.0623990297317505, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.84184730052948, "num_tokens": 444385872.0, "step": 2788 }, { "epoch": 1.418616480162767, "grad_norm": 1.0685012340545654, "learning_rate": 1e-05, "loss": 0.4789, "mean_token_accuracy": 0.8478412628173828, "num_tokens": 444533417.0, "step": 2789 }, { "epoch": 1.4191251271617498, "grad_norm": 0.924235999584198, "learning_rate": 1e-05, "loss": 0.4786, "mean_token_accuracy": 0.8480336666107178, "num_tokens": 444686912.0, "step": 2790 }, { "epoch": 1.4196337741607326, "grad_norm": 0.9503185153007507, "learning_rate": 1e-05, "loss": 0.4612, "mean_token_accuracy": 0.8519868850708008, "num_tokens": 444848516.0, "step": 2791 }, { "epoch": 1.4201424211597151, "grad_norm": 0.9894545674324036, "learning_rate": 1e-05, "loss": 0.472, "mean_token_accuracy": 0.8499791622161865, "num_tokens": 445012426.0, "step": 2792 }, { "epoch": 1.420651068158698, "grad_norm": 1.0658549070358276, "learning_rate": 1e-05, "loss": 0.4923, "mean_token_accuracy": 0.8419213891029358, "num_tokens": 445177318.0, "step": 2793 }, { "epoch": 1.4211597151576805, "grad_norm": 0.8910920023918152, "learning_rate": 1e-05, "loss": 0.4855, "mean_token_accuracy": 0.8462452292442322, "num_tokens": 445343179.0, "step": 2794 }, { "epoch": 1.4216683621566633, "grad_norm": 0.9932237863540649, "learning_rate": 1e-05, "loss": 0.4626, "mean_token_accuracy": 0.8527805805206299, "num_tokens": 445513613.0, "step": 2795 }, { "epoch": 1.422177009155646, "grad_norm": 0.9207007884979248, "learning_rate": 1e-05, "loss": 0.4447, "mean_token_accuracy": 0.8579326868057251, "num_tokens": 445669852.0, "step": 2796 }, { "epoch": 1.4226856561546286, "grad_norm": 1.049477219581604, "learning_rate": 1e-05, "loss": 0.5077, "mean_token_accuracy": 0.8388053178787231, "num_tokens": 445837533.0, "step": 2797 }, { "epoch": 1.4231943031536114, "grad_norm": 1.0226757526397705, "learning_rate": 1e-05, "loss": 0.4385, "mean_token_accuracy": 0.8591240644454956, "num_tokens": 445986396.0, "step": 2798 }, { "epoch": 1.4237029501525942, "grad_norm": 0.9482563734054565, "learning_rate": 1e-05, "loss": 0.4733, "mean_token_accuracy": 0.8491603136062622, "num_tokens": 446136330.0, "step": 2799 }, { "epoch": 1.4242115971515767, "grad_norm": 1.045009970664978, "learning_rate": 1e-05, "loss": 0.4835, "mean_token_accuracy": 0.8467520475387573, "num_tokens": 446284635.0, "step": 2800 }, { "epoch": 1.4247202441505595, "grad_norm": 0.9708003997802734, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8428059816360474, "num_tokens": 446448720.0, "step": 2801 }, { "epoch": 1.4252288911495423, "grad_norm": 1.0531318187713623, "learning_rate": 1e-05, "loss": 0.469, "mean_token_accuracy": 0.8497306108474731, "num_tokens": 446599329.0, "step": 2802 }, { "epoch": 1.4257375381485249, "grad_norm": 0.9104205369949341, "learning_rate": 1e-05, "loss": 0.4749, "mean_token_accuracy": 0.8491522669792175, "num_tokens": 446764283.0, "step": 2803 }, { "epoch": 1.4262461851475077, "grad_norm": 0.9720317125320435, "learning_rate": 1e-05, "loss": 0.4796, "mean_token_accuracy": 0.8502120971679688, "num_tokens": 446921393.0, "step": 2804 }, { "epoch": 1.4267548321464902, "grad_norm": 0.9275528192520142, "learning_rate": 1e-05, "loss": 0.4643, "mean_token_accuracy": 0.8525756597518921, "num_tokens": 447086304.0, "step": 2805 }, { "epoch": 1.427263479145473, "grad_norm": 0.9052826166152954, "learning_rate": 1e-05, "loss": 0.4782, "mean_token_accuracy": 0.8469159603118896, "num_tokens": 447257986.0, "step": 2806 }, { "epoch": 1.4277721261444558, "grad_norm": 0.965103268623352, "learning_rate": 1e-05, "loss": 0.4179, "mean_token_accuracy": 0.8643838763237, "num_tokens": 447408803.0, "step": 2807 }, { "epoch": 1.4282807731434384, "grad_norm": 0.9626489877700806, "learning_rate": 1e-05, "loss": 0.4811, "mean_token_accuracy": 0.8474384546279907, "num_tokens": 447568033.0, "step": 2808 }, { "epoch": 1.4287894201424212, "grad_norm": 0.9533666968345642, "learning_rate": 1e-05, "loss": 0.4789, "mean_token_accuracy": 0.8479937314987183, "num_tokens": 447727480.0, "step": 2809 }, { "epoch": 1.4292980671414037, "grad_norm": 0.9389809966087341, "learning_rate": 1e-05, "loss": 0.4553, "mean_token_accuracy": 0.8539700508117676, "num_tokens": 447885294.0, "step": 2810 }, { "epoch": 1.4298067141403865, "grad_norm": 0.95237797498703, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8428686857223511, "num_tokens": 448041135.0, "step": 2811 }, { "epoch": 1.4303153611393693, "grad_norm": 0.9649375081062317, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8458592295646667, "num_tokens": 448201132.0, "step": 2812 }, { "epoch": 1.430824008138352, "grad_norm": 0.9626204967498779, "learning_rate": 1e-05, "loss": 0.4825, "mean_token_accuracy": 0.8470381498336792, "num_tokens": 448364825.0, "step": 2813 }, { "epoch": 1.4313326551373347, "grad_norm": 1.020817518234253, "learning_rate": 1e-05, "loss": 0.4839, "mean_token_accuracy": 0.8466824293136597, "num_tokens": 448516015.0, "step": 2814 }, { "epoch": 1.4318413021363174, "grad_norm": 1.0039154291152954, "learning_rate": 1e-05, "loss": 0.4984, "mean_token_accuracy": 0.841713011264801, "num_tokens": 448680126.0, "step": 2815 }, { "epoch": 1.4323499491353, "grad_norm": 0.9738489389419556, "learning_rate": 1e-05, "loss": 0.4619, "mean_token_accuracy": 0.8518779277801514, "num_tokens": 448828846.0, "step": 2816 }, { "epoch": 1.4328585961342828, "grad_norm": 0.9059901237487793, "learning_rate": 1e-05, "loss": 0.4671, "mean_token_accuracy": 0.8512499332427979, "num_tokens": 448987846.0, "step": 2817 }, { "epoch": 1.4333672431332656, "grad_norm": 0.9546349048614502, "learning_rate": 1e-05, "loss": 0.5249, "mean_token_accuracy": 0.8342456221580505, "num_tokens": 449153221.0, "step": 2818 }, { "epoch": 1.4338758901322481, "grad_norm": 0.9544728994369507, "learning_rate": 1e-05, "loss": 0.4812, "mean_token_accuracy": 0.8467628955841064, "num_tokens": 449310276.0, "step": 2819 }, { "epoch": 1.434384537131231, "grad_norm": 0.9785590767860413, "learning_rate": 1e-05, "loss": 0.508, "mean_token_accuracy": 0.8397389650344849, "num_tokens": 449462904.0, "step": 2820 }, { "epoch": 1.4348931841302135, "grad_norm": 0.9594732522964478, "learning_rate": 1e-05, "loss": 0.4715, "mean_token_accuracy": 0.8496230244636536, "num_tokens": 449625366.0, "step": 2821 }, { "epoch": 1.4354018311291963, "grad_norm": 0.9512030482292175, "learning_rate": 1e-05, "loss": 0.479, "mean_token_accuracy": 0.8478878140449524, "num_tokens": 449779209.0, "step": 2822 }, { "epoch": 1.435910478128179, "grad_norm": 0.9894237518310547, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.837615966796875, "num_tokens": 449930647.0, "step": 2823 }, { "epoch": 1.4364191251271619, "grad_norm": 0.9918603301048279, "learning_rate": 1e-05, "loss": 0.48, "mean_token_accuracy": 0.8468407392501831, "num_tokens": 450077259.0, "step": 2824 }, { "epoch": 1.4369277721261444, "grad_norm": 1.0565818548202515, "learning_rate": 1e-05, "loss": 0.4795, "mean_token_accuracy": 0.8497186899185181, "num_tokens": 450234173.0, "step": 2825 }, { "epoch": 1.4374364191251272, "grad_norm": 0.9873266220092773, "learning_rate": 1e-05, "loss": 0.4826, "mean_token_accuracy": 0.8455828428268433, "num_tokens": 450408833.0, "step": 2826 }, { "epoch": 1.4379450661241098, "grad_norm": 1.0310784578323364, "learning_rate": 1e-05, "loss": 0.4976, "mean_token_accuracy": 0.842021107673645, "num_tokens": 450574471.0, "step": 2827 }, { "epoch": 1.4384537131230926, "grad_norm": 0.9773332476615906, "learning_rate": 1e-05, "loss": 0.4595, "mean_token_accuracy": 0.8536139130592346, "num_tokens": 450736983.0, "step": 2828 }, { "epoch": 1.4389623601220753, "grad_norm": 1.030726432800293, "learning_rate": 1e-05, "loss": 0.4519, "mean_token_accuracy": 0.8553815484046936, "num_tokens": 450896131.0, "step": 2829 }, { "epoch": 1.439471007121058, "grad_norm": 0.9751784801483154, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8510292768478394, "num_tokens": 451046227.0, "step": 2830 }, { "epoch": 1.4399796541200407, "grad_norm": 1.0584436655044556, "learning_rate": 1e-05, "loss": 0.4779, "mean_token_accuracy": 0.849427342414856, "num_tokens": 451194092.0, "step": 2831 }, { "epoch": 1.4404883011190233, "grad_norm": 1.0584428310394287, "learning_rate": 1e-05, "loss": 0.4836, "mean_token_accuracy": 0.8473069667816162, "num_tokens": 451349176.0, "step": 2832 }, { "epoch": 1.440996948118006, "grad_norm": 0.9864810705184937, "learning_rate": 1e-05, "loss": 0.4852, "mean_token_accuracy": 0.8452579379081726, "num_tokens": 451503657.0, "step": 2833 }, { "epoch": 1.4415055951169888, "grad_norm": 0.9788768291473389, "learning_rate": 1e-05, "loss": 0.4766, "mean_token_accuracy": 0.8486111164093018, "num_tokens": 451667384.0, "step": 2834 }, { "epoch": 1.4420142421159716, "grad_norm": 0.9842734932899475, "learning_rate": 1e-05, "loss": 0.4831, "mean_token_accuracy": 0.8461251258850098, "num_tokens": 451825588.0, "step": 2835 }, { "epoch": 1.4425228891149542, "grad_norm": 0.9226424098014832, "learning_rate": 1e-05, "loss": 0.4933, "mean_token_accuracy": 0.8441954255104065, "num_tokens": 451978435.0, "step": 2836 }, { "epoch": 1.443031536113937, "grad_norm": 1.0472816228866577, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8495010137557983, "num_tokens": 452132400.0, "step": 2837 }, { "epoch": 1.4435401831129195, "grad_norm": 0.9353339076042175, "learning_rate": 1e-05, "loss": 0.4927, "mean_token_accuracy": 0.8432925939559937, "num_tokens": 452287174.0, "step": 2838 }, { "epoch": 1.4440488301119023, "grad_norm": 0.919854462146759, "learning_rate": 1e-05, "loss": 0.479, "mean_token_accuracy": 0.8476002216339111, "num_tokens": 452446708.0, "step": 2839 }, { "epoch": 1.4445574771108851, "grad_norm": 1.0054397583007812, "learning_rate": 1e-05, "loss": 0.4946, "mean_token_accuracy": 0.8426610231399536, "num_tokens": 452595925.0, "step": 2840 }, { "epoch": 1.4450661241098677, "grad_norm": 0.9773548245429993, "learning_rate": 1e-05, "loss": 0.4619, "mean_token_accuracy": 0.8544117212295532, "num_tokens": 452741850.0, "step": 2841 }, { "epoch": 1.4455747711088505, "grad_norm": 0.9018303751945496, "learning_rate": 1e-05, "loss": 0.456, "mean_token_accuracy": 0.8554398417472839, "num_tokens": 452902299.0, "step": 2842 }, { "epoch": 1.446083418107833, "grad_norm": 0.9995993375778198, "learning_rate": 1e-05, "loss": 0.4995, "mean_token_accuracy": 0.8423516750335693, "num_tokens": 453060469.0, "step": 2843 }, { "epoch": 1.4465920651068158, "grad_norm": 1.0751382112503052, "learning_rate": 1e-05, "loss": 0.476, "mean_token_accuracy": 0.8492323160171509, "num_tokens": 453219512.0, "step": 2844 }, { "epoch": 1.4471007121057986, "grad_norm": 0.9150761961936951, "learning_rate": 1e-05, "loss": 0.456, "mean_token_accuracy": 0.8534543514251709, "num_tokens": 453373593.0, "step": 2845 }, { "epoch": 1.4476093591047814, "grad_norm": 0.9494074583053589, "learning_rate": 1e-05, "loss": 0.4873, "mean_token_accuracy": 0.8472472429275513, "num_tokens": 453528683.0, "step": 2846 }, { "epoch": 1.448118006103764, "grad_norm": 0.9087952971458435, "learning_rate": 1e-05, "loss": 0.4866, "mean_token_accuracy": 0.8457107543945312, "num_tokens": 453691751.0, "step": 2847 }, { "epoch": 1.4486266531027467, "grad_norm": 0.9003292322158813, "learning_rate": 1e-05, "loss": 0.4711, "mean_token_accuracy": 0.8518234491348267, "num_tokens": 453856172.0, "step": 2848 }, { "epoch": 1.4491353001017293, "grad_norm": 0.9344632029533386, "learning_rate": 1e-05, "loss": 0.4716, "mean_token_accuracy": 0.8498226404190063, "num_tokens": 454018226.0, "step": 2849 }, { "epoch": 1.449643947100712, "grad_norm": 0.9255077242851257, "learning_rate": 1e-05, "loss": 0.4281, "mean_token_accuracy": 0.8616542816162109, "num_tokens": 454181489.0, "step": 2850 }, { "epoch": 1.4501525940996949, "grad_norm": 0.9817742109298706, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8535650968551636, "num_tokens": 454331972.0, "step": 2851 }, { "epoch": 1.4506612410986774, "grad_norm": 0.9063531756401062, "learning_rate": 1e-05, "loss": 0.5027, "mean_token_accuracy": 0.8408591747283936, "num_tokens": 454494258.0, "step": 2852 }, { "epoch": 1.4511698880976602, "grad_norm": 0.9713846445083618, "learning_rate": 1e-05, "loss": 0.4273, "mean_token_accuracy": 0.8623003363609314, "num_tokens": 454638672.0, "step": 2853 }, { "epoch": 1.4516785350966428, "grad_norm": 0.9173727631568909, "learning_rate": 1e-05, "loss": 0.4706, "mean_token_accuracy": 0.8480606079101562, "num_tokens": 454794488.0, "step": 2854 }, { "epoch": 1.4521871820956256, "grad_norm": 0.9979090094566345, "learning_rate": 1e-05, "loss": 0.5043, "mean_token_accuracy": 0.8406027555465698, "num_tokens": 454949085.0, "step": 2855 }, { "epoch": 1.4526958290946084, "grad_norm": 1.0404869318008423, "learning_rate": 1e-05, "loss": 0.4942, "mean_token_accuracy": 0.8417496085166931, "num_tokens": 455107150.0, "step": 2856 }, { "epoch": 1.4532044760935912, "grad_norm": 0.9395345449447632, "learning_rate": 1e-05, "loss": 0.4606, "mean_token_accuracy": 0.8518513441085815, "num_tokens": 455273881.0, "step": 2857 }, { "epoch": 1.4537131230925737, "grad_norm": 1.007689356803894, "learning_rate": 1e-05, "loss": 0.4746, "mean_token_accuracy": 0.8480565547943115, "num_tokens": 455429921.0, "step": 2858 }, { "epoch": 1.4542217700915565, "grad_norm": 0.9877992868423462, "learning_rate": 1e-05, "loss": 0.4968, "mean_token_accuracy": 0.8433884382247925, "num_tokens": 455589102.0, "step": 2859 }, { "epoch": 1.454730417090539, "grad_norm": 0.8932809829711914, "learning_rate": 1e-05, "loss": 0.4724, "mean_token_accuracy": 0.8504021763801575, "num_tokens": 455754141.0, "step": 2860 }, { "epoch": 1.4552390640895219, "grad_norm": 0.9541919827461243, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8518562912940979, "num_tokens": 455928739.0, "step": 2861 }, { "epoch": 1.4557477110885046, "grad_norm": 0.9939335584640503, "learning_rate": 1e-05, "loss": 0.4828, "mean_token_accuracy": 0.8472523093223572, "num_tokens": 456076223.0, "step": 2862 }, { "epoch": 1.4562563580874872, "grad_norm": 0.9644919037818909, "learning_rate": 1e-05, "loss": 0.4599, "mean_token_accuracy": 0.8529149293899536, "num_tokens": 456244340.0, "step": 2863 }, { "epoch": 1.45676500508647, "grad_norm": 0.9466691613197327, "learning_rate": 1e-05, "loss": 0.4939, "mean_token_accuracy": 0.8433713912963867, "num_tokens": 456396940.0, "step": 2864 }, { "epoch": 1.4572736520854526, "grad_norm": 0.9644611477851868, "learning_rate": 1e-05, "loss": 0.4386, "mean_token_accuracy": 0.8583876490592957, "num_tokens": 456544680.0, "step": 2865 }, { "epoch": 1.4577822990844354, "grad_norm": 0.9246994256973267, "learning_rate": 1e-05, "loss": 0.482, "mean_token_accuracy": 0.8474558591842651, "num_tokens": 456705322.0, "step": 2866 }, { "epoch": 1.4582909460834181, "grad_norm": 0.9289038181304932, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8451316356658936, "num_tokens": 456868645.0, "step": 2867 }, { "epoch": 1.458799593082401, "grad_norm": 1.0041005611419678, "learning_rate": 1e-05, "loss": 0.4812, "mean_token_accuracy": 0.8481215238571167, "num_tokens": 457034824.0, "step": 2868 }, { "epoch": 1.4593082400813835, "grad_norm": 0.9717512130737305, "learning_rate": 1e-05, "loss": 0.5092, "mean_token_accuracy": 0.8409024477005005, "num_tokens": 457207183.0, "step": 2869 }, { "epoch": 1.4598168870803663, "grad_norm": 1.0197564363479614, "learning_rate": 1e-05, "loss": 0.4877, "mean_token_accuracy": 0.846825361251831, "num_tokens": 457358314.0, "step": 2870 }, { "epoch": 1.4603255340793488, "grad_norm": 0.9384062886238098, "learning_rate": 1e-05, "loss": 0.4532, "mean_token_accuracy": 0.8543113470077515, "num_tokens": 457523034.0, "step": 2871 }, { "epoch": 1.4608341810783316, "grad_norm": 1.0533404350280762, "learning_rate": 1e-05, "loss": 0.4765, "mean_token_accuracy": 0.8486883640289307, "num_tokens": 457677702.0, "step": 2872 }, { "epoch": 1.4613428280773144, "grad_norm": 0.9609192609786987, "learning_rate": 1e-05, "loss": 0.4841, "mean_token_accuracy": 0.8472496271133423, "num_tokens": 457841186.0, "step": 2873 }, { "epoch": 1.461851475076297, "grad_norm": 0.913457453250885, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8447855114936829, "num_tokens": 458004068.0, "step": 2874 }, { "epoch": 1.4623601220752798, "grad_norm": 1.0519827604293823, "learning_rate": 1e-05, "loss": 0.5015, "mean_token_accuracy": 0.8410388827323914, "num_tokens": 458171129.0, "step": 2875 }, { "epoch": 1.4628687690742623, "grad_norm": 0.8884391784667969, "learning_rate": 1e-05, "loss": 0.4743, "mean_token_accuracy": 0.8498002886772156, "num_tokens": 458327986.0, "step": 2876 }, { "epoch": 1.4633774160732451, "grad_norm": 0.9776370525360107, "learning_rate": 1e-05, "loss": 0.4947, "mean_token_accuracy": 0.8417224884033203, "num_tokens": 458492443.0, "step": 2877 }, { "epoch": 1.463886063072228, "grad_norm": 0.906736433506012, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8565185070037842, "num_tokens": 458641744.0, "step": 2878 }, { "epoch": 1.4643947100712107, "grad_norm": 0.9633562564849854, "learning_rate": 1e-05, "loss": 0.4599, "mean_token_accuracy": 0.8528835773468018, "num_tokens": 458800987.0, "step": 2879 }, { "epoch": 1.4649033570701933, "grad_norm": 0.9794289469718933, "learning_rate": 1e-05, "loss": 0.4757, "mean_token_accuracy": 0.8491966724395752, "num_tokens": 458963974.0, "step": 2880 }, { "epoch": 1.465412004069176, "grad_norm": 0.9550290703773499, "learning_rate": 1e-05, "loss": 0.47, "mean_token_accuracy": 0.8509371280670166, "num_tokens": 459117725.0, "step": 2881 }, { "epoch": 1.4659206510681586, "grad_norm": 1.0013341903686523, "learning_rate": 1e-05, "loss": 0.4812, "mean_token_accuracy": 0.8475896120071411, "num_tokens": 459273431.0, "step": 2882 }, { "epoch": 1.4664292980671414, "grad_norm": 0.9773209691047668, "learning_rate": 1e-05, "loss": 0.4775, "mean_token_accuracy": 0.848754346370697, "num_tokens": 459436131.0, "step": 2883 }, { "epoch": 1.4669379450661242, "grad_norm": 1.0352989435195923, "learning_rate": 1e-05, "loss": 0.4753, "mean_token_accuracy": 0.849433422088623, "num_tokens": 459590197.0, "step": 2884 }, { "epoch": 1.4674465920651067, "grad_norm": 1.086367130279541, "learning_rate": 1e-05, "loss": 0.4758, "mean_token_accuracy": 0.8498767614364624, "num_tokens": 459743580.0, "step": 2885 }, { "epoch": 1.4679552390640895, "grad_norm": 0.8803079128265381, "learning_rate": 1e-05, "loss": 0.4811, "mean_token_accuracy": 0.8476182222366333, "num_tokens": 459898577.0, "step": 2886 }, { "epoch": 1.468463886063072, "grad_norm": 1.1584640741348267, "learning_rate": 1e-05, "loss": 0.4915, "mean_token_accuracy": 0.8442654609680176, "num_tokens": 460056967.0, "step": 2887 }, { "epoch": 1.4689725330620549, "grad_norm": 1.0022369623184204, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.8600075244903564, "num_tokens": 460209370.0, "step": 2888 }, { "epoch": 1.4694811800610377, "grad_norm": 1.0320205688476562, "learning_rate": 1e-05, "loss": 0.4401, "mean_token_accuracy": 0.859348475933075, "num_tokens": 460373485.0, "step": 2889 }, { "epoch": 1.4699898270600205, "grad_norm": 1.0857659578323364, "learning_rate": 1e-05, "loss": 0.4768, "mean_token_accuracy": 0.847952127456665, "num_tokens": 460538263.0, "step": 2890 }, { "epoch": 1.470498474059003, "grad_norm": 1.0369950532913208, "learning_rate": 1e-05, "loss": 0.4798, "mean_token_accuracy": 0.846710741519928, "num_tokens": 460698779.0, "step": 2891 }, { "epoch": 1.4710071210579858, "grad_norm": 1.0814622640609741, "learning_rate": 1e-05, "loss": 0.4706, "mean_token_accuracy": 0.8494322299957275, "num_tokens": 460866578.0, "step": 2892 }, { "epoch": 1.4715157680569684, "grad_norm": 0.9672597646713257, "learning_rate": 1e-05, "loss": 0.4703, "mean_token_accuracy": 0.8501402139663696, "num_tokens": 461012557.0, "step": 2893 }, { "epoch": 1.4720244150559512, "grad_norm": 1.0468100309371948, "learning_rate": 1e-05, "loss": 0.5112, "mean_token_accuracy": 0.8384190797805786, "num_tokens": 461171630.0, "step": 2894 }, { "epoch": 1.472533062054934, "grad_norm": 0.9749298691749573, "learning_rate": 1e-05, "loss": 0.4624, "mean_token_accuracy": 0.852161705493927, "num_tokens": 461336078.0, "step": 2895 }, { "epoch": 1.4730417090539165, "grad_norm": 0.9439693093299866, "learning_rate": 1e-05, "loss": 0.4835, "mean_token_accuracy": 0.844796359539032, "num_tokens": 461506509.0, "step": 2896 }, { "epoch": 1.4735503560528993, "grad_norm": 0.9988195896148682, "learning_rate": 1e-05, "loss": 0.4394, "mean_token_accuracy": 0.860859751701355, "num_tokens": 461677135.0, "step": 2897 }, { "epoch": 1.4740590030518819, "grad_norm": 0.9463707804679871, "learning_rate": 1e-05, "loss": 0.434, "mean_token_accuracy": 0.8603488802909851, "num_tokens": 461835821.0, "step": 2898 }, { "epoch": 1.4745676500508647, "grad_norm": 1.0607638359069824, "learning_rate": 1e-05, "loss": 0.459, "mean_token_accuracy": 0.8542230725288391, "num_tokens": 461982449.0, "step": 2899 }, { "epoch": 1.4750762970498474, "grad_norm": 0.9559944868087769, "learning_rate": 1e-05, "loss": 0.4636, "mean_token_accuracy": 0.85247403383255, "num_tokens": 462141478.0, "step": 2900 }, { "epoch": 1.4755849440488302, "grad_norm": 0.9273171424865723, "learning_rate": 1e-05, "loss": 0.4469, "mean_token_accuracy": 0.8564932346343994, "num_tokens": 462308851.0, "step": 2901 }, { "epoch": 1.4760935910478128, "grad_norm": 0.932365894317627, "learning_rate": 1e-05, "loss": 0.4631, "mean_token_accuracy": 0.8522019386291504, "num_tokens": 462469829.0, "step": 2902 }, { "epoch": 1.4766022380467956, "grad_norm": 1.1404035091400146, "learning_rate": 1e-05, "loss": 0.4357, "mean_token_accuracy": 0.8589218258857727, "num_tokens": 462635407.0, "step": 2903 }, { "epoch": 1.4771108850457781, "grad_norm": 1.0725706815719604, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.8407667875289917, "num_tokens": 462805295.0, "step": 2904 }, { "epoch": 1.477619532044761, "grad_norm": 0.9216992855072021, "learning_rate": 1e-05, "loss": 0.4468, "mean_token_accuracy": 0.8585999011993408, "num_tokens": 462971691.0, "step": 2905 }, { "epoch": 1.4781281790437437, "grad_norm": 0.9899900555610657, "learning_rate": 1e-05, "loss": 0.4804, "mean_token_accuracy": 0.8486272096633911, "num_tokens": 463137500.0, "step": 2906 }, { "epoch": 1.4786368260427263, "grad_norm": 1.0153653621673584, "learning_rate": 1e-05, "loss": 0.478, "mean_token_accuracy": 0.8489232659339905, "num_tokens": 463288904.0, "step": 2907 }, { "epoch": 1.479145473041709, "grad_norm": 1.4542001485824585, "learning_rate": 1e-05, "loss": 0.5027, "mean_token_accuracy": 0.8426099419593811, "num_tokens": 463448939.0, "step": 2908 }, { "epoch": 1.4796541200406916, "grad_norm": 1.0369772911071777, "learning_rate": 1e-05, "loss": 0.4975, "mean_token_accuracy": 0.8416335582733154, "num_tokens": 463601841.0, "step": 2909 }, { "epoch": 1.4801627670396744, "grad_norm": 0.9426633715629578, "learning_rate": 1e-05, "loss": 0.4753, "mean_token_accuracy": 0.8484392166137695, "num_tokens": 463759585.0, "step": 2910 }, { "epoch": 1.4806714140386572, "grad_norm": 1.0142484903335571, "learning_rate": 1e-05, "loss": 0.466, "mean_token_accuracy": 0.8509640097618103, "num_tokens": 463928168.0, "step": 2911 }, { "epoch": 1.48118006103764, "grad_norm": 0.9512984752655029, "learning_rate": 1e-05, "loss": 0.4816, "mean_token_accuracy": 0.8469193577766418, "num_tokens": 464082388.0, "step": 2912 }, { "epoch": 1.4816887080366226, "grad_norm": 0.977675199508667, "learning_rate": 1e-05, "loss": 0.4683, "mean_token_accuracy": 0.8513289093971252, "num_tokens": 464242956.0, "step": 2913 }, { "epoch": 1.4821973550356053, "grad_norm": 0.9527803659439087, "learning_rate": 1e-05, "loss": 0.4922, "mean_token_accuracy": 0.8448150157928467, "num_tokens": 464417202.0, "step": 2914 }, { "epoch": 1.482706002034588, "grad_norm": 0.9302776455879211, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8493098020553589, "num_tokens": 464570429.0, "step": 2915 }, { "epoch": 1.4832146490335707, "grad_norm": 0.9582393765449524, "learning_rate": 1e-05, "loss": 0.461, "mean_token_accuracy": 0.8533623218536377, "num_tokens": 464725276.0, "step": 2916 }, { "epoch": 1.4837232960325535, "grad_norm": 0.9441304206848145, "learning_rate": 1e-05, "loss": 0.4636, "mean_token_accuracy": 0.8537716865539551, "num_tokens": 464893941.0, "step": 2917 }, { "epoch": 1.484231943031536, "grad_norm": 0.9462320804595947, "learning_rate": 1e-05, "loss": 0.4502, "mean_token_accuracy": 0.8577545285224915, "num_tokens": 465058613.0, "step": 2918 }, { "epoch": 1.4847405900305188, "grad_norm": 0.9499002695083618, "learning_rate": 1e-05, "loss": 0.4862, "mean_token_accuracy": 0.8443703651428223, "num_tokens": 465218139.0, "step": 2919 }, { "epoch": 1.4852492370295014, "grad_norm": 0.958915650844574, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.8586746454238892, "num_tokens": 465376097.0, "step": 2920 }, { "epoch": 1.4857578840284842, "grad_norm": 0.9286035895347595, "learning_rate": 1e-05, "loss": 0.5144, "mean_token_accuracy": 0.8386704921722412, "num_tokens": 465532660.0, "step": 2921 }, { "epoch": 1.486266531027467, "grad_norm": 0.983659029006958, "learning_rate": 1e-05, "loss": 0.4876, "mean_token_accuracy": 0.8438690900802612, "num_tokens": 465702748.0, "step": 2922 }, { "epoch": 1.4867751780264498, "grad_norm": 0.9186338782310486, "learning_rate": 1e-05, "loss": 0.4297, "mean_token_accuracy": 0.8618059158325195, "num_tokens": 465854774.0, "step": 2923 }, { "epoch": 1.4872838250254323, "grad_norm": 1.0261900424957275, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8528866171836853, "num_tokens": 466007913.0, "step": 2924 }, { "epoch": 1.4877924720244151, "grad_norm": 0.9390912055969238, "learning_rate": 1e-05, "loss": 0.4954, "mean_token_accuracy": 0.8455357551574707, "num_tokens": 466164365.0, "step": 2925 }, { "epoch": 1.4883011190233977, "grad_norm": 0.8921604156494141, "learning_rate": 1e-05, "loss": 0.4732, "mean_token_accuracy": 0.849597692489624, "num_tokens": 466327362.0, "step": 2926 }, { "epoch": 1.4888097660223805, "grad_norm": 0.989403247833252, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8581273555755615, "num_tokens": 466477193.0, "step": 2927 }, { "epoch": 1.4893184130213633, "grad_norm": 0.944860577583313, "learning_rate": 1e-05, "loss": 0.4569, "mean_token_accuracy": 0.8537037372589111, "num_tokens": 466638743.0, "step": 2928 }, { "epoch": 1.4898270600203458, "grad_norm": 1.0039745569229126, "learning_rate": 1e-05, "loss": 0.457, "mean_token_accuracy": 0.8552755117416382, "num_tokens": 466797661.0, "step": 2929 }, { "epoch": 1.4903357070193286, "grad_norm": 0.9398649334907532, "learning_rate": 1e-05, "loss": 0.492, "mean_token_accuracy": 0.8440238237380981, "num_tokens": 466958501.0, "step": 2930 }, { "epoch": 1.4908443540183112, "grad_norm": 1.174691915512085, "learning_rate": 1e-05, "loss": 0.4423, "mean_token_accuracy": 0.8575791120529175, "num_tokens": 467110269.0, "step": 2931 }, { "epoch": 1.491353001017294, "grad_norm": 1.0185178518295288, "learning_rate": 1e-05, "loss": 0.5289, "mean_token_accuracy": 0.8333423733711243, "num_tokens": 467271547.0, "step": 2932 }, { "epoch": 1.4918616480162767, "grad_norm": 0.8953125476837158, "learning_rate": 1e-05, "loss": 0.4392, "mean_token_accuracy": 0.8588826656341553, "num_tokens": 467440161.0, "step": 2933 }, { "epoch": 1.4923702950152595, "grad_norm": 0.9780067801475525, "learning_rate": 1e-05, "loss": 0.4829, "mean_token_accuracy": 0.8456743359565735, "num_tokens": 467597696.0, "step": 2934 }, { "epoch": 1.492878942014242, "grad_norm": 0.9480140209197998, "learning_rate": 1e-05, "loss": 0.4682, "mean_token_accuracy": 0.8502559661865234, "num_tokens": 467763338.0, "step": 2935 }, { "epoch": 1.4933875890132249, "grad_norm": 0.9286141395568848, "learning_rate": 1e-05, "loss": 0.4871, "mean_token_accuracy": 0.8461545705795288, "num_tokens": 467925187.0, "step": 2936 }, { "epoch": 1.4938962360122074, "grad_norm": 1.008232593536377, "learning_rate": 1e-05, "loss": 0.5066, "mean_token_accuracy": 0.8377925157546997, "num_tokens": 468077219.0, "step": 2937 }, { "epoch": 1.4944048830111902, "grad_norm": 1.0144633054733276, "learning_rate": 1e-05, "loss": 0.488, "mean_token_accuracy": 0.8458861112594604, "num_tokens": 468226973.0, "step": 2938 }, { "epoch": 1.494913530010173, "grad_norm": 1.2867398262023926, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.8544325828552246, "num_tokens": 468386458.0, "step": 2939 }, { "epoch": 1.4954221770091556, "grad_norm": 0.9466829299926758, "learning_rate": 1e-05, "loss": 0.4535, "mean_token_accuracy": 0.8576674461364746, "num_tokens": 468534657.0, "step": 2940 }, { "epoch": 1.4959308240081384, "grad_norm": 0.9843382239341736, "learning_rate": 1e-05, "loss": 0.4552, "mean_token_accuracy": 0.855600118637085, "num_tokens": 468699181.0, "step": 2941 }, { "epoch": 1.496439471007121, "grad_norm": 0.9357199668884277, "learning_rate": 1e-05, "loss": 0.5075, "mean_token_accuracy": 0.8387920260429382, "num_tokens": 468863037.0, "step": 2942 }, { "epoch": 1.4969481180061037, "grad_norm": 0.8901887536048889, "learning_rate": 1e-05, "loss": 0.4801, "mean_token_accuracy": 0.8496588468551636, "num_tokens": 469024753.0, "step": 2943 }, { "epoch": 1.4974567650050865, "grad_norm": 1.0716995000839233, "learning_rate": 1e-05, "loss": 0.4935, "mean_token_accuracy": 0.8449990153312683, "num_tokens": 469175522.0, "step": 2944 }, { "epoch": 1.4979654120040693, "grad_norm": 0.918582022190094, "learning_rate": 1e-05, "loss": 0.4658, "mean_token_accuracy": 0.8516040444374084, "num_tokens": 469335707.0, "step": 2945 }, { "epoch": 1.4984740590030519, "grad_norm": 1.4671798944473267, "learning_rate": 1e-05, "loss": 0.4507, "mean_token_accuracy": 0.8577188849449158, "num_tokens": 469489877.0, "step": 2946 }, { "epoch": 1.4989827060020346, "grad_norm": 1.0488964319229126, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.8407468795776367, "num_tokens": 469649523.0, "step": 2947 }, { "epoch": 1.4994913530010172, "grad_norm": 0.928181529045105, "learning_rate": 1e-05, "loss": 0.4671, "mean_token_accuracy": 0.8499090671539307, "num_tokens": 469823003.0, "step": 2948 }, { "epoch": 1.5, "grad_norm": 1.0365320444107056, "learning_rate": 1e-05, "loss": 0.4836, "mean_token_accuracy": 0.8448408842086792, "num_tokens": 469987692.0, "step": 2949 }, { "epoch": 1.5005086469989828, "grad_norm": 1.0148674249649048, "learning_rate": 1e-05, "loss": 0.4765, "mean_token_accuracy": 0.8493233919143677, "num_tokens": 470149249.0, "step": 2950 }, { "epoch": 1.5010172939979656, "grad_norm": 0.9570903182029724, "learning_rate": 1e-05, "loss": 0.4921, "mean_token_accuracy": 0.844947099685669, "num_tokens": 470300640.0, "step": 2951 }, { "epoch": 1.5015259409969481, "grad_norm": 1.0189590454101562, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.8395256400108337, "num_tokens": 470472689.0, "step": 2952 }, { "epoch": 1.5020345879959307, "grad_norm": 1.0926448106765747, "learning_rate": 1e-05, "loss": 0.4706, "mean_token_accuracy": 0.8507436513900757, "num_tokens": 470632121.0, "step": 2953 }, { "epoch": 1.5025432349949135, "grad_norm": 0.9752507209777832, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8419481515884399, "num_tokens": 470771648.0, "step": 2954 }, { "epoch": 1.5030518819938963, "grad_norm": 0.921377956867218, "learning_rate": 1e-05, "loss": 0.4622, "mean_token_accuracy": 0.8515791893005371, "num_tokens": 470935021.0, "step": 2955 }, { "epoch": 1.503560528992879, "grad_norm": 0.9844980835914612, "learning_rate": 1e-05, "loss": 0.4852, "mean_token_accuracy": 0.8456733226776123, "num_tokens": 471089333.0, "step": 2956 }, { "epoch": 1.5040691759918616, "grad_norm": 1.0702826976776123, "learning_rate": 1e-05, "loss": 0.4828, "mean_token_accuracy": 0.8475288152694702, "num_tokens": 471253508.0, "step": 2957 }, { "epoch": 1.5045778229908442, "grad_norm": 0.9100438356399536, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8491068482398987, "num_tokens": 471422317.0, "step": 2958 }, { "epoch": 1.505086469989827, "grad_norm": 1.096585988998413, "learning_rate": 1e-05, "loss": 0.4363, "mean_token_accuracy": 0.8580671548843384, "num_tokens": 471571311.0, "step": 2959 }, { "epoch": 1.5055951169888098, "grad_norm": 1.007280707359314, "learning_rate": 1e-05, "loss": 0.5051, "mean_token_accuracy": 0.8421881794929504, "num_tokens": 471730047.0, "step": 2960 }, { "epoch": 1.5061037639877926, "grad_norm": 1.070167899131775, "learning_rate": 1e-05, "loss": 0.5001, "mean_token_accuracy": 0.8414835333824158, "num_tokens": 471891342.0, "step": 2961 }, { "epoch": 1.5066124109867753, "grad_norm": 1.114966630935669, "learning_rate": 1e-05, "loss": 0.4803, "mean_token_accuracy": 0.8474112749099731, "num_tokens": 472056255.0, "step": 2962 }, { "epoch": 1.507121057985758, "grad_norm": 0.967980682849884, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8452816009521484, "num_tokens": 472225606.0, "step": 2963 }, { "epoch": 1.5076297049847405, "grad_norm": 0.9875637292861938, "learning_rate": 1e-05, "loss": 0.4599, "mean_token_accuracy": 0.853312611579895, "num_tokens": 472385581.0, "step": 2964 }, { "epoch": 1.5081383519837233, "grad_norm": 1.0010428428649902, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.8569035530090332, "num_tokens": 472549805.0, "step": 2965 }, { "epoch": 1.508646998982706, "grad_norm": 1.1223161220550537, "learning_rate": 1e-05, "loss": 0.4737, "mean_token_accuracy": 0.849111795425415, "num_tokens": 472697254.0, "step": 2966 }, { "epoch": 1.5091556459816888, "grad_norm": 1.1635527610778809, "learning_rate": 1e-05, "loss": 0.4757, "mean_token_accuracy": 0.8492852449417114, "num_tokens": 472853595.0, "step": 2967 }, { "epoch": 1.5096642929806714, "grad_norm": 0.9469373822212219, "learning_rate": 1e-05, "loss": 0.5128, "mean_token_accuracy": 0.8389683961868286, "num_tokens": 473029719.0, "step": 2968 }, { "epoch": 1.510172939979654, "grad_norm": 1.0621305704116821, "learning_rate": 1e-05, "loss": 0.4757, "mean_token_accuracy": 0.8480264544487, "num_tokens": 473188682.0, "step": 2969 }, { "epoch": 1.5106815869786367, "grad_norm": 0.9598436951637268, "learning_rate": 1e-05, "loss": 0.4881, "mean_token_accuracy": 0.8451364040374756, "num_tokens": 473348305.0, "step": 2970 }, { "epoch": 1.5111902339776195, "grad_norm": 0.9786617159843445, "learning_rate": 1e-05, "loss": 0.4676, "mean_token_accuracy": 0.8512166738510132, "num_tokens": 473502448.0, "step": 2971 }, { "epoch": 1.5116988809766023, "grad_norm": 0.9746079444885254, "learning_rate": 1e-05, "loss": 0.4894, "mean_token_accuracy": 0.8454301953315735, "num_tokens": 473666571.0, "step": 2972 }, { "epoch": 1.512207527975585, "grad_norm": 0.9722834825515747, "learning_rate": 1e-05, "loss": 0.4342, "mean_token_accuracy": 0.8606178760528564, "num_tokens": 473823460.0, "step": 2973 }, { "epoch": 1.5127161749745677, "grad_norm": 0.9689220190048218, "learning_rate": 1e-05, "loss": 0.4749, "mean_token_accuracy": 0.8500169515609741, "num_tokens": 473976615.0, "step": 2974 }, { "epoch": 1.5132248219735502, "grad_norm": 1.01953125, "learning_rate": 1e-05, "loss": 0.43, "mean_token_accuracy": 0.8606699705123901, "num_tokens": 474127194.0, "step": 2975 }, { "epoch": 1.513733468972533, "grad_norm": 0.9314214587211609, "learning_rate": 1e-05, "loss": 0.4925, "mean_token_accuracy": 0.8451440334320068, "num_tokens": 474309906.0, "step": 2976 }, { "epoch": 1.5142421159715158, "grad_norm": 0.9425230026245117, "learning_rate": 1e-05, "loss": 0.4732, "mean_token_accuracy": 0.8487437963485718, "num_tokens": 474468000.0, "step": 2977 }, { "epoch": 1.5147507629704986, "grad_norm": 1.0697709321975708, "learning_rate": 1e-05, "loss": 0.502, "mean_token_accuracy": 0.8403645753860474, "num_tokens": 474620054.0, "step": 2978 }, { "epoch": 1.5152594099694812, "grad_norm": 0.9415473937988281, "learning_rate": 1e-05, "loss": 0.4734, "mean_token_accuracy": 0.8491325974464417, "num_tokens": 474780460.0, "step": 2979 }, { "epoch": 1.5157680569684637, "grad_norm": 0.9371607899665833, "learning_rate": 1e-05, "loss": 0.4792, "mean_token_accuracy": 0.8479374647140503, "num_tokens": 474940167.0, "step": 2980 }, { "epoch": 1.5162767039674465, "grad_norm": 0.9356675148010254, "learning_rate": 1e-05, "loss": 0.458, "mean_token_accuracy": 0.8535172939300537, "num_tokens": 475100842.0, "step": 2981 }, { "epoch": 1.5167853509664293, "grad_norm": 0.9361054301261902, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.8528018593788147, "num_tokens": 475258659.0, "step": 2982 }, { "epoch": 1.517293997965412, "grad_norm": 0.9554124474525452, "learning_rate": 1e-05, "loss": 0.4714, "mean_token_accuracy": 0.8502927422523499, "num_tokens": 475420431.0, "step": 2983 }, { "epoch": 1.5178026449643949, "grad_norm": 0.9128940105438232, "learning_rate": 1e-05, "loss": 0.4588, "mean_token_accuracy": 0.8527088165283203, "num_tokens": 475577873.0, "step": 2984 }, { "epoch": 1.5183112919633774, "grad_norm": 0.9470569491386414, "learning_rate": 1e-05, "loss": 0.4894, "mean_token_accuracy": 0.843989372253418, "num_tokens": 475739554.0, "step": 2985 }, { "epoch": 1.51881993896236, "grad_norm": 0.9532779455184937, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8447748422622681, "num_tokens": 475909298.0, "step": 2986 }, { "epoch": 1.5193285859613428, "grad_norm": 0.9041764736175537, "learning_rate": 1e-05, "loss": 0.5138, "mean_token_accuracy": 0.83664470911026, "num_tokens": 476081499.0, "step": 2987 }, { "epoch": 1.5198372329603256, "grad_norm": 0.8667990565299988, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.8529576063156128, "num_tokens": 476240775.0, "step": 2988 }, { "epoch": 1.5203458799593084, "grad_norm": 0.9188169836997986, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8563479781150818, "num_tokens": 476404318.0, "step": 2989 }, { "epoch": 1.520854526958291, "grad_norm": 1.072871208190918, "learning_rate": 1e-05, "loss": 0.506, "mean_token_accuracy": 0.8398078680038452, "num_tokens": 476556580.0, "step": 2990 }, { "epoch": 1.5213631739572735, "grad_norm": 0.9862761497497559, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8528329730033875, "num_tokens": 476695113.0, "step": 2991 }, { "epoch": 1.5218718209562563, "grad_norm": 0.9302590489387512, "learning_rate": 1e-05, "loss": 0.4677, "mean_token_accuracy": 0.8522852659225464, "num_tokens": 476855526.0, "step": 2992 }, { "epoch": 1.522380467955239, "grad_norm": 1.0184320211410522, "learning_rate": 1e-05, "loss": 0.4701, "mean_token_accuracy": 0.8505998253822327, "num_tokens": 477014468.0, "step": 2993 }, { "epoch": 1.5228891149542219, "grad_norm": 0.9482423067092896, "learning_rate": 1e-05, "loss": 0.4907, "mean_token_accuracy": 0.8458576202392578, "num_tokens": 477177840.0, "step": 2994 }, { "epoch": 1.5233977619532044, "grad_norm": 1.9431716203689575, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8519662618637085, "num_tokens": 477338479.0, "step": 2995 }, { "epoch": 1.5239064089521872, "grad_norm": 0.9888985753059387, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8544604778289795, "num_tokens": 477502383.0, "step": 2996 }, { "epoch": 1.5244150559511698, "grad_norm": 0.9551851153373718, "learning_rate": 1e-05, "loss": 0.4485, "mean_token_accuracy": 0.8576740026473999, "num_tokens": 477664750.0, "step": 2997 }, { "epoch": 1.5249237029501526, "grad_norm": 0.9770562648773193, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.8444640636444092, "num_tokens": 477826492.0, "step": 2998 }, { "epoch": 1.5254323499491353, "grad_norm": 1.3451825380325317, "learning_rate": 1e-05, "loss": 0.4512, "mean_token_accuracy": 0.8568999767303467, "num_tokens": 477974878.0, "step": 2999 }, { "epoch": 1.5259409969481181, "grad_norm": 1.1307803392410278, "learning_rate": 1e-05, "loss": 0.4546, "mean_token_accuracy": 0.8543623685836792, "num_tokens": 478124029.0, "step": 3000 }, { "epoch": 1.5264496439471007, "grad_norm": 0.9587520360946655, "learning_rate": 1e-05, "loss": 0.46, "mean_token_accuracy": 0.8539354801177979, "num_tokens": 478282115.0, "step": 3001 }, { "epoch": 1.5269582909460833, "grad_norm": 0.9479132890701294, "learning_rate": 1e-05, "loss": 0.4697, "mean_token_accuracy": 0.8511796593666077, "num_tokens": 478451961.0, "step": 3002 }, { "epoch": 1.527466937945066, "grad_norm": 1.134979486465454, "learning_rate": 1e-05, "loss": 0.4863, "mean_token_accuracy": 0.8461675643920898, "num_tokens": 478608587.0, "step": 3003 }, { "epoch": 1.5279755849440488, "grad_norm": 0.8995083570480347, "learning_rate": 1e-05, "loss": 0.4951, "mean_token_accuracy": 0.8425230979919434, "num_tokens": 478777570.0, "step": 3004 }, { "epoch": 1.5284842319430316, "grad_norm": 1.0487364530563354, "learning_rate": 1e-05, "loss": 0.4638, "mean_token_accuracy": 0.8535329103469849, "num_tokens": 478927415.0, "step": 3005 }, { "epoch": 1.5289928789420142, "grad_norm": 0.9376885294914246, "learning_rate": 1e-05, "loss": 0.4691, "mean_token_accuracy": 0.8515335917472839, "num_tokens": 479091694.0, "step": 3006 }, { "epoch": 1.529501525940997, "grad_norm": 1.0092753171920776, "learning_rate": 1e-05, "loss": 0.4915, "mean_token_accuracy": 0.8434138298034668, "num_tokens": 479247246.0, "step": 3007 }, { "epoch": 1.5300101729399795, "grad_norm": 0.9187234044075012, "learning_rate": 1e-05, "loss": 0.4664, "mean_token_accuracy": 0.8512427806854248, "num_tokens": 479395630.0, "step": 3008 }, { "epoch": 1.5305188199389623, "grad_norm": 0.9388754367828369, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.8556320071220398, "num_tokens": 479557400.0, "step": 3009 }, { "epoch": 1.5310274669379451, "grad_norm": 0.9462489485740662, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8584042191505432, "num_tokens": 479705902.0, "step": 3010 }, { "epoch": 1.531536113936928, "grad_norm": 0.9888403415679932, "learning_rate": 1e-05, "loss": 0.4769, "mean_token_accuracy": 0.8489364385604858, "num_tokens": 479864999.0, "step": 3011 }, { "epoch": 1.5320447609359105, "grad_norm": 0.9582159519195557, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.852144718170166, "num_tokens": 480016304.0, "step": 3012 }, { "epoch": 1.532553407934893, "grad_norm": 0.9440920352935791, "learning_rate": 1e-05, "loss": 0.4821, "mean_token_accuracy": 0.8469336032867432, "num_tokens": 480184076.0, "step": 3013 }, { "epoch": 1.5330620549338758, "grad_norm": 0.9551937580108643, "learning_rate": 1e-05, "loss": 0.4392, "mean_token_accuracy": 0.8583161234855652, "num_tokens": 480346413.0, "step": 3014 }, { "epoch": 1.5335707019328586, "grad_norm": 0.9670236110687256, "learning_rate": 1e-05, "loss": 0.4395, "mean_token_accuracy": 0.8601056337356567, "num_tokens": 480496300.0, "step": 3015 }, { "epoch": 1.5340793489318414, "grad_norm": 0.961797297000885, "learning_rate": 1e-05, "loss": 0.499, "mean_token_accuracy": 0.841756284236908, "num_tokens": 480658234.0, "step": 3016 }, { "epoch": 1.534587995930824, "grad_norm": 0.9484275579452515, "learning_rate": 1e-05, "loss": 0.4755, "mean_token_accuracy": 0.8482771515846252, "num_tokens": 480815704.0, "step": 3017 }, { "epoch": 1.5350966429298067, "grad_norm": 1.072632908821106, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.8507258892059326, "num_tokens": 480980014.0, "step": 3018 }, { "epoch": 1.5356052899287893, "grad_norm": 0.8930185437202454, "learning_rate": 1e-05, "loss": 0.4989, "mean_token_accuracy": 0.8436600565910339, "num_tokens": 481147388.0, "step": 3019 }, { "epoch": 1.536113936927772, "grad_norm": 0.9993252158164978, "learning_rate": 1e-05, "loss": 0.512, "mean_token_accuracy": 0.8393548727035522, "num_tokens": 481310267.0, "step": 3020 }, { "epoch": 1.5366225839267549, "grad_norm": 1.0826072692871094, "learning_rate": 1e-05, "loss": 0.5009, "mean_token_accuracy": 0.8405871391296387, "num_tokens": 481466862.0, "step": 3021 }, { "epoch": 1.5371312309257377, "grad_norm": 0.9848809242248535, "learning_rate": 1e-05, "loss": 0.5347, "mean_token_accuracy": 0.8312636613845825, "num_tokens": 481623100.0, "step": 3022 }, { "epoch": 1.5376398779247202, "grad_norm": 0.9985467195510864, "learning_rate": 1e-05, "loss": 0.4824, "mean_token_accuracy": 0.8475914001464844, "num_tokens": 481786925.0, "step": 3023 }, { "epoch": 1.5381485249237028, "grad_norm": 1.0154087543487549, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.8535425662994385, "num_tokens": 481941546.0, "step": 3024 }, { "epoch": 1.5386571719226856, "grad_norm": 0.9510856866836548, "learning_rate": 1e-05, "loss": 0.5078, "mean_token_accuracy": 0.8398616313934326, "num_tokens": 482102028.0, "step": 3025 }, { "epoch": 1.5391658189216684, "grad_norm": 0.9930600523948669, "learning_rate": 1e-05, "loss": 0.5023, "mean_token_accuracy": 0.8423943519592285, "num_tokens": 482261954.0, "step": 3026 }, { "epoch": 1.5396744659206512, "grad_norm": 0.9215907454490662, "learning_rate": 1e-05, "loss": 0.47, "mean_token_accuracy": 0.8503847122192383, "num_tokens": 482424450.0, "step": 3027 }, { "epoch": 1.5401831129196337, "grad_norm": 1.0794174671173096, "learning_rate": 1e-05, "loss": 0.4712, "mean_token_accuracy": 0.8509054183959961, "num_tokens": 482583861.0, "step": 3028 }, { "epoch": 1.5406917599186165, "grad_norm": 0.9406433701515198, "learning_rate": 1e-05, "loss": 0.4569, "mean_token_accuracy": 0.8541445732116699, "num_tokens": 482744473.0, "step": 3029 }, { "epoch": 1.541200406917599, "grad_norm": 0.896527111530304, "learning_rate": 1e-05, "loss": 0.4813, "mean_token_accuracy": 0.8471190929412842, "num_tokens": 482917251.0, "step": 3030 }, { "epoch": 1.5417090539165819, "grad_norm": 1.0127754211425781, "learning_rate": 1e-05, "loss": 0.4708, "mean_token_accuracy": 0.8493427634239197, "num_tokens": 483088314.0, "step": 3031 }, { "epoch": 1.5422177009155646, "grad_norm": 0.9132606387138367, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8510922789573669, "num_tokens": 483235136.0, "step": 3032 }, { "epoch": 1.5427263479145474, "grad_norm": 1.001651406288147, "learning_rate": 1e-05, "loss": 0.4588, "mean_token_accuracy": 0.8525159358978271, "num_tokens": 483379352.0, "step": 3033 }, { "epoch": 1.54323499491353, "grad_norm": 0.9252734184265137, "learning_rate": 1e-05, "loss": 0.4719, "mean_token_accuracy": 0.8498355746269226, "num_tokens": 483551953.0, "step": 3034 }, { "epoch": 1.5437436419125126, "grad_norm": 0.9206449389457703, "learning_rate": 1e-05, "loss": 0.4612, "mean_token_accuracy": 0.8521483540534973, "num_tokens": 483702864.0, "step": 3035 }, { "epoch": 1.5442522889114954, "grad_norm": 1.0111031532287598, "learning_rate": 1e-05, "loss": 0.4822, "mean_token_accuracy": 0.8495075702667236, "num_tokens": 483861059.0, "step": 3036 }, { "epoch": 1.5447609359104781, "grad_norm": 0.91339111328125, "learning_rate": 1e-05, "loss": 0.4671, "mean_token_accuracy": 0.8509300947189331, "num_tokens": 484016392.0, "step": 3037 }, { "epoch": 1.545269582909461, "grad_norm": 0.8808000087738037, "learning_rate": 1e-05, "loss": 0.4525, "mean_token_accuracy": 0.8548637628555298, "num_tokens": 484173960.0, "step": 3038 }, { "epoch": 1.5457782299084435, "grad_norm": 1.042173147201538, "learning_rate": 1e-05, "loss": 0.4829, "mean_token_accuracy": 0.8460304737091064, "num_tokens": 484317468.0, "step": 3039 }, { "epoch": 1.5462868769074263, "grad_norm": 0.9629754424095154, "learning_rate": 1e-05, "loss": 0.5107, "mean_token_accuracy": 0.8375293016433716, "num_tokens": 484486741.0, "step": 3040 }, { "epoch": 1.5467955239064088, "grad_norm": 0.9451351761817932, "learning_rate": 1e-05, "loss": 0.4739, "mean_token_accuracy": 0.8494521379470825, "num_tokens": 484652684.0, "step": 3041 }, { "epoch": 1.5473041709053916, "grad_norm": 0.8708462715148926, "learning_rate": 1e-05, "loss": 0.4878, "mean_token_accuracy": 0.8446558117866516, "num_tokens": 484829318.0, "step": 3042 }, { "epoch": 1.5478128179043744, "grad_norm": 1.038165807723999, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.848456084728241, "num_tokens": 484991769.0, "step": 3043 }, { "epoch": 1.5483214649033572, "grad_norm": 0.9187168478965759, "learning_rate": 1e-05, "loss": 0.4832, "mean_token_accuracy": 0.846601128578186, "num_tokens": 485146681.0, "step": 3044 }, { "epoch": 1.5488301119023398, "grad_norm": 1.0916402339935303, "learning_rate": 1e-05, "loss": 0.4771, "mean_token_accuracy": 0.8475044369697571, "num_tokens": 485304764.0, "step": 3045 }, { "epoch": 1.5493387589013223, "grad_norm": 0.9968273043632507, "learning_rate": 1e-05, "loss": 0.4709, "mean_token_accuracy": 0.8496679663658142, "num_tokens": 485461388.0, "step": 3046 }, { "epoch": 1.5498474059003051, "grad_norm": 1.01937735080719, "learning_rate": 1e-05, "loss": 0.476, "mean_token_accuracy": 0.8473126292228699, "num_tokens": 485612954.0, "step": 3047 }, { "epoch": 1.550356052899288, "grad_norm": 0.9482355713844299, "learning_rate": 1e-05, "loss": 0.4601, "mean_token_accuracy": 0.8525004386901855, "num_tokens": 485774911.0, "step": 3048 }, { "epoch": 1.5508646998982707, "grad_norm": 0.8762860298156738, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.850465714931488, "num_tokens": 485941736.0, "step": 3049 }, { "epoch": 1.5513733468972533, "grad_norm": 0.9830577969551086, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.8413152098655701, "num_tokens": 486099339.0, "step": 3050 }, { "epoch": 1.551881993896236, "grad_norm": 0.9780283570289612, "learning_rate": 1e-05, "loss": 0.4502, "mean_token_accuracy": 0.8565871715545654, "num_tokens": 486253663.0, "step": 3051 }, { "epoch": 1.5523906408952186, "grad_norm": 0.9304409623146057, "learning_rate": 1e-05, "loss": 0.4832, "mean_token_accuracy": 0.8460574150085449, "num_tokens": 486412016.0, "step": 3052 }, { "epoch": 1.5528992878942014, "grad_norm": 1.0042747259140015, "learning_rate": 1e-05, "loss": 0.4613, "mean_token_accuracy": 0.8515927791595459, "num_tokens": 486559498.0, "step": 3053 }, { "epoch": 1.5534079348931842, "grad_norm": 0.9880870580673218, "learning_rate": 1e-05, "loss": 0.4478, "mean_token_accuracy": 0.8557184934616089, "num_tokens": 486710623.0, "step": 3054 }, { "epoch": 1.553916581892167, "grad_norm": 0.9632434844970703, "learning_rate": 1e-05, "loss": 0.5085, "mean_token_accuracy": 0.8388104438781738, "num_tokens": 486868761.0, "step": 3055 }, { "epoch": 1.5544252288911495, "grad_norm": 0.9700770974159241, "learning_rate": 1e-05, "loss": 0.4615, "mean_token_accuracy": 0.851329505443573, "num_tokens": 487025165.0, "step": 3056 }, { "epoch": 1.554933875890132, "grad_norm": 0.9787797331809998, "learning_rate": 1e-05, "loss": 0.465, "mean_token_accuracy": 0.8498197197914124, "num_tokens": 487184848.0, "step": 3057 }, { "epoch": 1.5554425228891149, "grad_norm": 1.0208553075790405, "learning_rate": 1e-05, "loss": 0.5141, "mean_token_accuracy": 0.8400205373764038, "num_tokens": 487349819.0, "step": 3058 }, { "epoch": 1.5559511698880977, "grad_norm": 1.0137851238250732, "learning_rate": 1e-05, "loss": 0.5014, "mean_token_accuracy": 0.8405393362045288, "num_tokens": 487509643.0, "step": 3059 }, { "epoch": 1.5564598168870805, "grad_norm": 0.9738591313362122, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8491359353065491, "num_tokens": 487662393.0, "step": 3060 }, { "epoch": 1.556968463886063, "grad_norm": 0.9042809009552002, "learning_rate": 1e-05, "loss": 0.501, "mean_token_accuracy": 0.8427039384841919, "num_tokens": 487826775.0, "step": 3061 }, { "epoch": 1.5574771108850458, "grad_norm": 1.1072074174880981, "learning_rate": 1e-05, "loss": 0.4798, "mean_token_accuracy": 0.8482124209403992, "num_tokens": 487985726.0, "step": 3062 }, { "epoch": 1.5579857578840284, "grad_norm": 0.9234567880630493, "learning_rate": 1e-05, "loss": 0.4363, "mean_token_accuracy": 0.8603277802467346, "num_tokens": 488154603.0, "step": 3063 }, { "epoch": 1.5584944048830112, "grad_norm": 0.9251611828804016, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8493567109107971, "num_tokens": 488319919.0, "step": 3064 }, { "epoch": 1.559003051881994, "grad_norm": 0.9729562401771545, "learning_rate": 1e-05, "loss": 0.4396, "mean_token_accuracy": 0.8612502813339233, "num_tokens": 488472353.0, "step": 3065 }, { "epoch": 1.5595116988809767, "grad_norm": 0.9432415962219238, "learning_rate": 1e-05, "loss": 0.5015, "mean_token_accuracy": 0.8404766321182251, "num_tokens": 488630649.0, "step": 3066 }, { "epoch": 1.5600203458799593, "grad_norm": 0.9517757892608643, "learning_rate": 1e-05, "loss": 0.5058, "mean_token_accuracy": 0.8391520380973816, "num_tokens": 488786552.0, "step": 3067 }, { "epoch": 1.5605289928789419, "grad_norm": 0.9881300330162048, "learning_rate": 1e-05, "loss": 0.4396, "mean_token_accuracy": 0.8583213090896606, "num_tokens": 488932053.0, "step": 3068 }, { "epoch": 1.5610376398779247, "grad_norm": 1.0126222372055054, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8509069681167603, "num_tokens": 489084924.0, "step": 3069 }, { "epoch": 1.5615462868769074, "grad_norm": 0.9657802581787109, "learning_rate": 1e-05, "loss": 0.4812, "mean_token_accuracy": 0.8476794958114624, "num_tokens": 489250757.0, "step": 3070 }, { "epoch": 1.5620549338758902, "grad_norm": 0.9316444396972656, "learning_rate": 1e-05, "loss": 0.463, "mean_token_accuracy": 0.851240873336792, "num_tokens": 489413258.0, "step": 3071 }, { "epoch": 1.5625635808748728, "grad_norm": 1.0126928091049194, "learning_rate": 1e-05, "loss": 0.5092, "mean_token_accuracy": 0.8393445014953613, "num_tokens": 489567664.0, "step": 3072 }, { "epoch": 1.5630722278738556, "grad_norm": 0.9944961667060852, "learning_rate": 1e-05, "loss": 0.4904, "mean_token_accuracy": 0.8438583612442017, "num_tokens": 489723061.0, "step": 3073 }, { "epoch": 1.5635808748728381, "grad_norm": 0.9280596971511841, "learning_rate": 1e-05, "loss": 0.4354, "mean_token_accuracy": 0.8593460321426392, "num_tokens": 489874451.0, "step": 3074 }, { "epoch": 1.564089521871821, "grad_norm": 1.079498291015625, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8525861501693726, "num_tokens": 490036501.0, "step": 3075 }, { "epoch": 1.5645981688708037, "grad_norm": 0.9194795489311218, "learning_rate": 1e-05, "loss": 0.4691, "mean_token_accuracy": 0.8513813018798828, "num_tokens": 490204909.0, "step": 3076 }, { "epoch": 1.5651068158697865, "grad_norm": 1.0592505931854248, "learning_rate": 1e-05, "loss": 0.4855, "mean_token_accuracy": 0.8467689752578735, "num_tokens": 490356463.0, "step": 3077 }, { "epoch": 1.565615462868769, "grad_norm": 0.9179127216339111, "learning_rate": 1e-05, "loss": 0.489, "mean_token_accuracy": 0.8445532321929932, "num_tokens": 490514531.0, "step": 3078 }, { "epoch": 1.5661241098677516, "grad_norm": 0.9455628991127014, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8507919311523438, "num_tokens": 490685759.0, "step": 3079 }, { "epoch": 1.5666327568667344, "grad_norm": 0.891564130783081, "learning_rate": 1e-05, "loss": 0.4763, "mean_token_accuracy": 0.8482155799865723, "num_tokens": 490845055.0, "step": 3080 }, { "epoch": 1.5671414038657172, "grad_norm": 0.9942660927772522, "learning_rate": 1e-05, "loss": 0.4928, "mean_token_accuracy": 0.8423429727554321, "num_tokens": 491001535.0, "step": 3081 }, { "epoch": 1.5676500508647, "grad_norm": 0.9702527523040771, "learning_rate": 1e-05, "loss": 0.5115, "mean_token_accuracy": 0.8396128416061401, "num_tokens": 491152661.0, "step": 3082 }, { "epoch": 1.5681586978636826, "grad_norm": 0.9330050349235535, "learning_rate": 1e-05, "loss": 0.4752, "mean_token_accuracy": 0.8495821356773376, "num_tokens": 491305291.0, "step": 3083 }, { "epoch": 1.5686673448626653, "grad_norm": 0.9987598061561584, "learning_rate": 1e-05, "loss": 0.4681, "mean_token_accuracy": 0.8500514030456543, "num_tokens": 491455713.0, "step": 3084 }, { "epoch": 1.569175991861648, "grad_norm": 0.9497177600860596, "learning_rate": 1e-05, "loss": 0.445, "mean_token_accuracy": 0.857146143913269, "num_tokens": 491607483.0, "step": 3085 }, { "epoch": 1.5696846388606307, "grad_norm": 0.9814060926437378, "learning_rate": 1e-05, "loss": 0.472, "mean_token_accuracy": 0.8496817350387573, "num_tokens": 491772168.0, "step": 3086 }, { "epoch": 1.5701932858596135, "grad_norm": 0.9190930724143982, "learning_rate": 1e-05, "loss": 0.4644, "mean_token_accuracy": 0.8526386618614197, "num_tokens": 491931085.0, "step": 3087 }, { "epoch": 1.5707019328585963, "grad_norm": 0.9814797639846802, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8529306650161743, "num_tokens": 492088615.0, "step": 3088 }, { "epoch": 1.5712105798575788, "grad_norm": 0.9221906065940857, "learning_rate": 1e-05, "loss": 0.4398, "mean_token_accuracy": 0.8584678769111633, "num_tokens": 492240833.0, "step": 3089 }, { "epoch": 1.5717192268565614, "grad_norm": 0.9911572337150574, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8544842004776001, "num_tokens": 492397349.0, "step": 3090 }, { "epoch": 1.5722278738555442, "grad_norm": 0.9610999226570129, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8520728945732117, "num_tokens": 492556148.0, "step": 3091 }, { "epoch": 1.572736520854527, "grad_norm": 0.9577346444129944, "learning_rate": 1e-05, "loss": 0.4795, "mean_token_accuracy": 0.8496525287628174, "num_tokens": 492713338.0, "step": 3092 }, { "epoch": 1.5732451678535098, "grad_norm": 0.9709999561309814, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.848212718963623, "num_tokens": 492861776.0, "step": 3093 }, { "epoch": 1.5737538148524923, "grad_norm": 0.9600496292114258, "learning_rate": 1e-05, "loss": 0.4566, "mean_token_accuracy": 0.853322446346283, "num_tokens": 493019564.0, "step": 3094 }, { "epoch": 1.5742624618514751, "grad_norm": 0.8980912566184998, "learning_rate": 1e-05, "loss": 0.4642, "mean_token_accuracy": 0.8510512113571167, "num_tokens": 493172976.0, "step": 3095 }, { "epoch": 1.5747711088504577, "grad_norm": 0.896973729133606, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8487134575843811, "num_tokens": 493335981.0, "step": 3096 }, { "epoch": 1.5752797558494405, "grad_norm": 0.9582679867744446, "learning_rate": 1e-05, "loss": 0.4489, "mean_token_accuracy": 0.8563022613525391, "num_tokens": 493497636.0, "step": 3097 }, { "epoch": 1.5757884028484233, "grad_norm": 0.9179272055625916, "learning_rate": 1e-05, "loss": 0.4587, "mean_token_accuracy": 0.85282301902771, "num_tokens": 493652983.0, "step": 3098 }, { "epoch": 1.576297049847406, "grad_norm": 0.9353819489479065, "learning_rate": 1e-05, "loss": 0.4602, "mean_token_accuracy": 0.8537269234657288, "num_tokens": 493819335.0, "step": 3099 }, { "epoch": 1.5768056968463886, "grad_norm": 1.002805233001709, "learning_rate": 1e-05, "loss": 0.465, "mean_token_accuracy": 0.8521015644073486, "num_tokens": 493983683.0, "step": 3100 }, { "epoch": 1.5773143438453712, "grad_norm": 1.0276256799697876, "learning_rate": 1e-05, "loss": 0.4521, "mean_token_accuracy": 0.8568700551986694, "num_tokens": 494126765.0, "step": 3101 }, { "epoch": 1.577822990844354, "grad_norm": 1.151731252670288, "learning_rate": 1e-05, "loss": 0.5269, "mean_token_accuracy": 0.8336443901062012, "num_tokens": 494296649.0, "step": 3102 }, { "epoch": 1.5783316378433367, "grad_norm": 0.9901059865951538, "learning_rate": 1e-05, "loss": 0.487, "mean_token_accuracy": 0.8451051712036133, "num_tokens": 494456286.0, "step": 3103 }, { "epoch": 1.5788402848423195, "grad_norm": 0.9622817039489746, "learning_rate": 1e-05, "loss": 0.4659, "mean_token_accuracy": 0.8511883020401001, "num_tokens": 494612800.0, "step": 3104 }, { "epoch": 1.579348931841302, "grad_norm": 0.9548298120498657, "learning_rate": 1e-05, "loss": 0.4726, "mean_token_accuracy": 0.8519192337989807, "num_tokens": 494766468.0, "step": 3105 }, { "epoch": 1.5798575788402849, "grad_norm": 0.979957103729248, "learning_rate": 1e-05, "loss": 0.4711, "mean_token_accuracy": 0.8517515659332275, "num_tokens": 494920941.0, "step": 3106 }, { "epoch": 1.5803662258392674, "grad_norm": 0.903709352016449, "learning_rate": 1e-05, "loss": 0.4501, "mean_token_accuracy": 0.8568116426467896, "num_tokens": 495068008.0, "step": 3107 }, { "epoch": 1.5808748728382502, "grad_norm": 0.8897936344146729, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8548612594604492, "num_tokens": 495234703.0, "step": 3108 }, { "epoch": 1.581383519837233, "grad_norm": 0.9432336091995239, "learning_rate": 1e-05, "loss": 0.4619, "mean_token_accuracy": 0.8530568480491638, "num_tokens": 495393286.0, "step": 3109 }, { "epoch": 1.5818921668362158, "grad_norm": 0.9855477809906006, "learning_rate": 1e-05, "loss": 0.4629, "mean_token_accuracy": 0.8511521220207214, "num_tokens": 495546408.0, "step": 3110 }, { "epoch": 1.5824008138351984, "grad_norm": 0.9574191570281982, "learning_rate": 1e-05, "loss": 0.4921, "mean_token_accuracy": 0.8433710336685181, "num_tokens": 495707039.0, "step": 3111 }, { "epoch": 1.582909460834181, "grad_norm": 1.2597713470458984, "learning_rate": 1e-05, "loss": 0.487, "mean_token_accuracy": 0.8451650142669678, "num_tokens": 495873689.0, "step": 3112 }, { "epoch": 1.5834181078331637, "grad_norm": 0.9632464051246643, "learning_rate": 1e-05, "loss": 0.4677, "mean_token_accuracy": 0.8511648178100586, "num_tokens": 496043972.0, "step": 3113 }, { "epoch": 1.5839267548321465, "grad_norm": 0.8824676871299744, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8504089713096619, "num_tokens": 496200993.0, "step": 3114 }, { "epoch": 1.5844354018311293, "grad_norm": 0.9932284355163574, "learning_rate": 1e-05, "loss": 0.4885, "mean_token_accuracy": 0.8445684313774109, "num_tokens": 496361750.0, "step": 3115 }, { "epoch": 1.5849440488301119, "grad_norm": 0.9311143159866333, "learning_rate": 1e-05, "loss": 0.454, "mean_token_accuracy": 0.8534303903579712, "num_tokens": 496528159.0, "step": 3116 }, { "epoch": 1.5854526958290946, "grad_norm": 0.9174624085426331, "learning_rate": 1e-05, "loss": 0.4881, "mean_token_accuracy": 0.845794141292572, "num_tokens": 496683804.0, "step": 3117 }, { "epoch": 1.5859613428280772, "grad_norm": 0.9836673736572266, "learning_rate": 1e-05, "loss": 0.4607, "mean_token_accuracy": 0.8526643514633179, "num_tokens": 496842977.0, "step": 3118 }, { "epoch": 1.58646998982706, "grad_norm": 0.9321286082267761, "learning_rate": 1e-05, "loss": 0.4901, "mean_token_accuracy": 0.8443841338157654, "num_tokens": 496996452.0, "step": 3119 }, { "epoch": 1.5869786368260428, "grad_norm": 0.9813646674156189, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8516721129417419, "num_tokens": 497141795.0, "step": 3120 }, { "epoch": 1.5874872838250256, "grad_norm": 0.9760952591896057, "learning_rate": 1e-05, "loss": 0.4914, "mean_token_accuracy": 0.8431864976882935, "num_tokens": 497294276.0, "step": 3121 }, { "epoch": 1.5879959308240081, "grad_norm": 0.9025095701217651, "learning_rate": 1e-05, "loss": 0.4497, "mean_token_accuracy": 0.8556056022644043, "num_tokens": 497453652.0, "step": 3122 }, { "epoch": 1.5885045778229907, "grad_norm": 0.8846112489700317, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8493298888206482, "num_tokens": 497617124.0, "step": 3123 }, { "epoch": 1.5890132248219735, "grad_norm": 0.9269668459892273, "learning_rate": 1e-05, "loss": 0.4815, "mean_token_accuracy": 0.8469797372817993, "num_tokens": 497784575.0, "step": 3124 }, { "epoch": 1.5895218718209563, "grad_norm": 0.9350006580352783, "learning_rate": 1e-05, "loss": 0.4618, "mean_token_accuracy": 0.8537191152572632, "num_tokens": 497936438.0, "step": 3125 }, { "epoch": 1.590030518819939, "grad_norm": 0.8660092949867249, "learning_rate": 1e-05, "loss": 0.4664, "mean_token_accuracy": 0.8507699966430664, "num_tokens": 498096276.0, "step": 3126 }, { "epoch": 1.5905391658189216, "grad_norm": 0.9839761853218079, "learning_rate": 1e-05, "loss": 0.4714, "mean_token_accuracy": 0.8495392799377441, "num_tokens": 498259727.0, "step": 3127 }, { "epoch": 1.5910478128179044, "grad_norm": 0.9081756472587585, "learning_rate": 1e-05, "loss": 0.4701, "mean_token_accuracy": 0.8499031066894531, "num_tokens": 498422502.0, "step": 3128 }, { "epoch": 1.591556459816887, "grad_norm": 0.9107279181480408, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8488689064979553, "num_tokens": 498583191.0, "step": 3129 }, { "epoch": 1.5920651068158698, "grad_norm": 1.0051671266555786, "learning_rate": 1e-05, "loss": 0.448, "mean_token_accuracy": 0.856336236000061, "num_tokens": 498743572.0, "step": 3130 }, { "epoch": 1.5925737538148526, "grad_norm": 1.0061246156692505, "learning_rate": 1e-05, "loss": 0.4886, "mean_token_accuracy": 0.8462451696395874, "num_tokens": 498892569.0, "step": 3131 }, { "epoch": 1.5930824008138353, "grad_norm": 0.925137460231781, "learning_rate": 1e-05, "loss": 0.4897, "mean_token_accuracy": 0.8452982902526855, "num_tokens": 499051577.0, "step": 3132 }, { "epoch": 1.593591047812818, "grad_norm": 1.0042551755905151, "learning_rate": 1e-05, "loss": 0.4804, "mean_token_accuracy": 0.8471189141273499, "num_tokens": 499199085.0, "step": 3133 }, { "epoch": 1.5940996948118005, "grad_norm": 0.9724690318107605, "learning_rate": 1e-05, "loss": 0.4791, "mean_token_accuracy": 0.8482787013053894, "num_tokens": 499360910.0, "step": 3134 }, { "epoch": 1.5946083418107833, "grad_norm": 0.8974447846412659, "learning_rate": 1e-05, "loss": 0.4493, "mean_token_accuracy": 0.8557643890380859, "num_tokens": 499520756.0, "step": 3135 }, { "epoch": 1.595116988809766, "grad_norm": 0.943656861782074, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8480054140090942, "num_tokens": 499675969.0, "step": 3136 }, { "epoch": 1.5956256358087488, "grad_norm": 0.9828064441680908, "learning_rate": 1e-05, "loss": 0.4798, "mean_token_accuracy": 0.8466557264328003, "num_tokens": 499838062.0, "step": 3137 }, { "epoch": 1.5961342828077314, "grad_norm": 0.867965817451477, "learning_rate": 1e-05, "loss": 0.4383, "mean_token_accuracy": 0.8599332571029663, "num_tokens": 499988817.0, "step": 3138 }, { "epoch": 1.5966429298067142, "grad_norm": 0.9451082944869995, "learning_rate": 1e-05, "loss": 0.4682, "mean_token_accuracy": 0.8521145582199097, "num_tokens": 500156820.0, "step": 3139 }, { "epoch": 1.5971515768056967, "grad_norm": 0.9329448938369751, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.8403822183609009, "num_tokens": 500312578.0, "step": 3140 }, { "epoch": 1.5976602238046795, "grad_norm": 0.9193881750106812, "learning_rate": 1e-05, "loss": 0.4543, "mean_token_accuracy": 0.8545393943786621, "num_tokens": 500465103.0, "step": 3141 }, { "epoch": 1.5981688708036623, "grad_norm": 0.9008822441101074, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8494077920913696, "num_tokens": 500631973.0, "step": 3142 }, { "epoch": 1.598677517802645, "grad_norm": 0.937802791595459, "learning_rate": 1e-05, "loss": 0.4552, "mean_token_accuracy": 0.8539186716079712, "num_tokens": 500791918.0, "step": 3143 }, { "epoch": 1.5991861648016277, "grad_norm": 2.2799768447875977, "learning_rate": 1e-05, "loss": 0.4847, "mean_token_accuracy": 0.8453787565231323, "num_tokens": 500955095.0, "step": 3144 }, { "epoch": 1.5996948118006102, "grad_norm": 1.041445016860962, "learning_rate": 1e-05, "loss": 0.4926, "mean_token_accuracy": 0.8438977599143982, "num_tokens": 501118364.0, "step": 3145 }, { "epoch": 1.600203458799593, "grad_norm": 0.9780532121658325, "learning_rate": 1e-05, "loss": 0.4739, "mean_token_accuracy": 0.849367618560791, "num_tokens": 501271088.0, "step": 3146 }, { "epoch": 1.6007121057985758, "grad_norm": 0.8955594301223755, "learning_rate": 1e-05, "loss": 0.4717, "mean_token_accuracy": 0.8512964248657227, "num_tokens": 501434010.0, "step": 3147 }, { "epoch": 1.6012207527975586, "grad_norm": 1.0110737085342407, "learning_rate": 1e-05, "loss": 0.453, "mean_token_accuracy": 0.8546295762062073, "num_tokens": 501573475.0, "step": 3148 }, { "epoch": 1.6017293997965412, "grad_norm": 1.03841233253479, "learning_rate": 1e-05, "loss": 0.4524, "mean_token_accuracy": 0.8547658920288086, "num_tokens": 501727861.0, "step": 3149 }, { "epoch": 1.602238046795524, "grad_norm": 0.9409387111663818, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8487837314605713, "num_tokens": 501887647.0, "step": 3150 }, { "epoch": 1.6027466937945065, "grad_norm": 1.0022470951080322, "learning_rate": 1e-05, "loss": 0.4441, "mean_token_accuracy": 0.8576607704162598, "num_tokens": 502035810.0, "step": 3151 }, { "epoch": 1.6032553407934893, "grad_norm": 0.9948771595954895, "learning_rate": 1e-05, "loss": 0.4761, "mean_token_accuracy": 0.8488028645515442, "num_tokens": 502192904.0, "step": 3152 }, { "epoch": 1.603763987792472, "grad_norm": 1.203999400138855, "learning_rate": 1e-05, "loss": 0.444, "mean_token_accuracy": 0.8586881160736084, "num_tokens": 502349122.0, "step": 3153 }, { "epoch": 1.6042726347914549, "grad_norm": 0.9819480776786804, "learning_rate": 1e-05, "loss": 0.486, "mean_token_accuracy": 0.8465654850006104, "num_tokens": 502505274.0, "step": 3154 }, { "epoch": 1.6047812817904374, "grad_norm": 1.0000137090682983, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.8373558521270752, "num_tokens": 502661135.0, "step": 3155 }, { "epoch": 1.60528992878942, "grad_norm": 0.9279018044471741, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.8506321310997009, "num_tokens": 502817024.0, "step": 3156 }, { "epoch": 1.6057985757884028, "grad_norm": 1.0391238927841187, "learning_rate": 1e-05, "loss": 0.4841, "mean_token_accuracy": 0.8479592800140381, "num_tokens": 502985732.0, "step": 3157 }, { "epoch": 1.6063072227873856, "grad_norm": 0.9349561929702759, "learning_rate": 1e-05, "loss": 0.5151, "mean_token_accuracy": 0.8382826447486877, "num_tokens": 503156030.0, "step": 3158 }, { "epoch": 1.6068158697863684, "grad_norm": 0.9930242896080017, "learning_rate": 1e-05, "loss": 0.4962, "mean_token_accuracy": 0.8423037528991699, "num_tokens": 503304262.0, "step": 3159 }, { "epoch": 1.607324516785351, "grad_norm": 0.8912495970726013, "learning_rate": 1e-05, "loss": 0.4212, "mean_token_accuracy": 0.8649030327796936, "num_tokens": 503459669.0, "step": 3160 }, { "epoch": 1.6078331637843337, "grad_norm": 0.9261059165000916, "learning_rate": 1e-05, "loss": 0.4589, "mean_token_accuracy": 0.8540016412734985, "num_tokens": 503610334.0, "step": 3161 }, { "epoch": 1.6083418107833163, "grad_norm": 0.9560903310775757, "learning_rate": 1e-05, "loss": 0.4734, "mean_token_accuracy": 0.8490592837333679, "num_tokens": 503766380.0, "step": 3162 }, { "epoch": 1.608850457782299, "grad_norm": 0.9605829119682312, "learning_rate": 1e-05, "loss": 0.4962, "mean_token_accuracy": 0.8422528505325317, "num_tokens": 503917967.0, "step": 3163 }, { "epoch": 1.6093591047812819, "grad_norm": 0.9508742690086365, "learning_rate": 1e-05, "loss": 0.4961, "mean_token_accuracy": 0.8426268100738525, "num_tokens": 504077350.0, "step": 3164 }, { "epoch": 1.6098677517802646, "grad_norm": 0.9653372764587402, "learning_rate": 1e-05, "loss": 0.4481, "mean_token_accuracy": 0.8559508323669434, "num_tokens": 504239254.0, "step": 3165 }, { "epoch": 1.6103763987792472, "grad_norm": 0.9955853223800659, "learning_rate": 1e-05, "loss": 0.4737, "mean_token_accuracy": 0.8484251499176025, "num_tokens": 504396848.0, "step": 3166 }, { "epoch": 1.6108850457782298, "grad_norm": 0.9986701011657715, "learning_rate": 1e-05, "loss": 0.4809, "mean_token_accuracy": 0.8472124338150024, "num_tokens": 504548510.0, "step": 3167 }, { "epoch": 1.6113936927772126, "grad_norm": 1.056422233581543, "learning_rate": 1e-05, "loss": 0.4313, "mean_token_accuracy": 0.861392617225647, "num_tokens": 504709088.0, "step": 3168 }, { "epoch": 1.6119023397761953, "grad_norm": 0.9725289940834045, "learning_rate": 1e-05, "loss": 0.4898, "mean_token_accuracy": 0.844748318195343, "num_tokens": 504861511.0, "step": 3169 }, { "epoch": 1.6124109867751781, "grad_norm": 0.889563262462616, "learning_rate": 1e-05, "loss": 0.4743, "mean_token_accuracy": 0.8478610515594482, "num_tokens": 505040673.0, "step": 3170 }, { "epoch": 1.6129196337741607, "grad_norm": 1.020880937576294, "learning_rate": 1e-05, "loss": 0.4616, "mean_token_accuracy": 0.8515779972076416, "num_tokens": 505192544.0, "step": 3171 }, { "epoch": 1.6134282807731435, "grad_norm": 0.9151832461357117, "learning_rate": 1e-05, "loss": 0.4703, "mean_token_accuracy": 0.8499784469604492, "num_tokens": 505369618.0, "step": 3172 }, { "epoch": 1.613936927772126, "grad_norm": 1.049196720123291, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.856555163860321, "num_tokens": 505522912.0, "step": 3173 }, { "epoch": 1.6144455747711088, "grad_norm": 0.9513052701950073, "learning_rate": 1e-05, "loss": 0.4705, "mean_token_accuracy": 0.8521981239318848, "num_tokens": 505672795.0, "step": 3174 }, { "epoch": 1.6149542217700916, "grad_norm": 1.0559369325637817, "learning_rate": 1e-05, "loss": 0.4628, "mean_token_accuracy": 0.853662371635437, "num_tokens": 505848647.0, "step": 3175 }, { "epoch": 1.6154628687690744, "grad_norm": 1.0654804706573486, "learning_rate": 1e-05, "loss": 0.4785, "mean_token_accuracy": 0.8477578163146973, "num_tokens": 506006904.0, "step": 3176 }, { "epoch": 1.615971515768057, "grad_norm": 1.124934196472168, "learning_rate": 1e-05, "loss": 0.4498, "mean_token_accuracy": 0.8563681840896606, "num_tokens": 506170327.0, "step": 3177 }, { "epoch": 1.6164801627670395, "grad_norm": 1.0646849870681763, "learning_rate": 1e-05, "loss": 0.4437, "mean_token_accuracy": 0.8576815128326416, "num_tokens": 506328403.0, "step": 3178 }, { "epoch": 1.6169888097660223, "grad_norm": 1.1809217929840088, "learning_rate": 1e-05, "loss": 0.4721, "mean_token_accuracy": 0.8506449460983276, "num_tokens": 506476733.0, "step": 3179 }, { "epoch": 1.6174974567650051, "grad_norm": 1.1678529977798462, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8535448312759399, "num_tokens": 506646322.0, "step": 3180 }, { "epoch": 1.618006103763988, "grad_norm": 0.9654884934425354, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.8569132089614868, "num_tokens": 506800989.0, "step": 3181 }, { "epoch": 1.6185147507629705, "grad_norm": 1.0711699724197388, "learning_rate": 1e-05, "loss": 0.4402, "mean_token_accuracy": 0.8574779033660889, "num_tokens": 506963590.0, "step": 3182 }, { "epoch": 1.6190233977619533, "grad_norm": 0.9858824610710144, "learning_rate": 1e-05, "loss": 0.4563, "mean_token_accuracy": 0.8534616231918335, "num_tokens": 507110698.0, "step": 3183 }, { "epoch": 1.6195320447609358, "grad_norm": 1.0069845914840698, "learning_rate": 1e-05, "loss": 0.4589, "mean_token_accuracy": 0.8537305593490601, "num_tokens": 507261633.0, "step": 3184 }, { "epoch": 1.6200406917599186, "grad_norm": 1.1432651281356812, "learning_rate": 1e-05, "loss": 0.4445, "mean_token_accuracy": 0.856499195098877, "num_tokens": 507422355.0, "step": 3185 }, { "epoch": 1.6205493387589014, "grad_norm": 0.9364808201789856, "learning_rate": 1e-05, "loss": 0.4864, "mean_token_accuracy": 0.8467901945114136, "num_tokens": 507586130.0, "step": 3186 }, { "epoch": 1.6210579857578842, "grad_norm": 1.058851957321167, "learning_rate": 1e-05, "loss": 0.4884, "mean_token_accuracy": 0.8438219428062439, "num_tokens": 507730780.0, "step": 3187 }, { "epoch": 1.6215666327568667, "grad_norm": 1.098166823387146, "learning_rate": 1e-05, "loss": 0.4592, "mean_token_accuracy": 0.8547759056091309, "num_tokens": 507895824.0, "step": 3188 }, { "epoch": 1.6220752797558493, "grad_norm": 1.0428221225738525, "learning_rate": 1e-05, "loss": 0.4798, "mean_token_accuracy": 0.8461632132530212, "num_tokens": 508059047.0, "step": 3189 }, { "epoch": 1.622583926754832, "grad_norm": 1.0780528783798218, "learning_rate": 1e-05, "loss": 0.4984, "mean_token_accuracy": 0.8422732353210449, "num_tokens": 508213579.0, "step": 3190 }, { "epoch": 1.6230925737538149, "grad_norm": 0.9325408935546875, "learning_rate": 1e-05, "loss": 0.4898, "mean_token_accuracy": 0.8447512984275818, "num_tokens": 508362321.0, "step": 3191 }, { "epoch": 1.6236012207527977, "grad_norm": 1.1149005889892578, "learning_rate": 1e-05, "loss": 0.4461, "mean_token_accuracy": 0.8572701215744019, "num_tokens": 508527614.0, "step": 3192 }, { "epoch": 1.6241098677517802, "grad_norm": 1.0653656721115112, "learning_rate": 1e-05, "loss": 0.4562, "mean_token_accuracy": 0.8521193265914917, "num_tokens": 508675165.0, "step": 3193 }, { "epoch": 1.624618514750763, "grad_norm": 1.1204813718795776, "learning_rate": 1e-05, "loss": 0.491, "mean_token_accuracy": 0.8450343012809753, "num_tokens": 508844166.0, "step": 3194 }, { "epoch": 1.6251271617497456, "grad_norm": 1.0192406177520752, "learning_rate": 1e-05, "loss": 0.4668, "mean_token_accuracy": 0.8499091267585754, "num_tokens": 508999496.0, "step": 3195 }, { "epoch": 1.6256358087487284, "grad_norm": 1.937743067741394, "learning_rate": 1e-05, "loss": 0.4516, "mean_token_accuracy": 0.854866623878479, "num_tokens": 509153388.0, "step": 3196 }, { "epoch": 1.6261444557477112, "grad_norm": 1.1351182460784912, "learning_rate": 1e-05, "loss": 0.4592, "mean_token_accuracy": 0.8531141877174377, "num_tokens": 509320466.0, "step": 3197 }, { "epoch": 1.626653102746694, "grad_norm": 0.9390095472335815, "learning_rate": 1e-05, "loss": 0.4827, "mean_token_accuracy": 0.8455871343612671, "num_tokens": 509478651.0, "step": 3198 }, { "epoch": 1.6271617497456765, "grad_norm": 0.9299654960632324, "learning_rate": 1e-05, "loss": 0.4765, "mean_token_accuracy": 0.8481392860412598, "num_tokens": 509632225.0, "step": 3199 }, { "epoch": 1.627670396744659, "grad_norm": 1.0026236772537231, "learning_rate": 1e-05, "loss": 0.4672, "mean_token_accuracy": 0.8508972525596619, "num_tokens": 509789533.0, "step": 3200 }, { "epoch": 1.6281790437436419, "grad_norm": 0.8967097401618958, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.849647045135498, "num_tokens": 509956113.0, "step": 3201 }, { "epoch": 1.6286876907426246, "grad_norm": 1.0988428592681885, "learning_rate": 1e-05, "loss": 0.458, "mean_token_accuracy": 0.853933572769165, "num_tokens": 510127406.0, "step": 3202 }, { "epoch": 1.6291963377416074, "grad_norm": 0.9219009876251221, "learning_rate": 1e-05, "loss": 0.4392, "mean_token_accuracy": 0.8581788539886475, "num_tokens": 510279425.0, "step": 3203 }, { "epoch": 1.62970498474059, "grad_norm": 1.0581265687942505, "learning_rate": 1e-05, "loss": 0.4592, "mean_token_accuracy": 0.852953314781189, "num_tokens": 510431387.0, "step": 3204 }, { "epoch": 1.6302136317395728, "grad_norm": 0.9527878761291504, "learning_rate": 1e-05, "loss": 0.4769, "mean_token_accuracy": 0.8487293720245361, "num_tokens": 510592010.0, "step": 3205 }, { "epoch": 1.6307222787385554, "grad_norm": 0.8919398188591003, "learning_rate": 1e-05, "loss": 0.496, "mean_token_accuracy": 0.8447635769844055, "num_tokens": 510762803.0, "step": 3206 }, { "epoch": 1.6312309257375381, "grad_norm": 0.9669695496559143, "learning_rate": 1e-05, "loss": 0.4622, "mean_token_accuracy": 0.8502144813537598, "num_tokens": 510931494.0, "step": 3207 }, { "epoch": 1.631739572736521, "grad_norm": 1.350732684135437, "learning_rate": 1e-05, "loss": 0.512, "mean_token_accuracy": 0.8376517295837402, "num_tokens": 511087806.0, "step": 3208 }, { "epoch": 1.6322482197355037, "grad_norm": 1.014112949371338, "learning_rate": 1e-05, "loss": 0.4787, "mean_token_accuracy": 0.8486483693122864, "num_tokens": 511250013.0, "step": 3209 }, { "epoch": 1.6327568667344863, "grad_norm": 0.9049482941627502, "learning_rate": 1e-05, "loss": 0.4982, "mean_token_accuracy": 0.8427013158798218, "num_tokens": 511413057.0, "step": 3210 }, { "epoch": 1.6332655137334688, "grad_norm": 0.9740550518035889, "learning_rate": 1e-05, "loss": 0.45, "mean_token_accuracy": 0.8555504679679871, "num_tokens": 511569137.0, "step": 3211 }, { "epoch": 1.6337741607324516, "grad_norm": 0.9177180528640747, "learning_rate": 1e-05, "loss": 0.4316, "mean_token_accuracy": 0.8619349002838135, "num_tokens": 511722415.0, "step": 3212 }, { "epoch": 1.6342828077314344, "grad_norm": 0.8687677383422852, "learning_rate": 1e-05, "loss": 0.4424, "mean_token_accuracy": 0.8564339876174927, "num_tokens": 511879049.0, "step": 3213 }, { "epoch": 1.6347914547304172, "grad_norm": 1.2000905275344849, "learning_rate": 1e-05, "loss": 0.4566, "mean_token_accuracy": 0.8552175760269165, "num_tokens": 512040496.0, "step": 3214 }, { "epoch": 1.6353001017293998, "grad_norm": 0.9453958868980408, "learning_rate": 1e-05, "loss": 0.4675, "mean_token_accuracy": 0.8500424027442932, "num_tokens": 512186456.0, "step": 3215 }, { "epoch": 1.6358087487283826, "grad_norm": 1.0896542072296143, "learning_rate": 1e-05, "loss": 0.4889, "mean_token_accuracy": 0.8457797765731812, "num_tokens": 512348538.0, "step": 3216 }, { "epoch": 1.6363173957273651, "grad_norm": 1.0226482152938843, "learning_rate": 1e-05, "loss": 0.4726, "mean_token_accuracy": 0.8481237292289734, "num_tokens": 512510611.0, "step": 3217 }, { "epoch": 1.636826042726348, "grad_norm": 0.9434159994125366, "learning_rate": 1e-05, "loss": 0.4704, "mean_token_accuracy": 0.8490217924118042, "num_tokens": 512672571.0, "step": 3218 }, { "epoch": 1.6373346897253307, "grad_norm": 0.9946489334106445, "learning_rate": 1e-05, "loss": 0.4948, "mean_token_accuracy": 0.8430628776550293, "num_tokens": 512827563.0, "step": 3219 }, { "epoch": 1.6378433367243135, "grad_norm": 0.9760912656784058, "learning_rate": 1e-05, "loss": 0.4492, "mean_token_accuracy": 0.8558773994445801, "num_tokens": 512983469.0, "step": 3220 }, { "epoch": 1.638351983723296, "grad_norm": 0.9320849180221558, "learning_rate": 1e-05, "loss": 0.4554, "mean_token_accuracy": 0.8541078567504883, "num_tokens": 513142429.0, "step": 3221 }, { "epoch": 1.6388606307222786, "grad_norm": 0.915862500667572, "learning_rate": 1e-05, "loss": 0.4616, "mean_token_accuracy": 0.8523277044296265, "num_tokens": 513304550.0, "step": 3222 }, { "epoch": 1.6393692777212614, "grad_norm": 0.8940544128417969, "learning_rate": 1e-05, "loss": 0.4753, "mean_token_accuracy": 0.8485328555107117, "num_tokens": 513477949.0, "step": 3223 }, { "epoch": 1.6398779247202442, "grad_norm": 0.927986204624176, "learning_rate": 1e-05, "loss": 0.4848, "mean_token_accuracy": 0.8451883792877197, "num_tokens": 513639647.0, "step": 3224 }, { "epoch": 1.640386571719227, "grad_norm": 0.8704172372817993, "learning_rate": 1e-05, "loss": 0.4726, "mean_token_accuracy": 0.8500398397445679, "num_tokens": 513797781.0, "step": 3225 }, { "epoch": 1.6408952187182095, "grad_norm": 0.9353987574577332, "learning_rate": 1e-05, "loss": 0.471, "mean_token_accuracy": 0.8504478931427002, "num_tokens": 513955189.0, "step": 3226 }, { "epoch": 1.6414038657171923, "grad_norm": 0.9797881841659546, "learning_rate": 1e-05, "loss": 0.4615, "mean_token_accuracy": 0.8517624139785767, "num_tokens": 514110474.0, "step": 3227 }, { "epoch": 1.6419125127161749, "grad_norm": 0.9477497339248657, "learning_rate": 1e-05, "loss": 0.472, "mean_token_accuracy": 0.850496768951416, "num_tokens": 514262184.0, "step": 3228 }, { "epoch": 1.6424211597151577, "grad_norm": 0.8949544429779053, "learning_rate": 1e-05, "loss": 0.4406, "mean_token_accuracy": 0.8600969314575195, "num_tokens": 514417324.0, "step": 3229 }, { "epoch": 1.6429298067141405, "grad_norm": 0.9569036364555359, "learning_rate": 1e-05, "loss": 0.4761, "mean_token_accuracy": 0.8493359088897705, "num_tokens": 514577426.0, "step": 3230 }, { "epoch": 1.6434384537131232, "grad_norm": 0.9404508471488953, "learning_rate": 1e-05, "loss": 0.4703, "mean_token_accuracy": 0.8487488031387329, "num_tokens": 514715226.0, "step": 3231 }, { "epoch": 1.6439471007121058, "grad_norm": 0.8887859582901001, "learning_rate": 1e-05, "loss": 0.46, "mean_token_accuracy": 0.8533378839492798, "num_tokens": 514882566.0, "step": 3232 }, { "epoch": 1.6444557477110884, "grad_norm": 0.9008356332778931, "learning_rate": 1e-05, "loss": 0.4308, "mean_token_accuracy": 0.8623565435409546, "num_tokens": 515038542.0, "step": 3233 }, { "epoch": 1.6449643947100712, "grad_norm": 0.9309217929840088, "learning_rate": 1e-05, "loss": 0.4989, "mean_token_accuracy": 0.8433068990707397, "num_tokens": 515212612.0, "step": 3234 }, { "epoch": 1.645473041709054, "grad_norm": 0.8963108658790588, "learning_rate": 1e-05, "loss": 0.4512, "mean_token_accuracy": 0.854758083820343, "num_tokens": 515364485.0, "step": 3235 }, { "epoch": 1.6459816887080367, "grad_norm": 0.9246181845664978, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8418968915939331, "num_tokens": 515525982.0, "step": 3236 }, { "epoch": 1.6464903357070193, "grad_norm": 0.8769820332527161, "learning_rate": 1e-05, "loss": 0.4497, "mean_token_accuracy": 0.8552553057670593, "num_tokens": 515695826.0, "step": 3237 }, { "epoch": 1.6469989827060019, "grad_norm": 0.9627645611763, "learning_rate": 1e-05, "loss": 0.4615, "mean_token_accuracy": 0.8528941869735718, "num_tokens": 515853344.0, "step": 3238 }, { "epoch": 1.6475076297049847, "grad_norm": 1.0333333015441895, "learning_rate": 1e-05, "loss": 0.4764, "mean_token_accuracy": 0.8480497598648071, "num_tokens": 516010142.0, "step": 3239 }, { "epoch": 1.6480162767039674, "grad_norm": 0.9454230666160583, "learning_rate": 1e-05, "loss": 0.4571, "mean_token_accuracy": 0.8532935976982117, "num_tokens": 516159473.0, "step": 3240 }, { "epoch": 1.6485249237029502, "grad_norm": 0.8893861174583435, "learning_rate": 1e-05, "loss": 0.4733, "mean_token_accuracy": 0.8496898412704468, "num_tokens": 516332297.0, "step": 3241 }, { "epoch": 1.649033570701933, "grad_norm": 0.9125193357467651, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.8416143655776978, "num_tokens": 516492142.0, "step": 3242 }, { "epoch": 1.6495422177009156, "grad_norm": 0.9185030460357666, "learning_rate": 1e-05, "loss": 0.4638, "mean_token_accuracy": 0.852362871170044, "num_tokens": 516646489.0, "step": 3243 }, { "epoch": 1.6500508646998981, "grad_norm": 0.9418681859970093, "learning_rate": 1e-05, "loss": 0.466, "mean_token_accuracy": 0.8507981300354004, "num_tokens": 516798136.0, "step": 3244 }, { "epoch": 1.650559511698881, "grad_norm": 0.980635941028595, "learning_rate": 1e-05, "loss": 0.4736, "mean_token_accuracy": 0.8493054509162903, "num_tokens": 516964061.0, "step": 3245 }, { "epoch": 1.6510681586978637, "grad_norm": 1.006500482559204, "learning_rate": 1e-05, "loss": 0.4793, "mean_token_accuracy": 0.8486887216567993, "num_tokens": 517108993.0, "step": 3246 }, { "epoch": 1.6515768056968465, "grad_norm": 0.9133172631263733, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8549003005027771, "num_tokens": 517276121.0, "step": 3247 }, { "epoch": 1.652085452695829, "grad_norm": 0.943731427192688, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8488215208053589, "num_tokens": 517439575.0, "step": 3248 }, { "epoch": 1.6525940996948116, "grad_norm": 1.0023247003555298, "learning_rate": 1e-05, "loss": 0.4772, "mean_token_accuracy": 0.8474256992340088, "num_tokens": 517594951.0, "step": 3249 }, { "epoch": 1.6531027466937944, "grad_norm": 0.919856607913971, "learning_rate": 1e-05, "loss": 0.4724, "mean_token_accuracy": 0.8480158448219299, "num_tokens": 517744319.0, "step": 3250 }, { "epoch": 1.6536113936927772, "grad_norm": 0.993774950504303, "learning_rate": 1e-05, "loss": 0.4821, "mean_token_accuracy": 0.8462665677070618, "num_tokens": 517908794.0, "step": 3251 }, { "epoch": 1.65412004069176, "grad_norm": 1.0048195123672485, "learning_rate": 1e-05, "loss": 0.4826, "mean_token_accuracy": 0.8465999364852905, "num_tokens": 518071406.0, "step": 3252 }, { "epoch": 1.6546286876907428, "grad_norm": 1.0032241344451904, "learning_rate": 1e-05, "loss": 0.488, "mean_token_accuracy": 0.8444687724113464, "num_tokens": 518229568.0, "step": 3253 }, { "epoch": 1.6551373346897253, "grad_norm": 0.9415081739425659, "learning_rate": 1e-05, "loss": 0.4766, "mean_token_accuracy": 0.8483941555023193, "num_tokens": 518389903.0, "step": 3254 }, { "epoch": 1.655645981688708, "grad_norm": 1.036271333694458, "learning_rate": 1e-05, "loss": 0.4758, "mean_token_accuracy": 0.84796142578125, "num_tokens": 518538304.0, "step": 3255 }, { "epoch": 1.6561546286876907, "grad_norm": 1.052514910697937, "learning_rate": 1e-05, "loss": 0.4604, "mean_token_accuracy": 0.8535677194595337, "num_tokens": 518689792.0, "step": 3256 }, { "epoch": 1.6566632756866735, "grad_norm": 1.0288665294647217, "learning_rate": 1e-05, "loss": 0.4699, "mean_token_accuracy": 0.8506037592887878, "num_tokens": 518829689.0, "step": 3257 }, { "epoch": 1.6571719226856563, "grad_norm": 1.1159483194351196, "learning_rate": 1e-05, "loss": 0.4446, "mean_token_accuracy": 0.8576152324676514, "num_tokens": 518982145.0, "step": 3258 }, { "epoch": 1.6576805696846388, "grad_norm": 0.9410718083381653, "learning_rate": 1e-05, "loss": 0.483, "mean_token_accuracy": 0.846442461013794, "num_tokens": 519145357.0, "step": 3259 }, { "epoch": 1.6581892166836214, "grad_norm": 1.0251357555389404, "learning_rate": 1e-05, "loss": 0.5004, "mean_token_accuracy": 0.8411550521850586, "num_tokens": 519307621.0, "step": 3260 }, { "epoch": 1.6586978636826042, "grad_norm": 0.9591336846351624, "learning_rate": 1e-05, "loss": 0.47, "mean_token_accuracy": 0.8503050804138184, "num_tokens": 519461973.0, "step": 3261 }, { "epoch": 1.659206510681587, "grad_norm": 0.9231889247894287, "learning_rate": 1e-05, "loss": 0.4767, "mean_token_accuracy": 0.8487023115158081, "num_tokens": 519626545.0, "step": 3262 }, { "epoch": 1.6597151576805698, "grad_norm": 1.0337306261062622, "learning_rate": 1e-05, "loss": 0.4793, "mean_token_accuracy": 0.8472877144813538, "num_tokens": 519790645.0, "step": 3263 }, { "epoch": 1.6602238046795526, "grad_norm": 1.428776741027832, "learning_rate": 1e-05, "loss": 0.4682, "mean_token_accuracy": 0.8494995832443237, "num_tokens": 519945118.0, "step": 3264 }, { "epoch": 1.6607324516785351, "grad_norm": 0.9849370718002319, "learning_rate": 1e-05, "loss": 0.493, "mean_token_accuracy": 0.8436269760131836, "num_tokens": 520102088.0, "step": 3265 }, { "epoch": 1.6612410986775177, "grad_norm": 1.0113447904586792, "learning_rate": 1e-05, "loss": 0.5157, "mean_token_accuracy": 0.8369056582450867, "num_tokens": 520263275.0, "step": 3266 }, { "epoch": 1.6617497456765005, "grad_norm": 0.8960872292518616, "learning_rate": 1e-05, "loss": 0.4738, "mean_token_accuracy": 0.8496039509773254, "num_tokens": 520427222.0, "step": 3267 }, { "epoch": 1.6622583926754833, "grad_norm": 1.0057883262634277, "learning_rate": 1e-05, "loss": 0.4931, "mean_token_accuracy": 0.8433455228805542, "num_tokens": 520587306.0, "step": 3268 }, { "epoch": 1.662767039674466, "grad_norm": 0.919667661190033, "learning_rate": 1e-05, "loss": 0.4534, "mean_token_accuracy": 0.8553017377853394, "num_tokens": 520752920.0, "step": 3269 }, { "epoch": 1.6632756866734486, "grad_norm": 0.9011039733886719, "learning_rate": 1e-05, "loss": 0.4776, "mean_token_accuracy": 0.8484370708465576, "num_tokens": 520922639.0, "step": 3270 }, { "epoch": 1.6637843336724312, "grad_norm": 0.9278181791305542, "learning_rate": 1e-05, "loss": 0.4655, "mean_token_accuracy": 0.8510851263999939, "num_tokens": 521077842.0, "step": 3271 }, { "epoch": 1.664292980671414, "grad_norm": 0.9309078454971313, "learning_rate": 1e-05, "loss": 0.5157, "mean_token_accuracy": 0.8380452394485474, "num_tokens": 521246669.0, "step": 3272 }, { "epoch": 1.6648016276703967, "grad_norm": 0.9402255415916443, "learning_rate": 1e-05, "loss": 0.4442, "mean_token_accuracy": 0.8570981621742249, "num_tokens": 521403766.0, "step": 3273 }, { "epoch": 1.6653102746693795, "grad_norm": 0.9927259683609009, "learning_rate": 1e-05, "loss": 0.4566, "mean_token_accuracy": 0.8527973890304565, "num_tokens": 521552038.0, "step": 3274 }, { "epoch": 1.6658189216683623, "grad_norm": 0.9242224097251892, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.8570798635482788, "num_tokens": 521715941.0, "step": 3275 }, { "epoch": 1.6663275686673449, "grad_norm": 1.0413960218429565, "learning_rate": 1e-05, "loss": 0.5049, "mean_token_accuracy": 0.8396327495574951, "num_tokens": 521871204.0, "step": 3276 }, { "epoch": 1.6668362156663274, "grad_norm": 1.067278265953064, "learning_rate": 1e-05, "loss": 0.5112, "mean_token_accuracy": 0.839201807975769, "num_tokens": 522031714.0, "step": 3277 }, { "epoch": 1.6673448626653102, "grad_norm": 0.8974424600601196, "learning_rate": 1e-05, "loss": 0.4461, "mean_token_accuracy": 0.8575136661529541, "num_tokens": 522180569.0, "step": 3278 }, { "epoch": 1.667853509664293, "grad_norm": 1.0562061071395874, "learning_rate": 1e-05, "loss": 0.4653, "mean_token_accuracy": 0.8524356484413147, "num_tokens": 522337674.0, "step": 3279 }, { "epoch": 1.6683621566632758, "grad_norm": 1.050594687461853, "learning_rate": 1e-05, "loss": 0.4799, "mean_token_accuracy": 0.8469610214233398, "num_tokens": 522491069.0, "step": 3280 }, { "epoch": 1.6688708036622584, "grad_norm": 0.9911378026008606, "learning_rate": 1e-05, "loss": 0.4613, "mean_token_accuracy": 0.8525702357292175, "num_tokens": 522650079.0, "step": 3281 }, { "epoch": 1.669379450661241, "grad_norm": 0.9349995255470276, "learning_rate": 1e-05, "loss": 0.4731, "mean_token_accuracy": 0.85033118724823, "num_tokens": 522813823.0, "step": 3282 }, { "epoch": 1.6698880976602237, "grad_norm": 0.9397663474082947, "learning_rate": 1e-05, "loss": 0.5216, "mean_token_accuracy": 0.8359174132347107, "num_tokens": 522976265.0, "step": 3283 }, { "epoch": 1.6703967446592065, "grad_norm": 1.0165876150131226, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8488295078277588, "num_tokens": 523142670.0, "step": 3284 }, { "epoch": 1.6709053916581893, "grad_norm": 0.9085330367088318, "learning_rate": 1e-05, "loss": 0.4653, "mean_token_accuracy": 0.851130485534668, "num_tokens": 523300688.0, "step": 3285 }, { "epoch": 1.671414038657172, "grad_norm": 0.9892270565032959, "learning_rate": 1e-05, "loss": 0.5006, "mean_token_accuracy": 0.8418188095092773, "num_tokens": 523461987.0, "step": 3286 }, { "epoch": 1.6719226856561547, "grad_norm": 1.0040111541748047, "learning_rate": 1e-05, "loss": 0.4892, "mean_token_accuracy": 0.8444796800613403, "num_tokens": 523623517.0, "step": 3287 }, { "epoch": 1.6724313326551372, "grad_norm": 0.8981871604919434, "learning_rate": 1e-05, "loss": 0.4521, "mean_token_accuracy": 0.8556160926818848, "num_tokens": 523794689.0, "step": 3288 }, { "epoch": 1.67293997965412, "grad_norm": 0.9815360307693481, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8480269312858582, "num_tokens": 523958641.0, "step": 3289 }, { "epoch": 1.6734486266531028, "grad_norm": 0.915729820728302, "learning_rate": 1e-05, "loss": 0.48, "mean_token_accuracy": 0.847346842288971, "num_tokens": 524128171.0, "step": 3290 }, { "epoch": 1.6739572736520856, "grad_norm": 0.9879436492919922, "learning_rate": 1e-05, "loss": 0.5075, "mean_token_accuracy": 0.8403210639953613, "num_tokens": 524291831.0, "step": 3291 }, { "epoch": 1.6744659206510681, "grad_norm": 0.9411258697509766, "learning_rate": 1e-05, "loss": 0.4742, "mean_token_accuracy": 0.847053050994873, "num_tokens": 524455562.0, "step": 3292 }, { "epoch": 1.6749745676500507, "grad_norm": 0.9418855309486389, "learning_rate": 1e-05, "loss": 0.4988, "mean_token_accuracy": 0.8415638208389282, "num_tokens": 524615758.0, "step": 3293 }, { "epoch": 1.6754832146490335, "grad_norm": 0.9279276728630066, "learning_rate": 1e-05, "loss": 0.4915, "mean_token_accuracy": 0.8443039655685425, "num_tokens": 524788549.0, "step": 3294 }, { "epoch": 1.6759918616480163, "grad_norm": 0.9746510982513428, "learning_rate": 1e-05, "loss": 0.4568, "mean_token_accuracy": 0.8545578122138977, "num_tokens": 524946069.0, "step": 3295 }, { "epoch": 1.676500508646999, "grad_norm": 0.955940842628479, "learning_rate": 1e-05, "loss": 0.5109, "mean_token_accuracy": 0.8380922079086304, "num_tokens": 525108018.0, "step": 3296 }, { "epoch": 1.6770091556459819, "grad_norm": 0.9279923439025879, "learning_rate": 1e-05, "loss": 0.4315, "mean_token_accuracy": 0.8598830699920654, "num_tokens": 525261353.0, "step": 3297 }, { "epoch": 1.6775178026449644, "grad_norm": 0.883661687374115, "learning_rate": 1e-05, "loss": 0.4715, "mean_token_accuracy": 0.8499032258987427, "num_tokens": 525429783.0, "step": 3298 }, { "epoch": 1.678026449643947, "grad_norm": 0.9393394589424133, "learning_rate": 1e-05, "loss": 0.4914, "mean_token_accuracy": 0.8439207673072815, "num_tokens": 525596465.0, "step": 3299 }, { "epoch": 1.6785350966429298, "grad_norm": 0.9309011101722717, "learning_rate": 1e-05, "loss": 0.4995, "mean_token_accuracy": 0.8434082269668579, "num_tokens": 525756468.0, "step": 3300 }, { "epoch": 1.6790437436419126, "grad_norm": 0.8695400357246399, "learning_rate": 1e-05, "loss": 0.4486, "mean_token_accuracy": 0.8543292284011841, "num_tokens": 525916556.0, "step": 3301 }, { "epoch": 1.6795523906408953, "grad_norm": 1.1616344451904297, "learning_rate": 1e-05, "loss": 0.4591, "mean_token_accuracy": 0.8515818119049072, "num_tokens": 526072332.0, "step": 3302 }, { "epoch": 1.680061037639878, "grad_norm": 0.9855133891105652, "learning_rate": 1e-05, "loss": 0.4694, "mean_token_accuracy": 0.8512064218521118, "num_tokens": 526232023.0, "step": 3303 }, { "epoch": 1.6805696846388605, "grad_norm": 0.922696053981781, "learning_rate": 1e-05, "loss": 0.4724, "mean_token_accuracy": 0.8474718332290649, "num_tokens": 526390503.0, "step": 3304 }, { "epoch": 1.6810783316378433, "grad_norm": 0.9263838529586792, "learning_rate": 1e-05, "loss": 0.5006, "mean_token_accuracy": 0.8425577282905579, "num_tokens": 526553435.0, "step": 3305 }, { "epoch": 1.681586978636826, "grad_norm": 0.9973815679550171, "learning_rate": 1e-05, "loss": 0.483, "mean_token_accuracy": 0.8466613292694092, "num_tokens": 526702092.0, "step": 3306 }, { "epoch": 1.6820956256358088, "grad_norm": 0.9127100110054016, "learning_rate": 1e-05, "loss": 0.447, "mean_token_accuracy": 0.8569740653038025, "num_tokens": 526864272.0, "step": 3307 }, { "epoch": 1.6826042726347916, "grad_norm": 0.8550009727478027, "learning_rate": 1e-05, "loss": 0.4777, "mean_token_accuracy": 0.8481435775756836, "num_tokens": 527035121.0, "step": 3308 }, { "epoch": 1.6831129196337742, "grad_norm": 0.8917046189308167, "learning_rate": 1e-05, "loss": 0.4541, "mean_token_accuracy": 0.8557920455932617, "num_tokens": 527188367.0, "step": 3309 }, { "epoch": 1.6836215666327567, "grad_norm": 0.9677354097366333, "learning_rate": 1e-05, "loss": 0.4872, "mean_token_accuracy": 0.8458775281906128, "num_tokens": 527350842.0, "step": 3310 }, { "epoch": 1.6841302136317395, "grad_norm": 0.9248644709587097, "learning_rate": 1e-05, "loss": 0.5035, "mean_token_accuracy": 0.84101402759552, "num_tokens": 527515994.0, "step": 3311 }, { "epoch": 1.6846388606307223, "grad_norm": 0.9055294394493103, "learning_rate": 1e-05, "loss": 0.4599, "mean_token_accuracy": 0.8526661396026611, "num_tokens": 527685407.0, "step": 3312 }, { "epoch": 1.685147507629705, "grad_norm": 0.9836543202400208, "learning_rate": 1e-05, "loss": 0.4398, "mean_token_accuracy": 0.8585208654403687, "num_tokens": 527850629.0, "step": 3313 }, { "epoch": 1.6856561546286877, "grad_norm": 0.9496813416481018, "learning_rate": 1e-05, "loss": 0.471, "mean_token_accuracy": 0.8488203883171082, "num_tokens": 528011456.0, "step": 3314 }, { "epoch": 1.6861648016276702, "grad_norm": 0.9263770580291748, "learning_rate": 1e-05, "loss": 0.4259, "mean_token_accuracy": 0.8631601333618164, "num_tokens": 528160030.0, "step": 3315 }, { "epoch": 1.686673448626653, "grad_norm": 1.0096482038497925, "learning_rate": 1e-05, "loss": 0.534, "mean_token_accuracy": 0.8320564031600952, "num_tokens": 528319045.0, "step": 3316 }, { "epoch": 1.6871820956256358, "grad_norm": 0.9198050498962402, "learning_rate": 1e-05, "loss": 0.4711, "mean_token_accuracy": 0.8489158153533936, "num_tokens": 528482912.0, "step": 3317 }, { "epoch": 1.6876907426246186, "grad_norm": 1.0153566598892212, "learning_rate": 1e-05, "loss": 0.4855, "mean_token_accuracy": 0.8455990552902222, "num_tokens": 528635401.0, "step": 3318 }, { "epoch": 1.6881993896236012, "grad_norm": 0.9170647859573364, "learning_rate": 1e-05, "loss": 0.4495, "mean_token_accuracy": 0.85591059923172, "num_tokens": 528784018.0, "step": 3319 }, { "epoch": 1.688708036622584, "grad_norm": 1.0029700994491577, "learning_rate": 1e-05, "loss": 0.4593, "mean_token_accuracy": 0.8527106642723083, "num_tokens": 528936587.0, "step": 3320 }, { "epoch": 1.6892166836215665, "grad_norm": 0.9631544351577759, "learning_rate": 1e-05, "loss": 0.4952, "mean_token_accuracy": 0.8434855937957764, "num_tokens": 529091571.0, "step": 3321 }, { "epoch": 1.6897253306205493, "grad_norm": 0.9077914953231812, "learning_rate": 1e-05, "loss": 0.4763, "mean_token_accuracy": 0.8488157987594604, "num_tokens": 529263800.0, "step": 3322 }, { "epoch": 1.690233977619532, "grad_norm": 1.0570645332336426, "learning_rate": 1e-05, "loss": 0.4325, "mean_token_accuracy": 0.8580660820007324, "num_tokens": 529420232.0, "step": 3323 }, { "epoch": 1.6907426246185149, "grad_norm": 1.0088614225387573, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8567444682121277, "num_tokens": 529579783.0, "step": 3324 }, { "epoch": 1.6912512716174974, "grad_norm": 0.9414284825325012, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.8432716727256775, "num_tokens": 529751869.0, "step": 3325 }, { "epoch": 1.69175991861648, "grad_norm": 0.9591031670570374, "learning_rate": 1e-05, "loss": 0.4377, "mean_token_accuracy": 0.8570700883865356, "num_tokens": 529922657.0, "step": 3326 }, { "epoch": 1.6922685656154628, "grad_norm": 1.011871099472046, "learning_rate": 1e-05, "loss": 0.4457, "mean_token_accuracy": 0.8572059869766235, "num_tokens": 530083357.0, "step": 3327 }, { "epoch": 1.6927772126144456, "grad_norm": 1.0337574481964111, "learning_rate": 1e-05, "loss": 0.4375, "mean_token_accuracy": 0.858768105506897, "num_tokens": 530237269.0, "step": 3328 }, { "epoch": 1.6932858596134284, "grad_norm": 1.017116904258728, "learning_rate": 1e-05, "loss": 0.4935, "mean_token_accuracy": 0.8441458940505981, "num_tokens": 530400984.0, "step": 3329 }, { "epoch": 1.693794506612411, "grad_norm": 0.9320374131202698, "learning_rate": 1e-05, "loss": 0.479, "mean_token_accuracy": 0.8480983972549438, "num_tokens": 530563601.0, "step": 3330 }, { "epoch": 1.6943031536113937, "grad_norm": 0.9566795825958252, "learning_rate": 1e-05, "loss": 0.4789, "mean_token_accuracy": 0.8461617827415466, "num_tokens": 530706352.0, "step": 3331 }, { "epoch": 1.6948118006103763, "grad_norm": 0.9467067718505859, "learning_rate": 1e-05, "loss": 0.4941, "mean_token_accuracy": 0.8445472717285156, "num_tokens": 530875466.0, "step": 3332 }, { "epoch": 1.695320447609359, "grad_norm": 0.9797875285148621, "learning_rate": 1e-05, "loss": 0.4903, "mean_token_accuracy": 0.844483494758606, "num_tokens": 531040970.0, "step": 3333 }, { "epoch": 1.6958290946083419, "grad_norm": 0.9778969883918762, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8568530082702637, "num_tokens": 531199906.0, "step": 3334 }, { "epoch": 1.6963377416073246, "grad_norm": 0.9380336403846741, "learning_rate": 1e-05, "loss": 0.4665, "mean_token_accuracy": 0.8512696027755737, "num_tokens": 531354083.0, "step": 3335 }, { "epoch": 1.6968463886063072, "grad_norm": 0.9415258169174194, "learning_rate": 1e-05, "loss": 0.4728, "mean_token_accuracy": 0.8497647643089294, "num_tokens": 531509317.0, "step": 3336 }, { "epoch": 1.6973550356052898, "grad_norm": 1.0049771070480347, "learning_rate": 1e-05, "loss": 0.4709, "mean_token_accuracy": 0.8508015871047974, "num_tokens": 531661237.0, "step": 3337 }, { "epoch": 1.6978636826042726, "grad_norm": 0.9479979276657104, "learning_rate": 1e-05, "loss": 0.4649, "mean_token_accuracy": 0.8530048727989197, "num_tokens": 531821184.0, "step": 3338 }, { "epoch": 1.6983723296032553, "grad_norm": 0.9929404854774475, "learning_rate": 1e-05, "loss": 0.4419, "mean_token_accuracy": 0.8580644130706787, "num_tokens": 531968990.0, "step": 3339 }, { "epoch": 1.6988809766022381, "grad_norm": 0.9658786058425903, "learning_rate": 1e-05, "loss": 0.4726, "mean_token_accuracy": 0.8487871289253235, "num_tokens": 532137310.0, "step": 3340 }, { "epoch": 1.6993896236012207, "grad_norm": 1.012675404548645, "learning_rate": 1e-05, "loss": 0.5001, "mean_token_accuracy": 0.8426909446716309, "num_tokens": 532289496.0, "step": 3341 }, { "epoch": 1.6998982706002035, "grad_norm": 1.0348429679870605, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8566396832466125, "num_tokens": 532440662.0, "step": 3342 }, { "epoch": 1.700406917599186, "grad_norm": 0.9613810181617737, "learning_rate": 1e-05, "loss": 0.4637, "mean_token_accuracy": 0.852282702922821, "num_tokens": 532597195.0, "step": 3343 }, { "epoch": 1.7009155645981688, "grad_norm": 0.9847121238708496, "learning_rate": 1e-05, "loss": 0.4641, "mean_token_accuracy": 0.8513263463973999, "num_tokens": 532750623.0, "step": 3344 }, { "epoch": 1.7014242115971516, "grad_norm": 0.9310582280158997, "learning_rate": 1e-05, "loss": 0.4507, "mean_token_accuracy": 0.8549737930297852, "num_tokens": 532913640.0, "step": 3345 }, { "epoch": 1.7019328585961344, "grad_norm": 0.9432613849639893, "learning_rate": 1e-05, "loss": 0.4538, "mean_token_accuracy": 0.8539129495620728, "num_tokens": 533079069.0, "step": 3346 }, { "epoch": 1.702441505595117, "grad_norm": 0.9648125767707825, "learning_rate": 1e-05, "loss": 0.444, "mean_token_accuracy": 0.8555746078491211, "num_tokens": 533235104.0, "step": 3347 }, { "epoch": 1.7029501525940995, "grad_norm": 0.9137231111526489, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.8608599901199341, "num_tokens": 533404987.0, "step": 3348 }, { "epoch": 1.7034587995930823, "grad_norm": 0.9805552959442139, "learning_rate": 1e-05, "loss": 0.481, "mean_token_accuracy": 0.8472720384597778, "num_tokens": 533563719.0, "step": 3349 }, { "epoch": 1.7039674465920651, "grad_norm": 1.0190812349319458, "learning_rate": 1e-05, "loss": 0.4625, "mean_token_accuracy": 0.8517487049102783, "num_tokens": 533715865.0, "step": 3350 }, { "epoch": 1.704476093591048, "grad_norm": 1.0097111463546753, "learning_rate": 1e-05, "loss": 0.4783, "mean_token_accuracy": 0.8472844958305359, "num_tokens": 533870888.0, "step": 3351 }, { "epoch": 1.7049847405900305, "grad_norm": 1.0112862586975098, "learning_rate": 1e-05, "loss": 0.4807, "mean_token_accuracy": 0.8472076654434204, "num_tokens": 534023360.0, "step": 3352 }, { "epoch": 1.7054933875890133, "grad_norm": 1.0216953754425049, "learning_rate": 1e-05, "loss": 0.4389, "mean_token_accuracy": 0.8584215641021729, "num_tokens": 534175128.0, "step": 3353 }, { "epoch": 1.7060020345879958, "grad_norm": 1.0892550945281982, "learning_rate": 1e-05, "loss": 0.4712, "mean_token_accuracy": 0.8495419025421143, "num_tokens": 534338342.0, "step": 3354 }, { "epoch": 1.7065106815869786, "grad_norm": 0.9826159477233887, "learning_rate": 1e-05, "loss": 0.472, "mean_token_accuracy": 0.8509111404418945, "num_tokens": 534485674.0, "step": 3355 }, { "epoch": 1.7070193285859614, "grad_norm": 1.0366008281707764, "learning_rate": 1e-05, "loss": 0.4733, "mean_token_accuracy": 0.8497737050056458, "num_tokens": 534643540.0, "step": 3356 }, { "epoch": 1.7075279755849442, "grad_norm": 0.9178564548492432, "learning_rate": 1e-05, "loss": 0.4426, "mean_token_accuracy": 0.8565972447395325, "num_tokens": 534800038.0, "step": 3357 }, { "epoch": 1.7080366225839267, "grad_norm": 1.001478672027588, "learning_rate": 1e-05, "loss": 0.4895, "mean_token_accuracy": 0.8442537784576416, "num_tokens": 534946089.0, "step": 3358 }, { "epoch": 1.7085452695829093, "grad_norm": 1.0083726644515991, "learning_rate": 1e-05, "loss": 0.4632, "mean_token_accuracy": 0.851345419883728, "num_tokens": 535099631.0, "step": 3359 }, { "epoch": 1.709053916581892, "grad_norm": 0.8685449361801147, "learning_rate": 1e-05, "loss": 0.5055, "mean_token_accuracy": 0.8397232294082642, "num_tokens": 535248651.0, "step": 3360 }, { "epoch": 1.7095625635808749, "grad_norm": 1.0024147033691406, "learning_rate": 1e-05, "loss": 0.5107, "mean_token_accuracy": 0.8402670621871948, "num_tokens": 535415988.0, "step": 3361 }, { "epoch": 1.7100712105798577, "grad_norm": 2.0469908714294434, "learning_rate": 1e-05, "loss": 0.4683, "mean_token_accuracy": 0.8498460650444031, "num_tokens": 535572330.0, "step": 3362 }, { "epoch": 1.7105798575788402, "grad_norm": 0.9829951524734497, "learning_rate": 1e-05, "loss": 0.4548, "mean_token_accuracy": 0.8559287786483765, "num_tokens": 535736424.0, "step": 3363 }, { "epoch": 1.711088504577823, "grad_norm": 0.948924720287323, "learning_rate": 1e-05, "loss": 0.4865, "mean_token_accuracy": 0.8448041081428528, "num_tokens": 535900015.0, "step": 3364 }, { "epoch": 1.7115971515768056, "grad_norm": 1.0147477388381958, "learning_rate": 1e-05, "loss": 0.4883, "mean_token_accuracy": 0.8448580503463745, "num_tokens": 536054345.0, "step": 3365 }, { "epoch": 1.7121057985757884, "grad_norm": 0.9223432540893555, "learning_rate": 1e-05, "loss": 0.4805, "mean_token_accuracy": 0.8478121757507324, "num_tokens": 536213601.0, "step": 3366 }, { "epoch": 1.7126144455747712, "grad_norm": 0.9273245334625244, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.840658962726593, "num_tokens": 536381796.0, "step": 3367 }, { "epoch": 1.713123092573754, "grad_norm": 0.9985995888710022, "learning_rate": 1e-05, "loss": 0.4743, "mean_token_accuracy": 0.8496761918067932, "num_tokens": 536546179.0, "step": 3368 }, { "epoch": 1.7136317395727365, "grad_norm": 0.9761607050895691, "learning_rate": 1e-05, "loss": 0.4832, "mean_token_accuracy": 0.8463513255119324, "num_tokens": 536716604.0, "step": 3369 }, { "epoch": 1.714140386571719, "grad_norm": 0.9189473986625671, "learning_rate": 1e-05, "loss": 0.4958, "mean_token_accuracy": 0.8437198400497437, "num_tokens": 536873341.0, "step": 3370 }, { "epoch": 1.7146490335707019, "grad_norm": 1.053405523300171, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.8505102396011353, "num_tokens": 537029255.0, "step": 3371 }, { "epoch": 1.7151576805696847, "grad_norm": 0.952011227607727, "learning_rate": 1e-05, "loss": 0.4874, "mean_token_accuracy": 0.8456932306289673, "num_tokens": 537182989.0, "step": 3372 }, { "epoch": 1.7156663275686674, "grad_norm": 0.9287518858909607, "learning_rate": 1e-05, "loss": 0.4676, "mean_token_accuracy": 0.8501575589179993, "num_tokens": 537353885.0, "step": 3373 }, { "epoch": 1.71617497456765, "grad_norm": 1.0261600017547607, "learning_rate": 1e-05, "loss": 0.4848, "mean_token_accuracy": 0.845282256603241, "num_tokens": 537519346.0, "step": 3374 }, { "epoch": 1.7166836215666328, "grad_norm": 1.0151768922805786, "learning_rate": 1e-05, "loss": 0.4527, "mean_token_accuracy": 0.8560212850570679, "num_tokens": 537690640.0, "step": 3375 }, { "epoch": 1.7171922685656154, "grad_norm": 0.9206743240356445, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8524140119552612, "num_tokens": 537849156.0, "step": 3376 }, { "epoch": 1.7177009155645981, "grad_norm": 1.1978951692581177, "learning_rate": 1e-05, "loss": 0.477, "mean_token_accuracy": 0.8484640121459961, "num_tokens": 538012098.0, "step": 3377 }, { "epoch": 1.718209562563581, "grad_norm": 0.9505128264427185, "learning_rate": 1e-05, "loss": 0.5071, "mean_token_accuracy": 0.8388642072677612, "num_tokens": 538167004.0, "step": 3378 }, { "epoch": 1.7187182095625637, "grad_norm": 1.0761715173721313, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.854249894618988, "num_tokens": 538306082.0, "step": 3379 }, { "epoch": 1.7192268565615463, "grad_norm": 1.0668213367462158, "learning_rate": 1e-05, "loss": 0.4613, "mean_token_accuracy": 0.8530638217926025, "num_tokens": 538455354.0, "step": 3380 }, { "epoch": 1.7197355035605288, "grad_norm": 0.8889974355697632, "learning_rate": 1e-05, "loss": 0.4448, "mean_token_accuracy": 0.8573060035705566, "num_tokens": 538622580.0, "step": 3381 }, { "epoch": 1.7202441505595116, "grad_norm": 1.1304421424865723, "learning_rate": 1e-05, "loss": 0.4882, "mean_token_accuracy": 0.84532630443573, "num_tokens": 538776413.0, "step": 3382 }, { "epoch": 1.7207527975584944, "grad_norm": 0.8981946706771851, "learning_rate": 1e-05, "loss": 0.4826, "mean_token_accuracy": 0.8455299139022827, "num_tokens": 538935256.0, "step": 3383 }, { "epoch": 1.7212614445574772, "grad_norm": 0.9575291872024536, "learning_rate": 1e-05, "loss": 0.4575, "mean_token_accuracy": 0.8531308174133301, "num_tokens": 539089207.0, "step": 3384 }, { "epoch": 1.7217700915564598, "grad_norm": 0.9755882620811462, "learning_rate": 1e-05, "loss": 0.4779, "mean_token_accuracy": 0.8488665223121643, "num_tokens": 539251537.0, "step": 3385 }, { "epoch": 1.7222787385554426, "grad_norm": 0.9240638613700867, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8592078685760498, "num_tokens": 539409402.0, "step": 3386 }, { "epoch": 1.7227873855544251, "grad_norm": 1.0655521154403687, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8432095050811768, "num_tokens": 539571156.0, "step": 3387 }, { "epoch": 1.723296032553408, "grad_norm": 1.0824614763259888, "learning_rate": 1e-05, "loss": 0.4777, "mean_token_accuracy": 0.8460273742675781, "num_tokens": 539724109.0, "step": 3388 }, { "epoch": 1.7238046795523907, "grad_norm": 1.0291532278060913, "learning_rate": 1e-05, "loss": 0.4638, "mean_token_accuracy": 0.8528157472610474, "num_tokens": 539889499.0, "step": 3389 }, { "epoch": 1.7243133265513735, "grad_norm": 1.1965833902359009, "learning_rate": 1e-05, "loss": 0.4762, "mean_token_accuracy": 0.847673773765564, "num_tokens": 540055954.0, "step": 3390 }, { "epoch": 1.724821973550356, "grad_norm": 0.8850274085998535, "learning_rate": 1e-05, "loss": 0.4743, "mean_token_accuracy": 0.8486838936805725, "num_tokens": 540222107.0, "step": 3391 }, { "epoch": 1.7253306205493386, "grad_norm": 1.187686800956726, "learning_rate": 1e-05, "loss": 0.5221, "mean_token_accuracy": 0.8357970714569092, "num_tokens": 540380282.0, "step": 3392 }, { "epoch": 1.7258392675483214, "grad_norm": 1.1276291608810425, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8447475433349609, "num_tokens": 540548234.0, "step": 3393 }, { "epoch": 1.7263479145473042, "grad_norm": 0.9580877423286438, "learning_rate": 1e-05, "loss": 0.4854, "mean_token_accuracy": 0.8469442129135132, "num_tokens": 540703596.0, "step": 3394 }, { "epoch": 1.726856561546287, "grad_norm": 1.0890614986419678, "learning_rate": 1e-05, "loss": 0.4776, "mean_token_accuracy": 0.8463518619537354, "num_tokens": 540857729.0, "step": 3395 }, { "epoch": 1.7273652085452695, "grad_norm": 0.9593095779418945, "learning_rate": 1e-05, "loss": 0.4937, "mean_token_accuracy": 0.8443906307220459, "num_tokens": 541024110.0, "step": 3396 }, { "epoch": 1.7278738555442523, "grad_norm": 1.0046390295028687, "learning_rate": 1e-05, "loss": 0.4517, "mean_token_accuracy": 0.8553571701049805, "num_tokens": 541183931.0, "step": 3397 }, { "epoch": 1.7283825025432349, "grad_norm": 1.067029356956482, "learning_rate": 1e-05, "loss": 0.4856, "mean_token_accuracy": 0.8454616069793701, "num_tokens": 541343999.0, "step": 3398 }, { "epoch": 1.7288911495422177, "grad_norm": 1.0075136423110962, "learning_rate": 1e-05, "loss": 0.4784, "mean_token_accuracy": 0.8482916951179504, "num_tokens": 541501447.0, "step": 3399 }, { "epoch": 1.7293997965412005, "grad_norm": 0.9967670440673828, "learning_rate": 1e-05, "loss": 0.4791, "mean_token_accuracy": 0.8465551137924194, "num_tokens": 541666906.0, "step": 3400 }, { "epoch": 1.7299084435401832, "grad_norm": 0.9384946823120117, "learning_rate": 1e-05, "loss": 0.4472, "mean_token_accuracy": 0.8563881516456604, "num_tokens": 541823176.0, "step": 3401 }, { "epoch": 1.7304170905391658, "grad_norm": 0.9869144558906555, "learning_rate": 1e-05, "loss": 0.5131, "mean_token_accuracy": 0.8384538292884827, "num_tokens": 541998847.0, "step": 3402 }, { "epoch": 1.7309257375381484, "grad_norm": 0.9360270500183105, "learning_rate": 1e-05, "loss": 0.4833, "mean_token_accuracy": 0.8461273908615112, "num_tokens": 542167353.0, "step": 3403 }, { "epoch": 1.7314343845371312, "grad_norm": 0.9577409625053406, "learning_rate": 1e-05, "loss": 0.4472, "mean_token_accuracy": 0.8582079410552979, "num_tokens": 542330954.0, "step": 3404 }, { "epoch": 1.731943031536114, "grad_norm": 0.9778986573219299, "learning_rate": 1e-05, "loss": 0.4694, "mean_token_accuracy": 0.8505184650421143, "num_tokens": 542484284.0, "step": 3405 }, { "epoch": 1.7324516785350967, "grad_norm": 0.8824731111526489, "learning_rate": 1e-05, "loss": 0.4554, "mean_token_accuracy": 0.8546308279037476, "num_tokens": 542649706.0, "step": 3406 }, { "epoch": 1.7329603255340793, "grad_norm": 1.0359128713607788, "learning_rate": 1e-05, "loss": 0.4899, "mean_token_accuracy": 0.8440340757369995, "num_tokens": 542812938.0, "step": 3407 }, { "epoch": 1.733468972533062, "grad_norm": 1.0210932493209839, "learning_rate": 1e-05, "loss": 0.5077, "mean_token_accuracy": 0.8378579020500183, "num_tokens": 542971174.0, "step": 3408 }, { "epoch": 1.7339776195320447, "grad_norm": 0.9628419876098633, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8492684960365295, "num_tokens": 543130106.0, "step": 3409 }, { "epoch": 1.7344862665310274, "grad_norm": 0.9311797022819519, "learning_rate": 1e-05, "loss": 0.4633, "mean_token_accuracy": 0.8529977798461914, "num_tokens": 543286863.0, "step": 3410 }, { "epoch": 1.7349949135300102, "grad_norm": 0.9846572875976562, "learning_rate": 1e-05, "loss": 0.4886, "mean_token_accuracy": 0.844512939453125, "num_tokens": 543444719.0, "step": 3411 }, { "epoch": 1.735503560528993, "grad_norm": 0.9851809144020081, "learning_rate": 1e-05, "loss": 0.5049, "mean_token_accuracy": 0.8395944237709045, "num_tokens": 543603599.0, "step": 3412 }, { "epoch": 1.7360122075279756, "grad_norm": 0.9662925004959106, "learning_rate": 1e-05, "loss": 0.4698, "mean_token_accuracy": 0.8492670655250549, "num_tokens": 543764939.0, "step": 3413 }, { "epoch": 1.7365208545269581, "grad_norm": 0.9885513186454773, "learning_rate": 1e-05, "loss": 0.4916, "mean_token_accuracy": 0.8415821194648743, "num_tokens": 543929074.0, "step": 3414 }, { "epoch": 1.737029501525941, "grad_norm": 0.9492428302764893, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.8441232442855835, "num_tokens": 544087367.0, "step": 3415 }, { "epoch": 1.7375381485249237, "grad_norm": 0.9702391028404236, "learning_rate": 1e-05, "loss": 0.4893, "mean_token_accuracy": 0.8451366424560547, "num_tokens": 544254565.0, "step": 3416 }, { "epoch": 1.7380467955239065, "grad_norm": 0.9493220448493958, "learning_rate": 1e-05, "loss": 0.4651, "mean_token_accuracy": 0.8521139621734619, "num_tokens": 544423653.0, "step": 3417 }, { "epoch": 1.738555442522889, "grad_norm": 0.92607182264328, "learning_rate": 1e-05, "loss": 0.4616, "mean_token_accuracy": 0.8525439500808716, "num_tokens": 544594001.0, "step": 3418 }, { "epoch": 1.7390640895218719, "grad_norm": 0.9776398539543152, "learning_rate": 1e-05, "loss": 0.4866, "mean_token_accuracy": 0.8456598520278931, "num_tokens": 544743578.0, "step": 3419 }, { "epoch": 1.7395727365208544, "grad_norm": 0.9213610887527466, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8459997177124023, "num_tokens": 544905906.0, "step": 3420 }, { "epoch": 1.7400813835198372, "grad_norm": 1.0641958713531494, "learning_rate": 1e-05, "loss": 0.47, "mean_token_accuracy": 0.8510778546333313, "num_tokens": 545068059.0, "step": 3421 }, { "epoch": 1.74059003051882, "grad_norm": 0.8985027074813843, "learning_rate": 1e-05, "loss": 0.4428, "mean_token_accuracy": 0.8590704202651978, "num_tokens": 545218275.0, "step": 3422 }, { "epoch": 1.7410986775178028, "grad_norm": 0.9916641712188721, "learning_rate": 1e-05, "loss": 0.5017, "mean_token_accuracy": 0.8406370282173157, "num_tokens": 545387771.0, "step": 3423 }, { "epoch": 1.7416073245167853, "grad_norm": 1.1028623580932617, "learning_rate": 1e-05, "loss": 0.4573, "mean_token_accuracy": 0.8541989326477051, "num_tokens": 545540433.0, "step": 3424 }, { "epoch": 1.742115971515768, "grad_norm": 1.007652759552002, "learning_rate": 1e-05, "loss": 0.4571, "mean_token_accuracy": 0.8547618389129639, "num_tokens": 545694394.0, "step": 3425 }, { "epoch": 1.7426246185147507, "grad_norm": 1.0314555168151855, "learning_rate": 1e-05, "loss": 0.4774, "mean_token_accuracy": 0.8469586372375488, "num_tokens": 545853945.0, "step": 3426 }, { "epoch": 1.7431332655137335, "grad_norm": 1.0091915130615234, "learning_rate": 1e-05, "loss": 0.456, "mean_token_accuracy": 0.8543394804000854, "num_tokens": 546017542.0, "step": 3427 }, { "epoch": 1.7436419125127163, "grad_norm": 1.021324634552002, "learning_rate": 1e-05, "loss": 0.4489, "mean_token_accuracy": 0.8562377095222473, "num_tokens": 546163829.0, "step": 3428 }, { "epoch": 1.7441505595116988, "grad_norm": 0.9751710891723633, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.8416702151298523, "num_tokens": 546326213.0, "step": 3429 }, { "epoch": 1.7446592065106816, "grad_norm": 0.9528321027755737, "learning_rate": 1e-05, "loss": 0.4336, "mean_token_accuracy": 0.8595539331436157, "num_tokens": 546476551.0, "step": 3430 }, { "epoch": 1.7451678535096642, "grad_norm": 0.8223140239715576, "learning_rate": 1e-05, "loss": 0.4742, "mean_token_accuracy": 0.8496396541595459, "num_tokens": 546647073.0, "step": 3431 }, { "epoch": 1.745676500508647, "grad_norm": 0.9821622967720032, "learning_rate": 1e-05, "loss": 0.446, "mean_token_accuracy": 0.8567888140678406, "num_tokens": 546808037.0, "step": 3432 }, { "epoch": 1.7461851475076298, "grad_norm": 0.9108555316925049, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8557436466217041, "num_tokens": 546973916.0, "step": 3433 }, { "epoch": 1.7466937945066126, "grad_norm": 0.9076216220855713, "learning_rate": 1e-05, "loss": 0.4517, "mean_token_accuracy": 0.8551619052886963, "num_tokens": 547138121.0, "step": 3434 }, { "epoch": 1.7472024415055951, "grad_norm": 0.9469711780548096, "learning_rate": 1e-05, "loss": 0.4623, "mean_token_accuracy": 0.8510894775390625, "num_tokens": 547294960.0, "step": 3435 }, { "epoch": 1.7477110885045777, "grad_norm": 0.8839051723480225, "learning_rate": 1e-05, "loss": 0.4721, "mean_token_accuracy": 0.8495970368385315, "num_tokens": 547455414.0, "step": 3436 }, { "epoch": 1.7482197355035605, "grad_norm": 0.9603871703147888, "learning_rate": 1e-05, "loss": 0.4371, "mean_token_accuracy": 0.8589041233062744, "num_tokens": 547604728.0, "step": 3437 }, { "epoch": 1.7487283825025433, "grad_norm": 0.9335736036300659, "learning_rate": 1e-05, "loss": 0.4744, "mean_token_accuracy": 0.850881814956665, "num_tokens": 547758420.0, "step": 3438 }, { "epoch": 1.749237029501526, "grad_norm": 0.9160023331642151, "learning_rate": 1e-05, "loss": 0.4584, "mean_token_accuracy": 0.8534228801727295, "num_tokens": 547909778.0, "step": 3439 }, { "epoch": 1.7497456765005086, "grad_norm": 0.9696451425552368, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.8512312173843384, "num_tokens": 548059321.0, "step": 3440 }, { "epoch": 1.7502543234994914, "grad_norm": 1.0125298500061035, "learning_rate": 1e-05, "loss": 0.4938, "mean_token_accuracy": 0.8438988924026489, "num_tokens": 548212480.0, "step": 3441 }, { "epoch": 1.750762970498474, "grad_norm": 0.9607093930244446, "learning_rate": 1e-05, "loss": 0.4945, "mean_token_accuracy": 0.8422509431838989, "num_tokens": 548363768.0, "step": 3442 }, { "epoch": 1.7512716174974567, "grad_norm": 0.9785159230232239, "learning_rate": 1e-05, "loss": 0.4682, "mean_token_accuracy": 0.8506906032562256, "num_tokens": 548513409.0, "step": 3443 }, { "epoch": 1.7517802644964395, "grad_norm": 1.0101968050003052, "learning_rate": 1e-05, "loss": 0.4716, "mean_token_accuracy": 0.8490024209022522, "num_tokens": 548670132.0, "step": 3444 }, { "epoch": 1.7522889114954223, "grad_norm": 0.9235634207725525, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8533692359924316, "num_tokens": 548823795.0, "step": 3445 }, { "epoch": 1.7527975584944049, "grad_norm": 1.0365976095199585, "learning_rate": 1e-05, "loss": 0.4686, "mean_token_accuracy": 0.8518203496932983, "num_tokens": 548986310.0, "step": 3446 }, { "epoch": 1.7533062054933874, "grad_norm": 1.0411449670791626, "learning_rate": 1e-05, "loss": 0.4776, "mean_token_accuracy": 0.8472661375999451, "num_tokens": 549140095.0, "step": 3447 }, { "epoch": 1.7538148524923702, "grad_norm": 0.8725220561027527, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.8547745943069458, "num_tokens": 549294340.0, "step": 3448 }, { "epoch": 1.754323499491353, "grad_norm": 0.9153843522071838, "learning_rate": 1e-05, "loss": 0.4109, "mean_token_accuracy": 0.8675376176834106, "num_tokens": 549453074.0, "step": 3449 }, { "epoch": 1.7548321464903358, "grad_norm": 1.0639971494674683, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8492655754089355, "num_tokens": 549610886.0, "step": 3450 }, { "epoch": 1.7553407934893184, "grad_norm": 0.9058477878570557, "learning_rate": 1e-05, "loss": 0.4443, "mean_token_accuracy": 0.8552689552307129, "num_tokens": 549767763.0, "step": 3451 }, { "epoch": 1.7558494404883012, "grad_norm": 0.9886971116065979, "learning_rate": 1e-05, "loss": 0.4911, "mean_token_accuracy": 0.8447088003158569, "num_tokens": 549926561.0, "step": 3452 }, { "epoch": 1.7563580874872837, "grad_norm": 1.0823899507522583, "learning_rate": 1e-05, "loss": 0.4934, "mean_token_accuracy": 0.8419996500015259, "num_tokens": 550090259.0, "step": 3453 }, { "epoch": 1.7568667344862665, "grad_norm": 0.9325345754623413, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.8606714606285095, "num_tokens": 550244848.0, "step": 3454 }, { "epoch": 1.7573753814852493, "grad_norm": 1.036022663116455, "learning_rate": 1e-05, "loss": 0.4409, "mean_token_accuracy": 0.8577229976654053, "num_tokens": 550406425.0, "step": 3455 }, { "epoch": 1.757884028484232, "grad_norm": 0.9253838658332825, "learning_rate": 1e-05, "loss": 0.453, "mean_token_accuracy": 0.8544574975967407, "num_tokens": 550565225.0, "step": 3456 }, { "epoch": 1.7583926754832147, "grad_norm": 0.9430886507034302, "learning_rate": 1e-05, "loss": 0.4815, "mean_token_accuracy": 0.8463444709777832, "num_tokens": 550724486.0, "step": 3457 }, { "epoch": 1.7589013224821972, "grad_norm": 1.0232832431793213, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8553723096847534, "num_tokens": 550869158.0, "step": 3458 }, { "epoch": 1.75940996948118, "grad_norm": 0.8895917534828186, "learning_rate": 1e-05, "loss": 0.4964, "mean_token_accuracy": 0.8435969352722168, "num_tokens": 551034748.0, "step": 3459 }, { "epoch": 1.7599186164801628, "grad_norm": 1.019453525543213, "learning_rate": 1e-05, "loss": 0.4516, "mean_token_accuracy": 0.8550435900688171, "num_tokens": 551189311.0, "step": 3460 }, { "epoch": 1.7604272634791456, "grad_norm": 0.9567839503288269, "learning_rate": 1e-05, "loss": 0.4844, "mean_token_accuracy": 0.8456395864486694, "num_tokens": 551335943.0, "step": 3461 }, { "epoch": 1.7609359104781281, "grad_norm": 1.0265204906463623, "learning_rate": 1e-05, "loss": 0.4808, "mean_token_accuracy": 0.845983624458313, "num_tokens": 551500042.0, "step": 3462 }, { "epoch": 1.761444557477111, "grad_norm": 0.9160285592079163, "learning_rate": 1e-05, "loss": 0.488, "mean_token_accuracy": 0.8447883725166321, "num_tokens": 551661086.0, "step": 3463 }, { "epoch": 1.7619532044760935, "grad_norm": 0.9425185918807983, "learning_rate": 1e-05, "loss": 0.4344, "mean_token_accuracy": 0.8605625033378601, "num_tokens": 551811847.0, "step": 3464 }, { "epoch": 1.7624618514750763, "grad_norm": 0.9993616938591003, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.8575040102005005, "num_tokens": 551963564.0, "step": 3465 }, { "epoch": 1.762970498474059, "grad_norm": 0.9410571455955505, "learning_rate": 1e-05, "loss": 0.4712, "mean_token_accuracy": 0.8500432372093201, "num_tokens": 552131236.0, "step": 3466 }, { "epoch": 1.7634791454730419, "grad_norm": 1.0863051414489746, "learning_rate": 1e-05, "loss": 0.4748, "mean_token_accuracy": 0.8502572178840637, "num_tokens": 552287004.0, "step": 3467 }, { "epoch": 1.7639877924720244, "grad_norm": 0.9163442254066467, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.8505086898803711, "num_tokens": 552456755.0, "step": 3468 }, { "epoch": 1.764496439471007, "grad_norm": 1.024658203125, "learning_rate": 1e-05, "loss": 0.4303, "mean_token_accuracy": 0.8616164326667786, "num_tokens": 552619812.0, "step": 3469 }, { "epoch": 1.7650050864699898, "grad_norm": 0.965026319026947, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.850260317325592, "num_tokens": 552777207.0, "step": 3470 }, { "epoch": 1.7655137334689726, "grad_norm": 0.91949063539505, "learning_rate": 1e-05, "loss": 0.4653, "mean_token_accuracy": 0.8517131805419922, "num_tokens": 552931843.0, "step": 3471 }, { "epoch": 1.7660223804679553, "grad_norm": 0.9592922925949097, "learning_rate": 1e-05, "loss": 0.4775, "mean_token_accuracy": 0.8479909300804138, "num_tokens": 553087376.0, "step": 3472 }, { "epoch": 1.766531027466938, "grad_norm": 0.8795325756072998, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8521772623062134, "num_tokens": 553258033.0, "step": 3473 }, { "epoch": 1.7670396744659207, "grad_norm": 1.0181102752685547, "learning_rate": 1e-05, "loss": 0.477, "mean_token_accuracy": 0.8480532169342041, "num_tokens": 553406997.0, "step": 3474 }, { "epoch": 1.7675483214649033, "grad_norm": 0.9451496005058289, "learning_rate": 1e-05, "loss": 0.4944, "mean_token_accuracy": 0.8428871631622314, "num_tokens": 553571801.0, "step": 3475 }, { "epoch": 1.768056968463886, "grad_norm": 1.011832356452942, "learning_rate": 1e-05, "loss": 0.505, "mean_token_accuracy": 0.8400646448135376, "num_tokens": 553726380.0, "step": 3476 }, { "epoch": 1.7685656154628688, "grad_norm": 0.9240113496780396, "learning_rate": 1e-05, "loss": 0.4651, "mean_token_accuracy": 0.8522868752479553, "num_tokens": 553891035.0, "step": 3477 }, { "epoch": 1.7690742624618516, "grad_norm": 0.9595242142677307, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.8421164751052856, "num_tokens": 554060617.0, "step": 3478 }, { "epoch": 1.7695829094608342, "grad_norm": 0.8867053389549255, "learning_rate": 1e-05, "loss": 0.4858, "mean_token_accuracy": 0.8462978601455688, "num_tokens": 554226977.0, "step": 3479 }, { "epoch": 1.7700915564598168, "grad_norm": 0.9090108871459961, "learning_rate": 1e-05, "loss": 0.4527, "mean_token_accuracy": 0.8542250394821167, "num_tokens": 554385731.0, "step": 3480 }, { "epoch": 1.7706002034587995, "grad_norm": 0.9475415349006653, "learning_rate": 1e-05, "loss": 0.467, "mean_token_accuracy": 0.8525370359420776, "num_tokens": 554555504.0, "step": 3481 }, { "epoch": 1.7711088504577823, "grad_norm": 0.9586099982261658, "learning_rate": 1e-05, "loss": 0.4636, "mean_token_accuracy": 0.8518487215042114, "num_tokens": 554714277.0, "step": 3482 }, { "epoch": 1.7716174974567651, "grad_norm": 0.8920243382453918, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8517918586730957, "num_tokens": 554875296.0, "step": 3483 }, { "epoch": 1.7721261444557477, "grad_norm": 0.8927356600761414, "learning_rate": 1e-05, "loss": 0.484, "mean_token_accuracy": 0.8461599946022034, "num_tokens": 555033920.0, "step": 3484 }, { "epoch": 1.7726347914547305, "grad_norm": 2.070472002029419, "learning_rate": 1e-05, "loss": 0.4716, "mean_token_accuracy": 0.8495326042175293, "num_tokens": 555194410.0, "step": 3485 }, { "epoch": 1.773143438453713, "grad_norm": 0.9416306614875793, "learning_rate": 1e-05, "loss": 0.4571, "mean_token_accuracy": 0.8557838797569275, "num_tokens": 555371365.0, "step": 3486 }, { "epoch": 1.7736520854526958, "grad_norm": 0.9121522903442383, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8585523366928101, "num_tokens": 555531745.0, "step": 3487 }, { "epoch": 1.7741607324516786, "grad_norm": 0.9331509470939636, "learning_rate": 1e-05, "loss": 0.4758, "mean_token_accuracy": 0.8486753702163696, "num_tokens": 555695553.0, "step": 3488 }, { "epoch": 1.7746693794506614, "grad_norm": 0.9343940615653992, "learning_rate": 1e-05, "loss": 0.491, "mean_token_accuracy": 0.8444896936416626, "num_tokens": 555857378.0, "step": 3489 }, { "epoch": 1.775178026449644, "grad_norm": 0.9072713255882263, "learning_rate": 1e-05, "loss": 0.4805, "mean_token_accuracy": 0.8467872142791748, "num_tokens": 556006070.0, "step": 3490 }, { "epoch": 1.7756866734486265, "grad_norm": 0.8870847225189209, "learning_rate": 1e-05, "loss": 0.4462, "mean_token_accuracy": 0.8569936156272888, "num_tokens": 556165767.0, "step": 3491 }, { "epoch": 1.7761953204476093, "grad_norm": 0.9200275540351868, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8556621074676514, "num_tokens": 556324330.0, "step": 3492 }, { "epoch": 1.776703967446592, "grad_norm": 0.9575535655021667, "learning_rate": 1e-05, "loss": 0.4426, "mean_token_accuracy": 0.8579325675964355, "num_tokens": 556483980.0, "step": 3493 }, { "epoch": 1.7772126144455749, "grad_norm": 0.995994508266449, "learning_rate": 1e-05, "loss": 0.4761, "mean_token_accuracy": 0.8466376662254333, "num_tokens": 556639463.0, "step": 3494 }, { "epoch": 1.7777212614445574, "grad_norm": 0.9333937764167786, "learning_rate": 1e-05, "loss": 0.5029, "mean_token_accuracy": 0.840882420539856, "num_tokens": 556796632.0, "step": 3495 }, { "epoch": 1.7782299084435402, "grad_norm": 0.9078387022018433, "learning_rate": 1e-05, "loss": 0.4568, "mean_token_accuracy": 0.854224681854248, "num_tokens": 556964733.0, "step": 3496 }, { "epoch": 1.7787385554425228, "grad_norm": 0.9770086407661438, "learning_rate": 1e-05, "loss": 0.463, "mean_token_accuracy": 0.8511612415313721, "num_tokens": 557132319.0, "step": 3497 }, { "epoch": 1.7792472024415056, "grad_norm": 0.9639984369277954, "learning_rate": 1e-05, "loss": 0.4995, "mean_token_accuracy": 0.8427587747573853, "num_tokens": 557293193.0, "step": 3498 }, { "epoch": 1.7797558494404884, "grad_norm": 0.9718248844146729, "learning_rate": 1e-05, "loss": 0.455, "mean_token_accuracy": 0.8534946441650391, "num_tokens": 557457261.0, "step": 3499 }, { "epoch": 1.7802644964394712, "grad_norm": 0.8903071284294128, "learning_rate": 1e-05, "loss": 0.4606, "mean_token_accuracy": 0.8538823127746582, "num_tokens": 557609634.0, "step": 3500 }, { "epoch": 1.7807731434384537, "grad_norm": 0.9682722091674805, "learning_rate": 1e-05, "loss": 0.461, "mean_token_accuracy": 0.8518296480178833, "num_tokens": 557760431.0, "step": 3501 }, { "epoch": 1.7812817904374363, "grad_norm": 0.9154918193817139, "learning_rate": 1e-05, "loss": 0.451, "mean_token_accuracy": 0.8548795580863953, "num_tokens": 557933267.0, "step": 3502 }, { "epoch": 1.781790437436419, "grad_norm": 0.9262132048606873, "learning_rate": 1e-05, "loss": 0.4929, "mean_token_accuracy": 0.8442500233650208, "num_tokens": 558081783.0, "step": 3503 }, { "epoch": 1.7822990844354019, "grad_norm": 1.055605411529541, "learning_rate": 1e-05, "loss": 0.5099, "mean_token_accuracy": 0.8384379744529724, "num_tokens": 558238439.0, "step": 3504 }, { "epoch": 1.7828077314343846, "grad_norm": 0.9379321336746216, "learning_rate": 1e-05, "loss": 0.4678, "mean_token_accuracy": 0.8530690670013428, "num_tokens": 558406721.0, "step": 3505 }, { "epoch": 1.7833163784333672, "grad_norm": 0.8967541456222534, "learning_rate": 1e-05, "loss": 0.4885, "mean_token_accuracy": 0.8444415330886841, "num_tokens": 558564521.0, "step": 3506 }, { "epoch": 1.78382502543235, "grad_norm": 0.9698563814163208, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8436890840530396, "num_tokens": 558715095.0, "step": 3507 }, { "epoch": 1.7843336724313326, "grad_norm": 0.9144454598426819, "learning_rate": 1e-05, "loss": 0.4926, "mean_token_accuracy": 0.8440608978271484, "num_tokens": 558870711.0, "step": 3508 }, { "epoch": 1.7848423194303153, "grad_norm": 1.0247725248336792, "learning_rate": 1e-05, "loss": 0.4907, "mean_token_accuracy": 0.8435952663421631, "num_tokens": 559037633.0, "step": 3509 }, { "epoch": 1.7853509664292981, "grad_norm": 0.9614184498786926, "learning_rate": 1e-05, "loss": 0.4447, "mean_token_accuracy": 0.8556063175201416, "num_tokens": 559184486.0, "step": 3510 }, { "epoch": 1.785859613428281, "grad_norm": 0.9238786101341248, "learning_rate": 1e-05, "loss": 0.4756, "mean_token_accuracy": 0.848760724067688, "num_tokens": 559340860.0, "step": 3511 }, { "epoch": 1.7863682604272635, "grad_norm": 1.0116618871688843, "learning_rate": 1e-05, "loss": 0.4517, "mean_token_accuracy": 0.8541946411132812, "num_tokens": 559487831.0, "step": 3512 }, { "epoch": 1.786876907426246, "grad_norm": 1.0261667966842651, "learning_rate": 1e-05, "loss": 0.478, "mean_token_accuracy": 0.8468774557113647, "num_tokens": 559650109.0, "step": 3513 }, { "epoch": 1.7873855544252288, "grad_norm": 0.8953947424888611, "learning_rate": 1e-05, "loss": 0.4339, "mean_token_accuracy": 0.8612636923789978, "num_tokens": 559805689.0, "step": 3514 }, { "epoch": 1.7878942014242116, "grad_norm": 0.8966084122657776, "learning_rate": 1e-05, "loss": 0.4536, "mean_token_accuracy": 0.8546212911605835, "num_tokens": 559966866.0, "step": 3515 }, { "epoch": 1.7884028484231944, "grad_norm": 0.8931212425231934, "learning_rate": 1e-05, "loss": 0.4357, "mean_token_accuracy": 0.8599777221679688, "num_tokens": 560117177.0, "step": 3516 }, { "epoch": 1.788911495422177, "grad_norm": 0.9382513761520386, "learning_rate": 1e-05, "loss": 0.4479, "mean_token_accuracy": 0.8547048568725586, "num_tokens": 560278984.0, "step": 3517 }, { "epoch": 1.7894201424211598, "grad_norm": 0.879797101020813, "learning_rate": 1e-05, "loss": 0.4968, "mean_token_accuracy": 0.8421691060066223, "num_tokens": 560434600.0, "step": 3518 }, { "epoch": 1.7899287894201423, "grad_norm": 0.9176720976829529, "learning_rate": 1e-05, "loss": 0.4718, "mean_token_accuracy": 0.8502687215805054, "num_tokens": 560598239.0, "step": 3519 }, { "epoch": 1.7904374364191251, "grad_norm": 0.9511398077011108, "learning_rate": 1e-05, "loss": 0.4632, "mean_token_accuracy": 0.8514447212219238, "num_tokens": 560752929.0, "step": 3520 }, { "epoch": 1.790946083418108, "grad_norm": 0.8993409872055054, "learning_rate": 1e-05, "loss": 0.456, "mean_token_accuracy": 0.855494499206543, "num_tokens": 560905046.0, "step": 3521 }, { "epoch": 1.7914547304170907, "grad_norm": 1.001287579536438, "learning_rate": 1e-05, "loss": 0.4969, "mean_token_accuracy": 0.8412386178970337, "num_tokens": 561057758.0, "step": 3522 }, { "epoch": 1.7919633774160733, "grad_norm": 0.8689597249031067, "learning_rate": 1e-05, "loss": 0.4605, "mean_token_accuracy": 0.8530293107032776, "num_tokens": 561215568.0, "step": 3523 }, { "epoch": 1.7924720244150558, "grad_norm": 1.0701673030853271, "learning_rate": 1e-05, "loss": 0.4498, "mean_token_accuracy": 0.8540624380111694, "num_tokens": 561364341.0, "step": 3524 }, { "epoch": 1.7929806714140386, "grad_norm": 0.9774520397186279, "learning_rate": 1e-05, "loss": 0.4625, "mean_token_accuracy": 0.8525039553642273, "num_tokens": 561511545.0, "step": 3525 }, { "epoch": 1.7934893184130214, "grad_norm": 0.8649917840957642, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8515691757202148, "num_tokens": 561682192.0, "step": 3526 }, { "epoch": 1.7939979654120042, "grad_norm": 1.0334358215332031, "learning_rate": 1e-05, "loss": 0.5196, "mean_token_accuracy": 0.8361889123916626, "num_tokens": 561837097.0, "step": 3527 }, { "epoch": 1.7945066124109867, "grad_norm": 0.9365693926811218, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8510040640830994, "num_tokens": 561984366.0, "step": 3528 }, { "epoch": 1.7950152594099695, "grad_norm": 0.933411180973053, "learning_rate": 1e-05, "loss": 0.4774, "mean_token_accuracy": 0.8487666845321655, "num_tokens": 562152993.0, "step": 3529 }, { "epoch": 1.795523906408952, "grad_norm": 0.8894389867782593, "learning_rate": 1e-05, "loss": 0.4619, "mean_token_accuracy": 0.8522475957870483, "num_tokens": 562319171.0, "step": 3530 }, { "epoch": 1.7960325534079349, "grad_norm": 0.9173430800437927, "learning_rate": 1e-05, "loss": 0.486, "mean_token_accuracy": 0.8453357219696045, "num_tokens": 562482117.0, "step": 3531 }, { "epoch": 1.7965412004069177, "grad_norm": 0.9890989661216736, "learning_rate": 1e-05, "loss": 0.4921, "mean_token_accuracy": 0.8463920950889587, "num_tokens": 562649718.0, "step": 3532 }, { "epoch": 1.7970498474059005, "grad_norm": 0.8388110995292664, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8532881736755371, "num_tokens": 562813763.0, "step": 3533 }, { "epoch": 1.797558494404883, "grad_norm": 0.9548352360725403, "learning_rate": 1e-05, "loss": 0.4609, "mean_token_accuracy": 0.8530546426773071, "num_tokens": 562962769.0, "step": 3534 }, { "epoch": 1.7980671414038656, "grad_norm": 0.9055401682853699, "learning_rate": 1e-05, "loss": 0.4676, "mean_token_accuracy": 0.8494167327880859, "num_tokens": 563125097.0, "step": 3535 }, { "epoch": 1.7985757884028484, "grad_norm": 0.8799210786819458, "learning_rate": 1e-05, "loss": 0.4839, "mean_token_accuracy": 0.8458794355392456, "num_tokens": 563281657.0, "step": 3536 }, { "epoch": 1.7990844354018312, "grad_norm": 1.0961902141571045, "learning_rate": 1e-05, "loss": 0.4553, "mean_token_accuracy": 0.8538365364074707, "num_tokens": 563438677.0, "step": 3537 }, { "epoch": 1.799593082400814, "grad_norm": 1.0078305006027222, "learning_rate": 1e-05, "loss": 0.4888, "mean_token_accuracy": 0.8463952541351318, "num_tokens": 563599442.0, "step": 3538 }, { "epoch": 1.8001017293997965, "grad_norm": 0.9052742123603821, "learning_rate": 1e-05, "loss": 0.4857, "mean_token_accuracy": 0.8461458683013916, "num_tokens": 563766967.0, "step": 3539 }, { "epoch": 1.8006103763987793, "grad_norm": 0.9112515449523926, "learning_rate": 1e-05, "loss": 0.4644, "mean_token_accuracy": 0.8512401580810547, "num_tokens": 563938038.0, "step": 3540 }, { "epoch": 1.8011190233977619, "grad_norm": 0.9067733883857727, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.856181263923645, "num_tokens": 564094217.0, "step": 3541 }, { "epoch": 1.8016276703967447, "grad_norm": 0.9368350505828857, "learning_rate": 1e-05, "loss": 0.4464, "mean_token_accuracy": 0.8560757040977478, "num_tokens": 564249675.0, "step": 3542 }, { "epoch": 1.8021363173957274, "grad_norm": 0.9412768483161926, "learning_rate": 1e-05, "loss": 0.4787, "mean_token_accuracy": 0.8480315208435059, "num_tokens": 564412107.0, "step": 3543 }, { "epoch": 1.8026449643947102, "grad_norm": 0.9945979118347168, "learning_rate": 1e-05, "loss": 0.4632, "mean_token_accuracy": 0.8529903888702393, "num_tokens": 564549442.0, "step": 3544 }, { "epoch": 1.8031536113936928, "grad_norm": 1.0442930459976196, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8548238277435303, "num_tokens": 564712875.0, "step": 3545 }, { "epoch": 1.8036622583926754, "grad_norm": 0.9492664337158203, "learning_rate": 1e-05, "loss": 0.4592, "mean_token_accuracy": 0.8510094285011292, "num_tokens": 564879892.0, "step": 3546 }, { "epoch": 1.8041709053916581, "grad_norm": 1.9171106815338135, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8533254861831665, "num_tokens": 565048939.0, "step": 3547 }, { "epoch": 1.804679552390641, "grad_norm": 0.9038177132606506, "learning_rate": 1e-05, "loss": 0.48, "mean_token_accuracy": 0.8476209044456482, "num_tokens": 565210219.0, "step": 3548 }, { "epoch": 1.8051881993896237, "grad_norm": 0.9310301542282104, "learning_rate": 1e-05, "loss": 0.453, "mean_token_accuracy": 0.8554023504257202, "num_tokens": 565365789.0, "step": 3549 }, { "epoch": 1.8056968463886063, "grad_norm": 1.6419590711593628, "learning_rate": 1e-05, "loss": 0.4635, "mean_token_accuracy": 0.853683352470398, "num_tokens": 565526228.0, "step": 3550 }, { "epoch": 1.806205493387589, "grad_norm": 0.9045491814613342, "learning_rate": 1e-05, "loss": 0.4818, "mean_token_accuracy": 0.8482503294944763, "num_tokens": 565690503.0, "step": 3551 }, { "epoch": 1.8067141403865716, "grad_norm": 0.8996527194976807, "learning_rate": 1e-05, "loss": 0.497, "mean_token_accuracy": 0.8439865708351135, "num_tokens": 565852900.0, "step": 3552 }, { "epoch": 1.8072227873855544, "grad_norm": 0.8703089356422424, "learning_rate": 1e-05, "loss": 0.4649, "mean_token_accuracy": 0.8514032959938049, "num_tokens": 566016393.0, "step": 3553 }, { "epoch": 1.8077314343845372, "grad_norm": 0.9336738586425781, "learning_rate": 1e-05, "loss": 0.4574, "mean_token_accuracy": 0.8547062873840332, "num_tokens": 566180405.0, "step": 3554 }, { "epoch": 1.80824008138352, "grad_norm": 0.9066407084465027, "learning_rate": 1e-05, "loss": 0.4934, "mean_token_accuracy": 0.8425087928771973, "num_tokens": 566345044.0, "step": 3555 }, { "epoch": 1.8087487283825026, "grad_norm": 0.8604444265365601, "learning_rate": 1e-05, "loss": 0.4663, "mean_token_accuracy": 0.8514249324798584, "num_tokens": 566504302.0, "step": 3556 }, { "epoch": 1.8092573753814851, "grad_norm": 1.1371344327926636, "learning_rate": 1e-05, "loss": 0.4911, "mean_token_accuracy": 0.8447737097740173, "num_tokens": 566670105.0, "step": 3557 }, { "epoch": 1.809766022380468, "grad_norm": 0.9432363510131836, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8473440408706665, "num_tokens": 566830753.0, "step": 3558 }, { "epoch": 1.8102746693794507, "grad_norm": 0.900536835193634, "learning_rate": 1e-05, "loss": 0.4277, "mean_token_accuracy": 0.860796332359314, "num_tokens": 566970910.0, "step": 3559 }, { "epoch": 1.8107833163784335, "grad_norm": 1.056198239326477, "learning_rate": 1e-05, "loss": 0.4783, "mean_token_accuracy": 0.8483229875564575, "num_tokens": 567125149.0, "step": 3560 }, { "epoch": 1.811291963377416, "grad_norm": 0.8736128807067871, "learning_rate": 1e-05, "loss": 0.5018, "mean_token_accuracy": 0.8416284322738647, "num_tokens": 567287899.0, "step": 3561 }, { "epoch": 1.8118006103763988, "grad_norm": 0.8953394889831543, "learning_rate": 1e-05, "loss": 0.448, "mean_token_accuracy": 0.8561390042304993, "num_tokens": 567439219.0, "step": 3562 }, { "epoch": 1.8123092573753814, "grad_norm": 0.9433287978172302, "learning_rate": 1e-05, "loss": 0.4534, "mean_token_accuracy": 0.8540156483650208, "num_tokens": 567603590.0, "step": 3563 }, { "epoch": 1.8128179043743642, "grad_norm": 0.9658059477806091, "learning_rate": 1e-05, "loss": 0.4791, "mean_token_accuracy": 0.8478438854217529, "num_tokens": 567757245.0, "step": 3564 }, { "epoch": 1.813326551373347, "grad_norm": 0.958234429359436, "learning_rate": 1e-05, "loss": 0.4642, "mean_token_accuracy": 0.8501091599464417, "num_tokens": 567914473.0, "step": 3565 }, { "epoch": 1.8138351983723298, "grad_norm": 1.0078924894332886, "learning_rate": 1e-05, "loss": 0.4494, "mean_token_accuracy": 0.8547240495681763, "num_tokens": 568063203.0, "step": 3566 }, { "epoch": 1.8143438453713123, "grad_norm": 0.9522838592529297, "learning_rate": 1e-05, "loss": 0.4676, "mean_token_accuracy": 0.8510032296180725, "num_tokens": 568219189.0, "step": 3567 }, { "epoch": 1.814852492370295, "grad_norm": 0.9724707007408142, "learning_rate": 1e-05, "loss": 0.477, "mean_token_accuracy": 0.8469789028167725, "num_tokens": 568375485.0, "step": 3568 }, { "epoch": 1.8153611393692777, "grad_norm": 0.9488127827644348, "learning_rate": 1e-05, "loss": 0.5014, "mean_token_accuracy": 0.8410286903381348, "num_tokens": 568532599.0, "step": 3569 }, { "epoch": 1.8158697863682605, "grad_norm": 0.8995364904403687, "learning_rate": 1e-05, "loss": 0.4487, "mean_token_accuracy": 0.8555278182029724, "num_tokens": 568704776.0, "step": 3570 }, { "epoch": 1.8163784333672433, "grad_norm": 0.9317036867141724, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.855113685131073, "num_tokens": 568876940.0, "step": 3571 }, { "epoch": 1.8168870803662258, "grad_norm": 1.001330018043518, "learning_rate": 1e-05, "loss": 0.4664, "mean_token_accuracy": 0.8509033918380737, "num_tokens": 569029468.0, "step": 3572 }, { "epoch": 1.8173957273652084, "grad_norm": 0.9407387375831604, "learning_rate": 1e-05, "loss": 0.4605, "mean_token_accuracy": 0.8527392148971558, "num_tokens": 569180360.0, "step": 3573 }, { "epoch": 1.8179043743641912, "grad_norm": 0.9812650084495544, "learning_rate": 1e-05, "loss": 0.4849, "mean_token_accuracy": 0.8459858894348145, "num_tokens": 569332534.0, "step": 3574 }, { "epoch": 1.818413021363174, "grad_norm": 0.9319610595703125, "learning_rate": 1e-05, "loss": 0.4346, "mean_token_accuracy": 0.8603183031082153, "num_tokens": 569494821.0, "step": 3575 }, { "epoch": 1.8189216683621567, "grad_norm": 0.8834575414657593, "learning_rate": 1e-05, "loss": 0.4783, "mean_token_accuracy": 0.8467259407043457, "num_tokens": 569647573.0, "step": 3576 }, { "epoch": 1.8194303153611395, "grad_norm": 0.9952992796897888, "learning_rate": 1e-05, "loss": 0.4734, "mean_token_accuracy": 0.8493619561195374, "num_tokens": 569797825.0, "step": 3577 }, { "epoch": 1.819938962360122, "grad_norm": 0.947803795337677, "learning_rate": 1e-05, "loss": 0.4603, "mean_token_accuracy": 0.8515059351921082, "num_tokens": 569960390.0, "step": 3578 }, { "epoch": 1.8204476093591047, "grad_norm": 0.9164172410964966, "learning_rate": 1e-05, "loss": 0.4571, "mean_token_accuracy": 0.8544726371765137, "num_tokens": 570118035.0, "step": 3579 }, { "epoch": 1.8209562563580874, "grad_norm": 0.9546960592269897, "learning_rate": 1e-05, "loss": 0.4986, "mean_token_accuracy": 0.8426315784454346, "num_tokens": 570276992.0, "step": 3580 }, { "epoch": 1.8214649033570702, "grad_norm": 0.9669478535652161, "learning_rate": 1e-05, "loss": 0.4513, "mean_token_accuracy": 0.8550679087638855, "num_tokens": 570437162.0, "step": 3581 }, { "epoch": 1.821973550356053, "grad_norm": 0.8663604259490967, "learning_rate": 1e-05, "loss": 0.4708, "mean_token_accuracy": 0.85112065076828, "num_tokens": 570601038.0, "step": 3582 }, { "epoch": 1.8224821973550356, "grad_norm": 1.0352712869644165, "learning_rate": 1e-05, "loss": 0.4778, "mean_token_accuracy": 0.8478525876998901, "num_tokens": 570764320.0, "step": 3583 }, { "epoch": 1.8229908443540181, "grad_norm": 0.8922290205955505, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8514959812164307, "num_tokens": 570927501.0, "step": 3584 }, { "epoch": 1.823499491353001, "grad_norm": 0.937830924987793, "learning_rate": 1e-05, "loss": 0.4735, "mean_token_accuracy": 0.8487762808799744, "num_tokens": 571072594.0, "step": 3585 }, { "epoch": 1.8240081383519837, "grad_norm": 0.9236015677452087, "learning_rate": 1e-05, "loss": 0.47, "mean_token_accuracy": 0.849860429763794, "num_tokens": 571225760.0, "step": 3586 }, { "epoch": 1.8245167853509665, "grad_norm": 0.9017430543899536, "learning_rate": 1e-05, "loss": 0.4913, "mean_token_accuracy": 0.8431181311607361, "num_tokens": 571383451.0, "step": 3587 }, { "epoch": 1.8250254323499493, "grad_norm": 0.9215783476829529, "learning_rate": 1e-05, "loss": 0.4537, "mean_token_accuracy": 0.8537969589233398, "num_tokens": 571536763.0, "step": 3588 }, { "epoch": 1.8255340793489319, "grad_norm": 0.9324192404747009, "learning_rate": 1e-05, "loss": 0.4644, "mean_token_accuracy": 0.8512316942214966, "num_tokens": 571709893.0, "step": 3589 }, { "epoch": 1.8260427263479144, "grad_norm": 0.9075586199760437, "learning_rate": 1e-05, "loss": 0.4368, "mean_token_accuracy": 0.8588082790374756, "num_tokens": 571865425.0, "step": 3590 }, { "epoch": 1.8265513733468972, "grad_norm": 0.926460325717926, "learning_rate": 1e-05, "loss": 0.4549, "mean_token_accuracy": 0.8537696599960327, "num_tokens": 572024420.0, "step": 3591 }, { "epoch": 1.82706002034588, "grad_norm": 0.9003691673278809, "learning_rate": 1e-05, "loss": 0.478, "mean_token_accuracy": 0.8481897711753845, "num_tokens": 572184872.0, "step": 3592 }, { "epoch": 1.8275686673448628, "grad_norm": 0.9034071564674377, "learning_rate": 1e-05, "loss": 0.4808, "mean_token_accuracy": 0.8484635353088379, "num_tokens": 572353776.0, "step": 3593 }, { "epoch": 1.8280773143438453, "grad_norm": 0.8958525061607361, "learning_rate": 1e-05, "loss": 0.4622, "mean_token_accuracy": 0.8530550003051758, "num_tokens": 572506248.0, "step": 3594 }, { "epoch": 1.828585961342828, "grad_norm": 0.9878848195075989, "learning_rate": 1e-05, "loss": 0.5038, "mean_token_accuracy": 0.8405113220214844, "num_tokens": 572672814.0, "step": 3595 }, { "epoch": 1.8290946083418107, "grad_norm": 1.085959553718567, "learning_rate": 1e-05, "loss": 0.4639, "mean_token_accuracy": 0.8505579233169556, "num_tokens": 572820670.0, "step": 3596 }, { "epoch": 1.8296032553407935, "grad_norm": 0.9331508278846741, "learning_rate": 1e-05, "loss": 0.4324, "mean_token_accuracy": 0.8600931167602539, "num_tokens": 572974866.0, "step": 3597 }, { "epoch": 1.8301119023397763, "grad_norm": 0.917675793170929, "learning_rate": 1e-05, "loss": 0.4578, "mean_token_accuracy": 0.8545408248901367, "num_tokens": 573129521.0, "step": 3598 }, { "epoch": 1.830620549338759, "grad_norm": 3.0825843811035156, "learning_rate": 1e-05, "loss": 0.4918, "mean_token_accuracy": 0.8449922204017639, "num_tokens": 573296454.0, "step": 3599 }, { "epoch": 1.8311291963377416, "grad_norm": 0.9181584715843201, "learning_rate": 1e-05, "loss": 0.4642, "mean_token_accuracy": 0.8511255979537964, "num_tokens": 573458787.0, "step": 3600 }, { "epoch": 1.8316378433367242, "grad_norm": 0.8957834839820862, "learning_rate": 1e-05, "loss": 0.495, "mean_token_accuracy": 0.8457474708557129, "num_tokens": 573619962.0, "step": 3601 }, { "epoch": 1.832146490335707, "grad_norm": 0.9297384023666382, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8491976261138916, "num_tokens": 573794196.0, "step": 3602 }, { "epoch": 1.8326551373346898, "grad_norm": 0.906842052936554, "learning_rate": 1e-05, "loss": 0.4414, "mean_token_accuracy": 0.8586156368255615, "num_tokens": 573952177.0, "step": 3603 }, { "epoch": 1.8331637843336726, "grad_norm": 0.9280809164047241, "learning_rate": 1e-05, "loss": 0.4908, "mean_token_accuracy": 0.8437187671661377, "num_tokens": 574118594.0, "step": 3604 }, { "epoch": 1.8336724313326551, "grad_norm": 0.9042379856109619, "learning_rate": 1e-05, "loss": 0.4653, "mean_token_accuracy": 0.850511372089386, "num_tokens": 574274813.0, "step": 3605 }, { "epoch": 1.8341810783316377, "grad_norm": 1.0138126611709595, "learning_rate": 1e-05, "loss": 0.4396, "mean_token_accuracy": 0.8595215082168579, "num_tokens": 574429910.0, "step": 3606 }, { "epoch": 1.8346897253306205, "grad_norm": 0.9278191924095154, "learning_rate": 1e-05, "loss": 0.4736, "mean_token_accuracy": 0.8506954908370972, "num_tokens": 574598948.0, "step": 3607 }, { "epoch": 1.8351983723296033, "grad_norm": 1.0183500051498413, "learning_rate": 1e-05, "loss": 0.4376, "mean_token_accuracy": 0.8583874702453613, "num_tokens": 574750525.0, "step": 3608 }, { "epoch": 1.835707019328586, "grad_norm": 0.9347324967384338, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8530687689781189, "num_tokens": 574920331.0, "step": 3609 }, { "epoch": 1.8362156663275688, "grad_norm": 1.1582096815109253, "learning_rate": 1e-05, "loss": 0.4834, "mean_token_accuracy": 0.8469882011413574, "num_tokens": 575086614.0, "step": 3610 }, { "epoch": 1.8367243133265514, "grad_norm": 1.0273844003677368, "learning_rate": 1e-05, "loss": 0.4602, "mean_token_accuracy": 0.8511192202568054, "num_tokens": 575243072.0, "step": 3611 }, { "epoch": 1.837232960325534, "grad_norm": 0.8761646151542664, "learning_rate": 1e-05, "loss": 0.445, "mean_token_accuracy": 0.8564286828041077, "num_tokens": 575407175.0, "step": 3612 }, { "epoch": 1.8377416073245167, "grad_norm": 0.9320701956748962, "learning_rate": 1e-05, "loss": 0.454, "mean_token_accuracy": 0.8560305237770081, "num_tokens": 575560729.0, "step": 3613 }, { "epoch": 1.8382502543234995, "grad_norm": 0.9284968376159668, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.8568035364151001, "num_tokens": 575734331.0, "step": 3614 }, { "epoch": 1.8387589013224823, "grad_norm": 0.9640108346939087, "learning_rate": 1e-05, "loss": 0.4764, "mean_token_accuracy": 0.8468776345252991, "num_tokens": 575903732.0, "step": 3615 }, { "epoch": 1.8392675483214649, "grad_norm": 0.9263799786567688, "learning_rate": 1e-05, "loss": 0.4829, "mean_token_accuracy": 0.8463630676269531, "num_tokens": 576048212.0, "step": 3616 }, { "epoch": 1.8397761953204474, "grad_norm": 0.9382118582725525, "learning_rate": 1e-05, "loss": 0.4596, "mean_token_accuracy": 0.8531448841094971, "num_tokens": 576207321.0, "step": 3617 }, { "epoch": 1.8402848423194302, "grad_norm": 1.1217350959777832, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.840814471244812, "num_tokens": 576379541.0, "step": 3618 }, { "epoch": 1.840793489318413, "grad_norm": 0.875610888004303, "learning_rate": 1e-05, "loss": 0.439, "mean_token_accuracy": 0.8584346771240234, "num_tokens": 576542203.0, "step": 3619 }, { "epoch": 1.8413021363173958, "grad_norm": 0.9747274518013, "learning_rate": 1e-05, "loss": 0.4954, "mean_token_accuracy": 0.8422902822494507, "num_tokens": 576693938.0, "step": 3620 }, { "epoch": 1.8418107833163786, "grad_norm": 1.030494213104248, "learning_rate": 1e-05, "loss": 0.4637, "mean_token_accuracy": 0.8514543771743774, "num_tokens": 576848972.0, "step": 3621 }, { "epoch": 1.8423194303153612, "grad_norm": 0.9475911259651184, "learning_rate": 1e-05, "loss": 0.4612, "mean_token_accuracy": 0.851582407951355, "num_tokens": 576996432.0, "step": 3622 }, { "epoch": 1.8428280773143437, "grad_norm": 0.9580101370811462, "learning_rate": 1e-05, "loss": 0.5064, "mean_token_accuracy": 0.8397974967956543, "num_tokens": 577165777.0, "step": 3623 }, { "epoch": 1.8433367243133265, "grad_norm": 0.9337804913520813, "learning_rate": 1e-05, "loss": 0.4639, "mean_token_accuracy": 0.8514394760131836, "num_tokens": 577320221.0, "step": 3624 }, { "epoch": 1.8438453713123093, "grad_norm": 0.8653109669685364, "learning_rate": 1e-05, "loss": 0.4743, "mean_token_accuracy": 0.8486281633377075, "num_tokens": 577482320.0, "step": 3625 }, { "epoch": 1.844354018311292, "grad_norm": 2.005347967147827, "learning_rate": 1e-05, "loss": 0.4726, "mean_token_accuracy": 0.8509484529495239, "num_tokens": 577643317.0, "step": 3626 }, { "epoch": 1.8448626653102747, "grad_norm": 1.2213129997253418, "learning_rate": 1e-05, "loss": 0.463, "mean_token_accuracy": 0.8512935638427734, "num_tokens": 577797333.0, "step": 3627 }, { "epoch": 1.8453713123092572, "grad_norm": 0.9152913689613342, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.852689802646637, "num_tokens": 577962353.0, "step": 3628 }, { "epoch": 1.84587995930824, "grad_norm": 0.9832363128662109, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.849490225315094, "num_tokens": 578124756.0, "step": 3629 }, { "epoch": 1.8463886063072228, "grad_norm": 0.9140488505363464, "learning_rate": 1e-05, "loss": 0.4898, "mean_token_accuracy": 0.8460453748703003, "num_tokens": 578280116.0, "step": 3630 }, { "epoch": 1.8468972533062056, "grad_norm": 1.0018895864486694, "learning_rate": 1e-05, "loss": 0.4624, "mean_token_accuracy": 0.8521567583084106, "num_tokens": 578428148.0, "step": 3631 }, { "epoch": 1.8474059003051884, "grad_norm": 0.8730851411819458, "learning_rate": 1e-05, "loss": 0.4659, "mean_token_accuracy": 0.8521086573600769, "num_tokens": 578587724.0, "step": 3632 }, { "epoch": 1.847914547304171, "grad_norm": 0.9751337766647339, "learning_rate": 1e-05, "loss": 0.4795, "mean_token_accuracy": 0.8483328223228455, "num_tokens": 578744811.0, "step": 3633 }, { "epoch": 1.8484231943031535, "grad_norm": 0.952669084072113, "learning_rate": 1e-05, "loss": 0.4775, "mean_token_accuracy": 0.8489563465118408, "num_tokens": 578907271.0, "step": 3634 }, { "epoch": 1.8489318413021363, "grad_norm": 0.9171006083488464, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8486084342002869, "num_tokens": 579067155.0, "step": 3635 }, { "epoch": 1.849440488301119, "grad_norm": 1.0136206150054932, "learning_rate": 1e-05, "loss": 0.4782, "mean_token_accuracy": 0.8471975326538086, "num_tokens": 579229124.0, "step": 3636 }, { "epoch": 1.8499491353001019, "grad_norm": 0.9842079877853394, "learning_rate": 1e-05, "loss": 0.4558, "mean_token_accuracy": 0.8547145128250122, "num_tokens": 579380265.0, "step": 3637 }, { "epoch": 1.8504577822990844, "grad_norm": 0.8879310488700867, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8601967096328735, "num_tokens": 579541087.0, "step": 3638 }, { "epoch": 1.850966429298067, "grad_norm": 0.9165470004081726, "learning_rate": 1e-05, "loss": 0.4499, "mean_token_accuracy": 0.8543002009391785, "num_tokens": 579691325.0, "step": 3639 }, { "epoch": 1.8514750762970498, "grad_norm": 1.1003382205963135, "learning_rate": 1e-05, "loss": 0.5004, "mean_token_accuracy": 0.8400880098342896, "num_tokens": 579853781.0, "step": 3640 }, { "epoch": 1.8519837232960326, "grad_norm": 0.9798530340194702, "learning_rate": 1e-05, "loss": 0.4806, "mean_token_accuracy": 0.8484410047531128, "num_tokens": 580011285.0, "step": 3641 }, { "epoch": 1.8524923702950153, "grad_norm": 0.8885270357131958, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8579368591308594, "num_tokens": 580175507.0, "step": 3642 }, { "epoch": 1.8530010172939981, "grad_norm": 0.8953379988670349, "learning_rate": 1e-05, "loss": 0.4748, "mean_token_accuracy": 0.8487176895141602, "num_tokens": 580334727.0, "step": 3643 }, { "epoch": 1.8535096642929807, "grad_norm": 1.237749457359314, "learning_rate": 1e-05, "loss": 0.4699, "mean_token_accuracy": 0.8505109548568726, "num_tokens": 580499293.0, "step": 3644 }, { "epoch": 1.8540183112919633, "grad_norm": 1.0147404670715332, "learning_rate": 1e-05, "loss": 0.4476, "mean_token_accuracy": 0.8560020327568054, "num_tokens": 580643374.0, "step": 3645 }, { "epoch": 1.854526958290946, "grad_norm": 0.908033549785614, "learning_rate": 1e-05, "loss": 0.501, "mean_token_accuracy": 0.8429970741271973, "num_tokens": 580805832.0, "step": 3646 }, { "epoch": 1.8550356052899288, "grad_norm": 0.9476537704467773, "learning_rate": 1e-05, "loss": 0.4337, "mean_token_accuracy": 0.8594100475311279, "num_tokens": 580956923.0, "step": 3647 }, { "epoch": 1.8555442522889116, "grad_norm": 0.925979495048523, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.8583248853683472, "num_tokens": 581107989.0, "step": 3648 }, { "epoch": 1.8560528992878942, "grad_norm": 0.9476032257080078, "learning_rate": 1e-05, "loss": 0.4507, "mean_token_accuracy": 0.8557018041610718, "num_tokens": 581257593.0, "step": 3649 }, { "epoch": 1.8565615462868768, "grad_norm": 0.9103021621704102, "learning_rate": 1e-05, "loss": 0.4687, "mean_token_accuracy": 0.8508168458938599, "num_tokens": 581423977.0, "step": 3650 }, { "epoch": 1.8570701932858595, "grad_norm": 0.8881806135177612, "learning_rate": 1e-05, "loss": 0.465, "mean_token_accuracy": 0.8523017764091492, "num_tokens": 581587347.0, "step": 3651 }, { "epoch": 1.8575788402848423, "grad_norm": 1.0127854347229004, "learning_rate": 1e-05, "loss": 0.5131, "mean_token_accuracy": 0.8396639823913574, "num_tokens": 581742263.0, "step": 3652 }, { "epoch": 1.8580874872838251, "grad_norm": 0.9147594571113586, "learning_rate": 1e-05, "loss": 0.4786, "mean_token_accuracy": 0.8464944958686829, "num_tokens": 581907547.0, "step": 3653 }, { "epoch": 1.8585961342828077, "grad_norm": 0.919609785079956, "learning_rate": 1e-05, "loss": 0.4675, "mean_token_accuracy": 0.850050151348114, "num_tokens": 582060741.0, "step": 3654 }, { "epoch": 1.8591047812817905, "grad_norm": 0.9043321013450623, "learning_rate": 1e-05, "loss": 0.4559, "mean_token_accuracy": 0.8531760573387146, "num_tokens": 582214993.0, "step": 3655 }, { "epoch": 1.859613428280773, "grad_norm": 0.8717383146286011, "learning_rate": 1e-05, "loss": 0.4703, "mean_token_accuracy": 0.8501156568527222, "num_tokens": 582383233.0, "step": 3656 }, { "epoch": 1.8601220752797558, "grad_norm": 0.8819180130958557, "learning_rate": 1e-05, "loss": 0.4808, "mean_token_accuracy": 0.8476206064224243, "num_tokens": 582545163.0, "step": 3657 }, { "epoch": 1.8606307222787386, "grad_norm": 0.989859938621521, "learning_rate": 1e-05, "loss": 0.5048, "mean_token_accuracy": 0.8376556634902954, "num_tokens": 582700928.0, "step": 3658 }, { "epoch": 1.8611393692777214, "grad_norm": 0.8996127247810364, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8479574918746948, "num_tokens": 582871470.0, "step": 3659 }, { "epoch": 1.861648016276704, "grad_norm": 0.8768028616905212, "learning_rate": 1e-05, "loss": 0.4492, "mean_token_accuracy": 0.8561135530471802, "num_tokens": 583019968.0, "step": 3660 }, { "epoch": 1.8621566632756865, "grad_norm": 0.9248018264770508, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8482380509376526, "num_tokens": 583166808.0, "step": 3661 }, { "epoch": 1.8626653102746693, "grad_norm": 0.9277891516685486, "learning_rate": 1e-05, "loss": 0.4679, "mean_token_accuracy": 0.8508737087249756, "num_tokens": 583318374.0, "step": 3662 }, { "epoch": 1.863173957273652, "grad_norm": 0.9190016388893127, "learning_rate": 1e-05, "loss": 0.4715, "mean_token_accuracy": 0.8488146662712097, "num_tokens": 583484043.0, "step": 3663 }, { "epoch": 1.8636826042726349, "grad_norm": 1.0272608995437622, "learning_rate": 1e-05, "loss": 0.4421, "mean_token_accuracy": 0.857718825340271, "num_tokens": 583646646.0, "step": 3664 }, { "epoch": 1.8641912512716174, "grad_norm": 0.8706140518188477, "learning_rate": 1e-05, "loss": 0.4779, "mean_token_accuracy": 0.847746729850769, "num_tokens": 583808674.0, "step": 3665 }, { "epoch": 1.8646998982706002, "grad_norm": 1.0054000616073608, "learning_rate": 1e-05, "loss": 0.4471, "mean_token_accuracy": 0.8554672002792358, "num_tokens": 583961908.0, "step": 3666 }, { "epoch": 1.8652085452695828, "grad_norm": 0.9957459568977356, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8503569960594177, "num_tokens": 584121007.0, "step": 3667 }, { "epoch": 1.8657171922685656, "grad_norm": 0.9274004101753235, "learning_rate": 1e-05, "loss": 0.4514, "mean_token_accuracy": 0.8552700877189636, "num_tokens": 584274112.0, "step": 3668 }, { "epoch": 1.8662258392675484, "grad_norm": 1.0314558744430542, "learning_rate": 1e-05, "loss": 0.4322, "mean_token_accuracy": 0.8610309362411499, "num_tokens": 584437482.0, "step": 3669 }, { "epoch": 1.8667344862665312, "grad_norm": 0.9328179955482483, "learning_rate": 1e-05, "loss": 0.4557, "mean_token_accuracy": 0.8532506227493286, "num_tokens": 584604753.0, "step": 3670 }, { "epoch": 1.8672431332655137, "grad_norm": 0.9097633361816406, "learning_rate": 1e-05, "loss": 0.4708, "mean_token_accuracy": 0.8506937026977539, "num_tokens": 584768863.0, "step": 3671 }, { "epoch": 1.8677517802644963, "grad_norm": 0.9894227385520935, "learning_rate": 1e-05, "loss": 0.4727, "mean_token_accuracy": 0.8492318391799927, "num_tokens": 584922779.0, "step": 3672 }, { "epoch": 1.868260427263479, "grad_norm": 1.0254460573196411, "learning_rate": 1e-05, "loss": 0.475, "mean_token_accuracy": 0.848928689956665, "num_tokens": 585082703.0, "step": 3673 }, { "epoch": 1.8687690742624619, "grad_norm": 0.9415755271911621, "learning_rate": 1e-05, "loss": 0.5065, "mean_token_accuracy": 0.8420661687850952, "num_tokens": 585258146.0, "step": 3674 }, { "epoch": 1.8692777212614446, "grad_norm": 0.9524638652801514, "learning_rate": 1e-05, "loss": 0.4575, "mean_token_accuracy": 0.8552057147026062, "num_tokens": 585416065.0, "step": 3675 }, { "epoch": 1.8697863682604272, "grad_norm": 0.9377021789550781, "learning_rate": 1e-05, "loss": 0.4464, "mean_token_accuracy": 0.8568128347396851, "num_tokens": 585575451.0, "step": 3676 }, { "epoch": 1.87029501525941, "grad_norm": 0.9163416028022766, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8492084741592407, "num_tokens": 585739539.0, "step": 3677 }, { "epoch": 1.8708036622583926, "grad_norm": 0.959039568901062, "learning_rate": 1e-05, "loss": 0.4717, "mean_token_accuracy": 0.8480402231216431, "num_tokens": 585907402.0, "step": 3678 }, { "epoch": 1.8713123092573754, "grad_norm": 0.8940199613571167, "learning_rate": 1e-05, "loss": 0.4905, "mean_token_accuracy": 0.8439282178878784, "num_tokens": 586083334.0, "step": 3679 }, { "epoch": 1.8718209562563581, "grad_norm": 0.9631999135017395, "learning_rate": 1e-05, "loss": 0.4664, "mean_token_accuracy": 0.8514367341995239, "num_tokens": 586248102.0, "step": 3680 }, { "epoch": 1.872329603255341, "grad_norm": 1.0510187149047852, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8436463475227356, "num_tokens": 586402076.0, "step": 3681 }, { "epoch": 1.8728382502543235, "grad_norm": 0.9832147359848022, "learning_rate": 1e-05, "loss": 0.4784, "mean_token_accuracy": 0.847326934337616, "num_tokens": 586569377.0, "step": 3682 }, { "epoch": 1.873346897253306, "grad_norm": 0.9283510446548462, "learning_rate": 1e-05, "loss": 0.4705, "mean_token_accuracy": 0.8506478071212769, "num_tokens": 586729634.0, "step": 3683 }, { "epoch": 1.8738555442522888, "grad_norm": 0.9281595349311829, "learning_rate": 1e-05, "loss": 0.4558, "mean_token_accuracy": 0.8545751571655273, "num_tokens": 586891543.0, "step": 3684 }, { "epoch": 1.8743641912512716, "grad_norm": 1.007712960243225, "learning_rate": 1e-05, "loss": 0.4496, "mean_token_accuracy": 0.8549709320068359, "num_tokens": 587053469.0, "step": 3685 }, { "epoch": 1.8748728382502544, "grad_norm": 0.9378703236579895, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8443782329559326, "num_tokens": 587214972.0, "step": 3686 }, { "epoch": 1.875381485249237, "grad_norm": 0.9779534339904785, "learning_rate": 1e-05, "loss": 0.5014, "mean_token_accuracy": 0.8421754240989685, "num_tokens": 587380598.0, "step": 3687 }, { "epoch": 1.8758901322482198, "grad_norm": 0.9017239809036255, "learning_rate": 1e-05, "loss": 0.4774, "mean_token_accuracy": 0.8475435972213745, "num_tokens": 587541589.0, "step": 3688 }, { "epoch": 1.8763987792472023, "grad_norm": 0.9856732487678528, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8449099659919739, "num_tokens": 587704662.0, "step": 3689 }, { "epoch": 1.8769074262461851, "grad_norm": 0.9052358865737915, "learning_rate": 1e-05, "loss": 0.4933, "mean_token_accuracy": 0.8428623080253601, "num_tokens": 587860976.0, "step": 3690 }, { "epoch": 1.877416073245168, "grad_norm": 0.9494747519493103, "learning_rate": 1e-05, "loss": 0.5027, "mean_token_accuracy": 0.8417651653289795, "num_tokens": 588018853.0, "step": 3691 }, { "epoch": 1.8779247202441507, "grad_norm": 0.9226229786872864, "learning_rate": 1e-05, "loss": 0.4397, "mean_token_accuracy": 0.8583682775497437, "num_tokens": 588168649.0, "step": 3692 }, { "epoch": 1.8784333672431333, "grad_norm": 0.8945475816726685, "learning_rate": 1e-05, "loss": 0.4372, "mean_token_accuracy": 0.8595737218856812, "num_tokens": 588328963.0, "step": 3693 }, { "epoch": 1.8789420142421158, "grad_norm": 0.9041112065315247, "learning_rate": 1e-05, "loss": 0.4589, "mean_token_accuracy": 0.8514023423194885, "num_tokens": 588491275.0, "step": 3694 }, { "epoch": 1.8794506612410986, "grad_norm": 0.8546394109725952, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8534486889839172, "num_tokens": 588654078.0, "step": 3695 }, { "epoch": 1.8799593082400814, "grad_norm": 0.8748382925987244, "learning_rate": 1e-05, "loss": 0.4724, "mean_token_accuracy": 0.8489651679992676, "num_tokens": 588817168.0, "step": 3696 }, { "epoch": 1.8804679552390642, "grad_norm": 0.9193595051765442, "learning_rate": 1e-05, "loss": 0.4365, "mean_token_accuracy": 0.8599328994750977, "num_tokens": 588976724.0, "step": 3697 }, { "epoch": 1.8809766022380467, "grad_norm": 0.8867758512496948, "learning_rate": 1e-05, "loss": 0.4854, "mean_token_accuracy": 0.8471266627311707, "num_tokens": 589135465.0, "step": 3698 }, { "epoch": 1.8814852492370295, "grad_norm": 0.9284715056419373, "learning_rate": 1e-05, "loss": 0.4877, "mean_token_accuracy": 0.8443389534950256, "num_tokens": 589304160.0, "step": 3699 }, { "epoch": 1.881993896236012, "grad_norm": 0.9017884731292725, "learning_rate": 1e-05, "loss": 0.4997, "mean_token_accuracy": 0.8431593179702759, "num_tokens": 589459066.0, "step": 3700 }, { "epoch": 1.8825025432349949, "grad_norm": 0.8958366513252258, "learning_rate": 1e-05, "loss": 0.4609, "mean_token_accuracy": 0.8517111539840698, "num_tokens": 589634237.0, "step": 3701 }, { "epoch": 1.8830111902339777, "grad_norm": 0.9634583592414856, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8483635187149048, "num_tokens": 589787765.0, "step": 3702 }, { "epoch": 1.8835198372329605, "grad_norm": 0.8931185007095337, "learning_rate": 1e-05, "loss": 0.4692, "mean_token_accuracy": 0.8505688905715942, "num_tokens": 589955052.0, "step": 3703 }, { "epoch": 1.884028484231943, "grad_norm": 0.9743666648864746, "learning_rate": 1e-05, "loss": 0.466, "mean_token_accuracy": 0.8499509692192078, "num_tokens": 590119500.0, "step": 3704 }, { "epoch": 1.8845371312309256, "grad_norm": 0.8730661273002625, "learning_rate": 1e-05, "loss": 0.4745, "mean_token_accuracy": 0.849245011806488, "num_tokens": 590274290.0, "step": 3705 }, { "epoch": 1.8850457782299084, "grad_norm": 1.013344168663025, "learning_rate": 1e-05, "loss": 0.4978, "mean_token_accuracy": 0.8414533138275146, "num_tokens": 590427085.0, "step": 3706 }, { "epoch": 1.8855544252288912, "grad_norm": 0.902656614780426, "learning_rate": 1e-05, "loss": 0.4469, "mean_token_accuracy": 0.8566779494285583, "num_tokens": 590586848.0, "step": 3707 }, { "epoch": 1.886063072227874, "grad_norm": 0.9930974841117859, "learning_rate": 1e-05, "loss": 0.4867, "mean_token_accuracy": 0.8459283113479614, "num_tokens": 590750486.0, "step": 3708 }, { "epoch": 1.8865717192268565, "grad_norm": 0.9296849966049194, "learning_rate": 1e-05, "loss": 0.469, "mean_token_accuracy": 0.8507835268974304, "num_tokens": 590909813.0, "step": 3709 }, { "epoch": 1.8870803662258393, "grad_norm": 1.0177860260009766, "learning_rate": 1e-05, "loss": 0.4679, "mean_token_accuracy": 0.8514139652252197, "num_tokens": 591063252.0, "step": 3710 }, { "epoch": 1.8875890132248219, "grad_norm": 0.9661931991577148, "learning_rate": 1e-05, "loss": 0.4554, "mean_token_accuracy": 0.853483259677887, "num_tokens": 591212843.0, "step": 3711 }, { "epoch": 1.8880976602238047, "grad_norm": 0.9466366171836853, "learning_rate": 1e-05, "loss": 0.4627, "mean_token_accuracy": 0.8509161472320557, "num_tokens": 591367281.0, "step": 3712 }, { "epoch": 1.8886063072227874, "grad_norm": 0.9832890033721924, "learning_rate": 1e-05, "loss": 0.4941, "mean_token_accuracy": 0.8426257371902466, "num_tokens": 591533903.0, "step": 3713 }, { "epoch": 1.8891149542217702, "grad_norm": 0.9140143990516663, "learning_rate": 1e-05, "loss": 0.4831, "mean_token_accuracy": 0.8483765721321106, "num_tokens": 591689473.0, "step": 3714 }, { "epoch": 1.8896236012207528, "grad_norm": 0.9094839096069336, "learning_rate": 1e-05, "loss": 0.4704, "mean_token_accuracy": 0.848656177520752, "num_tokens": 591853239.0, "step": 3715 }, { "epoch": 1.8901322482197354, "grad_norm": 0.9416212439537048, "learning_rate": 1e-05, "loss": 0.4584, "mean_token_accuracy": 0.8517597913742065, "num_tokens": 592022697.0, "step": 3716 }, { "epoch": 1.8906408952187181, "grad_norm": 0.9125696420669556, "learning_rate": 1e-05, "loss": 0.5124, "mean_token_accuracy": 0.8381770849227905, "num_tokens": 592183835.0, "step": 3717 }, { "epoch": 1.891149542217701, "grad_norm": 0.8517329096794128, "learning_rate": 1e-05, "loss": 0.4316, "mean_token_accuracy": 0.8600273728370667, "num_tokens": 592341817.0, "step": 3718 }, { "epoch": 1.8916581892166837, "grad_norm": 0.911518931388855, "learning_rate": 1e-05, "loss": 0.4754, "mean_token_accuracy": 0.8477194905281067, "num_tokens": 592503410.0, "step": 3719 }, { "epoch": 1.8921668362156663, "grad_norm": 0.9388558864593506, "learning_rate": 1e-05, "loss": 0.4723, "mean_token_accuracy": 0.8482939004898071, "num_tokens": 592667852.0, "step": 3720 }, { "epoch": 1.892675483214649, "grad_norm": 0.9579901695251465, "learning_rate": 1e-05, "loss": 0.4793, "mean_token_accuracy": 0.8475110530853271, "num_tokens": 592827432.0, "step": 3721 }, { "epoch": 1.8931841302136316, "grad_norm": 0.8853926062583923, "learning_rate": 1e-05, "loss": 0.4786, "mean_token_accuracy": 0.8468433022499084, "num_tokens": 592990364.0, "step": 3722 }, { "epoch": 1.8936927772126144, "grad_norm": 0.9219601154327393, "learning_rate": 1e-05, "loss": 0.4753, "mean_token_accuracy": 0.8503814935684204, "num_tokens": 593145072.0, "step": 3723 }, { "epoch": 1.8942014242115972, "grad_norm": 0.8941967487335205, "learning_rate": 1e-05, "loss": 0.4716, "mean_token_accuracy": 0.8500727415084839, "num_tokens": 593313033.0, "step": 3724 }, { "epoch": 1.89471007121058, "grad_norm": 0.9067884087562561, "learning_rate": 1e-05, "loss": 0.4808, "mean_token_accuracy": 0.8485181331634521, "num_tokens": 593473823.0, "step": 3725 }, { "epoch": 1.8952187182095626, "grad_norm": 0.9031772017478943, "learning_rate": 1e-05, "loss": 0.4473, "mean_token_accuracy": 0.8565289974212646, "num_tokens": 593629565.0, "step": 3726 }, { "epoch": 1.8957273652085451, "grad_norm": 0.9642429351806641, "learning_rate": 1e-05, "loss": 0.4716, "mean_token_accuracy": 0.8493976593017578, "num_tokens": 593780513.0, "step": 3727 }, { "epoch": 1.896236012207528, "grad_norm": 0.9130746126174927, "learning_rate": 1e-05, "loss": 0.4846, "mean_token_accuracy": 0.8470790386199951, "num_tokens": 593932811.0, "step": 3728 }, { "epoch": 1.8967446592065107, "grad_norm": 0.9594608545303345, "learning_rate": 1e-05, "loss": 0.4612, "mean_token_accuracy": 0.851692795753479, "num_tokens": 594084116.0, "step": 3729 }, { "epoch": 1.8972533062054935, "grad_norm": 0.9444839954376221, "learning_rate": 1e-05, "loss": 0.4491, "mean_token_accuracy": 0.856663703918457, "num_tokens": 594251800.0, "step": 3730 }, { "epoch": 1.897761953204476, "grad_norm": 0.9218647480010986, "learning_rate": 1e-05, "loss": 0.4788, "mean_token_accuracy": 0.8491775393486023, "num_tokens": 594412917.0, "step": 3731 }, { "epoch": 1.8982706002034588, "grad_norm": 0.9814186096191406, "learning_rate": 1e-05, "loss": 0.4792, "mean_token_accuracy": 0.8472409844398499, "num_tokens": 594566525.0, "step": 3732 }, { "epoch": 1.8987792472024414, "grad_norm": 0.9457241892814636, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.8417283296585083, "num_tokens": 594735192.0, "step": 3733 }, { "epoch": 1.8992878942014242, "grad_norm": 0.8630687594413757, "learning_rate": 1e-05, "loss": 0.4424, "mean_token_accuracy": 0.8581480383872986, "num_tokens": 594903927.0, "step": 3734 }, { "epoch": 1.899796541200407, "grad_norm": 0.9283967018127441, "learning_rate": 1e-05, "loss": 0.4681, "mean_token_accuracy": 0.8492915630340576, "num_tokens": 595059127.0, "step": 3735 }, { "epoch": 1.9003051881993898, "grad_norm": 0.9585233330726624, "learning_rate": 1e-05, "loss": 0.4506, "mean_token_accuracy": 0.8559272289276123, "num_tokens": 595203411.0, "step": 3736 }, { "epoch": 1.9008138351983723, "grad_norm": 0.9517952799797058, "learning_rate": 1e-05, "loss": 0.5178, "mean_token_accuracy": 0.8370660543441772, "num_tokens": 595365030.0, "step": 3737 }, { "epoch": 1.901322482197355, "grad_norm": 0.9465552568435669, "learning_rate": 1e-05, "loss": 0.4722, "mean_token_accuracy": 0.8481013774871826, "num_tokens": 595523298.0, "step": 3738 }, { "epoch": 1.9018311291963377, "grad_norm": 0.9181548953056335, "learning_rate": 1e-05, "loss": 0.465, "mean_token_accuracy": 0.8516472578048706, "num_tokens": 595685557.0, "step": 3739 }, { "epoch": 1.9023397761953205, "grad_norm": 0.9235841035842896, "learning_rate": 1e-05, "loss": 0.4406, "mean_token_accuracy": 0.8593134880065918, "num_tokens": 595851295.0, "step": 3740 }, { "epoch": 1.9028484231943033, "grad_norm": 0.9412876963615417, "learning_rate": 1e-05, "loss": 0.4595, "mean_token_accuracy": 0.8510797619819641, "num_tokens": 596006533.0, "step": 3741 }, { "epoch": 1.9033570701932858, "grad_norm": 0.9959353804588318, "learning_rate": 1e-05, "loss": 0.4873, "mean_token_accuracy": 0.8455770015716553, "num_tokens": 596165520.0, "step": 3742 }, { "epoch": 1.9038657171922686, "grad_norm": 0.968315064907074, "learning_rate": 1e-05, "loss": 0.483, "mean_token_accuracy": 0.8484834432601929, "num_tokens": 596331455.0, "step": 3743 }, { "epoch": 1.9043743641912512, "grad_norm": 0.9146832227706909, "learning_rate": 1e-05, "loss": 0.4644, "mean_token_accuracy": 0.8519558310508728, "num_tokens": 596492724.0, "step": 3744 }, { "epoch": 1.904883011190234, "grad_norm": 0.8960966467857361, "learning_rate": 1e-05, "loss": 0.4594, "mean_token_accuracy": 0.8516942858695984, "num_tokens": 596655813.0, "step": 3745 }, { "epoch": 1.9053916581892167, "grad_norm": 0.8998123407363892, "learning_rate": 1e-05, "loss": 0.4325, "mean_token_accuracy": 0.8601040244102478, "num_tokens": 596815498.0, "step": 3746 }, { "epoch": 1.9059003051881995, "grad_norm": 0.9438427686691284, "learning_rate": 1e-05, "loss": 0.5112, "mean_token_accuracy": 0.8384228348731995, "num_tokens": 596976097.0, "step": 3747 }, { "epoch": 1.906408952187182, "grad_norm": 0.8788058161735535, "learning_rate": 1e-05, "loss": 0.4693, "mean_token_accuracy": 0.8497171401977539, "num_tokens": 597137505.0, "step": 3748 }, { "epoch": 1.9069175991861647, "grad_norm": 0.9773399829864502, "learning_rate": 1e-05, "loss": 0.4739, "mean_token_accuracy": 0.8490920066833496, "num_tokens": 597290245.0, "step": 3749 }, { "epoch": 1.9074262461851474, "grad_norm": 0.9383112788200378, "learning_rate": 1e-05, "loss": 0.4779, "mean_token_accuracy": 0.8457980751991272, "num_tokens": 597439964.0, "step": 3750 }, { "epoch": 1.9079348931841302, "grad_norm": 0.894231379032135, "learning_rate": 1e-05, "loss": 0.4605, "mean_token_accuracy": 0.8520057797431946, "num_tokens": 597601954.0, "step": 3751 }, { "epoch": 1.908443540183113, "grad_norm": 0.9662824273109436, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.8424139022827148, "num_tokens": 597762524.0, "step": 3752 }, { "epoch": 1.9089521871820956, "grad_norm": 0.9119341373443604, "learning_rate": 1e-05, "loss": 0.4681, "mean_token_accuracy": 0.8486347198486328, "num_tokens": 597915383.0, "step": 3753 }, { "epoch": 1.9094608341810784, "grad_norm": 0.9073940515518188, "learning_rate": 1e-05, "loss": 0.5003, "mean_token_accuracy": 0.8417505025863647, "num_tokens": 598088815.0, "step": 3754 }, { "epoch": 1.909969481180061, "grad_norm": 0.9285572171211243, "learning_rate": 1e-05, "loss": 0.4633, "mean_token_accuracy": 0.8495465517044067, "num_tokens": 598243445.0, "step": 3755 }, { "epoch": 1.9104781281790437, "grad_norm": 0.920777440071106, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8522055149078369, "num_tokens": 598408435.0, "step": 3756 }, { "epoch": 1.9109867751780265, "grad_norm": 0.9562985301017761, "learning_rate": 1e-05, "loss": 0.4431, "mean_token_accuracy": 0.8574119806289673, "num_tokens": 598545916.0, "step": 3757 }, { "epoch": 1.9114954221770093, "grad_norm": 0.9341357350349426, "learning_rate": 1e-05, "loss": 0.4815, "mean_token_accuracy": 0.8483132123947144, "num_tokens": 598703467.0, "step": 3758 }, { "epoch": 1.9120040691759919, "grad_norm": 0.9093040823936462, "learning_rate": 1e-05, "loss": 0.4778, "mean_token_accuracy": 0.8470190167427063, "num_tokens": 598878943.0, "step": 3759 }, { "epoch": 1.9125127161749744, "grad_norm": 0.8956009745597839, "learning_rate": 1e-05, "loss": 0.4641, "mean_token_accuracy": 0.8517540693283081, "num_tokens": 599047533.0, "step": 3760 }, { "epoch": 1.9130213631739572, "grad_norm": 0.9305961728096008, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8554340600967407, "num_tokens": 599208433.0, "step": 3761 }, { "epoch": 1.91353001017294, "grad_norm": 0.9422276020050049, "learning_rate": 1e-05, "loss": 0.4778, "mean_token_accuracy": 0.8485419750213623, "num_tokens": 599369815.0, "step": 3762 }, { "epoch": 1.9140386571719228, "grad_norm": 0.9445098638534546, "learning_rate": 1e-05, "loss": 0.4454, "mean_token_accuracy": 0.8574182391166687, "num_tokens": 599529177.0, "step": 3763 }, { "epoch": 1.9145473041709054, "grad_norm": 1.0218015909194946, "learning_rate": 1e-05, "loss": 0.4378, "mean_token_accuracy": 0.858683705329895, "num_tokens": 599687509.0, "step": 3764 }, { "epoch": 1.9150559511698881, "grad_norm": 0.9703574180603027, "learning_rate": 1e-05, "loss": 0.4278, "mean_token_accuracy": 0.8607935905456543, "num_tokens": 599837337.0, "step": 3765 }, { "epoch": 1.9155645981688707, "grad_norm": 0.9127360582351685, "learning_rate": 1e-05, "loss": 0.4582, "mean_token_accuracy": 0.8523753881454468, "num_tokens": 599989016.0, "step": 3766 }, { "epoch": 1.9160732451678535, "grad_norm": 0.8837084770202637, "learning_rate": 1e-05, "loss": 0.4835, "mean_token_accuracy": 0.845979630947113, "num_tokens": 600150348.0, "step": 3767 }, { "epoch": 1.9165818921668363, "grad_norm": 0.993800163269043, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8495070934295654, "num_tokens": 600296480.0, "step": 3768 }, { "epoch": 1.917090539165819, "grad_norm": 0.9457738995552063, "learning_rate": 1e-05, "loss": 0.453, "mean_token_accuracy": 0.8544372320175171, "num_tokens": 600457101.0, "step": 3769 }, { "epoch": 1.9175991861648016, "grad_norm": 0.8690885305404663, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.853378176689148, "num_tokens": 600617011.0, "step": 3770 }, { "epoch": 1.9181078331637842, "grad_norm": 1.2345824241638184, "learning_rate": 1e-05, "loss": 0.5044, "mean_token_accuracy": 0.840251088142395, "num_tokens": 600776697.0, "step": 3771 }, { "epoch": 1.918616480162767, "grad_norm": 0.9083021283149719, "learning_rate": 1e-05, "loss": 0.4732, "mean_token_accuracy": 0.849480152130127, "num_tokens": 600942695.0, "step": 3772 }, { "epoch": 1.9191251271617498, "grad_norm": 0.9992514252662659, "learning_rate": 1e-05, "loss": 0.4724, "mean_token_accuracy": 0.848492443561554, "num_tokens": 601098526.0, "step": 3773 }, { "epoch": 1.9196337741607326, "grad_norm": 0.9319624304771423, "learning_rate": 1e-05, "loss": 0.4845, "mean_token_accuracy": 0.8464127779006958, "num_tokens": 601254189.0, "step": 3774 }, { "epoch": 1.9201424211597151, "grad_norm": 0.8973638415336609, "learning_rate": 1e-05, "loss": 0.4569, "mean_token_accuracy": 0.8527525663375854, "num_tokens": 601408256.0, "step": 3775 }, { "epoch": 1.920651068158698, "grad_norm": 0.9371774196624756, "learning_rate": 1e-05, "loss": 0.4829, "mean_token_accuracy": 0.8458545207977295, "num_tokens": 601571439.0, "step": 3776 }, { "epoch": 1.9211597151576805, "grad_norm": 0.9487343430519104, "learning_rate": 1e-05, "loss": 0.4487, "mean_token_accuracy": 0.8576250076293945, "num_tokens": 601734945.0, "step": 3777 }, { "epoch": 1.9216683621566633, "grad_norm": 0.937295138835907, "learning_rate": 1e-05, "loss": 0.4883, "mean_token_accuracy": 0.8438887000083923, "num_tokens": 601896520.0, "step": 3778 }, { "epoch": 1.922177009155646, "grad_norm": 0.9660419821739197, "learning_rate": 1e-05, "loss": 0.4683, "mean_token_accuracy": 0.8490821123123169, "num_tokens": 602048648.0, "step": 3779 }, { "epoch": 1.9226856561546288, "grad_norm": 0.9533944725990295, "learning_rate": 1e-05, "loss": 0.4638, "mean_token_accuracy": 0.8520326614379883, "num_tokens": 602203498.0, "step": 3780 }, { "epoch": 1.9231943031536114, "grad_norm": 0.9050354361534119, "learning_rate": 1e-05, "loss": 0.4609, "mean_token_accuracy": 0.8517625331878662, "num_tokens": 602353361.0, "step": 3781 }, { "epoch": 1.923702950152594, "grad_norm": 0.9708511233329773, "learning_rate": 1e-05, "loss": 0.4756, "mean_token_accuracy": 0.8472548723220825, "num_tokens": 602510131.0, "step": 3782 }, { "epoch": 1.9242115971515767, "grad_norm": 0.8962816596031189, "learning_rate": 1e-05, "loss": 0.4759, "mean_token_accuracy": 0.8474619388580322, "num_tokens": 602679344.0, "step": 3783 }, { "epoch": 1.9247202441505595, "grad_norm": 0.9285560250282288, "learning_rate": 1e-05, "loss": 0.4485, "mean_token_accuracy": 0.8549748063087463, "num_tokens": 602835451.0, "step": 3784 }, { "epoch": 1.9252288911495423, "grad_norm": 0.9716651439666748, "learning_rate": 1e-05, "loss": 0.4734, "mean_token_accuracy": 0.8499223589897156, "num_tokens": 602995790.0, "step": 3785 }, { "epoch": 1.9257375381485249, "grad_norm": 0.9989169239997864, "learning_rate": 1e-05, "loss": 0.4802, "mean_token_accuracy": 0.846304714679718, "num_tokens": 603141191.0, "step": 3786 }, { "epoch": 1.9262461851475077, "grad_norm": 0.8882007002830505, "learning_rate": 1e-05, "loss": 0.4587, "mean_token_accuracy": 0.8536261320114136, "num_tokens": 603312610.0, "step": 3787 }, { "epoch": 1.9267548321464902, "grad_norm": 0.9280068278312683, "learning_rate": 1e-05, "loss": 0.4574, "mean_token_accuracy": 0.8533625602722168, "num_tokens": 603470478.0, "step": 3788 }, { "epoch": 1.927263479145473, "grad_norm": 1.2315386533737183, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8540915846824646, "num_tokens": 603640127.0, "step": 3789 }, { "epoch": 1.9277721261444558, "grad_norm": 0.9625018835067749, "learning_rate": 1e-05, "loss": 0.4568, "mean_token_accuracy": 0.8532472252845764, "num_tokens": 603791455.0, "step": 3790 }, { "epoch": 1.9282807731434386, "grad_norm": 0.9542778134346008, "learning_rate": 1e-05, "loss": 0.4507, "mean_token_accuracy": 0.8542107343673706, "num_tokens": 603950683.0, "step": 3791 }, { "epoch": 1.9287894201424212, "grad_norm": 1.1490851640701294, "learning_rate": 1e-05, "loss": 0.5005, "mean_token_accuracy": 0.8407775163650513, "num_tokens": 604109105.0, "step": 3792 }, { "epoch": 1.9292980671414037, "grad_norm": 0.9285960793495178, "learning_rate": 1e-05, "loss": 0.493, "mean_token_accuracy": 0.8430469036102295, "num_tokens": 604268766.0, "step": 3793 }, { "epoch": 1.9298067141403865, "grad_norm": 0.954233705997467, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8519355058670044, "num_tokens": 604425281.0, "step": 3794 }, { "epoch": 1.9303153611393693, "grad_norm": 0.9654402136802673, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8387899398803711, "num_tokens": 604591434.0, "step": 3795 }, { "epoch": 1.930824008138352, "grad_norm": 0.9682581424713135, "learning_rate": 1e-05, "loss": 0.4742, "mean_token_accuracy": 0.8516649007797241, "num_tokens": 604760916.0, "step": 3796 }, { "epoch": 1.9313326551373347, "grad_norm": 0.9230402112007141, "learning_rate": 1e-05, "loss": 0.497, "mean_token_accuracy": 0.8434281945228577, "num_tokens": 604925201.0, "step": 3797 }, { "epoch": 1.9318413021363174, "grad_norm": 0.9125484824180603, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8544818162918091, "num_tokens": 605069512.0, "step": 3798 }, { "epoch": 1.9323499491353, "grad_norm": 0.9310237169265747, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.8439196348190308, "num_tokens": 605236735.0, "step": 3799 }, { "epoch": 1.9328585961342828, "grad_norm": 0.9001007080078125, "learning_rate": 1e-05, "loss": 0.4428, "mean_token_accuracy": 0.8576923608779907, "num_tokens": 605388317.0, "step": 3800 }, { "epoch": 1.9333672431332656, "grad_norm": 0.9525231122970581, "learning_rate": 1e-05, "loss": 0.5133, "mean_token_accuracy": 0.8374272584915161, "num_tokens": 605553305.0, "step": 3801 }, { "epoch": 1.9338758901322484, "grad_norm": 0.9368336796760559, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8605653643608093, "num_tokens": 605720362.0, "step": 3802 }, { "epoch": 1.934384537131231, "grad_norm": 0.9109113216400146, "learning_rate": 1e-05, "loss": 0.4945, "mean_token_accuracy": 0.8430343866348267, "num_tokens": 605884975.0, "step": 3803 }, { "epoch": 1.9348931841302135, "grad_norm": 0.9070175886154175, "learning_rate": 1e-05, "loss": 0.485, "mean_token_accuracy": 0.8453513979911804, "num_tokens": 606043288.0, "step": 3804 }, { "epoch": 1.9354018311291963, "grad_norm": 0.9492138624191284, "learning_rate": 1e-05, "loss": 0.4822, "mean_token_accuracy": 0.8475384712219238, "num_tokens": 606192599.0, "step": 3805 }, { "epoch": 1.935910478128179, "grad_norm": 0.8870913982391357, "learning_rate": 1e-05, "loss": 0.4756, "mean_token_accuracy": 0.8476678729057312, "num_tokens": 606348632.0, "step": 3806 }, { "epoch": 1.9364191251271619, "grad_norm": 0.9374875426292419, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.8640589118003845, "num_tokens": 606512463.0, "step": 3807 }, { "epoch": 1.9369277721261444, "grad_norm": 0.927408754825592, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8542242050170898, "num_tokens": 606676157.0, "step": 3808 }, { "epoch": 1.9374364191251272, "grad_norm": 0.8937217593193054, "learning_rate": 1e-05, "loss": 0.4604, "mean_token_accuracy": 0.8511909246444702, "num_tokens": 606843986.0, "step": 3809 }, { "epoch": 1.9379450661241098, "grad_norm": 0.9094258546829224, "learning_rate": 1e-05, "loss": 0.4873, "mean_token_accuracy": 0.8464955687522888, "num_tokens": 607011481.0, "step": 3810 }, { "epoch": 1.9384537131230926, "grad_norm": 0.9036715030670166, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8398281335830688, "num_tokens": 607178849.0, "step": 3811 }, { "epoch": 1.9389623601220753, "grad_norm": 0.9271400570869446, "learning_rate": 1e-05, "loss": 0.4615, "mean_token_accuracy": 0.851945161819458, "num_tokens": 607334595.0, "step": 3812 }, { "epoch": 1.9394710071210581, "grad_norm": 0.907317042350769, "learning_rate": 1e-05, "loss": 0.4502, "mean_token_accuracy": 0.8564980030059814, "num_tokens": 607488276.0, "step": 3813 }, { "epoch": 1.9399796541200407, "grad_norm": 1.031917929649353, "learning_rate": 1e-05, "loss": 0.452, "mean_token_accuracy": 0.8556896448135376, "num_tokens": 607647463.0, "step": 3814 }, { "epoch": 1.9404883011190233, "grad_norm": 0.9541162252426147, "learning_rate": 1e-05, "loss": 0.4722, "mean_token_accuracy": 0.8490946888923645, "num_tokens": 607808299.0, "step": 3815 }, { "epoch": 1.940996948118006, "grad_norm": 1.3289088010787964, "learning_rate": 1e-05, "loss": 0.4582, "mean_token_accuracy": 0.8528410196304321, "num_tokens": 607969176.0, "step": 3816 }, { "epoch": 1.9415055951169888, "grad_norm": 0.9064154624938965, "learning_rate": 1e-05, "loss": 0.5181, "mean_token_accuracy": 0.835903525352478, "num_tokens": 608134248.0, "step": 3817 }, { "epoch": 1.9420142421159716, "grad_norm": 0.936055064201355, "learning_rate": 1e-05, "loss": 0.4516, "mean_token_accuracy": 0.8539500832557678, "num_tokens": 608294081.0, "step": 3818 }, { "epoch": 1.9425228891149542, "grad_norm": 0.9543054103851318, "learning_rate": 1e-05, "loss": 0.4757, "mean_token_accuracy": 0.8487963676452637, "num_tokens": 608454699.0, "step": 3819 }, { "epoch": 1.943031536113937, "grad_norm": 0.8393166065216064, "learning_rate": 1e-05, "loss": 0.4812, "mean_token_accuracy": 0.8456804752349854, "num_tokens": 608626842.0, "step": 3820 }, { "epoch": 1.9435401831129195, "grad_norm": 0.8989098072052002, "learning_rate": 1e-05, "loss": 0.4605, "mean_token_accuracy": 0.8546445369720459, "num_tokens": 608803353.0, "step": 3821 }, { "epoch": 1.9440488301119023, "grad_norm": 0.9190250039100647, "learning_rate": 1e-05, "loss": 0.467, "mean_token_accuracy": 0.8513407707214355, "num_tokens": 608969123.0, "step": 3822 }, { "epoch": 1.9445574771108851, "grad_norm": 0.9733286499977112, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8549008369445801, "num_tokens": 609112871.0, "step": 3823 }, { "epoch": 1.945066124109868, "grad_norm": 1.037373423576355, "learning_rate": 1e-05, "loss": 0.4717, "mean_token_accuracy": 0.8502563238143921, "num_tokens": 609267308.0, "step": 3824 }, { "epoch": 1.9455747711088505, "grad_norm": 0.9469419121742249, "learning_rate": 1e-05, "loss": 0.4755, "mean_token_accuracy": 0.846880316734314, "num_tokens": 609429279.0, "step": 3825 }, { "epoch": 1.946083418107833, "grad_norm": 0.9067391157150269, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8481675386428833, "num_tokens": 609605979.0, "step": 3826 }, { "epoch": 1.9465920651068158, "grad_norm": 1.0206915140151978, "learning_rate": 1e-05, "loss": 0.4456, "mean_token_accuracy": 0.8566611409187317, "num_tokens": 609754605.0, "step": 3827 }, { "epoch": 1.9471007121057986, "grad_norm": 0.9272356033325195, "learning_rate": 1e-05, "loss": 0.4788, "mean_token_accuracy": 0.8478904962539673, "num_tokens": 609905194.0, "step": 3828 }, { "epoch": 1.9476093591047814, "grad_norm": 0.889542818069458, "learning_rate": 1e-05, "loss": 0.4646, "mean_token_accuracy": 0.8507856130599976, "num_tokens": 610068975.0, "step": 3829 }, { "epoch": 1.948118006103764, "grad_norm": 0.9088366031646729, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.8457269668579102, "num_tokens": 610232914.0, "step": 3830 }, { "epoch": 1.9486266531027467, "grad_norm": 0.9174922108650208, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8533996343612671, "num_tokens": 610377414.0, "step": 3831 }, { "epoch": 1.9491353001017293, "grad_norm": 0.952247679233551, "learning_rate": 1e-05, "loss": 0.4495, "mean_token_accuracy": 0.8532795906066895, "num_tokens": 610524356.0, "step": 3832 }, { "epoch": 1.949643947100712, "grad_norm": 0.8449368476867676, "learning_rate": 1e-05, "loss": 0.4706, "mean_token_accuracy": 0.8495937585830688, "num_tokens": 610687425.0, "step": 3833 }, { "epoch": 1.9501525940996949, "grad_norm": 0.9186002612113953, "learning_rate": 1e-05, "loss": 0.4445, "mean_token_accuracy": 0.8576358556747437, "num_tokens": 610848911.0, "step": 3834 }, { "epoch": 1.9506612410986777, "grad_norm": 0.9892232418060303, "learning_rate": 1e-05, "loss": 0.4378, "mean_token_accuracy": 0.8579406142234802, "num_tokens": 610992150.0, "step": 3835 }, { "epoch": 1.9511698880976602, "grad_norm": 0.9537031650543213, "learning_rate": 1e-05, "loss": 0.4697, "mean_token_accuracy": 0.8509016036987305, "num_tokens": 611151896.0, "step": 3836 }, { "epoch": 1.9516785350966428, "grad_norm": 0.8665832877159119, "learning_rate": 1e-05, "loss": 0.4631, "mean_token_accuracy": 0.852397084236145, "num_tokens": 611322548.0, "step": 3837 }, { "epoch": 1.9521871820956256, "grad_norm": 0.9472004771232605, "learning_rate": 1e-05, "loss": 0.4391, "mean_token_accuracy": 0.8582301735877991, "num_tokens": 611477707.0, "step": 3838 }, { "epoch": 1.9526958290946084, "grad_norm": 1.0446181297302246, "learning_rate": 1e-05, "loss": 0.4804, "mean_token_accuracy": 0.8453882932662964, "num_tokens": 611633623.0, "step": 3839 }, { "epoch": 1.9532044760935912, "grad_norm": 0.9511471390724182, "learning_rate": 1e-05, "loss": 0.4801, "mean_token_accuracy": 0.8465399742126465, "num_tokens": 611786310.0, "step": 3840 }, { "epoch": 1.9537131230925737, "grad_norm": 1.009140968322754, "learning_rate": 1e-05, "loss": 0.4833, "mean_token_accuracy": 0.8464534282684326, "num_tokens": 611939348.0, "step": 3841 }, { "epoch": 1.9542217700915565, "grad_norm": 0.9803367257118225, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8545321822166443, "num_tokens": 612106242.0, "step": 3842 }, { "epoch": 1.954730417090539, "grad_norm": 0.9608579277992249, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.8532856702804565, "num_tokens": 612259738.0, "step": 3843 }, { "epoch": 1.9552390640895219, "grad_norm": 1.0368393659591675, "learning_rate": 1e-05, "loss": 0.4404, "mean_token_accuracy": 0.8589491844177246, "num_tokens": 612408694.0, "step": 3844 }, { "epoch": 1.9557477110885046, "grad_norm": 1.0139336585998535, "learning_rate": 1e-05, "loss": 0.4909, "mean_token_accuracy": 0.8452524542808533, "num_tokens": 612570005.0, "step": 3845 }, { "epoch": 1.9562563580874874, "grad_norm": 0.9647461175918579, "learning_rate": 1e-05, "loss": 0.4298, "mean_token_accuracy": 0.8602635860443115, "num_tokens": 612723003.0, "step": 3846 }, { "epoch": 1.95676500508647, "grad_norm": 0.9352390170097351, "learning_rate": 1e-05, "loss": 0.4532, "mean_token_accuracy": 0.8548156023025513, "num_tokens": 612893200.0, "step": 3847 }, { "epoch": 1.9572736520854526, "grad_norm": 1.0115501880645752, "learning_rate": 1e-05, "loss": 0.4864, "mean_token_accuracy": 0.8474304676055908, "num_tokens": 613044117.0, "step": 3848 }, { "epoch": 1.9577822990844354, "grad_norm": 0.969480037689209, "learning_rate": 1e-05, "loss": 0.4243, "mean_token_accuracy": 0.8618882894515991, "num_tokens": 613176151.0, "step": 3849 }, { "epoch": 1.9582909460834181, "grad_norm": 0.8841404914855957, "learning_rate": 1e-05, "loss": 0.4612, "mean_token_accuracy": 0.8527694344520569, "num_tokens": 613322625.0, "step": 3850 }, { "epoch": 1.958799593082401, "grad_norm": 1.028300404548645, "learning_rate": 1e-05, "loss": 0.4302, "mean_token_accuracy": 0.8599109649658203, "num_tokens": 613484461.0, "step": 3851 }, { "epoch": 1.9593082400813835, "grad_norm": 1.005175232887268, "learning_rate": 1e-05, "loss": 0.4941, "mean_token_accuracy": 0.8423586487770081, "num_tokens": 613652844.0, "step": 3852 }, { "epoch": 1.9598168870803663, "grad_norm": 1.6460040807724, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8538551330566406, "num_tokens": 613810470.0, "step": 3853 }, { "epoch": 1.9603255340793488, "grad_norm": 1.1169441938400269, "learning_rate": 1e-05, "loss": 0.459, "mean_token_accuracy": 0.8533007502555847, "num_tokens": 613969421.0, "step": 3854 }, { "epoch": 1.9608341810783316, "grad_norm": 0.9980722665786743, "learning_rate": 1e-05, "loss": 0.4502, "mean_token_accuracy": 0.8556852340698242, "num_tokens": 614130745.0, "step": 3855 }, { "epoch": 1.9613428280773144, "grad_norm": 0.9866142868995667, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8559843301773071, "num_tokens": 614295637.0, "step": 3856 }, { "epoch": 1.9618514750762972, "grad_norm": 1.0588595867156982, "learning_rate": 1e-05, "loss": 0.4838, "mean_token_accuracy": 0.8471004962921143, "num_tokens": 614455524.0, "step": 3857 }, { "epoch": 1.9623601220752798, "grad_norm": 0.9205344915390015, "learning_rate": 1e-05, "loss": 0.4495, "mean_token_accuracy": 0.8560166358947754, "num_tokens": 614612191.0, "step": 3858 }, { "epoch": 1.9628687690742623, "grad_norm": 0.9048281908035278, "learning_rate": 1e-05, "loss": 0.4677, "mean_token_accuracy": 0.8504944443702698, "num_tokens": 614782233.0, "step": 3859 }, { "epoch": 1.9633774160732451, "grad_norm": 1.050432562828064, "learning_rate": 1e-05, "loss": 0.4825, "mean_token_accuracy": 0.8455482125282288, "num_tokens": 614941005.0, "step": 3860 }, { "epoch": 1.963886063072228, "grad_norm": 0.865611732006073, "learning_rate": 1e-05, "loss": 0.4472, "mean_token_accuracy": 0.8554700613021851, "num_tokens": 615100608.0, "step": 3861 }, { "epoch": 1.9643947100712107, "grad_norm": 0.9181163311004639, "learning_rate": 1e-05, "loss": 0.4519, "mean_token_accuracy": 0.8545412421226501, "num_tokens": 615260901.0, "step": 3862 }, { "epoch": 1.9649033570701933, "grad_norm": 1.0207750797271729, "learning_rate": 1e-05, "loss": 0.4659, "mean_token_accuracy": 0.853108286857605, "num_tokens": 615416052.0, "step": 3863 }, { "epoch": 1.965412004069176, "grad_norm": 0.8985828161239624, "learning_rate": 1e-05, "loss": 0.4758, "mean_token_accuracy": 0.8490435481071472, "num_tokens": 615576002.0, "step": 3864 }, { "epoch": 1.9659206510681586, "grad_norm": 1.0118473768234253, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8502198457717896, "num_tokens": 615742455.0, "step": 3865 }, { "epoch": 1.9664292980671414, "grad_norm": 0.9573753476142883, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8564034700393677, "num_tokens": 615906641.0, "step": 3866 }, { "epoch": 1.9669379450661242, "grad_norm": 0.9813663363456726, "learning_rate": 1e-05, "loss": 0.4824, "mean_token_accuracy": 0.8461043834686279, "num_tokens": 616075924.0, "step": 3867 }, { "epoch": 1.967446592065107, "grad_norm": 1.0511075258255005, "learning_rate": 1e-05, "loss": 0.5225, "mean_token_accuracy": 0.8352948427200317, "num_tokens": 616248968.0, "step": 3868 }, { "epoch": 1.9679552390640895, "grad_norm": 0.9328294396400452, "learning_rate": 1e-05, "loss": 0.4535, "mean_token_accuracy": 0.8543003797531128, "num_tokens": 616416342.0, "step": 3869 }, { "epoch": 1.968463886063072, "grad_norm": 1.0230273008346558, "learning_rate": 1e-05, "loss": 0.4659, "mean_token_accuracy": 0.84980309009552, "num_tokens": 616578475.0, "step": 3870 }, { "epoch": 1.9689725330620549, "grad_norm": 0.910184919834137, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8411458730697632, "num_tokens": 616746225.0, "step": 3871 }, { "epoch": 1.9694811800610377, "grad_norm": 0.8828282356262207, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8588099479675293, "num_tokens": 616899976.0, "step": 3872 }, { "epoch": 1.9699898270600205, "grad_norm": 1.0311261415481567, "learning_rate": 1e-05, "loss": 0.4711, "mean_token_accuracy": 0.8495997190475464, "num_tokens": 617064650.0, "step": 3873 }, { "epoch": 1.970498474059003, "grad_norm": 0.947348952293396, "learning_rate": 1e-05, "loss": 0.4353, "mean_token_accuracy": 0.8598195910453796, "num_tokens": 617214839.0, "step": 3874 }, { "epoch": 1.9710071210579858, "grad_norm": 1.03840172290802, "learning_rate": 1e-05, "loss": 0.4986, "mean_token_accuracy": 0.8405134677886963, "num_tokens": 617367210.0, "step": 3875 }, { "epoch": 1.9715157680569684, "grad_norm": 0.9516762495040894, "learning_rate": 1e-05, "loss": 0.4653, "mean_token_accuracy": 0.8516678214073181, "num_tokens": 617512120.0, "step": 3876 }, { "epoch": 1.9720244150559512, "grad_norm": 0.9232913255691528, "learning_rate": 1e-05, "loss": 0.4512, "mean_token_accuracy": 0.8564640879631042, "num_tokens": 617668697.0, "step": 3877 }, { "epoch": 1.972533062054934, "grad_norm": 1.0209755897521973, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8471225500106812, "num_tokens": 617823554.0, "step": 3878 }, { "epoch": 1.9730417090539167, "grad_norm": 1.0261807441711426, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8503115773200989, "num_tokens": 617983196.0, "step": 3879 }, { "epoch": 1.9735503560528993, "grad_norm": 0.923484206199646, "learning_rate": 1e-05, "loss": 0.4574, "mean_token_accuracy": 0.8527460098266602, "num_tokens": 618137243.0, "step": 3880 }, { "epoch": 1.9740590030518819, "grad_norm": 0.9335493445396423, "learning_rate": 1e-05, "loss": 0.454, "mean_token_accuracy": 0.8534466624259949, "num_tokens": 618302801.0, "step": 3881 }, { "epoch": 1.9745676500508647, "grad_norm": 0.9859886169433594, "learning_rate": 1e-05, "loss": 0.5018, "mean_token_accuracy": 0.8423020839691162, "num_tokens": 618473609.0, "step": 3882 }, { "epoch": 1.9750762970498474, "grad_norm": 0.962785005569458, "learning_rate": 1e-05, "loss": 0.4451, "mean_token_accuracy": 0.8573927879333496, "num_tokens": 618635491.0, "step": 3883 }, { "epoch": 1.9755849440488302, "grad_norm": 0.9044583439826965, "learning_rate": 1e-05, "loss": 0.4552, "mean_token_accuracy": 0.8537480235099792, "num_tokens": 618796427.0, "step": 3884 }, { "epoch": 1.9760935910478128, "grad_norm": 0.9657168984413147, "learning_rate": 1e-05, "loss": 0.48, "mean_token_accuracy": 0.8483161330223083, "num_tokens": 618958212.0, "step": 3885 }, { "epoch": 1.9766022380467956, "grad_norm": 0.9363645911216736, "learning_rate": 1e-05, "loss": 0.4401, "mean_token_accuracy": 0.8569280505180359, "num_tokens": 619108709.0, "step": 3886 }, { "epoch": 1.9771108850457781, "grad_norm": 0.8866698145866394, "learning_rate": 1e-05, "loss": 0.4345, "mean_token_accuracy": 0.8591384291648865, "num_tokens": 619264010.0, "step": 3887 }, { "epoch": 1.977619532044761, "grad_norm": 0.9166999459266663, "learning_rate": 1e-05, "loss": 0.5002, "mean_token_accuracy": 0.8409247398376465, "num_tokens": 619435220.0, "step": 3888 }, { "epoch": 1.9781281790437437, "grad_norm": 0.9197640419006348, "learning_rate": 1e-05, "loss": 0.4705, "mean_token_accuracy": 0.848235011100769, "num_tokens": 619595878.0, "step": 3889 }, { "epoch": 1.9786368260427265, "grad_norm": 0.918525755405426, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.8348990678787231, "num_tokens": 619759819.0, "step": 3890 }, { "epoch": 1.979145473041709, "grad_norm": 1.0172572135925293, "learning_rate": 1e-05, "loss": 0.4545, "mean_token_accuracy": 0.8554086685180664, "num_tokens": 619915648.0, "step": 3891 }, { "epoch": 1.9796541200406916, "grad_norm": 0.9677065014839172, "learning_rate": 1e-05, "loss": 0.4713, "mean_token_accuracy": 0.8500718474388123, "num_tokens": 620070498.0, "step": 3892 }, { "epoch": 1.9801627670396744, "grad_norm": 0.9305118322372437, "learning_rate": 1e-05, "loss": 0.4747, "mean_token_accuracy": 0.8483951687812805, "num_tokens": 620230320.0, "step": 3893 }, { "epoch": 1.9806714140386572, "grad_norm": 0.8849009275436401, "learning_rate": 1e-05, "loss": 0.4403, "mean_token_accuracy": 0.8593703508377075, "num_tokens": 620393836.0, "step": 3894 }, { "epoch": 1.98118006103764, "grad_norm": 0.9445408582687378, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8560349941253662, "num_tokens": 620550585.0, "step": 3895 }, { "epoch": 1.9816887080366226, "grad_norm": 0.8695283532142639, "learning_rate": 1e-05, "loss": 0.4788, "mean_token_accuracy": 0.8496056795120239, "num_tokens": 620719684.0, "step": 3896 }, { "epoch": 1.9821973550356051, "grad_norm": 0.9322693943977356, "learning_rate": 1e-05, "loss": 0.4809, "mean_token_accuracy": 0.8461048603057861, "num_tokens": 620887069.0, "step": 3897 }, { "epoch": 1.982706002034588, "grad_norm": 0.9163092970848083, "learning_rate": 1e-05, "loss": 0.4684, "mean_token_accuracy": 0.8510671854019165, "num_tokens": 621050520.0, "step": 3898 }, { "epoch": 1.9832146490335707, "grad_norm": 1.0227288007736206, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8476278185844421, "num_tokens": 621205316.0, "step": 3899 }, { "epoch": 1.9837232960325535, "grad_norm": 0.9090538620948792, "learning_rate": 1e-05, "loss": 0.4524, "mean_token_accuracy": 0.8544093370437622, "num_tokens": 621363543.0, "step": 3900 }, { "epoch": 1.9842319430315363, "grad_norm": 0.8535559773445129, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.8503838181495667, "num_tokens": 621525240.0, "step": 3901 }, { "epoch": 1.9847405900305188, "grad_norm": 0.928386390209198, "learning_rate": 1e-05, "loss": 0.4446, "mean_token_accuracy": 0.856638491153717, "num_tokens": 621674098.0, "step": 3902 }, { "epoch": 1.9852492370295014, "grad_norm": 0.8908268213272095, "learning_rate": 1e-05, "loss": 0.4659, "mean_token_accuracy": 0.8504698276519775, "num_tokens": 621831203.0, "step": 3903 }, { "epoch": 1.9857578840284842, "grad_norm": 0.8927626013755798, "learning_rate": 1e-05, "loss": 0.4554, "mean_token_accuracy": 0.8538251519203186, "num_tokens": 621989155.0, "step": 3904 }, { "epoch": 1.986266531027467, "grad_norm": 0.9428632855415344, "learning_rate": 1e-05, "loss": 0.4319, "mean_token_accuracy": 0.8602399826049805, "num_tokens": 622136977.0, "step": 3905 }, { "epoch": 1.9867751780264498, "grad_norm": 0.909707248210907, "learning_rate": 1e-05, "loss": 0.5064, "mean_token_accuracy": 0.8414626121520996, "num_tokens": 622317261.0, "step": 3906 }, { "epoch": 1.9872838250254323, "grad_norm": 0.8628239035606384, "learning_rate": 1e-05, "loss": 0.4943, "mean_token_accuracy": 0.8427656292915344, "num_tokens": 622482892.0, "step": 3907 }, { "epoch": 1.987792472024415, "grad_norm": 0.8984335064888, "learning_rate": 1e-05, "loss": 0.4598, "mean_token_accuracy": 0.8521856665611267, "num_tokens": 622647732.0, "step": 3908 }, { "epoch": 1.9883011190233977, "grad_norm": 0.8870931267738342, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8595765233039856, "num_tokens": 622821700.0, "step": 3909 }, { "epoch": 1.9888097660223805, "grad_norm": 0.9406011700630188, "learning_rate": 1e-05, "loss": 0.4693, "mean_token_accuracy": 0.8493680953979492, "num_tokens": 622984287.0, "step": 3910 }, { "epoch": 1.9893184130213633, "grad_norm": 1.0311652421951294, "learning_rate": 1e-05, "loss": 0.4232, "mean_token_accuracy": 0.8638802766799927, "num_tokens": 623129707.0, "step": 3911 }, { "epoch": 1.989827060020346, "grad_norm": 0.9765109419822693, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8548345565795898, "num_tokens": 623279377.0, "step": 3912 }, { "epoch": 1.9903357070193286, "grad_norm": 0.9124788045883179, "learning_rate": 1e-05, "loss": 0.471, "mean_token_accuracy": 0.8486224412918091, "num_tokens": 623451034.0, "step": 3913 }, { "epoch": 1.9908443540183112, "grad_norm": 0.9086233973503113, "learning_rate": 1e-05, "loss": 0.4584, "mean_token_accuracy": 0.8524532318115234, "num_tokens": 623617093.0, "step": 3914 }, { "epoch": 1.991353001017294, "grad_norm": 0.8954272270202637, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8520581126213074, "num_tokens": 623769321.0, "step": 3915 }, { "epoch": 1.9918616480162767, "grad_norm": 0.916243314743042, "learning_rate": 1e-05, "loss": 0.4947, "mean_token_accuracy": 0.8445735573768616, "num_tokens": 623935367.0, "step": 3916 }, { "epoch": 1.9923702950152595, "grad_norm": 0.8950624465942383, "learning_rate": 1e-05, "loss": 0.4808, "mean_token_accuracy": 0.8477447032928467, "num_tokens": 624097806.0, "step": 3917 }, { "epoch": 1.992878942014242, "grad_norm": 0.8239250183105469, "learning_rate": 1e-05, "loss": 0.4581, "mean_token_accuracy": 0.8526318073272705, "num_tokens": 624268903.0, "step": 3918 }, { "epoch": 1.9933875890132247, "grad_norm": 0.9191027283668518, "learning_rate": 1e-05, "loss": 0.4719, "mean_token_accuracy": 0.8489470481872559, "num_tokens": 624421868.0, "step": 3919 }, { "epoch": 1.9938962360122074, "grad_norm": 0.8871040940284729, "learning_rate": 1e-05, "loss": 0.476, "mean_token_accuracy": 0.8485807180404663, "num_tokens": 624585621.0, "step": 3920 }, { "epoch": 1.9944048830111902, "grad_norm": 0.9006506204605103, "learning_rate": 1e-05, "loss": 0.5007, "mean_token_accuracy": 0.8404300808906555, "num_tokens": 624745618.0, "step": 3921 }, { "epoch": 1.994913530010173, "grad_norm": 0.8424310684204102, "learning_rate": 1e-05, "loss": 0.4745, "mean_token_accuracy": 0.8490932583808899, "num_tokens": 624919983.0, "step": 3922 }, { "epoch": 1.9954221770091558, "grad_norm": 0.8778648376464844, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.8425777554512024, "num_tokens": 625088682.0, "step": 3923 }, { "epoch": 1.9959308240081384, "grad_norm": 0.9202529191970825, "learning_rate": 1e-05, "loss": 0.5111, "mean_token_accuracy": 0.8390426635742188, "num_tokens": 625244861.0, "step": 3924 }, { "epoch": 1.996439471007121, "grad_norm": 0.8500998616218567, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.853619396686554, "num_tokens": 625402853.0, "step": 3925 }, { "epoch": 1.9969481180061037, "grad_norm": 0.9115105271339417, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.853884756565094, "num_tokens": 625553633.0, "step": 3926 }, { "epoch": 1.9974567650050865, "grad_norm": 0.8809705972671509, "learning_rate": 1e-05, "loss": 0.4812, "mean_token_accuracy": 0.8468331098556519, "num_tokens": 625717102.0, "step": 3927 }, { "epoch": 1.9979654120040693, "grad_norm": 0.841272234916687, "learning_rate": 1e-05, "loss": 0.4817, "mean_token_accuracy": 0.8469640016555786, "num_tokens": 625879310.0, "step": 3928 }, { "epoch": 1.9984740590030519, "grad_norm": 0.9201867580413818, "learning_rate": 1e-05, "loss": 0.4555, "mean_token_accuracy": 0.8538429737091064, "num_tokens": 626035166.0, "step": 3929 }, { "epoch": 1.9989827060020344, "grad_norm": 0.8562085628509521, "learning_rate": 1e-05, "loss": 0.457, "mean_token_accuracy": 0.8542649149894714, "num_tokens": 626194056.0, "step": 3930 }, { "epoch": 1.9994913530010172, "grad_norm": 0.8933717012405396, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.846855103969574, "num_tokens": 626349258.0, "step": 3931 }, { "epoch": 2.0, "grad_norm": 0.915454626083374, "learning_rate": 1e-05, "loss": 0.4431, "mean_token_accuracy": 0.8570587038993835, "num_tokens": 626514393.0, "step": 3932 }, { "epoch": 2.000508646998983, "grad_norm": 0.9072067737579346, "learning_rate": 1e-05, "loss": 0.4253, "mean_token_accuracy": 0.8628668785095215, "num_tokens": 626678112.0, "step": 3933 }, { "epoch": 2.0010172939979656, "grad_norm": 0.9203000068664551, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.860080897808075, "num_tokens": 626843728.0, "step": 3934 }, { "epoch": 2.001525940996948, "grad_norm": 0.9907317161560059, "learning_rate": 1e-05, "loss": 0.4362, "mean_token_accuracy": 0.8583724498748779, "num_tokens": 627012779.0, "step": 3935 }, { "epoch": 2.0020345879959307, "grad_norm": 0.9471038579940796, "learning_rate": 1e-05, "loss": 0.4623, "mean_token_accuracy": 0.8516335487365723, "num_tokens": 627179506.0, "step": 3936 }, { "epoch": 2.0025432349949135, "grad_norm": 1.0517023801803589, "learning_rate": 1e-05, "loss": 0.4371, "mean_token_accuracy": 0.8596482872962952, "num_tokens": 627321312.0, "step": 3937 }, { "epoch": 2.0030518819938963, "grad_norm": 1.1122736930847168, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8612704277038574, "num_tokens": 627483549.0, "step": 3938 }, { "epoch": 2.003560528992879, "grad_norm": 0.9688953757286072, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.8595166802406311, "num_tokens": 627652408.0, "step": 3939 }, { "epoch": 2.004069175991862, "grad_norm": 1.0007293224334717, "learning_rate": 1e-05, "loss": 0.4446, "mean_token_accuracy": 0.8554959893226624, "num_tokens": 627813505.0, "step": 3940 }, { "epoch": 2.004577822990844, "grad_norm": 0.981654703617096, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.8577959537506104, "num_tokens": 627970583.0, "step": 3941 }, { "epoch": 2.005086469989827, "grad_norm": 0.9396583437919617, "learning_rate": 1e-05, "loss": 0.4526, "mean_token_accuracy": 0.8536440134048462, "num_tokens": 628136119.0, "step": 3942 }, { "epoch": 2.0055951169888098, "grad_norm": 0.9022424817085266, "learning_rate": 1e-05, "loss": 0.4277, "mean_token_accuracy": 0.8626178503036499, "num_tokens": 628294673.0, "step": 3943 }, { "epoch": 2.0061037639877926, "grad_norm": 0.9768242239952087, "learning_rate": 1e-05, "loss": 0.4266, "mean_token_accuracy": 0.8636049032211304, "num_tokens": 628450141.0, "step": 3944 }, { "epoch": 2.0066124109867753, "grad_norm": 0.9007858037948608, "learning_rate": 1e-05, "loss": 0.4216, "mean_token_accuracy": 0.8628674745559692, "num_tokens": 628608570.0, "step": 3945 }, { "epoch": 2.0071210579857577, "grad_norm": 0.9086043238639832, "learning_rate": 1e-05, "loss": 0.4257, "mean_token_accuracy": 0.8627064228057861, "num_tokens": 628770893.0, "step": 3946 }, { "epoch": 2.0076297049847405, "grad_norm": 0.9539920687675476, "learning_rate": 1e-05, "loss": 0.4487, "mean_token_accuracy": 0.8553727865219116, "num_tokens": 628932043.0, "step": 3947 }, { "epoch": 2.0081383519837233, "grad_norm": 0.9863036274909973, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8575353026390076, "num_tokens": 629091061.0, "step": 3948 }, { "epoch": 2.008646998982706, "grad_norm": 0.9033030271530151, "learning_rate": 1e-05, "loss": 0.4115, "mean_token_accuracy": 0.8660039901733398, "num_tokens": 629252044.0, "step": 3949 }, { "epoch": 2.009155645981689, "grad_norm": 0.9724506735801697, "learning_rate": 1e-05, "loss": 0.4186, "mean_token_accuracy": 0.8633401393890381, "num_tokens": 629412815.0, "step": 3950 }, { "epoch": 2.0096642929806716, "grad_norm": 0.8986982107162476, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8590998649597168, "num_tokens": 629565057.0, "step": 3951 }, { "epoch": 2.010172939979654, "grad_norm": 0.9415915012359619, "learning_rate": 1e-05, "loss": 0.449, "mean_token_accuracy": 0.8536491990089417, "num_tokens": 629722640.0, "step": 3952 }, { "epoch": 2.0106815869786367, "grad_norm": 0.8877673149108887, "learning_rate": 1e-05, "loss": 0.4247, "mean_token_accuracy": 0.8628571033477783, "num_tokens": 629883654.0, "step": 3953 }, { "epoch": 2.0111902339776195, "grad_norm": 0.8674871325492859, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8509914875030518, "num_tokens": 630044682.0, "step": 3954 }, { "epoch": 2.0116988809766023, "grad_norm": 0.9668797254562378, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.8393916487693787, "num_tokens": 630189841.0, "step": 3955 }, { "epoch": 2.012207527975585, "grad_norm": 0.9578163623809814, "learning_rate": 1e-05, "loss": 0.4261, "mean_token_accuracy": 0.8615664839744568, "num_tokens": 630333856.0, "step": 3956 }, { "epoch": 2.0127161749745675, "grad_norm": 0.9142828583717346, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.8505555391311646, "num_tokens": 630485294.0, "step": 3957 }, { "epoch": 2.0132248219735502, "grad_norm": 1.0230820178985596, "learning_rate": 1e-05, "loss": 0.4553, "mean_token_accuracy": 0.8525019884109497, "num_tokens": 630642252.0, "step": 3958 }, { "epoch": 2.013733468972533, "grad_norm": 0.9404370784759521, "learning_rate": 1e-05, "loss": 0.4426, "mean_token_accuracy": 0.8559948205947876, "num_tokens": 630810155.0, "step": 3959 }, { "epoch": 2.014242115971516, "grad_norm": 0.9590718150138855, "learning_rate": 1e-05, "loss": 0.4487, "mean_token_accuracy": 0.854925274848938, "num_tokens": 630967182.0, "step": 3960 }, { "epoch": 2.0147507629704986, "grad_norm": 0.8920328617095947, "learning_rate": 1e-05, "loss": 0.4026, "mean_token_accuracy": 0.8685610294342041, "num_tokens": 631119247.0, "step": 3961 }, { "epoch": 2.0152594099694814, "grad_norm": 0.9542795419692993, "learning_rate": 1e-05, "loss": 0.4006, "mean_token_accuracy": 0.8695034980773926, "num_tokens": 631277861.0, "step": 3962 }, { "epoch": 2.0157680569684637, "grad_norm": 0.8820130825042725, "learning_rate": 1e-05, "loss": 0.3956, "mean_token_accuracy": 0.8703972101211548, "num_tokens": 631431425.0, "step": 3963 }, { "epoch": 2.0162767039674465, "grad_norm": 1.0530736446380615, "learning_rate": 1e-05, "loss": 0.4092, "mean_token_accuracy": 0.8659566640853882, "num_tokens": 631581074.0, "step": 3964 }, { "epoch": 2.0167853509664293, "grad_norm": 0.9052873253822327, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8610852956771851, "num_tokens": 631737047.0, "step": 3965 }, { "epoch": 2.017293997965412, "grad_norm": 0.8882894515991211, "learning_rate": 1e-05, "loss": 0.4342, "mean_token_accuracy": 0.8599810600280762, "num_tokens": 631908085.0, "step": 3966 }, { "epoch": 2.017802644964395, "grad_norm": 1.0958209037780762, "learning_rate": 1e-05, "loss": 0.4618, "mean_token_accuracy": 0.8512526750564575, "num_tokens": 632082603.0, "step": 3967 }, { "epoch": 2.018311291963377, "grad_norm": 1.1128910779953003, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.8599388599395752, "num_tokens": 632228951.0, "step": 3968 }, { "epoch": 2.01881993896236, "grad_norm": 0.9493024945259094, "learning_rate": 1e-05, "loss": 0.4291, "mean_token_accuracy": 0.8602290153503418, "num_tokens": 632395510.0, "step": 3969 }, { "epoch": 2.019328585961343, "grad_norm": 0.9741524457931519, "learning_rate": 1e-05, "loss": 0.4367, "mean_token_accuracy": 0.858225405216217, "num_tokens": 632557630.0, "step": 3970 }, { "epoch": 2.0198372329603256, "grad_norm": 0.9519668221473694, "learning_rate": 1e-05, "loss": 0.431, "mean_token_accuracy": 0.8605380058288574, "num_tokens": 632721024.0, "step": 3971 }, { "epoch": 2.0203458799593084, "grad_norm": 0.9784402847290039, "learning_rate": 1e-05, "loss": 0.4625, "mean_token_accuracy": 0.8513401746749878, "num_tokens": 632888435.0, "step": 3972 }, { "epoch": 2.020854526958291, "grad_norm": 0.8853612542152405, "learning_rate": 1e-05, "loss": 0.4005, "mean_token_accuracy": 0.8683898448944092, "num_tokens": 633047095.0, "step": 3973 }, { "epoch": 2.0213631739572735, "grad_norm": 1.0621803998947144, "learning_rate": 1e-05, "loss": 0.4412, "mean_token_accuracy": 0.8565826416015625, "num_tokens": 633213083.0, "step": 3974 }, { "epoch": 2.0218718209562563, "grad_norm": 0.9953148365020752, "learning_rate": 1e-05, "loss": 0.436, "mean_token_accuracy": 0.8580330610275269, "num_tokens": 633369616.0, "step": 3975 }, { "epoch": 2.022380467955239, "grad_norm": 0.9554272890090942, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8606266975402832, "num_tokens": 633521507.0, "step": 3976 }, { "epoch": 2.022889114954222, "grad_norm": 1.0419212579727173, "learning_rate": 1e-05, "loss": 0.4395, "mean_token_accuracy": 0.8574559688568115, "num_tokens": 633678242.0, "step": 3977 }, { "epoch": 2.0233977619532046, "grad_norm": 1.0358978509902954, "learning_rate": 1e-05, "loss": 0.4423, "mean_token_accuracy": 0.8566417694091797, "num_tokens": 633851293.0, "step": 3978 }, { "epoch": 2.023906408952187, "grad_norm": 1.0834808349609375, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8513150811195374, "num_tokens": 634019931.0, "step": 3979 }, { "epoch": 2.0244150559511698, "grad_norm": 0.8701474666595459, "learning_rate": 1e-05, "loss": 0.4333, "mean_token_accuracy": 0.859501838684082, "num_tokens": 634182314.0, "step": 3980 }, { "epoch": 2.0249237029501526, "grad_norm": 0.9976544976234436, "learning_rate": 1e-05, "loss": 0.4501, "mean_token_accuracy": 0.855332612991333, "num_tokens": 634351784.0, "step": 3981 }, { "epoch": 2.0254323499491353, "grad_norm": 0.9032160043716431, "learning_rate": 1e-05, "loss": 0.4203, "mean_token_accuracy": 0.8629529476165771, "num_tokens": 634503716.0, "step": 3982 }, { "epoch": 2.025940996948118, "grad_norm": 0.9581498503684998, "learning_rate": 1e-05, "loss": 0.4547, "mean_token_accuracy": 0.8519119024276733, "num_tokens": 634660553.0, "step": 3983 }, { "epoch": 2.026449643947101, "grad_norm": 0.9951674938201904, "learning_rate": 1e-05, "loss": 0.4104, "mean_token_accuracy": 0.8657628893852234, "num_tokens": 634812750.0, "step": 3984 }, { "epoch": 2.0269582909460833, "grad_norm": 1.1226887702941895, "learning_rate": 1e-05, "loss": 0.4382, "mean_token_accuracy": 0.8587206602096558, "num_tokens": 634964107.0, "step": 3985 }, { "epoch": 2.027466937945066, "grad_norm": 1.053794503211975, "learning_rate": 1e-05, "loss": 0.4382, "mean_token_accuracy": 0.8575358986854553, "num_tokens": 635124979.0, "step": 3986 }, { "epoch": 2.027975584944049, "grad_norm": 0.8636070489883423, "learning_rate": 1e-05, "loss": 0.4459, "mean_token_accuracy": 0.8556486964225769, "num_tokens": 635294168.0, "step": 3987 }, { "epoch": 2.0284842319430316, "grad_norm": 0.9470566511154175, "learning_rate": 1e-05, "loss": 0.4219, "mean_token_accuracy": 0.8628208041191101, "num_tokens": 635455215.0, "step": 3988 }, { "epoch": 2.0289928789420144, "grad_norm": 0.9437407851219177, "learning_rate": 1e-05, "loss": 0.451, "mean_token_accuracy": 0.8550064563751221, "num_tokens": 635615392.0, "step": 3989 }, { "epoch": 2.0295015259409968, "grad_norm": 0.887579083442688, "learning_rate": 1e-05, "loss": 0.4342, "mean_token_accuracy": 0.8611311912536621, "num_tokens": 635778771.0, "step": 3990 }, { "epoch": 2.0300101729399795, "grad_norm": 0.9505531191825867, "learning_rate": 1e-05, "loss": 0.4582, "mean_token_accuracy": 0.8537662029266357, "num_tokens": 635940023.0, "step": 3991 }, { "epoch": 2.0305188199389623, "grad_norm": 0.9255881309509277, "learning_rate": 1e-05, "loss": 0.4522, "mean_token_accuracy": 0.8540250062942505, "num_tokens": 636091977.0, "step": 3992 }, { "epoch": 2.031027466937945, "grad_norm": 1.0332192182540894, "learning_rate": 1e-05, "loss": 0.4233, "mean_token_accuracy": 0.8634705543518066, "num_tokens": 636260118.0, "step": 3993 }, { "epoch": 2.031536113936928, "grad_norm": 0.9251944422721863, "learning_rate": 1e-05, "loss": 0.4222, "mean_token_accuracy": 0.862652063369751, "num_tokens": 636426068.0, "step": 3994 }, { "epoch": 2.0320447609359107, "grad_norm": 0.9779582619667053, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8561336994171143, "num_tokens": 636574295.0, "step": 3995 }, { "epoch": 2.032553407934893, "grad_norm": 0.9237884879112244, "learning_rate": 1e-05, "loss": 0.4179, "mean_token_accuracy": 0.8642343878746033, "num_tokens": 636728987.0, "step": 3996 }, { "epoch": 2.033062054933876, "grad_norm": 0.8933495283126831, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8535547256469727, "num_tokens": 636889748.0, "step": 3997 }, { "epoch": 2.0335707019328586, "grad_norm": 1.0079468488693237, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8520773649215698, "num_tokens": 637048170.0, "step": 3998 }, { "epoch": 2.0340793489318414, "grad_norm": 0.9072739481925964, "learning_rate": 1e-05, "loss": 0.4405, "mean_token_accuracy": 0.8564949631690979, "num_tokens": 637205011.0, "step": 3999 }, { "epoch": 2.034587995930824, "grad_norm": 1.0024240016937256, "learning_rate": 1e-05, "loss": 0.447, "mean_token_accuracy": 0.8577027320861816, "num_tokens": 637376842.0, "step": 4000 }, { "epoch": 2.0350966429298065, "grad_norm": 0.8983991146087646, "learning_rate": 1e-05, "loss": 0.4599, "mean_token_accuracy": 0.8515585660934448, "num_tokens": 637544257.0, "step": 4001 }, { "epoch": 2.0356052899287893, "grad_norm": 0.9078333973884583, "learning_rate": 1e-05, "loss": 0.4541, "mean_token_accuracy": 0.8519089818000793, "num_tokens": 637708776.0, "step": 4002 }, { "epoch": 2.036113936927772, "grad_norm": 1.111275315284729, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8573497533798218, "num_tokens": 637866051.0, "step": 4003 }, { "epoch": 2.036622583926755, "grad_norm": 0.9840702414512634, "learning_rate": 1e-05, "loss": 0.4424, "mean_token_accuracy": 0.857101559638977, "num_tokens": 638026761.0, "step": 4004 }, { "epoch": 2.0371312309257377, "grad_norm": 0.9235756397247314, "learning_rate": 1e-05, "loss": 0.4365, "mean_token_accuracy": 0.8585684299468994, "num_tokens": 638183663.0, "step": 4005 }, { "epoch": 2.0376398779247205, "grad_norm": 1.0132715702056885, "learning_rate": 1e-05, "loss": 0.4557, "mean_token_accuracy": 0.8540298938751221, "num_tokens": 638337687.0, "step": 4006 }, { "epoch": 2.038148524923703, "grad_norm": 0.9375067353248596, "learning_rate": 1e-05, "loss": 0.4079, "mean_token_accuracy": 0.8654524087905884, "num_tokens": 638500144.0, "step": 4007 }, { "epoch": 2.0386571719226856, "grad_norm": 0.9727280139923096, "learning_rate": 1e-05, "loss": 0.4133, "mean_token_accuracy": 0.8654862642288208, "num_tokens": 638663203.0, "step": 4008 }, { "epoch": 2.0391658189216684, "grad_norm": 1.001802682876587, "learning_rate": 1e-05, "loss": 0.463, "mean_token_accuracy": 0.8518264293670654, "num_tokens": 638816564.0, "step": 4009 }, { "epoch": 2.039674465920651, "grad_norm": 0.9148018956184387, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.860122561454773, "num_tokens": 638985653.0, "step": 4010 }, { "epoch": 2.040183112919634, "grad_norm": 1.015397548675537, "learning_rate": 1e-05, "loss": 0.4381, "mean_token_accuracy": 0.857643187046051, "num_tokens": 639144412.0, "step": 4011 }, { "epoch": 2.0406917599186163, "grad_norm": 0.9982504844665527, "learning_rate": 1e-05, "loss": 0.4698, "mean_token_accuracy": 0.8505040407180786, "num_tokens": 639296652.0, "step": 4012 }, { "epoch": 2.041200406917599, "grad_norm": 1.0248266458511353, "learning_rate": 1e-05, "loss": 0.4815, "mean_token_accuracy": 0.8467176556587219, "num_tokens": 639467379.0, "step": 4013 }, { "epoch": 2.041709053916582, "grad_norm": 0.9136324524879456, "learning_rate": 1e-05, "loss": 0.4248, "mean_token_accuracy": 0.8617210388183594, "num_tokens": 639629812.0, "step": 4014 }, { "epoch": 2.0422177009155646, "grad_norm": 0.9851207137107849, "learning_rate": 1e-05, "loss": 0.3942, "mean_token_accuracy": 0.871574878692627, "num_tokens": 639790695.0, "step": 4015 }, { "epoch": 2.0427263479145474, "grad_norm": 0.9477247595787048, "learning_rate": 1e-05, "loss": 0.4488, "mean_token_accuracy": 0.855469822883606, "num_tokens": 639953979.0, "step": 4016 }, { "epoch": 2.0432349949135302, "grad_norm": 1.1952741146087646, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8568698763847351, "num_tokens": 640110677.0, "step": 4017 }, { "epoch": 2.0437436419125126, "grad_norm": 0.9191387295722961, "learning_rate": 1e-05, "loss": 0.4127, "mean_token_accuracy": 0.8653135299682617, "num_tokens": 640271057.0, "step": 4018 }, { "epoch": 2.0442522889114954, "grad_norm": 0.886969268321991, "learning_rate": 1e-05, "loss": 0.4156, "mean_token_accuracy": 0.862865686416626, "num_tokens": 640421336.0, "step": 4019 }, { "epoch": 2.044760935910478, "grad_norm": 0.9596548080444336, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8615579605102539, "num_tokens": 640574916.0, "step": 4020 }, { "epoch": 2.045269582909461, "grad_norm": 0.9139665365219116, "learning_rate": 1e-05, "loss": 0.4235, "mean_token_accuracy": 0.8608518838882446, "num_tokens": 640731484.0, "step": 4021 }, { "epoch": 2.0457782299084437, "grad_norm": 0.9046313166618347, "learning_rate": 1e-05, "loss": 0.4313, "mean_token_accuracy": 0.8604253530502319, "num_tokens": 640899252.0, "step": 4022 }, { "epoch": 2.046286876907426, "grad_norm": 0.9831649661064148, "learning_rate": 1e-05, "loss": 0.4448, "mean_token_accuracy": 0.8558986186981201, "num_tokens": 641049998.0, "step": 4023 }, { "epoch": 2.046795523906409, "grad_norm": 0.9268972873687744, "learning_rate": 1e-05, "loss": 0.4109, "mean_token_accuracy": 0.8665068745613098, "num_tokens": 641225765.0, "step": 4024 }, { "epoch": 2.0473041709053916, "grad_norm": 1.0772287845611572, "learning_rate": 1e-05, "loss": 0.4348, "mean_token_accuracy": 0.8591381311416626, "num_tokens": 641395261.0, "step": 4025 }, { "epoch": 2.0478128179043744, "grad_norm": 0.9058048129081726, "learning_rate": 1e-05, "loss": 0.4159, "mean_token_accuracy": 0.8644649982452393, "num_tokens": 641553952.0, "step": 4026 }, { "epoch": 2.048321464903357, "grad_norm": 0.9972060322761536, "learning_rate": 1e-05, "loss": 0.4822, "mean_token_accuracy": 0.8465654253959656, "num_tokens": 641713306.0, "step": 4027 }, { "epoch": 2.04883011190234, "grad_norm": 0.9957790970802307, "learning_rate": 1e-05, "loss": 0.4189, "mean_token_accuracy": 0.862214207649231, "num_tokens": 641871688.0, "step": 4028 }, { "epoch": 2.0493387589013223, "grad_norm": 0.9785782098770142, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8533655405044556, "num_tokens": 642028112.0, "step": 4029 }, { "epoch": 2.049847405900305, "grad_norm": 1.0586796998977661, "learning_rate": 1e-05, "loss": 0.4646, "mean_token_accuracy": 0.8501312136650085, "num_tokens": 642182839.0, "step": 4030 }, { "epoch": 2.050356052899288, "grad_norm": 0.9724810719490051, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8565281629562378, "num_tokens": 642333528.0, "step": 4031 }, { "epoch": 2.0508646998982707, "grad_norm": 1.0518903732299805, "learning_rate": 1e-05, "loss": 0.4605, "mean_token_accuracy": 0.8530091047286987, "num_tokens": 642490309.0, "step": 4032 }, { "epoch": 2.0513733468972535, "grad_norm": 0.9552509784698486, "learning_rate": 1e-05, "loss": 0.4149, "mean_token_accuracy": 0.8634740114212036, "num_tokens": 642640172.0, "step": 4033 }, { "epoch": 2.051881993896236, "grad_norm": 1.0244795083999634, "learning_rate": 1e-05, "loss": 0.4371, "mean_token_accuracy": 0.8561656475067139, "num_tokens": 642789777.0, "step": 4034 }, { "epoch": 2.0523906408952186, "grad_norm": 1.0393922328948975, "learning_rate": 1e-05, "loss": 0.4637, "mean_token_accuracy": 0.8515689969062805, "num_tokens": 642951366.0, "step": 4035 }, { "epoch": 2.0528992878942014, "grad_norm": 0.9119642972946167, "learning_rate": 1e-05, "loss": 0.4285, "mean_token_accuracy": 0.8608314990997314, "num_tokens": 643121797.0, "step": 4036 }, { "epoch": 2.053407934893184, "grad_norm": 1.0008642673492432, "learning_rate": 1e-05, "loss": 0.4698, "mean_token_accuracy": 0.8494364023208618, "num_tokens": 643279667.0, "step": 4037 }, { "epoch": 2.053916581892167, "grad_norm": 0.9313419461250305, "learning_rate": 1e-05, "loss": 0.4313, "mean_token_accuracy": 0.8611078262329102, "num_tokens": 643442233.0, "step": 4038 }, { "epoch": 2.0544252288911498, "grad_norm": 0.9488298892974854, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8599388003349304, "num_tokens": 643614205.0, "step": 4039 }, { "epoch": 2.054933875890132, "grad_norm": 0.9752569198608398, "learning_rate": 1e-05, "loss": 0.4316, "mean_token_accuracy": 0.8609519004821777, "num_tokens": 643770297.0, "step": 4040 }, { "epoch": 2.055442522889115, "grad_norm": 0.9722061157226562, "learning_rate": 1e-05, "loss": 0.4264, "mean_token_accuracy": 0.8619430065155029, "num_tokens": 643916346.0, "step": 4041 }, { "epoch": 2.0559511698880977, "grad_norm": 1.0140228271484375, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8532426357269287, "num_tokens": 644077898.0, "step": 4042 }, { "epoch": 2.0564598168870805, "grad_norm": 0.9496407508850098, "learning_rate": 1e-05, "loss": 0.4408, "mean_token_accuracy": 0.8577236533164978, "num_tokens": 644239431.0, "step": 4043 }, { "epoch": 2.0569684638860632, "grad_norm": 1.0077439546585083, "learning_rate": 1e-05, "loss": 0.4554, "mean_token_accuracy": 0.8524376153945923, "num_tokens": 644396823.0, "step": 4044 }, { "epoch": 2.0574771108850456, "grad_norm": 0.8989365100860596, "learning_rate": 1e-05, "loss": 0.4367, "mean_token_accuracy": 0.8576108813285828, "num_tokens": 644551897.0, "step": 4045 }, { "epoch": 2.0579857578840284, "grad_norm": 0.9695183634757996, "learning_rate": 1e-05, "loss": 0.4428, "mean_token_accuracy": 0.8565598130226135, "num_tokens": 644715765.0, "step": 4046 }, { "epoch": 2.058494404883011, "grad_norm": 0.9331927299499512, "learning_rate": 1e-05, "loss": 0.4486, "mean_token_accuracy": 0.8558671474456787, "num_tokens": 644885356.0, "step": 4047 }, { "epoch": 2.059003051881994, "grad_norm": 0.9889021515846252, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.8565429449081421, "num_tokens": 645039148.0, "step": 4048 }, { "epoch": 2.0595116988809767, "grad_norm": 0.9650540351867676, "learning_rate": 1e-05, "loss": 0.4473, "mean_token_accuracy": 0.8539718389511108, "num_tokens": 645202089.0, "step": 4049 }, { "epoch": 2.0600203458799595, "grad_norm": 0.9073891043663025, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.859544038772583, "num_tokens": 645359940.0, "step": 4050 }, { "epoch": 2.060528992878942, "grad_norm": 0.9147620797157288, "learning_rate": 1e-05, "loss": 0.4402, "mean_token_accuracy": 0.8565871715545654, "num_tokens": 645530000.0, "step": 4051 }, { "epoch": 2.0610376398779247, "grad_norm": 0.9782045483589172, "learning_rate": 1e-05, "loss": 0.4254, "mean_token_accuracy": 0.8615814447402954, "num_tokens": 645695713.0, "step": 4052 }, { "epoch": 2.0615462868769074, "grad_norm": 1.0421298742294312, "learning_rate": 1e-05, "loss": 0.4073, "mean_token_accuracy": 0.8664171695709229, "num_tokens": 645845526.0, "step": 4053 }, { "epoch": 2.0620549338758902, "grad_norm": 0.9888449311256409, "learning_rate": 1e-05, "loss": 0.4495, "mean_token_accuracy": 0.8537807464599609, "num_tokens": 646007060.0, "step": 4054 }, { "epoch": 2.062563580874873, "grad_norm": 1.046412467956543, "learning_rate": 1e-05, "loss": 0.4361, "mean_token_accuracy": 0.8585008382797241, "num_tokens": 646158897.0, "step": 4055 }, { "epoch": 2.0630722278738554, "grad_norm": 1.0488885641098022, "learning_rate": 1e-05, "loss": 0.4464, "mean_token_accuracy": 0.8543227910995483, "num_tokens": 646310789.0, "step": 4056 }, { "epoch": 2.063580874872838, "grad_norm": 0.9440844655036926, "learning_rate": 1e-05, "loss": 0.4204, "mean_token_accuracy": 0.8615769147872925, "num_tokens": 646489056.0, "step": 4057 }, { "epoch": 2.064089521871821, "grad_norm": 1.0296015739440918, "learning_rate": 1e-05, "loss": 0.4104, "mean_token_accuracy": 0.8658794164657593, "num_tokens": 646651450.0, "step": 4058 }, { "epoch": 2.0645981688708037, "grad_norm": 1.0239273309707642, "learning_rate": 1e-05, "loss": 0.4552, "mean_token_accuracy": 0.8538615703582764, "num_tokens": 646808789.0, "step": 4059 }, { "epoch": 2.0651068158697865, "grad_norm": 0.9998471736907959, "learning_rate": 1e-05, "loss": 0.4459, "mean_token_accuracy": 0.8558937907218933, "num_tokens": 646974846.0, "step": 4060 }, { "epoch": 2.0656154628687693, "grad_norm": 0.9561287760734558, "learning_rate": 1e-05, "loss": 0.4152, "mean_token_accuracy": 0.8642705082893372, "num_tokens": 647131712.0, "step": 4061 }, { "epoch": 2.0661241098677516, "grad_norm": 1.3276989459991455, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8598711490631104, "num_tokens": 647294337.0, "step": 4062 }, { "epoch": 2.0666327568667344, "grad_norm": 1.076204776763916, "learning_rate": 1e-05, "loss": 0.4626, "mean_token_accuracy": 0.8524924516677856, "num_tokens": 647453242.0, "step": 4063 }, { "epoch": 2.067141403865717, "grad_norm": 0.8968902230262756, "learning_rate": 1e-05, "loss": 0.4183, "mean_token_accuracy": 0.863444447517395, "num_tokens": 647614724.0, "step": 4064 }, { "epoch": 2.0676500508647, "grad_norm": 1.007246732711792, "learning_rate": 1e-05, "loss": 0.4166, "mean_token_accuracy": 0.8639742732048035, "num_tokens": 647774121.0, "step": 4065 }, { "epoch": 2.068158697863683, "grad_norm": 0.9755057692527771, "learning_rate": 1e-05, "loss": 0.4477, "mean_token_accuracy": 0.855438232421875, "num_tokens": 647933285.0, "step": 4066 }, { "epoch": 2.068667344862665, "grad_norm": 0.9740897417068481, "learning_rate": 1e-05, "loss": 0.4314, "mean_token_accuracy": 0.8596476316452026, "num_tokens": 648098194.0, "step": 4067 }, { "epoch": 2.069175991861648, "grad_norm": 0.9107229113578796, "learning_rate": 1e-05, "loss": 0.4188, "mean_token_accuracy": 0.8632190227508545, "num_tokens": 648251173.0, "step": 4068 }, { "epoch": 2.0696846388606307, "grad_norm": 1.126113772392273, "learning_rate": 1e-05, "loss": 0.4806, "mean_token_accuracy": 0.845462441444397, "num_tokens": 648398627.0, "step": 4069 }, { "epoch": 2.0701932858596135, "grad_norm": 1.0377577543258667, "learning_rate": 1e-05, "loss": 0.4416, "mean_token_accuracy": 0.8560168743133545, "num_tokens": 648551725.0, "step": 4070 }, { "epoch": 2.0707019328585963, "grad_norm": 0.9613800048828125, "learning_rate": 1e-05, "loss": 0.4529, "mean_token_accuracy": 0.8547857999801636, "num_tokens": 648708382.0, "step": 4071 }, { "epoch": 2.0712105798575786, "grad_norm": 1.0431551933288574, "learning_rate": 1e-05, "loss": 0.4177, "mean_token_accuracy": 0.8637215495109558, "num_tokens": 648863508.0, "step": 4072 }, { "epoch": 2.0717192268565614, "grad_norm": 0.9346727132797241, "learning_rate": 1e-05, "loss": 0.468, "mean_token_accuracy": 0.8500462770462036, "num_tokens": 649012495.0, "step": 4073 }, { "epoch": 2.072227873855544, "grad_norm": 0.8879013061523438, "learning_rate": 1e-05, "loss": 0.3821, "mean_token_accuracy": 0.8734450340270996, "num_tokens": 649176493.0, "step": 4074 }, { "epoch": 2.072736520854527, "grad_norm": 0.966341495513916, "learning_rate": 1e-05, "loss": 0.415, "mean_token_accuracy": 0.8645215034484863, "num_tokens": 649334145.0, "step": 4075 }, { "epoch": 2.0732451678535098, "grad_norm": 0.8374088406562805, "learning_rate": 1e-05, "loss": 0.4019, "mean_token_accuracy": 0.8681143522262573, "num_tokens": 649497977.0, "step": 4076 }, { "epoch": 2.0737538148524925, "grad_norm": 0.9355109930038452, "learning_rate": 1e-05, "loss": 0.4626, "mean_token_accuracy": 0.8502155542373657, "num_tokens": 649665574.0, "step": 4077 }, { "epoch": 2.074262461851475, "grad_norm": 0.9582520723342896, "learning_rate": 1e-05, "loss": 0.4244, "mean_token_accuracy": 0.8623635768890381, "num_tokens": 649826804.0, "step": 4078 }, { "epoch": 2.0747711088504577, "grad_norm": 0.9194473624229431, "learning_rate": 1e-05, "loss": 0.4076, "mean_token_accuracy": 0.8666621446609497, "num_tokens": 649979766.0, "step": 4079 }, { "epoch": 2.0752797558494405, "grad_norm": 0.9332849383354187, "learning_rate": 1e-05, "loss": 0.4414, "mean_token_accuracy": 0.857512891292572, "num_tokens": 650125023.0, "step": 4080 }, { "epoch": 2.0757884028484233, "grad_norm": 0.9183851480484009, "learning_rate": 1e-05, "loss": 0.4326, "mean_token_accuracy": 0.8598358631134033, "num_tokens": 650285291.0, "step": 4081 }, { "epoch": 2.076297049847406, "grad_norm": 0.8456012606620789, "learning_rate": 1e-05, "loss": 0.4216, "mean_token_accuracy": 0.8641607761383057, "num_tokens": 650457502.0, "step": 4082 }, { "epoch": 2.076805696846389, "grad_norm": 0.8970609903335571, "learning_rate": 1e-05, "loss": 0.4354, "mean_token_accuracy": 0.859481155872345, "num_tokens": 650618262.0, "step": 4083 }, { "epoch": 2.077314343845371, "grad_norm": 0.8981460928916931, "learning_rate": 1e-05, "loss": 0.4358, "mean_token_accuracy": 0.8589435815811157, "num_tokens": 650774229.0, "step": 4084 }, { "epoch": 2.077822990844354, "grad_norm": 0.9190006852149963, "learning_rate": 1e-05, "loss": 0.4386, "mean_token_accuracy": 0.8579449653625488, "num_tokens": 650932968.0, "step": 4085 }, { "epoch": 2.0783316378433367, "grad_norm": 0.960010290145874, "learning_rate": 1e-05, "loss": 0.4584, "mean_token_accuracy": 0.8517380356788635, "num_tokens": 651094235.0, "step": 4086 }, { "epoch": 2.0788402848423195, "grad_norm": 1.0507935285568237, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8477388620376587, "num_tokens": 651247051.0, "step": 4087 }, { "epoch": 2.0793489318413023, "grad_norm": 0.8672595024108887, "learning_rate": 1e-05, "loss": 0.4247, "mean_token_accuracy": 0.8619122505187988, "num_tokens": 651403530.0, "step": 4088 }, { "epoch": 2.0798575788402847, "grad_norm": 0.8593379855155945, "learning_rate": 1e-05, "loss": 0.4425, "mean_token_accuracy": 0.8561261892318726, "num_tokens": 651579353.0, "step": 4089 }, { "epoch": 2.0803662258392674, "grad_norm": 0.9281165599822998, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.852559506893158, "num_tokens": 651737149.0, "step": 4090 }, { "epoch": 2.0808748728382502, "grad_norm": 0.9167788028717041, "learning_rate": 1e-05, "loss": 0.4347, "mean_token_accuracy": 0.8594133257865906, "num_tokens": 651896201.0, "step": 4091 }, { "epoch": 2.081383519837233, "grad_norm": 1.0042438507080078, "learning_rate": 1e-05, "loss": 0.4514, "mean_token_accuracy": 0.854072630405426, "num_tokens": 652057532.0, "step": 4092 }, { "epoch": 2.081892166836216, "grad_norm": 0.8949097990989685, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8516091704368591, "num_tokens": 652209462.0, "step": 4093 }, { "epoch": 2.082400813835198, "grad_norm": 0.9609045386314392, "learning_rate": 1e-05, "loss": 0.4275, "mean_token_accuracy": 0.8596907258033752, "num_tokens": 652364933.0, "step": 4094 }, { "epoch": 2.082909460834181, "grad_norm": 0.902187705039978, "learning_rate": 1e-05, "loss": 0.4317, "mean_token_accuracy": 0.8602837324142456, "num_tokens": 652537736.0, "step": 4095 }, { "epoch": 2.0834181078331637, "grad_norm": 0.9226665496826172, "learning_rate": 1e-05, "loss": 0.4525, "mean_token_accuracy": 0.8551701307296753, "num_tokens": 652693646.0, "step": 4096 }, { "epoch": 2.0839267548321465, "grad_norm": 0.9428882002830505, "learning_rate": 1e-05, "loss": 0.4278, "mean_token_accuracy": 0.8616924285888672, "num_tokens": 652848681.0, "step": 4097 }, { "epoch": 2.0844354018311293, "grad_norm": 0.8364573121070862, "learning_rate": 1e-05, "loss": 0.4111, "mean_token_accuracy": 0.8645583391189575, "num_tokens": 653007977.0, "step": 4098 }, { "epoch": 2.084944048830112, "grad_norm": 0.9261143207550049, "learning_rate": 1e-05, "loss": 0.4133, "mean_token_accuracy": 0.866054117679596, "num_tokens": 653154060.0, "step": 4099 }, { "epoch": 2.0854526958290944, "grad_norm": 0.966831386089325, "learning_rate": 1e-05, "loss": 0.4412, "mean_token_accuracy": 0.8580394387245178, "num_tokens": 653315197.0, "step": 4100 }, { "epoch": 2.085961342828077, "grad_norm": 0.850862979888916, "learning_rate": 1e-05, "loss": 0.4063, "mean_token_accuracy": 0.8678941130638123, "num_tokens": 653474119.0, "step": 4101 }, { "epoch": 2.08646998982706, "grad_norm": 0.9313889741897583, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8533434867858887, "num_tokens": 653634130.0, "step": 4102 }, { "epoch": 2.086978636826043, "grad_norm": 0.8664780259132385, "learning_rate": 1e-05, "loss": 0.4169, "mean_token_accuracy": 0.8633689284324646, "num_tokens": 653796634.0, "step": 4103 }, { "epoch": 2.0874872838250256, "grad_norm": 0.9221550822257996, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8431581258773804, "num_tokens": 653956123.0, "step": 4104 }, { "epoch": 2.0879959308240084, "grad_norm": 0.9604993462562561, "learning_rate": 1e-05, "loss": 0.4909, "mean_token_accuracy": 0.8433293104171753, "num_tokens": 654110242.0, "step": 4105 }, { "epoch": 2.0885045778229907, "grad_norm": 0.8623951077461243, "learning_rate": 1e-05, "loss": 0.4236, "mean_token_accuracy": 0.8634408712387085, "num_tokens": 654284883.0, "step": 4106 }, { "epoch": 2.0890132248219735, "grad_norm": 0.893561840057373, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8487818837165833, "num_tokens": 654451207.0, "step": 4107 }, { "epoch": 2.0895218718209563, "grad_norm": 0.8882175087928772, "learning_rate": 1e-05, "loss": 0.4679, "mean_token_accuracy": 0.8507075905799866, "num_tokens": 654620877.0, "step": 4108 }, { "epoch": 2.090030518819939, "grad_norm": 0.8599258661270142, "learning_rate": 1e-05, "loss": 0.4693, "mean_token_accuracy": 0.8494659662246704, "num_tokens": 654789815.0, "step": 4109 }, { "epoch": 2.090539165818922, "grad_norm": 0.869370698928833, "learning_rate": 1e-05, "loss": 0.4545, "mean_token_accuracy": 0.8524019122123718, "num_tokens": 654955083.0, "step": 4110 }, { "epoch": 2.091047812817904, "grad_norm": 0.9025420546531677, "learning_rate": 1e-05, "loss": 0.4118, "mean_token_accuracy": 0.865307092666626, "num_tokens": 655105899.0, "step": 4111 }, { "epoch": 2.091556459816887, "grad_norm": 0.8740807175636292, "learning_rate": 1e-05, "loss": 0.4223, "mean_token_accuracy": 0.8631594181060791, "num_tokens": 655262947.0, "step": 4112 }, { "epoch": 2.0920651068158698, "grad_norm": 0.8836954832077026, "learning_rate": 1e-05, "loss": 0.4259, "mean_token_accuracy": 0.8628745079040527, "num_tokens": 655424948.0, "step": 4113 }, { "epoch": 2.0925737538148526, "grad_norm": 0.9837310314178467, "learning_rate": 1e-05, "loss": 0.4239, "mean_token_accuracy": 0.8626061677932739, "num_tokens": 655582982.0, "step": 4114 }, { "epoch": 2.0930824008138353, "grad_norm": 0.9199106097221375, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8546483516693115, "num_tokens": 655736818.0, "step": 4115 }, { "epoch": 2.0935910478128177, "grad_norm": 0.9366452097892761, "learning_rate": 1e-05, "loss": 0.4269, "mean_token_accuracy": 0.860552191734314, "num_tokens": 655892296.0, "step": 4116 }, { "epoch": 2.0940996948118005, "grad_norm": 0.9860832691192627, "learning_rate": 1e-05, "loss": 0.4259, "mean_token_accuracy": 0.859500527381897, "num_tokens": 656042630.0, "step": 4117 }, { "epoch": 2.0946083418107833, "grad_norm": 0.8909306526184082, "learning_rate": 1e-05, "loss": 0.4208, "mean_token_accuracy": 0.8629220128059387, "num_tokens": 656199760.0, "step": 4118 }, { "epoch": 2.095116988809766, "grad_norm": 0.9349894523620605, "learning_rate": 1e-05, "loss": 0.4527, "mean_token_accuracy": 0.8533021211624146, "num_tokens": 656356098.0, "step": 4119 }, { "epoch": 2.095625635808749, "grad_norm": 0.9588991403579712, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8521022796630859, "num_tokens": 656514419.0, "step": 4120 }, { "epoch": 2.0961342828077316, "grad_norm": 0.8751233816146851, "learning_rate": 1e-05, "loss": 0.3911, "mean_token_accuracy": 0.8713945150375366, "num_tokens": 656661884.0, "step": 4121 }, { "epoch": 2.096642929806714, "grad_norm": 0.8489260077476501, "learning_rate": 1e-05, "loss": 0.4073, "mean_token_accuracy": 0.8669377565383911, "num_tokens": 656815650.0, "step": 4122 }, { "epoch": 2.0971515768056967, "grad_norm": 0.9509919881820679, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.8608969449996948, "num_tokens": 656980239.0, "step": 4123 }, { "epoch": 2.0976602238046795, "grad_norm": 0.9359029531478882, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8602707982063293, "num_tokens": 657128763.0, "step": 4124 }, { "epoch": 2.0981688708036623, "grad_norm": 0.8939815759658813, "learning_rate": 1e-05, "loss": 0.4478, "mean_token_accuracy": 0.8559001088142395, "num_tokens": 657290891.0, "step": 4125 }, { "epoch": 2.098677517802645, "grad_norm": 0.9452753067016602, "learning_rate": 1e-05, "loss": 0.4483, "mean_token_accuracy": 0.8554445505142212, "num_tokens": 657442965.0, "step": 4126 }, { "epoch": 2.099186164801628, "grad_norm": 0.832610011100769, "learning_rate": 1e-05, "loss": 0.4376, "mean_token_accuracy": 0.8597967624664307, "num_tokens": 657627642.0, "step": 4127 }, { "epoch": 2.0996948118006102, "grad_norm": 0.9224095344543457, "learning_rate": 1e-05, "loss": 0.4298, "mean_token_accuracy": 0.8613498210906982, "num_tokens": 657787301.0, "step": 4128 }, { "epoch": 2.100203458799593, "grad_norm": 0.9500312805175781, "learning_rate": 1e-05, "loss": 0.4367, "mean_token_accuracy": 0.859756350517273, "num_tokens": 657932879.0, "step": 4129 }, { "epoch": 2.100712105798576, "grad_norm": 0.9640877842903137, "learning_rate": 1e-05, "loss": 0.4298, "mean_token_accuracy": 0.8614565134048462, "num_tokens": 658086444.0, "step": 4130 }, { "epoch": 2.1012207527975586, "grad_norm": 0.9052125215530396, "learning_rate": 1e-05, "loss": 0.4639, "mean_token_accuracy": 0.8509527444839478, "num_tokens": 658255960.0, "step": 4131 }, { "epoch": 2.1017293997965414, "grad_norm": 0.9720301628112793, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.8574552536010742, "num_tokens": 658417852.0, "step": 4132 }, { "epoch": 2.1022380467955237, "grad_norm": 0.9318115711212158, "learning_rate": 1e-05, "loss": 0.4523, "mean_token_accuracy": 0.8530559539794922, "num_tokens": 658574716.0, "step": 4133 }, { "epoch": 2.1027466937945065, "grad_norm": 0.9879447817802429, "learning_rate": 1e-05, "loss": 0.4223, "mean_token_accuracy": 0.8617345094680786, "num_tokens": 658727698.0, "step": 4134 }, { "epoch": 2.1032553407934893, "grad_norm": 0.9011422991752625, "learning_rate": 1e-05, "loss": 0.4256, "mean_token_accuracy": 0.8616060018539429, "num_tokens": 658885875.0, "step": 4135 }, { "epoch": 2.103763987792472, "grad_norm": 0.9072966575622559, "learning_rate": 1e-05, "loss": 0.4321, "mean_token_accuracy": 0.860308825969696, "num_tokens": 659036589.0, "step": 4136 }, { "epoch": 2.104272634791455, "grad_norm": 0.9436938166618347, "learning_rate": 1e-05, "loss": 0.4455, "mean_token_accuracy": 0.8565719127655029, "num_tokens": 659196714.0, "step": 4137 }, { "epoch": 2.104781281790437, "grad_norm": 0.9317730069160461, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8525628447532654, "num_tokens": 659356387.0, "step": 4138 }, { "epoch": 2.10528992878942, "grad_norm": 0.8680718541145325, "learning_rate": 1e-05, "loss": 0.4282, "mean_token_accuracy": 0.8603150844573975, "num_tokens": 659511890.0, "step": 4139 }, { "epoch": 2.105798575788403, "grad_norm": 0.9945595264434814, "learning_rate": 1e-05, "loss": 0.4458, "mean_token_accuracy": 0.8552508354187012, "num_tokens": 659665573.0, "step": 4140 }, { "epoch": 2.1063072227873856, "grad_norm": 1.0013288259506226, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.8601479530334473, "num_tokens": 659823114.0, "step": 4141 }, { "epoch": 2.1068158697863684, "grad_norm": 0.91698157787323, "learning_rate": 1e-05, "loss": 0.4518, "mean_token_accuracy": 0.855778694152832, "num_tokens": 659986298.0, "step": 4142 }, { "epoch": 2.107324516785351, "grad_norm": 0.9010708332061768, "learning_rate": 1e-05, "loss": 0.4145, "mean_token_accuracy": 0.8645851612091064, "num_tokens": 660144890.0, "step": 4143 }, { "epoch": 2.1078331637843335, "grad_norm": 0.8915697932243347, "learning_rate": 1e-05, "loss": 0.4248, "mean_token_accuracy": 0.8608888387680054, "num_tokens": 660295004.0, "step": 4144 }, { "epoch": 2.1083418107833163, "grad_norm": 0.9084402322769165, "learning_rate": 1e-05, "loss": 0.4168, "mean_token_accuracy": 0.8644142150878906, "num_tokens": 660447649.0, "step": 4145 }, { "epoch": 2.108850457782299, "grad_norm": 0.9173476696014404, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8596066236495972, "num_tokens": 660607411.0, "step": 4146 }, { "epoch": 2.109359104781282, "grad_norm": 0.9064167737960815, "learning_rate": 1e-05, "loss": 0.4361, "mean_token_accuracy": 0.8588464260101318, "num_tokens": 660761477.0, "step": 4147 }, { "epoch": 2.1098677517802646, "grad_norm": 0.9110246896743774, "learning_rate": 1e-05, "loss": 0.4104, "mean_token_accuracy": 0.8663687705993652, "num_tokens": 660921469.0, "step": 4148 }, { "epoch": 2.1103763987792474, "grad_norm": 0.9070311784744263, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8485805988311768, "num_tokens": 661082876.0, "step": 4149 }, { "epoch": 2.1108850457782298, "grad_norm": 0.9548289179801941, "learning_rate": 1e-05, "loss": 0.4772, "mean_token_accuracy": 0.8454893231391907, "num_tokens": 661237468.0, "step": 4150 }, { "epoch": 2.1113936927772126, "grad_norm": 0.8961446285247803, "learning_rate": 1e-05, "loss": 0.435, "mean_token_accuracy": 0.859317421913147, "num_tokens": 661405065.0, "step": 4151 }, { "epoch": 2.1119023397761953, "grad_norm": 0.9669830203056335, "learning_rate": 1e-05, "loss": 0.4198, "mean_token_accuracy": 0.8659858703613281, "num_tokens": 661553561.0, "step": 4152 }, { "epoch": 2.112410986775178, "grad_norm": 0.9334442615509033, "learning_rate": 1e-05, "loss": 0.4588, "mean_token_accuracy": 0.8505697250366211, "num_tokens": 661711645.0, "step": 4153 }, { "epoch": 2.112919633774161, "grad_norm": 1.0024757385253906, "learning_rate": 1e-05, "loss": 0.4365, "mean_token_accuracy": 0.8586753606796265, "num_tokens": 661865294.0, "step": 4154 }, { "epoch": 2.1134282807731433, "grad_norm": 0.8926826119422913, "learning_rate": 1e-05, "loss": 0.4305, "mean_token_accuracy": 0.8604754209518433, "num_tokens": 662026583.0, "step": 4155 }, { "epoch": 2.113936927772126, "grad_norm": 0.9363488554954529, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8561187982559204, "num_tokens": 662192533.0, "step": 4156 }, { "epoch": 2.114445574771109, "grad_norm": 0.9798843264579773, "learning_rate": 1e-05, "loss": 0.4305, "mean_token_accuracy": 0.8614747524261475, "num_tokens": 662346698.0, "step": 4157 }, { "epoch": 2.1149542217700916, "grad_norm": 0.9556357264518738, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8487550616264343, "num_tokens": 662502572.0, "step": 4158 }, { "epoch": 2.1154628687690744, "grad_norm": 0.938748836517334, "learning_rate": 1e-05, "loss": 0.4425, "mean_token_accuracy": 0.8568930625915527, "num_tokens": 662658294.0, "step": 4159 }, { "epoch": 2.1159715157680568, "grad_norm": 1.0129530429840088, "learning_rate": 1e-05, "loss": 0.4735, "mean_token_accuracy": 0.8483498692512512, "num_tokens": 662821554.0, "step": 4160 }, { "epoch": 2.1164801627670395, "grad_norm": 0.9006751179695129, "learning_rate": 1e-05, "loss": 0.4248, "mean_token_accuracy": 0.862185001373291, "num_tokens": 662974128.0, "step": 4161 }, { "epoch": 2.1169888097660223, "grad_norm": 1.0422261953353882, "learning_rate": 1e-05, "loss": 0.4582, "mean_token_accuracy": 0.8512689471244812, "num_tokens": 663123259.0, "step": 4162 }, { "epoch": 2.117497456765005, "grad_norm": 0.9526087641716003, "learning_rate": 1e-05, "loss": 0.4219, "mean_token_accuracy": 0.8615497350692749, "num_tokens": 663280117.0, "step": 4163 }, { "epoch": 2.118006103763988, "grad_norm": 0.9149175882339478, "learning_rate": 1e-05, "loss": 0.4161, "mean_token_accuracy": 0.8647677898406982, "num_tokens": 663445600.0, "step": 4164 }, { "epoch": 2.1185147507629707, "grad_norm": 1.089755892753601, "learning_rate": 1e-05, "loss": 0.4349, "mean_token_accuracy": 0.8590355515480042, "num_tokens": 663599402.0, "step": 4165 }, { "epoch": 2.119023397761953, "grad_norm": 1.0591999292373657, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8534281849861145, "num_tokens": 663747652.0, "step": 4166 }, { "epoch": 2.119532044760936, "grad_norm": 0.9893795847892761, "learning_rate": 1e-05, "loss": 0.4519, "mean_token_accuracy": 0.8535148501396179, "num_tokens": 663903015.0, "step": 4167 }, { "epoch": 2.1200406917599186, "grad_norm": 1.015095591545105, "learning_rate": 1e-05, "loss": 0.4977, "mean_token_accuracy": 0.8418223857879639, "num_tokens": 664069904.0, "step": 4168 }, { "epoch": 2.1205493387589014, "grad_norm": 0.9398860931396484, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8568176627159119, "num_tokens": 664228482.0, "step": 4169 }, { "epoch": 2.121057985757884, "grad_norm": 1.008710265159607, "learning_rate": 1e-05, "loss": 0.4427, "mean_token_accuracy": 0.8567774295806885, "num_tokens": 664386511.0, "step": 4170 }, { "epoch": 2.1215666327568665, "grad_norm": 0.9196237325668335, "learning_rate": 1e-05, "loss": 0.4461, "mean_token_accuracy": 0.8564049005508423, "num_tokens": 664551869.0, "step": 4171 }, { "epoch": 2.1220752797558493, "grad_norm": 0.9031176567077637, "learning_rate": 1e-05, "loss": 0.4236, "mean_token_accuracy": 0.8635759949684143, "num_tokens": 664712795.0, "step": 4172 }, { "epoch": 2.122583926754832, "grad_norm": 1.0113224983215332, "learning_rate": 1e-05, "loss": 0.4479, "mean_token_accuracy": 0.8555098176002502, "num_tokens": 664889059.0, "step": 4173 }, { "epoch": 2.123092573753815, "grad_norm": 0.9578308463096619, "learning_rate": 1e-05, "loss": 0.4197, "mean_token_accuracy": 0.8610748648643494, "num_tokens": 665036489.0, "step": 4174 }, { "epoch": 2.1236012207527977, "grad_norm": 0.9232265949249268, "learning_rate": 1e-05, "loss": 0.4169, "mean_token_accuracy": 0.864449143409729, "num_tokens": 665192739.0, "step": 4175 }, { "epoch": 2.1241098677517805, "grad_norm": 0.9943797588348389, "learning_rate": 1e-05, "loss": 0.4364, "mean_token_accuracy": 0.8594911098480225, "num_tokens": 665345764.0, "step": 4176 }, { "epoch": 2.124618514750763, "grad_norm": 0.9452236890792847, "learning_rate": 1e-05, "loss": 0.4367, "mean_token_accuracy": 0.8575600385665894, "num_tokens": 665498670.0, "step": 4177 }, { "epoch": 2.1251271617497456, "grad_norm": 0.8776096701622009, "learning_rate": 1e-05, "loss": 0.4315, "mean_token_accuracy": 0.8611912727355957, "num_tokens": 665650495.0, "step": 4178 }, { "epoch": 2.1256358087487284, "grad_norm": 0.9938535690307617, "learning_rate": 1e-05, "loss": 0.4524, "mean_token_accuracy": 0.8539614081382751, "num_tokens": 665814461.0, "step": 4179 }, { "epoch": 2.126144455747711, "grad_norm": 0.9521315097808838, "learning_rate": 1e-05, "loss": 0.4477, "mean_token_accuracy": 0.8568743467330933, "num_tokens": 665976759.0, "step": 4180 }, { "epoch": 2.126653102746694, "grad_norm": 0.8951365947723389, "learning_rate": 1e-05, "loss": 0.4541, "mean_token_accuracy": 0.8524468541145325, "num_tokens": 666134195.0, "step": 4181 }, { "epoch": 2.1271617497456763, "grad_norm": 1.0694705247879028, "learning_rate": 1e-05, "loss": 0.4307, "mean_token_accuracy": 0.8601644039154053, "num_tokens": 666274229.0, "step": 4182 }, { "epoch": 2.127670396744659, "grad_norm": 0.9351718425750732, "learning_rate": 1e-05, "loss": 0.4448, "mean_token_accuracy": 0.8556416034698486, "num_tokens": 666445641.0, "step": 4183 }, { "epoch": 2.128179043743642, "grad_norm": 0.9262452125549316, "learning_rate": 1e-05, "loss": 0.4588, "mean_token_accuracy": 0.851718544960022, "num_tokens": 666617768.0, "step": 4184 }, { "epoch": 2.1286876907426246, "grad_norm": 0.8724216222763062, "learning_rate": 1e-05, "loss": 0.4291, "mean_token_accuracy": 0.8605974912643433, "num_tokens": 666775755.0, "step": 4185 }, { "epoch": 2.1291963377416074, "grad_norm": 0.9080948233604431, "learning_rate": 1e-05, "loss": 0.4679, "mean_token_accuracy": 0.8505131006240845, "num_tokens": 666935081.0, "step": 4186 }, { "epoch": 2.1297049847405902, "grad_norm": 0.9036731719970703, "learning_rate": 1e-05, "loss": 0.4081, "mean_token_accuracy": 0.8680239915847778, "num_tokens": 667083302.0, "step": 4187 }, { "epoch": 2.1302136317395726, "grad_norm": 0.9800155758857727, "learning_rate": 1e-05, "loss": 0.4653, "mean_token_accuracy": 0.8516967296600342, "num_tokens": 667248835.0, "step": 4188 }, { "epoch": 2.1307222787385554, "grad_norm": 0.9089070558547974, "learning_rate": 1e-05, "loss": 0.4707, "mean_token_accuracy": 0.8470054864883423, "num_tokens": 667402091.0, "step": 4189 }, { "epoch": 2.131230925737538, "grad_norm": 0.9217497706413269, "learning_rate": 1e-05, "loss": 0.4316, "mean_token_accuracy": 0.8607636094093323, "num_tokens": 667556405.0, "step": 4190 }, { "epoch": 2.131739572736521, "grad_norm": 0.9196179509162903, "learning_rate": 1e-05, "loss": 0.4536, "mean_token_accuracy": 0.8530000448226929, "num_tokens": 667723355.0, "step": 4191 }, { "epoch": 2.1322482197355037, "grad_norm": 0.9593817591667175, "learning_rate": 1e-05, "loss": 0.4511, "mean_token_accuracy": 0.853350043296814, "num_tokens": 667894932.0, "step": 4192 }, { "epoch": 2.1327568667344865, "grad_norm": 0.8810621500015259, "learning_rate": 1e-05, "loss": 0.4378, "mean_token_accuracy": 0.8580273389816284, "num_tokens": 668054091.0, "step": 4193 }, { "epoch": 2.133265513733469, "grad_norm": 0.9316674470901489, "learning_rate": 1e-05, "loss": 0.4252, "mean_token_accuracy": 0.8623379468917847, "num_tokens": 668202295.0, "step": 4194 }, { "epoch": 2.1337741607324516, "grad_norm": 0.8470092415809631, "learning_rate": 1e-05, "loss": 0.4093, "mean_token_accuracy": 0.8674142360687256, "num_tokens": 668373127.0, "step": 4195 }, { "epoch": 2.1342828077314344, "grad_norm": 0.9411569237709045, "learning_rate": 1e-05, "loss": 0.4912, "mean_token_accuracy": 0.8427003622055054, "num_tokens": 668537919.0, "step": 4196 }, { "epoch": 2.134791454730417, "grad_norm": 1.4109463691711426, "learning_rate": 1e-05, "loss": 0.4535, "mean_token_accuracy": 0.8517212271690369, "num_tokens": 668698199.0, "step": 4197 }, { "epoch": 2.1353001017294, "grad_norm": 0.9155637621879578, "learning_rate": 1e-05, "loss": 0.4086, "mean_token_accuracy": 0.8653716444969177, "num_tokens": 668854088.0, "step": 4198 }, { "epoch": 2.1358087487283823, "grad_norm": 0.9064233303070068, "learning_rate": 1e-05, "loss": 0.4226, "mean_token_accuracy": 0.8617438673973083, "num_tokens": 669017211.0, "step": 4199 }, { "epoch": 2.136317395727365, "grad_norm": 0.9097160696983337, "learning_rate": 1e-05, "loss": 0.4261, "mean_token_accuracy": 0.862013041973114, "num_tokens": 669192028.0, "step": 4200 }, { "epoch": 2.136826042726348, "grad_norm": 0.9515498876571655, "learning_rate": 1e-05, "loss": 0.4183, "mean_token_accuracy": 0.8625266551971436, "num_tokens": 669352646.0, "step": 4201 }, { "epoch": 2.1373346897253307, "grad_norm": 0.9216282963752747, "learning_rate": 1e-05, "loss": 0.4375, "mean_token_accuracy": 0.8592369556427002, "num_tokens": 669516310.0, "step": 4202 }, { "epoch": 2.1378433367243135, "grad_norm": 0.9496901035308838, "learning_rate": 1e-05, "loss": 0.4458, "mean_token_accuracy": 0.8564037680625916, "num_tokens": 669669101.0, "step": 4203 }, { "epoch": 2.138351983723296, "grad_norm": 1.013985276222229, "learning_rate": 1e-05, "loss": 0.451, "mean_token_accuracy": 0.8551113605499268, "num_tokens": 669843789.0, "step": 4204 }, { "epoch": 2.1388606307222786, "grad_norm": 0.9502270221710205, "learning_rate": 1e-05, "loss": 0.4476, "mean_token_accuracy": 0.8561847805976868, "num_tokens": 670005303.0, "step": 4205 }, { "epoch": 2.1393692777212614, "grad_norm": 0.9519503116607666, "learning_rate": 1e-05, "loss": 0.4327, "mean_token_accuracy": 0.8593729734420776, "num_tokens": 670161044.0, "step": 4206 }, { "epoch": 2.139877924720244, "grad_norm": 0.9718447923660278, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.8558679819107056, "num_tokens": 670314008.0, "step": 4207 }, { "epoch": 2.140386571719227, "grad_norm": 0.9260292053222656, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8572784066200256, "num_tokens": 670470095.0, "step": 4208 }, { "epoch": 2.1408952187182098, "grad_norm": 0.9734790921211243, "learning_rate": 1e-05, "loss": 0.4735, "mean_token_accuracy": 0.8485541343688965, "num_tokens": 670636949.0, "step": 4209 }, { "epoch": 2.141403865717192, "grad_norm": 0.8780142664909363, "learning_rate": 1e-05, "loss": 0.4157, "mean_token_accuracy": 0.864952564239502, "num_tokens": 670784785.0, "step": 4210 }, { "epoch": 2.141912512716175, "grad_norm": 1.1260484457015991, "learning_rate": 1e-05, "loss": 0.4157, "mean_token_accuracy": 0.864479660987854, "num_tokens": 670956463.0, "step": 4211 }, { "epoch": 2.1424211597151577, "grad_norm": 0.9373603463172913, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.8588770031929016, "num_tokens": 671102909.0, "step": 4212 }, { "epoch": 2.1429298067141405, "grad_norm": 0.8711733818054199, "learning_rate": 1e-05, "loss": 0.4303, "mean_token_accuracy": 0.8594882488250732, "num_tokens": 671264081.0, "step": 4213 }, { "epoch": 2.1434384537131232, "grad_norm": 1.0101747512817383, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8565623164176941, "num_tokens": 671421218.0, "step": 4214 }, { "epoch": 2.1439471007121056, "grad_norm": 0.9408650994300842, "learning_rate": 1e-05, "loss": 0.4549, "mean_token_accuracy": 0.8543228507041931, "num_tokens": 671578732.0, "step": 4215 }, { "epoch": 2.1444557477110884, "grad_norm": 1.341538429260254, "learning_rate": 1e-05, "loss": 0.433, "mean_token_accuracy": 0.8586952090263367, "num_tokens": 671755061.0, "step": 4216 }, { "epoch": 2.144964394710071, "grad_norm": 0.937558114528656, "learning_rate": 1e-05, "loss": 0.3851, "mean_token_accuracy": 0.8737555146217346, "num_tokens": 671921101.0, "step": 4217 }, { "epoch": 2.145473041709054, "grad_norm": 0.8825535178184509, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.858366072177887, "num_tokens": 672085682.0, "step": 4218 }, { "epoch": 2.1459816887080367, "grad_norm": 0.9849353432655334, "learning_rate": 1e-05, "loss": 0.457, "mean_token_accuracy": 0.851929783821106, "num_tokens": 672242998.0, "step": 4219 }, { "epoch": 2.1464903357070195, "grad_norm": 0.9155281186103821, "learning_rate": 1e-05, "loss": 0.4727, "mean_token_accuracy": 0.8466140031814575, "num_tokens": 672406808.0, "step": 4220 }, { "epoch": 2.146998982706002, "grad_norm": 0.9637844562530518, "learning_rate": 1e-05, "loss": 0.4826, "mean_token_accuracy": 0.8453380465507507, "num_tokens": 672582458.0, "step": 4221 }, { "epoch": 2.1475076297049847, "grad_norm": 0.9090501070022583, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.857498288154602, "num_tokens": 672742756.0, "step": 4222 }, { "epoch": 2.1480162767039674, "grad_norm": 0.9212506413459778, "learning_rate": 1e-05, "loss": 0.429, "mean_token_accuracy": 0.860740065574646, "num_tokens": 672909847.0, "step": 4223 }, { "epoch": 2.1485249237029502, "grad_norm": 0.8985199332237244, "learning_rate": 1e-05, "loss": 0.4101, "mean_token_accuracy": 0.8661127090454102, "num_tokens": 673076346.0, "step": 4224 }, { "epoch": 2.149033570701933, "grad_norm": 0.9333046078681946, "learning_rate": 1e-05, "loss": 0.4646, "mean_token_accuracy": 0.8508179187774658, "num_tokens": 673225384.0, "step": 4225 }, { "epoch": 2.1495422177009154, "grad_norm": 0.874630868434906, "learning_rate": 1e-05, "loss": 0.4412, "mean_token_accuracy": 0.8571752309799194, "num_tokens": 673390482.0, "step": 4226 }, { "epoch": 2.150050864699898, "grad_norm": 0.8874839544296265, "learning_rate": 1e-05, "loss": 0.4285, "mean_token_accuracy": 0.8597501516342163, "num_tokens": 673557143.0, "step": 4227 }, { "epoch": 2.150559511698881, "grad_norm": 0.9544134736061096, "learning_rate": 1e-05, "loss": 0.4667, "mean_token_accuracy": 0.8492194414138794, "num_tokens": 673724830.0, "step": 4228 }, { "epoch": 2.1510681586978637, "grad_norm": 0.9367156028747559, "learning_rate": 1e-05, "loss": 0.4566, "mean_token_accuracy": 0.8530341386795044, "num_tokens": 673885742.0, "step": 4229 }, { "epoch": 2.1515768056968465, "grad_norm": 0.9363099932670593, "learning_rate": 1e-05, "loss": 0.446, "mean_token_accuracy": 0.8563543558120728, "num_tokens": 674041915.0, "step": 4230 }, { "epoch": 2.1520854526958293, "grad_norm": 0.9003242254257202, "learning_rate": 1e-05, "loss": 0.427, "mean_token_accuracy": 0.8603519797325134, "num_tokens": 674203202.0, "step": 4231 }, { "epoch": 2.1525940996948116, "grad_norm": 0.8814538717269897, "learning_rate": 1e-05, "loss": 0.4837, "mean_token_accuracy": 0.8455694317817688, "num_tokens": 674376067.0, "step": 4232 }, { "epoch": 2.1531027466937944, "grad_norm": 0.9837362766265869, "learning_rate": 1e-05, "loss": 0.4417, "mean_token_accuracy": 0.8560826182365417, "num_tokens": 674530535.0, "step": 4233 }, { "epoch": 2.153611393692777, "grad_norm": 0.9501599073410034, "learning_rate": 1e-05, "loss": 0.4345, "mean_token_accuracy": 0.8594794273376465, "num_tokens": 674683835.0, "step": 4234 }, { "epoch": 2.15412004069176, "grad_norm": 0.8494070768356323, "learning_rate": 1e-05, "loss": 0.4346, "mean_token_accuracy": 0.8582461476325989, "num_tokens": 674852193.0, "step": 4235 }, { "epoch": 2.154628687690743, "grad_norm": 0.884700357913971, "learning_rate": 1e-05, "loss": 0.4657, "mean_token_accuracy": 0.8502305746078491, "num_tokens": 675021716.0, "step": 4236 }, { "epoch": 2.155137334689725, "grad_norm": 0.9532796144485474, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.8599141836166382, "num_tokens": 675172087.0, "step": 4237 }, { "epoch": 2.155645981688708, "grad_norm": 0.9468601942062378, "learning_rate": 1e-05, "loss": 0.4473, "mean_token_accuracy": 0.8549070954322815, "num_tokens": 675339134.0, "step": 4238 }, { "epoch": 2.1561546286876907, "grad_norm": 0.9046058058738708, "learning_rate": 1e-05, "loss": 0.4525, "mean_token_accuracy": 0.8540990352630615, "num_tokens": 675497933.0, "step": 4239 }, { "epoch": 2.1566632756866735, "grad_norm": 0.9532819986343384, "learning_rate": 1e-05, "loss": 0.4512, "mean_token_accuracy": 0.8545533418655396, "num_tokens": 675650088.0, "step": 4240 }, { "epoch": 2.1571719226856563, "grad_norm": 0.8591033220291138, "learning_rate": 1e-05, "loss": 0.4223, "mean_token_accuracy": 0.8633793592453003, "num_tokens": 675815367.0, "step": 4241 }, { "epoch": 2.157680569684639, "grad_norm": 0.9060652256011963, "learning_rate": 1e-05, "loss": 0.4362, "mean_token_accuracy": 0.8582218289375305, "num_tokens": 675976634.0, "step": 4242 }, { "epoch": 2.1581892166836214, "grad_norm": 0.9459686279296875, "learning_rate": 1e-05, "loss": 0.4633, "mean_token_accuracy": 0.8501343131065369, "num_tokens": 676134723.0, "step": 4243 }, { "epoch": 2.158697863682604, "grad_norm": 0.886417031288147, "learning_rate": 1e-05, "loss": 0.4586, "mean_token_accuracy": 0.851806104183197, "num_tokens": 676301382.0, "step": 4244 }, { "epoch": 2.159206510681587, "grad_norm": 0.8885103464126587, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8584810495376587, "num_tokens": 676457791.0, "step": 4245 }, { "epoch": 2.1597151576805698, "grad_norm": 0.9095398783683777, "learning_rate": 1e-05, "loss": 0.4475, "mean_token_accuracy": 0.8562595844268799, "num_tokens": 676630331.0, "step": 4246 }, { "epoch": 2.1602238046795526, "grad_norm": 0.8835845589637756, "learning_rate": 1e-05, "loss": 0.4616, "mean_token_accuracy": 0.8514906167984009, "num_tokens": 676806010.0, "step": 4247 }, { "epoch": 2.160732451678535, "grad_norm": 0.9202662110328674, "learning_rate": 1e-05, "loss": 0.4659, "mean_token_accuracy": 0.850483775138855, "num_tokens": 676971441.0, "step": 4248 }, { "epoch": 2.1612410986775177, "grad_norm": 1.0267709493637085, "learning_rate": 1e-05, "loss": 0.425, "mean_token_accuracy": 0.8617174625396729, "num_tokens": 677126808.0, "step": 4249 }, { "epoch": 2.1617497456765005, "grad_norm": 1.0010653734207153, "learning_rate": 1e-05, "loss": 0.4493, "mean_token_accuracy": 0.8573693037033081, "num_tokens": 677292539.0, "step": 4250 }, { "epoch": 2.1622583926754833, "grad_norm": 0.9092729091644287, "learning_rate": 1e-05, "loss": 0.4129, "mean_token_accuracy": 0.865876317024231, "num_tokens": 677442157.0, "step": 4251 }, { "epoch": 2.162767039674466, "grad_norm": 0.8797513246536255, "learning_rate": 1e-05, "loss": 0.4062, "mean_token_accuracy": 0.8681555986404419, "num_tokens": 677609509.0, "step": 4252 }, { "epoch": 2.163275686673449, "grad_norm": 0.9524892568588257, "learning_rate": 1e-05, "loss": 0.425, "mean_token_accuracy": 0.8616195917129517, "num_tokens": 677762481.0, "step": 4253 }, { "epoch": 2.163784333672431, "grad_norm": 0.9220227003097534, "learning_rate": 1e-05, "loss": 0.4607, "mean_token_accuracy": 0.851956307888031, "num_tokens": 677937681.0, "step": 4254 }, { "epoch": 2.164292980671414, "grad_norm": 0.947273850440979, "learning_rate": 1e-05, "loss": 0.4253, "mean_token_accuracy": 0.8622714877128601, "num_tokens": 678092076.0, "step": 4255 }, { "epoch": 2.1648016276703967, "grad_norm": 0.9841005206108093, "learning_rate": 1e-05, "loss": 0.3996, "mean_token_accuracy": 0.8677623271942139, "num_tokens": 678236943.0, "step": 4256 }, { "epoch": 2.1653102746693795, "grad_norm": 0.9958181977272034, "learning_rate": 1e-05, "loss": 0.4666, "mean_token_accuracy": 0.8516644239425659, "num_tokens": 678377775.0, "step": 4257 }, { "epoch": 2.1658189216683623, "grad_norm": 0.9725253582000732, "learning_rate": 1e-05, "loss": 0.4431, "mean_token_accuracy": 0.8572109937667847, "num_tokens": 678533114.0, "step": 4258 }, { "epoch": 2.1663275686673447, "grad_norm": 0.9555931091308594, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8605384230613708, "num_tokens": 678694904.0, "step": 4259 }, { "epoch": 2.1668362156663274, "grad_norm": 1.3989323377609253, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8648083209991455, "num_tokens": 678854315.0, "step": 4260 }, { "epoch": 2.1673448626653102, "grad_norm": 0.9446583390235901, "learning_rate": 1e-05, "loss": 0.4159, "mean_token_accuracy": 0.8639305830001831, "num_tokens": 679013629.0, "step": 4261 }, { "epoch": 2.167853509664293, "grad_norm": 0.8738034963607788, "learning_rate": 1e-05, "loss": 0.439, "mean_token_accuracy": 0.8571431636810303, "num_tokens": 679170763.0, "step": 4262 }, { "epoch": 2.168362156663276, "grad_norm": 1.0060641765594482, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.8516331911087036, "num_tokens": 679322506.0, "step": 4263 }, { "epoch": 2.1688708036622586, "grad_norm": 0.8976203203201294, "learning_rate": 1e-05, "loss": 0.4425, "mean_token_accuracy": 0.8569817543029785, "num_tokens": 679480751.0, "step": 4264 }, { "epoch": 2.169379450661241, "grad_norm": 0.8971258997917175, "learning_rate": 1e-05, "loss": 0.4574, "mean_token_accuracy": 0.8524659872055054, "num_tokens": 679639537.0, "step": 4265 }, { "epoch": 2.1698880976602237, "grad_norm": 0.925749659538269, "learning_rate": 1e-05, "loss": 0.4199, "mean_token_accuracy": 0.8633953332901001, "num_tokens": 679799542.0, "step": 4266 }, { "epoch": 2.1703967446592065, "grad_norm": 0.8783673048019409, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8596926331520081, "num_tokens": 679969541.0, "step": 4267 }, { "epoch": 2.1709053916581893, "grad_norm": 0.9182345271110535, "learning_rate": 1e-05, "loss": 0.4261, "mean_token_accuracy": 0.8597497940063477, "num_tokens": 680126987.0, "step": 4268 }, { "epoch": 2.171414038657172, "grad_norm": 0.8887823224067688, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8598739504814148, "num_tokens": 680294249.0, "step": 4269 }, { "epoch": 2.1719226856561544, "grad_norm": 0.8925853967666626, "learning_rate": 1e-05, "loss": 0.4499, "mean_token_accuracy": 0.8564271330833435, "num_tokens": 680445654.0, "step": 4270 }, { "epoch": 2.172431332655137, "grad_norm": 0.8978914022445679, "learning_rate": 1e-05, "loss": 0.4569, "mean_token_accuracy": 0.8519887924194336, "num_tokens": 680608877.0, "step": 4271 }, { "epoch": 2.17293997965412, "grad_norm": 0.8895770311355591, "learning_rate": 1e-05, "loss": 0.4516, "mean_token_accuracy": 0.855591356754303, "num_tokens": 680766149.0, "step": 4272 }, { "epoch": 2.173448626653103, "grad_norm": 0.8657394647598267, "learning_rate": 1e-05, "loss": 0.4091, "mean_token_accuracy": 0.8654162883758545, "num_tokens": 680924361.0, "step": 4273 }, { "epoch": 2.1739572736520856, "grad_norm": 0.9019215703010559, "learning_rate": 1e-05, "loss": 0.4399, "mean_token_accuracy": 0.8570451140403748, "num_tokens": 681081193.0, "step": 4274 }, { "epoch": 2.1744659206510684, "grad_norm": 0.8810397982597351, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.8547847867012024, "num_tokens": 681233678.0, "step": 4275 }, { "epoch": 2.1749745676500507, "grad_norm": 0.9701651930809021, "learning_rate": 1e-05, "loss": 0.454, "mean_token_accuracy": 0.8537286520004272, "num_tokens": 681389768.0, "step": 4276 }, { "epoch": 2.1754832146490335, "grad_norm": 0.9187273979187012, "learning_rate": 1e-05, "loss": 0.4531, "mean_token_accuracy": 0.8543602228164673, "num_tokens": 681558379.0, "step": 4277 }, { "epoch": 2.1759918616480163, "grad_norm": 0.8782539367675781, "learning_rate": 1e-05, "loss": 0.4065, "mean_token_accuracy": 0.8672250509262085, "num_tokens": 681716150.0, "step": 4278 }, { "epoch": 2.176500508646999, "grad_norm": 0.9115551114082336, "learning_rate": 1e-05, "loss": 0.481, "mean_token_accuracy": 0.8463177680969238, "num_tokens": 681881029.0, "step": 4279 }, { "epoch": 2.177009155645982, "grad_norm": 0.9686064124107361, "learning_rate": 1e-05, "loss": 0.4325, "mean_token_accuracy": 0.8595022559165955, "num_tokens": 682034982.0, "step": 4280 }, { "epoch": 2.177517802644964, "grad_norm": 0.9756225943565369, "learning_rate": 1e-05, "loss": 0.4371, "mean_token_accuracy": 0.8567383289337158, "num_tokens": 682196326.0, "step": 4281 }, { "epoch": 2.178026449643947, "grad_norm": 0.9038667678833008, "learning_rate": 1e-05, "loss": 0.4279, "mean_token_accuracy": 0.8613846302032471, "num_tokens": 682360761.0, "step": 4282 }, { "epoch": 2.1785350966429298, "grad_norm": 0.9914163947105408, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8609004020690918, "num_tokens": 682525520.0, "step": 4283 }, { "epoch": 2.1790437436419126, "grad_norm": 0.8739263415336609, "learning_rate": 1e-05, "loss": 0.4104, "mean_token_accuracy": 0.8661122918128967, "num_tokens": 682683740.0, "step": 4284 }, { "epoch": 2.1795523906408953, "grad_norm": 1.0040781497955322, "learning_rate": 1e-05, "loss": 0.4298, "mean_token_accuracy": 0.8597602248191833, "num_tokens": 682851431.0, "step": 4285 }, { "epoch": 2.180061037639878, "grad_norm": 1.0031808614730835, "learning_rate": 1e-05, "loss": 0.421, "mean_token_accuracy": 0.8615937232971191, "num_tokens": 682998758.0, "step": 4286 }, { "epoch": 2.1805696846388605, "grad_norm": 1.015446662902832, "learning_rate": 1e-05, "loss": 0.4384, "mean_token_accuracy": 0.8581506013870239, "num_tokens": 683156422.0, "step": 4287 }, { "epoch": 2.1810783316378433, "grad_norm": 1.040065050125122, "learning_rate": 1e-05, "loss": 0.4259, "mean_token_accuracy": 0.8622639179229736, "num_tokens": 683315540.0, "step": 4288 }, { "epoch": 2.181586978636826, "grad_norm": 0.889073371887207, "learning_rate": 1e-05, "loss": 0.4592, "mean_token_accuracy": 0.8521596193313599, "num_tokens": 683475268.0, "step": 4289 }, { "epoch": 2.182095625635809, "grad_norm": 1.0887638330459595, "learning_rate": 1e-05, "loss": 0.4273, "mean_token_accuracy": 0.8617396950721741, "num_tokens": 683639710.0, "step": 4290 }, { "epoch": 2.1826042726347916, "grad_norm": 0.8876301646232605, "learning_rate": 1e-05, "loss": 0.4458, "mean_token_accuracy": 0.8568639755249023, "num_tokens": 683807593.0, "step": 4291 }, { "epoch": 2.183112919633774, "grad_norm": 1.022648572921753, "learning_rate": 1e-05, "loss": 0.4809, "mean_token_accuracy": 0.8475189208984375, "num_tokens": 683962253.0, "step": 4292 }, { "epoch": 2.1836215666327567, "grad_norm": 0.9099440574645996, "learning_rate": 1e-05, "loss": 0.4019, "mean_token_accuracy": 0.8683190941810608, "num_tokens": 684134385.0, "step": 4293 }, { "epoch": 2.1841302136317395, "grad_norm": 0.9514361619949341, "learning_rate": 1e-05, "loss": 0.4632, "mean_token_accuracy": 0.8512458801269531, "num_tokens": 684276805.0, "step": 4294 }, { "epoch": 2.1846388606307223, "grad_norm": 1.0674186944961548, "learning_rate": 1e-05, "loss": 0.4138, "mean_token_accuracy": 0.8648924827575684, "num_tokens": 684434079.0, "step": 4295 }, { "epoch": 2.185147507629705, "grad_norm": 0.9300355315208435, "learning_rate": 1e-05, "loss": 0.4212, "mean_token_accuracy": 0.8619647026062012, "num_tokens": 684600612.0, "step": 4296 }, { "epoch": 2.185656154628688, "grad_norm": 0.9934197664260864, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8561198711395264, "num_tokens": 684769275.0, "step": 4297 }, { "epoch": 2.1861648016276702, "grad_norm": 1.076294183731079, "learning_rate": 1e-05, "loss": 0.4367, "mean_token_accuracy": 0.8571916222572327, "num_tokens": 684915465.0, "step": 4298 }, { "epoch": 2.186673448626653, "grad_norm": 0.9175110459327698, "learning_rate": 1e-05, "loss": 0.4641, "mean_token_accuracy": 0.851553738117218, "num_tokens": 685086328.0, "step": 4299 }, { "epoch": 2.187182095625636, "grad_norm": 1.1934930086135864, "learning_rate": 1e-05, "loss": 0.4367, "mean_token_accuracy": 0.859502911567688, "num_tokens": 685243003.0, "step": 4300 }, { "epoch": 2.1876907426246186, "grad_norm": 0.9275957345962524, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8554542064666748, "num_tokens": 685418099.0, "step": 4301 }, { "epoch": 2.1881993896236014, "grad_norm": 0.9224422574043274, "learning_rate": 1e-05, "loss": 0.4245, "mean_token_accuracy": 0.8605654239654541, "num_tokens": 685586437.0, "step": 4302 }, { "epoch": 2.1887080366225837, "grad_norm": 0.966139018535614, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8592056035995483, "num_tokens": 685737580.0, "step": 4303 }, { "epoch": 2.1892166836215665, "grad_norm": 0.9049349427223206, "learning_rate": 1e-05, "loss": 0.4198, "mean_token_accuracy": 0.8639073371887207, "num_tokens": 685883071.0, "step": 4304 }, { "epoch": 2.1897253306205493, "grad_norm": 0.9150710701942444, "learning_rate": 1e-05, "loss": 0.4383, "mean_token_accuracy": 0.8586542010307312, "num_tokens": 686047266.0, "step": 4305 }, { "epoch": 2.190233977619532, "grad_norm": 0.9289497137069702, "learning_rate": 1e-05, "loss": 0.4143, "mean_token_accuracy": 0.8645882606506348, "num_tokens": 686203095.0, "step": 4306 }, { "epoch": 2.190742624618515, "grad_norm": 0.9255957007408142, "learning_rate": 1e-05, "loss": 0.4079, "mean_token_accuracy": 0.8672335147857666, "num_tokens": 686358907.0, "step": 4307 }, { "epoch": 2.1912512716174977, "grad_norm": 0.874142587184906, "learning_rate": 1e-05, "loss": 0.4394, "mean_token_accuracy": 0.8571419715881348, "num_tokens": 686515166.0, "step": 4308 }, { "epoch": 2.19175991861648, "grad_norm": 0.9326626658439636, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8514823317527771, "num_tokens": 686681595.0, "step": 4309 }, { "epoch": 2.192268565615463, "grad_norm": 1.4150363206863403, "learning_rate": 1e-05, "loss": 0.4354, "mean_token_accuracy": 0.8597804307937622, "num_tokens": 686847911.0, "step": 4310 }, { "epoch": 2.1927772126144456, "grad_norm": 0.9543857574462891, "learning_rate": 1e-05, "loss": 0.4362, "mean_token_accuracy": 0.8587651252746582, "num_tokens": 686996951.0, "step": 4311 }, { "epoch": 2.1932858596134284, "grad_norm": 0.9119658470153809, "learning_rate": 1e-05, "loss": 0.4445, "mean_token_accuracy": 0.8558597564697266, "num_tokens": 687161028.0, "step": 4312 }, { "epoch": 2.193794506612411, "grad_norm": 0.8924853801727295, "learning_rate": 1e-05, "loss": 0.4274, "mean_token_accuracy": 0.8626085519790649, "num_tokens": 687314790.0, "step": 4313 }, { "epoch": 2.1943031536113935, "grad_norm": 0.9578880071640015, "learning_rate": 1e-05, "loss": 0.4478, "mean_token_accuracy": 0.8548815846443176, "num_tokens": 687462776.0, "step": 4314 }, { "epoch": 2.1948118006103763, "grad_norm": 0.8604456186294556, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.8595722913742065, "num_tokens": 687627884.0, "step": 4315 }, { "epoch": 2.195320447609359, "grad_norm": 0.9253394603729248, "learning_rate": 1e-05, "loss": 0.4244, "mean_token_accuracy": 0.8634549975395203, "num_tokens": 687785512.0, "step": 4316 }, { "epoch": 2.195829094608342, "grad_norm": 1.0204318761825562, "learning_rate": 1e-05, "loss": 0.4369, "mean_token_accuracy": 0.8581687211990356, "num_tokens": 687955390.0, "step": 4317 }, { "epoch": 2.1963377416073246, "grad_norm": 0.8677594661712646, "learning_rate": 1e-05, "loss": 0.4241, "mean_token_accuracy": 0.8619781136512756, "num_tokens": 688112149.0, "step": 4318 }, { "epoch": 2.196846388606307, "grad_norm": 0.87612384557724, "learning_rate": 1e-05, "loss": 0.4144, "mean_token_accuracy": 0.8636021018028259, "num_tokens": 688264263.0, "step": 4319 }, { "epoch": 2.1973550356052898, "grad_norm": 0.8514624834060669, "learning_rate": 1e-05, "loss": 0.42, "mean_token_accuracy": 0.8636860847473145, "num_tokens": 688432958.0, "step": 4320 }, { "epoch": 2.1978636826042726, "grad_norm": 0.9815389513969421, "learning_rate": 1e-05, "loss": 0.4768, "mean_token_accuracy": 0.8477411270141602, "num_tokens": 688593908.0, "step": 4321 }, { "epoch": 2.1983723296032553, "grad_norm": 0.9428776502609253, "learning_rate": 1e-05, "loss": 0.4225, "mean_token_accuracy": 0.8623034954071045, "num_tokens": 688745272.0, "step": 4322 }, { "epoch": 2.198880976602238, "grad_norm": 0.9332963228225708, "learning_rate": 1e-05, "loss": 0.4616, "mean_token_accuracy": 0.8518528938293457, "num_tokens": 688903056.0, "step": 4323 }, { "epoch": 2.199389623601221, "grad_norm": 0.917495846748352, "learning_rate": 1e-05, "loss": 0.4289, "mean_token_accuracy": 0.8599016666412354, "num_tokens": 689058958.0, "step": 4324 }, { "epoch": 2.1998982706002033, "grad_norm": 0.939020037651062, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8560481071472168, "num_tokens": 689234401.0, "step": 4325 }, { "epoch": 2.200406917599186, "grad_norm": 0.9460886120796204, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8559812307357788, "num_tokens": 689387117.0, "step": 4326 }, { "epoch": 2.200915564598169, "grad_norm": 0.9061490893363953, "learning_rate": 1e-05, "loss": 0.4375, "mean_token_accuracy": 0.8584349155426025, "num_tokens": 689537247.0, "step": 4327 }, { "epoch": 2.2014242115971516, "grad_norm": 1.0042805671691895, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8566774725914001, "num_tokens": 689703523.0, "step": 4328 }, { "epoch": 2.2019328585961344, "grad_norm": 0.9766663312911987, "learning_rate": 1e-05, "loss": 0.463, "mean_token_accuracy": 0.849650502204895, "num_tokens": 689854306.0, "step": 4329 }, { "epoch": 2.202441505595117, "grad_norm": 0.9077035784721375, "learning_rate": 1e-05, "loss": 0.452, "mean_token_accuracy": 0.8559194803237915, "num_tokens": 690011320.0, "step": 4330 }, { "epoch": 2.2029501525940995, "grad_norm": 1.0141128301620483, "learning_rate": 1e-05, "loss": 0.4295, "mean_token_accuracy": 0.8605471849441528, "num_tokens": 690159881.0, "step": 4331 }, { "epoch": 2.2034587995930823, "grad_norm": 0.9698449969291687, "learning_rate": 1e-05, "loss": 0.4123, "mean_token_accuracy": 0.8654199838638306, "num_tokens": 690312474.0, "step": 4332 }, { "epoch": 2.203967446592065, "grad_norm": 0.950181245803833, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8600283861160278, "num_tokens": 690471811.0, "step": 4333 }, { "epoch": 2.204476093591048, "grad_norm": 1.8057748079299927, "learning_rate": 1e-05, "loss": 0.4181, "mean_token_accuracy": 0.8640917539596558, "num_tokens": 690629913.0, "step": 4334 }, { "epoch": 2.2049847405900307, "grad_norm": 0.9295744895935059, "learning_rate": 1e-05, "loss": 0.411, "mean_token_accuracy": 0.8665107488632202, "num_tokens": 690791021.0, "step": 4335 }, { "epoch": 2.205493387589013, "grad_norm": 0.9357677698135376, "learning_rate": 1e-05, "loss": 0.4188, "mean_token_accuracy": 0.8637399077415466, "num_tokens": 690946910.0, "step": 4336 }, { "epoch": 2.206002034587996, "grad_norm": 0.9426012635231018, "learning_rate": 1e-05, "loss": 0.456, "mean_token_accuracy": 0.8539572954177856, "num_tokens": 691114027.0, "step": 4337 }, { "epoch": 2.2065106815869786, "grad_norm": 0.8814846873283386, "learning_rate": 1e-05, "loss": 0.4558, "mean_token_accuracy": 0.8528778553009033, "num_tokens": 691287105.0, "step": 4338 }, { "epoch": 2.2070193285859614, "grad_norm": 0.9679071307182312, "learning_rate": 1e-05, "loss": 0.4367, "mean_token_accuracy": 0.8572427034378052, "num_tokens": 691433141.0, "step": 4339 }, { "epoch": 2.207527975584944, "grad_norm": 0.9318785667419434, "learning_rate": 1e-05, "loss": 0.4304, "mean_token_accuracy": 0.8598240613937378, "num_tokens": 691590813.0, "step": 4340 }, { "epoch": 2.2080366225839265, "grad_norm": 0.9134104251861572, "learning_rate": 1e-05, "loss": 0.4523, "mean_token_accuracy": 0.8552306890487671, "num_tokens": 691753280.0, "step": 4341 }, { "epoch": 2.2085452695829093, "grad_norm": 0.9650377631187439, "learning_rate": 1e-05, "loss": 0.4263, "mean_token_accuracy": 0.8613597750663757, "num_tokens": 691912782.0, "step": 4342 }, { "epoch": 2.209053916581892, "grad_norm": 0.8906856775283813, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8498727083206177, "num_tokens": 692077739.0, "step": 4343 }, { "epoch": 2.209562563580875, "grad_norm": 0.9788679480552673, "learning_rate": 1e-05, "loss": 0.4191, "mean_token_accuracy": 0.8626728057861328, "num_tokens": 692242294.0, "step": 4344 }, { "epoch": 2.2100712105798577, "grad_norm": 0.9652032256126404, "learning_rate": 1e-05, "loss": 0.4521, "mean_token_accuracy": 0.8537179231643677, "num_tokens": 692397707.0, "step": 4345 }, { "epoch": 2.2105798575788405, "grad_norm": 0.9755709767341614, "learning_rate": 1e-05, "loss": 0.4716, "mean_token_accuracy": 0.8492090702056885, "num_tokens": 692562541.0, "step": 4346 }, { "epoch": 2.211088504577823, "grad_norm": 0.971756637096405, "learning_rate": 1e-05, "loss": 0.4406, "mean_token_accuracy": 0.8570781350135803, "num_tokens": 692709163.0, "step": 4347 }, { "epoch": 2.2115971515768056, "grad_norm": 0.9199160933494568, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8610886335372925, "num_tokens": 692863692.0, "step": 4348 }, { "epoch": 2.2121057985757884, "grad_norm": 0.9759620428085327, "learning_rate": 1e-05, "loss": 0.4133, "mean_token_accuracy": 0.8647420406341553, "num_tokens": 693018522.0, "step": 4349 }, { "epoch": 2.212614445574771, "grad_norm": 0.872011125087738, "learning_rate": 1e-05, "loss": 0.4495, "mean_token_accuracy": 0.8554370403289795, "num_tokens": 693172309.0, "step": 4350 }, { "epoch": 2.213123092573754, "grad_norm": 0.9128387570381165, "learning_rate": 1e-05, "loss": 0.4629, "mean_token_accuracy": 0.8515660762786865, "num_tokens": 693334327.0, "step": 4351 }, { "epoch": 2.2136317395727367, "grad_norm": 0.946731686592102, "learning_rate": 1e-05, "loss": 0.4352, "mean_token_accuracy": 0.8582457304000854, "num_tokens": 693494304.0, "step": 4352 }, { "epoch": 2.214140386571719, "grad_norm": 0.9041470289230347, "learning_rate": 1e-05, "loss": 0.4477, "mean_token_accuracy": 0.8558679819107056, "num_tokens": 693664029.0, "step": 4353 }, { "epoch": 2.214649033570702, "grad_norm": 1.008070945739746, "learning_rate": 1e-05, "loss": 0.42, "mean_token_accuracy": 0.8630983829498291, "num_tokens": 693827204.0, "step": 4354 }, { "epoch": 2.2151576805696847, "grad_norm": 0.8689998984336853, "learning_rate": 1e-05, "loss": 0.4728, "mean_token_accuracy": 0.848616361618042, "num_tokens": 694006803.0, "step": 4355 }, { "epoch": 2.2156663275686674, "grad_norm": 0.9089202880859375, "learning_rate": 1e-05, "loss": 0.4485, "mean_token_accuracy": 0.8560207486152649, "num_tokens": 694172486.0, "step": 4356 }, { "epoch": 2.2161749745676502, "grad_norm": 0.9482621550559998, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.8523670434951782, "num_tokens": 694346204.0, "step": 4357 }, { "epoch": 2.2166836215666326, "grad_norm": 1.0576685667037964, "learning_rate": 1e-05, "loss": 0.4839, "mean_token_accuracy": 0.8448648452758789, "num_tokens": 694504560.0, "step": 4358 }, { "epoch": 2.2171922685656154, "grad_norm": 0.9813886880874634, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8526740074157715, "num_tokens": 694652226.0, "step": 4359 }, { "epoch": 2.217700915564598, "grad_norm": 0.9225329756736755, "learning_rate": 1e-05, "loss": 0.4628, "mean_token_accuracy": 0.849311351776123, "num_tokens": 694817430.0, "step": 4360 }, { "epoch": 2.218209562563581, "grad_norm": 0.9040138721466064, "learning_rate": 1e-05, "loss": 0.4274, "mean_token_accuracy": 0.8618972301483154, "num_tokens": 694970575.0, "step": 4361 }, { "epoch": 2.2187182095625637, "grad_norm": 1.0440372228622437, "learning_rate": 1e-05, "loss": 0.4345, "mean_token_accuracy": 0.8597314953804016, "num_tokens": 695139610.0, "step": 4362 }, { "epoch": 2.219226856561546, "grad_norm": 0.9187480807304382, "learning_rate": 1e-05, "loss": 0.4307, "mean_token_accuracy": 0.8600801229476929, "num_tokens": 695290776.0, "step": 4363 }, { "epoch": 2.219735503560529, "grad_norm": 0.9653752446174622, "learning_rate": 1e-05, "loss": 0.4271, "mean_token_accuracy": 0.8609426021575928, "num_tokens": 695455043.0, "step": 4364 }, { "epoch": 2.2202441505595116, "grad_norm": 0.843532919883728, "learning_rate": 1e-05, "loss": 0.4309, "mean_token_accuracy": 0.8611189126968384, "num_tokens": 695627666.0, "step": 4365 }, { "epoch": 2.2207527975584944, "grad_norm": 0.9980239272117615, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8516084551811218, "num_tokens": 695788464.0, "step": 4366 }, { "epoch": 2.221261444557477, "grad_norm": 0.9190233945846558, "learning_rate": 1e-05, "loss": 0.4705, "mean_token_accuracy": 0.849383533000946, "num_tokens": 695943810.0, "step": 4367 }, { "epoch": 2.22177009155646, "grad_norm": 0.9599975943565369, "learning_rate": 1e-05, "loss": 0.4153, "mean_token_accuracy": 0.8644353151321411, "num_tokens": 696110054.0, "step": 4368 }, { "epoch": 2.2222787385554423, "grad_norm": 0.970862627029419, "learning_rate": 1e-05, "loss": 0.4151, "mean_token_accuracy": 0.8649633526802063, "num_tokens": 696264565.0, "step": 4369 }, { "epoch": 2.222787385554425, "grad_norm": 0.9327432513237, "learning_rate": 1e-05, "loss": 0.4454, "mean_token_accuracy": 0.8549506068229675, "num_tokens": 696419384.0, "step": 4370 }, { "epoch": 2.223296032553408, "grad_norm": 0.9206057190895081, "learning_rate": 1e-05, "loss": 0.4202, "mean_token_accuracy": 0.8625045418739319, "num_tokens": 696584696.0, "step": 4371 }, { "epoch": 2.2238046795523907, "grad_norm": 0.8772811889648438, "learning_rate": 1e-05, "loss": 0.4287, "mean_token_accuracy": 0.8601574301719666, "num_tokens": 696754841.0, "step": 4372 }, { "epoch": 2.2243133265513735, "grad_norm": 0.9989537596702576, "learning_rate": 1e-05, "loss": 0.445, "mean_token_accuracy": 0.8562902808189392, "num_tokens": 696901513.0, "step": 4373 }, { "epoch": 2.2248219735503563, "grad_norm": 0.8996737599372864, "learning_rate": 1e-05, "loss": 0.4527, "mean_token_accuracy": 0.8534572124481201, "num_tokens": 697066242.0, "step": 4374 }, { "epoch": 2.2253306205493386, "grad_norm": 0.9436988234519958, "learning_rate": 1e-05, "loss": 0.4468, "mean_token_accuracy": 0.8555812835693359, "num_tokens": 697217547.0, "step": 4375 }, { "epoch": 2.2258392675483214, "grad_norm": 0.859586775302887, "learning_rate": 1e-05, "loss": 0.4321, "mean_token_accuracy": 0.8602193593978882, "num_tokens": 697384006.0, "step": 4376 }, { "epoch": 2.226347914547304, "grad_norm": 1.0298329591751099, "learning_rate": 1e-05, "loss": 0.4831, "mean_token_accuracy": 0.8449706435203552, "num_tokens": 697534624.0, "step": 4377 }, { "epoch": 2.226856561546287, "grad_norm": 0.973120927810669, "learning_rate": 1e-05, "loss": 0.4476, "mean_token_accuracy": 0.8544663786888123, "num_tokens": 697695648.0, "step": 4378 }, { "epoch": 2.2273652085452698, "grad_norm": 0.8641661405563354, "learning_rate": 1e-05, "loss": 0.4363, "mean_token_accuracy": 0.8581136465072632, "num_tokens": 697856787.0, "step": 4379 }, { "epoch": 2.227873855544252, "grad_norm": 0.9121147394180298, "learning_rate": 1e-05, "loss": 0.4387, "mean_token_accuracy": 0.8577170968055725, "num_tokens": 698019018.0, "step": 4380 }, { "epoch": 2.228382502543235, "grad_norm": 0.9240990281105042, "learning_rate": 1e-05, "loss": 0.4384, "mean_token_accuracy": 0.8579505681991577, "num_tokens": 698180128.0, "step": 4381 }, { "epoch": 2.2288911495422177, "grad_norm": 0.8957146406173706, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8546497821807861, "num_tokens": 698339787.0, "step": 4382 }, { "epoch": 2.2293997965412005, "grad_norm": 0.9592480063438416, "learning_rate": 1e-05, "loss": 0.4234, "mean_token_accuracy": 0.8617354035377502, "num_tokens": 698490297.0, "step": 4383 }, { "epoch": 2.2299084435401832, "grad_norm": 0.9136049747467041, "learning_rate": 1e-05, "loss": 0.449, "mean_token_accuracy": 0.8556954264640808, "num_tokens": 698644406.0, "step": 4384 }, { "epoch": 2.2304170905391656, "grad_norm": 0.9772505760192871, "learning_rate": 1e-05, "loss": 0.4451, "mean_token_accuracy": 0.8578481674194336, "num_tokens": 698794461.0, "step": 4385 }, { "epoch": 2.2309257375381484, "grad_norm": 0.8731710910797119, "learning_rate": 1e-05, "loss": 0.4063, "mean_token_accuracy": 0.8656190037727356, "num_tokens": 698949448.0, "step": 4386 }, { "epoch": 2.231434384537131, "grad_norm": 0.926904559135437, "learning_rate": 1e-05, "loss": 0.4207, "mean_token_accuracy": 0.8636926412582397, "num_tokens": 699113160.0, "step": 4387 }, { "epoch": 2.231943031536114, "grad_norm": 0.888241708278656, "learning_rate": 1e-05, "loss": 0.4762, "mean_token_accuracy": 0.847213864326477, "num_tokens": 699276343.0, "step": 4388 }, { "epoch": 2.2324516785350967, "grad_norm": 0.9436233639717102, "learning_rate": 1e-05, "loss": 0.4292, "mean_token_accuracy": 0.8591353893280029, "num_tokens": 699436640.0, "step": 4389 }, { "epoch": 2.2329603255340795, "grad_norm": 0.8924297094345093, "learning_rate": 1e-05, "loss": 0.4631, "mean_token_accuracy": 0.8514504432678223, "num_tokens": 699600680.0, "step": 4390 }, { "epoch": 2.233468972533062, "grad_norm": 0.9052316546440125, "learning_rate": 1e-05, "loss": 0.4125, "mean_token_accuracy": 0.8639260530471802, "num_tokens": 699758447.0, "step": 4391 }, { "epoch": 2.2339776195320447, "grad_norm": 0.8749424815177917, "learning_rate": 1e-05, "loss": 0.435, "mean_token_accuracy": 0.8595032691955566, "num_tokens": 699920412.0, "step": 4392 }, { "epoch": 2.2344862665310274, "grad_norm": 0.925260603427887, "learning_rate": 1e-05, "loss": 0.44, "mean_token_accuracy": 0.8578627109527588, "num_tokens": 700075114.0, "step": 4393 }, { "epoch": 2.2349949135300102, "grad_norm": 0.8851991891860962, "learning_rate": 1e-05, "loss": 0.4436, "mean_token_accuracy": 0.8560046553611755, "num_tokens": 700246268.0, "step": 4394 }, { "epoch": 2.235503560528993, "grad_norm": 0.8956559896469116, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8600329160690308, "num_tokens": 700400215.0, "step": 4395 }, { "epoch": 2.236012207527976, "grad_norm": 0.8950102925300598, "learning_rate": 1e-05, "loss": 0.4378, "mean_token_accuracy": 0.858242392539978, "num_tokens": 700561439.0, "step": 4396 }, { "epoch": 2.236520854526958, "grad_norm": 0.8950687646865845, "learning_rate": 1e-05, "loss": 0.4296, "mean_token_accuracy": 0.8612974882125854, "num_tokens": 700726538.0, "step": 4397 }, { "epoch": 2.237029501525941, "grad_norm": 0.9209567904472351, "learning_rate": 1e-05, "loss": 0.4342, "mean_token_accuracy": 0.8593178987503052, "num_tokens": 700904533.0, "step": 4398 }, { "epoch": 2.2375381485249237, "grad_norm": 0.9276517629623413, "learning_rate": 1e-05, "loss": 0.4568, "mean_token_accuracy": 0.8548173904418945, "num_tokens": 701063561.0, "step": 4399 }, { "epoch": 2.2380467955239065, "grad_norm": 0.8883805871009827, "learning_rate": 1e-05, "loss": 0.459, "mean_token_accuracy": 0.8530396223068237, "num_tokens": 701220420.0, "step": 4400 }, { "epoch": 2.2385554425228893, "grad_norm": 0.9363575577735901, "learning_rate": 1e-05, "loss": 0.4423, "mean_token_accuracy": 0.8569149971008301, "num_tokens": 701384057.0, "step": 4401 }, { "epoch": 2.2390640895218716, "grad_norm": 0.904596745967865, "learning_rate": 1e-05, "loss": 0.4252, "mean_token_accuracy": 0.8628685474395752, "num_tokens": 701544168.0, "step": 4402 }, { "epoch": 2.2395727365208544, "grad_norm": 0.9615168571472168, "learning_rate": 1e-05, "loss": 0.4427, "mean_token_accuracy": 0.8559268712997437, "num_tokens": 701710466.0, "step": 4403 }, { "epoch": 2.240081383519837, "grad_norm": 0.9265811443328857, "learning_rate": 1e-05, "loss": 0.4579, "mean_token_accuracy": 0.8512325882911682, "num_tokens": 701866354.0, "step": 4404 }, { "epoch": 2.24059003051882, "grad_norm": 0.8753740787506104, "learning_rate": 1e-05, "loss": 0.4098, "mean_token_accuracy": 0.8662247657775879, "num_tokens": 702021535.0, "step": 4405 }, { "epoch": 2.241098677517803, "grad_norm": 0.9615698456764221, "learning_rate": 1e-05, "loss": 0.4578, "mean_token_accuracy": 0.8537775278091431, "num_tokens": 702180586.0, "step": 4406 }, { "epoch": 2.241607324516785, "grad_norm": 0.9781903624534607, "learning_rate": 1e-05, "loss": 0.4257, "mean_token_accuracy": 0.860724925994873, "num_tokens": 702337600.0, "step": 4407 }, { "epoch": 2.242115971515768, "grad_norm": 0.8906936049461365, "learning_rate": 1e-05, "loss": 0.4554, "mean_token_accuracy": 0.8546968102455139, "num_tokens": 702507782.0, "step": 4408 }, { "epoch": 2.2426246185147507, "grad_norm": 0.9217578172683716, "learning_rate": 1e-05, "loss": 0.4464, "mean_token_accuracy": 0.8542501926422119, "num_tokens": 702673172.0, "step": 4409 }, { "epoch": 2.2431332655137335, "grad_norm": 0.9569871425628662, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8487017154693604, "num_tokens": 702834861.0, "step": 4410 }, { "epoch": 2.2436419125127163, "grad_norm": 0.8756091594696045, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8541702628135681, "num_tokens": 703001374.0, "step": 4411 }, { "epoch": 2.244150559511699, "grad_norm": 1.146317481994629, "learning_rate": 1e-05, "loss": 0.4394, "mean_token_accuracy": 0.8578425645828247, "num_tokens": 703173572.0, "step": 4412 }, { "epoch": 2.2446592065106814, "grad_norm": 0.9265860319137573, "learning_rate": 1e-05, "loss": 0.4521, "mean_token_accuracy": 0.8535090088844299, "num_tokens": 703335847.0, "step": 4413 }, { "epoch": 2.245167853509664, "grad_norm": 0.8638989925384521, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8653352856636047, "num_tokens": 703498507.0, "step": 4414 }, { "epoch": 2.245676500508647, "grad_norm": 0.8545963168144226, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8569827079772949, "num_tokens": 703665569.0, "step": 4415 }, { "epoch": 2.2461851475076298, "grad_norm": 0.9048717617988586, "learning_rate": 1e-05, "loss": 0.4296, "mean_token_accuracy": 0.8589744567871094, "num_tokens": 703823540.0, "step": 4416 }, { "epoch": 2.2466937945066126, "grad_norm": 0.9196427464485168, "learning_rate": 1e-05, "loss": 0.452, "mean_token_accuracy": 0.8551338911056519, "num_tokens": 703996737.0, "step": 4417 }, { "epoch": 2.2472024415055953, "grad_norm": 0.9343538880348206, "learning_rate": 1e-05, "loss": 0.4369, "mean_token_accuracy": 0.8585703372955322, "num_tokens": 704156689.0, "step": 4418 }, { "epoch": 2.2477110885045777, "grad_norm": 1.2548093795776367, "learning_rate": 1e-05, "loss": 0.4295, "mean_token_accuracy": 0.8610302209854126, "num_tokens": 704312861.0, "step": 4419 }, { "epoch": 2.2482197355035605, "grad_norm": 0.927617073059082, "learning_rate": 1e-05, "loss": 0.4424, "mean_token_accuracy": 0.8566126823425293, "num_tokens": 704467781.0, "step": 4420 }, { "epoch": 2.2487283825025433, "grad_norm": 0.8665664196014404, "learning_rate": 1e-05, "loss": 0.4541, "mean_token_accuracy": 0.8544788360595703, "num_tokens": 704643910.0, "step": 4421 }, { "epoch": 2.249237029501526, "grad_norm": 0.8504548668861389, "learning_rate": 1e-05, "loss": 0.4283, "mean_token_accuracy": 0.8599859476089478, "num_tokens": 704805338.0, "step": 4422 }, { "epoch": 2.249745676500509, "grad_norm": 0.8804091215133667, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.8539838194847107, "num_tokens": 704969847.0, "step": 4423 }, { "epoch": 2.250254323499491, "grad_norm": 0.8922030925750732, "learning_rate": 1e-05, "loss": 0.4409, "mean_token_accuracy": 0.8573827743530273, "num_tokens": 705135627.0, "step": 4424 }, { "epoch": 2.250762970498474, "grad_norm": 0.9630568623542786, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8535498380661011, "num_tokens": 705287010.0, "step": 4425 }, { "epoch": 2.2512716174974567, "grad_norm": 0.888202428817749, "learning_rate": 1e-05, "loss": 0.4497, "mean_token_accuracy": 0.8549824357032776, "num_tokens": 705443703.0, "step": 4426 }, { "epoch": 2.2517802644964395, "grad_norm": 0.9108331799507141, "learning_rate": 1e-05, "loss": 0.4264, "mean_token_accuracy": 0.8600156903266907, "num_tokens": 705610804.0, "step": 4427 }, { "epoch": 2.2522889114954223, "grad_norm": 0.9033510684967041, "learning_rate": 1e-05, "loss": 0.4293, "mean_token_accuracy": 0.8614766597747803, "num_tokens": 705767052.0, "step": 4428 }, { "epoch": 2.2527975584944047, "grad_norm": 0.8618059158325195, "learning_rate": 1e-05, "loss": 0.4284, "mean_token_accuracy": 0.8605534434318542, "num_tokens": 705941182.0, "step": 4429 }, { "epoch": 2.2533062054933874, "grad_norm": 0.89496248960495, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8533060550689697, "num_tokens": 706101095.0, "step": 4430 }, { "epoch": 2.2538148524923702, "grad_norm": 0.8547977805137634, "learning_rate": 1e-05, "loss": 0.4347, "mean_token_accuracy": 0.8595601320266724, "num_tokens": 706262686.0, "step": 4431 }, { "epoch": 2.254323499491353, "grad_norm": 0.8869041204452515, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8575032949447632, "num_tokens": 706423567.0, "step": 4432 }, { "epoch": 2.254832146490336, "grad_norm": 0.9196506142616272, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8587489724159241, "num_tokens": 706581619.0, "step": 4433 }, { "epoch": 2.2553407934893186, "grad_norm": 0.8818204402923584, "learning_rate": 1e-05, "loss": 0.4347, "mean_token_accuracy": 0.8590331077575684, "num_tokens": 706747002.0, "step": 4434 }, { "epoch": 2.255849440488301, "grad_norm": 0.8719470500946045, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8537620306015015, "num_tokens": 706919479.0, "step": 4435 }, { "epoch": 2.2563580874872837, "grad_norm": 0.9123954176902771, "learning_rate": 1e-05, "loss": 0.4182, "mean_token_accuracy": 0.8630255460739136, "num_tokens": 707083769.0, "step": 4436 }, { "epoch": 2.2568667344862665, "grad_norm": 0.8835598826408386, "learning_rate": 1e-05, "loss": 0.4275, "mean_token_accuracy": 0.8604206442832947, "num_tokens": 707240251.0, "step": 4437 }, { "epoch": 2.2573753814852493, "grad_norm": 0.8867762684822083, "learning_rate": 1e-05, "loss": 0.4362, "mean_token_accuracy": 0.859366774559021, "num_tokens": 707407475.0, "step": 4438 }, { "epoch": 2.257884028484232, "grad_norm": 0.941148042678833, "learning_rate": 1e-05, "loss": 0.4276, "mean_token_accuracy": 0.8595365285873413, "num_tokens": 707565685.0, "step": 4439 }, { "epoch": 2.258392675483215, "grad_norm": 0.8768989443778992, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8487832546234131, "num_tokens": 707731659.0, "step": 4440 }, { "epoch": 2.258901322482197, "grad_norm": 0.9788554906845093, "learning_rate": 1e-05, "loss": 0.421, "mean_token_accuracy": 0.8615604639053345, "num_tokens": 707888682.0, "step": 4441 }, { "epoch": 2.25940996948118, "grad_norm": 0.8423621654510498, "learning_rate": 1e-05, "loss": 0.4269, "mean_token_accuracy": 0.8617719411849976, "num_tokens": 708049516.0, "step": 4442 }, { "epoch": 2.259918616480163, "grad_norm": 0.9760943651199341, "learning_rate": 1e-05, "loss": 0.4498, "mean_token_accuracy": 0.857210636138916, "num_tokens": 708204574.0, "step": 4443 }, { "epoch": 2.2604272634791456, "grad_norm": 0.9054985642433167, "learning_rate": 1e-05, "loss": 0.4141, "mean_token_accuracy": 0.8644254207611084, "num_tokens": 708365608.0, "step": 4444 }, { "epoch": 2.2609359104781284, "grad_norm": 0.8849431276321411, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.858661413192749, "num_tokens": 708519060.0, "step": 4445 }, { "epoch": 2.2614445574771107, "grad_norm": 0.948762059211731, "learning_rate": 1e-05, "loss": 0.4068, "mean_token_accuracy": 0.8667903542518616, "num_tokens": 708668919.0, "step": 4446 }, { "epoch": 2.2619532044760935, "grad_norm": 0.9438794255256653, "learning_rate": 1e-05, "loss": 0.4186, "mean_token_accuracy": 0.8631476759910583, "num_tokens": 708825717.0, "step": 4447 }, { "epoch": 2.2624618514750763, "grad_norm": 0.9877505302429199, "learning_rate": 1e-05, "loss": 0.384, "mean_token_accuracy": 0.8736969828605652, "num_tokens": 708978992.0, "step": 4448 }, { "epoch": 2.262970498474059, "grad_norm": 0.9099939465522766, "learning_rate": 1e-05, "loss": 0.4437, "mean_token_accuracy": 0.857690691947937, "num_tokens": 709146131.0, "step": 4449 }, { "epoch": 2.263479145473042, "grad_norm": 1.0143423080444336, "learning_rate": 1e-05, "loss": 0.3974, "mean_token_accuracy": 0.8684823513031006, "num_tokens": 709289990.0, "step": 4450 }, { "epoch": 2.263987792472024, "grad_norm": 0.9650892615318298, "learning_rate": 1e-05, "loss": 0.4684, "mean_token_accuracy": 0.8513463735580444, "num_tokens": 709454058.0, "step": 4451 }, { "epoch": 2.264496439471007, "grad_norm": 0.9049556851387024, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8524051904678345, "num_tokens": 709620493.0, "step": 4452 }, { "epoch": 2.2650050864699898, "grad_norm": 0.9698246121406555, "learning_rate": 1e-05, "loss": 0.4363, "mean_token_accuracy": 0.8589226007461548, "num_tokens": 709793625.0, "step": 4453 }, { "epoch": 2.2655137334689726, "grad_norm": 0.9414761066436768, "learning_rate": 1e-05, "loss": 0.4402, "mean_token_accuracy": 0.8582148551940918, "num_tokens": 709945354.0, "step": 4454 }, { "epoch": 2.2660223804679553, "grad_norm": 1.0288310050964355, "learning_rate": 1e-05, "loss": 0.4179, "mean_token_accuracy": 0.8637502193450928, "num_tokens": 710109134.0, "step": 4455 }, { "epoch": 2.266531027466938, "grad_norm": 0.9398514032363892, "learning_rate": 1e-05, "loss": 0.4224, "mean_token_accuracy": 0.8614974021911621, "num_tokens": 710261263.0, "step": 4456 }, { "epoch": 2.2670396744659205, "grad_norm": 1.0151559114456177, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8599753975868225, "num_tokens": 710421327.0, "step": 4457 }, { "epoch": 2.2675483214649033, "grad_norm": 0.9492122530937195, "learning_rate": 1e-05, "loss": 0.3898, "mean_token_accuracy": 0.8706491589546204, "num_tokens": 710579962.0, "step": 4458 }, { "epoch": 2.268056968463886, "grad_norm": 0.9367904663085938, "learning_rate": 1e-05, "loss": 0.4307, "mean_token_accuracy": 0.859420657157898, "num_tokens": 710740178.0, "step": 4459 }, { "epoch": 2.268565615462869, "grad_norm": 0.9847875237464905, "learning_rate": 1e-05, "loss": 0.4442, "mean_token_accuracy": 0.8568622469902039, "num_tokens": 710918771.0, "step": 4460 }, { "epoch": 2.2690742624618516, "grad_norm": 0.8942195773124695, "learning_rate": 1e-05, "loss": 0.4174, "mean_token_accuracy": 0.8642637133598328, "num_tokens": 711075031.0, "step": 4461 }, { "epoch": 2.2695829094608344, "grad_norm": 1.0235880613327026, "learning_rate": 1e-05, "loss": 0.4537, "mean_token_accuracy": 0.8542577028274536, "num_tokens": 711248181.0, "step": 4462 }, { "epoch": 2.2700915564598168, "grad_norm": 0.9366216063499451, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8548239469528198, "num_tokens": 711404363.0, "step": 4463 }, { "epoch": 2.2706002034587995, "grad_norm": 0.9664139151573181, "learning_rate": 1e-05, "loss": 0.4234, "mean_token_accuracy": 0.8624817728996277, "num_tokens": 711562515.0, "step": 4464 }, { "epoch": 2.2711088504577823, "grad_norm": 0.9802444577217102, "learning_rate": 1e-05, "loss": 0.4304, "mean_token_accuracy": 0.8602179884910583, "num_tokens": 711712262.0, "step": 4465 }, { "epoch": 2.271617497456765, "grad_norm": 0.8687217831611633, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8610938787460327, "num_tokens": 711864605.0, "step": 4466 }, { "epoch": 2.272126144455748, "grad_norm": 1.1002277135849, "learning_rate": 1e-05, "loss": 0.461, "mean_token_accuracy": 0.8507526516914368, "num_tokens": 712030169.0, "step": 4467 }, { "epoch": 2.2726347914547302, "grad_norm": 0.965670645236969, "learning_rate": 1e-05, "loss": 0.4781, "mean_token_accuracy": 0.8478188514709473, "num_tokens": 712202336.0, "step": 4468 }, { "epoch": 2.273143438453713, "grad_norm": 0.8851255774497986, "learning_rate": 1e-05, "loss": 0.4434, "mean_token_accuracy": 0.8568148612976074, "num_tokens": 712359950.0, "step": 4469 }, { "epoch": 2.273652085452696, "grad_norm": 1.0171219110488892, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8495321273803711, "num_tokens": 712510845.0, "step": 4470 }, { "epoch": 2.2741607324516786, "grad_norm": 0.9366046190261841, "learning_rate": 1e-05, "loss": 0.4215, "mean_token_accuracy": 0.8625251650810242, "num_tokens": 712667175.0, "step": 4471 }, { "epoch": 2.2746693794506614, "grad_norm": 0.949114203453064, "learning_rate": 1e-05, "loss": 0.4208, "mean_token_accuracy": 0.8625385761260986, "num_tokens": 712815226.0, "step": 4472 }, { "epoch": 2.2751780264496437, "grad_norm": 0.9524646401405334, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8614475727081299, "num_tokens": 712966666.0, "step": 4473 }, { "epoch": 2.2756866734486265, "grad_norm": 0.8781487941741943, "learning_rate": 1e-05, "loss": 0.4488, "mean_token_accuracy": 0.8545585870742798, "num_tokens": 713136477.0, "step": 4474 }, { "epoch": 2.2761953204476093, "grad_norm": 0.9310004711151123, "learning_rate": 1e-05, "loss": 0.4312, "mean_token_accuracy": 0.8581027984619141, "num_tokens": 713294399.0, "step": 4475 }, { "epoch": 2.276703967446592, "grad_norm": 0.8774072527885437, "learning_rate": 1e-05, "loss": 0.4156, "mean_token_accuracy": 0.8643208742141724, "num_tokens": 713456971.0, "step": 4476 }, { "epoch": 2.277212614445575, "grad_norm": 0.8634635806083679, "learning_rate": 1e-05, "loss": 0.4297, "mean_token_accuracy": 0.8596638441085815, "num_tokens": 713616120.0, "step": 4477 }, { "epoch": 2.2777212614445577, "grad_norm": 0.9329019784927368, "learning_rate": 1e-05, "loss": 0.4727, "mean_token_accuracy": 0.8485960364341736, "num_tokens": 713779256.0, "step": 4478 }, { "epoch": 2.27822990844354, "grad_norm": 0.9431872367858887, "learning_rate": 1e-05, "loss": 0.4389, "mean_token_accuracy": 0.8568741083145142, "num_tokens": 713937883.0, "step": 4479 }, { "epoch": 2.278738555442523, "grad_norm": 0.8608583807945251, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8544870615005493, "num_tokens": 714098535.0, "step": 4480 }, { "epoch": 2.2792472024415056, "grad_norm": 0.9586653113365173, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8556101322174072, "num_tokens": 714256873.0, "step": 4481 }, { "epoch": 2.2797558494404884, "grad_norm": 0.9250736236572266, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8535689115524292, "num_tokens": 714415903.0, "step": 4482 }, { "epoch": 2.280264496439471, "grad_norm": 1.0306283235549927, "learning_rate": 1e-05, "loss": 0.4531, "mean_token_accuracy": 0.8542593121528625, "num_tokens": 714569794.0, "step": 4483 }, { "epoch": 2.280773143438454, "grad_norm": 0.9855635166168213, "learning_rate": 1e-05, "loss": 0.4382, "mean_token_accuracy": 0.8572566509246826, "num_tokens": 714721963.0, "step": 4484 }, { "epoch": 2.2812817904374363, "grad_norm": 0.9278120994567871, "learning_rate": 1e-05, "loss": 0.4181, "mean_token_accuracy": 0.8632841110229492, "num_tokens": 714882142.0, "step": 4485 }, { "epoch": 2.281790437436419, "grad_norm": 0.9876662492752075, "learning_rate": 1e-05, "loss": 0.4712, "mean_token_accuracy": 0.8483551144599915, "num_tokens": 715036265.0, "step": 4486 }, { "epoch": 2.282299084435402, "grad_norm": 0.9069980382919312, "learning_rate": 1e-05, "loss": 0.4629, "mean_token_accuracy": 0.84922194480896, "num_tokens": 715196287.0, "step": 4487 }, { "epoch": 2.2828077314343846, "grad_norm": 0.907392680644989, "learning_rate": 1e-05, "loss": 0.426, "mean_token_accuracy": 0.8612542152404785, "num_tokens": 715351692.0, "step": 4488 }, { "epoch": 2.2833163784333674, "grad_norm": 0.9390474557876587, "learning_rate": 1e-05, "loss": 0.4124, "mean_token_accuracy": 0.8654056787490845, "num_tokens": 715509487.0, "step": 4489 }, { "epoch": 2.2838250254323498, "grad_norm": 0.8880806565284729, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.848419725894928, "num_tokens": 715667028.0, "step": 4490 }, { "epoch": 2.2843336724313326, "grad_norm": 0.8902788758277893, "learning_rate": 1e-05, "loss": 0.4285, "mean_token_accuracy": 0.8602020144462585, "num_tokens": 715828741.0, "step": 4491 }, { "epoch": 2.2848423194303153, "grad_norm": 0.9005802273750305, "learning_rate": 1e-05, "loss": 0.4398, "mean_token_accuracy": 0.857980489730835, "num_tokens": 715978519.0, "step": 4492 }, { "epoch": 2.285350966429298, "grad_norm": 0.8823654055595398, "learning_rate": 1e-05, "loss": 0.4389, "mean_token_accuracy": 0.8587138056755066, "num_tokens": 716138914.0, "step": 4493 }, { "epoch": 2.285859613428281, "grad_norm": 0.941698431968689, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8479748964309692, "num_tokens": 716294789.0, "step": 4494 }, { "epoch": 2.2863682604272633, "grad_norm": 0.9303914904594421, "learning_rate": 1e-05, "loss": 0.43, "mean_token_accuracy": 0.8607516288757324, "num_tokens": 716454119.0, "step": 4495 }, { "epoch": 2.286876907426246, "grad_norm": 0.9308568239212036, "learning_rate": 1e-05, "loss": 0.4225, "mean_token_accuracy": 0.8629610538482666, "num_tokens": 716601971.0, "step": 4496 }, { "epoch": 2.287385554425229, "grad_norm": 0.9544360041618347, "learning_rate": 1e-05, "loss": 0.4428, "mean_token_accuracy": 0.855931282043457, "num_tokens": 716762602.0, "step": 4497 }, { "epoch": 2.2878942014242116, "grad_norm": 0.8890806436538696, "learning_rate": 1e-05, "loss": 0.4297, "mean_token_accuracy": 0.8613079786300659, "num_tokens": 716916644.0, "step": 4498 }, { "epoch": 2.2884028484231944, "grad_norm": 0.8517684936523438, "learning_rate": 1e-05, "loss": 0.4343, "mean_token_accuracy": 0.8590466380119324, "num_tokens": 717073633.0, "step": 4499 }, { "epoch": 2.288911495422177, "grad_norm": 0.9687017202377319, "learning_rate": 1e-05, "loss": 0.4184, "mean_token_accuracy": 0.8640453219413757, "num_tokens": 717218108.0, "step": 4500 }, { "epoch": 2.2894201424211595, "grad_norm": 0.8814558386802673, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8642098903656006, "num_tokens": 717365072.0, "step": 4501 }, { "epoch": 2.2899287894201423, "grad_norm": 0.9428908228874207, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.855121374130249, "num_tokens": 717517090.0, "step": 4502 }, { "epoch": 2.290437436419125, "grad_norm": 0.9154829978942871, "learning_rate": 1e-05, "loss": 0.4381, "mean_token_accuracy": 0.857797384262085, "num_tokens": 717675365.0, "step": 4503 }, { "epoch": 2.290946083418108, "grad_norm": 0.8993266224861145, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.8522711992263794, "num_tokens": 717846642.0, "step": 4504 }, { "epoch": 2.2914547304170907, "grad_norm": 0.8822490572929382, "learning_rate": 1e-05, "loss": 0.4352, "mean_token_accuracy": 0.8591576814651489, "num_tokens": 718021255.0, "step": 4505 }, { "epoch": 2.2919633774160735, "grad_norm": 0.9621962308883667, "learning_rate": 1e-05, "loss": 0.4211, "mean_token_accuracy": 0.861912190914154, "num_tokens": 718173696.0, "step": 4506 }, { "epoch": 2.292472024415056, "grad_norm": 0.9636359810829163, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8525373935699463, "num_tokens": 718326779.0, "step": 4507 }, { "epoch": 2.2929806714140386, "grad_norm": 0.9130626320838928, "learning_rate": 1e-05, "loss": 0.4677, "mean_token_accuracy": 0.8482925891876221, "num_tokens": 718494164.0, "step": 4508 }, { "epoch": 2.2934893184130214, "grad_norm": 0.9200655221939087, "learning_rate": 1e-05, "loss": 0.4396, "mean_token_accuracy": 0.8560786247253418, "num_tokens": 718652480.0, "step": 4509 }, { "epoch": 2.293997965412004, "grad_norm": 0.8752954602241516, "learning_rate": 1e-05, "loss": 0.4355, "mean_token_accuracy": 0.8592425584793091, "num_tokens": 718815468.0, "step": 4510 }, { "epoch": 2.294506612410987, "grad_norm": 0.8763068914413452, "learning_rate": 1e-05, "loss": 0.4633, "mean_token_accuracy": 0.8521791696548462, "num_tokens": 718981560.0, "step": 4511 }, { "epoch": 2.2950152594099693, "grad_norm": 0.9308648109436035, "learning_rate": 1e-05, "loss": 0.4639, "mean_token_accuracy": 0.8517353534698486, "num_tokens": 719136494.0, "step": 4512 }, { "epoch": 2.295523906408952, "grad_norm": 0.9274827241897583, "learning_rate": 1e-05, "loss": 0.4766, "mean_token_accuracy": 0.8475729823112488, "num_tokens": 719310020.0, "step": 4513 }, { "epoch": 2.296032553407935, "grad_norm": 0.8966953158378601, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8623745441436768, "num_tokens": 719474769.0, "step": 4514 }, { "epoch": 2.2965412004069177, "grad_norm": 0.9108511209487915, "learning_rate": 1e-05, "loss": 0.4459, "mean_token_accuracy": 0.856505274772644, "num_tokens": 719627800.0, "step": 4515 }, { "epoch": 2.2970498474059005, "grad_norm": 0.9913238883018494, "learning_rate": 1e-05, "loss": 0.4565, "mean_token_accuracy": 0.8522071838378906, "num_tokens": 719775158.0, "step": 4516 }, { "epoch": 2.297558494404883, "grad_norm": 0.931964099407196, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8504119515419006, "num_tokens": 719928924.0, "step": 4517 }, { "epoch": 2.2980671414038656, "grad_norm": 0.9912185668945312, "learning_rate": 1e-05, "loss": 0.4179, "mean_token_accuracy": 0.8640331625938416, "num_tokens": 720075628.0, "step": 4518 }, { "epoch": 2.2985757884028484, "grad_norm": 1.0474159717559814, "learning_rate": 1e-05, "loss": 0.4772, "mean_token_accuracy": 0.8476606011390686, "num_tokens": 720233916.0, "step": 4519 }, { "epoch": 2.299084435401831, "grad_norm": 0.9056844711303711, "learning_rate": 1e-05, "loss": 0.4338, "mean_token_accuracy": 0.8600153923034668, "num_tokens": 720390860.0, "step": 4520 }, { "epoch": 2.299593082400814, "grad_norm": 0.9942348003387451, "learning_rate": 1e-05, "loss": 0.4526, "mean_token_accuracy": 0.856066882610321, "num_tokens": 720561398.0, "step": 4521 }, { "epoch": 2.3001017293997967, "grad_norm": 1.002570629119873, "learning_rate": 1e-05, "loss": 0.4321, "mean_token_accuracy": 0.8591198921203613, "num_tokens": 720717089.0, "step": 4522 }, { "epoch": 2.300610376398779, "grad_norm": 0.9176241755485535, "learning_rate": 1e-05, "loss": 0.4232, "mean_token_accuracy": 0.8634848594665527, "num_tokens": 720878498.0, "step": 4523 }, { "epoch": 2.301119023397762, "grad_norm": 1.0189260244369507, "learning_rate": 1e-05, "loss": 0.435, "mean_token_accuracy": 0.8588002324104309, "num_tokens": 721053638.0, "step": 4524 }, { "epoch": 2.3016276703967447, "grad_norm": 0.8975340127944946, "learning_rate": 1e-05, "loss": 0.4415, "mean_token_accuracy": 0.8558616638183594, "num_tokens": 721223050.0, "step": 4525 }, { "epoch": 2.3021363173957274, "grad_norm": 0.9887248873710632, "learning_rate": 1e-05, "loss": 0.4332, "mean_token_accuracy": 0.857849657535553, "num_tokens": 721379440.0, "step": 4526 }, { "epoch": 2.3026449643947102, "grad_norm": 0.9114660024642944, "learning_rate": 1e-05, "loss": 0.4479, "mean_token_accuracy": 0.8539832234382629, "num_tokens": 721540215.0, "step": 4527 }, { "epoch": 2.303153611393693, "grad_norm": 1.0104368925094604, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8585807085037231, "num_tokens": 721691506.0, "step": 4528 }, { "epoch": 2.3036622583926754, "grad_norm": 0.8985596895217896, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8575284481048584, "num_tokens": 721850085.0, "step": 4529 }, { "epoch": 2.304170905391658, "grad_norm": 0.9269108176231384, "learning_rate": 1e-05, "loss": 0.4415, "mean_token_accuracy": 0.8557599782943726, "num_tokens": 721991138.0, "step": 4530 }, { "epoch": 2.304679552390641, "grad_norm": 0.898306131362915, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8564971685409546, "num_tokens": 722158468.0, "step": 4531 }, { "epoch": 2.3051881993896237, "grad_norm": 0.9220131039619446, "learning_rate": 1e-05, "loss": 0.4573, "mean_token_accuracy": 0.8531209826469421, "num_tokens": 722325029.0, "step": 4532 }, { "epoch": 2.3056968463886065, "grad_norm": 0.9147054553031921, "learning_rate": 1e-05, "loss": 0.4498, "mean_token_accuracy": 0.8561792969703674, "num_tokens": 722478424.0, "step": 4533 }, { "epoch": 2.306205493387589, "grad_norm": 0.9304201006889343, "learning_rate": 1e-05, "loss": 0.4223, "mean_token_accuracy": 0.8611265420913696, "num_tokens": 722639326.0, "step": 4534 }, { "epoch": 2.3067141403865716, "grad_norm": 0.9701643586158752, "learning_rate": 1e-05, "loss": 0.4742, "mean_token_accuracy": 0.8480482697486877, "num_tokens": 722788846.0, "step": 4535 }, { "epoch": 2.3072227873855544, "grad_norm": 0.9791626930236816, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8625706434249878, "num_tokens": 722945492.0, "step": 4536 }, { "epoch": 2.307731434384537, "grad_norm": 1.2803678512573242, "learning_rate": 1e-05, "loss": 0.4417, "mean_token_accuracy": 0.8556251525878906, "num_tokens": 723112669.0, "step": 4537 }, { "epoch": 2.30824008138352, "grad_norm": 1.0095150470733643, "learning_rate": 1e-05, "loss": 0.5039, "mean_token_accuracy": 0.839377760887146, "num_tokens": 723262253.0, "step": 4538 }, { "epoch": 2.3087487283825023, "grad_norm": 0.8973872065544128, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.850966215133667, "num_tokens": 723418494.0, "step": 4539 }, { "epoch": 2.309257375381485, "grad_norm": 0.9098710417747498, "learning_rate": 1e-05, "loss": 0.4593, "mean_token_accuracy": 0.85355544090271, "num_tokens": 723575991.0, "step": 4540 }, { "epoch": 2.309766022380468, "grad_norm": 0.9440922737121582, "learning_rate": 1e-05, "loss": 0.4563, "mean_token_accuracy": 0.8534846305847168, "num_tokens": 723732068.0, "step": 4541 }, { "epoch": 2.3102746693794507, "grad_norm": 0.9420719742774963, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8521120548248291, "num_tokens": 723892353.0, "step": 4542 }, { "epoch": 2.3107833163784335, "grad_norm": 0.8923868536949158, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8646926283836365, "num_tokens": 724053937.0, "step": 4543 }, { "epoch": 2.311291963377416, "grad_norm": 0.9159566164016724, "learning_rate": 1e-05, "loss": 0.4475, "mean_token_accuracy": 0.855766773223877, "num_tokens": 724220225.0, "step": 4544 }, { "epoch": 2.3118006103763986, "grad_norm": 0.9505776762962341, "learning_rate": 1e-05, "loss": 0.4641, "mean_token_accuracy": 0.8512469530105591, "num_tokens": 724386972.0, "step": 4545 }, { "epoch": 2.3123092573753814, "grad_norm": 1.0132519006729126, "learning_rate": 1e-05, "loss": 0.4338, "mean_token_accuracy": 0.8611284494400024, "num_tokens": 724542617.0, "step": 4546 }, { "epoch": 2.312817904374364, "grad_norm": 0.8784273862838745, "learning_rate": 1e-05, "loss": 0.4289, "mean_token_accuracy": 0.8610193133354187, "num_tokens": 724718591.0, "step": 4547 }, { "epoch": 2.313326551373347, "grad_norm": 0.9775454998016357, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8595489859580994, "num_tokens": 724863471.0, "step": 4548 }, { "epoch": 2.3138351983723298, "grad_norm": 0.8906097412109375, "learning_rate": 1e-05, "loss": 0.4691, "mean_token_accuracy": 0.850866436958313, "num_tokens": 725027948.0, "step": 4549 }, { "epoch": 2.3143438453713125, "grad_norm": 0.9929198622703552, "learning_rate": 1e-05, "loss": 0.4457, "mean_token_accuracy": 0.8580214977264404, "num_tokens": 725205147.0, "step": 4550 }, { "epoch": 2.314852492370295, "grad_norm": 0.8838443756103516, "learning_rate": 1e-05, "loss": 0.4399, "mean_token_accuracy": 0.8587745428085327, "num_tokens": 725371502.0, "step": 4551 }, { "epoch": 2.3153611393692777, "grad_norm": 0.953943133354187, "learning_rate": 1e-05, "loss": 0.438, "mean_token_accuracy": 0.8588062524795532, "num_tokens": 725544049.0, "step": 4552 }, { "epoch": 2.3158697863682605, "grad_norm": 0.9285664558410645, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.857245922088623, "num_tokens": 725706763.0, "step": 4553 }, { "epoch": 2.3163784333672433, "grad_norm": 0.9023874998092651, "learning_rate": 1e-05, "loss": 0.4485, "mean_token_accuracy": 0.8561683893203735, "num_tokens": 725863765.0, "step": 4554 }, { "epoch": 2.316887080366226, "grad_norm": 1.0573559999465942, "learning_rate": 1e-05, "loss": 0.4623, "mean_token_accuracy": 0.851728081703186, "num_tokens": 726017034.0, "step": 4555 }, { "epoch": 2.3173957273652084, "grad_norm": 0.8584126830101013, "learning_rate": 1e-05, "loss": 0.437, "mean_token_accuracy": 0.8590702414512634, "num_tokens": 726172877.0, "step": 4556 }, { "epoch": 2.317904374364191, "grad_norm": 0.9466336369514465, "learning_rate": 1e-05, "loss": 0.4343, "mean_token_accuracy": 0.8593477010726929, "num_tokens": 726326358.0, "step": 4557 }, { "epoch": 2.318413021363174, "grad_norm": 0.9617027640342712, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.8561615943908691, "num_tokens": 726475413.0, "step": 4558 }, { "epoch": 2.3189216683621567, "grad_norm": 0.9507430791854858, "learning_rate": 1e-05, "loss": 0.4893, "mean_token_accuracy": 0.8450244665145874, "num_tokens": 726644059.0, "step": 4559 }, { "epoch": 2.3194303153611395, "grad_norm": 0.9864686727523804, "learning_rate": 1e-05, "loss": 0.4401, "mean_token_accuracy": 0.8587028980255127, "num_tokens": 726809779.0, "step": 4560 }, { "epoch": 2.319938962360122, "grad_norm": 0.9368869662284851, "learning_rate": 1e-05, "loss": 0.4226, "mean_token_accuracy": 0.8631230592727661, "num_tokens": 726970762.0, "step": 4561 }, { "epoch": 2.3204476093591047, "grad_norm": 0.9573301076889038, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.8558048605918884, "num_tokens": 727125760.0, "step": 4562 }, { "epoch": 2.3209562563580874, "grad_norm": 0.9050489068031311, "learning_rate": 1e-05, "loss": 0.4357, "mean_token_accuracy": 0.85848069190979, "num_tokens": 727284662.0, "step": 4563 }, { "epoch": 2.3214649033570702, "grad_norm": 1.0446038246154785, "learning_rate": 1e-05, "loss": 0.454, "mean_token_accuracy": 0.8540219664573669, "num_tokens": 727438075.0, "step": 4564 }, { "epoch": 2.321973550356053, "grad_norm": 0.9578843116760254, "learning_rate": 1e-05, "loss": 0.4557, "mean_token_accuracy": 0.8550673723220825, "num_tokens": 727597413.0, "step": 4565 }, { "epoch": 2.3224821973550354, "grad_norm": 0.8926106691360474, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8584423065185547, "num_tokens": 727764115.0, "step": 4566 }, { "epoch": 2.322990844354018, "grad_norm": 0.9425592422485352, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8564541339874268, "num_tokens": 727931105.0, "step": 4567 }, { "epoch": 2.323499491353001, "grad_norm": 0.858015775680542, "learning_rate": 1e-05, "loss": 0.4403, "mean_token_accuracy": 0.857099175453186, "num_tokens": 728092211.0, "step": 4568 }, { "epoch": 2.3240081383519837, "grad_norm": 0.9361171126365662, "learning_rate": 1e-05, "loss": 0.4751, "mean_token_accuracy": 0.8496037721633911, "num_tokens": 728258373.0, "step": 4569 }, { "epoch": 2.3245167853509665, "grad_norm": 0.9430844187736511, "learning_rate": 1e-05, "loss": 0.4247, "mean_token_accuracy": 0.8636689186096191, "num_tokens": 728425475.0, "step": 4570 }, { "epoch": 2.3250254323499493, "grad_norm": 0.9531563520431519, "learning_rate": 1e-05, "loss": 0.4712, "mean_token_accuracy": 0.8488687872886658, "num_tokens": 728590364.0, "step": 4571 }, { "epoch": 2.325534079348932, "grad_norm": 0.9843347668647766, "learning_rate": 1e-05, "loss": 0.4317, "mean_token_accuracy": 0.8596490025520325, "num_tokens": 728738679.0, "step": 4572 }, { "epoch": 2.3260427263479144, "grad_norm": 0.8593404293060303, "learning_rate": 1e-05, "loss": 0.4657, "mean_token_accuracy": 0.84918212890625, "num_tokens": 728901362.0, "step": 4573 }, { "epoch": 2.326551373346897, "grad_norm": 0.8554495573043823, "learning_rate": 1e-05, "loss": 0.4677, "mean_token_accuracy": 0.8513251543045044, "num_tokens": 729077322.0, "step": 4574 }, { "epoch": 2.32706002034588, "grad_norm": 0.9305860996246338, "learning_rate": 1e-05, "loss": 0.4228, "mean_token_accuracy": 0.8608434200286865, "num_tokens": 729222577.0, "step": 4575 }, { "epoch": 2.327568667344863, "grad_norm": 0.8833337426185608, "learning_rate": 1e-05, "loss": 0.4401, "mean_token_accuracy": 0.8577058911323547, "num_tokens": 729376355.0, "step": 4576 }, { "epoch": 2.3280773143438456, "grad_norm": 0.8967020511627197, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8576448559761047, "num_tokens": 729528421.0, "step": 4577 }, { "epoch": 2.328585961342828, "grad_norm": 0.8985277414321899, "learning_rate": 1e-05, "loss": 0.4433, "mean_token_accuracy": 0.8569909930229187, "num_tokens": 729697233.0, "step": 4578 }, { "epoch": 2.3290946083418107, "grad_norm": 0.8995670676231384, "learning_rate": 1e-05, "loss": 0.4314, "mean_token_accuracy": 0.8587366938591003, "num_tokens": 729850719.0, "step": 4579 }, { "epoch": 2.3296032553407935, "grad_norm": 0.9012477397918701, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8507951498031616, "num_tokens": 730009711.0, "step": 4580 }, { "epoch": 2.3301119023397763, "grad_norm": 0.8857806921005249, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.8554847240447998, "num_tokens": 730162457.0, "step": 4581 }, { "epoch": 2.330620549338759, "grad_norm": 0.9476555585861206, "learning_rate": 1e-05, "loss": 0.4753, "mean_token_accuracy": 0.8466869592666626, "num_tokens": 730315273.0, "step": 4582 }, { "epoch": 2.3311291963377414, "grad_norm": 0.8659005165100098, "learning_rate": 1e-05, "loss": 0.4301, "mean_token_accuracy": 0.8618274927139282, "num_tokens": 730464801.0, "step": 4583 }, { "epoch": 2.331637843336724, "grad_norm": 0.8836120963096619, "learning_rate": 1e-05, "loss": 0.4251, "mean_token_accuracy": 0.8608165979385376, "num_tokens": 730622310.0, "step": 4584 }, { "epoch": 2.332146490335707, "grad_norm": 0.8895693421363831, "learning_rate": 1e-05, "loss": 0.4398, "mean_token_accuracy": 0.8577582836151123, "num_tokens": 730778206.0, "step": 4585 }, { "epoch": 2.3326551373346898, "grad_norm": 0.9397143721580505, "learning_rate": 1e-05, "loss": 0.4631, "mean_token_accuracy": 0.8512009382247925, "num_tokens": 730938482.0, "step": 4586 }, { "epoch": 2.3331637843336726, "grad_norm": 0.9026806950569153, "learning_rate": 1e-05, "loss": 0.4486, "mean_token_accuracy": 0.8560445308685303, "num_tokens": 731100793.0, "step": 4587 }, { "epoch": 2.333672431332655, "grad_norm": 0.9613650441169739, "learning_rate": 1e-05, "loss": 0.4775, "mean_token_accuracy": 0.8481220602989197, "num_tokens": 731256654.0, "step": 4588 }, { "epoch": 2.3341810783316377, "grad_norm": 0.8520517349243164, "learning_rate": 1e-05, "loss": 0.4388, "mean_token_accuracy": 0.8573417067527771, "num_tokens": 731413316.0, "step": 4589 }, { "epoch": 2.3346897253306205, "grad_norm": 0.9405336976051331, "learning_rate": 1e-05, "loss": 0.4106, "mean_token_accuracy": 0.864769458770752, "num_tokens": 731560273.0, "step": 4590 }, { "epoch": 2.3351983723296033, "grad_norm": 0.879772424697876, "learning_rate": 1e-05, "loss": 0.4109, "mean_token_accuracy": 0.8663976788520813, "num_tokens": 731716793.0, "step": 4591 }, { "epoch": 2.335707019328586, "grad_norm": 0.9004299640655518, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8643734455108643, "num_tokens": 731882252.0, "step": 4592 }, { "epoch": 2.336215666327569, "grad_norm": 0.8501989245414734, "learning_rate": 1e-05, "loss": 0.4383, "mean_token_accuracy": 0.8574635982513428, "num_tokens": 732050123.0, "step": 4593 }, { "epoch": 2.3367243133265516, "grad_norm": 0.8961136937141418, "learning_rate": 1e-05, "loss": 0.4315, "mean_token_accuracy": 0.8601347208023071, "num_tokens": 732205142.0, "step": 4594 }, { "epoch": 2.337232960325534, "grad_norm": 0.9832904934883118, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8558573722839355, "num_tokens": 732361135.0, "step": 4595 }, { "epoch": 2.3377416073245167, "grad_norm": 0.9565850496292114, "learning_rate": 1e-05, "loss": 0.4633, "mean_token_accuracy": 0.8502834439277649, "num_tokens": 732523658.0, "step": 4596 }, { "epoch": 2.3382502543234995, "grad_norm": 0.9144861102104187, "learning_rate": 1e-05, "loss": 0.4566, "mean_token_accuracy": 0.8545467853546143, "num_tokens": 732677365.0, "step": 4597 }, { "epoch": 2.3387589013224823, "grad_norm": 0.8910345435142517, "learning_rate": 1e-05, "loss": 0.438, "mean_token_accuracy": 0.857315182685852, "num_tokens": 732842797.0, "step": 4598 }, { "epoch": 2.339267548321465, "grad_norm": 0.9919068217277527, "learning_rate": 1e-05, "loss": 0.4601, "mean_token_accuracy": 0.8525848984718323, "num_tokens": 733002613.0, "step": 4599 }, { "epoch": 2.3397761953204474, "grad_norm": 0.9106639623641968, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8479093909263611, "num_tokens": 733168024.0, "step": 4600 }, { "epoch": 2.3402848423194302, "grad_norm": 0.9065938591957092, "learning_rate": 1e-05, "loss": 0.4181, "mean_token_accuracy": 0.8635804653167725, "num_tokens": 733327001.0, "step": 4601 }, { "epoch": 2.340793489318413, "grad_norm": 0.8956307172775269, "learning_rate": 1e-05, "loss": 0.4305, "mean_token_accuracy": 0.859512984752655, "num_tokens": 733495169.0, "step": 4602 }, { "epoch": 2.341302136317396, "grad_norm": 0.9894669055938721, "learning_rate": 1e-05, "loss": 0.4321, "mean_token_accuracy": 0.8590751886367798, "num_tokens": 733633586.0, "step": 4603 }, { "epoch": 2.3418107833163786, "grad_norm": 0.9020395278930664, "learning_rate": 1e-05, "loss": 0.4425, "mean_token_accuracy": 0.8553770184516907, "num_tokens": 733783519.0, "step": 4604 }, { "epoch": 2.342319430315361, "grad_norm": 1.063341736793518, "learning_rate": 1e-05, "loss": 0.4437, "mean_token_accuracy": 0.8559243679046631, "num_tokens": 733947060.0, "step": 4605 }, { "epoch": 2.3428280773143437, "grad_norm": 0.9173327684402466, "learning_rate": 1e-05, "loss": 0.4185, "mean_token_accuracy": 0.8639484643936157, "num_tokens": 734114541.0, "step": 4606 }, { "epoch": 2.3433367243133265, "grad_norm": 1.05625319480896, "learning_rate": 1e-05, "loss": 0.4475, "mean_token_accuracy": 0.8549398183822632, "num_tokens": 734282987.0, "step": 4607 }, { "epoch": 2.3438453713123093, "grad_norm": 0.9332970380783081, "learning_rate": 1e-05, "loss": 0.4727, "mean_token_accuracy": 0.8487000465393066, "num_tokens": 734444293.0, "step": 4608 }, { "epoch": 2.344354018311292, "grad_norm": 0.9928670525550842, "learning_rate": 1e-05, "loss": 0.4531, "mean_token_accuracy": 0.8533625602722168, "num_tokens": 734591939.0, "step": 4609 }, { "epoch": 2.3448626653102744, "grad_norm": 0.8672624230384827, "learning_rate": 1e-05, "loss": 0.4033, "mean_token_accuracy": 0.8678281903266907, "num_tokens": 734747917.0, "step": 4610 }, { "epoch": 2.345371312309257, "grad_norm": 0.969215989112854, "learning_rate": 1e-05, "loss": 0.4205, "mean_token_accuracy": 0.8641879558563232, "num_tokens": 734912507.0, "step": 4611 }, { "epoch": 2.34587995930824, "grad_norm": 0.9247586131095886, "learning_rate": 1e-05, "loss": 0.4181, "mean_token_accuracy": 0.8647682666778564, "num_tokens": 735069410.0, "step": 4612 }, { "epoch": 2.346388606307223, "grad_norm": 0.8570278882980347, "learning_rate": 1e-05, "loss": 0.4173, "mean_token_accuracy": 0.8636913299560547, "num_tokens": 735225465.0, "step": 4613 }, { "epoch": 2.3468972533062056, "grad_norm": 0.9816693663597107, "learning_rate": 1e-05, "loss": 0.4198, "mean_token_accuracy": 0.8631116151809692, "num_tokens": 735380532.0, "step": 4614 }, { "epoch": 2.3474059003051884, "grad_norm": 0.8836074471473694, "learning_rate": 1e-05, "loss": 0.4263, "mean_token_accuracy": 0.8616890907287598, "num_tokens": 735530373.0, "step": 4615 }, { "epoch": 2.347914547304171, "grad_norm": 0.9004747867584229, "learning_rate": 1e-05, "loss": 0.4675, "mean_token_accuracy": 0.8508669137954712, "num_tokens": 735689763.0, "step": 4616 }, { "epoch": 2.3484231943031535, "grad_norm": 0.9340812563896179, "learning_rate": 1e-05, "loss": 0.4009, "mean_token_accuracy": 0.8688367009162903, "num_tokens": 735841618.0, "step": 4617 }, { "epoch": 2.3489318413021363, "grad_norm": 0.9207538962364197, "learning_rate": 1e-05, "loss": 0.4093, "mean_token_accuracy": 0.8663789629936218, "num_tokens": 736000433.0, "step": 4618 }, { "epoch": 2.349440488301119, "grad_norm": 0.9837210178375244, "learning_rate": 1e-05, "loss": 0.4337, "mean_token_accuracy": 0.857857346534729, "num_tokens": 736157243.0, "step": 4619 }, { "epoch": 2.349949135300102, "grad_norm": 0.8667672872543335, "learning_rate": 1e-05, "loss": 0.4309, "mean_token_accuracy": 0.8590836524963379, "num_tokens": 736329409.0, "step": 4620 }, { "epoch": 2.3504577822990846, "grad_norm": 1.0480256080627441, "learning_rate": 1e-05, "loss": 0.4376, "mean_token_accuracy": 0.856756329536438, "num_tokens": 736483322.0, "step": 4621 }, { "epoch": 2.350966429298067, "grad_norm": 0.9391472935676575, "learning_rate": 1e-05, "loss": 0.4443, "mean_token_accuracy": 0.8562041521072388, "num_tokens": 736642712.0, "step": 4622 }, { "epoch": 2.3514750762970498, "grad_norm": 0.927949845790863, "learning_rate": 1e-05, "loss": 0.4378, "mean_token_accuracy": 0.8579003214836121, "num_tokens": 736795254.0, "step": 4623 }, { "epoch": 2.3519837232960326, "grad_norm": 0.9654059410095215, "learning_rate": 1e-05, "loss": 0.4435, "mean_token_accuracy": 0.855154812335968, "num_tokens": 736956462.0, "step": 4624 }, { "epoch": 2.3524923702950153, "grad_norm": 0.9332446455955505, "learning_rate": 1e-05, "loss": 0.4476, "mean_token_accuracy": 0.8547961115837097, "num_tokens": 737113166.0, "step": 4625 }, { "epoch": 2.353001017293998, "grad_norm": 0.8744509816169739, "learning_rate": 1e-05, "loss": 0.42, "mean_token_accuracy": 0.8633664846420288, "num_tokens": 737279301.0, "step": 4626 }, { "epoch": 2.3535096642929805, "grad_norm": 0.8209827542304993, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8543109893798828, "num_tokens": 737455486.0, "step": 4627 }, { "epoch": 2.3540183112919633, "grad_norm": 0.8656889796257019, "learning_rate": 1e-05, "loss": 0.4337, "mean_token_accuracy": 0.8588730692863464, "num_tokens": 737617291.0, "step": 4628 }, { "epoch": 2.354526958290946, "grad_norm": 0.9394766092300415, "learning_rate": 1e-05, "loss": 0.448, "mean_token_accuracy": 0.8561714291572571, "num_tokens": 737775312.0, "step": 4629 }, { "epoch": 2.355035605289929, "grad_norm": 0.8893435597419739, "learning_rate": 1e-05, "loss": 0.4203, "mean_token_accuracy": 0.862876296043396, "num_tokens": 737938881.0, "step": 4630 }, { "epoch": 2.3555442522889116, "grad_norm": 0.9624534249305725, "learning_rate": 1e-05, "loss": 0.4545, "mean_token_accuracy": 0.8524142503738403, "num_tokens": 738091397.0, "step": 4631 }, { "epoch": 2.356052899287894, "grad_norm": 0.8696402311325073, "learning_rate": 1e-05, "loss": 0.4214, "mean_token_accuracy": 0.8624419569969177, "num_tokens": 738254476.0, "step": 4632 }, { "epoch": 2.3565615462868768, "grad_norm": 0.9505226016044617, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8569638133049011, "num_tokens": 738427280.0, "step": 4633 }, { "epoch": 2.3570701932858595, "grad_norm": 0.9444771409034729, "learning_rate": 1e-05, "loss": 0.4246, "mean_token_accuracy": 0.8611069917678833, "num_tokens": 738587235.0, "step": 4634 }, { "epoch": 2.3575788402848423, "grad_norm": 0.8936687707901001, "learning_rate": 1e-05, "loss": 0.4012, "mean_token_accuracy": 0.8680918216705322, "num_tokens": 738736548.0, "step": 4635 }, { "epoch": 2.358087487283825, "grad_norm": 1.0075676441192627, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8534650802612305, "num_tokens": 738903821.0, "step": 4636 }, { "epoch": 2.358596134282808, "grad_norm": 0.9175559878349304, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8470964431762695, "num_tokens": 739056824.0, "step": 4637 }, { "epoch": 2.3591047812817907, "grad_norm": 1.0028626918792725, "learning_rate": 1e-05, "loss": 0.4199, "mean_token_accuracy": 0.8617447018623352, "num_tokens": 739208932.0, "step": 4638 }, { "epoch": 2.359613428280773, "grad_norm": 0.9204167723655701, "learning_rate": 1e-05, "loss": 0.4502, "mean_token_accuracy": 0.8545858860015869, "num_tokens": 739369912.0, "step": 4639 }, { "epoch": 2.360122075279756, "grad_norm": 1.0421146154403687, "learning_rate": 1e-05, "loss": 0.4257, "mean_token_accuracy": 0.8625353574752808, "num_tokens": 739529429.0, "step": 4640 }, { "epoch": 2.3606307222787386, "grad_norm": 1.040346622467041, "learning_rate": 1e-05, "loss": 0.4208, "mean_token_accuracy": 0.8633586168289185, "num_tokens": 739682071.0, "step": 4641 }, { "epoch": 2.3611393692777214, "grad_norm": 0.914192795753479, "learning_rate": 1e-05, "loss": 0.4338, "mean_token_accuracy": 0.8609576225280762, "num_tokens": 739826514.0, "step": 4642 }, { "epoch": 2.361648016276704, "grad_norm": 0.9608312249183655, "learning_rate": 1e-05, "loss": 0.4699, "mean_token_accuracy": 0.8505814671516418, "num_tokens": 739987484.0, "step": 4643 }, { "epoch": 2.3621566632756865, "grad_norm": 0.9846158027648926, "learning_rate": 1e-05, "loss": 0.4235, "mean_token_accuracy": 0.8619943857192993, "num_tokens": 740139289.0, "step": 4644 }, { "epoch": 2.3626653102746693, "grad_norm": 0.9244502782821655, "learning_rate": 1e-05, "loss": 0.4384, "mean_token_accuracy": 0.8589105606079102, "num_tokens": 740295292.0, "step": 4645 }, { "epoch": 2.363173957273652, "grad_norm": 0.967990517616272, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8616007566452026, "num_tokens": 740449970.0, "step": 4646 }, { "epoch": 2.363682604272635, "grad_norm": 0.8939878940582275, "learning_rate": 1e-05, "loss": 0.4469, "mean_token_accuracy": 0.8555958271026611, "num_tokens": 740607621.0, "step": 4647 }, { "epoch": 2.3641912512716177, "grad_norm": 0.9425458908081055, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.8575696349143982, "num_tokens": 740754163.0, "step": 4648 }, { "epoch": 2.3646998982706, "grad_norm": 0.9591701626777649, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8612913489341736, "num_tokens": 740909565.0, "step": 4649 }, { "epoch": 2.365208545269583, "grad_norm": 0.9903981685638428, "learning_rate": 1e-05, "loss": 0.4322, "mean_token_accuracy": 0.8616185188293457, "num_tokens": 741074703.0, "step": 4650 }, { "epoch": 2.3657171922685656, "grad_norm": 0.9808692932128906, "learning_rate": 1e-05, "loss": 0.4508, "mean_token_accuracy": 0.854599118232727, "num_tokens": 741232219.0, "step": 4651 }, { "epoch": 2.3662258392675484, "grad_norm": 0.874530017375946, "learning_rate": 1e-05, "loss": 0.4125, "mean_token_accuracy": 0.8656905889511108, "num_tokens": 741403525.0, "step": 4652 }, { "epoch": 2.366734486266531, "grad_norm": 0.9854841828346252, "learning_rate": 1e-05, "loss": 0.4002, "mean_token_accuracy": 0.8691383600234985, "num_tokens": 741564479.0, "step": 4653 }, { "epoch": 2.3672431332655135, "grad_norm": 0.9224373698234558, "learning_rate": 1e-05, "loss": 0.4258, "mean_token_accuracy": 0.8609287738800049, "num_tokens": 741728453.0, "step": 4654 }, { "epoch": 2.3677517802644963, "grad_norm": 0.9162703156471252, "learning_rate": 1e-05, "loss": 0.4397, "mean_token_accuracy": 0.8567160367965698, "num_tokens": 741877478.0, "step": 4655 }, { "epoch": 2.368260427263479, "grad_norm": 1.03668212890625, "learning_rate": 1e-05, "loss": 0.4199, "mean_token_accuracy": 0.8631287217140198, "num_tokens": 742045389.0, "step": 4656 }, { "epoch": 2.368769074262462, "grad_norm": 0.8980605006217957, "learning_rate": 1e-05, "loss": 0.439, "mean_token_accuracy": 0.8585948348045349, "num_tokens": 742197956.0, "step": 4657 }, { "epoch": 2.3692777212614446, "grad_norm": 0.9165881872177124, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8621605038642883, "num_tokens": 742355845.0, "step": 4658 }, { "epoch": 2.3697863682604274, "grad_norm": 0.8757665753364563, "learning_rate": 1e-05, "loss": 0.4593, "mean_token_accuracy": 0.851668119430542, "num_tokens": 742516329.0, "step": 4659 }, { "epoch": 2.3702950152594098, "grad_norm": 0.935166597366333, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8546200394630432, "num_tokens": 742671682.0, "step": 4660 }, { "epoch": 2.3708036622583926, "grad_norm": 1.0466597080230713, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8509244918823242, "num_tokens": 742824141.0, "step": 4661 }, { "epoch": 2.3713123092573754, "grad_norm": 0.8813978433609009, "learning_rate": 1e-05, "loss": 0.4489, "mean_token_accuracy": 0.8555780649185181, "num_tokens": 742999389.0, "step": 4662 }, { "epoch": 2.371820956256358, "grad_norm": 0.8976166248321533, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8543903827667236, "num_tokens": 743157150.0, "step": 4663 }, { "epoch": 2.372329603255341, "grad_norm": 0.9232761859893799, "learning_rate": 1e-05, "loss": 0.4397, "mean_token_accuracy": 0.8589328527450562, "num_tokens": 743328436.0, "step": 4664 }, { "epoch": 2.3728382502543237, "grad_norm": 0.8817173838615417, "learning_rate": 1e-05, "loss": 0.4234, "mean_token_accuracy": 0.8617080450057983, "num_tokens": 743481174.0, "step": 4665 }, { "epoch": 2.373346897253306, "grad_norm": 1.13491952419281, "learning_rate": 1e-05, "loss": 0.4597, "mean_token_accuracy": 0.8530697822570801, "num_tokens": 743642776.0, "step": 4666 }, { "epoch": 2.373855544252289, "grad_norm": 1.4366291761398315, "learning_rate": 1e-05, "loss": 0.4247, "mean_token_accuracy": 0.8627820014953613, "num_tokens": 743809658.0, "step": 4667 }, { "epoch": 2.3743641912512716, "grad_norm": 0.9859811067581177, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8563073873519897, "num_tokens": 743962306.0, "step": 4668 }, { "epoch": 2.3748728382502544, "grad_norm": 0.9425814151763916, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8557022213935852, "num_tokens": 744117518.0, "step": 4669 }, { "epoch": 2.375381485249237, "grad_norm": 0.9705214500427246, "learning_rate": 1e-05, "loss": 0.4738, "mean_token_accuracy": 0.850643515586853, "num_tokens": 744277189.0, "step": 4670 }, { "epoch": 2.3758901322482195, "grad_norm": 0.9333838820457458, "learning_rate": 1e-05, "loss": 0.4614, "mean_token_accuracy": 0.8518938422203064, "num_tokens": 744432354.0, "step": 4671 }, { "epoch": 2.3763987792472023, "grad_norm": 0.8503909111022949, "learning_rate": 1e-05, "loss": 0.4127, "mean_token_accuracy": 0.8648768067359924, "num_tokens": 744602450.0, "step": 4672 }, { "epoch": 2.376907426246185, "grad_norm": 0.9568102955818176, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8558696508407593, "num_tokens": 744763687.0, "step": 4673 }, { "epoch": 2.377416073245168, "grad_norm": 0.9368235468864441, "learning_rate": 1e-05, "loss": 0.4323, "mean_token_accuracy": 0.859917938709259, "num_tokens": 744941184.0, "step": 4674 }, { "epoch": 2.3779247202441507, "grad_norm": 1.0416042804718018, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.8539679050445557, "num_tokens": 745110653.0, "step": 4675 }, { "epoch": 2.378433367243133, "grad_norm": 0.9496248364448547, "learning_rate": 1e-05, "loss": 0.4354, "mean_token_accuracy": 0.8581831455230713, "num_tokens": 745273816.0, "step": 4676 }, { "epoch": 2.378942014242116, "grad_norm": 0.9366162419319153, "learning_rate": 1e-05, "loss": 0.4498, "mean_token_accuracy": 0.854411780834198, "num_tokens": 745424921.0, "step": 4677 }, { "epoch": 2.3794506612410986, "grad_norm": 0.8841323852539062, "learning_rate": 1e-05, "loss": 0.466, "mean_token_accuracy": 0.8519430756568909, "num_tokens": 745600889.0, "step": 4678 }, { "epoch": 2.3799593082400814, "grad_norm": 0.9119141101837158, "learning_rate": 1e-05, "loss": 0.4206, "mean_token_accuracy": 0.8637368083000183, "num_tokens": 745766797.0, "step": 4679 }, { "epoch": 2.380467955239064, "grad_norm": 1.0425060987472534, "learning_rate": 1e-05, "loss": 0.4193, "mean_token_accuracy": 0.8617416024208069, "num_tokens": 745931100.0, "step": 4680 }, { "epoch": 2.380976602238047, "grad_norm": 0.9240050315856934, "learning_rate": 1e-05, "loss": 0.4364, "mean_token_accuracy": 0.8603978157043457, "num_tokens": 746090202.0, "step": 4681 }, { "epoch": 2.3814852492370293, "grad_norm": 0.9670682549476624, "learning_rate": 1e-05, "loss": 0.4434, "mean_token_accuracy": 0.8564561605453491, "num_tokens": 746255957.0, "step": 4682 }, { "epoch": 2.381993896236012, "grad_norm": 0.9642400145530701, "learning_rate": 1e-05, "loss": 0.4271, "mean_token_accuracy": 0.8614307641983032, "num_tokens": 746420006.0, "step": 4683 }, { "epoch": 2.382502543234995, "grad_norm": 0.9733436107635498, "learning_rate": 1e-05, "loss": 0.4519, "mean_token_accuracy": 0.8539668321609497, "num_tokens": 746589548.0, "step": 4684 }, { "epoch": 2.3830111902339777, "grad_norm": 0.9867450594902039, "learning_rate": 1e-05, "loss": 0.423, "mean_token_accuracy": 0.8615120649337769, "num_tokens": 746746846.0, "step": 4685 }, { "epoch": 2.3835198372329605, "grad_norm": 0.9444828629493713, "learning_rate": 1e-05, "loss": 0.4395, "mean_token_accuracy": 0.8575142025947571, "num_tokens": 746894866.0, "step": 4686 }, { "epoch": 2.3840284842319432, "grad_norm": 1.0291507244110107, "learning_rate": 1e-05, "loss": 0.4396, "mean_token_accuracy": 0.8585408926010132, "num_tokens": 747055985.0, "step": 4687 }, { "epoch": 2.3845371312309256, "grad_norm": 0.850340723991394, "learning_rate": 1e-05, "loss": 0.4064, "mean_token_accuracy": 0.8683385848999023, "num_tokens": 747223779.0, "step": 4688 }, { "epoch": 2.3850457782299084, "grad_norm": 1.0372014045715332, "learning_rate": 1e-05, "loss": 0.4207, "mean_token_accuracy": 0.863206148147583, "num_tokens": 747378261.0, "step": 4689 }, { "epoch": 2.385554425228891, "grad_norm": 0.9969184994697571, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8603659868240356, "num_tokens": 747533998.0, "step": 4690 }, { "epoch": 2.386063072227874, "grad_norm": 0.9529933929443359, "learning_rate": 1e-05, "loss": 0.4821, "mean_token_accuracy": 0.8441590070724487, "num_tokens": 747695636.0, "step": 4691 }, { "epoch": 2.3865717192268567, "grad_norm": 1.0091328620910645, "learning_rate": 1e-05, "loss": 0.4624, "mean_token_accuracy": 0.851618230342865, "num_tokens": 747850725.0, "step": 4692 }, { "epoch": 2.387080366225839, "grad_norm": 0.9514685273170471, "learning_rate": 1e-05, "loss": 0.4305, "mean_token_accuracy": 0.8622990846633911, "num_tokens": 748011157.0, "step": 4693 }, { "epoch": 2.387589013224822, "grad_norm": 0.961980402469635, "learning_rate": 1e-05, "loss": 0.4373, "mean_token_accuracy": 0.8580829501152039, "num_tokens": 748172245.0, "step": 4694 }, { "epoch": 2.3880976602238047, "grad_norm": 0.967473566532135, "learning_rate": 1e-05, "loss": 0.4206, "mean_token_accuracy": 0.8628791570663452, "num_tokens": 748325050.0, "step": 4695 }, { "epoch": 2.3886063072227874, "grad_norm": 0.9420827031135559, "learning_rate": 1e-05, "loss": 0.4446, "mean_token_accuracy": 0.8563902974128723, "num_tokens": 748476090.0, "step": 4696 }, { "epoch": 2.3891149542217702, "grad_norm": 0.9462102055549622, "learning_rate": 1e-05, "loss": 0.4593, "mean_token_accuracy": 0.8517200350761414, "num_tokens": 748638219.0, "step": 4697 }, { "epoch": 2.3896236012207526, "grad_norm": 0.9324440956115723, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8563007712364197, "num_tokens": 748802917.0, "step": 4698 }, { "epoch": 2.3901322482197354, "grad_norm": 0.9615978598594666, "learning_rate": 1e-05, "loss": 0.4408, "mean_token_accuracy": 0.8580653667449951, "num_tokens": 748955626.0, "step": 4699 }, { "epoch": 2.390640895218718, "grad_norm": 0.9752944707870483, "learning_rate": 1e-05, "loss": 0.4816, "mean_token_accuracy": 0.8470532894134521, "num_tokens": 749117584.0, "step": 4700 }, { "epoch": 2.391149542217701, "grad_norm": 0.9216363430023193, "learning_rate": 1e-05, "loss": 0.4566, "mean_token_accuracy": 0.854560375213623, "num_tokens": 749275558.0, "step": 4701 }, { "epoch": 2.3916581892166837, "grad_norm": 0.9103134274482727, "learning_rate": 1e-05, "loss": 0.4513, "mean_token_accuracy": 0.8529074192047119, "num_tokens": 749440693.0, "step": 4702 }, { "epoch": 2.3921668362156665, "grad_norm": 0.9516760110855103, "learning_rate": 1e-05, "loss": 0.4196, "mean_token_accuracy": 0.8639135360717773, "num_tokens": 749592575.0, "step": 4703 }, { "epoch": 2.392675483214649, "grad_norm": 0.9901017546653748, "learning_rate": 1e-05, "loss": 0.425, "mean_token_accuracy": 0.8616241812705994, "num_tokens": 749746118.0, "step": 4704 }, { "epoch": 2.3931841302136316, "grad_norm": 0.8785843849182129, "learning_rate": 1e-05, "loss": 0.4213, "mean_token_accuracy": 0.8633971214294434, "num_tokens": 749904411.0, "step": 4705 }, { "epoch": 2.3936927772126144, "grad_norm": 0.88527512550354, "learning_rate": 1e-05, "loss": 0.439, "mean_token_accuracy": 0.8575128316879272, "num_tokens": 750064386.0, "step": 4706 }, { "epoch": 2.394201424211597, "grad_norm": 0.9171162843704224, "learning_rate": 1e-05, "loss": 0.4385, "mean_token_accuracy": 0.8575048446655273, "num_tokens": 750223529.0, "step": 4707 }, { "epoch": 2.39471007121058, "grad_norm": 0.8819383382797241, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8574961423873901, "num_tokens": 750392408.0, "step": 4708 }, { "epoch": 2.395218718209563, "grad_norm": 0.8735357522964478, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8608437776565552, "num_tokens": 750558566.0, "step": 4709 }, { "epoch": 2.395727365208545, "grad_norm": 0.954378068447113, "learning_rate": 1e-05, "loss": 0.4457, "mean_token_accuracy": 0.8561128377914429, "num_tokens": 750725594.0, "step": 4710 }, { "epoch": 2.396236012207528, "grad_norm": 0.9977381229400635, "learning_rate": 1e-05, "loss": 0.4512, "mean_token_accuracy": 0.8524102568626404, "num_tokens": 750864466.0, "step": 4711 }, { "epoch": 2.3967446592065107, "grad_norm": 0.9387546181678772, "learning_rate": 1e-05, "loss": 0.4139, "mean_token_accuracy": 0.8657817840576172, "num_tokens": 751019674.0, "step": 4712 }, { "epoch": 2.3972533062054935, "grad_norm": 0.9221512079238892, "learning_rate": 1e-05, "loss": 0.4186, "mean_token_accuracy": 0.8629974126815796, "num_tokens": 751181685.0, "step": 4713 }, { "epoch": 2.3977619532044763, "grad_norm": 0.8834879994392395, "learning_rate": 1e-05, "loss": 0.4368, "mean_token_accuracy": 0.85898756980896, "num_tokens": 751341611.0, "step": 4714 }, { "epoch": 2.3982706002034586, "grad_norm": 0.865632176399231, "learning_rate": 1e-05, "loss": 0.4394, "mean_token_accuracy": 0.8603726029396057, "num_tokens": 751497507.0, "step": 4715 }, { "epoch": 2.3987792472024414, "grad_norm": 0.9302217364311218, "learning_rate": 1e-05, "loss": 0.4292, "mean_token_accuracy": 0.8591216206550598, "num_tokens": 751658357.0, "step": 4716 }, { "epoch": 2.399287894201424, "grad_norm": 0.8736921548843384, "learning_rate": 1e-05, "loss": 0.4151, "mean_token_accuracy": 0.8649030923843384, "num_tokens": 751819220.0, "step": 4717 }, { "epoch": 2.399796541200407, "grad_norm": 0.8840773105621338, "learning_rate": 1e-05, "loss": 0.4596, "mean_token_accuracy": 0.8527463674545288, "num_tokens": 751985925.0, "step": 4718 }, { "epoch": 2.4003051881993898, "grad_norm": 1.068041443824768, "learning_rate": 1e-05, "loss": 0.4467, "mean_token_accuracy": 0.8539555072784424, "num_tokens": 752152586.0, "step": 4719 }, { "epoch": 2.400813835198372, "grad_norm": 0.9924994111061096, "learning_rate": 1e-05, "loss": 0.4491, "mean_token_accuracy": 0.8539040684700012, "num_tokens": 752297184.0, "step": 4720 }, { "epoch": 2.401322482197355, "grad_norm": 0.9711671471595764, "learning_rate": 1e-05, "loss": 0.4473, "mean_token_accuracy": 0.8555803298950195, "num_tokens": 752461486.0, "step": 4721 }, { "epoch": 2.4018311291963377, "grad_norm": 0.9455640912055969, "learning_rate": 1e-05, "loss": 0.4323, "mean_token_accuracy": 0.8586920499801636, "num_tokens": 752623708.0, "step": 4722 }, { "epoch": 2.4023397761953205, "grad_norm": 0.8912090063095093, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8601329922676086, "num_tokens": 752783092.0, "step": 4723 }, { "epoch": 2.4028484231943033, "grad_norm": 0.933478593826294, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.8575913906097412, "num_tokens": 752932908.0, "step": 4724 }, { "epoch": 2.403357070193286, "grad_norm": 0.9062263369560242, "learning_rate": 1e-05, "loss": 0.4406, "mean_token_accuracy": 0.858018696308136, "num_tokens": 753098396.0, "step": 4725 }, { "epoch": 2.4038657171922684, "grad_norm": 0.9479162096977234, "learning_rate": 1e-05, "loss": 0.4456, "mean_token_accuracy": 0.8552176356315613, "num_tokens": 753256285.0, "step": 4726 }, { "epoch": 2.404374364191251, "grad_norm": 0.9442487955093384, "learning_rate": 1e-05, "loss": 0.4563, "mean_token_accuracy": 0.8518790006637573, "num_tokens": 753412905.0, "step": 4727 }, { "epoch": 2.404883011190234, "grad_norm": 0.9494585990905762, "learning_rate": 1e-05, "loss": 0.3956, "mean_token_accuracy": 0.8692808151245117, "num_tokens": 753554563.0, "step": 4728 }, { "epoch": 2.4053916581892167, "grad_norm": 0.9336660504341125, "learning_rate": 1e-05, "loss": 0.4445, "mean_token_accuracy": 0.8570419549942017, "num_tokens": 753724603.0, "step": 4729 }, { "epoch": 2.4059003051881995, "grad_norm": 0.8968808054924011, "learning_rate": 1e-05, "loss": 0.4184, "mean_token_accuracy": 0.8642021417617798, "num_tokens": 753883153.0, "step": 4730 }, { "epoch": 2.4064089521871823, "grad_norm": 0.9284188151359558, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.8559017777442932, "num_tokens": 754043284.0, "step": 4731 }, { "epoch": 2.4069175991861647, "grad_norm": 0.8959974050521851, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8548084497451782, "num_tokens": 754197393.0, "step": 4732 }, { "epoch": 2.4074262461851474, "grad_norm": 0.9350323677062988, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8600445985794067, "num_tokens": 754351162.0, "step": 4733 }, { "epoch": 2.4079348931841302, "grad_norm": 0.8921173214912415, "learning_rate": 1e-05, "loss": 0.4671, "mean_token_accuracy": 0.8496970534324646, "num_tokens": 754512562.0, "step": 4734 }, { "epoch": 2.408443540183113, "grad_norm": 0.8848469257354736, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8504832983016968, "num_tokens": 754672287.0, "step": 4735 }, { "epoch": 2.408952187182096, "grad_norm": 0.8982890844345093, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8612314462661743, "num_tokens": 754823935.0, "step": 4736 }, { "epoch": 2.409460834181078, "grad_norm": 0.9384156465530396, "learning_rate": 1e-05, "loss": 0.4569, "mean_token_accuracy": 0.8550631999969482, "num_tokens": 754983096.0, "step": 4737 }, { "epoch": 2.409969481180061, "grad_norm": 0.8527581095695496, "learning_rate": 1e-05, "loss": 0.4282, "mean_token_accuracy": 0.8617085218429565, "num_tokens": 755155364.0, "step": 4738 }, { "epoch": 2.4104781281790437, "grad_norm": 0.8351427316665649, "learning_rate": 1e-05, "loss": 0.4307, "mean_token_accuracy": 0.8600209951400757, "num_tokens": 755318398.0, "step": 4739 }, { "epoch": 2.4109867751780265, "grad_norm": 0.9485787153244019, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8549201488494873, "num_tokens": 755467594.0, "step": 4740 }, { "epoch": 2.4114954221770093, "grad_norm": 0.8743390440940857, "learning_rate": 1e-05, "loss": 0.417, "mean_token_accuracy": 0.8640428185462952, "num_tokens": 755619810.0, "step": 4741 }, { "epoch": 2.4120040691759916, "grad_norm": 0.8932185173034668, "learning_rate": 1e-05, "loss": 0.4158, "mean_token_accuracy": 0.8637555241584778, "num_tokens": 755776980.0, "step": 4742 }, { "epoch": 2.4125127161749744, "grad_norm": 0.8570172786712646, "learning_rate": 1e-05, "loss": 0.4125, "mean_token_accuracy": 0.8660873770713806, "num_tokens": 755934665.0, "step": 4743 }, { "epoch": 2.413021363173957, "grad_norm": 0.8673945069313049, "learning_rate": 1e-05, "loss": 0.4447, "mean_token_accuracy": 0.855694055557251, "num_tokens": 756086651.0, "step": 4744 }, { "epoch": 2.41353001017294, "grad_norm": 0.9503878355026245, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.852497935295105, "num_tokens": 756237065.0, "step": 4745 }, { "epoch": 2.414038657171923, "grad_norm": 0.856009840965271, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8569983839988708, "num_tokens": 756399704.0, "step": 4746 }, { "epoch": 2.4145473041709056, "grad_norm": 0.8498873710632324, "learning_rate": 1e-05, "loss": 0.4486, "mean_token_accuracy": 0.8537609577178955, "num_tokens": 756552790.0, "step": 4747 }, { "epoch": 2.415055951169888, "grad_norm": 0.8596876859664917, "learning_rate": 1e-05, "loss": 0.423, "mean_token_accuracy": 0.8616723418235779, "num_tokens": 756711634.0, "step": 4748 }, { "epoch": 2.4155645981688707, "grad_norm": 0.8566173315048218, "learning_rate": 1e-05, "loss": 0.4145, "mean_token_accuracy": 0.8648356199264526, "num_tokens": 756873060.0, "step": 4749 }, { "epoch": 2.4160732451678535, "grad_norm": 0.8351802825927734, "learning_rate": 1e-05, "loss": 0.4289, "mean_token_accuracy": 0.861006498336792, "num_tokens": 757032898.0, "step": 4750 }, { "epoch": 2.4165818921668363, "grad_norm": 0.849201500415802, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.8551095128059387, "num_tokens": 757194206.0, "step": 4751 }, { "epoch": 2.417090539165819, "grad_norm": 0.8686506152153015, "learning_rate": 1e-05, "loss": 0.4171, "mean_token_accuracy": 0.8662176132202148, "num_tokens": 757357972.0, "step": 4752 }, { "epoch": 2.417599186164802, "grad_norm": 0.8447994589805603, "learning_rate": 1e-05, "loss": 0.4257, "mean_token_accuracy": 0.8616929054260254, "num_tokens": 757525531.0, "step": 4753 }, { "epoch": 2.418107833163784, "grad_norm": 0.916158139705658, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8554221987724304, "num_tokens": 757677296.0, "step": 4754 }, { "epoch": 2.418616480162767, "grad_norm": 0.8697807192802429, "learning_rate": 1e-05, "loss": 0.4454, "mean_token_accuracy": 0.8546140789985657, "num_tokens": 757836573.0, "step": 4755 }, { "epoch": 2.4191251271617498, "grad_norm": 0.8870343565940857, "learning_rate": 1e-05, "loss": 0.4343, "mean_token_accuracy": 0.8591193556785583, "num_tokens": 757995955.0, "step": 4756 }, { "epoch": 2.4196337741607326, "grad_norm": 0.9044178128242493, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.8570597171783447, "num_tokens": 758148208.0, "step": 4757 }, { "epoch": 2.4201424211597153, "grad_norm": 1.0102097988128662, "learning_rate": 1e-05, "loss": 0.4547, "mean_token_accuracy": 0.8534799218177795, "num_tokens": 758312347.0, "step": 4758 }, { "epoch": 2.4206510681586977, "grad_norm": 0.8721146583557129, "learning_rate": 1e-05, "loss": 0.4446, "mean_token_accuracy": 0.8564634323120117, "num_tokens": 758474081.0, "step": 4759 }, { "epoch": 2.4211597151576805, "grad_norm": 0.8786713480949402, "learning_rate": 1e-05, "loss": 0.4307, "mean_token_accuracy": 0.8591622114181519, "num_tokens": 758639550.0, "step": 4760 }, { "epoch": 2.4216683621566633, "grad_norm": 0.8972764611244202, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.8601624965667725, "num_tokens": 758802055.0, "step": 4761 }, { "epoch": 2.422177009155646, "grad_norm": 0.8989319205284119, "learning_rate": 1e-05, "loss": 0.4546, "mean_token_accuracy": 0.8546927571296692, "num_tokens": 758958913.0, "step": 4762 }, { "epoch": 2.422685656154629, "grad_norm": 0.9417235851287842, "learning_rate": 1e-05, "loss": 0.4857, "mean_token_accuracy": 0.8451066613197327, "num_tokens": 759120435.0, "step": 4763 }, { "epoch": 2.423194303153611, "grad_norm": 0.9240588545799255, "learning_rate": 1e-05, "loss": 0.412, "mean_token_accuracy": 0.8654221296310425, "num_tokens": 759270507.0, "step": 4764 }, { "epoch": 2.423702950152594, "grad_norm": 0.8854421377182007, "learning_rate": 1e-05, "loss": 0.4117, "mean_token_accuracy": 0.8663169145584106, "num_tokens": 759432853.0, "step": 4765 }, { "epoch": 2.4242115971515767, "grad_norm": 0.928834080696106, "learning_rate": 1e-05, "loss": 0.4104, "mean_token_accuracy": 0.8653715252876282, "num_tokens": 759579946.0, "step": 4766 }, { "epoch": 2.4247202441505595, "grad_norm": 0.8773136138916016, "learning_rate": 1e-05, "loss": 0.4433, "mean_token_accuracy": 0.8564538955688477, "num_tokens": 759749341.0, "step": 4767 }, { "epoch": 2.4252288911495423, "grad_norm": 0.8476248979568481, "learning_rate": 1e-05, "loss": 0.4262, "mean_token_accuracy": 0.8624737858772278, "num_tokens": 759919471.0, "step": 4768 }, { "epoch": 2.425737538148525, "grad_norm": 0.899351954460144, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8652138710021973, "num_tokens": 760063245.0, "step": 4769 }, { "epoch": 2.4262461851475075, "grad_norm": 0.8362252116203308, "learning_rate": 1e-05, "loss": 0.4098, "mean_token_accuracy": 0.8650784492492676, "num_tokens": 760224828.0, "step": 4770 }, { "epoch": 2.4267548321464902, "grad_norm": 0.9169917106628418, "learning_rate": 1e-05, "loss": 0.4341, "mean_token_accuracy": 0.8595162630081177, "num_tokens": 760378809.0, "step": 4771 }, { "epoch": 2.427263479145473, "grad_norm": 0.8968492746353149, "learning_rate": 1e-05, "loss": 0.4195, "mean_token_accuracy": 0.8629690408706665, "num_tokens": 760536153.0, "step": 4772 }, { "epoch": 2.427772126144456, "grad_norm": 0.8856859803199768, "learning_rate": 1e-05, "loss": 0.4623, "mean_token_accuracy": 0.8506142497062683, "num_tokens": 760694351.0, "step": 4773 }, { "epoch": 2.4282807731434386, "grad_norm": 0.946141242980957, "learning_rate": 1e-05, "loss": 0.4255, "mean_token_accuracy": 0.8618991374969482, "num_tokens": 760838181.0, "step": 4774 }, { "epoch": 2.4287894201424214, "grad_norm": 0.9400628805160522, "learning_rate": 1e-05, "loss": 0.4555, "mean_token_accuracy": 0.8526998162269592, "num_tokens": 761009751.0, "step": 4775 }, { "epoch": 2.4292980671414037, "grad_norm": 0.9153785705566406, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.8605570197105408, "num_tokens": 761170210.0, "step": 4776 }, { "epoch": 2.4298067141403865, "grad_norm": 0.8960493803024292, "learning_rate": 1e-05, "loss": 0.4406, "mean_token_accuracy": 0.8575877547264099, "num_tokens": 761330303.0, "step": 4777 }, { "epoch": 2.4303153611393693, "grad_norm": 1.01410973072052, "learning_rate": 1e-05, "loss": 0.4456, "mean_token_accuracy": 0.8566583395004272, "num_tokens": 761491740.0, "step": 4778 }, { "epoch": 2.430824008138352, "grad_norm": 0.9823146462440491, "learning_rate": 1e-05, "loss": 0.4175, "mean_token_accuracy": 0.8643597960472107, "num_tokens": 761645233.0, "step": 4779 }, { "epoch": 2.431332655137335, "grad_norm": 0.8986417651176453, "learning_rate": 1e-05, "loss": 0.458, "mean_token_accuracy": 0.8521144390106201, "num_tokens": 761813341.0, "step": 4780 }, { "epoch": 2.431841302136317, "grad_norm": 0.9008432030677795, "learning_rate": 1e-05, "loss": 0.4265, "mean_token_accuracy": 0.8616411685943604, "num_tokens": 761972013.0, "step": 4781 }, { "epoch": 2.4323499491353, "grad_norm": 0.8586981296539307, "learning_rate": 1e-05, "loss": 0.4282, "mean_token_accuracy": 0.8597036600112915, "num_tokens": 762139354.0, "step": 4782 }, { "epoch": 2.432858596134283, "grad_norm": 0.9051372408866882, "learning_rate": 1e-05, "loss": 0.4354, "mean_token_accuracy": 0.858922004699707, "num_tokens": 762302578.0, "step": 4783 }, { "epoch": 2.4333672431332656, "grad_norm": 0.9037993550300598, "learning_rate": 1e-05, "loss": 0.4111, "mean_token_accuracy": 0.8665629625320435, "num_tokens": 762460461.0, "step": 4784 }, { "epoch": 2.4338758901322484, "grad_norm": 0.8953592777252197, "learning_rate": 1e-05, "loss": 0.4477, "mean_token_accuracy": 0.8560519218444824, "num_tokens": 762626751.0, "step": 4785 }, { "epoch": 2.4343845371312307, "grad_norm": 0.8536945581436157, "learning_rate": 1e-05, "loss": 0.431, "mean_token_accuracy": 0.8601256608963013, "num_tokens": 762803568.0, "step": 4786 }, { "epoch": 2.4348931841302135, "grad_norm": 0.875242292881012, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.8583277463912964, "num_tokens": 762977204.0, "step": 4787 }, { "epoch": 2.4354018311291963, "grad_norm": 1.011696457862854, "learning_rate": 1e-05, "loss": 0.4299, "mean_token_accuracy": 0.8596693873405457, "num_tokens": 763121607.0, "step": 4788 }, { "epoch": 2.435910478128179, "grad_norm": 0.8985397815704346, "learning_rate": 1e-05, "loss": 0.454, "mean_token_accuracy": 0.8534476161003113, "num_tokens": 763287031.0, "step": 4789 }, { "epoch": 2.436419125127162, "grad_norm": 1.7359514236450195, "learning_rate": 1e-05, "loss": 0.4517, "mean_token_accuracy": 0.8541314005851746, "num_tokens": 763440200.0, "step": 4790 }, { "epoch": 2.4369277721261446, "grad_norm": 1.0107104778289795, "learning_rate": 1e-05, "loss": 0.4237, "mean_token_accuracy": 0.863031268119812, "num_tokens": 763592000.0, "step": 4791 }, { "epoch": 2.437436419125127, "grad_norm": 0.944232702255249, "learning_rate": 1e-05, "loss": 0.4501, "mean_token_accuracy": 0.854409396648407, "num_tokens": 763765662.0, "step": 4792 }, { "epoch": 2.4379450661241098, "grad_norm": 0.8871685862541199, "learning_rate": 1e-05, "loss": 0.4333, "mean_token_accuracy": 0.8585347533226013, "num_tokens": 763939945.0, "step": 4793 }, { "epoch": 2.4384537131230926, "grad_norm": 1.0742279291152954, "learning_rate": 1e-05, "loss": 0.4155, "mean_token_accuracy": 0.8643221855163574, "num_tokens": 764082350.0, "step": 4794 }, { "epoch": 2.4389623601220753, "grad_norm": 0.9682660698890686, "learning_rate": 1e-05, "loss": 0.4371, "mean_token_accuracy": 0.8597748279571533, "num_tokens": 764237823.0, "step": 4795 }, { "epoch": 2.439471007121058, "grad_norm": 0.9521532654762268, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8522917032241821, "num_tokens": 764395519.0, "step": 4796 }, { "epoch": 2.439979654120041, "grad_norm": 0.9624428153038025, "learning_rate": 1e-05, "loss": 0.4264, "mean_token_accuracy": 0.8598757386207581, "num_tokens": 764558321.0, "step": 4797 }, { "epoch": 2.4404883011190233, "grad_norm": 0.9017758965492249, "learning_rate": 1e-05, "loss": 0.4597, "mean_token_accuracy": 0.8528575897216797, "num_tokens": 764720615.0, "step": 4798 }, { "epoch": 2.440996948118006, "grad_norm": 0.8903970718383789, "learning_rate": 1e-05, "loss": 0.447, "mean_token_accuracy": 0.8557796478271484, "num_tokens": 764886477.0, "step": 4799 }, { "epoch": 2.441505595116989, "grad_norm": 0.975334644317627, "learning_rate": 1e-05, "loss": 0.479, "mean_token_accuracy": 0.8487585783004761, "num_tokens": 765064375.0, "step": 4800 }, { "epoch": 2.4420142421159716, "grad_norm": 1.2733622789382935, "learning_rate": 1e-05, "loss": 0.4563, "mean_token_accuracy": 0.8532027006149292, "num_tokens": 765232031.0, "step": 4801 }, { "epoch": 2.4425228891149544, "grad_norm": 0.996767520904541, "learning_rate": 1e-05, "loss": 0.4795, "mean_token_accuracy": 0.8473283648490906, "num_tokens": 765405280.0, "step": 4802 }, { "epoch": 2.4430315361139368, "grad_norm": 0.981573224067688, "learning_rate": 1e-05, "loss": 0.4441, "mean_token_accuracy": 0.8556265234947205, "num_tokens": 765557818.0, "step": 4803 }, { "epoch": 2.4435401831129195, "grad_norm": 0.9518735408782959, "learning_rate": 1e-05, "loss": 0.4316, "mean_token_accuracy": 0.8591273427009583, "num_tokens": 765704128.0, "step": 4804 }, { "epoch": 2.4440488301119023, "grad_norm": 0.9897494316101074, "learning_rate": 1e-05, "loss": 0.4348, "mean_token_accuracy": 0.8603610396385193, "num_tokens": 765860896.0, "step": 4805 }, { "epoch": 2.444557477110885, "grad_norm": 1.0344433784484863, "learning_rate": 1e-05, "loss": 0.4528, "mean_token_accuracy": 0.853305995464325, "num_tokens": 766021963.0, "step": 4806 }, { "epoch": 2.445066124109868, "grad_norm": 0.8656305074691772, "learning_rate": 1e-05, "loss": 0.4477, "mean_token_accuracy": 0.856003999710083, "num_tokens": 766184156.0, "step": 4807 }, { "epoch": 2.4455747711088502, "grad_norm": 0.9636114835739136, "learning_rate": 1e-05, "loss": 0.4492, "mean_token_accuracy": 0.855746328830719, "num_tokens": 766348235.0, "step": 4808 }, { "epoch": 2.446083418107833, "grad_norm": 1.0234853029251099, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.860809862613678, "num_tokens": 766508467.0, "step": 4809 }, { "epoch": 2.446592065106816, "grad_norm": 0.8884817957878113, "learning_rate": 1e-05, "loss": 0.4203, "mean_token_accuracy": 0.8626583218574524, "num_tokens": 766664722.0, "step": 4810 }, { "epoch": 2.4471007121057986, "grad_norm": 0.9701591730117798, "learning_rate": 1e-05, "loss": 0.3896, "mean_token_accuracy": 0.8732377290725708, "num_tokens": 766821416.0, "step": 4811 }, { "epoch": 2.4476093591047814, "grad_norm": 0.9683111310005188, "learning_rate": 1e-05, "loss": 0.4517, "mean_token_accuracy": 0.8539290428161621, "num_tokens": 766973521.0, "step": 4812 }, { "epoch": 2.448118006103764, "grad_norm": 1.375157117843628, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8657113909721375, "num_tokens": 767140971.0, "step": 4813 }, { "epoch": 2.4486266531027465, "grad_norm": 0.9833841919898987, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8539447784423828, "num_tokens": 767301821.0, "step": 4814 }, { "epoch": 2.4491353001017293, "grad_norm": 0.9484559297561646, "learning_rate": 1e-05, "loss": 0.4266, "mean_token_accuracy": 0.8614406585693359, "num_tokens": 767467792.0, "step": 4815 }, { "epoch": 2.449643947100712, "grad_norm": 0.9295511841773987, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8562976121902466, "num_tokens": 767630278.0, "step": 4816 }, { "epoch": 2.450152594099695, "grad_norm": 0.8951289653778076, "learning_rate": 1e-05, "loss": 0.437, "mean_token_accuracy": 0.8576017618179321, "num_tokens": 767800184.0, "step": 4817 }, { "epoch": 2.4506612410986777, "grad_norm": 0.9002968072891235, "learning_rate": 1e-05, "loss": 0.4273, "mean_token_accuracy": 0.8604477643966675, "num_tokens": 767961963.0, "step": 4818 }, { "epoch": 2.4511698880976605, "grad_norm": 0.9128016829490662, "learning_rate": 1e-05, "loss": 0.4169, "mean_token_accuracy": 0.8654111623764038, "num_tokens": 768111977.0, "step": 4819 }, { "epoch": 2.451678535096643, "grad_norm": 0.9885953068733215, "learning_rate": 1e-05, "loss": 0.407, "mean_token_accuracy": 0.8666447401046753, "num_tokens": 768274598.0, "step": 4820 }, { "epoch": 2.4521871820956256, "grad_norm": 0.8686884045600891, "learning_rate": 1e-05, "loss": 0.4509, "mean_token_accuracy": 0.8539066314697266, "num_tokens": 768427058.0, "step": 4821 }, { "epoch": 2.4526958290946084, "grad_norm": 0.8789154887199402, "learning_rate": 1e-05, "loss": 0.465, "mean_token_accuracy": 0.8505491614341736, "num_tokens": 768596525.0, "step": 4822 }, { "epoch": 2.453204476093591, "grad_norm": 0.8778037428855896, "learning_rate": 1e-05, "loss": 0.4192, "mean_token_accuracy": 0.8648917078971863, "num_tokens": 768763320.0, "step": 4823 }, { "epoch": 2.453713123092574, "grad_norm": 0.8817370533943176, "learning_rate": 1e-05, "loss": 0.4141, "mean_token_accuracy": 0.8660046458244324, "num_tokens": 768931098.0, "step": 4824 }, { "epoch": 2.4542217700915563, "grad_norm": 0.9536473751068115, "learning_rate": 1e-05, "loss": 0.436, "mean_token_accuracy": 0.8574396371841431, "num_tokens": 769083639.0, "step": 4825 }, { "epoch": 2.454730417090539, "grad_norm": 0.9757430553436279, "learning_rate": 1e-05, "loss": 0.4644, "mean_token_accuracy": 0.8526658415794373, "num_tokens": 769236726.0, "step": 4826 }, { "epoch": 2.455239064089522, "grad_norm": 0.9364215135574341, "learning_rate": 1e-05, "loss": 0.4254, "mean_token_accuracy": 0.8613604307174683, "num_tokens": 769387035.0, "step": 4827 }, { "epoch": 2.4557477110885046, "grad_norm": 0.9096567630767822, "learning_rate": 1e-05, "loss": 0.4513, "mean_token_accuracy": 0.8547468185424805, "num_tokens": 769546860.0, "step": 4828 }, { "epoch": 2.4562563580874874, "grad_norm": 0.8673074245452881, "learning_rate": 1e-05, "loss": 0.4281, "mean_token_accuracy": 0.8612393140792847, "num_tokens": 769696427.0, "step": 4829 }, { "epoch": 2.4567650050864698, "grad_norm": 1.0168750286102295, "learning_rate": 1e-05, "loss": 0.4054, "mean_token_accuracy": 0.866659939289093, "num_tokens": 769852870.0, "step": 4830 }, { "epoch": 2.4572736520854526, "grad_norm": 0.9404606819152832, "learning_rate": 1e-05, "loss": 0.4316, "mean_token_accuracy": 0.858262836933136, "num_tokens": 769992679.0, "step": 4831 }, { "epoch": 2.4577822990844354, "grad_norm": 0.9464917778968811, "learning_rate": 1e-05, "loss": 0.4698, "mean_token_accuracy": 0.8508808016777039, "num_tokens": 770157889.0, "step": 4832 }, { "epoch": 2.458290946083418, "grad_norm": 0.9580070376396179, "learning_rate": 1e-05, "loss": 0.4251, "mean_token_accuracy": 0.8595504760742188, "num_tokens": 770327479.0, "step": 4833 }, { "epoch": 2.458799593082401, "grad_norm": 0.8783279061317444, "learning_rate": 1e-05, "loss": 0.4391, "mean_token_accuracy": 0.8582003116607666, "num_tokens": 770495718.0, "step": 4834 }, { "epoch": 2.4593082400813837, "grad_norm": 0.9397633671760559, "learning_rate": 1e-05, "loss": 0.436, "mean_token_accuracy": 0.8579865097999573, "num_tokens": 770649211.0, "step": 4835 }, { "epoch": 2.459816887080366, "grad_norm": 0.9210466146469116, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8545470237731934, "num_tokens": 770807568.0, "step": 4836 }, { "epoch": 2.460325534079349, "grad_norm": 0.9139899611473083, "learning_rate": 1e-05, "loss": 0.4461, "mean_token_accuracy": 0.8555503487586975, "num_tokens": 770972725.0, "step": 4837 }, { "epoch": 2.4608341810783316, "grad_norm": 0.9260302186012268, "learning_rate": 1e-05, "loss": 0.3865, "mean_token_accuracy": 0.8711071610450745, "num_tokens": 771115947.0, "step": 4838 }, { "epoch": 2.4613428280773144, "grad_norm": 0.8994468450546265, "learning_rate": 1e-05, "loss": 0.4165, "mean_token_accuracy": 0.8638403415679932, "num_tokens": 771281930.0, "step": 4839 }, { "epoch": 2.461851475076297, "grad_norm": 1.0170361995697021, "learning_rate": 1e-05, "loss": 0.4987, "mean_token_accuracy": 0.8415489196777344, "num_tokens": 771443575.0, "step": 4840 }, { "epoch": 2.46236012207528, "grad_norm": 0.9376793503761292, "learning_rate": 1e-05, "loss": 0.4624, "mean_token_accuracy": 0.8517590761184692, "num_tokens": 771610548.0, "step": 4841 }, { "epoch": 2.4628687690742623, "grad_norm": 0.9882082939147949, "learning_rate": 1e-05, "loss": 0.4661, "mean_token_accuracy": 0.8496295809745789, "num_tokens": 771758233.0, "step": 4842 }, { "epoch": 2.463377416073245, "grad_norm": 0.9411653876304626, "learning_rate": 1e-05, "loss": 0.4213, "mean_token_accuracy": 0.8629002571105957, "num_tokens": 771915010.0, "step": 4843 }, { "epoch": 2.463886063072228, "grad_norm": 0.9319836497306824, "learning_rate": 1e-05, "loss": 0.4549, "mean_token_accuracy": 0.8531183004379272, "num_tokens": 772093055.0, "step": 4844 }, { "epoch": 2.4643947100712107, "grad_norm": 0.9602798223495483, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8609582185745239, "num_tokens": 772251316.0, "step": 4845 }, { "epoch": 2.4649033570701935, "grad_norm": 0.8279889225959778, "learning_rate": 1e-05, "loss": 0.4164, "mean_token_accuracy": 0.865665853023529, "num_tokens": 772411962.0, "step": 4846 }, { "epoch": 2.465412004069176, "grad_norm": 0.9248610734939575, "learning_rate": 1e-05, "loss": 0.4441, "mean_token_accuracy": 0.8555432558059692, "num_tokens": 772579215.0, "step": 4847 }, { "epoch": 2.4659206510681586, "grad_norm": 0.9257016181945801, "learning_rate": 1e-05, "loss": 0.4406, "mean_token_accuracy": 0.8550276160240173, "num_tokens": 772734084.0, "step": 4848 }, { "epoch": 2.4664292980671414, "grad_norm": 0.8871474862098694, "learning_rate": 1e-05, "loss": 0.4221, "mean_token_accuracy": 0.8642134666442871, "num_tokens": 772893088.0, "step": 4849 }, { "epoch": 2.466937945066124, "grad_norm": 0.8337686657905579, "learning_rate": 1e-05, "loss": 0.4159, "mean_token_accuracy": 0.8641759157180786, "num_tokens": 773069483.0, "step": 4850 }, { "epoch": 2.467446592065107, "grad_norm": 0.9676657915115356, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8470906615257263, "num_tokens": 773230998.0, "step": 4851 }, { "epoch": 2.4679552390640893, "grad_norm": 0.8760485649108887, "learning_rate": 1e-05, "loss": 0.4338, "mean_token_accuracy": 0.8574041724205017, "num_tokens": 773388030.0, "step": 4852 }, { "epoch": 2.468463886063072, "grad_norm": 0.8590541481971741, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8576443791389465, "num_tokens": 773551588.0, "step": 4853 }, { "epoch": 2.468972533062055, "grad_norm": 0.9119069576263428, "learning_rate": 1e-05, "loss": 0.4392, "mean_token_accuracy": 0.8565086126327515, "num_tokens": 773708733.0, "step": 4854 }, { "epoch": 2.4694811800610377, "grad_norm": 1.0006945133209229, "learning_rate": 1e-05, "loss": 0.4404, "mean_token_accuracy": 0.8568620681762695, "num_tokens": 773861934.0, "step": 4855 }, { "epoch": 2.4699898270600205, "grad_norm": 0.8730751872062683, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8648988604545593, "num_tokens": 774027412.0, "step": 4856 }, { "epoch": 2.470498474059003, "grad_norm": 0.9297431707382202, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8567512035369873, "num_tokens": 774187145.0, "step": 4857 }, { "epoch": 2.4710071210579856, "grad_norm": 0.8286118507385254, "learning_rate": 1e-05, "loss": 0.4079, "mean_token_accuracy": 0.8668421506881714, "num_tokens": 774354417.0, "step": 4858 }, { "epoch": 2.4715157680569684, "grad_norm": 0.9265725612640381, "learning_rate": 1e-05, "loss": 0.4414, "mean_token_accuracy": 0.8573986887931824, "num_tokens": 774500419.0, "step": 4859 }, { "epoch": 2.472024415055951, "grad_norm": 0.9389397501945496, "learning_rate": 1e-05, "loss": 0.4202, "mean_token_accuracy": 0.8619729280471802, "num_tokens": 774658382.0, "step": 4860 }, { "epoch": 2.472533062054934, "grad_norm": 0.8721439242362976, "learning_rate": 1e-05, "loss": 0.4589, "mean_token_accuracy": 0.8539974689483643, "num_tokens": 774814169.0, "step": 4861 }, { "epoch": 2.4730417090539167, "grad_norm": 0.9134073257446289, "learning_rate": 1e-05, "loss": 0.4325, "mean_token_accuracy": 0.8584816455841064, "num_tokens": 774978306.0, "step": 4862 }, { "epoch": 2.4735503560528995, "grad_norm": 0.9060777425765991, "learning_rate": 1e-05, "loss": 0.4389, "mean_token_accuracy": 0.8568652272224426, "num_tokens": 775144801.0, "step": 4863 }, { "epoch": 2.474059003051882, "grad_norm": 0.9013589024543762, "learning_rate": 1e-05, "loss": 0.4283, "mean_token_accuracy": 0.8597061634063721, "num_tokens": 775301131.0, "step": 4864 }, { "epoch": 2.4745676500508647, "grad_norm": 0.9063507318496704, "learning_rate": 1e-05, "loss": 0.4302, "mean_token_accuracy": 0.8605985641479492, "num_tokens": 775464800.0, "step": 4865 }, { "epoch": 2.4750762970498474, "grad_norm": 0.95588618516922, "learning_rate": 1e-05, "loss": 0.4545, "mean_token_accuracy": 0.8535224795341492, "num_tokens": 775609674.0, "step": 4866 }, { "epoch": 2.4755849440488302, "grad_norm": 0.9267025589942932, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8562873601913452, "num_tokens": 775770167.0, "step": 4867 }, { "epoch": 2.476093591047813, "grad_norm": 0.9138270616531372, "learning_rate": 1e-05, "loss": 0.4703, "mean_token_accuracy": 0.8506574630737305, "num_tokens": 775937278.0, "step": 4868 }, { "epoch": 2.4766022380467954, "grad_norm": 0.8666245341300964, "learning_rate": 1e-05, "loss": 0.4197, "mean_token_accuracy": 0.8650186657905579, "num_tokens": 776102454.0, "step": 4869 }, { "epoch": 2.477110885045778, "grad_norm": 0.961727499961853, "learning_rate": 1e-05, "loss": 0.421, "mean_token_accuracy": 0.8643406629562378, "num_tokens": 776256891.0, "step": 4870 }, { "epoch": 2.477619532044761, "grad_norm": 0.876415491104126, "learning_rate": 1e-05, "loss": 0.4201, "mean_token_accuracy": 0.8636149764060974, "num_tokens": 776419578.0, "step": 4871 }, { "epoch": 2.4781281790437437, "grad_norm": 0.8858059048652649, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8590368032455444, "num_tokens": 776580024.0, "step": 4872 }, { "epoch": 2.4786368260427265, "grad_norm": 0.8576182126998901, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.856072187423706, "num_tokens": 776728328.0, "step": 4873 }, { "epoch": 2.479145473041709, "grad_norm": 0.927025318145752, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8582096099853516, "num_tokens": 776881190.0, "step": 4874 }, { "epoch": 2.4796541200406916, "grad_norm": 0.8863288760185242, "learning_rate": 1e-05, "loss": 0.4363, "mean_token_accuracy": 0.8576536178588867, "num_tokens": 777045533.0, "step": 4875 }, { "epoch": 2.4801627670396744, "grad_norm": 0.8556225895881653, "learning_rate": 1e-05, "loss": 0.449, "mean_token_accuracy": 0.8564656972885132, "num_tokens": 777210571.0, "step": 4876 }, { "epoch": 2.480671414038657, "grad_norm": 0.9408009052276611, "learning_rate": 1e-05, "loss": 0.4766, "mean_token_accuracy": 0.8474354147911072, "num_tokens": 777387909.0, "step": 4877 }, { "epoch": 2.48118006103764, "grad_norm": 0.920619010925293, "learning_rate": 1e-05, "loss": 0.4257, "mean_token_accuracy": 0.8599849343299866, "num_tokens": 777541394.0, "step": 4878 }, { "epoch": 2.4816887080366223, "grad_norm": 0.9394387602806091, "learning_rate": 1e-05, "loss": 0.4473, "mean_token_accuracy": 0.8554525375366211, "num_tokens": 777690828.0, "step": 4879 }, { "epoch": 2.482197355035605, "grad_norm": 0.9379386305809021, "learning_rate": 1e-05, "loss": 0.4195, "mean_token_accuracy": 0.8626469969749451, "num_tokens": 777839792.0, "step": 4880 }, { "epoch": 2.482706002034588, "grad_norm": 0.9391974210739136, "learning_rate": 1e-05, "loss": 0.4221, "mean_token_accuracy": 0.862157940864563, "num_tokens": 777986004.0, "step": 4881 }, { "epoch": 2.4832146490335707, "grad_norm": 0.967427670955658, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8584979176521301, "num_tokens": 778138868.0, "step": 4882 }, { "epoch": 2.4837232960325535, "grad_norm": 0.9181756377220154, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8566079139709473, "num_tokens": 778297635.0, "step": 4883 }, { "epoch": 2.4842319430315363, "grad_norm": 0.8523744344711304, "learning_rate": 1e-05, "loss": 0.4136, "mean_token_accuracy": 0.8662214279174805, "num_tokens": 778464246.0, "step": 4884 }, { "epoch": 2.484740590030519, "grad_norm": 0.902087390422821, "learning_rate": 1e-05, "loss": 0.4701, "mean_token_accuracy": 0.8492181301116943, "num_tokens": 778634588.0, "step": 4885 }, { "epoch": 2.4852492370295014, "grad_norm": 0.9841026067733765, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.844761848449707, "num_tokens": 778782743.0, "step": 4886 }, { "epoch": 2.485757884028484, "grad_norm": 0.9505780339241028, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8542821407318115, "num_tokens": 778933303.0, "step": 4887 }, { "epoch": 2.486266531027467, "grad_norm": 0.8608629703521729, "learning_rate": 1e-05, "loss": 0.3945, "mean_token_accuracy": 0.8706498146057129, "num_tokens": 779106415.0, "step": 4888 }, { "epoch": 2.4867751780264498, "grad_norm": 0.9805450439453125, "learning_rate": 1e-05, "loss": 0.4436, "mean_token_accuracy": 0.8559221029281616, "num_tokens": 779272445.0, "step": 4889 }, { "epoch": 2.4872838250254325, "grad_norm": 0.938648521900177, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8558464646339417, "num_tokens": 779428841.0, "step": 4890 }, { "epoch": 2.487792472024415, "grad_norm": 0.8914933800697327, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8562479615211487, "num_tokens": 779587772.0, "step": 4891 }, { "epoch": 2.4883011190233977, "grad_norm": 0.8571580052375793, "learning_rate": 1e-05, "loss": 0.4248, "mean_token_accuracy": 0.8619630932807922, "num_tokens": 779752096.0, "step": 4892 }, { "epoch": 2.4888097660223805, "grad_norm": 0.9890848398208618, "learning_rate": 1e-05, "loss": 0.4556, "mean_token_accuracy": 0.8534312844276428, "num_tokens": 779902168.0, "step": 4893 }, { "epoch": 2.4893184130213633, "grad_norm": 0.8465467095375061, "learning_rate": 1e-05, "loss": 0.4247, "mean_token_accuracy": 0.862485408782959, "num_tokens": 780066182.0, "step": 4894 }, { "epoch": 2.489827060020346, "grad_norm": 0.8678783178329468, "learning_rate": 1e-05, "loss": 0.4295, "mean_token_accuracy": 0.8596822023391724, "num_tokens": 780230835.0, "step": 4895 }, { "epoch": 2.4903357070193284, "grad_norm": 0.9244816899299622, "learning_rate": 1e-05, "loss": 0.4457, "mean_token_accuracy": 0.8552453517913818, "num_tokens": 780379379.0, "step": 4896 }, { "epoch": 2.490844354018311, "grad_norm": 0.8347296714782715, "learning_rate": 1e-05, "loss": 0.4657, "mean_token_accuracy": 0.8511793613433838, "num_tokens": 780545850.0, "step": 4897 }, { "epoch": 2.491353001017294, "grad_norm": 0.9213253855705261, "learning_rate": 1e-05, "loss": 0.4194, "mean_token_accuracy": 0.8629751205444336, "num_tokens": 780709618.0, "step": 4898 }, { "epoch": 2.4918616480162767, "grad_norm": 0.9033819437026978, "learning_rate": 1e-05, "loss": 0.4575, "mean_token_accuracy": 0.8512227535247803, "num_tokens": 780875436.0, "step": 4899 }, { "epoch": 2.4923702950152595, "grad_norm": 0.9290204048156738, "learning_rate": 1e-05, "loss": 0.4886, "mean_token_accuracy": 0.8433731198310852, "num_tokens": 781038418.0, "step": 4900 }, { "epoch": 2.492878942014242, "grad_norm": 0.9555561542510986, "learning_rate": 1e-05, "loss": 0.4368, "mean_token_accuracy": 0.8578329086303711, "num_tokens": 781200813.0, "step": 4901 }, { "epoch": 2.4933875890132247, "grad_norm": 0.9693750739097595, "learning_rate": 1e-05, "loss": 0.4396, "mean_token_accuracy": 0.8561158180236816, "num_tokens": 781342015.0, "step": 4902 }, { "epoch": 2.4938962360122074, "grad_norm": 0.990487813949585, "learning_rate": 1e-05, "loss": 0.4415, "mean_token_accuracy": 0.8567934632301331, "num_tokens": 781494626.0, "step": 4903 }, { "epoch": 2.4944048830111902, "grad_norm": 0.9190848469734192, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8608559370040894, "num_tokens": 781654668.0, "step": 4904 }, { "epoch": 2.494913530010173, "grad_norm": 1.000533103942871, "learning_rate": 1e-05, "loss": 0.4383, "mean_token_accuracy": 0.8600908517837524, "num_tokens": 781814957.0, "step": 4905 }, { "epoch": 2.495422177009156, "grad_norm": 0.9340627789497375, "learning_rate": 1e-05, "loss": 0.4701, "mean_token_accuracy": 0.8493572473526001, "num_tokens": 781972092.0, "step": 4906 }, { "epoch": 2.4959308240081386, "grad_norm": 1.0018067359924316, "learning_rate": 1e-05, "loss": 0.4677, "mean_token_accuracy": 0.8498542904853821, "num_tokens": 782131669.0, "step": 4907 }, { "epoch": 2.496439471007121, "grad_norm": 0.9221603870391846, "learning_rate": 1e-05, "loss": 0.4871, "mean_token_accuracy": 0.8443587422370911, "num_tokens": 782293763.0, "step": 4908 }, { "epoch": 2.4969481180061037, "grad_norm": 0.9541207551956177, "learning_rate": 1e-05, "loss": 0.4522, "mean_token_accuracy": 0.854601263999939, "num_tokens": 782439748.0, "step": 4909 }, { "epoch": 2.4974567650050865, "grad_norm": 0.989997923374176, "learning_rate": 1e-05, "loss": 0.4621, "mean_token_accuracy": 0.8509840369224548, "num_tokens": 782606788.0, "step": 4910 }, { "epoch": 2.4979654120040693, "grad_norm": 0.992000162601471, "learning_rate": 1e-05, "loss": 0.4423, "mean_token_accuracy": 0.8571023344993591, "num_tokens": 782753839.0, "step": 4911 }, { "epoch": 2.498474059003052, "grad_norm": 0.934873104095459, "learning_rate": 1e-05, "loss": 0.4297, "mean_token_accuracy": 0.8594983816146851, "num_tokens": 782909993.0, "step": 4912 }, { "epoch": 2.4989827060020344, "grad_norm": 1.005946159362793, "learning_rate": 1e-05, "loss": 0.4373, "mean_token_accuracy": 0.8588556051254272, "num_tokens": 783060995.0, "step": 4913 }, { "epoch": 2.499491353001017, "grad_norm": 0.9700726866722107, "learning_rate": 1e-05, "loss": 0.4427, "mean_token_accuracy": 0.8575286865234375, "num_tokens": 783215610.0, "step": 4914 }, { "epoch": 2.5, "grad_norm": 1.0107392072677612, "learning_rate": 1e-05, "loss": 0.4256, "mean_token_accuracy": 0.861768364906311, "num_tokens": 783383780.0, "step": 4915 }, { "epoch": 2.500508646998983, "grad_norm": 0.9195471405982971, "learning_rate": 1e-05, "loss": 0.4475, "mean_token_accuracy": 0.8552471399307251, "num_tokens": 783555679.0, "step": 4916 }, { "epoch": 2.5010172939979656, "grad_norm": 0.9699219465255737, "learning_rate": 1e-05, "loss": 0.445, "mean_token_accuracy": 0.8576300740242004, "num_tokens": 783719996.0, "step": 4917 }, { "epoch": 2.501525940996948, "grad_norm": 1.0189080238342285, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8536948561668396, "num_tokens": 783880886.0, "step": 4918 }, { "epoch": 2.5020345879959307, "grad_norm": 0.8958556652069092, "learning_rate": 1e-05, "loss": 0.4204, "mean_token_accuracy": 0.8617076873779297, "num_tokens": 784052705.0, "step": 4919 }, { "epoch": 2.5025432349949135, "grad_norm": 1.017082691192627, "learning_rate": 1e-05, "loss": 0.4234, "mean_token_accuracy": 0.8614131212234497, "num_tokens": 784198844.0, "step": 4920 }, { "epoch": 2.5030518819938963, "grad_norm": 1.014042854309082, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8566722273826599, "num_tokens": 784355610.0, "step": 4921 }, { "epoch": 2.503560528992879, "grad_norm": 0.8815516829490662, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8559888005256653, "num_tokens": 784524245.0, "step": 4922 }, { "epoch": 2.5040691759918614, "grad_norm": 0.9433804750442505, "learning_rate": 1e-05, "loss": 0.4227, "mean_token_accuracy": 0.8626022338867188, "num_tokens": 784687981.0, "step": 4923 }, { "epoch": 2.504577822990844, "grad_norm": 1.0432592630386353, "learning_rate": 1e-05, "loss": 0.4577, "mean_token_accuracy": 0.8526281118392944, "num_tokens": 784850952.0, "step": 4924 }, { "epoch": 2.505086469989827, "grad_norm": 1.0025300979614258, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.858517587184906, "num_tokens": 785007092.0, "step": 4925 }, { "epoch": 2.5055951169888098, "grad_norm": 0.9529726505279541, "learning_rate": 1e-05, "loss": 0.4492, "mean_token_accuracy": 0.8542094826698303, "num_tokens": 785151045.0, "step": 4926 }, { "epoch": 2.5061037639877926, "grad_norm": 0.9628111124038696, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.85976243019104, "num_tokens": 785308226.0, "step": 4927 }, { "epoch": 2.5066124109867753, "grad_norm": 0.9270199537277222, "learning_rate": 1e-05, "loss": 0.4462, "mean_token_accuracy": 0.8552361130714417, "num_tokens": 785470010.0, "step": 4928 }, { "epoch": 2.507121057985758, "grad_norm": 0.9673618078231812, "learning_rate": 1e-05, "loss": 0.4523, "mean_token_accuracy": 0.8539666533470154, "num_tokens": 785644727.0, "step": 4929 }, { "epoch": 2.5076297049847405, "grad_norm": 1.0008258819580078, "learning_rate": 1e-05, "loss": 0.4666, "mean_token_accuracy": 0.8490034341812134, "num_tokens": 785801195.0, "step": 4930 }, { "epoch": 2.5081383519837233, "grad_norm": 0.925003707408905, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8556567430496216, "num_tokens": 785966207.0, "step": 4931 }, { "epoch": 2.508646998982706, "grad_norm": 1.0037144422531128, "learning_rate": 1e-05, "loss": 0.458, "mean_token_accuracy": 0.8509320616722107, "num_tokens": 786115203.0, "step": 4932 }, { "epoch": 2.509155645981689, "grad_norm": 1.1053309440612793, "learning_rate": 1e-05, "loss": 0.4277, "mean_token_accuracy": 0.8615651726722717, "num_tokens": 786264456.0, "step": 4933 }, { "epoch": 2.5096642929806716, "grad_norm": 0.8932700157165527, "learning_rate": 1e-05, "loss": 0.4242, "mean_token_accuracy": 0.8625380992889404, "num_tokens": 786424319.0, "step": 4934 }, { "epoch": 2.510172939979654, "grad_norm": 0.9791507720947266, "learning_rate": 1e-05, "loss": 0.4764, "mean_token_accuracy": 0.8476613759994507, "num_tokens": 786578284.0, "step": 4935 }, { "epoch": 2.5106815869786367, "grad_norm": 1.005222201347351, "learning_rate": 1e-05, "loss": 0.427, "mean_token_accuracy": 0.8623913526535034, "num_tokens": 786732176.0, "step": 4936 }, { "epoch": 2.5111902339776195, "grad_norm": 0.8984588980674744, "learning_rate": 1e-05, "loss": 0.4401, "mean_token_accuracy": 0.8568511009216309, "num_tokens": 786889777.0, "step": 4937 }, { "epoch": 2.5116988809766023, "grad_norm": 0.8648860454559326, "learning_rate": 1e-05, "loss": 0.4278, "mean_token_accuracy": 0.8603200912475586, "num_tokens": 787060121.0, "step": 4938 }, { "epoch": 2.512207527975585, "grad_norm": 0.9340547323226929, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.8607314229011536, "num_tokens": 787224425.0, "step": 4939 }, { "epoch": 2.5127161749745675, "grad_norm": 0.8825793266296387, "learning_rate": 1e-05, "loss": 0.4869, "mean_token_accuracy": 0.8437527418136597, "num_tokens": 787389440.0, "step": 4940 }, { "epoch": 2.5132248219735502, "grad_norm": 0.9117286801338196, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8571785092353821, "num_tokens": 787550693.0, "step": 4941 }, { "epoch": 2.513733468972533, "grad_norm": 0.9391953945159912, "learning_rate": 1e-05, "loss": 0.4502, "mean_token_accuracy": 0.8555510640144348, "num_tokens": 787711904.0, "step": 4942 }, { "epoch": 2.514242115971516, "grad_norm": 0.9750176668167114, "learning_rate": 1e-05, "loss": 0.4197, "mean_token_accuracy": 0.8627287149429321, "num_tokens": 787876921.0, "step": 4943 }, { "epoch": 2.5147507629704986, "grad_norm": 0.8940181136131287, "learning_rate": 1e-05, "loss": 0.448, "mean_token_accuracy": 0.8568694591522217, "num_tokens": 788040834.0, "step": 4944 }, { "epoch": 2.515259409969481, "grad_norm": 0.9667310118675232, "learning_rate": 1e-05, "loss": 0.4598, "mean_token_accuracy": 0.8532164096832275, "num_tokens": 788203339.0, "step": 4945 }, { "epoch": 2.5157680569684637, "grad_norm": 0.9718950986862183, "learning_rate": 1e-05, "loss": 0.4442, "mean_token_accuracy": 0.8562138080596924, "num_tokens": 788365031.0, "step": 4946 }, { "epoch": 2.5162767039674465, "grad_norm": 0.8439437747001648, "learning_rate": 1e-05, "loss": 0.4246, "mean_token_accuracy": 0.8619179725646973, "num_tokens": 788536411.0, "step": 4947 }, { "epoch": 2.5167853509664293, "grad_norm": 0.9090779423713684, "learning_rate": 1e-05, "loss": 0.4205, "mean_token_accuracy": 0.8635702133178711, "num_tokens": 788684184.0, "step": 4948 }, { "epoch": 2.517293997965412, "grad_norm": 0.9026861786842346, "learning_rate": 1e-05, "loss": 0.4558, "mean_token_accuracy": 0.8554911017417908, "num_tokens": 788855318.0, "step": 4949 }, { "epoch": 2.517802644964395, "grad_norm": 0.8946784138679504, "learning_rate": 1e-05, "loss": 0.408, "mean_token_accuracy": 0.8685429692268372, "num_tokens": 789007564.0, "step": 4950 }, { "epoch": 2.5183112919633777, "grad_norm": 0.8506482243537903, "learning_rate": 1e-05, "loss": 0.4509, "mean_token_accuracy": 0.8538142442703247, "num_tokens": 789180703.0, "step": 4951 }, { "epoch": 2.51881993896236, "grad_norm": 0.9500423669815063, "learning_rate": 1e-05, "loss": 0.4557, "mean_token_accuracy": 0.8525198698043823, "num_tokens": 789338711.0, "step": 4952 }, { "epoch": 2.519328585961343, "grad_norm": 0.9335389733314514, "learning_rate": 1e-05, "loss": 0.4172, "mean_token_accuracy": 0.8638827204704285, "num_tokens": 789483931.0, "step": 4953 }, { "epoch": 2.5198372329603256, "grad_norm": 0.9226436018943787, "learning_rate": 1e-05, "loss": 0.4423, "mean_token_accuracy": 0.8589118719100952, "num_tokens": 789651554.0, "step": 4954 }, { "epoch": 2.5203458799593084, "grad_norm": 0.9027571082115173, "learning_rate": 1e-05, "loss": 0.451, "mean_token_accuracy": 0.8539646863937378, "num_tokens": 789806734.0, "step": 4955 }, { "epoch": 2.520854526958291, "grad_norm": 0.8884085416793823, "learning_rate": 1e-05, "loss": 0.4523, "mean_token_accuracy": 0.8536780476570129, "num_tokens": 789980144.0, "step": 4956 }, { "epoch": 2.5213631739572735, "grad_norm": 0.9327627420425415, "learning_rate": 1e-05, "loss": 0.4477, "mean_token_accuracy": 0.8556212186813354, "num_tokens": 790137138.0, "step": 4957 }, { "epoch": 2.5218718209562563, "grad_norm": 0.858979344367981, "learning_rate": 1e-05, "loss": 0.396, "mean_token_accuracy": 0.8699868321418762, "num_tokens": 790289686.0, "step": 4958 }, { "epoch": 2.522380467955239, "grad_norm": 0.893985390663147, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8482808470726013, "num_tokens": 790455729.0, "step": 4959 }, { "epoch": 2.522889114954222, "grad_norm": 0.8696373105049133, "learning_rate": 1e-05, "loss": 0.4467, "mean_token_accuracy": 0.855807900428772, "num_tokens": 790617809.0, "step": 4960 }, { "epoch": 2.5233977619532046, "grad_norm": 0.9078160524368286, "learning_rate": 1e-05, "loss": 0.4755, "mean_token_accuracy": 0.8464839458465576, "num_tokens": 790769406.0, "step": 4961 }, { "epoch": 2.523906408952187, "grad_norm": 0.8878223299980164, "learning_rate": 1e-05, "loss": 0.4702, "mean_token_accuracy": 0.8494907021522522, "num_tokens": 790931867.0, "step": 4962 }, { "epoch": 2.5244150559511698, "grad_norm": 0.9027721881866455, "learning_rate": 1e-05, "loss": 0.4348, "mean_token_accuracy": 0.8584170937538147, "num_tokens": 791097268.0, "step": 4963 }, { "epoch": 2.5249237029501526, "grad_norm": 0.8924083113670349, "learning_rate": 1e-05, "loss": 0.455, "mean_token_accuracy": 0.8540330529212952, "num_tokens": 791257431.0, "step": 4964 }, { "epoch": 2.5254323499491353, "grad_norm": 0.906243085861206, "learning_rate": 1e-05, "loss": 0.4098, "mean_token_accuracy": 0.865562915802002, "num_tokens": 791407837.0, "step": 4965 }, { "epoch": 2.525940996948118, "grad_norm": 0.8967453837394714, "learning_rate": 1e-05, "loss": 0.4614, "mean_token_accuracy": 0.8541016578674316, "num_tokens": 791571096.0, "step": 4966 }, { "epoch": 2.5264496439471005, "grad_norm": 0.9120975732803345, "learning_rate": 1e-05, "loss": 0.456, "mean_token_accuracy": 0.851302444934845, "num_tokens": 791733488.0, "step": 4967 }, { "epoch": 2.5269582909460833, "grad_norm": 0.896980881690979, "learning_rate": 1e-05, "loss": 0.4487, "mean_token_accuracy": 0.8537946939468384, "num_tokens": 791889891.0, "step": 4968 }, { "epoch": 2.527466937945066, "grad_norm": 0.9022660851478577, "learning_rate": 1e-05, "loss": 0.4172, "mean_token_accuracy": 0.8633976578712463, "num_tokens": 792056142.0, "step": 4969 }, { "epoch": 2.527975584944049, "grad_norm": 0.9533856511116028, "learning_rate": 1e-05, "loss": 0.4549, "mean_token_accuracy": 0.8529731631278992, "num_tokens": 792214100.0, "step": 4970 }, { "epoch": 2.5284842319430316, "grad_norm": 0.8792077302932739, "learning_rate": 1e-05, "loss": 0.4335, "mean_token_accuracy": 0.859175443649292, "num_tokens": 792375627.0, "step": 4971 }, { "epoch": 2.528992878942014, "grad_norm": 0.872978150844574, "learning_rate": 1e-05, "loss": 0.413, "mean_token_accuracy": 0.8643496632575989, "num_tokens": 792530725.0, "step": 4972 }, { "epoch": 2.529501525940997, "grad_norm": 0.962568461894989, "learning_rate": 1e-05, "loss": 0.4567, "mean_token_accuracy": 0.852573037147522, "num_tokens": 792679522.0, "step": 4973 }, { "epoch": 2.5300101729399795, "grad_norm": 0.8625645637512207, "learning_rate": 1e-05, "loss": 0.4403, "mean_token_accuracy": 0.8584206700325012, "num_tokens": 792847903.0, "step": 4974 }, { "epoch": 2.5305188199389623, "grad_norm": 0.9123122096061707, "learning_rate": 1e-05, "loss": 0.4319, "mean_token_accuracy": 0.8611040115356445, "num_tokens": 793001198.0, "step": 4975 }, { "epoch": 2.531027466937945, "grad_norm": 0.913103461265564, "learning_rate": 1e-05, "loss": 0.4193, "mean_token_accuracy": 0.8622370362281799, "num_tokens": 793151668.0, "step": 4976 }, { "epoch": 2.531536113936928, "grad_norm": 0.8801509737968445, "learning_rate": 1e-05, "loss": 0.4522, "mean_token_accuracy": 0.8534757494926453, "num_tokens": 793307932.0, "step": 4977 }, { "epoch": 2.5320447609359107, "grad_norm": 0.9614366292953491, "learning_rate": 1e-05, "loss": 0.44, "mean_token_accuracy": 0.8562948703765869, "num_tokens": 793460024.0, "step": 4978 }, { "epoch": 2.532553407934893, "grad_norm": 0.9331154227256775, "learning_rate": 1e-05, "loss": 0.4377, "mean_token_accuracy": 0.8571763634681702, "num_tokens": 793624026.0, "step": 4979 }, { "epoch": 2.533062054933876, "grad_norm": 0.8993594646453857, "learning_rate": 1e-05, "loss": 0.4104, "mean_token_accuracy": 0.864728569984436, "num_tokens": 793777015.0, "step": 4980 }, { "epoch": 2.5335707019328586, "grad_norm": 1.0761821269989014, "learning_rate": 1e-05, "loss": 0.4628, "mean_token_accuracy": 0.8507185578346252, "num_tokens": 793940262.0, "step": 4981 }, { "epoch": 2.5340793489318414, "grad_norm": 0.8796427249908447, "learning_rate": 1e-05, "loss": 0.4569, "mean_token_accuracy": 0.8514412045478821, "num_tokens": 794106369.0, "step": 4982 }, { "epoch": 2.534587995930824, "grad_norm": 0.9506366848945618, "learning_rate": 1e-05, "loss": 0.4314, "mean_token_accuracy": 0.8596986532211304, "num_tokens": 794254361.0, "step": 4983 }, { "epoch": 2.5350966429298065, "grad_norm": 0.8823890686035156, "learning_rate": 1e-05, "loss": 0.4358, "mean_token_accuracy": 0.8592212200164795, "num_tokens": 794419893.0, "step": 4984 }, { "epoch": 2.5356052899287893, "grad_norm": 0.9204660058021545, "learning_rate": 1e-05, "loss": 0.3984, "mean_token_accuracy": 0.8693076372146606, "num_tokens": 794567582.0, "step": 4985 }, { "epoch": 2.536113936927772, "grad_norm": 0.882679283618927, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8561245203018188, "num_tokens": 794725898.0, "step": 4986 }, { "epoch": 2.536622583926755, "grad_norm": 0.9224008321762085, "learning_rate": 1e-05, "loss": 0.4674, "mean_token_accuracy": 0.8492908477783203, "num_tokens": 794880869.0, "step": 4987 }, { "epoch": 2.5371312309257377, "grad_norm": 1.008573293685913, "learning_rate": 1e-05, "loss": 0.4357, "mean_token_accuracy": 0.8573492765426636, "num_tokens": 795034685.0, "step": 4988 }, { "epoch": 2.53763987792472, "grad_norm": 0.9045929312705994, "learning_rate": 1e-05, "loss": 0.4291, "mean_token_accuracy": 0.8587425947189331, "num_tokens": 795199669.0, "step": 4989 }, { "epoch": 2.538148524923703, "grad_norm": 1.2988561391830444, "learning_rate": 1e-05, "loss": 0.44, "mean_token_accuracy": 0.8566228151321411, "num_tokens": 795359866.0, "step": 4990 }, { "epoch": 2.5386571719226856, "grad_norm": 0.8983350396156311, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8551926016807556, "num_tokens": 795516315.0, "step": 4991 }, { "epoch": 2.5391658189216684, "grad_norm": 0.8776088953018188, "learning_rate": 1e-05, "loss": 0.4175, "mean_token_accuracy": 0.8644740581512451, "num_tokens": 795665057.0, "step": 4992 }, { "epoch": 2.539674465920651, "grad_norm": 0.8982292413711548, "learning_rate": 1e-05, "loss": 0.4488, "mean_token_accuracy": 0.8547069430351257, "num_tokens": 795835444.0, "step": 4993 }, { "epoch": 2.5401831129196335, "grad_norm": 0.9244893789291382, "learning_rate": 1e-05, "loss": 0.4264, "mean_token_accuracy": 0.8608143329620361, "num_tokens": 795976783.0, "step": 4994 }, { "epoch": 2.5406917599186167, "grad_norm": 0.8885383009910583, "learning_rate": 1e-05, "loss": 0.4281, "mean_token_accuracy": 0.8603293895721436, "num_tokens": 796143149.0, "step": 4995 }, { "epoch": 2.541200406917599, "grad_norm": 1.0368354320526123, "learning_rate": 1e-05, "loss": 0.4485, "mean_token_accuracy": 0.8553367853164673, "num_tokens": 796287301.0, "step": 4996 }, { "epoch": 2.541709053916582, "grad_norm": 0.8786417245864868, "learning_rate": 1e-05, "loss": 0.4226, "mean_token_accuracy": 0.8627998232841492, "num_tokens": 796441792.0, "step": 4997 }, { "epoch": 2.5422177009155646, "grad_norm": 1.464304804801941, "learning_rate": 1e-05, "loss": 0.4416, "mean_token_accuracy": 0.857825756072998, "num_tokens": 796600291.0, "step": 4998 }, { "epoch": 2.5427263479145474, "grad_norm": 0.9433695077896118, "learning_rate": 1e-05, "loss": 0.4355, "mean_token_accuracy": 0.8583287596702576, "num_tokens": 796757003.0, "step": 4999 }, { "epoch": 2.5432349949135302, "grad_norm": 1.0070462226867676, "learning_rate": 1e-05, "loss": 0.4292, "mean_token_accuracy": 0.8625640869140625, "num_tokens": 796904278.0, "step": 5000 }, { "epoch": 2.5437436419125126, "grad_norm": 0.9288358092308044, "learning_rate": 1e-05, "loss": 0.4525, "mean_token_accuracy": 0.854245662689209, "num_tokens": 797061038.0, "step": 5001 }, { "epoch": 2.5442522889114954, "grad_norm": 0.8655028343200684, "learning_rate": 1e-05, "loss": 0.4514, "mean_token_accuracy": 0.855544924736023, "num_tokens": 797229543.0, "step": 5002 }, { "epoch": 2.544760935910478, "grad_norm": 1.220302939414978, "learning_rate": 1e-05, "loss": 0.4801, "mean_token_accuracy": 0.8450947999954224, "num_tokens": 797390103.0, "step": 5003 }, { "epoch": 2.545269582909461, "grad_norm": 0.9486819505691528, "learning_rate": 1e-05, "loss": 0.4199, "mean_token_accuracy": 0.8622397780418396, "num_tokens": 797536387.0, "step": 5004 }, { "epoch": 2.5457782299084437, "grad_norm": 0.9457309246063232, "learning_rate": 1e-05, "loss": 0.4464, "mean_token_accuracy": 0.855649471282959, "num_tokens": 797694216.0, "step": 5005 }, { "epoch": 2.546286876907426, "grad_norm": 0.9005159735679626, "learning_rate": 1e-05, "loss": 0.4476, "mean_token_accuracy": 0.85480797290802, "num_tokens": 797843165.0, "step": 5006 }, { "epoch": 2.546795523906409, "grad_norm": 0.8820387721061707, "learning_rate": 1e-05, "loss": 0.4561, "mean_token_accuracy": 0.8531599640846252, "num_tokens": 798000808.0, "step": 5007 }, { "epoch": 2.5473041709053916, "grad_norm": 0.906570315361023, "learning_rate": 1e-05, "loss": 0.4202, "mean_token_accuracy": 0.8623149394989014, "num_tokens": 798149467.0, "step": 5008 }, { "epoch": 2.5478128179043744, "grad_norm": 0.8808760643005371, "learning_rate": 1e-05, "loss": 0.4578, "mean_token_accuracy": 0.8505076169967651, "num_tokens": 798307311.0, "step": 5009 }, { "epoch": 2.548321464903357, "grad_norm": 0.8601338863372803, "learning_rate": 1e-05, "loss": 0.4245, "mean_token_accuracy": 0.8612698316574097, "num_tokens": 798476152.0, "step": 5010 }, { "epoch": 2.5488301119023395, "grad_norm": 0.8980915546417236, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.8529607653617859, "num_tokens": 798634863.0, "step": 5011 }, { "epoch": 2.5493387589013223, "grad_norm": 0.8095453977584839, "learning_rate": 1e-05, "loss": 0.4306, "mean_token_accuracy": 0.8594805002212524, "num_tokens": 798807326.0, "step": 5012 }, { "epoch": 2.549847405900305, "grad_norm": 0.8589807748794556, "learning_rate": 1e-05, "loss": 0.4532, "mean_token_accuracy": 0.8550978899002075, "num_tokens": 798971177.0, "step": 5013 }, { "epoch": 2.550356052899288, "grad_norm": 0.8826808333396912, "learning_rate": 1e-05, "loss": 0.4989, "mean_token_accuracy": 0.8407763838768005, "num_tokens": 799134326.0, "step": 5014 }, { "epoch": 2.5508646998982707, "grad_norm": 0.8850137591362, "learning_rate": 1e-05, "loss": 0.424, "mean_token_accuracy": 0.8614013195037842, "num_tokens": 799279049.0, "step": 5015 }, { "epoch": 2.551373346897253, "grad_norm": 0.8453471064567566, "learning_rate": 1e-05, "loss": 0.4214, "mean_token_accuracy": 0.8633631467819214, "num_tokens": 799444712.0, "step": 5016 }, { "epoch": 2.5518819938962363, "grad_norm": 0.9310623407363892, "learning_rate": 1e-05, "loss": 0.4382, "mean_token_accuracy": 0.8583777546882629, "num_tokens": 799590600.0, "step": 5017 }, { "epoch": 2.5523906408952186, "grad_norm": 0.8986615538597107, "learning_rate": 1e-05, "loss": 0.4415, "mean_token_accuracy": 0.85772705078125, "num_tokens": 799761918.0, "step": 5018 }, { "epoch": 2.5528992878942014, "grad_norm": 0.8750797510147095, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8598487973213196, "num_tokens": 799928851.0, "step": 5019 }, { "epoch": 2.553407934893184, "grad_norm": 1.001911997795105, "learning_rate": 1e-05, "loss": 0.4024, "mean_token_accuracy": 0.8676479458808899, "num_tokens": 800086313.0, "step": 5020 }, { "epoch": 2.553916581892167, "grad_norm": 0.9289815425872803, "learning_rate": 1e-05, "loss": 0.4571, "mean_token_accuracy": 0.8532669544219971, "num_tokens": 800242737.0, "step": 5021 }, { "epoch": 2.5544252288911498, "grad_norm": 0.9957112669944763, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8532627820968628, "num_tokens": 800392700.0, "step": 5022 }, { "epoch": 2.554933875890132, "grad_norm": 0.883906364440918, "learning_rate": 1e-05, "loss": 0.4346, "mean_token_accuracy": 0.8591322302818298, "num_tokens": 800561813.0, "step": 5023 }, { "epoch": 2.555442522889115, "grad_norm": 0.9741874933242798, "learning_rate": 1e-05, "loss": 0.4273, "mean_token_accuracy": 0.8615695834159851, "num_tokens": 800723148.0, "step": 5024 }, { "epoch": 2.5559511698880977, "grad_norm": 0.8632175326347351, "learning_rate": 1e-05, "loss": 0.4457, "mean_token_accuracy": 0.8571192026138306, "num_tokens": 800880074.0, "step": 5025 }, { "epoch": 2.5564598168870805, "grad_norm": 0.8779349327087402, "learning_rate": 1e-05, "loss": 0.426, "mean_token_accuracy": 0.8629305362701416, "num_tokens": 801037154.0, "step": 5026 }, { "epoch": 2.5569684638860632, "grad_norm": 0.9181589484214783, "learning_rate": 1e-05, "loss": 0.4525, "mean_token_accuracy": 0.8532425761222839, "num_tokens": 801183676.0, "step": 5027 }, { "epoch": 2.5574771108850456, "grad_norm": 0.8723611235618591, "learning_rate": 1e-05, "loss": 0.4747, "mean_token_accuracy": 0.8468239903450012, "num_tokens": 801343393.0, "step": 5028 }, { "epoch": 2.5579857578840284, "grad_norm": 0.9260839223861694, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8584328293800354, "num_tokens": 801497266.0, "step": 5029 }, { "epoch": 2.558494404883011, "grad_norm": 0.960536003112793, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8499679565429688, "num_tokens": 801652284.0, "step": 5030 }, { "epoch": 2.559003051881994, "grad_norm": 0.9009841084480286, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8557119369506836, "num_tokens": 801811029.0, "step": 5031 }, { "epoch": 2.5595116988809767, "grad_norm": 0.8594654202461243, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8569321632385254, "num_tokens": 801967943.0, "step": 5032 }, { "epoch": 2.560020345879959, "grad_norm": 0.8589481115341187, "learning_rate": 1e-05, "loss": 0.4345, "mean_token_accuracy": 0.8608789443969727, "num_tokens": 802130591.0, "step": 5033 }, { "epoch": 2.560528992878942, "grad_norm": 0.8562963008880615, "learning_rate": 1e-05, "loss": 0.4188, "mean_token_accuracy": 0.8626354932785034, "num_tokens": 802277825.0, "step": 5034 }, { "epoch": 2.5610376398779247, "grad_norm": 0.816863477230072, "learning_rate": 1e-05, "loss": 0.4296, "mean_token_accuracy": 0.8625769019126892, "num_tokens": 802439864.0, "step": 5035 }, { "epoch": 2.5615462868769074, "grad_norm": 0.9014476537704468, "learning_rate": 1e-05, "loss": 0.4465, "mean_token_accuracy": 0.8575881719589233, "num_tokens": 802590185.0, "step": 5036 }, { "epoch": 2.5620549338758902, "grad_norm": 0.8575737476348877, "learning_rate": 1e-05, "loss": 0.3821, "mean_token_accuracy": 0.8743054866790771, "num_tokens": 802736978.0, "step": 5037 }, { "epoch": 2.5625635808748726, "grad_norm": 0.9233067631721497, "learning_rate": 1e-05, "loss": 0.4217, "mean_token_accuracy": 0.8624237775802612, "num_tokens": 802896699.0, "step": 5038 }, { "epoch": 2.563072227873856, "grad_norm": 1.0039176940917969, "learning_rate": 1e-05, "loss": 0.4605, "mean_token_accuracy": 0.851535439491272, "num_tokens": 803048086.0, "step": 5039 }, { "epoch": 2.563580874872838, "grad_norm": 0.8698328733444214, "learning_rate": 1e-05, "loss": 0.4108, "mean_token_accuracy": 0.86533123254776, "num_tokens": 803217224.0, "step": 5040 }, { "epoch": 2.564089521871821, "grad_norm": 0.8888559937477112, "learning_rate": 1e-05, "loss": 0.438, "mean_token_accuracy": 0.8598009347915649, "num_tokens": 803390085.0, "step": 5041 }, { "epoch": 2.5645981688708037, "grad_norm": 0.9239501953125, "learning_rate": 1e-05, "loss": 0.4662, "mean_token_accuracy": 0.8478236198425293, "num_tokens": 803552881.0, "step": 5042 }, { "epoch": 2.5651068158697865, "grad_norm": 0.8581882119178772, "learning_rate": 1e-05, "loss": 0.4232, "mean_token_accuracy": 0.8616005182266235, "num_tokens": 803715846.0, "step": 5043 }, { "epoch": 2.5656154628687693, "grad_norm": 0.881350040435791, "learning_rate": 1e-05, "loss": 0.4273, "mean_token_accuracy": 0.8613227605819702, "num_tokens": 803876358.0, "step": 5044 }, { "epoch": 2.5661241098677516, "grad_norm": 0.9468534588813782, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8618128299713135, "num_tokens": 804030876.0, "step": 5045 }, { "epoch": 2.5666327568667344, "grad_norm": 0.9433355927467346, "learning_rate": 1e-05, "loss": 0.436, "mean_token_accuracy": 0.8575580716133118, "num_tokens": 804192458.0, "step": 5046 }, { "epoch": 2.567141403865717, "grad_norm": 0.8917705416679382, "learning_rate": 1e-05, "loss": 0.423, "mean_token_accuracy": 0.8625609874725342, "num_tokens": 804348187.0, "step": 5047 }, { "epoch": 2.5676500508647, "grad_norm": 0.8788217306137085, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.8561955690383911, "num_tokens": 804511634.0, "step": 5048 }, { "epoch": 2.568158697863683, "grad_norm": 0.8972126841545105, "learning_rate": 1e-05, "loss": 0.4605, "mean_token_accuracy": 0.8492066860198975, "num_tokens": 804669479.0, "step": 5049 }, { "epoch": 2.568667344862665, "grad_norm": 0.9535084962844849, "learning_rate": 1e-05, "loss": 0.4332, "mean_token_accuracy": 0.8596338033676147, "num_tokens": 804828982.0, "step": 5050 }, { "epoch": 2.569175991861648, "grad_norm": 0.8947502970695496, "learning_rate": 1e-05, "loss": 0.4369, "mean_token_accuracy": 0.8589239120483398, "num_tokens": 805001239.0, "step": 5051 }, { "epoch": 2.5696846388606307, "grad_norm": 0.8903855085372925, "learning_rate": 1e-05, "loss": 0.4791, "mean_token_accuracy": 0.8452934622764587, "num_tokens": 805159595.0, "step": 5052 }, { "epoch": 2.5701932858596135, "grad_norm": 1.0177345275878906, "learning_rate": 1e-05, "loss": 0.4767, "mean_token_accuracy": 0.847319483757019, "num_tokens": 805313717.0, "step": 5053 }, { "epoch": 2.5707019328585963, "grad_norm": 0.9481968879699707, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8640065789222717, "num_tokens": 805467391.0, "step": 5054 }, { "epoch": 2.5712105798575786, "grad_norm": 0.9168588519096375, "learning_rate": 1e-05, "loss": 0.4455, "mean_token_accuracy": 0.8557322025299072, "num_tokens": 805617795.0, "step": 5055 }, { "epoch": 2.5717192268565614, "grad_norm": 0.9425328373908997, "learning_rate": 1e-05, "loss": 0.4421, "mean_token_accuracy": 0.8565764427185059, "num_tokens": 805767777.0, "step": 5056 }, { "epoch": 2.572227873855544, "grad_norm": 0.8727470636367798, "learning_rate": 1e-05, "loss": 0.4161, "mean_token_accuracy": 0.8638690710067749, "num_tokens": 805929061.0, "step": 5057 }, { "epoch": 2.572736520854527, "grad_norm": 0.9014266729354858, "learning_rate": 1e-05, "loss": 0.4523, "mean_token_accuracy": 0.8545477390289307, "num_tokens": 806083011.0, "step": 5058 }, { "epoch": 2.5732451678535098, "grad_norm": 0.8865407109260559, "learning_rate": 1e-05, "loss": 0.4416, "mean_token_accuracy": 0.8571542501449585, "num_tokens": 806240633.0, "step": 5059 }, { "epoch": 2.573753814852492, "grad_norm": 0.9044598937034607, "learning_rate": 1e-05, "loss": 0.4142, "mean_token_accuracy": 0.8641639351844788, "num_tokens": 806389989.0, "step": 5060 }, { "epoch": 2.5742624618514753, "grad_norm": 0.8423361778259277, "learning_rate": 1e-05, "loss": 0.4555, "mean_token_accuracy": 0.8527371883392334, "num_tokens": 806560090.0, "step": 5061 }, { "epoch": 2.5747711088504577, "grad_norm": 0.8408938646316528, "learning_rate": 1e-05, "loss": 0.4205, "mean_token_accuracy": 0.8641003370285034, "num_tokens": 806724467.0, "step": 5062 }, { "epoch": 2.5752797558494405, "grad_norm": 0.8934341669082642, "learning_rate": 1e-05, "loss": 0.4255, "mean_token_accuracy": 0.8617019653320312, "num_tokens": 806885928.0, "step": 5063 }, { "epoch": 2.5757884028484233, "grad_norm": 0.873944878578186, "learning_rate": 1e-05, "loss": 0.4159, "mean_token_accuracy": 0.8641436100006104, "num_tokens": 807048509.0, "step": 5064 }, { "epoch": 2.576297049847406, "grad_norm": 0.9249725937843323, "learning_rate": 1e-05, "loss": 0.4346, "mean_token_accuracy": 0.8582570552825928, "num_tokens": 807201053.0, "step": 5065 }, { "epoch": 2.576805696846389, "grad_norm": 0.8614180684089661, "learning_rate": 1e-05, "loss": 0.4204, "mean_token_accuracy": 0.8631008863449097, "num_tokens": 807355916.0, "step": 5066 }, { "epoch": 2.577314343845371, "grad_norm": 0.9144124388694763, "learning_rate": 1e-05, "loss": 0.4132, "mean_token_accuracy": 0.865911066532135, "num_tokens": 807507280.0, "step": 5067 }, { "epoch": 2.577822990844354, "grad_norm": 0.8963925838470459, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8562051057815552, "num_tokens": 807664154.0, "step": 5068 }, { "epoch": 2.5783316378433367, "grad_norm": 0.880099356174469, "learning_rate": 1e-05, "loss": 0.4222, "mean_token_accuracy": 0.8610818386077881, "num_tokens": 807826692.0, "step": 5069 }, { "epoch": 2.5788402848423195, "grad_norm": 0.9867204427719116, "learning_rate": 1e-05, "loss": 0.3965, "mean_token_accuracy": 0.869681715965271, "num_tokens": 807982521.0, "step": 5070 }, { "epoch": 2.5793489318413023, "grad_norm": 0.8853605389595032, "learning_rate": 1e-05, "loss": 0.445, "mean_token_accuracy": 0.8563171625137329, "num_tokens": 808145213.0, "step": 5071 }, { "epoch": 2.5798575788402847, "grad_norm": 0.8659780025482178, "learning_rate": 1e-05, "loss": 0.4498, "mean_token_accuracy": 0.8545401096343994, "num_tokens": 808310119.0, "step": 5072 }, { "epoch": 2.5803662258392674, "grad_norm": 0.8800836205482483, "learning_rate": 1e-05, "loss": 0.4239, "mean_token_accuracy": 0.8624007701873779, "num_tokens": 808470957.0, "step": 5073 }, { "epoch": 2.5808748728382502, "grad_norm": 0.9086866974830627, "learning_rate": 1e-05, "loss": 0.4411, "mean_token_accuracy": 0.8568074703216553, "num_tokens": 808637880.0, "step": 5074 }, { "epoch": 2.581383519837233, "grad_norm": 0.9906461834907532, "learning_rate": 1e-05, "loss": 0.4379, "mean_token_accuracy": 0.8569093942642212, "num_tokens": 808804560.0, "step": 5075 }, { "epoch": 2.581892166836216, "grad_norm": 0.8614892363548279, "learning_rate": 1e-05, "loss": 0.4315, "mean_token_accuracy": 0.8585564494132996, "num_tokens": 808973018.0, "step": 5076 }, { "epoch": 2.582400813835198, "grad_norm": 0.9015856385231018, "learning_rate": 1e-05, "loss": 0.4478, "mean_token_accuracy": 0.8551123142242432, "num_tokens": 809136988.0, "step": 5077 }, { "epoch": 2.582909460834181, "grad_norm": 0.911135196685791, "learning_rate": 1e-05, "loss": 0.4221, "mean_token_accuracy": 0.8616445064544678, "num_tokens": 809288153.0, "step": 5078 }, { "epoch": 2.5834181078331637, "grad_norm": 0.9012998342514038, "learning_rate": 1e-05, "loss": 0.4454, "mean_token_accuracy": 0.8553784489631653, "num_tokens": 809443468.0, "step": 5079 }, { "epoch": 2.5839267548321465, "grad_norm": 0.9023162126541138, "learning_rate": 1e-05, "loss": 0.4107, "mean_token_accuracy": 0.8648260235786438, "num_tokens": 809594813.0, "step": 5080 }, { "epoch": 2.5844354018311293, "grad_norm": 0.8686497211456299, "learning_rate": 1e-05, "loss": 0.4486, "mean_token_accuracy": 0.8574556112289429, "num_tokens": 809760173.0, "step": 5081 }, { "epoch": 2.5849440488301116, "grad_norm": 0.9478240013122559, "learning_rate": 1e-05, "loss": 0.4273, "mean_token_accuracy": 0.8620869517326355, "num_tokens": 809914564.0, "step": 5082 }, { "epoch": 2.585452695829095, "grad_norm": 0.8770041465759277, "learning_rate": 1e-05, "loss": 0.435, "mean_token_accuracy": 0.8599441051483154, "num_tokens": 810071927.0, "step": 5083 }, { "epoch": 2.585961342828077, "grad_norm": 0.9543029069900513, "learning_rate": 1e-05, "loss": 0.4748, "mean_token_accuracy": 0.8473060131072998, "num_tokens": 810235007.0, "step": 5084 }, { "epoch": 2.58646998982706, "grad_norm": 0.9194167852401733, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8596623539924622, "num_tokens": 810398502.0, "step": 5085 }, { "epoch": 2.586978636826043, "grad_norm": 0.993401825428009, "learning_rate": 1e-05, "loss": 0.4447, "mean_token_accuracy": 0.855347752571106, "num_tokens": 810562179.0, "step": 5086 }, { "epoch": 2.5874872838250256, "grad_norm": 0.9939655065536499, "learning_rate": 1e-05, "loss": 0.4143, "mean_token_accuracy": 0.8665523529052734, "num_tokens": 810722858.0, "step": 5087 }, { "epoch": 2.5879959308240084, "grad_norm": 0.8436356782913208, "learning_rate": 1e-05, "loss": 0.4531, "mean_token_accuracy": 0.8540129661560059, "num_tokens": 810890208.0, "step": 5088 }, { "epoch": 2.5885045778229907, "grad_norm": 0.9794009923934937, "learning_rate": 1e-05, "loss": 0.4418, "mean_token_accuracy": 0.8579432368278503, "num_tokens": 811055142.0, "step": 5089 }, { "epoch": 2.5890132248219735, "grad_norm": 0.8651193976402283, "learning_rate": 1e-05, "loss": 0.436, "mean_token_accuracy": 0.8586689233779907, "num_tokens": 811223836.0, "step": 5090 }, { "epoch": 2.5895218718209563, "grad_norm": 0.8984514474868774, "learning_rate": 1e-05, "loss": 0.4532, "mean_token_accuracy": 0.8534331321716309, "num_tokens": 811391290.0, "step": 5091 }, { "epoch": 2.590030518819939, "grad_norm": 0.8978536128997803, "learning_rate": 1e-05, "loss": 0.4433, "mean_token_accuracy": 0.8573331832885742, "num_tokens": 811552000.0, "step": 5092 }, { "epoch": 2.590539165818922, "grad_norm": 0.9069320559501648, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.8586399555206299, "num_tokens": 811720907.0, "step": 5093 }, { "epoch": 2.591047812817904, "grad_norm": 0.9119493961334229, "learning_rate": 1e-05, "loss": 0.4454, "mean_token_accuracy": 0.8551663160324097, "num_tokens": 811879361.0, "step": 5094 }, { "epoch": 2.591556459816887, "grad_norm": 0.8818292617797852, "learning_rate": 1e-05, "loss": 0.4101, "mean_token_accuracy": 0.8652764558792114, "num_tokens": 812047878.0, "step": 5095 }, { "epoch": 2.5920651068158698, "grad_norm": 0.9392347931861877, "learning_rate": 1e-05, "loss": 0.4435, "mean_token_accuracy": 0.8555039763450623, "num_tokens": 812201258.0, "step": 5096 }, { "epoch": 2.5925737538148526, "grad_norm": 0.8664877414703369, "learning_rate": 1e-05, "loss": 0.4689, "mean_token_accuracy": 0.8496640920639038, "num_tokens": 812364086.0, "step": 5097 }, { "epoch": 2.5930824008138353, "grad_norm": 0.9408642053604126, "learning_rate": 1e-05, "loss": 0.4401, "mean_token_accuracy": 0.8558844327926636, "num_tokens": 812522643.0, "step": 5098 }, { "epoch": 2.5935910478128177, "grad_norm": 0.8321124315261841, "learning_rate": 1e-05, "loss": 0.4116, "mean_token_accuracy": 0.866344690322876, "num_tokens": 812697980.0, "step": 5099 }, { "epoch": 2.5940996948118005, "grad_norm": 0.8491588830947876, "learning_rate": 1e-05, "loss": 0.437, "mean_token_accuracy": 0.8593347668647766, "num_tokens": 812857034.0, "step": 5100 }, { "epoch": 2.5946083418107833, "grad_norm": 0.8439227342605591, "learning_rate": 1e-05, "loss": 0.4388, "mean_token_accuracy": 0.8568952083587646, "num_tokens": 813014073.0, "step": 5101 }, { "epoch": 2.595116988809766, "grad_norm": 0.8874712586402893, "learning_rate": 1e-05, "loss": 0.4596, "mean_token_accuracy": 0.852493405342102, "num_tokens": 813171705.0, "step": 5102 }, { "epoch": 2.595625635808749, "grad_norm": 0.9219279885292053, "learning_rate": 1e-05, "loss": 0.4332, "mean_token_accuracy": 0.8584223389625549, "num_tokens": 813321780.0, "step": 5103 }, { "epoch": 2.596134282807731, "grad_norm": 0.9127758741378784, "learning_rate": 1e-05, "loss": 0.4599, "mean_token_accuracy": 0.8504177331924438, "num_tokens": 813478790.0, "step": 5104 }, { "epoch": 2.5966429298067144, "grad_norm": 0.8439157009124756, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8566184043884277, "num_tokens": 813639812.0, "step": 5105 }, { "epoch": 2.5971515768056967, "grad_norm": 0.8955456614494324, "learning_rate": 1e-05, "loss": 0.4372, "mean_token_accuracy": 0.8591792583465576, "num_tokens": 813795523.0, "step": 5106 }, { "epoch": 2.5976602238046795, "grad_norm": 0.9001061320304871, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8603255152702332, "num_tokens": 813955082.0, "step": 5107 }, { "epoch": 2.5981688708036623, "grad_norm": 0.8776459693908691, "learning_rate": 1e-05, "loss": 0.4435, "mean_token_accuracy": 0.8569793701171875, "num_tokens": 814113911.0, "step": 5108 }, { "epoch": 2.598677517802645, "grad_norm": 0.9070444703102112, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8551826477050781, "num_tokens": 814275305.0, "step": 5109 }, { "epoch": 2.599186164801628, "grad_norm": 0.8836700916290283, "learning_rate": 1e-05, "loss": 0.4343, "mean_token_accuracy": 0.8597379922866821, "num_tokens": 814435363.0, "step": 5110 }, { "epoch": 2.5996948118006102, "grad_norm": 0.9383623600006104, "learning_rate": 1e-05, "loss": 0.4708, "mean_token_accuracy": 0.850771963596344, "num_tokens": 814606320.0, "step": 5111 }, { "epoch": 2.600203458799593, "grad_norm": 0.9148361682891846, "learning_rate": 1e-05, "loss": 0.4284, "mean_token_accuracy": 0.8600580096244812, "num_tokens": 814760052.0, "step": 5112 }, { "epoch": 2.600712105798576, "grad_norm": 0.8902022242546082, "learning_rate": 1e-05, "loss": 0.4302, "mean_token_accuracy": 0.859188437461853, "num_tokens": 814923437.0, "step": 5113 }, { "epoch": 2.6012207527975586, "grad_norm": 0.8794373869895935, "learning_rate": 1e-05, "loss": 0.4351, "mean_token_accuracy": 0.8580101728439331, "num_tokens": 815081519.0, "step": 5114 }, { "epoch": 2.6017293997965414, "grad_norm": 0.8626607060432434, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8555400371551514, "num_tokens": 815246123.0, "step": 5115 }, { "epoch": 2.6022380467955237, "grad_norm": 0.87879878282547, "learning_rate": 1e-05, "loss": 0.4479, "mean_token_accuracy": 0.8553816080093384, "num_tokens": 815404054.0, "step": 5116 }, { "epoch": 2.6027466937945065, "grad_norm": 0.8513388633728027, "learning_rate": 1e-05, "loss": 0.4434, "mean_token_accuracy": 0.8568986058235168, "num_tokens": 815560199.0, "step": 5117 }, { "epoch": 2.6032553407934893, "grad_norm": 0.8843596577644348, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8611048460006714, "num_tokens": 815726850.0, "step": 5118 }, { "epoch": 2.603763987792472, "grad_norm": 0.8609278202056885, "learning_rate": 1e-05, "loss": 0.4906, "mean_token_accuracy": 0.843414306640625, "num_tokens": 815898444.0, "step": 5119 }, { "epoch": 2.604272634791455, "grad_norm": 0.9479242563247681, "learning_rate": 1e-05, "loss": 0.4264, "mean_token_accuracy": 0.8611266016960144, "num_tokens": 816039389.0, "step": 5120 }, { "epoch": 2.604781281790437, "grad_norm": 0.8942499756813049, "learning_rate": 1e-05, "loss": 0.426, "mean_token_accuracy": 0.8614490032196045, "num_tokens": 816198957.0, "step": 5121 }, { "epoch": 2.60528992878942, "grad_norm": 0.8818740248680115, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8538020849227905, "num_tokens": 816369450.0, "step": 5122 }, { "epoch": 2.605798575788403, "grad_norm": 0.9133926033973694, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.8584133386611938, "num_tokens": 816527861.0, "step": 5123 }, { "epoch": 2.6063072227873856, "grad_norm": 0.8590560555458069, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8604164123535156, "num_tokens": 816691307.0, "step": 5124 }, { "epoch": 2.6068158697863684, "grad_norm": 0.9377419352531433, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8640687465667725, "num_tokens": 816850090.0, "step": 5125 }, { "epoch": 2.6073245167853507, "grad_norm": 0.8784112930297852, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8555065393447876, "num_tokens": 817014438.0, "step": 5126 }, { "epoch": 2.607833163784334, "grad_norm": 0.9228441119194031, "learning_rate": 1e-05, "loss": 0.4403, "mean_token_accuracy": 0.8573063015937805, "num_tokens": 817172384.0, "step": 5127 }, { "epoch": 2.6083418107833163, "grad_norm": 0.8088212609291077, "learning_rate": 1e-05, "loss": 0.4277, "mean_token_accuracy": 0.8604148030281067, "num_tokens": 817342242.0, "step": 5128 }, { "epoch": 2.608850457782299, "grad_norm": 0.9407182931900024, "learning_rate": 1e-05, "loss": 0.4391, "mean_token_accuracy": 0.8581157922744751, "num_tokens": 817492020.0, "step": 5129 }, { "epoch": 2.609359104781282, "grad_norm": 0.8717429041862488, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8543630242347717, "num_tokens": 817643303.0, "step": 5130 }, { "epoch": 2.6098677517802646, "grad_norm": 0.8946168422698975, "learning_rate": 1e-05, "loss": 0.3996, "mean_token_accuracy": 0.8689746856689453, "num_tokens": 817795006.0, "step": 5131 }, { "epoch": 2.6103763987792474, "grad_norm": 0.9147217273712158, "learning_rate": 1e-05, "loss": 0.4008, "mean_token_accuracy": 0.8693822622299194, "num_tokens": 817952612.0, "step": 5132 }, { "epoch": 2.6108850457782298, "grad_norm": 0.9123526811599731, "learning_rate": 1e-05, "loss": 0.4731, "mean_token_accuracy": 0.8474389314651489, "num_tokens": 818110205.0, "step": 5133 }, { "epoch": 2.6113936927772126, "grad_norm": 0.9014759659767151, "learning_rate": 1e-05, "loss": 0.4465, "mean_token_accuracy": 0.8557165265083313, "num_tokens": 818287143.0, "step": 5134 }, { "epoch": 2.6119023397761953, "grad_norm": 0.8831366896629333, "learning_rate": 1e-05, "loss": 0.4257, "mean_token_accuracy": 0.860625684261322, "num_tokens": 818445307.0, "step": 5135 }, { "epoch": 2.612410986775178, "grad_norm": 0.8970555067062378, "learning_rate": 1e-05, "loss": 0.4277, "mean_token_accuracy": 0.860558271408081, "num_tokens": 818608748.0, "step": 5136 }, { "epoch": 2.612919633774161, "grad_norm": 0.9433715343475342, "learning_rate": 1e-05, "loss": 0.4256, "mean_token_accuracy": 0.8625460863113403, "num_tokens": 818761341.0, "step": 5137 }, { "epoch": 2.6134282807731433, "grad_norm": 0.9488721489906311, "learning_rate": 1e-05, "loss": 0.4354, "mean_token_accuracy": 0.8575472831726074, "num_tokens": 818924573.0, "step": 5138 }, { "epoch": 2.613936927772126, "grad_norm": 0.9305161833763123, "learning_rate": 1e-05, "loss": 0.4191, "mean_token_accuracy": 0.8635993003845215, "num_tokens": 819084612.0, "step": 5139 }, { "epoch": 2.614445574771109, "grad_norm": 0.8633292317390442, "learning_rate": 1e-05, "loss": 0.4162, "mean_token_accuracy": 0.8654317855834961, "num_tokens": 819253050.0, "step": 5140 }, { "epoch": 2.6149542217700916, "grad_norm": 0.969501793384552, "learning_rate": 1e-05, "loss": 0.4224, "mean_token_accuracy": 0.8637001514434814, "num_tokens": 819417551.0, "step": 5141 }, { "epoch": 2.6154628687690744, "grad_norm": 0.8660587072372437, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.857624888420105, "num_tokens": 819585374.0, "step": 5142 }, { "epoch": 2.6159715157680568, "grad_norm": 0.9489619731903076, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8545424342155457, "num_tokens": 819750312.0, "step": 5143 }, { "epoch": 2.6164801627670395, "grad_norm": 0.8869263529777527, "learning_rate": 1e-05, "loss": 0.427, "mean_token_accuracy": 0.8611224293708801, "num_tokens": 819902977.0, "step": 5144 }, { "epoch": 2.6169888097660223, "grad_norm": 0.9305620789527893, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8522117733955383, "num_tokens": 820060784.0, "step": 5145 }, { "epoch": 2.617497456765005, "grad_norm": 0.9102436304092407, "learning_rate": 1e-05, "loss": 0.4246, "mean_token_accuracy": 0.8633850812911987, "num_tokens": 820219660.0, "step": 5146 }, { "epoch": 2.618006103763988, "grad_norm": 0.8894648551940918, "learning_rate": 1e-05, "loss": 0.4781, "mean_token_accuracy": 0.8472837805747986, "num_tokens": 820390494.0, "step": 5147 }, { "epoch": 2.6185147507629702, "grad_norm": 0.9662260413169861, "learning_rate": 1e-05, "loss": 0.4357, "mean_token_accuracy": 0.8584712743759155, "num_tokens": 820542132.0, "step": 5148 }, { "epoch": 2.6190233977619535, "grad_norm": 0.8438816070556641, "learning_rate": 1e-05, "loss": 0.4238, "mean_token_accuracy": 0.8626097440719604, "num_tokens": 820707454.0, "step": 5149 }, { "epoch": 2.619532044760936, "grad_norm": 0.8659455180168152, "learning_rate": 1e-05, "loss": 0.4042, "mean_token_accuracy": 0.8677513599395752, "num_tokens": 820860446.0, "step": 5150 }, { "epoch": 2.6200406917599186, "grad_norm": 0.8488020896911621, "learning_rate": 1e-05, "loss": 0.4481, "mean_token_accuracy": 0.8547829985618591, "num_tokens": 821018836.0, "step": 5151 }, { "epoch": 2.6205493387589014, "grad_norm": 1.1235007047653198, "learning_rate": 1e-05, "loss": 0.4813, "mean_token_accuracy": 0.8466203212738037, "num_tokens": 821181422.0, "step": 5152 }, { "epoch": 2.621057985757884, "grad_norm": 0.9657119512557983, "learning_rate": 1e-05, "loss": 0.4169, "mean_token_accuracy": 0.8650161027908325, "num_tokens": 821327762.0, "step": 5153 }, { "epoch": 2.621566632756867, "grad_norm": 0.8740357756614685, "learning_rate": 1e-05, "loss": 0.4446, "mean_token_accuracy": 0.8541836738586426, "num_tokens": 821489947.0, "step": 5154 }, { "epoch": 2.6220752797558493, "grad_norm": 0.8773635029792786, "learning_rate": 1e-05, "loss": 0.4406, "mean_token_accuracy": 0.8584322333335876, "num_tokens": 821657627.0, "step": 5155 }, { "epoch": 2.622583926754832, "grad_norm": 0.9291388392448425, "learning_rate": 1e-05, "loss": 0.4426, "mean_token_accuracy": 0.8563066720962524, "num_tokens": 821809310.0, "step": 5156 }, { "epoch": 2.623092573753815, "grad_norm": 0.869541347026825, "learning_rate": 1e-05, "loss": 0.4155, "mean_token_accuracy": 0.8639887571334839, "num_tokens": 821966193.0, "step": 5157 }, { "epoch": 2.6236012207527977, "grad_norm": 0.8881884217262268, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8515429496765137, "num_tokens": 822129562.0, "step": 5158 }, { "epoch": 2.6241098677517805, "grad_norm": 0.9253519177436829, "learning_rate": 1e-05, "loss": 0.397, "mean_token_accuracy": 0.869998574256897, "num_tokens": 822287065.0, "step": 5159 }, { "epoch": 2.624618514750763, "grad_norm": 0.8654248118400574, "learning_rate": 1e-05, "loss": 0.4307, "mean_token_accuracy": 0.8618375062942505, "num_tokens": 822441494.0, "step": 5160 }, { "epoch": 2.6251271617497456, "grad_norm": 0.9040758609771729, "learning_rate": 1e-05, "loss": 0.4033, "mean_token_accuracy": 0.8672125339508057, "num_tokens": 822603565.0, "step": 5161 }, { "epoch": 2.6256358087487284, "grad_norm": 0.9012642502784729, "learning_rate": 1e-05, "loss": 0.4616, "mean_token_accuracy": 0.849787175655365, "num_tokens": 822761536.0, "step": 5162 }, { "epoch": 2.626144455747711, "grad_norm": 0.88527512550354, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8527937531471252, "num_tokens": 822927303.0, "step": 5163 }, { "epoch": 2.626653102746694, "grad_norm": 0.906010091304779, "learning_rate": 1e-05, "loss": 0.4459, "mean_token_accuracy": 0.8562264442443848, "num_tokens": 823092906.0, "step": 5164 }, { "epoch": 2.6271617497456763, "grad_norm": 0.8845453262329102, "learning_rate": 1e-05, "loss": 0.444, "mean_token_accuracy": 0.8577530980110168, "num_tokens": 823249279.0, "step": 5165 }, { "epoch": 2.627670396744659, "grad_norm": 0.8868412375450134, "learning_rate": 1e-05, "loss": 0.4302, "mean_token_accuracy": 0.860711395740509, "num_tokens": 823403147.0, "step": 5166 }, { "epoch": 2.628179043743642, "grad_norm": 0.9444741010665894, "learning_rate": 1e-05, "loss": 0.449, "mean_token_accuracy": 0.8536720871925354, "num_tokens": 823545931.0, "step": 5167 }, { "epoch": 2.6286876907426246, "grad_norm": 0.907468318939209, "learning_rate": 1e-05, "loss": 0.4335, "mean_token_accuracy": 0.859333872795105, "num_tokens": 823694330.0, "step": 5168 }, { "epoch": 2.6291963377416074, "grad_norm": 0.8744403719902039, "learning_rate": 1e-05, "loss": 0.414, "mean_token_accuracy": 0.863753080368042, "num_tokens": 823851802.0, "step": 5169 }, { "epoch": 2.62970498474059, "grad_norm": 0.8766217827796936, "learning_rate": 1e-05, "loss": 0.4386, "mean_token_accuracy": 0.8578379154205322, "num_tokens": 824009139.0, "step": 5170 }, { "epoch": 2.630213631739573, "grad_norm": 0.9645333290100098, "learning_rate": 1e-05, "loss": 0.4162, "mean_token_accuracy": 0.8645223379135132, "num_tokens": 824173599.0, "step": 5171 }, { "epoch": 2.6307222787385554, "grad_norm": 0.8777177333831787, "learning_rate": 1e-05, "loss": 0.4152, "mean_token_accuracy": 0.8644059896469116, "num_tokens": 824329297.0, "step": 5172 }, { "epoch": 2.631230925737538, "grad_norm": 0.8273940086364746, "learning_rate": 1e-05, "loss": 0.4412, "mean_token_accuracy": 0.8570989966392517, "num_tokens": 824511087.0, "step": 5173 }, { "epoch": 2.631739572736521, "grad_norm": 0.9432076215744019, "learning_rate": 1e-05, "loss": 0.4511, "mean_token_accuracy": 0.8537647724151611, "num_tokens": 824673484.0, "step": 5174 }, { "epoch": 2.6322482197355037, "grad_norm": 0.848598837852478, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.8540287017822266, "num_tokens": 824843437.0, "step": 5175 }, { "epoch": 2.6327568667344865, "grad_norm": 0.963729739189148, "learning_rate": 1e-05, "loss": 0.4421, "mean_token_accuracy": 0.8569854497909546, "num_tokens": 825002632.0, "step": 5176 }, { "epoch": 2.633265513733469, "grad_norm": 0.8712653517723083, "learning_rate": 1e-05, "loss": 0.4404, "mean_token_accuracy": 0.8580498695373535, "num_tokens": 825163042.0, "step": 5177 }, { "epoch": 2.6337741607324516, "grad_norm": 0.8973901867866516, "learning_rate": 1e-05, "loss": 0.442, "mean_token_accuracy": 0.8566519021987915, "num_tokens": 825320835.0, "step": 5178 }, { "epoch": 2.6342828077314344, "grad_norm": 0.8860601782798767, "learning_rate": 1e-05, "loss": 0.4357, "mean_token_accuracy": 0.8590834736824036, "num_tokens": 825488497.0, "step": 5179 }, { "epoch": 2.634791454730417, "grad_norm": 0.8676003217697144, "learning_rate": 1e-05, "loss": 0.4228, "mean_token_accuracy": 0.8617715835571289, "num_tokens": 825642437.0, "step": 5180 }, { "epoch": 2.6353001017294, "grad_norm": 0.9539165496826172, "learning_rate": 1e-05, "loss": 0.4409, "mean_token_accuracy": 0.8583078384399414, "num_tokens": 825798090.0, "step": 5181 }, { "epoch": 2.6358087487283823, "grad_norm": 0.8959965705871582, "learning_rate": 1e-05, "loss": 0.4679, "mean_token_accuracy": 0.8503422737121582, "num_tokens": 825949889.0, "step": 5182 }, { "epoch": 2.636317395727365, "grad_norm": 0.9380295872688293, "learning_rate": 1e-05, "loss": 0.4315, "mean_token_accuracy": 0.859856903553009, "num_tokens": 826089977.0, "step": 5183 }, { "epoch": 2.636826042726348, "grad_norm": 0.924473762512207, "learning_rate": 1e-05, "loss": 0.4188, "mean_token_accuracy": 0.8627535104751587, "num_tokens": 826249106.0, "step": 5184 }, { "epoch": 2.6373346897253307, "grad_norm": 0.9031071066856384, "learning_rate": 1e-05, "loss": 0.4491, "mean_token_accuracy": 0.8555418252944946, "num_tokens": 826417254.0, "step": 5185 }, { "epoch": 2.6378433367243135, "grad_norm": 0.8903203010559082, "learning_rate": 1e-05, "loss": 0.4263, "mean_token_accuracy": 0.8602005839347839, "num_tokens": 826566435.0, "step": 5186 }, { "epoch": 2.638351983723296, "grad_norm": 0.941855788230896, "learning_rate": 1e-05, "loss": 0.4503, "mean_token_accuracy": 0.8543573617935181, "num_tokens": 826722136.0, "step": 5187 }, { "epoch": 2.6388606307222786, "grad_norm": 1.0012236833572388, "learning_rate": 1e-05, "loss": 0.4267, "mean_token_accuracy": 0.8614889979362488, "num_tokens": 826880946.0, "step": 5188 }, { "epoch": 2.6393692777212614, "grad_norm": 0.9274325370788574, "learning_rate": 1e-05, "loss": 0.4573, "mean_token_accuracy": 0.8530974984169006, "num_tokens": 827041777.0, "step": 5189 }, { "epoch": 2.639877924720244, "grad_norm": 0.8896151185035706, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8657070994377136, "num_tokens": 827204054.0, "step": 5190 }, { "epoch": 2.640386571719227, "grad_norm": 0.8736908435821533, "learning_rate": 1e-05, "loss": 0.4105, "mean_token_accuracy": 0.8652385473251343, "num_tokens": 827369516.0, "step": 5191 }, { "epoch": 2.6408952187182093, "grad_norm": 0.8775960206985474, "learning_rate": 1e-05, "loss": 0.4167, "mean_token_accuracy": 0.863031268119812, "num_tokens": 827526209.0, "step": 5192 }, { "epoch": 2.6414038657171925, "grad_norm": 0.9726622104644775, "learning_rate": 1e-05, "loss": 0.4312, "mean_token_accuracy": 0.8588466048240662, "num_tokens": 827678096.0, "step": 5193 }, { "epoch": 2.641912512716175, "grad_norm": 0.8615018725395203, "learning_rate": 1e-05, "loss": 0.4055, "mean_token_accuracy": 0.8670532703399658, "num_tokens": 827841027.0, "step": 5194 }, { "epoch": 2.6424211597151577, "grad_norm": 0.9759061336517334, "learning_rate": 1e-05, "loss": 0.4488, "mean_token_accuracy": 0.8542364835739136, "num_tokens": 828000994.0, "step": 5195 }, { "epoch": 2.6429298067141405, "grad_norm": 0.9128785729408264, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8505659699440002, "num_tokens": 828167926.0, "step": 5196 }, { "epoch": 2.6434384537131232, "grad_norm": 0.9594423770904541, "learning_rate": 1e-05, "loss": 0.409, "mean_token_accuracy": 0.8666039705276489, "num_tokens": 828333229.0, "step": 5197 }, { "epoch": 2.643947100712106, "grad_norm": 1.0733044147491455, "learning_rate": 1e-05, "loss": 0.4618, "mean_token_accuracy": 0.8506227731704712, "num_tokens": 828492310.0, "step": 5198 }, { "epoch": 2.6444557477110884, "grad_norm": 0.9024893045425415, "learning_rate": 1e-05, "loss": 0.4295, "mean_token_accuracy": 0.8601677417755127, "num_tokens": 828659357.0, "step": 5199 }, { "epoch": 2.644964394710071, "grad_norm": 1.140589952468872, "learning_rate": 1e-05, "loss": 0.44, "mean_token_accuracy": 0.8560845851898193, "num_tokens": 828811219.0, "step": 5200 }, { "epoch": 2.645473041709054, "grad_norm": 0.9783995747566223, "learning_rate": 1e-05, "loss": 0.4132, "mean_token_accuracy": 0.8665125370025635, "num_tokens": 828964453.0, "step": 5201 }, { "epoch": 2.6459816887080367, "grad_norm": 1.0958985090255737, "learning_rate": 1e-05, "loss": 0.4041, "mean_token_accuracy": 0.8671479821205139, "num_tokens": 829121022.0, "step": 5202 }, { "epoch": 2.6464903357070195, "grad_norm": 1.1051671504974365, "learning_rate": 1e-05, "loss": 0.4448, "mean_token_accuracy": 0.8555877804756165, "num_tokens": 829285283.0, "step": 5203 }, { "epoch": 2.646998982706002, "grad_norm": 0.8696182370185852, "learning_rate": 1e-05, "loss": 0.4355, "mean_token_accuracy": 0.8576204776763916, "num_tokens": 829442232.0, "step": 5204 }, { "epoch": 2.6475076297049847, "grad_norm": 1.0274218320846558, "learning_rate": 1e-05, "loss": 0.4337, "mean_token_accuracy": 0.8589468598365784, "num_tokens": 829597449.0, "step": 5205 }, { "epoch": 2.6480162767039674, "grad_norm": 0.9053404331207275, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8558415174484253, "num_tokens": 829766930.0, "step": 5206 }, { "epoch": 2.6485249237029502, "grad_norm": 0.900059163570404, "learning_rate": 1e-05, "loss": 0.4459, "mean_token_accuracy": 0.8552563190460205, "num_tokens": 829930119.0, "step": 5207 }, { "epoch": 2.649033570701933, "grad_norm": 1.030701994895935, "learning_rate": 1e-05, "loss": 0.4379, "mean_token_accuracy": 0.8582024574279785, "num_tokens": 830084634.0, "step": 5208 }, { "epoch": 2.6495422177009154, "grad_norm": 0.8864624500274658, "learning_rate": 1e-05, "loss": 0.4398, "mean_token_accuracy": 0.8589258790016174, "num_tokens": 830249760.0, "step": 5209 }, { "epoch": 2.650050864699898, "grad_norm": 0.9129674434661865, "learning_rate": 1e-05, "loss": 0.395, "mean_token_accuracy": 0.8705196380615234, "num_tokens": 830411922.0, "step": 5210 }, { "epoch": 2.650559511698881, "grad_norm": 0.9383035898208618, "learning_rate": 1e-05, "loss": 0.455, "mean_token_accuracy": 0.8523730635643005, "num_tokens": 830575618.0, "step": 5211 }, { "epoch": 2.6510681586978637, "grad_norm": 1.006765604019165, "learning_rate": 1e-05, "loss": 0.4102, "mean_token_accuracy": 0.866306722164154, "num_tokens": 830725899.0, "step": 5212 }, { "epoch": 2.6515768056968465, "grad_norm": 0.8999183773994446, "learning_rate": 1e-05, "loss": 0.4255, "mean_token_accuracy": 0.8611314296722412, "num_tokens": 830890593.0, "step": 5213 }, { "epoch": 2.652085452695829, "grad_norm": 0.8834280371665955, "learning_rate": 1e-05, "loss": 0.4258, "mean_token_accuracy": 0.8608800768852234, "num_tokens": 831048409.0, "step": 5214 }, { "epoch": 2.6525940996948116, "grad_norm": 0.9180814623832703, "learning_rate": 1e-05, "loss": 0.4414, "mean_token_accuracy": 0.8565956354141235, "num_tokens": 831204167.0, "step": 5215 }, { "epoch": 2.6531027466937944, "grad_norm": 0.9318739175796509, "learning_rate": 1e-05, "loss": 0.4682, "mean_token_accuracy": 0.8499984741210938, "num_tokens": 831363323.0, "step": 5216 }, { "epoch": 2.653611393692777, "grad_norm": 0.8888118863105774, "learning_rate": 1e-05, "loss": 0.4623, "mean_token_accuracy": 0.8512741327285767, "num_tokens": 831519336.0, "step": 5217 }, { "epoch": 2.65412004069176, "grad_norm": 0.9057247638702393, "learning_rate": 1e-05, "loss": 0.434, "mean_token_accuracy": 0.8602588176727295, "num_tokens": 831680359.0, "step": 5218 }, { "epoch": 2.654628687690743, "grad_norm": 0.9005915522575378, "learning_rate": 1e-05, "loss": 0.4847, "mean_token_accuracy": 0.8457971811294556, "num_tokens": 831848068.0, "step": 5219 }, { "epoch": 2.6551373346897256, "grad_norm": 0.9270437359809875, "learning_rate": 1e-05, "loss": 0.4351, "mean_token_accuracy": 0.8590574264526367, "num_tokens": 831993691.0, "step": 5220 }, { "epoch": 2.655645981688708, "grad_norm": 0.8919311761856079, "learning_rate": 1e-05, "loss": 0.4397, "mean_token_accuracy": 0.8572813272476196, "num_tokens": 832163882.0, "step": 5221 }, { "epoch": 2.6561546286876907, "grad_norm": 0.833203136920929, "learning_rate": 1e-05, "loss": 0.4304, "mean_token_accuracy": 0.8595755100250244, "num_tokens": 832317882.0, "step": 5222 }, { "epoch": 2.6566632756866735, "grad_norm": 0.8920441269874573, "learning_rate": 1e-05, "loss": 0.4583, "mean_token_accuracy": 0.8540435433387756, "num_tokens": 832484402.0, "step": 5223 }, { "epoch": 2.6571719226856563, "grad_norm": 0.8928776979446411, "learning_rate": 1e-05, "loss": 0.4392, "mean_token_accuracy": 0.857858419418335, "num_tokens": 832638879.0, "step": 5224 }, { "epoch": 2.657680569684639, "grad_norm": 0.8863247036933899, "learning_rate": 1e-05, "loss": 0.4161, "mean_token_accuracy": 0.8645328879356384, "num_tokens": 832789130.0, "step": 5225 }, { "epoch": 2.6581892166836214, "grad_norm": 1.0162090063095093, "learning_rate": 1e-05, "loss": 0.4604, "mean_token_accuracy": 0.8517112731933594, "num_tokens": 832949967.0, "step": 5226 }, { "epoch": 2.658697863682604, "grad_norm": 0.9088463187217712, "learning_rate": 1e-05, "loss": 0.4341, "mean_token_accuracy": 0.8591833710670471, "num_tokens": 833119872.0, "step": 5227 }, { "epoch": 2.659206510681587, "grad_norm": 0.8535668253898621, "learning_rate": 1e-05, "loss": 0.4602, "mean_token_accuracy": 0.8516002893447876, "num_tokens": 833279512.0, "step": 5228 }, { "epoch": 2.6597151576805698, "grad_norm": 1.0515375137329102, "learning_rate": 1e-05, "loss": 0.4635, "mean_token_accuracy": 0.8504531383514404, "num_tokens": 833433563.0, "step": 5229 }, { "epoch": 2.6602238046795526, "grad_norm": 0.910163938999176, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8567389249801636, "num_tokens": 833592791.0, "step": 5230 }, { "epoch": 2.660732451678535, "grad_norm": 0.9681553840637207, "learning_rate": 1e-05, "loss": 0.4037, "mean_token_accuracy": 0.8662945628166199, "num_tokens": 833763220.0, "step": 5231 }, { "epoch": 2.6612410986775177, "grad_norm": 0.8657732009887695, "learning_rate": 1e-05, "loss": 0.4031, "mean_token_accuracy": 0.8685414791107178, "num_tokens": 833906060.0, "step": 5232 }, { "epoch": 2.6617497456765005, "grad_norm": 0.9288004040718079, "learning_rate": 1e-05, "loss": 0.4358, "mean_token_accuracy": 0.8585642576217651, "num_tokens": 834064692.0, "step": 5233 }, { "epoch": 2.6622583926754833, "grad_norm": 0.8908243179321289, "learning_rate": 1e-05, "loss": 0.4487, "mean_token_accuracy": 0.8539721965789795, "num_tokens": 834238761.0, "step": 5234 }, { "epoch": 2.662767039674466, "grad_norm": 0.8896021842956543, "learning_rate": 1e-05, "loss": 0.4372, "mean_token_accuracy": 0.8590289354324341, "num_tokens": 834394611.0, "step": 5235 }, { "epoch": 2.6632756866734484, "grad_norm": 0.9499205350875854, "learning_rate": 1e-05, "loss": 0.436, "mean_token_accuracy": 0.8576411008834839, "num_tokens": 834556371.0, "step": 5236 }, { "epoch": 2.663784333672431, "grad_norm": 0.9268885254859924, "learning_rate": 1e-05, "loss": 0.4321, "mean_token_accuracy": 0.859523355960846, "num_tokens": 834720648.0, "step": 5237 }, { "epoch": 2.664292980671414, "grad_norm": 0.973332941532135, "learning_rate": 1e-05, "loss": 0.4416, "mean_token_accuracy": 0.8574525117874146, "num_tokens": 834886463.0, "step": 5238 }, { "epoch": 2.6648016276703967, "grad_norm": 0.8768401741981506, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8543452620506287, "num_tokens": 835045036.0, "step": 5239 }, { "epoch": 2.6653102746693795, "grad_norm": 0.9415298700332642, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.8475942611694336, "num_tokens": 835210506.0, "step": 5240 }, { "epoch": 2.6658189216683623, "grad_norm": 0.8788790106773376, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8509476184844971, "num_tokens": 835378216.0, "step": 5241 }, { "epoch": 2.666327568667345, "grad_norm": 0.9069360494613647, "learning_rate": 1e-05, "loss": 0.4477, "mean_token_accuracy": 0.8549486994743347, "num_tokens": 835541684.0, "step": 5242 }, { "epoch": 2.6668362156663274, "grad_norm": 0.8734854459762573, "learning_rate": 1e-05, "loss": 0.4218, "mean_token_accuracy": 0.862857460975647, "num_tokens": 835698814.0, "step": 5243 }, { "epoch": 2.6673448626653102, "grad_norm": 0.8554462790489197, "learning_rate": 1e-05, "loss": 0.4067, "mean_token_accuracy": 0.8660014867782593, "num_tokens": 835845878.0, "step": 5244 }, { "epoch": 2.667853509664293, "grad_norm": 0.9750162959098816, "learning_rate": 1e-05, "loss": 0.4122, "mean_token_accuracy": 0.865595281124115, "num_tokens": 836006330.0, "step": 5245 }, { "epoch": 2.668362156663276, "grad_norm": 0.8666068315505981, "learning_rate": 1e-05, "loss": 0.4482, "mean_token_accuracy": 0.8550605773925781, "num_tokens": 836171735.0, "step": 5246 }, { "epoch": 2.6688708036622586, "grad_norm": 1.0751583576202393, "learning_rate": 1e-05, "loss": 0.4291, "mean_token_accuracy": 0.8609598875045776, "num_tokens": 836330652.0, "step": 5247 }, { "epoch": 2.669379450661241, "grad_norm": 0.9035525321960449, "learning_rate": 1e-05, "loss": 0.4314, "mean_token_accuracy": 0.8590593338012695, "num_tokens": 836490407.0, "step": 5248 }, { "epoch": 2.6698880976602237, "grad_norm": 0.8673198223114014, "learning_rate": 1e-05, "loss": 0.4581, "mean_token_accuracy": 0.853705644607544, "num_tokens": 836657627.0, "step": 5249 }, { "epoch": 2.6703967446592065, "grad_norm": 0.8926166296005249, "learning_rate": 1e-05, "loss": 0.4604, "mean_token_accuracy": 0.8502712845802307, "num_tokens": 836806889.0, "step": 5250 }, { "epoch": 2.6709053916581893, "grad_norm": 0.8546720147132874, "learning_rate": 1e-05, "loss": 0.4045, "mean_token_accuracy": 0.8660810589790344, "num_tokens": 836957002.0, "step": 5251 }, { "epoch": 2.671414038657172, "grad_norm": 0.8993261456489563, "learning_rate": 1e-05, "loss": 0.4193, "mean_token_accuracy": 0.8635987043380737, "num_tokens": 837114999.0, "step": 5252 }, { "epoch": 2.6719226856561544, "grad_norm": 0.835578203201294, "learning_rate": 1e-05, "loss": 0.4183, "mean_token_accuracy": 0.8642175197601318, "num_tokens": 837278032.0, "step": 5253 }, { "epoch": 2.672431332655137, "grad_norm": 0.8844635486602783, "learning_rate": 1e-05, "loss": 0.4582, "mean_token_accuracy": 0.8512252569198608, "num_tokens": 837446232.0, "step": 5254 }, { "epoch": 2.67293997965412, "grad_norm": 0.9081737399101257, "learning_rate": 1e-05, "loss": 0.4563, "mean_token_accuracy": 0.8528986573219299, "num_tokens": 837606045.0, "step": 5255 }, { "epoch": 2.673448626653103, "grad_norm": 0.8913959860801697, "learning_rate": 1e-05, "loss": 0.4278, "mean_token_accuracy": 0.8601134419441223, "num_tokens": 837755582.0, "step": 5256 }, { "epoch": 2.6739572736520856, "grad_norm": 0.8811616897583008, "learning_rate": 1e-05, "loss": 0.4181, "mean_token_accuracy": 0.8632052540779114, "num_tokens": 837919220.0, "step": 5257 }, { "epoch": 2.674465920651068, "grad_norm": 0.9206438660621643, "learning_rate": 1e-05, "loss": 0.4445, "mean_token_accuracy": 0.8559415340423584, "num_tokens": 838095626.0, "step": 5258 }, { "epoch": 2.6749745676500507, "grad_norm": 0.8314093947410583, "learning_rate": 1e-05, "loss": 0.4218, "mean_token_accuracy": 0.8621360659599304, "num_tokens": 838261047.0, "step": 5259 }, { "epoch": 2.6754832146490335, "grad_norm": 0.8358904123306274, "learning_rate": 1e-05, "loss": 0.4264, "mean_token_accuracy": 0.8617807030677795, "num_tokens": 838424701.0, "step": 5260 }, { "epoch": 2.6759918616480163, "grad_norm": 0.9278913140296936, "learning_rate": 1e-05, "loss": 0.4137, "mean_token_accuracy": 0.8653403520584106, "num_tokens": 838578163.0, "step": 5261 }, { "epoch": 2.676500508646999, "grad_norm": 0.8403149843215942, "learning_rate": 1e-05, "loss": 0.389, "mean_token_accuracy": 0.8716791868209839, "num_tokens": 838749312.0, "step": 5262 }, { "epoch": 2.677009155645982, "grad_norm": 0.8926715850830078, "learning_rate": 1e-05, "loss": 0.4179, "mean_token_accuracy": 0.8627158403396606, "num_tokens": 838907950.0, "step": 5263 }, { "epoch": 2.6775178026449646, "grad_norm": 0.9633036851882935, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8537728786468506, "num_tokens": 839056583.0, "step": 5264 }, { "epoch": 2.678026449643947, "grad_norm": 0.8729433417320251, "learning_rate": 1e-05, "loss": 0.4325, "mean_token_accuracy": 0.8604938983917236, "num_tokens": 839210164.0, "step": 5265 }, { "epoch": 2.6785350966429298, "grad_norm": 0.8906210660934448, "learning_rate": 1e-05, "loss": 0.4153, "mean_token_accuracy": 0.8655133247375488, "num_tokens": 839375666.0, "step": 5266 }, { "epoch": 2.6790437436419126, "grad_norm": 0.9205353856086731, "learning_rate": 1e-05, "loss": 0.4236, "mean_token_accuracy": 0.8611639738082886, "num_tokens": 839530690.0, "step": 5267 }, { "epoch": 2.6795523906408953, "grad_norm": 0.8499439358711243, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.8617724776268005, "num_tokens": 839686915.0, "step": 5268 }, { "epoch": 2.680061037639878, "grad_norm": 0.9117785692214966, "learning_rate": 1e-05, "loss": 0.4421, "mean_token_accuracy": 0.856441855430603, "num_tokens": 839849116.0, "step": 5269 }, { "epoch": 2.6805696846388605, "grad_norm": 0.9000932574272156, "learning_rate": 1e-05, "loss": 0.4369, "mean_token_accuracy": 0.8586184978485107, "num_tokens": 840014745.0, "step": 5270 }, { "epoch": 2.6810783316378433, "grad_norm": 0.9353579878807068, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8558241724967957, "num_tokens": 840164253.0, "step": 5271 }, { "epoch": 2.681586978636826, "grad_norm": 0.9166135191917419, "learning_rate": 1e-05, "loss": 0.4654, "mean_token_accuracy": 0.8499311804771423, "num_tokens": 840328107.0, "step": 5272 }, { "epoch": 2.682095625635809, "grad_norm": 0.8736609816551208, "learning_rate": 1e-05, "loss": 0.4139, "mean_token_accuracy": 0.8634559512138367, "num_tokens": 840486789.0, "step": 5273 }, { "epoch": 2.6826042726347916, "grad_norm": 0.907143771648407, "learning_rate": 1e-05, "loss": 0.4289, "mean_token_accuracy": 0.8602548837661743, "num_tokens": 840645654.0, "step": 5274 }, { "epoch": 2.683112919633774, "grad_norm": 0.9404701590538025, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.8499409556388855, "num_tokens": 840794418.0, "step": 5275 }, { "epoch": 2.6836215666327567, "grad_norm": 0.9110710024833679, "learning_rate": 1e-05, "loss": 0.4306, "mean_token_accuracy": 0.858879566192627, "num_tokens": 840940420.0, "step": 5276 }, { "epoch": 2.6841302136317395, "grad_norm": 0.8523752689361572, "learning_rate": 1e-05, "loss": 0.442, "mean_token_accuracy": 0.855581521987915, "num_tokens": 841097573.0, "step": 5277 }, { "epoch": 2.6846388606307223, "grad_norm": 0.9715843796730042, "learning_rate": 1e-05, "loss": 0.4193, "mean_token_accuracy": 0.8620656728744507, "num_tokens": 841254500.0, "step": 5278 }, { "epoch": 2.685147507629705, "grad_norm": 0.9293642044067383, "learning_rate": 1e-05, "loss": 0.4394, "mean_token_accuracy": 0.8580410480499268, "num_tokens": 841405456.0, "step": 5279 }, { "epoch": 2.6856561546286875, "grad_norm": 0.8933702111244202, "learning_rate": 1e-05, "loss": 0.4288, "mean_token_accuracy": 0.8603155016899109, "num_tokens": 841560795.0, "step": 5280 }, { "epoch": 2.6861648016276702, "grad_norm": 0.9727333188056946, "learning_rate": 1e-05, "loss": 0.4187, "mean_token_accuracy": 0.8624151945114136, "num_tokens": 841717082.0, "step": 5281 }, { "epoch": 2.686673448626653, "grad_norm": 0.9016296863555908, "learning_rate": 1e-05, "loss": 0.4727, "mean_token_accuracy": 0.8495412468910217, "num_tokens": 841878106.0, "step": 5282 }, { "epoch": 2.687182095625636, "grad_norm": 0.8928479552268982, "learning_rate": 1e-05, "loss": 0.4379, "mean_token_accuracy": 0.8579577207565308, "num_tokens": 842040480.0, "step": 5283 }, { "epoch": 2.6876907426246186, "grad_norm": 1.0036600828170776, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8550857305526733, "num_tokens": 842189540.0, "step": 5284 }, { "epoch": 2.688199389623601, "grad_norm": 0.8805148005485535, "learning_rate": 1e-05, "loss": 0.445, "mean_token_accuracy": 0.8565136194229126, "num_tokens": 842345555.0, "step": 5285 }, { "epoch": 2.688708036622584, "grad_norm": 0.905439019203186, "learning_rate": 1e-05, "loss": 0.408, "mean_token_accuracy": 0.8662591576576233, "num_tokens": 842497226.0, "step": 5286 }, { "epoch": 2.6892166836215665, "grad_norm": 0.9065870046615601, "learning_rate": 1e-05, "loss": 0.4245, "mean_token_accuracy": 0.8618921637535095, "num_tokens": 842654020.0, "step": 5287 }, { "epoch": 2.6897253306205493, "grad_norm": 0.8972324132919312, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8648066520690918, "num_tokens": 842814612.0, "step": 5288 }, { "epoch": 2.690233977619532, "grad_norm": 0.9083245992660522, "learning_rate": 1e-05, "loss": 0.4448, "mean_token_accuracy": 0.855293869972229, "num_tokens": 842969822.0, "step": 5289 }, { "epoch": 2.690742624618515, "grad_norm": 1.0097336769104004, "learning_rate": 1e-05, "loss": 0.4653, "mean_token_accuracy": 0.8518034219741821, "num_tokens": 843117888.0, "step": 5290 }, { "epoch": 2.6912512716174977, "grad_norm": 0.8871720433235168, "learning_rate": 1e-05, "loss": 0.4348, "mean_token_accuracy": 0.858801007270813, "num_tokens": 843276391.0, "step": 5291 }, { "epoch": 2.69175991861648, "grad_norm": 0.8812194466590881, "learning_rate": 1e-05, "loss": 0.4198, "mean_token_accuracy": 0.8638508319854736, "num_tokens": 843448573.0, "step": 5292 }, { "epoch": 2.692268565615463, "grad_norm": 0.9194967150688171, "learning_rate": 1e-05, "loss": 0.4313, "mean_token_accuracy": 0.8597676157951355, "num_tokens": 843610730.0, "step": 5293 }, { "epoch": 2.6927772126144456, "grad_norm": 0.963604211807251, "learning_rate": 1e-05, "loss": 0.4305, "mean_token_accuracy": 0.8583006858825684, "num_tokens": 843753683.0, "step": 5294 }, { "epoch": 2.6932858596134284, "grad_norm": 0.9230378866195679, "learning_rate": 1e-05, "loss": 0.4204, "mean_token_accuracy": 0.8627330660820007, "num_tokens": 843921987.0, "step": 5295 }, { "epoch": 2.693794506612411, "grad_norm": 0.8835469484329224, "learning_rate": 1e-05, "loss": 0.4258, "mean_token_accuracy": 0.8617545962333679, "num_tokens": 844084227.0, "step": 5296 }, { "epoch": 2.6943031536113935, "grad_norm": 1.0125384330749512, "learning_rate": 1e-05, "loss": 0.4426, "mean_token_accuracy": 0.8542388677597046, "num_tokens": 844241768.0, "step": 5297 }, { "epoch": 2.6948118006103763, "grad_norm": 0.9577308893203735, "learning_rate": 1e-05, "loss": 0.4608, "mean_token_accuracy": 0.8525644540786743, "num_tokens": 844401669.0, "step": 5298 }, { "epoch": 2.695320447609359, "grad_norm": 0.9190992116928101, "learning_rate": 1e-05, "loss": 0.3843, "mean_token_accuracy": 0.8727595806121826, "num_tokens": 844543490.0, "step": 5299 }, { "epoch": 2.695829094608342, "grad_norm": 1.065569281578064, "learning_rate": 1e-05, "loss": 0.4478, "mean_token_accuracy": 0.8555886745452881, "num_tokens": 844700667.0, "step": 5300 }, { "epoch": 2.6963377416073246, "grad_norm": 1.1497288942337036, "learning_rate": 1e-05, "loss": 0.4772, "mean_token_accuracy": 0.8471383452415466, "num_tokens": 844859397.0, "step": 5301 }, { "epoch": 2.696846388606307, "grad_norm": 1.0540322065353394, "learning_rate": 1e-05, "loss": 0.4164, "mean_token_accuracy": 0.8639854192733765, "num_tokens": 845028014.0, "step": 5302 }, { "epoch": 2.6973550356052898, "grad_norm": 0.9754948019981384, "learning_rate": 1e-05, "loss": 0.4226, "mean_token_accuracy": 0.8623305559158325, "num_tokens": 845188251.0, "step": 5303 }, { "epoch": 2.6978636826042726, "grad_norm": 0.9905082583427429, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8608218431472778, "num_tokens": 845341860.0, "step": 5304 }, { "epoch": 2.6983723296032553, "grad_norm": 0.9921637177467346, "learning_rate": 1e-05, "loss": 0.413, "mean_token_accuracy": 0.8662896156311035, "num_tokens": 845496873.0, "step": 5305 }, { "epoch": 2.698880976602238, "grad_norm": 1.0702582597732544, "learning_rate": 1e-05, "loss": 0.453, "mean_token_accuracy": 0.8552283048629761, "num_tokens": 845655021.0, "step": 5306 }, { "epoch": 2.6993896236012205, "grad_norm": 1.0450761318206787, "learning_rate": 1e-05, "loss": 0.4211, "mean_token_accuracy": 0.8628575801849365, "num_tokens": 845820482.0, "step": 5307 }, { "epoch": 2.6998982706002037, "grad_norm": 0.9499820470809937, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8609029650688171, "num_tokens": 845981043.0, "step": 5308 }, { "epoch": 2.700406917599186, "grad_norm": 1.038619875907898, "learning_rate": 1e-05, "loss": 0.4412, "mean_token_accuracy": 0.8579244017601013, "num_tokens": 846137074.0, "step": 5309 }, { "epoch": 2.700915564598169, "grad_norm": 0.97670978307724, "learning_rate": 1e-05, "loss": 0.4531, "mean_token_accuracy": 0.8525657653808594, "num_tokens": 846289014.0, "step": 5310 }, { "epoch": 2.7014242115971516, "grad_norm": 0.8953006267547607, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8571546077728271, "num_tokens": 846463658.0, "step": 5311 }, { "epoch": 2.7019328585961344, "grad_norm": 1.0701743364334106, "learning_rate": 1e-05, "loss": 0.4394, "mean_token_accuracy": 0.8594745397567749, "num_tokens": 846620497.0, "step": 5312 }, { "epoch": 2.702441505595117, "grad_norm": 0.9389851689338684, "learning_rate": 1e-05, "loss": 0.3982, "mean_token_accuracy": 0.8693704605102539, "num_tokens": 846782407.0, "step": 5313 }, { "epoch": 2.7029501525940995, "grad_norm": 1.068837285041809, "learning_rate": 1e-05, "loss": 0.4337, "mean_token_accuracy": 0.8585376143455505, "num_tokens": 846939843.0, "step": 5314 }, { "epoch": 2.7034587995930823, "grad_norm": 0.9822969436645508, "learning_rate": 1e-05, "loss": 0.4354, "mean_token_accuracy": 0.8587103486061096, "num_tokens": 847100525.0, "step": 5315 }, { "epoch": 2.703967446592065, "grad_norm": 0.973683774471283, "learning_rate": 1e-05, "loss": 0.4403, "mean_token_accuracy": 0.8573872447013855, "num_tokens": 847258615.0, "step": 5316 }, { "epoch": 2.704476093591048, "grad_norm": 1.0660804510116577, "learning_rate": 1e-05, "loss": 0.4266, "mean_token_accuracy": 0.8625261783599854, "num_tokens": 847412916.0, "step": 5317 }, { "epoch": 2.7049847405900307, "grad_norm": 0.8933954238891602, "learning_rate": 1e-05, "loss": 0.4272, "mean_token_accuracy": 0.8619201183319092, "num_tokens": 847566136.0, "step": 5318 }, { "epoch": 2.705493387589013, "grad_norm": 1.0075711011886597, "learning_rate": 1e-05, "loss": 0.46, "mean_token_accuracy": 0.8495711088180542, "num_tokens": 847728434.0, "step": 5319 }, { "epoch": 2.706002034587996, "grad_norm": 0.9256721138954163, "learning_rate": 1e-05, "loss": 0.4518, "mean_token_accuracy": 0.8546531200408936, "num_tokens": 847880060.0, "step": 5320 }, { "epoch": 2.7065106815869786, "grad_norm": 0.9893863797187805, "learning_rate": 1e-05, "loss": 0.3943, "mean_token_accuracy": 0.8700924515724182, "num_tokens": 848022342.0, "step": 5321 }, { "epoch": 2.7070193285859614, "grad_norm": 0.9653140306472778, "learning_rate": 1e-05, "loss": 0.4541, "mean_token_accuracy": 0.8531556129455566, "num_tokens": 848174723.0, "step": 5322 }, { "epoch": 2.707527975584944, "grad_norm": 0.9758864641189575, "learning_rate": 1e-05, "loss": 0.4295, "mean_token_accuracy": 0.8584966659545898, "num_tokens": 848325984.0, "step": 5323 }, { "epoch": 2.7080366225839265, "grad_norm": 0.8586357235908508, "learning_rate": 1e-05, "loss": 0.4152, "mean_token_accuracy": 0.8653855323791504, "num_tokens": 848487841.0, "step": 5324 }, { "epoch": 2.7085452695829093, "grad_norm": 0.9456252455711365, "learning_rate": 1e-05, "loss": 0.4232, "mean_token_accuracy": 0.861905574798584, "num_tokens": 848636882.0, "step": 5325 }, { "epoch": 2.709053916581892, "grad_norm": 0.8794358372688293, "learning_rate": 1e-05, "loss": 0.4379, "mean_token_accuracy": 0.8578913807868958, "num_tokens": 848802975.0, "step": 5326 }, { "epoch": 2.709562563580875, "grad_norm": 0.8466426730155945, "learning_rate": 1e-05, "loss": 0.4398, "mean_token_accuracy": 0.8574482798576355, "num_tokens": 848967586.0, "step": 5327 }, { "epoch": 2.7100712105798577, "grad_norm": 0.9271215796470642, "learning_rate": 1e-05, "loss": 0.4292, "mean_token_accuracy": 0.8618929386138916, "num_tokens": 849119947.0, "step": 5328 }, { "epoch": 2.71057985757884, "grad_norm": 0.9016660451889038, "learning_rate": 1e-05, "loss": 0.4078, "mean_token_accuracy": 0.8671656847000122, "num_tokens": 849278651.0, "step": 5329 }, { "epoch": 2.7110885045778232, "grad_norm": 0.9124810695648193, "learning_rate": 1e-05, "loss": 0.4176, "mean_token_accuracy": 0.8643842935562134, "num_tokens": 849449730.0, "step": 5330 }, { "epoch": 2.7115971515768056, "grad_norm": 0.878895103931427, "learning_rate": 1e-05, "loss": 0.4213, "mean_token_accuracy": 0.8634142875671387, "num_tokens": 849606858.0, "step": 5331 }, { "epoch": 2.7121057985757884, "grad_norm": 0.9118165373802185, "learning_rate": 1e-05, "loss": 0.4594, "mean_token_accuracy": 0.8515703678131104, "num_tokens": 849776273.0, "step": 5332 }, { "epoch": 2.712614445574771, "grad_norm": 0.9616394639015198, "learning_rate": 1e-05, "loss": 0.4341, "mean_token_accuracy": 0.8593379259109497, "num_tokens": 849934124.0, "step": 5333 }, { "epoch": 2.713123092573754, "grad_norm": 0.9179736375808716, "learning_rate": 1e-05, "loss": 0.4465, "mean_token_accuracy": 0.855577826499939, "num_tokens": 850089497.0, "step": 5334 }, { "epoch": 2.7136317395727367, "grad_norm": 0.9623274207115173, "learning_rate": 1e-05, "loss": 0.4419, "mean_token_accuracy": 0.8587888479232788, "num_tokens": 850252982.0, "step": 5335 }, { "epoch": 2.714140386571719, "grad_norm": 0.9166063070297241, "learning_rate": 1e-05, "loss": 0.4202, "mean_token_accuracy": 0.8629820346832275, "num_tokens": 850402164.0, "step": 5336 }, { "epoch": 2.714649033570702, "grad_norm": 0.8992436528205872, "learning_rate": 1e-05, "loss": 0.4425, "mean_token_accuracy": 0.859114944934845, "num_tokens": 850563953.0, "step": 5337 }, { "epoch": 2.7151576805696847, "grad_norm": 0.8603736162185669, "learning_rate": 1e-05, "loss": 0.4544, "mean_token_accuracy": 0.854015588760376, "num_tokens": 850732466.0, "step": 5338 }, { "epoch": 2.7156663275686674, "grad_norm": 0.9173347353935242, "learning_rate": 1e-05, "loss": 0.4126, "mean_token_accuracy": 0.8645404577255249, "num_tokens": 850881803.0, "step": 5339 }, { "epoch": 2.7161749745676502, "grad_norm": 0.9201670289039612, "learning_rate": 1e-05, "loss": 0.4526, "mean_token_accuracy": 0.8551905751228333, "num_tokens": 851032068.0, "step": 5340 }, { "epoch": 2.7166836215666326, "grad_norm": 0.8854498267173767, "learning_rate": 1e-05, "loss": 0.4249, "mean_token_accuracy": 0.8620210886001587, "num_tokens": 851184436.0, "step": 5341 }, { "epoch": 2.7171922685656154, "grad_norm": 0.8970783948898315, "learning_rate": 1e-05, "loss": 0.4309, "mean_token_accuracy": 0.8604885339736938, "num_tokens": 851344640.0, "step": 5342 }, { "epoch": 2.717700915564598, "grad_norm": 0.8435708284378052, "learning_rate": 1e-05, "loss": 0.4079, "mean_token_accuracy": 0.8662682771682739, "num_tokens": 851496643.0, "step": 5343 }, { "epoch": 2.718209562563581, "grad_norm": 0.9770410060882568, "learning_rate": 1e-05, "loss": 0.4294, "mean_token_accuracy": 0.8620365858078003, "num_tokens": 851650774.0, "step": 5344 }, { "epoch": 2.7187182095625637, "grad_norm": 0.941686749458313, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8531053066253662, "num_tokens": 851807168.0, "step": 5345 }, { "epoch": 2.719226856561546, "grad_norm": 0.9545437097549438, "learning_rate": 1e-05, "loss": 0.4636, "mean_token_accuracy": 0.8520953059196472, "num_tokens": 851963425.0, "step": 5346 }, { "epoch": 2.719735503560529, "grad_norm": 0.9199371933937073, "learning_rate": 1e-05, "loss": 0.4301, "mean_token_accuracy": 0.8604966998100281, "num_tokens": 852127672.0, "step": 5347 }, { "epoch": 2.7202441505595116, "grad_norm": 0.8954357504844666, "learning_rate": 1e-05, "loss": 0.4318, "mean_token_accuracy": 0.8590489625930786, "num_tokens": 852286142.0, "step": 5348 }, { "epoch": 2.7207527975584944, "grad_norm": 0.9719069004058838, "learning_rate": 1e-05, "loss": 0.4362, "mean_token_accuracy": 0.8575436472892761, "num_tokens": 852423657.0, "step": 5349 }, { "epoch": 2.721261444557477, "grad_norm": 0.8555158376693726, "learning_rate": 1e-05, "loss": 0.4039, "mean_token_accuracy": 0.8683286905288696, "num_tokens": 852582032.0, "step": 5350 }, { "epoch": 2.7217700915564595, "grad_norm": 0.9108577966690063, "learning_rate": 1e-05, "loss": 0.4295, "mean_token_accuracy": 0.8620162010192871, "num_tokens": 852731358.0, "step": 5351 }, { "epoch": 2.722278738555443, "grad_norm": 0.9062488079071045, "learning_rate": 1e-05, "loss": 0.4061, "mean_token_accuracy": 0.8677446246147156, "num_tokens": 852884998.0, "step": 5352 }, { "epoch": 2.722787385554425, "grad_norm": 0.926082193851471, "learning_rate": 1e-05, "loss": 0.4133, "mean_token_accuracy": 0.8656351566314697, "num_tokens": 853052940.0, "step": 5353 }, { "epoch": 2.723296032553408, "grad_norm": 0.9328706860542297, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8513386845588684, "num_tokens": 853215576.0, "step": 5354 }, { "epoch": 2.7238046795523907, "grad_norm": 0.9114854335784912, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8492288589477539, "num_tokens": 853379664.0, "step": 5355 }, { "epoch": 2.7243133265513735, "grad_norm": 0.8919292092323303, "learning_rate": 1e-05, "loss": 0.4308, "mean_token_accuracy": 0.8609844446182251, "num_tokens": 853539145.0, "step": 5356 }, { "epoch": 2.7248219735503563, "grad_norm": 0.9118651151657104, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8546961545944214, "num_tokens": 853695632.0, "step": 5357 }, { "epoch": 2.7253306205493386, "grad_norm": 0.9264186024665833, "learning_rate": 1e-05, "loss": 0.3998, "mean_token_accuracy": 0.8693535923957825, "num_tokens": 853850299.0, "step": 5358 }, { "epoch": 2.7258392675483214, "grad_norm": 0.9042052626609802, "learning_rate": 1e-05, "loss": 0.4388, "mean_token_accuracy": 0.8559480309486389, "num_tokens": 854002137.0, "step": 5359 }, { "epoch": 2.726347914547304, "grad_norm": 0.918724536895752, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.8582726120948792, "num_tokens": 854162459.0, "step": 5360 }, { "epoch": 2.726856561546287, "grad_norm": 0.841949999332428, "learning_rate": 1e-05, "loss": 0.3968, "mean_token_accuracy": 0.8698896765708923, "num_tokens": 854320980.0, "step": 5361 }, { "epoch": 2.7273652085452698, "grad_norm": 0.8705538511276245, "learning_rate": 1e-05, "loss": 0.4274, "mean_token_accuracy": 0.8598239421844482, "num_tokens": 854480464.0, "step": 5362 }, { "epoch": 2.727873855544252, "grad_norm": 0.8634495139122009, "learning_rate": 1e-05, "loss": 0.4234, "mean_token_accuracy": 0.862626314163208, "num_tokens": 854639187.0, "step": 5363 }, { "epoch": 2.728382502543235, "grad_norm": 0.9334564208984375, "learning_rate": 1e-05, "loss": 0.4344, "mean_token_accuracy": 0.8592795133590698, "num_tokens": 854791740.0, "step": 5364 }, { "epoch": 2.7288911495422177, "grad_norm": 0.8462790846824646, "learning_rate": 1e-05, "loss": 0.423, "mean_token_accuracy": 0.8633471727371216, "num_tokens": 854959936.0, "step": 5365 }, { "epoch": 2.7293997965412005, "grad_norm": 0.8552737832069397, "learning_rate": 1e-05, "loss": 0.4143, "mean_token_accuracy": 0.86388099193573, "num_tokens": 855114343.0, "step": 5366 }, { "epoch": 2.7299084435401832, "grad_norm": 0.896813154220581, "learning_rate": 1e-05, "loss": 0.4213, "mean_token_accuracy": 0.8606491088867188, "num_tokens": 855274165.0, "step": 5367 }, { "epoch": 2.7304170905391656, "grad_norm": 0.8741781115531921, "learning_rate": 1e-05, "loss": 0.4101, "mean_token_accuracy": 0.8644659519195557, "num_tokens": 855437674.0, "step": 5368 }, { "epoch": 2.7309257375381484, "grad_norm": 0.915451169013977, "learning_rate": 1e-05, "loss": 0.4259, "mean_token_accuracy": 0.8615007996559143, "num_tokens": 855605378.0, "step": 5369 }, { "epoch": 2.731434384537131, "grad_norm": 0.9084851741790771, "learning_rate": 1e-05, "loss": 0.4309, "mean_token_accuracy": 0.8610782623291016, "num_tokens": 855766653.0, "step": 5370 }, { "epoch": 2.731943031536114, "grad_norm": 0.8731338977813721, "learning_rate": 1e-05, "loss": 0.4212, "mean_token_accuracy": 0.8621797561645508, "num_tokens": 855925570.0, "step": 5371 }, { "epoch": 2.7324516785350967, "grad_norm": 0.8459338545799255, "learning_rate": 1e-05, "loss": 0.4635, "mean_token_accuracy": 0.8504737615585327, "num_tokens": 856096377.0, "step": 5372 }, { "epoch": 2.732960325534079, "grad_norm": 0.8906593918800354, "learning_rate": 1e-05, "loss": 0.4381, "mean_token_accuracy": 0.857587456703186, "num_tokens": 856248152.0, "step": 5373 }, { "epoch": 2.7334689725330623, "grad_norm": 0.9160838723182678, "learning_rate": 1e-05, "loss": 0.446, "mean_token_accuracy": 0.855549156665802, "num_tokens": 856408236.0, "step": 5374 }, { "epoch": 2.7339776195320447, "grad_norm": 0.8952018618583679, "learning_rate": 1e-05, "loss": 0.4576, "mean_token_accuracy": 0.8525691628456116, "num_tokens": 856563894.0, "step": 5375 }, { "epoch": 2.7344862665310274, "grad_norm": 0.8565658926963806, "learning_rate": 1e-05, "loss": 0.4328, "mean_token_accuracy": 0.8586984276771545, "num_tokens": 856726568.0, "step": 5376 }, { "epoch": 2.7349949135300102, "grad_norm": 0.8847385048866272, "learning_rate": 1e-05, "loss": 0.4325, "mean_token_accuracy": 0.8605058193206787, "num_tokens": 856885734.0, "step": 5377 }, { "epoch": 2.735503560528993, "grad_norm": 0.9560387134552002, "learning_rate": 1e-05, "loss": 0.439, "mean_token_accuracy": 0.8562148809432983, "num_tokens": 857033785.0, "step": 5378 }, { "epoch": 2.736012207527976, "grad_norm": 0.9595639109611511, "learning_rate": 1e-05, "loss": 0.4265, "mean_token_accuracy": 0.86170893907547, "num_tokens": 857194080.0, "step": 5379 }, { "epoch": 2.736520854526958, "grad_norm": 0.8554273247718811, "learning_rate": 1e-05, "loss": 0.4337, "mean_token_accuracy": 0.8590626120567322, "num_tokens": 857354595.0, "step": 5380 }, { "epoch": 2.737029501525941, "grad_norm": 0.9065148830413818, "learning_rate": 1e-05, "loss": 0.4217, "mean_token_accuracy": 0.863584041595459, "num_tokens": 857506990.0, "step": 5381 }, { "epoch": 2.7375381485249237, "grad_norm": 0.8713171482086182, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8588833212852478, "num_tokens": 857678702.0, "step": 5382 }, { "epoch": 2.7380467955239065, "grad_norm": 0.845331072807312, "learning_rate": 1e-05, "loss": 0.4405, "mean_token_accuracy": 0.8578164577484131, "num_tokens": 857837239.0, "step": 5383 }, { "epoch": 2.7385554425228893, "grad_norm": 0.8636237978935242, "learning_rate": 1e-05, "loss": 0.4232, "mean_token_accuracy": 0.8616138696670532, "num_tokens": 858003534.0, "step": 5384 }, { "epoch": 2.7390640895218716, "grad_norm": 0.829346776008606, "learning_rate": 1e-05, "loss": 0.4485, "mean_token_accuracy": 0.8571034073829651, "num_tokens": 858184527.0, "step": 5385 }, { "epoch": 2.7395727365208544, "grad_norm": 0.8747344017028809, "learning_rate": 1e-05, "loss": 0.458, "mean_token_accuracy": 0.8544100522994995, "num_tokens": 858340027.0, "step": 5386 }, { "epoch": 2.740081383519837, "grad_norm": 0.8807426691055298, "learning_rate": 1e-05, "loss": 0.4507, "mean_token_accuracy": 0.8541029691696167, "num_tokens": 858501686.0, "step": 5387 }, { "epoch": 2.74059003051882, "grad_norm": 0.8712559938430786, "learning_rate": 1e-05, "loss": 0.4528, "mean_token_accuracy": 0.8543274402618408, "num_tokens": 858667008.0, "step": 5388 }, { "epoch": 2.741098677517803, "grad_norm": 0.9187637567520142, "learning_rate": 1e-05, "loss": 0.4588, "mean_token_accuracy": 0.8529598712921143, "num_tokens": 858829541.0, "step": 5389 }, { "epoch": 2.741607324516785, "grad_norm": 0.8646373748779297, "learning_rate": 1e-05, "loss": 0.4231, "mean_token_accuracy": 0.8623921871185303, "num_tokens": 858997524.0, "step": 5390 }, { "epoch": 2.742115971515768, "grad_norm": 0.9268781542778015, "learning_rate": 1e-05, "loss": 0.4373, "mean_token_accuracy": 0.8578705787658691, "num_tokens": 859151303.0, "step": 5391 }, { "epoch": 2.7426246185147507, "grad_norm": 0.8708943128585815, "learning_rate": 1e-05, "loss": 0.4301, "mean_token_accuracy": 0.8591492772102356, "num_tokens": 859308138.0, "step": 5392 }, { "epoch": 2.7431332655137335, "grad_norm": 0.8756990432739258, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8604567646980286, "num_tokens": 859468758.0, "step": 5393 }, { "epoch": 2.7436419125127163, "grad_norm": 0.9198269248008728, "learning_rate": 1e-05, "loss": 0.4562, "mean_token_accuracy": 0.8523392677307129, "num_tokens": 859624523.0, "step": 5394 }, { "epoch": 2.7441505595116986, "grad_norm": 0.8417802453041077, "learning_rate": 1e-05, "loss": 0.4403, "mean_token_accuracy": 0.8584886789321899, "num_tokens": 859793482.0, "step": 5395 }, { "epoch": 2.744659206510682, "grad_norm": 0.846386730670929, "learning_rate": 1e-05, "loss": 0.4292, "mean_token_accuracy": 0.8598151803016663, "num_tokens": 859964920.0, "step": 5396 }, { "epoch": 2.745167853509664, "grad_norm": 0.9259663820266724, "learning_rate": 1e-05, "loss": 0.4666, "mean_token_accuracy": 0.8496070504188538, "num_tokens": 860126092.0, "step": 5397 }, { "epoch": 2.745676500508647, "grad_norm": 0.8447114825248718, "learning_rate": 1e-05, "loss": 0.4115, "mean_token_accuracy": 0.865730881690979, "num_tokens": 860276802.0, "step": 5398 }, { "epoch": 2.7461851475076298, "grad_norm": 0.838446855545044, "learning_rate": 1e-05, "loss": 0.4466, "mean_token_accuracy": 0.8559407591819763, "num_tokens": 860439424.0, "step": 5399 }, { "epoch": 2.7466937945066126, "grad_norm": 0.8424673080444336, "learning_rate": 1e-05, "loss": 0.4253, "mean_token_accuracy": 0.860883891582489, "num_tokens": 860603738.0, "step": 5400 }, { "epoch": 2.7472024415055953, "grad_norm": 0.8394173383712769, "learning_rate": 1e-05, "loss": 0.4237, "mean_token_accuracy": 0.8619527816772461, "num_tokens": 860762795.0, "step": 5401 }, { "epoch": 2.7477110885045777, "grad_norm": 0.8642099499702454, "learning_rate": 1e-05, "loss": 0.4355, "mean_token_accuracy": 0.8594051003456116, "num_tokens": 860922428.0, "step": 5402 }, { "epoch": 2.7482197355035605, "grad_norm": 0.8128042221069336, "learning_rate": 1e-05, "loss": 0.3997, "mean_token_accuracy": 0.8694398403167725, "num_tokens": 861090451.0, "step": 5403 }, { "epoch": 2.7487283825025433, "grad_norm": 0.8231059908866882, "learning_rate": 1e-05, "loss": 0.4087, "mean_token_accuracy": 0.8662785887718201, "num_tokens": 861269373.0, "step": 5404 }, { "epoch": 2.749237029501526, "grad_norm": 0.8843866586685181, "learning_rate": 1e-05, "loss": 0.4314, "mean_token_accuracy": 0.8592203855514526, "num_tokens": 861434807.0, "step": 5405 }, { "epoch": 2.749745676500509, "grad_norm": 0.83979731798172, "learning_rate": 1e-05, "loss": 0.4496, "mean_token_accuracy": 0.8535430431365967, "num_tokens": 861599524.0, "step": 5406 }, { "epoch": 2.750254323499491, "grad_norm": 0.9112589359283447, "learning_rate": 1e-05, "loss": 0.4414, "mean_token_accuracy": 0.8580020070075989, "num_tokens": 861753733.0, "step": 5407 }, { "epoch": 2.750762970498474, "grad_norm": 0.9031829237937927, "learning_rate": 1e-05, "loss": 0.4541, "mean_token_accuracy": 0.8531202673912048, "num_tokens": 861913031.0, "step": 5408 }, { "epoch": 2.7512716174974567, "grad_norm": 0.8952576518058777, "learning_rate": 1e-05, "loss": 0.424, "mean_token_accuracy": 0.8609805107116699, "num_tokens": 862064227.0, "step": 5409 }, { "epoch": 2.7517802644964395, "grad_norm": 0.9540919661521912, "learning_rate": 1e-05, "loss": 0.4129, "mean_token_accuracy": 0.8651838302612305, "num_tokens": 862222025.0, "step": 5410 }, { "epoch": 2.7522889114954223, "grad_norm": 0.8260163068771362, "learning_rate": 1e-05, "loss": 0.4232, "mean_token_accuracy": 0.8638889789581299, "num_tokens": 862386155.0, "step": 5411 }, { "epoch": 2.7527975584944047, "grad_norm": 0.8747431039810181, "learning_rate": 1e-05, "loss": 0.4409, "mean_token_accuracy": 0.8570093512535095, "num_tokens": 862540675.0, "step": 5412 }, { "epoch": 2.7533062054933874, "grad_norm": 0.8887156844139099, "learning_rate": 1e-05, "loss": 0.4518, "mean_token_accuracy": 0.8555654287338257, "num_tokens": 862704936.0, "step": 5413 }, { "epoch": 2.7538148524923702, "grad_norm": 0.872695803642273, "learning_rate": 1e-05, "loss": 0.4384, "mean_token_accuracy": 0.8563168048858643, "num_tokens": 862862721.0, "step": 5414 }, { "epoch": 2.754323499491353, "grad_norm": 0.8667381405830383, "learning_rate": 1e-05, "loss": 0.4279, "mean_token_accuracy": 0.8610851764678955, "num_tokens": 863023723.0, "step": 5415 }, { "epoch": 2.754832146490336, "grad_norm": 0.8873748183250427, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8571150302886963, "num_tokens": 863189315.0, "step": 5416 }, { "epoch": 2.755340793489318, "grad_norm": 0.8880362510681152, "learning_rate": 1e-05, "loss": 0.4232, "mean_token_accuracy": 0.8624112606048584, "num_tokens": 863350818.0, "step": 5417 }, { "epoch": 2.7558494404883014, "grad_norm": 0.8671992421150208, "learning_rate": 1e-05, "loss": 0.4563, "mean_token_accuracy": 0.8548204898834229, "num_tokens": 863516158.0, "step": 5418 }, { "epoch": 2.7563580874872837, "grad_norm": 0.9104628562927246, "learning_rate": 1e-05, "loss": 0.4661, "mean_token_accuracy": 0.8503981232643127, "num_tokens": 863684595.0, "step": 5419 }, { "epoch": 2.7568667344862665, "grad_norm": 0.922897458076477, "learning_rate": 1e-05, "loss": 0.4607, "mean_token_accuracy": 0.8522496223449707, "num_tokens": 863840254.0, "step": 5420 }, { "epoch": 2.7573753814852493, "grad_norm": 0.8758529424667358, "learning_rate": 1e-05, "loss": 0.4421, "mean_token_accuracy": 0.856182873249054, "num_tokens": 863991638.0, "step": 5421 }, { "epoch": 2.757884028484232, "grad_norm": 0.9188312292098999, "learning_rate": 1e-05, "loss": 0.4234, "mean_token_accuracy": 0.8620766401290894, "num_tokens": 864145411.0, "step": 5422 }, { "epoch": 2.758392675483215, "grad_norm": 0.8687071204185486, "learning_rate": 1e-05, "loss": 0.3912, "mean_token_accuracy": 0.8708814382553101, "num_tokens": 864310881.0, "step": 5423 }, { "epoch": 2.758901322482197, "grad_norm": 0.8740066289901733, "learning_rate": 1e-05, "loss": 0.406, "mean_token_accuracy": 0.8678805232048035, "num_tokens": 864463131.0, "step": 5424 }, { "epoch": 2.75940996948118, "grad_norm": 0.9356034398078918, "learning_rate": 1e-05, "loss": 0.4575, "mean_token_accuracy": 0.8515549898147583, "num_tokens": 864630170.0, "step": 5425 }, { "epoch": 2.759918616480163, "grad_norm": 0.8386475443840027, "learning_rate": 1e-05, "loss": 0.4128, "mean_token_accuracy": 0.864818274974823, "num_tokens": 864802104.0, "step": 5426 }, { "epoch": 2.7604272634791456, "grad_norm": 0.914970874786377, "learning_rate": 1e-05, "loss": 0.4729, "mean_token_accuracy": 0.8484303951263428, "num_tokens": 864970249.0, "step": 5427 }, { "epoch": 2.7609359104781284, "grad_norm": 0.8303055167198181, "learning_rate": 1e-05, "loss": 0.4338, "mean_token_accuracy": 0.8578987121582031, "num_tokens": 865146102.0, "step": 5428 }, { "epoch": 2.7614445574771107, "grad_norm": 0.8983274698257446, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8555811643600464, "num_tokens": 865293014.0, "step": 5429 }, { "epoch": 2.7619532044760935, "grad_norm": 0.8957774639129639, "learning_rate": 1e-05, "loss": 0.3939, "mean_token_accuracy": 0.870738685131073, "num_tokens": 865438180.0, "step": 5430 }, { "epoch": 2.7624618514750763, "grad_norm": 0.9148992896080017, "learning_rate": 1e-05, "loss": 0.4501, "mean_token_accuracy": 0.8549180030822754, "num_tokens": 865605006.0, "step": 5431 }, { "epoch": 2.762970498474059, "grad_norm": 0.9894981384277344, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8587237596511841, "num_tokens": 865761672.0, "step": 5432 }, { "epoch": 2.763479145473042, "grad_norm": 0.9346439242362976, "learning_rate": 1e-05, "loss": 0.4403, "mean_token_accuracy": 0.8564201593399048, "num_tokens": 865913316.0, "step": 5433 }, { "epoch": 2.763987792472024, "grad_norm": 0.9156526923179626, "learning_rate": 1e-05, "loss": 0.4479, "mean_token_accuracy": 0.8553595542907715, "num_tokens": 866080300.0, "step": 5434 }, { "epoch": 2.764496439471007, "grad_norm": 0.9552758932113647, "learning_rate": 1e-05, "loss": 0.4049, "mean_token_accuracy": 0.8684354424476624, "num_tokens": 866237570.0, "step": 5435 }, { "epoch": 2.7650050864699898, "grad_norm": 0.9023818969726562, "learning_rate": 1e-05, "loss": 0.4581, "mean_token_accuracy": 0.8515496253967285, "num_tokens": 866401658.0, "step": 5436 }, { "epoch": 2.7655137334689726, "grad_norm": 0.9802160859107971, "learning_rate": 1e-05, "loss": 0.435, "mean_token_accuracy": 0.8586635589599609, "num_tokens": 866565329.0, "step": 5437 }, { "epoch": 2.7660223804679553, "grad_norm": 0.893657386302948, "learning_rate": 1e-05, "loss": 0.4258, "mean_token_accuracy": 0.8619270920753479, "num_tokens": 866735792.0, "step": 5438 }, { "epoch": 2.7665310274669377, "grad_norm": 0.9348874688148499, "learning_rate": 1e-05, "loss": 0.4513, "mean_token_accuracy": 0.8542196154594421, "num_tokens": 866903239.0, "step": 5439 }, { "epoch": 2.767039674465921, "grad_norm": 0.8428106904029846, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8591272234916687, "num_tokens": 867069334.0, "step": 5440 }, { "epoch": 2.7675483214649033, "grad_norm": 0.8445457220077515, "learning_rate": 1e-05, "loss": 0.426, "mean_token_accuracy": 0.8615500330924988, "num_tokens": 867232722.0, "step": 5441 }, { "epoch": 2.768056968463886, "grad_norm": 0.9008581042289734, "learning_rate": 1e-05, "loss": 0.4579, "mean_token_accuracy": 0.8527448177337646, "num_tokens": 867389752.0, "step": 5442 }, { "epoch": 2.768565615462869, "grad_norm": 0.8951466083526611, "learning_rate": 1e-05, "loss": 0.4637, "mean_token_accuracy": 0.8508716821670532, "num_tokens": 867550520.0, "step": 5443 }, { "epoch": 2.7690742624618516, "grad_norm": 0.8982107043266296, "learning_rate": 1e-05, "loss": 0.4459, "mean_token_accuracy": 0.8546098470687866, "num_tokens": 867692997.0, "step": 5444 }, { "epoch": 2.7695829094608344, "grad_norm": 0.8363794088363647, "learning_rate": 1e-05, "loss": 0.4296, "mean_token_accuracy": 0.8605350255966187, "num_tokens": 867864658.0, "step": 5445 }, { "epoch": 2.7700915564598168, "grad_norm": 0.9093189835548401, "learning_rate": 1e-05, "loss": 0.4296, "mean_token_accuracy": 0.8598928451538086, "num_tokens": 868020290.0, "step": 5446 }, { "epoch": 2.7706002034587995, "grad_norm": 0.8726778626441956, "learning_rate": 1e-05, "loss": 0.4449, "mean_token_accuracy": 0.8547472953796387, "num_tokens": 868176893.0, "step": 5447 }, { "epoch": 2.7711088504577823, "grad_norm": 0.9353107213973999, "learning_rate": 1e-05, "loss": 0.4366, "mean_token_accuracy": 0.8588535785675049, "num_tokens": 868330610.0, "step": 5448 }, { "epoch": 2.771617497456765, "grad_norm": 0.8884314298629761, "learning_rate": 1e-05, "loss": 0.444, "mean_token_accuracy": 0.8559328317642212, "num_tokens": 868486653.0, "step": 5449 }, { "epoch": 2.772126144455748, "grad_norm": 0.8668583631515503, "learning_rate": 1e-05, "loss": 0.4361, "mean_token_accuracy": 0.8602858185768127, "num_tokens": 868649173.0, "step": 5450 }, { "epoch": 2.7726347914547302, "grad_norm": 0.8272715210914612, "learning_rate": 1e-05, "loss": 0.4305, "mean_token_accuracy": 0.8603529930114746, "num_tokens": 868815495.0, "step": 5451 }, { "epoch": 2.773143438453713, "grad_norm": 0.8570767045021057, "learning_rate": 1e-05, "loss": 0.425, "mean_token_accuracy": 0.8625004291534424, "num_tokens": 868970827.0, "step": 5452 }, { "epoch": 2.773652085452696, "grad_norm": 0.9126412272453308, "learning_rate": 1e-05, "loss": 0.4213, "mean_token_accuracy": 0.8636242151260376, "num_tokens": 869120830.0, "step": 5453 }, { "epoch": 2.7741607324516786, "grad_norm": 0.9080553650856018, "learning_rate": 1e-05, "loss": 0.4183, "mean_token_accuracy": 0.8638372421264648, "num_tokens": 869274987.0, "step": 5454 }, { "epoch": 2.7746693794506614, "grad_norm": 0.8964677453041077, "learning_rate": 1e-05, "loss": 0.4248, "mean_token_accuracy": 0.863158643245697, "num_tokens": 869423755.0, "step": 5455 }, { "epoch": 2.7751780264496437, "grad_norm": 0.9210769534111023, "learning_rate": 1e-05, "loss": 0.4463, "mean_token_accuracy": 0.8563529849052429, "num_tokens": 869586556.0, "step": 5456 }, { "epoch": 2.7756866734486265, "grad_norm": 0.881309449672699, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8571156859397888, "num_tokens": 869750823.0, "step": 5457 }, { "epoch": 2.7761953204476093, "grad_norm": 0.811819314956665, "learning_rate": 1e-05, "loss": 0.4407, "mean_token_accuracy": 0.85698401927948, "num_tokens": 869923056.0, "step": 5458 }, { "epoch": 2.776703967446592, "grad_norm": 0.8605573773384094, "learning_rate": 1e-05, "loss": 0.4766, "mean_token_accuracy": 0.8466289043426514, "num_tokens": 870090234.0, "step": 5459 }, { "epoch": 2.777212614445575, "grad_norm": 0.8861764669418335, "learning_rate": 1e-05, "loss": 0.4727, "mean_token_accuracy": 0.8470626473426819, "num_tokens": 870245934.0, "step": 5460 }, { "epoch": 2.777721261444557, "grad_norm": 0.9186680316925049, "learning_rate": 1e-05, "loss": 0.407, "mean_token_accuracy": 0.8667311668395996, "num_tokens": 870397537.0, "step": 5461 }, { "epoch": 2.7782299084435405, "grad_norm": 0.8719220161437988, "learning_rate": 1e-05, "loss": 0.4368, "mean_token_accuracy": 0.8578290939331055, "num_tokens": 870553892.0, "step": 5462 }, { "epoch": 2.778738555442523, "grad_norm": 1.9770593643188477, "learning_rate": 1e-05, "loss": 0.4188, "mean_token_accuracy": 0.8625924587249756, "num_tokens": 870700935.0, "step": 5463 }, { "epoch": 2.7792472024415056, "grad_norm": 0.9255325794219971, "learning_rate": 1e-05, "loss": 0.4317, "mean_token_accuracy": 0.8586496114730835, "num_tokens": 870854822.0, "step": 5464 }, { "epoch": 2.7797558494404884, "grad_norm": 0.8715844750404358, "learning_rate": 1e-05, "loss": 0.429, "mean_token_accuracy": 0.8607480525970459, "num_tokens": 871014660.0, "step": 5465 }, { "epoch": 2.780264496439471, "grad_norm": 0.9867140054702759, "learning_rate": 1e-05, "loss": 0.4819, "mean_token_accuracy": 0.8451007604598999, "num_tokens": 871168666.0, "step": 5466 }, { "epoch": 2.780773143438454, "grad_norm": 0.9128692150115967, "learning_rate": 1e-05, "loss": 0.4383, "mean_token_accuracy": 0.8578115105628967, "num_tokens": 871331772.0, "step": 5467 }, { "epoch": 2.7812817904374363, "grad_norm": 0.8914899230003357, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8648374676704407, "num_tokens": 871494942.0, "step": 5468 }, { "epoch": 2.781790437436419, "grad_norm": 0.933683454990387, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8490318059921265, "num_tokens": 871654840.0, "step": 5469 }, { "epoch": 2.782299084435402, "grad_norm": 0.8720911145210266, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8578515648841858, "num_tokens": 871816010.0, "step": 5470 }, { "epoch": 2.7828077314343846, "grad_norm": 0.8903201222419739, "learning_rate": 1e-05, "loss": 0.4301, "mean_token_accuracy": 0.8601359128952026, "num_tokens": 871986772.0, "step": 5471 }, { "epoch": 2.7833163784333674, "grad_norm": 0.9022795557975769, "learning_rate": 1e-05, "loss": 0.4237, "mean_token_accuracy": 0.8618631362915039, "num_tokens": 872141817.0, "step": 5472 }, { "epoch": 2.7838250254323498, "grad_norm": 0.8795859813690186, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8556370139122009, "num_tokens": 872295335.0, "step": 5473 }, { "epoch": 2.7843336724313326, "grad_norm": 1.0180273056030273, "learning_rate": 1e-05, "loss": 0.4421, "mean_token_accuracy": 0.8556839227676392, "num_tokens": 872455047.0, "step": 5474 }, { "epoch": 2.7848423194303153, "grad_norm": 0.8734360933303833, "learning_rate": 1e-05, "loss": 0.4585, "mean_token_accuracy": 0.8520325422286987, "num_tokens": 872622737.0, "step": 5475 }, { "epoch": 2.785350966429298, "grad_norm": 0.9091038107872009, "learning_rate": 1e-05, "loss": 0.4484, "mean_token_accuracy": 0.8580871820449829, "num_tokens": 872792380.0, "step": 5476 }, { "epoch": 2.785859613428281, "grad_norm": 0.9413979649543762, "learning_rate": 1e-05, "loss": 0.4226, "mean_token_accuracy": 0.8612833023071289, "num_tokens": 872943238.0, "step": 5477 }, { "epoch": 2.7863682604272633, "grad_norm": 0.9505724310874939, "learning_rate": 1e-05, "loss": 0.4652, "mean_token_accuracy": 0.850635290145874, "num_tokens": 873101516.0, "step": 5478 }, { "epoch": 2.786876907426246, "grad_norm": 0.9366916418075562, "learning_rate": 1e-05, "loss": 0.4306, "mean_token_accuracy": 0.8588675260543823, "num_tokens": 873250345.0, "step": 5479 }, { "epoch": 2.787385554425229, "grad_norm": 1.0134268999099731, "learning_rate": 1e-05, "loss": 0.4384, "mean_token_accuracy": 0.8575441837310791, "num_tokens": 873396170.0, "step": 5480 }, { "epoch": 2.7878942014242116, "grad_norm": 0.9557996392250061, "learning_rate": 1e-05, "loss": 0.4344, "mean_token_accuracy": 0.8593090772628784, "num_tokens": 873553305.0, "step": 5481 }, { "epoch": 2.7884028484231944, "grad_norm": 0.8980011343955994, "learning_rate": 1e-05, "loss": 0.4501, "mean_token_accuracy": 0.8541994094848633, "num_tokens": 873721507.0, "step": 5482 }, { "epoch": 2.7889114954221768, "grad_norm": 0.9143933653831482, "learning_rate": 1e-05, "loss": 0.4173, "mean_token_accuracy": 0.8620104789733887, "num_tokens": 873888032.0, "step": 5483 }, { "epoch": 2.78942014242116, "grad_norm": 0.8855127096176147, "learning_rate": 1e-05, "loss": 0.4035, "mean_token_accuracy": 0.8683799505233765, "num_tokens": 874047437.0, "step": 5484 }, { "epoch": 2.7899287894201423, "grad_norm": 0.8832699060440063, "learning_rate": 1e-05, "loss": 0.4408, "mean_token_accuracy": 0.8589814901351929, "num_tokens": 874204022.0, "step": 5485 }, { "epoch": 2.790437436419125, "grad_norm": 0.924237847328186, "learning_rate": 1e-05, "loss": 0.4252, "mean_token_accuracy": 0.8617229461669922, "num_tokens": 874355294.0, "step": 5486 }, { "epoch": 2.790946083418108, "grad_norm": 0.9835050106048584, "learning_rate": 1e-05, "loss": 0.4451, "mean_token_accuracy": 0.8557791709899902, "num_tokens": 874515758.0, "step": 5487 }, { "epoch": 2.7914547304170907, "grad_norm": 0.8630616664886475, "learning_rate": 1e-05, "loss": 0.4172, "mean_token_accuracy": 0.8630630373954773, "num_tokens": 874686080.0, "step": 5488 }, { "epoch": 2.7919633774160735, "grad_norm": 0.9042604565620422, "learning_rate": 1e-05, "loss": 0.4324, "mean_token_accuracy": 0.8611968755722046, "num_tokens": 874840681.0, "step": 5489 }, { "epoch": 2.792472024415056, "grad_norm": 0.9088775515556335, "learning_rate": 1e-05, "loss": 0.4251, "mean_token_accuracy": 0.8623006343841553, "num_tokens": 875008971.0, "step": 5490 }, { "epoch": 2.7929806714140386, "grad_norm": 0.8844015598297119, "learning_rate": 1e-05, "loss": 0.439, "mean_token_accuracy": 0.8596288561820984, "num_tokens": 875167442.0, "step": 5491 }, { "epoch": 2.7934893184130214, "grad_norm": 0.8631628751754761, "learning_rate": 1e-05, "loss": 0.4089, "mean_token_accuracy": 0.8667330145835876, "num_tokens": 875321096.0, "step": 5492 }, { "epoch": 2.793997965412004, "grad_norm": 0.9279149174690247, "learning_rate": 1e-05, "loss": 0.4317, "mean_token_accuracy": 0.8598275780677795, "num_tokens": 875481221.0, "step": 5493 }, { "epoch": 2.794506612410987, "grad_norm": 0.8831683397293091, "learning_rate": 1e-05, "loss": 0.4163, "mean_token_accuracy": 0.8644309043884277, "num_tokens": 875635412.0, "step": 5494 }, { "epoch": 2.7950152594099693, "grad_norm": 0.877562940120697, "learning_rate": 1e-05, "loss": 0.4227, "mean_token_accuracy": 0.8641342520713806, "num_tokens": 875803130.0, "step": 5495 }, { "epoch": 2.795523906408952, "grad_norm": 0.9235041737556458, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8580608367919922, "num_tokens": 875976489.0, "step": 5496 }, { "epoch": 2.796032553407935, "grad_norm": 0.9082411527633667, "learning_rate": 1e-05, "loss": 0.4874, "mean_token_accuracy": 0.8443915247917175, "num_tokens": 876135385.0, "step": 5497 }, { "epoch": 2.7965412004069177, "grad_norm": 0.8878147006034851, "learning_rate": 1e-05, "loss": 0.4383, "mean_token_accuracy": 0.8584445118904114, "num_tokens": 876298846.0, "step": 5498 }, { "epoch": 2.7970498474059005, "grad_norm": 0.8781887888908386, "learning_rate": 1e-05, "loss": 0.4199, "mean_token_accuracy": 0.8640130758285522, "num_tokens": 876461343.0, "step": 5499 }, { "epoch": 2.797558494404883, "grad_norm": 0.924721360206604, "learning_rate": 1e-05, "loss": 0.4667, "mean_token_accuracy": 0.849367082118988, "num_tokens": 876621222.0, "step": 5500 }, { "epoch": 2.7980671414038656, "grad_norm": 0.9024066925048828, "learning_rate": 1e-05, "loss": 0.4244, "mean_token_accuracy": 0.8612882494926453, "num_tokens": 876767008.0, "step": 5501 }, { "epoch": 2.7985757884028484, "grad_norm": 0.9149547815322876, "learning_rate": 1e-05, "loss": 0.4455, "mean_token_accuracy": 0.8569803237915039, "num_tokens": 876924869.0, "step": 5502 }, { "epoch": 2.799084435401831, "grad_norm": 0.8639458417892456, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.8605494499206543, "num_tokens": 877084246.0, "step": 5503 }, { "epoch": 2.799593082400814, "grad_norm": 0.9297685027122498, "learning_rate": 1e-05, "loss": 0.447, "mean_token_accuracy": 0.8527074456214905, "num_tokens": 877247182.0, "step": 5504 }, { "epoch": 2.8001017293997963, "grad_norm": 0.854729413986206, "learning_rate": 1e-05, "loss": 0.4216, "mean_token_accuracy": 0.8616640567779541, "num_tokens": 877401770.0, "step": 5505 }, { "epoch": 2.8006103763987795, "grad_norm": 0.886106014251709, "learning_rate": 1e-05, "loss": 0.4153, "mean_token_accuracy": 0.863771915435791, "num_tokens": 877562351.0, "step": 5506 }, { "epoch": 2.801119023397762, "grad_norm": 1.0278208255767822, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.86051344871521, "num_tokens": 877717791.0, "step": 5507 }, { "epoch": 2.8016276703967447, "grad_norm": 0.8804529309272766, "learning_rate": 1e-05, "loss": 0.4322, "mean_token_accuracy": 0.8584067821502686, "num_tokens": 877878025.0, "step": 5508 }, { "epoch": 2.8021363173957274, "grad_norm": 0.9551355838775635, "learning_rate": 1e-05, "loss": 0.457, "mean_token_accuracy": 0.8528353571891785, "num_tokens": 878036046.0, "step": 5509 }, { "epoch": 2.8026449643947102, "grad_norm": 0.919446587562561, "learning_rate": 1e-05, "loss": 0.4611, "mean_token_accuracy": 0.8518680930137634, "num_tokens": 878193570.0, "step": 5510 }, { "epoch": 2.803153611393693, "grad_norm": 0.9995241165161133, "learning_rate": 1e-05, "loss": 0.4365, "mean_token_accuracy": 0.8584878444671631, "num_tokens": 878356399.0, "step": 5511 }, { "epoch": 2.8036622583926754, "grad_norm": 0.9080137610435486, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8556913137435913, "num_tokens": 878511125.0, "step": 5512 }, { "epoch": 2.804170905391658, "grad_norm": 0.9686445593833923, "learning_rate": 1e-05, "loss": 0.3943, "mean_token_accuracy": 0.8702806234359741, "num_tokens": 878667723.0, "step": 5513 }, { "epoch": 2.804679552390641, "grad_norm": 0.9286717176437378, "learning_rate": 1e-05, "loss": 0.4105, "mean_token_accuracy": 0.8658961057662964, "num_tokens": 878816670.0, "step": 5514 }, { "epoch": 2.8051881993896237, "grad_norm": 0.9878254532814026, "learning_rate": 1e-05, "loss": 0.4449, "mean_token_accuracy": 0.855570375919342, "num_tokens": 878967803.0, "step": 5515 }, { "epoch": 2.8056968463886065, "grad_norm": 1.0075044631958008, "learning_rate": 1e-05, "loss": 0.423, "mean_token_accuracy": 0.8624618053436279, "num_tokens": 879137968.0, "step": 5516 }, { "epoch": 2.806205493387589, "grad_norm": 0.9074199199676514, "learning_rate": 1e-05, "loss": 0.4135, "mean_token_accuracy": 0.8656484484672546, "num_tokens": 879297687.0, "step": 5517 }, { "epoch": 2.8067141403865716, "grad_norm": 0.9003438353538513, "learning_rate": 1e-05, "loss": 0.4604, "mean_token_accuracy": 0.8533244132995605, "num_tokens": 879459289.0, "step": 5518 }, { "epoch": 2.8072227873855544, "grad_norm": 0.9956253170967102, "learning_rate": 1e-05, "loss": 0.435, "mean_token_accuracy": 0.8599238395690918, "num_tokens": 879627794.0, "step": 5519 }, { "epoch": 2.807731434384537, "grad_norm": 0.9911926984786987, "learning_rate": 1e-05, "loss": 0.4535, "mean_token_accuracy": 0.8532723188400269, "num_tokens": 879793433.0, "step": 5520 }, { "epoch": 2.80824008138352, "grad_norm": 0.9630784392356873, "learning_rate": 1e-05, "loss": 0.4679, "mean_token_accuracy": 0.8500961065292358, "num_tokens": 879947230.0, "step": 5521 }, { "epoch": 2.8087487283825023, "grad_norm": 1.0050899982452393, "learning_rate": 1e-05, "loss": 0.4583, "mean_token_accuracy": 0.8508563041687012, "num_tokens": 880110903.0, "step": 5522 }, { "epoch": 2.809257375381485, "grad_norm": 0.9409875869750977, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8560415506362915, "num_tokens": 880269275.0, "step": 5523 }, { "epoch": 2.809766022380468, "grad_norm": 0.8914468288421631, "learning_rate": 1e-05, "loss": 0.3964, "mean_token_accuracy": 0.8705465793609619, "num_tokens": 880423872.0, "step": 5524 }, { "epoch": 2.8102746693794507, "grad_norm": 0.9682357311248779, "learning_rate": 1e-05, "loss": 0.4275, "mean_token_accuracy": 0.8604957461357117, "num_tokens": 880573676.0, "step": 5525 }, { "epoch": 2.8107833163784335, "grad_norm": 0.8917509913444519, "learning_rate": 1e-05, "loss": 0.4315, "mean_token_accuracy": 0.8595092296600342, "num_tokens": 880730161.0, "step": 5526 }, { "epoch": 2.811291963377416, "grad_norm": 0.8812057375907898, "learning_rate": 1e-05, "loss": 0.4241, "mean_token_accuracy": 0.8612045049667358, "num_tokens": 880886957.0, "step": 5527 }, { "epoch": 2.811800610376399, "grad_norm": 1.008043646812439, "learning_rate": 1e-05, "loss": 0.4378, "mean_token_accuracy": 0.8575454354286194, "num_tokens": 881055356.0, "step": 5528 }, { "epoch": 2.8123092573753814, "grad_norm": 0.8801813125610352, "learning_rate": 1e-05, "loss": 0.4229, "mean_token_accuracy": 0.8622205257415771, "num_tokens": 881206094.0, "step": 5529 }, { "epoch": 2.812817904374364, "grad_norm": 0.9803150296211243, "learning_rate": 1e-05, "loss": 0.4522, "mean_token_accuracy": 0.8542446494102478, "num_tokens": 881381086.0, "step": 5530 }, { "epoch": 2.813326551373347, "grad_norm": 0.9275711178779602, "learning_rate": 1e-05, "loss": 0.4295, "mean_token_accuracy": 0.860144317150116, "num_tokens": 881543453.0, "step": 5531 }, { "epoch": 2.8138351983723298, "grad_norm": 0.9270220398902893, "learning_rate": 1e-05, "loss": 0.4209, "mean_token_accuracy": 0.8627274036407471, "num_tokens": 881697429.0, "step": 5532 }, { "epoch": 2.8143438453713125, "grad_norm": 0.9209438562393188, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8603096008300781, "num_tokens": 881842639.0, "step": 5533 }, { "epoch": 2.814852492370295, "grad_norm": 0.8858495354652405, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8545648455619812, "num_tokens": 882012327.0, "step": 5534 }, { "epoch": 2.8153611393692777, "grad_norm": 0.9024306535720825, "learning_rate": 1e-05, "loss": 0.4189, "mean_token_accuracy": 0.862636387348175, "num_tokens": 882179920.0, "step": 5535 }, { "epoch": 2.8158697863682605, "grad_norm": 0.9070698618888855, "learning_rate": 1e-05, "loss": 0.4272, "mean_token_accuracy": 0.8598899841308594, "num_tokens": 882333209.0, "step": 5536 }, { "epoch": 2.8163784333672433, "grad_norm": 0.8896937370300293, "learning_rate": 1e-05, "loss": 0.4365, "mean_token_accuracy": 0.8578742742538452, "num_tokens": 882491328.0, "step": 5537 }, { "epoch": 2.816887080366226, "grad_norm": 0.9919584393501282, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8596024513244629, "num_tokens": 882643243.0, "step": 5538 }, { "epoch": 2.8173957273652084, "grad_norm": 0.8577784299850464, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.8593538999557495, "num_tokens": 882803061.0, "step": 5539 }, { "epoch": 2.817904374364191, "grad_norm": 0.8803346157073975, "learning_rate": 1e-05, "loss": 0.4224, "mean_token_accuracy": 0.8638498783111572, "num_tokens": 882969232.0, "step": 5540 }, { "epoch": 2.818413021363174, "grad_norm": 0.9800869822502136, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.8502121567726135, "num_tokens": 883124158.0, "step": 5541 }, { "epoch": 2.8189216683621567, "grad_norm": 0.8235204219818115, "learning_rate": 1e-05, "loss": 0.3985, "mean_token_accuracy": 0.8684201240539551, "num_tokens": 883279013.0, "step": 5542 }, { "epoch": 2.8194303153611395, "grad_norm": 0.8743343353271484, "learning_rate": 1e-05, "loss": 0.4594, "mean_token_accuracy": 0.8527327179908752, "num_tokens": 883438475.0, "step": 5543 }, { "epoch": 2.819938962360122, "grad_norm": 0.8948822021484375, "learning_rate": 1e-05, "loss": 0.4324, "mean_token_accuracy": 0.8579428791999817, "num_tokens": 883592025.0, "step": 5544 }, { "epoch": 2.8204476093591047, "grad_norm": 0.9139871597290039, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8574783802032471, "num_tokens": 883747762.0, "step": 5545 }, { "epoch": 2.8209562563580874, "grad_norm": 0.8890607953071594, "learning_rate": 1e-05, "loss": 0.3932, "mean_token_accuracy": 0.8691313862800598, "num_tokens": 883908818.0, "step": 5546 }, { "epoch": 2.8214649033570702, "grad_norm": 0.8738477230072021, "learning_rate": 1e-05, "loss": 0.4637, "mean_token_accuracy": 0.8499952554702759, "num_tokens": 884072795.0, "step": 5547 }, { "epoch": 2.821973550356053, "grad_norm": 0.9552207589149475, "learning_rate": 1e-05, "loss": 0.452, "mean_token_accuracy": 0.8557916283607483, "num_tokens": 884225754.0, "step": 5548 }, { "epoch": 2.8224821973550354, "grad_norm": 0.8908520340919495, "learning_rate": 1e-05, "loss": 0.4515, "mean_token_accuracy": 0.8545007705688477, "num_tokens": 884389141.0, "step": 5549 }, { "epoch": 2.822990844354018, "grad_norm": 0.8425365686416626, "learning_rate": 1e-05, "loss": 0.455, "mean_token_accuracy": 0.8545691967010498, "num_tokens": 884553353.0, "step": 5550 }, { "epoch": 2.823499491353001, "grad_norm": 0.888638436794281, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8544846773147583, "num_tokens": 884717764.0, "step": 5551 }, { "epoch": 2.8240081383519837, "grad_norm": 0.8891406655311584, "learning_rate": 1e-05, "loss": 0.4578, "mean_token_accuracy": 0.8521091938018799, "num_tokens": 884885565.0, "step": 5552 }, { "epoch": 2.8245167853509665, "grad_norm": 0.8493015170097351, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8548928499221802, "num_tokens": 885042881.0, "step": 5553 }, { "epoch": 2.8250254323499493, "grad_norm": 0.8271024823188782, "learning_rate": 1e-05, "loss": 0.4313, "mean_token_accuracy": 0.8589669466018677, "num_tokens": 885208221.0, "step": 5554 }, { "epoch": 2.825534079348932, "grad_norm": 0.896240770816803, "learning_rate": 1e-05, "loss": 0.4434, "mean_token_accuracy": 0.8566911816596985, "num_tokens": 885368977.0, "step": 5555 }, { "epoch": 2.8260427263479144, "grad_norm": 0.8497312068939209, "learning_rate": 1e-05, "loss": 0.394, "mean_token_accuracy": 0.8694432973861694, "num_tokens": 885522863.0, "step": 5556 }, { "epoch": 2.826551373346897, "grad_norm": 0.8087031841278076, "learning_rate": 1e-05, "loss": 0.4243, "mean_token_accuracy": 0.8616116046905518, "num_tokens": 885682654.0, "step": 5557 }, { "epoch": 2.82706002034588, "grad_norm": 0.8608993291854858, "learning_rate": 1e-05, "loss": 0.4482, "mean_token_accuracy": 0.8543695211410522, "num_tokens": 885850081.0, "step": 5558 }, { "epoch": 2.827568667344863, "grad_norm": 0.9493992328643799, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8574357032775879, "num_tokens": 886026553.0, "step": 5559 }, { "epoch": 2.8280773143438456, "grad_norm": 0.8679805994033813, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8610323667526245, "num_tokens": 886186789.0, "step": 5560 }, { "epoch": 2.828585961342828, "grad_norm": 0.8890089988708496, "learning_rate": 1e-05, "loss": 0.45, "mean_token_accuracy": 0.8530704975128174, "num_tokens": 886337087.0, "step": 5561 }, { "epoch": 2.8290946083418107, "grad_norm": 0.8177929520606995, "learning_rate": 1e-05, "loss": 0.43, "mean_token_accuracy": 0.8591690063476562, "num_tokens": 886496034.0, "step": 5562 }, { "epoch": 2.8296032553407935, "grad_norm": 0.8867534399032593, "learning_rate": 1e-05, "loss": 0.4129, "mean_token_accuracy": 0.8648470640182495, "num_tokens": 886666027.0, "step": 5563 }, { "epoch": 2.8301119023397763, "grad_norm": 0.9145981073379517, "learning_rate": 1e-05, "loss": 0.4471, "mean_token_accuracy": 0.8576685786247253, "num_tokens": 886817995.0, "step": 5564 }, { "epoch": 2.830620549338759, "grad_norm": 0.8816673755645752, "learning_rate": 1e-05, "loss": 0.4453, "mean_token_accuracy": 0.8556328415870667, "num_tokens": 886966338.0, "step": 5565 }, { "epoch": 2.8311291963377414, "grad_norm": 0.8810669779777527, "learning_rate": 1e-05, "loss": 0.4099, "mean_token_accuracy": 0.8664361834526062, "num_tokens": 887120555.0, "step": 5566 }, { "epoch": 2.831637843336724, "grad_norm": 0.8689042329788208, "learning_rate": 1e-05, "loss": 0.418, "mean_token_accuracy": 0.8618471026420593, "num_tokens": 887278500.0, "step": 5567 }, { "epoch": 2.832146490335707, "grad_norm": 0.8702934980392456, "learning_rate": 1e-05, "loss": 0.425, "mean_token_accuracy": 0.8622481822967529, "num_tokens": 887434554.0, "step": 5568 }, { "epoch": 2.8326551373346898, "grad_norm": 0.83709716796875, "learning_rate": 1e-05, "loss": 0.4021, "mean_token_accuracy": 0.869196891784668, "num_tokens": 887590296.0, "step": 5569 }, { "epoch": 2.8331637843336726, "grad_norm": 0.9024631977081299, "learning_rate": 1e-05, "loss": 0.4312, "mean_token_accuracy": 0.8614013195037842, "num_tokens": 887749092.0, "step": 5570 }, { "epoch": 2.833672431332655, "grad_norm": 0.9151177406311035, "learning_rate": 1e-05, "loss": 0.4592, "mean_token_accuracy": 0.8518662452697754, "num_tokens": 887909082.0, "step": 5571 }, { "epoch": 2.8341810783316377, "grad_norm": 0.8768104910850525, "learning_rate": 1e-05, "loss": 0.4528, "mean_token_accuracy": 0.8537473678588867, "num_tokens": 888072082.0, "step": 5572 }, { "epoch": 2.8346897253306205, "grad_norm": 0.8094744682312012, "learning_rate": 1e-05, "loss": 0.4193, "mean_token_accuracy": 0.8633731007575989, "num_tokens": 888229339.0, "step": 5573 }, { "epoch": 2.8351983723296033, "grad_norm": 0.9014713764190674, "learning_rate": 1e-05, "loss": 0.4608, "mean_token_accuracy": 0.8511308431625366, "num_tokens": 888384734.0, "step": 5574 }, { "epoch": 2.835707019328586, "grad_norm": 0.8186359405517578, "learning_rate": 1e-05, "loss": 0.395, "mean_token_accuracy": 0.8703606128692627, "num_tokens": 888546458.0, "step": 5575 }, { "epoch": 2.836215666327569, "grad_norm": 0.8721616268157959, "learning_rate": 1e-05, "loss": 0.4399, "mean_token_accuracy": 0.8587380647659302, "num_tokens": 888705436.0, "step": 5576 }, { "epoch": 2.8367243133265516, "grad_norm": 0.8719232082366943, "learning_rate": 1e-05, "loss": 0.4441, "mean_token_accuracy": 0.8571681976318359, "num_tokens": 888863764.0, "step": 5577 }, { "epoch": 2.837232960325534, "grad_norm": 0.8821418881416321, "learning_rate": 1e-05, "loss": 0.4159, "mean_token_accuracy": 0.8637368083000183, "num_tokens": 889025194.0, "step": 5578 }, { "epoch": 2.8377416073245167, "grad_norm": 0.9229016304016113, "learning_rate": 1e-05, "loss": 0.4278, "mean_token_accuracy": 0.8599576950073242, "num_tokens": 889182518.0, "step": 5579 }, { "epoch": 2.8382502543234995, "grad_norm": 0.9832534790039062, "learning_rate": 1e-05, "loss": 0.4583, "mean_token_accuracy": 0.8509209156036377, "num_tokens": 889348596.0, "step": 5580 }, { "epoch": 2.8387589013224823, "grad_norm": 0.8540241122245789, "learning_rate": 1e-05, "loss": 0.404, "mean_token_accuracy": 0.8679183125495911, "num_tokens": 889511461.0, "step": 5581 }, { "epoch": 2.839267548321465, "grad_norm": 0.912282407283783, "learning_rate": 1e-05, "loss": 0.4606, "mean_token_accuracy": 0.852907657623291, "num_tokens": 889676197.0, "step": 5582 }, { "epoch": 2.8397761953204474, "grad_norm": 0.8290195465087891, "learning_rate": 1e-05, "loss": 0.439, "mean_token_accuracy": 0.8586235046386719, "num_tokens": 889843485.0, "step": 5583 }, { "epoch": 2.8402848423194302, "grad_norm": 1.198011040687561, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8564248085021973, "num_tokens": 890005117.0, "step": 5584 }, { "epoch": 2.840793489318413, "grad_norm": 0.8554531335830688, "learning_rate": 1e-05, "loss": 0.4494, "mean_token_accuracy": 0.8548786044120789, "num_tokens": 890158166.0, "step": 5585 }, { "epoch": 2.841302136317396, "grad_norm": 0.8591538667678833, "learning_rate": 1e-05, "loss": 0.4207, "mean_token_accuracy": 0.8624992370605469, "num_tokens": 890316760.0, "step": 5586 }, { "epoch": 2.8418107833163786, "grad_norm": 0.9349908232688904, "learning_rate": 1e-05, "loss": 0.4285, "mean_token_accuracy": 0.8602296113967896, "num_tokens": 890465667.0, "step": 5587 }, { "epoch": 2.842319430315361, "grad_norm": 0.8721975088119507, "learning_rate": 1e-05, "loss": 0.4383, "mean_token_accuracy": 0.8565325736999512, "num_tokens": 890620871.0, "step": 5588 }, { "epoch": 2.8428280773143437, "grad_norm": 0.940648078918457, "learning_rate": 1e-05, "loss": 0.4604, "mean_token_accuracy": 0.8533288836479187, "num_tokens": 890780390.0, "step": 5589 }, { "epoch": 2.8433367243133265, "grad_norm": 0.8890752792358398, "learning_rate": 1e-05, "loss": 0.3947, "mean_token_accuracy": 0.8674182295799255, "num_tokens": 890936449.0, "step": 5590 }, { "epoch": 2.8438453713123093, "grad_norm": 0.8123888373374939, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8599176406860352, "num_tokens": 891102174.0, "step": 5591 }, { "epoch": 2.844354018311292, "grad_norm": 0.954985499382019, "learning_rate": 1e-05, "loss": 0.438, "mean_token_accuracy": 0.8574029207229614, "num_tokens": 891269034.0, "step": 5592 }, { "epoch": 2.8448626653102744, "grad_norm": 0.8957969546318054, "learning_rate": 1e-05, "loss": 0.4104, "mean_token_accuracy": 0.8660489320755005, "num_tokens": 891427523.0, "step": 5593 }, { "epoch": 2.845371312309257, "grad_norm": 1.1233398914337158, "learning_rate": 1e-05, "loss": 0.4481, "mean_token_accuracy": 0.8538949489593506, "num_tokens": 891601141.0, "step": 5594 }, { "epoch": 2.84587995930824, "grad_norm": 0.8899928331375122, "learning_rate": 1e-05, "loss": 0.4401, "mean_token_accuracy": 0.8564096689224243, "num_tokens": 891758193.0, "step": 5595 }, { "epoch": 2.846388606307223, "grad_norm": 0.9822993874549866, "learning_rate": 1e-05, "loss": 0.4715, "mean_token_accuracy": 0.8498365879058838, "num_tokens": 891911699.0, "step": 5596 }, { "epoch": 2.8468972533062056, "grad_norm": 0.9196913242340088, "learning_rate": 1e-05, "loss": 0.4416, "mean_token_accuracy": 0.8551076650619507, "num_tokens": 892079968.0, "step": 5597 }, { "epoch": 2.8474059003051884, "grad_norm": 0.8975830078125, "learning_rate": 1e-05, "loss": 0.4346, "mean_token_accuracy": 0.8593940734863281, "num_tokens": 892236979.0, "step": 5598 }, { "epoch": 2.847914547304171, "grad_norm": 0.9877001643180847, "learning_rate": 1e-05, "loss": 0.4156, "mean_token_accuracy": 0.8644639849662781, "num_tokens": 892387456.0, "step": 5599 }, { "epoch": 2.8484231943031535, "grad_norm": 0.9261271357536316, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.8556768298149109, "num_tokens": 892536633.0, "step": 5600 }, { "epoch": 2.8489318413021363, "grad_norm": 0.9050893187522888, "learning_rate": 1e-05, "loss": 0.4141, "mean_token_accuracy": 0.864107072353363, "num_tokens": 892699765.0, "step": 5601 }, { "epoch": 2.849440488301119, "grad_norm": 0.899746835231781, "learning_rate": 1e-05, "loss": 0.4382, "mean_token_accuracy": 0.8585728406906128, "num_tokens": 892862170.0, "step": 5602 }, { "epoch": 2.849949135300102, "grad_norm": 0.9267527461051941, "learning_rate": 1e-05, "loss": 0.4537, "mean_token_accuracy": 0.850830614566803, "num_tokens": 893017234.0, "step": 5603 }, { "epoch": 2.8504577822990846, "grad_norm": 1.0719654560089111, "learning_rate": 1e-05, "loss": 0.4296, "mean_token_accuracy": 0.8597113490104675, "num_tokens": 893170520.0, "step": 5604 }, { "epoch": 2.850966429298067, "grad_norm": 0.912548840045929, "learning_rate": 1e-05, "loss": 0.4171, "mean_token_accuracy": 0.8634018301963806, "num_tokens": 893319754.0, "step": 5605 }, { "epoch": 2.8514750762970498, "grad_norm": 0.9132776260375977, "learning_rate": 1e-05, "loss": 0.4662, "mean_token_accuracy": 0.8516316413879395, "num_tokens": 893480701.0, "step": 5606 }, { "epoch": 2.8519837232960326, "grad_norm": 0.9201282262802124, "learning_rate": 1e-05, "loss": 0.4529, "mean_token_accuracy": 0.8552930355072021, "num_tokens": 893634155.0, "step": 5607 }, { "epoch": 2.8524923702950153, "grad_norm": 0.8679859042167664, "learning_rate": 1e-05, "loss": 0.469, "mean_token_accuracy": 0.8502346277236938, "num_tokens": 893795202.0, "step": 5608 }, { "epoch": 2.853001017293998, "grad_norm": 0.9941920638084412, "learning_rate": 1e-05, "loss": 0.4047, "mean_token_accuracy": 0.867050290107727, "num_tokens": 893943577.0, "step": 5609 }, { "epoch": 2.8535096642929805, "grad_norm": 0.8621466755867004, "learning_rate": 1e-05, "loss": 0.4476, "mean_token_accuracy": 0.8550496697425842, "num_tokens": 894108299.0, "step": 5610 }, { "epoch": 2.8540183112919633, "grad_norm": 0.9322388768196106, "learning_rate": 1e-05, "loss": 0.4418, "mean_token_accuracy": 0.8577138185501099, "num_tokens": 894259102.0, "step": 5611 }, { "epoch": 2.854526958290946, "grad_norm": 0.909308135509491, "learning_rate": 1e-05, "loss": 0.457, "mean_token_accuracy": 0.8523279428482056, "num_tokens": 894419933.0, "step": 5612 }, { "epoch": 2.855035605289929, "grad_norm": 0.8565107583999634, "learning_rate": 1e-05, "loss": 0.417, "mean_token_accuracy": 0.8655107021331787, "num_tokens": 894576241.0, "step": 5613 }, { "epoch": 2.8555442522889116, "grad_norm": 0.9509232640266418, "learning_rate": 1e-05, "loss": 0.4517, "mean_token_accuracy": 0.8545820713043213, "num_tokens": 894741124.0, "step": 5614 }, { "epoch": 2.856052899287894, "grad_norm": 1.0113739967346191, "learning_rate": 1e-05, "loss": 0.4472, "mean_token_accuracy": 0.857478141784668, "num_tokens": 894904122.0, "step": 5615 }, { "epoch": 2.8565615462868768, "grad_norm": 1.0040079355239868, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8473595380783081, "num_tokens": 895056647.0, "step": 5616 }, { "epoch": 2.8570701932858595, "grad_norm": 0.8770785331726074, "learning_rate": 1e-05, "loss": 0.4513, "mean_token_accuracy": 0.8533390760421753, "num_tokens": 895213822.0, "step": 5617 }, { "epoch": 2.8575788402848423, "grad_norm": 0.8685998320579529, "learning_rate": 1e-05, "loss": 0.4817, "mean_token_accuracy": 0.8471949100494385, "num_tokens": 895380518.0, "step": 5618 }, { "epoch": 2.858087487283825, "grad_norm": 0.94172203540802, "learning_rate": 1e-05, "loss": 0.4364, "mean_token_accuracy": 0.8586889505386353, "num_tokens": 895544268.0, "step": 5619 }, { "epoch": 2.8585961342828075, "grad_norm": 0.8902906775474548, "learning_rate": 1e-05, "loss": 0.4298, "mean_token_accuracy": 0.8607203960418701, "num_tokens": 895702008.0, "step": 5620 }, { "epoch": 2.8591047812817907, "grad_norm": 0.850885272026062, "learning_rate": 1e-05, "loss": 0.4, "mean_token_accuracy": 0.8698835372924805, "num_tokens": 895851412.0, "step": 5621 }, { "epoch": 2.859613428280773, "grad_norm": 1.0017894506454468, "learning_rate": 1e-05, "loss": 0.4209, "mean_token_accuracy": 0.8614801168441772, "num_tokens": 896011546.0, "step": 5622 }, { "epoch": 2.860122075279756, "grad_norm": 1.015999674797058, "learning_rate": 1e-05, "loss": 0.4374, "mean_token_accuracy": 0.8582688570022583, "num_tokens": 896162494.0, "step": 5623 }, { "epoch": 2.8606307222787386, "grad_norm": 0.923900306224823, "learning_rate": 1e-05, "loss": 0.4187, "mean_token_accuracy": 0.864201545715332, "num_tokens": 896329246.0, "step": 5624 }, { "epoch": 2.8611393692777214, "grad_norm": 0.9373534321784973, "learning_rate": 1e-05, "loss": 0.4025, "mean_token_accuracy": 0.868912935256958, "num_tokens": 896485456.0, "step": 5625 }, { "epoch": 2.861648016276704, "grad_norm": 0.8971603512763977, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.8586189150810242, "num_tokens": 896652517.0, "step": 5626 }, { "epoch": 2.8621566632756865, "grad_norm": 0.9007195234298706, "learning_rate": 1e-05, "loss": 0.4288, "mean_token_accuracy": 0.859435498714447, "num_tokens": 896821016.0, "step": 5627 }, { "epoch": 2.8626653102746693, "grad_norm": 0.9142847657203674, "learning_rate": 1e-05, "loss": 0.4184, "mean_token_accuracy": 0.8641673922538757, "num_tokens": 896970551.0, "step": 5628 }, { "epoch": 2.863173957273652, "grad_norm": 0.8780483603477478, "learning_rate": 1e-05, "loss": 0.4314, "mean_token_accuracy": 0.8598891496658325, "num_tokens": 897136957.0, "step": 5629 }, { "epoch": 2.863682604272635, "grad_norm": 0.8631423711776733, "learning_rate": 1e-05, "loss": 0.4343, "mean_token_accuracy": 0.8595150709152222, "num_tokens": 897291892.0, "step": 5630 }, { "epoch": 2.8641912512716177, "grad_norm": 0.8913904428482056, "learning_rate": 1e-05, "loss": 0.4177, "mean_token_accuracy": 0.8650398850440979, "num_tokens": 897452981.0, "step": 5631 }, { "epoch": 2.8646998982706, "grad_norm": 0.9493880867958069, "learning_rate": 1e-05, "loss": 0.4601, "mean_token_accuracy": 0.8521661162376404, "num_tokens": 897605769.0, "step": 5632 }, { "epoch": 2.865208545269583, "grad_norm": 0.8981471657752991, "learning_rate": 1e-05, "loss": 0.414, "mean_token_accuracy": 0.8634394407272339, "num_tokens": 897760281.0, "step": 5633 }, { "epoch": 2.8657171922685656, "grad_norm": 0.8748188018798828, "learning_rate": 1e-05, "loss": 0.4279, "mean_token_accuracy": 0.8613673448562622, "num_tokens": 897914249.0, "step": 5634 }, { "epoch": 2.8662258392675484, "grad_norm": 0.8937290906906128, "learning_rate": 1e-05, "loss": 0.4369, "mean_token_accuracy": 0.8586488962173462, "num_tokens": 898064141.0, "step": 5635 }, { "epoch": 2.866734486266531, "grad_norm": 0.8642660975456238, "learning_rate": 1e-05, "loss": 0.4204, "mean_token_accuracy": 0.8620635271072388, "num_tokens": 898228863.0, "step": 5636 }, { "epoch": 2.8672431332655135, "grad_norm": 0.9388706088066101, "learning_rate": 1e-05, "loss": 0.4382, "mean_token_accuracy": 0.8560210466384888, "num_tokens": 898373794.0, "step": 5637 }, { "epoch": 2.8677517802644963, "grad_norm": 0.8486452698707581, "learning_rate": 1e-05, "loss": 0.4155, "mean_token_accuracy": 0.8632423281669617, "num_tokens": 898530951.0, "step": 5638 }, { "epoch": 2.868260427263479, "grad_norm": 0.8945029377937317, "learning_rate": 1e-05, "loss": 0.4494, "mean_token_accuracy": 0.8540161848068237, "num_tokens": 898691706.0, "step": 5639 }, { "epoch": 2.868769074262462, "grad_norm": 0.8432234525680542, "learning_rate": 1e-05, "loss": 0.414, "mean_token_accuracy": 0.8628842830657959, "num_tokens": 898851712.0, "step": 5640 }, { "epoch": 2.8692777212614446, "grad_norm": 0.9244741201400757, "learning_rate": 1e-05, "loss": 0.4435, "mean_token_accuracy": 0.8574188351631165, "num_tokens": 899015049.0, "step": 5641 }, { "epoch": 2.869786368260427, "grad_norm": 0.8768820762634277, "learning_rate": 1e-05, "loss": 0.4311, "mean_token_accuracy": 0.8579252362251282, "num_tokens": 899173646.0, "step": 5642 }, { "epoch": 2.87029501525941, "grad_norm": 0.8740848898887634, "learning_rate": 1e-05, "loss": 0.4339, "mean_token_accuracy": 0.8589560389518738, "num_tokens": 899340664.0, "step": 5643 }, { "epoch": 2.8708036622583926, "grad_norm": 0.8888913989067078, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.8611010313034058, "num_tokens": 899486743.0, "step": 5644 }, { "epoch": 2.8713123092573754, "grad_norm": 0.8741505146026611, "learning_rate": 1e-05, "loss": 0.4353, "mean_token_accuracy": 0.8584800958633423, "num_tokens": 899653132.0, "step": 5645 }, { "epoch": 2.871820956256358, "grad_norm": 0.9070521593093872, "learning_rate": 1e-05, "loss": 0.4266, "mean_token_accuracy": 0.8619548678398132, "num_tokens": 899809627.0, "step": 5646 }, { "epoch": 2.872329603255341, "grad_norm": 0.8843440413475037, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8559424877166748, "num_tokens": 899975130.0, "step": 5647 }, { "epoch": 2.8728382502543237, "grad_norm": 0.8765766620635986, "learning_rate": 1e-05, "loss": 0.432, "mean_token_accuracy": 0.8621829152107239, "num_tokens": 900137262.0, "step": 5648 }, { "epoch": 2.873346897253306, "grad_norm": 0.9020034074783325, "learning_rate": 1e-05, "loss": 0.4698, "mean_token_accuracy": 0.8482083082199097, "num_tokens": 900291517.0, "step": 5649 }, { "epoch": 2.873855544252289, "grad_norm": 0.9273054003715515, "learning_rate": 1e-05, "loss": 0.4247, "mean_token_accuracy": 0.8615149855613708, "num_tokens": 900459708.0, "step": 5650 }, { "epoch": 2.8743641912512716, "grad_norm": 0.8844437599182129, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8597338199615479, "num_tokens": 900612066.0, "step": 5651 }, { "epoch": 2.8748728382502544, "grad_norm": 0.8729273676872253, "learning_rate": 1e-05, "loss": 0.4494, "mean_token_accuracy": 0.8557407855987549, "num_tokens": 900777165.0, "step": 5652 }, { "epoch": 2.875381485249237, "grad_norm": 0.8966065049171448, "learning_rate": 1e-05, "loss": 0.4195, "mean_token_accuracy": 0.8638300895690918, "num_tokens": 900940416.0, "step": 5653 }, { "epoch": 2.8758901322482195, "grad_norm": 0.8397489786148071, "learning_rate": 1e-05, "loss": 0.4217, "mean_token_accuracy": 0.8620539903640747, "num_tokens": 901100446.0, "step": 5654 }, { "epoch": 2.8763987792472023, "grad_norm": 0.9159259796142578, "learning_rate": 1e-05, "loss": 0.4203, "mean_token_accuracy": 0.8628263473510742, "num_tokens": 901251879.0, "step": 5655 }, { "epoch": 2.876907426246185, "grad_norm": 0.8920116424560547, "learning_rate": 1e-05, "loss": 0.4114, "mean_token_accuracy": 0.8661978840827942, "num_tokens": 901413081.0, "step": 5656 }, { "epoch": 2.877416073245168, "grad_norm": 0.8771112561225891, "learning_rate": 1e-05, "loss": 0.4109, "mean_token_accuracy": 0.865467369556427, "num_tokens": 901575779.0, "step": 5657 }, { "epoch": 2.8779247202441507, "grad_norm": 0.9231264591217041, "learning_rate": 1e-05, "loss": 0.4362, "mean_token_accuracy": 0.857517421245575, "num_tokens": 901726168.0, "step": 5658 }, { "epoch": 2.878433367243133, "grad_norm": 0.9495973587036133, "learning_rate": 1e-05, "loss": 0.4246, "mean_token_accuracy": 0.8618420362472534, "num_tokens": 901879638.0, "step": 5659 }, { "epoch": 2.878942014242116, "grad_norm": 0.9396060109138489, "learning_rate": 1e-05, "loss": 0.4537, "mean_token_accuracy": 0.8520980477333069, "num_tokens": 902035368.0, "step": 5660 }, { "epoch": 2.8794506612410986, "grad_norm": 0.9062393307685852, "learning_rate": 1e-05, "loss": 0.4219, "mean_token_accuracy": 0.8610861301422119, "num_tokens": 902198864.0, "step": 5661 }, { "epoch": 2.8799593082400814, "grad_norm": 0.8732383847236633, "learning_rate": 1e-05, "loss": 0.4285, "mean_token_accuracy": 0.860880970954895, "num_tokens": 902361599.0, "step": 5662 }, { "epoch": 2.880467955239064, "grad_norm": 0.9061823487281799, "learning_rate": 1e-05, "loss": 0.4451, "mean_token_accuracy": 0.8560501933097839, "num_tokens": 902520237.0, "step": 5663 }, { "epoch": 2.8809766022380465, "grad_norm": 0.9862648844718933, "learning_rate": 1e-05, "loss": 0.4111, "mean_token_accuracy": 0.8644874095916748, "num_tokens": 902671212.0, "step": 5664 }, { "epoch": 2.8814852492370298, "grad_norm": 0.8495729565620422, "learning_rate": 1e-05, "loss": 0.456, "mean_token_accuracy": 0.8526976108551025, "num_tokens": 902828552.0, "step": 5665 }, { "epoch": 2.881993896236012, "grad_norm": 0.8538773655891418, "learning_rate": 1e-05, "loss": 0.4511, "mean_token_accuracy": 0.8544628024101257, "num_tokens": 902987411.0, "step": 5666 }, { "epoch": 2.882502543234995, "grad_norm": 0.9478979706764221, "learning_rate": 1e-05, "loss": 0.4458, "mean_token_accuracy": 0.8565550446510315, "num_tokens": 903143437.0, "step": 5667 }, { "epoch": 2.8830111902339777, "grad_norm": 0.8719910979270935, "learning_rate": 1e-05, "loss": 0.4433, "mean_token_accuracy": 0.8571025729179382, "num_tokens": 903308282.0, "step": 5668 }, { "epoch": 2.8835198372329605, "grad_norm": 0.9409666657447815, "learning_rate": 1e-05, "loss": 0.4422, "mean_token_accuracy": 0.8566873669624329, "num_tokens": 903468922.0, "step": 5669 }, { "epoch": 2.8840284842319432, "grad_norm": 0.8633664846420288, "learning_rate": 1e-05, "loss": 0.431, "mean_token_accuracy": 0.8594985008239746, "num_tokens": 903628314.0, "step": 5670 }, { "epoch": 2.8845371312309256, "grad_norm": 0.8893129229545593, "learning_rate": 1e-05, "loss": 0.4282, "mean_token_accuracy": 0.8600053787231445, "num_tokens": 903785061.0, "step": 5671 }, { "epoch": 2.8850457782299084, "grad_norm": 0.9648041725158691, "learning_rate": 1e-05, "loss": 0.4296, "mean_token_accuracy": 0.8607710599899292, "num_tokens": 903947269.0, "step": 5672 }, { "epoch": 2.885554425228891, "grad_norm": 0.9494853615760803, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8642484545707703, "num_tokens": 904095166.0, "step": 5673 }, { "epoch": 2.886063072227874, "grad_norm": 0.8843656778335571, "learning_rate": 1e-05, "loss": 0.4657, "mean_token_accuracy": 0.851182222366333, "num_tokens": 904246591.0, "step": 5674 }, { "epoch": 2.8865717192268567, "grad_norm": 0.9313526153564453, "learning_rate": 1e-05, "loss": 0.433, "mean_token_accuracy": 0.8591406345367432, "num_tokens": 904404768.0, "step": 5675 }, { "epoch": 2.887080366225839, "grad_norm": 0.9152849316596985, "learning_rate": 1e-05, "loss": 0.442, "mean_token_accuracy": 0.8567684888839722, "num_tokens": 904565888.0, "step": 5676 }, { "epoch": 2.887589013224822, "grad_norm": 0.923521876335144, "learning_rate": 1e-05, "loss": 0.4595, "mean_token_accuracy": 0.8506315350532532, "num_tokens": 904722127.0, "step": 5677 }, { "epoch": 2.8880976602238047, "grad_norm": 0.9015587568283081, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8540414571762085, "num_tokens": 904888197.0, "step": 5678 }, { "epoch": 2.8886063072227874, "grad_norm": 0.8459432721138, "learning_rate": 1e-05, "loss": 0.4505, "mean_token_accuracy": 0.8547311425209045, "num_tokens": 905057713.0, "step": 5679 }, { "epoch": 2.8891149542217702, "grad_norm": 0.8662464618682861, "learning_rate": 1e-05, "loss": 0.4375, "mean_token_accuracy": 0.8573547601699829, "num_tokens": 905218458.0, "step": 5680 }, { "epoch": 2.8896236012207526, "grad_norm": 0.9608275294303894, "learning_rate": 1e-05, "loss": 0.4451, "mean_token_accuracy": 0.8553639650344849, "num_tokens": 905368564.0, "step": 5681 }, { "epoch": 2.8901322482197354, "grad_norm": 0.8665888905525208, "learning_rate": 1e-05, "loss": 0.4766, "mean_token_accuracy": 0.8467645049095154, "num_tokens": 905543712.0, "step": 5682 }, { "epoch": 2.890640895218718, "grad_norm": 0.9251812696456909, "learning_rate": 1e-05, "loss": 0.4439, "mean_token_accuracy": 0.8561020493507385, "num_tokens": 905691310.0, "step": 5683 }, { "epoch": 2.891149542217701, "grad_norm": 0.9309903979301453, "learning_rate": 1e-05, "loss": 0.423, "mean_token_accuracy": 0.861987829208374, "num_tokens": 905848988.0, "step": 5684 }, { "epoch": 2.8916581892166837, "grad_norm": 0.8957622647285461, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8643568754196167, "num_tokens": 906001421.0, "step": 5685 }, { "epoch": 2.892166836215666, "grad_norm": 0.960602343082428, "learning_rate": 1e-05, "loss": 0.4615, "mean_token_accuracy": 0.8526604175567627, "num_tokens": 906155224.0, "step": 5686 }, { "epoch": 2.8926754832146493, "grad_norm": 0.9190289974212646, "learning_rate": 1e-05, "loss": 0.4316, "mean_token_accuracy": 0.8602994680404663, "num_tokens": 906310194.0, "step": 5687 }, { "epoch": 2.8931841302136316, "grad_norm": 0.9160748720169067, "learning_rate": 1e-05, "loss": 0.4681, "mean_token_accuracy": 0.8495457768440247, "num_tokens": 906474923.0, "step": 5688 }, { "epoch": 2.8936927772126144, "grad_norm": 0.9127035140991211, "learning_rate": 1e-05, "loss": 0.3988, "mean_token_accuracy": 0.8672407269477844, "num_tokens": 906624664.0, "step": 5689 }, { "epoch": 2.894201424211597, "grad_norm": 0.8942268490791321, "learning_rate": 1e-05, "loss": 0.4532, "mean_token_accuracy": 0.854296088218689, "num_tokens": 906788218.0, "step": 5690 }, { "epoch": 2.89471007121058, "grad_norm": 0.9423340559005737, "learning_rate": 1e-05, "loss": 0.4359, "mean_token_accuracy": 0.8581220507621765, "num_tokens": 906934206.0, "step": 5691 }, { "epoch": 2.895218718209563, "grad_norm": 0.8928824663162231, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.854724109172821, "num_tokens": 907091206.0, "step": 5692 }, { "epoch": 2.895727365208545, "grad_norm": 0.8230746388435364, "learning_rate": 1e-05, "loss": 0.4251, "mean_token_accuracy": 0.8614833354949951, "num_tokens": 907264336.0, "step": 5693 }, { "epoch": 2.896236012207528, "grad_norm": 0.9321528077125549, "learning_rate": 1e-05, "loss": 0.446, "mean_token_accuracy": 0.8563940525054932, "num_tokens": 907426047.0, "step": 5694 }, { "epoch": 2.8967446592065107, "grad_norm": 0.8536028861999512, "learning_rate": 1e-05, "loss": 0.4308, "mean_token_accuracy": 0.8602784872055054, "num_tokens": 907587211.0, "step": 5695 }, { "epoch": 2.8972533062054935, "grad_norm": 0.8194431066513062, "learning_rate": 1e-05, "loss": 0.4351, "mean_token_accuracy": 0.859073281288147, "num_tokens": 907753455.0, "step": 5696 }, { "epoch": 2.8977619532044763, "grad_norm": 0.8605436682701111, "learning_rate": 1e-05, "loss": 0.3989, "mean_token_accuracy": 0.8686566948890686, "num_tokens": 907908915.0, "step": 5697 }, { "epoch": 2.8982706002034586, "grad_norm": 1.587167739868164, "learning_rate": 1e-05, "loss": 0.4312, "mean_token_accuracy": 0.8598580360412598, "num_tokens": 908071238.0, "step": 5698 }, { "epoch": 2.8987792472024414, "grad_norm": 0.918057918548584, "learning_rate": 1e-05, "loss": 0.4259, "mean_token_accuracy": 0.8622524738311768, "num_tokens": 908221402.0, "step": 5699 }, { "epoch": 2.899287894201424, "grad_norm": 0.9031367897987366, "learning_rate": 1e-05, "loss": 0.444, "mean_token_accuracy": 0.8568249344825745, "num_tokens": 908384365.0, "step": 5700 }, { "epoch": 2.899796541200407, "grad_norm": 0.8905377984046936, "learning_rate": 1e-05, "loss": 0.4137, "mean_token_accuracy": 0.8642496466636658, "num_tokens": 908531089.0, "step": 5701 }, { "epoch": 2.9003051881993898, "grad_norm": 0.9216523766517639, "learning_rate": 1e-05, "loss": 0.4417, "mean_token_accuracy": 0.8555152416229248, "num_tokens": 908692929.0, "step": 5702 }, { "epoch": 2.900813835198372, "grad_norm": 0.8346421718597412, "learning_rate": 1e-05, "loss": 0.4149, "mean_token_accuracy": 0.8649652004241943, "num_tokens": 908848433.0, "step": 5703 }, { "epoch": 2.901322482197355, "grad_norm": 0.8686506748199463, "learning_rate": 1e-05, "loss": 0.437, "mean_token_accuracy": 0.858033299446106, "num_tokens": 909009416.0, "step": 5704 }, { "epoch": 2.9018311291963377, "grad_norm": 0.8234359622001648, "learning_rate": 1e-05, "loss": 0.4011, "mean_token_accuracy": 0.8687787055969238, "num_tokens": 909166803.0, "step": 5705 }, { "epoch": 2.9023397761953205, "grad_norm": 0.8834549784660339, "learning_rate": 1e-05, "loss": 0.4207, "mean_token_accuracy": 0.8631112575531006, "num_tokens": 909326555.0, "step": 5706 }, { "epoch": 2.9028484231943033, "grad_norm": 0.8309847712516785, "learning_rate": 1e-05, "loss": 0.4092, "mean_token_accuracy": 0.8655686974525452, "num_tokens": 909484045.0, "step": 5707 }, { "epoch": 2.9033570701932856, "grad_norm": 0.8650888800621033, "learning_rate": 1e-05, "loss": 0.4233, "mean_token_accuracy": 0.8623444437980652, "num_tokens": 909639003.0, "step": 5708 }, { "epoch": 2.903865717192269, "grad_norm": 0.891726016998291, "learning_rate": 1e-05, "loss": 0.4534, "mean_token_accuracy": 0.8554078936576843, "num_tokens": 909799087.0, "step": 5709 }, { "epoch": 2.904374364191251, "grad_norm": 0.8960121870040894, "learning_rate": 1e-05, "loss": 0.4384, "mean_token_accuracy": 0.8592454791069031, "num_tokens": 909943894.0, "step": 5710 }, { "epoch": 2.904883011190234, "grad_norm": 0.9292611479759216, "learning_rate": 1e-05, "loss": 0.4558, "mean_token_accuracy": 0.8520259261131287, "num_tokens": 910111511.0, "step": 5711 }, { "epoch": 2.9053916581892167, "grad_norm": 0.7885063886642456, "learning_rate": 1e-05, "loss": 0.4275, "mean_token_accuracy": 0.8613859415054321, "num_tokens": 910282610.0, "step": 5712 }, { "epoch": 2.9059003051881995, "grad_norm": 0.8445349931716919, "learning_rate": 1e-05, "loss": 0.4386, "mean_token_accuracy": 0.8575445413589478, "num_tokens": 910445177.0, "step": 5713 }, { "epoch": 2.9064089521871823, "grad_norm": 0.8545286059379578, "learning_rate": 1e-05, "loss": 0.4319, "mean_token_accuracy": 0.8597594499588013, "num_tokens": 910603096.0, "step": 5714 }, { "epoch": 2.9069175991861647, "grad_norm": 0.8807753324508667, "learning_rate": 1e-05, "loss": 0.4343, "mean_token_accuracy": 0.8599470853805542, "num_tokens": 910760377.0, "step": 5715 }, { "epoch": 2.9074262461851474, "grad_norm": 0.8718137145042419, "learning_rate": 1e-05, "loss": 0.4324, "mean_token_accuracy": 0.8583258390426636, "num_tokens": 910907889.0, "step": 5716 }, { "epoch": 2.9079348931841302, "grad_norm": 0.9387333989143372, "learning_rate": 1e-05, "loss": 0.4435, "mean_token_accuracy": 0.8550516366958618, "num_tokens": 911055378.0, "step": 5717 }, { "epoch": 2.908443540183113, "grad_norm": 0.8214811086654663, "learning_rate": 1e-05, "loss": 0.4314, "mean_token_accuracy": 0.8608577847480774, "num_tokens": 911219506.0, "step": 5718 }, { "epoch": 2.908952187182096, "grad_norm": 0.8933064937591553, "learning_rate": 1e-05, "loss": 0.4444, "mean_token_accuracy": 0.8551285266876221, "num_tokens": 911381527.0, "step": 5719 }, { "epoch": 2.909460834181078, "grad_norm": 0.8504346609115601, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8552653789520264, "num_tokens": 911543675.0, "step": 5720 }, { "epoch": 2.909969481180061, "grad_norm": 0.8746544122695923, "learning_rate": 1e-05, "loss": 0.4514, "mean_token_accuracy": 0.855337381362915, "num_tokens": 911704808.0, "step": 5721 }, { "epoch": 2.9104781281790437, "grad_norm": 0.8669593930244446, "learning_rate": 1e-05, "loss": 0.4189, "mean_token_accuracy": 0.864909291267395, "num_tokens": 911875094.0, "step": 5722 }, { "epoch": 2.9109867751780265, "grad_norm": 0.9222279787063599, "learning_rate": 1e-05, "loss": 0.4572, "mean_token_accuracy": 0.8513796925544739, "num_tokens": 912035787.0, "step": 5723 }, { "epoch": 2.9114954221770093, "grad_norm": 0.8894267082214355, "learning_rate": 1e-05, "loss": 0.4365, "mean_token_accuracy": 0.8590693473815918, "num_tokens": 912185571.0, "step": 5724 }, { "epoch": 2.9120040691759916, "grad_norm": 0.8288862705230713, "learning_rate": 1e-05, "loss": 0.398, "mean_token_accuracy": 0.8673139810562134, "num_tokens": 912347614.0, "step": 5725 }, { "epoch": 2.9125127161749744, "grad_norm": 0.8889040946960449, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8502926826477051, "num_tokens": 912513109.0, "step": 5726 }, { "epoch": 2.913021363173957, "grad_norm": 0.8634620308876038, "learning_rate": 1e-05, "loss": 0.4331, "mean_token_accuracy": 0.8603816032409668, "num_tokens": 912679521.0, "step": 5727 }, { "epoch": 2.91353001017294, "grad_norm": 0.809637725353241, "learning_rate": 1e-05, "loss": 0.4429, "mean_token_accuracy": 0.8566316962242126, "num_tokens": 912850119.0, "step": 5728 }, { "epoch": 2.914038657171923, "grad_norm": 0.857983410358429, "learning_rate": 1e-05, "loss": 0.4201, "mean_token_accuracy": 0.863347053527832, "num_tokens": 913006967.0, "step": 5729 }, { "epoch": 2.914547304170905, "grad_norm": 0.8073707818984985, "learning_rate": 1e-05, "loss": 0.4353, "mean_token_accuracy": 0.8588767647743225, "num_tokens": 913178881.0, "step": 5730 }, { "epoch": 2.9150559511698884, "grad_norm": 0.8749431371688843, "learning_rate": 1e-05, "loss": 0.4402, "mean_token_accuracy": 0.8577772974967957, "num_tokens": 913345431.0, "step": 5731 }, { "epoch": 2.9155645981688707, "grad_norm": 0.8460366725921631, "learning_rate": 1e-05, "loss": 0.4291, "mean_token_accuracy": 0.8599922060966492, "num_tokens": 913518334.0, "step": 5732 }, { "epoch": 2.9160732451678535, "grad_norm": 0.8206888437271118, "learning_rate": 1e-05, "loss": 0.4179, "mean_token_accuracy": 0.86368727684021, "num_tokens": 913677797.0, "step": 5733 }, { "epoch": 2.9165818921668363, "grad_norm": 0.9418649077415466, "learning_rate": 1e-05, "loss": 0.4434, "mean_token_accuracy": 0.8554619550704956, "num_tokens": 913821428.0, "step": 5734 }, { "epoch": 2.917090539165819, "grad_norm": 0.8547963500022888, "learning_rate": 1e-05, "loss": 0.4176, "mean_token_accuracy": 0.8628682494163513, "num_tokens": 913969348.0, "step": 5735 }, { "epoch": 2.917599186164802, "grad_norm": 0.8919240236282349, "learning_rate": 1e-05, "loss": 0.4215, "mean_token_accuracy": 0.8628110885620117, "num_tokens": 914126038.0, "step": 5736 }, { "epoch": 2.918107833163784, "grad_norm": 0.914557933807373, "learning_rate": 1e-05, "loss": 0.4351, "mean_token_accuracy": 0.8588264584541321, "num_tokens": 914289459.0, "step": 5737 }, { "epoch": 2.918616480162767, "grad_norm": 0.817688524723053, "learning_rate": 1e-05, "loss": 0.4534, "mean_token_accuracy": 0.8541843891143799, "num_tokens": 914459291.0, "step": 5738 }, { "epoch": 2.9191251271617498, "grad_norm": 0.9456674456596375, "learning_rate": 1e-05, "loss": 0.4091, "mean_token_accuracy": 0.8657861948013306, "num_tokens": 914608971.0, "step": 5739 }, { "epoch": 2.9196337741607326, "grad_norm": 0.855796754360199, "learning_rate": 1e-05, "loss": 0.4178, "mean_token_accuracy": 0.8632485270500183, "num_tokens": 914765805.0, "step": 5740 }, { "epoch": 2.9201424211597153, "grad_norm": 0.9075614213943481, "learning_rate": 1e-05, "loss": 0.4277, "mean_token_accuracy": 0.8599561452865601, "num_tokens": 914923608.0, "step": 5741 }, { "epoch": 2.9206510681586977, "grad_norm": 1.9019993543624878, "learning_rate": 1e-05, "loss": 0.4904, "mean_token_accuracy": 0.844566822052002, "num_tokens": 915083509.0, "step": 5742 }, { "epoch": 2.9211597151576805, "grad_norm": 0.9290248155593872, "learning_rate": 1e-05, "loss": 0.4099, "mean_token_accuracy": 0.865955650806427, "num_tokens": 915245487.0, "step": 5743 }, { "epoch": 2.9216683621566633, "grad_norm": 1.0105957984924316, "learning_rate": 1e-05, "loss": 0.4533, "mean_token_accuracy": 0.8545387983322144, "num_tokens": 915391939.0, "step": 5744 }, { "epoch": 2.922177009155646, "grad_norm": 0.8672761917114258, "learning_rate": 1e-05, "loss": 0.4149, "mean_token_accuracy": 0.8643245697021484, "num_tokens": 915543628.0, "step": 5745 }, { "epoch": 2.922685656154629, "grad_norm": 0.8365616798400879, "learning_rate": 1e-05, "loss": 0.4474, "mean_token_accuracy": 0.8555518388748169, "num_tokens": 915708594.0, "step": 5746 }, { "epoch": 2.923194303153611, "grad_norm": 0.9645183682441711, "learning_rate": 1e-05, "loss": 0.4179, "mean_token_accuracy": 0.8628501892089844, "num_tokens": 915850846.0, "step": 5747 }, { "epoch": 2.923702950152594, "grad_norm": 0.9386368989944458, "learning_rate": 1e-05, "loss": 0.449, "mean_token_accuracy": 0.8545777201652527, "num_tokens": 915997116.0, "step": 5748 }, { "epoch": 2.9242115971515767, "grad_norm": 0.8516026735305786, "learning_rate": 1e-05, "loss": 0.4543, "mean_token_accuracy": 0.8535363078117371, "num_tokens": 916164228.0, "step": 5749 }, { "epoch": 2.9247202441505595, "grad_norm": 1.0035674571990967, "learning_rate": 1e-05, "loss": 0.4681, "mean_token_accuracy": 0.8500915765762329, "num_tokens": 916313182.0, "step": 5750 }, { "epoch": 2.9252288911495423, "grad_norm": 0.8694740533828735, "learning_rate": 1e-05, "loss": 0.4278, "mean_token_accuracy": 0.8602680563926697, "num_tokens": 916468393.0, "step": 5751 }, { "epoch": 2.9257375381485247, "grad_norm": 0.9328964352607727, "learning_rate": 1e-05, "loss": 0.4351, "mean_token_accuracy": 0.8590227365493774, "num_tokens": 916617077.0, "step": 5752 }, { "epoch": 2.926246185147508, "grad_norm": 0.9030390977859497, "learning_rate": 1e-05, "loss": 0.4144, "mean_token_accuracy": 0.8632201552391052, "num_tokens": 916764508.0, "step": 5753 }, { "epoch": 2.9267548321464902, "grad_norm": 0.918763279914856, "learning_rate": 1e-05, "loss": 0.4423, "mean_token_accuracy": 0.8566667437553406, "num_tokens": 916935162.0, "step": 5754 }, { "epoch": 2.927263479145473, "grad_norm": 0.8663561344146729, "learning_rate": 1e-05, "loss": 0.4551, "mean_token_accuracy": 0.853164553642273, "num_tokens": 917106077.0, "step": 5755 }, { "epoch": 2.927772126144456, "grad_norm": 0.9716460704803467, "learning_rate": 1e-05, "loss": 0.4704, "mean_token_accuracy": 0.8501912355422974, "num_tokens": 917266918.0, "step": 5756 }, { "epoch": 2.9282807731434386, "grad_norm": 0.9873749017715454, "learning_rate": 1e-05, "loss": 0.4009, "mean_token_accuracy": 0.868045449256897, "num_tokens": 917420760.0, "step": 5757 }, { "epoch": 2.9287894201424214, "grad_norm": 0.9092333912849426, "learning_rate": 1e-05, "loss": 0.4254, "mean_token_accuracy": 0.8617450594902039, "num_tokens": 917586569.0, "step": 5758 }, { "epoch": 2.9292980671414037, "grad_norm": 0.8969524502754211, "learning_rate": 1e-05, "loss": 0.458, "mean_token_accuracy": 0.8509863615036011, "num_tokens": 917750067.0, "step": 5759 }, { "epoch": 2.9298067141403865, "grad_norm": 0.9118671417236328, "learning_rate": 1e-05, "loss": 0.4474, "mean_token_accuracy": 0.8554458618164062, "num_tokens": 917913379.0, "step": 5760 }, { "epoch": 2.9303153611393693, "grad_norm": 0.8653302788734436, "learning_rate": 1e-05, "loss": 0.4452, "mean_token_accuracy": 0.855238139629364, "num_tokens": 918072489.0, "step": 5761 }, { "epoch": 2.930824008138352, "grad_norm": 1.0064209699630737, "learning_rate": 1e-05, "loss": 0.4518, "mean_token_accuracy": 0.8538899421691895, "num_tokens": 918219033.0, "step": 5762 }, { "epoch": 2.931332655137335, "grad_norm": 0.9989521503448486, "learning_rate": 1e-05, "loss": 0.4246, "mean_token_accuracy": 0.8602008819580078, "num_tokens": 918368824.0, "step": 5763 }, { "epoch": 2.931841302136317, "grad_norm": 0.8290295004844666, "learning_rate": 1e-05, "loss": 0.438, "mean_token_accuracy": 0.8582369685173035, "num_tokens": 918538998.0, "step": 5764 }, { "epoch": 2.9323499491353, "grad_norm": 0.9657419919967651, "learning_rate": 1e-05, "loss": 0.4414, "mean_token_accuracy": 0.8566818237304688, "num_tokens": 918692837.0, "step": 5765 }, { "epoch": 2.932858596134283, "grad_norm": 0.9440754055976868, "learning_rate": 1e-05, "loss": 0.4077, "mean_token_accuracy": 0.8666754961013794, "num_tokens": 918848120.0, "step": 5766 }, { "epoch": 2.9333672431332656, "grad_norm": 0.843254566192627, "learning_rate": 1e-05, "loss": 0.4247, "mean_token_accuracy": 0.8619252443313599, "num_tokens": 919000637.0, "step": 5767 }, { "epoch": 2.9338758901322484, "grad_norm": 0.9444208741188049, "learning_rate": 1e-05, "loss": 0.4594, "mean_token_accuracy": 0.8507851362228394, "num_tokens": 919158162.0, "step": 5768 }, { "epoch": 2.9343845371312307, "grad_norm": 0.9736772179603577, "learning_rate": 1e-05, "loss": 0.4454, "mean_token_accuracy": 0.8532898426055908, "num_tokens": 919298786.0, "step": 5769 }, { "epoch": 2.9348931841302135, "grad_norm": 0.8969355225563049, "learning_rate": 1e-05, "loss": 0.4455, "mean_token_accuracy": 0.855862021446228, "num_tokens": 919454435.0, "step": 5770 }, { "epoch": 2.9354018311291963, "grad_norm": 0.9506131410598755, "learning_rate": 1e-05, "loss": 0.4198, "mean_token_accuracy": 0.8641059398651123, "num_tokens": 919611410.0, "step": 5771 }, { "epoch": 2.935910478128179, "grad_norm": 0.923133909702301, "learning_rate": 1e-05, "loss": 0.4606, "mean_token_accuracy": 0.8519150018692017, "num_tokens": 919767371.0, "step": 5772 }, { "epoch": 2.936419125127162, "grad_norm": 0.8677843809127808, "learning_rate": 1e-05, "loss": 0.4098, "mean_token_accuracy": 0.8654341101646423, "num_tokens": 919927418.0, "step": 5773 }, { "epoch": 2.936927772126144, "grad_norm": 0.8953611850738525, "learning_rate": 1e-05, "loss": 0.4254, "mean_token_accuracy": 0.8617711067199707, "num_tokens": 920097231.0, "step": 5774 }, { "epoch": 2.9374364191251274, "grad_norm": 0.9595847725868225, "learning_rate": 1e-05, "loss": 0.4293, "mean_token_accuracy": 0.8610657453536987, "num_tokens": 920251209.0, "step": 5775 }, { "epoch": 2.9379450661241098, "grad_norm": 0.839107871055603, "learning_rate": 1e-05, "loss": 0.4336, "mean_token_accuracy": 0.8592088222503662, "num_tokens": 920404124.0, "step": 5776 }, { "epoch": 2.9384537131230926, "grad_norm": 0.9945467114448547, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8458770513534546, "num_tokens": 920562123.0, "step": 5777 }, { "epoch": 2.9389623601220753, "grad_norm": 0.9366150498390198, "learning_rate": 1e-05, "loss": 0.4482, "mean_token_accuracy": 0.854699969291687, "num_tokens": 920723542.0, "step": 5778 }, { "epoch": 2.939471007121058, "grad_norm": 0.9452741742134094, "learning_rate": 1e-05, "loss": 0.4341, "mean_token_accuracy": 0.8599119782447815, "num_tokens": 920881486.0, "step": 5779 }, { "epoch": 2.939979654120041, "grad_norm": 0.9388673305511475, "learning_rate": 1e-05, "loss": 0.4526, "mean_token_accuracy": 0.853776216506958, "num_tokens": 921032468.0, "step": 5780 }, { "epoch": 2.9404883011190233, "grad_norm": 0.9121649265289307, "learning_rate": 1e-05, "loss": 0.4253, "mean_token_accuracy": 0.8614283800125122, "num_tokens": 921185787.0, "step": 5781 }, { "epoch": 2.940996948118006, "grad_norm": 0.8633955717086792, "learning_rate": 1e-05, "loss": 0.4254, "mean_token_accuracy": 0.8609992861747742, "num_tokens": 921343839.0, "step": 5782 }, { "epoch": 2.941505595116989, "grad_norm": 0.9059745073318481, "learning_rate": 1e-05, "loss": 0.4727, "mean_token_accuracy": 0.8484917879104614, "num_tokens": 921504957.0, "step": 5783 }, { "epoch": 2.9420142421159716, "grad_norm": 0.8350028991699219, "learning_rate": 1e-05, "loss": 0.4218, "mean_token_accuracy": 0.8621200323104858, "num_tokens": 921677191.0, "step": 5784 }, { "epoch": 2.9425228891149544, "grad_norm": 0.9048314094543457, "learning_rate": 1e-05, "loss": 0.4706, "mean_token_accuracy": 0.8496427536010742, "num_tokens": 921834278.0, "step": 5785 }, { "epoch": 2.9430315361139368, "grad_norm": 0.8689739108085632, "learning_rate": 1e-05, "loss": 0.4199, "mean_token_accuracy": 0.8630783557891846, "num_tokens": 921985325.0, "step": 5786 }, { "epoch": 2.9435401831129195, "grad_norm": 0.8724319934844971, "learning_rate": 1e-05, "loss": 0.444, "mean_token_accuracy": 0.8558523654937744, "num_tokens": 922150194.0, "step": 5787 }, { "epoch": 2.9440488301119023, "grad_norm": 0.8524068593978882, "learning_rate": 1e-05, "loss": 0.391, "mean_token_accuracy": 0.8696946501731873, "num_tokens": 922307545.0, "step": 5788 }, { "epoch": 2.944557477110885, "grad_norm": 0.9184873104095459, "learning_rate": 1e-05, "loss": 0.4684, "mean_token_accuracy": 0.8503162860870361, "num_tokens": 922467567.0, "step": 5789 }, { "epoch": 2.945066124109868, "grad_norm": 0.8801780939102173, "learning_rate": 1e-05, "loss": 0.4286, "mean_token_accuracy": 0.8609876036643982, "num_tokens": 922627414.0, "step": 5790 }, { "epoch": 2.9455747711088502, "grad_norm": 0.9340073466300964, "learning_rate": 1e-05, "loss": 0.4448, "mean_token_accuracy": 0.856844961643219, "num_tokens": 922782327.0, "step": 5791 }, { "epoch": 2.946083418107833, "grad_norm": 0.9026318192481995, "learning_rate": 1e-05, "loss": 0.4559, "mean_token_accuracy": 0.8538775444030762, "num_tokens": 922940862.0, "step": 5792 }, { "epoch": 2.946592065106816, "grad_norm": 0.8768016695976257, "learning_rate": 1e-05, "loss": 0.4637, "mean_token_accuracy": 0.8493372797966003, "num_tokens": 923105553.0, "step": 5793 }, { "epoch": 2.9471007121057986, "grad_norm": 0.8517777919769287, "learning_rate": 1e-05, "loss": 0.4204, "mean_token_accuracy": 0.8641206622123718, "num_tokens": 923268455.0, "step": 5794 }, { "epoch": 2.9476093591047814, "grad_norm": 0.8783465027809143, "learning_rate": 1e-05, "loss": 0.4498, "mean_token_accuracy": 0.8550516963005066, "num_tokens": 923426151.0, "step": 5795 }, { "epoch": 2.9481180061037637, "grad_norm": 0.8488904237747192, "learning_rate": 1e-05, "loss": 0.4081, "mean_token_accuracy": 0.8654050230979919, "num_tokens": 923581013.0, "step": 5796 }, { "epoch": 2.948626653102747, "grad_norm": 0.8633197546005249, "learning_rate": 1e-05, "loss": 0.4261, "mean_token_accuracy": 0.860765278339386, "num_tokens": 923735620.0, "step": 5797 }, { "epoch": 2.9491353001017293, "grad_norm": 0.8653395771980286, "learning_rate": 1e-05, "loss": 0.4483, "mean_token_accuracy": 0.8558655381202698, "num_tokens": 923889387.0, "step": 5798 }, { "epoch": 2.949643947100712, "grad_norm": 0.9279692769050598, "learning_rate": 1e-05, "loss": 0.443, "mean_token_accuracy": 0.8564894199371338, "num_tokens": 924043776.0, "step": 5799 }, { "epoch": 2.950152594099695, "grad_norm": 0.8566408753395081, "learning_rate": 1e-05, "loss": 0.4252, "mean_token_accuracy": 0.8626484274864197, "num_tokens": 924193594.0, "step": 5800 }, { "epoch": 2.9506612410986777, "grad_norm": 0.8193241953849792, "learning_rate": 1e-05, "loss": 0.4094, "mean_token_accuracy": 0.8671364784240723, "num_tokens": 924362420.0, "step": 5801 }, { "epoch": 2.9511698880976605, "grad_norm": 0.8682567477226257, "learning_rate": 1e-05, "loss": 0.4405, "mean_token_accuracy": 0.857014536857605, "num_tokens": 924523122.0, "step": 5802 }, { "epoch": 2.951678535096643, "grad_norm": 0.8148035407066345, "learning_rate": 1e-05, "loss": 0.3869, "mean_token_accuracy": 0.8721718788146973, "num_tokens": 924684963.0, "step": 5803 }, { "epoch": 2.9521871820956256, "grad_norm": 0.9142512083053589, "learning_rate": 1e-05, "loss": 0.4299, "mean_token_accuracy": 0.8605676889419556, "num_tokens": 924829073.0, "step": 5804 }, { "epoch": 2.9526958290946084, "grad_norm": 0.9010807275772095, "learning_rate": 1e-05, "loss": 0.4487, "mean_token_accuracy": 0.8550687432289124, "num_tokens": 925002218.0, "step": 5805 }, { "epoch": 2.953204476093591, "grad_norm": 0.8682645559310913, "learning_rate": 1e-05, "loss": 0.4288, "mean_token_accuracy": 0.8604177236557007, "num_tokens": 925176143.0, "step": 5806 }, { "epoch": 2.953713123092574, "grad_norm": 0.875255286693573, "learning_rate": 1e-05, "loss": 0.4139, "mean_token_accuracy": 0.8656608462333679, "num_tokens": 925331149.0, "step": 5807 }, { "epoch": 2.9542217700915563, "grad_norm": 0.9133750796318054, "learning_rate": 1e-05, "loss": 0.4127, "mean_token_accuracy": 0.8645917773246765, "num_tokens": 925477479.0, "step": 5808 }, { "epoch": 2.954730417090539, "grad_norm": 0.8490424752235413, "learning_rate": 1e-05, "loss": 0.4319, "mean_token_accuracy": 0.8602641820907593, "num_tokens": 925643872.0, "step": 5809 }, { "epoch": 2.955239064089522, "grad_norm": 0.8649740219116211, "learning_rate": 1e-05, "loss": 0.4087, "mean_token_accuracy": 0.8658615946769714, "num_tokens": 925794642.0, "step": 5810 }, { "epoch": 2.9557477110885046, "grad_norm": 0.8914206624031067, "learning_rate": 1e-05, "loss": 0.4504, "mean_token_accuracy": 0.8545023798942566, "num_tokens": 925945760.0, "step": 5811 }, { "epoch": 2.9562563580874874, "grad_norm": 0.8733227252960205, "learning_rate": 1e-05, "loss": 0.4024, "mean_token_accuracy": 0.8668968677520752, "num_tokens": 926085112.0, "step": 5812 }, { "epoch": 2.9567650050864698, "grad_norm": 0.8863427042961121, "learning_rate": 1e-05, "loss": 0.4397, "mean_token_accuracy": 0.8588989973068237, "num_tokens": 926240188.0, "step": 5813 }, { "epoch": 2.9572736520854526, "grad_norm": 0.9870784878730774, "learning_rate": 1e-05, "loss": 0.471, "mean_token_accuracy": 0.8487330079078674, "num_tokens": 926404523.0, "step": 5814 }, { "epoch": 2.9577822990844354, "grad_norm": 0.8670083284378052, "learning_rate": 1e-05, "loss": 0.4, "mean_token_accuracy": 0.867817759513855, "num_tokens": 926553199.0, "step": 5815 }, { "epoch": 2.958290946083418, "grad_norm": 0.8427149653434753, "learning_rate": 1e-05, "loss": 0.4162, "mean_token_accuracy": 0.8652034997940063, "num_tokens": 926703380.0, "step": 5816 }, { "epoch": 2.958799593082401, "grad_norm": 0.8586266040802002, "learning_rate": 1e-05, "loss": 0.4348, "mean_token_accuracy": 0.8596380352973938, "num_tokens": 926863378.0, "step": 5817 }, { "epoch": 2.9593082400813833, "grad_norm": 0.9027202129364014, "learning_rate": 1e-05, "loss": 0.4329, "mean_token_accuracy": 0.8599502444267273, "num_tokens": 927009636.0, "step": 5818 }, { "epoch": 2.9598168870803665, "grad_norm": 0.8678000569343567, "learning_rate": 1e-05, "loss": 0.4253, "mean_token_accuracy": 0.8616846799850464, "num_tokens": 927168274.0, "step": 5819 }, { "epoch": 2.960325534079349, "grad_norm": 0.8938841819763184, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8489087820053101, "num_tokens": 927349363.0, "step": 5820 }, { "epoch": 2.9608341810783316, "grad_norm": 0.8878791928291321, "learning_rate": 1e-05, "loss": 0.4292, "mean_token_accuracy": 0.8604944348335266, "num_tokens": 927504872.0, "step": 5821 }, { "epoch": 2.9613428280773144, "grad_norm": 0.8782253265380859, "learning_rate": 1e-05, "loss": 0.4121, "mean_token_accuracy": 0.8634744882583618, "num_tokens": 927661962.0, "step": 5822 }, { "epoch": 2.961851475076297, "grad_norm": 0.8695517778396606, "learning_rate": 1e-05, "loss": 0.4543, "mean_token_accuracy": 0.8522966504096985, "num_tokens": 927820356.0, "step": 5823 }, { "epoch": 2.96236012207528, "grad_norm": 1.1400654315948486, "learning_rate": 1e-05, "loss": 0.4255, "mean_token_accuracy": 0.8615685701370239, "num_tokens": 927984140.0, "step": 5824 }, { "epoch": 2.9628687690742623, "grad_norm": 0.8361784219741821, "learning_rate": 1e-05, "loss": 0.4135, "mean_token_accuracy": 0.8660613298416138, "num_tokens": 928149741.0, "step": 5825 }, { "epoch": 2.963377416073245, "grad_norm": 0.8206629157066345, "learning_rate": 1e-05, "loss": 0.44, "mean_token_accuracy": 0.8572777509689331, "num_tokens": 928321531.0, "step": 5826 }, { "epoch": 2.963886063072228, "grad_norm": 0.889646589756012, "learning_rate": 1e-05, "loss": 0.4235, "mean_token_accuracy": 0.8619962930679321, "num_tokens": 928470935.0, "step": 5827 }, { "epoch": 2.9643947100712107, "grad_norm": 0.9425367712974548, "learning_rate": 1e-05, "loss": 0.407, "mean_token_accuracy": 0.8678266406059265, "num_tokens": 928624216.0, "step": 5828 }, { "epoch": 2.9649033570701935, "grad_norm": 0.8638216257095337, "learning_rate": 1e-05, "loss": 0.4432, "mean_token_accuracy": 0.8562783002853394, "num_tokens": 928787184.0, "step": 5829 }, { "epoch": 2.965412004069176, "grad_norm": 0.8920972943305969, "learning_rate": 1e-05, "loss": 0.4364, "mean_token_accuracy": 0.8587175607681274, "num_tokens": 928950416.0, "step": 5830 }, { "epoch": 2.9659206510681586, "grad_norm": 0.8766757845878601, "learning_rate": 1e-05, "loss": 0.4639, "mean_token_accuracy": 0.8482862114906311, "num_tokens": 929103335.0, "step": 5831 }, { "epoch": 2.9664292980671414, "grad_norm": 0.9349969029426575, "learning_rate": 1e-05, "loss": 0.4122, "mean_token_accuracy": 0.8652468919754028, "num_tokens": 929254662.0, "step": 5832 }, { "epoch": 2.966937945066124, "grad_norm": 0.9228084683418274, "learning_rate": 1e-05, "loss": 0.4211, "mean_token_accuracy": 0.8630307912826538, "num_tokens": 929403805.0, "step": 5833 }, { "epoch": 2.967446592065107, "grad_norm": 0.860146701335907, "learning_rate": 1e-05, "loss": 0.4109, "mean_token_accuracy": 0.8636744618415833, "num_tokens": 929559750.0, "step": 5834 }, { "epoch": 2.9679552390640893, "grad_norm": 0.9154155254364014, "learning_rate": 1e-05, "loss": 0.4011, "mean_token_accuracy": 0.8682236075401306, "num_tokens": 929708224.0, "step": 5835 }, { "epoch": 2.968463886063072, "grad_norm": 0.8699504733085632, "learning_rate": 1e-05, "loss": 0.4218, "mean_token_accuracy": 0.8614717721939087, "num_tokens": 929861530.0, "step": 5836 }, { "epoch": 2.968972533062055, "grad_norm": 0.9429341554641724, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8554191589355469, "num_tokens": 930013475.0, "step": 5837 }, { "epoch": 2.9694811800610377, "grad_norm": 0.837781548500061, "learning_rate": 1e-05, "loss": 0.427, "mean_token_accuracy": 0.8609052896499634, "num_tokens": 930177144.0, "step": 5838 }, { "epoch": 2.9699898270600205, "grad_norm": 0.9412606358528137, "learning_rate": 1e-05, "loss": 0.4665, "mean_token_accuracy": 0.8494040966033936, "num_tokens": 930339758.0, "step": 5839 }, { "epoch": 2.970498474059003, "grad_norm": 0.8832743167877197, "learning_rate": 1e-05, "loss": 0.437, "mean_token_accuracy": 0.8587670922279358, "num_tokens": 930500065.0, "step": 5840 }, { "epoch": 2.971007121057986, "grad_norm": 0.9219328165054321, "learning_rate": 1e-05, "loss": 0.4393, "mean_token_accuracy": 0.8565832376480103, "num_tokens": 930650312.0, "step": 5841 }, { "epoch": 2.9715157680569684, "grad_norm": 0.9061831831932068, "learning_rate": 1e-05, "loss": 0.4494, "mean_token_accuracy": 0.8533433675765991, "num_tokens": 930809980.0, "step": 5842 }, { "epoch": 2.972024415055951, "grad_norm": 0.8891438245773315, "learning_rate": 1e-05, "loss": 0.4757, "mean_token_accuracy": 0.8493027687072754, "num_tokens": 930965995.0, "step": 5843 }, { "epoch": 2.972533062054934, "grad_norm": 0.869954526424408, "learning_rate": 1e-05, "loss": 0.3767, "mean_token_accuracy": 0.8745460510253906, "num_tokens": 931124035.0, "step": 5844 }, { "epoch": 2.9730417090539167, "grad_norm": 0.8907552361488342, "learning_rate": 1e-05, "loss": 0.4332, "mean_token_accuracy": 0.8616758584976196, "num_tokens": 931271106.0, "step": 5845 }, { "epoch": 2.9735503560528995, "grad_norm": 0.9448345303535461, "learning_rate": 1e-05, "loss": 0.4254, "mean_token_accuracy": 0.8614962100982666, "num_tokens": 931415682.0, "step": 5846 }, { "epoch": 2.974059003051882, "grad_norm": 0.8706139922142029, "learning_rate": 1e-05, "loss": 0.4197, "mean_token_accuracy": 0.8637193441390991, "num_tokens": 931571305.0, "step": 5847 }, { "epoch": 2.9745676500508647, "grad_norm": 0.9198446869850159, "learning_rate": 1e-05, "loss": 0.4755, "mean_token_accuracy": 0.8487168550491333, "num_tokens": 931735254.0, "step": 5848 }, { "epoch": 2.9750762970498474, "grad_norm": 0.9850619435310364, "learning_rate": 1e-05, "loss": 0.4645, "mean_token_accuracy": 0.850271463394165, "num_tokens": 931891725.0, "step": 5849 }, { "epoch": 2.9755849440488302, "grad_norm": 0.9214954376220703, "learning_rate": 1e-05, "loss": 0.4462, "mean_token_accuracy": 0.856054425239563, "num_tokens": 932044046.0, "step": 5850 }, { "epoch": 2.976093591047813, "grad_norm": 0.9269073009490967, "learning_rate": 1e-05, "loss": 0.422, "mean_token_accuracy": 0.8621110320091248, "num_tokens": 932197593.0, "step": 5851 }, { "epoch": 2.9766022380467954, "grad_norm": 0.8905263543128967, "learning_rate": 1e-05, "loss": 0.425, "mean_token_accuracy": 0.8598513603210449, "num_tokens": 932350730.0, "step": 5852 }, { "epoch": 2.977110885045778, "grad_norm": 0.8312326669692993, "learning_rate": 1e-05, "loss": 0.4539, "mean_token_accuracy": 0.8548107743263245, "num_tokens": 932513750.0, "step": 5853 }, { "epoch": 2.977619532044761, "grad_norm": 0.89973384141922, "learning_rate": 1e-05, "loss": 0.4162, "mean_token_accuracy": 0.8633626699447632, "num_tokens": 932673626.0, "step": 5854 }, { "epoch": 2.9781281790437437, "grad_norm": 0.8836349844932556, "learning_rate": 1e-05, "loss": 0.4065, "mean_token_accuracy": 0.8674150109291077, "num_tokens": 932846477.0, "step": 5855 }, { "epoch": 2.9786368260427265, "grad_norm": 0.8640139698982239, "learning_rate": 1e-05, "loss": 0.4461, "mean_token_accuracy": 0.8566869497299194, "num_tokens": 933021539.0, "step": 5856 }, { "epoch": 2.979145473041709, "grad_norm": 0.9574412107467651, "learning_rate": 1e-05, "loss": 0.4622, "mean_token_accuracy": 0.8520747423171997, "num_tokens": 933173949.0, "step": 5857 }, { "epoch": 2.9796541200406916, "grad_norm": 0.8743879199028015, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8490179181098938, "num_tokens": 933342649.0, "step": 5858 }, { "epoch": 2.9801627670396744, "grad_norm": 1.0385737419128418, "learning_rate": 1e-05, "loss": 0.4288, "mean_token_accuracy": 0.8602662086486816, "num_tokens": 933509483.0, "step": 5859 }, { "epoch": 2.980671414038657, "grad_norm": 0.8679269552230835, "learning_rate": 1e-05, "loss": 0.4137, "mean_token_accuracy": 0.8649242520332336, "num_tokens": 933683564.0, "step": 5860 }, { "epoch": 2.98118006103764, "grad_norm": 0.8368974924087524, "learning_rate": 1e-05, "loss": 0.4153, "mean_token_accuracy": 0.8648080229759216, "num_tokens": 933842448.0, "step": 5861 }, { "epoch": 2.9816887080366223, "grad_norm": 0.9177777767181396, "learning_rate": 1e-05, "loss": 0.4774, "mean_token_accuracy": 0.8469770550727844, "num_tokens": 933996328.0, "step": 5862 }, { "epoch": 2.982197355035605, "grad_norm": 0.9060273766517639, "learning_rate": 1e-05, "loss": 0.3981, "mean_token_accuracy": 0.8685603141784668, "num_tokens": 934146606.0, "step": 5863 }, { "epoch": 2.982706002034588, "grad_norm": 0.8164167404174805, "learning_rate": 1e-05, "loss": 0.4423, "mean_token_accuracy": 0.8569616079330444, "num_tokens": 934323985.0, "step": 5864 }, { "epoch": 2.9832146490335707, "grad_norm": 0.9017447233200073, "learning_rate": 1e-05, "loss": 0.4162, "mean_token_accuracy": 0.8646686673164368, "num_tokens": 934492502.0, "step": 5865 }, { "epoch": 2.9837232960325535, "grad_norm": 0.8669429421424866, "learning_rate": 1e-05, "loss": 0.4405, "mean_token_accuracy": 0.8561424016952515, "num_tokens": 934651946.0, "step": 5866 }, { "epoch": 2.9842319430315363, "grad_norm": 0.8700833320617676, "learning_rate": 1e-05, "loss": 0.4154, "mean_token_accuracy": 0.8638380169868469, "num_tokens": 934817411.0, "step": 5867 }, { "epoch": 2.984740590030519, "grad_norm": 0.8727999925613403, "learning_rate": 1e-05, "loss": 0.4343, "mean_token_accuracy": 0.8592092990875244, "num_tokens": 934983408.0, "step": 5868 }, { "epoch": 2.9852492370295014, "grad_norm": 0.8172105550765991, "learning_rate": 1e-05, "loss": 0.3962, "mean_token_accuracy": 0.8701355457305908, "num_tokens": 935138600.0, "step": 5869 }, { "epoch": 2.985757884028484, "grad_norm": 0.9610244035720825, "learning_rate": 1e-05, "loss": 0.4521, "mean_token_accuracy": 0.8546727895736694, "num_tokens": 935295080.0, "step": 5870 }, { "epoch": 2.986266531027467, "grad_norm": 0.8617233633995056, "learning_rate": 1e-05, "loss": 0.4283, "mean_token_accuracy": 0.8610825538635254, "num_tokens": 935460191.0, "step": 5871 }, { "epoch": 2.9867751780264498, "grad_norm": 0.8361219763755798, "learning_rate": 1e-05, "loss": 0.4425, "mean_token_accuracy": 0.8569613695144653, "num_tokens": 935616845.0, "step": 5872 }, { "epoch": 2.9872838250254325, "grad_norm": 0.9874329566955566, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8622537851333618, "num_tokens": 935768955.0, "step": 5873 }, { "epoch": 2.987792472024415, "grad_norm": 0.9735478758811951, "learning_rate": 1e-05, "loss": 0.4334, "mean_token_accuracy": 0.8583329319953918, "num_tokens": 935925552.0, "step": 5874 }, { "epoch": 2.9883011190233977, "grad_norm": 0.9315745234489441, "learning_rate": 1e-05, "loss": 0.4159, "mean_token_accuracy": 0.8649601340293884, "num_tokens": 936086816.0, "step": 5875 }, { "epoch": 2.9888097660223805, "grad_norm": 0.9147635698318481, "learning_rate": 1e-05, "loss": 0.4417, "mean_token_accuracy": 0.8556721806526184, "num_tokens": 936253904.0, "step": 5876 }, { "epoch": 2.9893184130213633, "grad_norm": 0.935129702091217, "learning_rate": 1e-05, "loss": 0.4219, "mean_token_accuracy": 0.8635666370391846, "num_tokens": 936409194.0, "step": 5877 }, { "epoch": 2.989827060020346, "grad_norm": 0.9388399720191956, "learning_rate": 1e-05, "loss": 0.4181, "mean_token_accuracy": 0.8653729557991028, "num_tokens": 936577899.0, "step": 5878 }, { "epoch": 2.9903357070193284, "grad_norm": 0.8908611536026001, "learning_rate": 1e-05, "loss": 0.4414, "mean_token_accuracy": 0.856574296951294, "num_tokens": 936734381.0, "step": 5879 }, { "epoch": 2.990844354018311, "grad_norm": 0.8607943654060364, "learning_rate": 1e-05, "loss": 0.4315, "mean_token_accuracy": 0.8605170249938965, "num_tokens": 936910362.0, "step": 5880 }, { "epoch": 2.991353001017294, "grad_norm": 0.8888345956802368, "learning_rate": 1e-05, "loss": 0.4425, "mean_token_accuracy": 0.8557511568069458, "num_tokens": 937078132.0, "step": 5881 }, { "epoch": 2.9918616480162767, "grad_norm": 0.9754486680030823, "learning_rate": 1e-05, "loss": 0.4191, "mean_token_accuracy": 0.8625936508178711, "num_tokens": 937231947.0, "step": 5882 }, { "epoch": 2.9923702950152595, "grad_norm": 0.8599565029144287, "learning_rate": 1e-05, "loss": 0.4438, "mean_token_accuracy": 0.8562726378440857, "num_tokens": 937393536.0, "step": 5883 }, { "epoch": 2.992878942014242, "grad_norm": 0.9650074243545532, "learning_rate": 1e-05, "loss": 0.4699, "mean_token_accuracy": 0.8488419055938721, "num_tokens": 937547528.0, "step": 5884 }, { "epoch": 2.9933875890132247, "grad_norm": 0.9295585751533508, "learning_rate": 1e-05, "loss": 0.4081, "mean_token_accuracy": 0.8679110407829285, "num_tokens": 937697898.0, "step": 5885 }, { "epoch": 2.9938962360122074, "grad_norm": 0.8543989062309265, "learning_rate": 1e-05, "loss": 0.4233, "mean_token_accuracy": 0.8614925146102905, "num_tokens": 937866031.0, "step": 5886 }, { "epoch": 2.9944048830111902, "grad_norm": 0.9370037913322449, "learning_rate": 1e-05, "loss": 0.4275, "mean_token_accuracy": 0.8597909808158875, "num_tokens": 938023104.0, "step": 5887 }, { "epoch": 2.994913530010173, "grad_norm": 0.8910934329032898, "learning_rate": 1e-05, "loss": 0.4226, "mean_token_accuracy": 0.8629887104034424, "num_tokens": 938184792.0, "step": 5888 }, { "epoch": 2.995422177009156, "grad_norm": 1.5907645225524902, "learning_rate": 1e-05, "loss": 0.428, "mean_token_accuracy": 0.8609790205955505, "num_tokens": 938339130.0, "step": 5889 }, { "epoch": 2.9959308240081386, "grad_norm": 0.8379800915718079, "learning_rate": 1e-05, "loss": 0.441, "mean_token_accuracy": 0.8579897284507751, "num_tokens": 938504623.0, "step": 5890 }, { "epoch": 2.996439471007121, "grad_norm": 0.9151134490966797, "learning_rate": 1e-05, "loss": 0.4417, "mean_token_accuracy": 0.8580859899520874, "num_tokens": 938672321.0, "step": 5891 }, { "epoch": 2.9969481180061037, "grad_norm": 0.9333588480949402, "learning_rate": 1e-05, "loss": 0.43, "mean_token_accuracy": 0.8601522445678711, "num_tokens": 938827631.0, "step": 5892 }, { "epoch": 2.9974567650050865, "grad_norm": 0.8548453450202942, "learning_rate": 1e-05, "loss": 0.4148, "mean_token_accuracy": 0.86507648229599, "num_tokens": 938986452.0, "step": 5893 }, { "epoch": 2.9979654120040693, "grad_norm": 0.8833407759666443, "learning_rate": 1e-05, "loss": 0.4301, "mean_token_accuracy": 0.860978364944458, "num_tokens": 939140656.0, "step": 5894 }, { "epoch": 2.998474059003052, "grad_norm": 0.8966116309165955, "learning_rate": 1e-05, "loss": 0.4235, "mean_token_accuracy": 0.8621483445167542, "num_tokens": 939281857.0, "step": 5895 }, { "epoch": 2.9989827060020344, "grad_norm": 0.8217146396636963, "learning_rate": 1e-05, "loss": 0.4287, "mean_token_accuracy": 0.8596178293228149, "num_tokens": 939437597.0, "step": 5896 }, { "epoch": 2.999491353001017, "grad_norm": 0.8681790828704834, "learning_rate": 1e-05, "loss": 0.423, "mean_token_accuracy": 0.8632126450538635, "num_tokens": 939611691.0, "step": 5897 }, { "epoch": 3.0, "grad_norm": 0.8482578992843628, "learning_rate": 1e-05, "loss": 0.4063, "mean_token_accuracy": 0.8673560619354248, "num_tokens": 939774271.0, "step": 5898 }, { "epoch": 3.0, "step": 5898, "total_flos": 6.289145677364593e+18, "train_loss": 0.497733977299579, "train_runtime": 8529.1423, "train_samples_per_second": 44.238, "train_steps_per_second": 0.692 } ], "logging_steps": 1, "max_steps": 5898, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2949, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.289145677364593e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }