{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 215, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004651162790697674, "grad_norm": 19.56722068786621, "learning_rate": 0.0, "loss": 0.842, "mean_token_accuracy": 0.8234314918518066, "step": 1 }, { "epoch": 0.009302325581395349, "grad_norm": 21.499818801879883, "learning_rate": 4.5454545454545457e-07, "loss": 0.8773, "mean_token_accuracy": 0.8220880627632141, "step": 2 }, { "epoch": 0.013953488372093023, "grad_norm": 19.097368240356445, "learning_rate": 9.090909090909091e-07, "loss": 0.8511, "mean_token_accuracy": 0.8329019546508789, "step": 3 }, { "epoch": 0.018604651162790697, "grad_norm": 18.014055252075195, "learning_rate": 1.3636363636363636e-06, "loss": 0.8172, "mean_token_accuracy": 0.8300436735153198, "step": 4 }, { "epoch": 0.023255813953488372, "grad_norm": 21.1191349029541, "learning_rate": 1.8181818181818183e-06, "loss": 0.932, "mean_token_accuracy": 0.8103048801422119, "step": 5 }, { "epoch": 0.027906976744186046, "grad_norm": 18.354591369628906, "learning_rate": 2.2727272727272728e-06, "loss": 0.782, "mean_token_accuracy": 0.8292044401168823, "step": 6 }, { "epoch": 0.03255813953488372, "grad_norm": 16.53504753112793, "learning_rate": 2.7272727272727272e-06, "loss": 0.8115, "mean_token_accuracy": 0.8264797925949097, "step": 7 }, { "epoch": 0.037209302325581395, "grad_norm": 10.243330955505371, "learning_rate": 3.181818181818182e-06, "loss": 0.6383, "mean_token_accuracy": 0.8500951528549194, "step": 8 }, { "epoch": 0.04186046511627907, "grad_norm": 8.570775985717773, "learning_rate": 3.6363636363636366e-06, "loss": 0.5647, "mean_token_accuracy": 0.8617518544197083, "step": 9 }, { "epoch": 0.046511627906976744, "grad_norm": 9.014289855957031, "learning_rate": 4.0909090909090915e-06, "loss": 0.4928, "mean_token_accuracy": 0.8720639944076538, "step": 10 }, { "epoch": 0.05116279069767442, "grad_norm": 9.353399276733398, "learning_rate": 4.5454545454545455e-06, "loss": 0.4215, "mean_token_accuracy": 0.8906779885292053, "step": 11 }, { "epoch": 0.05581395348837209, "grad_norm": 7.49207067489624, "learning_rate": 5e-06, "loss": 0.4415, "mean_token_accuracy": 0.8809018135070801, "step": 12 }, { "epoch": 0.06046511627906977, "grad_norm": 4.262305736541748, "learning_rate": 5.4545454545454545e-06, "loss": 0.42, "mean_token_accuracy": 0.8770877122879028, "step": 13 }, { "epoch": 0.06511627906976744, "grad_norm": 5.891422748565674, "learning_rate": 5.90909090909091e-06, "loss": 0.3657, "mean_token_accuracy": 0.8887879252433777, "step": 14 }, { "epoch": 0.06976744186046512, "grad_norm": 3.392139196395874, "learning_rate": 6.363636363636364e-06, "loss": 0.3438, "mean_token_accuracy": 0.8969727158546448, "step": 15 }, { "epoch": 0.07441860465116279, "grad_norm": 3.7728092670440674, "learning_rate": 6.818181818181818e-06, "loss": 0.381, "mean_token_accuracy": 0.8863928318023682, "step": 16 }, { "epoch": 0.07906976744186046, "grad_norm": 3.083871603012085, "learning_rate": 7.272727272727273e-06, "loss": 0.333, "mean_token_accuracy": 0.902701735496521, "step": 17 }, { "epoch": 0.08372093023255814, "grad_norm": 3.078378200531006, "learning_rate": 7.727272727272727e-06, "loss": 0.3776, "mean_token_accuracy": 0.8830382823944092, "step": 18 }, { "epoch": 0.08837209302325581, "grad_norm": 3.3377773761749268, "learning_rate": 8.181818181818183e-06, "loss": 0.384, "mean_token_accuracy": 0.8844019174575806, "step": 19 }, { "epoch": 0.09302325581395349, "grad_norm": 2.8573648929595947, "learning_rate": 8.636363636363637e-06, "loss": 0.3525, "mean_token_accuracy": 0.8916018605232239, "step": 20 }, { "epoch": 0.09767441860465116, "grad_norm": 3.159400463104248, "learning_rate": 9.090909090909091e-06, "loss": 0.3996, "mean_token_accuracy": 0.8746355772018433, "step": 21 }, { "epoch": 0.10232558139534884, "grad_norm": 3.09920334815979, "learning_rate": 9.545454545454547e-06, "loss": 0.3518, "mean_token_accuracy": 0.8835098147392273, "step": 22 }, { "epoch": 0.10697674418604651, "grad_norm": 2.9805715084075928, "learning_rate": 1e-05, "loss": 0.3449, "mean_token_accuracy": 0.8943809270858765, "step": 23 }, { "epoch": 0.11162790697674418, "grad_norm": 3.226511001586914, "learning_rate": 9.999403846557509e-06, "loss": 0.3707, "mean_token_accuracy": 0.8781068921089172, "step": 24 }, { "epoch": 0.11627906976744186, "grad_norm": 3.0211565494537354, "learning_rate": 9.99761554418511e-06, "loss": 0.3333, "mean_token_accuracy": 0.8967213034629822, "step": 25 }, { "epoch": 0.12093023255813953, "grad_norm": 3.0328941345214844, "learning_rate": 9.99463556670619e-06, "loss": 0.3337, "mean_token_accuracy": 0.9013436436653137, "step": 26 }, { "epoch": 0.12558139534883722, "grad_norm": 3.127363681793213, "learning_rate": 9.990464703686895e-06, "loss": 0.3643, "mean_token_accuracy": 0.8956908583641052, "step": 27 }, { "epoch": 0.13023255813953488, "grad_norm": 3.1299784183502197, "learning_rate": 9.985104060226937e-06, "loss": 0.3536, "mean_token_accuracy": 0.8928705453872681, "step": 28 }, { "epoch": 0.13488372093023257, "grad_norm": 3.025752305984497, "learning_rate": 9.978555056666784e-06, "loss": 0.3768, "mean_token_accuracy": 0.8789021372795105, "step": 29 }, { "epoch": 0.13953488372093023, "grad_norm": 3.072669744491577, "learning_rate": 9.97081942821133e-06, "loss": 0.3708, "mean_token_accuracy": 0.8879210948944092, "step": 30 }, { "epoch": 0.14418604651162792, "grad_norm": 3.2670693397521973, "learning_rate": 9.961899224470146e-06, "loss": 0.3947, "mean_token_accuracy": 0.8810369372367859, "step": 31 }, { "epoch": 0.14883720930232558, "grad_norm": 3.0507073402404785, "learning_rate": 9.95179680891442e-06, "loss": 0.3177, "mean_token_accuracy": 0.9065743684768677, "step": 32 }, { "epoch": 0.15348837209302327, "grad_norm": 3.6491124629974365, "learning_rate": 9.940514858250736e-06, "loss": 0.3565, "mean_token_accuracy": 0.8926541209220886, "step": 33 }, { "epoch": 0.15813953488372093, "grad_norm": 2.94435977935791, "learning_rate": 9.928056361711854e-06, "loss": 0.3032, "mean_token_accuracy": 0.9051738977432251, "step": 34 }, { "epoch": 0.16279069767441862, "grad_norm": 3.0378453731536865, "learning_rate": 9.914424620264714e-06, "loss": 0.3582, "mean_token_accuracy": 0.8896961808204651, "step": 35 }, { "epoch": 0.16744186046511628, "grad_norm": 3.0237748622894287, "learning_rate": 9.899623245735798e-06, "loss": 0.3287, "mean_token_accuracy": 0.8992047309875488, "step": 36 }, { "epoch": 0.17209302325581396, "grad_norm": 3.0034892559051514, "learning_rate": 9.883656159854166e-06, "loss": 0.3175, "mean_token_accuracy": 0.9017576575279236, "step": 37 }, { "epoch": 0.17674418604651163, "grad_norm": 3.0133862495422363, "learning_rate": 9.866527593212355e-06, "loss": 0.3321, "mean_token_accuracy": 0.8984139561653137, "step": 38 }, { "epoch": 0.1813953488372093, "grad_norm": 2.9698572158813477, "learning_rate": 9.848242084145462e-06, "loss": 0.3203, "mean_token_accuracy": 0.9006622433662415, "step": 39 }, { "epoch": 0.18604651162790697, "grad_norm": 2.8872194290161133, "learning_rate": 9.82880447752868e-06, "loss": 0.3377, "mean_token_accuracy": 0.8935135006904602, "step": 40 }, { "epoch": 0.19069767441860466, "grad_norm": 3.0155012607574463, "learning_rate": 9.808219923493606e-06, "loss": 0.3318, "mean_token_accuracy": 0.8975328803062439, "step": 41 }, { "epoch": 0.19534883720930232, "grad_norm": 3.193694591522217, "learning_rate": 9.786493876063685e-06, "loss": 0.3284, "mean_token_accuracy": 0.8954758048057556, "step": 42 }, { "epoch": 0.2, "grad_norm": 3.0788753032684326, "learning_rate": 9.763632091709125e-06, "loss": 0.3093, "mean_token_accuracy": 0.8967692852020264, "step": 43 }, { "epoch": 0.20465116279069767, "grad_norm": 2.8875858783721924, "learning_rate": 9.739640627821678e-06, "loss": 0.3373, "mean_token_accuracy": 0.8984333872795105, "step": 44 }, { "epoch": 0.20930232558139536, "grad_norm": 3.2467026710510254, "learning_rate": 9.714525841109697e-06, "loss": 0.2945, "mean_token_accuracy": 0.9045419692993164, "step": 45 }, { "epoch": 0.21395348837209302, "grad_norm": 3.0920603275299072, "learning_rate": 9.68829438591387e-06, "loss": 0.3045, "mean_token_accuracy": 0.9002149701118469, "step": 46 }, { "epoch": 0.2186046511627907, "grad_norm": 2.996901512145996, "learning_rate": 9.660953212444116e-06, "loss": 0.3139, "mean_token_accuracy": 0.8980564475059509, "step": 47 }, { "epoch": 0.22325581395348837, "grad_norm": 2.84829044342041, "learning_rate": 9.632509564938073e-06, "loss": 0.3218, "mean_token_accuracy": 0.8964840173721313, "step": 48 }, { "epoch": 0.22790697674418606, "grad_norm": 3.001344680786133, "learning_rate": 9.60297097974169e-06, "loss": 0.3145, "mean_token_accuracy": 0.8978151082992554, "step": 49 }, { "epoch": 0.23255813953488372, "grad_norm": 2.866776943206787, "learning_rate": 9.572345283312407e-06, "loss": 0.3155, "mean_token_accuracy": 0.8966426253318787, "step": 50 }, { "epoch": 0.2372093023255814, "grad_norm": 3.1876516342163086, "learning_rate": 9.540640590145496e-06, "loss": 0.3358, "mean_token_accuracy": 0.8820154070854187, "step": 51 }, { "epoch": 0.24186046511627907, "grad_norm": 2.8794522285461426, "learning_rate": 9.507865300624057e-06, "loss": 0.3378, "mean_token_accuracy": 0.8850215673446655, "step": 52 }, { "epoch": 0.24651162790697675, "grad_norm": 2.9620361328125, "learning_rate": 9.474028098793277e-06, "loss": 0.3129, "mean_token_accuracy": 0.8966131806373596, "step": 53 }, { "epoch": 0.25116279069767444, "grad_norm": 2.8167107105255127, "learning_rate": 9.439137950059539e-06, "loss": 0.3294, "mean_token_accuracy": 0.8901098966598511, "step": 54 }, { "epoch": 0.2558139534883721, "grad_norm": 2.7492761611938477, "learning_rate": 9.403204098814965e-06, "loss": 0.3288, "mean_token_accuracy": 0.8896990418434143, "step": 55 }, { "epoch": 0.26046511627906976, "grad_norm": 2.7716281414031982, "learning_rate": 9.366236065988053e-06, "loss": 0.321, "mean_token_accuracy": 0.896246075630188, "step": 56 }, { "epoch": 0.2651162790697674, "grad_norm": 2.415069341659546, "learning_rate": 9.32824364652104e-06, "loss": 0.2824, "mean_token_accuracy": 0.9081918597221375, "step": 57 }, { "epoch": 0.26976744186046514, "grad_norm": 2.4787442684173584, "learning_rate": 9.289236906774663e-06, "loss": 0.3112, "mean_token_accuracy": 0.8966255187988281, "step": 58 }, { "epoch": 0.2744186046511628, "grad_norm": 2.7912585735321045, "learning_rate": 9.249226181861e-06, "loss": 0.2924, "mean_token_accuracy": 0.9057479500770569, "step": 59 }, { "epoch": 0.27906976744186046, "grad_norm": 2.6348111629486084, "learning_rate": 9.208222072905113e-06, "loss": 0.3073, "mean_token_accuracy": 0.8927038908004761, "step": 60 }, { "epoch": 0.2837209302325581, "grad_norm": 2.7154784202575684, "learning_rate": 9.166235444236209e-06, "loss": 0.3418, "mean_token_accuracy": 0.883296012878418, "step": 61 }, { "epoch": 0.28837209302325584, "grad_norm": 2.7209088802337646, "learning_rate": 9.123277420509053e-06, "loss": 0.3355, "mean_token_accuracy": 0.8873890042304993, "step": 62 }, { "epoch": 0.2930232558139535, "grad_norm": 2.58394193649292, "learning_rate": 9.079359383756411e-06, "loss": 0.3137, "mean_token_accuracy": 0.8987603187561035, "step": 63 }, { "epoch": 0.29767441860465116, "grad_norm": 2.5450754165649414, "learning_rate": 9.034492970373305e-06, "loss": 0.3055, "mean_token_accuracy": 0.8960996270179749, "step": 64 }, { "epoch": 0.3023255813953488, "grad_norm": 2.6043131351470947, "learning_rate": 8.988690068033864e-06, "loss": 0.3329, "mean_token_accuracy": 0.8888005018234253, "step": 65 }, { "epoch": 0.30697674418604654, "grad_norm": 2.310673475265503, "learning_rate": 8.941962812541604e-06, "loss": 0.3013, "mean_token_accuracy": 0.8942438960075378, "step": 66 }, { "epoch": 0.3116279069767442, "grad_norm": 2.518235206604004, "learning_rate": 8.894323584613951e-06, "loss": 0.3199, "mean_token_accuracy": 0.8883762359619141, "step": 67 }, { "epoch": 0.31627906976744186, "grad_norm": 2.4000134468078613, "learning_rate": 8.845785006601898e-06, "loss": 0.2818, "mean_token_accuracy": 0.9015190601348877, "step": 68 }, { "epoch": 0.3209302325581395, "grad_norm": 2.541795492172241, "learning_rate": 8.796359939145614e-06, "loss": 0.3087, "mean_token_accuracy": 0.8922914266586304, "step": 69 }, { "epoch": 0.32558139534883723, "grad_norm": 2.65686297416687, "learning_rate": 8.74606147776692e-06, "loss": 0.3094, "mean_token_accuracy": 0.8930131196975708, "step": 70 }, { "epoch": 0.3302325581395349, "grad_norm": 2.518319606781006, "learning_rate": 8.694902949399555e-06, "loss": 0.3091, "mean_token_accuracy": 0.8983761072158813, "step": 71 }, { "epoch": 0.33488372093023255, "grad_norm": 2.572589159011841, "learning_rate": 8.642897908858096e-06, "loss": 0.3108, "mean_token_accuracy": 0.8960880041122437, "step": 72 }, { "epoch": 0.3395348837209302, "grad_norm": 2.646913766860962, "learning_rate": 8.590060135246516e-06, "loss": 0.3362, "mean_token_accuracy": 0.8827519416809082, "step": 73 }, { "epoch": 0.34418604651162793, "grad_norm": 2.309241771697998, "learning_rate": 8.53640362830732e-06, "loss": 0.2904, "mean_token_accuracy": 0.8993450403213501, "step": 74 }, { "epoch": 0.3488372093023256, "grad_norm": 2.6384401321411133, "learning_rate": 8.481942604712209e-06, "loss": 0.3195, "mean_token_accuracy": 0.8918577432632446, "step": 75 }, { "epoch": 0.35348837209302325, "grad_norm": 2.4883103370666504, "learning_rate": 8.426691494295269e-06, "loss": 0.2889, "mean_token_accuracy": 0.9023594856262207, "step": 76 }, { "epoch": 0.3581395348837209, "grad_norm": 2.464266538619995, "learning_rate": 8.370664936229688e-06, "loss": 0.3144, "mean_token_accuracy": 0.8927644491195679, "step": 77 }, { "epoch": 0.3627906976744186, "grad_norm": 2.4750561714172363, "learning_rate": 8.313877775149009e-06, "loss": 0.2742, "mean_token_accuracy": 0.9066977500915527, "step": 78 }, { "epoch": 0.3674418604651163, "grad_norm": 2.781059980392456, "learning_rate": 8.256345057213925e-06, "loss": 0.3152, "mean_token_accuracy": 0.8888514041900635, "step": 79 }, { "epoch": 0.37209302325581395, "grad_norm": 2.3490397930145264, "learning_rate": 8.198082026125707e-06, "loss": 0.3281, "mean_token_accuracy": 0.8870996236801147, "step": 80 }, { "epoch": 0.3767441860465116, "grad_norm": 2.4752197265625, "learning_rate": 8.139104119087265e-06, "loss": 0.2822, "mean_token_accuracy": 0.9008604288101196, "step": 81 }, { "epoch": 0.3813953488372093, "grad_norm": 2.4564919471740723, "learning_rate": 8.07942696271296e-06, "loss": 0.2819, "mean_token_accuracy": 0.9009503126144409, "step": 82 }, { "epoch": 0.386046511627907, "grad_norm": 2.3468544483184814, "learning_rate": 8.019066368888222e-06, "loss": 0.2881, "mean_token_accuracy": 0.9013717770576477, "step": 83 }, { "epoch": 0.39069767441860465, "grad_norm": 2.4819157123565674, "learning_rate": 7.958038330580067e-06, "loss": 0.3321, "mean_token_accuracy": 0.8821220993995667, "step": 84 }, { "epoch": 0.3953488372093023, "grad_norm": 2.3831310272216797, "learning_rate": 7.89635901759967e-06, "loss": 0.2994, "mean_token_accuracy": 0.8928157687187195, "step": 85 }, { "epoch": 0.4, "grad_norm": 2.7160511016845703, "learning_rate": 7.834044772318033e-06, "loss": 0.3278, "mean_token_accuracy": 0.8914129137992859, "step": 86 }, { "epoch": 0.4046511627906977, "grad_norm": 2.4505536556243896, "learning_rate": 7.77111210533597e-06, "loss": 0.3004, "mean_token_accuracy": 0.8894042372703552, "step": 87 }, { "epoch": 0.40930232558139534, "grad_norm": 3.191389322280884, "learning_rate": 7.707577691109519e-06, "loss": 0.3545, "mean_token_accuracy": 0.8798402547836304, "step": 88 }, { "epoch": 0.413953488372093, "grad_norm": 2.280726671218872, "learning_rate": 7.6434583635319e-06, "loss": 0.3027, "mean_token_accuracy": 0.900479793548584, "step": 89 }, { "epoch": 0.4186046511627907, "grad_norm": 2.3006751537323, "learning_rate": 7.578771111473276e-06, "loss": 0.2587, "mean_token_accuracy": 0.9121513962745667, "step": 90 }, { "epoch": 0.4232558139534884, "grad_norm": 2.3514935970306396, "learning_rate": 7.513533074279427e-06, "loss": 0.2674, "mean_token_accuracy": 0.908033013343811, "step": 91 }, { "epoch": 0.42790697674418604, "grad_norm": 2.483891487121582, "learning_rate": 7.4477615372305545e-06, "loss": 0.3212, "mean_token_accuracy": 0.8946645855903625, "step": 92 }, { "epoch": 0.4325581395348837, "grad_norm": 2.3761796951293945, "learning_rate": 7.3814739269614265e-06, "loss": 0.2946, "mean_token_accuracy": 0.900073230266571, "step": 93 }, { "epoch": 0.4372093023255814, "grad_norm": 2.4102704524993896, "learning_rate": 7.314687806844067e-06, "loss": 0.3073, "mean_token_accuracy": 0.8981302380561829, "step": 94 }, { "epoch": 0.4418604651162791, "grad_norm": 2.468144416809082, "learning_rate": 7.247420872334221e-06, "loss": 0.302, "mean_token_accuracy": 0.8967844843864441, "step": 95 }, { "epoch": 0.44651162790697674, "grad_norm": 2.222743034362793, "learning_rate": 7.179690946282808e-06, "loss": 0.2823, "mean_token_accuracy": 0.9029505848884583, "step": 96 }, { "epoch": 0.4511627906976744, "grad_norm": 2.301776647567749, "learning_rate": 7.111515974213639e-06, "loss": 0.2494, "mean_token_accuracy": 0.9104065895080566, "step": 97 }, { "epoch": 0.4558139534883721, "grad_norm": 2.373887777328491, "learning_rate": 7.042914019568621e-06, "loss": 0.3023, "mean_token_accuracy": 0.8933764100074768, "step": 98 }, { "epoch": 0.4604651162790698, "grad_norm": 2.228679656982422, "learning_rate": 6.973903258921719e-06, "loss": 0.2563, "mean_token_accuracy": 0.910088062286377, "step": 99 }, { "epoch": 0.46511627906976744, "grad_norm": 2.3759074211120605, "learning_rate": 6.904501977162949e-06, "loss": 0.3274, "mean_token_accuracy": 0.8846719861030579, "step": 100 }, { "epoch": 0.4697674418604651, "grad_norm": 2.367203712463379, "learning_rate": 6.834728562653659e-06, "loss": 0.3091, "mean_token_accuracy": 0.8904908299446106, "step": 101 }, { "epoch": 0.4744186046511628, "grad_norm": 2.4363155364990234, "learning_rate": 6.764601502354403e-06, "loss": 0.3064, "mean_token_accuracy": 0.8976874351501465, "step": 102 }, { "epoch": 0.4790697674418605, "grad_norm": 2.2353415489196777, "learning_rate": 6.6941393769266995e-06, "loss": 0.2975, "mean_token_accuracy": 0.8998541831970215, "step": 103 }, { "epoch": 0.48372093023255813, "grad_norm": 2.108872890472412, "learning_rate": 6.6233608558099405e-06, "loss": 0.2778, "mean_token_accuracy": 0.9046753644943237, "step": 104 }, { "epoch": 0.4883720930232558, "grad_norm": 2.300018310546875, "learning_rate": 6.552284692274803e-06, "loss": 0.3093, "mean_token_accuracy": 0.8958457112312317, "step": 105 }, { "epoch": 0.4930232558139535, "grad_norm": 2.3953607082366943, "learning_rate": 6.48092971845443e-06, "loss": 0.2805, "mean_token_accuracy": 0.9026817083358765, "step": 106 }, { "epoch": 0.49767441860465117, "grad_norm": 2.397383451461792, "learning_rate": 6.409314840354724e-06, "loss": 0.3381, "mean_token_accuracy": 0.8877024054527283, "step": 107 }, { "epoch": 0.5023255813953489, "grad_norm": 2.524808883666992, "learning_rate": 6.337459032845068e-06, "loss": 0.3136, "mean_token_accuracy": 0.8877217173576355, "step": 108 }, { "epoch": 0.5069767441860465, "grad_norm": 2.4271292686462402, "learning_rate": 6.2653813346308e-06, "loss": 0.2741, "mean_token_accuracy": 0.9050701260566711, "step": 109 }, { "epoch": 0.5116279069767442, "grad_norm": 2.2498927116394043, "learning_rate": 6.193100843208772e-06, "loss": 0.2809, "mean_token_accuracy": 0.8968031406402588, "step": 110 }, { "epoch": 0.5162790697674419, "grad_norm": 2.330934524536133, "learning_rate": 6.120636709807334e-06, "loss": 0.31, "mean_token_accuracy": 0.8886767029762268, "step": 111 }, { "epoch": 0.5209302325581395, "grad_norm": 2.399275302886963, "learning_rate": 6.048008134312078e-06, "loss": 0.2888, "mean_token_accuracy": 0.9004721641540527, "step": 112 }, { "epoch": 0.5255813953488372, "grad_norm": 2.39941668510437, "learning_rate": 5.975234360178698e-06, "loss": 0.3259, "mean_token_accuracy": 0.8926165103912354, "step": 113 }, { "epoch": 0.5302325581395348, "grad_norm": 2.22544002532959, "learning_rate": 5.902334669334287e-06, "loss": 0.2992, "mean_token_accuracy": 0.90220046043396, "step": 114 }, { "epoch": 0.5348837209302325, "grad_norm": 2.4933297634124756, "learning_rate": 5.829328377068476e-06, "loss": 0.2943, "mean_token_accuracy": 0.8951420783996582, "step": 115 }, { "epoch": 0.5395348837209303, "grad_norm": 2.575714349746704, "learning_rate": 5.756234826915686e-06, "loss": 0.3028, "mean_token_accuracy": 0.8980122208595276, "step": 116 }, { "epoch": 0.5441860465116279, "grad_norm": 2.6264634132385254, "learning_rate": 5.683073385529938e-06, "loss": 0.3157, "mean_token_accuracy": 0.8895466923713684, "step": 117 }, { "epoch": 0.5488372093023256, "grad_norm": 2.182809591293335, "learning_rate": 5.60986343755352e-06, "loss": 0.2617, "mean_token_accuracy": 0.907944917678833, "step": 118 }, { "epoch": 0.5534883720930233, "grad_norm": 2.4722118377685547, "learning_rate": 5.536624380480878e-06, "loss": 0.2513, "mean_token_accuracy": 0.9123600125312805, "step": 119 }, { "epoch": 0.5581395348837209, "grad_norm": 2.3010671138763428, "learning_rate": 5.4633756195191235e-06, "loss": 0.2599, "mean_token_accuracy": 0.9083516597747803, "step": 120 }, { "epoch": 0.5627906976744186, "grad_norm": 2.06095290184021, "learning_rate": 5.390136562446482e-06, "loss": 0.2275, "mean_token_accuracy": 0.9167689085006714, "step": 121 }, { "epoch": 0.5674418604651162, "grad_norm": 2.4146745204925537, "learning_rate": 5.316926614470063e-06, "loss": 0.3051, "mean_token_accuracy": 0.8924481272697449, "step": 122 }, { "epoch": 0.5720930232558139, "grad_norm": 2.705653190612793, "learning_rate": 5.2437651730843165e-06, "loss": 0.3418, "mean_token_accuracy": 0.8777977228164673, "step": 123 }, { "epoch": 0.5767441860465117, "grad_norm": 2.5144317150115967, "learning_rate": 5.170671622931527e-06, "loss": 0.3105, "mean_token_accuracy": 0.8910664916038513, "step": 124 }, { "epoch": 0.5813953488372093, "grad_norm": 2.083824634552002, "learning_rate": 5.097665330665714e-06, "loss": 0.2399, "mean_token_accuracy": 0.9148097634315491, "step": 125 }, { "epoch": 0.586046511627907, "grad_norm": 2.6404523849487305, "learning_rate": 5.024765639821305e-06, "loss": 0.3211, "mean_token_accuracy": 0.8938697576522827, "step": 126 }, { "epoch": 0.5906976744186047, "grad_norm": 2.0765397548675537, "learning_rate": 4.951991865687923e-06, "loss": 0.2523, "mean_token_accuracy": 0.9152514934539795, "step": 127 }, { "epoch": 0.5953488372093023, "grad_norm": 2.412893772125244, "learning_rate": 4.879363290192667e-06, "loss": 0.2936, "mean_token_accuracy": 0.9007983803749084, "step": 128 }, { "epoch": 0.6, "grad_norm": 2.7524521350860596, "learning_rate": 4.806899156791231e-06, "loss": 0.3579, "mean_token_accuracy": 0.8833458423614502, "step": 129 }, { "epoch": 0.6046511627906976, "grad_norm": 2.421870470046997, "learning_rate": 4.734618665369202e-06, "loss": 0.2835, "mean_token_accuracy": 0.8964264988899231, "step": 130 }, { "epoch": 0.6093023255813953, "grad_norm": 2.338674306869507, "learning_rate": 4.662540967154934e-06, "loss": 0.2663, "mean_token_accuracy": 0.9057571887969971, "step": 131 }, { "epoch": 0.6139534883720931, "grad_norm": 2.144547700881958, "learning_rate": 4.5906851596452765e-06, "loss": 0.2723, "mean_token_accuracy": 0.9027295112609863, "step": 132 }, { "epoch": 0.6186046511627907, "grad_norm": 2.2873592376708984, "learning_rate": 4.519070281545571e-06, "loss": 0.305, "mean_token_accuracy": 0.895431637763977, "step": 133 }, { "epoch": 0.6232558139534884, "grad_norm": 2.3061864376068115, "learning_rate": 4.447715307725197e-06, "loss": 0.3126, "mean_token_accuracy": 0.8983712196350098, "step": 134 }, { "epoch": 0.627906976744186, "grad_norm": 2.276719093322754, "learning_rate": 4.376639144190061e-06, "loss": 0.2996, "mean_token_accuracy": 0.8973622918128967, "step": 135 }, { "epoch": 0.6325581395348837, "grad_norm": 2.5244691371917725, "learning_rate": 4.305860623073304e-06, "loss": 0.2969, "mean_token_accuracy": 0.8997734189033508, "step": 136 }, { "epoch": 0.6372093023255814, "grad_norm": 2.136345386505127, "learning_rate": 4.2353984976456e-06, "loss": 0.2782, "mean_token_accuracy": 0.9054514765739441, "step": 137 }, { "epoch": 0.641860465116279, "grad_norm": 2.2209346294403076, "learning_rate": 4.1652714373463435e-06, "loss": 0.2881, "mean_token_accuracy": 0.8994898200035095, "step": 138 }, { "epoch": 0.6465116279069767, "grad_norm": 2.2153260707855225, "learning_rate": 4.095498022837051e-06, "loss": 0.2713, "mean_token_accuracy": 0.9085735082626343, "step": 139 }, { "epoch": 0.6511627906976745, "grad_norm": 2.323110818862915, "learning_rate": 4.026096741078281e-06, "loss": 0.3132, "mean_token_accuracy": 0.8948206901550293, "step": 140 }, { "epoch": 0.6558139534883721, "grad_norm": 2.3423662185668945, "learning_rate": 3.957085980431382e-06, "loss": 0.3104, "mean_token_accuracy": 0.8916748762130737, "step": 141 }, { "epoch": 0.6604651162790698, "grad_norm": 2.168367624282837, "learning_rate": 3.888484025786364e-06, "loss": 0.2807, "mean_token_accuracy": 0.9040858745574951, "step": 142 }, { "epoch": 0.6651162790697674, "grad_norm": 2.1336045265197754, "learning_rate": 3.820309053717195e-06, "loss": 0.2629, "mean_token_accuracy": 0.9082328081130981, "step": 143 }, { "epoch": 0.6697674418604651, "grad_norm": 2.4182586669921875, "learning_rate": 3.75257912766578e-06, "loss": 0.3521, "mean_token_accuracy": 0.8834901452064514, "step": 144 }, { "epoch": 0.6744186046511628, "grad_norm": 2.415916681289673, "learning_rate": 3.6853121931559334e-06, "loss": 0.3177, "mean_token_accuracy": 0.8882511258125305, "step": 145 }, { "epoch": 0.6790697674418604, "grad_norm": 2.250131368637085, "learning_rate": 3.618526073038574e-06, "loss": 0.3391, "mean_token_accuracy": 0.8773726224899292, "step": 146 }, { "epoch": 0.6837209302325581, "grad_norm": 2.3493268489837646, "learning_rate": 3.552238462769446e-06, "loss": 0.2808, "mean_token_accuracy": 0.8990680575370789, "step": 147 }, { "epoch": 0.6883720930232559, "grad_norm": 2.5086095333099365, "learning_rate": 3.4864669257205745e-06, "loss": 0.2714, "mean_token_accuracy": 0.9056432843208313, "step": 148 }, { "epoch": 0.6930232558139535, "grad_norm": 2.324185848236084, "learning_rate": 3.4212288885267246e-06, "loss": 0.3179, "mean_token_accuracy": 0.8921183347702026, "step": 149 }, { "epoch": 0.6976744186046512, "grad_norm": 2.469515323638916, "learning_rate": 3.3565416364681016e-06, "loss": 0.2866, "mean_token_accuracy": 0.8951537013053894, "step": 150 }, { "epoch": 0.7023255813953488, "grad_norm": 2.097574234008789, "learning_rate": 3.2924223088904816e-06, "loss": 0.2751, "mean_token_accuracy": 0.903618574142456, "step": 151 }, { "epoch": 0.7069767441860465, "grad_norm": 2.559396743774414, "learning_rate": 3.228887894664029e-06, "loss": 0.3377, "mean_token_accuracy": 0.8870507478713989, "step": 152 }, { "epoch": 0.7116279069767442, "grad_norm": 2.4009549617767334, "learning_rate": 3.1659552276819693e-06, "loss": 0.3098, "mean_token_accuracy": 0.8919164538383484, "step": 153 }, { "epoch": 0.7162790697674418, "grad_norm": 2.2265982627868652, "learning_rate": 3.1036409824003324e-06, "loss": 0.2853, "mean_token_accuracy": 0.9013279676437378, "step": 154 }, { "epoch": 0.7209302325581395, "grad_norm": 2.1032068729400635, "learning_rate": 3.0419616694199327e-06, "loss": 0.2629, "mean_token_accuracy": 0.9076212644577026, "step": 155 }, { "epoch": 0.7255813953488373, "grad_norm": 2.1827595233917236, "learning_rate": 2.98093363111178e-06, "loss": 0.2401, "mean_token_accuracy": 0.915657639503479, "step": 156 }, { "epoch": 0.7302325581395349, "grad_norm": 2.498218297958374, "learning_rate": 2.92057303728704e-06, "loss": 0.2965, "mean_token_accuracy": 0.8975441455841064, "step": 157 }, { "epoch": 0.7348837209302326, "grad_norm": 2.230644941329956, "learning_rate": 2.860895880912735e-06, "loss": 0.2833, "mean_token_accuracy": 0.9061956405639648, "step": 158 }, { "epoch": 0.7395348837209302, "grad_norm": 2.2900495529174805, "learning_rate": 2.801917973874294e-06, "loss": 0.3014, "mean_token_accuracy": 0.8936989903450012, "step": 159 }, { "epoch": 0.7441860465116279, "grad_norm": 2.3364789485931396, "learning_rate": 2.7436549427860766e-06, "loss": 0.3207, "mean_token_accuracy": 0.8891250491142273, "step": 160 }, { "epoch": 0.7488372093023256, "grad_norm": 2.0765552520751953, "learning_rate": 2.6861222248509926e-06, "loss": 0.2473, "mean_token_accuracy": 0.9146341681480408, "step": 161 }, { "epoch": 0.7534883720930232, "grad_norm": 2.1344990730285645, "learning_rate": 2.6293350637703123e-06, "loss": 0.2804, "mean_token_accuracy": 0.9104732871055603, "step": 162 }, { "epoch": 0.7581395348837209, "grad_norm": 2.4207024574279785, "learning_rate": 2.5733085057047325e-06, "loss": 0.2856, "mean_token_accuracy": 0.9033816456794739, "step": 163 }, { "epoch": 0.7627906976744186, "grad_norm": 2.183412790298462, "learning_rate": 2.518057395287792e-06, "loss": 0.2971, "mean_token_accuracy": 0.8962974548339844, "step": 164 }, { "epoch": 0.7674418604651163, "grad_norm": 2.357823371887207, "learning_rate": 2.463596371692681e-06, "loss": 0.2952, "mean_token_accuracy": 0.8953551650047302, "step": 165 }, { "epoch": 0.772093023255814, "grad_norm": 2.134320020675659, "learning_rate": 2.409939864753487e-06, "loss": 0.2963, "mean_token_accuracy": 0.8957446813583374, "step": 166 }, { "epoch": 0.7767441860465116, "grad_norm": 2.194366693496704, "learning_rate": 2.3571020911419067e-06, "loss": 0.2818, "mean_token_accuracy": 0.9092063307762146, "step": 167 }, { "epoch": 0.7813953488372093, "grad_norm": 2.407562494277954, "learning_rate": 2.3050970506004463e-06, "loss": 0.3105, "mean_token_accuracy": 0.8934277892112732, "step": 168 }, { "epoch": 0.786046511627907, "grad_norm": 2.3943395614624023, "learning_rate": 2.2539385222330797e-06, "loss": 0.2516, "mean_token_accuracy": 0.9115543961524963, "step": 169 }, { "epoch": 0.7906976744186046, "grad_norm": 2.2870237827301025, "learning_rate": 2.203640060854387e-06, "loss": 0.2662, "mean_token_accuracy": 0.9091401696205139, "step": 170 }, { "epoch": 0.7953488372093023, "grad_norm": 2.2317891120910645, "learning_rate": 2.1542149933981014e-06, "loss": 0.3036, "mean_token_accuracy": 0.8948342800140381, "step": 171 }, { "epoch": 0.8, "grad_norm": 2.2114291191101074, "learning_rate": 2.10567641538605e-06, "loss": 0.2777, "mean_token_accuracy": 0.9104036092758179, "step": 172 }, { "epoch": 0.8046511627906977, "grad_norm": 2.1394577026367188, "learning_rate": 2.058037187458398e-06, "loss": 0.2885, "mean_token_accuracy": 0.9054663777351379, "step": 173 }, { "epoch": 0.8093023255813954, "grad_norm": 2.3979647159576416, "learning_rate": 2.011309931966136e-06, "loss": 0.2822, "mean_token_accuracy": 0.9056000113487244, "step": 174 }, { "epoch": 0.813953488372093, "grad_norm": 2.212559461593628, "learning_rate": 1.965507029626695e-06, "loss": 0.2933, "mean_token_accuracy": 0.8865057826042175, "step": 175 }, { "epoch": 0.8186046511627907, "grad_norm": 2.2449798583984375, "learning_rate": 1.920640616243589e-06, "loss": 0.2605, "mean_token_accuracy": 0.9103293418884277, "step": 176 }, { "epoch": 0.8232558139534883, "grad_norm": 2.4525561332702637, "learning_rate": 1.8767225794909484e-06, "loss": 0.318, "mean_token_accuracy": 0.8924676179885864, "step": 177 }, { "epoch": 0.827906976744186, "grad_norm": 2.1186583042144775, "learning_rate": 1.8337645557637929e-06, "loss": 0.2723, "mean_token_accuracy": 0.9039152264595032, "step": 178 }, { "epoch": 0.8325581395348837, "grad_norm": 2.124532461166382, "learning_rate": 1.7917779270948887e-06, "loss": 0.2916, "mean_token_accuracy": 0.8994188904762268, "step": 179 }, { "epoch": 0.8372093023255814, "grad_norm": 2.302385091781616, "learning_rate": 1.7507738181390027e-06, "loss": 0.2811, "mean_token_accuracy": 0.9010390639305115, "step": 180 }, { "epoch": 0.8418604651162791, "grad_norm": 2.309089422225952, "learning_rate": 1.7107630932253383e-06, "loss": 0.2813, "mean_token_accuracy": 0.902031660079956, "step": 181 }, { "epoch": 0.8465116279069768, "grad_norm": 2.244872808456421, "learning_rate": 1.6717563534789594e-06, "loss": 0.2882, "mean_token_accuracy": 0.9040014147758484, "step": 182 }, { "epoch": 0.8511627906976744, "grad_norm": 2.278420925140381, "learning_rate": 1.6337639340119476e-06, "loss": 0.2939, "mean_token_accuracy": 0.8955157995223999, "step": 183 }, { "epoch": 0.8558139534883721, "grad_norm": 2.336935043334961, "learning_rate": 1.596795901185037e-06, "loss": 0.3015, "mean_token_accuracy": 0.8964377641677856, "step": 184 }, { "epoch": 0.8604651162790697, "grad_norm": 2.0233819484710693, "learning_rate": 1.5608620499404628e-06, "loss": 0.274, "mean_token_accuracy": 0.9038158059120178, "step": 185 }, { "epoch": 0.8651162790697674, "grad_norm": 2.278059959411621, "learning_rate": 1.5259719012067249e-06, "loss": 0.2998, "mean_token_accuracy": 0.9001963138580322, "step": 186 }, { "epoch": 0.8697674418604651, "grad_norm": 2.2441978454589844, "learning_rate": 1.4921346993759453e-06, "loss": 0.2667, "mean_token_accuracy": 0.9050288796424866, "step": 187 }, { "epoch": 0.8744186046511628, "grad_norm": 2.1383931636810303, "learning_rate": 1.459359409854505e-06, "loss": 0.2815, "mean_token_accuracy": 0.908123791217804, "step": 188 }, { "epoch": 0.8790697674418605, "grad_norm": 2.204313278198242, "learning_rate": 1.4276547166875946e-06, "loss": 0.2606, "mean_token_accuracy": 0.9115758538246155, "step": 189 }, { "epoch": 0.8837209302325582, "grad_norm": 2.4099416732788086, "learning_rate": 1.397029020258313e-06, "loss": 0.2828, "mean_token_accuracy": 0.8990963697433472, "step": 190 }, { "epoch": 0.8883720930232558, "grad_norm": 2.332491159439087, "learning_rate": 1.367490435061928e-06, "loss": 0.3299, "mean_token_accuracy": 0.8829391598701477, "step": 191 }, { "epoch": 0.8930232558139535, "grad_norm": 2.145528554916382, "learning_rate": 1.3390467875558855e-06, "loss": 0.2444, "mean_token_accuracy": 0.9146341681480408, "step": 192 }, { "epoch": 0.8976744186046511, "grad_norm": 2.165822982788086, "learning_rate": 1.3117056140861317e-06, "loss": 0.2863, "mean_token_accuracy": 0.9044266939163208, "step": 193 }, { "epoch": 0.9023255813953488, "grad_norm": 2.1664891242980957, "learning_rate": 1.285474158890304e-06, "loss": 0.2596, "mean_token_accuracy": 0.9129547476768494, "step": 194 }, { "epoch": 0.9069767441860465, "grad_norm": 2.0730209350585938, "learning_rate": 1.2603593721783219e-06, "loss": 0.2618, "mean_token_accuracy": 0.9128862023353577, "step": 195 }, { "epoch": 0.9116279069767442, "grad_norm": 2.156203508377075, "learning_rate": 1.2363679082908766e-06, "loss": 0.2953, "mean_token_accuracy": 0.8979016542434692, "step": 196 }, { "epoch": 0.9162790697674419, "grad_norm": 2.236882209777832, "learning_rate": 1.2135061239363161e-06, "loss": 0.2538, "mean_token_accuracy": 0.9129107594490051, "step": 197 }, { "epoch": 0.9209302325581395, "grad_norm": 2.2554514408111572, "learning_rate": 1.1917800765063954e-06, "loss": 0.2668, "mean_token_accuracy": 0.9089072346687317, "step": 198 }, { "epoch": 0.9255813953488372, "grad_norm": 2.4423582553863525, "learning_rate": 1.1711955224713209e-06, "loss": 0.2917, "mean_token_accuracy": 0.898510217666626, "step": 199 }, { "epoch": 0.9302325581395349, "grad_norm": 2.198429822921753, "learning_rate": 1.1517579158545386e-06, "loss": 0.2671, "mean_token_accuracy": 0.9085792303085327, "step": 200 }, { "epoch": 0.9348837209302325, "grad_norm": 2.3745462894439697, "learning_rate": 1.1334724067876463e-06, "loss": 0.2967, "mean_token_accuracy": 0.9004044532775879, "step": 201 }, { "epoch": 0.9395348837209302, "grad_norm": 2.1694419384002686, "learning_rate": 1.1163438401458358e-06, "loss": 0.263, "mean_token_accuracy": 0.906028687953949, "step": 202 }, { "epoch": 0.9441860465116279, "grad_norm": 2.3387255668640137, "learning_rate": 1.1003767542642021e-06, "loss": 0.2795, "mean_token_accuracy": 0.9063853025436401, "step": 203 }, { "epoch": 0.9488372093023256, "grad_norm": 2.1712303161621094, "learning_rate": 1.0855753797352868e-06, "loss": 0.2498, "mean_token_accuracy": 0.9100555777549744, "step": 204 }, { "epoch": 0.9534883720930233, "grad_norm": 2.0819292068481445, "learning_rate": 1.0719436382881466e-06, "loss": 0.2587, "mean_token_accuracy": 0.9077077507972717, "step": 205 }, { "epoch": 0.958139534883721, "grad_norm": 2.2681405544281006, "learning_rate": 1.0594851417492665e-06, "loss": 0.28, "mean_token_accuracy": 0.9018029570579529, "step": 206 }, { "epoch": 0.9627906976744186, "grad_norm": 2.4561564922332764, "learning_rate": 1.0482031910855804e-06, "loss": 0.2997, "mean_token_accuracy": 0.8940787315368652, "step": 207 }, { "epoch": 0.9674418604651163, "grad_norm": 2.234903335571289, "learning_rate": 1.0381007755298547e-06, "loss": 0.2715, "mean_token_accuracy": 0.9086244106292725, "step": 208 }, { "epoch": 0.9720930232558139, "grad_norm": 2.2118217945098877, "learning_rate": 1.029180571788672e-06, "loss": 0.2549, "mean_token_accuracy": 0.9135593175888062, "step": 209 }, { "epoch": 0.9767441860465116, "grad_norm": 2.0356454849243164, "learning_rate": 1.021444943333218e-06, "loss": 0.2619, "mean_token_accuracy": 0.9112606048583984, "step": 210 }, { "epoch": 0.9813953488372092, "grad_norm": 2.2461442947387695, "learning_rate": 1.0148959397730637e-06, "loss": 0.2587, "mean_token_accuracy": 0.9082584977149963, "step": 211 }, { "epoch": 0.986046511627907, "grad_norm": 2.1244120597839355, "learning_rate": 1.0095352963131057e-06, "loss": 0.2805, "mean_token_accuracy": 0.9018282294273376, "step": 212 }, { "epoch": 0.9906976744186047, "grad_norm": 2.2568986415863037, "learning_rate": 1.0053644332938118e-06, "loss": 0.2488, "mean_token_accuracy": 0.9099595546722412, "step": 213 }, { "epoch": 0.9953488372093023, "grad_norm": 2.1964008808135986, "learning_rate": 1.0023844558148912e-06, "loss": 0.295, "mean_token_accuracy": 0.8972173929214478, "step": 214 }, { "epoch": 1.0, "grad_norm": 2.0329501628875732, "learning_rate": 1.0005961534424925e-06, "loss": 0.2276, "mean_token_accuracy": 0.9119472503662109, "step": 215 }, { "epoch": 1.0, "step": 215, "total_flos": 2.0144468407196058e+17, "train_loss": 0.32494123903817906, "train_runtime": 1259.521, "train_samples_per_second": 5.439, "train_steps_per_second": 0.171 } ], "logging_steps": 1, "max_steps": 215, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0144468407196058e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }