{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 50, "global_step": 846, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01182033096926714, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 3.2832, "mean_token_accuracy": 0.4556728000442187, "num_tokens": 74681.0, "step": 5 }, { "epoch": 0.02364066193853428, "grad_norm": 104.05796232824258, "learning_rate": 1.7647058823529414e-07, "loss": 3.2183, "mean_token_accuracy": 0.45771179099877674, "num_tokens": 153902.0, "step": 10 }, { "epoch": 0.03546099290780142, "grad_norm": 44.640553186839156, "learning_rate": 4.7058823529411767e-07, "loss": 2.7453, "mean_token_accuracy": 0.4893781746427218, "num_tokens": 230170.0, "step": 15 }, { "epoch": 0.04728132387706856, "grad_norm": 16.641432874457237, "learning_rate": 7.647058823529413e-07, "loss": 2.4251, "mean_token_accuracy": 0.5127047290404637, "num_tokens": 311284.0, "step": 20 }, { "epoch": 0.0591016548463357, "grad_norm": 12.092821042180054, "learning_rate": 1.0588235294117648e-06, "loss": 2.3421, "mean_token_accuracy": 0.5212494264046351, "num_tokens": 385963.0, "step": 25 }, { "epoch": 0.07092198581560284, "grad_norm": 12.48879299232815, "learning_rate": 1.3529411764705883e-06, "loss": 2.2408, "mean_token_accuracy": 0.5327892646193504, "num_tokens": 458884.0, "step": 30 }, { "epoch": 0.08274231678486997, "grad_norm": 9.244193713594534, "learning_rate": 1.6470588235294118e-06, "loss": 2.1777, "mean_token_accuracy": 0.5375485748052597, "num_tokens": 540057.0, "step": 35 }, { "epoch": 0.09456264775413711, "grad_norm": 12.566401264702058, "learning_rate": 1.9411764705882353e-06, "loss": 2.1226, "mean_token_accuracy": 0.5466068352262179, "num_tokens": 621727.0, "step": 40 }, { "epoch": 0.10638297872340426, "grad_norm": 12.966605888942672, "learning_rate": 2.2352941176470592e-06, "loss": 2.0219, "mean_token_accuracy": 0.5580442860722542, "num_tokens": 704767.0, "step": 45 }, { "epoch": 0.1182033096926714, "grad_norm": 12.223856725829414, "learning_rate": 2.5294117647058823e-06, "loss": 1.9071, "mean_token_accuracy": 0.5804560139775277, "num_tokens": 787574.0, "step": 50 }, { "epoch": 0.1182033096926714, "eval_loss": 2.261831045150757, "eval_mean_token_accuracy": 0.5334406300379445, "eval_num_tokens": 787574.0, "eval_runtime": 112.2462, "eval_samples_per_second": 33.444, "eval_steps_per_second": 5.577, "step": 50 }, { "epoch": 0.13002364066193853, "grad_norm": 8.808307531063669, "learning_rate": 2.8235294117647062e-06, "loss": 1.8351, "mean_token_accuracy": 0.5938245743513108, "num_tokens": 868536.0, "step": 55 }, { "epoch": 0.14184397163120568, "grad_norm": 11.082814688242964, "learning_rate": 3.1176470588235297e-06, "loss": 1.8068, "mean_token_accuracy": 0.597475977241993, "num_tokens": 944129.0, "step": 60 }, { "epoch": 0.1536643026004728, "grad_norm": 11.206601503508725, "learning_rate": 3.4117647058823532e-06, "loss": 1.7111, "mean_token_accuracy": 0.6089037587245305, "num_tokens": 1032000.0, "step": 65 }, { "epoch": 0.16548463356973994, "grad_norm": 14.333554845362597, "learning_rate": 3.7058823529411767e-06, "loss": 1.6025, "mean_token_accuracy": 0.6384305556615194, "num_tokens": 1110048.0, "step": 70 }, { "epoch": 0.1773049645390071, "grad_norm": 17.238389536700844, "learning_rate": 4.000000000000001e-06, "loss": 1.4971, "mean_token_accuracy": 0.6512377719084422, "num_tokens": 1189270.0, "step": 75 }, { "epoch": 0.18912529550827423, "grad_norm": 15.105419987740975, "learning_rate": 4.294117647058823e-06, "loss": 1.405, "mean_token_accuracy": 0.6698001553614934, "num_tokens": 1276274.0, "step": 80 }, { "epoch": 0.20094562647754138, "grad_norm": 18.16302751183317, "learning_rate": 4.588235294117647e-06, "loss": 1.3866, "mean_token_accuracy": 0.677808458606402, "num_tokens": 1347635.0, "step": 85 }, { "epoch": 0.2127659574468085, "grad_norm": 16.544595608890432, "learning_rate": 4.882352941176471e-06, "loss": 1.2625, "mean_token_accuracy": 0.7019270072380702, "num_tokens": 1425994.0, "step": 90 }, { "epoch": 0.22458628841607564, "grad_norm": 16.879827911667025, "learning_rate": 4.999808275592979e-06, "loss": 1.2236, "mean_token_accuracy": 0.7105038404464722, "num_tokens": 1507998.0, "step": 95 }, { "epoch": 0.2364066193853428, "grad_norm": 14.093813445236359, "learning_rate": 4.998636732930301e-06, "loss": 1.231, "mean_token_accuracy": 0.7122000068426132, "num_tokens": 1588043.0, "step": 100 }, { "epoch": 0.2364066193853428, "eval_loss": 2.1943154335021973, "eval_mean_token_accuracy": 0.5279583666271295, "eval_num_tokens": 1588043.0, "eval_runtime": 110.4732, "eval_samples_per_second": 33.981, "eval_steps_per_second": 5.667, "step": 100 }, { "epoch": 0.24822695035460993, "grad_norm": 11.560621474065723, "learning_rate": 4.9964006596886895e-06, "loss": 1.0953, "mean_token_accuracy": 0.7428930242856343, "num_tokens": 1676024.0, "step": 105 }, { "epoch": 0.26004728132387706, "grad_norm": 15.262549473091083, "learning_rate": 4.993101008534978e-06, "loss": 1.0258, "mean_token_accuracy": 0.7531117727359136, "num_tokens": 1758985.0, "step": 110 }, { "epoch": 0.2718676122931442, "grad_norm": 16.492390914481756, "learning_rate": 4.988739185267578e-06, "loss": 1.0085, "mean_token_accuracy": 0.7558938791354497, "num_tokens": 1840987.0, "step": 115 }, { "epoch": 0.28368794326241137, "grad_norm": 10.824079042185982, "learning_rate": 4.9833170482175505e-06, "loss": 0.8932, "mean_token_accuracy": 0.774032841126124, "num_tokens": 1926935.0, "step": 120 }, { "epoch": 0.29550827423167847, "grad_norm": 10.282044151979626, "learning_rate": 4.97683690745687e-06, "loss": 0.9501, "mean_token_accuracy": 0.7681633462508519, "num_tokens": 2009169.0, "step": 125 }, { "epoch": 0.3073286052009456, "grad_norm": 12.335809357841024, "learning_rate": 4.969301523814234e-06, "loss": 0.9667, "mean_token_accuracy": 0.765592094262441, "num_tokens": 2085142.0, "step": 130 }, { "epoch": 0.3191489361702128, "grad_norm": 33.465797114274885, "learning_rate": 4.9607141076988244e-06, "loss": 0.8265, "mean_token_accuracy": 0.7936822563409806, "num_tokens": 2163220.0, "step": 135 }, { "epoch": 0.3309692671394799, "grad_norm": 11.83225961982719, "learning_rate": 4.9510783177325335e-06, "loss": 0.7997, "mean_token_accuracy": 0.8059376885493597, "num_tokens": 2248409.0, "step": 140 }, { "epoch": 0.34278959810874704, "grad_norm": 10.335334720960685, "learning_rate": 4.9403982591912235e-06, "loss": 0.7572, "mean_token_accuracy": 0.8129442622264226, "num_tokens": 2327346.0, "step": 145 }, { "epoch": 0.3546099290780142, "grad_norm": 10.178136355537001, "learning_rate": 4.9286784822557e-06, "loss": 0.7467, "mean_token_accuracy": 0.8148943414290746, "num_tokens": 2406245.0, "step": 150 }, { "epoch": 0.3546099290780142, "eval_loss": 2.279350996017456, "eval_mean_token_accuracy": 0.520639596846157, "eval_num_tokens": 2406245.0, "eval_runtime": 109.1976, "eval_samples_per_second": 34.378, "eval_steps_per_second": 5.733, "step": 150 }, { "epoch": 0.3664302600472813, "grad_norm": 10.080113241918701, "learning_rate": 4.915923980073132e-06, "loss": 0.6972, "mean_token_accuracy": 0.8263093769550324, "num_tokens": 2490950.0, "step": 155 }, { "epoch": 0.37825059101654845, "grad_norm": 11.063889721157608, "learning_rate": 4.902140186629744e-06, "loss": 0.7158, "mean_token_accuracy": 0.824493623773257, "num_tokens": 2576510.0, "step": 160 }, { "epoch": 0.3900709219858156, "grad_norm": 10.278296874650637, "learning_rate": 4.887332974435705e-06, "loss": 0.6446, "mean_token_accuracy": 0.8359018911918005, "num_tokens": 2662979.0, "step": 165 }, { "epoch": 0.40189125295508277, "grad_norm": 9.750471315097762, "learning_rate": 4.871508652023164e-06, "loss": 0.59, "mean_token_accuracy": 0.850665803750356, "num_tokens": 2744509.0, "step": 170 }, { "epoch": 0.41371158392434987, "grad_norm": 10.48760989183566, "learning_rate": 4.854673961258549e-06, "loss": 0.5977, "mean_token_accuracy": 0.8467812786499659, "num_tokens": 2821398.0, "step": 175 }, { "epoch": 0.425531914893617, "grad_norm": 8.730182101129285, "learning_rate": 4.836836074470223e-06, "loss": 0.5764, "mean_token_accuracy": 0.8605570962031682, "num_tokens": 2906463.0, "step": 180 }, { "epoch": 0.4373522458628842, "grad_norm": 8.019194726296847, "learning_rate": 4.818002591392751e-06, "loss": 0.5152, "mean_token_accuracy": 0.8619820535182953, "num_tokens": 2990659.0, "step": 185 }, { "epoch": 0.4491725768321513, "grad_norm": 7.390157783777142, "learning_rate": 4.7981815359290805e-06, "loss": 0.5046, "mean_token_accuracy": 0.8695660372575124, "num_tokens": 3078550.0, "step": 190 }, { "epoch": 0.46099290780141844, "grad_norm": 10.541421255580739, "learning_rate": 4.777381352731997e-06, "loss": 0.5843, "mean_token_accuracy": 0.8512990067402522, "num_tokens": 3159131.0, "step": 195 }, { "epoch": 0.4728132387706856, "grad_norm": 10.078950854270674, "learning_rate": 4.7556109036063275e-06, "loss": 0.6007, "mean_token_accuracy": 0.8467452943325042, "num_tokens": 3229781.0, "step": 200 }, { "epoch": 0.4728132387706856, "eval_loss": 2.253284215927124, "eval_mean_token_accuracy": 0.5191043009297155, "eval_num_tokens": 3229781.0, "eval_runtime": 109.5026, "eval_samples_per_second": 34.282, "eval_steps_per_second": 5.717, "step": 200 }, { "epoch": 0.4846335697399527, "grad_norm": 9.992622922165191, "learning_rate": 4.732879463733416e-06, "loss": 0.4923, "mean_token_accuracy": 0.8700054933627447, "num_tokens": 3313277.0, "step": 205 }, { "epoch": 0.49645390070921985, "grad_norm": 6.744124934432935, "learning_rate": 4.7091967177194855e-06, "loss": 0.4855, "mean_token_accuracy": 0.8768873423337936, "num_tokens": 3389646.0, "step": 210 }, { "epoch": 0.508274231678487, "grad_norm": 11.128091166162552, "learning_rate": 4.684572755469557e-06, "loss": 0.4945, "mean_token_accuracy": 0.8679842710494995, "num_tokens": 3469439.0, "step": 215 }, { "epoch": 0.5200945626477541, "grad_norm": 9.062221614300752, "learning_rate": 4.6590180678887106e-06, "loss": 0.4731, "mean_token_accuracy": 0.8787163704633713, "num_tokens": 3551443.0, "step": 220 }, { "epoch": 0.5319148936170213, "grad_norm": 8.982474548381703, "learning_rate": 4.632543542412485e-06, "loss": 0.4604, "mean_token_accuracy": 0.8834071298440297, "num_tokens": 3633871.0, "step": 225 }, { "epoch": 0.5437352245862884, "grad_norm": 9.887115562771593, "learning_rate": 4.6051604583683466e-06, "loss": 0.4134, "mean_token_accuracy": 0.8951348612705866, "num_tokens": 3719608.0, "step": 230 }, { "epoch": 0.5555555555555556, "grad_norm": 8.821856914951912, "learning_rate": 4.5768804821701955e-06, "loss": 0.3942, "mean_token_accuracy": 0.8990837872028351, "num_tokens": 3803920.0, "step": 235 }, { "epoch": 0.5673758865248227, "grad_norm": 7.16860636832362, "learning_rate": 4.54771566234795e-06, "loss": 0.4262, "mean_token_accuracy": 0.8890527258316676, "num_tokens": 3888982.0, "step": 240 }, { "epoch": 0.5791962174940898, "grad_norm": 8.029202329935103, "learning_rate": 4.51767842441434e-06, "loss": 0.5444, "mean_token_accuracy": 0.8689338505268097, "num_tokens": 3968283.0, "step": 245 }, { "epoch": 0.5910165484633569, "grad_norm": 7.713164000798632, "learning_rate": 4.486781565571082e-06, "loss": 0.3567, "mean_token_accuracy": 0.8971066276232401, "num_tokens": 4050822.0, "step": 250 }, { "epoch": 0.5910165484633569, "eval_loss": 2.40179705619812, "eval_mean_token_accuracy": 0.5091435096134393, "eval_num_tokens": 4050822.0, "eval_runtime": 108.43, "eval_samples_per_second": 34.621, "eval_steps_per_second": 5.773, "step": 250 }, { "epoch": 0.6028368794326241, "grad_norm": 7.981770465410259, "learning_rate": 4.455038249256702e-06, "loss": 0.4175, "mean_token_accuracy": 0.8864375064770381, "num_tokens": 4125571.0, "step": 255 }, { "epoch": 0.6146572104018913, "grad_norm": 5.5094271110797655, "learning_rate": 4.42246199953832e-06, "loss": 0.4184, "mean_token_accuracy": 0.8935485412677129, "num_tokens": 4205062.0, "step": 260 }, { "epoch": 0.6264775413711584, "grad_norm": 6.038016625662183, "learning_rate": 4.389066695349807e-06, "loss": 0.3899, "mean_token_accuracy": 0.8980253010988235, "num_tokens": 4284113.0, "step": 265 }, { "epoch": 0.6382978723404256, "grad_norm": 7.483680898980003, "learning_rate": 4.354866564578725e-06, "loss": 0.3331, "mean_token_accuracy": 0.9092767784992853, "num_tokens": 4363941.0, "step": 270 }, { "epoch": 0.6501182033096927, "grad_norm": 7.385021253615098, "learning_rate": 4.319876178004624e-06, "loss": 0.3928, "mean_token_accuracy": 0.9003854801257452, "num_tokens": 4438097.0, "step": 275 }, { "epoch": 0.6619385342789598, "grad_norm": 7.463896490898541, "learning_rate": 4.284110443091236e-06, "loss": 0.3341, "mean_token_accuracy": 0.9117354313532512, "num_tokens": 4527132.0, "step": 280 }, { "epoch": 0.6737588652482269, "grad_norm": 6.952725217970187, "learning_rate": 4.247584597635234e-06, "loss": 0.3499, "mean_token_accuracy": 0.903310830394427, "num_tokens": 4608279.0, "step": 285 }, { "epoch": 0.6855791962174941, "grad_norm": 6.943838612781438, "learning_rate": 4.210314203274247e-06, "loss": 0.3287, "mean_token_accuracy": 0.9131078998247782, "num_tokens": 4691886.0, "step": 290 }, { "epoch": 0.6973995271867612, "grad_norm": 7.381800762619572, "learning_rate": 4.1723151388569165e-06, "loss": 0.3576, "mean_token_accuracy": 0.906527488430341, "num_tokens": 4769203.0, "step": 295 }, { "epoch": 0.7092198581560284, "grad_norm": 9.418331863507088, "learning_rate": 4.133603593677792e-06, "loss": 0.3257, "mean_token_accuracy": 0.9139421621958415, "num_tokens": 4850864.0, "step": 300 }, { "epoch": 0.7092198581560284, "eval_loss": 2.4337592124938965, "eval_mean_token_accuracy": 0.5087787045743137, "eval_num_tokens": 4850864.0, "eval_runtime": 111.3343, "eval_samples_per_second": 33.718, "eval_steps_per_second": 5.623, "step": 300 }, { "epoch": 0.7210401891252955, "grad_norm": 7.874468218736852, "learning_rate": 4.094196060579972e-06, "loss": 0.31, "mean_token_accuracy": 0.9130396594603857, "num_tokens": 4933811.0, "step": 305 }, { "epoch": 0.7328605200945626, "grad_norm": 7.223379947683901, "learning_rate": 4.054109328928423e-06, "loss": 0.3371, "mean_token_accuracy": 0.9120342075824738, "num_tokens": 5017994.0, "step": 310 }, { "epoch": 0.7446808510638298, "grad_norm": 5.83651158137616, "learning_rate": 4.013360477456956e-06, "loss": 0.2646, "mean_token_accuracy": 0.9279499053955078, "num_tokens": 5102351.0, "step": 315 }, { "epoch": 0.7565011820330969, "grad_norm": 7.113298483675116, "learning_rate": 3.971966866991926e-06, "loss": 0.3273, "mean_token_accuracy": 0.9165905058383942, "num_tokens": 5184621.0, "step": 320 }, { "epoch": 0.7683215130023641, "grad_norm": 8.025597832460408, "learning_rate": 3.92994613305575e-06, "loss": 0.3111, "mean_token_accuracy": 0.9173123935858408, "num_tokens": 5269023.0, "step": 325 }, { "epoch": 0.7801418439716312, "grad_norm": 6.537392517022451, "learning_rate": 3.887316178353384e-06, "loss": 0.2939, "mean_token_accuracy": 0.9172585904598236, "num_tokens": 5347155.0, "step": 330 }, { "epoch": 0.7919621749408984, "grad_norm": 4.806081208802315, "learning_rate": 3.844095165144977e-06, "loss": 0.2873, "mean_token_accuracy": 0.9166339705387752, "num_tokens": 5424281.0, "step": 335 }, { "epoch": 0.8037825059101655, "grad_norm": 5.822318247837762, "learning_rate": 3.800301507507935e-06, "loss": 0.2692, "mean_token_accuracy": 0.9217310587565104, "num_tokens": 5509893.0, "step": 340 }, { "epoch": 0.8156028368794326, "grad_norm": 4.6655813133266575, "learning_rate": 3.755953863491709e-06, "loss": 0.295, "mean_token_accuracy": 0.9214087873697281, "num_tokens": 5587333.0, "step": 345 }, { "epoch": 0.8274231678486997, "grad_norm": 5.2738562954379296, "learning_rate": 3.7110711271686276e-06, "loss": 0.2995, "mean_token_accuracy": 0.9183420379956563, "num_tokens": 5667797.0, "step": 350 }, { "epoch": 0.8274231678486997, "eval_loss": 2.484821319580078, "eval_mean_token_accuracy": 0.5057851479838069, "eval_num_tokens": 5667797.0, "eval_runtime": 108.6037, "eval_samples_per_second": 34.566, "eval_steps_per_second": 5.764, "step": 350 }, { "epoch": 0.8392434988179669, "grad_norm": 5.288946153592396, "learning_rate": 3.6656724205841866e-06, "loss": 0.3035, "mean_token_accuracy": 0.9168096592028936, "num_tokens": 5740150.0, "step": 355 }, { "epoch": 0.851063829787234, "grad_norm": 5.709113745487931, "learning_rate": 3.619777085610201e-06, "loss": 0.2726, "mean_token_accuracy": 0.920536317427953, "num_tokens": 5820836.0, "step": 360 }, { "epoch": 0.8628841607565012, "grad_norm": 6.035834640936946, "learning_rate": 3.57340467570431e-06, "loss": 0.2892, "mean_token_accuracy": 0.9215306291977564, "num_tokens": 5899213.0, "step": 365 }, { "epoch": 0.8747044917257684, "grad_norm": 5.74179547981077, "learning_rate": 3.5265749475793274e-06, "loss": 0.2759, "mean_token_accuracy": 0.9214441031217575, "num_tokens": 5978332.0, "step": 370 }, { "epoch": 0.8865248226950354, "grad_norm": 4.69842226017672, "learning_rate": 3.47930785278601e-06, "loss": 0.25, "mean_token_accuracy": 0.92916699051857, "num_tokens": 6061339.0, "step": 375 }, { "epoch": 0.8983451536643026, "grad_norm": 6.7345436238624155, "learning_rate": 3.431623529212797e-06, "loss": 0.2819, "mean_token_accuracy": 0.9222736060619354, "num_tokens": 6137077.0, "step": 380 }, { "epoch": 0.9101654846335697, "grad_norm": 5.563878030106144, "learning_rate": 3.3835422925061826e-06, "loss": 0.2486, "mean_token_accuracy": 0.9305912603934606, "num_tokens": 6227518.0, "step": 385 }, { "epoch": 0.9219858156028369, "grad_norm": 5.065962877403909, "learning_rate": 3.3350846274153387e-06, "loss": 0.2716, "mean_token_accuracy": 0.9261675874392191, "num_tokens": 6302707.0, "step": 390 }, { "epoch": 0.933806146572104, "grad_norm": 5.045177866951061, "learning_rate": 3.286271179064701e-06, "loss": 0.3013, "mean_token_accuracy": 0.9195235311985016, "num_tokens": 6380736.0, "step": 395 }, { "epoch": 0.9456264775413712, "grad_norm": 4.252712034080436, "learning_rate": 3.2371227441582285e-06, "loss": 0.2931, "mean_token_accuracy": 0.9227547705173492, "num_tokens": 6453122.0, "step": 400 }, { "epoch": 0.9456264775413712, "eval_loss": 2.4907124042510986, "eval_mean_token_accuracy": 0.5083554677974683, "eval_num_tokens": 6453122.0, "eval_runtime": 109.7712, "eval_samples_per_second": 34.198, "eval_steps_per_second": 5.703, "step": 400 }, { "epoch": 0.9574468085106383, "grad_norm": 3.815988429880776, "learning_rate": 3.187660262119077e-06, "loss": 0.2545, "mean_token_accuracy": 0.926997916897138, "num_tokens": 6533869.0, "step": 405 }, { "epoch": 0.9692671394799054, "grad_norm": 7.321292264517903, "learning_rate": 3.1379048061684735e-06, "loss": 0.2727, "mean_token_accuracy": 0.9246514956156413, "num_tokens": 6608291.0, "step": 410 }, { "epoch": 0.9810874704491725, "grad_norm": 5.072077367784098, "learning_rate": 3.087877574347587e-06, "loss": 0.2427, "mean_token_accuracy": 0.9336031595865886, "num_tokens": 6688125.0, "step": 415 }, { "epoch": 0.9929078014184397, "grad_norm": 4.533402816180472, "learning_rate": 3.0375998804862146e-06, "loss": 0.2597, "mean_token_accuracy": 0.9298433562119802, "num_tokens": 6765643.0, "step": 420 }, { "epoch": 1.0047281323877069, "grad_norm": 4.67945464989491, "learning_rate": 2.9870931451221436e-06, "loss": 0.2356, "mean_token_accuracy": 0.9360798348983129, "num_tokens": 6842035.0, "step": 425 }, { "epoch": 1.016548463356974, "grad_norm": 4.067854721801768, "learning_rate": 2.9363788863750465e-06, "loss": 0.1656, "mean_token_accuracy": 0.9521991004546483, "num_tokens": 6932082.0, "step": 430 }, { "epoch": 1.0283687943262412, "grad_norm": 3.6749072345955294, "learning_rate": 2.885478710778803e-06, "loss": 0.1761, "mean_token_accuracy": 0.9476487815380097, "num_tokens": 7011600.0, "step": 435 }, { "epoch": 1.0401891252955082, "grad_norm": 5.286620213070309, "learning_rate": 2.834414304076155e-06, "loss": 0.1617, "mean_token_accuracy": 0.9531299829483032, "num_tokens": 7097718.0, "step": 440 }, { "epoch": 1.0520094562647755, "grad_norm": 3.99180921580787, "learning_rate": 2.783207421979614e-06, "loss": 0.1743, "mean_token_accuracy": 0.9491087565819423, "num_tokens": 7180644.0, "step": 445 }, { "epoch": 1.0638297872340425, "grad_norm": 3.5159362565459324, "learning_rate": 2.731879880902555e-06, "loss": 0.1773, "mean_token_accuracy": 0.9466255068778991, "num_tokens": 7259970.0, "step": 450 }, { "epoch": 1.0638297872340425, "eval_loss": 2.6374220848083496, "eval_mean_token_accuracy": 0.5018131742938258, "eval_num_tokens": 7259970.0, "eval_runtime": 110.8709, "eval_samples_per_second": 33.859, "eval_steps_per_second": 5.646, "step": 450 }, { "epoch": 1.0756501182033098, "grad_norm": 4.265917733350775, "learning_rate": 2.680453548664458e-06, "loss": 0.1733, "mean_token_accuracy": 0.9452025413513183, "num_tokens": 7340719.0, "step": 455 }, { "epoch": 1.0874704491725768, "grad_norm": 4.156827622094583, "learning_rate": 2.6289503351742365e-06, "loss": 0.1692, "mean_token_accuracy": 0.9517264991998673, "num_tokens": 7422782.0, "step": 460 }, { "epoch": 1.099290780141844, "grad_norm": 3.6858687333016302, "learning_rate": 2.5773921830956455e-06, "loss": 0.1735, "mean_token_accuracy": 0.9458188633124034, "num_tokens": 7504521.0, "step": 465 }, { "epoch": 1.1111111111111112, "grad_norm": 4.471806923768531, "learning_rate": 2.525801058498725e-06, "loss": 0.1656, "mean_token_accuracy": 0.948051197330157, "num_tokens": 7590826.0, "step": 470 }, { "epoch": 1.1229314420803782, "grad_norm": 3.417137678628772, "learning_rate": 2.474198941501276e-06, "loss": 0.1558, "mean_token_accuracy": 0.9528810689846675, "num_tokens": 7678043.0, "step": 475 }, { "epoch": 1.1347517730496455, "grad_norm": 5.039333815788341, "learning_rate": 2.4226078169043554e-06, "loss": 0.1805, "mean_token_accuracy": 0.9472235560417175, "num_tokens": 7755209.0, "step": 480 }, { "epoch": 1.1465721040189125, "grad_norm": 3.7176041502221917, "learning_rate": 2.3710496648257644e-06, "loss": 0.1709, "mean_token_accuracy": 0.9491322924693425, "num_tokens": 7838175.0, "step": 485 }, { "epoch": 1.1583924349881798, "grad_norm": 3.2115692600213652, "learning_rate": 2.319546451335543e-06, "loss": 0.1672, "mean_token_accuracy": 0.949834555387497, "num_tokens": 7922235.0, "step": 490 }, { "epoch": 1.1702127659574468, "grad_norm": 4.136417148989969, "learning_rate": 2.2681201190974454e-06, "loss": 0.1771, "mean_token_accuracy": 0.9477785180012385, "num_tokens": 8001243.0, "step": 495 }, { "epoch": 1.1820330969267139, "grad_norm": 3.8397139922722716, "learning_rate": 2.2167925780203865e-06, "loss": 0.1888, "mean_token_accuracy": 0.9445441563924154, "num_tokens": 8076462.0, "step": 500 }, { "epoch": 1.1820330969267139, "eval_loss": 2.685175895690918, "eval_mean_token_accuracy": 0.49984380817070556, "eval_num_tokens": 8076462.0, "eval_runtime": 112.2773, "eval_samples_per_second": 33.435, "eval_steps_per_second": 5.575, "step": 500 }, { "epoch": 1.1938534278959811, "grad_norm": 3.315216919284447, "learning_rate": 2.1655856959238452e-06, "loss": 0.1539, "mean_token_accuracy": 0.952604294816653, "num_tokens": 8164712.0, "step": 505 }, { "epoch": 1.2056737588652482, "grad_norm": 4.235323705607921, "learning_rate": 2.114521289221198e-06, "loss": 0.1608, "mean_token_accuracy": 0.9499759902556737, "num_tokens": 8248878.0, "step": 510 }, { "epoch": 1.2174940898345155, "grad_norm": 4.608242768223314, "learning_rate": 2.0636211136249543e-06, "loss": 0.1672, "mean_token_accuracy": 0.9494648416837056, "num_tokens": 8326064.0, "step": 515 }, { "epoch": 1.2293144208037825, "grad_norm": 3.502689021987934, "learning_rate": 2.0129068548778572e-06, "loss": 0.1537, "mean_token_accuracy": 0.9527056773503622, "num_tokens": 8414776.0, "step": 520 }, { "epoch": 1.2411347517730495, "grad_norm": 4.235942485988551, "learning_rate": 1.962400119513786e-06, "loss": 0.174, "mean_token_accuracy": 0.9458868026733398, "num_tokens": 8491600.0, "step": 525 }, { "epoch": 1.2529550827423168, "grad_norm": 3.2078232835897222, "learning_rate": 1.9121224256524134e-06, "loss": 0.172, "mean_token_accuracy": 0.9478765934705734, "num_tokens": 8570603.0, "step": 530 }, { "epoch": 1.2647754137115839, "grad_norm": 3.0687990919983323, "learning_rate": 1.862095193831527e-06, "loss": 0.1601, "mean_token_accuracy": 0.9530910869439443, "num_tokens": 8654941.0, "step": 535 }, { "epoch": 1.2765957446808511, "grad_norm": 3.5905336673591757, "learning_rate": 1.8123397378809232e-06, "loss": 0.16, "mean_token_accuracy": 0.9501720656951268, "num_tokens": 8737108.0, "step": 540 }, { "epoch": 1.2884160756501182, "grad_norm": 3.4606821671127697, "learning_rate": 1.7628772558417717e-06, "loss": 0.1609, "mean_token_accuracy": 0.9513311187426249, "num_tokens": 8816132.0, "step": 545 }, { "epoch": 1.3002364066193852, "grad_norm": 3.0342047963429017, "learning_rate": 1.7137288209352994e-06, "loss": 0.1568, "mean_token_accuracy": 0.9533144136269888, "num_tokens": 8899608.0, "step": 550 }, { "epoch": 1.3002364066193852, "eval_loss": 2.695701837539673, "eval_mean_token_accuracy": 0.5027767699747421, "eval_num_tokens": 8899608.0, "eval_runtime": 110.95, "eval_samples_per_second": 33.835, "eval_steps_per_second": 5.642, "step": 550 }, { "epoch": 1.3120567375886525, "grad_norm": 4.541312231558626, "learning_rate": 1.664915372584662e-06, "loss": 0.1609, "mean_token_accuracy": 0.950713715950648, "num_tokens": 8981457.0, "step": 555 }, { "epoch": 1.3238770685579198, "grad_norm": 2.7175384180465874, "learning_rate": 1.6164577074938182e-06, "loss": 0.174, "mean_token_accuracy": 0.9483120679855347, "num_tokens": 9060540.0, "step": 560 }, { "epoch": 1.3356973995271868, "grad_norm": 3.366989184203709, "learning_rate": 1.5683764707872037e-06, "loss": 0.1704, "mean_token_accuracy": 0.9461151162783304, "num_tokens": 9141972.0, "step": 565 }, { "epoch": 1.3475177304964538, "grad_norm": 4.153873238683359, "learning_rate": 1.5206921472139907e-06, "loss": 0.1783, "mean_token_accuracy": 0.9454526672760646, "num_tokens": 9215271.0, "step": 570 }, { "epoch": 1.3593380614657211, "grad_norm": 2.855098111823155, "learning_rate": 1.4734250524206727e-06, "loss": 0.1756, "mean_token_accuracy": 0.9473081688086192, "num_tokens": 9293349.0, "step": 575 }, { "epoch": 1.3711583924349882, "grad_norm": 3.486431999394093, "learning_rate": 1.4265953242956914e-06, "loss": 0.1669, "mean_token_accuracy": 0.9501691997051239, "num_tokens": 9372365.0, "step": 580 }, { "epoch": 1.3829787234042552, "grad_norm": 3.118171717428614, "learning_rate": 1.3802229143897993e-06, "loss": 0.1705, "mean_token_accuracy": 0.9490540792544683, "num_tokens": 9450218.0, "step": 585 }, { "epoch": 1.3947990543735225, "grad_norm": 4.284146957939775, "learning_rate": 1.3343275794158138e-06, "loss": 0.1656, "mean_token_accuracy": 0.9492589155832927, "num_tokens": 9532191.0, "step": 590 }, { "epoch": 1.4066193853427895, "grad_norm": 4.987440979362939, "learning_rate": 1.2889288728313732e-06, "loss": 0.1715, "mean_token_accuracy": 0.9477653960386913, "num_tokens": 9607710.0, "step": 595 }, { "epoch": 1.4184397163120568, "grad_norm": 3.4008167160659326, "learning_rate": 1.2440461365082917e-06, "loss": 0.1541, "mean_token_accuracy": 0.9530154536167781, "num_tokens": 9691506.0, "step": 600 }, { "epoch": 1.4184397163120568, "eval_loss": 2.6637179851531982, "eval_mean_token_accuracy": 0.5024550324811722, "eval_num_tokens": 9691506.0, "eval_runtime": 110.6668, "eval_samples_per_second": 33.922, "eval_steps_per_second": 5.657, "step": 600 }, { "epoch": 1.4302600472813238, "grad_norm": 3.4244372133508523, "learning_rate": 1.1996984924920651e-06, "loss": 0.1739, "mean_token_accuracy": 0.9478476305802663, "num_tokens": 9766964.0, "step": 605 }, { "epoch": 1.442080378250591, "grad_norm": 3.8359155778997174, "learning_rate": 1.1559048348550245e-06, "loss": 0.1687, "mean_token_accuracy": 0.9482156693935394, "num_tokens": 9846839.0, "step": 610 }, { "epoch": 1.4539007092198581, "grad_norm": 2.6947789470311263, "learning_rate": 1.1126838216466171e-06, "loss": 0.1585, "mean_token_accuracy": 0.9508161584536234, "num_tokens": 9928630.0, "step": 615 }, { "epoch": 1.4657210401891252, "grad_norm": 2.4566267363432215, "learning_rate": 1.0700538669442512e-06, "loss": 0.162, "mean_token_accuracy": 0.9484717577695847, "num_tokens": 10007569.0, "step": 620 }, { "epoch": 1.4775413711583925, "grad_norm": 2.98051235327796, "learning_rate": 1.0280331330080756e-06, "loss": 0.1731, "mean_token_accuracy": 0.9485133985678355, "num_tokens": 10082100.0, "step": 625 }, { "epoch": 1.4893617021276595, "grad_norm": 2.5966978913461576, "learning_rate": 9.866395225430455e-07, "loss": 0.1699, "mean_token_accuracy": 0.9498284469048183, "num_tokens": 10160498.0, "step": 630 }, { "epoch": 1.5011820330969265, "grad_norm": 3.0272596986145848, "learning_rate": 9.458906710715776e-07, "loss": 0.1442, "mean_token_accuracy": 0.9565223922332128, "num_tokens": 10247185.0, "step": 635 }, { "epoch": 1.5130023640661938, "grad_norm": 2.902942105541407, "learning_rate": 9.058039394200283e-07, "loss": 0.1586, "mean_token_accuracy": 0.9523343364397685, "num_tokens": 10329810.0, "step": 640 }, { "epoch": 1.524822695035461, "grad_norm": 3.8091691785911803, "learning_rate": 8.663964063222094e-07, "loss": 0.1712, "mean_token_accuracy": 0.9478918602069218, "num_tokens": 10406342.0, "step": 645 }, { "epoch": 1.5366430260047281, "grad_norm": 2.84704352427359, "learning_rate": 8.27684861143084e-07, "loss": 0.1591, "mean_token_accuracy": 0.9527056852976481, "num_tokens": 10487338.0, "step": 650 }, { "epoch": 1.5366430260047281, "eval_loss": 2.6855952739715576, "eval_mean_token_accuracy": 0.5021388608331497, "eval_num_tokens": 10487338.0, "eval_runtime": 108.7473, "eval_samples_per_second": 34.52, "eval_steps_per_second": 5.756, "step": 650 }, { "epoch": 1.5484633569739952, "grad_norm": 3.395443058239169, "learning_rate": 7.896857967257532e-07, "loss": 0.1608, "mean_token_accuracy": 0.9517467439174652, "num_tokens": 10566656.0, "step": 655 }, { "epoch": 1.5602836879432624, "grad_norm": 2.6081166041756334, "learning_rate": 7.524154023647678e-07, "loss": 0.1621, "mean_token_accuracy": 0.9508071412642797, "num_tokens": 10642301.0, "step": 660 }, { "epoch": 1.5721040189125297, "grad_norm": 2.3224792159461134, "learning_rate": 7.158895569087651e-07, "loss": 0.1558, "mean_token_accuracy": 0.9522350211938222, "num_tokens": 10724129.0, "step": 665 }, { "epoch": 1.5839243498817965, "grad_norm": 2.9407695481451954, "learning_rate": 6.801238219953774e-07, "loss": 0.1573, "mean_token_accuracy": 0.9532469352086385, "num_tokens": 10803308.0, "step": 670 }, { "epoch": 1.5957446808510638, "grad_norm": 2.461257725656036, "learning_rate": 6.451334354212765e-07, "loss": 0.1581, "mean_token_accuracy": 0.9496888081232707, "num_tokens": 10882366.0, "step": 675 }, { "epoch": 1.607565011820331, "grad_norm": 2.9648253517967116, "learning_rate": 6.109333046501942e-07, "loss": 0.1556, "mean_token_accuracy": 0.9546032408873241, "num_tokens": 10962923.0, "step": 680 }, { "epoch": 1.6193853427895981, "grad_norm": 3.8083000466171475, "learning_rate": 5.775380004616804e-07, "loss": 0.1683, "mean_token_accuracy": 0.9476418197154999, "num_tokens": 11038680.0, "step": 685 }, { "epoch": 1.6312056737588652, "grad_norm": 2.3462649916503864, "learning_rate": 5.449617507433002e-07, "loss": 0.1526, "mean_token_accuracy": 0.9539241482814153, "num_tokens": 11120210.0, "step": 690 }, { "epoch": 1.6430260047281324, "grad_norm": 3.2163417105630074, "learning_rate": 5.132184344289187e-07, "loss": 0.1601, "mean_token_accuracy": 0.9499970803658168, "num_tokens": 11198180.0, "step": 695 }, { "epoch": 1.6548463356973995, "grad_norm": 2.785105592615648, "learning_rate": 4.823215755856603e-07, "loss": 0.1695, "mean_token_accuracy": 0.948690946896871, "num_tokens": 11272183.0, "step": 700 }, { "epoch": 1.6548463356973995, "eval_loss": 2.7217941284179688, "eval_mean_token_accuracy": 0.5019449446909725, "eval_num_tokens": 11272183.0, "eval_runtime": 95.876, "eval_samples_per_second": 39.155, "eval_steps_per_second": 6.529, "step": 700 }, { "epoch": 1.6666666666666665, "grad_norm": 2.7877400675525514, "learning_rate": 4.522843376520508e-07, "loss": 0.1535, "mean_token_accuracy": 0.9523400167624155, "num_tokens": 11354735.0, "step": 705 }, { "epoch": 1.6784869976359338, "grad_norm": 3.4767887349168283, "learning_rate": 4.2311951782980587e-07, "loss": 0.1579, "mean_token_accuracy": 0.9517521331707637, "num_tokens": 11432857.0, "step": 710 }, { "epoch": 1.690307328605201, "grad_norm": 3.638265461712605, "learning_rate": 3.9483954163165363e-07, "loss": 0.1683, "mean_token_accuracy": 0.9472279886404673, "num_tokens": 11506345.0, "step": 715 }, { "epoch": 1.702127659574468, "grad_norm": 3.329843084823806, "learning_rate": 3.674564575875156e-07, "loss": 0.1625, "mean_token_accuracy": 0.9498989204565684, "num_tokens": 11584102.0, "step": 720 }, { "epoch": 1.7139479905437351, "grad_norm": 2.693105492226511, "learning_rate": 3.4098193211128975e-07, "loss": 0.1669, "mean_token_accuracy": 0.94957415163517, "num_tokens": 11662420.0, "step": 725 }, { "epoch": 1.7257683215130024, "grad_norm": 2.347451562983623, "learning_rate": 3.1542724453044323e-07, "loss": 0.1547, "mean_token_accuracy": 0.9542993714412054, "num_tokens": 11745725.0, "step": 730 }, { "epoch": 1.7375886524822695, "grad_norm": 3.428528200094209, "learning_rate": 2.908032822805157e-07, "loss": 0.1629, "mean_token_accuracy": 0.949917741616567, "num_tokens": 11825500.0, "step": 735 }, { "epoch": 1.7494089834515365, "grad_norm": 2.5067600170197095, "learning_rate": 2.671205362665841e-07, "loss": 0.1581, "mean_token_accuracy": 0.9531556775172552, "num_tokens": 11906463.0, "step": 740 }, { "epoch": 1.7612293144208038, "grad_norm": 2.5295945128152417, "learning_rate": 2.4438909639367294e-07, "loss": 0.1405, "mean_token_accuracy": 0.9579586456219356, "num_tokens": 11994373.0, "step": 745 }, { "epoch": 1.773049645390071, "grad_norm": 2.4227200543725185, "learning_rate": 2.2261864726800364e-07, "loss": 0.1454, "mean_token_accuracy": 0.9546575715144475, "num_tokens": 12078968.0, "step": 750 }, { "epoch": 1.773049645390071, "eval_loss": 2.739609479904175, "eval_mean_token_accuracy": 0.5016750158212436, "eval_num_tokens": 12078968.0, "eval_runtime": 96.4627, "eval_samples_per_second": 38.917, "eval_steps_per_second": 6.49, "step": 750 }, { "epoch": 1.784869976359338, "grad_norm": 2.734440331002493, "learning_rate": 2.0181846407092003e-07, "loss": 0.1562, "mean_token_accuracy": 0.953593663374583, "num_tokens": 12157200.0, "step": 755 }, { "epoch": 1.7966903073286051, "grad_norm": 3.0212485027888847, "learning_rate": 1.8199740860724928e-07, "loss": 0.1491, "mean_token_accuracy": 0.9538020372390748, "num_tokens": 12239141.0, "step": 760 }, { "epoch": 1.8085106382978724, "grad_norm": 3.7603456536650057, "learning_rate": 1.6316392552977732e-07, "loss": 0.1677, "mean_token_accuracy": 0.9494908769925435, "num_tokens": 12319055.0, "step": 765 }, { "epoch": 1.8203309692671394, "grad_norm": 3.856800815054017, "learning_rate": 1.4532603874145068e-07, "loss": 0.1563, "mean_token_accuracy": 0.9524733434120815, "num_tokens": 12399598.0, "step": 770 }, { "epoch": 1.8321513002364065, "grad_norm": 2.7252151319976603, "learning_rate": 1.2849134797683627e-07, "loss": 0.1515, "mean_token_accuracy": 0.9528288086255391, "num_tokens": 12480923.0, "step": 775 }, { "epoch": 1.8439716312056738, "grad_norm": 3.203424503912004, "learning_rate": 1.1266702556429615e-07, "loss": 0.1745, "mean_token_accuracy": 0.9487062722444535, "num_tokens": 12555434.0, "step": 780 }, { "epoch": 1.855791962174941, "grad_norm": 6.562777724476359, "learning_rate": 9.785981337025602e-08, "loss": 0.1518, "mean_token_accuracy": 0.9551320135593414, "num_tokens": 12635397.0, "step": 785 }, { "epoch": 1.867612293144208, "grad_norm": 2.524203238556445, "learning_rate": 8.407601992686864e-08, "loss": 0.1587, "mean_token_accuracy": 0.9509722570578257, "num_tokens": 12712118.0, "step": 790 }, { "epoch": 1.8794326241134751, "grad_norm": 2.700254226198366, "learning_rate": 7.132151774429996e-08, "loss": 0.1488, "mean_token_accuracy": 0.9541502008835475, "num_tokens": 12794157.0, "step": 795 }, { "epoch": 1.8912529550827424, "grad_norm": 2.528281516063428, "learning_rate": 5.9601740808777065e-08, "loss": 0.152, "mean_token_accuracy": 0.956131245692571, "num_tokens": 12876278.0, "step": 800 }, { "epoch": 1.8912529550827424, "eval_loss": 2.7611730098724365, "eval_mean_token_accuracy": 0.501501605247918, "eval_num_tokens": 12876278.0, "eval_runtime": 94.7046, "eval_samples_per_second": 39.639, "eval_steps_per_second": 6.61, "step": 800 }, { "epoch": 1.9030732860520094, "grad_norm": 2.8738724345889053, "learning_rate": 4.8921682267467075e-08, "loss": 0.1525, "mean_token_accuracy": 0.9533034404118855, "num_tokens": 12958123.0, "step": 805 }, { "epoch": 1.9148936170212765, "grad_norm": 2.5890563371947253, "learning_rate": 3.9285892301175744e-08, "loss": 0.1665, "mean_token_accuracy": 0.9502290070056916, "num_tokens": 13033488.0, "step": 810 }, { "epoch": 1.9267139479905437, "grad_norm": 2.8437599872465507, "learning_rate": 3.069847618576649e-08, "loss": 0.143, "mean_token_accuracy": 0.9558833320935567, "num_tokens": 13118515.0, "step": 815 }, { "epoch": 1.938534278959811, "grad_norm": 3.2653301157414907, "learning_rate": 2.3163092543130317e-08, "loss": 0.1565, "mean_token_accuracy": 0.9513460914293925, "num_tokens": 13200181.0, "step": 820 }, { "epoch": 1.950354609929078, "grad_norm": 2.2911215598352026, "learning_rate": 1.6682951782449887e-08, "loss": 0.14, "mean_token_accuracy": 0.9551817645629247, "num_tokens": 13287198.0, "step": 825 }, { "epoch": 1.962174940898345, "grad_norm": 2.528025657021651, "learning_rate": 1.1260814732422242e-08, "loss": 0.1577, "mean_token_accuracy": 0.9525305430094401, "num_tokens": 13365313.0, "step": 830 }, { "epoch": 1.9739952718676124, "grad_norm": 2.737000504326715, "learning_rate": 6.898991465022487e-09, "loss": 0.1457, "mean_token_accuracy": 0.9558167507251104, "num_tokens": 13448375.0, "step": 835 }, { "epoch": 1.9858156028368794, "grad_norm": 2.851536894418566, "learning_rate": 3.5993403113107616e-09, "loss": 0.1493, "mean_token_accuracy": 0.9524207770824432, "num_tokens": 13529509.0, "step": 840 }, { "epoch": 1.9976359338061465, "grad_norm": 2.5596855272672463, "learning_rate": 1.3632670696991922e-09, "loss": 0.1561, "mean_token_accuracy": 0.9530758639176686, "num_tokens": 13608446.0, "step": 845 }, { "epoch": 2.0, "mean_token_accuracy": 0.958593467871348, "num_tokens": 13625289.0, "step": 846, "total_flos": 38833750351872.0, "train_loss": 0.4931214354914695, "train_runtime": 8229.9073, "train_samples_per_second": 7.4, "train_steps_per_second": 0.103 } ], "logging_steps": 5, "max_steps": 846, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 38833750351872.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }