{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.597039473684211, "eval_steps": 500, "global_step": 1421, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024671052631578948, "grad_norm": 5.377997875213623, "learning_rate": 1.118881118881119e-06, "loss": 0.5476, "loss_nan_ranks": 0, "loss_rank_avg": 0.47340989112854004, "step": 5 }, { "epoch": 0.049342105263157895, "grad_norm": 5.392073154449463, "learning_rate": 2.517482517482518e-06, "loss": 0.5429, "loss_nan_ranks": 0, "loss_rank_avg": 0.48170506954193115, "step": 10 }, { "epoch": 0.07401315789473684, "grad_norm": 4.068439960479736, "learning_rate": 3.916083916083917e-06, "loss": 0.5108, "loss_nan_ranks": 0, "loss_rank_avg": 0.5030127763748169, "step": 15 }, { "epoch": 0.09868421052631579, "grad_norm": 1.50242280960083, "learning_rate": 5.314685314685315e-06, "loss": 0.4949, "loss_nan_ranks": 0, "loss_rank_avg": 0.4839881658554077, "step": 20 }, { "epoch": 0.12335526315789473, "grad_norm": 0.9494503736495972, "learning_rate": 6.713286713286714e-06, "loss": 0.4425, "loss_nan_ranks": 0, "loss_rank_avg": 0.4382355809211731, "step": 25 }, { "epoch": 0.14802631578947367, "grad_norm": 0.8043321967124939, "learning_rate": 8.111888111888112e-06, "loss": 0.4344, "loss_nan_ranks": 0, "loss_rank_avg": 0.3997962474822998, "step": 30 }, { "epoch": 0.17269736842105263, "grad_norm": 0.7665994763374329, "learning_rate": 9.510489510489511e-06, "loss": 0.4256, "loss_nan_ranks": 0, "loss_rank_avg": 0.4312514364719391, "step": 35 }, { "epoch": 0.19736842105263158, "grad_norm": 0.5361006855964661, "learning_rate": 1.0909090909090909e-05, "loss": 0.3893, "loss_nan_ranks": 0, "loss_rank_avg": 0.33488091826438904, "step": 40 }, { "epoch": 0.22203947368421054, "grad_norm": 0.4154154062271118, "learning_rate": 1.230769230769231e-05, "loss": 0.3565, "loss_nan_ranks": 0, "loss_rank_avg": 0.2907222509384155, "step": 45 }, { "epoch": 0.24671052631578946, "grad_norm": 0.38533732295036316, "learning_rate": 1.3706293706293707e-05, "loss": 0.3607, "loss_nan_ranks": 0, "loss_rank_avg": 0.38802745938301086, "step": 50 }, { "epoch": 0.2713815789473684, "grad_norm": 0.6358391642570496, "learning_rate": 1.5104895104895105e-05, "loss": 0.336, "loss_nan_ranks": 0, "loss_rank_avg": 0.35459989309310913, "step": 55 }, { "epoch": 0.29605263157894735, "grad_norm": 0.7225229740142822, "learning_rate": 1.6503496503496507e-05, "loss": 0.3261, "loss_nan_ranks": 0, "loss_rank_avg": 0.32752037048339844, "step": 60 }, { "epoch": 0.3207236842105263, "grad_norm": 0.4612922668457031, "learning_rate": 1.7902097902097903e-05, "loss": 0.3038, "loss_nan_ranks": 0, "loss_rank_avg": 0.29034003615379333, "step": 65 }, { "epoch": 0.34539473684210525, "grad_norm": 0.41292667388916016, "learning_rate": 1.9300699300699302e-05, "loss": 0.3026, "loss_nan_ranks": 0, "loss_rank_avg": 0.29919901490211487, "step": 70 }, { "epoch": 0.37006578947368424, "grad_norm": 0.3574488162994385, "learning_rate": 2.06993006993007e-05, "loss": 0.2916, "loss_nan_ranks": 0, "loss_rank_avg": 0.2877832353115082, "step": 75 }, { "epoch": 0.39473684210526316, "grad_norm": 0.4183422923088074, "learning_rate": 2.2097902097902097e-05, "loss": 0.2951, "loss_nan_ranks": 0, "loss_rank_avg": 0.2495700865983963, "step": 80 }, { "epoch": 0.4194078947368421, "grad_norm": 0.27113303542137146, "learning_rate": 2.3496503496503496e-05, "loss": 0.2694, "loss_nan_ranks": 0, "loss_rank_avg": 0.2706011235713959, "step": 85 }, { "epoch": 0.4440789473684211, "grad_norm": 0.32469940185546875, "learning_rate": 2.48951048951049e-05, "loss": 0.2666, "loss_nan_ranks": 0, "loss_rank_avg": 0.2613554000854492, "step": 90 }, { "epoch": 0.46875, "grad_norm": 0.3434113562107086, "learning_rate": 2.6293706293706294e-05, "loss": 0.2516, "loss_nan_ranks": 0, "loss_rank_avg": 0.2635467052459717, "step": 95 }, { "epoch": 0.4934210526315789, "grad_norm": 0.44404661655426025, "learning_rate": 2.7692307692307694e-05, "loss": 0.2534, "loss_nan_ranks": 0, "loss_rank_avg": 0.2347426861524582, "step": 100 }, { "epoch": 0.024671052631578948, "grad_norm": 0.3727414608001709, "learning_rate": 2.9090909090909093e-05, "loss": 0.3065, "loss_nan_ranks": 0, "loss_rank_avg": 0.25368186831474304, "step": 105 }, { "epoch": 0.049342105263157895, "grad_norm": 0.3068452477455139, "learning_rate": 3.048951048951049e-05, "loss": 0.2958, "loss_nan_ranks": 0, "loss_rank_avg": 0.2523610293865204, "step": 110 }, { "epoch": 0.07401315789473684, "grad_norm": 0.29999276995658875, "learning_rate": 3.188811188811189e-05, "loss": 0.2856, "loss_nan_ranks": 0, "loss_rank_avg": 0.27986133098602295, "step": 115 }, { "epoch": 0.09868421052631579, "grad_norm": 0.28988873958587646, "learning_rate": 3.328671328671329e-05, "loss": 0.2982, "loss_nan_ranks": 0, "loss_rank_avg": 0.30054572224617004, "step": 120 }, { "epoch": 0.12335526315789473, "grad_norm": 0.2404654622077942, "learning_rate": 3.468531468531469e-05, "loss": 0.2746, "loss_nan_ranks": 0, "loss_rank_avg": 0.27615606784820557, "step": 125 }, { "epoch": 0.14802631578947367, "grad_norm": 0.24244281649589539, "learning_rate": 3.608391608391609e-05, "loss": 0.2767, "loss_nan_ranks": 0, "loss_rank_avg": 0.25839051604270935, "step": 130 }, { "epoch": 0.17269736842105263, "grad_norm": 0.25900793075561523, "learning_rate": 3.748251748251749e-05, "loss": 0.2763, "loss_nan_ranks": 0, "loss_rank_avg": 0.2887214124202728, "step": 135 }, { "epoch": 0.19736842105263158, "grad_norm": 0.23659999668598175, "learning_rate": 3.888111888111888e-05, "loss": 0.2626, "loss_nan_ranks": 0, "loss_rank_avg": 0.23153053224086761, "step": 140 }, { "epoch": 0.22203947368421054, "grad_norm": 0.22117386758327484, "learning_rate": 3.999993957205587e-05, "loss": 0.2494, "loss_nan_ranks": 0, "loss_rank_avg": 0.20298953354358673, "step": 145 }, { "epoch": 0.24671052631578946, "grad_norm": 0.26892349123954773, "learning_rate": 3.999782463235198e-05, "loss": 0.2604, "loss_nan_ranks": 0, "loss_rank_avg": 0.28457778692245483, "step": 150 }, { "epoch": 0.2713815789473684, "grad_norm": 0.32646119594573975, "learning_rate": 3.999268866058499e-05, "loss": 0.246, "loss_nan_ranks": 0, "loss_rank_avg": 0.2623623311519623, "step": 155 }, { "epoch": 0.29605263157894735, "grad_norm": 0.32743313908576965, "learning_rate": 3.9984532432636075e-05, "loss": 0.2436, "loss_nan_ranks": 0, "loss_rank_avg": 0.25287550687789917, "step": 160 }, { "epoch": 0.3207236842105263, "grad_norm": 0.2839692234992981, "learning_rate": 3.997335718065055e-05, "loss": 0.2348, "loss_nan_ranks": 0, "loss_rank_avg": 0.22406595945358276, "step": 165 }, { "epoch": 0.34539473684210525, "grad_norm": 0.27507659792900085, "learning_rate": 3.995916459285176e-05, "loss": 0.2395, "loss_nan_ranks": 0, "loss_rank_avg": 0.24255843460559845, "step": 170 }, { "epoch": 0.37006578947368424, "grad_norm": 0.27469953894615173, "learning_rate": 3.994195681328607e-05, "loss": 0.235, "loss_nan_ranks": 0, "loss_rank_avg": 0.23514795303344727, "step": 175 }, { "epoch": 0.39473684210526316, "grad_norm": 0.30293864011764526, "learning_rate": 3.99217364414989e-05, "loss": 0.2436, "loss_nan_ranks": 0, "loss_rank_avg": 0.20414546132087708, "step": 180 }, { "epoch": 0.4194078947368421, "grad_norm": 0.22480283677577972, "learning_rate": 3.989850653214208e-05, "loss": 0.2274, "loss_nan_ranks": 0, "loss_rank_avg": 0.23089763522148132, "step": 185 }, { "epoch": 0.4440789473684211, "grad_norm": 0.2698158621788025, "learning_rate": 3.987227059451237e-05, "loss": 0.2289, "loss_nan_ranks": 0, "loss_rank_avg": 0.224493145942688, "step": 190 }, { "epoch": 0.46875, "grad_norm": 0.30755510926246643, "learning_rate": 3.984303259202129e-05, "loss": 0.2179, "loss_nan_ranks": 0, "loss_rank_avg": 0.2319842278957367, "step": 195 }, { "epoch": 0.4934210526315789, "grad_norm": 0.31990331411361694, "learning_rate": 3.9810796941596414e-05, "loss": 0.2229, "loss_nan_ranks": 0, "loss_rank_avg": 0.20815354585647583, "step": 200 }, { "epoch": 0.024671052631578948, "grad_norm": 0.2590547502040863, "learning_rate": 3.97755685130141e-05, "loss": 0.2447, "loss_nan_ranks": 0, "loss_rank_avg": 0.2057332992553711, "step": 205 }, { "epoch": 0.049342105263157895, "grad_norm": 0.2779129147529602, "learning_rate": 3.973735262816381e-05, "loss": 0.2418, "loss_nan_ranks": 0, "loss_rank_avg": 0.2053648829460144, "step": 210 }, { "epoch": 0.07401315789473684, "grad_norm": 0.2121449112892151, "learning_rate": 3.9696155060244166e-05, "loss": 0.238, "loss_nan_ranks": 0, "loss_rank_avg": 0.23251962661743164, "step": 215 }, { "epoch": 0.09868421052631579, "grad_norm": 0.2546617090702057, "learning_rate": 3.9651982032890774e-05, "loss": 0.2522, "loss_nan_ranks": 0, "loss_rank_avg": 0.25815051794052124, "step": 220 }, { "epoch": 0.12335526315789473, "grad_norm": 0.20591332018375397, "learning_rate": 3.960484021923606e-05, "loss": 0.2365, "loss_nan_ranks": 0, "loss_rank_avg": 0.23984409868717194, "step": 225 }, { "epoch": 0.14802631578947367, "grad_norm": 0.22549138963222504, "learning_rate": 3.9554736740901163e-05, "loss": 0.2417, "loss_nan_ranks": 0, "loss_rank_avg": 0.22766338288784027, "step": 230 }, { "epoch": 0.17269736842105263, "grad_norm": 0.22996200621128082, "learning_rate": 3.950167916692008e-05, "loss": 0.2442, "loss_nan_ranks": 0, "loss_rank_avg": 0.25690600275993347, "step": 235 }, { "epoch": 0.19736842105263158, "grad_norm": 0.23695601522922516, "learning_rate": 3.9445675512596224e-05, "loss": 0.2347, "loss_nan_ranks": 0, "loss_rank_avg": 0.20780086517333984, "step": 240 }, { "epoch": 0.22203947368421054, "grad_norm": 0.1984836608171463, "learning_rate": 3.938673423829159e-05, "loss": 0.2254, "loss_nan_ranks": 0, "loss_rank_avg": 0.18308115005493164, "step": 245 }, { "epoch": 0.24671052631578946, "grad_norm": 0.23254786431789398, "learning_rate": 3.932486424814865e-05, "loss": 0.2369, "loss_nan_ranks": 0, "loss_rank_avg": 0.2599466145038605, "step": 250 }, { "epoch": 0.2713815789473684, "grad_norm": 0.2990603446960449, "learning_rate": 3.92600748887452e-05, "loss": 0.2234, "loss_nan_ranks": 0, "loss_rank_avg": 0.23988397419452667, "step": 255 }, { "epoch": 0.29605263157894735, "grad_norm": 0.3279288411140442, "learning_rate": 3.9192375947682436e-05, "loss": 0.2225, "loss_nan_ranks": 0, "loss_rank_avg": 0.23210851848125458, "step": 260 }, { "epoch": 0.3207236842105263, "grad_norm": 0.28320780396461487, "learning_rate": 3.9121777652106325e-05, "loss": 0.2157, "loss_nan_ranks": 0, "loss_rank_avg": 0.20393934845924377, "step": 265 }, { "epoch": 0.34539473684210525, "grad_norm": 0.2532626986503601, "learning_rate": 3.904829066716263e-05, "loss": 0.2206, "loss_nan_ranks": 0, "loss_rank_avg": 0.22491160035133362, "step": 270 }, { "epoch": 0.37006578947368424, "grad_norm": 0.24180759489536285, "learning_rate": 3.8971926094385725e-05, "loss": 0.217, "loss_nan_ranks": 0, "loss_rank_avg": 0.21774451434612274, "step": 275 }, { "epoch": 0.39473684210526316, "grad_norm": 0.28742486238479614, "learning_rate": 3.889269547002153e-05, "loss": 0.2258, "loss_nan_ranks": 0, "loss_rank_avg": 0.18878665566444397, "step": 280 }, { "epoch": 0.4194078947368421, "grad_norm": 0.2189558744430542, "learning_rate": 3.881061076328475e-05, "loss": 0.2121, "loss_nan_ranks": 0, "loss_rank_avg": 0.2159784585237503, "step": 285 }, { "epoch": 0.4440789473684211, "grad_norm": 0.24447618424892426, "learning_rate": 3.872568437455071e-05, "loss": 0.2141, "loss_nan_ranks": 0, "loss_rank_avg": 0.20933416485786438, "step": 290 }, { "epoch": 0.46875, "grad_norm": 0.2725847661495209, "learning_rate": 3.863792913348202e-05, "loss": 0.2045, "loss_nan_ranks": 0, "loss_rank_avg": 0.21849879622459412, "step": 295 }, { "epoch": 0.4934210526315789, "grad_norm": 0.2945365011692047, "learning_rate": 3.854735829709049e-05, "loss": 0.2099, "loss_nan_ranks": 0, "loss_rank_avg": 0.1962527334690094, "step": 300 }, { "epoch": 1.024671052631579, "grad_norm": 0.2862054705619812, "learning_rate": 3.8453985547734364e-05, "loss": 0.2265, "loss_nan_ranks": 0, "loss_rank_avg": 0.19139519333839417, "step": 305 }, { "epoch": 1.049342105263158, "grad_norm": 0.2504815459251404, "learning_rate": 3.835782499105136e-05, "loss": 0.2244, "loss_nan_ranks": 0, "loss_rank_avg": 0.19045579433441162, "step": 310 }, { "epoch": 1.0740131578947367, "grad_norm": 0.22405609488487244, "learning_rate": 3.825889115382777e-05, "loss": 0.2215, "loss_nan_ranks": 0, "loss_rank_avg": 0.2161717265844345, "step": 315 }, { "epoch": 1.0986842105263157, "grad_norm": 0.23503802716732025, "learning_rate": 3.815719898180397e-05, "loss": 0.2353, "loss_nan_ranks": 0, "loss_rank_avg": 0.24189823865890503, "step": 320 }, { "epoch": 1.1233552631578947, "grad_norm": 0.19855502247810364, "learning_rate": 3.8052763837416496e-05, "loss": 0.221, "loss_nan_ranks": 0, "loss_rank_avg": 0.22518810629844666, "step": 325 }, { "epoch": 1.1480263157894737, "grad_norm": 0.22038273513317108, "learning_rate": 3.794560149747736e-05, "loss": 0.2268, "loss_nan_ranks": 0, "loss_rank_avg": 0.21357837319374084, "step": 330 }, { "epoch": 1.1726973684210527, "grad_norm": 0.2181052416563034, "learning_rate": 3.7835728150790626e-05, "loss": 0.2297, "loss_nan_ranks": 0, "loss_rank_avg": 0.24232840538024902, "step": 335 }, { "epoch": 1.1973684210526316, "grad_norm": 0.21558205783367157, "learning_rate": 3.7723160395706846e-05, "loss": 0.2213, "loss_nan_ranks": 0, "loss_rank_avg": 0.19611139595508575, "step": 340 }, { "epoch": 1.2220394736842106, "grad_norm": 0.19245308637619019, "learning_rate": 3.760791523761553e-05, "loss": 0.213, "loss_nan_ranks": 0, "loss_rank_avg": 0.17266516387462616, "step": 345 }, { "epoch": 1.2467105263157894, "grad_norm": 0.23584093153476715, "learning_rate": 3.749001008637621e-05, "loss": 0.2247, "loss_nan_ranks": 0, "loss_rank_avg": 0.24593010544776917, "step": 350 }, { "epoch": 1.2713815789473684, "grad_norm": 0.2926468253135681, "learning_rate": 3.736946275368834e-05, "loss": 0.2116, "loss_nan_ranks": 0, "loss_rank_avg": 0.22750172019004822, "step": 355 }, { "epoch": 1.2960526315789473, "grad_norm": 0.26042640209198, "learning_rate": 3.724629145040056e-05, "loss": 0.2112, "loss_nan_ranks": 0, "loss_rank_avg": 0.22006699442863464, "step": 360 }, { "epoch": 1.3207236842105263, "grad_norm": 0.26961809396743774, "learning_rate": 3.7120514783759555e-05, "loss": 0.206, "loss_nan_ranks": 0, "loss_rank_avg": 0.19352692365646362, "step": 365 }, { "epoch": 1.3453947368421053, "grad_norm": 0.25605323910713196, "learning_rate": 3.699215175459917e-05, "loss": 0.2105, "loss_nan_ranks": 0, "loss_rank_avg": 0.21544437110424042, "step": 370 }, { "epoch": 1.3700657894736843, "grad_norm": 0.28638923168182373, "learning_rate": 3.686122175446992e-05, "loss": 0.2072, "loss_nan_ranks": 0, "loss_rank_avg": 0.20685678720474243, "step": 375 }, { "epoch": 1.3947368421052633, "grad_norm": 0.26736754179000854, "learning_rate": 3.672774456270959e-05, "loss": 0.215, "loss_nan_ranks": 0, "loss_rank_avg": 0.1788450926542282, "step": 380 }, { "epoch": 1.419407894736842, "grad_norm": 0.19807708263397217, "learning_rate": 3.659174034345522e-05, "loss": 0.2027, "loss_nan_ranks": 0, "loss_rank_avg": 0.20726953446865082, "step": 385 }, { "epoch": 1.444078947368421, "grad_norm": 0.24334271252155304, "learning_rate": 3.645322964259689e-05, "loss": 0.2047, "loss_nan_ranks": 0, "loss_rank_avg": 0.1996752917766571, "step": 390 }, { "epoch": 1.46875, "grad_norm": 0.2998853623867035, "learning_rate": 3.631223338467394e-05, "loss": 0.1961, "loss_nan_ranks": 0, "loss_rank_avg": 0.20962709188461304, "step": 395 }, { "epoch": 1.493421052631579, "grad_norm": 0.23867939412593842, "learning_rate": 3.616877286971396e-05, "loss": 0.2018, "loss_nan_ranks": 0, "loss_rank_avg": 0.18793398141860962, "step": 400 }, { "epoch": 1.024671052631579, "grad_norm": 0.2809722423553467, "learning_rate": 3.6022869770014964e-05, "loss": 0.2166, "loss_nan_ranks": 0, "loss_rank_avg": 0.18363182246685028, "step": 405 }, { "epoch": 1.049342105263158, "grad_norm": 0.22357912361621857, "learning_rate": 3.587454612687148e-05, "loss": 0.2144, "loss_nan_ranks": 0, "loss_rank_avg": 0.18171587586402893, "step": 410 }, { "epoch": 1.0740131578947367, "grad_norm": 0.20701156556606293, "learning_rate": 3.5723824347244745e-05, "loss": 0.2119, "loss_nan_ranks": 0, "loss_rank_avg": 0.2063273936510086, "step": 415 }, { "epoch": 1.0986842105263157, "grad_norm": 0.23529016971588135, "learning_rate": 3.557072720037779e-05, "loss": 0.225, "loss_nan_ranks": 0, "loss_rank_avg": 0.23247161507606506, "step": 420 }, { "epoch": 1.1233552631578947, "grad_norm": 0.2022576779127121, "learning_rate": 3.541527781435568e-05, "loss": 0.2115, "loss_nan_ranks": 0, "loss_rank_avg": 0.2152596414089203, "step": 425 }, { "epoch": 1.1480263157894737, "grad_norm": 0.20009814202785492, "learning_rate": 3.525749967261164e-05, "loss": 0.2173, "loss_nan_ranks": 0, "loss_rank_avg": 0.20451240241527557, "step": 430 }, { "epoch": 1.1726973684210527, "grad_norm": 0.2168785035610199, "learning_rate": 3.509741661037945e-05, "loss": 0.2202, "loss_nan_ranks": 0, "loss_rank_avg": 0.2329305112361908, "step": 435 }, { "epoch": 1.1973684210526316, "grad_norm": 0.22165916860103607, "learning_rate": 3.493505281109269e-05, "loss": 0.2125, "loss_nan_ranks": 0, "loss_rank_avg": 0.18825937807559967, "step": 440 }, { "epoch": 1.2220394736842106, "grad_norm": 0.1891016662120819, "learning_rate": 3.477043280273139e-05, "loss": 0.2048, "loss_nan_ranks": 0, "loss_rank_avg": 0.16563668847084045, "step": 445 }, { "epoch": 1.2467105263157894, "grad_norm": 0.21851275861263275, "learning_rate": 3.460358145411669e-05, "loss": 0.2163, "loss_nan_ranks": 0, "loss_rank_avg": 0.2363748550415039, "step": 450 }, { "epoch": 1.2713815789473684, "grad_norm": 0.30250489711761475, "learning_rate": 3.4434523971153876e-05, "loss": 0.2038, "loss_nan_ranks": 0, "loss_rank_avg": 0.21931317448616028, "step": 455 }, { "epoch": 1.2960526315789473, "grad_norm": 0.25916534662246704, "learning_rate": 3.426328589302463e-05, "loss": 0.2034, "loss_nan_ranks": 0, "loss_rank_avg": 0.21159030497074127, "step": 460 }, { "epoch": 1.3207236842105263, "grad_norm": 0.28234443068504333, "learning_rate": 3.408989308832887e-05, "loss": 0.1997, "loss_nan_ranks": 0, "loss_rank_avg": 0.18619588017463684, "step": 465 }, { "epoch": 1.3453947368421053, "grad_norm": 0.25823721289634705, "learning_rate": 3.3914371751176806e-05, "loss": 0.2034, "loss_nan_ranks": 0, "loss_rank_avg": 0.20875269174575806, "step": 470 }, { "epoch": 1.3700657894736843, "grad_norm": 0.25747162103652954, "learning_rate": 3.3736748397231865e-05, "loss": 0.2001, "loss_nan_ranks": 0, "loss_rank_avg": 0.20162513852119446, "step": 475 }, { "epoch": 1.3947368421052633, "grad_norm": 0.2906784415245056, "learning_rate": 3.3557049859705026e-05, "loss": 0.208, "loss_nan_ranks": 0, "loss_rank_avg": 0.1721152365207672, "step": 480 }, { "epoch": 1.419407894736842, "grad_norm": 0.19228465855121613, "learning_rate": 3.3375303285301175e-05, "loss": 0.1964, "loss_nan_ranks": 0, "loss_rank_avg": 0.2005109190940857, "step": 485 }, { "epoch": 1.444078947368421, "grad_norm": 0.22863629460334778, "learning_rate": 3.31915361301181e-05, "loss": 0.1981, "loss_nan_ranks": 0, "loss_rank_avg": 0.19252851605415344, "step": 490 }, { "epoch": 1.46875, "grad_norm": 0.26857441663742065, "learning_rate": 3.300577615549874e-05, "loss": 0.1899, "loss_nan_ranks": 0, "loss_rank_avg": 0.2054862082004547, "step": 495 }, { "epoch": 1.493421052631579, "grad_norm": 0.23823519051074982, "learning_rate": 3.281805142383738e-05, "loss": 0.1957, "loss_nan_ranks": 0, "loss_rank_avg": 0.1814895123243332, "step": 500 }, { "epoch": 2.0246710526315788, "grad_norm": 0.29881370067596436, "learning_rate": 3.262839029434026e-05, "loss": 0.2102, "loss_nan_ranks": 0, "loss_rank_avg": 0.1785247027873993, "step": 505 }, { "epoch": 2.049342105263158, "grad_norm": 0.24525128304958344, "learning_rate": 3.243682141874147e-05, "loss": 0.2074, "loss_nan_ranks": 0, "loss_rank_avg": 0.17585539817810059, "step": 510 }, { "epoch": 2.0740131578947367, "grad_norm": 0.20698322355747223, "learning_rate": 3.2243373736974524e-05, "loss": 0.2051, "loss_nan_ranks": 0, "loss_rank_avg": 0.1997726708650589, "step": 515 }, { "epoch": 2.098684210526316, "grad_norm": 0.22929632663726807, "learning_rate": 3.204807647280049e-05, "loss": 0.2176, "loss_nan_ranks": 0, "loss_rank_avg": 0.22522705793380737, "step": 520 }, { "epoch": 2.1233552631578947, "grad_norm": 0.18781138956546783, "learning_rate": 3.185095912939324e-05, "loss": 0.2046, "loss_nan_ranks": 0, "loss_rank_avg": 0.20897895097732544, "step": 525 }, { "epoch": 2.1480263157894735, "grad_norm": 0.2008383721113205, "learning_rate": 3.165205148488242e-05, "loss": 0.2105, "loss_nan_ranks": 0, "loss_rank_avg": 0.19787193834781647, "step": 530 }, { "epoch": 2.1726973684210527, "grad_norm": 0.21053168177604675, "learning_rate": 3.145138358785494e-05, "loss": 0.2131, "loss_nan_ranks": 0, "loss_rank_avg": 0.2260877788066864, "step": 535 }, { "epoch": 2.1973684210526314, "grad_norm": 0.2918646037578583, "learning_rate": 3.124898575281562e-05, "loss": 0.2057, "loss_nan_ranks": 0, "loss_rank_avg": 0.1822304129600525, "step": 540 }, { "epoch": 2.2220394736842106, "grad_norm": 0.18496881425380707, "learning_rate": 3.1044888555607594e-05, "loss": 0.1985, "loss_nan_ranks": 0, "loss_rank_avg": 0.16007639467716217, "step": 545 }, { "epoch": 2.2467105263157894, "grad_norm": 0.2168571650981903, "learning_rate": 3.0839122828793314e-05, "loss": 0.2098, "loss_nan_ranks": 0, "loss_rank_avg": 0.22958366572856903, "step": 550 }, { "epoch": 2.2713815789473686, "grad_norm": 0.29614129662513733, "learning_rate": 3.0631719656996707e-05, "loss": 0.1979, "loss_nan_ranks": 0, "loss_rank_avg": 0.21339645981788635, "step": 555 }, { "epoch": 2.2960526315789473, "grad_norm": 0.2810326814651489, "learning_rate": 3.042271037220731e-05, "loss": 0.1972, "loss_nan_ranks": 0, "loss_rank_avg": 0.20447814464569092, "step": 560 }, { "epoch": 2.3207236842105265, "grad_norm": 0.2408124804496765, "learning_rate": 3.0212126549046986e-05, "loss": 0.1923, "loss_nan_ranks": 0, "loss_rank_avg": 0.17865577340126038, "step": 565 }, { "epoch": 2.3453947368421053, "grad_norm": 0.2610262334346771, "learning_rate": 3.0000000000000004e-05, "loss": 0.1969, "loss_nan_ranks": 0, "loss_rank_avg": 0.20306739211082458, "step": 570 }, { "epoch": 2.370065789473684, "grad_norm": 0.4123166501522064, "learning_rate": 2.978636277060722e-05, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.2014990746974945, "step": 575 }, { "epoch": 2.3947368421052633, "grad_norm": 0.277736634016037, "learning_rate": 2.9571247134624985e-05, "loss": 0.2033, "loss_nan_ranks": 0, "loss_rank_avg": 0.16646766662597656, "step": 580 }, { "epoch": 2.419407894736842, "grad_norm": 0.2018052637577057, "learning_rate": 2.9354685589149637e-05, "loss": 0.1911, "loss_nan_ranks": 0, "loss_rank_avg": 0.19513532519340515, "step": 585 }, { "epoch": 2.4440789473684212, "grad_norm": 0.24421605467796326, "learning_rate": 2.9136710849708225e-05, "loss": 0.1925, "loss_nan_ranks": 0, "loss_rank_avg": 0.18745818734169006, "step": 590 }, { "epoch": 2.46875, "grad_norm": 0.26889941096305847, "learning_rate": 2.8917355845316214e-05, "loss": 0.1844, "loss_nan_ranks": 0, "loss_rank_avg": 0.20007860660552979, "step": 595 }, { "epoch": 2.4934210526315788, "grad_norm": 0.22898580133914948, "learning_rate": 2.869665371350299e-05, "loss": 0.1907, "loss_nan_ranks": 0, "loss_rank_avg": 0.17558446526527405, "step": 600 }, { "epoch": 2.0246710526315788, "grad_norm": 0.29629117250442505, "learning_rate": 2.8474637795305842e-05, "loss": 0.2053, "loss_nan_ranks": 0, "loss_rank_avg": 0.1745256781578064, "step": 605 }, { "epoch": 2.049342105263158, "grad_norm": 0.22614240646362305, "learning_rate": 2.825134163023318e-05, "loss": 0.2023, "loss_nan_ranks": 0, "loss_rank_avg": 0.17126205563545227, "step": 610 }, { "epoch": 2.0740131578947367, "grad_norm": 0.20102806389331818, "learning_rate": 2.802679895119778e-05, "loss": 0.2, "loss_nan_ranks": 0, "loss_rank_avg": 0.19442611932754517, "step": 615 }, { "epoch": 2.098684210526316, "grad_norm": 0.2268202155828476, "learning_rate": 2.7801043679420856e-05, "loss": 0.2119, "loss_nan_ranks": 0, "loss_rank_avg": 0.21963992714881897, "step": 620 }, { "epoch": 2.1233552631578947, "grad_norm": 0.1838330328464508, "learning_rate": 2.75741099193076e-05, "loss": 0.1992, "loss_nan_ranks": 0, "loss_rank_avg": 0.20316563546657562, "step": 625 }, { "epoch": 2.1480263157894735, "grad_norm": 0.20691487193107605, "learning_rate": 2.734603195329514e-05, "loss": 0.205, "loss_nan_ranks": 0, "loss_rank_avg": 0.1925206482410431, "step": 630 }, { "epoch": 2.1726973684210527, "grad_norm": 0.21233633160591125, "learning_rate": 2.711684423667353e-05, "loss": 0.2073, "loss_nan_ranks": 0, "loss_rank_avg": 0.22030431032180786, "step": 635 }, { "epoch": 2.1973684210526314, "grad_norm": 0.2087518721818924, "learning_rate": 2.688658139238067e-05, "loss": 0.2004, "loss_nan_ranks": 0, "loss_rank_avg": 0.17837481200695038, "step": 640 }, { "epoch": 2.2220394736842106, "grad_norm": 0.18232280015945435, "learning_rate": 2.6655278205771877e-05, "loss": 0.1934, "loss_nan_ranks": 0, "loss_rank_avg": 0.15586256980895996, "step": 645 }, { "epoch": 2.2467105263157894, "grad_norm": 0.2109103947877884, "learning_rate": 2.6422969619364965e-05, "loss": 0.2045, "loss_nan_ranks": 0, "loss_rank_avg": 0.223219633102417, "step": 650 }, { "epoch": 2.2713815789473686, "grad_norm": 0.27833494544029236, "learning_rate": 2.6189690727561478e-05, "loss": 0.1929, "loss_nan_ranks": 0, "loss_rank_avg": 0.20833665132522583, "step": 655 }, { "epoch": 2.2960526315789473, "grad_norm": 0.2569701075553894, "learning_rate": 2.5955476771345116e-05, "loss": 0.1928, "loss_nan_ranks": 0, "loss_rank_avg": 0.2008177787065506, "step": 660 }, { "epoch": 2.3207236842105265, "grad_norm": 0.24047020077705383, "learning_rate": 2.5720363132957915e-05, "loss": 0.1874, "loss_nan_ranks": 0, "loss_rank_avg": 0.1724005788564682, "step": 665 }, { "epoch": 2.3453947368421053, "grad_norm": 0.2805600166320801, "learning_rate": 2.5484385330555138e-05, "loss": 0.1915, "loss_nan_ranks": 0, "loss_rank_avg": 0.19802047312259674, "step": 670 }, { "epoch": 2.370065789473684, "grad_norm": 0.3280099332332611, "learning_rate": 2.5247579012839584e-05, "loss": 0.1895, "loss_nan_ranks": 0, "loss_rank_avg": 0.19793689250946045, "step": 675 }, { "epoch": 2.3947368421052633, "grad_norm": 0.27804508805274963, "learning_rate": 2.500997995367626e-05, "loss": 0.2007, "loss_nan_ranks": 0, "loss_rank_avg": 0.16160696744918823, "step": 680 }, { "epoch": 2.419407894736842, "grad_norm": 0.2003459334373474, "learning_rate": 2.4771624046688043e-05, "loss": 0.1865, "loss_nan_ranks": 0, "loss_rank_avg": 0.19053032994270325, "step": 685 }, { "epoch": 2.4440789473684212, "grad_norm": 0.2516365945339203, "learning_rate": 2.4532547299833337e-05, "loss": 0.1876, "loss_nan_ranks": 0, "loss_rank_avg": 0.1822153627872467, "step": 690 }, { "epoch": 2.46875, "grad_norm": 0.2790171205997467, "learning_rate": 2.4292785829966407e-05, "loss": 0.1798, "loss_nan_ranks": 0, "loss_rank_avg": 0.19594642519950867, "step": 695 }, { "epoch": 2.4934210526315788, "grad_norm": 0.2231920063495636, "learning_rate": 2.405237585738126e-05, "loss": 0.1859, "loss_nan_ranks": 0, "loss_rank_avg": 0.1701628416776657, "step": 700 }, { "epoch": 3.0246710526315788, "grad_norm": 0.28956133127212524, "learning_rate": 2.381135370033996e-05, "loss": 0.2012, "loss_nan_ranks": 0, "loss_rank_avg": 0.1710219532251358, "step": 705 }, { "epoch": 3.049342105263158, "grad_norm": 0.23850497603416443, "learning_rate": 2.356975576958606e-05, "loss": 0.1977, "loss_nan_ranks": 0, "loss_rank_avg": 0.16769038140773773, "step": 710 }, { "epoch": 3.0740131578947367, "grad_norm": 0.19639335572719574, "learning_rate": 2.3327618562844116e-05, "loss": 0.1954, "loss_nan_ranks": 0, "loss_rank_avg": 0.18999743461608887, "step": 715 }, { "epoch": 3.098684210526316, "grad_norm": 0.22467830777168274, "learning_rate": 2.3084978659306048e-05, "loss": 0.2069, "loss_nan_ranks": 0, "loss_rank_avg": 0.21505983173847198, "step": 720 }, { "epoch": 3.1233552631578947, "grad_norm": 0.19431814551353455, "learning_rate": 2.2841872714105196e-05, "loss": 0.1944, "loss_nan_ranks": 0, "loss_rank_avg": 0.19865034520626068, "step": 725 }, { "epoch": 3.1480263157894735, "grad_norm": 0.2027537077665329, "learning_rate": 2.25983374527789e-05, "loss": 0.2002, "loss_nan_ranks": 0, "loss_rank_avg": 0.18814553320407867, "step": 730 }, { "epoch": 3.1726973684210527, "grad_norm": 0.21179619431495667, "learning_rate": 2.2354409665720427e-05, "loss": 0.2024, "loss_nan_ranks": 0, "loss_rank_avg": 0.21532279253005981, "step": 735 }, { "epoch": 3.1973684210526314, "grad_norm": 0.20413529872894287, "learning_rate": 2.2110126202621162e-05, "loss": 0.1957, "loss_nan_ranks": 0, "loss_rank_avg": 0.17406898736953735, "step": 740 }, { "epoch": 3.2220394736842106, "grad_norm": 0.19533619284629822, "learning_rate": 2.1865523966903758e-05, "loss": 0.1889, "loss_nan_ranks": 0, "loss_rank_avg": 0.15199615061283112, "step": 745 }, { "epoch": 3.2467105263157894, "grad_norm": 0.212092787027359, "learning_rate": 2.16206399101472e-05, "loss": 0.1998, "loss_nan_ranks": 0, "loss_rank_avg": 0.21798661351203918, "step": 750 }, { "epoch": 3.2713815789473686, "grad_norm": 0.2671966552734375, "learning_rate": 2.1375511026504653e-05, "loss": 0.1885, "loss_nan_ranks": 0, "loss_rank_avg": 0.20311211049556732, "step": 755 }, { "epoch": 3.2960526315789473, "grad_norm": 0.24363179504871368, "learning_rate": 2.113017434711479e-05, "loss": 0.1875, "loss_nan_ranks": 0, "loss_rank_avg": 0.1942652463912964, "step": 760 }, { "epoch": 3.3207236842105265, "grad_norm": 0.22911496460437775, "learning_rate": 2.088466693450758e-05, "loss": 0.1823, "loss_nan_ranks": 0, "loss_rank_avg": 0.1669602394104004, "step": 765 }, { "epoch": 3.3453947368421053, "grad_norm": 0.24698033928871155, "learning_rate": 2.0639025877005308e-05, "loss": 0.1863, "loss_nan_ranks": 0, "loss_rank_avg": 0.19326898455619812, "step": 770 }, { "epoch": 3.370065789473684, "grad_norm": 0.5782654881477356, "learning_rate": 2.039328828311976e-05, "loss": 0.1854, "loss_nan_ranks": 0, "loss_rank_avg": 0.20133379101753235, "step": 775 }, { "epoch": 3.3947368421052633, "grad_norm": 0.29828956723213196, "learning_rate": 2.014749127594625e-05, "loss": 0.1983, "loss_nan_ranks": 0, "loss_rank_avg": 0.1581995040178299, "step": 780 }, { "epoch": 3.419407894736842, "grad_norm": 0.20889180898666382, "learning_rate": 1.9901671987555568e-05, "loss": 0.1828, "loss_nan_ranks": 0, "loss_rank_avg": 0.18696996569633484, "step": 785 }, { "epoch": 3.4440789473684212, "grad_norm": 0.24528741836547852, "learning_rate": 1.9655867553384472e-05, "loss": 0.1834, "loss_nan_ranks": 0, "loss_rank_avg": 0.17848661541938782, "step": 790 }, { "epoch": 3.46875, "grad_norm": 0.24264982342720032, "learning_rate": 1.9410115106625714e-05, "loss": 0.1754, "loss_nan_ranks": 0, "loss_rank_avg": 0.18969368934631348, "step": 795 }, { "epoch": 3.4934210526315788, "grad_norm": 0.2114223688840866, "learning_rate": 1.9164451772618435e-05, "loss": 0.1812, "loss_nan_ranks": 0, "loss_rank_avg": 0.16628679633140564, "step": 800 }, { "epoch": 3.0246710526315788, "grad_norm": 0.308517187833786, "learning_rate": 1.891891466323966e-05, "loss": 0.1979, "loss_nan_ranks": 0, "loss_rank_avg": 0.16870887577533722, "step": 805 }, { "epoch": 3.049342105263158, "grad_norm": 0.24148114025592804, "learning_rate": 1.8673540871297927e-05, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.16438522934913635, "step": 810 }, { "epoch": 3.0740131578947367, "grad_norm": 0.1998043656349182, "learning_rate": 1.842836746492971e-05, "loss": 0.1916, "loss_nan_ranks": 0, "loss_rank_avg": 0.18565362691879272, "step": 815 }, { "epoch": 3.098684210526316, "grad_norm": 0.22963786125183105, "learning_rate": 1.8183431481999658e-05, "loss": 0.2026, "loss_nan_ranks": 0, "loss_rank_avg": 0.21126440167427063, "step": 820 }, { "epoch": 3.1233552631578947, "grad_norm": 0.1848352700471878, "learning_rate": 1.793876992450529e-05, "loss": 0.1905, "loss_nan_ranks": 0, "loss_rank_avg": 0.1943422555923462, "step": 825 }, { "epoch": 3.1480263157894735, "grad_norm": 0.20099779963493347, "learning_rate": 1.769441975298726e-05, "loss": 0.1958, "loss_nan_ranks": 0, "loss_rank_avg": 0.1838373839855194, "step": 830 }, { "epoch": 3.1726973684210527, "grad_norm": 0.23807059228420258, "learning_rate": 1.7450417880945705e-05, "loss": 0.198, "loss_nan_ranks": 0, "loss_rank_avg": 0.21098263561725616, "step": 835 }, { "epoch": 3.1973684210526314, "grad_norm": 0.2139764130115509, "learning_rate": 1.720680116926388e-05, "loss": 0.1914, "loss_nan_ranks": 0, "loss_rank_avg": 0.17009641230106354, "step": 840 }, { "epoch": 3.2220394736842106, "grad_norm": 0.18776723742485046, "learning_rate": 1.6963606420639602e-05, "loss": 0.185, "loss_nan_ranks": 0, "loss_rank_avg": 0.14875054359436035, "step": 845 }, { "epoch": 3.2467105263157894, "grad_norm": 0.21373049914836884, "learning_rate": 1.6720870374025578e-05, "loss": 0.1957, "loss_nan_ranks": 0, "loss_rank_avg": 0.212870791554451, "step": 850 }, { "epoch": 3.2713815789473686, "grad_norm": 0.2705714702606201, "learning_rate": 1.6478629699079278e-05, "loss": 0.1846, "loss_nan_ranks": 0, "loss_rank_avg": 0.19945034384727478, "step": 855 }, { "epoch": 3.2960526315789473, "grad_norm": 0.25683969259262085, "learning_rate": 1.6236920990623374e-05, "loss": 0.1833, "loss_nan_ranks": 0, "loss_rank_avg": 0.19059477746486664, "step": 860 }, { "epoch": 3.3207236842105265, "grad_norm": 0.2478659451007843, "learning_rate": 1.5995780763117382e-05, "loss": 0.1781, "loss_nan_ranks": 0, "loss_rank_avg": 0.16162365674972534, "step": 865 }, { "epoch": 3.3453947368421053, "grad_norm": 0.24454790353775024, "learning_rate": 1.5755245445141544e-05, "loss": 0.182, "loss_nan_ranks": 0, "loss_rank_avg": 0.1889064908027649, "step": 870 }, { "epoch": 3.370065789473684, "grad_norm": 0.32232385873794556, "learning_rate": 1.5515351373893573e-05, "loss": 0.1804, "loss_nan_ranks": 0, "loss_rank_avg": 0.19105729460716248, "step": 875 }, { "epoch": 3.3947368421052633, "grad_norm": 0.2774730324745178, "learning_rate": 1.5276134789699344e-05, "loss": 0.1936, "loss_nan_ranks": 0, "loss_rank_avg": 0.15349537134170532, "step": 880 }, { "epoch": 3.419407894736842, "grad_norm": 0.19822268187999725, "learning_rate": 1.503763183053805e-05, "loss": 0.1787, "loss_nan_ranks": 0, "loss_rank_avg": 0.18199126422405243, "step": 885 }, { "epoch": 3.4440789473684212, "grad_norm": 0.2223716378211975, "learning_rate": 1.4799878526582987e-05, "loss": 0.1788, "loss_nan_ranks": 0, "loss_rank_avg": 0.17311102151870728, "step": 890 }, { "epoch": 3.46875, "grad_norm": 0.2528238594532013, "learning_rate": 1.4562910794758488e-05, "loss": 0.171, "loss_nan_ranks": 0, "loss_rank_avg": 0.1867274045944214, "step": 895 }, { "epoch": 3.4934210526315788, "grad_norm": 0.21180278062820435, "learning_rate": 1.4326764433314066e-05, "loss": 0.1771, "loss_nan_ranks": 0, "loss_rank_avg": 0.16158296167850494, "step": 900 }, { "epoch": 4.024671052631579, "grad_norm": 0.2938304543495178, "learning_rate": 1.4091475116416415e-05, "loss": 0.195, "loss_nan_ranks": 0, "loss_rank_avg": 0.1667296290397644, "step": 905 }, { "epoch": 4.0493421052631575, "grad_norm": 0.24434833228588104, "learning_rate": 1.3857078388760203e-05, "loss": 0.1909, "loss_nan_ranks": 0, "loss_rank_avg": 0.16151553392410278, "step": 910 }, { "epoch": 4.074013157894737, "grad_norm": 0.19539859890937805, "learning_rate": 1.3623609660198373e-05, "loss": 0.188, "loss_nan_ranks": 0, "loss_rank_avg": 0.18197977542877197, "step": 915 }, { "epoch": 4.098684210526316, "grad_norm": 0.23001670837402344, "learning_rate": 1.3391104200392905e-05, "loss": 0.1987, "loss_nan_ranks": 0, "loss_rank_avg": 0.2076358199119568, "step": 920 }, { "epoch": 4.123355263157895, "grad_norm": 0.18925786018371582, "learning_rate": 1.3159597133486628e-05, "loss": 0.1865, "loss_nan_ranks": 0, "loss_rank_avg": 0.19061236083507538, "step": 925 }, { "epoch": 4.1480263157894735, "grad_norm": 0.20448650419712067, "learning_rate": 1.292912343279713e-05, "loss": 0.1918, "loss_nan_ranks": 0, "loss_rank_avg": 0.18021315336227417, "step": 930 }, { "epoch": 4.172697368421052, "grad_norm": 0.2233712524175644, "learning_rate": 1.2699717915533402e-05, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.20732900500297546, "step": 935 }, { "epoch": 4.197368421052632, "grad_norm": 0.20618166029453278, "learning_rate": 1.2471415237536065e-05, "loss": 0.1874, "loss_nan_ranks": 0, "loss_rank_avg": 0.16642165184020996, "step": 940 }, { "epoch": 4.222039473684211, "grad_norm": 0.19250109791755676, "learning_rate": 1.2244249888041955e-05, "loss": 0.1813, "loss_nan_ranks": 0, "loss_rank_avg": 0.1461605280637741, "step": 945 }, { "epoch": 4.246710526315789, "grad_norm": 0.20326243340969086, "learning_rate": 1.2018256184473967e-05, "loss": 0.1919, "loss_nan_ranks": 0, "loss_rank_avg": 0.20845885574817657, "step": 950 }, { "epoch": 4.271381578947368, "grad_norm": 0.28202906250953674, "learning_rate": 1.1793468267256709e-05, "loss": 0.1804, "loss_nan_ranks": 0, "loss_rank_avg": 0.19517214596271515, "step": 955 }, { "epoch": 4.296052631578947, "grad_norm": 0.23696176707744598, "learning_rate": 1.156992009465904e-05, "loss": 0.1788, "loss_nan_ranks": 0, "loss_rank_avg": 0.18528538942337036, "step": 960 }, { "epoch": 4.3207236842105265, "grad_norm": 0.2820415794849396, "learning_rate": 1.1347645437664032e-05, "loss": 0.1738, "loss_nan_ranks": 0, "loss_rank_avg": 0.15721410512924194, "step": 965 }, { "epoch": 4.345394736842105, "grad_norm": 0.2320161908864975, "learning_rate": 1.1126677874867245e-05, "loss": 0.1776, "loss_nan_ranks": 0, "loss_rank_avg": 0.1845950335264206, "step": 970 }, { "epoch": 4.370065789473684, "grad_norm": 0.33977606892585754, "learning_rate": 1.0907050787404105e-05, "loss": 0.1757, "loss_nan_ranks": 0, "loss_rank_avg": 0.18589606881141663, "step": 975 }, { "epoch": 4.394736842105263, "grad_norm": 0.2689703404903412, "learning_rate": 1.0688797353907052e-05, "loss": 0.1882, "loss_nan_ranks": 0, "loss_rank_avg": 0.14957153797149658, "step": 980 }, { "epoch": 4.4194078947368425, "grad_norm": 0.2037592977285385, "learning_rate": 1.0471950545493328e-05, "loss": 0.1753, "loss_nan_ranks": 0, "loss_rank_avg": 0.17800632119178772, "step": 985 }, { "epoch": 4.444078947368421, "grad_norm": 0.2365749329328537, "learning_rate": 1.0256543120784074e-05, "loss": 0.1746, "loss_nan_ranks": 0, "loss_rank_avg": 0.16894236207008362, "step": 990 }, { "epoch": 4.46875, "grad_norm": 0.2567404806613922, "learning_rate": 1.0042607620955592e-05, "loss": 0.1669, "loss_nan_ranks": 0, "loss_rank_avg": 0.18090274930000305, "step": 995 }, { "epoch": 4.493421052631579, "grad_norm": 0.2108602672815323, "learning_rate": 9.830176364823349e-06, "loss": 0.1729, "loss_nan_ranks": 0, "loss_rank_avg": 0.15751853585243225, "step": 1000 }, { "epoch": 4.024671052631579, "grad_norm": 0.30531227588653564, "learning_rate": 9.619281443959711e-06, "loss": 0.1925, "loss_nan_ranks": 0, "loss_rank_avg": 0.16471046209335327, "step": 1005 }, { "epoch": 4.0493421052631575, "grad_norm": 0.2457159012556076, "learning_rate": 9.409954717845861e-06, "loss": 0.188, "loss_nan_ranks": 0, "loss_rank_avg": 0.15918654203414917, "step": 1010 }, { "epoch": 4.074013157894737, "grad_norm": 0.2185594141483307, "learning_rate": 9.202227809058912e-06, "loss": 0.1848, "loss_nan_ranks": 0, "loss_rank_avg": 0.17858435213565826, "step": 1015 }, { "epoch": 4.098684210526316, "grad_norm": 0.31074318289756775, "learning_rate": 8.996132098494688e-06, "loss": 0.1951, "loss_nan_ranks": 0, "loss_rank_avg": 0.20442083477973938, "step": 1020 }, { "epoch": 4.123355263157895, "grad_norm": 0.1880805939435959, "learning_rate": 8.791698720627138e-06, "loss": 0.183, "loss_nan_ranks": 0, "loss_rank_avg": 0.18682318925857544, "step": 1025 }, { "epoch": 4.1480263157894735, "grad_norm": 0.1928797960281372, "learning_rate": 8.58895855880484e-06, "loss": 0.1881, "loss_nan_ranks": 0, "loss_rank_avg": 0.17656448483467102, "step": 1030 }, { "epoch": 4.172697368421052, "grad_norm": 0.22544586658477783, "learning_rate": 8.387942240585587e-06, "loss": 0.1905, "loss_nan_ranks": 0, "loss_rank_avg": 0.20401433110237122, "step": 1035 }, { "epoch": 4.197368421052632, "grad_norm": 0.20387957990169525, "learning_rate": 8.188680133109485e-06, "loss": 0.1838, "loss_nan_ranks": 0, "loss_rank_avg": 0.16293783485889435, "step": 1040 }, { "epoch": 4.222039473684211, "grad_norm": 0.22418615221977234, "learning_rate": 7.991202338511477e-06, "loss": 0.1779, "loss_nan_ranks": 0, "loss_rank_avg": 0.14349329471588135, "step": 1045 }, { "epoch": 4.246710526315789, "grad_norm": 0.21125830709934235, "learning_rate": 7.795538689373859e-06, "loss": 0.1881, "loss_nan_ranks": 0, "loss_rank_avg": 0.20435020327568054, "step": 1050 }, { "epoch": 4.271381578947368, "grad_norm": 0.256409227848053, "learning_rate": 7.601718744219555e-06, "loss": 0.1768, "loss_nan_ranks": 0, "loss_rank_avg": 0.19160351157188416, "step": 1055 }, { "epoch": 4.296052631578947, "grad_norm": 0.23899729549884796, "learning_rate": 7.409771783046733e-06, "loss": 0.1747, "loss_nan_ranks": 0, "loss_rank_avg": 0.18137173354625702, "step": 1060 }, { "epoch": 4.3207236842105265, "grad_norm": 0.2511383295059204, "learning_rate": 7.219726802905573e-06, "loss": 0.17, "loss_nan_ranks": 0, "loss_rank_avg": 0.15333089232444763, "step": 1065 }, { "epoch": 4.345394736842105, "grad_norm": 0.24497000873088837, "learning_rate": 7.0316125135176935e-06, "loss": 0.1735, "loss_nan_ranks": 0, "loss_rank_avg": 0.17997264862060547, "step": 1070 }, { "epoch": 4.370065789473684, "grad_norm": 0.2674829661846161, "learning_rate": 6.845457332939083e-06, "loss": 0.1717, "loss_nan_ranks": 0, "loss_rank_avg": 0.18120409548282623, "step": 1075 }, { "epoch": 4.394736842105263, "grad_norm": 0.253813773393631, "learning_rate": 6.661289383266984e-06, "loss": 0.1848, "loss_nan_ranks": 0, "loss_rank_avg": 0.14498010277748108, "step": 1080 }, { "epoch": 4.4194078947368425, "grad_norm": 0.20171727240085602, "learning_rate": 6.479136486391599e-06, "loss": 0.1709, "loss_nan_ranks": 0, "loss_rank_avg": 0.1741921305656433, "step": 1085 }, { "epoch": 4.444078947368421, "grad_norm": 0.265755832195282, "learning_rate": 6.299026159793042e-06, "loss": 0.1704, "loss_nan_ranks": 0, "loss_rank_avg": 0.16434627771377563, "step": 1090 }, { "epoch": 4.46875, "grad_norm": 0.24092736840248108, "learning_rate": 6.120985612384369e-06, "loss": 0.1636, "loss_nan_ranks": 0, "loss_rank_avg": 0.17743858695030212, "step": 1095 }, { "epoch": 4.493421052631579, "grad_norm": 0.22309978306293488, "learning_rate": 5.945041740401147e-06, "loss": 0.1691, "loss_nan_ranks": 0, "loss_rank_avg": 0.1535460352897644, "step": 1100 }, { "epoch": 5.024671052631579, "grad_norm": 0.296979695558548, "learning_rate": 5.7712211233383104e-06, "loss": 0.1901, "loss_nan_ranks": 0, "loss_rank_avg": 0.16269783675670624, "step": 1105 }, { "epoch": 5.0493421052631575, "grad_norm": 0.2464631199836731, "learning_rate": 5.5995500199348565e-06, "loss": 0.1852, "loss_nan_ranks": 0, "loss_rank_avg": 0.15707314014434814, "step": 1110 }, { "epoch": 5.074013157894737, "grad_norm": 0.19684258103370667, "learning_rate": 5.430054364206965e-06, "loss": 0.182, "loss_nan_ranks": 0, "loss_rank_avg": 0.17553085088729858, "step": 1115 }, { "epoch": 5.098684210526316, "grad_norm": 0.2454098016023636, "learning_rate": 5.262759761530214e-06, "loss": 0.1921, "loss_nan_ranks": 0, "loss_rank_avg": 0.20213395357131958, "step": 1120 }, { "epoch": 5.123355263157895, "grad_norm": 0.19603319466114044, "learning_rate": 5.097691484771434e-06, "loss": 0.1797, "loss_nan_ranks": 0, "loss_rank_avg": 0.1835222691297531, "step": 1125 }, { "epoch": 5.1480263157894735, "grad_norm": 0.19945049285888672, "learning_rate": 4.934874470470756e-06, "loss": 0.1847, "loss_nan_ranks": 0, "loss_rank_avg": 0.17295250296592712, "step": 1130 }, { "epoch": 5.172697368421052, "grad_norm": 0.23703482747077942, "learning_rate": 4.77433331507454e-06, "loss": 0.187, "loss_nan_ranks": 0, "loss_rank_avg": 0.2007107436656952, "step": 1135 }, { "epoch": 5.197368421052632, "grad_norm": 0.19850043952465057, "learning_rate": 4.6160922712195875e-06, "loss": 0.1803, "loss_nan_ranks": 0, "loss_rank_avg": 0.15952754020690918, "step": 1140 }, { "epoch": 5.222039473684211, "grad_norm": 0.19503454864025116, "learning_rate": 4.460175244069395e-06, "loss": 0.1748, "loss_nan_ranks": 0, "loss_rank_avg": 0.14143893122673035, "step": 1145 }, { "epoch": 5.246710526315789, "grad_norm": 0.21830520033836365, "learning_rate": 4.306605787702802e-06, "loss": 0.1846, "loss_nan_ranks": 0, "loss_rank_avg": 0.20023545622825623, "step": 1150 }, { "epoch": 5.271381578947368, "grad_norm": 0.2543354332447052, "learning_rate": 4.155407101555764e-06, "loss": 0.1731, "loss_nan_ranks": 0, "loss_rank_avg": 0.18734650313854218, "step": 1155 }, { "epoch": 5.296052631578947, "grad_norm": 0.2577175796031952, "learning_rate": 4.006602026916617e-06, "loss": 0.1708, "loss_nan_ranks": 0, "loss_rank_avg": 0.17666634917259216, "step": 1160 }, { "epoch": 5.3207236842105265, "grad_norm": 0.2335437387228012, "learning_rate": 3.860213043475531e-06, "loss": 0.1663, "loss_nan_ranks": 0, "loss_rank_avg": 0.1490708291530609, "step": 1165 }, { "epoch": 5.345394736842105, "grad_norm": 0.23638561367988586, "learning_rate": 3.7162622659285185e-06, "loss": 0.1694, "loss_nan_ranks": 0, "loss_rank_avg": 0.1765957921743393, "step": 1170 }, { "epoch": 5.370065789473684, "grad_norm": 0.3142367899417877, "learning_rate": 3.5747714406366154e-06, "loss": 0.1677, "loss_nan_ranks": 0, "loss_rank_avg": 0.17734456062316895, "step": 1175 }, { "epoch": 5.394736842105263, "grad_norm": 0.2611706852912903, "learning_rate": 3.435761942340705e-06, "loss": 0.1807, "loss_nan_ranks": 0, "loss_rank_avg": 0.14173588156700134, "step": 1180 }, { "epoch": 5.4194078947368425, "grad_norm": 0.20892195403575897, "learning_rate": 3.2992547709324964e-06, "loss": 0.168, "loss_nan_ranks": 0, "loss_rank_avg": 0.17047154903411865, "step": 1185 }, { "epoch": 5.444078947368421, "grad_norm": 0.24029605090618134, "learning_rate": 3.1652705482820665e-06, "loss": 0.167, "loss_nan_ranks": 0, "loss_rank_avg": 0.16115230321884155, "step": 1190 }, { "epoch": 5.46875, "grad_norm": 0.2471027821302414, "learning_rate": 3.033829515122608e-06, "loss": 0.1598, "loss_nan_ranks": 0, "loss_rank_avg": 0.1730382740497589, "step": 1195 }, { "epoch": 5.493421052631579, "grad_norm": 0.22252005338668823, "learning_rate": 2.904951527992652e-06, "loss": 0.1656, "loss_nan_ranks": 0, "loss_rank_avg": 0.1501745879650116, "step": 1200 }, { "epoch": 5.024671052631579, "grad_norm": 0.2984672784805298, "learning_rate": 2.7786560562364285e-06, "loss": 0.1878, "loss_nan_ranks": 0, "loss_rank_avg": 0.16104725003242493, "step": 1205 }, { "epoch": 5.0493421052631575, "grad_norm": 0.24322442710399628, "learning_rate": 2.6549621790626166e-06, "loss": 0.1825, "loss_nan_ranks": 0, "loss_rank_avg": 0.1545785665512085, "step": 1210 }, { "epoch": 5.074013157894737, "grad_norm": 0.21504734456539154, "learning_rate": 2.533888582662145e-06, "loss": 0.1791, "loss_nan_ranks": 0, "loss_rank_avg": 0.17252680659294128, "step": 1215 }, { "epoch": 5.098684210526316, "grad_norm": 0.2362941950559616, "learning_rate": 2.41545355738525e-06, "loss": 0.189, "loss_nan_ranks": 0, "loss_rank_avg": 0.19943520426750183, "step": 1220 }, { "epoch": 5.123355263157895, "grad_norm": 0.1955908238887787, "learning_rate": 2.299674994978436e-06, "loss": 0.1765, "loss_nan_ranks": 0, "loss_rank_avg": 0.1803397238254547, "step": 1225 }, { "epoch": 5.1480263157894735, "grad_norm": 0.19484035670757294, "learning_rate": 2.1865703858815656e-06, "loss": 0.1813, "loss_nan_ranks": 0, "loss_rank_avg": 0.1700981855392456, "step": 1230 }, { "epoch": 5.172697368421052, "grad_norm": 0.2399033159017563, "learning_rate": 2.076156816585639e-06, "loss": 0.1836, "loss_nan_ranks": 0, "loss_rank_avg": 0.19748282432556152, "step": 1235 }, { "epoch": 5.197368421052632, "grad_norm": 0.20060895383358002, "learning_rate": 1.9684509670515585e-06, "loss": 0.177, "loss_nan_ranks": 0, "loss_rank_avg": 0.1564170867204666, "step": 1240 }, { "epoch": 5.222039473684211, "grad_norm": 0.2051486074924469, "learning_rate": 1.86346910819033e-06, "loss": 0.1718, "loss_nan_ranks": 0, "loss_rank_avg": 0.13873499631881714, "step": 1245 }, { "epoch": 5.246710526315789, "grad_norm": 0.21883392333984375, "learning_rate": 1.7612270994050362e-06, "loss": 0.1812, "loss_nan_ranks": 0, "loss_rank_avg": 0.19669045507907867, "step": 1250 }, { "epoch": 5.271381578947368, "grad_norm": 0.27726781368255615, "learning_rate": 1.6617403861949898e-06, "loss": 0.1702, "loss_nan_ranks": 0, "loss_rank_avg": 0.18430817127227783, "step": 1255 }, { "epoch": 5.296052631578947, "grad_norm": 0.25286152958869934, "learning_rate": 1.5650239978224346e-06, "loss": 0.1672, "loss_nan_ranks": 0, "loss_rank_avg": 0.17335672676563263, "step": 1260 }, { "epoch": 5.3207236842105265, "grad_norm": 0.24115750193595886, "learning_rate": 1.4710925450420632e-06, "loss": 0.1629, "loss_nan_ranks": 0, "loss_rank_avg": 0.14512015879154205, "step": 1265 }, { "epoch": 5.345394736842105, "grad_norm": 0.36607709527015686, "learning_rate": 1.379960217893841e-06, "loss": 0.1659, "loss_nan_ranks": 0, "loss_rank_avg": 0.1726306825876236, "step": 1270 }, { "epoch": 5.370065789473684, "grad_norm": 0.2740156054496765, "learning_rate": 1.2916407835593093e-06, "loss": 0.1641, "loss_nan_ranks": 0, "loss_rank_avg": 0.1730458289384842, "step": 1275 }, { "epoch": 5.394736842105263, "grad_norm": 0.2575497627258301, "learning_rate": 1.2061475842818337e-06, "loss": 0.1772, "loss_nan_ranks": 0, "loss_rank_avg": 0.13783614337444305, "step": 1280 }, { "epoch": 5.4194078947368425, "grad_norm": 0.19674454629421234, "learning_rate": 1.1234935353509946e-06, "loss": 0.1638, "loss_nan_ranks": 0, "loss_rank_avg": 0.16719526052474976, "step": 1285 }, { "epoch": 5.444078947368421, "grad_norm": 0.25193727016448975, "learning_rate": 1.0436911231515202e-06, "loss": 0.1631, "loss_nan_ranks": 0, "loss_rank_avg": 0.15700435638427734, "step": 1290 }, { "epoch": 5.46875, "grad_norm": 0.2387438416481018, "learning_rate": 9.667524032769715e-07, "loss": 0.1565, "loss_nan_ranks": 0, "loss_rank_avg": 0.16894632577896118, "step": 1295 }, { "epoch": 5.493421052631579, "grad_norm": 0.21962200105190277, "learning_rate": 8.926889987085441e-07, "loss": 0.1619, "loss_nan_ranks": 0, "loss_rank_avg": 0.1462545394897461, "step": 1300 }, { "epoch": 6.024671052631579, "grad_norm": 0.294238418340683, "learning_rate": 8.215120980591984e-07, "loss": 0.1856, "loss_nan_ranks": 0, "loss_rank_avg": 0.15981429815292358, "step": 1305 }, { "epoch": 6.0493421052631575, "grad_norm": 0.2525791823863983, "learning_rate": 7.532324538834279e-07, "loss": 0.1802, "loss_nan_ranks": 0, "loss_rank_avg": 0.15260854363441467, "step": 1310 }, { "epoch": 6.074013157894737, "grad_norm": 0.1967260092496872, "learning_rate": 6.878603810528739e-07, "loss": 0.1765, "loss_nan_ranks": 0, "loss_rank_avg": 0.1697266399860382, "step": 1315 }, { "epoch": 6.098684210526316, "grad_norm": 0.24107760190963745, "learning_rate": 6.25405755198103e-07, "loss": 0.1861, "loss_nan_ranks": 0, "loss_rank_avg": 0.19656933844089508, "step": 1320 }, { "epoch": 6.123355263157895, "grad_norm": 0.21563208103179932, "learning_rate": 5.658780112166872e-07, "loss": 0.1735, "loss_nan_ranks": 0, "loss_rank_avg": 0.17719779908657074, "step": 1325 }, { "epoch": 6.1480263157894735, "grad_norm": 0.20149841904640198, "learning_rate": 5.092861418479156e-07, "loss": 0.1781, "loss_nan_ranks": 0, "loss_rank_avg": 0.16660000383853912, "step": 1330 }, { "epoch": 6.172697368421052, "grad_norm": 0.24279265105724335, "learning_rate": 4.556386963142645e-07, "loss": 0.1805, "loss_nan_ranks": 0, "loss_rank_avg": 0.19446003437042236, "step": 1335 }, { "epoch": 6.197368421052632, "grad_norm": 0.2047039270401001, "learning_rate": 4.04943779029896e-07, "loss": 0.1738, "loss_nan_ranks": 0, "loss_rank_avg": 0.1529483050107956, "step": 1340 }, { "epoch": 6.222039473684211, "grad_norm": 0.20605064928531647, "learning_rate": 3.5720904837632355e-07, "loss": 0.1688, "loss_nan_ranks": 0, "loss_rank_avg": 0.13626405596733093, "step": 1345 }, { "epoch": 6.246710526315789, "grad_norm": 0.20880380272865295, "learning_rate": 3.124417155454884e-07, "loss": 0.178, "loss_nan_ranks": 0, "loss_rank_avg": 0.19269785284996033, "step": 1350 }, { "epoch": 6.271381578947368, "grad_norm": 0.2428048700094223, "learning_rate": 2.7064854345037585e-07, "loss": 0.167, "loss_nan_ranks": 0, "loss_rank_avg": 0.18093225359916687, "step": 1355 }, { "epoch": 6.296052631578947, "grad_norm": 0.2846923768520355, "learning_rate": 2.3183584570335205e-07, "loss": 0.1636, "loss_nan_ranks": 0, "loss_rank_avg": 0.16989687085151672, "step": 1360 }, { "epoch": 6.3207236842105265, "grad_norm": 0.2494208961725235, "learning_rate": 1.9600948566238287e-07, "loss": 0.1596, "loss_nan_ranks": 0, "loss_rank_avg": 0.1410118043422699, "step": 1365 }, { "epoch": 6.345394736842105, "grad_norm": 0.25421878695487976, "learning_rate": 1.631748755452667e-07, "loss": 0.1629, "loss_nan_ranks": 0, "loss_rank_avg": 0.17008930444717407, "step": 1370 }, { "epoch": 6.370065789473684, "grad_norm": 0.30556827783584595, "learning_rate": 1.3333697561201732e-07, "loss": 0.1603, "loss_nan_ranks": 0, "loss_rank_avg": 0.1695536971092224, "step": 1375 }, { "epoch": 6.394736842105263, "grad_norm": 0.2617764472961426, "learning_rate": 1.0650029341553902e-07, "loss": 0.1734, "loss_nan_ranks": 0, "loss_rank_avg": 0.13440750539302826, "step": 1380 }, { "epoch": 6.4194078947368425, "grad_norm": 0.22152091562747955, "learning_rate": 8.266888312066013e-08, "loss": 0.1609, "loss_nan_ranks": 0, "loss_rank_avg": 0.16376778483390808, "step": 1385 }, { "epoch": 6.444078947368421, "grad_norm": 0.24168169498443604, "learning_rate": 6.184634489169838e-08, "loss": 0.1598, "loss_nan_ranks": 0, "loss_rank_avg": 0.15376853942871094, "step": 1390 }, { "epoch": 6.46875, "grad_norm": 0.27041196823120117, "learning_rate": 4.403582434857834e-08, "loss": 0.1529, "loss_nan_ranks": 0, "loss_rank_avg": 0.1645251214504242, "step": 1395 }, { "epoch": 6.493421052631579, "grad_norm": 0.23931093513965607, "learning_rate": 2.924001209163363e-08, "loss": 0.1583, "loss_nan_ranks": 0, "loss_rank_avg": 0.14250212907791138, "step": 1400 }, { "epoch": 6.5180921052631575, "grad_norm": 0.2812100648880005, "learning_rate": 1.7461143295141036e-08, "loss": 0.1821, "loss_nan_ranks": 0, "loss_rank_avg": 0.16714885830879211, "step": 1405 }, { "epoch": 6.542763157894737, "grad_norm": 0.21569399535655975, "learning_rate": 8.700997369659459e-09, "loss": 0.1983, "loss_nan_ranks": 0, "loss_rank_avg": 0.21874967217445374, "step": 1410 }, { "epoch": 6.567434210526316, "grad_norm": 0.23459841310977936, "learning_rate": 2.9608976932182788e-09, "loss": 0.2006, "loss_nan_ranks": 0, "loss_rank_avg": 0.18229544162750244, "step": 1415 }, { "epoch": 6.592105263157895, "grad_norm": 0.22248035669326782, "learning_rate": 2.4171141139284204e-10, "loss": 0.1941, "loss_nan_ranks": 0, "loss_rank_avg": 0.18813243508338928, "step": 1420 }, { "epoch": 6.597039473684211, "step": 1421, "total_flos": 7.249602429950362e+16, "train_loss": 0.0, "train_runtime": 1.0653, "train_samples_per_second": 7990.221, "train_steps_per_second": 1333.894 } ], "logging_steps": 5, "max_steps": 1421, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.249602429950362e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }