{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004273504273504274, "grad_norm": 55.73861312866211, "learning_rate": 2.1276595744680852e-07, "loss": 1.4264, "mean_token_accuracy": 0.8229577541351318, "step": 1 }, { "epoch": 0.008547008547008548, "grad_norm": 53.30722427368164, "learning_rate": 4.2553191489361704e-07, "loss": 1.3324, "mean_token_accuracy": 0.8398163318634033, "step": 2 }, { "epoch": 0.01282051282051282, "grad_norm": 57.59785461425781, "learning_rate": 6.382978723404255e-07, "loss": 1.4545, "mean_token_accuracy": 0.8217088580131531, "step": 3 }, { "epoch": 0.017094017094017096, "grad_norm": 52.380863189697266, "learning_rate": 8.510638297872341e-07, "loss": 1.3236, "mean_token_accuracy": 0.8358691334724426, "step": 4 }, { "epoch": 0.021367521367521368, "grad_norm": 56.7705192565918, "learning_rate": 1.0638297872340427e-06, "loss": 1.3879, "mean_token_accuracy": 0.831577479839325, "step": 5 }, { "epoch": 0.02564102564102564, "grad_norm": 47.17439270019531, "learning_rate": 1.276595744680851e-06, "loss": 1.4027, "mean_token_accuracy": 0.824908971786499, "step": 6 }, { "epoch": 0.029914529914529916, "grad_norm": 48.12850570678711, "learning_rate": 1.4893617021276596e-06, "loss": 1.376, "mean_token_accuracy": 0.831479012966156, "step": 7 }, { "epoch": 0.03418803418803419, "grad_norm": 40.80478286743164, "learning_rate": 1.7021276595744682e-06, "loss": 1.2739, "mean_token_accuracy": 0.8457263112068176, "step": 8 }, { "epoch": 0.038461538461538464, "grad_norm": 36.486202239990234, "learning_rate": 1.9148936170212767e-06, "loss": 1.184, "mean_token_accuracy": 0.856406569480896, "step": 9 }, { "epoch": 0.042735042735042736, "grad_norm": 37.32433319091797, "learning_rate": 2.1276595744680853e-06, "loss": 1.1372, "mean_token_accuracy": 0.8697962760925293, "step": 10 }, { "epoch": 0.04700854700854701, "grad_norm": 42.96809005737305, "learning_rate": 2.340425531914894e-06, "loss": 1.0039, "mean_token_accuracy": 0.8862177133560181, "step": 11 }, { "epoch": 0.05128205128205128, "grad_norm": 28.670639038085938, "learning_rate": 2.553191489361702e-06, "loss": 0.9532, "mean_token_accuracy": 0.8876250982284546, "step": 12 }, { "epoch": 0.05555555555555555, "grad_norm": 21.205902099609375, "learning_rate": 2.765957446808511e-06, "loss": 0.8262, "mean_token_accuracy": 0.9083229303359985, "step": 13 }, { "epoch": 0.05982905982905983, "grad_norm": 26.272602081298828, "learning_rate": 2.978723404255319e-06, "loss": 0.7653, "mean_token_accuracy": 0.9099212884902954, "step": 14 }, { "epoch": 0.0641025641025641, "grad_norm": 21.69289207458496, "learning_rate": 3.191489361702128e-06, "loss": 0.6729, "mean_token_accuracy": 0.9244007468223572, "step": 15 }, { "epoch": 0.06837606837606838, "grad_norm": 15.221785545349121, "learning_rate": 3.4042553191489363e-06, "loss": 0.5633, "mean_token_accuracy": 0.9354838728904724, "step": 16 }, { "epoch": 0.07264957264957266, "grad_norm": 13.282171249389648, "learning_rate": 3.6170212765957453e-06, "loss": 0.5253, "mean_token_accuracy": 0.9385498762130737, "step": 17 }, { "epoch": 0.07692307692307693, "grad_norm": 9.240787506103516, "learning_rate": 3.8297872340425535e-06, "loss": 0.4977, "mean_token_accuracy": 0.9378522634506226, "step": 18 }, { "epoch": 0.0811965811965812, "grad_norm": 11.71231746673584, "learning_rate": 4.042553191489362e-06, "loss": 0.4495, "mean_token_accuracy": 0.94464111328125, "step": 19 }, { "epoch": 0.08547008547008547, "grad_norm": 14.272575378417969, "learning_rate": 4.255319148936171e-06, "loss": 0.4435, "mean_token_accuracy": 0.9443333745002747, "step": 20 }, { "epoch": 0.08974358974358974, "grad_norm": 5.9292144775390625, "learning_rate": 4.468085106382979e-06, "loss": 0.3692, "mean_token_accuracy": 0.9512913227081299, "step": 21 }, { "epoch": 0.09401709401709402, "grad_norm": 5.188922882080078, "learning_rate": 4.680851063829788e-06, "loss": 0.3901, "mean_token_accuracy": 0.9478208422660828, "step": 22 }, { "epoch": 0.09829059829059829, "grad_norm": 5.5339274406433105, "learning_rate": 4.893617021276596e-06, "loss": 0.3865, "mean_token_accuracy": 0.9494546055793762, "step": 23 }, { "epoch": 0.10256410256410256, "grad_norm": 6.537012100219727, "learning_rate": 5.106382978723404e-06, "loss": 0.3417, "mean_token_accuracy": 0.9544081091880798, "step": 24 }, { "epoch": 0.10683760683760683, "grad_norm": 4.679383277893066, "learning_rate": 5.319148936170213e-06, "loss": 0.3538, "mean_token_accuracy": 0.9526859521865845, "step": 25 }, { "epoch": 0.1111111111111111, "grad_norm": 4.511115074157715, "learning_rate": 5.531914893617022e-06, "loss": 0.346, "mean_token_accuracy": 0.9522511959075928, "step": 26 }, { "epoch": 0.11538461538461539, "grad_norm": 5.591527938842773, "learning_rate": 5.744680851063831e-06, "loss": 0.3842, "mean_token_accuracy": 0.9494712352752686, "step": 27 }, { "epoch": 0.11965811965811966, "grad_norm": 5.034028053283691, "learning_rate": 5.957446808510638e-06, "loss": 0.3603, "mean_token_accuracy": 0.9508964419364929, "step": 28 }, { "epoch": 0.12393162393162394, "grad_norm": 5.053023815155029, "learning_rate": 6.170212765957447e-06, "loss": 0.3497, "mean_token_accuracy": 0.9484222531318665, "step": 29 }, { "epoch": 0.1282051282051282, "grad_norm": 4.7491536140441895, "learning_rate": 6.382978723404256e-06, "loss": 0.3214, "mean_token_accuracy": 0.9549383521080017, "step": 30 }, { "epoch": 0.13247863247863248, "grad_norm": 4.993784427642822, "learning_rate": 6.595744680851064e-06, "loss": 0.3288, "mean_token_accuracy": 0.9535877704620361, "step": 31 }, { "epoch": 0.13675213675213677, "grad_norm": 5.114716053009033, "learning_rate": 6.808510638297873e-06, "loss": 0.3161, "mean_token_accuracy": 0.9539319276809692, "step": 32 }, { "epoch": 0.14102564102564102, "grad_norm": 5.302931308746338, "learning_rate": 7.021276595744682e-06, "loss": 0.3356, "mean_token_accuracy": 0.9496086239814758, "step": 33 }, { "epoch": 0.1452991452991453, "grad_norm": 4.586728096008301, "learning_rate": 7.234042553191491e-06, "loss": 0.2819, "mean_token_accuracy": 0.9578362107276917, "step": 34 }, { "epoch": 0.14957264957264957, "grad_norm": 4.939296245574951, "learning_rate": 7.446808510638298e-06, "loss": 0.2949, "mean_token_accuracy": 0.9560521841049194, "step": 35 }, { "epoch": 0.15384615384615385, "grad_norm": 4.6308064460754395, "learning_rate": 7.659574468085107e-06, "loss": 0.2724, "mean_token_accuracy": 0.9601955413818359, "step": 36 }, { "epoch": 0.1581196581196581, "grad_norm": 5.236644268035889, "learning_rate": 7.872340425531916e-06, "loss": 0.2964, "mean_token_accuracy": 0.9557088017463684, "step": 37 }, { "epoch": 0.1623931623931624, "grad_norm": 6.120398044586182, "learning_rate": 8.085106382978723e-06, "loss": 0.2837, "mean_token_accuracy": 0.9584332704544067, "step": 38 }, { "epoch": 0.16666666666666666, "grad_norm": 4.9033522605896, "learning_rate": 8.297872340425532e-06, "loss": 0.2664, "mean_token_accuracy": 0.9623565673828125, "step": 39 }, { "epoch": 0.17094017094017094, "grad_norm": 5.031535625457764, "learning_rate": 8.510638297872341e-06, "loss": 0.2735, "mean_token_accuracy": 0.9660788774490356, "step": 40 }, { "epoch": 0.1752136752136752, "grad_norm": 5.569308757781982, "learning_rate": 8.72340425531915e-06, "loss": 0.3008, "mean_token_accuracy": 0.9610214829444885, "step": 41 }, { "epoch": 0.1794871794871795, "grad_norm": 4.788116455078125, "learning_rate": 8.936170212765958e-06, "loss": 0.249, "mean_token_accuracy": 0.9689068794250488, "step": 42 }, { "epoch": 0.18376068376068377, "grad_norm": 4.97907829284668, "learning_rate": 9.148936170212767e-06, "loss": 0.2526, "mean_token_accuracy": 0.9705320596694946, "step": 43 }, { "epoch": 0.18803418803418803, "grad_norm": 5.394193172454834, "learning_rate": 9.361702127659576e-06, "loss": 0.2575, "mean_token_accuracy": 0.9729896783828735, "step": 44 }, { "epoch": 0.19230769230769232, "grad_norm": 5.6487345695495605, "learning_rate": 9.574468085106385e-06, "loss": 0.2699, "mean_token_accuracy": 0.9703598022460938, "step": 45 }, { "epoch": 0.19658119658119658, "grad_norm": 4.73795223236084, "learning_rate": 9.787234042553192e-06, "loss": 0.213, "mean_token_accuracy": 0.9751967191696167, "step": 46 }, { "epoch": 0.20085470085470086, "grad_norm": 5.331301689147949, "learning_rate": 1e-05, "loss": 0.2284, "mean_token_accuracy": 0.9703989624977112, "step": 47 }, { "epoch": 0.20512820512820512, "grad_norm": 5.021146774291992, "learning_rate": 9.999874710101753e-06, "loss": 0.2204, "mean_token_accuracy": 0.9770245552062988, "step": 48 }, { "epoch": 0.2094017094017094, "grad_norm": 5.38365364074707, "learning_rate": 9.999498847383701e-06, "loss": 0.2246, "mean_token_accuracy": 0.9734657406806946, "step": 49 }, { "epoch": 0.21367521367521367, "grad_norm": 5.117666244506836, "learning_rate": 9.998872432775537e-06, "loss": 0.2124, "mean_token_accuracy": 0.9771387577056885, "step": 50 }, { "epoch": 0.21794871794871795, "grad_norm": 5.200557231903076, "learning_rate": 9.997995501158781e-06, "loss": 0.1975, "mean_token_accuracy": 0.9770382046699524, "step": 51 }, { "epoch": 0.2222222222222222, "grad_norm": 5.478715419769287, "learning_rate": 9.996868101364841e-06, "loss": 0.2082, "mean_token_accuracy": 0.9750229716300964, "step": 52 }, { "epoch": 0.2264957264957265, "grad_norm": 5.196822643280029, "learning_rate": 9.995490296172302e-06, "loss": 0.1931, "mean_token_accuracy": 0.9769241809844971, "step": 53 }, { "epoch": 0.23076923076923078, "grad_norm": 5.330948352813721, "learning_rate": 9.993862162303414e-06, "loss": 0.1951, "mean_token_accuracy": 0.9732048511505127, "step": 54 }, { "epoch": 0.23504273504273504, "grad_norm": 4.878313064575195, "learning_rate": 9.991983790419835e-06, "loss": 0.1719, "mean_token_accuracy": 0.9765658378601074, "step": 55 }, { "epoch": 0.23931623931623933, "grad_norm": 5.060050964355469, "learning_rate": 9.989855285117573e-06, "loss": 0.1844, "mean_token_accuracy": 0.9762716889381409, "step": 56 }, { "epoch": 0.24358974358974358, "grad_norm": 4.994210243225098, "learning_rate": 9.987476764921172e-06, "loss": 0.1818, "mean_token_accuracy": 0.9686596393585205, "step": 57 }, { "epoch": 0.24786324786324787, "grad_norm": 4.1533708572387695, "learning_rate": 9.984848362277094e-06, "loss": 0.1432, "mean_token_accuracy": 0.9788950085639954, "step": 58 }, { "epoch": 0.25213675213675213, "grad_norm": 4.930089473724365, "learning_rate": 9.981970223546365e-06, "loss": 0.1659, "mean_token_accuracy": 0.9724494814872742, "step": 59 }, { "epoch": 0.2564102564102564, "grad_norm": 4.344954490661621, "learning_rate": 9.978842508996411e-06, "loss": 0.156, "mean_token_accuracy": 0.9734289050102234, "step": 60 }, { "epoch": 0.2606837606837607, "grad_norm": 4.381039619445801, "learning_rate": 9.975465392792136e-06, "loss": 0.1529, "mean_token_accuracy": 0.971455454826355, "step": 61 }, { "epoch": 0.26495726495726496, "grad_norm": 4.306074619293213, "learning_rate": 9.971839062986229e-06, "loss": 0.1589, "mean_token_accuracy": 0.9690320491790771, "step": 62 }, { "epoch": 0.2692307692307692, "grad_norm": 3.455580949783325, "learning_rate": 9.967963721508684e-06, "loss": 0.1269, "mean_token_accuracy": 0.9736381769180298, "step": 63 }, { "epoch": 0.27350427350427353, "grad_norm": 3.405048370361328, "learning_rate": 9.963839584155565e-06, "loss": 0.1266, "mean_token_accuracy": 0.9724921584129333, "step": 64 }, { "epoch": 0.2777777777777778, "grad_norm": 3.429849147796631, "learning_rate": 9.95946688057698e-06, "loss": 0.1309, "mean_token_accuracy": 0.9714710116386414, "step": 65 }, { "epoch": 0.28205128205128205, "grad_norm": 3.2758281230926514, "learning_rate": 9.954845854264306e-06, "loss": 0.1294, "mean_token_accuracy": 0.9714531898498535, "step": 66 }, { "epoch": 0.2863247863247863, "grad_norm": 2.8356447219848633, "learning_rate": 9.949976762536612e-06, "loss": 0.1213, "mean_token_accuracy": 0.9722399711608887, "step": 67 }, { "epoch": 0.2905982905982906, "grad_norm": 2.8008711338043213, "learning_rate": 9.944859876526348e-06, "loss": 0.1204, "mean_token_accuracy": 0.9703302979469299, "step": 68 }, { "epoch": 0.2948717948717949, "grad_norm": 2.8772833347320557, "learning_rate": 9.939495481164237e-06, "loss": 0.1305, "mean_token_accuracy": 0.9711523652076721, "step": 69 }, { "epoch": 0.29914529914529914, "grad_norm": 2.3752126693725586, "learning_rate": 9.933883875163411e-06, "loss": 0.1187, "mean_token_accuracy": 0.9728144407272339, "step": 70 }, { "epoch": 0.3034188034188034, "grad_norm": 3.0123136043548584, "learning_rate": 9.928025371002781e-06, "loss": 0.11, "mean_token_accuracy": 0.9761629700660706, "step": 71 }, { "epoch": 0.3076923076923077, "grad_norm": 2.6965625286102295, "learning_rate": 9.921920294909629e-06, "loss": 0.1242, "mean_token_accuracy": 0.9750074744224548, "step": 72 }, { "epoch": 0.31196581196581197, "grad_norm": 2.699901819229126, "learning_rate": 9.915568986841452e-06, "loss": 0.1281, "mean_token_accuracy": 0.9753836989402771, "step": 73 }, { "epoch": 0.3162393162393162, "grad_norm": 2.854963779449463, "learning_rate": 9.908971800467021e-06, "loss": 0.125, "mean_token_accuracy": 0.9752618074417114, "step": 74 }, { "epoch": 0.32051282051282054, "grad_norm": 2.7737538814544678, "learning_rate": 9.902129103146697e-06, "loss": 0.1298, "mean_token_accuracy": 0.9693613052368164, "step": 75 }, { "epoch": 0.3247863247863248, "grad_norm": 2.7642276287078857, "learning_rate": 9.895041275911972e-06, "loss": 0.1291, "mean_token_accuracy": 0.9729389548301697, "step": 76 }, { "epoch": 0.32905982905982906, "grad_norm": 3.0446126461029053, "learning_rate": 9.887708713444242e-06, "loss": 0.134, "mean_token_accuracy": 0.9701337218284607, "step": 77 }, { "epoch": 0.3333333333333333, "grad_norm": 2.538485288619995, "learning_rate": 9.88013182405285e-06, "loss": 0.1241, "mean_token_accuracy": 0.9748736023902893, "step": 78 }, { "epoch": 0.33760683760683763, "grad_norm": 2.8316211700439453, "learning_rate": 9.872311029652322e-06, "loss": 0.1236, "mean_token_accuracy": 0.9711246490478516, "step": 79 }, { "epoch": 0.3418803418803419, "grad_norm": 2.848421335220337, "learning_rate": 9.864246765738901e-06, "loss": 0.1363, "mean_token_accuracy": 0.9725619554519653, "step": 80 }, { "epoch": 0.34615384615384615, "grad_norm": 2.8594236373901367, "learning_rate": 9.855939481366276e-06, "loss": 0.1327, "mean_token_accuracy": 0.9708970785140991, "step": 81 }, { "epoch": 0.3504273504273504, "grad_norm": 2.9276626110076904, "learning_rate": 9.847389639120585e-06, "loss": 0.1278, "mean_token_accuracy": 0.969779372215271, "step": 82 }, { "epoch": 0.3547008547008547, "grad_norm": 3.02407169342041, "learning_rate": 9.838597715094661e-06, "loss": 0.1291, "mean_token_accuracy": 0.972348690032959, "step": 83 }, { "epoch": 0.358974358974359, "grad_norm": 3.002563953399658, "learning_rate": 9.82956419886151e-06, "loss": 0.1289, "mean_token_accuracy": 0.9705927968025208, "step": 84 }, { "epoch": 0.36324786324786323, "grad_norm": 3.177736520767212, "learning_rate": 9.820289593447053e-06, "loss": 0.1258, "mean_token_accuracy": 0.9714928865432739, "step": 85 }, { "epoch": 0.36752136752136755, "grad_norm": 3.0293781757354736, "learning_rate": 9.810774415302124e-06, "loss": 0.1258, "mean_token_accuracy": 0.9701533913612366, "step": 86 }, { "epoch": 0.3717948717948718, "grad_norm": 3.531522512435913, "learning_rate": 9.801019194273702e-06, "loss": 0.1368, "mean_token_accuracy": 0.9657713174819946, "step": 87 }, { "epoch": 0.37606837606837606, "grad_norm": 2.783475160598755, "learning_rate": 9.791024473575405e-06, "loss": 0.1265, "mean_token_accuracy": 0.9692285060882568, "step": 88 }, { "epoch": 0.3803418803418803, "grad_norm": 3.15690279006958, "learning_rate": 9.780790809757254e-06, "loss": 0.12, "mean_token_accuracy": 0.9689655303955078, "step": 89 }, { "epoch": 0.38461538461538464, "grad_norm": 3.449833393096924, "learning_rate": 9.770318772674669e-06, "loss": 0.1304, "mean_token_accuracy": 0.9709726572036743, "step": 90 }, { "epoch": 0.3888888888888889, "grad_norm": 2.9852263927459717, "learning_rate": 9.759608945456745e-06, "loss": 0.1282, "mean_token_accuracy": 0.9665982127189636, "step": 91 }, { "epoch": 0.39316239316239315, "grad_norm": 2.830143690109253, "learning_rate": 9.748661924473777e-06, "loss": 0.122, "mean_token_accuracy": 0.9698586463928223, "step": 92 }, { "epoch": 0.3974358974358974, "grad_norm": 3.019658327102661, "learning_rate": 9.73747831930405e-06, "loss": 0.1192, "mean_token_accuracy": 0.9726413488388062, "step": 93 }, { "epoch": 0.4017094017094017, "grad_norm": 3.0762267112731934, "learning_rate": 9.726058752699898e-06, "loss": 0.1259, "mean_token_accuracy": 0.96836256980896, "step": 94 }, { "epoch": 0.405982905982906, "grad_norm": 2.900409698486328, "learning_rate": 9.714403860553028e-06, "loss": 0.1286, "mean_token_accuracy": 0.968357503414154, "step": 95 }, { "epoch": 0.41025641025641024, "grad_norm": 2.963927984237671, "learning_rate": 9.70251429185911e-06, "loss": 0.1371, "mean_token_accuracy": 0.9652521014213562, "step": 96 }, { "epoch": 0.41452991452991456, "grad_norm": 2.7278425693511963, "learning_rate": 9.690390708681627e-06, "loss": 0.1223, "mean_token_accuracy": 0.9720891118049622, "step": 97 }, { "epoch": 0.4188034188034188, "grad_norm": 2.7922236919403076, "learning_rate": 9.67803378611503e-06, "loss": 0.1152, "mean_token_accuracy": 0.9715592265129089, "step": 98 }, { "epoch": 0.4230769230769231, "grad_norm": 3.1420092582702637, "learning_rate": 9.665444212247127e-06, "loss": 0.1223, "mean_token_accuracy": 0.969136655330658, "step": 99 }, { "epoch": 0.42735042735042733, "grad_norm": 2.8342413902282715, "learning_rate": 9.652622688120776e-06, "loss": 0.1237, "mean_token_accuracy": 0.9718309640884399, "step": 100 }, { "epoch": 0.43162393162393164, "grad_norm": 3.100160598754883, "learning_rate": 9.639569927694845e-06, "loss": 0.1382, "mean_token_accuracy": 0.9662887454032898, "step": 101 }, { "epoch": 0.4358974358974359, "grad_norm": 3.214078664779663, "learning_rate": 9.626286657804455e-06, "loss": 0.129, "mean_token_accuracy": 0.9656282067298889, "step": 102 }, { "epoch": 0.44017094017094016, "grad_norm": 2.7865896224975586, "learning_rate": 9.61277361812051e-06, "loss": 0.1184, "mean_token_accuracy": 0.970238983631134, "step": 103 }, { "epoch": 0.4444444444444444, "grad_norm": 3.0797605514526367, "learning_rate": 9.599031561108506e-06, "loss": 0.1216, "mean_token_accuracy": 0.9684609770774841, "step": 104 }, { "epoch": 0.44871794871794873, "grad_norm": 2.7707583904266357, "learning_rate": 9.585061251986634e-06, "loss": 0.1259, "mean_token_accuracy": 0.9682539701461792, "step": 105 }, { "epoch": 0.452991452991453, "grad_norm": 3.061985492706299, "learning_rate": 9.570863468683161e-06, "loss": 0.1196, "mean_token_accuracy": 0.9703070521354675, "step": 106 }, { "epoch": 0.45726495726495725, "grad_norm": 3.291062355041504, "learning_rate": 9.556439001793125e-06, "loss": 0.1318, "mean_token_accuracy": 0.965293824672699, "step": 107 }, { "epoch": 0.46153846153846156, "grad_norm": 2.679776430130005, "learning_rate": 9.541788654534296e-06, "loss": 0.1209, "mean_token_accuracy": 0.9700332880020142, "step": 108 }, { "epoch": 0.4658119658119658, "grad_norm": 3.1325292587280273, "learning_rate": 9.526913242702459e-06, "loss": 0.1344, "mean_token_accuracy": 0.9673547744750977, "step": 109 }, { "epoch": 0.4700854700854701, "grad_norm": 2.857881784439087, "learning_rate": 9.511813594625987e-06, "loss": 0.1291, "mean_token_accuracy": 0.9659602642059326, "step": 110 }, { "epoch": 0.47435897435897434, "grad_norm": 2.7918944358825684, "learning_rate": 9.49649055111971e-06, "loss": 0.1252, "mean_token_accuracy": 0.9653458595275879, "step": 111 }, { "epoch": 0.47863247863247865, "grad_norm": 2.8813693523406982, "learning_rate": 9.480944965438099e-06, "loss": 0.128, "mean_token_accuracy": 0.962127685546875, "step": 112 }, { "epoch": 0.4829059829059829, "grad_norm": 2.667649984359741, "learning_rate": 9.465177703227755e-06, "loss": 0.1214, "mean_token_accuracy": 0.9691765904426575, "step": 113 }, { "epoch": 0.48717948717948717, "grad_norm": 2.925039529800415, "learning_rate": 9.449189642479203e-06, "loss": 0.1263, "mean_token_accuracy": 0.9643491506576538, "step": 114 }, { "epoch": 0.49145299145299143, "grad_norm": 2.7949769496917725, "learning_rate": 9.432981673477998e-06, "loss": 0.1241, "mean_token_accuracy": 0.9685289263725281, "step": 115 }, { "epoch": 0.49572649572649574, "grad_norm": 3.3986713886260986, "learning_rate": 9.416554698755154e-06, "loss": 0.1262, "mean_token_accuracy": 0.9689081907272339, "step": 116 }, { "epoch": 0.5, "grad_norm": 3.4772071838378906, "learning_rate": 9.399909633036896e-06, "loss": 0.1284, "mean_token_accuracy": 0.9670020937919617, "step": 117 }, { "epoch": 0.5042735042735043, "grad_norm": 3.6029865741729736, "learning_rate": 9.383047403193704e-06, "loss": 0.1313, "mean_token_accuracy": 0.9666910171508789, "step": 118 }, { "epoch": 0.5085470085470085, "grad_norm": 3.636085033416748, "learning_rate": 9.365968948188717e-06, "loss": 0.1343, "mean_token_accuracy": 0.9655527472496033, "step": 119 }, { "epoch": 0.5128205128205128, "grad_norm": 3.53393816947937, "learning_rate": 9.348675219025443e-06, "loss": 0.1426, "mean_token_accuracy": 0.961666464805603, "step": 120 }, { "epoch": 0.5170940170940171, "grad_norm": 2.9354088306427, "learning_rate": 9.331167178694798e-06, "loss": 0.1188, "mean_token_accuracy": 0.9681200981140137, "step": 121 }, { "epoch": 0.5213675213675214, "grad_norm": 3.025766611099243, "learning_rate": 9.313445802121493e-06, "loss": 0.1242, "mean_token_accuracy": 0.9681670069694519, "step": 122 }, { "epoch": 0.5256410256410257, "grad_norm": 3.3338441848754883, "learning_rate": 9.295512076109734e-06, "loss": 0.1342, "mean_token_accuracy": 0.966331958770752, "step": 123 }, { "epoch": 0.5299145299145299, "grad_norm": 2.8079636096954346, "learning_rate": 9.277366999288279e-06, "loss": 0.1212, "mean_token_accuracy": 0.9686406254768372, "step": 124 }, { "epoch": 0.5341880341880342, "grad_norm": 3.0100514888763428, "learning_rate": 9.25901158205483e-06, "loss": 0.126, "mean_token_accuracy": 0.967146635055542, "step": 125 }, { "epoch": 0.5384615384615384, "grad_norm": 2.392273426055908, "learning_rate": 9.240446846519769e-06, "loss": 0.1141, "mean_token_accuracy": 0.9710744023323059, "step": 126 }, { "epoch": 0.5427350427350427, "grad_norm": 2.8574609756469727, "learning_rate": 9.22167382644924e-06, "loss": 0.1391, "mean_token_accuracy": 0.9653558135032654, "step": 127 }, { "epoch": 0.5470085470085471, "grad_norm": 2.432234525680542, "learning_rate": 9.202693567207588e-06, "loss": 0.1157, "mean_token_accuracy": 0.9682042002677917, "step": 128 }, { "epoch": 0.5512820512820513, "grad_norm": 2.939648151397705, "learning_rate": 9.183507125699144e-06, "loss": 0.1234, "mean_token_accuracy": 0.9685525298118591, "step": 129 }, { "epoch": 0.5555555555555556, "grad_norm": 2.6586060523986816, "learning_rate": 9.16411557030938e-06, "loss": 0.1148, "mean_token_accuracy": 0.9667240381240845, "step": 130 }, { "epoch": 0.5598290598290598, "grad_norm": 2.9206671714782715, "learning_rate": 9.144519980845405e-06, "loss": 0.1218, "mean_token_accuracy": 0.9657745361328125, "step": 131 }, { "epoch": 0.5641025641025641, "grad_norm": 3.10524320602417, "learning_rate": 9.124721448475848e-06, "loss": 0.1258, "mean_token_accuracy": 0.9668874144554138, "step": 132 }, { "epoch": 0.5683760683760684, "grad_norm": 2.9490950107574463, "learning_rate": 9.104721075670087e-06, "loss": 0.1178, "mean_token_accuracy": 0.9691486358642578, "step": 133 }, { "epoch": 0.5726495726495726, "grad_norm": 3.5055556297302246, "learning_rate": 9.084519976136867e-06, "loss": 0.1324, "mean_token_accuracy": 0.967658281326294, "step": 134 }, { "epoch": 0.5769230769230769, "grad_norm": 3.0520718097686768, "learning_rate": 9.06411927476228e-06, "loss": 0.1291, "mean_token_accuracy": 0.9664340019226074, "step": 135 }, { "epoch": 0.5811965811965812, "grad_norm": 3.246432065963745, "learning_rate": 9.043520107547123e-06, "loss": 0.1287, "mean_token_accuracy": 0.9649555683135986, "step": 136 }, { "epoch": 0.5854700854700855, "grad_norm": 2.822936773300171, "learning_rate": 9.02272362154365e-06, "loss": 0.1167, "mean_token_accuracy": 0.9701053500175476, "step": 137 }, { "epoch": 0.5897435897435898, "grad_norm": 3.3216702938079834, "learning_rate": 9.00173097479169e-06, "loss": 0.137, "mean_token_accuracy": 0.9654337167739868, "step": 138 }, { "epoch": 0.594017094017094, "grad_norm": 3.212130546569824, "learning_rate": 8.980543336254161e-06, "loss": 0.1357, "mean_token_accuracy": 0.9622559547424316, "step": 139 }, { "epoch": 0.5982905982905983, "grad_norm": 2.845677375793457, "learning_rate": 8.959161885751991e-06, "loss": 0.1145, "mean_token_accuracy": 0.9693924784660339, "step": 140 }, { "epoch": 0.6025641025641025, "grad_norm": 3.0614523887634277, "learning_rate": 8.937587813898402e-06, "loss": 0.1305, "mean_token_accuracy": 0.965413510799408, "step": 141 }, { "epoch": 0.6068376068376068, "grad_norm": 2.913874387741089, "learning_rate": 8.915822322032628e-06, "loss": 0.1299, "mean_token_accuracy": 0.9669148325920105, "step": 142 }, { "epoch": 0.6111111111111112, "grad_norm": 2.596005916595459, "learning_rate": 8.893866622153006e-06, "loss": 0.1227, "mean_token_accuracy": 0.9691241383552551, "step": 143 }, { "epoch": 0.6153846153846154, "grad_norm": 2.7618889808654785, "learning_rate": 8.87172193684949e-06, "loss": 0.1212, "mean_token_accuracy": 0.9670799374580383, "step": 144 }, { "epoch": 0.6196581196581197, "grad_norm": 3.188061237335205, "learning_rate": 8.84938949923558e-06, "loss": 0.1196, "mean_token_accuracy": 0.9661862254142761, "step": 145 }, { "epoch": 0.6239316239316239, "grad_norm": 2.9043335914611816, "learning_rate": 8.826870552879646e-06, "loss": 0.1158, "mean_token_accuracy": 0.9694948792457581, "step": 146 }, { "epoch": 0.6282051282051282, "grad_norm": 2.6360220909118652, "learning_rate": 8.80416635173569e-06, "loss": 0.1256, "mean_token_accuracy": 0.9667301774024963, "step": 147 }, { "epoch": 0.6324786324786325, "grad_norm": 2.8086981773376465, "learning_rate": 8.78127816007351e-06, "loss": 0.1336, "mean_token_accuracy": 0.9666719436645508, "step": 148 }, { "epoch": 0.6367521367521367, "grad_norm": 3.122762441635132, "learning_rate": 8.758207252408306e-06, "loss": 0.1195, "mean_token_accuracy": 0.9682682156562805, "step": 149 }, { "epoch": 0.6410256410256411, "grad_norm": 2.828507423400879, "learning_rate": 8.734954913429715e-06, "loss": 0.1206, "mean_token_accuracy": 0.9686484932899475, "step": 150 }, { "epoch": 0.6452991452991453, "grad_norm": 2.836404800415039, "learning_rate": 8.71152243793026e-06, "loss": 0.1208, "mean_token_accuracy": 0.9689119458198547, "step": 151 }, { "epoch": 0.6495726495726496, "grad_norm": 2.7368805408477783, "learning_rate": 8.687911130733266e-06, "loss": 0.1189, "mean_token_accuracy": 0.967898964881897, "step": 152 }, { "epoch": 0.6538461538461539, "grad_norm": 2.948840618133545, "learning_rate": 8.664122306620185e-06, "loss": 0.1229, "mean_token_accuracy": 0.9695394039154053, "step": 153 }, { "epoch": 0.6581196581196581, "grad_norm": 2.9949734210968018, "learning_rate": 8.640157290257398e-06, "loss": 0.1181, "mean_token_accuracy": 0.9687728881835938, "step": 154 }, { "epoch": 0.6623931623931624, "grad_norm": 2.728360652923584, "learning_rate": 8.61601741612244e-06, "loss": 0.1218, "mean_token_accuracy": 0.969561755657196, "step": 155 }, { "epoch": 0.6666666666666666, "grad_norm": 3.2591919898986816, "learning_rate": 8.591704028429704e-06, "loss": 0.1242, "mean_token_accuracy": 0.9672890305519104, "step": 156 }, { "epoch": 0.6709401709401709, "grad_norm": 2.9548633098602295, "learning_rate": 8.567218481055575e-06, "loss": 0.129, "mean_token_accuracy": 0.9685006141662598, "step": 157 }, { "epoch": 0.6752136752136753, "grad_norm": 3.126328229904175, "learning_rate": 8.542562137463049e-06, "loss": 0.1297, "mean_token_accuracy": 0.9670224785804749, "step": 158 }, { "epoch": 0.6794871794871795, "grad_norm": 3.116311550140381, "learning_rate": 8.517736370625803e-06, "loss": 0.1282, "mean_token_accuracy": 0.9657440185546875, "step": 159 }, { "epoch": 0.6837606837606838, "grad_norm": 3.0345067977905273, "learning_rate": 8.492742562951752e-06, "loss": 0.1193, "mean_token_accuracy": 0.9700401425361633, "step": 160 }, { "epoch": 0.688034188034188, "grad_norm": 2.7307636737823486, "learning_rate": 8.467582106206059e-06, "loss": 0.1163, "mean_token_accuracy": 0.9677555561065674, "step": 161 }, { "epoch": 0.6923076923076923, "grad_norm": 3.3693504333496094, "learning_rate": 8.44225640143364e-06, "loss": 0.1389, "mean_token_accuracy": 0.9626268744468689, "step": 162 }, { "epoch": 0.6965811965811965, "grad_norm": 3.258207321166992, "learning_rate": 8.416766858881155e-06, "loss": 0.13, "mean_token_accuracy": 0.9651078581809998, "step": 163 }, { "epoch": 0.7008547008547008, "grad_norm": 2.904632806777954, "learning_rate": 8.391114897918463e-06, "loss": 0.1252, "mean_token_accuracy": 0.9667837023735046, "step": 164 }, { "epoch": 0.7051282051282052, "grad_norm": 3.072021245956421, "learning_rate": 8.365301946959601e-06, "loss": 0.1195, "mean_token_accuracy": 0.9670177102088928, "step": 165 }, { "epoch": 0.7094017094017094, "grad_norm": 2.99420166015625, "learning_rate": 8.339329443383234e-06, "loss": 0.1198, "mean_token_accuracy": 0.9678386449813843, "step": 166 }, { "epoch": 0.7136752136752137, "grad_norm": 3.074570417404175, "learning_rate": 8.313198833452622e-06, "loss": 0.1222, "mean_token_accuracy": 0.968068540096283, "step": 167 }, { "epoch": 0.717948717948718, "grad_norm": 3.298757553100586, "learning_rate": 8.28691157223508e-06, "loss": 0.1392, "mean_token_accuracy": 0.9620700478553772, "step": 168 }, { "epoch": 0.7222222222222222, "grad_norm": 2.632319688796997, "learning_rate": 8.260469123520955e-06, "loss": 0.1193, "mean_token_accuracy": 0.9686574339866638, "step": 169 }, { "epoch": 0.7264957264957265, "grad_norm": 2.8462018966674805, "learning_rate": 8.233872959742117e-06, "loss": 0.1345, "mean_token_accuracy": 0.9627861380577087, "step": 170 }, { "epoch": 0.7307692307692307, "grad_norm": 2.8106915950775146, "learning_rate": 8.207124561889967e-06, "loss": 0.1221, "mean_token_accuracy": 0.9662142395973206, "step": 171 }, { "epoch": 0.7350427350427351, "grad_norm": 2.946042776107788, "learning_rate": 8.180225419432974e-06, "loss": 0.1336, "mean_token_accuracy": 0.9638904929161072, "step": 172 }, { "epoch": 0.7393162393162394, "grad_norm": 2.8250818252563477, "learning_rate": 8.15317703023372e-06, "loss": 0.1233, "mean_token_accuracy": 0.968315839767456, "step": 173 }, { "epoch": 0.7435897435897436, "grad_norm": 2.757537364959717, "learning_rate": 8.125980900465512e-06, "loss": 0.1241, "mean_token_accuracy": 0.9649458527565002, "step": 174 }, { "epoch": 0.7478632478632479, "grad_norm": 3.01009464263916, "learning_rate": 8.098638544528493e-06, "loss": 0.1282, "mean_token_accuracy": 0.9654348492622375, "step": 175 }, { "epoch": 0.7521367521367521, "grad_norm": 2.971163034439087, "learning_rate": 8.07115148496533e-06, "loss": 0.1219, "mean_token_accuracy": 0.9695356488227844, "step": 176 }, { "epoch": 0.7564102564102564, "grad_norm": 2.752361536026001, "learning_rate": 8.043521252376419e-06, "loss": 0.1115, "mean_token_accuracy": 0.9711832404136658, "step": 177 }, { "epoch": 0.7606837606837606, "grad_norm": 3.2744572162628174, "learning_rate": 8.015749385334662e-06, "loss": 0.1255, "mean_token_accuracy": 0.9651964902877808, "step": 178 }, { "epoch": 0.7649572649572649, "grad_norm": 3.198021411895752, "learning_rate": 7.987837430299793e-06, "loss": 0.1128, "mean_token_accuracy": 0.971238911151886, "step": 179 }, { "epoch": 0.7692307692307693, "grad_norm": 3.0890040397644043, "learning_rate": 7.959786941532257e-06, "loss": 0.1158, "mean_token_accuracy": 0.9695550203323364, "step": 180 }, { "epoch": 0.7735042735042735, "grad_norm": 3.1784725189208984, "learning_rate": 7.93159948100667e-06, "loss": 0.1237, "mean_token_accuracy": 0.9680773019790649, "step": 181 }, { "epoch": 0.7777777777777778, "grad_norm": 2.876870632171631, "learning_rate": 7.903276618324833e-06, "loss": 0.122, "mean_token_accuracy": 0.968846321105957, "step": 182 }, { "epoch": 0.782051282051282, "grad_norm": 3.075652599334717, "learning_rate": 7.874819930628346e-06, "loss": 0.1288, "mean_token_accuracy": 0.9640507698059082, "step": 183 }, { "epoch": 0.7863247863247863, "grad_norm": 3.393401861190796, "learning_rate": 7.846231002510763e-06, "loss": 0.1298, "mean_token_accuracy": 0.9649748802185059, "step": 184 }, { "epoch": 0.7905982905982906, "grad_norm": 2.9897360801696777, "learning_rate": 7.817511425929368e-06, "loss": 0.1195, "mean_token_accuracy": 0.9691243171691895, "step": 185 }, { "epoch": 0.7948717948717948, "grad_norm": 3.0758893489837646, "learning_rate": 7.788662800116534e-06, "loss": 0.1259, "mean_token_accuracy": 0.9677749872207642, "step": 186 }, { "epoch": 0.7991452991452992, "grad_norm": 2.974975824356079, "learning_rate": 7.759686731490655e-06, "loss": 0.1277, "mean_token_accuracy": 0.9659624695777893, "step": 187 }, { "epoch": 0.8034188034188035, "grad_norm": 2.820207357406616, "learning_rate": 7.730584833566704e-06, "loss": 0.1181, "mean_token_accuracy": 0.9691581130027771, "step": 188 }, { "epoch": 0.8076923076923077, "grad_norm": 2.644383668899536, "learning_rate": 7.701358726866384e-06, "loss": 0.1282, "mean_token_accuracy": 0.9655576944351196, "step": 189 }, { "epoch": 0.811965811965812, "grad_norm": 2.8675873279571533, "learning_rate": 7.672010038827888e-06, "loss": 0.1177, "mean_token_accuracy": 0.9688689708709717, "step": 190 }, { "epoch": 0.8162393162393162, "grad_norm": 2.505047082901001, "learning_rate": 7.642540403715279e-06, "loss": 0.1131, "mean_token_accuracy": 0.9703559875488281, "step": 191 }, { "epoch": 0.8205128205128205, "grad_norm": 2.9962170124053955, "learning_rate": 7.6129514625274806e-06, "loss": 0.1191, "mean_token_accuracy": 0.9711988568305969, "step": 192 }, { "epoch": 0.8247863247863247, "grad_norm": 2.7384965419769287, "learning_rate": 7.583244862906906e-06, "loss": 0.12, "mean_token_accuracy": 0.9679151773452759, "step": 193 }, { "epoch": 0.8290598290598291, "grad_norm": 2.7203431129455566, "learning_rate": 7.553422259047712e-06, "loss": 0.1118, "mean_token_accuracy": 0.9700168967247009, "step": 194 }, { "epoch": 0.8333333333333334, "grad_norm": 2.715108871459961, "learning_rate": 7.523485311603672e-06, "loss": 0.1168, "mean_token_accuracy": 0.9702435731887817, "step": 195 }, { "epoch": 0.8376068376068376, "grad_norm": 2.69944429397583, "learning_rate": 7.493435687595725e-06, "loss": 0.1101, "mean_token_accuracy": 0.9719541668891907, "step": 196 }, { "epoch": 0.8418803418803419, "grad_norm": 2.8891189098358154, "learning_rate": 7.463275060319127e-06, "loss": 0.1156, "mean_token_accuracy": 0.9676284193992615, "step": 197 }, { "epoch": 0.8461538461538461, "grad_norm": 2.8775413036346436, "learning_rate": 7.433005109250291e-06, "loss": 0.1176, "mean_token_accuracy": 0.9696885943412781, "step": 198 }, { "epoch": 0.8504273504273504, "grad_norm": 2.7444727420806885, "learning_rate": 7.402627519953251e-06, "loss": 0.1123, "mean_token_accuracy": 0.9693893194198608, "step": 199 }, { "epoch": 0.8547008547008547, "grad_norm": 2.682316780090332, "learning_rate": 7.3721439839858245e-06, "loss": 0.1166, "mean_token_accuracy": 0.9664866328239441, "step": 200 }, { "epoch": 0.8589743589743589, "grad_norm": 3.2439863681793213, "learning_rate": 7.341556198805392e-06, "loss": 0.124, "mean_token_accuracy": 0.9668755531311035, "step": 201 }, { "epoch": 0.8632478632478633, "grad_norm": 3.0606460571289062, "learning_rate": 7.310865867674397e-06, "loss": 0.1163, "mean_token_accuracy": 0.96805340051651, "step": 202 }, { "epoch": 0.8675213675213675, "grad_norm": 2.628513813018799, "learning_rate": 7.28007469956549e-06, "loss": 0.112, "mean_token_accuracy": 0.9691203832626343, "step": 203 }, { "epoch": 0.8717948717948718, "grad_norm": 3.0109522342681885, "learning_rate": 7.249184409066368e-06, "loss": 0.1193, "mean_token_accuracy": 0.9659459590911865, "step": 204 }, { "epoch": 0.8760683760683761, "grad_norm": 2.7769620418548584, "learning_rate": 7.218196716284302e-06, "loss": 0.1188, "mean_token_accuracy": 0.9707610607147217, "step": 205 }, { "epoch": 0.8803418803418803, "grad_norm": 3.0632548332214355, "learning_rate": 7.187113346750345e-06, "loss": 0.12, "mean_token_accuracy": 0.9674087166786194, "step": 206 }, { "epoch": 0.8846153846153846, "grad_norm": 2.789855718612671, "learning_rate": 7.155936031323254e-06, "loss": 0.1106, "mean_token_accuracy": 0.9703866243362427, "step": 207 }, { "epoch": 0.8888888888888888, "grad_norm": 3.0573058128356934, "learning_rate": 7.124666506093112e-06, "loss": 0.122, "mean_token_accuracy": 0.9684903621673584, "step": 208 }, { "epoch": 0.8931623931623932, "grad_norm": 3.0812251567840576, "learning_rate": 7.093306512284642e-06, "loss": 0.1174, "mean_token_accuracy": 0.9704350233078003, "step": 209 }, { "epoch": 0.8974358974358975, "grad_norm": 2.5385091304779053, "learning_rate": 7.061857796160261e-06, "loss": 0.1043, "mean_token_accuracy": 0.9734078645706177, "step": 210 }, { "epoch": 0.9017094017094017, "grad_norm": 3.305619239807129, "learning_rate": 7.030322108922831e-06, "loss": 0.1268, "mean_token_accuracy": 0.9653949737548828, "step": 211 }, { "epoch": 0.905982905982906, "grad_norm": 2.835049629211426, "learning_rate": 6.998701206618153e-06, "loss": 0.1138, "mean_token_accuracy": 0.9680963158607483, "step": 212 }, { "epoch": 0.9102564102564102, "grad_norm": 2.810410499572754, "learning_rate": 6.966996850037168e-06, "loss": 0.1122, "mean_token_accuracy": 0.9715486764907837, "step": 213 }, { "epoch": 0.9145299145299145, "grad_norm": 2.569246292114258, "learning_rate": 6.9352108046179325e-06, "loss": 0.1085, "mean_token_accuracy": 0.9726495742797852, "step": 214 }, { "epoch": 0.9188034188034188, "grad_norm": 2.7993946075439453, "learning_rate": 6.903344840347286e-06, "loss": 0.1183, "mean_token_accuracy": 0.9697819352149963, "step": 215 }, { "epoch": 0.9230769230769231, "grad_norm": 2.7210962772369385, "learning_rate": 6.871400731662303e-06, "loss": 0.1186, "mean_token_accuracy": 0.9677559733390808, "step": 216 }, { "epoch": 0.9273504273504274, "grad_norm": 2.9043362140655518, "learning_rate": 6.839380257351486e-06, "loss": 0.1093, "mean_token_accuracy": 0.9705744981765747, "step": 217 }, { "epoch": 0.9316239316239316, "grad_norm": 2.5438663959503174, "learning_rate": 6.8072852004557085e-06, "loss": 0.1109, "mean_token_accuracy": 0.9724981188774109, "step": 218 }, { "epoch": 0.9358974358974359, "grad_norm": 2.694098711013794, "learning_rate": 6.775117348168934e-06, "loss": 0.1156, "mean_token_accuracy": 0.9705842137336731, "step": 219 }, { "epoch": 0.9401709401709402, "grad_norm": 2.7085683345794678, "learning_rate": 6.742878491738691e-06, "loss": 0.1058, "mean_token_accuracy": 0.9714870452880859, "step": 220 }, { "epoch": 0.9444444444444444, "grad_norm": 3.007352828979492, "learning_rate": 6.71057042636633e-06, "loss": 0.1179, "mean_token_accuracy": 0.9679550528526306, "step": 221 }, { "epoch": 0.9487179487179487, "grad_norm": 2.7090036869049072, "learning_rate": 6.678194951107061e-06, "loss": 0.1093, "mean_token_accuracy": 0.9711792469024658, "step": 222 }, { "epoch": 0.9529914529914529, "grad_norm": 2.7703821659088135, "learning_rate": 6.645753868769773e-06, "loss": 0.119, "mean_token_accuracy": 0.9694185256958008, "step": 223 }, { "epoch": 0.9572649572649573, "grad_norm": 2.726327657699585, "learning_rate": 6.61324898581665e-06, "loss": 0.1235, "mean_token_accuracy": 0.9704574346542358, "step": 224 }, { "epoch": 0.9615384615384616, "grad_norm": 2.35150408744812, "learning_rate": 6.580682112262566e-06, "loss": 0.1003, "mean_token_accuracy": 0.9741271138191223, "step": 225 }, { "epoch": 0.9658119658119658, "grad_norm": 2.970930814743042, "learning_rate": 6.5480550615743124e-06, "loss": 0.1207, "mean_token_accuracy": 0.967167854309082, "step": 226 }, { "epoch": 0.9700854700854701, "grad_norm": 2.907005548477173, "learning_rate": 6.515369650569603e-06, "loss": 0.1176, "mean_token_accuracy": 0.9715429544448853, "step": 227 }, { "epoch": 0.9743589743589743, "grad_norm": 2.661025047302246, "learning_rate": 6.4826276993159155e-06, "loss": 0.1154, "mean_token_accuracy": 0.9683908224105835, "step": 228 }, { "epoch": 0.9786324786324786, "grad_norm": 2.9590840339660645, "learning_rate": 6.449831031029134e-06, "loss": 0.1145, "mean_token_accuracy": 0.9706094264984131, "step": 229 }, { "epoch": 0.9829059829059829, "grad_norm": 3.047009229660034, "learning_rate": 6.416981471972026e-06, "loss": 0.1144, "mean_token_accuracy": 0.969413161277771, "step": 230 }, { "epoch": 0.9871794871794872, "grad_norm": 2.5724730491638184, "learning_rate": 6.384080851352553e-06, "loss": 0.1014, "mean_token_accuracy": 0.9736120104789734, "step": 231 }, { "epoch": 0.9914529914529915, "grad_norm": 2.7814061641693115, "learning_rate": 6.351131001222012e-06, "loss": 0.1112, "mean_token_accuracy": 0.9722570776939392, "step": 232 }, { "epoch": 0.9957264957264957, "grad_norm": 3.269582509994507, "learning_rate": 6.318133756373009e-06, "loss": 0.122, "mean_token_accuracy": 0.9661454558372498, "step": 233 }, { "epoch": 1.0, "grad_norm": 2.7873144149780273, "learning_rate": 6.2850909542373e-06, "loss": 0.1111, "mean_token_accuracy": 0.9729042053222656, "step": 234 }, { "epoch": 1.0042735042735043, "grad_norm": 3.190188407897949, "learning_rate": 6.2520044347834684e-06, "loss": 0.0988, "mean_token_accuracy": 0.9759896993637085, "step": 235 }, { "epoch": 1.0085470085470085, "grad_norm": 3.600405693054199, "learning_rate": 6.218876040414476e-06, "loss": 0.1298, "mean_token_accuracy": 0.9713731408119202, "step": 236 }, { "epoch": 1.0128205128205128, "grad_norm": 2.5787041187286377, "learning_rate": 6.185707615865058e-06, "loss": 0.0896, "mean_token_accuracy": 0.9797489643096924, "step": 237 }, { "epoch": 1.017094017094017, "grad_norm": 3.417285203933716, "learning_rate": 6.152501008099009e-06, "loss": 0.1086, "mean_token_accuracy": 0.9717546105384827, "step": 238 }, { "epoch": 1.0213675213675213, "grad_norm": 3.2675163745880127, "learning_rate": 6.119258066206333e-06, "loss": 0.1084, "mean_token_accuracy": 0.9739053249359131, "step": 239 }, { "epoch": 1.0256410256410255, "grad_norm": 3.1262497901916504, "learning_rate": 6.085980641300278e-06, "loss": 0.0979, "mean_token_accuracy": 0.9745363593101501, "step": 240 }, { "epoch": 1.0299145299145298, "grad_norm": 2.747432231903076, "learning_rate": 6.052670586414255e-06, "loss": 0.0991, "mean_token_accuracy": 0.9759474396705627, "step": 241 }, { "epoch": 1.0341880341880343, "grad_norm": 3.2090530395507812, "learning_rate": 6.019329756398661e-06, "loss": 0.1136, "mean_token_accuracy": 0.975712776184082, "step": 242 }, { "epoch": 1.0384615384615385, "grad_norm": 2.662107229232788, "learning_rate": 5.9859600078175836e-06, "loss": 0.0946, "mean_token_accuracy": 0.9759296774864197, "step": 243 }, { "epoch": 1.0427350427350428, "grad_norm": 2.873894214630127, "learning_rate": 5.952563198845427e-06, "loss": 0.1103, "mean_token_accuracy": 0.9721987247467041, "step": 244 }, { "epoch": 1.047008547008547, "grad_norm": 2.198923349380493, "learning_rate": 5.919141189163431e-06, "loss": 0.087, "mean_token_accuracy": 0.9801637530326843, "step": 245 }, { "epoch": 1.0512820512820513, "grad_norm": 2.7435288429260254, "learning_rate": 5.885695839856129e-06, "loss": 0.1029, "mean_token_accuracy": 0.9745575189590454, "step": 246 }, { "epoch": 1.0555555555555556, "grad_norm": 2.4154202938079834, "learning_rate": 5.852229013307704e-06, "loss": 0.1004, "mean_token_accuracy": 0.9773362278938293, "step": 247 }, { "epoch": 1.0598290598290598, "grad_norm": 1.7442923784255981, "learning_rate": 5.818742573098283e-06, "loss": 0.079, "mean_token_accuracy": 0.9817251563072205, "step": 248 }, { "epoch": 1.064102564102564, "grad_norm": 2.7456750869750977, "learning_rate": 5.785238383900172e-06, "loss": 0.0906, "mean_token_accuracy": 0.9783667325973511, "step": 249 }, { "epoch": 1.0683760683760684, "grad_norm": 3.0801470279693604, "learning_rate": 5.75171831137402e-06, "loss": 0.1113, "mean_token_accuracy": 0.970004677772522, "step": 250 }, { "epoch": 1.0726495726495726, "grad_norm": 1.9699081182479858, "learning_rate": 5.7181842220649245e-06, "loss": 0.0847, "mean_token_accuracy": 0.9803862571716309, "step": 251 }, { "epoch": 1.0769230769230769, "grad_norm": 2.0998027324676514, "learning_rate": 5.6846379832985046e-06, "loss": 0.0942, "mean_token_accuracy": 0.9790985584259033, "step": 252 }, { "epoch": 1.0811965811965811, "grad_norm": 2.5801784992218018, "learning_rate": 5.651081463076911e-06, "loss": 0.0972, "mean_token_accuracy": 0.9760162234306335, "step": 253 }, { "epoch": 1.0854700854700854, "grad_norm": 2.551215171813965, "learning_rate": 5.617516529974812e-06, "loss": 0.1051, "mean_token_accuracy": 0.9762815833091736, "step": 254 }, { "epoch": 1.0897435897435896, "grad_norm": 2.5487148761749268, "learning_rate": 5.583945053035346e-06, "loss": 0.0921, "mean_token_accuracy": 0.9765682816505432, "step": 255 }, { "epoch": 1.0940170940170941, "grad_norm": 2.5740602016448975, "learning_rate": 5.550368901666031e-06, "loss": 0.1004, "mean_token_accuracy": 0.9755838513374329, "step": 256 }, { "epoch": 1.0982905982905984, "grad_norm": 2.338292121887207, "learning_rate": 5.5167899455346875e-06, "loss": 0.087, "mean_token_accuracy": 0.9767951369285583, "step": 257 }, { "epoch": 1.1025641025641026, "grad_norm": 2.2318756580352783, "learning_rate": 5.483210054465313e-06, "loss": 0.0846, "mean_token_accuracy": 0.9803149700164795, "step": 258 }, { "epoch": 1.106837606837607, "grad_norm": 2.467461347579956, "learning_rate": 5.449631098333971e-06, "loss": 0.0988, "mean_token_accuracy": 0.9754037857055664, "step": 259 }, { "epoch": 1.1111111111111112, "grad_norm": 2.1578803062438965, "learning_rate": 5.416054946964657e-06, "loss": 0.0848, "mean_token_accuracy": 0.9773491024971008, "step": 260 }, { "epoch": 1.1153846153846154, "grad_norm": 2.831975221633911, "learning_rate": 5.382483470025188e-06, "loss": 0.1046, "mean_token_accuracy": 0.976597785949707, "step": 261 }, { "epoch": 1.1196581196581197, "grad_norm": 2.9957687854766846, "learning_rate": 5.34891853692309e-06, "loss": 0.1071, "mean_token_accuracy": 0.9731133580207825, "step": 262 }, { "epoch": 1.123931623931624, "grad_norm": 2.807539701461792, "learning_rate": 5.315362016701496e-06, "loss": 0.1042, "mean_token_accuracy": 0.9745635390281677, "step": 263 }, { "epoch": 1.1282051282051282, "grad_norm": 2.411320447921753, "learning_rate": 5.281815777935077e-06, "loss": 0.0939, "mean_token_accuracy": 0.9777224063873291, "step": 264 }, { "epoch": 1.1324786324786325, "grad_norm": 2.6769258975982666, "learning_rate": 5.248281688625984e-06, "loss": 0.1019, "mean_token_accuracy": 0.9765105247497559, "step": 265 }, { "epoch": 1.1367521367521367, "grad_norm": 2.6799635887145996, "learning_rate": 5.214761616099831e-06, "loss": 0.1106, "mean_token_accuracy": 0.9744277596473694, "step": 266 }, { "epoch": 1.141025641025641, "grad_norm": 2.4581170082092285, "learning_rate": 5.18125742690172e-06, "loss": 0.0975, "mean_token_accuracy": 0.9779106974601746, "step": 267 }, { "epoch": 1.1452991452991452, "grad_norm": 2.4275119304656982, "learning_rate": 5.147770986692298e-06, "loss": 0.0896, "mean_token_accuracy": 0.977669358253479, "step": 268 }, { "epoch": 1.1495726495726495, "grad_norm": 2.197598934173584, "learning_rate": 5.114304160143873e-06, "loss": 0.0904, "mean_token_accuracy": 0.9766778349876404, "step": 269 }, { "epoch": 1.1538461538461537, "grad_norm": 2.2721002101898193, "learning_rate": 5.08085881083657e-06, "loss": 0.0897, "mean_token_accuracy": 0.977644681930542, "step": 270 }, { "epoch": 1.158119658119658, "grad_norm": 2.1074318885803223, "learning_rate": 5.047436801154575e-06, "loss": 0.0877, "mean_token_accuracy": 0.9794923067092896, "step": 271 }, { "epoch": 1.1623931623931625, "grad_norm": 2.6289446353912354, "learning_rate": 5.014039992182417e-06, "loss": 0.0994, "mean_token_accuracy": 0.9763491153717041, "step": 272 }, { "epoch": 1.1666666666666667, "grad_norm": 2.1914560794830322, "learning_rate": 4.980670243601341e-06, "loss": 0.087, "mean_token_accuracy": 0.9788976907730103, "step": 273 }, { "epoch": 1.170940170940171, "grad_norm": 2.311638116836548, "learning_rate": 4.947329413585746e-06, "loss": 0.092, "mean_token_accuracy": 0.9794085025787354, "step": 274 }, { "epoch": 1.1752136752136753, "grad_norm": 2.0195963382720947, "learning_rate": 4.914019358699725e-06, "loss": 0.0755, "mean_token_accuracy": 0.9825838804244995, "step": 275 }, { "epoch": 1.1794871794871795, "grad_norm": 2.5088884830474854, "learning_rate": 4.880741933793669e-06, "loss": 0.1063, "mean_token_accuracy": 0.9738301038742065, "step": 276 }, { "epoch": 1.1837606837606838, "grad_norm": 2.246293544769287, "learning_rate": 4.8474989919009915e-06, "loss": 0.0943, "mean_token_accuracy": 0.9782775044441223, "step": 277 }, { "epoch": 1.188034188034188, "grad_norm": 2.261032819747925, "learning_rate": 4.8142923841349435e-06, "loss": 0.0944, "mean_token_accuracy": 0.9763620495796204, "step": 278 }, { "epoch": 1.1923076923076923, "grad_norm": 2.5907723903656006, "learning_rate": 4.781123959585526e-06, "loss": 0.1054, "mean_token_accuracy": 0.9734842777252197, "step": 279 }, { "epoch": 1.1965811965811965, "grad_norm": 2.166476011276245, "learning_rate": 4.747995565216532e-06, "loss": 0.0817, "mean_token_accuracy": 0.9802180528640747, "step": 280 }, { "epoch": 1.2008547008547008, "grad_norm": 2.6413872241973877, "learning_rate": 4.714909045762702e-06, "loss": 0.1075, "mean_token_accuracy": 0.9746893644332886, "step": 281 }, { "epoch": 1.205128205128205, "grad_norm": 2.4932637214660645, "learning_rate": 4.681866243626992e-06, "loss": 0.1068, "mean_token_accuracy": 0.9767117500305176, "step": 282 }, { "epoch": 1.2094017094017093, "grad_norm": 1.9535247087478638, "learning_rate": 4.64886899877799e-06, "loss": 0.0828, "mean_token_accuracy": 0.9810839295387268, "step": 283 }, { "epoch": 1.2136752136752136, "grad_norm": 2.248490810394287, "learning_rate": 4.615919148647449e-06, "loss": 0.0932, "mean_token_accuracy": 0.9774066805839539, "step": 284 }, { "epoch": 1.217948717948718, "grad_norm": 2.524912118911743, "learning_rate": 4.583018528027976e-06, "loss": 0.0975, "mean_token_accuracy": 0.9775086641311646, "step": 285 }, { "epoch": 1.2222222222222223, "grad_norm": 2.622223138809204, "learning_rate": 4.550168968970869e-06, "loss": 0.0936, "mean_token_accuracy": 0.9763756990432739, "step": 286 }, { "epoch": 1.2264957264957266, "grad_norm": 2.0964972972869873, "learning_rate": 4.517372300684087e-06, "loss": 0.0842, "mean_token_accuracy": 0.9804538488388062, "step": 287 }, { "epoch": 1.2307692307692308, "grad_norm": 2.625102996826172, "learning_rate": 4.484630349430398e-06, "loss": 0.0949, "mean_token_accuracy": 0.975182294845581, "step": 288 }, { "epoch": 1.235042735042735, "grad_norm": 2.5802104473114014, "learning_rate": 4.45194493842569e-06, "loss": 0.0979, "mean_token_accuracy": 0.9763470888137817, "step": 289 }, { "epoch": 1.2393162393162394, "grad_norm": 2.3211922645568848, "learning_rate": 4.419317887737434e-06, "loss": 0.0882, "mean_token_accuracy": 0.9776690006256104, "step": 290 }, { "epoch": 1.2435897435897436, "grad_norm": 2.8441874980926514, "learning_rate": 4.386751014183352e-06, "loss": 0.1091, "mean_token_accuracy": 0.9710574746131897, "step": 291 }, { "epoch": 1.2478632478632479, "grad_norm": 2.6006760597229004, "learning_rate": 4.3542461312302264e-06, "loss": 0.098, "mean_token_accuracy": 0.9752784967422485, "step": 292 }, { "epoch": 1.2521367521367521, "grad_norm": 2.247154712677002, "learning_rate": 4.321805048892942e-06, "loss": 0.0833, "mean_token_accuracy": 0.9812250137329102, "step": 293 }, { "epoch": 1.2564102564102564, "grad_norm": 2.42104172706604, "learning_rate": 4.2894295736336725e-06, "loss": 0.0872, "mean_token_accuracy": 0.9812902212142944, "step": 294 }, { "epoch": 1.2606837606837606, "grad_norm": 2.7402937412261963, "learning_rate": 4.257121508261311e-06, "loss": 0.0995, "mean_token_accuracy": 0.9761265516281128, "step": 295 }, { "epoch": 1.264957264957265, "grad_norm": 2.2438583374023438, "learning_rate": 4.224882651831067e-06, "loss": 0.0892, "mean_token_accuracy": 0.9804651141166687, "step": 296 }, { "epoch": 1.2692307692307692, "grad_norm": 2.573054075241089, "learning_rate": 4.192714799544293e-06, "loss": 0.1079, "mean_token_accuracy": 0.9739599227905273, "step": 297 }, { "epoch": 1.2735042735042734, "grad_norm": 2.515164852142334, "learning_rate": 4.1606197426485175e-06, "loss": 0.1055, "mean_token_accuracy": 0.9740031957626343, "step": 298 }, { "epoch": 1.2777777777777777, "grad_norm": 2.208859920501709, "learning_rate": 4.128599268337699e-06, "loss": 0.0816, "mean_token_accuracy": 0.9804835319519043, "step": 299 }, { "epoch": 1.282051282051282, "grad_norm": 2.472729444503784, "learning_rate": 4.096655159652717e-06, "loss": 0.0953, "mean_token_accuracy": 0.9765296578407288, "step": 300 }, { "epoch": 1.2863247863247862, "grad_norm": 2.4752037525177, "learning_rate": 4.064789195382068e-06, "loss": 0.0997, "mean_token_accuracy": 0.9748784303665161, "step": 301 }, { "epoch": 1.2905982905982907, "grad_norm": 2.179346799850464, "learning_rate": 4.033003149962833e-06, "loss": 0.0883, "mean_token_accuracy": 0.9797558784484863, "step": 302 }, { "epoch": 1.294871794871795, "grad_norm": 2.3181893825531006, "learning_rate": 4.00129879338185e-06, "loss": 0.0914, "mean_token_accuracy": 0.974800705909729, "step": 303 }, { "epoch": 1.2991452991452992, "grad_norm": 2.3745031356811523, "learning_rate": 3.96967789107717e-06, "loss": 0.0958, "mean_token_accuracy": 0.9788101315498352, "step": 304 }, { "epoch": 1.3034188034188035, "grad_norm": 2.3366518020629883, "learning_rate": 3.9381422038397395e-06, "loss": 0.0867, "mean_token_accuracy": 0.9784852266311646, "step": 305 }, { "epoch": 1.3076923076923077, "grad_norm": 2.771221160888672, "learning_rate": 3.906693487715358e-06, "loss": 0.1107, "mean_token_accuracy": 0.9715330600738525, "step": 306 }, { "epoch": 1.311965811965812, "grad_norm": 2.3197829723358154, "learning_rate": 3.87533349390689e-06, "loss": 0.0947, "mean_token_accuracy": 0.9761702418327332, "step": 307 }, { "epoch": 1.3162393162393162, "grad_norm": 2.152040719985962, "learning_rate": 3.844063968676748e-06, "loss": 0.0928, "mean_token_accuracy": 0.9783110022544861, "step": 308 }, { "epoch": 1.3205128205128205, "grad_norm": 2.331920623779297, "learning_rate": 3.8128866532496576e-06, "loss": 0.0911, "mean_token_accuracy": 0.9766107797622681, "step": 309 }, { "epoch": 1.3247863247863247, "grad_norm": 2.161576986312866, "learning_rate": 3.7818032837157006e-06, "loss": 0.0878, "mean_token_accuracy": 0.9797548055648804, "step": 310 }, { "epoch": 1.329059829059829, "grad_norm": 2.173659086227417, "learning_rate": 3.750815590933633e-06, "loss": 0.0912, "mean_token_accuracy": 0.9780091047286987, "step": 311 }, { "epoch": 1.3333333333333333, "grad_norm": 2.7930123805999756, "learning_rate": 3.7199253004345114e-06, "loss": 0.1133, "mean_token_accuracy": 0.9729244112968445, "step": 312 }, { "epoch": 1.3376068376068377, "grad_norm": 2.3382835388183594, "learning_rate": 3.6891341323256047e-06, "loss": 0.0904, "mean_token_accuracy": 0.9790826439857483, "step": 313 }, { "epoch": 1.341880341880342, "grad_norm": 1.9892425537109375, "learning_rate": 3.65844380119461e-06, "loss": 0.0868, "mean_token_accuracy": 0.9794397950172424, "step": 314 }, { "epoch": 1.3461538461538463, "grad_norm": 2.2188029289245605, "learning_rate": 3.6278560160141774e-06, "loss": 0.0878, "mean_token_accuracy": 0.9780125617980957, "step": 315 }, { "epoch": 1.3504273504273505, "grad_norm": 2.6905603408813477, "learning_rate": 3.597372480046749e-06, "loss": 0.0933, "mean_token_accuracy": 0.9762943387031555, "step": 316 }, { "epoch": 1.3547008547008548, "grad_norm": 2.358496904373169, "learning_rate": 3.5669948907497108e-06, "loss": 0.0851, "mean_token_accuracy": 0.9798140525817871, "step": 317 }, { "epoch": 1.358974358974359, "grad_norm": 2.5938258171081543, "learning_rate": 3.5367249396808733e-06, "loss": 0.0974, "mean_token_accuracy": 0.9744832515716553, "step": 318 }, { "epoch": 1.3632478632478633, "grad_norm": 2.1112005710601807, "learning_rate": 3.5065643124042746e-06, "loss": 0.0861, "mean_token_accuracy": 0.9812760949134827, "step": 319 }, { "epoch": 1.3675213675213675, "grad_norm": 2.4144303798675537, "learning_rate": 3.4765146883963263e-06, "loss": 0.0952, "mean_token_accuracy": 0.9786186814308167, "step": 320 }, { "epoch": 1.3717948717948718, "grad_norm": 2.3461201190948486, "learning_rate": 3.4465777409522915e-06, "loss": 0.0883, "mean_token_accuracy": 0.9787299633026123, "step": 321 }, { "epoch": 1.376068376068376, "grad_norm": 2.0723886489868164, "learning_rate": 3.4167551370930955e-06, "loss": 0.0851, "mean_token_accuracy": 0.9797854423522949, "step": 322 }, { "epoch": 1.3803418803418803, "grad_norm": 2.3695363998413086, "learning_rate": 3.3870485374725217e-06, "loss": 0.0949, "mean_token_accuracy": 0.9754375219345093, "step": 323 }, { "epoch": 1.3846153846153846, "grad_norm": 2.0983939170837402, "learning_rate": 3.3574595962847234e-06, "loss": 0.0816, "mean_token_accuracy": 0.9791032671928406, "step": 324 }, { "epoch": 1.3888888888888888, "grad_norm": 2.357313871383667, "learning_rate": 3.327989961172112e-06, "loss": 0.0934, "mean_token_accuracy": 0.9782047271728516, "step": 325 }, { "epoch": 1.393162393162393, "grad_norm": 2.5017433166503906, "learning_rate": 3.2986412731336184e-06, "loss": 0.0876, "mean_token_accuracy": 0.9789897203445435, "step": 326 }, { "epoch": 1.3974358974358974, "grad_norm": 2.283299684524536, "learning_rate": 3.269415166433297e-06, "loss": 0.0828, "mean_token_accuracy": 0.9798080921173096, "step": 327 }, { "epoch": 1.4017094017094016, "grad_norm": 2.114645481109619, "learning_rate": 3.2403132685093455e-06, "loss": 0.0856, "mean_token_accuracy": 0.9803335666656494, "step": 328 }, { "epoch": 1.4059829059829059, "grad_norm": 2.050668478012085, "learning_rate": 3.2113371998834677e-06, "loss": 0.0836, "mean_token_accuracy": 0.980510950088501, "step": 329 }, { "epoch": 1.4102564102564101, "grad_norm": 2.2558069229125977, "learning_rate": 3.1824885740706323e-06, "loss": 0.08, "mean_token_accuracy": 0.9798475503921509, "step": 330 }, { "epoch": 1.4145299145299146, "grad_norm": 2.7303566932678223, "learning_rate": 3.1537689974892393e-06, "loss": 0.1006, "mean_token_accuracy": 0.9758530855178833, "step": 331 }, { "epoch": 1.4188034188034189, "grad_norm": 2.5855560302734375, "learning_rate": 3.125180069371655e-06, "loss": 0.0972, "mean_token_accuracy": 0.9753068089485168, "step": 332 }, { "epoch": 1.4230769230769231, "grad_norm": 2.2191054821014404, "learning_rate": 3.0967233816751657e-06, "loss": 0.0871, "mean_token_accuracy": 0.9791666865348816, "step": 333 }, { "epoch": 1.4273504273504274, "grad_norm": 2.373244285583496, "learning_rate": 3.0684005189933317e-06, "loss": 0.0879, "mean_token_accuracy": 0.9779411554336548, "step": 334 }, { "epoch": 1.4316239316239316, "grad_norm": 2.3935558795928955, "learning_rate": 3.040213058467746e-06, "loss": 0.0944, "mean_token_accuracy": 0.9777158498764038, "step": 335 }, { "epoch": 1.435897435897436, "grad_norm": 2.688636302947998, "learning_rate": 3.012162569700209e-06, "loss": 0.099, "mean_token_accuracy": 0.9758201241493225, "step": 336 }, { "epoch": 1.4401709401709402, "grad_norm": 2.169508934020996, "learning_rate": 2.9842506146653395e-06, "loss": 0.0875, "mean_token_accuracy": 0.9801456928253174, "step": 337 }, { "epoch": 1.4444444444444444, "grad_norm": 2.194892168045044, "learning_rate": 2.9564787476235828e-06, "loss": 0.0803, "mean_token_accuracy": 0.9811508059501648, "step": 338 }, { "epoch": 1.4487179487179487, "grad_norm": 2.4856936931610107, "learning_rate": 2.928848515034673e-06, "loss": 0.0882, "mean_token_accuracy": 0.9796777367591858, "step": 339 }, { "epoch": 1.452991452991453, "grad_norm": 2.5571961402893066, "learning_rate": 2.9013614554715084e-06, "loss": 0.0968, "mean_token_accuracy": 0.9769884347915649, "step": 340 }, { "epoch": 1.4572649572649572, "grad_norm": 2.440589427947998, "learning_rate": 2.8740190995344908e-06, "loss": 0.0924, "mean_token_accuracy": 0.9782286882400513, "step": 341 }, { "epoch": 1.4615384615384617, "grad_norm": 2.2676053047180176, "learning_rate": 2.846822969766281e-06, "loss": 0.0802, "mean_token_accuracy": 0.9807692170143127, "step": 342 }, { "epoch": 1.465811965811966, "grad_norm": 2.512946605682373, "learning_rate": 2.8197745805670274e-06, "loss": 0.1079, "mean_token_accuracy": 0.9728260636329651, "step": 343 }, { "epoch": 1.4700854700854702, "grad_norm": 2.4237072467803955, "learning_rate": 2.792875438110033e-06, "loss": 0.0919, "mean_token_accuracy": 0.9775099158287048, "step": 344 }, { "epoch": 1.4743589743589745, "grad_norm": 2.0417165756225586, "learning_rate": 2.766127040257884e-06, "loss": 0.082, "mean_token_accuracy": 0.9809138178825378, "step": 345 }, { "epoch": 1.4786324786324787, "grad_norm": 2.1946988105773926, "learning_rate": 2.739530876479048e-06, "loss": 0.0853, "mean_token_accuracy": 0.9817578792572021, "step": 346 }, { "epoch": 1.482905982905983, "grad_norm": 2.1269588470458984, "learning_rate": 2.7130884277649215e-06, "loss": 0.0887, "mean_token_accuracy": 0.978447675704956, "step": 347 }, { "epoch": 1.4871794871794872, "grad_norm": 2.0282142162323, "learning_rate": 2.6868011665473777e-06, "loss": 0.0818, "mean_token_accuracy": 0.981675386428833, "step": 348 }, { "epoch": 1.4914529914529915, "grad_norm": 2.108011484146118, "learning_rate": 2.660670556616768e-06, "loss": 0.086, "mean_token_accuracy": 0.9770215153694153, "step": 349 }, { "epoch": 1.4957264957264957, "grad_norm": 2.2282185554504395, "learning_rate": 2.634698053040401e-06, "loss": 0.1159, "mean_token_accuracy": 0.9792807698249817, "step": 350 }, { "epoch": 1.5, "grad_norm": 1.9703973531723022, "learning_rate": 2.608885102081539e-06, "loss": 0.0821, "mean_token_accuracy": 0.9803772568702698, "step": 351 }, { "epoch": 1.5042735042735043, "grad_norm": 2.060464859008789, "learning_rate": 2.5832331411188476e-06, "loss": 0.0854, "mean_token_accuracy": 0.9798780679702759, "step": 352 }, { "epoch": 1.5085470085470085, "grad_norm": 2.1341521739959717, "learning_rate": 2.5577435985663614e-06, "loss": 0.0816, "mean_token_accuracy": 0.9788466691970825, "step": 353 }, { "epoch": 1.5128205128205128, "grad_norm": 1.9853061437606812, "learning_rate": 2.5324178937939436e-06, "loss": 0.0825, "mean_token_accuracy": 0.9808792471885681, "step": 354 }, { "epoch": 1.517094017094017, "grad_norm": 2.1807961463928223, "learning_rate": 2.5072574370482493e-06, "loss": 0.081, "mean_token_accuracy": 0.9800437092781067, "step": 355 }, { "epoch": 1.5213675213675213, "grad_norm": 1.9949653148651123, "learning_rate": 2.482263629374197e-06, "loss": 0.0804, "mean_token_accuracy": 0.9815117120742798, "step": 356 }, { "epoch": 1.5256410256410255, "grad_norm": 2.1713290214538574, "learning_rate": 2.457437862536953e-06, "loss": 0.0785, "mean_token_accuracy": 0.9811709523200989, "step": 357 }, { "epoch": 1.5299145299145298, "grad_norm": 2.1930902004241943, "learning_rate": 2.4327815189444255e-06, "loss": 0.0897, "mean_token_accuracy": 0.9784946441650391, "step": 358 }, { "epoch": 1.534188034188034, "grad_norm": 2.5312485694885254, "learning_rate": 2.408295971570297e-06, "loss": 0.0921, "mean_token_accuracy": 0.9755170941352844, "step": 359 }, { "epoch": 1.5384615384615383, "grad_norm": 2.149301052093506, "learning_rate": 2.38398258387756e-06, "loss": 0.0844, "mean_token_accuracy": 0.9801076650619507, "step": 360 }, { "epoch": 1.5427350427350426, "grad_norm": 1.987364649772644, "learning_rate": 2.359842709742603e-06, "loss": 0.0895, "mean_token_accuracy": 0.980217456817627, "step": 361 }, { "epoch": 1.547008547008547, "grad_norm": 2.093104839324951, "learning_rate": 2.3358776933798166e-06, "loss": 0.0757, "mean_token_accuracy": 0.9816683530807495, "step": 362 }, { "epoch": 1.5512820512820513, "grad_norm": 1.9293057918548584, "learning_rate": 2.3120888692667358e-06, "loss": 0.0768, "mean_token_accuracy": 0.9828811883926392, "step": 363 }, { "epoch": 1.5555555555555556, "grad_norm": 1.9935529232025146, "learning_rate": 2.28847756206974e-06, "loss": 0.083, "mean_token_accuracy": 0.9815475940704346, "step": 364 }, { "epoch": 1.5598290598290598, "grad_norm": 2.6537835597991943, "learning_rate": 2.2650450865702876e-06, "loss": 0.0837, "mean_token_accuracy": 0.9782962799072266, "step": 365 }, { "epoch": 1.564102564102564, "grad_norm": 2.2035484313964844, "learning_rate": 2.241792747591695e-06, "loss": 0.0971, "mean_token_accuracy": 0.9779199957847595, "step": 366 }, { "epoch": 1.5683760683760684, "grad_norm": 1.952289342880249, "learning_rate": 2.2187218399264933e-06, "loss": 0.0765, "mean_token_accuracy": 0.9826548099517822, "step": 367 }, { "epoch": 1.5726495726495726, "grad_norm": 2.5793616771698, "learning_rate": 2.1958336482643123e-06, "loss": 0.0884, "mean_token_accuracy": 0.9791607856750488, "step": 368 }, { "epoch": 1.5769230769230769, "grad_norm": 2.2669849395751953, "learning_rate": 2.1731294471203543e-06, "loss": 0.084, "mean_token_accuracy": 0.9792264699935913, "step": 369 }, { "epoch": 1.5811965811965814, "grad_norm": 2.0273962020874023, "learning_rate": 2.1506105007644216e-06, "loss": 0.0834, "mean_token_accuracy": 0.9810412526130676, "step": 370 }, { "epoch": 1.5854700854700856, "grad_norm": 2.562767505645752, "learning_rate": 2.128278063150511e-06, "loss": 0.0956, "mean_token_accuracy": 0.9758344888687134, "step": 371 }, { "epoch": 1.5897435897435899, "grad_norm": 2.866821765899658, "learning_rate": 2.106133377846996e-06, "loss": 0.0939, "mean_token_accuracy": 0.9768562912940979, "step": 372 }, { "epoch": 1.5940170940170941, "grad_norm": 2.002687931060791, "learning_rate": 2.0841776779673715e-06, "loss": 0.088, "mean_token_accuracy": 0.9787533283233643, "step": 373 }, { "epoch": 1.5982905982905984, "grad_norm": 2.1603848934173584, "learning_rate": 2.062412186101596e-06, "loss": 0.087, "mean_token_accuracy": 0.9783121347427368, "step": 374 }, { "epoch": 1.6025641025641026, "grad_norm": 2.0110819339752197, "learning_rate": 2.0408381142480095e-06, "loss": 0.081, "mean_token_accuracy": 0.9819519519805908, "step": 375 }, { "epoch": 1.606837606837607, "grad_norm": 2.259678840637207, "learning_rate": 2.019456663745839e-06, "loss": 0.0916, "mean_token_accuracy": 0.9787756204605103, "step": 376 }, { "epoch": 1.6111111111111112, "grad_norm": 2.026271343231201, "learning_rate": 1.9982690252083127e-06, "loss": 0.0798, "mean_token_accuracy": 0.9790022373199463, "step": 377 }, { "epoch": 1.6153846153846154, "grad_norm": 2.129807472229004, "learning_rate": 1.977276378456352e-06, "loss": 0.0762, "mean_token_accuracy": 0.9800582528114319, "step": 378 }, { "epoch": 1.6196581196581197, "grad_norm": 2.276139259338379, "learning_rate": 1.956479892452878e-06, "loss": 0.0944, "mean_token_accuracy": 0.9772907495498657, "step": 379 }, { "epoch": 1.623931623931624, "grad_norm": 1.944844126701355, "learning_rate": 1.9358807252377226e-06, "loss": 0.0786, "mean_token_accuracy": 0.9809095859527588, "step": 380 }, { "epoch": 1.6282051282051282, "grad_norm": 2.225731611251831, "learning_rate": 1.9154800238631344e-06, "loss": 0.0839, "mean_token_accuracy": 0.9785885214805603, "step": 381 }, { "epoch": 1.6324786324786325, "grad_norm": 2.48402738571167, "learning_rate": 1.8952789243299141e-06, "loss": 0.0859, "mean_token_accuracy": 0.9793146252632141, "step": 382 }, { "epoch": 1.6367521367521367, "grad_norm": 2.5232338905334473, "learning_rate": 1.8752785515241536e-06, "loss": 0.0978, "mean_token_accuracy": 0.9753180146217346, "step": 383 }, { "epoch": 1.641025641025641, "grad_norm": 2.413363456726074, "learning_rate": 1.8554800191545957e-06, "loss": 0.0961, "mean_token_accuracy": 0.9776303768157959, "step": 384 }, { "epoch": 1.6452991452991452, "grad_norm": 3.013465404510498, "learning_rate": 1.8358844296906213e-06, "loss": 0.1043, "mean_token_accuracy": 0.9743627309799194, "step": 385 }, { "epoch": 1.6495726495726495, "grad_norm": 1.9660561084747314, "learning_rate": 1.8164928743008564e-06, "loss": 0.0783, "mean_token_accuracy": 0.9812124967575073, "step": 386 }, { "epoch": 1.6538461538461537, "grad_norm": 1.8742467164993286, "learning_rate": 1.7973064327924128e-06, "loss": 0.0813, "mean_token_accuracy": 0.9807978272438049, "step": 387 }, { "epoch": 1.658119658119658, "grad_norm": 2.1209309101104736, "learning_rate": 1.778326173550761e-06, "loss": 0.0954, "mean_token_accuracy": 0.9767441749572754, "step": 388 }, { "epoch": 1.6623931623931623, "grad_norm": 2.1338541507720947, "learning_rate": 1.7595531534802317e-06, "loss": 0.0943, "mean_token_accuracy": 0.9795450568199158, "step": 389 }, { "epoch": 1.6666666666666665, "grad_norm": 1.9863265752792358, "learning_rate": 1.7409884179451714e-06, "loss": 0.0888, "mean_token_accuracy": 0.977613091468811, "step": 390 }, { "epoch": 1.6709401709401708, "grad_norm": 2.217792272567749, "learning_rate": 1.7226330007117231e-06, "loss": 0.0915, "mean_token_accuracy": 0.9778071641921997, "step": 391 }, { "epoch": 1.6752136752136753, "grad_norm": 2.277088165283203, "learning_rate": 1.7044879238902675e-06, "loss": 0.0923, "mean_token_accuracy": 0.9758321046829224, "step": 392 }, { "epoch": 1.6794871794871795, "grad_norm": 2.33744740486145, "learning_rate": 1.6865541978785083e-06, "loss": 0.0888, "mean_token_accuracy": 0.9786233901977539, "step": 393 }, { "epoch": 1.6837606837606838, "grad_norm": 2.055072546005249, "learning_rate": 1.6688328213052018e-06, "loss": 0.0815, "mean_token_accuracy": 0.9813379645347595, "step": 394 }, { "epoch": 1.688034188034188, "grad_norm": 2.1897664070129395, "learning_rate": 1.6513247809745587e-06, "loss": 0.0911, "mean_token_accuracy": 0.9775086641311646, "step": 395 }, { "epoch": 1.6923076923076923, "grad_norm": 2.0002174377441406, "learning_rate": 1.634031051811284e-06, "loss": 0.0717, "mean_token_accuracy": 0.9830272793769836, "step": 396 }, { "epoch": 1.6965811965811965, "grad_norm": 2.155630111694336, "learning_rate": 1.6169525968062965e-06, "loss": 0.0884, "mean_token_accuracy": 0.9808605909347534, "step": 397 }, { "epoch": 1.7008547008547008, "grad_norm": 1.7671974897384644, "learning_rate": 1.6000903669631052e-06, "loss": 0.0826, "mean_token_accuracy": 0.980997622013092, "step": 398 }, { "epoch": 1.7051282051282053, "grad_norm": 2.4356114864349365, "learning_rate": 1.5834453012448455e-06, "loss": 0.0939, "mean_token_accuracy": 0.9777652025222778, "step": 399 }, { "epoch": 1.7094017094017095, "grad_norm": 2.3396825790405273, "learning_rate": 1.5670183265220046e-06, "loss": 0.0909, "mean_token_accuracy": 0.9789909720420837, "step": 400 }, { "epoch": 1.7136752136752138, "grad_norm": 2.167603015899658, "learning_rate": 1.5508103575207989e-06, "loss": 0.0854, "mean_token_accuracy": 0.9791582822799683, "step": 401 }, { "epoch": 1.717948717948718, "grad_norm": 1.838385820388794, "learning_rate": 1.5348222967722451e-06, "loss": 0.0786, "mean_token_accuracy": 0.9818563461303711, "step": 402 }, { "epoch": 1.7222222222222223, "grad_norm": 2.2510430812835693, "learning_rate": 1.5190550345619021e-06, "loss": 0.0931, "mean_token_accuracy": 0.9793668985366821, "step": 403 }, { "epoch": 1.7264957264957266, "grad_norm": 1.8351444005966187, "learning_rate": 1.503509448880292e-06, "loss": 0.078, "mean_token_accuracy": 0.9809679388999939, "step": 404 }, { "epoch": 1.7307692307692308, "grad_norm": 2.311199188232422, "learning_rate": 1.4881864053740154e-06, "loss": 0.0862, "mean_token_accuracy": 0.9793581366539001, "step": 405 }, { "epoch": 1.735042735042735, "grad_norm": 2.3395721912384033, "learning_rate": 1.473086757297543e-06, "loss": 0.0812, "mean_token_accuracy": 0.9812895655632019, "step": 406 }, { "epoch": 1.7393162393162394, "grad_norm": 2.0679736137390137, "learning_rate": 1.4582113454657057e-06, "loss": 0.0876, "mean_token_accuracy": 0.9811984300613403, "step": 407 }, { "epoch": 1.7435897435897436, "grad_norm": 2.3800151348114014, "learning_rate": 1.4435609982068766e-06, "loss": 0.0877, "mean_token_accuracy": 0.9787911176681519, "step": 408 }, { "epoch": 1.7478632478632479, "grad_norm": 2.3753533363342285, "learning_rate": 1.4291365313168393e-06, "loss": 0.0889, "mean_token_accuracy": 0.9775543808937073, "step": 409 }, { "epoch": 1.7521367521367521, "grad_norm": 2.0593650341033936, "learning_rate": 1.4149387480133674e-06, "loss": 0.0897, "mean_token_accuracy": 0.9783128499984741, "step": 410 }, { "epoch": 1.7564102564102564, "grad_norm": 2.45052170753479, "learning_rate": 1.4009684388914957e-06, "loss": 0.0955, "mean_token_accuracy": 0.9760656952857971, "step": 411 }, { "epoch": 1.7606837606837606, "grad_norm": 2.1106207370758057, "learning_rate": 1.3872263818794918e-06, "loss": 0.0989, "mean_token_accuracy": 0.9786368012428284, "step": 412 }, { "epoch": 1.764957264957265, "grad_norm": 2.3723652362823486, "learning_rate": 1.373713342195548e-06, "loss": 0.0878, "mean_token_accuracy": 0.9786877632141113, "step": 413 }, { "epoch": 1.7692307692307692, "grad_norm": 1.7907770872116089, "learning_rate": 1.3604300723051571e-06, "loss": 0.0767, "mean_token_accuracy": 0.9808289408683777, "step": 414 }, { "epoch": 1.7735042735042734, "grad_norm": 2.0177221298217773, "learning_rate": 1.347377311879225e-06, "loss": 0.0903, "mean_token_accuracy": 0.9788788557052612, "step": 415 }, { "epoch": 1.7777777777777777, "grad_norm": 2.277787446975708, "learning_rate": 1.3345557877528737e-06, "loss": 0.0931, "mean_token_accuracy": 0.9771180152893066, "step": 416 }, { "epoch": 1.782051282051282, "grad_norm": 1.9423468112945557, "learning_rate": 1.3219662138849707e-06, "loss": 0.082, "mean_token_accuracy": 0.9799258708953857, "step": 417 }, { "epoch": 1.7863247863247862, "grad_norm": 1.8951685428619385, "learning_rate": 1.3096092913183741e-06, "loss": 0.0753, "mean_token_accuracy": 0.9838992953300476, "step": 418 }, { "epoch": 1.7905982905982905, "grad_norm": 2.024014949798584, "learning_rate": 1.2974857081408935e-06, "loss": 0.0865, "mean_token_accuracy": 0.9807155132293701, "step": 419 }, { "epoch": 1.7948717948717947, "grad_norm": 2.1214489936828613, "learning_rate": 1.2855961394469728e-06, "loss": 0.089, "mean_token_accuracy": 0.9781718850135803, "step": 420 }, { "epoch": 1.7991452991452992, "grad_norm": 2.066615581512451, "learning_rate": 1.273941247300104e-06, "loss": 0.0831, "mean_token_accuracy": 0.9806697368621826, "step": 421 }, { "epoch": 1.8034188034188035, "grad_norm": 1.923561930656433, "learning_rate": 1.2625216806959522e-06, "loss": 0.0765, "mean_token_accuracy": 0.9830482602119446, "step": 422 }, { "epoch": 1.8076923076923077, "grad_norm": 1.823364496231079, "learning_rate": 1.2513380755262242e-06, "loss": 0.085, "mean_token_accuracy": 0.9803729057312012, "step": 423 }, { "epoch": 1.811965811965812, "grad_norm": 2.010810136795044, "learning_rate": 1.240391054543255e-06, "loss": 0.0804, "mean_token_accuracy": 0.9809787273406982, "step": 424 }, { "epoch": 1.8162393162393162, "grad_norm": 2.2021121978759766, "learning_rate": 1.2296812273253308e-06, "loss": 0.0832, "mean_token_accuracy": 0.9796929955482483, "step": 425 }, { "epoch": 1.8205128205128205, "grad_norm": 2.7726950645446777, "learning_rate": 1.2192091902427471e-06, "loss": 0.1055, "mean_token_accuracy": 0.9748052358627319, "step": 426 }, { "epoch": 1.8247863247863247, "grad_norm": 2.122213363647461, "learning_rate": 1.2089755264245962e-06, "loss": 0.0836, "mean_token_accuracy": 0.9801605939865112, "step": 427 }, { "epoch": 1.8290598290598292, "grad_norm": 2.100584030151367, "learning_rate": 1.1989808057263e-06, "loss": 0.0844, "mean_token_accuracy": 0.9800516366958618, "step": 428 }, { "epoch": 1.8333333333333335, "grad_norm": 2.343102216720581, "learning_rate": 1.1892255846978764e-06, "loss": 0.0885, "mean_token_accuracy": 0.9799159169197083, "step": 429 }, { "epoch": 1.8376068376068377, "grad_norm": 2.159567356109619, "learning_rate": 1.179710406552947e-06, "loss": 0.0812, "mean_token_accuracy": 0.9803233742713928, "step": 430 }, { "epoch": 1.841880341880342, "grad_norm": 2.3904519081115723, "learning_rate": 1.1704358011384918e-06, "loss": 0.1003, "mean_token_accuracy": 0.9763982892036438, "step": 431 }, { "epoch": 1.8461538461538463, "grad_norm": 1.9930096864700317, "learning_rate": 1.1614022849053393e-06, "loss": 0.0836, "mean_token_accuracy": 0.9820988774299622, "step": 432 }, { "epoch": 1.8504273504273505, "grad_norm": 1.978892207145691, "learning_rate": 1.152610360879415e-06, "loss": 0.0847, "mean_token_accuracy": 0.979092001914978, "step": 433 }, { "epoch": 1.8547008547008548, "grad_norm": 2.216299295425415, "learning_rate": 1.1440605186337256e-06, "loss": 0.0872, "mean_token_accuracy": 0.9798073768615723, "step": 434 }, { "epoch": 1.858974358974359, "grad_norm": 1.9769878387451172, "learning_rate": 1.1357532342611006e-06, "loss": 0.0833, "mean_token_accuracy": 0.980143666267395, "step": 435 }, { "epoch": 1.8632478632478633, "grad_norm": 2.341925621032715, "learning_rate": 1.1276889703476789e-06, "loss": 0.0833, "mean_token_accuracy": 0.9784598350524902, "step": 436 }, { "epoch": 1.8675213675213675, "grad_norm": 2.1460580825805664, "learning_rate": 1.1198681759471524e-06, "loss": 0.0896, "mean_token_accuracy": 0.9792025089263916, "step": 437 }, { "epoch": 1.8717948717948718, "grad_norm": 2.2955880165100098, "learning_rate": 1.1122912865557579e-06, "loss": 0.0887, "mean_token_accuracy": 0.9784141182899475, "step": 438 }, { "epoch": 1.876068376068376, "grad_norm": 1.7442694902420044, "learning_rate": 1.1049587240880296e-06, "loss": 0.0691, "mean_token_accuracy": 0.9838666319847107, "step": 439 }, { "epoch": 1.8803418803418803, "grad_norm": 2.3275156021118164, "learning_rate": 1.0978708968533029e-06, "loss": 0.0863, "mean_token_accuracy": 0.9796561598777771, "step": 440 }, { "epoch": 1.8846153846153846, "grad_norm": 2.238799810409546, "learning_rate": 1.09102819953298e-06, "loss": 0.0989, "mean_token_accuracy": 0.9774456024169922, "step": 441 }, { "epoch": 1.8888888888888888, "grad_norm": 2.3636116981506348, "learning_rate": 1.0844310131585498e-06, "loss": 0.0878, "mean_token_accuracy": 0.9802519083023071, "step": 442 }, { "epoch": 1.893162393162393, "grad_norm": 2.082477331161499, "learning_rate": 1.0780797050903713e-06, "loss": 0.0819, "mean_token_accuracy": 0.9814638495445251, "step": 443 }, { "epoch": 1.8974358974358974, "grad_norm": 2.0771663188934326, "learning_rate": 1.07197462899722e-06, "loss": 0.0852, "mean_token_accuracy": 0.9814097881317139, "step": 444 }, { "epoch": 1.9017094017094016, "grad_norm": 2.476078510284424, "learning_rate": 1.066116124836589e-06, "loss": 0.0937, "mean_token_accuracy": 0.9798304438591003, "step": 445 }, { "epoch": 1.9059829059829059, "grad_norm": 1.598405122756958, "learning_rate": 1.0605045188357633e-06, "loss": 0.0712, "mean_token_accuracy": 0.9831634759902954, "step": 446 }, { "epoch": 1.9102564102564101, "grad_norm": 2.015641450881958, "learning_rate": 1.0551401234736524e-06, "loss": 0.0797, "mean_token_accuracy": 0.9820832014083862, "step": 447 }, { "epoch": 1.9145299145299144, "grad_norm": 2.4531426429748535, "learning_rate": 1.0500232374633884e-06, "loss": 0.0961, "mean_token_accuracy": 0.976710319519043, "step": 448 }, { "epoch": 1.9188034188034186, "grad_norm": 1.7264620065689087, "learning_rate": 1.0451541457356949e-06, "loss": 0.0774, "mean_token_accuracy": 0.9826287627220154, "step": 449 }, { "epoch": 1.9230769230769231, "grad_norm": 1.9337164163589478, "learning_rate": 1.0405331194230197e-06, "loss": 0.08, "mean_token_accuracy": 0.9816673994064331, "step": 450 }, { "epoch": 1.9273504273504274, "grad_norm": 2.3220560550689697, "learning_rate": 1.036160415844436e-06, "loss": 0.087, "mean_token_accuracy": 0.9797703623771667, "step": 451 }, { "epoch": 1.9316239316239316, "grad_norm": 2.0156548023223877, "learning_rate": 1.032036278491317e-06, "loss": 0.0827, "mean_token_accuracy": 0.980651319026947, "step": 452 }, { "epoch": 1.935897435897436, "grad_norm": 2.087442636489868, "learning_rate": 1.0281609370137724e-06, "loss": 0.0854, "mean_token_accuracy": 0.9798422455787659, "step": 453 }, { "epoch": 1.9401709401709402, "grad_norm": 2.0209758281707764, "learning_rate": 1.0245346072078642e-06, "loss": 0.0786, "mean_token_accuracy": 0.9813392162322998, "step": 454 }, { "epoch": 1.9444444444444444, "grad_norm": 2.3285915851593018, "learning_rate": 1.0211574910035892e-06, "loss": 0.0864, "mean_token_accuracy": 0.980617880821228, "step": 455 }, { "epoch": 1.9487179487179487, "grad_norm": 2.3946189880371094, "learning_rate": 1.018029776453635e-06, "loss": 0.0888, "mean_token_accuracy": 0.9790608286857605, "step": 456 }, { "epoch": 1.952991452991453, "grad_norm": 2.0309736728668213, "learning_rate": 1.0151516377229062e-06, "loss": 0.085, "mean_token_accuracy": 0.9819555878639221, "step": 457 }, { "epoch": 1.9572649572649574, "grad_norm": 2.4477789402008057, "learning_rate": 1.0125232350788295e-06, "loss": 0.0968, "mean_token_accuracy": 0.9767808318138123, "step": 458 }, { "epoch": 1.9615384615384617, "grad_norm": 2.2161478996276855, "learning_rate": 1.0101447148824265e-06, "loss": 0.0871, "mean_token_accuracy": 0.9767209887504578, "step": 459 }, { "epoch": 1.965811965811966, "grad_norm": 2.2850284576416016, "learning_rate": 1.0080162095801663e-06, "loss": 0.082, "mean_token_accuracy": 0.9784152507781982, "step": 460 }, { "epoch": 1.9700854700854702, "grad_norm": 1.9745234251022339, "learning_rate": 1.0061378376965871e-06, "loss": 0.0808, "mean_token_accuracy": 0.9789802432060242, "step": 461 }, { "epoch": 1.9743589743589745, "grad_norm": 1.8992419242858887, "learning_rate": 1.0045097038276994e-06, "loss": 0.0783, "mean_token_accuracy": 0.9831842184066772, "step": 462 }, { "epoch": 1.9786324786324787, "grad_norm": 2.1573259830474854, "learning_rate": 1.0031318986351587e-06, "loss": 0.0783, "mean_token_accuracy": 0.9816881418228149, "step": 463 }, { "epoch": 1.982905982905983, "grad_norm": 2.2348172664642334, "learning_rate": 1.0020044988412197e-06, "loss": 0.0893, "mean_token_accuracy": 0.9791444540023804, "step": 464 }, { "epoch": 1.9871794871794872, "grad_norm": 2.1570818424224854, "learning_rate": 1.0011275672244635e-06, "loss": 0.0886, "mean_token_accuracy": 0.9798603653907776, "step": 465 }, { "epoch": 1.9914529914529915, "grad_norm": 2.324618101119995, "learning_rate": 1.0005011526162988e-06, "loss": 0.0888, "mean_token_accuracy": 0.9777023196220398, "step": 466 }, { "epoch": 1.9957264957264957, "grad_norm": 1.7417622804641724, "learning_rate": 1.0001252898982478e-06, "loss": 0.0776, "mean_token_accuracy": 0.9812348484992981, "step": 467 }, { "epoch": 2.0, "grad_norm": 1.4360781908035278, "learning_rate": 1.0000000000000002e-06, "loss": 0.0643, "mean_token_accuracy": 0.9865781664848328, "step": 468 }, { "epoch": 2.0, "step": 468, "total_flos": 3.6134539308407194e+17, "train_loss": 0.15647654232178998, "train_runtime": 1906.4028, "train_samples_per_second": 7.84, "train_steps_per_second": 0.245 } ], "logging_steps": 1, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.6134539308407194e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }