{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 834, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0036019810895992796, "grad_norm": 8.959743121022326, "learning_rate": 0.0, "loss": 1.8318, "step": 1 }, { "epoch": 0.007203962179198559, "grad_norm": 8.599514774705224, "learning_rate": 1.1904761904761906e-07, "loss": 1.8629, "step": 2 }, { "epoch": 0.010805943268797838, "grad_norm": 8.27099405489232, "learning_rate": 2.3809523809523811e-07, "loss": 1.8047, "step": 3 }, { "epoch": 0.014407924358397118, "grad_norm": 8.807297661164533, "learning_rate": 3.5714285714285716e-07, "loss": 1.838, "step": 4 }, { "epoch": 0.018009905447996397, "grad_norm": 9.118038313686268, "learning_rate": 4.7619047619047623e-07, "loss": 1.8445, "step": 5 }, { "epoch": 0.021611886537595677, "grad_norm": 9.342871560722859, "learning_rate": 5.952380952380953e-07, "loss": 1.8716, "step": 6 }, { "epoch": 0.025213867627194957, "grad_norm": 8.104634918034305, "learning_rate": 7.142857142857143e-07, "loss": 1.7625, "step": 7 }, { "epoch": 0.028815848716794237, "grad_norm": 8.497419415075033, "learning_rate": 8.333333333333333e-07, "loss": 1.8662, "step": 8 }, { "epoch": 0.03241782980639352, "grad_norm": 8.997271493582199, "learning_rate": 9.523809523809525e-07, "loss": 1.8798, "step": 9 }, { "epoch": 0.03601981089599279, "grad_norm": 7.186846943308143, "learning_rate": 1.0714285714285714e-06, "loss": 1.7647, "step": 10 }, { "epoch": 0.03962179198559208, "grad_norm": 7.023004901959916, "learning_rate": 1.1904761904761906e-06, "loss": 1.8083, "step": 11 }, { "epoch": 0.04322377307519135, "grad_norm": 7.3197143781084195, "learning_rate": 1.3095238095238096e-06, "loss": 1.8125, "step": 12 }, { "epoch": 0.04682575416479064, "grad_norm": 5.742362390459089, "learning_rate": 1.4285714285714286e-06, "loss": 1.7355, "step": 13 }, { "epoch": 0.05042773525438991, "grad_norm": 5.054071547886992, "learning_rate": 1.5476190476190479e-06, "loss": 1.7257, "step": 14 }, { "epoch": 0.0540297163439892, "grad_norm": 5.035497816412668, "learning_rate": 1.6666666666666667e-06, "loss": 1.744, "step": 15 }, { "epoch": 0.05763169743358847, "grad_norm": 4.701882228340393, "learning_rate": 1.7857142857142859e-06, "loss": 1.6886, "step": 16 }, { "epoch": 0.06123367852318776, "grad_norm": 4.331238204850519, "learning_rate": 1.904761904761905e-06, "loss": 1.6805, "step": 17 }, { "epoch": 0.06483565961278703, "grad_norm": 2.767544145879741, "learning_rate": 2.023809523809524e-06, "loss": 1.6131, "step": 18 }, { "epoch": 0.06843764070238631, "grad_norm": 2.6868680980231345, "learning_rate": 2.1428571428571427e-06, "loss": 1.5981, "step": 19 }, { "epoch": 0.07203962179198559, "grad_norm": 2.4108574787799673, "learning_rate": 2.261904761904762e-06, "loss": 1.5926, "step": 20 }, { "epoch": 0.07564160288158488, "grad_norm": 2.4672291267909237, "learning_rate": 2.380952380952381e-06, "loss": 1.5968, "step": 21 }, { "epoch": 0.07924358397118415, "grad_norm": 2.1181123521738914, "learning_rate": 2.5e-06, "loss": 1.5456, "step": 22 }, { "epoch": 0.08284556506078343, "grad_norm": 2.3108498703640565, "learning_rate": 2.6190476190476192e-06, "loss": 1.5908, "step": 23 }, { "epoch": 0.0864475461503827, "grad_norm": 1.593382451878654, "learning_rate": 2.7380952380952387e-06, "loss": 1.5203, "step": 24 }, { "epoch": 0.090049527239982, "grad_norm": 1.8594286026551032, "learning_rate": 2.8571428571428573e-06, "loss": 1.4988, "step": 25 }, { "epoch": 0.09365150832958127, "grad_norm": 1.812932314030098, "learning_rate": 2.9761904761904763e-06, "loss": 1.5038, "step": 26 }, { "epoch": 0.09725348941918055, "grad_norm": 1.6854819502367606, "learning_rate": 3.0952380952380957e-06, "loss": 1.5037, "step": 27 }, { "epoch": 0.10085547050877983, "grad_norm": 1.4363843088807504, "learning_rate": 3.2142857142857147e-06, "loss": 1.4492, "step": 28 }, { "epoch": 0.1044574515983791, "grad_norm": 1.2720392687132038, "learning_rate": 3.3333333333333333e-06, "loss": 1.4398, "step": 29 }, { "epoch": 0.1080594326879784, "grad_norm": 1.151633877445218, "learning_rate": 3.4523809523809528e-06, "loss": 1.427, "step": 30 }, { "epoch": 0.11166141377757767, "grad_norm": 0.9908597997660091, "learning_rate": 3.5714285714285718e-06, "loss": 1.4175, "step": 31 }, { "epoch": 0.11526339486717695, "grad_norm": 0.8183196011685436, "learning_rate": 3.690476190476191e-06, "loss": 1.3983, "step": 32 }, { "epoch": 0.11886537595677622, "grad_norm": 0.8635221903260426, "learning_rate": 3.80952380952381e-06, "loss": 1.4133, "step": 33 }, { "epoch": 0.12246735704637551, "grad_norm": 0.8674489391428222, "learning_rate": 3.928571428571429e-06, "loss": 1.3498, "step": 34 }, { "epoch": 0.1260693381359748, "grad_norm": 0.9238078524289024, "learning_rate": 4.047619047619048e-06, "loss": 1.3676, "step": 35 }, { "epoch": 0.12967131922557407, "grad_norm": 0.8890230038452177, "learning_rate": 4.166666666666667e-06, "loss": 1.3671, "step": 36 }, { "epoch": 0.13327330031517334, "grad_norm": 0.7855135929116516, "learning_rate": 4.2857142857142855e-06, "loss": 1.3752, "step": 37 }, { "epoch": 0.13687528140477262, "grad_norm": 0.7376255448971343, "learning_rate": 4.404761904761905e-06, "loss": 1.3343, "step": 38 }, { "epoch": 0.1404772624943719, "grad_norm": 0.594654333690764, "learning_rate": 4.523809523809524e-06, "loss": 1.3251, "step": 39 }, { "epoch": 0.14407924358397117, "grad_norm": 0.5613128279020647, "learning_rate": 4.642857142857144e-06, "loss": 1.3126, "step": 40 }, { "epoch": 0.14768122467357048, "grad_norm": 0.560019251447276, "learning_rate": 4.761904761904762e-06, "loss": 1.3397, "step": 41 }, { "epoch": 0.15128320576316975, "grad_norm": 0.5314471256259126, "learning_rate": 4.880952380952381e-06, "loss": 1.2782, "step": 42 }, { "epoch": 0.15488518685276903, "grad_norm": 0.5332998102282385, "learning_rate": 5e-06, "loss": 1.3021, "step": 43 }, { "epoch": 0.1584871679423683, "grad_norm": 0.5282474767077582, "learning_rate": 5.119047619047619e-06, "loss": 1.2855, "step": 44 }, { "epoch": 0.16208914903196758, "grad_norm": 0.5050735166019568, "learning_rate": 5.2380952380952384e-06, "loss": 1.2798, "step": 45 }, { "epoch": 0.16569113012156686, "grad_norm": 0.5264954959532085, "learning_rate": 5.357142857142857e-06, "loss": 1.2685, "step": 46 }, { "epoch": 0.16929311121116614, "grad_norm": 0.49891809978749935, "learning_rate": 5.476190476190477e-06, "loss": 1.3067, "step": 47 }, { "epoch": 0.1728950923007654, "grad_norm": 0.43713019827527205, "learning_rate": 5.595238095238096e-06, "loss": 1.2912, "step": 48 }, { "epoch": 0.1764970733903647, "grad_norm": 0.42268295993435495, "learning_rate": 5.7142857142857145e-06, "loss": 1.2677, "step": 49 }, { "epoch": 0.180099054479964, "grad_norm": 0.42192747607841885, "learning_rate": 5.833333333333334e-06, "loss": 1.2945, "step": 50 }, { "epoch": 0.18370103556956327, "grad_norm": 0.37897863688600525, "learning_rate": 5.9523809523809525e-06, "loss": 1.2431, "step": 51 }, { "epoch": 0.18730301665916255, "grad_norm": 0.40405168811471465, "learning_rate": 6.071428571428571e-06, "loss": 1.2804, "step": 52 }, { "epoch": 0.19090499774876182, "grad_norm": 0.3934601575838702, "learning_rate": 6.1904761904761914e-06, "loss": 1.2597, "step": 53 }, { "epoch": 0.1945069788383611, "grad_norm": 0.41031465320208005, "learning_rate": 6.30952380952381e-06, "loss": 1.2451, "step": 54 }, { "epoch": 0.19810895992796038, "grad_norm": 0.3681773008428082, "learning_rate": 6.4285714285714295e-06, "loss": 1.26, "step": 55 }, { "epoch": 0.20171094101755965, "grad_norm": 0.36871798332732425, "learning_rate": 6.547619047619048e-06, "loss": 1.2251, "step": 56 }, { "epoch": 0.20531292210715893, "grad_norm": 0.35510983491031706, "learning_rate": 6.666666666666667e-06, "loss": 1.2455, "step": 57 }, { "epoch": 0.2089149031967582, "grad_norm": 0.3348595552564557, "learning_rate": 6.785714285714287e-06, "loss": 1.2582, "step": 58 }, { "epoch": 0.2125168842863575, "grad_norm": 0.33479239236035946, "learning_rate": 6.9047619047619055e-06, "loss": 1.229, "step": 59 }, { "epoch": 0.2161188653759568, "grad_norm": 0.35235529909722807, "learning_rate": 7.023809523809524e-06, "loss": 1.194, "step": 60 }, { "epoch": 0.21972084646555606, "grad_norm": 0.3612868104143937, "learning_rate": 7.1428571428571436e-06, "loss": 1.216, "step": 61 }, { "epoch": 0.22332282755515534, "grad_norm": 0.33124367306424535, "learning_rate": 7.261904761904762e-06, "loss": 1.2322, "step": 62 }, { "epoch": 0.22692480864475462, "grad_norm": 0.31005107875726384, "learning_rate": 7.380952380952382e-06, "loss": 1.1965, "step": 63 }, { "epoch": 0.2305267897343539, "grad_norm": 0.35704168784229545, "learning_rate": 7.500000000000001e-06, "loss": 1.2472, "step": 64 }, { "epoch": 0.23412877082395317, "grad_norm": 0.3009141198350835, "learning_rate": 7.61904761904762e-06, "loss": 1.2042, "step": 65 }, { "epoch": 0.23773075191355245, "grad_norm": 0.3361466848573483, "learning_rate": 7.738095238095238e-06, "loss": 1.1972, "step": 66 }, { "epoch": 0.24133273300315172, "grad_norm": 0.32300651355695426, "learning_rate": 7.857142857142858e-06, "loss": 1.1821, "step": 67 }, { "epoch": 0.24493471409275103, "grad_norm": 0.34432276589140803, "learning_rate": 7.976190476190477e-06, "loss": 1.2158, "step": 68 }, { "epoch": 0.2485366951823503, "grad_norm": 0.32159883103552483, "learning_rate": 8.095238095238097e-06, "loss": 1.2273, "step": 69 }, { "epoch": 0.2521386762719496, "grad_norm": 0.3132280942086526, "learning_rate": 8.214285714285714e-06, "loss": 1.217, "step": 70 }, { "epoch": 0.25574065736154883, "grad_norm": 0.34857390044798864, "learning_rate": 8.333333333333334e-06, "loss": 1.1963, "step": 71 }, { "epoch": 0.25934263845114813, "grad_norm": 0.28499241440527673, "learning_rate": 8.452380952380953e-06, "loss": 1.1921, "step": 72 }, { "epoch": 0.26294461954074744, "grad_norm": 0.3170111251216066, "learning_rate": 8.571428571428571e-06, "loss": 1.1975, "step": 73 }, { "epoch": 0.2665466006303467, "grad_norm": 0.3211922078756118, "learning_rate": 8.690476190476192e-06, "loss": 1.1704, "step": 74 }, { "epoch": 0.270148581719946, "grad_norm": 0.30453515336097836, "learning_rate": 8.80952380952381e-06, "loss": 1.2062, "step": 75 }, { "epoch": 0.27375056280954524, "grad_norm": 0.3064941559502552, "learning_rate": 8.92857142857143e-06, "loss": 1.1928, "step": 76 }, { "epoch": 0.27735254389914454, "grad_norm": 0.33218232714495777, "learning_rate": 9.047619047619049e-06, "loss": 1.205, "step": 77 }, { "epoch": 0.2809545249887438, "grad_norm": 0.29079080164563587, "learning_rate": 9.166666666666666e-06, "loss": 1.2031, "step": 78 }, { "epoch": 0.2845565060783431, "grad_norm": 0.31159296882004955, "learning_rate": 9.285714285714288e-06, "loss": 1.212, "step": 79 }, { "epoch": 0.28815848716794235, "grad_norm": 0.2950167931965713, "learning_rate": 9.404761904761905e-06, "loss": 1.21, "step": 80 }, { "epoch": 0.29176046825754165, "grad_norm": 0.3168487800792039, "learning_rate": 9.523809523809525e-06, "loss": 1.1467, "step": 81 }, { "epoch": 0.29536244934714095, "grad_norm": 0.31180168015480736, "learning_rate": 9.642857142857144e-06, "loss": 1.1904, "step": 82 }, { "epoch": 0.2989644304367402, "grad_norm": 0.31394964544202014, "learning_rate": 9.761904761904762e-06, "loss": 1.1742, "step": 83 }, { "epoch": 0.3025664115263395, "grad_norm": 0.28380748068760736, "learning_rate": 9.880952380952381e-06, "loss": 1.2007, "step": 84 }, { "epoch": 0.30616839261593876, "grad_norm": 0.3122970930157758, "learning_rate": 1e-05, "loss": 1.1607, "step": 85 }, { "epoch": 0.30977037370553806, "grad_norm": 0.30341382381199433, "learning_rate": 9.999956135155688e-06, "loss": 1.1943, "step": 86 }, { "epoch": 0.3133723547951373, "grad_norm": 0.29699266871397906, "learning_rate": 9.999824541392404e-06, "loss": 1.156, "step": 87 }, { "epoch": 0.3169743358847366, "grad_norm": 0.3370219209966227, "learning_rate": 9.999605221019082e-06, "loss": 1.166, "step": 88 }, { "epoch": 0.32057631697433586, "grad_norm": 0.30301135573648547, "learning_rate": 9.999298177883902e-06, "loss": 1.186, "step": 89 }, { "epoch": 0.32417829806393517, "grad_norm": 0.30025420893856164, "learning_rate": 9.998903417374228e-06, "loss": 1.1832, "step": 90 }, { "epoch": 0.32778027915353447, "grad_norm": 0.3211722926193115, "learning_rate": 9.9984209464165e-06, "loss": 1.1309, "step": 91 }, { "epoch": 0.3313822602431337, "grad_norm": 0.34378918525170493, "learning_rate": 9.997850773476126e-06, "loss": 1.1822, "step": 92 }, { "epoch": 0.334984241332733, "grad_norm": 0.3202545861207382, "learning_rate": 9.997192908557322e-06, "loss": 1.1644, "step": 93 }, { "epoch": 0.3385862224223323, "grad_norm": 0.3192066377734346, "learning_rate": 9.996447363202947e-06, "loss": 1.1827, "step": 94 }, { "epoch": 0.3421882035119316, "grad_norm": 0.32504247647618456, "learning_rate": 9.995614150494293e-06, "loss": 1.16, "step": 95 }, { "epoch": 0.3457901846015308, "grad_norm": 0.37075579456497426, "learning_rate": 9.994693285050858e-06, "loss": 1.1813, "step": 96 }, { "epoch": 0.34939216569113013, "grad_norm": 0.3604600871283949, "learning_rate": 9.99368478303009e-06, "loss": 1.1535, "step": 97 }, { "epoch": 0.3529941467807294, "grad_norm": 0.3272940747117161, "learning_rate": 9.9925886621271e-06, "loss": 1.1636, "step": 98 }, { "epoch": 0.3565961278703287, "grad_norm": 0.3449509830414838, "learning_rate": 9.99140494157436e-06, "loss": 1.1575, "step": 99 }, { "epoch": 0.360198108959928, "grad_norm": 0.35962181776653873, "learning_rate": 9.990133642141359e-06, "loss": 1.1756, "step": 100 }, { "epoch": 0.36380009004952724, "grad_norm": 0.3255881417609746, "learning_rate": 9.988774786134235e-06, "loss": 1.1751, "step": 101 }, { "epoch": 0.36740207113912654, "grad_norm": 0.3466801749265495, "learning_rate": 9.987328397395389e-06, "loss": 1.148, "step": 102 }, { "epoch": 0.3710040522287258, "grad_norm": 0.3781154633191771, "learning_rate": 9.98579450130307e-06, "loss": 1.1672, "step": 103 }, { "epoch": 0.3746060333183251, "grad_norm": 0.3177289518646908, "learning_rate": 9.984173124770924e-06, "loss": 1.1767, "step": 104 }, { "epoch": 0.37820801440792434, "grad_norm": 0.3287531127302142, "learning_rate": 9.982464296247523e-06, "loss": 1.1729, "step": 105 }, { "epoch": 0.38180999549752365, "grad_norm": 0.35236574446805197, "learning_rate": 9.980668045715864e-06, "loss": 1.162, "step": 106 }, { "epoch": 0.3854119765871229, "grad_norm": 0.32366666300178654, "learning_rate": 9.978784404692847e-06, "loss": 1.1541, "step": 107 }, { "epoch": 0.3890139576767222, "grad_norm": 0.3441412737476968, "learning_rate": 9.97681340622872e-06, "loss": 1.1483, "step": 108 }, { "epoch": 0.3926159387663215, "grad_norm": 0.3368352565729486, "learning_rate": 9.974755084906503e-06, "loss": 1.1587, "step": 109 }, { "epoch": 0.39621791985592075, "grad_norm": 0.3146485277926942, "learning_rate": 9.972609476841368e-06, "loss": 1.1603, "step": 110 }, { "epoch": 0.39981990094552006, "grad_norm": 0.34336750676307926, "learning_rate": 9.970376619680024e-06, "loss": 1.1793, "step": 111 }, { "epoch": 0.4034218820351193, "grad_norm": 0.3079869001100948, "learning_rate": 9.968056552600043e-06, "loss": 1.1601, "step": 112 }, { "epoch": 0.4070238631247186, "grad_norm": 0.3194899482588308, "learning_rate": 9.965649316309178e-06, "loss": 1.1931, "step": 113 }, { "epoch": 0.41062584421431786, "grad_norm": 0.31236725178854713, "learning_rate": 9.963154953044646e-06, "loss": 1.1157, "step": 114 }, { "epoch": 0.41422782530391716, "grad_norm": 0.3641377454803935, "learning_rate": 9.960573506572391e-06, "loss": 1.1257, "step": 115 }, { "epoch": 0.4178298063935164, "grad_norm": 0.3367690084363564, "learning_rate": 9.957905022186309e-06, "loss": 1.1332, "step": 116 }, { "epoch": 0.4214317874831157, "grad_norm": 0.3282965716002517, "learning_rate": 9.955149546707465e-06, "loss": 1.0959, "step": 117 }, { "epoch": 0.425033768572715, "grad_norm": 0.3764974200013322, "learning_rate": 9.952307128483257e-06, "loss": 1.168, "step": 118 }, { "epoch": 0.42863574966231427, "grad_norm": 0.332077880161025, "learning_rate": 9.94937781738658e-06, "loss": 1.1847, "step": 119 }, { "epoch": 0.4322377307519136, "grad_norm": 0.3122087138952814, "learning_rate": 9.946361664814942e-06, "loss": 1.1214, "step": 120 }, { "epoch": 0.4358397118415128, "grad_norm": 0.321004643708737, "learning_rate": 9.94325872368957e-06, "loss": 1.1235, "step": 121 }, { "epoch": 0.4394416929311121, "grad_norm": 0.35397785938333604, "learning_rate": 9.940069048454478e-06, "loss": 1.1792, "step": 122 }, { "epoch": 0.4430436740207114, "grad_norm": 0.34751295835336804, "learning_rate": 9.936792695075502e-06, "loss": 1.1626, "step": 123 }, { "epoch": 0.4466456551103107, "grad_norm": 0.33334683436011303, "learning_rate": 9.93342972103934e-06, "loss": 1.1156, "step": 124 }, { "epoch": 0.45024763619990993, "grad_norm": 0.35572890521109984, "learning_rate": 9.929980185352525e-06, "loss": 1.134, "step": 125 }, { "epoch": 0.45384961728950923, "grad_norm": 0.3600252106382079, "learning_rate": 9.926444148540394e-06, "loss": 1.1552, "step": 126 }, { "epoch": 0.45745159837910854, "grad_norm": 0.31574099694060664, "learning_rate": 9.922821672646028e-06, "loss": 1.1294, "step": 127 }, { "epoch": 0.4610535794687078, "grad_norm": 0.3384836348033959, "learning_rate": 9.919112821229165e-06, "loss": 1.1415, "step": 128 }, { "epoch": 0.4646555605583071, "grad_norm": 0.36082038793653404, "learning_rate": 9.915317659365078e-06, "loss": 1.1486, "step": 129 }, { "epoch": 0.46825754164790634, "grad_norm": 0.3475974165432403, "learning_rate": 9.911436253643445e-06, "loss": 1.1265, "step": 130 }, { "epoch": 0.47185952273750564, "grad_norm": 0.3635545773479418, "learning_rate": 9.907468672167165e-06, "loss": 1.1549, "step": 131 }, { "epoch": 0.4754615038271049, "grad_norm": 0.347794452842081, "learning_rate": 9.903414984551178e-06, "loss": 1.1461, "step": 132 }, { "epoch": 0.4790634849167042, "grad_norm": 0.32822120698172536, "learning_rate": 9.899275261921236e-06, "loss": 1.1649, "step": 133 }, { "epoch": 0.48266546600630345, "grad_norm": 0.29837185314643394, "learning_rate": 9.89504957691265e-06, "loss": 1.1571, "step": 134 }, { "epoch": 0.48626744709590275, "grad_norm": 0.38306110575116503, "learning_rate": 9.890738003669029e-06, "loss": 1.1252, "step": 135 }, { "epoch": 0.48986942818550205, "grad_norm": 0.34086844442205383, "learning_rate": 9.886340617840968e-06, "loss": 1.1382, "step": 136 }, { "epoch": 0.4934714092751013, "grad_norm": 0.30015621390458924, "learning_rate": 9.881857496584726e-06, "loss": 1.1275, "step": 137 }, { "epoch": 0.4970733903647006, "grad_norm": 0.3614100125796906, "learning_rate": 9.877288718560866e-06, "loss": 1.1161, "step": 138 }, { "epoch": 0.5006753714542999, "grad_norm": 0.3267608562772437, "learning_rate": 9.872634363932887e-06, "loss": 1.1316, "step": 139 }, { "epoch": 0.5042773525438992, "grad_norm": 0.30874876519759176, "learning_rate": 9.867894514365802e-06, "loss": 1.1485, "step": 140 }, { "epoch": 0.5078793336334985, "grad_norm": 0.3155720475990474, "learning_rate": 9.863069253024719e-06, "loss": 1.1481, "step": 141 }, { "epoch": 0.5114813147230977, "grad_norm": 0.33367944966848107, "learning_rate": 9.85815866457337e-06, "loss": 1.108, "step": 142 }, { "epoch": 0.515083295812697, "grad_norm": 0.337631206990529, "learning_rate": 9.853162835172638e-06, "loss": 1.1292, "step": 143 }, { "epoch": 0.5186852769022963, "grad_norm": 0.353550852505754, "learning_rate": 9.84808185247903e-06, "loss": 1.1378, "step": 144 }, { "epoch": 0.5222872579918956, "grad_norm": 0.3545306891033168, "learning_rate": 9.842915805643156e-06, "loss": 1.108, "step": 145 }, { "epoch": 0.5258892390814949, "grad_norm": 0.42557248608508313, "learning_rate": 9.83766478530815e-06, "loss": 1.1334, "step": 146 }, { "epoch": 0.5294912201710941, "grad_norm": 0.3728030570909934, "learning_rate": 9.832328883608088e-06, "loss": 1.1381, "step": 147 }, { "epoch": 0.5330932012606934, "grad_norm": 0.49924286590903955, "learning_rate": 9.82690819416637e-06, "loss": 1.097, "step": 148 }, { "epoch": 0.5366951823502927, "grad_norm": 0.3439764618050822, "learning_rate": 9.821402812094074e-06, "loss": 1.1577, "step": 149 }, { "epoch": 0.540297163439892, "grad_norm": 0.4159129574954092, "learning_rate": 9.815812833988292e-06, "loss": 1.132, "step": 150 }, { "epoch": 0.5438991445294912, "grad_norm": 0.4129107171971605, "learning_rate": 9.81013835793043e-06, "loss": 1.1574, "step": 151 }, { "epoch": 0.5475011256190905, "grad_norm": 0.4296938671727486, "learning_rate": 9.804379483484493e-06, "loss": 1.1459, "step": 152 }, { "epoch": 0.5511031067086898, "grad_norm": 0.3766243233797345, "learning_rate": 9.798536311695334e-06, "loss": 1.1545, "step": 153 }, { "epoch": 0.5547050877982891, "grad_norm": 0.35435312036719, "learning_rate": 9.79260894508688e-06, "loss": 1.1171, "step": 154 }, { "epoch": 0.5583070688878884, "grad_norm": 0.37652608806645427, "learning_rate": 9.786597487660336e-06, "loss": 1.111, "step": 155 }, { "epoch": 0.5619090499774876, "grad_norm": 0.3453394867906792, "learning_rate": 9.780502044892363e-06, "loss": 1.1749, "step": 156 }, { "epoch": 0.5655110310670869, "grad_norm": 0.379890664116539, "learning_rate": 9.774322723733216e-06, "loss": 1.1481, "step": 157 }, { "epoch": 0.5691130121566862, "grad_norm": 0.39501637379719035, "learning_rate": 9.768059632604881e-06, "loss": 1.1061, "step": 158 }, { "epoch": 0.5727149932462855, "grad_norm": 0.3326705156892084, "learning_rate": 9.761712881399164e-06, "loss": 1.1412, "step": 159 }, { "epoch": 0.5763169743358847, "grad_norm": 0.3617516742176726, "learning_rate": 9.755282581475769e-06, "loss": 1.1789, "step": 160 }, { "epoch": 0.579918955425484, "grad_norm": 0.39263243654387053, "learning_rate": 9.748768845660335e-06, "loss": 1.152, "step": 161 }, { "epoch": 0.5835209365150833, "grad_norm": 0.34780105035332837, "learning_rate": 9.742171788242468e-06, "loss": 1.1267, "step": 162 }, { "epoch": 0.5871229176046826, "grad_norm": 0.45976305202766604, "learning_rate": 9.735491524973723e-06, "loss": 1.1043, "step": 163 }, { "epoch": 0.5907248986942819, "grad_norm": 0.3604888952284618, "learning_rate": 9.728728173065584e-06, "loss": 1.1105, "step": 164 }, { "epoch": 0.5943268797838811, "grad_norm": 0.4168296170026391, "learning_rate": 9.721881851187406e-06, "loss": 1.1362, "step": 165 }, { "epoch": 0.5979288608734804, "grad_norm": 0.3757592230549266, "learning_rate": 9.714952679464324e-06, "loss": 1.1405, "step": 166 }, { "epoch": 0.6015308419630797, "grad_norm": 0.40888701962345614, "learning_rate": 9.707940779475151e-06, "loss": 1.0968, "step": 167 }, { "epoch": 0.605132823052679, "grad_norm": 0.36126726128985687, "learning_rate": 9.700846274250252e-06, "loss": 1.1197, "step": 168 }, { "epoch": 0.6087348041422782, "grad_norm": 0.3792632202992676, "learning_rate": 9.693669288269371e-06, "loss": 1.1129, "step": 169 }, { "epoch": 0.6123367852318775, "grad_norm": 0.4094377957060575, "learning_rate": 9.68640994745946e-06, "loss": 1.1235, "step": 170 }, { "epoch": 0.6159387663214768, "grad_norm": 0.39486664256207166, "learning_rate": 9.679068379192455e-06, "loss": 1.1189, "step": 171 }, { "epoch": 0.6195407474110761, "grad_norm": 0.37256294824637853, "learning_rate": 9.671644712283061e-06, "loss": 1.0951, "step": 172 }, { "epoch": 0.6231427285006754, "grad_norm": 0.3627397815654964, "learning_rate": 9.664139076986473e-06, "loss": 1.1321, "step": 173 }, { "epoch": 0.6267447095902746, "grad_norm": 0.4248545009770537, "learning_rate": 9.656551604996102e-06, "loss": 1.1338, "step": 174 }, { "epoch": 0.6303466906798739, "grad_norm": 0.3492982058791957, "learning_rate": 9.648882429441258e-06, "loss": 1.1383, "step": 175 }, { "epoch": 0.6339486717694732, "grad_norm": 0.37748320014327436, "learning_rate": 9.641131684884817e-06, "loss": 1.1316, "step": 176 }, { "epoch": 0.6375506528590725, "grad_norm": 0.35682755754679785, "learning_rate": 9.633299507320862e-06, "loss": 1.09, "step": 177 }, { "epoch": 0.6411526339486717, "grad_norm": 0.34106929576049877, "learning_rate": 9.62538603417229e-06, "loss": 1.136, "step": 178 }, { "epoch": 0.644754615038271, "grad_norm": 0.34746889086894356, "learning_rate": 9.617391404288412e-06, "loss": 1.0943, "step": 179 }, { "epoch": 0.6483565961278703, "grad_norm": 0.36723025633050504, "learning_rate": 9.609315757942504e-06, "loss": 1.182, "step": 180 }, { "epoch": 0.6519585772174696, "grad_norm": 0.33451032057782015, "learning_rate": 9.601159236829353e-06, "loss": 1.1351, "step": 181 }, { "epoch": 0.6555605583070689, "grad_norm": 0.33248672869162116, "learning_rate": 9.592921984062771e-06, "loss": 1.1187, "step": 182 }, { "epoch": 0.6591625393966681, "grad_norm": 0.3331636644805929, "learning_rate": 9.584604144173084e-06, "loss": 1.1009, "step": 183 }, { "epoch": 0.6627645204862674, "grad_norm": 0.3268063515189842, "learning_rate": 9.576205863104588e-06, "loss": 1.1048, "step": 184 }, { "epoch": 0.6663665015758667, "grad_norm": 0.3651627841761447, "learning_rate": 9.567727288213005e-06, "loss": 1.1534, "step": 185 }, { "epoch": 0.669968482665466, "grad_norm": 0.3347297887818968, "learning_rate": 9.55916856826288e-06, "loss": 1.1567, "step": 186 }, { "epoch": 0.6735704637550652, "grad_norm": 0.3615161248575356, "learning_rate": 9.550529853424979e-06, "loss": 1.1278, "step": 187 }, { "epoch": 0.6771724448446645, "grad_norm": 0.3713014349141257, "learning_rate": 9.541811295273657e-06, "loss": 1.1101, "step": 188 }, { "epoch": 0.6807744259342638, "grad_norm": 0.34004372649926684, "learning_rate": 9.53301304678419e-06, "loss": 1.1276, "step": 189 }, { "epoch": 0.6843764070238632, "grad_norm": 0.35758706995009787, "learning_rate": 9.524135262330098e-06, "loss": 1.1109, "step": 190 }, { "epoch": 0.6879783881134625, "grad_norm": 0.3220236515093335, "learning_rate": 9.515178097680437e-06, "loss": 1.119, "step": 191 }, { "epoch": 0.6915803692030617, "grad_norm": 0.35195981649297625, "learning_rate": 9.506141709997058e-06, "loss": 1.0968, "step": 192 }, { "epoch": 0.695182350292661, "grad_norm": 0.3960324035518941, "learning_rate": 9.497026257831856e-06, "loss": 1.1396, "step": 193 }, { "epoch": 0.6987843313822603, "grad_norm": 0.36557327721045585, "learning_rate": 9.487831901123989e-06, "loss": 1.1238, "step": 194 }, { "epoch": 0.7023863124718596, "grad_norm": 0.3343162963854113, "learning_rate": 9.478558801197065e-06, "loss": 1.1293, "step": 195 }, { "epoch": 0.7059882935614588, "grad_norm": 0.40726069321694147, "learning_rate": 9.46920712075632e-06, "loss": 1.1103, "step": 196 }, { "epoch": 0.7095902746510581, "grad_norm": 0.3381887572157023, "learning_rate": 9.459777023885754e-06, "loss": 1.0944, "step": 197 }, { "epoch": 0.7131922557406574, "grad_norm": 0.3733440854292112, "learning_rate": 9.450268676045261e-06, "loss": 1.1909, "step": 198 }, { "epoch": 0.7167942368302567, "grad_norm": 0.3755723398224028, "learning_rate": 9.440682244067724e-06, "loss": 1.0909, "step": 199 }, { "epoch": 0.720396217919856, "grad_norm": 0.38426065259470316, "learning_rate": 9.431017896156074e-06, "loss": 1.136, "step": 200 }, { "epoch": 0.7239981990094552, "grad_norm": 0.379071589505192, "learning_rate": 9.421275801880363e-06, "loss": 1.1121, "step": 201 }, { "epoch": 0.7276001800990545, "grad_norm": 0.34461539082999465, "learning_rate": 9.411456132174768e-06, "loss": 1.0988, "step": 202 }, { "epoch": 0.7312021611886538, "grad_norm": 0.3654979888018069, "learning_rate": 9.401559059334601e-06, "loss": 1.1238, "step": 203 }, { "epoch": 0.7348041422782531, "grad_norm": 0.34977548124519137, "learning_rate": 9.39158475701329e-06, "loss": 1.11, "step": 204 }, { "epoch": 0.7384061233678523, "grad_norm": 0.3738440708255404, "learning_rate": 9.381533400219319e-06, "loss": 1.1023, "step": 205 }, { "epoch": 0.7420081044574516, "grad_norm": 0.34015234663312666, "learning_rate": 9.371405165313169e-06, "loss": 1.1271, "step": 206 }, { "epoch": 0.7456100855470509, "grad_norm": 0.344237085861616, "learning_rate": 9.361200230004219e-06, "loss": 1.1192, "step": 207 }, { "epoch": 0.7492120666366502, "grad_norm": 0.3485062946685098, "learning_rate": 9.35091877334763e-06, "loss": 1.147, "step": 208 }, { "epoch": 0.7528140477262495, "grad_norm": 0.3458232266553943, "learning_rate": 9.340560975741198e-06, "loss": 1.15, "step": 209 }, { "epoch": 0.7564160288158487, "grad_norm": 0.338158493876411, "learning_rate": 9.330127018922195e-06, "loss": 1.1367, "step": 210 }, { "epoch": 0.760018009905448, "grad_norm": 0.34361214214695057, "learning_rate": 9.319617085964177e-06, "loss": 1.0956, "step": 211 }, { "epoch": 0.7636199909950473, "grad_norm": 0.3321785776797715, "learning_rate": 9.309031361273775e-06, "loss": 1.138, "step": 212 }, { "epoch": 0.7672219720846466, "grad_norm": 0.36852431928120577, "learning_rate": 9.298370030587456e-06, "loss": 1.1271, "step": 213 }, { "epoch": 0.7708239531742458, "grad_norm": 0.36986638396232147, "learning_rate": 9.287633280968263e-06, "loss": 1.112, "step": 214 }, { "epoch": 0.7744259342638451, "grad_norm": 0.32561497084045427, "learning_rate": 9.276821300802535e-06, "loss": 1.08, "step": 215 }, { "epoch": 0.7780279153534444, "grad_norm": 0.3746970517644171, "learning_rate": 9.265934279796602e-06, "loss": 1.1136, "step": 216 }, { "epoch": 0.7816298964430437, "grad_norm": 0.4173073842610304, "learning_rate": 9.25497240897346e-06, "loss": 1.1273, "step": 217 }, { "epoch": 0.785231877532643, "grad_norm": 0.37809222920052, "learning_rate": 9.24393588066941e-06, "loss": 1.0955, "step": 218 }, { "epoch": 0.7888338586222422, "grad_norm": 0.35280564760591715, "learning_rate": 9.232824888530689e-06, "loss": 1.1037, "step": 219 }, { "epoch": 0.7924358397118415, "grad_norm": 0.4219603934746488, "learning_rate": 9.221639627510076e-06, "loss": 1.1389, "step": 220 }, { "epoch": 0.7960378208014408, "grad_norm": 0.34392171607237565, "learning_rate": 9.210380293863462e-06, "loss": 1.1329, "step": 221 }, { "epoch": 0.7996398018910401, "grad_norm": 0.3553225127256251, "learning_rate": 9.199047085146415e-06, "loss": 1.0945, "step": 222 }, { "epoch": 0.8032417829806393, "grad_norm": 0.3492982055796588, "learning_rate": 9.18764020021071e-06, "loss": 1.1537, "step": 223 }, { "epoch": 0.8068437640702386, "grad_norm": 0.329347516548987, "learning_rate": 9.176159839200838e-06, "loss": 1.0952, "step": 224 }, { "epoch": 0.8104457451598379, "grad_norm": 0.3256059168725448, "learning_rate": 9.164606203550498e-06, "loss": 1.1352, "step": 225 }, { "epoch": 0.8140477262494372, "grad_norm": 0.40382790613119113, "learning_rate": 9.152979495979064e-06, "loss": 1.1412, "step": 226 }, { "epoch": 0.8176497073390365, "grad_norm": 0.3575693980124408, "learning_rate": 9.141279920488021e-06, "loss": 1.1295, "step": 227 }, { "epoch": 0.8212516884286357, "grad_norm": 0.3488694495678355, "learning_rate": 9.129507682357393e-06, "loss": 1.0832, "step": 228 }, { "epoch": 0.824853669518235, "grad_norm": 0.3539230531756584, "learning_rate": 9.117662988142138e-06, "loss": 1.1281, "step": 229 }, { "epoch": 0.8284556506078343, "grad_norm": 0.38111067495290035, "learning_rate": 9.10574604566852e-06, "loss": 1.1212, "step": 230 }, { "epoch": 0.8320576316974336, "grad_norm": 0.3883299352729627, "learning_rate": 9.093757064030473e-06, "loss": 1.138, "step": 231 }, { "epoch": 0.8356596127870328, "grad_norm": 0.32925848731485013, "learning_rate": 9.08169625358592e-06, "loss": 1.1304, "step": 232 }, { "epoch": 0.8392615938766321, "grad_norm": 0.4180598977166807, "learning_rate": 9.069563825953092e-06, "loss": 1.1038, "step": 233 }, { "epoch": 0.8428635749662314, "grad_norm": 0.34456574190208067, "learning_rate": 9.057359994006806e-06, "loss": 1.0855, "step": 234 }, { "epoch": 0.8464655560558307, "grad_norm": 0.39699049033870243, "learning_rate": 9.045084971874738e-06, "loss": 1.1254, "step": 235 }, { "epoch": 0.85006753714543, "grad_norm": 0.40763691295552823, "learning_rate": 9.032738974933663e-06, "loss": 1.0794, "step": 236 }, { "epoch": 0.8536695182350292, "grad_norm": 0.36236914147842597, "learning_rate": 9.020322219805674e-06, "loss": 1.1203, "step": 237 }, { "epoch": 0.8572714993246285, "grad_norm": 0.3833655304553943, "learning_rate": 9.007834924354384e-06, "loss": 1.1262, "step": 238 }, { "epoch": 0.8608734804142278, "grad_norm": 0.37172135259336675, "learning_rate": 8.9952773076811e-06, "loss": 1.0626, "step": 239 }, { "epoch": 0.8644754615038271, "grad_norm": 0.3334721033923123, "learning_rate": 8.982649590120982e-06, "loss": 1.0791, "step": 240 }, { "epoch": 0.8680774425934263, "grad_norm": 0.34086631317663335, "learning_rate": 8.969951993239177e-06, "loss": 1.1154, "step": 241 }, { "epoch": 0.8716794236830256, "grad_norm": 0.3371727774416591, "learning_rate": 8.957184739826929e-06, "loss": 1.1387, "step": 242 }, { "epoch": 0.875281404772625, "grad_norm": 0.3633470123731805, "learning_rate": 8.944348053897672e-06, "loss": 1.1505, "step": 243 }, { "epoch": 0.8788833858622243, "grad_norm": 0.3695807933678788, "learning_rate": 8.931442160683094e-06, "loss": 1.1351, "step": 244 }, { "epoch": 0.8824853669518236, "grad_norm": 0.38189837725904396, "learning_rate": 8.9184672866292e-06, "loss": 1.099, "step": 245 }, { "epoch": 0.8860873480414228, "grad_norm": 0.38223867232950853, "learning_rate": 8.905423659392316e-06, "loss": 1.0558, "step": 246 }, { "epoch": 0.889689329131022, "grad_norm": 0.33464001960015227, "learning_rate": 8.892311507835118e-06, "loss": 1.1106, "step": 247 }, { "epoch": 0.8932913102206214, "grad_norm": 0.3500335015064184, "learning_rate": 8.879131062022598e-06, "loss": 1.0943, "step": 248 }, { "epoch": 0.8968932913102207, "grad_norm": 0.38615622184380055, "learning_rate": 8.865882553218036e-06, "loss": 1.1362, "step": 249 }, { "epoch": 0.9004952723998199, "grad_norm": 0.3549431372544129, "learning_rate": 8.852566213878947e-06, "loss": 1.1453, "step": 250 }, { "epoch": 0.9040972534894192, "grad_norm": 0.3324763448668655, "learning_rate": 8.83918227765299e-06, "loss": 1.1157, "step": 251 }, { "epoch": 0.9076992345790185, "grad_norm": 0.35594427834572845, "learning_rate": 8.825730979373873e-06, "loss": 1.1095, "step": 252 }, { "epoch": 0.9113012156686178, "grad_norm": 0.34567215550933944, "learning_rate": 8.81221255505724e-06, "loss": 1.1086, "step": 253 }, { "epoch": 0.9149031967582171, "grad_norm": 0.3653541234957228, "learning_rate": 8.798627241896524e-06, "loss": 1.0936, "step": 254 }, { "epoch": 0.9185051778478163, "grad_norm": 0.3795042606149247, "learning_rate": 8.784975278258783e-06, "loss": 1.1185, "step": 255 }, { "epoch": 0.9221071589374156, "grad_norm": 0.3660926060585707, "learning_rate": 8.77125690368052e-06, "loss": 1.1117, "step": 256 }, { "epoch": 0.9257091400270149, "grad_norm": 0.377064805408519, "learning_rate": 8.757472358863481e-06, "loss": 1.1265, "step": 257 }, { "epoch": 0.9293111211166142, "grad_norm": 0.36507443565402037, "learning_rate": 8.743621885670431e-06, "loss": 1.1493, "step": 258 }, { "epoch": 0.9329131022062134, "grad_norm": 0.36199439427145674, "learning_rate": 8.729705727120911e-06, "loss": 1.0902, "step": 259 }, { "epoch": 0.9365150832958127, "grad_norm": 0.365398128659048, "learning_rate": 8.715724127386971e-06, "loss": 1.1199, "step": 260 }, { "epoch": 0.940117064385412, "grad_norm": 0.3597974956274622, "learning_rate": 8.701677331788891e-06, "loss": 1.1349, "step": 261 }, { "epoch": 0.9437190454750113, "grad_norm": 0.34565112893965727, "learning_rate": 8.68756558679087e-06, "loss": 1.1093, "step": 262 }, { "epoch": 0.9473210265646106, "grad_norm": 0.35741324005523417, "learning_rate": 8.673389139996708e-06, "loss": 1.0965, "step": 263 }, { "epoch": 0.9509230076542098, "grad_norm": 0.32598922479655423, "learning_rate": 8.659148240145456e-06, "loss": 1.105, "step": 264 }, { "epoch": 0.9545249887438091, "grad_norm": 0.3329117672496498, "learning_rate": 8.644843137107058e-06, "loss": 1.0749, "step": 265 }, { "epoch": 0.9581269698334084, "grad_norm": 0.3937476383650613, "learning_rate": 8.630474081877959e-06, "loss": 1.1018, "step": 266 }, { "epoch": 0.9617289509230077, "grad_norm": 0.31471194111230605, "learning_rate": 8.616041326576711e-06, "loss": 1.1058, "step": 267 }, { "epoch": 0.9653309320126069, "grad_norm": 0.36368215737547877, "learning_rate": 8.601545124439535e-06, "loss": 1.1285, "step": 268 }, { "epoch": 0.9689329131022062, "grad_norm": 0.3473607661443015, "learning_rate": 8.586985729815895e-06, "loss": 1.1326, "step": 269 }, { "epoch": 0.9725348941918055, "grad_norm": 0.377774377998179, "learning_rate": 8.572363398164017e-06, "loss": 1.1253, "step": 270 }, { "epoch": 0.9761368752814048, "grad_norm": 0.3611382706567514, "learning_rate": 8.557678386046429e-06, "loss": 1.1152, "step": 271 }, { "epoch": 0.9797388563710041, "grad_norm": 0.360467222912271, "learning_rate": 8.542930951125432e-06, "loss": 1.0612, "step": 272 }, { "epoch": 0.9833408374606033, "grad_norm": 0.36148714655712805, "learning_rate": 8.528121352158604e-06, "loss": 1.1254, "step": 273 }, { "epoch": 0.9869428185502026, "grad_norm": 0.4620821380277117, "learning_rate": 8.513249848994248e-06, "loss": 1.1151, "step": 274 }, { "epoch": 0.9905447996398019, "grad_norm": 0.3699494680308604, "learning_rate": 8.498316702566828e-06, "loss": 1.1331, "step": 275 }, { "epoch": 0.9941467807294012, "grad_norm": 0.3944738632436142, "learning_rate": 8.483322174892404e-06, "loss": 1.1218, "step": 276 }, { "epoch": 0.9977487618190004, "grad_norm": 0.37443116561016265, "learning_rate": 8.468266529064025e-06, "loss": 1.0858, "step": 277 }, { "epoch": 1.0, "grad_norm": 0.37443116561016265, "learning_rate": 8.453150029247115e-06, "loss": 1.1388, "step": 278 }, { "epoch": 1.0036019810895993, "grad_norm": 0.5412674452239241, "learning_rate": 8.437972940674838e-06, "loss": 1.0955, "step": 279 }, { "epoch": 1.0072039621791986, "grad_norm": 0.3606994742407109, "learning_rate": 8.422735529643445e-06, "loss": 1.0508, "step": 280 }, { "epoch": 1.010805943268798, "grad_norm": 0.500891921192956, "learning_rate": 8.4074380635076e-06, "loss": 1.0681, "step": 281 }, { "epoch": 1.0144079243583972, "grad_norm": 0.3250502662600929, "learning_rate": 8.392080810675692e-06, "loss": 1.0734, "step": 282 }, { "epoch": 1.0180099054479963, "grad_norm": 0.4496303032317973, "learning_rate": 8.376664040605122e-06, "loss": 1.0971, "step": 283 }, { "epoch": 1.0216118865375956, "grad_norm": 0.3582282301299312, "learning_rate": 8.361188023797581e-06, "loss": 1.1034, "step": 284 }, { "epoch": 1.025213867627195, "grad_norm": 0.4887190983762863, "learning_rate": 8.345653031794292e-06, "loss": 1.0937, "step": 285 }, { "epoch": 1.0288158487167942, "grad_norm": 0.419655234473874, "learning_rate": 8.33005933717126e-06, "loss": 1.0917, "step": 286 }, { "epoch": 1.0324178298063935, "grad_norm": 0.4289753872257514, "learning_rate": 8.314407213534477e-06, "loss": 1.0846, "step": 287 }, { "epoch": 1.0360198108959928, "grad_norm": 0.3868135415082272, "learning_rate": 8.298696935515132e-06, "loss": 1.0901, "step": 288 }, { "epoch": 1.0396217919855921, "grad_norm": 0.39062885945304754, "learning_rate": 8.282928778764783e-06, "loss": 1.1005, "step": 289 }, { "epoch": 1.0432237730751914, "grad_norm": 0.3750887902646042, "learning_rate": 8.267103019950529e-06, "loss": 1.092, "step": 290 }, { "epoch": 1.0468257541647907, "grad_norm": 0.3944074092851082, "learning_rate": 8.251219936750145e-06, "loss": 1.0559, "step": 291 }, { "epoch": 1.0504277352543898, "grad_norm": 0.442523607996674, "learning_rate": 8.235279807847223e-06, "loss": 1.0879, "step": 292 }, { "epoch": 1.0540297163439891, "grad_norm": 0.400701104226115, "learning_rate": 8.21928291292627e-06, "loss": 1.0761, "step": 293 }, { "epoch": 1.0576316974335884, "grad_norm": 0.3929872862213915, "learning_rate": 8.203229532667808e-06, "loss": 1.1122, "step": 294 }, { "epoch": 1.0612336785231877, "grad_norm": 0.4802600223498965, "learning_rate": 8.18711994874345e-06, "loss": 1.0431, "step": 295 }, { "epoch": 1.064835659612787, "grad_norm": 0.3896837748940082, "learning_rate": 8.170954443810947e-06, "loss": 1.0706, "step": 296 }, { "epoch": 1.0684376407023863, "grad_norm": 0.46461703487742634, "learning_rate": 8.154733301509249e-06, "loss": 1.1189, "step": 297 }, { "epoch": 1.0720396217919856, "grad_norm": 0.37467583421393963, "learning_rate": 8.138456806453503e-06, "loss": 1.0592, "step": 298 }, { "epoch": 1.075641602881585, "grad_norm": 0.4532029044825418, "learning_rate": 8.12212524423008e-06, "loss": 1.0787, "step": 299 }, { "epoch": 1.0792435839711843, "grad_norm": 0.3732477898818737, "learning_rate": 8.105738901391553e-06, "loss": 1.0592, "step": 300 }, { "epoch": 1.0828455650607833, "grad_norm": 0.39110742601041953, "learning_rate": 8.089298065451673e-06, "loss": 1.0412, "step": 301 }, { "epoch": 1.0864475461503826, "grad_norm": 0.4819959554895645, "learning_rate": 8.072803024880322e-06, "loss": 1.1164, "step": 302 }, { "epoch": 1.090049527239982, "grad_norm": 0.40121766412624266, "learning_rate": 8.05625406909846e-06, "loss": 1.1179, "step": 303 }, { "epoch": 1.0936515083295812, "grad_norm": 0.43991989176939206, "learning_rate": 8.039651488473028e-06, "loss": 1.0665, "step": 304 }, { "epoch": 1.0972534894191805, "grad_norm": 0.3655171144729394, "learning_rate": 8.022995574311876e-06, "loss": 1.0892, "step": 305 }, { "epoch": 1.1008554705087799, "grad_norm": 0.33944440562879685, "learning_rate": 8.006286618858634e-06, "loss": 1.0412, "step": 306 }, { "epoch": 1.1044574515983792, "grad_norm": 0.4079616754346733, "learning_rate": 7.989524915287595e-06, "loss": 1.0643, "step": 307 }, { "epoch": 1.1080594326879785, "grad_norm": 0.345780124558647, "learning_rate": 7.972710757698567e-06, "loss": 1.0982, "step": 308 }, { "epoch": 1.1116614137775778, "grad_norm": 0.42242099900664976, "learning_rate": 7.95584444111171e-06, "loss": 1.1034, "step": 309 }, { "epoch": 1.1152633948671768, "grad_norm": 0.4005977051798865, "learning_rate": 7.938926261462366e-06, "loss": 1.0703, "step": 310 }, { "epoch": 1.1188653759567762, "grad_norm": 0.36549179047552754, "learning_rate": 7.921956515595861e-06, "loss": 1.1015, "step": 311 }, { "epoch": 1.1224673570463755, "grad_norm": 0.4262110579462978, "learning_rate": 7.904935501262301e-06, "loss": 1.0648, "step": 312 }, { "epoch": 1.1260693381359748, "grad_norm": 0.37185963973126124, "learning_rate": 7.887863517111337e-06, "loss": 1.1019, "step": 313 }, { "epoch": 1.129671319225574, "grad_norm": 0.34072768994255465, "learning_rate": 7.87074086268695e-06, "loss": 1.049, "step": 314 }, { "epoch": 1.1332733003151734, "grad_norm": 0.38829026634405583, "learning_rate": 7.85356783842216e-06, "loss": 1.099, "step": 315 }, { "epoch": 1.1368752814047727, "grad_norm": 0.3454891333781692, "learning_rate": 7.836344745633785e-06, "loss": 1.0896, "step": 316 }, { "epoch": 1.140477262494372, "grad_norm": 0.40407073745828537, "learning_rate": 7.819071886517134e-06, "loss": 1.0885, "step": 317 }, { "epoch": 1.1440792435839713, "grad_norm": 0.40693031599207263, "learning_rate": 7.801749564140724e-06, "loss": 1.0621, "step": 318 }, { "epoch": 1.1476812246735704, "grad_norm": 0.41527967835091, "learning_rate": 7.78437808244094e-06, "loss": 1.0661, "step": 319 }, { "epoch": 1.1512832057631697, "grad_norm": 0.32461739079088076, "learning_rate": 7.76695774621672e-06, "loss": 1.0738, "step": 320 }, { "epoch": 1.154885186852769, "grad_norm": 0.369117935448055, "learning_rate": 7.7494888611242e-06, "loss": 1.0507, "step": 321 }, { "epoch": 1.1584871679423683, "grad_norm": 0.40286832831280445, "learning_rate": 7.731971733671347e-06, "loss": 1.0115, "step": 322 }, { "epoch": 1.1620891490319676, "grad_norm": 0.3562069260238656, "learning_rate": 7.714406671212589e-06, "loss": 1.0678, "step": 323 }, { "epoch": 1.1656911301215669, "grad_norm": 0.4156873725872057, "learning_rate": 7.696793981943418e-06, "loss": 1.0846, "step": 324 }, { "epoch": 1.1692931112111662, "grad_norm": 0.40288662850406076, "learning_rate": 7.679133974894984e-06, "loss": 1.0544, "step": 325 }, { "epoch": 1.1728950923007655, "grad_norm": 0.3739155064577844, "learning_rate": 7.66142695992867e-06, "loss": 1.0737, "step": 326 }, { "epoch": 1.1764970733903648, "grad_norm": 0.4406680015338461, "learning_rate": 7.64367324773066e-06, "loss": 1.1106, "step": 327 }, { "epoch": 1.1800990544799639, "grad_norm": 0.34345904522451653, "learning_rate": 7.6258731498064796e-06, "loss": 1.0759, "step": 328 }, { "epoch": 1.1837010355695632, "grad_norm": 0.4365590987719929, "learning_rate": 7.6080269784755405e-06, "loss": 1.0738, "step": 329 }, { "epoch": 1.1873030166591625, "grad_norm": 0.3633076504956387, "learning_rate": 7.590135046865652e-06, "loss": 1.1089, "step": 330 }, { "epoch": 1.1909049977487618, "grad_norm": 0.395068007524633, "learning_rate": 7.572197668907533e-06, "loss": 1.1244, "step": 331 }, { "epoch": 1.194506978838361, "grad_norm": 0.39729351127387735, "learning_rate": 7.5542151593293e-06, "loss": 1.0889, "step": 332 }, { "epoch": 1.1981089599279604, "grad_norm": 2.5701237358423255, "learning_rate": 7.536187833650947e-06, "loss": 1.1059, "step": 333 }, { "epoch": 1.2017109410175597, "grad_norm": 0.47480486252921483, "learning_rate": 7.518116008178805e-06, "loss": 1.0482, "step": 334 }, { "epoch": 1.205312922107159, "grad_norm": 0.40028253752269954, "learning_rate": 7.500000000000001e-06, "loss": 1.0897, "step": 335 }, { "epoch": 1.2089149031967583, "grad_norm": 0.45760958914786226, "learning_rate": 7.481840126976885e-06, "loss": 1.0907, "step": 336 }, { "epoch": 1.2125168842863574, "grad_norm": 0.40820185657609664, "learning_rate": 7.463636707741458e-06, "loss": 1.0526, "step": 337 }, { "epoch": 1.2161188653759567, "grad_norm": 0.4393002377107819, "learning_rate": 7.445390061689782e-06, "loss": 1.1063, "step": 338 }, { "epoch": 1.219720846465556, "grad_norm": 0.3828530976851444, "learning_rate": 7.42710050897637e-06, "loss": 1.0735, "step": 339 }, { "epoch": 1.2233228275551553, "grad_norm": 0.43353384279187324, "learning_rate": 7.408768370508577e-06, "loss": 1.0893, "step": 340 }, { "epoch": 1.2269248086447546, "grad_norm": 0.3888399760706739, "learning_rate": 7.390393967940962e-06, "loss": 1.0666, "step": 341 }, { "epoch": 1.230526789734354, "grad_norm": 0.4143338997564131, "learning_rate": 7.371977623669646e-06, "loss": 1.1293, "step": 342 }, { "epoch": 1.2341287708239532, "grad_norm": 0.32879022172398614, "learning_rate": 7.353519660826665e-06, "loss": 1.0879, "step": 343 }, { "epoch": 1.2377307519135525, "grad_norm": 0.357811781692411, "learning_rate": 7.335020403274277e-06, "loss": 1.0792, "step": 344 }, { "epoch": 1.2413327330031518, "grad_norm": 0.37966132927138474, "learning_rate": 7.31648017559931e-06, "loss": 1.0718, "step": 345 }, { "epoch": 1.244934714092751, "grad_norm": 0.3968902781929655, "learning_rate": 7.297899303107441e-06, "loss": 1.0676, "step": 346 }, { "epoch": 1.2485366951823502, "grad_norm": 0.35750082909244935, "learning_rate": 7.279278111817502e-06, "loss": 1.0227, "step": 347 }, { "epoch": 1.2521386762719495, "grad_norm": 0.42773056447698266, "learning_rate": 7.260616928455754e-06, "loss": 1.0496, "step": 348 }, { "epoch": 1.2557406573615488, "grad_norm": 0.40151554123482264, "learning_rate": 7.241916080450163e-06, "loss": 1.0696, "step": 349 }, { "epoch": 1.2593426384511481, "grad_norm": 0.4131771160185376, "learning_rate": 7.223175895924638e-06, "loss": 1.0915, "step": 350 }, { "epoch": 1.2629446195407474, "grad_norm": 0.4106261415779341, "learning_rate": 7.2043967036932935e-06, "loss": 1.0328, "step": 351 }, { "epoch": 1.2665466006303467, "grad_norm": 0.34879774503343425, "learning_rate": 7.185578833254665e-06, "loss": 1.057, "step": 352 }, { "epoch": 1.270148581719946, "grad_norm": 0.3836853786701187, "learning_rate": 7.166722614785937e-06, "loss": 1.0754, "step": 353 }, { "epoch": 1.2737505628095454, "grad_norm": 0.3800741116171829, "learning_rate": 7.1478283791371415e-06, "loss": 1.0841, "step": 354 }, { "epoch": 1.2773525438991444, "grad_norm": 0.34355233879245345, "learning_rate": 7.128896457825364e-06, "loss": 1.0632, "step": 355 }, { "epoch": 1.2809545249887437, "grad_norm": 0.42485239689947135, "learning_rate": 7.1099271830289155e-06, "loss": 1.0985, "step": 356 }, { "epoch": 1.284556506078343, "grad_norm": 0.4215135642816349, "learning_rate": 7.090920887581507e-06, "loss": 1.0707, "step": 357 }, { "epoch": 1.2881584871679423, "grad_norm": 0.35779792395285226, "learning_rate": 7.071877904966422e-06, "loss": 1.0815, "step": 358 }, { "epoch": 1.2917604682575416, "grad_norm": 0.43513590600966623, "learning_rate": 7.052798569310641e-06, "loss": 1.1024, "step": 359 }, { "epoch": 1.295362449347141, "grad_norm": 0.3709332183690677, "learning_rate": 7.033683215379002e-06, "loss": 1.0788, "step": 360 }, { "epoch": 1.2989644304367403, "grad_norm": 0.3951949031993291, "learning_rate": 7.014532178568314e-06, "loss": 1.05, "step": 361 }, { "epoch": 1.3025664115263396, "grad_norm": 0.33987216437327716, "learning_rate": 6.995345794901477e-06, "loss": 1.0697, "step": 362 }, { "epoch": 1.3061683926159389, "grad_norm": 0.3780771954200527, "learning_rate": 6.976124401021583e-06, "loss": 1.0729, "step": 363 }, { "epoch": 1.309770373705538, "grad_norm": 0.3908638367202069, "learning_rate": 6.9568683341860135e-06, "loss": 1.0328, "step": 364 }, { "epoch": 1.3133723547951373, "grad_norm": 0.38289027631075667, "learning_rate": 6.9375779322605154e-06, "loss": 1.0928, "step": 365 }, { "epoch": 1.3169743358847366, "grad_norm": 0.3614859878801377, "learning_rate": 6.9182535337132824e-06, "loss": 1.0756, "step": 366 }, { "epoch": 1.3205763169743359, "grad_norm": 0.3598395419280253, "learning_rate": 6.898895477609007e-06, "loss": 1.103, "step": 367 }, { "epoch": 1.3241782980639352, "grad_norm": 0.37455771512012165, "learning_rate": 6.879504103602934e-06, "loss": 1.0758, "step": 368 }, { "epoch": 1.3277802791535345, "grad_norm": 0.40574624953367006, "learning_rate": 6.860079751934908e-06, "loss": 1.0794, "step": 369 }, { "epoch": 1.3313822602431338, "grad_norm": 0.3725008186416374, "learning_rate": 6.840622763423391e-06, "loss": 1.0756, "step": 370 }, { "epoch": 1.334984241332733, "grad_norm": 0.3646005212300081, "learning_rate": 6.821133479459492e-06, "loss": 1.0959, "step": 371 }, { "epoch": 1.3385862224223324, "grad_norm": 0.4113247656948934, "learning_rate": 6.8016122420009745e-06, "loss": 1.0835, "step": 372 }, { "epoch": 1.3421882035119315, "grad_norm": 0.35986780686629694, "learning_rate": 6.782059393566254e-06, "loss": 1.0589, "step": 373 }, { "epoch": 1.3457901846015308, "grad_norm": 0.40465760674379114, "learning_rate": 6.762475277228393e-06, "loss": 1.0825, "step": 374 }, { "epoch": 1.34939216569113, "grad_norm": 0.3988091604755472, "learning_rate": 6.7428602366090764e-06, "loss": 1.0716, "step": 375 }, { "epoch": 1.3529941467807294, "grad_norm": 0.40966197148471073, "learning_rate": 6.723214615872585e-06, "loss": 1.0922, "step": 376 }, { "epoch": 1.3565961278703287, "grad_norm": 0.42278479417615644, "learning_rate": 6.70353875971976e-06, "loss": 1.0802, "step": 377 }, { "epoch": 1.360198108959928, "grad_norm": 0.35541048253819246, "learning_rate": 6.683833013381942e-06, "loss": 1.0872, "step": 378 }, { "epoch": 1.3638000900495273, "grad_norm": 0.3502392276010618, "learning_rate": 6.664097722614934e-06, "loss": 1.0583, "step": 379 }, { "epoch": 1.3674020711391266, "grad_norm": 0.38138433478225825, "learning_rate": 6.644333233692917e-06, "loss": 1.0692, "step": 380 }, { "epoch": 1.371004052228726, "grad_norm": 0.3377587100898131, "learning_rate": 6.624539893402383e-06, "loss": 1.079, "step": 381 }, { "epoch": 1.374606033318325, "grad_norm": 0.3409130145543055, "learning_rate": 6.604718049036047e-06, "loss": 1.0794, "step": 382 }, { "epoch": 1.3782080144079243, "grad_norm": 0.37652977172080476, "learning_rate": 6.58486804838676e-06, "loss": 1.0908, "step": 383 }, { "epoch": 1.3818099954975236, "grad_norm": 0.3257917788013114, "learning_rate": 6.5649902397413915e-06, "loss": 1.0856, "step": 384 }, { "epoch": 1.385411976587123, "grad_norm": 0.3195784470315217, "learning_rate": 6.545084971874738e-06, "loss": 1.1038, "step": 385 }, { "epoch": 1.3890139576767222, "grad_norm": 0.3616341536622016, "learning_rate": 6.525152594043389e-06, "loss": 1.0814, "step": 386 }, { "epoch": 1.3926159387663215, "grad_norm": 0.40527856810901475, "learning_rate": 6.505193455979603e-06, "loss": 1.0994, "step": 387 }, { "epoch": 1.3962179198559208, "grad_norm": 0.35495797940458634, "learning_rate": 6.485207907885175e-06, "loss": 1.063, "step": 388 }, { "epoch": 1.3998199009455201, "grad_norm": 0.3327326350539732, "learning_rate": 6.465196300425287e-06, "loss": 1.1113, "step": 389 }, { "epoch": 1.4034218820351194, "grad_norm": 0.36061962924782925, "learning_rate": 6.445158984722358e-06, "loss": 1.0644, "step": 390 }, { "epoch": 1.4070238631247185, "grad_norm": 0.34339629638301855, "learning_rate": 6.425096312349881e-06, "loss": 1.0904, "step": 391 }, { "epoch": 1.4106258442143178, "grad_norm": 0.34978035011668274, "learning_rate": 6.4050086353262565e-06, "loss": 1.0788, "step": 392 }, { "epoch": 1.414227825303917, "grad_norm": 0.38376522345500463, "learning_rate": 6.384896306108612e-06, "loss": 1.0564, "step": 393 }, { "epoch": 1.4178298063935164, "grad_norm": 0.35566301025630226, "learning_rate": 6.364759677586627e-06, "loss": 1.083, "step": 394 }, { "epoch": 1.4214317874831157, "grad_norm": 0.32888274597150713, "learning_rate": 6.344599103076329e-06, "loss": 1.0868, "step": 395 }, { "epoch": 1.425033768572715, "grad_norm": 0.3669493819260475, "learning_rate": 6.324414936313904e-06, "loss": 1.0402, "step": 396 }, { "epoch": 1.4286357496623143, "grad_norm": 0.3617926547245432, "learning_rate": 6.304207531449486e-06, "loss": 1.0905, "step": 397 }, { "epoch": 1.4322377307519136, "grad_norm": 0.36414284887525294, "learning_rate": 6.28397724304094e-06, "loss": 1.0827, "step": 398 }, { "epoch": 1.435839711841513, "grad_norm": 0.3386723453983854, "learning_rate": 6.2637244260476474e-06, "loss": 1.061, "step": 399 }, { "epoch": 1.439441692931112, "grad_norm": 0.34781644394622324, "learning_rate": 6.243449435824276e-06, "loss": 1.0802, "step": 400 }, { "epoch": 1.4430436740207113, "grad_norm": 0.3503485349700194, "learning_rate": 6.223152628114537e-06, "loss": 1.0664, "step": 401 }, { "epoch": 1.4466456551103106, "grad_norm": 0.3328702702553546, "learning_rate": 6.202834359044959e-06, "loss": 1.0701, "step": 402 }, { "epoch": 1.45024763619991, "grad_norm": 0.33914752395171327, "learning_rate": 6.182494985118625e-06, "loss": 1.0538, "step": 403 }, { "epoch": 1.4538496172895092, "grad_norm": 0.39570376225734244, "learning_rate": 6.1621348632089205e-06, "loss": 1.0608, "step": 404 }, { "epoch": 1.4574515983791085, "grad_norm": 0.3515088932100447, "learning_rate": 6.141754350553279e-06, "loss": 1.0665, "step": 405 }, { "epoch": 1.4610535794687078, "grad_norm": 0.3393163686581987, "learning_rate": 6.121353804746907e-06, "loss": 1.0678, "step": 406 }, { "epoch": 1.4646555605583071, "grad_norm": 0.363682561032409, "learning_rate": 6.100933583736508e-06, "loss": 1.0712, "step": 407 }, { "epoch": 1.4682575416479065, "grad_norm": 0.3852133533106999, "learning_rate": 6.080494045814011e-06, "loss": 1.0675, "step": 408 }, { "epoch": 1.4718595227375055, "grad_norm": 0.33227273331193696, "learning_rate": 6.060035549610275e-06, "loss": 1.0702, "step": 409 }, { "epoch": 1.4754615038271048, "grad_norm": 0.34242254777920744, "learning_rate": 6.039558454088796e-06, "loss": 1.0887, "step": 410 }, { "epoch": 1.4790634849167041, "grad_norm": 0.36440240139479513, "learning_rate": 6.019063118539425e-06, "loss": 1.0629, "step": 411 }, { "epoch": 1.4826654660063034, "grad_norm": 0.34522394073241824, "learning_rate": 5.9985499025720354e-06, "loss": 1.0604, "step": 412 }, { "epoch": 1.4862674470959027, "grad_norm": 0.35469984623846545, "learning_rate": 5.978019166110242e-06, "loss": 1.0732, "step": 413 }, { "epoch": 1.489869428185502, "grad_norm": 0.3327229153625461, "learning_rate": 5.957471269385065e-06, "loss": 1.0916, "step": 414 }, { "epoch": 1.4934714092751014, "grad_norm": 0.35947455556579844, "learning_rate": 5.936906572928625e-06, "loss": 1.0857, "step": 415 }, { "epoch": 1.4970733903647007, "grad_norm": 0.3523660373102713, "learning_rate": 5.9163254375677995e-06, "loss": 1.0354, "step": 416 }, { "epoch": 1.5006753714543, "grad_norm": 0.33259457908461304, "learning_rate": 5.8957282244179125e-06, "loss": 1.081, "step": 417 }, { "epoch": 1.504277352543899, "grad_norm": 0.36043116131180003, "learning_rate": 5.8751152948763815e-06, "loss": 1.0882, "step": 418 }, { "epoch": 1.5078793336334986, "grad_norm": 0.3644427491180247, "learning_rate": 5.854487010616384e-06, "loss": 1.0753, "step": 419 }, { "epoch": 1.5114813147230977, "grad_norm": 0.35395707576036406, "learning_rate": 5.8338437335805124e-06, "loss": 1.0953, "step": 420 }, { "epoch": 1.515083295812697, "grad_norm": 0.48229948381614823, "learning_rate": 5.813185825974419e-06, "loss": 1.1207, "step": 421 }, { "epoch": 1.5186852769022963, "grad_norm": 0.38738601915939525, "learning_rate": 5.792513650260465e-06, "loss": 1.0958, "step": 422 }, { "epoch": 1.5222872579918956, "grad_norm": 0.3259233561804611, "learning_rate": 5.771827569151357e-06, "loss": 1.0954, "step": 423 }, { "epoch": 1.5258892390814949, "grad_norm": 0.3332133749381523, "learning_rate": 5.751127945603786e-06, "loss": 1.0927, "step": 424 }, { "epoch": 1.529491220171094, "grad_norm": 0.32989085101834736, "learning_rate": 5.730415142812059e-06, "loss": 1.0527, "step": 425 }, { "epoch": 1.5330932012606935, "grad_norm": 0.36851444118947263, "learning_rate": 5.709689524201723e-06, "loss": 1.0583, "step": 426 }, { "epoch": 1.5366951823502926, "grad_norm": 0.35379667109046153, "learning_rate": 5.68895145342319e-06, "loss": 1.0943, "step": 427 }, { "epoch": 1.540297163439892, "grad_norm": 0.37779417160624446, "learning_rate": 5.668201294345363e-06, "loss": 1.0812, "step": 428 }, { "epoch": 1.5438991445294912, "grad_norm": 0.3256143401933206, "learning_rate": 5.647439411049235e-06, "loss": 1.0646, "step": 429 }, { "epoch": 1.5475011256190905, "grad_norm": 0.34874005725965246, "learning_rate": 5.626666167821522e-06, "loss": 1.0753, "step": 430 }, { "epoch": 1.5511031067086898, "grad_norm": 0.315911724351532, "learning_rate": 5.605881929148254e-06, "loss": 1.062, "step": 431 }, { "epoch": 1.554705087798289, "grad_norm": 0.3684384723712539, "learning_rate": 5.585087059708389e-06, "loss": 1.0853, "step": 432 }, { "epoch": 1.5583070688878884, "grad_norm": 0.3803316237726428, "learning_rate": 5.5642819243674085e-06, "loss": 1.0471, "step": 433 }, { "epoch": 1.5619090499774875, "grad_norm": 0.3322373067202158, "learning_rate": 5.543466888170927e-06, "loss": 1.0472, "step": 434 }, { "epoch": 1.565511031067087, "grad_norm": 0.35808606269858695, "learning_rate": 5.522642316338268e-06, "loss": 1.0435, "step": 435 }, { "epoch": 1.569113012156686, "grad_norm": 0.3651095676354401, "learning_rate": 5.5018085742560745e-06, "loss": 1.0415, "step": 436 }, { "epoch": 1.5727149932462856, "grad_norm": 0.3434911579987937, "learning_rate": 5.480966027471889e-06, "loss": 1.0683, "step": 437 }, { "epoch": 1.5763169743358847, "grad_norm": 0.3704214813504543, "learning_rate": 5.460115041687737e-06, "loss": 1.0413, "step": 438 }, { "epoch": 1.579918955425484, "grad_norm": 0.34635012302936774, "learning_rate": 5.439255982753717e-06, "loss": 1.097, "step": 439 }, { "epoch": 1.5835209365150833, "grad_norm": 0.3894870354521757, "learning_rate": 5.41838921666158e-06, "loss": 1.0078, "step": 440 }, { "epoch": 1.5871229176046826, "grad_norm": 0.32628973132390515, "learning_rate": 5.3975151095383e-06, "loss": 1.0708, "step": 441 }, { "epoch": 1.590724898694282, "grad_norm": 0.353493468436304, "learning_rate": 5.376634027639664e-06, "loss": 1.0893, "step": 442 }, { "epoch": 1.594326879783881, "grad_norm": 0.35574519333131815, "learning_rate": 5.355746337343835e-06, "loss": 1.0992, "step": 443 }, { "epoch": 1.5979288608734805, "grad_norm": 0.3721178525169961, "learning_rate": 5.334852405144926e-06, "loss": 1.08, "step": 444 }, { "epoch": 1.6015308419630796, "grad_norm": 0.35260152951310547, "learning_rate": 5.3139525976465675e-06, "loss": 1.0573, "step": 445 }, { "epoch": 1.6051328230526791, "grad_norm": 0.3102871481104457, "learning_rate": 5.293047281555482e-06, "loss": 1.0845, "step": 446 }, { "epoch": 1.6087348041422782, "grad_norm": 0.34166757066038683, "learning_rate": 5.272136823675046e-06, "loss": 1.0644, "step": 447 }, { "epoch": 1.6123367852318775, "grad_norm": 0.3555363375900774, "learning_rate": 5.251221590898848e-06, "loss": 1.086, "step": 448 }, { "epoch": 1.6159387663214768, "grad_norm": 0.3487085514723649, "learning_rate": 5.230301950204261e-06, "loss": 1.0655, "step": 449 }, { "epoch": 1.6195407474110761, "grad_norm": 0.3481330917315912, "learning_rate": 5.209378268645998e-06, "loss": 1.0804, "step": 450 }, { "epoch": 1.6231427285006754, "grad_norm": 0.3479613181209226, "learning_rate": 5.188450913349674e-06, "loss": 1.0802, "step": 451 }, { "epoch": 1.6267447095902745, "grad_norm": 0.329411221852206, "learning_rate": 5.167520251505358e-06, "loss": 1.049, "step": 452 }, { "epoch": 1.630346690679874, "grad_norm": 0.373931387358175, "learning_rate": 5.146586650361143e-06, "loss": 1.05, "step": 453 }, { "epoch": 1.6339486717694731, "grad_norm": 0.33179661919955405, "learning_rate": 5.1256504772166885e-06, "loss": 1.068, "step": 454 }, { "epoch": 1.6375506528590726, "grad_norm": 0.3248298243422403, "learning_rate": 5.1047120994167855e-06, "loss": 1.0459, "step": 455 }, { "epoch": 1.6411526339486717, "grad_norm": 0.32872162641417635, "learning_rate": 5.083771884344908e-06, "loss": 1.1005, "step": 456 }, { "epoch": 1.644754615038271, "grad_norm": 0.3350940372390754, "learning_rate": 5.062830199416764e-06, "loss": 1.0729, "step": 457 }, { "epoch": 1.6483565961278703, "grad_norm": 0.3500027893173043, "learning_rate": 5.041887412073853e-06, "loss": 1.056, "step": 458 }, { "epoch": 1.6519585772174696, "grad_norm": 0.352966799403548, "learning_rate": 5.0209438897770205e-06, "loss": 1.0368, "step": 459 }, { "epoch": 1.655560558307069, "grad_norm": 0.3570213767690983, "learning_rate": 5e-06, "loss": 1.1005, "step": 460 }, { "epoch": 1.659162539396668, "grad_norm": 0.35681386612113786, "learning_rate": 4.979056110222982e-06, "loss": 1.0552, "step": 461 }, { "epoch": 1.6627645204862675, "grad_norm": 0.37777499026486816, "learning_rate": 4.9581125879261476e-06, "loss": 1.0655, "step": 462 }, { "epoch": 1.6663665015758666, "grad_norm": 0.3720854550185324, "learning_rate": 4.937169800583237e-06, "loss": 1.0905, "step": 463 }, { "epoch": 1.6699684826654662, "grad_norm": 0.35871679077028334, "learning_rate": 4.9162281156550945e-06, "loss": 1.0735, "step": 464 }, { "epoch": 1.6735704637550652, "grad_norm": 0.32746933618519874, "learning_rate": 4.895287900583216e-06, "loss": 1.0375, "step": 465 }, { "epoch": 1.6771724448446645, "grad_norm": 0.38118331962443885, "learning_rate": 4.874349522783313e-06, "loss": 1.0646, "step": 466 }, { "epoch": 1.6807744259342638, "grad_norm": 0.35946893815132736, "learning_rate": 4.853413349638859e-06, "loss": 1.0828, "step": 467 }, { "epoch": 1.6843764070238632, "grad_norm": 0.3666759251106939, "learning_rate": 4.832479748494643e-06, "loss": 1.0551, "step": 468 }, { "epoch": 1.6879783881134625, "grad_norm": 0.3508414333413744, "learning_rate": 4.811549086650327e-06, "loss": 1.0599, "step": 469 }, { "epoch": 1.6915803692030615, "grad_norm": 0.35707827749845356, "learning_rate": 4.7906217313540035e-06, "loss": 1.0759, "step": 470 }, { "epoch": 1.695182350292661, "grad_norm": 0.3788611425520442, "learning_rate": 4.769698049795739e-06, "loss": 1.0357, "step": 471 }, { "epoch": 1.6987843313822601, "grad_norm": 0.31406557358200793, "learning_rate": 4.748778409101153e-06, "loss": 1.0835, "step": 472 }, { "epoch": 1.7023863124718597, "grad_norm": 0.330042469142287, "learning_rate": 4.727863176324955e-06, "loss": 1.0356, "step": 473 }, { "epoch": 1.7059882935614588, "grad_norm": 0.3692982637487251, "learning_rate": 4.706952718444518e-06, "loss": 1.0224, "step": 474 }, { "epoch": 1.709590274651058, "grad_norm": 0.3401427800549121, "learning_rate": 4.686047402353433e-06, "loss": 1.056, "step": 475 }, { "epoch": 1.7131922557406574, "grad_norm": 0.32559689457568264, "learning_rate": 4.6651475948550765e-06, "loss": 1.072, "step": 476 }, { "epoch": 1.7167942368302567, "grad_norm": 0.33516871825696326, "learning_rate": 4.644253662656167e-06, "loss": 1.056, "step": 477 }, { "epoch": 1.720396217919856, "grad_norm": 0.34667604652264583, "learning_rate": 4.6233659723603374e-06, "loss": 1.0555, "step": 478 }, { "epoch": 1.723998199009455, "grad_norm": 0.37161554561132193, "learning_rate": 4.602484890461702e-06, "loss": 1.0563, "step": 479 }, { "epoch": 1.7276001800990546, "grad_norm": 0.31848164234901527, "learning_rate": 4.581610783338424e-06, "loss": 1.0941, "step": 480 }, { "epoch": 1.7312021611886537, "grad_norm": 0.3594252163107309, "learning_rate": 4.560744017246284e-06, "loss": 1.0751, "step": 481 }, { "epoch": 1.7348041422782532, "grad_norm": 0.3544059030193539, "learning_rate": 4.539884958312265e-06, "loss": 1.0469, "step": 482 }, { "epoch": 1.7384061233678523, "grad_norm": 0.32959836916197055, "learning_rate": 4.519033972528114e-06, "loss": 1.063, "step": 483 }, { "epoch": 1.7420081044574516, "grad_norm": 0.31875669846466737, "learning_rate": 4.4981914257439254e-06, "loss": 1.0575, "step": 484 }, { "epoch": 1.7456100855470509, "grad_norm": 0.31894865236781544, "learning_rate": 4.477357683661734e-06, "loss": 1.0591, "step": 485 }, { "epoch": 1.7492120666366502, "grad_norm": 0.355450564535886, "learning_rate": 4.456533111829076e-06, "loss": 1.1063, "step": 486 }, { "epoch": 1.7528140477262495, "grad_norm": 0.35660689731655426, "learning_rate": 4.4357180756325915e-06, "loss": 1.025, "step": 487 }, { "epoch": 1.7564160288158486, "grad_norm": 0.35786503142864406, "learning_rate": 4.414912940291614e-06, "loss": 1.0731, "step": 488 }, { "epoch": 1.760018009905448, "grad_norm": 0.33168551352286857, "learning_rate": 4.394118070851749e-06, "loss": 1.0152, "step": 489 }, { "epoch": 1.7636199909950472, "grad_norm": 0.31557735871932313, "learning_rate": 4.373333832178478e-06, "loss": 1.0658, "step": 490 }, { "epoch": 1.7672219720846467, "grad_norm": 0.3643796172518758, "learning_rate": 4.352560588950766e-06, "loss": 1.0788, "step": 491 }, { "epoch": 1.7708239531742458, "grad_norm": 0.3342250830340594, "learning_rate": 4.331798705654639e-06, "loss": 1.0929, "step": 492 }, { "epoch": 1.774425934263845, "grad_norm": 0.8828365078514346, "learning_rate": 4.31104854657681e-06, "loss": 1.0723, "step": 493 }, { "epoch": 1.7780279153534444, "grad_norm": 0.32325289629440884, "learning_rate": 4.290310475798278e-06, "loss": 1.0472, "step": 494 }, { "epoch": 1.7816298964430437, "grad_norm": 0.36167307979154, "learning_rate": 4.269584857187942e-06, "loss": 1.0803, "step": 495 }, { "epoch": 1.785231877532643, "grad_norm": 0.3112535067026799, "learning_rate": 4.248872054396215e-06, "loss": 1.0522, "step": 496 }, { "epoch": 1.788833858622242, "grad_norm": 0.34086457664496106, "learning_rate": 4.228172430848645e-06, "loss": 1.0775, "step": 497 }, { "epoch": 1.7924358397118416, "grad_norm": 0.30310958551948036, "learning_rate": 4.207486349739538e-06, "loss": 1.0487, "step": 498 }, { "epoch": 1.7960378208014407, "grad_norm": 0.3511261941310994, "learning_rate": 4.186814174025582e-06, "loss": 1.0844, "step": 499 }, { "epoch": 1.7996398018910402, "grad_norm": 0.3503896982707501, "learning_rate": 4.166156266419489e-06, "loss": 1.0132, "step": 500 }, { "epoch": 1.8032417829806393, "grad_norm": 0.3436670120686724, "learning_rate": 4.145512989383618e-06, "loss": 1.0598, "step": 501 }, { "epoch": 1.8068437640702386, "grad_norm": 0.34785159088570466, "learning_rate": 4.124884705123619e-06, "loss": 1.035, "step": 502 }, { "epoch": 1.810445745159838, "grad_norm": 0.33165542782534474, "learning_rate": 4.104271775582089e-06, "loss": 1.0358, "step": 503 }, { "epoch": 1.8140477262494372, "grad_norm": 0.4070554454819295, "learning_rate": 4.083674562432203e-06, "loss": 1.0434, "step": 504 }, { "epoch": 1.8176497073390365, "grad_norm": 0.3624273928106286, "learning_rate": 4.063093427071376e-06, "loss": 1.0995, "step": 505 }, { "epoch": 1.8212516884286356, "grad_norm": 0.32728343459448644, "learning_rate": 4.042528730614935e-06, "loss": 1.0178, "step": 506 }, { "epoch": 1.8248536695182351, "grad_norm": 0.31687128264594894, "learning_rate": 4.02198083388976e-06, "loss": 1.0649, "step": 507 }, { "epoch": 1.8284556506078342, "grad_norm": 0.3744296721151693, "learning_rate": 4.001450097427965e-06, "loss": 1.0252, "step": 508 }, { "epoch": 1.8320576316974337, "grad_norm": 0.3132845490517284, "learning_rate": 3.980936881460576e-06, "loss": 1.0795, "step": 509 }, { "epoch": 1.8356596127870328, "grad_norm": 0.3420985303611589, "learning_rate": 3.960441545911205e-06, "loss": 1.0746, "step": 510 }, { "epoch": 1.8392615938766321, "grad_norm": 0.307478947705401, "learning_rate": 3.939964450389728e-06, "loss": 1.0853, "step": 511 }, { "epoch": 1.8428635749662314, "grad_norm": 0.33071208183560175, "learning_rate": 3.91950595418599e-06, "loss": 1.0528, "step": 512 }, { "epoch": 1.8464655560558307, "grad_norm": 0.3340631255697292, "learning_rate": 3.899066416263493e-06, "loss": 1.0198, "step": 513 }, { "epoch": 1.85006753714543, "grad_norm": 0.32856275060694273, "learning_rate": 3.8786461952530955e-06, "loss": 1.0969, "step": 514 }, { "epoch": 1.8536695182350291, "grad_norm": 0.3416526016934508, "learning_rate": 3.8582456494467214e-06, "loss": 1.0863, "step": 515 }, { "epoch": 1.8572714993246286, "grad_norm": 0.3185489633634486, "learning_rate": 3.83786513679108e-06, "loss": 1.0385, "step": 516 }, { "epoch": 1.8608734804142277, "grad_norm": 0.32623706503747446, "learning_rate": 3.817505014881378e-06, "loss": 1.0144, "step": 517 }, { "epoch": 1.8644754615038273, "grad_norm": 0.32218552881112167, "learning_rate": 3.797165640955041e-06, "loss": 1.0705, "step": 518 }, { "epoch": 1.8680774425934263, "grad_norm": 0.30139922286880777, "learning_rate": 3.776847371885464e-06, "loss": 1.0493, "step": 519 }, { "epoch": 1.8716794236830256, "grad_norm": 0.377029678977555, "learning_rate": 3.756550564175727e-06, "loss": 1.0494, "step": 520 }, { "epoch": 1.875281404772625, "grad_norm": 0.345992186114263, "learning_rate": 3.736275573952354e-06, "loss": 1.0228, "step": 521 }, { "epoch": 1.8788833858622243, "grad_norm": 0.3033717191767872, "learning_rate": 3.716022756959061e-06, "loss": 1.0896, "step": 522 }, { "epoch": 1.8824853669518236, "grad_norm": 0.32124744536039596, "learning_rate": 3.695792468550517e-06, "loss": 1.0467, "step": 523 }, { "epoch": 1.8860873480414226, "grad_norm": 0.3470918739977987, "learning_rate": 3.6755850636860956e-06, "loss": 1.0524, "step": 524 }, { "epoch": 1.8896893291310222, "grad_norm": 0.38530316140769727, "learning_rate": 3.655400896923672e-06, "loss": 1.0685, "step": 525 }, { "epoch": 1.8932913102206212, "grad_norm": 0.34989092164325525, "learning_rate": 3.635240322413375e-06, "loss": 1.0953, "step": 526 }, { "epoch": 1.8968932913102208, "grad_norm": 0.30619682455422553, "learning_rate": 3.6151036938913887e-06, "loss": 1.0866, "step": 527 }, { "epoch": 1.9004952723998199, "grad_norm": 0.33603759713948095, "learning_rate": 3.5949913646737456e-06, "loss": 1.0562, "step": 528 }, { "epoch": 1.9040972534894192, "grad_norm": 0.36283301326571393, "learning_rate": 3.5749036876501196e-06, "loss": 1.0877, "step": 529 }, { "epoch": 1.9076992345790185, "grad_norm": 0.32449495469021755, "learning_rate": 3.5548410152776414e-06, "loss": 1.0926, "step": 530 }, { "epoch": 1.9113012156686178, "grad_norm": 0.30909965481121454, "learning_rate": 3.5348036995747135e-06, "loss": 1.0924, "step": 531 }, { "epoch": 1.914903196758217, "grad_norm": 0.31489637458548564, "learning_rate": 3.5147920921148267e-06, "loss": 1.0828, "step": 532 }, { "epoch": 1.9185051778478162, "grad_norm": 0.3330350499732472, "learning_rate": 3.4948065440203982e-06, "loss": 1.0685, "step": 533 }, { "epoch": 1.9221071589374157, "grad_norm": 0.3107239605147675, "learning_rate": 3.474847405956613e-06, "loss": 1.0618, "step": 534 }, { "epoch": 1.9257091400270148, "grad_norm": 0.34315983029146, "learning_rate": 3.4549150281252635e-06, "loss": 1.1048, "step": 535 }, { "epoch": 1.9293111211166143, "grad_norm": 0.3227737291510128, "learning_rate": 3.4350097602586085e-06, "loss": 1.0491, "step": 536 }, { "epoch": 1.9329131022062134, "grad_norm": 0.30320716585058866, "learning_rate": 3.4151319516132414e-06, "loss": 1.0179, "step": 537 }, { "epoch": 1.9365150832958127, "grad_norm": 0.32851520671529355, "learning_rate": 3.3952819509639534e-06, "loss": 1.0495, "step": 538 }, { "epoch": 1.940117064385412, "grad_norm": 0.33703603920644415, "learning_rate": 3.375460106597619e-06, "loss": 1.0148, "step": 539 }, { "epoch": 1.9437190454750113, "grad_norm": 0.3097947001373875, "learning_rate": 3.355666766307084e-06, "loss": 1.1156, "step": 540 }, { "epoch": 1.9473210265646106, "grad_norm": 0.3068291690397491, "learning_rate": 3.3359022773850673e-06, "loss": 1.0315, "step": 541 }, { "epoch": 1.9509230076542097, "grad_norm": 0.3376120770611421, "learning_rate": 3.31616698661806e-06, "loss": 1.0564, "step": 542 }, { "epoch": 1.9545249887438092, "grad_norm": 0.29124426624612915, "learning_rate": 3.2964612402802422e-06, "loss": 1.0689, "step": 543 }, { "epoch": 1.9581269698334083, "grad_norm": 0.2948609004870433, "learning_rate": 3.2767853841274154e-06, "loss": 1.0684, "step": 544 }, { "epoch": 1.9617289509230078, "grad_norm": 0.3558137852450314, "learning_rate": 3.2571397633909252e-06, "loss": 1.0452, "step": 545 }, { "epoch": 1.965330932012607, "grad_norm": 0.3008490022930337, "learning_rate": 3.2375247227716077e-06, "loss": 1.0235, "step": 546 }, { "epoch": 1.9689329131022062, "grad_norm": 0.31102274440087274, "learning_rate": 3.217940606433747e-06, "loss": 1.0575, "step": 547 }, { "epoch": 1.9725348941918055, "grad_norm": 0.33011139669007306, "learning_rate": 3.1983877579990276e-06, "loss": 1.0419, "step": 548 }, { "epoch": 1.9761368752814048, "grad_norm": 0.314573659163516, "learning_rate": 3.178866520540509e-06, "loss": 1.0364, "step": 549 }, { "epoch": 1.979738856371004, "grad_norm": 0.313348256207242, "learning_rate": 3.1593772365766107e-06, "loss": 1.0615, "step": 550 }, { "epoch": 1.9833408374606032, "grad_norm": 0.32258850263277733, "learning_rate": 3.139920248065095e-06, "loss": 1.0896, "step": 551 }, { "epoch": 1.9869428185502027, "grad_norm": 0.3154271741355316, "learning_rate": 3.1204958963970666e-06, "loss": 1.0501, "step": 552 }, { "epoch": 1.9905447996398018, "grad_norm": 0.3406321762516789, "learning_rate": 3.1011045223909954e-06, "loss": 1.0761, "step": 553 }, { "epoch": 1.9941467807294013, "grad_norm": 0.31522039092493703, "learning_rate": 3.0817464662867192e-06, "loss": 1.0556, "step": 554 }, { "epoch": 1.9977487618190004, "grad_norm": 0.3147135586324213, "learning_rate": 3.0624220677394854e-06, "loss": 1.0857, "step": 555 }, { "epoch": 2.0, "grad_norm": 0.44015502627582576, "learning_rate": 3.043131665813988e-06, "loss": 1.0463, "step": 556 }, { "epoch": 2.003601981089599, "grad_norm": 0.3983894180930443, "learning_rate": 3.023875598978419e-06, "loss": 1.0393, "step": 557 }, { "epoch": 2.0072039621791986, "grad_norm": 0.3067099309247845, "learning_rate": 3.004654205098524e-06, "loss": 1.0605, "step": 558 }, { "epoch": 2.0108059432687977, "grad_norm": 0.3357353560695444, "learning_rate": 2.9854678214316875e-06, "loss": 1.0417, "step": 559 }, { "epoch": 2.014407924358397, "grad_norm": 0.33999056854523163, "learning_rate": 2.966316784621e-06, "loss": 1.024, "step": 560 }, { "epoch": 2.0180099054479963, "grad_norm": 0.31226386349104857, "learning_rate": 2.9472014306893605e-06, "loss": 1.0485, "step": 561 }, { "epoch": 2.021611886537596, "grad_norm": 0.31603080640243814, "learning_rate": 2.92812209503358e-06, "loss": 1.0414, "step": 562 }, { "epoch": 2.025213867627195, "grad_norm": 0.3459067464369587, "learning_rate": 2.9090791124184934e-06, "loss": 1.0756, "step": 563 }, { "epoch": 2.0288158487167944, "grad_norm": 0.3330987662321279, "learning_rate": 2.8900728169710866e-06, "loss": 1.054, "step": 564 }, { "epoch": 2.0324178298063935, "grad_norm": 0.2927527791629305, "learning_rate": 2.871103542174637e-06, "loss": 1.0665, "step": 565 }, { "epoch": 2.0360198108959926, "grad_norm": 0.33886103259967365, "learning_rate": 2.8521716208628597e-06, "loss": 1.0595, "step": 566 }, { "epoch": 2.039621791985592, "grad_norm": 0.31642624101440003, "learning_rate": 2.8332773852140644e-06, "loss": 1.0177, "step": 567 }, { "epoch": 2.043223773075191, "grad_norm": 0.32001337639878913, "learning_rate": 2.814421166745337e-06, "loss": 1.0461, "step": 568 }, { "epoch": 2.0468257541647907, "grad_norm": 0.3024517860403017, "learning_rate": 2.795603296306708e-06, "loss": 1.0665, "step": 569 }, { "epoch": 2.05042773525439, "grad_norm": 0.3442352491720829, "learning_rate": 2.776824104075364e-06, "loss": 1.0368, "step": 570 }, { "epoch": 2.0540297163439893, "grad_norm": 0.3293545677344158, "learning_rate": 2.7580839195498397e-06, "loss": 1.043, "step": 571 }, { "epoch": 2.0576316974335884, "grad_norm": 0.31332600205637107, "learning_rate": 2.739383071544246e-06, "loss": 1.0476, "step": 572 }, { "epoch": 2.061233678523188, "grad_norm": 0.32030515645666624, "learning_rate": 2.7207218881825016e-06, "loss": 1.0542, "step": 573 }, { "epoch": 2.064835659612787, "grad_norm": 0.3364547680473557, "learning_rate": 2.7021006968925613e-06, "loss": 1.0364, "step": 574 }, { "epoch": 2.068437640702386, "grad_norm": 0.3145267889190637, "learning_rate": 2.683519824400693e-06, "loss": 1.0621, "step": 575 }, { "epoch": 2.0720396217919856, "grad_norm": 0.3221194724938311, "learning_rate": 2.6649795967257243e-06, "loss": 1.0827, "step": 576 }, { "epoch": 2.0756416028815847, "grad_norm": 0.31171845327350106, "learning_rate": 2.646480339173337e-06, "loss": 1.0327, "step": 577 }, { "epoch": 2.0792435839711843, "grad_norm": 0.2884769700670282, "learning_rate": 2.6280223763303546e-06, "loss": 1.0488, "step": 578 }, { "epoch": 2.0828455650607833, "grad_norm": 0.3214670803689811, "learning_rate": 2.6096060320590393e-06, "loss": 1.0175, "step": 579 }, { "epoch": 2.086447546150383, "grad_norm": 0.29588927971342016, "learning_rate": 2.5912316294914232e-06, "loss": 1.0299, "step": 580 }, { "epoch": 2.090049527239982, "grad_norm": 0.3179276570664923, "learning_rate": 2.5728994910236304e-06, "loss": 1.0416, "step": 581 }, { "epoch": 2.0936515083295815, "grad_norm": 0.32306719861333466, "learning_rate": 2.5546099383102206e-06, "loss": 1.043, "step": 582 }, { "epoch": 2.0972534894191805, "grad_norm": 0.3201304980987189, "learning_rate": 2.536363292258543e-06, "loss": 1.0612, "step": 583 }, { "epoch": 2.1008554705087796, "grad_norm": 0.2925431078505077, "learning_rate": 2.518159873023116e-06, "loss": 1.0317, "step": 584 }, { "epoch": 2.104457451598379, "grad_norm": 0.33029491252375176, "learning_rate": 2.5000000000000015e-06, "loss": 1.0468, "step": 585 }, { "epoch": 2.1080594326879782, "grad_norm": 0.3093075735098707, "learning_rate": 2.4818839918211963e-06, "loss": 1.0264, "step": 586 }, { "epoch": 2.1116614137775778, "grad_norm": 0.33517262514779006, "learning_rate": 2.4638121663490546e-06, "loss": 1.0125, "step": 587 }, { "epoch": 2.115263394867177, "grad_norm": 0.29875408693671646, "learning_rate": 2.4457848406707014e-06, "loss": 1.0145, "step": 588 }, { "epoch": 2.1188653759567764, "grad_norm": 0.3461841979966471, "learning_rate": 2.4278023310924676e-06, "loss": 1.0526, "step": 589 }, { "epoch": 2.1224673570463755, "grad_norm": 0.316595399237612, "learning_rate": 2.40986495313435e-06, "loss": 1.0276, "step": 590 }, { "epoch": 2.126069338135975, "grad_norm": 0.35509810185127894, "learning_rate": 2.391973021524461e-06, "loss": 1.0236, "step": 591 }, { "epoch": 2.129671319225574, "grad_norm": 0.31577714871143664, "learning_rate": 2.3741268501935212e-06, "loss": 1.0668, "step": 592 }, { "epoch": 2.133273300315173, "grad_norm": 0.30363778483757914, "learning_rate": 2.356326752269342e-06, "loss": 1.0399, "step": 593 }, { "epoch": 2.1368752814047727, "grad_norm": 0.2951535358604565, "learning_rate": 2.338573040071332e-06, "loss": 1.0674, "step": 594 }, { "epoch": 2.1404772624943718, "grad_norm": 0.2916733821330133, "learning_rate": 2.320866025105016e-06, "loss": 1.0251, "step": 595 }, { "epoch": 2.1440792435839713, "grad_norm": 0.28186913326636054, "learning_rate": 2.303206018056583e-06, "loss": 1.04, "step": 596 }, { "epoch": 2.1476812246735704, "grad_norm": 0.3291151473315765, "learning_rate": 2.285593328787414e-06, "loss": 1.0173, "step": 597 }, { "epoch": 2.15128320576317, "grad_norm": 0.3065042887064959, "learning_rate": 2.268028266328655e-06, "loss": 1.0294, "step": 598 }, { "epoch": 2.154885186852769, "grad_norm": 0.3025327291635849, "learning_rate": 2.250511138875801e-06, "loss": 1.046, "step": 599 }, { "epoch": 2.1584871679423685, "grad_norm": 0.28100597708822905, "learning_rate": 2.23304225378328e-06, "loss": 1.0289, "step": 600 }, { "epoch": 2.1620891490319676, "grad_norm": 0.31006702418509163, "learning_rate": 2.2156219175590623e-06, "loss": 1.023, "step": 601 }, { "epoch": 2.1656911301215667, "grad_norm": 0.2962970912734003, "learning_rate": 2.1982504358592777e-06, "loss": 1.0775, "step": 602 }, { "epoch": 2.169293111211166, "grad_norm": 0.32597592835782413, "learning_rate": 2.1809281134828663e-06, "loss": 1.0409, "step": 603 }, { "epoch": 2.1728950923007653, "grad_norm": 0.30783288748012444, "learning_rate": 2.1636552543662187e-06, "loss": 1.0609, "step": 604 }, { "epoch": 2.176497073390365, "grad_norm": 0.3197715534590088, "learning_rate": 2.146432161577842e-06, "loss": 1.0341, "step": 605 }, { "epoch": 2.180099054479964, "grad_norm": 0.30989671790932927, "learning_rate": 2.1292591373130515e-06, "loss": 1.0292, "step": 606 }, { "epoch": 2.1837010355695634, "grad_norm": 0.315438048629008, "learning_rate": 2.112136482888663e-06, "loss": 1.0171, "step": 607 }, { "epoch": 2.1873030166591625, "grad_norm": 0.3140514935443536, "learning_rate": 2.095064498737701e-06, "loss": 1.0406, "step": 608 }, { "epoch": 2.190904997748762, "grad_norm": 0.36444536122323545, "learning_rate": 2.07804348440414e-06, "loss": 1.0474, "step": 609 }, { "epoch": 2.194506978838361, "grad_norm": 0.32038773790017444, "learning_rate": 2.061073738537635e-06, "loss": 1.0406, "step": 610 }, { "epoch": 2.19810895992796, "grad_norm": 0.3327293671870662, "learning_rate": 2.04415555888829e-06, "loss": 1.0473, "step": 611 }, { "epoch": 2.2017109410175597, "grad_norm": 0.3110981786314029, "learning_rate": 2.027289242301435e-06, "loss": 1.0674, "step": 612 }, { "epoch": 2.205312922107159, "grad_norm": 0.3001638141607491, "learning_rate": 2.0104750847124075e-06, "loss": 1.0543, "step": 613 }, { "epoch": 2.2089149031967583, "grad_norm": 0.3336943922980599, "learning_rate": 1.9937133811413666e-06, "loss": 1.0378, "step": 614 }, { "epoch": 2.2125168842863574, "grad_norm": 0.3189387269628659, "learning_rate": 1.977004425688126e-06, "loss": 1.0182, "step": 615 }, { "epoch": 2.216118865375957, "grad_norm": 0.2953897029068166, "learning_rate": 1.9603485115269743e-06, "loss": 1.0307, "step": 616 }, { "epoch": 2.219720846465556, "grad_norm": 0.3354775874836228, "learning_rate": 1.9437459309015426e-06, "loss": 1.0582, "step": 617 }, { "epoch": 2.2233228275551555, "grad_norm": 0.3155756131913934, "learning_rate": 1.927196975119678e-06, "loss": 1.0729, "step": 618 }, { "epoch": 2.2269248086447546, "grad_norm": 0.30314021638523436, "learning_rate": 1.910701934548329e-06, "loss": 1.0675, "step": 619 }, { "epoch": 2.2305267897343537, "grad_norm": 0.3178641177812712, "learning_rate": 1.8942610986084487e-06, "loss": 1.0193, "step": 620 }, { "epoch": 2.2341287708239532, "grad_norm": 0.27497288823347316, "learning_rate": 1.8778747557699223e-06, "loss": 1.0397, "step": 621 }, { "epoch": 2.2377307519135523, "grad_norm": 0.27473090413221113, "learning_rate": 1.8615431935464984e-06, "loss": 1.0814, "step": 622 }, { "epoch": 2.241332733003152, "grad_norm": 0.29969676069735396, "learning_rate": 1.8452666984907519e-06, "loss": 1.0464, "step": 623 }, { "epoch": 2.244934714092751, "grad_norm": 0.29678899593418184, "learning_rate": 1.829045556189053e-06, "loss": 1.0635, "step": 624 }, { "epoch": 2.2485366951823504, "grad_norm": 0.30067878966920947, "learning_rate": 1.8128800512565514e-06, "loss": 1.0325, "step": 625 }, { "epoch": 2.2521386762719495, "grad_norm": 0.28275287186797854, "learning_rate": 1.7967704673321917e-06, "loss": 1.0606, "step": 626 }, { "epoch": 2.2557406573615486, "grad_norm": 0.27780010911864134, "learning_rate": 1.7807170870737317e-06, "loss": 1.061, "step": 627 }, { "epoch": 2.259342638451148, "grad_norm": 0.30625432629700144, "learning_rate": 1.7647201921527802e-06, "loss": 1.0008, "step": 628 }, { "epoch": 2.2629446195407477, "grad_norm": 0.30785488054179105, "learning_rate": 1.7487800632498547e-06, "loss": 1.0072, "step": 629 }, { "epoch": 2.2665466006303467, "grad_norm": 0.3052922709373008, "learning_rate": 1.7328969800494727e-06, "loss": 1.0549, "step": 630 }, { "epoch": 2.270148581719946, "grad_norm": 0.3040902290300385, "learning_rate": 1.7170712212352187e-06, "loss": 1.0484, "step": 631 }, { "epoch": 2.2737505628095454, "grad_norm": 0.3114270358030963, "learning_rate": 1.7013030644848698e-06, "loss": 1.065, "step": 632 }, { "epoch": 2.2773525438991444, "grad_norm": 0.30134193406734877, "learning_rate": 1.6855927864655241e-06, "loss": 1.0003, "step": 633 }, { "epoch": 2.280954524988744, "grad_norm": 0.29795611858634163, "learning_rate": 1.6699406628287423e-06, "loss": 1.0447, "step": 634 }, { "epoch": 2.284556506078343, "grad_norm": 0.3148996427378843, "learning_rate": 1.6543469682057105e-06, "loss": 1.0709, "step": 635 }, { "epoch": 2.2881584871679426, "grad_norm": 0.29121080028920326, "learning_rate": 1.6388119762024213e-06, "loss": 1.0482, "step": 636 }, { "epoch": 2.2917604682575416, "grad_norm": 0.288859982023708, "learning_rate": 1.6233359593948777e-06, "loss": 1.0803, "step": 637 }, { "epoch": 2.2953624493471407, "grad_norm": 0.33078409972348893, "learning_rate": 1.6079191893243102e-06, "loss": 1.0652, "step": 638 }, { "epoch": 2.2989644304367403, "grad_norm": 0.306929918665794, "learning_rate": 1.5925619364924016e-06, "loss": 1.0426, "step": 639 }, { "epoch": 2.3025664115263393, "grad_norm": 0.27619745792679457, "learning_rate": 1.5772644703565564e-06, "loss": 1.0295, "step": 640 }, { "epoch": 2.306168392615939, "grad_norm": 0.2889850236837387, "learning_rate": 1.5620270593251635e-06, "loss": 1.0017, "step": 641 }, { "epoch": 2.309770373705538, "grad_norm": 0.2997580312987809, "learning_rate": 1.5468499707528856e-06, "loss": 1.0433, "step": 642 }, { "epoch": 2.3133723547951375, "grad_norm": 0.2934596363171497, "learning_rate": 1.531733470935976e-06, "loss": 1.1025, "step": 643 }, { "epoch": 2.3169743358847366, "grad_norm": 0.2984641308954964, "learning_rate": 1.5166778251075964e-06, "loss": 1.0295, "step": 644 }, { "epoch": 2.3205763169743356, "grad_norm": 0.30803103965283024, "learning_rate": 1.5016832974331725e-06, "loss": 1.0625, "step": 645 }, { "epoch": 2.324178298063935, "grad_norm": 0.3075838465758903, "learning_rate": 1.4867501510057548e-06, "loss": 1.0274, "step": 646 }, { "epoch": 2.3277802791535347, "grad_norm": 0.2794304614904837, "learning_rate": 1.4718786478413983e-06, "loss": 1.0705, "step": 647 }, { "epoch": 2.3313822602431338, "grad_norm": 0.2854931127642351, "learning_rate": 1.4570690488745687e-06, "loss": 1.072, "step": 648 }, { "epoch": 2.334984241332733, "grad_norm": 0.2976715865621331, "learning_rate": 1.4423216139535735e-06, "loss": 1.0519, "step": 649 }, { "epoch": 2.3385862224223324, "grad_norm": 0.2867454551588936, "learning_rate": 1.4276366018359845e-06, "loss": 1.0656, "step": 650 }, { "epoch": 2.3421882035119315, "grad_norm": 0.2997877110355175, "learning_rate": 1.4130142701841076e-06, "loss": 1.0207, "step": 651 }, { "epoch": 2.345790184601531, "grad_norm": 0.29242665391081074, "learning_rate": 1.3984548755604655e-06, "loss": 1.0295, "step": 652 }, { "epoch": 2.34939216569113, "grad_norm": 0.28957351738255677, "learning_rate": 1.3839586734232907e-06, "loss": 1.0187, "step": 653 }, { "epoch": 2.3529941467807296, "grad_norm": 0.31920070289406366, "learning_rate": 1.3695259181220405e-06, "loss": 1.0309, "step": 654 }, { "epoch": 2.3565961278703287, "grad_norm": 0.3015410819374185, "learning_rate": 1.3551568628929434e-06, "loss": 1.0334, "step": 655 }, { "epoch": 2.3601981089599278, "grad_norm": 0.3076515876213567, "learning_rate": 1.3408517598545446e-06, "loss": 0.9868, "step": 656 }, { "epoch": 2.3638000900495273, "grad_norm": 0.3176154294797028, "learning_rate": 1.3266108600032928e-06, "loss": 1.0543, "step": 657 }, { "epoch": 2.3674020711391264, "grad_norm": 0.3071265193798959, "learning_rate": 1.312434413209131e-06, "loss": 1.0443, "step": 658 }, { "epoch": 2.371004052228726, "grad_norm": 0.2868126401515119, "learning_rate": 1.2983226682111094e-06, "loss": 1.0031, "step": 659 }, { "epoch": 2.374606033318325, "grad_norm": 0.30911382077034116, "learning_rate": 1.2842758726130283e-06, "loss": 1.0161, "step": 660 }, { "epoch": 2.3782080144079245, "grad_norm": 0.292638404203282, "learning_rate": 1.2702942728790897e-06, "loss": 1.0435, "step": 661 }, { "epoch": 2.3818099954975236, "grad_norm": 0.3131805047178825, "learning_rate": 1.2563781143295705e-06, "loss": 1.0817, "step": 662 }, { "epoch": 2.3854119765871227, "grad_norm": 0.27540737813728455, "learning_rate": 1.24252764113652e-06, "loss": 1.0299, "step": 663 }, { "epoch": 2.389013957676722, "grad_norm": 0.36871741578270023, "learning_rate": 1.2287430963194807e-06, "loss": 1.0207, "step": 664 }, { "epoch": 2.3926159387663217, "grad_norm": 0.2795983680218255, "learning_rate": 1.2150247217412186e-06, "loss": 1.0227, "step": 665 }, { "epoch": 2.396217919855921, "grad_norm": 0.32352488852148475, "learning_rate": 1.2013727581034783e-06, "loss": 1.0234, "step": 666 }, { "epoch": 2.39981990094552, "grad_norm": 0.297035764973451, "learning_rate": 1.18778744494276e-06, "loss": 1.0286, "step": 667 }, { "epoch": 2.4034218820351194, "grad_norm": 0.3019576240092055, "learning_rate": 1.1742690206261293e-06, "loss": 1.0221, "step": 668 }, { "epoch": 2.4070238631247185, "grad_norm": 0.2935186392415227, "learning_rate": 1.160817722347014e-06, "loss": 1.0341, "step": 669 }, { "epoch": 2.410625844214318, "grad_norm": 0.29550059263202566, "learning_rate": 1.1474337861210543e-06, "loss": 1.0312, "step": 670 }, { "epoch": 2.414227825303917, "grad_norm": 0.2919473066109118, "learning_rate": 1.1341174467819637e-06, "loss": 1.0145, "step": 671 }, { "epoch": 2.4178298063935166, "grad_norm": 0.2770789920290827, "learning_rate": 1.120868937977404e-06, "loss": 1.0233, "step": 672 }, { "epoch": 2.4214317874831157, "grad_norm": 0.2726692177451945, "learning_rate": 1.1076884921648834e-06, "loss": 1.0763, "step": 673 }, { "epoch": 2.425033768572715, "grad_norm": 0.2955147912539993, "learning_rate": 1.0945763406076837e-06, "loss": 1.0431, "step": 674 }, { "epoch": 2.4286357496623143, "grad_norm": 0.2937917795969739, "learning_rate": 1.0815327133708015e-06, "loss": 1.0238, "step": 675 }, { "epoch": 2.4322377307519134, "grad_norm": 0.26163762430548565, "learning_rate": 1.0685578393169054e-06, "loss": 1.0572, "step": 676 }, { "epoch": 2.435839711841513, "grad_norm": 0.30900006884034514, "learning_rate": 1.0556519461023301e-06, "loss": 1.0148, "step": 677 }, { "epoch": 2.439441692931112, "grad_norm": 0.28768364202012864, "learning_rate": 1.0428152601730718e-06, "loss": 1.0526, "step": 678 }, { "epoch": 2.4430436740207115, "grad_norm": 0.31064843044382456, "learning_rate": 1.0300480067608232e-06, "loss": 1.0131, "step": 679 }, { "epoch": 2.4466456551103106, "grad_norm": 0.2790738585338088, "learning_rate": 1.0173504098790188e-06, "loss": 1.0432, "step": 680 }, { "epoch": 2.4502476361999097, "grad_norm": 0.2675802235255538, "learning_rate": 1.0047226923189024e-06, "loss": 1.0592, "step": 681 }, { "epoch": 2.4538496172895092, "grad_norm": 0.2820500392897806, "learning_rate": 9.921650756456164e-07, "loss": 1.0406, "step": 682 }, { "epoch": 2.4574515983791088, "grad_norm": 0.2712979205650806, "learning_rate": 9.79677780194327e-07, "loss": 1.0469, "step": 683 }, { "epoch": 2.461053579468708, "grad_norm": 0.28678767820139744, "learning_rate": 9.67261025066339e-07, "loss": 0.9984, "step": 684 }, { "epoch": 2.464655560558307, "grad_norm": 0.2826669677633045, "learning_rate": 9.549150281252633e-07, "loss": 1.0439, "step": 685 }, { "epoch": 2.4682575416479065, "grad_norm": 0.2898206571793001, "learning_rate": 9.426400059931956e-07, "loss": 1.0046, "step": 686 }, { "epoch": 2.4718595227375055, "grad_norm": 0.3055004594875761, "learning_rate": 9.304361740469103e-07, "loss": 1.0269, "step": 687 }, { "epoch": 2.475461503827105, "grad_norm": 0.28000409560086253, "learning_rate": 9.183037464140804e-07, "loss": 1.0274, "step": 688 }, { "epoch": 2.479063484916704, "grad_norm": 0.2956303701365367, "learning_rate": 9.06242935969528e-07, "loss": 1.0458, "step": 689 }, { "epoch": 2.4826654660063037, "grad_norm": 0.29216773948217756, "learning_rate": 8.942539543314799e-07, "loss": 1.0254, "step": 690 }, { "epoch": 2.4862674470959027, "grad_norm": 0.3038042079243165, "learning_rate": 8.823370118578628e-07, "loss": 1.048, "step": 691 }, { "epoch": 2.489869428185502, "grad_norm": 0.27942852348483804, "learning_rate": 8.704923176426072e-07, "loss": 1.0092, "step": 692 }, { "epoch": 2.4934714092751014, "grad_norm": 0.28829213899925576, "learning_rate": 8.587200795119793e-07, "loss": 1.0443, "step": 693 }, { "epoch": 2.4970733903647004, "grad_norm": 0.3075901120188406, "learning_rate": 8.470205040209362e-07, "loss": 1.0754, "step": 694 }, { "epoch": 2.5006753714543, "grad_norm": 0.28958620497520454, "learning_rate": 8.353937964495029e-07, "loss": 1.0209, "step": 695 }, { "epoch": 2.504277352543899, "grad_norm": 0.27671142949543276, "learning_rate": 8.238401607991647e-07, "loss": 1.0168, "step": 696 }, { "epoch": 2.5078793336334986, "grad_norm": 0.273803663136562, "learning_rate": 8.123597997892918e-07, "loss": 1.0222, "step": 697 }, { "epoch": 2.5114813147230977, "grad_norm": 0.2822637921207432, "learning_rate": 8.009529148535855e-07, "loss": 1.0219, "step": 698 }, { "epoch": 2.5150832958126967, "grad_norm": 0.2943865446548967, "learning_rate": 7.89619706136539e-07, "loss": 1.0429, "step": 699 }, { "epoch": 2.5186852769022963, "grad_norm": 0.27277951808804135, "learning_rate": 7.783603724899258e-07, "loss": 1.0446, "step": 700 }, { "epoch": 2.522287257991896, "grad_norm": 0.27077579945237057, "learning_rate": 7.671751114693104e-07, "loss": 1.0381, "step": 701 }, { "epoch": 2.525889239081495, "grad_norm": 0.27280532919773465, "learning_rate": 7.560641193305912e-07, "loss": 1.0239, "step": 702 }, { "epoch": 2.529491220171094, "grad_norm": 0.2790422908370001, "learning_rate": 7.450275910265415e-07, "loss": 1.0249, "step": 703 }, { "epoch": 2.5330932012606935, "grad_norm": 0.27523036662265893, "learning_rate": 7.34065720203399e-07, "loss": 1.0051, "step": 704 }, { "epoch": 2.5366951823502926, "grad_norm": 0.2771023164701056, "learning_rate": 7.23178699197467e-07, "loss": 1.0354, "step": 705 }, { "epoch": 2.540297163439892, "grad_norm": 0.2666702640648939, "learning_rate": 7.123667190317396e-07, "loss": 1.0277, "step": 706 }, { "epoch": 2.543899144529491, "grad_norm": 0.2781381560124755, "learning_rate": 7.01629969412545e-07, "loss": 1.0658, "step": 707 }, { "epoch": 2.5475011256190907, "grad_norm": 0.2880670528582805, "learning_rate": 6.909686387262255e-07, "loss": 1.0257, "step": 708 }, { "epoch": 2.55110310670869, "grad_norm": 0.2772242207866922, "learning_rate": 6.803829140358237e-07, "loss": 1.0496, "step": 709 }, { "epoch": 2.554705087798289, "grad_norm": 0.34213950210212746, "learning_rate": 6.698729810778065e-07, "loss": 1.0317, "step": 710 }, { "epoch": 2.5583070688878884, "grad_norm": 0.29889700157480303, "learning_rate": 6.594390242588044e-07, "loss": 1.0357, "step": 711 }, { "epoch": 2.5619090499774875, "grad_norm": 0.281098345760278, "learning_rate": 6.490812266523716e-07, "loss": 1.0532, "step": 712 }, { "epoch": 2.565511031067087, "grad_norm": 0.27502660732949663, "learning_rate": 6.387997699957815e-07, "loss": 1.0581, "step": 713 }, { "epoch": 2.569113012156686, "grad_norm": 0.2914484204504316, "learning_rate": 6.28594834686832e-07, "loss": 0.9895, "step": 714 }, { "epoch": 2.5727149932462856, "grad_norm": 0.3129296314272526, "learning_rate": 6.184665997806832e-07, "loss": 1.0147, "step": 715 }, { "epoch": 2.5763169743358847, "grad_norm": 0.32079210829379234, "learning_rate": 6.084152429867113e-07, "loss": 1.0406, "step": 716 }, { "epoch": 2.5799189554254838, "grad_norm": 0.29202037625742366, "learning_rate": 5.98440940665399e-07, "loss": 1.0466, "step": 717 }, { "epoch": 2.5835209365150833, "grad_norm": 0.295396320039525, "learning_rate": 5.885438678252342e-07, "loss": 1.0549, "step": 718 }, { "epoch": 2.587122917604683, "grad_norm": 0.3054962097094399, "learning_rate": 5.787241981196384e-07, "loss": 1.0325, "step": 719 }, { "epoch": 2.590724898694282, "grad_norm": 0.28026222206941487, "learning_rate": 5.689821038439264e-07, "loss": 1.051, "step": 720 }, { "epoch": 2.594326879783881, "grad_norm": 0.32032760630187107, "learning_rate": 5.593177559322776e-07, "loss": 1.0006, "step": 721 }, { "epoch": 2.5979288608734805, "grad_norm": 0.3213596256851929, "learning_rate": 5.497313239547374e-07, "loss": 0.9861, "step": 722 }, { "epoch": 2.6015308419630796, "grad_norm": 0.28144430111804236, "learning_rate": 5.402229761142464e-07, "loss": 1.0751, "step": 723 }, { "epoch": 2.605132823052679, "grad_norm": 0.31364935141684536, "learning_rate": 5.307928792436812e-07, "loss": 1.0723, "step": 724 }, { "epoch": 2.608734804142278, "grad_norm": 0.27300612039452254, "learning_rate": 5.214411988029355e-07, "loss": 1.0382, "step": 725 }, { "epoch": 2.6123367852318777, "grad_norm": 0.2856505846764118, "learning_rate": 5.121680988760125e-07, "loss": 1.0649, "step": 726 }, { "epoch": 2.615938766321477, "grad_norm": 0.30242925411320504, "learning_rate": 5.029737421681446e-07, "loss": 1.0253, "step": 727 }, { "epoch": 2.619540747411076, "grad_norm": 0.2760307219806458, "learning_rate": 4.938582900029437e-07, "loss": 1.0117, "step": 728 }, { "epoch": 2.6231427285006754, "grad_norm": 0.2813571189889987, "learning_rate": 4.848219023195644e-07, "loss": 1.0558, "step": 729 }, { "epoch": 2.6267447095902745, "grad_norm": 0.27273042137937875, "learning_rate": 4.758647376699033e-07, "loss": 1.044, "step": 730 }, { "epoch": 2.630346690679874, "grad_norm": 0.2962356696193387, "learning_rate": 4.6698695321581165e-07, "loss": 1.0591, "step": 731 }, { "epoch": 2.633948671769473, "grad_norm": 0.3021705699786841, "learning_rate": 4.581887047263445e-07, "loss": 1.0098, "step": 732 }, { "epoch": 2.6375506528590726, "grad_norm": 0.3116020937459149, "learning_rate": 4.494701465750217e-07, "loss": 0.9851, "step": 733 }, { "epoch": 2.6411526339486717, "grad_norm": 0.26030040597442616, "learning_rate": 4.4083143173712207e-07, "loss": 1.0507, "step": 734 }, { "epoch": 2.644754615038271, "grad_norm": 0.2754248870408236, "learning_rate": 4.322727117869951e-07, "loss": 1.0644, "step": 735 }, { "epoch": 2.6483565961278703, "grad_norm": 0.28205843108275575, "learning_rate": 4.237941368954124e-07, "loss": 1.0424, "step": 736 }, { "epoch": 2.65195857721747, "grad_norm": 0.29294591358526023, "learning_rate": 4.153958558269189e-07, "loss": 1.0482, "step": 737 }, { "epoch": 2.655560558307069, "grad_norm": 0.27169863680028755, "learning_rate": 4.0707801593723006e-07, "loss": 1.0163, "step": 738 }, { "epoch": 2.659162539396668, "grad_norm": 0.27860960386729466, "learning_rate": 3.9884076317064813e-07, "loss": 1.0333, "step": 739 }, { "epoch": 2.6627645204862675, "grad_norm": 0.2770379539586451, "learning_rate": 3.90684242057498e-07, "loss": 1.0574, "step": 740 }, { "epoch": 2.6663665015758666, "grad_norm": 0.2668295100673454, "learning_rate": 3.8260859571158883e-07, "loss": 1.018, "step": 741 }, { "epoch": 2.669968482665466, "grad_norm": 0.30903619747795014, "learning_rate": 3.7461396582771035e-07, "loss": 1.033, "step": 742 }, { "epoch": 2.6735704637550652, "grad_norm": 0.28672971110408385, "learning_rate": 3.6670049267913954e-07, "loss": 1.0349, "step": 743 }, { "epoch": 2.6771724448446648, "grad_norm": 0.276249475350017, "learning_rate": 3.5886831511518336e-07, "loss": 1.0317, "step": 744 }, { "epoch": 2.680774425934264, "grad_norm": 0.2676884521307514, "learning_rate": 3.511175705587433e-07, "loss": 1.0254, "step": 745 }, { "epoch": 2.684376407023863, "grad_norm": 0.2983580659797731, "learning_rate": 3.434483950038986e-07, "loss": 1.0412, "step": 746 }, { "epoch": 2.6879783881134625, "grad_norm": 0.28438744098885343, "learning_rate": 3.358609230135268e-07, "loss": 1.0502, "step": 747 }, { "epoch": 2.6915803692030615, "grad_norm": 0.2715735909059339, "learning_rate": 3.283552877169399e-07, "loss": 1.0289, "step": 748 }, { "epoch": 2.695182350292661, "grad_norm": 0.2970172751919579, "learning_rate": 3.2093162080754634e-07, "loss": 1.0279, "step": 749 }, { "epoch": 2.69878433138226, "grad_norm": 0.2845334892172726, "learning_rate": 3.135900525405428e-07, "loss": 1.0137, "step": 750 }, { "epoch": 2.7023863124718597, "grad_norm": 0.27255472615309784, "learning_rate": 3.0633071173062966e-07, "loss": 1.0262, "step": 751 }, { "epoch": 2.7059882935614588, "grad_norm": 0.27752394798698354, "learning_rate": 2.99153725749749e-07, "loss": 1.0454, "step": 752 }, { "epoch": 2.709590274651058, "grad_norm": 0.276728317098971, "learning_rate": 2.920592205248496e-07, "loss": 0.9963, "step": 753 }, { "epoch": 2.7131922557406574, "grad_norm": 0.27889890261263245, "learning_rate": 2.850473205356774e-07, "loss": 1.0716, "step": 754 }, { "epoch": 2.716794236830257, "grad_norm": 0.31799452745614826, "learning_rate": 2.7811814881259503e-07, "loss": 1.0446, "step": 755 }, { "epoch": 2.720396217919856, "grad_norm": 0.27681370096335217, "learning_rate": 2.712718269344161e-07, "loss": 1.0388, "step": 756 }, { "epoch": 2.723998199009455, "grad_norm": 0.36693230305260593, "learning_rate": 2.6450847502627883e-07, "loss": 1.0537, "step": 757 }, { "epoch": 2.7276001800990546, "grad_norm": 0.29241171362928314, "learning_rate": 2.578282117575343e-07, "loss": 1.0404, "step": 758 }, { "epoch": 2.7312021611886537, "grad_norm": 0.2579639214579812, "learning_rate": 2.5123115433966615e-07, "loss": 1.0798, "step": 759 }, { "epoch": 2.734804142278253, "grad_norm": 0.27185445345296794, "learning_rate": 2.447174185242324e-07, "loss": 1.0262, "step": 760 }, { "epoch": 2.7384061233678523, "grad_norm": 0.2886015166476546, "learning_rate": 2.3828711860083676e-07, "loss": 1.0316, "step": 761 }, { "epoch": 2.742008104457452, "grad_norm": 0.3031146928198293, "learning_rate": 2.319403673951204e-07, "loss": 1.0525, "step": 762 }, { "epoch": 2.745610085547051, "grad_norm": 0.2703340838231308, "learning_rate": 2.2567727626678527e-07, "loss": 1.0723, "step": 763 }, { "epoch": 2.74921206663665, "grad_norm": 0.2650868418407948, "learning_rate": 2.1949795510763872e-07, "loss": 1.0205, "step": 764 }, { "epoch": 2.7528140477262495, "grad_norm": 0.26760082891770176, "learning_rate": 2.134025123396638e-07, "loss": 1.0261, "step": 765 }, { "epoch": 2.7564160288158486, "grad_norm": 0.2717529102257122, "learning_rate": 2.0739105491312028e-07, "loss": 1.044, "step": 766 }, { "epoch": 2.760018009905448, "grad_norm": 0.26225450814352563, "learning_rate": 2.0146368830466668e-07, "loss": 1.0793, "step": 767 }, { "epoch": 2.763619990995047, "grad_norm": 0.3063150762890465, "learning_rate": 1.9562051651550784e-07, "loss": 1.0673, "step": 768 }, { "epoch": 2.7672219720846467, "grad_norm": 0.2774099125784962, "learning_rate": 1.8986164206957037e-07, "loss": 1.0673, "step": 769 }, { "epoch": 2.770823953174246, "grad_norm": 0.2847835154599416, "learning_rate": 1.841871660117095e-07, "loss": 1.0311, "step": 770 }, { "epoch": 2.774425934263845, "grad_norm": 0.25146022356844167, "learning_rate": 1.785971879059273e-07, "loss": 1.0194, "step": 771 }, { "epoch": 2.7780279153534444, "grad_norm": 0.3024944481082907, "learning_rate": 1.7309180583363062e-07, "loss": 0.9938, "step": 772 }, { "epoch": 2.781629896443044, "grad_norm": 0.2875295564587982, "learning_rate": 1.6767111639191202e-07, "loss": 1.0043, "step": 773 }, { "epoch": 2.785231877532643, "grad_norm": 0.28086121255020596, "learning_rate": 1.6233521469185054e-07, "loss": 1.0393, "step": 774 }, { "epoch": 2.788833858622242, "grad_norm": 0.28337014562304264, "learning_rate": 1.5708419435684463e-07, "loss": 1.0467, "step": 775 }, { "epoch": 2.7924358397118416, "grad_norm": 0.2695975671118399, "learning_rate": 1.5191814752097024e-07, "loss": 1.0277, "step": 776 }, { "epoch": 2.7960378208014407, "grad_norm": 0.2596950818220317, "learning_rate": 1.4683716482736364e-07, "loss": 1.0375, "step": 777 }, { "epoch": 2.7996398018910402, "grad_norm": 0.2833370915529864, "learning_rate": 1.4184133542663014e-07, "loss": 1.0402, "step": 778 }, { "epoch": 2.8032417829806393, "grad_norm": 0.2837127477632618, "learning_rate": 1.3693074697528231e-07, "loss": 1.0663, "step": 779 }, { "epoch": 2.806843764070239, "grad_norm": 0.258408242099485, "learning_rate": 1.3210548563419857e-07, "loss": 1.0245, "step": 780 }, { "epoch": 2.810445745159838, "grad_norm": 0.29442506294804527, "learning_rate": 1.2736563606711384e-07, "loss": 0.9917, "step": 781 }, { "epoch": 2.814047726249437, "grad_norm": 0.29244090698076763, "learning_rate": 1.2271128143913458e-07, "loss": 1.0409, "step": 782 }, { "epoch": 2.8176497073390365, "grad_norm": 0.26403315463361793, "learning_rate": 1.1814250341527611e-07, "loss": 1.0486, "step": 783 }, { "epoch": 2.8212516884286356, "grad_norm": 0.2723144354203953, "learning_rate": 1.136593821590326e-07, "loss": 1.0543, "step": 784 }, { "epoch": 2.824853669518235, "grad_norm": 0.2850711230800217, "learning_rate": 1.0926199633097156e-07, "loss": 1.0095, "step": 785 }, { "epoch": 2.828455650607834, "grad_norm": 0.2771326731497052, "learning_rate": 1.0495042308735104e-07, "loss": 1.0138, "step": 786 }, { "epoch": 2.8320576316974337, "grad_norm": 0.2667788079924329, "learning_rate": 1.007247380787657e-07, "loss": 1.0354, "step": 787 }, { "epoch": 2.835659612787033, "grad_norm": 0.27205262075900666, "learning_rate": 9.658501544882182e-08, "loss": 1.008, "step": 788 }, { "epoch": 2.839261593876632, "grad_norm": 0.278153414604589, "learning_rate": 9.253132783283548e-08, "loss": 1.0575, "step": 789 }, { "epoch": 2.8428635749662314, "grad_norm": 0.2722033011712004, "learning_rate": 8.856374635655696e-08, "loss": 1.0373, "step": 790 }, { "epoch": 2.846465556055831, "grad_norm": 0.2643963535331226, "learning_rate": 8.468234063492287e-08, "loss": 1.0331, "step": 791 }, { "epoch": 2.85006753714543, "grad_norm": 0.2772917134427986, "learning_rate": 8.088717877083706e-08, "loss": 0.9933, "step": 792 }, { "epoch": 2.853669518235029, "grad_norm": 0.2567788777229302, "learning_rate": 7.717832735397335e-08, "loss": 1.039, "step": 793 }, { "epoch": 2.8572714993246286, "grad_norm": 0.3279492936255398, "learning_rate": 7.355585145960743e-08, "loss": 1.066, "step": 794 }, { "epoch": 2.8608734804142277, "grad_norm": 0.26103624246805707, "learning_rate": 7.001981464747565e-08, "loss": 1.0502, "step": 795 }, { "epoch": 2.8644754615038273, "grad_norm": 0.28523237313847577, "learning_rate": 6.657027896065982e-08, "loss": 1.0071, "step": 796 }, { "epoch": 2.8680774425934263, "grad_norm": 0.2714272091000359, "learning_rate": 6.3207304924498e-08, "loss": 1.0614, "step": 797 }, { "epoch": 2.871679423683026, "grad_norm": 0.28016748152731064, "learning_rate": 5.993095154552431e-08, "loss": 1.0324, "step": 798 }, { "epoch": 2.875281404772625, "grad_norm": 0.28138701855091797, "learning_rate": 5.674127631043025e-08, "loss": 1.0357, "step": 799 }, { "epoch": 2.878883385862224, "grad_norm": 0.259827571346895, "learning_rate": 5.363833518505834e-08, "loss": 1.0203, "step": 800 }, { "epoch": 2.8824853669518236, "grad_norm": 0.30527969392963294, "learning_rate": 5.062218261342122e-08, "loss": 1.04, "step": 801 }, { "epoch": 2.8860873480414226, "grad_norm": 0.3336787676428948, "learning_rate": 4.769287151674407e-08, "loss": 1.0177, "step": 802 }, { "epoch": 2.889689329131022, "grad_norm": 0.2673565531562563, "learning_rate": 4.485045329253646e-08, "loss": 0.9941, "step": 803 }, { "epoch": 2.8932913102206212, "grad_norm": 0.27644924732915116, "learning_rate": 4.209497781369143e-08, "loss": 1.047, "step": 804 }, { "epoch": 2.8968932913102208, "grad_norm": 0.2762539625768617, "learning_rate": 3.9426493427611177e-08, "loss": 1.0618, "step": 805 }, { "epoch": 2.90049527239982, "grad_norm": 0.279438570935992, "learning_rate": 3.684504695535496e-08, "loss": 1.0407, "step": 806 }, { "epoch": 2.904097253489419, "grad_norm": 0.25093375155659065, "learning_rate": 3.435068369082306e-08, "loss": 1.0263, "step": 807 }, { "epoch": 2.9076992345790185, "grad_norm": 0.28318467682760556, "learning_rate": 3.194344739995803e-08, "loss": 1.0519, "step": 808 }, { "epoch": 2.911301215668618, "grad_norm": 0.2904491497208285, "learning_rate": 2.9623380319976912e-08, "loss": 1.0549, "step": 809 }, { "epoch": 2.914903196758217, "grad_norm": 0.2605165703364476, "learning_rate": 2.7390523158633552e-08, "loss": 1.0166, "step": 810 }, { "epoch": 2.918505177847816, "grad_norm": 0.27714338429064156, "learning_rate": 2.5244915093499134e-08, "loss": 1.0533, "step": 811 }, { "epoch": 2.9221071589374157, "grad_norm": 0.275239630057375, "learning_rate": 2.3186593771280518e-08, "loss": 1.0233, "step": 812 }, { "epoch": 2.9257091400270148, "grad_norm": 0.27858108381395574, "learning_rate": 2.1215595307154667e-08, "loss": 0.9939, "step": 813 }, { "epoch": 2.9293111211166143, "grad_norm": 0.2685380458847235, "learning_rate": 1.9331954284137476e-08, "loss": 1.0429, "step": 814 }, { "epoch": 2.9329131022062134, "grad_norm": 0.2721576649644654, "learning_rate": 1.753570375247815e-08, "loss": 1.0401, "step": 815 }, { "epoch": 2.936515083295813, "grad_norm": 0.25219694366538187, "learning_rate": 1.582687522907633e-08, "loss": 1.0321, "step": 816 }, { "epoch": 2.940117064385412, "grad_norm": 0.31225958064880055, "learning_rate": 1.4205498696930332e-08, "loss": 1.0564, "step": 817 }, { "epoch": 2.943719045475011, "grad_norm": 0.2907802700059505, "learning_rate": 1.2671602604612531e-08, "loss": 1.0549, "step": 818 }, { "epoch": 2.9473210265646106, "grad_norm": 0.2620211100717753, "learning_rate": 1.1225213865767026e-08, "loss": 1.0135, "step": 819 }, { "epoch": 2.9509230076542097, "grad_norm": 0.2662621532694749, "learning_rate": 9.866357858642206e-09, "loss": 1.0217, "step": 820 }, { "epoch": 2.954524988743809, "grad_norm": 0.26026540743415805, "learning_rate": 8.595058425640012e-09, "loss": 1.0242, "step": 821 }, { "epoch": 2.9581269698334083, "grad_norm": 0.27157295590276637, "learning_rate": 7.411337872900715e-09, "loss": 0.9976, "step": 822 }, { "epoch": 2.961728950923008, "grad_norm": 0.2810030913242284, "learning_rate": 6.315216969912663e-09, "loss": 1.0426, "step": 823 }, { "epoch": 2.965330932012607, "grad_norm": 0.27067627376772047, "learning_rate": 5.306714949143699e-09, "loss": 1.0691, "step": 824 }, { "epoch": 2.968932913102206, "grad_norm": 0.26312224124888717, "learning_rate": 4.385849505708084e-09, "loss": 1.0305, "step": 825 }, { "epoch": 2.9725348941918055, "grad_norm": 0.28983516142663135, "learning_rate": 3.5526367970539765e-09, "loss": 1.0732, "step": 826 }, { "epoch": 2.976136875281405, "grad_norm": 0.2824286593101622, "learning_rate": 2.8070914426786555e-09, "loss": 1.0435, "step": 827 }, { "epoch": 2.979738856371004, "grad_norm": 0.26087637492866167, "learning_rate": 2.149226523874837e-09, "loss": 1.0591, "step": 828 }, { "epoch": 2.983340837460603, "grad_norm": 0.2909906050244136, "learning_rate": 1.5790535835003006e-09, "loss": 1.0739, "step": 829 }, { "epoch": 2.9869428185502027, "grad_norm": 0.26693864211195434, "learning_rate": 1.096582625772502e-09, "loss": 1.037, "step": 830 }, { "epoch": 2.990544799639802, "grad_norm": 0.28489493631726986, "learning_rate": 7.018221160981498e-10, "loss": 1.0347, "step": 831 }, { "epoch": 2.9941467807294013, "grad_norm": 0.2886886682421759, "learning_rate": 3.9477898091944135e-10, "loss": 1.0298, "step": 832 }, { "epoch": 2.9977487618190004, "grad_norm": 0.28032318013340757, "learning_rate": 1.7545860759693446e-10, "loss": 0.9798, "step": 833 }, { "epoch": 3.0, "grad_norm": 0.28032318013340757, "learning_rate": 4.3864844311847235e-11, "loss": 1.0169, "step": 834 } ], "logging_steps": 1, "max_steps": 834, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1614971456716800.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }