{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 714, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.049019607843137254, "grad_norm": 14.845285962034723, "learning_rate": 2.222222222222222e-06, "loss": 0.517, "loss_nan_ranks": 0, "loss_rank_avg": 0.18225759267807007, "step": 5, "valid_targets_mean": 4948.1, "valid_targets_min": 1347 }, { "epoch": 0.09803921568627451, "grad_norm": 7.464976124277118, "learning_rate": 5e-06, "loss": 0.4481, "loss_nan_ranks": 0, "loss_rank_avg": 0.1431998908519745, "step": 10, "valid_targets_mean": 5591.3, "valid_targets_min": 680 }, { "epoch": 0.14705882352941177, "grad_norm": 1.813784772450264, "learning_rate": 7.77777777777778e-06, "loss": 0.3912, "loss_nan_ranks": 0, "loss_rank_avg": 0.13079722225666046, "step": 15, "valid_targets_mean": 5941.9, "valid_targets_min": 267 }, { "epoch": 0.19607843137254902, "grad_norm": 0.9910305112535406, "learning_rate": 1.0555555555555557e-05, "loss": 0.3384, "loss_nan_ranks": 0, "loss_rank_avg": 0.11258930712938309, "step": 20, "valid_targets_mean": 6313.4, "valid_targets_min": 2143 }, { "epoch": 0.24509803921568626, "grad_norm": 0.7992459746477515, "learning_rate": 1.3333333333333333e-05, "loss": 0.3123, "loss_nan_ranks": 0, "loss_rank_avg": 0.10164578258991241, "step": 25, "valid_targets_mean": 5665.3, "valid_targets_min": 2015 }, { "epoch": 0.29411764705882354, "grad_norm": 0.47317240306192654, "learning_rate": 1.6111111111111115e-05, "loss": 0.2899, "loss_nan_ranks": 0, "loss_rank_avg": 0.0981433242559433, "step": 30, "valid_targets_mean": 5768.9, "valid_targets_min": 2274 }, { "epoch": 0.3431372549019608, "grad_norm": 0.3462324350621174, "learning_rate": 1.888888888888889e-05, "loss": 0.2846, "loss_nan_ranks": 0, "loss_rank_avg": 0.09364812076091766, "step": 35, "valid_targets_mean": 5493.6, "valid_targets_min": 2334 }, { "epoch": 0.39215686274509803, "grad_norm": 0.3775315605954148, "learning_rate": 2.1666666666666667e-05, "loss": 0.259, "loss_nan_ranks": 0, "loss_rank_avg": 0.07151000946760178, "step": 40, "valid_targets_mean": 5538.6, "valid_targets_min": 489 }, { "epoch": 0.4411764705882353, "grad_norm": 0.23962790083936536, "learning_rate": 2.444444444444445e-05, "loss": 0.2354, "loss_nan_ranks": 0, "loss_rank_avg": 0.07168256491422653, "step": 45, "valid_targets_mean": 6081.1, "valid_targets_min": 977 }, { "epoch": 0.49019607843137253, "grad_norm": 0.23549334781671427, "learning_rate": 2.7222222222222226e-05, "loss": 0.2383, "loss_nan_ranks": 0, "loss_rank_avg": 0.09021559357643127, "step": 50, "valid_targets_mean": 5551.8, "valid_targets_min": 983 }, { "epoch": 0.5392156862745098, "grad_norm": 0.2061785031390588, "learning_rate": 3.0000000000000004e-05, "loss": 0.2381, "loss_nan_ranks": 0, "loss_rank_avg": 0.07527650892734528, "step": 55, "valid_targets_mean": 6129.4, "valid_targets_min": 780 }, { "epoch": 0.5882352941176471, "grad_norm": 0.18844424836907367, "learning_rate": 3.277777777777778e-05, "loss": 0.2054, "loss_nan_ranks": 0, "loss_rank_avg": 0.07372777163982391, "step": 60, "valid_targets_mean": 6069.8, "valid_targets_min": 256 }, { "epoch": 0.6372549019607843, "grad_norm": 0.17246412617765505, "learning_rate": 3.555555555555555e-05, "loss": 0.2053, "loss_nan_ranks": 0, "loss_rank_avg": 0.06430137157440186, "step": 65, "valid_targets_mean": 7087.8, "valid_targets_min": 400 }, { "epoch": 0.6862745098039216, "grad_norm": 0.19460946066944204, "learning_rate": 3.833333333333334e-05, "loss": 0.2089, "loss_nan_ranks": 0, "loss_rank_avg": 0.09138505160808563, "step": 70, "valid_targets_mean": 6163.3, "valid_targets_min": 1620 }, { "epoch": 0.7352941176470589, "grad_norm": 0.19479248546898312, "learning_rate": 3.9999042174899045e-05, "loss": 0.1986, "loss_nan_ranks": 0, "loss_rank_avg": 0.06743113696575165, "step": 75, "valid_targets_mean": 5578.7, "valid_targets_min": 598 }, { "epoch": 0.7843137254901961, "grad_norm": 0.2114771866057741, "learning_rate": 3.998826769609533e-05, "loss": 0.1951, "loss_nan_ranks": 0, "loss_rank_avg": 0.060611382126808167, "step": 80, "valid_targets_mean": 6215.6, "valid_targets_min": 1322 }, { "epoch": 0.8333333333333334, "grad_norm": 0.1910136360390071, "learning_rate": 3.996552792838517e-05, "loss": 0.191, "loss_nan_ranks": 0, "loss_rank_avg": 0.06359056383371353, "step": 85, "valid_targets_mean": 5589.2, "valid_targets_min": 298 }, { "epoch": 0.8823529411764706, "grad_norm": 0.19138505864440583, "learning_rate": 3.993083648414832e-05, "loss": 0.1949, "loss_nan_ranks": 0, "loss_rank_avg": 0.07189453393220901, "step": 90, "valid_targets_mean": 5569.2, "valid_targets_min": 751 }, { "epoch": 0.9313725490196079, "grad_norm": 0.1811083652569276, "learning_rate": 3.988421413022457e-05, "loss": 0.1702, "loss_nan_ranks": 0, "loss_rank_avg": 0.05381467193365097, "step": 95, "valid_targets_mean": 6246.1, "valid_targets_min": 858 }, { "epoch": 0.9803921568627451, "grad_norm": 0.15647945413369116, "learning_rate": 3.982568877548239e-05, "loss": 0.1782, "loss_nan_ranks": 0, "loss_rank_avg": 0.051519643515348434, "step": 100, "valid_targets_mean": 6077.8, "valid_targets_min": 367 }, { "epoch": 1.0294117647058822, "grad_norm": 0.20486438301365698, "learning_rate": 3.975529545411226e-05, "loss": 0.1896, "loss_nan_ranks": 0, "loss_rank_avg": 0.05946924537420273, "step": 105, "valid_targets_mean": 5859.4, "valid_targets_min": 1382 }, { "epoch": 1.0784313725490196, "grad_norm": 0.17500723183778927, "learning_rate": 3.967307630465466e-05, "loss": 0.1748, "loss_nan_ranks": 0, "loss_rank_avg": 0.04872163012623787, "step": 110, "valid_targets_mean": 5285.2, "valid_targets_min": 1213 }, { "epoch": 1.1274509803921569, "grad_norm": 0.17177261390298737, "learning_rate": 3.957908054477526e-05, "loss": 0.1658, "loss_nan_ranks": 0, "loss_rank_avg": 0.053148724138736725, "step": 115, "valid_targets_mean": 6222.8, "valid_targets_min": 1926 }, { "epoch": 1.1764705882352942, "grad_norm": 0.18155686316968658, "learning_rate": 3.9473364441802474e-05, "loss": 0.1772, "loss_nan_ranks": 0, "loss_rank_avg": 0.05831097811460495, "step": 120, "valid_targets_mean": 5672.5, "valid_targets_min": 1670 }, { "epoch": 1.2254901960784315, "grad_norm": 0.20691680495712458, "learning_rate": 3.9355991279044965e-05, "loss": 0.1645, "loss_nan_ranks": 0, "loss_rank_avg": 0.06711968034505844, "step": 125, "valid_targets_mean": 5509.9, "valid_targets_min": 323 }, { "epoch": 1.2745098039215685, "grad_norm": 0.16953565767196252, "learning_rate": 3.922703131790925e-05, "loss": 0.1641, "loss_nan_ranks": 0, "loss_rank_avg": 0.0471009723842144, "step": 130, "valid_targets_mean": 5594.2, "valid_targets_min": 2155 }, { "epoch": 1.3235294117647058, "grad_norm": 0.19779493712452909, "learning_rate": 3.9086561755840146e-05, "loss": 0.161, "loss_nan_ranks": 0, "loss_rank_avg": 0.05489179864525795, "step": 135, "valid_targets_mean": 4934.2, "valid_targets_min": 654 }, { "epoch": 1.3725490196078431, "grad_norm": 0.20239218498545397, "learning_rate": 3.893466668010915e-05, "loss": 0.1673, "loss_nan_ranks": 0, "loss_rank_avg": 0.06282714009284973, "step": 140, "valid_targets_mean": 6535.7, "valid_targets_min": 1687 }, { "epoch": 1.4215686274509804, "grad_norm": 0.17377123597737723, "learning_rate": 3.8771437017478526e-05, "loss": 0.1668, "loss_nan_ranks": 0, "loss_rank_avg": 0.046998970210552216, "step": 145, "valid_targets_mean": 5463.5, "valid_targets_min": 390 }, { "epoch": 1.4705882352941178, "grad_norm": 0.17062719138426188, "learning_rate": 3.859697047977108e-05, "loss": 0.1635, "loss_nan_ranks": 0, "loss_rank_avg": 0.05450977012515068, "step": 150, "valid_targets_mean": 5535.4, "valid_targets_min": 804 }, { "epoch": 1.5196078431372548, "grad_norm": 0.19369358984546386, "learning_rate": 3.8411371505378356e-05, "loss": 0.1614, "loss_nan_ranks": 0, "loss_rank_avg": 0.05806870013475418, "step": 155, "valid_targets_mean": 6097.0, "valid_targets_min": 915 }, { "epoch": 1.5686274509803921, "grad_norm": 0.17606741825762348, "learning_rate": 3.8214751196742224e-05, "loss": 0.1763, "loss_nan_ranks": 0, "loss_rank_avg": 0.049261607229709625, "step": 160, "valid_targets_mean": 5752.1, "valid_targets_min": 578 }, { "epoch": 1.6176470588235294, "grad_norm": 0.2117864198467642, "learning_rate": 3.800722725384716e-05, "loss": 0.1549, "loss_nan_ranks": 0, "loss_rank_avg": 0.05296482890844345, "step": 165, "valid_targets_mean": 5688.4, "valid_targets_min": 751 }, { "epoch": 1.6666666666666665, "grad_norm": 0.19606553816396718, "learning_rate": 3.778892390376323e-05, "loss": 0.1689, "loss_nan_ranks": 0, "loss_rank_avg": 0.06805002689361572, "step": 170, "valid_targets_mean": 5743.2, "valid_targets_min": 2992 }, { "epoch": 1.715686274509804, "grad_norm": 0.2579865850635183, "learning_rate": 3.755997182628185e-05, "loss": 0.1671, "loss_nan_ranks": 0, "loss_rank_avg": 0.056646402925252914, "step": 175, "valid_targets_mean": 6171.2, "valid_targets_min": 227 }, { "epoch": 1.7647058823529411, "grad_norm": 0.15784964961757458, "learning_rate": 3.732050807568878e-05, "loss": 0.1625, "loss_nan_ranks": 0, "loss_rank_avg": 0.047502197325229645, "step": 180, "valid_targets_mean": 5991.4, "valid_targets_min": 1963 }, { "epoch": 1.8137254901960784, "grad_norm": 0.20767144079577776, "learning_rate": 3.707067599872131e-05, "loss": 0.1665, "loss_nan_ranks": 0, "loss_rank_avg": 0.06044068560004234, "step": 185, "valid_targets_mean": 5098.8, "valid_targets_min": 781 }, { "epoch": 1.8627450980392157, "grad_norm": 0.1769277041382716, "learning_rate": 3.681062514875868e-05, "loss": 0.1603, "loss_nan_ranks": 0, "loss_rank_avg": 0.06034180149435997, "step": 190, "valid_targets_mean": 5518.4, "valid_targets_min": 297 }, { "epoch": 1.9117647058823528, "grad_norm": 0.1873324059846003, "learning_rate": 3.6540511196297084e-05, "loss": 0.1657, "loss_nan_ranks": 0, "loss_rank_avg": 0.05551169440150261, "step": 195, "valid_targets_mean": 4605.0, "valid_targets_min": 1450 }, { "epoch": 1.9607843137254903, "grad_norm": 0.18605503901153028, "learning_rate": 3.6260495835762865e-05, "loss": 0.1581, "loss_nan_ranks": 0, "loss_rank_avg": 0.04336617887020111, "step": 200, "valid_targets_mean": 4599.4, "valid_targets_min": 735 }, { "epoch": 2.0098039215686274, "grad_norm": 0.185744348058967, "learning_rate": 3.597074668871972e-05, "loss": 0.1637, "loss_nan_ranks": 0, "loss_rank_avg": 0.051525089889764786, "step": 205, "valid_targets_mean": 6674.3, "valid_targets_min": 420 }, { "epoch": 2.0588235294117645, "grad_norm": 0.1852768943197571, "learning_rate": 3.567143720352786e-05, "loss": 0.1538, "loss_nan_ranks": 0, "loss_rank_avg": 0.03533821552991867, "step": 210, "valid_targets_mean": 5553.1, "valid_targets_min": 343 }, { "epoch": 2.107843137254902, "grad_norm": 0.1833870940799021, "learning_rate": 3.536274655151502e-05, "loss": 0.1526, "loss_nan_ranks": 0, "loss_rank_avg": 0.04637109115719795, "step": 215, "valid_targets_mean": 5456.4, "valid_targets_min": 894 }, { "epoch": 2.156862745098039, "grad_norm": 0.15680028488123737, "learning_rate": 3.504485951972181e-05, "loss": 0.1472, "loss_nan_ranks": 0, "loss_rank_avg": 0.03762596845626831, "step": 220, "valid_targets_mean": 5934.5, "valid_targets_min": 1149 }, { "epoch": 2.2058823529411766, "grad_norm": 0.17181083479283146, "learning_rate": 3.4717966400285215e-05, "loss": 0.1542, "loss_nan_ranks": 0, "loss_rank_avg": 0.039286620914936066, "step": 225, "valid_targets_mean": 6930.2, "valid_targets_min": 1228 }, { "epoch": 2.2549019607843137, "grad_norm": 0.19508797620673862, "learning_rate": 3.4382262876526845e-05, "loss": 0.1564, "loss_nan_ranks": 0, "loss_rank_avg": 0.06591647118330002, "step": 230, "valid_targets_mean": 6494.8, "valid_targets_min": 2458 }, { "epoch": 2.303921568627451, "grad_norm": 0.19945856709801285, "learning_rate": 3.403794990581377e-05, "loss": 0.16, "loss_nan_ranks": 0, "loss_rank_avg": 0.05911063775420189, "step": 235, "valid_targets_mean": 5858.9, "valid_targets_min": 1700 }, { "epoch": 2.3529411764705883, "grad_norm": 0.20200795155761367, "learning_rate": 3.368523359926234e-05, "loss": 0.1492, "loss_nan_ranks": 0, "loss_rank_avg": 0.05478104576468468, "step": 240, "valid_targets_mean": 5153.1, "valid_targets_min": 2065 }, { "epoch": 2.4019607843137254, "grad_norm": 0.19519853478938234, "learning_rate": 3.332432509835687e-05, "loss": 0.1549, "loss_nan_ranks": 0, "loss_rank_avg": 0.04512668773531914, "step": 245, "valid_targets_mean": 5347.1, "valid_targets_min": 2068 }, { "epoch": 2.450980392156863, "grad_norm": 0.18550082917348132, "learning_rate": 3.2955440448556986e-05, "loss": 0.1548, "loss_nan_ranks": 0, "loss_rank_avg": 0.05476547032594681, "step": 250, "valid_targets_mean": 5342.8, "valid_targets_min": 527 }, { "epoch": 2.5, "grad_norm": 0.17389768733214, "learning_rate": 3.257880046996954e-05, "loss": 0.1482, "loss_nan_ranks": 0, "loss_rank_avg": 0.044103942811489105, "step": 255, "valid_targets_mean": 5376.1, "valid_targets_min": 188 }, { "epoch": 2.549019607843137, "grad_norm": 0.18960557879045112, "learning_rate": 3.219463062516218e-05, "loss": 0.1472, "loss_nan_ranks": 0, "loss_rank_avg": 0.044259026646614075, "step": 260, "valid_targets_mean": 6247.7, "valid_targets_min": 120 }, { "epoch": 2.5980392156862746, "grad_norm": 0.18057320660204854, "learning_rate": 3.180316088419794e-05, "loss": 0.1504, "loss_nan_ranks": 0, "loss_rank_avg": 0.04895833134651184, "step": 265, "valid_targets_mean": 5481.7, "valid_targets_min": 682 }, { "epoch": 2.6470588235294117, "grad_norm": 0.16850152487348163, "learning_rate": 3.14046255869716e-05, "loss": 0.153, "loss_nan_ranks": 0, "loss_rank_avg": 0.05481772869825363, "step": 270, "valid_targets_mean": 5729.1, "valid_targets_min": 1495 }, { "epoch": 2.696078431372549, "grad_norm": 0.1972977593451679, "learning_rate": 3.099926330293017e-05, "loss": 0.1654, "loss_nan_ranks": 0, "loss_rank_avg": 0.0604429729282856, "step": 275, "valid_targets_mean": 4838.8, "valid_targets_min": 1164 }, { "epoch": 2.7450980392156863, "grad_norm": 0.1779532546747506, "learning_rate": 3.058731668826147e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.0506039559841156, "step": 280, "valid_targets_mean": 5173.8, "valid_targets_min": 966 }, { "epoch": 2.7941176470588234, "grad_norm": 0.16859117358751705, "learning_rate": 3.0169032340636363e-05, "loss": 0.1562, "loss_nan_ranks": 0, "loss_rank_avg": 0.06492990255355835, "step": 285, "valid_targets_mean": 6798.8, "valid_targets_min": 2298 }, { "epoch": 2.843137254901961, "grad_norm": 0.17047341359032425, "learning_rate": 2.9744660651591544e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.04040013998746872, "step": 290, "valid_targets_mean": 5656.5, "valid_targets_min": 1994 }, { "epoch": 2.892156862745098, "grad_norm": 0.1691917059677243, "learning_rate": 2.9314455656641275e-05, "loss": 0.1442, "loss_nan_ranks": 0, "loss_rank_avg": 0.05659858137369156, "step": 295, "valid_targets_mean": 6168.2, "valid_targets_min": 255 }, { "epoch": 2.9411764705882355, "grad_norm": 0.18686162719495658, "learning_rate": 2.8878674883207726e-05, "loss": 0.1525, "loss_nan_ranks": 0, "loss_rank_avg": 0.05041201412677765, "step": 300, "valid_targets_mean": 6669.2, "valid_targets_min": 1463 }, { "epoch": 2.9901960784313726, "grad_norm": 0.17069829336335143, "learning_rate": 2.8437579196461072e-05, "loss": 0.1396, "loss_nan_ranks": 0, "loss_rank_avg": 0.044972144067287445, "step": 305, "valid_targets_mean": 6637.5, "valid_targets_min": 969 }, { "epoch": 3.0392156862745097, "grad_norm": 0.15999237456048723, "learning_rate": 2.799143264316152e-05, "loss": 0.148, "loss_nan_ranks": 0, "loss_rank_avg": 0.0373958945274353, "step": 310, "valid_targets_mean": 5989.3, "valid_targets_min": 337 }, { "epoch": 3.088235294117647, "grad_norm": 0.1592160126728059, "learning_rate": 2.7540502293596802e-05, "loss": 0.1396, "loss_nan_ranks": 0, "loss_rank_avg": 0.04034284129738808, "step": 315, "valid_targets_mean": 5127.5, "valid_targets_min": 695 }, { "epoch": 3.1372549019607843, "grad_norm": 0.1769980777310915, "learning_rate": 2.708505808170973e-05, "loss": 0.1537, "loss_nan_ranks": 0, "loss_rank_avg": 0.044640615582466125, "step": 320, "valid_targets_mean": 5812.9, "valid_targets_min": 277 }, { "epoch": 3.186274509803922, "grad_norm": 0.15257116830142609, "learning_rate": 2.662537264351152e-05, "loss": 0.1404, "loss_nan_ranks": 0, "loss_rank_avg": 0.05242038518190384, "step": 325, "valid_targets_mean": 5841.9, "valid_targets_min": 983 }, { "epoch": 3.235294117647059, "grad_norm": 0.16404975625042978, "learning_rate": 2.6161721153877658e-05, "loss": 0.1519, "loss_nan_ranks": 0, "loss_rank_avg": 0.048479050397872925, "step": 330, "valid_targets_mean": 5749.8, "valid_targets_min": 349 }, { "epoch": 3.284313725490196, "grad_norm": 0.16196200510111014, "learning_rate": 2.5694381161823883e-05, "loss": 0.1393, "loss_nan_ranks": 0, "loss_rank_avg": 0.05674157291650772, "step": 335, "valid_targets_mean": 6188.4, "valid_targets_min": 2150 }, { "epoch": 3.3333333333333335, "grad_norm": 0.1715568446927489, "learning_rate": 2.522363242436102e-05, "loss": 0.1461, "loss_nan_ranks": 0, "loss_rank_avg": 0.0405210517346859, "step": 340, "valid_targets_mean": 5543.5, "valid_targets_min": 220 }, { "epoch": 3.3823529411764706, "grad_norm": 0.18625399468743448, "learning_rate": 2.47497567390281e-05, "loss": 0.1581, "loss_nan_ranks": 0, "loss_rank_avg": 0.05725814402103424, "step": 345, "valid_targets_mean": 6666.8, "valid_targets_min": 1783 }, { "epoch": 3.431372549019608, "grad_norm": 0.16710741980403304, "learning_rate": 2.4273037775203924e-05, "loss": 0.1434, "loss_nan_ranks": 0, "loss_rank_avg": 0.047255054116249084, "step": 350, "valid_targets_mean": 5879.6, "valid_targets_min": 2429 }, { "epoch": 3.480392156862745, "grad_norm": 0.18714960345056877, "learning_rate": 2.3793760904298154e-05, "loss": 0.1444, "loss_nan_ranks": 0, "loss_rank_avg": 0.04799136519432068, "step": 355, "valid_targets_mean": 5505.2, "valid_targets_min": 178 }, { "epoch": 3.5294117647058822, "grad_norm": 0.195485882837898, "learning_rate": 2.3312213028923572e-05, "loss": 0.1458, "loss_nan_ranks": 0, "loss_rank_avg": 0.05502048134803772, "step": 360, "valid_targets_mean": 6203.2, "valid_targets_min": 915 }, { "epoch": 3.5784313725490198, "grad_norm": 0.1977460248140161, "learning_rate": 2.2828682411151634e-05, "loss": 0.142, "loss_nan_ranks": 0, "loss_rank_avg": 0.04789289832115173, "step": 365, "valid_targets_mean": 5283.4, "valid_targets_min": 1751 }, { "epoch": 3.627450980392157, "grad_norm": 0.17712548108134565, "learning_rate": 2.2343458499954342e-05, "loss": 0.1438, "loss_nan_ranks": 0, "loss_rank_avg": 0.05372612178325653, "step": 370, "valid_targets_mean": 5116.2, "valid_targets_min": 745 }, { "epoch": 3.6764705882352944, "grad_norm": 0.17787145700828857, "learning_rate": 2.1856831757935563e-05, "loss": 0.1459, "loss_nan_ranks": 0, "loss_rank_avg": 0.052830249071121216, "step": 375, "valid_targets_mean": 6221.8, "valid_targets_min": 489 }, { "epoch": 3.7254901960784315, "grad_norm": 0.1403375945900038, "learning_rate": 2.136909348745558e-05, "loss": 0.1445, "loss_nan_ranks": 0, "loss_rank_avg": 0.05467274412512779, "step": 380, "valid_targets_mean": 7338.7, "valid_targets_min": 267 }, { "epoch": 3.7745098039215685, "grad_norm": 0.17890476249652149, "learning_rate": 2.0880535656252955e-05, "loss": 0.1559, "loss_nan_ranks": 0, "loss_rank_avg": 0.04823530465364456, "step": 385, "valid_targets_mean": 5682.3, "valid_targets_min": 1113 }, { "epoch": 3.8235294117647056, "grad_norm": 0.1675345935224821, "learning_rate": 2.0391450722668096e-05, "loss": 0.1422, "loss_nan_ranks": 0, "loss_rank_avg": 0.04629891738295555, "step": 390, "valid_targets_mean": 5964.7, "valid_targets_min": 906 }, { "epoch": 3.872549019607843, "grad_norm": 0.16849306980232315, "learning_rate": 1.9902131460573106e-05, "loss": 0.1412, "loss_nan_ranks": 0, "loss_rank_avg": 0.045284684747457504, "step": 395, "valid_targets_mean": 4922.6, "valid_targets_min": 233 }, { "epoch": 3.9215686274509802, "grad_norm": 0.1684953006314063, "learning_rate": 1.941287078411279e-05, "loss": 0.1419, "loss_nan_ranks": 0, "loss_rank_avg": 0.053950127214193344, "step": 400, "valid_targets_mean": 6029.8, "valid_targets_min": 1074 }, { "epoch": 3.9705882352941178, "grad_norm": 0.17222377503381056, "learning_rate": 1.8923961572361688e-05, "loss": 0.1448, "loss_nan_ranks": 0, "loss_rank_avg": 0.046407949179410934, "step": 405, "valid_targets_mean": 6019.1, "valid_targets_min": 293 }, { "epoch": 4.019607843137255, "grad_norm": 0.15984187499464808, "learning_rate": 1.8435696494002076e-05, "loss": 0.1359, "loss_nan_ranks": 0, "loss_rank_avg": 0.04338710010051727, "step": 410, "valid_targets_mean": 5504.8, "valid_targets_min": 321 }, { "epoch": 4.068627450980392, "grad_norm": 0.16765151368961928, "learning_rate": 1.7948367832127934e-05, "loss": 0.148, "loss_nan_ranks": 0, "loss_rank_avg": 0.051340095698833466, "step": 415, "valid_targets_mean": 6667.2, "valid_targets_min": 1060 }, { "epoch": 4.117647058823529, "grad_norm": 0.15792491122980973, "learning_rate": 1.7462267309279722e-05, "loss": 0.1436, "loss_nan_ranks": 0, "loss_rank_avg": 0.05365137755870819, "step": 420, "valid_targets_mean": 6270.2, "valid_targets_min": 1126 }, { "epoch": 4.166666666666667, "grad_norm": 0.1692968124966608, "learning_rate": 1.6977685912814723e-05, "loss": 0.1386, "loss_nan_ranks": 0, "loss_rank_avg": 0.04327564314007759, "step": 425, "valid_targets_mean": 4881.8, "valid_targets_min": 280 }, { "epoch": 4.215686274509804, "grad_norm": 0.16802626120360126, "learning_rate": 1.649491372071745e-05, "loss": 0.1348, "loss_nan_ranks": 0, "loss_rank_avg": 0.04639950394630432, "step": 430, "valid_targets_mean": 6071.3, "valid_targets_min": 1608 }, { "epoch": 4.264705882352941, "grad_norm": 0.1739251489205305, "learning_rate": 1.601423972795448e-05, "loss": 0.1466, "loss_nan_ranks": 0, "loss_rank_avg": 0.04997720196843147, "step": 435, "valid_targets_mean": 5071.2, "valid_targets_min": 1901 }, { "epoch": 4.313725490196078, "grad_norm": 0.16846274854669305, "learning_rate": 1.5535951673477493e-05, "loss": 0.1443, "loss_nan_ranks": 0, "loss_rank_avg": 0.036386433988809586, "step": 440, "valid_targets_mean": 6193.9, "valid_targets_min": 804 }, { "epoch": 4.362745098039215, "grad_norm": 0.17845219785128927, "learning_rate": 1.5060335867978322e-05, "loss": 0.1491, "loss_nan_ranks": 0, "loss_rank_avg": 0.049667488783597946, "step": 445, "valid_targets_mean": 5358.5, "valid_targets_min": 906 }, { "epoch": 4.411764705882353, "grad_norm": 0.16638712672136335, "learning_rate": 1.4587677022498845e-05, "loss": 0.1492, "loss_nan_ranks": 0, "loss_rank_avg": 0.04850779101252556, "step": 450, "valid_targets_mean": 6082.3, "valid_targets_min": 1213 }, { "epoch": 4.46078431372549, "grad_norm": 0.1509181659878023, "learning_rate": 1.4118258077998563e-05, "loss": 0.1347, "loss_nan_ranks": 0, "loss_rank_avg": 0.037120066583156586, "step": 455, "valid_targets_mean": 7179.8, "valid_targets_min": 1754 }, { "epoch": 4.509803921568627, "grad_norm": 0.17919038805552429, "learning_rate": 1.3652360035981657e-05, "loss": 0.1352, "loss_nan_ranks": 0, "loss_rank_avg": 0.04360215738415718, "step": 460, "valid_targets_mean": 5704.8, "valid_targets_min": 1074 }, { "epoch": 4.5588235294117645, "grad_norm": 0.17444385735623882, "learning_rate": 1.3190261790285202e-05, "loss": 0.1537, "loss_nan_ranks": 0, "loss_rank_avg": 0.046840980648994446, "step": 465, "valid_targets_mean": 6333.6, "valid_targets_min": 990 }, { "epoch": 4.607843137254902, "grad_norm": 0.17400376107735924, "learning_rate": 1.2732239960128854e-05, "loss": 0.1403, "loss_nan_ranks": 0, "loss_rank_avg": 0.04668301343917847, "step": 470, "valid_targets_mean": 6577.3, "valid_targets_min": 804 }, { "epoch": 4.6568627450980395, "grad_norm": 0.1642690410502086, "learning_rate": 1.227856872452637e-05, "loss": 0.1363, "loss_nan_ranks": 0, "loss_rank_avg": 0.04167339950799942, "step": 475, "valid_targets_mean": 6613.6, "valid_targets_min": 1274 }, { "epoch": 4.705882352941177, "grad_norm": 0.1665608183925791, "learning_rate": 1.1829519658157706e-05, "loss": 0.1349, "loss_nan_ranks": 0, "loss_rank_avg": 0.04697566106915474, "step": 480, "valid_targets_mean": 5399.3, "valid_targets_min": 343 }, { "epoch": 4.754901960784314, "grad_norm": 0.15109027745291384, "learning_rate": 1.1385361568800205e-05, "loss": 0.1409, "loss_nan_ranks": 0, "loss_rank_avg": 0.04550229385495186, "step": 485, "valid_targets_mean": 7463.3, "valid_targets_min": 2613 }, { "epoch": 4.803921568627451, "grad_norm": 0.16283398624058726, "learning_rate": 1.0946360336416041e-05, "loss": 0.145, "loss_nan_ranks": 0, "loss_rank_avg": 0.04752221703529358, "step": 490, "valid_targets_mean": 5754.3, "valid_targets_min": 1679 }, { "epoch": 4.852941176470588, "grad_norm": 0.15690476742993395, "learning_rate": 1.0512778753992384e-05, "loss": 0.1391, "loss_nan_ranks": 0, "loss_rank_avg": 0.050478532910346985, "step": 495, "valid_targets_mean": 6343.4, "valid_targets_min": 1164 }, { "epoch": 4.901960784313726, "grad_norm": 0.1547493986508796, "learning_rate": 1.0084876370229346e-05, "loss": 0.1361, "loss_nan_ranks": 0, "loss_rank_avg": 0.037191685289144516, "step": 500, "valid_targets_mean": 5394.4, "valid_targets_min": 2151 }, { "epoch": 4.950980392156863, "grad_norm": 0.1546433674602096, "learning_rate": 9.662909334170119e-06, "loss": 0.1398, "loss_nan_ranks": 0, "loss_rank_avg": 0.0633089691400528, "step": 505, "valid_targets_mean": 6511.4, "valid_targets_min": 2728 }, { "epoch": 5.0, "grad_norm": 0.14864264452615442, "learning_rate": 9.247130241866162e-06, "loss": 0.1293, "loss_nan_ranks": 0, "loss_rank_avg": 0.0435556136071682, "step": 510, "valid_targets_mean": 6029.2, "valid_targets_min": 826 }, { "epoch": 5.049019607843137, "grad_norm": 0.17351132595970922, "learning_rate": 8.837787985169248e-06, "loss": 0.1389, "loss_nan_ranks": 0, "loss_rank_avg": 0.04400845244526863, "step": 515, "valid_targets_mean": 4938.3, "valid_targets_min": 578 }, { "epoch": 5.098039215686274, "grad_norm": 0.16525532968859072, "learning_rate": 8.435127602740931e-06, "loss": 0.136, "loss_nan_ranks": 0, "loss_rank_avg": 0.04021601378917694, "step": 520, "valid_targets_mean": 4886.9, "valid_targets_min": 245 }, { "epoch": 5.147058823529412, "grad_norm": 0.17565072755854125, "learning_rate": 8.03939013336857e-06, "loss": 0.1486, "loss_nan_ranks": 0, "loss_rank_avg": 0.0686882734298706, "step": 525, "valid_targets_mean": 6319.9, "valid_targets_min": 2372 }, { "epoch": 5.196078431372549, "grad_norm": 0.16546346883164598, "learning_rate": 7.650812471675752e-06, "loss": 0.1418, "loss_nan_ranks": 0, "loss_rank_avg": 0.048375021666288376, "step": 530, "valid_targets_mean": 5439.3, "valid_targets_min": 401 }, { "epoch": 5.245098039215686, "grad_norm": 0.16730821511271163, "learning_rate": 7.269627226313507e-06, "loss": 0.1373, "loss_nan_ranks": 0, "loss_rank_avg": 0.050091587007045746, "step": 535, "valid_targets_mean": 5765.1, "valid_targets_min": 942 }, { "epoch": 5.294117647058823, "grad_norm": 0.15886005158900465, "learning_rate": 6.896062580717056e-06, "loss": 0.1333, "loss_nan_ranks": 0, "loss_rank_avg": 0.052998557686805725, "step": 540, "valid_targets_mean": 5913.3, "valid_targets_min": 1181 }, { "epoch": 5.3431372549019605, "grad_norm": 0.18046601157388797, "learning_rate": 6.5303421565117595e-06, "loss": 0.1359, "loss_nan_ranks": 0, "loss_rank_avg": 0.04307221621274948, "step": 545, "valid_targets_mean": 4173.9, "valid_targets_min": 651 }, { "epoch": 5.392156862745098, "grad_norm": 0.14559092733051696, "learning_rate": 6.172684879649613e-06, "loss": 0.1284, "loss_nan_ranks": 0, "loss_rank_avg": 0.04021957516670227, "step": 550, "valid_targets_mean": 5618.3, "valid_targets_min": 386 }, { "epoch": 5.4411764705882355, "grad_norm": 0.15063235370677366, "learning_rate": 5.82330484935685e-06, "loss": 0.1361, "loss_nan_ranks": 0, "loss_rank_avg": 0.05188259109854698, "step": 555, "valid_targets_mean": 6229.9, "valid_targets_min": 2368 }, { "epoch": 5.490196078431373, "grad_norm": 0.15416766663654213, "learning_rate": 5.482411209970742e-06, "loss": 0.1324, "loss_nan_ranks": 0, "loss_rank_avg": 0.04332681745290756, "step": 560, "valid_targets_mean": 5592.3, "valid_targets_min": 420 }, { "epoch": 5.53921568627451, "grad_norm": 0.16407271932187248, "learning_rate": 5.15020802574256e-06, "loss": 0.1401, "loss_nan_ranks": 0, "loss_rank_avg": 0.055699240416288376, "step": 565, "valid_targets_mean": 5802.0, "valid_targets_min": 776 }, { "epoch": 5.588235294117647, "grad_norm": 0.16225757678922273, "learning_rate": 4.8268941586815275e-06, "loss": 0.1384, "loss_nan_ranks": 0, "loss_rank_avg": 0.05296599119901657, "step": 570, "valid_targets_mean": 5673.7, "valid_targets_min": 1561 }, { "epoch": 5.637254901960784, "grad_norm": 0.1783066013238609, "learning_rate": 4.512663149512915e-06, "loss": 0.1404, "loss_nan_ranks": 0, "loss_rank_avg": 0.05072878301143646, "step": 575, "valid_targets_mean": 4909.6, "valid_targets_min": 270 }, { "epoch": 5.686274509803922, "grad_norm": 0.1635156014972583, "learning_rate": 4.207703101821547e-06, "loss": 0.1452, "loss_nan_ranks": 0, "loss_rank_avg": 0.04394640773534775, "step": 580, "valid_targets_mean": 6057.6, "valid_targets_min": 724 }, { "epoch": 5.735294117647059, "grad_norm": 0.1530197110100291, "learning_rate": 3.912196569450062e-06, "loss": 0.1365, "loss_nan_ranks": 0, "loss_rank_avg": 0.04203708469867706, "step": 585, "valid_targets_mean": 6321.2, "valid_targets_min": 332 }, { "epoch": 5.784313725490196, "grad_norm": 0.15884170707784306, "learning_rate": 3.626320447219325e-06, "loss": 0.1422, "loss_nan_ranks": 0, "loss_rank_avg": 0.04695093259215355, "step": 590, "valid_targets_mean": 5222.4, "valid_targets_min": 825 }, { "epoch": 5.833333333333333, "grad_norm": 0.1630943874822207, "learning_rate": 3.350245865036439e-06, "loss": 0.1399, "loss_nan_ranks": 0, "loss_rank_avg": 0.043691154569387436, "step": 595, "valid_targets_mean": 6610.5, "valid_targets_min": 924 }, { "epoch": 5.882352941176471, "grad_norm": 0.14034024802523662, "learning_rate": 3.0841380854536986e-06, "loss": 0.1374, "loss_nan_ranks": 0, "loss_rank_avg": 0.03624898940324783, "step": 600, "valid_targets_mean": 7330.9, "valid_targets_min": 2730 }, { "epoch": 5.931372549019608, "grad_norm": 0.14334753955970883, "learning_rate": 2.828156404739879e-06, "loss": 0.1353, "loss_nan_ranks": 0, "loss_rank_avg": 0.041707608848810196, "step": 605, "valid_targets_mean": 6577.7, "valid_targets_min": 977 }, { "epoch": 5.980392156862745, "grad_norm": 0.14232337944292772, "learning_rate": 2.5824540575229475e-06, "loss": 0.1297, "loss_nan_ranks": 0, "loss_rank_avg": 0.033248208463191986, "step": 610, "valid_targets_mean": 5936.8, "valid_targets_min": 1856 }, { "epoch": 6.029411764705882, "grad_norm": 0.1528237861523893, "learning_rate": 2.3471781250614932e-06, "loss": 0.1409, "loss_nan_ranks": 0, "loss_rank_avg": 0.04537317529320717, "step": 615, "valid_targets_mean": 5408.1, "valid_targets_min": 2357 }, { "epoch": 6.078431372549019, "grad_norm": 0.15536078711039872, "learning_rate": 2.122469447199529e-06, "loss": 0.1444, "loss_nan_ranks": 0, "loss_rank_avg": 0.04748551920056343, "step": 620, "valid_targets_mean": 6326.7, "valid_targets_min": 1518 }, { "epoch": 6.127450980392156, "grad_norm": 0.15104775706307438, "learning_rate": 1.908462538057607e-06, "loss": 0.1356, "loss_nan_ranks": 0, "loss_rank_avg": 0.044153109192848206, "step": 625, "valid_targets_mean": 5396.5, "valid_targets_min": 1338 }, { "epoch": 6.176470588235294, "grad_norm": 0.1532713977989426, "learning_rate": 1.7052855055105477e-06, "loss": 0.1403, "loss_nan_ranks": 0, "loss_rank_avg": 0.04733777046203613, "step": 630, "valid_targets_mean": 6734.2, "valid_targets_min": 2804 }, { "epoch": 6.2254901960784315, "grad_norm": 0.15270643758494024, "learning_rate": 1.5130599745000663e-06, "loss": 0.135, "loss_nan_ranks": 0, "loss_rank_avg": 0.040537070482969284, "step": 635, "valid_targets_mean": 5465.8, "valid_targets_min": 1202 }, { "epoch": 6.2745098039215685, "grad_norm": 0.1478234433976078, "learning_rate": 1.331901014228192e-06, "loss": 0.129, "loss_nan_ranks": 0, "loss_rank_avg": 0.05076032876968384, "step": 640, "valid_targets_mean": 5307.6, "valid_targets_min": 974 }, { "epoch": 6.323529411764706, "grad_norm": 0.14756205953677878, "learning_rate": 1.161917069275047e-06, "loss": 0.1436, "loss_nan_ranks": 0, "loss_rank_avg": 0.04347331076860428, "step": 645, "valid_targets_mean": 5066.8, "valid_targets_min": 727 }, { "epoch": 6.372549019607844, "grad_norm": 0.15417081876853794, "learning_rate": 1.0032098946822244e-06, "loss": 0.1402, "loss_nan_ranks": 0, "loss_rank_avg": 0.041828703135252, "step": 650, "valid_targets_mean": 6169.8, "valid_targets_min": 1408 }, { "epoch": 6.421568627450981, "grad_norm": 0.16730842517730218, "learning_rate": 8.558744950406361e-07, "loss": 0.1344, "loss_nan_ranks": 0, "loss_rank_avg": 0.052832819521427155, "step": 655, "valid_targets_mean": 6030.4, "valid_targets_min": 916 }, { "epoch": 6.470588235294118, "grad_norm": 0.14840505399338527, "learning_rate": 7.199990676192836e-07, "loss": 0.133, "loss_nan_ranks": 0, "loss_rank_avg": 0.042389050126075745, "step": 660, "valid_targets_mean": 6102.2, "valid_targets_min": 1060 }, { "epoch": 6.519607843137255, "grad_norm": 0.1464931099328798, "learning_rate": 5.956649495689992e-07, "loss": 0.1328, "loss_nan_ranks": 0, "loss_rank_avg": 0.052474960684776306, "step": 665, "valid_targets_mean": 5826.2, "valid_targets_min": 1276 }, { "epoch": 6.568627450980392, "grad_norm": 0.14130389260204995, "learning_rate": 4.829465692327429e-07, "loss": 0.1367, "loss_nan_ranks": 0, "loss_rank_avg": 0.04665312170982361, "step": 670, "valid_targets_mean": 6207.3, "valid_targets_min": 1636 }, { "epoch": 6.617647058823529, "grad_norm": 0.14337796217921622, "learning_rate": 3.819114015916614e-07, "loss": 0.1474, "loss_nan_ranks": 0, "loss_rank_avg": 0.05262928083539009, "step": 675, "valid_targets_mean": 6180.5, "valid_targets_min": 1432 }, { "epoch": 6.666666666666667, "grad_norm": 0.14706066206331941, "learning_rate": 2.9261992787347873e-07, "loss": 0.1413, "loss_nan_ranks": 0, "loss_rank_avg": 0.04194498062133789, "step": 680, "valid_targets_mean": 6359.8, "valid_targets_min": 1625 }, { "epoch": 6.715686274509804, "grad_norm": 0.14057051185988517, "learning_rate": 2.151255993475254e-07, "loss": 0.1398, "loss_nan_ranks": 0, "loss_rank_avg": 0.0460154190659523, "step": 685, "valid_targets_mean": 6352.1, "valid_targets_min": 1706 }, { "epoch": 6.764705882352941, "grad_norm": 0.12402153861336833, "learning_rate": 1.4947480532794489e-07, "loss": 0.1311, "loss_nan_ranks": 0, "loss_rank_avg": 0.03990158438682556, "step": 690, "valid_targets_mean": 7494.4, "valid_targets_min": 270 }, { "epoch": 6.813725490196078, "grad_norm": 0.1599205562983089, "learning_rate": 9.570684540434638e-08, "loss": 0.133, "loss_nan_ranks": 0, "loss_rank_avg": 0.05221497267484665, "step": 695, "valid_targets_mean": 5782.2, "valid_targets_min": 977 }, { "epoch": 6.862745098039216, "grad_norm": 0.1479123937146945, "learning_rate": 5.3853905916443347e-08, "loss": 0.1307, "loss_nan_ranks": 0, "loss_rank_avg": 0.04318413883447647, "step": 700, "valid_targets_mean": 5034.2, "valid_targets_min": 1045 }, { "epoch": 6.911764705882353, "grad_norm": 0.1465353782553542, "learning_rate": 2.3941040686816796e-08, "loss": 0.1387, "loss_nan_ranks": 0, "loss_rank_avg": 0.040560588240623474, "step": 705, "valid_targets_mean": 6557.7, "valid_targets_min": 2578 }, { "epoch": 6.96078431372549, "grad_norm": 0.13650641166527105, "learning_rate": 5.986156023303214e-09, "loss": 0.1264, "loss_nan_ranks": 0, "loss_rank_avg": 0.043502289801836014, "step": 710, "valid_targets_mean": 6664.9, "valid_targets_min": 1868 }, { "epoch": 7.0, "step": 714, "total_flos": 2.800887118061109e+18, "train_loss": 0.0, "train_runtime": 3.2172, "train_samples_per_second": 21305.35, "train_steps_per_second": 221.931 } ], "logging_steps": 5, "max_steps": 714, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.800887118061109e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }