{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008090614886731391, "grad_norm": 10.671297597284719, "learning_rate": 1.2129380053908356e-06, "loss": 2.601, "step": 10 }, { "epoch": 0.016181229773462782, "grad_norm": 6.311089864073009, "learning_rate": 2.560646900269542e-06, "loss": 2.3954, "step": 20 }, { "epoch": 0.024271844660194174, "grad_norm": 5.215362734590839, "learning_rate": 3.908355795148248e-06, "loss": 1.7763, "step": 30 }, { "epoch": 0.032362459546925564, "grad_norm": 2.0676790587143836, "learning_rate": 5.256064690026954e-06, "loss": 1.2086, "step": 40 }, { "epoch": 0.040453074433656956, "grad_norm": 1.4206424846718397, "learning_rate": 6.60377358490566e-06, "loss": 0.8436, "step": 50 }, { "epoch": 0.04854368932038835, "grad_norm": 1.5867176039284459, "learning_rate": 7.951482479784367e-06, "loss": 0.6706, "step": 60 }, { "epoch": 0.05663430420711974, "grad_norm": 1.2138979784248856, "learning_rate": 9.299191374663074e-06, "loss": 0.5858, "step": 70 }, { "epoch": 0.06472491909385113, "grad_norm": 0.7702442127235006, "learning_rate": 1.0646900269541779e-05, "loss": 0.5384, "step": 80 }, { "epoch": 0.07281553398058252, "grad_norm": 1.0114209152057974, "learning_rate": 1.1994609164420486e-05, "loss": 0.4977, "step": 90 }, { "epoch": 0.08090614886731391, "grad_norm": 0.5794843568633474, "learning_rate": 1.3342318059299191e-05, "loss": 0.4639, "step": 100 }, { "epoch": 0.0889967637540453, "grad_norm": 0.696228528125689, "learning_rate": 1.4690026954177898e-05, "loss": 0.4589, "step": 110 }, { "epoch": 0.0970873786407767, "grad_norm": 0.7033818455552552, "learning_rate": 1.6037735849056604e-05, "loss": 0.4511, "step": 120 }, { "epoch": 0.10517799352750809, "grad_norm": 0.6918266955210028, "learning_rate": 1.738544474393531e-05, "loss": 0.4259, "step": 130 }, { "epoch": 0.11326860841423948, "grad_norm": 0.48391302376719975, "learning_rate": 1.8733153638814018e-05, "loss": 0.4271, "step": 140 }, { "epoch": 0.12135922330097088, "grad_norm": 0.6600203232524687, "learning_rate": 2.0080862533692725e-05, "loss": 0.415, "step": 150 }, { "epoch": 0.12944983818770225, "grad_norm": 0.6491506270021533, "learning_rate": 2.1428571428571428e-05, "loss": 0.4047, "step": 160 }, { "epoch": 0.13754045307443366, "grad_norm": 1.108358955768956, "learning_rate": 2.2776280323450135e-05, "loss": 0.3977, "step": 170 }, { "epoch": 0.14563106796116504, "grad_norm": 1.0220828997842137, "learning_rate": 2.4123989218328842e-05, "loss": 0.3972, "step": 180 }, { "epoch": 0.15372168284789645, "grad_norm": 0.7866428488585541, "learning_rate": 2.547169811320755e-05, "loss": 0.3886, "step": 190 }, { "epoch": 0.16181229773462782, "grad_norm": 0.7956757749575036, "learning_rate": 2.6819407008086256e-05, "loss": 0.3881, "step": 200 }, { "epoch": 0.16990291262135923, "grad_norm": 0.7154298194551921, "learning_rate": 2.8167115902964963e-05, "loss": 0.3915, "step": 210 }, { "epoch": 0.1779935275080906, "grad_norm": 0.7172238259413939, "learning_rate": 2.9514824797843667e-05, "loss": 0.384, "step": 220 }, { "epoch": 0.18608414239482202, "grad_norm": 0.7995655410192202, "learning_rate": 3.086253369272237e-05, "loss": 0.3748, "step": 230 }, { "epoch": 0.1941747572815534, "grad_norm": 13.800265685156136, "learning_rate": 3.221024258760108e-05, "loss": 0.3732, "step": 240 }, { "epoch": 0.2022653721682848, "grad_norm": 0.9742105484202757, "learning_rate": 3.355795148247979e-05, "loss": 0.3778, "step": 250 }, { "epoch": 0.21035598705501618, "grad_norm": 0.7304881990540768, "learning_rate": 3.490566037735849e-05, "loss": 0.3779, "step": 260 }, { "epoch": 0.21844660194174756, "grad_norm": 0.6020503863674406, "learning_rate": 3.62533692722372e-05, "loss": 0.3659, "step": 270 }, { "epoch": 0.22653721682847897, "grad_norm": 0.6249460279802084, "learning_rate": 3.76010781671159e-05, "loss": 0.3678, "step": 280 }, { "epoch": 0.23462783171521034, "grad_norm": 0.7546333918045006, "learning_rate": 3.894878706199461e-05, "loss": 0.3608, "step": 290 }, { "epoch": 0.24271844660194175, "grad_norm": 1.0594379228439452, "learning_rate": 4.0296495956873316e-05, "loss": 0.3623, "step": 300 }, { "epoch": 0.25080906148867316, "grad_norm": 0.8968179435463678, "learning_rate": 4.164420485175202e-05, "loss": 0.361, "step": 310 }, { "epoch": 0.2588996763754045, "grad_norm": 0.6785334732318974, "learning_rate": 4.299191374663073e-05, "loss": 0.3587, "step": 320 }, { "epoch": 0.2669902912621359, "grad_norm": 0.7864105607470325, "learning_rate": 4.433962264150944e-05, "loss": 0.3641, "step": 330 }, { "epoch": 0.2750809061488673, "grad_norm": 0.7520885489880877, "learning_rate": 4.5687331536388144e-05, "loss": 0.3571, "step": 340 }, { "epoch": 0.28317152103559873, "grad_norm": 0.7154271960483066, "learning_rate": 4.703504043126685e-05, "loss": 0.3564, "step": 350 }, { "epoch": 0.2912621359223301, "grad_norm": 0.5540540831665184, "learning_rate": 4.838274932614555e-05, "loss": 0.3499, "step": 360 }, { "epoch": 0.2993527508090615, "grad_norm": 0.8972157863979777, "learning_rate": 4.973045822102426e-05, "loss": 0.3546, "step": 370 }, { "epoch": 0.3074433656957929, "grad_norm": 0.5977035180668759, "learning_rate": 4.9999290952604396e-05, "loss": 0.358, "step": 380 }, { "epoch": 0.3155339805825243, "grad_norm": 0.6794119464101197, "learning_rate": 4.9996410516491115e-05, "loss": 0.3442, "step": 390 }, { "epoch": 0.32362459546925565, "grad_norm": 0.6445475083450791, "learning_rate": 4.9991314631296585e-05, "loss": 0.3504, "step": 400 }, { "epoch": 0.33171521035598706, "grad_norm": 0.6254469656785541, "learning_rate": 4.9984003748672604e-05, "loss": 0.3451, "step": 410 }, { "epoch": 0.33980582524271846, "grad_norm": 0.6124809359745962, "learning_rate": 4.997447851658774e-05, "loss": 0.3373, "step": 420 }, { "epoch": 0.3478964401294498, "grad_norm": 0.8314626543045377, "learning_rate": 4.9962739779269887e-05, "loss": 0.3427, "step": 430 }, { "epoch": 0.3559870550161812, "grad_norm": 0.7096404483846562, "learning_rate": 4.9948788577131414e-05, "loss": 0.3402, "step": 440 }, { "epoch": 0.3640776699029126, "grad_norm": 0.5010872233529703, "learning_rate": 4.993262614667696e-05, "loss": 0.3404, "step": 450 }, { "epoch": 0.37216828478964403, "grad_norm": 0.7096873040187438, "learning_rate": 4.9914253920393884e-05, "loss": 0.3374, "step": 460 }, { "epoch": 0.3802588996763754, "grad_norm": 0.6463889915075135, "learning_rate": 4.9893673526625265e-05, "loss": 0.3374, "step": 470 }, { "epoch": 0.3883495145631068, "grad_norm": 0.698463838389325, "learning_rate": 4.987088678942555e-05, "loss": 0.338, "step": 480 }, { "epoch": 0.3964401294498382, "grad_norm": 0.6798285617884782, "learning_rate": 4.984589572839897e-05, "loss": 0.3335, "step": 490 }, { "epoch": 0.4045307443365696, "grad_norm": 0.4599792513564179, "learning_rate": 4.9818702558520485e-05, "loss": 0.3299, "step": 500 }, { "epoch": 0.41262135922330095, "grad_norm": 0.49728635683969147, "learning_rate": 4.978930968993946e-05, "loss": 0.3388, "step": 510 }, { "epoch": 0.42071197411003236, "grad_norm": 0.4217113735156639, "learning_rate": 4.9757719727766085e-05, "loss": 0.3319, "step": 520 }, { "epoch": 0.42880258899676377, "grad_norm": 0.48369122768285383, "learning_rate": 4.972393547184046e-05, "loss": 0.33, "step": 530 }, { "epoch": 0.4368932038834951, "grad_norm": 0.43182058716411714, "learning_rate": 4.968795991648446e-05, "loss": 0.3252, "step": 540 }, { "epoch": 0.4449838187702265, "grad_norm": 0.5876386529955178, "learning_rate": 4.9649796250236344e-05, "loss": 0.324, "step": 550 }, { "epoch": 0.45307443365695793, "grad_norm": 0.5486057928498816, "learning_rate": 4.960944785556814e-05, "loss": 0.3254, "step": 560 }, { "epoch": 0.46116504854368934, "grad_norm": 0.4867958049194759, "learning_rate": 4.956691830858585e-05, "loss": 0.3216, "step": 570 }, { "epoch": 0.4692556634304207, "grad_norm": 0.4571043410958847, "learning_rate": 4.952221137871252e-05, "loss": 0.3206, "step": 580 }, { "epoch": 0.4773462783171521, "grad_norm": 0.5063667000215466, "learning_rate": 4.947533102835413e-05, "loss": 0.322, "step": 590 }, { "epoch": 0.4854368932038835, "grad_norm": 0.4522556669690265, "learning_rate": 4.942628141254843e-05, "loss": 0.3257, "step": 600 }, { "epoch": 0.4935275080906149, "grad_norm": 0.4321047527271199, "learning_rate": 4.937506687859666e-05, "loss": 0.3197, "step": 610 }, { "epoch": 0.5016181229773463, "grad_norm": 0.393401694292632, "learning_rate": 4.932169196567824e-05, "loss": 0.3209, "step": 620 }, { "epoch": 0.5097087378640777, "grad_norm": 0.39900307971080384, "learning_rate": 4.9266161404448454e-05, "loss": 0.3102, "step": 630 }, { "epoch": 0.517799352750809, "grad_norm": 0.4716849647042864, "learning_rate": 4.920848011661919e-05, "loss": 0.3195, "step": 640 }, { "epoch": 0.5258899676375405, "grad_norm": 0.460323217460504, "learning_rate": 4.914865321452274e-05, "loss": 0.3151, "step": 650 }, { "epoch": 0.5339805825242718, "grad_norm": 0.45925505988077353, "learning_rate": 4.908668600065862e-05, "loss": 0.3151, "step": 660 }, { "epoch": 0.5420711974110033, "grad_norm": 0.5830461135145422, "learning_rate": 4.90225839672237e-05, "loss": 0.3216, "step": 670 }, { "epoch": 0.5501618122977346, "grad_norm": 0.4701717654524378, "learning_rate": 4.8956352795625325e-05, "loss": 0.3168, "step": 680 }, { "epoch": 0.558252427184466, "grad_norm": 0.337481033517792, "learning_rate": 4.8887998355977886e-05, "loss": 0.313, "step": 690 }, { "epoch": 0.5663430420711975, "grad_norm": 0.6269962483834282, "learning_rate": 4.881752670658244e-05, "loss": 0.3132, "step": 700 }, { "epoch": 0.5744336569579288, "grad_norm": 0.4662171259348263, "learning_rate": 4.87449440933898e-05, "loss": 0.314, "step": 710 }, { "epoch": 0.5825242718446602, "grad_norm": 0.5290025725406212, "learning_rate": 4.867025694944698e-05, "loss": 0.3109, "step": 720 }, { "epoch": 0.5906148867313916, "grad_norm": 0.43779075196037404, "learning_rate": 4.859347189432699e-05, "loss": 0.3188, "step": 730 }, { "epoch": 0.598705501618123, "grad_norm": 0.5368922851512631, "learning_rate": 4.8514595733542144e-05, "loss": 0.3053, "step": 740 }, { "epoch": 0.6067961165048543, "grad_norm": 0.3739446436447005, "learning_rate": 4.8433635457940915e-05, "loss": 0.3095, "step": 750 }, { "epoch": 0.6148867313915858, "grad_norm": 0.36805536772795056, "learning_rate": 4.8350598243088283e-05, "loss": 0.3081, "step": 760 }, { "epoch": 0.6229773462783171, "grad_norm": 0.6081454922011427, "learning_rate": 4.8265491448629804e-05, "loss": 0.3096, "step": 770 }, { "epoch": 0.6310679611650486, "grad_norm": 0.606989309832587, "learning_rate": 4.817832261763928e-05, "loss": 0.3064, "step": 780 }, { "epoch": 0.63915857605178, "grad_norm": 0.374478457061797, "learning_rate": 4.8089099475950257e-05, "loss": 0.3075, "step": 790 }, { "epoch": 0.6472491909385113, "grad_norm": 0.4495226360651449, "learning_rate": 4.7997829931471225e-05, "loss": 0.3055, "step": 800 }, { "epoch": 0.6553398058252428, "grad_norm": 0.48869687342847196, "learning_rate": 4.7904522073484786e-05, "loss": 0.3056, "step": 810 }, { "epoch": 0.6634304207119741, "grad_norm": 0.4244338878096003, "learning_rate": 4.780918417193065e-05, "loss": 0.3068, "step": 820 }, { "epoch": 0.6715210355987055, "grad_norm": 0.8568600210635637, "learning_rate": 4.7711824676672726e-05, "loss": 0.311, "step": 830 }, { "epoch": 0.6796116504854369, "grad_norm": 0.4818809073072544, "learning_rate": 4.76124522167501e-05, "loss": 0.3139, "step": 840 }, { "epoch": 0.6877022653721683, "grad_norm": 0.5673965426592162, "learning_rate": 4.751107559961238e-05, "loss": 0.3085, "step": 850 }, { "epoch": 0.6957928802588996, "grad_norm": 0.42296772741332755, "learning_rate": 4.740770381033894e-05, "loss": 0.3129, "step": 860 }, { "epoch": 0.7038834951456311, "grad_norm": 0.4358296754132214, "learning_rate": 4.730234601084268e-05, "loss": 0.3058, "step": 870 }, { "epoch": 0.7119741100323624, "grad_norm": 0.4373965191621123, "learning_rate": 4.719501153905793e-05, "loss": 0.3025, "step": 880 }, { "epoch": 0.7200647249190939, "grad_norm": 0.3787352736875979, "learning_rate": 4.7085709908112866e-05, "loss": 0.3034, "step": 890 }, { "epoch": 0.7281553398058253, "grad_norm": 0.469843837143792, "learning_rate": 4.6974450805486305e-05, "loss": 0.303, "step": 900 }, { "epoch": 0.7362459546925566, "grad_norm": 0.4394456386334091, "learning_rate": 4.686124409214917e-05, "loss": 0.2975, "step": 910 }, { "epoch": 0.7443365695792881, "grad_norm": 0.37340718943669743, "learning_rate": 4.674609980169042e-05, "loss": 0.3002, "step": 920 }, { "epoch": 0.7524271844660194, "grad_norm": 0.47566576341803307, "learning_rate": 4.662902813942784e-05, "loss": 0.3051, "step": 930 }, { "epoch": 0.7605177993527508, "grad_norm": 0.4410622981522137, "learning_rate": 4.651003948150349e-05, "loss": 0.2962, "step": 940 }, { "epoch": 0.7686084142394822, "grad_norm": 0.41727057267800893, "learning_rate": 4.638914437396408e-05, "loss": 0.2959, "step": 950 }, { "epoch": 0.7766990291262136, "grad_norm": 0.3766577031650596, "learning_rate": 4.626635353182626e-05, "loss": 0.2951, "step": 960 }, { "epoch": 0.7847896440129449, "grad_norm": 0.36286446424405566, "learning_rate": 4.614167783812694e-05, "loss": 0.294, "step": 970 }, { "epoch": 0.7928802588996764, "grad_norm": 0.4051751512540439, "learning_rate": 4.601512834295874e-05, "loss": 0.2944, "step": 980 }, { "epoch": 0.8009708737864077, "grad_norm": 0.37027627803204927, "learning_rate": 4.588671626249057e-05, "loss": 0.2952, "step": 990 }, { "epoch": 0.8090614886731392, "grad_norm": 0.464872048858869, "learning_rate": 4.5756452977973585e-05, "loss": 0.292, "step": 1000 }, { "epoch": 0.8171521035598706, "grad_norm": 0.4408350560704109, "learning_rate": 4.56243500347324e-05, "loss": 0.3009, "step": 1010 }, { "epoch": 0.8252427184466019, "grad_norm": 0.49398039939609895, "learning_rate": 4.549041914114188e-05, "loss": 0.2969, "step": 1020 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5592891850787267, "learning_rate": 4.535467216758936e-05, "loss": 0.2981, "step": 1030 }, { "epoch": 0.8414239482200647, "grad_norm": 0.4230495195351826, "learning_rate": 4.5217121145422616e-05, "loss": 0.2883, "step": 1040 }, { "epoch": 0.8495145631067961, "grad_norm": 0.33133186454457314, "learning_rate": 4.5077778265883477e-05, "loss": 0.2927, "step": 1050 }, { "epoch": 0.8576051779935275, "grad_norm": 0.3936509630212508, "learning_rate": 4.4936655879027336e-05, "loss": 0.2948, "step": 1060 }, { "epoch": 0.8656957928802589, "grad_norm": 0.4500916019115437, "learning_rate": 4.479376649262855e-05, "loss": 0.2943, "step": 1070 }, { "epoch": 0.8737864077669902, "grad_norm": 0.3668748277040676, "learning_rate": 4.464912277107185e-05, "loss": 0.2928, "step": 1080 }, { "epoch": 0.8818770226537217, "grad_norm": 0.320494366403153, "learning_rate": 4.450273753422992e-05, "loss": 0.2909, "step": 1090 }, { "epoch": 0.889967637540453, "grad_norm": 0.37987923612254676, "learning_rate": 4.435462375632711e-05, "loss": 0.2964, "step": 1100 }, { "epoch": 0.8980582524271845, "grad_norm": 0.39981221876873035, "learning_rate": 4.420479456478957e-05, "loss": 0.2994, "step": 1110 }, { "epoch": 0.9061488673139159, "grad_norm": 0.3091126654651453, "learning_rate": 4.405326323908178e-05, "loss": 0.2851, "step": 1120 }, { "epoch": 0.9142394822006472, "grad_norm": 0.47407567538309503, "learning_rate": 4.390004320952947e-05, "loss": 0.2914, "step": 1130 }, { "epoch": 0.9223300970873787, "grad_norm": 0.4473436088105401, "learning_rate": 4.374514805612942e-05, "loss": 0.2958, "step": 1140 }, { "epoch": 0.93042071197411, "grad_norm": 0.3335572127939302, "learning_rate": 4.358859150734576e-05, "loss": 0.2915, "step": 1150 }, { "epoch": 0.9385113268608414, "grad_norm": 0.4410700993823537, "learning_rate": 4.343038743889324e-05, "loss": 0.2936, "step": 1160 }, { "epoch": 0.9466019417475728, "grad_norm": 0.4566532222343424, "learning_rate": 4.3270549872507415e-05, "loss": 0.2921, "step": 1170 }, { "epoch": 0.9546925566343042, "grad_norm": 0.34831612960636166, "learning_rate": 4.3109092974701895e-05, "loss": 0.2961, "step": 1180 }, { "epoch": 0.9627831715210357, "grad_norm": 0.33970324826345877, "learning_rate": 4.2946031055512733e-05, "loss": 0.2887, "step": 1190 }, { "epoch": 0.970873786407767, "grad_norm": 0.46192580192198573, "learning_rate": 4.2781378567230145e-05, "loss": 0.29, "step": 1200 }, { "epoch": 0.9789644012944984, "grad_norm": 0.4615119904861507, "learning_rate": 4.2615150103117576e-05, "loss": 0.2886, "step": 1210 }, { "epoch": 0.9870550161812298, "grad_norm": 0.2981731488057052, "learning_rate": 4.24473603961183e-05, "loss": 0.2908, "step": 1220 }, { "epoch": 0.9951456310679612, "grad_norm": 0.3632890670918485, "learning_rate": 4.227802431754961e-05, "loss": 0.2924, "step": 1230 }, { "epoch": 1.0032362459546926, "grad_norm": 0.4150403194043393, "learning_rate": 4.210715687578481e-05, "loss": 0.2727, "step": 1240 }, { "epoch": 1.0113268608414239, "grad_norm": 0.34869821901507675, "learning_rate": 4.193477321492293e-05, "loss": 0.2453, "step": 1250 }, { "epoch": 1.0194174757281553, "grad_norm": 0.3601010037088075, "learning_rate": 4.176088861344657e-05, "loss": 0.2414, "step": 1260 }, { "epoch": 1.0275080906148868, "grad_norm": 0.32325115826296424, "learning_rate": 4.158551848286773e-05, "loss": 0.2425, "step": 1270 }, { "epoch": 1.035598705501618, "grad_norm": 0.38691862676257593, "learning_rate": 4.140867836636189e-05, "loss": 0.247, "step": 1280 }, { "epoch": 1.0436893203883495, "grad_norm": 0.34882276619380004, "learning_rate": 4.1230383937390374e-05, "loss": 0.239, "step": 1290 }, { "epoch": 1.051779935275081, "grad_norm": 0.3951880114595209, "learning_rate": 4.1050650998311215e-05, "loss": 0.2375, "step": 1300 }, { "epoch": 1.0598705501618122, "grad_norm": 0.3736593550545292, "learning_rate": 4.086949547897862e-05, "loss": 0.2479, "step": 1310 }, { "epoch": 1.0679611650485437, "grad_norm": 0.34774896387912796, "learning_rate": 4.068693343533103e-05, "loss": 0.2372, "step": 1320 }, { "epoch": 1.0760517799352751, "grad_norm": 0.3330973525666551, "learning_rate": 4.050298104796812e-05, "loss": 0.2413, "step": 1330 }, { "epoch": 1.0841423948220066, "grad_norm": 0.27924340181820345, "learning_rate": 4.0317654620716704e-05, "loss": 0.2454, "step": 1340 }, { "epoch": 1.0922330097087378, "grad_norm": 0.3807786754448861, "learning_rate": 4.013097057918566e-05, "loss": 0.2457, "step": 1350 }, { "epoch": 1.1003236245954693, "grad_norm": 0.34402611290470836, "learning_rate": 3.9942945469310175e-05, "loss": 0.2406, "step": 1360 }, { "epoch": 1.1084142394822007, "grad_norm": 0.3087094208360471, "learning_rate": 3.9753595955885264e-05, "loss": 0.2403, "step": 1370 }, { "epoch": 1.116504854368932, "grad_norm": 0.334687554837771, "learning_rate": 3.9562938821088705e-05, "loss": 0.2443, "step": 1380 }, { "epoch": 1.1245954692556634, "grad_norm": 0.34101610586326964, "learning_rate": 3.9370990962993695e-05, "loss": 0.2455, "step": 1390 }, { "epoch": 1.132686084142395, "grad_norm": 0.36304196735659044, "learning_rate": 3.9177769394071086e-05, "loss": 0.2423, "step": 1400 }, { "epoch": 1.1407766990291262, "grad_norm": 0.32455755430896477, "learning_rate": 3.898329123968163e-05, "loss": 0.2424, "step": 1410 }, { "epoch": 1.1488673139158576, "grad_norm": 0.29393617658060145, "learning_rate": 3.87875737365581e-05, "loss": 0.2383, "step": 1420 }, { "epoch": 1.156957928802589, "grad_norm": 0.40951987232018516, "learning_rate": 3.8590634231277626e-05, "loss": 0.2605, "step": 1430 }, { "epoch": 1.1650485436893203, "grad_norm": 0.3441977909919604, "learning_rate": 3.8392490178724184e-05, "loss": 0.2455, "step": 1440 }, { "epoch": 1.1731391585760518, "grad_norm": 0.3112960711261136, "learning_rate": 3.8193159140541645e-05, "loss": 0.2419, "step": 1450 }, { "epoch": 1.1812297734627832, "grad_norm": 0.3032973624770194, "learning_rate": 3.7992658783577215e-05, "loss": 0.2422, "step": 1460 }, { "epoch": 1.1893203883495145, "grad_norm": 0.29109047481785005, "learning_rate": 3.779100687831563e-05, "loss": 0.2462, "step": 1470 }, { "epoch": 1.197411003236246, "grad_norm": 0.2948022665787199, "learning_rate": 3.758822129730415e-05, "loss": 0.245, "step": 1480 }, { "epoch": 1.2055016181229774, "grad_norm": 0.31258495126302516, "learning_rate": 3.738432001356851e-05, "loss": 0.2455, "step": 1490 }, { "epoch": 1.2135922330097086, "grad_norm": 0.27328969134267245, "learning_rate": 3.7179321099019916e-05, "loss": 0.2369, "step": 1500 }, { "epoch": 1.22168284789644, "grad_norm": 0.2847184801501813, "learning_rate": 3.6973242722853365e-05, "loss": 0.2402, "step": 1510 }, { "epoch": 1.2297734627831716, "grad_norm": 0.29812191017952583, "learning_rate": 3.6766103149937295e-05, "loss": 0.2427, "step": 1520 }, { "epoch": 1.237864077669903, "grad_norm": 0.3017615376479855, "learning_rate": 3.655792073919471e-05, "loss": 0.2363, "step": 1530 }, { "epoch": 1.2459546925566343, "grad_norm": 0.27567364203177047, "learning_rate": 3.634871394197607e-05, "loss": 0.2388, "step": 1540 }, { "epoch": 1.2540453074433657, "grad_norm": 0.2685513406212581, "learning_rate": 3.6138501300423934e-05, "loss": 0.2378, "step": 1550 }, { "epoch": 1.262135922330097, "grad_norm": 0.25546275734682755, "learning_rate": 3.592730144582948e-05, "loss": 0.2341, "step": 1560 }, { "epoch": 1.2702265372168284, "grad_norm": 0.2841532225060496, "learning_rate": 3.571513309698131e-05, "loss": 0.2366, "step": 1570 }, { "epoch": 1.27831715210356, "grad_norm": 0.30541995269870376, "learning_rate": 3.5502015058506335e-05, "loss": 0.2375, "step": 1580 }, { "epoch": 1.2864077669902914, "grad_norm": 0.34844869264740597, "learning_rate": 3.528796621920307e-05, "loss": 0.239, "step": 1590 }, { "epoch": 1.2944983818770226, "grad_norm": 0.30708349077357805, "learning_rate": 3.50730055503676e-05, "loss": 0.2356, "step": 1600 }, { "epoch": 1.302588996763754, "grad_norm": 0.3175696506236882, "learning_rate": 3.485715210411204e-05, "loss": 0.2358, "step": 1610 }, { "epoch": 1.3106796116504853, "grad_norm": 0.29197347345983926, "learning_rate": 3.4640425011676034e-05, "loss": 0.2408, "step": 1620 }, { "epoch": 1.3187702265372168, "grad_norm": 0.28169129894596506, "learning_rate": 3.442284348173106e-05, "loss": 0.2395, "step": 1630 }, { "epoch": 1.3268608414239482, "grad_norm": 0.3992186155056125, "learning_rate": 3.420442679867796e-05, "loss": 0.2391, "step": 1640 }, { "epoch": 1.3349514563106797, "grad_norm": 0.35636813012626095, "learning_rate": 3.398519432093782e-05, "loss": 0.2374, "step": 1650 }, { "epoch": 1.343042071197411, "grad_norm": 0.27742962322683806, "learning_rate": 3.376516547923614e-05, "loss": 0.2336, "step": 1660 }, { "epoch": 1.3511326860841424, "grad_norm": 0.28660763778902115, "learning_rate": 3.3544359774880714e-05, "loss": 0.2371, "step": 1670 }, { "epoch": 1.3592233009708738, "grad_norm": 0.40153445123463827, "learning_rate": 3.3322796778033204e-05, "loss": 0.2376, "step": 1680 }, { "epoch": 1.367313915857605, "grad_norm": 0.3035749815074104, "learning_rate": 3.3100496125974624e-05, "loss": 0.2377, "step": 1690 }, { "epoch": 1.3754045307443366, "grad_norm": 0.342174636479568, "learning_rate": 3.2877477521364895e-05, "loss": 0.2347, "step": 1700 }, { "epoch": 1.383495145631068, "grad_norm": 0.3199103842497454, "learning_rate": 3.2653760730496555e-05, "loss": 0.2297, "step": 1710 }, { "epoch": 1.3915857605177995, "grad_norm": 0.27362978201436644, "learning_rate": 3.242936558154285e-05, "loss": 0.2356, "step": 1720 }, { "epoch": 1.3996763754045307, "grad_norm": 0.27057113114600895, "learning_rate": 3.2204311962800426e-05, "loss": 0.2322, "step": 1730 }, { "epoch": 1.4077669902912622, "grad_norm": 0.26643871030324334, "learning_rate": 3.197861982092651e-05, "loss": 0.2384, "step": 1740 }, { "epoch": 1.4158576051779934, "grad_norm": 0.31497718220761595, "learning_rate": 3.175230915917108e-05, "loss": 0.2427, "step": 1750 }, { "epoch": 1.4239482200647249, "grad_norm": 0.3267383691103009, "learning_rate": 3.152540003560398e-05, "loss": 0.2417, "step": 1760 }, { "epoch": 1.4320388349514563, "grad_norm": 0.29663197558136717, "learning_rate": 3.129791256133712e-05, "loss": 0.2288, "step": 1770 }, { "epoch": 1.4401294498381878, "grad_norm": 0.32445565003875587, "learning_rate": 3.106986689874204e-05, "loss": 0.2347, "step": 1780 }, { "epoch": 1.448220064724919, "grad_norm": 0.29535887592241783, "learning_rate": 3.0841283259662875e-05, "loss": 0.2385, "step": 1790 }, { "epoch": 1.4563106796116505, "grad_norm": 0.2733621998571008, "learning_rate": 3.0612181903625014e-05, "loss": 0.2359, "step": 1800 }, { "epoch": 1.4644012944983817, "grad_norm": 0.2698891118245767, "learning_rate": 3.0382583136039444e-05, "loss": 0.2339, "step": 1810 }, { "epoch": 1.4724919093851132, "grad_norm": 0.30735748022487547, "learning_rate": 3.015250730640308e-05, "loss": 0.237, "step": 1820 }, { "epoch": 1.4805825242718447, "grad_norm": 0.35024703112347744, "learning_rate": 2.9921974806495178e-05, "loss": 0.2301, "step": 1830 }, { "epoch": 1.4886731391585761, "grad_norm": 0.27111655738392937, "learning_rate": 2.969100606856998e-05, "loss": 0.2339, "step": 1840 }, { "epoch": 1.4967637540453074, "grad_norm": 0.2787522117740128, "learning_rate": 2.9459621563545825e-05, "loss": 0.2385, "step": 1850 }, { "epoch": 1.5048543689320388, "grad_norm": 0.2969133211792456, "learning_rate": 2.9227841799190775e-05, "loss": 0.2305, "step": 1860 }, { "epoch": 1.51294498381877, "grad_norm": 0.2613286640167969, "learning_rate": 2.8995687318304975e-05, "loss": 0.2328, "step": 1870 }, { "epoch": 1.5210355987055015, "grad_norm": 0.27216163489187184, "learning_rate": 2.8763178696899995e-05, "loss": 0.2373, "step": 1880 }, { "epoch": 1.529126213592233, "grad_norm": 0.2964461145886047, "learning_rate": 2.853033654237507e-05, "loss": 0.2289, "step": 1890 }, { "epoch": 1.5372168284789645, "grad_norm": 0.32949202305010017, "learning_rate": 2.8297181491690756e-05, "loss": 0.23, "step": 1900 }, { "epoch": 1.545307443365696, "grad_norm": 0.3071880700004742, "learning_rate": 2.8063734209539773e-05, "loss": 0.2367, "step": 1910 }, { "epoch": 1.5533980582524272, "grad_norm": 0.2529468819780906, "learning_rate": 2.783001538651554e-05, "loss": 0.2292, "step": 1920 }, { "epoch": 1.5614886731391586, "grad_norm": 0.2773840151703638, "learning_rate": 2.7596045737278336e-05, "loss": 0.2398, "step": 1930 }, { "epoch": 1.5695792880258899, "grad_norm": 0.295185746563609, "learning_rate": 2.7361845998719315e-05, "loss": 0.2325, "step": 1940 }, { "epoch": 1.5776699029126213, "grad_norm": 0.25593073129115584, "learning_rate": 2.7127436928122612e-05, "loss": 0.2367, "step": 1950 }, { "epoch": 1.5857605177993528, "grad_norm": 0.3516341035154705, "learning_rate": 2.6892839301325623e-05, "loss": 0.239, "step": 1960 }, { "epoch": 1.5938511326860842, "grad_norm": 0.27153437539635045, "learning_rate": 2.6658073910877603e-05, "loss": 0.2288, "step": 1970 }, { "epoch": 1.6019417475728155, "grad_norm": 0.26490849237503283, "learning_rate": 2.6423161564196803e-05, "loss": 0.231, "step": 1980 }, { "epoch": 1.610032362459547, "grad_norm": 0.25415769405974487, "learning_rate": 2.6188123081726306e-05, "loss": 0.2341, "step": 1990 }, { "epoch": 1.6181229773462782, "grad_norm": 0.2566774871023634, "learning_rate": 2.5952979295088714e-05, "loss": 0.2303, "step": 2000 }, { "epoch": 1.6262135922330097, "grad_norm": 0.28720215998836357, "learning_rate": 2.57177510452398e-05, "loss": 0.2297, "step": 2010 }, { "epoch": 1.6343042071197411, "grad_norm": 0.26535154825569424, "learning_rate": 2.5482459180621377e-05, "loss": 0.2336, "step": 2020 }, { "epoch": 1.6423948220064726, "grad_norm": 0.2348030131858575, "learning_rate": 2.524712455531347e-05, "loss": 0.2283, "step": 2030 }, { "epoch": 1.650485436893204, "grad_norm": 0.2981340903697484, "learning_rate": 2.501176802718599e-05, "loss": 0.2367, "step": 2040 }, { "epoch": 1.6585760517799353, "grad_norm": 0.2508609786813777, "learning_rate": 2.4776410456050165e-05, "loss": 0.232, "step": 2050 }, { "epoch": 1.6666666666666665, "grad_norm": 0.2760175375931361, "learning_rate": 2.4541072701809624e-05, "loss": 0.2348, "step": 2060 }, { "epoch": 1.674757281553398, "grad_norm": 0.2681609470474934, "learning_rate": 2.4305775622611627e-05, "loss": 0.2285, "step": 2070 }, { "epoch": 1.6828478964401294, "grad_norm": 0.3323878551187027, "learning_rate": 2.4070540072998372e-05, "loss": 0.2272, "step": 2080 }, { "epoch": 1.690938511326861, "grad_norm": 0.24794381758629244, "learning_rate": 2.3835386902058637e-05, "loss": 0.2292, "step": 2090 }, { "epoch": 1.6990291262135924, "grad_norm": 0.26234035679605566, "learning_rate": 2.360033695157995e-05, "loss": 0.2337, "step": 2100 }, { "epoch": 1.7071197411003236, "grad_norm": 0.3095409200594481, "learning_rate": 2.3365411054201315e-05, "loss": 0.2265, "step": 2110 }, { "epoch": 1.715210355987055, "grad_norm": 0.24814239367844307, "learning_rate": 2.3130630031566818e-05, "loss": 0.2269, "step": 2120 }, { "epoch": 1.7233009708737863, "grad_norm": 0.2503027719931105, "learning_rate": 2.2896014692480226e-05, "loss": 0.231, "step": 2130 }, { "epoch": 1.7313915857605178, "grad_norm": 0.24658598678553392, "learning_rate": 2.266158583106063e-05, "loss": 0.228, "step": 2140 }, { "epoch": 1.7394822006472492, "grad_norm": 0.23999491418212948, "learning_rate": 2.2427364224899502e-05, "loss": 0.2289, "step": 2150 }, { "epoch": 1.7475728155339807, "grad_norm": 0.24294562795428556, "learning_rate": 2.2193370633219115e-05, "loss": 0.2295, "step": 2160 }, { "epoch": 1.755663430420712, "grad_norm": 0.27195306057038027, "learning_rate": 2.1959625795032664e-05, "loss": 0.2312, "step": 2170 }, { "epoch": 1.7637540453074434, "grad_norm": 0.31612469238407165, "learning_rate": 2.1726150427306182e-05, "loss": 0.2286, "step": 2180 }, { "epoch": 1.7718446601941746, "grad_norm": 0.29752263873224494, "learning_rate": 2.1492965223122305e-05, "loss": 0.2267, "step": 2190 }, { "epoch": 1.779935275080906, "grad_norm": 0.2603667246009949, "learning_rate": 2.126009084984629e-05, "loss": 0.2251, "step": 2200 }, { "epoch": 1.7880258899676376, "grad_norm": 0.26289071039247, "learning_rate": 2.102754794729426e-05, "loss": 0.2246, "step": 2210 }, { "epoch": 1.796116504854369, "grad_norm": 0.263903471162293, "learning_rate": 2.079535712590382e-05, "loss": 0.2282, "step": 2220 }, { "epoch": 1.8042071197411005, "grad_norm": 0.27035700806263924, "learning_rate": 2.056353896490742e-05, "loss": 0.2231, "step": 2230 }, { "epoch": 1.8122977346278317, "grad_norm": 0.2654192689718002, "learning_rate": 2.0332114010508334e-05, "loss": 0.2268, "step": 2240 }, { "epoch": 1.820388349514563, "grad_norm": 0.23959916600696418, "learning_rate": 2.010110277405966e-05, "loss": 0.2274, "step": 2250 }, { "epoch": 1.8284789644012944, "grad_norm": 0.24872872826123787, "learning_rate": 1.9870525730246424e-05, "loss": 0.2293, "step": 2260 }, { "epoch": 1.8365695792880259, "grad_norm": 0.2645954361424342, "learning_rate": 1.9640403315270824e-05, "loss": 0.2286, "step": 2270 }, { "epoch": 1.8446601941747574, "grad_norm": 0.24420673184598343, "learning_rate": 1.9410755925041006e-05, "loss": 0.2257, "step": 2280 }, { "epoch": 1.8527508090614888, "grad_norm": 0.24705849748195569, "learning_rate": 1.918160391336335e-05, "loss": 0.2259, "step": 2290 }, { "epoch": 1.86084142394822, "grad_norm": 0.2951271763878442, "learning_rate": 1.8952967590138472e-05, "loss": 0.2248, "step": 2300 }, { "epoch": 1.8689320388349513, "grad_norm": 0.2520418480110052, "learning_rate": 1.8724867219561203e-05, "loss": 0.2287, "step": 2310 }, { "epoch": 1.8770226537216828, "grad_norm": 0.294901863474176, "learning_rate": 1.8497323018324476e-05, "loss": 0.2252, "step": 2320 }, { "epoch": 1.8851132686084142, "grad_norm": 0.2620675111989084, "learning_rate": 1.8270355153827598e-05, "loss": 0.2243, "step": 2330 }, { "epoch": 1.8932038834951457, "grad_norm": 0.21729342901905718, "learning_rate": 1.804398374238872e-05, "loss": 0.2231, "step": 2340 }, { "epoch": 1.9012944983818771, "grad_norm": 0.26438351131145754, "learning_rate": 1.781822884746196e-05, "loss": 0.2203, "step": 2350 }, { "epoch": 1.9093851132686084, "grad_norm": 0.22358504292564055, "learning_rate": 1.7593110477859153e-05, "loss": 0.223, "step": 2360 }, { "epoch": 1.9174757281553398, "grad_norm": 0.24990353950246044, "learning_rate": 1.736864858597645e-05, "loss": 0.2233, "step": 2370 }, { "epoch": 1.925566343042071, "grad_norm": 0.24537211238465853, "learning_rate": 1.7144863066025955e-05, "loss": 0.2193, "step": 2380 }, { "epoch": 1.9336569579288025, "grad_norm": 0.2394304662544921, "learning_rate": 1.692177375227242e-05, "loss": 0.2251, "step": 2390 }, { "epoch": 1.941747572815534, "grad_norm": 0.2785110439501238, "learning_rate": 1.669940041727538e-05, "loss": 0.218, "step": 2400 }, { "epoch": 1.9498381877022655, "grad_norm": 0.23030205395765063, "learning_rate": 1.6477762770136707e-05, "loss": 0.225, "step": 2410 }, { "epoch": 1.9579288025889967, "grad_norm": 0.24458486846119534, "learning_rate": 1.625688045475371e-05, "loss": 0.2208, "step": 2420 }, { "epoch": 1.9660194174757282, "grad_norm": 0.223552780447482, "learning_rate": 1.603677304807815e-05, "loss": 0.2223, "step": 2430 }, { "epoch": 1.9741100323624594, "grad_norm": 0.23262918496301524, "learning_rate": 1.5817460058381088e-05, "loss": 0.2274, "step": 2440 }, { "epoch": 1.9822006472491909, "grad_norm": 0.2210251232882193, "learning_rate": 1.5598960923523842e-05, "loss": 0.2248, "step": 2450 }, { "epoch": 1.9902912621359223, "grad_norm": 0.2543751678692885, "learning_rate": 1.5381295009235262e-05, "loss": 0.2277, "step": 2460 }, { "epoch": 1.9983818770226538, "grad_norm": 0.2767394174239906, "learning_rate": 1.5164481607395238e-05, "loss": 0.2243, "step": 2470 }, { "epoch": 2.0064724919093853, "grad_norm": 0.2831027468460383, "learning_rate": 1.4948539934324923e-05, "loss": 0.1818, "step": 2480 }, { "epoch": 2.0145631067961167, "grad_norm": 0.23514037047255174, "learning_rate": 1.4733489129083534e-05, "loss": 0.1718, "step": 2490 }, { "epoch": 2.0226537216828477, "grad_norm": 0.23543211368476746, "learning_rate": 1.4519348251772058e-05, "loss": 0.1696, "step": 2500 }, { "epoch": 2.030744336569579, "grad_norm": 0.24256191701199467, "learning_rate": 1.4306136281843962e-05, "loss": 0.1691, "step": 2510 }, { "epoch": 2.0388349514563107, "grad_norm": 0.22234069623742508, "learning_rate": 1.4093872116422979e-05, "loss": 0.169, "step": 2520 }, { "epoch": 2.046925566343042, "grad_norm": 0.23132835680535235, "learning_rate": 1.3882574568628315e-05, "loss": 0.168, "step": 2530 }, { "epoch": 2.0550161812297736, "grad_norm": 0.23134456189741553, "learning_rate": 1.3672262365907163e-05, "loss": 0.1684, "step": 2540 }, { "epoch": 2.063106796116505, "grad_norm": 0.22760921973646828, "learning_rate": 1.3462954148374899e-05, "loss": 0.1661, "step": 2550 }, { "epoch": 2.071197411003236, "grad_norm": 0.21228629003960797, "learning_rate": 1.3254668467163029e-05, "loss": 0.1696, "step": 2560 }, { "epoch": 2.0792880258899675, "grad_norm": 0.2182291343034792, "learning_rate": 1.3047423782774937e-05, "loss": 0.1649, "step": 2570 }, { "epoch": 2.087378640776699, "grad_norm": 0.21718735716811796, "learning_rate": 1.2841238463449743e-05, "loss": 0.1719, "step": 2580 }, { "epoch": 2.0954692556634305, "grad_norm": 0.22472401035201942, "learning_rate": 1.2636130783534319e-05, "loss": 0.1703, "step": 2590 }, { "epoch": 2.103559870550162, "grad_norm": 0.21069125465698005, "learning_rate": 1.2432118921863604e-05, "loss": 0.168, "step": 2600 }, { "epoch": 2.1116504854368934, "grad_norm": 0.21446866638864276, "learning_rate": 1.2229220960149431e-05, "loss": 0.1695, "step": 2610 }, { "epoch": 2.1197411003236244, "grad_norm": 0.21738468937149366, "learning_rate": 1.2027454881377889e-05, "loss": 0.1675, "step": 2620 }, { "epoch": 2.127831715210356, "grad_norm": 0.22154845083846222, "learning_rate": 1.1826838568215526e-05, "loss": 0.1685, "step": 2630 }, { "epoch": 2.1359223300970873, "grad_norm": 0.20713221800601425, "learning_rate": 1.1627389801424351e-05, "loss": 0.1706, "step": 2640 }, { "epoch": 2.144012944983819, "grad_norm": 0.20802282917927592, "learning_rate": 1.1429126258285946e-05, "loss": 0.1661, "step": 2650 }, { "epoch": 2.1521035598705502, "grad_norm": 0.20646189593102915, "learning_rate": 1.1232065511034696e-05, "loss": 0.1663, "step": 2660 }, { "epoch": 2.1601941747572817, "grad_norm": 0.21341477579858295, "learning_rate": 1.1036225025300357e-05, "loss": 0.1687, "step": 2670 }, { "epoch": 2.168284789644013, "grad_norm": 0.21356709357188863, "learning_rate": 1.0841622158560085e-05, "loss": 0.1668, "step": 2680 }, { "epoch": 2.176375404530744, "grad_norm": 0.20685337347705235, "learning_rate": 1.0648274158599994e-05, "loss": 0.1698, "step": 2690 }, { "epoch": 2.1844660194174756, "grad_norm": 0.22156407557595678, "learning_rate": 1.0456198161986489e-05, "loss": 0.1753, "step": 2700 }, { "epoch": 2.192556634304207, "grad_norm": 0.2035462284179136, "learning_rate": 1.0265411192547462e-05, "loss": 0.1718, "step": 2710 }, { "epoch": 2.2006472491909386, "grad_norm": 0.20189924705262216, "learning_rate": 1.0075930159863416e-05, "loss": 0.1661, "step": 2720 }, { "epoch": 2.20873786407767, "grad_norm": 0.20956549431298424, "learning_rate": 9.887771857768796e-06, "loss": 0.1683, "step": 2730 }, { "epoch": 2.2168284789644015, "grad_norm": 0.2140205114087363, "learning_rate": 9.700952962863513e-06, "loss": 0.1701, "step": 2740 }, { "epoch": 2.2249190938511325, "grad_norm": 0.20398172720239158, "learning_rate": 9.515490033034893e-06, "loss": 0.1667, "step": 2750 }, { "epoch": 2.233009708737864, "grad_norm": 0.20165594228560696, "learning_rate": 9.331399505990168e-06, "loss": 0.1655, "step": 2760 }, { "epoch": 2.2411003236245954, "grad_norm": 0.20182970094436348, "learning_rate": 9.148697697799533e-06, "loss": 0.1694, "step": 2770 }, { "epoch": 2.249190938511327, "grad_norm": 0.2291167648037075, "learning_rate": 8.967400801450105e-06, "loss": 0.1685, "step": 2780 }, { "epoch": 2.2572815533980584, "grad_norm": 0.20375204743694134, "learning_rate": 8.787524885410678e-06, "loss": 0.1638, "step": 2790 }, { "epoch": 2.26537216828479, "grad_norm": 0.214007858233087, "learning_rate": 8.60908589220758e-06, "loss": 0.1676, "step": 2800 }, { "epoch": 2.273462783171521, "grad_norm": 0.2025950468246571, "learning_rate": 8.432099637011693e-06, "loss": 0.1657, "step": 2810 }, { "epoch": 2.2815533980582523, "grad_norm": 0.21480260045098348, "learning_rate": 8.256581806236704e-06, "loss": 0.1649, "step": 2820 }, { "epoch": 2.2896440129449838, "grad_norm": 0.2052262400110326, "learning_rate": 8.082547956148873e-06, "loss": 0.1663, "step": 2830 }, { "epoch": 2.2977346278317152, "grad_norm": 0.20019987270861134, "learning_rate": 7.91001351148819e-06, "loss": 0.1653, "step": 2840 }, { "epoch": 2.3058252427184467, "grad_norm": 0.207551520801799, "learning_rate": 7.738993764101324e-06, "loss": 0.1677, "step": 2850 }, { "epoch": 2.313915857605178, "grad_norm": 0.19280865204847908, "learning_rate": 7.569503871586292e-06, "loss": 0.1674, "step": 2860 }, { "epoch": 2.3220064724919096, "grad_norm": 0.20122645680053192, "learning_rate": 7.401558855949004e-06, "loss": 0.1678, "step": 2870 }, { "epoch": 2.3300970873786406, "grad_norm": 0.1945489837836613, "learning_rate": 7.235173602271875e-06, "loss": 0.162, "step": 2880 }, { "epoch": 2.338187702265372, "grad_norm": 0.2051511298220981, "learning_rate": 7.070362857394538e-06, "loss": 0.1712, "step": 2890 }, { "epoch": 2.3462783171521036, "grad_norm": 0.2038615765888219, "learning_rate": 6.907141228606831e-06, "loss": 0.1664, "step": 2900 }, { "epoch": 2.354368932038835, "grad_norm": 0.19824356024446557, "learning_rate": 6.745523182354147e-06, "loss": 0.1634, "step": 2910 }, { "epoch": 2.3624595469255665, "grad_norm": 0.1897897800458373, "learning_rate": 6.585523042955233e-06, "loss": 0.1633, "step": 2920 }, { "epoch": 2.3705501618122975, "grad_norm": 0.19361276633294466, "learning_rate": 6.427154991332665e-06, "loss": 0.1665, "step": 2930 }, { "epoch": 2.378640776699029, "grad_norm": 0.19269698264895235, "learning_rate": 6.2704330637559315e-06, "loss": 0.1632, "step": 2940 }, { "epoch": 2.3867313915857604, "grad_norm": 0.19487091156668288, "learning_rate": 6.115371150597413e-06, "loss": 0.1657, "step": 2950 }, { "epoch": 2.394822006472492, "grad_norm": 0.17726783434092147, "learning_rate": 5.961982995101301e-06, "loss": 0.1652, "step": 2960 }, { "epoch": 2.4029126213592233, "grad_norm": 0.19174116985719267, "learning_rate": 5.810282192165442e-06, "loss": 0.1674, "step": 2970 }, { "epoch": 2.411003236245955, "grad_norm": 0.20196362222659836, "learning_rate": 5.660282187136507e-06, "loss": 0.1648, "step": 2980 }, { "epoch": 2.4190938511326863, "grad_norm": 0.2037293127188244, "learning_rate": 5.511996274618253e-06, "loss": 0.1673, "step": 2990 }, { "epoch": 2.4271844660194173, "grad_norm": 0.2075997573199353, "learning_rate": 5.365437597293238e-06, "loss": 0.1681, "step": 3000 }, { "epoch": 2.4352750809061487, "grad_norm": 0.19485346480204271, "learning_rate": 5.220619144757996e-06, "loss": 0.168, "step": 3010 }, { "epoch": 2.44336569579288, "grad_norm": 0.20032468931249656, "learning_rate": 5.077553752371708e-06, "loss": 0.1695, "step": 3020 }, { "epoch": 2.4514563106796117, "grad_norm": 0.18780814063723766, "learning_rate": 4.936254100118656e-06, "loss": 0.1626, "step": 3030 }, { "epoch": 2.459546925566343, "grad_norm": 0.20597954746449326, "learning_rate": 4.796732711484342e-06, "loss": 0.1688, "step": 3040 }, { "epoch": 2.4676375404530746, "grad_norm": 0.1884986064298571, "learning_rate": 4.659001952345538e-06, "loss": 0.1618, "step": 3050 }, { "epoch": 2.475728155339806, "grad_norm": 0.19330264761303187, "learning_rate": 4.523074029874291e-06, "loss": 0.1611, "step": 3060 }, { "epoch": 2.483818770226537, "grad_norm": 0.1899995190829889, "learning_rate": 4.388960991455998e-06, "loss": 0.1642, "step": 3070 }, { "epoch": 2.4919093851132685, "grad_norm": 0.20352479818055758, "learning_rate": 4.256674723621621e-06, "loss": 0.165, "step": 3080 }, { "epoch": 2.5, "grad_norm": 0.19705480301688083, "learning_rate": 4.126226950994211e-06, "loss": 0.1633, "step": 3090 }, { "epoch": 2.5080906148867315, "grad_norm": 0.19973033430929824, "learning_rate": 3.997629235249692e-06, "loss": 0.1645, "step": 3100 }, { "epoch": 2.516181229773463, "grad_norm": 0.19227297342228458, "learning_rate": 3.870892974092197e-06, "loss": 0.1657, "step": 3110 }, { "epoch": 2.524271844660194, "grad_norm": 0.19226389494086693, "learning_rate": 3.7460294002438444e-06, "loss": 0.1649, "step": 3120 }, { "epoch": 2.5323624595469254, "grad_norm": 0.23947157434034713, "learning_rate": 3.6230495804491864e-06, "loss": 0.1638, "step": 3130 }, { "epoch": 2.540453074433657, "grad_norm": 0.18606261818175104, "learning_rate": 3.5019644144943576e-06, "loss": 0.1657, "step": 3140 }, { "epoch": 2.5485436893203883, "grad_norm": 0.18558275603040017, "learning_rate": 3.382784634241015e-06, "loss": 0.1626, "step": 3150 }, { "epoch": 2.55663430420712, "grad_norm": 0.1838874793051254, "learning_rate": 3.2655208026751816e-06, "loss": 0.1645, "step": 3160 }, { "epoch": 2.5647249190938513, "grad_norm": 0.18161462708162493, "learning_rate": 3.150183312971014e-06, "loss": 0.1625, "step": 3170 }, { "epoch": 2.5728155339805827, "grad_norm": 0.18098413098000662, "learning_rate": 3.036782387569659e-06, "loss": 0.1623, "step": 3180 }, { "epoch": 2.5809061488673137, "grad_norm": 0.19242866333812558, "learning_rate": 2.9253280772732595e-06, "loss": 0.1627, "step": 3190 }, { "epoch": 2.588996763754045, "grad_norm": 0.17889937009144094, "learning_rate": 2.8158302603540965e-06, "loss": 0.1609, "step": 3200 }, { "epoch": 2.5970873786407767, "grad_norm": 0.1879805259242369, "learning_rate": 2.708298641679105e-06, "loss": 0.1642, "step": 3210 }, { "epoch": 2.605177993527508, "grad_norm": 0.18048882079511047, "learning_rate": 2.6027427518497153e-06, "loss": 0.1634, "step": 3220 }, { "epoch": 2.6132686084142396, "grad_norm": 0.18903574393083356, "learning_rate": 2.49917194635714e-06, "loss": 0.1607, "step": 3230 }, { "epoch": 2.6213592233009706, "grad_norm": 0.18631019977805036, "learning_rate": 2.397595404753225e-06, "loss": 0.1589, "step": 3240 }, { "epoch": 2.6294498381877025, "grad_norm": 0.18583873327326217, "learning_rate": 2.2980221298367995e-06, "loss": 0.1679, "step": 3250 }, { "epoch": 2.6375404530744335, "grad_norm": 0.1872827952915852, "learning_rate": 2.2004609468558175e-06, "loss": 0.1648, "step": 3260 }, { "epoch": 2.645631067961165, "grad_norm": 0.18646198991723772, "learning_rate": 2.1049205027251216e-06, "loss": 0.1648, "step": 3270 }, { "epoch": 2.6537216828478964, "grad_norm": 0.18510117286188082, "learning_rate": 2.0114092652600806e-06, "loss": 0.1669, "step": 3280 }, { "epoch": 2.661812297734628, "grad_norm": 0.1813478635510933, "learning_rate": 1.919935522426081e-06, "loss": 0.1601, "step": 3290 }, { "epoch": 2.6699029126213594, "grad_norm": 0.18995739003728715, "learning_rate": 1.8305073816039492e-06, "loss": 0.165, "step": 3300 }, { "epoch": 2.6779935275080904, "grad_norm": 0.18850255636775962, "learning_rate": 1.7431327688714139e-06, "loss": 0.1686, "step": 3310 }, { "epoch": 2.686084142394822, "grad_norm": 0.19405450386616901, "learning_rate": 1.6578194283005804e-06, "loss": 0.1586, "step": 3320 }, { "epoch": 2.6941747572815533, "grad_norm": 0.18168641096398497, "learning_rate": 1.5745749212715794e-06, "loss": 0.1669, "step": 3330 }, { "epoch": 2.7022653721682848, "grad_norm": 0.18390467428127966, "learning_rate": 1.4934066258024182e-06, "loss": 0.1681, "step": 3340 }, { "epoch": 2.7103559870550162, "grad_norm": 0.18623903954773796, "learning_rate": 1.4143217358950217e-06, "loss": 0.1648, "step": 3350 }, { "epoch": 2.7184466019417477, "grad_norm": 0.18708444577479785, "learning_rate": 1.3373272608976668e-06, "loss": 0.1635, "step": 3360 }, { "epoch": 2.726537216828479, "grad_norm": 0.1850383082145996, "learning_rate": 1.2624300248836928e-06, "loss": 0.1607, "step": 3370 }, { "epoch": 2.73462783171521, "grad_norm": 0.19358386622393567, "learning_rate": 1.1896366660467173e-06, "loss": 0.1645, "step": 3380 }, { "epoch": 2.7427184466019416, "grad_norm": 0.17960690916881664, "learning_rate": 1.1189536361122799e-06, "loss": 0.1628, "step": 3390 }, { "epoch": 2.750809061488673, "grad_norm": 0.20127134763525154, "learning_rate": 1.0503871997660036e-06, "loss": 0.168, "step": 3400 }, { "epoch": 2.7588996763754046, "grad_norm": 0.19658936043581707, "learning_rate": 9.83943434098372e-07, "loss": 0.1633, "step": 3410 }, { "epoch": 2.766990291262136, "grad_norm": 0.18650679653778446, "learning_rate": 9.196282280661023e-07, "loss": 0.1673, "step": 3420 }, { "epoch": 2.775080906148867, "grad_norm": 0.1733323921140541, "learning_rate": 8.574472819702029e-07, "loss": 0.1652, "step": 3430 }, { "epoch": 2.783171521035599, "grad_norm": 0.17972872694953243, "learning_rate": 7.974061069507571e-07, "loss": 0.1636, "step": 3440 }, { "epoch": 2.79126213592233, "grad_norm": 0.1777461753836034, "learning_rate": 7.395100244984604e-07, "loss": 0.1634, "step": 3450 }, { "epoch": 2.7993527508090614, "grad_norm": 0.18989630729322288, "learning_rate": 6.837641659829807e-07, "loss": 0.1661, "step": 3460 }, { "epoch": 2.807443365695793, "grad_norm": 0.17625775327505652, "learning_rate": 6.301734721981533e-07, "loss": 0.1643, "step": 3470 }, { "epoch": 2.8155339805825244, "grad_norm": 0.1862820489099266, "learning_rate": 5.787426929240808e-07, "loss": 0.1643, "step": 3480 }, { "epoch": 2.823624595469256, "grad_norm": 0.1790622651864573, "learning_rate": 5.294763865061558e-07, "loss": 0.1626, "step": 3490 }, { "epoch": 2.831715210355987, "grad_norm": 0.173713637483726, "learning_rate": 4.823789194510514e-07, "loss": 0.1639, "step": 3500 }, { "epoch": 2.8398058252427183, "grad_norm": 0.1866652309901256, "learning_rate": 4.3745446603971064e-07, "loss": 0.1602, "step": 3510 }, { "epoch": 2.8478964401294498, "grad_norm": 0.17910113776598907, "learning_rate": 3.947070079573872e-07, "loss": 0.1642, "step": 3520 }, { "epoch": 2.855987055016181, "grad_norm": 0.18065751924055337, "learning_rate": 3.541403339407279e-07, "loss": 0.1656, "step": 3530 }, { "epoch": 2.8640776699029127, "grad_norm": 0.18007167117331419, "learning_rate": 3.1575803944199624e-07, "loss": 0.1651, "step": 3540 }, { "epoch": 2.872168284789644, "grad_norm": 0.18247762658055755, "learning_rate": 2.7956352631038906e-07, "loss": 0.1623, "step": 3550 }, { "epoch": 2.8802588996763756, "grad_norm": 0.17849381652959184, "learning_rate": 2.4556000249054133e-07, "loss": 0.1653, "step": 3560 }, { "epoch": 2.8883495145631066, "grad_norm": 0.1846373828579316, "learning_rate": 2.1375048173818412e-07, "loss": 0.1691, "step": 3570 }, { "epoch": 2.896440129449838, "grad_norm": 0.17151731227871805, "learning_rate": 1.8413778335305e-07, "loss": 0.1606, "step": 3580 }, { "epoch": 2.9045307443365695, "grad_norm": 0.22901848451176904, "learning_rate": 1.567245319290006e-07, "loss": 0.1607, "step": 3590 }, { "epoch": 2.912621359223301, "grad_norm": 0.1841472999781577, "learning_rate": 1.315131571213879e-07, "loss": 0.1659, "step": 3600 }, { "epoch": 2.9207119741100325, "grad_norm": 0.17620735263373793, "learning_rate": 1.0850589343172624e-07, "loss": 0.163, "step": 3610 }, { "epoch": 2.9288025889967635, "grad_norm": 0.1732366383013906, "learning_rate": 8.770478000964532e-08, "loss": 0.1603, "step": 3620 }, { "epoch": 2.9368932038834954, "grad_norm": 0.18106989776343552, "learning_rate": 6.911166047215145e-08, "loss": 0.1639, "step": 3630 }, { "epoch": 2.9449838187702264, "grad_norm": 0.18480994602766684, "learning_rate": 5.272818274023872e-08, "loss": 0.163, "step": 3640 }, { "epoch": 2.953074433656958, "grad_norm": 0.17298187343963853, "learning_rate": 3.855579889282257e-08, "loss": 0.1607, "step": 3650 }, { "epoch": 2.9611650485436893, "grad_norm": 0.1826761908488716, "learning_rate": 2.6595765038045507e-08, "loss": 0.1598, "step": 3660 }, { "epoch": 2.969255663430421, "grad_norm": 0.18104206371371273, "learning_rate": 1.6849141201946693e-08, "loss": 0.1656, "step": 3670 }, { "epoch": 2.9773462783171523, "grad_norm": 0.17420913687043998, "learning_rate": 9.316791234506572e-09, "loss": 0.1589, "step": 3680 }, { "epoch": 2.9854368932038833, "grad_norm": 0.177305619544985, "learning_rate": 3.999382733096968e-09, "loss": 0.1633, "step": 3690 }, { "epoch": 2.9935275080906147, "grad_norm": 0.17610657137080274, "learning_rate": 8.973869832895609e-10, "loss": 0.1623, "step": 3700 }, { "epoch": 3.0, "step": 3708, "total_flos": 6.364819972765516e+18, "train_loss": 0.2649726264235014, "train_runtime": 208629.7363, "train_samples_per_second": 0.569, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 3708, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.364819972765516e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }