{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998683627921449, "eval_steps": 500, "global_step": 2611, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038294460466953076, "grad_norm": 5.426074325401742, "learning_rate": 5.69620253164557e-07, "loss": 0.596, "step": 10 }, { "epoch": 0.007658892093390615, "grad_norm": 1.7300049140411797, "learning_rate": 1.2025316455696204e-06, "loss": 0.4935, "step": 20 }, { "epoch": 0.011488338140085923, "grad_norm": 1.0566614384820672, "learning_rate": 1.8354430379746838e-06, "loss": 0.4179, "step": 30 }, { "epoch": 0.01531778418678123, "grad_norm": 1.1239284474445808, "learning_rate": 2.4683544303797473e-06, "loss": 0.3786, "step": 40 }, { "epoch": 0.019147230233476538, "grad_norm": 0.8236458977275282, "learning_rate": 3.10126582278481e-06, "loss": 0.3645, "step": 50 }, { "epoch": 0.022976676280171845, "grad_norm": 0.939010895376297, "learning_rate": 3.7341772151898737e-06, "loss": 0.3472, "step": 60 }, { "epoch": 0.026806122326867153, "grad_norm": 0.7558771041067948, "learning_rate": 4.367088607594937e-06, "loss": 0.3234, "step": 70 }, { "epoch": 0.03063556837356246, "grad_norm": 0.8507599742006083, "learning_rate": 5e-06, "loss": 0.3273, "step": 80 }, { "epoch": 0.03446501442025777, "grad_norm": 0.9322867483149297, "learning_rate": 4.999807568225742e-06, "loss": 0.3269, "step": 90 }, { "epoch": 0.038294460466953076, "grad_norm": 0.8060180679646447, "learning_rate": 4.999230302526956e-06, "loss": 0.338, "step": 100 }, { "epoch": 0.04212390651364838, "grad_norm": 0.8178741447804784, "learning_rate": 4.998268291771053e-06, "loss": 0.3232, "step": 110 }, { "epoch": 0.04595335256034369, "grad_norm": 0.8264908445738317, "learning_rate": 4.9969216840551815e-06, "loss": 0.3239, "step": 120 }, { "epoch": 0.049782798607039, "grad_norm": 0.8185737508740193, "learning_rate": 4.995190686683432e-06, "loss": 0.3164, "step": 130 }, { "epoch": 0.053612244653734306, "grad_norm": 0.8016123974519153, "learning_rate": 4.9930755661349215e-06, "loss": 0.3227, "step": 140 }, { "epoch": 0.057441690700429614, "grad_norm": 0.8304543418674754, "learning_rate": 4.990576648022768e-06, "loss": 0.3136, "step": 150 }, { "epoch": 0.06127113674712492, "grad_norm": 0.7555621229293721, "learning_rate": 4.98769431704397e-06, "loss": 0.3033, "step": 160 }, { "epoch": 0.06510058279382024, "grad_norm": 0.8637646507692361, "learning_rate": 4.984429016920178e-06, "loss": 0.3231, "step": 170 }, { "epoch": 0.06893002884051554, "grad_norm": 0.7476252312804286, "learning_rate": 4.980781250329389e-06, "loss": 0.309, "step": 180 }, { "epoch": 0.07275947488721085, "grad_norm": 0.8589580099130042, "learning_rate": 4.976751578828562e-06, "loss": 0.3122, "step": 190 }, { "epoch": 0.07658892093390615, "grad_norm": 0.8355945240964501, "learning_rate": 4.9723406227671645e-06, "loss": 0.3109, "step": 200 }, { "epoch": 0.08041836698060147, "grad_norm": 0.7902874463038752, "learning_rate": 4.967549061191679e-06, "loss": 0.3118, "step": 210 }, { "epoch": 0.08424781302729677, "grad_norm": 0.8876153745394134, "learning_rate": 4.962377631741061e-06, "loss": 0.306, "step": 220 }, { "epoch": 0.08807725907399208, "grad_norm": 0.866679126237035, "learning_rate": 4.956827130533185e-06, "loss": 0.3135, "step": 230 }, { "epoch": 0.09190670512068738, "grad_norm": 0.8629644307536233, "learning_rate": 4.95089841204229e-06, "loss": 0.302, "step": 240 }, { "epoch": 0.0957361511673827, "grad_norm": 0.7338565663547403, "learning_rate": 4.9445923889674285e-06, "loss": 0.303, "step": 250 }, { "epoch": 0.099565597214078, "grad_norm": 0.7305791719581167, "learning_rate": 4.937910032091968e-06, "loss": 0.3009, "step": 260 }, { "epoch": 0.10339504326077331, "grad_norm": 0.6695669065750437, "learning_rate": 4.9308523701341415e-06, "loss": 0.305, "step": 270 }, { "epoch": 0.10722448930746861, "grad_norm": 0.7251397159069465, "learning_rate": 4.923420489588677e-06, "loss": 0.3005, "step": 280 }, { "epoch": 0.11105393535416393, "grad_norm": 0.7899848638296046, "learning_rate": 4.915615534559545e-06, "loss": 0.3036, "step": 290 }, { "epoch": 0.11488338140085923, "grad_norm": 0.8159662777762657, "learning_rate": 4.907438706583818e-06, "loss": 0.2997, "step": 300 }, { "epoch": 0.11871282744755454, "grad_norm": 0.7496451104541622, "learning_rate": 4.898891264446709e-06, "loss": 0.2984, "step": 310 }, { "epoch": 0.12254227349424984, "grad_norm": 0.7090938182919049, "learning_rate": 4.889974523987784e-06, "loss": 0.3037, "step": 320 }, { "epoch": 0.12637171954094514, "grad_norm": 0.7534479964128541, "learning_rate": 4.880689857898392e-06, "loss": 0.2907, "step": 330 }, { "epoch": 0.13020116558764047, "grad_norm": 0.7967215720335111, "learning_rate": 4.871038695510347e-06, "loss": 0.3035, "step": 340 }, { "epoch": 0.13403061163433577, "grad_norm": 0.8589787634591548, "learning_rate": 4.861022522575892e-06, "loss": 0.2917, "step": 350 }, { "epoch": 0.13786005768103107, "grad_norm": 0.77962150007173, "learning_rate": 4.850642881038969e-06, "loss": 0.3019, "step": 360 }, { "epoch": 0.14168950372772637, "grad_norm": 0.6913568705438217, "learning_rate": 4.839901368797849e-06, "loss": 0.2987, "step": 370 }, { "epoch": 0.1455189497744217, "grad_norm": 0.7245824514430528, "learning_rate": 4.828799639459139e-06, "loss": 0.2996, "step": 380 }, { "epoch": 0.149348395821117, "grad_norm": 0.7331355223021702, "learning_rate": 4.817339402083217e-06, "loss": 0.2958, "step": 390 }, { "epoch": 0.1531778418678123, "grad_norm": 0.684066179750134, "learning_rate": 4.805522420921132e-06, "loss": 0.2923, "step": 400 }, { "epoch": 0.1570072879145076, "grad_norm": 0.7273451737034062, "learning_rate": 4.793350515143007e-06, "loss": 0.2955, "step": 410 }, { "epoch": 0.16083673396120293, "grad_norm": 0.6922184014896344, "learning_rate": 4.780825558557981e-06, "loss": 0.3021, "step": 420 }, { "epoch": 0.16466618000789823, "grad_norm": 0.7901617539245519, "learning_rate": 4.767949479325749e-06, "loss": 0.3004, "step": 430 }, { "epoch": 0.16849562605459353, "grad_norm": 0.7269661753924206, "learning_rate": 4.754724259659727e-06, "loss": 0.2966, "step": 440 }, { "epoch": 0.17232507210128883, "grad_norm": 0.7893201123959417, "learning_rate": 4.741151935521906e-06, "loss": 0.2985, "step": 450 }, { "epoch": 0.17615451814798416, "grad_norm": 0.6946007030633581, "learning_rate": 4.727234596309417e-06, "loss": 0.3036, "step": 460 }, { "epoch": 0.17998396419467946, "grad_norm": 0.7322226198234377, "learning_rate": 4.71297438453288e-06, "loss": 0.3001, "step": 470 }, { "epoch": 0.18381341024137476, "grad_norm": 0.6823377208042828, "learning_rate": 4.69837349548658e-06, "loss": 0.2925, "step": 480 }, { "epoch": 0.1876428562880701, "grad_norm": 0.7135771256714908, "learning_rate": 4.683434176910503e-06, "loss": 0.2939, "step": 490 }, { "epoch": 0.1914723023347654, "grad_norm": 0.691020843307318, "learning_rate": 4.668158728644315e-06, "loss": 0.2804, "step": 500 }, { "epoch": 0.1953017483814607, "grad_norm": 0.6612550303473294, "learning_rate": 4.652549502273305e-06, "loss": 0.2922, "step": 510 }, { "epoch": 0.199131194428156, "grad_norm": 0.7364276724142655, "learning_rate": 4.636608900766372e-06, "loss": 0.2891, "step": 520 }, { "epoch": 0.20296064047485132, "grad_norm": 0.6721354413833053, "learning_rate": 4.620339378106103e-06, "loss": 0.2809, "step": 530 }, { "epoch": 0.20679008652154662, "grad_norm": 0.6653652380595567, "learning_rate": 4.6037434389109855e-06, "loss": 0.2983, "step": 540 }, { "epoch": 0.21061953256824192, "grad_norm": 0.7449501156668481, "learning_rate": 4.586823638049841e-06, "loss": 0.2903, "step": 550 }, { "epoch": 0.21444897861493722, "grad_norm": 0.715508013931221, "learning_rate": 4.569582580248509e-06, "loss": 0.2923, "step": 560 }, { "epoch": 0.21827842466163255, "grad_norm": 0.6641963861933619, "learning_rate": 4.552022919688861e-06, "loss": 0.2924, "step": 570 }, { "epoch": 0.22210787070832785, "grad_norm": 0.7113518461330477, "learning_rate": 4.534147359600211e-06, "loss": 0.2819, "step": 580 }, { "epoch": 0.22593731675502315, "grad_norm": 0.7089839605701392, "learning_rate": 4.515958651843151e-06, "loss": 0.2939, "step": 590 }, { "epoch": 0.22976676280171845, "grad_norm": 0.677078858734863, "learning_rate": 4.497459596485924e-06, "loss": 0.2835, "step": 600 }, { "epoch": 0.23359620884841378, "grad_norm": 0.7703623199956556, "learning_rate": 4.478653041373371e-06, "loss": 0.2854, "step": 610 }, { "epoch": 0.23742565489510908, "grad_norm": 0.6407221872558565, "learning_rate": 4.459541881688501e-06, "loss": 0.2872, "step": 620 }, { "epoch": 0.24125510094180438, "grad_norm": 0.7625245912179776, "learning_rate": 4.440129059506808e-06, "loss": 0.2852, "step": 630 }, { "epoch": 0.24508454698849969, "grad_norm": 0.6533872161632752, "learning_rate": 4.420417563343347e-06, "loss": 0.2883, "step": 640 }, { "epoch": 0.248913993035195, "grad_norm": 0.6347508565680315, "learning_rate": 4.40041042769266e-06, "loss": 0.2818, "step": 650 }, { "epoch": 0.2527434390818903, "grad_norm": 0.6956608959261769, "learning_rate": 4.380110732561636e-06, "loss": 0.2858, "step": 660 }, { "epoch": 0.2565728851285856, "grad_norm": 0.7174236888577228, "learning_rate": 4.3595216029953575e-06, "loss": 0.2948, "step": 670 }, { "epoch": 0.26040233117528094, "grad_norm": 0.6538209955447881, "learning_rate": 4.338646208596009e-06, "loss": 0.2901, "step": 680 }, { "epoch": 0.2642317772219762, "grad_norm": 0.6777945072272051, "learning_rate": 4.317487763034936e-06, "loss": 0.2848, "step": 690 }, { "epoch": 0.26806122326867154, "grad_norm": 0.6915180680337352, "learning_rate": 4.296049523557917e-06, "loss": 0.294, "step": 700 }, { "epoch": 0.2718906693153669, "grad_norm": 0.6811198761407046, "learning_rate": 4.274334790483718e-06, "loss": 0.2925, "step": 710 }, { "epoch": 0.27572011536206215, "grad_norm": 0.6682149146681646, "learning_rate": 4.2523469066960295e-06, "loss": 0.2832, "step": 720 }, { "epoch": 0.2795495614087575, "grad_norm": 0.6343382096231662, "learning_rate": 4.230089257128842e-06, "loss": 0.2865, "step": 730 }, { "epoch": 0.28337900745545275, "grad_norm": 0.7142478024296977, "learning_rate": 4.207565268245356e-06, "loss": 0.2852, "step": 740 }, { "epoch": 0.2872084535021481, "grad_norm": 0.6678411720839094, "learning_rate": 4.184778407510484e-06, "loss": 0.2924, "step": 750 }, { "epoch": 0.2910378995488434, "grad_norm": 0.7689563352484293, "learning_rate": 4.16173218285706e-06, "loss": 0.2901, "step": 760 }, { "epoch": 0.2948673455955387, "grad_norm": 0.749864633773232, "learning_rate": 4.138430142145805e-06, "loss": 0.2839, "step": 770 }, { "epoch": 0.298696791642234, "grad_norm": 0.6699595192464245, "learning_rate": 4.114875872619147e-06, "loss": 0.2951, "step": 780 }, { "epoch": 0.30252623768892933, "grad_norm": 0.691214596956005, "learning_rate": 4.091073000348989e-06, "loss": 0.2874, "step": 790 }, { "epoch": 0.3063556837356246, "grad_norm": 0.6554956219244137, "learning_rate": 4.067025189678485e-06, "loss": 0.286, "step": 800 }, { "epoch": 0.31018512978231993, "grad_norm": 0.6954151666813602, "learning_rate": 4.042736142657936e-06, "loss": 0.2834, "step": 810 }, { "epoch": 0.3140145758290152, "grad_norm": 0.7196715435903528, "learning_rate": 4.018209598474869e-06, "loss": 0.284, "step": 820 }, { "epoch": 0.31784402187571054, "grad_norm": 0.7723328668264622, "learning_rate": 3.9934493328784185e-06, "loss": 0.2777, "step": 830 }, { "epoch": 0.32167346792240586, "grad_norm": 0.6919247319317805, "learning_rate": 3.9684591575980546e-06, "loss": 0.2893, "step": 840 }, { "epoch": 0.32550291396910114, "grad_norm": 0.612288507127871, "learning_rate": 3.943242919756792e-06, "loss": 0.2891, "step": 850 }, { "epoch": 0.32933236001579647, "grad_norm": 0.7304106009933916, "learning_rate": 3.917804501278942e-06, "loss": 0.2838, "step": 860 }, { "epoch": 0.3331618060624918, "grad_norm": 0.6961425637816675, "learning_rate": 3.892147818292505e-06, "loss": 0.2818, "step": 870 }, { "epoch": 0.33699125210918707, "grad_norm": 0.6415090586497654, "learning_rate": 3.866276820526305e-06, "loss": 0.2826, "step": 880 }, { "epoch": 0.3408206981558824, "grad_norm": 0.7319097656364029, "learning_rate": 3.840195490701943e-06, "loss": 0.2797, "step": 890 }, { "epoch": 0.34465014420257767, "grad_norm": 0.6643906637801983, "learning_rate": 3.8139078439206755e-06, "loss": 0.2823, "step": 900 }, { "epoch": 0.348479590249273, "grad_norm": 0.651442028320178, "learning_rate": 3.787417927045315e-06, "loss": 0.2845, "step": 910 }, { "epoch": 0.3523090362959683, "grad_norm": 0.703399820267242, "learning_rate": 3.760729818077224e-06, "loss": 0.2782, "step": 920 }, { "epoch": 0.3561384823426636, "grad_norm": 0.6339374061657803, "learning_rate": 3.7338476255285295e-06, "loss": 0.2809, "step": 930 }, { "epoch": 0.3599679283893589, "grad_norm": 0.6650804278367294, "learning_rate": 3.7067754877896388e-06, "loss": 0.288, "step": 940 }, { "epoch": 0.36379737443605425, "grad_norm": 0.6645625939049019, "learning_rate": 3.6795175724921506e-06, "loss": 0.2821, "step": 950 }, { "epoch": 0.36762682048274953, "grad_norm": 0.6819651400048093, "learning_rate": 3.652078075867267e-06, "loss": 0.2759, "step": 960 }, { "epoch": 0.37145626652944486, "grad_norm": 0.6767426872168217, "learning_rate": 3.624461222099804e-06, "loss": 0.28, "step": 970 }, { "epoch": 0.3752857125761402, "grad_norm": 0.7310278431962209, "learning_rate": 3.596671262677898e-06, "loss": 0.2883, "step": 980 }, { "epoch": 0.37911515862283546, "grad_norm": 0.6791506792289325, "learning_rate": 3.5687124757385084e-06, "loss": 0.2885, "step": 990 }, { "epoch": 0.3829446046695308, "grad_norm": 0.7086896739862765, "learning_rate": 3.5405891654088154e-06, "loss": 0.2815, "step": 1000 }, { "epoch": 0.38677405071622606, "grad_norm": 0.6280354477468107, "learning_rate": 3.5123056611436224e-06, "loss": 0.2807, "step": 1010 }, { "epoch": 0.3906034967629214, "grad_norm": 0.6538192887744636, "learning_rate": 3.4838663170588573e-06, "loss": 0.2723, "step": 1020 }, { "epoch": 0.3944329428096167, "grad_norm": 0.6999366929435851, "learning_rate": 3.455275511261272e-06, "loss": 0.2804, "step": 1030 }, { "epoch": 0.398262388856312, "grad_norm": 0.6406720215436563, "learning_rate": 3.4265376451744564e-06, "loss": 0.2776, "step": 1040 }, { "epoch": 0.4020918349030073, "grad_norm": 0.6809060571142534, "learning_rate": 3.3976571428612583e-06, "loss": 0.2823, "step": 1050 }, { "epoch": 0.40592128094970265, "grad_norm": 0.6506142391402524, "learning_rate": 3.3686384503427177e-06, "loss": 0.2785, "step": 1060 }, { "epoch": 0.4097507269963979, "grad_norm": 0.6623408933951855, "learning_rate": 3.339486034913627e-06, "loss": 0.2781, "step": 1070 }, { "epoch": 0.41358017304309325, "grad_norm": 0.675859964827601, "learning_rate": 3.310204384454805e-06, "loss": 0.2776, "step": 1080 }, { "epoch": 0.4174096190897885, "grad_norm": 0.7354171342985883, "learning_rate": 3.280798006742213e-06, "loss": 0.2929, "step": 1090 }, { "epoch": 0.42123906513648385, "grad_norm": 0.6457062459707484, "learning_rate": 3.2512714287530007e-06, "loss": 0.2743, "step": 1100 }, { "epoch": 0.4250685111831792, "grad_norm": 0.641329871481826, "learning_rate": 3.2216291959686007e-06, "loss": 0.2737, "step": 1110 }, { "epoch": 0.42889795722987445, "grad_norm": 0.6566421664911357, "learning_rate": 3.191875871674971e-06, "loss": 0.2838, "step": 1120 }, { "epoch": 0.4327274032765698, "grad_norm": 0.6027701960697498, "learning_rate": 3.162016036260098e-06, "loss": 0.2752, "step": 1130 }, { "epoch": 0.4365568493232651, "grad_norm": 0.648907215118306, "learning_rate": 3.1320542865088695e-06, "loss": 0.2667, "step": 1140 }, { "epoch": 0.4403862953699604, "grad_norm": 0.6077801768682932, "learning_rate": 3.1019952348954163e-06, "loss": 0.2747, "step": 1150 }, { "epoch": 0.4442157414166557, "grad_norm": 0.6576708995713357, "learning_rate": 3.071843508873046e-06, "loss": 0.2836, "step": 1160 }, { "epoch": 0.448045187463351, "grad_norm": 0.695646974082748, "learning_rate": 3.0416037501618676e-06, "loss": 0.2732, "step": 1170 }, { "epoch": 0.4518746335100463, "grad_norm": 0.616879255204133, "learning_rate": 3.0112806140342176e-06, "loss": 0.2759, "step": 1180 }, { "epoch": 0.45570407955674164, "grad_norm": 0.6008817125153927, "learning_rate": 2.9808787685980054e-06, "loss": 0.2769, "step": 1190 }, { "epoch": 0.4595335256034369, "grad_norm": 0.5972779982979275, "learning_rate": 2.9504028940780777e-06, "loss": 0.2836, "step": 1200 }, { "epoch": 0.46336297165013224, "grad_norm": 0.6541064777899037, "learning_rate": 2.9198576820957188e-06, "loss": 0.2678, "step": 1210 }, { "epoch": 0.46719241769682757, "grad_norm": 0.7461980100750918, "learning_rate": 2.8892478349463987e-06, "loss": 0.279, "step": 1220 }, { "epoch": 0.47102186374352284, "grad_norm": 0.697749095404112, "learning_rate": 2.8585780648758745e-06, "loss": 0.2774, "step": 1230 }, { "epoch": 0.47485130979021817, "grad_norm": 0.6901917627087478, "learning_rate": 2.827853093354763e-06, "loss": 0.2731, "step": 1240 }, { "epoch": 0.47868075583691344, "grad_norm": 0.6167921413072504, "learning_rate": 2.79707765035169e-06, "loss": 0.2781, "step": 1250 }, { "epoch": 0.48251020188360877, "grad_norm": 0.6877679962661163, "learning_rate": 2.7662564736051378e-06, "loss": 0.2779, "step": 1260 }, { "epoch": 0.4863396479303041, "grad_norm": 0.6529732798686552, "learning_rate": 2.7353943078940876e-06, "loss": 0.2755, "step": 1270 }, { "epoch": 0.49016909397699937, "grad_norm": 0.573925656257561, "learning_rate": 2.7044959043075815e-06, "loss": 0.2781, "step": 1280 }, { "epoch": 0.4939985400236947, "grad_norm": 0.7060010201146872, "learning_rate": 2.67356601951332e-06, "loss": 0.2885, "step": 1290 }, { "epoch": 0.49782798607039, "grad_norm": 0.6706570115076154, "learning_rate": 2.64260941502539e-06, "loss": 0.2823, "step": 1300 }, { "epoch": 0.5016574321170854, "grad_norm": 0.6816750561081921, "learning_rate": 2.611630856471252e-06, "loss": 0.2734, "step": 1310 }, { "epoch": 0.5054868781637806, "grad_norm": 0.6503962781113031, "learning_rate": 2.5806351128580963e-06, "loss": 0.2775, "step": 1320 }, { "epoch": 0.5093163242104759, "grad_norm": 0.6770280496930409, "learning_rate": 2.549626955838673e-06, "loss": 0.2805, "step": 1330 }, { "epoch": 0.5131457702571712, "grad_norm": 0.6025144898257462, "learning_rate": 2.5186111589767187e-06, "loss": 0.2715, "step": 1340 }, { "epoch": 0.5169752163038666, "grad_norm": 0.6707574673231309, "learning_rate": 2.487592497012089e-06, "loss": 0.2763, "step": 1350 }, { "epoch": 0.5208046623505619, "grad_norm": 0.6063466305966893, "learning_rate": 2.456575745125713e-06, "loss": 0.2845, "step": 1360 }, { "epoch": 0.5246341083972571, "grad_norm": 0.6060316435488056, "learning_rate": 2.4255656782044644e-06, "loss": 0.2772, "step": 1370 }, { "epoch": 0.5284635544439524, "grad_norm": 0.6023582378786108, "learning_rate": 2.3945670701061033e-06, "loss": 0.267, "step": 1380 }, { "epoch": 0.5322930004906478, "grad_norm": 0.672852399998615, "learning_rate": 2.3635846929243536e-06, "loss": 0.2757, "step": 1390 }, { "epoch": 0.5361224465373431, "grad_norm": 0.671673917828738, "learning_rate": 2.3326233162542655e-06, "loss": 0.2772, "step": 1400 }, { "epoch": 0.5399518925840384, "grad_norm": 0.6908191182674716, "learning_rate": 2.3016877064579564e-06, "loss": 0.2752, "step": 1410 }, { "epoch": 0.5437813386307337, "grad_norm": 0.6483112345732687, "learning_rate": 2.2707826259308493e-06, "loss": 0.2773, "step": 1420 }, { "epoch": 0.547610784677429, "grad_norm": 0.6527686650275873, "learning_rate": 2.2399128323685287e-06, "loss": 0.2711, "step": 1430 }, { "epoch": 0.5514402307241243, "grad_norm": 0.6402708780992856, "learning_rate": 2.2090830780343116e-06, "loss": 0.2774, "step": 1440 }, { "epoch": 0.5552696767708196, "grad_norm": 0.7121133583105477, "learning_rate": 2.178298109027659e-06, "loss": 0.2789, "step": 1450 }, { "epoch": 0.559099122817515, "grad_norm": 0.6936411537357103, "learning_rate": 2.147562664553537e-06, "loss": 0.2744, "step": 1460 }, { "epoch": 0.5629285688642103, "grad_norm": 0.6126488504445933, "learning_rate": 2.116881476192834e-06, "loss": 0.2698, "step": 1470 }, { "epoch": 0.5667580149109055, "grad_norm": 0.6045796996159061, "learning_rate": 2.086259267173961e-06, "loss": 0.2756, "step": 1480 }, { "epoch": 0.5705874609576008, "grad_norm": 0.6331045988234285, "learning_rate": 2.0557007516457287e-06, "loss": 0.2813, "step": 1490 }, { "epoch": 0.5744169070042962, "grad_norm": 0.6250063298087053, "learning_rate": 2.025210633951627e-06, "loss": 0.2659, "step": 1500 }, { "epoch": 0.5782463530509915, "grad_norm": 0.6244702166810576, "learning_rate": 1.9947936079056118e-06, "loss": 0.2691, "step": 1510 }, { "epoch": 0.5820757990976868, "grad_norm": 0.6645304037867747, "learning_rate": 1.964454356069514e-06, "loss": 0.2653, "step": 1520 }, { "epoch": 0.5859052451443821, "grad_norm": 0.6422648719414517, "learning_rate": 1.934197549032183e-06, "loss": 0.2753, "step": 1530 }, { "epoch": 0.5897346911910774, "grad_norm": 0.6796801504650317, "learning_rate": 1.904027844690468e-06, "loss": 0.2756, "step": 1540 }, { "epoch": 0.5935641372377727, "grad_norm": 0.612092392514174, "learning_rate": 1.8739498875321563e-06, "loss": 0.2781, "step": 1550 }, { "epoch": 0.597393583284468, "grad_norm": 0.6072776324810985, "learning_rate": 1.8439683079209789e-06, "loss": 0.2762, "step": 1560 }, { "epoch": 0.6012230293311633, "grad_norm": 0.6756861517531914, "learning_rate": 1.8140877213837823e-06, "loss": 0.2671, "step": 1570 }, { "epoch": 0.6050524753778587, "grad_norm": 0.6297494292950692, "learning_rate": 1.7843127278999944e-06, "loss": 0.2656, "step": 1580 }, { "epoch": 0.6088819214245539, "grad_norm": 0.6384980188983074, "learning_rate": 1.7546479111934733e-06, "loss": 0.2742, "step": 1590 }, { "epoch": 0.6127113674712492, "grad_norm": 0.6354445299662702, "learning_rate": 1.7250978380268696e-06, "loss": 0.2703, "step": 1600 }, { "epoch": 0.6165408135179445, "grad_norm": 0.6499360638842633, "learning_rate": 1.6956670574985909e-06, "loss": 0.2778, "step": 1610 }, { "epoch": 0.6203702595646399, "grad_norm": 0.6611350047172592, "learning_rate": 1.6663601003424884e-06, "loss": 0.2751, "step": 1620 }, { "epoch": 0.6241997056113352, "grad_norm": 0.6676095929381155, "learning_rate": 1.6371814782303723e-06, "loss": 0.2697, "step": 1630 }, { "epoch": 0.6280291516580304, "grad_norm": 0.6627616428191541, "learning_rate": 1.6081356830774625e-06, "loss": 0.2728, "step": 1640 }, { "epoch": 0.6318585977047257, "grad_norm": 0.6297343768461555, "learning_rate": 1.5792271863508751e-06, "loss": 0.2725, "step": 1650 }, { "epoch": 0.6356880437514211, "grad_norm": 0.6164109978910287, "learning_rate": 1.5504604383812646e-06, "loss": 0.2665, "step": 1660 }, { "epoch": 0.6395174897981164, "grad_norm": 0.6163778395985405, "learning_rate": 1.5218398676777103e-06, "loss": 0.2676, "step": 1670 }, { "epoch": 0.6433469358448117, "grad_norm": 0.7111914438547673, "learning_rate": 1.493369880245973e-06, "loss": 0.2682, "step": 1680 }, { "epoch": 0.6471763818915071, "grad_norm": 0.5951988021270335, "learning_rate": 1.4650548589102092e-06, "loss": 0.2725, "step": 1690 }, { "epoch": 0.6510058279382023, "grad_norm": 0.615884409391351, "learning_rate": 1.436899162638255e-06, "loss": 0.2693, "step": 1700 }, { "epoch": 0.6548352739848976, "grad_norm": 0.5615244588845161, "learning_rate": 1.4089071258705782e-06, "loss": 0.2717, "step": 1710 }, { "epoch": 0.6586647200315929, "grad_norm": 0.613522242640938, "learning_rate": 1.3810830578530226e-06, "loss": 0.2645, "step": 1720 }, { "epoch": 0.6624941660782883, "grad_norm": 0.6255188764708021, "learning_rate": 1.3534312419734066e-06, "loss": 0.2619, "step": 1730 }, { "epoch": 0.6663236121249836, "grad_norm": 0.6219089714484455, "learning_rate": 1.3259559351021249e-06, "loss": 0.2706, "step": 1740 }, { "epoch": 0.6701530581716788, "grad_norm": 0.7068243904113033, "learning_rate": 1.2986613669368159e-06, "loss": 0.2724, "step": 1750 }, { "epoch": 0.6739825042183741, "grad_norm": 0.6239904301630281, "learning_rate": 1.2715517393512239e-06, "loss": 0.2699, "step": 1760 }, { "epoch": 0.6778119502650695, "grad_norm": 0.636573560785032, "learning_rate": 1.2446312257483358e-06, "loss": 0.2606, "step": 1770 }, { "epoch": 0.6816413963117648, "grad_norm": 0.5822319215351535, "learning_rate": 1.2179039704179119e-06, "loss": 0.2671, "step": 1780 }, { "epoch": 0.6854708423584601, "grad_norm": 0.6414100583030806, "learning_rate": 1.1913740878984818e-06, "loss": 0.2728, "step": 1790 }, { "epoch": 0.6893002884051553, "grad_norm": 0.6019132802768392, "learning_rate": 1.1650456623439368e-06, "loss": 0.2684, "step": 1800 }, { "epoch": 0.6931297344518507, "grad_norm": 0.6936171991583808, "learning_rate": 1.1389227468947905e-06, "loss": 0.271, "step": 1810 }, { "epoch": 0.696959180498546, "grad_norm": 0.621045757650744, "learning_rate": 1.11300936305422e-06, "loss": 0.2657, "step": 1820 }, { "epoch": 0.7007886265452413, "grad_norm": 0.6554333537644303, "learning_rate": 1.0873095000689676e-06, "loss": 0.2666, "step": 1830 }, { "epoch": 0.7046180725919367, "grad_norm": 0.6286228416556564, "learning_rate": 1.0618271143152185e-06, "loss": 0.2714, "step": 1840 }, { "epoch": 0.708447518638632, "grad_norm": 0.6221920366356362, "learning_rate": 1.0365661286895364e-06, "loss": 0.2672, "step": 1850 }, { "epoch": 0.7122769646853272, "grad_norm": 0.6038564143991549, "learning_rate": 1.011530432004948e-06, "loss": 0.2639, "step": 1860 }, { "epoch": 0.7161064107320225, "grad_norm": 0.6754360798564671, "learning_rate": 9.86723878392279e-07, "loss": 0.2675, "step": 1870 }, { "epoch": 0.7199358567787179, "grad_norm": 0.5875424854805807, "learning_rate": 9.621502867068286e-07, "loss": 0.2592, "step": 1880 }, { "epoch": 0.7237653028254132, "grad_norm": 0.6032756771398841, "learning_rate": 9.378134399404768e-07, "loss": 0.2676, "step": 1890 }, { "epoch": 0.7275947488721085, "grad_norm": 0.597275111662435, "learning_rate": 9.137170846393054e-07, "loss": 0.268, "step": 1900 }, { "epoch": 0.7314241949188037, "grad_norm": 0.610745344226226, "learning_rate": 8.898649303268373e-07, "loss": 0.2752, "step": 1910 }, { "epoch": 0.7352536409654991, "grad_norm": 0.7020431210410311, "learning_rate": 8.662606489329712e-07, "loss": 0.2793, "step": 1920 }, { "epoch": 0.7390830870121944, "grad_norm": 0.6300673226887155, "learning_rate": 8.429078742287072e-07, "loss": 0.2673, "step": 1930 }, { "epoch": 0.7429125330588897, "grad_norm": 0.6440907203937188, "learning_rate": 8.198102012667409e-07, "loss": 0.2662, "step": 1940 }, { "epoch": 0.746741979105585, "grad_norm": 0.6258159414377766, "learning_rate": 7.969711858280251e-07, "loss": 0.2712, "step": 1950 }, { "epoch": 0.7505714251522804, "grad_norm": 0.5989574886855157, "learning_rate": 7.743943438743676e-07, "loss": 0.2634, "step": 1960 }, { "epoch": 0.7544008711989756, "grad_norm": 0.7834580590666472, "learning_rate": 7.520831510071744e-07, "loss": 0.2632, "step": 1970 }, { "epoch": 0.7582303172456709, "grad_norm": 0.6728771166977499, "learning_rate": 7.30041041932387e-07, "loss": 0.2756, "step": 1980 }, { "epoch": 0.7620597632923662, "grad_norm": 0.5993928197071544, "learning_rate": 7.082714099317334e-07, "loss": 0.2664, "step": 1990 }, { "epoch": 0.7658892093390616, "grad_norm": 0.5848853677834179, "learning_rate": 6.867776063403411e-07, "loss": 0.2628, "step": 2000 }, { "epoch": 0.7697186553857569, "grad_norm": 0.629330394414076, "learning_rate": 6.655629400308191e-07, "loss": 0.2658, "step": 2010 }, { "epoch": 0.7735481014324521, "grad_norm": 0.5924525230054234, "learning_rate": 6.44630676903869e-07, "loss": 0.2669, "step": 2020 }, { "epoch": 0.7773775474791474, "grad_norm": 0.6760063055600575, "learning_rate": 6.239840393855185e-07, "loss": 0.2692, "step": 2030 }, { "epoch": 0.7812069935258428, "grad_norm": 0.6602651610212074, "learning_rate": 6.036262059310383e-07, "loss": 0.2629, "step": 2040 }, { "epoch": 0.7850364395725381, "grad_norm": 0.6327130212823728, "learning_rate": 5.835603105356396e-07, "loss": 0.2678, "step": 2050 }, { "epoch": 0.7888658856192334, "grad_norm": 0.68617187633267, "learning_rate": 5.637894422520027e-07, "loss": 0.268, "step": 2060 }, { "epoch": 0.7926953316659286, "grad_norm": 0.6854582170736043, "learning_rate": 5.443166447147392e-07, "loss": 0.2652, "step": 2070 }, { "epoch": 0.796524777712624, "grad_norm": 0.6454661983770266, "learning_rate": 5.251449156718313e-07, "loss": 0.2616, "step": 2080 }, { "epoch": 0.8003542237593193, "grad_norm": 0.6204018065147047, "learning_rate": 5.062772065231492e-07, "loss": 0.2664, "step": 2090 }, { "epoch": 0.8041836698060146, "grad_norm": 0.6500266139560574, "learning_rate": 4.877164218660901e-07, "loss": 0.2656, "step": 2100 }, { "epoch": 0.80801311585271, "grad_norm": 0.6419744527723766, "learning_rate": 4.694654190484327e-07, "loss": 0.2612, "step": 2110 }, { "epoch": 0.8118425618994053, "grad_norm": 0.6238307107139875, "learning_rate": 4.5152700772845947e-07, "loss": 0.2676, "step": 2120 }, { "epoch": 0.8156720079461005, "grad_norm": 0.6514460031994411, "learning_rate": 4.339039494424263e-07, "loss": 0.2755, "step": 2130 }, { "epoch": 0.8195014539927958, "grad_norm": 0.5974777353730542, "learning_rate": 4.16598957179431e-07, "loss": 0.2597, "step": 2140 }, { "epoch": 0.8233309000394912, "grad_norm": 0.6009135110985171, "learning_rate": 3.9961469496376584e-07, "loss": 0.2592, "step": 2150 }, { "epoch": 0.8271603460861865, "grad_norm": 0.6454195389454723, "learning_rate": 3.829537774448e-07, "loss": 0.2714, "step": 2160 }, { "epoch": 0.8309897921328818, "grad_norm": 0.5913764986372555, "learning_rate": 3.6661876949447006e-07, "loss": 0.2637, "step": 2170 }, { "epoch": 0.834819238179577, "grad_norm": 0.6714610867965414, "learning_rate": 3.506121858124253e-07, "loss": 0.2652, "step": 2180 }, { "epoch": 0.8386486842262724, "grad_norm": 0.6260094170090917, "learning_rate": 3.3493649053890325e-07, "loss": 0.2642, "step": 2190 }, { "epoch": 0.8424781302729677, "grad_norm": 0.5797985857131752, "learning_rate": 3.1959409687538854e-07, "loss": 0.2632, "step": 2200 }, { "epoch": 0.846307576319663, "grad_norm": 0.6189077791942977, "learning_rate": 3.04587366713108e-07, "loss": 0.2648, "step": 2210 }, { "epoch": 0.8501370223663584, "grad_norm": 0.6122894183484023, "learning_rate": 2.8991861026943015e-07, "loss": 0.2741, "step": 2220 }, { "epoch": 0.8539664684130536, "grad_norm": 0.6661899719747609, "learning_rate": 2.755900857322172e-07, "loss": 0.2645, "step": 2230 }, { "epoch": 0.8577959144597489, "grad_norm": 0.5568358628059588, "learning_rate": 2.616039989121899e-07, "loss": 0.2546, "step": 2240 }, { "epoch": 0.8616253605064442, "grad_norm": 0.6975565699445695, "learning_rate": 2.479625029033489e-07, "loss": 0.2774, "step": 2250 }, { "epoch": 0.8654548065531396, "grad_norm": 0.6137153739809891, "learning_rate": 2.3466769775151887e-07, "loss": 0.266, "step": 2260 }, { "epoch": 0.8692842525998349, "grad_norm": 0.6065270965865103, "learning_rate": 2.21721630131054e-07, "loss": 0.2717, "step": 2270 }, { "epoch": 0.8731136986465302, "grad_norm": 0.6194228612620867, "learning_rate": 2.0912629302976494e-07, "loss": 0.2656, "step": 2280 }, { "epoch": 0.8769431446932254, "grad_norm": 0.6721041148274991, "learning_rate": 1.968836254421036e-07, "loss": 0.2653, "step": 2290 }, { "epoch": 0.8807725907399208, "grad_norm": 0.664610505857682, "learning_rate": 1.849955120706673e-07, "loss": 0.2677, "step": 2300 }, { "epoch": 0.8846020367866161, "grad_norm": 0.6321996875856193, "learning_rate": 1.734637830360536e-07, "loss": 0.2645, "step": 2310 }, { "epoch": 0.8884314828333114, "grad_norm": 0.6365093137379149, "learning_rate": 1.6229021359512626e-07, "loss": 0.2658, "step": 2320 }, { "epoch": 0.8922609288800067, "grad_norm": 0.6539770975459238, "learning_rate": 1.514765238677185e-07, "loss": 0.259, "step": 2330 }, { "epoch": 0.896090374926702, "grad_norm": 0.5990361778489132, "learning_rate": 1.4102437857183155e-07, "loss": 0.265, "step": 2340 }, { "epoch": 0.8999198209733973, "grad_norm": 0.6484099138497742, "learning_rate": 1.30935386767356e-07, "loss": 0.2667, "step": 2350 }, { "epoch": 0.9037492670200926, "grad_norm": 0.5867117908179056, "learning_rate": 1.2121110160836697e-07, "loss": 0.2634, "step": 2360 }, { "epoch": 0.907578713066788, "grad_norm": 0.5796597988618597, "learning_rate": 1.1185302010402105e-07, "loss": 0.2719, "step": 2370 }, { "epoch": 0.9114081591134833, "grad_norm": 0.6237999983224305, "learning_rate": 1.0286258288810108e-07, "loss": 0.2627, "step": 2380 }, { "epoch": 0.9152376051601786, "grad_norm": 0.603524669140342, "learning_rate": 9.424117399723432e-08, "loss": 0.262, "step": 2390 }, { "epoch": 0.9190670512068738, "grad_norm": 0.620014776939978, "learning_rate": 8.599012065782924e-08, "loss": 0.271, "step": 2400 }, { "epoch": 0.9228964972535691, "grad_norm": 0.6276616849770017, "learning_rate": 7.811069308175156e-08, "loss": 0.2692, "step": 2410 }, { "epoch": 0.9267259433002645, "grad_norm": 0.6160811115395115, "learning_rate": 7.060410427078473e-08, "loss": 0.2674, "step": 2420 }, { "epoch": 0.9305553893469598, "grad_norm": 0.5848887503471439, "learning_rate": 6.347150982989159e-08, "loss": 0.2625, "step": 2430 }, { "epoch": 0.9343848353936551, "grad_norm": 0.5883046146508241, "learning_rate": 5.6714007789314686e-08, "loss": 0.2621, "step": 2440 }, { "epoch": 0.9382142814403504, "grad_norm": 0.6940261485803617, "learning_rate": 5.033263843554015e-08, "loss": 0.2646, "step": 2450 }, { "epoch": 0.9420437274870457, "grad_norm": 0.6682532368250712, "learning_rate": 4.4328384151149094e-08, "loss": 0.2667, "step": 2460 }, { "epoch": 0.945873173533741, "grad_norm": 0.6267836020326658, "learning_rate": 3.870216926358555e-08, "loss": 0.2643, "step": 2470 }, { "epoch": 0.9497026195804363, "grad_norm": 0.5575161118815479, "learning_rate": 3.3454859902860295e-08, "loss": 0.2641, "step": 2480 }, { "epoch": 0.9535320656271317, "grad_norm": 0.5714423960482532, "learning_rate": 2.858726386821359e-08, "loss": 0.2707, "step": 2490 }, { "epoch": 0.9573615116738269, "grad_norm": 0.6111083062299648, "learning_rate": 2.410013050375859e-08, "loss": 0.2709, "step": 2500 }, { "epoch": 0.9611909577205222, "grad_norm": 0.642938384488523, "learning_rate": 1.999415058312276e-08, "loss": 0.271, "step": 2510 }, { "epoch": 0.9650204037672175, "grad_norm": 0.6112837503032897, "learning_rate": 1.6269956203107117e-08, "loss": 0.2512, "step": 2520 }, { "epoch": 0.9688498498139129, "grad_norm": 0.5889833725999716, "learning_rate": 1.2928120686377388e-08, "loss": 0.2661, "step": 2530 }, { "epoch": 0.9726792958606082, "grad_norm": 0.5773801445867635, "learning_rate": 9.969158493204067e-09, "loss": 0.2653, "step": 2540 }, { "epoch": 0.9765087419073035, "grad_norm": 0.6062759480472552, "learning_rate": 7.393525142262992e-09, "loss": 0.2691, "step": 2550 }, { "epoch": 0.9803381879539987, "grad_norm": 0.5770795227743729, "learning_rate": 5.201617140510318e-09, "loss": 0.2694, "step": 2560 }, { "epoch": 0.9841676340006941, "grad_norm": 0.6054174692665726, "learning_rate": 3.3937719221427413e-09, "loss": 0.2592, "step": 2570 }, { "epoch": 0.9879970800473894, "grad_norm": 0.5960623948954498, "learning_rate": 1.9702677966507157e-09, "loss": 0.2641, "step": 2580 }, { "epoch": 0.9918265260940847, "grad_norm": 0.6702560926553222, "learning_rate": 9.31323905974113e-10, "loss": 0.2621, "step": 2590 }, { "epoch": 0.99565597214078, "grad_norm": 0.623171089038422, "learning_rate": 2.7710019076532257e-10, "loss": 0.2672, "step": 2600 }, { "epoch": 0.9994854181874753, "grad_norm": 0.6269492106617217, "learning_rate": 7.697365768943865e-12, "loss": 0.2736, "step": 2610 } ], "logging_steps": 10, "max_steps": 2611, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 624569162006528.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }