{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011118832522585128, "grad_norm": 3.517343759536743, "learning_rate": 1.3333333333333334e-06, "loss": 0.8242700576782227, "step": 10 }, { "epoch": 0.022237665045170257, "grad_norm": 0.8755150437355042, "learning_rate": 2.814814814814815e-06, "loss": 0.7332224369049072, "step": 20 }, { "epoch": 0.03335649756775538, "grad_norm": 0.5096220970153809, "learning_rate": 4.296296296296296e-06, "loss": 0.6391815185546875, "step": 30 }, { "epoch": 0.04447533009034051, "grad_norm": 0.39251089096069336, "learning_rate": 5.777777777777778e-06, "loss": 0.5873191356658936, "step": 40 }, { "epoch": 0.05559416261292564, "grad_norm": 0.2596520781517029, "learning_rate": 7.2592592592592605e-06, "loss": 0.5601994514465332, "step": 50 }, { "epoch": 0.06671299513551077, "grad_norm": 0.21380962431430817, "learning_rate": 8.740740740740741e-06, "loss": 0.538310432434082, "step": 60 }, { "epoch": 0.0778318276580959, "grad_norm": 0.2109403759241104, "learning_rate": 1.0222222222222223e-05, "loss": 0.5150705337524414, "step": 70 }, { "epoch": 0.08895066018068103, "grad_norm": 0.2186734825372696, "learning_rate": 1.1703703703703703e-05, "loss": 0.5032827377319335, "step": 80 }, { "epoch": 0.10006949270326616, "grad_norm": 0.2625578045845032, "learning_rate": 1.3185185185185185e-05, "loss": 0.49479260444641116, "step": 90 }, { "epoch": 0.11118832522585129, "grad_norm": 0.2372923344373703, "learning_rate": 1.4666666666666666e-05, "loss": 0.49692888259887696, "step": 100 }, { "epoch": 0.12230715774843641, "grad_norm": 0.3200058043003082, "learning_rate": 1.614814814814815e-05, "loss": 0.47324380874633787, "step": 110 }, { "epoch": 0.13342599027102153, "grad_norm": 0.2912400960922241, "learning_rate": 1.7629629629629633e-05, "loss": 0.4717508316040039, "step": 120 }, { "epoch": 0.14454482279360667, "grad_norm": 0.35332974791526794, "learning_rate": 1.9111111111111113e-05, "loss": 0.4714301586151123, "step": 130 }, { "epoch": 0.1556636553161918, "grad_norm": 0.3428114950656891, "learning_rate": 2.0592592592592596e-05, "loss": 0.4707786083221436, "step": 140 }, { "epoch": 0.16678248783877692, "grad_norm": 0.23904399573802948, "learning_rate": 2.2074074074074073e-05, "loss": 0.4618192672729492, "step": 150 }, { "epoch": 0.17790132036136205, "grad_norm": 0.3916633725166321, "learning_rate": 2.355555555555556e-05, "loss": 0.45795350074768065, "step": 160 }, { "epoch": 0.1890201528839472, "grad_norm": 0.291522741317749, "learning_rate": 2.5037037037037036e-05, "loss": 0.4513235569000244, "step": 170 }, { "epoch": 0.20013898540653233, "grad_norm": 0.3826625645160675, "learning_rate": 2.651851851851852e-05, "loss": 0.4494623184204102, "step": 180 }, { "epoch": 0.21125781792911744, "grad_norm": 0.2857173979282379, "learning_rate": 2.8e-05, "loss": 0.4489255428314209, "step": 190 }, { "epoch": 0.22237665045170257, "grad_norm": 0.2880389392375946, "learning_rate": 2.9481481481481483e-05, "loss": 0.44628753662109377, "step": 200 }, { "epoch": 0.2334954829742877, "grad_norm": 0.2952287495136261, "learning_rate": 3.096296296296296e-05, "loss": 0.44461727142333984, "step": 210 }, { "epoch": 0.24461431549687282, "grad_norm": 0.30243533849716187, "learning_rate": 3.2444444444444446e-05, "loss": 0.4425827503204346, "step": 220 }, { "epoch": 0.25573314801945796, "grad_norm": 0.308292418718338, "learning_rate": 3.392592592592593e-05, "loss": 0.4386408805847168, "step": 230 }, { "epoch": 0.26685198054204307, "grad_norm": 0.3824830651283264, "learning_rate": 3.540740740740741e-05, "loss": 0.43536901473999023, "step": 240 }, { "epoch": 0.27797081306462823, "grad_norm": 0.32702672481536865, "learning_rate": 3.6888888888888896e-05, "loss": 0.43506660461425783, "step": 250 }, { "epoch": 0.28908964558721334, "grad_norm": 0.349981427192688, "learning_rate": 3.837037037037037e-05, "loss": 0.43353118896484377, "step": 260 }, { "epoch": 0.30020847810979845, "grad_norm": 0.26233726739883423, "learning_rate": 3.9851851851851856e-05, "loss": 0.4353696346282959, "step": 270 }, { "epoch": 0.3113273106323836, "grad_norm": 0.3655954599380493, "learning_rate": 3.999864616007525e-05, "loss": 0.4296071529388428, "step": 280 }, { "epoch": 0.3224461431549687, "grad_norm": 0.2861635386943817, "learning_rate": 3.999396645490857e-05, "loss": 0.4362457275390625, "step": 290 }, { "epoch": 0.33356497567755383, "grad_norm": 0.27363601326942444, "learning_rate": 3.998594495240786e-05, "loss": 0.4307443618774414, "step": 300 }, { "epoch": 0.344683808200139, "grad_norm": 0.2889789044857025, "learning_rate": 3.997458299328936e-05, "loss": 0.4261889934539795, "step": 310 }, { "epoch": 0.3558026407227241, "grad_norm": 0.2857920229434967, "learning_rate": 3.99598824765942e-05, "loss": 0.4217526435852051, "step": 320 }, { "epoch": 0.3669214732453092, "grad_norm": 0.24713562428951263, "learning_rate": 3.994184585937099e-05, "loss": 0.4260251998901367, "step": 330 }, { "epoch": 0.3780403057678944, "grad_norm": 0.321676641702652, "learning_rate": 3.992047615626516e-05, "loss": 0.4209277153015137, "step": 340 }, { "epoch": 0.3891591382904795, "grad_norm": 0.32741880416870117, "learning_rate": 3.989577693901505e-05, "loss": 0.4218775272369385, "step": 350 }, { "epoch": 0.40027797081306465, "grad_norm": 0.25548964738845825, "learning_rate": 3.986775233585499e-05, "loss": 0.42417049407958984, "step": 360 }, { "epoch": 0.41139680333564976, "grad_norm": 0.2476252168416977, "learning_rate": 3.983640703082523e-05, "loss": 0.41838369369506834, "step": 370 }, { "epoch": 0.4225156358582349, "grad_norm": 0.274328351020813, "learning_rate": 3.980174626298914e-05, "loss": 0.41819052696228026, "step": 380 }, { "epoch": 0.43363446838082004, "grad_norm": 0.3162706196308136, "learning_rate": 3.9763775825557476e-05, "loss": 0.4195350170135498, "step": 390 }, { "epoch": 0.44475330090340515, "grad_norm": 0.28077232837677, "learning_rate": 3.9722502064920105e-05, "loss": 0.4215540885925293, "step": 400 }, { "epoch": 0.45587213342599026, "grad_norm": 0.24924027919769287, "learning_rate": 3.967793187958533e-05, "loss": 0.41279850006103513, "step": 410 }, { "epoch": 0.4669909659485754, "grad_norm": 0.2332940250635147, "learning_rate": 3.9630072719026816e-05, "loss": 0.4138369560241699, "step": 420 }, { "epoch": 0.47810979847116053, "grad_norm": 0.25689420104026794, "learning_rate": 3.957893258243849e-05, "loss": 0.41005849838256836, "step": 430 }, { "epoch": 0.48922863099374564, "grad_norm": 0.2200179100036621, "learning_rate": 3.9524520017397574e-05, "loss": 0.4146875858306885, "step": 440 }, { "epoch": 0.5003474635163307, "grad_norm": 0.2616479694843292, "learning_rate": 3.946684411843591e-05, "loss": 0.4060019016265869, "step": 450 }, { "epoch": 0.5114662960389159, "grad_norm": 0.250016450881958, "learning_rate": 3.940591452551993e-05, "loss": 0.40746545791625977, "step": 460 }, { "epoch": 0.5225851285615011, "grad_norm": 0.24024935066699982, "learning_rate": 3.9341741422439416e-05, "loss": 0.4112556457519531, "step": 470 }, { "epoch": 0.5337039610840861, "grad_norm": 0.2836955189704895, "learning_rate": 3.9274335535105373e-05, "loss": 0.4094208240509033, "step": 480 }, { "epoch": 0.5448227936066713, "grad_norm": 0.24973008036613464, "learning_rate": 3.920370812975729e-05, "loss": 0.41716728210449217, "step": 490 }, { "epoch": 0.5559416261292565, "grad_norm": 0.25282275676727295, "learning_rate": 3.912987101108012e-05, "loss": 0.4076206684112549, "step": 500 }, { "epoch": 0.5670604586518415, "grad_norm": 0.2290191948413849, "learning_rate": 3.905283652023123e-05, "loss": 0.40010957717895507, "step": 510 }, { "epoch": 0.5781792911744267, "grad_norm": 0.2107991874217987, "learning_rate": 3.8972617532777686e-05, "loss": 0.4060469627380371, "step": 520 }, { "epoch": 0.5892981236970118, "grad_norm": 0.23016655445098877, "learning_rate": 3.8889227456544254e-05, "loss": 0.4046156883239746, "step": 530 }, { "epoch": 0.6004169562195969, "grad_norm": 0.20803174376487732, "learning_rate": 3.8802680229372374e-05, "loss": 0.40668258666992185, "step": 540 }, { "epoch": 0.6115357887421821, "grad_norm": 0.23622117936611176, "learning_rate": 3.8712990316790633e-05, "loss": 0.4025533676147461, "step": 550 }, { "epoch": 0.6226546212647672, "grad_norm": 0.20919625461101532, "learning_rate": 3.862017270959694e-05, "loss": 0.4043358325958252, "step": 560 }, { "epoch": 0.6337734537873523, "grad_norm": 0.22993041574954987, "learning_rate": 3.8524242921353e-05, "loss": 0.3996579170227051, "step": 570 }, { "epoch": 0.6448922863099374, "grad_norm": 0.2250276803970337, "learning_rate": 3.842521698579136e-05, "loss": 0.39752275943756105, "step": 580 }, { "epoch": 0.6560111188325226, "grad_norm": 0.22397875785827637, "learning_rate": 3.832311145413552e-05, "loss": 0.39977571964263914, "step": 590 }, { "epoch": 0.6671299513551077, "grad_norm": 0.22084222733974457, "learning_rate": 3.8217943392333555e-05, "loss": 0.3992427349090576, "step": 600 }, { "epoch": 0.6782487838776928, "grad_norm": 0.2924990653991699, "learning_rate": 3.810973037820572e-05, "loss": 0.4037761211395264, "step": 610 }, { "epoch": 0.689367616400278, "grad_norm": 0.2066657841205597, "learning_rate": 3.7998490498506494e-05, "loss": 0.3975439310073853, "step": 620 }, { "epoch": 0.700486448922863, "grad_norm": 0.19355815649032593, "learning_rate": 3.788424234590153e-05, "loss": 0.4000706195831299, "step": 630 }, { "epoch": 0.7116052814454482, "grad_norm": 0.1997012048959732, "learning_rate": 3.776700501586009e-05, "loss": 0.3945141792297363, "step": 640 }, { "epoch": 0.7227241139680334, "grad_norm": 0.2134384959936142, "learning_rate": 3.7646798103463395e-05, "loss": 0.39577999114990237, "step": 650 }, { "epoch": 0.7338429464906184, "grad_norm": 0.24351662397384644, "learning_rate": 3.752364170012956e-05, "loss": 0.39435544013977053, "step": 660 }, { "epoch": 0.7449617790132036, "grad_norm": 0.2532612383365631, "learning_rate": 3.739755639025543e-05, "loss": 0.3988224506378174, "step": 670 }, { "epoch": 0.7560806115357888, "grad_norm": 0.24978569149971008, "learning_rate": 3.726856324777616e-05, "loss": 0.3910386085510254, "step": 680 }, { "epoch": 0.7671994440583739, "grad_norm": 0.25254756212234497, "learning_rate": 3.713668383264288e-05, "loss": 0.38977618217468263, "step": 690 }, { "epoch": 0.778318276580959, "grad_norm": 0.22012752294540405, "learning_rate": 3.700194018721916e-05, "loss": 0.38769371509552003, "step": 700 }, { "epoch": 0.7894371091035441, "grad_norm": 0.22096213698387146, "learning_rate": 3.686435483259686e-05, "loss": 0.39794325828552246, "step": 710 }, { "epoch": 0.8005559416261293, "grad_norm": 0.2342117577791214, "learning_rate": 3.672395076483192e-05, "loss": 0.39336109161376953, "step": 720 }, { "epoch": 0.8116747741487144, "grad_norm": 0.2648347020149231, "learning_rate": 3.658075145110083e-05, "loss": 0.3947201490402222, "step": 730 }, { "epoch": 0.8227936066712995, "grad_norm": 0.23970480263233185, "learning_rate": 3.64347808257783e-05, "loss": 0.39340207576751707, "step": 740 }, { "epoch": 0.8339124391938847, "grad_norm": 0.23063965141773224, "learning_rate": 3.6286063286436826e-05, "loss": 0.3845417261123657, "step": 750 }, { "epoch": 0.8450312717164697, "grad_norm": 0.20158712565898895, "learning_rate": 3.613462368976894e-05, "loss": 0.3933848857879639, "step": 760 }, { "epoch": 0.8561501042390549, "grad_norm": 0.20051920413970947, "learning_rate": 3.598048734743262e-05, "loss": 0.3893456935882568, "step": 770 }, { "epoch": 0.8672689367616401, "grad_norm": 0.2043953388929367, "learning_rate": 3.58236800218207e-05, "loss": 0.38582921028137207, "step": 780 }, { "epoch": 0.8783877692842251, "grad_norm": 0.20056948065757751, "learning_rate": 3.566422792175489e-05, "loss": 0.38297524452209475, "step": 790 }, { "epoch": 0.8895066018068103, "grad_norm": 0.19881410896778107, "learning_rate": 3.550215769810532e-05, "loss": 0.3857751369476318, "step": 800 }, { "epoch": 0.9006254343293955, "grad_norm": 0.2066773623228073, "learning_rate": 3.5337496439336014e-05, "loss": 0.392465877532959, "step": 810 }, { "epoch": 0.9117442668519805, "grad_norm": 0.20534034073352814, "learning_rate": 3.5170271666977383e-05, "loss": 0.3881709098815918, "step": 820 }, { "epoch": 0.9228630993745657, "grad_norm": 0.1855618804693222, "learning_rate": 3.5000511331026224e-05, "loss": 0.3881243705749512, "step": 830 }, { "epoch": 0.9339819318971508, "grad_norm": 0.18303897976875305, "learning_rate": 3.4828243805274166e-05, "loss": 0.38112101554870603, "step": 840 }, { "epoch": 0.9451007644197359, "grad_norm": 0.1936378926038742, "learning_rate": 3.4653497882565276e-05, "loss": 0.38739733695983886, "step": 850 }, { "epoch": 0.9562195969423211, "grad_norm": 0.2301376312971115, "learning_rate": 3.44763027699836e-05, "loss": 0.3910486936569214, "step": 860 }, { "epoch": 0.9673384294649062, "grad_norm": 0.20131757855415344, "learning_rate": 3.429668808397147e-05, "loss": 0.3879512071609497, "step": 870 }, { "epoch": 0.9784572619874913, "grad_norm": 0.20710399746894836, "learning_rate": 3.4114683845379465e-05, "loss": 0.38755533695220945, "step": 880 }, { "epoch": 0.9895760945100764, "grad_norm": 0.2099325954914093, "learning_rate": 3.393032047444862e-05, "loss": 0.3850740432739258, "step": 890 }, { "epoch": 1.0, "grad_norm": 0.25888943672180176, "learning_rate": 3.3743628785726075e-05, "loss": 0.3849788665771484, "step": 900 }, { "epoch": 1.011118832522585, "grad_norm": 0.22799654304981232, "learning_rate": 3.355463998291465e-05, "loss": 0.34186859130859376, "step": 910 }, { "epoch": 1.0222376650451703, "grad_norm": 0.22239384055137634, "learning_rate": 3.3363385653657486e-05, "loss": 0.3563653230667114, "step": 920 }, { "epoch": 1.0333564975677554, "grad_norm": 0.2089320719242096, "learning_rate": 3.3169897764258476e-05, "loss": 0.34113943576812744, "step": 930 }, { "epoch": 1.0444753300903404, "grad_norm": 0.28375443816185, "learning_rate": 3.2974208654339405e-05, "loss": 0.33684582710266114, "step": 940 }, { "epoch": 1.0555941626129257, "grad_norm": 0.21052372455596924, "learning_rate": 3.277635103143467e-05, "loss": 0.33628795146942136, "step": 950 }, { "epoch": 1.0667129951355108, "grad_norm": 0.20992518961429596, "learning_rate": 3.2576357965524574e-05, "loss": 0.3357236862182617, "step": 960 }, { "epoch": 1.0778318276580958, "grad_norm": 0.22243370115756989, "learning_rate": 3.237426288350798e-05, "loss": 0.33824107646942136, "step": 970 }, { "epoch": 1.088950660180681, "grad_norm": 0.2306300699710846, "learning_rate": 3.217009956361531e-05, "loss": 0.34063472747802737, "step": 980 }, { "epoch": 1.1000694927032661, "grad_norm": 0.19953912496566772, "learning_rate": 3.196390212976291e-05, "loss": 0.34016103744506837, "step": 990 }, { "epoch": 1.1111883252258512, "grad_norm": 0.2101643830537796, "learning_rate": 3.1755705045849465e-05, "loss": 0.3383723974227905, "step": 1000 }, { "epoch": 1.1223071577484365, "grad_norm": 0.20528212189674377, "learning_rate": 3.154554310999578e-05, "loss": 0.33558709621429444, "step": 1010 }, { "epoch": 1.1334259902710215, "grad_norm": 0.18450650572776794, "learning_rate": 3.133345144872859e-05, "loss": 0.3336428165435791, "step": 1020 }, { "epoch": 1.1445448227936066, "grad_norm": 0.18666331470012665, "learning_rate": 3.111946551110947e-05, "loss": 0.33807053565979006, "step": 1030 }, { "epoch": 1.1556636553161919, "grad_norm": 0.20218265056610107, "learning_rate": 3.09036210628099e-05, "loss": 0.33912909030914307, "step": 1040 }, { "epoch": 1.166782487838777, "grad_norm": 0.18102531135082245, "learning_rate": 3.068595418013339e-05, "loss": 0.3401691436767578, "step": 1050 }, { "epoch": 1.177901320361362, "grad_norm": 0.1858808994293213, "learning_rate": 3.046650124398562e-05, "loss": 0.3396461963653564, "step": 1060 }, { "epoch": 1.1890201528839472, "grad_norm": 0.21451160311698914, "learning_rate": 3.0245298933793798e-05, "loss": 0.3356295108795166, "step": 1070 }, { "epoch": 1.2001389854065323, "grad_norm": 0.24296917021274567, "learning_rate": 3.0022384221376017e-05, "loss": 0.3362587928771973, "step": 1080 }, { "epoch": 1.2112578179291174, "grad_norm": 0.22454988956451416, "learning_rate": 2.9797794364761743e-05, "loss": 0.33704962730407717, "step": 1090 }, { "epoch": 1.2223766504517026, "grad_norm": 0.20934318006038666, "learning_rate": 2.9571566901964552e-05, "loss": 0.3362391471862793, "step": 1100 }, { "epoch": 1.2334954829742877, "grad_norm": 0.20203141868114471, "learning_rate": 2.9343739644707986e-05, "loss": 0.33880271911621096, "step": 1110 }, { "epoch": 1.2446143154968727, "grad_norm": 0.21586163341999054, "learning_rate": 2.911435067210569e-05, "loss": 0.3303499698638916, "step": 1120 }, { "epoch": 1.255733148019458, "grad_norm": 0.21403352916240692, "learning_rate": 2.8883438324296866e-05, "loss": 0.3369396686553955, "step": 1130 }, { "epoch": 1.266851980542043, "grad_norm": 0.2058866173028946, "learning_rate": 2.8651041196038098e-05, "loss": 0.33591766357421876, "step": 1140 }, { "epoch": 1.2779708130646283, "grad_norm": 0.1970527321100235, "learning_rate": 2.8417198130252584e-05, "loss": 0.337198281288147, "step": 1150 }, { "epoch": 1.2890896455872134, "grad_norm": 0.2015780508518219, "learning_rate": 2.8181948211537954e-05, "loss": 0.33505361080169677, "step": 1160 }, { "epoch": 1.3002084781097984, "grad_norm": 0.19171395897865295, "learning_rate": 2.7945330759633642e-05, "loss": 0.3337593078613281, "step": 1170 }, { "epoch": 1.3113273106323837, "grad_norm": 0.1808820515871048, "learning_rate": 2.770738532284897e-05, "loss": 0.3352059364318848, "step": 1180 }, { "epoch": 1.3224461431549688, "grad_norm": 0.19644634425640106, "learning_rate": 2.746815167145303e-05, "loss": 0.34058656692504885, "step": 1190 }, { "epoch": 1.3335649756775538, "grad_norm": 0.20617882907390594, "learning_rate": 2.7227669791027497e-05, "loss": 0.33803424835205076, "step": 1200 }, { "epoch": 1.344683808200139, "grad_norm": 0.20170675218105316, "learning_rate": 2.6985979875783388e-05, "loss": 0.3345954418182373, "step": 1210 }, { "epoch": 1.3558026407227242, "grad_norm": 0.20398379862308502, "learning_rate": 2.6743122321843014e-05, "loss": 0.3368945598602295, "step": 1220 }, { "epoch": 1.3669214732453092, "grad_norm": 0.1875438690185547, "learning_rate": 2.6499137720488163e-05, "loss": 0.3394474983215332, "step": 1230 }, { "epoch": 1.3780403057678945, "grad_norm": 0.19435080885887146, "learning_rate": 2.625406685137564e-05, "loss": 0.33933372497558595, "step": 1240 }, { "epoch": 1.3891591382904795, "grad_norm": 0.19872593879699707, "learning_rate": 2.6007950675721373e-05, "loss": 0.33671281337738035, "step": 1250 }, { "epoch": 1.4002779708130646, "grad_norm": 0.19588671624660492, "learning_rate": 2.5760830329454117e-05, "loss": 0.3355069637298584, "step": 1260 }, { "epoch": 1.4113968033356499, "grad_norm": 0.1908605545759201, "learning_rate": 2.5512747116339985e-05, "loss": 0.3366411209106445, "step": 1270 }, { "epoch": 1.422515635858235, "grad_norm": 0.1810505986213684, "learning_rate": 2.5263742501078957e-05, "loss": 0.34022998809814453, "step": 1280 }, { "epoch": 1.43363446838082, "grad_norm": 0.21036396920681, "learning_rate": 2.501385810237442e-05, "loss": 0.3365932941436768, "step": 1290 }, { "epoch": 1.4447533009034053, "grad_norm": 0.20304331183433533, "learning_rate": 2.476313568597702e-05, "loss": 0.3349552392959595, "step": 1300 }, { "epoch": 1.4558721334259903, "grad_norm": 0.17836874723434448, "learning_rate": 2.4511617157703915e-05, "loss": 0.32939877510070803, "step": 1310 }, { "epoch": 1.4669909659485754, "grad_norm": 0.192827507853508, "learning_rate": 2.4259344556434656e-05, "loss": 0.33145139217376707, "step": 1320 }, { "epoch": 1.4781097984711606, "grad_norm": 0.18946915864944458, "learning_rate": 2.400636004708475e-05, "loss": 0.3342320919036865, "step": 1330 }, { "epoch": 1.4892286309937457, "grad_norm": 0.18936337530612946, "learning_rate": 2.3752705913558228e-05, "loss": 0.3325347423553467, "step": 1340 }, { "epoch": 1.5003474635163307, "grad_norm": 0.1900292932987213, "learning_rate": 2.3498424551680318e-05, "loss": 0.3336307525634766, "step": 1350 }, { "epoch": 1.511466296038916, "grad_norm": 0.20230746269226074, "learning_rate": 2.3243558462111354e-05, "loss": 0.33458542823791504, "step": 1360 }, { "epoch": 1.522585128561501, "grad_norm": 0.20732760429382324, "learning_rate": 2.2988150243243235e-05, "loss": 0.3298256158828735, "step": 1370 }, { "epoch": 1.5337039610840861, "grad_norm": 0.19748058915138245, "learning_rate": 2.273224258407951e-05, "loss": 0.3372241973876953, "step": 1380 }, { "epoch": 1.5448227936066714, "grad_norm": 0.19819419085979462, "learning_rate": 2.2475878257100333e-05, "loss": 0.3346505641937256, "step": 1390 }, { "epoch": 1.5559416261292565, "grad_norm": 0.18096713721752167, "learning_rate": 2.2219100111113408e-05, "loss": 0.33000621795654295, "step": 1400 }, { "epoch": 1.5670604586518415, "grad_norm": 0.17553280293941498, "learning_rate": 2.196195106409232e-05, "loss": 0.32919626235961913, "step": 1410 }, { "epoch": 1.5781792911744268, "grad_norm": 0.18030349910259247, "learning_rate": 2.1704474096003135e-05, "loss": 0.3363958835601807, "step": 1420 }, { "epoch": 1.5892981236970118, "grad_norm": 0.17756561934947968, "learning_rate": 2.1446712241620734e-05, "loss": 0.33312478065490725, "step": 1430 }, { "epoch": 1.600416956219597, "grad_norm": 0.17333616316318512, "learning_rate": 2.118870858333599e-05, "loss": 0.327138090133667, "step": 1440 }, { "epoch": 1.6115357887421822, "grad_norm": 0.18877775967121124, "learning_rate": 2.093050624395494e-05, "loss": 0.334153938293457, "step": 1450 }, { "epoch": 1.6226546212647672, "grad_norm": 0.2033713012933731, "learning_rate": 2.0672148379491234e-05, "loss": 0.32985107898712157, "step": 1460 }, { "epoch": 1.6337734537873523, "grad_norm": 0.18154199421405792, "learning_rate": 2.0413678171953056e-05, "loss": 0.3321858882904053, "step": 1470 }, { "epoch": 1.6448922863099376, "grad_norm": 0.1664343774318695, "learning_rate": 2.0155138822125608e-05, "loss": 0.3292757511138916, "step": 1480 }, { "epoch": 1.6560111188325226, "grad_norm": 0.17724835872650146, "learning_rate": 1.9896573542350576e-05, "loss": 0.3216125011444092, "step": 1490 }, { "epoch": 1.6671299513551077, "grad_norm": 0.1763281524181366, "learning_rate": 1.9638025549303576e-05, "loss": 0.33061861991882324, "step": 1500 }, { "epoch": 1.678248783877693, "grad_norm": 0.1832234263420105, "learning_rate": 1.9379538056770927e-05, "loss": 0.33137152194976804, "step": 1510 }, { "epoch": 1.689367616400278, "grad_norm": 0.18359725177288055, "learning_rate": 1.912115426842686e-05, "loss": 0.33290562629699705, "step": 1520 }, { "epoch": 1.700486448922863, "grad_norm": 0.18772786855697632, "learning_rate": 1.8862917370612454e-05, "loss": 0.3290142059326172, "step": 1530 }, { "epoch": 1.7116052814454483, "grad_norm": 0.178235724568367, "learning_rate": 1.8604870525117496e-05, "loss": 0.3255646228790283, "step": 1540 }, { "epoch": 1.7227241139680334, "grad_norm": 0.17328216135501862, "learning_rate": 1.8347056861966333e-05, "loss": 0.3327143907546997, "step": 1550 }, { "epoch": 1.7338429464906184, "grad_norm": 0.17393898963928223, "learning_rate": 1.8089519472209168e-05, "loss": 0.3347191572189331, "step": 1560 }, { "epoch": 1.7449617790132037, "grad_norm": 0.1645958125591278, "learning_rate": 1.7832301400719793e-05, "loss": 0.326206111907959, "step": 1570 }, { "epoch": 1.7560806115357888, "grad_norm": 0.1829153597354889, "learning_rate": 1.7575445639001026e-05, "loss": 0.3275812387466431, "step": 1580 }, { "epoch": 1.7671994440583738, "grad_norm": 0.1847892552614212, "learning_rate": 1.7318995117999158e-05, "loss": 0.32840893268585203, "step": 1590 }, { "epoch": 1.778318276580959, "grad_norm": 0.1821216493844986, "learning_rate": 1.706299270092842e-05, "loss": 0.3295578956604004, "step": 1600 }, { "epoch": 1.7894371091035441, "grad_norm": 0.18880750238895416, "learning_rate": 1.6807481176106816e-05, "loss": 0.3292530536651611, "step": 1610 }, { "epoch": 1.8005559416261292, "grad_norm": 0.16780254244804382, "learning_rate": 1.655250324980447e-05, "loss": 0.32708158493041994, "step": 1620 }, { "epoch": 1.8116747741487145, "grad_norm": 0.1843137890100479, "learning_rate": 1.6298101539105712e-05, "loss": 0.3281073093414307, "step": 1630 }, { "epoch": 1.8227936066712995, "grad_norm": 0.1689286231994629, "learning_rate": 1.604431856478602e-05, "loss": 0.32400391101837156, "step": 1640 }, { "epoch": 1.8339124391938846, "grad_norm": 0.18957920372486115, "learning_rate": 1.5791196744205094e-05, "loss": 0.3259273052215576, "step": 1650 }, { "epoch": 1.8450312717164699, "grad_norm": 0.17294242978096008, "learning_rate": 1.5538778384217215e-05, "loss": 0.3238994598388672, "step": 1660 }, { "epoch": 1.856150104239055, "grad_norm": 0.17969508469104767, "learning_rate": 1.5287105674100053e-05, "loss": 0.32615640163421633, "step": 1670 }, { "epoch": 1.86726893676164, "grad_norm": 0.17494842410087585, "learning_rate": 1.5036220678503137e-05, "loss": 0.3246027946472168, "step": 1680 }, { "epoch": 1.8783877692842252, "grad_norm": 0.17964747548103333, "learning_rate": 1.4786165330417173e-05, "loss": 0.325272798538208, "step": 1690 }, { "epoch": 1.8895066018068103, "grad_norm": 0.1734607219696045, "learning_rate": 1.4536981424165334e-05, "loss": 0.32622013092041013, "step": 1700 }, { "epoch": 1.9006254343293953, "grad_norm": 0.19146278500556946, "learning_rate": 1.4288710608417754e-05, "loss": 0.32583372592926024, "step": 1710 }, { "epoch": 1.9117442668519806, "grad_norm": 0.1902935951948166, "learning_rate": 1.404139437923036e-05, "loss": 0.32550692558288574, "step": 1720 }, { "epoch": 1.9228630993745657, "grad_norm": 0.16681939363479614, "learning_rate": 1.3795074073109211e-05, "loss": 0.3274375438690186, "step": 1730 }, { "epoch": 1.9339819318971507, "grad_norm": 0.17112983763217926, "learning_rate": 1.3549790860101481e-05, "loss": 0.3290217399597168, "step": 1740 }, { "epoch": 1.945100764419736, "grad_norm": 0.16517910361289978, "learning_rate": 1.3305585736914318e-05, "loss": 0.3270266056060791, "step": 1750 }, { "epoch": 1.956219596942321, "grad_norm": 0.20247752964496613, "learning_rate": 1.3062499520062608e-05, "loss": 0.32512893676757815, "step": 1760 }, { "epoch": 1.9673384294649061, "grad_norm": 0.1725420504808426, "learning_rate": 1.2820572839046915e-05, "loss": 0.32194349765777586, "step": 1770 }, { "epoch": 1.9784572619874914, "grad_norm": 0.19890804588794708, "learning_rate": 1.2579846129562663e-05, "loss": 0.32468571662902834, "step": 1780 }, { "epoch": 1.9895760945100764, "grad_norm": 0.17223462462425232, "learning_rate": 1.2340359626741676e-05, "loss": 0.31974453926086427, "step": 1790 }, { "epoch": 2.0, "grad_norm": 0.23255887627601624, "learning_rate": 1.2102153358427264e-05, "loss": 0.32196643352508547, "step": 1800 }, { "epoch": 2.0111188325225853, "grad_norm": 0.2428450584411621, "learning_rate": 1.1865267138484e-05, "loss": 0.2720228672027588, "step": 1810 }, { "epoch": 2.02223766504517, "grad_norm": 0.19334714114665985, "learning_rate": 1.1629740560143162e-05, "loss": 0.2708899974822998, "step": 1820 }, { "epoch": 2.0333564975677554, "grad_norm": 0.18385189771652222, "learning_rate": 1.139561298938515e-05, "loss": 0.2705179214477539, "step": 1830 }, { "epoch": 2.0444753300903407, "grad_norm": 0.1807754784822464, "learning_rate": 1.1162923558359849e-05, "loss": 0.2721697807312012, "step": 1840 }, { "epoch": 2.0555941626129255, "grad_norm": 0.18753038346767426, "learning_rate": 1.0931711158846024e-05, "loss": 0.2725118398666382, "step": 1850 }, { "epoch": 2.0667129951355108, "grad_norm": 0.17677700519561768, "learning_rate": 1.0702014435750985e-05, "loss": 0.27192416191101076, "step": 1860 }, { "epoch": 2.077831827658096, "grad_norm": 0.1743370145559311, "learning_rate": 1.0473871780651435e-05, "loss": 0.27294752597808836, "step": 1870 }, { "epoch": 2.088950660180681, "grad_norm": 0.1780771017074585, "learning_rate": 1.0247321325376704e-05, "loss": 0.2742859601974487, "step": 1880 }, { "epoch": 2.100069492703266, "grad_norm": 0.17365196347236633, "learning_rate": 1.00224009356354e-05, "loss": 0.2734071254730225, "step": 1890 }, { "epoch": 2.1111883252258514, "grad_norm": 0.1772630214691162, "learning_rate": 9.799148204686495e-06, "loss": 0.27077765464782716, "step": 1900 }, { "epoch": 2.1223071577484363, "grad_norm": 0.17199768126010895, "learning_rate": 9.577600447055983e-06, "loss": 0.2729313373565674, "step": 1910 }, { "epoch": 2.1334259902710215, "grad_norm": 0.17337964475154877, "learning_rate": 9.357794692300134e-06, "loss": 0.27156963348388674, "step": 1920 }, { "epoch": 2.144544822793607, "grad_norm": 0.1897203028202057, "learning_rate": 9.13976767881634e-06, "loss": 0.27226369380950927, "step": 1930 }, { "epoch": 2.1556636553161916, "grad_norm": 0.17273284494876862, "learning_rate": 8.923555847702675e-06, "loss": 0.2755557060241699, "step": 1940 }, { "epoch": 2.166782487838777, "grad_norm": 0.1792151778936386, "learning_rate": 8.709195336667102e-06, "loss": 0.2707196235656738, "step": 1950 }, { "epoch": 2.177901320361362, "grad_norm": 0.1746506690979004, "learning_rate": 8.496721973987423e-06, "loss": 0.271243953704834, "step": 1960 }, { "epoch": 2.189020152883947, "grad_norm": 0.1751469522714615, "learning_rate": 8.286171272522904e-06, "loss": 0.2702665090560913, "step": 1970 }, { "epoch": 2.2001389854065323, "grad_norm": 0.17558525502681732, "learning_rate": 8.077578423778658e-06, "loss": 0.2705970764160156, "step": 1980 }, { "epoch": 2.2112578179291176, "grad_norm": 0.17805011570453644, "learning_rate": 7.870978292023739e-06, "loss": 0.275134539604187, "step": 1990 }, { "epoch": 2.2223766504517024, "grad_norm": 0.178203746676445, "learning_rate": 7.666405408463889e-06, "loss": 0.27126991748809814, "step": 2000 }, { "epoch": 2.2334954829742877, "grad_norm": 0.17006462812423706, "learning_rate": 7.4638939654700235e-06, "loss": 0.26952409744262695, "step": 2010 }, { "epoch": 2.244614315496873, "grad_norm": 0.1694515347480774, "learning_rate": 7.263477810863282e-06, "loss": 0.27427287101745607, "step": 2020 }, { "epoch": 2.255733148019458, "grad_norm": 0.19160398840904236, "learning_rate": 7.065190442257686e-06, "loss": 0.27138872146606446, "step": 2030 }, { "epoch": 2.266851980542043, "grad_norm": 0.16741624474525452, "learning_rate": 6.8690650014613505e-06, "loss": 0.2695302486419678, "step": 2040 }, { "epoch": 2.2779708130646283, "grad_norm": 0.16422736644744873, "learning_rate": 6.675134268937158e-06, "loss": 0.2692440032958984, "step": 2050 }, { "epoch": 2.289089645587213, "grad_norm": 0.1828807145357132, "learning_rate": 6.483430658323806e-06, "loss": 0.26996283531188964, "step": 2060 }, { "epoch": 2.3002084781097984, "grad_norm": 0.17340506613254547, "learning_rate": 6.293986211018208e-06, "loss": 0.2742361783981323, "step": 2070 }, { "epoch": 2.3113273106323837, "grad_norm": 0.1660027652978897, "learning_rate": 6.106832590820053e-06, "loss": 0.2687552452087402, "step": 2080 }, { "epoch": 2.3224461431549686, "grad_norm": 0.17267292737960815, "learning_rate": 5.922001078639541e-06, "loss": 0.273982572555542, "step": 2090 }, { "epoch": 2.333564975677554, "grad_norm": 0.16563960909843445, "learning_rate": 5.739522567269052e-06, "loss": 0.27311880588531495, "step": 2100 }, { "epoch": 2.344683808200139, "grad_norm": 0.16006183624267578, "learning_rate": 5.559427556219734e-06, "loss": 0.273662805557251, "step": 2110 }, { "epoch": 2.355802640722724, "grad_norm": 0.16579481959342957, "learning_rate": 5.381746146623805e-06, "loss": 0.269588041305542, "step": 2120 }, { "epoch": 2.366921473245309, "grad_norm": 0.16734926402568817, "learning_rate": 5.20650803620343e-06, "loss": 0.2678502321243286, "step": 2130 }, { "epoch": 2.3780403057678945, "grad_norm": 0.16894319653511047, "learning_rate": 5.033742514307061e-06, "loss": 0.273479700088501, "step": 2140 }, { "epoch": 2.3891591382904793, "grad_norm": 0.16338688135147095, "learning_rate": 4.863478457013977e-06, "loss": 0.27426838874816895, "step": 2150 }, { "epoch": 2.4002779708130646, "grad_norm": 0.16434775292873383, "learning_rate": 4.6957443223079425e-06, "loss": 0.27318222522735597, "step": 2160 }, { "epoch": 2.41139680333565, "grad_norm": 0.16170066595077515, "learning_rate": 4.530568145320724e-06, "loss": 0.2738449811935425, "step": 2170 }, { "epoch": 2.4225156358582347, "grad_norm": 0.16669511795043945, "learning_rate": 4.367977533646297e-06, "loss": 0.26780765056610106, "step": 2180 }, { "epoch": 2.43363446838082, "grad_norm": 0.15889227390289307, "learning_rate": 4.207999662726516e-06, "loss": 0.2710136413574219, "step": 2190 }, { "epoch": 2.4447533009034053, "grad_norm": 0.15884160995483398, "learning_rate": 4.050661271308969e-06, "loss": 0.27028565406799315, "step": 2200 }, { "epoch": 2.45587213342599, "grad_norm": 0.16362424194812775, "learning_rate": 3.895988656977898e-06, "loss": 0.2697636604309082, "step": 2210 }, { "epoch": 2.4669909659485754, "grad_norm": 0.160287007689476, "learning_rate": 3.744007671758778e-06, "loss": 0.26787629127502444, "step": 2220 }, { "epoch": 2.4781097984711606, "grad_norm": 0.15940804779529572, "learning_rate": 3.59474371779742e-06, "loss": 0.2722454071044922, "step": 2230 }, { "epoch": 2.4892286309937455, "grad_norm": 0.17461982369422913, "learning_rate": 3.4482217431142394e-06, "loss": 0.27159295082092283, "step": 2240 }, { "epoch": 2.5003474635163307, "grad_norm": 0.1559775024652481, "learning_rate": 3.304466237434458e-06, "loss": 0.26755647659301757, "step": 2250 }, { "epoch": 2.511466296038916, "grad_norm": 0.1622040718793869, "learning_rate": 3.1635012280948496e-06, "loss": 0.2669699668884277, "step": 2260 }, { "epoch": 2.5225851285615013, "grad_norm": 0.16182290017604828, "learning_rate": 3.0253502760278406e-06, "loss": 0.2691537380218506, "step": 2270 }, { "epoch": 2.533703961084086, "grad_norm": 0.15870854258537292, "learning_rate": 2.8900364718234987e-06, "loss": 0.2669252872467041, "step": 2280 }, { "epoch": 2.5448227936066714, "grad_norm": 0.17056338489055634, "learning_rate": 2.7575824318701806e-06, "loss": 0.26923959255218505, "step": 2290 }, { "epoch": 2.5559416261292567, "grad_norm": 0.16084995865821838, "learning_rate": 2.6280102945744124e-06, "loss": 0.2703924417495728, "step": 2300 }, { "epoch": 2.5670604586518415, "grad_norm": 0.1600581705570221, "learning_rate": 2.501341716660699e-06, "loss": 0.2688326358795166, "step": 2310 }, { "epoch": 2.578179291174427, "grad_norm": 0.15505236387252808, "learning_rate": 2.377597869551762e-06, "loss": 0.2724630832672119, "step": 2320 }, { "epoch": 2.589298123697012, "grad_norm": 0.1598374992609024, "learning_rate": 2.2567994358299973e-06, "loss": 0.27178168296813965, "step": 2330 }, { "epoch": 2.600416956219597, "grad_norm": 0.161187544465065, "learning_rate": 2.138966605780537e-06, "loss": 0.2691415548324585, "step": 2340 }, { "epoch": 2.611535788742182, "grad_norm": 0.16358298063278198, "learning_rate": 2.024119074016664e-06, "loss": 0.26769893169403075, "step": 2350 }, { "epoch": 2.6226546212647674, "grad_norm": 0.159417986869812, "learning_rate": 1.9122760361880364e-06, "loss": 0.2699122428894043, "step": 2360 }, { "epoch": 2.6337734537873523, "grad_norm": 0.15539702773094177, "learning_rate": 1.8034561857723453e-06, "loss": 0.2678532123565674, "step": 2370 }, { "epoch": 2.6448922863099376, "grad_norm": 0.15744274854660034, "learning_rate": 1.6976777109508446e-06, "loss": 0.2715311050415039, "step": 2380 }, { "epoch": 2.656011118832523, "grad_norm": 0.15635013580322266, "learning_rate": 1.5949582915684025e-06, "loss": 0.26792240142822266, "step": 2390 }, { "epoch": 2.6671299513551077, "grad_norm": 0.1590019315481186, "learning_rate": 1.4953150961784713e-06, "loss": 0.2684680461883545, "step": 2400 }, { "epoch": 2.678248783877693, "grad_norm": 0.1583629995584488, "learning_rate": 1.398764779173538e-06, "loss": 0.2693314552307129, "step": 2410 }, { "epoch": 2.689367616400278, "grad_norm": 0.15858042240142822, "learning_rate": 1.3053234780015012e-06, "loss": 0.27136645317077634, "step": 2420 }, { "epoch": 2.700486448922863, "grad_norm": 0.16220812499523163, "learning_rate": 1.2150068104684577e-06, "loss": 0.2695932149887085, "step": 2430 }, { "epoch": 2.7116052814454483, "grad_norm": 0.15654098987579346, "learning_rate": 1.12782987212833e-06, "loss": 0.2679957151412964, "step": 2440 }, { "epoch": 2.7227241139680336, "grad_norm": 0.21130171418190002, "learning_rate": 1.0438072337597972e-06, "loss": 0.2688181400299072, "step": 2450 }, { "epoch": 2.7338429464906184, "grad_norm": 0.1545591801404953, "learning_rate": 9.6295293893093e-07, "loss": 0.2718325138092041, "step": 2460 }, { "epoch": 2.7449617790132037, "grad_norm": 0.15754657983779907, "learning_rate": 8.852805016519417e-07, "loss": 0.26870386600494384, "step": 2470 }, { "epoch": 2.756080611535789, "grad_norm": 0.1659633219242096, "learning_rate": 8.108029041164566e-07, "loss": 0.2723594903945923, "step": 2480 }, { "epoch": 2.767199444058374, "grad_norm": 0.15784206986427307, "learning_rate": 7.395325945316623e-07, "loss": 0.26740460395812987, "step": 2490 }, { "epoch": 2.778318276580959, "grad_norm": 0.1567256897687912, "learning_rate": 6.714814850377082e-07, "loss": 0.26805920600891114, "step": 2500 }, { "epoch": 2.7894371091035444, "grad_norm": 0.1606101393699646, "learning_rate": 6.066609497167086e-07, "loss": 0.26962528228759763, "step": 2510 }, { "epoch": 2.800555941626129, "grad_norm": 0.17048372328281403, "learning_rate": 5.450818226916799e-07, "loss": 0.26596574783325194, "step": 2520 }, { "epoch": 2.8116747741487145, "grad_norm": 0.1672402024269104, "learning_rate": 4.867543963157162e-07, "loss": 0.27477524280548093, "step": 2530 }, { "epoch": 2.8227936066712997, "grad_norm": 0.15722811222076416, "learning_rate": 4.3168841945172347e-07, "loss": 0.26975407600402834, "step": 2540 }, { "epoch": 2.8339124391938846, "grad_norm": 0.15568679571151733, "learning_rate": 3.798930958430025e-07, "loss": 0.2677044630050659, "step": 2550 }, { "epoch": 2.84503127171647, "grad_norm": 0.15909866988658905, "learning_rate": 3.3137708257491074e-07, "loss": 0.2706420421600342, "step": 2560 }, { "epoch": 2.856150104239055, "grad_norm": 0.15334129333496094, "learning_rate": 2.861484886279331e-07, "loss": 0.26921744346618653, "step": 2570 }, { "epoch": 2.86726893676164, "grad_norm": 0.1526278257369995, "learning_rate": 2.4421487352234376e-07, "loss": 0.2678064823150635, "step": 2580 }, { "epoch": 2.8783877692842252, "grad_norm": 0.1667514145374298, "learning_rate": 2.0558324605469248e-07, "loss": 0.26693019866943357, "step": 2590 }, { "epoch": 2.8895066018068105, "grad_norm": 0.15330937504768372, "learning_rate": 1.7026006312635956e-07, "loss": 0.266402268409729, "step": 2600 }, { "epoch": 2.9006254343293953, "grad_norm": 0.1585593819618225, "learning_rate": 1.3825122866435893e-07, "loss": 0.2690951585769653, "step": 2610 }, { "epoch": 2.9117442668519806, "grad_norm": 0.15491652488708496, "learning_rate": 1.0956209263453421e-07, "loss": 0.26980152130126955, "step": 2620 }, { "epoch": 2.922863099374566, "grad_norm": 0.1553027331829071, "learning_rate": 8.419745014737412e-08, "loss": 0.26876671314239503, "step": 2630 }, { "epoch": 2.9339819318971507, "grad_norm": 0.15838363766670227, "learning_rate": 6.216154065656233e-08, "loss": 0.2718991279602051, "step": 2640 }, { "epoch": 2.945100764419736, "grad_norm": 0.1559874266386032, "learning_rate": 4.345804725037983e-08, "loss": 0.2709752321243286, "step": 2650 }, { "epoch": 2.9562195969423213, "grad_norm": 0.15690867602825165, "learning_rate": 2.8090096036119675e-08, "loss": 0.26477723121643065, "step": 2660 }, { "epoch": 2.967338429464906, "grad_norm": 0.15188027918338776, "learning_rate": 1.6060255617595943e-08, "loss": 0.27164206504821775, "step": 2670 }, { "epoch": 2.9784572619874914, "grad_norm": 0.15537431836128235, "learning_rate": 7.370536665811667e-09, "loss": 0.2670041561126709, "step": 2680 }, { "epoch": 2.9895760945100767, "grad_norm": 0.15168948471546173, "learning_rate": 2.0223915829031828e-09, "loss": 0.27080345153808594, "step": 2690 }, { "epoch": 3.0, "grad_norm": 0.24983854591846466, "learning_rate": 1.6714259387651256e-11, "loss": 0.2644808292388916, "step": 2700 }, { "epoch": 3.0, "step": 2700, "total_flos": 2.5566761492312123e+20, "train_loss": 0.34524431255128646, "train_runtime": 116173.3714, "train_samples_per_second": 2.973, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 2700, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5566761492312123e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }