{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 897, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8498557328246534, "epoch": 0.011154489682097044, "grad_norm": 44.5, "learning_rate": 1e-05, "loss": 3.704855728149414, "mean_token_accuracy": 0.5304131177254021, "num_tokens": 833799.0, "step": 10 }, { "entropy": 1.765550174564123, "epoch": 0.022308979364194088, "grad_norm": 10.25, "learning_rate": 2.111111111111111e-05, "loss": 1.9117172241210938, "mean_token_accuracy": 0.6469556039199233, "num_tokens": 1671910.0, "step": 20 }, { "entropy": 0.8713466321351007, "epoch": 0.03346346904629113, "grad_norm": 3.78125, "learning_rate": 3.222222222222223e-05, "loss": 0.8604758262634278, "mean_token_accuracy": 0.8343592453747988, "num_tokens": 2510443.0, "step": 30 }, { "entropy": 0.23340068878605963, "epoch": 0.044617958728388175, "grad_norm": 1.953125, "learning_rate": 4.3333333333333334e-05, "loss": 0.24464244842529298, "mean_token_accuracy": 0.9471705831587315, "num_tokens": 3345229.0, "step": 40 }, { "entropy": 0.16152286342112349, "epoch": 0.05577244841048522, "grad_norm": 3.171875, "learning_rate": 4.9997280790439974e-05, "loss": 0.17044107913970946, "mean_token_accuracy": 0.9553312979638576, "num_tokens": 4184871.0, "step": 50 }, { "entropy": 0.1448873324552551, "epoch": 0.06692693809258227, "grad_norm": 0.984375, "learning_rate": 4.996669647581318e-05, "loss": 0.14741255044937135, "mean_token_accuracy": 0.9585917994379998, "num_tokens": 5039068.0, "step": 60 }, { "entropy": 0.12383970170631073, "epoch": 0.0780814277746793, "grad_norm": 1.125, "learning_rate": 4.990217055187362e-05, "loss": 0.12599575519561768, "mean_token_accuracy": 0.9621855434030294, "num_tokens": 5904036.0, "step": 70 }, { "entropy": 0.11301726293168031, "epoch": 0.08923591745677635, "grad_norm": 1.3828125, "learning_rate": 4.980379074002661e-05, "loss": 0.11982399225234985, "mean_token_accuracy": 0.964000066742301, "num_tokens": 6747251.0, "step": 80 }, { "entropy": 0.10463761446881108, "epoch": 0.1003904071388734, "grad_norm": 0.71484375, "learning_rate": 4.967169078520476e-05, "loss": 0.11220132112503052, "mean_token_accuracy": 0.9658373668789864, "num_tokens": 7572769.0, "step": 90 }, { "entropy": 0.10532618285797071, "epoch": 0.11154489682097044, "grad_norm": 0.9296875, "learning_rate": 4.9506050274045076e-05, "loss": 0.11419826745986938, "mean_token_accuracy": 0.965171106159687, "num_tokens": 8410764.0, "step": 100 }, { "entropy": 0.10305561173590831, "epoch": 0.12269938650306748, "grad_norm": 0.89453125, "learning_rate": 4.930709439074528e-05, "loss": 0.10990087985992432, "mean_token_accuracy": 0.9659170717000961, "num_tokens": 9255104.0, "step": 110 }, { "entropy": 0.10211670671415049, "epoch": 0.13385387618516453, "grad_norm": 0.76953125, "learning_rate": 4.90750936109315e-05, "loss": 0.11037271022796631, "mean_token_accuracy": 0.966075143776834, "num_tokens": 10103000.0, "step": 120 }, { "entropy": 0.09717915701621678, "epoch": 0.14500836586726157, "grad_norm": 0.73046875, "learning_rate": 4.881036333395329e-05, "loss": 0.10295262336730956, "mean_token_accuracy": 0.9677550513297319, "num_tokens": 10945768.0, "step": 130 }, { "entropy": 0.09635820150724612, "epoch": 0.1561628555493586, "grad_norm": 0.7265625, "learning_rate": 4.851326345410594e-05, "loss": 0.10199121236801148, "mean_token_accuracy": 0.9681327627971769, "num_tokens": 11801129.0, "step": 140 }, { "entropy": 0.09002169943414629, "epoch": 0.16731734523145567, "grad_norm": 0.98046875, "learning_rate": 4.818419787136311e-05, "loss": 0.09567687511444092, "mean_token_accuracy": 0.9701874911785126, "num_tokens": 12636424.0, "step": 150 }, { "entropy": 0.09068954657413997, "epoch": 0.1784718349135527, "grad_norm": 1.1015625, "learning_rate": 4.782361394228472e-05, "loss": 0.0969263732433319, "mean_token_accuracy": 0.9701515214517713, "num_tokens": 13490177.0, "step": 160 }, { "entropy": 0.08610689677589108, "epoch": 0.18962632459564974, "grad_norm": 0.96484375, "learning_rate": 4.74320018718467e-05, "loss": 0.08986451625823974, "mean_token_accuracy": 0.9720516135916114, "num_tokens": 14328572.0, "step": 170 }, { "entropy": 0.08233372264367063, "epoch": 0.2007808142777468, "grad_norm": 1.890625, "learning_rate": 4.700989404701941e-05, "loss": 0.08586806058883667, "mean_token_accuracy": 0.9734413396567106, "num_tokens": 15189703.0, "step": 180 }, { "entropy": 0.07736555864394176, "epoch": 0.21193530395984383, "grad_norm": 1.28125, "learning_rate": 4.6557864313000695e-05, "loss": 0.07913717031478881, "mean_token_accuracy": 0.9759283743798732, "num_tokens": 16028746.0, "step": 190 }, { "entropy": 0.07013691037718672, "epoch": 0.22308979364194087, "grad_norm": 1.1953125, "learning_rate": 4.60765271930874e-05, "loss": 0.07151558399200439, "mean_token_accuracy": 0.978206392377615, "num_tokens": 16849567.0, "step": 200 }, { "entropy": 0.07075640709081199, "epoch": 0.23424428332403793, "grad_norm": 1.4921875, "learning_rate": 4.55665370532461e-05, "loss": 0.07073653936386108, "mean_token_accuracy": 0.9784815656021237, "num_tokens": 17705656.0, "step": 210 }, { "entropy": 0.06637019099725876, "epoch": 0.24539877300613497, "grad_norm": 1.28125, "learning_rate": 4.5028587212518705e-05, "loss": 0.06597371697425843, "mean_token_accuracy": 0.979678837954998, "num_tokens": 18565065.0, "step": 220 }, { "entropy": 0.05923618896049447, "epoch": 0.25655326268823203, "grad_norm": 1.2734375, "learning_rate": 4.4463409000472234e-05, "loss": 0.058509671688079835, "mean_token_accuracy": 0.9822878390550613, "num_tokens": 19392725.0, "step": 230 }, { "entropy": 0.05874249367916491, "epoch": 0.26770775237032907, "grad_norm": 1.171875, "learning_rate": 4.3871770762974306e-05, "loss": 0.05813463926315308, "mean_token_accuracy": 0.9824939148500562, "num_tokens": 20223583.0, "step": 240 }, { "entropy": 0.05689131756371353, "epoch": 0.2788622420524261, "grad_norm": 1.4609375, "learning_rate": 4.325447681764586e-05, "loss": 0.055121219158172606, "mean_token_accuracy": 0.9830958772450685, "num_tokens": 21085157.0, "step": 250 }, { "entropy": 0.05158787120017223, "epoch": 0.29001673173452314, "grad_norm": 1.0390625, "learning_rate": 4.261236636041108e-05, "loss": 0.05031982660293579, "mean_token_accuracy": 0.984761236794293, "num_tokens": 21904752.0, "step": 260 }, { "entropy": 0.05179886775440536, "epoch": 0.30117122141662017, "grad_norm": 0.78515625, "learning_rate": 4.194631232463128e-05, "loss": 0.0493065744638443, "mean_token_accuracy": 0.9848343567922712, "num_tokens": 22765981.0, "step": 270 }, { "entropy": 0.049823664524592456, "epoch": 0.3123257110987172, "grad_norm": 0.62890625, "learning_rate": 4.1257220194373424e-05, "loss": 0.04740493595600128, "mean_token_accuracy": 0.9855156386271119, "num_tokens": 23621926.0, "step": 280 }, { "entropy": 0.04758051415265072, "epoch": 0.3234802007808143, "grad_norm": 0.5, "learning_rate": 4.054602677342684e-05, "loss": 0.04637431204319, "mean_token_accuracy": 0.9858794504776597, "num_tokens": 24465607.0, "step": 290 }, { "entropy": 0.04668422270915471, "epoch": 0.33463469046291133, "grad_norm": 0.474609375, "learning_rate": 3.981369891174155e-05, "loss": 0.04507455825805664, "mean_token_accuracy": 0.9860366908833385, "num_tokens": 25299933.0, "step": 300 }, { "entropy": 0.04640703263867181, "epoch": 0.34578918014500837, "grad_norm": 0.390625, "learning_rate": 3.906123219101952e-05, "loss": 0.04516075849533081, "mean_token_accuracy": 0.9859529983252286, "num_tokens": 26154681.0, "step": 310 }, { "entropy": 0.04438853256579023, "epoch": 0.3569436698271054, "grad_norm": 0.423828125, "learning_rate": 3.8289649571245885e-05, "loss": 0.044096818566322325, "mean_token_accuracy": 0.986466808244586, "num_tokens": 26983541.0, "step": 320 }, { "entropy": 0.044660908347577785, "epoch": 0.36809815950920244, "grad_norm": 0.376953125, "learning_rate": 3.7500000000000003e-05, "loss": 0.043841251730918886, "mean_token_accuracy": 0.986228640563786, "num_tokens": 27833414.0, "step": 330 }, { "entropy": 0.043841811595484614, "epoch": 0.3792526491912995, "grad_norm": 0.328125, "learning_rate": 3.669335698643704e-05, "loss": 0.04326000213623047, "mean_token_accuracy": 0.9865183688700199, "num_tokens": 28677577.0, "step": 340 }, { "entropy": 0.04371235728031024, "epoch": 0.39040713887339656, "grad_norm": 0.390625, "learning_rate": 3.587081714187874e-05, "loss": 0.043233224749565126, "mean_token_accuracy": 0.9865481941029429, "num_tokens": 29512895.0, "step": 350 }, { "entropy": 0.04327065504912753, "epoch": 0.4015616285554936, "grad_norm": 0.330078125, "learning_rate": 3.503349868899722e-05, "loss": 0.04288822710514069, "mean_token_accuracy": 0.986665309779346, "num_tokens": 30343018.0, "step": 360 }, { "entropy": 0.04256358170532622, "epoch": 0.41271611823759063, "grad_norm": 0.376953125, "learning_rate": 3.418253994161892e-05, "loss": 0.042832252383232114, "mean_token_accuracy": 0.9867880517616868, "num_tokens": 31158940.0, "step": 370 }, { "entropy": 0.04262932341953274, "epoch": 0.42387060791968767, "grad_norm": 0.30078125, "learning_rate": 3.3319097757214843e-05, "loss": 0.04222110211849213, "mean_token_accuracy": 0.9866109801456332, "num_tokens": 32008434.0, "step": 380 }, { "entropy": 0.04266308699734509, "epoch": 0.4350250976017847, "grad_norm": 0.314453125, "learning_rate": 3.244434596418139e-05, "loss": 0.042472487688064574, "mean_token_accuracy": 0.9866344403475523, "num_tokens": 32854651.0, "step": 390 }, { "entropy": 0.0429983379173791, "epoch": 0.44617958728388174, "grad_norm": 0.357421875, "learning_rate": 3.155947376604948e-05, "loss": 0.04324407577514648, "mean_token_accuracy": 0.9865379808470607, "num_tokens": 33705423.0, "step": 400 }, { "entropy": 0.04282604140753392, "epoch": 0.45733407696597883, "grad_norm": 0.294921875, "learning_rate": 3.066568412479167e-05, "loss": 0.04259026348590851, "mean_token_accuracy": 0.9867103593423963, "num_tokens": 34538632.0, "step": 410 }, { "entropy": 0.04212904951127712, "epoch": 0.46848856664807587, "grad_norm": 0.4140625, "learning_rate": 2.976419212542495e-05, "loss": 0.04252048432826996, "mean_token_accuracy": 0.9867507757619023, "num_tokens": 35381587.0, "step": 420 }, { "entropy": 0.0412595656584017, "epoch": 0.4796430563301729, "grad_norm": 0.3125, "learning_rate": 2.885622332413256e-05, "loss": 0.041145503520965576, "mean_token_accuracy": 0.9870552903041243, "num_tokens": 36234112.0, "step": 430 }, { "entropy": 0.041778112881002014, "epoch": 0.49079754601226994, "grad_norm": 0.349609375, "learning_rate": 2.7943012082150533e-05, "loss": 0.041335687041282654, "mean_token_accuracy": 0.9867611099034548, "num_tokens": 37077497.0, "step": 440 }, { "entropy": 0.04076959296362474, "epoch": 0.501952035694367, "grad_norm": 0.318359375, "learning_rate": 2.7025799887684002e-05, "loss": 0.041106203198432924, "mean_token_accuracy": 0.9871867259964346, "num_tokens": 37919261.0, "step": 450 }, { "entropy": 0.04188558856840245, "epoch": 0.5131065253764641, "grad_norm": 0.314453125, "learning_rate": 2.6105833668134473e-05, "loss": 0.041896390914916995, "mean_token_accuracy": 0.9867892485111952, "num_tokens": 38782505.0, "step": 460 }, { "entropy": 0.04178600409068167, "epoch": 0.524261015058561, "grad_norm": 0.3359375, "learning_rate": 2.518436409493281e-05, "loss": 0.04188077747821808, "mean_token_accuracy": 0.9868596900254488, "num_tokens": 39610893.0, "step": 470 }, { "entropy": 0.04152031776320655, "epoch": 0.5354155047406581, "grad_norm": 0.267578125, "learning_rate": 2.426264388328214e-05, "loss": 0.04174352586269379, "mean_token_accuracy": 0.9868257040157914, "num_tokens": 40427636.0, "step": 480 }, { "entropy": 0.040754605704569256, "epoch": 0.5465699944227551, "grad_norm": 0.3125, "learning_rate": 2.334192608912241e-05, "loss": 0.04108997285366058, "mean_token_accuracy": 0.9870152780786157, "num_tokens": 41252001.0, "step": 490 }, { "entropy": 0.042179943548399025, "epoch": 0.5577244841048522, "grad_norm": 0.353515625, "learning_rate": 2.2423462405631616e-05, "loss": 0.04207477867603302, "mean_token_accuracy": 0.9866394894197583, "num_tokens": 42113694.0, "step": 500 }, { "entropy": 0.04122589101898484, "epoch": 0.5688789737869493, "grad_norm": 0.330078125, "learning_rate": 2.150850146157985e-05, "loss": 0.04146281182765961, "mean_token_accuracy": 0.9869526766240597, "num_tokens": 42941138.0, "step": 510 }, { "entropy": 0.04114197726012207, "epoch": 0.5800334634690463, "grad_norm": 0.345703125, "learning_rate": 2.0598287123849095e-05, "loss": 0.040973353385925296, "mean_token_accuracy": 0.9871221456676722, "num_tokens": 43770822.0, "step": 520 }, { "entropy": 0.04192428553069476, "epoch": 0.5911879531511434, "grad_norm": 0.302734375, "learning_rate": 1.9694056806426928e-05, "loss": 0.04169855713844299, "mean_token_accuracy": 0.9866715084761382, "num_tokens": 44637866.0, "step": 530 }, { "entropy": 0.03977415001136251, "epoch": 0.6023424428332403, "grad_norm": 0.314453125, "learning_rate": 1.879703978817256e-05, "loss": 0.04036850333213806, "mean_token_accuracy": 0.9872556058689952, "num_tokens": 45453923.0, "step": 540 }, { "entropy": 0.04234540155448485, "epoch": 0.6134969325153374, "grad_norm": 0.33203125, "learning_rate": 1.7908455541642584e-05, "loss": 0.04180983603000641, "mean_token_accuracy": 0.9865613304078579, "num_tokens": 46306551.0, "step": 550 }, { "entropy": 0.041263596300268546, "epoch": 0.6246514221974344, "grad_norm": 0.31640625, "learning_rate": 1.7029512075247967e-05, "loss": 0.04135525822639465, "mean_token_accuracy": 0.986899808421731, "num_tokens": 47143518.0, "step": 560 }, { "entropy": 0.04119314953277353, "epoch": 0.6358059118795315, "grad_norm": 0.3203125, "learning_rate": 1.6161404290996412e-05, "loss": 0.04146760106086731, "mean_token_accuracy": 0.9868535120040178, "num_tokens": 47992113.0, "step": 570 }, { "entropy": 0.04120078657870181, "epoch": 0.6469604015616286, "grad_norm": 0.30859375, "learning_rate": 1.5305312360052442e-05, "loss": 0.0413068950176239, "mean_token_accuracy": 0.986842698045075, "num_tokens": 48843712.0, "step": 580 }, { "entropy": 0.04128208919428289, "epoch": 0.6581148912437256, "grad_norm": 0.326171875, "learning_rate": 1.4462400118323798e-05, "loss": 0.041500210762023926, "mean_token_accuracy": 0.9869369497522712, "num_tokens": 49688129.0, "step": 590 }, { "entropy": 0.04088454471202567, "epoch": 0.6692693809258227, "grad_norm": 0.33203125, "learning_rate": 1.3633813484255131e-05, "loss": 0.041133826971054076, "mean_token_accuracy": 0.9869873868301511, "num_tokens": 50520741.0, "step": 600 }, { "entropy": 0.04227327090920881, "epoch": 0.6804238706079196, "grad_norm": 0.275390625, "learning_rate": 1.2820678900980093e-05, "loss": 0.04190162420272827, "mean_token_accuracy": 0.9865590412169695, "num_tokens": 51392294.0, "step": 610 }, { "entropy": 0.04065559499140363, "epoch": 0.6915783602900167, "grad_norm": 0.328125, "learning_rate": 1.2024101804949638e-05, "loss": 0.04115171730518341, "mean_token_accuracy": 0.9869800698012113, "num_tokens": 52240430.0, "step": 620 }, { "entropy": 0.042412614575005135, "epoch": 0.7027328499721138, "grad_norm": 0.3203125, "learning_rate": 1.124516512311836e-05, "loss": 0.04237264692783356, "mean_token_accuracy": 0.9865791719406843, "num_tokens": 53087953.0, "step": 630 }, { "entropy": 0.04051031620183494, "epoch": 0.7138873396542108, "grad_norm": 0.28515625, "learning_rate": 1.0484927800731984e-05, "loss": 0.040881377458572385, "mean_token_accuracy": 0.9869993204250932, "num_tokens": 53927989.0, "step": 640 }, { "entropy": 0.04193990352796391, "epoch": 0.7250418293363079, "grad_norm": 0.298828125, "learning_rate": 9.744423361717323e-06, "loss": 0.04187402129173279, "mean_token_accuracy": 0.9866631610319019, "num_tokens": 54774541.0, "step": 650 }, { "entropy": 0.03943951329856645, "epoch": 0.7361963190184049, "grad_norm": 0.3125, "learning_rate": 9.024658503631967e-06, "loss": 0.04017325043678284, "mean_token_accuracy": 0.9874097904190421, "num_tokens": 55607613.0, "step": 660 }, { "entropy": 0.041415746443090026, "epoch": 0.747350808700502, "grad_norm": 0.296875, "learning_rate": 8.32661172908373e-06, "loss": 0.04164916574954987, "mean_token_accuracy": 0.9868578946217894, "num_tokens": 56444089.0, "step": 670 }, { "entropy": 0.04108071085065603, "epoch": 0.758505298382599, "grad_norm": 0.265625, "learning_rate": 7.651232015480462e-06, "loss": 0.04107390642166138, "mean_token_accuracy": 0.986830660328269, "num_tokens": 57290368.0, "step": 680 }, { "entropy": 0.04229205273441039, "epoch": 0.769659788064696, "grad_norm": 0.306640625, "learning_rate": 6.99943752491857e-06, "loss": 0.04177336990833282, "mean_token_accuracy": 0.9866125296801329, "num_tokens": 58128968.0, "step": 690 }, { "entropy": 0.041548075363971294, "epoch": 0.7808142777467931, "grad_norm": 0.279296875, "learning_rate": 6.372114355964293e-06, "loss": 0.04167112410068512, "mean_token_accuracy": 0.9867573702707887, "num_tokens": 58984460.0, "step": 700 }, { "entropy": 0.0403200296277646, "epoch": 0.7919687674288901, "grad_norm": 0.306640625, "learning_rate": 5.770115339024484e-06, "loss": 0.04050106704235077, "mean_token_accuracy": 0.9871903322637081, "num_tokens": 59827367.0, "step": 710 }, { "entropy": 0.0406710500101326, "epoch": 0.8031232571109872, "grad_norm": 0.294921875, "learning_rate": 5.194258876944705e-06, "loss": 0.04084862470626831, "mean_token_accuracy": 0.9871157312765717, "num_tokens": 60654050.0, "step": 720 }, { "entropy": 0.040411298532853836, "epoch": 0.8142777467930842, "grad_norm": 0.28515625, "learning_rate": 4.645327832410648e-06, "loss": 0.040474030375480655, "mean_token_accuracy": 0.9871165057644248, "num_tokens": 61488530.0, "step": 730 }, { "entropy": 0.04196117307874374, "epoch": 0.8254322364751813, "grad_norm": 0.310546875, "learning_rate": 4.12406846366562e-06, "loss": 0.04189785122871399, "mean_token_accuracy": 0.9867719961330295, "num_tokens": 62326744.0, "step": 740 }, { "entropy": 0.040386362894787455, "epoch": 0.8365867261572784, "grad_norm": 0.2734375, "learning_rate": 3.631189409990815e-06, "loss": 0.04039705097675324, "mean_token_accuracy": 0.9871017251163721, "num_tokens": 63170753.0, "step": 750 }, { "entropy": 0.040684799235896206, "epoch": 0.8477412158393753, "grad_norm": 0.265625, "learning_rate": 3.1673607283276813e-06, "loss": 0.04109015464782715, "mean_token_accuracy": 0.9869557719677686, "num_tokens": 64013316.0, "step": 760 }, { "entropy": 0.042057951152673925, "epoch": 0.8588957055214724, "grad_norm": 0.275390625, "learning_rate": 2.733212982351957e-06, "loss": 0.04174878001213074, "mean_token_accuracy": 0.986495653167367, "num_tokens": 64863074.0, "step": 770 }, { "entropy": 0.04044588297838345, "epoch": 0.8700501952035694, "grad_norm": 0.283203125, "learning_rate": 2.3293363852379125e-06, "loss": 0.04043938219547272, "mean_token_accuracy": 0.9872395290061832, "num_tokens": 65709101.0, "step": 780 }, { "entropy": 0.04246396276575979, "epoch": 0.8812046848856665, "grad_norm": 0.275390625, "learning_rate": 1.956279997278043e-06, "loss": 0.041996100544929506, "mean_token_accuracy": 0.9866539994254708, "num_tokens": 66553598.0, "step": 790 }, { "entropy": 0.041515190360951235, "epoch": 0.8923591745677635, "grad_norm": 0.322265625, "learning_rate": 1.6145509794491364e-06, "loss": 0.041551712155342105, "mean_token_accuracy": 0.9867776447907091, "num_tokens": 67405428.0, "step": 800 }, { "entropy": 0.04269145799044054, "epoch": 0.9035136642498606, "grad_norm": 0.283203125, "learning_rate": 1.3046139039394e-06, "loss": 0.042556726932525636, "mean_token_accuracy": 0.9866596391424537, "num_tokens": 68245631.0, "step": 810 }, { "entropy": 0.04082234081579372, "epoch": 0.9146681539319577, "grad_norm": 0.30859375, "learning_rate": 1.026890122573998e-06, "loss": 0.04080590307712555, "mean_token_accuracy": 0.987078714184463, "num_tokens": 69069024.0, "step": 820 }, { "entropy": 0.04289137564774137, "epoch": 0.9258226436140546, "grad_norm": 0.29296875, "learning_rate": 7.817571939976288e-07, "loss": 0.04283967912197113, "mean_token_accuracy": 0.9864464558660984, "num_tokens": 69903803.0, "step": 830 }, { "entropy": 0.041589401010423896, "epoch": 0.9369771332961517, "grad_norm": 0.2890625, "learning_rate": 5.695483703928306e-07, "loss": 0.04148242473602295, "mean_token_accuracy": 0.9868634788319468, "num_tokens": 70747562.0, "step": 840 }, { "entropy": 0.04105772517505102, "epoch": 0.9481316229782487, "grad_norm": 0.271484375, "learning_rate": 3.905521444318605e-07, "loss": 0.04133652150630951, "mean_token_accuracy": 0.986898991279304, "num_tokens": 71601623.0, "step": 850 }, { "entropy": 0.040625668261782266, "epoch": 0.9592861126603458, "grad_norm": 0.322265625, "learning_rate": 2.450118570779786e-07, "loss": 0.04106319844722748, "mean_token_accuracy": 0.9870263114571571, "num_tokens": 72449591.0, "step": 860 }, { "entropy": 0.041737568736425604, "epoch": 0.9704406023424428, "grad_norm": 0.28515625, "learning_rate": 1.3312536676942377e-07, "loss": 0.04145269989967346, "mean_token_accuracy": 0.9866915429010987, "num_tokens": 73305562.0, "step": 870 }, { "entropy": 0.04121337772812694, "epoch": 0.9815950920245399, "grad_norm": 0.28515625, "learning_rate": 5.5044780435722923e-08, "loss": 0.04093064665794373, "mean_token_accuracy": 0.986900057643652, "num_tokens": 74143208.0, "step": 880 }, { "entropy": 0.040503930338309145, "epoch": 0.992749581706637, "grad_norm": 0.287109375, "learning_rate": 1.0876246712074322e-08, "loss": 0.0403281182050705, "mean_token_accuracy": 0.9871442951261997, "num_tokens": 74991508.0, "step": 890 }, { "epoch": 1.0, "eval_entropy": 0.041321987748146057, "eval_loss": 0.03999410942196846, "eval_mean_token_accuracy": 0.9868998980522156, "eval_num_tokens": 75541075.0, "eval_runtime": 50.0483, "eval_samples_per_second": 19.981, "eval_steps_per_second": 9.99, "step": 897 } ], "logging_steps": 10, "max_steps": 897, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.495254912950835e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }