{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 11066, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018074807107917894, "grad_norm": 58.25, "learning_rate": 1.805054151624549e-07, "loss": 4.652698516845703, "step": 1, "token_acc": 0.43327626145634 }, { "epoch": 0.018074807107917895, "grad_norm": 6.53125, "learning_rate": 1.805054151624549e-05, "loss": 3.920129641137942, "step": 100, "token_acc": 0.45332661759508064 }, { "epoch": 0.03614961421583579, "grad_norm": 6.0, "learning_rate": 3.610108303249098e-05, "loss": 2.8419125366210936, "step": 200, "token_acc": 0.5193163006097973 }, { "epoch": 0.054224421323753685, "grad_norm": 5.625, "learning_rate": 5.415162454873647e-05, "loss": 2.2325677490234375, "step": 300, "token_acc": 0.577260581442613 }, { "epoch": 0.07229922843167158, "grad_norm": 5.09375, "learning_rate": 7.220216606498195e-05, "loss": 1.9477809143066407, "step": 400, "token_acc": 0.6095200383891847 }, { "epoch": 0.09037403553958948, "grad_norm": 4.4375, "learning_rate": 9.025270758122743e-05, "loss": 1.7510385131835937, "step": 500, "token_acc": 0.6378192081318703 }, { "epoch": 0.10844884264750737, "grad_norm": 4.28125, "learning_rate": 9.999527526045029e-05, "loss": 1.6535232543945313, "step": 600, "token_acc": 0.6496733713777781 }, { "epoch": 0.12652364975542527, "grad_norm": 3.625, "learning_rate": 9.99524110790929e-05, "loss": 1.5711769104003905, "step": 700, "token_acc": 0.6620965629303417 }, { "epoch": 0.14459845686334316, "grad_norm": 3.4375, "learning_rate": 9.986493474590536e-05, "loss": 1.5024029541015624, "step": 800, "token_acc": 0.6721741041947722 }, { "epoch": 0.16267326397126106, "grad_norm": 3.0, "learning_rate": 9.973292438539405e-05, "loss": 1.4528140258789062, "step": 900, "token_acc": 0.6810912279574309 }, { "epoch": 0.18074807107917895, "grad_norm": 3.421875, "learning_rate": 9.955649789509624e-05, "loss": 1.4288172912597656, "step": 1000, "token_acc": 0.6845218263208889 }, { "epoch": 0.19882287818709685, "grad_norm": 3.4375, "learning_rate": 9.933581284028659e-05, "loss": 1.4166110229492188, "step": 1100, "token_acc": 0.68570654719404 }, { "epoch": 0.21689768529501474, "grad_norm": 2.90625, "learning_rate": 9.907106631325671e-05, "loss": 1.3844677734375, "step": 1200, "token_acc": 0.6901876362337299 }, { "epoch": 0.23497249240293264, "grad_norm": 2.375, "learning_rate": 9.876249475729344e-05, "loss": 1.3720639038085938, "step": 1300, "token_acc": 0.6921398802222583 }, { "epoch": 0.25304729951085053, "grad_norm": 2.6875, "learning_rate": 9.841037375551294e-05, "loss": 1.3502085876464844, "step": 1400, "token_acc": 0.6961120678024922 }, { "epoch": 0.27112210661876845, "grad_norm": 2.53125, "learning_rate": 9.801501778473935e-05, "loss": 1.3242225646972656, "step": 1500, "token_acc": 0.7008815156964483 }, { "epoch": 0.2891969137266863, "grad_norm": 2.671875, "learning_rate": 9.757677993464771e-05, "loss": 1.3310586547851562, "step": 1600, "token_acc": 0.6988938129948864 }, { "epoch": 0.30727172083460425, "grad_norm": 2.4375, "learning_rate": 9.709605159242199e-05, "loss": 1.3116970825195313, "step": 1700, "token_acc": 0.7015194406957871 }, { "epoch": 0.3253465279425221, "grad_norm": 2.5625, "learning_rate": 9.657326209320998e-05, "loss": 1.3130838012695312, "step": 1800, "token_acc": 0.7008979147977925 }, { "epoch": 0.34342133505044004, "grad_norm": 2.453125, "learning_rate": 9.600887833668701e-05, "loss": 1.2876347351074218, "step": 1900, "token_acc": 0.7059691653488992 }, { "epoch": 0.3614961421583579, "grad_norm": 2.28125, "learning_rate": 9.540340437007106e-05, "loss": 1.2805183410644532, "step": 2000, "token_acc": 0.7076493690679662 }, { "epoch": 0.3795709492662758, "grad_norm": 2.3125, "learning_rate": 9.475738093796172e-05, "loss": 1.2861131286621095, "step": 2100, "token_acc": 0.7055881842553867 }, { "epoch": 0.3976457563741937, "grad_norm": 2.375, "learning_rate": 9.407138499940496e-05, "loss": 1.268822479248047, "step": 2200, "token_acc": 0.7090846641855041 }, { "epoch": 0.4157205634821116, "grad_norm": 2.59375, "learning_rate": 9.334602921261492e-05, "loss": 1.2444308471679688, "step": 2300, "token_acc": 0.7131349902523053 }, { "epoch": 0.4337953705900295, "grad_norm": 3.015625, "learning_rate": 9.258196138781327e-05, "loss": 1.2656473541259765, "step": 2400, "token_acc": 0.7094099462846465 }, { "epoch": 0.4518701776979474, "grad_norm": 2.265625, "learning_rate": 9.177986390867419e-05, "loss": 1.2598892211914063, "step": 2500, "token_acc": 0.7109632032447007 }, { "epoch": 0.4699449848058653, "grad_norm": 2.5, "learning_rate": 9.09404531228924e-05, "loss": 1.2393927764892578, "step": 2600, "token_acc": 0.7133786975644326 }, { "epoch": 0.4880197919137832, "grad_norm": 2.3125, "learning_rate": 9.0064478702418e-05, "loss": 1.2412493896484376, "step": 2700, "token_acc": 0.7132032480227637 }, { "epoch": 0.5060945990217011, "grad_norm": 2.5625, "learning_rate": 8.915272297392945e-05, "loss": 1.2382020568847656, "step": 2800, "token_acc": 0.7131689546842609 }, { "epoch": 0.5241694061296189, "grad_norm": 2.640625, "learning_rate": 8.820600022014338e-05, "loss": 1.2349536895751954, "step": 2900, "token_acc": 0.7140737742682136 }, { "epoch": 0.5422442132375369, "grad_norm": 2.109375, "learning_rate": 8.722515595258402e-05, "loss": 1.2141342163085938, "step": 3000, "token_acc": 0.7181298317819385 }, { "epoch": 0.5603190203454548, "grad_norm": 2.078125, "learning_rate": 8.621106615646292e-05, "loss": 1.2058545684814452, "step": 3100, "token_acc": 0.7190379193709925 }, { "epoch": 0.5783938274533726, "grad_norm": 2.453125, "learning_rate": 8.51646365083426e-05, "loss": 1.2195273590087892, "step": 3200, "token_acc": 0.7170014250513891 }, { "epoch": 0.5964686345612905, "grad_norm": 2.3125, "learning_rate": 8.408680156728299e-05, "loss": 1.2225239562988282, "step": 3300, "token_acc": 0.71660874090857 }, { "epoch": 0.6145434416692085, "grad_norm": 2.359375, "learning_rate": 8.297852394019336e-05, "loss": 1.1935769653320312, "step": 3400, "token_acc": 0.721348535831292 }, { "epoch": 0.6326182487771264, "grad_norm": 2.328125, "learning_rate": 8.184079342213466e-05, "loss": 1.198054428100586, "step": 3500, "token_acc": 0.7195931038612298 }, { "epoch": 0.6506930558850442, "grad_norm": 2.15625, "learning_rate": 8.067462611234052e-05, "loss": 1.2039249420166016, "step": 3600, "token_acc": 0.7198145640636711 }, { "epoch": 0.6687678629929621, "grad_norm": 2.25, "learning_rate": 7.948106350674593e-05, "loss": 1.2044364166259767, "step": 3700, "token_acc": 0.7191590166885238 }, { "epoch": 0.6868426701008801, "grad_norm": 2.21875, "learning_rate": 7.826117156783461e-05, "loss": 1.1947254180908202, "step": 3800, "token_acc": 0.7211205989004562 }, { "epoch": 0.7049174772087979, "grad_norm": 2.171875, "learning_rate": 7.701603977263513e-05, "loss": 1.1763773345947266, "step": 3900, "token_acc": 0.7237809512705871 }, { "epoch": 0.7229922843167158, "grad_norm": 2.546875, "learning_rate": 7.574678013971672e-05, "loss": 1.180088424682617, "step": 4000, "token_acc": 0.7241476445274628 }, { "epoch": 0.7410670914246337, "grad_norm": 2.359375, "learning_rate": 7.445452623605307e-05, "loss": 1.1668415069580078, "step": 4100, "token_acc": 0.7264480800382547 }, { "epoch": 0.7591418985325517, "grad_norm": 2.015625, "learning_rate": 7.314043216464158e-05, "loss": 1.1807654571533204, "step": 4200, "token_acc": 0.7243762391137034 }, { "epoch": 0.7772167056404695, "grad_norm": 2.328125, "learning_rate": 7.180567153378193e-05, "loss": 1.1845186614990235, "step": 4300, "token_acc": 0.7214888736557151 }, { "epoch": 0.7952915127483874, "grad_norm": 2.1875, "learning_rate": 7.045143640893474e-05, "loss": 1.1583942413330077, "step": 4400, "token_acc": 0.7274813631693335 }, { "epoch": 0.8133663198563053, "grad_norm": 2.109375, "learning_rate": 6.907893624809609e-05, "loss": 1.145471420288086, "step": 4500, "token_acc": 0.7298424303929127 }, { "epoch": 0.8314411269642232, "grad_norm": 2.359375, "learning_rate": 6.768939682163902e-05, "loss": 1.1666727447509766, "step": 4600, "token_acc": 0.7264280244660613 }, { "epoch": 0.8495159340721411, "grad_norm": 2.09375, "learning_rate": 6.628405911758647e-05, "loss": 1.1622318267822265, "step": 4700, "token_acc": 0.7271438460195889 }, { "epoch": 0.867590741180059, "grad_norm": 2.15625, "learning_rate": 6.486417823329354e-05, "loss": 1.1364639282226563, "step": 4800, "token_acc": 0.7319144126512372 }, { "epoch": 0.8856655482879768, "grad_norm": 2.421875, "learning_rate": 6.34310222545287e-05, "loss": 1.1561846923828125, "step": 4900, "token_acc": 0.7282120514362783 }, { "epoch": 0.9037403553958948, "grad_norm": 2.109375, "learning_rate": 6.198587112295526e-05, "loss": 1.1464973449707032, "step": 5000, "token_acc": 0.7304544859576637 }, { "epoch": 0.9218151625038127, "grad_norm": 2.0, "learning_rate": 6.053001549302422e-05, "loss": 1.135927963256836, "step": 5100, "token_acc": 0.7309169370264882 }, { "epoch": 0.9398899696117305, "grad_norm": 2.359375, "learning_rate": 5.906475557929985e-05, "loss": 1.1362411499023437, "step": 5200, "token_acc": 0.7308731770004574 }, { "epoch": 0.9579647767196484, "grad_norm": 2.15625, "learning_rate": 5.759139999524705e-05, "loss": 1.132964859008789, "step": 5300, "token_acc": 0.7320876184986531 }, { "epoch": 0.9760395838275664, "grad_norm": 2.09375, "learning_rate": 5.611126458451772e-05, "loss": 1.1330313873291016, "step": 5400, "token_acc": 0.7321249508199824 }, { "epoch": 0.9941143909354843, "grad_norm": 2.1875, "learning_rate": 5.462567124577992e-05, "loss": 1.137665786743164, "step": 5500, "token_acc": 0.731388650772945 }, { "epoch": 1.0121101207623049, "grad_norm": 2.71875, "learning_rate": 5.3135946752139385e-05, "loss": 1.0870736694335938, "step": 5600, "token_acc": 0.7406857225004193 }, { "epoch": 1.0301849278702229, "grad_norm": 2.234375, "learning_rate": 5.1643421566207615e-05, "loss": 1.0574837493896485, "step": 5700, "token_acc": 0.7462299226419611 }, { "epoch": 1.0482597349781408, "grad_norm": 2.1875, "learning_rate": 5.0149428651874985e-05, "loss": 1.0600157165527344, "step": 5800, "token_acc": 0.7451135106751428 }, { "epoch": 1.0663345420860586, "grad_norm": 2.296875, "learning_rate": 4.86553022838499e-05, "loss": 1.0566656494140625, "step": 5900, "token_acc": 0.7469262925837835 }, { "epoch": 1.0844093491939766, "grad_norm": 2.1875, "learning_rate": 4.716237685602735e-05, "loss": 1.073977508544922, "step": 6000, "token_acc": 0.7416494081610407 }, { "epoch": 1.1024841563018946, "grad_norm": 2.359375, "learning_rate": 4.567198568975096e-05, "loss": 1.0501838684082032, "step": 6100, "token_acc": 0.7475906446092413 }, { "epoch": 1.1205589634098123, "grad_norm": 1.9453125, "learning_rate": 4.418545984303294e-05, "loss": 1.0559381866455078, "step": 6200, "token_acc": 0.7470775683736796 }, { "epoch": 1.1386337705177303, "grad_norm": 2.09375, "learning_rate": 4.2704126921795424e-05, "loss": 1.0539588928222656, "step": 6300, "token_acc": 0.7466571018782976 }, { "epoch": 1.1567085776256483, "grad_norm": 2.125, "learning_rate": 4.1229309894194806e-05, "loss": 1.0646955108642577, "step": 6400, "token_acc": 0.7450717568377625 }, { "epoch": 1.174783384733566, "grad_norm": 1.8828125, "learning_rate": 3.976232590908812e-05, "loss": 1.0539531707763672, "step": 6500, "token_acc": 0.7465753023540981 }, { "epoch": 1.192858191841484, "grad_norm": 2.25, "learning_rate": 3.830448511969638e-05, "loss": 1.0504056549072265, "step": 6600, "token_acc": 0.7470591527899016 }, { "epoch": 1.2109329989494018, "grad_norm": 1.9921875, "learning_rate": 3.6857089513516035e-05, "loss": 1.0537297821044922, "step": 6700, "token_acc": 0.746264189895728 }, { "epoch": 1.2290078060573197, "grad_norm": 2.078125, "learning_rate": 3.542143174952282e-05, "loss": 1.0569972229003906, "step": 6800, "token_acc": 0.7456408105039157 }, { "epoch": 1.2470826131652377, "grad_norm": 2.078125, "learning_rate": 3.399879400370704e-05, "loss": 1.0594657897949218, "step": 6900, "token_acc": 0.7464631930608675 }, { "epoch": 1.2651574202731555, "grad_norm": 2.296875, "learning_rate": 3.259044682397107e-05, "loss": 1.059138946533203, "step": 7000, "token_acc": 0.7466058265866723 }, { "epoch": 1.2832322273810735, "grad_norm": 2.140625, "learning_rate": 3.119764799541187e-05, "loss": 1.0547212982177734, "step": 7100, "token_acc": 0.7473527098438676 }, { "epoch": 1.3013070344889912, "grad_norm": 2.125, "learning_rate": 2.9821641417001806e-05, "loss": 1.0402613067626953, "step": 7200, "token_acc": 0.7493453306325137 }, { "epoch": 1.3193818415969092, "grad_norm": 2.140625, "learning_rate": 2.846365599067111e-05, "loss": 1.045955352783203, "step": 7300, "token_acc": 0.7483038206918617 }, { "epoch": 1.3374566487048272, "grad_norm": 2.203125, "learning_rate": 2.7124904523784144e-05, "loss": 1.0378961944580078, "step": 7400, "token_acc": 0.7502343443728431 }, { "epoch": 1.355531455812745, "grad_norm": 2.46875, "learning_rate": 2.580658264598942e-05, "loss": 1.0476718902587892, "step": 7500, "token_acc": 0.7483254533842428 }, { "epoch": 1.373606262920663, "grad_norm": 2.109375, "learning_rate": 2.450986774141123e-05, "loss": 1.0452989196777345, "step": 7600, "token_acc": 0.7482226148441731 }, { "epoch": 1.3916810700285809, "grad_norm": 2.140625, "learning_rate": 2.3235917897135934e-05, "loss": 1.0406829833984375, "step": 7700, "token_acc": 0.74961852670988 }, { "epoch": 1.4097558771364986, "grad_norm": 2.3125, "learning_rate": 2.1985870868932456e-05, "loss": 1.0259892272949218, "step": 7800, "token_acc": 0.7533507908450509 }, { "epoch": 1.4278306842444166, "grad_norm": 2.421875, "learning_rate": 2.076084306513049e-05, "loss": 1.0363540649414062, "step": 7900, "token_acc": 0.7503043904274775 }, { "epoch": 1.4459054913523346, "grad_norm": 1.984375, "learning_rate": 1.9561928549563968e-05, "loss": 1.0644924926757813, "step": 8000, "token_acc": 0.7456255842220881 }, { "epoch": 1.4639802984602523, "grad_norm": 2.125, "learning_rate": 1.839019806447024e-05, "loss": 1.0405005645751952, "step": 8100, "token_acc": 0.7495747171714061 }, { "epoch": 1.4820551055681703, "grad_norm": 2.09375, "learning_rate": 1.724669807421762e-05, "loss": 1.0440809631347656, "step": 8200, "token_acc": 0.7486369722470563 }, { "epoch": 1.5001299126760883, "grad_norm": 2.125, "learning_rate": 1.6132449830715263e-05, "loss": 1.0530775451660157, "step": 8300, "token_acc": 0.7477425947235785 }, { "epoch": 1.518204719784006, "grad_norm": 2.109375, "learning_rate": 1.5048448461340258e-05, "loss": 1.0460784912109375, "step": 8400, "token_acc": 0.7489244523851678 }, { "epoch": 1.5362795268919238, "grad_norm": 2.125, "learning_rate": 1.3995662080196215e-05, "loss": 1.0327759552001954, "step": 8500, "token_acc": 0.7517219276186928 }, { "epoch": 1.5543543339998418, "grad_norm": 1.8125, "learning_rate": 1.2975030923497262e-05, "loss": 1.037949981689453, "step": 8600, "token_acc": 0.7504533222058104 }, { "epoch": 1.5724291411077598, "grad_norm": 2.109375, "learning_rate": 1.1987466509849655e-05, "loss": 1.0523592376708983, "step": 8700, "token_acc": 0.7466913343954412 }, { "epoch": 1.5905039482156775, "grad_norm": 2.265625, "learning_rate": 1.1033850826180781e-05, "loss": 1.048785171508789, "step": 8800, "token_acc": 0.7472665235971139 }, { "epoch": 1.6085787553235955, "grad_norm": 2.0, "learning_rate": 1.0115035540042784e-05, "loss": 1.032520523071289, "step": 8900, "token_acc": 0.7507173368953239 }, { "epoch": 1.6266535624315135, "grad_norm": 2.21875, "learning_rate": 9.231841238994194e-06, "loss": 1.043864974975586, "step": 9000, "token_acc": 0.7493066372338768 }, { "epoch": 1.6447283695394312, "grad_norm": 1.90625, "learning_rate": 8.385056697738796e-06, "loss": 1.0542935943603515, "step": 9100, "token_acc": 0.7465372369884397 }, { "epoch": 1.6628031766473492, "grad_norm": 1.984375, "learning_rate": 7.575438173676513e-06, "loss": 1.031275177001953, "step": 9200, "token_acc": 0.7517255178380788 }, { "epoch": 1.6808779837552672, "grad_norm": 2.125, "learning_rate": 6.803708731495117e-06, "loss": 1.0453128814697266, "step": 9300, "token_acc": 0.7487111883579803 }, { "epoch": 1.698952790863185, "grad_norm": 1.8984375, "learning_rate": 6.070557597406163e-06, "loss": 1.0431288146972657, "step": 9400, "token_acc": 0.7488178364241926 }, { "epoch": 1.717027597971103, "grad_norm": 1.96875, "learning_rate": 5.376639543601858e-06, "loss": 1.0395802307128905, "step": 9500, "token_acc": 0.7503542725542914 }, { "epoch": 1.735102405079021, "grad_norm": 2.078125, "learning_rate": 4.722574303482557e-06, "loss": 1.0538075256347657, "step": 9600, "token_acc": 0.7472696432430282 }, { "epoch": 1.7531772121869387, "grad_norm": 2.234375, "learning_rate": 4.1089460181771675e-06, "loss": 1.046026382446289, "step": 9700, "token_acc": 0.7478882170852356 }, { "epoch": 1.7712520192948564, "grad_norm": 2.15625, "learning_rate": 3.5363027148507423e-06, "loss": 1.0465138244628907, "step": 9800, "token_acc": 0.7479565299418576 }, { "epoch": 1.7893268264027746, "grad_norm": 2.28125, "learning_rate": 3.0051558172652316e-06, "loss": 1.0322959899902344, "step": 9900, "token_acc": 0.752127805838985 }, { "epoch": 1.8074016335106924, "grad_norm": 2.21875, "learning_rate": 2.5159796890304564e-06, "loss": 1.0446186065673828, "step": 10000, "token_acc": 0.7488376764944241 }, { "epoch": 1.8254764406186101, "grad_norm": 2.0625, "learning_rate": 2.069211209953287e-06, "loss": 1.0309945678710937, "step": 10100, "token_acc": 0.7524413140197763 }, { "epoch": 1.8435512477265281, "grad_norm": 1.8203125, "learning_rate": 1.6652493858632823e-06, "loss": 1.0363735198974608, "step": 10200, "token_acc": 0.7503588935333034 }, { "epoch": 1.861626054834446, "grad_norm": 2.296875, "learning_rate": 1.3044549922633876e-06, "loss": 1.0336082458496094, "step": 10300, "token_acc": 0.7506362801039103 }, { "epoch": 1.8797008619423639, "grad_norm": 2.046875, "learning_rate": 9.871502521237975e-07, "loss": 1.0328756713867187, "step": 10400, "token_acc": 0.7518247545659716 }, { "epoch": 1.8977756690502818, "grad_norm": 1.8671875, "learning_rate": 7.136185481068925e-07, "loss": 1.0489426422119141, "step": 10500, "token_acc": 0.747691160714326 }, { "epoch": 1.9158504761581998, "grad_norm": 1.765625, "learning_rate": 4.841041694801208e-07, "loss": 1.0270442962646484, "step": 10600, "token_acc": 0.7519379627407866 }, { "epoch": 1.9339252832661176, "grad_norm": 2.0, "learning_rate": 2.988120939429684e-07, "loss": 1.0303496551513671, "step": 10700, "token_acc": 0.7513509325616106 }, { "epoch": 1.9520000903740355, "grad_norm": 1.890625, "learning_rate": 1.5790780456277355e-07, "loss": 1.0394702911376954, "step": 10800, "token_acc": 0.7495070401423912 }, { "epoch": 1.9700748974819535, "grad_norm": 2.390625, "learning_rate": 6.15171419829752e-08, "loss": 1.0353932189941406, "step": 10900, "token_acc": 0.7506620155660084 }, { "epoch": 1.9881497045898713, "grad_norm": 2.15625, "learning_rate": 9.726192035691694e-09, "loss": 1.0364900970458983, "step": 11000, "token_acc": 0.7511165613553197 } ], "logging_steps": 100, "max_steps": 11066, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.597718898108247e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }