{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.919056052845723, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.9638157235458493, "epoch": 0.007658800440381025, "grad_norm": 8.468399047851562, "learning_rate": 1.125e-06, "loss": 1.000040054321289, "mean_token_accuracy": 0.784000868536532, "num_tokens": 2256601.0, "step": 10 }, { "entropy": 0.8869377555325627, "epoch": 0.01531760088076205, "grad_norm": 0.4561779201030731, "learning_rate": 2.375e-06, "loss": 0.8214498519897461, "mean_token_accuracy": 0.8017587121576071, "num_tokens": 4548350.0, "step": 20 }, { "entropy": 0.6845712042413652, "epoch": 0.022976401321143074, "grad_norm": 0.07862971723079681, "learning_rate": 3.625e-06, "loss": 0.6381561279296875, "mean_token_accuracy": 0.8316645501181483, "num_tokens": 6813579.0, "step": 30 }, { "entropy": 0.619562475476414, "epoch": 0.0306352017615241, "grad_norm": 0.04751257970929146, "learning_rate": 4.875e-06, "loss": 0.5589711666107178, "mean_token_accuracy": 0.8444838780909777, "num_tokens": 9161357.0, "step": 40 }, { "entropy": 0.5984557962045074, "epoch": 0.038294002201905125, "grad_norm": 0.08734887838363647, "learning_rate": 4.964454976303318e-06, "loss": 0.5401619911193848, "mean_token_accuracy": 0.8480381922796368, "num_tokens": 11470288.0, "step": 50 }, { "entropy": 0.5861514512449503, "epoch": 0.04595280264228615, "grad_norm": 0.0356341153383255, "learning_rate": 4.924960505529227e-06, "loss": 0.5160280227661133, "mean_token_accuracy": 0.8562987703830004, "num_tokens": 13763429.0, "step": 60 }, { "entropy": 0.5815956988371909, "epoch": 0.05361160308266718, "grad_norm": 0.06630558520555496, "learning_rate": 4.885466034755134e-06, "loss": 0.5006961822509766, "mean_token_accuracy": 0.8571984633803368, "num_tokens": 16017906.0, "step": 70 }, { "entropy": 0.5651088379323482, "epoch": 0.0612704035230482, "grad_norm": 0.026391323655843735, "learning_rate": 4.845971563981043e-06, "loss": 0.4796905517578125, "mean_token_accuracy": 0.86026117708534, "num_tokens": 18302237.0, "step": 80 }, { "entropy": 0.5485505453310907, "epoch": 0.06892920396342923, "grad_norm": 0.16785100102424622, "learning_rate": 4.806477093206952e-06, "loss": 0.47562880516052247, "mean_token_accuracy": 0.8600230842828751, "num_tokens": 20613396.0, "step": 90 }, { "entropy": 0.5333567421883345, "epoch": 0.07658800440381025, "grad_norm": 0.03617825359106064, "learning_rate": 4.766982622432859e-06, "loss": 0.467279052734375, "mean_token_accuracy": 0.8615671737119556, "num_tokens": 22894278.0, "step": 100 }, { "entropy": 0.5182453896850348, "epoch": 0.08424680484419128, "grad_norm": 0.06548844277858734, "learning_rate": 4.727488151658769e-06, "loss": 0.458143424987793, "mean_token_accuracy": 0.8627345286309719, "num_tokens": 25206563.0, "step": 110 }, { "entropy": 0.5145570897497237, "epoch": 0.0919056052845723, "grad_norm": 0.03264116495847702, "learning_rate": 4.6879936808846766e-06, "loss": 0.45707268714904786, "mean_token_accuracy": 0.8632168389856816, "num_tokens": 27484040.0, "step": 120 }, { "entropy": 0.5025211286731064, "epoch": 0.09956440572495333, "grad_norm": 0.027719072997570038, "learning_rate": 4.648499210110584e-06, "loss": 0.4481193542480469, "mean_token_accuracy": 0.8667424885556102, "num_tokens": 29792202.0, "step": 130 }, { "entropy": 0.49839433738961814, "epoch": 0.10722320616533436, "grad_norm": 0.06785497069358826, "learning_rate": 4.609004739336494e-06, "loss": 0.4417428016662598, "mean_token_accuracy": 0.8677690284326672, "num_tokens": 32107572.0, "step": 140 }, { "entropy": 0.49580598613247273, "epoch": 0.11488200660571538, "grad_norm": 0.06671962141990662, "learning_rate": 4.5695102685624015e-06, "loss": 0.4398933410644531, "mean_token_accuracy": 0.8677896987646818, "num_tokens": 34407548.0, "step": 150 }, { "entropy": 0.49476480865851047, "epoch": 0.1225408070460964, "grad_norm": 0.02333025634288788, "learning_rate": 4.53001579778831e-06, "loss": 0.4360370635986328, "mean_token_accuracy": 0.8685005273669958, "num_tokens": 36664288.0, "step": 160 }, { "entropy": 0.48315772889181974, "epoch": 0.13019960748647744, "grad_norm": 0.022907257080078125, "learning_rate": 4.490521327014219e-06, "loss": 0.4286161422729492, "mean_token_accuracy": 0.8704841218888759, "num_tokens": 38965655.0, "step": 170 }, { "entropy": 0.4817703261971474, "epoch": 0.13785840792685847, "grad_norm": 0.02129477635025978, "learning_rate": 4.4510268562401265e-06, "loss": 0.42794160842895507, "mean_token_accuracy": 0.8699940867722035, "num_tokens": 41234311.0, "step": 180 }, { "entropy": 0.47914891233667734, "epoch": 0.14551720836723947, "grad_norm": 0.018593771383166313, "learning_rate": 4.411532385466035e-06, "loss": 0.42693591117858887, "mean_token_accuracy": 0.869681833870709, "num_tokens": 43517016.0, "step": 190 }, { "entropy": 0.46877209544181825, "epoch": 0.1531760088076205, "grad_norm": 0.054788339883089066, "learning_rate": 4.372037914691944e-06, "loss": 0.4187319755554199, "mean_token_accuracy": 0.8715709690004587, "num_tokens": 45813790.0, "step": 200 }, { "entropy": 0.4654896330088377, "epoch": 0.16083480924800153, "grad_norm": 0.028184032067656517, "learning_rate": 4.332543443917852e-06, "loss": 0.42058391571044923, "mean_token_accuracy": 0.8709377760067583, "num_tokens": 48144428.0, "step": 210 }, { "entropy": 0.465091020707041, "epoch": 0.16849360968838256, "grad_norm": 0.020246045663952827, "learning_rate": 4.29304897314376e-06, "loss": 0.42343916893005373, "mean_token_accuracy": 0.8702428733929992, "num_tokens": 50411268.0, "step": 220 }, { "entropy": 0.4618240131996572, "epoch": 0.1761524101287636, "grad_norm": 0.019168304279446602, "learning_rate": 4.253554502369669e-06, "loss": 0.42192907333374025, "mean_token_accuracy": 0.8700458832085133, "num_tokens": 52712022.0, "step": 230 }, { "entropy": 0.4555769263766706, "epoch": 0.1838112105691446, "grad_norm": 0.028189843520522118, "learning_rate": 4.214060031595577e-06, "loss": 0.41624298095703127, "mean_token_accuracy": 0.8716806696727872, "num_tokens": 55014306.0, "step": 240 }, { "entropy": 0.44512661090120675, "epoch": 0.19147001100952563, "grad_norm": 0.05628414452075958, "learning_rate": 4.174565560821485e-06, "loss": 0.4077006340026855, "mean_token_accuracy": 0.8738998031243682, "num_tokens": 57332962.0, "step": 250 }, { "entropy": 0.4452826808206737, "epoch": 0.19912881144990666, "grad_norm": 0.03325086459517479, "learning_rate": 4.135071090047394e-06, "loss": 0.4117462158203125, "mean_token_accuracy": 0.872441022284329, "num_tokens": 59617425.0, "step": 260 }, { "entropy": 0.4452002733945847, "epoch": 0.2067876118902877, "grad_norm": 0.025545459240674973, "learning_rate": 4.095576619273302e-06, "loss": 0.4101984977722168, "mean_token_accuracy": 0.8726631047204136, "num_tokens": 61923455.0, "step": 270 }, { "entropy": 0.4457569817081094, "epoch": 0.21444641233066872, "grad_norm": 0.031541287899017334, "learning_rate": 4.05608214849921e-06, "loss": 0.41226825714111326, "mean_token_accuracy": 0.8723130978643894, "num_tokens": 64205792.0, "step": 280 }, { "entropy": 0.43516338849440217, "epoch": 0.22210521277104975, "grad_norm": 0.0174368005245924, "learning_rate": 4.0165876777251185e-06, "loss": 0.40465373992919923, "mean_token_accuracy": 0.8751402111724019, "num_tokens": 66513837.0, "step": 290 }, { "entropy": 0.4423691611737013, "epoch": 0.22976401321143075, "grad_norm": 0.017861908301711082, "learning_rate": 3.977093206951027e-06, "loss": 0.4081140041351318, "mean_token_accuracy": 0.8730683302506804, "num_tokens": 68807723.0, "step": 300 }, { "entropy": 0.43804038101807236, "epoch": 0.23742281365181178, "grad_norm": 0.01862718164920807, "learning_rate": 3.937598736176936e-06, "loss": 0.4079318046569824, "mean_token_accuracy": 0.8735693100839853, "num_tokens": 71105215.0, "step": 310 }, { "entropy": 0.4372365009970963, "epoch": 0.2450816140921928, "grad_norm": 0.01921224780380726, "learning_rate": 3.898104265402844e-06, "loss": 0.4054897308349609, "mean_token_accuracy": 0.8743662687018514, "num_tokens": 73443927.0, "step": 320 }, { "entropy": 0.4320651748217642, "epoch": 0.25274041453257384, "grad_norm": 0.01882867142558098, "learning_rate": 3.858609794628752e-06, "loss": 0.40141096115112307, "mean_token_accuracy": 0.8753820607438684, "num_tokens": 75767699.0, "step": 330 }, { "entropy": 0.43937554229050874, "epoch": 0.2603992149729549, "grad_norm": 0.017837367951869965, "learning_rate": 3.819115323854661e-06, "loss": 0.40896854400634763, "mean_token_accuracy": 0.8731667961925268, "num_tokens": 78059352.0, "step": 340 }, { "entropy": 0.4369427585974336, "epoch": 0.2680580154133359, "grad_norm": 0.02532646618783474, "learning_rate": 3.779620853080569e-06, "loss": 0.40844316482543946, "mean_token_accuracy": 0.8738381527364254, "num_tokens": 80348013.0, "step": 350 }, { "entropy": 0.4335848417133093, "epoch": 0.27571681585371693, "grad_norm": 0.031413592398166656, "learning_rate": 3.7401263823064775e-06, "loss": 0.4057870388031006, "mean_token_accuracy": 0.8742154082283378, "num_tokens": 82659712.0, "step": 360 }, { "entropy": 0.43348568454384806, "epoch": 0.2833756162940979, "grad_norm": 0.018019968643784523, "learning_rate": 3.7006319115323856e-06, "loss": 0.4047835826873779, "mean_token_accuracy": 0.8740507639944554, "num_tokens": 84980573.0, "step": 370 }, { "entropy": 0.4382829017937183, "epoch": 0.29103441673447894, "grad_norm": 0.03078663907945156, "learning_rate": 3.661137440758294e-06, "loss": 0.4107412338256836, "mean_token_accuracy": 0.8724170258268714, "num_tokens": 87201556.0, "step": 380 }, { "entropy": 0.43776033921167257, "epoch": 0.29869321717485997, "grad_norm": 0.02386470139026642, "learning_rate": 3.6216429699842024e-06, "loss": 0.4104489326477051, "mean_token_accuracy": 0.8728113612160087, "num_tokens": 89478842.0, "step": 390 }, { "entropy": 0.43476897608488796, "epoch": 0.306352017615241, "grad_norm": 0.01759020984172821, "learning_rate": 3.5821484992101106e-06, "loss": 0.40773825645446776, "mean_token_accuracy": 0.8733439993113279, "num_tokens": 91786832.0, "step": 400 }, { "entropy": 0.4269565034657717, "epoch": 0.31401081805562203, "grad_norm": 0.02766292169690132, "learning_rate": 3.5426540284360196e-06, "loss": 0.40129985809326174, "mean_token_accuracy": 0.8752415424212814, "num_tokens": 94116280.0, "step": 410 }, { "entropy": 0.43023997033014894, "epoch": 0.32166961849600306, "grad_norm": 0.02584155462682247, "learning_rate": 3.5031595576619278e-06, "loss": 0.4028194427490234, "mean_token_accuracy": 0.8748508550226688, "num_tokens": 96390327.0, "step": 420 }, { "entropy": 0.4311048804782331, "epoch": 0.3293284189363841, "grad_norm": 0.030514976009726524, "learning_rate": 3.463665086887836e-06, "loss": 0.40514497756958007, "mean_token_accuracy": 0.8740883070975543, "num_tokens": 98693689.0, "step": 430 }, { "entropy": 0.42613045433536173, "epoch": 0.3369872193767651, "grad_norm": 0.028530791401863098, "learning_rate": 3.4241706161137446e-06, "loss": 0.3992255449295044, "mean_token_accuracy": 0.8751774175092578, "num_tokens": 101002410.0, "step": 440 }, { "entropy": 0.4248706246726215, "epoch": 0.34464601981714615, "grad_norm": 0.019795197993516922, "learning_rate": 3.3846761453396527e-06, "loss": 0.3998436450958252, "mean_token_accuracy": 0.8754259610548616, "num_tokens": 103306507.0, "step": 450 }, { "entropy": 0.42447849148884415, "epoch": 0.3523048202575272, "grad_norm": 0.026115981861948967, "learning_rate": 3.3451816745655613e-06, "loss": 0.39901156425476075, "mean_token_accuracy": 0.8751692553982139, "num_tokens": 105628688.0, "step": 460 }, { "entropy": 0.42756049148738384, "epoch": 0.3599636206979082, "grad_norm": 0.024642089381814003, "learning_rate": 3.3056872037914695e-06, "loss": 0.40213947296142577, "mean_token_accuracy": 0.8749727945774793, "num_tokens": 107893640.0, "step": 470 }, { "entropy": 0.42482230020686984, "epoch": 0.3676224211382892, "grad_norm": 0.01740400120615959, "learning_rate": 3.2661927330173777e-06, "loss": 0.39930453300476076, "mean_token_accuracy": 0.8750920739024878, "num_tokens": 110182930.0, "step": 480 }, { "entropy": 0.4245555128902197, "epoch": 0.3752812215786702, "grad_norm": 0.01773119531571865, "learning_rate": 3.2266982622432863e-06, "loss": 0.4016741752624512, "mean_token_accuracy": 0.875009255297482, "num_tokens": 112454989.0, "step": 490 }, { "entropy": 0.42941147135570645, "epoch": 0.38294002201905125, "grad_norm": 0.024097498506307602, "learning_rate": 3.1872037914691945e-06, "loss": 0.4024195671081543, "mean_token_accuracy": 0.8748315701261162, "num_tokens": 114752265.0, "step": 500 }, { "entropy": 0.4233192947693169, "epoch": 0.3905988224594323, "grad_norm": 0.020476436242461205, "learning_rate": 3.147709320695103e-06, "loss": 0.39432778358459475, "mean_token_accuracy": 0.8766942717134952, "num_tokens": 117042981.0, "step": 510 }, { "entropy": 0.4223570663481951, "epoch": 0.3982576228998133, "grad_norm": 0.023389821872115135, "learning_rate": 3.1082148499210112e-06, "loss": 0.3961241006851196, "mean_token_accuracy": 0.875825615786016, "num_tokens": 119360637.0, "step": 520 }, { "entropy": 0.42124154828488825, "epoch": 0.40591642334019434, "grad_norm": 0.024319512769579887, "learning_rate": 3.0687203791469194e-06, "loss": 0.3995026111602783, "mean_token_accuracy": 0.8755284296348691, "num_tokens": 121646498.0, "step": 530 }, { "entropy": 0.4242598842829466, "epoch": 0.4135752237805754, "grad_norm": 0.021054713055491447, "learning_rate": 3.029225908372828e-06, "loss": 0.39807853698730467, "mean_token_accuracy": 0.8752687338739633, "num_tokens": 123920056.0, "step": 540 }, { "entropy": 0.4243672636337578, "epoch": 0.4212340242209564, "grad_norm": 0.02523292973637581, "learning_rate": 2.989731437598736e-06, "loss": 0.4009854316711426, "mean_token_accuracy": 0.875128398090601, "num_tokens": 126207207.0, "step": 550 }, { "entropy": 0.42228690376505257, "epoch": 0.42889282466133744, "grad_norm": 0.01712065190076828, "learning_rate": 2.950236966824645e-06, "loss": 0.3969071865081787, "mean_token_accuracy": 0.8754786295816303, "num_tokens": 128477832.0, "step": 560 }, { "entropy": 0.4186239805072546, "epoch": 0.43655162510171847, "grad_norm": 0.0163181871175766, "learning_rate": 2.910742496050553e-06, "loss": 0.3907261848449707, "mean_token_accuracy": 0.8775882260873914, "num_tokens": 130769237.0, "step": 570 }, { "entropy": 0.42039443040266633, "epoch": 0.4442104255420995, "grad_norm": 0.02180050127208233, "learning_rate": 2.871248025276461e-06, "loss": 0.397492790222168, "mean_token_accuracy": 0.876148846000433, "num_tokens": 133031465.0, "step": 580 }, { "entropy": 0.4161387952044606, "epoch": 0.45186922598248047, "grad_norm": 0.017672181129455566, "learning_rate": 2.83175355450237e-06, "loss": 0.3924778699874878, "mean_token_accuracy": 0.8767253663390875, "num_tokens": 135322215.0, "step": 590 }, { "entropy": 0.42231198167428374, "epoch": 0.4595280264228615, "grad_norm": 0.031108738854527473, "learning_rate": 2.7922590837282783e-06, "loss": 0.3990061283111572, "mean_token_accuracy": 0.8758387329056859, "num_tokens": 137590880.0, "step": 600 }, { "entropy": 0.4186031956225634, "epoch": 0.46718682686324253, "grad_norm": 0.020063655450940132, "learning_rate": 2.752764612954187e-06, "loss": 0.3937615156173706, "mean_token_accuracy": 0.8771176159381866, "num_tokens": 139882957.0, "step": 610 }, { "entropy": 0.4200515809468925, "epoch": 0.47484562730362356, "grad_norm": 0.01840071938931942, "learning_rate": 2.713270142180095e-06, "loss": 0.39367630481719973, "mean_token_accuracy": 0.8769001543521882, "num_tokens": 142150557.0, "step": 620 }, { "entropy": 0.41964845787733795, "epoch": 0.4825044277440046, "grad_norm": 0.03402973338961601, "learning_rate": 2.6737756714060033e-06, "loss": 0.3966160535812378, "mean_token_accuracy": 0.876146792806685, "num_tokens": 144417758.0, "step": 630 }, { "entropy": 0.41940504405647516, "epoch": 0.4901632281843856, "grad_norm": 0.01644454151391983, "learning_rate": 2.634281200631912e-06, "loss": 0.3938936710357666, "mean_token_accuracy": 0.8767383242025971, "num_tokens": 146708789.0, "step": 640 }, { "entropy": 0.4258418914861977, "epoch": 0.49782202862476665, "grad_norm": 0.028102336451411247, "learning_rate": 2.59478672985782e-06, "loss": 0.40206151008605956, "mean_token_accuracy": 0.8747876984998584, "num_tokens": 149020891.0, "step": 650 }, { "entropy": 0.4147974385879934, "epoch": 0.5054808290651477, "grad_norm": 0.022107699885964394, "learning_rate": 2.5552922590837287e-06, "loss": 0.3919835090637207, "mean_token_accuracy": 0.8775576103478671, "num_tokens": 151299185.0, "step": 660 }, { "entropy": 0.41798324035480616, "epoch": 0.5131396295055287, "grad_norm": 0.016613123938441277, "learning_rate": 2.515797788309637e-06, "loss": 0.39252438545227053, "mean_token_accuracy": 0.8769917484372854, "num_tokens": 153586618.0, "step": 670 }, { "entropy": 0.42186861447989943, "epoch": 0.5207984299459097, "grad_norm": 0.01724848710000515, "learning_rate": 2.4763033175355454e-06, "loss": 0.39810571670532224, "mean_token_accuracy": 0.8758242284879089, "num_tokens": 155856504.0, "step": 680 }, { "entropy": 0.41550648426637055, "epoch": 0.5284572303862908, "grad_norm": 0.023920124396681786, "learning_rate": 2.4368088467614536e-06, "loss": 0.3909403085708618, "mean_token_accuracy": 0.8782436966896057, "num_tokens": 158153561.0, "step": 690 }, { "entropy": 0.416356707457453, "epoch": 0.5361160308266718, "grad_norm": 0.025771064683794975, "learning_rate": 2.397314375987362e-06, "loss": 0.3935497522354126, "mean_token_accuracy": 0.8770827081054449, "num_tokens": 160414992.0, "step": 700 }, { "entropy": 0.4169067163951695, "epoch": 0.5437748312670528, "grad_norm": 0.0343230739235878, "learning_rate": 2.3578199052132704e-06, "loss": 0.3934483528137207, "mean_token_accuracy": 0.8766279637813568, "num_tokens": 162719857.0, "step": 710 }, { "entropy": 0.4152266987599432, "epoch": 0.5514336317074339, "grad_norm": 0.03518790379166603, "learning_rate": 2.3183254344391786e-06, "loss": 0.39179143905639646, "mean_token_accuracy": 0.8775566022843122, "num_tokens": 164998717.0, "step": 720 }, { "entropy": 0.41512856343761084, "epoch": 0.5590924321478149, "grad_norm": 0.017510589212179184, "learning_rate": 2.278830963665087e-06, "loss": 0.3936178207397461, "mean_token_accuracy": 0.8766550052911043, "num_tokens": 167283423.0, "step": 730 }, { "entropy": 0.41410760041326283, "epoch": 0.5667512325881958, "grad_norm": 0.028657181188464165, "learning_rate": 2.2393364928909954e-06, "loss": 0.3910004377365112, "mean_token_accuracy": 0.8774017574265599, "num_tokens": 169574664.0, "step": 740 }, { "entropy": 0.41111645018681886, "epoch": 0.5744100330285768, "grad_norm": 0.017415538430213928, "learning_rate": 2.1998420221169035e-06, "loss": 0.3903531551361084, "mean_token_accuracy": 0.878159393183887, "num_tokens": 171918950.0, "step": 750 }, { "entropy": 0.4121713091619313, "epoch": 0.5820688334689579, "grad_norm": 0.016998812556266785, "learning_rate": 2.160347551342812e-06, "loss": 0.38824028968811036, "mean_token_accuracy": 0.8780525822192431, "num_tokens": 174167500.0, "step": 760 }, { "entropy": 0.41725681545212867, "epoch": 0.5897276339093389, "grad_norm": 0.02490777149796486, "learning_rate": 2.1208530805687207e-06, "loss": 0.3949615955352783, "mean_token_accuracy": 0.8761186260730028, "num_tokens": 176430133.0, "step": 770 }, { "entropy": 0.41779537945985795, "epoch": 0.5973864343497199, "grad_norm": 0.023372486233711243, "learning_rate": 2.081358609794629e-06, "loss": 0.3959986925125122, "mean_token_accuracy": 0.8762047516182065, "num_tokens": 178715515.0, "step": 780 }, { "entropy": 0.4114751876331866, "epoch": 0.605045234790101, "grad_norm": 0.01719413697719574, "learning_rate": 2.0418641390205375e-06, "loss": 0.3856500148773193, "mean_token_accuracy": 0.8787943137809634, "num_tokens": 180969485.0, "step": 790 }, { "entropy": 0.41450852565467355, "epoch": 0.612704035230482, "grad_norm": 0.01740705594420433, "learning_rate": 2.0023696682464457e-06, "loss": 0.3912628650665283, "mean_token_accuracy": 0.8774056326597929, "num_tokens": 183230970.0, "step": 800 }, { "entropy": 0.4187679937109351, "epoch": 0.620362835670863, "grad_norm": 0.021541906520724297, "learning_rate": 1.962875197472354e-06, "loss": 0.3923694610595703, "mean_token_accuracy": 0.8767649749293923, "num_tokens": 2285361.0, "step": 810 }, { "entropy": 0.41270146872848273, "epoch": 0.6280216361112441, "grad_norm": 0.015697607770562172, "learning_rate": 1.9233807266982625e-06, "loss": 0.3898160457611084, "mean_token_accuracy": 0.8781723350286483, "num_tokens": 4587240.0, "step": 820 }, { "entropy": 0.41292855991050603, "epoch": 0.6356804365516251, "grad_norm": 0.020294206216931343, "learning_rate": 1.8838862559241708e-06, "loss": 0.3890320062637329, "mean_token_accuracy": 0.8778593957424163, "num_tokens": 6872728.0, "step": 830 }, { "entropy": 0.411221484746784, "epoch": 0.6433392369920061, "grad_norm": 0.046215225011110306, "learning_rate": 1.8443917851500792e-06, "loss": 0.3870939970016479, "mean_token_accuracy": 0.8784448400139808, "num_tokens": 9160881.0, "step": 840 }, { "entropy": 0.41191457901149986, "epoch": 0.6509980374323872, "grad_norm": 0.01619116961956024, "learning_rate": 1.8048973143759876e-06, "loss": 0.3880185604095459, "mean_token_accuracy": 0.8788302283734083, "num_tokens": 11456890.0, "step": 850 }, { "entropy": 0.4106498261913657, "epoch": 0.6586568378727682, "grad_norm": 0.016796967014670372, "learning_rate": 1.7654028436018958e-06, "loss": 0.38854246139526366, "mean_token_accuracy": 0.8780328661203385, "num_tokens": 13779035.0, "step": 860 }, { "entropy": 0.41066021313890816, "epoch": 0.6663156383131492, "grad_norm": 0.10197632014751434, "learning_rate": 1.7259083728278042e-06, "loss": 0.3858454465866089, "mean_token_accuracy": 0.878836939483881, "num_tokens": 16050638.0, "step": 870 }, { "entropy": 0.41092505119740963, "epoch": 0.6739744387535302, "grad_norm": 0.019850848242640495, "learning_rate": 1.6864139020537126e-06, "loss": 0.38851447105407716, "mean_token_accuracy": 0.8778711641207337, "num_tokens": 18346546.0, "step": 880 }, { "entropy": 0.4060887537896633, "epoch": 0.6816332391939113, "grad_norm": 0.01596878468990326, "learning_rate": 1.646919431279621e-06, "loss": 0.38296055793762207, "mean_token_accuracy": 0.8796854361891746, "num_tokens": 20693646.0, "step": 890 }, { "entropy": 0.41415045112371446, "epoch": 0.6892920396342923, "grad_norm": 0.020703142508864403, "learning_rate": 1.6074249605055296e-06, "loss": 0.39114315509796144, "mean_token_accuracy": 0.8775241926312447, "num_tokens": 22979888.0, "step": 900 }, { "entropy": 0.4099827105179429, "epoch": 0.6969508400746733, "grad_norm": 0.018190376460552216, "learning_rate": 1.5679304897314377e-06, "loss": 0.3876938343048096, "mean_token_accuracy": 0.8783342713490129, "num_tokens": 25245863.0, "step": 910 }, { "entropy": 0.4111726184375584, "epoch": 0.7046096405150544, "grad_norm": 0.03450653702020645, "learning_rate": 1.5284360189573461e-06, "loss": 0.38763377666473386, "mean_token_accuracy": 0.8786343418061733, "num_tokens": 27522191.0, "step": 920 }, { "entropy": 0.41031082523986695, "epoch": 0.7122684409554354, "grad_norm": 0.019164785742759705, "learning_rate": 1.4889415481832545e-06, "loss": 0.38515076637268064, "mean_token_accuracy": 0.8786031175404787, "num_tokens": 29812189.0, "step": 930 }, { "entropy": 0.4064248114824295, "epoch": 0.7199272413958164, "grad_norm": 0.01673167012631893, "learning_rate": 1.449447077409163e-06, "loss": 0.38536901473999025, "mean_token_accuracy": 0.8789769830182195, "num_tokens": 32068083.0, "step": 940 }, { "entropy": 0.4080692335031927, "epoch": 0.7275860418361974, "grad_norm": 0.018068261444568634, "learning_rate": 1.4099526066350713e-06, "loss": 0.38577680587768554, "mean_token_accuracy": 0.879298797622323, "num_tokens": 34347274.0, "step": 950 }, { "entropy": 0.4161804819479585, "epoch": 0.7352448422765784, "grad_norm": 0.018596794456243515, "learning_rate": 1.3704581358609795e-06, "loss": 0.3958181858062744, "mean_token_accuracy": 0.8759849725291133, "num_tokens": 36614119.0, "step": 960 }, { "entropy": 0.4100917984731495, "epoch": 0.7429036427169594, "grad_norm": 0.022355731576681137, "learning_rate": 1.3309636650868879e-06, "loss": 0.38584303855895996, "mean_token_accuracy": 0.8787054903805256, "num_tokens": 38895883.0, "step": 970 }, { "entropy": 0.40771132363006474, "epoch": 0.7505624431573404, "grad_norm": 0.0167247261852026, "learning_rate": 1.2914691943127962e-06, "loss": 0.3863053321838379, "mean_token_accuracy": 0.8788028365001083, "num_tokens": 41161264.0, "step": 980 }, { "entropy": 0.4112453758716583, "epoch": 0.7582212435977215, "grad_norm": 0.017709029838442802, "learning_rate": 1.2519747235387048e-06, "loss": 0.38743517398834226, "mean_token_accuracy": 0.8780813764780759, "num_tokens": 43424897.0, "step": 990 }, { "entropy": 0.40866071078926325, "epoch": 0.7658800440381025, "grad_norm": 0.03608441352844238, "learning_rate": 1.212480252764613e-06, "loss": 0.38464813232421874, "mean_token_accuracy": 0.8790146630257368, "num_tokens": 45688164.0, "step": 1000 }, { "entropy": 0.406084228400141, "epoch": 0.7735388444784835, "grad_norm": 0.01752273179590702, "learning_rate": 1.1729857819905214e-06, "loss": 0.3844747543334961, "mean_token_accuracy": 0.8798089537769556, "num_tokens": 48021969.0, "step": 1010 }, { "entropy": 0.4065625052899122, "epoch": 0.7811976449188646, "grad_norm": 0.015777474269270897, "learning_rate": 1.1334913112164298e-06, "loss": 0.3854344844818115, "mean_token_accuracy": 0.8788379110395909, "num_tokens": 50319637.0, "step": 1020 }, { "entropy": 0.41205244278535247, "epoch": 0.7888564453592456, "grad_norm": 0.02172328531742096, "learning_rate": 1.0939968404423382e-06, "loss": 0.3861358880996704, "mean_token_accuracy": 0.8784137150272727, "num_tokens": 52584409.0, "step": 1030 }, { "entropy": 0.4055585923604667, "epoch": 0.7965152457996266, "grad_norm": 0.015919683501124382, "learning_rate": 1.0545023696682466e-06, "loss": 0.38269662857055664, "mean_token_accuracy": 0.8797270691022276, "num_tokens": 54841155.0, "step": 1040 }, { "entropy": 0.4033643173985183, "epoch": 0.8041740462400077, "grad_norm": 0.016592318192124367, "learning_rate": 1.015007898894155e-06, "loss": 0.38209493160247804, "mean_token_accuracy": 0.8799605475738644, "num_tokens": 57158410.0, "step": 1050 }, { "entropy": 0.4045918888412416, "epoch": 0.8118328466803887, "grad_norm": 0.016041293740272522, "learning_rate": 9.755134281200633e-07, "loss": 0.3831462383270264, "mean_token_accuracy": 0.879774154163897, "num_tokens": 59496509.0, "step": 1060 }, { "entropy": 0.41076484909281136, "epoch": 0.8194916471207697, "grad_norm": 0.016471123322844505, "learning_rate": 9.360189573459716e-07, "loss": 0.38496901988983157, "mean_token_accuracy": 0.8790599407628179, "num_tokens": 61783974.0, "step": 1070 }, { "entropy": 0.40901572797447444, "epoch": 0.8271504475611507, "grad_norm": 0.01565743237733841, "learning_rate": 8.9652448657188e-07, "loss": 0.3854458570480347, "mean_token_accuracy": 0.8787517255172134, "num_tokens": 64100657.0, "step": 1080 }, { "entropy": 0.4073885683901608, "epoch": 0.8348092480015318, "grad_norm": 0.015450418926775455, "learning_rate": 8.570300157977884e-07, "loss": 0.38377454280853274, "mean_token_accuracy": 0.8794623363763094, "num_tokens": 66386805.0, "step": 1090 }, { "entropy": 0.4052654759958386, "epoch": 0.8424680484419128, "grad_norm": 0.015288250520825386, "learning_rate": 8.175355450236967e-07, "loss": 0.3812230348587036, "mean_token_accuracy": 0.8800647355616092, "num_tokens": 68721383.0, "step": 1100 }, { "entropy": 0.4113547313027084, "epoch": 0.8501268488822938, "grad_norm": 0.02288076840341091, "learning_rate": 7.780410742496052e-07, "loss": 0.3886786222457886, "mean_token_accuracy": 0.8778966784477233, "num_tokens": 70994578.0, "step": 1110 }, { "entropy": 0.4017532772384584, "epoch": 0.8577856493226749, "grad_norm": 0.03342736139893532, "learning_rate": 7.385466034755135e-07, "loss": 0.37719638347625734, "mean_token_accuracy": 0.88170400056988, "num_tokens": 73294302.0, "step": 1120 }, { "entropy": 0.4073382027447224, "epoch": 0.8654444497630559, "grad_norm": 0.015439708717167377, "learning_rate": 6.990521327014219e-07, "loss": 0.3865658283233643, "mean_token_accuracy": 0.8787333536893129, "num_tokens": 75596477.0, "step": 1130 }, { "entropy": 0.4074658391997218, "epoch": 0.8731032502034369, "grad_norm": 0.0176975317299366, "learning_rate": 6.595576619273302e-07, "loss": 0.3862590789794922, "mean_token_accuracy": 0.8786328813061118, "num_tokens": 77899557.0, "step": 1140 }, { "entropy": 0.40545356962829826, "epoch": 0.880762050643818, "grad_norm": 0.022836821153759956, "learning_rate": 6.200631911532385e-07, "loss": 0.3839853763580322, "mean_token_accuracy": 0.8795729441568255, "num_tokens": 80171019.0, "step": 1150 }, { "entropy": 0.4044990832917392, "epoch": 0.888420851084199, "grad_norm": 0.021449508145451546, "learning_rate": 5.80568720379147e-07, "loss": 0.38073570728302003, "mean_token_accuracy": 0.8800560528412461, "num_tokens": 82521264.0, "step": 1160 }, { "entropy": 0.40236521204933523, "epoch": 0.8960796515245799, "grad_norm": 0.02636878378689289, "learning_rate": 5.410742496050553e-07, "loss": 0.3819872379302979, "mean_token_accuracy": 0.8800198381766677, "num_tokens": 84803701.0, "step": 1170 }, { "entropy": 0.4063895161263645, "epoch": 0.9037384519649609, "grad_norm": 0.019358456134796143, "learning_rate": 5.015797788309637e-07, "loss": 0.38217973709106445, "mean_token_accuracy": 0.8796799056231975, "num_tokens": 87127376.0, "step": 1180 }, { "entropy": 0.40959922643378377, "epoch": 0.911397252405342, "grad_norm": 0.018155870959162712, "learning_rate": 4.6208530805687207e-07, "loss": 0.3856808662414551, "mean_token_accuracy": 0.8791890177875757, "num_tokens": 89417694.0, "step": 1190 }, { "entropy": 0.4059012939222157, "epoch": 0.919056052845723, "grad_norm": 0.019169267266988754, "learning_rate": 4.225908372827804e-07, "loss": 0.38498947620391843, "mean_token_accuracy": 0.879179273173213, "num_tokens": 91699734.0, "step": 1200 } ], "logging_steps": 10, "max_steps": 1306, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.114433819648459e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }