[ { "loss": 4.235999755859375, "grad_norm": 5.0625, "learning_rate": 1.9056785370548606e-06, "entropy": 1.2326550805568695, "num_tokens": 886962.0, "mean_token_accuracy": 0.5153479687372844, "epoch": 0.02888781896966779, "step": 100 }, { "loss": 1.0453578186035157, "grad_norm": 1.328125, "learning_rate": 3.83060635226179e-06, "entropy": 1.003088502685229, "num_tokens": 1772142.0, "mean_token_accuracy": 0.8255874119202296, "epoch": 0.05777563793933558, "step": 200 }, { "loss": 0.6848526000976562, "grad_norm": 0.384765625, "learning_rate": 5.75553416746872e-06, "entropy": 0.730368642906348, "num_tokens": 2658161.0, "mean_token_accuracy": 0.8729760211706161, "epoch": 0.08666345690900337, "step": 300 }, { "loss": 0.6384764099121094, "grad_norm": 0.5234375, "learning_rate": 7.68046198267565e-06, "entropy": 0.637606812864542, "num_tokens": 3544837.0, "mean_token_accuracy": 0.8758877144257228, "epoch": 0.11555127587867116, "step": 400 }, { "loss": 0.6139374160766602, "grad_norm": 0.287109375, "learning_rate": 9.605389797882581e-06, "entropy": 0.6128936386108399, "num_tokens": 4431654.0, "mean_token_accuracy": 0.8796136958400408, "epoch": 0.14443909484833894, "step": 500 }, { "eval_loss": 0.6138447523117065, "eval_runtime": 121.1778, "eval_samples_per_second": 111.522, "eval_steps_per_second": 18.593, "eval_entropy": 0.6253009630979774, "eval_num_tokens": 4431654.0, "eval_mean_token_accuracy": 0.8792604871500983, "epoch": 0.14443909484833894, "step": 500 }, { "loss": 0.5988116073608398, "grad_norm": 0.294921875, "learning_rate": 1.153031761308951e-05, "entropy": 0.599972769121329, "num_tokens": 5317947.0, "mean_token_accuracy": 0.8811199645201365, "epoch": 0.17332691381800674, "step": 600 }, { "loss": 0.5892007064819336, "grad_norm": 0.345703125, "learning_rate": 1.3455245428296439e-05, "entropy": 0.5934143226842086, "num_tokens": 6205303.0, "mean_token_accuracy": 0.8826212238272031, "epoch": 0.20221473278767452, "step": 700 }, { "loss": 0.579403953552246, "grad_norm": 0.244140625, "learning_rate": 1.538017324350337e-05, "entropy": 0.5813252867758274, "num_tokens": 7092051.0, "mean_token_accuracy": 0.8843332821130753, "epoch": 0.23110255175734232, "step": 800 }, { "loss": 0.5702339553833008, "grad_norm": 0.2158203125, "learning_rate": 1.73051010587103e-05, "entropy": 0.5745066069066525, "num_tokens": 7978376.0, "mean_token_accuracy": 0.884980032046636, "epoch": 0.2599903707270101, "step": 900 }, { "loss": 0.5615914154052735, "grad_norm": 0.326171875, "learning_rate": 1.923002887391723e-05, "entropy": 0.564577516814073, "num_tokens": 8865602.0, "mean_token_accuracy": 0.8863117796182632, "epoch": 0.2888781896966779, "step": 1000 }, { "eval_loss": 0.5681502223014832, "eval_runtime": 121.1382, "eval_samples_per_second": 111.559, "eval_steps_per_second": 18.599, "eval_entropy": 0.5520287772257805, "eval_num_tokens": 8865602.0, "eval_mean_token_accuracy": 0.8852167688789655, "epoch": 0.2888781896966779, "step": 1000 }, { "loss": 0.5591428756713868, "grad_norm": 0.2392578125, "learning_rate": 1.9997966645755392e-05, "entropy": 0.5630815910299619, "num_tokens": 9752872.0, "mean_token_accuracy": 0.8859697584311167, "epoch": 0.3177660086663457, "step": 1100 }, { "loss": 0.5547999954223632, "grad_norm": 0.2255859375, "learning_rate": 1.9985543586468365e-05, "entropy": 0.5587607964873313, "num_tokens": 10640416.0, "mean_token_accuracy": 0.886342776119709, "epoch": 0.3466538276360135, "step": 1200 }, { "loss": 0.5420237731933594, "grad_norm": 0.208984375, "learning_rate": 1.9961841124880657e-05, "entropy": 0.5386069346467653, "num_tokens": 11526809.0, "mean_token_accuracy": 0.8891533743341764, "epoch": 0.3755416466056813, "step": 1300 }, { "loss": 0.5437238693237305, "grad_norm": 0.20703125, "learning_rate": 1.9926886034657355e-05, "entropy": 0.5422710782289505, "num_tokens": 12412683.0, "mean_token_accuracy": 0.8881420868635178, "epoch": 0.40442946557534903, "step": 1400 }, { "loss": 0.538447380065918, "grad_norm": 0.189453125, "learning_rate": 1.9880717800131158e-05, "entropy": 0.5379938718179862, "num_tokens": 13297912.0, "mean_token_accuracy": 0.8887334618965784, "epoch": 0.43331728454501683, "step": 1500 }, { "eval_loss": 0.5418813228607178, "eval_runtime": 121.0107, "eval_samples_per_second": 111.676, "eval_steps_per_second": 18.618, "eval_entropy": 0.5439975674264547, "eval_num_tokens": 13297912.0, "eval_mean_token_accuracy": 0.8884000489831341, "epoch": 0.43331728454501683, "step": 1500 }, { "loss": 0.533262062072754, "grad_norm": 0.203125, "learning_rate": 1.9823388571701914e-05, "entropy": 0.5346410566071669, "num_tokens": 14184641.0, "mean_token_accuracy": 0.889172242085139, "epoch": 0.46220510351468463, "step": 1600 }, { "loss": 0.5277928924560547, "grad_norm": 0.2021484375, "learning_rate": 1.975496310692893e-05, "entropy": 0.528671110868454, "num_tokens": 15071591.0, "mean_token_accuracy": 0.889830404818058, "epoch": 0.49109292248435243, "step": 1700 }, { "loss": 0.5305094909667969, "grad_norm": 0.2294921875, "learning_rate": 1.9675518697382575e-05, "entropy": 0.5330572768549124, "num_tokens": 15958400.0, "mean_token_accuracy": 0.8894869204362233, "epoch": 0.5199807414540202, "step": 1800 }, { "loss": 0.5314492797851562, "grad_norm": 0.2080078125, "learning_rate": 1.958514508133779e-05, "entropy": 0.5313531577587127, "num_tokens": 16845090.0, "mean_token_accuracy": 0.8891615062952042, "epoch": 0.548868560423688, "step": 1900 }, { "loss": 0.5224239349365234, "grad_norm": 0.2265625, "learning_rate": 1.9483944342408145e-05, "entropy": 0.521418957610925, "num_tokens": 17731372.0, "mean_token_accuracy": 0.8913801329334577, "epoch": 0.5777563793933558, "step": 2000 }, { "eval_loss": 0.5295727849006653, "eval_runtime": 120.9976, "eval_samples_per_second": 111.688, "eval_steps_per_second": 18.62, "eval_entropy": 0.5356734260367755, "eval_num_tokens": 17731372.0, "eval_mean_token_accuracy": 0.889919996896533, "epoch": 0.5777563793933558, "step": 2000 }, { "loss": 0.5248940658569335, "grad_norm": 0.1904296875, "learning_rate": 1.9372030794234916e-05, "entropy": 0.5240081022679806, "num_tokens": 18620093.0, "mean_token_accuracy": 0.8901815564433734, "epoch": 0.6066441983630236, "step": 2100 }, { "loss": 0.5177249526977539, "grad_norm": 0.1884765625, "learning_rate": 1.9249530851361477e-05, "entropy": 0.5155584744115671, "num_tokens": 19506319.0, "mean_token_accuracy": 0.892069263557593, "epoch": 0.6355320173326914, "step": 2200 }, { "loss": 0.5167172622680664, "grad_norm": 0.2060546875, "learning_rate": 1.9116582886438787e-05, "entropy": 0.5149475511411826, "num_tokens": 20392582.0, "mean_token_accuracy": 0.8917997235059738, "epoch": 0.6644198363023591, "step": 2300 }, { "loss": 0.5186575698852539, "grad_norm": 0.2158203125, "learning_rate": 1.8973337073923406e-05, "entropy": 0.5176833253105482, "num_tokens": 21281188.0, "mean_token_accuracy": 0.891216093202432, "epoch": 0.693307655272027, "step": 2400 }, { "loss": 0.5165076446533203, "grad_norm": 0.208984375, "learning_rate": 1.881995522044441e-05, "entropy": 0.5149765905241172, "num_tokens": 22166818.0, "mean_token_accuracy": 0.8918550156553586, "epoch": 0.7221954742416947, "step": 2500 }, { "eval_loss": 0.5220824480056763, "eval_runtime": 120.9697, "eval_samples_per_second": 111.714, "eval_steps_per_second": 18.625, "eval_entropy": 0.5212392443902641, "eval_num_tokens": 22166818.0, "eval_mean_token_accuracy": 0.8910056220866075, "epoch": 0.7221954742416947, "step": 2500 }, { "loss": 0.5141453170776367, "grad_norm": 0.185546875, "learning_rate": 1.8656610582030976e-05, "entropy": 0.5123816165824732, "num_tokens": 23055206.0, "mean_token_accuracy": 0.8920613800485929, "epoch": 0.7510832932113626, "step": 2600 }, { "loss": 0.5139566802978516, "grad_norm": 0.193359375, "learning_rate": 1.848348766840703e-05, "entropy": 0.5132978439331055, "num_tokens": 23942436.0, "mean_token_accuracy": 0.8916262516379356, "epoch": 0.7799711121810303, "step": 2700 }, { "loss": 0.5115699005126954, "grad_norm": 0.2255859375, "learning_rate": 1.8300782034573984e-05, "entropy": 0.5114874669909477, "num_tokens": 24827923.0, "mean_token_accuracy": 0.8923087509473165, "epoch": 0.8088589311506981, "step": 2800 }, { "loss": 0.5094342803955079, "grad_norm": 0.1962890625, "learning_rate": 1.8108700059917083e-05, "entropy": 0.5071784610549609, "num_tokens": 25714586.0, "mean_token_accuracy": 0.8931817663709323, "epoch": 0.8377467501203659, "step": 2900 }, { "loss": 0.511685447692871, "grad_norm": 0.20703125, "learning_rate": 1.7907458715084743e-05, "entropy": 0.5103998957574367, "num_tokens": 26601471.0, "mean_token_accuracy": 0.8925204570094745, "epoch": 0.8666345690900337, "step": 3000 }, { "eval_loss": 0.5168190002441406, "eval_runtime": 121.3252, "eval_samples_per_second": 111.387, "eval_steps_per_second": 18.57, "eval_entropy": 0.5127926120560168, "eval_num_tokens": 26601471.0, "eval_mean_token_accuracy": 0.8917217899892682, "epoch": 0.8666345690900337, "step": 3000 }, { "loss": 0.510882911682129, "grad_norm": 0.2216796875, "learning_rate": 1.769728531690437e-05, "entropy": 0.5096460196375847, "num_tokens": 27489339.0, "mean_token_accuracy": 0.8923352911074957, "epoch": 0.8955223880597015, "step": 3100 }, { "loss": 0.5120228195190429, "grad_norm": 0.20703125, "learning_rate": 1.7478417271611325e-05, "entropy": 0.512226097236077, "num_tokens": 28377750.0, "mean_token_accuracy": 0.8917342044909795, "epoch": 0.9244102070293693, "step": 3200 }, { "loss": 0.5107974243164063, "grad_norm": 0.2177734375, "learning_rate": 1.725110180668124e-05, "entropy": 0.5103923585514227, "num_tokens": 29264856.0, "mean_token_accuracy": 0.8925020119547844, "epoch": 0.9532980259990371, "step": 3300 }, { "loss": 0.5124374008178711, "grad_norm": 0.18359375, "learning_rate": 1.7015595691568466e-05, "entropy": 0.5123077415426572, "num_tokens": 30150254.0, "mean_token_accuracy": 0.8922114634513855, "epoch": 0.9821858449687049, "step": 3400 }, { "loss": 0.5024247741699219, "grad_norm": 0.2109375, "learning_rate": 1.6772164947666184e-05, "entropy": 0.5024208198721991, "num_tokens": 31033148.0, "mean_token_accuracy": 0.8933051790680774, "epoch": 1.0109773712084738, "step": 3500 }, { "eval_loss": 0.5133240818977356, "eval_runtime": 121.1413, "eval_samples_per_second": 111.556, "eval_steps_per_second": 18.598, "eval_entropy": 0.49454835110169965, "eval_num_tokens": 31033148.0, "eval_mean_token_accuracy": 0.8922610160408155, "epoch": 1.0109773712084738, "step": 3500 }, { "loss": 0.49717609405517577, "grad_norm": 0.2138671875, "learning_rate": 1.6521084547815804e-05, "entropy": 0.49633729274074234, "num_tokens": 31919854.0, "mean_token_accuracy": 0.8940571908156077, "epoch": 1.0398651901781415, "step": 3600 }, { "loss": 0.49857440948486326, "grad_norm": 0.2060546875, "learning_rate": 1.6262638105704958e-05, "entropy": 0.4946801410615444, "num_tokens": 32806935.0, "mean_token_accuracy": 0.8944180096189182, "epoch": 1.0687530091478092, "step": 3700 }, { "loss": 0.49641082763671873, "grad_norm": 0.2001953125, "learning_rate": 1.5997117555505138e-05, "entropy": 0.4942504517734051, "num_tokens": 33692824.0, "mean_token_accuracy": 0.894220456580321, "epoch": 1.0976408281174772, "step": 3800 }, { "loss": 0.49624130249023435, "grad_norm": 0.212890625, "learning_rate": 1.5724822822110655e-05, "entropy": 0.49494813561439516, "num_tokens": 34579763.0, "mean_token_accuracy": 0.8944190714756648, "epoch": 1.126528647087145, "step": 3900 }, { "loss": 0.49298313140869143, "grad_norm": 0.1904296875, "learning_rate": 1.5446061482351525e-05, "entropy": 0.4929438012341658, "num_tokens": 35466508.0, "mean_token_accuracy": 0.8952326637506485, "epoch": 1.1554164660568127, "step": 4000 }, { "eval_loss": 0.5108281970024109, "eval_runtime": 121.0509, "eval_samples_per_second": 111.639, "eval_steps_per_second": 18.612, "eval_entropy": 0.501549962833623, "eval_num_tokens": 35466508.0, "eval_mean_token_accuracy": 0.8925832025855581, "epoch": 1.1554164660568127, "step": 4000 }, { "loss": 0.4936357116699219, "grad_norm": 0.2001953125, "learning_rate": 1.5161148417562932e-05, "entropy": 0.4936923775573572, "num_tokens": 36353094.0, "mean_token_accuracy": 0.8945872736970584, "epoch": 1.1843042850264804, "step": 4100 }, { "loss": 0.49270416259765626, "grad_norm": 0.1943359375, "learning_rate": 1.4870405457903703e-05, "entropy": 0.4933264861504237, "num_tokens": 37239421.0, "mean_token_accuracy": 0.8946521012981733, "epoch": 1.2131921039961484, "step": 4200 }, { "loss": 0.49468719482421875, "grad_norm": 0.2177734375, "learning_rate": 1.457416101882561e-05, "entropy": 0.49451990927259126, "num_tokens": 38126461.0, "mean_token_accuracy": 0.8947041656573613, "epoch": 1.2420799229658162, "step": 4300 }, { "loss": 0.48858612060546874, "grad_norm": 0.1923828125, "learning_rate": 1.4272749730104063e-05, "entropy": 0.4893125213185946, "num_tokens": 39013383.0, "mean_token_accuracy": 0.8960103297233581, "epoch": 1.270967741935484, "step": 4400 }, { "loss": 0.48944534301757814, "grad_norm": 0.2109375, "learning_rate": 1.3966512057849295e-05, "entropy": 0.48983013848463697, "num_tokens": 39900377.0, "mean_token_accuracy": 0.8956057903170586, "epoch": 1.2998555609051516, "step": 4500 }, { "eval_loss": 0.5091761946678162, "eval_runtime": 121.0065, "eval_samples_per_second": 111.68, "eval_steps_per_second": 18.619, "eval_entropy": 0.48820916045733254, "eval_num_tokens": 39900377.0, "eval_mean_token_accuracy": 0.892838896749499, "epoch": 1.2998555609051516, "step": 4500 }, { "loss": 0.49533580780029296, "grad_norm": 0.212890625, "learning_rate": 1.3655793919924975e-05, "entropy": 0.49353456447521843, "num_tokens": 40787140.0, "mean_token_accuracy": 0.8943837519486745, "epoch": 1.3287433798748194, "step": 4600 }, { "loss": 0.4934038925170898, "grad_norm": 0.1962890625, "learning_rate": 1.3340946295208658e-05, "entropy": 0.4935056679447492, "num_tokens": 41674591.0, "mean_token_accuracy": 0.894527651667595, "epoch": 1.3576311988444871, "step": 4700 }, { "loss": 0.4890486907958984, "grad_norm": 0.205078125, "learning_rate": 1.302232482713546e-05, "entropy": 0.48878600150346757, "num_tokens": 42561974.0, "mean_token_accuracy": 0.8960626801848411, "epoch": 1.386519017814155, "step": 4800 }, { "loss": 0.4930953598022461, "grad_norm": 0.2177734375, "learning_rate": 1.2700289421972767e-05, "entropy": 0.49330734809239707, "num_tokens": 43449322.0, "mean_token_accuracy": 0.8950501901904742, "epoch": 1.4154068367838228, "step": 4900 }, { "loss": 0.4914593887329102, "grad_norm": 0.216796875, "learning_rate": 1.237520384227977e-05, "entropy": 0.490766015201807, "num_tokens": 44335714.0, "mean_token_accuracy": 0.8949617861708006, "epoch": 1.4442946557534906, "step": 5000 }, { "eval_loss": 0.5075456500053406, "eval_runtime": 121.5152, "eval_samples_per_second": 111.212, "eval_steps_per_second": 18.541, "eval_entropy": 0.494863443701626, "eval_num_tokens": 44335714.0, "eval_mean_token_accuracy": 0.8929759670351539, "epoch": 1.4442946557534906, "step": 5000 }, { "loss": 0.49099269866943357, "grad_norm": 0.21875, "learning_rate": 1.2047435296011018e-05, "entropy": 0.49070664674043657, "num_tokens": 45223045.0, "mean_token_accuracy": 0.8956617527206738, "epoch": 1.4731824747231583, "step": 5100 }, { "loss": 0.4920531463623047, "grad_norm": 0.1943359375, "learning_rate": 1.171735402172818e-05, "entropy": 0.49278342053294183, "num_tokens": 46109072.0, "mean_token_accuracy": 0.8947107720375062, "epoch": 1.5020702936928263, "step": 5200 }, { "loss": 0.4921791458129883, "grad_norm": 0.2197265625, "learning_rate": 1.1385332870388473e-05, "entropy": 0.49249339212973914, "num_tokens": 46995491.0, "mean_token_accuracy": 0.8952183723449707, "epoch": 1.530958112662494, "step": 5300 }, { "loss": 0.4928662872314453, "grad_norm": 0.2158203125, "learning_rate": 1.1051746884182222e-05, "entropy": 0.4916799567639828, "num_tokens": 47881628.0, "mean_token_accuracy": 0.8948841803272565, "epoch": 1.5598459316321618, "step": 5400 }, { "loss": 0.4906736373901367, "grad_norm": 0.2041015625, "learning_rate": 1.0716972872895268e-05, "entropy": 0.4912960589925448, "num_tokens": 48769412.0, "mean_token_accuracy": 0.8952219298481942, "epoch": 1.5887337506018295, "step": 5500 }, { "eval_loss": 0.5062649846076965, "eval_runtime": 121.6481, "eval_samples_per_second": 111.091, "eval_steps_per_second": 18.521, "eval_entropy": 0.49529570853027616, "eval_num_tokens": 48769412.0, "eval_mean_token_accuracy": 0.8931941252732563, "epoch": 1.5887337506018295, "step": 5500 }, { "loss": 0.49281944274902345, "grad_norm": 0.2216796875, "learning_rate": 1.0381388988274725e-05, "entropy": 0.4927716707189878, "num_tokens": 49656341.0, "mean_token_accuracy": 0.8945845268170038, "epoch": 1.6176215695714973, "step": 5600 }, { "loss": 0.4927938842773438, "grad_norm": 0.1953125, "learning_rate": 1.0045374296878913e-05, "entropy": 0.49278713996211687, "num_tokens": 50542195.0, "mean_token_accuracy": 0.8949234291911126, "epoch": 1.646509388541165, "step": 5700 }, { "loss": 0.49176959991455077, "grad_norm": 0.2099609375, "learning_rate": 9.709308351893933e-06, "entropy": 0.4925773192942142, "num_tokens": 51428935.0, "mean_token_accuracy": 0.8949492185314496, "epoch": 1.675397207510833, "step": 5800 }, { "loss": 0.49320865631103517, "grad_norm": 0.2041015625, "learning_rate": 9.37357076440057e-06, "entropy": 0.4937224745750427, "num_tokens": 52316133.0, "mean_token_accuracy": 0.894678007364273, "epoch": 1.7042850264805007, "step": 5900 }, { "loss": 0.49113529205322265, "grad_norm": 0.1943359375, "learning_rate": 9.038540774575775e-06, "entropy": 0.49027820602059363, "num_tokens": 53203155.0, "mean_token_accuracy": 0.8950705190499624, "epoch": 1.7331728454501685, "step": 6000 }, { "eval_loss": 0.5055996179580688, "eval_runtime": 121.1117, "eval_samples_per_second": 111.583, "eval_steps_per_second": 18.603, "eval_entropy": 0.4944385351126214, "eval_num_tokens": 53203155.0, "eval_mean_token_accuracy": 0.8932915333636962, "epoch": 1.7331728454501685, "step": 6000 }, { "loss": 0.4895584487915039, "grad_norm": 0.2216796875, "learning_rate": 8.704596823313166e-06, "entropy": 0.4888417159020901, "num_tokens": 54089263.0, "mean_token_accuracy": 0.8957057174046834, "epoch": 1.7620606644198364, "step": 6100 }, { "loss": 0.4907315063476563, "grad_norm": 0.2294921875, "learning_rate": 8.372116124746306e-06, "entropy": 0.4891975859304269, "num_tokens": 54975703.0, "mean_token_accuracy": 0.8951012322306633, "epoch": 1.7909484833895042, "step": 6200 }, { "loss": 0.4888127136230469, "grad_norm": 0.2099609375, "learning_rate": 8.04147424015775e-06, "entropy": 0.4891936507821083, "num_tokens": 55863620.0, "mean_token_accuracy": 0.8955637833476067, "epoch": 1.819836302359172, "step": 6300 }, { "loss": 0.4893819427490234, "grad_norm": 0.19921875, "learning_rate": 7.713044653755093e-06, "entropy": 0.48820455322662987, "num_tokens": 56750061.0, "mean_token_accuracy": 0.8957294267416, "epoch": 1.8487241213288397, "step": 6400 }, { "loss": 0.48793128967285154, "grad_norm": 0.2080078125, "learning_rate": 7.387198350793201e-06, "entropy": 0.4864146198829015, "num_tokens": 57636695.0, "mean_token_accuracy": 0.895841832458973, "epoch": 1.8776119402985074, "step": 6500 }, { "eval_loss": 0.5049271583557129, "eval_runtime": 121.3147, "eval_samples_per_second": 111.396, "eval_steps_per_second": 18.572, "eval_entropy": 0.4945345818731873, "eval_num_tokens": 57636695.0, "eval_mean_token_accuracy": 0.8933592520974977, "epoch": 1.8776119402985074, "step": 6500 }, { "loss": 0.4899849700927734, "grad_norm": 0.21484375, "learning_rate": 7.0643033985192415e-06, "entropy": 0.48893075570464134, "num_tokens": 58523499.0, "mean_token_accuracy": 0.8950817889968554, "epoch": 1.9064997592681752, "step": 6600 }, { "loss": 0.4888337707519531, "grad_norm": 0.2099609375, "learning_rate": 6.744724530413773e-06, "entropy": 0.4885369242727757, "num_tokens": 59409639.0, "mean_token_accuracy": 0.895801799595356, "epoch": 1.935387578237843, "step": 6700 }, { "loss": 0.48825778961181643, "grad_norm": 0.2353515625, "learning_rate": 6.428822734197584e-06, "entropy": 0.48850176721811295, "num_tokens": 60295475.0, "mean_token_accuracy": 0.8954997793833415, "epoch": 1.9642753972075109, "step": 6800 }, { "loss": 0.48861705780029296, "grad_norm": 0.2041015625, "learning_rate": 6.116954844069659e-06, "entropy": 0.4889233031868935, "num_tokens": 61183207.0, "mean_token_accuracy": 0.8955455178022385, "epoch": 1.9931632161771786, "step": 6900 }, { "loss": 0.4824595642089844, "grad_norm": 0.2021484375, "learning_rate": 5.8094731376368115e-06, "entropy": 0.4824550705011872, "num_tokens": 62065451.0, "mean_token_accuracy": 0.8966363853834146, "epoch": 2.0219547424169475, "step": 7000 }, { "eval_loss": 0.5048847794532776, "eval_runtime": 120.9542, "eval_samples_per_second": 111.728, "eval_steps_per_second": 18.627, "eval_entropy": 0.485265318785463, "eval_num_tokens": 62065451.0, "eval_mean_token_accuracy": 0.8934332738174526, "epoch": 2.0219547424169475, "step": 7000 }, { "loss": 0.48333885192871096, "grad_norm": 0.2197265625, "learning_rate": 5.506724937990357e-06, "entropy": 0.4820535824199518, "num_tokens": 62952513.0, "mean_token_accuracy": 0.8962123716870943, "epoch": 2.0508425613866152, "step": 7100 }, { "loss": 0.4831295394897461, "grad_norm": 0.2255859375, "learning_rate": 5.2090522213792734e-06, "entropy": 0.48218101014693576, "num_tokens": 63838911.0, "mean_token_accuracy": 0.8962508221467336, "epoch": 2.079730380356283, "step": 7200 }, { "loss": 0.48569129943847655, "grad_norm": 0.2421875, "learning_rate": 4.916791230922975e-06, "entropy": 0.48549183184901873, "num_tokens": 64725297.0, "mean_token_accuracy": 0.895985666513443, "epoch": 2.1086181993259507, "step": 7300 }, { "loss": 0.482639274597168, "grad_norm": 0.2236328125, "learning_rate": 4.630272096800113e-06, "entropy": 0.48277713745832446, "num_tokens": 65611469.0, "mean_token_accuracy": 0.8964148736000062, "epoch": 2.1375060182956185, "step": 7400 }, { "loss": 0.48228111267089846, "grad_norm": 0.2138671875, "learning_rate": 4.3498184633423775e-06, "entropy": 0.4816711642841498, "num_tokens": 66498972.0, "mean_token_accuracy": 0.8969011158744494, "epoch": 2.1663938372652867, "step": 7500 }, { "eval_loss": 0.5047065019607544, "eval_runtime": 121.1419, "eval_samples_per_second": 111.555, "eval_steps_per_second": 18.598, "eval_entropy": 0.4889771351272458, "eval_num_tokens": 66498972.0, "eval_mean_token_accuracy": 0.893424384577032, "epoch": 2.1663938372652867, "step": 7500 }, { "loss": 0.4861163330078125, "grad_norm": 0.1884765625, "learning_rate": 4.075747123454544e-06, "entropy": 0.4855067411561807, "num_tokens": 67386206.0, "mean_token_accuracy": 0.8959952719012896, "epoch": 2.1952816562349544, "step": 7600 }, { "loss": 0.4826494598388672, "grad_norm": 0.2373046875, "learning_rate": 3.808367660773684e-06, "entropy": 0.48214618876576426, "num_tokens": 68272531.0, "mean_token_accuracy": 0.8969175884127617, "epoch": 2.224169475204622, "step": 7700 }, { "loss": 0.4870090103149414, "grad_norm": 0.2109375, "learning_rate": 3.547982099971804e-06, "entropy": 0.48708953022956847, "num_tokens": 69159267.0, "mean_token_accuracy": 0.8955915210644404, "epoch": 2.25305729417429, "step": 7800 }, { "loss": 0.48725185394287107, "grad_norm": 0.205078125, "learning_rate": 3.2948845655968743e-06, "entropy": 0.48610410739978155, "num_tokens": 70046517.0, "mean_token_accuracy": 0.8959172365069389, "epoch": 2.2819451131439576, "step": 7900 }, { "loss": 0.4846121597290039, "grad_norm": 0.23828125, "learning_rate": 3.0493609498376174e-06, "entropy": 0.4851847393314044, "num_tokens": 70933414.0, "mean_token_accuracy": 0.8960670222838719, "epoch": 2.3108329321136254, "step": 8000 }, { "eval_loss": 0.5046308040618896, "eval_runtime": 121.3471, "eval_samples_per_second": 111.367, "eval_steps_per_second": 18.567, "eval_entropy": 0.4883948566513595, "eval_num_tokens": 70933414.0, "eval_mean_token_accuracy": 0.8934336849387042, "epoch": 2.3108329321136254, "step": 8000 }, { "loss": 0.48443710327148437, "grad_norm": 0.1962890625, "learning_rate": 2.811688589587358e-06, "entropy": 0.48562209352850916, "num_tokens": 71820845.0, "mean_token_accuracy": 0.8964889810482661, "epoch": 2.339720751083293, "step": 8100 }, { "loss": 0.4795720672607422, "grad_norm": 0.197265625, "learning_rate": 2.582135953171717e-06, "entropy": 0.4802011082569758, "num_tokens": 72707499.0, "mean_token_accuracy": 0.8968185101946194, "epoch": 2.368608570052961, "step": 8200 }, { "loss": 0.48555206298828124, "grad_norm": 0.23828125, "learning_rate": 2.3609623370939707e-06, "entropy": 0.48567692418893177, "num_tokens": 73593737.0, "mean_token_accuracy": 0.8955408794681231, "epoch": 2.3974963890226286, "step": 8300 }, { "loss": 0.4849796676635742, "grad_norm": 0.2314453125, "learning_rate": 2.148417573140682e-06, "entropy": 0.4830826353530089, "num_tokens": 74480089.0, "mean_token_accuracy": 0.8961175856987635, "epoch": 2.426384207992297, "step": 8400 }, { "loss": 0.48482059478759765, "grad_norm": 0.2119140625, "learning_rate": 1.9447417461784214e-06, "entropy": 0.48449866617719334, "num_tokens": 75367239.0, "mean_token_accuracy": 0.8959982444842657, "epoch": 2.4552720269619646, "step": 8500 }, { "eval_loss": 0.5046458840370178, "eval_runtime": 121.1164, "eval_samples_per_second": 111.579, "eval_steps_per_second": 18.602, "eval_entropy": 0.4878934076691647, "eval_num_tokens": 75367239.0, "eval_mean_token_accuracy": 0.8934249661257676, "epoch": 2.4552720269619646, "step": 8500 }, { "loss": 0.48140850067138674, "grad_norm": 0.2138671875, "learning_rate": 1.7501649229603213e-06, "entropy": 0.48195242514212927, "num_tokens": 76254713.0, "mean_token_accuracy": 0.8970886744062105, "epoch": 2.4841598459316323, "step": 8600 }, { "loss": 0.48088233947753906, "grad_norm": 0.203125, "learning_rate": 1.564906892248851e-06, "entropy": 0.4809670109550158, "num_tokens": 77142380.0, "mean_token_accuracy": 0.8968148503700892, "epoch": 2.5130476649013, "step": 8700 }, { "loss": 0.48114513397216796, "grad_norm": 0.201171875, "learning_rate": 1.389176916548326e-06, "entropy": 0.48076146269838016, "num_tokens": 78029752.0, "mean_token_accuracy": 0.8966197535395622, "epoch": 2.541935483870968, "step": 8800 }, { "loss": 0.484271240234375, "grad_norm": 0.216796875, "learning_rate": 1.2231734957275866e-06, "entropy": 0.48319436301787694, "num_tokens": 78914544.0, "mean_token_accuracy": 0.8967648883660634, "epoch": 2.5708233028406355, "step": 8900 }, { "loss": 0.4859063720703125, "grad_norm": 0.22265625, "learning_rate": 1.067084142799869e-06, "entropy": 0.4856315462787946, "num_tokens": 79801258.0, "mean_token_accuracy": 0.8954879422982533, "epoch": 2.5997111218103033, "step": 9000 }, { "eval_loss": 0.5045983195304871, "eval_runtime": 121.336, "eval_samples_per_second": 111.377, "eval_steps_per_second": 18.568, "eval_entropy": 0.4877890812467058, "eval_num_tokens": 79801258.0, "eval_mean_token_accuracy": 0.8934447498941654, "epoch": 2.5997111218103033, "step": 9000 }, { "loss": 0.48263256072998045, "grad_norm": 0.23046875, "learning_rate": 9.210851721131397e-07, "entropy": 0.48227159813046455, "num_tokens": 80686981.0, "mean_token_accuracy": 0.8968270209431648, "epoch": 2.628598940779971, "step": 9100 }, { "loss": 0.4856527709960938, "grad_norm": 0.193359375, "learning_rate": 7.85341500190131e-07, "entropy": 0.4843608529369036, "num_tokens": 81574851.0, "mean_token_accuracy": 0.8959235412875811, "epoch": 2.6574867597496388, "step": 9200 }, { "loss": 0.48150875091552736, "grad_norm": 0.2265625, "learning_rate": 6.600064594430566e-07, "entropy": 0.482298014909029, "num_tokens": 82461264.0, "mean_token_accuracy": 0.8967729851603508, "epoch": 2.686374578719307, "step": 9300 }, { "loss": 0.485591926574707, "grad_norm": 0.201171875, "learning_rate": 5.452216249734332e-07, "entropy": 0.4847200144827366, "num_tokens": 83348000.0, "mean_token_accuracy": 0.8960838095347087, "epoch": 2.7152623976889743, "step": 9400 }, { "loss": 0.48456764221191406, "grad_norm": 0.2109375, "learning_rate": 4.4111665465264466e-07, "entropy": 0.4836878172556559, "num_tokens": 84234477.0, "mean_token_accuracy": 0.8962146702408791, "epoch": 2.7441502166586424, "step": 9500 }, { "eval_loss": 0.504546046257019, "eval_runtime": 121.2203, "eval_samples_per_second": 111.483, "eval_steps_per_second": 18.586, "eval_entropy": 0.4879270561181647, "eval_num_tokens": 84234477.0, "eval_mean_token_accuracy": 0.8934456505248455, "epoch": 2.7441502166586424, "step": 9500 }, { "loss": 0.48081619262695313, "grad_norm": 0.2060546875, "learning_rate": 3.478091426638763e-07, "entropy": 0.4814382904271285, "num_tokens": 85120633.0, "mean_token_accuracy": 0.8969470235705376, "epoch": 2.77303803562831, "step": 9600 }, { "loss": 0.4832990264892578, "grad_norm": 0.20703125, "learning_rate": 2.654044866708905e-07, "entropy": 0.4842444409926732, "num_tokens": 86007778.0, "mean_token_accuracy": 0.8965607133507728, "epoch": 2.801925854597978, "step": 9700 }, { "loss": 0.4815947341918945, "grad_norm": 0.2255859375, "learning_rate": 1.939957687636451e-07, "entropy": 0.48316286092003186, "num_tokens": 86893679.0, "mean_token_accuracy": 0.8967393870155017, "epoch": 2.8308136735676457, "step": 9800 }, { "loss": 0.4858028793334961, "grad_norm": 0.2041015625, "learning_rate": 1.336636503152622e-07, "entropy": 0.4849201820790768, "num_tokens": 87782234.0, "mean_token_accuracy": 0.8959653866291046, "epoch": 2.8597014925373134, "step": 9900 }, { "loss": 0.48418357849121096, "grad_norm": 0.20703125, "learning_rate": 8.447628086910242e-08, "entropy": 0.48366599187254905, "num_tokens": 88669239.0, "mean_token_accuracy": 0.8961514661709468, "epoch": 2.888589311506981, "step": 10000 }, { "eval_loss": 0.5046224594116211, "eval_runtime": 120.9979, "eval_samples_per_second": 111.688, "eval_steps_per_second": 18.62, "eval_entropy": 0.4881912901223738, "eval_num_tokens": 88669239.0, "eval_mean_token_accuracy": 0.8934334994050275, "epoch": 2.888589311506981, "step": 10000 }, { "loss": 0.4841903305053711, "grad_norm": 0.21875, "learning_rate": 4.648922115887078e-08, "entropy": 0.48405092969536784, "num_tokens": 89555716.0, "mean_token_accuracy": 0.8961480244000752, "epoch": 2.917477130476649, "step": 10100 }, { "loss": 0.4839943313598633, "grad_norm": 0.2314453125, "learning_rate": 1.9745380348696887e-08, "entropy": 0.48480835517247517, "num_tokens": 90443198.0, "mean_token_accuracy": 0.8965321667989095, "epoch": 2.9463649494463167, "step": 10200 }, { "loss": 0.48620803833007814, "grad_norm": 0.2158203125, "learning_rate": 4.274967564099619e-09, "entropy": 0.48755290483434993, "num_tokens": 91330106.0, "mean_token_accuracy": 0.895567223628362, "epoch": 2.9752527684159844, "step": 10300 }, { "train_runtime": 17442.4557, "train_samples_per_second": 21.433, "train_steps_per_second": 0.595, "total_flos": 2.0076201569253059e+18, "train_loss": 0.5468513610710557, "entropy": 0.4824813495930067, "num_tokens": 92088141.0, "mean_token_accuracy": 0.8962666590621963, "epoch": 3.0, "step": 10386 } ]