{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5677337646484375, "epoch": 0.007936507936507936, "grad_norm": 5.825922321576571, "learning_rate": 0.0, "loss": 1.3956, "mean_token_accuracy": 0.6547382255084813, "num_tokens": 849869.0, "step": 1 }, { "entropy": 0.569549560546875, "epoch": 0.015873015873015872, "grad_norm": 5.801234157189965, "learning_rate": 1.0526315789473685e-06, "loss": 1.4001, "mean_token_accuracy": 0.6515501267276704, "num_tokens": 1710146.0, "step": 2 }, { "entropy": 0.5735321044921875, "epoch": 0.023809523809523808, "grad_norm": 5.662678552390535, "learning_rate": 2.105263157894737e-06, "loss": 1.3808, "mean_token_accuracy": 0.6574624702334404, "num_tokens": 2560005.0, "step": 3 }, { "entropy": 0.5650634765625, "epoch": 0.031746031746031744, "grad_norm": 5.556776721246513, "learning_rate": 3.157894736842105e-06, "loss": 1.3916, "mean_token_accuracy": 0.6538396221585572, "num_tokens": 3457966.0, "step": 4 }, { "entropy": 0.57452392578125, "epoch": 0.03968253968253968, "grad_norm": 5.365978906848366, "learning_rate": 4.210526315789474e-06, "loss": 1.3703, "mean_token_accuracy": 0.6564755998551846, "num_tokens": 4321827.0, "step": 5 }, { "entropy": 0.5656585693359375, "epoch": 0.047619047619047616, "grad_norm": 4.396681219826731, "learning_rate": 5.263157894736842e-06, "loss": 1.293, "mean_token_accuracy": 0.6733334762975574, "num_tokens": 5188122.0, "step": 6 }, { "entropy": 0.5627288818359375, "epoch": 0.05555555555555555, "grad_norm": 3.422587251657164, "learning_rate": 6.31578947368421e-06, "loss": 1.2025, "mean_token_accuracy": 0.6850569075904787, "num_tokens": 6042413.0, "step": 7 }, { "entropy": 0.5742950439453125, "epoch": 0.06349206349206349, "grad_norm": 3.305712030477459, "learning_rate": 7.368421052631579e-06, "loss": 1.1674, "mean_token_accuracy": 0.6904466319829226, "num_tokens": 6898441.0, "step": 8 }, { "entropy": 0.5521697998046875, "epoch": 0.07142857142857142, "grad_norm": 3.0042279135772834, "learning_rate": 8.421052631578948e-06, "loss": 1.1149, "mean_token_accuracy": 0.700476243160665, "num_tokens": 7794638.0, "step": 9 }, { "entropy": 0.5288543701171875, "epoch": 0.07936507936507936, "grad_norm": 5.681857010760831, "learning_rate": 9.473684210526315e-06, "loss": 1.0692, "mean_token_accuracy": 0.7063358542509377, "num_tokens": 8673402.0, "step": 10 }, { "entropy": 0.536529541015625, "epoch": 0.0873015873015873, "grad_norm": 4.63333613929414, "learning_rate": 1.0526315789473684e-05, "loss": 1.0074, "mean_token_accuracy": 0.7214855612255633, "num_tokens": 9525436.0, "step": 11 }, { "entropy": 0.549041748046875, "epoch": 0.09523809523809523, "grad_norm": 3.571284227430379, "learning_rate": 1.1578947368421053e-05, "loss": 0.9655, "mean_token_accuracy": 0.7288991836830974, "num_tokens": 10358777.0, "step": 12 }, { "entropy": 0.5587310791015625, "epoch": 0.10317460317460317, "grad_norm": 5.5475348990467035, "learning_rate": 1.263157894736842e-05, "loss": 0.976, "mean_token_accuracy": 0.7247777748852968, "num_tokens": 11211677.0, "step": 13 }, { "entropy": 0.5686798095703125, "epoch": 0.1111111111111111, "grad_norm": 4.953437827171886, "learning_rate": 1.3684210526315791e-05, "loss": 0.9599, "mean_token_accuracy": 0.7281729411333799, "num_tokens": 12067363.0, "step": 14 }, { "entropy": 0.5579376220703125, "epoch": 0.11904761904761904, "grad_norm": 3.345315151683028, "learning_rate": 1.4736842105263159e-05, "loss": 0.9403, "mean_token_accuracy": 0.7340105604380369, "num_tokens": 12945458.0, "step": 15 }, { "entropy": 0.564178466796875, "epoch": 0.12698412698412698, "grad_norm": 3.080260033569176, "learning_rate": 1.578947368421053e-05, "loss": 0.8993, "mean_token_accuracy": 0.7429266143590212, "num_tokens": 13815066.0, "step": 16 }, { "entropy": 0.55816650390625, "epoch": 0.1349206349206349, "grad_norm": 3.077777207658634, "learning_rate": 1.6842105263157896e-05, "loss": 0.881, "mean_token_accuracy": 0.7466331031173468, "num_tokens": 14685173.0, "step": 17 }, { "entropy": 0.5535430908203125, "epoch": 0.14285714285714285, "grad_norm": 2.5640577389793164, "learning_rate": 1.7894736842105264e-05, "loss": 0.8439, "mean_token_accuracy": 0.7535499525256455, "num_tokens": 15522062.0, "step": 18 }, { "entropy": 0.5429534912109375, "epoch": 0.15079365079365079, "grad_norm": 2.4128378954630563, "learning_rate": 1.894736842105263e-05, "loss": 0.815, "mean_token_accuracy": 0.7603332437574863, "num_tokens": 16388252.0, "step": 19 }, { "entropy": 0.5367584228515625, "epoch": 0.15873015873015872, "grad_norm": 2.2585216462856423, "learning_rate": 2e-05, "loss": 0.7944, "mean_token_accuracy": 0.764781333040446, "num_tokens": 17235205.0, "step": 20 }, { "entropy": 0.531982421875, "epoch": 0.16666666666666666, "grad_norm": 2.174159674202976, "learning_rate": 1.999961710642308e-05, "loss": 0.7779, "mean_token_accuracy": 0.7687827018089592, "num_tokens": 18090069.0, "step": 21 }, { "entropy": 0.53106689453125, "epoch": 0.1746031746031746, "grad_norm": 1.99421990181621, "learning_rate": 1.9998468455013825e-05, "loss": 0.7668, "mean_token_accuracy": 0.7712001241743565, "num_tokens": 18955962.0, "step": 22 }, { "entropy": 0.5294189453125, "epoch": 0.18253968253968253, "grad_norm": 1.6261845638000727, "learning_rate": 1.9996554133734473e-05, "loss": 0.7311, "mean_token_accuracy": 0.7804098608903587, "num_tokens": 19812261.0, "step": 23 }, { "entropy": 0.531005859375, "epoch": 0.19047619047619047, "grad_norm": 2.0323147143629425, "learning_rate": 1.99938742891813e-05, "loss": 0.7211, "mean_token_accuracy": 0.7803159174509346, "num_tokens": 20647709.0, "step": 24 }, { "entropy": 0.52178955078125, "epoch": 0.1984126984126984, "grad_norm": 1.9552611093327645, "learning_rate": 1.9990429126573353e-05, "loss": 0.7179, "mean_token_accuracy": 0.7824284308589995, "num_tokens": 21486371.0, "step": 25 }, { "entropy": 0.54156494140625, "epoch": 0.20634920634920634, "grad_norm": 1.7653002599490228, "learning_rate": 1.9986218909736758e-05, "loss": 0.7017, "mean_token_accuracy": 0.782297340221703, "num_tokens": 22318414.0, "step": 26 }, { "entropy": 0.5255889892578125, "epoch": 0.21428571428571427, "grad_norm": 1.4137706795116607, "learning_rate": 1.9981243961084516e-05, "loss": 0.6751, "mean_token_accuracy": 0.789531962480396, "num_tokens": 23134604.0, "step": 27 }, { "entropy": 0.5037689208984375, "epoch": 0.2222222222222222, "grad_norm": 1.3650570766755068, "learning_rate": 1.99755046615918e-05, "loss": 0.6794, "mean_token_accuracy": 0.789926297031343, "num_tokens": 24009770.0, "step": 28 }, { "entropy": 0.509796142578125, "epoch": 0.23015873015873015, "grad_norm": 1.644136924578907, "learning_rate": 1.9969001450766795e-05, "loss": 0.6845, "mean_token_accuracy": 0.7879371428862214, "num_tokens": 24869806.0, "step": 29 }, { "entropy": 0.4919281005859375, "epoch": 0.23809523809523808, "grad_norm": 1.6577988199334892, "learning_rate": 1.9961734826617033e-05, "loss": 0.6806, "mean_token_accuracy": 0.7895815866068006, "num_tokens": 25762999.0, "step": 30 }, { "entropy": 0.4969482421875, "epoch": 0.24603174603174602, "grad_norm": 1.2984344326991837, "learning_rate": 1.995370534561125e-05, "loss": 0.6515, "mean_token_accuracy": 0.7961041065864265, "num_tokens": 26651412.0, "step": 31 }, { "entropy": 0.4962921142578125, "epoch": 0.25396825396825395, "grad_norm": 1.4360495249533112, "learning_rate": 1.9944913622636798e-05, "loss": 0.6523, "mean_token_accuracy": 0.7966451491229236, "num_tokens": 27520069.0, "step": 32 }, { "entropy": 0.4964752197265625, "epoch": 0.2619047619047619, "grad_norm": 1.5230137669636878, "learning_rate": 1.993536033095252e-05, "loss": 0.6461, "mean_token_accuracy": 0.7965357885695994, "num_tokens": 28367541.0, "step": 33 }, { "entropy": 0.4960174560546875, "epoch": 0.2698412698412698, "grad_norm": 1.3174931583623266, "learning_rate": 1.9925046202137215e-05, "loss": 0.6252, "mean_token_accuracy": 0.8011059854179621, "num_tokens": 29217570.0, "step": 34 }, { "entropy": 0.48931884765625, "epoch": 0.2777777777777778, "grad_norm": 1.5851822977243282, "learning_rate": 1.991397202603363e-05, "loss": 0.6297, "mean_token_accuracy": 0.8029140271246433, "num_tokens": 30088869.0, "step": 35 }, { "entropy": 0.4861297607421875, "epoch": 0.2857142857142857, "grad_norm": 1.1284230135515079, "learning_rate": 1.9902138650687943e-05, "loss": 0.6252, "mean_token_accuracy": 0.8036042922176421, "num_tokens": 30959821.0, "step": 36 }, { "entropy": 0.487884521484375, "epoch": 0.29365079365079366, "grad_norm": 1.4212003300454723, "learning_rate": 1.9889546982284833e-05, "loss": 0.6302, "mean_token_accuracy": 0.8021468138322234, "num_tokens": 31830767.0, "step": 37 }, { "entropy": 0.4900970458984375, "epoch": 0.30158730158730157, "grad_norm": 1.3590880297462014, "learning_rate": 1.987619798507809e-05, "loss": 0.6172, "mean_token_accuracy": 0.803361180704087, "num_tokens": 32692726.0, "step": 38 }, { "entropy": 0.4883575439453125, "epoch": 0.30952380952380953, "grad_norm": 1.0740519369196173, "learning_rate": 1.9862092681316774e-05, "loss": 0.5925, "mean_token_accuracy": 0.81046880222857, "num_tokens": 33543076.0, "step": 39 }, { "entropy": 0.487884521484375, "epoch": 0.31746031746031744, "grad_norm": 1.2402102211054793, "learning_rate": 1.984723215116693e-05, "loss": 0.6003, "mean_token_accuracy": 0.8089331048540771, "num_tokens": 34404352.0, "step": 40 }, { "entropy": 0.49627685546875, "epoch": 0.3253968253968254, "grad_norm": 1.2006299948645196, "learning_rate": 1.983161753262886e-05, "loss": 0.6025, "mean_token_accuracy": 0.8069734866730869, "num_tokens": 35236933.0, "step": 41 }, { "entropy": 0.485504150390625, "epoch": 0.3333333333333333, "grad_norm": 1.0380668548649223, "learning_rate": 1.9815250021449998e-05, "loss": 0.5956, "mean_token_accuracy": 0.8101956453174353, "num_tokens": 36088050.0, "step": 42 }, { "entropy": 0.4821624755859375, "epoch": 0.3412698412698413, "grad_norm": 1.2775189629937171, "learning_rate": 1.9798130871033322e-05, "loss": 0.5916, "mean_token_accuracy": 0.8094993168488145, "num_tokens": 36942407.0, "step": 43 }, { "entropy": 0.4796142578125, "epoch": 0.3492063492063492, "grad_norm": 1.403908049116242, "learning_rate": 1.9780261392341383e-05, "loss": 0.5945, "mean_token_accuracy": 0.8099073590710759, "num_tokens": 37803882.0, "step": 44 }, { "entropy": 0.4794769287109375, "epoch": 0.35714285714285715, "grad_norm": 1.28597257343405, "learning_rate": 1.9761642953795896e-05, "loss": 0.5943, "mean_token_accuracy": 0.809020611923188, "num_tokens": 38667329.0, "step": 45 }, { "entropy": 0.4782257080078125, "epoch": 0.36507936507936506, "grad_norm": 1.0785371718889085, "learning_rate": 1.9742276981172978e-05, "loss": 0.5797, "mean_token_accuracy": 0.8110241792164743, "num_tokens": 39524166.0, "step": 46 }, { "entropy": 0.473052978515625, "epoch": 0.373015873015873, "grad_norm": 1.0279641554931107, "learning_rate": 1.9722164957493925e-05, "loss": 0.5723, "mean_token_accuracy": 0.814803515560925, "num_tokens": 40389693.0, "step": 47 }, { "entropy": 0.4784088134765625, "epoch": 0.38095238095238093, "grad_norm": 0.9899645288539431, "learning_rate": 1.9701308422911674e-05, "loss": 0.5763, "mean_token_accuracy": 0.813154571224004, "num_tokens": 41231841.0, "step": 48 }, { "entropy": 0.4776763916015625, "epoch": 0.3888888888888889, "grad_norm": 1.0848064904660297, "learning_rate": 1.967970897459286e-05, "loss": 0.5785, "mean_token_accuracy": 0.8125878237187862, "num_tokens": 42082897.0, "step": 49 }, { "entropy": 0.47198486328125, "epoch": 0.3968253968253968, "grad_norm": 1.0914454254108568, "learning_rate": 1.9657368266595477e-05, "loss": 0.5584, "mean_token_accuracy": 0.8166225766763091, "num_tokens": 42941458.0, "step": 50 }, { "entropy": 0.469757080078125, "epoch": 0.40476190476190477, "grad_norm": 1.0811685110199658, "learning_rate": 1.9634288009742254e-05, "loss": 0.5613, "mean_token_accuracy": 0.8169586607255042, "num_tokens": 43801380.0, "step": 51 }, { "entropy": 0.469635009765625, "epoch": 0.4126984126984127, "grad_norm": 1.0762575387658044, "learning_rate": 1.961046997148961e-05, "loss": 0.5745, "mean_token_accuracy": 0.8137071407400072, "num_tokens": 44671335.0, "step": 52 }, { "entropy": 0.47747802734375, "epoch": 0.42063492063492064, "grad_norm": 1.0897325457635905, "learning_rate": 1.958591597579231e-05, "loss": 0.5645, "mean_token_accuracy": 0.814334771130234, "num_tokens": 45508166.0, "step": 53 }, { "entropy": 0.4644012451171875, "epoch": 0.42857142857142855, "grad_norm": 1.0941123611019024, "learning_rate": 1.9560627902963808e-05, "loss": 0.5732, "mean_token_accuracy": 0.8126041651703417, "num_tokens": 46398974.0, "step": 54 }, { "entropy": 0.461273193359375, "epoch": 0.4365079365079365, "grad_norm": 1.1949513128743081, "learning_rate": 1.9534607689532236e-05, "loss": 0.5746, "mean_token_accuracy": 0.8108557453379035, "num_tokens": 47311746.0, "step": 55 }, { "entropy": 0.4606475830078125, "epoch": 0.4444444444444444, "grad_norm": 1.0419520721819582, "learning_rate": 1.950785732809211e-05, "loss": 0.5489, "mean_token_accuracy": 0.8197311628609896, "num_tokens": 48177761.0, "step": 56 }, { "entropy": 0.4680938720703125, "epoch": 0.4523809523809524, "grad_norm": 1.2749921802895587, "learning_rate": 1.9480378867151746e-05, "loss": 0.5568, "mean_token_accuracy": 0.8179731853306293, "num_tokens": 49026375.0, "step": 57 }, { "entropy": 0.4620361328125, "epoch": 0.4603174603174603, "grad_norm": 1.0787103519924253, "learning_rate": 1.9452174410976383e-05, "loss": 0.5613, "mean_token_accuracy": 0.8137976322323084, "num_tokens": 49889163.0, "step": 58 }, { "entropy": 0.467193603515625, "epoch": 0.46825396825396826, "grad_norm": 1.1592461490839407, "learning_rate": 1.9423246119427044e-05, "loss": 0.5544, "mean_token_accuracy": 0.8175857574678957, "num_tokens": 50735361.0, "step": 59 }, { "entropy": 0.47021484375, "epoch": 0.47619047619047616, "grad_norm": 1.0574063397814852, "learning_rate": 1.9393596207795135e-05, "loss": 0.5451, "mean_token_accuracy": 0.8189192642457783, "num_tokens": 51597577.0, "step": 60 }, { "entropy": 0.4606781005859375, "epoch": 0.48412698412698413, "grad_norm": 1.1516456097619052, "learning_rate": 1.93632269466328e-05, "loss": 0.5556, "mean_token_accuracy": 0.8182462360709906, "num_tokens": 52482382.0, "step": 61 }, { "entropy": 0.458343505859375, "epoch": 0.49206349206349204, "grad_norm": 1.1488599206834784, "learning_rate": 1.933214066157904e-05, "loss": 0.5502, "mean_token_accuracy": 0.8174868933856487, "num_tokens": 53370750.0, "step": 62 }, { "entropy": 0.462005615234375, "epoch": 0.5, "grad_norm": 0.9651094359764429, "learning_rate": 1.930033973318164e-05, "loss": 0.5415, "mean_token_accuracy": 0.821011008694768, "num_tokens": 54235189.0, "step": 63 }, { "entropy": 0.4658966064453125, "epoch": 0.5079365079365079, "grad_norm": 1.0776190917138506, "learning_rate": 1.926782659671484e-05, "loss": 0.5378, "mean_token_accuracy": 0.8214049334637821, "num_tokens": 55066936.0, "step": 64 }, { "entropy": 0.4623565673828125, "epoch": 0.5158730158730159, "grad_norm": 1.0551561314848272, "learning_rate": 1.9234603741992864e-05, "loss": 0.5399, "mean_token_accuracy": 0.8217946467921138, "num_tokens": 55922405.0, "step": 65 }, { "entropy": 0.46429443359375, "epoch": 0.5238095238095238, "grad_norm": 1.061935522396689, "learning_rate": 1.9200673713179245e-05, "loss": 0.5368, "mean_token_accuracy": 0.8207846856676042, "num_tokens": 56770275.0, "step": 66 }, { "entropy": 0.461700439453125, "epoch": 0.5317460317460317, "grad_norm": 1.1927343816090568, "learning_rate": 1.9166039108592008e-05, "loss": 0.5454, "mean_token_accuracy": 0.8190617277286947, "num_tokens": 57627870.0, "step": 67 }, { "entropy": 0.4640045166015625, "epoch": 0.5396825396825397, "grad_norm": 0.932865808441491, "learning_rate": 1.9130702580504678e-05, "loss": 0.5327, "mean_token_accuracy": 0.8246302427724004, "num_tokens": 58469884.0, "step": 68 }, { "entropy": 0.4622344970703125, "epoch": 0.5476190476190477, "grad_norm": 1.0722808809130155, "learning_rate": 1.9094666834943177e-05, "loss": 0.5312, "mean_token_accuracy": 0.8219157354906201, "num_tokens": 59323796.0, "step": 69 }, { "entropy": 0.4629058837890625, "epoch": 0.5555555555555556, "grad_norm": 1.0019623771507054, "learning_rate": 1.9057934631478616e-05, "loss": 0.5228, "mean_token_accuracy": 0.8249138863757253, "num_tokens": 60183841.0, "step": 70 }, { "entropy": 0.4584503173828125, "epoch": 0.5634920634920635, "grad_norm": 0.9985559966173079, "learning_rate": 1.9020508783015942e-05, "loss": 0.5351, "mean_token_accuracy": 0.8224714086391032, "num_tokens": 61048601.0, "step": 71 }, { "entropy": 0.4614410400390625, "epoch": 0.5714285714285714, "grad_norm": 0.9030749792761511, "learning_rate": 1.898239215557856e-05, "loss": 0.5361, "mean_token_accuracy": 0.8228761674836278, "num_tokens": 61887912.0, "step": 72 }, { "entropy": 0.4592132568359375, "epoch": 0.5793650793650794, "grad_norm": 1.0707804587432073, "learning_rate": 1.894358766808883e-05, "loss": 0.5324, "mean_token_accuracy": 0.8242231444455683, "num_tokens": 62741342.0, "step": 73 }, { "entropy": 0.4568939208984375, "epoch": 0.5873015873015873, "grad_norm": 0.9716132865694278, "learning_rate": 1.8904098292144556e-05, "loss": 0.5288, "mean_token_accuracy": 0.8249963694252074, "num_tokens": 63594617.0, "step": 74 }, { "entropy": 0.4536590576171875, "epoch": 0.5952380952380952, "grad_norm": 0.9051567770653394, "learning_rate": 1.8863927051791418e-05, "loss": 0.5255, "mean_token_accuracy": 0.8258578674867749, "num_tokens": 64467695.0, "step": 75 }, { "entropy": 0.45355224609375, "epoch": 0.6031746031746031, "grad_norm": 0.9565155259831919, "learning_rate": 1.88230770232914e-05, "loss": 0.5282, "mean_token_accuracy": 0.8233450087718666, "num_tokens": 65333788.0, "step": 76 }, { "entropy": 0.4597625732421875, "epoch": 0.6111111111111112, "grad_norm": 0.9745677952807633, "learning_rate": 1.8781551334887204e-05, "loss": 0.5202, "mean_token_accuracy": 0.8260575924068689, "num_tokens": 66178918.0, "step": 77 }, { "entropy": 0.45465087890625, "epoch": 0.6190476190476191, "grad_norm": 0.893135579624428, "learning_rate": 1.87393531665627e-05, "loss": 0.5315, "mean_token_accuracy": 0.8249422176741064, "num_tokens": 67052342.0, "step": 78 }, { "entropy": 0.4491424560546875, "epoch": 0.626984126984127, "grad_norm": 0.888144638384342, "learning_rate": 1.869648574979942e-05, "loss": 0.5236, "mean_token_accuracy": 0.8263327423483133, "num_tokens": 67913391.0, "step": 79 }, { "entropy": 0.451751708984375, "epoch": 0.6349206349206349, "grad_norm": 0.9234771356349594, "learning_rate": 1.865295236732907e-05, "loss": 0.5229, "mean_token_accuracy": 0.8255810001865029, "num_tokens": 68772115.0, "step": 80 }, { "entropy": 0.4463348388671875, "epoch": 0.6428571428571429, "grad_norm": 1.0435810443945315, "learning_rate": 1.8608756352882152e-05, "loss": 0.5137, "mean_token_accuracy": 0.828637046739459, "num_tokens": 69611653.0, "step": 81 }, { "entropy": 0.4492645263671875, "epoch": 0.6507936507936508, "grad_norm": 1.050967823837876, "learning_rate": 1.8563901090932673e-05, "loss": 0.5331, "mean_token_accuracy": 0.8240101523697376, "num_tokens": 70496952.0, "step": 82 }, { "entropy": 0.4458160400390625, "epoch": 0.6587301587301587, "grad_norm": 0.9615124983710195, "learning_rate": 1.851839001643898e-05, "loss": 0.5149, "mean_token_accuracy": 0.8283301163464785, "num_tokens": 71343921.0, "step": 83 }, { "entropy": 0.4455108642578125, "epoch": 0.6666666666666666, "grad_norm": 0.9638388209458152, "learning_rate": 1.847222661458069e-05, "loss": 0.5292, "mean_token_accuracy": 0.8246422847732902, "num_tokens": 72240608.0, "step": 84 }, { "entropy": 0.445037841796875, "epoch": 0.6746031746031746, "grad_norm": 1.1463109632434823, "learning_rate": 1.8425414420491817e-05, "loss": 0.5176, "mean_token_accuracy": 0.826205097604543, "num_tokens": 73118452.0, "step": 85 }, { "entropy": 0.4449615478515625, "epoch": 0.6825396825396826, "grad_norm": 0.9645173292489025, "learning_rate": 1.8377957018990043e-05, "loss": 0.522, "mean_token_accuracy": 0.8236676808446646, "num_tokens": 73991069.0, "step": 86 }, { "entropy": 0.4539337158203125, "epoch": 0.6904761904761905, "grad_norm": 1.0557439048120705, "learning_rate": 1.8329858044302212e-05, "loss": 0.5143, "mean_token_accuracy": 0.8274678424932063, "num_tokens": 74839901.0, "step": 87 }, { "entropy": 0.4486083984375, "epoch": 0.6984126984126984, "grad_norm": 0.9805168214479623, "learning_rate": 1.8281121179786024e-05, "loss": 0.5255, "mean_token_accuracy": 0.8258101856335998, "num_tokens": 75749725.0, "step": 88 }, { "entropy": 0.4547576904296875, "epoch": 0.7063492063492064, "grad_norm": 1.0335119460745013, "learning_rate": 1.823175015764795e-05, "loss": 0.52, "mean_token_accuracy": 0.8265441199764609, "num_tokens": 76593690.0, "step": 89 }, { "entropy": 0.4552154541015625, "epoch": 0.7142857142857143, "grad_norm": 1.0221663487735775, "learning_rate": 1.818174875865744e-05, "loss": 0.5127, "mean_token_accuracy": 0.8279964146204293, "num_tokens": 77431030.0, "step": 90 }, { "entropy": 0.4508514404296875, "epoch": 0.7222222222222222, "grad_norm": 1.0129218088142515, "learning_rate": 1.8131120811857398e-05, "loss": 0.5138, "mean_token_accuracy": 0.827417416498065, "num_tokens": 78278605.0, "step": 91 }, { "entropy": 0.4442596435546875, "epoch": 0.7301587301587301, "grad_norm": 0.862267079570144, "learning_rate": 1.8079870194270958e-05, "loss": 0.5086, "mean_token_accuracy": 0.8285580319352448, "num_tokens": 79154216.0, "step": 92 }, { "entropy": 0.4412384033203125, "epoch": 0.7380952380952381, "grad_norm": 1.0290357398317191, "learning_rate": 1.802800083060457e-05, "loss": 0.5196, "mean_token_accuracy": 0.8259601076133549, "num_tokens": 80039204.0, "step": 93 }, { "entropy": 0.440216064453125, "epoch": 0.746031746031746, "grad_norm": 0.9200513017864748, "learning_rate": 1.7975516692947478e-05, "loss": 0.5122, "mean_token_accuracy": 0.8288197009824216, "num_tokens": 80910348.0, "step": 94 }, { "entropy": 0.441497802734375, "epoch": 0.753968253968254, "grad_norm": 0.9187678159469537, "learning_rate": 1.7922421800467515e-05, "loss": 0.51, "mean_token_accuracy": 0.8292136248201132, "num_tokens": 81765325.0, "step": 95 }, { "entropy": 0.448394775390625, "epoch": 0.7619047619047619, "grad_norm": 0.9243368764216906, "learning_rate": 1.7868720219103343e-05, "loss": 0.5021, "mean_token_accuracy": 0.830197315197438, "num_tokens": 82611125.0, "step": 96 }, { "entropy": 0.4393768310546875, "epoch": 0.7698412698412699, "grad_norm": 0.9416277689893134, "learning_rate": 1.7814416061253076e-05, "loss": 0.5129, "mean_token_accuracy": 0.8264825385995209, "num_tokens": 83488021.0, "step": 97 }, { "entropy": 0.4447784423828125, "epoch": 0.7777777777777778, "grad_norm": 0.9592607594441928, "learning_rate": 1.7759513485459367e-05, "loss": 0.5082, "mean_token_accuracy": 0.828405749052763, "num_tokens": 84321775.0, "step": 98 }, { "entropy": 0.4433746337890625, "epoch": 0.7857142857142857, "grad_norm": 1.0051516989789744, "learning_rate": 1.7704016696090936e-05, "loss": 0.4961, "mean_token_accuracy": 0.8327284948900342, "num_tokens": 85167087.0, "step": 99 }, { "entropy": 0.44500732421875, "epoch": 0.7936507936507936, "grad_norm": 0.9104747464714449, "learning_rate": 1.7647929943020625e-05, "loss": 0.5084, "mean_token_accuracy": 0.8306850432418287, "num_tokens": 86009383.0, "step": 100 }, { "entropy": 0.441436767578125, "epoch": 0.8015873015873016, "grad_norm": 0.9773768746481235, "learning_rate": 1.759125752129993e-05, "loss": 0.5015, "mean_token_accuracy": 0.8314397023059428, "num_tokens": 86862628.0, "step": 101 }, { "entropy": 0.4582061767578125, "epoch": 0.8095238095238095, "grad_norm": 1.059657795313606, "learning_rate": 1.753400377083011e-05, "loss": 0.505, "mean_token_accuracy": 0.8305098316632211, "num_tokens": 87713395.0, "step": 102 }, { "entropy": 0.5252227783203125, "epoch": 0.8174603174603174, "grad_norm": 0.9398994877728049, "learning_rate": 1.747617307602982e-05, "loss": 0.5165, "mean_token_accuracy": 0.8260967722162604, "num_tokens": 88602545.0, "step": 103 }, { "entropy": 0.508758544921875, "epoch": 0.8253968253968254, "grad_norm": 0.9520709865588342, "learning_rate": 1.741776986549938e-05, "loss": 0.5006, "mean_token_accuracy": 0.8329666894860566, "num_tokens": 89444255.0, "step": 104 }, { "entropy": 0.482879638671875, "epoch": 0.8333333333333334, "grad_norm": 0.8345183511029048, "learning_rate": 1.735879861168163e-05, "loss": 0.4944, "mean_token_accuracy": 0.8349198163487017, "num_tokens": 90312774.0, "step": 105 }, { "entropy": 0.4606170654296875, "epoch": 0.8412698412698413, "grad_norm": 0.8850629942716965, "learning_rate": 1.729926383051943e-05, "loss": 0.4939, "mean_token_accuracy": 0.832167761400342, "num_tokens": 91177447.0, "step": 106 }, { "entropy": 0.443572998046875, "epoch": 0.8492063492063492, "grad_norm": 0.7755107549812108, "learning_rate": 1.723917008110984e-05, "loss": 0.4936, "mean_token_accuracy": 0.8328760690055788, "num_tokens": 92026164.0, "step": 107 }, { "entropy": 0.4337921142578125, "epoch": 0.8571428571428571, "grad_norm": 0.9590789688031214, "learning_rate": 1.7178521965354992e-05, "loss": 0.4946, "mean_token_accuracy": 0.8331891498528421, "num_tokens": 92891631.0, "step": 108 }, { "entropy": 0.4388275146484375, "epoch": 0.8650793650793651, "grad_norm": 0.7656212035221501, "learning_rate": 1.7117324127609686e-05, "loss": 0.4979, "mean_token_accuracy": 0.8320917426608503, "num_tokens": 93760535.0, "step": 109 }, { "entropy": 0.436920166015625, "epoch": 0.873015873015873, "grad_norm": 0.8567777469026723, "learning_rate": 1.7055581254325716e-05, "loss": 0.4953, "mean_token_accuracy": 0.832193429581821, "num_tokens": 94599260.0, "step": 110 }, { "entropy": 0.4438934326171875, "epoch": 0.8809523809523809, "grad_norm": 0.8601918515641613, "learning_rate": 1.6993298073693005e-05, "loss": 0.4935, "mean_token_accuracy": 0.8328238227404654, "num_tokens": 95425799.0, "step": 111 }, { "entropy": 0.4399871826171875, "epoch": 0.8888888888888888, "grad_norm": 0.8064705741039472, "learning_rate": 1.693047935527751e-05, "loss": 0.4988, "mean_token_accuracy": 0.833238854072988, "num_tokens": 96260271.0, "step": 112 }, { "entropy": 0.4324188232421875, "epoch": 0.8968253968253969, "grad_norm": 0.8781501824333462, "learning_rate": 1.6867129909656e-05, "loss": 0.4972, "mean_token_accuracy": 0.8320957766845822, "num_tokens": 97135925.0, "step": 113 }, { "entropy": 0.43426513671875, "epoch": 0.9047619047619048, "grad_norm": 0.8118228734667666, "learning_rate": 1.680325458804763e-05, "loss": 0.4989, "mean_token_accuracy": 0.8331179022789001, "num_tokens": 98011108.0, "step": 114 }, { "entropy": 0.4383697509765625, "epoch": 0.9126984126984127, "grad_norm": 0.8371447521121621, "learning_rate": 1.6738858281942477e-05, "loss": 0.4878, "mean_token_accuracy": 0.8329889746382833, "num_tokens": 98873029.0, "step": 115 }, { "entropy": 0.437957763671875, "epoch": 0.9206349206349206, "grad_norm": 0.9317880439623095, "learning_rate": 1.6673945922726945e-05, "loss": 0.4956, "mean_token_accuracy": 0.8307403367944062, "num_tokens": 99734864.0, "step": 116 }, { "entropy": 0.437530517578125, "epoch": 0.9285714285714286, "grad_norm": 0.8078951251086017, "learning_rate": 1.660852248130611e-05, "loss": 0.4987, "mean_token_accuracy": 0.8325757388956845, "num_tokens": 100606926.0, "step": 117 }, { "entropy": 0.441864013671875, "epoch": 0.9365079365079365, "grad_norm": 0.8703434683361917, "learning_rate": 1.6542592967723065e-05, "loss": 0.4862, "mean_token_accuracy": 0.8341412721201777, "num_tokens": 101447599.0, "step": 118 }, { "entropy": 0.4399871826171875, "epoch": 0.9444444444444444, "grad_norm": 0.8653941545842383, "learning_rate": 1.6476162430775278e-05, "loss": 0.4875, "mean_token_accuracy": 0.8343443763442338, "num_tokens": 102275358.0, "step": 119 }, { "entropy": 0.443206787109375, "epoch": 0.9523809523809523, "grad_norm": 0.8489052672431998, "learning_rate": 1.6409235957627926e-05, "loss": 0.487, "mean_token_accuracy": 0.833757430780679, "num_tokens": 103113293.0, "step": 120 }, { "entropy": 0.4330596923828125, "epoch": 0.9603174603174603, "grad_norm": 0.8343529697637952, "learning_rate": 1.6341818673424342e-05, "loss": 0.4927, "mean_token_accuracy": 0.8324310649186373, "num_tokens": 104000550.0, "step": 121 }, { "entropy": 0.4350738525390625, "epoch": 0.9682539682539683, "grad_norm": 0.8358525938559462, "learning_rate": 1.6273915740893557e-05, "loss": 0.491, "mean_token_accuracy": 0.8333931621164083, "num_tokens": 104859286.0, "step": 122 }, { "entropy": 0.4355621337890625, "epoch": 0.9761904761904762, "grad_norm": 0.8709315219677664, "learning_rate": 1.6205532359954905e-05, "loss": 0.4911, "mean_token_accuracy": 0.8330642161890864, "num_tokens": 105729675.0, "step": 123 }, { "entropy": 0.430023193359375, "epoch": 0.9841269841269841, "grad_norm": 0.7686865926441706, "learning_rate": 1.6136673767319853e-05, "loss": 0.4874, "mean_token_accuracy": 0.8338018441572785, "num_tokens": 106603968.0, "step": 124 }, { "entropy": 0.426727294921875, "epoch": 0.9920634920634921, "grad_norm": 0.8197400248642781, "learning_rate": 1.606734523609097e-05, "loss": 0.4839, "mean_token_accuracy": 0.8362096287310123, "num_tokens": 107495007.0, "step": 125 }, { "entropy": 0.4326019287109375, "epoch": 1.0, "grad_norm": 0.7907079441928082, "learning_rate": 1.5997552075358122e-05, "loss": 0.4943, "mean_token_accuracy": 0.8323847940191627, "num_tokens": 108364335.0, "step": 126 }, { "entropy": 0.4468841552734375, "epoch": 1.007936507936508, "grad_norm": 0.8800472941103096, "learning_rate": 1.592729962979189e-05, "loss": 0.4628, "mean_token_accuracy": 0.8423365484923124, "num_tokens": 109202665.0, "step": 127 }, { "entropy": 0.43121337890625, "epoch": 1.0158730158730158, "grad_norm": 0.7866269381892753, "learning_rate": 1.585659327923432e-05, "loss": 0.4648, "mean_token_accuracy": 0.8404850475490093, "num_tokens": 110061605.0, "step": 128 }, { "entropy": 0.432586669921875, "epoch": 1.0238095238095237, "grad_norm": 0.8561544665592976, "learning_rate": 1.5785438438286892e-05, "loss": 0.4597, "mean_token_accuracy": 0.8419345654547215, "num_tokens": 110924491.0, "step": 129 }, { "entropy": 0.430633544921875, "epoch": 1.0317460317460316, "grad_norm": 0.9000097472640374, "learning_rate": 1.5713840555895937e-05, "loss": 0.4624, "mean_token_accuracy": 0.8415501727722585, "num_tokens": 111773832.0, "step": 130 }, { "entropy": 0.42864990234375, "epoch": 1.0396825396825398, "grad_norm": 0.7502414887434086, "learning_rate": 1.5641805114935297e-05, "loss": 0.4542, "mean_token_accuracy": 0.8432249454781413, "num_tokens": 112637470.0, "step": 131 }, { "entropy": 0.42877197265625, "epoch": 1.0476190476190477, "grad_norm": 0.7541703719215876, "learning_rate": 1.556933763178651e-05, "loss": 0.4638, "mean_token_accuracy": 0.8415564014576375, "num_tokens": 113501590.0, "step": 132 }, { "entropy": 0.433380126953125, "epoch": 1.0555555555555556, "grad_norm": 0.7325102765605829, "learning_rate": 1.5496443655916348e-05, "loss": 0.4594, "mean_token_accuracy": 0.8436301471665502, "num_tokens": 114360533.0, "step": 133 }, { "entropy": 0.4308319091796875, "epoch": 1.0634920634920635, "grad_norm": 0.7230120782824351, "learning_rate": 1.5423128769451832e-05, "loss": 0.4612, "mean_token_accuracy": 0.8420308292843401, "num_tokens": 115231953.0, "step": 134 }, { "entropy": 0.4346466064453125, "epoch": 1.0714285714285714, "grad_norm": 0.7168197905226968, "learning_rate": 1.5349398586752794e-05, "loss": 0.4577, "mean_token_accuracy": 0.8419447150081396, "num_tokens": 116092221.0, "step": 135 }, { "entropy": 0.43701171875, "epoch": 1.0793650793650793, "grad_norm": 0.7872377130062906, "learning_rate": 1.52752587539819e-05, "loss": 0.462, "mean_token_accuracy": 0.8413771693594754, "num_tokens": 116984739.0, "step": 136 }, { "entropy": 0.435943603515625, "epoch": 1.0873015873015872, "grad_norm": 0.7254403268334357, "learning_rate": 1.5200714948672313e-05, "loss": 0.4626, "mean_token_accuracy": 0.8416628721170127, "num_tokens": 117852991.0, "step": 137 }, { "entropy": 0.4440155029296875, "epoch": 1.0952380952380953, "grad_norm": 0.8923472179704972, "learning_rate": 1.512577287929288e-05, "loss": 0.46, "mean_token_accuracy": 0.8432498262263834, "num_tokens": 118696927.0, "step": 138 }, { "entropy": 0.4384002685546875, "epoch": 1.1031746031746033, "grad_norm": 0.8036615162498064, "learning_rate": 1.5050438284811001e-05, "loss": 0.4608, "mean_token_accuracy": 0.8414666503667831, "num_tokens": 119569613.0, "step": 139 }, { "entropy": 0.43438720703125, "epoch": 1.1111111111111112, "grad_norm": 0.8378698989178901, "learning_rate": 1.4974716934253146e-05, "loss": 0.4615, "mean_token_accuracy": 0.8419166500680149, "num_tokens": 120447089.0, "step": 140 }, { "entropy": 0.4336395263671875, "epoch": 1.119047619047619, "grad_norm": 0.7339153921994472, "learning_rate": 1.4898614626263066e-05, "loss": 0.452, "mean_token_accuracy": 0.8448847294785082, "num_tokens": 121314886.0, "step": 141 }, { "entropy": 0.436614990234375, "epoch": 1.126984126984127, "grad_norm": 0.7413352124459704, "learning_rate": 1.4822137188657752e-05, "loss": 0.4498, "mean_token_accuracy": 0.8447657427750528, "num_tokens": 122167617.0, "step": 142 }, { "entropy": 0.4323883056640625, "epoch": 1.1349206349206349, "grad_norm": 0.7606295937517913, "learning_rate": 1.474529047798112e-05, "loss": 0.4538, "mean_token_accuracy": 0.8420953522436321, "num_tokens": 123013840.0, "step": 143 }, { "entropy": 0.4267730712890625, "epoch": 1.1428571428571428, "grad_norm": 0.7759729162933741, "learning_rate": 1.4668080379055563e-05, "loss": 0.4466, "mean_token_accuracy": 0.8450842797756195, "num_tokens": 123876490.0, "step": 144 }, { "entropy": 0.4368438720703125, "epoch": 1.1507936507936507, "grad_norm": 0.7751718151560868, "learning_rate": 1.4590512804531272e-05, "loss": 0.4541, "mean_token_accuracy": 0.8422016915865242, "num_tokens": 124713314.0, "step": 145 }, { "entropy": 0.437255859375, "epoch": 1.1587301587301586, "grad_norm": 0.8833882223237038, "learning_rate": 1.4512593694433455e-05, "loss": 0.4641, "mean_token_accuracy": 0.8396992119960487, "num_tokens": 125564746.0, "step": 146 }, { "entropy": 0.4323883056640625, "epoch": 1.1666666666666667, "grad_norm": 0.7186599445695117, "learning_rate": 1.4434329015707468e-05, "loss": 0.447, "mean_token_accuracy": 0.8458566442131996, "num_tokens": 126415997.0, "step": 147 }, { "entropy": 0.4389801025390625, "epoch": 1.1746031746031746, "grad_norm": 0.7870343406013125, "learning_rate": 1.435572476176187e-05, "loss": 0.4611, "mean_token_accuracy": 0.8420486990362406, "num_tokens": 127285760.0, "step": 148 }, { "entropy": 0.445037841796875, "epoch": 1.1825396825396826, "grad_norm": 0.7564486218479161, "learning_rate": 1.427678695200945e-05, "loss": 0.4633, "mean_token_accuracy": 0.8416752209886909, "num_tokens": 128153685.0, "step": 149 }, { "entropy": 0.4512786865234375, "epoch": 1.1904761904761905, "grad_norm": 0.7524650004648994, "learning_rate": 1.4197521631406279e-05, "loss": 0.4477, "mean_token_accuracy": 0.8451099991798401, "num_tokens": 128990738.0, "step": 150 }, { "entropy": 0.4539337158203125, "epoch": 1.1984126984126984, "grad_norm": 0.8270324190177804, "learning_rate": 1.4117934869988776e-05, "loss": 0.4602, "mean_token_accuracy": 0.8432164471596479, "num_tokens": 129848900.0, "step": 151 }, { "entropy": 0.4565582275390625, "epoch": 1.2063492063492063, "grad_norm": 0.7668134474607962, "learning_rate": 1.4038032762408897e-05, "loss": 0.4588, "mean_token_accuracy": 0.8423843700438738, "num_tokens": 130724709.0, "step": 152 }, { "entropy": 0.457763671875, "epoch": 1.2142857142857142, "grad_norm": 0.7234771900704076, "learning_rate": 1.3957821427467392e-05, "loss": 0.4448, "mean_token_accuracy": 0.8460949282161891, "num_tokens": 131582811.0, "step": 153 }, { "entropy": 0.4557952880859375, "epoch": 1.2222222222222223, "grad_norm": 0.7569275321904251, "learning_rate": 1.3877307007645256e-05, "loss": 0.4521, "mean_token_accuracy": 0.8441468216478825, "num_tokens": 132429743.0, "step": 154 }, { "entropy": 0.4557342529296875, "epoch": 1.2301587301587302, "grad_norm": 0.7661971933972097, "learning_rate": 1.3796495668633325e-05, "loss": 0.455, "mean_token_accuracy": 0.8435652130283415, "num_tokens": 133291943.0, "step": 155 }, { "entropy": 0.4580535888671875, "epoch": 1.2380952380952381, "grad_norm": 0.7878876156560011, "learning_rate": 1.3715393598860129e-05, "loss": 0.4515, "mean_token_accuracy": 0.8445776179432869, "num_tokens": 134149814.0, "step": 156 }, { "entropy": 0.45654296875, "epoch": 1.246031746031746, "grad_norm": 0.7496215665379915, "learning_rate": 1.3634007009017986e-05, "loss": 0.4435, "mean_token_accuracy": 0.8470091614872217, "num_tokens": 134989406.0, "step": 157 }, { "entropy": 0.449371337890625, "epoch": 1.253968253968254, "grad_norm": 0.7747908901130726, "learning_rate": 1.3552342131587399e-05, "loss": 0.4398, "mean_token_accuracy": 0.8483070912770927, "num_tokens": 135832642.0, "step": 158 }, { "entropy": 0.44940185546875, "epoch": 1.2619047619047619, "grad_norm": 0.8393013667899509, "learning_rate": 1.3470405220359773e-05, "loss": 0.4493, "mean_token_accuracy": 0.8453219896182418, "num_tokens": 136724748.0, "step": 159 }, { "entropy": 0.4533843994140625, "epoch": 1.2698412698412698, "grad_norm": 0.736903810428085, "learning_rate": 1.3388202549958507e-05, "loss": 0.4487, "mean_token_accuracy": 0.8448820817284286, "num_tokens": 137570382.0, "step": 160 }, { "entropy": 0.451629638671875, "epoch": 1.2777777777777777, "grad_norm": 0.7268998948956484, "learning_rate": 1.3305740415358506e-05, "loss": 0.4511, "mean_token_accuracy": 0.8454865459352732, "num_tokens": 138431194.0, "step": 161 }, { "entropy": 0.44989013671875, "epoch": 1.2857142857142856, "grad_norm": 0.7914645129000742, "learning_rate": 1.3223025131404106e-05, "loss": 0.4367, "mean_token_accuracy": 0.8472912893630564, "num_tokens": 139287890.0, "step": 162 }, { "entropy": 0.4466400146484375, "epoch": 1.2936507936507937, "grad_norm": 0.6999054520073498, "learning_rate": 1.3140063032325491e-05, "loss": 0.4509, "mean_token_accuracy": 0.8445514859631658, "num_tokens": 140160179.0, "step": 163 }, { "entropy": 0.44244384765625, "epoch": 1.3015873015873016, "grad_norm": 0.7926199960561141, "learning_rate": 1.3056860471253639e-05, "loss": 0.4512, "mean_token_accuracy": 0.8441420421004295, "num_tokens": 141002875.0, "step": 164 }, { "entropy": 0.4391632080078125, "epoch": 1.3095238095238095, "grad_norm": 0.7155247497693717, "learning_rate": 1.297342381973379e-05, "loss": 0.4405, "mean_token_accuracy": 0.8467378858476877, "num_tokens": 141858286.0, "step": 165 }, { "entropy": 0.442840576171875, "epoch": 1.3174603174603174, "grad_norm": 0.7604417217639244, "learning_rate": 1.2889759467237532e-05, "loss": 0.4424, "mean_token_accuracy": 0.8471214440651238, "num_tokens": 142698339.0, "step": 166 }, { "entropy": 0.44219970703125, "epoch": 1.3253968253968254, "grad_norm": 0.7842027331020274, "learning_rate": 1.2805873820673509e-05, "loss": 0.443, "mean_token_accuracy": 0.84578693844378, "num_tokens": 143561112.0, "step": 167 }, { "entropy": 0.4424896240234375, "epoch": 1.3333333333333333, "grad_norm": 0.6815401406283373, "learning_rate": 1.2721773303896765e-05, "loss": 0.4451, "mean_token_accuracy": 0.8464445443823934, "num_tokens": 144390223.0, "step": 168 }, { "entropy": 0.440338134765625, "epoch": 1.3412698412698414, "grad_norm": 0.6935995098018823, "learning_rate": 1.2637464357216847e-05, "loss": 0.4565, "mean_token_accuracy": 0.843538910150528, "num_tokens": 145276276.0, "step": 169 }, { "entropy": 0.4456939697265625, "epoch": 1.3492063492063493, "grad_norm": 0.7555715111365282, "learning_rate": 1.2552953436904578e-05, "loss": 0.4464, "mean_token_accuracy": 0.8468158571049571, "num_tokens": 146148957.0, "step": 170 }, { "entropy": 0.45037841796875, "epoch": 1.3571428571428572, "grad_norm": 0.6578938329224823, "learning_rate": 1.246824701469768e-05, "loss": 0.4444, "mean_token_accuracy": 0.8462554500438273, "num_tokens": 146999892.0, "step": 171 }, { "entropy": 0.4438934326171875, "epoch": 1.3650793650793651, "grad_norm": 0.7952540493885925, "learning_rate": 1.2383351577305148e-05, "loss": 0.446, "mean_token_accuracy": 0.845737649127841, "num_tokens": 147888947.0, "step": 172 }, { "entropy": 0.450836181640625, "epoch": 1.373015873015873, "grad_norm": 0.7074173689683401, "learning_rate": 1.2298273625910512e-05, "loss": 0.4568, "mean_token_accuracy": 0.8422739477828145, "num_tokens": 148771994.0, "step": 173 }, { "entropy": 0.4434661865234375, "epoch": 1.380952380952381, "grad_norm": 0.6710031250840192, "learning_rate": 1.2213019675674008e-05, "loss": 0.4433, "mean_token_accuracy": 0.8467194638215005, "num_tokens": 149626353.0, "step": 174 }, { "entropy": 0.4410858154296875, "epoch": 1.3888888888888888, "grad_norm": 0.6457827179824793, "learning_rate": 1.2127596255233622e-05, "loss": 0.4379, "mean_token_accuracy": 0.8480355520732701, "num_tokens": 150484433.0, "step": 175 }, { "entropy": 0.4429931640625, "epoch": 1.3968253968253967, "grad_norm": 0.6573907703715658, "learning_rate": 1.2042009906205152e-05, "loss": 0.4433, "mean_token_accuracy": 0.8479111595079303, "num_tokens": 151351171.0, "step": 176 }, { "entropy": 0.4395904541015625, "epoch": 1.4047619047619047, "grad_norm": 0.7949371909592465, "learning_rate": 1.1956267182681265e-05, "loss": 0.4504, "mean_token_accuracy": 0.8442064803093672, "num_tokens": 152198704.0, "step": 177 }, { "entropy": 0.442718505859375, "epoch": 1.4126984126984126, "grad_norm": 0.678353055772471, "learning_rate": 1.1870374650729582e-05, "loss": 0.4433, "mean_token_accuracy": 0.8477690340951085, "num_tokens": 153027562.0, "step": 178 }, { "entropy": 0.4433441162109375, "epoch": 1.4206349206349207, "grad_norm": 0.6780439570882703, "learning_rate": 1.1784338887889858e-05, "loss": 0.4385, "mean_token_accuracy": 0.8462753728963435, "num_tokens": 153863890.0, "step": 179 }, { "entropy": 0.442413330078125, "epoch": 1.4285714285714286, "grad_norm": 0.6587673965640602, "learning_rate": 1.1698166482670293e-05, "loss": 0.446, "mean_token_accuracy": 0.8457558886148036, "num_tokens": 154707913.0, "step": 180 }, { "entropy": 0.4350433349609375, "epoch": 1.4365079365079365, "grad_norm": 0.6816545482512325, "learning_rate": 1.1611864034042972e-05, "loss": 0.4438, "mean_token_accuracy": 0.8467250894755125, "num_tokens": 155590050.0, "step": 181 }, { "entropy": 0.4351959228515625, "epoch": 1.4444444444444444, "grad_norm": 0.6868757464599637, "learning_rate": 1.1525438150938554e-05, "loss": 0.4401, "mean_token_accuracy": 0.84777757152915, "num_tokens": 156449879.0, "step": 182 }, { "entropy": 0.4363861083984375, "epoch": 1.4523809523809523, "grad_norm": 0.6766249529263749, "learning_rate": 1.1438895451740141e-05, "loss": 0.4292, "mean_token_accuracy": 0.8506109705194831, "num_tokens": 157304143.0, "step": 183 }, { "entropy": 0.4354400634765625, "epoch": 1.4603174603174602, "grad_norm": 0.6379506488261414, "learning_rate": 1.135224256377646e-05, "loss": 0.4459, "mean_token_accuracy": 0.8441523900255561, "num_tokens": 158177988.0, "step": 184 }, { "entropy": 0.431732177734375, "epoch": 1.4682539682539684, "grad_norm": 0.6506443662541825, "learning_rate": 1.1265486122814359e-05, "loss": 0.4468, "mean_token_accuracy": 0.845328216906637, "num_tokens": 159060066.0, "step": 185 }, { "entropy": 0.4365386962890625, "epoch": 1.4761904761904763, "grad_norm": 0.6750756914615313, "learning_rate": 1.1178632772550636e-05, "loss": 0.4416, "mean_token_accuracy": 0.846350169274956, "num_tokens": 159942986.0, "step": 186 }, { "entropy": 0.438690185546875, "epoch": 1.4841269841269842, "grad_norm": 0.5998682565802522, "learning_rate": 1.1091689164103281e-05, "loss": 0.4338, "mean_token_accuracy": 0.8496084534563124, "num_tokens": 160782816.0, "step": 187 }, { "entropy": 0.4376220703125, "epoch": 1.492063492063492, "grad_norm": 0.7017393839562811, "learning_rate": 1.1004661955502143e-05, "loss": 0.4369, "mean_token_accuracy": 0.8506594416685402, "num_tokens": 161643512.0, "step": 188 }, { "entropy": 0.43133544921875, "epoch": 1.5, "grad_norm": 0.6526274910477081, "learning_rate": 1.0917557811179057e-05, "loss": 0.4308, "mean_token_accuracy": 0.849564865231514, "num_tokens": 162503001.0, "step": 189 }, { "entropy": 0.4289703369140625, "epoch": 1.507936507936508, "grad_norm": 0.6948150219846541, "learning_rate": 1.0830383401457499e-05, "loss": 0.4423, "mean_token_accuracy": 0.8475161967799067, "num_tokens": 163388010.0, "step": 190 }, { "entropy": 0.424591064453125, "epoch": 1.5158730158730158, "grad_norm": 0.6197770130603713, "learning_rate": 1.0743145402041781e-05, "loss": 0.4356, "mean_token_accuracy": 0.8487303233705461, "num_tokens": 164270399.0, "step": 191 }, { "entropy": 0.4240570068359375, "epoch": 1.5238095238095237, "grad_norm": 0.6196952871398042, "learning_rate": 1.0655850493505834e-05, "loss": 0.4332, "mean_token_accuracy": 0.8493564445525408, "num_tokens": 165155523.0, "step": 192 }, { "entropy": 0.42626953125, "epoch": 1.5317460317460316, "grad_norm": 0.6835645143050197, "learning_rate": 1.0568505360781606e-05, "loss": 0.4323, "mean_token_accuracy": 0.8495618836022913, "num_tokens": 166004219.0, "step": 193 }, { "entropy": 0.4276275634765625, "epoch": 1.5396825396825395, "grad_norm": 0.6324264334244951, "learning_rate": 1.0481116692647165e-05, "loss": 0.433, "mean_token_accuracy": 0.8492591748945415, "num_tokens": 166887486.0, "step": 194 }, { "entropy": 0.424072265625, "epoch": 1.5476190476190477, "grad_norm": 0.651624241869441, "learning_rate": 1.039369118121445e-05, "loss": 0.4353, "mean_token_accuracy": 0.8506257832050323, "num_tokens": 167743608.0, "step": 195 }, { "entropy": 0.4300537109375, "epoch": 1.5555555555555556, "grad_norm": 0.6114244627798591, "learning_rate": 1.0306235521416822e-05, "loss": 0.4327, "mean_token_accuracy": 0.8502432033419609, "num_tokens": 168602032.0, "step": 196 }, { "entropy": 0.42584228515625, "epoch": 1.5634920634920635, "grad_norm": 0.6819738923664077, "learning_rate": 1.0218756410496353e-05, "loss": 0.4399, "mean_token_accuracy": 0.84707788284868, "num_tokens": 169469975.0, "step": 197 }, { "entropy": 0.4196014404296875, "epoch": 1.5714285714285714, "grad_norm": 0.6181282041855998, "learning_rate": 1.013126054749099e-05, "loss": 0.4381, "mean_token_accuracy": 0.8472674307413399, "num_tokens": 170343282.0, "step": 198 }, { "entropy": 0.4282989501953125, "epoch": 1.5793650793650795, "grad_norm": 0.658618641759341, "learning_rate": 1.0043754632721519e-05, "loss": 0.4372, "mean_token_accuracy": 0.8485192256048322, "num_tokens": 171227432.0, "step": 199 }, { "entropy": 0.4275665283203125, "epoch": 1.5873015873015874, "grad_norm": 0.6304982479501472, "learning_rate": 9.956245367278483e-06, "loss": 0.4212, "mean_token_accuracy": 0.8523962092585862, "num_tokens": 172096305.0, "step": 200 }, { "entropy": 0.427978515625, "epoch": 1.5952380952380953, "grad_norm": 0.6371903010309696, "learning_rate": 9.868739452509011e-06, "loss": 0.4255, "mean_token_accuracy": 0.8514103842899203, "num_tokens": 172910673.0, "step": 201 }, { "entropy": 0.41900634765625, "epoch": 1.6031746031746033, "grad_norm": 0.6949109352240498, "learning_rate": 9.78124358950365e-06, "loss": 0.4312, "mean_token_accuracy": 0.8511450518853962, "num_tokens": 173775762.0, "step": 202 }, { "entropy": 0.4253692626953125, "epoch": 1.6111111111111112, "grad_norm": 0.6809037771433717, "learning_rate": 9.693764478583185e-06, "loss": 0.4341, "mean_token_accuracy": 0.8501040656119585, "num_tokens": 174651858.0, "step": 203 }, { "entropy": 0.428009033203125, "epoch": 1.619047619047619, "grad_norm": 0.6543104002765698, "learning_rate": 9.606308818785552e-06, "loss": 0.425, "mean_token_accuracy": 0.8514936515130103, "num_tokens": 175519282.0, "step": 204 }, { "entropy": 0.429931640625, "epoch": 1.626984126984127, "grad_norm": 0.6352549480190481, "learning_rate": 9.518883307352839e-06, "loss": 0.4405, "mean_token_accuracy": 0.8475879756733775, "num_tokens": 176387199.0, "step": 205 }, { "entropy": 0.4254150390625, "epoch": 1.6349206349206349, "grad_norm": 0.6497469357956864, "learning_rate": 9.431494639218397e-06, "loss": 0.4355, "mean_token_accuracy": 0.8499571783468127, "num_tokens": 177264131.0, "step": 206 }, { "entropy": 0.42901611328125, "epoch": 1.6428571428571428, "grad_norm": 0.6810764457880856, "learning_rate": 9.344149506494169e-06, "loss": 0.4281, "mean_token_accuracy": 0.8508090190589428, "num_tokens": 178114003.0, "step": 207 }, { "entropy": 0.4324951171875, "epoch": 1.6507936507936507, "grad_norm": 0.6629255607044134, "learning_rate": 9.256854597958222e-06, "loss": 0.4369, "mean_token_accuracy": 0.8478540312498808, "num_tokens": 178950487.0, "step": 208 }, { "entropy": 0.4268035888671875, "epoch": 1.6587301587301586, "grad_norm": 0.6985129275896726, "learning_rate": 9.169616598542503e-06, "loss": 0.4379, "mean_token_accuracy": 0.8475267360918224, "num_tokens": 179833212.0, "step": 209 }, { "entropy": 0.4263458251953125, "epoch": 1.6666666666666665, "grad_norm": 0.6679619087501362, "learning_rate": 9.082442188820947e-06, "loss": 0.4283, "mean_token_accuracy": 0.8523209383711219, "num_tokens": 180712234.0, "step": 210 }, { "entropy": 0.4289398193359375, "epoch": 1.6746031746031746, "grad_norm": 0.6473182358853972, "learning_rate": 8.995338044497862e-06, "loss": 0.4402, "mean_token_accuracy": 0.8467970639467239, "num_tokens": 181566490.0, "step": 211 }, { "entropy": 0.43121337890625, "epoch": 1.6825396825396826, "grad_norm": 0.6906263852183294, "learning_rate": 8.90831083589672e-06, "loss": 0.4335, "mean_token_accuracy": 0.8494043787941337, "num_tokens": 182413254.0, "step": 212 }, { "entropy": 0.4273223876953125, "epoch": 1.6904761904761905, "grad_norm": 0.6039213029248242, "learning_rate": 8.821367227449368e-06, "loss": 0.4276, "mean_token_accuracy": 0.8509090105071664, "num_tokens": 183269538.0, "step": 213 }, { "entropy": 0.433563232421875, "epoch": 1.6984126984126984, "grad_norm": 0.7269407148981566, "learning_rate": 8.734513877185644e-06, "loss": 0.4245, "mean_token_accuracy": 0.8521207985468209, "num_tokens": 184109496.0, "step": 214 }, { "entropy": 0.4334564208984375, "epoch": 1.7063492063492065, "grad_norm": 0.5897073419234542, "learning_rate": 8.647757436223543e-06, "loss": 0.4383, "mean_token_accuracy": 0.8479864248074591, "num_tokens": 184968366.0, "step": 215 }, { "entropy": 0.4318084716796875, "epoch": 1.7142857142857144, "grad_norm": 0.6884447059302515, "learning_rate": 8.561104548259864e-06, "loss": 0.4376, "mean_token_accuracy": 0.8475788393989205, "num_tokens": 185857166.0, "step": 216 }, { "entropy": 0.43670654296875, "epoch": 1.7222222222222223, "grad_norm": 0.6407810130824321, "learning_rate": 8.474561849061446e-06, "loss": 0.4194, "mean_token_accuracy": 0.8531361422501504, "num_tokens": 186684767.0, "step": 217 }, { "entropy": 0.4307403564453125, "epoch": 1.7301587301587302, "grad_norm": 0.672082814857435, "learning_rate": 8.388135965957031e-06, "loss": 0.422, "mean_token_accuracy": 0.8524083560332656, "num_tokens": 187534641.0, "step": 218 }, { "entropy": 0.4279327392578125, "epoch": 1.7380952380952381, "grad_norm": 0.6434223057887511, "learning_rate": 8.301833517329714e-06, "loss": 0.4254, "mean_token_accuracy": 0.8503254759125412, "num_tokens": 188403747.0, "step": 219 }, { "entropy": 0.4255218505859375, "epoch": 1.746031746031746, "grad_norm": 0.660253597993015, "learning_rate": 8.215661112110143e-06, "loss": 0.43, "mean_token_accuracy": 0.8505678987130523, "num_tokens": 189286051.0, "step": 220 }, { "entropy": 0.4325103759765625, "epoch": 1.753968253968254, "grad_norm": 0.6268697653988943, "learning_rate": 8.12962534927042e-06, "loss": 0.4231, "mean_token_accuracy": 0.851342577021569, "num_tokens": 190135846.0, "step": 221 }, { "entropy": 0.4230194091796875, "epoch": 1.7619047619047619, "grad_norm": 0.6927249184398813, "learning_rate": 8.043732817318736e-06, "loss": 0.4331, "mean_token_accuracy": 0.8501622658222914, "num_tokens": 191023956.0, "step": 222 }, { "entropy": 0.4322509765625, "epoch": 1.7698412698412698, "grad_norm": 0.6104186946004958, "learning_rate": 7.95799009379485e-06, "loss": 0.4182, "mean_token_accuracy": 0.8539444855414331, "num_tokens": 191865715.0, "step": 223 }, { "entropy": 0.4320526123046875, "epoch": 1.7777777777777777, "grad_norm": 0.5976651871901888, "learning_rate": 7.872403744766383e-06, "loss": 0.4263, "mean_token_accuracy": 0.8515215283259749, "num_tokens": 192687536.0, "step": 224 }, { "entropy": 0.430633544921875, "epoch": 1.7857142857142856, "grad_norm": 0.6460900080888754, "learning_rate": 7.786980324325994e-06, "loss": 0.4199, "mean_token_accuracy": 0.8545417245477438, "num_tokens": 193514317.0, "step": 225 }, { "entropy": 0.4226226806640625, "epoch": 1.7936507936507935, "grad_norm": 0.5833584404095209, "learning_rate": 7.70172637408949e-06, "loss": 0.4224, "mean_token_accuracy": 0.8528628125786781, "num_tokens": 194368425.0, "step": 226 }, { "entropy": 0.42388916015625, "epoch": 1.8015873015873016, "grad_norm": 0.6154377441414096, "learning_rate": 7.616648422694858e-06, "loss": 0.4241, "mean_token_accuracy": 0.851911770645529, "num_tokens": 195229420.0, "step": 227 }, { "entropy": 0.42431640625, "epoch": 1.8095238095238095, "grad_norm": 0.6645832127632725, "learning_rate": 7.531752985302323e-06, "loss": 0.429, "mean_token_accuracy": 0.8506735726259649, "num_tokens": 196086060.0, "step": 228 }, { "entropy": 0.4266510009765625, "epoch": 1.8174603174603174, "grad_norm": 0.5959462674015885, "learning_rate": 7.447046563095425e-06, "loss": 0.4251, "mean_token_accuracy": 0.8506376668810844, "num_tokens": 196949752.0, "step": 229 }, { "entropy": 0.425537109375, "epoch": 1.8253968253968254, "grad_norm": 0.6534426125937292, "learning_rate": 7.362535642783155e-06, "loss": 0.4218, "mean_token_accuracy": 0.8530319351702929, "num_tokens": 197787383.0, "step": 230 }, { "entropy": 0.4254608154296875, "epoch": 1.8333333333333335, "grad_norm": 0.5921640707070417, "learning_rate": 7.278226696103239e-06, "loss": 0.4306, "mean_token_accuracy": 0.8518684362061322, "num_tokens": 198640204.0, "step": 231 }, { "entropy": 0.418487548828125, "epoch": 1.8412698412698414, "grad_norm": 0.6502344798493351, "learning_rate": 7.194126179326497e-06, "loss": 0.4293, "mean_token_accuracy": 0.850899113342166, "num_tokens": 199534945.0, "step": 232 }, { "entropy": 0.42578125, "epoch": 1.8492063492063493, "grad_norm": 0.6743484773379774, "learning_rate": 7.110240532762469e-06, "loss": 0.421, "mean_token_accuracy": 0.8534788498654962, "num_tokens": 200401566.0, "step": 233 }, { "entropy": 0.4231109619140625, "epoch": 1.8571428571428572, "grad_norm": 0.5734181994402893, "learning_rate": 7.026576180266213e-06, "loss": 0.4247, "mean_token_accuracy": 0.8510698927566409, "num_tokens": 201286569.0, "step": 234 }, { "entropy": 0.423370361328125, "epoch": 1.8650793650793651, "grad_norm": 0.6211303859995316, "learning_rate": 6.9431395287463655e-06, "loss": 0.4216, "mean_token_accuracy": 0.8529601790942252, "num_tokens": 202148785.0, "step": 235 }, { "entropy": 0.4267425537109375, "epoch": 1.873015873015873, "grad_norm": 0.6512618843621472, "learning_rate": 6.859936967674509e-06, "loss": 0.4179, "mean_token_accuracy": 0.8532352782785892, "num_tokens": 202969412.0, "step": 236 }, { "entropy": 0.4201812744140625, "epoch": 1.880952380952381, "grad_norm": 0.5685040340161264, "learning_rate": 6.776974868595898e-06, "loss": 0.4228, "mean_token_accuracy": 0.852645758073777, "num_tokens": 203845826.0, "step": 237 }, { "entropy": 0.42022705078125, "epoch": 1.8888888888888888, "grad_norm": 0.6361988218426201, "learning_rate": 6.694259584641496e-06, "loss": 0.4194, "mean_token_accuracy": 0.853855645749718, "num_tokens": 204713067.0, "step": 238 }, { "entropy": 0.426239013671875, "epoch": 1.8968253968253967, "grad_norm": 0.5758526586055672, "learning_rate": 6.611797450041495e-06, "loss": 0.4189, "mean_token_accuracy": 0.8542146142572165, "num_tokens": 205549482.0, "step": 239 }, { "entropy": 0.421630859375, "epoch": 1.9047619047619047, "grad_norm": 0.6073175159295213, "learning_rate": 6.5295947796402315e-06, "loss": 0.4202, "mean_token_accuracy": 0.85362005000934, "num_tokens": 206394578.0, "step": 240 }, { "entropy": 0.4246978759765625, "epoch": 1.9126984126984126, "grad_norm": 0.5769397482915233, "learning_rate": 6.447657868412603e-06, "loss": 0.426, "mean_token_accuracy": 0.8522290964610875, "num_tokens": 207233636.0, "step": 241 }, { "entropy": 0.4238433837890625, "epoch": 1.9206349206349205, "grad_norm": 0.6006836089858458, "learning_rate": 6.365992990982015e-06, "loss": 0.4268, "mean_token_accuracy": 0.8512483732774854, "num_tokens": 208074376.0, "step": 242 }, { "entropy": 0.422637939453125, "epoch": 1.9285714285714286, "grad_norm": 0.5738787319949351, "learning_rate": 6.284606401139875e-06, "loss": 0.4262, "mean_token_accuracy": 0.8505582748912275, "num_tokens": 208947370.0, "step": 243 }, { "entropy": 0.4206695556640625, "epoch": 1.9365079365079365, "grad_norm": 0.5799710929690532, "learning_rate": 6.203504331366677e-06, "loss": 0.4155, "mean_token_accuracy": 0.8547424203716218, "num_tokens": 209798547.0, "step": 244 }, { "entropy": 0.425048828125, "epoch": 1.9444444444444444, "grad_norm": 0.6423312059049974, "learning_rate": 6.122692992354748e-06, "loss": 0.4229, "mean_token_accuracy": 0.8520036181434989, "num_tokens": 210661829.0, "step": 245 }, { "entropy": 0.4195709228515625, "epoch": 1.9523809523809523, "grad_norm": 0.5554620758818444, "learning_rate": 6.0421785725326085e-06, "loss": 0.4216, "mean_token_accuracy": 0.8520813095383346, "num_tokens": 211549046.0, "step": 246 }, { "entropy": 0.4189910888671875, "epoch": 1.9603174603174605, "grad_norm": 0.6117133326079128, "learning_rate": 5.9619672375911065e-06, "loss": 0.4148, "mean_token_accuracy": 0.8543260907754302, "num_tokens": 212447521.0, "step": 247 }, { "entropy": 0.4197235107421875, "epoch": 1.9682539682539684, "grad_norm": 0.6336191027571944, "learning_rate": 5.882065130011226e-06, "loss": 0.4209, "mean_token_accuracy": 0.8533798800781369, "num_tokens": 213310334.0, "step": 248 }, { "entropy": 0.4182586669921875, "epoch": 1.9761904761904763, "grad_norm": 0.7899010517492578, "learning_rate": 5.80247836859372e-06, "loss": 0.4207, "mean_token_accuracy": 0.8531867042183876, "num_tokens": 214169043.0, "step": 249 }, { "entropy": 0.4210205078125, "epoch": 1.9841269841269842, "grad_norm": 0.5663331071559328, "learning_rate": 5.723213047990553e-06, "loss": 0.4212, "mean_token_accuracy": 0.8530774302780628, "num_tokens": 215005057.0, "step": 250 }, { "entropy": 0.4158935546875, "epoch": 1.992063492063492, "grad_norm": 0.6025248790984578, "learning_rate": 5.64427523823813e-06, "loss": 0.4104, "mean_token_accuracy": 0.8561887559480965, "num_tokens": 215869766.0, "step": 251 }, { "entropy": 0.4177703857421875, "epoch": 2.0, "grad_norm": 0.5973147348731676, "learning_rate": 5.5656709842925335e-06, "loss": 0.4144, "mean_token_accuracy": 0.8541272669099271, "num_tokens": 216731206.0, "step": 252 }, { "entropy": 0.4171600341796875, "epoch": 2.007936507936508, "grad_norm": 0.6276654160889864, "learning_rate": 5.4874063055665495e-06, "loss": 0.3902, "mean_token_accuracy": 0.8642581212334335, "num_tokens": 217589905.0, "step": 253 }, { "entropy": 0.4161834716796875, "epoch": 2.015873015873016, "grad_norm": 0.6655836865810364, "learning_rate": 5.40948719546873e-06, "loss": 0.3946, "mean_token_accuracy": 0.8615933828987181, "num_tokens": 218446876.0, "step": 254 }, { "entropy": 0.4152069091796875, "epoch": 2.0238095238095237, "grad_norm": 0.5865307595034494, "learning_rate": 5.331919620944438e-06, "loss": 0.3954, "mean_token_accuracy": 0.86083889240399, "num_tokens": 219322571.0, "step": 255 }, { "entropy": 0.4158782958984375, "epoch": 2.0317460317460316, "grad_norm": 0.529665010763266, "learning_rate": 5.2547095220188815e-06, "loss": 0.3933, "mean_token_accuracy": 0.8596651367843151, "num_tokens": 220180160.0, "step": 256 }, { "entropy": 0.4136505126953125, "epoch": 2.0396825396825395, "grad_norm": 0.5832622748528953, "learning_rate": 5.177862811342254e-06, "loss": 0.3836, "mean_token_accuracy": 0.8657393348403275, "num_tokens": 221016578.0, "step": 257 }, { "entropy": 0.4110565185546875, "epoch": 2.0476190476190474, "grad_norm": 0.6002714768590333, "learning_rate": 5.101385373736937e-06, "loss": 0.3919, "mean_token_accuracy": 0.8619846105575562, "num_tokens": 221871968.0, "step": 258 }, { "entropy": 0.4073638916015625, "epoch": 2.0555555555555554, "grad_norm": 0.5644672942522324, "learning_rate": 5.025283065746855e-06, "loss": 0.3825, "mean_token_accuracy": 0.8645827597938478, "num_tokens": 222738323.0, "step": 259 }, { "entropy": 0.4102935791015625, "epoch": 2.0634920634920633, "grad_norm": 0.5904010681889716, "learning_rate": 4.949561715189001e-06, "loss": 0.388, "mean_token_accuracy": 0.8645457159727812, "num_tokens": 223584134.0, "step": 260 }, { "entropy": 0.40545654296875, "epoch": 2.0714285714285716, "grad_norm": 0.5790799071819784, "learning_rate": 4.8742271207071226e-06, "loss": 0.3877, "mean_token_accuracy": 0.8634475646540523, "num_tokens": 224461654.0, "step": 261 }, { "entropy": 0.4102325439453125, "epoch": 2.0793650793650795, "grad_norm": 0.5721734360627909, "learning_rate": 4.799285051327686e-06, "loss": 0.3938, "mean_token_accuracy": 0.8604177525267005, "num_tokens": 225327562.0, "step": 262 }, { "entropy": 0.4100799560546875, "epoch": 2.0873015873015874, "grad_norm": 0.6109882214140274, "learning_rate": 4.724741246018103e-06, "loss": 0.385, "mean_token_accuracy": 0.8638843321241438, "num_tokens": 226189031.0, "step": 263 }, { "entropy": 0.4103240966796875, "epoch": 2.0952380952380953, "grad_norm": 0.5597646515339337, "learning_rate": 4.650601413247214e-06, "loss": 0.3998, "mean_token_accuracy": 0.8596187229268253, "num_tokens": 227062309.0, "step": 264 }, { "entropy": 0.4138641357421875, "epoch": 2.1031746031746033, "grad_norm": 0.5631751720121708, "learning_rate": 4.57687123054817e-06, "loss": 0.391, "mean_token_accuracy": 0.8622312569059432, "num_tokens": 227920598.0, "step": 265 }, { "entropy": 0.4128875732421875, "epoch": 2.111111111111111, "grad_norm": 0.5735787131262603, "learning_rate": 4.503556344083656e-06, "loss": 0.3869, "mean_token_accuracy": 0.8629599534906447, "num_tokens": 228773818.0, "step": 266 }, { "entropy": 0.412200927734375, "epoch": 2.119047619047619, "grad_norm": 0.5725239133889661, "learning_rate": 4.4306623682134875e-06, "loss": 0.3827, "mean_token_accuracy": 0.8646458461880684, "num_tokens": 229627711.0, "step": 267 }, { "entropy": 0.4097442626953125, "epoch": 2.126984126984127, "grad_norm": 0.5481999431523443, "learning_rate": 4.358194885064704e-06, "loss": 0.3949, "mean_token_accuracy": 0.8609438170678914, "num_tokens": 230489032.0, "step": 268 }, { "entropy": 0.409942626953125, "epoch": 2.134920634920635, "grad_norm": 0.5621397553390326, "learning_rate": 4.286159444104068e-06, "loss": 0.3943, "mean_token_accuracy": 0.8611305872909725, "num_tokens": 231339019.0, "step": 269 }, { "entropy": 0.41339111328125, "epoch": 2.142857142857143, "grad_norm": 0.7325265217894285, "learning_rate": 4.2145615617131095e-06, "loss": 0.3935, "mean_token_accuracy": 0.8603947004303336, "num_tokens": 232199672.0, "step": 270 }, { "entropy": 0.415496826171875, "epoch": 2.1507936507936507, "grad_norm": 0.6471864956846543, "learning_rate": 4.143406720765687e-06, "loss": 0.3915, "mean_token_accuracy": 0.8618022156879306, "num_tokens": 233076007.0, "step": 271 }, { "entropy": 0.4146575927734375, "epoch": 2.1587301587301586, "grad_norm": 0.5471917966436562, "learning_rate": 4.0727003702081146e-06, "loss": 0.3896, "mean_token_accuracy": 0.8616545354016125, "num_tokens": 233942156.0, "step": 272 }, { "entropy": 0.4138031005859375, "epoch": 2.1666666666666665, "grad_norm": 0.5471689661185632, "learning_rate": 4.002447924641882e-06, "loss": 0.3912, "mean_token_accuracy": 0.8624369469471276, "num_tokens": 234844668.0, "step": 273 }, { "entropy": 0.4194183349609375, "epoch": 2.1746031746031744, "grad_norm": 0.5341897422332818, "learning_rate": 3.9326547639090315e-06, "loss": 0.3976, "mean_token_accuracy": 0.8597730663605034, "num_tokens": 235697877.0, "step": 274 }, { "entropy": 0.4160614013671875, "epoch": 2.1825396825396823, "grad_norm": 0.5650190225140705, "learning_rate": 3.863326232680148e-06, "loss": 0.3867, "mean_token_accuracy": 0.8626719349995255, "num_tokens": 236586699.0, "step": 275 }, { "entropy": 0.4175567626953125, "epoch": 2.1904761904761907, "grad_norm": 0.5875625634296703, "learning_rate": 3.7944676400451017e-06, "loss": 0.3871, "mean_token_accuracy": 0.861822621896863, "num_tokens": 237426378.0, "step": 276 }, { "entropy": 0.41094970703125, "epoch": 2.1984126984126986, "grad_norm": 0.6096935487800307, "learning_rate": 3.7260842591064504e-06, "loss": 0.3871, "mean_token_accuracy": 0.8619845635257661, "num_tokens": 238297987.0, "step": 277 }, { "entropy": 0.4205474853515625, "epoch": 2.2063492063492065, "grad_norm": 0.5794805245252707, "learning_rate": 3.6581813265756595e-06, "loss": 0.3988, "mean_token_accuracy": 0.8605999210849404, "num_tokens": 239171101.0, "step": 278 }, { "entropy": 0.4158172607421875, "epoch": 2.2142857142857144, "grad_norm": 0.5763915780264333, "learning_rate": 3.590764042372079e-06, "loss": 0.3844, "mean_token_accuracy": 0.8631811602972448, "num_tokens": 240034337.0, "step": 279 }, { "entropy": 0.4194793701171875, "epoch": 2.2222222222222223, "grad_norm": 0.5303818182249157, "learning_rate": 3.523837569224725e-06, "loss": 0.3792, "mean_token_accuracy": 0.8658822155557573, "num_tokens": 240860927.0, "step": 280 }, { "entropy": 0.41436767578125, "epoch": 2.2301587301587302, "grad_norm": 0.5581623734673887, "learning_rate": 3.4574070322769347e-06, "loss": 0.3896, "mean_token_accuracy": 0.8626445569097996, "num_tokens": 241739076.0, "step": 281 }, { "entropy": 0.414520263671875, "epoch": 2.238095238095238, "grad_norm": 0.5494365768968688, "learning_rate": 3.391477518693894e-06, "loss": 0.3805, "mean_token_accuracy": 0.8653818825259805, "num_tokens": 242574011.0, "step": 282 }, { "entropy": 0.415740966796875, "epoch": 2.246031746031746, "grad_norm": 0.5877235744523559, "learning_rate": 3.3260540772730576e-06, "loss": 0.3902, "mean_token_accuracy": 0.8616071529686451, "num_tokens": 243458878.0, "step": 283 }, { "entropy": 0.412841796875, "epoch": 2.253968253968254, "grad_norm": 0.5569142251529777, "learning_rate": 3.261141718057523e-06, "loss": 0.3879, "mean_token_accuracy": 0.8641184438019991, "num_tokens": 244313964.0, "step": 284 }, { "entropy": 0.414337158203125, "epoch": 2.261904761904762, "grad_norm": 0.579133790812722, "learning_rate": 3.1967454119523745e-06, "loss": 0.3827, "mean_token_accuracy": 0.8644507811404765, "num_tokens": 245200322.0, "step": 285 }, { "entropy": 0.41656494140625, "epoch": 2.2698412698412698, "grad_norm": 0.5714755304570499, "learning_rate": 3.1328700903440045e-06, "loss": 0.3867, "mean_token_accuracy": 0.8641348239034414, "num_tokens": 246083539.0, "step": 286 }, { "entropy": 0.41748046875, "epoch": 2.2777777777777777, "grad_norm": 0.5402817316233841, "learning_rate": 3.0695206447224923e-06, "loss": 0.3882, "mean_token_accuracy": 0.8631519465707242, "num_tokens": 246933619.0, "step": 287 }, { "entropy": 0.4218597412109375, "epoch": 2.2857142857142856, "grad_norm": 0.5351912335816236, "learning_rate": 3.0067019263069973e-06, "loss": 0.3797, "mean_token_accuracy": 0.8656269912607968, "num_tokens": 247765672.0, "step": 288 }, { "entropy": 0.4116973876953125, "epoch": 2.2936507936507935, "grad_norm": 0.5559360442278566, "learning_rate": 2.9444187456742855e-06, "loss": 0.3812, "mean_token_accuracy": 0.8642131965607405, "num_tokens": 248628378.0, "step": 289 }, { "entropy": 0.4143218994140625, "epoch": 2.3015873015873014, "grad_norm": 0.5890265994765199, "learning_rate": 2.8826758723903192e-06, "loss": 0.3895, "mean_token_accuracy": 0.8638571444898844, "num_tokens": 249501143.0, "step": 290 }, { "entropy": 0.4134368896484375, "epoch": 2.3095238095238093, "grad_norm": 0.5998092954683998, "learning_rate": 2.821478034645009e-06, "loss": 0.3842, "mean_token_accuracy": 0.8642507120966911, "num_tokens": 250356099.0, "step": 291 }, { "entropy": 0.4135284423828125, "epoch": 2.317460317460317, "grad_norm": 0.5510958860977246, "learning_rate": 2.7608299188901632e-06, "loss": 0.3861, "mean_token_accuracy": 0.8630354404449463, "num_tokens": 251219125.0, "step": 292 }, { "entropy": 0.410308837890625, "epoch": 2.3253968253968256, "grad_norm": 0.5629966415686625, "learning_rate": 2.7007361694805735e-06, "loss": 0.3852, "mean_token_accuracy": 0.8642032388597727, "num_tokens": 252080434.0, "step": 293 }, { "entropy": 0.4159088134765625, "epoch": 2.3333333333333335, "grad_norm": 0.5390902225233088, "learning_rate": 2.64120138831837e-06, "loss": 0.3813, "mean_token_accuracy": 0.8645626241341233, "num_tokens": 252923218.0, "step": 294 }, { "entropy": 0.4195098876953125, "epoch": 2.3412698412698414, "grad_norm": 0.539915212152828, "learning_rate": 2.5822301345006196e-06, "loss": 0.3822, "mean_token_accuracy": 0.8648238624446094, "num_tokens": 253761289.0, "step": 295 }, { "entropy": 0.412567138671875, "epoch": 2.3492063492063493, "grad_norm": 0.5605080176549344, "learning_rate": 2.5238269239701816e-06, "loss": 0.3883, "mean_token_accuracy": 0.862140198238194, "num_tokens": 254643716.0, "step": 296 }, { "entropy": 0.416534423828125, "epoch": 2.357142857142857, "grad_norm": 0.5577110501244517, "learning_rate": 2.4659962291698936e-06, "loss": 0.3878, "mean_token_accuracy": 0.862798870075494, "num_tokens": 255486573.0, "step": 297 }, { "entropy": 0.413330078125, "epoch": 2.365079365079365, "grad_norm": 0.564255039988231, "learning_rate": 2.408742478700071e-06, "loss": 0.3759, "mean_token_accuracy": 0.8661748920567334, "num_tokens": 256345326.0, "step": 298 }, { "entropy": 0.411529541015625, "epoch": 2.373015873015873, "grad_norm": 0.5347008401050389, "learning_rate": 2.352070056979375e-06, "loss": 0.3762, "mean_token_accuracy": 0.867778348736465, "num_tokens": 257185152.0, "step": 299 }, { "entropy": 0.41009521484375, "epoch": 2.380952380952381, "grad_norm": 0.543409354225312, "learning_rate": 2.295983303909065e-06, "loss": 0.3821, "mean_token_accuracy": 0.8653234201483428, "num_tokens": 258067282.0, "step": 300 }, { "entropy": 0.4130859375, "epoch": 2.388888888888889, "grad_norm": 0.5346168976688244, "learning_rate": 2.2404865145406353e-06, "loss": 0.3852, "mean_token_accuracy": 0.8636267627589405, "num_tokens": 258944016.0, "step": 301 }, { "entropy": 0.4159393310546875, "epoch": 2.3968253968253967, "grad_norm": 0.5607670324824822, "learning_rate": 2.1855839387469237e-06, "loss": 0.3804, "mean_token_accuracy": 0.8650295240804553, "num_tokens": 259808268.0, "step": 302 }, { "entropy": 0.4105377197265625, "epoch": 2.4047619047619047, "grad_norm": 0.5763140805976009, "learning_rate": 2.1312797808966625e-06, "loss": 0.3903, "mean_token_accuracy": 0.863428748678416, "num_tokens": 260684292.0, "step": 303 }, { "entropy": 0.414642333984375, "epoch": 2.4126984126984126, "grad_norm": 0.6375048447554551, "learning_rate": 2.0775781995324886e-06, "loss": 0.3858, "mean_token_accuracy": 0.8646045490168035, "num_tokens": 261555918.0, "step": 304 }, { "entropy": 0.413177490234375, "epoch": 2.4206349206349205, "grad_norm": 0.5711513351236631, "learning_rate": 2.024483307052526e-06, "loss": 0.3816, "mean_token_accuracy": 0.8652195022441447, "num_tokens": 262425946.0, "step": 305 }, { "entropy": 0.4171905517578125, "epoch": 2.4285714285714284, "grad_norm": 0.5460960698149459, "learning_rate": 1.971999169395432e-06, "loss": 0.3764, "mean_token_accuracy": 0.8664770247414708, "num_tokens": 263268966.0, "step": 306 }, { "entropy": 0.41497802734375, "epoch": 2.4365079365079367, "grad_norm": 0.5256766338517593, "learning_rate": 1.920129805729043e-06, "loss": 0.3806, "mean_token_accuracy": 0.8658863957971334, "num_tokens": 264137961.0, "step": 307 }, { "entropy": 0.4152679443359375, "epoch": 2.4444444444444446, "grad_norm": 0.5266763600052715, "learning_rate": 1.8688791881426017e-06, "loss": 0.3805, "mean_token_accuracy": 0.8645090684294701, "num_tokens": 264995910.0, "step": 308 }, { "entropy": 0.41461181640625, "epoch": 2.4523809523809526, "grad_norm": 0.5434774314682953, "learning_rate": 1.8182512413425624e-06, "loss": 0.3799, "mean_token_accuracy": 0.865009430795908, "num_tokens": 265867518.0, "step": 309 }, { "entropy": 0.41693115234375, "epoch": 2.4603174603174605, "grad_norm": 0.5493374323066029, "learning_rate": 1.7682498423520545e-06, "loss": 0.3848, "mean_token_accuracy": 0.8647962850518525, "num_tokens": 266730039.0, "step": 310 }, { "entropy": 0.41461181640625, "epoch": 2.4682539682539684, "grad_norm": 0.5317398307781561, "learning_rate": 1.7188788202139794e-06, "loss": 0.3875, "mean_token_accuracy": 0.8631189134903252, "num_tokens": 267605673.0, "step": 311 }, { "entropy": 0.415802001953125, "epoch": 2.4761904761904763, "grad_norm": 0.5266121906145388, "learning_rate": 1.6701419556977882e-06, "loss": 0.3886, "mean_token_accuracy": 0.864013391546905, "num_tokens": 268470812.0, "step": 312 }, { "entropy": 0.4138946533203125, "epoch": 2.484126984126984, "grad_norm": 0.5249027475139008, "learning_rate": 1.6220429810099603e-06, "loss": 0.3792, "mean_token_accuracy": 0.8644625195302069, "num_tokens": 269329768.0, "step": 313 }, { "entropy": 0.41455078125, "epoch": 2.492063492063492, "grad_norm": 0.5316446888258474, "learning_rate": 1.5745855795081889e-06, "loss": 0.386, "mean_token_accuracy": 0.8626924455165863, "num_tokens": 270192672.0, "step": 314 }, { "entropy": 0.416595458984375, "epoch": 2.5, "grad_norm": 0.507973852043752, "learning_rate": 1.527773385419311e-06, "loss": 0.3878, "mean_token_accuracy": 0.8625178756192327, "num_tokens": 271053623.0, "step": 315 }, { "entropy": 0.4129638671875, "epoch": 2.507936507936508, "grad_norm": 0.5077791813190773, "learning_rate": 1.4816099835610209e-06, "loss": 0.3834, "mean_token_accuracy": 0.8646475677378476, "num_tokens": 271921985.0, "step": 316 }, { "entropy": 0.423919677734375, "epoch": 2.515873015873016, "grad_norm": 0.5348663606444165, "learning_rate": 1.4360989090673284e-06, "loss": 0.3838, "mean_token_accuracy": 0.8645837544463575, "num_tokens": 272757706.0, "step": 317 }, { "entropy": 0.419281005859375, "epoch": 2.5238095238095237, "grad_norm": 0.5272426034742803, "learning_rate": 1.3912436471178525e-06, "loss": 0.3786, "mean_token_accuracy": 0.8656437643803656, "num_tokens": 273603268.0, "step": 318 }, { "entropy": 0.4167327880859375, "epoch": 2.5317460317460316, "grad_norm": 0.5340390184826158, "learning_rate": 1.3470476326709337e-06, "loss": 0.3788, "mean_token_accuracy": 0.8665351970121264, "num_tokens": 274458735.0, "step": 319 }, { "entropy": 0.41522216796875, "epoch": 2.5396825396825395, "grad_norm": 0.5104651040256136, "learning_rate": 1.3035142502005792e-06, "loss": 0.3821, "mean_token_accuracy": 0.8639437765814364, "num_tokens": 275320028.0, "step": 320 }, { "entropy": 0.4129486083984375, "epoch": 2.5476190476190474, "grad_norm": 0.5922952980019984, "learning_rate": 1.2606468334373e-06, "loss": 0.3774, "mean_token_accuracy": 0.8665279163978994, "num_tokens": 276151262.0, "step": 321 }, { "entropy": 0.4136962890625, "epoch": 2.5555555555555554, "grad_norm": 0.5357315155466328, "learning_rate": 1.2184486651128014e-06, "loss": 0.3817, "mean_token_accuracy": 0.8641278254799545, "num_tokens": 276997327.0, "step": 322 }, { "entropy": 0.4170989990234375, "epoch": 2.5634920634920633, "grad_norm": 0.5250006813841819, "learning_rate": 1.1769229767086053e-06, "loss": 0.3856, "mean_token_accuracy": 0.8660658891312778, "num_tokens": 277849848.0, "step": 323 }, { "entropy": 0.41107177734375, "epoch": 2.571428571428571, "grad_norm": 0.502170658695331, "learning_rate": 1.1360729482085852e-06, "loss": 0.3756, "mean_token_accuracy": 0.8676945436745882, "num_tokens": 278726761.0, "step": 324 }, { "entropy": 0.4149017333984375, "epoch": 2.5793650793650795, "grad_norm": 0.5457600582389259, "learning_rate": 1.0959017078554458e-06, "loss": 0.3762, "mean_token_accuracy": 0.8675552252680063, "num_tokens": 279572082.0, "step": 325 }, { "entropy": 0.4125823974609375, "epoch": 2.5873015873015874, "grad_norm": 0.5267025240131151, "learning_rate": 1.0564123319111708e-06, "loss": 0.3798, "mean_token_accuracy": 0.8648343035019934, "num_tokens": 280414468.0, "step": 326 }, { "entropy": 0.4168548583984375, "epoch": 2.5952380952380953, "grad_norm": 0.5099791881851006, "learning_rate": 1.017607844421441e-06, "loss": 0.3848, "mean_token_accuracy": 0.864824591204524, "num_tokens": 281237288.0, "step": 327 }, { "entropy": 0.41204833984375, "epoch": 2.6031746031746033, "grad_norm": 0.5664377723043557, "learning_rate": 9.794912169840564e-07, "loss": 0.372, "mean_token_accuracy": 0.8691010950133204, "num_tokens": 282118057.0, "step": 328 }, { "entropy": 0.4074859619140625, "epoch": 2.611111111111111, "grad_norm": 0.5020933620876052, "learning_rate": 9.420653685213854e-07, "loss": 0.3876, "mean_token_accuracy": 0.8634662297554314, "num_tokens": 283028330.0, "step": 329 }, { "entropy": 0.41888427734375, "epoch": 2.619047619047619, "grad_norm": 0.49659765769759895, "learning_rate": 9.053331650568264e-07, "loss": 0.3811, "mean_token_accuracy": 0.8650768841616809, "num_tokens": 283866607.0, "step": 330 }, { "entropy": 0.4181976318359375, "epoch": 2.626984126984127, "grad_norm": 0.5446277616026562, "learning_rate": 8.692974194953263e-07, "loss": 0.3839, "mean_token_accuracy": 0.8648977861739695, "num_tokens": 284705319.0, "step": 331 }, { "entropy": 0.4133758544921875, "epoch": 2.634920634920635, "grad_norm": 0.4865910494795602, "learning_rate": 8.339608914079944e-07, "loss": 0.3958, "mean_token_accuracy": 0.8617808111011982, "num_tokens": 285598883.0, "step": 332 }, { "entropy": 0.415557861328125, "epoch": 2.642857142857143, "grad_norm": 0.5244468960355501, "learning_rate": 7.993262868207552e-07, "loss": 0.3853, "mean_token_accuracy": 0.8644285243935883, "num_tokens": 286461446.0, "step": 333 }, { "entropy": 0.4182281494140625, "epoch": 2.6507936507936507, "grad_norm": 0.7087612664979458, "learning_rate": 7.653962580071384e-07, "loss": 0.3808, "mean_token_accuracy": 0.8640225417912006, "num_tokens": 287308399.0, "step": 334 }, { "entropy": 0.40972900390625, "epoch": 2.6587301587301586, "grad_norm": 0.5050650123459585, "learning_rate": 7.321734032851613e-07, "loss": 0.3838, "mean_token_accuracy": 0.8654659832827747, "num_tokens": 288187832.0, "step": 335 }, { "entropy": 0.4111785888671875, "epoch": 2.6666666666666665, "grad_norm": 0.5138723208589175, "learning_rate": 6.996602668183605e-07, "loss": 0.3807, "mean_token_accuracy": 0.8655836824327707, "num_tokens": 289047051.0, "step": 336 }, { "entropy": 0.4153594970703125, "epoch": 2.674603174603175, "grad_norm": 0.5408275070681229, "learning_rate": 6.678593384209597e-07, "loss": 0.3884, "mean_token_accuracy": 0.8643983146175742, "num_tokens": 289920851.0, "step": 337 }, { "entropy": 0.4125823974609375, "epoch": 2.682539682539683, "grad_norm": 0.4759412131823873, "learning_rate": 6.367730533672035e-07, "loss": 0.3732, "mean_token_accuracy": 0.8673816910013556, "num_tokens": 290766931.0, "step": 338 }, { "entropy": 0.42041015625, "epoch": 2.6904761904761907, "grad_norm": 0.5004081975261553, "learning_rate": 6.064037922048661e-07, "loss": 0.3817, "mean_token_accuracy": 0.8643794315867126, "num_tokens": 291599836.0, "step": 339 }, { "entropy": 0.4112701416015625, "epoch": 2.6984126984126986, "grad_norm": 0.48982483200118515, "learning_rate": 5.767538805729578e-07, "loss": 0.3856, "mean_token_accuracy": 0.8643845007754862, "num_tokens": 292454972.0, "step": 340 }, { "entropy": 0.418304443359375, "epoch": 2.7063492063492065, "grad_norm": 0.5174894409413827, "learning_rate": 5.478255890236184e-07, "loss": 0.3799, "mean_token_accuracy": 0.8644901039078832, "num_tokens": 293307675.0, "step": 341 }, { "entropy": 0.415863037109375, "epoch": 2.7142857142857144, "grad_norm": 0.49053909813768987, "learning_rate": 5.196211328482559e-07, "loss": 0.3817, "mean_token_accuracy": 0.8651680708862841, "num_tokens": 294149752.0, "step": 342 }, { "entropy": 0.419036865234375, "epoch": 2.7222222222222223, "grad_norm": 0.5048327044966334, "learning_rate": 4.921426719078948e-07, "loss": 0.3687, "mean_token_accuracy": 0.8694347636774182, "num_tokens": 294975614.0, "step": 343 }, { "entropy": 0.4116058349609375, "epoch": 2.7301587301587302, "grad_norm": 0.4966619149777404, "learning_rate": 4.653923104677671e-07, "loss": 0.3801, "mean_token_accuracy": 0.8655543397180736, "num_tokens": 295846874.0, "step": 344 }, { "entropy": 0.4119720458984375, "epoch": 2.738095238095238, "grad_norm": 0.5136264989197665, "learning_rate": 4.3937209703619476e-07, "loss": 0.3806, "mean_token_accuracy": 0.8657941850833595, "num_tokens": 296730922.0, "step": 345 }, { "entropy": 0.4112091064453125, "epoch": 2.746031746031746, "grad_norm": 0.48053531479565836, "learning_rate": 4.140840242076927e-07, "loss": 0.3741, "mean_token_accuracy": 0.8681624769233167, "num_tokens": 297610941.0, "step": 346 }, { "entropy": 0.415740966796875, "epoch": 2.753968253968254, "grad_norm": 0.5414004762026651, "learning_rate": 3.895300285103931e-07, "loss": 0.3755, "mean_token_accuracy": 0.864651458337903, "num_tokens": 298464464.0, "step": 347 }, { "entropy": 0.4115142822265625, "epoch": 2.761904761904762, "grad_norm": 0.5039880854406058, "learning_rate": 3.657119902577466e-07, "loss": 0.3722, "mean_token_accuracy": 0.8687906567938626, "num_tokens": 299315956.0, "step": 348 }, { "entropy": 0.416900634765625, "epoch": 2.7698412698412698, "grad_norm": 0.5361811952601372, "learning_rate": 3.426317334045226e-07, "loss": 0.3774, "mean_token_accuracy": 0.8656895193271339, "num_tokens": 300162540.0, "step": 349 }, { "entropy": 0.4165191650390625, "epoch": 2.7777777777777777, "grad_norm": 0.5147685427448626, "learning_rate": 3.202910254071434e-07, "loss": 0.3751, "mean_token_accuracy": 0.8678149809129536, "num_tokens": 301009855.0, "step": 350 }, { "entropy": 0.4177398681640625, "epoch": 2.7857142857142856, "grad_norm": 0.5293771485579281, "learning_rate": 2.9869157708832805e-07, "loss": 0.3697, "mean_token_accuracy": 0.8683305000886321, "num_tokens": 301852351.0, "step": 351 }, { "entropy": 0.412750244140625, "epoch": 2.7936507936507935, "grad_norm": 0.5809290124899761, "learning_rate": 2.778350425060794e-07, "loss": 0.3726, "mean_token_accuracy": 0.8671337850391865, "num_tokens": 302702189.0, "step": 352 }, { "entropy": 0.417388916015625, "epoch": 2.8015873015873014, "grad_norm": 0.4880249907803546, "learning_rate": 2.5772301882702634e-07, "loss": 0.3734, "mean_token_accuracy": 0.8674810263328254, "num_tokens": 303546117.0, "step": 353 }, { "entropy": 0.4113616943359375, "epoch": 2.8095238095238093, "grad_norm": 0.48636087376036335, "learning_rate": 2.3835704620410294e-07, "loss": 0.3778, "mean_token_accuracy": 0.8667405629530549, "num_tokens": 304424136.0, "step": 354 }, { "entropy": 0.4126739501953125, "epoch": 2.817460317460317, "grad_norm": 0.5409934504868933, "learning_rate": 2.1973860765861831e-07, "loss": 0.3845, "mean_token_accuracy": 0.8655604547820985, "num_tokens": 305299176.0, "step": 355 }, { "entropy": 0.4120635986328125, "epoch": 2.825396825396825, "grad_norm": 0.5115713722265198, "learning_rate": 2.0186912896667744e-07, "loss": 0.3773, "mean_token_accuracy": 0.8656438020989299, "num_tokens": 306180918.0, "step": 356 }, { "entropy": 0.4132080078125, "epoch": 2.8333333333333335, "grad_norm": 0.4843831982656416, "learning_rate": 1.8474997855000177e-07, "loss": 0.3843, "mean_token_accuracy": 0.8643954736180604, "num_tokens": 307053855.0, "step": 357 }, { "entropy": 0.416534423828125, "epoch": 2.8412698412698414, "grad_norm": 0.5147871761826365, "learning_rate": 1.6838246737113983e-07, "loss": 0.3835, "mean_token_accuracy": 0.8651239294558764, "num_tokens": 307908233.0, "step": 358 }, { "entropy": 0.4155731201171875, "epoch": 2.8492063492063493, "grad_norm": 0.530597613594654, "learning_rate": 1.5276784883307084e-07, "loss": 0.3697, "mean_token_accuracy": 0.869139929767698, "num_tokens": 308765267.0, "step": 359 }, { "entropy": 0.4214630126953125, "epoch": 2.857142857142857, "grad_norm": 0.501462039664022, "learning_rate": 1.3790731868322472e-07, "loss": 0.3763, "mean_token_accuracy": 0.8655449384823442, "num_tokens": 309586897.0, "step": 360 }, { "entropy": 0.41253662109375, "epoch": 2.865079365079365, "grad_norm": 0.4685559999749004, "learning_rate": 1.238020149219099e-07, "loss": 0.3751, "mean_token_accuracy": 0.8660591626539826, "num_tokens": 310440896.0, "step": 361 }, { "entropy": 0.4139404296875, "epoch": 2.873015873015873, "grad_norm": 0.4845099487055374, "learning_rate": 1.1045301771516748e-07, "loss": 0.3744, "mean_token_accuracy": 0.8673448082990944, "num_tokens": 311297562.0, "step": 362 }, { "entropy": 0.4100494384765625, "epoch": 2.880952380952381, "grad_norm": 0.48128143239397103, "learning_rate": 9.786134931205726e-08, "loss": 0.3875, "mean_token_accuracy": 0.8648535516113043, "num_tokens": 312189513.0, "step": 363 }, { "entropy": 0.4131317138671875, "epoch": 2.888888888888889, "grad_norm": 0.4783546055097021, "learning_rate": 8.602797396636941e-08, "loss": 0.3773, "mean_token_accuracy": 0.865596916526556, "num_tokens": 313055977.0, "step": 364 }, { "entropy": 0.41400146484375, "epoch": 2.8968253968253967, "grad_norm": 0.5064252562933648, "learning_rate": 7.495379786278456e-08, "loss": 0.3833, "mean_token_accuracy": 0.8648101268336177, "num_tokens": 313934488.0, "step": 365 }, { "entropy": 0.4145965576171875, "epoch": 2.9047619047619047, "grad_norm": 0.5132880237470535, "learning_rate": 6.463966904748487e-08, "loss": 0.3809, "mean_token_accuracy": 0.8653016709722579, "num_tokens": 314782888.0, "step": 366 }, { "entropy": 0.41058349609375, "epoch": 2.9126984126984126, "grad_norm": 0.4885061208644805, "learning_rate": 5.508637736320488e-08, "loss": 0.3704, "mean_token_accuracy": 0.8688876410014927, "num_tokens": 315667111.0, "step": 367 }, { "entropy": 0.4156036376953125, "epoch": 2.9206349206349205, "grad_norm": 0.4926976995984634, "learning_rate": 4.62946543887488e-08, "loss": 0.38, "mean_token_accuracy": 0.864195094909519, "num_tokens": 316519645.0, "step": 368 }, { "entropy": 0.4157562255859375, "epoch": 2.928571428571429, "grad_norm": 0.5012094010197111, "learning_rate": 3.826517338296865e-08, "loss": 0.3778, "mean_token_accuracy": 0.8676678575575352, "num_tokens": 317376914.0, "step": 369 }, { "entropy": 0.41656494140625, "epoch": 2.9365079365079367, "grad_norm": 0.48025666248124643, "learning_rate": 3.0998549233205446e-08, "loss": 0.3819, "mean_token_accuracy": 0.8646529386751354, "num_tokens": 318210944.0, "step": 370 }, { "entropy": 0.41094970703125, "epoch": 2.9444444444444446, "grad_norm": 0.4765329017560117, "learning_rate": 2.4495338408201397e-08, "loss": 0.3803, "mean_token_accuracy": 0.8647728296928108, "num_tokens": 319072977.0, "step": 371 }, { "entropy": 0.41326904296875, "epoch": 2.9523809523809526, "grad_norm": 0.4877083974615076, "learning_rate": 1.8756038915486165e-08, "loss": 0.3819, "mean_token_accuracy": 0.864876258186996, "num_tokens": 319936836.0, "step": 372 }, { "entropy": 0.4106903076171875, "epoch": 2.9603174603174605, "grad_norm": 0.7890691969320142, "learning_rate": 1.3781090263242924e-08, "loss": 0.3777, "mean_token_accuracy": 0.8663178561255336, "num_tokens": 320807541.0, "step": 373 }, { "entropy": 0.4145660400390625, "epoch": 2.9682539682539684, "grad_norm": 0.47432214070949463, "learning_rate": 9.570873426649752e-09, "loss": 0.3787, "mean_token_accuracy": 0.8662264375016093, "num_tokens": 321676330.0, "step": 374 }, { "entropy": 0.412261962890625, "epoch": 2.9761904761904763, "grad_norm": 0.4980955112880234, "learning_rate": 6.125710818701836e-09, "loss": 0.3743, "mean_token_accuracy": 0.8680550749413669, "num_tokens": 322526195.0, "step": 375 }, { "entropy": 0.41668701171875, "epoch": 2.984126984126984, "grad_norm": 0.5212561146099488, "learning_rate": 3.445866265526787e-09, "loss": 0.3829, "mean_token_accuracy": 0.8645712514407933, "num_tokens": 323373321.0, "step": 376 }, { "entropy": 0.412017822265625, "epoch": 2.992063492063492, "grad_norm": 0.46886859359122, "learning_rate": 1.531544986177469e-09, "loss": 0.3727, "mean_token_accuracy": 0.865846767090261, "num_tokens": 324241487.0, "step": 377 }, { "entropy": 0.4130401611328125, "epoch": 3.0, "grad_norm": 0.4965409602777678, "learning_rate": 3.8289357691900785e-10, "loss": 0.3831, "mean_token_accuracy": 0.8650536630302668, "num_tokens": 325114310.0, "step": 378 }, { "epoch": 3.0, "step": 378, "total_flos": 601237770600448.0, "train_loss": 0.4870561873786664, "train_runtime": 57802.2631, "train_samples_per_second": 1.274, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 378, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 32, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 601237770600448.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }