{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 534, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5542068481445312, "epoch": 0.011235955056179775, "grad_norm": 390.23344037065993, "learning_rate": 0.0, "loss": 8.277, "mean_token_accuracy": 0.0, "num_tokens": 822155.0, "step": 1 }, { "entropy": 0.5560684204101562, "epoch": 0.02247191011235955, "grad_norm": 387.2980617003247, "learning_rate": 1.8518518518518518e-07, "loss": 8.3201, "mean_token_accuracy": 0.0, "num_tokens": 1647671.0, "step": 2 }, { "entropy": 0.5492401123046875, "epoch": 0.033707865168539325, "grad_norm": 384.7370311195686, "learning_rate": 3.7037037037037036e-07, "loss": 8.3097, "mean_token_accuracy": 0.0, "num_tokens": 2496794.0, "step": 3 }, { "entropy": 0.5499496459960938, "epoch": 0.0449438202247191, "grad_norm": 386.7170475562737, "learning_rate": 5.555555555555555e-07, "loss": 8.2263, "mean_token_accuracy": 0.0, "num_tokens": 3335427.0, "step": 4 }, { "entropy": 0.5614547729492188, "epoch": 0.056179775280898875, "grad_norm": 391.10655108122654, "learning_rate": 7.407407407407407e-07, "loss": 8.1153, "mean_token_accuracy": 0.0, "num_tokens": 4147344.0, "step": 5 }, { "entropy": 0.5488052368164062, "epoch": 0.06741573033707865, "grad_norm": 394.56555016865775, "learning_rate": 9.259259259259259e-07, "loss": 7.9897, "mean_token_accuracy": 0.0, "num_tokens": 4987664.0, "step": 6 }, { "entropy": 0.5349197387695312, "epoch": 0.07865168539325842, "grad_norm": 398.0852378093763, "learning_rate": 1.111111111111111e-06, "loss": 7.4606, "mean_token_accuracy": 0.0, "num_tokens": 5868951.0, "step": 7 }, { "entropy": 0.5573577880859375, "epoch": 0.0898876404494382, "grad_norm": 269.2196405074287, "learning_rate": 1.2962962962962962e-06, "loss": 5.8655, "mean_token_accuracy": 0.0, "num_tokens": 6673251.0, "step": 8 }, { "entropy": 0.5700607299804688, "epoch": 0.10112359550561797, "grad_norm": 227.1382148537302, "learning_rate": 1.4814814814814815e-06, "loss": 5.5805, "mean_token_accuracy": 0.006510416860692203, "num_tokens": 7475702.0, "step": 9 }, { "entropy": 0.5521163940429688, "epoch": 0.11235955056179775, "grad_norm": 191.45954657460527, "learning_rate": 1.6666666666666667e-06, "loss": 5.2965, "mean_token_accuracy": 0.016927083837799728, "num_tokens": 8289995.0, "step": 10 }, { "entropy": 0.5551223754882812, "epoch": 0.12359550561797752, "grad_norm": 102.83125134655117, "learning_rate": 1.8518518518518519e-06, "loss": 4.1159, "mean_token_accuracy": 0.5000000149011612, "num_tokens": 9130985.0, "step": 11 }, { "entropy": 0.5453948974609375, "epoch": 0.1348314606741573, "grad_norm": 96.0432311413094, "learning_rate": 2.037037037037037e-06, "loss": 4.0236, "mean_token_accuracy": 0.5247395989717916, "num_tokens": 9991024.0, "step": 12 }, { "entropy": 0.5710296630859375, "epoch": 0.14606741573033707, "grad_norm": 82.56292307777714, "learning_rate": 2.222222222222222e-06, "loss": 3.8238, "mean_token_accuracy": 0.5182291821110994, "num_tokens": 10795531.0, "step": 13 }, { "entropy": 0.5636825561523438, "epoch": 0.15730337078651685, "grad_norm": 74.53161781330034, "learning_rate": 2.4074074074074075e-06, "loss": 3.7069, "mean_token_accuracy": 0.5026041816454381, "num_tokens": 11605868.0, "step": 14 }, { "entropy": 0.5413894653320312, "epoch": 0.16853932584269662, "grad_norm": 59.737070732481115, "learning_rate": 2.5925925925925925e-06, "loss": 3.2683, "mean_token_accuracy": 0.5013020982732996, "num_tokens": 12460543.0, "step": 15 }, { "entropy": 0.5427627563476562, "epoch": 0.1797752808988764, "grad_norm": 58.49611194582999, "learning_rate": 2.7777777777777783e-06, "loss": 3.2039, "mean_token_accuracy": 0.5208333488553762, "num_tokens": 13294073.0, "step": 16 }, { "entropy": 0.5473175048828125, "epoch": 0.19101123595505617, "grad_norm": 57.54039518404522, "learning_rate": 2.962962962962963e-06, "loss": 3.1559, "mean_token_accuracy": 0.5104166818782687, "num_tokens": 14111722.0, "step": 17 }, { "entropy": 0.5622482299804688, "epoch": 0.20224719101123595, "grad_norm": 57.432008827772684, "learning_rate": 3.1481481481481483e-06, "loss": 3.093, "mean_token_accuracy": 0.5390625160653144, "num_tokens": 14914584.0, "step": 18 }, { "entropy": 0.54608154296875, "epoch": 0.21348314606741572, "grad_norm": 57.483023727287936, "learning_rate": 3.3333333333333333e-06, "loss": 3.0533, "mean_token_accuracy": 0.5364583493210375, "num_tokens": 15747323.0, "step": 19 }, { "entropy": 0.5431365966796875, "epoch": 0.2247191011235955, "grad_norm": 58.03235110267345, "learning_rate": 3.5185185185185187e-06, "loss": 2.9627, "mean_token_accuracy": 0.5403645994374529, "num_tokens": 16578442.0, "step": 20 }, { "entropy": 0.5537033081054688, "epoch": 0.23595505617977527, "grad_norm": 57.031635513840406, "learning_rate": 3.7037037037037037e-06, "loss": 2.9173, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 17365881.0, "step": 21 }, { "entropy": 0.54766845703125, "epoch": 0.24719101123595505, "grad_norm": 61.80595725006236, "learning_rate": 3.88888888888889e-06, "loss": 2.9261, "mean_token_accuracy": 0.5247395989717916, "num_tokens": 18163036.0, "step": 22 }, { "entropy": 0.5399551391601562, "epoch": 0.25842696629213485, "grad_norm": 59.257010291932765, "learning_rate": 4.074074074074074e-06, "loss": 2.889, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 18990094.0, "step": 23 }, { "entropy": 0.54046630859375, "epoch": 0.2696629213483146, "grad_norm": 58.14418169147904, "learning_rate": 4.2592592592592596e-06, "loss": 2.8644, "mean_token_accuracy": 0.5559895999031141, "num_tokens": 19825429.0, "step": 24 }, { "entropy": 0.545257568359375, "epoch": 0.2808988764044944, "grad_norm": 57.31107773937866, "learning_rate": 4.444444444444444e-06, "loss": 2.8263, "mean_token_accuracy": 0.5768229338573292, "num_tokens": 20620523.0, "step": 25 }, { "entropy": 0.5317459106445312, "epoch": 0.29213483146067415, "grad_norm": 57.695511366393276, "learning_rate": 4.62962962962963e-06, "loss": 2.8152, "mean_token_accuracy": 0.5481770996702835, "num_tokens": 21466727.0, "step": 26 }, { "entropy": 0.5383377075195312, "epoch": 0.30337078651685395, "grad_norm": 57.43374006560947, "learning_rate": 4.814814814814815e-06, "loss": 2.7654, "mean_token_accuracy": 0.5664062668802217, "num_tokens": 22276427.0, "step": 27 }, { "entropy": 0.5467529296875, "epoch": 0.3146067415730337, "grad_norm": 57.29313317484686, "learning_rate": 5e-06, "loss": 2.7203, "mean_token_accuracy": 0.5716146003687754, "num_tokens": 23060552.0, "step": 28 }, { "entropy": 0.5378189086914062, "epoch": 0.3258426966292135, "grad_norm": 58.51832801138517, "learning_rate": 4.999952005391863e-06, "loss": 2.7212, "mean_token_accuracy": 0.5455729329260066, "num_tokens": 23845994.0, "step": 29 }, { "entropy": 0.5296554565429688, "epoch": 0.33707865168539325, "grad_norm": 57.46515253087696, "learning_rate": 4.999808023410233e-06, "loss": 2.6669, "mean_token_accuracy": 0.5651041835080832, "num_tokens": 24664278.0, "step": 30 }, { "entropy": 0.5262527465820312, "epoch": 0.34831460674157305, "grad_norm": 58.42941782798559, "learning_rate": 4.999568059583401e-06, "loss": 2.6528, "mean_token_accuracy": 0.5533854331588373, "num_tokens": 25492953.0, "step": 31 }, { "entropy": 0.525665283203125, "epoch": 0.3595505617977528, "grad_norm": 57.76343798611473, "learning_rate": 4.9992321231249425e-06, "loss": 2.6188, "mean_token_accuracy": 0.5729166837409139, "num_tokens": 26327442.0, "step": 32 }, { "entropy": 0.5118408203125, "epoch": 0.3707865168539326, "grad_norm": 58.36500275312141, "learning_rate": 4.998800226933367e-06, "loss": 2.6118, "mean_token_accuracy": 0.5572916832752526, "num_tokens": 27182108.0, "step": 33 }, { "entropy": 0.5230560302734375, "epoch": 0.38202247191011235, "grad_norm": 58.29103483362129, "learning_rate": 4.998272387591625e-06, "loss": 2.565, "mean_token_accuracy": 0.5807291839737445, "num_tokens": 27990119.0, "step": 34 }, { "entropy": 0.5248336791992188, "epoch": 0.39325842696629215, "grad_norm": 58.88250032879414, "learning_rate": 4.997648625366471e-06, "loss": 2.5656, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 28815728.0, "step": 35 }, { "entropy": 0.5244140625, "epoch": 0.4044943820224719, "grad_norm": 59.128939451714714, "learning_rate": 4.996928964207685e-06, "loss": 2.5173, "mean_token_accuracy": 0.5716146003687754, "num_tokens": 29630564.0, "step": 36 }, { "entropy": 0.529052734375, "epoch": 0.4157303370786517, "grad_norm": 59.872929282401124, "learning_rate": 4.99611343174715e-06, "loss": 2.475, "mean_token_accuracy": 0.5781250172294676, "num_tokens": 30441045.0, "step": 37 }, { "entropy": 0.5102386474609375, "epoch": 0.42696629213483145, "grad_norm": 59.524880081693325, "learning_rate": 4.995202059297795e-06, "loss": 2.475, "mean_token_accuracy": 0.5651041835080832, "num_tokens": 31303954.0, "step": 38 }, { "entropy": 0.5181503295898438, "epoch": 0.43820224719101125, "grad_norm": 59.47510189267516, "learning_rate": 4.99419488185239e-06, "loss": 2.4294, "mean_token_accuracy": 0.5833333507180214, "num_tokens": 32133913.0, "step": 39 }, { "entropy": 0.5339736938476562, "epoch": 0.449438202247191, "grad_norm": 60.729118832026174, "learning_rate": 4.993091938082206e-06, "loss": 2.4088, "mean_token_accuracy": 0.5872396008344367, "num_tokens": 32917166.0, "step": 40 }, { "entropy": 0.50799560546875, "epoch": 0.4606741573033708, "grad_norm": 60.78892852056622, "learning_rate": 4.991893270335526e-06, "loss": 2.4005, "mean_token_accuracy": 0.5611979333916679, "num_tokens": 33768662.0, "step": 41 }, { "entropy": 0.5279388427734375, "epoch": 0.47191011235955055, "grad_norm": 59.9563298919123, "learning_rate": 4.990598924636019e-06, "loss": 2.3612, "mean_token_accuracy": 0.5794271006016061, "num_tokens": 34579535.0, "step": 42 }, { "entropy": 0.5206527709960938, "epoch": 0.48314606741573035, "grad_norm": 60.22141204319965, "learning_rate": 4.989208950680979e-06, "loss": 2.3572, "mean_token_accuracy": 0.5611979333916679, "num_tokens": 35405664.0, "step": 43 }, { "entropy": 0.518218994140625, "epoch": 0.4943820224719101, "grad_norm": 60.352777525668316, "learning_rate": 4.987723401839409e-06, "loss": 2.3016, "mean_token_accuracy": 0.5976562678115442, "num_tokens": 36237455.0, "step": 44 }, { "entropy": 0.5099258422851562, "epoch": 0.5056179775280899, "grad_norm": 60.37422724299012, "learning_rate": 4.9861423351499786e-06, "loss": 2.3152, "mean_token_accuracy": 0.5989583493210375, "num_tokens": 37080531.0, "step": 45 }, { "entropy": 0.5283050537109375, "epoch": 0.5168539325842697, "grad_norm": 60.891478974349575, "learning_rate": 4.984465811318826e-06, "loss": 2.2646, "mean_token_accuracy": 0.7473958439659327, "num_tokens": 37899788.0, "step": 46 }, { "entropy": 0.5109405517578125, "epoch": 0.5280898876404494, "grad_norm": 60.95117855356651, "learning_rate": 4.982693894717237e-06, "loss": 2.261, "mean_token_accuracy": 0.8867187564028427, "num_tokens": 38740893.0, "step": 47 }, { "entropy": 0.53253173828125, "epoch": 0.5393258426966292, "grad_norm": 60.74187561074295, "learning_rate": 4.980826653379163e-06, "loss": 2.2125, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 39582346.0, "step": 48 }, { "entropy": 0.5358963012695312, "epoch": 0.550561797752809, "grad_norm": 60.56695566553226, "learning_rate": 4.97886415899862e-06, "loss": 2.1837, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 40397227.0, "step": 49 }, { "entropy": 0.528045654296875, "epoch": 0.5617977528089888, "grad_norm": 60.65909676388702, "learning_rate": 4.976806486926926e-06, "loss": 2.1628, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 41239235.0, "step": 50 }, { "entropy": 0.5263214111328125, "epoch": 0.5730337078651685, "grad_norm": 60.510109640876315, "learning_rate": 4.9746537161698125e-06, "loss": 2.1332, "mean_token_accuracy": 0.912760421866551, "num_tokens": 42099432.0, "step": 51 }, { "entropy": 0.531646728515625, "epoch": 0.5842696629213483, "grad_norm": 60.707459562078135, "learning_rate": 4.972405929384391e-06, "loss": 2.1541, "mean_token_accuracy": 0.901041672565043, "num_tokens": 42954514.0, "step": 52 }, { "entropy": 0.5231857299804688, "epoch": 0.5955056179775281, "grad_norm": 60.712009812773005, "learning_rate": 4.970063212875979e-06, "loss": 2.1064, "mean_token_accuracy": 0.901041672565043, "num_tokens": 43831261.0, "step": 53 }, { "entropy": 0.5416336059570312, "epoch": 0.6067415730337079, "grad_norm": 59.97544270872148, "learning_rate": 4.967625656594782e-06, "loss": 2.0685, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 44663681.0, "step": 54 }, { "entropy": 0.5709991455078125, "epoch": 0.6179775280898876, "grad_norm": 60.549193390626336, "learning_rate": 4.965093354132451e-06, "loss": 2.0444, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 45428334.0, "step": 55 }, { "entropy": 0.5407867431640625, "epoch": 0.6292134831460674, "grad_norm": 60.807708362065945, "learning_rate": 4.962466402718475e-06, "loss": 2.0334, "mean_token_accuracy": 0.8997395893093199, "num_tokens": 46279619.0, "step": 56 }, { "entropy": 0.5298614501953125, "epoch": 0.6404494382022472, "grad_norm": 59.86881859999676, "learning_rate": 4.959744903216458e-06, "loss": 1.9953, "mean_token_accuracy": 0.912760421866551, "num_tokens": 47142939.0, "step": 57 }, { "entropy": 0.5374221801757812, "epoch": 0.651685393258427, "grad_norm": 60.387166382590635, "learning_rate": 4.9569289601202405e-06, "loss": 1.975, "mean_token_accuracy": 0.9153645883779973, "num_tokens": 47975113.0, "step": 58 }, { "entropy": 0.5334854125976562, "epoch": 0.6629213483146067, "grad_norm": 59.65588927086945, "learning_rate": 4.954018681549891e-06, "loss": 1.9455, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 48809586.0, "step": 59 }, { "entropy": 0.5394287109375, "epoch": 0.6741573033707865, "grad_norm": 63.51403196675763, "learning_rate": 4.951014179247555e-06, "loss": 1.9311, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 49623736.0, "step": 60 }, { "entropy": 0.5323715209960938, "epoch": 0.6853932584269663, "grad_norm": 59.81148319230244, "learning_rate": 4.9479155685731595e-06, "loss": 1.8914, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 50448514.0, "step": 61 }, { "entropy": 0.5502471923828125, "epoch": 0.6966292134831461, "grad_norm": 61.91436012722269, "learning_rate": 4.944722968499989e-06, "loss": 1.8825, "mean_token_accuracy": 0.9101562553551048, "num_tokens": 51261440.0, "step": 62 }, { "entropy": 0.5454025268554688, "epoch": 0.7078651685393258, "grad_norm": 60.64630880005511, "learning_rate": 4.9414365016101144e-06, "loss": 1.8559, "mean_token_accuracy": 0.912760421866551, "num_tokens": 52075479.0, "step": 63 }, { "entropy": 0.53387451171875, "epoch": 0.7191011235955056, "grad_norm": 59.4763603036066, "learning_rate": 4.938056294089689e-06, "loss": 1.8253, "mean_token_accuracy": 0.9101562553551048, "num_tokens": 52947384.0, "step": 64 }, { "entropy": 0.5350189208984375, "epoch": 0.7303370786516854, "grad_norm": 58.59080646861511, "learning_rate": 4.934582475724101e-06, "loss": 1.7885, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 53803463.0, "step": 65 }, { "entropy": 0.5385360717773438, "epoch": 0.7415730337078652, "grad_norm": 59.607691840587314, "learning_rate": 4.93101517989299e-06, "loss": 1.7782, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 54629114.0, "step": 66 }, { "entropy": 0.5513763427734375, "epoch": 0.7528089887640449, "grad_norm": 58.79322877259206, "learning_rate": 4.927354543565131e-06, "loss": 1.7471, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 55460361.0, "step": 67 }, { "entropy": 0.5551681518554688, "epoch": 0.7640449438202247, "grad_norm": 57.96512656440569, "learning_rate": 4.923600707293166e-06, "loss": 1.7144, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 56245748.0, "step": 68 }, { "entropy": 0.56158447265625, "epoch": 0.7752808988764045, "grad_norm": 58.47646030692927, "learning_rate": 4.919753815208218e-06, "loss": 1.6723, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 57025912.0, "step": 69 }, { "entropy": 0.5417938232421875, "epoch": 0.7865168539325843, "grad_norm": 59.27714480205728, "learning_rate": 4.915814015014349e-06, "loss": 1.6721, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 57834059.0, "step": 70 }, { "entropy": 0.5401611328125, "epoch": 0.797752808988764, "grad_norm": 58.344511616531506, "learning_rate": 4.91178145798289e-06, "loss": 1.6156, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 58666075.0, "step": 71 }, { "entropy": 0.5393142700195312, "epoch": 0.8089887640449438, "grad_norm": 59.042195880718445, "learning_rate": 4.90765629894664e-06, "loss": 1.598, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 59494512.0, "step": 72 }, { "entropy": 0.5437774658203125, "epoch": 0.8202247191011236, "grad_norm": 58.82751577757508, "learning_rate": 4.90343869629391e-06, "loss": 1.5733, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 60288577.0, "step": 73 }, { "entropy": 0.5414581298828125, "epoch": 0.8314606741573034, "grad_norm": 58.26603403657131, "learning_rate": 4.89912881196245e-06, "loss": 1.5266, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 61117524.0, "step": 74 }, { "entropy": 0.5426177978515625, "epoch": 0.8426966292134831, "grad_norm": 59.26658241025557, "learning_rate": 4.8947268114332276e-06, "loss": 1.5502, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 61931952.0, "step": 75 }, { "entropy": 0.536407470703125, "epoch": 0.8539325842696629, "grad_norm": 58.860024938330206, "learning_rate": 4.890232863724075e-06, "loss": 1.4873, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 62774606.0, "step": 76 }, { "entropy": 0.5328445434570312, "epoch": 0.8651685393258427, "grad_norm": 58.76272222471983, "learning_rate": 4.8856471413831995e-06, "loss": 1.4552, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 63636059.0, "step": 77 }, { "entropy": 0.5324859619140625, "epoch": 0.8764044943820225, "grad_norm": 58.95451119005953, "learning_rate": 4.880969820482559e-06, "loss": 1.4268, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 64456269.0, "step": 78 }, { "entropy": 0.5229721069335938, "epoch": 0.8876404494382022, "grad_norm": 58.84513969144007, "learning_rate": 4.8762010806111e-06, "loss": 1.4127, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 65332024.0, "step": 79 }, { "entropy": 0.5389480590820312, "epoch": 0.898876404494382, "grad_norm": 58.33326344529121, "learning_rate": 4.8713411048678635e-06, "loss": 1.3606, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 66146121.0, "step": 80 }, { "entropy": 0.5313339233398438, "epoch": 0.9101123595505618, "grad_norm": 58.770046988203866, "learning_rate": 4.866390079854956e-06, "loss": 1.3559, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 66983445.0, "step": 81 }, { "entropy": 0.543670654296875, "epoch": 0.9213483146067416, "grad_norm": 58.35961263678049, "learning_rate": 4.861348195670381e-06, "loss": 1.3214, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 67786937.0, "step": 82 }, { "entropy": 0.5296173095703125, "epoch": 0.9325842696629213, "grad_norm": 58.16424063871105, "learning_rate": 4.856215645900742e-06, "loss": 1.2894, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 68600141.0, "step": 83 }, { "entropy": 0.5406417846679688, "epoch": 0.9438202247191011, "grad_norm": 58.67244467783903, "learning_rate": 4.850992627613812e-06, "loss": 1.2609, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 69454327.0, "step": 84 }, { "entropy": 0.5292129516601562, "epoch": 0.9550561797752809, "grad_norm": 57.850796732664065, "learning_rate": 4.845679341350963e-06, "loss": 1.2283, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 70270259.0, "step": 85 }, { "entropy": 0.5702438354492188, "epoch": 0.9662921348314607, "grad_norm": 57.61484422295124, "learning_rate": 4.8402759911194705e-06, "loss": 1.175, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 71047982.0, "step": 86 }, { "entropy": 0.547698974609375, "epoch": 0.9775280898876404, "grad_norm": 57.52383290276939, "learning_rate": 4.834782784384674e-06, "loss": 1.1626, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 71864427.0, "step": 87 }, { "entropy": 0.5348434448242188, "epoch": 0.9887640449438202, "grad_norm": 57.09411552139804, "learning_rate": 4.8291999320620185e-06, "loss": 1.1274, "mean_token_accuracy": 0.945312503259629, "num_tokens": 72710085.0, "step": 88 }, { "entropy": 0.5321273803710938, "epoch": 1.0, "grad_norm": 57.070649311975515, "learning_rate": 4.823527648508951e-06, "loss": 1.1004, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 73549946.0, "step": 89 }, { "entropy": 0.5517959594726562, "epoch": 1.0112359550561798, "grad_norm": 57.109477979057964, "learning_rate": 4.817766151516693e-06, "loss": 1.0819, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 74355101.0, "step": 90 }, { "entropy": 0.53094482421875, "epoch": 1.0224719101123596, "grad_norm": 57.86303273485758, "learning_rate": 4.811915662301877e-06, "loss": 1.055, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 75191280.0, "step": 91 }, { "entropy": 0.5199966430664062, "epoch": 1.0337078651685394, "grad_norm": 57.09250683738646, "learning_rate": 4.805976405498052e-06, "loss": 1.0464, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 76053228.0, "step": 92 }, { "entropy": 0.5236129760742188, "epoch": 1.0449438202247192, "grad_norm": 56.56208472514724, "learning_rate": 4.799948609147061e-06, "loss": 1.0133, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 76922703.0, "step": 93 }, { "entropy": 0.5320358276367188, "epoch": 1.0561797752808988, "grad_norm": 56.52468060162162, "learning_rate": 4.793832504690283e-06, "loss": 0.984, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 77760537.0, "step": 94 }, { "entropy": 0.5383987426757812, "epoch": 1.0674157303370786, "grad_norm": 56.23138412443709, "learning_rate": 4.787628326959747e-06, "loss": 0.9668, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 78584878.0, "step": 95 }, { "entropy": 0.5504608154296875, "epoch": 1.0786516853932584, "grad_norm": 55.775703271501115, "learning_rate": 4.7813363141691166e-06, "loss": 0.9265, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 79362452.0, "step": 96 }, { "entropy": 0.52728271484375, "epoch": 1.0898876404494382, "grad_norm": 55.443074597823376, "learning_rate": 4.774956707904542e-06, "loss": 0.9173, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 80224183.0, "step": 97 }, { "entropy": 0.5221405029296875, "epoch": 1.101123595505618, "grad_norm": 55.28172803435839, "learning_rate": 4.768489753115386e-06, "loss": 0.8786, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 81054014.0, "step": 98 }, { "entropy": 0.522125244140625, "epoch": 1.1123595505617978, "grad_norm": 55.15995525310935, "learning_rate": 4.761935698104817e-06, "loss": 0.8674, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 81886788.0, "step": 99 }, { "entropy": 0.5253677368164062, "epoch": 1.1235955056179776, "grad_norm": 54.71553871787075, "learning_rate": 4.755294794520277e-06, "loss": 0.8102, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 82715583.0, "step": 100 }, { "entropy": 0.540069580078125, "epoch": 1.1348314606741572, "grad_norm": 56.64220620441506, "learning_rate": 4.7485672973438175e-06, "loss": 0.8516, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 83523671.0, "step": 101 }, { "entropy": 0.532379150390625, "epoch": 1.146067415730337, "grad_norm": 57.49781410452237, "learning_rate": 4.741753464882312e-06, "loss": 0.8225, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 84335142.0, "step": 102 }, { "entropy": 0.561614990234375, "epoch": 1.1573033707865168, "grad_norm": 54.70294783987688, "learning_rate": 4.734853558757534e-06, "loss": 0.7812, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 85112682.0, "step": 103 }, { "entropy": 0.5339889526367188, "epoch": 1.1685393258426966, "grad_norm": 53.21911232668916, "learning_rate": 4.727867843896116e-06, "loss": 0.7431, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 85953912.0, "step": 104 }, { "entropy": 0.5423355102539062, "epoch": 1.1797752808988764, "grad_norm": 52.7058239940401, "learning_rate": 4.72079658851938e-06, "loss": 0.7265, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 86770289.0, "step": 105 }, { "entropy": 0.5316314697265625, "epoch": 1.1910112359550562, "grad_norm": 52.57859861877315, "learning_rate": 4.7136400641330245e-06, "loss": 0.7063, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 87591764.0, "step": 106 }, { "entropy": 0.5559310913085938, "epoch": 1.202247191011236, "grad_norm": 51.41978129735086, "learning_rate": 4.706398545516722e-06, "loss": 0.6742, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 88393028.0, "step": 107 }, { "entropy": 0.539886474609375, "epoch": 1.2134831460674158, "grad_norm": 51.39997895136707, "learning_rate": 4.6990723107135475e-06, "loss": 0.6652, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 89219905.0, "step": 108 }, { "entropy": 0.5354461669921875, "epoch": 1.2247191011235956, "grad_norm": 50.28502697998156, "learning_rate": 4.691661641019316e-06, "loss": 0.6255, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 90075085.0, "step": 109 }, { "entropy": 0.5335617065429688, "epoch": 1.2359550561797752, "grad_norm": 48.758575622533606, "learning_rate": 4.684166820971779e-06, "loss": 0.6082, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 90906884.0, "step": 110 }, { "entropy": 0.5511932373046875, "epoch": 1.247191011235955, "grad_norm": 47.80816266829214, "learning_rate": 4.6765881383396985e-06, "loss": 0.6082, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 91706441.0, "step": 111 }, { "entropy": 0.5378570556640625, "epoch": 1.2584269662921348, "grad_norm": 46.527536617371204, "learning_rate": 4.6689258841117946e-06, "loss": 0.5867, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 92547894.0, "step": 112 }, { "entropy": 0.53167724609375, "epoch": 1.2696629213483146, "grad_norm": 45.65667911602808, "learning_rate": 4.6611803524855805e-06, "loss": 0.56, "mean_token_accuracy": 0.955729169305414, "num_tokens": 93391238.0, "step": 113 }, { "entropy": 0.5202484130859375, "epoch": 1.2808988764044944, "grad_norm": 45.04158241974194, "learning_rate": 4.65335184085606e-06, "loss": 0.5511, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 94259336.0, "step": 114 }, { "entropy": 0.5372695922851562, "epoch": 1.2921348314606742, "grad_norm": 43.878757463536445, "learning_rate": 4.64544064980431e-06, "loss": 0.5162, "mean_token_accuracy": 0.945312503259629, "num_tokens": 95078639.0, "step": 115 }, { "entropy": 0.5404129028320312, "epoch": 1.303370786516854, "grad_norm": 42.9718114331032, "learning_rate": 4.637447083085944e-06, "loss": 0.5121, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 95914468.0, "step": 116 }, { "entropy": 0.53564453125, "epoch": 1.3146067415730336, "grad_norm": 41.78707116055528, "learning_rate": 4.629371447619443e-06, "loss": 0.4901, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 96715346.0, "step": 117 }, { "entropy": 0.5341415405273438, "epoch": 1.3258426966292136, "grad_norm": 41.532789530235036, "learning_rate": 4.621214053474374e-06, "loss": 0.483, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 97534917.0, "step": 118 }, { "entropy": 0.5507736206054688, "epoch": 1.3370786516853932, "grad_norm": 44.303988041655955, "learning_rate": 4.612975213859487e-06, "loss": 0.5075, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 98348468.0, "step": 119 }, { "entropy": 0.538421630859375, "epoch": 1.348314606741573, "grad_norm": 39.13623243577061, "learning_rate": 4.604655245110684e-06, "loss": 0.4581, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 99146166.0, "step": 120 }, { "entropy": 0.5124588012695312, "epoch": 1.3595505617977528, "grad_norm": 35.61302129069593, "learning_rate": 4.596254466678877e-06, "loss": 0.4225, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 100023483.0, "step": 121 }, { "entropy": 0.5451736450195312, "epoch": 1.3707865168539326, "grad_norm": 34.79497149141314, "learning_rate": 4.5877732011177215e-06, "loss": 0.4174, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 100807531.0, "step": 122 }, { "entropy": 0.5343170166015625, "epoch": 1.3820224719101124, "grad_norm": 33.6184828101984, "learning_rate": 4.579211774071229e-06, "loss": 0.3814, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 101643474.0, "step": 123 }, { "entropy": 0.5354690551757812, "epoch": 1.3932584269662922, "grad_norm": 32.37729538841938, "learning_rate": 4.570570514261272e-06, "loss": 0.3839, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 102462858.0, "step": 124 }, { "entropy": 0.5319061279296875, "epoch": 1.404494382022472, "grad_norm": 31.363308088082444, "learning_rate": 4.561849753474951e-06, "loss": 0.3421, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 103287709.0, "step": 125 }, { "entropy": 0.5308456420898438, "epoch": 1.4157303370786516, "grad_norm": 34.482826340425774, "learning_rate": 4.553049826551864e-06, "loss": 0.3421, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 104117613.0, "step": 126 }, { "entropy": 0.5468826293945312, "epoch": 1.4269662921348314, "grad_norm": 43.099922541286496, "learning_rate": 4.544171071371246e-06, "loss": 0.4766, "mean_token_accuracy": 0.8997395893093199, "num_tokens": 104910285.0, "step": 127 }, { "entropy": 0.543365478515625, "epoch": 1.4382022471910112, "grad_norm": 28.630466194759624, "learning_rate": 4.535213828838998e-06, "loss": 0.3273, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 105723583.0, "step": 128 }, { "entropy": 0.53936767578125, "epoch": 1.449438202247191, "grad_norm": 35.38257284310518, "learning_rate": 4.526178442874596e-06, "loss": 0.3644, "mean_token_accuracy": 0.923177087912336, "num_tokens": 106547148.0, "step": 129 }, { "entropy": 0.5411376953125, "epoch": 1.4606741573033708, "grad_norm": 26.906307855558588, "learning_rate": 4.517065260397887e-06, "loss": 0.317, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 107421430.0, "step": 130 }, { "entropy": 0.5464248657226562, "epoch": 1.4719101123595506, "grad_norm": 27.053952446861047, "learning_rate": 4.5078746313157684e-06, "loss": 0.3227, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 108268831.0, "step": 131 }, { "entropy": 0.5452651977539062, "epoch": 1.4831460674157304, "grad_norm": 25.285396700518028, "learning_rate": 4.498606908508754e-06, "loss": 0.2983, "mean_token_accuracy": 0.955729169305414, "num_tokens": 109099543.0, "step": 132 }, { "entropy": 0.550140380859375, "epoch": 1.49438202247191, "grad_norm": 23.61719992855654, "learning_rate": 4.489262447817421e-06, "loss": 0.2844, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 109922580.0, "step": 133 }, { "entropy": 0.5431671142578125, "epoch": 1.50561797752809, "grad_norm": 25.4495115008555, "learning_rate": 4.479841608028756e-06, "loss": 0.2781, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 110764011.0, "step": 134 }, { "entropy": 0.5407867431640625, "epoch": 1.5168539325842696, "grad_norm": 21.937846662427205, "learning_rate": 4.470344750862369e-06, "loss": 0.2533, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 111604839.0, "step": 135 }, { "entropy": 0.5406417846679688, "epoch": 1.5280898876404494, "grad_norm": 21.05181225648866, "learning_rate": 4.460772240956609e-06, "loss": 0.2445, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 112446849.0, "step": 136 }, { "entropy": 0.54180908203125, "epoch": 1.5393258426966292, "grad_norm": 26.085445973452664, "learning_rate": 4.4511244458545666e-06, "loss": 0.2675, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 113267124.0, "step": 137 }, { "entropy": 0.53436279296875, "epoch": 1.550561797752809, "grad_norm": 18.064464279776033, "learning_rate": 4.441401735989958e-06, "loss": 0.2398, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 114118062.0, "step": 138 }, { "entropy": 0.5387649536132812, "epoch": 1.5617977528089888, "grad_norm": 20.856843279353505, "learning_rate": 4.431604484672905e-06, "loss": 0.2399, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 114957756.0, "step": 139 }, { "entropy": 0.5518875122070312, "epoch": 1.5730337078651684, "grad_norm": 15.763148774834674, "learning_rate": 4.421733068075596e-06, "loss": 0.2171, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 115750224.0, "step": 140 }, { "entropy": 0.5423736572265625, "epoch": 1.5842696629213484, "grad_norm": 19.090770454447515, "learning_rate": 4.411787865217847e-06, "loss": 0.2309, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 116594786.0, "step": 141 }, { "entropy": 0.5412063598632812, "epoch": 1.595505617977528, "grad_norm": 15.016224731584037, "learning_rate": 4.401769257952551e-06, "loss": 0.2385, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 117425980.0, "step": 142 }, { "entropy": 0.5459976196289062, "epoch": 1.606741573033708, "grad_norm": 18.355720830544215, "learning_rate": 4.3916776309510115e-06, "loss": 0.2292, "mean_token_accuracy": 0.945312503259629, "num_tokens": 118249651.0, "step": 143 }, { "entropy": 0.5451126098632812, "epoch": 1.6179775280898876, "grad_norm": 14.837666903281002, "learning_rate": 4.381513371688174e-06, "loss": 0.1965, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 119084058.0, "step": 144 }, { "entropy": 0.548553466796875, "epoch": 1.6292134831460674, "grad_norm": 16.682044870857027, "learning_rate": 4.3712768704277535e-06, "loss": 0.2006, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 119900949.0, "step": 145 }, { "entropy": 0.54833984375, "epoch": 1.6404494382022472, "grad_norm": 13.398664955064358, "learning_rate": 4.360968520207241e-06, "loss": 0.1766, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 120703593.0, "step": 146 }, { "entropy": 0.5443878173828125, "epoch": 1.651685393258427, "grad_norm": 19.269521133635244, "learning_rate": 4.35058871682282e-06, "loss": 0.2035, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 121516344.0, "step": 147 }, { "entropy": 0.5469894409179688, "epoch": 1.6629213483146068, "grad_norm": 17.4616206956177, "learning_rate": 4.340137858814168e-06, "loss": 0.2089, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 122351127.0, "step": 148 }, { "entropy": 0.5352020263671875, "epoch": 1.6741573033707864, "grad_norm": 17.178879243601084, "learning_rate": 4.329616347449154e-06, "loss": 0.213, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 123172441.0, "step": 149 }, { "entropy": 0.5321578979492188, "epoch": 1.6853932584269664, "grad_norm": 13.357276465389605, "learning_rate": 4.3190245867084275e-06, "loss": 0.1728, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 124012921.0, "step": 150 }, { "entropy": 0.5493316650390625, "epoch": 1.696629213483146, "grad_norm": 11.642016241780885, "learning_rate": 4.308362983269916e-06, "loss": 0.1747, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 124803857.0, "step": 151 }, { "entropy": 0.5419769287109375, "epoch": 1.7078651685393258, "grad_norm": 11.421215730426258, "learning_rate": 4.297631946493202e-06, "loss": 0.1715, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 125627395.0, "step": 152 }, { "entropy": 0.5377578735351562, "epoch": 1.7191011235955056, "grad_norm": 9.875789917298611, "learning_rate": 4.2868318884038075e-06, "loss": 0.177, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 126450383.0, "step": 153 }, { "entropy": 0.5468826293945312, "epoch": 1.7303370786516854, "grad_norm": 11.701291607682503, "learning_rate": 4.275963223677379e-06, "loss": 0.1517, "mean_token_accuracy": 0.945312503259629, "num_tokens": 127279834.0, "step": 154 }, { "entropy": 0.523223876953125, "epoch": 1.7415730337078652, "grad_norm": 7.880131869221865, "learning_rate": 4.265026369623761e-06, "loss": 0.1662, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 128165756.0, "step": 155 }, { "entropy": 0.5451507568359375, "epoch": 1.7528089887640448, "grad_norm": 8.585183353114594, "learning_rate": 4.254021746170972e-06, "loss": 0.1449, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 128976327.0, "step": 156 }, { "entropy": 0.5521163940429688, "epoch": 1.7640449438202248, "grad_norm": 8.571471771414366, "learning_rate": 4.242949775849083e-06, "loss": 0.1578, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 129771800.0, "step": 157 }, { "entropy": 0.5428085327148438, "epoch": 1.7752808988764044, "grad_norm": 16.43602264445111, "learning_rate": 4.231810883773999e-06, "loss": 0.1799, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 130569228.0, "step": 158 }, { "entropy": 0.5567550659179688, "epoch": 1.7865168539325844, "grad_norm": 7.531185608644983, "learning_rate": 4.220605497631125e-06, "loss": 0.156, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 131343343.0, "step": 159 }, { "entropy": 0.5351791381835938, "epoch": 1.797752808988764, "grad_norm": 18.604263622333004, "learning_rate": 4.209334047658956e-06, "loss": 0.1977, "mean_token_accuracy": 0.9153645883779973, "num_tokens": 132192522.0, "step": 160 }, { "entropy": 0.550018310546875, "epoch": 1.8089887640449438, "grad_norm": 15.376187129436879, "learning_rate": 4.197996966632551e-06, "loss": 0.1777, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 132973585.0, "step": 161 }, { "entropy": 0.534149169921875, "epoch": 1.8202247191011236, "grad_norm": 6.060874528483011, "learning_rate": 4.186594689846919e-06, "loss": 0.1446, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 133810589.0, "step": 162 }, { "entropy": 0.5483245849609375, "epoch": 1.8314606741573034, "grad_norm": 13.790675275430694, "learning_rate": 4.175127655100306e-06, "loss": 0.1647, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 134632550.0, "step": 163 }, { "entropy": 0.5397720336914062, "epoch": 1.8426966292134832, "grad_norm": 9.616982136039516, "learning_rate": 4.163596302677383e-06, "loss": 0.127, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 135475079.0, "step": 164 }, { "entropy": 0.5223236083984375, "epoch": 1.8539325842696628, "grad_norm": 11.249984036705216, "learning_rate": 4.152001075332342e-06, "loss": 0.134, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 136337338.0, "step": 165 }, { "entropy": 0.5302963256835938, "epoch": 1.8651685393258428, "grad_norm": 11.29797181598071, "learning_rate": 4.140342418271897e-06, "loss": 0.1743, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 137170980.0, "step": 166 }, { "entropy": 0.5428543090820312, "epoch": 1.8764044943820224, "grad_norm": 5.666218885177399, "learning_rate": 4.128620779138191e-06, "loss": 0.1069, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 137967406.0, "step": 167 }, { "entropy": 0.5390548706054688, "epoch": 1.8876404494382022, "grad_norm": 7.4433019218071435, "learning_rate": 4.116836607991603e-06, "loss": 0.1159, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 138776095.0, "step": 168 }, { "entropy": 0.5308151245117188, "epoch": 1.898876404494382, "grad_norm": 5.850528993135207, "learning_rate": 4.104990357293478e-06, "loss": 0.1243, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 139611842.0, "step": 169 }, { "entropy": 0.5367202758789062, "epoch": 1.9101123595505618, "grad_norm": 4.483523712916532, "learning_rate": 4.09308248188874e-06, "loss": 0.1148, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 140452877.0, "step": 170 }, { "entropy": 0.538116455078125, "epoch": 1.9213483146067416, "grad_norm": 4.63259111392511, "learning_rate": 4.081113438988443e-06, "loss": 0.1168, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 141283700.0, "step": 171 }, { "entropy": 0.5343170166015625, "epoch": 1.9325842696629212, "grad_norm": 5.6051268320124, "learning_rate": 4.069083688152206e-06, "loss": 0.1042, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 142114404.0, "step": 172 }, { "entropy": 0.5272216796875, "epoch": 1.9438202247191012, "grad_norm": 5.637386397174846, "learning_rate": 4.056993691270569e-06, "loss": 0.1067, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 142975160.0, "step": 173 }, { "entropy": 0.5473403930664062, "epoch": 1.9550561797752808, "grad_norm": 4.4513312700269605, "learning_rate": 4.044843912547262e-06, "loss": 0.0698, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 143782020.0, "step": 174 }, { "entropy": 0.5282516479492188, "epoch": 1.9662921348314608, "grad_norm": 8.165500644505597, "learning_rate": 4.032634818481382e-06, "loss": 0.1, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 144634435.0, "step": 175 }, { "entropy": 0.5271377563476562, "epoch": 1.9775280898876404, "grad_norm": 8.58130607122661, "learning_rate": 4.020366877849477e-06, "loss": 0.1126, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 145481475.0, "step": 176 }, { "entropy": 0.5458755493164062, "epoch": 1.9887640449438202, "grad_norm": 11.866586657273107, "learning_rate": 4.008040561687549e-06, "loss": 0.1346, "mean_token_accuracy": 0.955729169305414, "num_tokens": 146272147.0, "step": 177 }, { "entropy": 0.5224761962890625, "epoch": 2.0, "grad_norm": 7.043806183115956, "learning_rate": 3.995656343272969e-06, "loss": 0.1186, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 147128144.0, "step": 178 }, { "entropy": 0.540283203125, "epoch": 2.0112359550561796, "grad_norm": 10.498406309580753, "learning_rate": 3.983214698106305e-06, "loss": 0.1109, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 147963457.0, "step": 179 }, { "entropy": 0.519134521484375, "epoch": 2.0224719101123596, "grad_norm": 9.652118424262301, "learning_rate": 3.970716103893065e-06, "loss": 0.1069, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 148846204.0, "step": 180 }, { "entropy": 0.5508880615234375, "epoch": 2.033707865168539, "grad_norm": 7.951460626005676, "learning_rate": 3.958161040525354e-06, "loss": 0.0995, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 149670358.0, "step": 181 }, { "entropy": 0.5459823608398438, "epoch": 2.044943820224719, "grad_norm": 4.843661555713835, "learning_rate": 3.94554999006345e-06, "loss": 0.0914, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 150475622.0, "step": 182 }, { "entropy": 0.5539932250976562, "epoch": 2.056179775280899, "grad_norm": 10.638552878346802, "learning_rate": 3.932883436717291e-06, "loss": 0.1272, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 151275741.0, "step": 183 }, { "entropy": 0.531158447265625, "epoch": 2.067415730337079, "grad_norm": 16.65219342556677, "learning_rate": 3.92016186682789e-06, "loss": 0.1808, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 152107378.0, "step": 184 }, { "entropy": 0.5382614135742188, "epoch": 2.0786516853932584, "grad_norm": 4.433323748901134, "learning_rate": 3.907385768848656e-06, "loss": 0.0996, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 152930819.0, "step": 185 }, { "entropy": 0.5397109985351562, "epoch": 2.0898876404494384, "grad_norm": 11.98756204808435, "learning_rate": 3.894555633326642e-06, "loss": 0.1189, "mean_token_accuracy": 0.955729169305414, "num_tokens": 153762988.0, "step": 186 }, { "entropy": 0.539337158203125, "epoch": 2.101123595505618, "grad_norm": 14.319094435898721, "learning_rate": 3.88167195288371e-06, "loss": 0.1407, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 154587742.0, "step": 187 }, { "entropy": 0.5379104614257812, "epoch": 2.1123595505617976, "grad_norm": 6.448338570022156, "learning_rate": 3.868735222197614e-06, "loss": 0.1047, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 155401431.0, "step": 188 }, { "entropy": 0.551666259765625, "epoch": 2.1235955056179776, "grad_norm": 11.070470267067817, "learning_rate": 3.85574593798301e-06, "loss": 0.1434, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 156232809.0, "step": 189 }, { "entropy": 0.5646133422851562, "epoch": 2.134831460674157, "grad_norm": 12.984647361123118, "learning_rate": 3.842704598972384e-06, "loss": 0.1244, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 157038066.0, "step": 190 }, { "entropy": 0.5512008666992188, "epoch": 2.146067415730337, "grad_norm": 7.571198459069707, "learning_rate": 3.8296117058969e-06, "loss": 0.0925, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 157847874.0, "step": 191 }, { "entropy": 0.538909912109375, "epoch": 2.157303370786517, "grad_norm": 3.498838742080043, "learning_rate": 3.816467761467175e-06, "loss": 0.0871, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 158683400.0, "step": 192 }, { "entropy": 0.5359344482421875, "epoch": 2.168539325842697, "grad_norm": 6.154847561172168, "learning_rate": 3.80327327035398e-06, "loss": 0.1039, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 159559442.0, "step": 193 }, { "entropy": 0.5301971435546875, "epoch": 2.1797752808988764, "grad_norm": 3.464618502492379, "learning_rate": 3.7900287391688584e-06, "loss": 0.0758, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 160409769.0, "step": 194 }, { "entropy": 0.5456008911132812, "epoch": 2.191011235955056, "grad_norm": 9.16083029440977, "learning_rate": 3.776734676444678e-06, "loss": 0.1088, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 161215428.0, "step": 195 }, { "entropy": 0.5370635986328125, "epoch": 2.202247191011236, "grad_norm": 8.727730015955157, "learning_rate": 3.763391592616104e-06, "loss": 0.098, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 162038062.0, "step": 196 }, { "entropy": 0.5242767333984375, "epoch": 2.2134831460674156, "grad_norm": 9.7605350617224, "learning_rate": 3.7500000000000005e-06, "loss": 0.1193, "mean_token_accuracy": 0.955729169305414, "num_tokens": 162898167.0, "step": 197 }, { "entropy": 0.5362014770507812, "epoch": 2.2247191011235956, "grad_norm": 11.751831338500692, "learning_rate": 3.7365604127757584e-06, "loss": 0.1232, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 163737294.0, "step": 198 }, { "entropy": 0.5273818969726562, "epoch": 2.235955056179775, "grad_norm": 5.446400766116038, "learning_rate": 3.7230733469655554e-06, "loss": 0.0746, "mean_token_accuracy": 0.977864584652707, "num_tokens": 164576231.0, "step": 199 }, { "entropy": 0.5343704223632812, "epoch": 2.247191011235955, "grad_norm": 15.034561777079416, "learning_rate": 3.709539320414544e-06, "loss": 0.1336, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 165421397.0, "step": 200 }, { "entropy": 0.537506103515625, "epoch": 2.258426966292135, "grad_norm": 16.63689156589337, "learning_rate": 3.6959588527709635e-06, "loss": 0.1461, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 166255000.0, "step": 201 }, { "entropy": 0.54998779296875, "epoch": 2.2696629213483144, "grad_norm": 6.385410420428761, "learning_rate": 3.6823324654661923e-06, "loss": 0.0849, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 167062178.0, "step": 202 }, { "entropy": 0.536407470703125, "epoch": 2.2808988764044944, "grad_norm": 5.6791741210641815, "learning_rate": 3.6686606816947264e-06, "loss": 0.078, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 167880704.0, "step": 203 }, { "entropy": 0.5526046752929688, "epoch": 2.292134831460674, "grad_norm": 7.912547116557072, "learning_rate": 3.6549440263940878e-06, "loss": 0.0945, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 168691010.0, "step": 204 }, { "entropy": 0.5461044311523438, "epoch": 2.303370786516854, "grad_norm": 6.52901004838527, "learning_rate": 3.6411830262246755e-06, "loss": 0.0935, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 169509964.0, "step": 205 }, { "entropy": 0.537628173828125, "epoch": 2.3146067415730336, "grad_norm": 2.6353286532560545, "learning_rate": 3.627378209549537e-06, "loss": 0.0706, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 170357405.0, "step": 206 }, { "entropy": 0.5371932983398438, "epoch": 2.3258426966292136, "grad_norm": 9.812177322775737, "learning_rate": 3.6135301064140856e-06, "loss": 0.1168, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 171203221.0, "step": 207 }, { "entropy": 0.5315933227539062, "epoch": 2.337078651685393, "grad_norm": 10.239216110161607, "learning_rate": 3.599639248525749e-06, "loss": 0.112, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 172038636.0, "step": 208 }, { "entropy": 0.53472900390625, "epoch": 2.348314606741573, "grad_norm": 3.8454624292214628, "learning_rate": 3.5857061692335503e-06, "loss": 0.0641, "mean_token_accuracy": 0.977864584652707, "num_tokens": 172864323.0, "step": 209 }, { "entropy": 0.5257797241210938, "epoch": 2.359550561797753, "grad_norm": 7.821223582969504, "learning_rate": 3.5717314035076355e-06, "loss": 0.09, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 173727784.0, "step": 210 }, { "entropy": 0.5433807373046875, "epoch": 2.370786516853933, "grad_norm": 7.137283108304516, "learning_rate": 3.5577154879187286e-06, "loss": 0.0816, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 174532281.0, "step": 211 }, { "entropy": 0.5399627685546875, "epoch": 2.3820224719101124, "grad_norm": 4.79769687231269, "learning_rate": 3.5436589606175296e-06, "loss": 0.0794, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 175364357.0, "step": 212 }, { "entropy": 0.5312271118164062, "epoch": 2.393258426966292, "grad_norm": 6.896383560835884, "learning_rate": 3.5295623613140563e-06, "loss": 0.0749, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 176186569.0, "step": 213 }, { "entropy": 0.527679443359375, "epoch": 2.404494382022472, "grad_norm": 8.908697981712665, "learning_rate": 3.5154262312569134e-06, "loss": 0.0701, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 177025300.0, "step": 214 }, { "entropy": 0.5305328369140625, "epoch": 2.4157303370786516, "grad_norm": 6.818692017843762, "learning_rate": 3.501251113212521e-06, "loss": 0.0667, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 177873635.0, "step": 215 }, { "entropy": 0.5447006225585938, "epoch": 2.4269662921348316, "grad_norm": 3.8165153692265603, "learning_rate": 3.4870375514442677e-06, "loss": 0.0733, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 178670872.0, "step": 216 }, { "entropy": 0.5322647094726562, "epoch": 2.438202247191011, "grad_norm": 5.369901004672838, "learning_rate": 3.4727860916916143e-06, "loss": 0.0663, "mean_token_accuracy": 0.977864584652707, "num_tokens": 179491783.0, "step": 217 }, { "entropy": 0.5320358276367188, "epoch": 2.449438202247191, "grad_norm": 4.555500008711359, "learning_rate": 3.458497281149143e-06, "loss": 0.0611, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 180317316.0, "step": 218 }, { "entropy": 0.5376434326171875, "epoch": 2.460674157303371, "grad_norm": 4.071209493199293, "learning_rate": 3.444171668445544e-06, "loss": 0.0542, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 181148889.0, "step": 219 }, { "entropy": 0.5237350463867188, "epoch": 2.4719101123595504, "grad_norm": 11.47920307652276, "learning_rate": 3.429809803622551e-06, "loss": 0.0843, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 182012448.0, "step": 220 }, { "entropy": 0.5371170043945312, "epoch": 2.4831460674157304, "grad_norm": 5.447244205245656, "learning_rate": 3.415412238113823e-06, "loss": 0.0493, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 182831219.0, "step": 221 }, { "entropy": 0.5419769287109375, "epoch": 2.49438202247191, "grad_norm": 3.402318570494703, "learning_rate": 3.400979524723773e-06, "loss": 0.0463, "mean_token_accuracy": 0.989583333954215, "num_tokens": 183649085.0, "step": 222 }, { "entropy": 0.5189590454101562, "epoch": 2.50561797752809, "grad_norm": 4.872229888332632, "learning_rate": 3.386512217606339e-06, "loss": 0.0534, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 184524290.0, "step": 223 }, { "entropy": 0.5490798950195312, "epoch": 2.5168539325842696, "grad_norm": 6.35163899835482, "learning_rate": 3.372010872243711e-06, "loss": 0.0593, "mean_token_accuracy": 0.977864584652707, "num_tokens": 185304697.0, "step": 224 }, { "entropy": 0.541900634765625, "epoch": 2.5280898876404496, "grad_norm": 3.75106123980352, "learning_rate": 3.357476045424998e-06, "loss": 0.0372, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 186091630.0, "step": 225 }, { "entropy": 0.5357437133789062, "epoch": 2.539325842696629, "grad_norm": 15.183650700034958, "learning_rate": 3.342908295224854e-06, "loss": 0.0698, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 186900960.0, "step": 226 }, { "entropy": 0.5135269165039062, "epoch": 2.550561797752809, "grad_norm": 7.11769951958488, "learning_rate": 3.32830818098205e-06, "loss": 0.0501, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 187751474.0, "step": 227 }, { "entropy": 0.5372314453125, "epoch": 2.561797752808989, "grad_norm": 4.989153243587186, "learning_rate": 3.313676263277995e-06, "loss": 0.0487, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 188559110.0, "step": 228 }, { "entropy": 0.5383682250976562, "epoch": 2.5730337078651684, "grad_norm": 6.827867800336417, "learning_rate": 3.299013103915214e-06, "loss": 0.0713, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 189360657.0, "step": 229 }, { "entropy": 0.5142898559570312, "epoch": 2.5842696629213484, "grad_norm": 8.36796100398585, "learning_rate": 3.2843192658957775e-06, "loss": 0.0759, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 190208504.0, "step": 230 }, { "entropy": 0.5430984497070312, "epoch": 2.595505617977528, "grad_norm": 6.384555684576824, "learning_rate": 3.269595313399683e-06, "loss": 0.0791, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 191011578.0, "step": 231 }, { "entropy": 0.51165771484375, "epoch": 2.606741573033708, "grad_norm": 7.460899194764321, "learning_rate": 3.2548418117631952e-06, "loss": 0.048, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 191877386.0, "step": 232 }, { "entropy": 0.5298843383789062, "epoch": 2.6179775280898876, "grad_norm": 6.821805872662446, "learning_rate": 3.240059327457138e-06, "loss": 0.0529, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 192727467.0, "step": 233 }, { "entropy": 0.5463333129882812, "epoch": 2.629213483146067, "grad_norm": 3.4805806068994416, "learning_rate": 3.2252484280651453e-06, "loss": 0.0511, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 193508507.0, "step": 234 }, { "entropy": 0.5321044921875, "epoch": 2.640449438202247, "grad_norm": 4.219564851577349, "learning_rate": 3.2104096822618657e-06, "loss": 0.0524, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 194340559.0, "step": 235 }, { "entropy": 0.5420913696289062, "epoch": 2.6516853932584272, "grad_norm": 2.9827980950480515, "learning_rate": 3.195543659791132e-06, "loss": 0.0342, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 195164876.0, "step": 236 }, { "entropy": 0.5332260131835938, "epoch": 2.662921348314607, "grad_norm": 5.696614902089947, "learning_rate": 3.1806509314440827e-06, "loss": 0.0437, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 195980793.0, "step": 237 }, { "entropy": 0.5210113525390625, "epoch": 2.6741573033707864, "grad_norm": 7.372573411260061, "learning_rate": 3.1657320690372464e-06, "loss": 0.0588, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 196834051.0, "step": 238 }, { "entropy": 0.54461669921875, "epoch": 2.6853932584269664, "grad_norm": 4.804908333028152, "learning_rate": 3.150787645390587e-06, "loss": 0.0461, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 197624113.0, "step": 239 }, { "entropy": 0.527008056640625, "epoch": 2.696629213483146, "grad_norm": 5.494419605196046, "learning_rate": 3.135818234305511e-06, "loss": 0.0502, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 198465881.0, "step": 240 }, { "entropy": 0.5293502807617188, "epoch": 2.7078651685393256, "grad_norm": 4.222259777927373, "learning_rate": 3.120824410542833e-06, "loss": 0.0319, "mean_token_accuracy": 0.989583333954215, "num_tokens": 199286078.0, "step": 241 }, { "entropy": 0.51971435546875, "epoch": 2.7191011235955056, "grad_norm": 9.126867635985343, "learning_rate": 3.1058067498007094e-06, "loss": 0.0521, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 200114610.0, "step": 242 }, { "entropy": 0.5160369873046875, "epoch": 2.7303370786516856, "grad_norm": 7.329097197987436, "learning_rate": 3.090765828692534e-06, "loss": 0.046, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 200955034.0, "step": 243 }, { "entropy": 0.5411148071289062, "epoch": 2.741573033707865, "grad_norm": 3.340662077032214, "learning_rate": 3.0757022247248e-06, "loss": 0.0318, "mean_token_accuracy": 0.989583333954215, "num_tokens": 201751463.0, "step": 244 }, { "entropy": 0.5378646850585938, "epoch": 2.752808988764045, "grad_norm": 4.8775745170805065, "learning_rate": 3.0606165162749212e-06, "loss": 0.0494, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 202542823.0, "step": 245 }, { "entropy": 0.5283050537109375, "epoch": 2.764044943820225, "grad_norm": 3.9452834388750384, "learning_rate": 3.045509282569031e-06, "loss": 0.0347, "mean_token_accuracy": 0.989583333954215, "num_tokens": 203354246.0, "step": 246 }, { "entropy": 0.5169830322265625, "epoch": 2.7752808988764044, "grad_norm": 5.491362292823874, "learning_rate": 3.0303811036597395e-06, "loss": 0.0388, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 204200941.0, "step": 247 }, { "entropy": 0.5260467529296875, "epoch": 2.7865168539325844, "grad_norm": 6.201924151041956, "learning_rate": 3.01523256040386e-06, "loss": 0.0508, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 205037033.0, "step": 248 }, { "entropy": 0.5252304077148438, "epoch": 2.797752808988764, "grad_norm": 6.457190267020416, "learning_rate": 3.0000642344401115e-06, "loss": 0.056, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 205854828.0, "step": 249 }, { "entropy": 0.5137939453125, "epoch": 2.808988764044944, "grad_norm": 4.69210936983822, "learning_rate": 2.9848767081667823e-06, "loss": 0.0311, "mean_token_accuracy": 0.989583333954215, "num_tokens": 206719096.0, "step": 250 }, { "entropy": 0.5233001708984375, "epoch": 2.8202247191011236, "grad_norm": 3.4937557726205766, "learning_rate": 2.9696705647193695e-06, "loss": 0.032, "mean_token_accuracy": 0.989583333954215, "num_tokens": 207564713.0, "step": 251 }, { "entropy": 0.5282363891601562, "epoch": 2.831460674157303, "grad_norm": 3.0073778425878825, "learning_rate": 2.9544463879481914e-06, "loss": 0.0415, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 208358356.0, "step": 252 }, { "entropy": 0.5185775756835938, "epoch": 2.842696629213483, "grad_norm": 4.817956965116018, "learning_rate": 2.9392047623959653e-06, "loss": 0.0466, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 209186191.0, "step": 253 }, { "entropy": 0.522125244140625, "epoch": 2.853932584269663, "grad_norm": 3.029972926905953, "learning_rate": 2.923946273275369e-06, "loss": 0.0231, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 210020565.0, "step": 254 }, { "entropy": 0.5248794555664062, "epoch": 2.865168539325843, "grad_norm": 3.3353822683100467, "learning_rate": 2.908671506446566e-06, "loss": 0.0347, "mean_token_accuracy": 0.989583333954215, "num_tokens": 210826562.0, "step": 255 }, { "entropy": 0.519561767578125, "epoch": 2.8764044943820224, "grad_norm": 4.164606363683451, "learning_rate": 2.8933810483947156e-06, "loss": 0.042, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 211637830.0, "step": 256 }, { "entropy": 0.5108184814453125, "epoch": 2.8876404494382024, "grad_norm": 3.4352370759093667, "learning_rate": 2.878075486207452e-06, "loss": 0.0241, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 212483603.0, "step": 257 }, { "entropy": 0.5216064453125, "epoch": 2.898876404494382, "grad_norm": 4.264768762251155, "learning_rate": 2.8627554075523426e-06, "loss": 0.0264, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 213277086.0, "step": 258 }, { "entropy": 0.5249557495117188, "epoch": 2.9101123595505616, "grad_norm": 5.873772509100592, "learning_rate": 2.8474214006543255e-06, "loss": 0.0414, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 214086307.0, "step": 259 }, { "entropy": 0.5169525146484375, "epoch": 2.9213483146067416, "grad_norm": 4.060622338334745, "learning_rate": 2.832074054273121e-06, "loss": 0.0384, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 214907128.0, "step": 260 }, { "entropy": 0.5202713012695312, "epoch": 2.932584269662921, "grad_norm": 7.348539660999507, "learning_rate": 2.8167139576806306e-06, "loss": 0.0608, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 215725903.0, "step": 261 }, { "entropy": 0.5224227905273438, "epoch": 2.943820224719101, "grad_norm": 3.453263597151809, "learning_rate": 2.8013417006383078e-06, "loss": 0.0218, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 216535618.0, "step": 262 }, { "entropy": 0.522552490234375, "epoch": 2.955056179775281, "grad_norm": 4.453941784446791, "learning_rate": 2.7859578733745153e-06, "loss": 0.0289, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 217345581.0, "step": 263 }, { "entropy": 0.51446533203125, "epoch": 2.966292134831461, "grad_norm": 4.032638412989834, "learning_rate": 2.7705630665618605e-06, "loss": 0.0315, "mean_token_accuracy": 0.989583333954215, "num_tokens": 218173244.0, "step": 264 }, { "entropy": 0.5102767944335938, "epoch": 2.9775280898876404, "grad_norm": 5.378680439331005, "learning_rate": 2.755157871294521e-06, "loss": 0.0196, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 219012419.0, "step": 265 }, { "entropy": 0.5094451904296875, "epoch": 2.98876404494382, "grad_norm": 5.4791488171443925, "learning_rate": 2.7397428790655447e-06, "loss": 0.0292, "mean_token_accuracy": 0.989583333954215, "num_tokens": 219854421.0, "step": 266 }, { "entropy": 0.5302352905273438, "epoch": 3.0, "grad_norm": 3.8596785746166216, "learning_rate": 2.7243186817441403e-06, "loss": 0.0315, "mean_token_accuracy": 0.989583333954215, "num_tokens": 220623700.0, "step": 267 }, { "entropy": 0.5127182006835938, "epoch": 3.0112359550561796, "grad_norm": 4.57819470414012, "learning_rate": 2.708885871552954e-06, "loss": 0.0356, "mean_token_accuracy": 0.989583333954215, "num_tokens": 221452033.0, "step": 268 }, { "entropy": 0.5211715698242188, "epoch": 3.0224719101123596, "grad_norm": 3.854103160052815, "learning_rate": 2.693445041045326e-06, "loss": 0.0398, "mean_token_accuracy": 0.989583333954215, "num_tokens": 222254082.0, "step": 269 }, { "entropy": 0.5071258544921875, "epoch": 3.033707865168539, "grad_norm": 3.1948641595887493, "learning_rate": 2.6779967830825454e-06, "loss": 0.0276, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 223083493.0, "step": 270 }, { "entropy": 0.508270263671875, "epoch": 3.044943820224719, "grad_norm": 5.1040099580837355, "learning_rate": 2.6625416908110825e-06, "loss": 0.0206, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 223934861.0, "step": 271 }, { "entropy": 0.5089187622070312, "epoch": 3.056179775280899, "grad_norm": 3.5217613203200178, "learning_rate": 2.647080357639813e-06, "loss": 0.03, "mean_token_accuracy": 0.989583333954215, "num_tokens": 224772447.0, "step": 272 }, { "entropy": 0.5082550048828125, "epoch": 3.067415730337079, "grad_norm": 3.717919821676401, "learning_rate": 2.6316133772172403e-06, "loss": 0.0229, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 225620881.0, "step": 273 }, { "entropy": 0.528961181640625, "epoch": 3.0786516853932584, "grad_norm": 4.126696150915157, "learning_rate": 2.616141343408696e-06, "loss": 0.0278, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 226398051.0, "step": 274 }, { "entropy": 0.5100021362304688, "epoch": 3.0898876404494384, "grad_norm": 4.936464408938361, "learning_rate": 2.6006648502735384e-06, "loss": 0.0351, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 227221027.0, "step": 275 }, { "entropy": 0.5383224487304688, "epoch": 3.101123595505618, "grad_norm": 4.911326562954185, "learning_rate": 2.5851844920423473e-06, "loss": 0.0391, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 227998038.0, "step": 276 }, { "entropy": 0.5119400024414062, "epoch": 3.1123595505617976, "grad_norm": 4.035362852199955, "learning_rate": 2.569700863094104e-06, "loss": 0.0225, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 228848791.0, "step": 277 }, { "entropy": 0.5259246826171875, "epoch": 3.1235955056179776, "grad_norm": 4.907222005633188, "learning_rate": 2.554214557933372e-06, "loss": 0.0333, "mean_token_accuracy": 0.989583333954215, "num_tokens": 229650218.0, "step": 278 }, { "entropy": 0.5128173828125, "epoch": 3.134831460674157, "grad_norm": 3.6298141208358325, "learning_rate": 2.5387261711674695e-06, "loss": 0.0213, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 230506406.0, "step": 279 }, { "entropy": 0.51739501953125, "epoch": 3.146067415730337, "grad_norm": 2.16036446857172, "learning_rate": 2.5232362974836394e-06, "loss": 0.0234, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 231317938.0, "step": 280 }, { "entropy": 0.5114364624023438, "epoch": 3.157303370786517, "grad_norm": 3.7210989346070695, "learning_rate": 2.507745531626215e-06, "loss": 0.0226, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 232180619.0, "step": 281 }, { "entropy": 0.5086212158203125, "epoch": 3.168539325842697, "grad_norm": 6.879747676119053, "learning_rate": 2.4922544683737857e-06, "loss": 0.0316, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 233015053.0, "step": 282 }, { "entropy": 0.5146713256835938, "epoch": 3.1797752808988764, "grad_norm": 5.482318170907438, "learning_rate": 2.4767637025163614e-06, "loss": 0.0345, "mean_token_accuracy": 0.989583333954215, "num_tokens": 233833558.0, "step": 283 }, { "entropy": 0.5211105346679688, "epoch": 3.191011235955056, "grad_norm": 3.2491463558151317, "learning_rate": 2.461273828832531e-06, "loss": 0.0214, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 234627558.0, "step": 284 }, { "entropy": 0.5092926025390625, "epoch": 3.202247191011236, "grad_norm": 5.031466393488865, "learning_rate": 2.445785442066628e-06, "loss": 0.0321, "mean_token_accuracy": 0.989583333954215, "num_tokens": 235468401.0, "step": 285 }, { "entropy": 0.5166778564453125, "epoch": 3.2134831460674156, "grad_norm": 10.076323689534382, "learning_rate": 2.4302991369058963e-06, "loss": 0.037, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 236277777.0, "step": 286 }, { "entropy": 0.5113983154296875, "epoch": 3.2247191011235956, "grad_norm": 3.247168354051238, "learning_rate": 2.414815507957653e-06, "loss": 0.022, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 237109698.0, "step": 287 }, { "entropy": 0.5170822143554688, "epoch": 3.235955056179775, "grad_norm": 4.8226389846288065, "learning_rate": 2.399335149726463e-06, "loss": 0.0283, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 237913805.0, "step": 288 }, { "entropy": 0.5075225830078125, "epoch": 3.247191011235955, "grad_norm": 4.782854834657138, "learning_rate": 2.3838586565913053e-06, "loss": 0.0215, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 238760016.0, "step": 289 }, { "entropy": 0.5094528198242188, "epoch": 3.258426966292135, "grad_norm": 3.208516002299857, "learning_rate": 2.3683866227827605e-06, "loss": 0.0166, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 239607946.0, "step": 290 }, { "entropy": 0.516571044921875, "epoch": 3.2696629213483144, "grad_norm": 4.503497884660067, "learning_rate": 2.352919642360188e-06, "loss": 0.0372, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 240438345.0, "step": 291 }, { "entropy": 0.523040771484375, "epoch": 3.2808988764044944, "grad_norm": 6.357036147397838, "learning_rate": 2.3374583091889188e-06, "loss": 0.027, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 241241845.0, "step": 292 }, { "entropy": 0.5255355834960938, "epoch": 3.292134831460674, "grad_norm": 6.159491750149713, "learning_rate": 2.322003216917455e-06, "loss": 0.0348, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 242048957.0, "step": 293 }, { "entropy": 0.5313034057617188, "epoch": 3.303370786516854, "grad_norm": 2.767906707454989, "learning_rate": 2.3065549589546747e-06, "loss": 0.0155, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 242812266.0, "step": 294 }, { "entropy": 0.5044479370117188, "epoch": 3.3146067415730336, "grad_norm": 3.249806139136359, "learning_rate": 2.2911141284470466e-06, "loss": 0.0234, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 243649058.0, "step": 295 }, { "entropy": 0.5091476440429688, "epoch": 3.3258426966292136, "grad_norm": 3.279106758362481, "learning_rate": 2.27568131825586e-06, "loss": 0.0169, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 244473742.0, "step": 296 }, { "entropy": 0.49976348876953125, "epoch": 3.337078651685393, "grad_norm": 4.399014190769296, "learning_rate": 2.260257120934456e-06, "loss": 0.0219, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 245334162.0, "step": 297 }, { "entropy": 0.5103302001953125, "epoch": 3.348314606741573, "grad_norm": 3.379998191266739, "learning_rate": 2.2448421287054794e-06, "loss": 0.0195, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 246164496.0, "step": 298 }, { "entropy": 0.5167922973632812, "epoch": 3.359550561797753, "grad_norm": 6.348607538192022, "learning_rate": 2.229436933438141e-06, "loss": 0.0342, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 246957857.0, "step": 299 }, { "entropy": 0.520721435546875, "epoch": 3.370786516853933, "grad_norm": 3.7378441121949244, "learning_rate": 2.214042126625486e-06, "loss": 0.0264, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 247790646.0, "step": 300 }, { "entropy": 0.5183486938476562, "epoch": 3.3820224719101124, "grad_norm": 4.811164098866914, "learning_rate": 2.1986582993616926e-06, "loss": 0.0317, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 248596791.0, "step": 301 }, { "entropy": 0.5113983154296875, "epoch": 3.393258426966292, "grad_norm": 3.592413843287222, "learning_rate": 2.1832860423193703e-06, "loss": 0.0224, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 249439466.0, "step": 302 }, { "entropy": 0.50799560546875, "epoch": 3.404494382022472, "grad_norm": 3.807110419678122, "learning_rate": 2.1679259457268796e-06, "loss": 0.0262, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 250296069.0, "step": 303 }, { "entropy": 0.5160369873046875, "epoch": 3.4157303370786516, "grad_norm": 3.5332255502622822, "learning_rate": 2.1525785993456753e-06, "loss": 0.0155, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 251137734.0, "step": 304 }, { "entropy": 0.5208358764648438, "epoch": 3.4269662921348316, "grad_norm": 4.105991998447796, "learning_rate": 2.1372445924476578e-06, "loss": 0.0168, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 251948138.0, "step": 305 }, { "entropy": 0.503387451171875, "epoch": 3.438202247191011, "grad_norm": 7.115594516406741, "learning_rate": 2.1219245137925482e-06, "loss": 0.0234, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 252783179.0, "step": 306 }, { "entropy": 0.5314254760742188, "epoch": 3.449438202247191, "grad_norm": 1.9885695184447987, "learning_rate": 2.1066189516052848e-06, "loss": 0.0194, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 253561788.0, "step": 307 }, { "entropy": 0.5133819580078125, "epoch": 3.460674157303371, "grad_norm": 3.5082179529267936, "learning_rate": 2.0913284935534345e-06, "loss": 0.0196, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 254385361.0, "step": 308 }, { "entropy": 0.5140228271484375, "epoch": 3.4719101123595504, "grad_norm": 4.022328315511428, "learning_rate": 2.0760537267246316e-06, "loss": 0.0248, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 255224425.0, "step": 309 }, { "entropy": 0.5027923583984375, "epoch": 3.4831460674157304, "grad_norm": 2.9944684642176997, "learning_rate": 2.0607952376040355e-06, "loss": 0.0181, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 256044678.0, "step": 310 }, { "entropy": 0.52716064453125, "epoch": 3.49438202247191, "grad_norm": 4.504770884220794, "learning_rate": 2.0455536120518094e-06, "loss": 0.0384, "mean_token_accuracy": 0.989583333954215, "num_tokens": 256832233.0, "step": 311 }, { "entropy": 0.5007553100585938, "epoch": 3.50561797752809, "grad_norm": 2.942114577376295, "learning_rate": 2.0303294352806313e-06, "loss": 0.0238, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 257686727.0, "step": 312 }, { "entropy": 0.49721527099609375, "epoch": 3.5168539325842696, "grad_norm": 2.116297926501953, "learning_rate": 2.0151232918332186e-06, "loss": 0.0104, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 258546281.0, "step": 313 }, { "entropy": 0.5155029296875, "epoch": 3.5280898876404496, "grad_norm": 3.7890686933117177, "learning_rate": 1.9999357655598894e-06, "loss": 0.0164, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 259354523.0, "step": 314 }, { "entropy": 0.49981689453125, "epoch": 3.539325842696629, "grad_norm": 2.0113129181647937, "learning_rate": 1.9847674395961407e-06, "loss": 0.0133, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 260197353.0, "step": 315 }, { "entropy": 0.5087356567382812, "epoch": 3.550561797752809, "grad_norm": 4.964666181582898, "learning_rate": 1.9696188963402613e-06, "loss": 0.0281, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 261008532.0, "step": 316 }, { "entropy": 0.5017013549804688, "epoch": 3.561797752808989, "grad_norm": 3.199383699793436, "learning_rate": 1.9544907174309693e-06, "loss": 0.0134, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 261848586.0, "step": 317 }, { "entropy": 0.5186920166015625, "epoch": 3.5730337078651684, "grad_norm": 4.0141254096173205, "learning_rate": 1.939383483725079e-06, "loss": 0.012, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 262642445.0, "step": 318 }, { "entropy": 0.5063323974609375, "epoch": 3.5842696629213484, "grad_norm": 3.668558882237143, "learning_rate": 1.9242977752752006e-06, "loss": 0.0143, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 263451697.0, "step": 319 }, { "entropy": 0.5094070434570312, "epoch": 3.595505617977528, "grad_norm": 4.487986811292986, "learning_rate": 1.909234171307466e-06, "loss": 0.0213, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 264268081.0, "step": 320 }, { "entropy": 0.49770355224609375, "epoch": 3.606741573033708, "grad_norm": 3.3550283768698588, "learning_rate": 1.8941932501992915e-06, "loss": 0.013, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 265096782.0, "step": 321 }, { "entropy": 0.5041961669921875, "epoch": 3.6179775280898876, "grad_norm": 5.995755473364592, "learning_rate": 1.879175589457168e-06, "loss": 0.0188, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 265912003.0, "step": 322 }, { "entropy": 0.48159027099609375, "epoch": 3.629213483146067, "grad_norm": 2.3130150611335965, "learning_rate": 1.8641817656944894e-06, "loss": 0.0067, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 266788683.0, "step": 323 }, { "entropy": 0.48583221435546875, "epoch": 3.640449438202247, "grad_norm": 4.083263934783113, "learning_rate": 1.8492123546094132e-06, "loss": 0.0126, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 267649106.0, "step": 324 }, { "entropy": 0.49964141845703125, "epoch": 3.6516853932584272, "grad_norm": 0.4203389488288955, "learning_rate": 1.8342679309627545e-06, "loss": 0.0027, "mean_token_accuracy": 1.0, "num_tokens": 268474651.0, "step": 325 }, { "entropy": 0.5025482177734375, "epoch": 3.662921348314607, "grad_norm": 5.90761497919095, "learning_rate": 1.8193490685559179e-06, "loss": 0.0369, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 269291928.0, "step": 326 }, { "entropy": 0.49892425537109375, "epoch": 3.6741573033707864, "grad_norm": 4.346760130845783, "learning_rate": 1.8044563402088686e-06, "loss": 0.028, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 270116488.0, "step": 327 }, { "entropy": 0.5001907348632812, "epoch": 3.6853932584269664, "grad_norm": 16.271240321339146, "learning_rate": 1.7895903177381351e-06, "loss": 0.0356, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 270917370.0, "step": 328 }, { "entropy": 0.46492767333984375, "epoch": 3.696629213483146, "grad_norm": 4.470491736532796, "learning_rate": 1.7747515719348551e-06, "loss": 0.011, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 271826297.0, "step": 329 }, { "entropy": 0.4820098876953125, "epoch": 3.7078651685393256, "grad_norm": 7.212866002958275, "learning_rate": 1.759940672542862e-06, "loss": 0.033, "mean_token_accuracy": 0.989583333954215, "num_tokens": 272694273.0, "step": 330 }, { "entropy": 0.482574462890625, "epoch": 3.7191011235955056, "grad_norm": 7.8089400803404985, "learning_rate": 1.7451581882368052e-06, "loss": 0.0181, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 273560920.0, "step": 331 }, { "entropy": 0.49365997314453125, "epoch": 3.7303370786516856, "grad_norm": 9.96198480242964, "learning_rate": 1.7304046866003183e-06, "loss": 0.025, "mean_token_accuracy": 0.989583333954215, "num_tokens": 274386758.0, "step": 332 }, { "entropy": 0.49715423583984375, "epoch": 3.741573033707865, "grad_norm": 6.256653146358632, "learning_rate": 1.7156807341042242e-06, "loss": 0.0172, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 275211665.0, "step": 333 }, { "entropy": 0.4954986572265625, "epoch": 3.752808988764045, "grad_norm": 5.4914007926781, "learning_rate": 1.700986896084787e-06, "loss": 0.0131, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 276029702.0, "step": 334 }, { "entropy": 0.50457763671875, "epoch": 3.764044943820225, "grad_norm": 5.3972041785495835, "learning_rate": 1.686323736722006e-06, "loss": 0.0209, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 276871811.0, "step": 335 }, { "entropy": 0.4974365234375, "epoch": 3.7752808988764044, "grad_norm": 1.1708667866046776, "learning_rate": 1.671691819017951e-06, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 277714646.0, "step": 336 }, { "entropy": 0.5124359130859375, "epoch": 3.7865168539325844, "grad_norm": 3.7431018922660204, "learning_rate": 1.6570917047751465e-06, "loss": 0.0083, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 278518639.0, "step": 337 }, { "entropy": 0.4889068603515625, "epoch": 3.797752808988764, "grad_norm": 6.144159379632208, "learning_rate": 1.642523954575003e-06, "loss": 0.0262, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 279367205.0, "step": 338 }, { "entropy": 0.49878692626953125, "epoch": 3.808988764044944, "grad_norm": 3.751432165154299, "learning_rate": 1.6279891277562896e-06, "loss": 0.0147, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 280179340.0, "step": 339 }, { "entropy": 0.48464202880859375, "epoch": 3.8202247191011236, "grad_norm": 3.0121999885704795, "learning_rate": 1.613487782393661e-06, "loss": 0.0074, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 281043372.0, "step": 340 }, { "entropy": 0.5075607299804688, "epoch": 3.831460674157303, "grad_norm": 3.506062048383523, "learning_rate": 1.5990204752762273e-06, "loss": 0.0082, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 281856207.0, "step": 341 }, { "entropy": 0.485870361328125, "epoch": 3.842696629213483, "grad_norm": 3.0322880070112657, "learning_rate": 1.5845877618861769e-06, "loss": 0.017, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 282703908.0, "step": 342 }, { "entropy": 0.489654541015625, "epoch": 3.853932584269663, "grad_norm": 4.926358920639807, "learning_rate": 1.5701901963774504e-06, "loss": 0.0192, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 283548980.0, "step": 343 }, { "entropy": 0.4991302490234375, "epoch": 3.865168539325843, "grad_norm": 4.466227120509381, "learning_rate": 1.555828331554457e-06, "loss": 0.0179, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 284386913.0, "step": 344 }, { "entropy": 0.49823760986328125, "epoch": 3.8764044943820224, "grad_norm": 3.705201119031269, "learning_rate": 1.5415027188508574e-06, "loss": 0.0182, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 285220080.0, "step": 345 }, { "entropy": 0.49188995361328125, "epoch": 3.8876404494382024, "grad_norm": 5.0998857739487065, "learning_rate": 1.5272139083083865e-06, "loss": 0.0158, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 286072154.0, "step": 346 }, { "entropy": 0.48580169677734375, "epoch": 3.898876404494382, "grad_norm": 3.4875601342154177, "learning_rate": 1.5129624485557331e-06, "loss": 0.0073, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 286942986.0, "step": 347 }, { "entropy": 0.48960113525390625, "epoch": 3.9101123595505616, "grad_norm": 2.2828403000677064, "learning_rate": 1.4987488867874798e-06, "loss": 0.0084, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 287797004.0, "step": 348 }, { "entropy": 0.50335693359375, "epoch": 3.9213483146067416, "grad_norm": 3.1386584985605372, "learning_rate": 1.4845737687430875e-06, "loss": 0.0253, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 288618402.0, "step": 349 }, { "entropy": 0.50701904296875, "epoch": 3.932584269662921, "grad_norm": 4.172480270751928, "learning_rate": 1.4704376386859447e-06, "loss": 0.0137, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 289407298.0, "step": 350 }, { "entropy": 0.5020904541015625, "epoch": 3.943820224719101, "grad_norm": 2.5202804888657275, "learning_rate": 1.4563410393824701e-06, "loss": 0.012, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 290238447.0, "step": 351 }, { "entropy": 0.5082855224609375, "epoch": 3.955056179775281, "grad_norm": 4.712749356736079, "learning_rate": 1.4422845120812718e-06, "loss": 0.018, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 291036026.0, "step": 352 }, { "entropy": 0.49322509765625, "epoch": 3.966292134831461, "grad_norm": 1.9341080117017146, "learning_rate": 1.4282685964923643e-06, "loss": 0.0106, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 291893428.0, "step": 353 }, { "entropy": 0.501678466796875, "epoch": 3.9775280898876404, "grad_norm": 1.8281731406713833, "learning_rate": 1.4142938307664505e-06, "loss": 0.008, "mean_token_accuracy": 1.0, "num_tokens": 292697320.0, "step": 354 }, { "entropy": 0.5015640258789062, "epoch": 3.98876404494382, "grad_norm": 2.0972743594587158, "learning_rate": 1.400360751474253e-06, "loss": 0.0061, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 293502907.0, "step": 355 }, { "entropy": 0.5002670288085938, "epoch": 4.0, "grad_norm": 2.4603604078082357, "learning_rate": 1.3864698935859153e-06, "loss": 0.0092, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 294326570.0, "step": 356 }, { "entropy": 0.489013671875, "epoch": 4.01123595505618, "grad_norm": 1.0981283873031276, "learning_rate": 1.3726217904504636e-06, "loss": 0.0107, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 295155654.0, "step": 357 }, { "entropy": 0.4899444580078125, "epoch": 4.022471910112359, "grad_norm": 2.4658745249006575, "learning_rate": 1.3588169737753258e-06, "loss": 0.0045, "mean_token_accuracy": 1.0, "num_tokens": 296004651.0, "step": 358 }, { "entropy": 0.496856689453125, "epoch": 4.033707865168539, "grad_norm": 0.6986873706498641, "learning_rate": 1.3450559736059126e-06, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 296825273.0, "step": 359 }, { "entropy": 0.4771728515625, "epoch": 4.044943820224719, "grad_norm": 4.022638809306908, "learning_rate": 1.3313393183052747e-06, "loss": 0.0079, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 297661901.0, "step": 360 }, { "entropy": 0.48487091064453125, "epoch": 4.056179775280899, "grad_norm": 1.1734333901132759, "learning_rate": 1.3176675345338085e-06, "loss": 0.003, "mean_token_accuracy": 1.0, "num_tokens": 298495109.0, "step": 361 }, { "entropy": 0.47898101806640625, "epoch": 4.067415730337078, "grad_norm": 0.7234754463345818, "learning_rate": 1.304041147229037e-06, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 299343801.0, "step": 362 }, { "entropy": 0.498199462890625, "epoch": 4.078651685393258, "grad_norm": 1.5200640964253078, "learning_rate": 1.2904606795854562e-06, "loss": 0.0196, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 300156927.0, "step": 363 }, { "entropy": 0.48677825927734375, "epoch": 4.089887640449438, "grad_norm": 1.3478491381169233, "learning_rate": 1.276926653034444e-06, "loss": 0.0029, "mean_token_accuracy": 1.0, "num_tokens": 300997011.0, "step": 364 }, { "entropy": 0.48504638671875, "epoch": 4.101123595505618, "grad_norm": 4.927855969737134, "learning_rate": 1.2634395872242433e-06, "loss": 0.0114, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 301820560.0, "step": 365 }, { "entropy": 0.4751739501953125, "epoch": 4.112359550561798, "grad_norm": 4.779020236399108, "learning_rate": 1.2500000000000007e-06, "loss": 0.0063, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 302678409.0, "step": 366 }, { "entropy": 0.4949951171875, "epoch": 4.123595505617978, "grad_norm": 0.7404055825881121, "learning_rate": 1.2366084073838963e-06, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 303479311.0, "step": 367 }, { "entropy": 0.485565185546875, "epoch": 4.134831460674158, "grad_norm": 2.8852964190089296, "learning_rate": 1.223265323555323e-06, "loss": 0.0123, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 304286629.0, "step": 368 }, { "entropy": 0.4863433837890625, "epoch": 4.146067415730337, "grad_norm": 1.910333236425596, "learning_rate": 1.2099712608311426e-06, "loss": 0.0112, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 305106480.0, "step": 369 }, { "entropy": 0.4847564697265625, "epoch": 4.157303370786517, "grad_norm": 10.011313102335771, "learning_rate": 1.1967267296460208e-06, "loss": 0.0088, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 305927120.0, "step": 370 }, { "entropy": 0.47748565673828125, "epoch": 4.168539325842697, "grad_norm": 0.36743206379299365, "learning_rate": 1.183532238532826e-06, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 306771916.0, "step": 371 }, { "entropy": 0.49149322509765625, "epoch": 4.179775280898877, "grad_norm": 0.4732089401749855, "learning_rate": 1.1703882941031012e-06, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 307565025.0, "step": 372 }, { "entropy": 0.48378753662109375, "epoch": 4.191011235955056, "grad_norm": 0.7993030080623406, "learning_rate": 1.157295401027616e-06, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 308380624.0, "step": 373 }, { "entropy": 0.4936065673828125, "epoch": 4.202247191011236, "grad_norm": 4.721093006577871, "learning_rate": 1.1442540620169906e-06, "loss": 0.0149, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 309224060.0, "step": 374 }, { "entropy": 0.47303009033203125, "epoch": 4.213483146067416, "grad_norm": 0.6684526220228418, "learning_rate": 1.131264777802387e-06, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 310063659.0, "step": 375 }, { "entropy": 0.49187469482421875, "epoch": 4.224719101123595, "grad_norm": 4.552394207734528, "learning_rate": 1.1183280471162916e-06, "loss": 0.0075, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 310884602.0, "step": 376 }, { "entropy": 0.48296356201171875, "epoch": 4.235955056179775, "grad_norm": 3.2031832299513603, "learning_rate": 1.1054443666733586e-06, "loss": 0.0157, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 311708014.0, "step": 377 }, { "entropy": 0.48511505126953125, "epoch": 4.247191011235955, "grad_norm": 0.5423288881673503, "learning_rate": 1.0926142311513453e-06, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 312508500.0, "step": 378 }, { "entropy": 0.4829254150390625, "epoch": 4.258426966292135, "grad_norm": 0.6328572390463946, "learning_rate": 1.079838133172111e-06, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 313334377.0, "step": 379 }, { "entropy": 0.4954986572265625, "epoch": 4.269662921348314, "grad_norm": 2.965808891706028, "learning_rate": 1.0671165632827097e-06, "loss": 0.0044, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 314135369.0, "step": 380 }, { "entropy": 0.48342132568359375, "epoch": 4.280898876404494, "grad_norm": 4.9369087391667925, "learning_rate": 1.0544500099365515e-06, "loss": 0.0046, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 314947060.0, "step": 381 }, { "entropy": 0.474609375, "epoch": 4.292134831460674, "grad_norm": 0.8352741047517535, "learning_rate": 1.0418389594746462e-06, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 315789920.0, "step": 382 }, { "entropy": 0.4948883056640625, "epoch": 4.303370786516854, "grad_norm": 1.4879623292920148, "learning_rate": 1.0292838961069348e-06, "loss": 0.0077, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 316594232.0, "step": 383 }, { "entropy": 0.4829254150390625, "epoch": 4.314606741573034, "grad_norm": 7.760157332381639, "learning_rate": 1.0167853018936955e-06, "loss": 0.0096, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 317419960.0, "step": 384 }, { "entropy": 0.48496246337890625, "epoch": 4.325842696629214, "grad_norm": 0.6546893569941181, "learning_rate": 1.0043436567270313e-06, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 318246051.0, "step": 385 }, { "entropy": 0.48297882080078125, "epoch": 4.337078651685394, "grad_norm": 1.1821091917310644, "learning_rate": 9.919594383124512e-07, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 319071794.0, "step": 386 }, { "entropy": 0.4850616455078125, "epoch": 4.348314606741573, "grad_norm": 1.1860110971220579, "learning_rate": 9.796331221505235e-07, "loss": 0.0097, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 319880367.0, "step": 387 }, { "entropy": 0.47988128662109375, "epoch": 4.359550561797753, "grad_norm": 3.6479067412195456, "learning_rate": 9.673651815186186e-07, "loss": 0.0041, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 320710009.0, "step": 388 }, { "entropy": 0.4726104736328125, "epoch": 4.370786516853933, "grad_norm": 2.7332778453429873, "learning_rate": 9.551560874527385e-07, "loss": 0.0091, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 321533419.0, "step": 389 }, { "entropy": 0.48587799072265625, "epoch": 4.382022471910112, "grad_norm": 1.322646368017749, "learning_rate": 9.43006308729432e-07, "loss": 0.0071, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 322334485.0, "step": 390 }, { "entropy": 0.4619598388671875, "epoch": 4.393258426966292, "grad_norm": 0.3194115918370614, "learning_rate": 9.309163118477954e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 323195057.0, "step": 391 }, { "entropy": 0.48651885986328125, "epoch": 4.404494382022472, "grad_norm": 2.0428959059308465, "learning_rate": 9.188865610115572e-07, "loss": 0.0028, "mean_token_accuracy": 1.0, "num_tokens": 324010094.0, "step": 392 }, { "entropy": 0.4793243408203125, "epoch": 4.415730337078652, "grad_norm": 1.823624278000048, "learning_rate": 9.069175181112597e-07, "loss": 0.0105, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 324846854.0, "step": 393 }, { "entropy": 0.480072021484375, "epoch": 4.426966292134831, "grad_norm": 1.0106602397778806, "learning_rate": 8.950096427065232e-07, "loss": 0.009, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 325686662.0, "step": 394 }, { "entropy": 0.480133056640625, "epoch": 4.438202247191011, "grad_norm": 7.9492170461609595, "learning_rate": 8.831633920083968e-07, "loss": 0.0074, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 326513919.0, "step": 395 }, { "entropy": 0.49493408203125, "epoch": 4.449438202247191, "grad_norm": 7.174628436600066, "learning_rate": 8.713792208618097e-07, "loss": 0.015, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 327299315.0, "step": 396 }, { "entropy": 0.49566650390625, "epoch": 4.460674157303371, "grad_norm": 6.030765502141134, "learning_rate": 8.596575817281036e-07, "loss": 0.0165, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 328084351.0, "step": 397 }, { "entropy": 0.47319793701171875, "epoch": 4.47191011235955, "grad_norm": 3.9848054852496055, "learning_rate": 8.479989246676595e-07, "loss": 0.0045, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 328918727.0, "step": 398 }, { "entropy": 0.46825408935546875, "epoch": 4.48314606741573, "grad_norm": 3.3412799376622124, "learning_rate": 8.36403697322618e-07, "loss": 0.0042, "mean_token_accuracy": 1.0, "num_tokens": 329773820.0, "step": 399 }, { "entropy": 0.47360992431640625, "epoch": 4.49438202247191, "grad_norm": 1.5264228995013582, "learning_rate": 8.248723448996942e-07, "loss": 0.0029, "mean_token_accuracy": 1.0, "num_tokens": 330626895.0, "step": 400 }, { "entropy": 0.48606109619140625, "epoch": 4.50561797752809, "grad_norm": 4.047559713867161, "learning_rate": 8.134053101530814e-07, "loss": 0.0069, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 331422952.0, "step": 401 }, { "entropy": 0.4675140380859375, "epoch": 4.51685393258427, "grad_norm": 1.7070514726450159, "learning_rate": 8.020030333674498e-07, "loss": 0.008, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 332286023.0, "step": 402 }, { "entropy": 0.5069656372070312, "epoch": 4.52808988764045, "grad_norm": 0.3246355685066132, "learning_rate": 7.906659523410445e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 333044926.0, "step": 403 }, { "entropy": 0.47544097900390625, "epoch": 4.539325842696629, "grad_norm": 1.326921144241493, "learning_rate": 7.793945023688756e-07, "loss": 0.0059, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 333869264.0, "step": 404 }, { "entropy": 0.4818572998046875, "epoch": 4.550561797752809, "grad_norm": 0.2982905893437566, "learning_rate": 7.681891162260016e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 334689272.0, "step": 405 }, { "entropy": 0.48146820068359375, "epoch": 4.561797752808989, "grad_norm": 4.863117295648831, "learning_rate": 7.570502241509162e-07, "loss": 0.0041, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 335515946.0, "step": 406 }, { "entropy": 0.48076629638671875, "epoch": 4.573033707865169, "grad_norm": 2.8900389523558796, "learning_rate": 7.459782538290289e-07, "loss": 0.0162, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 336334489.0, "step": 407 }, { "entropy": 0.4797515869140625, "epoch": 4.584269662921348, "grad_norm": 1.8411529472514119, "learning_rate": 7.349736303762392e-07, "loss": 0.0041, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 337166188.0, "step": 408 }, { "entropy": 0.473114013671875, "epoch": 4.595505617977528, "grad_norm": 1.5143632218630005, "learning_rate": 7.240367763226214e-07, "loss": 0.0033, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 338004079.0, "step": 409 }, { "entropy": 0.4938507080078125, "epoch": 4.606741573033708, "grad_norm": 0.45911529906334586, "learning_rate": 7.13168111596193e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 338802322.0, "step": 410 }, { "entropy": 0.47324371337890625, "epoch": 4.617977528089888, "grad_norm": 0.35480851339412894, "learning_rate": 7.023680535067998e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 339659802.0, "step": 411 }, { "entropy": 0.4801788330078125, "epoch": 4.629213483146067, "grad_norm": 0.6426393512461525, "learning_rate": 6.916370167300846e-07, "loss": 0.0025, "mean_token_accuracy": 1.0, "num_tokens": 340490334.0, "step": 412 }, { "entropy": 0.4700927734375, "epoch": 4.640449438202247, "grad_norm": 0.31174721363519464, "learning_rate": 6.809754132915722e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 341341544.0, "step": 413 }, { "entropy": 0.47267913818359375, "epoch": 4.651685393258427, "grad_norm": 8.399016287808813, "learning_rate": 6.70383652550847e-07, "loss": 0.0092, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 342172518.0, "step": 414 }, { "entropy": 0.4641876220703125, "epoch": 4.662921348314606, "grad_norm": 1.1280041270939898, "learning_rate": 6.59862141185832e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 343013626.0, "step": 415 }, { "entropy": 0.4617767333984375, "epoch": 4.674157303370786, "grad_norm": 0.25249368595421007, "learning_rate": 6.494112831771801e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 343864156.0, "step": 416 }, { "entropy": 0.454071044921875, "epoch": 4.685393258426966, "grad_norm": 0.3627879993512605, "learning_rate": 6.390314797927601e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 344721728.0, "step": 417 }, { "entropy": 0.4547119140625, "epoch": 4.696629213483146, "grad_norm": 4.815638286312687, "learning_rate": 6.28723129572247e-07, "loss": 0.0111, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 345612861.0, "step": 418 }, { "entropy": 0.47646331787109375, "epoch": 4.707865168539326, "grad_norm": 0.23854029589661835, "learning_rate": 6.184866283118254e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 346422266.0, "step": 419 }, { "entropy": 0.4585418701171875, "epoch": 4.719101123595506, "grad_norm": 4.111400656540378, "learning_rate": 6.083223690489901e-07, "loss": 0.0058, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 347254366.0, "step": 420 }, { "entropy": 0.4715423583984375, "epoch": 4.730337078651686, "grad_norm": 1.8760479897518607, "learning_rate": 5.982307420474501e-07, "loss": 0.0067, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 348075516.0, "step": 421 }, { "entropy": 0.48101043701171875, "epoch": 4.741573033707866, "grad_norm": 5.6709204135360025, "learning_rate": 5.882121347821537e-07, "loss": 0.0112, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 348900037.0, "step": 422 }, { "entropy": 0.4796142578125, "epoch": 4.752808988764045, "grad_norm": 2.057248698971926, "learning_rate": 5.782669319244058e-07, "loss": 0.0092, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 349701930.0, "step": 423 }, { "entropy": 0.48802947998046875, "epoch": 4.764044943820225, "grad_norm": 9.041450054881212, "learning_rate": 5.683955153270959e-07, "loss": 0.0095, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 350494406.0, "step": 424 }, { "entropy": 0.47820281982421875, "epoch": 4.775280898876405, "grad_norm": 5.449915375980587, "learning_rate": 5.585982640100416e-07, "loss": 0.0033, "mean_token_accuracy": 1.0, "num_tokens": 351301540.0, "step": 425 }, { "entropy": 0.46595001220703125, "epoch": 4.786516853932584, "grad_norm": 5.589766890796259, "learning_rate": 5.488755541454335e-07, "loss": 0.0074, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 352130587.0, "step": 426 }, { "entropy": 0.4650115966796875, "epoch": 4.797752808988764, "grad_norm": 4.375937174255108, "learning_rate": 5.39227759043392e-07, "loss": 0.0214, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 352966590.0, "step": 427 }, { "entropy": 0.46227264404296875, "epoch": 4.808988764044944, "grad_norm": 1.1976658888052818, "learning_rate": 5.296552491376322e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 353805333.0, "step": 428 }, { "entropy": 0.47081756591796875, "epoch": 4.820224719101123, "grad_norm": 5.3943544894469895, "learning_rate": 5.201583919712441e-07, "loss": 0.005, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 354653578.0, "step": 429 }, { "entropy": 0.48395538330078125, "epoch": 4.831460674157303, "grad_norm": 5.03495222189158, "learning_rate": 5.107375521825791e-07, "loss": 0.0134, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 355441821.0, "step": 430 }, { "entropy": 0.46059417724609375, "epoch": 4.842696629213483, "grad_norm": 0.2979677100012393, "learning_rate": 5.013930914912477e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 356278826.0, "step": 431 }, { "entropy": 0.4580535888671875, "epoch": 4.853932584269663, "grad_norm": 5.831142656822307, "learning_rate": 4.921253686842323e-07, "loss": 0.0138, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 357122624.0, "step": 432 }, { "entropy": 0.4652252197265625, "epoch": 4.865168539325842, "grad_norm": 0.2269138434389349, "learning_rate": 4.829347396021142e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 357978267.0, "step": 433 }, { "entropy": 0.46025848388671875, "epoch": 4.876404494382022, "grad_norm": 2.166646775289874, "learning_rate": 4.7382155712540484e-07, "loss": 0.0172, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 358840138.0, "step": 434 }, { "entropy": 0.4871978759765625, "epoch": 4.887640449438202, "grad_norm": 1.2710311121179774, "learning_rate": 4.6478617116100244e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 359618488.0, "step": 435 }, { "entropy": 0.4758453369140625, "epoch": 4.898876404494382, "grad_norm": 1.1292250535564667, "learning_rate": 4.5582892862875457e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 360443404.0, "step": 436 }, { "entropy": 0.4720306396484375, "epoch": 4.910112359550562, "grad_norm": 0.2333684518257228, "learning_rate": 4.469501734481363e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 361279704.0, "step": 437 }, { "entropy": 0.4758758544921875, "epoch": 4.921348314606742, "grad_norm": 3.0840865860194806, "learning_rate": 4.3815024652504897e-07, "loss": 0.005, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 362101433.0, "step": 438 }, { "entropy": 0.47599029541015625, "epoch": 4.932584269662922, "grad_norm": 0.8938608510966778, "learning_rate": 4.294294857387285e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 362916195.0, "step": 439 }, { "entropy": 0.4738922119140625, "epoch": 4.943820224719101, "grad_norm": 0.4136594464102139, "learning_rate": 4.2078822592877074e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 363745948.0, "step": 440 }, { "entropy": 0.47412109375, "epoch": 4.955056179775281, "grad_norm": 1.709198405151175, "learning_rate": 4.122267988822792e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 364576688.0, "step": 441 }, { "entropy": 0.475006103515625, "epoch": 4.966292134831461, "grad_norm": 0.30835544679447674, "learning_rate": 4.0374553332112374e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 365409056.0, "step": 442 }, { "entropy": 0.469818115234375, "epoch": 4.97752808988764, "grad_norm": 0.3707181543844393, "learning_rate": 3.953447548893169e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 366217810.0, "step": 443 }, { "entropy": 0.46750640869140625, "epoch": 4.98876404494382, "grad_norm": 3.208220247586545, "learning_rate": 3.8702478614051353e-07, "loss": 0.0048, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 367063953.0, "step": 444 }, { "entropy": 0.45229339599609375, "epoch": 5.0, "grad_norm": 2.253105869866448, "learning_rate": 3.787859465256258e-07, "loss": 0.004, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 367930716.0, "step": 445 }, { "entropy": 0.48581695556640625, "epoch": 5.01123595505618, "grad_norm": 1.5438222190450817, "learning_rate": 3.706285523805578e-07, "loss": 0.0168, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 368718665.0, "step": 446 }, { "entropy": 0.47020721435546875, "epoch": 5.022471910112359, "grad_norm": 0.31678928999987055, "learning_rate": 3.625529169140565e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 369554247.0, "step": 447 }, { "entropy": 0.4745330810546875, "epoch": 5.033707865168539, "grad_norm": 0.3533643855991936, "learning_rate": 3.545593501956901e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 370388951.0, "step": 448 }, { "entropy": 0.47808074951171875, "epoch": 5.044943820224719, "grad_norm": 0.30752357641221745, "learning_rate": 3.4664815914394106e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 371198482.0, "step": 449 }, { "entropy": 0.4639739990234375, "epoch": 5.056179775280899, "grad_norm": 0.367754265454166, "learning_rate": 3.3881964751441984e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 372042829.0, "step": 450 }, { "entropy": 0.45928955078125, "epoch": 5.067415730337078, "grad_norm": 0.300585908373268, "learning_rate": 3.3107411588820527e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 372900142.0, "step": 451 }, { "entropy": 0.47325897216796875, "epoch": 5.078651685393258, "grad_norm": 2.7462415680052654, "learning_rate": 3.2341186166030214e-07, "loss": 0.0027, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 373730470.0, "step": 452 }, { "entropy": 0.4946746826171875, "epoch": 5.089887640449438, "grad_norm": 0.24662934610126366, "learning_rate": 3.1583317902822127e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 374517244.0, "step": 453 }, { "entropy": 0.46686553955078125, "epoch": 5.101123595505618, "grad_norm": 0.33971112074173476, "learning_rate": 3.083383589806846e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 375355793.0, "step": 454 }, { "entropy": 0.4696807861328125, "epoch": 5.112359550561798, "grad_norm": 0.2483861138363355, "learning_rate": 3.0092768928645375e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 376189678.0, "step": 455 }, { "entropy": 0.4691314697265625, "epoch": 5.123595505617978, "grad_norm": 0.229972069260914, "learning_rate": 2.936014544832794e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 377025599.0, "step": 456 }, { "entropy": 0.47251129150390625, "epoch": 5.134831460674158, "grad_norm": 0.23132255602760288, "learning_rate": 2.8635993586697555e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 377866992.0, "step": 457 }, { "entropy": 0.46392822265625, "epoch": 5.146067415730337, "grad_norm": 0.22332672147895574, "learning_rate": 2.792034114806211e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 378718256.0, "step": 458 }, { "entropy": 0.4629669189453125, "epoch": 5.157303370786517, "grad_norm": 0.21910029988776278, "learning_rate": 2.7213215610388364e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 379571339.0, "step": 459 }, { "entropy": 0.49639129638671875, "epoch": 5.168539325842697, "grad_norm": 0.21660451173158826, "learning_rate": 2.6514644124246675e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 380350234.0, "step": 460 }, { "entropy": 0.47663116455078125, "epoch": 5.179775280898877, "grad_norm": 0.8999031171309789, "learning_rate": 2.582465351176891e-07, "loss": 0.0102, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 381160580.0, "step": 461 }, { "entropy": 0.4720001220703125, "epoch": 5.191011235955056, "grad_norm": 0.22313632929478272, "learning_rate": 2.514327026561833e-07, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 381992655.0, "step": 462 }, { "entropy": 0.46752166748046875, "epoch": 5.202247191011236, "grad_norm": 3.041812924459871, "learning_rate": 2.447052054797233e-07, "loss": 0.0105, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 382821261.0, "step": 463 }, { "entropy": 0.47173309326171875, "epoch": 5.213483146067416, "grad_norm": 0.24449038922545974, "learning_rate": 2.3806430189518337e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 383646555.0, "step": 464 }, { "entropy": 0.4704437255859375, "epoch": 5.224719101123595, "grad_norm": 0.7976410857300053, "learning_rate": 2.3151024688461422e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 384458624.0, "step": 465 }, { "entropy": 0.48795318603515625, "epoch": 5.235955056179775, "grad_norm": 0.23199326788537672, "learning_rate": 2.2504329209545846e-07, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 385254806.0, "step": 466 }, { "entropy": 0.4750213623046875, "epoch": 5.247191011235955, "grad_norm": 0.8121541277788699, "learning_rate": 2.186636858308841e-07, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 386062765.0, "step": 467 }, { "entropy": 0.45218658447265625, "epoch": 5.258426966292135, "grad_norm": 3.105997642674649, "learning_rate": 2.1237167304025336e-07, "loss": 0.0029, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 386913540.0, "step": 468 }, { "entropy": 0.47808074951171875, "epoch": 5.269662921348314, "grad_norm": 1.2728652990081897, "learning_rate": 2.0616749530971785e-07, "loss": 0.0073, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 387728374.0, "step": 469 }, { "entropy": 0.46541595458984375, "epoch": 5.280898876404494, "grad_norm": 0.21479561022075241, "learning_rate": 2.0005139085293945e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 388542917.0, "step": 470 }, { "entropy": 0.4601593017578125, "epoch": 5.292134831460674, "grad_norm": 0.5642741656898853, "learning_rate": 1.9402359450194836e-07, "loss": 0.01, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 389396618.0, "step": 471 }, { "entropy": 0.47005462646484375, "epoch": 5.303370786516854, "grad_norm": 2.048653701812348, "learning_rate": 1.8808433769812367e-07, "loss": 0.0124, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 390232039.0, "step": 472 }, { "entropy": 0.46900177001953125, "epoch": 5.314606741573034, "grad_norm": 1.9806921777678528, "learning_rate": 1.8223384848330723e-07, "loss": 0.0072, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 391058664.0, "step": 473 }, { "entropy": 0.45298004150390625, "epoch": 5.325842696629214, "grad_norm": 2.1026358322419987, "learning_rate": 1.7647235149104908e-07, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 391906231.0, "step": 474 }, { "entropy": 0.4974212646484375, "epoch": 5.337078651685394, "grad_norm": 2.964448609260168, "learning_rate": 1.7080006793798176e-07, "loss": 0.005, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 392689363.0, "step": 475 }, { "entropy": 0.4739837646484375, "epoch": 5.348314606741573, "grad_norm": 4.838257625443637, "learning_rate": 1.6521721561532645e-07, "loss": 0.0099, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 393498433.0, "step": 476 }, { "entropy": 0.46537017822265625, "epoch": 5.359550561797753, "grad_norm": 0.2345104947673042, "learning_rate": 1.597240088805302e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 394318449.0, "step": 477 }, { "entropy": 0.46009063720703125, "epoch": 5.370786516853933, "grad_norm": 0.2373016671284719, "learning_rate": 1.54320658649037e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 395161746.0, "step": 478 }, { "entropy": 0.47601318359375, "epoch": 5.382022471910112, "grad_norm": 3.077415790643201, "learning_rate": 1.4900737238618874e-07, "loss": 0.0115, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 395961351.0, "step": 479 }, { "entropy": 0.45795440673828125, "epoch": 5.393258426966292, "grad_norm": 0.36061924748354707, "learning_rate": 1.4378435409925868e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 396827054.0, "step": 480 }, { "entropy": 0.4578094482421875, "epoch": 5.404494382022472, "grad_norm": 0.8381009700413259, "learning_rate": 1.3865180432961977e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 397711856.0, "step": 481 }, { "entropy": 0.46482086181640625, "epoch": 5.415730337078652, "grad_norm": 0.6388255497569517, "learning_rate": 1.3360992014504414e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 398547667.0, "step": 482 }, { "entropy": 0.46080780029296875, "epoch": 5.426966292134831, "grad_norm": 0.5384287405673601, "learning_rate": 1.286588951321363e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 399397538.0, "step": 483 }, { "entropy": 0.48981475830078125, "epoch": 5.438202247191011, "grad_norm": 0.36193846374908706, "learning_rate": 1.237989193889e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 400193633.0, "step": 484 }, { "entropy": 0.4829254150390625, "epoch": 5.449438202247191, "grad_norm": 0.29962032623383783, "learning_rate": 1.1903017951744144e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 401008141.0, "step": 485 }, { "entropy": 0.47170257568359375, "epoch": 5.460674157303371, "grad_norm": 0.2596090181292732, "learning_rate": 1.1435285861680106e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 401831012.0, "step": 486 }, { "entropy": 0.45348358154296875, "epoch": 5.47191011235955, "grad_norm": 0.3331736548838834, "learning_rate": 1.0976713627592561e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 402682567.0, "step": 487 }, { "entropy": 0.47737884521484375, "epoch": 5.48314606741573, "grad_norm": 3.574551781953933, "learning_rate": 1.0527318856677293e-07, "loss": 0.0033, "mean_token_accuracy": 1.0, "num_tokens": 403467221.0, "step": 488 }, { "entropy": 0.49402618408203125, "epoch": 5.49438202247191, "grad_norm": 0.2337886807199716, "learning_rate": 1.0087118803755069e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 404233243.0, "step": 489 }, { "entropy": 0.45621490478515625, "epoch": 5.50561797752809, "grad_norm": 1.1592782206231753, "learning_rate": 9.656130370609057e-08, "loss": 0.0108, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 405084914.0, "step": 490 }, { "entropy": 0.457061767578125, "epoch": 5.51685393258427, "grad_norm": 0.2545051809144223, "learning_rate": 9.234370105336039e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 405909288.0, "step": 491 }, { "entropy": 0.45556640625, "epoch": 5.52808988764045, "grad_norm": 1.0394244180655139, "learning_rate": 8.821854201711027e-08, "loss": 0.0114, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 406788128.0, "step": 492 }, { "entropy": 0.48175048828125, "epoch": 5.539325842696629, "grad_norm": 0.33453697449549497, "learning_rate": 8.418598498565217e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 407580454.0, "step": 493 }, { "entropy": 0.48504638671875, "epoch": 5.550561797752809, "grad_norm": 2.1089507267897116, "learning_rate": 8.024618479178237e-08, "loss": 0.0026, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 408383110.0, "step": 494 }, { "entropy": 0.46286773681640625, "epoch": 5.561797752808989, "grad_norm": 0.2540000053758498, "learning_rate": 7.639929270683438e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 409229114.0, "step": 495 }, { "entropy": 0.47723388671875, "epoch": 5.573033707865169, "grad_norm": 1.593634160516405, "learning_rate": 7.264545643486997e-08, "loss": 0.0101, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 410042844.0, "step": 496 }, { "entropy": 0.48851776123046875, "epoch": 5.584269662921348, "grad_norm": 5.858467783591157, "learning_rate": 6.898482010701036e-08, "loss": 0.008, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 410829963.0, "step": 497 }, { "entropy": 0.4671630859375, "epoch": 5.595505617977528, "grad_norm": 0.3065081480250999, "learning_rate": 6.541752427590004e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 411656140.0, "step": 498 }, { "entropy": 0.457275390625, "epoch": 5.606741573033708, "grad_norm": 0.2640124180695043, "learning_rate": 6.194370591031174e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 412521051.0, "step": 499 }, { "entropy": 0.462554931640625, "epoch": 5.617977528089888, "grad_norm": 0.23712236276528317, "learning_rate": 5.856349838988612e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 413372843.0, "step": 500 }, { "entropy": 0.462982177734375, "epoch": 5.629213483146067, "grad_norm": 1.2591830453291861, "learning_rate": 5.5277031500011734e-08, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 414221035.0, "step": 501 }, { "entropy": 0.46945953369140625, "epoch": 5.640449438202247, "grad_norm": 0.35757120099699974, "learning_rate": 5.208443142684094e-08, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 415057282.0, "step": 502 }, { "entropy": 0.4797821044921875, "epoch": 5.651685393258427, "grad_norm": 1.7058610019138325, "learning_rate": 4.8985820752445177e-08, "loss": 0.0063, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 415863475.0, "step": 503 }, { "entropy": 0.4644775390625, "epoch": 5.662921348314606, "grad_norm": 3.697075376126118, "learning_rate": 4.5981318450109e-08, "loss": 0.0137, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 416695157.0, "step": 504 }, { "entropy": 0.46736907958984375, "epoch": 5.674157303370786, "grad_norm": 0.2899077751323242, "learning_rate": 4.307103987976041e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 417539681.0, "step": 505 }, { "entropy": 0.47779083251953125, "epoch": 5.685393258426966, "grad_norm": 0.23808611258993526, "learning_rate": 4.0255096783543e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 418336130.0, "step": 506 }, { "entropy": 0.45633697509765625, "epoch": 5.696629213483146, "grad_norm": 1.8720058072970238, "learning_rate": 3.75335972815255e-08, "loss": 0.0025, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 419186786.0, "step": 507 }, { "entropy": 0.47606658935546875, "epoch": 5.707865168539326, "grad_norm": 0.24311851614182342, "learning_rate": 3.4906645867549547e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 420027066.0, "step": 508 }, { "entropy": 0.47313690185546875, "epoch": 5.719101123595506, "grad_norm": 0.47520225601419763, "learning_rate": 3.237434340521789e-08, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 420854154.0, "step": 509 }, { "entropy": 0.48545074462890625, "epoch": 5.730337078651686, "grad_norm": 0.24919993452145028, "learning_rate": 2.993678712402221e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 421659979.0, "step": 510 }, { "entropy": 0.46788787841796875, "epoch": 5.741573033707866, "grad_norm": 0.2687240802887501, "learning_rate": 2.7594070615609426e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 422514934.0, "step": 511 }, { "entropy": 0.47194671630859375, "epoch": 5.752808988764045, "grad_norm": 0.2600550522951215, "learning_rate": 2.5346283830187667e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 423354852.0, "step": 512 }, { "entropy": 0.4694366455078125, "epoch": 5.764044943820225, "grad_norm": 0.2720018158822482, "learning_rate": 2.319351307307427e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 424172352.0, "step": 513 }, { "entropy": 0.4664459228515625, "epoch": 5.775280898876405, "grad_norm": 0.9650729639873621, "learning_rate": 2.1135841001380386e-08, "loss": 0.0102, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 424994231.0, "step": 514 }, { "entropy": 0.4492645263671875, "epoch": 5.786516853932584, "grad_norm": 2.648888232254079, "learning_rate": 1.917334662083714e-08, "loss": 0.0068, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 425869811.0, "step": 515 }, { "entropy": 0.453094482421875, "epoch": 5.797752808988764, "grad_norm": 1.027370821629121, "learning_rate": 1.7306105282764162e-08, "loss": 0.0087, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 426728032.0, "step": 516 }, { "entropy": 0.46843719482421875, "epoch": 5.808988764044944, "grad_norm": 0.23837807235842895, "learning_rate": 1.55341886811744e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 427544143.0, "step": 517 }, { "entropy": 0.4472198486328125, "epoch": 5.820224719101123, "grad_norm": 0.24481577785236583, "learning_rate": 1.3857664850022157e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 428420275.0, "step": 518 }, { "entropy": 0.47149658203125, "epoch": 5.831460674157303, "grad_norm": 0.2426903810937681, "learning_rate": 1.2276598160590736e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 429224830.0, "step": 519 }, { "entropy": 0.4630584716796875, "epoch": 5.842696629213483, "grad_norm": 0.25840814875325685, "learning_rate": 1.0791049319021086e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 430056259.0, "step": 520 }, { "entropy": 0.4733734130859375, "epoch": 5.853932584269663, "grad_norm": 0.26267972632078107, "learning_rate": 9.401075363981438e-09, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 430887003.0, "step": 521 }, { "entropy": 0.467041015625, "epoch": 5.865168539325842, "grad_norm": 0.24048673726778205, "learning_rate": 8.106729664475178e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 431721666.0, "step": 522 }, { "entropy": 0.47376251220703125, "epoch": 5.876404494382022, "grad_norm": 0.24254325715878092, "learning_rate": 6.908061917794417e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 432523209.0, "step": 523 }, { "entropy": 0.4714813232421875, "epoch": 5.887640449438202, "grad_norm": 0.31821184051507945, "learning_rate": 5.805118147610145e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 433343280.0, "step": 524 }, { "entropy": 0.4551544189453125, "epoch": 5.898876404494382, "grad_norm": 0.2263073507825304, "learning_rate": 4.797940702205572e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 434225691.0, "step": 525 }, { "entropy": 0.4636688232421875, "epoch": 5.910112359550562, "grad_norm": 0.2921054936133335, "learning_rate": 3.8865682528504975e-09, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 435079584.0, "step": 526 }, { "entropy": 0.48111724853515625, "epoch": 5.921348314606742, "grad_norm": 0.2539645731699836, "learning_rate": 3.071035792315269e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 435876270.0, "step": 527 }, { "entropy": 0.4649505615234375, "epoch": 5.932584269662922, "grad_norm": 0.25467262177337446, "learning_rate": 2.351374633528802e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 436699694.0, "step": 528 }, { "entropy": 0.45807647705078125, "epoch": 5.943820224719101, "grad_norm": 0.2970856799447582, "learning_rate": 1.7276124083753788e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 437552430.0, "step": 529 }, { "entropy": 0.46283721923828125, "epoch": 5.955056179775281, "grad_norm": 0.24692476291579676, "learning_rate": 1.1997730666338248e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 438392649.0, "step": 530 }, { "entropy": 0.47788238525390625, "epoch": 5.966292134831461, "grad_norm": 0.28913800701048575, "learning_rate": 7.678768750579713e-10, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 439188890.0, "step": 531 }, { "entropy": 0.48076629638671875, "epoch": 5.97752808988764, "grad_norm": 0.2517588518870856, "learning_rate": 4.3194041659866405e-10, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 439971575.0, "step": 532 }, { "entropy": 0.46562957763671875, "epoch": 5.98876404494382, "grad_norm": 0.2411681682465322, "learning_rate": 1.9197658976677358e-10, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 440804562.0, "step": 533 }, { "entropy": 0.4652252197265625, "epoch": 6.0, "grad_norm": 0.27634116211326665, "learning_rate": 4.799460813803558e-11, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 441644783.0, "step": 534 }, { "epoch": 6.0, "step": 534, "total_flos": 519624752562176.0, "train_loss": 0.5481295004301296, "train_runtime": 70704.5263, "train_samples_per_second": 3.496, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 534, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 45, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 519624752562176.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }