Files
P9-split5_only_answer_Qwen3…/trainer_state.json
ModelHub XC 7b37b183ac 初始化项目,由ModelHub XC社区提供模型
Model: Hyeongwon/P9-split5_only_answer_Qwen3-4B-Base_0402-01-5e-6
Source: Original Platform
2026-05-14 19:39:32 +08:00

5384 lines
151 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 534,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5542068481445312,
"epoch": 0.011235955056179775,
"grad_norm": 390.23344037065993,
"learning_rate": 0.0,
"loss": 8.277,
"mean_token_accuracy": 0.0,
"num_tokens": 822155.0,
"step": 1
},
{
"entropy": 0.5560684204101562,
"epoch": 0.02247191011235955,
"grad_norm": 387.2980617003247,
"learning_rate": 1.8518518518518518e-07,
"loss": 8.3201,
"mean_token_accuracy": 0.0,
"num_tokens": 1647671.0,
"step": 2
},
{
"entropy": 0.5492401123046875,
"epoch": 0.033707865168539325,
"grad_norm": 384.7370311195686,
"learning_rate": 3.7037037037037036e-07,
"loss": 8.3097,
"mean_token_accuracy": 0.0,
"num_tokens": 2496794.0,
"step": 3
},
{
"entropy": 0.5499496459960938,
"epoch": 0.0449438202247191,
"grad_norm": 386.7170475562737,
"learning_rate": 5.555555555555555e-07,
"loss": 8.2263,
"mean_token_accuracy": 0.0,
"num_tokens": 3335427.0,
"step": 4
},
{
"entropy": 0.5614547729492188,
"epoch": 0.056179775280898875,
"grad_norm": 391.10655108122654,
"learning_rate": 7.407407407407407e-07,
"loss": 8.1153,
"mean_token_accuracy": 0.0,
"num_tokens": 4147344.0,
"step": 5
},
{
"entropy": 0.5488052368164062,
"epoch": 0.06741573033707865,
"grad_norm": 394.56555016865775,
"learning_rate": 9.259259259259259e-07,
"loss": 7.9897,
"mean_token_accuracy": 0.0,
"num_tokens": 4987664.0,
"step": 6
},
{
"entropy": 0.5349197387695312,
"epoch": 0.07865168539325842,
"grad_norm": 398.0852378093763,
"learning_rate": 1.111111111111111e-06,
"loss": 7.4606,
"mean_token_accuracy": 0.0,
"num_tokens": 5868951.0,
"step": 7
},
{
"entropy": 0.5573577880859375,
"epoch": 0.0898876404494382,
"grad_norm": 269.2196405074287,
"learning_rate": 1.2962962962962962e-06,
"loss": 5.8655,
"mean_token_accuracy": 0.0,
"num_tokens": 6673251.0,
"step": 8
},
{
"entropy": 0.5700607299804688,
"epoch": 0.10112359550561797,
"grad_norm": 227.1382148537302,
"learning_rate": 1.4814814814814815e-06,
"loss": 5.5805,
"mean_token_accuracy": 0.006510416860692203,
"num_tokens": 7475702.0,
"step": 9
},
{
"entropy": 0.5521163940429688,
"epoch": 0.11235955056179775,
"grad_norm": 191.45954657460527,
"learning_rate": 1.6666666666666667e-06,
"loss": 5.2965,
"mean_token_accuracy": 0.016927083837799728,
"num_tokens": 8289995.0,
"step": 10
},
{
"entropy": 0.5551223754882812,
"epoch": 0.12359550561797752,
"grad_norm": 102.83125134655117,
"learning_rate": 1.8518518518518519e-06,
"loss": 4.1159,
"mean_token_accuracy": 0.5000000149011612,
"num_tokens": 9130985.0,
"step": 11
},
{
"entropy": 0.5453948974609375,
"epoch": 0.1348314606741573,
"grad_norm": 96.0432311413094,
"learning_rate": 2.037037037037037e-06,
"loss": 4.0236,
"mean_token_accuracy": 0.5247395989717916,
"num_tokens": 9991024.0,
"step": 12
},
{
"entropy": 0.5710296630859375,
"epoch": 0.14606741573033707,
"grad_norm": 82.56292307777714,
"learning_rate": 2.222222222222222e-06,
"loss": 3.8238,
"mean_token_accuracy": 0.5182291821110994,
"num_tokens": 10795531.0,
"step": 13
},
{
"entropy": 0.5636825561523438,
"epoch": 0.15730337078651685,
"grad_norm": 74.53161781330034,
"learning_rate": 2.4074074074074075e-06,
"loss": 3.7069,
"mean_token_accuracy": 0.5026041816454381,
"num_tokens": 11605868.0,
"step": 14
},
{
"entropy": 0.5413894653320312,
"epoch": 0.16853932584269662,
"grad_norm": 59.737070732481115,
"learning_rate": 2.5925925925925925e-06,
"loss": 3.2683,
"mean_token_accuracy": 0.5013020982732996,
"num_tokens": 12460543.0,
"step": 15
},
{
"entropy": 0.5427627563476562,
"epoch": 0.1797752808988764,
"grad_norm": 58.49611194582999,
"learning_rate": 2.7777777777777783e-06,
"loss": 3.2039,
"mean_token_accuracy": 0.5208333488553762,
"num_tokens": 13294073.0,
"step": 16
},
{
"entropy": 0.5473175048828125,
"epoch": 0.19101123595505617,
"grad_norm": 57.54039518404522,
"learning_rate": 2.962962962962963e-06,
"loss": 3.1559,
"mean_token_accuracy": 0.5104166818782687,
"num_tokens": 14111722.0,
"step": 17
},
{
"entropy": 0.5622482299804688,
"epoch": 0.20224719101123595,
"grad_norm": 57.432008827772684,
"learning_rate": 3.1481481481481483e-06,
"loss": 3.093,
"mean_token_accuracy": 0.5390625160653144,
"num_tokens": 14914584.0,
"step": 18
},
{
"entropy": 0.54608154296875,
"epoch": 0.21348314606741572,
"grad_norm": 57.483023727287936,
"learning_rate": 3.3333333333333333e-06,
"loss": 3.0533,
"mean_token_accuracy": 0.5364583493210375,
"num_tokens": 15747323.0,
"step": 19
},
{
"entropy": 0.5431365966796875,
"epoch": 0.2247191011235955,
"grad_norm": 58.03235110267345,
"learning_rate": 3.5185185185185187e-06,
"loss": 2.9627,
"mean_token_accuracy": 0.5403645994374529,
"num_tokens": 16578442.0,
"step": 20
},
{
"entropy": 0.5537033081054688,
"epoch": 0.23595505617977527,
"grad_norm": 57.031635513840406,
"learning_rate": 3.7037037037037037e-06,
"loss": 2.9173,
"mean_token_accuracy": 0.5638021001359448,
"num_tokens": 17365881.0,
"step": 21
},
{
"entropy": 0.54766845703125,
"epoch": 0.24719101123595505,
"grad_norm": 61.80595725006236,
"learning_rate": 3.88888888888889e-06,
"loss": 2.9261,
"mean_token_accuracy": 0.5247395989717916,
"num_tokens": 18163036.0,
"step": 22
},
{
"entropy": 0.5399551391601562,
"epoch": 0.25842696629213485,
"grad_norm": 59.257010291932765,
"learning_rate": 4.074074074074074e-06,
"loss": 2.889,
"mean_token_accuracy": 0.5520833497866988,
"num_tokens": 18990094.0,
"step": 23
},
{
"entropy": 0.54046630859375,
"epoch": 0.2696629213483146,
"grad_norm": 58.14418169147904,
"learning_rate": 4.2592592592592596e-06,
"loss": 2.8644,
"mean_token_accuracy": 0.5559895999031141,
"num_tokens": 19825429.0,
"step": 24
},
{
"entropy": 0.545257568359375,
"epoch": 0.2808988764044944,
"grad_norm": 57.31107773937866,
"learning_rate": 4.444444444444444e-06,
"loss": 2.8263,
"mean_token_accuracy": 0.5768229338573292,
"num_tokens": 20620523.0,
"step": 25
},
{
"entropy": 0.5317459106445312,
"epoch": 0.29213483146067415,
"grad_norm": 57.695511366393276,
"learning_rate": 4.62962962962963e-06,
"loss": 2.8152,
"mean_token_accuracy": 0.5481770996702835,
"num_tokens": 21466727.0,
"step": 26
},
{
"entropy": 0.5383377075195312,
"epoch": 0.30337078651685395,
"grad_norm": 57.43374006560947,
"learning_rate": 4.814814814814815e-06,
"loss": 2.7654,
"mean_token_accuracy": 0.5664062668802217,
"num_tokens": 22276427.0,
"step": 27
},
{
"entropy": 0.5467529296875,
"epoch": 0.3146067415730337,
"grad_norm": 57.29313317484686,
"learning_rate": 5e-06,
"loss": 2.7203,
"mean_token_accuracy": 0.5716146003687754,
"num_tokens": 23060552.0,
"step": 28
},
{
"entropy": 0.5378189086914062,
"epoch": 0.3258426966292135,
"grad_norm": 58.51832801138517,
"learning_rate": 4.999952005391863e-06,
"loss": 2.7212,
"mean_token_accuracy": 0.5455729329260066,
"num_tokens": 23845994.0,
"step": 29
},
{
"entropy": 0.5296554565429688,
"epoch": 0.33707865168539325,
"grad_norm": 57.46515253087696,
"learning_rate": 4.999808023410233e-06,
"loss": 2.6669,
"mean_token_accuracy": 0.5651041835080832,
"num_tokens": 24664278.0,
"step": 30
},
{
"entropy": 0.5262527465820312,
"epoch": 0.34831460674157305,
"grad_norm": 58.42941782798559,
"learning_rate": 4.999568059583401e-06,
"loss": 2.6528,
"mean_token_accuracy": 0.5533854331588373,
"num_tokens": 25492953.0,
"step": 31
},
{
"entropy": 0.525665283203125,
"epoch": 0.3595505617977528,
"grad_norm": 57.76343798611473,
"learning_rate": 4.9992321231249425e-06,
"loss": 2.6188,
"mean_token_accuracy": 0.5729166837409139,
"num_tokens": 26327442.0,
"step": 32
},
{
"entropy": 0.5118408203125,
"epoch": 0.3707865168539326,
"grad_norm": 58.36500275312141,
"learning_rate": 4.998800226933367e-06,
"loss": 2.6118,
"mean_token_accuracy": 0.5572916832752526,
"num_tokens": 27182108.0,
"step": 33
},
{
"entropy": 0.5230560302734375,
"epoch": 0.38202247191011235,
"grad_norm": 58.29103483362129,
"learning_rate": 4.998272387591625e-06,
"loss": 2.565,
"mean_token_accuracy": 0.5807291839737445,
"num_tokens": 27990119.0,
"step": 34
},
{
"entropy": 0.5248336791992188,
"epoch": 0.39325842696629215,
"grad_norm": 58.88250032879414,
"learning_rate": 4.997648625366471e-06,
"loss": 2.5656,
"mean_token_accuracy": 0.5520833497866988,
"num_tokens": 28815728.0,
"step": 35
},
{
"entropy": 0.5244140625,
"epoch": 0.4044943820224719,
"grad_norm": 59.128939451714714,
"learning_rate": 4.996928964207685e-06,
"loss": 2.5173,
"mean_token_accuracy": 0.5716146003687754,
"num_tokens": 29630564.0,
"step": 36
},
{
"entropy": 0.529052734375,
"epoch": 0.4157303370786517,
"grad_norm": 59.872929282401124,
"learning_rate": 4.99611343174715e-06,
"loss": 2.475,
"mean_token_accuracy": 0.5781250172294676,
"num_tokens": 30441045.0,
"step": 37
},
{
"entropy": 0.5102386474609375,
"epoch": 0.42696629213483145,
"grad_norm": 59.524880081693325,
"learning_rate": 4.995202059297795e-06,
"loss": 2.475,
"mean_token_accuracy": 0.5651041835080832,
"num_tokens": 31303954.0,
"step": 38
},
{
"entropy": 0.5181503295898438,
"epoch": 0.43820224719101125,
"grad_norm": 59.47510189267516,
"learning_rate": 4.99419488185239e-06,
"loss": 2.4294,
"mean_token_accuracy": 0.5833333507180214,
"num_tokens": 32133913.0,
"step": 39
},
{
"entropy": 0.5339736938476562,
"epoch": 0.449438202247191,
"grad_norm": 60.729118832026174,
"learning_rate": 4.993091938082206e-06,
"loss": 2.4088,
"mean_token_accuracy": 0.5872396008344367,
"num_tokens": 32917166.0,
"step": 40
},
{
"entropy": 0.50799560546875,
"epoch": 0.4606741573033708,
"grad_norm": 60.78892852056622,
"learning_rate": 4.991893270335526e-06,
"loss": 2.4005,
"mean_token_accuracy": 0.5611979333916679,
"num_tokens": 33768662.0,
"step": 41
},
{
"entropy": 0.5279388427734375,
"epoch": 0.47191011235955055,
"grad_norm": 59.9563298919123,
"learning_rate": 4.990598924636019e-06,
"loss": 2.3612,
"mean_token_accuracy": 0.5794271006016061,
"num_tokens": 34579535.0,
"step": 42
},
{
"entropy": 0.5206527709960938,
"epoch": 0.48314606741573035,
"grad_norm": 60.22141204319965,
"learning_rate": 4.989208950680979e-06,
"loss": 2.3572,
"mean_token_accuracy": 0.5611979333916679,
"num_tokens": 35405664.0,
"step": 43
},
{
"entropy": 0.518218994140625,
"epoch": 0.4943820224719101,
"grad_norm": 60.352777525668316,
"learning_rate": 4.987723401839409e-06,
"loss": 2.3016,
"mean_token_accuracy": 0.5976562678115442,
"num_tokens": 36237455.0,
"step": 44
},
{
"entropy": 0.5099258422851562,
"epoch": 0.5056179775280899,
"grad_norm": 60.37422724299012,
"learning_rate": 4.9861423351499786e-06,
"loss": 2.3152,
"mean_token_accuracy": 0.5989583493210375,
"num_tokens": 37080531.0,
"step": 45
},
{
"entropy": 0.5283050537109375,
"epoch": 0.5168539325842697,
"grad_norm": 60.891478974349575,
"learning_rate": 4.984465811318826e-06,
"loss": 2.2646,
"mean_token_accuracy": 0.7473958439659327,
"num_tokens": 37899788.0,
"step": 46
},
{
"entropy": 0.5109405517578125,
"epoch": 0.5280898876404494,
"grad_norm": 60.95117855356651,
"learning_rate": 4.982693894717237e-06,
"loss": 2.261,
"mean_token_accuracy": 0.8867187564028427,
"num_tokens": 38740893.0,
"step": 47
},
{
"entropy": 0.53253173828125,
"epoch": 0.5393258426966292,
"grad_norm": 60.74187561074295,
"learning_rate": 4.980826653379163e-06,
"loss": 2.2125,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 39582346.0,
"step": 48
},
{
"entropy": 0.5358963012695312,
"epoch": 0.550561797752809,
"grad_norm": 60.56695566553226,
"learning_rate": 4.97886415899862e-06,
"loss": 2.1837,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 40397227.0,
"step": 49
},
{
"entropy": 0.528045654296875,
"epoch": 0.5617977528089888,
"grad_norm": 60.65909676388702,
"learning_rate": 4.976806486926926e-06,
"loss": 2.1628,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 41239235.0,
"step": 50
},
{
"entropy": 0.5263214111328125,
"epoch": 0.5730337078651685,
"grad_norm": 60.510109640876315,
"learning_rate": 4.9746537161698125e-06,
"loss": 2.1332,
"mean_token_accuracy": 0.912760421866551,
"num_tokens": 42099432.0,
"step": 51
},
{
"entropy": 0.531646728515625,
"epoch": 0.5842696629213483,
"grad_norm": 60.707459562078135,
"learning_rate": 4.972405929384391e-06,
"loss": 2.1541,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 42954514.0,
"step": 52
},
{
"entropy": 0.5231857299804688,
"epoch": 0.5955056179775281,
"grad_norm": 60.712009812773005,
"learning_rate": 4.970063212875979e-06,
"loss": 2.1064,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 43831261.0,
"step": 53
},
{
"entropy": 0.5416336059570312,
"epoch": 0.6067415730337079,
"grad_norm": 59.97544270872148,
"learning_rate": 4.967625656594782e-06,
"loss": 2.0685,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 44663681.0,
"step": 54
},
{
"entropy": 0.5709991455078125,
"epoch": 0.6179775280898876,
"grad_norm": 60.549193390626336,
"learning_rate": 4.965093354132451e-06,
"loss": 2.0444,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 45428334.0,
"step": 55
},
{
"entropy": 0.5407867431640625,
"epoch": 0.6292134831460674,
"grad_norm": 60.807708362065945,
"learning_rate": 4.962466402718475e-06,
"loss": 2.0334,
"mean_token_accuracy": 0.8997395893093199,
"num_tokens": 46279619.0,
"step": 56
},
{
"entropy": 0.5298614501953125,
"epoch": 0.6404494382022472,
"grad_norm": 59.86881859999676,
"learning_rate": 4.959744903216458e-06,
"loss": 1.9953,
"mean_token_accuracy": 0.912760421866551,
"num_tokens": 47142939.0,
"step": 57
},
{
"entropy": 0.5374221801757812,
"epoch": 0.651685393258427,
"grad_norm": 60.387166382590635,
"learning_rate": 4.9569289601202405e-06,
"loss": 1.975,
"mean_token_accuracy": 0.9153645883779973,
"num_tokens": 47975113.0,
"step": 58
},
{
"entropy": 0.5334854125976562,
"epoch": 0.6629213483146067,
"grad_norm": 59.65588927086945,
"learning_rate": 4.954018681549891e-06,
"loss": 1.9455,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 48809586.0,
"step": 59
},
{
"entropy": 0.5394287109375,
"epoch": 0.6741573033707865,
"grad_norm": 63.51403196675763,
"learning_rate": 4.951014179247555e-06,
"loss": 1.9311,
"mean_token_accuracy": 0.9179687548894435,
"num_tokens": 49623736.0,
"step": 60
},
{
"entropy": 0.5323715209960938,
"epoch": 0.6853932584269663,
"grad_norm": 59.81148319230244,
"learning_rate": 4.9479155685731595e-06,
"loss": 1.8914,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 50448514.0,
"step": 61
},
{
"entropy": 0.5502471923828125,
"epoch": 0.6966292134831461,
"grad_norm": 61.91436012722269,
"learning_rate": 4.944722968499989e-06,
"loss": 1.8825,
"mean_token_accuracy": 0.9101562553551048,
"num_tokens": 51261440.0,
"step": 62
},
{
"entropy": 0.5454025268554688,
"epoch": 0.7078651685393258,
"grad_norm": 60.64630880005511,
"learning_rate": 4.9414365016101144e-06,
"loss": 1.8559,
"mean_token_accuracy": 0.912760421866551,
"num_tokens": 52075479.0,
"step": 63
},
{
"entropy": 0.53387451171875,
"epoch": 0.7191011235955056,
"grad_norm": 59.4763603036066,
"learning_rate": 4.938056294089689e-06,
"loss": 1.8253,
"mean_token_accuracy": 0.9101562553551048,
"num_tokens": 52947384.0,
"step": 64
},
{
"entropy": 0.5350189208984375,
"epoch": 0.7303370786516854,
"grad_norm": 58.59080646861511,
"learning_rate": 4.934582475724101e-06,
"loss": 1.7885,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 53803463.0,
"step": 65
},
{
"entropy": 0.5385360717773438,
"epoch": 0.7415730337078652,
"grad_norm": 59.607691840587314,
"learning_rate": 4.93101517989299e-06,
"loss": 1.7782,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 54629114.0,
"step": 66
},
{
"entropy": 0.5513763427734375,
"epoch": 0.7528089887640449,
"grad_norm": 58.79322877259206,
"learning_rate": 4.927354543565131e-06,
"loss": 1.7471,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 55460361.0,
"step": 67
},
{
"entropy": 0.5551681518554688,
"epoch": 0.7640449438202247,
"grad_norm": 57.96512656440569,
"learning_rate": 4.923600707293166e-06,
"loss": 1.7144,
"mean_token_accuracy": 0.9179687548894435,
"num_tokens": 56245748.0,
"step": 68
},
{
"entropy": 0.56158447265625,
"epoch": 0.7752808988764045,
"grad_norm": 58.47646030692927,
"learning_rate": 4.919753815208218e-06,
"loss": 1.6723,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 57025912.0,
"step": 69
},
{
"entropy": 0.5417938232421875,
"epoch": 0.7865168539325843,
"grad_norm": 59.27714480205728,
"learning_rate": 4.915814015014349e-06,
"loss": 1.6721,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 57834059.0,
"step": 70
},
{
"entropy": 0.5401611328125,
"epoch": 0.797752808988764,
"grad_norm": 58.344511616531506,
"learning_rate": 4.91178145798289e-06,
"loss": 1.6156,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 58666075.0,
"step": 71
},
{
"entropy": 0.5393142700195312,
"epoch": 0.8089887640449438,
"grad_norm": 59.042195880718445,
"learning_rate": 4.90765629894664e-06,
"loss": 1.598,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 59494512.0,
"step": 72
},
{
"entropy": 0.5437774658203125,
"epoch": 0.8202247191011236,
"grad_norm": 58.82751577757508,
"learning_rate": 4.90343869629391e-06,
"loss": 1.5733,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 60288577.0,
"step": 73
},
{
"entropy": 0.5414581298828125,
"epoch": 0.8314606741573034,
"grad_norm": 58.26603403657131,
"learning_rate": 4.89912881196245e-06,
"loss": 1.5266,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 61117524.0,
"step": 74
},
{
"entropy": 0.5426177978515625,
"epoch": 0.8426966292134831,
"grad_norm": 59.26658241025557,
"learning_rate": 4.8947268114332276e-06,
"loss": 1.5502,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 61931952.0,
"step": 75
},
{
"entropy": 0.536407470703125,
"epoch": 0.8539325842696629,
"grad_norm": 58.860024938330206,
"learning_rate": 4.890232863724075e-06,
"loss": 1.4873,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 62774606.0,
"step": 76
},
{
"entropy": 0.5328445434570312,
"epoch": 0.8651685393258427,
"grad_norm": 58.76272222471983,
"learning_rate": 4.8856471413831995e-06,
"loss": 1.4552,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 63636059.0,
"step": 77
},
{
"entropy": 0.5324859619140625,
"epoch": 0.8764044943820225,
"grad_norm": 58.95451119005953,
"learning_rate": 4.880969820482559e-06,
"loss": 1.4268,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 64456269.0,
"step": 78
},
{
"entropy": 0.5229721069335938,
"epoch": 0.8876404494382022,
"grad_norm": 58.84513969144007,
"learning_rate": 4.8762010806111e-06,
"loss": 1.4127,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 65332024.0,
"step": 79
},
{
"entropy": 0.5389480590820312,
"epoch": 0.898876404494382,
"grad_norm": 58.33326344529121,
"learning_rate": 4.8713411048678635e-06,
"loss": 1.3606,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 66146121.0,
"step": 80
},
{
"entropy": 0.5313339233398438,
"epoch": 0.9101123595505618,
"grad_norm": 58.770046988203866,
"learning_rate": 4.866390079854956e-06,
"loss": 1.3559,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 66983445.0,
"step": 81
},
{
"entropy": 0.543670654296875,
"epoch": 0.9213483146067416,
"grad_norm": 58.35961263678049,
"learning_rate": 4.861348195670381e-06,
"loss": 1.3214,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 67786937.0,
"step": 82
},
{
"entropy": 0.5296173095703125,
"epoch": 0.9325842696629213,
"grad_norm": 58.16424063871105,
"learning_rate": 4.856215645900742e-06,
"loss": 1.2894,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 68600141.0,
"step": 83
},
{
"entropy": 0.5406417846679688,
"epoch": 0.9438202247191011,
"grad_norm": 58.67244467783903,
"learning_rate": 4.850992627613812e-06,
"loss": 1.2609,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 69454327.0,
"step": 84
},
{
"entropy": 0.5292129516601562,
"epoch": 0.9550561797752809,
"grad_norm": 57.850796732664065,
"learning_rate": 4.845679341350963e-06,
"loss": 1.2283,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 70270259.0,
"step": 85
},
{
"entropy": 0.5702438354492188,
"epoch": 0.9662921348314607,
"grad_norm": 57.61484422295124,
"learning_rate": 4.8402759911194705e-06,
"loss": 1.175,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 71047982.0,
"step": 86
},
{
"entropy": 0.547698974609375,
"epoch": 0.9775280898876404,
"grad_norm": 57.52383290276939,
"learning_rate": 4.834782784384674e-06,
"loss": 1.1626,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 71864427.0,
"step": 87
},
{
"entropy": 0.5348434448242188,
"epoch": 0.9887640449438202,
"grad_norm": 57.09411552139804,
"learning_rate": 4.8291999320620185e-06,
"loss": 1.1274,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 72710085.0,
"step": 88
},
{
"entropy": 0.5321273803710938,
"epoch": 1.0,
"grad_norm": 57.070649311975515,
"learning_rate": 4.823527648508951e-06,
"loss": 1.1004,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 73549946.0,
"step": 89
},
{
"entropy": 0.5517959594726562,
"epoch": 1.0112359550561798,
"grad_norm": 57.109477979057964,
"learning_rate": 4.817766151516693e-06,
"loss": 1.0819,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 74355101.0,
"step": 90
},
{
"entropy": 0.53094482421875,
"epoch": 1.0224719101123596,
"grad_norm": 57.86303273485758,
"learning_rate": 4.811915662301877e-06,
"loss": 1.055,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 75191280.0,
"step": 91
},
{
"entropy": 0.5199966430664062,
"epoch": 1.0337078651685394,
"grad_norm": 57.09250683738646,
"learning_rate": 4.805976405498052e-06,
"loss": 1.0464,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 76053228.0,
"step": 92
},
{
"entropy": 0.5236129760742188,
"epoch": 1.0449438202247192,
"grad_norm": 56.56208472514724,
"learning_rate": 4.799948609147061e-06,
"loss": 1.0133,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 76922703.0,
"step": 93
},
{
"entropy": 0.5320358276367188,
"epoch": 1.0561797752808988,
"grad_norm": 56.52468060162162,
"learning_rate": 4.793832504690283e-06,
"loss": 0.984,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 77760537.0,
"step": 94
},
{
"entropy": 0.5383987426757812,
"epoch": 1.0674157303370786,
"grad_norm": 56.23138412443709,
"learning_rate": 4.787628326959747e-06,
"loss": 0.9668,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 78584878.0,
"step": 95
},
{
"entropy": 0.5504608154296875,
"epoch": 1.0786516853932584,
"grad_norm": 55.775703271501115,
"learning_rate": 4.7813363141691166e-06,
"loss": 0.9265,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 79362452.0,
"step": 96
},
{
"entropy": 0.52728271484375,
"epoch": 1.0898876404494382,
"grad_norm": 55.443074597823376,
"learning_rate": 4.774956707904542e-06,
"loss": 0.9173,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 80224183.0,
"step": 97
},
{
"entropy": 0.5221405029296875,
"epoch": 1.101123595505618,
"grad_norm": 55.28172803435839,
"learning_rate": 4.768489753115386e-06,
"loss": 0.8786,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 81054014.0,
"step": 98
},
{
"entropy": 0.522125244140625,
"epoch": 1.1123595505617978,
"grad_norm": 55.15995525310935,
"learning_rate": 4.761935698104817e-06,
"loss": 0.8674,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 81886788.0,
"step": 99
},
{
"entropy": 0.5253677368164062,
"epoch": 1.1235955056179776,
"grad_norm": 54.71553871787075,
"learning_rate": 4.755294794520277e-06,
"loss": 0.8102,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 82715583.0,
"step": 100
},
{
"entropy": 0.540069580078125,
"epoch": 1.1348314606741572,
"grad_norm": 56.64220620441506,
"learning_rate": 4.7485672973438175e-06,
"loss": 0.8516,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 83523671.0,
"step": 101
},
{
"entropy": 0.532379150390625,
"epoch": 1.146067415730337,
"grad_norm": 57.49781410452237,
"learning_rate": 4.741753464882312e-06,
"loss": 0.8225,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 84335142.0,
"step": 102
},
{
"entropy": 0.561614990234375,
"epoch": 1.1573033707865168,
"grad_norm": 54.70294783987688,
"learning_rate": 4.734853558757534e-06,
"loss": 0.7812,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 85112682.0,
"step": 103
},
{
"entropy": 0.5339889526367188,
"epoch": 1.1685393258426966,
"grad_norm": 53.21911232668916,
"learning_rate": 4.727867843896116e-06,
"loss": 0.7431,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 85953912.0,
"step": 104
},
{
"entropy": 0.5423355102539062,
"epoch": 1.1797752808988764,
"grad_norm": 52.7058239940401,
"learning_rate": 4.72079658851938e-06,
"loss": 0.7265,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 86770289.0,
"step": 105
},
{
"entropy": 0.5316314697265625,
"epoch": 1.1910112359550562,
"grad_norm": 52.57859861877315,
"learning_rate": 4.7136400641330245e-06,
"loss": 0.7063,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 87591764.0,
"step": 106
},
{
"entropy": 0.5559310913085938,
"epoch": 1.202247191011236,
"grad_norm": 51.41978129735086,
"learning_rate": 4.706398545516722e-06,
"loss": 0.6742,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 88393028.0,
"step": 107
},
{
"entropy": 0.539886474609375,
"epoch": 1.2134831460674158,
"grad_norm": 51.39997895136707,
"learning_rate": 4.6990723107135475e-06,
"loss": 0.6652,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 89219905.0,
"step": 108
},
{
"entropy": 0.5354461669921875,
"epoch": 1.2247191011235956,
"grad_norm": 50.28502697998156,
"learning_rate": 4.691661641019316e-06,
"loss": 0.6255,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 90075085.0,
"step": 109
},
{
"entropy": 0.5335617065429688,
"epoch": 1.2359550561797752,
"grad_norm": 48.758575622533606,
"learning_rate": 4.684166820971779e-06,
"loss": 0.6082,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 90906884.0,
"step": 110
},
{
"entropy": 0.5511932373046875,
"epoch": 1.247191011235955,
"grad_norm": 47.80816266829214,
"learning_rate": 4.6765881383396985e-06,
"loss": 0.6082,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 91706441.0,
"step": 111
},
{
"entropy": 0.5378570556640625,
"epoch": 1.2584269662921348,
"grad_norm": 46.527536617371204,
"learning_rate": 4.6689258841117946e-06,
"loss": 0.5867,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 92547894.0,
"step": 112
},
{
"entropy": 0.53167724609375,
"epoch": 1.2696629213483146,
"grad_norm": 45.65667911602808,
"learning_rate": 4.6611803524855805e-06,
"loss": 0.56,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 93391238.0,
"step": 113
},
{
"entropy": 0.5202484130859375,
"epoch": 1.2808988764044944,
"grad_norm": 45.04158241974194,
"learning_rate": 4.65335184085606e-06,
"loss": 0.5511,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 94259336.0,
"step": 114
},
{
"entropy": 0.5372695922851562,
"epoch": 1.2921348314606742,
"grad_norm": 43.878757463536445,
"learning_rate": 4.64544064980431e-06,
"loss": 0.5162,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 95078639.0,
"step": 115
},
{
"entropy": 0.5404129028320312,
"epoch": 1.303370786516854,
"grad_norm": 42.9718114331032,
"learning_rate": 4.637447083085944e-06,
"loss": 0.5121,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 95914468.0,
"step": 116
},
{
"entropy": 0.53564453125,
"epoch": 1.3146067415730336,
"grad_norm": 41.78707116055528,
"learning_rate": 4.629371447619443e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.9414062534924597,
"num_tokens": 96715346.0,
"step": 117
},
{
"entropy": 0.5341415405273438,
"epoch": 1.3258426966292136,
"grad_norm": 41.532789530235036,
"learning_rate": 4.621214053474374e-06,
"loss": 0.483,
"mean_token_accuracy": 0.9414062534924597,
"num_tokens": 97534917.0,
"step": 118
},
{
"entropy": 0.5507736206054688,
"epoch": 1.3370786516853932,
"grad_norm": 44.303988041655955,
"learning_rate": 4.612975213859487e-06,
"loss": 0.5075,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 98348468.0,
"step": 119
},
{
"entropy": 0.538421630859375,
"epoch": 1.348314606741573,
"grad_norm": 39.13623243577061,
"learning_rate": 4.604655245110684e-06,
"loss": 0.4581,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 99146166.0,
"step": 120
},
{
"entropy": 0.5124588012695312,
"epoch": 1.3595505617977528,
"grad_norm": 35.61302129069593,
"learning_rate": 4.596254466678877e-06,
"loss": 0.4225,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 100023483.0,
"step": 121
},
{
"entropy": 0.5451736450195312,
"epoch": 1.3707865168539326,
"grad_norm": 34.79497149141314,
"learning_rate": 4.5877732011177215e-06,
"loss": 0.4174,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 100807531.0,
"step": 122
},
{
"entropy": 0.5343170166015625,
"epoch": 1.3820224719101124,
"grad_norm": 33.6184828101984,
"learning_rate": 4.579211774071229e-06,
"loss": 0.3814,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 101643474.0,
"step": 123
},
{
"entropy": 0.5354690551757812,
"epoch": 1.3932584269662922,
"grad_norm": 32.37729538841938,
"learning_rate": 4.570570514261272e-06,
"loss": 0.3839,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 102462858.0,
"step": 124
},
{
"entropy": 0.5319061279296875,
"epoch": 1.404494382022472,
"grad_norm": 31.363308088082444,
"learning_rate": 4.561849753474951e-06,
"loss": 0.3421,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 103287709.0,
"step": 125
},
{
"entropy": 0.5308456420898438,
"epoch": 1.4157303370786516,
"grad_norm": 34.482826340425774,
"learning_rate": 4.553049826551864e-06,
"loss": 0.3421,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 104117613.0,
"step": 126
},
{
"entropy": 0.5468826293945312,
"epoch": 1.4269662921348314,
"grad_norm": 43.099922541286496,
"learning_rate": 4.544171071371246e-06,
"loss": 0.4766,
"mean_token_accuracy": 0.8997395893093199,
"num_tokens": 104910285.0,
"step": 127
},
{
"entropy": 0.543365478515625,
"epoch": 1.4382022471910112,
"grad_norm": 28.630466194759624,
"learning_rate": 4.535213828838998e-06,
"loss": 0.3273,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 105723583.0,
"step": 128
},
{
"entropy": 0.53936767578125,
"epoch": 1.449438202247191,
"grad_norm": 35.38257284310518,
"learning_rate": 4.526178442874596e-06,
"loss": 0.3644,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 106547148.0,
"step": 129
},
{
"entropy": 0.5411376953125,
"epoch": 1.4606741573033708,
"grad_norm": 26.906307855558588,
"learning_rate": 4.517065260397887e-06,
"loss": 0.317,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 107421430.0,
"step": 130
},
{
"entropy": 0.5464248657226562,
"epoch": 1.4719101123595506,
"grad_norm": 27.053952446861047,
"learning_rate": 4.5078746313157684e-06,
"loss": 0.3227,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 108268831.0,
"step": 131
},
{
"entropy": 0.5452651977539062,
"epoch": 1.4831460674157304,
"grad_norm": 25.285396700518028,
"learning_rate": 4.498606908508754e-06,
"loss": 0.2983,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 109099543.0,
"step": 132
},
{
"entropy": 0.550140380859375,
"epoch": 1.49438202247191,
"grad_norm": 23.61719992855654,
"learning_rate": 4.489262447817421e-06,
"loss": 0.2844,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 109922580.0,
"step": 133
},
{
"entropy": 0.5431671142578125,
"epoch": 1.50561797752809,
"grad_norm": 25.4495115008555,
"learning_rate": 4.479841608028756e-06,
"loss": 0.2781,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 110764011.0,
"step": 134
},
{
"entropy": 0.5407867431640625,
"epoch": 1.5168539325842696,
"grad_norm": 21.937846662427205,
"learning_rate": 4.470344750862369e-06,
"loss": 0.2533,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 111604839.0,
"step": 135
},
{
"entropy": 0.5406417846679688,
"epoch": 1.5280898876404494,
"grad_norm": 21.05181225648866,
"learning_rate": 4.460772240956609e-06,
"loss": 0.2445,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 112446849.0,
"step": 136
},
{
"entropy": 0.54180908203125,
"epoch": 1.5393258426966292,
"grad_norm": 26.085445973452664,
"learning_rate": 4.4511244458545666e-06,
"loss": 0.2675,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 113267124.0,
"step": 137
},
{
"entropy": 0.53436279296875,
"epoch": 1.550561797752809,
"grad_norm": 18.064464279776033,
"learning_rate": 4.441401735989958e-06,
"loss": 0.2398,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 114118062.0,
"step": 138
},
{
"entropy": 0.5387649536132812,
"epoch": 1.5617977528089888,
"grad_norm": 20.856843279353505,
"learning_rate": 4.431604484672905e-06,
"loss": 0.2399,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 114957756.0,
"step": 139
},
{
"entropy": 0.5518875122070312,
"epoch": 1.5730337078651684,
"grad_norm": 15.763148774834674,
"learning_rate": 4.421733068075596e-06,
"loss": 0.2171,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 115750224.0,
"step": 140
},
{
"entropy": 0.5423736572265625,
"epoch": 1.5842696629213484,
"grad_norm": 19.090770454447515,
"learning_rate": 4.411787865217847e-06,
"loss": 0.2309,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 116594786.0,
"step": 141
},
{
"entropy": 0.5412063598632812,
"epoch": 1.595505617977528,
"grad_norm": 15.016224731584037,
"learning_rate": 4.401769257952551e-06,
"loss": 0.2385,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 117425980.0,
"step": 142
},
{
"entropy": 0.5459976196289062,
"epoch": 1.606741573033708,
"grad_norm": 18.355720830544215,
"learning_rate": 4.3916776309510115e-06,
"loss": 0.2292,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 118249651.0,
"step": 143
},
{
"entropy": 0.5451126098632812,
"epoch": 1.6179775280898876,
"grad_norm": 14.837666903281002,
"learning_rate": 4.381513371688174e-06,
"loss": 0.1965,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 119084058.0,
"step": 144
},
{
"entropy": 0.548553466796875,
"epoch": 1.6292134831460674,
"grad_norm": 16.682044870857027,
"learning_rate": 4.3712768704277535e-06,
"loss": 0.2006,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 119900949.0,
"step": 145
},
{
"entropy": 0.54833984375,
"epoch": 1.6404494382022472,
"grad_norm": 13.398664955064358,
"learning_rate": 4.360968520207241e-06,
"loss": 0.1766,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 120703593.0,
"step": 146
},
{
"entropy": 0.5443878173828125,
"epoch": 1.651685393258427,
"grad_norm": 19.269521133635244,
"learning_rate": 4.35058871682282e-06,
"loss": 0.2035,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 121516344.0,
"step": 147
},
{
"entropy": 0.5469894409179688,
"epoch": 1.6629213483146068,
"grad_norm": 17.4616206956177,
"learning_rate": 4.340137858814168e-06,
"loss": 0.2089,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 122351127.0,
"step": 148
},
{
"entropy": 0.5352020263671875,
"epoch": 1.6741573033707864,
"grad_norm": 17.178879243601084,
"learning_rate": 4.329616347449154e-06,
"loss": 0.213,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 123172441.0,
"step": 149
},
{
"entropy": 0.5321578979492188,
"epoch": 1.6853932584269664,
"grad_norm": 13.357276465389605,
"learning_rate": 4.3190245867084275e-06,
"loss": 0.1728,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 124012921.0,
"step": 150
},
{
"entropy": 0.5493316650390625,
"epoch": 1.696629213483146,
"grad_norm": 11.642016241780885,
"learning_rate": 4.308362983269916e-06,
"loss": 0.1747,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 124803857.0,
"step": 151
},
{
"entropy": 0.5419769287109375,
"epoch": 1.7078651685393258,
"grad_norm": 11.421215730426258,
"learning_rate": 4.297631946493202e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 125627395.0,
"step": 152
},
{
"entropy": 0.5377578735351562,
"epoch": 1.7191011235955056,
"grad_norm": 9.875789917298611,
"learning_rate": 4.2868318884038075e-06,
"loss": 0.177,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 126450383.0,
"step": 153
},
{
"entropy": 0.5468826293945312,
"epoch": 1.7303370786516854,
"grad_norm": 11.701291607682503,
"learning_rate": 4.275963223677379e-06,
"loss": 0.1517,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 127279834.0,
"step": 154
},
{
"entropy": 0.523223876953125,
"epoch": 1.7415730337078652,
"grad_norm": 7.880131869221865,
"learning_rate": 4.265026369623761e-06,
"loss": 0.1662,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 128165756.0,
"step": 155
},
{
"entropy": 0.5451507568359375,
"epoch": 1.7528089887640448,
"grad_norm": 8.585183353114594,
"learning_rate": 4.254021746170972e-06,
"loss": 0.1449,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 128976327.0,
"step": 156
},
{
"entropy": 0.5521163940429688,
"epoch": 1.7640449438202248,
"grad_norm": 8.571471771414366,
"learning_rate": 4.242949775849083e-06,
"loss": 0.1578,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 129771800.0,
"step": 157
},
{
"entropy": 0.5428085327148438,
"epoch": 1.7752808988764044,
"grad_norm": 16.43602264445111,
"learning_rate": 4.231810883773999e-06,
"loss": 0.1799,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 130569228.0,
"step": 158
},
{
"entropy": 0.5567550659179688,
"epoch": 1.7865168539325844,
"grad_norm": 7.531185608644983,
"learning_rate": 4.220605497631125e-06,
"loss": 0.156,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 131343343.0,
"step": 159
},
{
"entropy": 0.5351791381835938,
"epoch": 1.797752808988764,
"grad_norm": 18.604263622333004,
"learning_rate": 4.209334047658956e-06,
"loss": 0.1977,
"mean_token_accuracy": 0.9153645883779973,
"num_tokens": 132192522.0,
"step": 160
},
{
"entropy": 0.550018310546875,
"epoch": 1.8089887640449438,
"grad_norm": 15.376187129436879,
"learning_rate": 4.197996966632551e-06,
"loss": 0.1777,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 132973585.0,
"step": 161
},
{
"entropy": 0.534149169921875,
"epoch": 1.8202247191011236,
"grad_norm": 6.060874528483011,
"learning_rate": 4.186594689846919e-06,
"loss": 0.1446,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 133810589.0,
"step": 162
},
{
"entropy": 0.5483245849609375,
"epoch": 1.8314606741573034,
"grad_norm": 13.790675275430694,
"learning_rate": 4.175127655100306e-06,
"loss": 0.1647,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 134632550.0,
"step": 163
},
{
"entropy": 0.5397720336914062,
"epoch": 1.8426966292134832,
"grad_norm": 9.616982136039516,
"learning_rate": 4.163596302677383e-06,
"loss": 0.127,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 135475079.0,
"step": 164
},
{
"entropy": 0.5223236083984375,
"epoch": 1.8539325842696628,
"grad_norm": 11.249984036705216,
"learning_rate": 4.152001075332342e-06,
"loss": 0.134,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 136337338.0,
"step": 165
},
{
"entropy": 0.5302963256835938,
"epoch": 1.8651685393258428,
"grad_norm": 11.29797181598071,
"learning_rate": 4.140342418271897e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 137170980.0,
"step": 166
},
{
"entropy": 0.5428543090820312,
"epoch": 1.8764044943820224,
"grad_norm": 5.666218885177399,
"learning_rate": 4.128620779138191e-06,
"loss": 0.1069,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 137967406.0,
"step": 167
},
{
"entropy": 0.5390548706054688,
"epoch": 1.8876404494382022,
"grad_norm": 7.4433019218071435,
"learning_rate": 4.116836607991603e-06,
"loss": 0.1159,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 138776095.0,
"step": 168
},
{
"entropy": 0.5308151245117188,
"epoch": 1.898876404494382,
"grad_norm": 5.850528993135207,
"learning_rate": 4.104990357293478e-06,
"loss": 0.1243,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 139611842.0,
"step": 169
},
{
"entropy": 0.5367202758789062,
"epoch": 1.9101123595505618,
"grad_norm": 4.483523712916532,
"learning_rate": 4.09308248188874e-06,
"loss": 0.1148,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 140452877.0,
"step": 170
},
{
"entropy": 0.538116455078125,
"epoch": 1.9213483146067416,
"grad_norm": 4.63259111392511,
"learning_rate": 4.081113438988443e-06,
"loss": 0.1168,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 141283700.0,
"step": 171
},
{
"entropy": 0.5343170166015625,
"epoch": 1.9325842696629212,
"grad_norm": 5.6051268320124,
"learning_rate": 4.069083688152206e-06,
"loss": 0.1042,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 142114404.0,
"step": 172
},
{
"entropy": 0.5272216796875,
"epoch": 1.9438202247191012,
"grad_norm": 5.637386397174846,
"learning_rate": 4.056993691270569e-06,
"loss": 0.1067,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 142975160.0,
"step": 173
},
{
"entropy": 0.5473403930664062,
"epoch": 1.9550561797752808,
"grad_norm": 4.4513312700269605,
"learning_rate": 4.044843912547262e-06,
"loss": 0.0698,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 143782020.0,
"step": 174
},
{
"entropy": 0.5282516479492188,
"epoch": 1.9662921348314608,
"grad_norm": 8.165500644505597,
"learning_rate": 4.032634818481382e-06,
"loss": 0.1,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 144634435.0,
"step": 175
},
{
"entropy": 0.5271377563476562,
"epoch": 1.9775280898876404,
"grad_norm": 8.58130607122661,
"learning_rate": 4.020366877849477e-06,
"loss": 0.1126,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 145481475.0,
"step": 176
},
{
"entropy": 0.5458755493164062,
"epoch": 1.9887640449438202,
"grad_norm": 11.866586657273107,
"learning_rate": 4.008040561687549e-06,
"loss": 0.1346,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 146272147.0,
"step": 177
},
{
"entropy": 0.5224761962890625,
"epoch": 2.0,
"grad_norm": 7.043806183115956,
"learning_rate": 3.995656343272969e-06,
"loss": 0.1186,
"mean_token_accuracy": 0.9570312525611371,
"num_tokens": 147128144.0,
"step": 178
},
{
"entropy": 0.540283203125,
"epoch": 2.0112359550561796,
"grad_norm": 10.498406309580753,
"learning_rate": 3.983214698106305e-06,
"loss": 0.1109,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 147963457.0,
"step": 179
},
{
"entropy": 0.519134521484375,
"epoch": 2.0224719101123596,
"grad_norm": 9.652118424262301,
"learning_rate": 3.970716103893065e-06,
"loss": 0.1069,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 148846204.0,
"step": 180
},
{
"entropy": 0.5508880615234375,
"epoch": 2.033707865168539,
"grad_norm": 7.951460626005676,
"learning_rate": 3.958161040525354e-06,
"loss": 0.0995,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 149670358.0,
"step": 181
},
{
"entropy": 0.5459823608398438,
"epoch": 2.044943820224719,
"grad_norm": 4.843661555713835,
"learning_rate": 3.94554999006345e-06,
"loss": 0.0914,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 150475622.0,
"step": 182
},
{
"entropy": 0.5539932250976562,
"epoch": 2.056179775280899,
"grad_norm": 10.638552878346802,
"learning_rate": 3.932883436717291e-06,
"loss": 0.1272,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 151275741.0,
"step": 183
},
{
"entropy": 0.531158447265625,
"epoch": 2.067415730337079,
"grad_norm": 16.65219342556677,
"learning_rate": 3.92016186682789e-06,
"loss": 0.1808,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 152107378.0,
"step": 184
},
{
"entropy": 0.5382614135742188,
"epoch": 2.0786516853932584,
"grad_norm": 4.433323748901134,
"learning_rate": 3.907385768848656e-06,
"loss": 0.0996,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 152930819.0,
"step": 185
},
{
"entropy": 0.5397109985351562,
"epoch": 2.0898876404494384,
"grad_norm": 11.98756204808435,
"learning_rate": 3.894555633326642e-06,
"loss": 0.1189,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 153762988.0,
"step": 186
},
{
"entropy": 0.539337158203125,
"epoch": 2.101123595505618,
"grad_norm": 14.319094435898721,
"learning_rate": 3.88167195288371e-06,
"loss": 0.1407,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 154587742.0,
"step": 187
},
{
"entropy": 0.5379104614257812,
"epoch": 2.1123595505617976,
"grad_norm": 6.448338570022156,
"learning_rate": 3.868735222197614e-06,
"loss": 0.1047,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 155401431.0,
"step": 188
},
{
"entropy": 0.551666259765625,
"epoch": 2.1235955056179776,
"grad_norm": 11.070470267067817,
"learning_rate": 3.85574593798301e-06,
"loss": 0.1434,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 156232809.0,
"step": 189
},
{
"entropy": 0.5646133422851562,
"epoch": 2.134831460674157,
"grad_norm": 12.984647361123118,
"learning_rate": 3.842704598972384e-06,
"loss": 0.1244,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 157038066.0,
"step": 190
},
{
"entropy": 0.5512008666992188,
"epoch": 2.146067415730337,
"grad_norm": 7.571198459069707,
"learning_rate": 3.8296117058969e-06,
"loss": 0.0925,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 157847874.0,
"step": 191
},
{
"entropy": 0.538909912109375,
"epoch": 2.157303370786517,
"grad_norm": 3.498838742080043,
"learning_rate": 3.816467761467175e-06,
"loss": 0.0871,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 158683400.0,
"step": 192
},
{
"entropy": 0.5359344482421875,
"epoch": 2.168539325842697,
"grad_norm": 6.154847561172168,
"learning_rate": 3.80327327035398e-06,
"loss": 0.1039,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 159559442.0,
"step": 193
},
{
"entropy": 0.5301971435546875,
"epoch": 2.1797752808988764,
"grad_norm": 3.464618502492379,
"learning_rate": 3.7900287391688584e-06,
"loss": 0.0758,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 160409769.0,
"step": 194
},
{
"entropy": 0.5456008911132812,
"epoch": 2.191011235955056,
"grad_norm": 9.16083029440977,
"learning_rate": 3.776734676444678e-06,
"loss": 0.1088,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 161215428.0,
"step": 195
},
{
"entropy": 0.5370635986328125,
"epoch": 2.202247191011236,
"grad_norm": 8.727730015955157,
"learning_rate": 3.763391592616104e-06,
"loss": 0.098,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 162038062.0,
"step": 196
},
{
"entropy": 0.5242767333984375,
"epoch": 2.2134831460674156,
"grad_norm": 9.7605350617224,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.1193,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 162898167.0,
"step": 197
},
{
"entropy": 0.5362014770507812,
"epoch": 2.2247191011235956,
"grad_norm": 11.751831338500692,
"learning_rate": 3.7365604127757584e-06,
"loss": 0.1232,
"mean_token_accuracy": 0.9570312525611371,
"num_tokens": 163737294.0,
"step": 198
},
{
"entropy": 0.5273818969726562,
"epoch": 2.235955056179775,
"grad_norm": 5.446400766116038,
"learning_rate": 3.7230733469655554e-06,
"loss": 0.0746,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 164576231.0,
"step": 199
},
{
"entropy": 0.5343704223632812,
"epoch": 2.247191011235955,
"grad_norm": 15.034561777079416,
"learning_rate": 3.709539320414544e-06,
"loss": 0.1336,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 165421397.0,
"step": 200
},
{
"entropy": 0.537506103515625,
"epoch": 2.258426966292135,
"grad_norm": 16.63689156589337,
"learning_rate": 3.6959588527709635e-06,
"loss": 0.1461,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 166255000.0,
"step": 201
},
{
"entropy": 0.54998779296875,
"epoch": 2.2696629213483144,
"grad_norm": 6.385410420428761,
"learning_rate": 3.6823324654661923e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 167062178.0,
"step": 202
},
{
"entropy": 0.536407470703125,
"epoch": 2.2808988764044944,
"grad_norm": 5.6791741210641815,
"learning_rate": 3.6686606816947264e-06,
"loss": 0.078,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 167880704.0,
"step": 203
},
{
"entropy": 0.5526046752929688,
"epoch": 2.292134831460674,
"grad_norm": 7.912547116557072,
"learning_rate": 3.6549440263940878e-06,
"loss": 0.0945,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 168691010.0,
"step": 204
},
{
"entropy": 0.5461044311523438,
"epoch": 2.303370786516854,
"grad_norm": 6.52901004838527,
"learning_rate": 3.6411830262246755e-06,
"loss": 0.0935,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 169509964.0,
"step": 205
},
{
"entropy": 0.537628173828125,
"epoch": 2.3146067415730336,
"grad_norm": 2.6353286532560545,
"learning_rate": 3.627378209549537e-06,
"loss": 0.0706,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 170357405.0,
"step": 206
},
{
"entropy": 0.5371932983398438,
"epoch": 2.3258426966292136,
"grad_norm": 9.812177322775737,
"learning_rate": 3.6135301064140856e-06,
"loss": 0.1168,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 171203221.0,
"step": 207
},
{
"entropy": 0.5315933227539062,
"epoch": 2.337078651685393,
"grad_norm": 10.239216110161607,
"learning_rate": 3.599639248525749e-06,
"loss": 0.112,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 172038636.0,
"step": 208
},
{
"entropy": 0.53472900390625,
"epoch": 2.348314606741573,
"grad_norm": 3.8454624292214628,
"learning_rate": 3.5857061692335503e-06,
"loss": 0.0641,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 172864323.0,
"step": 209
},
{
"entropy": 0.5257797241210938,
"epoch": 2.359550561797753,
"grad_norm": 7.821223582969504,
"learning_rate": 3.5717314035076355e-06,
"loss": 0.09,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 173727784.0,
"step": 210
},
{
"entropy": 0.5433807373046875,
"epoch": 2.370786516853933,
"grad_norm": 7.137283108304516,
"learning_rate": 3.5577154879187286e-06,
"loss": 0.0816,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 174532281.0,
"step": 211
},
{
"entropy": 0.5399627685546875,
"epoch": 2.3820224719101124,
"grad_norm": 4.79769687231269,
"learning_rate": 3.5436589606175296e-06,
"loss": 0.0794,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 175364357.0,
"step": 212
},
{
"entropy": 0.5312271118164062,
"epoch": 2.393258426966292,
"grad_norm": 6.896383560835884,
"learning_rate": 3.5295623613140563e-06,
"loss": 0.0749,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 176186569.0,
"step": 213
},
{
"entropy": 0.527679443359375,
"epoch": 2.404494382022472,
"grad_norm": 8.908697981712665,
"learning_rate": 3.5154262312569134e-06,
"loss": 0.0701,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 177025300.0,
"step": 214
},
{
"entropy": 0.5305328369140625,
"epoch": 2.4157303370786516,
"grad_norm": 6.818692017843762,
"learning_rate": 3.501251113212521e-06,
"loss": 0.0667,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 177873635.0,
"step": 215
},
{
"entropy": 0.5447006225585938,
"epoch": 2.4269662921348316,
"grad_norm": 3.8165153692265603,
"learning_rate": 3.4870375514442677e-06,
"loss": 0.0733,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 178670872.0,
"step": 216
},
{
"entropy": 0.5322647094726562,
"epoch": 2.438202247191011,
"grad_norm": 5.369901004672838,
"learning_rate": 3.4727860916916143e-06,
"loss": 0.0663,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 179491783.0,
"step": 217
},
{
"entropy": 0.5320358276367188,
"epoch": 2.449438202247191,
"grad_norm": 4.555500008711359,
"learning_rate": 3.458497281149143e-06,
"loss": 0.0611,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 180317316.0,
"step": 218
},
{
"entropy": 0.5376434326171875,
"epoch": 2.460674157303371,
"grad_norm": 4.071209493199293,
"learning_rate": 3.444171668445544e-06,
"loss": 0.0542,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 181148889.0,
"step": 219
},
{
"entropy": 0.5237350463867188,
"epoch": 2.4719101123595504,
"grad_norm": 11.47920307652276,
"learning_rate": 3.429809803622551e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 182012448.0,
"step": 220
},
{
"entropy": 0.5371170043945312,
"epoch": 2.4831460674157304,
"grad_norm": 5.447244205245656,
"learning_rate": 3.415412238113823e-06,
"loss": 0.0493,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 182831219.0,
"step": 221
},
{
"entropy": 0.5419769287109375,
"epoch": 2.49438202247191,
"grad_norm": 3.402318570494703,
"learning_rate": 3.400979524723773e-06,
"loss": 0.0463,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 183649085.0,
"step": 222
},
{
"entropy": 0.5189590454101562,
"epoch": 2.50561797752809,
"grad_norm": 4.872229888332632,
"learning_rate": 3.386512217606339e-06,
"loss": 0.0534,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 184524290.0,
"step": 223
},
{
"entropy": 0.5490798950195312,
"epoch": 2.5168539325842696,
"grad_norm": 6.35163899835482,
"learning_rate": 3.372010872243711e-06,
"loss": 0.0593,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 185304697.0,
"step": 224
},
{
"entropy": 0.541900634765625,
"epoch": 2.5280898876404496,
"grad_norm": 3.75106123980352,
"learning_rate": 3.357476045424998e-06,
"loss": 0.0372,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 186091630.0,
"step": 225
},
{
"entropy": 0.5357437133789062,
"epoch": 2.539325842696629,
"grad_norm": 15.183650700034958,
"learning_rate": 3.342908295224854e-06,
"loss": 0.0698,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 186900960.0,
"step": 226
},
{
"entropy": 0.5135269165039062,
"epoch": 2.550561797752809,
"grad_norm": 7.11769951958488,
"learning_rate": 3.32830818098205e-06,
"loss": 0.0501,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 187751474.0,
"step": 227
},
{
"entropy": 0.5372314453125,
"epoch": 2.561797752808989,
"grad_norm": 4.989153243587186,
"learning_rate": 3.313676263277995e-06,
"loss": 0.0487,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 188559110.0,
"step": 228
},
{
"entropy": 0.5383682250976562,
"epoch": 2.5730337078651684,
"grad_norm": 6.827867800336417,
"learning_rate": 3.299013103915214e-06,
"loss": 0.0713,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 189360657.0,
"step": 229
},
{
"entropy": 0.5142898559570312,
"epoch": 2.5842696629213484,
"grad_norm": 8.36796100398585,
"learning_rate": 3.2843192658957775e-06,
"loss": 0.0759,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 190208504.0,
"step": 230
},
{
"entropy": 0.5430984497070312,
"epoch": 2.595505617977528,
"grad_norm": 6.384555684576824,
"learning_rate": 3.269595313399683e-06,
"loss": 0.0791,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 191011578.0,
"step": 231
},
{
"entropy": 0.51165771484375,
"epoch": 2.606741573033708,
"grad_norm": 7.460899194764321,
"learning_rate": 3.2548418117631952e-06,
"loss": 0.048,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 191877386.0,
"step": 232
},
{
"entropy": 0.5298843383789062,
"epoch": 2.6179775280898876,
"grad_norm": 6.821805872662446,
"learning_rate": 3.240059327457138e-06,
"loss": 0.0529,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 192727467.0,
"step": 233
},
{
"entropy": 0.5463333129882812,
"epoch": 2.629213483146067,
"grad_norm": 3.4805806068994416,
"learning_rate": 3.2252484280651453e-06,
"loss": 0.0511,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 193508507.0,
"step": 234
},
{
"entropy": 0.5321044921875,
"epoch": 2.640449438202247,
"grad_norm": 4.219564851577349,
"learning_rate": 3.2104096822618657e-06,
"loss": 0.0524,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 194340559.0,
"step": 235
},
{
"entropy": 0.5420913696289062,
"epoch": 2.6516853932584272,
"grad_norm": 2.9827980950480515,
"learning_rate": 3.195543659791132e-06,
"loss": 0.0342,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 195164876.0,
"step": 236
},
{
"entropy": 0.5332260131835938,
"epoch": 2.662921348314607,
"grad_norm": 5.696614902089947,
"learning_rate": 3.1806509314440827e-06,
"loss": 0.0437,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 195980793.0,
"step": 237
},
{
"entropy": 0.5210113525390625,
"epoch": 2.6741573033707864,
"grad_norm": 7.372573411260061,
"learning_rate": 3.1657320690372464e-06,
"loss": 0.0588,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 196834051.0,
"step": 238
},
{
"entropy": 0.54461669921875,
"epoch": 2.6853932584269664,
"grad_norm": 4.804908333028152,
"learning_rate": 3.150787645390587e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 197624113.0,
"step": 239
},
{
"entropy": 0.527008056640625,
"epoch": 2.696629213483146,
"grad_norm": 5.494419605196046,
"learning_rate": 3.135818234305511e-06,
"loss": 0.0502,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 198465881.0,
"step": 240
},
{
"entropy": 0.5293502807617188,
"epoch": 2.7078651685393256,
"grad_norm": 4.222259777927373,
"learning_rate": 3.120824410542833e-06,
"loss": 0.0319,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 199286078.0,
"step": 241
},
{
"entropy": 0.51971435546875,
"epoch": 2.7191011235955056,
"grad_norm": 9.126867635985343,
"learning_rate": 3.1058067498007094e-06,
"loss": 0.0521,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 200114610.0,
"step": 242
},
{
"entropy": 0.5160369873046875,
"epoch": 2.7303370786516856,
"grad_norm": 7.329097197987436,
"learning_rate": 3.090765828692534e-06,
"loss": 0.046,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 200955034.0,
"step": 243
},
{
"entropy": 0.5411148071289062,
"epoch": 2.741573033707865,
"grad_norm": 3.340662077032214,
"learning_rate": 3.0757022247248e-06,
"loss": 0.0318,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 201751463.0,
"step": 244
},
{
"entropy": 0.5378646850585938,
"epoch": 2.752808988764045,
"grad_norm": 4.8775745170805065,
"learning_rate": 3.0606165162749212e-06,
"loss": 0.0494,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 202542823.0,
"step": 245
},
{
"entropy": 0.5283050537109375,
"epoch": 2.764044943820225,
"grad_norm": 3.9452834388750384,
"learning_rate": 3.045509282569031e-06,
"loss": 0.0347,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 203354246.0,
"step": 246
},
{
"entropy": 0.5169830322265625,
"epoch": 2.7752808988764044,
"grad_norm": 5.491362292823874,
"learning_rate": 3.0303811036597395e-06,
"loss": 0.0388,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 204200941.0,
"step": 247
},
{
"entropy": 0.5260467529296875,
"epoch": 2.7865168539325844,
"grad_norm": 6.201924151041956,
"learning_rate": 3.01523256040386e-06,
"loss": 0.0508,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 205037033.0,
"step": 248
},
{
"entropy": 0.5252304077148438,
"epoch": 2.797752808988764,
"grad_norm": 6.457190267020416,
"learning_rate": 3.0000642344401115e-06,
"loss": 0.056,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 205854828.0,
"step": 249
},
{
"entropy": 0.5137939453125,
"epoch": 2.808988764044944,
"grad_norm": 4.69210936983822,
"learning_rate": 2.9848767081667823e-06,
"loss": 0.0311,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 206719096.0,
"step": 250
},
{
"entropy": 0.5233001708984375,
"epoch": 2.8202247191011236,
"grad_norm": 3.4937557726205766,
"learning_rate": 2.9696705647193695e-06,
"loss": 0.032,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 207564713.0,
"step": 251
},
{
"entropy": 0.5282363891601562,
"epoch": 2.831460674157303,
"grad_norm": 3.0073778425878825,
"learning_rate": 2.9544463879481914e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 208358356.0,
"step": 252
},
{
"entropy": 0.5185775756835938,
"epoch": 2.842696629213483,
"grad_norm": 4.817956965116018,
"learning_rate": 2.9392047623959653e-06,
"loss": 0.0466,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 209186191.0,
"step": 253
},
{
"entropy": 0.522125244140625,
"epoch": 2.853932584269663,
"grad_norm": 3.029972926905953,
"learning_rate": 2.923946273275369e-06,
"loss": 0.0231,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 210020565.0,
"step": 254
},
{
"entropy": 0.5248794555664062,
"epoch": 2.865168539325843,
"grad_norm": 3.3353822683100467,
"learning_rate": 2.908671506446566e-06,
"loss": 0.0347,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 210826562.0,
"step": 255
},
{
"entropy": 0.519561767578125,
"epoch": 2.8764044943820224,
"grad_norm": 4.164606363683451,
"learning_rate": 2.8933810483947156e-06,
"loss": 0.042,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 211637830.0,
"step": 256
},
{
"entropy": 0.5108184814453125,
"epoch": 2.8876404494382024,
"grad_norm": 3.4352370759093667,
"learning_rate": 2.878075486207452e-06,
"loss": 0.0241,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 212483603.0,
"step": 257
},
{
"entropy": 0.5216064453125,
"epoch": 2.898876404494382,
"grad_norm": 4.264768762251155,
"learning_rate": 2.8627554075523426e-06,
"loss": 0.0264,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 213277086.0,
"step": 258
},
{
"entropy": 0.5249557495117188,
"epoch": 2.9101123595505616,
"grad_norm": 5.873772509100592,
"learning_rate": 2.8474214006543255e-06,
"loss": 0.0414,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 214086307.0,
"step": 259
},
{
"entropy": 0.5169525146484375,
"epoch": 2.9213483146067416,
"grad_norm": 4.060622338334745,
"learning_rate": 2.832074054273121e-06,
"loss": 0.0384,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 214907128.0,
"step": 260
},
{
"entropy": 0.5202713012695312,
"epoch": 2.932584269662921,
"grad_norm": 7.348539660999507,
"learning_rate": 2.8167139576806306e-06,
"loss": 0.0608,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 215725903.0,
"step": 261
},
{
"entropy": 0.5224227905273438,
"epoch": 2.943820224719101,
"grad_norm": 3.453263597151809,
"learning_rate": 2.8013417006383078e-06,
"loss": 0.0218,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 216535618.0,
"step": 262
},
{
"entropy": 0.522552490234375,
"epoch": 2.955056179775281,
"grad_norm": 4.453941784446791,
"learning_rate": 2.7859578733745153e-06,
"loss": 0.0289,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 217345581.0,
"step": 263
},
{
"entropy": 0.51446533203125,
"epoch": 2.966292134831461,
"grad_norm": 4.032638412989834,
"learning_rate": 2.7705630665618605e-06,
"loss": 0.0315,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 218173244.0,
"step": 264
},
{
"entropy": 0.5102767944335938,
"epoch": 2.9775280898876404,
"grad_norm": 5.378680439331005,
"learning_rate": 2.755157871294521e-06,
"loss": 0.0196,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 219012419.0,
"step": 265
},
{
"entropy": 0.5094451904296875,
"epoch": 2.98876404494382,
"grad_norm": 5.4791488171443925,
"learning_rate": 2.7397428790655447e-06,
"loss": 0.0292,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 219854421.0,
"step": 266
},
{
"entropy": 0.5302352905273438,
"epoch": 3.0,
"grad_norm": 3.8596785746166216,
"learning_rate": 2.7243186817441403e-06,
"loss": 0.0315,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 220623700.0,
"step": 267
},
{
"entropy": 0.5127182006835938,
"epoch": 3.0112359550561796,
"grad_norm": 4.57819470414012,
"learning_rate": 2.708885871552954e-06,
"loss": 0.0356,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 221452033.0,
"step": 268
},
{
"entropy": 0.5211715698242188,
"epoch": 3.0224719101123596,
"grad_norm": 3.854103160052815,
"learning_rate": 2.693445041045326e-06,
"loss": 0.0398,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 222254082.0,
"step": 269
},
{
"entropy": 0.5071258544921875,
"epoch": 3.033707865168539,
"grad_norm": 3.1948641595887493,
"learning_rate": 2.6779967830825454e-06,
"loss": 0.0276,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 223083493.0,
"step": 270
},
{
"entropy": 0.508270263671875,
"epoch": 3.044943820224719,
"grad_norm": 5.1040099580837355,
"learning_rate": 2.6625416908110825e-06,
"loss": 0.0206,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 223934861.0,
"step": 271
},
{
"entropy": 0.5089187622070312,
"epoch": 3.056179775280899,
"grad_norm": 3.5217613203200178,
"learning_rate": 2.647080357639813e-06,
"loss": 0.03,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 224772447.0,
"step": 272
},
{
"entropy": 0.5082550048828125,
"epoch": 3.067415730337079,
"grad_norm": 3.717919821676401,
"learning_rate": 2.6316133772172403e-06,
"loss": 0.0229,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 225620881.0,
"step": 273
},
{
"entropy": 0.528961181640625,
"epoch": 3.0786516853932584,
"grad_norm": 4.126696150915157,
"learning_rate": 2.616141343408696e-06,
"loss": 0.0278,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 226398051.0,
"step": 274
},
{
"entropy": 0.5100021362304688,
"epoch": 3.0898876404494384,
"grad_norm": 4.936464408938361,
"learning_rate": 2.6006648502735384e-06,
"loss": 0.0351,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 227221027.0,
"step": 275
},
{
"entropy": 0.5383224487304688,
"epoch": 3.101123595505618,
"grad_norm": 4.911326562954185,
"learning_rate": 2.5851844920423473e-06,
"loss": 0.0391,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 227998038.0,
"step": 276
},
{
"entropy": 0.5119400024414062,
"epoch": 3.1123595505617976,
"grad_norm": 4.035362852199955,
"learning_rate": 2.569700863094104e-06,
"loss": 0.0225,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 228848791.0,
"step": 277
},
{
"entropy": 0.5259246826171875,
"epoch": 3.1235955056179776,
"grad_norm": 4.907222005633188,
"learning_rate": 2.554214557933372e-06,
"loss": 0.0333,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 229650218.0,
"step": 278
},
{
"entropy": 0.5128173828125,
"epoch": 3.134831460674157,
"grad_norm": 3.6298141208358325,
"learning_rate": 2.5387261711674695e-06,
"loss": 0.0213,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 230506406.0,
"step": 279
},
{
"entropy": 0.51739501953125,
"epoch": 3.146067415730337,
"grad_norm": 2.16036446857172,
"learning_rate": 2.5232362974836394e-06,
"loss": 0.0234,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 231317938.0,
"step": 280
},
{
"entropy": 0.5114364624023438,
"epoch": 3.157303370786517,
"grad_norm": 3.7210989346070695,
"learning_rate": 2.507745531626215e-06,
"loss": 0.0226,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 232180619.0,
"step": 281
},
{
"entropy": 0.5086212158203125,
"epoch": 3.168539325842697,
"grad_norm": 6.879747676119053,
"learning_rate": 2.4922544683737857e-06,
"loss": 0.0316,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 233015053.0,
"step": 282
},
{
"entropy": 0.5146713256835938,
"epoch": 3.1797752808988764,
"grad_norm": 5.482318170907438,
"learning_rate": 2.4767637025163614e-06,
"loss": 0.0345,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 233833558.0,
"step": 283
},
{
"entropy": 0.5211105346679688,
"epoch": 3.191011235955056,
"grad_norm": 3.2491463558151317,
"learning_rate": 2.461273828832531e-06,
"loss": 0.0214,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 234627558.0,
"step": 284
},
{
"entropy": 0.5092926025390625,
"epoch": 3.202247191011236,
"grad_norm": 5.031466393488865,
"learning_rate": 2.445785442066628e-06,
"loss": 0.0321,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 235468401.0,
"step": 285
},
{
"entropy": 0.5166778564453125,
"epoch": 3.2134831460674156,
"grad_norm": 10.076323689534382,
"learning_rate": 2.4302991369058963e-06,
"loss": 0.037,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 236277777.0,
"step": 286
},
{
"entropy": 0.5113983154296875,
"epoch": 3.2247191011235956,
"grad_norm": 3.247168354051238,
"learning_rate": 2.414815507957653e-06,
"loss": 0.022,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 237109698.0,
"step": 287
},
{
"entropy": 0.5170822143554688,
"epoch": 3.235955056179775,
"grad_norm": 4.8226389846288065,
"learning_rate": 2.399335149726463e-06,
"loss": 0.0283,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 237913805.0,
"step": 288
},
{
"entropy": 0.5075225830078125,
"epoch": 3.247191011235955,
"grad_norm": 4.782854834657138,
"learning_rate": 2.3838586565913053e-06,
"loss": 0.0215,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 238760016.0,
"step": 289
},
{
"entropy": 0.5094528198242188,
"epoch": 3.258426966292135,
"grad_norm": 3.208516002299857,
"learning_rate": 2.3683866227827605e-06,
"loss": 0.0166,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 239607946.0,
"step": 290
},
{
"entropy": 0.516571044921875,
"epoch": 3.2696629213483144,
"grad_norm": 4.503497884660067,
"learning_rate": 2.352919642360188e-06,
"loss": 0.0372,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 240438345.0,
"step": 291
},
{
"entropy": 0.523040771484375,
"epoch": 3.2808988764044944,
"grad_norm": 6.357036147397838,
"learning_rate": 2.3374583091889188e-06,
"loss": 0.027,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 241241845.0,
"step": 292
},
{
"entropy": 0.5255355834960938,
"epoch": 3.292134831460674,
"grad_norm": 6.159491750149713,
"learning_rate": 2.322003216917455e-06,
"loss": 0.0348,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 242048957.0,
"step": 293
},
{
"entropy": 0.5313034057617188,
"epoch": 3.303370786516854,
"grad_norm": 2.767906707454989,
"learning_rate": 2.3065549589546747e-06,
"loss": 0.0155,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 242812266.0,
"step": 294
},
{
"entropy": 0.5044479370117188,
"epoch": 3.3146067415730336,
"grad_norm": 3.249806139136359,
"learning_rate": 2.2911141284470466e-06,
"loss": 0.0234,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 243649058.0,
"step": 295
},
{
"entropy": 0.5091476440429688,
"epoch": 3.3258426966292136,
"grad_norm": 3.279106758362481,
"learning_rate": 2.27568131825586e-06,
"loss": 0.0169,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 244473742.0,
"step": 296
},
{
"entropy": 0.49976348876953125,
"epoch": 3.337078651685393,
"grad_norm": 4.399014190769296,
"learning_rate": 2.260257120934456e-06,
"loss": 0.0219,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 245334162.0,
"step": 297
},
{
"entropy": 0.5103302001953125,
"epoch": 3.348314606741573,
"grad_norm": 3.379998191266739,
"learning_rate": 2.2448421287054794e-06,
"loss": 0.0195,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 246164496.0,
"step": 298
},
{
"entropy": 0.5167922973632812,
"epoch": 3.359550561797753,
"grad_norm": 6.348607538192022,
"learning_rate": 2.229436933438141e-06,
"loss": 0.0342,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 246957857.0,
"step": 299
},
{
"entropy": 0.520721435546875,
"epoch": 3.370786516853933,
"grad_norm": 3.7378441121949244,
"learning_rate": 2.214042126625486e-06,
"loss": 0.0264,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 247790646.0,
"step": 300
},
{
"entropy": 0.5183486938476562,
"epoch": 3.3820224719101124,
"grad_norm": 4.811164098866914,
"learning_rate": 2.1986582993616926e-06,
"loss": 0.0317,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 248596791.0,
"step": 301
},
{
"entropy": 0.5113983154296875,
"epoch": 3.393258426966292,
"grad_norm": 3.592413843287222,
"learning_rate": 2.1832860423193703e-06,
"loss": 0.0224,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 249439466.0,
"step": 302
},
{
"entropy": 0.50799560546875,
"epoch": 3.404494382022472,
"grad_norm": 3.807110419678122,
"learning_rate": 2.1679259457268796e-06,
"loss": 0.0262,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 250296069.0,
"step": 303
},
{
"entropy": 0.5160369873046875,
"epoch": 3.4157303370786516,
"grad_norm": 3.5332255502622822,
"learning_rate": 2.1525785993456753e-06,
"loss": 0.0155,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 251137734.0,
"step": 304
},
{
"entropy": 0.5208358764648438,
"epoch": 3.4269662921348316,
"grad_norm": 4.105991998447796,
"learning_rate": 2.1372445924476578e-06,
"loss": 0.0168,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 251948138.0,
"step": 305
},
{
"entropy": 0.503387451171875,
"epoch": 3.438202247191011,
"grad_norm": 7.115594516406741,
"learning_rate": 2.1219245137925482e-06,
"loss": 0.0234,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 252783179.0,
"step": 306
},
{
"entropy": 0.5314254760742188,
"epoch": 3.449438202247191,
"grad_norm": 1.9885695184447987,
"learning_rate": 2.1066189516052848e-06,
"loss": 0.0194,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 253561788.0,
"step": 307
},
{
"entropy": 0.5133819580078125,
"epoch": 3.460674157303371,
"grad_norm": 3.5082179529267936,
"learning_rate": 2.0913284935534345e-06,
"loss": 0.0196,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 254385361.0,
"step": 308
},
{
"entropy": 0.5140228271484375,
"epoch": 3.4719101123595504,
"grad_norm": 4.022328315511428,
"learning_rate": 2.0760537267246316e-06,
"loss": 0.0248,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 255224425.0,
"step": 309
},
{
"entropy": 0.5027923583984375,
"epoch": 3.4831460674157304,
"grad_norm": 2.9944684642176997,
"learning_rate": 2.0607952376040355e-06,
"loss": 0.0181,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 256044678.0,
"step": 310
},
{
"entropy": 0.52716064453125,
"epoch": 3.49438202247191,
"grad_norm": 4.504770884220794,
"learning_rate": 2.0455536120518094e-06,
"loss": 0.0384,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 256832233.0,
"step": 311
},
{
"entropy": 0.5007553100585938,
"epoch": 3.50561797752809,
"grad_norm": 2.942114577376295,
"learning_rate": 2.0303294352806313e-06,
"loss": 0.0238,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 257686727.0,
"step": 312
},
{
"entropy": 0.49721527099609375,
"epoch": 3.5168539325842696,
"grad_norm": 2.116297926501953,
"learning_rate": 2.0151232918332186e-06,
"loss": 0.0104,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 258546281.0,
"step": 313
},
{
"entropy": 0.5155029296875,
"epoch": 3.5280898876404496,
"grad_norm": 3.7890686933117177,
"learning_rate": 1.9999357655598894e-06,
"loss": 0.0164,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 259354523.0,
"step": 314
},
{
"entropy": 0.49981689453125,
"epoch": 3.539325842696629,
"grad_norm": 2.0113129181647937,
"learning_rate": 1.9847674395961407e-06,
"loss": 0.0133,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 260197353.0,
"step": 315
},
{
"entropy": 0.5087356567382812,
"epoch": 3.550561797752809,
"grad_norm": 4.964666181582898,
"learning_rate": 1.9696188963402613e-06,
"loss": 0.0281,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 261008532.0,
"step": 316
},
{
"entropy": 0.5017013549804688,
"epoch": 3.561797752808989,
"grad_norm": 3.199383699793436,
"learning_rate": 1.9544907174309693e-06,
"loss": 0.0134,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 261848586.0,
"step": 317
},
{
"entropy": 0.5186920166015625,
"epoch": 3.5730337078651684,
"grad_norm": 4.0141254096173205,
"learning_rate": 1.939383483725079e-06,
"loss": 0.012,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 262642445.0,
"step": 318
},
{
"entropy": 0.5063323974609375,
"epoch": 3.5842696629213484,
"grad_norm": 3.668558882237143,
"learning_rate": 1.9242977752752006e-06,
"loss": 0.0143,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 263451697.0,
"step": 319
},
{
"entropy": 0.5094070434570312,
"epoch": 3.595505617977528,
"grad_norm": 4.487986811292986,
"learning_rate": 1.909234171307466e-06,
"loss": 0.0213,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 264268081.0,
"step": 320
},
{
"entropy": 0.49770355224609375,
"epoch": 3.606741573033708,
"grad_norm": 3.3550283768698588,
"learning_rate": 1.8941932501992915e-06,
"loss": 0.013,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 265096782.0,
"step": 321
},
{
"entropy": 0.5041961669921875,
"epoch": 3.6179775280898876,
"grad_norm": 5.995755473364592,
"learning_rate": 1.879175589457168e-06,
"loss": 0.0188,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 265912003.0,
"step": 322
},
{
"entropy": 0.48159027099609375,
"epoch": 3.629213483146067,
"grad_norm": 2.3130150611335965,
"learning_rate": 1.8641817656944894e-06,
"loss": 0.0067,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 266788683.0,
"step": 323
},
{
"entropy": 0.48583221435546875,
"epoch": 3.640449438202247,
"grad_norm": 4.083263934783113,
"learning_rate": 1.8492123546094132e-06,
"loss": 0.0126,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 267649106.0,
"step": 324
},
{
"entropy": 0.49964141845703125,
"epoch": 3.6516853932584272,
"grad_norm": 0.4203389488288955,
"learning_rate": 1.8342679309627545e-06,
"loss": 0.0027,
"mean_token_accuracy": 1.0,
"num_tokens": 268474651.0,
"step": 325
},
{
"entropy": 0.5025482177734375,
"epoch": 3.662921348314607,
"grad_norm": 5.90761497919095,
"learning_rate": 1.8193490685559179e-06,
"loss": 0.0369,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 269291928.0,
"step": 326
},
{
"entropy": 0.49892425537109375,
"epoch": 3.6741573033707864,
"grad_norm": 4.346760130845783,
"learning_rate": 1.8044563402088686e-06,
"loss": 0.028,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 270116488.0,
"step": 327
},
{
"entropy": 0.5001907348632812,
"epoch": 3.6853932584269664,
"grad_norm": 16.271240321339146,
"learning_rate": 1.7895903177381351e-06,
"loss": 0.0356,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 270917370.0,
"step": 328
},
{
"entropy": 0.46492767333984375,
"epoch": 3.696629213483146,
"grad_norm": 4.470491736532796,
"learning_rate": 1.7747515719348551e-06,
"loss": 0.011,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 271826297.0,
"step": 329
},
{
"entropy": 0.4820098876953125,
"epoch": 3.7078651685393256,
"grad_norm": 7.212866002958275,
"learning_rate": 1.759940672542862e-06,
"loss": 0.033,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 272694273.0,
"step": 330
},
{
"entropy": 0.482574462890625,
"epoch": 3.7191011235955056,
"grad_norm": 7.8089400803404985,
"learning_rate": 1.7451581882368052e-06,
"loss": 0.0181,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 273560920.0,
"step": 331
},
{
"entropy": 0.49365997314453125,
"epoch": 3.7303370786516856,
"grad_norm": 9.96198480242964,
"learning_rate": 1.7304046866003183e-06,
"loss": 0.025,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 274386758.0,
"step": 332
},
{
"entropy": 0.49715423583984375,
"epoch": 3.741573033707865,
"grad_norm": 6.256653146358632,
"learning_rate": 1.7156807341042242e-06,
"loss": 0.0172,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 275211665.0,
"step": 333
},
{
"entropy": 0.4954986572265625,
"epoch": 3.752808988764045,
"grad_norm": 5.4914007926781,
"learning_rate": 1.700986896084787e-06,
"loss": 0.0131,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 276029702.0,
"step": 334
},
{
"entropy": 0.50457763671875,
"epoch": 3.764044943820225,
"grad_norm": 5.3972041785495835,
"learning_rate": 1.686323736722006e-06,
"loss": 0.0209,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 276871811.0,
"step": 335
},
{
"entropy": 0.4974365234375,
"epoch": 3.7752808988764044,
"grad_norm": 1.1708667866046776,
"learning_rate": 1.671691819017951e-06,
"loss": 0.0032,
"mean_token_accuracy": 1.0,
"num_tokens": 277714646.0,
"step": 336
},
{
"entropy": 0.5124359130859375,
"epoch": 3.7865168539325844,
"grad_norm": 3.7431018922660204,
"learning_rate": 1.6570917047751465e-06,
"loss": 0.0083,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 278518639.0,
"step": 337
},
{
"entropy": 0.4889068603515625,
"epoch": 3.797752808988764,
"grad_norm": 6.144159379632208,
"learning_rate": 1.642523954575003e-06,
"loss": 0.0262,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 279367205.0,
"step": 338
},
{
"entropy": 0.49878692626953125,
"epoch": 3.808988764044944,
"grad_norm": 3.751432165154299,
"learning_rate": 1.6279891277562896e-06,
"loss": 0.0147,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 280179340.0,
"step": 339
},
{
"entropy": 0.48464202880859375,
"epoch": 3.8202247191011236,
"grad_norm": 3.0121999885704795,
"learning_rate": 1.613487782393661e-06,
"loss": 0.0074,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 281043372.0,
"step": 340
},
{
"entropy": 0.5075607299804688,
"epoch": 3.831460674157303,
"grad_norm": 3.506062048383523,
"learning_rate": 1.5990204752762273e-06,
"loss": 0.0082,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 281856207.0,
"step": 341
},
{
"entropy": 0.485870361328125,
"epoch": 3.842696629213483,
"grad_norm": 3.0322880070112657,
"learning_rate": 1.5845877618861769e-06,
"loss": 0.017,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 282703908.0,
"step": 342
},
{
"entropy": 0.489654541015625,
"epoch": 3.853932584269663,
"grad_norm": 4.926358920639807,
"learning_rate": 1.5701901963774504e-06,
"loss": 0.0192,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 283548980.0,
"step": 343
},
{
"entropy": 0.4991302490234375,
"epoch": 3.865168539325843,
"grad_norm": 4.466227120509381,
"learning_rate": 1.555828331554457e-06,
"loss": 0.0179,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 284386913.0,
"step": 344
},
{
"entropy": 0.49823760986328125,
"epoch": 3.8764044943820224,
"grad_norm": 3.705201119031269,
"learning_rate": 1.5415027188508574e-06,
"loss": 0.0182,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 285220080.0,
"step": 345
},
{
"entropy": 0.49188995361328125,
"epoch": 3.8876404494382024,
"grad_norm": 5.0998857739487065,
"learning_rate": 1.5272139083083865e-06,
"loss": 0.0158,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 286072154.0,
"step": 346
},
{
"entropy": 0.48580169677734375,
"epoch": 3.898876404494382,
"grad_norm": 3.4875601342154177,
"learning_rate": 1.5129624485557331e-06,
"loss": 0.0073,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 286942986.0,
"step": 347
},
{
"entropy": 0.48960113525390625,
"epoch": 3.9101123595505616,
"grad_norm": 2.2828403000677064,
"learning_rate": 1.4987488867874798e-06,
"loss": 0.0084,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 287797004.0,
"step": 348
},
{
"entropy": 0.50335693359375,
"epoch": 3.9213483146067416,
"grad_norm": 3.1386584985605372,
"learning_rate": 1.4845737687430875e-06,
"loss": 0.0253,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 288618402.0,
"step": 349
},
{
"entropy": 0.50701904296875,
"epoch": 3.932584269662921,
"grad_norm": 4.172480270751928,
"learning_rate": 1.4704376386859447e-06,
"loss": 0.0137,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 289407298.0,
"step": 350
},
{
"entropy": 0.5020904541015625,
"epoch": 3.943820224719101,
"grad_norm": 2.5202804888657275,
"learning_rate": 1.4563410393824701e-06,
"loss": 0.012,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 290238447.0,
"step": 351
},
{
"entropy": 0.5082855224609375,
"epoch": 3.955056179775281,
"grad_norm": 4.712749356736079,
"learning_rate": 1.4422845120812718e-06,
"loss": 0.018,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 291036026.0,
"step": 352
},
{
"entropy": 0.49322509765625,
"epoch": 3.966292134831461,
"grad_norm": 1.9341080117017146,
"learning_rate": 1.4282685964923643e-06,
"loss": 0.0106,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 291893428.0,
"step": 353
},
{
"entropy": 0.501678466796875,
"epoch": 3.9775280898876404,
"grad_norm": 1.8281731406713833,
"learning_rate": 1.4142938307664505e-06,
"loss": 0.008,
"mean_token_accuracy": 1.0,
"num_tokens": 292697320.0,
"step": 354
},
{
"entropy": 0.5015640258789062,
"epoch": 3.98876404494382,
"grad_norm": 2.0972743594587158,
"learning_rate": 1.400360751474253e-06,
"loss": 0.0061,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 293502907.0,
"step": 355
},
{
"entropy": 0.5002670288085938,
"epoch": 4.0,
"grad_norm": 2.4603604078082357,
"learning_rate": 1.3864698935859153e-06,
"loss": 0.0092,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 294326570.0,
"step": 356
},
{
"entropy": 0.489013671875,
"epoch": 4.01123595505618,
"grad_norm": 1.0981283873031276,
"learning_rate": 1.3726217904504636e-06,
"loss": 0.0107,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 295155654.0,
"step": 357
},
{
"entropy": 0.4899444580078125,
"epoch": 4.022471910112359,
"grad_norm": 2.4658745249006575,
"learning_rate": 1.3588169737753258e-06,
"loss": 0.0045,
"mean_token_accuracy": 1.0,
"num_tokens": 296004651.0,
"step": 358
},
{
"entropy": 0.496856689453125,
"epoch": 4.033707865168539,
"grad_norm": 0.6986873706498641,
"learning_rate": 1.3450559736059126e-06,
"loss": 0.0032,
"mean_token_accuracy": 1.0,
"num_tokens": 296825273.0,
"step": 359
},
{
"entropy": 0.4771728515625,
"epoch": 4.044943820224719,
"grad_norm": 4.022638809306908,
"learning_rate": 1.3313393183052747e-06,
"loss": 0.0079,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 297661901.0,
"step": 360
},
{
"entropy": 0.48487091064453125,
"epoch": 4.056179775280899,
"grad_norm": 1.1734333901132759,
"learning_rate": 1.3176675345338085e-06,
"loss": 0.003,
"mean_token_accuracy": 1.0,
"num_tokens": 298495109.0,
"step": 361
},
{
"entropy": 0.47898101806640625,
"epoch": 4.067415730337078,
"grad_norm": 0.7234754463345818,
"learning_rate": 1.304041147229037e-06,
"loss": 0.0024,
"mean_token_accuracy": 1.0,
"num_tokens": 299343801.0,
"step": 362
},
{
"entropy": 0.498199462890625,
"epoch": 4.078651685393258,
"grad_norm": 1.5200640964253078,
"learning_rate": 1.2904606795854562e-06,
"loss": 0.0196,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 300156927.0,
"step": 363
},
{
"entropy": 0.48677825927734375,
"epoch": 4.089887640449438,
"grad_norm": 1.3478491381169233,
"learning_rate": 1.276926653034444e-06,
"loss": 0.0029,
"mean_token_accuracy": 1.0,
"num_tokens": 300997011.0,
"step": 364
},
{
"entropy": 0.48504638671875,
"epoch": 4.101123595505618,
"grad_norm": 4.927855969737134,
"learning_rate": 1.2634395872242433e-06,
"loss": 0.0114,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 301820560.0,
"step": 365
},
{
"entropy": 0.4751739501953125,
"epoch": 4.112359550561798,
"grad_norm": 4.779020236399108,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.0063,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 302678409.0,
"step": 366
},
{
"entropy": 0.4949951171875,
"epoch": 4.123595505617978,
"grad_norm": 0.7404055825881121,
"learning_rate": 1.2366084073838963e-06,
"loss": 0.0024,
"mean_token_accuracy": 1.0,
"num_tokens": 303479311.0,
"step": 367
},
{
"entropy": 0.485565185546875,
"epoch": 4.134831460674158,
"grad_norm": 2.8852964190089296,
"learning_rate": 1.223265323555323e-06,
"loss": 0.0123,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 304286629.0,
"step": 368
},
{
"entropy": 0.4863433837890625,
"epoch": 4.146067415730337,
"grad_norm": 1.910333236425596,
"learning_rate": 1.2099712608311426e-06,
"loss": 0.0112,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 305106480.0,
"step": 369
},
{
"entropy": 0.4847564697265625,
"epoch": 4.157303370786517,
"grad_norm": 10.011313102335771,
"learning_rate": 1.1967267296460208e-06,
"loss": 0.0088,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 305927120.0,
"step": 370
},
{
"entropy": 0.47748565673828125,
"epoch": 4.168539325842697,
"grad_norm": 0.36743206379299365,
"learning_rate": 1.183532238532826e-06,
"loss": 0.0019,
"mean_token_accuracy": 1.0,
"num_tokens": 306771916.0,
"step": 371
},
{
"entropy": 0.49149322509765625,
"epoch": 4.179775280898877,
"grad_norm": 0.4732089401749855,
"learning_rate": 1.1703882941031012e-06,
"loss": 0.0021,
"mean_token_accuracy": 1.0,
"num_tokens": 307565025.0,
"step": 372
},
{
"entropy": 0.48378753662109375,
"epoch": 4.191011235955056,
"grad_norm": 0.7993030080623406,
"learning_rate": 1.157295401027616e-06,
"loss": 0.0021,
"mean_token_accuracy": 1.0,
"num_tokens": 308380624.0,
"step": 373
},
{
"entropy": 0.4936065673828125,
"epoch": 4.202247191011236,
"grad_norm": 4.721093006577871,
"learning_rate": 1.1442540620169906e-06,
"loss": 0.0149,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 309224060.0,
"step": 374
},
{
"entropy": 0.47303009033203125,
"epoch": 4.213483146067416,
"grad_norm": 0.6684526220228418,
"learning_rate": 1.131264777802387e-06,
"loss": 0.002,
"mean_token_accuracy": 1.0,
"num_tokens": 310063659.0,
"step": 375
},
{
"entropy": 0.49187469482421875,
"epoch": 4.224719101123595,
"grad_norm": 4.552394207734528,
"learning_rate": 1.1183280471162916e-06,
"loss": 0.0075,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 310884602.0,
"step": 376
},
{
"entropy": 0.48296356201171875,
"epoch": 4.235955056179775,
"grad_norm": 3.2031832299513603,
"learning_rate": 1.1054443666733586e-06,
"loss": 0.0157,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 311708014.0,
"step": 377
},
{
"entropy": 0.48511505126953125,
"epoch": 4.247191011235955,
"grad_norm": 0.5423288881673503,
"learning_rate": 1.0926142311513453e-06,
"loss": 0.0019,
"mean_token_accuracy": 1.0,
"num_tokens": 312508500.0,
"step": 378
},
{
"entropy": 0.4829254150390625,
"epoch": 4.258426966292135,
"grad_norm": 0.6328572390463946,
"learning_rate": 1.079838133172111e-06,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 313334377.0,
"step": 379
},
{
"entropy": 0.4954986572265625,
"epoch": 4.269662921348314,
"grad_norm": 2.965808891706028,
"learning_rate": 1.0671165632827097e-06,
"loss": 0.0044,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 314135369.0,
"step": 380
},
{
"entropy": 0.48342132568359375,
"epoch": 4.280898876404494,
"grad_norm": 4.9369087391667925,
"learning_rate": 1.0544500099365515e-06,
"loss": 0.0046,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 314947060.0,
"step": 381
},
{
"entropy": 0.474609375,
"epoch": 4.292134831460674,
"grad_norm": 0.8352741047517535,
"learning_rate": 1.0418389594746462e-06,
"loss": 0.0023,
"mean_token_accuracy": 1.0,
"num_tokens": 315789920.0,
"step": 382
},
{
"entropy": 0.4948883056640625,
"epoch": 4.303370786516854,
"grad_norm": 1.4879623292920148,
"learning_rate": 1.0292838961069348e-06,
"loss": 0.0077,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 316594232.0,
"step": 383
},
{
"entropy": 0.4829254150390625,
"epoch": 4.314606741573034,
"grad_norm": 7.760157332381639,
"learning_rate": 1.0167853018936955e-06,
"loss": 0.0096,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 317419960.0,
"step": 384
},
{
"entropy": 0.48496246337890625,
"epoch": 4.325842696629214,
"grad_norm": 0.6546893569941181,
"learning_rate": 1.0043436567270313e-06,
"loss": 0.002,
"mean_token_accuracy": 1.0,
"num_tokens": 318246051.0,
"step": 385
},
{
"entropy": 0.48297882080078125,
"epoch": 4.337078651685394,
"grad_norm": 1.1821091917310644,
"learning_rate": 9.919594383124512e-07,
"loss": 0.0022,
"mean_token_accuracy": 1.0,
"num_tokens": 319071794.0,
"step": 386
},
{
"entropy": 0.4850616455078125,
"epoch": 4.348314606741573,
"grad_norm": 1.1860110971220579,
"learning_rate": 9.796331221505235e-07,
"loss": 0.0097,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 319880367.0,
"step": 387
},
{
"entropy": 0.47988128662109375,
"epoch": 4.359550561797753,
"grad_norm": 3.6479067412195456,
"learning_rate": 9.673651815186186e-07,
"loss": 0.0041,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 320710009.0,
"step": 388
},
{
"entropy": 0.4726104736328125,
"epoch": 4.370786516853933,
"grad_norm": 2.7332778453429873,
"learning_rate": 9.551560874527385e-07,
"loss": 0.0091,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 321533419.0,
"step": 389
},
{
"entropy": 0.48587799072265625,
"epoch": 4.382022471910112,
"grad_norm": 1.322646368017749,
"learning_rate": 9.43006308729432e-07,
"loss": 0.0071,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 322334485.0,
"step": 390
},
{
"entropy": 0.4619598388671875,
"epoch": 4.393258426966292,
"grad_norm": 0.3194115918370614,
"learning_rate": 9.309163118477954e-07,
"loss": 0.0017,
"mean_token_accuracy": 1.0,
"num_tokens": 323195057.0,
"step": 391
},
{
"entropy": 0.48651885986328125,
"epoch": 4.404494382022472,
"grad_norm": 2.0428959059308465,
"learning_rate": 9.188865610115572e-07,
"loss": 0.0028,
"mean_token_accuracy": 1.0,
"num_tokens": 324010094.0,
"step": 392
},
{
"entropy": 0.4793243408203125,
"epoch": 4.415730337078652,
"grad_norm": 1.823624278000048,
"learning_rate": 9.069175181112597e-07,
"loss": 0.0105,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 324846854.0,
"step": 393
},
{
"entropy": 0.480072021484375,
"epoch": 4.426966292134831,
"grad_norm": 1.0106602397778806,
"learning_rate": 8.950096427065232e-07,
"loss": 0.009,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 325686662.0,
"step": 394
},
{
"entropy": 0.480133056640625,
"epoch": 4.438202247191011,
"grad_norm": 7.9492170461609595,
"learning_rate": 8.831633920083968e-07,
"loss": 0.0074,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 326513919.0,
"step": 395
},
{
"entropy": 0.49493408203125,
"epoch": 4.449438202247191,
"grad_norm": 7.174628436600066,
"learning_rate": 8.713792208618097e-07,
"loss": 0.015,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 327299315.0,
"step": 396
},
{
"entropy": 0.49566650390625,
"epoch": 4.460674157303371,
"grad_norm": 6.030765502141134,
"learning_rate": 8.596575817281036e-07,
"loss": 0.0165,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 328084351.0,
"step": 397
},
{
"entropy": 0.47319793701171875,
"epoch": 4.47191011235955,
"grad_norm": 3.9848054852496055,
"learning_rate": 8.479989246676595e-07,
"loss": 0.0045,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 328918727.0,
"step": 398
},
{
"entropy": 0.46825408935546875,
"epoch": 4.48314606741573,
"grad_norm": 3.3412799376622124,
"learning_rate": 8.36403697322618e-07,
"loss": 0.0042,
"mean_token_accuracy": 1.0,
"num_tokens": 329773820.0,
"step": 399
},
{
"entropy": 0.47360992431640625,
"epoch": 4.49438202247191,
"grad_norm": 1.5264228995013582,
"learning_rate": 8.248723448996942e-07,
"loss": 0.0029,
"mean_token_accuracy": 1.0,
"num_tokens": 330626895.0,
"step": 400
},
{
"entropy": 0.48606109619140625,
"epoch": 4.50561797752809,
"grad_norm": 4.047559713867161,
"learning_rate": 8.134053101530814e-07,
"loss": 0.0069,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 331422952.0,
"step": 401
},
{
"entropy": 0.4675140380859375,
"epoch": 4.51685393258427,
"grad_norm": 1.7070514726450159,
"learning_rate": 8.020030333674498e-07,
"loss": 0.008,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 332286023.0,
"step": 402
},
{
"entropy": 0.5069656372070312,
"epoch": 4.52808988764045,
"grad_norm": 0.3246355685066132,
"learning_rate": 7.906659523410445e-07,
"loss": 0.0021,
"mean_token_accuracy": 1.0,
"num_tokens": 333044926.0,
"step": 403
},
{
"entropy": 0.47544097900390625,
"epoch": 4.539325842696629,
"grad_norm": 1.326921144241493,
"learning_rate": 7.793945023688756e-07,
"loss": 0.0059,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 333869264.0,
"step": 404
},
{
"entropy": 0.4818572998046875,
"epoch": 4.550561797752809,
"grad_norm": 0.2982905893437566,
"learning_rate": 7.681891162260016e-07,
"loss": 0.0019,
"mean_token_accuracy": 1.0,
"num_tokens": 334689272.0,
"step": 405
},
{
"entropy": 0.48146820068359375,
"epoch": 4.561797752808989,
"grad_norm": 4.863117295648831,
"learning_rate": 7.570502241509162e-07,
"loss": 0.0041,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 335515946.0,
"step": 406
},
{
"entropy": 0.48076629638671875,
"epoch": 4.573033707865169,
"grad_norm": 2.8900389523558796,
"learning_rate": 7.459782538290289e-07,
"loss": 0.0162,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 336334489.0,
"step": 407
},
{
"entropy": 0.4797515869140625,
"epoch": 4.584269662921348,
"grad_norm": 1.8411529472514119,
"learning_rate": 7.349736303762392e-07,
"loss": 0.0041,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 337166188.0,
"step": 408
},
{
"entropy": 0.473114013671875,
"epoch": 4.595505617977528,
"grad_norm": 1.5143632218630005,
"learning_rate": 7.240367763226214e-07,
"loss": 0.0033,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 338004079.0,
"step": 409
},
{
"entropy": 0.4938507080078125,
"epoch": 4.606741573033708,
"grad_norm": 0.45911529906334586,
"learning_rate": 7.13168111596193e-07,
"loss": 0.0023,
"mean_token_accuracy": 1.0,
"num_tokens": 338802322.0,
"step": 410
},
{
"entropy": 0.47324371337890625,
"epoch": 4.617977528089888,
"grad_norm": 0.35480851339412894,
"learning_rate": 7.023680535067998e-07,
"loss": 0.002,
"mean_token_accuracy": 1.0,
"num_tokens": 339659802.0,
"step": 411
},
{
"entropy": 0.4801788330078125,
"epoch": 4.629213483146067,
"grad_norm": 0.6426393512461525,
"learning_rate": 6.916370167300846e-07,
"loss": 0.0025,
"mean_token_accuracy": 1.0,
"num_tokens": 340490334.0,
"step": 412
},
{
"entropy": 0.4700927734375,
"epoch": 4.640449438202247,
"grad_norm": 0.31174721363519464,
"learning_rate": 6.809754132915722e-07,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 341341544.0,
"step": 413
},
{
"entropy": 0.47267913818359375,
"epoch": 4.651685393258427,
"grad_norm": 8.399016287808813,
"learning_rate": 6.70383652550847e-07,
"loss": 0.0092,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 342172518.0,
"step": 414
},
{
"entropy": 0.4641876220703125,
"epoch": 4.662921348314606,
"grad_norm": 1.1280041270939898,
"learning_rate": 6.59862141185832e-07,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 343013626.0,
"step": 415
},
{
"entropy": 0.4617767333984375,
"epoch": 4.674157303370786,
"grad_norm": 0.25249368595421007,
"learning_rate": 6.494112831771801e-07,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 343864156.0,
"step": 416
},
{
"entropy": 0.454071044921875,
"epoch": 4.685393258426966,
"grad_norm": 0.3627879993512605,
"learning_rate": 6.390314797927601e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 344721728.0,
"step": 417
},
{
"entropy": 0.4547119140625,
"epoch": 4.696629213483146,
"grad_norm": 4.815638286312687,
"learning_rate": 6.28723129572247e-07,
"loss": 0.0111,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 345612861.0,
"step": 418
},
{
"entropy": 0.47646331787109375,
"epoch": 4.707865168539326,
"grad_norm": 0.23854029589661835,
"learning_rate": 6.184866283118254e-07,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 346422266.0,
"step": 419
},
{
"entropy": 0.4585418701171875,
"epoch": 4.719101123595506,
"grad_norm": 4.111400656540378,
"learning_rate": 6.083223690489901e-07,
"loss": 0.0058,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 347254366.0,
"step": 420
},
{
"entropy": 0.4715423583984375,
"epoch": 4.730337078651686,
"grad_norm": 1.8760479897518607,
"learning_rate": 5.982307420474501e-07,
"loss": 0.0067,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 348075516.0,
"step": 421
},
{
"entropy": 0.48101043701171875,
"epoch": 4.741573033707866,
"grad_norm": 5.6709204135360025,
"learning_rate": 5.882121347821537e-07,
"loss": 0.0112,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 348900037.0,
"step": 422
},
{
"entropy": 0.4796142578125,
"epoch": 4.752808988764045,
"grad_norm": 2.057248698971926,
"learning_rate": 5.782669319244058e-07,
"loss": 0.0092,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 349701930.0,
"step": 423
},
{
"entropy": 0.48802947998046875,
"epoch": 4.764044943820225,
"grad_norm": 9.041450054881212,
"learning_rate": 5.683955153270959e-07,
"loss": 0.0095,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 350494406.0,
"step": 424
},
{
"entropy": 0.47820281982421875,
"epoch": 4.775280898876405,
"grad_norm": 5.449915375980587,
"learning_rate": 5.585982640100416e-07,
"loss": 0.0033,
"mean_token_accuracy": 1.0,
"num_tokens": 351301540.0,
"step": 425
},
{
"entropy": 0.46595001220703125,
"epoch": 4.786516853932584,
"grad_norm": 5.589766890796259,
"learning_rate": 5.488755541454335e-07,
"loss": 0.0074,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 352130587.0,
"step": 426
},
{
"entropy": 0.4650115966796875,
"epoch": 4.797752808988764,
"grad_norm": 4.375937174255108,
"learning_rate": 5.39227759043392e-07,
"loss": 0.0214,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 352966590.0,
"step": 427
},
{
"entropy": 0.46227264404296875,
"epoch": 4.808988764044944,
"grad_norm": 1.1976658888052818,
"learning_rate": 5.296552491376322e-07,
"loss": 0.0021,
"mean_token_accuracy": 1.0,
"num_tokens": 353805333.0,
"step": 428
},
{
"entropy": 0.47081756591796875,
"epoch": 4.820224719101123,
"grad_norm": 5.3943544894469895,
"learning_rate": 5.201583919712441e-07,
"loss": 0.005,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 354653578.0,
"step": 429
},
{
"entropy": 0.48395538330078125,
"epoch": 4.831460674157303,
"grad_norm": 5.03495222189158,
"learning_rate": 5.107375521825791e-07,
"loss": 0.0134,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 355441821.0,
"step": 430
},
{
"entropy": 0.46059417724609375,
"epoch": 4.842696629213483,
"grad_norm": 0.2979677100012393,
"learning_rate": 5.013930914912477e-07,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 356278826.0,
"step": 431
},
{
"entropy": 0.4580535888671875,
"epoch": 4.853932584269663,
"grad_norm": 5.831142656822307,
"learning_rate": 4.921253686842323e-07,
"loss": 0.0138,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 357122624.0,
"step": 432
},
{
"entropy": 0.4652252197265625,
"epoch": 4.865168539325842,
"grad_norm": 0.2269138434389349,
"learning_rate": 4.829347396021142e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 357978267.0,
"step": 433
},
{
"entropy": 0.46025848388671875,
"epoch": 4.876404494382022,
"grad_norm": 2.166646775289874,
"learning_rate": 4.7382155712540484e-07,
"loss": 0.0172,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 358840138.0,
"step": 434
},
{
"entropy": 0.4871978759765625,
"epoch": 4.887640449438202,
"grad_norm": 1.2710311121179774,
"learning_rate": 4.6478617116100244e-07,
"loss": 0.0017,
"mean_token_accuracy": 1.0,
"num_tokens": 359618488.0,
"step": 435
},
{
"entropy": 0.4758453369140625,
"epoch": 4.898876404494382,
"grad_norm": 1.1292250535564667,
"learning_rate": 4.5582892862875457e-07,
"loss": 0.002,
"mean_token_accuracy": 1.0,
"num_tokens": 360443404.0,
"step": 436
},
{
"entropy": 0.4720306396484375,
"epoch": 4.910112359550562,
"grad_norm": 0.2333684518257228,
"learning_rate": 4.469501734481363e-07,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 361279704.0,
"step": 437
},
{
"entropy": 0.4758758544921875,
"epoch": 4.921348314606742,
"grad_norm": 3.0840865860194806,
"learning_rate": 4.3815024652504897e-07,
"loss": 0.005,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 362101433.0,
"step": 438
},
{
"entropy": 0.47599029541015625,
"epoch": 4.932584269662922,
"grad_norm": 0.8938608510966778,
"learning_rate": 4.294294857387285e-07,
"loss": 0.0017,
"mean_token_accuracy": 1.0,
"num_tokens": 362916195.0,
"step": 439
},
{
"entropy": 0.4738922119140625,
"epoch": 4.943820224719101,
"grad_norm": 0.4136594464102139,
"learning_rate": 4.2078822592877074e-07,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 363745948.0,
"step": 440
},
{
"entropy": 0.47412109375,
"epoch": 4.955056179775281,
"grad_norm": 1.709198405151175,
"learning_rate": 4.122267988822792e-07,
"loss": 0.002,
"mean_token_accuracy": 1.0,
"num_tokens": 364576688.0,
"step": 441
},
{
"entropy": 0.475006103515625,
"epoch": 4.966292134831461,
"grad_norm": 0.30835544679447674,
"learning_rate": 4.0374553332112374e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 365409056.0,
"step": 442
},
{
"entropy": 0.469818115234375,
"epoch": 4.97752808988764,
"grad_norm": 0.3707181543844393,
"learning_rate": 3.953447548893169e-07,
"loss": 0.0017,
"mean_token_accuracy": 1.0,
"num_tokens": 366217810.0,
"step": 443
},
{
"entropy": 0.46750640869140625,
"epoch": 4.98876404494382,
"grad_norm": 3.208220247586545,
"learning_rate": 3.8702478614051353e-07,
"loss": 0.0048,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 367063953.0,
"step": 444
},
{
"entropy": 0.45229339599609375,
"epoch": 5.0,
"grad_norm": 2.253105869866448,
"learning_rate": 3.787859465256258e-07,
"loss": 0.004,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 367930716.0,
"step": 445
},
{
"entropy": 0.48581695556640625,
"epoch": 5.01123595505618,
"grad_norm": 1.5438222190450817,
"learning_rate": 3.706285523805578e-07,
"loss": 0.0168,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 368718665.0,
"step": 446
},
{
"entropy": 0.47020721435546875,
"epoch": 5.022471910112359,
"grad_norm": 0.31678928999987055,
"learning_rate": 3.625529169140565e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 369554247.0,
"step": 447
},
{
"entropy": 0.4745330810546875,
"epoch": 5.033707865168539,
"grad_norm": 0.3533643855991936,
"learning_rate": 3.545593501956901e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 370388951.0,
"step": 448
},
{
"entropy": 0.47808074951171875,
"epoch": 5.044943820224719,
"grad_norm": 0.30752357641221745,
"learning_rate": 3.4664815914394106e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 371198482.0,
"step": 449
},
{
"entropy": 0.4639739990234375,
"epoch": 5.056179775280899,
"grad_norm": 0.367754265454166,
"learning_rate": 3.3881964751441984e-07,
"loss": 0.0017,
"mean_token_accuracy": 1.0,
"num_tokens": 372042829.0,
"step": 450
},
{
"entropy": 0.45928955078125,
"epoch": 5.067415730337078,
"grad_norm": 0.300585908373268,
"learning_rate": 3.3107411588820527e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 372900142.0,
"step": 451
},
{
"entropy": 0.47325897216796875,
"epoch": 5.078651685393258,
"grad_norm": 2.7462415680052654,
"learning_rate": 3.2341186166030214e-07,
"loss": 0.0027,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 373730470.0,
"step": 452
},
{
"entropy": 0.4946746826171875,
"epoch": 5.089887640449438,
"grad_norm": 0.24662934610126366,
"learning_rate": 3.1583317902822127e-07,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 374517244.0,
"step": 453
},
{
"entropy": 0.46686553955078125,
"epoch": 5.101123595505618,
"grad_norm": 0.33971112074173476,
"learning_rate": 3.083383589806846e-07,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 375355793.0,
"step": 454
},
{
"entropy": 0.4696807861328125,
"epoch": 5.112359550561798,
"grad_norm": 0.2483861138363355,
"learning_rate": 3.0092768928645375e-07,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 376189678.0,
"step": 455
},
{
"entropy": 0.4691314697265625,
"epoch": 5.123595505617978,
"grad_norm": 0.229972069260914,
"learning_rate": 2.936014544832794e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 377025599.0,
"step": 456
},
{
"entropy": 0.47251129150390625,
"epoch": 5.134831460674158,
"grad_norm": 0.23132255602760288,
"learning_rate": 2.8635993586697555e-07,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 377866992.0,
"step": 457
},
{
"entropy": 0.46392822265625,
"epoch": 5.146067415730337,
"grad_norm": 0.22332672147895574,
"learning_rate": 2.792034114806211e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 378718256.0,
"step": 458
},
{
"entropy": 0.4629669189453125,
"epoch": 5.157303370786517,
"grad_norm": 0.21910029988776278,
"learning_rate": 2.7213215610388364e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 379571339.0,
"step": 459
},
{
"entropy": 0.49639129638671875,
"epoch": 5.168539325842697,
"grad_norm": 0.21660451173158826,
"learning_rate": 2.6514644124246675e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 380350234.0,
"step": 460
},
{
"entropy": 0.47663116455078125,
"epoch": 5.179775280898877,
"grad_norm": 0.8999031171309789,
"learning_rate": 2.582465351176891e-07,
"loss": 0.0102,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 381160580.0,
"step": 461
},
{
"entropy": 0.4720001220703125,
"epoch": 5.191011235955056,
"grad_norm": 0.22313632929478272,
"learning_rate": 2.514327026561833e-07,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 381992655.0,
"step": 462
},
{
"entropy": 0.46752166748046875,
"epoch": 5.202247191011236,
"grad_norm": 3.041812924459871,
"learning_rate": 2.447052054797233e-07,
"loss": 0.0105,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 382821261.0,
"step": 463
},
{
"entropy": 0.47173309326171875,
"epoch": 5.213483146067416,
"grad_norm": 0.24449038922545974,
"learning_rate": 2.3806430189518337e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 383646555.0,
"step": 464
},
{
"entropy": 0.4704437255859375,
"epoch": 5.224719101123595,
"grad_norm": 0.7976410857300053,
"learning_rate": 2.3151024688461422e-07,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 384458624.0,
"step": 465
},
{
"entropy": 0.48795318603515625,
"epoch": 5.235955056179775,
"grad_norm": 0.23199326788537672,
"learning_rate": 2.2504329209545846e-07,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 385254806.0,
"step": 466
},
{
"entropy": 0.4750213623046875,
"epoch": 5.247191011235955,
"grad_norm": 0.8121541277788699,
"learning_rate": 2.186636858308841e-07,
"loss": 0.0022,
"mean_token_accuracy": 1.0,
"num_tokens": 386062765.0,
"step": 467
},
{
"entropy": 0.45218658447265625,
"epoch": 5.258426966292135,
"grad_norm": 3.105997642674649,
"learning_rate": 2.1237167304025336e-07,
"loss": 0.0029,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 386913540.0,
"step": 468
},
{
"entropy": 0.47808074951171875,
"epoch": 5.269662921348314,
"grad_norm": 1.2728652990081897,
"learning_rate": 2.0616749530971785e-07,
"loss": 0.0073,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 387728374.0,
"step": 469
},
{
"entropy": 0.46541595458984375,
"epoch": 5.280898876404494,
"grad_norm": 0.21479561022075241,
"learning_rate": 2.0005139085293945e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 388542917.0,
"step": 470
},
{
"entropy": 0.4601593017578125,
"epoch": 5.292134831460674,
"grad_norm": 0.5642741656898853,
"learning_rate": 1.9402359450194836e-07,
"loss": 0.01,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 389396618.0,
"step": 471
},
{
"entropy": 0.47005462646484375,
"epoch": 5.303370786516854,
"grad_norm": 2.048653701812348,
"learning_rate": 1.8808433769812367e-07,
"loss": 0.0124,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 390232039.0,
"step": 472
},
{
"entropy": 0.46900177001953125,
"epoch": 5.314606741573034,
"grad_norm": 1.9806921777678528,
"learning_rate": 1.8223384848330723e-07,
"loss": 0.0072,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 391058664.0,
"step": 473
},
{
"entropy": 0.45298004150390625,
"epoch": 5.325842696629214,
"grad_norm": 2.1026358322419987,
"learning_rate": 1.7647235149104908e-07,
"loss": 0.0022,
"mean_token_accuracy": 1.0,
"num_tokens": 391906231.0,
"step": 474
},
{
"entropy": 0.4974212646484375,
"epoch": 5.337078651685394,
"grad_norm": 2.964448609260168,
"learning_rate": 1.7080006793798176e-07,
"loss": 0.005,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 392689363.0,
"step": 475
},
{
"entropy": 0.4739837646484375,
"epoch": 5.348314606741573,
"grad_norm": 4.838257625443637,
"learning_rate": 1.6521721561532645e-07,
"loss": 0.0099,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 393498433.0,
"step": 476
},
{
"entropy": 0.46537017822265625,
"epoch": 5.359550561797753,
"grad_norm": 0.2345104947673042,
"learning_rate": 1.597240088805302e-07,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 394318449.0,
"step": 477
},
{
"entropy": 0.46009063720703125,
"epoch": 5.370786516853933,
"grad_norm": 0.2373016671284719,
"learning_rate": 1.54320658649037e-07,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 395161746.0,
"step": 478
},
{
"entropy": 0.47601318359375,
"epoch": 5.382022471910112,
"grad_norm": 3.077415790643201,
"learning_rate": 1.4900737238618874e-07,
"loss": 0.0115,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 395961351.0,
"step": 479
},
{
"entropy": 0.45795440673828125,
"epoch": 5.393258426966292,
"grad_norm": 0.36061924748354707,
"learning_rate": 1.4378435409925868e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 396827054.0,
"step": 480
},
{
"entropy": 0.4578094482421875,
"epoch": 5.404494382022472,
"grad_norm": 0.8381009700413259,
"learning_rate": 1.3865180432961977e-07,
"loss": 0.0019,
"mean_token_accuracy": 1.0,
"num_tokens": 397711856.0,
"step": 481
},
{
"entropy": 0.46482086181640625,
"epoch": 5.415730337078652,
"grad_norm": 0.6388255497569517,
"learning_rate": 1.3360992014504414e-07,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 398547667.0,
"step": 482
},
{
"entropy": 0.46080780029296875,
"epoch": 5.426966292134831,
"grad_norm": 0.5384287405673601,
"learning_rate": 1.286588951321363e-07,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 399397538.0,
"step": 483
},
{
"entropy": 0.48981475830078125,
"epoch": 5.438202247191011,
"grad_norm": 0.36193846374908706,
"learning_rate": 1.237989193889e-07,
"loss": 0.0017,
"mean_token_accuracy": 1.0,
"num_tokens": 400193633.0,
"step": 484
},
{
"entropy": 0.4829254150390625,
"epoch": 5.449438202247191,
"grad_norm": 0.29962032623383783,
"learning_rate": 1.1903017951744144e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 401008141.0,
"step": 485
},
{
"entropy": 0.47170257568359375,
"epoch": 5.460674157303371,
"grad_norm": 0.2596090181292732,
"learning_rate": 1.1435285861680106e-07,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 401831012.0,
"step": 486
},
{
"entropy": 0.45348358154296875,
"epoch": 5.47191011235955,
"grad_norm": 0.3331736548838834,
"learning_rate": 1.0976713627592561e-07,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 402682567.0,
"step": 487
},
{
"entropy": 0.47737884521484375,
"epoch": 5.48314606741573,
"grad_norm": 3.574551781953933,
"learning_rate": 1.0527318856677293e-07,
"loss": 0.0033,
"mean_token_accuracy": 1.0,
"num_tokens": 403467221.0,
"step": 488
},
{
"entropy": 0.49402618408203125,
"epoch": 5.49438202247191,
"grad_norm": 0.2337886807199716,
"learning_rate": 1.0087118803755069e-07,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 404233243.0,
"step": 489
},
{
"entropy": 0.45621490478515625,
"epoch": 5.50561797752809,
"grad_norm": 1.1592782206231753,
"learning_rate": 9.656130370609057e-08,
"loss": 0.0108,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 405084914.0,
"step": 490
},
{
"entropy": 0.457061767578125,
"epoch": 5.51685393258427,
"grad_norm": 0.2545051809144223,
"learning_rate": 9.234370105336039e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 405909288.0,
"step": 491
},
{
"entropy": 0.45556640625,
"epoch": 5.52808988764045,
"grad_norm": 1.0394244180655139,
"learning_rate": 8.821854201711027e-08,
"loss": 0.0114,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 406788128.0,
"step": 492
},
{
"entropy": 0.48175048828125,
"epoch": 5.539325842696629,
"grad_norm": 0.33453697449549497,
"learning_rate": 8.418598498565217e-08,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 407580454.0,
"step": 493
},
{
"entropy": 0.48504638671875,
"epoch": 5.550561797752809,
"grad_norm": 2.1089507267897116,
"learning_rate": 8.024618479178237e-08,
"loss": 0.0026,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 408383110.0,
"step": 494
},
{
"entropy": 0.46286773681640625,
"epoch": 5.561797752808989,
"grad_norm": 0.2540000053758498,
"learning_rate": 7.639929270683438e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 409229114.0,
"step": 495
},
{
"entropy": 0.47723388671875,
"epoch": 5.573033707865169,
"grad_norm": 1.593634160516405,
"learning_rate": 7.264545643486997e-08,
"loss": 0.0101,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 410042844.0,
"step": 496
},
{
"entropy": 0.48851776123046875,
"epoch": 5.584269662921348,
"grad_norm": 5.858467783591157,
"learning_rate": 6.898482010701036e-08,
"loss": 0.008,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 410829963.0,
"step": 497
},
{
"entropy": 0.4671630859375,
"epoch": 5.595505617977528,
"grad_norm": 0.3065081480250999,
"learning_rate": 6.541752427590004e-08,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 411656140.0,
"step": 498
},
{
"entropy": 0.457275390625,
"epoch": 5.606741573033708,
"grad_norm": 0.2640124180695043,
"learning_rate": 6.194370591031174e-08,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 412521051.0,
"step": 499
},
{
"entropy": 0.462554931640625,
"epoch": 5.617977528089888,
"grad_norm": 0.23712236276528317,
"learning_rate": 5.856349838988612e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 413372843.0,
"step": 500
},
{
"entropy": 0.462982177734375,
"epoch": 5.629213483146067,
"grad_norm": 1.2591830453291861,
"learning_rate": 5.5277031500011734e-08,
"loss": 0.0022,
"mean_token_accuracy": 1.0,
"num_tokens": 414221035.0,
"step": 501
},
{
"entropy": 0.46945953369140625,
"epoch": 5.640449438202247,
"grad_norm": 0.35757120099699974,
"learning_rate": 5.208443142684094e-08,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 415057282.0,
"step": 502
},
{
"entropy": 0.4797821044921875,
"epoch": 5.651685393258427,
"grad_norm": 1.7058610019138325,
"learning_rate": 4.8985820752445177e-08,
"loss": 0.0063,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 415863475.0,
"step": 503
},
{
"entropy": 0.4644775390625,
"epoch": 5.662921348314606,
"grad_norm": 3.697075376126118,
"learning_rate": 4.5981318450109e-08,
"loss": 0.0137,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 416695157.0,
"step": 504
},
{
"entropy": 0.46736907958984375,
"epoch": 5.674157303370786,
"grad_norm": 0.2899077751323242,
"learning_rate": 4.307103987976041e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 417539681.0,
"step": 505
},
{
"entropy": 0.47779083251953125,
"epoch": 5.685393258426966,
"grad_norm": 0.23808611258993526,
"learning_rate": 4.0255096783543e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 418336130.0,
"step": 506
},
{
"entropy": 0.45633697509765625,
"epoch": 5.696629213483146,
"grad_norm": 1.8720058072970238,
"learning_rate": 3.75335972815255e-08,
"loss": 0.0025,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 419186786.0,
"step": 507
},
{
"entropy": 0.47606658935546875,
"epoch": 5.707865168539326,
"grad_norm": 0.24311851614182342,
"learning_rate": 3.4906645867549547e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 420027066.0,
"step": 508
},
{
"entropy": 0.47313690185546875,
"epoch": 5.719101123595506,
"grad_norm": 0.47520225601419763,
"learning_rate": 3.237434340521789e-08,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 420854154.0,
"step": 509
},
{
"entropy": 0.48545074462890625,
"epoch": 5.730337078651686,
"grad_norm": 0.24919993452145028,
"learning_rate": 2.993678712402221e-08,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 421659979.0,
"step": 510
},
{
"entropy": 0.46788787841796875,
"epoch": 5.741573033707866,
"grad_norm": 0.2687240802887501,
"learning_rate": 2.7594070615609426e-08,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 422514934.0,
"step": 511
},
{
"entropy": 0.47194671630859375,
"epoch": 5.752808988764045,
"grad_norm": 0.2600550522951215,
"learning_rate": 2.5346283830187667e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 423354852.0,
"step": 512
},
{
"entropy": 0.4694366455078125,
"epoch": 5.764044943820225,
"grad_norm": 0.2720018158822482,
"learning_rate": 2.319351307307427e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 424172352.0,
"step": 513
},
{
"entropy": 0.4664459228515625,
"epoch": 5.775280898876405,
"grad_norm": 0.9650729639873621,
"learning_rate": 2.1135841001380386e-08,
"loss": 0.0102,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 424994231.0,
"step": 514
},
{
"entropy": 0.4492645263671875,
"epoch": 5.786516853932584,
"grad_norm": 2.648888232254079,
"learning_rate": 1.917334662083714e-08,
"loss": 0.0068,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 425869811.0,
"step": 515
},
{
"entropy": 0.453094482421875,
"epoch": 5.797752808988764,
"grad_norm": 1.027370821629121,
"learning_rate": 1.7306105282764162e-08,
"loss": 0.0087,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 426728032.0,
"step": 516
},
{
"entropy": 0.46843719482421875,
"epoch": 5.808988764044944,
"grad_norm": 0.23837807235842895,
"learning_rate": 1.55341886811744e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 427544143.0,
"step": 517
},
{
"entropy": 0.4472198486328125,
"epoch": 5.820224719101123,
"grad_norm": 0.24481577785236583,
"learning_rate": 1.3857664850022157e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 428420275.0,
"step": 518
},
{
"entropy": 0.47149658203125,
"epoch": 5.831460674157303,
"grad_norm": 0.2426903810937681,
"learning_rate": 1.2276598160590736e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 429224830.0,
"step": 519
},
{
"entropy": 0.4630584716796875,
"epoch": 5.842696629213483,
"grad_norm": 0.25840814875325685,
"learning_rate": 1.0791049319021086e-08,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 430056259.0,
"step": 520
},
{
"entropy": 0.4733734130859375,
"epoch": 5.853932584269663,
"grad_norm": 0.26267972632078107,
"learning_rate": 9.401075363981438e-09,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 430887003.0,
"step": 521
},
{
"entropy": 0.467041015625,
"epoch": 5.865168539325842,
"grad_norm": 0.24048673726778205,
"learning_rate": 8.106729664475178e-09,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 431721666.0,
"step": 522
},
{
"entropy": 0.47376251220703125,
"epoch": 5.876404494382022,
"grad_norm": 0.24254325715878092,
"learning_rate": 6.908061917794417e-09,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 432523209.0,
"step": 523
},
{
"entropy": 0.4714813232421875,
"epoch": 5.887640449438202,
"grad_norm": 0.31821184051507945,
"learning_rate": 5.805118147610145e-09,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 433343280.0,
"step": 524
},
{
"entropy": 0.4551544189453125,
"epoch": 5.898876404494382,
"grad_norm": 0.2263073507825304,
"learning_rate": 4.797940702205572e-09,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 434225691.0,
"step": 525
},
{
"entropy": 0.4636688232421875,
"epoch": 5.910112359550562,
"grad_norm": 0.2921054936133335,
"learning_rate": 3.8865682528504975e-09,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 435079584.0,
"step": 526
},
{
"entropy": 0.48111724853515625,
"epoch": 5.921348314606742,
"grad_norm": 0.2539645731699836,
"learning_rate": 3.071035792315269e-09,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 435876270.0,
"step": 527
},
{
"entropy": 0.4649505615234375,
"epoch": 5.932584269662922,
"grad_norm": 0.25467262177337446,
"learning_rate": 2.351374633528802e-09,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 436699694.0,
"step": 528
},
{
"entropy": 0.45807647705078125,
"epoch": 5.943820224719101,
"grad_norm": 0.2970856799447582,
"learning_rate": 1.7276124083753788e-09,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 437552430.0,
"step": 529
},
{
"entropy": 0.46283721923828125,
"epoch": 5.955056179775281,
"grad_norm": 0.24692476291579676,
"learning_rate": 1.1997730666338248e-09,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 438392649.0,
"step": 530
},
{
"entropy": 0.47788238525390625,
"epoch": 5.966292134831461,
"grad_norm": 0.28913800701048575,
"learning_rate": 7.678768750579713e-10,
"loss": 0.0015,
"mean_token_accuracy": 1.0,
"num_tokens": 439188890.0,
"step": 531
},
{
"entropy": 0.48076629638671875,
"epoch": 5.97752808988764,
"grad_norm": 0.2517588518870856,
"learning_rate": 4.3194041659866405e-10,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 439971575.0,
"step": 532
},
{
"entropy": 0.46562957763671875,
"epoch": 5.98876404494382,
"grad_norm": 0.2411681682465322,
"learning_rate": 1.9197658976677358e-10,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 440804562.0,
"step": 533
},
{
"entropy": 0.4652252197265625,
"epoch": 6.0,
"grad_norm": 0.27634116211326665,
"learning_rate": 4.799460813803558e-11,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 441644783.0,
"step": 534
},
{
"epoch": 6.0,
"step": 534,
"total_flos": 519624752562176.0,
"train_loss": 0.5481295004301296,
"train_runtime": 70704.5263,
"train_samples_per_second": 3.496,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1,
"max_steps": 534,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 45,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 519624752562176.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}