Files
P9-split1_only_answer_Qwen3…/trainer_state.json
ModelHub XC 4152186543 初始化项目,由ModelHub XC社区提供模型
Model: Hyeongwon/P9-split1_only_answer_Qwen3-4B-Base_0402-01-1e-5
Source: Original Platform
2026-04-10 15:53:07 +08:00

5324 lines
148 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 528,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5595855712890625,
"epoch": 0.011363636363636364,
"grad_norm": 381.7861402310546,
"learning_rate": 0.0,
"loss": 8.3191,
"mean_token_accuracy": 0.0,
"num_tokens": 852123.0,
"step": 1
},
{
"entropy": 0.5646438598632812,
"epoch": 0.022727272727272728,
"grad_norm": 384.6525251162329,
"learning_rate": 3.7037037037037036e-07,
"loss": 8.2985,
"mean_token_accuracy": 0.0,
"num_tokens": 1667244.0,
"step": 2
},
{
"entropy": 0.553924560546875,
"epoch": 0.03409090909090909,
"grad_norm": 384.8117222961788,
"learning_rate": 7.407407407407407e-07,
"loss": 8.2789,
"mean_token_accuracy": 0.0,
"num_tokens": 2503572.0,
"step": 3
},
{
"entropy": 0.5484161376953125,
"epoch": 0.045454545454545456,
"grad_norm": 389.3572097207638,
"learning_rate": 1.111111111111111e-06,
"loss": 8.0992,
"mean_token_accuracy": 0.0,
"num_tokens": 3345459.0,
"step": 4
},
{
"entropy": 0.5574264526367188,
"epoch": 0.056818181818181816,
"grad_norm": 403.9101556918202,
"learning_rate": 1.4814814814814815e-06,
"loss": 7.4779,
"mean_token_accuracy": 0.0,
"num_tokens": 4166938.0,
"step": 5
},
{
"entropy": 0.5470428466796875,
"epoch": 0.06818181818181818,
"grad_norm": 395.2760506482167,
"learning_rate": 1.8518518518518519e-06,
"loss": 7.2347,
"mean_token_accuracy": 0.0,
"num_tokens": 5016940.0,
"step": 6
},
{
"entropy": 0.5523529052734375,
"epoch": 0.07954545454545454,
"grad_norm": 223.35456023967305,
"learning_rate": 2.222222222222222e-06,
"loss": 5.576,
"mean_token_accuracy": 0.007812500232830644,
"num_tokens": 5848503.0,
"step": 7
},
{
"entropy": 0.5435867309570312,
"epoch": 0.09090909090909091,
"grad_norm": 112.29477147905564,
"learning_rate": 2.5925925925925925e-06,
"loss": 4.2732,
"mean_token_accuracy": 0.5026041816454381,
"num_tokens": 6709898.0,
"step": 8
},
{
"entropy": 0.5592041015625,
"epoch": 0.10227272727272728,
"grad_norm": 95.74272291998905,
"learning_rate": 2.962962962962963e-06,
"loss": 4.0579,
"mean_token_accuracy": 0.505208348389715,
"num_tokens": 7560854.0,
"step": 9
},
{
"entropy": 0.5602951049804688,
"epoch": 0.11363636363636363,
"grad_norm": 80.94962958572077,
"learning_rate": 3.3333333333333333e-06,
"loss": 3.8263,
"mean_token_accuracy": 0.5117187652504072,
"num_tokens": 8391135.0,
"step": 10
},
{
"entropy": 0.5577774047851562,
"epoch": 0.125,
"grad_norm": 59.51647665780966,
"learning_rate": 3.7037037037037037e-06,
"loss": 3.3053,
"mean_token_accuracy": 0.505208348389715,
"num_tokens": 9185279.0,
"step": 11
},
{
"entropy": 0.5463104248046875,
"epoch": 0.13636363636363635,
"grad_norm": 57.89777609345863,
"learning_rate": 4.074074074074074e-06,
"loss": 3.2147,
"mean_token_accuracy": 0.5169270987389609,
"num_tokens": 10024891.0,
"step": 12
},
{
"entropy": 0.5550765991210938,
"epoch": 0.14772727272727273,
"grad_norm": 57.43984061602855,
"learning_rate": 4.444444444444444e-06,
"loss": 3.1308,
"mean_token_accuracy": 0.5273437657160684,
"num_tokens": 10842191.0,
"step": 13
},
{
"entropy": 0.5534210205078125,
"epoch": 0.1590909090909091,
"grad_norm": 57.49098275744853,
"learning_rate": 4.814814814814815e-06,
"loss": 3.0731,
"mean_token_accuracy": 0.5351562659488991,
"num_tokens": 11650475.0,
"step": 14
},
{
"entropy": 0.5523452758789062,
"epoch": 0.17045454545454544,
"grad_norm": 60.44468793760702,
"learning_rate": 5.185185185185185e-06,
"loss": 2.9645,
"mean_token_accuracy": 0.5208333488553762,
"num_tokens": 12464155.0,
"step": 15
},
{
"entropy": 0.5218353271484375,
"epoch": 0.18181818181818182,
"grad_norm": 66.21017634939703,
"learning_rate": 5.555555555555557e-06,
"loss": 2.9797,
"mean_token_accuracy": 0.5013020982732996,
"num_tokens": 13346836.0,
"step": 16
},
{
"entropy": 0.5411224365234375,
"epoch": 0.19318181818181818,
"grad_norm": 57.306031059188456,
"learning_rate": 5.925925925925926e-06,
"loss": 2.9136,
"mean_token_accuracy": 0.5234375155996531,
"num_tokens": 14174968.0,
"step": 17
},
{
"entropy": 0.5549850463867188,
"epoch": 0.20454545454545456,
"grad_norm": 57.48037572896507,
"learning_rate": 6.296296296296297e-06,
"loss": 2.8744,
"mean_token_accuracy": 0.5130208486225456,
"num_tokens": 14975189.0,
"step": 18
},
{
"entropy": 0.558502197265625,
"epoch": 0.2159090909090909,
"grad_norm": 57.18753706899099,
"learning_rate": 6.666666666666667e-06,
"loss": 2.8435,
"mean_token_accuracy": 0.5247395989717916,
"num_tokens": 15764524.0,
"step": 19
},
{
"entropy": 0.5551605224609375,
"epoch": 0.22727272727272727,
"grad_norm": 57.099620732693666,
"learning_rate": 7.0370370370370375e-06,
"loss": 2.8033,
"mean_token_accuracy": 0.5611979333916679,
"num_tokens": 16594212.0,
"step": 20
},
{
"entropy": 0.54339599609375,
"epoch": 0.23863636363636365,
"grad_norm": 56.761099934785754,
"learning_rate": 7.4074074074074075e-06,
"loss": 2.7555,
"mean_token_accuracy": 0.5455729329260066,
"num_tokens": 17431524.0,
"step": 21
},
{
"entropy": 0.5582275390625,
"epoch": 0.25,
"grad_norm": 56.67625638500944,
"learning_rate": 7.77777777777778e-06,
"loss": 2.7206,
"mean_token_accuracy": 0.5286458490882069,
"num_tokens": 18240206.0,
"step": 22
},
{
"entropy": 0.544647216796875,
"epoch": 0.26136363636363635,
"grad_norm": 57.07526498149015,
"learning_rate": 8.148148148148148e-06,
"loss": 2.6748,
"mean_token_accuracy": 0.5638021001359448,
"num_tokens": 19067759.0,
"step": 23
},
{
"entropy": 0.5390625,
"epoch": 0.2727272727272727,
"grad_norm": 57.35919508340141,
"learning_rate": 8.518518518518519e-06,
"loss": 2.6279,
"mean_token_accuracy": 0.5690104336244985,
"num_tokens": 19896857.0,
"step": 24
},
{
"entropy": 0.5362167358398438,
"epoch": 0.2840909090909091,
"grad_norm": 58.5953667217032,
"learning_rate": 8.888888888888888e-06,
"loss": 2.5885,
"mean_token_accuracy": 0.558593766647391,
"num_tokens": 20712844.0,
"step": 25
},
{
"entropy": 0.5246353149414062,
"epoch": 0.29545454545454547,
"grad_norm": 59.726659251879035,
"learning_rate": 9.25925925925926e-06,
"loss": 2.5624,
"mean_token_accuracy": 0.5572916832752526,
"num_tokens": 21562110.0,
"step": 26
},
{
"entropy": 0.517822265625,
"epoch": 0.3068181818181818,
"grad_norm": 61.90672236852855,
"learning_rate": 9.62962962962963e-06,
"loss": 2.5112,
"mean_token_accuracy": 0.5598958500195295,
"num_tokens": 22406352.0,
"step": 27
},
{
"entropy": 0.5255279541015625,
"epoch": 0.3181818181818182,
"grad_norm": 70.11627842380628,
"learning_rate": 1e-05,
"loss": 2.5404,
"mean_token_accuracy": 0.5195312654832378,
"num_tokens": 23252892.0,
"step": 28
},
{
"entropy": 0.54864501953125,
"epoch": 0.32954545454545453,
"grad_norm": 59.62883231844681,
"learning_rate": 9.999901697881075e-06,
"loss": 2.4391,
"mean_token_accuracy": 0.558593766647391,
"num_tokens": 24068796.0,
"step": 29
},
{
"entropy": 0.5512237548828125,
"epoch": 0.3409090909090909,
"grad_norm": 60.88601090681198,
"learning_rate": 9.999606795389623e-06,
"loss": 2.4135,
"mean_token_accuracy": 0.5442708487389609,
"num_tokens": 24888182.0,
"step": 30
},
{
"entropy": 0.5544052124023438,
"epoch": 0.3522727272727273,
"grad_norm": 60.86420068870514,
"learning_rate": 9.999115304121459e-06,
"loss": 2.3746,
"mean_token_accuracy": 0.8619791747769341,
"num_tokens": 25701818.0,
"step": 31
},
{
"entropy": 0.5430526733398438,
"epoch": 0.36363636363636365,
"grad_norm": 60.251653944045856,
"learning_rate": 9.998427243402437e-06,
"loss": 2.3292,
"mean_token_accuracy": 0.8854166734963655,
"num_tokens": 26550803.0,
"step": 32
},
{
"entropy": 0.5553741455078125,
"epoch": 0.375,
"grad_norm": 60.3298612444737,
"learning_rate": 9.997542640287686e-06,
"loss": 2.2774,
"mean_token_accuracy": 0.8841145902406424,
"num_tokens": 27362749.0,
"step": 33
},
{
"entropy": 0.5552520751953125,
"epoch": 0.38636363636363635,
"grad_norm": 60.62608864413497,
"learning_rate": 9.996461529560553e-06,
"loss": 2.2108,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 28197226.0,
"step": 34
},
{
"entropy": 0.5430068969726562,
"epoch": 0.3977272727272727,
"grad_norm": 60.45843293213623,
"learning_rate": 9.995183953731225e-06,
"loss": 2.1694,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 29070075.0,
"step": 35
},
{
"entropy": 0.5520477294921875,
"epoch": 0.4090909090909091,
"grad_norm": 60.60470764646535,
"learning_rate": 9.99370996303507e-06,
"loss": 2.1127,
"mean_token_accuracy": 0.8997395893093199,
"num_tokens": 29909320.0,
"step": 36
},
{
"entropy": 0.533050537109375,
"epoch": 0.42045454545454547,
"grad_norm": 60.8797781205061,
"learning_rate": 9.992039615430648e-06,
"loss": 2.071,
"mean_token_accuracy": 0.912760421866551,
"num_tokens": 30778680.0,
"step": 37
},
{
"entropy": 0.5433807373046875,
"epoch": 0.4318181818181818,
"grad_norm": 60.866971731966345,
"learning_rate": 9.990172976597446e-06,
"loss": 2.0387,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 31596240.0,
"step": 38
},
{
"entropy": 0.5598068237304688,
"epoch": 0.4431818181818182,
"grad_norm": 61.325213343510775,
"learning_rate": 9.988110119933281e-06,
"loss": 1.9883,
"mean_token_accuracy": 0.8971354227978736,
"num_tokens": 32396599.0,
"step": 39
},
{
"entropy": 0.5557174682617188,
"epoch": 0.45454545454545453,
"grad_norm": 60.349279291270896,
"learning_rate": 9.985851126551428e-06,
"loss": 1.9158,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 33192180.0,
"step": 40
},
{
"entropy": 0.5460128784179688,
"epoch": 0.4659090909090909,
"grad_norm": 60.29280940957208,
"learning_rate": 9.983396085277421e-06,
"loss": 1.8879,
"mean_token_accuracy": 0.8997395893093199,
"num_tokens": 34019780.0,
"step": 41
},
{
"entropy": 0.5501022338867188,
"epoch": 0.4772727272727273,
"grad_norm": 58.97902817615115,
"learning_rate": 9.980745092645564e-06,
"loss": 1.8189,
"mean_token_accuracy": 0.9101562553551048,
"num_tokens": 34845898.0,
"step": 42
},
{
"entropy": 0.55059814453125,
"epoch": 0.48863636363636365,
"grad_norm": 59.56331193069494,
"learning_rate": 9.977898252895133e-06,
"loss": 1.7845,
"mean_token_accuracy": 0.912760421866551,
"num_tokens": 35658003.0,
"step": 43
},
{
"entropy": 0.5355606079101562,
"epoch": 0.5,
"grad_norm": 59.1190449560066,
"learning_rate": 9.974855677966283e-06,
"loss": 1.7301,
"mean_token_accuracy": 0.9049479223322123,
"num_tokens": 36513090.0,
"step": 44
},
{
"entropy": 0.5511093139648438,
"epoch": 0.5113636363636364,
"grad_norm": 58.84552514073676,
"learning_rate": 9.971617487495635e-06,
"loss": 1.6771,
"mean_token_accuracy": 0.8997395893093199,
"num_tokens": 37335753.0,
"step": 45
},
{
"entropy": 0.5509262084960938,
"epoch": 0.5227272727272727,
"grad_norm": 58.05247290731136,
"learning_rate": 9.968183808811586e-06,
"loss": 1.6113,
"mean_token_accuracy": 0.912760421866551,
"num_tokens": 38171714.0,
"step": 46
},
{
"entropy": 0.5467910766601562,
"epoch": 0.5340909090909091,
"grad_norm": 57.62644994665755,
"learning_rate": 9.964554776929289e-06,
"loss": 1.5464,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 39004676.0,
"step": 47
},
{
"entropy": 0.554779052734375,
"epoch": 0.5454545454545454,
"grad_norm": 57.493682548859965,
"learning_rate": 9.960730534545357e-06,
"loss": 1.507,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 39839454.0,
"step": 48
},
{
"entropy": 0.5637130737304688,
"epoch": 0.5568181818181818,
"grad_norm": 57.70900607158181,
"learning_rate": 9.95671123203224e-06,
"loss": 1.4657,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 40636201.0,
"step": 49
},
{
"entropy": 0.5558547973632812,
"epoch": 0.5681818181818182,
"grad_norm": 57.39395222476001,
"learning_rate": 9.95249702743232e-06,
"loss": 1.3995,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 41456439.0,
"step": 50
},
{
"entropy": 0.5491790771484375,
"epoch": 0.5795454545454546,
"grad_norm": 57.41371703432652,
"learning_rate": 9.948088086451692e-06,
"loss": 1.3504,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 42306168.0,
"step": 51
},
{
"entropy": 0.5419998168945312,
"epoch": 0.5909090909090909,
"grad_norm": 57.708036609750216,
"learning_rate": 9.943484582453653e-06,
"loss": 1.2953,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 43156058.0,
"step": 52
},
{
"entropy": 0.5348358154296875,
"epoch": 0.6022727272727273,
"grad_norm": 57.73872770246758,
"learning_rate": 9.938686696451884e-06,
"loss": 1.2523,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 44026197.0,
"step": 53
},
{
"entropy": 0.5535354614257812,
"epoch": 0.6136363636363636,
"grad_norm": 57.777576405881355,
"learning_rate": 9.933694617103328e-06,
"loss": 1.1934,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 44830346.0,
"step": 54
},
{
"entropy": 0.5513992309570312,
"epoch": 0.625,
"grad_norm": 57.34605611556142,
"learning_rate": 9.928508540700775e-06,
"loss": 1.147,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 45639894.0,
"step": 55
},
{
"entropy": 0.5546798706054688,
"epoch": 0.6363636363636364,
"grad_norm": 57.04857568973497,
"learning_rate": 9.923128671165145e-06,
"loss": 1.109,
"mean_token_accuracy": 0.9023437558207661,
"num_tokens": 46453026.0,
"step": 56
},
{
"entropy": 0.5538177490234375,
"epoch": 0.6477272727272727,
"grad_norm": 56.470712174285566,
"learning_rate": 9.917555220037469e-06,
"loss": 1.0488,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 47264316.0,
"step": 57
},
{
"entropy": 0.5666427612304688,
"epoch": 0.6590909090909091,
"grad_norm": 55.841215147852594,
"learning_rate": 9.91178840647057e-06,
"loss": 1.0016,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 48069506.0,
"step": 58
},
{
"entropy": 0.5659713745117188,
"epoch": 0.6704545454545454,
"grad_norm": 55.98626961303541,
"learning_rate": 9.905828457220442e-06,
"loss": 0.9377,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 48869835.0,
"step": 59
},
{
"entropy": 0.5512847900390625,
"epoch": 0.6818181818181818,
"grad_norm": 54.8634329832115,
"learning_rate": 9.899675606637344e-06,
"loss": 0.8998,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 49727599.0,
"step": 60
},
{
"entropy": 0.551544189453125,
"epoch": 0.6931818181818182,
"grad_norm": 54.36819144170542,
"learning_rate": 9.893330096656576e-06,
"loss": 0.837,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 50548928.0,
"step": 61
},
{
"entropy": 0.5626449584960938,
"epoch": 0.7045454545454546,
"grad_norm": 54.914774939776095,
"learning_rate": 9.886792176788964e-06,
"loss": 0.8096,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 51360270.0,
"step": 62
},
{
"entropy": 0.557525634765625,
"epoch": 0.7159090909090909,
"grad_norm": 55.61783293820803,
"learning_rate": 9.880062104111064e-06,
"loss": 0.8107,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 52194089.0,
"step": 63
},
{
"entropy": 0.55279541015625,
"epoch": 0.7272727272727273,
"grad_norm": 52.63001268534479,
"learning_rate": 9.873140143255035e-06,
"loss": 0.7542,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 53031403.0,
"step": 64
},
{
"entropy": 0.5531845092773438,
"epoch": 0.7386363636363636,
"grad_norm": 49.54331333494459,
"learning_rate": 9.866026566398248e-06,
"loss": 0.6812,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 53844933.0,
"step": 65
},
{
"entropy": 0.5571212768554688,
"epoch": 0.75,
"grad_norm": 48.50824783464925,
"learning_rate": 9.858721653252571e-06,
"loss": 0.6659,
"mean_token_accuracy": 0.9075520888436586,
"num_tokens": 54693823.0,
"step": 66
},
{
"entropy": 0.5417633056640625,
"epoch": 0.7613636363636364,
"grad_norm": 46.31315205932425,
"learning_rate": 9.851225691053382e-06,
"loss": 0.6124,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 55549536.0,
"step": 67
},
{
"entropy": 0.552581787109375,
"epoch": 0.7727272727272727,
"grad_norm": 44.52806747500585,
"learning_rate": 9.843538974548264e-06,
"loss": 0.5685,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 56378172.0,
"step": 68
},
{
"entropy": 0.5659942626953125,
"epoch": 0.7840909090909091,
"grad_norm": 41.985635746009535,
"learning_rate": 9.835661805985432e-06,
"loss": 0.5224,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 57189636.0,
"step": 69
},
{
"entropy": 0.5614395141601562,
"epoch": 0.7954545454545454,
"grad_norm": 39.34370735906953,
"learning_rate": 9.827594495101824e-06,
"loss": 0.4839,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 58000089.0,
"step": 70
},
{
"entropy": 0.5698471069335938,
"epoch": 0.8068181818181818,
"grad_norm": 37.74374421075821,
"learning_rate": 9.819337359110945e-06,
"loss": 0.4483,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 58802829.0,
"step": 71
},
{
"entropy": 0.5551528930664062,
"epoch": 0.8181818181818182,
"grad_norm": 36.45001846956263,
"learning_rate": 9.81089072269038e-06,
"loss": 0.4381,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 59625095.0,
"step": 72
},
{
"entropy": 0.5639801025390625,
"epoch": 0.8295454545454546,
"grad_norm": 32.91100179668808,
"learning_rate": 9.802254917969033e-06,
"loss": 0.4023,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 60419806.0,
"step": 73
},
{
"entropy": 0.5682144165039062,
"epoch": 0.8409090909090909,
"grad_norm": 36.672735804041565,
"learning_rate": 9.793430284514063e-06,
"loss": 0.4244,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 61242019.0,
"step": 74
},
{
"entropy": 0.5541458129882812,
"epoch": 0.8522727272727273,
"grad_norm": 28.896417954347815,
"learning_rate": 9.78441716931754e-06,
"loss": 0.3576,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 62099804.0,
"step": 75
},
{
"entropy": 0.5613327026367188,
"epoch": 0.8636363636363636,
"grad_norm": 27.80615299063651,
"learning_rate": 9.775215926782788e-06,
"loss": 0.3477,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 62934736.0,
"step": 76
},
{
"entropy": 0.5644607543945312,
"epoch": 0.875,
"grad_norm": 24.175728899608025,
"learning_rate": 9.765826918710466e-06,
"loss": 0.3243,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 63755694.0,
"step": 77
},
{
"entropy": 0.5602569580078125,
"epoch": 0.8863636363636364,
"grad_norm": 23.17376499958452,
"learning_rate": 9.75625051428433e-06,
"loss": 0.3052,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 64618678.0,
"step": 78
},
{
"entropy": 0.5586624145507812,
"epoch": 0.8977272727272727,
"grad_norm": 19.233654316377294,
"learning_rate": 9.746487090056712e-06,
"loss": 0.2687,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 65481725.0,
"step": 79
},
{
"entropy": 0.544525146484375,
"epoch": 0.9090909090909091,
"grad_norm": 21.651323416304713,
"learning_rate": 9.736537029933738e-06,
"loss": 0.2901,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 66326767.0,
"step": 80
},
{
"entropy": 0.5647964477539062,
"epoch": 0.9204545454545454,
"grad_norm": 19.420139513068413,
"learning_rate": 9.726400725160199e-06,
"loss": 0.2637,
"mean_token_accuracy": 0.9179687548894435,
"num_tokens": 67132451.0,
"step": 81
},
{
"entropy": 0.5595474243164062,
"epoch": 0.9318181818181818,
"grad_norm": 14.444919061371778,
"learning_rate": 9.71607857430419e-06,
"loss": 0.2071,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 67951468.0,
"step": 82
},
{
"entropy": 0.5521011352539062,
"epoch": 0.9431818181818182,
"grad_norm": 16.094060936634268,
"learning_rate": 9.705570983241433e-06,
"loss": 0.2253,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 68794669.0,
"step": 83
},
{
"entropy": 0.5579681396484375,
"epoch": 0.9545454545454546,
"grad_norm": 15.839955166577948,
"learning_rate": 9.694878365139313e-06,
"loss": 0.242,
"mean_token_accuracy": 0.9179687548894435,
"num_tokens": 69610011.0,
"step": 84
},
{
"entropy": 0.5581436157226562,
"epoch": 0.9659090909090909,
"grad_norm": 10.342555180693717,
"learning_rate": 9.68400114044064e-06,
"loss": 0.2039,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 70406656.0,
"step": 85
},
{
"entropy": 0.5596466064453125,
"epoch": 0.9772727272727273,
"grad_norm": 9.046442023037564,
"learning_rate": 9.672939736847104e-06,
"loss": 0.1837,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 71214893.0,
"step": 86
},
{
"entropy": 0.5611190795898438,
"epoch": 0.9886363636363636,
"grad_norm": 8.777636166267405,
"learning_rate": 9.661694589302471e-06,
"loss": 0.1819,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 72038768.0,
"step": 87
},
{
"entropy": 0.5543975830078125,
"epoch": 1.0,
"grad_norm": 7.180103618322581,
"learning_rate": 9.650266139975474e-06,
"loss": 0.1855,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 72847782.0,
"step": 88
},
{
"entropy": 0.546783447265625,
"epoch": 1.0113636363636365,
"grad_norm": 11.51915215226843,
"learning_rate": 9.63865483824243e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 73701972.0,
"step": 89
},
{
"entropy": 0.5644760131835938,
"epoch": 1.0227272727272727,
"grad_norm": 16.213117035444654,
"learning_rate": 9.62686114066956e-06,
"loss": 0.2025,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 74539661.0,
"step": 90
},
{
"entropy": 0.551055908203125,
"epoch": 1.0340909090909092,
"grad_norm": 5.356675563856126,
"learning_rate": 9.614885510995047e-06,
"loss": 0.1652,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 75393325.0,
"step": 91
},
{
"entropy": 0.5531463623046875,
"epoch": 1.0454545454545454,
"grad_norm": 21.01084472531982,
"learning_rate": 9.602728420110807e-06,
"loss": 0.2453,
"mean_token_accuracy": 0.8932291730307043,
"num_tokens": 76230771.0,
"step": 92
},
{
"entropy": 0.5424957275390625,
"epoch": 1.0568181818181819,
"grad_norm": 9.730742448823642,
"learning_rate": 9.590390346043952e-06,
"loss": 0.1919,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 77097214.0,
"step": 93
},
{
"entropy": 0.5310592651367188,
"epoch": 1.0681818181818181,
"grad_norm": 19.44105394481565,
"learning_rate": 9.577871773938013e-06,
"loss": 0.2412,
"mean_token_accuracy": 0.8736979241948575,
"num_tokens": 78024772.0,
"step": 94
},
{
"entropy": 0.5767440795898438,
"epoch": 1.0795454545454546,
"grad_norm": 16.703429142070803,
"learning_rate": 9.565173196033855e-06,
"loss": 0.2218,
"mean_token_accuracy": 0.8763020907063037,
"num_tokens": 78828734.0,
"step": 95
},
{
"entropy": 0.5640106201171875,
"epoch": 1.0909090909090908,
"grad_norm": 4.368012071519686,
"learning_rate": 9.552295111650328e-06,
"loss": 0.1877,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 79685825.0,
"step": 96
},
{
"entropy": 0.5703125,
"epoch": 1.1022727272727273,
"grad_norm": 7.72202950747282,
"learning_rate": 9.539238027164618e-06,
"loss": 0.1969,
"mean_token_accuracy": 0.9049479223322123,
"num_tokens": 80538841.0,
"step": 97
},
{
"entropy": 0.5514984130859375,
"epoch": 1.1136363636363635,
"grad_norm": 8.583643682888079,
"learning_rate": 9.526002455992361e-06,
"loss": 0.1799,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 81406964.0,
"step": 98
},
{
"entropy": 0.5702438354492188,
"epoch": 1.125,
"grad_norm": 6.688536010514189,
"learning_rate": 9.512588918567429e-06,
"loss": 0.2044,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 82228992.0,
"step": 99
},
{
"entropy": 0.5829925537109375,
"epoch": 1.1363636363636362,
"grad_norm": 3.019991683808097,
"learning_rate": 9.498997942321484e-06,
"loss": 0.1495,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 83019198.0,
"step": 100
},
{
"entropy": 0.5686721801757812,
"epoch": 1.1477272727272727,
"grad_norm": 4.776977543639304,
"learning_rate": 9.48523006166323e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 83850145.0,
"step": 101
},
{
"entropy": 0.5742645263671875,
"epoch": 1.1590909090909092,
"grad_norm": 2.5324395623496323,
"learning_rate": 9.471285817957407e-06,
"loss": 0.1641,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 84659245.0,
"step": 102
},
{
"entropy": 0.5639724731445312,
"epoch": 1.1704545454545454,
"grad_norm": 3.1158639729762867,
"learning_rate": 9.457165759503492e-06,
"loss": 0.1662,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 85506138.0,
"step": 103
},
{
"entropy": 0.5506134033203125,
"epoch": 1.1818181818181819,
"grad_norm": 2.507395364410583,
"learning_rate": 9.442870441514155e-06,
"loss": 0.1461,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 86335548.0,
"step": 104
},
{
"entropy": 0.55096435546875,
"epoch": 1.1931818181818181,
"grad_norm": 12.463466410754858,
"learning_rate": 9.428400426093413e-06,
"loss": 0.1831,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 87158155.0,
"step": 105
},
{
"entropy": 0.5426483154296875,
"epoch": 1.2045454545454546,
"grad_norm": 4.335881181165624,
"learning_rate": 9.413756282214538e-06,
"loss": 0.1358,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 87999932.0,
"step": 106
},
{
"entropy": 0.5401229858398438,
"epoch": 1.2159090909090908,
"grad_norm": 15.619862153125911,
"learning_rate": 9.398938585697679e-06,
"loss": 0.2466,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 88819905.0,
"step": 107
},
{
"entropy": 0.5477371215820312,
"epoch": 1.2272727272727273,
"grad_norm": 12.902228668846844,
"learning_rate": 9.383947919187219e-06,
"loss": 0.2149,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 89634909.0,
"step": 108
},
{
"entropy": 0.5696182250976562,
"epoch": 1.2386363636363638,
"grad_norm": 2.68826428324166,
"learning_rate": 9.368784872128877e-06,
"loss": 0.1646,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 90402321.0,
"step": 109
},
{
"entropy": 0.564300537109375,
"epoch": 1.25,
"grad_norm": 10.378868916072332,
"learning_rate": 9.35345004074651e-06,
"loss": 0.1955,
"mean_token_accuracy": 0.8945312562864274,
"num_tokens": 91203638.0,
"step": 110
},
{
"entropy": 0.5643157958984375,
"epoch": 1.2613636363636362,
"grad_norm": 12.255195984734726,
"learning_rate": 9.337944028018689e-06,
"loss": 0.2115,
"mean_token_accuracy": 0.8632812581490725,
"num_tokens": 92020252.0,
"step": 111
},
{
"entropy": 0.5825271606445312,
"epoch": 1.2727272727272727,
"grad_norm": 7.770388720635199,
"learning_rate": 9.322267443654974e-06,
"loss": 0.1836,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 92816807.0,
"step": 112
},
{
"entropy": 0.560638427734375,
"epoch": 1.2840909090909092,
"grad_norm": 1.6431578654617318,
"learning_rate": 9.306420904071949e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 93649991.0,
"step": 113
},
{
"entropy": 0.561920166015625,
"epoch": 1.2954545454545454,
"grad_norm": 1.4017962904890229,
"learning_rate": 9.290405032368983e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 94459730.0,
"step": 114
},
{
"entropy": 0.5430450439453125,
"epoch": 1.3068181818181819,
"grad_norm": 1.6568698646332456,
"learning_rate": 9.274220458303727e-06,
"loss": 0.1466,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 95302339.0,
"step": 115
},
{
"entropy": 0.5311431884765625,
"epoch": 1.3181818181818181,
"grad_norm": 4.761720493612297,
"learning_rate": 9.257867818267347e-06,
"loss": 0.1553,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 96149112.0,
"step": 116
},
{
"entropy": 0.5357666015625,
"epoch": 1.3295454545454546,
"grad_norm": 2.1880525559156156,
"learning_rate": 9.241347755259514e-06,
"loss": 0.1222,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 96974920.0,
"step": 117
},
{
"entropy": 0.5627593994140625,
"epoch": 1.3409090909090908,
"grad_norm": 5.770062896951192,
"learning_rate": 9.224660918863104e-06,
"loss": 0.1592,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 97741872.0,
"step": 118
},
{
"entropy": 0.532928466796875,
"epoch": 1.3522727272727273,
"grad_norm": 1.7482037476041852,
"learning_rate": 9.207807965218668e-06,
"loss": 0.1545,
"mean_token_accuracy": 0.9153645883779973,
"num_tokens": 98575899.0,
"step": 119
},
{
"entropy": 0.5299148559570312,
"epoch": 1.3636363636363638,
"grad_norm": 5.278489286192921,
"learning_rate": 9.190789556998627e-06,
"loss": 0.1656,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 99431810.0,
"step": 120
},
{
"entropy": 0.5542984008789062,
"epoch": 1.375,
"grad_norm": 2.007494136264229,
"learning_rate": 9.173606363381218e-06,
"loss": 0.1351,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 100229559.0,
"step": 121
},
{
"entropy": 0.5370101928710938,
"epoch": 1.3863636363636362,
"grad_norm": 3.033501872375309,
"learning_rate": 9.156259060024177e-06,
"loss": 0.1464,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 101066027.0,
"step": 122
},
{
"entropy": 0.5522689819335938,
"epoch": 1.3977272727272727,
"grad_norm": 3.9482667306063894,
"learning_rate": 9.138748329038175e-06,
"loss": 0.1251,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 101861078.0,
"step": 123
},
{
"entropy": 0.5521087646484375,
"epoch": 1.4090909090909092,
"grad_norm": 6.9998063616091954,
"learning_rate": 9.121074858959997e-06,
"loss": 0.1495,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 102654586.0,
"step": 124
},
{
"entropy": 0.5347137451171875,
"epoch": 1.4204545454545454,
"grad_norm": 2.3410945679167954,
"learning_rate": 9.103239344725465e-06,
"loss": 0.1254,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 103497056.0,
"step": 125
},
{
"entropy": 0.5460281372070312,
"epoch": 1.4318181818181819,
"grad_norm": 2.2767941229194495,
"learning_rate": 9.085242487642117e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 104315903.0,
"step": 126
},
{
"entropy": 0.5574874877929688,
"epoch": 1.4431818181818181,
"grad_norm": 4.61478807505284,
"learning_rate": 9.067084995361623e-06,
"loss": 0.1352,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 105097772.0,
"step": 127
},
{
"entropy": 0.53790283203125,
"epoch": 1.4545454545454546,
"grad_norm": 2.123720821874447,
"learning_rate": 9.048767581851973e-06,
"loss": 0.1419,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 105938088.0,
"step": 128
},
{
"entropy": 0.5608444213867188,
"epoch": 1.4659090909090908,
"grad_norm": 1.5993090480931784,
"learning_rate": 9.030290967369392e-06,
"loss": 0.1217,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 106725926.0,
"step": 129
},
{
"entropy": 0.5270538330078125,
"epoch": 1.4772727272727273,
"grad_norm": 1.8273550840469062,
"learning_rate": 9.011655878430018e-06,
"loss": 0.1352,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 107586088.0,
"step": 130
},
{
"entropy": 0.5430755615234375,
"epoch": 1.4886363636363638,
"grad_norm": 2.38930161410884,
"learning_rate": 8.992863047781346e-06,
"loss": 0.1219,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 108406686.0,
"step": 131
},
{
"entropy": 0.5335464477539062,
"epoch": 1.5,
"grad_norm": 4.0374627826678005,
"learning_rate": 8.973913214373405e-06,
"loss": 0.1383,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 109266458.0,
"step": 132
},
{
"entropy": 0.53369140625,
"epoch": 1.5113636363636362,
"grad_norm": 2.267529849479496,
"learning_rate": 8.954807123329703e-06,
"loss": 0.1408,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 110097799.0,
"step": 133
},
{
"entropy": 0.5591506958007812,
"epoch": 1.5227272727272727,
"grad_norm": 1.4402288066852007,
"learning_rate": 8.935545525917936e-06,
"loss": 0.1294,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 110885530.0,
"step": 134
},
{
"entropy": 0.5544586181640625,
"epoch": 1.5340909090909092,
"grad_norm": 1.7978385448039365,
"learning_rate": 8.916129179520443e-06,
"loss": 0.1341,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 111709692.0,
"step": 135
},
{
"entropy": 0.549468994140625,
"epoch": 1.5454545454545454,
"grad_norm": 5.401067092612967,
"learning_rate": 8.896558847604414e-06,
"loss": 0.125,
"mean_token_accuracy": 0.9414062534924597,
"num_tokens": 112521769.0,
"step": 136
},
{
"entropy": 0.5504837036132812,
"epoch": 1.5568181818181817,
"grad_norm": 3.341966167312368,
"learning_rate": 8.876835299691892e-06,
"loss": 0.1175,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 113374792.0,
"step": 137
},
{
"entropy": 0.5494613647460938,
"epoch": 1.5681818181818183,
"grad_norm": 10.390450710003204,
"learning_rate": 8.856959311329495e-06,
"loss": 0.1683,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 114209235.0,
"step": 138
},
{
"entropy": 0.5491485595703125,
"epoch": 1.5795454545454546,
"grad_norm": 7.8130355045508395,
"learning_rate": 8.836931664057935e-06,
"loss": 0.1425,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 115024770.0,
"step": 139
},
{
"entropy": 0.5506820678710938,
"epoch": 1.5909090909090908,
"grad_norm": 2.7886680033055837,
"learning_rate": 8.816753145381276e-06,
"loss": 0.1168,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 115851204.0,
"step": 140
},
{
"entropy": 0.5492782592773438,
"epoch": 1.6022727272727273,
"grad_norm": 1.9667205066850295,
"learning_rate": 8.796424548735975e-06,
"loss": 0.134,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 116699283.0,
"step": 141
},
{
"entropy": 0.5760650634765625,
"epoch": 1.6136363636363638,
"grad_norm": 5.35821863023592,
"learning_rate": 8.775946673459682e-06,
"loss": 0.1229,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 117503423.0,
"step": 142
},
{
"entropy": 0.5779876708984375,
"epoch": 1.625,
"grad_norm": 2.8148266983890413,
"learning_rate": 8.755320324759808e-06,
"loss": 0.109,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 118291651.0,
"step": 143
},
{
"entropy": 0.5753326416015625,
"epoch": 1.6363636363636362,
"grad_norm": 3.2866194746376927,
"learning_rate": 8.734546313681869e-06,
"loss": 0.1232,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 119098075.0,
"step": 144
},
{
"entropy": 0.5827407836914062,
"epoch": 1.6477272727272727,
"grad_norm": 2.9644999295746874,
"learning_rate": 8.713625457077585e-06,
"loss": 0.1191,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 119888254.0,
"step": 145
},
{
"entropy": 0.5566940307617188,
"epoch": 1.6590909090909092,
"grad_norm": 1.2629263567013618,
"learning_rate": 8.692558577572773e-06,
"loss": 0.1199,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 120730204.0,
"step": 146
},
{
"entropy": 0.5731048583984375,
"epoch": 1.6704545454545454,
"grad_norm": 4.651622739066796,
"learning_rate": 8.671346503534987e-06,
"loss": 0.1216,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 121547571.0,
"step": 147
},
{
"entropy": 0.5564804077148438,
"epoch": 1.6818181818181817,
"grad_norm": 1.8545038590057121,
"learning_rate": 8.64999006904096e-06,
"loss": 0.0951,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 122398891.0,
"step": 148
},
{
"entropy": 0.5613327026367188,
"epoch": 1.6931818181818183,
"grad_norm": 6.832643774674837,
"learning_rate": 8.628490113843798e-06,
"loss": 0.1649,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 123217975.0,
"step": 149
},
{
"entropy": 0.5449295043945312,
"epoch": 1.7045454545454546,
"grad_norm": 6.765128841750841,
"learning_rate": 8.606847483339957e-06,
"loss": 0.1562,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 124067259.0,
"step": 150
},
{
"entropy": 0.5537567138671875,
"epoch": 1.7159090909090908,
"grad_norm": 1.8067052651080437,
"learning_rate": 8.585063028536015e-06,
"loss": 0.0881,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 124928545.0,
"step": 151
},
{
"entropy": 0.5481033325195312,
"epoch": 1.7272727272727273,
"grad_norm": 1.936600476892622,
"learning_rate": 8.563137606015201e-06,
"loss": 0.1096,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 125802700.0,
"step": 152
},
{
"entropy": 0.5662612915039062,
"epoch": 1.7386363636363638,
"grad_norm": 3.3667078901079286,
"learning_rate": 8.54107207790371e-06,
"loss": 0.0913,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 126615551.0,
"step": 153
},
{
"entropy": 0.5416641235351562,
"epoch": 1.75,
"grad_norm": 1.8440360775274136,
"learning_rate": 8.518867311836808e-06,
"loss": 0.1098,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 127457356.0,
"step": 154
},
{
"entropy": 0.539337158203125,
"epoch": 1.7613636363636362,
"grad_norm": 2.928897176060182,
"learning_rate": 8.49652418092472e-06,
"loss": 0.0814,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 128340971.0,
"step": 155
},
{
"entropy": 0.5511016845703125,
"epoch": 1.7727272727272727,
"grad_norm": 2.5333879176767247,
"learning_rate": 8.474043563718287e-06,
"loss": 0.0981,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 129194640.0,
"step": 156
},
{
"entropy": 0.5571365356445312,
"epoch": 1.7840909090909092,
"grad_norm": 3.34543946127575,
"learning_rate": 8.451426344174433e-06,
"loss": 0.0957,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 130035771.0,
"step": 157
},
{
"entropy": 0.5454559326171875,
"epoch": 1.7954545454545454,
"grad_norm": 1.7036283510875303,
"learning_rate": 8.4286734116214e-06,
"loss": 0.0966,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 130925665.0,
"step": 158
},
{
"entropy": 0.5605239868164062,
"epoch": 1.8068181818181817,
"grad_norm": 1.665281365638825,
"learning_rate": 8.405785660723784e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 131762678.0,
"step": 159
},
{
"entropy": 0.5491256713867188,
"epoch": 1.8181818181818183,
"grad_norm": 7.473940273244859,
"learning_rate": 8.382763991447344e-06,
"loss": 0.1227,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 132615569.0,
"step": 160
},
{
"entropy": 0.5663986206054688,
"epoch": 1.8295454545454546,
"grad_norm": 7.633821231684359,
"learning_rate": 8.359609309023632e-06,
"loss": 0.125,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 133443981.0,
"step": 161
},
{
"entropy": 0.5599212646484375,
"epoch": 1.8409090909090908,
"grad_norm": 1.7052593935067812,
"learning_rate": 8.336322523914385e-06,
"loss": 0.0974,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 134283417.0,
"step": 162
},
{
"entropy": 0.551239013671875,
"epoch": 1.8522727272727273,
"grad_norm": 2.9955966815957433,
"learning_rate": 8.312904551775731e-06,
"loss": 0.096,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 135129505.0,
"step": 163
},
{
"entropy": 0.54931640625,
"epoch": 1.8636363636363638,
"grad_norm": 1.605468865667034,
"learning_rate": 8.289356313422182e-06,
"loss": 0.116,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 135982474.0,
"step": 164
},
{
"entropy": 0.5581436157226562,
"epoch": 1.875,
"grad_norm": 1.6624611761909598,
"learning_rate": 8.26567873479043e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 136810572.0,
"step": 165
},
{
"entropy": 0.564849853515625,
"epoch": 1.8863636363636362,
"grad_norm": 2.682316743161302,
"learning_rate": 8.241872746902934e-06,
"loss": 0.0893,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 137623231.0,
"step": 166
},
{
"entropy": 0.5549545288085938,
"epoch": 1.8977272727272727,
"grad_norm": 3.8888251203366844,
"learning_rate": 8.217939285831315e-06,
"loss": 0.1113,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 138474048.0,
"step": 167
},
{
"entropy": 0.5504684448242188,
"epoch": 1.9090909090909092,
"grad_norm": 1.723647194606046,
"learning_rate": 8.19387929265955e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.967447918606922,
"num_tokens": 139318039.0,
"step": 168
},
{
"entropy": 0.5573348999023438,
"epoch": 1.9204545454545454,
"grad_norm": 5.091415219779504,
"learning_rate": 8.16969371344696e-06,
"loss": 0.094,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 140136159.0,
"step": 169
},
{
"entropy": 0.558563232421875,
"epoch": 1.9318181818181817,
"grad_norm": 3.8416271732252576,
"learning_rate": 8.14538349919102e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 140989018.0,
"step": 170
},
{
"entropy": 0.5544891357421875,
"epoch": 1.9431818181818183,
"grad_norm": 4.006864387265136,
"learning_rate": 8.12094960578996e-06,
"loss": 0.0885,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 141834446.0,
"step": 171
},
{
"entropy": 0.5765304565429688,
"epoch": 1.9545454545454546,
"grad_norm": 2.0489624758048204,
"learning_rate": 8.096392994005177e-06,
"loss": 0.0784,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 142637643.0,
"step": 172
},
{
"entropy": 0.5684738159179688,
"epoch": 1.9659090909090908,
"grad_norm": 1.7193238150531707,
"learning_rate": 8.071714629423459e-06,
"loss": 0.0671,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 143446972.0,
"step": 173
},
{
"entropy": 0.5797042846679688,
"epoch": 1.9772727272727273,
"grad_norm": 1.8892931068119632,
"learning_rate": 8.046915482419018e-06,
"loss": 0.0761,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 144219843.0,
"step": 174
},
{
"entropy": 0.5546188354492188,
"epoch": 1.9886363636363638,
"grad_norm": 2.89175754473916,
"learning_rate": 8.021996528115335e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 145065280.0,
"step": 175
},
{
"entropy": 0.5574493408203125,
"epoch": 2.0,
"grad_norm": 1.7760622157547898,
"learning_rate": 7.996958746346812e-06,
"loss": 0.0593,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 145884441.0,
"step": 176
},
{
"entropy": 0.5555419921875,
"epoch": 2.0113636363636362,
"grad_norm": 1.6411360190424782,
"learning_rate": 7.971803121620252e-06,
"loss": 0.0667,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 146692231.0,
"step": 177
},
{
"entropy": 0.5388107299804688,
"epoch": 2.022727272727273,
"grad_norm": 2.3653599867436776,
"learning_rate": 7.946530643076138e-06,
"loss": 0.0652,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 147528006.0,
"step": 178
},
{
"entropy": 0.5242385864257812,
"epoch": 2.034090909090909,
"grad_norm": 3.2222161790722694,
"learning_rate": 7.921142304449744e-06,
"loss": 0.0675,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 148417260.0,
"step": 179
},
{
"entropy": 0.5447845458984375,
"epoch": 2.0454545454545454,
"grad_norm": 2.463707776669041,
"learning_rate": 7.895639104032071e-06,
"loss": 0.066,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 149242794.0,
"step": 180
},
{
"entropy": 0.5285720825195312,
"epoch": 2.0568181818181817,
"grad_norm": 7.236977149302813,
"learning_rate": 7.870022044630569e-06,
"loss": 0.1021,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 150098081.0,
"step": 181
},
{
"entropy": 0.54425048828125,
"epoch": 2.0681818181818183,
"grad_norm": 3.3362947055324494,
"learning_rate": 7.844292133529727e-06,
"loss": 0.0811,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 150917507.0,
"step": 182
},
{
"entropy": 0.5337677001953125,
"epoch": 2.0795454545454546,
"grad_norm": 10.193693903709919,
"learning_rate": 7.818450382451457e-06,
"loss": 0.1514,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 151774062.0,
"step": 183
},
{
"entropy": 0.5380935668945312,
"epoch": 2.090909090909091,
"grad_norm": 6.471329983707537,
"learning_rate": 7.792497807515317e-06,
"loss": 0.1129,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 152611097.0,
"step": 184
},
{
"entropy": 0.5472564697265625,
"epoch": 2.102272727272727,
"grad_norm": 1.6410409359900417,
"learning_rate": 7.766435429198547e-06,
"loss": 0.0666,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 153466669.0,
"step": 185
},
{
"entropy": 0.5562591552734375,
"epoch": 2.1136363636363638,
"grad_norm": 6.992530287146324,
"learning_rate": 7.740264272295954e-06,
"loss": 0.1071,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 154298660.0,
"step": 186
},
{
"entropy": 0.5470352172851562,
"epoch": 2.125,
"grad_norm": 6.0492157426773385,
"learning_rate": 7.713985365879607e-06,
"loss": 0.0907,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 155130292.0,
"step": 187
},
{
"entropy": 0.54876708984375,
"epoch": 2.1363636363636362,
"grad_norm": 2.698030226800749,
"learning_rate": 7.68759974325838e-06,
"loss": 0.0665,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 155975863.0,
"step": 188
},
{
"entropy": 0.5512847900390625,
"epoch": 2.147727272727273,
"grad_norm": 5.25550884455084,
"learning_rate": 7.661108441937321e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 156800322.0,
"step": 189
},
{
"entropy": 0.5344924926757812,
"epoch": 2.159090909090909,
"grad_norm": 9.276367829707043,
"learning_rate": 7.63451250357685e-06,
"loss": 0.1309,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 157668790.0,
"step": 190
},
{
"entropy": 0.5440673828125,
"epoch": 2.1704545454545454,
"grad_norm": 5.295760824612763,
"learning_rate": 7.607812973951802e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 158496829.0,
"step": 191
},
{
"entropy": 0.5383377075195312,
"epoch": 2.1818181818181817,
"grad_norm": 1.4547423731248486,
"learning_rate": 7.581010902910316e-06,
"loss": 0.0592,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 159347676.0,
"step": 192
},
{
"entropy": 0.5419235229492188,
"epoch": 2.1931818181818183,
"grad_norm": 3.45460465485423,
"learning_rate": 7.55410734433254e-06,
"loss": 0.0756,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 160186343.0,
"step": 193
},
{
"entropy": 0.5480194091796875,
"epoch": 2.2045454545454546,
"grad_norm": 4.242880928264725,
"learning_rate": 7.5271033560892e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 160994811.0,
"step": 194
},
{
"entropy": 0.54876708984375,
"epoch": 2.215909090909091,
"grad_norm": 2.01718177176073,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0441,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 161804760.0,
"step": 195
},
{
"entropy": 0.5249099731445312,
"epoch": 2.227272727272727,
"grad_norm": 3.119173047157026,
"learning_rate": 7.472798341791877e-06,
"loss": 0.0502,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 162669635.0,
"step": 196
},
{
"entropy": 0.5340576171875,
"epoch": 2.2386363636363638,
"grad_norm": 3.2640184382157327,
"learning_rate": 7.445499451057083e-06,
"loss": 0.0751,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 163509429.0,
"step": 197
},
{
"entropy": 0.5215835571289062,
"epoch": 2.25,
"grad_norm": 2.5349734692769106,
"learning_rate": 7.418104401211144e-06,
"loss": 0.045,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 164350633.0,
"step": 198
},
{
"entropy": 0.5251007080078125,
"epoch": 2.2613636363636362,
"grad_norm": 2.6297238209432,
"learning_rate": 7.390614269450633e-06,
"loss": 0.0509,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 165187521.0,
"step": 199
},
{
"entropy": 0.5383987426757812,
"epoch": 2.2727272727272725,
"grad_norm": 3.562530401980587,
"learning_rate": 7.363030136710837e-06,
"loss": 0.05,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 166006751.0,
"step": 200
},
{
"entropy": 0.5240249633789062,
"epoch": 2.284090909090909,
"grad_norm": 3.7171813986264657,
"learning_rate": 7.3353530876232315e-06,
"loss": 0.0712,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 166843238.0,
"step": 201
},
{
"entropy": 0.5258255004882812,
"epoch": 2.2954545454545454,
"grad_norm": 2.0334492418178454,
"learning_rate": 7.3075842104728445e-06,
"loss": 0.0463,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 167691986.0,
"step": 202
},
{
"entropy": 0.5163116455078125,
"epoch": 2.3068181818181817,
"grad_norm": 5.950230395278793,
"learning_rate": 7.279724597155463e-06,
"loss": 0.0688,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 168532989.0,
"step": 203
},
{
"entropy": 0.5151596069335938,
"epoch": 2.3181818181818183,
"grad_norm": 6.82055119678414,
"learning_rate": 7.251775343134695e-06,
"loss": 0.0818,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 169363455.0,
"step": 204
},
{
"entropy": 0.5315170288085938,
"epoch": 2.3295454545454546,
"grad_norm": 3.65607888800478,
"learning_rate": 7.223737547398898e-06,
"loss": 0.0681,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 170177432.0,
"step": 205
},
{
"entropy": 0.5236282348632812,
"epoch": 2.340909090909091,
"grad_norm": 3.922669241526332,
"learning_rate": 7.195612312417964e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 171014423.0,
"step": 206
},
{
"entropy": 0.5333251953125,
"epoch": 2.3522727272727275,
"grad_norm": 5.835006780995604,
"learning_rate": 7.1674007440999706e-06,
"loss": 0.0701,
"mean_token_accuracy": 0.967447918606922,
"num_tokens": 171832784.0,
"step": 207
},
{
"entropy": 0.5201034545898438,
"epoch": 2.3636363636363638,
"grad_norm": 3.762781794806702,
"learning_rate": 7.139103951747694e-06,
"loss": 0.0689,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 172678098.0,
"step": 208
},
{
"entropy": 0.5207901000976562,
"epoch": 2.375,
"grad_norm": 2.8820018215511296,
"learning_rate": 7.110723048014996e-06,
"loss": 0.051,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 173521043.0,
"step": 209
},
{
"entropy": 0.5383834838867188,
"epoch": 2.3863636363636362,
"grad_norm": 4.032203821493698,
"learning_rate": 7.082259148863064e-06,
"loss": 0.0514,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 174330129.0,
"step": 210
},
{
"entropy": 0.5193099975585938,
"epoch": 2.3977272727272725,
"grad_norm": 4.031643917333958,
"learning_rate": 7.053713373516538e-06,
"loss": 0.0599,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 175179051.0,
"step": 211
},
{
"entropy": 0.5137481689453125,
"epoch": 2.409090909090909,
"grad_norm": 1.494980753055014,
"learning_rate": 7.0250868444195e-06,
"loss": 0.0463,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 176037909.0,
"step": 212
},
{
"entropy": 0.5253143310546875,
"epoch": 2.4204545454545454,
"grad_norm": 1.7895769023957353,
"learning_rate": 6.996380687191335e-06,
"loss": 0.0445,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 176860131.0,
"step": 213
},
{
"entropy": 0.5091476440429688,
"epoch": 2.4318181818181817,
"grad_norm": 2.1956945351245682,
"learning_rate": 6.9675960305824785e-06,
"loss": 0.0398,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 177705136.0,
"step": 214
},
{
"entropy": 0.523956298828125,
"epoch": 2.4431818181818183,
"grad_norm": 2.703363904992189,
"learning_rate": 6.9387340064300234e-06,
"loss": 0.0425,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 178510671.0,
"step": 215
},
{
"entropy": 0.5203170776367188,
"epoch": 2.4545454545454546,
"grad_norm": 2.1620913420308496,
"learning_rate": 6.909795749613223e-06,
"loss": 0.034,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 179329651.0,
"step": 216
},
{
"entropy": 0.5090408325195312,
"epoch": 2.465909090909091,
"grad_norm": 2.3276449435155357,
"learning_rate": 6.880782398008862e-06,
"loss": 0.0318,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 180158843.0,
"step": 217
},
{
"entropy": 0.501129150390625,
"epoch": 2.4772727272727275,
"grad_norm": 3.2599600902110795,
"learning_rate": 6.851695092446517e-06,
"loss": 0.0347,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 181003645.0,
"step": 218
},
{
"entropy": 0.49993133544921875,
"epoch": 2.4886363636363638,
"grad_norm": 2.2988867090225753,
"learning_rate": 6.822534976663695e-06,
"loss": 0.0249,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 181838208.0,
"step": 219
},
{
"entropy": 0.49395751953125,
"epoch": 2.5,
"grad_norm": 2.8150814971598033,
"learning_rate": 6.7933031972608644e-06,
"loss": 0.0229,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 182681925.0,
"step": 220
},
{
"entropy": 0.49420928955078125,
"epoch": 2.5113636363636362,
"grad_norm": 2.4377006101904937,
"learning_rate": 6.764000903656367e-06,
"loss": 0.0219,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 183519031.0,
"step": 221
},
{
"entropy": 0.49845123291015625,
"epoch": 2.5227272727272725,
"grad_norm": 1.9554050092260766,
"learning_rate": 6.734629248041226e-06,
"loss": 0.0282,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 184358917.0,
"step": 222
},
{
"entropy": 0.4810333251953125,
"epoch": 2.534090909090909,
"grad_norm": 1.9724903668294038,
"learning_rate": 6.70518938533383e-06,
"loss": 0.0223,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 185224412.0,
"step": 223
},
{
"entropy": 0.5009841918945312,
"epoch": 2.5454545454545454,
"grad_norm": 2.716850789628636,
"learning_rate": 6.675682473134536e-06,
"loss": 0.0354,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 186048899.0,
"step": 224
},
{
"entropy": 0.494140625,
"epoch": 2.5568181818181817,
"grad_norm": 3.486680710445784,
"learning_rate": 6.64610967168014e-06,
"loss": 0.045,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 186880043.0,
"step": 225
},
{
"entropy": 0.5100173950195312,
"epoch": 2.5681818181818183,
"grad_norm": 3.387793071023026,
"learning_rate": 6.61647214379826e-06,
"loss": 0.0482,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 187654506.0,
"step": 226
},
{
"entropy": 0.5018386840820312,
"epoch": 2.5795454545454546,
"grad_norm": 2.526453396302034,
"learning_rate": 6.586771054861613e-06,
"loss": 0.0263,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 188464920.0,
"step": 227
},
{
"entropy": 0.5029830932617188,
"epoch": 2.590909090909091,
"grad_norm": 3.0298112078189554,
"learning_rate": 6.55700757274219e-06,
"loss": 0.0374,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 189293301.0,
"step": 228
},
{
"entropy": 0.5063552856445312,
"epoch": 2.6022727272727275,
"grad_norm": 2.394723685020552,
"learning_rate": 6.527182867765333e-06,
"loss": 0.023,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 190121007.0,
"step": 229
},
{
"entropy": 0.5308837890625,
"epoch": 2.6136363636363638,
"grad_norm": 2.945677885521069,
"learning_rate": 6.497298112663721e-06,
"loss": 0.0335,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 190881003.0,
"step": 230
},
{
"entropy": 0.506683349609375,
"epoch": 2.625,
"grad_norm": 2.431103647089613,
"learning_rate": 6.467354482531254e-06,
"loss": 0.0189,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 191699288.0,
"step": 231
},
{
"entropy": 0.50860595703125,
"epoch": 2.6363636363636362,
"grad_norm": 2.6832616012101975,
"learning_rate": 6.437353154776848e-06,
"loss": 0.0289,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 192526668.0,
"step": 232
},
{
"entropy": 0.5082778930664062,
"epoch": 2.6477272727272725,
"grad_norm": 2.2653553256013796,
"learning_rate": 6.407295309078139e-06,
"loss": 0.0302,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 193349142.0,
"step": 233
},
{
"entropy": 0.5096511840820312,
"epoch": 2.659090909090909,
"grad_norm": 1.5361613738548292,
"learning_rate": 6.377182127335096e-06,
"loss": 0.0145,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 194176787.0,
"step": 234
},
{
"entropy": 0.5134658813476562,
"epoch": 2.6704545454545454,
"grad_norm": 2.8189785812143295,
"learning_rate": 6.3470147936235485e-06,
"loss": 0.0238,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 195004474.0,
"step": 235
},
{
"entropy": 0.5214920043945312,
"epoch": 2.6818181818181817,
"grad_norm": 1.7469149362286367,
"learning_rate": 6.316794494148625e-06,
"loss": 0.0224,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 195807358.0,
"step": 236
},
{
"entropy": 0.5124053955078125,
"epoch": 2.6931818181818183,
"grad_norm": 3.217590245044298,
"learning_rate": 6.286522417198115e-06,
"loss": 0.0284,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 196617818.0,
"step": 237
},
{
"entropy": 0.5110855102539062,
"epoch": 2.7045454545454546,
"grad_norm": 1.6664060279228878,
"learning_rate": 6.256199753095745e-06,
"loss": 0.0188,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 197430311.0,
"step": 238
},
{
"entropy": 0.505767822265625,
"epoch": 2.715909090909091,
"grad_norm": 3.3115585706561586,
"learning_rate": 6.225827694154365e-06,
"loss": 0.0287,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 198277321.0,
"step": 239
},
{
"entropy": 0.5065383911132812,
"epoch": 2.7272727272727275,
"grad_norm": 2.545417377201447,
"learning_rate": 6.1954074346290775e-06,
"loss": 0.0186,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 199108815.0,
"step": 240
},
{
"entropy": 0.5145187377929688,
"epoch": 2.7386363636363638,
"grad_norm": 2.333232181354277,
"learning_rate": 6.164940170670266e-06,
"loss": 0.0467,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 199899561.0,
"step": 241
},
{
"entropy": 0.516876220703125,
"epoch": 2.75,
"grad_norm": 2.914196581871424,
"learning_rate": 6.134427100276579e-06,
"loss": 0.0562,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 200725431.0,
"step": 242
},
{
"entropy": 0.5063247680664062,
"epoch": 2.7613636363636362,
"grad_norm": 1.8740763274483776,
"learning_rate": 6.1038694232478e-06,
"loss": 0.0263,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 201560765.0,
"step": 243
},
{
"entropy": 0.5165557861328125,
"epoch": 2.7727272727272725,
"grad_norm": 2.8713490854584705,
"learning_rate": 6.073268341137694e-06,
"loss": 0.0273,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 202381503.0,
"step": 244
},
{
"entropy": 0.5127792358398438,
"epoch": 2.784090909090909,
"grad_norm": 1.4259016961929543,
"learning_rate": 6.042625057206742e-06,
"loss": 0.0237,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 203203187.0,
"step": 245
},
{
"entropy": 0.5303802490234375,
"epoch": 2.7954545454545454,
"grad_norm": 2.761830170391252,
"learning_rate": 6.0119407763748465e-06,
"loss": 0.0256,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 203986073.0,
"step": 246
},
{
"entropy": 0.5184249877929688,
"epoch": 2.8068181818181817,
"grad_norm": 2.702130210407872,
"learning_rate": 5.98121670517393e-06,
"loss": 0.0274,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 204805437.0,
"step": 247
},
{
"entropy": 0.5221176147460938,
"epoch": 2.8181818181818183,
"grad_norm": 2.0126057256121936,
"learning_rate": 5.950454051700519e-06,
"loss": 0.0324,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 205608028.0,
"step": 248
},
{
"entropy": 0.5098419189453125,
"epoch": 2.8295454545454546,
"grad_norm": 3.2909474534699363,
"learning_rate": 5.919654025568216e-06,
"loss": 0.0392,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 206426839.0,
"step": 249
},
{
"entropy": 0.49706268310546875,
"epoch": 2.840909090909091,
"grad_norm": 2.8470716408708774,
"learning_rate": 5.8888178378601565e-06,
"loss": 0.0344,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 207279265.0,
"step": 250
},
{
"entropy": 0.5188217163085938,
"epoch": 2.8522727272727275,
"grad_norm": 3.5895784761020173,
"learning_rate": 5.85794670108138e-06,
"loss": 0.0305,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 208074284.0,
"step": 251
},
{
"entropy": 0.5012741088867188,
"epoch": 2.8636363636363638,
"grad_norm": 6.393556799364061,
"learning_rate": 5.827041829111144e-06,
"loss": 0.0626,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 208920165.0,
"step": 252
},
{
"entropy": 0.499847412109375,
"epoch": 2.875,
"grad_norm": 3.127734745296954,
"learning_rate": 5.796104437155213e-06,
"loss": 0.0261,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 209763076.0,
"step": 253
},
{
"entropy": 0.498291015625,
"epoch": 2.8863636363636362,
"grad_norm": 2.259346877145594,
"learning_rate": 5.765135741698058e-06,
"loss": 0.023,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 210578485.0,
"step": 254
},
{
"entropy": 0.5106277465820312,
"epoch": 2.8977272727272725,
"grad_norm": 3.030246237871133,
"learning_rate": 5.734136960455035e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 211388341.0,
"step": 255
},
{
"entropy": 0.49578857421875,
"epoch": 2.909090909090909,
"grad_norm": 2.750429015412331,
"learning_rate": 5.703109312324493e-06,
"loss": 0.0358,
"mean_token_accuracy": 0.9882812506984919,
"num_tokens": 212229038.0,
"step": 256
},
{
"entropy": 0.5024261474609375,
"epoch": 2.9204545454545454,
"grad_norm": 1.2863557607521734,
"learning_rate": 5.672054017339855e-06,
"loss": 0.0148,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 213072715.0,
"step": 257
},
{
"entropy": 0.49884796142578125,
"epoch": 2.9318181818181817,
"grad_norm": 3.02221939510891,
"learning_rate": 5.640972296621644e-06,
"loss": 0.0238,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 213912498.0,
"step": 258
},
{
"entropy": 0.5084075927734375,
"epoch": 2.9431818181818183,
"grad_norm": 4.428890553493783,
"learning_rate": 5.609865372329461e-06,
"loss": 0.0398,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 214726839.0,
"step": 259
},
{
"entropy": 0.5108566284179688,
"epoch": 2.9545454545454546,
"grad_norm": 2.5752296561401664,
"learning_rate": 5.578734467613933e-06,
"loss": 0.0223,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 215527901.0,
"step": 260
},
{
"entropy": 0.4960784912109375,
"epoch": 2.965909090909091,
"grad_norm": 1.401132052463041,
"learning_rate": 5.547580806568621e-06,
"loss": 0.0187,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 216370628.0,
"step": 261
},
{
"entropy": 0.5012435913085938,
"epoch": 2.9772727272727275,
"grad_norm": 1.8759810858997696,
"learning_rate": 5.516405614181883e-06,
"loss": 0.0164,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 217191647.0,
"step": 262
},
{
"entropy": 0.494384765625,
"epoch": 2.9886363636363638,
"grad_norm": 1.708292087377222,
"learning_rate": 5.485210116288704e-06,
"loss": 0.0128,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 218045769.0,
"step": 263
},
{
"entropy": 0.4870147705078125,
"epoch": 3.0,
"grad_norm": 3.1469843111715337,
"learning_rate": 5.453995539522503e-06,
"loss": 0.0294,
"mean_token_accuracy": 0.9869791674427688,
"num_tokens": 218903155.0,
"step": 264
},
{
"entropy": 0.49308013916015625,
"epoch": 3.0113636363636362,
"grad_norm": 0.910668285240801,
"learning_rate": 5.4227631112668955e-06,
"loss": 0.0086,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 219733127.0,
"step": 265
},
{
"entropy": 0.497039794921875,
"epoch": 3.022727272727273,
"grad_norm": 1.4708773619805977,
"learning_rate": 5.391514059607431e-06,
"loss": 0.0098,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 220560193.0,
"step": 266
},
{
"entropy": 0.4811553955078125,
"epoch": 3.034090909090909,
"grad_norm": 2.026974085665039,
"learning_rate": 5.360249613283308e-06,
"loss": 0.014,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 221419000.0,
"step": 267
},
{
"entropy": 0.5044631958007812,
"epoch": 3.0454545454545454,
"grad_norm": 1.7384277706071183,
"learning_rate": 5.328971001639054e-06,
"loss": 0.0138,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 222209715.0,
"step": 268
},
{
"entropy": 0.4959716796875,
"epoch": 3.0568181818181817,
"grad_norm": 2.1623095900546883,
"learning_rate": 5.2976794545761886e-06,
"loss": 0.0188,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 223024902.0,
"step": 269
},
{
"entropy": 0.49108123779296875,
"epoch": 3.0681818181818183,
"grad_norm": 1.509606911341433,
"learning_rate": 5.266376202504866e-06,
"loss": 0.0217,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 223854643.0,
"step": 270
},
{
"entropy": 0.48877716064453125,
"epoch": 3.0795454545454546,
"grad_norm": 3.1043150088135776,
"learning_rate": 5.235062476295488e-06,
"loss": 0.0311,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 224719358.0,
"step": 271
},
{
"entropy": 0.4909515380859375,
"epoch": 3.090909090909091,
"grad_norm": 3.13964195517409,
"learning_rate": 5.203739507230311e-06,
"loss": 0.028,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 225549159.0,
"step": 272
},
{
"entropy": 0.50506591796875,
"epoch": 3.102272727272727,
"grad_norm": 1.6160317091643448,
"learning_rate": 5.172408526955025e-06,
"loss": 0.0071,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 226368587.0,
"step": 273
},
{
"entropy": 0.5047378540039062,
"epoch": 3.1136363636363638,
"grad_norm": 1.353500778978189,
"learning_rate": 5.141070767430331e-06,
"loss": 0.0108,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 227156622.0,
"step": 274
},
{
"entropy": 0.5011825561523438,
"epoch": 3.125,
"grad_norm": 6.367060575954853,
"learning_rate": 5.109727460883496e-06,
"loss": 0.0531,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 227980137.0,
"step": 275
},
{
"entropy": 0.5009002685546875,
"epoch": 3.1363636363636362,
"grad_norm": 3.0144130557709854,
"learning_rate": 5.078379839759895e-06,
"loss": 0.0151,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 228789801.0,
"step": 276
},
{
"entropy": 0.490264892578125,
"epoch": 3.147727272727273,
"grad_norm": 1.8289965387337097,
"learning_rate": 5.047029136674563e-06,
"loss": 0.0068,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 229631483.0,
"step": 277
},
{
"entropy": 0.49471282958984375,
"epoch": 3.159090909090909,
"grad_norm": 3.5661554320566493,
"learning_rate": 5.015676584363716e-06,
"loss": 0.0311,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 230473829.0,
"step": 278
},
{
"entropy": 0.49500274658203125,
"epoch": 3.1704545454545454,
"grad_norm": 2.297701797368414,
"learning_rate": 4.984323415636285e-06,
"loss": 0.0147,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 231297503.0,
"step": 279
},
{
"entropy": 0.48564910888671875,
"epoch": 3.1818181818181817,
"grad_norm": 1.5108735884549447,
"learning_rate": 4.95297086332544e-06,
"loss": 0.0141,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 232167203.0,
"step": 280
},
{
"entropy": 0.5065155029296875,
"epoch": 3.1931818181818183,
"grad_norm": 1.4197005931026672,
"learning_rate": 4.921620160240107e-06,
"loss": 0.0086,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 232982992.0,
"step": 281
},
{
"entropy": 0.50030517578125,
"epoch": 3.2045454545454546,
"grad_norm": 2.0658589438724086,
"learning_rate": 4.890272539116508e-06,
"loss": 0.0177,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 233794951.0,
"step": 282
},
{
"entropy": 0.5062942504882812,
"epoch": 3.215909090909091,
"grad_norm": 1.390316826284385,
"learning_rate": 4.858929232569671e-06,
"loss": 0.0116,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 234627063.0,
"step": 283
},
{
"entropy": 0.508026123046875,
"epoch": 3.227272727272727,
"grad_norm": 0.8411098837665795,
"learning_rate": 4.827591473044978e-06,
"loss": 0.0114,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 235422472.0,
"step": 284
},
{
"entropy": 0.5152587890625,
"epoch": 3.2386363636363638,
"grad_norm": 1.109489038564382,
"learning_rate": 4.796260492769691e-06,
"loss": 0.0101,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 236250854.0,
"step": 285
},
{
"entropy": 0.526885986328125,
"epoch": 3.25,
"grad_norm": 1.618921272165497,
"learning_rate": 4.7649375237045135e-06,
"loss": 0.0202,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 237025724.0,
"step": 286
},
{
"entropy": 0.51226806640625,
"epoch": 3.2613636363636362,
"grad_norm": 0.8364422992296099,
"learning_rate": 4.733623797495136e-06,
"loss": 0.0196,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 237847006.0,
"step": 287
},
{
"entropy": 0.5173263549804688,
"epoch": 3.2727272727272725,
"grad_norm": 0.9380743709915682,
"learning_rate": 4.702320545423814e-06,
"loss": 0.0072,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 238659400.0,
"step": 288
},
{
"entropy": 0.50469970703125,
"epoch": 3.284090909090909,
"grad_norm": 0.8521605214578054,
"learning_rate": 4.671028998360947e-06,
"loss": 0.0062,
"mean_token_accuracy": 1.0,
"num_tokens": 239520566.0,
"step": 289
},
{
"entropy": 0.5128173828125,
"epoch": 3.2954545454545454,
"grad_norm": 1.593635812033709,
"learning_rate": 4.639750386716693e-06,
"loss": 0.0092,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 240338169.0,
"step": 290
},
{
"entropy": 0.49086761474609375,
"epoch": 3.3068181818181817,
"grad_norm": 1.5856201719842262,
"learning_rate": 4.60848594039257e-06,
"loss": 0.0061,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 241205913.0,
"step": 291
},
{
"entropy": 0.49979400634765625,
"epoch": 3.3181818181818183,
"grad_norm": 2.4064616761833673,
"learning_rate": 4.5772368887331044e-06,
"loss": 0.0191,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 242031860.0,
"step": 292
},
{
"entropy": 0.501129150390625,
"epoch": 3.3295454545454546,
"grad_norm": 1.786638161744003,
"learning_rate": 4.5460044604774986e-06,
"loss": 0.0121,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 242863430.0,
"step": 293
},
{
"entropy": 0.49781036376953125,
"epoch": 3.340909090909091,
"grad_norm": 1.3504594379263928,
"learning_rate": 4.514789883711296e-06,
"loss": 0.0042,
"mean_token_accuracy": 1.0,
"num_tokens": 243706209.0,
"step": 294
},
{
"entropy": 0.5142593383789062,
"epoch": 3.3522727272727275,
"grad_norm": 2.0637007989663494,
"learning_rate": 4.483594385818119e-06,
"loss": 0.0112,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 244495154.0,
"step": 295
},
{
"entropy": 0.491363525390625,
"epoch": 3.3636363636363638,
"grad_norm": 2.887309248591406,
"learning_rate": 4.452419193431379e-06,
"loss": 0.022,
"mean_token_accuracy": 0.9934895837213844,
"num_tokens": 245361556.0,
"step": 296
},
{
"entropy": 0.503753662109375,
"epoch": 3.375,
"grad_norm": 1.701023939664883,
"learning_rate": 4.4212655323860685e-06,
"loss": 0.0115,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 246179032.0,
"step": 297
},
{
"entropy": 0.4980621337890625,
"epoch": 3.3863636363636362,
"grad_norm": 1.5662274219193635,
"learning_rate": 4.39013462767054e-06,
"loss": 0.0035,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 247044591.0,
"step": 298
},
{
"entropy": 0.49640655517578125,
"epoch": 3.3977272727272725,
"grad_norm": 4.085649656756541,
"learning_rate": 4.359027703378357e-06,
"loss": 0.0117,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 247849642.0,
"step": 299
},
{
"entropy": 0.4882965087890625,
"epoch": 3.409090909090909,
"grad_norm": 2.0545604630857364,
"learning_rate": 4.327945982660146e-06,
"loss": 0.005,
"mean_token_accuracy": 1.0,
"num_tokens": 248687645.0,
"step": 300
},
{
"entropy": 0.49010467529296875,
"epoch": 3.4204545454545454,
"grad_norm": 4.205117061041687,
"learning_rate": 4.29689068767551e-06,
"loss": 0.0202,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 249523174.0,
"step": 301
},
{
"entropy": 0.48773193359375,
"epoch": 3.4318181818181817,
"grad_norm": 1.0834267264499315,
"learning_rate": 4.265863039544967e-06,
"loss": 0.0051,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 250381087.0,
"step": 302
},
{
"entropy": 0.4884796142578125,
"epoch": 3.4431818181818183,
"grad_norm": 3.6072628291290694,
"learning_rate": 4.234864258301943e-06,
"loss": 0.0073,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 251231901.0,
"step": 303
},
{
"entropy": 0.4911956787109375,
"epoch": 3.4545454545454546,
"grad_norm": 2.36608886636574,
"learning_rate": 4.203895562844789e-06,
"loss": 0.0044,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 252066427.0,
"step": 304
},
{
"entropy": 0.49538421630859375,
"epoch": 3.465909090909091,
"grad_norm": 2.887363512478274,
"learning_rate": 4.172958170888858e-06,
"loss": 0.0104,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 252898682.0,
"step": 305
},
{
"entropy": 0.47566986083984375,
"epoch": 3.4772727272727275,
"grad_norm": 2.4132663080029713,
"learning_rate": 4.142053298918622e-06,
"loss": 0.0112,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 253766783.0,
"step": 306
},
{
"entropy": 0.48429107666015625,
"epoch": 3.4886363636363638,
"grad_norm": 4.288908543594292,
"learning_rate": 4.111182162139844e-06,
"loss": 0.0124,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 254619003.0,
"step": 307
},
{
"entropy": 0.49538421630859375,
"epoch": 3.5,
"grad_norm": 7.886183702925642,
"learning_rate": 4.080345974431786e-06,
"loss": 0.0136,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 255449310.0,
"step": 308
},
{
"entropy": 0.49535369873046875,
"epoch": 3.5113636363636362,
"grad_norm": 1.364174024741626,
"learning_rate": 4.049545948299482e-06,
"loss": 0.0112,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 256269971.0,
"step": 309
},
{
"entropy": 0.48786163330078125,
"epoch": 3.5227272727272725,
"grad_norm": 2.0856928688130427,
"learning_rate": 4.018783294826071e-06,
"loss": 0.0092,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 257110990.0,
"step": 310
},
{
"entropy": 0.47814178466796875,
"epoch": 3.534090909090909,
"grad_norm": 1.4364086187206067,
"learning_rate": 3.988059223625155e-06,
"loss": 0.0057,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 257969152.0,
"step": 311
},
{
"entropy": 0.49022674560546875,
"epoch": 3.5454545454545454,
"grad_norm": 2.3462745888843757,
"learning_rate": 3.957374942793259e-06,
"loss": 0.0069,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 258787610.0,
"step": 312
},
{
"entropy": 0.5002593994140625,
"epoch": 3.5568181818181817,
"grad_norm": 1.076206746462636,
"learning_rate": 3.926731658862307e-06,
"loss": 0.0049,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 259591761.0,
"step": 313
},
{
"entropy": 0.493865966796875,
"epoch": 3.5681818181818183,
"grad_norm": 1.9316523298812094,
"learning_rate": 3.8961305767522015e-06,
"loss": 0.0106,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 260397894.0,
"step": 314
},
{
"entropy": 0.5008392333984375,
"epoch": 3.5795454545454546,
"grad_norm": 6.898639876304829,
"learning_rate": 3.865572899723423e-06,
"loss": 0.0285,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 261200683.0,
"step": 315
},
{
"entropy": 0.487060546875,
"epoch": 3.590909090909091,
"grad_norm": 2.0707747690495437,
"learning_rate": 3.8350598293297345e-06,
"loss": 0.0079,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 262040136.0,
"step": 316
},
{
"entropy": 0.490570068359375,
"epoch": 3.6022727272727275,
"grad_norm": 1.3196169683117467,
"learning_rate": 3.8045925653709238e-06,
"loss": 0.0117,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 262885485.0,
"step": 317
},
{
"entropy": 0.48920440673828125,
"epoch": 3.6136363636363638,
"grad_norm": 3.8532041553000016,
"learning_rate": 3.774172305845636e-06,
"loss": 0.027,
"mean_token_accuracy": 0.9921875004656613,
"num_tokens": 263732921.0,
"step": 318
},
{
"entropy": 0.5139923095703125,
"epoch": 3.625,
"grad_norm": 0.8491195918322743,
"learning_rate": 3.7438002469042567e-06,
"loss": 0.0035,
"mean_token_accuracy": 1.0,
"num_tokens": 264525028.0,
"step": 319
},
{
"entropy": 0.49022674560546875,
"epoch": 3.6363636363636362,
"grad_norm": 2.12743923318016,
"learning_rate": 3.7134775828018864e-06,
"loss": 0.0078,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 265358646.0,
"step": 320
},
{
"entropy": 0.47808837890625,
"epoch": 3.6477272727272725,
"grad_norm": 2.4356510539940985,
"learning_rate": 3.683205505851377e-06,
"loss": 0.0134,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 266215215.0,
"step": 321
},
{
"entropy": 0.47763824462890625,
"epoch": 3.659090909090909,
"grad_norm": 1.624290644861085,
"learning_rate": 3.652985206376455e-06,
"loss": 0.0085,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 267080581.0,
"step": 322
},
{
"entropy": 0.5059356689453125,
"epoch": 3.6704545454545454,
"grad_norm": 4.112024944011851,
"learning_rate": 3.622817872664905e-06,
"loss": 0.0107,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 267854230.0,
"step": 323
},
{
"entropy": 0.507568359375,
"epoch": 3.6818181818181817,
"grad_norm": 3.8307619158031723,
"learning_rate": 3.5927046909218634e-06,
"loss": 0.008,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 268647574.0,
"step": 324
},
{
"entropy": 0.49181365966796875,
"epoch": 3.6931818181818183,
"grad_norm": 1.635004844085684,
"learning_rate": 3.5626468452231534e-06,
"loss": 0.0082,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 269479857.0,
"step": 325
},
{
"entropy": 0.501556396484375,
"epoch": 3.7045454545454546,
"grad_norm": 2.6044052865970957,
"learning_rate": 3.532645517468748e-06,
"loss": 0.0089,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 270278460.0,
"step": 326
},
{
"entropy": 0.4834136962890625,
"epoch": 3.715909090909091,
"grad_norm": 2.713985450438721,
"learning_rate": 3.50270188733628e-06,
"loss": 0.0107,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 271125235.0,
"step": 327
},
{
"entropy": 0.4970855712890625,
"epoch": 3.7272727272727275,
"grad_norm": 2.55244138710167,
"learning_rate": 3.472817132234669e-06,
"loss": 0.0065,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 271947014.0,
"step": 328
},
{
"entropy": 0.4950103759765625,
"epoch": 3.7386363636363638,
"grad_norm": 1.5809093118699922,
"learning_rate": 3.442992427257812e-06,
"loss": 0.0041,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 272778931.0,
"step": 329
},
{
"entropy": 0.4849395751953125,
"epoch": 3.75,
"grad_norm": 3.7521409608312437,
"learning_rate": 3.4132289451383866e-06,
"loss": 0.0078,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 273609373.0,
"step": 330
},
{
"entropy": 0.48828125,
"epoch": 3.7613636363636362,
"grad_norm": 1.3638130294760002,
"learning_rate": 3.3835278562017405e-06,
"loss": 0.0045,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 274452240.0,
"step": 331
},
{
"entropy": 0.49332427978515625,
"epoch": 3.7727272727272725,
"grad_norm": 1.6122316611301655,
"learning_rate": 3.353890328319861e-06,
"loss": 0.0177,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 275279923.0,
"step": 332
},
{
"entropy": 0.48734283447265625,
"epoch": 3.784090909090909,
"grad_norm": 1.4018731489506076,
"learning_rate": 3.3243175268654656e-06,
"loss": 0.0117,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 276099706.0,
"step": 333
},
{
"entropy": 0.48419189453125,
"epoch": 3.7954545454545454,
"grad_norm": 1.3425953438370244,
"learning_rate": 3.29481061466617e-06,
"loss": 0.0107,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 276948323.0,
"step": 334
},
{
"entropy": 0.504119873046875,
"epoch": 3.8068181818181817,
"grad_norm": 1.973194371711739,
"learning_rate": 3.2653707519587756e-06,
"loss": 0.0198,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 277752317.0,
"step": 335
},
{
"entropy": 0.5062255859375,
"epoch": 3.8181818181818183,
"grad_norm": 3.5019643819627295,
"learning_rate": 3.235999096343633e-06,
"loss": 0.0112,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 278553380.0,
"step": 336
},
{
"entropy": 0.5022354125976562,
"epoch": 3.8295454545454546,
"grad_norm": 2.0390847173402866,
"learning_rate": 3.2066968027391377e-06,
"loss": 0.0129,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 279380180.0,
"step": 337
},
{
"entropy": 0.5066680908203125,
"epoch": 3.840909090909091,
"grad_norm": 1.000856198418521,
"learning_rate": 3.177465023336306e-06,
"loss": 0.0047,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 280190463.0,
"step": 338
},
{
"entropy": 0.4948577880859375,
"epoch": 3.8522727272727275,
"grad_norm": 0.4576350562763719,
"learning_rate": 3.1483049075534853e-06,
"loss": 0.0023,
"mean_token_accuracy": 1.0,
"num_tokens": 281038954.0,
"step": 339
},
{
"entropy": 0.4932098388671875,
"epoch": 3.8636363636363638,
"grad_norm": 0.43387840159329444,
"learning_rate": 3.119217601991139e-06,
"loss": 0.0021,
"mean_token_accuracy": 1.0,
"num_tokens": 281892856.0,
"step": 340
},
{
"entropy": 0.5100860595703125,
"epoch": 3.875,
"grad_norm": 1.5329998698285614,
"learning_rate": 3.090204250386779e-06,
"loss": 0.0056,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 282714631.0,
"step": 341
},
{
"entropy": 0.48583221435546875,
"epoch": 3.8863636363636362,
"grad_norm": 0.367306149196234,
"learning_rate": 3.0612659935699774e-06,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 283568255.0,
"step": 342
},
{
"entropy": 0.49864959716796875,
"epoch": 3.8977272727272725,
"grad_norm": 2.4133813718148036,
"learning_rate": 3.032403969417523e-06,
"loss": 0.0119,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 284396996.0,
"step": 343
},
{
"entropy": 0.4964447021484375,
"epoch": 3.909090909090909,
"grad_norm": 1.0290489541589731,
"learning_rate": 3.0036193128086667e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 285208621.0,
"step": 344
},
{
"entropy": 0.48796844482421875,
"epoch": 3.9204545454545454,
"grad_norm": 1.2919335824908056,
"learning_rate": 2.9749131555805035e-06,
"loss": 0.004,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 286068274.0,
"step": 345
},
{
"entropy": 0.5021209716796875,
"epoch": 3.9318181818181817,
"grad_norm": 2.7963876596419777,
"learning_rate": 2.946286626483463e-06,
"loss": 0.0094,
"mean_token_accuracy": 0.9947916669771075,
"num_tokens": 286884909.0,
"step": 346
},
{
"entropy": 0.48792266845703125,
"epoch": 3.9431818181818183,
"grad_norm": 1.1255514150822787,
"learning_rate": 2.9177408511369395e-06,
"loss": 0.0039,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 287730733.0,
"step": 347
},
{
"entropy": 0.49262237548828125,
"epoch": 3.9545454545454546,
"grad_norm": 1.0668263081925125,
"learning_rate": 2.889276951985005e-06,
"loss": 0.0124,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 288550016.0,
"step": 348
},
{
"entropy": 0.5013275146484375,
"epoch": 3.965909090909091,
"grad_norm": 0.5501779477247107,
"learning_rate": 2.8608960482523058e-06,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 289378888.0,
"step": 349
},
{
"entropy": 0.46614837646484375,
"epoch": 3.9772727272727275,
"grad_norm": 1.2675364954133146,
"learning_rate": 2.8325992559000315e-06,
"loss": 0.0029,
"mean_token_accuracy": 1.0,
"num_tokens": 290281736.0,
"step": 350
},
{
"entropy": 0.5061798095703125,
"epoch": 3.9886363636363638,
"grad_norm": 0.994131535402032,
"learning_rate": 2.8043876875820363e-06,
"loss": 0.0107,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 291064628.0,
"step": 351
},
{
"entropy": 0.49474334716796875,
"epoch": 4.0,
"grad_norm": 0.5646989718771591,
"learning_rate": 2.776262452601104e-06,
"loss": 0.0085,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 291890322.0,
"step": 352
},
{
"entropy": 0.50128173828125,
"epoch": 4.011363636363637,
"grad_norm": 0.2305855975577037,
"learning_rate": 2.748224656865304e-06,
"loss": 0.001,
"mean_token_accuracy": 1.0,
"num_tokens": 292701363.0,
"step": 353
},
{
"entropy": 0.49643707275390625,
"epoch": 4.0227272727272725,
"grad_norm": 1.0613672820064046,
"learning_rate": 2.7202754028445375e-06,
"loss": 0.0024,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 293532137.0,
"step": 354
},
{
"entropy": 0.48738861083984375,
"epoch": 4.034090909090909,
"grad_norm": 0.35673670151406833,
"learning_rate": 2.6924157895271563e-06,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 294375049.0,
"step": 355
},
{
"entropy": 0.5010528564453125,
"epoch": 4.045454545454546,
"grad_norm": 0.7012641916420297,
"learning_rate": 2.6646469123767694e-06,
"loss": 0.0019,
"mean_token_accuracy": 1.0,
"num_tokens": 295187394.0,
"step": 356
},
{
"entropy": 0.5065155029296875,
"epoch": 4.056818181818182,
"grad_norm": 1.5537278669101064,
"learning_rate": 2.636969863289164e-06,
"loss": 0.008,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 295972208.0,
"step": 357
},
{
"entropy": 0.47309112548828125,
"epoch": 4.068181818181818,
"grad_norm": 1.0646019233100874,
"learning_rate": 2.6093857305493666e-06,
"loss": 0.0022,
"mean_token_accuracy": 1.0,
"num_tokens": 296841049.0,
"step": 358
},
{
"entropy": 0.4745330810546875,
"epoch": 4.079545454545454,
"grad_norm": 1.4480106846744918,
"learning_rate": 2.581895598788857e-06,
"loss": 0.0119,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 297687674.0,
"step": 359
},
{
"entropy": 0.4698486328125,
"epoch": 4.090909090909091,
"grad_norm": 3.08458120657145,
"learning_rate": 2.5545005489429185e-06,
"loss": 0.0034,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 298539579.0,
"step": 360
},
{
"entropy": 0.4842376708984375,
"epoch": 4.1022727272727275,
"grad_norm": 2.598857092464848,
"learning_rate": 2.5272016582081236e-06,
"loss": 0.0064,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 299370858.0,
"step": 361
},
{
"entropy": 0.48838043212890625,
"epoch": 4.113636363636363,
"grad_norm": 2.7477071900327545,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0111,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 300193508.0,
"step": 362
},
{
"entropy": 0.479339599609375,
"epoch": 4.125,
"grad_norm": 1.1736957699343047,
"learning_rate": 2.472896643910802e-06,
"loss": 0.0044,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 301045850.0,
"step": 363
},
{
"entropy": 0.49602508544921875,
"epoch": 4.136363636363637,
"grad_norm": 0.37729375908384405,
"learning_rate": 2.445892655667462e-06,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 301841078.0,
"step": 364
},
{
"entropy": 0.48199462890625,
"epoch": 4.1477272727272725,
"grad_norm": 0.1355837306910303,
"learning_rate": 2.418989097089685e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 302689150.0,
"step": 365
},
{
"entropy": 0.51080322265625,
"epoch": 4.159090909090909,
"grad_norm": 1.1637477153658335,
"learning_rate": 2.392187026048198e-06,
"loss": 0.0064,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 303488060.0,
"step": 366
},
{
"entropy": 0.47991180419921875,
"epoch": 4.170454545454546,
"grad_norm": 0.38030490067585737,
"learning_rate": 2.365487496423152e-06,
"loss": 0.001,
"mean_token_accuracy": 1.0,
"num_tokens": 304341157.0,
"step": 367
},
{
"entropy": 0.47565460205078125,
"epoch": 4.181818181818182,
"grad_norm": 0.10997153487552541,
"learning_rate": 2.3388915580626807e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 305186459.0,
"step": 368
},
{
"entropy": 0.5027694702148438,
"epoch": 4.193181818181818,
"grad_norm": 0.14510941025692908,
"learning_rate": 2.31240025674162e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 305978157.0,
"step": 369
},
{
"entropy": 0.49146270751953125,
"epoch": 4.204545454545454,
"grad_norm": 1.5883776551936257,
"learning_rate": 2.2860146341203936e-06,
"loss": 0.0059,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 306769045.0,
"step": 370
},
{
"entropy": 0.49044036865234375,
"epoch": 4.215909090909091,
"grad_norm": 1.992656406279671,
"learning_rate": 2.2597357277040494e-06,
"loss": 0.015,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 307562505.0,
"step": 371
},
{
"entropy": 0.47119903564453125,
"epoch": 4.2272727272727275,
"grad_norm": 1.2199582939244709,
"learning_rate": 2.233564570801453e-06,
"loss": 0.0079,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 308441553.0,
"step": 372
},
{
"entropy": 0.4873199462890625,
"epoch": 4.238636363636363,
"grad_norm": 2.149306783936389,
"learning_rate": 2.207502192484685e-06,
"loss": 0.0101,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 309253697.0,
"step": 373
},
{
"entropy": 0.47591400146484375,
"epoch": 4.25,
"grad_norm": 0.35564477739269224,
"learning_rate": 2.1815496175485433e-06,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 310096440.0,
"step": 374
},
{
"entropy": 0.47376251220703125,
"epoch": 4.261363636363637,
"grad_norm": 1.5466708854824467,
"learning_rate": 2.1557078664702747e-06,
"loss": 0.0053,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 310946699.0,
"step": 375
},
{
"entropy": 0.47772979736328125,
"epoch": 4.2727272727272725,
"grad_norm": 3.903707776896074,
"learning_rate": 2.1299779553694323e-06,
"loss": 0.0099,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 311780327.0,
"step": 376
},
{
"entropy": 0.4879913330078125,
"epoch": 4.284090909090909,
"grad_norm": 1.28385949827407,
"learning_rate": 2.1043608959679302e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 312599293.0,
"step": 377
},
{
"entropy": 0.48241424560546875,
"epoch": 4.295454545454546,
"grad_norm": 1.3157851336620305,
"learning_rate": 2.0788576955502547e-06,
"loss": 0.0032,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 313433046.0,
"step": 378
},
{
"entropy": 0.497406005859375,
"epoch": 4.306818181818182,
"grad_norm": 0.4986614424581066,
"learning_rate": 2.053469356923865e-06,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 314231321.0,
"step": 379
},
{
"entropy": 0.47122955322265625,
"epoch": 4.318181818181818,
"grad_norm": 0.9605458456996543,
"learning_rate": 2.028196878379749e-06,
"loss": 0.0014,
"mean_token_accuracy": 1.0,
"num_tokens": 315090907.0,
"step": 380
},
{
"entropy": 0.46869659423828125,
"epoch": 4.329545454545454,
"grad_norm": 0.07665020057891755,
"learning_rate": 2.0030412536531896e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 315934937.0,
"step": 381
},
{
"entropy": 0.4843597412109375,
"epoch": 4.340909090909091,
"grad_norm": 2.4860913585370934,
"learning_rate": 1.9780034718846653e-06,
"loss": 0.0088,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 316744484.0,
"step": 382
},
{
"entropy": 0.4806060791015625,
"epoch": 4.3522727272727275,
"grad_norm": 1.610995719001932,
"learning_rate": 1.9530845175809838e-06,
"loss": 0.0032,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 317569584.0,
"step": 383
},
{
"entropy": 0.48772430419921875,
"epoch": 4.363636363636363,
"grad_norm": 0.3630643065135508,
"learning_rate": 1.9282853705765435e-06,
"loss": 0.0011,
"mean_token_accuracy": 1.0,
"num_tokens": 318380963.0,
"step": 384
},
{
"entropy": 0.4920654296875,
"epoch": 4.375,
"grad_norm": 0.11318106327604302,
"learning_rate": 1.9036070059948253e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 319177808.0,
"step": 385
},
{
"entropy": 0.48041534423828125,
"epoch": 4.386363636363637,
"grad_norm": 0.12828443108853602,
"learning_rate": 1.8790503942100413e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 319991752.0,
"step": 386
},
{
"entropy": 0.48044586181640625,
"epoch": 4.3977272727272725,
"grad_norm": 0.12009248173561284,
"learning_rate": 1.8546165008089806e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 320823781.0,
"step": 387
},
{
"entropy": 0.46346282958984375,
"epoch": 4.409090909090909,
"grad_norm": 1.4373424468930374,
"learning_rate": 1.8303062865530407e-06,
"loss": 0.0031,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 321672380.0,
"step": 388
},
{
"entropy": 0.48361968994140625,
"epoch": 4.420454545454546,
"grad_norm": 0.45259690051953183,
"learning_rate": 1.8061207073404507e-06,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 322475535.0,
"step": 389
},
{
"entropy": 0.4696197509765625,
"epoch": 4.431818181818182,
"grad_norm": 0.4233541567189679,
"learning_rate": 1.7820607141686846e-06,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 323285742.0,
"step": 390
},
{
"entropy": 0.4728851318359375,
"epoch": 4.443181818181818,
"grad_norm": 0.1816069998282873,
"learning_rate": 1.7581272530970666e-06,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 324121368.0,
"step": 391
},
{
"entropy": 0.47039794921875,
"epoch": 4.454545454545454,
"grad_norm": 0.3483848284787408,
"learning_rate": 1.734321265209572e-06,
"loss": 0.001,
"mean_token_accuracy": 1.0,
"num_tokens": 324973947.0,
"step": 392
},
{
"entropy": 0.4647674560546875,
"epoch": 4.465909090909091,
"grad_norm": 0.1144960907716229,
"learning_rate": 1.7106436865778182e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 325804782.0,
"step": 393
},
{
"entropy": 0.48302459716796875,
"epoch": 4.4772727272727275,
"grad_norm": 0.1139971153359024,
"learning_rate": 1.6870954482242707e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 326603097.0,
"step": 394
},
{
"entropy": 0.4611968994140625,
"epoch": 4.488636363636363,
"grad_norm": 1.2769097861232268,
"learning_rate": 1.663677476085616e-06,
"loss": 0.0022,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 327455117.0,
"step": 395
},
{
"entropy": 0.46791839599609375,
"epoch": 4.5,
"grad_norm": 1.6141901294301404,
"learning_rate": 1.6403906909763688e-06,
"loss": 0.0062,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 328271844.0,
"step": 396
},
{
"entropy": 0.46714019775390625,
"epoch": 4.511363636363637,
"grad_norm": 3.407028832408574,
"learning_rate": 1.6172360085526567e-06,
"loss": 0.0144,
"mean_token_accuracy": 0.9960937502328306,
"num_tokens": 329105170.0,
"step": 397
},
{
"entropy": 0.4729766845703125,
"epoch": 4.5227272727272725,
"grad_norm": 1.1263314457968352,
"learning_rate": 1.5942143392762178e-06,
"loss": 0.0047,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 329922811.0,
"step": 398
},
{
"entropy": 0.4759521484375,
"epoch": 4.534090909090909,
"grad_norm": 2.280759600794648,
"learning_rate": 1.5713265883786e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 330722929.0,
"step": 399
},
{
"entropy": 0.4548187255859375,
"epoch": 4.545454545454545,
"grad_norm": 0.3194534227852655,
"learning_rate": 1.54857365582557e-06,
"loss": 0.0009,
"mean_token_accuracy": 1.0,
"num_tokens": 331559428.0,
"step": 400
},
{
"entropy": 0.4586944580078125,
"epoch": 4.556818181818182,
"grad_norm": 0.4687987791903233,
"learning_rate": 1.5259564362817147e-06,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 332413515.0,
"step": 401
},
{
"entropy": 0.454254150390625,
"epoch": 4.568181818181818,
"grad_norm": 0.415545362925253,
"learning_rate": 1.5034758190752836e-06,
"loss": 0.001,
"mean_token_accuracy": 1.0,
"num_tokens": 333265076.0,
"step": 402
},
{
"entropy": 0.46869659423828125,
"epoch": 4.579545454545455,
"grad_norm": 0.7752038225629332,
"learning_rate": 1.4811326881631937e-06,
"loss": 0.0011,
"mean_token_accuracy": 1.0,
"num_tokens": 334094385.0,
"step": 403
},
{
"entropy": 0.4632415771484375,
"epoch": 4.590909090909091,
"grad_norm": 0.21104740454493934,
"learning_rate": 1.4589279220962922e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 334947835.0,
"step": 404
},
{
"entropy": 0.4703216552734375,
"epoch": 4.6022727272727275,
"grad_norm": 0.42765647679577773,
"learning_rate": 1.4368623939848003e-06,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 335753110.0,
"step": 405
},
{
"entropy": 0.45865631103515625,
"epoch": 4.613636363636363,
"grad_norm": 0.05702480437380243,
"learning_rate": 1.4149369714639856e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 336600931.0,
"step": 406
},
{
"entropy": 0.4477996826171875,
"epoch": 4.625,
"grad_norm": 0.05129594018064339,
"learning_rate": 1.3931525166600447e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 337462613.0,
"step": 407
},
{
"entropy": 0.46739959716796875,
"epoch": 4.636363636363637,
"grad_norm": 0.05649629590246661,
"learning_rate": 1.371509886156206e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 338298847.0,
"step": 408
},
{
"entropy": 0.46347808837890625,
"epoch": 4.6477272727272725,
"grad_norm": 0.05392471241424441,
"learning_rate": 1.3500099309590397e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 339113689.0,
"step": 409
},
{
"entropy": 0.46538543701171875,
"epoch": 4.659090909090909,
"grad_norm": 0.04261517453780075,
"learning_rate": 1.3286534964650121e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 339940988.0,
"step": 410
},
{
"entropy": 0.475616455078125,
"epoch": 4.670454545454545,
"grad_norm": 0.041270752321869116,
"learning_rate": 1.3074414224272287e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 340752578.0,
"step": 411
},
{
"entropy": 0.47516632080078125,
"epoch": 4.681818181818182,
"grad_norm": 0.3298133997421892,
"learning_rate": 1.2863745429224145e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 341554983.0,
"step": 412
},
{
"entropy": 0.4729461669921875,
"epoch": 4.693181818181818,
"grad_norm": 3.708268542945646,
"learning_rate": 1.2654536863181328e-06,
"loss": 0.0095,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 342358183.0,
"step": 413
},
{
"entropy": 0.4546356201171875,
"epoch": 4.704545454545455,
"grad_norm": 1.8590922166659447,
"learning_rate": 1.2446796752401912e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 343205856.0,
"step": 414
},
{
"entropy": 0.45923614501953125,
"epoch": 4.715909090909091,
"grad_norm": 1.2867278575579393,
"learning_rate": 1.22405332654032e-06,
"loss": 0.0122,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 344033478.0,
"step": 415
},
{
"entropy": 0.4714202880859375,
"epoch": 4.7272727272727275,
"grad_norm": 0.04144212296397527,
"learning_rate": 1.2035754512640263e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 344827009.0,
"step": 416
},
{
"entropy": 0.4552764892578125,
"epoch": 4.738636363636363,
"grad_norm": 0.04260184992648092,
"learning_rate": 1.1832468546187248e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 345670205.0,
"step": 417
},
{
"entropy": 0.46997833251953125,
"epoch": 4.75,
"grad_norm": 0.8777265760304463,
"learning_rate": 1.1630683359420653e-06,
"loss": 0.0012,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 346471682.0,
"step": 418
},
{
"entropy": 0.45848846435546875,
"epoch": 4.761363636363637,
"grad_norm": 2.1795793503430803,
"learning_rate": 1.1430406886705053e-06,
"loss": 0.0019,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 347308458.0,
"step": 419
},
{
"entropy": 0.455657958984375,
"epoch": 4.7727272727272725,
"grad_norm": 0.11627180910044319,
"learning_rate": 1.1231647003081092e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 348141445.0,
"step": 420
},
{
"entropy": 0.45229339599609375,
"epoch": 4.784090909090909,
"grad_norm": 0.29998009764725636,
"learning_rate": 1.103441152395588e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 349005773.0,
"step": 421
},
{
"entropy": 0.46309661865234375,
"epoch": 4.795454545454545,
"grad_norm": 0.2991841726066073,
"learning_rate": 1.0838708204795584e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 349829599.0,
"step": 422
},
{
"entropy": 0.4610137939453125,
"epoch": 4.806818181818182,
"grad_norm": 1.0609883118459258,
"learning_rate": 1.064454474082064e-06,
"loss": 0.0126,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 350647234.0,
"step": 423
},
{
"entropy": 0.43990325927734375,
"epoch": 4.818181818181818,
"grad_norm": 0.16789793270696388,
"learning_rate": 1.045192876670298e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 351529720.0,
"step": 424
},
{
"entropy": 0.455780029296875,
"epoch": 4.829545454545455,
"grad_norm": 0.21827131764117488,
"learning_rate": 1.0260867856265967e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 352366018.0,
"step": 425
},
{
"entropy": 0.46533966064453125,
"epoch": 4.840909090909091,
"grad_norm": 1.2044394471044855,
"learning_rate": 1.0071369522186546e-06,
"loss": 0.0018,
"mean_token_accuracy": 1.0,
"num_tokens": 353187956.0,
"step": 426
},
{
"entropy": 0.4680633544921875,
"epoch": 4.8522727272727275,
"grad_norm": 2.817809119636871,
"learning_rate": 9.883441215699824e-07,
"loss": 0.0021,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 354005342.0,
"step": 427
},
{
"entropy": 0.45850372314453125,
"epoch": 4.863636363636363,
"grad_norm": 1.5308707690431358,
"learning_rate": 9.697090326306096e-07,
"loss": 0.0018,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 354847121.0,
"step": 428
},
{
"entropy": 0.45076751708984375,
"epoch": 4.875,
"grad_norm": 0.6612914507279286,
"learning_rate": 9.51232418148027e-07,
"loss": 0.0013,
"mean_token_accuracy": 1.0,
"num_tokens": 355685163.0,
"step": 429
},
{
"entropy": 0.464630126953125,
"epoch": 4.886363636363637,
"grad_norm": 0.564486291517846,
"learning_rate": 9.329150046383773e-07,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 356523319.0,
"step": 430
},
{
"entropy": 0.4573822021484375,
"epoch": 4.8977272727272725,
"grad_norm": 0.9866803918640974,
"learning_rate": 9.147575123578845e-07,
"loss": 0.0036,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 357363691.0,
"step": 431
},
{
"entropy": 0.46978759765625,
"epoch": 4.909090909090909,
"grad_norm": 0.17811715621871,
"learning_rate": 8.967606552745361e-07,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 358180072.0,
"step": 432
},
{
"entropy": 0.45139312744140625,
"epoch": 4.920454545454545,
"grad_norm": 0.12562428430327954,
"learning_rate": 8.789251410400024e-07,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 359024240.0,
"step": 433
},
{
"entropy": 0.46982574462890625,
"epoch": 4.931818181818182,
"grad_norm": 0.6932111763771747,
"learning_rate": 8.612516709618251e-07,
"loss": 0.0009,
"mean_token_accuracy": 1.0,
"num_tokens": 359847230.0,
"step": 434
},
{
"entropy": 0.46353912353515625,
"epoch": 4.943181818181818,
"grad_norm": 0.23164408202084988,
"learning_rate": 8.437409399758234e-07,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 360685374.0,
"step": 435
},
{
"entropy": 0.451324462890625,
"epoch": 4.954545454545455,
"grad_norm": 0.19196243101290555,
"learning_rate": 8.263936366187825e-07,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 361526820.0,
"step": 436
},
{
"entropy": 0.4579963684082031,
"epoch": 4.965909090909091,
"grad_norm": 0.0687299728812095,
"learning_rate": 8.092104430013737e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 362340123.0,
"step": 437
},
{
"entropy": 0.45263671875,
"epoch": 4.9772727272727275,
"grad_norm": 0.07585912602037853,
"learning_rate": 7.921920347813333e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 363177369.0,
"step": 438
},
{
"entropy": 0.46087646484375,
"epoch": 4.988636363636363,
"grad_norm": 0.10025359489418027,
"learning_rate": 7.753390811368972e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 364019483.0,
"step": 439
},
{
"entropy": 0.47498321533203125,
"epoch": 5.0,
"grad_norm": 0.08401001803633867,
"learning_rate": 7.586522447404882e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 364810814.0,
"step": 440
},
{
"entropy": 0.4663848876953125,
"epoch": 5.011363636363637,
"grad_norm": 0.06753832442651661,
"learning_rate": 7.421321817326527e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 365630038.0,
"step": 441
},
{
"entropy": 0.4595947265625,
"epoch": 5.0227272727272725,
"grad_norm": 0.052693763957122906,
"learning_rate": 7.257795416962754e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 366463667.0,
"step": 442
},
{
"entropy": 0.45317840576171875,
"epoch": 5.034090909090909,
"grad_norm": 1.3697325708926469,
"learning_rate": 7.095949676310171e-07,
"loss": 0.0017,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 367311117.0,
"step": 443
},
{
"entropy": 0.4619865417480469,
"epoch": 5.045454545454546,
"grad_norm": 0.15729396392326342,
"learning_rate": 6.935790959280525e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 368129489.0,
"step": 444
},
{
"entropy": 0.46871185302734375,
"epoch": 5.056818181818182,
"grad_norm": 0.06924769182275127,
"learning_rate": 6.777325563450282e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 368926469.0,
"step": 445
},
{
"entropy": 0.47141265869140625,
"epoch": 5.068181818181818,
"grad_norm": 0.6783294778672456,
"learning_rate": 6.62055971981313e-07,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 369725096.0,
"step": 446
},
{
"entropy": 0.451629638671875,
"epoch": 5.079545454545454,
"grad_norm": 0.1031929872686621,
"learning_rate": 6.465499592534902e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 370550137.0,
"step": 447
},
{
"entropy": 0.455291748046875,
"epoch": 5.090909090909091,
"grad_norm": 0.0772291963261543,
"learning_rate": 6.312151278711237e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 371400483.0,
"step": 448
},
{
"entropy": 0.46193695068359375,
"epoch": 5.1022727272727275,
"grad_norm": 1.326339689563553,
"learning_rate": 6.160520808127807e-07,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 372240572.0,
"step": 449
},
{
"entropy": 0.44817352294921875,
"epoch": 5.113636363636363,
"grad_norm": 0.09897383514384525,
"learning_rate": 6.010614143023231e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 373094230.0,
"step": 450
},
{
"entropy": 0.46028900146484375,
"epoch": 5.125,
"grad_norm": 0.09472823323409073,
"learning_rate": 5.862437177854629e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 373920609.0,
"step": 451
},
{
"entropy": 0.46224212646484375,
"epoch": 5.136363636363637,
"grad_norm": 0.044213112080802454,
"learning_rate": 5.715995739065877e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 374735909.0,
"step": 452
},
{
"entropy": 0.45038604736328125,
"epoch": 5.1477272727272725,
"grad_norm": 1.2764131215349144,
"learning_rate": 5.571295584858466e-07,
"loss": 0.0018,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 375593425.0,
"step": 453
},
{
"entropy": 0.4536018371582031,
"epoch": 5.159090909090909,
"grad_norm": 0.041178158611641445,
"learning_rate": 5.428342404965076e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 376407379.0,
"step": 454
},
{
"entropy": 0.459197998046875,
"epoch": 5.170454545454546,
"grad_norm": 0.043318758692832936,
"learning_rate": 5.287141820425945e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 377243881.0,
"step": 455
},
{
"entropy": 0.4656829833984375,
"epoch": 5.181818181818182,
"grad_norm": 0.046883302108158394,
"learning_rate": 5.147699383367705e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 378061945.0,
"step": 456
},
{
"entropy": 0.4624786376953125,
"epoch": 5.193181818181818,
"grad_norm": 0.043128817920418165,
"learning_rate": 5.010020576785174e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 378873096.0,
"step": 457
},
{
"entropy": 0.4590301513671875,
"epoch": 5.204545454545454,
"grad_norm": 0.7610317787113492,
"learning_rate": 4.874110814325723e-07,
"loss": 0.0144,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 379691075.0,
"step": 458
},
{
"entropy": 0.4679222106933594,
"epoch": 5.215909090909091,
"grad_norm": 0.043349450851136395,
"learning_rate": 4.739975440076405e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 380488242.0,
"step": 459
},
{
"entropy": 0.44734954833984375,
"epoch": 5.2272727272727275,
"grad_norm": 0.04658326574454793,
"learning_rate": 4.607619728353818e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 381344689.0,
"step": 460
},
{
"entropy": 0.44751739501953125,
"epoch": 5.238636363636363,
"grad_norm": 0.4068762779710976,
"learning_rate": 4.4770488834967486e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 382191810.0,
"step": 461
},
{
"entropy": 0.46425628662109375,
"epoch": 5.25,
"grad_norm": 2.1205539558566877,
"learning_rate": 4.348268039661452e-07,
"loss": 0.0092,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 383013514.0,
"step": 462
},
{
"entropy": 0.4523468017578125,
"epoch": 5.261363636363637,
"grad_norm": 2.899603346657294,
"learning_rate": 4.221282260619891e-07,
"loss": 0.0066,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 383851533.0,
"step": 463
},
{
"entropy": 0.4504852294921875,
"epoch": 5.2727272727272725,
"grad_norm": 0.06079617521527757,
"learning_rate": 4.0960965395605015e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 384690195.0,
"step": 464
},
{
"entropy": 0.4595794677734375,
"epoch": 5.284090909090909,
"grad_norm": 0.0850218217128288,
"learning_rate": 3.972715798891952e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 385510439.0,
"step": 465
},
{
"entropy": 0.4385986328125,
"epoch": 5.295454545454546,
"grad_norm": 0.062420950188709565,
"learning_rate": 3.851144890049535e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 386394511.0,
"step": 466
},
{
"entropy": 0.457122802734375,
"epoch": 5.306818181818182,
"grad_norm": 1.135348931750305,
"learning_rate": 3.731388593304425e-07,
"loss": 0.009,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 387221719.0,
"step": 467
},
{
"entropy": 0.46808624267578125,
"epoch": 5.318181818181818,
"grad_norm": 0.10620184695975321,
"learning_rate": 3.6134516175757193e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 388021747.0,
"step": 468
},
{
"entropy": 0.4699668884277344,
"epoch": 5.329545454545454,
"grad_norm": 1.0873134956301034,
"learning_rate": 3.497338600245254e-07,
"loss": 0.0022,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 388858436.0,
"step": 469
},
{
"entropy": 0.4461669921875,
"epoch": 5.340909090909091,
"grad_norm": 0.08362416650926273,
"learning_rate": 3.383054106975292e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 389713106.0,
"step": 470
},
{
"entropy": 0.45136260986328125,
"epoch": 5.3522727272727275,
"grad_norm": 0.11019270156426565,
"learning_rate": 3.270602631528968e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 390560663.0,
"step": 471
},
{
"entropy": 0.45748138427734375,
"epoch": 5.363636363636363,
"grad_norm": 0.12689398877694197,
"learning_rate": 3.159988595593616e-07,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 391378014.0,
"step": 472
},
{
"entropy": 0.4533958435058594,
"epoch": 5.375,
"grad_norm": 0.08597158452505722,
"learning_rate": 3.051216348606867e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 392217186.0,
"step": 473
},
{
"entropy": 0.46509552001953125,
"epoch": 5.386363636363637,
"grad_norm": 0.4545345190127941,
"learning_rate": 2.944290167585684e-07,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 393043338.0,
"step": 474
},
{
"entropy": 0.4675140380859375,
"epoch": 5.3977272727272725,
"grad_norm": 0.09329400176802885,
"learning_rate": 2.839214256958106e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 393861984.0,
"step": 475
},
{
"entropy": 0.4590911865234375,
"epoch": 5.409090909090909,
"grad_norm": 0.12809045767232038,
"learning_rate": 2.7359927483980254e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 394694902.0,
"step": 476
},
{
"entropy": 0.45597076416015625,
"epoch": 5.420454545454546,
"grad_norm": 0.0741422272419836,
"learning_rate": 2.634629700662628e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 395512370.0,
"step": 477
},
{
"entropy": 0.455474853515625,
"epoch": 5.431818181818182,
"grad_norm": 0.0948517795191296,
"learning_rate": 2.5351290994328703e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 396351709.0,
"step": 478
},
{
"entropy": 0.443359375,
"epoch": 5.443181818181818,
"grad_norm": 0.09645060067855557,
"learning_rate": 2.4374948571567246e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 397205354.0,
"step": 479
},
{
"entropy": 0.44438934326171875,
"epoch": 5.454545454545454,
"grad_norm": 0.08367706408648884,
"learning_rate": 2.3417308128953486e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 398068964.0,
"step": 480
},
{
"entropy": 0.44445037841796875,
"epoch": 5.465909090909091,
"grad_norm": 0.08096091553646355,
"learning_rate": 2.2478407321721295e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 398931333.0,
"step": 481
},
{
"entropy": 0.46445465087890625,
"epoch": 5.4772727272727275,
"grad_norm": 0.07120740238741786,
"learning_rate": 2.1558283068246254e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 399743561.0,
"step": 482
},
{
"entropy": 0.44690704345703125,
"epoch": 5.488636363636363,
"grad_norm": 1.3626841259762659,
"learning_rate": 2.065697154859375e-07,
"loss": 0.0015,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 400597274.0,
"step": 483
},
{
"entropy": 0.44899749755859375,
"epoch": 5.5,
"grad_norm": 0.0643260899222743,
"learning_rate": 1.9774508203096843e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 401443536.0,
"step": 484
},
{
"entropy": 0.4696502685546875,
"epoch": 5.511363636363637,
"grad_norm": 1.74414056253339,
"learning_rate": 1.8910927730962038e-07,
"loss": 0.0059,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 402253531.0,
"step": 485
},
{
"entropy": 0.45011138916015625,
"epoch": 5.5227272727272725,
"grad_norm": 3.4760475953885583,
"learning_rate": 1.806626408890555e-07,
"loss": 0.0039,
"mean_token_accuracy": 0.9973958334885538,
"num_tokens": 403096077.0,
"step": 486
},
{
"entropy": 0.45471954345703125,
"epoch": 5.534090909090909,
"grad_norm": 0.06840228217105021,
"learning_rate": 1.7240550489817652e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 403931464.0,
"step": 487
},
{
"entropy": 0.4575042724609375,
"epoch": 5.545454545454545,
"grad_norm": 1.7006292138027919,
"learning_rate": 1.6433819401456996e-07,
"loss": 0.0058,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 404753168.0,
"step": 488
},
{
"entropy": 0.4656829833984375,
"epoch": 5.556818181818182,
"grad_norm": 0.07530184857523241,
"learning_rate": 1.5646102545173625e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 405574379.0,
"step": 489
},
{
"entropy": 0.4536285400390625,
"epoch": 5.568181818181818,
"grad_norm": 0.06879577484091869,
"learning_rate": 1.4877430894662037e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 406389523.0,
"step": 490
},
{
"entropy": 0.46601104736328125,
"epoch": 5.579545454545455,
"grad_norm": 0.07631570022562256,
"learning_rate": 1.412783467474299e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 407214371.0,
"step": 491
},
{
"entropy": 0.45583343505859375,
"epoch": 5.590909090909091,
"grad_norm": 0.07293315492618727,
"learning_rate": 1.3397343360175287e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 408046275.0,
"step": 492
},
{
"entropy": 0.44484710693359375,
"epoch": 5.6022727272727275,
"grad_norm": 0.06505986294350438,
"learning_rate": 1.268598567449647e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 408911193.0,
"step": 493
},
{
"entropy": 0.4643096923828125,
"epoch": 5.613636363636363,
"grad_norm": 0.12132793601368723,
"learning_rate": 1.1993789588893634e-07,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 409719072.0,
"step": 494
},
{
"entropy": 0.4685325622558594,
"epoch": 5.625,
"grad_norm": 0.07053942071636507,
"learning_rate": 1.1320782321103673e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 410525184.0,
"step": 495
},
{
"entropy": 0.45401763916015625,
"epoch": 5.636363636363637,
"grad_norm": 0.08594584151577633,
"learning_rate": 1.0666990334342708e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 411347407.0,
"step": 496
},
{
"entropy": 0.458099365234375,
"epoch": 5.6477272727272725,
"grad_norm": 0.08486605252448229,
"learning_rate": 1.0032439336265742e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 412178040.0,
"step": 497
},
{
"entropy": 0.4579010009765625,
"epoch": 5.659090909090909,
"grad_norm": 0.06659241573883855,
"learning_rate": 9.417154277955864e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 413017601.0,
"step": 498
},
{
"entropy": 0.454986572265625,
"epoch": 5.670454545454545,
"grad_norm": 0.07546447247108004,
"learning_rate": 8.821159352943142e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 413853566.0,
"step": 499
},
{
"entropy": 0.4441070556640625,
"epoch": 5.681818181818182,
"grad_norm": 0.05281085008423308,
"learning_rate": 8.244477996253109e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 414704010.0,
"step": 500
},
{
"entropy": 0.4505615234375,
"epoch": 5.693181818181818,
"grad_norm": 0.06989309481120927,
"learning_rate": 7.687132883485548e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 415551512.0,
"step": 501
},
{
"entropy": 0.45363616943359375,
"epoch": 5.704545454545455,
"grad_norm": 0.3490127804874226,
"learning_rate": 7.149145929922607e-08,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 416393119.0,
"step": 502
},
{
"entropy": 0.451324462890625,
"epoch": 5.715909090909091,
"grad_norm": 0.0683123728452308,
"learning_rate": 6.630538289667365e-08,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 417211364.0,
"step": 503
},
{
"entropy": 0.44922637939453125,
"epoch": 5.7272727272727275,
"grad_norm": 0.06803576881976604,
"learning_rate": 6.131330354811616e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 418047976.0,
"step": 504
},
{
"entropy": 0.46166229248046875,
"epoch": 5.738636363636363,
"grad_norm": 0.0677087702491836,
"learning_rate": 5.651541754634726e-08,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 418866422.0,
"step": 505
},
{
"entropy": 0.4704399108886719,
"epoch": 5.75,
"grad_norm": 0.060578646422546296,
"learning_rate": 5.1911913548309266e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 419667732.0,
"step": 506
},
{
"entropy": 0.45671844482421875,
"epoch": 5.761363636363637,
"grad_norm": 0.6548315170801939,
"learning_rate": 4.750297256768177e-08,
"loss": 0.0011,
"mean_token_accuracy": 1.0,
"num_tokens": 420500583.0,
"step": 507
},
{
"entropy": 0.46508026123046875,
"epoch": 5.7727272727272725,
"grad_norm": 0.0817666940431779,
"learning_rate": 4.328876796776071e-08,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 421327571.0,
"step": 508
},
{
"entropy": 0.447998046875,
"epoch": 5.784090909090909,
"grad_norm": 0.051109468067632835,
"learning_rate": 3.926946545464327e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 422199122.0,
"step": 509
},
{
"entropy": 0.456024169921875,
"epoch": 5.795454545454545,
"grad_norm": 0.07490586230437635,
"learning_rate": 3.544522307071085e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 423032124.0,
"step": 510
},
{
"entropy": 0.4470672607421875,
"epoch": 5.806818181818182,
"grad_norm": 0.057700792289358926,
"learning_rate": 3.181619118841517e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 423876920.0,
"step": 511
},
{
"entropy": 0.454193115234375,
"epoch": 5.818181818181818,
"grad_norm": 0.07045615281014198,
"learning_rate": 2.838251250436519e-08,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 424725606.0,
"step": 512
},
{
"entropy": 0.45302581787109375,
"epoch": 5.829545454545455,
"grad_norm": 0.05710442232271105,
"learning_rate": 2.5144322033717748e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 425566413.0,
"step": 513
},
{
"entropy": 0.46292877197265625,
"epoch": 5.840909090909091,
"grad_norm": 0.052942049545793506,
"learning_rate": 2.210174710486679e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 426383910.0,
"step": 514
},
{
"entropy": 0.44835662841796875,
"epoch": 5.8522727272727275,
"grad_norm": 0.716754002813711,
"learning_rate": 1.9254907354436804e-08,
"loss": 0.0016,
"mean_token_accuracy": 0.9986979167442769,
"num_tokens": 427233159.0,
"step": 515
},
{
"entropy": 0.4639892578125,
"epoch": 5.863636363636363,
"grad_norm": 0.06450919221085051,
"learning_rate": 1.6603914722579938e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 428045659.0,
"step": 516
},
{
"entropy": 0.4636421203613281,
"epoch": 5.875,
"grad_norm": 1.5156528649134724,
"learning_rate": 1.4148873448573408e-08,
"loss": 0.0016,
"mean_token_accuracy": 1.0,
"num_tokens": 428849891.0,
"step": 517
},
{
"entropy": 0.4610748291015625,
"epoch": 5.886363636363637,
"grad_norm": 0.054638375766997926,
"learning_rate": 1.1889880066720538e-08,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 429665132.0,
"step": 518
},
{
"entropy": 0.45429229736328125,
"epoch": 5.8977272727272725,
"grad_norm": 0.053427197947298284,
"learning_rate": 9.827023402556035e-09,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 430492556.0,
"step": 519
},
{
"entropy": 0.44527435302734375,
"epoch": 5.909090909090909,
"grad_norm": 0.06448753790680099,
"learning_rate": 7.96038456935322e-09,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 431344266.0,
"step": 520
},
{
"entropy": 0.45428466796875,
"epoch": 5.920454545454545,
"grad_norm": 0.05394024331607267,
"learning_rate": 6.2900369649315785e-09,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 432174654.0,
"step": 521
},
{
"entropy": 0.45616912841796875,
"epoch": 5.931818181818182,
"grad_norm": 0.07017412516356693,
"learning_rate": 4.816046268775742e-09,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 433016456.0,
"step": 522
},
{
"entropy": 0.4521484375,
"epoch": 5.943181818181818,
"grad_norm": 0.1719029795976701,
"learning_rate": 3.538470439448105e-09,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 433844975.0,
"step": 523
},
{
"entropy": 0.44696807861328125,
"epoch": 5.954545454545455,
"grad_norm": 0.05060097656295442,
"learning_rate": 2.4573597123145333e-09,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 434715113.0,
"step": 524
},
{
"entropy": 0.46170806884765625,
"epoch": 5.965909090909091,
"grad_norm": 0.06786098871780048,
"learning_rate": 1.5727565975642844e-09,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 435516656.0,
"step": 525
},
{
"entropy": 0.47309112548828125,
"epoch": 5.9772727272727275,
"grad_norm": 0.08698342318910769,
"learning_rate": 8.846958785418969e-10,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 436285686.0,
"step": 526
},
{
"entropy": 0.4729576110839844,
"epoch": 5.988636363636363,
"grad_norm": 0.06953938165656462,
"learning_rate": 3.9320461037772873e-10,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 437058962.0,
"step": 527
},
{
"entropy": 0.458892822265625,
"epoch": 6.0,
"grad_norm": 0.0538137181935737,
"learning_rate": 9.830211892492004e-11,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 437879517.0,
"step": 528
},
{
"epoch": 6.0,
"step": 528,
"total_flos": 515196244262912.0,
"train_loss": 0.37932722877818986,
"train_runtime": 69903.6938,
"train_samples_per_second": 3.522,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1,
"max_steps": 528,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 44,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 515196244262912.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}