Files
P2-split1_prob_Qwen3-8B-Bas…/trainer_state.json
ModelHub XC 0045ef5db6 初始化项目,由ModelHub XC社区提供模型
Model: Hyeongwon/P2-split1_prob_Qwen3-8B-Base_0325-01
Source: Original Platform
2026-06-06 08:52:42 +08:00

7784 lines
220 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 774,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.553863525390625,
"epoch": 0.003875968992248062,
"grad_norm": 5.757997452916937,
"learning_rate": 0.0,
"loss": 1.3887,
"mean_token_accuracy": 0.6599368844181299,
"num_tokens": 438497.0,
"step": 1
},
{
"entropy": 0.55859375,
"epoch": 0.007751937984496124,
"grad_norm": 5.784239047600006,
"learning_rate": 5.128205128205128e-07,
"loss": 1.3966,
"mean_token_accuracy": 0.6559260198846459,
"num_tokens": 881766.0,
"step": 2
},
{
"entropy": 0.5675048828125,
"epoch": 0.011627906976744186,
"grad_norm": 5.831349144495169,
"learning_rate": 1.0256410256410257e-06,
"loss": 1.4124,
"mean_token_accuracy": 0.6495780032128096,
"num_tokens": 1313728.0,
"step": 3
},
{
"entropy": 0.565948486328125,
"epoch": 0.015503875968992248,
"grad_norm": 5.999742316986466,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.4314,
"mean_token_accuracy": 0.6460442841053009,
"num_tokens": 1736438.0,
"step": 4
},
{
"entropy": 0.5799560546875,
"epoch": 0.01937984496124031,
"grad_norm": 5.62840439540393,
"learning_rate": 2.0512820512820513e-06,
"loss": 1.3791,
"mean_token_accuracy": 0.6597223430871964,
"num_tokens": 2159510.0,
"step": 5
},
{
"entropy": 0.562835693359375,
"epoch": 0.023255813953488372,
"grad_norm": 5.559443063914328,
"learning_rate": 2.564102564102564e-06,
"loss": 1.3862,
"mean_token_accuracy": 0.6581119578331709,
"num_tokens": 2593106.0,
"step": 6
},
{
"entropy": 0.5673828125,
"epoch": 0.027131782945736434,
"grad_norm": 4.991039051490951,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.3499,
"mean_token_accuracy": 0.661266727373004,
"num_tokens": 3009566.0,
"step": 7
},
{
"entropy": 0.57232666015625,
"epoch": 0.031007751937984496,
"grad_norm": 4.609727165215335,
"learning_rate": 3.58974358974359e-06,
"loss": 1.3037,
"mean_token_accuracy": 0.666324052028358,
"num_tokens": 3442267.0,
"step": 8
},
{
"entropy": 0.5614013671875,
"epoch": 0.03488372093023256,
"grad_norm": 4.395216373431362,
"learning_rate": 4.102564102564103e-06,
"loss": 1.3005,
"mean_token_accuracy": 0.6676654135808349,
"num_tokens": 3885637.0,
"step": 9
},
{
"entropy": 0.58343505859375,
"epoch": 0.03875968992248062,
"grad_norm": 3.6789554797458583,
"learning_rate": 4.615384615384616e-06,
"loss": 1.1796,
"mean_token_accuracy": 0.6888375803828239,
"num_tokens": 4289326.0,
"step": 10
},
{
"entropy": 0.5760498046875,
"epoch": 0.04263565891472868,
"grad_norm": 3.4456412878449334,
"learning_rate": 5.128205128205128e-06,
"loss": 1.1616,
"mean_token_accuracy": 0.6923027131706476,
"num_tokens": 4705258.0,
"step": 11
},
{
"entropy": 0.562744140625,
"epoch": 0.046511627906976744,
"grad_norm": 3.3438497846016206,
"learning_rate": 5.641025641025641e-06,
"loss": 1.1297,
"mean_token_accuracy": 0.6954672196879983,
"num_tokens": 5141166.0,
"step": 12
},
{
"entropy": 0.527740478515625,
"epoch": 0.050387596899224806,
"grad_norm": 4.642324495039121,
"learning_rate": 6.153846153846155e-06,
"loss": 1.0554,
"mean_token_accuracy": 0.711206135340035,
"num_tokens": 5583075.0,
"step": 13
},
{
"entropy": 0.549102783203125,
"epoch": 0.05426356589147287,
"grad_norm": 4.625817739349075,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0342,
"mean_token_accuracy": 0.7171442015096545,
"num_tokens": 5997782.0,
"step": 14
},
{
"entropy": 0.549407958984375,
"epoch": 0.05813953488372093,
"grad_norm": 4.011955003610355,
"learning_rate": 7.17948717948718e-06,
"loss": 0.9871,
"mean_token_accuracy": 0.7261330829933286,
"num_tokens": 6431769.0,
"step": 15
},
{
"entropy": 0.5477294921875,
"epoch": 0.06201550387596899,
"grad_norm": 3.457808901288651,
"learning_rate": 7.692307692307694e-06,
"loss": 0.9568,
"mean_token_accuracy": 0.7292733397334814,
"num_tokens": 6854070.0,
"step": 16
},
{
"entropy": 0.54443359375,
"epoch": 0.06589147286821706,
"grad_norm": 2.6640028081114506,
"learning_rate": 8.205128205128205e-06,
"loss": 0.9393,
"mean_token_accuracy": 0.7348427921533585,
"num_tokens": 7300929.0,
"step": 17
},
{
"entropy": 0.541290283203125,
"epoch": 0.06976744186046512,
"grad_norm": 3.867353613777132,
"learning_rate": 8.717948717948719e-06,
"loss": 0.9207,
"mean_token_accuracy": 0.7395377028733492,
"num_tokens": 7743375.0,
"step": 18
},
{
"entropy": 0.542083740234375,
"epoch": 0.07364341085271318,
"grad_norm": 3.7108241527896415,
"learning_rate": 9.230769230769232e-06,
"loss": 0.9346,
"mean_token_accuracy": 0.7370091788470745,
"num_tokens": 8185557.0,
"step": 19
},
{
"entropy": 0.536529541015625,
"epoch": 0.07751937984496124,
"grad_norm": 3.052580376312922,
"learning_rate": 9.743589743589744e-06,
"loss": 0.889,
"mean_token_accuracy": 0.7455960083752871,
"num_tokens": 8632051.0,
"step": 20
},
{
"entropy": 0.542724609375,
"epoch": 0.08139534883720931,
"grad_norm": 2.773614625944707,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.8748,
"mean_token_accuracy": 0.7437297496944666,
"num_tokens": 9049387.0,
"step": 21
},
{
"entropy": 0.5391845703125,
"epoch": 0.08527131782945736,
"grad_norm": 2.3103684483676905,
"learning_rate": 1.076923076923077e-05,
"loss": 0.8399,
"mean_token_accuracy": 0.7560118213295937,
"num_tokens": 9472374.0,
"step": 22
},
{
"entropy": 0.53289794921875,
"epoch": 0.08914728682170543,
"grad_norm": 2.423538463976918,
"learning_rate": 1.1282051282051283e-05,
"loss": 0.8428,
"mean_token_accuracy": 0.7558635827153921,
"num_tokens": 9904619.0,
"step": 23
},
{
"entropy": 0.53314208984375,
"epoch": 0.09302325581395349,
"grad_norm": 3.133872651991572,
"learning_rate": 1.1794871794871796e-05,
"loss": 0.8011,
"mean_token_accuracy": 0.7630726611241698,
"num_tokens": 10316799.0,
"step": 24
},
{
"entropy": 0.53680419921875,
"epoch": 0.09689922480620156,
"grad_norm": 2.307993317837642,
"learning_rate": 1.230769230769231e-05,
"loss": 0.7905,
"mean_token_accuracy": 0.7649277085438371,
"num_tokens": 10724410.0,
"step": 25
},
{
"entropy": 0.53424072265625,
"epoch": 0.10077519379844961,
"grad_norm": 2.3004423511325736,
"learning_rate": 1.2820512820512823e-05,
"loss": 0.784,
"mean_token_accuracy": 0.7660581776872277,
"num_tokens": 11157441.0,
"step": 26
},
{
"entropy": 0.519683837890625,
"epoch": 0.10465116279069768,
"grad_norm": 2.0605076377059737,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.7622,
"mean_token_accuracy": 0.7721455879509449,
"num_tokens": 11583383.0,
"step": 27
},
{
"entropy": 0.530609130859375,
"epoch": 0.10852713178294573,
"grad_norm": 1.7051329435139446,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.7549,
"mean_token_accuracy": 0.7728732898831367,
"num_tokens": 11996833.0,
"step": 28
},
{
"entropy": 0.50787353515625,
"epoch": 0.1124031007751938,
"grad_norm": 1.653966676070023,
"learning_rate": 1.435897435897436e-05,
"loss": 0.7424,
"mean_token_accuracy": 0.7756998986005783,
"num_tokens": 12434838.0,
"step": 29
},
{
"entropy": 0.50762939453125,
"epoch": 0.11627906976744186,
"grad_norm": 1.7404676617867714,
"learning_rate": 1.4871794871794874e-05,
"loss": 0.7389,
"mean_token_accuracy": 0.7756242621690035,
"num_tokens": 12880605.0,
"step": 30
},
{
"entropy": 0.53057861328125,
"epoch": 0.12015503875968993,
"grad_norm": 1.6941417095470896,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.6965,
"mean_token_accuracy": 0.7865999517962337,
"num_tokens": 13286289.0,
"step": 31
},
{
"entropy": 0.513275146484375,
"epoch": 0.12403100775193798,
"grad_norm": 1.777615436740328,
"learning_rate": 1.5897435897435897e-05,
"loss": 0.7276,
"mean_token_accuracy": 0.7772727366536856,
"num_tokens": 13720208.0,
"step": 32
},
{
"entropy": 0.5091552734375,
"epoch": 0.12790697674418605,
"grad_norm": 2.169414280230603,
"learning_rate": 1.641025641025641e-05,
"loss": 0.7014,
"mean_token_accuracy": 0.7825290272012353,
"num_tokens": 14151016.0,
"step": 33
},
{
"entropy": 0.499481201171875,
"epoch": 0.13178294573643412,
"grad_norm": 1.6130944050268388,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.7115,
"mean_token_accuracy": 0.7820462808012962,
"num_tokens": 14593303.0,
"step": 34
},
{
"entropy": 0.513458251953125,
"epoch": 0.13565891472868216,
"grad_norm": 1.6123542005336788,
"learning_rate": 1.7435897435897438e-05,
"loss": 0.6765,
"mean_token_accuracy": 0.7885658349841833,
"num_tokens": 15017503.0,
"step": 35
},
{
"entropy": 0.490997314453125,
"epoch": 0.13953488372093023,
"grad_norm": 2.012326490340564,
"learning_rate": 1.794871794871795e-05,
"loss": 0.6943,
"mean_token_accuracy": 0.7859224667772651,
"num_tokens": 15471309.0,
"step": 36
},
{
"entropy": 0.498046875,
"epoch": 0.1434108527131783,
"grad_norm": 1.7927014251735478,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.6843,
"mean_token_accuracy": 0.7870860742405057,
"num_tokens": 15908325.0,
"step": 37
},
{
"entropy": 0.499267578125,
"epoch": 0.14728682170542637,
"grad_norm": 1.6105672802191198,
"learning_rate": 1.8974358974358975e-05,
"loss": 0.6589,
"mean_token_accuracy": 0.7950629340484738,
"num_tokens": 16345910.0,
"step": 38
},
{
"entropy": 0.49749755859375,
"epoch": 0.1511627906976744,
"grad_norm": 1.5453421039467747,
"learning_rate": 1.9487179487179488e-05,
"loss": 0.6827,
"mean_token_accuracy": 0.7869697958230972,
"num_tokens": 16794948.0,
"step": 39
},
{
"entropy": 0.49737548828125,
"epoch": 0.15503875968992248,
"grad_norm": 1.6951113924313514,
"learning_rate": 2e-05,
"loss": 0.6532,
"mean_token_accuracy": 0.7926666866987944,
"num_tokens": 17211373.0,
"step": 40
},
{
"entropy": 0.49774169921875,
"epoch": 0.15891472868217055,
"grad_norm": 1.5679650999550647,
"learning_rate": 1.9999908652974457e-05,
"loss": 0.6461,
"mean_token_accuracy": 0.7951579857617617,
"num_tokens": 17650141.0,
"step": 41
},
{
"entropy": 0.507293701171875,
"epoch": 0.16279069767441862,
"grad_norm": 1.868205623671047,
"learning_rate": 1.9999634613566673e-05,
"loss": 0.6458,
"mean_token_accuracy": 0.7966311117634177,
"num_tokens": 18079352.0,
"step": 42
},
{
"entropy": 0.497589111328125,
"epoch": 0.16666666666666666,
"grad_norm": 1.584702952530748,
"learning_rate": 1.9999177886783194e-05,
"loss": 0.6422,
"mean_token_accuracy": 0.7978830458596349,
"num_tokens": 18510478.0,
"step": 43
},
{
"entropy": 0.492156982421875,
"epoch": 0.17054263565891473,
"grad_norm": 1.6650565736181908,
"learning_rate": 1.9998538480968142e-05,
"loss": 0.6564,
"mean_token_accuracy": 0.7950685685500503,
"num_tokens": 18967235.0,
"step": 44
},
{
"entropy": 0.502593994140625,
"epoch": 0.1744186046511628,
"grad_norm": 1.7541270410806087,
"learning_rate": 1.999771640780308e-05,
"loss": 0.6327,
"mean_token_accuracy": 0.7998538371175528,
"num_tokens": 19402140.0,
"step": 45
},
{
"entropy": 0.4981689453125,
"epoch": 0.17829457364341086,
"grad_norm": 1.7630109068067972,
"learning_rate": 1.99967116823068e-05,
"loss": 0.6245,
"mean_token_accuracy": 0.8009940264746547,
"num_tokens": 19828968.0,
"step": 46
},
{
"entropy": 0.491119384765625,
"epoch": 0.1821705426356589,
"grad_norm": 1.6817263064733463,
"learning_rate": 1.9995524322835035e-05,
"loss": 0.6187,
"mean_token_accuracy": 0.8020930821076035,
"num_tokens": 20257306.0,
"step": 47
},
{
"entropy": 0.494384765625,
"epoch": 0.18604651162790697,
"grad_norm": 1.6183581152741309,
"learning_rate": 1.9994154351080137e-05,
"loss": 0.6278,
"mean_token_accuracy": 0.801664580591023,
"num_tokens": 20703796.0,
"step": 48
},
{
"entropy": 0.491363525390625,
"epoch": 0.18992248062015504,
"grad_norm": 1.6613810662187647,
"learning_rate": 1.999260179207068e-05,
"loss": 0.6129,
"mean_token_accuracy": 0.8025477975606918,
"num_tokens": 21131092.0,
"step": 49
},
{
"entropy": 0.4857177734375,
"epoch": 0.1937984496124031,
"grad_norm": 1.426212419722124,
"learning_rate": 1.9990866674170984e-05,
"loss": 0.6166,
"mean_token_accuracy": 0.8041007313877344,
"num_tokens": 21568909.0,
"step": 50
},
{
"entropy": 0.4915771484375,
"epoch": 0.19767441860465115,
"grad_norm": 1.3780333302834848,
"learning_rate": 1.9988949029080625e-05,
"loss": 0.599,
"mean_token_accuracy": 0.8107355292886496,
"num_tokens": 21994935.0,
"step": 51
},
{
"entropy": 0.5010986328125,
"epoch": 0.20155038759689922,
"grad_norm": 1.7509993242634168,
"learning_rate": 1.9986848891833846e-05,
"loss": 0.615,
"mean_token_accuracy": 0.8016165606677532,
"num_tokens": 22410374.0,
"step": 52
},
{
"entropy": 0.482513427734375,
"epoch": 0.2054263565891473,
"grad_norm": 1.3707684982728474,
"learning_rate": 1.9984566300798895e-05,
"loss": 0.6021,
"mean_token_accuracy": 0.8062009122222662,
"num_tokens": 22839536.0,
"step": 53
},
{
"entropy": 0.4803466796875,
"epoch": 0.20930232558139536,
"grad_norm": 1.4431367638619368,
"learning_rate": 1.998210129767735e-05,
"loss": 0.5923,
"mean_token_accuracy": 0.8107659220695496,
"num_tokens": 23270394.0,
"step": 54
},
{
"entropy": 0.4873046875,
"epoch": 0.2131782945736434,
"grad_norm": 1.6712200349650355,
"learning_rate": 1.9979453927503366e-05,
"loss": 0.626,
"mean_token_accuracy": 0.802630621008575,
"num_tokens": 23714867.0,
"step": 55
},
{
"entropy": 0.48211669921875,
"epoch": 0.21705426356589147,
"grad_norm": 1.416046601988453,
"learning_rate": 1.997662423864281e-05,
"loss": 0.6048,
"mean_token_accuracy": 0.8021730659529567,
"num_tokens": 24149194.0,
"step": 56
},
{
"entropy": 0.499420166015625,
"epoch": 0.22093023255813954,
"grad_norm": 1.3801609002151292,
"learning_rate": 1.9973612282792413e-05,
"loss": 0.5847,
"mean_token_accuracy": 0.8101371973752975,
"num_tokens": 24552337.0,
"step": 57
},
{
"entropy": 0.478302001953125,
"epoch": 0.2248062015503876,
"grad_norm": 1.5671567435801814,
"learning_rate": 1.997041811497882e-05,
"loss": 0.6018,
"mean_token_accuracy": 0.8056067563593388,
"num_tokens": 25009560.0,
"step": 58
},
{
"entropy": 0.486572265625,
"epoch": 0.22868217054263565,
"grad_norm": 1.7189084201411189,
"learning_rate": 1.9967041793557578e-05,
"loss": 0.5965,
"mean_token_accuracy": 0.8089054571464658,
"num_tokens": 25448410.0,
"step": 59
},
{
"entropy": 0.4847412109375,
"epoch": 0.23255813953488372,
"grad_norm": 1.7298449784625014,
"learning_rate": 1.996348338021207e-05,
"loss": 0.6061,
"mean_token_accuracy": 0.8035977333784103,
"num_tokens": 25869456.0,
"step": 60
},
{
"entropy": 0.47747802734375,
"epoch": 0.2364341085271318,
"grad_norm": 1.4378887326409655,
"learning_rate": 1.9959742939952393e-05,
"loss": 0.5958,
"mean_token_accuracy": 0.8067287458106875,
"num_tokens": 26304651.0,
"step": 61
},
{
"entropy": 0.470184326171875,
"epoch": 0.24031007751937986,
"grad_norm": 1.408145966867428,
"learning_rate": 1.995582054111416e-05,
"loss": 0.5639,
"mean_token_accuracy": 0.8142334129661322,
"num_tokens": 26717402.0,
"step": 62
},
{
"entropy": 0.477020263671875,
"epoch": 0.2441860465116279,
"grad_norm": 1.286721956523712,
"learning_rate": 1.9951716255357267e-05,
"loss": 0.5788,
"mean_token_accuracy": 0.811956575140357,
"num_tokens": 27150636.0,
"step": 63
},
{
"entropy": 0.47705078125,
"epoch": 0.24806201550387597,
"grad_norm": 1.5668383477560968,
"learning_rate": 1.9947430157664575e-05,
"loss": 0.587,
"mean_token_accuracy": 0.8080101488158107,
"num_tokens": 27591516.0,
"step": 64
},
{
"entropy": 0.465057373046875,
"epoch": 0.25193798449612403,
"grad_norm": 1.2590904210941338,
"learning_rate": 1.994296232634054e-05,
"loss": 0.5843,
"mean_token_accuracy": 0.8114851666614413,
"num_tokens": 28032955.0,
"step": 65
},
{
"entropy": 0.47210693359375,
"epoch": 0.2558139534883721,
"grad_norm": 1.3064580012149183,
"learning_rate": 1.9938312843009776e-05,
"loss": 0.581,
"mean_token_accuracy": 0.8110779244452715,
"num_tokens": 28461266.0,
"step": 66
},
{
"entropy": 0.46649169921875,
"epoch": 0.2596899224806202,
"grad_norm": 1.443208706347969,
"learning_rate": 1.9933481792615583e-05,
"loss": 0.5769,
"mean_token_accuracy": 0.8113029273226857,
"num_tokens": 28910287.0,
"step": 67
},
{
"entropy": 0.469696044921875,
"epoch": 0.26356589147286824,
"grad_norm": 1.3107422511431897,
"learning_rate": 1.9928469263418376e-05,
"loss": 0.5764,
"mean_token_accuracy": 0.8129542609676719,
"num_tokens": 29337607.0,
"step": 68
},
{
"entropy": 0.47607421875,
"epoch": 0.26744186046511625,
"grad_norm": 1.5954507940802074,
"learning_rate": 1.992327534699408e-05,
"loss": 0.5704,
"mean_token_accuracy": 0.8138690665364265,
"num_tokens": 29727407.0,
"step": 69
},
{
"entropy": 0.459442138671875,
"epoch": 0.2713178294573643,
"grad_norm": 1.2075602107711025,
"learning_rate": 1.991790013823246e-05,
"loss": 0.5662,
"mean_token_accuracy": 0.8136689653620124,
"num_tokens": 30159242.0,
"step": 70
},
{
"entropy": 0.462005615234375,
"epoch": 0.2751937984496124,
"grad_norm": 1.3818376110580417,
"learning_rate": 1.991234373533539e-05,
"loss": 0.5497,
"mean_token_accuracy": 0.8170208567753434,
"num_tokens": 30583266.0,
"step": 71
},
{
"entropy": 0.46826171875,
"epoch": 0.27906976744186046,
"grad_norm": 1.4506362747599748,
"learning_rate": 1.990660623981503e-05,
"loss": 0.5711,
"mean_token_accuracy": 0.8127527991309762,
"num_tokens": 31014469.0,
"step": 72
},
{
"entropy": 0.469696044921875,
"epoch": 0.28294573643410853,
"grad_norm": 1.4597438563853555,
"learning_rate": 1.9900687756492022e-05,
"loss": 0.5742,
"mean_token_accuracy": 0.8152262289077044,
"num_tokens": 31442182.0,
"step": 73
},
{
"entropy": 0.463592529296875,
"epoch": 0.2868217054263566,
"grad_norm": 1.587910669929024,
"learning_rate": 1.9894588393493528e-05,
"loss": 0.5704,
"mean_token_accuracy": 0.8138687778264284,
"num_tokens": 31881511.0,
"step": 74
},
{
"entropy": 0.462158203125,
"epoch": 0.29069767441860467,
"grad_norm": 1.3596755737390338,
"learning_rate": 1.9888308262251286e-05,
"loss": 0.5686,
"mean_token_accuracy": 0.8140115709975362,
"num_tokens": 32311809.0,
"step": 75
},
{
"entropy": 0.4627685546875,
"epoch": 0.29457364341085274,
"grad_norm": 1.5095142996250903,
"learning_rate": 1.988184747749956e-05,
"loss": 0.5612,
"mean_token_accuracy": 0.8161912616342306,
"num_tokens": 32730110.0,
"step": 76
},
{
"entropy": 0.455291748046875,
"epoch": 0.29844961240310075,
"grad_norm": 1.4739840663337003,
"learning_rate": 1.9875206157273038e-05,
"loss": 0.5626,
"mean_token_accuracy": 0.814151655882597,
"num_tokens": 33157816.0,
"step": 77
},
{
"entropy": 0.45819091796875,
"epoch": 0.3023255813953488,
"grad_norm": 1.312276706222236,
"learning_rate": 1.9868384422904693e-05,
"loss": 0.5811,
"mean_token_accuracy": 0.8097445927560329,
"num_tokens": 33594309.0,
"step": 78
},
{
"entropy": 0.461273193359375,
"epoch": 0.3062015503875969,
"grad_norm": 1.3662318896095438,
"learning_rate": 1.986138239902355e-05,
"loss": 0.5398,
"mean_token_accuracy": 0.8207592982798815,
"num_tokens": 34027149.0,
"step": 79
},
{
"entropy": 0.460845947265625,
"epoch": 0.31007751937984496,
"grad_norm": 1.114084887070516,
"learning_rate": 1.9854200213552426e-05,
"loss": 0.5491,
"mean_token_accuracy": 0.8194785909727216,
"num_tokens": 34443239.0,
"step": 80
},
{
"entropy": 0.455780029296875,
"epoch": 0.313953488372093,
"grad_norm": 1.3793377958773556,
"learning_rate": 1.9846837997705576e-05,
"loss": 0.5644,
"mean_token_accuracy": 0.8135287668555975,
"num_tokens": 34888207.0,
"step": 81
},
{
"entropy": 0.457061767578125,
"epoch": 0.3178294573643411,
"grad_norm": 1.2399000210904676,
"learning_rate": 1.98392958859863e-05,
"loss": 0.5535,
"mean_token_accuracy": 0.8170906333252788,
"num_tokens": 35326139.0,
"step": 82
},
{
"entropy": 0.45587158203125,
"epoch": 0.32170542635658916,
"grad_norm": 1.2255651250046555,
"learning_rate": 1.9831574016184493e-05,
"loss": 0.5619,
"mean_token_accuracy": 0.8146381946280599,
"num_tokens": 35772478.0,
"step": 83
},
{
"entropy": 0.460906982421875,
"epoch": 0.32558139534883723,
"grad_norm": 1.1547393809140933,
"learning_rate": 1.9823672529374123e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8206607829779387,
"num_tokens": 36195639.0,
"step": 84
},
{
"entropy": 0.464202880859375,
"epoch": 0.32945736434108525,
"grad_norm": 1.3004399814018137,
"learning_rate": 1.9815591569910654e-05,
"loss": 0.5507,
"mean_token_accuracy": 0.8176483260467649,
"num_tokens": 36620535.0,
"step": 85
},
{
"entropy": 0.4609375,
"epoch": 0.3333333333333333,
"grad_norm": 1.2762821791429464,
"learning_rate": 1.980733128542841e-05,
"loss": 0.5518,
"mean_token_accuracy": 0.8188320798799396,
"num_tokens": 37051548.0,
"step": 86
},
{
"entropy": 0.46221923828125,
"epoch": 0.3372093023255814,
"grad_norm": 1.2824385762190618,
"learning_rate": 1.9798891826837872e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8216770840808749,
"num_tokens": 37483698.0,
"step": 87
},
{
"entropy": 0.46661376953125,
"epoch": 0.34108527131782945,
"grad_norm": 1.3876212166283788,
"learning_rate": 1.979027334832293e-05,
"loss": 0.5626,
"mean_token_accuracy": 0.8143940027803183,
"num_tokens": 37910246.0,
"step": 88
},
{
"entropy": 0.46575927734375,
"epoch": 0.3449612403100775,
"grad_norm": 1.1660372191371922,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.5428,
"mean_token_accuracy": 0.820567911490798,
"num_tokens": 38319746.0,
"step": 89
},
{
"entropy": 0.47100830078125,
"epoch": 0.3488372093023256,
"grad_norm": 1.1487166285802124,
"learning_rate": 1.977249996460544e-05,
"loss": 0.5604,
"mean_token_accuracy": 0.816031564027071,
"num_tokens": 38752631.0,
"step": 90
},
{
"entropy": 0.460296630859375,
"epoch": 0.35271317829457366,
"grad_norm": 1.1402604232378695,
"learning_rate": 1.9763345384112044e-05,
"loss": 0.5473,
"mean_token_accuracy": 0.8196038343012333,
"num_tokens": 39182910.0,
"step": 91
},
{
"entropy": 0.46826171875,
"epoch": 0.35658914728682173,
"grad_norm": 1.1923633175594284,
"learning_rate": 1.97540124331066e-05,
"loss": 0.5409,
"mean_token_accuracy": 0.8211136739701033,
"num_tokens": 39621180.0,
"step": 92
},
{
"entropy": 0.460357666015625,
"epoch": 0.36046511627906974,
"grad_norm": 1.2665373099510882,
"learning_rate": 1.974450128209658e-05,
"loss": 0.5531,
"mean_token_accuracy": 0.8183085061609745,
"num_tokens": 40050290.0,
"step": 93
},
{
"entropy": 0.46466064453125,
"epoch": 0.3643410852713178,
"grad_norm": 1.2259431636846179,
"learning_rate": 1.973481210484505e-05,
"loss": 0.5419,
"mean_token_accuracy": 0.8202582132071257,
"num_tokens": 40464004.0,
"step": 94
},
{
"entropy": 0.45953369140625,
"epoch": 0.3682170542635659,
"grad_norm": 1.2314434799144434,
"learning_rate": 1.9724945078367513e-05,
"loss": 0.5331,
"mean_token_accuracy": 0.8198390873149037,
"num_tokens": 40902322.0,
"step": 95
},
{
"entropy": 0.46234130859375,
"epoch": 0.37209302325581395,
"grad_norm": 1.149693863746958,
"learning_rate": 1.9714900382928674e-05,
"loss": 0.5322,
"mean_token_accuracy": 0.8233453892171383,
"num_tokens": 41302073.0,
"step": 96
},
{
"entropy": 0.463897705078125,
"epoch": 0.375968992248062,
"grad_norm": 1.2008159652924766,
"learning_rate": 1.9704678202039148e-05,
"loss": 0.5298,
"mean_token_accuracy": 0.8223507273942232,
"num_tokens": 41716373.0,
"step": 97
},
{
"entropy": 0.46221923828125,
"epoch": 0.3798449612403101,
"grad_norm": 1.1035610955872097,
"learning_rate": 1.9694278722452092e-05,
"loss": 0.556,
"mean_token_accuracy": 0.8180048149079084,
"num_tokens": 42144096.0,
"step": 98
},
{
"entropy": 0.4696044921875,
"epoch": 0.38372093023255816,
"grad_norm": 1.2329688382688684,
"learning_rate": 1.9683702134159815e-05,
"loss": 0.5375,
"mean_token_accuracy": 0.8225236749276519,
"num_tokens": 42563834.0,
"step": 99
},
{
"entropy": 0.46099853515625,
"epoch": 0.3875968992248062,
"grad_norm": 1.1709642889265268,
"learning_rate": 1.9672948630390296e-05,
"loss": 0.5384,
"mean_token_accuracy": 0.819816923700273,
"num_tokens": 42973627.0,
"step": 100
},
{
"entropy": 0.45849609375,
"epoch": 0.39147286821705424,
"grad_norm": 1.1856435417034512,
"learning_rate": 1.9662018407603643e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8200850309804082,
"num_tokens": 43404575.0,
"step": 101
},
{
"entropy": 0.4561767578125,
"epoch": 0.3953488372093023,
"grad_norm": 1.228277406849483,
"learning_rate": 1.9650911665488533e-05,
"loss": 0.5292,
"mean_token_accuracy": 0.8224469656124711,
"num_tokens": 43837685.0,
"step": 102
},
{
"entropy": 0.46295166015625,
"epoch": 0.3992248062015504,
"grad_norm": 1.0680952459464894,
"learning_rate": 1.9639628606958535e-05,
"loss": 0.5404,
"mean_token_accuracy": 0.8188497675582767,
"num_tokens": 44260189.0,
"step": 103
},
{
"entropy": 0.447418212890625,
"epoch": 0.40310077519379844,
"grad_norm": 1.1981648971283583,
"learning_rate": 1.9628169438148414e-05,
"loss": 0.544,
"mean_token_accuracy": 0.8205827260389924,
"num_tokens": 44688279.0,
"step": 104
},
{
"entropy": 0.45977783203125,
"epoch": 0.4069767441860465,
"grad_norm": 1.2193905149033133,
"learning_rate": 1.9616534368410364e-05,
"loss": 0.5429,
"mean_token_accuracy": 0.8211770560592413,
"num_tokens": 45096594.0,
"step": 105
},
{
"entropy": 0.45819091796875,
"epoch": 0.4108527131782946,
"grad_norm": 1.1549313358131803,
"learning_rate": 1.9604723610310195e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8212394239380956,
"num_tokens": 45512935.0,
"step": 106
},
{
"entropy": 0.45819091796875,
"epoch": 0.41472868217054265,
"grad_norm": 1.1221291833410496,
"learning_rate": 1.9592737379623427e-05,
"loss": 0.5384,
"mean_token_accuracy": 0.8197078760713339,
"num_tokens": 45942440.0,
"step": 107
},
{
"entropy": 0.45684814453125,
"epoch": 0.4186046511627907,
"grad_norm": 1.0352641413953396,
"learning_rate": 1.9580575895331364e-05,
"loss": 0.5403,
"mean_token_accuracy": 0.8169021736830473,
"num_tokens": 46386550.0,
"step": 108
},
{
"entropy": 0.459136962890625,
"epoch": 0.42248062015503873,
"grad_norm": 1.105470239941562,
"learning_rate": 1.956823937961709e-05,
"loss": 0.5533,
"mean_token_accuracy": 0.81779002584517,
"num_tokens": 46827445.0,
"step": 109
},
{
"entropy": 0.463409423828125,
"epoch": 0.4263565891472868,
"grad_norm": 1.179096360799542,
"learning_rate": 1.955572805786141e-05,
"loss": 0.5264,
"mean_token_accuracy": 0.8238548217341304,
"num_tokens": 47239972.0,
"step": 110
},
{
"entropy": 0.456756591796875,
"epoch": 0.43023255813953487,
"grad_norm": 1.1110689927671835,
"learning_rate": 1.9543042158638728e-05,
"loss": 0.5279,
"mean_token_accuracy": 0.8251189421862364,
"num_tokens": 47645121.0,
"step": 111
},
{
"entropy": 0.453582763671875,
"epoch": 0.43410852713178294,
"grad_norm": 1.2062009597372043,
"learning_rate": 1.9530181913712875e-05,
"loss": 0.5388,
"mean_token_accuracy": 0.8221446331590414,
"num_tokens": 48092214.0,
"step": 112
},
{
"entropy": 0.45489501953125,
"epoch": 0.437984496124031,
"grad_norm": 1.103293645738396,
"learning_rate": 1.9517147558032877e-05,
"loss": 0.5341,
"mean_token_accuracy": 0.8221006505191326,
"num_tokens": 48526733.0,
"step": 113
},
{
"entropy": 0.457305908203125,
"epoch": 0.4418604651162791,
"grad_norm": 1.0568614511684307,
"learning_rate": 1.9503939329728657e-05,
"loss": 0.5296,
"mean_token_accuracy": 0.8257120624184608,
"num_tokens": 48933549.0,
"step": 114
},
{
"entropy": 0.449493408203125,
"epoch": 0.44573643410852715,
"grad_norm": 1.0530308845913583,
"learning_rate": 1.949055747010669e-05,
"loss": 0.5476,
"mean_token_accuracy": 0.8189520025625825,
"num_tokens": 49385777.0,
"step": 115
},
{
"entropy": 0.451873779296875,
"epoch": 0.4496124031007752,
"grad_norm": 1.201764585546533,
"learning_rate": 1.9477002223645587e-05,
"loss": 0.518,
"mean_token_accuracy": 0.8288652747869492,
"num_tokens": 49815021.0,
"step": 116
},
{
"entropy": 0.4442138671875,
"epoch": 0.45348837209302323,
"grad_norm": 1.1614506102963222,
"learning_rate": 1.9463273837991643e-05,
"loss": 0.5252,
"mean_token_accuracy": 0.8240685043856502,
"num_tokens": 50253819.0,
"step": 117
},
{
"entropy": 0.441131591796875,
"epoch": 0.4573643410852713,
"grad_norm": 1.057659936365217,
"learning_rate": 1.9449372563954293e-05,
"loss": 0.5335,
"mean_token_accuracy": 0.8224316090345383,
"num_tokens": 50705508.0,
"step": 118
},
{
"entropy": 0.450897216796875,
"epoch": 0.46124031007751937,
"grad_norm": 1.1289057507904565,
"learning_rate": 1.9435298655501547e-05,
"loss": 0.5202,
"mean_token_accuracy": 0.8252356611192226,
"num_tokens": 51126464.0,
"step": 119
},
{
"entropy": 0.4454345703125,
"epoch": 0.46511627906976744,
"grad_norm": 1.1783048839311618,
"learning_rate": 1.9421052369755335e-05,
"loss": 0.5383,
"mean_token_accuracy": 0.8221455095335841,
"num_tokens": 51558968.0,
"step": 120
},
{
"entropy": 0.454193115234375,
"epoch": 0.4689922480620155,
"grad_norm": 1.1118532244729238,
"learning_rate": 1.9406633966986828e-05,
"loss": 0.5349,
"mean_token_accuracy": 0.8201886266469955,
"num_tokens": 51978742.0,
"step": 121
},
{
"entropy": 0.456939697265625,
"epoch": 0.4728682170542636,
"grad_norm": 1.3616417906601193,
"learning_rate": 1.939204371061166e-05,
"loss": 0.531,
"mean_token_accuracy": 0.8195713134482503,
"num_tokens": 52402161.0,
"step": 122
},
{
"entropy": 0.4530029296875,
"epoch": 0.47674418604651164,
"grad_norm": 1.0890779163412931,
"learning_rate": 1.9377281867185145e-05,
"loss": 0.5412,
"mean_token_accuracy": 0.8204515632241964,
"num_tokens": 52839250.0,
"step": 123
},
{
"entropy": 0.44537353515625,
"epoch": 0.4806201550387597,
"grad_norm": 1.3916228521360827,
"learning_rate": 1.9362348706397374e-05,
"loss": 0.5496,
"mean_token_accuracy": 0.8194613959640265,
"num_tokens": 53288733.0,
"step": 124
},
{
"entropy": 0.45404052734375,
"epoch": 0.4844961240310077,
"grad_norm": 1.1216245307718116,
"learning_rate": 1.934724450106831e-05,
"loss": 0.518,
"mean_token_accuracy": 0.8268485888838768,
"num_tokens": 53703856.0,
"step": 125
},
{
"entropy": 0.4644775390625,
"epoch": 0.4883720930232558,
"grad_norm": 1.292642203861098,
"learning_rate": 1.9331969527142805e-05,
"loss": 0.5248,
"mean_token_accuracy": 0.8242862829938531,
"num_tokens": 54125616.0,
"step": 126
},
{
"entropy": 0.457183837890625,
"epoch": 0.49224806201550386,
"grad_norm": 1.1577638631630167,
"learning_rate": 1.9316524063685544e-05,
"loss": 0.5329,
"mean_token_accuracy": 0.8215478174388409,
"num_tokens": 54559965.0,
"step": 127
},
{
"entropy": 0.463348388671875,
"epoch": 0.49612403100775193,
"grad_norm": 1.1108314222482614,
"learning_rate": 1.930090839287595e-05,
"loss": 0.5224,
"mean_token_accuracy": 0.8258008388802409,
"num_tokens": 54983536.0,
"step": 128
},
{
"entropy": 0.46112060546875,
"epoch": 0.5,
"grad_norm": 1.209480669363572,
"learning_rate": 1.9285122800003045e-05,
"loss": 0.5335,
"mean_token_accuracy": 0.8223184822127223,
"num_tokens": 55414016.0,
"step": 129
},
{
"entropy": 0.447906494140625,
"epoch": 0.5038759689922481,
"grad_norm": 1.1619619363279732,
"learning_rate": 1.926916757346022e-05,
"loss": 0.529,
"mean_token_accuracy": 0.8222861513495445,
"num_tokens": 55844385.0,
"step": 130
},
{
"entropy": 0.461822509765625,
"epoch": 0.5077519379844961,
"grad_norm": 1.1171998492566917,
"learning_rate": 1.9253043004739967e-05,
"loss": 0.5229,
"mean_token_accuracy": 0.8233607662841678,
"num_tokens": 56275861.0,
"step": 131
},
{
"entropy": 0.45672607421875,
"epoch": 0.5116279069767442,
"grad_norm": 1.0256784025552215,
"learning_rate": 1.923674938842857e-05,
"loss": 0.5292,
"mean_token_accuracy": 0.8243364058434963,
"num_tokens": 56732989.0,
"step": 132
},
{
"entropy": 0.464263916015625,
"epoch": 0.5155038759689923,
"grad_norm": 1.0143232022410413,
"learning_rate": 1.9220287022200707e-05,
"loss": 0.5018,
"mean_token_accuracy": 0.8304745489731431,
"num_tokens": 57152314.0,
"step": 133
},
{
"entropy": 0.4564208984375,
"epoch": 0.5193798449612403,
"grad_norm": 1.1661400246068792,
"learning_rate": 1.920365620681401e-05,
"loss": 0.5211,
"mean_token_accuracy": 0.8252125987783074,
"num_tokens": 57590357.0,
"step": 134
},
{
"entropy": 0.456329345703125,
"epoch": 0.5232558139534884,
"grad_norm": 0.9456257334190764,
"learning_rate": 1.9186857246103586e-05,
"loss": 0.5186,
"mean_token_accuracy": 0.824681076221168,
"num_tokens": 58041311.0,
"step": 135
},
{
"entropy": 0.4566650390625,
"epoch": 0.5271317829457365,
"grad_norm": 1.0414842833260838,
"learning_rate": 1.9169890446976454e-05,
"loss": 0.5184,
"mean_token_accuracy": 0.8255736064165831,
"num_tokens": 58469229.0,
"step": 136
},
{
"entropy": 0.46527099609375,
"epoch": 0.5310077519379846,
"grad_norm": 0.9659958295123616,
"learning_rate": 1.9152756119405937e-05,
"loss": 0.5214,
"mean_token_accuracy": 0.8240566346794367,
"num_tokens": 58871704.0,
"step": 137
},
{
"entropy": 0.465423583984375,
"epoch": 0.5348837209302325,
"grad_norm": 1.1234923378415513,
"learning_rate": 1.913545457642601e-05,
"loss": 0.5264,
"mean_token_accuracy": 0.8233613995835185,
"num_tokens": 59297215.0,
"step": 138
},
{
"entropy": 0.4613037109375,
"epoch": 0.5387596899224806,
"grad_norm": 0.9179083293680214,
"learning_rate": 1.911798613412557e-05,
"loss": 0.5181,
"mean_token_accuracy": 0.8269952731207013,
"num_tokens": 59730637.0,
"step": 139
},
{
"entropy": 0.46160888671875,
"epoch": 0.5426356589147286,
"grad_norm": 0.9491732293660206,
"learning_rate": 1.9100351111642666e-05,
"loss": 0.5289,
"mean_token_accuracy": 0.8205523490905762,
"num_tokens": 60156285.0,
"step": 140
},
{
"entropy": 0.453521728515625,
"epoch": 0.5465116279069767,
"grad_norm": 1.102929475751906,
"learning_rate": 1.908254983115867e-05,
"loss": 0.5133,
"mean_token_accuracy": 0.8261066768318415,
"num_tokens": 60596318.0,
"step": 141
},
{
"entropy": 0.455657958984375,
"epoch": 0.5503875968992248,
"grad_norm": 1.0744997289996236,
"learning_rate": 1.9064582617892383e-05,
"loss": 0.5054,
"mean_token_accuracy": 0.8298247829079628,
"num_tokens": 61012662.0,
"step": 142
},
{
"entropy": 0.44940185546875,
"epoch": 0.5542635658914729,
"grad_norm": 1.153734884287017,
"learning_rate": 1.9046449800094103e-05,
"loss": 0.5114,
"mean_token_accuracy": 0.8287463616579771,
"num_tokens": 61440619.0,
"step": 143
},
{
"entropy": 0.452392578125,
"epoch": 0.5581395348837209,
"grad_norm": 1.1039031001679573,
"learning_rate": 1.902815170903963e-05,
"loss": 0.5241,
"mean_token_accuracy": 0.8248511329293251,
"num_tokens": 61883413.0,
"step": 144
},
{
"entropy": 0.4530029296875,
"epoch": 0.562015503875969,
"grad_norm": 1.068677659477954,
"learning_rate": 1.900968867902419e-05,
"loss": 0.5244,
"mean_token_accuracy": 0.8252197271212935,
"num_tokens": 62329563.0,
"step": 145
},
{
"entropy": 0.4559326171875,
"epoch": 0.5658914728682171,
"grad_norm": 1.0529708726567546,
"learning_rate": 1.8991061047356374e-05,
"loss": 0.514,
"mean_token_accuracy": 0.82704047113657,
"num_tokens": 62770914.0,
"step": 146
},
{
"entropy": 0.4534912109375,
"epoch": 0.5697674418604651,
"grad_norm": 0.9988406542616296,
"learning_rate": 1.8972269154351917e-05,
"loss": 0.5222,
"mean_token_accuracy": 0.8278125440701842,
"num_tokens": 63212782.0,
"step": 147
},
{
"entropy": 0.4549560546875,
"epoch": 0.5736434108527132,
"grad_norm": 0.969074986348695,
"learning_rate": 1.895331334332753e-05,
"loss": 0.5268,
"mean_token_accuracy": 0.8241426143795252,
"num_tokens": 63652708.0,
"step": 148
},
{
"entropy": 0.45428466796875,
"epoch": 0.5775193798449613,
"grad_norm": 0.977410620254015,
"learning_rate": 1.893419396059461e-05,
"loss": 0.5115,
"mean_token_accuracy": 0.8301441119983792,
"num_tokens": 64074452.0,
"step": 149
},
{
"entropy": 0.453765869140625,
"epoch": 0.5813953488372093,
"grad_norm": 1.0071558774043947,
"learning_rate": 1.8914911355452895e-05,
"loss": 0.5063,
"mean_token_accuracy": 0.8300048960372806,
"num_tokens": 64484680.0,
"step": 150
},
{
"entropy": 0.44122314453125,
"epoch": 0.5852713178294574,
"grad_norm": 0.956524944955269,
"learning_rate": 1.889546588018412e-05,
"loss": 0.5159,
"mean_token_accuracy": 0.8281158301979303,
"num_tokens": 64924985.0,
"step": 151
},
{
"entropy": 0.4344482421875,
"epoch": 0.5891472868217055,
"grad_norm": 0.9899399473264823,
"learning_rate": 1.8875857890045544e-05,
"loss": 0.5146,
"mean_token_accuracy": 0.8271653046831489,
"num_tokens": 65366395.0,
"step": 152
},
{
"entropy": 0.4498291015625,
"epoch": 0.5930232558139535,
"grad_norm": 0.9972264103529548,
"learning_rate": 1.885608774326348e-05,
"loss": 0.5066,
"mean_token_accuracy": 0.8289199098944664,
"num_tokens": 65787947.0,
"step": 153
},
{
"entropy": 0.4351806640625,
"epoch": 0.5968992248062015,
"grad_norm": 0.948063517501392,
"learning_rate": 1.8836155801026754e-05,
"loss": 0.5193,
"mean_token_accuracy": 0.8260297365486622,
"num_tokens": 66248911.0,
"step": 154
},
{
"entropy": 0.449737548828125,
"epoch": 0.6007751937984496,
"grad_norm": 0.9561345172101009,
"learning_rate": 1.881606242748009e-05,
"loss": 0.4962,
"mean_token_accuracy": 0.8330451222136617,
"num_tokens": 66672337.0,
"step": 155
},
{
"entropy": 0.45330810546875,
"epoch": 0.6046511627906976,
"grad_norm": 1.0343090275971194,
"learning_rate": 1.8795807989717473e-05,
"loss": 0.5035,
"mean_token_accuracy": 0.8295647175982594,
"num_tokens": 67089851.0,
"step": 156
},
{
"entropy": 0.462554931640625,
"epoch": 0.6085271317829457,
"grad_norm": 0.8231981809906675,
"learning_rate": 1.877539285777543e-05,
"loss": 0.514,
"mean_token_accuracy": 0.8288662061095238,
"num_tokens": 67512314.0,
"step": 157
},
{
"entropy": 0.450164794921875,
"epoch": 0.6124031007751938,
"grad_norm": 1.0053067273936602,
"learning_rate": 1.8754817404626275e-05,
"loss": 0.5092,
"mean_token_accuracy": 0.8304671561345458,
"num_tokens": 67935246.0,
"step": 158
},
{
"entropy": 0.45330810546875,
"epoch": 0.6162790697674418,
"grad_norm": 1.0010500407032097,
"learning_rate": 1.87340820061713e-05,
"loss": 0.5173,
"mean_token_accuracy": 0.8267777897417545,
"num_tokens": 68349691.0,
"step": 159
},
{
"entropy": 0.449676513671875,
"epoch": 0.6201550387596899,
"grad_norm": 0.9522314009042264,
"learning_rate": 1.8713187041233896e-05,
"loss": 0.5141,
"mean_token_accuracy": 0.8266640789806843,
"num_tokens": 68790259.0,
"step": 160
},
{
"entropy": 0.451171875,
"epoch": 0.624031007751938,
"grad_norm": 1.0963807833125467,
"learning_rate": 1.8692132891552644e-05,
"loss": 0.5176,
"mean_token_accuracy": 0.826544975861907,
"num_tokens": 69228021.0,
"step": 161
},
{
"entropy": 0.458221435546875,
"epoch": 0.627906976744186,
"grad_norm": 1.0045498488288913,
"learning_rate": 1.867091994177433e-05,
"loss": 0.5075,
"mean_token_accuracy": 0.8274988839402795,
"num_tokens": 69643321.0,
"step": 162
},
{
"entropy": 0.44891357421875,
"epoch": 0.6317829457364341,
"grad_norm": 0.9622723880491996,
"learning_rate": 1.8649548579446938e-05,
"loss": 0.5072,
"mean_token_accuracy": 0.8281608214601874,
"num_tokens": 70091782.0,
"step": 163
},
{
"entropy": 0.450439453125,
"epoch": 0.6356589147286822,
"grad_norm": 0.9636809427862026,
"learning_rate": 1.862801919501253e-05,
"loss": 0.5006,
"mean_token_accuracy": 0.8330097962170839,
"num_tokens": 70515135.0,
"step": 164
},
{
"entropy": 0.445159912109375,
"epoch": 0.6395348837209303,
"grad_norm": 0.9632629012572823,
"learning_rate": 1.8606332181800165e-05,
"loss": 0.507,
"mean_token_accuracy": 0.8288661614060402,
"num_tokens": 70952827.0,
"step": 165
},
{
"entropy": 0.450958251953125,
"epoch": 0.6434108527131783,
"grad_norm": 0.9924678229906452,
"learning_rate": 1.8584487936018663e-05,
"loss": 0.5127,
"mean_token_accuracy": 0.8285386795178056,
"num_tokens": 71384968.0,
"step": 166
},
{
"entropy": 0.45819091796875,
"epoch": 0.6472868217054264,
"grad_norm": 0.9772765903059605,
"learning_rate": 1.8562486856749403e-05,
"loss": 0.5048,
"mean_token_accuracy": 0.8292614417150617,
"num_tokens": 71795936.0,
"step": 167
},
{
"entropy": 0.44146728515625,
"epoch": 0.6511627906976745,
"grad_norm": 0.9934626771494418,
"learning_rate": 1.8540329345939015e-05,
"loss": 0.4996,
"mean_token_accuracy": 0.8288496835157275,
"num_tokens": 72234579.0,
"step": 168
},
{
"entropy": 0.446746826171875,
"epoch": 0.6550387596899225,
"grad_norm": 0.9589171182214611,
"learning_rate": 1.8518015808392045e-05,
"loss": 0.5056,
"mean_token_accuracy": 0.8287281664088368,
"num_tokens": 72666870.0,
"step": 169
},
{
"entropy": 0.453857421875,
"epoch": 0.6589147286821705,
"grad_norm": 0.9087478052569928,
"learning_rate": 1.849554665176354e-05,
"loss": 0.5077,
"mean_token_accuracy": 0.8301580296829343,
"num_tokens": 73105141.0,
"step": 170
},
{
"entropy": 0.45330810546875,
"epoch": 0.6627906976744186,
"grad_norm": 1.0302730313362078,
"learning_rate": 1.8472922286551633e-05,
"loss": 0.5096,
"mean_token_accuracy": 0.827417085878551,
"num_tokens": 73535123.0,
"step": 171
},
{
"entropy": 0.4488525390625,
"epoch": 0.6666666666666666,
"grad_norm": 0.840700088225803,
"learning_rate": 1.8450143126090015e-05,
"loss": 0.4873,
"mean_token_accuracy": 0.8343714782968163,
"num_tokens": 73959285.0,
"step": 172
},
{
"entropy": 0.446319580078125,
"epoch": 0.6705426356589147,
"grad_norm": 0.9380020082651943,
"learning_rate": 1.8427209586540392e-05,
"loss": 0.5034,
"mean_token_accuracy": 0.8296555746346712,
"num_tokens": 74402105.0,
"step": 173
},
{
"entropy": 0.4476318359375,
"epoch": 0.6744186046511628,
"grad_norm": 0.9403613147062398,
"learning_rate": 1.8404122086884898e-05,
"loss": 0.5018,
"mean_token_accuracy": 0.8316757902503014,
"num_tokens": 74832515.0,
"step": 174
},
{
"entropy": 0.439788818359375,
"epoch": 0.6782945736434108,
"grad_norm": 0.8586381294647795,
"learning_rate": 1.8380881048918406e-05,
"loss": 0.4989,
"mean_token_accuracy": 0.8328359462320805,
"num_tokens": 75286480.0,
"step": 175
},
{
"entropy": 0.452789306640625,
"epoch": 0.6821705426356589,
"grad_norm": 0.8058022848926649,
"learning_rate": 1.8357486897240866e-05,
"loss": 0.5016,
"mean_token_accuracy": 0.8294941317290068,
"num_tokens": 75693389.0,
"step": 176
},
{
"entropy": 0.458770751953125,
"epoch": 0.686046511627907,
"grad_norm": 1.0045483433681783,
"learning_rate": 1.83339400592495e-05,
"loss": 0.4939,
"mean_token_accuracy": 0.8322997633367777,
"num_tokens": 76123537.0,
"step": 177
},
{
"entropy": 0.4451904296875,
"epoch": 0.689922480620155,
"grad_norm": 0.8651822682228671,
"learning_rate": 1.831024096513104e-05,
"loss": 0.4962,
"mean_token_accuracy": 0.832309733144939,
"num_tokens": 76575873.0,
"step": 178
},
{
"entropy": 0.45037841796875,
"epoch": 0.6937984496124031,
"grad_norm": 0.878788280336596,
"learning_rate": 1.8286390047853835e-05,
"loss": 0.4738,
"mean_token_accuracy": 0.837097929790616,
"num_tokens": 76979316.0,
"step": 179
},
{
"entropy": 0.45184326171875,
"epoch": 0.6976744186046512,
"grad_norm": 0.9694188623669399,
"learning_rate": 1.826238774315995e-05,
"loss": 0.4956,
"mean_token_accuracy": 0.8334890305995941,
"num_tokens": 77407125.0,
"step": 180
},
{
"entropy": 0.441192626953125,
"epoch": 0.7015503875968992,
"grad_norm": 0.8080436391493971,
"learning_rate": 1.8238234489557217e-05,
"loss": 0.4953,
"mean_token_accuracy": 0.829826689325273,
"num_tokens": 77849752.0,
"step": 181
},
{
"entropy": 0.446533203125,
"epoch": 0.7054263565891473,
"grad_norm": 1.0021744357127353,
"learning_rate": 1.821393072831121e-05,
"loss": 0.5097,
"mean_token_accuracy": 0.8279161658138037,
"num_tokens": 78281078.0,
"step": 182
},
{
"entropy": 0.45159912109375,
"epoch": 0.7093023255813954,
"grad_norm": 0.9280479231187806,
"learning_rate": 1.818947690343719e-05,
"loss": 0.4951,
"mean_token_accuracy": 0.8309991173446178,
"num_tokens": 78706365.0,
"step": 183
},
{
"entropy": 0.448455810546875,
"epoch": 0.7131782945736435,
"grad_norm": 0.9870261846497186,
"learning_rate": 1.8164873461691987e-05,
"loss": 0.4978,
"mean_token_accuracy": 0.8303654547780752,
"num_tokens": 79135421.0,
"step": 184
},
{
"entropy": 0.44256591796875,
"epoch": 0.7170542635658915,
"grad_norm": 0.9269194775003367,
"learning_rate": 1.814012085256585e-05,
"loss": 0.4831,
"mean_token_accuracy": 0.8345851162448525,
"num_tokens": 79566359.0,
"step": 185
},
{
"entropy": 0.450531005859375,
"epoch": 0.7209302325581395,
"grad_norm": 0.9698714267278364,
"learning_rate": 1.811521952827422e-05,
"loss": 0.4889,
"mean_token_accuracy": 0.8325546151027083,
"num_tokens": 79990413.0,
"step": 186
},
{
"entropy": 0.44677734375,
"epoch": 0.7248062015503876,
"grad_norm": 0.9371152240387562,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.4935,
"mean_token_accuracy": 0.8314268151298165,
"num_tokens": 80408016.0,
"step": 187
},
{
"entropy": 0.448150634765625,
"epoch": 0.7286821705426356,
"grad_norm": 0.8702924632944642,
"learning_rate": 1.806497255663263e-05,
"loss": 0.4957,
"mean_token_accuracy": 0.8298279447481036,
"num_tokens": 80821220.0,
"step": 188
},
{
"entropy": 0.438232421875,
"epoch": 0.7325581395348837,
"grad_norm": 0.9905266797503475,
"learning_rate": 1.8039627827264953e-05,
"loss": 0.5055,
"mean_token_accuracy": 0.829126013442874,
"num_tokens": 81260674.0,
"step": 189
},
{
"entropy": 0.443389892578125,
"epoch": 0.7364341085271318,
"grad_norm": 0.8574532206726159,
"learning_rate": 1.8014136218679566e-05,
"loss": 0.4995,
"mean_token_accuracy": 0.8337188037112355,
"num_tokens": 81699370.0,
"step": 190
},
{
"entropy": 0.447723388671875,
"epoch": 0.7403100775193798,
"grad_norm": 0.8787803121596964,
"learning_rate": 1.7988498196593007e-05,
"loss": 0.5025,
"mean_token_accuracy": 0.831497854553163,
"num_tokens": 82126270.0,
"step": 191
},
{
"entropy": 0.439666748046875,
"epoch": 0.7441860465116279,
"grad_norm": 0.9001985159271618,
"learning_rate": 1.796271422939668e-05,
"loss": 0.4999,
"mean_token_accuracy": 0.8317448329180479,
"num_tokens": 82567340.0,
"step": 192
},
{
"entropy": 0.446136474609375,
"epoch": 0.748062015503876,
"grad_norm": 0.9164028392242967,
"learning_rate": 1.793678478814833e-05,
"loss": 0.5049,
"mean_token_accuracy": 0.828097378835082,
"num_tokens": 82992466.0,
"step": 193
},
{
"entropy": 0.4488525390625,
"epoch": 0.751937984496124,
"grad_norm": 0.8485676413423029,
"learning_rate": 1.7910710346563417e-05,
"loss": 0.4957,
"mean_token_accuracy": 0.8310582870617509,
"num_tokens": 83434886.0,
"step": 194
},
{
"entropy": 0.446441650390625,
"epoch": 0.7558139534883721,
"grad_norm": 0.9286191377343639,
"learning_rate": 1.788449138100648e-05,
"loss": 0.4932,
"mean_token_accuracy": 0.8343960093334317,
"num_tokens": 83880177.0,
"step": 195
},
{
"entropy": 0.4471435546875,
"epoch": 0.7596899224806202,
"grad_norm": 0.8796929401948946,
"learning_rate": 1.7858128370482427e-05,
"loss": 0.4784,
"mean_token_accuracy": 0.8362671909853816,
"num_tokens": 84287722.0,
"step": 196
},
{
"entropy": 0.435089111328125,
"epoch": 0.7635658914728682,
"grad_norm": 0.8294179065956871,
"learning_rate": 1.7831621796627773e-05,
"loss": 0.5043,
"mean_token_accuracy": 0.8294558906927705,
"num_tokens": 84744150.0,
"step": 197
},
{
"entropy": 0.4432373046875,
"epoch": 0.7674418604651163,
"grad_norm": 0.8675227812064625,
"learning_rate": 1.7804972143701853e-05,
"loss": 0.4927,
"mean_token_accuracy": 0.8332074852660298,
"num_tokens": 85180131.0,
"step": 198
},
{
"entropy": 0.44091796875,
"epoch": 0.7713178294573644,
"grad_norm": 0.8952478928442558,
"learning_rate": 1.7778179898577973e-05,
"loss": 0.4982,
"mean_token_accuracy": 0.8292250717058778,
"num_tokens": 85630008.0,
"step": 199
},
{
"entropy": 0.44366455078125,
"epoch": 0.7751937984496124,
"grad_norm": 0.8447485958176423,
"learning_rate": 1.775124555073452e-05,
"loss": 0.4868,
"mean_token_accuracy": 0.8350116610527039,
"num_tokens": 86037948.0,
"step": 200
},
{
"entropy": 0.4471435546875,
"epoch": 0.7790697674418605,
"grad_norm": 0.8111264675206218,
"learning_rate": 1.7724169592245996e-05,
"loss": 0.4847,
"mean_token_accuracy": 0.836847304366529,
"num_tokens": 86448906.0,
"step": 201
},
{
"entropy": 0.437042236328125,
"epoch": 0.7829457364341085,
"grad_norm": 0.887117371160646,
"learning_rate": 1.769695251777406e-05,
"loss": 0.4912,
"mean_token_accuracy": 0.8314886456355453,
"num_tokens": 86877191.0,
"step": 202
},
{
"entropy": 0.438079833984375,
"epoch": 0.7868217054263565,
"grad_norm": 0.8276242886119606,
"learning_rate": 1.7669594824558474e-05,
"loss": 0.4848,
"mean_token_accuracy": 0.8351955693215132,
"num_tokens": 87313657.0,
"step": 203
},
{
"entropy": 0.444000244140625,
"epoch": 0.7906976744186046,
"grad_norm": 0.7558616852027694,
"learning_rate": 1.7642097012408013e-05,
"loss": 0.4865,
"mean_token_accuracy": 0.835452251136303,
"num_tokens": 87736991.0,
"step": 204
},
{
"entropy": 0.4342041015625,
"epoch": 0.7945736434108527,
"grad_norm": 0.8020890652572583,
"learning_rate": 1.7614459583691346e-05,
"loss": 0.4813,
"mean_token_accuracy": 0.8367800936102867,
"num_tokens": 88175181.0,
"step": 205
},
{
"entropy": 0.44451904296875,
"epoch": 0.7984496124031008,
"grad_norm": 0.8402541788890805,
"learning_rate": 1.758668304332786e-05,
"loss": 0.4756,
"mean_token_accuracy": 0.8371487222611904,
"num_tokens": 88589103.0,
"step": 206
},
{
"entropy": 0.436370849609375,
"epoch": 0.8023255813953488,
"grad_norm": 0.7812249400484098,
"learning_rate": 1.755876789877842e-05,
"loss": 0.4844,
"mean_token_accuracy": 0.8344348035752773,
"num_tokens": 89015985.0,
"step": 207
},
{
"entropy": 0.44293212890625,
"epoch": 0.8062015503875969,
"grad_norm": 0.8424590194694084,
"learning_rate": 1.7530714660036112e-05,
"loss": 0.4936,
"mean_token_accuracy": 0.8327551614493132,
"num_tokens": 89446817.0,
"step": 208
},
{
"entropy": 0.444427490234375,
"epoch": 0.810077519379845,
"grad_norm": 0.7878206490332407,
"learning_rate": 1.7502523839616916e-05,
"loss": 0.5012,
"mean_token_accuracy": 0.8302563494071364,
"num_tokens": 89867870.0,
"step": 209
},
{
"entropy": 0.446258544921875,
"epoch": 0.813953488372093,
"grad_norm": 0.8461103209901029,
"learning_rate": 1.7474195952550355e-05,
"loss": 0.4935,
"mean_token_accuracy": 0.8328652335330844,
"num_tokens": 90321128.0,
"step": 210
},
{
"entropy": 0.45703125,
"epoch": 0.8178294573643411,
"grad_norm": 0.8655537216150097,
"learning_rate": 1.744573151637007e-05,
"loss": 0.4983,
"mean_token_accuracy": 0.8308916166424751,
"num_tokens": 90750470.0,
"step": 211
},
{
"entropy": 0.4500732421875,
"epoch": 0.8217054263565892,
"grad_norm": 0.8454143832435108,
"learning_rate": 1.7417131051104382e-05,
"loss": 0.475,
"mean_token_accuracy": 0.8389784749597311,
"num_tokens": 91185119.0,
"step": 212
},
{
"entropy": 0.449249267578125,
"epoch": 0.8255813953488372,
"grad_norm": 0.8064273191403369,
"learning_rate": 1.738839507926677e-05,
"loss": 0.4794,
"mean_token_accuracy": 0.8374122427776456,
"num_tokens": 91618372.0,
"step": 213
},
{
"entropy": 0.442901611328125,
"epoch": 0.8294573643410853,
"grad_norm": 0.8099550078779665,
"learning_rate": 1.7359524125846353e-05,
"loss": 0.4675,
"mean_token_accuracy": 0.8417509058490396,
"num_tokens": 92056592.0,
"step": 214
},
{
"entropy": 0.445098876953125,
"epoch": 0.8333333333333334,
"grad_norm": 0.871757253886241,
"learning_rate": 1.7330518718298263e-05,
"loss": 0.4946,
"mean_token_accuracy": 0.8306501191109419,
"num_tokens": 92492584.0,
"step": 215
},
{
"entropy": 0.437835693359375,
"epoch": 0.8372093023255814,
"grad_norm": 0.7581284379435382,
"learning_rate": 1.7301379386534056e-05,
"loss": 0.4727,
"mean_token_accuracy": 0.8387518906965852,
"num_tokens": 92924009.0,
"step": 216
},
{
"entropy": 0.438751220703125,
"epoch": 0.8410852713178295,
"grad_norm": 0.8993405013438669,
"learning_rate": 1.7272106662911972e-05,
"loss": 0.4799,
"mean_token_accuracy": 0.8359887674450874,
"num_tokens": 93371957.0,
"step": 217
},
{
"entropy": 0.452850341796875,
"epoch": 0.8449612403100775,
"grad_norm": 0.724479689279493,
"learning_rate": 1.7242701082227275e-05,
"loss": 0.4773,
"mean_token_accuracy": 0.8368725245818496,
"num_tokens": 93780438.0,
"step": 218
},
{
"entropy": 0.44256591796875,
"epoch": 0.8488372093023255,
"grad_norm": 0.8311485238950603,
"learning_rate": 1.721316318170242e-05,
"loss": 0.4933,
"mean_token_accuracy": 0.8339337343350053,
"num_tokens": 94225581.0,
"step": 219
},
{
"entropy": 0.43878173828125,
"epoch": 0.8527131782945736,
"grad_norm": 0.77900635570311,
"learning_rate": 1.7183493500977277e-05,
"loss": 0.4739,
"mean_token_accuracy": 0.8379270052537322,
"num_tokens": 94658592.0,
"step": 220
},
{
"entropy": 0.4503173828125,
"epoch": 0.8565891472868217,
"grad_norm": 0.7732389296731101,
"learning_rate": 1.715369258209927e-05,
"loss": 0.4706,
"mean_token_accuracy": 0.8392182057723403,
"num_tokens": 95083295.0,
"step": 221
},
{
"entropy": 0.445404052734375,
"epoch": 0.8604651162790697,
"grad_norm": 0.9115352102726256,
"learning_rate": 1.712376096951345e-05,
"loss": 0.4731,
"mean_token_accuracy": 0.8384861033409834,
"num_tokens": 95521036.0,
"step": 222
},
{
"entropy": 0.453369140625,
"epoch": 0.8643410852713178,
"grad_norm": 3.654039788029253,
"learning_rate": 1.709369921005258e-05,
"loss": 0.4775,
"mean_token_accuracy": 0.8348365603014827,
"num_tokens": 95942840.0,
"step": 223
},
{
"entropy": 0.445953369140625,
"epoch": 0.8682170542635659,
"grad_norm": 0.8876473841192518,
"learning_rate": 1.7063507852927113e-05,
"loss": 0.4853,
"mean_token_accuracy": 0.8355412427335978,
"num_tokens": 96379381.0,
"step": 224
},
{
"entropy": 0.44903564453125,
"epoch": 0.872093023255814,
"grad_norm": 0.8029328363551933,
"learning_rate": 1.7033187449715195e-05,
"loss": 0.491,
"mean_token_accuracy": 0.8344325283542275,
"num_tokens": 96810978.0,
"step": 225
},
{
"entropy": 0.443756103515625,
"epoch": 0.875968992248062,
"grad_norm": 0.7943668108623076,
"learning_rate": 1.700273855435255e-05,
"loss": 0.474,
"mean_token_accuracy": 0.8393369819968939,
"num_tokens": 97230264.0,
"step": 226
},
{
"entropy": 0.439453125,
"epoch": 0.8798449612403101,
"grad_norm": 0.8335982186316193,
"learning_rate": 1.697216172312238e-05,
"loss": 0.4757,
"mean_token_accuracy": 0.8387380233034492,
"num_tokens": 97669092.0,
"step": 227
},
{
"entropy": 0.438995361328125,
"epoch": 0.8837209302325582,
"grad_norm": 0.8313343101425054,
"learning_rate": 1.6941457514645207e-05,
"loss": 0.4709,
"mean_token_accuracy": 0.8384785000234842,
"num_tokens": 98083117.0,
"step": 228
},
{
"entropy": 0.440704345703125,
"epoch": 0.8875968992248062,
"grad_norm": 0.7332959697247916,
"learning_rate": 1.691062648986865e-05,
"loss": 0.4656,
"mean_token_accuracy": 0.8408916248008609,
"num_tokens": 98508532.0,
"step": 229
},
{
"entropy": 0.438232421875,
"epoch": 0.8914728682170543,
"grad_norm": 0.846162571369785,
"learning_rate": 1.6879669212057187e-05,
"loss": 0.4721,
"mean_token_accuracy": 0.8412652369588614,
"num_tokens": 98949168.0,
"step": 230
},
{
"entropy": 0.43841552734375,
"epoch": 0.8953488372093024,
"grad_norm": 0.7298784972709891,
"learning_rate": 1.684858624678188e-05,
"loss": 0.4722,
"mean_token_accuracy": 0.8387791896238923,
"num_tokens": 99376686.0,
"step": 231
},
{
"entropy": 0.44097900390625,
"epoch": 0.8992248062015504,
"grad_norm": 0.8091551233214312,
"learning_rate": 1.6817378161909995e-05,
"loss": 0.47,
"mean_token_accuracy": 0.8381591122597456,
"num_tokens": 99795406.0,
"step": 232
},
{
"entropy": 0.441375732421875,
"epoch": 0.9031007751937985,
"grad_norm": 0.7872364416163475,
"learning_rate": 1.6786045527594693e-05,
"loss": 0.4761,
"mean_token_accuracy": 0.8357131062075496,
"num_tokens": 100221423.0,
"step": 233
},
{
"entropy": 0.445556640625,
"epoch": 0.9069767441860465,
"grad_norm": 0.8082366968087289,
"learning_rate": 1.6754588916264563e-05,
"loss": 0.4641,
"mean_token_accuracy": 0.840836713090539,
"num_tokens": 100631261.0,
"step": 234
},
{
"entropy": 0.436309814453125,
"epoch": 0.9108527131782945,
"grad_norm": 0.7489337052822527,
"learning_rate": 1.672300890261317e-05,
"loss": 0.4821,
"mean_token_accuracy": 0.8357611820101738,
"num_tokens": 101073449.0,
"step": 235
},
{
"entropy": 0.44140625,
"epoch": 0.9147286821705426,
"grad_norm": 0.821237789529478,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.4803,
"mean_token_accuracy": 0.8347616344690323,
"num_tokens": 101499902.0,
"step": 236
},
{
"entropy": 0.449005126953125,
"epoch": 0.9186046511627907,
"grad_norm": 0.8551074877141371,
"learning_rate": 1.6659480978382815e-05,
"loss": 0.4908,
"mean_token_accuracy": 0.8317182743921876,
"num_tokens": 101919468.0,
"step": 237
},
{
"entropy": 0.449981689453125,
"epoch": 0.9224806201550387,
"grad_norm": 0.783108156538897,
"learning_rate": 1.662753422842123e-05,
"loss": 0.4793,
"mean_token_accuracy": 0.837918421253562,
"num_tokens": 102351125.0,
"step": 238
},
{
"entropy": 0.44158935546875,
"epoch": 0.9263565891472868,
"grad_norm": 0.7892781334018207,
"learning_rate": 1.6595466397351955e-05,
"loss": 0.4738,
"mean_token_accuracy": 0.8374282121658325,
"num_tokens": 102794918.0,
"step": 239
},
{
"entropy": 0.4407958984375,
"epoch": 0.9302325581395349,
"grad_norm": 0.7954577060406584,
"learning_rate": 1.6563278071035182e-05,
"loss": 0.4771,
"mean_token_accuracy": 0.8359366981312633,
"num_tokens": 103244596.0,
"step": 240
},
{
"entropy": 0.45074462890625,
"epoch": 0.9341085271317829,
"grad_norm": 0.7600997638428164,
"learning_rate": 1.6530969837532487e-05,
"loss": 0.4725,
"mean_token_accuracy": 0.8393061570823193,
"num_tokens": 103694943.0,
"step": 241
},
{
"entropy": 0.43280029296875,
"epoch": 0.937984496124031,
"grad_norm": 0.798802849580159,
"learning_rate": 1.6498542287096074e-05,
"loss": 0.4745,
"mean_token_accuracy": 0.8393577989190817,
"num_tokens": 104139199.0,
"step": 242
},
{
"entropy": 0.439971923828125,
"epoch": 0.9418604651162791,
"grad_norm": 0.7448764201270436,
"learning_rate": 1.6465996012157996e-05,
"loss": 0.4581,
"mean_token_accuracy": 0.8409141302108765,
"num_tokens": 104560030.0,
"step": 243
},
{
"entropy": 0.4364013671875,
"epoch": 0.9457364341085271,
"grad_norm": 0.7890873960924092,
"learning_rate": 1.6433331607319342e-05,
"loss": 0.4707,
"mean_token_accuracy": 0.8371709603816271,
"num_tokens": 104979376.0,
"step": 244
},
{
"entropy": 0.44085693359375,
"epoch": 0.9496124031007752,
"grad_norm": 0.7542687515989888,
"learning_rate": 1.640054966933935e-05,
"loss": 0.4777,
"mean_token_accuracy": 0.8365635378286242,
"num_tokens": 105405940.0,
"step": 245
},
{
"entropy": 0.4351806640625,
"epoch": 0.9534883720930233,
"grad_norm": 0.8975716299466565,
"learning_rate": 1.636765079712453e-05,
"loss": 0.4617,
"mean_token_accuracy": 0.8432385390624404,
"num_tokens": 105819745.0,
"step": 246
},
{
"entropy": 0.44439697265625,
"epoch": 0.9573643410852714,
"grad_norm": 0.8433624087080338,
"learning_rate": 1.63346355917177e-05,
"loss": 0.4717,
"mean_token_accuracy": 0.8381514484062791,
"num_tokens": 106235149.0,
"step": 247
},
{
"entropy": 0.449920654296875,
"epoch": 0.9612403100775194,
"grad_norm": 0.791209417025907,
"learning_rate": 1.6301504656287027e-05,
"loss": 0.4661,
"mean_token_accuracy": 0.8396106716245413,
"num_tokens": 106650083.0,
"step": 248
},
{
"entropy": 0.4412841796875,
"epoch": 0.9651162790697675,
"grad_norm": 0.7804784526481979,
"learning_rate": 1.626825859611499e-05,
"loss": 0.4727,
"mean_token_accuracy": 0.8388944864273071,
"num_tokens": 107072567.0,
"step": 249
},
{
"entropy": 0.43536376953125,
"epoch": 0.9689922480620154,
"grad_norm": 0.8192426752598542,
"learning_rate": 1.6234898018587336e-05,
"loss": 0.4779,
"mean_token_accuracy": 0.8374818284064531,
"num_tokens": 107523545.0,
"step": 250
},
{
"entropy": 0.43695068359375,
"epoch": 0.9728682170542635,
"grad_norm": 0.8517630233628122,
"learning_rate": 1.6201423533181965e-05,
"loss": 0.4664,
"mean_token_accuracy": 0.8392149573192,
"num_tokens": 107958397.0,
"step": 251
},
{
"entropy": 0.43719482421875,
"epoch": 0.9767441860465116,
"grad_norm": 0.798869637694991,
"learning_rate": 1.6167835751457812e-05,
"loss": 0.4617,
"mean_token_accuracy": 0.8421220034360886,
"num_tokens": 108402895.0,
"step": 252
},
{
"entropy": 0.4276123046875,
"epoch": 0.9806201550387597,
"grad_norm": 0.7966618537088541,
"learning_rate": 1.6134135287043668e-05,
"loss": 0.4668,
"mean_token_accuracy": 0.8377330722287297,
"num_tokens": 108824927.0,
"step": 253
},
{
"entropy": 0.427734375,
"epoch": 0.9844961240310077,
"grad_norm": 0.7727490730992017,
"learning_rate": 1.610032275562697e-05,
"loss": 0.4765,
"mean_token_accuracy": 0.8366042710840702,
"num_tokens": 109261228.0,
"step": 254
},
{
"entropy": 0.430694580078125,
"epoch": 0.9883720930232558,
"grad_norm": 0.8579153905672905,
"learning_rate": 1.6066398774942556e-05,
"loss": 0.4736,
"mean_token_accuracy": 0.8386273989453912,
"num_tokens": 109702965.0,
"step": 255
},
{
"entropy": 0.43243408203125,
"epoch": 0.9922480620155039,
"grad_norm": 0.8076432536420048,
"learning_rate": 1.6032363964761363e-05,
"loss": 0.4821,
"mean_token_accuracy": 0.8358757542446256,
"num_tokens": 110136786.0,
"step": 256
},
{
"entropy": 0.432037353515625,
"epoch": 0.9961240310077519,
"grad_norm": 0.7604697931060647,
"learning_rate": 1.599821894687914e-05,
"loss": 0.4734,
"mean_token_accuracy": 0.8369957143440843,
"num_tokens": 110574894.0,
"step": 257
},
{
"entropy": 0.431243896484375,
"epoch": 1.0,
"grad_norm": 0.8716310508142131,
"learning_rate": 1.5963964345105038e-05,
"loss": 0.4667,
"mean_token_accuracy": 0.8397450698539615,
"num_tokens": 111005104.0,
"step": 258
},
{
"entropy": 0.440673828125,
"epoch": 1.003875968992248,
"grad_norm": 0.8235418343380135,
"learning_rate": 1.592960078525026e-05,
"loss": 0.4346,
"mean_token_accuracy": 0.8489210112020373,
"num_tokens": 111416103.0,
"step": 259
},
{
"entropy": 0.4296875,
"epoch": 1.0077519379844961,
"grad_norm": 0.7597015966730314,
"learning_rate": 1.58951288951166e-05,
"loss": 0.4534,
"mean_token_accuracy": 0.8417296558618546,
"num_tokens": 111861275.0,
"step": 260
},
{
"entropy": 0.43377685546875,
"epoch": 1.0116279069767442,
"grad_norm": 0.822573314981198,
"learning_rate": 1.5860549304484986e-05,
"loss": 0.4418,
"mean_token_accuracy": 0.8477731151506305,
"num_tokens": 112300406.0,
"step": 261
},
{
"entropy": 0.42333984375,
"epoch": 1.0155038759689923,
"grad_norm": 0.757456488834231,
"learning_rate": 1.5825862645103962e-05,
"loss": 0.4334,
"mean_token_accuracy": 0.8519468028098345,
"num_tokens": 112738254.0,
"step": 262
},
{
"entropy": 0.4271240234375,
"epoch": 1.0193798449612403,
"grad_norm": 0.7849441545164026,
"learning_rate": 1.579106955067817e-05,
"loss": 0.4239,
"mean_token_accuracy": 0.8551452234387398,
"num_tokens": 113165687.0,
"step": 263
},
{
"entropy": 0.415191650390625,
"epoch": 1.0232558139534884,
"grad_norm": 0.7790501279119509,
"learning_rate": 1.575617065685674e-05,
"loss": 0.449,
"mean_token_accuracy": 0.8445991091430187,
"num_tokens": 113612143.0,
"step": 264
},
{
"entropy": 0.415740966796875,
"epoch": 1.0271317829457365,
"grad_norm": 0.8396079499552707,
"learning_rate": 1.5721166601221697e-05,
"loss": 0.433,
"mean_token_accuracy": 0.848967888392508,
"num_tokens": 114055171.0,
"step": 265
},
{
"entropy": 0.4189453125,
"epoch": 1.0310077519379846,
"grad_norm": 0.7568866287481192,
"learning_rate": 1.5686058023276324e-05,
"loss": 0.4383,
"mean_token_accuracy": 0.8488096483051777,
"num_tokens": 114488971.0,
"step": 266
},
{
"entropy": 0.43145751953125,
"epoch": 1.0348837209302326,
"grad_norm": 0.7881343959390121,
"learning_rate": 1.565084556443345e-05,
"loss": 0.434,
"mean_token_accuracy": 0.8480666261166334,
"num_tokens": 114894217.0,
"step": 267
},
{
"entropy": 0.422027587890625,
"epoch": 1.0387596899224807,
"grad_norm": 0.8019993519205597,
"learning_rate": 1.561552986800375e-05,
"loss": 0.4333,
"mean_token_accuracy": 0.8492436576634645,
"num_tokens": 115316721.0,
"step": 268
},
{
"entropy": 0.4254150390625,
"epoch": 1.0426356589147288,
"grad_norm": 0.7433123503993253,
"learning_rate": 1.558011157918399e-05,
"loss": 0.4285,
"mean_token_accuracy": 0.8507828311994672,
"num_tokens": 115748926.0,
"step": 269
},
{
"entropy": 0.43426513671875,
"epoch": 1.0465116279069768,
"grad_norm": 0.7717661614873045,
"learning_rate": 1.554459134504523e-05,
"loss": 0.4321,
"mean_token_accuracy": 0.8468701997771859,
"num_tokens": 116164117.0,
"step": 270
},
{
"entropy": 0.425567626953125,
"epoch": 1.050387596899225,
"grad_norm": 0.7108056865458328,
"learning_rate": 1.5508969814521026e-05,
"loss": 0.4343,
"mean_token_accuracy": 0.848011078312993,
"num_tokens": 116600627.0,
"step": 271
},
{
"entropy": 0.42022705078125,
"epoch": 1.054263565891473,
"grad_norm": 0.7587295229376584,
"learning_rate": 1.5473247638395547e-05,
"loss": 0.4345,
"mean_token_accuracy": 0.8474989645183086,
"num_tokens": 117032055.0,
"step": 272
},
{
"entropy": 0.4144287109375,
"epoch": 1.058139534883721,
"grad_norm": 0.7514092686548347,
"learning_rate": 1.54374254692917e-05,
"loss": 0.4314,
"mean_token_accuracy": 0.8491556569933891,
"num_tokens": 117471178.0,
"step": 273
},
{
"entropy": 0.421173095703125,
"epoch": 1.062015503875969,
"grad_norm": 0.8051438236503431,
"learning_rate": 1.5401503961659202e-05,
"loss": 0.442,
"mean_token_accuracy": 0.8469699621200562,
"num_tokens": 117906370.0,
"step": 274
},
{
"entropy": 0.42413330078125,
"epoch": 1.0658914728682172,
"grad_norm": 0.7607882516236013,
"learning_rate": 1.536548377176263e-05,
"loss": 0.4375,
"mean_token_accuracy": 0.8471976118162274,
"num_tokens": 118336719.0,
"step": 275
},
{
"entropy": 0.427337646484375,
"epoch": 1.069767441860465,
"grad_norm": 0.7169629382848399,
"learning_rate": 1.5329365557669427e-05,
"loss": 0.4181,
"mean_token_accuracy": 0.8526374585926533,
"num_tokens": 118735169.0,
"step": 276
},
{
"entropy": 0.42352294921875,
"epoch": 1.073643410852713,
"grad_norm": 0.753541645601325,
"learning_rate": 1.5293149979237875e-05,
"loss": 0.4364,
"mean_token_accuracy": 0.8475749678909779,
"num_tokens": 119165398.0,
"step": 277
},
{
"entropy": 0.431304931640625,
"epoch": 1.0775193798449612,
"grad_norm": 0.7376279482610961,
"learning_rate": 1.5256837698105047e-05,
"loss": 0.4393,
"mean_token_accuracy": 0.8488104958087206,
"num_tokens": 119597939.0,
"step": 278
},
{
"entropy": 0.4210205078125,
"epoch": 1.0813953488372092,
"grad_norm": 0.707931634848308,
"learning_rate": 1.5220429377674724e-05,
"loss": 0.4258,
"mean_token_accuracy": 0.8488586442545056,
"num_tokens": 120043390.0,
"step": 279
},
{
"entropy": 0.426422119140625,
"epoch": 1.0852713178294573,
"grad_norm": 0.7629223961734812,
"learning_rate": 1.5183925683105254e-05,
"loss": 0.4269,
"mean_token_accuracy": 0.8513823468238115,
"num_tokens": 120458989.0,
"step": 280
},
{
"entropy": 0.426177978515625,
"epoch": 1.0891472868217054,
"grad_norm": 0.7895833351493825,
"learning_rate": 1.5147327281297421e-05,
"loss": 0.4273,
"mean_token_accuracy": 0.8519961144775152,
"num_tokens": 120880842.0,
"step": 281
},
{
"entropy": 0.4229736328125,
"epoch": 1.0930232558139534,
"grad_norm": 0.756105261455271,
"learning_rate": 1.5110634840882258e-05,
"loss": 0.4262,
"mean_token_accuracy": 0.8532844102010131,
"num_tokens": 121310709.0,
"step": 282
},
{
"entropy": 0.416046142578125,
"epoch": 1.0968992248062015,
"grad_norm": 0.7357203616832968,
"learning_rate": 1.5073849032208823e-05,
"loss": 0.4352,
"mean_token_accuracy": 0.8488708259537816,
"num_tokens": 121755899.0,
"step": 283
},
{
"entropy": 0.4189453125,
"epoch": 1.1007751937984496,
"grad_norm": 0.7677030953711179,
"learning_rate": 1.5036970527331955e-05,
"loss": 0.4194,
"mean_token_accuracy": 0.8525788122788072,
"num_tokens": 122197116.0,
"step": 284
},
{
"entropy": 0.430389404296875,
"epoch": 1.1046511627906976,
"grad_norm": 0.7113451501233499,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.4326,
"mean_token_accuracy": 0.8495708471164107,
"num_tokens": 122612155.0,
"step": 285
},
{
"entropy": 0.41802978515625,
"epoch": 1.1085271317829457,
"grad_norm": 0.7164644043332025,
"learning_rate": 1.4962938125642504e-05,
"loss": 0.4309,
"mean_token_accuracy": 0.851562624797225,
"num_tokens": 123055956.0,
"step": 286
},
{
"entropy": 0.42437744140625,
"epoch": 1.1124031007751938,
"grad_norm": 0.78372528910054,
"learning_rate": 1.4925785581357852e-05,
"loss": 0.4279,
"mean_token_accuracy": 0.8492834325879812,
"num_tokens": 123478026.0,
"step": 287
},
{
"entropy": 0.421844482421875,
"epoch": 1.1162790697674418,
"grad_norm": 0.6861426810283858,
"learning_rate": 1.4888543045900938e-05,
"loss": 0.4465,
"mean_token_accuracy": 0.8457562746480107,
"num_tokens": 123922207.0,
"step": 288
},
{
"entropy": 0.427825927734375,
"epoch": 1.12015503875969,
"grad_norm": 0.6763098044720579,
"learning_rate": 1.485121119967072e-05,
"loss": 0.432,
"mean_token_accuracy": 0.8471246156841516,
"num_tokens": 124362582.0,
"step": 289
},
{
"entropy": 0.42303466796875,
"epoch": 1.124031007751938,
"grad_norm": 0.7290088686941348,
"learning_rate": 1.4813790724697832e-05,
"loss": 0.4495,
"mean_token_accuracy": 0.845588430762291,
"num_tokens": 124806716.0,
"step": 290
},
{
"entropy": 0.4210205078125,
"epoch": 1.127906976744186,
"grad_norm": 0.6888112492696885,
"learning_rate": 1.4776282304632078e-05,
"loss": 0.4378,
"mean_token_accuracy": 0.8481010003015399,
"num_tokens": 125233523.0,
"step": 291
},
{
"entropy": 0.42071533203125,
"epoch": 1.1317829457364341,
"grad_norm": 0.698153653677859,
"learning_rate": 1.4738686624729987e-05,
"loss": 0.4307,
"mean_token_accuracy": 0.8496177345514297,
"num_tokens": 125679583.0,
"step": 292
},
{
"entropy": 0.417327880859375,
"epoch": 1.1356589147286822,
"grad_norm": 0.7783000500288277,
"learning_rate": 1.4701004371842264e-05,
"loss": 0.4179,
"mean_token_accuracy": 0.8545960262417793,
"num_tokens": 126126243.0,
"step": 293
},
{
"entropy": 0.417816162109375,
"epoch": 1.1395348837209303,
"grad_norm": 0.7409730122451095,
"learning_rate": 1.4663236234401253e-05,
"loss": 0.4362,
"mean_token_accuracy": 0.8477563932538033,
"num_tokens": 126566551.0,
"step": 294
},
{
"entropy": 0.418304443359375,
"epoch": 1.1434108527131783,
"grad_norm": 0.7021480114616296,
"learning_rate": 1.4625382902408356e-05,
"loss": 0.4321,
"mean_token_accuracy": 0.8495519608259201,
"num_tokens": 127004749.0,
"step": 295
},
{
"entropy": 0.415771484375,
"epoch": 1.1472868217054264,
"grad_norm": 0.7175857859009526,
"learning_rate": 1.4587445067421429e-05,
"loss": 0.4342,
"mean_token_accuracy": 0.8500601844862103,
"num_tokens": 127436381.0,
"step": 296
},
{
"entropy": 0.420257568359375,
"epoch": 1.1511627906976745,
"grad_norm": 0.7229560006972757,
"learning_rate": 1.4549423422542148e-05,
"loss": 0.4157,
"mean_token_accuracy": 0.8538505714386702,
"num_tokens": 127851171.0,
"step": 297
},
{
"entropy": 0.421539306640625,
"epoch": 1.1550387596899225,
"grad_norm": 0.6985006915852251,
"learning_rate": 1.4511318662403347e-05,
"loss": 0.4274,
"mean_token_accuracy": 0.8511299481615424,
"num_tokens": 128268747.0,
"step": 298
},
{
"entropy": 0.417510986328125,
"epoch": 1.1589147286821706,
"grad_norm": 0.7072433737368383,
"learning_rate": 1.4473131483156326e-05,
"loss": 0.4361,
"mean_token_accuracy": 0.8500646986067295,
"num_tokens": 128707759.0,
"step": 299
},
{
"entropy": 0.423553466796875,
"epoch": 1.1627906976744187,
"grad_norm": 0.7755469923778686,
"learning_rate": 1.4434862582458136e-05,
"loss": 0.4416,
"mean_token_accuracy": 0.8463389156386256,
"num_tokens": 129142557.0,
"step": 300
},
{
"entropy": 0.421630859375,
"epoch": 1.1666666666666667,
"grad_norm": 0.7155794871533206,
"learning_rate": 1.4396512659458824e-05,
"loss": 0.4214,
"mean_token_accuracy": 0.8518092129379511,
"num_tokens": 129570205.0,
"step": 301
},
{
"entropy": 0.4237060546875,
"epoch": 1.1705426356589148,
"grad_norm": 0.7896631453072354,
"learning_rate": 1.4358082414788666e-05,
"loss": 0.4254,
"mean_token_accuracy": 0.8514489009976387,
"num_tokens": 129975368.0,
"step": 302
},
{
"entropy": 0.41925048828125,
"epoch": 1.1744186046511629,
"grad_norm": 0.7703803004356109,
"learning_rate": 1.4319572550545374e-05,
"loss": 0.4283,
"mean_token_accuracy": 0.8518984178081155,
"num_tokens": 130408915.0,
"step": 303
},
{
"entropy": 0.41986083984375,
"epoch": 1.178294573643411,
"grad_norm": 0.7534066489126126,
"learning_rate": 1.4280983770281258e-05,
"loss": 0.4161,
"mean_token_accuracy": 0.8520697662606835,
"num_tokens": 130831126.0,
"step": 304
},
{
"entropy": 0.410888671875,
"epoch": 1.1821705426356588,
"grad_norm": 0.781352178713833,
"learning_rate": 1.4242316778990373e-05,
"loss": 0.436,
"mean_token_accuracy": 0.8507404867559671,
"num_tokens": 131263278.0,
"step": 305
},
{
"entropy": 0.4180908203125,
"epoch": 1.1860465116279069,
"grad_norm": 0.8259033580922711,
"learning_rate": 1.4203572283095657e-05,
"loss": 0.4328,
"mean_token_accuracy": 0.8494941433891654,
"num_tokens": 131692010.0,
"step": 306
},
{
"entropy": 0.418975830078125,
"epoch": 1.189922480620155,
"grad_norm": 0.7865591711742697,
"learning_rate": 1.4164750990435991e-05,
"loss": 0.4349,
"mean_token_accuracy": 0.8495086506009102,
"num_tokens": 132134463.0,
"step": 307
},
{
"entropy": 0.4202880859375,
"epoch": 1.193798449612403,
"grad_norm": 0.731714021499699,
"learning_rate": 1.4125853610253306e-05,
"loss": 0.4295,
"mean_token_accuracy": 0.850966832600534,
"num_tokens": 132567775.0,
"step": 308
},
{
"entropy": 0.4229736328125,
"epoch": 1.197674418604651,
"grad_norm": 0.8413435296703591,
"learning_rate": 1.4086880853179592e-05,
"loss": 0.4306,
"mean_token_accuracy": 0.8497058739885688,
"num_tokens": 133000302.0,
"step": 309
},
{
"entropy": 0.426361083984375,
"epoch": 1.2015503875968991,
"grad_norm": 0.8120752444730676,
"learning_rate": 1.4047833431223938e-05,
"loss": 0.4334,
"mean_token_accuracy": 0.8482353119179606,
"num_tokens": 133422180.0,
"step": 310
},
{
"entropy": 0.416290283203125,
"epoch": 1.2054263565891472,
"grad_norm": 0.7944571026325243,
"learning_rate": 1.4008712057759519e-05,
"loss": 0.4295,
"mean_token_accuracy": 0.8492019288241863,
"num_tokens": 133866767.0,
"step": 311
},
{
"entropy": 0.416961669921875,
"epoch": 1.2093023255813953,
"grad_norm": 0.7212454482113841,
"learning_rate": 1.3969517447510546e-05,
"loss": 0.4333,
"mean_token_accuracy": 0.847635168582201,
"num_tokens": 134323563.0,
"step": 312
},
{
"entropy": 0.422698974609375,
"epoch": 1.2131782945736433,
"grad_norm": 0.8453314046957682,
"learning_rate": 1.3930250316539237e-05,
"loss": 0.4474,
"mean_token_accuracy": 0.8459825245663524,
"num_tokens": 134759823.0,
"step": 313
},
{
"entropy": 0.41668701171875,
"epoch": 1.2170542635658914,
"grad_norm": 0.7480620726932977,
"learning_rate": 1.3890911382232717e-05,
"loss": 0.4047,
"mean_token_accuracy": 0.8580910852178931,
"num_tokens": 135186114.0,
"step": 314
},
{
"entropy": 0.427459716796875,
"epoch": 1.2209302325581395,
"grad_norm": 0.7821854419992171,
"learning_rate": 1.3851501363289907e-05,
"loss": 0.4303,
"mean_token_accuracy": 0.8487546825781465,
"num_tokens": 135610201.0,
"step": 315
},
{
"entropy": 0.425079345703125,
"epoch": 1.2248062015503876,
"grad_norm": 0.7421312963343092,
"learning_rate": 1.3812020979708418e-05,
"loss": 0.4238,
"mean_token_accuracy": 0.8516996335238218,
"num_tokens": 136031193.0,
"step": 316
},
{
"entropy": 0.425933837890625,
"epoch": 1.2286821705426356,
"grad_norm": 0.6956434644749807,
"learning_rate": 1.3772470952771364e-05,
"loss": 0.4199,
"mean_token_accuracy": 0.8497325489297509,
"num_tokens": 136451557.0,
"step": 317
},
{
"entropy": 0.423492431640625,
"epoch": 1.2325581395348837,
"grad_norm": 0.7325561302834672,
"learning_rate": 1.3732852005034212e-05,
"loss": 0.4222,
"mean_token_accuracy": 0.8517925990745425,
"num_tokens": 136877337.0,
"step": 318
},
{
"entropy": 0.415557861328125,
"epoch": 1.2364341085271318,
"grad_norm": 0.7118813272913093,
"learning_rate": 1.3693164860311565e-05,
"loss": 0.4218,
"mean_token_accuracy": 0.8540134867653251,
"num_tokens": 137321216.0,
"step": 319
},
{
"entropy": 0.418212890625,
"epoch": 1.2403100775193798,
"grad_norm": 0.7988600048796505,
"learning_rate": 1.3653410243663953e-05,
"loss": 0.4321,
"mean_token_accuracy": 0.8492146460339427,
"num_tokens": 137772473.0,
"step": 320
},
{
"entropy": 0.423370361328125,
"epoch": 1.244186046511628,
"grad_norm": 0.7627685472813416,
"learning_rate": 1.3613588881384565e-05,
"loss": 0.4234,
"mean_token_accuracy": 0.8511487627401948,
"num_tokens": 138194574.0,
"step": 321
},
{
"entropy": 0.43115234375,
"epoch": 1.248062015503876,
"grad_norm": 0.7255243137072226,
"learning_rate": 1.3573701500986012e-05,
"loss": 0.4109,
"mean_token_accuracy": 0.8555542044341564,
"num_tokens": 138603729.0,
"step": 322
},
{
"entropy": 0.42938232421875,
"epoch": 1.251937984496124,
"grad_norm": 0.720487935434389,
"learning_rate": 1.3533748831186992e-05,
"loss": 0.4273,
"mean_token_accuracy": 0.8499365914613008,
"num_tokens": 139009030.0,
"step": 323
},
{
"entropy": 0.425750732421875,
"epoch": 1.255813953488372,
"grad_norm": 0.7319411749735341,
"learning_rate": 1.3493731601899023e-05,
"loss": 0.4205,
"mean_token_accuracy": 0.8510079709812999,
"num_tokens": 139433412.0,
"step": 324
},
{
"entropy": 0.42303466796875,
"epoch": 1.2596899224806202,
"grad_norm": 0.7403000017923702,
"learning_rate": 1.3453650544213078e-05,
"loss": 0.4102,
"mean_token_accuracy": 0.8584857322275639,
"num_tokens": 139845008.0,
"step": 325
},
{
"entropy": 0.428375244140625,
"epoch": 1.2635658914728682,
"grad_norm": 0.7414567866770928,
"learning_rate": 1.3413506390386233e-05,
"loss": 0.4238,
"mean_token_accuracy": 0.8523871023207903,
"num_tokens": 140279966.0,
"step": 326
},
{
"entropy": 0.42449951171875,
"epoch": 1.2674418604651163,
"grad_norm": 0.706884408701634,
"learning_rate": 1.3373299873828303e-05,
"loss": 0.4064,
"mean_token_accuracy": 0.8562332447618246,
"num_tokens": 140702025.0,
"step": 327
},
{
"entropy": 0.42193603515625,
"epoch": 1.2713178294573644,
"grad_norm": 0.7172204794469905,
"learning_rate": 1.333303172908842e-05,
"loss": 0.4188,
"mean_token_accuracy": 0.8553838301450014,
"num_tokens": 141123133.0,
"step": 328
},
{
"entropy": 0.42510986328125,
"epoch": 1.2751937984496124,
"grad_norm": 0.7709096687308097,
"learning_rate": 1.3292702691841637e-05,
"loss": 0.4302,
"mean_token_accuracy": 0.8494652854278684,
"num_tokens": 141553515.0,
"step": 329
},
{
"entropy": 0.430084228515625,
"epoch": 1.2790697674418605,
"grad_norm": 0.7368366945349477,
"learning_rate": 1.3252313498875473e-05,
"loss": 0.4287,
"mean_token_accuracy": 0.8533294908702374,
"num_tokens": 141974216.0,
"step": 330
},
{
"entropy": 0.4251708984375,
"epoch": 1.2829457364341086,
"grad_norm": 0.7185600809351362,
"learning_rate": 1.3211864888076458e-05,
"loss": 0.4353,
"mean_token_accuracy": 0.849311051890254,
"num_tokens": 142392154.0,
"step": 331
},
{
"entropy": 0.430389404296875,
"epoch": 1.2868217054263567,
"grad_norm": 0.7422111929493115,
"learning_rate": 1.3171357598416642e-05,
"loss": 0.4162,
"mean_token_accuracy": 0.8547528050839901,
"num_tokens": 142813018.0,
"step": 332
},
{
"entropy": 0.424835205078125,
"epoch": 1.2906976744186047,
"grad_norm": 0.7133086085038155,
"learning_rate": 1.313079236994012e-05,
"loss": 0.4262,
"mean_token_accuracy": 0.8504309616982937,
"num_tokens": 143242752.0,
"step": 333
},
{
"entropy": 0.423187255859375,
"epoch": 1.2945736434108528,
"grad_norm": 0.6789452214744285,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.4086,
"mean_token_accuracy": 0.8557021962478757,
"num_tokens": 143688265.0,
"step": 334
},
{
"entropy": 0.424835205078125,
"epoch": 1.2984496124031009,
"grad_norm": 0.6735243128953131,
"learning_rate": 1.3049491061992274e-05,
"loss": 0.4102,
"mean_token_accuracy": 0.8555810833349824,
"num_tokens": 144113299.0,
"step": 335
},
{
"entropy": 0.425140380859375,
"epoch": 1.302325581395349,
"grad_norm": 0.6878761206422891,
"learning_rate": 1.3008756467847486e-05,
"loss": 0.4102,
"mean_token_accuracy": 0.8577935267239809,
"num_tokens": 144547082.0,
"step": 336
},
{
"entropy": 0.4267578125,
"epoch": 1.306201550387597,
"grad_norm": 0.735956052863927,
"learning_rate": 1.2967966905511906e-05,
"loss": 0.4171,
"mean_token_accuracy": 0.8548558866605163,
"num_tokens": 144962488.0,
"step": 337
},
{
"entropy": 0.416900634765625,
"epoch": 1.310077519379845,
"grad_norm": 0.6868378061745052,
"learning_rate": 1.2927123120186584e-05,
"loss": 0.4103,
"mean_token_accuracy": 0.8534078542143106,
"num_tokens": 145397748.0,
"step": 338
},
{
"entropy": 0.421356201171875,
"epoch": 1.3139534883720931,
"grad_norm": 0.7224003448379767,
"learning_rate": 1.2886225858063175e-05,
"loss": 0.4273,
"mean_token_accuracy": 0.8498321361839771,
"num_tokens": 145835597.0,
"step": 339
},
{
"entropy": 0.421783447265625,
"epoch": 1.3178294573643412,
"grad_norm": 0.6466091718297071,
"learning_rate": 1.2845275866310325e-05,
"loss": 0.417,
"mean_token_accuracy": 0.8557677045464516,
"num_tokens": 146284089.0,
"step": 340
},
{
"entropy": 0.42236328125,
"epoch": 1.3217054263565893,
"grad_norm": 0.7153551768847246,
"learning_rate": 1.2804273893060028e-05,
"loss": 0.4207,
"mean_token_accuracy": 0.8512698579579592,
"num_tokens": 146739438.0,
"step": 341
},
{
"entropy": 0.429473876953125,
"epoch": 1.3255813953488373,
"grad_norm": 0.6659058031600372,
"learning_rate": 1.2763220687393942e-05,
"loss": 0.417,
"mean_token_accuracy": 0.8553942264989018,
"num_tokens": 147142580.0,
"step": 342
},
{
"entropy": 0.424896240234375,
"epoch": 1.3294573643410852,
"grad_norm": 0.6617647509435118,
"learning_rate": 1.2722116999329712e-05,
"loss": 0.4164,
"mean_token_accuracy": 0.8552069365978241,
"num_tokens": 147565462.0,
"step": 343
},
{
"entropy": 0.42559814453125,
"epoch": 1.3333333333333333,
"grad_norm": 0.6976673140780362,
"learning_rate": 1.2680963579807268e-05,
"loss": 0.4194,
"mean_token_accuracy": 0.8545995801687241,
"num_tokens": 148000343.0,
"step": 344
},
{
"entropy": 0.426544189453125,
"epoch": 1.3372093023255813,
"grad_norm": 0.6976500715660131,
"learning_rate": 1.2639761180675098e-05,
"loss": 0.4193,
"mean_token_accuracy": 0.8548473976552486,
"num_tokens": 148450057.0,
"step": 345
},
{
"entropy": 0.4208984375,
"epoch": 1.3410852713178294,
"grad_norm": 0.6998064472558588,
"learning_rate": 1.259851055467653e-05,
"loss": 0.4253,
"mean_token_accuracy": 0.8499715300276875,
"num_tokens": 148898991.0,
"step": 346
},
{
"entropy": 0.4200439453125,
"epoch": 1.3449612403100775,
"grad_norm": 0.6567142053816986,
"learning_rate": 1.2557212455435958e-05,
"loss": 0.4107,
"mean_token_accuracy": 0.8560676537454128,
"num_tokens": 149332404.0,
"step": 347
},
{
"entropy": 0.428253173828125,
"epoch": 1.3488372093023255,
"grad_norm": 0.6246728679193004,
"learning_rate": 1.2515867637445088e-05,
"loss": 0.4091,
"mean_token_accuracy": 0.855487022548914,
"num_tokens": 149739226.0,
"step": 348
},
{
"entropy": 0.42034912109375,
"epoch": 1.3527131782945736,
"grad_norm": 0.6872162569734706,
"learning_rate": 1.2474476856049145e-05,
"loss": 0.4161,
"mean_token_accuracy": 0.8547687204554677,
"num_tokens": 150201075.0,
"step": 349
},
{
"entropy": 0.419647216796875,
"epoch": 1.3565891472868217,
"grad_norm": 0.6444775302707234,
"learning_rate": 1.2433040867433087e-05,
"loss": 0.4294,
"mean_token_accuracy": 0.8485890505835414,
"num_tokens": 150646168.0,
"step": 350
},
{
"entropy": 0.425750732421875,
"epoch": 1.3604651162790697,
"grad_norm": 0.6872503649025473,
"learning_rate": 1.2391560428607776e-05,
"loss": 0.4113,
"mean_token_accuracy": 0.8564107986167073,
"num_tokens": 151063971.0,
"step": 351
},
{
"entropy": 0.420989990234375,
"epoch": 1.3643410852713178,
"grad_norm": 0.7340706794906026,
"learning_rate": 1.2350036297396153e-05,
"loss": 0.4216,
"mean_token_accuracy": 0.8534470964223146,
"num_tokens": 151491366.0,
"step": 352
},
{
"entropy": 0.420074462890625,
"epoch": 1.3682170542635659,
"grad_norm": 0.6810651675173917,
"learning_rate": 1.2308469232419387e-05,
"loss": 0.4149,
"mean_token_accuracy": 0.8558577708899975,
"num_tokens": 151931328.0,
"step": 353
},
{
"entropy": 0.410614013671875,
"epoch": 1.372093023255814,
"grad_norm": 0.6681093420635835,
"learning_rate": 1.2266859993083037e-05,
"loss": 0.4104,
"mean_token_accuracy": 0.8580976203083992,
"num_tokens": 152372760.0,
"step": 354
},
{
"entropy": 0.419769287109375,
"epoch": 1.375968992248062,
"grad_norm": 0.7329839399462965,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.4064,
"mean_token_accuracy": 0.857388780452311,
"num_tokens": 152786975.0,
"step": 355
},
{
"entropy": 0.426300048828125,
"epoch": 1.37984496124031,
"grad_norm": 0.7332044687155054,
"learning_rate": 1.2183518032792376e-05,
"loss": 0.41,
"mean_token_accuracy": 0.8559472924098372,
"num_tokens": 153199532.0,
"step": 356
},
{
"entropy": 0.42572021484375,
"epoch": 1.3837209302325582,
"grad_norm": 0.6799405186287775,
"learning_rate": 1.2141786834446105e-05,
"loss": 0.4188,
"mean_token_accuracy": 0.8518085544928908,
"num_tokens": 153632347.0,
"step": 357
},
{
"entropy": 0.426361083984375,
"epoch": 1.3875968992248062,
"grad_norm": 0.6449750458569437,
"learning_rate": 1.2100016506928494e-05,
"loss": 0.4199,
"mean_token_accuracy": 0.8516499819234014,
"num_tokens": 154065344.0,
"step": 358
},
{
"entropy": 0.42437744140625,
"epoch": 1.3914728682170543,
"grad_norm": 0.6763480475833509,
"learning_rate": 1.2058207813358587e-05,
"loss": 0.4083,
"mean_token_accuracy": 0.8591054771095514,
"num_tokens": 154482988.0,
"step": 359
},
{
"entropy": 0.41851806640625,
"epoch": 1.3953488372093024,
"grad_norm": 0.7346435605404793,
"learning_rate": 1.2016361517556334e-05,
"loss": 0.4244,
"mean_token_accuracy": 0.8513177242130041,
"num_tokens": 154920519.0,
"step": 360
},
{
"entropy": 0.42596435546875,
"epoch": 1.3992248062015504,
"grad_norm": 0.6715875218591029,
"learning_rate": 1.1974478384028672e-05,
"loss": 0.4163,
"mean_token_accuracy": 0.8543539261445403,
"num_tokens": 155349717.0,
"step": 361
},
{
"entropy": 0.4283447265625,
"epoch": 1.4031007751937985,
"grad_norm": 0.6724081825580828,
"learning_rate": 1.1932559177955533e-05,
"loss": 0.4079,
"mean_token_accuracy": 0.8556110095232725,
"num_tokens": 155763526.0,
"step": 362
},
{
"entropy": 0.420196533203125,
"epoch": 1.4069767441860466,
"grad_norm": 0.6752640946014242,
"learning_rate": 1.1890604665175878e-05,
"loss": 0.4217,
"mean_token_accuracy": 0.8532299809157848,
"num_tokens": 156181266.0,
"step": 363
},
{
"entropy": 0.421661376953125,
"epoch": 1.4108527131782946,
"grad_norm": 0.6672888444227608,
"learning_rate": 1.1848615612173689e-05,
"loss": 0.4116,
"mean_token_accuracy": 0.8545421287417412,
"num_tokens": 156600191.0,
"step": 364
},
{
"entropy": 0.42340087890625,
"epoch": 1.4147286821705427,
"grad_norm": 0.6532653684313114,
"learning_rate": 1.1806592786063991e-05,
"loss": 0.4064,
"mean_token_accuracy": 0.8569002998992801,
"num_tokens": 157038509.0,
"step": 365
},
{
"entropy": 0.412811279296875,
"epoch": 1.4186046511627908,
"grad_norm": 0.6533143053038322,
"learning_rate": 1.1764536954578817e-05,
"loss": 0.409,
"mean_token_accuracy": 0.8555587902665138,
"num_tokens": 157468885.0,
"step": 366
},
{
"entropy": 0.414794921875,
"epoch": 1.4224806201550386,
"grad_norm": 0.626757449703669,
"learning_rate": 1.172244888605319e-05,
"loss": 0.4028,
"mean_token_accuracy": 0.8573278188705444,
"num_tokens": 157897065.0,
"step": 367
},
{
"entropy": 0.41864013671875,
"epoch": 1.4263565891472867,
"grad_norm": 0.6747328666294428,
"learning_rate": 1.1680329349411086e-05,
"loss": 0.4079,
"mean_token_accuracy": 0.8546053608879447,
"num_tokens": 158329094.0,
"step": 368
},
{
"entropy": 0.418060302734375,
"epoch": 1.4302325581395348,
"grad_norm": 0.6724341191502806,
"learning_rate": 1.1638179114151378e-05,
"loss": 0.4238,
"mean_token_accuracy": 0.8480729442089796,
"num_tokens": 158773650.0,
"step": 369
},
{
"entropy": 0.41351318359375,
"epoch": 1.4341085271317828,
"grad_norm": 0.6191751963920147,
"learning_rate": 1.1595998950333794e-05,
"loss": 0.4078,
"mean_token_accuracy": 0.8567248536273837,
"num_tokens": 159212465.0,
"step": 370
},
{
"entropy": 0.422271728515625,
"epoch": 1.437984496124031,
"grad_norm": 0.662553873559576,
"learning_rate": 1.1553789628564832e-05,
"loss": 0.4066,
"mean_token_accuracy": 0.8549947030842304,
"num_tokens": 159612271.0,
"step": 371
},
{
"entropy": 0.415802001953125,
"epoch": 1.441860465116279,
"grad_norm": 0.6596280602581531,
"learning_rate": 1.151155191998369e-05,
"loss": 0.418,
"mean_token_accuracy": 0.8543671853840351,
"num_tokens": 160056420.0,
"step": 372
},
{
"entropy": 0.41851806640625,
"epoch": 1.445736434108527,
"grad_norm": 0.718746454796938,
"learning_rate": 1.1469286596248181e-05,
"loss": 0.4176,
"mean_token_accuracy": 0.8542827153578401,
"num_tokens": 160475081.0,
"step": 373
},
{
"entropy": 0.41009521484375,
"epoch": 1.449612403100775,
"grad_norm": 0.652059758876679,
"learning_rate": 1.1426994429520622e-05,
"loss": 0.4093,
"mean_token_accuracy": 0.8568979185074568,
"num_tokens": 160908789.0,
"step": 374
},
{
"entropy": 0.41717529296875,
"epoch": 1.4534883720930232,
"grad_norm": 0.6430873724176981,
"learning_rate": 1.138467619245374e-05,
"loss": 0.4092,
"mean_token_accuracy": 0.8570778556168079,
"num_tokens": 161339820.0,
"step": 375
},
{
"entropy": 0.413787841796875,
"epoch": 1.4573643410852712,
"grad_norm": 0.7222965583263963,
"learning_rate": 1.1342332658176556e-05,
"loss": 0.4184,
"mean_token_accuracy": 0.8542719716206193,
"num_tokens": 161777346.0,
"step": 376
},
{
"entropy": 0.419158935546875,
"epoch": 1.4612403100775193,
"grad_norm": 0.6609105273639142,
"learning_rate": 1.1299964600280247e-05,
"loss": 0.3931,
"mean_token_accuracy": 0.861978692933917,
"num_tokens": 162199267.0,
"step": 377
},
{
"entropy": 0.40478515625,
"epoch": 1.4651162790697674,
"grad_norm": 0.6412126135519434,
"learning_rate": 1.1257572792804028e-05,
"loss": 0.4063,
"mean_token_accuracy": 0.8554513454437256,
"num_tokens": 162633923.0,
"step": 378
},
{
"entropy": 0.41717529296875,
"epoch": 1.4689922480620154,
"grad_norm": 0.6876116007476988,
"learning_rate": 1.1215158010221005e-05,
"loss": 0.4069,
"mean_token_accuracy": 0.8563102921471,
"num_tokens": 163043237.0,
"step": 379
},
{
"entropy": 0.4061279296875,
"epoch": 1.4728682170542635,
"grad_norm": 0.6428579499395306,
"learning_rate": 1.1172721027424021e-05,
"loss": 0.4079,
"mean_token_accuracy": 0.858618251979351,
"num_tokens": 163486291.0,
"step": 380
},
{
"entropy": 0.419342041015625,
"epoch": 1.4767441860465116,
"grad_norm": 0.687187689490725,
"learning_rate": 1.1130262619711505e-05,
"loss": 0.4093,
"mean_token_accuracy": 0.8567131292074919,
"num_tokens": 163906930.0,
"step": 381
},
{
"entropy": 0.409820556640625,
"epoch": 1.4806201550387597,
"grad_norm": 0.6756942959039264,
"learning_rate": 1.108778356277331e-05,
"loss": 0.4137,
"mean_token_accuracy": 0.8561842953786254,
"num_tokens": 164346184.0,
"step": 382
},
{
"entropy": 0.416900634765625,
"epoch": 1.4844961240310077,
"grad_norm": 0.678878649649754,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.4142,
"mean_token_accuracy": 0.8553884662687778,
"num_tokens": 164758508.0,
"step": 383
},
{
"entropy": 0.41632080078125,
"epoch": 1.4883720930232558,
"grad_norm": 0.697760524956994,
"learning_rate": 1.1002766605851353e-05,
"loss": 0.4044,
"mean_token_accuracy": 0.8585758162662387,
"num_tokens": 165177286.0,
"step": 384
},
{
"entropy": 0.41912841796875,
"epoch": 1.4922480620155039,
"grad_norm": 0.6519093686870057,
"learning_rate": 1.0960230259076819e-05,
"loss": 0.3998,
"mean_token_accuracy": 0.859433357603848,
"num_tokens": 165601852.0,
"step": 385
},
{
"entropy": 0.40447998046875,
"epoch": 1.496124031007752,
"grad_norm": 0.6543050312276183,
"learning_rate": 1.0917676369466683e-05,
"loss": 0.4067,
"mean_token_accuracy": 0.8571837488561869,
"num_tokens": 166030463.0,
"step": 386
},
{
"entropy": 0.41229248046875,
"epoch": 1.5,
"grad_norm": 0.7035556193327832,
"learning_rate": 1.0875105714455193e-05,
"loss": 0.4074,
"mean_token_accuracy": 0.8557658046483994,
"num_tokens": 166444670.0,
"step": 387
},
{
"entropy": 0.41241455078125,
"epoch": 1.503875968992248,
"grad_norm": 0.6563472796443234,
"learning_rate": 1.0832519071782895e-05,
"loss": 0.4055,
"mean_token_accuracy": 0.8547043697908521,
"num_tokens": 166872503.0,
"step": 388
},
{
"entropy": 0.401947021484375,
"epoch": 1.5077519379844961,
"grad_norm": 0.709679816736362,
"learning_rate": 1.0789917219482413e-05,
"loss": 0.4073,
"mean_token_accuracy": 0.8544203815981746,
"num_tokens": 167320746.0,
"step": 389
},
{
"entropy": 0.410614013671875,
"epoch": 1.5116279069767442,
"grad_norm": 0.6541389657979753,
"learning_rate": 1.0747300935864245e-05,
"loss": 0.4019,
"mean_token_accuracy": 0.8587576989084482,
"num_tokens": 167759677.0,
"step": 390
},
{
"entropy": 0.41009521484375,
"epoch": 1.5155038759689923,
"grad_norm": 0.667881183522671,
"learning_rate": 1.070467099950254e-05,
"loss": 0.4053,
"mean_token_accuracy": 0.8573957877233624,
"num_tokens": 168199322.0,
"step": 391
},
{
"entropy": 0.407257080078125,
"epoch": 1.5193798449612403,
"grad_norm": 0.6633613364086678,
"learning_rate": 1.0662028189220876e-05,
"loss": 0.4057,
"mean_token_accuracy": 0.8579470701515675,
"num_tokens": 168640945.0,
"step": 392
},
{
"entropy": 0.412139892578125,
"epoch": 1.5232558139534884,
"grad_norm": 0.6831773547333138,
"learning_rate": 1.0619373284078032e-05,
"loss": 0.4084,
"mean_token_accuracy": 0.8582353731617332,
"num_tokens": 169064555.0,
"step": 393
},
{
"entropy": 0.4107666015625,
"epoch": 1.5271317829457365,
"grad_norm": 0.6948469884270619,
"learning_rate": 1.0576707063353745e-05,
"loss": 0.4125,
"mean_token_accuracy": 0.8543423097580671,
"num_tokens": 169485377.0,
"step": 394
},
{
"entropy": 0.414642333984375,
"epoch": 1.5310077519379846,
"grad_norm": 0.6926922523877129,
"learning_rate": 1.0534030306534491e-05,
"loss": 0.4149,
"mean_token_accuracy": 0.8544100457802415,
"num_tokens": 169927287.0,
"step": 395
},
{
"entropy": 0.413848876953125,
"epoch": 1.5348837209302326,
"grad_norm": 0.6813641145842185,
"learning_rate": 1.0491343793299225e-05,
"loss": 0.4093,
"mean_token_accuracy": 0.8580722212791443,
"num_tokens": 170354986.0,
"step": 396
},
{
"entropy": 0.417449951171875,
"epoch": 1.5387596899224807,
"grad_norm": 0.6807712850448645,
"learning_rate": 1.044864830350515e-05,
"loss": 0.4133,
"mean_token_accuracy": 0.8556122053414583,
"num_tokens": 170779478.0,
"step": 397
},
{
"entropy": 0.4117431640625,
"epoch": 1.5426356589147288,
"grad_norm": 0.6651144480747045,
"learning_rate": 1.040594461717347e-05,
"loss": 0.4043,
"mean_token_accuracy": 0.8557463986799121,
"num_tokens": 171202689.0,
"step": 398
},
{
"entropy": 0.409576416015625,
"epoch": 1.5465116279069768,
"grad_norm": 0.680059415481525,
"learning_rate": 1.0363233514475121e-05,
"loss": 0.4093,
"mean_token_accuracy": 0.8574331281706691,
"num_tokens": 171652048.0,
"step": 399
},
{
"entropy": 0.418182373046875,
"epoch": 1.550387596899225,
"grad_norm": 0.7290148760854265,
"learning_rate": 1.0320515775716556e-05,
"loss": 0.3907,
"mean_token_accuracy": 0.8639104012399912,
"num_tokens": 172075445.0,
"step": 400
},
{
"entropy": 0.405548095703125,
"epoch": 1.554263565891473,
"grad_norm": 0.6573849244620844,
"learning_rate": 1.027779218132543e-05,
"loss": 0.4076,
"mean_token_accuracy": 0.8560699671506882,
"num_tokens": 172531164.0,
"step": 401
},
{
"entropy": 0.415771484375,
"epoch": 1.558139534883721,
"grad_norm": 0.6954008019614183,
"learning_rate": 1.0235063511836416e-05,
"loss": 0.4133,
"mean_token_accuracy": 0.8537066699936986,
"num_tokens": 172962665.0,
"step": 402
},
{
"entropy": 0.415863037109375,
"epoch": 1.562015503875969,
"grad_norm": 0.6562796726655113,
"learning_rate": 1.0192330547876871e-05,
"loss": 0.409,
"mean_token_accuracy": 0.8552350932732224,
"num_tokens": 173377559.0,
"step": 403
},
{
"entropy": 0.415191650390625,
"epoch": 1.5658914728682172,
"grad_norm": 0.667137112397098,
"learning_rate": 1.0149594070152638e-05,
"loss": 0.3962,
"mean_token_accuracy": 0.8587388163432479,
"num_tokens": 173786515.0,
"step": 404
},
{
"entropy": 0.407806396484375,
"epoch": 1.5697674418604652,
"grad_norm": 0.6768098234173051,
"learning_rate": 1.0106854859433734e-05,
"loss": 0.397,
"mean_token_accuracy": 0.8586418740451336,
"num_tokens": 174219900.0,
"step": 405
},
{
"entropy": 0.412872314453125,
"epoch": 1.5736434108527133,
"grad_norm": 0.6885176427252887,
"learning_rate": 1.0064113696540112e-05,
"loss": 0.3971,
"mean_token_accuracy": 0.8602798972278833,
"num_tokens": 174657405.0,
"step": 406
},
{
"entropy": 0.4066162109375,
"epoch": 1.5775193798449614,
"grad_norm": 0.631994956220823,
"learning_rate": 1.0021371362327397e-05,
"loss": 0.3829,
"mean_token_accuracy": 0.8659420674666762,
"num_tokens": 175080761.0,
"step": 407
},
{
"entropy": 0.398590087890625,
"epoch": 1.5813953488372094,
"grad_norm": 0.6912443194591774,
"learning_rate": 9.978628637672604e-06,
"loss": 0.411,
"mean_token_accuracy": 0.853080808185041,
"num_tokens": 175544696.0,
"step": 408
},
{
"entropy": 0.40594482421875,
"epoch": 1.5852713178294575,
"grad_norm": 0.708091166473127,
"learning_rate": 9.93588630345989e-06,
"loss": 0.4132,
"mean_token_accuracy": 0.8563732989132404,
"num_tokens": 175983787.0,
"step": 409
},
{
"entropy": 0.41259765625,
"epoch": 1.5891472868217056,
"grad_norm": 0.6387188583839009,
"learning_rate": 9.89314514056627e-06,
"loss": 0.4068,
"mean_token_accuracy": 0.8602753495797515,
"num_tokens": 176413686.0,
"step": 410
},
{
"entropy": 0.4127197265625,
"epoch": 1.5930232558139537,
"grad_norm": 0.6592710314757873,
"learning_rate": 9.850405929847367e-06,
"loss": 0.3979,
"mean_token_accuracy": 0.8593348637223244,
"num_tokens": 176834640.0,
"step": 411
},
{
"entropy": 0.411285400390625,
"epoch": 1.5968992248062015,
"grad_norm": 0.6738215140979368,
"learning_rate": 9.80766945212313e-06,
"loss": 0.4071,
"mean_token_accuracy": 0.856034941971302,
"num_tokens": 177250198.0,
"step": 412
},
{
"entropy": 0.410552978515625,
"epoch": 1.6007751937984496,
"grad_norm": 0.7220302341014333,
"learning_rate": 9.764936488163585e-06,
"loss": 0.3955,
"mean_token_accuracy": 0.8606227496638894,
"num_tokens": 177679620.0,
"step": 413
},
{
"entropy": 0.412078857421875,
"epoch": 1.6046511627906976,
"grad_norm": 0.6909184770326399,
"learning_rate": 9.72220781867457e-06,
"loss": 0.4066,
"mean_token_accuracy": 0.8569380175322294,
"num_tokens": 178105455.0,
"step": 414
},
{
"entropy": 0.41461181640625,
"epoch": 1.6085271317829457,
"grad_norm": 0.6798184746040906,
"learning_rate": 9.67948422428345e-06,
"loss": 0.3995,
"mean_token_accuracy": 0.8596138171851635,
"num_tokens": 178517293.0,
"step": 415
},
{
"entropy": 0.405731201171875,
"epoch": 1.6124031007751938,
"grad_norm": 0.6364483551849868,
"learning_rate": 9.63676648552488e-06,
"loss": 0.3909,
"mean_token_accuracy": 0.8604221493005753,
"num_tokens": 178951290.0,
"step": 416
},
{
"entropy": 0.41064453125,
"epoch": 1.6162790697674418,
"grad_norm": 0.6417949151025134,
"learning_rate": 9.594055382826534e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.8645245768129826,
"num_tokens": 179368525.0,
"step": 417
},
{
"entropy": 0.409881591796875,
"epoch": 1.62015503875969,
"grad_norm": 0.6122486116049144,
"learning_rate": 9.551351696494854e-06,
"loss": 0.4031,
"mean_token_accuracy": 0.859037296846509,
"num_tokens": 179810881.0,
"step": 418
},
{
"entropy": 0.4073486328125,
"epoch": 1.624031007751938,
"grad_norm": 0.7193263550810518,
"learning_rate": 9.508656206700778e-06,
"loss": 0.423,
"mean_token_accuracy": 0.8512195134535432,
"num_tokens": 180256456.0,
"step": 419
},
{
"entropy": 0.4124755859375,
"epoch": 1.627906976744186,
"grad_norm": 0.6396498575313437,
"learning_rate": 9.46596969346551e-06,
"loss": 0.4117,
"mean_token_accuracy": 0.8570121200755239,
"num_tokens": 180695177.0,
"step": 420
},
{
"entropy": 0.41705322265625,
"epoch": 1.6317829457364341,
"grad_norm": 0.6192449327399918,
"learning_rate": 9.423292936646258e-06,
"loss": 0.3966,
"mean_token_accuracy": 0.8602887643501163,
"num_tokens": 181108644.0,
"step": 421
},
{
"entropy": 0.420989990234375,
"epoch": 1.6356589147286822,
"grad_norm": 0.632880768949324,
"learning_rate": 9.380626715921972e-06,
"loss": 0.3993,
"mean_token_accuracy": 0.8594755912199616,
"num_tokens": 181522964.0,
"step": 422
},
{
"entropy": 0.417572021484375,
"epoch": 1.6395348837209303,
"grad_norm": 0.6923163082611585,
"learning_rate": 9.337971810779127e-06,
"loss": 0.3936,
"mean_token_accuracy": 0.861396661028266,
"num_tokens": 181949804.0,
"step": 423
},
{
"entropy": 0.408599853515625,
"epoch": 1.6434108527131783,
"grad_norm": 0.6622239880569959,
"learning_rate": 9.29532900049746e-06,
"loss": 0.3942,
"mean_token_accuracy": 0.8627764778211713,
"num_tokens": 182378755.0,
"step": 424
},
{
"entropy": 0.412200927734375,
"epoch": 1.6472868217054264,
"grad_norm": 0.6279180554692346,
"learning_rate": 9.252699064135759e-06,
"loss": 0.3984,
"mean_token_accuracy": 0.8595286570489407,
"num_tokens": 182804060.0,
"step": 425
},
{
"entropy": 0.406951904296875,
"epoch": 1.6511627906976745,
"grad_norm": 0.6621427730372136,
"learning_rate": 9.21008278051759e-06,
"loss": 0.396,
"mean_token_accuracy": 0.8606462860479951,
"num_tokens": 183243736.0,
"step": 426
},
{
"entropy": 0.401947021484375,
"epoch": 1.6550387596899225,
"grad_norm": 0.6817892729294643,
"learning_rate": 9.167480928217108e-06,
"loss": 0.4068,
"mean_token_accuracy": 0.8560972642153502,
"num_tokens": 183696488.0,
"step": 427
},
{
"entropy": 0.40386962890625,
"epoch": 1.6589147286821704,
"grad_norm": 0.6817036559805504,
"learning_rate": 9.124894285544808e-06,
"loss": 0.405,
"mean_token_accuracy": 0.8574656415730715,
"num_tokens": 184134192.0,
"step": 428
},
{
"entropy": 0.412811279296875,
"epoch": 1.6627906976744184,
"grad_norm": 0.6761747313195076,
"learning_rate": 9.082323630533317e-06,
"loss": 0.3904,
"mean_token_accuracy": 0.8623712658882141,
"num_tokens": 184547689.0,
"step": 429
},
{
"entropy": 0.407196044921875,
"epoch": 1.6666666666666665,
"grad_norm": 0.6093614910819201,
"learning_rate": 9.039769740923183e-06,
"loss": 0.3954,
"mean_token_accuracy": 0.8592645572498441,
"num_tokens": 184980830.0,
"step": 430
},
{
"entropy": 0.4166259765625,
"epoch": 1.6705426356589146,
"grad_norm": 0.6332487484043614,
"learning_rate": 8.997233394148648e-06,
"loss": 0.3918,
"mean_token_accuracy": 0.8626098716631532,
"num_tokens": 185396383.0,
"step": 431
},
{
"entropy": 0.41619873046875,
"epoch": 1.6744186046511627,
"grad_norm": 0.6580432745200419,
"learning_rate": 8.954715367323468e-06,
"loss": 0.4013,
"mean_token_accuracy": 0.8584313243627548,
"num_tokens": 185821177.0,
"step": 432
},
{
"entropy": 0.414154052734375,
"epoch": 1.6782945736434107,
"grad_norm": 0.6851065385902475,
"learning_rate": 8.912216437226692e-06,
"loss": 0.396,
"mean_token_accuracy": 0.860386623069644,
"num_tokens": 186247442.0,
"step": 433
},
{
"entropy": 0.420745849609375,
"epoch": 1.6821705426356588,
"grad_norm": 0.6568781689919727,
"learning_rate": 8.869737380288502e-06,
"loss": 0.4014,
"mean_token_accuracy": 0.8592943148687482,
"num_tokens": 186670839.0,
"step": 434
},
{
"entropy": 0.41497802734375,
"epoch": 1.6860465116279069,
"grad_norm": 0.6319514560027869,
"learning_rate": 8.827278972575984e-06,
"loss": 0.3916,
"mean_token_accuracy": 0.8617346873506904,
"num_tokens": 187095317.0,
"step": 435
},
{
"entropy": 0.406524658203125,
"epoch": 1.689922480620155,
"grad_norm": 0.6682072796432892,
"learning_rate": 8.784841989778997e-06,
"loss": 0.3979,
"mean_token_accuracy": 0.859723481349647,
"num_tokens": 187535915.0,
"step": 436
},
{
"entropy": 0.41015625,
"epoch": 1.693798449612403,
"grad_norm": 0.6691193142851936,
"learning_rate": 8.742427207195975e-06,
"loss": 0.3883,
"mean_token_accuracy": 0.8643459100276232,
"num_tokens": 187962507.0,
"step": 437
},
{
"entropy": 0.4033203125,
"epoch": 1.697674418604651,
"grad_norm": 0.6803532551447259,
"learning_rate": 8.700035399719754e-06,
"loss": 0.4016,
"mean_token_accuracy": 0.8600505525246263,
"num_tokens": 188414361.0,
"step": 438
},
{
"entropy": 0.4100341796875,
"epoch": 1.7015503875968991,
"grad_norm": 0.6705454693190775,
"learning_rate": 8.657667341823449e-06,
"loss": 0.4131,
"mean_token_accuracy": 0.8566976124420762,
"num_tokens": 188861294.0,
"step": 439
},
{
"entropy": 0.416778564453125,
"epoch": 1.7054263565891472,
"grad_norm": 0.6425730442834899,
"learning_rate": 8.615323807546258e-06,
"loss": 0.4022,
"mean_token_accuracy": 0.8590597696602345,
"num_tokens": 189287748.0,
"step": 440
},
{
"entropy": 0.41192626953125,
"epoch": 1.7093023255813953,
"grad_norm": 0.6409295984728016,
"learning_rate": 8.57300557047938e-06,
"loss": 0.3931,
"mean_token_accuracy": 0.8600128889083862,
"num_tokens": 189737727.0,
"step": 441
},
{
"entropy": 0.414276123046875,
"epoch": 1.7131782945736433,
"grad_norm": 0.6433632025761097,
"learning_rate": 8.530713403751822e-06,
"loss": 0.396,
"mean_token_accuracy": 0.8607654105871916,
"num_tokens": 190166544.0,
"step": 442
},
{
"entropy": 0.41436767578125,
"epoch": 1.7170542635658914,
"grad_norm": 0.6296991719670527,
"learning_rate": 8.488448080016312e-06,
"loss": 0.3805,
"mean_token_accuracy": 0.8660553842782974,
"num_tokens": 190605188.0,
"step": 443
},
{
"entropy": 0.409088134765625,
"epoch": 1.7209302325581395,
"grad_norm": 0.659206038831299,
"learning_rate": 8.446210371435172e-06,
"loss": 0.3953,
"mean_token_accuracy": 0.8605815563350916,
"num_tokens": 191035664.0,
"step": 444
},
{
"entropy": 0.408050537109375,
"epoch": 1.7248062015503876,
"grad_norm": 0.652051845273745,
"learning_rate": 8.404001049666211e-06,
"loss": 0.3899,
"mean_token_accuracy": 0.8624423686414957,
"num_tokens": 191462754.0,
"step": 445
},
{
"entropy": 0.416046142578125,
"epoch": 1.7286821705426356,
"grad_norm": 0.6692499987903555,
"learning_rate": 8.361820885848623e-06,
"loss": 0.4025,
"mean_token_accuracy": 0.8589138938114047,
"num_tokens": 191882226.0,
"step": 446
},
{
"entropy": 0.4078369140625,
"epoch": 1.7325581395348837,
"grad_norm": 0.623322525199549,
"learning_rate": 8.319670650588916e-06,
"loss": 0.3786,
"mean_token_accuracy": 0.8644889798015356,
"num_tokens": 192307832.0,
"step": 447
},
{
"entropy": 0.4110107421875,
"epoch": 1.7364341085271318,
"grad_norm": 0.6189571086504357,
"learning_rate": 8.277551113946812e-06,
"loss": 0.3889,
"mean_token_accuracy": 0.8638109732419252,
"num_tokens": 192740453.0,
"step": 448
},
{
"entropy": 0.406951904296875,
"epoch": 1.7403100775193798,
"grad_norm": 0.6436896246044009,
"learning_rate": 8.235463045421186e-06,
"loss": 0.3797,
"mean_token_accuracy": 0.8641934292390943,
"num_tokens": 193161453.0,
"step": 449
},
{
"entropy": 0.405487060546875,
"epoch": 1.744186046511628,
"grad_norm": 0.7136051111650151,
"learning_rate": 8.193407213936014e-06,
"loss": 0.3923,
"mean_token_accuracy": 0.8620244851335883,
"num_tokens": 193581605.0,
"step": 450
},
{
"entropy": 0.41082763671875,
"epoch": 1.748062015503876,
"grad_norm": 0.6504324698248508,
"learning_rate": 8.151384387826313e-06,
"loss": 0.3867,
"mean_token_accuracy": 0.8634668812155724,
"num_tokens": 193986862.0,
"step": 451
},
{
"entropy": 0.410003662109375,
"epoch": 1.751937984496124,
"grad_norm": 0.6572442527472421,
"learning_rate": 8.109395334824127e-06,
"loss": 0.3986,
"mean_token_accuracy": 0.8602865533903241,
"num_tokens": 194402333.0,
"step": 452
},
{
"entropy": 0.409820556640625,
"epoch": 1.755813953488372,
"grad_norm": 0.6434483925139697,
"learning_rate": 8.06744082204447e-06,
"loss": 0.4005,
"mean_token_accuracy": 0.8582842675969005,
"num_tokens": 194857383.0,
"step": 453
},
{
"entropy": 0.408966064453125,
"epoch": 1.7596899224806202,
"grad_norm": 0.6680448763661996,
"learning_rate": 8.02552161597133e-06,
"loss": 0.3928,
"mean_token_accuracy": 0.8633231353014708,
"num_tokens": 195286940.0,
"step": 454
},
{
"entropy": 0.406494140625,
"epoch": 1.7635658914728682,
"grad_norm": 0.6369078203975965,
"learning_rate": 7.983638482443671e-06,
"loss": 0.3988,
"mean_token_accuracy": 0.859028734266758,
"num_tokens": 195736988.0,
"step": 455
},
{
"entropy": 0.40313720703125,
"epoch": 1.7674418604651163,
"grad_norm": 0.643861645368624,
"learning_rate": 7.941792186641417e-06,
"loss": 0.3833,
"mean_token_accuracy": 0.864879741333425,
"num_tokens": 196169764.0,
"step": 456
},
{
"entropy": 0.40716552734375,
"epoch": 1.7713178294573644,
"grad_norm": 0.6561302747521901,
"learning_rate": 7.899983493071506e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8641843870282173,
"num_tokens": 196598690.0,
"step": 457
},
{
"entropy": 0.409393310546875,
"epoch": 1.7751937984496124,
"grad_norm": 0.6594934710060417,
"learning_rate": 7.858213165553897e-06,
"loss": 0.3963,
"mean_token_accuracy": 0.8609117828309536,
"num_tokens": 197014178.0,
"step": 458
},
{
"entropy": 0.399658203125,
"epoch": 1.7790697674418605,
"grad_norm": 0.6478982839392694,
"learning_rate": 7.816481967207627e-06,
"loss": 0.3875,
"mean_token_accuracy": 0.8637743312865496,
"num_tokens": 197476046.0,
"step": 459
},
{
"entropy": 0.406494140625,
"epoch": 1.7829457364341086,
"grad_norm": 0.6157464144088126,
"learning_rate": 7.774790660436857e-06,
"loss": 0.3912,
"mean_token_accuracy": 0.862570708617568,
"num_tokens": 197900624.0,
"step": 460
},
{
"entropy": 0.41302490234375,
"epoch": 1.7868217054263567,
"grad_norm": 0.683013379784653,
"learning_rate": 7.733140006916968e-06,
"loss": 0.3799,
"mean_token_accuracy": 0.8661030624061823,
"num_tokens": 198310858.0,
"step": 461
},
{
"entropy": 0.41424560546875,
"epoch": 1.7906976744186047,
"grad_norm": 0.6367234615706063,
"learning_rate": 7.691530767580613e-06,
"loss": 0.3873,
"mean_token_accuracy": 0.861991093493998,
"num_tokens": 198746731.0,
"step": 462
},
{
"entropy": 0.410888671875,
"epoch": 1.7945736434108528,
"grad_norm": 0.6977685434199898,
"learning_rate": 7.649963702603848e-06,
"loss": 0.3856,
"mean_token_accuracy": 0.863556420430541,
"num_tokens": 199169885.0,
"step": 463
},
{
"entropy": 0.4053955078125,
"epoch": 1.7984496124031009,
"grad_norm": 0.6602781525851062,
"learning_rate": 7.608439571392227e-06,
"loss": 0.3939,
"mean_token_accuracy": 0.861305077560246,
"num_tokens": 199606209.0,
"step": 464
},
{
"entropy": 0.40625,
"epoch": 1.802325581395349,
"grad_norm": 0.6523264761673283,
"learning_rate": 7.566959132566914e-06,
"loss": 0.4128,
"mean_token_accuracy": 0.8558422513306141,
"num_tokens": 200040899.0,
"step": 465
},
{
"entropy": 0.404632568359375,
"epoch": 1.806201550387597,
"grad_norm": 0.6136108426271604,
"learning_rate": 7.525523143950859e-06,
"loss": 0.3943,
"mean_token_accuracy": 0.8612562520429492,
"num_tokens": 200483003.0,
"step": 466
},
{
"entropy": 0.404266357421875,
"epoch": 1.810077519379845,
"grad_norm": 0.6808527274202781,
"learning_rate": 7.484132362554915e-06,
"loss": 0.4035,
"mean_token_accuracy": 0.8603929774835706,
"num_tokens": 200918100.0,
"step": 467
},
{
"entropy": 0.415069580078125,
"epoch": 1.8139534883720931,
"grad_norm": 0.7268727125086556,
"learning_rate": 7.442787544564044e-06,
"loss": 0.3884,
"mean_token_accuracy": 0.8616755288094282,
"num_tokens": 201332208.0,
"step": 468
},
{
"entropy": 0.406463623046875,
"epoch": 1.8178294573643412,
"grad_norm": 0.6327268458389125,
"learning_rate": 7.401489445323473e-06,
"loss": 0.3825,
"mean_token_accuracy": 0.8625819915905595,
"num_tokens": 201748342.0,
"step": 469
},
{
"entropy": 0.406768798828125,
"epoch": 1.8217054263565893,
"grad_norm": 0.5843910418974224,
"learning_rate": 7.360238819324903e-06,
"loss": 0.3883,
"mean_token_accuracy": 0.8621263904497027,
"num_tokens": 202190718.0,
"step": 470
},
{
"entropy": 0.401763916015625,
"epoch": 1.8255813953488373,
"grad_norm": 0.6553658624886732,
"learning_rate": 7.319036420192737e-06,
"loss": 0.386,
"mean_token_accuracy": 0.8664320418611169,
"num_tokens": 202617317.0,
"step": 471
},
{
"entropy": 0.401702880859375,
"epoch": 1.8294573643410854,
"grad_norm": 0.6697797086612566,
"learning_rate": 7.27788300067029e-06,
"loss": 0.3898,
"mean_token_accuracy": 0.8603641046211123,
"num_tokens": 203052208.0,
"step": 472
},
{
"entropy": 0.40185546875,
"epoch": 1.8333333333333335,
"grad_norm": 0.6123304501658193,
"learning_rate": 7.236779312606059e-06,
"loss": 0.3825,
"mean_token_accuracy": 0.8642279775813222,
"num_tokens": 203497214.0,
"step": 473
},
{
"entropy": 0.41339111328125,
"epoch": 1.8372093023255816,
"grad_norm": 0.6202994666777202,
"learning_rate": 7.1957261069399745e-06,
"loss": 0.3801,
"mean_token_accuracy": 0.8631016416475177,
"num_tokens": 203906589.0,
"step": 474
},
{
"entropy": 0.41558837890625,
"epoch": 1.8410852713178296,
"grad_norm": 0.6791465966432566,
"learning_rate": 7.154724133689677e-06,
"loss": 0.3972,
"mean_token_accuracy": 0.8600956695154309,
"num_tokens": 204327623.0,
"step": 475
},
{
"entropy": 0.408416748046875,
"epoch": 1.8449612403100775,
"grad_norm": 0.6350010558970041,
"learning_rate": 7.113774141936829e-06,
"loss": 0.4041,
"mean_token_accuracy": 0.8566409824416041,
"num_tokens": 204761918.0,
"step": 476
},
{
"entropy": 0.41143798828125,
"epoch": 1.8488372093023255,
"grad_norm": 0.6313657105845258,
"learning_rate": 7.0728768798134195e-06,
"loss": 0.3909,
"mean_token_accuracy": 0.8599191624671221,
"num_tokens": 205174533.0,
"step": 477
},
{
"entropy": 0.41021728515625,
"epoch": 1.8527131782945736,
"grad_norm": 0.6378670434188185,
"learning_rate": 7.032033094488094e-06,
"loss": 0.3828,
"mean_token_accuracy": 0.8645174792036414,
"num_tokens": 205594485.0,
"step": 478
},
{
"entropy": 0.405364990234375,
"epoch": 1.8565891472868217,
"grad_norm": 0.6543240433781295,
"learning_rate": 6.9912435321525185e-06,
"loss": 0.3916,
"mean_token_accuracy": 0.861815670505166,
"num_tokens": 206032827.0,
"step": 479
},
{
"entropy": 0.402587890625,
"epoch": 1.8604651162790697,
"grad_norm": 0.6530799396059754,
"learning_rate": 6.95050893800773e-06,
"loss": 0.3705,
"mean_token_accuracy": 0.8680164245888591,
"num_tokens": 206464664.0,
"step": 480
},
{
"entropy": 0.40667724609375,
"epoch": 1.8643410852713178,
"grad_norm": 0.6166340670957584,
"learning_rate": 6.909830056250527e-06,
"loss": 0.3819,
"mean_token_accuracy": 0.8618542673066258,
"num_tokens": 206892855.0,
"step": 481
},
{
"entropy": 0.398773193359375,
"epoch": 1.8682170542635659,
"grad_norm": 0.6776441639714464,
"learning_rate": 6.869207630059885e-06,
"loss": 0.3851,
"mean_token_accuracy": 0.8656446663662791,
"num_tokens": 207340245.0,
"step": 482
},
{
"entropy": 0.40667724609375,
"epoch": 1.872093023255814,
"grad_norm": 0.701960556262414,
"learning_rate": 6.8286424015833585e-06,
"loss": 0.3945,
"mean_token_accuracy": 0.8633942920714617,
"num_tokens": 207781765.0,
"step": 483
},
{
"entropy": 0.401336669921875,
"epoch": 1.875968992248062,
"grad_norm": 0.6417634518224469,
"learning_rate": 6.788135111923545e-06,
"loss": 0.3715,
"mean_token_accuracy": 0.8689904985949397,
"num_tokens": 208199533.0,
"step": 484
},
{
"entropy": 0.403167724609375,
"epoch": 1.87984496124031,
"grad_norm": 0.6314848357880424,
"learning_rate": 6.747686501124531e-06,
"loss": 0.3956,
"mean_token_accuracy": 0.8605082351714373,
"num_tokens": 208637751.0,
"step": 485
},
{
"entropy": 0.4044189453125,
"epoch": 1.8837209302325582,
"grad_norm": 0.6526708888274303,
"learning_rate": 6.707297308158366e-06,
"loss": 0.3839,
"mean_token_accuracy": 0.8627981888130307,
"num_tokens": 209072826.0,
"step": 486
},
{
"entropy": 0.404571533203125,
"epoch": 1.8875968992248062,
"grad_norm": 0.6120925587753634,
"learning_rate": 6.666968270911585e-06,
"loss": 0.3865,
"mean_token_accuracy": 0.8635251615196466,
"num_tokens": 209523954.0,
"step": 487
},
{
"entropy": 0.40380859375,
"epoch": 1.8914728682170543,
"grad_norm": 0.6272847749525875,
"learning_rate": 6.6267001261717015e-06,
"loss": 0.3821,
"mean_token_accuracy": 0.8653721883893013,
"num_tokens": 209972017.0,
"step": 488
},
{
"entropy": 0.4151611328125,
"epoch": 1.8953488372093024,
"grad_norm": 0.6479005625946697,
"learning_rate": 6.586493609613768e-06,
"loss": 0.3881,
"mean_token_accuracy": 0.86237673740834,
"num_tokens": 210406408.0,
"step": 489
},
{
"entropy": 0.404541015625,
"epoch": 1.8992248062015504,
"grad_norm": 0.6180822520605622,
"learning_rate": 6.546349455786926e-06,
"loss": 0.366,
"mean_token_accuracy": 0.8684000810608268,
"num_tokens": 210815640.0,
"step": 490
},
{
"entropy": 0.39935302734375,
"epoch": 1.9031007751937985,
"grad_norm": 0.6425138033116063,
"learning_rate": 6.506268398100979e-06,
"loss": 0.3849,
"mean_token_accuracy": 0.8652486437931657,
"num_tokens": 211258730.0,
"step": 491
},
{
"entropy": 0.396697998046875,
"epoch": 1.9069767441860463,
"grad_norm": 0.6475606998124774,
"learning_rate": 6.46625116881301e-06,
"loss": 0.3881,
"mean_token_accuracy": 0.8615107480436563,
"num_tokens": 211691792.0,
"step": 492
},
{
"entropy": 0.4013671875,
"epoch": 1.9108527131782944,
"grad_norm": 0.676923862359068,
"learning_rate": 6.426298499013994e-06,
"loss": 0.3864,
"mean_token_accuracy": 0.8626894094049931,
"num_tokens": 212108212.0,
"step": 493
},
{
"entropy": 0.40142822265625,
"epoch": 1.9147286821705425,
"grad_norm": 0.6493841657255129,
"learning_rate": 6.386411118615434e-06,
"loss": 0.3841,
"mean_token_accuracy": 0.8646282628178596,
"num_tokens": 212524846.0,
"step": 494
},
{
"entropy": 0.407470703125,
"epoch": 1.9186046511627906,
"grad_norm": 0.6517307413767497,
"learning_rate": 6.34658975633605e-06,
"loss": 0.3862,
"mean_token_accuracy": 0.866667116060853,
"num_tokens": 212949216.0,
"step": 495
},
{
"entropy": 0.399688720703125,
"epoch": 1.9224806201550386,
"grad_norm": 0.639749877293263,
"learning_rate": 6.306835139688439e-06,
"loss": 0.374,
"mean_token_accuracy": 0.8665135633200407,
"num_tokens": 213375704.0,
"step": 496
},
{
"entropy": 0.404449462890625,
"epoch": 1.9263565891472867,
"grad_norm": 0.6396313970116991,
"learning_rate": 6.267147994965792e-06,
"loss": 0.3898,
"mean_token_accuracy": 0.861801334656775,
"num_tokens": 213818308.0,
"step": 497
},
{
"entropy": 0.40673828125,
"epoch": 1.9302325581395348,
"grad_norm": 0.6293975472846354,
"learning_rate": 6.2275290472286406e-06,
"loss": 0.3828,
"mean_token_accuracy": 0.866341283544898,
"num_tokens": 214250620.0,
"step": 498
},
{
"entropy": 0.400238037109375,
"epoch": 1.9341085271317828,
"grad_norm": 0.6307078281794433,
"learning_rate": 6.187979020291584e-06,
"loss": 0.3947,
"mean_token_accuracy": 0.8613047506660223,
"num_tokens": 214695002.0,
"step": 499
},
{
"entropy": 0.4093017578125,
"epoch": 1.937984496124031,
"grad_norm": 0.6607845873938842,
"learning_rate": 6.148498636710092e-06,
"loss": 0.3632,
"mean_token_accuracy": 0.8689653361216187,
"num_tokens": 215101633.0,
"step": 500
},
{
"entropy": 0.408416748046875,
"epoch": 1.941860465116279,
"grad_norm": 0.6961543921206979,
"learning_rate": 6.109088617767287e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8612672435119748,
"num_tokens": 215530624.0,
"step": 501
},
{
"entropy": 0.403350830078125,
"epoch": 1.945736434108527,
"grad_norm": 0.6886590872632472,
"learning_rate": 6.069749683460765e-06,
"loss": 0.3848,
"mean_token_accuracy": 0.8641320113092661,
"num_tokens": 215963019.0,
"step": 502
},
{
"entropy": 0.39825439453125,
"epoch": 1.949612403100775,
"grad_norm": 0.6433536008619647,
"learning_rate": 6.030482552489458e-06,
"loss": 0.3815,
"mean_token_accuracy": 0.8654372049495578,
"num_tokens": 216395090.0,
"step": 503
},
{
"entropy": 0.39599609375,
"epoch": 1.9534883720930232,
"grad_norm": 0.6124588094534525,
"learning_rate": 5.9912879422404864e-06,
"loss": 0.3775,
"mean_token_accuracy": 0.8667404530569911,
"num_tokens": 216852626.0,
"step": 504
},
{
"entropy": 0.4044189453125,
"epoch": 1.9573643410852712,
"grad_norm": 0.6969406927169308,
"learning_rate": 5.952166568776062e-06,
"loss": 0.3824,
"mean_token_accuracy": 0.8645559353753924,
"num_tokens": 217284606.0,
"step": 505
},
{
"entropy": 0.4014892578125,
"epoch": 1.9612403100775193,
"grad_norm": 0.6289776276400901,
"learning_rate": 5.91311914682041e-06,
"loss": 0.3855,
"mean_token_accuracy": 0.8640624480322003,
"num_tokens": 217702095.0,
"step": 506
},
{
"entropy": 0.40374755859375,
"epoch": 1.9651162790697674,
"grad_norm": 0.6292663972830993,
"learning_rate": 5.874146389746697e-06,
"loss": 0.3725,
"mean_token_accuracy": 0.8682069405913353,
"num_tokens": 218124430.0,
"step": 507
},
{
"entropy": 0.403778076171875,
"epoch": 1.9689922480620154,
"grad_norm": 0.6405088898702465,
"learning_rate": 5.835249009564013e-06,
"loss": 0.3819,
"mean_token_accuracy": 0.8638218138366938,
"num_tokens": 218559364.0,
"step": 508
},
{
"entropy": 0.40142822265625,
"epoch": 1.9728682170542635,
"grad_norm": 0.6594125717541489,
"learning_rate": 5.796427716904347e-06,
"loss": 0.3788,
"mean_token_accuracy": 0.866894249804318,
"num_tokens": 218987214.0,
"step": 509
},
{
"entropy": 0.4051513671875,
"epoch": 1.9767441860465116,
"grad_norm": 0.6394085248266738,
"learning_rate": 5.757683221009625e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.8652555495500565,
"num_tokens": 219428184.0,
"step": 510
},
{
"entropy": 0.40252685546875,
"epoch": 1.9806201550387597,
"grad_norm": 0.7035221174607182,
"learning_rate": 5.719016229718748e-06,
"loss": 0.386,
"mean_token_accuracy": 0.8648106651380658,
"num_tokens": 219866920.0,
"step": 511
},
{
"entropy": 0.403076171875,
"epoch": 1.9844961240310077,
"grad_norm": 0.6765569141175194,
"learning_rate": 5.680427449454631e-06,
"loss": 0.3757,
"mean_token_accuracy": 0.8669820064678788,
"num_tokens": 220314515.0,
"step": 512
},
{
"entropy": 0.396575927734375,
"epoch": 1.9883720930232558,
"grad_norm": 0.6391561698642179,
"learning_rate": 5.641917585211338e-06,
"loss": 0.3824,
"mean_token_accuracy": 0.8642657212913036,
"num_tokens": 220750298.0,
"step": 513
},
{
"entropy": 0.398956298828125,
"epoch": 1.9922480620155039,
"grad_norm": 0.6432687978090328,
"learning_rate": 5.60348734054118e-06,
"loss": 0.3857,
"mean_token_accuracy": 0.8652426460757852,
"num_tokens": 221200699.0,
"step": 514
},
{
"entropy": 0.401336669921875,
"epoch": 1.996124031007752,
"grad_norm": 0.6660097790435096,
"learning_rate": 5.565137417541866e-06,
"loss": 0.3647,
"mean_token_accuracy": 0.870651887729764,
"num_tokens": 221631125.0,
"step": 515
},
{
"entropy": 0.4005126953125,
"epoch": 2.0,
"grad_norm": 0.654520060535491,
"learning_rate": 5.526868516843673e-06,
"loss": 0.3735,
"mean_token_accuracy": 0.8669488895684481,
"num_tokens": 222046610.0,
"step": 516
},
{
"entropy": 0.400299072265625,
"epoch": 2.003875968992248,
"grad_norm": 0.6936563180726831,
"learning_rate": 5.488681337596653e-06,
"loss": 0.3466,
"mean_token_accuracy": 0.8784803748130798,
"num_tokens": 222488581.0,
"step": 517
},
{
"entropy": 0.401611328125,
"epoch": 2.007751937984496,
"grad_norm": 0.6926888243105179,
"learning_rate": 5.450576577457858e-06,
"loss": 0.3293,
"mean_token_accuracy": 0.8813051115721464,
"num_tokens": 222901744.0,
"step": 518
},
{
"entropy": 0.39300537109375,
"epoch": 2.011627906976744,
"grad_norm": 0.6643375347233971,
"learning_rate": 5.412554932578578e-06,
"loss": 0.3349,
"mean_token_accuracy": 0.8790784049779177,
"num_tokens": 223337496.0,
"step": 519
},
{
"entropy": 0.392425537109375,
"epoch": 2.0155038759689923,
"grad_norm": 0.7298666897594812,
"learning_rate": 5.37461709759165e-06,
"loss": 0.3462,
"mean_token_accuracy": 0.8755828496068716,
"num_tokens": 223777243.0,
"step": 520
},
{
"entropy": 0.3936767578125,
"epoch": 2.0193798449612403,
"grad_norm": 0.7323552015632304,
"learning_rate": 5.3367637655987515e-06,
"loss": 0.3315,
"mean_token_accuracy": 0.8803297802805901,
"num_tokens": 224185656.0,
"step": 521
},
{
"entropy": 0.382598876953125,
"epoch": 2.0232558139534884,
"grad_norm": 0.7128553544647434,
"learning_rate": 5.298995628157738e-06,
"loss": 0.3201,
"mean_token_accuracy": 0.8834712980315089,
"num_tokens": 224616727.0,
"step": 522
},
{
"entropy": 0.387176513671875,
"epoch": 2.0271317829457365,
"grad_norm": 0.6357745281130776,
"learning_rate": 5.2613133752700145e-06,
"loss": 0.3315,
"mean_token_accuracy": 0.8827424431219697,
"num_tokens": 225029016.0,
"step": 523
},
{
"entropy": 0.394073486328125,
"epoch": 2.0310077519379846,
"grad_norm": 0.685138271938773,
"learning_rate": 5.223717695367922e-06,
"loss": 0.3343,
"mean_token_accuracy": 0.8783880360424519,
"num_tokens": 225455623.0,
"step": 524
},
{
"entropy": 0.403533935546875,
"epoch": 2.0348837209302326,
"grad_norm": 0.6897816458876816,
"learning_rate": 5.186209275302175e-06,
"loss": 0.3365,
"mean_token_accuracy": 0.8792239772155881,
"num_tokens": 225868920.0,
"step": 525
},
{
"entropy": 0.395294189453125,
"epoch": 2.0387596899224807,
"grad_norm": 0.7020774690634989,
"learning_rate": 5.148788800329279e-06,
"loss": 0.3366,
"mean_token_accuracy": 0.8801201498135924,
"num_tokens": 226290859.0,
"step": 526
},
{
"entropy": 0.39208984375,
"epoch": 2.0426356589147288,
"grad_norm": 0.6544070719356055,
"learning_rate": 5.111456954099064e-06,
"loss": 0.325,
"mean_token_accuracy": 0.8827378544956446,
"num_tokens": 226729135.0,
"step": 527
},
{
"entropy": 0.384185791015625,
"epoch": 2.046511627906977,
"grad_norm": 0.6922594514727605,
"learning_rate": 5.0742144186421484e-06,
"loss": 0.3341,
"mean_token_accuracy": 0.8800540259107947,
"num_tokens": 227155610.0,
"step": 528
},
{
"entropy": 0.389190673828125,
"epoch": 2.050387596899225,
"grad_norm": 0.6710910306877101,
"learning_rate": 5.037061874357503e-06,
"loss": 0.3391,
"mean_token_accuracy": 0.8789013354107738,
"num_tokens": 227602237.0,
"step": 529
},
{
"entropy": 0.385589599609375,
"epoch": 2.054263565891473,
"grad_norm": 0.6061828959415225,
"learning_rate": 5.000000000000003e-06,
"loss": 0.3346,
"mean_token_accuracy": 0.8803443806245923,
"num_tokens": 228063981.0,
"step": 530
},
{
"entropy": 0.39202880859375,
"epoch": 2.058139534883721,
"grad_norm": 0.6141481495965043,
"learning_rate": 4.963029472668044e-06,
"loss": 0.3302,
"mean_token_accuracy": 0.8802597392350435,
"num_tokens": 228496111.0,
"step": 531
},
{
"entropy": 0.385711669921875,
"epoch": 2.062015503875969,
"grad_norm": 0.6384598762874943,
"learning_rate": 4.92615096779118e-06,
"loss": 0.3338,
"mean_token_accuracy": 0.880737591534853,
"num_tokens": 228944550.0,
"step": 532
},
{
"entropy": 0.395233154296875,
"epoch": 2.065891472868217,
"grad_norm": 0.6698946719476137,
"learning_rate": 4.889365159117744e-06,
"loss": 0.3321,
"mean_token_accuracy": 0.8801658367738128,
"num_tokens": 229382741.0,
"step": 533
},
{
"entropy": 0.385955810546875,
"epoch": 2.0697674418604652,
"grad_norm": 0.6778064236402924,
"learning_rate": 4.852672718702581e-06,
"loss": 0.3299,
"mean_token_accuracy": 0.8840415002778172,
"num_tokens": 229821349.0,
"step": 534
},
{
"entropy": 0.387420654296875,
"epoch": 2.0736434108527133,
"grad_norm": 0.6328680945170063,
"learning_rate": 4.81607431689475e-06,
"loss": 0.3166,
"mean_token_accuracy": 0.8840623144060373,
"num_tokens": 230260834.0,
"step": 535
},
{
"entropy": 0.390625,
"epoch": 2.0775193798449614,
"grad_norm": 0.6408500370095382,
"learning_rate": 4.779570622325284e-06,
"loss": 0.3324,
"mean_token_accuracy": 0.8803566815331578,
"num_tokens": 230677028.0,
"step": 536
},
{
"entropy": 0.383026123046875,
"epoch": 2.0813953488372094,
"grad_norm": 0.6331145987896816,
"learning_rate": 4.743162301894952e-06,
"loss": 0.3314,
"mean_token_accuracy": 0.8804466081783175,
"num_tokens": 231123890.0,
"step": 537
},
{
"entropy": 0.3948974609375,
"epoch": 2.0852713178294575,
"grad_norm": 0.6360376709937053,
"learning_rate": 4.706850020762126e-06,
"loss": 0.3203,
"mean_token_accuracy": 0.8849101848900318,
"num_tokens": 231537700.0,
"step": 538
},
{
"entropy": 0.39019775390625,
"epoch": 2.0891472868217056,
"grad_norm": 0.6480759056424732,
"learning_rate": 4.6706344423305775e-06,
"loss": 0.3258,
"mean_token_accuracy": 0.8819584492594004,
"num_tokens": 231966260.0,
"step": 539
},
{
"entropy": 0.393280029296875,
"epoch": 2.0930232558139537,
"grad_norm": 0.6365457220306857,
"learning_rate": 4.634516228237372e-06,
"loss": 0.3328,
"mean_token_accuracy": 0.882646357640624,
"num_tokens": 232394010.0,
"step": 540
},
{
"entropy": 0.3883056640625,
"epoch": 2.0968992248062017,
"grad_norm": 0.6729577807372763,
"learning_rate": 4.598496038340801e-06,
"loss": 0.3312,
"mean_token_accuracy": 0.8807763801887631,
"num_tokens": 232833486.0,
"step": 541
},
{
"entropy": 0.386322021484375,
"epoch": 2.10077519379845,
"grad_norm": 0.6467656132694498,
"learning_rate": 4.5625745307083e-06,
"loss": 0.339,
"mean_token_accuracy": 0.8785864366218448,
"num_tokens": 233276645.0,
"step": 542
},
{
"entropy": 0.392120361328125,
"epoch": 2.104651162790698,
"grad_norm": 0.6341672757705268,
"learning_rate": 4.526752361604455e-06,
"loss": 0.3279,
"mean_token_accuracy": 0.8815192077308893,
"num_tokens": 233710256.0,
"step": 543
},
{
"entropy": 0.385162353515625,
"epoch": 2.108527131782946,
"grad_norm": 0.661558127233088,
"learning_rate": 4.491030185478976e-06,
"loss": 0.3181,
"mean_token_accuracy": 0.8827753607183695,
"num_tokens": 234144022.0,
"step": 544
},
{
"entropy": 0.3856201171875,
"epoch": 2.112403100775194,
"grad_norm": 0.638959340473472,
"learning_rate": 4.455408654954771e-06,
"loss": 0.3246,
"mean_token_accuracy": 0.8830780945718288,
"num_tokens": 234561879.0,
"step": 545
},
{
"entropy": 0.388946533203125,
"epoch": 2.116279069767442,
"grad_norm": 0.6321172574448592,
"learning_rate": 4.419888420816015e-06,
"loss": 0.3237,
"mean_token_accuracy": 0.8818927984684706,
"num_tokens": 234987166.0,
"step": 546
},
{
"entropy": 0.384979248046875,
"epoch": 2.12015503875969,
"grad_norm": 0.6645224096624154,
"learning_rate": 4.3844701319962525e-06,
"loss": 0.333,
"mean_token_accuracy": 0.8798952642828226,
"num_tokens": 235438001.0,
"step": 547
},
{
"entropy": 0.395172119140625,
"epoch": 2.124031007751938,
"grad_norm": 0.6734141584773501,
"learning_rate": 4.349154435566551e-06,
"loss": 0.3339,
"mean_token_accuracy": 0.8803757233545184,
"num_tokens": 235852297.0,
"step": 548
},
{
"entropy": 0.388427734375,
"epoch": 2.1279069767441863,
"grad_norm": 0.6638427414245535,
"learning_rate": 4.313941976723677e-06,
"loss": 0.3249,
"mean_token_accuracy": 0.8819608362391591,
"num_tokens": 236284675.0,
"step": 549
},
{
"entropy": 0.390655517578125,
"epoch": 2.1317829457364343,
"grad_norm": 0.6269884063395513,
"learning_rate": 4.278833398778306e-06,
"loss": 0.3227,
"mean_token_accuracy": 0.8828015690669417,
"num_tokens": 236717833.0,
"step": 550
},
{
"entropy": 0.39385986328125,
"epoch": 2.135658914728682,
"grad_norm": 0.6090452595776772,
"learning_rate": 4.2438293431432665e-06,
"loss": 0.3346,
"mean_token_accuracy": 0.8799500595778227,
"num_tokens": 237135700.0,
"step": 551
},
{
"entropy": 0.39410400390625,
"epoch": 2.13953488372093,
"grad_norm": 0.6332869380655064,
"learning_rate": 4.2089304493218355e-06,
"loss": 0.3325,
"mean_token_accuracy": 0.8803482167422771,
"num_tokens": 237556435.0,
"step": 552
},
{
"entropy": 0.39239501953125,
"epoch": 2.143410852713178,
"grad_norm": 0.6129101680068585,
"learning_rate": 4.17413735489604e-06,
"loss": 0.3364,
"mean_token_accuracy": 0.8808656400069594,
"num_tokens": 237993246.0,
"step": 553
},
{
"entropy": 0.394775390625,
"epoch": 2.147286821705426,
"grad_norm": 0.622513871237599,
"learning_rate": 4.139450695515018e-06,
"loss": 0.3177,
"mean_token_accuracy": 0.8859911020845175,
"num_tokens": 238416417.0,
"step": 554
},
{
"entropy": 0.385986328125,
"epoch": 2.1511627906976742,
"grad_norm": 0.6686843413819599,
"learning_rate": 4.104871104883403e-06,
"loss": 0.3476,
"mean_token_accuracy": 0.8776693055406213,
"num_tokens": 238872766.0,
"step": 555
},
{
"entropy": 0.392120361328125,
"epoch": 2.1550387596899223,
"grad_norm": 0.6426341081994442,
"learning_rate": 4.070399214749743e-06,
"loss": 0.3362,
"mean_token_accuracy": 0.8793003624305129,
"num_tokens": 239296976.0,
"step": 556
},
{
"entropy": 0.3946533203125,
"epoch": 2.1589147286821704,
"grad_norm": 0.6721747752054241,
"learning_rate": 4.036035654894967e-06,
"loss": 0.3176,
"mean_token_accuracy": 0.8857940044254065,
"num_tokens": 239703256.0,
"step": 557
},
{
"entropy": 0.38916015625,
"epoch": 2.1627906976744184,
"grad_norm": 0.6255029124954248,
"learning_rate": 4.001781053120863e-06,
"loss": 0.3407,
"mean_token_accuracy": 0.8771230475977063,
"num_tokens": 240138828.0,
"step": 558
},
{
"entropy": 0.39349365234375,
"epoch": 2.1666666666666665,
"grad_norm": 0.6818590678557627,
"learning_rate": 3.967636035238636e-06,
"loss": 0.341,
"mean_token_accuracy": 0.8793875314295292,
"num_tokens": 240559168.0,
"step": 559
},
{
"entropy": 0.38836669921875,
"epoch": 2.1705426356589146,
"grad_norm": 0.6518531386657375,
"learning_rate": 3.933601225057446e-06,
"loss": 0.3272,
"mean_token_accuracy": 0.8832004126161337,
"num_tokens": 240997393.0,
"step": 560
},
{
"entropy": 0.388427734375,
"epoch": 2.1744186046511627,
"grad_norm": 0.6162239541133624,
"learning_rate": 3.8996772443730335e-06,
"loss": 0.3289,
"mean_token_accuracy": 0.8784168781712651,
"num_tokens": 241432591.0,
"step": 561
},
{
"entropy": 0.388946533203125,
"epoch": 2.1782945736434107,
"grad_norm": 0.6484603141660229,
"learning_rate": 3.865864712956336e-06,
"loss": 0.3398,
"mean_token_accuracy": 0.8791722999885678,
"num_tokens": 241869468.0,
"step": 562
},
{
"entropy": 0.39080810546875,
"epoch": 2.182170542635659,
"grad_norm": 0.6345203708850213,
"learning_rate": 3.832164248542192e-06,
"loss": 0.3165,
"mean_token_accuracy": 0.8831057138741016,
"num_tokens": 242308134.0,
"step": 563
},
{
"entropy": 0.391815185546875,
"epoch": 2.186046511627907,
"grad_norm": 0.6250147866285306,
"learning_rate": 3.798576466818038e-06,
"loss": 0.3163,
"mean_token_accuracy": 0.8841667361557484,
"num_tokens": 242743846.0,
"step": 564
},
{
"entropy": 0.387176513671875,
"epoch": 2.189922480620155,
"grad_norm": 0.6587618073867574,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.3372,
"mean_token_accuracy": 0.879564318805933,
"num_tokens": 243184279.0,
"step": 565
},
{
"entropy": 0.391998291015625,
"epoch": 2.193798449612403,
"grad_norm": 0.6231496336515444,
"learning_rate": 3.7317414038850085e-06,
"loss": 0.3349,
"mean_token_accuracy": 0.8822369873523712,
"num_tokens": 243603345.0,
"step": 566
},
{
"entropy": 0.38885498046875,
"epoch": 2.197674418604651,
"grad_norm": 0.6243098230227394,
"learning_rate": 3.6984953437129734e-06,
"loss": 0.328,
"mean_token_accuracy": 0.8829399077221751,
"num_tokens": 244029585.0,
"step": 567
},
{
"entropy": 0.390411376953125,
"epoch": 2.201550387596899,
"grad_norm": 0.6183649760265033,
"learning_rate": 3.665364408282305e-06,
"loss": 0.3335,
"mean_token_accuracy": 0.8813704084604979,
"num_tokens": 244452331.0,
"step": 568
},
{
"entropy": 0.39208984375,
"epoch": 2.205426356589147,
"grad_norm": 0.6835852376079943,
"learning_rate": 3.6323492028754724e-06,
"loss": 0.3263,
"mean_token_accuracy": 0.881412522867322,
"num_tokens": 244882973.0,
"step": 569
},
{
"entropy": 0.392364501953125,
"epoch": 2.2093023255813953,
"grad_norm": 0.6327714392921648,
"learning_rate": 3.5994503306606497e-06,
"loss": 0.3156,
"mean_token_accuracy": 0.8864553738385439,
"num_tokens": 245295491.0,
"step": 570
},
{
"entropy": 0.38726806640625,
"epoch": 2.2131782945736433,
"grad_norm": 0.6256797059109875,
"learning_rate": 3.5666683926806623e-06,
"loss": 0.3356,
"mean_token_accuracy": 0.8783683739602566,
"num_tokens": 245759364.0,
"step": 571
},
{
"entropy": 0.383056640625,
"epoch": 2.2170542635658914,
"grad_norm": 0.6399228477600465,
"learning_rate": 3.534003987842005e-06,
"loss": 0.3219,
"mean_token_accuracy": 0.8829577537253499,
"num_tokens": 246203758.0,
"step": 572
},
{
"entropy": 0.386688232421875,
"epoch": 2.2209302325581395,
"grad_norm": 0.6415173203869877,
"learning_rate": 3.5014577129039296e-06,
"loss": 0.3243,
"mean_token_accuracy": 0.8814328899607062,
"num_tokens": 246645472.0,
"step": 573
},
{
"entropy": 0.390869140625,
"epoch": 2.2248062015503876,
"grad_norm": 0.6580150116543236,
"learning_rate": 3.4690301624675127e-06,
"loss": 0.3251,
"mean_token_accuracy": 0.8838908141478896,
"num_tokens": 247064090.0,
"step": 574
},
{
"entropy": 0.385711669921875,
"epoch": 2.2286821705426356,
"grad_norm": 0.6616373941415779,
"learning_rate": 3.4367219289648192e-06,
"loss": 0.3406,
"mean_token_accuracy": 0.8785260496661067,
"num_tokens": 247508703.0,
"step": 575
},
{
"entropy": 0.38714599609375,
"epoch": 2.2325581395348837,
"grad_norm": 0.6720601378427568,
"learning_rate": 3.4045336026480457e-06,
"loss": 0.3338,
"mean_token_accuracy": 0.8801887268200517,
"num_tokens": 247934655.0,
"step": 576
},
{
"entropy": 0.3848876953125,
"epoch": 2.2364341085271318,
"grad_norm": 0.6255047863223557,
"learning_rate": 3.372465771578771e-06,
"loss": 0.3321,
"mean_token_accuracy": 0.8830512659624219,
"num_tokens": 248373031.0,
"step": 577
},
{
"entropy": 0.391143798828125,
"epoch": 2.24031007751938,
"grad_norm": 0.636261521458766,
"learning_rate": 3.340519021617189e-06,
"loss": 0.3368,
"mean_token_accuracy": 0.877558303065598,
"num_tokens": 248800011.0,
"step": 578
},
{
"entropy": 0.392242431640625,
"epoch": 2.244186046511628,
"grad_norm": 0.6811192022983874,
"learning_rate": 3.308693936411421e-06,
"loss": 0.3231,
"mean_token_accuracy": 0.8823585864156485,
"num_tokens": 249220660.0,
"step": 579
},
{
"entropy": 0.400482177734375,
"epoch": 2.248062015503876,
"grad_norm": 0.6678322815867844,
"learning_rate": 3.2769910973868314e-06,
"loss": 0.3111,
"mean_token_accuracy": 0.8872639862820506,
"num_tokens": 249612238.0,
"step": 580
},
{
"entropy": 0.392578125,
"epoch": 2.251937984496124,
"grad_norm": 0.6680878488658235,
"learning_rate": 3.24541108373544e-06,
"loss": 0.3221,
"mean_token_accuracy": 0.8829946629703045,
"num_tokens": 250039769.0,
"step": 581
},
{
"entropy": 0.386383056640625,
"epoch": 2.255813953488372,
"grad_norm": 0.658743704605255,
"learning_rate": 3.2139544724053083e-06,
"loss": 0.3112,
"mean_token_accuracy": 0.8881248384714127,
"num_tokens": 250468645.0,
"step": 582
},
{
"entropy": 0.38922119140625,
"epoch": 2.25968992248062,
"grad_norm": 0.7447959483419172,
"learning_rate": 3.1826218380900066e-06,
"loss": 0.3341,
"mean_token_accuracy": 0.8805524576455355,
"num_tokens": 250883417.0,
"step": 583
},
{
"entropy": 0.380645751953125,
"epoch": 2.2635658914728682,
"grad_norm": 0.6453212527580212,
"learning_rate": 3.1514137532181265e-06,
"loss": 0.3276,
"mean_token_accuracy": 0.8826109319925308,
"num_tokens": 251327846.0,
"step": 584
},
{
"entropy": 0.391693115234375,
"epoch": 2.2674418604651163,
"grad_norm": 0.657520936501664,
"learning_rate": 3.1203307879428146e-06,
"loss": 0.3215,
"mean_token_accuracy": 0.8842367362231016,
"num_tokens": 251752385.0,
"step": 585
},
{
"entropy": 0.390045166015625,
"epoch": 2.2713178294573644,
"grad_norm": 0.6650188059458049,
"learning_rate": 3.089373510131354e-06,
"loss": 0.3084,
"mean_token_accuracy": 0.8864572271704674,
"num_tokens": 252174547.0,
"step": 586
},
{
"entropy": 0.39080810546875,
"epoch": 2.2751937984496124,
"grad_norm": 0.6417234553292116,
"learning_rate": 3.0585424853547953e-06,
"loss": 0.3238,
"mean_token_accuracy": 0.8842586716637015,
"num_tokens": 252596335.0,
"step": 587
},
{
"entropy": 0.392608642578125,
"epoch": 2.2790697674418605,
"grad_norm": 0.6639151590319899,
"learning_rate": 3.0278382768776193e-06,
"loss": 0.3339,
"mean_token_accuracy": 0.8815375939011574,
"num_tokens": 253021440.0,
"step": 588
},
{
"entropy": 0.39007568359375,
"epoch": 2.2829457364341086,
"grad_norm": 0.6644409987626044,
"learning_rate": 2.9972614456474537e-06,
"loss": 0.3289,
"mean_token_accuracy": 0.8809984363615513,
"num_tokens": 253440932.0,
"step": 589
},
{
"entropy": 0.385986328125,
"epoch": 2.2868217054263567,
"grad_norm": 0.6481260918771182,
"learning_rate": 2.9668125502848035e-06,
"loss": 0.3184,
"mean_token_accuracy": 0.8846102599054575,
"num_tokens": 253865848.0,
"step": 590
},
{
"entropy": 0.384033203125,
"epoch": 2.2906976744186047,
"grad_norm": 0.6548821507339614,
"learning_rate": 2.936492147072885e-06,
"loss": 0.3212,
"mean_token_accuracy": 0.8840930741280317,
"num_tokens": 254319258.0,
"step": 591
},
{
"entropy": 0.3841552734375,
"epoch": 2.294573643410853,
"grad_norm": 0.6143947491644914,
"learning_rate": 2.9063007899474214e-06,
"loss": 0.3198,
"mean_token_accuracy": 0.8845802173018456,
"num_tokens": 254778429.0,
"step": 592
},
{
"entropy": 0.382781982421875,
"epoch": 2.298449612403101,
"grad_norm": 0.5994337091831552,
"learning_rate": 2.876239030486554e-06,
"loss": 0.333,
"mean_token_accuracy": 0.8806400252506137,
"num_tokens": 255246376.0,
"step": 593
},
{
"entropy": 0.39447021484375,
"epoch": 2.302325581395349,
"grad_norm": 0.6370982481961588,
"learning_rate": 2.8463074179007356e-06,
"loss": 0.3298,
"mean_token_accuracy": 0.8816047692671418,
"num_tokens": 255679298.0,
"step": 594
},
{
"entropy": 0.38818359375,
"epoch": 2.306201550387597,
"grad_norm": 0.6326100288460391,
"learning_rate": 2.8165064990227255e-06,
"loss": 0.3139,
"mean_token_accuracy": 0.8859955314546824,
"num_tokens": 256126964.0,
"step": 595
},
{
"entropy": 0.3873291015625,
"epoch": 2.310077519379845,
"grad_norm": 0.6281151901542636,
"learning_rate": 2.7868368182975835e-06,
"loss": 0.3328,
"mean_token_accuracy": 0.8832689542323351,
"num_tokens": 256574701.0,
"step": 596
},
{
"entropy": 0.391632080078125,
"epoch": 2.313953488372093,
"grad_norm": 0.6543928974172343,
"learning_rate": 2.757298917772727e-06,
"loss": 0.3294,
"mean_token_accuracy": 0.8810861445963383,
"num_tokens": 256980992.0,
"step": 597
},
{
"entropy": 0.3851318359375,
"epoch": 2.317829457364341,
"grad_norm": 0.615207001414363,
"learning_rate": 2.7278933370880267e-06,
"loss": 0.3109,
"mean_token_accuracy": 0.887361123226583,
"num_tokens": 257433482.0,
"step": 598
},
{
"entropy": 0.392578125,
"epoch": 2.3217054263565893,
"grad_norm": 0.622271406233036,
"learning_rate": 2.6986206134659477e-06,
"loss": 0.32,
"mean_token_accuracy": 0.8841242687776685,
"num_tokens": 257864399.0,
"step": 599
},
{
"entropy": 0.384429931640625,
"epoch": 2.3255813953488373,
"grad_norm": 0.6493402572209033,
"learning_rate": 2.669481281701739e-06,
"loss": 0.3159,
"mean_token_accuracy": 0.8869122276082635,
"num_tokens": 258323505.0,
"step": 600
},
{
"entropy": 0.385650634765625,
"epoch": 2.3294573643410854,
"grad_norm": 0.655441563972454,
"learning_rate": 2.640475874153651e-06,
"loss": 0.3273,
"mean_token_accuracy": 0.8811930902302265,
"num_tokens": 258755764.0,
"step": 601
},
{
"entropy": 0.386383056640625,
"epoch": 2.3333333333333335,
"grad_norm": 0.6241612484793835,
"learning_rate": 2.6116049207332304e-06,
"loss": 0.3169,
"mean_token_accuracy": 0.8841598564758897,
"num_tokens": 259174373.0,
"step": 602
},
{
"entropy": 0.38812255859375,
"epoch": 2.3372093023255816,
"grad_norm": 0.6869232640948452,
"learning_rate": 2.582868948895623e-06,
"loss": 0.3271,
"mean_token_accuracy": 0.8831503242254257,
"num_tokens": 259633845.0,
"step": 603
},
{
"entropy": 0.394287109375,
"epoch": 2.3410852713178296,
"grad_norm": 0.6445176893131164,
"learning_rate": 2.5542684836299316e-06,
"loss": 0.3232,
"mean_token_accuracy": 0.8843172611668706,
"num_tokens": 260062070.0,
"step": 604
},
{
"entropy": 0.388092041015625,
"epoch": 2.3449612403100777,
"grad_norm": 0.644553761738823,
"learning_rate": 2.5258040474496483e-06,
"loss": 0.3167,
"mean_token_accuracy": 0.8840411538258195,
"num_tokens": 260479228.0,
"step": 605
},
{
"entropy": 0.384735107421875,
"epoch": 2.3488372093023258,
"grad_norm": 0.6303072264697063,
"learning_rate": 2.4974761603830865e-06,
"loss": 0.3274,
"mean_token_accuracy": 0.883203936740756,
"num_tokens": 260915469.0,
"step": 606
},
{
"entropy": 0.387939453125,
"epoch": 2.352713178294574,
"grad_norm": 0.6726103111482618,
"learning_rate": 2.469285339963892e-06,
"loss": 0.3199,
"mean_token_accuracy": 0.8849070286378264,
"num_tokens": 261365417.0,
"step": 607
},
{
"entropy": 0.38800048828125,
"epoch": 2.356589147286822,
"grad_norm": 0.6399916127128915,
"learning_rate": 2.4412321012215824e-06,
"loss": 0.3317,
"mean_token_accuracy": 0.880051271058619,
"num_tokens": 261812841.0,
"step": 608
},
{
"entropy": 0.394989013671875,
"epoch": 2.3604651162790695,
"grad_norm": 0.613061425373263,
"learning_rate": 2.4133169566721426e-06,
"loss": 0.3278,
"mean_token_accuracy": 0.883314672857523,
"num_tokens": 262236708.0,
"step": 609
},
{
"entropy": 0.389251708984375,
"epoch": 2.3643410852713176,
"grad_norm": 0.6588087474437182,
"learning_rate": 2.3855404163086558e-06,
"loss": 0.3013,
"mean_token_accuracy": 0.8911587707698345,
"num_tokens": 262663914.0,
"step": 610
},
{
"entropy": 0.388580322265625,
"epoch": 2.3682170542635657,
"grad_norm": 0.6650335500867999,
"learning_rate": 2.3579029875919933e-06,
"loss": 0.3337,
"mean_token_accuracy": 0.8805716382339597,
"num_tokens": 263097308.0,
"step": 611
},
{
"entropy": 0.38568115234375,
"epoch": 2.3720930232558137,
"grad_norm": 0.6613953074032827,
"learning_rate": 2.330405175441529e-06,
"loss": 0.3222,
"mean_token_accuracy": 0.8836323749274015,
"num_tokens": 263521684.0,
"step": 612
},
{
"entropy": 0.38543701171875,
"epoch": 2.375968992248062,
"grad_norm": 0.6419694798675706,
"learning_rate": 2.3030474822259396e-06,
"loss": 0.3215,
"mean_token_accuracy": 0.8862187461927533,
"num_tokens": 263961105.0,
"step": 613
},
{
"entropy": 0.384033203125,
"epoch": 2.37984496124031,
"grad_norm": 0.639786053232097,
"learning_rate": 2.275830407754006e-06,
"loss": 0.3083,
"mean_token_accuracy": 0.8866908960044384,
"num_tokens": 264391888.0,
"step": 614
},
{
"entropy": 0.388092041015625,
"epoch": 2.383720930232558,
"grad_norm": 0.6195099013880934,
"learning_rate": 2.2487544492654832e-06,
"loss": 0.3269,
"mean_token_accuracy": 0.8840560354292393,
"num_tokens": 264826227.0,
"step": 615
},
{
"entropy": 0.387176513671875,
"epoch": 2.387596899224806,
"grad_norm": 0.6003192458281774,
"learning_rate": 2.2218201014220266e-06,
"loss": 0.3172,
"mean_token_accuracy": 0.8835435407236218,
"num_tokens": 265245659.0,
"step": 616
},
{
"entropy": 0.38287353515625,
"epoch": 2.391472868217054,
"grad_norm": 0.6213328046705171,
"learning_rate": 2.1950278562981497e-06,
"loss": 0.3127,
"mean_token_accuracy": 0.8866761410608888,
"num_tokens": 265682628.0,
"step": 617
},
{
"entropy": 0.392486572265625,
"epoch": 2.395348837209302,
"grad_norm": 0.6357426254067038,
"learning_rate": 2.1683782033722313e-06,
"loss": 0.3376,
"mean_token_accuracy": 0.880131833255291,
"num_tokens": 266095365.0,
"step": 618
},
{
"entropy": 0.3843994140625,
"epoch": 2.39922480620155,
"grad_norm": 0.6563518530775935,
"learning_rate": 2.1418716295175766e-06,
"loss": 0.3132,
"mean_token_accuracy": 0.8866546172648668,
"num_tokens": 266542811.0,
"step": 619
},
{
"entropy": 0.388702392578125,
"epoch": 2.4031007751937983,
"grad_norm": 0.6423172247359347,
"learning_rate": 2.1155086189935227e-06,
"loss": 0.3204,
"mean_token_accuracy": 0.8823329349979758,
"num_tokens": 266968847.0,
"step": 620
},
{
"entropy": 0.383148193359375,
"epoch": 2.4069767441860463,
"grad_norm": 0.6140724947507443,
"learning_rate": 2.08928965343659e-06,
"loss": 0.3114,
"mean_token_accuracy": 0.8879816886037588,
"num_tokens": 267405210.0,
"step": 621
},
{
"entropy": 0.3831787109375,
"epoch": 2.4108527131782944,
"grad_norm": 0.6120711901597875,
"learning_rate": 2.063215211851678e-06,
"loss": 0.3102,
"mean_token_accuracy": 0.8873960571363568,
"num_tokens": 267834286.0,
"step": 622
},
{
"entropy": 0.38360595703125,
"epoch": 2.4147286821705425,
"grad_norm": 0.6702572768032118,
"learning_rate": 2.037285770603321e-06,
"loss": 0.3297,
"mean_token_accuracy": 0.882229084149003,
"num_tokens": 268270543.0,
"step": 623
},
{
"entropy": 0.37982177734375,
"epoch": 2.4186046511627906,
"grad_norm": 0.6677987053876467,
"learning_rate": 2.0115018034069955e-06,
"loss": 0.3136,
"mean_token_accuracy": 0.8866470847278833,
"num_tokens": 268692883.0,
"step": 624
},
{
"entropy": 0.379608154296875,
"epoch": 2.4224806201550386,
"grad_norm": 1.0553125635780334,
"learning_rate": 1.9858637813204352e-06,
"loss": 0.3089,
"mean_token_accuracy": 0.8852367643266916,
"num_tokens": 269108495.0,
"step": 625
},
{
"entropy": 0.383544921875,
"epoch": 2.4263565891472867,
"grad_norm": 0.6385113376841698,
"learning_rate": 1.9603721727350532e-06,
"loss": 0.312,
"mean_token_accuracy": 0.8891161847859621,
"num_tokens": 269541818.0,
"step": 626
},
{
"entropy": 0.38232421875,
"epoch": 2.4302325581395348,
"grad_norm": 0.6093604650776818,
"learning_rate": 1.9350274433673745e-06,
"loss": 0.3065,
"mean_token_accuracy": 0.8878201972693205,
"num_tokens": 269944971.0,
"step": 627
},
{
"entropy": 0.388641357421875,
"epoch": 2.434108527131783,
"grad_norm": 0.6283112616971976,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.314,
"mean_token_accuracy": 0.8887106478214264,
"num_tokens": 270357437.0,
"step": 628
},
{
"entropy": 0.384918212890625,
"epoch": 2.437984496124031,
"grad_norm": 0.6207773662097917,
"learning_rate": 1.8847804717257833e-06,
"loss": 0.3184,
"mean_token_accuracy": 0.884671707637608,
"num_tokens": 270804654.0,
"step": 629
},
{
"entropy": 0.38421630859375,
"epoch": 2.441860465116279,
"grad_norm": 0.6193418360544493,
"learning_rate": 1.8598791474341516e-06,
"loss": 0.3234,
"mean_token_accuracy": 0.8826173283159733,
"num_tokens": 271227560.0,
"step": 630
},
{
"entropy": 0.3800048828125,
"epoch": 2.445736434108527,
"grad_norm": 0.6173876400834213,
"learning_rate": 1.835126538308013e-06,
"loss": 0.3123,
"mean_token_accuracy": 0.8874257709830999,
"num_tokens": 271645073.0,
"step": 631
},
{
"entropy": 0.3846435546875,
"epoch": 2.449612403100775,
"grad_norm": 0.6232085631801352,
"learning_rate": 1.810523096562814e-06,
"loss": 0.3124,
"mean_token_accuracy": 0.8858788376674056,
"num_tokens": 272066074.0,
"step": 632
},
{
"entropy": 0.383758544921875,
"epoch": 2.453488372093023,
"grad_norm": 0.6681731482811958,
"learning_rate": 1.7860692716887906e-06,
"loss": 0.3101,
"mean_token_accuracy": 0.8878279887139797,
"num_tokens": 272500490.0,
"step": 633
},
{
"entropy": 0.38494873046875,
"epoch": 2.4573643410852712,
"grad_norm": 0.6414729428255811,
"learning_rate": 1.7617655104427833e-06,
"loss": 0.333,
"mean_token_accuracy": 0.8813438573852181,
"num_tokens": 272914398.0,
"step": 634
},
{
"entropy": 0.383514404296875,
"epoch": 2.4612403100775193,
"grad_norm": 0.6210433661294859,
"learning_rate": 1.7376122568400533e-06,
"loss": 0.3239,
"mean_token_accuracy": 0.882821892388165,
"num_tokens": 273359632.0,
"step": 635
},
{
"entropy": 0.377685546875,
"epoch": 2.4651162790697674,
"grad_norm": 0.6122133439124037,
"learning_rate": 1.713609952146168e-06,
"loss": 0.3229,
"mean_token_accuracy": 0.8840822214260697,
"num_tokens": 273808032.0,
"step": 636
},
{
"entropy": 0.3824462890625,
"epoch": 2.4689922480620154,
"grad_norm": 0.6021018597895732,
"learning_rate": 1.6897590348689607e-06,
"loss": 0.3068,
"mean_token_accuracy": 0.8891383018344641,
"num_tokens": 274244780.0,
"step": 637
},
{
"entropy": 0.3824462890625,
"epoch": 2.4728682170542635,
"grad_norm": 0.6348069079815698,
"learning_rate": 1.6660599407504995e-06,
"loss": 0.319,
"mean_token_accuracy": 0.8837192356586456,
"num_tokens": 274681493.0,
"step": 638
},
{
"entropy": 0.385833740234375,
"epoch": 2.4767441860465116,
"grad_norm": 0.6124007089537341,
"learning_rate": 1.6425131027591368e-06,
"loss": 0.3299,
"mean_token_accuracy": 0.8837573220953345,
"num_tokens": 275115770.0,
"step": 639
},
{
"entropy": 0.3885498046875,
"epoch": 2.4806201550387597,
"grad_norm": 0.6271542904897481,
"learning_rate": 1.6191189510815942e-06,
"loss": 0.323,
"mean_token_accuracy": 0.8834406100213528,
"num_tokens": 275540602.0,
"step": 640
},
{
"entropy": 0.3846435546875,
"epoch": 2.4844961240310077,
"grad_norm": 0.6598534403286116,
"learning_rate": 1.5958779131151049e-06,
"loss": 0.3317,
"mean_token_accuracy": 0.8816557712852955,
"num_tokens": 275960564.0,
"step": 641
},
{
"entropy": 0.38409423828125,
"epoch": 2.488372093023256,
"grad_norm": 0.616565215055412,
"learning_rate": 1.5727904134596084e-06,
"loss": 0.3083,
"mean_token_accuracy": 0.8885947009548545,
"num_tokens": 276395546.0,
"step": 642
},
{
"entropy": 0.38604736328125,
"epoch": 2.492248062015504,
"grad_norm": 0.6157486130398027,
"learning_rate": 1.5498568739099907e-06,
"loss": 0.3155,
"mean_token_accuracy": 0.8881635349243879,
"num_tokens": 276819951.0,
"step": 643
},
{
"entropy": 0.3807373046875,
"epoch": 2.496124031007752,
"grad_norm": 0.6303652006027609,
"learning_rate": 1.5270777134483683e-06,
"loss": 0.3351,
"mean_token_accuracy": 0.8798212753608823,
"num_tokens": 277257356.0,
"step": 644
},
{
"entropy": 0.388397216796875,
"epoch": 2.5,
"grad_norm": 0.642010581870401,
"learning_rate": 1.504453348236461e-06,
"loss": 0.3212,
"mean_token_accuracy": 0.8849497428163886,
"num_tokens": 277661317.0,
"step": 645
},
{
"entropy": 0.382965087890625,
"epoch": 2.503875968992248,
"grad_norm": 0.6252232844795501,
"learning_rate": 1.481984191607959e-06,
"loss": 0.3105,
"mean_token_accuracy": 0.8894343795254827,
"num_tokens": 278088072.0,
"step": 646
},
{
"entropy": 0.380126953125,
"epoch": 2.507751937984496,
"grad_norm": 0.582093369587659,
"learning_rate": 1.4596706540609862e-06,
"loss": 0.3238,
"mean_token_accuracy": 0.8829025160521269,
"num_tokens": 278525498.0,
"step": 647
},
{
"entropy": 0.393646240234375,
"epoch": 2.511627906976744,
"grad_norm": 0.6328882087488623,
"learning_rate": 1.4375131432505984e-06,
"loss": 0.317,
"mean_token_accuracy": 0.8850847911089659,
"num_tokens": 278934425.0,
"step": 648
},
{
"entropy": 0.3792724609375,
"epoch": 2.5155038759689923,
"grad_norm": 0.6316222487630614,
"learning_rate": 1.4155120639813392e-06,
"loss": 0.3147,
"mean_token_accuracy": 0.8857432128861547,
"num_tokens": 279368869.0,
"step": 649
},
{
"entropy": 0.384033203125,
"epoch": 2.5193798449612403,
"grad_norm": 0.6163425061600645,
"learning_rate": 1.3936678181998376e-06,
"loss": 0.3077,
"mean_token_accuracy": 0.8878246061503887,
"num_tokens": 279776778.0,
"step": 650
},
{
"entropy": 0.384521484375,
"epoch": 2.5232558139534884,
"grad_norm": 0.6010971074385706,
"learning_rate": 1.3719808049874695e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.885104707442224,
"num_tokens": 280216614.0,
"step": 651
},
{
"entropy": 0.383270263671875,
"epoch": 2.5271317829457365,
"grad_norm": 0.6249118870033076,
"learning_rate": 1.350451420553065e-06,
"loss": 0.3095,
"mean_token_accuracy": 0.8848751662299037,
"num_tokens": 280634172.0,
"step": 652
},
{
"entropy": 0.385467529296875,
"epoch": 2.5310077519379846,
"grad_norm": 0.6762125774083043,
"learning_rate": 1.3290800582256714e-06,
"loss": 0.3141,
"mean_token_accuracy": 0.8861252348870039,
"num_tokens": 281056902.0,
"step": 653
},
{
"entropy": 0.37860107421875,
"epoch": 2.5348837209302326,
"grad_norm": 0.6074642502490406,
"learning_rate": 1.3078671084473604e-06,
"loss": 0.3079,
"mean_token_accuracy": 0.8881129696965218,
"num_tokens": 281497206.0,
"step": 654
},
{
"entropy": 0.380523681640625,
"epoch": 2.5387596899224807,
"grad_norm": 0.6209025013219114,
"learning_rate": 1.286812958766106e-06,
"loss": 0.3049,
"mean_token_accuracy": 0.8877112930640578,
"num_tokens": 281916304.0,
"step": 655
},
{
"entropy": 0.385955810546875,
"epoch": 2.5426356589147288,
"grad_norm": 0.6328788729805691,
"learning_rate": 1.2659179938287035e-06,
"loss": 0.3231,
"mean_token_accuracy": 0.883956940844655,
"num_tokens": 282338790.0,
"step": 656
},
{
"entropy": 0.387176513671875,
"epoch": 2.546511627906977,
"grad_norm": 0.6132138038624881,
"learning_rate": 1.2451825953737273e-06,
"loss": 0.3125,
"mean_token_accuracy": 0.8869189685210586,
"num_tokens": 282771871.0,
"step": 657
},
{
"entropy": 0.383087158203125,
"epoch": 2.550387596899225,
"grad_norm": 0.6100791024748512,
"learning_rate": 1.224607142224572e-06,
"loss": 0.3213,
"mean_token_accuracy": 0.8853695271536708,
"num_tokens": 283186245.0,
"step": 658
},
{
"entropy": 0.387054443359375,
"epoch": 2.554263565891473,
"grad_norm": 0.6239314174742299,
"learning_rate": 1.2041920102825277e-06,
"loss": 0.299,
"mean_token_accuracy": 0.8931658444926143,
"num_tokens": 283603528.0,
"step": 659
},
{
"entropy": 0.390350341796875,
"epoch": 2.558139534883721,
"grad_norm": 0.5964951578284012,
"learning_rate": 1.1839375725199098e-06,
"loss": 0.3206,
"mean_token_accuracy": 0.8860745606943965,
"num_tokens": 284012300.0,
"step": 660
},
{
"entropy": 0.37884521484375,
"epoch": 2.562015503875969,
"grad_norm": 0.6186039138525928,
"learning_rate": 1.1638441989732474e-06,
"loss": 0.305,
"mean_token_accuracy": 0.8898253720253706,
"num_tokens": 284456555.0,
"step": 661
},
{
"entropy": 0.38311767578125,
"epoch": 2.565891472868217,
"grad_norm": 0.6436702012478264,
"learning_rate": 1.1439122567365214e-06,
"loss": 0.3143,
"mean_token_accuracy": 0.8866937700659037,
"num_tokens": 284898106.0,
"step": 662
},
{
"entropy": 0.38592529296875,
"epoch": 2.5697674418604652,
"grad_norm": 0.6081422963421407,
"learning_rate": 1.124142109954459e-06,
"loss": 0.3092,
"mean_token_accuracy": 0.8888078099116683,
"num_tokens": 285322897.0,
"step": 663
},
{
"entropy": 0.3870849609375,
"epoch": 2.5736434108527133,
"grad_norm": 0.618254819328906,
"learning_rate": 1.1045341198158833e-06,
"loss": 0.2992,
"mean_token_accuracy": 0.892044042237103,
"num_tokens": 285743476.0,
"step": 664
},
{
"entropy": 0.382415771484375,
"epoch": 2.5775193798449614,
"grad_norm": 0.6382580440374259,
"learning_rate": 1.0850886445471055e-06,
"loss": 0.3199,
"mean_token_accuracy": 0.8851598743349314,
"num_tokens": 286167399.0,
"step": 665
},
{
"entropy": 0.38165283203125,
"epoch": 2.5813953488372094,
"grad_norm": 0.6249775328464647,
"learning_rate": 1.0658060394053904e-06,
"loss": 0.3105,
"mean_token_accuracy": 0.8882809020578861,
"num_tokens": 286601559.0,
"step": 666
},
{
"entropy": 0.385833740234375,
"epoch": 2.5852713178294575,
"grad_norm": 0.6355443861472679,
"learning_rate": 1.0466866566724698e-06,
"loss": 0.3164,
"mean_token_accuracy": 0.8866121266037226,
"num_tokens": 287036421.0,
"step": 667
},
{
"entropy": 0.38177490234375,
"epoch": 2.5891472868217056,
"grad_norm": 0.5788824075949772,
"learning_rate": 1.027730845648085e-06,
"loss": 0.3273,
"mean_token_accuracy": 0.8835360938683152,
"num_tokens": 287484834.0,
"step": 668
},
{
"entropy": 0.381439208984375,
"epoch": 2.5930232558139537,
"grad_norm": 0.6105981189688949,
"learning_rate": 1.0089389526436299e-06,
"loss": 0.3098,
"mean_token_accuracy": 0.8879820080474019,
"num_tokens": 287920778.0,
"step": 669
},
{
"entropy": 0.377593994140625,
"epoch": 2.5968992248062017,
"grad_norm": 0.6244913453198664,
"learning_rate": 9.903113209758098e-07,
"loss": 0.3051,
"mean_token_accuracy": 0.8882512943819165,
"num_tokens": 288354358.0,
"step": 670
},
{
"entropy": 0.37994384765625,
"epoch": 2.60077519379845,
"grad_norm": 0.6181953248203848,
"learning_rate": 9.718482909603732e-07,
"loss": 0.3117,
"mean_token_accuracy": 0.8855423256754875,
"num_tokens": 288793843.0,
"step": 671
},
{
"entropy": 0.37725830078125,
"epoch": 2.604651162790698,
"grad_norm": 0.6349924792877945,
"learning_rate": 9.535501999058971e-07,
"loss": 0.3168,
"mean_token_accuracy": 0.8862435938790441,
"num_tokens": 289225299.0,
"step": 672
},
{
"entropy": 0.387481689453125,
"epoch": 2.608527131782946,
"grad_norm": 0.6693755629098281,
"learning_rate": 9.354173821076184e-07,
"loss": 0.3234,
"mean_token_accuracy": 0.8822949966415763,
"num_tokens": 289644576.0,
"step": 673
},
{
"entropy": 0.386260986328125,
"epoch": 2.612403100775194,
"grad_norm": 0.6097539865340834,
"learning_rate": 9.174501688413329e-07,
"loss": 0.3205,
"mean_token_accuracy": 0.886177402921021,
"num_tokens": 290070186.0,
"step": 674
},
{
"entropy": 0.38409423828125,
"epoch": 2.616279069767442,
"grad_norm": 0.6152331705902413,
"learning_rate": 8.996488883573351e-07,
"loss": 0.3229,
"mean_token_accuracy": 0.8854604810476303,
"num_tokens": 290499509.0,
"step": 675
},
{
"entropy": 0.383544921875,
"epoch": 2.62015503875969,
"grad_norm": 0.6258210400477547,
"learning_rate": 8.820138658744304e-07,
"loss": 0.304,
"mean_token_accuracy": 0.8898686449974775,
"num_tokens": 290921470.0,
"step": 676
},
{
"entropy": 0.387298583984375,
"epoch": 2.624031007751938,
"grad_norm": 0.6137335164148037,
"learning_rate": 8.645454235739903e-07,
"loss": 0.325,
"mean_token_accuracy": 0.8838518625125289,
"num_tokens": 291348905.0,
"step": 677
},
{
"entropy": 0.386138916015625,
"epoch": 2.6279069767441863,
"grad_norm": 0.6432892349676601,
"learning_rate": 8.472438805940652e-07,
"loss": 0.332,
"mean_token_accuracy": 0.8829165082424879,
"num_tokens": 291774652.0,
"step": 678
},
{
"entropy": 0.3824462890625,
"epoch": 2.6317829457364343,
"grad_norm": 0.6291908471810097,
"learning_rate": 8.301095530235492e-07,
"loss": 0.338,
"mean_token_accuracy": 0.882173334248364,
"num_tokens": 292214413.0,
"step": 679
},
{
"entropy": 0.38372802734375,
"epoch": 2.6356589147286824,
"grad_norm": 0.6063757017276409,
"learning_rate": 8.131427538964165e-07,
"loss": 0.3184,
"mean_token_accuracy": 0.8866541981697083,
"num_tokens": 292653229.0,
"step": 680
},
{
"entropy": 0.38775634765625,
"epoch": 2.6395348837209305,
"grad_norm": 0.6573377080205017,
"learning_rate": 7.963437931859919e-07,
"loss": 0.297,
"mean_token_accuracy": 0.8897636560723186,
"num_tokens": 293062442.0,
"step": 681
},
{
"entropy": 0.3831787109375,
"epoch": 2.6434108527131785,
"grad_norm": 0.6270362942338372,
"learning_rate": 7.797129777992951e-07,
"loss": 0.3213,
"mean_token_accuracy": 0.8836455037817359,
"num_tokens": 293504484.0,
"step": 682
},
{
"entropy": 0.38446044921875,
"epoch": 2.6472868217054266,
"grad_norm": 0.6577635211374225,
"learning_rate": 7.632506115714289e-07,
"loss": 0.3089,
"mean_token_accuracy": 0.8872796315699816,
"num_tokens": 293948693.0,
"step": 683
},
{
"entropy": 0.38397216796875,
"epoch": 2.6511627906976747,
"grad_norm": 0.6293287670711796,
"learning_rate": 7.46956995260033e-07,
"loss": 0.317,
"mean_token_accuracy": 0.8846086421981454,
"num_tokens": 294376049.0,
"step": 684
},
{
"entropy": 0.382598876953125,
"epoch": 2.6550387596899228,
"grad_norm": 0.6101235033718403,
"learning_rate": 7.308324265397837e-07,
"loss": 0.3053,
"mean_token_accuracy": 0.8887326065450907,
"num_tokens": 294817272.0,
"step": 685
},
{
"entropy": 0.384246826171875,
"epoch": 2.6589147286821704,
"grad_norm": 0.603135068569057,
"learning_rate": 7.148771999969573e-07,
"loss": 0.3187,
"mean_token_accuracy": 0.8846358032897115,
"num_tokens": 295268820.0,
"step": 686
},
{
"entropy": 0.375823974609375,
"epoch": 2.6627906976744184,
"grad_norm": 0.6370470013882648,
"learning_rate": 6.990916071240506e-07,
"loss": 0.3162,
"mean_token_accuracy": 0.8857185756787658,
"num_tokens": 295704514.0,
"step": 687
},
{
"entropy": 0.3878173828125,
"epoch": 2.6666666666666665,
"grad_norm": 0.6307798739932974,
"learning_rate": 6.834759363144595e-07,
"loss": 0.321,
"mean_token_accuracy": 0.884716515429318,
"num_tokens": 296123518.0,
"step": 688
},
{
"entropy": 0.38555908203125,
"epoch": 2.6705426356589146,
"grad_norm": 0.6359467279336724,
"learning_rate": 6.680304728571963e-07,
"loss": 0.3115,
"mean_token_accuracy": 0.887707345187664,
"num_tokens": 296568970.0,
"step": 689
},
{
"entropy": 0.38232421875,
"epoch": 2.6744186046511627,
"grad_norm": 0.6594731380323239,
"learning_rate": 6.527554989316898e-07,
"loss": 0.3147,
"mean_token_accuracy": 0.8869885383173823,
"num_tokens": 297009689.0,
"step": 690
},
{
"entropy": 0.376495361328125,
"epoch": 2.6782945736434107,
"grad_norm": 0.6528547449153141,
"learning_rate": 6.37651293602628e-07,
"loss": 0.3056,
"mean_token_accuracy": 0.8896724404767156,
"num_tokens": 297466234.0,
"step": 691
},
{
"entropy": 0.380035400390625,
"epoch": 2.682170542635659,
"grad_norm": 0.5984715392007748,
"learning_rate": 6.227181328148568e-07,
"loss": 0.3074,
"mean_token_accuracy": 0.8880112646147609,
"num_tokens": 297913176.0,
"step": 692
},
{
"entropy": 0.382171630859375,
"epoch": 2.686046511627907,
"grad_norm": 0.6055523687350702,
"learning_rate": 6.079562893883395e-07,
"loss": 0.3136,
"mean_token_accuracy": 0.8882074085995555,
"num_tokens": 298336850.0,
"step": 693
},
{
"entropy": 0.380035400390625,
"epoch": 2.689922480620155,
"grad_norm": 0.6196322460239065,
"learning_rate": 5.933660330131752e-07,
"loss": 0.3129,
"mean_token_accuracy": 0.8837150661274791,
"num_tokens": 298778162.0,
"step": 694
},
{
"entropy": 0.382781982421875,
"epoch": 2.693798449612403,
"grad_norm": 0.6053882069820191,
"learning_rate": 5.789476302446662e-07,
"loss": 0.3206,
"mean_token_accuracy": 0.8839719081297517,
"num_tokens": 299213302.0,
"step": 695
},
{
"entropy": 0.37969970703125,
"epoch": 2.697674418604651,
"grad_norm": 0.6516034301333918,
"learning_rate": 5.647013444984561e-07,
"loss": 0.3086,
"mean_token_accuracy": 0.8866365505382419,
"num_tokens": 299633216.0,
"step": 696
},
{
"entropy": 0.378387451171875,
"epoch": 2.701550387596899,
"grad_norm": 0.6020971649541776,
"learning_rate": 5.506274360457087e-07,
"loss": 0.3094,
"mean_token_accuracy": 0.8886294420808554,
"num_tokens": 300056664.0,
"step": 697
},
{
"entropy": 0.388641357421875,
"epoch": 2.705426356589147,
"grad_norm": 0.699089060760847,
"learning_rate": 5.367261620083575e-07,
"loss": 0.3257,
"mean_token_accuracy": 0.8841283833608031,
"num_tokens": 300476483.0,
"step": 698
},
{
"entropy": 0.388946533203125,
"epoch": 2.7093023255813953,
"grad_norm": 0.5976278141664447,
"learning_rate": 5.229977763544148e-07,
"loss": 0.3267,
"mean_token_accuracy": 0.8839065292850137,
"num_tokens": 300896040.0,
"step": 699
},
{
"entropy": 0.38214111328125,
"epoch": 2.7131782945736433,
"grad_norm": 0.598960900544294,
"learning_rate": 5.094425298933136e-07,
"loss": 0.3161,
"mean_token_accuracy": 0.8860001573339105,
"num_tokens": 301328932.0,
"step": 700
},
{
"entropy": 0.378326416015625,
"epoch": 2.7170542635658914,
"grad_norm": 0.6121343549337338,
"learning_rate": 4.960606702713466e-07,
"loss": 0.3095,
"mean_token_accuracy": 0.8888444481417537,
"num_tokens": 301789862.0,
"step": 701
},
{
"entropy": 0.385162353515625,
"epoch": 2.7209302325581395,
"grad_norm": 0.5949190313734338,
"learning_rate": 4.828524419671266e-07,
"loss": 0.3166,
"mean_token_accuracy": 0.8873936915770173,
"num_tokens": 302211924.0,
"step": 702
},
{
"entropy": 0.387908935546875,
"epoch": 2.7248062015503876,
"grad_norm": 0.6231799085677647,
"learning_rate": 4.6981808628712823e-07,
"loss": 0.3226,
"mean_token_accuracy": 0.8857519812881947,
"num_tokens": 302632584.0,
"step": 703
},
{
"entropy": 0.381256103515625,
"epoch": 2.7286821705426356,
"grad_norm": 0.597154298203785,
"learning_rate": 4.569578413612752e-07,
"loss": 0.3246,
"mean_token_accuracy": 0.8845319030806422,
"num_tokens": 303079662.0,
"step": 704
},
{
"entropy": 0.381011962890625,
"epoch": 2.7325581395348837,
"grad_norm": 0.6097019666151339,
"learning_rate": 4.4427194213859216e-07,
"loss": 0.3055,
"mean_token_accuracy": 0.8890186436474323,
"num_tokens": 303513162.0,
"step": 705
},
{
"entropy": 0.377777099609375,
"epoch": 2.7364341085271318,
"grad_norm": 0.6097219538821098,
"learning_rate": 4.3176062038291275e-07,
"loss": 0.3084,
"mean_token_accuracy": 0.8881680406630039,
"num_tokens": 303965110.0,
"step": 706
},
{
"entropy": 0.3841552734375,
"epoch": 2.74031007751938,
"grad_norm": 0.5987018303788194,
"learning_rate": 4.194241046686398e-07,
"loss": 0.3233,
"mean_token_accuracy": 0.8819524059072137,
"num_tokens": 304417429.0,
"step": 707
},
{
"entropy": 0.379608154296875,
"epoch": 2.744186046511628,
"grad_norm": 0.6088852283958164,
"learning_rate": 4.0726262037657506e-07,
"loss": 0.3172,
"mean_token_accuracy": 0.8878186987712979,
"num_tokens": 304848098.0,
"step": 708
},
{
"entropy": 0.38323974609375,
"epoch": 2.748062015503876,
"grad_norm": 0.6247123304946086,
"learning_rate": 3.9527638968980707e-07,
"loss": 0.3051,
"mean_token_accuracy": 0.8870606170967221,
"num_tokens": 305267991.0,
"step": 709
},
{
"entropy": 0.3841552734375,
"epoch": 2.751937984496124,
"grad_norm": 0.6268733632031187,
"learning_rate": 3.834656315896379e-07,
"loss": 0.3134,
"mean_token_accuracy": 0.8865107716992497,
"num_tokens": 305695308.0,
"step": 710
},
{
"entropy": 0.3807373046875,
"epoch": 2.755813953488372,
"grad_norm": 0.6379178455584169,
"learning_rate": 3.718305618515905e-07,
"loss": 0.3188,
"mean_token_accuracy": 0.8853443302214146,
"num_tokens": 306143903.0,
"step": 711
},
{
"entropy": 0.380645751953125,
"epoch": 2.75968992248062,
"grad_norm": 0.6004645935396044,
"learning_rate": 3.603713930414676e-07,
"loss": 0.3128,
"mean_token_accuracy": 0.8875391287729144,
"num_tokens": 306583114.0,
"step": 712
},
{
"entropy": 0.377777099609375,
"epoch": 2.7635658914728682,
"grad_norm": 0.6013189306552227,
"learning_rate": 3.490883345114671e-07,
"loss": 0.3099,
"mean_token_accuracy": 0.8893257696181536,
"num_tokens": 307035468.0,
"step": 713
},
{
"entropy": 0.377227783203125,
"epoch": 2.7674418604651163,
"grad_norm": 0.6471701813407102,
"learning_rate": 3.3798159239635585e-07,
"loss": 0.3104,
"mean_token_accuracy": 0.8878767946735024,
"num_tokens": 307468508.0,
"step": 714
},
{
"entropy": 0.3798828125,
"epoch": 2.7713178294573644,
"grad_norm": 0.6053150592034582,
"learning_rate": 3.2705136960970554e-07,
"loss": 0.3051,
"mean_token_accuracy": 0.8884820081293583,
"num_tokens": 307897907.0,
"step": 715
},
{
"entropy": 0.386627197265625,
"epoch": 2.7751937984496124,
"grad_norm": 0.6168893552698425,
"learning_rate": 3.1629786584018387e-07,
"loss": 0.3104,
"mean_token_accuracy": 0.8878091182559729,
"num_tokens": 308327346.0,
"step": 716
},
{
"entropy": 0.383270263671875,
"epoch": 2.7790697674418605,
"grad_norm": 0.6368678138884837,
"learning_rate": 3.05721277547909e-07,
"loss": 0.3224,
"mean_token_accuracy": 0.8836130304262042,
"num_tokens": 308762802.0,
"step": 717
},
{
"entropy": 0.377960205078125,
"epoch": 2.7829457364341086,
"grad_norm": 0.6316411958735261,
"learning_rate": 2.9532179796085356e-07,
"loss": 0.2994,
"mean_token_accuracy": 0.8904037978500128,
"num_tokens": 309183208.0,
"step": 718
},
{
"entropy": 0.3790283203125,
"epoch": 2.7868217054263567,
"grad_norm": 0.6158546042861247,
"learning_rate": 2.8509961707132496e-07,
"loss": 0.3092,
"mean_token_accuracy": 0.8881755471229553,
"num_tokens": 309623208.0,
"step": 719
},
{
"entropy": 0.374969482421875,
"epoch": 2.7906976744186047,
"grad_norm": 0.6093210127446324,
"learning_rate": 2.750549216324894e-07,
"loss": 0.3,
"mean_token_accuracy": 0.8928798316046596,
"num_tokens": 310055523.0,
"step": 720
},
{
"entropy": 0.38287353515625,
"epoch": 2.794573643410853,
"grad_norm": 0.6559455125212247,
"learning_rate": 2.6518789515495356e-07,
"loss": 0.3148,
"mean_token_accuracy": 0.8859463995322585,
"num_tokens": 310485887.0,
"step": 721
},
{
"entropy": 0.38128662109375,
"epoch": 2.798449612403101,
"grad_norm": 0.6032446881532523,
"learning_rate": 2.554987179034218e-07,
"loss": 0.3221,
"mean_token_accuracy": 0.8861085483804345,
"num_tokens": 310926474.0,
"step": 722
},
{
"entropy": 0.385955810546875,
"epoch": 2.802325581395349,
"grad_norm": 0.6176670649999463,
"learning_rate": 2.4598756689339975e-07,
"loss": 0.3116,
"mean_token_accuracy": 0.8888868298381567,
"num_tokens": 311337806.0,
"step": 723
},
{
"entropy": 0.37939453125,
"epoch": 2.806201550387597,
"grad_norm": 0.6115489071383378,
"learning_rate": 2.3665461588795902e-07,
"loss": 0.3099,
"mean_token_accuracy": 0.887202151119709,
"num_tokens": 311771969.0,
"step": 724
},
{
"entropy": 0.380401611328125,
"epoch": 2.810077519379845,
"grad_norm": 0.6168345748959235,
"learning_rate": 2.2750003539456e-07,
"loss": 0.2989,
"mean_token_accuracy": 0.8897026870399714,
"num_tokens": 312196977.0,
"step": 725
},
{
"entropy": 0.378082275390625,
"epoch": 2.813953488372093,
"grad_norm": 0.5932331085634238,
"learning_rate": 2.1852399266194312e-07,
"loss": 0.2981,
"mean_token_accuracy": 0.8931511901319027,
"num_tokens": 312628560.0,
"step": 726
},
{
"entropy": 0.378814697265625,
"epoch": 2.817829457364341,
"grad_norm": 0.6263166553893282,
"learning_rate": 2.097266516770713e-07,
"loss": 0.2902,
"mean_token_accuracy": 0.8911039233207703,
"num_tokens": 313057357.0,
"step": 727
},
{
"entropy": 0.38458251953125,
"epoch": 2.8217054263565893,
"grad_norm": 0.6039916001081406,
"learning_rate": 2.0110817316212893e-07,
"loss": 0.3166,
"mean_token_accuracy": 0.8877752646803856,
"num_tokens": 313489145.0,
"step": 728
},
{
"entropy": 0.3828125,
"epoch": 2.8255813953488373,
"grad_norm": 0.5978445141112033,
"learning_rate": 1.9266871457159108e-07,
"loss": 0.3148,
"mean_token_accuracy": 0.8861930128186941,
"num_tokens": 313909863.0,
"step": 729
},
{
"entropy": 0.37738037109375,
"epoch": 2.8294573643410854,
"grad_norm": 0.6048382839853098,
"learning_rate": 1.844084300893456e-07,
"loss": 0.3046,
"mean_token_accuracy": 0.8915831623598933,
"num_tokens": 314341606.0,
"step": 730
},
{
"entropy": 0.380645751953125,
"epoch": 2.8333333333333335,
"grad_norm": 0.6482349068060914,
"learning_rate": 1.7632747062587884e-07,
"loss": 0.3061,
"mean_token_accuracy": 0.8873017486184835,
"num_tokens": 314769168.0,
"step": 731
},
{
"entropy": 0.377166748046875,
"epoch": 2.8372093023255816,
"grad_norm": 0.6049506747628145,
"learning_rate": 1.6842598381551e-07,
"loss": 0.2998,
"mean_token_accuracy": 0.8923018351197243,
"num_tokens": 315195435.0,
"step": 732
},
{
"entropy": 0.388031005859375,
"epoch": 2.8410852713178296,
"grad_norm": 0.5889538128743542,
"learning_rate": 1.6070411401370335e-07,
"loss": 0.305,
"mean_token_accuracy": 0.8921971945092082,
"num_tokens": 315604073.0,
"step": 733
},
{
"entropy": 0.37384033203125,
"epoch": 2.8449612403100772,
"grad_norm": 0.6226688966249363,
"learning_rate": 1.531620022944269e-07,
"loss": 0.3002,
"mean_token_accuracy": 0.8922487432137132,
"num_tokens": 316040545.0,
"step": 734
},
{
"entropy": 0.38275146484375,
"epoch": 2.8488372093023253,
"grad_norm": 0.668427970745727,
"learning_rate": 1.4579978644757463e-07,
"loss": 0.3181,
"mean_token_accuracy": 0.8864294197410345,
"num_tokens": 316468028.0,
"step": 735
},
{
"entropy": 0.378875732421875,
"epoch": 2.8527131782945734,
"grad_norm": 0.5924749050744685,
"learning_rate": 1.3861760097645062e-07,
"loss": 0.3031,
"mean_token_accuracy": 0.8921865597367287,
"num_tokens": 316904656.0,
"step": 736
},
{
"entropy": 0.375030517578125,
"epoch": 2.8565891472868215,
"grad_norm": 0.6041440396713224,
"learning_rate": 1.3161557709530982e-07,
"loss": 0.3117,
"mean_token_accuracy": 0.8871205970644951,
"num_tokens": 317339393.0,
"step": 737
},
{
"entropy": 0.385498046875,
"epoch": 2.8604651162790695,
"grad_norm": 0.6084600637159178,
"learning_rate": 1.2479384272696572e-07,
"loss": 0.3076,
"mean_token_accuracy": 0.8861882789060473,
"num_tokens": 317754313.0,
"step": 738
},
{
"entropy": 0.382904052734375,
"epoch": 2.8643410852713176,
"grad_norm": 0.5945461244400033,
"learning_rate": 1.1815252250044318e-07,
"loss": 0.3045,
"mean_token_accuracy": 0.8900884315371513,
"num_tokens": 318175206.0,
"step": 739
},
{
"entropy": 0.38031005859375,
"epoch": 2.8682170542635657,
"grad_norm": 0.5881787762100744,
"learning_rate": 1.1169173774871478e-07,
"loss": 0.2931,
"mean_token_accuracy": 0.8941018283367157,
"num_tokens": 318603349.0,
"step": 740
},
{
"entropy": 0.3812255859375,
"epoch": 2.8720930232558137,
"grad_norm": 0.5955703008177884,
"learning_rate": 1.0541160650647364e-07,
"loss": 0.3113,
"mean_token_accuracy": 0.8901800969615579,
"num_tokens": 319029522.0,
"step": 741
},
{
"entropy": 0.38336181640625,
"epoch": 2.875968992248062,
"grad_norm": 0.620592670160612,
"learning_rate": 9.931224350798185e-08,
"loss": 0.3163,
"mean_token_accuracy": 0.889438440091908,
"num_tokens": 319467084.0,
"step": 742
},
{
"entropy": 0.389007568359375,
"epoch": 2.87984496124031,
"grad_norm": 0.6263135619513145,
"learning_rate": 9.339376018497216e-08,
"loss": 0.3135,
"mean_token_accuracy": 0.8862051470205188,
"num_tokens": 319883624.0,
"step": 743
},
{
"entropy": 0.374481201171875,
"epoch": 2.883720930232558,
"grad_norm": 0.6146314660717794,
"learning_rate": 8.765626466461397e-08,
"loss": 0.3245,
"mean_token_accuracy": 0.8832680499181151,
"num_tokens": 320347517.0,
"step": 744
},
{
"entropy": 0.379791259765625,
"epoch": 2.887596899224806,
"grad_norm": 0.6482836636122309,
"learning_rate": 8.209986176753947e-08,
"loss": 0.316,
"mean_token_accuracy": 0.885383871383965,
"num_tokens": 320769928.0,
"step": 745
},
{
"entropy": 0.3804931640625,
"epoch": 2.891472868217054,
"grad_norm": 0.6068986326895162,
"learning_rate": 7.672465300592069e-08,
"loss": 0.3152,
"mean_token_accuracy": 0.8881333563476801,
"num_tokens": 321217963.0,
"step": 746
},
{
"entropy": 0.380157470703125,
"epoch": 2.895348837209302,
"grad_norm": 0.5942824706627966,
"learning_rate": 7.153073658162646e-08,
"loss": 0.3131,
"mean_token_accuracy": 0.8865061281248927,
"num_tokens": 321637092.0,
"step": 747
},
{
"entropy": 0.378143310546875,
"epoch": 2.89922480620155,
"grad_norm": 0.6413515101364472,
"learning_rate": 6.65182073844195e-08,
"loss": 0.3028,
"mean_token_accuracy": 0.8900581542402506,
"num_tokens": 322084312.0,
"step": 748
},
{
"entropy": 0.379058837890625,
"epoch": 2.9031007751937983,
"grad_norm": 0.5995579886871476,
"learning_rate": 6.168715699022776e-08,
"loss": 0.3201,
"mean_token_accuracy": 0.8862557569518685,
"num_tokens": 322517226.0,
"step": 749
},
{
"entropy": 0.379669189453125,
"epoch": 2.9069767441860463,
"grad_norm": 0.6139724741481611,
"learning_rate": 5.7037673659464664e-08,
"loss": 0.3062,
"mean_token_accuracy": 0.8911184314638376,
"num_tokens": 322943803.0,
"step": 750
},
{
"entropy": 0.38592529296875,
"epoch": 2.9108527131782944,
"grad_norm": 0.6371344577180248,
"learning_rate": 5.256984233542595e-08,
"loss": 0.3227,
"mean_token_accuracy": 0.8849206436425447,
"num_tokens": 323366694.0,
"step": 751
},
{
"entropy": 0.38250732421875,
"epoch": 2.9147286821705425,
"grad_norm": 0.595286110379896,
"learning_rate": 4.828374464273422e-08,
"loss": 0.3146,
"mean_token_accuracy": 0.8888753112405539,
"num_tokens": 323787284.0,
"step": 752
},
{
"entropy": 0.379486083984375,
"epoch": 2.9186046511627906,
"grad_norm": 0.608322323266727,
"learning_rate": 4.417945888584241e-08,
"loss": 0.3023,
"mean_token_accuracy": 0.8914187019690871,
"num_tokens": 324198852.0,
"step": 753
},
{
"entropy": 0.386444091796875,
"epoch": 2.9224806201550386,
"grad_norm": 0.6120063755796368,
"learning_rate": 4.025706004760932e-08,
"loss": 0.3149,
"mean_token_accuracy": 0.8872729791328311,
"num_tokens": 324621415.0,
"step": 754
},
{
"entropy": 0.387176513671875,
"epoch": 2.9263565891472867,
"grad_norm": 0.623477103826481,
"learning_rate": 3.651661978793075e-08,
"loss": 0.3043,
"mean_token_accuracy": 0.8890358442440629,
"num_tokens": 325023864.0,
"step": 755
},
{
"entropy": 0.3787841796875,
"epoch": 2.9302325581395348,
"grad_norm": 0.6148588161751789,
"learning_rate": 3.2958206442422754e-08,
"loss": 0.3102,
"mean_token_accuracy": 0.8885551122948527,
"num_tokens": 325444314.0,
"step": 756
},
{
"entropy": 0.377716064453125,
"epoch": 2.934108527131783,
"grad_norm": 0.6007577376002693,
"learning_rate": 2.9581885021181534e-08,
"loss": 0.3032,
"mean_token_accuracy": 0.8900681380182505,
"num_tokens": 325899500.0,
"step": 757
},
{
"entropy": 0.381317138671875,
"epoch": 2.937984496124031,
"grad_norm": 0.6031347151158447,
"learning_rate": 2.6387717207589925e-08,
"loss": 0.3206,
"mean_token_accuracy": 0.8850037911906838,
"num_tokens": 326331108.0,
"step": 758
},
{
"entropy": 0.385345458984375,
"epoch": 2.941860465116279,
"grad_norm": 0.601218932311134,
"learning_rate": 2.3375761357193883e-08,
"loss": 0.3109,
"mean_token_accuracy": 0.8882812475785613,
"num_tokens": 326742800.0,
"step": 759
},
{
"entropy": 0.37762451171875,
"epoch": 2.945736434108527,
"grad_norm": 0.6655915324031394,
"learning_rate": 2.054607249663665e-08,
"loss": 0.3276,
"mean_token_accuracy": 0.88413349352777,
"num_tokens": 327198682.0,
"step": 760
},
{
"entropy": 0.38250732421875,
"epoch": 2.949612403100775,
"grad_norm": 0.6064845718430714,
"learning_rate": 1.7898702322648453e-08,
"loss": 0.3008,
"mean_token_accuracy": 0.8902482967823744,
"num_tokens": 327619351.0,
"step": 761
},
{
"entropy": 0.379852294921875,
"epoch": 2.953488372093023,
"grad_norm": 0.6119773387004711,
"learning_rate": 1.5433699201108377e-08,
"loss": 0.3153,
"mean_token_accuracy": 0.8852389631792903,
"num_tokens": 328045535.0,
"step": 762
},
{
"entropy": 0.38037109375,
"epoch": 2.9573643410852712,
"grad_norm": 0.6198918451978505,
"learning_rate": 1.3151108166156168e-08,
"loss": 0.3096,
"mean_token_accuracy": 0.8888922138139606,
"num_tokens": 328473736.0,
"step": 763
},
{
"entropy": 0.378204345703125,
"epoch": 2.9612403100775193,
"grad_norm": 0.5998301701967552,
"learning_rate": 1.1050970919374016e-08,
"loss": 0.2997,
"mean_token_accuracy": 0.8889357475563884,
"num_tokens": 328890873.0,
"step": 764
},
{
"entropy": 0.378997802734375,
"epoch": 2.9651162790697674,
"grad_norm": 0.6177168341536942,
"learning_rate": 9.13332582901716e-09,
"loss": 0.3165,
"mean_token_accuracy": 0.8886342123150826,
"num_tokens": 329322746.0,
"step": 765
},
{
"entropy": 0.379241943359375,
"epoch": 2.9689922480620154,
"grad_norm": 0.592192707262181,
"learning_rate": 7.3982079293233314e-09,
"loss": 0.3056,
"mean_token_accuracy": 0.8878153394907713,
"num_tokens": 329742686.0,
"step": 766
},
{
"entropy": 0.3809814453125,
"epoch": 2.9728682170542635,
"grad_norm": 0.5800435523051897,
"learning_rate": 5.845648919863278e-09,
"loss": 0.3224,
"mean_token_accuracy": 0.8846305012702942,
"num_tokens": 330187335.0,
"step": 767
},
{
"entropy": 0.37921142578125,
"epoch": 2.9767441860465116,
"grad_norm": 0.6234929524950468,
"learning_rate": 4.475677164966774e-09,
"loss": 0.3253,
"mean_token_accuracy": 0.8846317222341895,
"num_tokens": 330621750.0,
"step": 768
},
{
"entropy": 0.383514404296875,
"epoch": 2.9806201550387597,
"grad_norm": 0.621643677788845,
"learning_rate": 3.2883176932019255e-09,
"loss": 0.3168,
"mean_token_accuracy": 0.8865220500156283,
"num_tokens": 331055767.0,
"step": 769
},
{
"entropy": 0.380706787109375,
"epoch": 2.9844961240310077,
"grad_norm": 0.6025789986792925,
"learning_rate": 2.2835921969210917e-09,
"loss": 0.2977,
"mean_token_accuracy": 0.890127319842577,
"num_tokens": 331478526.0,
"step": 770
},
{
"entropy": 0.38348388671875,
"epoch": 2.988372093023256,
"grad_norm": 0.621377310978937,
"learning_rate": 1.4615190318600924e-09,
"loss": 0.3107,
"mean_token_accuracy": 0.8878284217789769,
"num_tokens": 331903679.0,
"step": 771
},
{
"entropy": 0.382965087890625,
"epoch": 2.992248062015504,
"grad_norm": 0.6302553243798728,
"learning_rate": 8.221132168073631e-10,
"loss": 0.317,
"mean_token_accuracy": 0.8872353015467525,
"num_tokens": 332342987.0,
"step": 772
},
{
"entropy": 0.389373779296875,
"epoch": 2.996124031007752,
"grad_norm": 0.6125161547285307,
"learning_rate": 3.653864333275081e-10,
"loss": 0.3118,
"mean_token_accuracy": 0.888038388453424,
"num_tokens": 332763851.0,
"step": 773
},
{
"entropy": 0.37945556640625,
"epoch": 3.0,
"grad_norm": 0.600207904778677,
"learning_rate": 9.134702554591812e-11,
"loss": 0.3101,
"mean_token_accuracy": 0.8894096985459328,
"num_tokens": 333188605.0,
"step": 774
},
{
"epoch": 3.0,
"step": 774,
"total_flos": 616169609822208.0,
"train_loss": 0.4388826183339422,
"train_runtime": 59766.2651,
"train_samples_per_second": 1.24,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 774,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 65,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 616169609822208.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}