Files
eus-latn-10mb-10mb_seed3407/checkpoint-11630/trainer_state.json
ModelHub XC 213141ef9f 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/eus-latn-10mb-10mb_seed3407
Source: Original Platform
2026-06-28 05:31:17 +08:00

23548 lines
645 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.991834980661796,
"eval_steps": 500,
"global_step": 11630,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 10.742608070373535,
"epoch": 0.004297378599054577,
"grad_norm": 5.46875,
"learning_rate": 2e-06,
"loss": 10.7643,
"mean_token_accuracy": 7.587253348901868e-05,
"num_tokens": 10107.0,
"step": 5
},
{
"entropy": 10.742630290985108,
"epoch": 0.008594757198109154,
"grad_norm": 5.78125,
"learning_rate": 4.5e-06,
"loss": 10.7086,
"mean_token_accuracy": 0.0,
"num_tokens": 18391.0,
"step": 10
},
{
"entropy": 10.74263505935669,
"epoch": 0.01289213579716373,
"grad_norm": 5.3125,
"learning_rate": 7e-06,
"loss": 10.6888,
"mean_token_accuracy": 7.022471982054412e-05,
"num_tokens": 27061.0,
"step": 15
},
{
"entropy": 10.742604160308838,
"epoch": 0.017189514396218308,
"grad_norm": 6.0,
"learning_rate": 9.5e-06,
"loss": 10.6611,
"mean_token_accuracy": 0.0008422504703048617,
"num_tokens": 36339.0,
"step": 20
},
{
"entropy": 10.742517948150635,
"epoch": 0.021486892995272882,
"grad_norm": 4.75,
"learning_rate": 1.2e-05,
"loss": 10.5317,
"mean_token_accuracy": 0.02025789166800678,
"num_tokens": 45770.0,
"step": 25
},
{
"entropy": 10.741962242126466,
"epoch": 0.02578427159432746,
"grad_norm": 4.25,
"learning_rate": 1.4500000000000002e-05,
"loss": 10.399,
"mean_token_accuracy": 0.04876907132565975,
"num_tokens": 54575.0,
"step": 30
},
{
"entropy": 10.73945140838623,
"epoch": 0.030081650193382038,
"grad_norm": 3.15625,
"learning_rate": 1.7000000000000003e-05,
"loss": 10.3065,
"mean_token_accuracy": 0.0514072135090828,
"num_tokens": 66403.0,
"step": 35
},
{
"entropy": 10.730937385559082,
"epoch": 0.034379028792436615,
"grad_norm": 2.640625,
"learning_rate": 1.95e-05,
"loss": 10.0976,
"mean_token_accuracy": 0.05973539762198925,
"num_tokens": 76510.0,
"step": 40
},
{
"entropy": 10.715238952636719,
"epoch": 0.03867640739149119,
"grad_norm": 2.40625,
"learning_rate": 2.2e-05,
"loss": 9.9688,
"mean_token_accuracy": 0.05614017099142075,
"num_tokens": 84836.0,
"step": 45
},
{
"entropy": 10.702037715911866,
"epoch": 0.042973785990545764,
"grad_norm": 2.046875,
"learning_rate": 2.4500000000000003e-05,
"loss": 9.9015,
"mean_token_accuracy": 0.053829558193683624,
"num_tokens": 93197.0,
"step": 50
},
{
"entropy": 10.697910690307618,
"epoch": 0.047271164589600345,
"grad_norm": 2.40625,
"learning_rate": 2.7e-05,
"loss": 9.8366,
"mean_token_accuracy": 0.05843428298830986,
"num_tokens": 101546.0,
"step": 55
},
{
"entropy": 10.693470478057861,
"epoch": 0.05156854318865492,
"grad_norm": 1.9609375,
"learning_rate": 2.95e-05,
"loss": 9.8429,
"mean_token_accuracy": 0.0558084711432457,
"num_tokens": 111703.0,
"step": 60
},
{
"entropy": 10.680869865417481,
"epoch": 0.055865921787709494,
"grad_norm": 1.9453125,
"learning_rate": 3.2e-05,
"loss": 9.7131,
"mean_token_accuracy": 0.0589165486395359,
"num_tokens": 119894.0,
"step": 65
},
{
"entropy": 10.668927574157715,
"epoch": 0.060163300386764075,
"grad_norm": 1.9765625,
"learning_rate": 3.4500000000000005e-05,
"loss": 9.6682,
"mean_token_accuracy": 0.06148771904408932,
"num_tokens": 128885.0,
"step": 70
},
{
"entropy": 10.654484272003174,
"epoch": 0.06446067898581866,
"grad_norm": 1.953125,
"learning_rate": 3.7e-05,
"loss": 9.6297,
"mean_token_accuracy": 0.057728851959109304,
"num_tokens": 138106.0,
"step": 75
},
{
"entropy": 10.645826625823975,
"epoch": 0.06875805758487323,
"grad_norm": 1.9296875,
"learning_rate": 3.95e-05,
"loss": 9.5722,
"mean_token_accuracy": 0.058954347297549246,
"num_tokens": 146691.0,
"step": 80
},
{
"entropy": 10.637816619873046,
"epoch": 0.0730554361839278,
"grad_norm": 1.90625,
"learning_rate": 4.2000000000000004e-05,
"loss": 9.5126,
"mean_token_accuracy": 0.059067190065979956,
"num_tokens": 155792.0,
"step": 85
},
{
"entropy": 10.63103084564209,
"epoch": 0.07735281478298238,
"grad_norm": 1.7890625,
"learning_rate": 4.45e-05,
"loss": 9.5251,
"mean_token_accuracy": 0.0552229531109333,
"num_tokens": 166944.0,
"step": 90
},
{
"entropy": 10.616693305969239,
"epoch": 0.08165019338203695,
"grad_norm": 1.96875,
"learning_rate": 4.7000000000000004e-05,
"loss": 9.3423,
"mean_token_accuracy": 0.060124922543764114,
"num_tokens": 175303.0,
"step": 95
},
{
"entropy": 10.591300106048584,
"epoch": 0.08594757198109153,
"grad_norm": 1.8203125,
"learning_rate": 4.9500000000000004e-05,
"loss": 9.3133,
"mean_token_accuracy": 0.06174388714134693,
"num_tokens": 184708.0,
"step": 100
},
{
"entropy": 10.564336776733398,
"epoch": 0.09024495058014612,
"grad_norm": 1.7890625,
"learning_rate": 5.2e-05,
"loss": 9.2307,
"mean_token_accuracy": 0.0674959484487772,
"num_tokens": 193835.0,
"step": 105
},
{
"entropy": 10.52622423171997,
"epoch": 0.09454232917920069,
"grad_norm": 1.8828125,
"learning_rate": 5.45e-05,
"loss": 9.1379,
"mean_token_accuracy": 0.07480009235441684,
"num_tokens": 203344.0,
"step": 110
},
{
"entropy": 10.454349136352539,
"epoch": 0.09883970777825526,
"grad_norm": 1.6171875,
"learning_rate": 5.7e-05,
"loss": 9.1209,
"mean_token_accuracy": 0.06218625903129578,
"num_tokens": 213048.0,
"step": 115
},
{
"entropy": 10.415324211120605,
"epoch": 0.10313708637730984,
"grad_norm": 1.578125,
"learning_rate": 5.9499999999999996e-05,
"loss": 8.9306,
"mean_token_accuracy": 0.07533645890653133,
"num_tokens": 221784.0,
"step": 120
},
{
"entropy": 10.303644943237305,
"epoch": 0.10743446497636441,
"grad_norm": 1.4765625,
"learning_rate": 6.2e-05,
"loss": 8.8509,
"mean_token_accuracy": 0.07504003196954727,
"num_tokens": 230971.0,
"step": 125
},
{
"entropy": 10.209668159484863,
"epoch": 0.11173184357541899,
"grad_norm": 1.4296875,
"learning_rate": 6.450000000000001e-05,
"loss": 8.7412,
"mean_token_accuracy": 0.07478504739701748,
"num_tokens": 240524.0,
"step": 130
},
{
"entropy": 10.153745365142822,
"epoch": 0.11602922217447358,
"grad_norm": 1.3359375,
"learning_rate": 6.7e-05,
"loss": 8.6323,
"mean_token_accuracy": 0.07354197278618813,
"num_tokens": 249220.0,
"step": 135
},
{
"entropy": 10.068094253540039,
"epoch": 0.12032660077352815,
"grad_norm": 1.3125,
"learning_rate": 6.950000000000001e-05,
"loss": 8.61,
"mean_token_accuracy": 0.07049238979816437,
"num_tokens": 258934.0,
"step": 140
},
{
"entropy": 9.973960685729981,
"epoch": 0.12462397937258272,
"grad_norm": 1.2734375,
"learning_rate": 7.2e-05,
"loss": 8.4673,
"mean_token_accuracy": 0.07534252405166626,
"num_tokens": 267680.0,
"step": 145
},
{
"entropy": 9.815561103820801,
"epoch": 0.1289213579716373,
"grad_norm": 1.09375,
"learning_rate": 7.45e-05,
"loss": 8.3709,
"mean_token_accuracy": 0.07952065020799637,
"num_tokens": 276227.0,
"step": 150
},
{
"entropy": 9.66996259689331,
"epoch": 0.1332187365706919,
"grad_norm": 1.1875,
"learning_rate": 7.7e-05,
"loss": 8.2269,
"mean_token_accuracy": 0.08225171342492103,
"num_tokens": 286342.0,
"step": 155
},
{
"entropy": 9.510671615600586,
"epoch": 0.13751611516974646,
"grad_norm": 0.953125,
"learning_rate": 7.950000000000001e-05,
"loss": 8.1921,
"mean_token_accuracy": 0.0742720566689968,
"num_tokens": 294994.0,
"step": 160
},
{
"entropy": 9.346861934661865,
"epoch": 0.14181349376880104,
"grad_norm": 0.984375,
"learning_rate": 8.2e-05,
"loss": 8.113,
"mean_token_accuracy": 0.08004417940974236,
"num_tokens": 303882.0,
"step": 165
},
{
"entropy": 9.199288940429687,
"epoch": 0.1461108723678556,
"grad_norm": 0.9296875,
"learning_rate": 8.450000000000001e-05,
"loss": 8.0403,
"mean_token_accuracy": 0.07799897268414498,
"num_tokens": 312515.0,
"step": 170
},
{
"entropy": 8.978620052337646,
"epoch": 0.15040825096691018,
"grad_norm": 0.9375,
"learning_rate": 8.7e-05,
"loss": 7.9977,
"mean_token_accuracy": 0.07381256259977817,
"num_tokens": 320801.0,
"step": 175
},
{
"entropy": 8.861582374572754,
"epoch": 0.15470562956596476,
"grad_norm": 0.9765625,
"learning_rate": 8.95e-05,
"loss": 7.9642,
"mean_token_accuracy": 0.08192512467503547,
"num_tokens": 329382.0,
"step": 180
},
{
"entropy": 8.755144786834716,
"epoch": 0.15900300816501933,
"grad_norm": 0.9296875,
"learning_rate": 9.2e-05,
"loss": 7.9273,
"mean_token_accuracy": 0.07583913430571557,
"num_tokens": 337894.0,
"step": 185
},
{
"entropy": 8.582227611541748,
"epoch": 0.1633003867640739,
"grad_norm": 0.8984375,
"learning_rate": 9.45e-05,
"loss": 7.9012,
"mean_token_accuracy": 0.07614588961005211,
"num_tokens": 346380.0,
"step": 190
},
{
"entropy": 8.591823768615722,
"epoch": 0.16759776536312848,
"grad_norm": 0.9609375,
"learning_rate": 9.7e-05,
"loss": 7.9407,
"mean_token_accuracy": 0.07390806600451469,
"num_tokens": 356305.0,
"step": 195
},
{
"entropy": 8.515201950073243,
"epoch": 0.17189514396218306,
"grad_norm": 1.1328125,
"learning_rate": 9.95e-05,
"loss": 7.8901,
"mean_token_accuracy": 0.07247771993279457,
"num_tokens": 364899.0,
"step": 200
},
{
"entropy": 8.457213211059571,
"epoch": 0.17619252256123766,
"grad_norm": 0.93359375,
"learning_rate": 0.000102,
"loss": 7.8566,
"mean_token_accuracy": 0.0781160645186901,
"num_tokens": 373663.0,
"step": 205
},
{
"entropy": 8.381179523468017,
"epoch": 0.18048990116029223,
"grad_norm": 0.95703125,
"learning_rate": 0.00010449999999999999,
"loss": 7.8221,
"mean_token_accuracy": 0.07758632972836495,
"num_tokens": 382730.0,
"step": 210
},
{
"entropy": 8.390653896331788,
"epoch": 0.1847872797593468,
"grad_norm": 0.921875,
"learning_rate": 0.000107,
"loss": 7.8622,
"mean_token_accuracy": 0.071787304058671,
"num_tokens": 392676.0,
"step": 215
},
{
"entropy": 8.255177211761474,
"epoch": 0.18908465835840138,
"grad_norm": 1.1015625,
"learning_rate": 0.0001095,
"loss": 7.8473,
"mean_token_accuracy": 0.08185218423604965,
"num_tokens": 401050.0,
"step": 220
},
{
"entropy": 8.367721462249756,
"epoch": 0.19338203695745596,
"grad_norm": 0.796875,
"learning_rate": 0.000112,
"loss": 7.795,
"mean_token_accuracy": 0.07991239950060844,
"num_tokens": 410009.0,
"step": 225
},
{
"entropy": 8.268333339691162,
"epoch": 0.19767941555651053,
"grad_norm": 0.859375,
"learning_rate": 0.0001145,
"loss": 7.7757,
"mean_token_accuracy": 0.08171008005738259,
"num_tokens": 419302.0,
"step": 230
},
{
"entropy": 8.304029846191407,
"epoch": 0.2019767941555651,
"grad_norm": 0.984375,
"learning_rate": 0.00011700000000000001,
"loss": 7.6812,
"mean_token_accuracy": 0.08820762410759926,
"num_tokens": 427296.0,
"step": 235
},
{
"entropy": 8.16576337814331,
"epoch": 0.20627417275461968,
"grad_norm": 0.91796875,
"learning_rate": 0.00011949999999999999,
"loss": 7.8198,
"mean_token_accuracy": 0.07870872803032399,
"num_tokens": 436368.0,
"step": 240
},
{
"entropy": 8.189785575866699,
"epoch": 0.21057155135367425,
"grad_norm": 1.28125,
"learning_rate": 0.000122,
"loss": 7.7389,
"mean_token_accuracy": 0.08551637679338456,
"num_tokens": 445535.0,
"step": 245
},
{
"entropy": 8.265625381469727,
"epoch": 0.21486892995272883,
"grad_norm": 0.8671875,
"learning_rate": 0.0001245,
"loss": 7.7093,
"mean_token_accuracy": 0.07919453792273998,
"num_tokens": 454769.0,
"step": 250
},
{
"entropy": 8.1545090675354,
"epoch": 0.2191663085517834,
"grad_norm": 0.93359375,
"learning_rate": 0.000127,
"loss": 7.7315,
"mean_token_accuracy": 0.0871740497648716,
"num_tokens": 463975.0,
"step": 255
},
{
"entropy": 8.13952112197876,
"epoch": 0.22346368715083798,
"grad_norm": 0.88671875,
"learning_rate": 0.0001295,
"loss": 7.726,
"mean_token_accuracy": 0.08799278363585472,
"num_tokens": 472899.0,
"step": 260
},
{
"entropy": 8.196070003509522,
"epoch": 0.22776106574989258,
"grad_norm": 0.93359375,
"learning_rate": 0.000132,
"loss": 7.7354,
"mean_token_accuracy": 0.08013860881328583,
"num_tokens": 481556.0,
"step": 265
},
{
"entropy": 8.114658737182618,
"epoch": 0.23205844434894715,
"grad_norm": 0.91015625,
"learning_rate": 0.00013450000000000002,
"loss": 7.7023,
"mean_token_accuracy": 0.0854449674487114,
"num_tokens": 490253.0,
"step": 270
},
{
"entropy": 8.193334579467773,
"epoch": 0.23635582294800173,
"grad_norm": 1.09375,
"learning_rate": 0.00013700000000000002,
"loss": 7.7066,
"mean_token_accuracy": 0.0806311085820198,
"num_tokens": 498444.0,
"step": 275
},
{
"entropy": 8.104936504364014,
"epoch": 0.2406532015470563,
"grad_norm": 0.8046875,
"learning_rate": 0.0001395,
"loss": 7.6467,
"mean_token_accuracy": 0.08675235286355018,
"num_tokens": 508330.0,
"step": 280
},
{
"entropy": 8.113396596908569,
"epoch": 0.24495058014611087,
"grad_norm": 1.015625,
"learning_rate": 0.00014199999999999998,
"loss": 7.7405,
"mean_token_accuracy": 0.08165572881698609,
"num_tokens": 517900.0,
"step": 285
},
{
"entropy": 8.046846723556518,
"epoch": 0.24924795874516545,
"grad_norm": 0.93359375,
"learning_rate": 0.0001445,
"loss": 7.6901,
"mean_token_accuracy": 0.08230286985635757,
"num_tokens": 527808.0,
"step": 290
},
{
"entropy": 8.13338761329651,
"epoch": 0.25354533734422,
"grad_norm": 0.8984375,
"learning_rate": 0.000147,
"loss": 7.6711,
"mean_token_accuracy": 0.08156475871801376,
"num_tokens": 536931.0,
"step": 295
},
{
"entropy": 8.18837013244629,
"epoch": 0.2578427159432746,
"grad_norm": 1.1875,
"learning_rate": 0.0001495,
"loss": 7.7049,
"mean_token_accuracy": 0.0835341140627861,
"num_tokens": 545758.0,
"step": 300
},
{
"entropy": 8.025089168548584,
"epoch": 0.26214009454232917,
"grad_norm": 0.9921875,
"learning_rate": 0.000152,
"loss": 7.7131,
"mean_token_accuracy": 0.08242038711905479,
"num_tokens": 555165.0,
"step": 305
},
{
"entropy": 8.155539417266846,
"epoch": 0.2664374731413838,
"grad_norm": 0.86328125,
"learning_rate": 0.00015450000000000001,
"loss": 7.6144,
"mean_token_accuracy": 0.08789716809988021,
"num_tokens": 564719.0,
"step": 310
},
{
"entropy": 8.041153383255004,
"epoch": 0.2707348517404383,
"grad_norm": 1.0,
"learning_rate": 0.000157,
"loss": 7.594,
"mean_token_accuracy": 0.09155945181846618,
"num_tokens": 573572.0,
"step": 315
},
{
"entropy": 8.15259666442871,
"epoch": 0.2750322303394929,
"grad_norm": 1.0859375,
"learning_rate": 0.0001595,
"loss": 7.7634,
"mean_token_accuracy": 0.08318910300731659,
"num_tokens": 581497.0,
"step": 320
},
{
"entropy": 8.100253248214722,
"epoch": 0.27932960893854747,
"grad_norm": 1.125,
"learning_rate": 0.000162,
"loss": 7.6118,
"mean_token_accuracy": 0.08767011985182763,
"num_tokens": 591107.0,
"step": 325
},
{
"entropy": 7.984478855133057,
"epoch": 0.28362698753760207,
"grad_norm": 0.84765625,
"learning_rate": 0.00016450000000000001,
"loss": 7.6456,
"mean_token_accuracy": 0.08353794142603874,
"num_tokens": 600241.0,
"step": 330
},
{
"entropy": 8.057686376571656,
"epoch": 0.2879243661366566,
"grad_norm": 0.91796875,
"learning_rate": 0.00016700000000000002,
"loss": 7.5776,
"mean_token_accuracy": 0.08751234114170074,
"num_tokens": 608697.0,
"step": 335
},
{
"entropy": 8.016141748428344,
"epoch": 0.2922217447357112,
"grad_norm": 0.9453125,
"learning_rate": 0.00016950000000000003,
"loss": 7.568,
"mean_token_accuracy": 0.09023259431123734,
"num_tokens": 617275.0,
"step": 340
},
{
"entropy": 8.084819841384888,
"epoch": 0.29651912333476577,
"grad_norm": 0.8984375,
"learning_rate": 0.00017199999999999998,
"loss": 7.6405,
"mean_token_accuracy": 0.08630914464592934,
"num_tokens": 626644.0,
"step": 345
},
{
"entropy": 8.008595705032349,
"epoch": 0.30081650193382037,
"grad_norm": 0.98828125,
"learning_rate": 0.00017449999999999999,
"loss": 7.5665,
"mean_token_accuracy": 0.08766811862587928,
"num_tokens": 635110.0,
"step": 350
},
{
"entropy": 8.04712610244751,
"epoch": 0.30511388053287497,
"grad_norm": 0.87109375,
"learning_rate": 0.000177,
"loss": 7.7031,
"mean_token_accuracy": 0.08570141717791557,
"num_tokens": 644746.0,
"step": 355
},
{
"entropy": 8.179811954498291,
"epoch": 0.3094112591319295,
"grad_norm": 1.1015625,
"learning_rate": 0.0001795,
"loss": 7.5831,
"mean_token_accuracy": 0.08595824986696243,
"num_tokens": 654281.0,
"step": 360
},
{
"entropy": 7.987443113327027,
"epoch": 0.3137086377309841,
"grad_norm": 1.203125,
"learning_rate": 0.000182,
"loss": 7.585,
"mean_token_accuracy": 0.09283285215497017,
"num_tokens": 663174.0,
"step": 365
},
{
"entropy": 7.916810417175293,
"epoch": 0.31800601633003867,
"grad_norm": 0.90625,
"learning_rate": 0.0001845,
"loss": 7.511,
"mean_token_accuracy": 0.08863886222243308,
"num_tokens": 672178.0,
"step": 370
},
{
"entropy": 8.005489206314087,
"epoch": 0.32230339492909327,
"grad_norm": 0.96484375,
"learning_rate": 0.000187,
"loss": 7.5218,
"mean_token_accuracy": 0.09131815880537034,
"num_tokens": 681323.0,
"step": 375
},
{
"entropy": 7.9803643226623535,
"epoch": 0.3266007735281478,
"grad_norm": 0.890625,
"learning_rate": 0.0001895,
"loss": 7.4406,
"mean_token_accuracy": 0.08985799476504326,
"num_tokens": 690461.0,
"step": 380
},
{
"entropy": 7.829833698272705,
"epoch": 0.3308981521272024,
"grad_norm": 1.046875,
"learning_rate": 0.000192,
"loss": 7.5004,
"mean_token_accuracy": 0.08490158319473266,
"num_tokens": 699199.0,
"step": 385
},
{
"entropy": 8.038139152526856,
"epoch": 0.33519553072625696,
"grad_norm": 1.1484375,
"learning_rate": 0.0001945,
"loss": 7.4484,
"mean_token_accuracy": 0.09670188426971435,
"num_tokens": 707949.0,
"step": 390
},
{
"entropy": 7.9735198497772215,
"epoch": 0.33949290932531156,
"grad_norm": 1.203125,
"learning_rate": 0.00019700000000000002,
"loss": 7.5219,
"mean_token_accuracy": 0.08999367579817771,
"num_tokens": 715752.0,
"step": 395
},
{
"entropy": 7.93391604423523,
"epoch": 0.3437902879243661,
"grad_norm": 1.1171875,
"learning_rate": 0.00019950000000000002,
"loss": 7.4479,
"mean_token_accuracy": 0.0979436494410038,
"num_tokens": 724416.0,
"step": 400
},
{
"entropy": 7.925309085845948,
"epoch": 0.3480876665234207,
"grad_norm": 1.0546875,
"learning_rate": 0.000202,
"loss": 7.4953,
"mean_token_accuracy": 0.09031900316476822,
"num_tokens": 733116.0,
"step": 405
},
{
"entropy": 7.916099977493286,
"epoch": 0.3523850451224753,
"grad_norm": 1.0625,
"learning_rate": 0.00020449999999999998,
"loss": 7.4726,
"mean_token_accuracy": 0.09227924942970275,
"num_tokens": 742093.0,
"step": 410
},
{
"entropy": 7.918701934814453,
"epoch": 0.35668242372152986,
"grad_norm": 1.046875,
"learning_rate": 0.000207,
"loss": 7.4649,
"mean_token_accuracy": 0.09618089124560356,
"num_tokens": 750402.0,
"step": 415
},
{
"entropy": 7.816703271865845,
"epoch": 0.36097980232058446,
"grad_norm": 0.9140625,
"learning_rate": 0.0002095,
"loss": 7.4336,
"mean_token_accuracy": 0.09461462944746017,
"num_tokens": 760961.0,
"step": 420
},
{
"entropy": 7.944287586212158,
"epoch": 0.365277180919639,
"grad_norm": 1.0390625,
"learning_rate": 0.000212,
"loss": 7.4865,
"mean_token_accuracy": 0.09455274268984795,
"num_tokens": 770554.0,
"step": 425
},
{
"entropy": 7.750526332855225,
"epoch": 0.3695745595186936,
"grad_norm": 1.03125,
"learning_rate": 0.0002145,
"loss": 7.4618,
"mean_token_accuracy": 0.09681151732802391,
"num_tokens": 779172.0,
"step": 430
},
{
"entropy": 7.9787256717681885,
"epoch": 0.37387193811774816,
"grad_norm": 0.984375,
"learning_rate": 0.00021700000000000002,
"loss": 7.5123,
"mean_token_accuracy": 0.08840151131153107,
"num_tokens": 788040.0,
"step": 435
},
{
"entropy": 7.883750295639038,
"epoch": 0.37816931671680276,
"grad_norm": 1.109375,
"learning_rate": 0.0002195,
"loss": 7.4135,
"mean_token_accuracy": 0.0939902700483799,
"num_tokens": 796786.0,
"step": 440
},
{
"entropy": 7.851776885986328,
"epoch": 0.3824666953158573,
"grad_norm": 1.09375,
"learning_rate": 0.000222,
"loss": 7.4233,
"mean_token_accuracy": 0.0923767201602459,
"num_tokens": 805520.0,
"step": 445
},
{
"entropy": 7.805376100540161,
"epoch": 0.3867640739149119,
"grad_norm": 1.1484375,
"learning_rate": 0.0002245,
"loss": 7.3508,
"mean_token_accuracy": 0.09647825658321381,
"num_tokens": 814939.0,
"step": 450
},
{
"entropy": 7.874559307098389,
"epoch": 0.39106145251396646,
"grad_norm": 1.2265625,
"learning_rate": 0.00022700000000000002,
"loss": 7.3531,
"mean_token_accuracy": 0.09795481041073799,
"num_tokens": 823862.0,
"step": 455
},
{
"entropy": 7.7626677513122555,
"epoch": 0.39535883111302106,
"grad_norm": 1.1328125,
"learning_rate": 0.00022950000000000002,
"loss": 7.3918,
"mean_token_accuracy": 0.09068166017532349,
"num_tokens": 832820.0,
"step": 460
},
{
"entropy": 7.928297901153565,
"epoch": 0.39965620971207566,
"grad_norm": 1.1171875,
"learning_rate": 0.00023200000000000003,
"loss": 7.3494,
"mean_token_accuracy": 0.09501236006617546,
"num_tokens": 841538.0,
"step": 465
},
{
"entropy": 7.7496504306793215,
"epoch": 0.4039535883111302,
"grad_norm": 0.99609375,
"learning_rate": 0.00023449999999999998,
"loss": 7.4626,
"mean_token_accuracy": 0.09104103595018387,
"num_tokens": 851123.0,
"step": 470
},
{
"entropy": 7.8953351974487305,
"epoch": 0.4082509669101848,
"grad_norm": 1.125,
"learning_rate": 0.000237,
"loss": 7.4266,
"mean_token_accuracy": 0.09596899375319481,
"num_tokens": 860357.0,
"step": 475
},
{
"entropy": 7.76341495513916,
"epoch": 0.41254834550923936,
"grad_norm": 1.0703125,
"learning_rate": 0.0002395,
"loss": 7.3425,
"mean_token_accuracy": 0.09861095696687698,
"num_tokens": 869980.0,
"step": 480
},
{
"entropy": 7.82184157371521,
"epoch": 0.41684572410829396,
"grad_norm": 1.03125,
"learning_rate": 0.000242,
"loss": 7.2999,
"mean_token_accuracy": 0.10065284445881843,
"num_tokens": 878250.0,
"step": 485
},
{
"entropy": 7.76347074508667,
"epoch": 0.4211431027073485,
"grad_norm": 1.25,
"learning_rate": 0.0002445,
"loss": 7.4007,
"mean_token_accuracy": 0.095355936139822,
"num_tokens": 887624.0,
"step": 490
},
{
"entropy": 7.753844261169434,
"epoch": 0.4254404813064031,
"grad_norm": 1.1484375,
"learning_rate": 0.000247,
"loss": 7.3568,
"mean_token_accuracy": 0.09853926301002502,
"num_tokens": 897120.0,
"step": 495
},
{
"entropy": 7.802051830291748,
"epoch": 0.42973785990545765,
"grad_norm": 1.03125,
"learning_rate": 0.0002495,
"loss": 7.3179,
"mean_token_accuracy": 0.10127250477671623,
"num_tokens": 906215.0,
"step": 500
},
{
"epoch": 0.42973785990545765,
"eval_entropy": 7.412716417699246,
"eval_loss": 7.3790483474731445,
"eval_mean_token_accuracy": 0.09986981684929347,
"eval_num_tokens": 906215.0,
"eval_runtime": 2.0966,
"eval_samples_per_second": 1692.736,
"eval_steps_per_second": 211.771,
"step": 500
},
{
"entropy": 7.651102495193482,
"epoch": 0.43403523850451226,
"grad_norm": 1.09375,
"learning_rate": 0.000252,
"loss": 7.3112,
"mean_token_accuracy": 0.10008608102798462,
"num_tokens": 915181.0,
"step": 505
},
{
"entropy": 7.728409194946289,
"epoch": 0.4383326171035668,
"grad_norm": 1.0703125,
"learning_rate": 0.0002545,
"loss": 7.3388,
"mean_token_accuracy": 0.09651862978935241,
"num_tokens": 924377.0,
"step": 510
},
{
"entropy": 7.770003318786621,
"epoch": 0.4426299957026214,
"grad_norm": 0.984375,
"learning_rate": 0.000257,
"loss": 7.4098,
"mean_token_accuracy": 0.09438847750425339,
"num_tokens": 933114.0,
"step": 515
},
{
"entropy": 7.86782751083374,
"epoch": 0.44692737430167595,
"grad_norm": 0.9375,
"learning_rate": 0.0002595,
"loss": 7.3692,
"mean_token_accuracy": 0.09444344118237495,
"num_tokens": 943306.0,
"step": 520
},
{
"entropy": 7.659075498580933,
"epoch": 0.45122475290073055,
"grad_norm": 1.1875,
"learning_rate": 0.000262,
"loss": 7.2626,
"mean_token_accuracy": 0.10587219074368477,
"num_tokens": 951515.0,
"step": 525
},
{
"entropy": 7.713227224349976,
"epoch": 0.45552213149978515,
"grad_norm": 1.015625,
"learning_rate": 0.00026450000000000003,
"loss": 7.3711,
"mean_token_accuracy": 0.09387057200074196,
"num_tokens": 962686.0,
"step": 530
},
{
"entropy": 7.780395078659057,
"epoch": 0.4598195100988397,
"grad_norm": 1.09375,
"learning_rate": 0.00026700000000000004,
"loss": 7.3777,
"mean_token_accuracy": 0.10021266266703606,
"num_tokens": 972136.0,
"step": 535
},
{
"entropy": 7.657458114624023,
"epoch": 0.4641168886978943,
"grad_norm": 1.09375,
"learning_rate": 0.00026950000000000005,
"loss": 7.2696,
"mean_token_accuracy": 0.10345774069428444,
"num_tokens": 981301.0,
"step": 540
},
{
"entropy": 7.700049114227295,
"epoch": 0.46841426729694885,
"grad_norm": 1.1484375,
"learning_rate": 0.00027200000000000005,
"loss": 7.2923,
"mean_token_accuracy": 0.10189392492175102,
"num_tokens": 990360.0,
"step": 545
},
{
"entropy": 7.770557546615601,
"epoch": 0.47271164589600345,
"grad_norm": 1.0859375,
"learning_rate": 0.0002745,
"loss": 7.3438,
"mean_token_accuracy": 0.09953725263476372,
"num_tokens": 999415.0,
"step": 550
},
{
"entropy": 7.656623125076294,
"epoch": 0.477009024495058,
"grad_norm": 1.0625,
"learning_rate": 0.000277,
"loss": 7.2635,
"mean_token_accuracy": 0.10239741951227188,
"num_tokens": 1008762.0,
"step": 555
},
{
"entropy": 7.690563821792603,
"epoch": 0.4813064030941126,
"grad_norm": 1.171875,
"learning_rate": 0.0002795,
"loss": 7.2652,
"mean_token_accuracy": 0.10631422251462937,
"num_tokens": 1017704.0,
"step": 560
},
{
"entropy": 7.641897583007813,
"epoch": 0.48560378169316715,
"grad_norm": 1.1640625,
"learning_rate": 0.00028199999999999997,
"loss": 7.2341,
"mean_token_accuracy": 0.10428761765360832,
"num_tokens": 1026251.0,
"step": 565
},
{
"entropy": 7.641419315338135,
"epoch": 0.48990116029222175,
"grad_norm": 1.03125,
"learning_rate": 0.0002845,
"loss": 7.2158,
"mean_token_accuracy": 0.10731100514531136,
"num_tokens": 1036191.0,
"step": 570
},
{
"entropy": 7.658735990524292,
"epoch": 0.4941985388912763,
"grad_norm": 1.0859375,
"learning_rate": 0.000287,
"loss": 7.2462,
"mean_token_accuracy": 0.10594421103596688,
"num_tokens": 1044936.0,
"step": 575
},
{
"entropy": 7.621677112579346,
"epoch": 0.4984959174903309,
"grad_norm": 1.1171875,
"learning_rate": 0.0002895,
"loss": 7.2472,
"mean_token_accuracy": 0.10367096737027168,
"num_tokens": 1053683.0,
"step": 580
},
{
"entropy": 7.570435047149658,
"epoch": 0.5027932960893855,
"grad_norm": 1.046875,
"learning_rate": 0.000292,
"loss": 7.2271,
"mean_token_accuracy": 0.1076263040304184,
"num_tokens": 1062932.0,
"step": 585
},
{
"entropy": 7.723283386230468,
"epoch": 0.50709067468844,
"grad_norm": 0.98828125,
"learning_rate": 0.0002945,
"loss": 7.2544,
"mean_token_accuracy": 0.10264097228646278,
"num_tokens": 1072313.0,
"step": 590
},
{
"entropy": 7.62511043548584,
"epoch": 0.5113880532874946,
"grad_norm": 1.171875,
"learning_rate": 0.000297,
"loss": 7.2228,
"mean_token_accuracy": 0.09801378548145294,
"num_tokens": 1081675.0,
"step": 595
},
{
"entropy": 7.608328151702881,
"epoch": 0.5156854318865493,
"grad_norm": 1.0703125,
"learning_rate": 0.0002995,
"loss": 7.2433,
"mean_token_accuracy": 0.10141062065958976,
"num_tokens": 1091541.0,
"step": 600
},
{
"entropy": 7.695394897460938,
"epoch": 0.5199828104856038,
"grad_norm": 1.015625,
"learning_rate": 0.000302,
"loss": 7.2462,
"mean_token_accuracy": 0.10475782826542854,
"num_tokens": 1100724.0,
"step": 605
},
{
"entropy": 7.50453405380249,
"epoch": 0.5242801890846583,
"grad_norm": 1.0546875,
"learning_rate": 0.0003045,
"loss": 7.1924,
"mean_token_accuracy": 0.1077597513794899,
"num_tokens": 1108869.0,
"step": 610
},
{
"entropy": 7.644835519790649,
"epoch": 0.5285775676837129,
"grad_norm": 1.1015625,
"learning_rate": 0.000307,
"loss": 7.2261,
"mean_token_accuracy": 0.10431057810783387,
"num_tokens": 1117314.0,
"step": 615
},
{
"entropy": 7.488267469406128,
"epoch": 0.5328749462827675,
"grad_norm": 1.109375,
"learning_rate": 0.0003095,
"loss": 7.148,
"mean_token_accuracy": 0.10711429193615914,
"num_tokens": 1126786.0,
"step": 620
},
{
"entropy": 7.577956056594848,
"epoch": 0.5371723248818221,
"grad_norm": 1.3046875,
"learning_rate": 0.000312,
"loss": 7.1645,
"mean_token_accuracy": 0.10579404905438423,
"num_tokens": 1136013.0,
"step": 625
},
{
"entropy": 7.527575206756592,
"epoch": 0.5414697034808766,
"grad_norm": 1.109375,
"learning_rate": 0.0003145,
"loss": 7.1969,
"mean_token_accuracy": 0.10749110653996467,
"num_tokens": 1144970.0,
"step": 630
},
{
"entropy": 7.613465976715088,
"epoch": 0.5457670820799312,
"grad_norm": 1.2578125,
"learning_rate": 0.000317,
"loss": 7.1614,
"mean_token_accuracy": 0.11203600242733955,
"num_tokens": 1153810.0,
"step": 635
},
{
"entropy": 7.521342611312866,
"epoch": 0.5500644606789858,
"grad_norm": 1.0546875,
"learning_rate": 0.0003195,
"loss": 7.1408,
"mean_token_accuracy": 0.10991051346063614,
"num_tokens": 1162498.0,
"step": 640
},
{
"entropy": 7.5313867092132565,
"epoch": 0.5543618392780404,
"grad_norm": 1.0546875,
"learning_rate": 0.000322,
"loss": 7.2164,
"mean_token_accuracy": 0.1044546626508236,
"num_tokens": 1172091.0,
"step": 645
},
{
"entropy": 7.653256607055664,
"epoch": 0.5586592178770949,
"grad_norm": 1.1015625,
"learning_rate": 0.00032450000000000003,
"loss": 7.1977,
"mean_token_accuracy": 0.10631284043192864,
"num_tokens": 1181400.0,
"step": 650
},
{
"entropy": 7.537307643890381,
"epoch": 0.5629565964761496,
"grad_norm": 1.2890625,
"learning_rate": 0.00032700000000000003,
"loss": 7.1721,
"mean_token_accuracy": 0.11125476211309433,
"num_tokens": 1189780.0,
"step": 655
},
{
"entropy": 7.477937269210815,
"epoch": 0.5672539750752041,
"grad_norm": 1.1875,
"learning_rate": 0.00032950000000000004,
"loss": 7.1315,
"mean_token_accuracy": 0.1057468131184578,
"num_tokens": 1198671.0,
"step": 660
},
{
"entropy": 7.589753818511963,
"epoch": 0.5715513536742587,
"grad_norm": 1.09375,
"learning_rate": 0.00033200000000000005,
"loss": 7.1652,
"mean_token_accuracy": 0.1051194004714489,
"num_tokens": 1207173.0,
"step": 665
},
{
"entropy": 7.461796855926513,
"epoch": 0.5758487322733132,
"grad_norm": 1.21875,
"learning_rate": 0.00033450000000000005,
"loss": 7.0998,
"mean_token_accuracy": 0.11046240702271462,
"num_tokens": 1216387.0,
"step": 670
},
{
"entropy": 7.622633552551269,
"epoch": 0.5801461108723679,
"grad_norm": 1.0234375,
"learning_rate": 0.000337,
"loss": 7.0722,
"mean_token_accuracy": 0.11004948541522026,
"num_tokens": 1224461.0,
"step": 675
},
{
"entropy": 7.451505851745606,
"epoch": 0.5844434894714224,
"grad_norm": 1.1796875,
"learning_rate": 0.0003395,
"loss": 7.1414,
"mean_token_accuracy": 0.11011224165558815,
"num_tokens": 1233774.0,
"step": 680
},
{
"entropy": 7.457524538040161,
"epoch": 0.588740868070477,
"grad_norm": 1.2109375,
"learning_rate": 0.000342,
"loss": 7.0938,
"mean_token_accuracy": 0.1142980344593525,
"num_tokens": 1242812.0,
"step": 685
},
{
"entropy": 7.605640840530396,
"epoch": 0.5930382466695315,
"grad_norm": 1.03125,
"learning_rate": 0.00034449999999999997,
"loss": 7.191,
"mean_token_accuracy": 0.11035142987966537,
"num_tokens": 1252872.0,
"step": 690
},
{
"entropy": 7.307473850250244,
"epoch": 0.5973356252685862,
"grad_norm": 1.1796875,
"learning_rate": 0.000347,
"loss": 6.983,
"mean_token_accuracy": 0.11081922426819801,
"num_tokens": 1260852.0,
"step": 695
},
{
"entropy": 7.438599157333374,
"epoch": 0.6016330038676407,
"grad_norm": 1.2578125,
"learning_rate": 0.0003495,
"loss": 7.0984,
"mean_token_accuracy": 0.10763570070266723,
"num_tokens": 1268925.0,
"step": 700
},
{
"entropy": 7.530004072189331,
"epoch": 0.6059303824666953,
"grad_norm": 1.109375,
"learning_rate": 0.000352,
"loss": 7.145,
"mean_token_accuracy": 0.10653513446450233,
"num_tokens": 1278994.0,
"step": 705
},
{
"entropy": 7.4260091304779055,
"epoch": 0.6102277610657499,
"grad_norm": 1.1640625,
"learning_rate": 0.0003545,
"loss": 7.1323,
"mean_token_accuracy": 0.10368426591157913,
"num_tokens": 1287698.0,
"step": 710
},
{
"entropy": 7.482218551635742,
"epoch": 0.6145251396648045,
"grad_norm": 1.0546875,
"learning_rate": 0.000357,
"loss": 7.0787,
"mean_token_accuracy": 0.11120296269655228,
"num_tokens": 1297475.0,
"step": 715
},
{
"entropy": 7.480340671539307,
"epoch": 0.618822518263859,
"grad_norm": 1.1328125,
"learning_rate": 0.0003595,
"loss": 7.1091,
"mean_token_accuracy": 0.11085583940148354,
"num_tokens": 1306836.0,
"step": 720
},
{
"entropy": 7.506947946548462,
"epoch": 0.6231198968629136,
"grad_norm": 1.03125,
"learning_rate": 0.000362,
"loss": 7.1377,
"mean_token_accuracy": 0.10435779988765717,
"num_tokens": 1315872.0,
"step": 725
},
{
"entropy": 7.4788847923278805,
"epoch": 0.6274172754619682,
"grad_norm": 1.1796875,
"learning_rate": 0.0003645,
"loss": 7.0782,
"mean_token_accuracy": 0.11685637310147286,
"num_tokens": 1324624.0,
"step": 730
},
{
"entropy": 7.444537830352783,
"epoch": 0.6317146540610228,
"grad_norm": 1.15625,
"learning_rate": 0.000367,
"loss": 7.061,
"mean_token_accuracy": 0.11548577472567559,
"num_tokens": 1333058.0,
"step": 735
},
{
"entropy": 7.262284660339356,
"epoch": 0.6360120326600773,
"grad_norm": 1.078125,
"learning_rate": 0.0003695,
"loss": 7.0248,
"mean_token_accuracy": 0.11004846841096878,
"num_tokens": 1342376.0,
"step": 740
},
{
"entropy": 7.526681852340698,
"epoch": 0.6403094112591319,
"grad_norm": 1.1484375,
"learning_rate": 0.000372,
"loss": 7.0693,
"mean_token_accuracy": 0.10503109246492386,
"num_tokens": 1351386.0,
"step": 745
},
{
"entropy": 7.364239978790283,
"epoch": 0.6446067898581865,
"grad_norm": 1.265625,
"learning_rate": 0.0003745,
"loss": 6.9832,
"mean_token_accuracy": 0.11761592403054237,
"num_tokens": 1358958.0,
"step": 750
},
{
"entropy": 7.496349859237671,
"epoch": 0.6489041684572411,
"grad_norm": 1.109375,
"learning_rate": 0.000377,
"loss": 7.1231,
"mean_token_accuracy": 0.10967899858951569,
"num_tokens": 1368599.0,
"step": 755
},
{
"entropy": 7.435608530044556,
"epoch": 0.6532015470562956,
"grad_norm": 1.890625,
"learning_rate": 0.0003795,
"loss": 7.1433,
"mean_token_accuracy": 0.1064300425350666,
"num_tokens": 1378529.0,
"step": 760
},
{
"entropy": 7.344243001937866,
"epoch": 0.6574989256553503,
"grad_norm": 1.25,
"learning_rate": 0.000382,
"loss": 6.9306,
"mean_token_accuracy": 0.11750481277704239,
"num_tokens": 1386993.0,
"step": 765
},
{
"entropy": 7.390715217590332,
"epoch": 0.6617963042544048,
"grad_norm": 1.5,
"learning_rate": 0.0003845,
"loss": 7.0322,
"mean_token_accuracy": 0.11829963177442551,
"num_tokens": 1395790.0,
"step": 770
},
{
"entropy": 7.302670812606811,
"epoch": 0.6660936828534594,
"grad_norm": 1.078125,
"learning_rate": 0.00038700000000000003,
"loss": 7.0393,
"mean_token_accuracy": 0.11235549300909042,
"num_tokens": 1405587.0,
"step": 775
},
{
"entropy": 7.348860168457032,
"epoch": 0.6703910614525139,
"grad_norm": 1.0390625,
"learning_rate": 0.00038950000000000003,
"loss": 6.9999,
"mean_token_accuracy": 0.11504087448120118,
"num_tokens": 1414478.0,
"step": 780
},
{
"entropy": 7.428205347061157,
"epoch": 0.6746884400515686,
"grad_norm": 1.375,
"learning_rate": 0.00039200000000000004,
"loss": 7.0623,
"mean_token_accuracy": 0.11534775421023369,
"num_tokens": 1423791.0,
"step": 785
},
{
"entropy": 7.467832851409912,
"epoch": 0.6789858186506231,
"grad_norm": 1.234375,
"learning_rate": 0.00039450000000000005,
"loss": 7.1014,
"mean_token_accuracy": 0.10728210881352425,
"num_tokens": 1432955.0,
"step": 790
},
{
"entropy": 7.385548782348633,
"epoch": 0.6832831972496777,
"grad_norm": 0.99609375,
"learning_rate": 0.00039700000000000005,
"loss": 7.074,
"mean_token_accuracy": 0.1087567687034607,
"num_tokens": 1441907.0,
"step": 795
},
{
"entropy": 7.290066146850586,
"epoch": 0.6875805758487322,
"grad_norm": 1.203125,
"learning_rate": 0.0003995,
"loss": 6.935,
"mean_token_accuracy": 0.11768098697066307,
"num_tokens": 1451062.0,
"step": 800
},
{
"entropy": 7.399672508239746,
"epoch": 0.6918779544477869,
"grad_norm": 1.0234375,
"learning_rate": 0.000402,
"loss": 7.0218,
"mean_token_accuracy": 0.10959179401397705,
"num_tokens": 1460132.0,
"step": 805
},
{
"entropy": 7.272280263900757,
"epoch": 0.6961753330468414,
"grad_norm": 1.0625,
"learning_rate": 0.0004045,
"loss": 6.9141,
"mean_token_accuracy": 0.11885375007987023,
"num_tokens": 1469582.0,
"step": 810
},
{
"entropy": 7.255832242965698,
"epoch": 0.700472711645896,
"grad_norm": 1.3515625,
"learning_rate": 0.00040699999999999997,
"loss": 7.012,
"mean_token_accuracy": 0.10950389429926873,
"num_tokens": 1479053.0,
"step": 815
},
{
"entropy": 7.313858604431152,
"epoch": 0.7047700902449506,
"grad_norm": 1.21875,
"learning_rate": 0.0004095,
"loss": 7.0142,
"mean_token_accuracy": 0.11343196108937263,
"num_tokens": 1488189.0,
"step": 820
},
{
"entropy": 7.236453676223755,
"epoch": 0.7090674688440052,
"grad_norm": 1.046875,
"learning_rate": 0.000412,
"loss": 6.8662,
"mean_token_accuracy": 0.12046442031860352,
"num_tokens": 1497324.0,
"step": 825
},
{
"entropy": 7.310264635086059,
"epoch": 0.7133648474430597,
"grad_norm": 1.015625,
"learning_rate": 0.0004145,
"loss": 6.9814,
"mean_token_accuracy": 0.11739002540707588,
"num_tokens": 1506543.0,
"step": 830
},
{
"entropy": 7.289929437637329,
"epoch": 0.7176622260421143,
"grad_norm": 1.109375,
"learning_rate": 0.000417,
"loss": 6.9742,
"mean_token_accuracy": 0.12236066460609436,
"num_tokens": 1516737.0,
"step": 835
},
{
"entropy": 7.161224508285523,
"epoch": 0.7219596046411689,
"grad_norm": 1.046875,
"learning_rate": 0.0004195,
"loss": 6.8503,
"mean_token_accuracy": 0.11500222384929656,
"num_tokens": 1525561.0,
"step": 840
},
{
"entropy": 7.280500030517578,
"epoch": 0.7262569832402235,
"grad_norm": 1.1328125,
"learning_rate": 0.000422,
"loss": 6.8765,
"mean_token_accuracy": 0.1242159940302372,
"num_tokens": 1533323.0,
"step": 845
},
{
"entropy": 7.292038059234619,
"epoch": 0.730554361839278,
"grad_norm": 1.1875,
"learning_rate": 0.0004245,
"loss": 6.9379,
"mean_token_accuracy": 0.12142991349101066,
"num_tokens": 1542632.0,
"step": 850
},
{
"entropy": 7.305912923812866,
"epoch": 0.7348517404383326,
"grad_norm": 1.265625,
"learning_rate": 0.000427,
"loss": 6.8775,
"mean_token_accuracy": 0.12107516825199127,
"num_tokens": 1551236.0,
"step": 855
},
{
"entropy": 7.118098545074463,
"epoch": 0.7391491190373872,
"grad_norm": 1.15625,
"learning_rate": 0.0004295,
"loss": 6.878,
"mean_token_accuracy": 0.12266490310430526,
"num_tokens": 1559674.0,
"step": 860
},
{
"entropy": 7.268103885650635,
"epoch": 0.7434464976364418,
"grad_norm": 1.09375,
"learning_rate": 0.000432,
"loss": 6.9687,
"mean_token_accuracy": 0.1217973381280899,
"num_tokens": 1569481.0,
"step": 865
},
{
"entropy": 7.2675707817077635,
"epoch": 0.7477438762354963,
"grad_norm": 1.0859375,
"learning_rate": 0.0004345,
"loss": 6.9975,
"mean_token_accuracy": 0.11359266638755798,
"num_tokens": 1578488.0,
"step": 870
},
{
"entropy": 7.171451759338379,
"epoch": 0.752041254834551,
"grad_norm": 1.0625,
"learning_rate": 0.000437,
"loss": 6.8946,
"mean_token_accuracy": 0.11810402423143387,
"num_tokens": 1586675.0,
"step": 875
},
{
"entropy": 7.285072469711304,
"epoch": 0.7563386334336055,
"grad_norm": 1.0859375,
"learning_rate": 0.0004395,
"loss": 7.0021,
"mean_token_accuracy": 0.10800698548555374,
"num_tokens": 1595411.0,
"step": 880
},
{
"entropy": 7.312672233581543,
"epoch": 0.7606360120326601,
"grad_norm": 1.1953125,
"learning_rate": 0.000442,
"loss": 6.9755,
"mean_token_accuracy": 0.11759781166911125,
"num_tokens": 1604046.0,
"step": 885
},
{
"entropy": 7.245748281478882,
"epoch": 0.7649333906317146,
"grad_norm": 1.0859375,
"learning_rate": 0.0004445,
"loss": 6.9643,
"mean_token_accuracy": 0.11201045587658882,
"num_tokens": 1613759.0,
"step": 890
},
{
"entropy": 7.238279533386231,
"epoch": 0.7692307692307693,
"grad_norm": 1.015625,
"learning_rate": 0.000447,
"loss": 6.9209,
"mean_token_accuracy": 0.11877147182822227,
"num_tokens": 1623323.0,
"step": 895
},
{
"entropy": 7.230697107315064,
"epoch": 0.7735281478298238,
"grad_norm": 1.1328125,
"learning_rate": 0.00044950000000000003,
"loss": 6.9005,
"mean_token_accuracy": 0.11391794160008431,
"num_tokens": 1631727.0,
"step": 900
},
{
"entropy": 7.194222545623779,
"epoch": 0.7778255264288784,
"grad_norm": 1.1875,
"learning_rate": 0.00045200000000000004,
"loss": 6.8583,
"mean_token_accuracy": 0.12049278989434242,
"num_tokens": 1639544.0,
"step": 905
},
{
"entropy": 7.284112405776978,
"epoch": 0.7821229050279329,
"grad_norm": 1.125,
"learning_rate": 0.00045450000000000004,
"loss": 6.9773,
"mean_token_accuracy": 0.11113567724823951,
"num_tokens": 1648931.0,
"step": 910
},
{
"entropy": 7.1627342224121096,
"epoch": 0.7864202836269876,
"grad_norm": 1.15625,
"learning_rate": 0.00045700000000000005,
"loss": 6.8345,
"mean_token_accuracy": 0.12127922549843788,
"num_tokens": 1657688.0,
"step": 915
},
{
"entropy": 7.259271335601807,
"epoch": 0.7907176622260421,
"grad_norm": 1.0390625,
"learning_rate": 0.00045950000000000006,
"loss": 6.9244,
"mean_token_accuracy": 0.11565326899290085,
"num_tokens": 1666879.0,
"step": 920
},
{
"entropy": 7.1275458335876465,
"epoch": 0.7950150408250967,
"grad_norm": 1.109375,
"learning_rate": 0.000462,
"loss": 6.8982,
"mean_token_accuracy": 0.118662890791893,
"num_tokens": 1676773.0,
"step": 925
},
{
"entropy": 7.2360998630523685,
"epoch": 0.7993124194241513,
"grad_norm": 1.0859375,
"learning_rate": 0.0004645,
"loss": 7.0092,
"mean_token_accuracy": 0.11184348464012146,
"num_tokens": 1686144.0,
"step": 930
},
{
"entropy": 7.26247010231018,
"epoch": 0.8036097980232059,
"grad_norm": 1.078125,
"learning_rate": 0.000467,
"loss": 6.9646,
"mean_token_accuracy": 0.10949353277683258,
"num_tokens": 1695476.0,
"step": 935
},
{
"entropy": 7.174946022033692,
"epoch": 0.8079071766222604,
"grad_norm": 1.046875,
"learning_rate": 0.0004695,
"loss": 6.8498,
"mean_token_accuracy": 0.12084392830729485,
"num_tokens": 1704907.0,
"step": 940
},
{
"entropy": 7.166734504699707,
"epoch": 0.812204555221315,
"grad_norm": 0.9609375,
"learning_rate": 0.000472,
"loss": 6.8948,
"mean_token_accuracy": 0.12091493904590607,
"num_tokens": 1714564.0,
"step": 945
},
{
"entropy": 7.244975614547729,
"epoch": 0.8165019338203696,
"grad_norm": 1.1171875,
"learning_rate": 0.0004745,
"loss": 6.9209,
"mean_token_accuracy": 0.1155279442667961,
"num_tokens": 1725285.0,
"step": 950
},
{
"entropy": 7.1149109363555905,
"epoch": 0.8207993124194242,
"grad_norm": 1.03125,
"learning_rate": 0.000477,
"loss": 6.9153,
"mean_token_accuracy": 0.11715079098939896,
"num_tokens": 1734331.0,
"step": 955
},
{
"entropy": 7.227117824554443,
"epoch": 0.8250966910184787,
"grad_norm": 1.2578125,
"learning_rate": 0.0004795,
"loss": 6.852,
"mean_token_accuracy": 0.11185217499732972,
"num_tokens": 1742340.0,
"step": 960
},
{
"entropy": 7.160442066192627,
"epoch": 0.8293940696175333,
"grad_norm": 1.109375,
"learning_rate": 0.000482,
"loss": 6.8351,
"mean_token_accuracy": 0.12198592498898506,
"num_tokens": 1751725.0,
"step": 965
},
{
"entropy": 6.999344539642334,
"epoch": 0.8336914482165879,
"grad_norm": 1.1328125,
"learning_rate": 0.0004845,
"loss": 6.7683,
"mean_token_accuracy": 0.12398558706045151,
"num_tokens": 1760294.0,
"step": 970
},
{
"entropy": 7.112461137771606,
"epoch": 0.8379888268156425,
"grad_norm": 1.0546875,
"learning_rate": 0.000487,
"loss": 6.8275,
"mean_token_accuracy": 0.11639805063605309,
"num_tokens": 1768912.0,
"step": 975
},
{
"entropy": 7.257990169525146,
"epoch": 0.842286205414697,
"grad_norm": 1.0390625,
"learning_rate": 0.0004895,
"loss": 7.0148,
"mean_token_accuracy": 0.12016609534621239,
"num_tokens": 1778633.0,
"step": 980
},
{
"entropy": 7.1191816329956055,
"epoch": 0.8465835840137517,
"grad_norm": 1.1171875,
"learning_rate": 0.000492,
"loss": 6.8847,
"mean_token_accuracy": 0.11811531409621238,
"num_tokens": 1787275.0,
"step": 985
},
{
"entropy": 7.235857200622559,
"epoch": 0.8508809626128062,
"grad_norm": 1.2578125,
"learning_rate": 0.0004945,
"loss": 6.8878,
"mean_token_accuracy": 0.11604067236185074,
"num_tokens": 1795994.0,
"step": 990
},
{
"entropy": 7.036646842956543,
"epoch": 0.8551783412118608,
"grad_norm": 0.8359375,
"learning_rate": 0.000497,
"loss": 6.804,
"mean_token_accuracy": 0.11985133662819862,
"num_tokens": 1806379.0,
"step": 995
},
{
"entropy": 7.154667520523072,
"epoch": 0.8594757198109153,
"grad_norm": 1.0546875,
"learning_rate": 0.0004995,
"loss": 6.8296,
"mean_token_accuracy": 0.1270947828888893,
"num_tokens": 1816135.0,
"step": 1000
},
{
"epoch": 0.8594757198109153,
"eval_entropy": 6.812919497489929,
"eval_loss": 6.8574419021606445,
"eval_mean_token_accuracy": 0.12292942362795542,
"eval_num_tokens": 1816135.0,
"eval_runtime": 2.0522,
"eval_samples_per_second": 1729.37,
"eval_steps_per_second": 216.354,
"step": 1000
},
{
"entropy": 7.122643280029297,
"epoch": 0.86377309840997,
"grad_norm": 1.2734375,
"learning_rate": 0.0004999998427807679,
"loss": 6.8305,
"mean_token_accuracy": 0.12133256047964096,
"num_tokens": 1824777.0,
"step": 1005
},
{
"entropy": 7.058982563018799,
"epoch": 0.8680704770090245,
"grad_norm": 1.234375,
"learning_rate": 0.0004999992040780138,
"loss": 6.8924,
"mean_token_accuracy": 0.12320492565631866,
"num_tokens": 1833807.0,
"step": 1010
},
{
"entropy": 7.185050773620605,
"epoch": 0.8723678556080791,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999980740669294,
"loss": 6.8357,
"mean_token_accuracy": 0.11969011649489403,
"num_tokens": 1843375.0,
"step": 1015
},
{
"entropy": 7.11086139678955,
"epoch": 0.8766652342071336,
"grad_norm": 1.140625,
"learning_rate": 0.0004999964527499823,
"loss": 6.9058,
"mean_token_accuracy": 0.11237111985683441,
"num_tokens": 1853036.0,
"step": 1020
},
{
"entropy": 7.120519638061523,
"epoch": 0.8809626128061883,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999943401307127,
"loss": 6.8707,
"mean_token_accuracy": 0.11769452393054962,
"num_tokens": 1862041.0,
"step": 1025
},
{
"entropy": 7.087871503829956,
"epoch": 0.8852599914052428,
"grad_norm": 1.1015625,
"learning_rate": 0.0004999917362137337,
"loss": 6.7742,
"mean_token_accuracy": 0.1225271351635456,
"num_tokens": 1870707.0,
"step": 1030
},
{
"entropy": 7.055140686035156,
"epoch": 0.8895573700042974,
"grad_norm": 1.078125,
"learning_rate": 0.0004999886410047312,
"loss": 6.7705,
"mean_token_accuracy": 0.11845692843198777,
"num_tokens": 1879787.0,
"step": 1035
},
{
"entropy": 7.138674926757813,
"epoch": 0.8938547486033519,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999850545104638,
"loss": 6.8315,
"mean_token_accuracy": 0.1223653219640255,
"num_tokens": 1889413.0,
"step": 1040
},
{
"entropy": 7.048402404785156,
"epoch": 0.8981521272024066,
"grad_norm": 1.171875,
"learning_rate": 0.0004999809767387633,
"loss": 6.8174,
"mean_token_accuracy": 0.12110616937279702,
"num_tokens": 1898283.0,
"step": 1045
},
{
"entropy": 7.144178056716919,
"epoch": 0.9024495058014611,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999764076985337,
"loss": 6.8287,
"mean_token_accuracy": 0.12670400962233544,
"num_tokens": 1907175.0,
"step": 1050
},
{
"entropy": 6.988327312469482,
"epoch": 0.9067468844005157,
"grad_norm": 1.09375,
"learning_rate": 0.0004999713473997519,
"loss": 6.8824,
"mean_token_accuracy": 0.11774980947375298,
"num_tokens": 1918223.0,
"step": 1055
},
{
"entropy": 7.124748563766479,
"epoch": 0.9110442629995703,
"grad_norm": 1.09375,
"learning_rate": 0.0004999657958534677,
"loss": 6.8312,
"mean_token_accuracy": 0.1194000355899334,
"num_tokens": 1928801.0,
"step": 1060
},
{
"entropy": 7.008511686325074,
"epoch": 0.9153416415986249,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999597530718034,
"loss": 6.7896,
"mean_token_accuracy": 0.12186847031116485,
"num_tokens": 1937406.0,
"step": 1065
},
{
"entropy": 6.997484445571899,
"epoch": 0.9196390201976794,
"grad_norm": 1.078125,
"learning_rate": 0.000499953219067954,
"loss": 6.7932,
"mean_token_accuracy": 0.11857569143176079,
"num_tokens": 1947184.0,
"step": 1070
},
{
"entropy": 7.135808944702148,
"epoch": 0.923936398796734,
"grad_norm": 1.09375,
"learning_rate": 0.0004999461938561873,
"loss": 6.8139,
"mean_token_accuracy": 0.12288291603326798,
"num_tokens": 1956293.0,
"step": 1075
},
{
"entropy": 7.027012157440185,
"epoch": 0.9282337773957886,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999386774518432,
"loss": 6.7854,
"mean_token_accuracy": 0.11997194737195968,
"num_tokens": 1964791.0,
"step": 1080
},
{
"entropy": 6.975531768798828,
"epoch": 0.9325311559948432,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999306698713349,
"loss": 6.7088,
"mean_token_accuracy": 0.12559010088443756,
"num_tokens": 1973754.0,
"step": 1085
},
{
"entropy": 7.052453565597534,
"epoch": 0.9368285345938977,
"grad_norm": 1.078125,
"learning_rate": 0.0004999221711321477,
"loss": 6.7738,
"mean_token_accuracy": 0.12475829720497131,
"num_tokens": 1983035.0,
"step": 1090
},
{
"entropy": 6.906819009780884,
"epoch": 0.9411259131929522,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999131812528393,
"loss": 6.8003,
"mean_token_accuracy": 0.12229804769158363,
"num_tokens": 1992584.0,
"step": 1095
},
{
"entropy": 7.109902429580688,
"epoch": 0.9454232917920069,
"grad_norm": 0.97265625,
"learning_rate": 0.00049990370025304,
"loss": 6.8193,
"mean_token_accuracy": 0.12188051193952561,
"num_tokens": 2001876.0,
"step": 1100
},
{
"entropy": 7.017454195022583,
"epoch": 0.9497206703910615,
"grad_norm": 0.97265625,
"learning_rate": 0.0004998937281534526,
"loss": 6.7115,
"mean_token_accuracy": 0.1300358146429062,
"num_tokens": 2011067.0,
"step": 1105
},
{
"entropy": 7.091220808029175,
"epoch": 0.954018048990116,
"grad_norm": 1.09375,
"learning_rate": 0.0004998832649758521,
"loss": 6.8077,
"mean_token_accuracy": 0.12548175528645517,
"num_tokens": 2020763.0,
"step": 1110
},
{
"entropy": 6.9685986042022705,
"epoch": 0.9583154275891707,
"grad_norm": 1.1796875,
"learning_rate": 0.0004998723107430862,
"loss": 6.7867,
"mean_token_accuracy": 0.12391732335090637,
"num_tokens": 2029534.0,
"step": 1115
},
{
"entropy": 7.046098041534424,
"epoch": 0.9626128061882252,
"grad_norm": 1.09375,
"learning_rate": 0.0004998608654790741,
"loss": 6.7311,
"mean_token_accuracy": 0.12396327033638954,
"num_tokens": 2039143.0,
"step": 1120
},
{
"entropy": 6.939239406585694,
"epoch": 0.9669101847872797,
"grad_norm": 1.125,
"learning_rate": 0.000499848929208808,
"loss": 6.7022,
"mean_token_accuracy": 0.1295892022550106,
"num_tokens": 2048253.0,
"step": 1125
},
{
"entropy": 6.931437301635742,
"epoch": 0.9712075633863343,
"grad_norm": 1.1484375,
"learning_rate": 0.0004998365019583519,
"loss": 6.7428,
"mean_token_accuracy": 0.13122318536043168,
"num_tokens": 2057234.0,
"step": 1130
},
{
"entropy": 7.081391954421997,
"epoch": 0.975504941985389,
"grad_norm": 1.1953125,
"learning_rate": 0.0004998235837548417,
"loss": 6.7881,
"mean_token_accuracy": 0.1271953523159027,
"num_tokens": 2065431.0,
"step": 1135
},
{
"entropy": 6.974546146392822,
"epoch": 0.9798023205844435,
"grad_norm": 1.0625,
"learning_rate": 0.000499810174626486,
"loss": 6.7888,
"mean_token_accuracy": 0.1228917419910431,
"num_tokens": 2074723.0,
"step": 1140
},
{
"entropy": 7.011039209365845,
"epoch": 0.984099699183498,
"grad_norm": 1.1953125,
"learning_rate": 0.0004997962746025646,
"loss": 6.6544,
"mean_token_accuracy": 0.13169871941208838,
"num_tokens": 2084509.0,
"step": 1145
},
{
"entropy": 6.973200798034668,
"epoch": 0.9883970777825526,
"grad_norm": 1.21875,
"learning_rate": 0.0004997818837134298,
"loss": 6.8028,
"mean_token_accuracy": 0.12382483929395675,
"num_tokens": 2093110.0,
"step": 1150
},
{
"entropy": 6.879178285598755,
"epoch": 0.9926944563816072,
"grad_norm": 1.125,
"learning_rate": 0.0004997670019905057,
"loss": 6.6634,
"mean_token_accuracy": 0.12532600611448289,
"num_tokens": 2102355.0,
"step": 1155
},
{
"entropy": 6.967250823974609,
"epoch": 0.9969918349806618,
"grad_norm": 1.171875,
"learning_rate": 0.0004997516294662876,
"loss": 6.6987,
"mean_token_accuracy": 0.12651606351137162,
"num_tokens": 2110418.0,
"step": 1160
},
{
"entropy": 6.987489064534505,
"epoch": 1.0008594757198108,
"grad_norm": 1.1484375,
"learning_rate": 0.0004997357661743433,
"loss": 6.6851,
"mean_token_accuracy": 0.12885562578837076,
"num_tokens": 2117866.0,
"step": 1165
},
{
"entropy": 6.906875991821289,
"epoch": 1.0051568543188656,
"grad_norm": 1.09375,
"learning_rate": 0.0004997194121493118,
"loss": 6.5242,
"mean_token_accuracy": 0.1341039627790451,
"num_tokens": 2126082.0,
"step": 1170
},
{
"entropy": 6.9217222213745115,
"epoch": 1.0094542329179201,
"grad_norm": 1.078125,
"learning_rate": 0.0004997025674269037,
"loss": 6.496,
"mean_token_accuracy": 0.14013660922646523,
"num_tokens": 2134042.0,
"step": 1175
},
{
"entropy": 6.853777265548706,
"epoch": 1.0137516115169747,
"grad_norm": 1.1953125,
"learning_rate": 0.0004996852320439013,
"loss": 6.5756,
"mean_token_accuracy": 0.13146138042211533,
"num_tokens": 2142570.0,
"step": 1180
},
{
"entropy": 6.882978248596191,
"epoch": 1.0180489901160292,
"grad_norm": 0.9765625,
"learning_rate": 0.0004996674060381578,
"loss": 6.5116,
"mean_token_accuracy": 0.13583723902702333,
"num_tokens": 2151310.0,
"step": 1185
},
{
"entropy": 6.949011325836182,
"epoch": 1.0223463687150838,
"grad_norm": 1.09375,
"learning_rate": 0.0004996490894485985,
"loss": 6.5696,
"mean_token_accuracy": 0.1317083679139614,
"num_tokens": 2160662.0,
"step": 1190
},
{
"entropy": 6.906634664535522,
"epoch": 1.0266437473141383,
"grad_norm": 1.078125,
"learning_rate": 0.0004996302823152193,
"loss": 6.5221,
"mean_token_accuracy": 0.132858457416296,
"num_tokens": 2170067.0,
"step": 1195
},
{
"entropy": 6.835825204849243,
"epoch": 1.0309411259131929,
"grad_norm": 1.09375,
"learning_rate": 0.0004996109846790873,
"loss": 6.4844,
"mean_token_accuracy": 0.13565613552927971,
"num_tokens": 2178850.0,
"step": 1200
},
{
"entropy": 6.833173513412476,
"epoch": 1.0352385045122476,
"grad_norm": 0.984375,
"learning_rate": 0.0004995911965823412,
"loss": 6.5058,
"mean_token_accuracy": 0.14241415858268738,
"num_tokens": 2188307.0,
"step": 1205
},
{
"entropy": 6.888755178451538,
"epoch": 1.0395358831113022,
"grad_norm": 1.171875,
"learning_rate": 0.0004995709180681899,
"loss": 6.5098,
"mean_token_accuracy": 0.14214854687452316,
"num_tokens": 2197026.0,
"step": 1210
},
{
"entropy": 6.828827667236328,
"epoch": 1.0438332617103567,
"grad_norm": 1.109375,
"learning_rate": 0.000499550149180914,
"loss": 6.4795,
"mean_token_accuracy": 0.13599886670708655,
"num_tokens": 2205537.0,
"step": 1215
},
{
"entropy": 6.880095815658569,
"epoch": 1.0481306403094113,
"grad_norm": 1.15625,
"learning_rate": 0.0004995288899658641,
"loss": 6.5128,
"mean_token_accuracy": 0.14047559648752211,
"num_tokens": 2214508.0,
"step": 1220
},
{
"entropy": 6.848831415176392,
"epoch": 1.0524280189084658,
"grad_norm": 1.1796875,
"learning_rate": 0.0004995071404694619,
"loss": 6.6248,
"mean_token_accuracy": 0.1286735638976097,
"num_tokens": 2223084.0,
"step": 1225
},
{
"entropy": 6.930538558959961,
"epoch": 1.0567253975075204,
"grad_norm": 1.0546875,
"learning_rate": 0.0004994849007391996,
"loss": 6.5507,
"mean_token_accuracy": 0.12893568202853203,
"num_tokens": 2231406.0,
"step": 1230
},
{
"entropy": 6.784887790679932,
"epoch": 1.061022776106575,
"grad_norm": 1.0859375,
"learning_rate": 0.0004994621708236401,
"loss": 6.4682,
"mean_token_accuracy": 0.136442781239748,
"num_tokens": 2239867.0,
"step": 1235
},
{
"entropy": 6.8624866008758545,
"epoch": 1.0653201547056295,
"grad_norm": 1.203125,
"learning_rate": 0.000499438950772416,
"loss": 6.5264,
"mean_token_accuracy": 0.1343722127377987,
"num_tokens": 2248844.0,
"step": 1240
},
{
"entropy": 6.764705419540405,
"epoch": 1.0696175333046842,
"grad_norm": 1.125,
"learning_rate": 0.0004994152406362311,
"loss": 6.4525,
"mean_token_accuracy": 0.14018251076340676,
"num_tokens": 2257599.0,
"step": 1245
},
{
"entropy": 6.871714019775391,
"epoch": 1.0739149119037388,
"grad_norm": 1.2421875,
"learning_rate": 0.0004993910404668586,
"loss": 6.4992,
"mean_token_accuracy": 0.1316287100315094,
"num_tokens": 2266510.0,
"step": 1250
},
{
"entropy": 6.801673936843872,
"epoch": 1.0782122905027933,
"grad_norm": 1.0,
"learning_rate": 0.000499366350317142,
"loss": 6.4902,
"mean_token_accuracy": 0.1355181120336056,
"num_tokens": 2275462.0,
"step": 1255
},
{
"entropy": 6.805047512054443,
"epoch": 1.0825096691018479,
"grad_norm": 1.1484375,
"learning_rate": 0.0004993411702409948,
"loss": 6.4684,
"mean_token_accuracy": 0.13499311953783036,
"num_tokens": 2283826.0,
"step": 1260
},
{
"entropy": 6.796231460571289,
"epoch": 1.0868070477009024,
"grad_norm": 1.171875,
"learning_rate": 0.0004993155002934002,
"loss": 6.4758,
"mean_token_accuracy": 0.13739539608359336,
"num_tokens": 2292967.0,
"step": 1265
},
{
"entropy": 6.935551691055298,
"epoch": 1.091104426299957,
"grad_norm": 1.5078125,
"learning_rate": 0.0004992893405304111,
"loss": 6.6091,
"mean_token_accuracy": 0.13493912890553475,
"num_tokens": 2302336.0,
"step": 1270
},
{
"entropy": 6.757972192764282,
"epoch": 1.0954018048990115,
"grad_norm": 1.03125,
"learning_rate": 0.00049926269100915,
"loss": 6.5039,
"mean_token_accuracy": 0.14085786640644074,
"num_tokens": 2311465.0,
"step": 1275
},
{
"entropy": 6.884800767898559,
"epoch": 1.0996991834980663,
"grad_norm": 1.0859375,
"learning_rate": 0.0004992355517878087,
"loss": 6.6134,
"mean_token_accuracy": 0.12797435671091079,
"num_tokens": 2320281.0,
"step": 1280
},
{
"entropy": 6.775428581237793,
"epoch": 1.1039965620971208,
"grad_norm": 1.15625,
"learning_rate": 0.0004992079229256484,
"loss": 6.5189,
"mean_token_accuracy": 0.1329084627330303,
"num_tokens": 2329755.0,
"step": 1285
},
{
"entropy": 6.721524858474732,
"epoch": 1.1082939406961754,
"grad_norm": 1.015625,
"learning_rate": 0.0004991798044829996,
"loss": 6.4524,
"mean_token_accuracy": 0.1344260886311531,
"num_tokens": 2338807.0,
"step": 1290
},
{
"entropy": 6.870701122283935,
"epoch": 1.11259131929523,
"grad_norm": 1.109375,
"learning_rate": 0.0004991511965212618,
"loss": 6.5591,
"mean_token_accuracy": 0.13554905205965043,
"num_tokens": 2348056.0,
"step": 1295
},
{
"entropy": 6.759064626693726,
"epoch": 1.1168886978942845,
"grad_norm": 1.0546875,
"learning_rate": 0.0004991220991029032,
"loss": 6.5619,
"mean_token_accuracy": 0.13164993077516557,
"num_tokens": 2357780.0,
"step": 1300
},
{
"entropy": 6.845104169845581,
"epoch": 1.121186076493339,
"grad_norm": 1.296875,
"learning_rate": 0.000499092512291461,
"loss": 6.526,
"mean_token_accuracy": 0.13971479684114457,
"num_tokens": 2367060.0,
"step": 1305
},
{
"entropy": 6.800533056259155,
"epoch": 1.1254834550923936,
"grad_norm": 1.0859375,
"learning_rate": 0.000499062436151541,
"loss": 6.5277,
"mean_token_accuracy": 0.13263508304953575,
"num_tokens": 2375751.0,
"step": 1310
},
{
"entropy": 6.890619134902954,
"epoch": 1.129780833691448,
"grad_norm": 1.109375,
"learning_rate": 0.0004990318707488173,
"loss": 6.5788,
"mean_token_accuracy": 0.12899956330657006,
"num_tokens": 2385013.0,
"step": 1315
},
{
"entropy": 6.769053792953491,
"epoch": 1.1340782122905029,
"grad_norm": 1.140625,
"learning_rate": 0.0004990008161500327,
"loss": 6.48,
"mean_token_accuracy": 0.1359359547495842,
"num_tokens": 2392935.0,
"step": 1320
},
{
"entropy": 6.7767839431762695,
"epoch": 1.1383755908895574,
"grad_norm": 1.2109375,
"learning_rate": 0.000498969272422998,
"loss": 6.4887,
"mean_token_accuracy": 0.13946662694215775,
"num_tokens": 2401560.0,
"step": 1325
},
{
"entropy": 6.732125520706177,
"epoch": 1.142672969488612,
"grad_norm": 1.0546875,
"learning_rate": 0.0004989372396365921,
"loss": 6.4183,
"mean_token_accuracy": 0.13894038647413254,
"num_tokens": 2410050.0,
"step": 1330
},
{
"entropy": 6.8855541229248045,
"epoch": 1.1469703480876665,
"grad_norm": 1.1015625,
"learning_rate": 0.0004989047178607618,
"loss": 6.5218,
"mean_token_accuracy": 0.13579266518354416,
"num_tokens": 2418980.0,
"step": 1335
},
{
"entropy": 6.7566611766815186,
"epoch": 1.151267726686721,
"grad_norm": 1.09375,
"learning_rate": 0.0004988717071665215,
"loss": 6.5177,
"mean_token_accuracy": 0.13580050468444824,
"num_tokens": 2427992.0,
"step": 1340
},
{
"entropy": 6.821787118911743,
"epoch": 1.1555651052857756,
"grad_norm": 0.99609375,
"learning_rate": 0.0004988382076259537,
"loss": 6.4297,
"mean_token_accuracy": 0.1417124703526497,
"num_tokens": 2436368.0,
"step": 1345
},
{
"entropy": 6.65723991394043,
"epoch": 1.1598624838848304,
"grad_norm": 1.0,
"learning_rate": 0.0004988042193122077,
"loss": 6.4243,
"mean_token_accuracy": 0.1399266541004181,
"num_tokens": 2445499.0,
"step": 1350
},
{
"entropy": 6.846164894104004,
"epoch": 1.164159862483885,
"grad_norm": 1.171875,
"learning_rate": 0.0004987697422995005,
"loss": 6.4564,
"mean_token_accuracy": 0.13335739225149154,
"num_tokens": 2454312.0,
"step": 1355
},
{
"entropy": 6.705566883087158,
"epoch": 1.1684572410829395,
"grad_norm": 1.0625,
"learning_rate": 0.0004987347766631161,
"loss": 6.5179,
"mean_token_accuracy": 0.13981100916862488,
"num_tokens": 2462922.0,
"step": 1360
},
{
"entropy": 6.8054440975189205,
"epoch": 1.172754619681994,
"grad_norm": 1.046875,
"learning_rate": 0.0004986993224794055,
"loss": 6.5574,
"mean_token_accuracy": 0.12931617349386215,
"num_tokens": 2472195.0,
"step": 1365
},
{
"entropy": 6.731846857070923,
"epoch": 1.1770519982810486,
"grad_norm": 1.171875,
"learning_rate": 0.0004986633798257865,
"loss": 6.456,
"mean_token_accuracy": 0.13557855412364006,
"num_tokens": 2481021.0,
"step": 1370
},
{
"entropy": 6.709754800796508,
"epoch": 1.181349376880103,
"grad_norm": 1.171875,
"learning_rate": 0.0004986269487807434,
"loss": 6.4682,
"mean_token_accuracy": 0.13462188541889192,
"num_tokens": 2490250.0,
"step": 1375
},
{
"entropy": 6.8344573974609375,
"epoch": 1.1856467554791577,
"grad_norm": 1.0625,
"learning_rate": 0.000498590029423827,
"loss": 6.529,
"mean_token_accuracy": 0.13892517015337943,
"num_tokens": 2499122.0,
"step": 1380
},
{
"entropy": 6.794313240051269,
"epoch": 1.1899441340782122,
"grad_norm": 1.109375,
"learning_rate": 0.0004985526218356546,
"loss": 6.5102,
"mean_token_accuracy": 0.13186247944831847,
"num_tokens": 2508454.0,
"step": 1385
},
{
"entropy": 6.717947912216187,
"epoch": 1.1942415126772667,
"grad_norm": 1.09375,
"learning_rate": 0.0004985147260979093,
"loss": 6.449,
"mean_token_accuracy": 0.1434843860566616,
"num_tokens": 2517353.0,
"step": 1390
},
{
"entropy": 6.771858787536621,
"epoch": 1.1985388912763215,
"grad_norm": 1.140625,
"learning_rate": 0.0004984763422933402,
"loss": 6.4618,
"mean_token_accuracy": 0.13847233429551126,
"num_tokens": 2526321.0,
"step": 1395
},
{
"entropy": 6.732237863540649,
"epoch": 1.202836269875376,
"grad_norm": 0.984375,
"learning_rate": 0.0004984374705057623,
"loss": 6.5033,
"mean_token_accuracy": 0.13528537154197692,
"num_tokens": 2535924.0,
"step": 1400
},
{
"entropy": 6.721146202087402,
"epoch": 1.2071336484744306,
"grad_norm": 1.1484375,
"learning_rate": 0.0004983981108200561,
"loss": 6.4711,
"mean_token_accuracy": 0.13535311296582223,
"num_tokens": 2545606.0,
"step": 1405
},
{
"entropy": 6.733812093734741,
"epoch": 1.2114310270734852,
"grad_norm": 1.125,
"learning_rate": 0.0004983582633221672,
"loss": 6.4601,
"mean_token_accuracy": 0.1369933992624283,
"num_tokens": 2554947.0,
"step": 1410
},
{
"entropy": 6.855603933334351,
"epoch": 1.2157284056725397,
"grad_norm": 0.984375,
"learning_rate": 0.0004983179280991068,
"loss": 6.6134,
"mean_token_accuracy": 0.12978528887033464,
"num_tokens": 2564462.0,
"step": 1415
},
{
"entropy": 6.726688861846924,
"epoch": 1.2200257842715942,
"grad_norm": 1.09375,
"learning_rate": 0.0004982771052389508,
"loss": 6.4475,
"mean_token_accuracy": 0.1368112660944462,
"num_tokens": 2573124.0,
"step": 1420
},
{
"entropy": 6.807424783706665,
"epoch": 1.224323162870649,
"grad_norm": 1.1015625,
"learning_rate": 0.0004982357948308401,
"loss": 6.5481,
"mean_token_accuracy": 0.13265790268778802,
"num_tokens": 2581829.0,
"step": 1425
},
{
"entropy": 6.770775365829468,
"epoch": 1.2286205414697036,
"grad_norm": 1.1015625,
"learning_rate": 0.0004981939969649799,
"loss": 6.4049,
"mean_token_accuracy": 0.14194427505135537,
"num_tokens": 2590631.0,
"step": 1430
},
{
"entropy": 6.709357166290284,
"epoch": 1.232917920068758,
"grad_norm": 1.1640625,
"learning_rate": 0.0004981517117326404,
"loss": 6.5216,
"mean_token_accuracy": 0.13609697446227073,
"num_tokens": 2600684.0,
"step": 1435
},
{
"entropy": 6.725667095184326,
"epoch": 1.2372152986678127,
"grad_norm": 1.046875,
"learning_rate": 0.0004981089392261553,
"loss": 6.4349,
"mean_token_accuracy": 0.14131608307361604,
"num_tokens": 2609667.0,
"step": 1440
},
{
"entropy": 6.692513275146484,
"epoch": 1.2415126772668672,
"grad_norm": 0.99609375,
"learning_rate": 0.000498065679538923,
"loss": 6.5055,
"mean_token_accuracy": 0.14114993885159494,
"num_tokens": 2620025.0,
"step": 1445
},
{
"entropy": 6.7513340473175045,
"epoch": 1.2458100558659218,
"grad_norm": 1.125,
"learning_rate": 0.0004980219327654049,
"loss": 6.428,
"mean_token_accuracy": 0.13774933964014052,
"num_tokens": 2629032.0,
"step": 1450
},
{
"entropy": 6.702835464477539,
"epoch": 1.2501074344649763,
"grad_norm": 1.09375,
"learning_rate": 0.000497977699001127,
"loss": 6.402,
"mean_token_accuracy": 0.142982679605484,
"num_tokens": 2638303.0,
"step": 1455
},
{
"entropy": 6.761410474777222,
"epoch": 1.2544048130640308,
"grad_norm": 1.125,
"learning_rate": 0.0004979329783426778,
"loss": 6.4318,
"mean_token_accuracy": 0.14380076453089713,
"num_tokens": 2647902.0,
"step": 1460
},
{
"entropy": 6.731089019775391,
"epoch": 1.2587021916630854,
"grad_norm": 1.1015625,
"learning_rate": 0.0004978877708877094,
"loss": 6.4848,
"mean_token_accuracy": 0.13676076754927635,
"num_tokens": 2657902.0,
"step": 1465
},
{
"entropy": 6.71400637626648,
"epoch": 1.2629995702621402,
"grad_norm": 1.0703125,
"learning_rate": 0.0004978420767349368,
"loss": 6.4196,
"mean_token_accuracy": 0.13780386745929718,
"num_tokens": 2667082.0,
"step": 1470
},
{
"entropy": 6.737793684005737,
"epoch": 1.2672969488611947,
"grad_norm": 1.03125,
"learning_rate": 0.0004977958959841379,
"loss": 6.4943,
"mean_token_accuracy": 0.1352358005940914,
"num_tokens": 2676855.0,
"step": 1475
},
{
"entropy": 6.734015226364136,
"epoch": 1.2715943274602493,
"grad_norm": 1.0390625,
"learning_rate": 0.000497749228736153,
"loss": 6.4201,
"mean_token_accuracy": 0.14142746701836587,
"num_tokens": 2685750.0,
"step": 1480
},
{
"entropy": 6.656690311431885,
"epoch": 1.2758917060593038,
"grad_norm": 1.171875,
"learning_rate": 0.0004977020750928845,
"loss": 6.4771,
"mean_token_accuracy": 0.14191860556602479,
"num_tokens": 2695272.0,
"step": 1485
},
{
"entropy": 6.794925928115845,
"epoch": 1.2801890846583583,
"grad_norm": 1.046875,
"learning_rate": 0.0004976544351572973,
"loss": 6.4253,
"mean_token_accuracy": 0.14196638017892838,
"num_tokens": 2704806.0,
"step": 1490
},
{
"entropy": 6.56059627532959,
"epoch": 1.2844864632574131,
"grad_norm": 1.0390625,
"learning_rate": 0.0004976063090334179,
"loss": 6.4836,
"mean_token_accuracy": 0.14093814194202423,
"num_tokens": 2713521.0,
"step": 1495
},
{
"entropy": 6.7648594856262205,
"epoch": 1.2887838418564677,
"grad_norm": 1.1171875,
"learning_rate": 0.0004975576968263346,
"loss": 6.472,
"mean_token_accuracy": 0.13531532436609267,
"num_tokens": 2721848.0,
"step": 1500
},
{
"epoch": 1.2887838418564677,
"eval_entropy": 6.583824046023257,
"eval_loss": 6.552463054656982,
"eval_mean_token_accuracy": 0.13841687775477096,
"eval_num_tokens": 2721848.0,
"eval_runtime": 2.0451,
"eval_samples_per_second": 1735.359,
"eval_steps_per_second": 217.103,
"step": 1500
},
{
"entropy": 6.6689835548400875,
"epoch": 1.2930812204555222,
"grad_norm": 1.0,
"learning_rate": 0.000497508598642197,
"loss": 6.4406,
"mean_token_accuracy": 0.13946301937103273,
"num_tokens": 2731473.0,
"step": 1505
},
{
"entropy": 6.724963998794555,
"epoch": 1.2973785990545768,
"grad_norm": 1.0625,
"learning_rate": 0.000497459014588216,
"loss": 6.5064,
"mean_token_accuracy": 0.13410719558596612,
"num_tokens": 2739867.0,
"step": 1510
},
{
"entropy": 6.701112556457519,
"epoch": 1.3016759776536313,
"grad_norm": 1.0859375,
"learning_rate": 0.000497408944772663,
"loss": 6.4165,
"mean_token_accuracy": 0.14087883234024048,
"num_tokens": 2748903.0,
"step": 1515
},
{
"entropy": 6.621306848526001,
"epoch": 1.3059733562526858,
"grad_norm": 1.0390625,
"learning_rate": 0.0004973583893048707,
"loss": 6.4144,
"mean_token_accuracy": 0.13790024891495706,
"num_tokens": 2757711.0,
"step": 1520
},
{
"entropy": 6.8078021049499515,
"epoch": 1.3102707348517404,
"grad_norm": 1.109375,
"learning_rate": 0.0004973073482952321,
"loss": 6.4178,
"mean_token_accuracy": 0.14102478623390197,
"num_tokens": 2765633.0,
"step": 1525
},
{
"entropy": 6.606275224685669,
"epoch": 1.314568113450795,
"grad_norm": 1.3046875,
"learning_rate": 0.0004972558218552004,
"loss": 6.454,
"mean_token_accuracy": 0.1388860262930393,
"num_tokens": 2774495.0,
"step": 1530
},
{
"entropy": 6.737347936630249,
"epoch": 1.3188654920498495,
"grad_norm": 1.1328125,
"learning_rate": 0.0004972038100972885,
"loss": 6.4827,
"mean_token_accuracy": 0.13370617032051085,
"num_tokens": 2782665.0,
"step": 1535
},
{
"entropy": 6.652740144729615,
"epoch": 1.323162870648904,
"grad_norm": 1.3125,
"learning_rate": 0.0004971513131350697,
"loss": 6.4163,
"mean_token_accuracy": 0.13846877068281174,
"num_tokens": 2791394.0,
"step": 1540
},
{
"entropy": 6.583173847198486,
"epoch": 1.3274602492479588,
"grad_norm": 1.1484375,
"learning_rate": 0.0004970983310831759,
"loss": 6.4113,
"mean_token_accuracy": 0.13881225883960724,
"num_tokens": 2800488.0,
"step": 1545
},
{
"entropy": 6.734278392791748,
"epoch": 1.3317576278470133,
"grad_norm": 1.03125,
"learning_rate": 0.0004970448640572989,
"loss": 6.5243,
"mean_token_accuracy": 0.1339696764945984,
"num_tokens": 2810116.0,
"step": 1550
},
{
"entropy": 6.658429765701294,
"epoch": 1.336055006446068,
"grad_norm": 0.94921875,
"learning_rate": 0.0004969909121741895,
"loss": 6.3255,
"mean_token_accuracy": 0.14455484077334405,
"num_tokens": 2819205.0,
"step": 1555
},
{
"entropy": 6.591242885589599,
"epoch": 1.3403523850451224,
"grad_norm": 1.109375,
"learning_rate": 0.0004969364755516569,
"loss": 6.4035,
"mean_token_accuracy": 0.13771276026964188,
"num_tokens": 2828017.0,
"step": 1560
},
{
"entropy": 6.73987512588501,
"epoch": 1.344649763644177,
"grad_norm": 1.1328125,
"learning_rate": 0.0004968815543085689,
"loss": 6.438,
"mean_token_accuracy": 0.14133503511548043,
"num_tokens": 2837125.0,
"step": 1565
},
{
"entropy": 6.648034620285034,
"epoch": 1.3489471422432318,
"grad_norm": 1.0625,
"learning_rate": 0.0004968261485648516,
"loss": 6.4665,
"mean_token_accuracy": 0.13752973526716233,
"num_tokens": 2845438.0,
"step": 1570
},
{
"entropy": 6.690678644180298,
"epoch": 1.3532445208422863,
"grad_norm": 1.015625,
"learning_rate": 0.000496770258441489,
"loss": 6.4311,
"mean_token_accuracy": 0.14550055414438248,
"num_tokens": 2854389.0,
"step": 1575
},
{
"entropy": 6.591717529296875,
"epoch": 1.3575418994413408,
"grad_norm": 1.0234375,
"learning_rate": 0.0004967138840605228,
"loss": 6.3947,
"mean_token_accuracy": 0.1433369368314743,
"num_tokens": 2863654.0,
"step": 1580
},
{
"entropy": 6.645109987258911,
"epoch": 1.3618392780403954,
"grad_norm": 1.0703125,
"learning_rate": 0.000496657025545052,
"loss": 6.3068,
"mean_token_accuracy": 0.14519514814019202,
"num_tokens": 2872871.0,
"step": 1585
},
{
"entropy": 6.5770776748657225,
"epoch": 1.36613665663945,
"grad_norm": 1.1328125,
"learning_rate": 0.000496599683019233,
"loss": 6.4037,
"mean_token_accuracy": 0.14221980646252633,
"num_tokens": 2881140.0,
"step": 1590
},
{
"entropy": 6.7226653575897215,
"epoch": 1.3704340352385045,
"grad_norm": 1.0546875,
"learning_rate": 0.000496541856608279,
"loss": 6.3852,
"mean_token_accuracy": 0.14397331327199936,
"num_tokens": 2889945.0,
"step": 1595
},
{
"entropy": 6.5361980438232425,
"epoch": 1.374731413837559,
"grad_norm": 0.95703125,
"learning_rate": 0.0004964835464384595,
"loss": 6.3238,
"mean_token_accuracy": 0.145409494638443,
"num_tokens": 2898897.0,
"step": 1600
},
{
"entropy": 6.686757373809814,
"epoch": 1.3790287924366136,
"grad_norm": 1.09375,
"learning_rate": 0.000496424752637101,
"loss": 6.3401,
"mean_token_accuracy": 0.14611406177282332,
"num_tokens": 2907717.0,
"step": 1605
},
{
"entropy": 6.578691530227661,
"epoch": 1.3833261710356681,
"grad_norm": 1.078125,
"learning_rate": 0.0004963654753325853,
"loss": 6.3297,
"mean_token_accuracy": 0.14271921664476395,
"num_tokens": 2916213.0,
"step": 1610
},
{
"entropy": 6.683462333679199,
"epoch": 1.387623549634723,
"grad_norm": 1.0,
"learning_rate": 0.0004963057146543505,
"loss": 6.4949,
"mean_token_accuracy": 0.1387751467525959,
"num_tokens": 2925706.0,
"step": 1615
},
{
"entropy": 6.599123191833496,
"epoch": 1.3919209282337774,
"grad_norm": 1.015625,
"learning_rate": 0.00049624547073289,
"loss": 6.4208,
"mean_token_accuracy": 0.1372368849813938,
"num_tokens": 2934464.0,
"step": 1620
},
{
"entropy": 6.672312545776367,
"epoch": 1.396218306832832,
"grad_norm": 1.140625,
"learning_rate": 0.0004961847436997526,
"loss": 6.3195,
"mean_token_accuracy": 0.14415977373719216,
"num_tokens": 2944095.0,
"step": 1625
},
{
"entropy": 6.480645990371704,
"epoch": 1.4005156854318865,
"grad_norm": 1.09375,
"learning_rate": 0.0004961235336875416,
"loss": 6.3231,
"mean_token_accuracy": 0.14915895387530326,
"num_tokens": 2953357.0,
"step": 1630
},
{
"entropy": 6.639774322509766,
"epoch": 1.404813064030941,
"grad_norm": 1.109375,
"learning_rate": 0.0004960618408299154,
"loss": 6.4687,
"mean_token_accuracy": 0.13529081642627716,
"num_tokens": 2963020.0,
"step": 1635
},
{
"entropy": 6.682909727096558,
"epoch": 1.4091104426299956,
"grad_norm": 1.046875,
"learning_rate": 0.0004959996652615865,
"loss": 6.319,
"mean_token_accuracy": 0.14330243095755577,
"num_tokens": 2971955.0,
"step": 1640
},
{
"entropy": 6.6523435592651365,
"epoch": 1.4134078212290504,
"grad_norm": 1.0703125,
"learning_rate": 0.0004959370071183216,
"loss": 6.3766,
"mean_token_accuracy": 0.14444040805101394,
"num_tokens": 2980662.0,
"step": 1645
},
{
"entropy": 6.675427007675171,
"epoch": 1.417705199828105,
"grad_norm": 1.1484375,
"learning_rate": 0.0004958738665369407,
"loss": 6.5051,
"mean_token_accuracy": 0.12928852811455727,
"num_tokens": 2990038.0,
"step": 1650
},
{
"entropy": 6.632522964477539,
"epoch": 1.4220025784271595,
"grad_norm": 1.1328125,
"learning_rate": 0.0004958102436553179,
"loss": 6.4172,
"mean_token_accuracy": 0.1390580452978611,
"num_tokens": 2999835.0,
"step": 1655
},
{
"entropy": 6.694387483596802,
"epoch": 1.426299957026214,
"grad_norm": 0.98828125,
"learning_rate": 0.00049574613861238,
"loss": 6.4118,
"mean_token_accuracy": 0.13762674629688262,
"num_tokens": 3009593.0,
"step": 1660
},
{
"entropy": 6.648862266540528,
"epoch": 1.4305973356252686,
"grad_norm": 0.99609375,
"learning_rate": 0.0004956815515481069,
"loss": 6.4348,
"mean_token_accuracy": 0.144145817309618,
"num_tokens": 3019187.0,
"step": 1665
},
{
"entropy": 6.582254266738891,
"epoch": 1.4348947142243231,
"grad_norm": 1.078125,
"learning_rate": 0.0004956164826035309,
"loss": 6.3495,
"mean_token_accuracy": 0.14171260893344878,
"num_tokens": 3027875.0,
"step": 1670
},
{
"entropy": 6.569947624206543,
"epoch": 1.4391920928233777,
"grad_norm": 1.1171875,
"learning_rate": 0.0004955509319207363,
"loss": 6.3833,
"mean_token_accuracy": 0.13855091333389283,
"num_tokens": 3036902.0,
"step": 1675
},
{
"entropy": 6.548913908004761,
"epoch": 1.4434894714224322,
"grad_norm": 0.9375,
"learning_rate": 0.0004954848996428601,
"loss": 6.36,
"mean_token_accuracy": 0.14765606224536895,
"num_tokens": 3046653.0,
"step": 1680
},
{
"entropy": 6.6836981773376465,
"epoch": 1.4477868500214868,
"grad_norm": 1.3515625,
"learning_rate": 0.00049541838591409,
"loss": 6.448,
"mean_token_accuracy": 0.13707543835043906,
"num_tokens": 3056273.0,
"step": 1685
},
{
"entropy": 6.570832586288452,
"epoch": 1.4520842286205415,
"grad_norm": 1.046875,
"learning_rate": 0.0004953513908796657,
"loss": 6.3562,
"mean_token_accuracy": 0.13904846012592315,
"num_tokens": 3065662.0,
"step": 1690
},
{
"entropy": 6.719029092788697,
"epoch": 1.456381607219596,
"grad_norm": 1.140625,
"learning_rate": 0.0004952839146858773,
"loss": 6.3883,
"mean_token_accuracy": 0.14505013972520828,
"num_tokens": 3073970.0,
"step": 1695
},
{
"entropy": 6.546349334716797,
"epoch": 1.4606789858186506,
"grad_norm": 1.1796875,
"learning_rate": 0.0004952159574800658,
"loss": 6.3978,
"mean_token_accuracy": 0.13897576928138733,
"num_tokens": 3082500.0,
"step": 1700
},
{
"entropy": 6.645324468612671,
"epoch": 1.4649763644177052,
"grad_norm": 1.0859375,
"learning_rate": 0.0004951475194106229,
"loss": 6.342,
"mean_token_accuracy": 0.14458465725183486,
"num_tokens": 3091574.0,
"step": 1705
},
{
"entropy": 6.590623474121093,
"epoch": 1.4692737430167597,
"grad_norm": 1.0234375,
"learning_rate": 0.0004950786006269898,
"loss": 6.4477,
"mean_token_accuracy": 0.1356819100677967,
"num_tokens": 3102402.0,
"step": 1710
},
{
"entropy": 6.654024839401245,
"epoch": 1.4735711216158143,
"grad_norm": 1.125,
"learning_rate": 0.0004950092012796576,
"loss": 6.2738,
"mean_token_accuracy": 0.14728236198425293,
"num_tokens": 3111347.0,
"step": 1715
},
{
"entropy": 6.553081369400024,
"epoch": 1.477868500214869,
"grad_norm": 1.1796875,
"learning_rate": 0.0004949393215201666,
"loss": 6.3455,
"mean_token_accuracy": 0.14207591861486435,
"num_tokens": 3120018.0,
"step": 1720
},
{
"entropy": 6.595822668075561,
"epoch": 1.4821658788139236,
"grad_norm": 0.96875,
"learning_rate": 0.0004948689615011065,
"loss": 6.4086,
"mean_token_accuracy": 0.13704866543412209,
"num_tokens": 3129669.0,
"step": 1725
},
{
"entropy": 6.628203105926514,
"epoch": 1.4864632574129781,
"grad_norm": 0.953125,
"learning_rate": 0.0004947981213761154,
"loss": 6.3443,
"mean_token_accuracy": 0.14518199041485785,
"num_tokens": 3139112.0,
"step": 1730
},
{
"entropy": 6.5786394596099855,
"epoch": 1.4907606360120327,
"grad_norm": 1.046875,
"learning_rate": 0.0004947268012998797,
"loss": 6.3058,
"mean_token_accuracy": 0.15637002438306807,
"num_tokens": 3148437.0,
"step": 1735
},
{
"entropy": 6.570107936859131,
"epoch": 1.4950580146110872,
"grad_norm": 0.9609375,
"learning_rate": 0.000494655001428134,
"loss": 6.2891,
"mean_token_accuracy": 0.14667836502194403,
"num_tokens": 3158165.0,
"step": 1740
},
{
"entropy": 6.586823749542236,
"epoch": 1.4993553932101418,
"grad_norm": 1.1015625,
"learning_rate": 0.0004945827219176604,
"loss": 6.3587,
"mean_token_accuracy": 0.1493491068482399,
"num_tokens": 3167262.0,
"step": 1745
},
{
"entropy": 6.514509057998657,
"epoch": 1.5036527718091963,
"grad_norm": 1.0078125,
"learning_rate": 0.0004945099629262888,
"loss": 6.3479,
"mean_token_accuracy": 0.1436598651111126,
"num_tokens": 3176696.0,
"step": 1750
},
{
"entropy": 6.673803234100342,
"epoch": 1.5079501504082509,
"grad_norm": 1.0546875,
"learning_rate": 0.0004944367246128954,
"loss": 6.4304,
"mean_token_accuracy": 0.13725945726037025,
"num_tokens": 3185857.0,
"step": 1755
},
{
"entropy": 6.5661591529846195,
"epoch": 1.5122475290073054,
"grad_norm": 1.0625,
"learning_rate": 0.0004943630071374036,
"loss": 6.2677,
"mean_token_accuracy": 0.14966750741004944,
"num_tokens": 3194687.0,
"step": 1760
},
{
"entropy": 6.554711723327637,
"epoch": 1.51654490760636,
"grad_norm": 1.0078125,
"learning_rate": 0.0004942888106607828,
"loss": 6.3291,
"mean_token_accuracy": 0.14281144142150878,
"num_tokens": 3204913.0,
"step": 1765
},
{
"entropy": 6.641019535064697,
"epoch": 1.5208422862054147,
"grad_norm": 1.0390625,
"learning_rate": 0.0004942141353450486,
"loss": 6.3145,
"mean_token_accuracy": 0.1485350415110588,
"num_tokens": 3213312.0,
"step": 1770
},
{
"entropy": 6.493930768966675,
"epoch": 1.5251396648044693,
"grad_norm": 0.96875,
"learning_rate": 0.0004941389813532619,
"loss": 6.2368,
"mean_token_accuracy": 0.15905009657144548,
"num_tokens": 3222992.0,
"step": 1775
},
{
"entropy": 6.511264657974243,
"epoch": 1.5294370434035238,
"grad_norm": 0.984375,
"learning_rate": 0.000494063348849529,
"loss": 6.2816,
"mean_token_accuracy": 0.14892083406448364,
"num_tokens": 3232836.0,
"step": 1780
},
{
"entropy": 6.616392660140991,
"epoch": 1.5337344220025786,
"grad_norm": 0.94140625,
"learning_rate": 0.0004939872379990011,
"loss": 6.4346,
"mean_token_accuracy": 0.1384902000427246,
"num_tokens": 3243171.0,
"step": 1785
},
{
"entropy": 6.671454858779907,
"epoch": 1.5380318006016331,
"grad_norm": 1.1796875,
"learning_rate": 0.0004939106489678739,
"loss": 6.3565,
"mean_token_accuracy": 0.14886578172445297,
"num_tokens": 3251995.0,
"step": 1790
},
{
"entropy": 6.483775520324707,
"epoch": 1.5423291792006877,
"grad_norm": 1.015625,
"learning_rate": 0.000493833581923387,
"loss": 6.2999,
"mean_token_accuracy": 0.147441129386425,
"num_tokens": 3260841.0,
"step": 1795
},
{
"entropy": 6.614831399917603,
"epoch": 1.5466265577997422,
"grad_norm": 1.0546875,
"learning_rate": 0.0004937560370338244,
"loss": 6.4359,
"mean_token_accuracy": 0.1328293912112713,
"num_tokens": 3270979.0,
"step": 1800
},
{
"entropy": 6.602978515625,
"epoch": 1.5509239363987968,
"grad_norm": 1.0859375,
"learning_rate": 0.000493678014468513,
"loss": 6.3703,
"mean_token_accuracy": 0.14689823091030121,
"num_tokens": 3279848.0,
"step": 1805
},
{
"entropy": 6.534598064422608,
"epoch": 1.5552213149978513,
"grad_norm": 0.94921875,
"learning_rate": 0.0004935995143978227,
"loss": 6.3674,
"mean_token_accuracy": 0.14537320658564568,
"num_tokens": 3289172.0,
"step": 1810
},
{
"entropy": 6.508708524703979,
"epoch": 1.5595186935969059,
"grad_norm": 1.1484375,
"learning_rate": 0.0004935205369931664,
"loss": 6.2677,
"mean_token_accuracy": 0.1513919234275818,
"num_tokens": 3297432.0,
"step": 1815
},
{
"entropy": 6.684668636322021,
"epoch": 1.5638160721959604,
"grad_norm": 0.92578125,
"learning_rate": 0.0004934410824269992,
"loss": 6.2954,
"mean_token_accuracy": 0.1454857923090458,
"num_tokens": 3307486.0,
"step": 1820
},
{
"entropy": 6.466551637649536,
"epoch": 1.568113450795015,
"grad_norm": 1.0234375,
"learning_rate": 0.0004933611508728182,
"loss": 6.2671,
"mean_token_accuracy": 0.14967258870601655,
"num_tokens": 3316296.0,
"step": 1825
},
{
"entropy": 6.563362693786621,
"epoch": 1.5724108293940695,
"grad_norm": 1.0078125,
"learning_rate": 0.000493280742505162,
"loss": 6.2972,
"mean_token_accuracy": 0.14479405283927918,
"num_tokens": 3326080.0,
"step": 1830
},
{
"entropy": 6.456173896789551,
"epoch": 1.576708207993124,
"grad_norm": 1.0546875,
"learning_rate": 0.0004931998574996102,
"loss": 6.217,
"mean_token_accuracy": 0.15072606950998307,
"num_tokens": 3334826.0,
"step": 1835
},
{
"entropy": 6.472858524322509,
"epoch": 1.5810055865921788,
"grad_norm": 1.0859375,
"learning_rate": 0.0004931184960327832,
"loss": 6.2177,
"mean_token_accuracy": 0.1524192661046982,
"num_tokens": 3343261.0,
"step": 1840
},
{
"entropy": 6.493236398696899,
"epoch": 1.5853029651912334,
"grad_norm": 1.640625,
"learning_rate": 0.0004930366582823421,
"loss": 6.2619,
"mean_token_accuracy": 0.14549409449100495,
"num_tokens": 3352513.0,
"step": 1845
},
{
"entropy": 6.541861534118652,
"epoch": 1.589600343790288,
"grad_norm": 1.1484375,
"learning_rate": 0.0004929543444269879,
"loss": 6.3147,
"mean_token_accuracy": 0.15202615782618523,
"num_tokens": 3361577.0,
"step": 1850
},
{
"entropy": 6.516072130203247,
"epoch": 1.5938977223893425,
"grad_norm": 1.1171875,
"learning_rate": 0.000492871554646461,
"loss": 6.3805,
"mean_token_accuracy": 0.1442191444337368,
"num_tokens": 3370591.0,
"step": 1855
},
{
"entropy": 6.489377784729004,
"epoch": 1.5981951009883972,
"grad_norm": 1.0703125,
"learning_rate": 0.0004927882891215413,
"loss": 6.2995,
"mean_token_accuracy": 0.1446702793240547,
"num_tokens": 3379761.0,
"step": 1860
},
{
"entropy": 6.6347997188568115,
"epoch": 1.6024924795874518,
"grad_norm": 1.203125,
"learning_rate": 0.0004927045480340475,
"loss": 6.3729,
"mean_token_accuracy": 0.13809221014380454,
"num_tokens": 3388974.0,
"step": 1865
},
{
"entropy": 6.515362644195557,
"epoch": 1.6067898581865063,
"grad_norm": 0.9765625,
"learning_rate": 0.0004926203315668363,
"loss": 6.2995,
"mean_token_accuracy": 0.14509507045149803,
"num_tokens": 3398339.0,
"step": 1870
},
{
"entropy": 6.501726579666138,
"epoch": 1.6110872367855609,
"grad_norm": 1.046875,
"learning_rate": 0.0004925356399038032,
"loss": 6.2645,
"mean_token_accuracy": 0.14561111479997635,
"num_tokens": 3408292.0,
"step": 1875
},
{
"entropy": 6.528331470489502,
"epoch": 1.6153846153846154,
"grad_norm": 1.1484375,
"learning_rate": 0.0004924504732298808,
"loss": 6.2363,
"mean_token_accuracy": 0.15578987523913385,
"num_tokens": 3417057.0,
"step": 1880
},
{
"entropy": 6.547144651412964,
"epoch": 1.61968199398367,
"grad_norm": 1.0703125,
"learning_rate": 0.0004923648317310391,
"loss": 6.3436,
"mean_token_accuracy": 0.1472199097275734,
"num_tokens": 3425830.0,
"step": 1885
},
{
"entropy": 6.503617954254151,
"epoch": 1.6239793725827245,
"grad_norm": 0.98046875,
"learning_rate": 0.0004922787155942849,
"loss": 6.3929,
"mean_token_accuracy": 0.13893435150384903,
"num_tokens": 3435513.0,
"step": 1890
},
{
"entropy": 6.572265768051148,
"epoch": 1.628276751181779,
"grad_norm": 1.03125,
"learning_rate": 0.0004921921250076611,
"loss": 6.2966,
"mean_token_accuracy": 0.14931443706154823,
"num_tokens": 3444684.0,
"step": 1895
},
{
"entropy": 6.4495138168334964,
"epoch": 1.6325741297808336,
"grad_norm": 1.1015625,
"learning_rate": 0.0004921050601602475,
"loss": 6.3435,
"mean_token_accuracy": 0.14741323441267012,
"num_tokens": 3453454.0,
"step": 1900
},
{
"entropy": 6.556122159957885,
"epoch": 1.6368715083798882,
"grad_norm": 1.0546875,
"learning_rate": 0.0004920175212421587,
"loss": 6.2787,
"mean_token_accuracy": 0.14662181138992308,
"num_tokens": 3463228.0,
"step": 1905
},
{
"entropy": 6.366853141784668,
"epoch": 1.6411688869789427,
"grad_norm": 1.03125,
"learning_rate": 0.0004919295084445445,
"loss": 6.166,
"mean_token_accuracy": 0.15177097618579866,
"num_tokens": 3472131.0,
"step": 1910
},
{
"entropy": 6.485814142227173,
"epoch": 1.6454662655779975,
"grad_norm": 0.98828125,
"learning_rate": 0.0004918410219595899,
"loss": 6.2547,
"mean_token_accuracy": 0.1523374244570732,
"num_tokens": 3480642.0,
"step": 1915
},
{
"entropy": 6.621995449066162,
"epoch": 1.649763644177052,
"grad_norm": 0.9765625,
"learning_rate": 0.000491752061980514,
"loss": 6.2277,
"mean_token_accuracy": 0.15280286371707916,
"num_tokens": 3489346.0,
"step": 1920
},
{
"entropy": 6.4284903049469,
"epoch": 1.6540610227761066,
"grad_norm": 1.1015625,
"learning_rate": 0.0004916626287015697,
"loss": 6.2756,
"mean_token_accuracy": 0.15068823397159575,
"num_tokens": 3498473.0,
"step": 1925
},
{
"entropy": 6.515523910522461,
"epoch": 1.658358401375161,
"grad_norm": 1.0,
"learning_rate": 0.0004915727223180436,
"loss": 6.2738,
"mean_token_accuracy": 0.142893535643816,
"num_tokens": 3507415.0,
"step": 1930
},
{
"entropy": 6.528269815444946,
"epoch": 1.6626557799742159,
"grad_norm": 0.984375,
"learning_rate": 0.0004914823430262554,
"loss": 6.3984,
"mean_token_accuracy": 0.1329946205019951,
"num_tokens": 3516873.0,
"step": 1935
},
{
"entropy": 6.484966564178467,
"epoch": 1.6669531585732704,
"grad_norm": 1.140625,
"learning_rate": 0.0004913914910235573,
"loss": 6.2479,
"mean_token_accuracy": 0.14868821799755097,
"num_tokens": 3525047.0,
"step": 1940
},
{
"entropy": 6.448112821578979,
"epoch": 1.671250537172325,
"grad_norm": 1.0859375,
"learning_rate": 0.0004913001665083337,
"loss": 6.2685,
"mean_token_accuracy": 0.14392302706837654,
"num_tokens": 3534354.0,
"step": 1945
},
{
"entropy": 6.528091144561768,
"epoch": 1.6755479157713795,
"grad_norm": 1.2265625,
"learning_rate": 0.0004912083696800008,
"loss": 6.2926,
"mean_token_accuracy": 0.14562170803546906,
"num_tokens": 3543830.0,
"step": 1950
},
{
"entropy": 6.4218017578125,
"epoch": 1.679845294370434,
"grad_norm": 1.09375,
"learning_rate": 0.0004911161007390063,
"loss": 6.1933,
"mean_token_accuracy": 0.14804754853248597,
"num_tokens": 3552314.0,
"step": 1955
},
{
"entropy": 6.470229148864746,
"epoch": 1.6841426729694886,
"grad_norm": 1.1875,
"learning_rate": 0.0004910233598868287,
"loss": 6.2765,
"mean_token_accuracy": 0.14477257579565048,
"num_tokens": 3561656.0,
"step": 1960
},
{
"entropy": 6.467269372940064,
"epoch": 1.6884400515685432,
"grad_norm": 1.0625,
"learning_rate": 0.0004909301473259769,
"loss": 6.2641,
"mean_token_accuracy": 0.14551830440759658,
"num_tokens": 3571784.0,
"step": 1965
},
{
"entropy": 6.518259859085083,
"epoch": 1.6927374301675977,
"grad_norm": 1.0625,
"learning_rate": 0.0004908364632599899,
"loss": 6.228,
"mean_token_accuracy": 0.15220747292041778,
"num_tokens": 3580626.0,
"step": 1970
},
{
"entropy": 6.378790664672851,
"epoch": 1.6970348087666522,
"grad_norm": 1.046875,
"learning_rate": 0.0004907423078934362,
"loss": 6.2467,
"mean_token_accuracy": 0.14601020216941835,
"num_tokens": 3589916.0,
"step": 1975
},
{
"entropy": 6.473833656311035,
"epoch": 1.7013321873657068,
"grad_norm": 1.0078125,
"learning_rate": 0.0004906476814319134,
"loss": 6.2572,
"mean_token_accuracy": 0.14930620267987252,
"num_tokens": 3599128.0,
"step": 1980
},
{
"entropy": 6.429199600219727,
"epoch": 1.7056295659647613,
"grad_norm": 0.9140625,
"learning_rate": 0.0004905525840820481,
"loss": 6.2686,
"mean_token_accuracy": 0.1471567466855049,
"num_tokens": 3608764.0,
"step": 1985
},
{
"entropy": 6.58309121131897,
"epoch": 1.709926944563816,
"grad_norm": 0.9453125,
"learning_rate": 0.0004904570160514948,
"loss": 6.3077,
"mean_token_accuracy": 0.14043890461325645,
"num_tokens": 3619082.0,
"step": 1990
},
{
"entropy": 6.45733323097229,
"epoch": 1.7142243231628707,
"grad_norm": 1.140625,
"learning_rate": 0.0004903609775489358,
"loss": 6.2682,
"mean_token_accuracy": 0.14586469754576684,
"num_tokens": 3628695.0,
"step": 1995
},
{
"entropy": 6.511290454864502,
"epoch": 1.7185217017619252,
"grad_norm": 1.015625,
"learning_rate": 0.0004902644687840809,
"loss": 6.267,
"mean_token_accuracy": 0.14717549681663514,
"num_tokens": 3637599.0,
"step": 2000
},
{
"epoch": 1.7185217017619252,
"eval_entropy": 6.214308420817058,
"eval_loss": 6.331518173217773,
"eval_mean_token_accuracy": 0.14971260959702032,
"eval_num_tokens": 3637599.0,
"eval_runtime": 2.0415,
"eval_samples_per_second": 1738.466,
"eval_steps_per_second": 217.492,
"step": 2000
},
{
"entropy": 6.427486324310303,
"epoch": 1.7228190803609797,
"grad_norm": 1.1484375,
"learning_rate": 0.0004901674899676667,
"loss": 6.2449,
"mean_token_accuracy": 0.14803531616926194,
"num_tokens": 3647406.0,
"step": 2005
},
{
"entropy": 6.416431045532226,
"epoch": 1.7271164589600345,
"grad_norm": 1.03125,
"learning_rate": 0.0004900700413114561,
"loss": 6.1252,
"mean_token_accuracy": 0.15068818926811217,
"num_tokens": 3656531.0,
"step": 2010
},
{
"entropy": 6.388833618164062,
"epoch": 1.731413837559089,
"grad_norm": 1.0078125,
"learning_rate": 0.000489972123028238,
"loss": 6.2244,
"mean_token_accuracy": 0.1465991474688053,
"num_tokens": 3664922.0,
"step": 2015
},
{
"entropy": 6.502804613113403,
"epoch": 1.7357112161581436,
"grad_norm": 1.0234375,
"learning_rate": 0.0004898737353318268,
"loss": 6.1557,
"mean_token_accuracy": 0.1519090563058853,
"num_tokens": 3673283.0,
"step": 2020
},
{
"entropy": 6.377015924453735,
"epoch": 1.7400085947571982,
"grad_norm": 1.125,
"learning_rate": 0.000489774878437062,
"loss": 6.298,
"mean_token_accuracy": 0.15162839442491532,
"num_tokens": 3681760.0,
"step": 2025
},
{
"entropy": 6.46599555015564,
"epoch": 1.7443059733562527,
"grad_norm": 1.078125,
"learning_rate": 0.0004896755525598074,
"loss": 6.1178,
"mean_token_accuracy": 0.15259039252996445,
"num_tokens": 3689408.0,
"step": 2030
},
{
"entropy": 6.4247987270355225,
"epoch": 1.7486033519553073,
"grad_norm": 1.109375,
"learning_rate": 0.0004895757579169511,
"loss": 6.234,
"mean_token_accuracy": 0.14994207322597503,
"num_tokens": 3697904.0,
"step": 2035
},
{
"entropy": 6.579666042327881,
"epoch": 1.7529007305543618,
"grad_norm": 1.0078125,
"learning_rate": 0.0004894754947264047,
"loss": 6.2504,
"mean_token_accuracy": 0.15150809586048125,
"num_tokens": 3706704.0,
"step": 2040
},
{
"entropy": 6.433872127532959,
"epoch": 1.7571981091534163,
"grad_norm": 1.109375,
"learning_rate": 0.000489374763207103,
"loss": 6.3286,
"mean_token_accuracy": 0.14471730291843415,
"num_tokens": 3715690.0,
"step": 2045
},
{
"entropy": 6.465651893615723,
"epoch": 1.761495487752471,
"grad_norm": 1.109375,
"learning_rate": 0.0004892735635790033,
"loss": 6.125,
"mean_token_accuracy": 0.15927532613277434,
"num_tokens": 3724835.0,
"step": 2050
},
{
"entropy": 6.368647861480713,
"epoch": 1.7657928663515254,
"grad_norm": 0.94140625,
"learning_rate": 0.000489171896063085,
"loss": 6.1498,
"mean_token_accuracy": 0.157290717959404,
"num_tokens": 3733977.0,
"step": 2055
},
{
"entropy": 6.458992671966553,
"epoch": 1.77009024495058,
"grad_norm": 1.078125,
"learning_rate": 0.0004890697608813495,
"loss": 6.2682,
"mean_token_accuracy": 0.14064312726259232,
"num_tokens": 3742665.0,
"step": 2060
},
{
"entropy": 6.583484077453614,
"epoch": 1.7743876235496348,
"grad_norm": 1.078125,
"learning_rate": 0.0004889671582568193,
"loss": 6.3367,
"mean_token_accuracy": 0.14621492847800255,
"num_tokens": 3751647.0,
"step": 2065
},
{
"entropy": 6.387417125701904,
"epoch": 1.7786850021486893,
"grad_norm": 1.140625,
"learning_rate": 0.0004888640884135374,
"loss": 6.2386,
"mean_token_accuracy": 0.1474798172712326,
"num_tokens": 3760852.0,
"step": 2070
},
{
"entropy": 6.3953369617462155,
"epoch": 1.7829823807477438,
"grad_norm": 1.25,
"learning_rate": 0.0004887605515765671,
"loss": 6.1913,
"mean_token_accuracy": 0.15439595878124238,
"num_tokens": 3768640.0,
"step": 2075
},
{
"entropy": 6.503360080718994,
"epoch": 1.7872797593467986,
"grad_norm": 1.0546875,
"learning_rate": 0.0004886565479719914,
"loss": 6.2177,
"mean_token_accuracy": 0.14689500331878663,
"num_tokens": 3776504.0,
"step": 2080
},
{
"entropy": 6.52859411239624,
"epoch": 1.7915771379458532,
"grad_norm": 1.1875,
"learning_rate": 0.0004885520778269128,
"loss": 6.2515,
"mean_token_accuracy": 0.1499434307217598,
"num_tokens": 3786353.0,
"step": 2085
},
{
"entropy": 6.410916137695312,
"epoch": 1.7958745165449077,
"grad_norm": 1.0859375,
"learning_rate": 0.0004884471413694523,
"loss": 6.2783,
"mean_token_accuracy": 0.15109124332666396,
"num_tokens": 3795902.0,
"step": 2090
},
{
"entropy": 6.470384979248047,
"epoch": 1.8001718951439623,
"grad_norm": 0.9140625,
"learning_rate": 0.0004883417388287491,
"loss": 6.194,
"mean_token_accuracy": 0.1435760647058487,
"num_tokens": 3805986.0,
"step": 2095
},
{
"entropy": 6.400091123580933,
"epoch": 1.8044692737430168,
"grad_norm": 1.140625,
"learning_rate": 0.0004882358704349603,
"loss": 6.3188,
"mean_token_accuracy": 0.1500417910516262,
"num_tokens": 3814915.0,
"step": 2100
},
{
"entropy": 6.456367015838623,
"epoch": 1.8087666523420713,
"grad_norm": 1.15625,
"learning_rate": 0.0004881295364192601,
"loss": 6.2089,
"mean_token_accuracy": 0.15894449651241302,
"num_tokens": 3823966.0,
"step": 2105
},
{
"entropy": 6.510165739059448,
"epoch": 1.813064030941126,
"grad_norm": 1.0078125,
"learning_rate": 0.0004880227370138394,
"loss": 6.2729,
"mean_token_accuracy": 0.142085450142622,
"num_tokens": 3832775.0,
"step": 2110
},
{
"entropy": 6.3983588218688965,
"epoch": 1.8173614095401804,
"grad_norm": 0.8984375,
"learning_rate": 0.0004879154724519057,
"loss": 6.1809,
"mean_token_accuracy": 0.15120477825403214,
"num_tokens": 3842808.0,
"step": 2115
},
{
"entropy": 6.493490934371948,
"epoch": 1.821658788139235,
"grad_norm": 1.046875,
"learning_rate": 0.0004878077429676816,
"loss": 6.3143,
"mean_token_accuracy": 0.14699392020702362,
"num_tokens": 3853303.0,
"step": 2120
},
{
"entropy": 6.4460196018219,
"epoch": 1.8259561667382895,
"grad_norm": 1.046875,
"learning_rate": 0.0004876995487964054,
"loss": 6.2277,
"mean_token_accuracy": 0.13867998719215394,
"num_tokens": 3862462.0,
"step": 2125
},
{
"entropy": 6.459061241149902,
"epoch": 1.830253545337344,
"grad_norm": 1.0234375,
"learning_rate": 0.00048759089017432996,
"loss": 6.3388,
"mean_token_accuracy": 0.14455281794071198,
"num_tokens": 3871596.0,
"step": 2130
},
{
"entropy": 6.482069444656372,
"epoch": 1.8345509239363988,
"grad_norm": 1.015625,
"learning_rate": 0.0004874817673387222,
"loss": 6.2427,
"mean_token_accuracy": 0.14856942594051362,
"num_tokens": 3881276.0,
"step": 2135
},
{
"entropy": 6.43566927909851,
"epoch": 1.8388483025354534,
"grad_norm": 0.96875,
"learning_rate": 0.00048737218052786275,
"loss": 6.33,
"mean_token_accuracy": 0.14330809488892554,
"num_tokens": 3891610.0,
"step": 2140
},
{
"entropy": 6.498207521438599,
"epoch": 1.843145681134508,
"grad_norm": 0.98046875,
"learning_rate": 0.00048726212998104554,
"loss": 6.2531,
"mean_token_accuracy": 0.14796748533844947,
"num_tokens": 3900584.0,
"step": 2145
},
{
"entropy": 6.405120611190796,
"epoch": 1.8474430597335625,
"grad_norm": 1.0390625,
"learning_rate": 0.0004871516159385768,
"loss": 6.1817,
"mean_token_accuracy": 0.1539264902472496,
"num_tokens": 3910208.0,
"step": 2150
},
{
"entropy": 6.320563936233521,
"epoch": 1.8517404383326173,
"grad_norm": 1.1015625,
"learning_rate": 0.0004870406386417752,
"loss": 6.1061,
"mean_token_accuracy": 0.15697987973690034,
"num_tokens": 3918424.0,
"step": 2155
},
{
"entropy": 6.313277053833008,
"epoch": 1.8560378169316718,
"grad_norm": 1.0859375,
"learning_rate": 0.0004869291983329707,
"loss": 6.047,
"mean_token_accuracy": 0.17023974657058716,
"num_tokens": 3926206.0,
"step": 2160
},
{
"entropy": 6.473067951202393,
"epoch": 1.8603351955307263,
"grad_norm": 1.046875,
"learning_rate": 0.0004868172952555044,
"loss": 6.1485,
"mean_token_accuracy": 0.14482472315430642,
"num_tokens": 3935769.0,
"step": 2165
},
{
"entropy": 6.363153123855591,
"epoch": 1.864632574129781,
"grad_norm": 0.9453125,
"learning_rate": 0.0004867049296537278,
"loss": 6.1373,
"mean_token_accuracy": 0.1534383252263069,
"num_tokens": 3945118.0,
"step": 2170
},
{
"entropy": 6.399164772033691,
"epoch": 1.8689299527288354,
"grad_norm": 1.2578125,
"learning_rate": 0.0004865921017730027,
"loss": 6.2358,
"mean_token_accuracy": 0.15296792089939118,
"num_tokens": 3954012.0,
"step": 2175
},
{
"entropy": 6.471106052398682,
"epoch": 1.87322733132789,
"grad_norm": 0.94140625,
"learning_rate": 0.00048647881185969995,
"loss": 6.2355,
"mean_token_accuracy": 0.15060990452766418,
"num_tokens": 3964239.0,
"step": 2180
},
{
"entropy": 6.386410093307495,
"epoch": 1.8775247099269445,
"grad_norm": 1.015625,
"learning_rate": 0.0004863650601611994,
"loss": 6.1502,
"mean_token_accuracy": 0.15660223215818406,
"num_tokens": 3973694.0,
"step": 2185
},
{
"entropy": 6.372910404205323,
"epoch": 1.881822088525999,
"grad_norm": 1.0703125,
"learning_rate": 0.00048625084692588937,
"loss": 6.185,
"mean_token_accuracy": 0.15601919442415238,
"num_tokens": 3982706.0,
"step": 2190
},
{
"entropy": 6.401282548904419,
"epoch": 1.8861194671250536,
"grad_norm": 1.09375,
"learning_rate": 0.00048613617240316593,
"loss": 6.138,
"mean_token_accuracy": 0.15665835291147232,
"num_tokens": 3990934.0,
"step": 2195
},
{
"entropy": 6.4126348972320555,
"epoch": 1.8904168457241082,
"grad_norm": 1.0390625,
"learning_rate": 0.0004860210368434323,
"loss": 6.192,
"mean_token_accuracy": 0.1556440055370331,
"num_tokens": 3999864.0,
"step": 2200
},
{
"entropy": 6.424229860305786,
"epoch": 1.8947142243231627,
"grad_norm": 0.9765625,
"learning_rate": 0.00048590544049809857,
"loss": 6.1968,
"mean_token_accuracy": 0.15178433507680894,
"num_tokens": 4008273.0,
"step": 2205
},
{
"entropy": 6.427778577804565,
"epoch": 1.8990116029222175,
"grad_norm": 0.99609375,
"learning_rate": 0.000485789383619581,
"loss": 6.2178,
"mean_token_accuracy": 0.1559001922607422,
"num_tokens": 4017697.0,
"step": 2210
},
{
"entropy": 6.4254296779632565,
"epoch": 1.903308981521272,
"grad_norm": 1.09375,
"learning_rate": 0.0004856728664613015,
"loss": 6.2293,
"mean_token_accuracy": 0.14589258283376694,
"num_tokens": 4026775.0,
"step": 2215
},
{
"entropy": 6.351989793777466,
"epoch": 1.9076063601203266,
"grad_norm": 1.03125,
"learning_rate": 0.00048555588927768674,
"loss": 6.1972,
"mean_token_accuracy": 0.15271373167634011,
"num_tokens": 4036476.0,
"step": 2220
},
{
"entropy": 6.473893165588379,
"epoch": 1.9119037387193811,
"grad_norm": 1.109375,
"learning_rate": 0.0004854384523241683,
"loss": 6.204,
"mean_token_accuracy": 0.15081721246242524,
"num_tokens": 4045221.0,
"step": 2225
},
{
"entropy": 6.310385704040527,
"epoch": 1.916201117318436,
"grad_norm": 1.0078125,
"learning_rate": 0.00048532055585718143,
"loss": 6.1112,
"mean_token_accuracy": 0.15869007259607315,
"num_tokens": 4053754.0,
"step": 2230
},
{
"entropy": 6.390126276016235,
"epoch": 1.9204984959174904,
"grad_norm": 1.1015625,
"learning_rate": 0.00048520220013416505,
"loss": 6.1455,
"mean_token_accuracy": 0.15594211518764495,
"num_tokens": 4061730.0,
"step": 2235
},
{
"entropy": 6.3809610366821286,
"epoch": 1.924795874516545,
"grad_norm": 1.0390625,
"learning_rate": 0.0004850833854135607,
"loss": 6.197,
"mean_token_accuracy": 0.15130506530404092,
"num_tokens": 4070501.0,
"step": 2240
},
{
"entropy": 6.420936059951782,
"epoch": 1.9290932531155995,
"grad_norm": 0.9296875,
"learning_rate": 0.0004849641119548122,
"loss": 6.2763,
"mean_token_accuracy": 0.1485205315053463,
"num_tokens": 4079621.0,
"step": 2245
},
{
"entropy": 6.4735170841217045,
"epoch": 1.933390631714654,
"grad_norm": 1.046875,
"learning_rate": 0.000484844380018365,
"loss": 6.2663,
"mean_token_accuracy": 0.14868344217538834,
"num_tokens": 4090106.0,
"step": 2250
},
{
"entropy": 6.461083984375,
"epoch": 1.9376880103137086,
"grad_norm": 1.0,
"learning_rate": 0.000484724189865666,
"loss": 6.1985,
"mean_token_accuracy": 0.1501224085688591,
"num_tokens": 4099269.0,
"step": 2255
},
{
"entropy": 6.287312364578247,
"epoch": 1.9419853889127632,
"grad_norm": 1.046875,
"learning_rate": 0.0004846035417591624,
"loss": 6.1351,
"mean_token_accuracy": 0.1544906511902809,
"num_tokens": 4108414.0,
"step": 2260
},
{
"entropy": 6.426730060577393,
"epoch": 1.9462827675118177,
"grad_norm": 1.1328125,
"learning_rate": 0.0004844824359623014,
"loss": 6.2629,
"mean_token_accuracy": 0.14584496468305588,
"num_tokens": 4117731.0,
"step": 2265
},
{
"entropy": 6.451971340179443,
"epoch": 1.9505801461108723,
"grad_norm": 1.0703125,
"learning_rate": 0.00048436087273952966,
"loss": 6.2441,
"mean_token_accuracy": 0.14279974550008773,
"num_tokens": 4127194.0,
"step": 2270
},
{
"entropy": 6.396147346496582,
"epoch": 1.9548775247099268,
"grad_norm": 1.09375,
"learning_rate": 0.00048423885235629265,
"loss": 6.193,
"mean_token_accuracy": 0.15773467123508453,
"num_tokens": 4135594.0,
"step": 2275
},
{
"entropy": 6.39124755859375,
"epoch": 1.9591749033089814,
"grad_norm": 1.0,
"learning_rate": 0.0004841163750790342,
"loss": 6.2256,
"mean_token_accuracy": 0.15189721137285234,
"num_tokens": 4145027.0,
"step": 2280
},
{
"entropy": 6.383194398880005,
"epoch": 1.9634722819080361,
"grad_norm": 0.99609375,
"learning_rate": 0.00048399344117519555,
"loss": 6.087,
"mean_token_accuracy": 0.15884610414505004,
"num_tokens": 4153754.0,
"step": 2285
},
{
"entropy": 6.330159759521484,
"epoch": 1.9677696605070907,
"grad_norm": 0.99609375,
"learning_rate": 0.00048387005091321544,
"loss": 6.1553,
"mean_token_accuracy": 0.15946451872587203,
"num_tokens": 4162765.0,
"step": 2290
},
{
"entropy": 6.414357376098633,
"epoch": 1.9720670391061452,
"grad_norm": 1.140625,
"learning_rate": 0.00048374620456252877,
"loss": 6.1748,
"mean_token_accuracy": 0.1570574849843979,
"num_tokens": 4171589.0,
"step": 2295
},
{
"entropy": 6.360631132125855,
"epoch": 1.9763644177052,
"grad_norm": 1.015625,
"learning_rate": 0.00048362190239356644,
"loss": 6.1913,
"mean_token_accuracy": 0.155552938580513,
"num_tokens": 4181817.0,
"step": 2300
},
{
"entropy": 6.352840518951416,
"epoch": 1.9806617963042545,
"grad_norm": 0.91796875,
"learning_rate": 0.00048349714467775474,
"loss": 6.1462,
"mean_token_accuracy": 0.1511269122362137,
"num_tokens": 4191350.0,
"step": 2305
},
{
"entropy": 6.3630085468292235,
"epoch": 1.984959174903309,
"grad_norm": 1.046875,
"learning_rate": 0.00048337193168751464,
"loss": 6.1935,
"mean_token_accuracy": 0.1461350604891777,
"num_tokens": 4199888.0,
"step": 2310
},
{
"entropy": 6.447411775588989,
"epoch": 1.9892565535023636,
"grad_norm": 1.1171875,
"learning_rate": 0.0004832462636962613,
"loss": 6.1829,
"mean_token_accuracy": 0.1507252760231495,
"num_tokens": 4209509.0,
"step": 2315
},
{
"entropy": 6.372689247131348,
"epoch": 1.9935539321014182,
"grad_norm": 1.09375,
"learning_rate": 0.0004831201409784034,
"loss": 6.1215,
"mean_token_accuracy": 0.15712654441595078,
"num_tokens": 4218496.0,
"step": 2320
},
{
"entropy": 6.357889032363891,
"epoch": 1.9978513107004727,
"grad_norm": 0.99609375,
"learning_rate": 0.0004829935638093424,
"loss": 6.1463,
"mean_token_accuracy": 0.15369027704000474,
"num_tokens": 4227504.0,
"step": 2325
},
{
"entropy": 6.373083750406901,
"epoch": 2.0017189514396216,
"grad_norm": 1.046875,
"learning_rate": 0.0004828665324654724,
"loss": 6.0581,
"mean_token_accuracy": 0.15794145895375145,
"num_tokens": 4235338.0,
"step": 2330
},
{
"entropy": 6.4267494678497314,
"epoch": 2.006016330038676,
"grad_norm": 0.9765625,
"learning_rate": 0.0004827390472241791,
"loss": 5.8418,
"mean_token_accuracy": 0.16316850185394288,
"num_tokens": 4244905.0,
"step": 2335
},
{
"entropy": 6.314910984039306,
"epoch": 2.010313708637731,
"grad_norm": 0.9375,
"learning_rate": 0.0004826111083638392,
"loss": 5.9211,
"mean_token_accuracy": 0.1677140362560749,
"num_tokens": 4254533.0,
"step": 2340
},
{
"entropy": 6.370204210281372,
"epoch": 2.0146110872367857,
"grad_norm": 0.98828125,
"learning_rate": 0.00048248271616382,
"loss": 5.8961,
"mean_token_accuracy": 0.16431671380996704,
"num_tokens": 4264023.0,
"step": 2345
},
{
"entropy": 6.326271295547485,
"epoch": 2.0189084658358403,
"grad_norm": 1.015625,
"learning_rate": 0.00048235387090447894,
"loss": 5.9306,
"mean_token_accuracy": 0.1572665750980377,
"num_tokens": 4273298.0,
"step": 2350
},
{
"entropy": 6.378605699539184,
"epoch": 2.023205844434895,
"grad_norm": 1.0390625,
"learning_rate": 0.00048222457286716235,
"loss": 5.8756,
"mean_token_accuracy": 0.16723261177539825,
"num_tokens": 4283244.0,
"step": 2355
},
{
"entropy": 6.322220325469971,
"epoch": 2.0275032230339494,
"grad_norm": 1.140625,
"learning_rate": 0.00048209482233420564,
"loss": 5.8185,
"mean_token_accuracy": 0.1769508183002472,
"num_tokens": 4291677.0,
"step": 2360
},
{
"entropy": 6.314945793151855,
"epoch": 2.031800601633004,
"grad_norm": 1.0546875,
"learning_rate": 0.000481964619588932,
"loss": 5.8793,
"mean_token_accuracy": 0.16825687736272812,
"num_tokens": 4300822.0,
"step": 2365
},
{
"entropy": 6.339528942108155,
"epoch": 2.0360979802320585,
"grad_norm": 1.0859375,
"learning_rate": 0.0004818339649156523,
"loss": 5.8876,
"mean_token_accuracy": 0.16732898950576783,
"num_tokens": 4310149.0,
"step": 2370
},
{
"entropy": 6.19782075881958,
"epoch": 2.040395358831113,
"grad_norm": 1.0078125,
"learning_rate": 0.00048170285859966395,
"loss": 5.7924,
"mean_token_accuracy": 0.17466236799955367,
"num_tokens": 4319109.0,
"step": 2375
},
{
"entropy": 6.3286045551300045,
"epoch": 2.0446927374301676,
"grad_norm": 0.984375,
"learning_rate": 0.00048157130092725087,
"loss": 5.7843,
"mean_token_accuracy": 0.1704682469367981,
"num_tokens": 4327921.0,
"step": 2380
},
{
"entropy": 6.329291915893554,
"epoch": 2.048990116029222,
"grad_norm": 1.0234375,
"learning_rate": 0.0004814392921856824,
"loss": 5.9287,
"mean_token_accuracy": 0.16586144566535949,
"num_tokens": 4338026.0,
"step": 2385
},
{
"entropy": 6.2563072681427006,
"epoch": 2.0532874946282766,
"grad_norm": 0.95703125,
"learning_rate": 0.0004813068326632128,
"loss": 5.7762,
"mean_token_accuracy": 0.17654864937067033,
"num_tokens": 4347794.0,
"step": 2390
},
{
"entropy": 6.329816913604736,
"epoch": 2.057584873227331,
"grad_norm": 1.078125,
"learning_rate": 0.0004811739226490809,
"loss": 5.9557,
"mean_token_accuracy": 0.16758598685264586,
"num_tokens": 4357249.0,
"step": 2395
},
{
"entropy": 6.283816623687744,
"epoch": 2.0618822518263857,
"grad_norm": 1.0625,
"learning_rate": 0.00048104056243350896,
"loss": 5.9041,
"mean_token_accuracy": 0.16363563090562822,
"num_tokens": 4366053.0,
"step": 2400
},
{
"entropy": 6.297672891616822,
"epoch": 2.0661796304254403,
"grad_norm": 0.98046875,
"learning_rate": 0.0004809067523077023,
"loss": 5.9163,
"mean_token_accuracy": 0.16945113092660904,
"num_tokens": 4375543.0,
"step": 2405
},
{
"entropy": 6.2845330238342285,
"epoch": 2.0704770090244953,
"grad_norm": 1.0625,
"learning_rate": 0.00048077249256384884,
"loss": 5.8006,
"mean_token_accuracy": 0.17305675595998765,
"num_tokens": 4384332.0,
"step": 2410
},
{
"entropy": 6.210544061660767,
"epoch": 2.07477438762355,
"grad_norm": 1.1953125,
"learning_rate": 0.0004806377834951182,
"loss": 5.8994,
"mean_token_accuracy": 0.16216432005167009,
"num_tokens": 4393670.0,
"step": 2415
},
{
"entropy": 6.373771142959595,
"epoch": 2.0790717662226044,
"grad_norm": 1.1328125,
"learning_rate": 0.00048050262539566104,
"loss": 5.9012,
"mean_token_accuracy": 0.16862600147724152,
"num_tokens": 4402763.0,
"step": 2420
},
{
"entropy": 6.269940948486328,
"epoch": 2.083369144821659,
"grad_norm": 0.984375,
"learning_rate": 0.0004803670185606087,
"loss": 5.8086,
"mean_token_accuracy": 0.17335692346096038,
"num_tokens": 4411863.0,
"step": 2425
},
{
"entropy": 6.265923166275025,
"epoch": 2.0876665234207135,
"grad_norm": 1.078125,
"learning_rate": 0.0004802309632860724,
"loss": 5.9059,
"mean_token_accuracy": 0.16651569604873656,
"num_tokens": 4421110.0,
"step": 2430
},
{
"entropy": 6.352302503585816,
"epoch": 2.091963902019768,
"grad_norm": 1.0390625,
"learning_rate": 0.00048009445986914236,
"loss": 5.8854,
"mean_token_accuracy": 0.16589637845754623,
"num_tokens": 4430249.0,
"step": 2435
},
{
"entropy": 6.263960170745849,
"epoch": 2.0962612806188226,
"grad_norm": 1.0078125,
"learning_rate": 0.00047995750860788756,
"loss": 5.8661,
"mean_token_accuracy": 0.15910358875989913,
"num_tokens": 4439686.0,
"step": 2440
},
{
"entropy": 6.227327108383179,
"epoch": 2.100558659217877,
"grad_norm": 1.1796875,
"learning_rate": 0.0004798201098013547,
"loss": 5.8709,
"mean_token_accuracy": 0.1692453533411026,
"num_tokens": 4448645.0,
"step": 2445
},
{
"entropy": 6.291311168670655,
"epoch": 2.1048560378169316,
"grad_norm": 0.96484375,
"learning_rate": 0.00047968226374956797,
"loss": 5.8333,
"mean_token_accuracy": 0.1675017699599266,
"num_tokens": 4456870.0,
"step": 2450
},
{
"entropy": 6.195930767059326,
"epoch": 2.109153416415986,
"grad_norm": 1.03125,
"learning_rate": 0.00047954397075352794,
"loss": 5.8684,
"mean_token_accuracy": 0.17277338951826096,
"num_tokens": 4466287.0,
"step": 2455
},
{
"entropy": 6.2388382911682125,
"epoch": 2.1134507950150407,
"grad_norm": 1.0703125,
"learning_rate": 0.00047940523111521136,
"loss": 5.7553,
"mean_token_accuracy": 0.17395039051771163,
"num_tokens": 4474461.0,
"step": 2460
},
{
"entropy": 6.255577421188354,
"epoch": 2.1177481736140953,
"grad_norm": 1.1875,
"learning_rate": 0.0004792660451375701,
"loss": 5.835,
"mean_token_accuracy": 0.16953630596399308,
"num_tokens": 4483002.0,
"step": 2465
},
{
"entropy": 6.224816513061524,
"epoch": 2.12204555221315,
"grad_norm": 1.0859375,
"learning_rate": 0.00047912641312453064,
"loss": 5.8459,
"mean_token_accuracy": 0.1695180580019951,
"num_tokens": 4492061.0,
"step": 2470
},
{
"entropy": 6.284405374526978,
"epoch": 2.1263429308122044,
"grad_norm": 0.9375,
"learning_rate": 0.00047898633538099363,
"loss": 5.8957,
"mean_token_accuracy": 0.16090027987957,
"num_tokens": 4501829.0,
"step": 2475
},
{
"entropy": 6.258666229248047,
"epoch": 2.130640309411259,
"grad_norm": 0.98828125,
"learning_rate": 0.0004788458122128327,
"loss": 5.9181,
"mean_token_accuracy": 0.1656097248196602,
"num_tokens": 4511539.0,
"step": 2480
},
{
"entropy": 6.246809720993042,
"epoch": 2.134937688010314,
"grad_norm": 1.0625,
"learning_rate": 0.00047870484392689434,
"loss": 5.7722,
"mean_token_accuracy": 0.1671189084649086,
"num_tokens": 4520425.0,
"step": 2485
},
{
"entropy": 6.220279026031494,
"epoch": 2.1392350666093685,
"grad_norm": 1.0859375,
"learning_rate": 0.000478563430830997,
"loss": 5.8751,
"mean_token_accuracy": 0.16446918100118638,
"num_tokens": 4529474.0,
"step": 2490
},
{
"entropy": 6.2571605205535885,
"epoch": 2.143532445208423,
"grad_norm": 1.0546875,
"learning_rate": 0.00047842157323393035,
"loss": 5.8041,
"mean_token_accuracy": 0.1694269135594368,
"num_tokens": 4538082.0,
"step": 2495
},
{
"entropy": 6.218803596496582,
"epoch": 2.1478298238074776,
"grad_norm": 1.015625,
"learning_rate": 0.0004782792714454547,
"loss": 5.9987,
"mean_token_accuracy": 0.16337930560112,
"num_tokens": 4547340.0,
"step": 2500
},
{
"epoch": 2.1478298238074776,
"eval_entropy": 6.073525357890773,
"eval_loss": 6.213027477264404,
"eval_mean_token_accuracy": 0.15643914548999016,
"eval_num_tokens": 4547340.0,
"eval_runtime": 2.0452,
"eval_samples_per_second": 1735.325,
"eval_steps_per_second": 217.099,
"step": 2500
},
{
"entropy": 6.266714763641358,
"epoch": 2.152127202406532,
"grad_norm": 1.1171875,
"learning_rate": 0.0004781365257763002,
"loss": 5.8423,
"mean_token_accuracy": 0.16869749277830123,
"num_tokens": 4556415.0,
"step": 2505
},
{
"entropy": 6.1728370666503904,
"epoch": 2.1564245810055866,
"grad_norm": 1.28125,
"learning_rate": 0.00047799333653816633,
"loss": 5.7293,
"mean_token_accuracy": 0.17461720257997512,
"num_tokens": 4565156.0,
"step": 2510
},
{
"entropy": 6.233670806884765,
"epoch": 2.160721959604641,
"grad_norm": 1.0703125,
"learning_rate": 0.00047784970404372124,
"loss": 5.8327,
"mean_token_accuracy": 0.16848449259996415,
"num_tokens": 4574678.0,
"step": 2515
},
{
"entropy": 6.12764801979065,
"epoch": 2.1650193382036957,
"grad_norm": 1.1171875,
"learning_rate": 0.00047770562860660083,
"loss": 5.854,
"mean_token_accuracy": 0.16377500146627427,
"num_tokens": 4583253.0,
"step": 2520
},
{
"entropy": 6.273917770385742,
"epoch": 2.1693167168027503,
"grad_norm": 0.91796875,
"learning_rate": 0.0004775611105414083,
"loss": 5.9138,
"mean_token_accuracy": 0.16056130826473236,
"num_tokens": 4594042.0,
"step": 2525
},
{
"entropy": 6.210309171676636,
"epoch": 2.173614095401805,
"grad_norm": 0.98828125,
"learning_rate": 0.0004774161501637133,
"loss": 5.8661,
"mean_token_accuracy": 0.16690902709960936,
"num_tokens": 4603128.0,
"step": 2530
},
{
"entropy": 6.207437753677368,
"epoch": 2.1779114740008594,
"grad_norm": 1.234375,
"learning_rate": 0.0004772707477900514,
"loss": 5.8489,
"mean_token_accuracy": 0.17330004572868346,
"num_tokens": 4611537.0,
"step": 2535
},
{
"entropy": 6.316633796691894,
"epoch": 2.182208852599914,
"grad_norm": 1.09375,
"learning_rate": 0.0004771249037379232,
"loss": 5.9518,
"mean_token_accuracy": 0.1604529470205307,
"num_tokens": 4622481.0,
"step": 2540
},
{
"entropy": 6.174561834335327,
"epoch": 2.1865062311989685,
"grad_norm": 1.0625,
"learning_rate": 0.0004769786183257939,
"loss": 5.8564,
"mean_token_accuracy": 0.17447448074817656,
"num_tokens": 4631259.0,
"step": 2545
},
{
"entropy": 6.186811542510986,
"epoch": 2.190803609798023,
"grad_norm": 1.0859375,
"learning_rate": 0.0004768318918730924,
"loss": 5.7986,
"mean_token_accuracy": 0.1752243533730507,
"num_tokens": 4640266.0,
"step": 2550
},
{
"entropy": 6.212873888015747,
"epoch": 2.195100988397078,
"grad_norm": 1.046875,
"learning_rate": 0.00047668472470021044,
"loss": 5.853,
"mean_token_accuracy": 0.16329605877399445,
"num_tokens": 4649520.0,
"step": 2555
},
{
"entropy": 6.257145929336548,
"epoch": 2.1993983669961326,
"grad_norm": 1.03125,
"learning_rate": 0.0004765371171285025,
"loss": 5.8079,
"mean_token_accuracy": 0.1733356922864914,
"num_tokens": 4658501.0,
"step": 2560
},
{
"entropy": 6.108858823776245,
"epoch": 2.203695745595187,
"grad_norm": 1.0546875,
"learning_rate": 0.00047638906948028445,
"loss": 5.8536,
"mean_token_accuracy": 0.16747843474149704,
"num_tokens": 4667567.0,
"step": 2565
},
{
"entropy": 6.222007703781128,
"epoch": 2.2079931241942417,
"grad_norm": 1.1640625,
"learning_rate": 0.00047624058207883317,
"loss": 5.8596,
"mean_token_accuracy": 0.16799781173467637,
"num_tokens": 4676618.0,
"step": 2570
},
{
"entropy": 6.326595973968506,
"epoch": 2.212290502793296,
"grad_norm": 1.0,
"learning_rate": 0.00047609165524838576,
"loss": 5.921,
"mean_token_accuracy": 0.16489885598421097,
"num_tokens": 4685967.0,
"step": 2575
},
{
"entropy": 6.112624216079712,
"epoch": 2.2165878813923507,
"grad_norm": 1.2421875,
"learning_rate": 0.0004759422893141389,
"loss": 5.8098,
"mean_token_accuracy": 0.17214897125959397,
"num_tokens": 4694568.0,
"step": 2580
},
{
"entropy": 6.23127293586731,
"epoch": 2.2208852599914053,
"grad_norm": 1.0859375,
"learning_rate": 0.0004757924846022482,
"loss": 5.8764,
"mean_token_accuracy": 0.1683722823858261,
"num_tokens": 4703648.0,
"step": 2585
},
{
"entropy": 6.2149560928344725,
"epoch": 2.22518263859046,
"grad_norm": 1.171875,
"learning_rate": 0.00047564224143982714,
"loss": 5.7317,
"mean_token_accuracy": 0.18064576983451844,
"num_tokens": 4712444.0,
"step": 2590
},
{
"entropy": 6.195422506332397,
"epoch": 2.2294800171895144,
"grad_norm": 1.1796875,
"learning_rate": 0.00047549156015494676,
"loss": 5.887,
"mean_token_accuracy": 0.16564202010631562,
"num_tokens": 4722034.0,
"step": 2595
},
{
"entropy": 6.179683208465576,
"epoch": 2.233777395788569,
"grad_norm": 1.046875,
"learning_rate": 0.00047534044107663484,
"loss": 5.9075,
"mean_token_accuracy": 0.16279049664735795,
"num_tokens": 4731344.0,
"step": 2600
},
{
"entropy": 6.295088148117065,
"epoch": 2.2380747743876235,
"grad_norm": 1.15625,
"learning_rate": 0.00047518888453487496,
"loss": 5.809,
"mean_token_accuracy": 0.17704246044158936,
"num_tokens": 4739302.0,
"step": 2605
},
{
"entropy": 6.1531964302062985,
"epoch": 2.242372152986678,
"grad_norm": 0.98046875,
"learning_rate": 0.0004750368908606061,
"loss": 5.9282,
"mean_token_accuracy": 0.16434444785118102,
"num_tokens": 4748848.0,
"step": 2610
},
{
"entropy": 6.262106943130493,
"epoch": 2.2466695315857326,
"grad_norm": 0.99609375,
"learning_rate": 0.00047488446038572164,
"loss": 5.9816,
"mean_token_accuracy": 0.16012711673974991,
"num_tokens": 4758194.0,
"step": 2615
},
{
"entropy": 6.268323373794556,
"epoch": 2.250966910184787,
"grad_norm": 1.1171875,
"learning_rate": 0.0004747315934430688,
"loss": 5.8908,
"mean_token_accuracy": 0.164437834918499,
"num_tokens": 4768081.0,
"step": 2620
},
{
"entropy": 6.122048091888428,
"epoch": 2.2552642887838417,
"grad_norm": 1.1328125,
"learning_rate": 0.000474578290366448,
"loss": 5.8245,
"mean_token_accuracy": 0.1705750197172165,
"num_tokens": 4776471.0,
"step": 2625
},
{
"entropy": 6.204921579360962,
"epoch": 2.259561667382896,
"grad_norm": 1.09375,
"learning_rate": 0.0004744245514906117,
"loss": 5.8253,
"mean_token_accuracy": 0.1741186946630478,
"num_tokens": 4784403.0,
"step": 2630
},
{
"entropy": 6.1283422946929935,
"epoch": 2.263859045981951,
"grad_norm": 1.1171875,
"learning_rate": 0.00047427037715126426,
"loss": 5.8029,
"mean_token_accuracy": 0.16940733194351196,
"num_tokens": 4792779.0,
"step": 2635
},
{
"entropy": 6.132787275314331,
"epoch": 2.2681564245810057,
"grad_norm": 0.9921875,
"learning_rate": 0.0004741157676850608,
"loss": 5.7827,
"mean_token_accuracy": 0.1744200199842453,
"num_tokens": 4801426.0,
"step": 2640
},
{
"entropy": 6.2156031131744385,
"epoch": 2.2724538031800603,
"grad_norm": 1.2578125,
"learning_rate": 0.00047396072342960663,
"loss": 5.8338,
"mean_token_accuracy": 0.16472329795360566,
"num_tokens": 4810329.0,
"step": 2645
},
{
"entropy": 6.1918652057647705,
"epoch": 2.276751181779115,
"grad_norm": 1.0234375,
"learning_rate": 0.00047380524472345645,
"loss": 5.8802,
"mean_token_accuracy": 0.16467834115028382,
"num_tokens": 4819544.0,
"step": 2650
},
{
"entropy": 6.203462934494018,
"epoch": 2.2810485603781694,
"grad_norm": 1.078125,
"learning_rate": 0.0004736493319061134,
"loss": 5.8876,
"mean_token_accuracy": 0.16658470630645753,
"num_tokens": 4828113.0,
"step": 2655
},
{
"entropy": 6.154991245269775,
"epoch": 2.285345938977224,
"grad_norm": 0.98046875,
"learning_rate": 0.0004734929853180291,
"loss": 5.8764,
"mean_token_accuracy": 0.16575339883565904,
"num_tokens": 4836989.0,
"step": 2660
},
{
"entropy": 6.258448839187622,
"epoch": 2.2896433175762785,
"grad_norm": 0.921875,
"learning_rate": 0.00047333620530060175,
"loss": 5.9117,
"mean_token_accuracy": 0.16528864502906798,
"num_tokens": 4847103.0,
"step": 2665
},
{
"entropy": 6.181549310684204,
"epoch": 2.293940696175333,
"grad_norm": 1.1328125,
"learning_rate": 0.0004731789921961764,
"loss": 5.9289,
"mean_token_accuracy": 0.16640040427446365,
"num_tokens": 4856238.0,
"step": 2670
},
{
"entropy": 6.227826976776123,
"epoch": 2.2982380747743876,
"grad_norm": 1.09375,
"learning_rate": 0.0004730213463480434,
"loss": 5.8189,
"mean_token_accuracy": 0.17475187480449678,
"num_tokens": 4864608.0,
"step": 2675
},
{
"entropy": 6.163301944732666,
"epoch": 2.302535453373442,
"grad_norm": 1.0390625,
"learning_rate": 0.00047286326810043857,
"loss": 5.7783,
"mean_token_accuracy": 0.17075299024581908,
"num_tokens": 4873889.0,
"step": 2680
},
{
"entropy": 6.134186220169068,
"epoch": 2.3068328319724967,
"grad_norm": 1.109375,
"learning_rate": 0.00047270475779854137,
"loss": 5.8223,
"mean_token_accuracy": 0.1724078834056854,
"num_tokens": 4882902.0,
"step": 2685
},
{
"entropy": 6.292477703094482,
"epoch": 2.311130210571551,
"grad_norm": 1.09375,
"learning_rate": 0.00047254581578847507,
"loss": 5.8426,
"mean_token_accuracy": 0.16808903068304062,
"num_tokens": 4892390.0,
"step": 2690
},
{
"entropy": 6.170593881607056,
"epoch": 2.3154275891706058,
"grad_norm": 1.1015625,
"learning_rate": 0.0004723864424173055,
"loss": 5.9683,
"mean_token_accuracy": 0.1666146218776703,
"num_tokens": 4901625.0,
"step": 2695
},
{
"entropy": 6.194738912582397,
"epoch": 2.3197249677696608,
"grad_norm": 1.0703125,
"learning_rate": 0.0004722266380330403,
"loss": 5.7718,
"mean_token_accuracy": 0.17559022307395936,
"num_tokens": 4910804.0,
"step": 2700
},
{
"entropy": 6.180141830444336,
"epoch": 2.3240223463687153,
"grad_norm": 1.0625,
"learning_rate": 0.00047206640298462857,
"loss": 5.8472,
"mean_token_accuracy": 0.16781375855207442,
"num_tokens": 4920441.0,
"step": 2705
},
{
"entropy": 6.170105838775635,
"epoch": 2.32831972496777,
"grad_norm": 1.109375,
"learning_rate": 0.00047190573762195945,
"loss": 5.8928,
"mean_token_accuracy": 0.1647154539823532,
"num_tokens": 4930204.0,
"step": 2710
},
{
"entropy": 6.171744394302368,
"epoch": 2.3326171035668244,
"grad_norm": 0.89453125,
"learning_rate": 0.00047174464229586186,
"loss": 5.9868,
"mean_token_accuracy": 0.15878558307886123,
"num_tokens": 4941191.0,
"step": 2715
},
{
"entropy": 6.294037532806397,
"epoch": 2.336914482165879,
"grad_norm": 1.234375,
"learning_rate": 0.0004715831173581036,
"loss": 5.9658,
"mean_token_accuracy": 0.16081493049860002,
"num_tokens": 4951825.0,
"step": 2720
},
{
"entropy": 6.163305330276489,
"epoch": 2.3412118607649335,
"grad_norm": 0.97265625,
"learning_rate": 0.00047142116316139073,
"loss": 5.9007,
"mean_token_accuracy": 0.1701881170272827,
"num_tokens": 4960632.0,
"step": 2725
},
{
"entropy": 6.263418245315552,
"epoch": 2.345509239363988,
"grad_norm": 0.97265625,
"learning_rate": 0.0004712587800593663,
"loss": 5.9268,
"mean_token_accuracy": 0.1628424420952797,
"num_tokens": 4969455.0,
"step": 2730
},
{
"entropy": 6.159938859939575,
"epoch": 2.3498066179630426,
"grad_norm": 1.234375,
"learning_rate": 0.0004710959684066102,
"loss": 5.822,
"mean_token_accuracy": 0.1740834206342697,
"num_tokens": 4978997.0,
"step": 2735
},
{
"entropy": 6.198467969894409,
"epoch": 2.354103996562097,
"grad_norm": 1.0234375,
"learning_rate": 0.00047093272855863803,
"loss": 5.89,
"mean_token_accuracy": 0.16633735448122025,
"num_tokens": 4988305.0,
"step": 2740
},
{
"entropy": 6.171191024780273,
"epoch": 2.3584013751611517,
"grad_norm": 1.03125,
"learning_rate": 0.0004707690608719003,
"loss": 5.8201,
"mean_token_accuracy": 0.17565433084964752,
"num_tokens": 4997022.0,
"step": 2745
},
{
"entropy": 6.182925462722778,
"epoch": 2.362698753760206,
"grad_norm": 1.140625,
"learning_rate": 0.0004706049657037818,
"loss": 5.879,
"mean_token_accuracy": 0.16346064060926438,
"num_tokens": 5005664.0,
"step": 2750
},
{
"entropy": 6.149474191665649,
"epoch": 2.3669961323592608,
"grad_norm": 1.0078125,
"learning_rate": 0.0004704404434126009,
"loss": 5.8502,
"mean_token_accuracy": 0.16408389210700988,
"num_tokens": 5014769.0,
"step": 2755
},
{
"entropy": 6.255496549606323,
"epoch": 2.3712935109583153,
"grad_norm": 1.0078125,
"learning_rate": 0.00047027549435760843,
"loss": 5.9078,
"mean_token_accuracy": 0.16789433360099792,
"num_tokens": 5024060.0,
"step": 2760
},
{
"entropy": 6.256794357299805,
"epoch": 2.37559088955737,
"grad_norm": 1.109375,
"learning_rate": 0.0004701101188989872,
"loss": 5.9544,
"mean_token_accuracy": 0.1624842867255211,
"num_tokens": 5033046.0,
"step": 2765
},
{
"entropy": 6.156686782836914,
"epoch": 2.3798882681564244,
"grad_norm": 1.1953125,
"learning_rate": 0.00046994431739785114,
"loss": 5.7991,
"mean_token_accuracy": 0.18271932750940323,
"num_tokens": 5040894.0,
"step": 2770
},
{
"entropy": 6.20210337638855,
"epoch": 2.384185646755479,
"grad_norm": 1.015625,
"learning_rate": 0.00046977809021624454,
"loss": 5.9534,
"mean_token_accuracy": 0.17005517482757568,
"num_tokens": 5050961.0,
"step": 2775
},
{
"entropy": 6.216541862487793,
"epoch": 2.3884830253545335,
"grad_norm": 1.078125,
"learning_rate": 0.0004696114377171409,
"loss": 5.8757,
"mean_token_accuracy": 0.1636977568268776,
"num_tokens": 5060226.0,
"step": 2780
},
{
"entropy": 6.160855150222778,
"epoch": 2.3927804039535885,
"grad_norm": 1.09375,
"learning_rate": 0.0004694443602644429,
"loss": 5.8457,
"mean_token_accuracy": 0.16862347573041916,
"num_tokens": 5069225.0,
"step": 2785
},
{
"entropy": 6.22788553237915,
"epoch": 2.397077782552643,
"grad_norm": 1.0625,
"learning_rate": 0.0004692768582229808,
"loss": 5.8344,
"mean_token_accuracy": 0.17104473561048508,
"num_tokens": 5078386.0,
"step": 2790
},
{
"entropy": 6.091501474380493,
"epoch": 2.4013751611516976,
"grad_norm": 0.96484375,
"learning_rate": 0.00046910893195851213,
"loss": 5.765,
"mean_token_accuracy": 0.16869171112775802,
"num_tokens": 5087161.0,
"step": 2795
},
{
"entropy": 6.183551597595215,
"epoch": 2.405672539750752,
"grad_norm": 1.0234375,
"learning_rate": 0.00046894058183772074,
"loss": 5.9281,
"mean_token_accuracy": 0.16594007909297942,
"num_tokens": 5096613.0,
"step": 2800
},
{
"entropy": 6.197868537902832,
"epoch": 2.4099699183498067,
"grad_norm": 1.1171875,
"learning_rate": 0.000468771808228216,
"loss": 5.8912,
"mean_token_accuracy": 0.16417519897222518,
"num_tokens": 5106534.0,
"step": 2805
},
{
"entropy": 6.143604946136475,
"epoch": 2.414267296948861,
"grad_norm": 1.078125,
"learning_rate": 0.00046860261149853197,
"loss": 5.9134,
"mean_token_accuracy": 0.1646139517426491,
"num_tokens": 5115975.0,
"step": 2810
},
{
"entropy": 6.127184104919434,
"epoch": 2.4185646755479158,
"grad_norm": 1.125,
"learning_rate": 0.0004684329920181268,
"loss": 5.8045,
"mean_token_accuracy": 0.16945046484470366,
"num_tokens": 5124635.0,
"step": 2815
},
{
"entropy": 6.151847076416016,
"epoch": 2.4228620541469703,
"grad_norm": 1.1640625,
"learning_rate": 0.00046826295015738154,
"loss": 5.7738,
"mean_token_accuracy": 0.1773565873503685,
"num_tokens": 5133226.0,
"step": 2820
},
{
"entropy": 6.0929807186126705,
"epoch": 2.427159432746025,
"grad_norm": 1.0078125,
"learning_rate": 0.0004680924862875996,
"loss": 5.8663,
"mean_token_accuracy": 0.17087701261043547,
"num_tokens": 5142257.0,
"step": 2825
},
{
"entropy": 6.199731492996216,
"epoch": 2.4314568113450794,
"grad_norm": 0.984375,
"learning_rate": 0.00046792160078100605,
"loss": 5.8592,
"mean_token_accuracy": 0.17053601890802383,
"num_tokens": 5150752.0,
"step": 2830
},
{
"entropy": 6.151450777053833,
"epoch": 2.435754189944134,
"grad_norm": 1.0078125,
"learning_rate": 0.00046775029401074653,
"loss": 5.7783,
"mean_token_accuracy": 0.17438559532165526,
"num_tokens": 5160237.0,
"step": 2835
},
{
"entropy": 6.171485233306885,
"epoch": 2.4400515685431885,
"grad_norm": 1.109375,
"learning_rate": 0.00046757856635088645,
"loss": 5.85,
"mean_token_accuracy": 0.17521743029356002,
"num_tokens": 5169752.0,
"step": 2840
},
{
"entropy": 6.1737254619598385,
"epoch": 2.444348947142243,
"grad_norm": 1.0078125,
"learning_rate": 0.0004674064181764105,
"loss": 5.8887,
"mean_token_accuracy": 0.17213839143514634,
"num_tokens": 5178892.0,
"step": 2845
},
{
"entropy": 6.169126319885254,
"epoch": 2.448646325741298,
"grad_norm": 0.9609375,
"learning_rate": 0.00046723384986322147,
"loss": 5.8736,
"mean_token_accuracy": 0.16697555780410767,
"num_tokens": 5188468.0,
"step": 2850
},
{
"entropy": 6.121142101287842,
"epoch": 2.4529437043403526,
"grad_norm": 1.0078125,
"learning_rate": 0.0004670608617881395,
"loss": 5.7947,
"mean_token_accuracy": 0.1755498692393303,
"num_tokens": 5197565.0,
"step": 2855
},
{
"entropy": 6.083435106277466,
"epoch": 2.457241082939407,
"grad_norm": 1.09375,
"learning_rate": 0.0004668874543289014,
"loss": 5.7851,
"mean_token_accuracy": 0.1805465489625931,
"num_tokens": 5205791.0,
"step": 2860
},
{
"entropy": 6.136435890197754,
"epoch": 2.4615384615384617,
"grad_norm": 1.046875,
"learning_rate": 0.00046671362786415986,
"loss": 5.7872,
"mean_token_accuracy": 0.18155153840780258,
"num_tokens": 5214773.0,
"step": 2865
},
{
"entropy": 6.082297658920288,
"epoch": 2.465835840137516,
"grad_norm": 0.9921875,
"learning_rate": 0.00046653938277348237,
"loss": 5.8211,
"mean_token_accuracy": 0.1757299304008484,
"num_tokens": 5223734.0,
"step": 2870
},
{
"entropy": 6.256624984741211,
"epoch": 2.4701332187365708,
"grad_norm": 1.1796875,
"learning_rate": 0.0004663647194373505,
"loss": 5.9026,
"mean_token_accuracy": 0.16392517536878587,
"num_tokens": 5231742.0,
"step": 2875
},
{
"entropy": 6.135076570510864,
"epoch": 2.4744305973356253,
"grad_norm": 1.0078125,
"learning_rate": 0.00046618963823715913,
"loss": 5.8631,
"mean_token_accuracy": 0.17133675366640091,
"num_tokens": 5241673.0,
"step": 2880
},
{
"entropy": 6.190168714523315,
"epoch": 2.47872797593468,
"grad_norm": 1.1171875,
"learning_rate": 0.00046601413955521575,
"loss": 5.8246,
"mean_token_accuracy": 0.1694057285785675,
"num_tokens": 5250082.0,
"step": 2885
},
{
"entropy": 6.136935997009277,
"epoch": 2.4830253545337344,
"grad_norm": 1.1484375,
"learning_rate": 0.0004658382237747393,
"loss": 5.8976,
"mean_token_accuracy": 0.16706683337688447,
"num_tokens": 5259680.0,
"step": 2890
},
{
"entropy": 6.16978874206543,
"epoch": 2.487322733132789,
"grad_norm": 0.97265625,
"learning_rate": 0.00046566189127985946,
"loss": 5.8769,
"mean_token_accuracy": 0.1714440792798996,
"num_tokens": 5269561.0,
"step": 2895
},
{
"entropy": 6.182620716094971,
"epoch": 2.4916201117318435,
"grad_norm": 0.9921875,
"learning_rate": 0.000465485142455616,
"loss": 5.8189,
"mean_token_accuracy": 0.17375694811344147,
"num_tokens": 5278659.0,
"step": 2900
},
{
"entropy": 6.057879829406739,
"epoch": 2.495917490330898,
"grad_norm": 1.0390625,
"learning_rate": 0.00046530797768795765,
"loss": 5.8103,
"mean_token_accuracy": 0.18172994256019592,
"num_tokens": 5287619.0,
"step": 2905
},
{
"entropy": 6.1459949016571045,
"epoch": 2.5002148689299526,
"grad_norm": 1.0078125,
"learning_rate": 0.00046513039736374153,
"loss": 5.9271,
"mean_token_accuracy": 0.16282536834478378,
"num_tokens": 5297334.0,
"step": 2910
},
{
"entropy": 6.201943445205688,
"epoch": 2.504512247529007,
"grad_norm": 1.109375,
"learning_rate": 0.0004649524018707319,
"loss": 5.8405,
"mean_token_accuracy": 0.1736244261264801,
"num_tokens": 5306208.0,
"step": 2915
},
{
"entropy": 6.117348289489746,
"epoch": 2.5088096261280617,
"grad_norm": 1.2109375,
"learning_rate": 0.00046477399159759996,
"loss": 5.7789,
"mean_token_accuracy": 0.1744915708899498,
"num_tokens": 5314754.0,
"step": 2920
},
{
"entropy": 6.022426891326904,
"epoch": 2.5131070047271162,
"grad_norm": 1.125,
"learning_rate": 0.00046459516693392246,
"loss": 5.7951,
"mean_token_accuracy": 0.17653965055942536,
"num_tokens": 5324000.0,
"step": 2925
},
{
"entropy": 6.192726993560791,
"epoch": 2.517404383326171,
"grad_norm": 1.0546875,
"learning_rate": 0.0004644159282701808,
"loss": 5.8412,
"mean_token_accuracy": 0.1699216842651367,
"num_tokens": 5332478.0,
"step": 2930
},
{
"entropy": 6.193784236907959,
"epoch": 2.5217017619252258,
"grad_norm": 0.99609375,
"learning_rate": 0.00046423627599776076,
"loss": 5.9229,
"mean_token_accuracy": 0.1587831899523735,
"num_tokens": 5341635.0,
"step": 2935
},
{
"entropy": 6.126192474365235,
"epoch": 2.5259991405242803,
"grad_norm": 1.0,
"learning_rate": 0.000464056210508951,
"loss": 5.9125,
"mean_token_accuracy": 0.16348374187946318,
"num_tokens": 5350144.0,
"step": 2940
},
{
"entropy": 6.17839298248291,
"epoch": 2.530296519123335,
"grad_norm": 1.078125,
"learning_rate": 0.0004638757321969426,
"loss": 5.8251,
"mean_token_accuracy": 0.17073310166597366,
"num_tokens": 5358788.0,
"step": 2945
},
{
"entropy": 6.144708824157715,
"epoch": 2.5345938977223894,
"grad_norm": 1.0859375,
"learning_rate": 0.00046369484145582815,
"loss": 5.9064,
"mean_token_accuracy": 0.16323922872543334,
"num_tokens": 5368057.0,
"step": 2950
},
{
"entropy": 6.069336700439453,
"epoch": 2.538891276321444,
"grad_norm": 1.0546875,
"learning_rate": 0.00046351353868060054,
"loss": 5.7586,
"mean_token_accuracy": 0.174574413895607,
"num_tokens": 5376739.0,
"step": 2955
},
{
"entropy": 6.171047353744507,
"epoch": 2.5431886549204985,
"grad_norm": 1.03125,
"learning_rate": 0.00046333182426715273,
"loss": 5.8806,
"mean_token_accuracy": 0.16850085258483888,
"num_tokens": 5385967.0,
"step": 2960
},
{
"entropy": 6.161162233352661,
"epoch": 2.547486033519553,
"grad_norm": 1.0390625,
"learning_rate": 0.00046314969861227626,
"loss": 5.9049,
"mean_token_accuracy": 0.15845982432365419,
"num_tokens": 5395192.0,
"step": 2965
},
{
"entropy": 6.14454460144043,
"epoch": 2.5517834121186076,
"grad_norm": 0.96484375,
"learning_rate": 0.0004629671621136608,
"loss": 5.8588,
"mean_token_accuracy": 0.16995412558317186,
"num_tokens": 5404694.0,
"step": 2970
},
{
"entropy": 6.158005809783935,
"epoch": 2.556080790717662,
"grad_norm": 1.1484375,
"learning_rate": 0.0004627842151698931,
"loss": 5.8623,
"mean_token_accuracy": 0.16851141750812532,
"num_tokens": 5413102.0,
"step": 2975
},
{
"entropy": 6.134857320785523,
"epoch": 2.5603781693167167,
"grad_norm": 1.046875,
"learning_rate": 0.00046260085818045625,
"loss": 5.8942,
"mean_token_accuracy": 0.16586572974920272,
"num_tokens": 5423339.0,
"step": 2980
},
{
"entropy": 6.197592544555664,
"epoch": 2.5646755479157712,
"grad_norm": 1.0546875,
"learning_rate": 0.0004624170915457284,
"loss": 5.8504,
"mean_token_accuracy": 0.17059714645147322,
"num_tokens": 5432377.0,
"step": 2985
},
{
"entropy": 6.128017950057983,
"epoch": 2.5689729265148262,
"grad_norm": 1.09375,
"learning_rate": 0.00046223291566698264,
"loss": 5.7959,
"mean_token_accuracy": 0.17204724699258805,
"num_tokens": 5441038.0,
"step": 2990
},
{
"entropy": 6.107345724105835,
"epoch": 2.5732703051138808,
"grad_norm": 1.046875,
"learning_rate": 0.0004620483309463855,
"loss": 5.7918,
"mean_token_accuracy": 0.17900732010602952,
"num_tokens": 5449557.0,
"step": 2995
},
{
"entropy": 6.1927958011627195,
"epoch": 2.5775676837129353,
"grad_norm": 1.0390625,
"learning_rate": 0.0004618633377869961,
"loss": 5.9156,
"mean_token_accuracy": 0.16568114012479782,
"num_tokens": 5458931.0,
"step": 3000
},
{
"epoch": 2.5775676837129353,
"eval_entropy": 5.998430791201892,
"eval_loss": 6.121789455413818,
"eval_mean_token_accuracy": 0.16322041645243363,
"eval_num_tokens": 5458931.0,
"eval_runtime": 2.0487,
"eval_samples_per_second": 1732.347,
"eval_steps_per_second": 216.726,
"step": 3000
},
{
"entropy": 6.126945543289184,
"epoch": 2.58186506231199,
"grad_norm": 0.9765625,
"learning_rate": 0.0004616779365927656,
"loss": 5.7528,
"mean_token_accuracy": 0.18461534082889558,
"num_tokens": 5468539.0,
"step": 3005
},
{
"entropy": 5.964468240737915,
"epoch": 2.5861624409110444,
"grad_norm": 1.2734375,
"learning_rate": 0.0004614921277685361,
"loss": 5.6994,
"mean_token_accuracy": 0.18173616677522658,
"num_tokens": 5475710.0,
"step": 3010
},
{
"entropy": 6.099804162979126,
"epoch": 2.590459819510099,
"grad_norm": 1.0234375,
"learning_rate": 0.00046130591172003976,
"loss": 5.845,
"mean_token_accuracy": 0.16855668723583223,
"num_tokens": 5484597.0,
"step": 3015
},
{
"entropy": 6.216131401062012,
"epoch": 2.5947571981091535,
"grad_norm": 1.0234375,
"learning_rate": 0.0004611192888538981,
"loss": 5.9276,
"mean_token_accuracy": 0.16257163286209106,
"num_tokens": 5493213.0,
"step": 3020
},
{
"entropy": 6.1808586597442625,
"epoch": 2.599054576708208,
"grad_norm": 1.1484375,
"learning_rate": 0.00046093225957762084,
"loss": 5.903,
"mean_token_accuracy": 0.16862684190273286,
"num_tokens": 5502556.0,
"step": 3025
},
{
"entropy": 6.1216977596282955,
"epoch": 2.6033519553072626,
"grad_norm": 1.0703125,
"learning_rate": 0.0004607448242996051,
"loss": 5.8208,
"mean_token_accuracy": 0.1719271272420883,
"num_tokens": 5511779.0,
"step": 3030
},
{
"entropy": 6.1579231262207035,
"epoch": 2.607649333906317,
"grad_norm": 1.0625,
"learning_rate": 0.0004605569834291347,
"loss": 5.8058,
"mean_token_accuracy": 0.18103471398353577,
"num_tokens": 5520836.0,
"step": 3035
},
{
"entropy": 6.061151313781738,
"epoch": 2.6119467125053717,
"grad_norm": 1.171875,
"learning_rate": 0.00046036873737637904,
"loss": 5.8302,
"mean_token_accuracy": 0.17482185810804368,
"num_tokens": 5529285.0,
"step": 3040
},
{
"entropy": 6.116726493835449,
"epoch": 2.6162440911044262,
"grad_norm": 1.1015625,
"learning_rate": 0.0004601800865523921,
"loss": 5.8482,
"mean_token_accuracy": 0.1684387966990471,
"num_tokens": 5538160.0,
"step": 3045
},
{
"entropy": 6.122728109359741,
"epoch": 2.620541469703481,
"grad_norm": 1.0859375,
"learning_rate": 0.00045999103136911204,
"loss": 5.8517,
"mean_token_accuracy": 0.16452286690473555,
"num_tokens": 5547355.0,
"step": 3050
},
{
"entropy": 6.120913076400757,
"epoch": 2.6248388483025353,
"grad_norm": 1.0078125,
"learning_rate": 0.00045980157223935965,
"loss": 5.8606,
"mean_token_accuracy": 0.16614654809236526,
"num_tokens": 5557299.0,
"step": 3055
},
{
"entropy": 6.061937570571899,
"epoch": 2.62913622690159,
"grad_norm": 1.0,
"learning_rate": 0.00045961170957683806,
"loss": 5.7822,
"mean_token_accuracy": 0.17485247999429704,
"num_tokens": 5565469.0,
"step": 3060
},
{
"entropy": 6.150688505172729,
"epoch": 2.6334336055006444,
"grad_norm": 1.03125,
"learning_rate": 0.00045942144379613147,
"loss": 5.8945,
"mean_token_accuracy": 0.16743394434452058,
"num_tokens": 5574740.0,
"step": 3065
},
{
"entropy": 6.152962112426758,
"epoch": 2.637730984099699,
"grad_norm": 1.0546875,
"learning_rate": 0.00045923077531270426,
"loss": 5.8866,
"mean_token_accuracy": 0.16888206750154494,
"num_tokens": 5583438.0,
"step": 3070
},
{
"entropy": 6.126224088668823,
"epoch": 2.6420283626987535,
"grad_norm": 1.046875,
"learning_rate": 0.0004590397045429001,
"loss": 5.84,
"mean_token_accuracy": 0.17367925941944123,
"num_tokens": 5592389.0,
"step": 3075
},
{
"entropy": 6.084698152542114,
"epoch": 2.646325741297808,
"grad_norm": 0.9609375,
"learning_rate": 0.00045884823190394134,
"loss": 5.7589,
"mean_token_accuracy": 0.1789909452199936,
"num_tokens": 5601598.0,
"step": 3080
},
{
"entropy": 6.075862979888916,
"epoch": 2.650623119896863,
"grad_norm": 1.1171875,
"learning_rate": 0.0004586563578139275,
"loss": 5.8461,
"mean_token_accuracy": 0.1662924975156784,
"num_tokens": 5610498.0,
"step": 3085
},
{
"entropy": 6.096910190582276,
"epoch": 2.6549204984959176,
"grad_norm": 1.1796875,
"learning_rate": 0.00045846408269183505,
"loss": 5.7512,
"mean_token_accuracy": 0.17860534340143203,
"num_tokens": 5620082.0,
"step": 3090
},
{
"entropy": 6.1647505283355715,
"epoch": 2.659217877094972,
"grad_norm": 1.0234375,
"learning_rate": 0.00045827140695751603,
"loss": 5.8362,
"mean_token_accuracy": 0.17174756973981858,
"num_tokens": 5630291.0,
"step": 3095
},
{
"entropy": 6.091697454452515,
"epoch": 2.6635152556940267,
"grad_norm": 1.1484375,
"learning_rate": 0.0004580783310316971,
"loss": 5.8104,
"mean_token_accuracy": 0.17255474478006363,
"num_tokens": 5638784.0,
"step": 3100
},
{
"entropy": 6.026739645004272,
"epoch": 2.6678126342930812,
"grad_norm": 1.046875,
"learning_rate": 0.00045788485533597895,
"loss": 5.6819,
"mean_token_accuracy": 0.18163852095603944,
"num_tokens": 5647968.0,
"step": 3105
},
{
"entropy": 6.098209285736084,
"epoch": 2.672110012892136,
"grad_norm": 1.0390625,
"learning_rate": 0.00045769098029283526,
"loss": 5.906,
"mean_token_accuracy": 0.16296559423208237,
"num_tokens": 5657543.0,
"step": 3110
},
{
"entropy": 6.150312328338623,
"epoch": 2.6764073914911903,
"grad_norm": 1.1328125,
"learning_rate": 0.0004574967063256115,
"loss": 5.836,
"mean_token_accuracy": 0.17701750695705415,
"num_tokens": 5666535.0,
"step": 3115
},
{
"entropy": 6.1265421390533445,
"epoch": 2.680704770090245,
"grad_norm": 1.1015625,
"learning_rate": 0.00045730203385852447,
"loss": 5.9135,
"mean_token_accuracy": 0.16741105765104294,
"num_tokens": 5676273.0,
"step": 3120
},
{
"entropy": 6.052946949005127,
"epoch": 2.6850021486892994,
"grad_norm": 1.0703125,
"learning_rate": 0.000457106963316661,
"loss": 5.8151,
"mean_token_accuracy": 0.1772770792245865,
"num_tokens": 5684888.0,
"step": 3125
},
{
"entropy": 6.088335084915161,
"epoch": 2.689299527288354,
"grad_norm": 1.0703125,
"learning_rate": 0.00045691149512597717,
"loss": 5.8631,
"mean_token_accuracy": 0.16669325679540634,
"num_tokens": 5693626.0,
"step": 3130
},
{
"entropy": 6.180005502700806,
"epoch": 2.6935969058874085,
"grad_norm": 1.4453125,
"learning_rate": 0.00045671562971329736,
"loss": 5.7649,
"mean_token_accuracy": 0.18092152327299119,
"num_tokens": 5702542.0,
"step": 3135
},
{
"entropy": 6.056423187255859,
"epoch": 2.6978942844864635,
"grad_norm": 1.1484375,
"learning_rate": 0.00045651936750631337,
"loss": 5.8131,
"mean_token_accuracy": 0.17378336936235428,
"num_tokens": 5711440.0,
"step": 3140
},
{
"entropy": 6.189997816085816,
"epoch": 2.702191663085518,
"grad_norm": 1.0234375,
"learning_rate": 0.00045632270893358333,
"loss": 5.8825,
"mean_token_accuracy": 0.17272377163171768,
"num_tokens": 5721495.0,
"step": 3145
},
{
"entropy": 6.167654418945313,
"epoch": 2.7064890416845726,
"grad_norm": 1.109375,
"learning_rate": 0.0004561256544245312,
"loss": 5.9067,
"mean_token_accuracy": 0.1615714728832245,
"num_tokens": 5730664.0,
"step": 3150
},
{
"entropy": 6.04947509765625,
"epoch": 2.710786420283627,
"grad_norm": 1.0625,
"learning_rate": 0.000455928204409445,
"loss": 5.79,
"mean_token_accuracy": 0.17923566401004792,
"num_tokens": 5740229.0,
"step": 3155
},
{
"entropy": 6.107324123382568,
"epoch": 2.7150837988826817,
"grad_norm": 1.1328125,
"learning_rate": 0.00045573035931947684,
"loss": 5.7791,
"mean_token_accuracy": 0.17757482677698136,
"num_tokens": 5748549.0,
"step": 3160
},
{
"entropy": 6.101696872711182,
"epoch": 2.7193811774817362,
"grad_norm": 1.109375,
"learning_rate": 0.0004555321195866411,
"loss": 5.732,
"mean_token_accuracy": 0.17644069641828536,
"num_tokens": 5757603.0,
"step": 3165
},
{
"entropy": 6.136196327209473,
"epoch": 2.723678556080791,
"grad_norm": 1.2265625,
"learning_rate": 0.0004553334856438143,
"loss": 5.9098,
"mean_token_accuracy": 0.16370768547058107,
"num_tokens": 5767520.0,
"step": 3170
},
{
"entropy": 6.1458038806915285,
"epoch": 2.7279759346798453,
"grad_norm": 0.98828125,
"learning_rate": 0.00045513445792473356,
"loss": 5.8906,
"mean_token_accuracy": 0.16408973336219787,
"num_tokens": 5776778.0,
"step": 3175
},
{
"entropy": 6.174926614761352,
"epoch": 2.7322733132789,
"grad_norm": 1.109375,
"learning_rate": 0.0004549350368639958,
"loss": 5.9249,
"mean_token_accuracy": 0.16355405300855635,
"num_tokens": 5785652.0,
"step": 3180
},
{
"entropy": 6.212893629074097,
"epoch": 2.7365706918779544,
"grad_norm": 1.078125,
"learning_rate": 0.00045473522289705693,
"loss": 5.8811,
"mean_token_accuracy": 0.1690053179860115,
"num_tokens": 5795766.0,
"step": 3185
},
{
"entropy": 6.0142913341522215,
"epoch": 2.740868070477009,
"grad_norm": 1.140625,
"learning_rate": 0.00045453501646023085,
"loss": 5.9293,
"mean_token_accuracy": 0.16316341012716293,
"num_tokens": 5804504.0,
"step": 3190
},
{
"entropy": 6.090119218826294,
"epoch": 2.7451654490760635,
"grad_norm": 0.94921875,
"learning_rate": 0.00045433441799068837,
"loss": 5.8318,
"mean_token_accuracy": 0.17157045751810074,
"num_tokens": 5814161.0,
"step": 3195
},
{
"entropy": 6.133489179611206,
"epoch": 2.749462827675118,
"grad_norm": 1.0625,
"learning_rate": 0.0004541334279264562,
"loss": 5.7556,
"mean_token_accuracy": 0.17994108349084853,
"num_tokens": 5822235.0,
"step": 3200
},
{
"entropy": 6.069830846786499,
"epoch": 2.7537602062741726,
"grad_norm": 1.171875,
"learning_rate": 0.00045393204670641656,
"loss": 5.7589,
"mean_token_accuracy": 0.17203548699617385,
"num_tokens": 5831572.0,
"step": 3205
},
{
"entropy": 5.9929163455963135,
"epoch": 2.758057584873227,
"grad_norm": 1.0390625,
"learning_rate": 0.0004537302747703055,
"loss": 5.7621,
"mean_token_accuracy": 0.18025242835283278,
"num_tokens": 5839694.0,
"step": 3210
},
{
"entropy": 6.185488748550415,
"epoch": 2.7623549634722817,
"grad_norm": 1.1875,
"learning_rate": 0.00045352811255871216,
"loss": 5.8899,
"mean_token_accuracy": 0.17093945741653443,
"num_tokens": 5849131.0,
"step": 3215
},
{
"entropy": 6.186608505249024,
"epoch": 2.7666523420713363,
"grad_norm": 0.91796875,
"learning_rate": 0.00045332556051307804,
"loss": 5.8208,
"mean_token_accuracy": 0.16853767782449722,
"num_tokens": 5858861.0,
"step": 3220
},
{
"entropy": 6.110893869400025,
"epoch": 2.770949720670391,
"grad_norm": 1.0546875,
"learning_rate": 0.00045312261907569585,
"loss": 5.82,
"mean_token_accuracy": 0.17171475738286973,
"num_tokens": 5867585.0,
"step": 3225
},
{
"entropy": 6.081268453598023,
"epoch": 2.775247099269446,
"grad_norm": 1.0859375,
"learning_rate": 0.00045291928868970867,
"loss": 5.8317,
"mean_token_accuracy": 0.16950544714927673,
"num_tokens": 5876256.0,
"step": 3230
},
{
"entropy": 6.064776659011841,
"epoch": 2.7795444778685003,
"grad_norm": 1.0859375,
"learning_rate": 0.0004527155697991087,
"loss": 5.8911,
"mean_token_accuracy": 0.16254067420959473,
"num_tokens": 5885302.0,
"step": 3235
},
{
"entropy": 6.128396034240723,
"epoch": 2.783841856467555,
"grad_norm": 0.95703125,
"learning_rate": 0.0004525114628487365,
"loss": 5.9091,
"mean_token_accuracy": 0.16473145335912703,
"num_tokens": 5895066.0,
"step": 3240
},
{
"entropy": 6.1276613712310795,
"epoch": 2.7881392350666094,
"grad_norm": 1.0625,
"learning_rate": 0.00045230696828428026,
"loss": 5.8938,
"mean_token_accuracy": 0.16614799648523332,
"num_tokens": 5903258.0,
"step": 3245
},
{
"entropy": 6.09830675125122,
"epoch": 2.792436613665664,
"grad_norm": 1.125,
"learning_rate": 0.0004521020865522742,
"loss": 5.7738,
"mean_token_accuracy": 0.1714928478002548,
"num_tokens": 5911714.0,
"step": 3250
},
{
"entropy": 6.070488023757934,
"epoch": 2.7967339922647185,
"grad_norm": 1.0859375,
"learning_rate": 0.00045189681810009827,
"loss": 5.8635,
"mean_token_accuracy": 0.16751533150672912,
"num_tokens": 5920432.0,
"step": 3255
},
{
"entropy": 6.227630186080932,
"epoch": 2.801031370863773,
"grad_norm": 1.2265625,
"learning_rate": 0.00045169116337597653,
"loss": 5.8701,
"mean_token_accuracy": 0.17065902799367905,
"num_tokens": 5929202.0,
"step": 3260
},
{
"entropy": 6.189503717422485,
"epoch": 2.8053287494628276,
"grad_norm": 1.15625,
"learning_rate": 0.000451485122828977,
"loss": 5.9003,
"mean_token_accuracy": 0.1647379770874977,
"num_tokens": 5938034.0,
"step": 3265
},
{
"entropy": 6.010164356231689,
"epoch": 2.809626128061882,
"grad_norm": 0.9921875,
"learning_rate": 0.00045127869690900956,
"loss": 5.7485,
"mean_token_accuracy": 0.17689475119113923,
"num_tokens": 5946944.0,
"step": 3270
},
{
"entropy": 6.029814195632935,
"epoch": 2.8139235066609367,
"grad_norm": 1.2421875,
"learning_rate": 0.00045107188606682613,
"loss": 5.8498,
"mean_token_accuracy": 0.17715609222650527,
"num_tokens": 5956475.0,
"step": 3275
},
{
"entropy": 6.185597848892212,
"epoch": 2.8182208852599913,
"grad_norm": 1.0390625,
"learning_rate": 0.0004508646907540188,
"loss": 5.8236,
"mean_token_accuracy": 0.16963610351085662,
"num_tokens": 5965814.0,
"step": 3280
},
{
"entropy": 6.105741548538208,
"epoch": 2.8225182638590463,
"grad_norm": 1.1328125,
"learning_rate": 0.0004506571114230195,
"loss": 5.8687,
"mean_token_accuracy": 0.16442400217056274,
"num_tokens": 5973850.0,
"step": 3285
},
{
"entropy": 6.0313629627227785,
"epoch": 2.826815642458101,
"grad_norm": 1.0,
"learning_rate": 0.00045044914852709824,
"loss": 5.8113,
"mean_token_accuracy": 0.16617825627326965,
"num_tokens": 5982987.0,
"step": 3290
},
{
"entropy": 6.152327919006348,
"epoch": 2.8311130210571553,
"grad_norm": 1.1015625,
"learning_rate": 0.0004502408025203631,
"loss": 5.7981,
"mean_token_accuracy": 0.17620996087789537,
"num_tokens": 5992227.0,
"step": 3295
},
{
"entropy": 6.093041801452637,
"epoch": 2.83541039965621,
"grad_norm": 1.0546875,
"learning_rate": 0.0004500320738577584,
"loss": 5.7804,
"mean_token_accuracy": 0.17178058624267578,
"num_tokens": 6000243.0,
"step": 3300
},
{
"entropy": 6.071863269805908,
"epoch": 2.8397077782552644,
"grad_norm": 1.109375,
"learning_rate": 0.00044982296299506407,
"loss": 5.7959,
"mean_token_accuracy": 0.1757694289088249,
"num_tokens": 6009771.0,
"step": 3305
},
{
"entropy": 6.104401445388794,
"epoch": 2.844005156854319,
"grad_norm": 1.1796875,
"learning_rate": 0.0004496134703888948,
"loss": 5.8655,
"mean_token_accuracy": 0.16886720359325408,
"num_tokens": 6018683.0,
"step": 3310
},
{
"entropy": 6.063603019714355,
"epoch": 2.8483025354533735,
"grad_norm": 1.0703125,
"learning_rate": 0.00044940359649669846,
"loss": 5.7182,
"mean_token_accuracy": 0.1814822018146515,
"num_tokens": 6027422.0,
"step": 3315
},
{
"entropy": 6.0563880443573,
"epoch": 2.852599914052428,
"grad_norm": 1.09375,
"learning_rate": 0.00044919334177675595,
"loss": 5.8185,
"mean_token_accuracy": 0.16714439690113067,
"num_tokens": 6035670.0,
"step": 3320
},
{
"entropy": 6.098821926116943,
"epoch": 2.8568972926514826,
"grad_norm": 1.078125,
"learning_rate": 0.00044898270668817955,
"loss": 5.7433,
"mean_token_accuracy": 0.17498091757297515,
"num_tokens": 6044092.0,
"step": 3325
},
{
"entropy": 6.041405916213989,
"epoch": 2.861194671250537,
"grad_norm": 0.99609375,
"learning_rate": 0.000448771691690912,
"loss": 5.8089,
"mean_token_accuracy": 0.17252034097909927,
"num_tokens": 6053970.0,
"step": 3330
},
{
"entropy": 6.098532438278198,
"epoch": 2.8654920498495917,
"grad_norm": 1.0234375,
"learning_rate": 0.0004485602972457257,
"loss": 5.7875,
"mean_token_accuracy": 0.17401470988988876,
"num_tokens": 6062965.0,
"step": 3335
},
{
"entropy": 6.10422191619873,
"epoch": 2.8697894284486463,
"grad_norm": 1.078125,
"learning_rate": 0.00044834852381422165,
"loss": 5.8375,
"mean_token_accuracy": 0.17349963784217834,
"num_tokens": 6072420.0,
"step": 3340
},
{
"entropy": 6.048533582687378,
"epoch": 2.874086807047701,
"grad_norm": 1.078125,
"learning_rate": 0.00044813637185882836,
"loss": 5.7604,
"mean_token_accuracy": 0.17201080173254013,
"num_tokens": 6080915.0,
"step": 3345
},
{
"entropy": 6.129676723480225,
"epoch": 2.8783841856467554,
"grad_norm": 1.1875,
"learning_rate": 0.00044792384184280106,
"loss": 5.8898,
"mean_token_accuracy": 0.16713710129261017,
"num_tokens": 6090453.0,
"step": 3350
},
{
"entropy": 6.036713743209839,
"epoch": 2.88268156424581,
"grad_norm": 1.09375,
"learning_rate": 0.00044771093423022013,
"loss": 5.9178,
"mean_token_accuracy": 0.16426213681697846,
"num_tokens": 6099390.0,
"step": 3355
},
{
"entropy": 6.090553140640258,
"epoch": 2.8869789428448644,
"grad_norm": 0.99609375,
"learning_rate": 0.0004474976494859909,
"loss": 5.8439,
"mean_token_accuracy": 0.17439688742160797,
"num_tokens": 6108677.0,
"step": 3360
},
{
"entropy": 6.084423589706421,
"epoch": 2.891276321443919,
"grad_norm": 0.98046875,
"learning_rate": 0.0004472839880758419,
"loss": 5.7572,
"mean_token_accuracy": 0.17288744151592256,
"num_tokens": 6117151.0,
"step": 3365
},
{
"entropy": 6.169969892501831,
"epoch": 2.8955737000429735,
"grad_norm": 1.109375,
"learning_rate": 0.0004470699504663242,
"loss": 5.8724,
"mean_token_accuracy": 0.1652231350541115,
"num_tokens": 6127167.0,
"step": 3370
},
{
"entropy": 6.055519533157349,
"epoch": 2.899871078642028,
"grad_norm": 1.03125,
"learning_rate": 0.0004468555371248104,
"loss": 5.7663,
"mean_token_accuracy": 0.17967537939548492,
"num_tokens": 6136487.0,
"step": 3375
},
{
"entropy": 6.096647262573242,
"epoch": 2.904168457241083,
"grad_norm": 1.0078125,
"learning_rate": 0.0004466407485194937,
"loss": 5.8808,
"mean_token_accuracy": 0.16516373604536055,
"num_tokens": 6145334.0,
"step": 3380
},
{
"entropy": 6.091698265075683,
"epoch": 2.9084658358401376,
"grad_norm": 1.0625,
"learning_rate": 0.0004464255851193864,
"loss": 5.7913,
"mean_token_accuracy": 0.17120025604963302,
"num_tokens": 6155062.0,
"step": 3385
},
{
"entropy": 6.080928611755371,
"epoch": 2.912763214439192,
"grad_norm": 1.7265625,
"learning_rate": 0.0004462100473943194,
"loss": 5.7627,
"mean_token_accuracy": 0.17752974182367326,
"num_tokens": 6164313.0,
"step": 3390
},
{
"entropy": 6.061914777755737,
"epoch": 2.9170605930382467,
"grad_norm": 1.03125,
"learning_rate": 0.000445994135814941,
"loss": 5.8024,
"mean_token_accuracy": 0.17023618370294571,
"num_tokens": 6173513.0,
"step": 3395
},
{
"entropy": 6.057987403869629,
"epoch": 2.9213579716373013,
"grad_norm": 1.1953125,
"learning_rate": 0.00044577785085271566,
"loss": 5.8041,
"mean_token_accuracy": 0.17476166486740113,
"num_tokens": 6182000.0,
"step": 3400
},
{
"entropy": 6.1352544784545895,
"epoch": 2.925655350236356,
"grad_norm": 1.015625,
"learning_rate": 0.0004455611929799235,
"loss": 5.8516,
"mean_token_accuracy": 0.1572086051106453,
"num_tokens": 6191887.0,
"step": 3405
},
{
"entropy": 6.025879716873169,
"epoch": 2.9299527288354104,
"grad_norm": 1.015625,
"learning_rate": 0.0004453441626696585,
"loss": 5.885,
"mean_token_accuracy": 0.16230087578296662,
"num_tokens": 6202897.0,
"step": 3410
},
{
"entropy": 6.132012939453125,
"epoch": 2.934250107434465,
"grad_norm": 1.0390625,
"learning_rate": 0.00044512676039582823,
"loss": 5.7891,
"mean_token_accuracy": 0.1754133865237236,
"num_tokens": 6211811.0,
"step": 3415
},
{
"entropy": 6.114519882202148,
"epoch": 2.9385474860335195,
"grad_norm": 1.109375,
"learning_rate": 0.0004449089866331524,
"loss": 5.7826,
"mean_token_accuracy": 0.18096065670251846,
"num_tokens": 6219896.0,
"step": 3420
},
{
"entropy": 5.983143472671509,
"epoch": 2.942844864632574,
"grad_norm": 1.078125,
"learning_rate": 0.0004446908418571617,
"loss": 5.7734,
"mean_token_accuracy": 0.1765346944332123,
"num_tokens": 6228212.0,
"step": 3425
},
{
"entropy": 6.059330701828003,
"epoch": 2.9471422432316285,
"grad_norm": 1.0390625,
"learning_rate": 0.0004444723265441973,
"loss": 5.9301,
"mean_token_accuracy": 0.1656051605939865,
"num_tokens": 6238133.0,
"step": 3430
},
{
"entropy": 6.08131365776062,
"epoch": 2.9514396218306835,
"grad_norm": 0.98046875,
"learning_rate": 0.0004442534411714092,
"loss": 5.8366,
"mean_token_accuracy": 0.1650673657655716,
"num_tokens": 6247331.0,
"step": 3435
},
{
"entropy": 6.160918760299682,
"epoch": 2.955737000429738,
"grad_norm": 1.0859375,
"learning_rate": 0.00044403418621675555,
"loss": 5.8406,
"mean_token_accuracy": 0.16983808875083922,
"num_tokens": 6255280.0,
"step": 3440
},
{
"entropy": 6.073430061340332,
"epoch": 2.9600343790287926,
"grad_norm": 1.015625,
"learning_rate": 0.0004438145621590017,
"loss": 5.7939,
"mean_token_accuracy": 0.17472269237041474,
"num_tokens": 6264752.0,
"step": 3445
},
{
"entropy": 6.033823823928833,
"epoch": 2.964331757627847,
"grad_norm": 1.140625,
"learning_rate": 0.00044359456947771857,
"loss": 5.7495,
"mean_token_accuracy": 0.172511225938797,
"num_tokens": 6273258.0,
"step": 3450
},
{
"entropy": 5.891212129592896,
"epoch": 2.9686291362269017,
"grad_norm": 1.1953125,
"learning_rate": 0.0004433742086532824,
"loss": 5.6668,
"mean_token_accuracy": 0.19016601592302323,
"num_tokens": 6281584.0,
"step": 3455
},
{
"entropy": 6.076795339584351,
"epoch": 2.9729265148259563,
"grad_norm": 1.171875,
"learning_rate": 0.00044315348016687317,
"loss": 5.7854,
"mean_token_accuracy": 0.17181758135557174,
"num_tokens": 6290016.0,
"step": 3460
},
{
"entropy": 6.06014461517334,
"epoch": 2.977223893425011,
"grad_norm": 1.078125,
"learning_rate": 0.0004429323845004736,
"loss": 5.694,
"mean_token_accuracy": 0.17798333764076232,
"num_tokens": 6298569.0,
"step": 3465
},
{
"entropy": 5.982924079895019,
"epoch": 2.9815212720240654,
"grad_norm": 1.0078125,
"learning_rate": 0.00044271092213686824,
"loss": 5.7296,
"mean_token_accuracy": 0.17693220674991608,
"num_tokens": 6307684.0,
"step": 3470
},
{
"entropy": 6.1649445533752445,
"epoch": 2.98581865062312,
"grad_norm": 0.9453125,
"learning_rate": 0.00044248909355964247,
"loss": 5.8556,
"mean_token_accuracy": 0.1716341868042946,
"num_tokens": 6317767.0,
"step": 3475
},
{
"entropy": 6.146809720993042,
"epoch": 2.9901160292221745,
"grad_norm": 1.1484375,
"learning_rate": 0.00044226689925318117,
"loss": 5.8931,
"mean_token_accuracy": 0.16468499451875687,
"num_tokens": 6327457.0,
"step": 3480
},
{
"entropy": 5.985245990753174,
"epoch": 2.994413407821229,
"grad_norm": 1.0625,
"learning_rate": 0.00044204433970266785,
"loss": 5.6945,
"mean_token_accuracy": 0.18739936202764512,
"num_tokens": 6335747.0,
"step": 3485
},
{
"entropy": 6.050507545471191,
"epoch": 2.9987107864202835,
"grad_norm": 1.0625,
"learning_rate": 0.0004418214153940837,
"loss": 5.7846,
"mean_token_accuracy": 0.1760311618447304,
"num_tokens": 6344750.0,
"step": 3490
},
{
"entropy": 6.092853705088298,
"epoch": 3.002578427159433,
"grad_norm": 0.890625,
"learning_rate": 0.00044159812681420624,
"loss": 5.7217,
"mean_token_accuracy": 0.17525596585538653,
"num_tokens": 6354779.0,
"step": 3495
},
{
"entropy": 6.122584819793701,
"epoch": 3.0068758057584875,
"grad_norm": 1.0703125,
"learning_rate": 0.0004413744744506086,
"loss": 5.506,
"mean_token_accuracy": 0.1860961213707924,
"num_tokens": 6363809.0,
"step": 3500
},
{
"epoch": 3.0068758057584875,
"eval_entropy": 5.801608745042269,
"eval_loss": 6.042037010192871,
"eval_mean_token_accuracy": 0.1686659706336958,
"eval_num_tokens": 6363809.0,
"eval_runtime": 2.0476,
"eval_samples_per_second": 1733.255,
"eval_steps_per_second": 216.84,
"step": 3500
},
{
"entropy": 5.992935609817505,
"epoch": 3.011173184357542,
"grad_norm": 1.046875,
"learning_rate": 0.00044115045879165806,
"loss": 5.563,
"mean_token_accuracy": 0.18435313254594804,
"num_tokens": 6373082.0,
"step": 3505
},
{
"entropy": 6.053584480285645,
"epoch": 3.0154705629565965,
"grad_norm": 1.1015625,
"learning_rate": 0.00044092608032651515,
"loss": 5.5261,
"mean_token_accuracy": 0.1837206542491913,
"num_tokens": 6381286.0,
"step": 3510
},
{
"entropy": 6.083251333236694,
"epoch": 3.019767941555651,
"grad_norm": 0.98046875,
"learning_rate": 0.00044070133954513305,
"loss": 5.4729,
"mean_token_accuracy": 0.19432286769151688,
"num_tokens": 6390217.0,
"step": 3515
},
{
"entropy": 6.058011102676391,
"epoch": 3.0240653201547056,
"grad_norm": 1.28125,
"learning_rate": 0.0004404762369382555,
"loss": 5.5036,
"mean_token_accuracy": 0.18731357306241989,
"num_tokens": 6399276.0,
"step": 3520
},
{
"entropy": 6.000890445709229,
"epoch": 3.02836269875376,
"grad_norm": 1.1640625,
"learning_rate": 0.00044025077299741683,
"loss": 5.4811,
"mean_token_accuracy": 0.192198945581913,
"num_tokens": 6407981.0,
"step": 3525
},
{
"entropy": 5.988429880142212,
"epoch": 3.0326600773528147,
"grad_norm": 1.125,
"learning_rate": 0.00044002494821494007,
"loss": 5.4804,
"mean_token_accuracy": 0.18921354711055755,
"num_tokens": 6416159.0,
"step": 3530
},
{
"entropy": 5.9463738918304445,
"epoch": 3.0369574559518693,
"grad_norm": 1.125,
"learning_rate": 0.00043979876308393635,
"loss": 5.531,
"mean_token_accuracy": 0.1913963183760643,
"num_tokens": 6424564.0,
"step": 3535
},
{
"entropy": 6.106854009628296,
"epoch": 3.041254834550924,
"grad_norm": 1.0234375,
"learning_rate": 0.0004395722180983036,
"loss": 5.5823,
"mean_token_accuracy": 0.18249945044517518,
"num_tokens": 6434163.0,
"step": 3540
},
{
"entropy": 5.950508308410645,
"epoch": 3.0455522131499784,
"grad_norm": 1.0625,
"learning_rate": 0.00043934531375272535,
"loss": 5.3919,
"mean_token_accuracy": 0.20384220778942108,
"num_tokens": 6443372.0,
"step": 3545
},
{
"entropy": 5.974466180801391,
"epoch": 3.049849591749033,
"grad_norm": 0.96875,
"learning_rate": 0.00043911805054267015,
"loss": 5.4833,
"mean_token_accuracy": 0.18905829787254333,
"num_tokens": 6452638.0,
"step": 3550
},
{
"entropy": 6.111138391494751,
"epoch": 3.0541469703480875,
"grad_norm": 1.0546875,
"learning_rate": 0.00043889042896439004,
"loss": 5.4924,
"mean_token_accuracy": 0.19172994196414947,
"num_tokens": 6461319.0,
"step": 3555
},
{
"entropy": 6.002539110183716,
"epoch": 3.0584443489471425,
"grad_norm": 1.3046875,
"learning_rate": 0.00043866244951491946,
"loss": 5.4305,
"mean_token_accuracy": 0.1999826490879059,
"num_tokens": 6469506.0,
"step": 3560
},
{
"entropy": 6.020529794692993,
"epoch": 3.062741727546197,
"grad_norm": 1.1171875,
"learning_rate": 0.00043843411269207445,
"loss": 5.4837,
"mean_token_accuracy": 0.19121226519346238,
"num_tokens": 6478404.0,
"step": 3565
},
{
"entropy": 5.9611005783081055,
"epoch": 3.0670391061452515,
"grad_norm": 1.09375,
"learning_rate": 0.0004382054189944514,
"loss": 5.433,
"mean_token_accuracy": 0.18942490667104722,
"num_tokens": 6487447.0,
"step": 3570
},
{
"entropy": 5.9097977638244625,
"epoch": 3.071336484744306,
"grad_norm": 1.0234375,
"learning_rate": 0.0004379763689214259,
"loss": 5.469,
"mean_token_accuracy": 0.18396330773830413,
"num_tokens": 6496738.0,
"step": 3575
},
{
"entropy": 6.013470220565796,
"epoch": 3.0756338633433606,
"grad_norm": 0.97265625,
"learning_rate": 0.0004377469629731518,
"loss": 5.4752,
"mean_token_accuracy": 0.1895818755030632,
"num_tokens": 6505848.0,
"step": 3580
},
{
"entropy": 6.006653928756714,
"epoch": 3.079931241942415,
"grad_norm": 1.015625,
"learning_rate": 0.0004375172016505599,
"loss": 5.4558,
"mean_token_accuracy": 0.18824636489152907,
"num_tokens": 6515731.0,
"step": 3585
},
{
"entropy": 5.979631328582764,
"epoch": 3.0842286205414697,
"grad_norm": 1.03125,
"learning_rate": 0.0004372870854553572,
"loss": 5.5152,
"mean_token_accuracy": 0.18944674283266066,
"num_tokens": 6524914.0,
"step": 3590
},
{
"entropy": 5.99342303276062,
"epoch": 3.0885259991405243,
"grad_norm": 1.0625,
"learning_rate": 0.0004370566148900255,
"loss": 5.4967,
"mean_token_accuracy": 0.19440635293722153,
"num_tokens": 6533712.0,
"step": 3595
},
{
"entropy": 6.0267222881317135,
"epoch": 3.092823377739579,
"grad_norm": 1.0625,
"learning_rate": 0.00043682579045782024,
"loss": 5.5786,
"mean_token_accuracy": 0.18650965839624406,
"num_tokens": 6543313.0,
"step": 3600
},
{
"entropy": 5.940178155899048,
"epoch": 3.0971207563386334,
"grad_norm": 1.1953125,
"learning_rate": 0.0004365946126627699,
"loss": 5.4649,
"mean_token_accuracy": 0.19772678166627883,
"num_tokens": 6551634.0,
"step": 3605
},
{
"entropy": 6.004144239425659,
"epoch": 3.101418134937688,
"grad_norm": 1.0546875,
"learning_rate": 0.00043636308200967433,
"loss": 5.4821,
"mean_token_accuracy": 0.1942768707871437,
"num_tokens": 6560695.0,
"step": 3610
},
{
"entropy": 5.857456827163697,
"epoch": 3.1057155135367425,
"grad_norm": 1.03125,
"learning_rate": 0.0004361311990041039,
"loss": 5.3753,
"mean_token_accuracy": 0.19874223917722703,
"num_tokens": 6569086.0,
"step": 3615
},
{
"entropy": 5.919683027267456,
"epoch": 3.110012892135797,
"grad_norm": 1.0859375,
"learning_rate": 0.00043589896415239843,
"loss": 5.4564,
"mean_token_accuracy": 0.1986413672566414,
"num_tokens": 6578287.0,
"step": 3620
},
{
"entropy": 5.956605434417725,
"epoch": 3.1143102707348516,
"grad_norm": 0.99609375,
"learning_rate": 0.00043566637796166595,
"loss": 5.5147,
"mean_token_accuracy": 0.18752527385950088,
"num_tokens": 6587015.0,
"step": 3625
},
{
"entropy": 5.9813155174255375,
"epoch": 3.118607649333906,
"grad_norm": 1.140625,
"learning_rate": 0.00043543344093978186,
"loss": 5.5585,
"mean_token_accuracy": 0.18545775562524797,
"num_tokens": 6596187.0,
"step": 3630
},
{
"entropy": 5.964481592178345,
"epoch": 3.122905027932961,
"grad_norm": 1.0703125,
"learning_rate": 0.00043520015359538745,
"loss": 5.4268,
"mean_token_accuracy": 0.19721884578466414,
"num_tokens": 6605226.0,
"step": 3635
},
{
"entropy": 5.862498092651367,
"epoch": 3.1272024065320156,
"grad_norm": 1.109375,
"learning_rate": 0.0004349665164378891,
"loss": 5.475,
"mean_token_accuracy": 0.18966546505689622,
"num_tokens": 6613232.0,
"step": 3640
},
{
"entropy": 5.976254987716675,
"epoch": 3.13149978513107,
"grad_norm": 1.0859375,
"learning_rate": 0.00043473252997745684,
"loss": 5.4789,
"mean_token_accuracy": 0.18647109866142272,
"num_tokens": 6622247.0,
"step": 3645
},
{
"entropy": 6.025827789306641,
"epoch": 3.1357971637301247,
"grad_norm": 1.71875,
"learning_rate": 0.00043449819472502366,
"loss": 5.4281,
"mean_token_accuracy": 0.19298454523086547,
"num_tokens": 6630883.0,
"step": 3650
},
{
"entropy": 5.921304559707641,
"epoch": 3.1400945423291793,
"grad_norm": 1.09375,
"learning_rate": 0.0004342635111922841,
"loss": 5.5595,
"mean_token_accuracy": 0.18861598372459412,
"num_tokens": 6639399.0,
"step": 3655
},
{
"entropy": 5.989827823638916,
"epoch": 3.144391920928234,
"grad_norm": 1.125,
"learning_rate": 0.0004340284798916931,
"loss": 5.483,
"mean_token_accuracy": 0.19412256628274918,
"num_tokens": 6649288.0,
"step": 3660
},
{
"entropy": 5.921028423309326,
"epoch": 3.1486892995272884,
"grad_norm": 1.0078125,
"learning_rate": 0.0004337931013364653,
"loss": 5.4165,
"mean_token_accuracy": 0.19552054554224013,
"num_tokens": 6658670.0,
"step": 3665
},
{
"entropy": 5.969826030731201,
"epoch": 3.152986678126343,
"grad_norm": 1.125,
"learning_rate": 0.000433557376040573,
"loss": 5.4991,
"mean_token_accuracy": 0.1942813739180565,
"num_tokens": 6667302.0,
"step": 3670
},
{
"entropy": 5.992925643920898,
"epoch": 3.1572840567253975,
"grad_norm": 1.0703125,
"learning_rate": 0.00043332130451874645,
"loss": 5.5383,
"mean_token_accuracy": 0.1936521127820015,
"num_tokens": 6677393.0,
"step": 3675
},
{
"entropy": 6.003905582427978,
"epoch": 3.161581435324452,
"grad_norm": 0.94140625,
"learning_rate": 0.00043308488728647127,
"loss": 5.5183,
"mean_token_accuracy": 0.18625610321760178,
"num_tokens": 6686727.0,
"step": 3680
},
{
"entropy": 5.899046134948731,
"epoch": 3.1658788139235066,
"grad_norm": 1.1796875,
"learning_rate": 0.0004328481248599882,
"loss": 5.4279,
"mean_token_accuracy": 0.196131394803524,
"num_tokens": 6696116.0,
"step": 3685
},
{
"entropy": 5.968793296813965,
"epoch": 3.170176192522561,
"grad_norm": 1.078125,
"learning_rate": 0.0004326110177562918,
"loss": 5.5429,
"mean_token_accuracy": 0.18541710525751115,
"num_tokens": 6704640.0,
"step": 3690
},
{
"entropy": 5.916857767105102,
"epoch": 3.1744735711216157,
"grad_norm": 1.203125,
"learning_rate": 0.00043237356649312926,
"loss": 5.3912,
"mean_token_accuracy": 0.20387934297323226,
"num_tokens": 6713663.0,
"step": 3695
},
{
"entropy": 5.932327318191528,
"epoch": 3.17877094972067,
"grad_norm": 1.0625,
"learning_rate": 0.0004321357715889991,
"loss": 5.526,
"mean_token_accuracy": 0.1858012244105339,
"num_tokens": 6722965.0,
"step": 3700
},
{
"entropy": 5.9681384563446045,
"epoch": 3.1830683283197247,
"grad_norm": 1.140625,
"learning_rate": 0.0004318976335631505,
"loss": 5.4893,
"mean_token_accuracy": 0.19365193992853164,
"num_tokens": 6732776.0,
"step": 3705
},
{
"entropy": 5.964018297195435,
"epoch": 3.1873657069187797,
"grad_norm": 1.046875,
"learning_rate": 0.00043165915293558155,
"loss": 5.4682,
"mean_token_accuracy": 0.19091420918703078,
"num_tokens": 6741309.0,
"step": 3710
},
{
"entropy": 5.944598436355591,
"epoch": 3.1916630855178343,
"grad_norm": 1.0546875,
"learning_rate": 0.0004314203302270388,
"loss": 5.5274,
"mean_token_accuracy": 0.18904216587543488,
"num_tokens": 6750584.0,
"step": 3715
},
{
"entropy": 5.97039303779602,
"epoch": 3.195960464116889,
"grad_norm": 1.1640625,
"learning_rate": 0.0004311811659590154,
"loss": 5.5007,
"mean_token_accuracy": 0.1887460470199585,
"num_tokens": 6759344.0,
"step": 3720
},
{
"entropy": 6.059423017501831,
"epoch": 3.2002578427159434,
"grad_norm": 0.87890625,
"learning_rate": 0.0004309416606537507,
"loss": 5.6563,
"mean_token_accuracy": 0.18009912818670273,
"num_tokens": 6770345.0,
"step": 3725
},
{
"entropy": 6.00485258102417,
"epoch": 3.204555221314998,
"grad_norm": 1.125,
"learning_rate": 0.00043070181483422843,
"loss": 5.5411,
"mean_token_accuracy": 0.1854734942317009,
"num_tokens": 6779991.0,
"step": 3730
},
{
"entropy": 5.88880934715271,
"epoch": 3.2088525999140525,
"grad_norm": 1.1953125,
"learning_rate": 0.000430461629024176,
"loss": 5.4983,
"mean_token_accuracy": 0.19071830958127975,
"num_tokens": 6788972.0,
"step": 3735
},
{
"entropy": 5.885913467407226,
"epoch": 3.213149978513107,
"grad_norm": 1.1640625,
"learning_rate": 0.0004302211037480634,
"loss": 5.4111,
"mean_token_accuracy": 0.19531920850276946,
"num_tokens": 6796967.0,
"step": 3740
},
{
"entropy": 5.912165975570678,
"epoch": 3.2174473571121616,
"grad_norm": 1.234375,
"learning_rate": 0.0004299802395311015,
"loss": 5.5182,
"mean_token_accuracy": 0.18958668708801268,
"num_tokens": 6805961.0,
"step": 3745
},
{
"entropy": 5.875810194015503,
"epoch": 3.221744735711216,
"grad_norm": 1.234375,
"learning_rate": 0.0004297390368992414,
"loss": 5.4233,
"mean_token_accuracy": 0.19228914380073547,
"num_tokens": 6814657.0,
"step": 3750
},
{
"entropy": 5.940344333648682,
"epoch": 3.2260421143102707,
"grad_norm": 1.2265625,
"learning_rate": 0.00042949749637917353,
"loss": 5.4718,
"mean_token_accuracy": 0.1930217519402504,
"num_tokens": 6823095.0,
"step": 3755
},
{
"entropy": 5.956659030914307,
"epoch": 3.230339492909325,
"grad_norm": 0.99609375,
"learning_rate": 0.0004292556184983256,
"loss": 5.4872,
"mean_token_accuracy": 0.19027772098779677,
"num_tokens": 6832195.0,
"step": 3760
},
{
"entropy": 6.009495830535888,
"epoch": 3.2346368715083798,
"grad_norm": 1.15625,
"learning_rate": 0.0004290134037848623,
"loss": 5.6084,
"mean_token_accuracy": 0.18570149838924407,
"num_tokens": 6840922.0,
"step": 3765
},
{
"entropy": 5.964060831069946,
"epoch": 3.2389342501074343,
"grad_norm": 1.171875,
"learning_rate": 0.00042877085276768386,
"loss": 5.46,
"mean_token_accuracy": 0.19570931494235994,
"num_tokens": 6849182.0,
"step": 3770
},
{
"entropy": 5.94105863571167,
"epoch": 3.243231628706489,
"grad_norm": 1.1015625,
"learning_rate": 0.00042852796597642455,
"loss": 5.4551,
"mean_token_accuracy": 0.19768441170454026,
"num_tokens": 6857932.0,
"step": 3775
},
{
"entropy": 5.997882509231568,
"epoch": 3.247529007305544,
"grad_norm": 1.0859375,
"learning_rate": 0.0004282847439414522,
"loss": 5.616,
"mean_token_accuracy": 0.17659982144832612,
"num_tokens": 6867283.0,
"step": 3780
},
{
"entropy": 6.0180786609649655,
"epoch": 3.2518263859045984,
"grad_norm": 1.078125,
"learning_rate": 0.0004280411871938664,
"loss": 5.5648,
"mean_token_accuracy": 0.18943356424570085,
"num_tokens": 6876123.0,
"step": 3785
},
{
"entropy": 6.006447601318359,
"epoch": 3.256123764503653,
"grad_norm": 1.1796875,
"learning_rate": 0.0004277972962654979,
"loss": 5.5082,
"mean_token_accuracy": 0.18536664098501204,
"num_tokens": 6885239.0,
"step": 3790
},
{
"entropy": 5.930108880996704,
"epoch": 3.2604211431027075,
"grad_norm": 1.0859375,
"learning_rate": 0.0004275530716889069,
"loss": 5.5573,
"mean_token_accuracy": 0.18274880945682526,
"num_tokens": 6895061.0,
"step": 3795
},
{
"entropy": 5.983970260620117,
"epoch": 3.264718521701762,
"grad_norm": 1.2265625,
"learning_rate": 0.0004273085139973822,
"loss": 5.5993,
"mean_token_accuracy": 0.177694109082222,
"num_tokens": 6903828.0,
"step": 3800
},
{
"entropy": 6.014524221420288,
"epoch": 3.2690159003008166,
"grad_norm": 1.140625,
"learning_rate": 0.0004270636237249401,
"loss": 5.5151,
"mean_token_accuracy": 0.18856608420610427,
"num_tokens": 6912805.0,
"step": 3805
},
{
"entropy": 5.941100168228149,
"epoch": 3.273313278899871,
"grad_norm": 1.1015625,
"learning_rate": 0.00042681840140632314,
"loss": 5.5616,
"mean_token_accuracy": 0.18302462846040726,
"num_tokens": 6922165.0,
"step": 3810
},
{
"entropy": 5.997183227539063,
"epoch": 3.2776106574989257,
"grad_norm": 1.0859375,
"learning_rate": 0.0004265728475769989,
"loss": 5.5322,
"mean_token_accuracy": 0.18632204383611678,
"num_tokens": 6931677.0,
"step": 3815
},
{
"entropy": 5.975349044799804,
"epoch": 3.28190803609798,
"grad_norm": 0.97265625,
"learning_rate": 0.0004263269627731586,
"loss": 5.4952,
"mean_token_accuracy": 0.19264112412929535,
"num_tokens": 6940486.0,
"step": 3820
},
{
"entropy": 5.868766260147095,
"epoch": 3.2862054146970348,
"grad_norm": 1.1015625,
"learning_rate": 0.0004260807475317164,
"loss": 5.51,
"mean_token_accuracy": 0.1856775924563408,
"num_tokens": 6948990.0,
"step": 3825
},
{
"entropy": 6.010857200622558,
"epoch": 3.2905027932960893,
"grad_norm": 1.0234375,
"learning_rate": 0.0004258342023903081,
"loss": 5.636,
"mean_token_accuracy": 0.17837173044681548,
"num_tokens": 6959311.0,
"step": 3830
},
{
"entropy": 6.02067198753357,
"epoch": 3.294800171895144,
"grad_norm": 1.125,
"learning_rate": 0.00042558732788728975,
"loss": 5.4186,
"mean_token_accuracy": 0.19980644732713698,
"num_tokens": 6968619.0,
"step": 3835
},
{
"entropy": 5.891939735412597,
"epoch": 3.2990975504941984,
"grad_norm": 1.09375,
"learning_rate": 0.00042534012456173643,
"loss": 5.4745,
"mean_token_accuracy": 0.1930858761072159,
"num_tokens": 6977469.0,
"step": 3840
},
{
"entropy": 5.908893871307373,
"epoch": 3.303394929093253,
"grad_norm": 1.2421875,
"learning_rate": 0.00042509259295344157,
"loss": 5.4637,
"mean_token_accuracy": 0.18524923622608186,
"num_tokens": 6986772.0,
"step": 3845
},
{
"entropy": 5.965682172775269,
"epoch": 3.3076923076923075,
"grad_norm": 1.2578125,
"learning_rate": 0.00042484473360291514,
"loss": 5.4722,
"mean_token_accuracy": 0.1818112000823021,
"num_tokens": 6993937.0,
"step": 3850
},
{
"entropy": 5.878727436065674,
"epoch": 3.311989686291362,
"grad_norm": 1.1328125,
"learning_rate": 0.00042459654705138294,
"loss": 5.5336,
"mean_token_accuracy": 0.19061464071273804,
"num_tokens": 7003222.0,
"step": 3855
},
{
"entropy": 5.907388973236084,
"epoch": 3.316287064890417,
"grad_norm": 1.109375,
"learning_rate": 0.0004243480338407853,
"loss": 5.5021,
"mean_token_accuracy": 0.19867320060729982,
"num_tokens": 7012055.0,
"step": 3860
},
{
"entropy": 5.968272018432617,
"epoch": 3.3205844434894716,
"grad_norm": 1.078125,
"learning_rate": 0.0004240991945137755,
"loss": 5.4952,
"mean_token_accuracy": 0.1932666853070259,
"num_tokens": 7021036.0,
"step": 3865
},
{
"entropy": 5.909445858001709,
"epoch": 3.324881822088526,
"grad_norm": 1.1328125,
"learning_rate": 0.00042385002961371944,
"loss": 5.4787,
"mean_token_accuracy": 0.194594843685627,
"num_tokens": 7030450.0,
"step": 3870
},
{
"entropy": 6.005906677246093,
"epoch": 3.3291792006875807,
"grad_norm": 1.1640625,
"learning_rate": 0.0004236005396846935,
"loss": 5.5873,
"mean_token_accuracy": 0.18787091970443726,
"num_tokens": 7039740.0,
"step": 3875
},
{
"entropy": 6.0099263191223145,
"epoch": 3.333476579286635,
"grad_norm": 1.125,
"learning_rate": 0.00042335072527148406,
"loss": 5.5642,
"mean_token_accuracy": 0.18891336619853974,
"num_tokens": 7050430.0,
"step": 3880
},
{
"entropy": 5.886811065673828,
"epoch": 3.3377739578856898,
"grad_norm": 1.25,
"learning_rate": 0.0004231005869195859,
"loss": 5.5523,
"mean_token_accuracy": 0.18664977699518204,
"num_tokens": 7059477.0,
"step": 3885
},
{
"entropy": 5.945472669601441,
"epoch": 3.3420713364847443,
"grad_norm": 1.4296875,
"learning_rate": 0.0004228501251752011,
"loss": 5.4871,
"mean_token_accuracy": 0.19109417051076888,
"num_tokens": 7067805.0,
"step": 3890
},
{
"entropy": 5.942922163009643,
"epoch": 3.346368715083799,
"grad_norm": 1.0625,
"learning_rate": 0.00042259934058523814,
"loss": 5.4972,
"mean_token_accuracy": 0.18601811528205872,
"num_tokens": 7077606.0,
"step": 3895
},
{
"entropy": 5.984446573257446,
"epoch": 3.3506660936828534,
"grad_norm": 1.15625,
"learning_rate": 0.00042234823369731027,
"loss": 5.448,
"mean_token_accuracy": 0.19036031365394593,
"num_tokens": 7085647.0,
"step": 3900
},
{
"entropy": 5.861058759689331,
"epoch": 3.354963472281908,
"grad_norm": 1.1171875,
"learning_rate": 0.00042209680505973465,
"loss": 5.4762,
"mean_token_accuracy": 0.19057320803403854,
"num_tokens": 7095298.0,
"step": 3905
},
{
"entropy": 5.868588638305664,
"epoch": 3.3592608508809625,
"grad_norm": 1.0546875,
"learning_rate": 0.0004218450552215308,
"loss": 5.5542,
"mean_token_accuracy": 0.19133240431547166,
"num_tokens": 7105207.0,
"step": 3910
},
{
"entropy": 5.973352527618408,
"epoch": 3.363558229480017,
"grad_norm": 1.0390625,
"learning_rate": 0.0004215929847324199,
"loss": 5.6046,
"mean_token_accuracy": 0.18282657265663146,
"num_tokens": 7114833.0,
"step": 3915
},
{
"entropy": 6.0064185619354244,
"epoch": 3.3678556080790716,
"grad_norm": 1.1875,
"learning_rate": 0.000421340594142823,
"loss": 5.4227,
"mean_token_accuracy": 0.20140644013881684,
"num_tokens": 7123608.0,
"step": 3920
},
{
"entropy": 5.875625896453857,
"epoch": 3.3721529866781266,
"grad_norm": 1.21875,
"learning_rate": 0.00042108788400386035,
"loss": 5.4824,
"mean_token_accuracy": 0.19125625491142273,
"num_tokens": 7132250.0,
"step": 3925
},
{
"entropy": 5.91867356300354,
"epoch": 3.376450365277181,
"grad_norm": 0.99609375,
"learning_rate": 0.0004208348548673498,
"loss": 5.5796,
"mean_token_accuracy": 0.18955173790454866,
"num_tokens": 7142086.0,
"step": 3930
},
{
"entropy": 5.989838075637818,
"epoch": 3.3807477438762357,
"grad_norm": 1.125,
"learning_rate": 0.000420581507285806,
"loss": 5.525,
"mean_token_accuracy": 0.1797061249613762,
"num_tokens": 7152434.0,
"step": 3935
},
{
"entropy": 5.870218181610108,
"epoch": 3.38504512247529,
"grad_norm": 1.046875,
"learning_rate": 0.0004203278418124386,
"loss": 5.4707,
"mean_token_accuracy": 0.19644346386194228,
"num_tokens": 7163041.0,
"step": 3940
},
{
"entropy": 5.865656518936158,
"epoch": 3.3893425010743448,
"grad_norm": 1.0390625,
"learning_rate": 0.0004200738590011518,
"loss": 5.4512,
"mean_token_accuracy": 0.19743987321853637,
"num_tokens": 7171875.0,
"step": 3945
},
{
"entropy": 5.906575489044189,
"epoch": 3.3936398796733993,
"grad_norm": 1.1484375,
"learning_rate": 0.00041981955940654245,
"loss": 5.5679,
"mean_token_accuracy": 0.18974538147449493,
"num_tokens": 7180803.0,
"step": 3950
},
{
"entropy": 5.951998472213745,
"epoch": 3.397937258272454,
"grad_norm": 1.1171875,
"learning_rate": 0.0004195649435838992,
"loss": 5.5884,
"mean_token_accuracy": 0.17947447150945664,
"num_tokens": 7190661.0,
"step": 3955
},
{
"entropy": 5.871505403518677,
"epoch": 3.4022346368715084,
"grad_norm": 1.09375,
"learning_rate": 0.0004193100120892013,
"loss": 5.418,
"mean_token_accuracy": 0.19889674335718155,
"num_tokens": 7199357.0,
"step": 3960
},
{
"entropy": 5.934350156784058,
"epoch": 3.406532015470563,
"grad_norm": 0.99609375,
"learning_rate": 0.0004190547654791172,
"loss": 5.597,
"mean_token_accuracy": 0.18219801187515258,
"num_tokens": 7209856.0,
"step": 3965
},
{
"entropy": 5.969940042495727,
"epoch": 3.4108293940696175,
"grad_norm": 1.2265625,
"learning_rate": 0.00041879920431100347,
"loss": 5.5648,
"mean_token_accuracy": 0.17899948358535767,
"num_tokens": 7218778.0,
"step": 3970
},
{
"entropy": 5.924646472930908,
"epoch": 3.415126772668672,
"grad_norm": 1.1171875,
"learning_rate": 0.0004185433291429036,
"loss": 5.5802,
"mean_token_accuracy": 0.18834476321935653,
"num_tokens": 7228442.0,
"step": 3975
},
{
"entropy": 5.978606748580932,
"epoch": 3.4194241512677266,
"grad_norm": 1.171875,
"learning_rate": 0.00041828714053354665,
"loss": 5.5653,
"mean_token_accuracy": 0.18292482793331147,
"num_tokens": 7238724.0,
"step": 3980
},
{
"entropy": 5.850194692611694,
"epoch": 3.423721529866781,
"grad_norm": 1.078125,
"learning_rate": 0.0004180306390423462,
"loss": 5.5145,
"mean_token_accuracy": 0.19443774223327637,
"num_tokens": 7247844.0,
"step": 3985
},
{
"entropy": 5.919923639297485,
"epoch": 3.4280189084658357,
"grad_norm": 1.0703125,
"learning_rate": 0.00041777382522939884,
"loss": 5.5776,
"mean_token_accuracy": 0.1839929461479187,
"num_tokens": 7257260.0,
"step": 3990
},
{
"entropy": 5.963938665390015,
"epoch": 3.4323162870648902,
"grad_norm": 0.9921875,
"learning_rate": 0.00041751669965548344,
"loss": 5.5802,
"mean_token_accuracy": 0.1809097185730934,
"num_tokens": 7266890.0,
"step": 3995
},
{
"entropy": 5.974624681472778,
"epoch": 3.4366136656639448,
"grad_norm": 1.1484375,
"learning_rate": 0.00041725926288205945,
"loss": 5.598,
"mean_token_accuracy": 0.17821378856897355,
"num_tokens": 7276114.0,
"step": 4000
},
{
"epoch": 3.4366136656639448,
"eval_entropy": 5.73526575543859,
"eval_loss": 6.016810417175293,
"eval_mean_token_accuracy": 0.17057843910748358,
"eval_num_tokens": 7276114.0,
"eval_runtime": 2.0499,
"eval_samples_per_second": 1731.264,
"eval_steps_per_second": 216.591,
"step": 4000
},
{
"entropy": 5.9616344451904295,
"epoch": 3.4409110442629998,
"grad_norm": 1.078125,
"learning_rate": 0.0004170015154712658,
"loss": 5.548,
"mean_token_accuracy": 0.1874366208910942,
"num_tokens": 7284426.0,
"step": 4005
},
{
"entropy": 5.910069179534912,
"epoch": 3.4452084228620543,
"grad_norm": 1.015625,
"learning_rate": 0.00041674345798591993,
"loss": 5.5843,
"mean_token_accuracy": 0.18420783281326295,
"num_tokens": 7294813.0,
"step": 4010
},
{
"entropy": 5.961581373214722,
"epoch": 3.449505801461109,
"grad_norm": 1.0546875,
"learning_rate": 0.0004164850909895161,
"loss": 5.5619,
"mean_token_accuracy": 0.18809896260499953,
"num_tokens": 7304655.0,
"step": 4015
},
{
"entropy": 5.849625158309936,
"epoch": 3.4538031800601634,
"grad_norm": 1.0,
"learning_rate": 0.0004162264150462247,
"loss": 5.5155,
"mean_token_accuracy": 0.1865479052066803,
"num_tokens": 7313610.0,
"step": 4020
},
{
"entropy": 5.980514192581177,
"epoch": 3.458100558659218,
"grad_norm": 1.1171875,
"learning_rate": 0.00041596743072089065,
"loss": 5.5535,
"mean_token_accuracy": 0.19074880033731462,
"num_tokens": 7322243.0,
"step": 4025
},
{
"entropy": 6.062830209732056,
"epoch": 3.4623979372582725,
"grad_norm": 1.1953125,
"learning_rate": 0.000415708138579032,
"loss": 5.5229,
"mean_token_accuracy": 0.17943777292966842,
"num_tokens": 7331040.0,
"step": 4030
},
{
"entropy": 5.886963891983032,
"epoch": 3.466695315857327,
"grad_norm": 1.1171875,
"learning_rate": 0.00041544853918683923,
"loss": 5.5948,
"mean_token_accuracy": 0.1817588433623314,
"num_tokens": 7340771.0,
"step": 4035
},
{
"entropy": 5.9117542743682865,
"epoch": 3.4709926944563816,
"grad_norm": 1.0625,
"learning_rate": 0.0004151886331111737,
"loss": 5.6421,
"mean_token_accuracy": 0.18092233091592788,
"num_tokens": 7349960.0,
"step": 4040
},
{
"entropy": 5.899527883529663,
"epoch": 3.475290073055436,
"grad_norm": 1.1796875,
"learning_rate": 0.00041492842091956646,
"loss": 5.4649,
"mean_token_accuracy": 0.1919792726635933,
"num_tokens": 7357983.0,
"step": 4045
},
{
"entropy": 5.988178062438965,
"epoch": 3.4795874516544907,
"grad_norm": 1.1015625,
"learning_rate": 0.0004146679031802167,
"loss": 5.591,
"mean_token_accuracy": 0.19019764959812163,
"num_tokens": 7366814.0,
"step": 4050
},
{
"entropy": 5.9325186252594,
"epoch": 3.4838848302535452,
"grad_norm": 1.1953125,
"learning_rate": 0.00041440708046199123,
"loss": 5.452,
"mean_token_accuracy": 0.19600227922201158,
"num_tokens": 7374773.0,
"step": 4055
},
{
"entropy": 5.890796184539795,
"epoch": 3.4881822088525998,
"grad_norm": 1.0625,
"learning_rate": 0.0004141459533344226,
"loss": 5.5562,
"mean_token_accuracy": 0.1825706109404564,
"num_tokens": 7383937.0,
"step": 4060
},
{
"entropy": 5.957454347610474,
"epoch": 3.4924795874516543,
"grad_norm": 1.1015625,
"learning_rate": 0.00041388452236770795,
"loss": 5.5305,
"mean_token_accuracy": 0.18163443803787233,
"num_tokens": 7392577.0,
"step": 4065
},
{
"entropy": 5.882272720336914,
"epoch": 3.4967769660507093,
"grad_norm": 1.125,
"learning_rate": 0.00041362278813270823,
"loss": 5.4193,
"mean_token_accuracy": 0.20885447710752486,
"num_tokens": 7401473.0,
"step": 4070
},
{
"entropy": 5.992699241638183,
"epoch": 3.501074344649764,
"grad_norm": 1.0234375,
"learning_rate": 0.00041336075120094616,
"loss": 5.6214,
"mean_token_accuracy": 0.17333737909793853,
"num_tokens": 7410831.0,
"step": 4075
},
{
"entropy": 6.0088804244995115,
"epoch": 3.5053717232488184,
"grad_norm": 1.0390625,
"learning_rate": 0.00041309841214460586,
"loss": 5.6193,
"mean_token_accuracy": 0.18231521993875505,
"num_tokens": 7421563.0,
"step": 4080
},
{
"entropy": 5.887757968902588,
"epoch": 3.509669101847873,
"grad_norm": 1.1171875,
"learning_rate": 0.0004128357715365309,
"loss": 5.5266,
"mean_token_accuracy": 0.191811466217041,
"num_tokens": 7430174.0,
"step": 4085
},
{
"entropy": 5.899808502197265,
"epoch": 3.5139664804469275,
"grad_norm": 1.0703125,
"learning_rate": 0.00041257282995022345,
"loss": 5.4928,
"mean_token_accuracy": 0.1953655794262886,
"num_tokens": 7439034.0,
"step": 4090
},
{
"entropy": 5.912106704711914,
"epoch": 3.518263859045982,
"grad_norm": 1.359375,
"learning_rate": 0.0004123095879598426,
"loss": 5.5195,
"mean_token_accuracy": 0.18628203123807907,
"num_tokens": 7447663.0,
"step": 4095
},
{
"entropy": 5.960794830322266,
"epoch": 3.5225612376450366,
"grad_norm": 1.0625,
"learning_rate": 0.00041204604614020397,
"loss": 5.6081,
"mean_token_accuracy": 0.17660218775272368,
"num_tokens": 7456615.0,
"step": 4100
},
{
"entropy": 5.996097373962402,
"epoch": 3.526858616244091,
"grad_norm": 1.09375,
"learning_rate": 0.0004117822050667773,
"loss": 5.6382,
"mean_token_accuracy": 0.18591019809246062,
"num_tokens": 7466203.0,
"step": 4105
},
{
"entropy": 5.9893563747406,
"epoch": 3.5311559948431457,
"grad_norm": 1.09375,
"learning_rate": 0.00041151806531568617,
"loss": 5.5802,
"mean_token_accuracy": 0.18335504829883575,
"num_tokens": 7475411.0,
"step": 4110
},
{
"entropy": 5.906181669235229,
"epoch": 3.5354533734422002,
"grad_norm": 1.0390625,
"learning_rate": 0.00041125362746370625,
"loss": 5.6004,
"mean_token_accuracy": 0.18042974472045897,
"num_tokens": 7484965.0,
"step": 4115
},
{
"entropy": 5.995426511764526,
"epoch": 3.5397507520412548,
"grad_norm": 1.09375,
"learning_rate": 0.0004109888920882639,
"loss": 5.5249,
"mean_token_accuracy": 0.19167679399251938,
"num_tokens": 7494240.0,
"step": 4120
},
{
"entropy": 5.949258327484131,
"epoch": 3.5440481306403093,
"grad_norm": 1.0625,
"learning_rate": 0.0004107238597674356,
"loss": 5.5586,
"mean_token_accuracy": 0.18614224940538407,
"num_tokens": 7503560.0,
"step": 4125
},
{
"entropy": 5.863224458694458,
"epoch": 3.548345509239364,
"grad_norm": 1.0078125,
"learning_rate": 0.000410458531079946,
"loss": 5.4812,
"mean_token_accuracy": 0.19368503391742706,
"num_tokens": 7512650.0,
"step": 4130
},
{
"entropy": 5.9348499298095705,
"epoch": 3.5526428878384184,
"grad_norm": 1.1640625,
"learning_rate": 0.0004101929066051668,
"loss": 5.599,
"mean_token_accuracy": 0.1838935688138008,
"num_tokens": 7521864.0,
"step": 4135
},
{
"entropy": 5.878848266601563,
"epoch": 3.556940266437473,
"grad_norm": 1.1171875,
"learning_rate": 0.0004099269869231157,
"loss": 5.496,
"mean_token_accuracy": 0.19109761267900466,
"num_tokens": 7531013.0,
"step": 4140
},
{
"entropy": 5.948237895965576,
"epoch": 3.5612376450365275,
"grad_norm": 1.046875,
"learning_rate": 0.00040966077261445495,
"loss": 5.503,
"mean_token_accuracy": 0.1837790846824646,
"num_tokens": 7539959.0,
"step": 4145
},
{
"entropy": 6.009708642959595,
"epoch": 3.565535023635582,
"grad_norm": 1.28125,
"learning_rate": 0.0004093942642604904,
"loss": 5.4789,
"mean_token_accuracy": 0.19033878594636916,
"num_tokens": 7548354.0,
"step": 4150
},
{
"entropy": 5.921438217163086,
"epoch": 3.5698324022346366,
"grad_norm": 1.03125,
"learning_rate": 0.00040912746244316944,
"loss": 5.6032,
"mean_token_accuracy": 0.18626796901226045,
"num_tokens": 7558321.0,
"step": 4155
},
{
"entropy": 5.902405214309693,
"epoch": 3.5741297808336916,
"grad_norm": 1.046875,
"learning_rate": 0.00040886036774508095,
"loss": 5.4904,
"mean_token_accuracy": 0.18896115869283675,
"num_tokens": 7567889.0,
"step": 4160
},
{
"entropy": 5.9710170269012455,
"epoch": 3.578427159432746,
"grad_norm": 1.0859375,
"learning_rate": 0.0004085929807494527,
"loss": 5.5489,
"mean_token_accuracy": 0.1867457315325737,
"num_tokens": 7576752.0,
"step": 4165
},
{
"entropy": 5.900749206542969,
"epoch": 3.5827245380318007,
"grad_norm": 1.015625,
"learning_rate": 0.0004083253020401512,
"loss": 5.4498,
"mean_token_accuracy": 0.19864338636398315,
"num_tokens": 7585413.0,
"step": 4170
},
{
"entropy": 5.9034223556518555,
"epoch": 3.5870219166308552,
"grad_norm": 1.234375,
"learning_rate": 0.0004080573322016797,
"loss": 5.4085,
"mean_token_accuracy": 0.19775232523679734,
"num_tokens": 7593966.0,
"step": 4175
},
{
"entropy": 5.905447053909302,
"epoch": 3.59131929522991,
"grad_norm": 1.09375,
"learning_rate": 0.0004077890718191773,
"loss": 5.4219,
"mean_token_accuracy": 0.19463559091091157,
"num_tokens": 7602746.0,
"step": 4180
},
{
"entropy": 5.888575172424316,
"epoch": 3.5956166738289643,
"grad_norm": 1.15625,
"learning_rate": 0.00040752052147841733,
"loss": 5.485,
"mean_token_accuracy": 0.18464642763137817,
"num_tokens": 7611245.0,
"step": 4185
},
{
"entropy": 5.9167564868927,
"epoch": 3.599914052428019,
"grad_norm": 1.0234375,
"learning_rate": 0.0004072516817658065,
"loss": 5.5085,
"mean_token_accuracy": 0.19180469512939452,
"num_tokens": 7620234.0,
"step": 4190
},
{
"entropy": 5.9288722515106205,
"epoch": 3.6042114310270734,
"grad_norm": 1.140625,
"learning_rate": 0.0004069825532683831,
"loss": 5.5362,
"mean_token_accuracy": 0.19008248895406724,
"num_tokens": 7629794.0,
"step": 4195
},
{
"entropy": 5.883164501190185,
"epoch": 3.608508809626128,
"grad_norm": 1.046875,
"learning_rate": 0.00040671313657381645,
"loss": 5.4768,
"mean_token_accuracy": 0.19734710156917573,
"num_tokens": 7639497.0,
"step": 4200
},
{
"entropy": 5.833352327346802,
"epoch": 3.6128061882251825,
"grad_norm": 1.09375,
"learning_rate": 0.00040644343227040473,
"loss": 5.4305,
"mean_token_accuracy": 0.192035111784935,
"num_tokens": 7647647.0,
"step": 4205
},
{
"entropy": 5.882366132736206,
"epoch": 3.617103566824237,
"grad_norm": 1.046875,
"learning_rate": 0.0004061734409470745,
"loss": 5.6069,
"mean_token_accuracy": 0.18727213144302368,
"num_tokens": 7657988.0,
"step": 4210
},
{
"entropy": 5.946136331558227,
"epoch": 3.621400945423292,
"grad_norm": 1.078125,
"learning_rate": 0.0004059031631933788,
"loss": 5.5226,
"mean_token_accuracy": 0.18810444325208664,
"num_tokens": 7667498.0,
"step": 4215
},
{
"entropy": 5.928274488449096,
"epoch": 3.6256983240223466,
"grad_norm": 1.0625,
"learning_rate": 0.00040563259959949615,
"loss": 5.6612,
"mean_token_accuracy": 0.17574882060289382,
"num_tokens": 7677386.0,
"step": 4220
},
{
"entropy": 6.023345851898194,
"epoch": 3.629995702621401,
"grad_norm": 1.0546875,
"learning_rate": 0.0004053617507562295,
"loss": 5.4993,
"mean_token_accuracy": 0.1883416697382927,
"num_tokens": 7686643.0,
"step": 4225
},
{
"entropy": 5.927192258834839,
"epoch": 3.6342930812204557,
"grad_norm": 1.2265625,
"learning_rate": 0.00040509061725500426,
"loss": 5.5344,
"mean_token_accuracy": 0.18648910969495774,
"num_tokens": 7695089.0,
"step": 4230
},
{
"entropy": 5.855798292160034,
"epoch": 3.6385904598195102,
"grad_norm": 1.078125,
"learning_rate": 0.0004048191996878677,
"loss": 5.5169,
"mean_token_accuracy": 0.18715409338474273,
"num_tokens": 7703854.0,
"step": 4235
},
{
"entropy": 5.873931074142456,
"epoch": 3.642887838418565,
"grad_norm": 1.0859375,
"learning_rate": 0.00040454749864748734,
"loss": 5.4623,
"mean_token_accuracy": 0.1924944058060646,
"num_tokens": 7712903.0,
"step": 4240
},
{
"entropy": 5.9368483543396,
"epoch": 3.6471852170176193,
"grad_norm": 1.0,
"learning_rate": 0.0004042755147271496,
"loss": 5.4073,
"mean_token_accuracy": 0.19578560292720795,
"num_tokens": 7721701.0,
"step": 4245
},
{
"entropy": 5.814197635650634,
"epoch": 3.651482595616674,
"grad_norm": 0.9921875,
"learning_rate": 0.0004040032485207587,
"loss": 5.5316,
"mean_token_accuracy": 0.18780674338340758,
"num_tokens": 7731318.0,
"step": 4250
},
{
"entropy": 5.960366725921631,
"epoch": 3.6557799742157284,
"grad_norm": 0.9921875,
"learning_rate": 0.0004037307006228352,
"loss": 5.4563,
"mean_token_accuracy": 0.19457500725984572,
"num_tokens": 7740413.0,
"step": 4255
},
{
"entropy": 5.894597911834717,
"epoch": 3.660077352814783,
"grad_norm": 1.0703125,
"learning_rate": 0.0004034578716285147,
"loss": 5.4362,
"mean_token_accuracy": 0.19790690541267394,
"num_tokens": 7749054.0,
"step": 4260
},
{
"entropy": 5.855839014053345,
"epoch": 3.6643747314138375,
"grad_norm": 1.2109375,
"learning_rate": 0.0004031847621335467,
"loss": 5.4711,
"mean_token_accuracy": 0.19566139876842498,
"num_tokens": 7757366.0,
"step": 4265
},
{
"entropy": 5.889632892608643,
"epoch": 3.668672110012892,
"grad_norm": 1.2578125,
"learning_rate": 0.0004029113727342933,
"loss": 5.502,
"mean_token_accuracy": 0.19420932680368425,
"num_tokens": 7766471.0,
"step": 4270
},
{
"entropy": 5.851235818862915,
"epoch": 3.6729694886119466,
"grad_norm": 1.09375,
"learning_rate": 0.00040263770402772746,
"loss": 5.4897,
"mean_token_accuracy": 0.1871536925435066,
"num_tokens": 7775920.0,
"step": 4275
},
{
"entropy": 5.934095287322998,
"epoch": 3.677266867211001,
"grad_norm": 1.1953125,
"learning_rate": 0.0004023637566114325,
"loss": 5.5382,
"mean_token_accuracy": 0.1889081373810768,
"num_tokens": 7784530.0,
"step": 4280
},
{
"entropy": 5.93968391418457,
"epoch": 3.6815642458100557,
"grad_norm": 1.09375,
"learning_rate": 0.0004020895310835999,
"loss": 5.4721,
"mean_token_accuracy": 0.1917961835861206,
"num_tokens": 7793656.0,
"step": 4285
},
{
"entropy": 5.9000050067901615,
"epoch": 3.6858616244091102,
"grad_norm": 1.0234375,
"learning_rate": 0.00040181502804302865,
"loss": 5.496,
"mean_token_accuracy": 0.1914617270231247,
"num_tokens": 7802185.0,
"step": 4290
},
{
"entropy": 5.8633284091949465,
"epoch": 3.690159003008165,
"grad_norm": 1.171875,
"learning_rate": 0.00040154024808912377,
"loss": 5.483,
"mean_token_accuracy": 0.19215791970491408,
"num_tokens": 7810345.0,
"step": 4295
},
{
"entropy": 5.897251462936401,
"epoch": 3.6944563816072193,
"grad_norm": 1.15625,
"learning_rate": 0.0004012651918218947,
"loss": 5.5314,
"mean_token_accuracy": 0.1837465301156044,
"num_tokens": 7818998.0,
"step": 4300
},
{
"entropy": 5.959916353225708,
"epoch": 3.6987537602062743,
"grad_norm": 1.0703125,
"learning_rate": 0.0004009898598419544,
"loss": 5.6474,
"mean_token_accuracy": 0.17348452657461166,
"num_tokens": 7828638.0,
"step": 4305
},
{
"entropy": 5.956097745895386,
"epoch": 3.703051138805329,
"grad_norm": 1.171875,
"learning_rate": 0.000400714252750518,
"loss": 5.622,
"mean_token_accuracy": 0.1802245110273361,
"num_tokens": 7838812.0,
"step": 4310
},
{
"entropy": 5.987325286865234,
"epoch": 3.7073485174043834,
"grad_norm": 1.1171875,
"learning_rate": 0.0004004383711494011,
"loss": 5.5288,
"mean_token_accuracy": 0.19345352202653884,
"num_tokens": 7847458.0,
"step": 4315
},
{
"entropy": 5.95421142578125,
"epoch": 3.711645896003438,
"grad_norm": 1.03125,
"learning_rate": 0.0004001622156410189,
"loss": 5.5496,
"mean_token_accuracy": 0.18483526557683944,
"num_tokens": 7856553.0,
"step": 4320
},
{
"entropy": 5.850839233398437,
"epoch": 3.7159432746024925,
"grad_norm": 1.0546875,
"learning_rate": 0.00039988578682838467,
"loss": 5.4869,
"mean_token_accuracy": 0.18971165865659714,
"num_tokens": 7864788.0,
"step": 4325
},
{
"entropy": 5.903116130828858,
"epoch": 3.720240653201547,
"grad_norm": 1.0390625,
"learning_rate": 0.00039960908531510843,
"loss": 5.484,
"mean_token_accuracy": 0.19329809993505478,
"num_tokens": 7873850.0,
"step": 4330
},
{
"entropy": 5.974154853820801,
"epoch": 3.7245380318006016,
"grad_norm": 1.1484375,
"learning_rate": 0.0003993321117053956,
"loss": 5.6039,
"mean_token_accuracy": 0.18225040286779404,
"num_tokens": 7882775.0,
"step": 4335
},
{
"entropy": 5.980661678314209,
"epoch": 3.728835410399656,
"grad_norm": 1.1015625,
"learning_rate": 0.00039905486660404604,
"loss": 5.5353,
"mean_token_accuracy": 0.18522801846265793,
"num_tokens": 7890570.0,
"step": 4340
},
{
"entropy": 5.8748914241790775,
"epoch": 3.7331327889987107,
"grad_norm": 1.015625,
"learning_rate": 0.00039877735061645206,
"loss": 5.5033,
"mean_token_accuracy": 0.1971554860472679,
"num_tokens": 7900090.0,
"step": 4345
},
{
"entropy": 5.934943914413452,
"epoch": 3.7374301675977653,
"grad_norm": 1.171875,
"learning_rate": 0.0003984995643485977,
"loss": 5.5358,
"mean_token_accuracy": 0.18585693091154099,
"num_tokens": 7908077.0,
"step": 4350
},
{
"entropy": 5.9528398513793945,
"epoch": 3.74172754619682,
"grad_norm": 1.421875,
"learning_rate": 0.00039822150840705716,
"loss": 5.5391,
"mean_token_accuracy": 0.19125075042247772,
"num_tokens": 7916290.0,
"step": 4355
},
{
"entropy": 5.999798917770386,
"epoch": 3.746024924795875,
"grad_norm": 1.1875,
"learning_rate": 0.00039794318339899347,
"loss": 5.6233,
"mean_token_accuracy": 0.17912040501832963,
"num_tokens": 7925835.0,
"step": 4360
},
{
"entropy": 5.929653787612915,
"epoch": 3.7503223033949293,
"grad_norm": 1.0703125,
"learning_rate": 0.00039766458993215726,
"loss": 5.5867,
"mean_token_accuracy": 0.18147629946470262,
"num_tokens": 7935076.0,
"step": 4365
},
{
"entropy": 5.84507122039795,
"epoch": 3.754619681993984,
"grad_norm": 1.0390625,
"learning_rate": 0.00039738572861488527,
"loss": 5.4837,
"mean_token_accuracy": 0.19409503191709518,
"num_tokens": 7943958.0,
"step": 4370
},
{
"entropy": 5.907137012481689,
"epoch": 3.7589170605930384,
"grad_norm": 1.03125,
"learning_rate": 0.000397106600056099,
"loss": 5.5211,
"mean_token_accuracy": 0.18553533554077148,
"num_tokens": 7953189.0,
"step": 4375
},
{
"entropy": 5.878173971176148,
"epoch": 3.763214439192093,
"grad_norm": 0.9765625,
"learning_rate": 0.0003968272048653039,
"loss": 5.4441,
"mean_token_accuracy": 0.19779548197984695,
"num_tokens": 7962927.0,
"step": 4380
},
{
"entropy": 5.8026800632476805,
"epoch": 3.7675118177911475,
"grad_norm": 1.015625,
"learning_rate": 0.0003965475436525873,
"loss": 5.4712,
"mean_token_accuracy": 0.197597499191761,
"num_tokens": 7973087.0,
"step": 4385
},
{
"entropy": 5.8803709030151365,
"epoch": 3.771809196390202,
"grad_norm": 1.0078125,
"learning_rate": 0.0003962676170286174,
"loss": 5.4288,
"mean_token_accuracy": 0.1919528603553772,
"num_tokens": 7982535.0,
"step": 4390
},
{
"entropy": 5.943622827529907,
"epoch": 3.7761065749892566,
"grad_norm": 1.1796875,
"learning_rate": 0.00039598742560464223,
"loss": 5.507,
"mean_token_accuracy": 0.19596254229545593,
"num_tokens": 7990740.0,
"step": 4395
},
{
"entropy": 5.965104579925537,
"epoch": 3.780403953588311,
"grad_norm": 1.21875,
"learning_rate": 0.0003957069699924877,
"loss": 5.5021,
"mean_token_accuracy": 0.1843058630824089,
"num_tokens": 7999349.0,
"step": 4400
},
{
"entropy": 5.906688165664673,
"epoch": 3.7847013321873657,
"grad_norm": 1.0859375,
"learning_rate": 0.000395426250804557,
"loss": 5.5119,
"mean_token_accuracy": 0.19529375731945037,
"num_tokens": 8007615.0,
"step": 4405
},
{
"entropy": 5.893620347976684,
"epoch": 3.7889987107864203,
"grad_norm": 1.0234375,
"learning_rate": 0.00039514526865382847,
"loss": 5.4918,
"mean_token_accuracy": 0.19342261105775832,
"num_tokens": 8017545.0,
"step": 4410
},
{
"entropy": 5.898420667648315,
"epoch": 3.793296089385475,
"grad_norm": 1.140625,
"learning_rate": 0.0003948640241538548,
"loss": 5.4376,
"mean_token_accuracy": 0.1940651446580887,
"num_tokens": 8026381.0,
"step": 4415
},
{
"entropy": 5.925773334503174,
"epoch": 3.7975934679845293,
"grad_norm": 1.4921875,
"learning_rate": 0.0003945825179187617,
"loss": 5.5471,
"mean_token_accuracy": 0.1862453892827034,
"num_tokens": 8034745.0,
"step": 4420
},
{
"entropy": 5.93576078414917,
"epoch": 3.801890846583584,
"grad_norm": 1.203125,
"learning_rate": 0.00039430075056324604,
"loss": 5.4864,
"mean_token_accuracy": 0.19621551632881165,
"num_tokens": 8043995.0,
"step": 4425
},
{
"entropy": 5.9152994632720945,
"epoch": 3.8061882251826384,
"grad_norm": 1.1015625,
"learning_rate": 0.00039401872270257546,
"loss": 5.5773,
"mean_token_accuracy": 0.18623047918081284,
"num_tokens": 8053059.0,
"step": 4430
},
{
"entropy": 5.9053184509277346,
"epoch": 3.810485603781693,
"grad_norm": 1.0546875,
"learning_rate": 0.00039373643495258567,
"loss": 5.5995,
"mean_token_accuracy": 0.18803995102643967,
"num_tokens": 8062160.0,
"step": 4435
},
{
"entropy": 5.876355934143066,
"epoch": 3.8147829823807475,
"grad_norm": 1.15625,
"learning_rate": 0.00039345388792968056,
"loss": 5.4979,
"mean_token_accuracy": 0.1962131142616272,
"num_tokens": 8071260.0,
"step": 4440
},
{
"entropy": 5.975628805160523,
"epoch": 3.819080360979802,
"grad_norm": 1.2578125,
"learning_rate": 0.00039317108225082984,
"loss": 5.6148,
"mean_token_accuracy": 0.1825527474284172,
"num_tokens": 8081540.0,
"step": 4445
},
{
"entropy": 5.8768692970275875,
"epoch": 3.8233777395788566,
"grad_norm": 1.1640625,
"learning_rate": 0.00039288801853356806,
"loss": 5.5798,
"mean_token_accuracy": 0.1876271441578865,
"num_tokens": 8089785.0,
"step": 4450
},
{
"entropy": 5.926883172988892,
"epoch": 3.8276751181779116,
"grad_norm": 1.21875,
"learning_rate": 0.0003926046973959932,
"loss": 5.4322,
"mean_token_accuracy": 0.1977944403886795,
"num_tokens": 8098097.0,
"step": 4455
},
{
"entropy": 5.84870548248291,
"epoch": 3.831972496776966,
"grad_norm": 1.0390625,
"learning_rate": 0.0003923211194567654,
"loss": 5.6562,
"mean_token_accuracy": 0.1832739979028702,
"num_tokens": 8108693.0,
"step": 4460
},
{
"entropy": 5.936432361602783,
"epoch": 3.8362698753760207,
"grad_norm": 1.15625,
"learning_rate": 0.00039203728533510556,
"loss": 5.4945,
"mean_token_accuracy": 0.19009887129068376,
"num_tokens": 8117181.0,
"step": 4465
},
{
"entropy": 5.9394755363464355,
"epoch": 3.8405672539750753,
"grad_norm": 1.1484375,
"learning_rate": 0.000391753195650794,
"loss": 5.5152,
"mean_token_accuracy": 0.1871207147836685,
"num_tokens": 8125398.0,
"step": 4470
},
{
"entropy": 5.89150915145874,
"epoch": 3.84486463257413,
"grad_norm": 1.046875,
"learning_rate": 0.00039146885102416895,
"loss": 5.519,
"mean_token_accuracy": 0.19240910410881043,
"num_tokens": 8135320.0,
"step": 4475
},
{
"entropy": 5.932202434539795,
"epoch": 3.8491620111731844,
"grad_norm": 1.1328125,
"learning_rate": 0.00039118425207612553,
"loss": 5.6074,
"mean_token_accuracy": 0.18543781340122223,
"num_tokens": 8144320.0,
"step": 4480
},
{
"entropy": 5.821663093566895,
"epoch": 3.853459389772239,
"grad_norm": 1.1328125,
"learning_rate": 0.00039089939942811396,
"loss": 5.478,
"mean_token_accuracy": 0.19514185637235643,
"num_tokens": 8153653.0,
"step": 4485
},
{
"entropy": 5.937240219116211,
"epoch": 3.8577567683712934,
"grad_norm": 1.03125,
"learning_rate": 0.00039061429370213863,
"loss": 5.513,
"mean_token_accuracy": 0.18825586438179015,
"num_tokens": 8162741.0,
"step": 4490
},
{
"entropy": 5.856398630142212,
"epoch": 3.862054146970348,
"grad_norm": 1.0859375,
"learning_rate": 0.00039032893552075646,
"loss": 5.4271,
"mean_token_accuracy": 0.1990933135151863,
"num_tokens": 8171078.0,
"step": 4495
},
{
"entropy": 5.858392572402954,
"epoch": 3.8663515255694025,
"grad_norm": 1.1796875,
"learning_rate": 0.0003900433255070758,
"loss": 5.4881,
"mean_token_accuracy": 0.19236364662647248,
"num_tokens": 8179968.0,
"step": 4500
},
{
"epoch": 3.8663515255694025,
"eval_entropy": 5.69006564058699,
"eval_loss": 5.968277454376221,
"eval_mean_token_accuracy": 0.1735342912006754,
"eval_num_tokens": 8179968.0,
"eval_runtime": 2.0443,
"eval_samples_per_second": 1736.068,
"eval_steps_per_second": 217.192,
"step": 4500
},
{
"entropy": 5.894122076034546,
"epoch": 3.870648904168457,
"grad_norm": 1.0859375,
"learning_rate": 0.00038975746428475454,
"loss": 5.4732,
"mean_token_accuracy": 0.19004281610250473,
"num_tokens": 8189261.0,
"step": 4505
},
{
"entropy": 5.959436702728271,
"epoch": 3.874946282767512,
"grad_norm": 1.15625,
"learning_rate": 0.00038947135247799955,
"loss": 5.4841,
"mean_token_accuracy": 0.19915961623191833,
"num_tokens": 8198302.0,
"step": 4510
},
{
"entropy": 5.907156896591187,
"epoch": 3.8792436613665666,
"grad_norm": 1.0546875,
"learning_rate": 0.00038918499071156443,
"loss": 5.4669,
"mean_token_accuracy": 0.1965099200606346,
"num_tokens": 8207098.0,
"step": 4515
},
{
"entropy": 5.902419233322144,
"epoch": 3.883541039965621,
"grad_norm": 1.2109375,
"learning_rate": 0.000388898379610749,
"loss": 5.5132,
"mean_token_accuracy": 0.18933655470609664,
"num_tokens": 8216831.0,
"step": 4520
},
{
"entropy": 5.858121109008789,
"epoch": 3.8878384185646757,
"grad_norm": 1.140625,
"learning_rate": 0.0003886115198013973,
"loss": 5.5158,
"mean_token_accuracy": 0.19693622142076492,
"num_tokens": 8225369.0,
"step": 4525
},
{
"entropy": 5.928486585617065,
"epoch": 3.8921357971637303,
"grad_norm": 1.1328125,
"learning_rate": 0.0003883244119098965,
"loss": 5.6449,
"mean_token_accuracy": 0.17984056174755098,
"num_tokens": 8234440.0,
"step": 4530
},
{
"entropy": 5.944949722290039,
"epoch": 3.896433175762785,
"grad_norm": 1.0625,
"learning_rate": 0.0003880370565631754,
"loss": 5.4373,
"mean_token_accuracy": 0.19602712541818618,
"num_tokens": 8243707.0,
"step": 4535
},
{
"entropy": 5.938224267959595,
"epoch": 3.9007305543618394,
"grad_norm": 1.0859375,
"learning_rate": 0.00038774945438870337,
"loss": 5.6105,
"mean_token_accuracy": 0.18423481285572052,
"num_tokens": 8254223.0,
"step": 4540
},
{
"entropy": 5.871773719787598,
"epoch": 3.905027932960894,
"grad_norm": 1.1484375,
"learning_rate": 0.00038746160601448845,
"loss": 5.465,
"mean_token_accuracy": 0.1903871014714241,
"num_tokens": 8263105.0,
"step": 4545
},
{
"entropy": 5.857735824584961,
"epoch": 3.9093253115599484,
"grad_norm": 1.03125,
"learning_rate": 0.0003871735120690766,
"loss": 5.5241,
"mean_token_accuracy": 0.18961958587169647,
"num_tokens": 8271478.0,
"step": 4550
},
{
"entropy": 5.936745357513428,
"epoch": 3.913622690159003,
"grad_norm": 1.1171875,
"learning_rate": 0.0003868851731815497,
"loss": 5.5649,
"mean_token_accuracy": 0.1800309345126152,
"num_tokens": 8280396.0,
"step": 4555
},
{
"entropy": 5.948010683059692,
"epoch": 3.9179200687580575,
"grad_norm": 1.1953125,
"learning_rate": 0.0003865965899815247,
"loss": 5.5559,
"mean_token_accuracy": 0.18653638958930968,
"num_tokens": 8290371.0,
"step": 4560
},
{
"entropy": 5.885638093948364,
"epoch": 3.922217447357112,
"grad_norm": 1.125,
"learning_rate": 0.0003863077630991518,
"loss": 5.4559,
"mean_token_accuracy": 0.1984282374382019,
"num_tokens": 8298976.0,
"step": 4565
},
{
"entropy": 5.830101728439331,
"epoch": 3.9265148259561666,
"grad_norm": 1.0546875,
"learning_rate": 0.0003860186931651139,
"loss": 5.5129,
"mean_token_accuracy": 0.1856519967317581,
"num_tokens": 8308752.0,
"step": 4570
},
{
"entropy": 5.904654264450073,
"epoch": 3.930812204555221,
"grad_norm": 1.0625,
"learning_rate": 0.0003857293808106238,
"loss": 5.5693,
"mean_token_accuracy": 0.18588138967752457,
"num_tokens": 8317343.0,
"step": 4575
},
{
"entropy": 5.934261655807495,
"epoch": 3.9351095831542757,
"grad_norm": 1.0546875,
"learning_rate": 0.0003854398266674241,
"loss": 5.4226,
"mean_token_accuracy": 0.19770598262548447,
"num_tokens": 8326956.0,
"step": 4580
},
{
"entropy": 5.8273883819580075,
"epoch": 3.9394069617533303,
"grad_norm": 1.1796875,
"learning_rate": 0.00038515003136778544,
"loss": 5.5387,
"mean_token_accuracy": 0.18877289444208145,
"num_tokens": 8335589.0,
"step": 4585
},
{
"entropy": 5.864310264587402,
"epoch": 3.943704340352385,
"grad_norm": 1.125,
"learning_rate": 0.00038485999554450483,
"loss": 5.5134,
"mean_token_accuracy": 0.18962926417589188,
"num_tokens": 8345517.0,
"step": 4590
},
{
"entropy": 5.81669340133667,
"epoch": 3.9480017189514394,
"grad_norm": 1.171875,
"learning_rate": 0.00038456971983090454,
"loss": 5.4482,
"mean_token_accuracy": 0.19930247962474823,
"num_tokens": 8354702.0,
"step": 4595
},
{
"entropy": 5.906301403045655,
"epoch": 3.9522990975504944,
"grad_norm": 1.09375,
"learning_rate": 0.0003842792048608309,
"loss": 5.4765,
"mean_token_accuracy": 0.19456401616334915,
"num_tokens": 8362940.0,
"step": 4600
},
{
"entropy": 5.906610107421875,
"epoch": 3.956596476149549,
"grad_norm": 1.015625,
"learning_rate": 0.0003839884512686523,
"loss": 5.5178,
"mean_token_accuracy": 0.19119103550910949,
"num_tokens": 8372034.0,
"step": 4605
},
{
"entropy": 5.910079717636108,
"epoch": 3.9608938547486034,
"grad_norm": 1.1015625,
"learning_rate": 0.00038369745968925846,
"loss": 5.5487,
"mean_token_accuracy": 0.1872400775551796,
"num_tokens": 8381673.0,
"step": 4610
},
{
"entropy": 5.925352668762207,
"epoch": 3.965191233347658,
"grad_norm": 1.03125,
"learning_rate": 0.00038340623075805875,
"loss": 5.4909,
"mean_token_accuracy": 0.1889455035328865,
"num_tokens": 8390804.0,
"step": 4615
},
{
"entropy": 5.934152221679687,
"epoch": 3.9694886119467125,
"grad_norm": 1.15625,
"learning_rate": 0.00038311476511098053,
"loss": 5.5365,
"mean_token_accuracy": 0.19448018521070481,
"num_tokens": 8399644.0,
"step": 4620
},
{
"entropy": 5.884286642074585,
"epoch": 3.973785990545767,
"grad_norm": 1.15625,
"learning_rate": 0.0003828230633844685,
"loss": 5.5523,
"mean_token_accuracy": 0.19329068064689636,
"num_tokens": 8409264.0,
"step": 4625
},
{
"entropy": 5.916780805587768,
"epoch": 3.9780833691448216,
"grad_norm": 1.1953125,
"learning_rate": 0.00038253112621548243,
"loss": 5.496,
"mean_token_accuracy": 0.186178120970726,
"num_tokens": 8418383.0,
"step": 4630
},
{
"entropy": 5.926163101196289,
"epoch": 3.982380747743876,
"grad_norm": 1.0625,
"learning_rate": 0.0003822389542414966,
"loss": 5.5232,
"mean_token_accuracy": 0.18829717487096786,
"num_tokens": 8427411.0,
"step": 4635
},
{
"entropy": 5.882813405990601,
"epoch": 3.9866781263429307,
"grad_norm": 1.1171875,
"learning_rate": 0.00038194654810049775,
"loss": 5.4629,
"mean_token_accuracy": 0.18817957490682602,
"num_tokens": 8435537.0,
"step": 4640
},
{
"entropy": 5.882016706466675,
"epoch": 3.9909755049419853,
"grad_norm": 1.046875,
"learning_rate": 0.000381653908430984,
"loss": 5.5432,
"mean_token_accuracy": 0.18621994256973268,
"num_tokens": 8444400.0,
"step": 4645
},
{
"entropy": 5.930685234069824,
"epoch": 3.99527288354104,
"grad_norm": 1.1015625,
"learning_rate": 0.0003813610358719634,
"loss": 5.5236,
"mean_token_accuracy": 0.1859032317996025,
"num_tokens": 8453830.0,
"step": 4650
},
{
"entropy": 5.866905212402344,
"epoch": 3.999570262140095,
"grad_norm": 1.0625,
"learning_rate": 0.00038106793106295266,
"loss": 5.4873,
"mean_token_accuracy": 0.20101941972970963,
"num_tokens": 8463033.0,
"step": 4655
},
{
"entropy": 5.898269759284125,
"epoch": 4.003437902879243,
"grad_norm": 1.0625,
"learning_rate": 0.0003807745946439754,
"loss": 5.2703,
"mean_token_accuracy": 0.20677175455623203,
"num_tokens": 8470740.0,
"step": 4660
},
{
"entropy": 5.857395029067993,
"epoch": 4.007735281478298,
"grad_norm": 1.0078125,
"learning_rate": 0.0003804810272555612,
"loss": 5.2529,
"mean_token_accuracy": 0.20413458198308945,
"num_tokens": 8480480.0,
"step": 4665
},
{
"entropy": 5.816273021697998,
"epoch": 4.012032660077352,
"grad_norm": 1.1875,
"learning_rate": 0.0003801872295387439,
"loss": 5.2035,
"mean_token_accuracy": 0.21528093218803407,
"num_tokens": 8489047.0,
"step": 4670
},
{
"entropy": 5.927360010147095,
"epoch": 4.016330038676408,
"grad_norm": 1.1015625,
"learning_rate": 0.0003798932021350603,
"loss": 5.2819,
"mean_token_accuracy": 0.20662181824445724,
"num_tokens": 8497763.0,
"step": 4675
},
{
"entropy": 5.861963748931885,
"epoch": 4.020627417275462,
"grad_norm": 1.109375,
"learning_rate": 0.00037959894568654864,
"loss": 5.2537,
"mean_token_accuracy": 0.20978819131851195,
"num_tokens": 8506814.0,
"step": 4680
},
{
"entropy": 5.957066392898559,
"epoch": 4.024924795874517,
"grad_norm": 1.0859375,
"learning_rate": 0.0003793044608357474,
"loss": 5.377,
"mean_token_accuracy": 0.19830369651317598,
"num_tokens": 8516384.0,
"step": 4685
},
{
"entropy": 5.93622350692749,
"epoch": 4.0292221744735714,
"grad_norm": 1.1875,
"learning_rate": 0.0003790097482256939,
"loss": 5.214,
"mean_token_accuracy": 0.2048332706093788,
"num_tokens": 8524822.0,
"step": 4690
},
{
"entropy": 5.870176839828491,
"epoch": 4.033519553072626,
"grad_norm": 0.98828125,
"learning_rate": 0.0003787148084999225,
"loss": 5.242,
"mean_token_accuracy": 0.2090427428483963,
"num_tokens": 8534129.0,
"step": 4695
},
{
"entropy": 5.8284914016723635,
"epoch": 4.0378169316716805,
"grad_norm": 1.1875,
"learning_rate": 0.00037841964230246394,
"loss": 5.3055,
"mean_token_accuracy": 0.20019746124744414,
"num_tokens": 8543235.0,
"step": 4700
},
{
"entropy": 5.8483837127685545,
"epoch": 4.042114310270735,
"grad_norm": 1.1875,
"learning_rate": 0.0003781242502778429,
"loss": 5.2003,
"mean_token_accuracy": 0.22053535431623458,
"num_tokens": 8551903.0,
"step": 4705
},
{
"entropy": 5.880414295196533,
"epoch": 4.04641168886979,
"grad_norm": 1.2109375,
"learning_rate": 0.00037782863307107785,
"loss": 5.287,
"mean_token_accuracy": 0.20505535304546357,
"num_tokens": 8561173.0,
"step": 4710
},
{
"entropy": 5.899335432052612,
"epoch": 4.050709067468844,
"grad_norm": 1.1640625,
"learning_rate": 0.00037753279132767833,
"loss": 5.1929,
"mean_token_accuracy": 0.21593824326992034,
"num_tokens": 8569789.0,
"step": 4715
},
{
"entropy": 5.804694700241089,
"epoch": 4.055006446067899,
"grad_norm": 1.2421875,
"learning_rate": 0.00037723672569364453,
"loss": 5.1963,
"mean_token_accuracy": 0.20983130037784575,
"num_tokens": 8577971.0,
"step": 4720
},
{
"entropy": 5.866218900680542,
"epoch": 4.059303824666953,
"grad_norm": 1.125,
"learning_rate": 0.00037694043681546545,
"loss": 5.2858,
"mean_token_accuracy": 0.2029922142624855,
"num_tokens": 8587299.0,
"step": 4725
},
{
"entropy": 5.831310987472534,
"epoch": 4.063601203266008,
"grad_norm": 1.0703125,
"learning_rate": 0.0003766439253401177,
"loss": 5.2472,
"mean_token_accuracy": 0.20737850219011306,
"num_tokens": 8595813.0,
"step": 4730
},
{
"entropy": 5.844350147247314,
"epoch": 4.067898581865062,
"grad_norm": 1.15625,
"learning_rate": 0.00037634719191506367,
"loss": 5.2617,
"mean_token_accuracy": 0.21165675073862075,
"num_tokens": 8604552.0,
"step": 4735
},
{
"entropy": 5.796354818344116,
"epoch": 4.072195960464117,
"grad_norm": 1.3203125,
"learning_rate": 0.00037605023718825065,
"loss": 5.2002,
"mean_token_accuracy": 0.2150500625371933,
"num_tokens": 8612701.0,
"step": 4740
},
{
"entropy": 5.846735095977783,
"epoch": 4.0764933390631715,
"grad_norm": 1.0078125,
"learning_rate": 0.000375753061808109,
"loss": 5.2598,
"mean_token_accuracy": 0.20762900859117508,
"num_tokens": 8622699.0,
"step": 4745
},
{
"entropy": 5.842225646972656,
"epoch": 4.080790717662226,
"grad_norm": 1.1015625,
"learning_rate": 0.00037545566642355107,
"loss": 5.2295,
"mean_token_accuracy": 0.20560641288757325,
"num_tokens": 8631821.0,
"step": 4750
},
{
"entropy": 5.840038156509399,
"epoch": 4.0850880962612806,
"grad_norm": 1.0625,
"learning_rate": 0.0003751580516839695,
"loss": 5.202,
"mean_token_accuracy": 0.20931526124477387,
"num_tokens": 8641814.0,
"step": 4755
},
{
"entropy": 5.884950733184814,
"epoch": 4.089385474860335,
"grad_norm": 1.1640625,
"learning_rate": 0.00037486021823923574,
"loss": 5.286,
"mean_token_accuracy": 0.20766208320856094,
"num_tokens": 8649649.0,
"step": 4760
},
{
"entropy": 5.810858106613159,
"epoch": 4.09368285345939,
"grad_norm": 1.15625,
"learning_rate": 0.00037456216673969925,
"loss": 5.2206,
"mean_token_accuracy": 0.21204735338687897,
"num_tokens": 8658216.0,
"step": 4765
},
{
"entropy": 5.874101734161377,
"epoch": 4.097980232058444,
"grad_norm": 1.0,
"learning_rate": 0.0003742638978361851,
"loss": 5.2958,
"mean_token_accuracy": 0.20435795933008194,
"num_tokens": 8667725.0,
"step": 4770
},
{
"entropy": 5.781695938110351,
"epoch": 4.102277610657499,
"grad_norm": 1.1484375,
"learning_rate": 0.00037396541217999367,
"loss": 5.1561,
"mean_token_accuracy": 0.2138916879892349,
"num_tokens": 8675739.0,
"step": 4775
},
{
"entropy": 5.839225959777832,
"epoch": 4.106574989256553,
"grad_norm": 1.125,
"learning_rate": 0.0003736667104228981,
"loss": 5.2313,
"mean_token_accuracy": 0.21251195222139357,
"num_tokens": 8685764.0,
"step": 4780
},
{
"entropy": 5.8689206600189205,
"epoch": 4.110872367855608,
"grad_norm": 1.3125,
"learning_rate": 0.00037336779321714376,
"loss": 5.2059,
"mean_token_accuracy": 0.21196469962596892,
"num_tokens": 8695476.0,
"step": 4785
},
{
"entropy": 5.80074520111084,
"epoch": 4.115169746454662,
"grad_norm": 1.1953125,
"learning_rate": 0.00037306866121544633,
"loss": 5.2825,
"mean_token_accuracy": 0.20670025944709777,
"num_tokens": 8705544.0,
"step": 4790
},
{
"entropy": 5.860075855255127,
"epoch": 4.119467125053717,
"grad_norm": 1.15625,
"learning_rate": 0.0003727693150709904,
"loss": 5.2645,
"mean_token_accuracy": 0.20871647000312804,
"num_tokens": 8714883.0,
"step": 4795
},
{
"entropy": 5.886887168884277,
"epoch": 4.1237645036527715,
"grad_norm": 1.0859375,
"learning_rate": 0.00037246975543742843,
"loss": 5.3176,
"mean_token_accuracy": 0.20150526314973832,
"num_tokens": 8724589.0,
"step": 4800
},
{
"entropy": 5.745695161819458,
"epoch": 4.128061882251826,
"grad_norm": 1.109375,
"learning_rate": 0.000372169982968879,
"loss": 5.1867,
"mean_token_accuracy": 0.20965181291103363,
"num_tokens": 8733771.0,
"step": 4805
},
{
"entropy": 5.845971202850341,
"epoch": 4.132359260850881,
"grad_norm": 1.234375,
"learning_rate": 0.0003718699983199252,
"loss": 5.2624,
"mean_token_accuracy": 0.20873973071575164,
"num_tokens": 8742348.0,
"step": 4810
},
{
"entropy": 5.7872912883758545,
"epoch": 4.136656639449935,
"grad_norm": 1.1171875,
"learning_rate": 0.0003715698021456137,
"loss": 5.2081,
"mean_token_accuracy": 0.21571390181779862,
"num_tokens": 8751357.0,
"step": 4815
},
{
"entropy": 5.7935162544250485,
"epoch": 4.1409540180489905,
"grad_norm": 1.09375,
"learning_rate": 0.00037126939510145294,
"loss": 5.2631,
"mean_token_accuracy": 0.21045506447553636,
"num_tokens": 8760813.0,
"step": 4820
},
{
"entropy": 5.919540929794311,
"epoch": 4.145251396648045,
"grad_norm": 1.1875,
"learning_rate": 0.0003709687778434118,
"loss": 5.3088,
"mean_token_accuracy": 0.20338443517684937,
"num_tokens": 8770228.0,
"step": 4825
},
{
"entropy": 5.766780090332031,
"epoch": 4.1495487752471,
"grad_norm": 1.3203125,
"learning_rate": 0.0003706679510279183,
"loss": 5.1405,
"mean_token_accuracy": 0.2135200873017311,
"num_tokens": 8779351.0,
"step": 4830
},
{
"entropy": 5.818261432647705,
"epoch": 4.153846153846154,
"grad_norm": 1.2109375,
"learning_rate": 0.0003703669153118578,
"loss": 5.3029,
"mean_token_accuracy": 0.20108458995819092,
"num_tokens": 8789116.0,
"step": 4835
},
{
"entropy": 5.810438871383667,
"epoch": 4.158143532445209,
"grad_norm": 1.15625,
"learning_rate": 0.00037006567135257216,
"loss": 5.2702,
"mean_token_accuracy": 0.20288445353507994,
"num_tokens": 8797790.0,
"step": 4840
},
{
"entropy": 5.865516614913941,
"epoch": 4.162440911044263,
"grad_norm": 1.0859375,
"learning_rate": 0.00036976421980785764,
"loss": 5.3081,
"mean_token_accuracy": 0.2026110991835594,
"num_tokens": 8808067.0,
"step": 4845
},
{
"entropy": 5.80728063583374,
"epoch": 4.166738289643318,
"grad_norm": 1.1640625,
"learning_rate": 0.0003694625613359641,
"loss": 5.2167,
"mean_token_accuracy": 0.21420625150203704,
"num_tokens": 8816587.0,
"step": 4850
},
{
"entropy": 5.843136548995972,
"epoch": 4.171035668242372,
"grad_norm": 1.2109375,
"learning_rate": 0.0003691606965955929,
"loss": 5.2734,
"mean_token_accuracy": 0.20686964243650435,
"num_tokens": 8826045.0,
"step": 4855
},
{
"entropy": 5.781480550765991,
"epoch": 4.175333046841427,
"grad_norm": 1.078125,
"learning_rate": 0.000368858626245896,
"loss": 5.2662,
"mean_token_accuracy": 0.21182646304368974,
"num_tokens": 8835427.0,
"step": 4860
},
{
"entropy": 5.802968168258667,
"epoch": 4.1796304254404815,
"grad_norm": 0.9609375,
"learning_rate": 0.0003685563509464744,
"loss": 5.2058,
"mean_token_accuracy": 0.21191840171813964,
"num_tokens": 8845167.0,
"step": 4865
},
{
"entropy": 5.854573917388916,
"epoch": 4.183927804039536,
"grad_norm": 1.25,
"learning_rate": 0.00036825387135737647,
"loss": 5.2076,
"mean_token_accuracy": 0.21366898566484452,
"num_tokens": 8853591.0,
"step": 4870
},
{
"entropy": 5.830286979675293,
"epoch": 4.188225182638591,
"grad_norm": 1.203125,
"learning_rate": 0.00036795118813909674,
"loss": 5.3259,
"mean_token_accuracy": 0.19266606420278548,
"num_tokens": 8863647.0,
"step": 4875
},
{
"entropy": 5.880206489562989,
"epoch": 4.192522561237645,
"grad_norm": 1.1875,
"learning_rate": 0.00036764830195257437,
"loss": 5.2531,
"mean_token_accuracy": 0.2108171060681343,
"num_tokens": 8872911.0,
"step": 4880
},
{
"entropy": 5.866643857955933,
"epoch": 4.1968199398367,
"grad_norm": 1.2578125,
"learning_rate": 0.0003673452134591918,
"loss": 5.2999,
"mean_token_accuracy": 0.2029878944158554,
"num_tokens": 8881001.0,
"step": 4885
},
{
"entropy": 5.772600555419922,
"epoch": 4.201117318435754,
"grad_norm": 1.1953125,
"learning_rate": 0.000367041923320773,
"loss": 5.2042,
"mean_token_accuracy": 0.21341877430677414,
"num_tokens": 8890323.0,
"step": 4890
},
{
"entropy": 5.771191835403442,
"epoch": 4.205414697034809,
"grad_norm": 1.125,
"learning_rate": 0.00036673843219958257,
"loss": 5.2368,
"mean_token_accuracy": 0.21208913624286652,
"num_tokens": 8900471.0,
"step": 4895
},
{
"entropy": 5.88256139755249,
"epoch": 4.209712075633863,
"grad_norm": 1.1953125,
"learning_rate": 0.0003664347407583238,
"loss": 5.2863,
"mean_token_accuracy": 0.20272428095340728,
"num_tokens": 8909320.0,
"step": 4900
},
{
"entropy": 5.836409950256348,
"epoch": 4.214009454232918,
"grad_norm": 1.1953125,
"learning_rate": 0.0003661308496601373,
"loss": 5.2072,
"mean_token_accuracy": 0.2157358020544052,
"num_tokens": 8917453.0,
"step": 4905
},
{
"entropy": 5.788828945159912,
"epoch": 4.218306832831972,
"grad_norm": 1.265625,
"learning_rate": 0.00036582675956859983,
"loss": 5.2828,
"mean_token_accuracy": 0.2104206308722496,
"num_tokens": 8925737.0,
"step": 4910
},
{
"entropy": 5.720648384094238,
"epoch": 4.222604211431027,
"grad_norm": 1.171875,
"learning_rate": 0.00036552247114772263,
"loss": 5.2101,
"mean_token_accuracy": 0.2065804719924927,
"num_tokens": 8935475.0,
"step": 4915
},
{
"entropy": 5.83034381866455,
"epoch": 4.2269015900300815,
"grad_norm": 1.0546875,
"learning_rate": 0.00036521798506194996,
"loss": 5.2346,
"mean_token_accuracy": 0.21483660042285918,
"num_tokens": 8944683.0,
"step": 4920
},
{
"entropy": 5.881083297729492,
"epoch": 4.231198968629136,
"grad_norm": 1.2421875,
"learning_rate": 0.00036491330197615775,
"loss": 5.2826,
"mean_token_accuracy": 0.199912728369236,
"num_tokens": 8953837.0,
"step": 4925
},
{
"entropy": 5.823856353759766,
"epoch": 4.235496347228191,
"grad_norm": 0.98046875,
"learning_rate": 0.00036460842255565197,
"loss": 5.3172,
"mean_token_accuracy": 0.2043285608291626,
"num_tokens": 8964822.0,
"step": 4930
},
{
"entropy": 5.869928026199341,
"epoch": 4.239793725827245,
"grad_norm": 1.328125,
"learning_rate": 0.0003643033474661676,
"loss": 5.2965,
"mean_token_accuracy": 0.20673907697200775,
"num_tokens": 8974363.0,
"step": 4935
},
{
"entropy": 5.82686710357666,
"epoch": 4.2440911044263,
"grad_norm": 1.2109375,
"learning_rate": 0.00036399807737386657,
"loss": 5.2074,
"mean_token_accuracy": 0.21254496574401854,
"num_tokens": 8983122.0,
"step": 4940
},
{
"entropy": 5.857899141311646,
"epoch": 4.248388483025354,
"grad_norm": 1.2578125,
"learning_rate": 0.0003636926129453368,
"loss": 5.3123,
"mean_token_accuracy": 0.20272811949253083,
"num_tokens": 8991618.0,
"step": 4945
},
{
"entropy": 5.824826383590699,
"epoch": 4.252685861624409,
"grad_norm": 1.171875,
"learning_rate": 0.0003633869548475904,
"loss": 5.2415,
"mean_token_accuracy": 0.21045928597450256,
"num_tokens": 9000128.0,
"step": 4950
},
{
"entropy": 5.775493240356445,
"epoch": 4.256983240223463,
"grad_norm": 1.15625,
"learning_rate": 0.0003630811037480627,
"loss": 5.2319,
"mean_token_accuracy": 0.2093399852514267,
"num_tokens": 9008951.0,
"step": 4955
},
{
"entropy": 5.842453670501709,
"epoch": 4.261280618822518,
"grad_norm": 1.078125,
"learning_rate": 0.0003627750603146101,
"loss": 5.2789,
"mean_token_accuracy": 0.2030516341328621,
"num_tokens": 9018949.0,
"step": 4960
},
{
"entropy": 5.883487272262573,
"epoch": 4.265577997421573,
"grad_norm": 1.0546875,
"learning_rate": 0.0003624688252155091,
"loss": 5.2747,
"mean_token_accuracy": 0.20714085996150972,
"num_tokens": 9028910.0,
"step": 4965
},
{
"entropy": 5.809985780715943,
"epoch": 4.269875376020628,
"grad_norm": 1.1015625,
"learning_rate": 0.0003621623991194549,
"loss": 5.324,
"mean_token_accuracy": 0.19819179475307463,
"num_tokens": 9039012.0,
"step": 4970
},
{
"entropy": 5.9007415771484375,
"epoch": 4.274172754619682,
"grad_norm": 1.109375,
"learning_rate": 0.0003618557826955594,
"loss": 5.2954,
"mean_token_accuracy": 0.20645973831415176,
"num_tokens": 9048639.0,
"step": 4975
},
{
"entropy": 5.815454912185669,
"epoch": 4.278470133218737,
"grad_norm": 1.1171875,
"learning_rate": 0.00036154897661335063,
"loss": 5.2517,
"mean_token_accuracy": 0.2086031049489975,
"num_tokens": 9057453.0,
"step": 4980
},
{
"entropy": 5.792014074325562,
"epoch": 4.2827675118177915,
"grad_norm": 1.15625,
"learning_rate": 0.0003612419815427702,
"loss": 5.2826,
"mean_token_accuracy": 0.20074526816606522,
"num_tokens": 9066761.0,
"step": 4985
},
{
"entropy": 5.858555936813355,
"epoch": 4.287064890416846,
"grad_norm": 1.4140625,
"learning_rate": 0.0003609347981541726,
"loss": 5.3553,
"mean_token_accuracy": 0.1983863353729248,
"num_tokens": 9075535.0,
"step": 4990
},
{
"entropy": 5.862577295303344,
"epoch": 4.291362269015901,
"grad_norm": 1.2109375,
"learning_rate": 0.00036062742711832376,
"loss": 5.257,
"mean_token_accuracy": 0.2088131219148636,
"num_tokens": 9084559.0,
"step": 4995
},
{
"entropy": 5.811804294586182,
"epoch": 4.295659647614955,
"grad_norm": 1.234375,
"learning_rate": 0.0003603198691063991,
"loss": 5.2313,
"mean_token_accuracy": 0.2083360180258751,
"num_tokens": 9093069.0,
"step": 5000
},
{
"epoch": 4.295659647614955,
"eval_entropy": 5.572498395636275,
"eval_loss": 5.972146987915039,
"eval_mean_token_accuracy": 0.17474245199480573,
"eval_num_tokens": 9093069.0,
"eval_runtime": 2.0519,
"eval_samples_per_second": 1729.593,
"eval_steps_per_second": 216.382,
"step": 5000
},
{
"entropy": 5.8002112865447994,
"epoch": 4.29995702621401,
"grad_norm": 1.3125,
"learning_rate": 0.0003600121247899824,
"loss": 5.2227,
"mean_token_accuracy": 0.2073623850941658,
"num_tokens": 9101914.0,
"step": 5005
},
{
"entropy": 5.834455966949463,
"epoch": 4.304254404813064,
"grad_norm": 1.1484375,
"learning_rate": 0.00035970419484106404,
"loss": 5.2887,
"mean_token_accuracy": 0.20548986196517943,
"num_tokens": 9110967.0,
"step": 5010
},
{
"entropy": 5.891673374176025,
"epoch": 4.308551783412119,
"grad_norm": 1.2578125,
"learning_rate": 0.0003593960799320402,
"loss": 5.3822,
"mean_token_accuracy": 0.19926034808158874,
"num_tokens": 9119774.0,
"step": 5015
},
{
"entropy": 5.887394714355469,
"epoch": 4.312849162011173,
"grad_norm": 1.2890625,
"learning_rate": 0.0003590877807357107,
"loss": 5.2922,
"mean_token_accuracy": 0.20317730754613877,
"num_tokens": 9127738.0,
"step": 5020
},
{
"entropy": 5.785108280181885,
"epoch": 4.317146540610228,
"grad_norm": 1.203125,
"learning_rate": 0.0003587792979252776,
"loss": 5.2629,
"mean_token_accuracy": 0.20784647464752198,
"num_tokens": 9137060.0,
"step": 5025
},
{
"entropy": 5.777895927429199,
"epoch": 4.321443919209282,
"grad_norm": 1.2734375,
"learning_rate": 0.0003584706321743442,
"loss": 5.1962,
"mean_token_accuracy": 0.2092631295323372,
"num_tokens": 9145169.0,
"step": 5030
},
{
"entropy": 5.796663856506347,
"epoch": 4.325741297808337,
"grad_norm": 1.1328125,
"learning_rate": 0.000358161784156913,
"loss": 5.2276,
"mean_token_accuracy": 0.21179858297109605,
"num_tokens": 9154092.0,
"step": 5035
},
{
"entropy": 5.85888671875,
"epoch": 4.3300386764073915,
"grad_norm": 1.1171875,
"learning_rate": 0.00035785275454738456,
"loss": 5.286,
"mean_token_accuracy": 0.19925448596477507,
"num_tokens": 9162824.0,
"step": 5040
},
{
"entropy": 5.7883411884307865,
"epoch": 4.334336055006446,
"grad_norm": 1.3828125,
"learning_rate": 0.00035754354402055635,
"loss": 5.1959,
"mean_token_accuracy": 0.21434530913829802,
"num_tokens": 9170977.0,
"step": 5045
},
{
"entropy": 5.730002689361572,
"epoch": 4.338633433605501,
"grad_norm": 1.1875,
"learning_rate": 0.0003572341532516202,
"loss": 5.2367,
"mean_token_accuracy": 0.20432866215705872,
"num_tokens": 9179539.0,
"step": 5050
},
{
"entropy": 5.77237024307251,
"epoch": 4.342930812204555,
"grad_norm": 1.140625,
"learning_rate": 0.0003569245829161622,
"loss": 5.3173,
"mean_token_accuracy": 0.20617470294237136,
"num_tokens": 9188861.0,
"step": 5055
},
{
"entropy": 5.834583187103272,
"epoch": 4.34722819080361,
"grad_norm": 1.15625,
"learning_rate": 0.00035661483369016004,
"loss": 5.2608,
"mean_token_accuracy": 0.20369923412799834,
"num_tokens": 9197724.0,
"step": 5060
},
{
"entropy": 5.79484076499939,
"epoch": 4.351525569402664,
"grad_norm": 1.1640625,
"learning_rate": 0.0003563049062499822,
"loss": 5.2692,
"mean_token_accuracy": 0.2074078604578972,
"num_tokens": 9206375.0,
"step": 5065
},
{
"entropy": 5.755507230758667,
"epoch": 4.355822948001719,
"grad_norm": 1.296875,
"learning_rate": 0.0003559948012723865,
"loss": 5.2271,
"mean_token_accuracy": 0.21173418909311295,
"num_tokens": 9214675.0,
"step": 5070
},
{
"entropy": 5.802625036239624,
"epoch": 4.360120326600773,
"grad_norm": 1.1953125,
"learning_rate": 0.0003556845194345181,
"loss": 5.2516,
"mean_token_accuracy": 0.20623590499162675,
"num_tokens": 9224128.0,
"step": 5075
},
{
"entropy": 5.769022130966187,
"epoch": 4.364417705199828,
"grad_norm": 1.359375,
"learning_rate": 0.0003553740614139086,
"loss": 5.1773,
"mean_token_accuracy": 0.21178028136491775,
"num_tokens": 9232568.0,
"step": 5080
},
{
"entropy": 5.831740474700927,
"epoch": 4.368715083798882,
"grad_norm": 1.2734375,
"learning_rate": 0.0003550634278884742,
"loss": 5.2776,
"mean_token_accuracy": 0.2081983670592308,
"num_tokens": 9241809.0,
"step": 5085
},
{
"entropy": 5.803788042068481,
"epoch": 4.373012462397937,
"grad_norm": 1.140625,
"learning_rate": 0.00035475261953651433,
"loss": 5.272,
"mean_token_accuracy": 0.20985971093177797,
"num_tokens": 9250845.0,
"step": 5090
},
{
"entropy": 5.7017419815063475,
"epoch": 4.3773098409969915,
"grad_norm": 1.2265625,
"learning_rate": 0.00035444163703671026,
"loss": 5.2316,
"mean_token_accuracy": 0.2108854666352272,
"num_tokens": 9259465.0,
"step": 5095
},
{
"entropy": 5.795203113555909,
"epoch": 4.381607219596046,
"grad_norm": 1.078125,
"learning_rate": 0.00035413048106812357,
"loss": 5.2177,
"mean_token_accuracy": 0.21499419659376146,
"num_tokens": 9267853.0,
"step": 5100
},
{
"entropy": 5.927629661560059,
"epoch": 4.385904598195101,
"grad_norm": 1.125,
"learning_rate": 0.00035381915231019425,
"loss": 5.4268,
"mean_token_accuracy": 0.19061524271965027,
"num_tokens": 9276664.0,
"step": 5105
},
{
"entropy": 5.820791101455688,
"epoch": 4.390201976794156,
"grad_norm": 1.21875,
"learning_rate": 0.0003535076514427401,
"loss": 5.2285,
"mean_token_accuracy": 0.20389644652605057,
"num_tokens": 9285482.0,
"step": 5110
},
{
"entropy": 5.833712720870972,
"epoch": 4.39449935539321,
"grad_norm": 1.0859375,
"learning_rate": 0.00035319597914595436,
"loss": 5.3276,
"mean_token_accuracy": 0.19536473900079726,
"num_tokens": 9293936.0,
"step": 5115
},
{
"entropy": 5.812803554534912,
"epoch": 4.398796733992265,
"grad_norm": 1.1328125,
"learning_rate": 0.0003528841361004049,
"loss": 5.3509,
"mean_token_accuracy": 0.19318777322769165,
"num_tokens": 9303998.0,
"step": 5120
},
{
"entropy": 5.777164936065674,
"epoch": 4.40309411259132,
"grad_norm": 1.25,
"learning_rate": 0.0003525721229870323,
"loss": 5.3018,
"mean_token_accuracy": 0.2057452142238617,
"num_tokens": 9313117.0,
"step": 5125
},
{
"entropy": 5.843145132064819,
"epoch": 4.407391491190374,
"grad_norm": 1.140625,
"learning_rate": 0.00035225994048714823,
"loss": 5.2845,
"mean_token_accuracy": 0.205299773812294,
"num_tokens": 9321446.0,
"step": 5130
},
{
"entropy": 5.799930763244629,
"epoch": 4.411688869789429,
"grad_norm": 1.171875,
"learning_rate": 0.0003519475892824348,
"loss": 5.2629,
"mean_token_accuracy": 0.20662948340177537,
"num_tokens": 9330752.0,
"step": 5135
},
{
"entropy": 5.77738208770752,
"epoch": 4.415986248388483,
"grad_norm": 1.1875,
"learning_rate": 0.0003516350700549419,
"loss": 5.3006,
"mean_token_accuracy": 0.20330240875482558,
"num_tokens": 9339322.0,
"step": 5140
},
{
"entropy": 5.84840669631958,
"epoch": 4.420283626987538,
"grad_norm": 1.203125,
"learning_rate": 0.00035132238348708697,
"loss": 5.3297,
"mean_token_accuracy": 0.19938498139381408,
"num_tokens": 9349024.0,
"step": 5145
},
{
"entropy": 5.926564788818359,
"epoch": 4.424581005586592,
"grad_norm": 1.296875,
"learning_rate": 0.00035100953026165224,
"loss": 5.4256,
"mean_token_accuracy": 0.197027026116848,
"num_tokens": 9358833.0,
"step": 5150
},
{
"entropy": 5.868610525131226,
"epoch": 4.428878384185647,
"grad_norm": 1.1328125,
"learning_rate": 0.0003506965110617841,
"loss": 5.2718,
"mean_token_accuracy": 0.2099718302488327,
"num_tokens": 9368276.0,
"step": 5155
},
{
"entropy": 5.859810876846313,
"epoch": 4.4331757627847015,
"grad_norm": 1.015625,
"learning_rate": 0.0003503833265709915,
"loss": 5.3479,
"mean_token_accuracy": 0.1974034383893013,
"num_tokens": 9378501.0,
"step": 5160
},
{
"entropy": 5.875433778762817,
"epoch": 4.437473141383756,
"grad_norm": 1.265625,
"learning_rate": 0.00035006997747314404,
"loss": 5.3298,
"mean_token_accuracy": 0.19622083157300949,
"num_tokens": 9387789.0,
"step": 5165
},
{
"entropy": 5.835582590103149,
"epoch": 4.441770519982811,
"grad_norm": 1.125,
"learning_rate": 0.00034975646445247106,
"loss": 5.3721,
"mean_token_accuracy": 0.2014732614159584,
"num_tokens": 9397041.0,
"step": 5170
},
{
"entropy": 5.775737285614014,
"epoch": 4.446067898581865,
"grad_norm": 1.1953125,
"learning_rate": 0.0003494427881935596,
"loss": 5.3059,
"mean_token_accuracy": 0.20452196449041365,
"num_tokens": 9405393.0,
"step": 5175
},
{
"entropy": 5.779368114471436,
"epoch": 4.45036527718092,
"grad_norm": 1.1484375,
"learning_rate": 0.00034912894938135325,
"loss": 5.2582,
"mean_token_accuracy": 0.20273705422878266,
"num_tokens": 9415127.0,
"step": 5180
},
{
"entropy": 5.846761655807495,
"epoch": 4.454662655779974,
"grad_norm": 1.2265625,
"learning_rate": 0.0003488149487011506,
"loss": 5.3699,
"mean_token_accuracy": 0.20174208134412766,
"num_tokens": 9424416.0,
"step": 5185
},
{
"entropy": 5.890099239349365,
"epoch": 4.458960034379029,
"grad_norm": 1.1640625,
"learning_rate": 0.00034850078683860346,
"loss": 5.3262,
"mean_token_accuracy": 0.19683828055858613,
"num_tokens": 9434523.0,
"step": 5190
},
{
"entropy": 5.831119251251221,
"epoch": 4.463257412978083,
"grad_norm": 1.140625,
"learning_rate": 0.0003481864644797159,
"loss": 5.3245,
"mean_token_accuracy": 0.2093776971101761,
"num_tokens": 9443605.0,
"step": 5195
},
{
"entropy": 5.803278684616089,
"epoch": 4.467554791577138,
"grad_norm": 1.1640625,
"learning_rate": 0.0003478719823108424,
"loss": 5.3317,
"mean_token_accuracy": 0.19572802931070327,
"num_tokens": 9453268.0,
"step": 5200
},
{
"entropy": 5.8240362167358395,
"epoch": 4.471852170176192,
"grad_norm": 1.1953125,
"learning_rate": 0.00034755734101868613,
"loss": 5.214,
"mean_token_accuracy": 0.2097940504550934,
"num_tokens": 9461578.0,
"step": 5205
},
{
"entropy": 5.79837703704834,
"epoch": 4.476149548775247,
"grad_norm": 1.140625,
"learning_rate": 0.00034724254129029795,
"loss": 5.2436,
"mean_token_accuracy": 0.2102679118514061,
"num_tokens": 9470722.0,
"step": 5210
},
{
"entropy": 5.837281274795532,
"epoch": 4.4804469273743015,
"grad_norm": 1.2890625,
"learning_rate": 0.0003469275838130748,
"loss": 5.3607,
"mean_token_accuracy": 0.19933488070964814,
"num_tokens": 9479695.0,
"step": 5215
},
{
"entropy": 5.8430516719818115,
"epoch": 4.484744305973356,
"grad_norm": 1.1640625,
"learning_rate": 0.0003466124692747577,
"loss": 5.2646,
"mean_token_accuracy": 0.2044244959950447,
"num_tokens": 9488444.0,
"step": 5220
},
{
"entropy": 5.742202806472778,
"epoch": 4.489041684572411,
"grad_norm": 1.1953125,
"learning_rate": 0.00034629719836343106,
"loss": 5.2215,
"mean_token_accuracy": 0.21403959393501282,
"num_tokens": 9497413.0,
"step": 5225
},
{
"entropy": 5.7987758159637455,
"epoch": 4.493339063171465,
"grad_norm": 1.296875,
"learning_rate": 0.0003459817717675203,
"loss": 5.2598,
"mean_token_accuracy": 0.21579257249832154,
"num_tokens": 9506135.0,
"step": 5230
},
{
"entropy": 5.835311031341552,
"epoch": 4.49763644177052,
"grad_norm": 1.0625,
"learning_rate": 0.0003456661901757913,
"loss": 5.3341,
"mean_token_accuracy": 0.20138609558343887,
"num_tokens": 9516918.0,
"step": 5235
},
{
"entropy": 5.866192770004273,
"epoch": 4.501933820369574,
"grad_norm": 1.2578125,
"learning_rate": 0.00034535045427734796,
"loss": 5.276,
"mean_token_accuracy": 0.2101076439023018,
"num_tokens": 9526052.0,
"step": 5240
},
{
"entropy": 5.733947229385376,
"epoch": 4.506231198968629,
"grad_norm": 1.265625,
"learning_rate": 0.0003450345647616313,
"loss": 5.3369,
"mean_token_accuracy": 0.2056139588356018,
"num_tokens": 9535200.0,
"step": 5245
},
{
"entropy": 5.76122088432312,
"epoch": 4.510528577567683,
"grad_norm": 1.1796875,
"learning_rate": 0.0003447185223184177,
"loss": 5.3074,
"mean_token_accuracy": 0.20514743030071259,
"num_tokens": 9544786.0,
"step": 5250
},
{
"entropy": 5.871483230590821,
"epoch": 4.514825956166739,
"grad_norm": 1.1484375,
"learning_rate": 0.00034440232763781765,
"loss": 5.2522,
"mean_token_accuracy": 0.20949897319078445,
"num_tokens": 9553694.0,
"step": 5255
},
{
"entropy": 5.753093576431274,
"epoch": 4.519123334765792,
"grad_norm": 1.1953125,
"learning_rate": 0.000344085981410274,
"loss": 5.3192,
"mean_token_accuracy": 0.20984772890806197,
"num_tokens": 9563332.0,
"step": 5260
},
{
"entropy": 5.711885738372803,
"epoch": 4.523420713364848,
"grad_norm": 1.1015625,
"learning_rate": 0.00034376948432656036,
"loss": 5.2301,
"mean_token_accuracy": 0.2115880087018013,
"num_tokens": 9572367.0,
"step": 5265
},
{
"entropy": 5.860666131973266,
"epoch": 4.527718091963902,
"grad_norm": 1.0546875,
"learning_rate": 0.0003434528370777798,
"loss": 5.3255,
"mean_token_accuracy": 0.19527169466018676,
"num_tokens": 9582535.0,
"step": 5270
},
{
"entropy": 5.807507610321045,
"epoch": 4.532015470562957,
"grad_norm": 1.203125,
"learning_rate": 0.00034313604035536344,
"loss": 5.2775,
"mean_token_accuracy": 0.21002310365438462,
"num_tokens": 9590688.0,
"step": 5275
},
{
"entropy": 5.773982238769531,
"epoch": 4.5363128491620115,
"grad_norm": 1.171875,
"learning_rate": 0.0003428190948510687,
"loss": 5.3213,
"mean_token_accuracy": 0.2039690524339676,
"num_tokens": 9599209.0,
"step": 5280
},
{
"entropy": 5.852875804901123,
"epoch": 4.540610227761066,
"grad_norm": 1.2265625,
"learning_rate": 0.0003425020012569778,
"loss": 5.3626,
"mean_token_accuracy": 0.20032234340906144,
"num_tokens": 9608575.0,
"step": 5285
},
{
"entropy": 5.903119659423828,
"epoch": 4.544907606360121,
"grad_norm": 1.1796875,
"learning_rate": 0.00034218476026549665,
"loss": 5.3113,
"mean_token_accuracy": 0.2009777992963791,
"num_tokens": 9617312.0,
"step": 5290
},
{
"entropy": 5.826537036895752,
"epoch": 4.549204984959175,
"grad_norm": 1.265625,
"learning_rate": 0.0003418673725693524,
"loss": 5.2895,
"mean_token_accuracy": 0.21229007989168167,
"num_tokens": 9626398.0,
"step": 5295
},
{
"entropy": 5.797998762130737,
"epoch": 4.55350236355823,
"grad_norm": 1.1640625,
"learning_rate": 0.0003415498388615932,
"loss": 5.2692,
"mean_token_accuracy": 0.20089106261730194,
"num_tokens": 9635470.0,
"step": 5300
},
{
"entropy": 5.809066820144653,
"epoch": 4.557799742157284,
"grad_norm": 1.1640625,
"learning_rate": 0.0003412321598355857,
"loss": 5.213,
"mean_token_accuracy": 0.21215442568063736,
"num_tokens": 9644728.0,
"step": 5305
},
{
"entropy": 5.776236963272095,
"epoch": 4.562097120756339,
"grad_norm": 1.0859375,
"learning_rate": 0.0003409143361850139,
"loss": 5.2752,
"mean_token_accuracy": 0.2105761721730232,
"num_tokens": 9654129.0,
"step": 5310
},
{
"entropy": 5.822030639648437,
"epoch": 4.566394499355393,
"grad_norm": 1.25,
"learning_rate": 0.0003405963686038775,
"loss": 5.3633,
"mean_token_accuracy": 0.1967499941587448,
"num_tokens": 9662648.0,
"step": 5315
},
{
"entropy": 5.843867492675781,
"epoch": 4.570691877954448,
"grad_norm": 1.1640625,
"learning_rate": 0.0003402782577864908,
"loss": 5.3261,
"mean_token_accuracy": 0.20646921396255494,
"num_tokens": 9672082.0,
"step": 5320
},
{
"entropy": 5.86830587387085,
"epoch": 4.574989256553502,
"grad_norm": 1.2421875,
"learning_rate": 0.00033996000442748056,
"loss": 5.2528,
"mean_token_accuracy": 0.21100070625543593,
"num_tokens": 9681422.0,
"step": 5325
},
{
"entropy": 5.829919290542603,
"epoch": 4.579286635152557,
"grad_norm": 1.28125,
"learning_rate": 0.00033964160922178495,
"loss": 5.2957,
"mean_token_accuracy": 0.206342414021492,
"num_tokens": 9690675.0,
"step": 5330
},
{
"entropy": 5.813098335266114,
"epoch": 4.5835840137516115,
"grad_norm": 1.125,
"learning_rate": 0.0003393230728646518,
"loss": 5.2833,
"mean_token_accuracy": 0.2053971081972122,
"num_tokens": 9700200.0,
"step": 5335
},
{
"entropy": 5.761319780349732,
"epoch": 4.587881392350666,
"grad_norm": 1.2421875,
"learning_rate": 0.00033900439605163724,
"loss": 5.2785,
"mean_token_accuracy": 0.2027950644493103,
"num_tokens": 9709533.0,
"step": 5340
},
{
"entropy": 5.774492692947388,
"epoch": 4.592178770949721,
"grad_norm": 1.09375,
"learning_rate": 0.00033868557947860407,
"loss": 5.3247,
"mean_token_accuracy": 0.20598720461130143,
"num_tokens": 9719250.0,
"step": 5345
},
{
"entropy": 5.826806688308716,
"epoch": 4.596476149548775,
"grad_norm": 1.1484375,
"learning_rate": 0.00033836662384172014,
"loss": 5.243,
"mean_token_accuracy": 0.20927662551403045,
"num_tokens": 9727837.0,
"step": 5350
},
{
"entropy": 5.759864807128906,
"epoch": 4.60077352814783,
"grad_norm": 1.1484375,
"learning_rate": 0.0003380475298374573,
"loss": 5.3326,
"mean_token_accuracy": 0.20309751331806183,
"num_tokens": 9737125.0,
"step": 5355
},
{
"entropy": 5.813335514068603,
"epoch": 4.605070906746884,
"grad_norm": 1.109375,
"learning_rate": 0.000337728298162589,
"loss": 5.3499,
"mean_token_accuracy": 0.19702604413032532,
"num_tokens": 9746309.0,
"step": 5360
},
{
"entropy": 5.838102722167969,
"epoch": 4.609368285345939,
"grad_norm": 1.1484375,
"learning_rate": 0.00033740892951418993,
"loss": 5.232,
"mean_token_accuracy": 0.2094883754849434,
"num_tokens": 9755633.0,
"step": 5365
},
{
"entropy": 5.877121877670288,
"epoch": 4.613665663944993,
"grad_norm": 1.2734375,
"learning_rate": 0.0003370894245896333,
"loss": 5.2713,
"mean_token_accuracy": 0.19735931158065795,
"num_tokens": 9765179.0,
"step": 5370
},
{
"entropy": 5.872338008880615,
"epoch": 4.617963042544048,
"grad_norm": 1.3359375,
"learning_rate": 0.00033676978408659047,
"loss": 5.2987,
"mean_token_accuracy": 0.2016567572951317,
"num_tokens": 9774085.0,
"step": 5375
},
{
"entropy": 5.845898246765136,
"epoch": 4.622260421143102,
"grad_norm": 1.09375,
"learning_rate": 0.0003364500087030283,
"loss": 5.4123,
"mean_token_accuracy": 0.19296547174453735,
"num_tokens": 9784650.0,
"step": 5380
},
{
"entropy": 5.869012546539307,
"epoch": 4.626557799742157,
"grad_norm": 1.1171875,
"learning_rate": 0.00033613009913720845,
"loss": 5.2707,
"mean_token_accuracy": 0.20299201905727388,
"num_tokens": 9793947.0,
"step": 5385
},
{
"entropy": 5.734190225601196,
"epoch": 4.6308551783412115,
"grad_norm": 1.234375,
"learning_rate": 0.00033581005608768563,
"loss": 5.2453,
"mean_token_accuracy": 0.2124895542860031,
"num_tokens": 9803593.0,
"step": 5390
},
{
"entropy": 5.793021965026855,
"epoch": 4.635152556940266,
"grad_norm": 1.1953125,
"learning_rate": 0.0003354898802533058,
"loss": 5.2855,
"mean_token_accuracy": 0.20431207865476608,
"num_tokens": 9812295.0,
"step": 5395
},
{
"entropy": 5.791452312469483,
"epoch": 4.6394499355393215,
"grad_norm": 1.125,
"learning_rate": 0.0003351695723332051,
"loss": 5.2934,
"mean_token_accuracy": 0.2097485601902008,
"num_tokens": 9820586.0,
"step": 5400
},
{
"entropy": 5.798425960540771,
"epoch": 4.643747314138375,
"grad_norm": 1.1484375,
"learning_rate": 0.00033484913302680807,
"loss": 5.2279,
"mean_token_accuracy": 0.21040427088737487,
"num_tokens": 9829080.0,
"step": 5405
},
{
"entropy": 5.796739912033081,
"epoch": 4.648044692737431,
"grad_norm": 1.125,
"learning_rate": 0.00033452856303382595,
"loss": 5.2475,
"mean_token_accuracy": 0.20435117036104203,
"num_tokens": 9838421.0,
"step": 5410
},
{
"entropy": 5.759791278839112,
"epoch": 4.652342071336484,
"grad_norm": 1.3515625,
"learning_rate": 0.0003342078630542555,
"loss": 5.2524,
"mean_token_accuracy": 0.21281823366880417,
"num_tokens": 9847151.0,
"step": 5415
},
{
"entropy": 5.807016801834107,
"epoch": 4.65663944993554,
"grad_norm": 1.171875,
"learning_rate": 0.00033388703378837737,
"loss": 5.275,
"mean_token_accuracy": 0.20886558741331102,
"num_tokens": 9856803.0,
"step": 5420
},
{
"entropy": 5.791787147521973,
"epoch": 4.660936828534594,
"grad_norm": 1.1953125,
"learning_rate": 0.0003335660759367544,
"loss": 5.1847,
"mean_token_accuracy": 0.22501839995384215,
"num_tokens": 9865617.0,
"step": 5425
},
{
"entropy": 5.765948724746704,
"epoch": 4.665234207133649,
"grad_norm": 1.1328125,
"learning_rate": 0.00033324499020023025,
"loss": 5.2534,
"mean_token_accuracy": 0.21098006069660186,
"num_tokens": 9875454.0,
"step": 5430
},
{
"entropy": 5.817541313171387,
"epoch": 4.669531585732703,
"grad_norm": 1.1875,
"learning_rate": 0.0003329237772799277,
"loss": 5.2502,
"mean_token_accuracy": 0.20961165130138398,
"num_tokens": 9884770.0,
"step": 5435
},
{
"entropy": 5.783469343185425,
"epoch": 4.673828964331758,
"grad_norm": 1.2421875,
"learning_rate": 0.0003326024378772477,
"loss": 5.2538,
"mean_token_accuracy": 0.2091410353779793,
"num_tokens": 9893594.0,
"step": 5440
},
{
"entropy": 5.793620014190674,
"epoch": 4.678126342930812,
"grad_norm": 1.109375,
"learning_rate": 0.0003322809726938667,
"loss": 5.3607,
"mean_token_accuracy": 0.19666333645582199,
"num_tokens": 9902260.0,
"step": 5445
},
{
"entropy": 5.804405307769775,
"epoch": 4.682423721529867,
"grad_norm": 1.2265625,
"learning_rate": 0.00033195938243173645,
"loss": 5.2657,
"mean_token_accuracy": 0.20829562693834305,
"num_tokens": 9911020.0,
"step": 5450
},
{
"entropy": 5.8101622581481935,
"epoch": 4.6867211001289215,
"grad_norm": 1.3515625,
"learning_rate": 0.0003316376677930814,
"loss": 5.277,
"mean_token_accuracy": 0.20017611235380173,
"num_tokens": 9918696.0,
"step": 5455
},
{
"entropy": 5.745956611633301,
"epoch": 4.691018478727976,
"grad_norm": 1.21875,
"learning_rate": 0.0003313158294803977,
"loss": 5.3171,
"mean_token_accuracy": 0.1995955988764763,
"num_tokens": 9927638.0,
"step": 5460
},
{
"entropy": 5.824975442886353,
"epoch": 4.695315857327031,
"grad_norm": 1.2109375,
"learning_rate": 0.00033099386819645176,
"loss": 5.2912,
"mean_token_accuracy": 0.20382552444934846,
"num_tokens": 9936969.0,
"step": 5465
},
{
"entropy": 5.796650314331055,
"epoch": 4.699613235926085,
"grad_norm": 1.046875,
"learning_rate": 0.0003306717846442782,
"loss": 5.1993,
"mean_token_accuracy": 0.20417630672454834,
"num_tokens": 9945229.0,
"step": 5470
},
{
"entropy": 5.7901218891143795,
"epoch": 4.70391061452514,
"grad_norm": 1.25,
"learning_rate": 0.0003303495795271788,
"loss": 5.1995,
"mean_token_accuracy": 0.20233412235975265,
"num_tokens": 9953759.0,
"step": 5475
},
{
"entropy": 5.770085334777832,
"epoch": 4.708207993124194,
"grad_norm": 1.140625,
"learning_rate": 0.00033002725354872075,
"loss": 5.3092,
"mean_token_accuracy": 0.2047215849161148,
"num_tokens": 9962771.0,
"step": 5480
},
{
"entropy": 5.800899696350098,
"epoch": 4.712505371723249,
"grad_norm": 1.3203125,
"learning_rate": 0.00032970480741273514,
"loss": 5.3104,
"mean_token_accuracy": 0.19106538593769073,
"num_tokens": 9972481.0,
"step": 5485
},
{
"entropy": 5.8685791015625,
"epoch": 4.716802750322303,
"grad_norm": 1.390625,
"learning_rate": 0.0003293822418233155,
"loss": 5.256,
"mean_token_accuracy": 0.2051583468914032,
"num_tokens": 9980773.0,
"step": 5490
},
{
"entropy": 5.8781898021698,
"epoch": 4.721100128921358,
"grad_norm": 1.1953125,
"learning_rate": 0.0003290595574848161,
"loss": 5.3453,
"mean_token_accuracy": 0.19384868294000626,
"num_tokens": 9989830.0,
"step": 5495
},
{
"entropy": 5.756228923797607,
"epoch": 4.725397507520412,
"grad_norm": 1.15625,
"learning_rate": 0.0003287367551018505,
"loss": 5.272,
"mean_token_accuracy": 0.20579312443733216,
"num_tokens": 9999234.0,
"step": 5500
},
{
"epoch": 4.725397507520412,
"eval_entropy": 5.592626677977072,
"eval_loss": 5.931019306182861,
"eval_mean_token_accuracy": 0.17753368537235367,
"eval_num_tokens": 9999234.0,
"eval_runtime": 2.0334,
"eval_samples_per_second": 1745.336,
"eval_steps_per_second": 218.351,
"step": 5500
},
{
"entropy": 5.8168501377105715,
"epoch": 4.729694886119467,
"grad_norm": 1.1015625,
"learning_rate": 0.0003284138353792903,
"loss": 5.3383,
"mean_token_accuracy": 0.2040895164012909,
"num_tokens": 10008671.0,
"step": 5505
},
{
"entropy": 5.784496402740478,
"epoch": 4.7339922647185215,
"grad_norm": 1.265625,
"learning_rate": 0.0003280907990222628,
"loss": 5.2985,
"mean_token_accuracy": 0.2070325642824173,
"num_tokens": 10017170.0,
"step": 5510
},
{
"entropy": 5.77425446510315,
"epoch": 4.738289643317576,
"grad_norm": 1.3125,
"learning_rate": 0.00032776764673615055,
"loss": 5.3156,
"mean_token_accuracy": 0.20255803018808366,
"num_tokens": 10025712.0,
"step": 5515
},
{
"entropy": 5.8088236331939695,
"epoch": 4.742587021916631,
"grad_norm": 1.09375,
"learning_rate": 0.0003274443792265888,
"loss": 5.3115,
"mean_token_accuracy": 0.21292225122451783,
"num_tokens": 10035297.0,
"step": 5520
},
{
"entropy": 5.837254667282105,
"epoch": 4.746884400515685,
"grad_norm": 1.1796875,
"learning_rate": 0.00032712099719946474,
"loss": 5.278,
"mean_token_accuracy": 0.21366028040647506,
"num_tokens": 10043903.0,
"step": 5525
},
{
"entropy": 5.739489364624023,
"epoch": 4.75118177911474,
"grad_norm": 1.2265625,
"learning_rate": 0.00032679750136091533,
"loss": 5.3269,
"mean_token_accuracy": 0.20195425003767015,
"num_tokens": 10053035.0,
"step": 5530
},
{
"entropy": 5.721313333511352,
"epoch": 4.755479157713794,
"grad_norm": 1.25,
"learning_rate": 0.0003264738924173262,
"loss": 5.2737,
"mean_token_accuracy": 0.20684178918600082,
"num_tokens": 10061911.0,
"step": 5535
},
{
"entropy": 5.809173583984375,
"epoch": 4.759776536312849,
"grad_norm": 1.1875,
"learning_rate": 0.00032615017107533,
"loss": 5.2765,
"mean_token_accuracy": 0.2063768208026886,
"num_tokens": 10070738.0,
"step": 5540
},
{
"entropy": 5.802584886550903,
"epoch": 4.764073914911903,
"grad_norm": 1.21875,
"learning_rate": 0.0003258263380418047,
"loss": 5.2855,
"mean_token_accuracy": 0.20578781515359879,
"num_tokens": 10080638.0,
"step": 5545
},
{
"entropy": 5.914425611495972,
"epoch": 4.768371293510958,
"grad_norm": 1.2421875,
"learning_rate": 0.00032550239402387226,
"loss": 5.3363,
"mean_token_accuracy": 0.19763863384723662,
"num_tokens": 10089429.0,
"step": 5550
},
{
"entropy": 5.7599162578582765,
"epoch": 4.772668672110013,
"grad_norm": 1.1953125,
"learning_rate": 0.00032517833972889695,
"loss": 5.206,
"mean_token_accuracy": 0.2099302053451538,
"num_tokens": 10098109.0,
"step": 5555
},
{
"entropy": 5.811598682403565,
"epoch": 4.776966050709067,
"grad_norm": 1.1953125,
"learning_rate": 0.00032485417586448375,
"loss": 5.3145,
"mean_token_accuracy": 0.20163995772600174,
"num_tokens": 10106808.0,
"step": 5560
},
{
"entropy": 5.86333212852478,
"epoch": 4.781263429308122,
"grad_norm": 1.28125,
"learning_rate": 0.000324529903138477,
"loss": 5.3143,
"mean_token_accuracy": 0.20143208354711534,
"num_tokens": 10116372.0,
"step": 5565
},
{
"entropy": 5.801443433761596,
"epoch": 4.785560807907177,
"grad_norm": 1.1875,
"learning_rate": 0.0003242055222589587,
"loss": 5.2258,
"mean_token_accuracy": 0.21505694687366486,
"num_tokens": 10125256.0,
"step": 5570
},
{
"entropy": 5.808296251296997,
"epoch": 4.7898581865062315,
"grad_norm": 1.15625,
"learning_rate": 0.000323881033934247,
"loss": 5.3535,
"mean_token_accuracy": 0.19890447854995727,
"num_tokens": 10134784.0,
"step": 5575
},
{
"entropy": 5.8784411430358885,
"epoch": 4.794155565105286,
"grad_norm": 1.265625,
"learning_rate": 0.00032355643887289486,
"loss": 5.289,
"mean_token_accuracy": 0.2091620832681656,
"num_tokens": 10144324.0,
"step": 5580
},
{
"entropy": 5.851370334625244,
"epoch": 4.798452943704341,
"grad_norm": 1.2109375,
"learning_rate": 0.0003232317377836881,
"loss": 5.329,
"mean_token_accuracy": 0.19960423558950424,
"num_tokens": 10152866.0,
"step": 5585
},
{
"entropy": 5.782239103317261,
"epoch": 4.802750322303395,
"grad_norm": 1.1796875,
"learning_rate": 0.000322906931375644,
"loss": 5.2522,
"mean_token_accuracy": 0.2105662614107132,
"num_tokens": 10162457.0,
"step": 5590
},
{
"entropy": 5.767592477798462,
"epoch": 4.80704770090245,
"grad_norm": 1.171875,
"learning_rate": 0.00032258202035801,
"loss": 5.3246,
"mean_token_accuracy": 0.1998135194182396,
"num_tokens": 10171604.0,
"step": 5595
},
{
"entropy": 5.87656307220459,
"epoch": 4.811345079501504,
"grad_norm": 1.15625,
"learning_rate": 0.000322257005440262,
"loss": 5.2848,
"mean_token_accuracy": 0.20817122161388396,
"num_tokens": 10180762.0,
"step": 5600
},
{
"entropy": 5.740377759933471,
"epoch": 4.815642458100559,
"grad_norm": 1.1484375,
"learning_rate": 0.0003219318873321025,
"loss": 5.2017,
"mean_token_accuracy": 0.22599002420902253,
"num_tokens": 10189122.0,
"step": 5605
},
{
"entropy": 5.7609411716461185,
"epoch": 4.819939836699613,
"grad_norm": 1.1328125,
"learning_rate": 0.00032160666674345954,
"loss": 5.3069,
"mean_token_accuracy": 0.19678669720888137,
"num_tokens": 10197280.0,
"step": 5610
},
{
"entropy": 5.834485340118408,
"epoch": 4.824237215298668,
"grad_norm": 1.0859375,
"learning_rate": 0.00032128134438448504,
"loss": 5.3481,
"mean_token_accuracy": 0.19607715606689452,
"num_tokens": 10207507.0,
"step": 5615
},
{
"entropy": 5.843401002883911,
"epoch": 4.828534593897722,
"grad_norm": 1.2265625,
"learning_rate": 0.00032095592096555284,
"loss": 5.3241,
"mean_token_accuracy": 0.19834306091070175,
"num_tokens": 10217584.0,
"step": 5620
},
{
"entropy": 5.804885768890381,
"epoch": 4.832831972496777,
"grad_norm": 1.1484375,
"learning_rate": 0.0003206303971972577,
"loss": 5.257,
"mean_token_accuracy": 0.21136587262153625,
"num_tokens": 10226388.0,
"step": 5625
},
{
"entropy": 5.773497581481934,
"epoch": 4.8371293510958315,
"grad_norm": 1.25,
"learning_rate": 0.0003203047737904134,
"loss": 5.2796,
"mean_token_accuracy": 0.20816502273082732,
"num_tokens": 10235333.0,
"step": 5630
},
{
"entropy": 5.809968280792236,
"epoch": 4.841426729694886,
"grad_norm": 1.2578125,
"learning_rate": 0.00031997905145605135,
"loss": 5.3218,
"mean_token_accuracy": 0.20077406167984008,
"num_tokens": 10243985.0,
"step": 5635
},
{
"entropy": 5.875359725952149,
"epoch": 4.845724108293941,
"grad_norm": 1.1953125,
"learning_rate": 0.00031965323090541874,
"loss": 5.3292,
"mean_token_accuracy": 0.19166997075080872,
"num_tokens": 10252968.0,
"step": 5640
},
{
"entropy": 5.836591100692749,
"epoch": 4.850021486892995,
"grad_norm": 1.1875,
"learning_rate": 0.0003193273128499777,
"loss": 5.1951,
"mean_token_accuracy": 0.20659874528646469,
"num_tokens": 10261890.0,
"step": 5645
},
{
"entropy": 5.755848217010498,
"epoch": 4.85431886549205,
"grad_norm": 1.21875,
"learning_rate": 0.00031900129800140287,
"loss": 5.3049,
"mean_token_accuracy": 0.20563669949769975,
"num_tokens": 10271363.0,
"step": 5650
},
{
"entropy": 5.803152656555175,
"epoch": 4.858616244091104,
"grad_norm": 1.34375,
"learning_rate": 0.00031867518707158027,
"loss": 5.335,
"mean_token_accuracy": 0.19770116060972215,
"num_tokens": 10280608.0,
"step": 5655
},
{
"entropy": 5.818397092819214,
"epoch": 4.862913622690159,
"grad_norm": 1.1875,
"learning_rate": 0.000318348980772606,
"loss": 5.2525,
"mean_token_accuracy": 0.20726011395454408,
"num_tokens": 10289972.0,
"step": 5660
},
{
"entropy": 5.860452508926391,
"epoch": 4.867211001289213,
"grad_norm": 1.3203125,
"learning_rate": 0.00031802267981678414,
"loss": 5.3123,
"mean_token_accuracy": 0.20409038215875625,
"num_tokens": 10298740.0,
"step": 5665
},
{
"entropy": 5.833805656433105,
"epoch": 4.871508379888268,
"grad_norm": 1.078125,
"learning_rate": 0.00031769628491662563,
"loss": 5.2809,
"mean_token_accuracy": 0.20727547705173494,
"num_tokens": 10307706.0,
"step": 5670
},
{
"entropy": 5.803285360336304,
"epoch": 4.8758057584873224,
"grad_norm": 1.0703125,
"learning_rate": 0.00031736979678484634,
"loss": 5.329,
"mean_token_accuracy": 0.2064347133040428,
"num_tokens": 10317549.0,
"step": 5675
},
{
"entropy": 5.820219898223877,
"epoch": 4.880103137086377,
"grad_norm": 1.1640625,
"learning_rate": 0.00031704321613436597,
"loss": 5.3611,
"mean_token_accuracy": 0.19811774492263795,
"num_tokens": 10327681.0,
"step": 5680
},
{
"entropy": 5.755207061767578,
"epoch": 4.8844005156854315,
"grad_norm": 1.0859375,
"learning_rate": 0.0003167165436783061,
"loss": 5.2873,
"mean_token_accuracy": 0.21109480857849122,
"num_tokens": 10336261.0,
"step": 5685
},
{
"entropy": 5.727659749984741,
"epoch": 4.888697894284486,
"grad_norm": 1.234375,
"learning_rate": 0.00031638978012998875,
"loss": 5.2052,
"mean_token_accuracy": 0.21589273661375047,
"num_tokens": 10344770.0,
"step": 5690
},
{
"entropy": 5.819532823562622,
"epoch": 4.892995272883541,
"grad_norm": 1.1796875,
"learning_rate": 0.000316062926202935,
"loss": 5.3654,
"mean_token_accuracy": 0.196533140540123,
"num_tokens": 10354246.0,
"step": 5695
},
{
"entropy": 5.840965127944946,
"epoch": 4.897292651482596,
"grad_norm": 1.328125,
"learning_rate": 0.0003157359826108632,
"loss": 5.2826,
"mean_token_accuracy": 0.20469695180654526,
"num_tokens": 10362693.0,
"step": 5700
},
{
"entropy": 5.81398639678955,
"epoch": 4.90159003008165,
"grad_norm": 1.1953125,
"learning_rate": 0.00031540895006768727,
"loss": 5.2798,
"mean_token_accuracy": 0.20513766556978225,
"num_tokens": 10371639.0,
"step": 5705
},
{
"entropy": 5.778346061706543,
"epoch": 4.905887408680705,
"grad_norm": 1.1015625,
"learning_rate": 0.0003150818292875158,
"loss": 5.2986,
"mean_token_accuracy": 0.20309000611305236,
"num_tokens": 10381237.0,
"step": 5710
},
{
"entropy": 5.8171275615692135,
"epoch": 4.91018478727976,
"grad_norm": 1.203125,
"learning_rate": 0.0003147546209846497,
"loss": 5.2726,
"mean_token_accuracy": 0.20377830415964127,
"num_tokens": 10389932.0,
"step": 5715
},
{
"entropy": 5.729213857650757,
"epoch": 4.914482165878814,
"grad_norm": 1.2109375,
"learning_rate": 0.0003144273258735812,
"loss": 5.1685,
"mean_token_accuracy": 0.2099962517619133,
"num_tokens": 10398938.0,
"step": 5720
},
{
"entropy": 5.770321941375732,
"epoch": 4.918779544477869,
"grad_norm": 1.453125,
"learning_rate": 0.0003140999446689919,
"loss": 5.2774,
"mean_token_accuracy": 0.20088756531476976,
"num_tokens": 10407980.0,
"step": 5725
},
{
"entropy": 5.770383071899414,
"epoch": 4.923076923076923,
"grad_norm": 1.421875,
"learning_rate": 0.0003137724780857516,
"loss": 5.3436,
"mean_token_accuracy": 0.20163364857435226,
"num_tokens": 10416990.0,
"step": 5730
},
{
"entropy": 5.863169193267822,
"epoch": 4.927374301675978,
"grad_norm": 1.0625,
"learning_rate": 0.00031344492683891634,
"loss": 5.3587,
"mean_token_accuracy": 0.204886694252491,
"num_tokens": 10426573.0,
"step": 5735
},
{
"entropy": 5.851557922363281,
"epoch": 4.931671680275032,
"grad_norm": 1.265625,
"learning_rate": 0.0003131172916437272,
"loss": 5.3233,
"mean_token_accuracy": 0.19881743043661118,
"num_tokens": 10435162.0,
"step": 5740
},
{
"entropy": 5.759703254699707,
"epoch": 4.935969058874087,
"grad_norm": 1.2578125,
"learning_rate": 0.00031278957321560845,
"loss": 5.3238,
"mean_token_accuracy": 0.20171435326337814,
"num_tokens": 10444374.0,
"step": 5745
},
{
"entropy": 5.877453994750977,
"epoch": 4.9402664374731415,
"grad_norm": 1.390625,
"learning_rate": 0.00031246177227016615,
"loss": 5.3411,
"mean_token_accuracy": 0.1953754648566246,
"num_tokens": 10452679.0,
"step": 5750
},
{
"entropy": 5.818898057937622,
"epoch": 4.944563816072196,
"grad_norm": 1.1640625,
"learning_rate": 0.00031213388952318653,
"loss": 5.2927,
"mean_token_accuracy": 0.2119872346520424,
"num_tokens": 10461801.0,
"step": 5755
},
{
"entropy": 5.807455348968506,
"epoch": 4.948861194671251,
"grad_norm": 1.125,
"learning_rate": 0.0003118059256906345,
"loss": 5.2809,
"mean_token_accuracy": 0.20208909511566162,
"num_tokens": 10471176.0,
"step": 5760
},
{
"entropy": 5.840291500091553,
"epoch": 4.953158573270305,
"grad_norm": 1.25,
"learning_rate": 0.00031147788148865204,
"loss": 5.342,
"mean_token_accuracy": 0.19445150792598725,
"num_tokens": 10480403.0,
"step": 5765
},
{
"entropy": 5.799935436248779,
"epoch": 4.95745595186936,
"grad_norm": 1.1484375,
"learning_rate": 0.0003111497576335564,
"loss": 5.2761,
"mean_token_accuracy": 0.20416030585765838,
"num_tokens": 10489574.0,
"step": 5770
},
{
"entropy": 5.798885202407837,
"epoch": 4.961753330468414,
"grad_norm": 1.015625,
"learning_rate": 0.0003108215548418391,
"loss": 5.2857,
"mean_token_accuracy": 0.20692466497421264,
"num_tokens": 10499631.0,
"step": 5775
},
{
"entropy": 5.770237159729004,
"epoch": 4.966050709067469,
"grad_norm": 1.4375,
"learning_rate": 0.0003104932738301637,
"loss": 5.2702,
"mean_token_accuracy": 0.2006146103143692,
"num_tokens": 10508128.0,
"step": 5780
},
{
"entropy": 5.782887268066406,
"epoch": 4.970348087666523,
"grad_norm": 1.171875,
"learning_rate": 0.00031016491531536477,
"loss": 5.2448,
"mean_token_accuracy": 0.2088773876428604,
"num_tokens": 10517544.0,
"step": 5785
},
{
"entropy": 5.767910051345825,
"epoch": 4.974645466265578,
"grad_norm": 1.234375,
"learning_rate": 0.0003098364800144462,
"loss": 5.3132,
"mean_token_accuracy": 0.20686182081699372,
"num_tokens": 10526244.0,
"step": 5790
},
{
"entropy": 5.888048982620239,
"epoch": 4.9789428448646325,
"grad_norm": 1.2109375,
"learning_rate": 0.0003095079686445792,
"loss": 5.3812,
"mean_token_accuracy": 0.20500532984733583,
"num_tokens": 10535887.0,
"step": 5795
},
{
"entropy": 5.821762752532959,
"epoch": 4.983240223463687,
"grad_norm": 1.15625,
"learning_rate": 0.00030917938192310146,
"loss": 5.2341,
"mean_token_accuracy": 0.20750374495983123,
"num_tokens": 10544420.0,
"step": 5800
},
{
"entropy": 5.807241201400757,
"epoch": 4.9875376020627415,
"grad_norm": 1.1328125,
"learning_rate": 0.00030885072056751494,
"loss": 5.3215,
"mean_token_accuracy": 0.20071204453706742,
"num_tokens": 10553114.0,
"step": 5805
},
{
"entropy": 5.778054094314575,
"epoch": 4.991834980661796,
"grad_norm": 1.1171875,
"learning_rate": 0.00030852198529548476,
"loss": 5.3415,
"mean_token_accuracy": 0.2032615214586258,
"num_tokens": 10562272.0,
"step": 5810
},
{
"entropy": 5.810254526138306,
"epoch": 4.996132359260851,
"grad_norm": 1.21875,
"learning_rate": 0.0003081931768248373,
"loss": 5.2967,
"mean_token_accuracy": 0.20949976444244384,
"num_tokens": 10571757.0,
"step": 5815
},
{
"entropy": 5.7521191173129615,
"epoch": 5.0,
"grad_norm": 1.796875,
"learning_rate": 0.0003078642958735588,
"loss": 5.238,
"mean_token_accuracy": 0.2155479672882292,
"num_tokens": 10579660.0,
"step": 5820
},
{
"entropy": 5.837719202041626,
"epoch": 5.0042973785990545,
"grad_norm": 1.203125,
"learning_rate": 0.00030753534315979393,
"loss": 5.1272,
"mean_token_accuracy": 0.21314742416143417,
"num_tokens": 10589139.0,
"step": 5825
},
{
"entropy": 5.797129201889038,
"epoch": 5.008594757198109,
"grad_norm": 1.2890625,
"learning_rate": 0.0003072063194018438,
"loss": 4.9242,
"mean_token_accuracy": 0.2357708305120468,
"num_tokens": 10597915.0,
"step": 5830
},
{
"entropy": 5.798255205154419,
"epoch": 5.012892135797164,
"grad_norm": 1.1953125,
"learning_rate": 0.0003068772253181648,
"loss": 5.1386,
"mean_token_accuracy": 0.21062317192554475,
"num_tokens": 10606491.0,
"step": 5835
},
{
"entropy": 5.813257837295533,
"epoch": 5.017189514396218,
"grad_norm": 1.0546875,
"learning_rate": 0.0003065480616273671,
"loss": 5.117,
"mean_token_accuracy": 0.21578232049942017,
"num_tokens": 10615852.0,
"step": 5840
},
{
"entropy": 5.866687679290772,
"epoch": 5.021486892995273,
"grad_norm": 1.2734375,
"learning_rate": 0.0003062188290482123,
"loss": 5.0954,
"mean_token_accuracy": 0.21719059944152833,
"num_tokens": 10625442.0,
"step": 5845
},
{
"entropy": 5.775999212265015,
"epoch": 5.025784271594327,
"grad_norm": 1.078125,
"learning_rate": 0.00030588952829961304,
"loss": 5.0303,
"mean_token_accuracy": 0.22669098973274232,
"num_tokens": 10634972.0,
"step": 5850
},
{
"entropy": 5.820281648635865,
"epoch": 5.030081650193382,
"grad_norm": 1.234375,
"learning_rate": 0.0003055601601006303,
"loss": 5.0305,
"mean_token_accuracy": 0.21823905259370804,
"num_tokens": 10644487.0,
"step": 5855
},
{
"entropy": 5.800490808486939,
"epoch": 5.034379028792436,
"grad_norm": 1.2578125,
"learning_rate": 0.0003052307251704728,
"loss": 5.0487,
"mean_token_accuracy": 0.23768426477909088,
"num_tokens": 10654144.0,
"step": 5860
},
{
"entropy": 5.776832151412964,
"epoch": 5.038676407391491,
"grad_norm": 1.4375,
"learning_rate": 0.0003049012242284946,
"loss": 5.0901,
"mean_token_accuracy": 0.21536518186330794,
"num_tokens": 10663023.0,
"step": 5865
},
{
"entropy": 5.837555408477783,
"epoch": 5.0429737859905455,
"grad_norm": 1.2109375,
"learning_rate": 0.0003045716579941941,
"loss": 5.1255,
"mean_token_accuracy": 0.21810145378112794,
"num_tokens": 10672001.0,
"step": 5870
},
{
"entropy": 5.780414390563965,
"epoch": 5.0472711645896,
"grad_norm": 1.0859375,
"learning_rate": 0.00030424202718721215,
"loss": 5.095,
"mean_token_accuracy": 0.21664355248212813,
"num_tokens": 10682654.0,
"step": 5875
},
{
"entropy": 5.7828668594360355,
"epoch": 5.051568543188655,
"grad_norm": 1.2734375,
"learning_rate": 0.00030391233252733085,
"loss": 5.0795,
"mean_token_accuracy": 0.22158391326665877,
"num_tokens": 10691429.0,
"step": 5880
},
{
"entropy": 5.7656014919281,
"epoch": 5.055865921787709,
"grad_norm": 1.1640625,
"learning_rate": 0.00030358257473447144,
"loss": 5.0694,
"mean_token_accuracy": 0.22057975679636002,
"num_tokens": 10701130.0,
"step": 5885
},
{
"entropy": 5.748148441314697,
"epoch": 5.060163300386764,
"grad_norm": 1.203125,
"learning_rate": 0.00030325275452869316,
"loss": 5.0128,
"mean_token_accuracy": 0.2287128150463104,
"num_tokens": 10709779.0,
"step": 5890
},
{
"entropy": 5.718500375747681,
"epoch": 5.064460678985819,
"grad_norm": 1.234375,
"learning_rate": 0.00030292287263019153,
"loss": 5.0811,
"mean_token_accuracy": 0.22246713042259217,
"num_tokens": 10718795.0,
"step": 5895
},
{
"entropy": 5.670262241363526,
"epoch": 5.068758057584874,
"grad_norm": 1.21875,
"learning_rate": 0.00030259292975929675,
"loss": 4.9893,
"mean_token_accuracy": 0.23385845869779587,
"num_tokens": 10728202.0,
"step": 5900
},
{
"entropy": 5.783095836639404,
"epoch": 5.073055436183928,
"grad_norm": 1.109375,
"learning_rate": 0.0003022629266364723,
"loss": 5.0571,
"mean_token_accuracy": 0.2212306410074234,
"num_tokens": 10737050.0,
"step": 5905
},
{
"entropy": 5.780191469192505,
"epoch": 5.077352814782983,
"grad_norm": 1.2890625,
"learning_rate": 0.00030193286398231276,
"loss": 5.0425,
"mean_token_accuracy": 0.2253589302301407,
"num_tokens": 10745261.0,
"step": 5910
},
{
"entropy": 5.737091827392578,
"epoch": 5.081650193382037,
"grad_norm": 1.21875,
"learning_rate": 0.00030160274251754337,
"loss": 5.1055,
"mean_token_accuracy": 0.2190812796354294,
"num_tokens": 10755008.0,
"step": 5915
},
{
"entropy": 5.78898344039917,
"epoch": 5.085947571981092,
"grad_norm": 1.2265625,
"learning_rate": 0.00030127256296301724,
"loss": 5.1151,
"mean_token_accuracy": 0.21488914340734483,
"num_tokens": 10763951.0,
"step": 5920
},
{
"entropy": 5.804350757598877,
"epoch": 5.090244950580146,
"grad_norm": 1.140625,
"learning_rate": 0.0003009423260397148,
"loss": 5.0493,
"mean_token_accuracy": 0.2211121663451195,
"num_tokens": 10772770.0,
"step": 5925
},
{
"entropy": 5.7645410060882565,
"epoch": 5.094542329179201,
"grad_norm": 1.171875,
"learning_rate": 0.00030061203246874125,
"loss": 5.126,
"mean_token_accuracy": 0.21714986562728883,
"num_tokens": 10781827.0,
"step": 5930
},
{
"entropy": 5.829766368865966,
"epoch": 5.0988397077782555,
"grad_norm": 1.2578125,
"learning_rate": 0.00030028168297132593,
"loss": 5.1971,
"mean_token_accuracy": 0.21828972399234772,
"num_tokens": 10792321.0,
"step": 5935
},
{
"entropy": 5.7105169773101805,
"epoch": 5.10313708637731,
"grad_norm": 1.328125,
"learning_rate": 0.0002999512782688199,
"loss": 5.1534,
"mean_token_accuracy": 0.22245844155550004,
"num_tokens": 10801689.0,
"step": 5940
},
{
"entropy": 5.806783151626587,
"epoch": 5.1074344649763646,
"grad_norm": 1.203125,
"learning_rate": 0.0002996208190826951,
"loss": 5.0674,
"mean_token_accuracy": 0.21947959512472154,
"num_tokens": 10810513.0,
"step": 5945
},
{
"entropy": 5.784919166564942,
"epoch": 5.111731843575419,
"grad_norm": 1.1875,
"learning_rate": 0.00029929030613454227,
"loss": 5.0423,
"mean_token_accuracy": 0.2230915367603302,
"num_tokens": 10819581.0,
"step": 5950
},
{
"entropy": 5.73317141532898,
"epoch": 5.116029222174474,
"grad_norm": 1.1953125,
"learning_rate": 0.0002989597401460697,
"loss": 5.0427,
"mean_token_accuracy": 0.22367439568042755,
"num_tokens": 10828139.0,
"step": 5955
},
{
"entropy": 5.761006593704224,
"epoch": 5.120326600773528,
"grad_norm": 1.328125,
"learning_rate": 0.00029862912183910105,
"loss": 5.0515,
"mean_token_accuracy": 0.2288846716284752,
"num_tokens": 10836256.0,
"step": 5960
},
{
"entropy": 5.753820896148682,
"epoch": 5.124623979372583,
"grad_norm": 1.1875,
"learning_rate": 0.00029829845193557496,
"loss": 5.0806,
"mean_token_accuracy": 0.22265101224184036,
"num_tokens": 10846255.0,
"step": 5965
},
{
"entropy": 5.755499696731567,
"epoch": 5.128921357971637,
"grad_norm": 1.234375,
"learning_rate": 0.0002979677311575421,
"loss": 5.0762,
"mean_token_accuracy": 0.2280671551823616,
"num_tokens": 10855546.0,
"step": 5970
},
{
"entropy": 5.836764097213745,
"epoch": 5.133218736570692,
"grad_norm": 1.296875,
"learning_rate": 0.0002976369602271646,
"loss": 5.1451,
"mean_token_accuracy": 0.21697065979242325,
"num_tokens": 10864417.0,
"step": 5975
},
{
"entropy": 5.814312410354614,
"epoch": 5.137516115169746,
"grad_norm": 1.1484375,
"learning_rate": 0.0002973061398667138,
"loss": 5.1172,
"mean_token_accuracy": 0.21944077759981157,
"num_tokens": 10874527.0,
"step": 5980
},
{
"entropy": 5.744282293319702,
"epoch": 5.141813493768801,
"grad_norm": 1.296875,
"learning_rate": 0.00029697527079856916,
"loss": 5.1538,
"mean_token_accuracy": 0.21050577759742736,
"num_tokens": 10883486.0,
"step": 5985
},
{
"entropy": 5.761128902435303,
"epoch": 5.1461108723678555,
"grad_norm": 1.2890625,
"learning_rate": 0.00029664435374521665,
"loss": 5.0225,
"mean_token_accuracy": 0.2243912473320961,
"num_tokens": 10891972.0,
"step": 5990
},
{
"entropy": 5.789876794815063,
"epoch": 5.15040825096691,
"grad_norm": 1.2109375,
"learning_rate": 0.00029631338942924664,
"loss": 5.0419,
"mean_token_accuracy": 0.22999733984470366,
"num_tokens": 10901350.0,
"step": 5995
},
{
"entropy": 5.719483852386475,
"epoch": 5.154705629565965,
"grad_norm": 1.359375,
"learning_rate": 0.0002959823785733531,
"loss": 5.03,
"mean_token_accuracy": 0.22308178544044494,
"num_tokens": 10910114.0,
"step": 6000
},
{
"epoch": 5.154705629565965,
"eval_entropy": 5.520973839738348,
"eval_loss": 5.938914775848389,
"eval_mean_token_accuracy": 0.1778814101440681,
"eval_num_tokens": 10910114.0,
"eval_runtime": 2.0532,
"eval_samples_per_second": 1728.506,
"eval_steps_per_second": 216.246,
"step": 6000
},
{
"entropy": 5.724570035934448,
"epoch": 5.159003008165019,
"grad_norm": 1.296875,
"learning_rate": 0.0002956513219003312,
"loss": 5.1102,
"mean_token_accuracy": 0.22220082879066466,
"num_tokens": 10919781.0,
"step": 6005
},
{
"entropy": 5.794006776809693,
"epoch": 5.163300386764074,
"grad_norm": 1.234375,
"learning_rate": 0.00029532022013307666,
"loss": 5.107,
"mean_token_accuracy": 0.2223549634218216,
"num_tokens": 10929561.0,
"step": 6010
},
{
"entropy": 5.776598358154297,
"epoch": 5.167597765363128,
"grad_norm": 1.1640625,
"learning_rate": 0.00029498907399458325,
"loss": 5.0801,
"mean_token_accuracy": 0.21775457113981248,
"num_tokens": 10939123.0,
"step": 6015
},
{
"entropy": 5.791857671737671,
"epoch": 5.171895143962183,
"grad_norm": 1.25,
"learning_rate": 0.0002946578842079418,
"loss": 5.111,
"mean_token_accuracy": 0.22705103009939193,
"num_tokens": 10947990.0,
"step": 6020
},
{
"entropy": 5.7706746578216555,
"epoch": 5.176192522561237,
"grad_norm": 1.328125,
"learning_rate": 0.0002943266514963384,
"loss": 5.0366,
"mean_token_accuracy": 0.23295889049768448,
"num_tokens": 10956569.0,
"step": 6025
},
{
"entropy": 5.732059144973755,
"epoch": 5.180489901160292,
"grad_norm": 1.296875,
"learning_rate": 0.0002939953765830529,
"loss": 5.0807,
"mean_token_accuracy": 0.21273524165153504,
"num_tokens": 10965466.0,
"step": 6030
},
{
"entropy": 5.758616828918457,
"epoch": 5.184787279759346,
"grad_norm": 1.15625,
"learning_rate": 0.00029366406019145735,
"loss": 5.0975,
"mean_token_accuracy": 0.22184741795063018,
"num_tokens": 10975051.0,
"step": 6035
},
{
"entropy": 5.697403001785278,
"epoch": 5.189084658358402,
"grad_norm": 1.2890625,
"learning_rate": 0.0002933327030450143,
"loss": 5.0003,
"mean_token_accuracy": 0.22943248599767685,
"num_tokens": 10983940.0,
"step": 6040
},
{
"entropy": 5.7583231925964355,
"epoch": 5.193382036957456,
"grad_norm": 1.25,
"learning_rate": 0.00029300130586727545,
"loss": 5.1201,
"mean_token_accuracy": 0.2127103865146637,
"num_tokens": 10994029.0,
"step": 6045
},
{
"entropy": 5.817176723480225,
"epoch": 5.197679415556511,
"grad_norm": 1.15625,
"learning_rate": 0.00029266986938187943,
"loss": 5.147,
"mean_token_accuracy": 0.2095362886786461,
"num_tokens": 11003616.0,
"step": 6050
},
{
"entropy": 5.755208492279053,
"epoch": 5.2019767941555655,
"grad_norm": 1.2109375,
"learning_rate": 0.0002923383943125514,
"loss": 5.0296,
"mean_token_accuracy": 0.22274913787841796,
"num_tokens": 11012068.0,
"step": 6055
},
{
"entropy": 5.748697900772095,
"epoch": 5.20627417275462,
"grad_norm": 1.34375,
"learning_rate": 0.0002920068813831002,
"loss": 5.0779,
"mean_token_accuracy": 0.21767261028289794,
"num_tokens": 11020510.0,
"step": 6060
},
{
"entropy": 5.725340127944946,
"epoch": 5.210571551353675,
"grad_norm": 1.1953125,
"learning_rate": 0.0002916753313174178,
"loss": 5.1218,
"mean_token_accuracy": 0.21479322165250778,
"num_tokens": 11029804.0,
"step": 6065
},
{
"entropy": 5.7446118831634525,
"epoch": 5.214868929952729,
"grad_norm": 1.2734375,
"learning_rate": 0.0002913437448394768,
"loss": 5.1012,
"mean_token_accuracy": 0.22081879377365113,
"num_tokens": 11038586.0,
"step": 6070
},
{
"entropy": 5.8289836883544925,
"epoch": 5.219166308551784,
"grad_norm": 1.1796875,
"learning_rate": 0.00029101212267332955,
"loss": 5.1577,
"mean_token_accuracy": 0.21826230138540267,
"num_tokens": 11048240.0,
"step": 6075
},
{
"entropy": 5.774588584899902,
"epoch": 5.223463687150838,
"grad_norm": 1.1171875,
"learning_rate": 0.00029068046554310637,
"loss": 5.0426,
"mean_token_accuracy": 0.22095113396644592,
"num_tokens": 11056703.0,
"step": 6080
},
{
"entropy": 5.716027879714966,
"epoch": 5.227761065749893,
"grad_norm": 1.140625,
"learning_rate": 0.0002903487741730139,
"loss": 5.0819,
"mean_token_accuracy": 0.21253616362810135,
"num_tokens": 11066246.0,
"step": 6085
},
{
"entropy": 5.745535945892334,
"epoch": 5.232058444348947,
"grad_norm": 1.3125,
"learning_rate": 0.00029001704928733354,
"loss": 5.1308,
"mean_token_accuracy": 0.2211642697453499,
"num_tokens": 11075277.0,
"step": 6090
},
{
"entropy": 5.723521947860718,
"epoch": 5.236355822948002,
"grad_norm": 1.21875,
"learning_rate": 0.0002896852916104198,
"loss": 5.0242,
"mean_token_accuracy": 0.2263767346739769,
"num_tokens": 11083759.0,
"step": 6095
},
{
"entropy": 5.815294790267944,
"epoch": 5.240653201547056,
"grad_norm": 1.2265625,
"learning_rate": 0.0002893535018666988,
"loss": 5.0008,
"mean_token_accuracy": 0.22687483876943587,
"num_tokens": 11091960.0,
"step": 6100
},
{
"entropy": 5.741855907440185,
"epoch": 5.244950580146111,
"grad_norm": 1.453125,
"learning_rate": 0.00028902168078066674,
"loss": 5.1304,
"mean_token_accuracy": 0.21954144090414046,
"num_tokens": 11101134.0,
"step": 6105
},
{
"entropy": 5.715764713287354,
"epoch": 5.2492479587451655,
"grad_norm": 1.2578125,
"learning_rate": 0.0002886898290768883,
"loss": 5.0783,
"mean_token_accuracy": 0.21966248154640197,
"num_tokens": 11110282.0,
"step": 6110
},
{
"entropy": 5.766628980636597,
"epoch": 5.25354533734422,
"grad_norm": 1.2421875,
"learning_rate": 0.000288357947479995,
"loss": 5.1472,
"mean_token_accuracy": 0.2185099706053734,
"num_tokens": 11119591.0,
"step": 6115
},
{
"entropy": 5.794510555267334,
"epoch": 5.257842715943275,
"grad_norm": 1.125,
"learning_rate": 0.00028802603671468347,
"loss": 5.1333,
"mean_token_accuracy": 0.21663097888231278,
"num_tokens": 11129164.0,
"step": 6120
},
{
"entropy": 5.794670677185058,
"epoch": 5.262140094542329,
"grad_norm": 1.203125,
"learning_rate": 0.00028769409750571413,
"loss": 5.1054,
"mean_token_accuracy": 0.21479454636573792,
"num_tokens": 11137973.0,
"step": 6125
},
{
"entropy": 5.762855958938599,
"epoch": 5.266437473141384,
"grad_norm": 1.1640625,
"learning_rate": 0.00028736213057790975,
"loss": 5.1731,
"mean_token_accuracy": 0.21324900835752486,
"num_tokens": 11147285.0,
"step": 6130
},
{
"entropy": 5.80087947845459,
"epoch": 5.270734851740438,
"grad_norm": 1.328125,
"learning_rate": 0.0002870301366561533,
"loss": 5.1457,
"mean_token_accuracy": 0.2189454674720764,
"num_tokens": 11155303.0,
"step": 6135
},
{
"entropy": 5.777155160903931,
"epoch": 5.275032230339493,
"grad_norm": 1.375,
"learning_rate": 0.0002866981164653867,
"loss": 5.0068,
"mean_token_accuracy": 0.2319532886147499,
"num_tokens": 11163553.0,
"step": 6140
},
{
"entropy": 5.673604679107666,
"epoch": 5.279329608938547,
"grad_norm": 1.4921875,
"learning_rate": 0.0002863660707306095,
"loss": 4.9501,
"mean_token_accuracy": 0.23503543585538864,
"num_tokens": 11171865.0,
"step": 6145
},
{
"entropy": 5.708528232574463,
"epoch": 5.283626987537602,
"grad_norm": 1.1328125,
"learning_rate": 0.00028603400017687675,
"loss": 5.1259,
"mean_token_accuracy": 0.2180730476975441,
"num_tokens": 11181137.0,
"step": 6150
},
{
"entropy": 5.644678974151612,
"epoch": 5.287924366136656,
"grad_norm": 1.2265625,
"learning_rate": 0.00028570190552929794,
"loss": 5.0033,
"mean_token_accuracy": 0.23516686558723449,
"num_tokens": 11190174.0,
"step": 6155
},
{
"entropy": 5.731041049957275,
"epoch": 5.292221744735711,
"grad_norm": 1.3203125,
"learning_rate": 0.000285369787513035,
"loss": 5.0203,
"mean_token_accuracy": 0.22731503397226333,
"num_tokens": 11197964.0,
"step": 6160
},
{
"entropy": 5.808375072479248,
"epoch": 5.2965191233347655,
"grad_norm": 1.078125,
"learning_rate": 0.00028503764685330077,
"loss": 5.1475,
"mean_token_accuracy": 0.2158605545759201,
"num_tokens": 11207974.0,
"step": 6165
},
{
"entropy": 5.69039740562439,
"epoch": 5.30081650193382,
"grad_norm": 1.359375,
"learning_rate": 0.00028470548427535794,
"loss": 5.0855,
"mean_token_accuracy": 0.2215003788471222,
"num_tokens": 11216430.0,
"step": 6170
},
{
"entropy": 5.781186437606811,
"epoch": 5.305113880532875,
"grad_norm": 1.203125,
"learning_rate": 0.00028437330050451654,
"loss": 5.1772,
"mean_token_accuracy": 0.21430482119321823,
"num_tokens": 11226189.0,
"step": 6175
},
{
"entropy": 5.7771838188171385,
"epoch": 5.309411259131929,
"grad_norm": 1.3515625,
"learning_rate": 0.0002840410962661334,
"loss": 5.1362,
"mean_token_accuracy": 0.216461843252182,
"num_tokens": 11234691.0,
"step": 6180
},
{
"entropy": 5.6778770923614506,
"epoch": 5.313708637730985,
"grad_norm": 1.328125,
"learning_rate": 0.0002837088722856098,
"loss": 4.9647,
"mean_token_accuracy": 0.23130213618278503,
"num_tokens": 11243852.0,
"step": 6185
},
{
"entropy": 5.759836912155151,
"epoch": 5.318006016330038,
"grad_norm": 1.2109375,
"learning_rate": 0.00028337662928838996,
"loss": 5.1367,
"mean_token_accuracy": 0.22025407403707503,
"num_tokens": 11253416.0,
"step": 6190
},
{
"entropy": 5.744880151748657,
"epoch": 5.322303394929094,
"grad_norm": 1.09375,
"learning_rate": 0.00028304436799995986,
"loss": 5.0381,
"mean_token_accuracy": 0.23224859237670897,
"num_tokens": 11262869.0,
"step": 6195
},
{
"entropy": 5.794106817245483,
"epoch": 5.326600773528148,
"grad_norm": 1.140625,
"learning_rate": 0.00028271208914584534,
"loss": 5.1634,
"mean_token_accuracy": 0.21408282816410065,
"num_tokens": 11272386.0,
"step": 6200
},
{
"entropy": 5.7819318771362305,
"epoch": 5.330898152127203,
"grad_norm": 1.2265625,
"learning_rate": 0.00028237979345161065,
"loss": 5.0198,
"mean_token_accuracy": 0.2246573656797409,
"num_tokens": 11281590.0,
"step": 6205
},
{
"entropy": 5.723759984970092,
"epoch": 5.335195530726257,
"grad_norm": 1.3359375,
"learning_rate": 0.0002820474816428568,
"loss": 5.0703,
"mean_token_accuracy": 0.22172184884548188,
"num_tokens": 11290873.0,
"step": 6210
},
{
"entropy": 5.763450860977173,
"epoch": 5.339492909325312,
"grad_norm": 1.3125,
"learning_rate": 0.0002817151544452198,
"loss": 5.047,
"mean_token_accuracy": 0.22653649896383285,
"num_tokens": 11299064.0,
"step": 6215
},
{
"entropy": 5.763836097717285,
"epoch": 5.343790287924366,
"grad_norm": 1.21875,
"learning_rate": 0.00028138281258436947,
"loss": 5.0815,
"mean_token_accuracy": 0.2125794693827629,
"num_tokens": 11307390.0,
"step": 6220
},
{
"entropy": 5.6746196269989015,
"epoch": 5.348087666523421,
"grad_norm": 1.421875,
"learning_rate": 0.0002810504567860078,
"loss": 5.0321,
"mean_token_accuracy": 0.22260272949934007,
"num_tokens": 11315606.0,
"step": 6225
},
{
"entropy": 5.817198133468628,
"epoch": 5.3523850451224755,
"grad_norm": 1.296875,
"learning_rate": 0.0002807180877758667,
"loss": 5.1543,
"mean_token_accuracy": 0.2130942553281784,
"num_tokens": 11323821.0,
"step": 6230
},
{
"entropy": 5.749542999267578,
"epoch": 5.35668242372153,
"grad_norm": 1.4296875,
"learning_rate": 0.00028038570627970754,
"loss": 5.0964,
"mean_token_accuracy": 0.21933864206075668,
"num_tokens": 11331850.0,
"step": 6235
},
{
"entropy": 5.718008375167846,
"epoch": 5.360979802320585,
"grad_norm": 1.4140625,
"learning_rate": 0.0002800533130233184,
"loss": 5.0655,
"mean_token_accuracy": 0.22661355137825012,
"num_tokens": 11340125.0,
"step": 6240
},
{
"entropy": 5.798234176635742,
"epoch": 5.365277180919639,
"grad_norm": 1.3203125,
"learning_rate": 0.0002797209087325135,
"loss": 5.099,
"mean_token_accuracy": 0.21534867137670516,
"num_tokens": 11349184.0,
"step": 6245
},
{
"entropy": 5.796571254730225,
"epoch": 5.369574559518694,
"grad_norm": 1.25,
"learning_rate": 0.00027938849413313083,
"loss": 5.151,
"mean_token_accuracy": 0.21026744544506074,
"num_tokens": 11357536.0,
"step": 6250
},
{
"entropy": 5.762140798568725,
"epoch": 5.373871938117748,
"grad_norm": 1.1640625,
"learning_rate": 0.000279056069951031,
"loss": 5.1291,
"mean_token_accuracy": 0.2182137981057167,
"num_tokens": 11367242.0,
"step": 6255
},
{
"entropy": 5.744655513763428,
"epoch": 5.378169316716803,
"grad_norm": 1.2890625,
"learning_rate": 0.00027872363691209564,
"loss": 5.0854,
"mean_token_accuracy": 0.22035084962844848,
"num_tokens": 11374932.0,
"step": 6260
},
{
"entropy": 5.718549394607544,
"epoch": 5.382466695315857,
"grad_norm": 1.3046875,
"learning_rate": 0.0002783911957422256,
"loss": 5.0746,
"mean_token_accuracy": 0.21616823524236678,
"num_tokens": 11383575.0,
"step": 6265
},
{
"entropy": 5.8026519298553465,
"epoch": 5.386764073914912,
"grad_norm": 1.3046875,
"learning_rate": 0.0002780587471673394,
"loss": 5.1228,
"mean_token_accuracy": 0.2199627310037613,
"num_tokens": 11392285.0,
"step": 6270
},
{
"entropy": 5.761375951766968,
"epoch": 5.391061452513966,
"grad_norm": 1.2265625,
"learning_rate": 0.00027772629191337206,
"loss": 5.0803,
"mean_token_accuracy": 0.22224314510822296,
"num_tokens": 11401054.0,
"step": 6275
},
{
"entropy": 5.73178014755249,
"epoch": 5.395358831113021,
"grad_norm": 1.34375,
"learning_rate": 0.00027739383070627283,
"loss": 5.1133,
"mean_token_accuracy": 0.21695896238088608,
"num_tokens": 11410529.0,
"step": 6280
},
{
"entropy": 5.738652086257934,
"epoch": 5.3996562097120755,
"grad_norm": 1.2109375,
"learning_rate": 0.0002770613642720041,
"loss": 5.0726,
"mean_token_accuracy": 0.22234009355306625,
"num_tokens": 11419961.0,
"step": 6285
},
{
"entropy": 5.816117954254151,
"epoch": 5.40395358831113,
"grad_norm": 1.109375,
"learning_rate": 0.00027672889333653984,
"loss": 5.2143,
"mean_token_accuracy": 0.19956380277872085,
"num_tokens": 11429529.0,
"step": 6290
},
{
"entropy": 5.772477054595948,
"epoch": 5.408250966910185,
"grad_norm": 1.1640625,
"learning_rate": 0.0002763964186258635,
"loss": 5.0713,
"mean_token_accuracy": 0.2213875100016594,
"num_tokens": 11438254.0,
"step": 6295
},
{
"entropy": 5.781389999389648,
"epoch": 5.412548345509239,
"grad_norm": 1.1875,
"learning_rate": 0.0002760639408659671,
"loss": 5.1279,
"mean_token_accuracy": 0.2134677991271019,
"num_tokens": 11447587.0,
"step": 6300
},
{
"entropy": 5.742274236679077,
"epoch": 5.416845724108294,
"grad_norm": 1.3828125,
"learning_rate": 0.0002757314607828489,
"loss": 5.0879,
"mean_token_accuracy": 0.2216594934463501,
"num_tokens": 11455493.0,
"step": 6305
},
{
"entropy": 5.754730272293091,
"epoch": 5.421143102707348,
"grad_norm": 1.3046875,
"learning_rate": 0.00027539897910251293,
"loss": 5.0387,
"mean_token_accuracy": 0.2288123995065689,
"num_tokens": 11464143.0,
"step": 6310
},
{
"entropy": 5.715780973434448,
"epoch": 5.425440481306403,
"grad_norm": 1.328125,
"learning_rate": 0.00027506649655096595,
"loss": 5.0129,
"mean_token_accuracy": 0.22209218442440032,
"num_tokens": 11471813.0,
"step": 6315
},
{
"entropy": 5.726117277145386,
"epoch": 5.429737859905457,
"grad_norm": 1.3046875,
"learning_rate": 0.0002747340138542171,
"loss": 5.0473,
"mean_token_accuracy": 0.21649690121412277,
"num_tokens": 11481374.0,
"step": 6320
},
{
"entropy": 5.843945550918579,
"epoch": 5.434035238504512,
"grad_norm": 1.1953125,
"learning_rate": 0.0002744015317382757,
"loss": 5.2226,
"mean_token_accuracy": 0.21010226905345916,
"num_tokens": 11490575.0,
"step": 6325
},
{
"entropy": 5.77156343460083,
"epoch": 5.438332617103566,
"grad_norm": 1.2109375,
"learning_rate": 0.0002740690509291498,
"loss": 5.109,
"mean_token_accuracy": 0.22323488891124726,
"num_tokens": 11499898.0,
"step": 6330
},
{
"entropy": 5.756300687789917,
"epoch": 5.442629995702621,
"grad_norm": 1.3671875,
"learning_rate": 0.0002737365721528445,
"loss": 5.109,
"mean_token_accuracy": 0.2162606492638588,
"num_tokens": 11508544.0,
"step": 6335
},
{
"entropy": 5.75588583946228,
"epoch": 5.446927374301676,
"grad_norm": 1.2421875,
"learning_rate": 0.0002734040961353607,
"loss": 5.0941,
"mean_token_accuracy": 0.21851696968078613,
"num_tokens": 11519239.0,
"step": 6340
},
{
"entropy": 5.778530550003052,
"epoch": 5.451224752900731,
"grad_norm": 1.28125,
"learning_rate": 0.000273071623602693,
"loss": 5.0573,
"mean_token_accuracy": 0.22304116934537888,
"num_tokens": 11529014.0,
"step": 6345
},
{
"entropy": 5.792081117630005,
"epoch": 5.4555221314997855,
"grad_norm": 1.234375,
"learning_rate": 0.00027273915528082865,
"loss": 5.056,
"mean_token_accuracy": 0.22800618261098862,
"num_tokens": 11538367.0,
"step": 6350
},
{
"entropy": 5.8074750900268555,
"epoch": 5.45981951009884,
"grad_norm": 1.203125,
"learning_rate": 0.0002724066918957455,
"loss": 5.2142,
"mean_token_accuracy": 0.20521747320890427,
"num_tokens": 11548166.0,
"step": 6355
},
{
"entropy": 5.703108215332032,
"epoch": 5.464116888697895,
"grad_norm": 1.2109375,
"learning_rate": 0.0002720742341734107,
"loss": 5.0789,
"mean_token_accuracy": 0.22244166433811188,
"num_tokens": 11557187.0,
"step": 6360
},
{
"entropy": 5.819654417037964,
"epoch": 5.468414267296949,
"grad_norm": 1.3515625,
"learning_rate": 0.00027174178283977904,
"loss": 5.1156,
"mean_token_accuracy": 0.21346145868301392,
"num_tokens": 11566181.0,
"step": 6365
},
{
"entropy": 5.714676475524902,
"epoch": 5.472711645896004,
"grad_norm": 1.1796875,
"learning_rate": 0.00027140933862079136,
"loss": 5.0735,
"mean_token_accuracy": 0.22364838123321534,
"num_tokens": 11576157.0,
"step": 6370
},
{
"entropy": 5.70315842628479,
"epoch": 5.477009024495058,
"grad_norm": 1.296875,
"learning_rate": 0.000271076902242373,
"loss": 5.0464,
"mean_token_accuracy": 0.22535803020000458,
"num_tokens": 11585325.0,
"step": 6375
},
{
"entropy": 5.768360900878906,
"epoch": 5.481306403094113,
"grad_norm": 1.203125,
"learning_rate": 0.000270744474430432,
"loss": 5.0335,
"mean_token_accuracy": 0.22521175146102906,
"num_tokens": 11594623.0,
"step": 6380
},
{
"entropy": 5.806769609451294,
"epoch": 5.485603781693167,
"grad_norm": 1.28125,
"learning_rate": 0.000270412055910858,
"loss": 5.2131,
"mean_token_accuracy": 0.21013156920671464,
"num_tokens": 11604370.0,
"step": 6385
},
{
"entropy": 5.686313343048096,
"epoch": 5.489901160292222,
"grad_norm": 1.1796875,
"learning_rate": 0.0002700796474095201,
"loss": 5.0334,
"mean_token_accuracy": 0.2330898493528366,
"num_tokens": 11613779.0,
"step": 6390
},
{
"entropy": 5.772741842269897,
"epoch": 5.494198538891276,
"grad_norm": 1.3203125,
"learning_rate": 0.0002697472496522656,
"loss": 5.1181,
"mean_token_accuracy": 0.2183234751224518,
"num_tokens": 11623037.0,
"step": 6395
},
{
"entropy": 5.848201179504395,
"epoch": 5.498495917490331,
"grad_norm": 1.234375,
"learning_rate": 0.0002694148633649184,
"loss": 5.1467,
"mean_token_accuracy": 0.21451639384031296,
"num_tokens": 11631640.0,
"step": 6400
},
{
"entropy": 5.744092893600464,
"epoch": 5.5027932960893855,
"grad_norm": 1.328125,
"learning_rate": 0.0002690824892732772,
"loss": 5.1001,
"mean_token_accuracy": 0.22413897514343262,
"num_tokens": 11640500.0,
"step": 6405
},
{
"entropy": 5.730341243743896,
"epoch": 5.50709067468844,
"grad_norm": 1.3125,
"learning_rate": 0.0002687501281031142,
"loss": 5.1363,
"mean_token_accuracy": 0.21347840279340743,
"num_tokens": 11649173.0,
"step": 6410
},
{
"entropy": 5.61120228767395,
"epoch": 5.511388053287495,
"grad_norm": 1.296875,
"learning_rate": 0.0002684177805801734,
"loss": 4.9907,
"mean_token_accuracy": 0.23398321270942687,
"num_tokens": 11658808.0,
"step": 6415
},
{
"entropy": 5.78149824142456,
"epoch": 5.515685431886549,
"grad_norm": 1.1640625,
"learning_rate": 0.00026808544743016886,
"loss": 5.0821,
"mean_token_accuracy": 0.21574064046144487,
"num_tokens": 11667600.0,
"step": 6420
},
{
"entropy": 5.750644302368164,
"epoch": 5.519982810485604,
"grad_norm": 1.3125,
"learning_rate": 0.0002677531293787835,
"loss": 5.0974,
"mean_token_accuracy": 0.21414555311203004,
"num_tokens": 11675597.0,
"step": 6425
},
{
"entropy": 5.660244941711426,
"epoch": 5.524280189084658,
"grad_norm": 1.234375,
"learning_rate": 0.000267420827151667,
"loss": 5.0231,
"mean_token_accuracy": 0.23164253532886506,
"num_tokens": 11684549.0,
"step": 6430
},
{
"entropy": 5.730451011657715,
"epoch": 5.528577567683713,
"grad_norm": 1.28125,
"learning_rate": 0.0002670885414744347,
"loss": 5.1151,
"mean_token_accuracy": 0.22453600615262986,
"num_tokens": 11693043.0,
"step": 6435
},
{
"entropy": 5.851344728469849,
"epoch": 5.532874946282767,
"grad_norm": 1.2265625,
"learning_rate": 0.0002667562730726655,
"loss": 5.1998,
"mean_token_accuracy": 0.21441607922315598,
"num_tokens": 11702982.0,
"step": 6440
},
{
"entropy": 5.804118871688843,
"epoch": 5.537172324881822,
"grad_norm": 1.1875,
"learning_rate": 0.00026642402267190095,
"loss": 5.2054,
"mean_token_accuracy": 0.20979426354169844,
"num_tokens": 11711994.0,
"step": 6445
},
{
"entropy": 5.78815655708313,
"epoch": 5.541469703480876,
"grad_norm": 1.296875,
"learning_rate": 0.00026609179099764313,
"loss": 5.1463,
"mean_token_accuracy": 0.2133957788348198,
"num_tokens": 11722165.0,
"step": 6450
},
{
"entropy": 5.748242044448853,
"epoch": 5.545767082079931,
"grad_norm": 1.1875,
"learning_rate": 0.00026575957877535323,
"loss": 5.1148,
"mean_token_accuracy": 0.21890448033809662,
"num_tokens": 11731265.0,
"step": 6455
},
{
"entropy": 5.782331275939941,
"epoch": 5.5500644606789855,
"grad_norm": 1.21875,
"learning_rate": 0.00026542738673044985,
"loss": 5.1388,
"mean_token_accuracy": 0.21340786814689636,
"num_tokens": 11741779.0,
"step": 6460
},
{
"entropy": 5.744899702072144,
"epoch": 5.55436183927804,
"grad_norm": 1.3671875,
"learning_rate": 0.0002650952155883077,
"loss": 5.1048,
"mean_token_accuracy": 0.2189340263605118,
"num_tokens": 11749976.0,
"step": 6465
},
{
"entropy": 5.73995532989502,
"epoch": 5.558659217877095,
"grad_norm": 1.2109375,
"learning_rate": 0.0002647630660742559,
"loss": 5.0929,
"mean_token_accuracy": 0.21515202820301055,
"num_tokens": 11759781.0,
"step": 6470
},
{
"entropy": 5.763435029983521,
"epoch": 5.56295659647615,
"grad_norm": 1.1171875,
"learning_rate": 0.000264430938913576,
"loss": 5.0636,
"mean_token_accuracy": 0.2215006723999977,
"num_tokens": 11769544.0,
"step": 6475
},
{
"entropy": 5.762722492218018,
"epoch": 5.567253975075204,
"grad_norm": 1.4375,
"learning_rate": 0.00026409883483150123,
"loss": 5.0644,
"mean_token_accuracy": 0.22043437957763673,
"num_tokens": 11778831.0,
"step": 6480
},
{
"entropy": 5.726540517807007,
"epoch": 5.571551353674259,
"grad_norm": 1.21875,
"learning_rate": 0.000263766754553214,
"loss": 5.1385,
"mean_token_accuracy": 0.20914600044488907,
"num_tokens": 11788813.0,
"step": 6485
},
{
"entropy": 5.783534860610962,
"epoch": 5.575848732273313,
"grad_norm": 1.1796875,
"learning_rate": 0.0002634346988038448,
"loss": 5.0812,
"mean_token_accuracy": 0.22230044454336167,
"num_tokens": 11797335.0,
"step": 6490
},
{
"entropy": 5.774841403961181,
"epoch": 5.580146110872368,
"grad_norm": 1.3125,
"learning_rate": 0.00026310266830847093,
"loss": 5.105,
"mean_token_accuracy": 0.21853111684322357,
"num_tokens": 11806741.0,
"step": 6495
},
{
"entropy": 5.791754579544067,
"epoch": 5.584443489471423,
"grad_norm": 1.28125,
"learning_rate": 0.00026277066379211406,
"loss": 5.1402,
"mean_token_accuracy": 0.2172718971967697,
"num_tokens": 11815551.0,
"step": 6500
},
{
"epoch": 5.584443489471423,
"eval_entropy": 5.548214195010899,
"eval_loss": 5.916170597076416,
"eval_mean_token_accuracy": 0.1791800964321639,
"eval_num_tokens": 11815551.0,
"eval_runtime": 2.2528,
"eval_samples_per_second": 1575.34,
"eval_steps_per_second": 197.084,
"step": 6500
},
{
"entropy": 5.79419641494751,
"epoch": 5.588740868070477,
"grad_norm": 1.3046875,
"learning_rate": 0.0002624386859797396,
"loss": 5.1641,
"mean_token_accuracy": 0.2150591015815735,
"num_tokens": 11824483.0,
"step": 6505
},
{
"entropy": 5.678532218933105,
"epoch": 5.593038246669532,
"grad_norm": 1.28125,
"learning_rate": 0.00026210673559625406,
"loss": 4.9558,
"mean_token_accuracy": 0.23172966986894608,
"num_tokens": 11832383.0,
"step": 6510
},
{
"entropy": 5.777034711837769,
"epoch": 5.597335625268586,
"grad_norm": 1.4765625,
"learning_rate": 0.0002617748133665047,
"loss": 5.1953,
"mean_token_accuracy": 0.21313114762306212,
"num_tokens": 11841430.0,
"step": 6515
},
{
"entropy": 5.757608795166016,
"epoch": 5.601633003867641,
"grad_norm": 1.2421875,
"learning_rate": 0.0002614429200152768,
"loss": 5.1529,
"mean_token_accuracy": 0.21601863503456115,
"num_tokens": 11850863.0,
"step": 6520
},
{
"entropy": 5.719894313812256,
"epoch": 5.6059303824666955,
"grad_norm": 1.359375,
"learning_rate": 0.000261111056267293,
"loss": 5.0249,
"mean_token_accuracy": 0.22059513330459596,
"num_tokens": 11859392.0,
"step": 6525
},
{
"entropy": 5.748900747299194,
"epoch": 5.61022776106575,
"grad_norm": 1.265625,
"learning_rate": 0.00026077922284721084,
"loss": 5.0761,
"mean_token_accuracy": 0.22885973751544952,
"num_tokens": 11868762.0,
"step": 6530
},
{
"entropy": 5.7151679515838625,
"epoch": 5.614525139664805,
"grad_norm": 1.3671875,
"learning_rate": 0.00026044742047962206,
"loss": 5.0306,
"mean_token_accuracy": 0.230779293179512,
"num_tokens": 11876722.0,
"step": 6535
},
{
"entropy": 5.73458080291748,
"epoch": 5.618822518263859,
"grad_norm": 1.2578125,
"learning_rate": 0.00026011564988905023,
"loss": 5.1741,
"mean_token_accuracy": 0.21847135871648787,
"num_tokens": 11885614.0,
"step": 6540
},
{
"entropy": 5.835862874984741,
"epoch": 5.623119896862914,
"grad_norm": 1.2890625,
"learning_rate": 0.0002597839117999499,
"loss": 5.1883,
"mean_token_accuracy": 0.2149903357028961,
"num_tokens": 11894702.0,
"step": 6545
},
{
"entropy": 5.769807910919189,
"epoch": 5.627417275461968,
"grad_norm": 1.3828125,
"learning_rate": 0.0002594522069367044,
"loss": 5.0606,
"mean_token_accuracy": 0.22778366208076478,
"num_tokens": 11902829.0,
"step": 6550
},
{
"entropy": 5.7354350090026855,
"epoch": 5.631714654061023,
"grad_norm": 1.1796875,
"learning_rate": 0.0002591205360236245,
"loss": 5.1061,
"mean_token_accuracy": 0.22033643573522568,
"num_tokens": 11912377.0,
"step": 6555
},
{
"entropy": 5.73629994392395,
"epoch": 5.636012032660077,
"grad_norm": 1.3671875,
"learning_rate": 0.000258788899784947,
"loss": 5.1083,
"mean_token_accuracy": 0.21579407155513763,
"num_tokens": 11920563.0,
"step": 6560
},
{
"entropy": 5.782598829269409,
"epoch": 5.640309411259132,
"grad_norm": 1.21875,
"learning_rate": 0.00025845729894483283,
"loss": 5.1321,
"mean_token_accuracy": 0.21574058383703232,
"num_tokens": 11930190.0,
"step": 6565
},
{
"entropy": 5.814107990264892,
"epoch": 5.644606789858186,
"grad_norm": 1.2109375,
"learning_rate": 0.0002581257342273657,
"loss": 5.1906,
"mean_token_accuracy": 0.21169122010469438,
"num_tokens": 11939840.0,
"step": 6570
},
{
"entropy": 5.767313432693482,
"epoch": 5.648904168457241,
"grad_norm": 1.328125,
"learning_rate": 0.0002577942063565504,
"loss": 5.1207,
"mean_token_accuracy": 0.22112152874469757,
"num_tokens": 11948260.0,
"step": 6575
},
{
"entropy": 5.735608530044556,
"epoch": 5.6532015470562955,
"grad_norm": 1.4140625,
"learning_rate": 0.0002574627160563114,
"loss": 5.1704,
"mean_token_accuracy": 0.2230113223195076,
"num_tokens": 11956776.0,
"step": 6580
},
{
"entropy": 5.847114753723145,
"epoch": 5.65749892565535,
"grad_norm": 1.3203125,
"learning_rate": 0.0002571312640504909,
"loss": 5.1992,
"mean_token_accuracy": 0.21259045898914336,
"num_tokens": 11966375.0,
"step": 6585
},
{
"entropy": 5.86904125213623,
"epoch": 5.661796304254405,
"grad_norm": 1.2265625,
"learning_rate": 0.0002567998510628476,
"loss": 5.2003,
"mean_token_accuracy": 0.2081349566578865,
"num_tokens": 11975835.0,
"step": 6590
},
{
"entropy": 5.757561254501343,
"epoch": 5.666093682853459,
"grad_norm": 1.3671875,
"learning_rate": 0.00025646847781705506,
"loss": 5.0878,
"mean_token_accuracy": 0.21930547803640366,
"num_tokens": 11984672.0,
"step": 6595
},
{
"entropy": 5.788959789276123,
"epoch": 5.670391061452514,
"grad_norm": 1.359375,
"learning_rate": 0.0002561371450367,
"loss": 5.1018,
"mean_token_accuracy": 0.22340647727251053,
"num_tokens": 11993954.0,
"step": 6600
},
{
"entropy": 5.792072677612305,
"epoch": 5.674688440051568,
"grad_norm": 1.3203125,
"learning_rate": 0.00025580585344528076,
"loss": 5.1573,
"mean_token_accuracy": 0.2121841624379158,
"num_tokens": 12002523.0,
"step": 6605
},
{
"entropy": 5.788462352752686,
"epoch": 5.678985818650623,
"grad_norm": 1.328125,
"learning_rate": 0.0002554746037662058,
"loss": 5.1837,
"mean_token_accuracy": 0.2102429136633873,
"num_tokens": 12011638.0,
"step": 6610
},
{
"entropy": 5.825799894332886,
"epoch": 5.683283197249677,
"grad_norm": 1.1328125,
"learning_rate": 0.0002551433967227919,
"loss": 5.1468,
"mean_token_accuracy": 0.2148883506655693,
"num_tokens": 12021319.0,
"step": 6615
},
{
"entropy": 5.772326278686523,
"epoch": 5.687580575848732,
"grad_norm": 1.2578125,
"learning_rate": 0.000254812233038263,
"loss": 5.0897,
"mean_token_accuracy": 0.22315099984407424,
"num_tokens": 12030255.0,
"step": 6620
},
{
"entropy": 5.790443277359008,
"epoch": 5.691877954447786,
"grad_norm": 1.359375,
"learning_rate": 0.00025448111343574813,
"loss": 5.093,
"mean_token_accuracy": 0.22532202005386354,
"num_tokens": 12038884.0,
"step": 6625
},
{
"entropy": 5.74477071762085,
"epoch": 5.696175333046842,
"grad_norm": 1.078125,
"learning_rate": 0.0002541500386382802,
"loss": 5.0745,
"mean_token_accuracy": 0.21967335492372514,
"num_tokens": 12047477.0,
"step": 6630
},
{
"entropy": 5.764187002182007,
"epoch": 5.7004727116458955,
"grad_norm": 1.2265625,
"learning_rate": 0.00025381900936879433,
"loss": 5.1567,
"mean_token_accuracy": 0.22008973658084868,
"num_tokens": 12056902.0,
"step": 6635
},
{
"entropy": 5.75988278388977,
"epoch": 5.704770090244951,
"grad_norm": 1.2265625,
"learning_rate": 0.0002534880263501259,
"loss": 5.1201,
"mean_token_accuracy": 0.2115958884358406,
"num_tokens": 12065721.0,
"step": 6640
},
{
"entropy": 5.747469568252564,
"epoch": 5.7090674688440055,
"grad_norm": 1.171875,
"learning_rate": 0.0002531570903050097,
"loss": 5.0979,
"mean_token_accuracy": 0.22399253994226456,
"num_tokens": 12074870.0,
"step": 6645
},
{
"entropy": 5.807171726226807,
"epoch": 5.71336484744306,
"grad_norm": 1.1640625,
"learning_rate": 0.0002528262019560776,
"loss": 5.1381,
"mean_token_accuracy": 0.21587093770503998,
"num_tokens": 12084557.0,
"step": 6650
},
{
"entropy": 5.729248476028443,
"epoch": 5.717662226042115,
"grad_norm": 1.15625,
"learning_rate": 0.0002524953620258579,
"loss": 5.0104,
"mean_token_accuracy": 0.23036109060049056,
"num_tokens": 12093074.0,
"step": 6655
},
{
"entropy": 5.726521444320679,
"epoch": 5.721959604641169,
"grad_norm": 1.296875,
"learning_rate": 0.0002521645712367724,
"loss": 5.0357,
"mean_token_accuracy": 0.23260863721370698,
"num_tokens": 12102785.0,
"step": 6660
},
{
"entropy": 5.734358453750611,
"epoch": 5.726256983240224,
"grad_norm": 1.34375,
"learning_rate": 0.00025183383031113606,
"loss": 5.0578,
"mean_token_accuracy": 0.2322448804974556,
"num_tokens": 12112535.0,
"step": 6665
},
{
"entropy": 5.652305364608765,
"epoch": 5.730554361839278,
"grad_norm": 1.2109375,
"learning_rate": 0.00025150313997115476,
"loss": 5.0056,
"mean_token_accuracy": 0.2260068476200104,
"num_tokens": 12121604.0,
"step": 6670
},
{
"entropy": 5.757162714004517,
"epoch": 5.734851740438333,
"grad_norm": 1.2109375,
"learning_rate": 0.0002511725009389244,
"loss": 5.1818,
"mean_token_accuracy": 0.21061882376670837,
"num_tokens": 12131276.0,
"step": 6675
},
{
"entropy": 5.742733335494995,
"epoch": 5.739149119037387,
"grad_norm": 1.390625,
"learning_rate": 0.000250841913936428,
"loss": 5.1347,
"mean_token_accuracy": 0.2235199674963951,
"num_tokens": 12140180.0,
"step": 6680
},
{
"entropy": 5.728972244262695,
"epoch": 5.743446497636442,
"grad_norm": 1.28125,
"learning_rate": 0.0002505113796855357,
"loss": 5.1024,
"mean_token_accuracy": 0.2127378210425377,
"num_tokens": 12149635.0,
"step": 6685
},
{
"entropy": 5.762466812133789,
"epoch": 5.747743876235496,
"grad_norm": 1.328125,
"learning_rate": 0.00025018089890800225,
"loss": 5.0582,
"mean_token_accuracy": 0.22984133958816527,
"num_tokens": 12157565.0,
"step": 6690
},
{
"entropy": 5.760795783996582,
"epoch": 5.752041254834551,
"grad_norm": 1.234375,
"learning_rate": 0.00024985047232546544,
"loss": 5.1539,
"mean_token_accuracy": 0.21633774489164354,
"num_tokens": 12166647.0,
"step": 6695
},
{
"entropy": 5.78938307762146,
"epoch": 5.7563386334336055,
"grad_norm": 1.2109375,
"learning_rate": 0.00024952010065944485,
"loss": 5.1526,
"mean_token_accuracy": 0.21298279315233232,
"num_tokens": 12175554.0,
"step": 6700
},
{
"entropy": 5.794042539596558,
"epoch": 5.76063601203266,
"grad_norm": 1.375,
"learning_rate": 0.0002491897846313402,
"loss": 5.123,
"mean_token_accuracy": 0.22041986286640167,
"num_tokens": 12184756.0,
"step": 6705
},
{
"entropy": 5.837274694442749,
"epoch": 5.764933390631715,
"grad_norm": 1.203125,
"learning_rate": 0.0002488595249624297,
"loss": 5.2088,
"mean_token_accuracy": 0.20747051686048507,
"num_tokens": 12194724.0,
"step": 6710
},
{
"entropy": 5.758440542221069,
"epoch": 5.769230769230769,
"grad_norm": 1.171875,
"learning_rate": 0.00024852932237386837,
"loss": 5.1497,
"mean_token_accuracy": 0.22039461135864258,
"num_tokens": 12203804.0,
"step": 6715
},
{
"entropy": 5.761465644836425,
"epoch": 5.773528147829824,
"grad_norm": 1.1875,
"learning_rate": 0.00024819917758668673,
"loss": 5.0999,
"mean_token_accuracy": 0.219867567718029,
"num_tokens": 12212868.0,
"step": 6720
},
{
"entropy": 5.790355968475342,
"epoch": 5.777825526428878,
"grad_norm": 1.2265625,
"learning_rate": 0.00024786909132178906,
"loss": 5.1777,
"mean_token_accuracy": 0.2172165408730507,
"num_tokens": 12221650.0,
"step": 6725
},
{
"entropy": 5.773914241790772,
"epoch": 5.782122905027933,
"grad_norm": 1.34375,
"learning_rate": 0.00024753906429995194,
"loss": 5.0614,
"mean_token_accuracy": 0.22624436914920806,
"num_tokens": 12231541.0,
"step": 6730
},
{
"entropy": 5.77492847442627,
"epoch": 5.786420283626987,
"grad_norm": 1.3125,
"learning_rate": 0.0002472090972418222,
"loss": 5.1611,
"mean_token_accuracy": 0.21325822174549103,
"num_tokens": 12240899.0,
"step": 6735
},
{
"entropy": 5.800062751770019,
"epoch": 5.790717662226042,
"grad_norm": 1.21875,
"learning_rate": 0.0002468791908679163,
"loss": 5.1479,
"mean_token_accuracy": 0.20612489581108093,
"num_tokens": 12250352.0,
"step": 6740
},
{
"entropy": 5.793203258514405,
"epoch": 5.795015040825096,
"grad_norm": 1.265625,
"learning_rate": 0.0002465493458986175,
"loss": 5.1933,
"mean_token_accuracy": 0.20913417190313338,
"num_tokens": 12259975.0,
"step": 6745
},
{
"entropy": 5.695867681503296,
"epoch": 5.799312419424151,
"grad_norm": 1.40625,
"learning_rate": 0.00024621956305417587,
"loss": 5.0425,
"mean_token_accuracy": 0.22444724589586257,
"num_tokens": 12269203.0,
"step": 6750
},
{
"entropy": 5.797395038604736,
"epoch": 5.8036097980232055,
"grad_norm": 1.3203125,
"learning_rate": 0.000245889843054705,
"loss": 5.135,
"mean_token_accuracy": 0.22068165093660355,
"num_tokens": 12279481.0,
"step": 6755
},
{
"entropy": 5.747924900054931,
"epoch": 5.80790717662226,
"grad_norm": 1.2421875,
"learning_rate": 0.00024556018662018163,
"loss": 5.1148,
"mean_token_accuracy": 0.22157147377729416,
"num_tokens": 12288848.0,
"step": 6760
},
{
"entropy": 5.782013320922852,
"epoch": 5.812204555221315,
"grad_norm": 1.2109375,
"learning_rate": 0.00024523059447044377,
"loss": 5.1238,
"mean_token_accuracy": 0.2141062006354332,
"num_tokens": 12297346.0,
"step": 6765
},
{
"entropy": 5.784574270248413,
"epoch": 5.816501933820369,
"grad_norm": 1.25,
"learning_rate": 0.0002449010673251887,
"loss": 5.1208,
"mean_token_accuracy": 0.214461612701416,
"num_tokens": 12306233.0,
"step": 6770
},
{
"entropy": 5.823604106903076,
"epoch": 5.820799312419425,
"grad_norm": 1.2578125,
"learning_rate": 0.0002445716059039723,
"loss": 5.2241,
"mean_token_accuracy": 0.20766980350017547,
"num_tokens": 12315609.0,
"step": 6775
},
{
"entropy": 5.792714786529541,
"epoch": 5.825096691018478,
"grad_norm": 1.4765625,
"learning_rate": 0.00024424221092620644,
"loss": 5.1593,
"mean_token_accuracy": 0.2178465098142624,
"num_tokens": 12323915.0,
"step": 6780
},
{
"entropy": 5.679864931106567,
"epoch": 5.829394069617534,
"grad_norm": 1.234375,
"learning_rate": 0.00024391288311115822,
"loss": 5.092,
"mean_token_accuracy": 0.21546332389116288,
"num_tokens": 12334077.0,
"step": 6785
},
{
"entropy": 5.752875280380249,
"epoch": 5.833691448216588,
"grad_norm": 1.3359375,
"learning_rate": 0.0002435836231779478,
"loss": 5.1173,
"mean_token_accuracy": 0.21589842587709426,
"num_tokens": 12342411.0,
"step": 6790
},
{
"entropy": 5.71485276222229,
"epoch": 5.837988826815643,
"grad_norm": 1.2109375,
"learning_rate": 0.00024325443184554724,
"loss": 5.0154,
"mean_token_accuracy": 0.23167243152856826,
"num_tokens": 12351308.0,
"step": 6795
},
{
"entropy": 5.741046476364136,
"epoch": 5.842286205414697,
"grad_norm": 1.2421875,
"learning_rate": 0.00024292530983277904,
"loss": 5.1886,
"mean_token_accuracy": 0.2137362465262413,
"num_tokens": 12359673.0,
"step": 6800
},
{
"entropy": 5.730896043777466,
"epoch": 5.846583584013752,
"grad_norm": 1.296875,
"learning_rate": 0.00024259625785831408,
"loss": 5.0495,
"mean_token_accuracy": 0.21516438126564025,
"num_tokens": 12367876.0,
"step": 6805
},
{
"entropy": 5.795110273361206,
"epoch": 5.850880962612806,
"grad_norm": 1.3046875,
"learning_rate": 0.00024226727664067023,
"loss": 5.1901,
"mean_token_accuracy": 0.2127310201525688,
"num_tokens": 12377040.0,
"step": 6810
},
{
"entropy": 5.86044659614563,
"epoch": 5.855178341211861,
"grad_norm": 1.234375,
"learning_rate": 0.00024193836689821109,
"loss": 5.2514,
"mean_token_accuracy": 0.2128416433930397,
"num_tokens": 12387622.0,
"step": 6815
},
{
"entropy": 5.697026014328003,
"epoch": 5.8594757198109155,
"grad_norm": 1.3046875,
"learning_rate": 0.0002416095293491439,
"loss": 5.035,
"mean_token_accuracy": 0.2235540360212326,
"num_tokens": 12396447.0,
"step": 6820
},
{
"entropy": 5.762108945846558,
"epoch": 5.86377309840997,
"grad_norm": 1.34375,
"learning_rate": 0.0002412807647115186,
"loss": 5.0562,
"mean_token_accuracy": 0.23084075152873992,
"num_tokens": 12405887.0,
"step": 6825
},
{
"entropy": 5.700514125823974,
"epoch": 5.868070477009025,
"grad_norm": 1.2734375,
"learning_rate": 0.00024095207370322574,
"loss": 5.0786,
"mean_token_accuracy": 0.21588899046182633,
"num_tokens": 12414543.0,
"step": 6830
},
{
"entropy": 5.715729188919068,
"epoch": 5.872367855608079,
"grad_norm": 1.3515625,
"learning_rate": 0.00024062345704199507,
"loss": 5.0879,
"mean_token_accuracy": 0.2205901026725769,
"num_tokens": 12423370.0,
"step": 6835
},
{
"entropy": 5.793241882324219,
"epoch": 5.876665234207134,
"grad_norm": 1.03125,
"learning_rate": 0.00024029491544539405,
"loss": 5.1822,
"mean_token_accuracy": 0.2112067312002182,
"num_tokens": 12433980.0,
"step": 6840
},
{
"entropy": 5.738333225250244,
"epoch": 5.880962612806188,
"grad_norm": 1.3359375,
"learning_rate": 0.00023996644963082616,
"loss": 5.132,
"mean_token_accuracy": 0.22221640795469283,
"num_tokens": 12443300.0,
"step": 6845
},
{
"entropy": 5.76718807220459,
"epoch": 5.885259991405243,
"grad_norm": 1.15625,
"learning_rate": 0.00023963806031552948,
"loss": 5.1758,
"mean_token_accuracy": 0.2108922243118286,
"num_tokens": 12452462.0,
"step": 6850
},
{
"entropy": 5.706634998321533,
"epoch": 5.889557370004297,
"grad_norm": 1.2578125,
"learning_rate": 0.00023930974821657504,
"loss": 5.0996,
"mean_token_accuracy": 0.21803777813911437,
"num_tokens": 12461605.0,
"step": 6855
},
{
"entropy": 5.8166309833526615,
"epoch": 5.893854748603352,
"grad_norm": 1.25,
"learning_rate": 0.00023898151405086533,
"loss": 5.1663,
"mean_token_accuracy": 0.21597474217414855,
"num_tokens": 12470905.0,
"step": 6860
},
{
"entropy": 5.821134376525879,
"epoch": 5.8981521272024064,
"grad_norm": 1.3515625,
"learning_rate": 0.00023865335853513232,
"loss": 5.1416,
"mean_token_accuracy": 0.21774317771196366,
"num_tokens": 12478913.0,
"step": 6865
},
{
"entropy": 5.761513996124267,
"epoch": 5.902449505801461,
"grad_norm": 1.2265625,
"learning_rate": 0.00023832528238593677,
"loss": 5.2181,
"mean_token_accuracy": 0.21117616146802903,
"num_tokens": 12487561.0,
"step": 6870
},
{
"entropy": 5.7301372528076175,
"epoch": 5.9067468844005155,
"grad_norm": 1.28125,
"learning_rate": 0.00023799728631966556,
"loss": 5.1255,
"mean_token_accuracy": 0.2209423691034317,
"num_tokens": 12496781.0,
"step": 6875
},
{
"entropy": 5.796772241592407,
"epoch": 5.91104426299957,
"grad_norm": 1.4375,
"learning_rate": 0.0002376693710525313,
"loss": 5.2086,
"mean_token_accuracy": 0.20858777016401292,
"num_tokens": 12505716.0,
"step": 6880
},
{
"entropy": 5.841960048675537,
"epoch": 5.915341641598625,
"grad_norm": 1.1640625,
"learning_rate": 0.00023734153730056967,
"loss": 5.1519,
"mean_token_accuracy": 0.2122661292552948,
"num_tokens": 12515594.0,
"step": 6885
},
{
"entropy": 5.72416934967041,
"epoch": 5.919639020197679,
"grad_norm": 1.484375,
"learning_rate": 0.00023701378577963873,
"loss": 4.9846,
"mean_token_accuracy": 0.23235433548688889,
"num_tokens": 12523439.0,
"step": 6890
},
{
"entropy": 5.707376480102539,
"epoch": 5.923936398796734,
"grad_norm": 1.328125,
"learning_rate": 0.0002366861172054166,
"loss": 5.0914,
"mean_token_accuracy": 0.22402575612068176,
"num_tokens": 12532100.0,
"step": 6895
},
{
"entropy": 5.742840337753296,
"epoch": 5.928233777395788,
"grad_norm": 1.3359375,
"learning_rate": 0.00023635853229340054,
"loss": 5.113,
"mean_token_accuracy": 0.21385788768529893,
"num_tokens": 12539689.0,
"step": 6900
},
{
"entropy": 5.761408472061158,
"epoch": 5.932531155994843,
"grad_norm": 1.2421875,
"learning_rate": 0.00023603103175890512,
"loss": 5.1386,
"mean_token_accuracy": 0.2112519159913063,
"num_tokens": 12548486.0,
"step": 6905
},
{
"entropy": 5.7453104019165036,
"epoch": 5.936828534593897,
"grad_norm": 1.2578125,
"learning_rate": 0.00023570361631706062,
"loss": 5.0162,
"mean_token_accuracy": 0.23448468893766403,
"num_tokens": 12557423.0,
"step": 6910
},
{
"entropy": 5.724758243560791,
"epoch": 5.941125913192952,
"grad_norm": 1.2890625,
"learning_rate": 0.00023537628668281142,
"loss": 5.1705,
"mean_token_accuracy": 0.2140020415186882,
"num_tokens": 12566086.0,
"step": 6915
},
{
"entropy": 5.7241425037384035,
"epoch": 5.945423291792007,
"grad_norm": 1.2109375,
"learning_rate": 0.00023504904357091468,
"loss": 5.0751,
"mean_token_accuracy": 0.2268398404121399,
"num_tokens": 12575827.0,
"step": 6920
},
{
"entropy": 5.759646368026734,
"epoch": 5.949720670391061,
"grad_norm": 1.375,
"learning_rate": 0.0002347218876959384,
"loss": 5.0637,
"mean_token_accuracy": 0.2303071603178978,
"num_tokens": 12585044.0,
"step": 6925
},
{
"entropy": 5.7616418361663815,
"epoch": 5.954018048990116,
"grad_norm": 1.2265625,
"learning_rate": 0.0002343948197722604,
"loss": 5.1006,
"mean_token_accuracy": 0.22172485142946244,
"num_tokens": 12594677.0,
"step": 6930
},
{
"entropy": 5.730101203918457,
"epoch": 5.958315427589171,
"grad_norm": 1.28125,
"learning_rate": 0.00023406784051406638,
"loss": 5.1346,
"mean_token_accuracy": 0.21585479229688645,
"num_tokens": 12604829.0,
"step": 6935
},
{
"entropy": 5.84703574180603,
"epoch": 5.9626128061882255,
"grad_norm": 1.2421875,
"learning_rate": 0.00023374095063534816,
"loss": 5.1052,
"mean_token_accuracy": 0.22624473124742508,
"num_tokens": 12613869.0,
"step": 6940
},
{
"entropy": 5.750234889984131,
"epoch": 5.96691018478728,
"grad_norm": 1.2578125,
"learning_rate": 0.00023341415084990276,
"loss": 5.1064,
"mean_token_accuracy": 0.2171055868268013,
"num_tokens": 12623248.0,
"step": 6945
},
{
"entropy": 5.663731479644776,
"epoch": 5.971207563386335,
"grad_norm": 1.1953125,
"learning_rate": 0.00023308744187132996,
"loss": 5.0223,
"mean_token_accuracy": 0.2321384847164154,
"num_tokens": 12631973.0,
"step": 6950
},
{
"entropy": 5.7603648662567135,
"epoch": 5.975504941985389,
"grad_norm": 1.2890625,
"learning_rate": 0.00023276082441303197,
"loss": 5.1427,
"mean_token_accuracy": 0.21930764019489288,
"num_tokens": 12641435.0,
"step": 6955
},
{
"entropy": 5.780388498306275,
"epoch": 5.979802320584444,
"grad_norm": 1.265625,
"learning_rate": 0.00023243429918821056,
"loss": 5.1286,
"mean_token_accuracy": 0.22100035548210145,
"num_tokens": 12651077.0,
"step": 6960
},
{
"entropy": 5.846546459197998,
"epoch": 5.984099699183498,
"grad_norm": 1.328125,
"learning_rate": 0.00023210786690986646,
"loss": 5.2114,
"mean_token_accuracy": 0.21253881752490997,
"num_tokens": 12659929.0,
"step": 6965
},
{
"entropy": 5.781086397171021,
"epoch": 5.988397077782553,
"grad_norm": 1.3125,
"learning_rate": 0.00023178152829079712,
"loss": 5.0692,
"mean_token_accuracy": 0.2191861242055893,
"num_tokens": 12670725.0,
"step": 6970
},
{
"entropy": 5.7392051219940186,
"epoch": 5.992694456381607,
"grad_norm": 1.3828125,
"learning_rate": 0.00023145528404359562,
"loss": 5.1194,
"mean_token_accuracy": 0.22653693705797195,
"num_tokens": 12680820.0,
"step": 6975
},
{
"entropy": 5.772517347335816,
"epoch": 5.996991834980662,
"grad_norm": 1.2890625,
"learning_rate": 0.0002311291348806492,
"loss": 5.1847,
"mean_token_accuracy": 0.2081763491034508,
"num_tokens": 12689785.0,
"step": 6980
},
{
"entropy": 5.725305080413818,
"epoch": 6.000859475719811,
"grad_norm": 1.2890625,
"learning_rate": 0.0002308030815141372,
"loss": 5.115,
"mean_token_accuracy": 0.21141118307908377,
"num_tokens": 12697221.0,
"step": 6985
},
{
"entropy": 5.731379842758178,
"epoch": 6.005156854318866,
"grad_norm": 1.234375,
"learning_rate": 0.00023047712465602976,
"loss": 4.93,
"mean_token_accuracy": 0.24172718375921248,
"num_tokens": 12707127.0,
"step": 6990
},
{
"entropy": 5.700897169113159,
"epoch": 6.00945423291792,
"grad_norm": 1.4140625,
"learning_rate": 0.00023015126501808641,
"loss": 4.9318,
"mean_token_accuracy": 0.2316366359591484,
"num_tokens": 12715364.0,
"step": 6995
},
{
"entropy": 5.7420319557189945,
"epoch": 6.013751611516975,
"grad_norm": 1.203125,
"learning_rate": 0.00022982550331185437,
"loss": 4.9289,
"mean_token_accuracy": 0.2380008026957512,
"num_tokens": 12724914.0,
"step": 7000
},
{
"epoch": 6.013751611516975,
"eval_entropy": 5.5273827663413035,
"eval_loss": 5.911673069000244,
"eval_mean_token_accuracy": 0.17968238789487528,
"eval_num_tokens": 12724914.0,
"eval_runtime": 2.0516,
"eval_samples_per_second": 1729.866,
"eval_steps_per_second": 216.416,
"step": 7000
},
{
"entropy": 5.69878044128418,
"epoch": 6.0180489901160295,
"grad_norm": 1.171875,
"learning_rate": 0.00022949984024866704,
"loss": 4.9492,
"mean_token_accuracy": 0.23745594918727875,
"num_tokens": 12735193.0,
"step": 7005
},
{
"entropy": 5.753796625137329,
"epoch": 6.022346368715084,
"grad_norm": 1.34375,
"learning_rate": 0.0002291742765396424,
"loss": 4.9928,
"mean_token_accuracy": 0.23509994596242906,
"num_tokens": 12743945.0,
"step": 7010
},
{
"entropy": 5.772705030441284,
"epoch": 6.0266437473141385,
"grad_norm": 1.1796875,
"learning_rate": 0.00022884881289568133,
"loss": 4.9965,
"mean_token_accuracy": 0.229385170340538,
"num_tokens": 12753130.0,
"step": 7015
},
{
"entropy": 5.695847749710083,
"epoch": 6.030941125913193,
"grad_norm": 1.28125,
"learning_rate": 0.0002285234500274665,
"loss": 4.9808,
"mean_token_accuracy": 0.23147749304771423,
"num_tokens": 12762108.0,
"step": 7020
},
{
"entropy": 5.800906610488892,
"epoch": 6.035238504512248,
"grad_norm": 1.328125,
"learning_rate": 0.00022819818864546016,
"loss": 5.0278,
"mean_token_accuracy": 0.22203525006771088,
"num_tokens": 12772102.0,
"step": 7025
},
{
"entropy": 5.751763248443604,
"epoch": 6.039535883111302,
"grad_norm": 1.296875,
"learning_rate": 0.00022787302945990345,
"loss": 4.9848,
"mean_token_accuracy": 0.23573557138442994,
"num_tokens": 12781225.0,
"step": 7030
},
{
"entropy": 5.77105746269226,
"epoch": 6.043833261710357,
"grad_norm": 1.390625,
"learning_rate": 0.00022754797318081383,
"loss": 4.9454,
"mean_token_accuracy": 0.23269859850406646,
"num_tokens": 12789896.0,
"step": 7035
},
{
"entropy": 5.738353204727173,
"epoch": 6.048130640309411,
"grad_norm": 1.28125,
"learning_rate": 0.00022722302051798442,
"loss": 4.8836,
"mean_token_accuracy": 0.23907660245895385,
"num_tokens": 12798596.0,
"step": 7040
},
{
"entropy": 5.66996431350708,
"epoch": 6.052428018908466,
"grad_norm": 1.296875,
"learning_rate": 0.0002268981721809819,
"loss": 4.9122,
"mean_token_accuracy": 0.23859167844057083,
"num_tokens": 12807285.0,
"step": 7045
},
{
"entropy": 5.720047092437744,
"epoch": 6.05672539750752,
"grad_norm": 1.3046875,
"learning_rate": 0.0002265734288791451,
"loss": 4.9691,
"mean_token_accuracy": 0.2319769710302353,
"num_tokens": 12816668.0,
"step": 7050
},
{
"entropy": 5.782533931732178,
"epoch": 6.061022776106575,
"grad_norm": 1.3515625,
"learning_rate": 0.00022624879132158377,
"loss": 5.0621,
"mean_token_accuracy": 0.22383298426866532,
"num_tokens": 12825943.0,
"step": 7055
},
{
"entropy": 5.740294361114502,
"epoch": 6.0653201547056295,
"grad_norm": 1.171875,
"learning_rate": 0.00022592426021717654,
"loss": 4.8845,
"mean_token_accuracy": 0.23752743601799012,
"num_tokens": 12835693.0,
"step": 7060
},
{
"entropy": 5.734462833404541,
"epoch": 6.069617533304684,
"grad_norm": 1.328125,
"learning_rate": 0.0002255998362745696,
"loss": 4.7947,
"mean_token_accuracy": 0.2523151770234108,
"num_tokens": 12844201.0,
"step": 7065
},
{
"entropy": 5.674288940429688,
"epoch": 6.073914911903739,
"grad_norm": 1.2734375,
"learning_rate": 0.00022527552020217513,
"loss": 4.9312,
"mean_token_accuracy": 0.23512519299983978,
"num_tokens": 12853220.0,
"step": 7070
},
{
"entropy": 5.679785203933716,
"epoch": 6.078212290502793,
"grad_norm": 1.2421875,
"learning_rate": 0.0002249513127081697,
"loss": 5.0051,
"mean_token_accuracy": 0.2304693043231964,
"num_tokens": 12862486.0,
"step": 7075
},
{
"entropy": 5.797192478179932,
"epoch": 6.082509669101848,
"grad_norm": 1.2734375,
"learning_rate": 0.00022462721450049316,
"loss": 5.0032,
"mean_token_accuracy": 0.22529298514127732,
"num_tokens": 12871717.0,
"step": 7080
},
{
"entropy": 5.758112525939941,
"epoch": 6.086807047700902,
"grad_norm": 1.4296875,
"learning_rate": 0.0002243032262868464,
"loss": 4.9584,
"mean_token_accuracy": 0.22690722793340684,
"num_tokens": 12881278.0,
"step": 7085
},
{
"entropy": 5.750693368911743,
"epoch": 6.091104426299957,
"grad_norm": 1.2109375,
"learning_rate": 0.00022397934877469,
"loss": 4.9822,
"mean_token_accuracy": 0.22972595542669297,
"num_tokens": 12890720.0,
"step": 7090
},
{
"entropy": 5.79561676979065,
"epoch": 6.095401804899011,
"grad_norm": 1.2734375,
"learning_rate": 0.0002236555826712432,
"loss": 5.0162,
"mean_token_accuracy": 0.22308289557695388,
"num_tokens": 12900428.0,
"step": 7095
},
{
"entropy": 5.806292009353638,
"epoch": 6.099699183498066,
"grad_norm": 1.3125,
"learning_rate": 0.00022333192868348152,
"loss": 4.9924,
"mean_token_accuracy": 0.22728473246097564,
"num_tokens": 12910177.0,
"step": 7100
},
{
"entropy": 5.733801794052124,
"epoch": 6.10399656209712,
"grad_norm": 1.21875,
"learning_rate": 0.00022300838751813606,
"loss": 5.032,
"mean_token_accuracy": 0.23358280062675477,
"num_tokens": 12920734.0,
"step": 7105
},
{
"entropy": 5.729845237731934,
"epoch": 6.108293940696175,
"grad_norm": 1.3984375,
"learning_rate": 0.00022268495988169145,
"loss": 4.9171,
"mean_token_accuracy": 0.2373756691813469,
"num_tokens": 12929585.0,
"step": 7110
},
{
"entropy": 5.642404937744141,
"epoch": 6.1125913192952295,
"grad_norm": 1.3515625,
"learning_rate": 0.00022236164648038433,
"loss": 4.9177,
"mean_token_accuracy": 0.2359202727675438,
"num_tokens": 12938933.0,
"step": 7115
},
{
"entropy": 5.697567796707153,
"epoch": 6.116888697894285,
"grad_norm": 1.3984375,
"learning_rate": 0.0002220384480202019,
"loss": 4.8809,
"mean_token_accuracy": 0.23985225856304168,
"num_tokens": 12947461.0,
"step": 7120
},
{
"entropy": 5.735206222534179,
"epoch": 6.1211860764933395,
"grad_norm": 1.34375,
"learning_rate": 0.00022171536520688046,
"loss": 4.9414,
"mean_token_accuracy": 0.23593441843986512,
"num_tokens": 12956507.0,
"step": 7125
},
{
"entropy": 5.7640424251556395,
"epoch": 6.125483455092394,
"grad_norm": 1.3046875,
"learning_rate": 0.00022139239874590362,
"loss": 5.0582,
"mean_token_accuracy": 0.22682570517063141,
"num_tokens": 12965740.0,
"step": 7130
},
{
"entropy": 5.745245885848999,
"epoch": 6.1297808336914485,
"grad_norm": 1.2734375,
"learning_rate": 0.0002210695493425013,
"loss": 4.9708,
"mean_token_accuracy": 0.23030567169189453,
"num_tokens": 12975057.0,
"step": 7135
},
{
"entropy": 5.677684688568116,
"epoch": 6.134078212290503,
"grad_norm": 1.53125,
"learning_rate": 0.00022074681770164735,
"loss": 4.9075,
"mean_token_accuracy": 0.23593185544013978,
"num_tokens": 12984087.0,
"step": 7140
},
{
"entropy": 5.6709558963775635,
"epoch": 6.138375590889558,
"grad_norm": 1.2265625,
"learning_rate": 0.00022042420452805868,
"loss": 4.9296,
"mean_token_accuracy": 0.24000215977430345,
"num_tokens": 12992793.0,
"step": 7145
},
{
"entropy": 5.755114269256592,
"epoch": 6.142672969488612,
"grad_norm": 1.3828125,
"learning_rate": 0.00022010171052619365,
"loss": 4.9894,
"mean_token_accuracy": 0.2350299596786499,
"num_tokens": 13000769.0,
"step": 7150
},
{
"entropy": 5.762734317779541,
"epoch": 6.146970348087667,
"grad_norm": 1.28125,
"learning_rate": 0.00021977933640025,
"loss": 4.9752,
"mean_token_accuracy": 0.23126785159111024,
"num_tokens": 13010677.0,
"step": 7155
},
{
"entropy": 5.642547845840454,
"epoch": 6.151267726686721,
"grad_norm": 1.28125,
"learning_rate": 0.00021945708285416434,
"loss": 4.8383,
"mean_token_accuracy": 0.24688103795051575,
"num_tokens": 13019791.0,
"step": 7160
},
{
"entropy": 5.730410861968994,
"epoch": 6.155565105285776,
"grad_norm": 1.328125,
"learning_rate": 0.0002191349505916093,
"loss": 5.0057,
"mean_token_accuracy": 0.23793091177940368,
"num_tokens": 13029223.0,
"step": 7165
},
{
"entropy": 5.737023544311524,
"epoch": 6.15986248388483,
"grad_norm": 1.234375,
"learning_rate": 0.00021881294031599318,
"loss": 4.9491,
"mean_token_accuracy": 0.23026928752660752,
"num_tokens": 13038716.0,
"step": 7170
},
{
"entropy": 5.756968975067139,
"epoch": 6.164159862483885,
"grad_norm": 1.40625,
"learning_rate": 0.0002184910527304576,
"loss": 4.9858,
"mean_token_accuracy": 0.23431467413902282,
"num_tokens": 13047915.0,
"step": 7175
},
{
"entropy": 5.707231950759888,
"epoch": 6.1684572410829395,
"grad_norm": 1.359375,
"learning_rate": 0.00021816928853787636,
"loss": 4.936,
"mean_token_accuracy": 0.24333883821964264,
"num_tokens": 13056613.0,
"step": 7180
},
{
"entropy": 5.7403875350952145,
"epoch": 6.172754619681994,
"grad_norm": 1.28125,
"learning_rate": 0.00021784764844085398,
"loss": 4.9922,
"mean_token_accuracy": 0.23102872520685197,
"num_tokens": 13066658.0,
"step": 7185
},
{
"entropy": 5.814033651351929,
"epoch": 6.177051998281049,
"grad_norm": 1.4453125,
"learning_rate": 0.0002175261331417238,
"loss": 4.9941,
"mean_token_accuracy": 0.2289365902543068,
"num_tokens": 13074798.0,
"step": 7190
},
{
"entropy": 5.694831132888794,
"epoch": 6.181349376880103,
"grad_norm": 1.359375,
"learning_rate": 0.00021720474334254675,
"loss": 4.92,
"mean_token_accuracy": 0.22971168160438538,
"num_tokens": 13084173.0,
"step": 7195
},
{
"entropy": 5.730774879455566,
"epoch": 6.185646755479158,
"grad_norm": 1.25,
"learning_rate": 0.00021688347974510962,
"loss": 4.9482,
"mean_token_accuracy": 0.23248852640390397,
"num_tokens": 13093096.0,
"step": 7200
},
{
"entropy": 5.719029092788697,
"epoch": 6.189944134078212,
"grad_norm": 1.5078125,
"learning_rate": 0.00021656234305092377,
"loss": 4.9397,
"mean_token_accuracy": 0.239972348511219,
"num_tokens": 13101191.0,
"step": 7205
},
{
"entropy": 5.707985305786133,
"epoch": 6.194241512677267,
"grad_norm": 1.4609375,
"learning_rate": 0.0002162413339612234,
"loss": 4.9712,
"mean_token_accuracy": 0.23447236716747283,
"num_tokens": 13109829.0,
"step": 7210
},
{
"entropy": 5.708271741867065,
"epoch": 6.198538891276321,
"grad_norm": 1.4140625,
"learning_rate": 0.00021592045317696406,
"loss": 4.9274,
"mean_token_accuracy": 0.23861967474222184,
"num_tokens": 13119314.0,
"step": 7215
},
{
"entropy": 5.6994280338287355,
"epoch": 6.202836269875376,
"grad_norm": 1.3828125,
"learning_rate": 0.00021559970139882102,
"loss": 4.8994,
"mean_token_accuracy": 0.23726629912853242,
"num_tokens": 13128113.0,
"step": 7220
},
{
"entropy": 5.7727789878845215,
"epoch": 6.20713364847443,
"grad_norm": 1.3515625,
"learning_rate": 0.0002152790793271881,
"loss": 5.0355,
"mean_token_accuracy": 0.22249855697155,
"num_tokens": 13136892.0,
"step": 7225
},
{
"entropy": 5.766521263122558,
"epoch": 6.211431027073485,
"grad_norm": 1.1484375,
"learning_rate": 0.00021495858766217558,
"loss": 5.0147,
"mean_token_accuracy": 0.22924861907958985,
"num_tokens": 13146960.0,
"step": 7230
},
{
"entropy": 5.720303297042847,
"epoch": 6.2157284056725395,
"grad_norm": 1.375,
"learning_rate": 0.00021463822710360932,
"loss": 4.8958,
"mean_token_accuracy": 0.245298570394516,
"num_tokens": 13156147.0,
"step": 7235
},
{
"entropy": 5.748995399475097,
"epoch": 6.220025784271594,
"grad_norm": 1.2734375,
"learning_rate": 0.00021431799835102867,
"loss": 4.9738,
"mean_token_accuracy": 0.22614747285842896,
"num_tokens": 13164588.0,
"step": 7240
},
{
"entropy": 5.660120677947998,
"epoch": 6.224323162870649,
"grad_norm": 1.28125,
"learning_rate": 0.00021399790210368524,
"loss": 4.9139,
"mean_token_accuracy": 0.23604709059000015,
"num_tokens": 13174361.0,
"step": 7245
},
{
"entropy": 5.748390197753906,
"epoch": 6.228620541469703,
"grad_norm": 1.140625,
"learning_rate": 0.00021367793906054133,
"loss": 5.119,
"mean_token_accuracy": 0.22031570225954056,
"num_tokens": 13185266.0,
"step": 7250
},
{
"entropy": 5.738927364349365,
"epoch": 6.232917920068758,
"grad_norm": 1.375,
"learning_rate": 0.00021335810992026823,
"loss": 4.9654,
"mean_token_accuracy": 0.24364089071750641,
"num_tokens": 13194227.0,
"step": 7255
},
{
"entropy": 5.771244287490845,
"epoch": 6.237215298667812,
"grad_norm": 1.4453125,
"learning_rate": 0.00021303841538124497,
"loss": 5.0114,
"mean_token_accuracy": 0.22773328721523284,
"num_tokens": 13202569.0,
"step": 7260
},
{
"entropy": 5.7087109088897705,
"epoch": 6.241512677266867,
"grad_norm": 1.3515625,
"learning_rate": 0.00021271885614155685,
"loss": 4.9538,
"mean_token_accuracy": 0.2368649423122406,
"num_tokens": 13212201.0,
"step": 7265
},
{
"entropy": 5.627471113204956,
"epoch": 6.245810055865922,
"grad_norm": 1.4296875,
"learning_rate": 0.0002123994328989932,
"loss": 4.8806,
"mean_token_accuracy": 0.2345322847366333,
"num_tokens": 13220802.0,
"step": 7270
},
{
"entropy": 5.730443000793457,
"epoch": 6.250107434464977,
"grad_norm": 1.3046875,
"learning_rate": 0.00021208014635104688,
"loss": 5.0275,
"mean_token_accuracy": 0.2250627398490906,
"num_tokens": 13229519.0,
"step": 7275
},
{
"entropy": 5.741388607025146,
"epoch": 6.254404813064031,
"grad_norm": 1.3515625,
"learning_rate": 0.00021176099719491209,
"loss": 4.9881,
"mean_token_accuracy": 0.22891727834939957,
"num_tokens": 13238865.0,
"step": 7280
},
{
"entropy": 5.702636861801148,
"epoch": 6.258702191663086,
"grad_norm": 1.4609375,
"learning_rate": 0.00021144198612748312,
"loss": 4.9049,
"mean_token_accuracy": 0.2440029874444008,
"num_tokens": 13247259.0,
"step": 7285
},
{
"entropy": 5.7749049186706545,
"epoch": 6.26299957026214,
"grad_norm": 1.2578125,
"learning_rate": 0.00021112311384535243,
"loss": 5.0122,
"mean_token_accuracy": 0.23096455335617067,
"num_tokens": 13256692.0,
"step": 7290
},
{
"entropy": 5.753351926803589,
"epoch": 6.267296948861195,
"grad_norm": 1.2109375,
"learning_rate": 0.00021080438104480976,
"loss": 4.947,
"mean_token_accuracy": 0.23121515959501265,
"num_tokens": 13266109.0,
"step": 7295
},
{
"entropy": 5.79936261177063,
"epoch": 6.2715943274602495,
"grad_norm": 1.203125,
"learning_rate": 0.00021048578842184019,
"loss": 5.0452,
"mean_token_accuracy": 0.22553887069225312,
"num_tokens": 13275484.0,
"step": 7300
},
{
"entropy": 5.706963014602661,
"epoch": 6.275891706059304,
"grad_norm": 1.328125,
"learning_rate": 0.00021016733667212245,
"loss": 4.9322,
"mean_token_accuracy": 0.2428619921207428,
"num_tokens": 13284755.0,
"step": 7305
},
{
"entropy": 5.693513488769531,
"epoch": 6.280189084658359,
"grad_norm": 1.1171875,
"learning_rate": 0.00020984902649102806,
"loss": 4.9319,
"mean_token_accuracy": 0.23691536039113997,
"num_tokens": 13294386.0,
"step": 7310
},
{
"entropy": 5.711094999313355,
"epoch": 6.284486463257413,
"grad_norm": 1.265625,
"learning_rate": 0.00020953085857361924,
"loss": 4.9281,
"mean_token_accuracy": 0.23375690579414368,
"num_tokens": 13303926.0,
"step": 7315
},
{
"entropy": 5.696382474899292,
"epoch": 6.288783841856468,
"grad_norm": 1.3984375,
"learning_rate": 0.00020921283361464754,
"loss": 5.0236,
"mean_token_accuracy": 0.22575154900550842,
"num_tokens": 13312727.0,
"step": 7320
},
{
"entropy": 5.716668939590454,
"epoch": 6.293081220455522,
"grad_norm": 1.34375,
"learning_rate": 0.00020889495230855232,
"loss": 4.9784,
"mean_token_accuracy": 0.22872833162546158,
"num_tokens": 13321706.0,
"step": 7325
},
{
"entropy": 5.668966674804688,
"epoch": 6.297378599054577,
"grad_norm": 1.25,
"learning_rate": 0.00020857721534945923,
"loss": 4.9344,
"mean_token_accuracy": 0.23734308630228043,
"num_tokens": 13330436.0,
"step": 7330
},
{
"entropy": 5.757170391082764,
"epoch": 6.301675977653631,
"grad_norm": 1.328125,
"learning_rate": 0.0002082596234311789,
"loss": 5.0185,
"mean_token_accuracy": 0.22471884340047837,
"num_tokens": 13339334.0,
"step": 7335
},
{
"entropy": 5.78465838432312,
"epoch": 6.305973356252686,
"grad_norm": 1.34375,
"learning_rate": 0.0002079421772472051,
"loss": 5.0679,
"mean_token_accuracy": 0.22117299884557723,
"num_tokens": 13348969.0,
"step": 7340
},
{
"entropy": 5.793431758880615,
"epoch": 6.31027073485174,
"grad_norm": 1.1328125,
"learning_rate": 0.0002076248774907134,
"loss": 5.0456,
"mean_token_accuracy": 0.22682560235261917,
"num_tokens": 13358467.0,
"step": 7345
},
{
"entropy": 5.696144914627075,
"epoch": 6.314568113450795,
"grad_norm": 1.5,
"learning_rate": 0.00020730772485455962,
"loss": 4.8624,
"mean_token_accuracy": 0.23993807286024094,
"num_tokens": 13366413.0,
"step": 7350
},
{
"entropy": 5.7636823654174805,
"epoch": 6.3188654920498495,
"grad_norm": 1.1875,
"learning_rate": 0.0002069907200312785,
"loss": 5.0386,
"mean_token_accuracy": 0.22139448076486587,
"num_tokens": 13376620.0,
"step": 7355
},
{
"entropy": 5.723620796203614,
"epoch": 6.323162870648904,
"grad_norm": 1.359375,
"learning_rate": 0.00020667386371308162,
"loss": 5.0121,
"mean_token_accuracy": 0.226824951171875,
"num_tokens": 13385492.0,
"step": 7360
},
{
"entropy": 5.699918603897094,
"epoch": 6.327460249247959,
"grad_norm": 1.2265625,
"learning_rate": 0.00020635715659185673,
"loss": 4.9534,
"mean_token_accuracy": 0.237271548807621,
"num_tokens": 13395562.0,
"step": 7365
},
{
"entropy": 5.646546459197998,
"epoch": 6.331757627847013,
"grad_norm": 1.4453125,
"learning_rate": 0.00020604059935916551,
"loss": 4.8925,
"mean_token_accuracy": 0.24219037890434264,
"num_tokens": 13403357.0,
"step": 7370
},
{
"entropy": 5.723139715194702,
"epoch": 6.336055006446068,
"grad_norm": 1.4140625,
"learning_rate": 0.00020572419270624255,
"loss": 4.9969,
"mean_token_accuracy": 0.23338112533092498,
"num_tokens": 13412527.0,
"step": 7375
},
{
"entropy": 5.752758359909057,
"epoch": 6.340352385045122,
"grad_norm": 1.3046875,
"learning_rate": 0.00020540793732399339,
"loss": 5.0577,
"mean_token_accuracy": 0.22245372980833053,
"num_tokens": 13422455.0,
"step": 7380
},
{
"entropy": 5.807447099685669,
"epoch": 6.344649763644177,
"grad_norm": 1.375,
"learning_rate": 0.00020509183390299325,
"loss": 5.138,
"mean_token_accuracy": 0.21202833354473113,
"num_tokens": 13431677.0,
"step": 7385
},
{
"entropy": 5.653739356994629,
"epoch": 6.348947142243231,
"grad_norm": 1.2890625,
"learning_rate": 0.00020477588313348594,
"loss": 4.8809,
"mean_token_accuracy": 0.24472787827253342,
"num_tokens": 13440522.0,
"step": 7390
},
{
"entropy": 5.774398708343506,
"epoch": 6.353244520842286,
"grad_norm": 1.4921875,
"learning_rate": 0.00020446008570538154,
"loss": 5.0073,
"mean_token_accuracy": 0.22441416233778,
"num_tokens": 13450021.0,
"step": 7395
},
{
"entropy": 5.674509477615357,
"epoch": 6.35754189944134,
"grad_norm": 1.3515625,
"learning_rate": 0.0002041444423082554,
"loss": 4.9173,
"mean_token_accuracy": 0.23533451408147812,
"num_tokens": 13458115.0,
"step": 7400
},
{
"entropy": 5.764778327941895,
"epoch": 6.361839278040395,
"grad_norm": 1.3515625,
"learning_rate": 0.00020382895363134652,
"loss": 4.9923,
"mean_token_accuracy": 0.22358438670635222,
"num_tokens": 13466798.0,
"step": 7405
},
{
"entropy": 5.71299376487732,
"epoch": 6.3661366566394495,
"grad_norm": 1.359375,
"learning_rate": 0.00020351362036355602,
"loss": 5.0101,
"mean_token_accuracy": 0.23004357367753983,
"num_tokens": 13476096.0,
"step": 7410
},
{
"entropy": 5.714753818511963,
"epoch": 6.370434035238505,
"grad_norm": 1.28125,
"learning_rate": 0.0002031984431934459,
"loss": 4.9541,
"mean_token_accuracy": 0.2371777668595314,
"num_tokens": 13484601.0,
"step": 7415
},
{
"entropy": 5.716524934768676,
"epoch": 6.3747314138375595,
"grad_norm": 1.25,
"learning_rate": 0.00020288342280923695,
"loss": 4.9449,
"mean_token_accuracy": 0.23932171016931533,
"num_tokens": 13493994.0,
"step": 7420
},
{
"entropy": 5.69349045753479,
"epoch": 6.379028792436614,
"grad_norm": 1.3203125,
"learning_rate": 0.00020256855989880785,
"loss": 4.9782,
"mean_token_accuracy": 0.2243503674864769,
"num_tokens": 13502890.0,
"step": 7425
},
{
"entropy": 5.67198166847229,
"epoch": 6.383326171035669,
"grad_norm": 1.3515625,
"learning_rate": 0.00020225385514969336,
"loss": 4.9378,
"mean_token_accuracy": 0.2383576363325119,
"num_tokens": 13512980.0,
"step": 7430
},
{
"entropy": 5.71486463546753,
"epoch": 6.387623549634723,
"grad_norm": 1.3125,
"learning_rate": 0.00020193930924908277,
"loss": 5.0231,
"mean_token_accuracy": 0.23398934602737426,
"num_tokens": 13521558.0,
"step": 7435
},
{
"entropy": 5.680387735366821,
"epoch": 6.391920928233778,
"grad_norm": 1.171875,
"learning_rate": 0.00020162492288381867,
"loss": 4.9307,
"mean_token_accuracy": 0.23600068539381028,
"num_tokens": 13531506.0,
"step": 7440
},
{
"entropy": 5.672720003128052,
"epoch": 6.396218306832832,
"grad_norm": 1.1953125,
"learning_rate": 0.0002013106967403953,
"loss": 4.8982,
"mean_token_accuracy": 0.2424224779009819,
"num_tokens": 13540559.0,
"step": 7445
},
{
"entropy": 5.768481636047364,
"epoch": 6.400515685431887,
"grad_norm": 1.4140625,
"learning_rate": 0.0002009966315049569,
"loss": 5.011,
"mean_token_accuracy": 0.23141635209321976,
"num_tokens": 13550654.0,
"step": 7450
},
{
"entropy": 5.763478708267212,
"epoch": 6.404813064030941,
"grad_norm": 1.3125,
"learning_rate": 0.0002006827278632964,
"loss": 5.027,
"mean_token_accuracy": 0.23014189153909684,
"num_tokens": 13560708.0,
"step": 7455
},
{
"entropy": 5.722728872299195,
"epoch": 6.409110442629996,
"grad_norm": 1.28125,
"learning_rate": 0.00020036898650085377,
"loss": 4.9173,
"mean_token_accuracy": 0.22755362838506699,
"num_tokens": 13569330.0,
"step": 7460
},
{
"entropy": 5.764143085479736,
"epoch": 6.41340782122905,
"grad_norm": 1.40625,
"learning_rate": 0.00020005540810271493,
"loss": 5.0893,
"mean_token_accuracy": 0.21666009724140167,
"num_tokens": 13577500.0,
"step": 7465
},
{
"entropy": 5.709292697906494,
"epoch": 6.417705199828105,
"grad_norm": 1.3203125,
"learning_rate": 0.00019974199335360976,
"loss": 4.9859,
"mean_token_accuracy": 0.23307044357061385,
"num_tokens": 13586087.0,
"step": 7470
},
{
"entropy": 5.699592590332031,
"epoch": 6.4220025784271595,
"grad_norm": 1.3671875,
"learning_rate": 0.00019942874293791068,
"loss": 4.9396,
"mean_token_accuracy": 0.23114058822393418,
"num_tokens": 13595346.0,
"step": 7475
},
{
"entropy": 5.764913511276245,
"epoch": 6.426299957026214,
"grad_norm": 1.3515625,
"learning_rate": 0.00019911565753963145,
"loss": 5.0835,
"mean_token_accuracy": 0.2268539473414421,
"num_tokens": 13604755.0,
"step": 7480
},
{
"entropy": 5.752863359451294,
"epoch": 6.430597335625269,
"grad_norm": 1.1953125,
"learning_rate": 0.0001988027378424254,
"loss": 4.9844,
"mean_token_accuracy": 0.22718686014413833,
"num_tokens": 13613860.0,
"step": 7485
},
{
"entropy": 5.664366340637207,
"epoch": 6.434894714224323,
"grad_norm": 1.421875,
"learning_rate": 0.00019848998452958429,
"loss": 4.8699,
"mean_token_accuracy": 0.24055294096469879,
"num_tokens": 13622574.0,
"step": 7490
},
{
"entropy": 5.728462505340576,
"epoch": 6.439192092823378,
"grad_norm": 1.265625,
"learning_rate": 0.00019817739828403602,
"loss": 5.0143,
"mean_token_accuracy": 0.22399941384792327,
"num_tokens": 13632366.0,
"step": 7495
},
{
"entropy": 5.775394105911255,
"epoch": 6.443489471422432,
"grad_norm": 1.2265625,
"learning_rate": 0.00019786497978834422,
"loss": 4.9188,
"mean_token_accuracy": 0.24379066675901412,
"num_tokens": 13640682.0,
"step": 7500
},
{
"epoch": 6.443489471422432,
"eval_entropy": 5.516042317356075,
"eval_loss": 5.9212799072265625,
"eval_mean_token_accuracy": 0.18048049712570402,
"eval_num_tokens": 13640682.0,
"eval_runtime": 2.2313,
"eval_samples_per_second": 1590.525,
"eval_steps_per_second": 198.984,
"step": 7500
},
{
"entropy": 5.710464859008789,
"epoch": 6.447786850021487,
"grad_norm": 1.3671875,
"learning_rate": 0.00019755272972470602,
"loss": 4.9985,
"mean_token_accuracy": 0.2372261881828308,
"num_tokens": 13649675.0,
"step": 7505
},
{
"entropy": 5.728733777999878,
"epoch": 6.452084228620541,
"grad_norm": 1.390625,
"learning_rate": 0.00019724064877495057,
"loss": 4.9681,
"mean_token_accuracy": 0.22544475942850112,
"num_tokens": 13658260.0,
"step": 7510
},
{
"entropy": 5.689570474624634,
"epoch": 6.456381607219596,
"grad_norm": 1.4296875,
"learning_rate": 0.00019692873762053808,
"loss": 4.9032,
"mean_token_accuracy": 0.240114925801754,
"num_tokens": 13666571.0,
"step": 7515
},
{
"entropy": 5.74085488319397,
"epoch": 6.46067898581865,
"grad_norm": 1.1640625,
"learning_rate": 0.00019661699694255785,
"loss": 4.9944,
"mean_token_accuracy": 0.23399491906166076,
"num_tokens": 13675707.0,
"step": 7520
},
{
"entropy": 5.765975475311279,
"epoch": 6.464976364417705,
"grad_norm": 1.46875,
"learning_rate": 0.00019630542742172692,
"loss": 4.9617,
"mean_token_accuracy": 0.23181569278240205,
"num_tokens": 13684796.0,
"step": 7525
},
{
"entropy": 5.7313316822052,
"epoch": 6.4692737430167595,
"grad_norm": 1.328125,
"learning_rate": 0.00019599402973838854,
"loss": 5.0548,
"mean_token_accuracy": 0.2183179423213005,
"num_tokens": 13693158.0,
"step": 7530
},
{
"entropy": 5.759279012680054,
"epoch": 6.473571121615814,
"grad_norm": 1.40625,
"learning_rate": 0.0001956828045725107,
"loss": 5.0232,
"mean_token_accuracy": 0.2257732018828392,
"num_tokens": 13703521.0,
"step": 7535
},
{
"entropy": 5.726857852935791,
"epoch": 6.477868500214869,
"grad_norm": 1.359375,
"learning_rate": 0.0001953717526036849,
"loss": 4.938,
"mean_token_accuracy": 0.23097043633460998,
"num_tokens": 13712337.0,
"step": 7540
},
{
"entropy": 5.701276779174805,
"epoch": 6.482165878813923,
"grad_norm": 1.4296875,
"learning_rate": 0.00019506087451112437,
"loss": 4.9481,
"mean_token_accuracy": 0.22534014284610748,
"num_tokens": 13721605.0,
"step": 7545
},
{
"entropy": 5.643766689300537,
"epoch": 6.486463257412978,
"grad_norm": 1.4609375,
"learning_rate": 0.00019475017097366244,
"loss": 4.8556,
"mean_token_accuracy": 0.23758668601512908,
"num_tokens": 13730827.0,
"step": 7550
},
{
"entropy": 5.768571901321411,
"epoch": 6.490760636012032,
"grad_norm": 1.3984375,
"learning_rate": 0.00019443964266975156,
"loss": 5.0255,
"mean_token_accuracy": 0.22498469054698944,
"num_tokens": 13740128.0,
"step": 7555
},
{
"entropy": 5.71935887336731,
"epoch": 6.495058014611088,
"grad_norm": 1.3515625,
"learning_rate": 0.0001941292902774614,
"loss": 5.0079,
"mean_token_accuracy": 0.22391847968101503,
"num_tokens": 13748428.0,
"step": 7560
},
{
"entropy": 5.719990539550781,
"epoch": 6.499355393210142,
"grad_norm": 1.25,
"learning_rate": 0.00019381911447447742,
"loss": 4.9749,
"mean_token_accuracy": 0.23590452522039412,
"num_tokens": 13757109.0,
"step": 7565
},
{
"entropy": 5.771088457107544,
"epoch": 6.503652771809197,
"grad_norm": 1.4609375,
"learning_rate": 0.00019350911593809977,
"loss": 5.0273,
"mean_token_accuracy": 0.22215828597545623,
"num_tokens": 13766281.0,
"step": 7570
},
{
"entropy": 5.7560049533844,
"epoch": 6.507950150408251,
"grad_norm": 1.265625,
"learning_rate": 0.00019319929534524128,
"loss": 5.0243,
"mean_token_accuracy": 0.22932689040899276,
"num_tokens": 13775535.0,
"step": 7575
},
{
"entropy": 5.734013271331787,
"epoch": 6.512247529007306,
"grad_norm": 1.3125,
"learning_rate": 0.00019288965337242636,
"loss": 4.9859,
"mean_token_accuracy": 0.22449343800544738,
"num_tokens": 13784099.0,
"step": 7580
},
{
"entropy": 5.832199907302856,
"epoch": 6.51654490760636,
"grad_norm": 1.3125,
"learning_rate": 0.00019258019069578924,
"loss": 5.1169,
"mean_token_accuracy": 0.22021577656269073,
"num_tokens": 13793098.0,
"step": 7585
},
{
"entropy": 5.665675640106201,
"epoch": 6.520842286205415,
"grad_norm": 1.3515625,
"learning_rate": 0.00019227090799107266,
"loss": 4.9582,
"mean_token_accuracy": 0.23555098623037338,
"num_tokens": 13801847.0,
"step": 7590
},
{
"entropy": 5.6672852516174315,
"epoch": 6.5251396648044695,
"grad_norm": 1.25,
"learning_rate": 0.0001919618059336265,
"loss": 4.8502,
"mean_token_accuracy": 0.24223940819501877,
"num_tokens": 13810599.0,
"step": 7595
},
{
"entropy": 5.765718603134156,
"epoch": 6.529437043403524,
"grad_norm": 1.4453125,
"learning_rate": 0.00019165288519840617,
"loss": 4.9983,
"mean_token_accuracy": 0.22907198518514632,
"num_tokens": 13819602.0,
"step": 7600
},
{
"entropy": 5.687669324874878,
"epoch": 6.533734422002579,
"grad_norm": 1.453125,
"learning_rate": 0.000191344146459971,
"loss": 4.9475,
"mean_token_accuracy": 0.2387497156858444,
"num_tokens": 13828254.0,
"step": 7605
},
{
"entropy": 5.701369762420654,
"epoch": 6.538031800601633,
"grad_norm": 1.296875,
"learning_rate": 0.00019103559039248302,
"loss": 4.9916,
"mean_token_accuracy": 0.22820447534322738,
"num_tokens": 13837163.0,
"step": 7610
},
{
"entropy": 5.764211797714234,
"epoch": 6.542329179200688,
"grad_norm": 1.375,
"learning_rate": 0.0001907272176697052,
"loss": 5.0007,
"mean_token_accuracy": 0.22737849950790406,
"num_tokens": 13846373.0,
"step": 7615
},
{
"entropy": 5.673723602294922,
"epoch": 6.546626557799742,
"grad_norm": 1.3046875,
"learning_rate": 0.00019041902896500059,
"loss": 4.9524,
"mean_token_accuracy": 0.23322181403636932,
"num_tokens": 13855846.0,
"step": 7620
},
{
"entropy": 5.736261415481567,
"epoch": 6.550923936398797,
"grad_norm": 1.359375,
"learning_rate": 0.00019011102495132993,
"loss": 4.9605,
"mean_token_accuracy": 0.23272975385189057,
"num_tokens": 13864723.0,
"step": 7625
},
{
"entropy": 5.798337507247925,
"epoch": 6.555221314997851,
"grad_norm": 1.140625,
"learning_rate": 0.00018980320630125104,
"loss": 5.1101,
"mean_token_accuracy": 0.2181757315993309,
"num_tokens": 13873418.0,
"step": 7630
},
{
"entropy": 5.75980315208435,
"epoch": 6.559518693596906,
"grad_norm": 1.609375,
"learning_rate": 0.00018949557368691666,
"loss": 5.0061,
"mean_token_accuracy": 0.23227301239967346,
"num_tokens": 13881890.0,
"step": 7635
},
{
"entropy": 5.7168203830719,
"epoch": 6.56381607219596,
"grad_norm": 1.21875,
"learning_rate": 0.00018918812778007343,
"loss": 5.0371,
"mean_token_accuracy": 0.22749389559030533,
"num_tokens": 13891289.0,
"step": 7640
},
{
"entropy": 5.723117971420288,
"epoch": 6.568113450795015,
"grad_norm": 1.3203125,
"learning_rate": 0.00018888086925206054,
"loss": 4.9546,
"mean_token_accuracy": 0.23108558654785155,
"num_tokens": 13900344.0,
"step": 7645
},
{
"entropy": 5.778712177276612,
"epoch": 6.5724108293940695,
"grad_norm": 1.4296875,
"learning_rate": 0.00018857379877380763,
"loss": 4.9952,
"mean_token_accuracy": 0.23351393938064574,
"num_tokens": 13909108.0,
"step": 7650
},
{
"entropy": 5.726499557495117,
"epoch": 6.576708207993124,
"grad_norm": 1.2578125,
"learning_rate": 0.00018826691701583404,
"loss": 5.0088,
"mean_token_accuracy": 0.23075273931026458,
"num_tokens": 13918458.0,
"step": 7655
},
{
"entropy": 5.750834035873413,
"epoch": 6.581005586592179,
"grad_norm": 1.3828125,
"learning_rate": 0.00018796022464824663,
"loss": 4.9703,
"mean_token_accuracy": 0.23334225118160248,
"num_tokens": 13927186.0,
"step": 7660
},
{
"entropy": 5.721710252761841,
"epoch": 6.585302965191233,
"grad_norm": 1.3671875,
"learning_rate": 0.00018765372234073912,
"loss": 5.0632,
"mean_token_accuracy": 0.21736631840467452,
"num_tokens": 13936701.0,
"step": 7665
},
{
"entropy": 5.700629091262817,
"epoch": 6.589600343790288,
"grad_norm": 1.453125,
"learning_rate": 0.00018734741076259005,
"loss": 5.075,
"mean_token_accuracy": 0.21827920377254487,
"num_tokens": 13945842.0,
"step": 7670
},
{
"entropy": 5.6856285572052006,
"epoch": 6.593897722389342,
"grad_norm": 1.2109375,
"learning_rate": 0.00018704129058266152,
"loss": 4.9466,
"mean_token_accuracy": 0.2341363787651062,
"num_tokens": 13955675.0,
"step": 7675
},
{
"entropy": 5.80717830657959,
"epoch": 6.598195100988397,
"grad_norm": 1.3125,
"learning_rate": 0.00018673536246939743,
"loss": 5.0459,
"mean_token_accuracy": 0.23114715218544007,
"num_tokens": 13964153.0,
"step": 7680
},
{
"entropy": 5.752105951309204,
"epoch": 6.602492479587451,
"grad_norm": 1.4375,
"learning_rate": 0.00018642962709082274,
"loss": 5.0116,
"mean_token_accuracy": 0.22900762856006623,
"num_tokens": 13972141.0,
"step": 7685
},
{
"entropy": 5.650936651229858,
"epoch": 6.606789858186506,
"grad_norm": 1.4140625,
"learning_rate": 0.00018612408511454103,
"loss": 4.8898,
"mean_token_accuracy": 0.2377362921833992,
"num_tokens": 13981369.0,
"step": 7690
},
{
"entropy": 5.690488052368164,
"epoch": 6.61108723678556,
"grad_norm": 1.375,
"learning_rate": 0.00018581873720773423,
"loss": 4.9523,
"mean_token_accuracy": 0.2291845917701721,
"num_tokens": 13990626.0,
"step": 7695
},
{
"entropy": 5.7331787109375,
"epoch": 6.615384615384615,
"grad_norm": 1.34375,
"learning_rate": 0.00018551358403715989,
"loss": 5.0515,
"mean_token_accuracy": 0.22102296650409697,
"num_tokens": 13999399.0,
"step": 7700
},
{
"entropy": 5.750496292114258,
"epoch": 6.61968199398367,
"grad_norm": 1.4140625,
"learning_rate": 0.00018520862626915052,
"loss": 4.9555,
"mean_token_accuracy": 0.23201001733541488,
"num_tokens": 14007487.0,
"step": 7705
},
{
"entropy": 5.703403472900391,
"epoch": 6.623979372582724,
"grad_norm": 1.3046875,
"learning_rate": 0.00018490386456961223,
"loss": 4.927,
"mean_token_accuracy": 0.23265215754508972,
"num_tokens": 14016779.0,
"step": 7710
},
{
"entropy": 5.726163053512574,
"epoch": 6.6282767511817795,
"grad_norm": 1.3359375,
"learning_rate": 0.0001845992996040224,
"loss": 5.0042,
"mean_token_accuracy": 0.23183127194643022,
"num_tokens": 14025770.0,
"step": 7715
},
{
"entropy": 5.670454740524292,
"epoch": 6.632574129780834,
"grad_norm": 1.3203125,
"learning_rate": 0.00018429493203742946,
"loss": 4.9045,
"mean_token_accuracy": 0.2422835037112236,
"num_tokens": 14035304.0,
"step": 7720
},
{
"entropy": 5.719083833694458,
"epoch": 6.636871508379889,
"grad_norm": 1.5,
"learning_rate": 0.00018399076253445052,
"loss": 4.8473,
"mean_token_accuracy": 0.24168919026851654,
"num_tokens": 14044058.0,
"step": 7725
},
{
"entropy": 5.739474201202393,
"epoch": 6.641168886978943,
"grad_norm": 1.3359375,
"learning_rate": 0.00018368679175927012,
"loss": 4.9136,
"mean_token_accuracy": 0.24082895070314408,
"num_tokens": 14052709.0,
"step": 7730
},
{
"entropy": 5.655467987060547,
"epoch": 6.645466265577998,
"grad_norm": 1.3515625,
"learning_rate": 0.00018338302037563885,
"loss": 4.8753,
"mean_token_accuracy": 0.24130599498748778,
"num_tokens": 14061346.0,
"step": 7735
},
{
"entropy": 5.667017984390259,
"epoch": 6.649763644177052,
"grad_norm": 1.4453125,
"learning_rate": 0.00018307944904687211,
"loss": 4.957,
"mean_token_accuracy": 0.23699511587619781,
"num_tokens": 14070409.0,
"step": 7740
},
{
"entropy": 5.748349380493164,
"epoch": 6.654061022776107,
"grad_norm": 1.5546875,
"learning_rate": 0.0001827760784358483,
"loss": 5.0411,
"mean_token_accuracy": 0.2208509013056755,
"num_tokens": 14079448.0,
"step": 7745
},
{
"entropy": 5.750911140441895,
"epoch": 6.658358401375161,
"grad_norm": 1.3203125,
"learning_rate": 0.00018247290920500776,
"loss": 5.0164,
"mean_token_accuracy": 0.22797961235046388,
"num_tokens": 14088452.0,
"step": 7750
},
{
"entropy": 5.734561491012573,
"epoch": 6.662655779974216,
"grad_norm": 1.3984375,
"learning_rate": 0.00018216994201635062,
"loss": 5.0004,
"mean_token_accuracy": 0.23072549253702163,
"num_tokens": 14097956.0,
"step": 7755
},
{
"entropy": 5.748372364044189,
"epoch": 6.66695315857327,
"grad_norm": 1.4140625,
"learning_rate": 0.00018186717753143633,
"loss": 4.9427,
"mean_token_accuracy": 0.23190637975931166,
"num_tokens": 14106771.0,
"step": 7760
},
{
"entropy": 5.754193592071533,
"epoch": 6.671250537172325,
"grad_norm": 1.25,
"learning_rate": 0.00018156461641138133,
"loss": 5.0069,
"mean_token_accuracy": 0.2301826596260071,
"num_tokens": 14115871.0,
"step": 7765
},
{
"entropy": 5.708150577545166,
"epoch": 6.6755479157713795,
"grad_norm": 1.2578125,
"learning_rate": 0.00018126225931685836,
"loss": 4.9429,
"mean_token_accuracy": 0.23147647231817245,
"num_tokens": 14125316.0,
"step": 7770
},
{
"entropy": 5.6619995594024655,
"epoch": 6.679845294370434,
"grad_norm": 1.3671875,
"learning_rate": 0.00018096010690809444,
"loss": 4.9023,
"mean_token_accuracy": 0.23880022764205933,
"num_tokens": 14134244.0,
"step": 7775
},
{
"entropy": 5.749985933303833,
"epoch": 6.684142672969489,
"grad_norm": 1.296875,
"learning_rate": 0.00018065815984486962,
"loss": 5.032,
"mean_token_accuracy": 0.22433867752552034,
"num_tokens": 14143600.0,
"step": 7780
},
{
"entropy": 5.712496757507324,
"epoch": 6.688440051568543,
"grad_norm": 1.3828125,
"learning_rate": 0.00018035641878651548,
"loss": 4.9372,
"mean_token_accuracy": 0.23336533308029175,
"num_tokens": 14152112.0,
"step": 7785
},
{
"entropy": 5.681194543838501,
"epoch": 6.692737430167598,
"grad_norm": 1.484375,
"learning_rate": 0.00018005488439191408,
"loss": 4.9184,
"mean_token_accuracy": 0.235929536819458,
"num_tokens": 14161044.0,
"step": 7790
},
{
"entropy": 5.715410423278809,
"epoch": 6.697034808766652,
"grad_norm": 1.453125,
"learning_rate": 0.0001797535573194959,
"loss": 5.0322,
"mean_token_accuracy": 0.2334413096308708,
"num_tokens": 14169629.0,
"step": 7795
},
{
"entropy": 5.7020186424255375,
"epoch": 6.701332187365707,
"grad_norm": 1.234375,
"learning_rate": 0.0001794524382272389,
"loss": 4.9562,
"mean_token_accuracy": 0.2321821540594101,
"num_tokens": 14179234.0,
"step": 7800
},
{
"entropy": 5.716729545593262,
"epoch": 6.705629565964761,
"grad_norm": 1.1875,
"learning_rate": 0.0001791515277726667,
"loss": 4.9933,
"mean_token_accuracy": 0.2396368682384491,
"num_tokens": 14188887.0,
"step": 7805
},
{
"entropy": 5.681606817245483,
"epoch": 6.709926944563816,
"grad_norm": 1.3046875,
"learning_rate": 0.00017885082661284763,
"loss": 4.9074,
"mean_token_accuracy": 0.23539066463708877,
"num_tokens": 14197731.0,
"step": 7810
},
{
"entropy": 5.743403530120849,
"epoch": 6.71422432316287,
"grad_norm": 1.203125,
"learning_rate": 0.00017855033540439274,
"loss": 5.0258,
"mean_token_accuracy": 0.22792317420244218,
"num_tokens": 14206851.0,
"step": 7815
},
{
"entropy": 5.718718433380127,
"epoch": 6.718521701761925,
"grad_norm": 1.515625,
"learning_rate": 0.00017825005480345463,
"loss": 5.0302,
"mean_token_accuracy": 0.2288100838661194,
"num_tokens": 14215879.0,
"step": 7820
},
{
"entropy": 5.6874980449676515,
"epoch": 6.7228190803609795,
"grad_norm": 1.4296875,
"learning_rate": 0.00017794998546572627,
"loss": 4.8801,
"mean_token_accuracy": 0.23987502455711365,
"num_tokens": 14223798.0,
"step": 7825
},
{
"entropy": 5.725723648071289,
"epoch": 6.727116458960034,
"grad_norm": 1.1796875,
"learning_rate": 0.0001776501280464391,
"loss": 4.9859,
"mean_token_accuracy": 0.23374852985143663,
"num_tokens": 14233234.0,
"step": 7830
},
{
"entropy": 5.729088068008423,
"epoch": 6.731413837559089,
"grad_norm": 1.5,
"learning_rate": 0.00017735048320036197,
"loss": 4.9332,
"mean_token_accuracy": 0.22950732260942458,
"num_tokens": 14241851.0,
"step": 7835
},
{
"entropy": 5.720467853546142,
"epoch": 6.735711216158143,
"grad_norm": 1.4765625,
"learning_rate": 0.00017705105158179917,
"loss": 5.0433,
"mean_token_accuracy": 0.21816251277923585,
"num_tokens": 14251578.0,
"step": 7840
},
{
"entropy": 5.7909361839294435,
"epoch": 6.740008594757198,
"grad_norm": 1.2265625,
"learning_rate": 0.00017675183384458987,
"loss": 5.0511,
"mean_token_accuracy": 0.22307134717702864,
"num_tokens": 14261122.0,
"step": 7845
},
{
"entropy": 5.771282529830932,
"epoch": 6.744305973356253,
"grad_norm": 1.453125,
"learning_rate": 0.00017645283064210616,
"loss": 4.9444,
"mean_token_accuracy": 0.23607346415519714,
"num_tokens": 14270594.0,
"step": 7850
},
{
"entropy": 5.738985013961792,
"epoch": 6.748603351955307,
"grad_norm": 1.359375,
"learning_rate": 0.00017615404262725132,
"loss": 4.9987,
"mean_token_accuracy": 0.2309481084346771,
"num_tokens": 14279646.0,
"step": 7855
},
{
"entropy": 5.714361047744751,
"epoch": 6.752900730554362,
"grad_norm": 1.4296875,
"learning_rate": 0.00017585547045245885,
"loss": 4.9405,
"mean_token_accuracy": 0.23584286719560624,
"num_tokens": 14288555.0,
"step": 7860
},
{
"entropy": 5.699155330657959,
"epoch": 6.757198109153417,
"grad_norm": 1.4140625,
"learning_rate": 0.00017555711476969138,
"loss": 4.9656,
"mean_token_accuracy": 0.22831491380929947,
"num_tokens": 14297813.0,
"step": 7865
},
{
"entropy": 5.742558240890503,
"epoch": 6.761495487752471,
"grad_norm": 1.4375,
"learning_rate": 0.00017525897623043806,
"loss": 4.9478,
"mean_token_accuracy": 0.23676440864801407,
"num_tokens": 14305779.0,
"step": 7870
},
{
"entropy": 5.777757740020752,
"epoch": 6.765792866351526,
"grad_norm": 1.484375,
"learning_rate": 0.00017496105548571472,
"loss": 5.0401,
"mean_token_accuracy": 0.21774942576885223,
"num_tokens": 14314419.0,
"step": 7875
},
{
"entropy": 5.742297220230102,
"epoch": 6.77009024495058,
"grad_norm": 1.4296875,
"learning_rate": 0.00017466335318606086,
"loss": 4.989,
"mean_token_accuracy": 0.22961059510707854,
"num_tokens": 14322959.0,
"step": 7880
},
{
"entropy": 5.7465451717376705,
"epoch": 6.774387623549635,
"grad_norm": 1.2109375,
"learning_rate": 0.00017436586998153947,
"loss": 4.948,
"mean_token_accuracy": 0.2412852019071579,
"num_tokens": 14332270.0,
"step": 7885
},
{
"entropy": 5.658016204833984,
"epoch": 6.7786850021486895,
"grad_norm": 1.1953125,
"learning_rate": 0.00017406860652173495,
"loss": 4.9692,
"mean_token_accuracy": 0.2288740873336792,
"num_tokens": 14341701.0,
"step": 7890
},
{
"entropy": 5.730817270278931,
"epoch": 6.782982380747744,
"grad_norm": 1.46875,
"learning_rate": 0.00017377156345575176,
"loss": 4.9837,
"mean_token_accuracy": 0.23206369131803511,
"num_tokens": 14349551.0,
"step": 7895
},
{
"entropy": 5.648237323760986,
"epoch": 6.787279759346799,
"grad_norm": 1.3515625,
"learning_rate": 0.00017347474143221338,
"loss": 4.8989,
"mean_token_accuracy": 0.23939766883850097,
"num_tokens": 14358577.0,
"step": 7900
},
{
"entropy": 5.807467699050903,
"epoch": 6.791577137945853,
"grad_norm": 1.3203125,
"learning_rate": 0.00017317814109926044,
"loss": 5.0141,
"mean_token_accuracy": 0.22136349976062775,
"num_tokens": 14367862.0,
"step": 7905
},
{
"entropy": 5.643885517120362,
"epoch": 6.795874516544908,
"grad_norm": 1.21875,
"learning_rate": 0.0001728817631045495,
"loss": 4.9685,
"mean_token_accuracy": 0.23076968789100646,
"num_tokens": 14377414.0,
"step": 7910
},
{
"entropy": 5.614119005203247,
"epoch": 6.800171895143962,
"grad_norm": 1.2421875,
"learning_rate": 0.0001725856080952516,
"loss": 4.8926,
"mean_token_accuracy": 0.2443048432469368,
"num_tokens": 14387239.0,
"step": 7915
},
{
"entropy": 5.6486059665679935,
"epoch": 6.804469273743017,
"grad_norm": 1.375,
"learning_rate": 0.0001722896767180509,
"loss": 4.9112,
"mean_token_accuracy": 0.2445044696331024,
"num_tokens": 14396076.0,
"step": 7920
},
{
"entropy": 5.712938213348389,
"epoch": 6.808766652342071,
"grad_norm": 1.234375,
"learning_rate": 0.00017199396961914334,
"loss": 4.9828,
"mean_token_accuracy": 0.2291121393442154,
"num_tokens": 14404982.0,
"step": 7925
},
{
"entropy": 5.704281949996949,
"epoch": 6.813064030941126,
"grad_norm": 1.421875,
"learning_rate": 0.00017169848744423506,
"loss": 4.9396,
"mean_token_accuracy": 0.22628463208675384,
"num_tokens": 14413364.0,
"step": 7930
},
{
"entropy": 5.776637268066406,
"epoch": 6.81736140954018,
"grad_norm": 1.1875,
"learning_rate": 0.00017140323083854076,
"loss": 5.0554,
"mean_token_accuracy": 0.22274332046508788,
"num_tokens": 14424279.0,
"step": 7935
},
{
"entropy": 5.7132195949554445,
"epoch": 6.821658788139235,
"grad_norm": 1.390625,
"learning_rate": 0.00017110820044678317,
"loss": 5.0418,
"mean_token_accuracy": 0.21940283626317977,
"num_tokens": 14432931.0,
"step": 7940
},
{
"entropy": 5.733008098602295,
"epoch": 6.8259561667382895,
"grad_norm": 1.2890625,
"learning_rate": 0.00017081339691319054,
"loss": 4.9758,
"mean_token_accuracy": 0.22764192670583724,
"num_tokens": 14442652.0,
"step": 7945
},
{
"entropy": 5.75035982131958,
"epoch": 6.830253545337344,
"grad_norm": 1.2890625,
"learning_rate": 0.00017051882088149612,
"loss": 4.9486,
"mean_token_accuracy": 0.22507085800170898,
"num_tokens": 14452061.0,
"step": 7950
},
{
"entropy": 5.73496150970459,
"epoch": 6.834550923936399,
"grad_norm": 1.296875,
"learning_rate": 0.00017022447299493599,
"loss": 4.9373,
"mean_token_accuracy": 0.2278796076774597,
"num_tokens": 14460771.0,
"step": 7955
},
{
"entropy": 5.793766689300537,
"epoch": 6.838848302535453,
"grad_norm": 1.2890625,
"learning_rate": 0.00016993035389624854,
"loss": 5.0537,
"mean_token_accuracy": 0.22595914900302888,
"num_tokens": 14469983.0,
"step": 7960
},
{
"entropy": 5.70825343132019,
"epoch": 6.843145681134508,
"grad_norm": 1.390625,
"learning_rate": 0.0001696364642276722,
"loss": 4.9494,
"mean_token_accuracy": 0.2361249253153801,
"num_tokens": 14478641.0,
"step": 7965
},
{
"entropy": 5.731251430511475,
"epoch": 6.847443059733562,
"grad_norm": 1.28125,
"learning_rate": 0.00016934280463094448,
"loss": 5.0319,
"mean_token_accuracy": 0.2262295663356781,
"num_tokens": 14487922.0,
"step": 7970
},
{
"entropy": 5.655464458465576,
"epoch": 6.851740438332617,
"grad_norm": 1.5546875,
"learning_rate": 0.00016904937574730062,
"loss": 4.8786,
"mean_token_accuracy": 0.24183200299739838,
"num_tokens": 14496259.0,
"step": 7975
},
{
"entropy": 5.706764793395996,
"epoch": 6.856037816931671,
"grad_norm": 1.546875,
"learning_rate": 0.00016875617821747208,
"loss": 4.9795,
"mean_token_accuracy": 0.22885009348392488,
"num_tokens": 14504366.0,
"step": 7980
},
{
"entropy": 5.688942623138428,
"epoch": 6.860335195530726,
"grad_norm": 1.1796875,
"learning_rate": 0.00016846321268168508,
"loss": 4.9992,
"mean_token_accuracy": 0.2282481923699379,
"num_tokens": 14513561.0,
"step": 7985
},
{
"entropy": 5.7252562046051025,
"epoch": 6.8646325741297805,
"grad_norm": 1.3046875,
"learning_rate": 0.00016817047977965905,
"loss": 4.9373,
"mean_token_accuracy": 0.23489120304584504,
"num_tokens": 14522531.0,
"step": 7990
},
{
"entropy": 5.773946905136109,
"epoch": 6.868929952728836,
"grad_norm": 1.3984375,
"learning_rate": 0.0001678779801506058,
"loss": 4.9565,
"mean_token_accuracy": 0.2310111179947853,
"num_tokens": 14531375.0,
"step": 7995
},
{
"entropy": 5.709077548980713,
"epoch": 6.8732273313278895,
"grad_norm": 1.328125,
"learning_rate": 0.00016758571443322774,
"loss": 4.9955,
"mean_token_accuracy": 0.23008209466934204,
"num_tokens": 14541081.0,
"step": 8000
},
{
"epoch": 6.8732273313278895,
"eval_entropy": 5.525449365108937,
"eval_loss": 5.902733325958252,
"eval_mean_token_accuracy": 0.18115301297658737,
"eval_num_tokens": 14541081.0,
"eval_runtime": 2.0411,
"eval_samples_per_second": 1738.786,
"eval_steps_per_second": 217.532,
"step": 8000
},
{
"entropy": 5.63901720046997,
"epoch": 6.877524709926945,
"grad_norm": 1.28125,
"learning_rate": 0.0001672936832657162,
"loss": 4.9221,
"mean_token_accuracy": 0.24134268015623092,
"num_tokens": 14550129.0,
"step": 8005
},
{
"entropy": 5.745688438415527,
"epoch": 6.8818220885259995,
"grad_norm": 1.5078125,
"learning_rate": 0.00016700188728575047,
"loss": 5.0905,
"mean_token_accuracy": 0.21997221261262895,
"num_tokens": 14559051.0,
"step": 8010
},
{
"entropy": 5.691987371444702,
"epoch": 6.886119467125054,
"grad_norm": 1.3125,
"learning_rate": 0.00016671032713049655,
"loss": 5.0005,
"mean_token_accuracy": 0.23101048469543456,
"num_tokens": 14567719.0,
"step": 8015
},
{
"entropy": 5.759095239639282,
"epoch": 6.890416845724109,
"grad_norm": 1.390625,
"learning_rate": 0.00016641900343660515,
"loss": 5.0156,
"mean_token_accuracy": 0.22946806252002716,
"num_tokens": 14576256.0,
"step": 8020
},
{
"entropy": 5.766721200942993,
"epoch": 6.894714224323163,
"grad_norm": 1.1640625,
"learning_rate": 0.0001661279168402107,
"loss": 5.0238,
"mean_token_accuracy": 0.22409347891807557,
"num_tokens": 14586392.0,
"step": 8025
},
{
"entropy": 5.720468521118164,
"epoch": 6.899011602922218,
"grad_norm": 1.40625,
"learning_rate": 0.00016583706797693008,
"loss": 4.994,
"mean_token_accuracy": 0.23179059326648713,
"num_tokens": 14595448.0,
"step": 8030
},
{
"entropy": 5.732708072662353,
"epoch": 6.903308981521272,
"grad_norm": 1.296875,
"learning_rate": 0.00016554645748186105,
"loss": 5.029,
"mean_token_accuracy": 0.23494130671024321,
"num_tokens": 14604242.0,
"step": 8035
},
{
"entropy": 5.803254747390747,
"epoch": 6.907606360120327,
"grad_norm": 1.3359375,
"learning_rate": 0.00016525608598958063,
"loss": 5.1037,
"mean_token_accuracy": 0.220962455868721,
"num_tokens": 14614983.0,
"step": 8040
},
{
"entropy": 5.757966709136963,
"epoch": 6.911903738719381,
"grad_norm": 1.375,
"learning_rate": 0.00016496595413414421,
"loss": 4.971,
"mean_token_accuracy": 0.2303234815597534,
"num_tokens": 14624748.0,
"step": 8045
},
{
"entropy": 5.678277587890625,
"epoch": 6.916201117318436,
"grad_norm": 1.1875,
"learning_rate": 0.00016467606254908355,
"loss": 4.9261,
"mean_token_accuracy": 0.23968843072652818,
"num_tokens": 14633642.0,
"step": 8050
},
{
"entropy": 5.722575759887695,
"epoch": 6.9204984959174904,
"grad_norm": 1.2890625,
"learning_rate": 0.00016438641186740632,
"loss": 5.0191,
"mean_token_accuracy": 0.2276478499174118,
"num_tokens": 14642549.0,
"step": 8055
},
{
"entropy": 5.710235452651977,
"epoch": 6.924795874516545,
"grad_norm": 1.359375,
"learning_rate": 0.00016409700272159371,
"loss": 5.0261,
"mean_token_accuracy": 0.22573624700307846,
"num_tokens": 14651642.0,
"step": 8060
},
{
"entropy": 5.7362377643585205,
"epoch": 6.9290932531155995,
"grad_norm": 1.21875,
"learning_rate": 0.00016380783574359957,
"loss": 4.9909,
"mean_token_accuracy": 0.22976325154304506,
"num_tokens": 14661052.0,
"step": 8065
},
{
"entropy": 5.71889214515686,
"epoch": 6.933390631714654,
"grad_norm": 1.46875,
"learning_rate": 0.0001635189115648491,
"loss": 4.9958,
"mean_token_accuracy": 0.22729426622390747,
"num_tokens": 14670292.0,
"step": 8070
},
{
"entropy": 5.69280972480774,
"epoch": 6.937688010313709,
"grad_norm": 1.3828125,
"learning_rate": 0.00016323023081623705,
"loss": 4.9191,
"mean_token_accuracy": 0.23388173431158066,
"num_tokens": 14679735.0,
"step": 8075
},
{
"entropy": 5.754827642440796,
"epoch": 6.941985388912763,
"grad_norm": 1.296875,
"learning_rate": 0.00016294179412812702,
"loss": 5.0344,
"mean_token_accuracy": 0.21625297963619233,
"num_tokens": 14688710.0,
"step": 8080
},
{
"entropy": 5.741319990158081,
"epoch": 6.946282767511818,
"grad_norm": 1.328125,
"learning_rate": 0.00016265360213034923,
"loss": 5.0987,
"mean_token_accuracy": 0.21535037606954574,
"num_tokens": 14698523.0,
"step": 8085
},
{
"entropy": 5.681290912628174,
"epoch": 6.950580146110872,
"grad_norm": 1.1875,
"learning_rate": 0.00016236565545220007,
"loss": 4.9824,
"mean_token_accuracy": 0.22788620889186859,
"num_tokens": 14707674.0,
"step": 8090
},
{
"entropy": 5.701905870437622,
"epoch": 6.954877524709927,
"grad_norm": 1.21875,
"learning_rate": 0.00016207795472243975,
"loss": 4.9084,
"mean_token_accuracy": 0.237464140355587,
"num_tokens": 14716600.0,
"step": 8095
},
{
"entropy": 5.77295413017273,
"epoch": 6.959174903308981,
"grad_norm": 1.515625,
"learning_rate": 0.00016179050056929173,
"loss": 5.0906,
"mean_token_accuracy": 0.21767136603593826,
"num_tokens": 14726112.0,
"step": 8100
},
{
"entropy": 5.728647947311401,
"epoch": 6.963472281908036,
"grad_norm": 1.3671875,
"learning_rate": 0.00016150329362044102,
"loss": 4.9675,
"mean_token_accuracy": 0.23759464919567108,
"num_tokens": 14735126.0,
"step": 8105
},
{
"entropy": 5.7574504852294925,
"epoch": 6.9677696605070905,
"grad_norm": 1.21875,
"learning_rate": 0.00016121633450303285,
"loss": 4.9816,
"mean_token_accuracy": 0.23083561658859253,
"num_tokens": 14744346.0,
"step": 8110
},
{
"entropy": 5.817537593841553,
"epoch": 6.972067039106145,
"grad_norm": 1.2578125,
"learning_rate": 0.00016092962384367122,
"loss": 5.0652,
"mean_token_accuracy": 0.22513457387685776,
"num_tokens": 14753322.0,
"step": 8115
},
{
"entropy": 5.663236141204834,
"epoch": 6.9763644177051995,
"grad_norm": 1.3671875,
"learning_rate": 0.0001606431622684176,
"loss": 4.98,
"mean_token_accuracy": 0.2384248659014702,
"num_tokens": 14762384.0,
"step": 8120
},
{
"entropy": 5.715035390853882,
"epoch": 6.980661796304254,
"grad_norm": 1.3125,
"learning_rate": 0.00016035695040278935,
"loss": 5.0252,
"mean_token_accuracy": 0.22985492646694183,
"num_tokens": 14771451.0,
"step": 8125
},
{
"entropy": 5.794897556304932,
"epoch": 6.984959174903309,
"grad_norm": 1.265625,
"learning_rate": 0.00016007098887175914,
"loss": 5.04,
"mean_token_accuracy": 0.22508551180362701,
"num_tokens": 14780662.0,
"step": 8130
},
{
"entropy": 5.753342247009277,
"epoch": 6.989256553502363,
"grad_norm": 1.4453125,
"learning_rate": 0.00015978527829975254,
"loss": 4.9586,
"mean_token_accuracy": 0.2316376730799675,
"num_tokens": 14789201.0,
"step": 8135
},
{
"entropy": 5.703181743621826,
"epoch": 6.993553932101419,
"grad_norm": 1.40625,
"learning_rate": 0.00015949981931064714,
"loss": 4.9866,
"mean_token_accuracy": 0.23185751140117644,
"num_tokens": 14797857.0,
"step": 8140
},
{
"entropy": 5.75775842666626,
"epoch": 6.997851310700472,
"grad_norm": 1.484375,
"learning_rate": 0.0001592146125277714,
"loss": 4.9861,
"mean_token_accuracy": 0.22932713627815246,
"num_tokens": 14806271.0,
"step": 8145
},
{
"entropy": 5.770098368326823,
"epoch": 7.001718951439622,
"grad_norm": 1.1875,
"learning_rate": 0.00015892965857390278,
"loss": 5.0595,
"mean_token_accuracy": 0.21925362944602966,
"num_tokens": 14815568.0,
"step": 8150
},
{
"entropy": 5.790607690811157,
"epoch": 7.006016330038676,
"grad_norm": 1.3125,
"learning_rate": 0.00015864495807126704,
"loss": 4.9636,
"mean_token_accuracy": 0.23844788372516632,
"num_tokens": 14825140.0,
"step": 8155
},
{
"entropy": 5.7898753643035885,
"epoch": 7.010313708637731,
"grad_norm": 1.2421875,
"learning_rate": 0.00015836051164153602,
"loss": 4.9574,
"mean_token_accuracy": 0.23473027497529983,
"num_tokens": 14834459.0,
"step": 8160
},
{
"entropy": 5.77668514251709,
"epoch": 7.014611087236785,
"grad_norm": 1.4140625,
"learning_rate": 0.00015807631990582733,
"loss": 4.8857,
"mean_token_accuracy": 0.2403940051794052,
"num_tokens": 14843632.0,
"step": 8165
},
{
"entropy": 5.790946006774902,
"epoch": 7.01890846583584,
"grad_norm": 1.28125,
"learning_rate": 0.00015779238348470192,
"loss": 4.879,
"mean_token_accuracy": 0.24623702019453048,
"num_tokens": 14852626.0,
"step": 8170
},
{
"entropy": 5.639549970626831,
"epoch": 7.023205844434894,
"grad_norm": 1.453125,
"learning_rate": 0.00015750870299816345,
"loss": 4.8233,
"mean_token_accuracy": 0.23911771923303604,
"num_tokens": 14861571.0,
"step": 8175
},
{
"entropy": 5.668585920333863,
"epoch": 7.027503223033949,
"grad_norm": 1.3671875,
"learning_rate": 0.00015722527906565672,
"loss": 4.8115,
"mean_token_accuracy": 0.24984675794839858,
"num_tokens": 14870383.0,
"step": 8180
},
{
"entropy": 5.711289310455323,
"epoch": 7.0318006016330035,
"grad_norm": 1.4765625,
"learning_rate": 0.00015694211230606647,
"loss": 4.9099,
"mean_token_accuracy": 0.23236954361200332,
"num_tokens": 14880212.0,
"step": 8185
},
{
"entropy": 5.720697021484375,
"epoch": 7.036097980232058,
"grad_norm": 1.328125,
"learning_rate": 0.00015665920333771564,
"loss": 4.8637,
"mean_token_accuracy": 0.24579361379146575,
"num_tokens": 14889347.0,
"step": 8190
},
{
"entropy": 5.725212049484253,
"epoch": 7.0403953588311134,
"grad_norm": 1.28125,
"learning_rate": 0.00015637655277836427,
"loss": 4.9014,
"mean_token_accuracy": 0.23043718487024306,
"num_tokens": 14898553.0,
"step": 8195
},
{
"entropy": 5.726214981079101,
"epoch": 7.044692737430168,
"grad_norm": 1.3359375,
"learning_rate": 0.0001560941612452081,
"loss": 4.8983,
"mean_token_accuracy": 0.23977451920509338,
"num_tokens": 14907275.0,
"step": 8200
},
{
"entropy": 5.698152351379394,
"epoch": 7.0489901160292225,
"grad_norm": 1.40625,
"learning_rate": 0.0001558120293548777,
"loss": 4.8956,
"mean_token_accuracy": 0.24299730211496354,
"num_tokens": 14916409.0,
"step": 8205
},
{
"entropy": 5.807874727249145,
"epoch": 7.053287494628277,
"grad_norm": 1.390625,
"learning_rate": 0.00015553015772343614,
"loss": 4.9654,
"mean_token_accuracy": 0.2276977479457855,
"num_tokens": 14927144.0,
"step": 8210
},
{
"entropy": 5.668857002258301,
"epoch": 7.057584873227332,
"grad_norm": 1.4921875,
"learning_rate": 0.00015524854696637847,
"loss": 4.8286,
"mean_token_accuracy": 0.25360685139894484,
"num_tokens": 14936310.0,
"step": 8215
},
{
"entropy": 5.673094940185547,
"epoch": 7.061882251826386,
"grad_norm": 1.234375,
"learning_rate": 0.00015496719769862981,
"loss": 4.8297,
"mean_token_accuracy": 0.2441292092204094,
"num_tokens": 14945571.0,
"step": 8220
},
{
"entropy": 5.648290920257568,
"epoch": 7.066179630425441,
"grad_norm": 1.4375,
"learning_rate": 0.00015468611053454478,
"loss": 4.8496,
"mean_token_accuracy": 0.24561198949813842,
"num_tokens": 14954586.0,
"step": 8225
},
{
"entropy": 5.725588417053222,
"epoch": 7.070477009024495,
"grad_norm": 1.5625,
"learning_rate": 0.00015440528608790533,
"loss": 4.9286,
"mean_token_accuracy": 0.23840467929840087,
"num_tokens": 14963048.0,
"step": 8230
},
{
"entropy": 5.739574861526489,
"epoch": 7.07477438762355,
"grad_norm": 1.375,
"learning_rate": 0.0001541247249719197,
"loss": 4.9141,
"mean_token_accuracy": 0.2364799052476883,
"num_tokens": 14972158.0,
"step": 8235
},
{
"entropy": 5.750800275802613,
"epoch": 7.079071766222604,
"grad_norm": 1.328125,
"learning_rate": 0.00015384442779922135,
"loss": 4.8631,
"mean_token_accuracy": 0.2408420354127884,
"num_tokens": 14980869.0,
"step": 8240
},
{
"entropy": 5.7228302478790285,
"epoch": 7.083369144821659,
"grad_norm": 1.546875,
"learning_rate": 0.00015356439518186726,
"loss": 4.8409,
"mean_token_accuracy": 0.24564204663038253,
"num_tokens": 14989103.0,
"step": 8245
},
{
"entropy": 5.659767150878906,
"epoch": 7.0876665234207135,
"grad_norm": 1.171875,
"learning_rate": 0.00015328462773133672,
"loss": 4.8656,
"mean_token_accuracy": 0.24564456343650817,
"num_tokens": 14999378.0,
"step": 8250
},
{
"entropy": 5.721705627441406,
"epoch": 7.091963902019768,
"grad_norm": 1.390625,
"learning_rate": 0.00015300512605852977,
"loss": 4.9331,
"mean_token_accuracy": 0.23642863035202027,
"num_tokens": 15007971.0,
"step": 8255
},
{
"entropy": 5.755592966079712,
"epoch": 7.0962612806188226,
"grad_norm": 1.2734375,
"learning_rate": 0.0001527258907737668,
"loss": 4.936,
"mean_token_accuracy": 0.23605114668607713,
"num_tokens": 15017537.0,
"step": 8260
},
{
"entropy": 5.686761331558228,
"epoch": 7.100558659217877,
"grad_norm": 1.515625,
"learning_rate": 0.00015244692248678586,
"loss": 4.8187,
"mean_token_accuracy": 0.2529310867190361,
"num_tokens": 15025684.0,
"step": 8265
},
{
"entropy": 5.70835599899292,
"epoch": 7.104856037816932,
"grad_norm": 1.484375,
"learning_rate": 0.0001521682218067421,
"loss": 4.869,
"mean_token_accuracy": 0.24218338280916213,
"num_tokens": 15034753.0,
"step": 8270
},
{
"entropy": 5.728517532348633,
"epoch": 7.109153416415986,
"grad_norm": 1.3359375,
"learning_rate": 0.00015188978934220642,
"loss": 4.8816,
"mean_token_accuracy": 0.24051901549100876,
"num_tokens": 15044685.0,
"step": 8275
},
{
"entropy": 5.771997499465942,
"epoch": 7.113450795015041,
"grad_norm": 1.2109375,
"learning_rate": 0.0001516116257011641,
"loss": 4.9531,
"mean_token_accuracy": 0.24105844050645828,
"num_tokens": 15054853.0,
"step": 8280
},
{
"entropy": 5.715450191497803,
"epoch": 7.117748173614095,
"grad_norm": 1.2109375,
"learning_rate": 0.0001513337314910134,
"loss": 4.8978,
"mean_token_accuracy": 0.23127783685922623,
"num_tokens": 15065244.0,
"step": 8285
},
{
"entropy": 5.662863492965698,
"epoch": 7.12204555221315,
"grad_norm": 1.40625,
"learning_rate": 0.00015105610731856416,
"loss": 4.8215,
"mean_token_accuracy": 0.2513589784502983,
"num_tokens": 15074046.0,
"step": 8290
},
{
"entropy": 5.695620584487915,
"epoch": 7.126342930812204,
"grad_norm": 1.40625,
"learning_rate": 0.00015077875379003653,
"loss": 4.8739,
"mean_token_accuracy": 0.23767761290073394,
"num_tokens": 15083518.0,
"step": 8295
},
{
"entropy": 5.7220344066619875,
"epoch": 7.130640309411259,
"grad_norm": 1.2890625,
"learning_rate": 0.00015050167151105988,
"loss": 4.9374,
"mean_token_accuracy": 0.23466922491788864,
"num_tokens": 15092512.0,
"step": 8300
},
{
"entropy": 5.771974849700928,
"epoch": 7.1349376880103135,
"grad_norm": 1.5625,
"learning_rate": 0.000150224861086671,
"loss": 4.9051,
"mean_token_accuracy": 0.24226571321487428,
"num_tokens": 15101722.0,
"step": 8305
},
{
"entropy": 5.73566575050354,
"epoch": 7.139235066609368,
"grad_norm": 1.3125,
"learning_rate": 0.00014994832312131332,
"loss": 4.8418,
"mean_token_accuracy": 0.24409846365451812,
"num_tokens": 15110114.0,
"step": 8310
},
{
"entropy": 5.698592281341552,
"epoch": 7.143532445208423,
"grad_norm": 1.3984375,
"learning_rate": 0.00014967205821883532,
"loss": 4.937,
"mean_token_accuracy": 0.23917200565338134,
"num_tokens": 15119461.0,
"step": 8315
},
{
"entropy": 5.707847642898559,
"epoch": 7.147829823807477,
"grad_norm": 1.4765625,
"learning_rate": 0.000149396066982489,
"loss": 4.8737,
"mean_token_accuracy": 0.24108270555734634,
"num_tokens": 15127518.0,
"step": 8320
},
{
"entropy": 5.750520896911621,
"epoch": 7.152127202406532,
"grad_norm": 1.3203125,
"learning_rate": 0.00014912035001492897,
"loss": 4.9462,
"mean_token_accuracy": 0.2327495649456978,
"num_tokens": 15136741.0,
"step": 8325
},
{
"entropy": 5.734196424484253,
"epoch": 7.156424581005586,
"grad_norm": 1.4375,
"learning_rate": 0.00014884490791821058,
"loss": 4.8907,
"mean_token_accuracy": 0.24345339983701705,
"num_tokens": 15145193.0,
"step": 8330
},
{
"entropy": 5.699493360519409,
"epoch": 7.160721959604641,
"grad_norm": 1.375,
"learning_rate": 0.00014856974129378981,
"loss": 4.8922,
"mean_token_accuracy": 0.2417183518409729,
"num_tokens": 15154117.0,
"step": 8335
},
{
"entropy": 5.712007856369018,
"epoch": 7.165019338203695,
"grad_norm": 1.296875,
"learning_rate": 0.0001482948507425203,
"loss": 4.913,
"mean_token_accuracy": 0.23444428145885468,
"num_tokens": 15163221.0,
"step": 8340
},
{
"entropy": 5.747190380096436,
"epoch": 7.169316716802751,
"grad_norm": 1.2421875,
"learning_rate": 0.00014802023686465314,
"loss": 4.9764,
"mean_token_accuracy": 0.22482520043849946,
"num_tokens": 15173234.0,
"step": 8345
},
{
"entropy": 5.685973215103149,
"epoch": 7.173614095401805,
"grad_norm": 1.421875,
"learning_rate": 0.00014774590025983523,
"loss": 4.8529,
"mean_token_accuracy": 0.24296284317970276,
"num_tokens": 15181436.0,
"step": 8350
},
{
"entropy": 5.700090456008911,
"epoch": 7.17791147400086,
"grad_norm": 1.5234375,
"learning_rate": 0.00014747184152710807,
"loss": 4.9054,
"mean_token_accuracy": 0.24234439432621002,
"num_tokens": 15191697.0,
"step": 8355
},
{
"entropy": 5.714896535873413,
"epoch": 7.182208852599914,
"grad_norm": 1.3515625,
"learning_rate": 0.00014719806126490658,
"loss": 4.8689,
"mean_token_accuracy": 0.2476797804236412,
"num_tokens": 15201563.0,
"step": 8360
},
{
"entropy": 5.682293272018432,
"epoch": 7.186506231198969,
"grad_norm": 1.4296875,
"learning_rate": 0.0001469245600710573,
"loss": 4.87,
"mean_token_accuracy": 0.24080406427383422,
"num_tokens": 15210886.0,
"step": 8365
},
{
"entropy": 5.713798952102661,
"epoch": 7.1908036097980235,
"grad_norm": 1.4765625,
"learning_rate": 0.00014665133854277742,
"loss": 4.9057,
"mean_token_accuracy": 0.23708308786153792,
"num_tokens": 15219254.0,
"step": 8370
},
{
"entropy": 5.675087833404541,
"epoch": 7.195100988397078,
"grad_norm": 1.359375,
"learning_rate": 0.0001463783972766737,
"loss": 4.8843,
"mean_token_accuracy": 0.24380502253770828,
"num_tokens": 15228117.0,
"step": 8375
},
{
"entropy": 5.638825178146362,
"epoch": 7.199398366996133,
"grad_norm": 1.4765625,
"learning_rate": 0.0001461057368687407,
"loss": 4.8655,
"mean_token_accuracy": 0.24366891533136367,
"num_tokens": 15236621.0,
"step": 8380
},
{
"entropy": 5.668357849121094,
"epoch": 7.203695745595187,
"grad_norm": 1.3671875,
"learning_rate": 0.00014583335791435971,
"loss": 4.82,
"mean_token_accuracy": 0.24267793148756028,
"num_tokens": 15245487.0,
"step": 8385
},
{
"entropy": 5.695615434646607,
"epoch": 7.207993124194242,
"grad_norm": 1.3125,
"learning_rate": 0.00014556126100829774,
"loss": 4.8767,
"mean_token_accuracy": 0.23861388117074966,
"num_tokens": 15255321.0,
"step": 8390
},
{
"entropy": 5.645771932601929,
"epoch": 7.212290502793296,
"grad_norm": 1.3125,
"learning_rate": 0.00014528944674470546,
"loss": 4.7918,
"mean_token_accuracy": 0.24924195259809495,
"num_tokens": 15264788.0,
"step": 8395
},
{
"entropy": 5.726456499099731,
"epoch": 7.216587881392351,
"grad_norm": 1.4296875,
"learning_rate": 0.0001450179157171166,
"loss": 4.886,
"mean_token_accuracy": 0.23832045942544938,
"num_tokens": 15273448.0,
"step": 8400
},
{
"entropy": 5.758960819244384,
"epoch": 7.220885259991405,
"grad_norm": 1.4453125,
"learning_rate": 0.00014474666851844632,
"loss": 4.9719,
"mean_token_accuracy": 0.22947929054498672,
"num_tokens": 15283071.0,
"step": 8405
},
{
"entropy": 5.690451908111572,
"epoch": 7.22518263859046,
"grad_norm": 1.4375,
"learning_rate": 0.00014447570574099028,
"loss": 4.784,
"mean_token_accuracy": 0.25350341796875,
"num_tokens": 15291537.0,
"step": 8410
},
{
"entropy": 5.763349103927612,
"epoch": 7.229480017189514,
"grad_norm": 1.2890625,
"learning_rate": 0.00014420502797642283,
"loss": 4.8627,
"mean_token_accuracy": 0.24792618304491043,
"num_tokens": 15300531.0,
"step": 8415
},
{
"entropy": 5.700027704238892,
"epoch": 7.233777395788569,
"grad_norm": 1.453125,
"learning_rate": 0.000143934635815796,
"loss": 4.947,
"mean_token_accuracy": 0.23724473267793655,
"num_tokens": 15309820.0,
"step": 8420
},
{
"entropy": 5.676058435440064,
"epoch": 7.2380747743876235,
"grad_norm": 1.515625,
"learning_rate": 0.0001436645298495381,
"loss": 4.8861,
"mean_token_accuracy": 0.23884514719247818,
"num_tokens": 15318604.0,
"step": 8425
},
{
"entropy": 5.662927532196045,
"epoch": 7.242372152986678,
"grad_norm": 1.328125,
"learning_rate": 0.00014339471066745262,
"loss": 4.8956,
"mean_token_accuracy": 0.23285638093948363,
"num_tokens": 15327737.0,
"step": 8430
},
{
"entropy": 5.710688924789428,
"epoch": 7.246669531585733,
"grad_norm": 1.4765625,
"learning_rate": 0.000143125178858717,
"loss": 4.9411,
"mean_token_accuracy": 0.23508805632591248,
"num_tokens": 15336663.0,
"step": 8435
},
{
"entropy": 5.775945138931275,
"epoch": 7.250966910184787,
"grad_norm": 1.34375,
"learning_rate": 0.00014285593501188083,
"loss": 4.9471,
"mean_token_accuracy": 0.23182412534952163,
"num_tokens": 15345278.0,
"step": 8440
},
{
"entropy": 5.743250989913941,
"epoch": 7.255264288783842,
"grad_norm": 1.3125,
"learning_rate": 0.00014258697971486492,
"loss": 4.9232,
"mean_token_accuracy": 0.24036518335342408,
"num_tokens": 15354230.0,
"step": 8445
},
{
"entropy": 5.734498691558838,
"epoch": 7.259561667382896,
"grad_norm": 1.40625,
"learning_rate": 0.00014231831355496045,
"loss": 4.9345,
"mean_token_accuracy": 0.2410287767648697,
"num_tokens": 15362838.0,
"step": 8450
},
{
"entropy": 5.6637495994567875,
"epoch": 7.263859045981951,
"grad_norm": 1.234375,
"learning_rate": 0.00014204993711882662,
"loss": 4.8889,
"mean_token_accuracy": 0.24409003406763077,
"num_tokens": 15372593.0,
"step": 8455
},
{
"entropy": 5.759537553787231,
"epoch": 7.268156424581005,
"grad_norm": 1.359375,
"learning_rate": 0.0001417818509924906,
"loss": 4.9528,
"mean_token_accuracy": 0.23652229011058806,
"num_tokens": 15381945.0,
"step": 8460
},
{
"entropy": 5.695685482025146,
"epoch": 7.27245380318006,
"grad_norm": 1.265625,
"learning_rate": 0.000141514055761345,
"loss": 4.8834,
"mean_token_accuracy": 0.24260732531547546,
"num_tokens": 15391487.0,
"step": 8465
},
{
"entropy": 5.741024160385132,
"epoch": 7.276751181779114,
"grad_norm": 1.4375,
"learning_rate": 0.00014124655201014786,
"loss": 4.8414,
"mean_token_accuracy": 0.24344971179962158,
"num_tokens": 15399891.0,
"step": 8470
},
{
"entropy": 5.6841777801513675,
"epoch": 7.281048560378169,
"grad_norm": 1.359375,
"learning_rate": 0.00014097934032302037,
"loss": 4.8381,
"mean_token_accuracy": 0.24351507276296616,
"num_tokens": 15408693.0,
"step": 8475
},
{
"entropy": 5.673647069931031,
"epoch": 7.2853459389772235,
"grad_norm": 1.375,
"learning_rate": 0.00014071242128344593,
"loss": 4.9228,
"mean_token_accuracy": 0.238188037276268,
"num_tokens": 15417779.0,
"step": 8480
},
{
"entropy": 5.6682343006134035,
"epoch": 7.289643317576278,
"grad_norm": 1.5078125,
"learning_rate": 0.0001404457954742691,
"loss": 4.845,
"mean_token_accuracy": 0.2480306074023247,
"num_tokens": 15425826.0,
"step": 8485
},
{
"entropy": 5.69491925239563,
"epoch": 7.2939406961753335,
"grad_norm": 1.640625,
"learning_rate": 0.00014017946347769423,
"loss": 4.914,
"mean_token_accuracy": 0.2448977291584015,
"num_tokens": 15435811.0,
"step": 8490
},
{
"entropy": 5.648247766494751,
"epoch": 7.298238074774388,
"grad_norm": 1.4765625,
"learning_rate": 0.00013991342587528377,
"loss": 4.8112,
"mean_token_accuracy": 0.2435745283961296,
"num_tokens": 15444949.0,
"step": 8495
},
{
"entropy": 5.6486053466796875,
"epoch": 7.302535453373443,
"grad_norm": 1.375,
"learning_rate": 0.00013964768324795752,
"loss": 4.8301,
"mean_token_accuracy": 0.2504597008228302,
"num_tokens": 15453398.0,
"step": 8500
},
{
"epoch": 7.302535453373443,
"eval_entropy": 5.5008021507177265,
"eval_loss": 5.913280487060547,
"eval_mean_token_accuracy": 0.18130035429924457,
"eval_num_tokens": 15453398.0,
"eval_runtime": 2.0541,
"eval_samples_per_second": 1727.748,
"eval_steps_per_second": 216.151,
"step": 8500
},
{
"entropy": 5.640936183929443,
"epoch": 7.306832831972497,
"grad_norm": 1.265625,
"learning_rate": 0.00013938223617599124,
"loss": 4.9141,
"mean_token_accuracy": 0.23757578134536744,
"num_tokens": 15462785.0,
"step": 8505
},
{
"entropy": 5.703793859481811,
"epoch": 7.311130210571552,
"grad_norm": 1.484375,
"learning_rate": 0.00013911708523901514,
"loss": 4.9328,
"mean_token_accuracy": 0.23959697782993317,
"num_tokens": 15471718.0,
"step": 8510
},
{
"entropy": 5.737540197372437,
"epoch": 7.315427589170606,
"grad_norm": 1.3671875,
"learning_rate": 0.00013885223101601303,
"loss": 4.8673,
"mean_token_accuracy": 0.2403181314468384,
"num_tokens": 15480204.0,
"step": 8515
},
{
"entropy": 5.678928709030151,
"epoch": 7.319724967769661,
"grad_norm": 1.3515625,
"learning_rate": 0.00013858767408532051,
"loss": 4.8308,
"mean_token_accuracy": 0.24473243802785874,
"num_tokens": 15489388.0,
"step": 8520
},
{
"entropy": 5.666277360916138,
"epoch": 7.324022346368715,
"grad_norm": 1.453125,
"learning_rate": 0.00013832341502462432,
"loss": 4.8509,
"mean_token_accuracy": 0.2423481523990631,
"num_tokens": 15498028.0,
"step": 8525
},
{
"entropy": 5.703804349899292,
"epoch": 7.32831972496777,
"grad_norm": 1.5703125,
"learning_rate": 0.00013805945441096057,
"loss": 4.8826,
"mean_token_accuracy": 0.24215862900018692,
"num_tokens": 15506382.0,
"step": 8530
},
{
"entropy": 5.715006399154663,
"epoch": 7.332617103566824,
"grad_norm": 1.4921875,
"learning_rate": 0.00013779579282071364,
"loss": 4.9085,
"mean_token_accuracy": 0.24137408286333084,
"num_tokens": 15515271.0,
"step": 8535
},
{
"entropy": 5.669809722900391,
"epoch": 7.336914482165879,
"grad_norm": 1.3359375,
"learning_rate": 0.00013753243082961512,
"loss": 4.8373,
"mean_token_accuracy": 0.24630660563707352,
"num_tokens": 15524396.0,
"step": 8540
},
{
"entropy": 5.671169328689575,
"epoch": 7.3412118607649335,
"grad_norm": 1.4453125,
"learning_rate": 0.00013726936901274246,
"loss": 4.816,
"mean_token_accuracy": 0.24975510984659194,
"num_tokens": 15532829.0,
"step": 8545
},
{
"entropy": 5.729752159118652,
"epoch": 7.345509239363988,
"grad_norm": 1.2890625,
"learning_rate": 0.0001370066079445174,
"loss": 4.9423,
"mean_token_accuracy": 0.23485267013311387,
"num_tokens": 15541726.0,
"step": 8550
},
{
"entropy": 5.765802001953125,
"epoch": 7.349806617963043,
"grad_norm": 1.3203125,
"learning_rate": 0.00013674414819870502,
"loss": 5.0472,
"mean_token_accuracy": 0.22578038275241852,
"num_tokens": 15551539.0,
"step": 8555
},
{
"entropy": 5.687641620635986,
"epoch": 7.354103996562097,
"grad_norm": 1.484375,
"learning_rate": 0.00013648199034841264,
"loss": 4.8888,
"mean_token_accuracy": 0.23955927342176436,
"num_tokens": 15560147.0,
"step": 8560
},
{
"entropy": 5.673187303543091,
"epoch": 7.358401375161152,
"grad_norm": 1.4453125,
"learning_rate": 0.0001362201349660882,
"loss": 4.8612,
"mean_token_accuracy": 0.24715079367160797,
"num_tokens": 15568983.0,
"step": 8565
},
{
"entropy": 5.653613901138305,
"epoch": 7.362698753760206,
"grad_norm": 1.421875,
"learning_rate": 0.0001359585826235192,
"loss": 4.8706,
"mean_token_accuracy": 0.24422087669372558,
"num_tokens": 15578065.0,
"step": 8570
},
{
"entropy": 5.7628312587738035,
"epoch": 7.366996132359261,
"grad_norm": 1.5390625,
"learning_rate": 0.00013569733389183126,
"loss": 4.9618,
"mean_token_accuracy": 0.2341765359044075,
"num_tokens": 15587181.0,
"step": 8575
},
{
"entropy": 5.705190706253052,
"epoch": 7.371293510958315,
"grad_norm": 1.3359375,
"learning_rate": 0.00013543638934148736,
"loss": 4.8957,
"mean_token_accuracy": 0.23927247971296312,
"num_tokens": 15596602.0,
"step": 8580
},
{
"entropy": 5.772485971450806,
"epoch": 7.37559088955737,
"grad_norm": 1.5,
"learning_rate": 0.000135175749542286,
"loss": 4.9429,
"mean_token_accuracy": 0.22380622774362563,
"num_tokens": 15605857.0,
"step": 8585
},
{
"entropy": 5.623779296875,
"epoch": 7.379888268156424,
"grad_norm": 1.2890625,
"learning_rate": 0.0001349154150633604,
"loss": 4.8308,
"mean_token_accuracy": 0.2533808171749115,
"num_tokens": 15615320.0,
"step": 8590
},
{
"entropy": 5.737985992431641,
"epoch": 7.384185646755479,
"grad_norm": 1.34375,
"learning_rate": 0.000134655386473177,
"loss": 4.9824,
"mean_token_accuracy": 0.22759506702423096,
"num_tokens": 15624193.0,
"step": 8595
},
{
"entropy": 5.601533651351929,
"epoch": 7.3884830253545335,
"grad_norm": 1.5234375,
"learning_rate": 0.00013439566433953427,
"loss": 4.8275,
"mean_token_accuracy": 0.25197608172893526,
"num_tokens": 15632924.0,
"step": 8600
},
{
"entropy": 5.694646692276001,
"epoch": 7.392780403953588,
"grad_norm": 1.4296875,
"learning_rate": 0.0001341362492295616,
"loss": 4.847,
"mean_token_accuracy": 0.24944338649511338,
"num_tokens": 15642201.0,
"step": 8605
},
{
"entropy": 5.757268381118775,
"epoch": 7.397077782552643,
"grad_norm": 1.5234375,
"learning_rate": 0.00013387714170971776,
"loss": 4.8893,
"mean_token_accuracy": 0.23827150762081145,
"num_tokens": 15651608.0,
"step": 8610
},
{
"entropy": 5.751231670379639,
"epoch": 7.401375161151697,
"grad_norm": 1.3359375,
"learning_rate": 0.00013361834234579012,
"loss": 4.9938,
"mean_token_accuracy": 0.23161635547876358,
"num_tokens": 15661768.0,
"step": 8615
},
{
"entropy": 5.6181495666503904,
"epoch": 7.405672539750752,
"grad_norm": 1.4609375,
"learning_rate": 0.0001333598517028931,
"loss": 4.8745,
"mean_token_accuracy": 0.2390742525458336,
"num_tokens": 15670270.0,
"step": 8620
},
{
"entropy": 5.660263729095459,
"epoch": 7.409969918349806,
"grad_norm": 1.40625,
"learning_rate": 0.00013310167034546688,
"loss": 4.8491,
"mean_token_accuracy": 0.2465496301651001,
"num_tokens": 15679587.0,
"step": 8625
},
{
"entropy": 5.721620082855225,
"epoch": 7.414267296948861,
"grad_norm": 1.5078125,
"learning_rate": 0.0001328437988372763,
"loss": 4.9246,
"mean_token_accuracy": 0.23698771893978118,
"num_tokens": 15688838.0,
"step": 8630
},
{
"entropy": 5.744182872772217,
"epoch": 7.418564675547916,
"grad_norm": 1.4140625,
"learning_rate": 0.00013258623774140967,
"loss": 4.8863,
"mean_token_accuracy": 0.24282266497611998,
"num_tokens": 15697744.0,
"step": 8635
},
{
"entropy": 5.69905276298523,
"epoch": 7.422862054146971,
"grad_norm": 1.4765625,
"learning_rate": 0.00013232898762027766,
"loss": 4.8515,
"mean_token_accuracy": 0.24636502265930177,
"num_tokens": 15707643.0,
"step": 8640
},
{
"entropy": 5.722139835357666,
"epoch": 7.427159432746025,
"grad_norm": 1.3828125,
"learning_rate": 0.00013207204903561154,
"loss": 4.9429,
"mean_token_accuracy": 0.232541623711586,
"num_tokens": 15717568.0,
"step": 8645
},
{
"entropy": 5.655771064758301,
"epoch": 7.43145681134508,
"grad_norm": 1.2890625,
"learning_rate": 0.00013181542254846247,
"loss": 4.8108,
"mean_token_accuracy": 0.24847375005483627,
"num_tokens": 15726467.0,
"step": 8650
},
{
"entropy": 5.736402368545532,
"epoch": 7.435754189944134,
"grad_norm": 1.4453125,
"learning_rate": 0.0001315591087192002,
"loss": 4.9594,
"mean_token_accuracy": 0.23719182014465331,
"num_tokens": 15736533.0,
"step": 8655
},
{
"entropy": 5.628046464920044,
"epoch": 7.440051568543189,
"grad_norm": 1.328125,
"learning_rate": 0.00013130310810751162,
"loss": 4.8607,
"mean_token_accuracy": 0.2462889164686203,
"num_tokens": 15745853.0,
"step": 8660
},
{
"entropy": 5.74335241317749,
"epoch": 7.4443489471422435,
"grad_norm": 1.4375,
"learning_rate": 0.00013104742127239983,
"loss": 4.9821,
"mean_token_accuracy": 0.23338729590177537,
"num_tokens": 15755534.0,
"step": 8665
},
{
"entropy": 5.772054052352905,
"epoch": 7.448646325741298,
"grad_norm": 1.2109375,
"learning_rate": 0.0001307920487721826,
"loss": 4.9678,
"mean_token_accuracy": 0.232174876332283,
"num_tokens": 15766182.0,
"step": 8670
},
{
"entropy": 5.706271362304688,
"epoch": 7.452943704340353,
"grad_norm": 1.421875,
"learning_rate": 0.00013053699116449144,
"loss": 4.8787,
"mean_token_accuracy": 0.23836376070976256,
"num_tokens": 15775454.0,
"step": 8675
},
{
"entropy": 5.858646392822266,
"epoch": 7.457241082939407,
"grad_norm": 1.296875,
"learning_rate": 0.00013028224900627026,
"loss": 4.9947,
"mean_token_accuracy": 0.22622087746858596,
"num_tokens": 15784768.0,
"step": 8680
},
{
"entropy": 5.753428220748901,
"epoch": 7.461538461538462,
"grad_norm": 1.125,
"learning_rate": 0.00013002782285377395,
"loss": 4.9381,
"mean_token_accuracy": 0.2409772902727127,
"num_tokens": 15794255.0,
"step": 8685
},
{
"entropy": 5.725694417953491,
"epoch": 7.465835840137516,
"grad_norm": 1.5234375,
"learning_rate": 0.0001297737132625677,
"loss": 4.9665,
"mean_token_accuracy": 0.23175050765275956,
"num_tokens": 15803722.0,
"step": 8690
},
{
"entropy": 5.731313705444336,
"epoch": 7.470133218736571,
"grad_norm": 1.6015625,
"learning_rate": 0.00012951992078752528,
"loss": 4.9028,
"mean_token_accuracy": 0.23898655623197557,
"num_tokens": 15811819.0,
"step": 8695
},
{
"entropy": 5.703043031692505,
"epoch": 7.474430597335625,
"grad_norm": 1.296875,
"learning_rate": 0.00012926644598282798,
"loss": 4.9245,
"mean_token_accuracy": 0.2335854396224022,
"num_tokens": 15821446.0,
"step": 8700
},
{
"entropy": 5.735653400421143,
"epoch": 7.47872797593468,
"grad_norm": 1.28125,
"learning_rate": 0.0001290132894019634,
"loss": 4.9445,
"mean_token_accuracy": 0.24317347705364228,
"num_tokens": 15830585.0,
"step": 8705
},
{
"entropy": 5.7284379482269285,
"epoch": 7.483025354533734,
"grad_norm": 1.359375,
"learning_rate": 0.00012876045159772442,
"loss": 4.9058,
"mean_token_accuracy": 0.24099535942077638,
"num_tokens": 15838872.0,
"step": 8710
},
{
"entropy": 5.761771535873413,
"epoch": 7.487322733132789,
"grad_norm": 1.453125,
"learning_rate": 0.00012850793312220766,
"loss": 4.9076,
"mean_token_accuracy": 0.23802259117364882,
"num_tokens": 15847561.0,
"step": 8715
},
{
"entropy": 5.646856451034546,
"epoch": 7.4916201117318435,
"grad_norm": 1.265625,
"learning_rate": 0.00012825573452681266,
"loss": 4.8555,
"mean_token_accuracy": 0.25125192701816557,
"num_tokens": 15856405.0,
"step": 8720
},
{
"entropy": 5.72300238609314,
"epoch": 7.495917490330898,
"grad_norm": 1.53125,
"learning_rate": 0.00012800385636224017,
"loss": 4.9081,
"mean_token_accuracy": 0.24140879213809968,
"num_tokens": 15865856.0,
"step": 8725
},
{
"entropy": 5.711844491958618,
"epoch": 7.500214868929953,
"grad_norm": 1.5078125,
"learning_rate": 0.00012775229917849162,
"loss": 4.8527,
"mean_token_accuracy": 0.24171538800001144,
"num_tokens": 15873605.0,
"step": 8730
},
{
"entropy": 5.730578804016114,
"epoch": 7.504512247529007,
"grad_norm": 1.3203125,
"learning_rate": 0.00012750106352486728,
"loss": 4.8656,
"mean_token_accuracy": 0.24494647234678268,
"num_tokens": 15883123.0,
"step": 8735
},
{
"entropy": 5.6782575130462645,
"epoch": 7.508809626128062,
"grad_norm": 1.203125,
"learning_rate": 0.00012725014994996534,
"loss": 4.9047,
"mean_token_accuracy": 0.24687566012144088,
"num_tokens": 15892713.0,
"step": 8740
},
{
"entropy": 5.758023262023926,
"epoch": 7.513107004727116,
"grad_norm": 1.3203125,
"learning_rate": 0.00012699955900168075,
"loss": 4.9081,
"mean_token_accuracy": 0.23785718083381652,
"num_tokens": 15902913.0,
"step": 8745
},
{
"entropy": 5.727906036376953,
"epoch": 7.517404383326171,
"grad_norm": 1.359375,
"learning_rate": 0.00012674929122720414,
"loss": 4.9398,
"mean_token_accuracy": 0.2300073966383934,
"num_tokens": 15912721.0,
"step": 8750
},
{
"entropy": 5.644121694564819,
"epoch": 7.521701761925225,
"grad_norm": 1.2890625,
"learning_rate": 0.0001264993471730202,
"loss": 4.833,
"mean_token_accuracy": 0.24297845661640166,
"num_tokens": 15921520.0,
"step": 8755
},
{
"entropy": 5.747539854049682,
"epoch": 7.52599914052428,
"grad_norm": 1.3203125,
"learning_rate": 0.00012624972738490675,
"loss": 4.9571,
"mean_token_accuracy": 0.23408962190151214,
"num_tokens": 15930753.0,
"step": 8760
},
{
"entropy": 5.780120038986206,
"epoch": 7.530296519123334,
"grad_norm": 1.3203125,
"learning_rate": 0.00012600043240793368,
"loss": 4.9331,
"mean_token_accuracy": 0.2341680034995079,
"num_tokens": 15939957.0,
"step": 8765
},
{
"entropy": 5.723352432250977,
"epoch": 7.534593897722389,
"grad_norm": 1.453125,
"learning_rate": 0.00012575146278646175,
"loss": 4.8624,
"mean_token_accuracy": 0.24073042422533036,
"num_tokens": 15949555.0,
"step": 8770
},
{
"entropy": 5.667313003540039,
"epoch": 7.5388912763214435,
"grad_norm": 1.3203125,
"learning_rate": 0.00012550281906414097,
"loss": 4.8799,
"mean_token_accuracy": 0.23746145516633987,
"num_tokens": 15958395.0,
"step": 8775
},
{
"entropy": 5.7248999118804935,
"epoch": 7.543188654920499,
"grad_norm": 1.3203125,
"learning_rate": 0.00012525450178390972,
"loss": 4.9127,
"mean_token_accuracy": 0.24568843394517897,
"num_tokens": 15967522.0,
"step": 8780
},
{
"entropy": 5.747699928283692,
"epoch": 7.547486033519553,
"grad_norm": 1.4140625,
"learning_rate": 0.0001250065114879939,
"loss": 4.9025,
"mean_token_accuracy": 0.23830792605876921,
"num_tokens": 15976311.0,
"step": 8785
},
{
"entropy": 5.6730828285217285,
"epoch": 7.551783412118608,
"grad_norm": 1.375,
"learning_rate": 0.00012475884871790505,
"loss": 4.8426,
"mean_token_accuracy": 0.24330639988183975,
"num_tokens": 15985202.0,
"step": 8790
},
{
"entropy": 5.788040351867676,
"epoch": 7.556080790717663,
"grad_norm": 1.421875,
"learning_rate": 0.00012451151401443982,
"loss": 4.9778,
"mean_token_accuracy": 0.22562479078769684,
"num_tokens": 15995043.0,
"step": 8795
},
{
"entropy": 5.629932451248169,
"epoch": 7.560378169316717,
"grad_norm": 1.3828125,
"learning_rate": 0.00012426450791767815,
"loss": 4.8205,
"mean_token_accuracy": 0.24828920215368272,
"num_tokens": 16004355.0,
"step": 8800
},
{
"entropy": 5.700480365753174,
"epoch": 7.564675547915772,
"grad_norm": 1.4296875,
"learning_rate": 0.00012401783096698283,
"loss": 4.7502,
"mean_token_accuracy": 0.24835503846406937,
"num_tokens": 16013069.0,
"step": 8805
},
{
"entropy": 5.706959199905396,
"epoch": 7.568972926514826,
"grad_norm": 1.28125,
"learning_rate": 0.00012377148370099764,
"loss": 4.9231,
"mean_token_accuracy": 0.2306264817714691,
"num_tokens": 16023757.0,
"step": 8810
},
{
"entropy": 5.7141008377075195,
"epoch": 7.573270305113881,
"grad_norm": 1.40625,
"learning_rate": 0.00012352546665764642,
"loss": 4.9245,
"mean_token_accuracy": 0.23618121743202208,
"num_tokens": 16032550.0,
"step": 8815
},
{
"entropy": 5.592525005340576,
"epoch": 7.577567683712935,
"grad_norm": 1.3359375,
"learning_rate": 0.00012327978037413219,
"loss": 4.8005,
"mean_token_accuracy": 0.25415861159563063,
"num_tokens": 16041580.0,
"step": 8820
},
{
"entropy": 5.6945716381073,
"epoch": 7.58186506231199,
"grad_norm": 1.4765625,
"learning_rate": 0.00012303442538693564,
"loss": 4.9079,
"mean_token_accuracy": 0.23844119608402253,
"num_tokens": 16049845.0,
"step": 8825
},
{
"entropy": 5.649405431747437,
"epoch": 7.586162440911044,
"grad_norm": 1.484375,
"learning_rate": 0.00012278940223181393,
"loss": 4.8096,
"mean_token_accuracy": 0.23990656286478043,
"num_tokens": 16059703.0,
"step": 8830
},
{
"entropy": 5.67500786781311,
"epoch": 7.590459819510099,
"grad_norm": 1.3359375,
"learning_rate": 0.00012254471144379964,
"loss": 4.7812,
"mean_token_accuracy": 0.2586831733584404,
"num_tokens": 16068416.0,
"step": 8835
},
{
"entropy": 5.6926109313964846,
"epoch": 7.5947571981091535,
"grad_norm": 1.34375,
"learning_rate": 0.00012230035355719968,
"loss": 4.9417,
"mean_token_accuracy": 0.23565699011087418,
"num_tokens": 16078067.0,
"step": 8840
},
{
"entropy": 5.718332052230835,
"epoch": 7.599054576708208,
"grad_norm": 1.3359375,
"learning_rate": 0.0001220563291055941,
"loss": 4.8999,
"mean_token_accuracy": 0.24132361114025117,
"num_tokens": 16086591.0,
"step": 8845
},
{
"entropy": 5.74136209487915,
"epoch": 7.603351955307263,
"grad_norm": 1.40625,
"learning_rate": 0.0001218126386218347,
"loss": 4.9064,
"mean_token_accuracy": 0.2423307090997696,
"num_tokens": 16096138.0,
"step": 8850
},
{
"entropy": 5.653201866149902,
"epoch": 7.607649333906317,
"grad_norm": 1.46875,
"learning_rate": 0.00012156928263804403,
"loss": 4.839,
"mean_token_accuracy": 0.24370431303977966,
"num_tokens": 16105182.0,
"step": 8855
},
{
"entropy": 5.720748567581177,
"epoch": 7.611946712505372,
"grad_norm": 1.515625,
"learning_rate": 0.0001213262616856144,
"loss": 4.9646,
"mean_token_accuracy": 0.23470364809036254,
"num_tokens": 16113940.0,
"step": 8860
},
{
"entropy": 5.711144542694091,
"epoch": 7.616244091104426,
"grad_norm": 1.5078125,
"learning_rate": 0.00012108357629520635,
"loss": 4.8594,
"mean_token_accuracy": 0.2404816433787346,
"num_tokens": 16123036.0,
"step": 8865
},
{
"entropy": 5.682049751281738,
"epoch": 7.620541469703481,
"grad_norm": 1.4296875,
"learning_rate": 0.00012084122699674785,
"loss": 4.8693,
"mean_token_accuracy": 0.24037092477083205,
"num_tokens": 16131057.0,
"step": 8870
},
{
"entropy": 5.707250261306763,
"epoch": 7.624838848302535,
"grad_norm": 1.4921875,
"learning_rate": 0.00012059921431943278,
"loss": 4.9119,
"mean_token_accuracy": 0.23492788076400756,
"num_tokens": 16140259.0,
"step": 8875
},
{
"entropy": 5.750310230255127,
"epoch": 7.62913622690159,
"grad_norm": 1.5234375,
"learning_rate": 0.00012035753879172026,
"loss": 5.0946,
"mean_token_accuracy": 0.22113668769598008,
"num_tokens": 16149585.0,
"step": 8880
},
{
"entropy": 5.6439436912536625,
"epoch": 7.633433605500644,
"grad_norm": 1.453125,
"learning_rate": 0.00012011620094133296,
"loss": 4.7415,
"mean_token_accuracy": 0.2517879784107208,
"num_tokens": 16157656.0,
"step": 8885
},
{
"entropy": 5.633669376373291,
"epoch": 7.637730984099699,
"grad_norm": 1.359375,
"learning_rate": 0.00011987520129525622,
"loss": 4.8953,
"mean_token_accuracy": 0.2355537548661232,
"num_tokens": 16166900.0,
"step": 8890
},
{
"entropy": 5.7096264362335205,
"epoch": 7.6420283626987535,
"grad_norm": 1.546875,
"learning_rate": 0.000119634540379737,
"loss": 4.9149,
"mean_token_accuracy": 0.23628997951745986,
"num_tokens": 16174859.0,
"step": 8895
},
{
"entropy": 5.737153196334839,
"epoch": 7.646325741297808,
"grad_norm": 1.421875,
"learning_rate": 0.00011939421872028262,
"loss": 4.9069,
"mean_token_accuracy": 0.23722454458475112,
"num_tokens": 16183660.0,
"step": 8900
},
{
"entropy": 5.666551923751831,
"epoch": 7.650623119896863,
"grad_norm": 1.3984375,
"learning_rate": 0.00011915423684165948,
"loss": 4.8487,
"mean_token_accuracy": 0.24358074367046356,
"num_tokens": 16192344.0,
"step": 8905
},
{
"entropy": 5.7381829738616945,
"epoch": 7.654920498495917,
"grad_norm": 1.328125,
"learning_rate": 0.00011891459526789198,
"loss": 4.9234,
"mean_token_accuracy": 0.23702718764543534,
"num_tokens": 16202060.0,
"step": 8910
},
{
"entropy": 5.74292402267456,
"epoch": 7.659217877094972,
"grad_norm": 1.359375,
"learning_rate": 0.0001186752945222616,
"loss": 4.9217,
"mean_token_accuracy": 0.23749222308397294,
"num_tokens": 16211297.0,
"step": 8915
},
{
"entropy": 5.719897603988647,
"epoch": 7.663515255694026,
"grad_norm": 1.265625,
"learning_rate": 0.00011843633512730562,
"loss": 4.8646,
"mean_token_accuracy": 0.24102884978055955,
"num_tokens": 16219812.0,
"step": 8920
},
{
"entropy": 5.695969915390014,
"epoch": 7.667812634293082,
"grad_norm": 1.34375,
"learning_rate": 0.00011819771760481576,
"loss": 4.8765,
"mean_token_accuracy": 0.24786355644464492,
"num_tokens": 16229197.0,
"step": 8925
},
{
"entropy": 5.634260749816894,
"epoch": 7.672110012892135,
"grad_norm": 1.4140625,
"learning_rate": 0.00011795944247583725,
"loss": 4.8107,
"mean_token_accuracy": 0.2450357496738434,
"num_tokens": 16238154.0,
"step": 8930
},
{
"entropy": 5.663767862319946,
"epoch": 7.676407391491191,
"grad_norm": 1.2890625,
"learning_rate": 0.00011772151026066789,
"loss": 4.8537,
"mean_token_accuracy": 0.23516058027744294,
"num_tokens": 16247206.0,
"step": 8935
},
{
"entropy": 5.71944785118103,
"epoch": 7.680704770090245,
"grad_norm": 1.3671875,
"learning_rate": 0.00011748392147885642,
"loss": 4.972,
"mean_token_accuracy": 0.23327646702528,
"num_tokens": 16256571.0,
"step": 8940
},
{
"entropy": 5.712430381774903,
"epoch": 7.6850021486893,
"grad_norm": 1.6171875,
"learning_rate": 0.00011724667664920177,
"loss": 4.9113,
"mean_token_accuracy": 0.23777286261320113,
"num_tokens": 16265429.0,
"step": 8945
},
{
"entropy": 5.760171937942505,
"epoch": 7.689299527288354,
"grad_norm": 1.46875,
"learning_rate": 0.00011700977628975183,
"loss": 5.0088,
"mean_token_accuracy": 0.22751238495111464,
"num_tokens": 16273804.0,
"step": 8950
},
{
"entropy": 5.7118391513824465,
"epoch": 7.693596905887409,
"grad_norm": 1.453125,
"learning_rate": 0.00011677322091780243,
"loss": 4.928,
"mean_token_accuracy": 0.23360252976417542,
"num_tokens": 16282894.0,
"step": 8955
},
{
"entropy": 5.762385988235474,
"epoch": 7.6978942844864635,
"grad_norm": 1.390625,
"learning_rate": 0.0001165370110498958,
"loss": 4.9468,
"mean_token_accuracy": 0.2344141960144043,
"num_tokens": 16291568.0,
"step": 8960
},
{
"entropy": 5.75871696472168,
"epoch": 7.702191663085518,
"grad_norm": 1.3671875,
"learning_rate": 0.00011630114720181989,
"loss": 4.943,
"mean_token_accuracy": 0.23351782113313674,
"num_tokens": 16300650.0,
"step": 8965
},
{
"entropy": 5.710498237609864,
"epoch": 7.706489041684573,
"grad_norm": 1.40625,
"learning_rate": 0.00011606562988860711,
"loss": 4.8749,
"mean_token_accuracy": 0.23832377195358276,
"num_tokens": 16309712.0,
"step": 8970
},
{
"entropy": 5.69747223854065,
"epoch": 7.710786420283627,
"grad_norm": 1.25,
"learning_rate": 0.0001158304596245332,
"loss": 4.8552,
"mean_token_accuracy": 0.24203347712755202,
"num_tokens": 16319440.0,
"step": 8975
},
{
"entropy": 5.64069881439209,
"epoch": 7.715083798882682,
"grad_norm": 1.3359375,
"learning_rate": 0.00011559563692311595,
"loss": 4.8583,
"mean_token_accuracy": 0.24638050347566604,
"num_tokens": 16328752.0,
"step": 8980
},
{
"entropy": 5.687512683868408,
"epoch": 7.719381177481736,
"grad_norm": 1.34375,
"learning_rate": 0.00011536116229711422,
"loss": 4.8477,
"mean_token_accuracy": 0.24722600281238555,
"num_tokens": 16338045.0,
"step": 8985
},
{
"entropy": 5.758387184143066,
"epoch": 7.723678556080791,
"grad_norm": 1.359375,
"learning_rate": 0.000115127036258527,
"loss": 4.9253,
"mean_token_accuracy": 0.23546791225671768,
"num_tokens": 16347174.0,
"step": 8990
},
{
"entropy": 5.679119014739991,
"epoch": 7.727975934679845,
"grad_norm": 1.46875,
"learning_rate": 0.00011489325931859185,
"loss": 4.7787,
"mean_token_accuracy": 0.25743198245763776,
"num_tokens": 16355371.0,
"step": 8995
},
{
"entropy": 5.682134628295898,
"epoch": 7.7322733132789,
"grad_norm": 1.34375,
"learning_rate": 0.0001146598319877843,
"loss": 4.8576,
"mean_token_accuracy": 0.24060671776533127,
"num_tokens": 16363938.0,
"step": 9000
},
{
"epoch": 7.7322733132789,
"eval_entropy": 5.50046287893175,
"eval_loss": 5.91103458404541,
"eval_mean_token_accuracy": 0.1812090568814997,
"eval_num_tokens": 16363938.0,
"eval_runtime": 2.0492,
"eval_samples_per_second": 1731.916,
"eval_steps_per_second": 216.673,
"step": 9000
},
{
"entropy": 5.69020528793335,
"epoch": 7.736570691877954,
"grad_norm": 1.53125,
"learning_rate": 0.00011442675477581621,
"loss": 4.9217,
"mean_token_accuracy": 0.23781170547008515,
"num_tokens": 16373110.0,
"step": 9005
},
{
"entropy": 5.713038206100464,
"epoch": 7.740868070477009,
"grad_norm": 1.3515625,
"learning_rate": 0.0001141940281916352,
"loss": 4.8384,
"mean_token_accuracy": 0.23741564452648162,
"num_tokens": 16381521.0,
"step": 9010
},
{
"entropy": 5.757988119125367,
"epoch": 7.7451654490760635,
"grad_norm": 1.3359375,
"learning_rate": 0.00011396165274342304,
"loss": 4.9514,
"mean_token_accuracy": 0.23201918452978135,
"num_tokens": 16391322.0,
"step": 9015
},
{
"entropy": 5.672502326965332,
"epoch": 7.749462827675118,
"grad_norm": 1.296875,
"learning_rate": 0.00011372962893859471,
"loss": 4.8857,
"mean_token_accuracy": 0.23982396721839905,
"num_tokens": 16400653.0,
"step": 9020
},
{
"entropy": 5.67963056564331,
"epoch": 7.753760206274173,
"grad_norm": 1.390625,
"learning_rate": 0.00011349795728379759,
"loss": 4.9017,
"mean_token_accuracy": 0.23146291971206664,
"num_tokens": 16410133.0,
"step": 9025
},
{
"entropy": 5.738080835342407,
"epoch": 7.758057584873227,
"grad_norm": 1.375,
"learning_rate": 0.00011326663828491,
"loss": 4.9688,
"mean_token_accuracy": 0.23214154988527297,
"num_tokens": 16419302.0,
"step": 9030
},
{
"entropy": 5.694006776809692,
"epoch": 7.762354963472282,
"grad_norm": 1.5078125,
"learning_rate": 0.00011303567244704015,
"loss": 4.9394,
"mean_token_accuracy": 0.24590874761343,
"num_tokens": 16428020.0,
"step": 9035
},
{
"entropy": 5.726604032516479,
"epoch": 7.766652342071336,
"grad_norm": 1.375,
"learning_rate": 0.00011280506027452502,
"loss": 5.0101,
"mean_token_accuracy": 0.22551458925008774,
"num_tokens": 16438033.0,
"step": 9040
},
{
"entropy": 5.740489101409912,
"epoch": 7.770949720670391,
"grad_norm": 1.5,
"learning_rate": 0.0001125748022709295,
"loss": 4.9396,
"mean_token_accuracy": 0.24004841595888138,
"num_tokens": 16447067.0,
"step": 9045
},
{
"entropy": 5.762021446228028,
"epoch": 7.775247099269445,
"grad_norm": 1.3359375,
"learning_rate": 0.00011234489893904509,
"loss": 4.957,
"mean_token_accuracy": 0.22969225496053697,
"num_tokens": 16457146.0,
"step": 9050
},
{
"entropy": 5.696189260482788,
"epoch": 7.7795444778685,
"grad_norm": 1.3671875,
"learning_rate": 0.00011211535078088869,
"loss": 4.8183,
"mean_token_accuracy": 0.2493802011013031,
"num_tokens": 16466428.0,
"step": 9055
},
{
"entropy": 5.687802410125732,
"epoch": 7.783841856467554,
"grad_norm": 1.4453125,
"learning_rate": 0.00011188615829770171,
"loss": 4.867,
"mean_token_accuracy": 0.242595311999321,
"num_tokens": 16474198.0,
"step": 9060
},
{
"entropy": 5.6849202632904055,
"epoch": 7.788139235066609,
"grad_norm": 1.453125,
"learning_rate": 0.00011165732198994905,
"loss": 4.8999,
"mean_token_accuracy": 0.23992180526256562,
"num_tokens": 16483464.0,
"step": 9065
},
{
"entropy": 5.681135511398315,
"epoch": 7.792436613665664,
"grad_norm": 1.46875,
"learning_rate": 0.00011142884235731756,
"loss": 4.8742,
"mean_token_accuracy": 0.24454833716154098,
"num_tokens": 16492619.0,
"step": 9070
},
{
"entropy": 5.7551130771636965,
"epoch": 7.796733992264718,
"grad_norm": 1.5859375,
"learning_rate": 0.00011120071989871564,
"loss": 4.9718,
"mean_token_accuracy": 0.24016929417848587,
"num_tokens": 16501690.0,
"step": 9075
},
{
"entropy": 5.639516925811767,
"epoch": 7.8010313708637735,
"grad_norm": 1.2578125,
"learning_rate": 0.00011097295511227134,
"loss": 4.8645,
"mean_token_accuracy": 0.24110034853219986,
"num_tokens": 16510158.0,
"step": 9080
},
{
"entropy": 5.668501472473144,
"epoch": 7.805328749462827,
"grad_norm": 1.3828125,
"learning_rate": 0.0001107455484953321,
"loss": 4.8698,
"mean_token_accuracy": 0.23620172441005707,
"num_tokens": 16518722.0,
"step": 9085
},
{
"entropy": 5.657008075714112,
"epoch": 7.809626128061883,
"grad_norm": 1.40625,
"learning_rate": 0.00011051850054446306,
"loss": 4.882,
"mean_token_accuracy": 0.24305418133735657,
"num_tokens": 16527404.0,
"step": 9090
},
{
"entropy": 5.6321070194244385,
"epoch": 7.813923506660937,
"grad_norm": 1.5078125,
"learning_rate": 0.00011029181175544603,
"loss": 4.8245,
"mean_token_accuracy": 0.25041354447603226,
"num_tokens": 16536210.0,
"step": 9095
},
{
"entropy": 5.769088315963745,
"epoch": 7.818220885259992,
"grad_norm": 1.53125,
"learning_rate": 0.00011006548262327884,
"loss": 4.9854,
"mean_token_accuracy": 0.23913427740335463,
"num_tokens": 16544707.0,
"step": 9100
},
{
"entropy": 5.764179706573486,
"epoch": 7.822518263859046,
"grad_norm": 1.328125,
"learning_rate": 0.0001098395136421739,
"loss": 4.9391,
"mean_token_accuracy": 0.22906827330589294,
"num_tokens": 16553883.0,
"step": 9105
},
{
"entropy": 5.7137744426727295,
"epoch": 7.826815642458101,
"grad_norm": 1.2890625,
"learning_rate": 0.00010961390530555712,
"loss": 4.9357,
"mean_token_accuracy": 0.2278410956263542,
"num_tokens": 16562537.0,
"step": 9110
},
{
"entropy": 5.699681091308594,
"epoch": 7.831113021057155,
"grad_norm": 1.3359375,
"learning_rate": 0.00010938865810606682,
"loss": 4.9009,
"mean_token_accuracy": 0.24008741080760956,
"num_tokens": 16571665.0,
"step": 9115
},
{
"entropy": 5.712159013748169,
"epoch": 7.83541039965621,
"grad_norm": 1.296875,
"learning_rate": 0.00010916377253555293,
"loss": 4.9065,
"mean_token_accuracy": 0.23339497298002243,
"num_tokens": 16581102.0,
"step": 9120
},
{
"entropy": 5.671864986419678,
"epoch": 7.839707778255264,
"grad_norm": 1.4140625,
"learning_rate": 0.00010893924908507573,
"loss": 4.865,
"mean_token_accuracy": 0.24521686434745787,
"num_tokens": 16589958.0,
"step": 9125
},
{
"entropy": 5.765967130661011,
"epoch": 7.844005156854319,
"grad_norm": 1.4765625,
"learning_rate": 0.0001087150882449046,
"loss": 4.9268,
"mean_token_accuracy": 0.2391319289803505,
"num_tokens": 16598800.0,
"step": 9130
},
{
"entropy": 5.707003164291382,
"epoch": 7.8483025354533735,
"grad_norm": 1.3671875,
"learning_rate": 0.00010849129050451717,
"loss": 4.8968,
"mean_token_accuracy": 0.23445421308279038,
"num_tokens": 16607751.0,
"step": 9135
},
{
"entropy": 5.652349233627319,
"epoch": 7.852599914052428,
"grad_norm": 1.3125,
"learning_rate": 0.00010826785635259842,
"loss": 4.8453,
"mean_token_accuracy": 0.24450734853744507,
"num_tokens": 16616041.0,
"step": 9140
},
{
"entropy": 5.660851383209229,
"epoch": 7.856897292651483,
"grad_norm": 1.5625,
"learning_rate": 0.00010804478627703903,
"loss": 4.8055,
"mean_token_accuracy": 0.2569776579737663,
"num_tokens": 16624800.0,
"step": 9145
},
{
"entropy": 5.752418375015258,
"epoch": 7.861194671250537,
"grad_norm": 1.7109375,
"learning_rate": 0.00010782208076493508,
"loss": 4.9431,
"mean_token_accuracy": 0.23038492798805238,
"num_tokens": 16632808.0,
"step": 9150
},
{
"entropy": 5.7448986053466795,
"epoch": 7.865492049849592,
"grad_norm": 1.3984375,
"learning_rate": 0.00010759974030258621,
"loss": 4.906,
"mean_token_accuracy": 0.23768896460533143,
"num_tokens": 16641179.0,
"step": 9155
},
{
"entropy": 5.688127946853638,
"epoch": 7.869789428448646,
"grad_norm": 1.2421875,
"learning_rate": 0.00010737776537549531,
"loss": 4.8952,
"mean_token_accuracy": 0.24468690007925034,
"num_tokens": 16650402.0,
"step": 9160
},
{
"entropy": 5.64280138015747,
"epoch": 7.874086807047701,
"grad_norm": 1.4296875,
"learning_rate": 0.00010715615646836679,
"loss": 4.8555,
"mean_token_accuracy": 0.2418656662106514,
"num_tokens": 16659661.0,
"step": 9165
},
{
"entropy": 5.633242177963257,
"epoch": 7.878384185646755,
"grad_norm": 1.296875,
"learning_rate": 0.00010693491406510585,
"loss": 4.8508,
"mean_token_accuracy": 0.2495250001549721,
"num_tokens": 16668630.0,
"step": 9170
},
{
"entropy": 5.727669334411621,
"epoch": 7.88268156424581,
"grad_norm": 1.4296875,
"learning_rate": 0.00010671403864881757,
"loss": 4.9439,
"mean_token_accuracy": 0.2366262823343277,
"num_tokens": 16678023.0,
"step": 9175
},
{
"entropy": 5.733814907073975,
"epoch": 7.8869789428448644,
"grad_norm": 1.3359375,
"learning_rate": 0.00010649353070180562,
"loss": 4.9239,
"mean_token_accuracy": 0.24506820291280745,
"num_tokens": 16686751.0,
"step": 9180
},
{
"entropy": 5.7177135944366455,
"epoch": 7.891276321443919,
"grad_norm": 1.3515625,
"learning_rate": 0.00010627339070557118,
"loss": 4.9189,
"mean_token_accuracy": 0.23672835975885392,
"num_tokens": 16696672.0,
"step": 9185
},
{
"entropy": 5.688603305816651,
"epoch": 7.8955737000429735,
"grad_norm": 1.3671875,
"learning_rate": 0.00010605361914081194,
"loss": 4.8098,
"mean_token_accuracy": 0.24818727225065232,
"num_tokens": 16706018.0,
"step": 9190
},
{
"entropy": 5.71311993598938,
"epoch": 7.899871078642028,
"grad_norm": 1.21875,
"learning_rate": 0.00010583421648742125,
"loss": 4.8634,
"mean_token_accuracy": 0.2426785260438919,
"num_tokens": 16715206.0,
"step": 9195
},
{
"entropy": 5.69930329322815,
"epoch": 7.904168457241083,
"grad_norm": 1.4296875,
"learning_rate": 0.00010561518322448673,
"loss": 4.9517,
"mean_token_accuracy": 0.23524065911769867,
"num_tokens": 16724479.0,
"step": 9200
},
{
"entropy": 5.6606029033660885,
"epoch": 7.908465835840137,
"grad_norm": 1.484375,
"learning_rate": 0.00010539651983028955,
"loss": 4.8302,
"mean_token_accuracy": 0.24595999121665954,
"num_tokens": 16733304.0,
"step": 9205
},
{
"entropy": 5.674201583862304,
"epoch": 7.912763214439192,
"grad_norm": 1.5,
"learning_rate": 0.0001051782267823031,
"loss": 4.8306,
"mean_token_accuracy": 0.24602922052145004,
"num_tokens": 16741447.0,
"step": 9210
},
{
"entropy": 5.695632266998291,
"epoch": 7.917060593038247,
"grad_norm": 1.4765625,
"learning_rate": 0.00010496030455719225,
"loss": 4.8963,
"mean_token_accuracy": 0.23516589403152466,
"num_tokens": 16751487.0,
"step": 9215
},
{
"entropy": 5.6963306903839115,
"epoch": 7.921357971637301,
"grad_norm": 1.421875,
"learning_rate": 0.00010474275363081193,
"loss": 4.8813,
"mean_token_accuracy": 0.23939020037651063,
"num_tokens": 16760795.0,
"step": 9220
},
{
"entropy": 5.683042097091675,
"epoch": 7.925655350236356,
"grad_norm": 1.46875,
"learning_rate": 0.0001045255744782064,
"loss": 4.8792,
"mean_token_accuracy": 0.24189150482416152,
"num_tokens": 16769639.0,
"step": 9225
},
{
"entropy": 5.759686517715454,
"epoch": 7.92995272883541,
"grad_norm": 1.3515625,
"learning_rate": 0.00010430876757360817,
"loss": 4.9654,
"mean_token_accuracy": 0.23323026299476624,
"num_tokens": 16779195.0,
"step": 9230
},
{
"entropy": 5.640351009368897,
"epoch": 7.934250107434465,
"grad_norm": 1.4375,
"learning_rate": 0.00010409233339043694,
"loss": 4.7822,
"mean_token_accuracy": 0.25166461169719695,
"num_tokens": 16787531.0,
"step": 9235
},
{
"entropy": 5.643196535110474,
"epoch": 7.93854748603352,
"grad_norm": 1.453125,
"learning_rate": 0.00010387627240129838,
"loss": 4.8377,
"mean_token_accuracy": 0.24781358987092972,
"num_tokens": 16796392.0,
"step": 9240
},
{
"entropy": 5.7032732486724855,
"epoch": 7.942844864632574,
"grad_norm": 1.265625,
"learning_rate": 0.00010366058507798326,
"loss": 4.8837,
"mean_token_accuracy": 0.23703473657369614,
"num_tokens": 16804942.0,
"step": 9245
},
{
"entropy": 5.685716962814331,
"epoch": 7.947142243231629,
"grad_norm": 1.3671875,
"learning_rate": 0.00010344527189146655,
"loss": 4.9321,
"mean_token_accuracy": 0.23355796337127685,
"num_tokens": 16813754.0,
"step": 9250
},
{
"entropy": 5.715789413452148,
"epoch": 7.9514396218306835,
"grad_norm": 1.265625,
"learning_rate": 0.00010323033331190626,
"loss": 4.9042,
"mean_token_accuracy": 0.23985962867736815,
"num_tokens": 16823010.0,
"step": 9255
},
{
"entropy": 5.716991758346557,
"epoch": 7.955737000429738,
"grad_norm": 1.40625,
"learning_rate": 0.00010301576980864228,
"loss": 4.8258,
"mean_token_accuracy": 0.25040524303913114,
"num_tokens": 16831909.0,
"step": 9260
},
{
"entropy": 5.622262763977051,
"epoch": 7.960034379028793,
"grad_norm": 1.4765625,
"learning_rate": 0.00010280158185019547,
"loss": 4.8755,
"mean_token_accuracy": 0.24880994856357574,
"num_tokens": 16841460.0,
"step": 9265
},
{
"entropy": 5.675008153915405,
"epoch": 7.964331757627847,
"grad_norm": 1.2265625,
"learning_rate": 0.00010258776990426686,
"loss": 4.8917,
"mean_token_accuracy": 0.24589523673057556,
"num_tokens": 16850592.0,
"step": 9270
},
{
"entropy": 5.697416830062866,
"epoch": 7.968629136226902,
"grad_norm": 1.53125,
"learning_rate": 0.00010237433443773612,
"loss": 4.8994,
"mean_token_accuracy": 0.2388192519545555,
"num_tokens": 16859736.0,
"step": 9275
},
{
"entropy": 5.644947481155396,
"epoch": 7.972926514825956,
"grad_norm": 1.234375,
"learning_rate": 0.00010216127591666115,
"loss": 4.9009,
"mean_token_accuracy": 0.25150825679302213,
"num_tokens": 16870084.0,
"step": 9280
},
{
"entropy": 5.69167685508728,
"epoch": 7.977223893425011,
"grad_norm": 1.6484375,
"learning_rate": 0.00010194859480627648,
"loss": 4.9116,
"mean_token_accuracy": 0.2423036977648735,
"num_tokens": 16877771.0,
"step": 9285
},
{
"entropy": 5.679421663284302,
"epoch": 7.981521272024065,
"grad_norm": 1.25,
"learning_rate": 0.00010173629157099279,
"loss": 4.9116,
"mean_token_accuracy": 0.2358901694417,
"num_tokens": 16887487.0,
"step": 9290
},
{
"entropy": 5.679394340515136,
"epoch": 7.98581865062312,
"grad_norm": 1.25,
"learning_rate": 0.00010152436667439537,
"loss": 4.91,
"mean_token_accuracy": 0.24376949667930603,
"num_tokens": 16897286.0,
"step": 9295
},
{
"entropy": 5.705884504318237,
"epoch": 7.9901160292221745,
"grad_norm": 1.40625,
"learning_rate": 0.00010131282057924345,
"loss": 4.85,
"mean_token_accuracy": 0.24412865936756134,
"num_tokens": 16905968.0,
"step": 9300
},
{
"entropy": 5.732138919830322,
"epoch": 7.994413407821229,
"grad_norm": 1.34375,
"learning_rate": 0.00010110165374746924,
"loss": 4.8701,
"mean_token_accuracy": 0.24227845817804336,
"num_tokens": 16914604.0,
"step": 9305
},
{
"entropy": 5.702543640136719,
"epoch": 7.9987107864202835,
"grad_norm": 1.3671875,
"learning_rate": 0.00010089086664017674,
"loss": 5.0117,
"mean_token_accuracy": 0.23315883576869964,
"num_tokens": 16925085.0,
"step": 9310
},
{
"entropy": 5.676374594370524,
"epoch": 8.002578427159433,
"grad_norm": 1.390625,
"learning_rate": 0.00010068045971764067,
"loss": 4.8175,
"mean_token_accuracy": 0.2474090274837282,
"num_tokens": 16932717.0,
"step": 9315
},
{
"entropy": 5.737140417098999,
"epoch": 8.006875805758487,
"grad_norm": 1.421875,
"learning_rate": 0.00010047043343930561,
"loss": 4.9202,
"mean_token_accuracy": 0.23843726366758347,
"num_tokens": 16941332.0,
"step": 9320
},
{
"entropy": 5.6835075378417965,
"epoch": 8.011173184357542,
"grad_norm": 1.421875,
"learning_rate": 0.00010026078826378502,
"loss": 4.7602,
"mean_token_accuracy": 0.2514714956283569,
"num_tokens": 16949732.0,
"step": 9325
},
{
"entropy": 5.720363521575928,
"epoch": 8.015470562956596,
"grad_norm": 1.3359375,
"learning_rate": 0.00010005152464886031,
"loss": 4.7948,
"mean_token_accuracy": 0.2467312902212143,
"num_tokens": 16958013.0,
"step": 9330
},
{
"entropy": 5.649158191680908,
"epoch": 8.019767941555651,
"grad_norm": 1.3125,
"learning_rate": 9.984264305147941e-05,
"loss": 4.7621,
"mean_token_accuracy": 0.25033883154392245,
"num_tokens": 16966050.0,
"step": 9335
},
{
"entropy": 5.650808954238892,
"epoch": 8.024065320154705,
"grad_norm": 1.3203125,
"learning_rate": 9.963414392775627e-05,
"loss": 4.7591,
"mean_token_accuracy": 0.25660623162984847,
"num_tokens": 16975178.0,
"step": 9340
},
{
"entropy": 5.585528993606568,
"epoch": 8.02836269875376,
"grad_norm": 1.296875,
"learning_rate": 9.942602773296971e-05,
"loss": 4.7392,
"mean_token_accuracy": 0.26027477383613584,
"num_tokens": 16984247.0,
"step": 9345
},
{
"entropy": 5.767738199234008,
"epoch": 8.032660077352816,
"grad_norm": 1.3359375,
"learning_rate": 9.921829492156223e-05,
"loss": 4.9408,
"mean_token_accuracy": 0.23659028112888336,
"num_tokens": 16995048.0,
"step": 9350
},
{
"entropy": 5.726878213882446,
"epoch": 8.03695745595187,
"grad_norm": 1.6015625,
"learning_rate": 9.901094594713933e-05,
"loss": 4.9095,
"mean_token_accuracy": 0.2300567016005516,
"num_tokens": 17003748.0,
"step": 9355
},
{
"entropy": 5.753246450424195,
"epoch": 8.041254834550925,
"grad_norm": 1.3984375,
"learning_rate": 9.88039812624682e-05,
"loss": 4.8654,
"mean_token_accuracy": 0.24705124497413636,
"num_tokens": 17011639.0,
"step": 9360
},
{
"entropy": 5.767079067230225,
"epoch": 8.045552213149978,
"grad_norm": 1.2109375,
"learning_rate": 9.859740131947715e-05,
"loss": 4.8656,
"mean_token_accuracy": 0.23569979518651962,
"num_tokens": 17021056.0,
"step": 9365
},
{
"entropy": 5.730513334274292,
"epoch": 8.049849591749034,
"grad_norm": 1.3828125,
"learning_rate": 9.839120656925407e-05,
"loss": 4.9032,
"mean_token_accuracy": 0.2346501335501671,
"num_tokens": 17030944.0,
"step": 9370
},
{
"entropy": 5.669353294372558,
"epoch": 8.054146970348087,
"grad_norm": 1.40625,
"learning_rate": 9.818539746204588e-05,
"loss": 4.8452,
"mean_token_accuracy": 0.24651120901107787,
"num_tokens": 17040127.0,
"step": 9375
},
{
"entropy": 5.753323745727539,
"epoch": 8.058444348947143,
"grad_norm": 1.5859375,
"learning_rate": 9.797997444725745e-05,
"loss": 4.8358,
"mean_token_accuracy": 0.24490419328212737,
"num_tokens": 17049418.0,
"step": 9380
},
{
"entropy": 5.66489543914795,
"epoch": 8.062741727546197,
"grad_norm": 1.328125,
"learning_rate": 9.77749379734506e-05,
"loss": 4.8328,
"mean_token_accuracy": 0.25109679251909256,
"num_tokens": 17058270.0,
"step": 9385
},
{
"entropy": 5.712283277511597,
"epoch": 8.067039106145252,
"grad_norm": 1.4375,
"learning_rate": 9.757028848834293e-05,
"loss": 4.8444,
"mean_token_accuracy": 0.2447240486741066,
"num_tokens": 17068011.0,
"step": 9390
},
{
"entropy": 5.704377889633179,
"epoch": 8.071336484744306,
"grad_norm": 1.3828125,
"learning_rate": 9.736602643880712e-05,
"loss": 4.8442,
"mean_token_accuracy": 0.2414647787809372,
"num_tokens": 17077356.0,
"step": 9395
},
{
"entropy": 5.683783292770386,
"epoch": 8.075633863343361,
"grad_norm": 1.34375,
"learning_rate": 9.716215227086997e-05,
"loss": 4.8058,
"mean_token_accuracy": 0.24811055809259414,
"num_tokens": 17085679.0,
"step": 9400
},
{
"entropy": 5.713439559936523,
"epoch": 8.079931241942415,
"grad_norm": 1.3046875,
"learning_rate": 9.695866642971098e-05,
"loss": 4.828,
"mean_token_accuracy": 0.2418453276157379,
"num_tokens": 17094925.0,
"step": 9405
},
{
"entropy": 5.680472755432129,
"epoch": 8.08422862054147,
"grad_norm": 1.4140625,
"learning_rate": 9.67555693596621e-05,
"loss": 4.8917,
"mean_token_accuracy": 0.24031679928302765,
"num_tokens": 17105278.0,
"step": 9410
},
{
"entropy": 5.643775463104248,
"epoch": 8.088525999140524,
"grad_norm": 1.3984375,
"learning_rate": 9.655286150420595e-05,
"loss": 4.811,
"mean_token_accuracy": 0.2506825551390648,
"num_tokens": 17114070.0,
"step": 9415
},
{
"entropy": 5.592681741714477,
"epoch": 8.09282337773958,
"grad_norm": 1.2421875,
"learning_rate": 9.635054330597565e-05,
"loss": 4.7437,
"mean_token_accuracy": 0.2564584508538246,
"num_tokens": 17122862.0,
"step": 9420
},
{
"entropy": 5.6964335441589355,
"epoch": 8.097120756338633,
"grad_norm": 1.453125,
"learning_rate": 9.614861520675322e-05,
"loss": 4.81,
"mean_token_accuracy": 0.24855268150568008,
"num_tokens": 17131555.0,
"step": 9425
},
{
"entropy": 5.6678112030029295,
"epoch": 8.101418134937688,
"grad_norm": 1.40625,
"learning_rate": 9.594707764746881e-05,
"loss": 4.7697,
"mean_token_accuracy": 0.25087203830480576,
"num_tokens": 17140841.0,
"step": 9430
},
{
"entropy": 5.644021463394165,
"epoch": 8.105715513536742,
"grad_norm": 1.296875,
"learning_rate": 9.57459310682e-05,
"loss": 4.8181,
"mean_token_accuracy": 0.25079510658979415,
"num_tokens": 17150027.0,
"step": 9435
},
{
"entropy": 5.724911117553711,
"epoch": 8.110012892135797,
"grad_norm": 1.3125,
"learning_rate": 9.554517590817055e-05,
"loss": 4.8874,
"mean_token_accuracy": 0.23362074196338653,
"num_tokens": 17159589.0,
"step": 9440
},
{
"entropy": 5.718877220153809,
"epoch": 8.114310270734851,
"grad_norm": 1.2890625,
"learning_rate": 9.534481260574944e-05,
"loss": 4.8569,
"mean_token_accuracy": 0.24165450483560563,
"num_tokens": 17168219.0,
"step": 9445
},
{
"entropy": 5.717329406738282,
"epoch": 8.118607649333907,
"grad_norm": 1.34375,
"learning_rate": 9.514484159844997e-05,
"loss": 4.8494,
"mean_token_accuracy": 0.24667494893074035,
"num_tokens": 17177364.0,
"step": 9450
},
{
"entropy": 5.742877006530762,
"epoch": 8.12290502793296,
"grad_norm": 1.4765625,
"learning_rate": 9.494526332292899e-05,
"loss": 4.8939,
"mean_token_accuracy": 0.23939796686172485,
"num_tokens": 17186572.0,
"step": 9455
},
{
"entropy": 5.716141700744629,
"epoch": 8.127202406532016,
"grad_norm": 1.3671875,
"learning_rate": 9.47460782149857e-05,
"loss": 4.8819,
"mean_token_accuracy": 0.24952531158924102,
"num_tokens": 17195645.0,
"step": 9460
},
{
"entropy": 5.716918182373047,
"epoch": 8.13149978513107,
"grad_norm": 1.328125,
"learning_rate": 9.454728670956073e-05,
"loss": 4.8623,
"mean_token_accuracy": 0.2338918313384056,
"num_tokens": 17205279.0,
"step": 9465
},
{
"entropy": 5.674348020553589,
"epoch": 8.135797163730125,
"grad_norm": 1.3828125,
"learning_rate": 9.43488892407352e-05,
"loss": 4.7889,
"mean_token_accuracy": 0.2492659032344818,
"num_tokens": 17214536.0,
"step": 9470
},
{
"entropy": 5.7423008441925045,
"epoch": 8.140094542329178,
"grad_norm": 1.328125,
"learning_rate": 9.415088624172997e-05,
"loss": 4.8982,
"mean_token_accuracy": 0.2365766152739525,
"num_tokens": 17223336.0,
"step": 9475
},
{
"entropy": 5.725331783294678,
"epoch": 8.144391920928234,
"grad_norm": 1.375,
"learning_rate": 9.395327814490439e-05,
"loss": 4.8913,
"mean_token_accuracy": 0.24087603092193605,
"num_tokens": 17232991.0,
"step": 9480
},
{
"entropy": 5.707345008850098,
"epoch": 8.148689299527287,
"grad_norm": 1.4140625,
"learning_rate": 9.375606538175566e-05,
"loss": 4.8079,
"mean_token_accuracy": 0.2476293459534645,
"num_tokens": 17241760.0,
"step": 9485
},
{
"entropy": 5.745706558227539,
"epoch": 8.152986678126343,
"grad_norm": 1.3125,
"learning_rate": 9.35592483829175e-05,
"loss": 4.9106,
"mean_token_accuracy": 0.23400688916444778,
"num_tokens": 17251514.0,
"step": 9490
},
{
"entropy": 5.7085450172424315,
"epoch": 8.157284056725398,
"grad_norm": 1.3984375,
"learning_rate": 9.336282757815964e-05,
"loss": 4.8709,
"mean_token_accuracy": 0.2447448268532753,
"num_tokens": 17260876.0,
"step": 9495
},
{
"entropy": 5.680332851409912,
"epoch": 8.161581435324452,
"grad_norm": 1.40625,
"learning_rate": 9.316680339638664e-05,
"loss": 4.893,
"mean_token_accuracy": 0.24191939532756807,
"num_tokens": 17270051.0,
"step": 9500
},
{
"epoch": 8.161581435324452,
"eval_entropy": 5.534670775001113,
"eval_loss": 5.914956092834473,
"eval_mean_token_accuracy": 0.18108219369776077,
"eval_num_tokens": 17270051.0,
"eval_runtime": 2.0509,
"eval_samples_per_second": 1730.421,
"eval_steps_per_second": 216.485,
"step": 9500
},
{
"entropy": 5.771974754333496,
"epoch": 8.165878813923507,
"grad_norm": 1.4765625,
"learning_rate": 9.297117626563687e-05,
"loss": 4.9074,
"mean_token_accuracy": 0.243223874270916,
"num_tokens": 17279744.0,
"step": 9505
},
{
"entropy": 5.699692344665527,
"epoch": 8.170176192522561,
"grad_norm": 1.3515625,
"learning_rate": 9.27759466130818e-05,
"loss": 4.842,
"mean_token_accuracy": 0.24671292901039124,
"num_tokens": 17289107.0,
"step": 9510
},
{
"entropy": 5.6489325046539305,
"epoch": 8.174473571121617,
"grad_norm": 1.2890625,
"learning_rate": 9.25811148650251e-05,
"loss": 4.7483,
"mean_token_accuracy": 0.25123976916074753,
"num_tokens": 17298385.0,
"step": 9515
},
{
"entropy": 5.718669319152832,
"epoch": 8.17877094972067,
"grad_norm": 1.3359375,
"learning_rate": 9.238668144690133e-05,
"loss": 4.8292,
"mean_token_accuracy": 0.23712222427129745,
"num_tokens": 17307759.0,
"step": 9520
},
{
"entropy": 5.66925311088562,
"epoch": 8.183068328319726,
"grad_norm": 1.3203125,
"learning_rate": 9.219264678327527e-05,
"loss": 4.8238,
"mean_token_accuracy": 0.24355944246053696,
"num_tokens": 17317135.0,
"step": 9525
},
{
"entropy": 5.6720630645751955,
"epoch": 8.18736570691878,
"grad_norm": 1.4140625,
"learning_rate": 9.199901129784121e-05,
"loss": 4.8258,
"mean_token_accuracy": 0.2534212335944176,
"num_tokens": 17326950.0,
"step": 9530
},
{
"entropy": 5.76041088104248,
"epoch": 8.191663085517835,
"grad_norm": 1.28125,
"learning_rate": 9.180577541342164e-05,
"loss": 4.9385,
"mean_token_accuracy": 0.24018974602222443,
"num_tokens": 17335873.0,
"step": 9535
},
{
"entropy": 5.70224289894104,
"epoch": 8.195960464116888,
"grad_norm": 1.421875,
"learning_rate": 9.161293955196648e-05,
"loss": 4.8277,
"mean_token_accuracy": 0.25029603242874143,
"num_tokens": 17344659.0,
"step": 9540
},
{
"entropy": 5.750887012481689,
"epoch": 8.200257842715944,
"grad_norm": 1.375,
"learning_rate": 9.142050413455214e-05,
"loss": 4.8936,
"mean_token_accuracy": 0.23572346717119216,
"num_tokens": 17353274.0,
"step": 9545
},
{
"entropy": 5.712145090103149,
"epoch": 8.204555221314997,
"grad_norm": 1.28125,
"learning_rate": 9.12284695813807e-05,
"loss": 4.876,
"mean_token_accuracy": 0.24487333595752717,
"num_tokens": 17362212.0,
"step": 9550
},
{
"entropy": 5.6856156349182125,
"epoch": 8.208852599914053,
"grad_norm": 1.453125,
"learning_rate": 9.103683631177878e-05,
"loss": 4.8344,
"mean_token_accuracy": 0.2498592033982277,
"num_tokens": 17370567.0,
"step": 9555
},
{
"entropy": 5.6877570152282715,
"epoch": 8.213149978513107,
"grad_norm": 1.6484375,
"learning_rate": 9.084560474419701e-05,
"loss": 4.8265,
"mean_token_accuracy": 0.24633100628852844,
"num_tokens": 17378521.0,
"step": 9560
},
{
"entropy": 5.716610336303711,
"epoch": 8.217447357112162,
"grad_norm": 1.4765625,
"learning_rate": 9.065477529620852e-05,
"loss": 4.8615,
"mean_token_accuracy": 0.23589793592691422,
"num_tokens": 17386988.0,
"step": 9565
},
{
"entropy": 5.6887688636779785,
"epoch": 8.221744735711216,
"grad_norm": 1.515625,
"learning_rate": 9.046434838450868e-05,
"loss": 4.8514,
"mean_token_accuracy": 0.2439912587404251,
"num_tokens": 17396317.0,
"step": 9570
},
{
"entropy": 5.760230016708374,
"epoch": 8.226042114310271,
"grad_norm": 1.46875,
"learning_rate": 9.027432442491369e-05,
"loss": 4.939,
"mean_token_accuracy": 0.2313292518258095,
"num_tokens": 17405768.0,
"step": 9575
},
{
"entropy": 5.721897459030151,
"epoch": 8.230339492909325,
"grad_norm": 1.4765625,
"learning_rate": 9.008470383235991e-05,
"loss": 4.959,
"mean_token_accuracy": 0.23489704877138137,
"num_tokens": 17416098.0,
"step": 9580
},
{
"entropy": 5.717132997512818,
"epoch": 8.23463687150838,
"grad_norm": 1.4765625,
"learning_rate": 8.989548702090295e-05,
"loss": 4.8614,
"mean_token_accuracy": 0.2438367486000061,
"num_tokens": 17426091.0,
"step": 9585
},
{
"entropy": 5.674825143814087,
"epoch": 8.238934250107434,
"grad_norm": 1.4453125,
"learning_rate": 8.970667440371676e-05,
"loss": 4.8109,
"mean_token_accuracy": 0.24652960747480393,
"num_tokens": 17434956.0,
"step": 9590
},
{
"entropy": 5.726977968215943,
"epoch": 8.24323162870649,
"grad_norm": 1.3125,
"learning_rate": 8.951826639309257e-05,
"loss": 4.8667,
"mean_token_accuracy": 0.2461192190647125,
"num_tokens": 17443980.0,
"step": 9595
},
{
"entropy": 5.6655323028564455,
"epoch": 8.247529007305543,
"grad_norm": 1.46875,
"learning_rate": 8.933026340043811e-05,
"loss": 4.835,
"mean_token_accuracy": 0.24194021075963973,
"num_tokens": 17452982.0,
"step": 9600
},
{
"entropy": 5.658811092376709,
"epoch": 8.251826385904598,
"grad_norm": 1.484375,
"learning_rate": 8.914266583627684e-05,
"loss": 4.7973,
"mean_token_accuracy": 0.25148835182189944,
"num_tokens": 17461946.0,
"step": 9605
},
{
"entropy": 5.735917949676514,
"epoch": 8.256123764503652,
"grad_norm": 1.5859375,
"learning_rate": 8.89554741102469e-05,
"loss": 4.8817,
"mean_token_accuracy": 0.23675091862678527,
"num_tokens": 17470125.0,
"step": 9610
},
{
"entropy": 5.674997234344483,
"epoch": 8.260421143102707,
"grad_norm": 1.40625,
"learning_rate": 8.876868863110013e-05,
"loss": 4.8813,
"mean_token_accuracy": 0.2377606824040413,
"num_tokens": 17479536.0,
"step": 9615
},
{
"entropy": 5.739392614364624,
"epoch": 8.264718521701761,
"grad_norm": 1.3359375,
"learning_rate": 8.858230980670134e-05,
"loss": 4.8327,
"mean_token_accuracy": 0.24122980684041978,
"num_tokens": 17488709.0,
"step": 9620
},
{
"entropy": 5.710043478012085,
"epoch": 8.269015900300817,
"grad_norm": 1.3203125,
"learning_rate": 8.839633804402747e-05,
"loss": 4.8839,
"mean_token_accuracy": 0.24191838651895523,
"num_tokens": 17498196.0,
"step": 9625
},
{
"entropy": 5.709278202056884,
"epoch": 8.27331327889987,
"grad_norm": 1.515625,
"learning_rate": 8.821077374916647e-05,
"loss": 4.8403,
"mean_token_accuracy": 0.24802724421024322,
"num_tokens": 17507374.0,
"step": 9630
},
{
"entropy": 5.649344778060913,
"epoch": 8.277610657498926,
"grad_norm": 1.4375,
"learning_rate": 8.802561732731654e-05,
"loss": 4.8174,
"mean_token_accuracy": 0.24798057675361634,
"num_tokens": 17516163.0,
"step": 9635
},
{
"entropy": 5.7683521747589115,
"epoch": 8.281908036097981,
"grad_norm": 1.3203125,
"learning_rate": 8.784086918278534e-05,
"loss": 4.8977,
"mean_token_accuracy": 0.23559336811304094,
"num_tokens": 17525567.0,
"step": 9640
},
{
"entropy": 5.662019443511963,
"epoch": 8.286205414697035,
"grad_norm": 1.296875,
"learning_rate": 8.765652971898908e-05,
"loss": 4.7626,
"mean_token_accuracy": 0.25413262397050856,
"num_tokens": 17534433.0,
"step": 9645
},
{
"entropy": 5.716051149368286,
"epoch": 8.29050279329609,
"grad_norm": 1.5390625,
"learning_rate": 8.747259933845134e-05,
"loss": 4.912,
"mean_token_accuracy": 0.2507630452513695,
"num_tokens": 17542981.0,
"step": 9650
},
{
"entropy": 5.749143266677857,
"epoch": 8.294800171895144,
"grad_norm": 1.328125,
"learning_rate": 8.728907844280254e-05,
"loss": 4.9298,
"mean_token_accuracy": 0.2381526455283165,
"num_tokens": 17552425.0,
"step": 9655
},
{
"entropy": 5.68588399887085,
"epoch": 8.2990975504942,
"grad_norm": 1.5234375,
"learning_rate": 8.710596743277901e-05,
"loss": 4.8545,
"mean_token_accuracy": 0.24566843658685683,
"num_tokens": 17560954.0,
"step": 9660
},
{
"entropy": 5.681159543991089,
"epoch": 8.303394929093253,
"grad_norm": 1.4375,
"learning_rate": 8.692326670822207e-05,
"loss": 4.7767,
"mean_token_accuracy": 0.2506769150495529,
"num_tokens": 17569769.0,
"step": 9665
},
{
"entropy": 5.705241298675537,
"epoch": 8.307692307692308,
"grad_norm": 1.390625,
"learning_rate": 8.6740976668077e-05,
"loss": 4.8292,
"mean_token_accuracy": 0.24778336584568023,
"num_tokens": 17578875.0,
"step": 9670
},
{
"entropy": 5.6509339809417725,
"epoch": 8.311989686291362,
"grad_norm": 1.578125,
"learning_rate": 8.655909771039239e-05,
"loss": 4.8162,
"mean_token_accuracy": 0.24816256165504455,
"num_tokens": 17587556.0,
"step": 9675
},
{
"entropy": 5.717254257202148,
"epoch": 8.316287064890417,
"grad_norm": 1.65625,
"learning_rate": 8.637763023231922e-05,
"loss": 4.8536,
"mean_token_accuracy": 0.24158847630023955,
"num_tokens": 17595077.0,
"step": 9680
},
{
"entropy": 5.586067724227905,
"epoch": 8.320584443489471,
"grad_norm": 1.46875,
"learning_rate": 8.619657463010987e-05,
"loss": 4.7458,
"mean_token_accuracy": 0.2555888518691063,
"num_tokens": 17603864.0,
"step": 9685
},
{
"entropy": 5.780575895309449,
"epoch": 8.324881822088527,
"grad_norm": 1.5,
"learning_rate": 8.60159312991175e-05,
"loss": 4.9495,
"mean_token_accuracy": 0.23568148016929627,
"num_tokens": 17613570.0,
"step": 9690
},
{
"entropy": 5.721232223510742,
"epoch": 8.32917920068758,
"grad_norm": 1.296875,
"learning_rate": 8.583570063379487e-05,
"loss": 4.8582,
"mean_token_accuracy": 0.2423143908381462,
"num_tokens": 17622352.0,
"step": 9695
},
{
"entropy": 5.688345146179199,
"epoch": 8.333476579286636,
"grad_norm": 1.46875,
"learning_rate": 8.565588302769374e-05,
"loss": 4.8002,
"mean_token_accuracy": 0.2436966761946678,
"num_tokens": 17631556.0,
"step": 9700
},
{
"entropy": 5.671597337722778,
"epoch": 8.33777395788569,
"grad_norm": 1.390625,
"learning_rate": 8.54764788734639e-05,
"loss": 4.8278,
"mean_token_accuracy": 0.25227173417806625,
"num_tokens": 17641182.0,
"step": 9705
},
{
"entropy": 5.749675655364991,
"epoch": 8.342071336484745,
"grad_norm": 1.3828125,
"learning_rate": 8.529748856285228e-05,
"loss": 4.8418,
"mean_token_accuracy": 0.24297856986522676,
"num_tokens": 17650842.0,
"step": 9710
},
{
"entropy": 5.669020080566407,
"epoch": 8.346368715083798,
"grad_norm": 1.4375,
"learning_rate": 8.511891248670217e-05,
"loss": 4.8365,
"mean_token_accuracy": 0.24701906442642213,
"num_tokens": 17659963.0,
"step": 9715
},
{
"entropy": 5.729262828826904,
"epoch": 8.350666093682854,
"grad_norm": 1.5703125,
"learning_rate": 8.494075103495245e-05,
"loss": 4.9003,
"mean_token_accuracy": 0.2436648800969124,
"num_tokens": 17668637.0,
"step": 9720
},
{
"entropy": 5.725810480117798,
"epoch": 8.354963472281907,
"grad_norm": 1.390625,
"learning_rate": 8.476300459663643e-05,
"loss": 4.8768,
"mean_token_accuracy": 0.23490023016929626,
"num_tokens": 17678212.0,
"step": 9725
},
{
"entropy": 5.719243383407592,
"epoch": 8.359260850880963,
"grad_norm": 1.3125,
"learning_rate": 8.458567355988123e-05,
"loss": 4.9238,
"mean_token_accuracy": 0.23302749097347258,
"num_tokens": 17686766.0,
"step": 9730
},
{
"entropy": 5.67026309967041,
"epoch": 8.363558229480017,
"grad_norm": 1.1796875,
"learning_rate": 8.440875831190704e-05,
"loss": 4.8546,
"mean_token_accuracy": 0.24159758388996125,
"num_tokens": 17696846.0,
"step": 9735
},
{
"entropy": 5.6974513053894045,
"epoch": 8.367855608079072,
"grad_norm": 1.34375,
"learning_rate": 8.423225923902608e-05,
"loss": 4.8257,
"mean_token_accuracy": 0.24840028285980226,
"num_tokens": 17705846.0,
"step": 9740
},
{
"entropy": 5.68490195274353,
"epoch": 8.372152986678126,
"grad_norm": 1.4375,
"learning_rate": 8.405617672664175e-05,
"loss": 4.7729,
"mean_token_accuracy": 0.2577149987220764,
"num_tokens": 17714255.0,
"step": 9745
},
{
"entropy": 5.757609033584595,
"epoch": 8.376450365277181,
"grad_norm": 1.3125,
"learning_rate": 8.388051115924781e-05,
"loss": 4.9422,
"mean_token_accuracy": 0.23857035785913466,
"num_tokens": 17724246.0,
"step": 9750
},
{
"entropy": 5.684040594100952,
"epoch": 8.380747743876235,
"grad_norm": 1.40625,
"learning_rate": 8.370526292042774e-05,
"loss": 4.7974,
"mean_token_accuracy": 0.251545213162899,
"num_tokens": 17733081.0,
"step": 9755
},
{
"entropy": 5.61120572090149,
"epoch": 8.38504512247529,
"grad_norm": 1.65625,
"learning_rate": 8.353043239285357e-05,
"loss": 4.8071,
"mean_token_accuracy": 0.2531055316329002,
"num_tokens": 17741220.0,
"step": 9760
},
{
"entropy": 5.68164644241333,
"epoch": 8.389342501074344,
"grad_norm": 1.5703125,
"learning_rate": 8.335601995828534e-05,
"loss": 4.8647,
"mean_token_accuracy": 0.24154503643512726,
"num_tokens": 17750057.0,
"step": 9765
},
{
"entropy": 5.685444021224976,
"epoch": 8.3936398796734,
"grad_norm": 1.3359375,
"learning_rate": 8.318202599757008e-05,
"loss": 4.867,
"mean_token_accuracy": 0.24469136744737624,
"num_tokens": 17759016.0,
"step": 9770
},
{
"entropy": 5.685888147354126,
"epoch": 8.397937258272453,
"grad_norm": 1.4921875,
"learning_rate": 8.30084508906411e-05,
"loss": 4.7847,
"mean_token_accuracy": 0.25108771324157714,
"num_tokens": 17767170.0,
"step": 9775
},
{
"entropy": 5.669126987457275,
"epoch": 8.402234636871508,
"grad_norm": 1.5234375,
"learning_rate": 8.283529501651698e-05,
"loss": 4.8621,
"mean_token_accuracy": 0.24125717729330062,
"num_tokens": 17776508.0,
"step": 9780
},
{
"entropy": 5.681526708602905,
"epoch": 8.406532015470564,
"grad_norm": 1.3671875,
"learning_rate": 8.266255875330095e-05,
"loss": 4.812,
"mean_token_accuracy": 0.2480550304055214,
"num_tokens": 17785494.0,
"step": 9785
},
{
"entropy": 5.680064153671265,
"epoch": 8.410829394069617,
"grad_norm": 1.4921875,
"learning_rate": 8.249024247817998e-05,
"loss": 4.8591,
"mean_token_accuracy": 0.24685875922441483,
"num_tokens": 17795158.0,
"step": 9790
},
{
"entropy": 5.6914228916168215,
"epoch": 8.415126772668673,
"grad_norm": 1.2421875,
"learning_rate": 8.231834656742402e-05,
"loss": 4.8685,
"mean_token_accuracy": 0.24089238047599792,
"num_tokens": 17805130.0,
"step": 9795
},
{
"entropy": 5.716647386550903,
"epoch": 8.419424151267727,
"grad_norm": 1.4609375,
"learning_rate": 8.214687139638494e-05,
"loss": 4.884,
"mean_token_accuracy": 0.2443099856376648,
"num_tokens": 17813775.0,
"step": 9800
},
{
"entropy": 5.715089607238769,
"epoch": 8.423721529866782,
"grad_norm": 1.5703125,
"learning_rate": 8.197581733949597e-05,
"loss": 4.8172,
"mean_token_accuracy": 0.2506353959441185,
"num_tokens": 17822599.0,
"step": 9805
},
{
"entropy": 5.709234380722046,
"epoch": 8.428018908465836,
"grad_norm": 1.5546875,
"learning_rate": 8.180518477027083e-05,
"loss": 4.8411,
"mean_token_accuracy": 0.24736312478780748,
"num_tokens": 17830742.0,
"step": 9810
},
{
"entropy": 5.582332944869995,
"epoch": 8.432316287064891,
"grad_norm": 1.359375,
"learning_rate": 8.163497406130287e-05,
"loss": 4.7582,
"mean_token_accuracy": 0.25751803517341615,
"num_tokens": 17840101.0,
"step": 9815
},
{
"entropy": 5.793870306015014,
"epoch": 8.436613665663945,
"grad_norm": 1.359375,
"learning_rate": 8.146518558426426e-05,
"loss": 4.9523,
"mean_token_accuracy": 0.23325645178556442,
"num_tokens": 17850464.0,
"step": 9820
},
{
"entropy": 5.725572109222412,
"epoch": 8.440911044263,
"grad_norm": 1.5234375,
"learning_rate": 8.129581970990507e-05,
"loss": 4.8163,
"mean_token_accuracy": 0.25564427524805067,
"num_tokens": 17858792.0,
"step": 9825
},
{
"entropy": 5.69503002166748,
"epoch": 8.445208422862054,
"grad_norm": 1.5390625,
"learning_rate": 8.11268768080528e-05,
"loss": 4.8286,
"mean_token_accuracy": 0.2500312551856041,
"num_tokens": 17867225.0,
"step": 9830
},
{
"entropy": 5.722740554809571,
"epoch": 8.44950580146111,
"grad_norm": 1.578125,
"learning_rate": 8.09583572476111e-05,
"loss": 4.9198,
"mean_token_accuracy": 0.2357406347990036,
"num_tokens": 17876423.0,
"step": 9835
},
{
"entropy": 5.710339736938477,
"epoch": 8.453803180060163,
"grad_norm": 1.5703125,
"learning_rate": 8.079026139655946e-05,
"loss": 4.8892,
"mean_token_accuracy": 0.23821630626916884,
"num_tokens": 17886847.0,
"step": 9840
},
{
"entropy": 5.658467769622803,
"epoch": 8.458100558659218,
"grad_norm": 1.4140625,
"learning_rate": 8.062258962195192e-05,
"loss": 4.7725,
"mean_token_accuracy": 0.2558135434985161,
"num_tokens": 17897395.0,
"step": 9845
},
{
"entropy": 5.683120203018189,
"epoch": 8.462397937258272,
"grad_norm": 1.6015625,
"learning_rate": 8.04553422899167e-05,
"loss": 4.8719,
"mean_token_accuracy": 0.23903957158327102,
"num_tokens": 17906190.0,
"step": 9850
},
{
"entropy": 5.696431112289429,
"epoch": 8.466695315857327,
"grad_norm": 1.34375,
"learning_rate": 8.028851976565508e-05,
"loss": 4.918,
"mean_token_accuracy": 0.2389739364385605,
"num_tokens": 17916056.0,
"step": 9855
},
{
"entropy": 5.693221235275269,
"epoch": 8.470992694456381,
"grad_norm": 1.40625,
"learning_rate": 8.01221224134408e-05,
"loss": 4.8071,
"mean_token_accuracy": 0.2526836827397346,
"num_tokens": 17925790.0,
"step": 9860
},
{
"entropy": 5.683112764358521,
"epoch": 8.475290073055437,
"grad_norm": 1.4453125,
"learning_rate": 7.995615059661907e-05,
"loss": 4.8874,
"mean_token_accuracy": 0.2428746461868286,
"num_tokens": 17935007.0,
"step": 9865
},
{
"entropy": 5.700266742706299,
"epoch": 8.47958745165449,
"grad_norm": 1.4453125,
"learning_rate": 7.979060467760616e-05,
"loss": 4.8652,
"mean_token_accuracy": 0.23937302082777023,
"num_tokens": 17943363.0,
"step": 9870
},
{
"entropy": 5.69568886756897,
"epoch": 8.483884830253546,
"grad_norm": 1.546875,
"learning_rate": 7.962548501788811e-05,
"loss": 4.7895,
"mean_token_accuracy": 0.2517947033047676,
"num_tokens": 17951667.0,
"step": 9875
},
{
"entropy": 5.687301540374756,
"epoch": 8.4881822088526,
"grad_norm": 1.4921875,
"learning_rate": 7.946079197802028e-05,
"loss": 4.8921,
"mean_token_accuracy": 0.23645307570695878,
"num_tokens": 17960920.0,
"step": 9880
},
{
"entropy": 5.678065776824951,
"epoch": 8.492479587451655,
"grad_norm": 1.625,
"learning_rate": 7.929652591762636e-05,
"loss": 4.8753,
"mean_token_accuracy": 0.24290715754032136,
"num_tokens": 17969462.0,
"step": 9885
},
{
"entropy": 5.679081916809082,
"epoch": 8.496776966050708,
"grad_norm": 1.546875,
"learning_rate": 7.913268719539785e-05,
"loss": 4.8736,
"mean_token_accuracy": 0.24429829567670822,
"num_tokens": 17978608.0,
"step": 9890
},
{
"entropy": 5.665282773971557,
"epoch": 8.501074344649764,
"grad_norm": 1.3984375,
"learning_rate": 7.896927616909304e-05,
"loss": 4.7543,
"mean_token_accuracy": 0.2505912408232689,
"num_tokens": 17987563.0,
"step": 9895
},
{
"entropy": 5.718643712997436,
"epoch": 8.505371723248818,
"grad_norm": 1.3046875,
"learning_rate": 7.880629319553623e-05,
"loss": 4.8622,
"mean_token_accuracy": 0.2513675600290298,
"num_tokens": 17996493.0,
"step": 9900
},
{
"entropy": 5.699320316314697,
"epoch": 8.509669101847873,
"grad_norm": 1.3828125,
"learning_rate": 7.864373863061717e-05,
"loss": 4.8526,
"mean_token_accuracy": 0.24734937399625778,
"num_tokens": 18004578.0,
"step": 9905
},
{
"entropy": 5.698253583908081,
"epoch": 8.513966480446927,
"grad_norm": 1.53125,
"learning_rate": 7.848161282929006e-05,
"loss": 4.8414,
"mean_token_accuracy": 0.2415296345949173,
"num_tokens": 18013366.0,
"step": 9910
},
{
"entropy": 5.725323486328125,
"epoch": 8.518263859045982,
"grad_norm": 1.4765625,
"learning_rate": 7.831991614557274e-05,
"loss": 4.8704,
"mean_token_accuracy": 0.23448484092950822,
"num_tokens": 18021718.0,
"step": 9915
},
{
"entropy": 5.709474277496338,
"epoch": 8.522561237645036,
"grad_norm": 1.484375,
"learning_rate": 7.815864893254619e-05,
"loss": 4.8594,
"mean_token_accuracy": 0.23699430823326112,
"num_tokens": 18030770.0,
"step": 9920
},
{
"entropy": 5.685472393035889,
"epoch": 8.526858616244091,
"grad_norm": 1.390625,
"learning_rate": 7.799781154235361e-05,
"loss": 4.8764,
"mean_token_accuracy": 0.24279188066720964,
"num_tokens": 18039009.0,
"step": 9925
},
{
"entropy": 5.741101264953613,
"epoch": 8.531155994843147,
"grad_norm": 1.6171875,
"learning_rate": 7.783740432619954e-05,
"loss": 4.9274,
"mean_token_accuracy": 0.22871543914079667,
"num_tokens": 18046677.0,
"step": 9930
},
{
"entropy": 5.68333592414856,
"epoch": 8.5354533734422,
"grad_norm": 1.484375,
"learning_rate": 7.767742763434922e-05,
"loss": 4.8874,
"mean_token_accuracy": 0.23869478553533555,
"num_tokens": 18056772.0,
"step": 9935
},
{
"entropy": 5.657694101333618,
"epoch": 8.539750752041256,
"grad_norm": 1.390625,
"learning_rate": 7.75178818161277e-05,
"loss": 4.7672,
"mean_token_accuracy": 0.2539424285292625,
"num_tokens": 18065387.0,
"step": 9940
},
{
"entropy": 5.694158124923706,
"epoch": 8.54404813064031,
"grad_norm": 1.4921875,
"learning_rate": 7.735876721991945e-05,
"loss": 4.9216,
"mean_token_accuracy": 0.23886470794677733,
"num_tokens": 18074495.0,
"step": 9945
},
{
"entropy": 5.7139732360839846,
"epoch": 8.548345509239365,
"grad_norm": 1.3125,
"learning_rate": 7.720008419316708e-05,
"loss": 4.8844,
"mean_token_accuracy": 0.23962349444627762,
"num_tokens": 18083351.0,
"step": 9950
},
{
"entropy": 5.672069883346557,
"epoch": 8.552642887838418,
"grad_norm": 1.3515625,
"learning_rate": 7.704183308237089e-05,
"loss": 4.8055,
"mean_token_accuracy": 0.25269206762313845,
"num_tokens": 18092632.0,
"step": 9955
},
{
"entropy": 5.615656089782715,
"epoch": 8.556940266437474,
"grad_norm": 1.328125,
"learning_rate": 7.688401423308799e-05,
"loss": 4.792,
"mean_token_accuracy": 0.2518460646271706,
"num_tokens": 18102380.0,
"step": 9960
},
{
"entropy": 5.626342678070069,
"epoch": 8.561237645036528,
"grad_norm": 1.546875,
"learning_rate": 7.672662798993174e-05,
"loss": 4.7884,
"mean_token_accuracy": 0.25717829167842865,
"num_tokens": 18111254.0,
"step": 9965
},
{
"entropy": 5.711092376708985,
"epoch": 8.565535023635583,
"grad_norm": 1.4375,
"learning_rate": 7.656967469657083e-05,
"loss": 4.8606,
"mean_token_accuracy": 0.24956730604171753,
"num_tokens": 18119998.0,
"step": 9970
},
{
"entropy": 5.655433797836304,
"epoch": 8.569832402234637,
"grad_norm": 1.4140625,
"learning_rate": 7.641315469572841e-05,
"loss": 4.8115,
"mean_token_accuracy": 0.24736772924661637,
"num_tokens": 18129237.0,
"step": 9975
},
{
"entropy": 5.744913959503174,
"epoch": 8.574129780833692,
"grad_norm": 1.265625,
"learning_rate": 7.625706832918172e-05,
"loss": 4.8769,
"mean_token_accuracy": 0.24204541593790055,
"num_tokens": 18138440.0,
"step": 9980
},
{
"entropy": 5.702751970291137,
"epoch": 8.578427159432746,
"grad_norm": 1.4296875,
"learning_rate": 7.610141593776091e-05,
"loss": 4.9616,
"mean_token_accuracy": 0.23331688791513444,
"num_tokens": 18147712.0,
"step": 9985
},
{
"entropy": 5.710877704620361,
"epoch": 8.582724538031801,
"grad_norm": 1.4765625,
"learning_rate": 7.59461978613486e-05,
"loss": 4.8721,
"mean_token_accuracy": 0.2368180647492409,
"num_tokens": 18156710.0,
"step": 9990
},
{
"entropy": 5.697241592407226,
"epoch": 8.587021916630855,
"grad_norm": 1.546875,
"learning_rate": 7.579141443887901e-05,
"loss": 4.8222,
"mean_token_accuracy": 0.24461003839969636,
"num_tokens": 18165412.0,
"step": 9995
},
{
"entropy": 5.680181884765625,
"epoch": 8.59131929522991,
"grad_norm": 1.546875,
"learning_rate": 7.563706600833737e-05,
"loss": 4.8708,
"mean_token_accuracy": 0.24154605716466904,
"num_tokens": 18174415.0,
"step": 10000
},
{
"epoch": 8.59131929522991,
"eval_entropy": 5.506567450794014,
"eval_loss": 5.913105010986328,
"eval_mean_token_accuracy": 0.18126201354437046,
"eval_num_tokens": 18174415.0,
"eval_runtime": 2.0383,
"eval_samples_per_second": 1741.192,
"eval_steps_per_second": 217.833,
"step": 10000
},
{
"entropy": 5.6254020690917965,
"epoch": 8.595616673828964,
"grad_norm": 1.6015625,
"learning_rate": 7.548315290675886e-05,
"loss": 4.7962,
"mean_token_accuracy": 0.24960373640060424,
"num_tokens": 18183468.0,
"step": 10005
},
{
"entropy": 5.649244928359986,
"epoch": 8.59991405242802,
"grad_norm": 1.34375,
"learning_rate": 7.532967547022825e-05,
"loss": 4.8117,
"mean_token_accuracy": 0.24662989526987075,
"num_tokens": 18192552.0,
"step": 10010
},
{
"entropy": 5.689184713363647,
"epoch": 8.604211431027073,
"grad_norm": 1.515625,
"learning_rate": 7.517663403387874e-05,
"loss": 4.8954,
"mean_token_accuracy": 0.2369121193885803,
"num_tokens": 18202056.0,
"step": 10015
},
{
"entropy": 5.620952463150024,
"epoch": 8.608508809626128,
"grad_norm": 1.4765625,
"learning_rate": 7.502402893189191e-05,
"loss": 4.7867,
"mean_token_accuracy": 0.25269664376974105,
"num_tokens": 18210329.0,
"step": 10020
},
{
"entropy": 5.745185375213623,
"epoch": 8.612806188225182,
"grad_norm": 1.328125,
"learning_rate": 7.487186049749618e-05,
"loss": 4.9056,
"mean_token_accuracy": 0.23024686872959138,
"num_tokens": 18219143.0,
"step": 10025
},
{
"entropy": 5.682984447479248,
"epoch": 8.617103566824238,
"grad_norm": 1.390625,
"learning_rate": 7.472012906296658e-05,
"loss": 4.8526,
"mean_token_accuracy": 0.24552355259656905,
"num_tokens": 18227679.0,
"step": 10030
},
{
"entropy": 5.675940084457397,
"epoch": 8.621400945423291,
"grad_norm": 1.2265625,
"learning_rate": 7.45688349596239e-05,
"loss": 4.8174,
"mean_token_accuracy": 0.2461686313152313,
"num_tokens": 18237522.0,
"step": 10035
},
{
"entropy": 5.694513654708862,
"epoch": 8.625698324022347,
"grad_norm": 1.421875,
"learning_rate": 7.441797851783402e-05,
"loss": 4.8587,
"mean_token_accuracy": 0.2418885588645935,
"num_tokens": 18245969.0,
"step": 10040
},
{
"entropy": 5.6632660865783695,
"epoch": 8.6299957026214,
"grad_norm": 1.5,
"learning_rate": 7.426756006700716e-05,
"loss": 4.8073,
"mean_token_accuracy": 0.2459379628300667,
"num_tokens": 18254886.0,
"step": 10045
},
{
"entropy": 5.711243152618408,
"epoch": 8.634293081220456,
"grad_norm": 1.5546875,
"learning_rate": 7.411757993559702e-05,
"loss": 4.9312,
"mean_token_accuracy": 0.23633207231760026,
"num_tokens": 18264222.0,
"step": 10050
},
{
"entropy": 5.748408365249634,
"epoch": 8.63859045981951,
"grad_norm": 1.4765625,
"learning_rate": 7.396803845110032e-05,
"loss": 4.8876,
"mean_token_accuracy": 0.2470487743616104,
"num_tokens": 18273548.0,
"step": 10055
},
{
"entropy": 5.711117362976074,
"epoch": 8.642887838418565,
"grad_norm": 1.453125,
"learning_rate": 7.381893594005585e-05,
"loss": 4.8754,
"mean_token_accuracy": 0.23941914290189742,
"num_tokens": 18282842.0,
"step": 10060
},
{
"entropy": 5.681402254104614,
"epoch": 8.647185217017618,
"grad_norm": 1.421875,
"learning_rate": 7.367027272804387e-05,
"loss": 4.8257,
"mean_token_accuracy": 0.24647735208272933,
"num_tokens": 18291133.0,
"step": 10065
},
{
"entropy": 5.687382364273072,
"epoch": 8.651482595616674,
"grad_norm": 1.40625,
"learning_rate": 7.352204913968546e-05,
"loss": 4.8686,
"mean_token_accuracy": 0.2503230169415474,
"num_tokens": 18300936.0,
"step": 10070
},
{
"entropy": 5.7059704780578615,
"epoch": 8.65577997421573,
"grad_norm": 1.25,
"learning_rate": 7.337426549864175e-05,
"loss": 4.8993,
"mean_token_accuracy": 0.23561031371355057,
"num_tokens": 18311372.0,
"step": 10075
},
{
"entropy": 5.686294651031494,
"epoch": 8.660077352814783,
"grad_norm": 1.2734375,
"learning_rate": 7.322692212761306e-05,
"loss": 4.9052,
"mean_token_accuracy": 0.24392839670181274,
"num_tokens": 18321166.0,
"step": 10080
},
{
"entropy": 5.726395082473755,
"epoch": 8.664374731413837,
"grad_norm": 1.5234375,
"learning_rate": 7.308001934833844e-05,
"loss": 4.8645,
"mean_token_accuracy": 0.23951085060834884,
"num_tokens": 18330399.0,
"step": 10085
},
{
"entropy": 5.7263373851776125,
"epoch": 8.668672110012892,
"grad_norm": 1.453125,
"learning_rate": 7.29335574815948e-05,
"loss": 4.8393,
"mean_token_accuracy": 0.24821092933416367,
"num_tokens": 18338920.0,
"step": 10090
},
{
"entropy": 5.696876811981201,
"epoch": 8.672969488611947,
"grad_norm": 1.28125,
"learning_rate": 7.278753684719636e-05,
"loss": 4.8531,
"mean_token_accuracy": 0.24141126424074172,
"num_tokens": 18348515.0,
"step": 10095
},
{
"entropy": 5.720726728439331,
"epoch": 8.677266867211001,
"grad_norm": 1.453125,
"learning_rate": 7.264195776399386e-05,
"loss": 4.8862,
"mean_token_accuracy": 0.2419890806078911,
"num_tokens": 18357353.0,
"step": 10100
},
{
"entropy": 5.65763750076294,
"epoch": 8.681564245810057,
"grad_norm": 1.390625,
"learning_rate": 7.249682054987381e-05,
"loss": 4.8015,
"mean_token_accuracy": 0.24651831835508348,
"num_tokens": 18366434.0,
"step": 10105
},
{
"entropy": 5.749982023239136,
"epoch": 8.68586162440911,
"grad_norm": 1.3515625,
"learning_rate": 7.23521255217578e-05,
"loss": 4.9131,
"mean_token_accuracy": 0.2301698476076126,
"num_tokens": 18375589.0,
"step": 10110
},
{
"entropy": 5.68700098991394,
"epoch": 8.690159003008166,
"grad_norm": 1.3984375,
"learning_rate": 7.220787299560205e-05,
"loss": 4.8505,
"mean_token_accuracy": 0.2439696341753006,
"num_tokens": 18384904.0,
"step": 10115
},
{
"entropy": 5.666984224319458,
"epoch": 8.69445638160722,
"grad_norm": 1.484375,
"learning_rate": 7.20640632863963e-05,
"loss": 4.7216,
"mean_token_accuracy": 0.25795569866895673,
"num_tokens": 18393833.0,
"step": 10120
},
{
"entropy": 5.702707195281983,
"epoch": 8.698753760206275,
"grad_norm": 1.4140625,
"learning_rate": 7.192069670816359e-05,
"loss": 4.8964,
"mean_token_accuracy": 0.23888549208641052,
"num_tokens": 18403444.0,
"step": 10125
},
{
"entropy": 5.65869517326355,
"epoch": 8.703051138805328,
"grad_norm": 1.5625,
"learning_rate": 7.177777357395912e-05,
"loss": 4.8157,
"mean_token_accuracy": 0.24661154597997664,
"num_tokens": 18412091.0,
"step": 10130
},
{
"entropy": 5.68733491897583,
"epoch": 8.707348517404384,
"grad_norm": 1.4765625,
"learning_rate": 7.163529419587002e-05,
"loss": 4.8307,
"mean_token_accuracy": 0.24211833775043487,
"num_tokens": 18421655.0,
"step": 10135
},
{
"entropy": 5.7377735614776615,
"epoch": 8.711645896003438,
"grad_norm": 1.4296875,
"learning_rate": 7.149325888501418e-05,
"loss": 4.928,
"mean_token_accuracy": 0.24463406801223755,
"num_tokens": 18431239.0,
"step": 10140
},
{
"entropy": 5.669396209716797,
"epoch": 8.715943274602493,
"grad_norm": 1.296875,
"learning_rate": 7.135166795153992e-05,
"loss": 4.8142,
"mean_token_accuracy": 0.24762289375066757,
"num_tokens": 18440572.0,
"step": 10145
},
{
"entropy": 5.694294738769531,
"epoch": 8.720240653201547,
"grad_norm": 1.46875,
"learning_rate": 7.121052170462541e-05,
"loss": 4.8963,
"mean_token_accuracy": 0.24246564954519273,
"num_tokens": 18449245.0,
"step": 10150
},
{
"entropy": 5.621601915359497,
"epoch": 8.724538031800602,
"grad_norm": 1.5390625,
"learning_rate": 7.106982045247754e-05,
"loss": 4.7392,
"mean_token_accuracy": 0.26036910712718964,
"num_tokens": 18457961.0,
"step": 10155
},
{
"entropy": 5.691078186035156,
"epoch": 8.728835410399656,
"grad_norm": 1.375,
"learning_rate": 7.092956450233162e-05,
"loss": 4.9067,
"mean_token_accuracy": 0.2410487100481987,
"num_tokens": 18467956.0,
"step": 10160
},
{
"entropy": 5.701229906082153,
"epoch": 8.733132788998711,
"grad_norm": 1.3125,
"learning_rate": 7.078975416045055e-05,
"loss": 4.8365,
"mean_token_accuracy": 0.24399230629205704,
"num_tokens": 18477313.0,
"step": 10165
},
{
"entropy": 5.623446989059448,
"epoch": 8.737430167597765,
"grad_norm": 1.6328125,
"learning_rate": 7.065038973212424e-05,
"loss": 4.7297,
"mean_token_accuracy": 0.25248824506998063,
"num_tokens": 18485690.0,
"step": 10170
},
{
"entropy": 5.676666736602783,
"epoch": 8.74172754619682,
"grad_norm": 1.5546875,
"learning_rate": 7.051147152166896e-05,
"loss": 4.848,
"mean_token_accuracy": 0.2502830371260643,
"num_tokens": 18494150.0,
"step": 10175
},
{
"entropy": 5.63793683052063,
"epoch": 8.746024924795874,
"grad_norm": 1.2109375,
"learning_rate": 7.037299983242652e-05,
"loss": 4.8172,
"mean_token_accuracy": 0.2467881128191948,
"num_tokens": 18504091.0,
"step": 10180
},
{
"entropy": 5.690157794952393,
"epoch": 8.75032230339493,
"grad_norm": 1.4296875,
"learning_rate": 7.023497496676371e-05,
"loss": 4.866,
"mean_token_accuracy": 0.2452865555882454,
"num_tokens": 18513695.0,
"step": 10185
},
{
"entropy": 5.6855371475219725,
"epoch": 8.754619681993983,
"grad_norm": 1.28125,
"learning_rate": 7.009739722607173e-05,
"loss": 4.831,
"mean_token_accuracy": 0.25114827454090116,
"num_tokens": 18522551.0,
"step": 10190
},
{
"entropy": 5.739221286773682,
"epoch": 8.758917060593038,
"grad_norm": 1.390625,
"learning_rate": 6.996026691076531e-05,
"loss": 4.9829,
"mean_token_accuracy": 0.23191874623298644,
"num_tokens": 18533290.0,
"step": 10195
},
{
"entropy": 5.685747337341309,
"epoch": 8.763214439192092,
"grad_norm": 1.375,
"learning_rate": 6.982358432028234e-05,
"loss": 4.7782,
"mean_token_accuracy": 0.24641448855400086,
"num_tokens": 18542990.0,
"step": 10200
},
{
"entropy": 5.780192089080811,
"epoch": 8.767511817791148,
"grad_norm": 1.4453125,
"learning_rate": 6.968734975308283e-05,
"loss": 4.953,
"mean_token_accuracy": 0.23276820182800292,
"num_tokens": 18552525.0,
"step": 10205
},
{
"entropy": 5.6691224575042725,
"epoch": 8.771809196390201,
"grad_norm": 1.4140625,
"learning_rate": 6.955156350664876e-05,
"loss": 4.7706,
"mean_token_accuracy": 0.24767502546310424,
"num_tokens": 18561741.0,
"step": 10210
},
{
"entropy": 5.754533815383911,
"epoch": 8.776106574989257,
"grad_norm": 1.4609375,
"learning_rate": 6.941622587748298e-05,
"loss": 4.9233,
"mean_token_accuracy": 0.23771358281373978,
"num_tokens": 18570130.0,
"step": 10215
},
{
"entropy": 5.644664287567139,
"epoch": 8.780403953588312,
"grad_norm": 1.5078125,
"learning_rate": 6.928133716110863e-05,
"loss": 4.8566,
"mean_token_accuracy": 0.2493690401315689,
"num_tokens": 18579149.0,
"step": 10220
},
{
"entropy": 5.680695533752441,
"epoch": 8.784701332187366,
"grad_norm": 1.53125,
"learning_rate": 6.914689765206899e-05,
"loss": 4.7534,
"mean_token_accuracy": 0.24669545888900757,
"num_tokens": 18587604.0,
"step": 10225
},
{
"entropy": 5.696615695953369,
"epoch": 8.78899871078642,
"grad_norm": 1.5546875,
"learning_rate": 6.901290764392609e-05,
"loss": 4.8287,
"mean_token_accuracy": 0.24735084176063538,
"num_tokens": 18596056.0,
"step": 10230
},
{
"entropy": 5.695622062683105,
"epoch": 8.793296089385475,
"grad_norm": 1.21875,
"learning_rate": 6.887936742926058e-05,
"loss": 4.8745,
"mean_token_accuracy": 0.23673731535673143,
"num_tokens": 18605575.0,
"step": 10235
},
{
"entropy": 5.631627225875855,
"epoch": 8.79759346798453,
"grad_norm": 1.3828125,
"learning_rate": 6.874627729967086e-05,
"loss": 4.8136,
"mean_token_accuracy": 0.24473054558038712,
"num_tokens": 18614206.0,
"step": 10240
},
{
"entropy": 5.666772413253784,
"epoch": 8.801890846583584,
"grad_norm": 1.3203125,
"learning_rate": 6.861363754577262e-05,
"loss": 4.8219,
"mean_token_accuracy": 0.2469828635454178,
"num_tokens": 18623067.0,
"step": 10245
},
{
"entropy": 5.623386335372925,
"epoch": 8.80618822518264,
"grad_norm": 1.5703125,
"learning_rate": 6.848144845719808e-05,
"loss": 4.7902,
"mean_token_accuracy": 0.2543609425425529,
"num_tokens": 18632266.0,
"step": 10250
},
{
"entropy": 5.627623748779297,
"epoch": 8.810485603781693,
"grad_norm": 1.4453125,
"learning_rate": 6.834971032259537e-05,
"loss": 4.7903,
"mean_token_accuracy": 0.2480527102947235,
"num_tokens": 18640516.0,
"step": 10255
},
{
"entropy": 5.646703052520752,
"epoch": 8.814782982380748,
"grad_norm": 1.5,
"learning_rate": 6.821842342962786e-05,
"loss": 4.8141,
"mean_token_accuracy": 0.25480430126190184,
"num_tokens": 18649368.0,
"step": 10260
},
{
"entropy": 5.662834644317627,
"epoch": 8.819080360979802,
"grad_norm": 1.453125,
"learning_rate": 6.808758806497375e-05,
"loss": 4.7975,
"mean_token_accuracy": 0.25370775163173676,
"num_tokens": 18658306.0,
"step": 10265
},
{
"entropy": 5.711131000518799,
"epoch": 8.823377739578858,
"grad_norm": 1.4765625,
"learning_rate": 6.795720451432509e-05,
"loss": 4.8063,
"mean_token_accuracy": 0.2523254692554474,
"num_tokens": 18668676.0,
"step": 10270
},
{
"entropy": 5.702434062957764,
"epoch": 8.827675118177911,
"grad_norm": 1.34375,
"learning_rate": 6.782727306238749e-05,
"loss": 4.9387,
"mean_token_accuracy": 0.2371155932545662,
"num_tokens": 18677840.0,
"step": 10275
},
{
"entropy": 5.624211645126342,
"epoch": 8.831972496776967,
"grad_norm": 1.5625,
"learning_rate": 6.769779399287928e-05,
"loss": 4.7561,
"mean_token_accuracy": 0.24929467886686324,
"num_tokens": 18685733.0,
"step": 10280
},
{
"entropy": 5.650241088867188,
"epoch": 8.83626987537602,
"grad_norm": 1.3828125,
"learning_rate": 6.756876758853104e-05,
"loss": 4.8294,
"mean_token_accuracy": 0.24658081233501433,
"num_tokens": 18694648.0,
"step": 10285
},
{
"entropy": 5.656064081192016,
"epoch": 8.840567253975076,
"grad_norm": 1.65625,
"learning_rate": 6.744019413108486e-05,
"loss": 4.8364,
"mean_token_accuracy": 0.24805764555931092,
"num_tokens": 18702758.0,
"step": 10290
},
{
"entropy": 5.736115217208862,
"epoch": 8.84486463257413,
"grad_norm": 1.4609375,
"learning_rate": 6.731207390129366e-05,
"loss": 4.8881,
"mean_token_accuracy": 0.2394386649131775,
"num_tokens": 18712257.0,
"step": 10295
},
{
"entropy": 5.716180801391602,
"epoch": 8.849162011173185,
"grad_norm": 1.2578125,
"learning_rate": 6.7184407178921e-05,
"loss": 4.8731,
"mean_token_accuracy": 0.2420590564608574,
"num_tokens": 18722337.0,
"step": 10300
},
{
"entropy": 5.737769985198975,
"epoch": 8.853459389772238,
"grad_norm": 1.515625,
"learning_rate": 6.70571942427399e-05,
"loss": 4.9205,
"mean_token_accuracy": 0.23732213526964188,
"num_tokens": 18731260.0,
"step": 10305
},
{
"entropy": 5.747370815277099,
"epoch": 8.857756768371294,
"grad_norm": 1.421875,
"learning_rate": 6.693043537053254e-05,
"loss": 4.9446,
"mean_token_accuracy": 0.23142024278640747,
"num_tokens": 18741357.0,
"step": 10310
},
{
"entropy": 5.675503587722778,
"epoch": 8.862054146970348,
"grad_norm": 1.4296875,
"learning_rate": 6.68041308390896e-05,
"loss": 4.8852,
"mean_token_accuracy": 0.2416966140270233,
"num_tokens": 18751694.0,
"step": 10315
},
{
"entropy": 5.726422929763794,
"epoch": 8.866351525569403,
"grad_norm": 1.328125,
"learning_rate": 6.667828092420977e-05,
"loss": 4.8992,
"mean_token_accuracy": 0.2432712882757187,
"num_tokens": 18760679.0,
"step": 10320
},
{
"entropy": 5.655799865722656,
"epoch": 8.870648904168457,
"grad_norm": 1.359375,
"learning_rate": 6.655288590069889e-05,
"loss": 4.8386,
"mean_token_accuracy": 0.2504117161035538,
"num_tokens": 18769650.0,
"step": 10325
},
{
"entropy": 5.644798851013183,
"epoch": 8.874946282767512,
"grad_norm": 1.421875,
"learning_rate": 6.642794604236965e-05,
"loss": 4.8651,
"mean_token_accuracy": 0.24558795243501663,
"num_tokens": 18780076.0,
"step": 10330
},
{
"entropy": 5.686689949035644,
"epoch": 8.879243661366566,
"grad_norm": 1.40625,
"learning_rate": 6.630346162204069e-05,
"loss": 4.7995,
"mean_token_accuracy": 0.2490845650434494,
"num_tokens": 18789034.0,
"step": 10335
},
{
"entropy": 5.655998277664184,
"epoch": 8.883541039965621,
"grad_norm": 1.6328125,
"learning_rate": 6.617943291153631e-05,
"loss": 4.7623,
"mean_token_accuracy": 0.2500889241695404,
"num_tokens": 18797916.0,
"step": 10340
},
{
"entropy": 5.753029680252075,
"epoch": 8.887838418564675,
"grad_norm": 1.59375,
"learning_rate": 6.605586018168558e-05,
"loss": 4.8629,
"mean_token_accuracy": 0.24752600491046906,
"num_tokens": 18806895.0,
"step": 10345
},
{
"entropy": 5.728021287918091,
"epoch": 8.89213579716373,
"grad_norm": 1.3125,
"learning_rate": 6.593274370232191e-05,
"loss": 4.8637,
"mean_token_accuracy": 0.24471449553966523,
"num_tokens": 18816049.0,
"step": 10350
},
{
"entropy": 5.675029182434082,
"epoch": 8.896433175762784,
"grad_norm": 1.3515625,
"learning_rate": 6.581008374228255e-05,
"loss": 4.847,
"mean_token_accuracy": 0.2431689277291298,
"num_tokens": 18825491.0,
"step": 10355
},
{
"entropy": 5.636963558197022,
"epoch": 8.90073055436184,
"grad_norm": 1.3984375,
"learning_rate": 6.568788056940785e-05,
"loss": 4.8634,
"mean_token_accuracy": 0.24555369168519975,
"num_tokens": 18834677.0,
"step": 10360
},
{
"entropy": 5.6485583782196045,
"epoch": 8.905027932960895,
"grad_norm": 1.359375,
"learning_rate": 6.556613445054063e-05,
"loss": 4.7669,
"mean_token_accuracy": 0.2565375745296478,
"num_tokens": 18844283.0,
"step": 10365
},
{
"entropy": 5.733704614639282,
"epoch": 8.909325311559948,
"grad_norm": 1.3828125,
"learning_rate": 6.544484565152577e-05,
"loss": 4.8357,
"mean_token_accuracy": 0.251072558760643,
"num_tokens": 18853403.0,
"step": 10370
},
{
"entropy": 5.6631426334381105,
"epoch": 8.913622690159002,
"grad_norm": 1.6171875,
"learning_rate": 6.532401443720951e-05,
"loss": 4.8682,
"mean_token_accuracy": 0.2461735039949417,
"num_tokens": 18861530.0,
"step": 10375
},
{
"entropy": 5.705496835708618,
"epoch": 8.917920068758058,
"grad_norm": 1.2421875,
"learning_rate": 6.520364107143898e-05,
"loss": 4.8882,
"mean_token_accuracy": 0.24805489480495452,
"num_tokens": 18871495.0,
"step": 10380
},
{
"entropy": 5.720414495468139,
"epoch": 8.922217447357113,
"grad_norm": 1.3671875,
"learning_rate": 6.50837258170615e-05,
"loss": 4.879,
"mean_token_accuracy": 0.24330080598592757,
"num_tokens": 18881099.0,
"step": 10385
},
{
"entropy": 5.712412929534912,
"epoch": 8.926514825956167,
"grad_norm": 1.390625,
"learning_rate": 6.496426893592397e-05,
"loss": 4.8904,
"mean_token_accuracy": 0.23457861095666885,
"num_tokens": 18890368.0,
"step": 10390
},
{
"entropy": 5.742951536178589,
"epoch": 8.930812204555222,
"grad_norm": 1.5234375,
"learning_rate": 6.484527068887258e-05,
"loss": 4.8807,
"mean_token_accuracy": 0.23752784430980683,
"num_tokens": 18898576.0,
"step": 10395
},
{
"entropy": 5.680559682846069,
"epoch": 8.935109583154276,
"grad_norm": 1.5390625,
"learning_rate": 6.472673133575181e-05,
"loss": 4.7971,
"mean_token_accuracy": 0.2512816682457924,
"num_tokens": 18907276.0,
"step": 10400
},
{
"entropy": 5.70475172996521,
"epoch": 8.939406961753331,
"grad_norm": 1.5078125,
"learning_rate": 6.460865113540437e-05,
"loss": 4.7961,
"mean_token_accuracy": 0.25029991418123243,
"num_tokens": 18915920.0,
"step": 10405
},
{
"entropy": 5.712363386154175,
"epoch": 8.943704340352385,
"grad_norm": 1.3671875,
"learning_rate": 6.449103034567011e-05,
"loss": 4.8037,
"mean_token_accuracy": 0.25050773173570634,
"num_tokens": 18925529.0,
"step": 10410
},
{
"entropy": 5.7132209777832035,
"epoch": 8.94800171895144,
"grad_norm": 1.46875,
"learning_rate": 6.437386922338591e-05,
"loss": 4.8899,
"mean_token_accuracy": 0.24239819943904878,
"num_tokens": 18934859.0,
"step": 10415
},
{
"entropy": 5.703937101364136,
"epoch": 8.952299097550494,
"grad_norm": 1.4765625,
"learning_rate": 6.425716802438479e-05,
"loss": 4.8782,
"mean_token_accuracy": 0.24772197157144546,
"num_tokens": 18943804.0,
"step": 10420
},
{
"entropy": 5.667482328414917,
"epoch": 8.95659647614955,
"grad_norm": 1.5234375,
"learning_rate": 6.414092700349548e-05,
"loss": 4.8499,
"mean_token_accuracy": 0.24999572783708573,
"num_tokens": 18954290.0,
"step": 10425
},
{
"entropy": 5.731049251556397,
"epoch": 8.960893854748603,
"grad_norm": 1.453125,
"learning_rate": 6.402514641454192e-05,
"loss": 4.9394,
"mean_token_accuracy": 0.23362892270088195,
"num_tokens": 18963448.0,
"step": 10430
},
{
"entropy": 5.677079200744629,
"epoch": 8.965191233347658,
"grad_norm": 1.3359375,
"learning_rate": 6.390982651034274e-05,
"loss": 4.8534,
"mean_token_accuracy": 0.24200855642557145,
"num_tokens": 18972314.0,
"step": 10435
},
{
"entropy": 5.644694709777832,
"epoch": 8.969488611946712,
"grad_norm": 1.546875,
"learning_rate": 6.379496754271044e-05,
"loss": 4.697,
"mean_token_accuracy": 0.2599035635590553,
"num_tokens": 18980720.0,
"step": 10440
},
{
"entropy": 5.619343566894531,
"epoch": 8.973785990545768,
"grad_norm": 1.5859375,
"learning_rate": 6.368056976245107e-05,
"loss": 4.7341,
"mean_token_accuracy": 0.2658589705824852,
"num_tokens": 18988267.0,
"step": 10445
},
{
"entropy": 5.668655204772949,
"epoch": 8.978083369144821,
"grad_norm": 1.5234375,
"learning_rate": 6.356663341936368e-05,
"loss": 4.8362,
"mean_token_accuracy": 0.24780694246292115,
"num_tokens": 18997036.0,
"step": 10450
},
{
"entropy": 5.671089696884155,
"epoch": 8.982380747743877,
"grad_norm": 1.5078125,
"learning_rate": 6.345315876223977e-05,
"loss": 4.8045,
"mean_token_accuracy": 0.2473225250840187,
"num_tokens": 19005440.0,
"step": 10455
},
{
"entropy": 5.656380033493042,
"epoch": 8.98667812634293,
"grad_norm": 1.453125,
"learning_rate": 6.334014603886256e-05,
"loss": 4.8372,
"mean_token_accuracy": 0.24929069727659225,
"num_tokens": 19014449.0,
"step": 10460
},
{
"entropy": 5.653917503356934,
"epoch": 8.990975504941986,
"grad_norm": 1.546875,
"learning_rate": 6.322759549600665e-05,
"loss": 4.7954,
"mean_token_accuracy": 0.24964505285024643,
"num_tokens": 19024061.0,
"step": 10465
},
{
"entropy": 5.615225315093994,
"epoch": 8.99527288354104,
"grad_norm": 1.4453125,
"learning_rate": 6.311550737943753e-05,
"loss": 4.7957,
"mean_token_accuracy": 0.2515642836689949,
"num_tokens": 19033400.0,
"step": 10470
},
{
"entropy": 5.703850746154785,
"epoch": 8.999570262140095,
"grad_norm": 1.453125,
"learning_rate": 6.300388193391075e-05,
"loss": 4.8762,
"mean_token_accuracy": 0.24604927450418473,
"num_tokens": 19042633.0,
"step": 10475
},
{
"entropy": 5.689663728078206,
"epoch": 9.003437902879243,
"grad_norm": 1.359375,
"learning_rate": 6.289271940317174e-05,
"loss": 4.929,
"mean_token_accuracy": 0.23949002060625288,
"num_tokens": 19051120.0,
"step": 10480
},
{
"entropy": 5.653165674209594,
"epoch": 9.007735281478299,
"grad_norm": 1.3984375,
"learning_rate": 6.278202002995497e-05,
"loss": 4.8287,
"mean_token_accuracy": 0.24843208193778993,
"num_tokens": 19060755.0,
"step": 10485
},
{
"entropy": 5.659261417388916,
"epoch": 9.012032660077352,
"grad_norm": 1.4921875,
"learning_rate": 6.267178405598371e-05,
"loss": 4.7583,
"mean_token_accuracy": 0.2590452417731285,
"num_tokens": 19069197.0,
"step": 10490
},
{
"entropy": 5.724744033813477,
"epoch": 9.016330038676408,
"grad_norm": 1.4140625,
"learning_rate": 6.256201172196921e-05,
"loss": 4.827,
"mean_token_accuracy": 0.2497994914650917,
"num_tokens": 19078452.0,
"step": 10495
},
{
"entropy": 5.6985725402832035,
"epoch": 9.020627417275461,
"grad_norm": 1.3515625,
"learning_rate": 6.245270326761034e-05,
"loss": 4.8345,
"mean_token_accuracy": 0.24337324351072312,
"num_tokens": 19087127.0,
"step": 10500
},
{
"epoch": 9.020627417275461,
"eval_entropy": 5.525319326568294,
"eval_loss": 5.9135332107543945,
"eval_mean_token_accuracy": 0.18133216045863992,
"eval_num_tokens": 19087127.0,
"eval_runtime": 2.0427,
"eval_samples_per_second": 1737.446,
"eval_steps_per_second": 217.364,
"step": 10500
},
{
"entropy": 5.71956787109375,
"epoch": 9.024924795874517,
"grad_norm": 1.53125,
"learning_rate": 6.234385893159311e-05,
"loss": 4.8298,
"mean_token_accuracy": 0.2490815445780754,
"num_tokens": 19095610.0,
"step": 10505
},
{
"entropy": 5.676876735687256,
"epoch": 9.02922217447357,
"grad_norm": 1.5390625,
"learning_rate": 6.223547895159009e-05,
"loss": 4.7734,
"mean_token_accuracy": 0.255961075425148,
"num_tokens": 19103649.0,
"step": 10510
},
{
"entropy": 5.61830244064331,
"epoch": 9.033519553072626,
"grad_norm": 1.4375,
"learning_rate": 6.212756356425978e-05,
"loss": 4.7371,
"mean_token_accuracy": 0.25833165645599365,
"num_tokens": 19112426.0,
"step": 10515
},
{
"entropy": 5.696269941329956,
"epoch": 9.03781693167168,
"grad_norm": 1.4375,
"learning_rate": 6.202011300524623e-05,
"loss": 4.8635,
"mean_token_accuracy": 0.24548338055610658,
"num_tokens": 19121307.0,
"step": 10520
},
{
"entropy": 5.724876403808594,
"epoch": 9.042114310270735,
"grad_norm": 1.3359375,
"learning_rate": 6.191312750917855e-05,
"loss": 4.8746,
"mean_token_accuracy": 0.2516420379281044,
"num_tokens": 19129728.0,
"step": 10525
},
{
"entropy": 5.674010276794434,
"epoch": 9.046411688869789,
"grad_norm": 1.4609375,
"learning_rate": 6.180660730967036e-05,
"loss": 4.79,
"mean_token_accuracy": 0.24628546386957167,
"num_tokens": 19139931.0,
"step": 10530
},
{
"entropy": 5.723674154281616,
"epoch": 9.050709067468844,
"grad_norm": 1.4296875,
"learning_rate": 6.170055263931912e-05,
"loss": 4.7869,
"mean_token_accuracy": 0.25347589552402494,
"num_tokens": 19148711.0,
"step": 10535
},
{
"entropy": 5.695741748809814,
"epoch": 9.055006446067898,
"grad_norm": 1.484375,
"learning_rate": 6.159496372970584e-05,
"loss": 4.7519,
"mean_token_accuracy": 0.2537475124001503,
"num_tokens": 19156937.0,
"step": 10540
},
{
"entropy": 5.6423603057861325,
"epoch": 9.059303824666953,
"grad_norm": 1.3046875,
"learning_rate": 6.148984081139454e-05,
"loss": 4.809,
"mean_token_accuracy": 0.2527445778250694,
"num_tokens": 19166449.0,
"step": 10545
},
{
"entropy": 5.712124967575074,
"epoch": 9.063601203266007,
"grad_norm": 1.203125,
"learning_rate": 6.138518411393163e-05,
"loss": 4.9294,
"mean_token_accuracy": 0.23486791402101517,
"num_tokens": 19175757.0,
"step": 10550
},
{
"entropy": 5.681598091125489,
"epoch": 9.067898581865062,
"grad_norm": 1.46875,
"learning_rate": 6.128099386584548e-05,
"loss": 4.8269,
"mean_token_accuracy": 0.24936643838882447,
"num_tokens": 19184638.0,
"step": 10555
},
{
"entropy": 5.685164928436279,
"epoch": 9.072195960464116,
"grad_norm": 1.453125,
"learning_rate": 6.1177270294646e-05,
"loss": 4.8695,
"mean_token_accuracy": 0.24396283477544783,
"num_tokens": 19193791.0,
"step": 10560
},
{
"entropy": 5.649673366546631,
"epoch": 9.076493339063171,
"grad_norm": 1.4453125,
"learning_rate": 6.107401362682401e-05,
"loss": 4.8724,
"mean_token_accuracy": 0.24454542100429535,
"num_tokens": 19202595.0,
"step": 10565
},
{
"entropy": 5.667158842086792,
"epoch": 9.080790717662227,
"grad_norm": 1.3359375,
"learning_rate": 6.097122408785076e-05,
"loss": 4.8515,
"mean_token_accuracy": 0.2463081881403923,
"num_tokens": 19211994.0,
"step": 10570
},
{
"entropy": 5.706708908081055,
"epoch": 9.08508809626128,
"grad_norm": 1.40625,
"learning_rate": 6.086890190217752e-05,
"loss": 4.8054,
"mean_token_accuracy": 0.25122742652893065,
"num_tokens": 19220588.0,
"step": 10575
},
{
"entropy": 5.645897769927979,
"epoch": 9.089385474860336,
"grad_norm": 1.53125,
"learning_rate": 6.076704729323507e-05,
"loss": 4.7591,
"mean_token_accuracy": 0.25730605572462084,
"num_tokens": 19229679.0,
"step": 10580
},
{
"entropy": 5.678455591201782,
"epoch": 9.09368285345939,
"grad_norm": 1.390625,
"learning_rate": 6.0665660483433173e-05,
"loss": 4.8874,
"mean_token_accuracy": 0.24280146211385728,
"num_tokens": 19239269.0,
"step": 10585
},
{
"entropy": 5.721188926696778,
"epoch": 9.097980232058445,
"grad_norm": 1.453125,
"learning_rate": 6.05647416941601e-05,
"loss": 4.8895,
"mean_token_accuracy": 0.2490587815642357,
"num_tokens": 19249046.0,
"step": 10590
},
{
"entropy": 5.641297340393066,
"epoch": 9.102277610657499,
"grad_norm": 1.4296875,
"learning_rate": 6.046429114578212e-05,
"loss": 4.7506,
"mean_token_accuracy": 0.25748555064201356,
"num_tokens": 19257398.0,
"step": 10595
},
{
"entropy": 5.715298366546631,
"epoch": 9.106574989256554,
"grad_norm": 1.4453125,
"learning_rate": 6.0364309057643084e-05,
"loss": 4.8532,
"mean_token_accuracy": 0.24479844868183137,
"num_tokens": 19266619.0,
"step": 10600
},
{
"entropy": 5.733517217636108,
"epoch": 9.110872367855608,
"grad_norm": 1.4765625,
"learning_rate": 6.0264795648063904e-05,
"loss": 4.8546,
"mean_token_accuracy": 0.25051381438970566,
"num_tokens": 19275837.0,
"step": 10605
},
{
"entropy": 5.7208888053894045,
"epoch": 9.115169746454663,
"grad_norm": 1.2578125,
"learning_rate": 6.0165751134342155e-05,
"loss": 4.8192,
"mean_token_accuracy": 0.24117450267076493,
"num_tokens": 19284745.0,
"step": 10610
},
{
"entropy": 5.6670173645019535,
"epoch": 9.119467125053717,
"grad_norm": 1.3046875,
"learning_rate": 6.006717573275138e-05,
"loss": 4.767,
"mean_token_accuracy": 0.2596576452255249,
"num_tokens": 19293956.0,
"step": 10615
},
{
"entropy": 5.694186544418335,
"epoch": 9.123764503652772,
"grad_norm": 1.171875,
"learning_rate": 5.996906965854093e-05,
"loss": 4.8821,
"mean_token_accuracy": 0.24278711527585983,
"num_tokens": 19303629.0,
"step": 10620
},
{
"entropy": 5.708401870727539,
"epoch": 9.128061882251826,
"grad_norm": 1.546875,
"learning_rate": 5.987143312593522e-05,
"loss": 4.8858,
"mean_token_accuracy": 0.23979386538267136,
"num_tokens": 19312458.0,
"step": 10625
},
{
"entropy": 5.638538551330567,
"epoch": 9.132359260850881,
"grad_norm": 1.484375,
"learning_rate": 5.977426634813338e-05,
"loss": 4.7821,
"mean_token_accuracy": 0.2493679091334343,
"num_tokens": 19321428.0,
"step": 10630
},
{
"entropy": 5.727855634689331,
"epoch": 9.136656639449935,
"grad_norm": 1.5,
"learning_rate": 5.9677569537308866e-05,
"loss": 4.9435,
"mean_token_accuracy": 0.23229997903108596,
"num_tokens": 19331176.0,
"step": 10635
},
{
"entropy": 5.7334301471710205,
"epoch": 9.14095401804899,
"grad_norm": 1.3984375,
"learning_rate": 5.958134290460888e-05,
"loss": 4.8561,
"mean_token_accuracy": 0.24669714123010636,
"num_tokens": 19340401.0,
"step": 10640
},
{
"entropy": 5.6491090774536135,
"epoch": 9.145251396648044,
"grad_norm": 1.234375,
"learning_rate": 5.94855866601539e-05,
"loss": 4.7773,
"mean_token_accuracy": 0.24960935413837432,
"num_tokens": 19350700.0,
"step": 10645
},
{
"entropy": 5.686613368988037,
"epoch": 9.1495487752471,
"grad_norm": 1.46875,
"learning_rate": 5.939030101303724e-05,
"loss": 4.8311,
"mean_token_accuracy": 0.24494681507349014,
"num_tokens": 19360040.0,
"step": 10650
},
{
"entropy": 5.705801963806152,
"epoch": 9.153846153846153,
"grad_norm": 1.515625,
"learning_rate": 5.929548617132472e-05,
"loss": 4.7765,
"mean_token_accuracy": 0.25311189144849777,
"num_tokens": 19368546.0,
"step": 10655
},
{
"entropy": 5.6294488430023195,
"epoch": 9.158143532445209,
"grad_norm": 1.53125,
"learning_rate": 5.920114234205407e-05,
"loss": 4.741,
"mean_token_accuracy": 0.2527645379304886,
"num_tokens": 19377137.0,
"step": 10660
},
{
"entropy": 5.7201310157775875,
"epoch": 9.162440911044262,
"grad_norm": 1.4140625,
"learning_rate": 5.910726973123451e-05,
"loss": 4.9084,
"mean_token_accuracy": 0.2363821893930435,
"num_tokens": 19385781.0,
"step": 10665
},
{
"entropy": 5.692120361328125,
"epoch": 9.166738289643318,
"grad_norm": 1.4765625,
"learning_rate": 5.901386854384622e-05,
"loss": 4.844,
"mean_token_accuracy": 0.25199408531188966,
"num_tokens": 19395186.0,
"step": 10670
},
{
"entropy": 5.724067687988281,
"epoch": 9.171035668242371,
"grad_norm": 1.5,
"learning_rate": 5.892093898384017e-05,
"loss": 4.8167,
"mean_token_accuracy": 0.245925672352314,
"num_tokens": 19404140.0,
"step": 10675
},
{
"entropy": 5.713251829147339,
"epoch": 9.175333046841427,
"grad_norm": 1.4765625,
"learning_rate": 5.8828481254137276e-05,
"loss": 4.851,
"mean_token_accuracy": 0.24122422188520432,
"num_tokens": 19414481.0,
"step": 10680
},
{
"entropy": 5.554886150360107,
"epoch": 9.17963042544048,
"grad_norm": 1.5859375,
"learning_rate": 5.873649555662836e-05,
"loss": 4.6859,
"mean_token_accuracy": 0.2586899071931839,
"num_tokens": 19423259.0,
"step": 10685
},
{
"entropy": 5.676319408416748,
"epoch": 9.183927804039536,
"grad_norm": 1.4453125,
"learning_rate": 5.8644982092173335e-05,
"loss": 4.808,
"mean_token_accuracy": 0.25076107382774354,
"num_tokens": 19432011.0,
"step": 10690
},
{
"entropy": 5.654268789291382,
"epoch": 9.18822518263859,
"grad_norm": 1.421875,
"learning_rate": 5.85539410606011e-05,
"loss": 4.8157,
"mean_token_accuracy": 0.24737216532230377,
"num_tokens": 19441221.0,
"step": 10695
},
{
"entropy": 5.674529504776001,
"epoch": 9.192522561237645,
"grad_norm": 1.4765625,
"learning_rate": 5.8463372660708836e-05,
"loss": 4.7714,
"mean_token_accuracy": 0.2522480428218842,
"num_tokens": 19450014.0,
"step": 10700
},
{
"entropy": 5.687941074371338,
"epoch": 9.196819939836699,
"grad_norm": 1.453125,
"learning_rate": 5.837327709026171e-05,
"loss": 4.8721,
"mean_token_accuracy": 0.24091259837150575,
"num_tokens": 19459480.0,
"step": 10705
},
{
"entropy": 5.706489229202271,
"epoch": 9.201117318435754,
"grad_norm": 1.390625,
"learning_rate": 5.8283654545992464e-05,
"loss": 4.8697,
"mean_token_accuracy": 0.24455933570861815,
"num_tokens": 19469111.0,
"step": 10710
},
{
"entropy": 5.7112232685089115,
"epoch": 9.20541469703481,
"grad_norm": 1.5,
"learning_rate": 5.819450522360096e-05,
"loss": 4.8215,
"mean_token_accuracy": 0.24159085601568223,
"num_tokens": 19477531.0,
"step": 10715
},
{
"entropy": 5.709060525894165,
"epoch": 9.209712075633863,
"grad_norm": 1.5,
"learning_rate": 5.810582931775362e-05,
"loss": 4.8137,
"mean_token_accuracy": 0.24858906120061874,
"num_tokens": 19486041.0,
"step": 10720
},
{
"entropy": 5.681521844863892,
"epoch": 9.214009454232919,
"grad_norm": 1.4375,
"learning_rate": 5.801762702208317e-05,
"loss": 4.8478,
"mean_token_accuracy": 0.24738438427448273,
"num_tokens": 19495514.0,
"step": 10725
},
{
"entropy": 5.6257623672485355,
"epoch": 9.218306832831972,
"grad_norm": 1.4921875,
"learning_rate": 5.7929898529188215e-05,
"loss": 4.7684,
"mean_token_accuracy": 0.2537810802459717,
"num_tokens": 19503879.0,
"step": 10730
},
{
"entropy": 5.650828313827515,
"epoch": 9.222604211431028,
"grad_norm": 1.234375,
"learning_rate": 5.784264403063272e-05,
"loss": 4.7452,
"mean_token_accuracy": 0.2504738375544548,
"num_tokens": 19513922.0,
"step": 10735
},
{
"entropy": 5.654237222671509,
"epoch": 9.226901590030081,
"grad_norm": 1.5625,
"learning_rate": 5.775586371694561e-05,
"loss": 4.7951,
"mean_token_accuracy": 0.24916237890720366,
"num_tokens": 19522555.0,
"step": 10740
},
{
"entropy": 5.660638761520386,
"epoch": 9.231198968629137,
"grad_norm": 1.296875,
"learning_rate": 5.7669557777620376e-05,
"loss": 4.7902,
"mean_token_accuracy": 0.24715202003717424,
"num_tokens": 19531245.0,
"step": 10745
},
{
"entropy": 5.656718826293945,
"epoch": 9.23549634722819,
"grad_norm": 1.5,
"learning_rate": 5.7583726401114756e-05,
"loss": 4.767,
"mean_token_accuracy": 0.25462585389614106,
"num_tokens": 19540258.0,
"step": 10750
},
{
"entropy": 5.721525049209594,
"epoch": 9.239793725827246,
"grad_norm": 1.453125,
"learning_rate": 5.749836977485013e-05,
"loss": 4.8266,
"mean_token_accuracy": 0.24351436644792557,
"num_tokens": 19550071.0,
"step": 10755
},
{
"entropy": 5.650406169891357,
"epoch": 9.2440911044263,
"grad_norm": 1.3359375,
"learning_rate": 5.741348808521128e-05,
"loss": 4.7445,
"mean_token_accuracy": 0.2573240607976913,
"num_tokens": 19558972.0,
"step": 10760
},
{
"entropy": 5.714642190933228,
"epoch": 9.248388483025355,
"grad_norm": 1.4296875,
"learning_rate": 5.7329081517545846e-05,
"loss": 4.853,
"mean_token_accuracy": 0.2486440807580948,
"num_tokens": 19568270.0,
"step": 10765
},
{
"entropy": 5.722471761703491,
"epoch": 9.252685861624409,
"grad_norm": 1.703125,
"learning_rate": 5.724515025616409e-05,
"loss": 4.8096,
"mean_token_accuracy": 0.25049478709697726,
"num_tokens": 19576726.0,
"step": 10770
},
{
"entropy": 5.682512664794922,
"epoch": 9.256983240223464,
"grad_norm": 1.4140625,
"learning_rate": 5.716169448433832e-05,
"loss": 4.8121,
"mean_token_accuracy": 0.24451116025447844,
"num_tokens": 19586706.0,
"step": 10775
},
{
"entropy": 5.673455286026001,
"epoch": 9.261280618822518,
"grad_norm": 1.46875,
"learning_rate": 5.707871438430255e-05,
"loss": 4.8358,
"mean_token_accuracy": 0.24290886968374253,
"num_tokens": 19595370.0,
"step": 10780
},
{
"entropy": 5.656227445602417,
"epoch": 9.265577997421573,
"grad_norm": 1.5546875,
"learning_rate": 5.699621013725218e-05,
"loss": 4.8452,
"mean_token_accuracy": 0.25251403003931044,
"num_tokens": 19604134.0,
"step": 10785
},
{
"entropy": 5.617074394226075,
"epoch": 9.269875376020627,
"grad_norm": 1.390625,
"learning_rate": 5.691418192334352e-05,
"loss": 4.7802,
"mean_token_accuracy": 0.25588990449905397,
"num_tokens": 19612716.0,
"step": 10790
},
{
"entropy": 5.6477278709411625,
"epoch": 9.274172754619682,
"grad_norm": 1.421875,
"learning_rate": 5.683262992169341e-05,
"loss": 4.8938,
"mean_token_accuracy": 0.24226614087820053,
"num_tokens": 19621850.0,
"step": 10795
},
{
"entropy": 5.641665840148926,
"epoch": 9.278470133218736,
"grad_norm": 1.3828125,
"learning_rate": 5.675155431037876e-05,
"loss": 4.8183,
"mean_token_accuracy": 0.24731258451938629,
"num_tokens": 19630706.0,
"step": 10800
},
{
"entropy": 5.627617597579956,
"epoch": 9.282767511817791,
"grad_norm": 1.46875,
"learning_rate": 5.6670955266436365e-05,
"loss": 4.785,
"mean_token_accuracy": 0.2624928027391434,
"num_tokens": 19639364.0,
"step": 10805
},
{
"entropy": 5.707224988937378,
"epoch": 9.287064890416845,
"grad_norm": 1.421875,
"learning_rate": 5.659083296586229e-05,
"loss": 4.7588,
"mean_token_accuracy": 0.25203574150800706,
"num_tokens": 19647816.0,
"step": 10810
},
{
"entropy": 5.7143463611602785,
"epoch": 9.2913622690159,
"grad_norm": 1.3046875,
"learning_rate": 5.6511187583611663e-05,
"loss": 4.8642,
"mean_token_accuracy": 0.24791315644979478,
"num_tokens": 19657136.0,
"step": 10815
},
{
"entropy": 5.728357982635498,
"epoch": 9.295659647614954,
"grad_norm": 1.5,
"learning_rate": 5.643201929359809e-05,
"loss": 4.8286,
"mean_token_accuracy": 0.24645233452320098,
"num_tokens": 19665676.0,
"step": 10820
},
{
"entropy": 5.699741363525391,
"epoch": 9.29995702621401,
"grad_norm": 1.6171875,
"learning_rate": 5.635332826869353e-05,
"loss": 4.7941,
"mean_token_accuracy": 0.25159171521663665,
"num_tokens": 19674415.0,
"step": 10825
},
{
"entropy": 5.623746871948242,
"epoch": 9.304254404813063,
"grad_norm": 1.34375,
"learning_rate": 5.6275114680727716e-05,
"loss": 4.7987,
"mean_token_accuracy": 0.2530513867735863,
"num_tokens": 19684411.0,
"step": 10830
},
{
"entropy": 5.671674680709839,
"epoch": 9.308551783412119,
"grad_norm": 1.5625,
"learning_rate": 5.619737870048783e-05,
"loss": 4.7845,
"mean_token_accuracy": 0.2503496289253235,
"num_tokens": 19693192.0,
"step": 10835
},
{
"entropy": 5.609975337982178,
"epoch": 9.312849162011172,
"grad_norm": 1.4609375,
"learning_rate": 5.612012049771823e-05,
"loss": 4.6653,
"mean_token_accuracy": 0.2608976736664772,
"num_tokens": 19701710.0,
"step": 10840
},
{
"entropy": 5.756820487976074,
"epoch": 9.317146540610228,
"grad_norm": 1.2890625,
"learning_rate": 5.6043340241119924e-05,
"loss": 4.8882,
"mean_token_accuracy": 0.23862574100494385,
"num_tokens": 19710909.0,
"step": 10845
},
{
"entropy": 5.7104119777679445,
"epoch": 9.321443919209282,
"grad_norm": 1.3984375,
"learning_rate": 5.596703809835033e-05,
"loss": 4.838,
"mean_token_accuracy": 0.2501197576522827,
"num_tokens": 19720634.0,
"step": 10850
},
{
"entropy": 5.672194099426269,
"epoch": 9.325741297808337,
"grad_norm": 1.6484375,
"learning_rate": 5.589121423602277e-05,
"loss": 4.7236,
"mean_token_accuracy": 0.2551276534795761,
"num_tokens": 19729763.0,
"step": 10855
},
{
"entropy": 5.741184663772583,
"epoch": 9.33003867640739,
"grad_norm": 1.4375,
"learning_rate": 5.581586881970631e-05,
"loss": 4.8936,
"mean_token_accuracy": 0.23915671110153197,
"num_tokens": 19739065.0,
"step": 10860
},
{
"entropy": 5.611663722991944,
"epoch": 9.334336055006446,
"grad_norm": 1.390625,
"learning_rate": 5.574100201392522e-05,
"loss": 4.7179,
"mean_token_accuracy": 0.2580415040254593,
"num_tokens": 19748519.0,
"step": 10865
},
{
"entropy": 5.67966480255127,
"epoch": 9.338633433605501,
"grad_norm": 1.4609375,
"learning_rate": 5.5666613982158665e-05,
"loss": 4.7873,
"mean_token_accuracy": 0.2571543887257576,
"num_tokens": 19757573.0,
"step": 10870
},
{
"entropy": 5.705618810653687,
"epoch": 9.342930812204555,
"grad_norm": 1.4453125,
"learning_rate": 5.559270488684036e-05,
"loss": 4.9332,
"mean_token_accuracy": 0.2332558274269104,
"num_tokens": 19767534.0,
"step": 10875
},
{
"entropy": 5.721909713745117,
"epoch": 9.34722819080361,
"grad_norm": 1.328125,
"learning_rate": 5.551927488935826e-05,
"loss": 4.8759,
"mean_token_accuracy": 0.24375443458557128,
"num_tokens": 19776929.0,
"step": 10880
},
{
"entropy": 5.711270332336426,
"epoch": 9.351525569402664,
"grad_norm": 1.4765625,
"learning_rate": 5.5446324150054086e-05,
"loss": 4.8566,
"mean_token_accuracy": 0.24861591607332229,
"num_tokens": 19785772.0,
"step": 10885
},
{
"entropy": 5.6862475872039795,
"epoch": 9.35582294800172,
"grad_norm": 1.21875,
"learning_rate": 5.537385282822315e-05,
"loss": 4.8442,
"mean_token_accuracy": 0.24762384444475175,
"num_tokens": 19796073.0,
"step": 10890
},
{
"entropy": 5.656772804260254,
"epoch": 9.360120326600773,
"grad_norm": 1.6328125,
"learning_rate": 5.53018610821138e-05,
"loss": 4.8264,
"mean_token_accuracy": 0.24583270847797395,
"num_tokens": 19804511.0,
"step": 10895
},
{
"entropy": 5.72737684249878,
"epoch": 9.364417705199829,
"grad_norm": 1.46875,
"learning_rate": 5.523034906892728e-05,
"loss": 4.8799,
"mean_token_accuracy": 0.24223710894584655,
"num_tokens": 19813348.0,
"step": 10900
},
{
"entropy": 5.73786473274231,
"epoch": 9.368715083798882,
"grad_norm": 1.40625,
"learning_rate": 5.515931694481722e-05,
"loss": 4.8799,
"mean_token_accuracy": 0.2456185847520828,
"num_tokens": 19822390.0,
"step": 10905
},
{
"entropy": 5.784530544281006,
"epoch": 9.373012462397938,
"grad_norm": 1.3984375,
"learning_rate": 5.508876486488936e-05,
"loss": 4.8969,
"mean_token_accuracy": 0.23920056968927383,
"num_tokens": 19832423.0,
"step": 10910
},
{
"entropy": 5.673831272125244,
"epoch": 9.377309840996991,
"grad_norm": 1.5078125,
"learning_rate": 5.501869298320128e-05,
"loss": 4.8446,
"mean_token_accuracy": 0.24141159504652024,
"num_tokens": 19841271.0,
"step": 10915
},
{
"entropy": 5.695586442947388,
"epoch": 9.381607219596047,
"grad_norm": 1.5390625,
"learning_rate": 5.4949101452761995e-05,
"loss": 4.8397,
"mean_token_accuracy": 0.24960405081510545,
"num_tokens": 19850648.0,
"step": 10920
},
{
"entropy": 5.688299894332886,
"epoch": 9.3859045981951,
"grad_norm": 1.3203125,
"learning_rate": 5.4879990425531534e-05,
"loss": 4.7915,
"mean_token_accuracy": 0.2561185672879219,
"num_tokens": 19860102.0,
"step": 10925
},
{
"entropy": 5.704461908340454,
"epoch": 9.390201976794156,
"grad_norm": 1.28125,
"learning_rate": 5.4811360052420754e-05,
"loss": 4.8773,
"mean_token_accuracy": 0.24590488225221635,
"num_tokens": 19870008.0,
"step": 10930
},
{
"entropy": 5.692139482498169,
"epoch": 9.39449935539321,
"grad_norm": 1.234375,
"learning_rate": 5.4743210483290974e-05,
"loss": 4.797,
"mean_token_accuracy": 0.2533589258790016,
"num_tokens": 19879452.0,
"step": 10935
},
{
"entropy": 5.76347918510437,
"epoch": 9.398796733992265,
"grad_norm": 1.3203125,
"learning_rate": 5.467554186695364e-05,
"loss": 4.8825,
"mean_token_accuracy": 0.23813748657703399,
"num_tokens": 19888956.0,
"step": 10940
},
{
"entropy": 5.682995128631592,
"epoch": 9.403094112591319,
"grad_norm": 1.3671875,
"learning_rate": 5.4608354351169944e-05,
"loss": 4.8246,
"mean_token_accuracy": 0.24754057079553604,
"num_tokens": 19897760.0,
"step": 10945
},
{
"entropy": 5.63974142074585,
"epoch": 9.407391491190374,
"grad_norm": 1.5078125,
"learning_rate": 5.454164808265057e-05,
"loss": 4.8185,
"mean_token_accuracy": 0.2579930752515793,
"num_tokens": 19905692.0,
"step": 10950
},
{
"entropy": 5.645794248580932,
"epoch": 9.411688869789428,
"grad_norm": 1.4453125,
"learning_rate": 5.447542320705532e-05,
"loss": 4.747,
"mean_token_accuracy": 0.2527690351009369,
"num_tokens": 19914633.0,
"step": 10955
},
{
"entropy": 5.726321411132813,
"epoch": 9.415986248388483,
"grad_norm": 1.5234375,
"learning_rate": 5.440967986899289e-05,
"loss": 4.8597,
"mean_token_accuracy": 0.23715329468250274,
"num_tokens": 19922749.0,
"step": 10960
},
{
"entropy": 5.698509407043457,
"epoch": 9.420283626987537,
"grad_norm": 1.421875,
"learning_rate": 5.434441821202042e-05,
"loss": 4.7812,
"mean_token_accuracy": 0.25100332498550415,
"num_tokens": 19931274.0,
"step": 10965
},
{
"entropy": 5.700232362747192,
"epoch": 9.424581005586592,
"grad_norm": 1.4453125,
"learning_rate": 5.42796383786433e-05,
"loss": 4.8256,
"mean_token_accuracy": 0.24422383159399033,
"num_tokens": 19940052.0,
"step": 10970
},
{
"entropy": 5.680700397491455,
"epoch": 9.428878384185646,
"grad_norm": 1.34375,
"learning_rate": 5.4215340510314805e-05,
"loss": 4.8296,
"mean_token_accuracy": 0.24442802518606185,
"num_tokens": 19948890.0,
"step": 10975
},
{
"entropy": 5.670405435562134,
"epoch": 9.433175762784701,
"grad_norm": 1.2890625,
"learning_rate": 5.41515247474358e-05,
"loss": 4.8636,
"mean_token_accuracy": 0.24574528485536576,
"num_tokens": 19958158.0,
"step": 10980
},
{
"entropy": 5.7399359226226805,
"epoch": 9.437473141383755,
"grad_norm": 1.3203125,
"learning_rate": 5.4088191229354306e-05,
"loss": 4.9272,
"mean_token_accuracy": 0.23775207847356797,
"num_tokens": 19967432.0,
"step": 10985
},
{
"entropy": 5.652754831314087,
"epoch": 9.44177051998281,
"grad_norm": 1.4140625,
"learning_rate": 5.402534009436552e-05,
"loss": 4.8109,
"mean_token_accuracy": 0.24647348374128342,
"num_tokens": 19977661.0,
"step": 10990
},
{
"entropy": 5.724738121032715,
"epoch": 9.446067898581864,
"grad_norm": 1.4453125,
"learning_rate": 5.396297147971116e-05,
"loss": 4.8541,
"mean_token_accuracy": 0.2429043874144554,
"num_tokens": 19986359.0,
"step": 10995
},
{
"entropy": 5.687628936767578,
"epoch": 9.45036527718092,
"grad_norm": 1.4140625,
"learning_rate": 5.390108552157935e-05,
"loss": 4.7799,
"mean_token_accuracy": 0.2562565505504608,
"num_tokens": 19994618.0,
"step": 11000
},
{
"epoch": 9.45036527718092,
"eval_entropy": 5.519983804440713,
"eval_loss": 5.914631366729736,
"eval_mean_token_accuracy": 0.181345963582128,
"eval_num_tokens": 19994618.0,
"eval_runtime": 2.0395,
"eval_samples_per_second": 1740.091,
"eval_steps_per_second": 217.695,
"step": 11000
},
{
"entropy": 5.700768756866455,
"epoch": 9.454662655779973,
"grad_norm": 1.3984375,
"learning_rate": 5.383968235510427e-05,
"loss": 4.8818,
"mean_token_accuracy": 0.24662527143955232,
"num_tokens": 20004050.0,
"step": 11005
},
{
"entropy": 5.676410675048828,
"epoch": 9.458960034379029,
"grad_norm": 1.4140625,
"learning_rate": 5.377876211436592e-05,
"loss": 4.7963,
"mean_token_accuracy": 0.2501313492655754,
"num_tokens": 20013480.0,
"step": 11010
},
{
"entropy": 5.767210340499878,
"epoch": 9.463257412978084,
"grad_norm": 1.625,
"learning_rate": 5.371832493238973e-05,
"loss": 4.8837,
"mean_token_accuracy": 0.24043997526168823,
"num_tokens": 20022133.0,
"step": 11015
},
{
"entropy": 5.639940214157105,
"epoch": 9.467554791577138,
"grad_norm": 1.4765625,
"learning_rate": 5.365837094114639e-05,
"loss": 4.7717,
"mean_token_accuracy": 0.25986984223127363,
"num_tokens": 20031462.0,
"step": 11020
},
{
"entropy": 5.734659862518311,
"epoch": 9.471852170176193,
"grad_norm": 1.53125,
"learning_rate": 5.3598900271551396e-05,
"loss": 4.9184,
"mean_token_accuracy": 0.23817149847745894,
"num_tokens": 20042055.0,
"step": 11025
},
{
"entropy": 5.687342357635498,
"epoch": 9.476149548775247,
"grad_norm": 1.2421875,
"learning_rate": 5.353991305346499e-05,
"loss": 4.8289,
"mean_token_accuracy": 0.24456818401813507,
"num_tokens": 20051379.0,
"step": 11030
},
{
"entropy": 5.640593671798706,
"epoch": 9.480446927374302,
"grad_norm": 1.4296875,
"learning_rate": 5.348140941569165e-05,
"loss": 4.7622,
"mean_token_accuracy": 0.2537940502166748,
"num_tokens": 20060289.0,
"step": 11035
},
{
"entropy": 5.655428647994995,
"epoch": 9.484744305973356,
"grad_norm": 1.4140625,
"learning_rate": 5.342338948597989e-05,
"loss": 4.784,
"mean_token_accuracy": 0.24927219599485398,
"num_tokens": 20070133.0,
"step": 11040
},
{
"entropy": 5.703764390945435,
"epoch": 9.489041684572411,
"grad_norm": 1.453125,
"learning_rate": 5.336585339102209e-05,
"loss": 4.8228,
"mean_token_accuracy": 0.24505786895751952,
"num_tokens": 20078384.0,
"step": 11045
},
{
"entropy": 5.60769419670105,
"epoch": 9.493339063171465,
"grad_norm": 1.4609375,
"learning_rate": 5.33088012564541e-05,
"loss": 4.7463,
"mean_token_accuracy": 0.25497171729803086,
"num_tokens": 20087116.0,
"step": 11050
},
{
"entropy": 5.566410255432129,
"epoch": 9.49763644177052,
"grad_norm": 1.3984375,
"learning_rate": 5.3252233206854955e-05,
"loss": 4.7472,
"mean_token_accuracy": 0.2586297273635864,
"num_tokens": 20096918.0,
"step": 11055
},
{
"entropy": 5.694180583953857,
"epoch": 9.501933820369574,
"grad_norm": 1.40625,
"learning_rate": 5.3196149365746656e-05,
"loss": 4.8976,
"mean_token_accuracy": 0.24071943014860153,
"num_tokens": 20107602.0,
"step": 11060
},
{
"entropy": 5.6315919876098635,
"epoch": 9.50623119896863,
"grad_norm": 1.28125,
"learning_rate": 5.31405498555939e-05,
"loss": 4.7834,
"mean_token_accuracy": 0.2518941596150398,
"num_tokens": 20117732.0,
"step": 11065
},
{
"entropy": 5.755033922195435,
"epoch": 9.510528577567683,
"grad_norm": 1.3046875,
"learning_rate": 5.308543479780384e-05,
"loss": 4.9135,
"mean_token_accuracy": 0.23174715042114258,
"num_tokens": 20127765.0,
"step": 11070
},
{
"entropy": 5.71908802986145,
"epoch": 9.514825956166739,
"grad_norm": 1.4609375,
"learning_rate": 5.303080431272567e-05,
"loss": 4.8932,
"mean_token_accuracy": 0.23839702159166337,
"num_tokens": 20137135.0,
"step": 11075
},
{
"entropy": 5.6676170349121096,
"epoch": 9.519123334765792,
"grad_norm": 1.515625,
"learning_rate": 5.297665851965055e-05,
"loss": 4.8233,
"mean_token_accuracy": 0.24400498121976852,
"num_tokens": 20146156.0,
"step": 11080
},
{
"entropy": 5.718672800064087,
"epoch": 9.523420713364848,
"grad_norm": 1.6328125,
"learning_rate": 5.292299753681129e-05,
"loss": 4.8434,
"mean_token_accuracy": 0.24406409859657288,
"num_tokens": 20154433.0,
"step": 11085
},
{
"entropy": 5.699450731277466,
"epoch": 9.527718091963902,
"grad_norm": 1.3515625,
"learning_rate": 5.286982148138196e-05,
"loss": 4.8156,
"mean_token_accuracy": 0.24570093750953675,
"num_tokens": 20164209.0,
"step": 11090
},
{
"entropy": 5.677944612503052,
"epoch": 9.532015470562957,
"grad_norm": 1.453125,
"learning_rate": 5.281713046947787e-05,
"loss": 4.8291,
"mean_token_accuracy": 0.24473310708999635,
"num_tokens": 20173116.0,
"step": 11095
},
{
"entropy": 5.727268075942993,
"epoch": 9.53631284916201,
"grad_norm": 1.5,
"learning_rate": 5.2764924616155116e-05,
"loss": 4.9105,
"mean_token_accuracy": 0.2382511556148529,
"num_tokens": 20182933.0,
"step": 11100
},
{
"entropy": 5.695084810256958,
"epoch": 9.540610227761066,
"grad_norm": 1.640625,
"learning_rate": 5.271320403541038e-05,
"loss": 4.7879,
"mean_token_accuracy": 0.2528479963541031,
"num_tokens": 20191899.0,
"step": 11105
},
{
"entropy": 5.691361761093139,
"epoch": 9.54490760636012,
"grad_norm": 1.4375,
"learning_rate": 5.266196884018081e-05,
"loss": 4.8363,
"mean_token_accuracy": 0.24334924668073654,
"num_tokens": 20201108.0,
"step": 11110
},
{
"entropy": 5.672381114959717,
"epoch": 9.549204984959175,
"grad_norm": 1.453125,
"learning_rate": 5.2611219142343494e-05,
"loss": 4.8291,
"mean_token_accuracy": 0.2473324790596962,
"num_tokens": 20209137.0,
"step": 11115
},
{
"entropy": 5.614909601211548,
"epoch": 9.553502363558229,
"grad_norm": 1.3203125,
"learning_rate": 5.2560955052715574e-05,
"loss": 4.7257,
"mean_token_accuracy": 0.25412306040525434,
"num_tokens": 20217879.0,
"step": 11120
},
{
"entropy": 5.667946481704712,
"epoch": 9.557799742157284,
"grad_norm": 1.453125,
"learning_rate": 5.2511176681053704e-05,
"loss": 4.8239,
"mean_token_accuracy": 0.253523288667202,
"num_tokens": 20226783.0,
"step": 11125
},
{
"entropy": 5.680997562408447,
"epoch": 9.562097120756338,
"grad_norm": 1.4453125,
"learning_rate": 5.246188413605393e-05,
"loss": 4.8284,
"mean_token_accuracy": 0.24783779233694075,
"num_tokens": 20235008.0,
"step": 11130
},
{
"entropy": 5.670027446746826,
"epoch": 9.566394499355393,
"grad_norm": 1.4609375,
"learning_rate": 5.241307752535149e-05,
"loss": 4.7386,
"mean_token_accuracy": 0.2599273458123207,
"num_tokens": 20243157.0,
"step": 11135
},
{
"entropy": 5.671330451965332,
"epoch": 9.570691877954447,
"grad_norm": 1.390625,
"learning_rate": 5.236475695552052e-05,
"loss": 4.8014,
"mean_token_accuracy": 0.2514424681663513,
"num_tokens": 20251548.0,
"step": 11140
},
{
"entropy": 5.609284543991089,
"epoch": 9.574989256553502,
"grad_norm": 1.5546875,
"learning_rate": 5.2316922532073796e-05,
"loss": 4.7126,
"mean_token_accuracy": 0.25564580857753755,
"num_tokens": 20260954.0,
"step": 11145
},
{
"entropy": 5.748769378662109,
"epoch": 9.579286635152556,
"grad_norm": 1.4609375,
"learning_rate": 5.226957435946265e-05,
"loss": 4.9245,
"mean_token_accuracy": 0.23222365528345107,
"num_tokens": 20269976.0,
"step": 11150
},
{
"entropy": 5.697335195541382,
"epoch": 9.583584013751612,
"grad_norm": 1.4296875,
"learning_rate": 5.2222712541076464e-05,
"loss": 4.8832,
"mean_token_accuracy": 0.24263730496168137,
"num_tokens": 20279824.0,
"step": 11155
},
{
"entropy": 5.697223138809204,
"epoch": 9.587881392350667,
"grad_norm": 1.4296875,
"learning_rate": 5.217633717924282e-05,
"loss": 4.8165,
"mean_token_accuracy": 0.26576024740934373,
"num_tokens": 20289284.0,
"step": 11160
},
{
"entropy": 5.687585306167603,
"epoch": 9.59217877094972,
"grad_norm": 1.265625,
"learning_rate": 5.213044837522689e-05,
"loss": 4.7636,
"mean_token_accuracy": 0.25869722068309786,
"num_tokens": 20297905.0,
"step": 11165
},
{
"entropy": 5.646097898483276,
"epoch": 9.596476149548776,
"grad_norm": 1.4296875,
"learning_rate": 5.208504622923154e-05,
"loss": 4.7733,
"mean_token_accuracy": 0.2592957153916359,
"num_tokens": 20307219.0,
"step": 11170
},
{
"entropy": 5.678986930847168,
"epoch": 9.60077352814783,
"grad_norm": 1.4765625,
"learning_rate": 5.204013084039687e-05,
"loss": 4.8262,
"mean_token_accuracy": 0.2500856578350067,
"num_tokens": 20316369.0,
"step": 11175
},
{
"entropy": 5.659730243682861,
"epoch": 9.605070906746885,
"grad_norm": 1.515625,
"learning_rate": 5.199570230680017e-05,
"loss": 4.8122,
"mean_token_accuracy": 0.24366722404956817,
"num_tokens": 20325764.0,
"step": 11180
},
{
"entropy": 5.7436450004577635,
"epoch": 9.609368285345939,
"grad_norm": 1.40625,
"learning_rate": 5.19517607254556e-05,
"loss": 4.8856,
"mean_token_accuracy": 0.24137621819972993,
"num_tokens": 20334671.0,
"step": 11185
},
{
"entropy": 5.676516151428222,
"epoch": 9.613665663944994,
"grad_norm": 1.4375,
"learning_rate": 5.190830619231397e-05,
"loss": 4.8322,
"mean_token_accuracy": 0.24532379657030107,
"num_tokens": 20343585.0,
"step": 11190
},
{
"entropy": 5.654004716873169,
"epoch": 9.617963042544048,
"grad_norm": 1.4453125,
"learning_rate": 5.186533880226263e-05,
"loss": 4.7795,
"mean_token_accuracy": 0.25314432084560395,
"num_tokens": 20352569.0,
"step": 11195
},
{
"entropy": 5.690787410736084,
"epoch": 9.622260421143103,
"grad_norm": 1.5859375,
"learning_rate": 5.1822858649125197e-05,
"loss": 4.8357,
"mean_token_accuracy": 0.24498570412397386,
"num_tokens": 20361479.0,
"step": 11200
},
{
"entropy": 5.739130258560181,
"epoch": 9.626557799742157,
"grad_norm": 1.53125,
"learning_rate": 5.178086582566134e-05,
"loss": 4.8656,
"mean_token_accuracy": 0.24332756847143172,
"num_tokens": 20370286.0,
"step": 11205
},
{
"entropy": 5.674109697341919,
"epoch": 9.630855178341212,
"grad_norm": 1.4140625,
"learning_rate": 5.1739360423566596e-05,
"loss": 4.8558,
"mean_token_accuracy": 0.24196633249521254,
"num_tokens": 20379253.0,
"step": 11210
},
{
"entropy": 5.688290977478028,
"epoch": 9.635152556940266,
"grad_norm": 1.484375,
"learning_rate": 5.16983425334722e-05,
"loss": 4.7931,
"mean_token_accuracy": 0.2511351525783539,
"num_tokens": 20387796.0,
"step": 11215
},
{
"entropy": 5.651867961883545,
"epoch": 9.639449935539322,
"grad_norm": 1.453125,
"learning_rate": 5.1657812244944796e-05,
"loss": 4.7547,
"mean_token_accuracy": 0.24890174418687822,
"num_tokens": 20396209.0,
"step": 11220
},
{
"entropy": 5.746544456481933,
"epoch": 9.643747314138375,
"grad_norm": 1.4921875,
"learning_rate": 5.1617769646486344e-05,
"loss": 4.8977,
"mean_token_accuracy": 0.24893355071544648,
"num_tokens": 20405102.0,
"step": 11225
},
{
"entropy": 5.727382326126099,
"epoch": 9.64804469273743,
"grad_norm": 1.3125,
"learning_rate": 5.157821482553389e-05,
"loss": 4.9589,
"mean_token_accuracy": 0.2372208446264267,
"num_tokens": 20415126.0,
"step": 11230
},
{
"entropy": 5.709471321105957,
"epoch": 9.652342071336484,
"grad_norm": 1.5,
"learning_rate": 5.153914786845932e-05,
"loss": 4.8652,
"mean_token_accuracy": 0.24387197941541672,
"num_tokens": 20424166.0,
"step": 11235
},
{
"entropy": 5.604541635513305,
"epoch": 9.65663944993554,
"grad_norm": 1.65625,
"learning_rate": 5.1500568860569285e-05,
"loss": 4.69,
"mean_token_accuracy": 0.26510541439056395,
"num_tokens": 20431984.0,
"step": 11240
},
{
"entropy": 5.620431756973266,
"epoch": 9.660936828534593,
"grad_norm": 1.515625,
"learning_rate": 5.1462477886104904e-05,
"loss": 4.7552,
"mean_token_accuracy": 0.25696644335985186,
"num_tokens": 20440923.0,
"step": 11245
},
{
"entropy": 5.696271181106567,
"epoch": 9.665234207133649,
"grad_norm": 1.5625,
"learning_rate": 5.1424875028241625e-05,
"loss": 4.7763,
"mean_token_accuracy": 0.25067644417285917,
"num_tokens": 20449525.0,
"step": 11250
},
{
"entropy": 5.709213638305664,
"epoch": 9.669531585732702,
"grad_norm": 1.2890625,
"learning_rate": 5.138776036908911e-05,
"loss": 4.8629,
"mean_token_accuracy": 0.24346184432506562,
"num_tokens": 20459193.0,
"step": 11255
},
{
"entropy": 5.692313098907471,
"epoch": 9.673828964331758,
"grad_norm": 1.640625,
"learning_rate": 5.135113398969091e-05,
"loss": 4.7957,
"mean_token_accuracy": 0.24953748732805253,
"num_tokens": 20467564.0,
"step": 11260
},
{
"entropy": 5.742414760589599,
"epoch": 9.678126342930812,
"grad_norm": 1.4296875,
"learning_rate": 5.131499597002437e-05,
"loss": 4.9187,
"mean_token_accuracy": 0.2422913670539856,
"num_tokens": 20476573.0,
"step": 11265
},
{
"entropy": 5.697458744049072,
"epoch": 9.682423721529867,
"grad_norm": 1.4375,
"learning_rate": 5.12793463890005e-05,
"loss": 4.8581,
"mean_token_accuracy": 0.2450355052947998,
"num_tokens": 20486788.0,
"step": 11270
},
{
"entropy": 5.706341934204102,
"epoch": 9.68672110012892,
"grad_norm": 1.28125,
"learning_rate": 5.124418532446376e-05,
"loss": 4.8819,
"mean_token_accuracy": 0.23618692457675933,
"num_tokens": 20495982.0,
"step": 11275
},
{
"entropy": 5.747522449493408,
"epoch": 9.691018478727976,
"grad_norm": 1.3671875,
"learning_rate": 5.120951285319187e-05,
"loss": 4.9023,
"mean_token_accuracy": 0.23663830757141113,
"num_tokens": 20504678.0,
"step": 11280
},
{
"entropy": 5.654949140548706,
"epoch": 9.69531585732703,
"grad_norm": 1.4765625,
"learning_rate": 5.1175329050895584e-05,
"loss": 4.7411,
"mean_token_accuracy": 0.25393941402435305,
"num_tokens": 20513362.0,
"step": 11285
},
{
"entropy": 5.668262481689453,
"epoch": 9.699613235926085,
"grad_norm": 1.421875,
"learning_rate": 5.114163399221871e-05,
"loss": 4.8734,
"mean_token_accuracy": 0.24338336586952208,
"num_tokens": 20522280.0,
"step": 11290
},
{
"entropy": 5.652725791931152,
"epoch": 9.703910614525139,
"grad_norm": 1.546875,
"learning_rate": 5.110842775073778e-05,
"loss": 4.7787,
"mean_token_accuracy": 0.2520154297351837,
"num_tokens": 20531175.0,
"step": 11295
},
{
"entropy": 5.6531706809997555,
"epoch": 9.708207993124194,
"grad_norm": 1.3671875,
"learning_rate": 5.107571039896196e-05,
"loss": 4.8334,
"mean_token_accuracy": 0.243391315639019,
"num_tokens": 20540592.0,
"step": 11300
},
{
"entropy": 5.739127445220947,
"epoch": 9.71250537172325,
"grad_norm": 1.5234375,
"learning_rate": 5.1043482008332864e-05,
"loss": 4.9188,
"mean_token_accuracy": 0.2370056241750717,
"num_tokens": 20549963.0,
"step": 11305
},
{
"entropy": 5.664252233505249,
"epoch": 9.716802750322303,
"grad_norm": 1.3125,
"learning_rate": 5.1011742649224394e-05,
"loss": 4.8448,
"mean_token_accuracy": 0.25418578684329984,
"num_tokens": 20558928.0,
"step": 11310
},
{
"entropy": 5.649764680862427,
"epoch": 9.721100128921359,
"grad_norm": 1.5703125,
"learning_rate": 5.098049239094267e-05,
"loss": 4.7256,
"mean_token_accuracy": 0.2566137194633484,
"num_tokens": 20566483.0,
"step": 11315
},
{
"entropy": 5.719577884674072,
"epoch": 9.725397507520412,
"grad_norm": 1.4765625,
"learning_rate": 5.094973130172573e-05,
"loss": 4.8244,
"mean_token_accuracy": 0.24605006575584412,
"num_tokens": 20575349.0,
"step": 11320
},
{
"entropy": 5.73195366859436,
"epoch": 9.729694886119468,
"grad_norm": 1.515625,
"learning_rate": 5.09194594487435e-05,
"loss": 4.8509,
"mean_token_accuracy": 0.24537111073732376,
"num_tokens": 20584235.0,
"step": 11325
},
{
"entropy": 5.626504707336426,
"epoch": 9.733992264718522,
"grad_norm": 1.4921875,
"learning_rate": 5.088967689809763e-05,
"loss": 4.7733,
"mean_token_accuracy": 0.2508066087961197,
"num_tokens": 20593705.0,
"step": 11330
},
{
"entropy": 5.715359020233154,
"epoch": 9.738289643317577,
"grad_norm": 1.234375,
"learning_rate": 5.086038371482128e-05,
"loss": 4.8721,
"mean_token_accuracy": 0.2454403057694435,
"num_tokens": 20603886.0,
"step": 11335
},
{
"entropy": 5.561939764022827,
"epoch": 9.74258702191663,
"grad_norm": 1.3828125,
"learning_rate": 5.0831579962879074e-05,
"loss": 4.7021,
"mean_token_accuracy": 0.2648016601800919,
"num_tokens": 20613181.0,
"step": 11340
},
{
"entropy": 5.704611206054688,
"epoch": 9.746884400515686,
"grad_norm": 1.4609375,
"learning_rate": 5.080326570516686e-05,
"loss": 4.8074,
"mean_token_accuracy": 0.24797516465187072,
"num_tokens": 20620938.0,
"step": 11345
},
{
"entropy": 5.757068109512329,
"epoch": 9.75118177911474,
"grad_norm": 1.359375,
"learning_rate": 5.077544100351172e-05,
"loss": 4.9207,
"mean_token_accuracy": 0.23611319661140442,
"num_tokens": 20630992.0,
"step": 11350
},
{
"entropy": 5.7547034740448,
"epoch": 9.755479157713795,
"grad_norm": 1.515625,
"learning_rate": 5.0748105918671616e-05,
"loss": 4.9369,
"mean_token_accuracy": 0.24307719320058824,
"num_tokens": 20640543.0,
"step": 11355
},
{
"entropy": 5.700654411315918,
"epoch": 9.759776536312849,
"grad_norm": 1.3515625,
"learning_rate": 5.072126051033551e-05,
"loss": 4.839,
"mean_token_accuracy": 0.24246969372034072,
"num_tokens": 20649814.0,
"step": 11360
},
{
"entropy": 5.707252550125122,
"epoch": 9.764073914911904,
"grad_norm": 1.421875,
"learning_rate": 5.069490483712298e-05,
"loss": 4.8683,
"mean_token_accuracy": 0.24550638049840928,
"num_tokens": 20658215.0,
"step": 11365
},
{
"entropy": 5.68763427734375,
"epoch": 9.768371293510958,
"grad_norm": 1.3515625,
"learning_rate": 5.066903895658433e-05,
"loss": 4.8275,
"mean_token_accuracy": 0.2479804813861847,
"num_tokens": 20667412.0,
"step": 11370
},
{
"entropy": 5.6843091487884525,
"epoch": 9.772668672110013,
"grad_norm": 1.5078125,
"learning_rate": 5.064366292520028e-05,
"loss": 4.8523,
"mean_token_accuracy": 0.24432236552238465,
"num_tokens": 20676366.0,
"step": 11375
},
{
"entropy": 5.764643001556396,
"epoch": 9.776966050709067,
"grad_norm": 1.3046875,
"learning_rate": 5.061877679838192e-05,
"loss": 4.9333,
"mean_token_accuracy": 0.23031819313764573,
"num_tokens": 20686872.0,
"step": 11380
},
{
"entropy": 5.6893510818481445,
"epoch": 9.781263429308122,
"grad_norm": 1.515625,
"learning_rate": 5.059438063047066e-05,
"loss": 4.7921,
"mean_token_accuracy": 0.24938419461250305,
"num_tokens": 20695162.0,
"step": 11385
},
{
"entropy": 5.693837976455688,
"epoch": 9.785560807907176,
"grad_norm": 1.3515625,
"learning_rate": 5.057047447473796e-05,
"loss": 4.8276,
"mean_token_accuracy": 0.24814130067825318,
"num_tokens": 20703897.0,
"step": 11390
},
{
"entropy": 5.642933893203735,
"epoch": 9.789858186506232,
"grad_norm": 1.359375,
"learning_rate": 5.054705838338529e-05,
"loss": 4.7856,
"mean_token_accuracy": 0.2509089007973671,
"num_tokens": 20712115.0,
"step": 11395
},
{
"entropy": 5.627345848083496,
"epoch": 9.794155565105285,
"grad_norm": 1.390625,
"learning_rate": 5.052413240754404e-05,
"loss": 4.7816,
"mean_token_accuracy": 0.2502418413758278,
"num_tokens": 20721073.0,
"step": 11400
},
{
"entropy": 5.700538778305054,
"epoch": 9.79845294370434,
"grad_norm": 1.2734375,
"learning_rate": 5.0501696597275376e-05,
"loss": 4.8181,
"mean_token_accuracy": 0.24642034769058227,
"num_tokens": 20731370.0,
"step": 11405
},
{
"entropy": 5.7052887916564945,
"epoch": 9.802750322303394,
"grad_norm": 1.4765625,
"learning_rate": 5.047975100157018e-05,
"loss": 4.875,
"mean_token_accuracy": 0.24135247766971588,
"num_tokens": 20740744.0,
"step": 11410
},
{
"entropy": 5.6812944412231445,
"epoch": 9.80704770090245,
"grad_norm": 1.359375,
"learning_rate": 5.045829566834879e-05,
"loss": 4.8274,
"mean_token_accuracy": 0.2505713224411011,
"num_tokens": 20749838.0,
"step": 11415
},
{
"entropy": 5.681426382064819,
"epoch": 9.811345079501503,
"grad_norm": 1.2734375,
"learning_rate": 5.043733064446113e-05,
"loss": 4.7666,
"mean_token_accuracy": 0.2604792058467865,
"num_tokens": 20759689.0,
"step": 11420
},
{
"entropy": 5.679882431030274,
"epoch": 9.815642458100559,
"grad_norm": 1.5078125,
"learning_rate": 5.041685597568641e-05,
"loss": 4.7936,
"mean_token_accuracy": 0.2509396269917488,
"num_tokens": 20769234.0,
"step": 11425
},
{
"entropy": 5.685623359680176,
"epoch": 9.819939836699612,
"grad_norm": 1.3828125,
"learning_rate": 5.039687170673315e-05,
"loss": 4.8896,
"mean_token_accuracy": 0.24832461327314376,
"num_tokens": 20778794.0,
"step": 11430
},
{
"entropy": 5.697821807861328,
"epoch": 9.824237215298668,
"grad_norm": 1.3984375,
"learning_rate": 5.037737788123895e-05,
"loss": 4.8102,
"mean_token_accuracy": 0.24825302362442017,
"num_tokens": 20788327.0,
"step": 11435
},
{
"entropy": 5.70763258934021,
"epoch": 9.828534593897722,
"grad_norm": 1.3671875,
"learning_rate": 5.03583745417706e-05,
"loss": 4.8503,
"mean_token_accuracy": 0.2450679585337639,
"num_tokens": 20797161.0,
"step": 11440
},
{
"entropy": 5.759992170333862,
"epoch": 9.832831972496777,
"grad_norm": 1.3515625,
"learning_rate": 5.033986172982375e-05,
"loss": 4.9259,
"mean_token_accuracy": 0.23826471269130706,
"num_tokens": 20806264.0,
"step": 11445
},
{
"entropy": 5.697881078720092,
"epoch": 9.837129351095832,
"grad_norm": 1.40625,
"learning_rate": 5.0321839485823014e-05,
"loss": 4.8756,
"mean_token_accuracy": 0.23669908046722413,
"num_tokens": 20815569.0,
"step": 11450
},
{
"entropy": 5.753845596313477,
"epoch": 9.841426729694886,
"grad_norm": 1.46875,
"learning_rate": 5.030430784912177e-05,
"loss": 4.9454,
"mean_token_accuracy": 0.2340080052614212,
"num_tokens": 20825234.0,
"step": 11455
},
{
"entropy": 5.650054788589477,
"epoch": 9.845724108293942,
"grad_norm": 1.4765625,
"learning_rate": 5.0287266858002054e-05,
"loss": 4.8064,
"mean_token_accuracy": 0.24974256306886672,
"num_tokens": 20834601.0,
"step": 11460
},
{
"entropy": 5.71141095161438,
"epoch": 9.850021486892995,
"grad_norm": 1.2421875,
"learning_rate": 5.027071654967465e-05,
"loss": 4.8264,
"mean_token_accuracy": 0.2450408846139908,
"num_tokens": 20843686.0,
"step": 11465
},
{
"entropy": 5.699143743515014,
"epoch": 9.85431886549205,
"grad_norm": 1.421875,
"learning_rate": 5.025465696027875e-05,
"loss": 4.8253,
"mean_token_accuracy": 0.2528485044836998,
"num_tokens": 20852009.0,
"step": 11470
},
{
"entropy": 5.6895630836486815,
"epoch": 9.858616244091104,
"grad_norm": 1.453125,
"learning_rate": 5.023908812488211e-05,
"loss": 4.7985,
"mean_token_accuracy": 0.24806177914142608,
"num_tokens": 20861241.0,
"step": 11475
},
{
"entropy": 5.716805553436279,
"epoch": 9.86291362269016,
"grad_norm": 1.3203125,
"learning_rate": 5.022401007748087e-05,
"loss": 4.8403,
"mean_token_accuracy": 0.24400441944599152,
"num_tokens": 20869774.0,
"step": 11480
},
{
"entropy": 5.692715120315552,
"epoch": 9.867211001289213,
"grad_norm": 1.4609375,
"learning_rate": 5.0209422850999414e-05,
"loss": 4.8579,
"mean_token_accuracy": 0.2502797991037369,
"num_tokens": 20878844.0,
"step": 11485
},
{
"entropy": 5.6825096130371096,
"epoch": 9.871508379888269,
"grad_norm": 1.375,
"learning_rate": 5.019532647729046e-05,
"loss": 4.8553,
"mean_token_accuracy": 0.24498149901628494,
"num_tokens": 20888054.0,
"step": 11490
},
{
"entropy": 5.717511177062988,
"epoch": 9.875805758487322,
"grad_norm": 1.2265625,
"learning_rate": 5.0181720987134815e-05,
"loss": 4.8436,
"mean_token_accuracy": 0.24715185463428496,
"num_tokens": 20897752.0,
"step": 11495
},
{
"entropy": 5.584847450256348,
"epoch": 9.880103137086378,
"grad_norm": 1.578125,
"learning_rate": 5.016860641024143e-05,
"loss": 4.7288,
"mean_token_accuracy": 0.26042546331882477,
"num_tokens": 20906186.0,
"step": 11500
},
{
"epoch": 9.880103137086378,
"eval_entropy": 5.521090636382231,
"eval_loss": 5.915965557098389,
"eval_mean_token_accuracy": 0.18116104290694804,
"eval_num_tokens": 20906186.0,
"eval_runtime": 2.041,
"eval_samples_per_second": 1738.862,
"eval_steps_per_second": 217.542,
"step": 11500
},
{
"entropy": 5.728489732742309,
"epoch": 9.884400515685432,
"grad_norm": 1.515625,
"learning_rate": 5.01559827752473e-05,
"loss": 4.8631,
"mean_token_accuracy": 0.24299588054418564,
"num_tokens": 20915172.0,
"step": 11505
},
{
"entropy": 5.672415399551392,
"epoch": 9.888697894284487,
"grad_norm": 1.4765625,
"learning_rate": 5.0143850109717434e-05,
"loss": 4.7935,
"mean_token_accuracy": 0.250404292345047,
"num_tokens": 20923886.0,
"step": 11510
},
{
"entropy": 5.630904769897461,
"epoch": 9.89299527288354,
"grad_norm": 1.453125,
"learning_rate": 5.013220844014469e-05,
"loss": 4.7115,
"mean_token_accuracy": 0.25537188947200773,
"num_tokens": 20932216.0,
"step": 11515
},
{
"entropy": 5.6395348072052,
"epoch": 9.897292651482596,
"grad_norm": 1.296875,
"learning_rate": 5.012105779194985e-05,
"loss": 4.7814,
"mean_token_accuracy": 0.25184416174888613,
"num_tokens": 20941389.0,
"step": 11520
},
{
"entropy": 5.719944429397583,
"epoch": 9.90159003008165,
"grad_norm": 1.3984375,
"learning_rate": 5.011039818948144e-05,
"loss": 4.9307,
"mean_token_accuracy": 0.23785840719938278,
"num_tokens": 20950813.0,
"step": 11525
},
{
"entropy": 5.705527114868164,
"epoch": 9.905887408680705,
"grad_norm": 1.3515625,
"learning_rate": 5.010022965601579e-05,
"loss": 4.819,
"mean_token_accuracy": 0.24680709689855576,
"num_tokens": 20959298.0,
"step": 11530
},
{
"entropy": 5.6334069728851315,
"epoch": 9.910184787279759,
"grad_norm": 1.609375,
"learning_rate": 5.009055221375689e-05,
"loss": 4.8458,
"mean_token_accuracy": 0.24602019786834717,
"num_tokens": 20968400.0,
"step": 11535
},
{
"entropy": 5.660980987548828,
"epoch": 9.914482165878814,
"grad_norm": 1.5234375,
"learning_rate": 5.0081365883836436e-05,
"loss": 4.7802,
"mean_token_accuracy": 0.25230593234300613,
"num_tokens": 20977247.0,
"step": 11540
},
{
"entropy": 5.722903299331665,
"epoch": 9.918779544477868,
"grad_norm": 1.5859375,
"learning_rate": 5.0072670686313715e-05,
"loss": 4.8092,
"mean_token_accuracy": 0.24301313012838363,
"num_tokens": 20985363.0,
"step": 11545
},
{
"entropy": 5.727887153625488,
"epoch": 9.923076923076923,
"grad_norm": 1.4609375,
"learning_rate": 5.0064466640175496e-05,
"loss": 4.8932,
"mean_token_accuracy": 0.24074935913085938,
"num_tokens": 20994728.0,
"step": 11550
},
{
"entropy": 5.689137363433838,
"epoch": 9.927374301675977,
"grad_norm": 1.3515625,
"learning_rate": 5.0056753763336215e-05,
"loss": 4.7821,
"mean_token_accuracy": 0.24779497236013412,
"num_tokens": 21003801.0,
"step": 11555
},
{
"entropy": 5.666046047210694,
"epoch": 9.931671680275032,
"grad_norm": 1.515625,
"learning_rate": 5.0049532072637646e-05,
"loss": 4.7418,
"mean_token_accuracy": 0.2527583435177803,
"num_tokens": 21013245.0,
"step": 11560
},
{
"entropy": 5.6009501934051515,
"epoch": 9.935969058874086,
"grad_norm": 1.4765625,
"learning_rate": 5.004280158384913e-05,
"loss": 4.7287,
"mean_token_accuracy": 0.2534693717956543,
"num_tokens": 21021376.0,
"step": 11565
},
{
"entropy": 5.709240436553955,
"epoch": 9.940266437473142,
"grad_norm": 1.3359375,
"learning_rate": 5.0036562311667315e-05,
"loss": 4.8646,
"mean_token_accuracy": 0.24093369543552398,
"num_tokens": 21030538.0,
"step": 11570
},
{
"entropy": 5.739674472808838,
"epoch": 9.944563816072195,
"grad_norm": 1.265625,
"learning_rate": 5.0030814269716304e-05,
"loss": 4.8652,
"mean_token_accuracy": 0.23949900269508362,
"num_tokens": 21040274.0,
"step": 11575
},
{
"entropy": 5.688154888153076,
"epoch": 9.94886119467125,
"grad_norm": 1.4453125,
"learning_rate": 5.0025557470547544e-05,
"loss": 4.8694,
"mean_token_accuracy": 0.24162812381982804,
"num_tokens": 21049584.0,
"step": 11580
},
{
"entropy": 5.709632873535156,
"epoch": 9.953158573270304,
"grad_norm": 1.3984375,
"learning_rate": 5.002079192563973e-05,
"loss": 4.8675,
"mean_token_accuracy": 0.24503956884145736,
"num_tokens": 21058258.0,
"step": 11585
},
{
"entropy": 5.694040107727051,
"epoch": 9.95745595186936,
"grad_norm": 1.421875,
"learning_rate": 5.0016517645398954e-05,
"loss": 4.821,
"mean_token_accuracy": 0.24731273353099822,
"num_tokens": 21066897.0,
"step": 11590
},
{
"entropy": 5.683679294586182,
"epoch": 9.961753330468415,
"grad_norm": 1.3671875,
"learning_rate": 5.0012734639158515e-05,
"loss": 4.8031,
"mean_token_accuracy": 0.2501624494791031,
"num_tokens": 21075939.0,
"step": 11595
},
{
"entropy": 5.740120077133179,
"epoch": 9.966050709067469,
"grad_norm": 1.3515625,
"learning_rate": 5.0009442915178995e-05,
"loss": 4.8556,
"mean_token_accuracy": 0.2431507632136345,
"num_tokens": 21085073.0,
"step": 11600
},
{
"entropy": 5.734279441833496,
"epoch": 9.970348087666524,
"grad_norm": 1.4453125,
"learning_rate": 5.000664248064818e-05,
"loss": 4.9524,
"mean_token_accuracy": 0.23012969940900802,
"num_tokens": 21094195.0,
"step": 11605
},
{
"entropy": 5.663362836837768,
"epoch": 9.974645466265578,
"grad_norm": 1.21875,
"learning_rate": 5.000433334168114e-05,
"loss": 4.7975,
"mean_token_accuracy": 0.2483143299818039,
"num_tokens": 21103252.0,
"step": 11610
},
{
"entropy": 5.688148117065429,
"epoch": 9.978942844864633,
"grad_norm": 1.4375,
"learning_rate": 5.000251550332007e-05,
"loss": 4.844,
"mean_token_accuracy": 0.24331471771001817,
"num_tokens": 21113031.0,
"step": 11615
},
{
"entropy": 5.7469429016113285,
"epoch": 9.983240223463687,
"grad_norm": 1.328125,
"learning_rate": 5.000118896953443e-05,
"loss": 4.9617,
"mean_token_accuracy": 0.23067699521780013,
"num_tokens": 21123268.0,
"step": 11620
},
{
"entropy": 5.700086164474487,
"epoch": 9.987537602062742,
"grad_norm": 1.4375,
"learning_rate": 5.000035374322084e-05,
"loss": 4.8712,
"mean_token_accuracy": 0.2406125172972679,
"num_tokens": 21131352.0,
"step": 11625
},
{
"entropy": 5.728383874893188,
"epoch": 9.991834980661796,
"grad_norm": 1.6015625,
"learning_rate": 5.000000982620308e-05,
"loss": 4.8261,
"mean_token_accuracy": 0.2471036896109581,
"num_tokens": 21140296.0,
"step": 11630
}
],
"logging_steps": 5,
"max_steps": 11630,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4754071609896960.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}