Files
eus-latn-10mb-10mb_seed3407/checkpoint-500/trainer_state.json
ModelHub XC 213141ef9f 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/eus-latn-10mb-10mb_seed3407
Source: Original Platform
2026-06-28 05:31:17 +08:00

1046 lines
28 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.42973785990545765,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 10.742608070373535,
"epoch": 0.004297378599054577,
"grad_norm": 5.46875,
"learning_rate": 2e-06,
"loss": 10.7643,
"mean_token_accuracy": 7.587253348901868e-05,
"num_tokens": 10107.0,
"step": 5
},
{
"entropy": 10.742630290985108,
"epoch": 0.008594757198109154,
"grad_norm": 5.78125,
"learning_rate": 4.5e-06,
"loss": 10.7086,
"mean_token_accuracy": 0.0,
"num_tokens": 18391.0,
"step": 10
},
{
"entropy": 10.74263505935669,
"epoch": 0.01289213579716373,
"grad_norm": 5.3125,
"learning_rate": 7e-06,
"loss": 10.6888,
"mean_token_accuracy": 7.022471982054412e-05,
"num_tokens": 27061.0,
"step": 15
},
{
"entropy": 10.742604160308838,
"epoch": 0.017189514396218308,
"grad_norm": 6.0,
"learning_rate": 9.5e-06,
"loss": 10.6611,
"mean_token_accuracy": 0.0008422504703048617,
"num_tokens": 36339.0,
"step": 20
},
{
"entropy": 10.742517948150635,
"epoch": 0.021486892995272882,
"grad_norm": 4.75,
"learning_rate": 1.2e-05,
"loss": 10.5317,
"mean_token_accuracy": 0.02025789166800678,
"num_tokens": 45770.0,
"step": 25
},
{
"entropy": 10.741962242126466,
"epoch": 0.02578427159432746,
"grad_norm": 4.25,
"learning_rate": 1.4500000000000002e-05,
"loss": 10.399,
"mean_token_accuracy": 0.04876907132565975,
"num_tokens": 54575.0,
"step": 30
},
{
"entropy": 10.73945140838623,
"epoch": 0.030081650193382038,
"grad_norm": 3.15625,
"learning_rate": 1.7000000000000003e-05,
"loss": 10.3065,
"mean_token_accuracy": 0.0514072135090828,
"num_tokens": 66403.0,
"step": 35
},
{
"entropy": 10.730937385559082,
"epoch": 0.034379028792436615,
"grad_norm": 2.640625,
"learning_rate": 1.95e-05,
"loss": 10.0976,
"mean_token_accuracy": 0.05973539762198925,
"num_tokens": 76510.0,
"step": 40
},
{
"entropy": 10.715238952636719,
"epoch": 0.03867640739149119,
"grad_norm": 2.40625,
"learning_rate": 2.2e-05,
"loss": 9.9688,
"mean_token_accuracy": 0.05614017099142075,
"num_tokens": 84836.0,
"step": 45
},
{
"entropy": 10.702037715911866,
"epoch": 0.042973785990545764,
"grad_norm": 2.046875,
"learning_rate": 2.4500000000000003e-05,
"loss": 9.9015,
"mean_token_accuracy": 0.053829558193683624,
"num_tokens": 93197.0,
"step": 50
},
{
"entropy": 10.697910690307618,
"epoch": 0.047271164589600345,
"grad_norm": 2.40625,
"learning_rate": 2.7e-05,
"loss": 9.8366,
"mean_token_accuracy": 0.05843428298830986,
"num_tokens": 101546.0,
"step": 55
},
{
"entropy": 10.693470478057861,
"epoch": 0.05156854318865492,
"grad_norm": 1.9609375,
"learning_rate": 2.95e-05,
"loss": 9.8429,
"mean_token_accuracy": 0.0558084711432457,
"num_tokens": 111703.0,
"step": 60
},
{
"entropy": 10.680869865417481,
"epoch": 0.055865921787709494,
"grad_norm": 1.9453125,
"learning_rate": 3.2e-05,
"loss": 9.7131,
"mean_token_accuracy": 0.0589165486395359,
"num_tokens": 119894.0,
"step": 65
},
{
"entropy": 10.668927574157715,
"epoch": 0.060163300386764075,
"grad_norm": 1.9765625,
"learning_rate": 3.4500000000000005e-05,
"loss": 9.6682,
"mean_token_accuracy": 0.06148771904408932,
"num_tokens": 128885.0,
"step": 70
},
{
"entropy": 10.654484272003174,
"epoch": 0.06446067898581866,
"grad_norm": 1.953125,
"learning_rate": 3.7e-05,
"loss": 9.6297,
"mean_token_accuracy": 0.057728851959109304,
"num_tokens": 138106.0,
"step": 75
},
{
"entropy": 10.645826625823975,
"epoch": 0.06875805758487323,
"grad_norm": 1.9296875,
"learning_rate": 3.95e-05,
"loss": 9.5722,
"mean_token_accuracy": 0.058954347297549246,
"num_tokens": 146691.0,
"step": 80
},
{
"entropy": 10.637816619873046,
"epoch": 0.0730554361839278,
"grad_norm": 1.90625,
"learning_rate": 4.2000000000000004e-05,
"loss": 9.5126,
"mean_token_accuracy": 0.059067190065979956,
"num_tokens": 155792.0,
"step": 85
},
{
"entropy": 10.63103084564209,
"epoch": 0.07735281478298238,
"grad_norm": 1.7890625,
"learning_rate": 4.45e-05,
"loss": 9.5251,
"mean_token_accuracy": 0.0552229531109333,
"num_tokens": 166944.0,
"step": 90
},
{
"entropy": 10.616693305969239,
"epoch": 0.08165019338203695,
"grad_norm": 1.96875,
"learning_rate": 4.7000000000000004e-05,
"loss": 9.3423,
"mean_token_accuracy": 0.060124922543764114,
"num_tokens": 175303.0,
"step": 95
},
{
"entropy": 10.591300106048584,
"epoch": 0.08594757198109153,
"grad_norm": 1.8203125,
"learning_rate": 4.9500000000000004e-05,
"loss": 9.3133,
"mean_token_accuracy": 0.06174388714134693,
"num_tokens": 184708.0,
"step": 100
},
{
"entropy": 10.564336776733398,
"epoch": 0.09024495058014612,
"grad_norm": 1.7890625,
"learning_rate": 5.2e-05,
"loss": 9.2307,
"mean_token_accuracy": 0.0674959484487772,
"num_tokens": 193835.0,
"step": 105
},
{
"entropy": 10.52622423171997,
"epoch": 0.09454232917920069,
"grad_norm": 1.8828125,
"learning_rate": 5.45e-05,
"loss": 9.1379,
"mean_token_accuracy": 0.07480009235441684,
"num_tokens": 203344.0,
"step": 110
},
{
"entropy": 10.454349136352539,
"epoch": 0.09883970777825526,
"grad_norm": 1.6171875,
"learning_rate": 5.7e-05,
"loss": 9.1209,
"mean_token_accuracy": 0.06218625903129578,
"num_tokens": 213048.0,
"step": 115
},
{
"entropy": 10.415324211120605,
"epoch": 0.10313708637730984,
"grad_norm": 1.578125,
"learning_rate": 5.9499999999999996e-05,
"loss": 8.9306,
"mean_token_accuracy": 0.07533645890653133,
"num_tokens": 221784.0,
"step": 120
},
{
"entropy": 10.303644943237305,
"epoch": 0.10743446497636441,
"grad_norm": 1.4765625,
"learning_rate": 6.2e-05,
"loss": 8.8509,
"mean_token_accuracy": 0.07504003196954727,
"num_tokens": 230971.0,
"step": 125
},
{
"entropy": 10.209668159484863,
"epoch": 0.11173184357541899,
"grad_norm": 1.4296875,
"learning_rate": 6.450000000000001e-05,
"loss": 8.7412,
"mean_token_accuracy": 0.07478504739701748,
"num_tokens": 240524.0,
"step": 130
},
{
"entropy": 10.153745365142822,
"epoch": 0.11602922217447358,
"grad_norm": 1.3359375,
"learning_rate": 6.7e-05,
"loss": 8.6323,
"mean_token_accuracy": 0.07354197278618813,
"num_tokens": 249220.0,
"step": 135
},
{
"entropy": 10.068094253540039,
"epoch": 0.12032660077352815,
"grad_norm": 1.3125,
"learning_rate": 6.950000000000001e-05,
"loss": 8.61,
"mean_token_accuracy": 0.07049238979816437,
"num_tokens": 258934.0,
"step": 140
},
{
"entropy": 9.973960685729981,
"epoch": 0.12462397937258272,
"grad_norm": 1.2734375,
"learning_rate": 7.2e-05,
"loss": 8.4673,
"mean_token_accuracy": 0.07534252405166626,
"num_tokens": 267680.0,
"step": 145
},
{
"entropy": 9.815561103820801,
"epoch": 0.1289213579716373,
"grad_norm": 1.09375,
"learning_rate": 7.45e-05,
"loss": 8.3709,
"mean_token_accuracy": 0.07952065020799637,
"num_tokens": 276227.0,
"step": 150
},
{
"entropy": 9.66996259689331,
"epoch": 0.1332187365706919,
"grad_norm": 1.1875,
"learning_rate": 7.7e-05,
"loss": 8.2269,
"mean_token_accuracy": 0.08225171342492103,
"num_tokens": 286342.0,
"step": 155
},
{
"entropy": 9.510671615600586,
"epoch": 0.13751611516974646,
"grad_norm": 0.953125,
"learning_rate": 7.950000000000001e-05,
"loss": 8.1921,
"mean_token_accuracy": 0.0742720566689968,
"num_tokens": 294994.0,
"step": 160
},
{
"entropy": 9.346861934661865,
"epoch": 0.14181349376880104,
"grad_norm": 0.984375,
"learning_rate": 8.2e-05,
"loss": 8.113,
"mean_token_accuracy": 0.08004417940974236,
"num_tokens": 303882.0,
"step": 165
},
{
"entropy": 9.199288940429687,
"epoch": 0.1461108723678556,
"grad_norm": 0.9296875,
"learning_rate": 8.450000000000001e-05,
"loss": 8.0403,
"mean_token_accuracy": 0.07799897268414498,
"num_tokens": 312515.0,
"step": 170
},
{
"entropy": 8.978620052337646,
"epoch": 0.15040825096691018,
"grad_norm": 0.9375,
"learning_rate": 8.7e-05,
"loss": 7.9977,
"mean_token_accuracy": 0.07381256259977817,
"num_tokens": 320801.0,
"step": 175
},
{
"entropy": 8.861582374572754,
"epoch": 0.15470562956596476,
"grad_norm": 0.9765625,
"learning_rate": 8.95e-05,
"loss": 7.9642,
"mean_token_accuracy": 0.08192512467503547,
"num_tokens": 329382.0,
"step": 180
},
{
"entropy": 8.755144786834716,
"epoch": 0.15900300816501933,
"grad_norm": 0.9296875,
"learning_rate": 9.2e-05,
"loss": 7.9273,
"mean_token_accuracy": 0.07583913430571557,
"num_tokens": 337894.0,
"step": 185
},
{
"entropy": 8.582227611541748,
"epoch": 0.1633003867640739,
"grad_norm": 0.8984375,
"learning_rate": 9.45e-05,
"loss": 7.9012,
"mean_token_accuracy": 0.07614588961005211,
"num_tokens": 346380.0,
"step": 190
},
{
"entropy": 8.591823768615722,
"epoch": 0.16759776536312848,
"grad_norm": 0.9609375,
"learning_rate": 9.7e-05,
"loss": 7.9407,
"mean_token_accuracy": 0.07390806600451469,
"num_tokens": 356305.0,
"step": 195
},
{
"entropy": 8.515201950073243,
"epoch": 0.17189514396218306,
"grad_norm": 1.1328125,
"learning_rate": 9.95e-05,
"loss": 7.8901,
"mean_token_accuracy": 0.07247771993279457,
"num_tokens": 364899.0,
"step": 200
},
{
"entropy": 8.457213211059571,
"epoch": 0.17619252256123766,
"grad_norm": 0.93359375,
"learning_rate": 0.000102,
"loss": 7.8566,
"mean_token_accuracy": 0.0781160645186901,
"num_tokens": 373663.0,
"step": 205
},
{
"entropy": 8.381179523468017,
"epoch": 0.18048990116029223,
"grad_norm": 0.95703125,
"learning_rate": 0.00010449999999999999,
"loss": 7.8221,
"mean_token_accuracy": 0.07758632972836495,
"num_tokens": 382730.0,
"step": 210
},
{
"entropy": 8.390653896331788,
"epoch": 0.1847872797593468,
"grad_norm": 0.921875,
"learning_rate": 0.000107,
"loss": 7.8622,
"mean_token_accuracy": 0.071787304058671,
"num_tokens": 392676.0,
"step": 215
},
{
"entropy": 8.255177211761474,
"epoch": 0.18908465835840138,
"grad_norm": 1.1015625,
"learning_rate": 0.0001095,
"loss": 7.8473,
"mean_token_accuracy": 0.08185218423604965,
"num_tokens": 401050.0,
"step": 220
},
{
"entropy": 8.367721462249756,
"epoch": 0.19338203695745596,
"grad_norm": 0.796875,
"learning_rate": 0.000112,
"loss": 7.795,
"mean_token_accuracy": 0.07991239950060844,
"num_tokens": 410009.0,
"step": 225
},
{
"entropy": 8.268333339691162,
"epoch": 0.19767941555651053,
"grad_norm": 0.859375,
"learning_rate": 0.0001145,
"loss": 7.7757,
"mean_token_accuracy": 0.08171008005738259,
"num_tokens": 419302.0,
"step": 230
},
{
"entropy": 8.304029846191407,
"epoch": 0.2019767941555651,
"grad_norm": 0.984375,
"learning_rate": 0.00011700000000000001,
"loss": 7.6812,
"mean_token_accuracy": 0.08820762410759926,
"num_tokens": 427296.0,
"step": 235
},
{
"entropy": 8.16576337814331,
"epoch": 0.20627417275461968,
"grad_norm": 0.91796875,
"learning_rate": 0.00011949999999999999,
"loss": 7.8198,
"mean_token_accuracy": 0.07870872803032399,
"num_tokens": 436368.0,
"step": 240
},
{
"entropy": 8.189785575866699,
"epoch": 0.21057155135367425,
"grad_norm": 1.28125,
"learning_rate": 0.000122,
"loss": 7.7389,
"mean_token_accuracy": 0.08551637679338456,
"num_tokens": 445535.0,
"step": 245
},
{
"entropy": 8.265625381469727,
"epoch": 0.21486892995272883,
"grad_norm": 0.8671875,
"learning_rate": 0.0001245,
"loss": 7.7093,
"mean_token_accuracy": 0.07919453792273998,
"num_tokens": 454769.0,
"step": 250
},
{
"entropy": 8.1545090675354,
"epoch": 0.2191663085517834,
"grad_norm": 0.93359375,
"learning_rate": 0.000127,
"loss": 7.7315,
"mean_token_accuracy": 0.0871740497648716,
"num_tokens": 463975.0,
"step": 255
},
{
"entropy": 8.13952112197876,
"epoch": 0.22346368715083798,
"grad_norm": 0.88671875,
"learning_rate": 0.0001295,
"loss": 7.726,
"mean_token_accuracy": 0.08799278363585472,
"num_tokens": 472899.0,
"step": 260
},
{
"entropy": 8.196070003509522,
"epoch": 0.22776106574989258,
"grad_norm": 0.93359375,
"learning_rate": 0.000132,
"loss": 7.7354,
"mean_token_accuracy": 0.08013860881328583,
"num_tokens": 481556.0,
"step": 265
},
{
"entropy": 8.114658737182618,
"epoch": 0.23205844434894715,
"grad_norm": 0.91015625,
"learning_rate": 0.00013450000000000002,
"loss": 7.7023,
"mean_token_accuracy": 0.0854449674487114,
"num_tokens": 490253.0,
"step": 270
},
{
"entropy": 8.193334579467773,
"epoch": 0.23635582294800173,
"grad_norm": 1.09375,
"learning_rate": 0.00013700000000000002,
"loss": 7.7066,
"mean_token_accuracy": 0.0806311085820198,
"num_tokens": 498444.0,
"step": 275
},
{
"entropy": 8.104936504364014,
"epoch": 0.2406532015470563,
"grad_norm": 0.8046875,
"learning_rate": 0.0001395,
"loss": 7.6467,
"mean_token_accuracy": 0.08675235286355018,
"num_tokens": 508330.0,
"step": 280
},
{
"entropy": 8.113396596908569,
"epoch": 0.24495058014611087,
"grad_norm": 1.015625,
"learning_rate": 0.00014199999999999998,
"loss": 7.7405,
"mean_token_accuracy": 0.08165572881698609,
"num_tokens": 517900.0,
"step": 285
},
{
"entropy": 8.046846723556518,
"epoch": 0.24924795874516545,
"grad_norm": 0.93359375,
"learning_rate": 0.0001445,
"loss": 7.6901,
"mean_token_accuracy": 0.08230286985635757,
"num_tokens": 527808.0,
"step": 290
},
{
"entropy": 8.13338761329651,
"epoch": 0.25354533734422,
"grad_norm": 0.8984375,
"learning_rate": 0.000147,
"loss": 7.6711,
"mean_token_accuracy": 0.08156475871801376,
"num_tokens": 536931.0,
"step": 295
},
{
"entropy": 8.18837013244629,
"epoch": 0.2578427159432746,
"grad_norm": 1.1875,
"learning_rate": 0.0001495,
"loss": 7.7049,
"mean_token_accuracy": 0.0835341140627861,
"num_tokens": 545758.0,
"step": 300
},
{
"entropy": 8.025089168548584,
"epoch": 0.26214009454232917,
"grad_norm": 0.9921875,
"learning_rate": 0.000152,
"loss": 7.7131,
"mean_token_accuracy": 0.08242038711905479,
"num_tokens": 555165.0,
"step": 305
},
{
"entropy": 8.155539417266846,
"epoch": 0.2664374731413838,
"grad_norm": 0.86328125,
"learning_rate": 0.00015450000000000001,
"loss": 7.6144,
"mean_token_accuracy": 0.08789716809988021,
"num_tokens": 564719.0,
"step": 310
},
{
"entropy": 8.041153383255004,
"epoch": 0.2707348517404383,
"grad_norm": 1.0,
"learning_rate": 0.000157,
"loss": 7.594,
"mean_token_accuracy": 0.09155945181846618,
"num_tokens": 573572.0,
"step": 315
},
{
"entropy": 8.15259666442871,
"epoch": 0.2750322303394929,
"grad_norm": 1.0859375,
"learning_rate": 0.0001595,
"loss": 7.7634,
"mean_token_accuracy": 0.08318910300731659,
"num_tokens": 581497.0,
"step": 320
},
{
"entropy": 8.100253248214722,
"epoch": 0.27932960893854747,
"grad_norm": 1.125,
"learning_rate": 0.000162,
"loss": 7.6118,
"mean_token_accuracy": 0.08767011985182763,
"num_tokens": 591107.0,
"step": 325
},
{
"entropy": 7.984478855133057,
"epoch": 0.28362698753760207,
"grad_norm": 0.84765625,
"learning_rate": 0.00016450000000000001,
"loss": 7.6456,
"mean_token_accuracy": 0.08353794142603874,
"num_tokens": 600241.0,
"step": 330
},
{
"entropy": 8.057686376571656,
"epoch": 0.2879243661366566,
"grad_norm": 0.91796875,
"learning_rate": 0.00016700000000000002,
"loss": 7.5776,
"mean_token_accuracy": 0.08751234114170074,
"num_tokens": 608697.0,
"step": 335
},
{
"entropy": 8.016141748428344,
"epoch": 0.2922217447357112,
"grad_norm": 0.9453125,
"learning_rate": 0.00016950000000000003,
"loss": 7.568,
"mean_token_accuracy": 0.09023259431123734,
"num_tokens": 617275.0,
"step": 340
},
{
"entropy": 8.084819841384888,
"epoch": 0.29651912333476577,
"grad_norm": 0.8984375,
"learning_rate": 0.00017199999999999998,
"loss": 7.6405,
"mean_token_accuracy": 0.08630914464592934,
"num_tokens": 626644.0,
"step": 345
},
{
"entropy": 8.008595705032349,
"epoch": 0.30081650193382037,
"grad_norm": 0.98828125,
"learning_rate": 0.00017449999999999999,
"loss": 7.5665,
"mean_token_accuracy": 0.08766811862587928,
"num_tokens": 635110.0,
"step": 350
},
{
"entropy": 8.04712610244751,
"epoch": 0.30511388053287497,
"grad_norm": 0.87109375,
"learning_rate": 0.000177,
"loss": 7.7031,
"mean_token_accuracy": 0.08570141717791557,
"num_tokens": 644746.0,
"step": 355
},
{
"entropy": 8.179811954498291,
"epoch": 0.3094112591319295,
"grad_norm": 1.1015625,
"learning_rate": 0.0001795,
"loss": 7.5831,
"mean_token_accuracy": 0.08595824986696243,
"num_tokens": 654281.0,
"step": 360
},
{
"entropy": 7.987443113327027,
"epoch": 0.3137086377309841,
"grad_norm": 1.203125,
"learning_rate": 0.000182,
"loss": 7.585,
"mean_token_accuracy": 0.09283285215497017,
"num_tokens": 663174.0,
"step": 365
},
{
"entropy": 7.916810417175293,
"epoch": 0.31800601633003867,
"grad_norm": 0.90625,
"learning_rate": 0.0001845,
"loss": 7.511,
"mean_token_accuracy": 0.08863886222243308,
"num_tokens": 672178.0,
"step": 370
},
{
"entropy": 8.005489206314087,
"epoch": 0.32230339492909327,
"grad_norm": 0.96484375,
"learning_rate": 0.000187,
"loss": 7.5218,
"mean_token_accuracy": 0.09131815880537034,
"num_tokens": 681323.0,
"step": 375
},
{
"entropy": 7.9803643226623535,
"epoch": 0.3266007735281478,
"grad_norm": 0.890625,
"learning_rate": 0.0001895,
"loss": 7.4406,
"mean_token_accuracy": 0.08985799476504326,
"num_tokens": 690461.0,
"step": 380
},
{
"entropy": 7.829833698272705,
"epoch": 0.3308981521272024,
"grad_norm": 1.046875,
"learning_rate": 0.000192,
"loss": 7.5004,
"mean_token_accuracy": 0.08490158319473266,
"num_tokens": 699199.0,
"step": 385
},
{
"entropy": 8.038139152526856,
"epoch": 0.33519553072625696,
"grad_norm": 1.1484375,
"learning_rate": 0.0001945,
"loss": 7.4484,
"mean_token_accuracy": 0.09670188426971435,
"num_tokens": 707949.0,
"step": 390
},
{
"entropy": 7.9735198497772215,
"epoch": 0.33949290932531156,
"grad_norm": 1.203125,
"learning_rate": 0.00019700000000000002,
"loss": 7.5219,
"mean_token_accuracy": 0.08999367579817771,
"num_tokens": 715752.0,
"step": 395
},
{
"entropy": 7.93391604423523,
"epoch": 0.3437902879243661,
"grad_norm": 1.1171875,
"learning_rate": 0.00019950000000000002,
"loss": 7.4479,
"mean_token_accuracy": 0.0979436494410038,
"num_tokens": 724416.0,
"step": 400
},
{
"entropy": 7.925309085845948,
"epoch": 0.3480876665234207,
"grad_norm": 1.0546875,
"learning_rate": 0.000202,
"loss": 7.4953,
"mean_token_accuracy": 0.09031900316476822,
"num_tokens": 733116.0,
"step": 405
},
{
"entropy": 7.916099977493286,
"epoch": 0.3523850451224753,
"grad_norm": 1.0625,
"learning_rate": 0.00020449999999999998,
"loss": 7.4726,
"mean_token_accuracy": 0.09227924942970275,
"num_tokens": 742093.0,
"step": 410
},
{
"entropy": 7.918701934814453,
"epoch": 0.35668242372152986,
"grad_norm": 1.046875,
"learning_rate": 0.000207,
"loss": 7.4649,
"mean_token_accuracy": 0.09618089124560356,
"num_tokens": 750402.0,
"step": 415
},
{
"entropy": 7.816703271865845,
"epoch": 0.36097980232058446,
"grad_norm": 0.9140625,
"learning_rate": 0.0002095,
"loss": 7.4336,
"mean_token_accuracy": 0.09461462944746017,
"num_tokens": 760961.0,
"step": 420
},
{
"entropy": 7.944287586212158,
"epoch": 0.365277180919639,
"grad_norm": 1.0390625,
"learning_rate": 0.000212,
"loss": 7.4865,
"mean_token_accuracy": 0.09455274268984795,
"num_tokens": 770554.0,
"step": 425
},
{
"entropy": 7.750526332855225,
"epoch": 0.3695745595186936,
"grad_norm": 1.03125,
"learning_rate": 0.0002145,
"loss": 7.4618,
"mean_token_accuracy": 0.09681151732802391,
"num_tokens": 779172.0,
"step": 430
},
{
"entropy": 7.9787256717681885,
"epoch": 0.37387193811774816,
"grad_norm": 0.984375,
"learning_rate": 0.00021700000000000002,
"loss": 7.5123,
"mean_token_accuracy": 0.08840151131153107,
"num_tokens": 788040.0,
"step": 435
},
{
"entropy": 7.883750295639038,
"epoch": 0.37816931671680276,
"grad_norm": 1.109375,
"learning_rate": 0.0002195,
"loss": 7.4135,
"mean_token_accuracy": 0.0939902700483799,
"num_tokens": 796786.0,
"step": 440
},
{
"entropy": 7.851776885986328,
"epoch": 0.3824666953158573,
"grad_norm": 1.09375,
"learning_rate": 0.000222,
"loss": 7.4233,
"mean_token_accuracy": 0.0923767201602459,
"num_tokens": 805520.0,
"step": 445
},
{
"entropy": 7.805376100540161,
"epoch": 0.3867640739149119,
"grad_norm": 1.1484375,
"learning_rate": 0.0002245,
"loss": 7.3508,
"mean_token_accuracy": 0.09647825658321381,
"num_tokens": 814939.0,
"step": 450
},
{
"entropy": 7.874559307098389,
"epoch": 0.39106145251396646,
"grad_norm": 1.2265625,
"learning_rate": 0.00022700000000000002,
"loss": 7.3531,
"mean_token_accuracy": 0.09795481041073799,
"num_tokens": 823862.0,
"step": 455
},
{
"entropy": 7.7626677513122555,
"epoch": 0.39535883111302106,
"grad_norm": 1.1328125,
"learning_rate": 0.00022950000000000002,
"loss": 7.3918,
"mean_token_accuracy": 0.09068166017532349,
"num_tokens": 832820.0,
"step": 460
},
{
"entropy": 7.928297901153565,
"epoch": 0.39965620971207566,
"grad_norm": 1.1171875,
"learning_rate": 0.00023200000000000003,
"loss": 7.3494,
"mean_token_accuracy": 0.09501236006617546,
"num_tokens": 841538.0,
"step": 465
},
{
"entropy": 7.7496504306793215,
"epoch": 0.4039535883111302,
"grad_norm": 0.99609375,
"learning_rate": 0.00023449999999999998,
"loss": 7.4626,
"mean_token_accuracy": 0.09104103595018387,
"num_tokens": 851123.0,
"step": 470
},
{
"entropy": 7.8953351974487305,
"epoch": 0.4082509669101848,
"grad_norm": 1.125,
"learning_rate": 0.000237,
"loss": 7.4266,
"mean_token_accuracy": 0.09596899375319481,
"num_tokens": 860357.0,
"step": 475
},
{
"entropy": 7.76341495513916,
"epoch": 0.41254834550923936,
"grad_norm": 1.0703125,
"learning_rate": 0.0002395,
"loss": 7.3425,
"mean_token_accuracy": 0.09861095696687698,
"num_tokens": 869980.0,
"step": 480
},
{
"entropy": 7.82184157371521,
"epoch": 0.41684572410829396,
"grad_norm": 1.03125,
"learning_rate": 0.000242,
"loss": 7.2999,
"mean_token_accuracy": 0.10065284445881843,
"num_tokens": 878250.0,
"step": 485
},
{
"entropy": 7.76347074508667,
"epoch": 0.4211431027073485,
"grad_norm": 1.25,
"learning_rate": 0.0002445,
"loss": 7.4007,
"mean_token_accuracy": 0.095355936139822,
"num_tokens": 887624.0,
"step": 490
},
{
"entropy": 7.753844261169434,
"epoch": 0.4254404813064031,
"grad_norm": 1.1484375,
"learning_rate": 0.000247,
"loss": 7.3568,
"mean_token_accuracy": 0.09853926301002502,
"num_tokens": 897120.0,
"step": 495
},
{
"entropy": 7.802051830291748,
"epoch": 0.42973785990545765,
"grad_norm": 1.03125,
"learning_rate": 0.0002495,
"loss": 7.3179,
"mean_token_accuracy": 0.10127250477671623,
"num_tokens": 906215.0,
"step": 500
},
{
"epoch": 0.42973785990545765,
"eval_entropy": 7.412716417699246,
"eval_loss": 7.3790483474731445,
"eval_mean_token_accuracy": 0.09986981684929347,
"eval_num_tokens": 906215.0,
"eval_runtime": 2.0966,
"eval_samples_per_second": 1692.736,
"eval_steps_per_second": 211.771,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 11630,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 204362498211840.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}