Files
swa-latn-10mb-ppt-Dp-10mb_s…/checkpoint-500/trainer_state.json

1035 lines
27 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.011172310545543924,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 10.7426682472229,
"epoch": 0.00011172310545543924,
"grad_norm": 6.25,
"learning_rate": 2e-06,
"loss": 10.5231,
"mean_token_accuracy": 0.0,
"num_tokens": 4250.0,
"step": 5
},
{
"entropy": 10.742681884765625,
"epoch": 0.00022344621091087847,
"grad_norm": 7.03125,
"learning_rate": 4.5e-06,
"loss": 10.4891,
"mean_token_accuracy": 0.0,
"num_tokens": 8228.0,
"step": 10
},
{
"entropy": 10.742702770233155,
"epoch": 0.0003351693163663177,
"grad_norm": 6.8125,
"learning_rate": 7e-06,
"loss": 10.4445,
"mean_token_accuracy": 0.0003105590119957924,
"num_tokens": 12209.0,
"step": 15
},
{
"entropy": 10.742706871032714,
"epoch": 0.00044689242182175694,
"grad_norm": 6.15625,
"learning_rate": 9.5e-06,
"loss": 10.3987,
"mean_token_accuracy": 0.0004866180010139942,
"num_tokens": 16225.0,
"step": 20
},
{
"entropy": 10.742706108093262,
"epoch": 0.0005586155272771962,
"grad_norm": 5.53125,
"learning_rate": 1.2e-05,
"loss": 10.2787,
"mean_token_accuracy": 0.010022059944458307,
"num_tokens": 20212.0,
"step": 25
},
{
"entropy": 10.742425346374512,
"epoch": 0.0006703386327326354,
"grad_norm": 5.0,
"learning_rate": 1.4500000000000002e-05,
"loss": 10.173,
"mean_token_accuracy": 0.04117634426802397,
"num_tokens": 24445.0,
"step": 30
},
{
"entropy": 10.741415786743165,
"epoch": 0.0007820617381880747,
"grad_norm": 4.3125,
"learning_rate": 1.7000000000000003e-05,
"loss": 9.9901,
"mean_token_accuracy": 0.052767305821180346,
"num_tokens": 28365.0,
"step": 35
},
{
"entropy": 10.739409732818604,
"epoch": 0.0008937848436435139,
"grad_norm": 3.578125,
"learning_rate": 1.95e-05,
"loss": 9.9621,
"mean_token_accuracy": 0.052475782483816145,
"num_tokens": 33055.0,
"step": 40
},
{
"entropy": 10.7363787651062,
"epoch": 0.0010055079490989532,
"grad_norm": 2.9375,
"learning_rate": 2.2e-05,
"loss": 9.811,
"mean_token_accuracy": 0.062037082761526106,
"num_tokens": 37599.0,
"step": 45
},
{
"entropy": 10.733420372009277,
"epoch": 0.0011172310545543925,
"grad_norm": 2.59375,
"learning_rate": 2.4500000000000003e-05,
"loss": 9.6744,
"mean_token_accuracy": 0.06838746592402459,
"num_tokens": 41934.0,
"step": 50
},
{
"entropy": 10.731625938415528,
"epoch": 0.0012289541600098315,
"grad_norm": 2.53125,
"learning_rate": 2.7e-05,
"loss": 9.6365,
"mean_token_accuracy": 0.05915887728333473,
"num_tokens": 46178.0,
"step": 55
},
{
"entropy": 10.729843425750733,
"epoch": 0.0013406772654652708,
"grad_norm": 2.5,
"learning_rate": 2.95e-05,
"loss": 9.5494,
"mean_token_accuracy": 0.0692246112972498,
"num_tokens": 50513.0,
"step": 60
},
{
"entropy": 10.727421474456786,
"epoch": 0.0014524003709207101,
"grad_norm": 2.46875,
"learning_rate": 3.2e-05,
"loss": 9.5028,
"mean_token_accuracy": 0.0702465757727623,
"num_tokens": 54924.0,
"step": 65
},
{
"entropy": 10.722854518890381,
"epoch": 0.0015641234763761494,
"grad_norm": 2.46875,
"learning_rate": 3.4500000000000005e-05,
"loss": 9.4107,
"mean_token_accuracy": 0.06344567574560642,
"num_tokens": 59083.0,
"step": 70
},
{
"entropy": 10.71631669998169,
"epoch": 0.0016758465818315885,
"grad_norm": 2.65625,
"learning_rate": 3.7e-05,
"loss": 9.3233,
"mean_token_accuracy": 0.06774163469672204,
"num_tokens": 63324.0,
"step": 75
},
{
"entropy": 10.705671787261963,
"epoch": 0.0017875696872870278,
"grad_norm": 2.34375,
"learning_rate": 3.95e-05,
"loss": 9.3567,
"mean_token_accuracy": 0.06332114227116108,
"num_tokens": 67738.0,
"step": 80
},
{
"entropy": 10.693737983703613,
"epoch": 0.001899292792742467,
"grad_norm": 2.234375,
"learning_rate": 4.2000000000000004e-05,
"loss": 9.2866,
"mean_token_accuracy": 0.06349676214158535,
"num_tokens": 72305.0,
"step": 85
},
{
"entropy": 10.675388622283936,
"epoch": 0.0020110158981979064,
"grad_norm": 2.53125,
"learning_rate": 4.45e-05,
"loss": 9.0821,
"mean_token_accuracy": 0.06834135130047798,
"num_tokens": 76579.0,
"step": 90
},
{
"entropy": 10.65205717086792,
"epoch": 0.0021227390036533456,
"grad_norm": 2.234375,
"learning_rate": 4.7000000000000004e-05,
"loss": 9.0421,
"mean_token_accuracy": 0.06946654319763183,
"num_tokens": 80812.0,
"step": 95
},
{
"entropy": 10.615971279144286,
"epoch": 0.002234462109108785,
"grad_norm": 2.109375,
"learning_rate": 4.9500000000000004e-05,
"loss": 8.9523,
"mean_token_accuracy": 0.06732719540596008,
"num_tokens": 85090.0,
"step": 100
},
{
"entropy": 10.59008207321167,
"epoch": 0.0023461852145642242,
"grad_norm": 2.234375,
"learning_rate": 5.2e-05,
"loss": 8.888,
"mean_token_accuracy": 0.06908667460083961,
"num_tokens": 89578.0,
"step": 105
},
{
"entropy": 10.536420917510986,
"epoch": 0.002457908320019663,
"grad_norm": 1.96875,
"learning_rate": 5.45e-05,
"loss": 8.789,
"mean_token_accuracy": 0.0728236336261034,
"num_tokens": 94117.0,
"step": 110
},
{
"entropy": 10.488511657714843,
"epoch": 0.0025696314254751024,
"grad_norm": 2.0625,
"learning_rate": 5.7e-05,
"loss": 8.6132,
"mean_token_accuracy": 0.07127482630312443,
"num_tokens": 98082.0,
"step": 115
},
{
"entropy": 10.439968013763428,
"epoch": 0.0026813545309305417,
"grad_norm": 1.9765625,
"learning_rate": 5.9499999999999996e-05,
"loss": 8.5714,
"mean_token_accuracy": 0.07672090865671635,
"num_tokens": 102327.0,
"step": 120
},
{
"entropy": 10.355792045593262,
"epoch": 0.002793077636385981,
"grad_norm": 1.8359375,
"learning_rate": 6.2e-05,
"loss": 8.4426,
"mean_token_accuracy": 0.0740627009421587,
"num_tokens": 106567.0,
"step": 125
},
{
"entropy": 10.286309623718262,
"epoch": 0.0029048007418414202,
"grad_norm": 1.7890625,
"learning_rate": 6.450000000000001e-05,
"loss": 8.3003,
"mean_token_accuracy": 0.07362989187240601,
"num_tokens": 110654.0,
"step": 130
},
{
"entropy": 10.204053020477295,
"epoch": 0.0030165238472968595,
"grad_norm": 1.875,
"learning_rate": 6.7e-05,
"loss": 8.2511,
"mean_token_accuracy": 0.062000279501080516,
"num_tokens": 114679.0,
"step": 135
},
{
"entropy": 10.102067852020264,
"epoch": 0.003128246952752299,
"grad_norm": 1.6171875,
"learning_rate": 6.950000000000001e-05,
"loss": 8.1849,
"mean_token_accuracy": 0.06811538599431514,
"num_tokens": 118817.0,
"step": 140
},
{
"entropy": 9.926943397521972,
"epoch": 0.003239970058207738,
"grad_norm": 1.484375,
"learning_rate": 7.2e-05,
"loss": 8.0767,
"mean_token_accuracy": 0.06979594528675079,
"num_tokens": 123188.0,
"step": 145
},
{
"entropy": 9.793034744262695,
"epoch": 0.003351693163663177,
"grad_norm": 1.5078125,
"learning_rate": 7.45e-05,
"loss": 7.981,
"mean_token_accuracy": 0.06847230046987533,
"num_tokens": 127767.0,
"step": 150
},
{
"entropy": 9.643688774108886,
"epoch": 0.0034634162691186163,
"grad_norm": 1.4921875,
"learning_rate": 7.7e-05,
"loss": 7.7945,
"mean_token_accuracy": 0.06945906654000282,
"num_tokens": 131837.0,
"step": 155
},
{
"entropy": 9.430543518066406,
"epoch": 0.0035751393745740555,
"grad_norm": 1.2890625,
"learning_rate": 7.950000000000001e-05,
"loss": 7.7734,
"mean_token_accuracy": 0.07027286775410176,
"num_tokens": 136247.0,
"step": 160
},
{
"entropy": 9.239261722564697,
"epoch": 0.003686862480029495,
"grad_norm": 1.4609375,
"learning_rate": 8.2e-05,
"loss": 7.5788,
"mean_token_accuracy": 0.07950169630348683,
"num_tokens": 140170.0,
"step": 165
},
{
"entropy": 8.976140880584717,
"epoch": 0.003798585585484934,
"grad_norm": 1.21875,
"learning_rate": 8.450000000000001e-05,
"loss": 7.6177,
"mean_token_accuracy": 0.07785017378628253,
"num_tokens": 144139.0,
"step": 170
},
{
"entropy": 8.843453693389893,
"epoch": 0.003910308690940373,
"grad_norm": 1.2578125,
"learning_rate": 8.7e-05,
"loss": 7.5659,
"mean_token_accuracy": 0.07487303391098976,
"num_tokens": 148792.0,
"step": 175
},
{
"entropy": 8.658325004577637,
"epoch": 0.004022031796395813,
"grad_norm": 1.3515625,
"learning_rate": 8.95e-05,
"loss": 7.4988,
"mean_token_accuracy": 0.07942216768860817,
"num_tokens": 152844.0,
"step": 180
},
{
"entropy": 8.59526195526123,
"epoch": 0.0041337549018512516,
"grad_norm": 1.046875,
"learning_rate": 9.2e-05,
"loss": 7.527,
"mean_token_accuracy": 0.07417443916201591,
"num_tokens": 157366.0,
"step": 185
},
{
"entropy": 8.467089462280274,
"epoch": 0.004245478007306691,
"grad_norm": 1.0390625,
"learning_rate": 9.45e-05,
"loss": 7.3623,
"mean_token_accuracy": 0.07755868881940842,
"num_tokens": 161348.0,
"step": 190
},
{
"entropy": 8.307873630523682,
"epoch": 0.00435720111276213,
"grad_norm": 1.140625,
"learning_rate": 9.7e-05,
"loss": 7.3815,
"mean_token_accuracy": 0.08716461397707462,
"num_tokens": 165647.0,
"step": 195
},
{
"entropy": 8.236515140533447,
"epoch": 0.00446892421821757,
"grad_norm": 1.4453125,
"learning_rate": 9.95e-05,
"loss": 7.2754,
"mean_token_accuracy": 0.08076057620346547,
"num_tokens": 169521.0,
"step": 200
},
{
"entropy": 8.256762790679932,
"epoch": 0.004580647323673009,
"grad_norm": 1.359375,
"learning_rate": 0.000102,
"loss": 7.3426,
"mean_token_accuracy": 0.0812241055071354,
"num_tokens": 173466.0,
"step": 205
},
{
"entropy": 8.131280899047852,
"epoch": 0.0046923704291284484,
"grad_norm": 1.1484375,
"learning_rate": 0.00010449999999999999,
"loss": 7.2826,
"mean_token_accuracy": 0.07643571458756923,
"num_tokens": 177663.0,
"step": 210
},
{
"entropy": 8.097990989685059,
"epoch": 0.004804093534583887,
"grad_norm": 1.21875,
"learning_rate": 0.000107,
"loss": 7.2745,
"mean_token_accuracy": 0.08235705867409707,
"num_tokens": 181778.0,
"step": 215
},
{
"entropy": 8.089111948013306,
"epoch": 0.004915816640039326,
"grad_norm": 1.6171875,
"learning_rate": 0.0001095,
"loss": 7.2736,
"mean_token_accuracy": 0.08633389472961425,
"num_tokens": 185525.0,
"step": 220
},
{
"entropy": 8.083420944213866,
"epoch": 0.005027539745494766,
"grad_norm": 1.3671875,
"learning_rate": 0.000112,
"loss": 7.153,
"mean_token_accuracy": 0.08806331530213356,
"num_tokens": 189418.0,
"step": 225
},
{
"entropy": 7.933328151702881,
"epoch": 0.005139262850950205,
"grad_norm": 1.359375,
"learning_rate": 0.0001145,
"loss": 7.2217,
"mean_token_accuracy": 0.08842612579464912,
"num_tokens": 193494.0,
"step": 230
},
{
"entropy": 8.018900680541993,
"epoch": 0.0052509859564056445,
"grad_norm": 1.1484375,
"learning_rate": 0.00011700000000000001,
"loss": 7.2661,
"mean_token_accuracy": 0.08137304298579692,
"num_tokens": 198018.0,
"step": 235
},
{
"entropy": 7.955441856384278,
"epoch": 0.005362709061861083,
"grad_norm": 1.2578125,
"learning_rate": 0.00011949999999999999,
"loss": 7.1847,
"mean_token_accuracy": 0.08625513166189194,
"num_tokens": 202296.0,
"step": 240
},
{
"entropy": 7.9594367980957035,
"epoch": 0.005474432167316523,
"grad_norm": 1.3203125,
"learning_rate": 0.000122,
"loss": 7.1706,
"mean_token_accuracy": 0.08195730969309807,
"num_tokens": 206694.0,
"step": 245
},
{
"entropy": 7.792031574249267,
"epoch": 0.005586155272771962,
"grad_norm": 1.390625,
"learning_rate": 0.0001245,
"loss": 7.2007,
"mean_token_accuracy": 0.08904931843280792,
"num_tokens": 210810.0,
"step": 250
},
{
"entropy": 7.920461797714234,
"epoch": 0.005697878378227402,
"grad_norm": 1.1796875,
"learning_rate": 0.000127,
"loss": 7.1818,
"mean_token_accuracy": 0.0905133418738842,
"num_tokens": 215044.0,
"step": 255
},
{
"entropy": 7.8493430614471436,
"epoch": 0.0058096014836828405,
"grad_norm": 1.3828125,
"learning_rate": 0.0001295,
"loss": 7.28,
"mean_token_accuracy": 0.08591654896736145,
"num_tokens": 219235.0,
"step": 260
},
{
"entropy": 7.84934287071228,
"epoch": 0.005921324589138279,
"grad_norm": 1.1328125,
"learning_rate": 0.000132,
"loss": 7.0922,
"mean_token_accuracy": 0.0903876356780529,
"num_tokens": 223639.0,
"step": 265
},
{
"entropy": 7.785561227798462,
"epoch": 0.006033047694593719,
"grad_norm": 1.25,
"learning_rate": 0.00013450000000000002,
"loss": 7.1258,
"mean_token_accuracy": 0.09057728350162506,
"num_tokens": 227873.0,
"step": 270
},
{
"entropy": 7.707937574386596,
"epoch": 0.006144770800049158,
"grad_norm": 1.3671875,
"learning_rate": 0.00013700000000000002,
"loss": 7.0661,
"mean_token_accuracy": 0.09807337448000908,
"num_tokens": 232147.0,
"step": 275
},
{
"entropy": 7.739069509506225,
"epoch": 0.006256493905504598,
"grad_norm": 1.6015625,
"learning_rate": 0.0001395,
"loss": 7.1358,
"mean_token_accuracy": 0.09250000454485416,
"num_tokens": 236456.0,
"step": 280
},
{
"entropy": 7.7190714359283445,
"epoch": 0.0063682170109600365,
"grad_norm": 1.1953125,
"learning_rate": 0.00014199999999999998,
"loss": 7.1583,
"mean_token_accuracy": 0.09051149562001229,
"num_tokens": 241039.0,
"step": 285
},
{
"entropy": 7.938947439193726,
"epoch": 0.006479940116415476,
"grad_norm": 1.6640625,
"learning_rate": 0.0001445,
"loss": 7.1915,
"mean_token_accuracy": 0.08653632178902626,
"num_tokens": 245132.0,
"step": 290
},
{
"entropy": 7.673107481002807,
"epoch": 0.006591663221870915,
"grad_norm": 1.28125,
"learning_rate": 0.000147,
"loss": 7.0872,
"mean_token_accuracy": 0.09988043382763863,
"num_tokens": 249152.0,
"step": 295
},
{
"entropy": 7.712965631484986,
"epoch": 0.006703386327326354,
"grad_norm": 1.21875,
"learning_rate": 0.0001495,
"loss": 7.0503,
"mean_token_accuracy": 0.09596830010414123,
"num_tokens": 253439.0,
"step": 300
},
{
"entropy": 7.6600532054901125,
"epoch": 0.006815109432781794,
"grad_norm": 1.546875,
"learning_rate": 0.000152,
"loss": 7.0731,
"mean_token_accuracy": 0.09302671104669571,
"num_tokens": 258066.0,
"step": 305
},
{
"entropy": 7.665358448028565,
"epoch": 0.0069268325382372325,
"grad_norm": 1.4375,
"learning_rate": 0.00015450000000000001,
"loss": 7.0332,
"mean_token_accuracy": 0.0973147690296173,
"num_tokens": 261954.0,
"step": 310
},
{
"entropy": 7.616210794448852,
"epoch": 0.007038555643692672,
"grad_norm": 1.4453125,
"learning_rate": 0.000157,
"loss": 7.0779,
"mean_token_accuracy": 0.10462095588445663,
"num_tokens": 266650.0,
"step": 315
},
{
"entropy": 7.689846324920654,
"epoch": 0.007150278749148111,
"grad_norm": 1.3203125,
"learning_rate": 0.0001595,
"loss": 7.1433,
"mean_token_accuracy": 0.09891897812485695,
"num_tokens": 271069.0,
"step": 320
},
{
"entropy": 7.705677938461304,
"epoch": 0.007262001854603551,
"grad_norm": 1.359375,
"learning_rate": 0.000162,
"loss": 7.0039,
"mean_token_accuracy": 0.10242248028516769,
"num_tokens": 275084.0,
"step": 325
},
{
"entropy": 7.603102445602417,
"epoch": 0.00737372496005899,
"grad_norm": 1.390625,
"learning_rate": 0.00016450000000000001,
"loss": 7.0745,
"mean_token_accuracy": 0.1031483568251133,
"num_tokens": 279721.0,
"step": 330
},
{
"entropy": 7.619607782363891,
"epoch": 0.007485448065514429,
"grad_norm": 1.296875,
"learning_rate": 0.00016700000000000002,
"loss": 7.0708,
"mean_token_accuracy": 0.10527726709842682,
"num_tokens": 284317.0,
"step": 335
},
{
"entropy": 7.600710487365722,
"epoch": 0.007597171170969868,
"grad_norm": 1.3984375,
"learning_rate": 0.00016950000000000003,
"loss": 7.0451,
"mean_token_accuracy": 0.10766607597470283,
"num_tokens": 288870.0,
"step": 340
},
{
"entropy": 7.61973729133606,
"epoch": 0.007708894276425307,
"grad_norm": 1.3046875,
"learning_rate": 0.00017199999999999998,
"loss": 6.9812,
"mean_token_accuracy": 0.11351362988352776,
"num_tokens": 292996.0,
"step": 345
},
{
"entropy": 7.5854551792144775,
"epoch": 0.007820617381880746,
"grad_norm": 1.4765625,
"learning_rate": 0.00017449999999999999,
"loss": 6.9806,
"mean_token_accuracy": 0.10384939089417458,
"num_tokens": 297238.0,
"step": 350
},
{
"entropy": 7.531381893157959,
"epoch": 0.007932340487336187,
"grad_norm": 1.484375,
"learning_rate": 0.000177,
"loss": 6.9793,
"mean_token_accuracy": 0.1117280475795269,
"num_tokens": 301453.0,
"step": 355
},
{
"entropy": 7.653309726715088,
"epoch": 0.008044063592791625,
"grad_norm": 1.7890625,
"learning_rate": 0.0001795,
"loss": 6.9327,
"mean_token_accuracy": 0.10786554217338562,
"num_tokens": 305949.0,
"step": 360
},
{
"entropy": 7.534815788269043,
"epoch": 0.008155786698247064,
"grad_norm": 1.5625,
"learning_rate": 0.000182,
"loss": 6.9583,
"mean_token_accuracy": 0.11513907313346863,
"num_tokens": 310097.0,
"step": 365
},
{
"entropy": 7.547474193572998,
"epoch": 0.008267509803702503,
"grad_norm": 1.390625,
"learning_rate": 0.0001845,
"loss": 6.9517,
"mean_token_accuracy": 0.10539396926760673,
"num_tokens": 314567.0,
"step": 370
},
{
"entropy": 7.457708692550659,
"epoch": 0.008379232909157944,
"grad_norm": 1.4375,
"learning_rate": 0.000187,
"loss": 7.0323,
"mean_token_accuracy": 0.10927818715572357,
"num_tokens": 319166.0,
"step": 375
},
{
"entropy": 7.515052604675293,
"epoch": 0.008490956014613383,
"grad_norm": 1.421875,
"learning_rate": 0.0001895,
"loss": 6.9204,
"mean_token_accuracy": 0.11223233640193939,
"num_tokens": 323682.0,
"step": 380
},
{
"entropy": 7.488761281967163,
"epoch": 0.008602679120068821,
"grad_norm": 1.421875,
"learning_rate": 0.000192,
"loss": 6.8299,
"mean_token_accuracy": 0.12143486216664315,
"num_tokens": 327994.0,
"step": 385
},
{
"entropy": 7.412152099609375,
"epoch": 0.00871440222552426,
"grad_norm": 1.3671875,
"learning_rate": 0.0001945,
"loss": 6.9058,
"mean_token_accuracy": 0.11854805946350097,
"num_tokens": 332026.0,
"step": 390
},
{
"entropy": 7.578387832641601,
"epoch": 0.0088261253309797,
"grad_norm": 1.796875,
"learning_rate": 0.00019700000000000002,
"loss": 6.9475,
"mean_token_accuracy": 0.11811894848942757,
"num_tokens": 336552.0,
"step": 395
},
{
"entropy": 7.504688882827759,
"epoch": 0.00893784843643514,
"grad_norm": 1.4921875,
"learning_rate": 0.00019950000000000002,
"loss": 6.8884,
"mean_token_accuracy": 0.1116393692791462,
"num_tokens": 340643.0,
"step": 400
},
{
"entropy": 7.4580738067626955,
"epoch": 0.009049571541890579,
"grad_norm": 1.375,
"learning_rate": 0.000202,
"loss": 6.8029,
"mean_token_accuracy": 0.12075437754392623,
"num_tokens": 344886.0,
"step": 405
},
{
"entropy": 7.3586314678192135,
"epoch": 0.009161294647346017,
"grad_norm": 1.328125,
"learning_rate": 0.00020449999999999998,
"loss": 6.8632,
"mean_token_accuracy": 0.1191755935549736,
"num_tokens": 349115.0,
"step": 410
},
{
"entropy": 7.527571535110473,
"epoch": 0.009273017752801456,
"grad_norm": 1.5625,
"learning_rate": 0.000207,
"loss": 6.8235,
"mean_token_accuracy": 0.12523479163646697,
"num_tokens": 353368.0,
"step": 415
},
{
"entropy": 7.509571599960327,
"epoch": 0.009384740858256897,
"grad_norm": 1.484375,
"learning_rate": 0.0002095,
"loss": 6.8657,
"mean_token_accuracy": 0.10757644474506378,
"num_tokens": 357382.0,
"step": 420
},
{
"entropy": 7.441834354400635,
"epoch": 0.009496463963712336,
"grad_norm": 1.7265625,
"learning_rate": 0.000212,
"loss": 6.7799,
"mean_token_accuracy": 0.12460733354091644,
"num_tokens": 361542.0,
"step": 425
},
{
"entropy": 7.419776153564453,
"epoch": 0.009608187069167775,
"grad_norm": 1.2578125,
"learning_rate": 0.0002145,
"loss": 6.84,
"mean_token_accuracy": 0.11263928636908531,
"num_tokens": 366006.0,
"step": 430
},
{
"entropy": 7.367758464813233,
"epoch": 0.009719910174623213,
"grad_norm": 1.578125,
"learning_rate": 0.00021700000000000002,
"loss": 6.7506,
"mean_token_accuracy": 0.12978531718254088,
"num_tokens": 370021.0,
"step": 435
},
{
"entropy": 7.408233594894409,
"epoch": 0.009831633280078652,
"grad_norm": 1.703125,
"learning_rate": 0.0002195,
"loss": 6.8773,
"mean_token_accuracy": 0.12414649501442909,
"num_tokens": 374434.0,
"step": 440
},
{
"entropy": 7.441655158996582,
"epoch": 0.009943356385534093,
"grad_norm": 1.3515625,
"learning_rate": 0.000222,
"loss": 6.9027,
"mean_token_accuracy": 0.11324851140379906,
"num_tokens": 378634.0,
"step": 445
},
{
"entropy": 7.315918588638306,
"epoch": 0.010055079490989532,
"grad_norm": 1.4453125,
"learning_rate": 0.0002245,
"loss": 6.7869,
"mean_token_accuracy": 0.1252473659813404,
"num_tokens": 382904.0,
"step": 450
},
{
"entropy": 7.470485210418701,
"epoch": 0.01016680259644497,
"grad_norm": 1.328125,
"learning_rate": 0.00022700000000000002,
"loss": 6.7635,
"mean_token_accuracy": 0.12090180814266205,
"num_tokens": 386970.0,
"step": 455
},
{
"entropy": 7.348088216781616,
"epoch": 0.01027852570190041,
"grad_norm": 1.25,
"learning_rate": 0.00022950000000000002,
"loss": 6.729,
"mean_token_accuracy": 0.13390202075242996,
"num_tokens": 391043.0,
"step": 460
},
{
"entropy": 7.392842721939087,
"epoch": 0.01039024880735585,
"grad_norm": 1.3125,
"learning_rate": 0.00023200000000000003,
"loss": 6.7204,
"mean_token_accuracy": 0.13383440747857095,
"num_tokens": 395413.0,
"step": 465
},
{
"entropy": 7.40152382850647,
"epoch": 0.010501971912811289,
"grad_norm": 1.6015625,
"learning_rate": 0.00023449999999999998,
"loss": 6.8385,
"mean_token_accuracy": 0.12566340565681458,
"num_tokens": 399821.0,
"step": 470
},
{
"entropy": 7.2655271053314205,
"epoch": 0.010613695018266728,
"grad_norm": 1.34375,
"learning_rate": 0.000237,
"loss": 6.6582,
"mean_token_accuracy": 0.13715523406863211,
"num_tokens": 404043.0,
"step": 475
},
{
"entropy": 7.422811889648438,
"epoch": 0.010725418123722167,
"grad_norm": 1.65625,
"learning_rate": 0.0002395,
"loss": 6.803,
"mean_token_accuracy": 0.1260749615728855,
"num_tokens": 408339.0,
"step": 480
},
{
"entropy": 7.263138484954834,
"epoch": 0.010837141229177606,
"grad_norm": 1.4453125,
"learning_rate": 0.000242,
"loss": 6.6856,
"mean_token_accuracy": 0.13459724336862564,
"num_tokens": 412384.0,
"step": 485
},
{
"entropy": 7.3362548828125,
"epoch": 0.010948864334633046,
"grad_norm": 1.4921875,
"learning_rate": 0.0002445,
"loss": 6.8108,
"mean_token_accuracy": 0.12614913210272788,
"num_tokens": 416891.0,
"step": 490
},
{
"entropy": 7.36023063659668,
"epoch": 0.011060587440088485,
"grad_norm": 1.4921875,
"learning_rate": 0.000247,
"loss": 6.6642,
"mean_token_accuracy": 0.1329216368496418,
"num_tokens": 420980.0,
"step": 495
},
{
"entropy": 7.2991362571716305,
"epoch": 0.011172310545543924,
"grad_norm": 1.6640625,
"learning_rate": 0.0002495,
"loss": 6.7212,
"mean_token_accuracy": 0.13121648952364923,
"num_tokens": 425454.0,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 4000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 91570824806400.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}