2035 lines
54 KiB
JSON
2035 lines
54 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.022344621091087848,
|
|
"eval_steps": 500,
|
|
"global_step": 1000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 10.7426682472229,
|
|
"epoch": 0.00011172310545543924,
|
|
"grad_norm": 6.25,
|
|
"learning_rate": 2e-06,
|
|
"loss": 10.5231,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 4250.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 10.742681884765625,
|
|
"epoch": 0.00022344621091087847,
|
|
"grad_norm": 7.03125,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 10.4891,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 8228.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 10.742702770233155,
|
|
"epoch": 0.0003351693163663177,
|
|
"grad_norm": 6.8125,
|
|
"learning_rate": 7e-06,
|
|
"loss": 10.4445,
|
|
"mean_token_accuracy": 0.0003105590119957924,
|
|
"num_tokens": 12209.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 10.742706871032714,
|
|
"epoch": 0.00044689242182175694,
|
|
"grad_norm": 6.15625,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 10.3987,
|
|
"mean_token_accuracy": 0.0004866180010139942,
|
|
"num_tokens": 16225.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 10.742706108093262,
|
|
"epoch": 0.0005586155272771962,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 10.2787,
|
|
"mean_token_accuracy": 0.010022059944458307,
|
|
"num_tokens": 20212.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.742425346374512,
|
|
"epoch": 0.0006703386327326354,
|
|
"grad_norm": 5.0,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.173,
|
|
"mean_token_accuracy": 0.04117634426802397,
|
|
"num_tokens": 24445.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.741415786743165,
|
|
"epoch": 0.0007820617381880747,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 9.9901,
|
|
"mean_token_accuracy": 0.052767305821180346,
|
|
"num_tokens": 28365.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.739409732818604,
|
|
"epoch": 0.0008937848436435139,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 9.9621,
|
|
"mean_token_accuracy": 0.052475782483816145,
|
|
"num_tokens": 33055.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.7363787651062,
|
|
"epoch": 0.0010055079490989532,
|
|
"grad_norm": 2.9375,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 9.811,
|
|
"mean_token_accuracy": 0.062037082761526106,
|
|
"num_tokens": 37599.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.733420372009277,
|
|
"epoch": 0.0011172310545543925,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.6744,
|
|
"mean_token_accuracy": 0.06838746592402459,
|
|
"num_tokens": 41934.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.731625938415528,
|
|
"epoch": 0.0012289541600098315,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.6365,
|
|
"mean_token_accuracy": 0.05915887728333473,
|
|
"num_tokens": 46178.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.729843425750733,
|
|
"epoch": 0.0013406772654652708,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.5494,
|
|
"mean_token_accuracy": 0.0692246112972498,
|
|
"num_tokens": 50513.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.727421474456786,
|
|
"epoch": 0.0014524003709207101,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.5028,
|
|
"mean_token_accuracy": 0.0702465757727623,
|
|
"num_tokens": 54924.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.722854518890381,
|
|
"epoch": 0.0015641234763761494,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4107,
|
|
"mean_token_accuracy": 0.06344567574560642,
|
|
"num_tokens": 59083.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.71631669998169,
|
|
"epoch": 0.0016758465818315885,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.3233,
|
|
"mean_token_accuracy": 0.06774163469672204,
|
|
"num_tokens": 63324.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.705671787261963,
|
|
"epoch": 0.0017875696872870278,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.3567,
|
|
"mean_token_accuracy": 0.06332114227116108,
|
|
"num_tokens": 67738.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.693737983703613,
|
|
"epoch": 0.001899292792742467,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.2866,
|
|
"mean_token_accuracy": 0.06349676214158535,
|
|
"num_tokens": 72305.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.675388622283936,
|
|
"epoch": 0.0020110158981979064,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.0821,
|
|
"mean_token_accuracy": 0.06834135130047798,
|
|
"num_tokens": 76579.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.65205717086792,
|
|
"epoch": 0.0021227390036533456,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 9.0421,
|
|
"mean_token_accuracy": 0.06946654319763183,
|
|
"num_tokens": 80812.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.615971279144286,
|
|
"epoch": 0.002234462109108785,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.9523,
|
|
"mean_token_accuracy": 0.06732719540596008,
|
|
"num_tokens": 85090.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.59008207321167,
|
|
"epoch": 0.0023461852145642242,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.888,
|
|
"mean_token_accuracy": 0.06908667460083961,
|
|
"num_tokens": 89578.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.536420917510986,
|
|
"epoch": 0.002457908320019663,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.789,
|
|
"mean_token_accuracy": 0.0728236336261034,
|
|
"num_tokens": 94117.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 10.488511657714843,
|
|
"epoch": 0.0025696314254751024,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.6132,
|
|
"mean_token_accuracy": 0.07127482630312443,
|
|
"num_tokens": 98082.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 10.439968013763428,
|
|
"epoch": 0.0026813545309305417,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.5714,
|
|
"mean_token_accuracy": 0.07672090865671635,
|
|
"num_tokens": 102327.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 10.355792045593262,
|
|
"epoch": 0.002793077636385981,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.4426,
|
|
"mean_token_accuracy": 0.0740627009421587,
|
|
"num_tokens": 106567.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 10.286309623718262,
|
|
"epoch": 0.0029048007418414202,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.3003,
|
|
"mean_token_accuracy": 0.07362989187240601,
|
|
"num_tokens": 110654.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 10.204053020477295,
|
|
"epoch": 0.0030165238472968595,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.2511,
|
|
"mean_token_accuracy": 0.062000279501080516,
|
|
"num_tokens": 114679.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 10.102067852020264,
|
|
"epoch": 0.003128246952752299,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 8.1849,
|
|
"mean_token_accuracy": 0.06811538599431514,
|
|
"num_tokens": 118817.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 9.926943397521972,
|
|
"epoch": 0.003239970058207738,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 8.0767,
|
|
"mean_token_accuracy": 0.06979594528675079,
|
|
"num_tokens": 123188.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 9.793034744262695,
|
|
"epoch": 0.003351693163663177,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.981,
|
|
"mean_token_accuracy": 0.06847230046987533,
|
|
"num_tokens": 127767.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 9.643688774108886,
|
|
"epoch": 0.0034634162691186163,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.7945,
|
|
"mean_token_accuracy": 0.06945906654000282,
|
|
"num_tokens": 131837.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 9.430543518066406,
|
|
"epoch": 0.0035751393745740555,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.7734,
|
|
"mean_token_accuracy": 0.07027286775410176,
|
|
"num_tokens": 136247.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 9.239261722564697,
|
|
"epoch": 0.003686862480029495,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.5788,
|
|
"mean_token_accuracy": 0.07950169630348683,
|
|
"num_tokens": 140170.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.976140880584717,
|
|
"epoch": 0.003798585585484934,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.6177,
|
|
"mean_token_accuracy": 0.07785017378628253,
|
|
"num_tokens": 144139.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.843453693389893,
|
|
"epoch": 0.003910308690940373,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.5659,
|
|
"mean_token_accuracy": 0.07487303391098976,
|
|
"num_tokens": 148792.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.658325004577637,
|
|
"epoch": 0.004022031796395813,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.4988,
|
|
"mean_token_accuracy": 0.07942216768860817,
|
|
"num_tokens": 152844.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.59526195526123,
|
|
"epoch": 0.0041337549018512516,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.527,
|
|
"mean_token_accuracy": 0.07417443916201591,
|
|
"num_tokens": 157366.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 8.467089462280274,
|
|
"epoch": 0.004245478007306691,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.3623,
|
|
"mean_token_accuracy": 0.07755868881940842,
|
|
"num_tokens": 161348.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 8.307873630523682,
|
|
"epoch": 0.00435720111276213,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.3815,
|
|
"mean_token_accuracy": 0.08716461397707462,
|
|
"num_tokens": 165647.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 8.236515140533447,
|
|
"epoch": 0.00446892421821757,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.2754,
|
|
"mean_token_accuracy": 0.08076057620346547,
|
|
"num_tokens": 169521.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 8.256762790679932,
|
|
"epoch": 0.004580647323673009,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.3426,
|
|
"mean_token_accuracy": 0.0812241055071354,
|
|
"num_tokens": 173466.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 8.131280899047852,
|
|
"epoch": 0.0046923704291284484,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.2826,
|
|
"mean_token_accuracy": 0.07643571458756923,
|
|
"num_tokens": 177663.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 8.097990989685059,
|
|
"epoch": 0.004804093534583887,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.2745,
|
|
"mean_token_accuracy": 0.08235705867409707,
|
|
"num_tokens": 181778.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 8.089111948013306,
|
|
"epoch": 0.004915816640039326,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.2736,
|
|
"mean_token_accuracy": 0.08633389472961425,
|
|
"num_tokens": 185525.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 8.083420944213866,
|
|
"epoch": 0.005027539745494766,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.153,
|
|
"mean_token_accuracy": 0.08806331530213356,
|
|
"num_tokens": 189418.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.933328151702881,
|
|
"epoch": 0.005139262850950205,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.2217,
|
|
"mean_token_accuracy": 0.08842612579464912,
|
|
"num_tokens": 193494.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 8.018900680541993,
|
|
"epoch": 0.0052509859564056445,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.2661,
|
|
"mean_token_accuracy": 0.08137304298579692,
|
|
"num_tokens": 198018.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.955441856384278,
|
|
"epoch": 0.005362709061861083,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.1847,
|
|
"mean_token_accuracy": 0.08625513166189194,
|
|
"num_tokens": 202296.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.9594367980957035,
|
|
"epoch": 0.005474432167316523,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.1706,
|
|
"mean_token_accuracy": 0.08195730969309807,
|
|
"num_tokens": 206694.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.792031574249267,
|
|
"epoch": 0.005586155272771962,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.2007,
|
|
"mean_token_accuracy": 0.08904931843280792,
|
|
"num_tokens": 210810.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.920461797714234,
|
|
"epoch": 0.005697878378227402,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.1818,
|
|
"mean_token_accuracy": 0.0905133418738842,
|
|
"num_tokens": 215044.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.8493430614471436,
|
|
"epoch": 0.0058096014836828405,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.28,
|
|
"mean_token_accuracy": 0.08591654896736145,
|
|
"num_tokens": 219235.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.84934287071228,
|
|
"epoch": 0.005921324589138279,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.0922,
|
|
"mean_token_accuracy": 0.0903876356780529,
|
|
"num_tokens": 223639.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.785561227798462,
|
|
"epoch": 0.006033047694593719,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.1258,
|
|
"mean_token_accuracy": 0.09057728350162506,
|
|
"num_tokens": 227873.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.707937574386596,
|
|
"epoch": 0.006144770800049158,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.0661,
|
|
"mean_token_accuracy": 0.09807337448000908,
|
|
"num_tokens": 232147.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.739069509506225,
|
|
"epoch": 0.006256493905504598,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.1358,
|
|
"mean_token_accuracy": 0.09250000454485416,
|
|
"num_tokens": 236456.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.7190714359283445,
|
|
"epoch": 0.0063682170109600365,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.1583,
|
|
"mean_token_accuracy": 0.09051149562001229,
|
|
"num_tokens": 241039.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.938947439193726,
|
|
"epoch": 0.006479940116415476,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.1915,
|
|
"mean_token_accuracy": 0.08653632178902626,
|
|
"num_tokens": 245132.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.673107481002807,
|
|
"epoch": 0.006591663221870915,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.0872,
|
|
"mean_token_accuracy": 0.09988043382763863,
|
|
"num_tokens": 249152.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.712965631484986,
|
|
"epoch": 0.006703386327326354,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.0503,
|
|
"mean_token_accuracy": 0.09596830010414123,
|
|
"num_tokens": 253439.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.6600532054901125,
|
|
"epoch": 0.006815109432781794,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.0731,
|
|
"mean_token_accuracy": 0.09302671104669571,
|
|
"num_tokens": 258066.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.665358448028565,
|
|
"epoch": 0.0069268325382372325,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.0332,
|
|
"mean_token_accuracy": 0.0973147690296173,
|
|
"num_tokens": 261954.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.616210794448852,
|
|
"epoch": 0.007038555643692672,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.0779,
|
|
"mean_token_accuracy": 0.10462095588445663,
|
|
"num_tokens": 266650.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.689846324920654,
|
|
"epoch": 0.007150278749148111,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.1433,
|
|
"mean_token_accuracy": 0.09891897812485695,
|
|
"num_tokens": 271069.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.705677938461304,
|
|
"epoch": 0.007262001854603551,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.0039,
|
|
"mean_token_accuracy": 0.10242248028516769,
|
|
"num_tokens": 275084.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.603102445602417,
|
|
"epoch": 0.00737372496005899,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.0745,
|
|
"mean_token_accuracy": 0.1031483568251133,
|
|
"num_tokens": 279721.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.619607782363891,
|
|
"epoch": 0.007485448065514429,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.0708,
|
|
"mean_token_accuracy": 0.10527726709842682,
|
|
"num_tokens": 284317.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.600710487365722,
|
|
"epoch": 0.007597171170969868,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.0451,
|
|
"mean_token_accuracy": 0.10766607597470283,
|
|
"num_tokens": 288870.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.61973729133606,
|
|
"epoch": 0.007708894276425307,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 6.9812,
|
|
"mean_token_accuracy": 0.11351362988352776,
|
|
"num_tokens": 292996.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.5854551792144775,
|
|
"epoch": 0.007820617381880746,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 6.9806,
|
|
"mean_token_accuracy": 0.10384939089417458,
|
|
"num_tokens": 297238.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.531381893157959,
|
|
"epoch": 0.007932340487336187,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000177,
|
|
"loss": 6.9793,
|
|
"mean_token_accuracy": 0.1117280475795269,
|
|
"num_tokens": 301453.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.653309726715088,
|
|
"epoch": 0.008044063592791625,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 6.9327,
|
|
"mean_token_accuracy": 0.10786554217338562,
|
|
"num_tokens": 305949.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.534815788269043,
|
|
"epoch": 0.008155786698247064,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000182,
|
|
"loss": 6.9583,
|
|
"mean_token_accuracy": 0.11513907313346863,
|
|
"num_tokens": 310097.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.547474193572998,
|
|
"epoch": 0.008267509803702503,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 6.9517,
|
|
"mean_token_accuracy": 0.10539396926760673,
|
|
"num_tokens": 314567.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.457708692550659,
|
|
"epoch": 0.008379232909157944,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.0323,
|
|
"mean_token_accuracy": 0.10927818715572357,
|
|
"num_tokens": 319166.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.515052604675293,
|
|
"epoch": 0.008490956014613383,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 6.9204,
|
|
"mean_token_accuracy": 0.11223233640193939,
|
|
"num_tokens": 323682.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.488761281967163,
|
|
"epoch": 0.008602679120068821,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000192,
|
|
"loss": 6.8299,
|
|
"mean_token_accuracy": 0.12143486216664315,
|
|
"num_tokens": 327994.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.412152099609375,
|
|
"epoch": 0.00871440222552426,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 6.9058,
|
|
"mean_token_accuracy": 0.11854805946350097,
|
|
"num_tokens": 332026.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.578387832641601,
|
|
"epoch": 0.0088261253309797,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 6.9475,
|
|
"mean_token_accuracy": 0.11811894848942757,
|
|
"num_tokens": 336552.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.504688882827759,
|
|
"epoch": 0.00893784843643514,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 6.8884,
|
|
"mean_token_accuracy": 0.1116393692791462,
|
|
"num_tokens": 340643.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.4580738067626955,
|
|
"epoch": 0.009049571541890579,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000202,
|
|
"loss": 6.8029,
|
|
"mean_token_accuracy": 0.12075437754392623,
|
|
"num_tokens": 344886.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.3586314678192135,
|
|
"epoch": 0.009161294647346017,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 6.8632,
|
|
"mean_token_accuracy": 0.1191755935549736,
|
|
"num_tokens": 349115.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.527571535110473,
|
|
"epoch": 0.009273017752801456,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000207,
|
|
"loss": 6.8235,
|
|
"mean_token_accuracy": 0.12523479163646697,
|
|
"num_tokens": 353368.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.509571599960327,
|
|
"epoch": 0.009384740858256897,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 6.8657,
|
|
"mean_token_accuracy": 0.10757644474506378,
|
|
"num_tokens": 357382.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.441834354400635,
|
|
"epoch": 0.009496463963712336,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000212,
|
|
"loss": 6.7799,
|
|
"mean_token_accuracy": 0.12460733354091644,
|
|
"num_tokens": 361542.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.419776153564453,
|
|
"epoch": 0.009608187069167775,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.84,
|
|
"mean_token_accuracy": 0.11263928636908531,
|
|
"num_tokens": 366006.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.367758464813233,
|
|
"epoch": 0.009719910174623213,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.7506,
|
|
"mean_token_accuracy": 0.12978531718254088,
|
|
"num_tokens": 370021.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.408233594894409,
|
|
"epoch": 0.009831633280078652,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 6.8773,
|
|
"mean_token_accuracy": 0.12414649501442909,
|
|
"num_tokens": 374434.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.441655158996582,
|
|
"epoch": 0.009943356385534093,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.9027,
|
|
"mean_token_accuracy": 0.11324851140379906,
|
|
"num_tokens": 378634.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.315918588638306,
|
|
"epoch": 0.010055079490989532,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.7869,
|
|
"mean_token_accuracy": 0.1252473659813404,
|
|
"num_tokens": 382904.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.470485210418701,
|
|
"epoch": 0.01016680259644497,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.7635,
|
|
"mean_token_accuracy": 0.12090180814266205,
|
|
"num_tokens": 386970.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.348088216781616,
|
|
"epoch": 0.01027852570190041,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.729,
|
|
"mean_token_accuracy": 0.13390202075242996,
|
|
"num_tokens": 391043.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.392842721939087,
|
|
"epoch": 0.01039024880735585,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.7204,
|
|
"mean_token_accuracy": 0.13383440747857095,
|
|
"num_tokens": 395413.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.40152382850647,
|
|
"epoch": 0.010501971912811289,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 6.8385,
|
|
"mean_token_accuracy": 0.12566340565681458,
|
|
"num_tokens": 399821.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.2655271053314205,
|
|
"epoch": 0.010613695018266728,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.6582,
|
|
"mean_token_accuracy": 0.13715523406863211,
|
|
"num_tokens": 404043.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.422811889648438,
|
|
"epoch": 0.010725418123722167,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 6.803,
|
|
"mean_token_accuracy": 0.1260749615728855,
|
|
"num_tokens": 408339.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.263138484954834,
|
|
"epoch": 0.010837141229177606,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.6856,
|
|
"mean_token_accuracy": 0.13459724336862564,
|
|
"num_tokens": 412384.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.3362548828125,
|
|
"epoch": 0.010948864334633046,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.8108,
|
|
"mean_token_accuracy": 0.12614913210272788,
|
|
"num_tokens": 416891.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.36023063659668,
|
|
"epoch": 0.011060587440088485,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.6642,
|
|
"mean_token_accuracy": 0.1329216368496418,
|
|
"num_tokens": 420980.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.2991362571716305,
|
|
"epoch": 0.011172310545543924,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.7212,
|
|
"mean_token_accuracy": 0.13121648952364923,
|
|
"num_tokens": 425454.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.339645147323608,
|
|
"epoch": 0.011284033650999363,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000252,
|
|
"loss": 6.7456,
|
|
"mean_token_accuracy": 0.12466516643762589,
|
|
"num_tokens": 430087.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.289543581008911,
|
|
"epoch": 0.011395756756454803,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 6.674,
|
|
"mean_token_accuracy": 0.13790097907185556,
|
|
"num_tokens": 434062.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.032429838180542,
|
|
"epoch": 0.011507479861910242,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000257,
|
|
"loss": 6.6567,
|
|
"mean_token_accuracy": 0.13795162215828896,
|
|
"num_tokens": 437882.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.306787776947021,
|
|
"epoch": 0.011619202967365681,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 6.6863,
|
|
"mean_token_accuracy": 0.13145707920193672,
|
|
"num_tokens": 442248.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.304118633270264,
|
|
"epoch": 0.01173092607282112,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.000262,
|
|
"loss": 6.6543,
|
|
"mean_token_accuracy": 0.13509279638528823,
|
|
"num_tokens": 446492.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.187564706802368,
|
|
"epoch": 0.011842649178276559,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 6.6878,
|
|
"mean_token_accuracy": 0.1363551899790764,
|
|
"num_tokens": 450359.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.3090503215789795,
|
|
"epoch": 0.011954372283732,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.6973,
|
|
"mean_token_accuracy": 0.13162412643432617,
|
|
"num_tokens": 454508.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.349436283111572,
|
|
"epoch": 0.012066095389187438,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.666,
|
|
"mean_token_accuracy": 0.133739610016346,
|
|
"num_tokens": 459028.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.165412759780883,
|
|
"epoch": 0.012177818494642877,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.6084,
|
|
"mean_token_accuracy": 0.13254478350281715,
|
|
"num_tokens": 462911.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 7.141500329971313,
|
|
"epoch": 0.012289541600098316,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 6.6564,
|
|
"mean_token_accuracy": 0.13796778842806817,
|
|
"num_tokens": 467129.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.303054237365723,
|
|
"epoch": 0.012401264705553756,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000277,
|
|
"loss": 6.5808,
|
|
"mean_token_accuracy": 0.139659858494997,
|
|
"num_tokens": 471324.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.217333889007568,
|
|
"epoch": 0.012512987811009195,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.7315,
|
|
"mean_token_accuracy": 0.12687695473432542,
|
|
"num_tokens": 475761.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.198446226119995,
|
|
"epoch": 0.012624710916464634,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.5304,
|
|
"mean_token_accuracy": 0.13995881900191307,
|
|
"num_tokens": 479532.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.236463642120361,
|
|
"epoch": 0.012736434021920073,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.6868,
|
|
"mean_token_accuracy": 0.13790254518389702,
|
|
"num_tokens": 483844.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.325732755661011,
|
|
"epoch": 0.012848157127375512,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.7171,
|
|
"mean_token_accuracy": 0.1334671013057232,
|
|
"num_tokens": 488402.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.063196992874145,
|
|
"epoch": 0.012959880232830952,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.4612,
|
|
"mean_token_accuracy": 0.14302808195352554,
|
|
"num_tokens": 492473.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.110111331939697,
|
|
"epoch": 0.013071603338286391,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.568,
|
|
"mean_token_accuracy": 0.14078054577112198,
|
|
"num_tokens": 496830.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.2420226573944095,
|
|
"epoch": 0.01318332644374183,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.631,
|
|
"mean_token_accuracy": 0.13996972143650055,
|
|
"num_tokens": 500996.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 7.113031435012817,
|
|
"epoch": 0.013295049549197269,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.5937,
|
|
"mean_token_accuracy": 0.13735369965434074,
|
|
"num_tokens": 505174.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.149052238464355,
|
|
"epoch": 0.013406772654652708,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 6.5965,
|
|
"mean_token_accuracy": 0.14343740195035934,
|
|
"num_tokens": 509425.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.0243888854980465,
|
|
"epoch": 0.013518495760108148,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.000302,
|
|
"loss": 6.4748,
|
|
"mean_token_accuracy": 0.14360842779278754,
|
|
"num_tokens": 513271.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.198608922958374,
|
|
"epoch": 0.013630218865563587,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.641,
|
|
"mean_token_accuracy": 0.146073829382658,
|
|
"num_tokens": 517879.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.045837879180908,
|
|
"epoch": 0.013741941971019026,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.6375,
|
|
"mean_token_accuracy": 0.13332991302013397,
|
|
"num_tokens": 522321.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 7.1533918380737305,
|
|
"epoch": 0.013853665076474465,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.4453,
|
|
"mean_token_accuracy": 0.15030871629714965,
|
|
"num_tokens": 525884.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 7.081359624862671,
|
|
"epoch": 0.013965388181929906,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.5558,
|
|
"mean_token_accuracy": 0.145179907977581,
|
|
"num_tokens": 530373.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 7.055321979522705,
|
|
"epoch": 0.014077111287385344,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.3843,
|
|
"mean_token_accuracy": 0.15800822898745537,
|
|
"num_tokens": 534571.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.117716646194458,
|
|
"epoch": 0.014188834392840783,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.5609,
|
|
"mean_token_accuracy": 0.13290601670742036,
|
|
"num_tokens": 538938.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.100346279144287,
|
|
"epoch": 0.014300557498296222,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.5165,
|
|
"mean_token_accuracy": 0.1455024905502796,
|
|
"num_tokens": 542977.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 7.071889972686767,
|
|
"epoch": 0.014412280603751661,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.4884,
|
|
"mean_token_accuracy": 0.1465342827141285,
|
|
"num_tokens": 547005.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 7.0628125190734865,
|
|
"epoch": 0.014524003709207102,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.5272,
|
|
"mean_token_accuracy": 0.13975051417946815,
|
|
"num_tokens": 551361.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 7.023661518096924,
|
|
"epoch": 0.01463572681466254,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.4833,
|
|
"mean_token_accuracy": 0.14340668320655822,
|
|
"num_tokens": 555574.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 6.989748096466064,
|
|
"epoch": 0.01474744992011798,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.5127,
|
|
"mean_token_accuracy": 0.14289377629756927,
|
|
"num_tokens": 560368.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 7.077346086502075,
|
|
"epoch": 0.014859173025573418,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.5634,
|
|
"mean_token_accuracy": 0.1373551793396473,
|
|
"num_tokens": 564884.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 7.1274079322814945,
|
|
"epoch": 0.014970896131028859,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.5586,
|
|
"mean_token_accuracy": 0.1458572693169117,
|
|
"num_tokens": 569494.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.002107572555542,
|
|
"epoch": 0.015082619236484298,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.5547,
|
|
"mean_token_accuracy": 0.13712208420038224,
|
|
"num_tokens": 574161.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 7.072530221939087,
|
|
"epoch": 0.015194342341939737,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.4626,
|
|
"mean_token_accuracy": 0.14953978583216668,
|
|
"num_tokens": 578510.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 6.912496089935303,
|
|
"epoch": 0.015306065447395175,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.4979,
|
|
"mean_token_accuracy": 0.1424515500664711,
|
|
"num_tokens": 582695.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 7.025566148757934,
|
|
"epoch": 0.015417788552850614,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.4376,
|
|
"mean_token_accuracy": 0.13332833126187324,
|
|
"num_tokens": 586622.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 7.013347387313843,
|
|
"epoch": 0.015529511658306055,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.4683,
|
|
"mean_token_accuracy": 0.14548797011375428,
|
|
"num_tokens": 591704.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.755953073501587,
|
|
"epoch": 0.015641234763761492,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.276,
|
|
"mean_token_accuracy": 0.15882887542247773,
|
|
"num_tokens": 596029.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 6.987126111984253,
|
|
"epoch": 0.015752957869216933,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.4419,
|
|
"mean_token_accuracy": 0.14538582488894464,
|
|
"num_tokens": 600115.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.84918212890625,
|
|
"epoch": 0.015864680974672373,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.4459,
|
|
"mean_token_accuracy": 0.14390605613589286,
|
|
"num_tokens": 604488.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 7.084057378768921,
|
|
"epoch": 0.01597640408012781,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.4439,
|
|
"mean_token_accuracy": 0.14581422209739686,
|
|
"num_tokens": 608776.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.7030833721160885,
|
|
"epoch": 0.01608812718558325,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.4504,
|
|
"mean_token_accuracy": 0.15099047794938086,
|
|
"num_tokens": 612771.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 6.734662055969238,
|
|
"epoch": 0.01619985029103869,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.3566,
|
|
"mean_token_accuracy": 0.1516010656952858,
|
|
"num_tokens": 616947.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.9721879959106445,
|
|
"epoch": 0.01631157339649413,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.4281,
|
|
"mean_token_accuracy": 0.15130600407719613,
|
|
"num_tokens": 621064.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 6.856808233261108,
|
|
"epoch": 0.01642329650194957,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.4363,
|
|
"mean_token_accuracy": 0.14606723934412003,
|
|
"num_tokens": 625349.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.863543367385864,
|
|
"epoch": 0.016535019607405006,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.3498,
|
|
"mean_token_accuracy": 0.15212762504816055,
|
|
"num_tokens": 629754.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 6.9013481616973875,
|
|
"epoch": 0.016646742712860447,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.3994,
|
|
"mean_token_accuracy": 0.13600233122706412,
|
|
"num_tokens": 634111.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.783719491958618,
|
|
"epoch": 0.016758465818315887,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.2659,
|
|
"mean_token_accuracy": 0.16154912412166594,
|
|
"num_tokens": 638527.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.681515502929687,
|
|
"epoch": 0.016870188923771325,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.2959,
|
|
"mean_token_accuracy": 0.15193597078323365,
|
|
"num_tokens": 642530.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.832862663269043,
|
|
"epoch": 0.016981912029226765,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.3628,
|
|
"mean_token_accuracy": 0.15937515050172807,
|
|
"num_tokens": 646707.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.755436944961548,
|
|
"epoch": 0.017093635134682202,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.4113,
|
|
"mean_token_accuracy": 0.15184309035539628,
|
|
"num_tokens": 651326.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.871073818206787,
|
|
"epoch": 0.017205358240137643,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.4618,
|
|
"mean_token_accuracy": 0.15124865621328354,
|
|
"num_tokens": 655651.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.727626895904541,
|
|
"epoch": 0.017317081345593083,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.3946,
|
|
"mean_token_accuracy": 0.14350455030798911,
|
|
"num_tokens": 659868.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.921767711639404,
|
|
"epoch": 0.01742880445104852,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.4403,
|
|
"mean_token_accuracy": 0.14834593906998633,
|
|
"num_tokens": 664404.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.699166393280029,
|
|
"epoch": 0.01754052755650396,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.2549,
|
|
"mean_token_accuracy": 0.1540958382189274,
|
|
"num_tokens": 668543.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.646093511581421,
|
|
"epoch": 0.0176522506619594,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.2322,
|
|
"mean_token_accuracy": 0.15864449143409728,
|
|
"num_tokens": 672637.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.7683539390563965,
|
|
"epoch": 0.01776397376741484,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.3473,
|
|
"mean_token_accuracy": 0.15486485213041307,
|
|
"num_tokens": 676929.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.6748552322387695,
|
|
"epoch": 0.01787569687287028,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.3935,
|
|
"mean_token_accuracy": 0.14576203897595405,
|
|
"num_tokens": 681287.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.773136854171753,
|
|
"epoch": 0.017987419978325717,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.2896,
|
|
"mean_token_accuracy": 0.14774783104658126,
|
|
"num_tokens": 685543.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.736042118072509,
|
|
"epoch": 0.018099143083781157,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.2738,
|
|
"mean_token_accuracy": 0.14571748450398445,
|
|
"num_tokens": 689479.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.654993534088135,
|
|
"epoch": 0.018210866189236594,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.2274,
|
|
"mean_token_accuracy": 0.16590481102466584,
|
|
"num_tokens": 693564.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.655237770080566,
|
|
"epoch": 0.018322589294692035,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.3025,
|
|
"mean_token_accuracy": 0.15087175816297532,
|
|
"num_tokens": 697719.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.695311594009399,
|
|
"epoch": 0.018434312400147475,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.4423,
|
|
"mean_token_accuracy": 0.1513037145137787,
|
|
"num_tokens": 701937.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.683870124816894,
|
|
"epoch": 0.018546035505602913,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.3653,
|
|
"mean_token_accuracy": 0.14852157458662987,
|
|
"num_tokens": 706457.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.849027347564697,
|
|
"epoch": 0.018657758611058353,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.38,
|
|
"mean_token_accuracy": 0.15221845954656602,
|
|
"num_tokens": 710538.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.5683678150177,
|
|
"epoch": 0.018769481716513794,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.248,
|
|
"mean_token_accuracy": 0.16236387193202972,
|
|
"num_tokens": 714663.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.697820091247559,
|
|
"epoch": 0.01888120482196923,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.2517,
|
|
"mean_token_accuracy": 0.1534279391169548,
|
|
"num_tokens": 718828.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.571414041519165,
|
|
"epoch": 0.01899292792742467,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.2619,
|
|
"mean_token_accuracy": 0.15856588035821914,
|
|
"num_tokens": 723155.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.808944749832153,
|
|
"epoch": 0.01910465103288011,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.3217,
|
|
"mean_token_accuracy": 0.15492385476827622,
|
|
"num_tokens": 727605.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.5332495212554935,
|
|
"epoch": 0.01921637413833555,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.2777,
|
|
"mean_token_accuracy": 0.14969536513090134,
|
|
"num_tokens": 731510.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.71437783241272,
|
|
"epoch": 0.01932809724379099,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.4619,
|
|
"mean_token_accuracy": 0.1470765456557274,
|
|
"num_tokens": 736048.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.678787279129028,
|
|
"epoch": 0.019439820349246427,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.2512,
|
|
"mean_token_accuracy": 0.15854543596506118,
|
|
"num_tokens": 740714.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.619452238082886,
|
|
"epoch": 0.019551543454701868,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.3257,
|
|
"mean_token_accuracy": 0.15650657266378404,
|
|
"num_tokens": 745167.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.694642496109009,
|
|
"epoch": 0.019663266560157305,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.1537,
|
|
"mean_token_accuracy": 0.16280549690127372,
|
|
"num_tokens": 749401.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.627922868728637,
|
|
"epoch": 0.019774989665612745,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.2224,
|
|
"mean_token_accuracy": 0.16411646455526352,
|
|
"num_tokens": 753554.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.636331701278687,
|
|
"epoch": 0.019886712771068186,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.2725,
|
|
"mean_token_accuracy": 0.14831542521715163,
|
|
"num_tokens": 757744.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.588339996337891,
|
|
"epoch": 0.019998435876523623,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.1633,
|
|
"mean_token_accuracy": 0.15396574288606643,
|
|
"num_tokens": 761858.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.522528743743896,
|
|
"epoch": 0.020110158981979064,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.1908,
|
|
"mean_token_accuracy": 0.16618741899728776,
|
|
"num_tokens": 765852.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.504194116592407,
|
|
"epoch": 0.0202218820874345,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.195,
|
|
"mean_token_accuracy": 0.1562537170946598,
|
|
"num_tokens": 770274.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.654917287826538,
|
|
"epoch": 0.02033360519288994,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.2299,
|
|
"mean_token_accuracy": 0.15019148588180542,
|
|
"num_tokens": 774775.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.611723136901856,
|
|
"epoch": 0.020445328298345382,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.2769,
|
|
"mean_token_accuracy": 0.16535960435867308,
|
|
"num_tokens": 779374.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.6483289241790775,
|
|
"epoch": 0.02055705140380082,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.2519,
|
|
"mean_token_accuracy": 0.1526936858892441,
|
|
"num_tokens": 783537.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.512672090530396,
|
|
"epoch": 0.02066877450925626,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.3001,
|
|
"mean_token_accuracy": 0.15667854249477386,
|
|
"num_tokens": 787562.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.7299144744873045,
|
|
"epoch": 0.0207804976147117,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.3687,
|
|
"mean_token_accuracy": 0.1443271040916443,
|
|
"num_tokens": 792303.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.5588274002075195,
|
|
"epoch": 0.020892220720167137,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.1584,
|
|
"mean_token_accuracy": 0.16573118567466735,
|
|
"num_tokens": 796402.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.613452911376953,
|
|
"epoch": 0.021003943825622578,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.2314,
|
|
"mean_token_accuracy": 0.16508372873067856,
|
|
"num_tokens": 800004.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.341346979141235,
|
|
"epoch": 0.021115666931078015,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.117,
|
|
"mean_token_accuracy": 0.16131858825683593,
|
|
"num_tokens": 803782.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.510941028594971,
|
|
"epoch": 0.021227390036533456,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.1673,
|
|
"mean_token_accuracy": 0.16098449528217315,
|
|
"num_tokens": 808013.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.4510125637054445,
|
|
"epoch": 0.021339113141988896,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.1669,
|
|
"mean_token_accuracy": 0.1604609191417694,
|
|
"num_tokens": 812130.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.501435708999634,
|
|
"epoch": 0.021450836247444333,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.1342,
|
|
"mean_token_accuracy": 0.16159728765487671,
|
|
"num_tokens": 816246.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.510243511199951,
|
|
"epoch": 0.021562559352899774,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.1317,
|
|
"mean_token_accuracy": 0.16103896945714952,
|
|
"num_tokens": 820628.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.387259864807129,
|
|
"epoch": 0.02167428245835521,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.1367,
|
|
"mean_token_accuracy": 0.1684929057955742,
|
|
"num_tokens": 824864.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.61003623008728,
|
|
"epoch": 0.02178600556381065,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.183,
|
|
"mean_token_accuracy": 0.15778308734297752,
|
|
"num_tokens": 828887.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.448976230621338,
|
|
"epoch": 0.021897728669266092,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.0739,
|
|
"mean_token_accuracy": 0.16717422604560853,
|
|
"num_tokens": 832784.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.41447229385376,
|
|
"epoch": 0.02200945177472153,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.0937,
|
|
"mean_token_accuracy": 0.15710036903619767,
|
|
"num_tokens": 837408.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.451285934448242,
|
|
"epoch": 0.02212117488017697,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.1512,
|
|
"mean_token_accuracy": 0.16300837993621825,
|
|
"num_tokens": 841536.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.426507759094238,
|
|
"epoch": 0.022232897985632407,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.0582,
|
|
"mean_token_accuracy": 0.17121603935956956,
|
|
"num_tokens": 846044.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.320924186706543,
|
|
"epoch": 0.022344621091087848,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.1118,
|
|
"mean_token_accuracy": 0.1685657724738121,
|
|
"num_tokens": 850254.0,
|
|
"step": 1000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 4000,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 182438283018240.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|