1035 lines
27 KiB
JSON
1035 lines
27 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.011172310545543924,
|
|
"eval_steps": 500,
|
|
"global_step": 500,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 10.7426682472229,
|
|
"epoch": 0.00011172310545543924,
|
|
"grad_norm": 6.25,
|
|
"learning_rate": 2e-06,
|
|
"loss": 10.5231,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 4250.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 10.742681884765625,
|
|
"epoch": 0.00022344621091087847,
|
|
"grad_norm": 7.03125,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 10.4891,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 8228.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 10.742702770233155,
|
|
"epoch": 0.0003351693163663177,
|
|
"grad_norm": 6.8125,
|
|
"learning_rate": 7e-06,
|
|
"loss": 10.4445,
|
|
"mean_token_accuracy": 0.0003105590119957924,
|
|
"num_tokens": 12209.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 10.742706871032714,
|
|
"epoch": 0.00044689242182175694,
|
|
"grad_norm": 6.15625,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 10.3987,
|
|
"mean_token_accuracy": 0.0004866180010139942,
|
|
"num_tokens": 16225.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 10.742706108093262,
|
|
"epoch": 0.0005586155272771962,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 10.2787,
|
|
"mean_token_accuracy": 0.010022059944458307,
|
|
"num_tokens": 20212.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.742425346374512,
|
|
"epoch": 0.0006703386327326354,
|
|
"grad_norm": 5.0,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.173,
|
|
"mean_token_accuracy": 0.04117634426802397,
|
|
"num_tokens": 24445.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.741415786743165,
|
|
"epoch": 0.0007820617381880747,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 9.9901,
|
|
"mean_token_accuracy": 0.052767305821180346,
|
|
"num_tokens": 28365.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.739409732818604,
|
|
"epoch": 0.0008937848436435139,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 9.9621,
|
|
"mean_token_accuracy": 0.052475782483816145,
|
|
"num_tokens": 33055.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.7363787651062,
|
|
"epoch": 0.0010055079490989532,
|
|
"grad_norm": 2.9375,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 9.811,
|
|
"mean_token_accuracy": 0.062037082761526106,
|
|
"num_tokens": 37599.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.733420372009277,
|
|
"epoch": 0.0011172310545543925,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.6744,
|
|
"mean_token_accuracy": 0.06838746592402459,
|
|
"num_tokens": 41934.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.731625938415528,
|
|
"epoch": 0.0012289541600098315,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.6365,
|
|
"mean_token_accuracy": 0.05915887728333473,
|
|
"num_tokens": 46178.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.729843425750733,
|
|
"epoch": 0.0013406772654652708,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.5494,
|
|
"mean_token_accuracy": 0.0692246112972498,
|
|
"num_tokens": 50513.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.727421474456786,
|
|
"epoch": 0.0014524003709207101,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.5028,
|
|
"mean_token_accuracy": 0.0702465757727623,
|
|
"num_tokens": 54924.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.722854518890381,
|
|
"epoch": 0.0015641234763761494,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4107,
|
|
"mean_token_accuracy": 0.06344567574560642,
|
|
"num_tokens": 59083.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.71631669998169,
|
|
"epoch": 0.0016758465818315885,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.3233,
|
|
"mean_token_accuracy": 0.06774163469672204,
|
|
"num_tokens": 63324.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.705671787261963,
|
|
"epoch": 0.0017875696872870278,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.3567,
|
|
"mean_token_accuracy": 0.06332114227116108,
|
|
"num_tokens": 67738.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.693737983703613,
|
|
"epoch": 0.001899292792742467,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.2866,
|
|
"mean_token_accuracy": 0.06349676214158535,
|
|
"num_tokens": 72305.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.675388622283936,
|
|
"epoch": 0.0020110158981979064,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.0821,
|
|
"mean_token_accuracy": 0.06834135130047798,
|
|
"num_tokens": 76579.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.65205717086792,
|
|
"epoch": 0.0021227390036533456,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 9.0421,
|
|
"mean_token_accuracy": 0.06946654319763183,
|
|
"num_tokens": 80812.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.615971279144286,
|
|
"epoch": 0.002234462109108785,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.9523,
|
|
"mean_token_accuracy": 0.06732719540596008,
|
|
"num_tokens": 85090.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.59008207321167,
|
|
"epoch": 0.0023461852145642242,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.888,
|
|
"mean_token_accuracy": 0.06908667460083961,
|
|
"num_tokens": 89578.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.536420917510986,
|
|
"epoch": 0.002457908320019663,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.789,
|
|
"mean_token_accuracy": 0.0728236336261034,
|
|
"num_tokens": 94117.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 10.488511657714843,
|
|
"epoch": 0.0025696314254751024,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.6132,
|
|
"mean_token_accuracy": 0.07127482630312443,
|
|
"num_tokens": 98082.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 10.439968013763428,
|
|
"epoch": 0.0026813545309305417,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.5714,
|
|
"mean_token_accuracy": 0.07672090865671635,
|
|
"num_tokens": 102327.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 10.355792045593262,
|
|
"epoch": 0.002793077636385981,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.4426,
|
|
"mean_token_accuracy": 0.0740627009421587,
|
|
"num_tokens": 106567.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 10.286309623718262,
|
|
"epoch": 0.0029048007418414202,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.3003,
|
|
"mean_token_accuracy": 0.07362989187240601,
|
|
"num_tokens": 110654.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 10.204053020477295,
|
|
"epoch": 0.0030165238472968595,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.2511,
|
|
"mean_token_accuracy": 0.062000279501080516,
|
|
"num_tokens": 114679.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 10.102067852020264,
|
|
"epoch": 0.003128246952752299,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 8.1849,
|
|
"mean_token_accuracy": 0.06811538599431514,
|
|
"num_tokens": 118817.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 9.926943397521972,
|
|
"epoch": 0.003239970058207738,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 8.0767,
|
|
"mean_token_accuracy": 0.06979594528675079,
|
|
"num_tokens": 123188.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 9.793034744262695,
|
|
"epoch": 0.003351693163663177,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.981,
|
|
"mean_token_accuracy": 0.06847230046987533,
|
|
"num_tokens": 127767.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 9.643688774108886,
|
|
"epoch": 0.0034634162691186163,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.7945,
|
|
"mean_token_accuracy": 0.06945906654000282,
|
|
"num_tokens": 131837.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 9.430543518066406,
|
|
"epoch": 0.0035751393745740555,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.7734,
|
|
"mean_token_accuracy": 0.07027286775410176,
|
|
"num_tokens": 136247.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 9.239261722564697,
|
|
"epoch": 0.003686862480029495,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.5788,
|
|
"mean_token_accuracy": 0.07950169630348683,
|
|
"num_tokens": 140170.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.976140880584717,
|
|
"epoch": 0.003798585585484934,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.6177,
|
|
"mean_token_accuracy": 0.07785017378628253,
|
|
"num_tokens": 144139.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.843453693389893,
|
|
"epoch": 0.003910308690940373,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.5659,
|
|
"mean_token_accuracy": 0.07487303391098976,
|
|
"num_tokens": 148792.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.658325004577637,
|
|
"epoch": 0.004022031796395813,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.4988,
|
|
"mean_token_accuracy": 0.07942216768860817,
|
|
"num_tokens": 152844.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.59526195526123,
|
|
"epoch": 0.0041337549018512516,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.527,
|
|
"mean_token_accuracy": 0.07417443916201591,
|
|
"num_tokens": 157366.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 8.467089462280274,
|
|
"epoch": 0.004245478007306691,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.3623,
|
|
"mean_token_accuracy": 0.07755868881940842,
|
|
"num_tokens": 161348.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 8.307873630523682,
|
|
"epoch": 0.00435720111276213,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.3815,
|
|
"mean_token_accuracy": 0.08716461397707462,
|
|
"num_tokens": 165647.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 8.236515140533447,
|
|
"epoch": 0.00446892421821757,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.2754,
|
|
"mean_token_accuracy": 0.08076057620346547,
|
|
"num_tokens": 169521.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 8.256762790679932,
|
|
"epoch": 0.004580647323673009,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.3426,
|
|
"mean_token_accuracy": 0.0812241055071354,
|
|
"num_tokens": 173466.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 8.131280899047852,
|
|
"epoch": 0.0046923704291284484,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.2826,
|
|
"mean_token_accuracy": 0.07643571458756923,
|
|
"num_tokens": 177663.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 8.097990989685059,
|
|
"epoch": 0.004804093534583887,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.2745,
|
|
"mean_token_accuracy": 0.08235705867409707,
|
|
"num_tokens": 181778.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 8.089111948013306,
|
|
"epoch": 0.004915816640039326,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.2736,
|
|
"mean_token_accuracy": 0.08633389472961425,
|
|
"num_tokens": 185525.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 8.083420944213866,
|
|
"epoch": 0.005027539745494766,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.153,
|
|
"mean_token_accuracy": 0.08806331530213356,
|
|
"num_tokens": 189418.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.933328151702881,
|
|
"epoch": 0.005139262850950205,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.2217,
|
|
"mean_token_accuracy": 0.08842612579464912,
|
|
"num_tokens": 193494.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 8.018900680541993,
|
|
"epoch": 0.0052509859564056445,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.2661,
|
|
"mean_token_accuracy": 0.08137304298579692,
|
|
"num_tokens": 198018.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.955441856384278,
|
|
"epoch": 0.005362709061861083,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.1847,
|
|
"mean_token_accuracy": 0.08625513166189194,
|
|
"num_tokens": 202296.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.9594367980957035,
|
|
"epoch": 0.005474432167316523,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.1706,
|
|
"mean_token_accuracy": 0.08195730969309807,
|
|
"num_tokens": 206694.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.792031574249267,
|
|
"epoch": 0.005586155272771962,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.2007,
|
|
"mean_token_accuracy": 0.08904931843280792,
|
|
"num_tokens": 210810.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.920461797714234,
|
|
"epoch": 0.005697878378227402,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.1818,
|
|
"mean_token_accuracy": 0.0905133418738842,
|
|
"num_tokens": 215044.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.8493430614471436,
|
|
"epoch": 0.0058096014836828405,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.28,
|
|
"mean_token_accuracy": 0.08591654896736145,
|
|
"num_tokens": 219235.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.84934287071228,
|
|
"epoch": 0.005921324589138279,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.0922,
|
|
"mean_token_accuracy": 0.0903876356780529,
|
|
"num_tokens": 223639.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.785561227798462,
|
|
"epoch": 0.006033047694593719,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.1258,
|
|
"mean_token_accuracy": 0.09057728350162506,
|
|
"num_tokens": 227873.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.707937574386596,
|
|
"epoch": 0.006144770800049158,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.0661,
|
|
"mean_token_accuracy": 0.09807337448000908,
|
|
"num_tokens": 232147.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.739069509506225,
|
|
"epoch": 0.006256493905504598,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.1358,
|
|
"mean_token_accuracy": 0.09250000454485416,
|
|
"num_tokens": 236456.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.7190714359283445,
|
|
"epoch": 0.0063682170109600365,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.1583,
|
|
"mean_token_accuracy": 0.09051149562001229,
|
|
"num_tokens": 241039.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.938947439193726,
|
|
"epoch": 0.006479940116415476,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.1915,
|
|
"mean_token_accuracy": 0.08653632178902626,
|
|
"num_tokens": 245132.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.673107481002807,
|
|
"epoch": 0.006591663221870915,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.0872,
|
|
"mean_token_accuracy": 0.09988043382763863,
|
|
"num_tokens": 249152.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.712965631484986,
|
|
"epoch": 0.006703386327326354,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.0503,
|
|
"mean_token_accuracy": 0.09596830010414123,
|
|
"num_tokens": 253439.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.6600532054901125,
|
|
"epoch": 0.006815109432781794,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.0731,
|
|
"mean_token_accuracy": 0.09302671104669571,
|
|
"num_tokens": 258066.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.665358448028565,
|
|
"epoch": 0.0069268325382372325,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.0332,
|
|
"mean_token_accuracy": 0.0973147690296173,
|
|
"num_tokens": 261954.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.616210794448852,
|
|
"epoch": 0.007038555643692672,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.0779,
|
|
"mean_token_accuracy": 0.10462095588445663,
|
|
"num_tokens": 266650.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.689846324920654,
|
|
"epoch": 0.007150278749148111,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.1433,
|
|
"mean_token_accuracy": 0.09891897812485695,
|
|
"num_tokens": 271069.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.705677938461304,
|
|
"epoch": 0.007262001854603551,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.0039,
|
|
"mean_token_accuracy": 0.10242248028516769,
|
|
"num_tokens": 275084.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.603102445602417,
|
|
"epoch": 0.00737372496005899,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.0745,
|
|
"mean_token_accuracy": 0.1031483568251133,
|
|
"num_tokens": 279721.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.619607782363891,
|
|
"epoch": 0.007485448065514429,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.0708,
|
|
"mean_token_accuracy": 0.10527726709842682,
|
|
"num_tokens": 284317.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.600710487365722,
|
|
"epoch": 0.007597171170969868,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.0451,
|
|
"mean_token_accuracy": 0.10766607597470283,
|
|
"num_tokens": 288870.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.61973729133606,
|
|
"epoch": 0.007708894276425307,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 6.9812,
|
|
"mean_token_accuracy": 0.11351362988352776,
|
|
"num_tokens": 292996.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.5854551792144775,
|
|
"epoch": 0.007820617381880746,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 6.9806,
|
|
"mean_token_accuracy": 0.10384939089417458,
|
|
"num_tokens": 297238.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.531381893157959,
|
|
"epoch": 0.007932340487336187,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000177,
|
|
"loss": 6.9793,
|
|
"mean_token_accuracy": 0.1117280475795269,
|
|
"num_tokens": 301453.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.653309726715088,
|
|
"epoch": 0.008044063592791625,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 6.9327,
|
|
"mean_token_accuracy": 0.10786554217338562,
|
|
"num_tokens": 305949.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.534815788269043,
|
|
"epoch": 0.008155786698247064,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000182,
|
|
"loss": 6.9583,
|
|
"mean_token_accuracy": 0.11513907313346863,
|
|
"num_tokens": 310097.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.547474193572998,
|
|
"epoch": 0.008267509803702503,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 6.9517,
|
|
"mean_token_accuracy": 0.10539396926760673,
|
|
"num_tokens": 314567.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.457708692550659,
|
|
"epoch": 0.008379232909157944,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.0323,
|
|
"mean_token_accuracy": 0.10927818715572357,
|
|
"num_tokens": 319166.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.515052604675293,
|
|
"epoch": 0.008490956014613383,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 6.9204,
|
|
"mean_token_accuracy": 0.11223233640193939,
|
|
"num_tokens": 323682.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.488761281967163,
|
|
"epoch": 0.008602679120068821,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000192,
|
|
"loss": 6.8299,
|
|
"mean_token_accuracy": 0.12143486216664315,
|
|
"num_tokens": 327994.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.412152099609375,
|
|
"epoch": 0.00871440222552426,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 6.9058,
|
|
"mean_token_accuracy": 0.11854805946350097,
|
|
"num_tokens": 332026.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.578387832641601,
|
|
"epoch": 0.0088261253309797,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 6.9475,
|
|
"mean_token_accuracy": 0.11811894848942757,
|
|
"num_tokens": 336552.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.504688882827759,
|
|
"epoch": 0.00893784843643514,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 6.8884,
|
|
"mean_token_accuracy": 0.1116393692791462,
|
|
"num_tokens": 340643.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.4580738067626955,
|
|
"epoch": 0.009049571541890579,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000202,
|
|
"loss": 6.8029,
|
|
"mean_token_accuracy": 0.12075437754392623,
|
|
"num_tokens": 344886.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.3586314678192135,
|
|
"epoch": 0.009161294647346017,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 6.8632,
|
|
"mean_token_accuracy": 0.1191755935549736,
|
|
"num_tokens": 349115.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.527571535110473,
|
|
"epoch": 0.009273017752801456,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000207,
|
|
"loss": 6.8235,
|
|
"mean_token_accuracy": 0.12523479163646697,
|
|
"num_tokens": 353368.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.509571599960327,
|
|
"epoch": 0.009384740858256897,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 6.8657,
|
|
"mean_token_accuracy": 0.10757644474506378,
|
|
"num_tokens": 357382.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.441834354400635,
|
|
"epoch": 0.009496463963712336,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000212,
|
|
"loss": 6.7799,
|
|
"mean_token_accuracy": 0.12460733354091644,
|
|
"num_tokens": 361542.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.419776153564453,
|
|
"epoch": 0.009608187069167775,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.84,
|
|
"mean_token_accuracy": 0.11263928636908531,
|
|
"num_tokens": 366006.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.367758464813233,
|
|
"epoch": 0.009719910174623213,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.7506,
|
|
"mean_token_accuracy": 0.12978531718254088,
|
|
"num_tokens": 370021.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.408233594894409,
|
|
"epoch": 0.009831633280078652,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 6.8773,
|
|
"mean_token_accuracy": 0.12414649501442909,
|
|
"num_tokens": 374434.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.441655158996582,
|
|
"epoch": 0.009943356385534093,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.9027,
|
|
"mean_token_accuracy": 0.11324851140379906,
|
|
"num_tokens": 378634.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.315918588638306,
|
|
"epoch": 0.010055079490989532,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.7869,
|
|
"mean_token_accuracy": 0.1252473659813404,
|
|
"num_tokens": 382904.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.470485210418701,
|
|
"epoch": 0.01016680259644497,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.7635,
|
|
"mean_token_accuracy": 0.12090180814266205,
|
|
"num_tokens": 386970.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.348088216781616,
|
|
"epoch": 0.01027852570190041,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.729,
|
|
"mean_token_accuracy": 0.13390202075242996,
|
|
"num_tokens": 391043.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.392842721939087,
|
|
"epoch": 0.01039024880735585,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.7204,
|
|
"mean_token_accuracy": 0.13383440747857095,
|
|
"num_tokens": 395413.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.40152382850647,
|
|
"epoch": 0.010501971912811289,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 6.8385,
|
|
"mean_token_accuracy": 0.12566340565681458,
|
|
"num_tokens": 399821.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.2655271053314205,
|
|
"epoch": 0.010613695018266728,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.6582,
|
|
"mean_token_accuracy": 0.13715523406863211,
|
|
"num_tokens": 404043.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.422811889648438,
|
|
"epoch": 0.010725418123722167,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 6.803,
|
|
"mean_token_accuracy": 0.1260749615728855,
|
|
"num_tokens": 408339.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.263138484954834,
|
|
"epoch": 0.010837141229177606,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.6856,
|
|
"mean_token_accuracy": 0.13459724336862564,
|
|
"num_tokens": 412384.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.3362548828125,
|
|
"epoch": 0.010948864334633046,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.8108,
|
|
"mean_token_accuracy": 0.12614913210272788,
|
|
"num_tokens": 416891.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.36023063659668,
|
|
"epoch": 0.011060587440088485,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.6642,
|
|
"mean_token_accuracy": 0.1329216368496418,
|
|
"num_tokens": 420980.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.2991362571716305,
|
|
"epoch": 0.011172310545543924,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.7212,
|
|
"mean_token_accuracy": 0.13121648952364923,
|
|
"num_tokens": 425454.0,
|
|
"step": 500
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 4000,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 91570824806400.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|