2057 lines
54 KiB
JSON
2057 lines
54 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.8594757198109153,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 1000,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 10.742608070373535,
|
||
|
|
"epoch": 0.004297378599054577,
|
||
|
|
"grad_norm": 5.46875,
|
||
|
|
"learning_rate": 2e-06,
|
||
|
|
"loss": 10.7643,
|
||
|
|
"mean_token_accuracy": 7.587253348901868e-05,
|
||
|
|
"num_tokens": 10107.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742630290985108,
|
||
|
|
"epoch": 0.008594757198109154,
|
||
|
|
"grad_norm": 5.78125,
|
||
|
|
"learning_rate": 4.5e-06,
|
||
|
|
"loss": 10.7086,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 18391.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.74263505935669,
|
||
|
|
"epoch": 0.01289213579716373,
|
||
|
|
"grad_norm": 5.3125,
|
||
|
|
"learning_rate": 7e-06,
|
||
|
|
"loss": 10.6888,
|
||
|
|
"mean_token_accuracy": 7.022471982054412e-05,
|
||
|
|
"num_tokens": 27061.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742604160308838,
|
||
|
|
"epoch": 0.017189514396218308,
|
||
|
|
"grad_norm": 6.0,
|
||
|
|
"learning_rate": 9.5e-06,
|
||
|
|
"loss": 10.6611,
|
||
|
|
"mean_token_accuracy": 0.0008422504703048617,
|
||
|
|
"num_tokens": 36339.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742517948150635,
|
||
|
|
"epoch": 0.021486892995272882,
|
||
|
|
"grad_norm": 4.75,
|
||
|
|
"learning_rate": 1.2e-05,
|
||
|
|
"loss": 10.5317,
|
||
|
|
"mean_token_accuracy": 0.02025789166800678,
|
||
|
|
"num_tokens": 45770.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.741962242126466,
|
||
|
|
"epoch": 0.02578427159432746,
|
||
|
|
"grad_norm": 4.25,
|
||
|
|
"learning_rate": 1.4500000000000002e-05,
|
||
|
|
"loss": 10.399,
|
||
|
|
"mean_token_accuracy": 0.04876907132565975,
|
||
|
|
"num_tokens": 54575.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.73945140838623,
|
||
|
|
"epoch": 0.030081650193382038,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 1.7000000000000003e-05,
|
||
|
|
"loss": 10.3065,
|
||
|
|
"mean_token_accuracy": 0.0514072135090828,
|
||
|
|
"num_tokens": 66403.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.730937385559082,
|
||
|
|
"epoch": 0.034379028792436615,
|
||
|
|
"grad_norm": 2.640625,
|
||
|
|
"learning_rate": 1.95e-05,
|
||
|
|
"loss": 10.0976,
|
||
|
|
"mean_token_accuracy": 0.05973539762198925,
|
||
|
|
"num_tokens": 76510.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.715238952636719,
|
||
|
|
"epoch": 0.03867640739149119,
|
||
|
|
"grad_norm": 2.40625,
|
||
|
|
"learning_rate": 2.2e-05,
|
||
|
|
"loss": 9.9688,
|
||
|
|
"mean_token_accuracy": 0.05614017099142075,
|
||
|
|
"num_tokens": 84836.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.702037715911866,
|
||
|
|
"epoch": 0.042973785990545764,
|
||
|
|
"grad_norm": 2.046875,
|
||
|
|
"learning_rate": 2.4500000000000003e-05,
|
||
|
|
"loss": 9.9015,
|
||
|
|
"mean_token_accuracy": 0.053829558193683624,
|
||
|
|
"num_tokens": 93197.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.697910690307618,
|
||
|
|
"epoch": 0.047271164589600345,
|
||
|
|
"grad_norm": 2.40625,
|
||
|
|
"learning_rate": 2.7e-05,
|
||
|
|
"loss": 9.8366,
|
||
|
|
"mean_token_accuracy": 0.05843428298830986,
|
||
|
|
"num_tokens": 101546.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.693470478057861,
|
||
|
|
"epoch": 0.05156854318865492,
|
||
|
|
"grad_norm": 1.9609375,
|
||
|
|
"learning_rate": 2.95e-05,
|
||
|
|
"loss": 9.8429,
|
||
|
|
"mean_token_accuracy": 0.0558084711432457,
|
||
|
|
"num_tokens": 111703.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.680869865417481,
|
||
|
|
"epoch": 0.055865921787709494,
|
||
|
|
"grad_norm": 1.9453125,
|
||
|
|
"learning_rate": 3.2e-05,
|
||
|
|
"loss": 9.7131,
|
||
|
|
"mean_token_accuracy": 0.0589165486395359,
|
||
|
|
"num_tokens": 119894.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.668927574157715,
|
||
|
|
"epoch": 0.060163300386764075,
|
||
|
|
"grad_norm": 1.9765625,
|
||
|
|
"learning_rate": 3.4500000000000005e-05,
|
||
|
|
"loss": 9.6682,
|
||
|
|
"mean_token_accuracy": 0.06148771904408932,
|
||
|
|
"num_tokens": 128885.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.654484272003174,
|
||
|
|
"epoch": 0.06446067898581866,
|
||
|
|
"grad_norm": 1.953125,
|
||
|
|
"learning_rate": 3.7e-05,
|
||
|
|
"loss": 9.6297,
|
||
|
|
"mean_token_accuracy": 0.057728851959109304,
|
||
|
|
"num_tokens": 138106.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.645826625823975,
|
||
|
|
"epoch": 0.06875805758487323,
|
||
|
|
"grad_norm": 1.9296875,
|
||
|
|
"learning_rate": 3.95e-05,
|
||
|
|
"loss": 9.5722,
|
||
|
|
"mean_token_accuracy": 0.058954347297549246,
|
||
|
|
"num_tokens": 146691.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.637816619873046,
|
||
|
|
"epoch": 0.0730554361839278,
|
||
|
|
"grad_norm": 1.90625,
|
||
|
|
"learning_rate": 4.2000000000000004e-05,
|
||
|
|
"loss": 9.5126,
|
||
|
|
"mean_token_accuracy": 0.059067190065979956,
|
||
|
|
"num_tokens": 155792.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.63103084564209,
|
||
|
|
"epoch": 0.07735281478298238,
|
||
|
|
"grad_norm": 1.7890625,
|
||
|
|
"learning_rate": 4.45e-05,
|
||
|
|
"loss": 9.5251,
|
||
|
|
"mean_token_accuracy": 0.0552229531109333,
|
||
|
|
"num_tokens": 166944.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.616693305969239,
|
||
|
|
"epoch": 0.08165019338203695,
|
||
|
|
"grad_norm": 1.96875,
|
||
|
|
"learning_rate": 4.7000000000000004e-05,
|
||
|
|
"loss": 9.3423,
|
||
|
|
"mean_token_accuracy": 0.060124922543764114,
|
||
|
|
"num_tokens": 175303.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.591300106048584,
|
||
|
|
"epoch": 0.08594757198109153,
|
||
|
|
"grad_norm": 1.8203125,
|
||
|
|
"learning_rate": 4.9500000000000004e-05,
|
||
|
|
"loss": 9.3133,
|
||
|
|
"mean_token_accuracy": 0.06174388714134693,
|
||
|
|
"num_tokens": 184708.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.564336776733398,
|
||
|
|
"epoch": 0.09024495058014612,
|
||
|
|
"grad_norm": 1.7890625,
|
||
|
|
"learning_rate": 5.2e-05,
|
||
|
|
"loss": 9.2307,
|
||
|
|
"mean_token_accuracy": 0.0674959484487772,
|
||
|
|
"num_tokens": 193835.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.52622423171997,
|
||
|
|
"epoch": 0.09454232917920069,
|
||
|
|
"grad_norm": 1.8828125,
|
||
|
|
"learning_rate": 5.45e-05,
|
||
|
|
"loss": 9.1379,
|
||
|
|
"mean_token_accuracy": 0.07480009235441684,
|
||
|
|
"num_tokens": 203344.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.454349136352539,
|
||
|
|
"epoch": 0.09883970777825526,
|
||
|
|
"grad_norm": 1.6171875,
|
||
|
|
"learning_rate": 5.7e-05,
|
||
|
|
"loss": 9.1209,
|
||
|
|
"mean_token_accuracy": 0.06218625903129578,
|
||
|
|
"num_tokens": 213048.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.415324211120605,
|
||
|
|
"epoch": 0.10313708637730984,
|
||
|
|
"grad_norm": 1.578125,
|
||
|
|
"learning_rate": 5.9499999999999996e-05,
|
||
|
|
"loss": 8.9306,
|
||
|
|
"mean_token_accuracy": 0.07533645890653133,
|
||
|
|
"num_tokens": 221784.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.303644943237305,
|
||
|
|
"epoch": 0.10743446497636441,
|
||
|
|
"grad_norm": 1.4765625,
|
||
|
|
"learning_rate": 6.2e-05,
|
||
|
|
"loss": 8.8509,
|
||
|
|
"mean_token_accuracy": 0.07504003196954727,
|
||
|
|
"num_tokens": 230971.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.209668159484863,
|
||
|
|
"epoch": 0.11173184357541899,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 6.450000000000001e-05,
|
||
|
|
"loss": 8.7412,
|
||
|
|
"mean_token_accuracy": 0.07478504739701748,
|
||
|
|
"num_tokens": 240524.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.153745365142822,
|
||
|
|
"epoch": 0.11602922217447358,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 6.7e-05,
|
||
|
|
"loss": 8.6323,
|
||
|
|
"mean_token_accuracy": 0.07354197278618813,
|
||
|
|
"num_tokens": 249220.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.068094253540039,
|
||
|
|
"epoch": 0.12032660077352815,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 6.950000000000001e-05,
|
||
|
|
"loss": 8.61,
|
||
|
|
"mean_token_accuracy": 0.07049238979816437,
|
||
|
|
"num_tokens": 258934.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.973960685729981,
|
||
|
|
"epoch": 0.12462397937258272,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 7.2e-05,
|
||
|
|
"loss": 8.4673,
|
||
|
|
"mean_token_accuracy": 0.07534252405166626,
|
||
|
|
"num_tokens": 267680.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.815561103820801,
|
||
|
|
"epoch": 0.1289213579716373,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 7.45e-05,
|
||
|
|
"loss": 8.3709,
|
||
|
|
"mean_token_accuracy": 0.07952065020799637,
|
||
|
|
"num_tokens": 276227.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.66996259689331,
|
||
|
|
"epoch": 0.1332187365706919,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 7.7e-05,
|
||
|
|
"loss": 8.2269,
|
||
|
|
"mean_token_accuracy": 0.08225171342492103,
|
||
|
|
"num_tokens": 286342.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.510671615600586,
|
||
|
|
"epoch": 0.13751611516974646,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 7.950000000000001e-05,
|
||
|
|
"loss": 8.1921,
|
||
|
|
"mean_token_accuracy": 0.0742720566689968,
|
||
|
|
"num_tokens": 294994.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.346861934661865,
|
||
|
|
"epoch": 0.14181349376880104,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 8.2e-05,
|
||
|
|
"loss": 8.113,
|
||
|
|
"mean_token_accuracy": 0.08004417940974236,
|
||
|
|
"num_tokens": 303882.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.199288940429687,
|
||
|
|
"epoch": 0.1461108723678556,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 8.450000000000001e-05,
|
||
|
|
"loss": 8.0403,
|
||
|
|
"mean_token_accuracy": 0.07799897268414498,
|
||
|
|
"num_tokens": 312515.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.978620052337646,
|
||
|
|
"epoch": 0.15040825096691018,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 8.7e-05,
|
||
|
|
"loss": 7.9977,
|
||
|
|
"mean_token_accuracy": 0.07381256259977817,
|
||
|
|
"num_tokens": 320801.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.861582374572754,
|
||
|
|
"epoch": 0.15470562956596476,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 8.95e-05,
|
||
|
|
"loss": 7.9642,
|
||
|
|
"mean_token_accuracy": 0.08192512467503547,
|
||
|
|
"num_tokens": 329382.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.755144786834716,
|
||
|
|
"epoch": 0.15900300816501933,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 9.2e-05,
|
||
|
|
"loss": 7.9273,
|
||
|
|
"mean_token_accuracy": 0.07583913430571557,
|
||
|
|
"num_tokens": 337894.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.582227611541748,
|
||
|
|
"epoch": 0.1633003867640739,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 9.45e-05,
|
||
|
|
"loss": 7.9012,
|
||
|
|
"mean_token_accuracy": 0.07614588961005211,
|
||
|
|
"num_tokens": 346380.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.591823768615722,
|
||
|
|
"epoch": 0.16759776536312848,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 9.7e-05,
|
||
|
|
"loss": 7.9407,
|
||
|
|
"mean_token_accuracy": 0.07390806600451469,
|
||
|
|
"num_tokens": 356305.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.515201950073243,
|
||
|
|
"epoch": 0.17189514396218306,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 9.95e-05,
|
||
|
|
"loss": 7.8901,
|
||
|
|
"mean_token_accuracy": 0.07247771993279457,
|
||
|
|
"num_tokens": 364899.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.457213211059571,
|
||
|
|
"epoch": 0.17619252256123766,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000102,
|
||
|
|
"loss": 7.8566,
|
||
|
|
"mean_token_accuracy": 0.0781160645186901,
|
||
|
|
"num_tokens": 373663.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.381179523468017,
|
||
|
|
"epoch": 0.18048990116029223,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.00010449999999999999,
|
||
|
|
"loss": 7.8221,
|
||
|
|
"mean_token_accuracy": 0.07758632972836495,
|
||
|
|
"num_tokens": 382730.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.390653896331788,
|
||
|
|
"epoch": 0.1847872797593468,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.000107,
|
||
|
|
"loss": 7.8622,
|
||
|
|
"mean_token_accuracy": 0.071787304058671,
|
||
|
|
"num_tokens": 392676.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.255177211761474,
|
||
|
|
"epoch": 0.18908465835840138,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0001095,
|
||
|
|
"loss": 7.8473,
|
||
|
|
"mean_token_accuracy": 0.08185218423604965,
|
||
|
|
"num_tokens": 401050.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.367721462249756,
|
||
|
|
"epoch": 0.19338203695745596,
|
||
|
|
"grad_norm": 0.796875,
|
||
|
|
"learning_rate": 0.000112,
|
||
|
|
"loss": 7.795,
|
||
|
|
"mean_token_accuracy": 0.07991239950060844,
|
||
|
|
"num_tokens": 410009.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.268333339691162,
|
||
|
|
"epoch": 0.19767941555651053,
|
||
|
|
"grad_norm": 0.859375,
|
||
|
|
"learning_rate": 0.0001145,
|
||
|
|
"loss": 7.7757,
|
||
|
|
"mean_token_accuracy": 0.08171008005738259,
|
||
|
|
"num_tokens": 419302.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.304029846191407,
|
||
|
|
"epoch": 0.2019767941555651,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.00011700000000000001,
|
||
|
|
"loss": 7.6812,
|
||
|
|
"mean_token_accuracy": 0.08820762410759926,
|
||
|
|
"num_tokens": 427296.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.16576337814331,
|
||
|
|
"epoch": 0.20627417275461968,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00011949999999999999,
|
||
|
|
"loss": 7.8198,
|
||
|
|
"mean_token_accuracy": 0.07870872803032399,
|
||
|
|
"num_tokens": 436368.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.189785575866699,
|
||
|
|
"epoch": 0.21057155135367425,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.000122,
|
||
|
|
"loss": 7.7389,
|
||
|
|
"mean_token_accuracy": 0.08551637679338456,
|
||
|
|
"num_tokens": 445535.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.265625381469727,
|
||
|
|
"epoch": 0.21486892995272883,
|
||
|
|
"grad_norm": 0.8671875,
|
||
|
|
"learning_rate": 0.0001245,
|
||
|
|
"loss": 7.7093,
|
||
|
|
"mean_token_accuracy": 0.07919453792273998,
|
||
|
|
"num_tokens": 454769.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.1545090675354,
|
||
|
|
"epoch": 0.2191663085517834,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000127,
|
||
|
|
"loss": 7.7315,
|
||
|
|
"mean_token_accuracy": 0.0871740497648716,
|
||
|
|
"num_tokens": 463975.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.13952112197876,
|
||
|
|
"epoch": 0.22346368715083798,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0001295,
|
||
|
|
"loss": 7.726,
|
||
|
|
"mean_token_accuracy": 0.08799278363585472,
|
||
|
|
"num_tokens": 472899.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.196070003509522,
|
||
|
|
"epoch": 0.22776106574989258,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000132,
|
||
|
|
"loss": 7.7354,
|
||
|
|
"mean_token_accuracy": 0.08013860881328583,
|
||
|
|
"num_tokens": 481556.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.114658737182618,
|
||
|
|
"epoch": 0.23205844434894715,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.00013450000000000002,
|
||
|
|
"loss": 7.7023,
|
||
|
|
"mean_token_accuracy": 0.0854449674487114,
|
||
|
|
"num_tokens": 490253.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.193334579467773,
|
||
|
|
"epoch": 0.23635582294800173,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00013700000000000002,
|
||
|
|
"loss": 7.7066,
|
||
|
|
"mean_token_accuracy": 0.0806311085820198,
|
||
|
|
"num_tokens": 498444.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.104936504364014,
|
||
|
|
"epoch": 0.2406532015470563,
|
||
|
|
"grad_norm": 0.8046875,
|
||
|
|
"learning_rate": 0.0001395,
|
||
|
|
"loss": 7.6467,
|
||
|
|
"mean_token_accuracy": 0.08675235286355018,
|
||
|
|
"num_tokens": 508330.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.113396596908569,
|
||
|
|
"epoch": 0.24495058014611087,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00014199999999999998,
|
||
|
|
"loss": 7.7405,
|
||
|
|
"mean_token_accuracy": 0.08165572881698609,
|
||
|
|
"num_tokens": 517900.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.046846723556518,
|
||
|
|
"epoch": 0.24924795874516545,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0001445,
|
||
|
|
"loss": 7.6901,
|
||
|
|
"mean_token_accuracy": 0.08230286985635757,
|
||
|
|
"num_tokens": 527808.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.13338761329651,
|
||
|
|
"epoch": 0.25354533734422,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.000147,
|
||
|
|
"loss": 7.6711,
|
||
|
|
"mean_token_accuracy": 0.08156475871801376,
|
||
|
|
"num_tokens": 536931.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.18837013244629,
|
||
|
|
"epoch": 0.2578427159432746,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0001495,
|
||
|
|
"loss": 7.7049,
|
||
|
|
"mean_token_accuracy": 0.0835341140627861,
|
||
|
|
"num_tokens": 545758.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.025089168548584,
|
||
|
|
"epoch": 0.26214009454232917,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.000152,
|
||
|
|
"loss": 7.7131,
|
||
|
|
"mean_token_accuracy": 0.08242038711905479,
|
||
|
|
"num_tokens": 555165.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.155539417266846,
|
||
|
|
"epoch": 0.2664374731413838,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 0.00015450000000000001,
|
||
|
|
"loss": 7.6144,
|
||
|
|
"mean_token_accuracy": 0.08789716809988021,
|
||
|
|
"num_tokens": 564719.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.041153383255004,
|
||
|
|
"epoch": 0.2707348517404383,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000157,
|
||
|
|
"loss": 7.594,
|
||
|
|
"mean_token_accuracy": 0.09155945181846618,
|
||
|
|
"num_tokens": 573572.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.15259666442871,
|
||
|
|
"epoch": 0.2750322303394929,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0001595,
|
||
|
|
"loss": 7.7634,
|
||
|
|
"mean_token_accuracy": 0.08318910300731659,
|
||
|
|
"num_tokens": 581497.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.100253248214722,
|
||
|
|
"epoch": 0.27932960893854747,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000162,
|
||
|
|
"loss": 7.6118,
|
||
|
|
"mean_token_accuracy": 0.08767011985182763,
|
||
|
|
"num_tokens": 591107.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.984478855133057,
|
||
|
|
"epoch": 0.28362698753760207,
|
||
|
|
"grad_norm": 0.84765625,
|
||
|
|
"learning_rate": 0.00016450000000000001,
|
||
|
|
"loss": 7.6456,
|
||
|
|
"mean_token_accuracy": 0.08353794142603874,
|
||
|
|
"num_tokens": 600241.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.057686376571656,
|
||
|
|
"epoch": 0.2879243661366566,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00016700000000000002,
|
||
|
|
"loss": 7.5776,
|
||
|
|
"mean_token_accuracy": 0.08751234114170074,
|
||
|
|
"num_tokens": 608697.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.016141748428344,
|
||
|
|
"epoch": 0.2922217447357112,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.00016950000000000003,
|
||
|
|
"loss": 7.568,
|
||
|
|
"mean_token_accuracy": 0.09023259431123734,
|
||
|
|
"num_tokens": 617275.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.084819841384888,
|
||
|
|
"epoch": 0.29651912333476577,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.00017199999999999998,
|
||
|
|
"loss": 7.6405,
|
||
|
|
"mean_token_accuracy": 0.08630914464592934,
|
||
|
|
"num_tokens": 626644.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.008595705032349,
|
||
|
|
"epoch": 0.30081650193382037,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.00017449999999999999,
|
||
|
|
"loss": 7.5665,
|
||
|
|
"mean_token_accuracy": 0.08766811862587928,
|
||
|
|
"num_tokens": 635110.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.04712610244751,
|
||
|
|
"epoch": 0.30511388053287497,
|
||
|
|
"grad_norm": 0.87109375,
|
||
|
|
"learning_rate": 0.000177,
|
||
|
|
"loss": 7.7031,
|
||
|
|
"mean_token_accuracy": 0.08570141717791557,
|
||
|
|
"num_tokens": 644746.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.179811954498291,
|
||
|
|
"epoch": 0.3094112591319295,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0001795,
|
||
|
|
"loss": 7.5831,
|
||
|
|
"mean_token_accuracy": 0.08595824986696243,
|
||
|
|
"num_tokens": 654281.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.987443113327027,
|
||
|
|
"epoch": 0.3137086377309841,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.000182,
|
||
|
|
"loss": 7.585,
|
||
|
|
"mean_token_accuracy": 0.09283285215497017,
|
||
|
|
"num_tokens": 663174.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.916810417175293,
|
||
|
|
"epoch": 0.31800601633003867,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0001845,
|
||
|
|
"loss": 7.511,
|
||
|
|
"mean_token_accuracy": 0.08863886222243308,
|
||
|
|
"num_tokens": 672178.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.005489206314087,
|
||
|
|
"epoch": 0.32230339492909327,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.000187,
|
||
|
|
"loss": 7.5218,
|
||
|
|
"mean_token_accuracy": 0.09131815880537034,
|
||
|
|
"num_tokens": 681323.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.9803643226623535,
|
||
|
|
"epoch": 0.3266007735281478,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0001895,
|
||
|
|
"loss": 7.4406,
|
||
|
|
"mean_token_accuracy": 0.08985799476504326,
|
||
|
|
"num_tokens": 690461.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.829833698272705,
|
||
|
|
"epoch": 0.3308981521272024,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000192,
|
||
|
|
"loss": 7.5004,
|
||
|
|
"mean_token_accuracy": 0.08490158319473266,
|
||
|
|
"num_tokens": 699199.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.038139152526856,
|
||
|
|
"epoch": 0.33519553072625696,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0001945,
|
||
|
|
"loss": 7.4484,
|
||
|
|
"mean_token_accuracy": 0.09670188426971435,
|
||
|
|
"num_tokens": 707949.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.9735198497772215,
|
||
|
|
"epoch": 0.33949290932531156,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00019700000000000002,
|
||
|
|
"loss": 7.5219,
|
||
|
|
"mean_token_accuracy": 0.08999367579817771,
|
||
|
|
"num_tokens": 715752.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.93391604423523,
|
||
|
|
"epoch": 0.3437902879243661,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00019950000000000002,
|
||
|
|
"loss": 7.4479,
|
||
|
|
"mean_token_accuracy": 0.0979436494410038,
|
||
|
|
"num_tokens": 724416.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.925309085845948,
|
||
|
|
"epoch": 0.3480876665234207,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000202,
|
||
|
|
"loss": 7.4953,
|
||
|
|
"mean_token_accuracy": 0.09031900316476822,
|
||
|
|
"num_tokens": 733116.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.916099977493286,
|
||
|
|
"epoch": 0.3523850451224753,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00020449999999999998,
|
||
|
|
"loss": 7.4726,
|
||
|
|
"mean_token_accuracy": 0.09227924942970275,
|
||
|
|
"num_tokens": 742093.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.918701934814453,
|
||
|
|
"epoch": 0.35668242372152986,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000207,
|
||
|
|
"loss": 7.4649,
|
||
|
|
"mean_token_accuracy": 0.09618089124560356,
|
||
|
|
"num_tokens": 750402.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.816703271865845,
|
||
|
|
"epoch": 0.36097980232058446,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0002095,
|
||
|
|
"loss": 7.4336,
|
||
|
|
"mean_token_accuracy": 0.09461462944746017,
|
||
|
|
"num_tokens": 760961.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.944287586212158,
|
||
|
|
"epoch": 0.365277180919639,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000212,
|
||
|
|
"loss": 7.4865,
|
||
|
|
"mean_token_accuracy": 0.09455274268984795,
|
||
|
|
"num_tokens": 770554.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.750526332855225,
|
||
|
|
"epoch": 0.3695745595186936,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0002145,
|
||
|
|
"loss": 7.4618,
|
||
|
|
"mean_token_accuracy": 0.09681151732802391,
|
||
|
|
"num_tokens": 779172.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.9787256717681885,
|
||
|
|
"epoch": 0.37387193811774816,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.00021700000000000002,
|
||
|
|
"loss": 7.5123,
|
||
|
|
"mean_token_accuracy": 0.08840151131153107,
|
||
|
|
"num_tokens": 788040.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.883750295639038,
|
||
|
|
"epoch": 0.37816931671680276,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0002195,
|
||
|
|
"loss": 7.4135,
|
||
|
|
"mean_token_accuracy": 0.0939902700483799,
|
||
|
|
"num_tokens": 796786.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.851776885986328,
|
||
|
|
"epoch": 0.3824666953158573,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000222,
|
||
|
|
"loss": 7.4233,
|
||
|
|
"mean_token_accuracy": 0.0923767201602459,
|
||
|
|
"num_tokens": 805520.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.805376100540161,
|
||
|
|
"epoch": 0.3867640739149119,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0002245,
|
||
|
|
"loss": 7.3508,
|
||
|
|
"mean_token_accuracy": 0.09647825658321381,
|
||
|
|
"num_tokens": 814939.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.874559307098389,
|
||
|
|
"epoch": 0.39106145251396646,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00022700000000000002,
|
||
|
|
"loss": 7.3531,
|
||
|
|
"mean_token_accuracy": 0.09795481041073799,
|
||
|
|
"num_tokens": 823862.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.7626677513122555,
|
||
|
|
"epoch": 0.39535883111302106,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00022950000000000002,
|
||
|
|
"loss": 7.3918,
|
||
|
|
"mean_token_accuracy": 0.09068166017532349,
|
||
|
|
"num_tokens": 832820.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.928297901153565,
|
||
|
|
"epoch": 0.39965620971207566,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00023200000000000003,
|
||
|
|
"loss": 7.3494,
|
||
|
|
"mean_token_accuracy": 0.09501236006617546,
|
||
|
|
"num_tokens": 841538.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.7496504306793215,
|
||
|
|
"epoch": 0.4039535883111302,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00023449999999999998,
|
||
|
|
"loss": 7.4626,
|
||
|
|
"mean_token_accuracy": 0.09104103595018387,
|
||
|
|
"num_tokens": 851123.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.8953351974487305,
|
||
|
|
"epoch": 0.4082509669101848,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000237,
|
||
|
|
"loss": 7.4266,
|
||
|
|
"mean_token_accuracy": 0.09596899375319481,
|
||
|
|
"num_tokens": 860357.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.76341495513916,
|
||
|
|
"epoch": 0.41254834550923936,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0002395,
|
||
|
|
"loss": 7.3425,
|
||
|
|
"mean_token_accuracy": 0.09861095696687698,
|
||
|
|
"num_tokens": 869980.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.82184157371521,
|
||
|
|
"epoch": 0.41684572410829396,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000242,
|
||
|
|
"loss": 7.2999,
|
||
|
|
"mean_token_accuracy": 0.10065284445881843,
|
||
|
|
"num_tokens": 878250.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.76347074508667,
|
||
|
|
"epoch": 0.4211431027073485,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0002445,
|
||
|
|
"loss": 7.4007,
|
||
|
|
"mean_token_accuracy": 0.095355936139822,
|
||
|
|
"num_tokens": 887624.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.753844261169434,
|
||
|
|
"epoch": 0.4254404813064031,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000247,
|
||
|
|
"loss": 7.3568,
|
||
|
|
"mean_token_accuracy": 0.09853926301002502,
|
||
|
|
"num_tokens": 897120.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.802051830291748,
|
||
|
|
"epoch": 0.42973785990545765,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0002495,
|
||
|
|
"loss": 7.3179,
|
||
|
|
"mean_token_accuracy": 0.10127250477671623,
|
||
|
|
"num_tokens": 906215.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42973785990545765,
|
||
|
|
"eval_entropy": 7.412716417699246,
|
||
|
|
"eval_loss": 7.3790483474731445,
|
||
|
|
"eval_mean_token_accuracy": 0.09986981684929347,
|
||
|
|
"eval_num_tokens": 906215.0,
|
||
|
|
"eval_runtime": 2.0966,
|
||
|
|
"eval_samples_per_second": 1692.736,
|
||
|
|
"eval_steps_per_second": 211.771,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.651102495193482,
|
||
|
|
"epoch": 0.43403523850451226,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000252,
|
||
|
|
"loss": 7.3112,
|
||
|
|
"mean_token_accuracy": 0.10008608102798462,
|
||
|
|
"num_tokens": 915181.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.728409194946289,
|
||
|
|
"epoch": 0.4383326171035668,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0002545,
|
||
|
|
"loss": 7.3388,
|
||
|
|
"mean_token_accuracy": 0.09651862978935241,
|
||
|
|
"num_tokens": 924377.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.770003318786621,
|
||
|
|
"epoch": 0.4426299957026214,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.000257,
|
||
|
|
"loss": 7.4098,
|
||
|
|
"mean_token_accuracy": 0.09438847750425339,
|
||
|
|
"num_tokens": 933114.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.86782751083374,
|
||
|
|
"epoch": 0.44692737430167595,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0002595,
|
||
|
|
"loss": 7.3692,
|
||
|
|
"mean_token_accuracy": 0.09444344118237495,
|
||
|
|
"num_tokens": 943306.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.659075498580933,
|
||
|
|
"epoch": 0.45122475290073055,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.000262,
|
||
|
|
"loss": 7.2626,
|
||
|
|
"mean_token_accuracy": 0.10587219074368477,
|
||
|
|
"num_tokens": 951515.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.713227224349976,
|
||
|
|
"epoch": 0.45552213149978515,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00026450000000000003,
|
||
|
|
"loss": 7.3711,
|
||
|
|
"mean_token_accuracy": 0.09387057200074196,
|
||
|
|
"num_tokens": 962686.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.780395078659057,
|
||
|
|
"epoch": 0.4598195100988397,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00026700000000000004,
|
||
|
|
"loss": 7.3777,
|
||
|
|
"mean_token_accuracy": 0.10021266266703606,
|
||
|
|
"num_tokens": 972136.0,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.657458114624023,
|
||
|
|
"epoch": 0.4641168886978943,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00026950000000000005,
|
||
|
|
"loss": 7.2696,
|
||
|
|
"mean_token_accuracy": 0.10345774069428444,
|
||
|
|
"num_tokens": 981301.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.700049114227295,
|
||
|
|
"epoch": 0.46841426729694885,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00027200000000000005,
|
||
|
|
"loss": 7.2923,
|
||
|
|
"mean_token_accuracy": 0.10189392492175102,
|
||
|
|
"num_tokens": 990360.0,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.770557546615601,
|
||
|
|
"epoch": 0.47271164589600345,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0002745,
|
||
|
|
"loss": 7.3438,
|
||
|
|
"mean_token_accuracy": 0.09953725263476372,
|
||
|
|
"num_tokens": 999415.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.656623125076294,
|
||
|
|
"epoch": 0.477009024495058,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000277,
|
||
|
|
"loss": 7.2635,
|
||
|
|
"mean_token_accuracy": 0.10239741951227188,
|
||
|
|
"num_tokens": 1008762.0,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.690563821792603,
|
||
|
|
"epoch": 0.4813064030941126,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0002795,
|
||
|
|
"loss": 7.2652,
|
||
|
|
"mean_token_accuracy": 0.10631422251462937,
|
||
|
|
"num_tokens": 1017704.0,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.641897583007813,
|
||
|
|
"epoch": 0.48560378169316715,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00028199999999999997,
|
||
|
|
"loss": 7.2341,
|
||
|
|
"mean_token_accuracy": 0.10428761765360832,
|
||
|
|
"num_tokens": 1026251.0,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.641419315338135,
|
||
|
|
"epoch": 0.48990116029222175,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0002845,
|
||
|
|
"loss": 7.2158,
|
||
|
|
"mean_token_accuracy": 0.10731100514531136,
|
||
|
|
"num_tokens": 1036191.0,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.658735990524292,
|
||
|
|
"epoch": 0.4941985388912763,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000287,
|
||
|
|
"loss": 7.2462,
|
||
|
|
"mean_token_accuracy": 0.10594421103596688,
|
||
|
|
"num_tokens": 1044936.0,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.621677112579346,
|
||
|
|
"epoch": 0.4984959174903309,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0002895,
|
||
|
|
"loss": 7.2472,
|
||
|
|
"mean_token_accuracy": 0.10367096737027168,
|
||
|
|
"num_tokens": 1053683.0,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.570435047149658,
|
||
|
|
"epoch": 0.5027932960893855,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000292,
|
||
|
|
"loss": 7.2271,
|
||
|
|
"mean_token_accuracy": 0.1076263040304184,
|
||
|
|
"num_tokens": 1062932.0,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.723283386230468,
|
||
|
|
"epoch": 0.50709067468844,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0002945,
|
||
|
|
"loss": 7.2544,
|
||
|
|
"mean_token_accuracy": 0.10264097228646278,
|
||
|
|
"num_tokens": 1072313.0,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.62511043548584,
|
||
|
|
"epoch": 0.5113880532874946,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.000297,
|
||
|
|
"loss": 7.2228,
|
||
|
|
"mean_token_accuracy": 0.09801378548145294,
|
||
|
|
"num_tokens": 1081675.0,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.608328151702881,
|
||
|
|
"epoch": 0.5156854318865493,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0002995,
|
||
|
|
"loss": 7.2433,
|
||
|
|
"mean_token_accuracy": 0.10141062065958976,
|
||
|
|
"num_tokens": 1091541.0,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.695394897460938,
|
||
|
|
"epoch": 0.5199828104856038,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000302,
|
||
|
|
"loss": 7.2462,
|
||
|
|
"mean_token_accuracy": 0.10475782826542854,
|
||
|
|
"num_tokens": 1100724.0,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.50453405380249,
|
||
|
|
"epoch": 0.5242801890846583,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003045,
|
||
|
|
"loss": 7.1924,
|
||
|
|
"mean_token_accuracy": 0.1077597513794899,
|
||
|
|
"num_tokens": 1108869.0,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.644835519790649,
|
||
|
|
"epoch": 0.5285775676837129,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.000307,
|
||
|
|
"loss": 7.2261,
|
||
|
|
"mean_token_accuracy": 0.10431057810783387,
|
||
|
|
"num_tokens": 1117314.0,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.488267469406128,
|
||
|
|
"epoch": 0.5328749462827675,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0003095,
|
||
|
|
"loss": 7.148,
|
||
|
|
"mean_token_accuracy": 0.10711429193615914,
|
||
|
|
"num_tokens": 1126786.0,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.577956056594848,
|
||
|
|
"epoch": 0.5371723248818221,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.000312,
|
||
|
|
"loss": 7.1645,
|
||
|
|
"mean_token_accuracy": 0.10579404905438423,
|
||
|
|
"num_tokens": 1136013.0,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.527575206756592,
|
||
|
|
"epoch": 0.5414697034808766,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0003145,
|
||
|
|
"loss": 7.1969,
|
||
|
|
"mean_token_accuracy": 0.10749110653996467,
|
||
|
|
"num_tokens": 1144970.0,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.613465976715088,
|
||
|
|
"epoch": 0.5457670820799312,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.000317,
|
||
|
|
"loss": 7.1614,
|
||
|
|
"mean_token_accuracy": 0.11203600242733955,
|
||
|
|
"num_tokens": 1153810.0,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.521342611312866,
|
||
|
|
"epoch": 0.5500644606789858,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003195,
|
||
|
|
"loss": 7.1408,
|
||
|
|
"mean_token_accuracy": 0.10991051346063614,
|
||
|
|
"num_tokens": 1162498.0,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.5313867092132565,
|
||
|
|
"epoch": 0.5543618392780404,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000322,
|
||
|
|
"loss": 7.2164,
|
||
|
|
"mean_token_accuracy": 0.1044546626508236,
|
||
|
|
"num_tokens": 1172091.0,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.653256607055664,
|
||
|
|
"epoch": 0.5586592178770949,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00032450000000000003,
|
||
|
|
"loss": 7.1977,
|
||
|
|
"mean_token_accuracy": 0.10631284043192864,
|
||
|
|
"num_tokens": 1181400.0,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.537307643890381,
|
||
|
|
"epoch": 0.5629565964761496,
|
||
|
|
"grad_norm": 1.2890625,
|
||
|
|
"learning_rate": 0.00032700000000000003,
|
||
|
|
"loss": 7.1721,
|
||
|
|
"mean_token_accuracy": 0.11125476211309433,
|
||
|
|
"num_tokens": 1189780.0,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.477937269210815,
|
||
|
|
"epoch": 0.5672539750752041,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00032950000000000004,
|
||
|
|
"loss": 7.1315,
|
||
|
|
"mean_token_accuracy": 0.1057468131184578,
|
||
|
|
"num_tokens": 1198671.0,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.589753818511963,
|
||
|
|
"epoch": 0.5715513536742587,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00033200000000000005,
|
||
|
|
"loss": 7.1652,
|
||
|
|
"mean_token_accuracy": 0.1051194004714489,
|
||
|
|
"num_tokens": 1207173.0,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.461796855926513,
|
||
|
|
"epoch": 0.5758487322733132,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.00033450000000000005,
|
||
|
|
"loss": 7.0998,
|
||
|
|
"mean_token_accuracy": 0.11046240702271462,
|
||
|
|
"num_tokens": 1216387.0,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.622633552551269,
|
||
|
|
"epoch": 0.5801461108723679,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000337,
|
||
|
|
"loss": 7.0722,
|
||
|
|
"mean_token_accuracy": 0.11004948541522026,
|
||
|
|
"num_tokens": 1224461.0,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.451505851745606,
|
||
|
|
"epoch": 0.5844434894714224,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0003395,
|
||
|
|
"loss": 7.1414,
|
||
|
|
"mean_token_accuracy": 0.11011224165558815,
|
||
|
|
"num_tokens": 1233774.0,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.457524538040161,
|
||
|
|
"epoch": 0.588740868070477,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.000342,
|
||
|
|
"loss": 7.0938,
|
||
|
|
"mean_token_accuracy": 0.1142980344593525,
|
||
|
|
"num_tokens": 1242812.0,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.605640840530396,
|
||
|
|
"epoch": 0.5930382466695315,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00034449999999999997,
|
||
|
|
"loss": 7.191,
|
||
|
|
"mean_token_accuracy": 0.11035142987966537,
|
||
|
|
"num_tokens": 1252872.0,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.307473850250244,
|
||
|
|
"epoch": 0.5973356252685862,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.000347,
|
||
|
|
"loss": 6.983,
|
||
|
|
"mean_token_accuracy": 0.11081922426819801,
|
||
|
|
"num_tokens": 1260852.0,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.438599157333374,
|
||
|
|
"epoch": 0.6016330038676407,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0003495,
|
||
|
|
"loss": 7.0984,
|
||
|
|
"mean_token_accuracy": 0.10763570070266723,
|
||
|
|
"num_tokens": 1268925.0,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.530004072189331,
|
||
|
|
"epoch": 0.6059303824666953,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000352,
|
||
|
|
"loss": 7.145,
|
||
|
|
"mean_token_accuracy": 0.10653513446450233,
|
||
|
|
"num_tokens": 1278994.0,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.4260091304779055,
|
||
|
|
"epoch": 0.6102277610657499,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003545,
|
||
|
|
"loss": 7.1323,
|
||
|
|
"mean_token_accuracy": 0.10368426591157913,
|
||
|
|
"num_tokens": 1287698.0,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.482218551635742,
|
||
|
|
"epoch": 0.6145251396648045,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000357,
|
||
|
|
"loss": 7.0787,
|
||
|
|
"mean_token_accuracy": 0.11120296269655228,
|
||
|
|
"num_tokens": 1297475.0,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.480340671539307,
|
||
|
|
"epoch": 0.618822518263859,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0003595,
|
||
|
|
"loss": 7.1091,
|
||
|
|
"mean_token_accuracy": 0.11085583940148354,
|
||
|
|
"num_tokens": 1306836.0,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.506947946548462,
|
||
|
|
"epoch": 0.6231198968629136,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000362,
|
||
|
|
"loss": 7.1377,
|
||
|
|
"mean_token_accuracy": 0.10435779988765717,
|
||
|
|
"num_tokens": 1315872.0,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.4788847923278805,
|
||
|
|
"epoch": 0.6274172754619682,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0003645,
|
||
|
|
"loss": 7.0782,
|
||
|
|
"mean_token_accuracy": 0.11685637310147286,
|
||
|
|
"num_tokens": 1324624.0,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.444537830352783,
|
||
|
|
"epoch": 0.6317146540610228,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000367,
|
||
|
|
"loss": 7.061,
|
||
|
|
"mean_token_accuracy": 0.11548577472567559,
|
||
|
|
"num_tokens": 1333058.0,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.262284660339356,
|
||
|
|
"epoch": 0.6360120326600773,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0003695,
|
||
|
|
"loss": 7.0248,
|
||
|
|
"mean_token_accuracy": 0.11004846841096878,
|
||
|
|
"num_tokens": 1342376.0,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.526681852340698,
|
||
|
|
"epoch": 0.6403094112591319,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000372,
|
||
|
|
"loss": 7.0693,
|
||
|
|
"mean_token_accuracy": 0.10503109246492386,
|
||
|
|
"num_tokens": 1351386.0,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.364239978790283,
|
||
|
|
"epoch": 0.6446067898581865,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.0003745,
|
||
|
|
"loss": 6.9832,
|
||
|
|
"mean_token_accuracy": 0.11761592403054237,
|
||
|
|
"num_tokens": 1358958.0,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.496349859237671,
|
||
|
|
"epoch": 0.6489041684572411,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000377,
|
||
|
|
"loss": 7.1231,
|
||
|
|
"mean_token_accuracy": 0.10967899858951569,
|
||
|
|
"num_tokens": 1368599.0,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.435608530044556,
|
||
|
|
"epoch": 0.6532015470562956,
|
||
|
|
"grad_norm": 1.890625,
|
||
|
|
"learning_rate": 0.0003795,
|
||
|
|
"loss": 7.1433,
|
||
|
|
"mean_token_accuracy": 0.1064300425350666,
|
||
|
|
"num_tokens": 1378529.0,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.344243001937866,
|
||
|
|
"epoch": 0.6574989256553503,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.000382,
|
||
|
|
"loss": 6.9306,
|
||
|
|
"mean_token_accuracy": 0.11750481277704239,
|
||
|
|
"num_tokens": 1386993.0,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.390715217590332,
|
||
|
|
"epoch": 0.6617963042544048,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.0003845,
|
||
|
|
"loss": 7.0322,
|
||
|
|
"mean_token_accuracy": 0.11829963177442551,
|
||
|
|
"num_tokens": 1395790.0,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.302670812606811,
|
||
|
|
"epoch": 0.6660936828534594,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00038700000000000003,
|
||
|
|
"loss": 7.0393,
|
||
|
|
"mean_token_accuracy": 0.11235549300909042,
|
||
|
|
"num_tokens": 1405587.0,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.348860168457032,
|
||
|
|
"epoch": 0.6703910614525139,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00038950000000000003,
|
||
|
|
"loss": 6.9999,
|
||
|
|
"mean_token_accuracy": 0.11504087448120118,
|
||
|
|
"num_tokens": 1414478.0,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.428205347061157,
|
||
|
|
"epoch": 0.6746884400515686,
|
||
|
|
"grad_norm": 1.375,
|
||
|
|
"learning_rate": 0.00039200000000000004,
|
||
|
|
"loss": 7.0623,
|
||
|
|
"mean_token_accuracy": 0.11534775421023369,
|
||
|
|
"num_tokens": 1423791.0,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.467832851409912,
|
||
|
|
"epoch": 0.6789858186506231,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.00039450000000000005,
|
||
|
|
"loss": 7.1014,
|
||
|
|
"mean_token_accuracy": 0.10728210881352425,
|
||
|
|
"num_tokens": 1432955.0,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.385548782348633,
|
||
|
|
"epoch": 0.6832831972496777,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00039700000000000005,
|
||
|
|
"loss": 7.074,
|
||
|
|
"mean_token_accuracy": 0.1087567687034607,
|
||
|
|
"num_tokens": 1441907.0,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.290066146850586,
|
||
|
|
"epoch": 0.6875805758487322,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0003995,
|
||
|
|
"loss": 6.935,
|
||
|
|
"mean_token_accuracy": 0.11768098697066307,
|
||
|
|
"num_tokens": 1451062.0,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.399672508239746,
|
||
|
|
"epoch": 0.6918779544477869,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000402,
|
||
|
|
"loss": 7.0218,
|
||
|
|
"mean_token_accuracy": 0.10959179401397705,
|
||
|
|
"num_tokens": 1460132.0,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.272280263900757,
|
||
|
|
"epoch": 0.6961753330468414,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004045,
|
||
|
|
"loss": 6.9141,
|
||
|
|
"mean_token_accuracy": 0.11885375007987023,
|
||
|
|
"num_tokens": 1469582.0,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.255832242965698,
|
||
|
|
"epoch": 0.700472711645896,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.00040699999999999997,
|
||
|
|
"loss": 7.012,
|
||
|
|
"mean_token_accuracy": 0.10950389429926873,
|
||
|
|
"num_tokens": 1479053.0,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.313858604431152,
|
||
|
|
"epoch": 0.7047700902449506,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0004095,
|
||
|
|
"loss": 7.0142,
|
||
|
|
"mean_token_accuracy": 0.11343196108937263,
|
||
|
|
"num_tokens": 1488189.0,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.236453676223755,
|
||
|
|
"epoch": 0.7090674688440052,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000412,
|
||
|
|
"loss": 6.8662,
|
||
|
|
"mean_token_accuracy": 0.12046442031860352,
|
||
|
|
"num_tokens": 1497324.0,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.310264635086059,
|
||
|
|
"epoch": 0.7133648474430597,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004145,
|
||
|
|
"loss": 6.9814,
|
||
|
|
"mean_token_accuracy": 0.11739002540707588,
|
||
|
|
"num_tokens": 1506543.0,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.289929437637329,
|
||
|
|
"epoch": 0.7176622260421143,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000417,
|
||
|
|
"loss": 6.9742,
|
||
|
|
"mean_token_accuracy": 0.12236066460609436,
|
||
|
|
"num_tokens": 1516737.0,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.161224508285523,
|
||
|
|
"epoch": 0.7219596046411689,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004195,
|
||
|
|
"loss": 6.8503,
|
||
|
|
"mean_token_accuracy": 0.11500222384929656,
|
||
|
|
"num_tokens": 1525561.0,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.280500030517578,
|
||
|
|
"epoch": 0.7262569832402235,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000422,
|
||
|
|
"loss": 6.8765,
|
||
|
|
"mean_token_accuracy": 0.1242159940302372,
|
||
|
|
"num_tokens": 1533323.0,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.292038059234619,
|
||
|
|
"epoch": 0.730554361839278,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004245,
|
||
|
|
"loss": 6.9379,
|
||
|
|
"mean_token_accuracy": 0.12142991349101066,
|
||
|
|
"num_tokens": 1542632.0,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.305912923812866,
|
||
|
|
"epoch": 0.7348517404383326,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.000427,
|
||
|
|
"loss": 6.8775,
|
||
|
|
"mean_token_accuracy": 0.12107516825199127,
|
||
|
|
"num_tokens": 1551236.0,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.118098545074463,
|
||
|
|
"epoch": 0.7391491190373872,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004295,
|
||
|
|
"loss": 6.878,
|
||
|
|
"mean_token_accuracy": 0.12266490310430526,
|
||
|
|
"num_tokens": 1559674.0,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.268103885650635,
|
||
|
|
"epoch": 0.7434464976364418,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000432,
|
||
|
|
"loss": 6.9687,
|
||
|
|
"mean_token_accuracy": 0.1217973381280899,
|
||
|
|
"num_tokens": 1569481.0,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2675707817077635,
|
||
|
|
"epoch": 0.7477438762354963,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004345,
|
||
|
|
"loss": 6.9975,
|
||
|
|
"mean_token_accuracy": 0.11359266638755798,
|
||
|
|
"num_tokens": 1578488.0,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.171451759338379,
|
||
|
|
"epoch": 0.752041254834551,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000437,
|
||
|
|
"loss": 6.8946,
|
||
|
|
"mean_token_accuracy": 0.11810402423143387,
|
||
|
|
"num_tokens": 1586675.0,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.285072469711304,
|
||
|
|
"epoch": 0.7563386334336055,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004395,
|
||
|
|
"loss": 7.0021,
|
||
|
|
"mean_token_accuracy": 0.10800698548555374,
|
||
|
|
"num_tokens": 1595411.0,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.312672233581543,
|
||
|
|
"epoch": 0.7606360120326601,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000442,
|
||
|
|
"loss": 6.9755,
|
||
|
|
"mean_token_accuracy": 0.11759781166911125,
|
||
|
|
"num_tokens": 1604046.0,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.245748281478882,
|
||
|
|
"epoch": 0.7649333906317146,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004445,
|
||
|
|
"loss": 6.9643,
|
||
|
|
"mean_token_accuracy": 0.11201045587658882,
|
||
|
|
"num_tokens": 1613759.0,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.238279533386231,
|
||
|
|
"epoch": 0.7692307692307693,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000447,
|
||
|
|
"loss": 6.9209,
|
||
|
|
"mean_token_accuracy": 0.11877147182822227,
|
||
|
|
"num_tokens": 1623323.0,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.230697107315064,
|
||
|
|
"epoch": 0.7735281478298238,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00044950000000000003,
|
||
|
|
"loss": 6.9005,
|
||
|
|
"mean_token_accuracy": 0.11391794160008431,
|
||
|
|
"num_tokens": 1631727.0,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.194222545623779,
|
||
|
|
"epoch": 0.7778255264288784,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00045200000000000004,
|
||
|
|
"loss": 6.8583,
|
||
|
|
"mean_token_accuracy": 0.12049278989434242,
|
||
|
|
"num_tokens": 1639544.0,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.284112405776978,
|
||
|
|
"epoch": 0.7821229050279329,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00045450000000000004,
|
||
|
|
"loss": 6.9773,
|
||
|
|
"mean_token_accuracy": 0.11113567724823951,
|
||
|
|
"num_tokens": 1648931.0,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1627342224121096,
|
||
|
|
"epoch": 0.7864202836269876,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00045700000000000005,
|
||
|
|
"loss": 6.8345,
|
||
|
|
"mean_token_accuracy": 0.12127922549843788,
|
||
|
|
"num_tokens": 1657688.0,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.259271335601807,
|
||
|
|
"epoch": 0.7907176622260421,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00045950000000000006,
|
||
|
|
"loss": 6.9244,
|
||
|
|
"mean_token_accuracy": 0.11565326899290085,
|
||
|
|
"num_tokens": 1666879.0,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1275458335876465,
|
||
|
|
"epoch": 0.7950150408250967,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000462,
|
||
|
|
"loss": 6.8982,
|
||
|
|
"mean_token_accuracy": 0.118662890791893,
|
||
|
|
"num_tokens": 1676773.0,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2360998630523685,
|
||
|
|
"epoch": 0.7993124194241513,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004645,
|
||
|
|
"loss": 7.0092,
|
||
|
|
"mean_token_accuracy": 0.11184348464012146,
|
||
|
|
"num_tokens": 1686144.0,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.26247010231018,
|
||
|
|
"epoch": 0.8036097980232059,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000467,
|
||
|
|
"loss": 6.9646,
|
||
|
|
"mean_token_accuracy": 0.10949353277683258,
|
||
|
|
"num_tokens": 1695476.0,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.174946022033692,
|
||
|
|
"epoch": 0.8079071766222604,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004695,
|
||
|
|
"loss": 6.8498,
|
||
|
|
"mean_token_accuracy": 0.12084392830729485,
|
||
|
|
"num_tokens": 1704907.0,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.166734504699707,
|
||
|
|
"epoch": 0.812204555221315,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.000472,
|
||
|
|
"loss": 6.8948,
|
||
|
|
"mean_token_accuracy": 0.12091493904590607,
|
||
|
|
"num_tokens": 1714564.0,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.244975614547729,
|
||
|
|
"epoch": 0.8165019338203696,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004745,
|
||
|
|
"loss": 6.9209,
|
||
|
|
"mean_token_accuracy": 0.1155279442667961,
|
||
|
|
"num_tokens": 1725285.0,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1149109363555905,
|
||
|
|
"epoch": 0.8207993124194242,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000477,
|
||
|
|
"loss": 6.9153,
|
||
|
|
"mean_token_accuracy": 0.11715079098939896,
|
||
|
|
"num_tokens": 1734331.0,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.227117824554443,
|
||
|
|
"epoch": 0.8250966910184787,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0004795,
|
||
|
|
"loss": 6.852,
|
||
|
|
"mean_token_accuracy": 0.11185217499732972,
|
||
|
|
"num_tokens": 1742340.0,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.160442066192627,
|
||
|
|
"epoch": 0.8293940696175333,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000482,
|
||
|
|
"loss": 6.8351,
|
||
|
|
"mean_token_accuracy": 0.12198592498898506,
|
||
|
|
"num_tokens": 1751725.0,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.999344539642334,
|
||
|
|
"epoch": 0.8336914482165879,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004845,
|
||
|
|
"loss": 6.7683,
|
||
|
|
"mean_token_accuracy": 0.12398558706045151,
|
||
|
|
"num_tokens": 1760294.0,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.112461137771606,
|
||
|
|
"epoch": 0.8379888268156425,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000487,
|
||
|
|
"loss": 6.8275,
|
||
|
|
"mean_token_accuracy": 0.11639805063605309,
|
||
|
|
"num_tokens": 1768912.0,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.257990169525146,
|
||
|
|
"epoch": 0.842286205414697,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004895,
|
||
|
|
"loss": 7.0148,
|
||
|
|
"mean_token_accuracy": 0.12016609534621239,
|
||
|
|
"num_tokens": 1778633.0,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1191816329956055,
|
||
|
|
"epoch": 0.8465835840137517,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.000492,
|
||
|
|
"loss": 6.8847,
|
||
|
|
"mean_token_accuracy": 0.11811531409621238,
|
||
|
|
"num_tokens": 1787275.0,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.235857200622559,
|
||
|
|
"epoch": 0.8508809626128062,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0004945,
|
||
|
|
"loss": 6.8878,
|
||
|
|
"mean_token_accuracy": 0.11604067236185074,
|
||
|
|
"num_tokens": 1795994.0,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.036646842956543,
|
||
|
|
"epoch": 0.8551783412118608,
|
||
|
|
"grad_norm": 0.8359375,
|
||
|
|
"learning_rate": 0.000497,
|
||
|
|
"loss": 6.804,
|
||
|
|
"mean_token_accuracy": 0.11985133662819862,
|
||
|
|
"num_tokens": 1806379.0,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.154667520523072,
|
||
|
|
"epoch": 0.8594757198109153,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004995,
|
||
|
|
"loss": 6.8296,
|
||
|
|
"mean_token_accuracy": 0.1270947828888893,
|
||
|
|
"num_tokens": 1816135.0,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8594757198109153,
|
||
|
|
"eval_entropy": 6.812919497489929,
|
||
|
|
"eval_loss": 6.8574419021606445,
|
||
|
|
"eval_mean_token_accuracy": 0.12292942362795542,
|
||
|
|
"eval_num_tokens": 1816135.0,
|
||
|
|
"eval_runtime": 2.0522,
|
||
|
|
"eval_samples_per_second": 1729.37,
|
||
|
|
"eval_steps_per_second": 216.354,
|
||
|
|
"step": 1000
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 11630,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 408225012940800.0,
|
||
|
|
"train_batch_size": 16,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|