11156 lines
304 KiB
JSON
11156 lines
304 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 4.725397507520412,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 5500,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 10.742608070373535,
|
||
|
|
"epoch": 0.004297378599054577,
|
||
|
|
"grad_norm": 5.46875,
|
||
|
|
"learning_rate": 2e-06,
|
||
|
|
"loss": 10.7643,
|
||
|
|
"mean_token_accuracy": 7.587253348901868e-05,
|
||
|
|
"num_tokens": 10107.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742630290985108,
|
||
|
|
"epoch": 0.008594757198109154,
|
||
|
|
"grad_norm": 5.78125,
|
||
|
|
"learning_rate": 4.5e-06,
|
||
|
|
"loss": 10.7086,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 18391.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.74263505935669,
|
||
|
|
"epoch": 0.01289213579716373,
|
||
|
|
"grad_norm": 5.3125,
|
||
|
|
"learning_rate": 7e-06,
|
||
|
|
"loss": 10.6888,
|
||
|
|
"mean_token_accuracy": 7.022471982054412e-05,
|
||
|
|
"num_tokens": 27061.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742604160308838,
|
||
|
|
"epoch": 0.017189514396218308,
|
||
|
|
"grad_norm": 6.0,
|
||
|
|
"learning_rate": 9.5e-06,
|
||
|
|
"loss": 10.6611,
|
||
|
|
"mean_token_accuracy": 0.0008422504703048617,
|
||
|
|
"num_tokens": 36339.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742517948150635,
|
||
|
|
"epoch": 0.021486892995272882,
|
||
|
|
"grad_norm": 4.75,
|
||
|
|
"learning_rate": 1.2e-05,
|
||
|
|
"loss": 10.5317,
|
||
|
|
"mean_token_accuracy": 0.02025789166800678,
|
||
|
|
"num_tokens": 45770.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.741962242126466,
|
||
|
|
"epoch": 0.02578427159432746,
|
||
|
|
"grad_norm": 4.25,
|
||
|
|
"learning_rate": 1.4500000000000002e-05,
|
||
|
|
"loss": 10.399,
|
||
|
|
"mean_token_accuracy": 0.04876907132565975,
|
||
|
|
"num_tokens": 54575.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.73945140838623,
|
||
|
|
"epoch": 0.030081650193382038,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 1.7000000000000003e-05,
|
||
|
|
"loss": 10.3065,
|
||
|
|
"mean_token_accuracy": 0.0514072135090828,
|
||
|
|
"num_tokens": 66403.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.730937385559082,
|
||
|
|
"epoch": 0.034379028792436615,
|
||
|
|
"grad_norm": 2.640625,
|
||
|
|
"learning_rate": 1.95e-05,
|
||
|
|
"loss": 10.0976,
|
||
|
|
"mean_token_accuracy": 0.05973539762198925,
|
||
|
|
"num_tokens": 76510.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.715238952636719,
|
||
|
|
"epoch": 0.03867640739149119,
|
||
|
|
"grad_norm": 2.40625,
|
||
|
|
"learning_rate": 2.2e-05,
|
||
|
|
"loss": 9.9688,
|
||
|
|
"mean_token_accuracy": 0.05614017099142075,
|
||
|
|
"num_tokens": 84836.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.702037715911866,
|
||
|
|
"epoch": 0.042973785990545764,
|
||
|
|
"grad_norm": 2.046875,
|
||
|
|
"learning_rate": 2.4500000000000003e-05,
|
||
|
|
"loss": 9.9015,
|
||
|
|
"mean_token_accuracy": 0.053829558193683624,
|
||
|
|
"num_tokens": 93197.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.697910690307618,
|
||
|
|
"epoch": 0.047271164589600345,
|
||
|
|
"grad_norm": 2.40625,
|
||
|
|
"learning_rate": 2.7e-05,
|
||
|
|
"loss": 9.8366,
|
||
|
|
"mean_token_accuracy": 0.05843428298830986,
|
||
|
|
"num_tokens": 101546.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.693470478057861,
|
||
|
|
"epoch": 0.05156854318865492,
|
||
|
|
"grad_norm": 1.9609375,
|
||
|
|
"learning_rate": 2.95e-05,
|
||
|
|
"loss": 9.8429,
|
||
|
|
"mean_token_accuracy": 0.0558084711432457,
|
||
|
|
"num_tokens": 111703.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.680869865417481,
|
||
|
|
"epoch": 0.055865921787709494,
|
||
|
|
"grad_norm": 1.9453125,
|
||
|
|
"learning_rate": 3.2e-05,
|
||
|
|
"loss": 9.7131,
|
||
|
|
"mean_token_accuracy": 0.0589165486395359,
|
||
|
|
"num_tokens": 119894.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.668927574157715,
|
||
|
|
"epoch": 0.060163300386764075,
|
||
|
|
"grad_norm": 1.9765625,
|
||
|
|
"learning_rate": 3.4500000000000005e-05,
|
||
|
|
"loss": 9.6682,
|
||
|
|
"mean_token_accuracy": 0.06148771904408932,
|
||
|
|
"num_tokens": 128885.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.654484272003174,
|
||
|
|
"epoch": 0.06446067898581866,
|
||
|
|
"grad_norm": 1.953125,
|
||
|
|
"learning_rate": 3.7e-05,
|
||
|
|
"loss": 9.6297,
|
||
|
|
"mean_token_accuracy": 0.057728851959109304,
|
||
|
|
"num_tokens": 138106.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.645826625823975,
|
||
|
|
"epoch": 0.06875805758487323,
|
||
|
|
"grad_norm": 1.9296875,
|
||
|
|
"learning_rate": 3.95e-05,
|
||
|
|
"loss": 9.5722,
|
||
|
|
"mean_token_accuracy": 0.058954347297549246,
|
||
|
|
"num_tokens": 146691.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.637816619873046,
|
||
|
|
"epoch": 0.0730554361839278,
|
||
|
|
"grad_norm": 1.90625,
|
||
|
|
"learning_rate": 4.2000000000000004e-05,
|
||
|
|
"loss": 9.5126,
|
||
|
|
"mean_token_accuracy": 0.059067190065979956,
|
||
|
|
"num_tokens": 155792.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.63103084564209,
|
||
|
|
"epoch": 0.07735281478298238,
|
||
|
|
"grad_norm": 1.7890625,
|
||
|
|
"learning_rate": 4.45e-05,
|
||
|
|
"loss": 9.5251,
|
||
|
|
"mean_token_accuracy": 0.0552229531109333,
|
||
|
|
"num_tokens": 166944.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.616693305969239,
|
||
|
|
"epoch": 0.08165019338203695,
|
||
|
|
"grad_norm": 1.96875,
|
||
|
|
"learning_rate": 4.7000000000000004e-05,
|
||
|
|
"loss": 9.3423,
|
||
|
|
"mean_token_accuracy": 0.060124922543764114,
|
||
|
|
"num_tokens": 175303.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.591300106048584,
|
||
|
|
"epoch": 0.08594757198109153,
|
||
|
|
"grad_norm": 1.8203125,
|
||
|
|
"learning_rate": 4.9500000000000004e-05,
|
||
|
|
"loss": 9.3133,
|
||
|
|
"mean_token_accuracy": 0.06174388714134693,
|
||
|
|
"num_tokens": 184708.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.564336776733398,
|
||
|
|
"epoch": 0.09024495058014612,
|
||
|
|
"grad_norm": 1.7890625,
|
||
|
|
"learning_rate": 5.2e-05,
|
||
|
|
"loss": 9.2307,
|
||
|
|
"mean_token_accuracy": 0.0674959484487772,
|
||
|
|
"num_tokens": 193835.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.52622423171997,
|
||
|
|
"epoch": 0.09454232917920069,
|
||
|
|
"grad_norm": 1.8828125,
|
||
|
|
"learning_rate": 5.45e-05,
|
||
|
|
"loss": 9.1379,
|
||
|
|
"mean_token_accuracy": 0.07480009235441684,
|
||
|
|
"num_tokens": 203344.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.454349136352539,
|
||
|
|
"epoch": 0.09883970777825526,
|
||
|
|
"grad_norm": 1.6171875,
|
||
|
|
"learning_rate": 5.7e-05,
|
||
|
|
"loss": 9.1209,
|
||
|
|
"mean_token_accuracy": 0.06218625903129578,
|
||
|
|
"num_tokens": 213048.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.415324211120605,
|
||
|
|
"epoch": 0.10313708637730984,
|
||
|
|
"grad_norm": 1.578125,
|
||
|
|
"learning_rate": 5.9499999999999996e-05,
|
||
|
|
"loss": 8.9306,
|
||
|
|
"mean_token_accuracy": 0.07533645890653133,
|
||
|
|
"num_tokens": 221784.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.303644943237305,
|
||
|
|
"epoch": 0.10743446497636441,
|
||
|
|
"grad_norm": 1.4765625,
|
||
|
|
"learning_rate": 6.2e-05,
|
||
|
|
"loss": 8.8509,
|
||
|
|
"mean_token_accuracy": 0.07504003196954727,
|
||
|
|
"num_tokens": 230971.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.209668159484863,
|
||
|
|
"epoch": 0.11173184357541899,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 6.450000000000001e-05,
|
||
|
|
"loss": 8.7412,
|
||
|
|
"mean_token_accuracy": 0.07478504739701748,
|
||
|
|
"num_tokens": 240524.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.153745365142822,
|
||
|
|
"epoch": 0.11602922217447358,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 6.7e-05,
|
||
|
|
"loss": 8.6323,
|
||
|
|
"mean_token_accuracy": 0.07354197278618813,
|
||
|
|
"num_tokens": 249220.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.068094253540039,
|
||
|
|
"epoch": 0.12032660077352815,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 6.950000000000001e-05,
|
||
|
|
"loss": 8.61,
|
||
|
|
"mean_token_accuracy": 0.07049238979816437,
|
||
|
|
"num_tokens": 258934.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.973960685729981,
|
||
|
|
"epoch": 0.12462397937258272,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 7.2e-05,
|
||
|
|
"loss": 8.4673,
|
||
|
|
"mean_token_accuracy": 0.07534252405166626,
|
||
|
|
"num_tokens": 267680.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.815561103820801,
|
||
|
|
"epoch": 0.1289213579716373,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 7.45e-05,
|
||
|
|
"loss": 8.3709,
|
||
|
|
"mean_token_accuracy": 0.07952065020799637,
|
||
|
|
"num_tokens": 276227.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.66996259689331,
|
||
|
|
"epoch": 0.1332187365706919,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 7.7e-05,
|
||
|
|
"loss": 8.2269,
|
||
|
|
"mean_token_accuracy": 0.08225171342492103,
|
||
|
|
"num_tokens": 286342.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.510671615600586,
|
||
|
|
"epoch": 0.13751611516974646,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 7.950000000000001e-05,
|
||
|
|
"loss": 8.1921,
|
||
|
|
"mean_token_accuracy": 0.0742720566689968,
|
||
|
|
"num_tokens": 294994.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.346861934661865,
|
||
|
|
"epoch": 0.14181349376880104,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 8.2e-05,
|
||
|
|
"loss": 8.113,
|
||
|
|
"mean_token_accuracy": 0.08004417940974236,
|
||
|
|
"num_tokens": 303882.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.199288940429687,
|
||
|
|
"epoch": 0.1461108723678556,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 8.450000000000001e-05,
|
||
|
|
"loss": 8.0403,
|
||
|
|
"mean_token_accuracy": 0.07799897268414498,
|
||
|
|
"num_tokens": 312515.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.978620052337646,
|
||
|
|
"epoch": 0.15040825096691018,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 8.7e-05,
|
||
|
|
"loss": 7.9977,
|
||
|
|
"mean_token_accuracy": 0.07381256259977817,
|
||
|
|
"num_tokens": 320801.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.861582374572754,
|
||
|
|
"epoch": 0.15470562956596476,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 8.95e-05,
|
||
|
|
"loss": 7.9642,
|
||
|
|
"mean_token_accuracy": 0.08192512467503547,
|
||
|
|
"num_tokens": 329382.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.755144786834716,
|
||
|
|
"epoch": 0.15900300816501933,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 9.2e-05,
|
||
|
|
"loss": 7.9273,
|
||
|
|
"mean_token_accuracy": 0.07583913430571557,
|
||
|
|
"num_tokens": 337894.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.582227611541748,
|
||
|
|
"epoch": 0.1633003867640739,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 9.45e-05,
|
||
|
|
"loss": 7.9012,
|
||
|
|
"mean_token_accuracy": 0.07614588961005211,
|
||
|
|
"num_tokens": 346380.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.591823768615722,
|
||
|
|
"epoch": 0.16759776536312848,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 9.7e-05,
|
||
|
|
"loss": 7.9407,
|
||
|
|
"mean_token_accuracy": 0.07390806600451469,
|
||
|
|
"num_tokens": 356305.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.515201950073243,
|
||
|
|
"epoch": 0.17189514396218306,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 9.95e-05,
|
||
|
|
"loss": 7.8901,
|
||
|
|
"mean_token_accuracy": 0.07247771993279457,
|
||
|
|
"num_tokens": 364899.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.457213211059571,
|
||
|
|
"epoch": 0.17619252256123766,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000102,
|
||
|
|
"loss": 7.8566,
|
||
|
|
"mean_token_accuracy": 0.0781160645186901,
|
||
|
|
"num_tokens": 373663.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.381179523468017,
|
||
|
|
"epoch": 0.18048990116029223,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.00010449999999999999,
|
||
|
|
"loss": 7.8221,
|
||
|
|
"mean_token_accuracy": 0.07758632972836495,
|
||
|
|
"num_tokens": 382730.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.390653896331788,
|
||
|
|
"epoch": 0.1847872797593468,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.000107,
|
||
|
|
"loss": 7.8622,
|
||
|
|
"mean_token_accuracy": 0.071787304058671,
|
||
|
|
"num_tokens": 392676.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.255177211761474,
|
||
|
|
"epoch": 0.18908465835840138,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0001095,
|
||
|
|
"loss": 7.8473,
|
||
|
|
"mean_token_accuracy": 0.08185218423604965,
|
||
|
|
"num_tokens": 401050.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.367721462249756,
|
||
|
|
"epoch": 0.19338203695745596,
|
||
|
|
"grad_norm": 0.796875,
|
||
|
|
"learning_rate": 0.000112,
|
||
|
|
"loss": 7.795,
|
||
|
|
"mean_token_accuracy": 0.07991239950060844,
|
||
|
|
"num_tokens": 410009.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.268333339691162,
|
||
|
|
"epoch": 0.19767941555651053,
|
||
|
|
"grad_norm": 0.859375,
|
||
|
|
"learning_rate": 0.0001145,
|
||
|
|
"loss": 7.7757,
|
||
|
|
"mean_token_accuracy": 0.08171008005738259,
|
||
|
|
"num_tokens": 419302.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.304029846191407,
|
||
|
|
"epoch": 0.2019767941555651,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.00011700000000000001,
|
||
|
|
"loss": 7.6812,
|
||
|
|
"mean_token_accuracy": 0.08820762410759926,
|
||
|
|
"num_tokens": 427296.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.16576337814331,
|
||
|
|
"epoch": 0.20627417275461968,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00011949999999999999,
|
||
|
|
"loss": 7.8198,
|
||
|
|
"mean_token_accuracy": 0.07870872803032399,
|
||
|
|
"num_tokens": 436368.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.189785575866699,
|
||
|
|
"epoch": 0.21057155135367425,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.000122,
|
||
|
|
"loss": 7.7389,
|
||
|
|
"mean_token_accuracy": 0.08551637679338456,
|
||
|
|
"num_tokens": 445535.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.265625381469727,
|
||
|
|
"epoch": 0.21486892995272883,
|
||
|
|
"grad_norm": 0.8671875,
|
||
|
|
"learning_rate": 0.0001245,
|
||
|
|
"loss": 7.7093,
|
||
|
|
"mean_token_accuracy": 0.07919453792273998,
|
||
|
|
"num_tokens": 454769.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.1545090675354,
|
||
|
|
"epoch": 0.2191663085517834,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000127,
|
||
|
|
"loss": 7.7315,
|
||
|
|
"mean_token_accuracy": 0.0871740497648716,
|
||
|
|
"num_tokens": 463975.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.13952112197876,
|
||
|
|
"epoch": 0.22346368715083798,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0001295,
|
||
|
|
"loss": 7.726,
|
||
|
|
"mean_token_accuracy": 0.08799278363585472,
|
||
|
|
"num_tokens": 472899.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.196070003509522,
|
||
|
|
"epoch": 0.22776106574989258,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000132,
|
||
|
|
"loss": 7.7354,
|
||
|
|
"mean_token_accuracy": 0.08013860881328583,
|
||
|
|
"num_tokens": 481556.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.114658737182618,
|
||
|
|
"epoch": 0.23205844434894715,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.00013450000000000002,
|
||
|
|
"loss": 7.7023,
|
||
|
|
"mean_token_accuracy": 0.0854449674487114,
|
||
|
|
"num_tokens": 490253.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.193334579467773,
|
||
|
|
"epoch": 0.23635582294800173,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00013700000000000002,
|
||
|
|
"loss": 7.7066,
|
||
|
|
"mean_token_accuracy": 0.0806311085820198,
|
||
|
|
"num_tokens": 498444.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.104936504364014,
|
||
|
|
"epoch": 0.2406532015470563,
|
||
|
|
"grad_norm": 0.8046875,
|
||
|
|
"learning_rate": 0.0001395,
|
||
|
|
"loss": 7.6467,
|
||
|
|
"mean_token_accuracy": 0.08675235286355018,
|
||
|
|
"num_tokens": 508330.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.113396596908569,
|
||
|
|
"epoch": 0.24495058014611087,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00014199999999999998,
|
||
|
|
"loss": 7.7405,
|
||
|
|
"mean_token_accuracy": 0.08165572881698609,
|
||
|
|
"num_tokens": 517900.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.046846723556518,
|
||
|
|
"epoch": 0.24924795874516545,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0001445,
|
||
|
|
"loss": 7.6901,
|
||
|
|
"mean_token_accuracy": 0.08230286985635757,
|
||
|
|
"num_tokens": 527808.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.13338761329651,
|
||
|
|
"epoch": 0.25354533734422,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.000147,
|
||
|
|
"loss": 7.6711,
|
||
|
|
"mean_token_accuracy": 0.08156475871801376,
|
||
|
|
"num_tokens": 536931.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.18837013244629,
|
||
|
|
"epoch": 0.2578427159432746,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0001495,
|
||
|
|
"loss": 7.7049,
|
||
|
|
"mean_token_accuracy": 0.0835341140627861,
|
||
|
|
"num_tokens": 545758.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.025089168548584,
|
||
|
|
"epoch": 0.26214009454232917,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.000152,
|
||
|
|
"loss": 7.7131,
|
||
|
|
"mean_token_accuracy": 0.08242038711905479,
|
||
|
|
"num_tokens": 555165.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.155539417266846,
|
||
|
|
"epoch": 0.2664374731413838,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 0.00015450000000000001,
|
||
|
|
"loss": 7.6144,
|
||
|
|
"mean_token_accuracy": 0.08789716809988021,
|
||
|
|
"num_tokens": 564719.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.041153383255004,
|
||
|
|
"epoch": 0.2707348517404383,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000157,
|
||
|
|
"loss": 7.594,
|
||
|
|
"mean_token_accuracy": 0.09155945181846618,
|
||
|
|
"num_tokens": 573572.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.15259666442871,
|
||
|
|
"epoch": 0.2750322303394929,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0001595,
|
||
|
|
"loss": 7.7634,
|
||
|
|
"mean_token_accuracy": 0.08318910300731659,
|
||
|
|
"num_tokens": 581497.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.100253248214722,
|
||
|
|
"epoch": 0.27932960893854747,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000162,
|
||
|
|
"loss": 7.6118,
|
||
|
|
"mean_token_accuracy": 0.08767011985182763,
|
||
|
|
"num_tokens": 591107.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.984478855133057,
|
||
|
|
"epoch": 0.28362698753760207,
|
||
|
|
"grad_norm": 0.84765625,
|
||
|
|
"learning_rate": 0.00016450000000000001,
|
||
|
|
"loss": 7.6456,
|
||
|
|
"mean_token_accuracy": 0.08353794142603874,
|
||
|
|
"num_tokens": 600241.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.057686376571656,
|
||
|
|
"epoch": 0.2879243661366566,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00016700000000000002,
|
||
|
|
"loss": 7.5776,
|
||
|
|
"mean_token_accuracy": 0.08751234114170074,
|
||
|
|
"num_tokens": 608697.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.016141748428344,
|
||
|
|
"epoch": 0.2922217447357112,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.00016950000000000003,
|
||
|
|
"loss": 7.568,
|
||
|
|
"mean_token_accuracy": 0.09023259431123734,
|
||
|
|
"num_tokens": 617275.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.084819841384888,
|
||
|
|
"epoch": 0.29651912333476577,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.00017199999999999998,
|
||
|
|
"loss": 7.6405,
|
||
|
|
"mean_token_accuracy": 0.08630914464592934,
|
||
|
|
"num_tokens": 626644.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.008595705032349,
|
||
|
|
"epoch": 0.30081650193382037,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.00017449999999999999,
|
||
|
|
"loss": 7.5665,
|
||
|
|
"mean_token_accuracy": 0.08766811862587928,
|
||
|
|
"num_tokens": 635110.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.04712610244751,
|
||
|
|
"epoch": 0.30511388053287497,
|
||
|
|
"grad_norm": 0.87109375,
|
||
|
|
"learning_rate": 0.000177,
|
||
|
|
"loss": 7.7031,
|
||
|
|
"mean_token_accuracy": 0.08570141717791557,
|
||
|
|
"num_tokens": 644746.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.179811954498291,
|
||
|
|
"epoch": 0.3094112591319295,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0001795,
|
||
|
|
"loss": 7.5831,
|
||
|
|
"mean_token_accuracy": 0.08595824986696243,
|
||
|
|
"num_tokens": 654281.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.987443113327027,
|
||
|
|
"epoch": 0.3137086377309841,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.000182,
|
||
|
|
"loss": 7.585,
|
||
|
|
"mean_token_accuracy": 0.09283285215497017,
|
||
|
|
"num_tokens": 663174.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.916810417175293,
|
||
|
|
"epoch": 0.31800601633003867,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0001845,
|
||
|
|
"loss": 7.511,
|
||
|
|
"mean_token_accuracy": 0.08863886222243308,
|
||
|
|
"num_tokens": 672178.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.005489206314087,
|
||
|
|
"epoch": 0.32230339492909327,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.000187,
|
||
|
|
"loss": 7.5218,
|
||
|
|
"mean_token_accuracy": 0.09131815880537034,
|
||
|
|
"num_tokens": 681323.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.9803643226623535,
|
||
|
|
"epoch": 0.3266007735281478,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0001895,
|
||
|
|
"loss": 7.4406,
|
||
|
|
"mean_token_accuracy": 0.08985799476504326,
|
||
|
|
"num_tokens": 690461.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.829833698272705,
|
||
|
|
"epoch": 0.3308981521272024,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000192,
|
||
|
|
"loss": 7.5004,
|
||
|
|
"mean_token_accuracy": 0.08490158319473266,
|
||
|
|
"num_tokens": 699199.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.038139152526856,
|
||
|
|
"epoch": 0.33519553072625696,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0001945,
|
||
|
|
"loss": 7.4484,
|
||
|
|
"mean_token_accuracy": 0.09670188426971435,
|
||
|
|
"num_tokens": 707949.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.9735198497772215,
|
||
|
|
"epoch": 0.33949290932531156,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00019700000000000002,
|
||
|
|
"loss": 7.5219,
|
||
|
|
"mean_token_accuracy": 0.08999367579817771,
|
||
|
|
"num_tokens": 715752.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.93391604423523,
|
||
|
|
"epoch": 0.3437902879243661,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00019950000000000002,
|
||
|
|
"loss": 7.4479,
|
||
|
|
"mean_token_accuracy": 0.0979436494410038,
|
||
|
|
"num_tokens": 724416.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.925309085845948,
|
||
|
|
"epoch": 0.3480876665234207,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000202,
|
||
|
|
"loss": 7.4953,
|
||
|
|
"mean_token_accuracy": 0.09031900316476822,
|
||
|
|
"num_tokens": 733116.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.916099977493286,
|
||
|
|
"epoch": 0.3523850451224753,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00020449999999999998,
|
||
|
|
"loss": 7.4726,
|
||
|
|
"mean_token_accuracy": 0.09227924942970275,
|
||
|
|
"num_tokens": 742093.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.918701934814453,
|
||
|
|
"epoch": 0.35668242372152986,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000207,
|
||
|
|
"loss": 7.4649,
|
||
|
|
"mean_token_accuracy": 0.09618089124560356,
|
||
|
|
"num_tokens": 750402.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.816703271865845,
|
||
|
|
"epoch": 0.36097980232058446,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0002095,
|
||
|
|
"loss": 7.4336,
|
||
|
|
"mean_token_accuracy": 0.09461462944746017,
|
||
|
|
"num_tokens": 760961.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.944287586212158,
|
||
|
|
"epoch": 0.365277180919639,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000212,
|
||
|
|
"loss": 7.4865,
|
||
|
|
"mean_token_accuracy": 0.09455274268984795,
|
||
|
|
"num_tokens": 770554.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.750526332855225,
|
||
|
|
"epoch": 0.3695745595186936,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0002145,
|
||
|
|
"loss": 7.4618,
|
||
|
|
"mean_token_accuracy": 0.09681151732802391,
|
||
|
|
"num_tokens": 779172.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.9787256717681885,
|
||
|
|
"epoch": 0.37387193811774816,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.00021700000000000002,
|
||
|
|
"loss": 7.5123,
|
||
|
|
"mean_token_accuracy": 0.08840151131153107,
|
||
|
|
"num_tokens": 788040.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.883750295639038,
|
||
|
|
"epoch": 0.37816931671680276,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0002195,
|
||
|
|
"loss": 7.4135,
|
||
|
|
"mean_token_accuracy": 0.0939902700483799,
|
||
|
|
"num_tokens": 796786.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.851776885986328,
|
||
|
|
"epoch": 0.3824666953158573,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000222,
|
||
|
|
"loss": 7.4233,
|
||
|
|
"mean_token_accuracy": 0.0923767201602459,
|
||
|
|
"num_tokens": 805520.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.805376100540161,
|
||
|
|
"epoch": 0.3867640739149119,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0002245,
|
||
|
|
"loss": 7.3508,
|
||
|
|
"mean_token_accuracy": 0.09647825658321381,
|
||
|
|
"num_tokens": 814939.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.874559307098389,
|
||
|
|
"epoch": 0.39106145251396646,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00022700000000000002,
|
||
|
|
"loss": 7.3531,
|
||
|
|
"mean_token_accuracy": 0.09795481041073799,
|
||
|
|
"num_tokens": 823862.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.7626677513122555,
|
||
|
|
"epoch": 0.39535883111302106,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00022950000000000002,
|
||
|
|
"loss": 7.3918,
|
||
|
|
"mean_token_accuracy": 0.09068166017532349,
|
||
|
|
"num_tokens": 832820.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.928297901153565,
|
||
|
|
"epoch": 0.39965620971207566,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00023200000000000003,
|
||
|
|
"loss": 7.3494,
|
||
|
|
"mean_token_accuracy": 0.09501236006617546,
|
||
|
|
"num_tokens": 841538.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.7496504306793215,
|
||
|
|
"epoch": 0.4039535883111302,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00023449999999999998,
|
||
|
|
"loss": 7.4626,
|
||
|
|
"mean_token_accuracy": 0.09104103595018387,
|
||
|
|
"num_tokens": 851123.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.8953351974487305,
|
||
|
|
"epoch": 0.4082509669101848,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000237,
|
||
|
|
"loss": 7.4266,
|
||
|
|
"mean_token_accuracy": 0.09596899375319481,
|
||
|
|
"num_tokens": 860357.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.76341495513916,
|
||
|
|
"epoch": 0.41254834550923936,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0002395,
|
||
|
|
"loss": 7.3425,
|
||
|
|
"mean_token_accuracy": 0.09861095696687698,
|
||
|
|
"num_tokens": 869980.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.82184157371521,
|
||
|
|
"epoch": 0.41684572410829396,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000242,
|
||
|
|
"loss": 7.2999,
|
||
|
|
"mean_token_accuracy": 0.10065284445881843,
|
||
|
|
"num_tokens": 878250.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.76347074508667,
|
||
|
|
"epoch": 0.4211431027073485,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0002445,
|
||
|
|
"loss": 7.4007,
|
||
|
|
"mean_token_accuracy": 0.095355936139822,
|
||
|
|
"num_tokens": 887624.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.753844261169434,
|
||
|
|
"epoch": 0.4254404813064031,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000247,
|
||
|
|
"loss": 7.3568,
|
||
|
|
"mean_token_accuracy": 0.09853926301002502,
|
||
|
|
"num_tokens": 897120.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.802051830291748,
|
||
|
|
"epoch": 0.42973785990545765,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0002495,
|
||
|
|
"loss": 7.3179,
|
||
|
|
"mean_token_accuracy": 0.10127250477671623,
|
||
|
|
"num_tokens": 906215.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42973785990545765,
|
||
|
|
"eval_entropy": 7.412716417699246,
|
||
|
|
"eval_loss": 7.3790483474731445,
|
||
|
|
"eval_mean_token_accuracy": 0.09986981684929347,
|
||
|
|
"eval_num_tokens": 906215.0,
|
||
|
|
"eval_runtime": 2.0966,
|
||
|
|
"eval_samples_per_second": 1692.736,
|
||
|
|
"eval_steps_per_second": 211.771,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.651102495193482,
|
||
|
|
"epoch": 0.43403523850451226,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000252,
|
||
|
|
"loss": 7.3112,
|
||
|
|
"mean_token_accuracy": 0.10008608102798462,
|
||
|
|
"num_tokens": 915181.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.728409194946289,
|
||
|
|
"epoch": 0.4383326171035668,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0002545,
|
||
|
|
"loss": 7.3388,
|
||
|
|
"mean_token_accuracy": 0.09651862978935241,
|
||
|
|
"num_tokens": 924377.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.770003318786621,
|
||
|
|
"epoch": 0.4426299957026214,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.000257,
|
||
|
|
"loss": 7.4098,
|
||
|
|
"mean_token_accuracy": 0.09438847750425339,
|
||
|
|
"num_tokens": 933114.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.86782751083374,
|
||
|
|
"epoch": 0.44692737430167595,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0002595,
|
||
|
|
"loss": 7.3692,
|
||
|
|
"mean_token_accuracy": 0.09444344118237495,
|
||
|
|
"num_tokens": 943306.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.659075498580933,
|
||
|
|
"epoch": 0.45122475290073055,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.000262,
|
||
|
|
"loss": 7.2626,
|
||
|
|
"mean_token_accuracy": 0.10587219074368477,
|
||
|
|
"num_tokens": 951515.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.713227224349976,
|
||
|
|
"epoch": 0.45552213149978515,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00026450000000000003,
|
||
|
|
"loss": 7.3711,
|
||
|
|
"mean_token_accuracy": 0.09387057200074196,
|
||
|
|
"num_tokens": 962686.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.780395078659057,
|
||
|
|
"epoch": 0.4598195100988397,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00026700000000000004,
|
||
|
|
"loss": 7.3777,
|
||
|
|
"mean_token_accuracy": 0.10021266266703606,
|
||
|
|
"num_tokens": 972136.0,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.657458114624023,
|
||
|
|
"epoch": 0.4641168886978943,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00026950000000000005,
|
||
|
|
"loss": 7.2696,
|
||
|
|
"mean_token_accuracy": 0.10345774069428444,
|
||
|
|
"num_tokens": 981301.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.700049114227295,
|
||
|
|
"epoch": 0.46841426729694885,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00027200000000000005,
|
||
|
|
"loss": 7.2923,
|
||
|
|
"mean_token_accuracy": 0.10189392492175102,
|
||
|
|
"num_tokens": 990360.0,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.770557546615601,
|
||
|
|
"epoch": 0.47271164589600345,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0002745,
|
||
|
|
"loss": 7.3438,
|
||
|
|
"mean_token_accuracy": 0.09953725263476372,
|
||
|
|
"num_tokens": 999415.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.656623125076294,
|
||
|
|
"epoch": 0.477009024495058,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000277,
|
||
|
|
"loss": 7.2635,
|
||
|
|
"mean_token_accuracy": 0.10239741951227188,
|
||
|
|
"num_tokens": 1008762.0,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.690563821792603,
|
||
|
|
"epoch": 0.4813064030941126,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0002795,
|
||
|
|
"loss": 7.2652,
|
||
|
|
"mean_token_accuracy": 0.10631422251462937,
|
||
|
|
"num_tokens": 1017704.0,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.641897583007813,
|
||
|
|
"epoch": 0.48560378169316715,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00028199999999999997,
|
||
|
|
"loss": 7.2341,
|
||
|
|
"mean_token_accuracy": 0.10428761765360832,
|
||
|
|
"num_tokens": 1026251.0,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.641419315338135,
|
||
|
|
"epoch": 0.48990116029222175,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0002845,
|
||
|
|
"loss": 7.2158,
|
||
|
|
"mean_token_accuracy": 0.10731100514531136,
|
||
|
|
"num_tokens": 1036191.0,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.658735990524292,
|
||
|
|
"epoch": 0.4941985388912763,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000287,
|
||
|
|
"loss": 7.2462,
|
||
|
|
"mean_token_accuracy": 0.10594421103596688,
|
||
|
|
"num_tokens": 1044936.0,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.621677112579346,
|
||
|
|
"epoch": 0.4984959174903309,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0002895,
|
||
|
|
"loss": 7.2472,
|
||
|
|
"mean_token_accuracy": 0.10367096737027168,
|
||
|
|
"num_tokens": 1053683.0,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.570435047149658,
|
||
|
|
"epoch": 0.5027932960893855,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000292,
|
||
|
|
"loss": 7.2271,
|
||
|
|
"mean_token_accuracy": 0.1076263040304184,
|
||
|
|
"num_tokens": 1062932.0,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.723283386230468,
|
||
|
|
"epoch": 0.50709067468844,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0002945,
|
||
|
|
"loss": 7.2544,
|
||
|
|
"mean_token_accuracy": 0.10264097228646278,
|
||
|
|
"num_tokens": 1072313.0,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.62511043548584,
|
||
|
|
"epoch": 0.5113880532874946,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.000297,
|
||
|
|
"loss": 7.2228,
|
||
|
|
"mean_token_accuracy": 0.09801378548145294,
|
||
|
|
"num_tokens": 1081675.0,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.608328151702881,
|
||
|
|
"epoch": 0.5156854318865493,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0002995,
|
||
|
|
"loss": 7.2433,
|
||
|
|
"mean_token_accuracy": 0.10141062065958976,
|
||
|
|
"num_tokens": 1091541.0,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.695394897460938,
|
||
|
|
"epoch": 0.5199828104856038,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000302,
|
||
|
|
"loss": 7.2462,
|
||
|
|
"mean_token_accuracy": 0.10475782826542854,
|
||
|
|
"num_tokens": 1100724.0,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.50453405380249,
|
||
|
|
"epoch": 0.5242801890846583,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003045,
|
||
|
|
"loss": 7.1924,
|
||
|
|
"mean_token_accuracy": 0.1077597513794899,
|
||
|
|
"num_tokens": 1108869.0,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.644835519790649,
|
||
|
|
"epoch": 0.5285775676837129,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.000307,
|
||
|
|
"loss": 7.2261,
|
||
|
|
"mean_token_accuracy": 0.10431057810783387,
|
||
|
|
"num_tokens": 1117314.0,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.488267469406128,
|
||
|
|
"epoch": 0.5328749462827675,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0003095,
|
||
|
|
"loss": 7.148,
|
||
|
|
"mean_token_accuracy": 0.10711429193615914,
|
||
|
|
"num_tokens": 1126786.0,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.577956056594848,
|
||
|
|
"epoch": 0.5371723248818221,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.000312,
|
||
|
|
"loss": 7.1645,
|
||
|
|
"mean_token_accuracy": 0.10579404905438423,
|
||
|
|
"num_tokens": 1136013.0,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.527575206756592,
|
||
|
|
"epoch": 0.5414697034808766,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0003145,
|
||
|
|
"loss": 7.1969,
|
||
|
|
"mean_token_accuracy": 0.10749110653996467,
|
||
|
|
"num_tokens": 1144970.0,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.613465976715088,
|
||
|
|
"epoch": 0.5457670820799312,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.000317,
|
||
|
|
"loss": 7.1614,
|
||
|
|
"mean_token_accuracy": 0.11203600242733955,
|
||
|
|
"num_tokens": 1153810.0,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.521342611312866,
|
||
|
|
"epoch": 0.5500644606789858,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003195,
|
||
|
|
"loss": 7.1408,
|
||
|
|
"mean_token_accuracy": 0.10991051346063614,
|
||
|
|
"num_tokens": 1162498.0,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.5313867092132565,
|
||
|
|
"epoch": 0.5543618392780404,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000322,
|
||
|
|
"loss": 7.2164,
|
||
|
|
"mean_token_accuracy": 0.1044546626508236,
|
||
|
|
"num_tokens": 1172091.0,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.653256607055664,
|
||
|
|
"epoch": 0.5586592178770949,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00032450000000000003,
|
||
|
|
"loss": 7.1977,
|
||
|
|
"mean_token_accuracy": 0.10631284043192864,
|
||
|
|
"num_tokens": 1181400.0,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.537307643890381,
|
||
|
|
"epoch": 0.5629565964761496,
|
||
|
|
"grad_norm": 1.2890625,
|
||
|
|
"learning_rate": 0.00032700000000000003,
|
||
|
|
"loss": 7.1721,
|
||
|
|
"mean_token_accuracy": 0.11125476211309433,
|
||
|
|
"num_tokens": 1189780.0,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.477937269210815,
|
||
|
|
"epoch": 0.5672539750752041,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00032950000000000004,
|
||
|
|
"loss": 7.1315,
|
||
|
|
"mean_token_accuracy": 0.1057468131184578,
|
||
|
|
"num_tokens": 1198671.0,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.589753818511963,
|
||
|
|
"epoch": 0.5715513536742587,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00033200000000000005,
|
||
|
|
"loss": 7.1652,
|
||
|
|
"mean_token_accuracy": 0.1051194004714489,
|
||
|
|
"num_tokens": 1207173.0,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.461796855926513,
|
||
|
|
"epoch": 0.5758487322733132,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.00033450000000000005,
|
||
|
|
"loss": 7.0998,
|
||
|
|
"mean_token_accuracy": 0.11046240702271462,
|
||
|
|
"num_tokens": 1216387.0,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.622633552551269,
|
||
|
|
"epoch": 0.5801461108723679,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000337,
|
||
|
|
"loss": 7.0722,
|
||
|
|
"mean_token_accuracy": 0.11004948541522026,
|
||
|
|
"num_tokens": 1224461.0,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.451505851745606,
|
||
|
|
"epoch": 0.5844434894714224,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0003395,
|
||
|
|
"loss": 7.1414,
|
||
|
|
"mean_token_accuracy": 0.11011224165558815,
|
||
|
|
"num_tokens": 1233774.0,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.457524538040161,
|
||
|
|
"epoch": 0.588740868070477,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.000342,
|
||
|
|
"loss": 7.0938,
|
||
|
|
"mean_token_accuracy": 0.1142980344593525,
|
||
|
|
"num_tokens": 1242812.0,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.605640840530396,
|
||
|
|
"epoch": 0.5930382466695315,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00034449999999999997,
|
||
|
|
"loss": 7.191,
|
||
|
|
"mean_token_accuracy": 0.11035142987966537,
|
||
|
|
"num_tokens": 1252872.0,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.307473850250244,
|
||
|
|
"epoch": 0.5973356252685862,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.000347,
|
||
|
|
"loss": 6.983,
|
||
|
|
"mean_token_accuracy": 0.11081922426819801,
|
||
|
|
"num_tokens": 1260852.0,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.438599157333374,
|
||
|
|
"epoch": 0.6016330038676407,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0003495,
|
||
|
|
"loss": 7.0984,
|
||
|
|
"mean_token_accuracy": 0.10763570070266723,
|
||
|
|
"num_tokens": 1268925.0,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.530004072189331,
|
||
|
|
"epoch": 0.6059303824666953,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000352,
|
||
|
|
"loss": 7.145,
|
||
|
|
"mean_token_accuracy": 0.10653513446450233,
|
||
|
|
"num_tokens": 1278994.0,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.4260091304779055,
|
||
|
|
"epoch": 0.6102277610657499,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003545,
|
||
|
|
"loss": 7.1323,
|
||
|
|
"mean_token_accuracy": 0.10368426591157913,
|
||
|
|
"num_tokens": 1287698.0,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.482218551635742,
|
||
|
|
"epoch": 0.6145251396648045,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000357,
|
||
|
|
"loss": 7.0787,
|
||
|
|
"mean_token_accuracy": 0.11120296269655228,
|
||
|
|
"num_tokens": 1297475.0,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.480340671539307,
|
||
|
|
"epoch": 0.618822518263859,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0003595,
|
||
|
|
"loss": 7.1091,
|
||
|
|
"mean_token_accuracy": 0.11085583940148354,
|
||
|
|
"num_tokens": 1306836.0,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.506947946548462,
|
||
|
|
"epoch": 0.6231198968629136,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000362,
|
||
|
|
"loss": 7.1377,
|
||
|
|
"mean_token_accuracy": 0.10435779988765717,
|
||
|
|
"num_tokens": 1315872.0,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.4788847923278805,
|
||
|
|
"epoch": 0.6274172754619682,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0003645,
|
||
|
|
"loss": 7.0782,
|
||
|
|
"mean_token_accuracy": 0.11685637310147286,
|
||
|
|
"num_tokens": 1324624.0,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.444537830352783,
|
||
|
|
"epoch": 0.6317146540610228,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000367,
|
||
|
|
"loss": 7.061,
|
||
|
|
"mean_token_accuracy": 0.11548577472567559,
|
||
|
|
"num_tokens": 1333058.0,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.262284660339356,
|
||
|
|
"epoch": 0.6360120326600773,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0003695,
|
||
|
|
"loss": 7.0248,
|
||
|
|
"mean_token_accuracy": 0.11004846841096878,
|
||
|
|
"num_tokens": 1342376.0,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.526681852340698,
|
||
|
|
"epoch": 0.6403094112591319,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000372,
|
||
|
|
"loss": 7.0693,
|
||
|
|
"mean_token_accuracy": 0.10503109246492386,
|
||
|
|
"num_tokens": 1351386.0,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.364239978790283,
|
||
|
|
"epoch": 0.6446067898581865,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.0003745,
|
||
|
|
"loss": 6.9832,
|
||
|
|
"mean_token_accuracy": 0.11761592403054237,
|
||
|
|
"num_tokens": 1358958.0,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.496349859237671,
|
||
|
|
"epoch": 0.6489041684572411,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000377,
|
||
|
|
"loss": 7.1231,
|
||
|
|
"mean_token_accuracy": 0.10967899858951569,
|
||
|
|
"num_tokens": 1368599.0,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.435608530044556,
|
||
|
|
"epoch": 0.6532015470562956,
|
||
|
|
"grad_norm": 1.890625,
|
||
|
|
"learning_rate": 0.0003795,
|
||
|
|
"loss": 7.1433,
|
||
|
|
"mean_token_accuracy": 0.1064300425350666,
|
||
|
|
"num_tokens": 1378529.0,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.344243001937866,
|
||
|
|
"epoch": 0.6574989256553503,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.000382,
|
||
|
|
"loss": 6.9306,
|
||
|
|
"mean_token_accuracy": 0.11750481277704239,
|
||
|
|
"num_tokens": 1386993.0,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.390715217590332,
|
||
|
|
"epoch": 0.6617963042544048,
|
||
|
|
"grad_norm": 1.5,
|
||
|
|
"learning_rate": 0.0003845,
|
||
|
|
"loss": 7.0322,
|
||
|
|
"mean_token_accuracy": 0.11829963177442551,
|
||
|
|
"num_tokens": 1395790.0,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.302670812606811,
|
||
|
|
"epoch": 0.6660936828534594,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00038700000000000003,
|
||
|
|
"loss": 7.0393,
|
||
|
|
"mean_token_accuracy": 0.11235549300909042,
|
||
|
|
"num_tokens": 1405587.0,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.348860168457032,
|
||
|
|
"epoch": 0.6703910614525139,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00038950000000000003,
|
||
|
|
"loss": 6.9999,
|
||
|
|
"mean_token_accuracy": 0.11504087448120118,
|
||
|
|
"num_tokens": 1414478.0,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.428205347061157,
|
||
|
|
"epoch": 0.6746884400515686,
|
||
|
|
"grad_norm": 1.375,
|
||
|
|
"learning_rate": 0.00039200000000000004,
|
||
|
|
"loss": 7.0623,
|
||
|
|
"mean_token_accuracy": 0.11534775421023369,
|
||
|
|
"num_tokens": 1423791.0,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.467832851409912,
|
||
|
|
"epoch": 0.6789858186506231,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.00039450000000000005,
|
||
|
|
"loss": 7.1014,
|
||
|
|
"mean_token_accuracy": 0.10728210881352425,
|
||
|
|
"num_tokens": 1432955.0,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.385548782348633,
|
||
|
|
"epoch": 0.6832831972496777,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00039700000000000005,
|
||
|
|
"loss": 7.074,
|
||
|
|
"mean_token_accuracy": 0.1087567687034607,
|
||
|
|
"num_tokens": 1441907.0,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.290066146850586,
|
||
|
|
"epoch": 0.6875805758487322,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0003995,
|
||
|
|
"loss": 6.935,
|
||
|
|
"mean_token_accuracy": 0.11768098697066307,
|
||
|
|
"num_tokens": 1451062.0,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.399672508239746,
|
||
|
|
"epoch": 0.6918779544477869,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000402,
|
||
|
|
"loss": 7.0218,
|
||
|
|
"mean_token_accuracy": 0.10959179401397705,
|
||
|
|
"num_tokens": 1460132.0,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.272280263900757,
|
||
|
|
"epoch": 0.6961753330468414,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004045,
|
||
|
|
"loss": 6.9141,
|
||
|
|
"mean_token_accuracy": 0.11885375007987023,
|
||
|
|
"num_tokens": 1469582.0,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.255832242965698,
|
||
|
|
"epoch": 0.700472711645896,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.00040699999999999997,
|
||
|
|
"loss": 7.012,
|
||
|
|
"mean_token_accuracy": 0.10950389429926873,
|
||
|
|
"num_tokens": 1479053.0,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.313858604431152,
|
||
|
|
"epoch": 0.7047700902449506,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0004095,
|
||
|
|
"loss": 7.0142,
|
||
|
|
"mean_token_accuracy": 0.11343196108937263,
|
||
|
|
"num_tokens": 1488189.0,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.236453676223755,
|
||
|
|
"epoch": 0.7090674688440052,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000412,
|
||
|
|
"loss": 6.8662,
|
||
|
|
"mean_token_accuracy": 0.12046442031860352,
|
||
|
|
"num_tokens": 1497324.0,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.310264635086059,
|
||
|
|
"epoch": 0.7133648474430597,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004145,
|
||
|
|
"loss": 6.9814,
|
||
|
|
"mean_token_accuracy": 0.11739002540707588,
|
||
|
|
"num_tokens": 1506543.0,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.289929437637329,
|
||
|
|
"epoch": 0.7176622260421143,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000417,
|
||
|
|
"loss": 6.9742,
|
||
|
|
"mean_token_accuracy": 0.12236066460609436,
|
||
|
|
"num_tokens": 1516737.0,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.161224508285523,
|
||
|
|
"epoch": 0.7219596046411689,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004195,
|
||
|
|
"loss": 6.8503,
|
||
|
|
"mean_token_accuracy": 0.11500222384929656,
|
||
|
|
"num_tokens": 1525561.0,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.280500030517578,
|
||
|
|
"epoch": 0.7262569832402235,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000422,
|
||
|
|
"loss": 6.8765,
|
||
|
|
"mean_token_accuracy": 0.1242159940302372,
|
||
|
|
"num_tokens": 1533323.0,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.292038059234619,
|
||
|
|
"epoch": 0.730554361839278,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004245,
|
||
|
|
"loss": 6.9379,
|
||
|
|
"mean_token_accuracy": 0.12142991349101066,
|
||
|
|
"num_tokens": 1542632.0,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.305912923812866,
|
||
|
|
"epoch": 0.7348517404383326,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.000427,
|
||
|
|
"loss": 6.8775,
|
||
|
|
"mean_token_accuracy": 0.12107516825199127,
|
||
|
|
"num_tokens": 1551236.0,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.118098545074463,
|
||
|
|
"epoch": 0.7391491190373872,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004295,
|
||
|
|
"loss": 6.878,
|
||
|
|
"mean_token_accuracy": 0.12266490310430526,
|
||
|
|
"num_tokens": 1559674.0,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.268103885650635,
|
||
|
|
"epoch": 0.7434464976364418,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000432,
|
||
|
|
"loss": 6.9687,
|
||
|
|
"mean_token_accuracy": 0.1217973381280899,
|
||
|
|
"num_tokens": 1569481.0,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2675707817077635,
|
||
|
|
"epoch": 0.7477438762354963,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004345,
|
||
|
|
"loss": 6.9975,
|
||
|
|
"mean_token_accuracy": 0.11359266638755798,
|
||
|
|
"num_tokens": 1578488.0,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.171451759338379,
|
||
|
|
"epoch": 0.752041254834551,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000437,
|
||
|
|
"loss": 6.8946,
|
||
|
|
"mean_token_accuracy": 0.11810402423143387,
|
||
|
|
"num_tokens": 1586675.0,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.285072469711304,
|
||
|
|
"epoch": 0.7563386334336055,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004395,
|
||
|
|
"loss": 7.0021,
|
||
|
|
"mean_token_accuracy": 0.10800698548555374,
|
||
|
|
"num_tokens": 1595411.0,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.312672233581543,
|
||
|
|
"epoch": 0.7606360120326601,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000442,
|
||
|
|
"loss": 6.9755,
|
||
|
|
"mean_token_accuracy": 0.11759781166911125,
|
||
|
|
"num_tokens": 1604046.0,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.245748281478882,
|
||
|
|
"epoch": 0.7649333906317146,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004445,
|
||
|
|
"loss": 6.9643,
|
||
|
|
"mean_token_accuracy": 0.11201045587658882,
|
||
|
|
"num_tokens": 1613759.0,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.238279533386231,
|
||
|
|
"epoch": 0.7692307692307693,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000447,
|
||
|
|
"loss": 6.9209,
|
||
|
|
"mean_token_accuracy": 0.11877147182822227,
|
||
|
|
"num_tokens": 1623323.0,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.230697107315064,
|
||
|
|
"epoch": 0.7735281478298238,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00044950000000000003,
|
||
|
|
"loss": 6.9005,
|
||
|
|
"mean_token_accuracy": 0.11391794160008431,
|
||
|
|
"num_tokens": 1631727.0,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.194222545623779,
|
||
|
|
"epoch": 0.7778255264288784,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00045200000000000004,
|
||
|
|
"loss": 6.8583,
|
||
|
|
"mean_token_accuracy": 0.12049278989434242,
|
||
|
|
"num_tokens": 1639544.0,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.284112405776978,
|
||
|
|
"epoch": 0.7821229050279329,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00045450000000000004,
|
||
|
|
"loss": 6.9773,
|
||
|
|
"mean_token_accuracy": 0.11113567724823951,
|
||
|
|
"num_tokens": 1648931.0,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1627342224121096,
|
||
|
|
"epoch": 0.7864202836269876,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00045700000000000005,
|
||
|
|
"loss": 6.8345,
|
||
|
|
"mean_token_accuracy": 0.12127922549843788,
|
||
|
|
"num_tokens": 1657688.0,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.259271335601807,
|
||
|
|
"epoch": 0.7907176622260421,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00045950000000000006,
|
||
|
|
"loss": 6.9244,
|
||
|
|
"mean_token_accuracy": 0.11565326899290085,
|
||
|
|
"num_tokens": 1666879.0,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1275458335876465,
|
||
|
|
"epoch": 0.7950150408250967,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000462,
|
||
|
|
"loss": 6.8982,
|
||
|
|
"mean_token_accuracy": 0.118662890791893,
|
||
|
|
"num_tokens": 1676773.0,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2360998630523685,
|
||
|
|
"epoch": 0.7993124194241513,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004645,
|
||
|
|
"loss": 7.0092,
|
||
|
|
"mean_token_accuracy": 0.11184348464012146,
|
||
|
|
"num_tokens": 1686144.0,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.26247010231018,
|
||
|
|
"epoch": 0.8036097980232059,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000467,
|
||
|
|
"loss": 6.9646,
|
||
|
|
"mean_token_accuracy": 0.10949353277683258,
|
||
|
|
"num_tokens": 1695476.0,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.174946022033692,
|
||
|
|
"epoch": 0.8079071766222604,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004695,
|
||
|
|
"loss": 6.8498,
|
||
|
|
"mean_token_accuracy": 0.12084392830729485,
|
||
|
|
"num_tokens": 1704907.0,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.166734504699707,
|
||
|
|
"epoch": 0.812204555221315,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.000472,
|
||
|
|
"loss": 6.8948,
|
||
|
|
"mean_token_accuracy": 0.12091493904590607,
|
||
|
|
"num_tokens": 1714564.0,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.244975614547729,
|
||
|
|
"epoch": 0.8165019338203696,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004745,
|
||
|
|
"loss": 6.9209,
|
||
|
|
"mean_token_accuracy": 0.1155279442667961,
|
||
|
|
"num_tokens": 1725285.0,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1149109363555905,
|
||
|
|
"epoch": 0.8207993124194242,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000477,
|
||
|
|
"loss": 6.9153,
|
||
|
|
"mean_token_accuracy": 0.11715079098939896,
|
||
|
|
"num_tokens": 1734331.0,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.227117824554443,
|
||
|
|
"epoch": 0.8250966910184787,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0004795,
|
||
|
|
"loss": 6.852,
|
||
|
|
"mean_token_accuracy": 0.11185217499732972,
|
||
|
|
"num_tokens": 1742340.0,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.160442066192627,
|
||
|
|
"epoch": 0.8293940696175333,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000482,
|
||
|
|
"loss": 6.8351,
|
||
|
|
"mean_token_accuracy": 0.12198592498898506,
|
||
|
|
"num_tokens": 1751725.0,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.999344539642334,
|
||
|
|
"epoch": 0.8336914482165879,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004845,
|
||
|
|
"loss": 6.7683,
|
||
|
|
"mean_token_accuracy": 0.12398558706045151,
|
||
|
|
"num_tokens": 1760294.0,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.112461137771606,
|
||
|
|
"epoch": 0.8379888268156425,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000487,
|
||
|
|
"loss": 6.8275,
|
||
|
|
"mean_token_accuracy": 0.11639805063605309,
|
||
|
|
"num_tokens": 1768912.0,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.257990169525146,
|
||
|
|
"epoch": 0.842286205414697,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004895,
|
||
|
|
"loss": 7.0148,
|
||
|
|
"mean_token_accuracy": 0.12016609534621239,
|
||
|
|
"num_tokens": 1778633.0,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1191816329956055,
|
||
|
|
"epoch": 0.8465835840137517,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.000492,
|
||
|
|
"loss": 6.8847,
|
||
|
|
"mean_token_accuracy": 0.11811531409621238,
|
||
|
|
"num_tokens": 1787275.0,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.235857200622559,
|
||
|
|
"epoch": 0.8508809626128062,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0004945,
|
||
|
|
"loss": 6.8878,
|
||
|
|
"mean_token_accuracy": 0.11604067236185074,
|
||
|
|
"num_tokens": 1795994.0,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.036646842956543,
|
||
|
|
"epoch": 0.8551783412118608,
|
||
|
|
"grad_norm": 0.8359375,
|
||
|
|
"learning_rate": 0.000497,
|
||
|
|
"loss": 6.804,
|
||
|
|
"mean_token_accuracy": 0.11985133662819862,
|
||
|
|
"num_tokens": 1806379.0,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.154667520523072,
|
||
|
|
"epoch": 0.8594757198109153,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004995,
|
||
|
|
"loss": 6.8296,
|
||
|
|
"mean_token_accuracy": 0.1270947828888893,
|
||
|
|
"num_tokens": 1816135.0,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8594757198109153,
|
||
|
|
"eval_entropy": 6.812919497489929,
|
||
|
|
"eval_loss": 6.8574419021606445,
|
||
|
|
"eval_mean_token_accuracy": 0.12292942362795542,
|
||
|
|
"eval_num_tokens": 1816135.0,
|
||
|
|
"eval_runtime": 2.0522,
|
||
|
|
"eval_samples_per_second": 1729.37,
|
||
|
|
"eval_steps_per_second": 216.354,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.122643280029297,
|
||
|
|
"epoch": 0.86377309840997,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0004999998427807679,
|
||
|
|
"loss": 6.8305,
|
||
|
|
"mean_token_accuracy": 0.12133256047964096,
|
||
|
|
"num_tokens": 1824777.0,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.058982563018799,
|
||
|
|
"epoch": 0.8680704770090245,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0004999992040780138,
|
||
|
|
"loss": 6.8924,
|
||
|
|
"mean_token_accuracy": 0.12320492565631866,
|
||
|
|
"num_tokens": 1833807.0,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.185050773620605,
|
||
|
|
"epoch": 0.8723678556080791,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999980740669294,
|
||
|
|
"loss": 6.8357,
|
||
|
|
"mean_token_accuracy": 0.11969011649489403,
|
||
|
|
"num_tokens": 1843375.0,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.11086139678955,
|
||
|
|
"epoch": 0.8766652342071336,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999964527499823,
|
||
|
|
"loss": 6.9058,
|
||
|
|
"mean_token_accuracy": 0.11237111985683441,
|
||
|
|
"num_tokens": 1853036.0,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.120519638061523,
|
||
|
|
"epoch": 0.8809626128061883,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999943401307127,
|
||
|
|
"loss": 6.8707,
|
||
|
|
"mean_token_accuracy": 0.11769452393054962,
|
||
|
|
"num_tokens": 1862041.0,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.087871503829956,
|
||
|
|
"epoch": 0.8852599914052428,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004999917362137337,
|
||
|
|
"loss": 6.7742,
|
||
|
|
"mean_token_accuracy": 0.1225271351635456,
|
||
|
|
"num_tokens": 1870707.0,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.055140686035156,
|
||
|
|
"epoch": 0.8895573700042974,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999886410047312,
|
||
|
|
"loss": 6.7705,
|
||
|
|
"mean_token_accuracy": 0.11845692843198777,
|
||
|
|
"num_tokens": 1879787.0,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.138674926757813,
|
||
|
|
"epoch": 0.8938547486033519,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004999850545104638,
|
||
|
|
"loss": 6.8315,
|
||
|
|
"mean_token_accuracy": 0.1223653219640255,
|
||
|
|
"num_tokens": 1889413.0,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.048402404785156,
|
||
|
|
"epoch": 0.8981521272024066,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004999809767387633,
|
||
|
|
"loss": 6.8174,
|
||
|
|
"mean_token_accuracy": 0.12110616937279702,
|
||
|
|
"num_tokens": 1898283.0,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.144178056716919,
|
||
|
|
"epoch": 0.9024495058014611,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999764076985337,
|
||
|
|
"loss": 6.8287,
|
||
|
|
"mean_token_accuracy": 0.12670400962233544,
|
||
|
|
"num_tokens": 1907175.0,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.988327312469482,
|
||
|
|
"epoch": 0.9067468844005157,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999713473997519,
|
||
|
|
"loss": 6.8824,
|
||
|
|
"mean_token_accuracy": 0.11774980947375298,
|
||
|
|
"num_tokens": 1918223.0,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.124748563766479,
|
||
|
|
"epoch": 0.9110442629995703,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999657958534677,
|
||
|
|
"loss": 6.8312,
|
||
|
|
"mean_token_accuracy": 0.1194000355899334,
|
||
|
|
"num_tokens": 1928801.0,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.008511686325074,
|
||
|
|
"epoch": 0.9153416415986249,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004999597530718034,
|
||
|
|
"loss": 6.7896,
|
||
|
|
"mean_token_accuracy": 0.12186847031116485,
|
||
|
|
"num_tokens": 1937406.0,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.997484445571899,
|
||
|
|
"epoch": 0.9196390201976794,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000499953219067954,
|
||
|
|
"loss": 6.7932,
|
||
|
|
"mean_token_accuracy": 0.11857569143176079,
|
||
|
|
"num_tokens": 1947184.0,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.135808944702148,
|
||
|
|
"epoch": 0.923936398796734,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999461938561873,
|
||
|
|
"loss": 6.8139,
|
||
|
|
"mean_token_accuracy": 0.12288291603326798,
|
||
|
|
"num_tokens": 1956293.0,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.027012157440185,
|
||
|
|
"epoch": 0.9282337773957886,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004999386774518432,
|
||
|
|
"loss": 6.7854,
|
||
|
|
"mean_token_accuracy": 0.11997194737195968,
|
||
|
|
"num_tokens": 1964791.0,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.975531768798828,
|
||
|
|
"epoch": 0.9325311559948432,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999306698713349,
|
||
|
|
"loss": 6.7088,
|
||
|
|
"mean_token_accuracy": 0.12559010088443756,
|
||
|
|
"num_tokens": 1973754.0,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.052453565597534,
|
||
|
|
"epoch": 0.9368285345938977,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999221711321477,
|
||
|
|
"loss": 6.7738,
|
||
|
|
"mean_token_accuracy": 0.12475829720497131,
|
||
|
|
"num_tokens": 1983035.0,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.906819009780884,
|
||
|
|
"epoch": 0.9411259131929522,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999131812528393,
|
||
|
|
"loss": 6.8003,
|
||
|
|
"mean_token_accuracy": 0.12229804769158363,
|
||
|
|
"num_tokens": 1992584.0,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.109902429580688,
|
||
|
|
"epoch": 0.9454232917920069,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.00049990370025304,
|
||
|
|
"loss": 6.8193,
|
||
|
|
"mean_token_accuracy": 0.12188051193952561,
|
||
|
|
"num_tokens": 2001876.0,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.017454195022583,
|
||
|
|
"epoch": 0.9497206703910615,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004998937281534526,
|
||
|
|
"loss": 6.7115,
|
||
|
|
"mean_token_accuracy": 0.1300358146429062,
|
||
|
|
"num_tokens": 2011067.0,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.091220808029175,
|
||
|
|
"epoch": 0.954018048990116,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004998832649758521,
|
||
|
|
"loss": 6.8077,
|
||
|
|
"mean_token_accuracy": 0.12548175528645517,
|
||
|
|
"num_tokens": 2020763.0,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.9685986042022705,
|
||
|
|
"epoch": 0.9583154275891707,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004998723107430862,
|
||
|
|
"loss": 6.7867,
|
||
|
|
"mean_token_accuracy": 0.12391732335090637,
|
||
|
|
"num_tokens": 2029534.0,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.046098041534424,
|
||
|
|
"epoch": 0.9626128061882252,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004998608654790741,
|
||
|
|
"loss": 6.7311,
|
||
|
|
"mean_token_accuracy": 0.12396327033638954,
|
||
|
|
"num_tokens": 2039143.0,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.939239406585694,
|
||
|
|
"epoch": 0.9669101847872797,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000499848929208808,
|
||
|
|
"loss": 6.7022,
|
||
|
|
"mean_token_accuracy": 0.1295892022550106,
|
||
|
|
"num_tokens": 2048253.0,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.931437301635742,
|
||
|
|
"epoch": 0.9712075633863343,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004998365019583519,
|
||
|
|
"loss": 6.7428,
|
||
|
|
"mean_token_accuracy": 0.13122318536043168,
|
||
|
|
"num_tokens": 2057234.0,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.081391954421997,
|
||
|
|
"epoch": 0.975504941985389,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004998235837548417,
|
||
|
|
"loss": 6.7881,
|
||
|
|
"mean_token_accuracy": 0.1271953523159027,
|
||
|
|
"num_tokens": 2065431.0,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.974546146392822,
|
||
|
|
"epoch": 0.9798023205844435,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000499810174626486,
|
||
|
|
"loss": 6.7888,
|
||
|
|
"mean_token_accuracy": 0.1228917419910431,
|
||
|
|
"num_tokens": 2074723.0,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.011039209365845,
|
||
|
|
"epoch": 0.984099699183498,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004997962746025646,
|
||
|
|
"loss": 6.6544,
|
||
|
|
"mean_token_accuracy": 0.13169871941208838,
|
||
|
|
"num_tokens": 2084509.0,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.973200798034668,
|
||
|
|
"epoch": 0.9883970777825526,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0004997818837134298,
|
||
|
|
"loss": 6.8028,
|
||
|
|
"mean_token_accuracy": 0.12382483929395675,
|
||
|
|
"num_tokens": 2093110.0,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.879178285598755,
|
||
|
|
"epoch": 0.9926944563816072,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004997670019905057,
|
||
|
|
"loss": 6.6634,
|
||
|
|
"mean_token_accuracy": 0.12532600611448289,
|
||
|
|
"num_tokens": 2102355.0,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.967250823974609,
|
||
|
|
"epoch": 0.9969918349806618,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004997516294662876,
|
||
|
|
"loss": 6.6987,
|
||
|
|
"mean_token_accuracy": 0.12651606351137162,
|
||
|
|
"num_tokens": 2110418.0,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.987489064534505,
|
||
|
|
"epoch": 1.0008594757198108,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004997357661743433,
|
||
|
|
"loss": 6.6851,
|
||
|
|
"mean_token_accuracy": 0.12885562578837076,
|
||
|
|
"num_tokens": 2117866.0,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.906875991821289,
|
||
|
|
"epoch": 1.0051568543188656,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004997194121493118,
|
||
|
|
"loss": 6.5242,
|
||
|
|
"mean_token_accuracy": 0.1341039627790451,
|
||
|
|
"num_tokens": 2126082.0,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.9217222213745115,
|
||
|
|
"epoch": 1.0094542329179201,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004997025674269037,
|
||
|
|
"loss": 6.496,
|
||
|
|
"mean_token_accuracy": 0.14013660922646523,
|
||
|
|
"num_tokens": 2134042.0,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.853777265548706,
|
||
|
|
"epoch": 1.0137516115169747,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004996852320439013,
|
||
|
|
"loss": 6.5756,
|
||
|
|
"mean_token_accuracy": 0.13146138042211533,
|
||
|
|
"num_tokens": 2142570.0,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.882978248596191,
|
||
|
|
"epoch": 1.0180489901160292,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004996674060381578,
|
||
|
|
"loss": 6.5116,
|
||
|
|
"mean_token_accuracy": 0.13583723902702333,
|
||
|
|
"num_tokens": 2151310.0,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.949011325836182,
|
||
|
|
"epoch": 1.0223463687150838,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004996490894485985,
|
||
|
|
"loss": 6.5696,
|
||
|
|
"mean_token_accuracy": 0.1317083679139614,
|
||
|
|
"num_tokens": 2160662.0,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.906634664535522,
|
||
|
|
"epoch": 1.0266437473141383,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004996302823152193,
|
||
|
|
"loss": 6.5221,
|
||
|
|
"mean_token_accuracy": 0.132858457416296,
|
||
|
|
"num_tokens": 2170067.0,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.835825204849243,
|
||
|
|
"epoch": 1.0309411259131929,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004996109846790873,
|
||
|
|
"loss": 6.4844,
|
||
|
|
"mean_token_accuracy": 0.13565613552927971,
|
||
|
|
"num_tokens": 2178850.0,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.833173513412476,
|
||
|
|
"epoch": 1.0352385045122476,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004995911965823412,
|
||
|
|
"loss": 6.5058,
|
||
|
|
"mean_token_accuracy": 0.14241415858268738,
|
||
|
|
"num_tokens": 2188307.0,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.888755178451538,
|
||
|
|
"epoch": 1.0395358831113022,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004995709180681899,
|
||
|
|
"loss": 6.5098,
|
||
|
|
"mean_token_accuracy": 0.14214854687452316,
|
||
|
|
"num_tokens": 2197026.0,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.828827667236328,
|
||
|
|
"epoch": 1.0438332617103567,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000499550149180914,
|
||
|
|
"loss": 6.4795,
|
||
|
|
"mean_token_accuracy": 0.13599886670708655,
|
||
|
|
"num_tokens": 2205537.0,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.880095815658569,
|
||
|
|
"epoch": 1.0481306403094113,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004995288899658641,
|
||
|
|
"loss": 6.5128,
|
||
|
|
"mean_token_accuracy": 0.14047559648752211,
|
||
|
|
"num_tokens": 2214508.0,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.848831415176392,
|
||
|
|
"epoch": 1.0524280189084658,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004995071404694619,
|
||
|
|
"loss": 6.6248,
|
||
|
|
"mean_token_accuracy": 0.1286735638976097,
|
||
|
|
"num_tokens": 2223084.0,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.930538558959961,
|
||
|
|
"epoch": 1.0567253975075204,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004994849007391996,
|
||
|
|
"loss": 6.5507,
|
||
|
|
"mean_token_accuracy": 0.12893568202853203,
|
||
|
|
"num_tokens": 2231406.0,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.784887790679932,
|
||
|
|
"epoch": 1.061022776106575,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004994621708236401,
|
||
|
|
"loss": 6.4682,
|
||
|
|
"mean_token_accuracy": 0.136442781239748,
|
||
|
|
"num_tokens": 2239867.0,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8624866008758545,
|
||
|
|
"epoch": 1.0653201547056295,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.000499438950772416,
|
||
|
|
"loss": 6.5264,
|
||
|
|
"mean_token_accuracy": 0.1343722127377987,
|
||
|
|
"num_tokens": 2248844.0,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.764705419540405,
|
||
|
|
"epoch": 1.0696175333046842,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004994152406362311,
|
||
|
|
"loss": 6.4525,
|
||
|
|
"mean_token_accuracy": 0.14018251076340676,
|
||
|
|
"num_tokens": 2257599.0,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.871714019775391,
|
||
|
|
"epoch": 1.0739149119037388,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.0004993910404668586,
|
||
|
|
"loss": 6.4992,
|
||
|
|
"mean_token_accuracy": 0.1316287100315094,
|
||
|
|
"num_tokens": 2266510.0,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.801673936843872,
|
||
|
|
"epoch": 1.0782122905027933,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000499366350317142,
|
||
|
|
"loss": 6.4902,
|
||
|
|
"mean_token_accuracy": 0.1355181120336056,
|
||
|
|
"num_tokens": 2275462.0,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.805047512054443,
|
||
|
|
"epoch": 1.0825096691018479,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004993411702409948,
|
||
|
|
"loss": 6.4684,
|
||
|
|
"mean_token_accuracy": 0.13499311953783036,
|
||
|
|
"num_tokens": 2283826.0,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.796231460571289,
|
||
|
|
"epoch": 1.0868070477009024,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004993155002934002,
|
||
|
|
"loss": 6.4758,
|
||
|
|
"mean_token_accuracy": 0.13739539608359336,
|
||
|
|
"num_tokens": 2292967.0,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.935551691055298,
|
||
|
|
"epoch": 1.091104426299957,
|
||
|
|
"grad_norm": 1.5078125,
|
||
|
|
"learning_rate": 0.0004992893405304111,
|
||
|
|
"loss": 6.6091,
|
||
|
|
"mean_token_accuracy": 0.13493912890553475,
|
||
|
|
"num_tokens": 2302336.0,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.757972192764282,
|
||
|
|
"epoch": 1.0954018048990115,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00049926269100915,
|
||
|
|
"loss": 6.5039,
|
||
|
|
"mean_token_accuracy": 0.14085786640644074,
|
||
|
|
"num_tokens": 2311465.0,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.884800767898559,
|
||
|
|
"epoch": 1.0996991834980663,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004992355517878087,
|
||
|
|
"loss": 6.6134,
|
||
|
|
"mean_token_accuracy": 0.12797435671091079,
|
||
|
|
"num_tokens": 2320281.0,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.775428581237793,
|
||
|
|
"epoch": 1.1039965620971208,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004992079229256484,
|
||
|
|
"loss": 6.5189,
|
||
|
|
"mean_token_accuracy": 0.1329084627330303,
|
||
|
|
"num_tokens": 2329755.0,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.721524858474732,
|
||
|
|
"epoch": 1.1082939406961754,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004991798044829996,
|
||
|
|
"loss": 6.4524,
|
||
|
|
"mean_token_accuracy": 0.1344260886311531,
|
||
|
|
"num_tokens": 2338807.0,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.870701122283935,
|
||
|
|
"epoch": 1.11259131929523,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004991511965212618,
|
||
|
|
"loss": 6.5591,
|
||
|
|
"mean_token_accuracy": 0.13554905205965043,
|
||
|
|
"num_tokens": 2348056.0,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.759064626693726,
|
||
|
|
"epoch": 1.1168886978942845,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004991220991029032,
|
||
|
|
"loss": 6.5619,
|
||
|
|
"mean_token_accuracy": 0.13164993077516557,
|
||
|
|
"num_tokens": 2357780.0,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.845104169845581,
|
||
|
|
"epoch": 1.121186076493339,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.000499092512291461,
|
||
|
|
"loss": 6.526,
|
||
|
|
"mean_token_accuracy": 0.13971479684114457,
|
||
|
|
"num_tokens": 2367060.0,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.800533056259155,
|
||
|
|
"epoch": 1.1254834550923936,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000499062436151541,
|
||
|
|
"loss": 6.5277,
|
||
|
|
"mean_token_accuracy": 0.13263508304953575,
|
||
|
|
"num_tokens": 2375751.0,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.890619134902954,
|
||
|
|
"epoch": 1.129780833691448,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004990318707488173,
|
||
|
|
"loss": 6.5788,
|
||
|
|
"mean_token_accuracy": 0.12899956330657006,
|
||
|
|
"num_tokens": 2385013.0,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.769053792953491,
|
||
|
|
"epoch": 1.1340782122905029,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004990008161500327,
|
||
|
|
"loss": 6.48,
|
||
|
|
"mean_token_accuracy": 0.1359359547495842,
|
||
|
|
"num_tokens": 2392935.0,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7767839431762695,
|
||
|
|
"epoch": 1.1383755908895574,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.000498969272422998,
|
||
|
|
"loss": 6.4887,
|
||
|
|
"mean_token_accuracy": 0.13946662694215775,
|
||
|
|
"num_tokens": 2401560.0,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.732125520706177,
|
||
|
|
"epoch": 1.142672969488612,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004989372396365921,
|
||
|
|
"loss": 6.4183,
|
||
|
|
"mean_token_accuracy": 0.13894038647413254,
|
||
|
|
"num_tokens": 2410050.0,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8855541229248045,
|
||
|
|
"epoch": 1.1469703480876665,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004989047178607618,
|
||
|
|
"loss": 6.5218,
|
||
|
|
"mean_token_accuracy": 0.13579266518354416,
|
||
|
|
"num_tokens": 2418980.0,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7566611766815186,
|
||
|
|
"epoch": 1.151267726686721,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004988717071665215,
|
||
|
|
"loss": 6.5177,
|
||
|
|
"mean_token_accuracy": 0.13580050468444824,
|
||
|
|
"num_tokens": 2427992.0,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.821787118911743,
|
||
|
|
"epoch": 1.1555651052857756,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004988382076259537,
|
||
|
|
"loss": 6.4297,
|
||
|
|
"mean_token_accuracy": 0.1417124703526497,
|
||
|
|
"num_tokens": 2436368.0,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.65723991394043,
|
||
|
|
"epoch": 1.1598624838848304,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004988042193122077,
|
||
|
|
"loss": 6.4243,
|
||
|
|
"mean_token_accuracy": 0.1399266541004181,
|
||
|
|
"num_tokens": 2445499.0,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.846164894104004,
|
||
|
|
"epoch": 1.164159862483885,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004987697422995005,
|
||
|
|
"loss": 6.4564,
|
||
|
|
"mean_token_accuracy": 0.13335739225149154,
|
||
|
|
"num_tokens": 2454312.0,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.705566883087158,
|
||
|
|
"epoch": 1.1684572410829395,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004987347766631161,
|
||
|
|
"loss": 6.5179,
|
||
|
|
"mean_token_accuracy": 0.13981100916862488,
|
||
|
|
"num_tokens": 2462922.0,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8054440975189205,
|
||
|
|
"epoch": 1.172754619681994,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004986993224794055,
|
||
|
|
"loss": 6.5574,
|
||
|
|
"mean_token_accuracy": 0.12931617349386215,
|
||
|
|
"num_tokens": 2472195.0,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.731846857070923,
|
||
|
|
"epoch": 1.1770519982810486,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004986633798257865,
|
||
|
|
"loss": 6.456,
|
||
|
|
"mean_token_accuracy": 0.13557855412364006,
|
||
|
|
"num_tokens": 2481021.0,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.709754800796508,
|
||
|
|
"epoch": 1.181349376880103,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004986269487807434,
|
||
|
|
"loss": 6.4682,
|
||
|
|
"mean_token_accuracy": 0.13462188541889192,
|
||
|
|
"num_tokens": 2490250.0,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8344573974609375,
|
||
|
|
"epoch": 1.1856467554791577,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000498590029423827,
|
||
|
|
"loss": 6.529,
|
||
|
|
"mean_token_accuracy": 0.13892517015337943,
|
||
|
|
"num_tokens": 2499122.0,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.794313240051269,
|
||
|
|
"epoch": 1.1899441340782122,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004985526218356546,
|
||
|
|
"loss": 6.5102,
|
||
|
|
"mean_token_accuracy": 0.13186247944831847,
|
||
|
|
"num_tokens": 2508454.0,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.717947912216187,
|
||
|
|
"epoch": 1.1942415126772667,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004985147260979093,
|
||
|
|
"loss": 6.449,
|
||
|
|
"mean_token_accuracy": 0.1434843860566616,
|
||
|
|
"num_tokens": 2517353.0,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.771858787536621,
|
||
|
|
"epoch": 1.1985388912763215,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004984763422933402,
|
||
|
|
"loss": 6.4618,
|
||
|
|
"mean_token_accuracy": 0.13847233429551126,
|
||
|
|
"num_tokens": 2526321.0,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.732237863540649,
|
||
|
|
"epoch": 1.202836269875376,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004984374705057623,
|
||
|
|
"loss": 6.5033,
|
||
|
|
"mean_token_accuracy": 0.13528537154197692,
|
||
|
|
"num_tokens": 2535924.0,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.721146202087402,
|
||
|
|
"epoch": 1.2071336484744306,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004983981108200561,
|
||
|
|
"loss": 6.4711,
|
||
|
|
"mean_token_accuracy": 0.13535311296582223,
|
||
|
|
"num_tokens": 2545606.0,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.733812093734741,
|
||
|
|
"epoch": 1.2114310270734852,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004983582633221672,
|
||
|
|
"loss": 6.4601,
|
||
|
|
"mean_token_accuracy": 0.1369933992624283,
|
||
|
|
"num_tokens": 2554947.0,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.855603933334351,
|
||
|
|
"epoch": 1.2157284056725397,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004983179280991068,
|
||
|
|
"loss": 6.6134,
|
||
|
|
"mean_token_accuracy": 0.12978528887033464,
|
||
|
|
"num_tokens": 2564462.0,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.726688861846924,
|
||
|
|
"epoch": 1.2200257842715942,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004982771052389508,
|
||
|
|
"loss": 6.4475,
|
||
|
|
"mean_token_accuracy": 0.1368112660944462,
|
||
|
|
"num_tokens": 2573124.0,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.807424783706665,
|
||
|
|
"epoch": 1.224323162870649,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004982357948308401,
|
||
|
|
"loss": 6.5481,
|
||
|
|
"mean_token_accuracy": 0.13265790268778802,
|
||
|
|
"num_tokens": 2581829.0,
|
||
|
|
"step": 1425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.770775365829468,
|
||
|
|
"epoch": 1.2286205414697036,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004981939969649799,
|
||
|
|
"loss": 6.4049,
|
||
|
|
"mean_token_accuracy": 0.14194427505135537,
|
||
|
|
"num_tokens": 2590631.0,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.709357166290284,
|
||
|
|
"epoch": 1.232917920068758,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004981517117326404,
|
||
|
|
"loss": 6.5216,
|
||
|
|
"mean_token_accuracy": 0.13609697446227073,
|
||
|
|
"num_tokens": 2600684.0,
|
||
|
|
"step": 1435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.725667095184326,
|
||
|
|
"epoch": 1.2372152986678127,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004981089392261553,
|
||
|
|
"loss": 6.4349,
|
||
|
|
"mean_token_accuracy": 0.14131608307361604,
|
||
|
|
"num_tokens": 2609667.0,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.692513275146484,
|
||
|
|
"epoch": 1.2415126772668672,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000498065679538923,
|
||
|
|
"loss": 6.5055,
|
||
|
|
"mean_token_accuracy": 0.14114993885159494,
|
||
|
|
"num_tokens": 2620025.0,
|
||
|
|
"step": 1445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7513340473175045,
|
||
|
|
"epoch": 1.2458100558659218,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004980219327654049,
|
||
|
|
"loss": 6.428,
|
||
|
|
"mean_token_accuracy": 0.13774933964014052,
|
||
|
|
"num_tokens": 2629032.0,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.702835464477539,
|
||
|
|
"epoch": 1.2501074344649763,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000497977699001127,
|
||
|
|
"loss": 6.402,
|
||
|
|
"mean_token_accuracy": 0.142982679605484,
|
||
|
|
"num_tokens": 2638303.0,
|
||
|
|
"step": 1455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.761410474777222,
|
||
|
|
"epoch": 1.2544048130640308,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004979329783426778,
|
||
|
|
"loss": 6.4318,
|
||
|
|
"mean_token_accuracy": 0.14380076453089713,
|
||
|
|
"num_tokens": 2647902.0,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.731089019775391,
|
||
|
|
"epoch": 1.2587021916630854,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004978877708877094,
|
||
|
|
"loss": 6.4848,
|
||
|
|
"mean_token_accuracy": 0.13676076754927635,
|
||
|
|
"num_tokens": 2657902.0,
|
||
|
|
"step": 1465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.71400637626648,
|
||
|
|
"epoch": 1.2629995702621402,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004978420767349368,
|
||
|
|
"loss": 6.4196,
|
||
|
|
"mean_token_accuracy": 0.13780386745929718,
|
||
|
|
"num_tokens": 2667082.0,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.737793684005737,
|
||
|
|
"epoch": 1.2672969488611947,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004977958959841379,
|
||
|
|
"loss": 6.4943,
|
||
|
|
"mean_token_accuracy": 0.1352358005940914,
|
||
|
|
"num_tokens": 2676855.0,
|
||
|
|
"step": 1475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.734015226364136,
|
||
|
|
"epoch": 1.2715943274602493,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000497749228736153,
|
||
|
|
"loss": 6.4201,
|
||
|
|
"mean_token_accuracy": 0.14142746701836587,
|
||
|
|
"num_tokens": 2685750.0,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.656690311431885,
|
||
|
|
"epoch": 1.2758917060593038,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004977020750928845,
|
||
|
|
"loss": 6.4771,
|
||
|
|
"mean_token_accuracy": 0.14191860556602479,
|
||
|
|
"num_tokens": 2695272.0,
|
||
|
|
"step": 1485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.794925928115845,
|
||
|
|
"epoch": 1.2801890846583583,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004976544351572973,
|
||
|
|
"loss": 6.4253,
|
||
|
|
"mean_token_accuracy": 0.14196638017892838,
|
||
|
|
"num_tokens": 2704806.0,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.56059627532959,
|
||
|
|
"epoch": 1.2844864632574131,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004976063090334179,
|
||
|
|
"loss": 6.4836,
|
||
|
|
"mean_token_accuracy": 0.14093814194202423,
|
||
|
|
"num_tokens": 2713521.0,
|
||
|
|
"step": 1495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7648594856262205,
|
||
|
|
"epoch": 1.2887838418564677,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004975576968263346,
|
||
|
|
"loss": 6.472,
|
||
|
|
"mean_token_accuracy": 0.13531532436609267,
|
||
|
|
"num_tokens": 2721848.0,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2887838418564677,
|
||
|
|
"eval_entropy": 6.583824046023257,
|
||
|
|
"eval_loss": 6.552463054656982,
|
||
|
|
"eval_mean_token_accuracy": 0.13841687775477096,
|
||
|
|
"eval_num_tokens": 2721848.0,
|
||
|
|
"eval_runtime": 2.0451,
|
||
|
|
"eval_samples_per_second": 1735.359,
|
||
|
|
"eval_steps_per_second": 217.103,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6689835548400875,
|
||
|
|
"epoch": 1.2930812204555222,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000497508598642197,
|
||
|
|
"loss": 6.4406,
|
||
|
|
"mean_token_accuracy": 0.13946301937103273,
|
||
|
|
"num_tokens": 2731473.0,
|
||
|
|
"step": 1505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.724963998794555,
|
||
|
|
"epoch": 1.2973785990545768,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000497459014588216,
|
||
|
|
"loss": 6.5064,
|
||
|
|
"mean_token_accuracy": 0.13410719558596612,
|
||
|
|
"num_tokens": 2739867.0,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.701112556457519,
|
||
|
|
"epoch": 1.3016759776536313,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000497408944772663,
|
||
|
|
"loss": 6.4165,
|
||
|
|
"mean_token_accuracy": 0.14087883234024048,
|
||
|
|
"num_tokens": 2748903.0,
|
||
|
|
"step": 1515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.621306848526001,
|
||
|
|
"epoch": 1.3059733562526858,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004973583893048707,
|
||
|
|
"loss": 6.4144,
|
||
|
|
"mean_token_accuracy": 0.13790024891495706,
|
||
|
|
"num_tokens": 2757711.0,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8078021049499515,
|
||
|
|
"epoch": 1.3102707348517404,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004973073482952321,
|
||
|
|
"loss": 6.4178,
|
||
|
|
"mean_token_accuracy": 0.14102478623390197,
|
||
|
|
"num_tokens": 2765633.0,
|
||
|
|
"step": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.606275224685669,
|
||
|
|
"epoch": 1.314568113450795,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.0004972558218552004,
|
||
|
|
"loss": 6.454,
|
||
|
|
"mean_token_accuracy": 0.1388860262930393,
|
||
|
|
"num_tokens": 2774495.0,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.737347936630249,
|
||
|
|
"epoch": 1.3188654920498495,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004972038100972885,
|
||
|
|
"loss": 6.4827,
|
||
|
|
"mean_token_accuracy": 0.13370617032051085,
|
||
|
|
"num_tokens": 2782665.0,
|
||
|
|
"step": 1535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.652740144729615,
|
||
|
|
"epoch": 1.323162870648904,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.0004971513131350697,
|
||
|
|
"loss": 6.4163,
|
||
|
|
"mean_token_accuracy": 0.13846877068281174,
|
||
|
|
"num_tokens": 2791394.0,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.583173847198486,
|
||
|
|
"epoch": 1.3274602492479588,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004970983310831759,
|
||
|
|
"loss": 6.4113,
|
||
|
|
"mean_token_accuracy": 0.13881225883960724,
|
||
|
|
"num_tokens": 2800488.0,
|
||
|
|
"step": 1545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.734278392791748,
|
||
|
|
"epoch": 1.3317576278470133,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004970448640572989,
|
||
|
|
"loss": 6.5243,
|
||
|
|
"mean_token_accuracy": 0.1339696764945984,
|
||
|
|
"num_tokens": 2810116.0,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.658429765701294,
|
||
|
|
"epoch": 1.336055006446068,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004969909121741895,
|
||
|
|
"loss": 6.3255,
|
||
|
|
"mean_token_accuracy": 0.14455484077334405,
|
||
|
|
"num_tokens": 2819205.0,
|
||
|
|
"step": 1555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.591242885589599,
|
||
|
|
"epoch": 1.3403523850451224,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004969364755516569,
|
||
|
|
"loss": 6.4035,
|
||
|
|
"mean_token_accuracy": 0.13771276026964188,
|
||
|
|
"num_tokens": 2828017.0,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.73987512588501,
|
||
|
|
"epoch": 1.344649763644177,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004968815543085689,
|
||
|
|
"loss": 6.438,
|
||
|
|
"mean_token_accuracy": 0.14133503511548043,
|
||
|
|
"num_tokens": 2837125.0,
|
||
|
|
"step": 1565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.648034620285034,
|
||
|
|
"epoch": 1.3489471422432318,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004968261485648516,
|
||
|
|
"loss": 6.4665,
|
||
|
|
"mean_token_accuracy": 0.13752973526716233,
|
||
|
|
"num_tokens": 2845438.0,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.690678644180298,
|
||
|
|
"epoch": 1.3532445208422863,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000496770258441489,
|
||
|
|
"loss": 6.4311,
|
||
|
|
"mean_token_accuracy": 0.14550055414438248,
|
||
|
|
"num_tokens": 2854389.0,
|
||
|
|
"step": 1575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.591717529296875,
|
||
|
|
"epoch": 1.3575418994413408,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004967138840605228,
|
||
|
|
"loss": 6.3947,
|
||
|
|
"mean_token_accuracy": 0.1433369368314743,
|
||
|
|
"num_tokens": 2863654.0,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.645109987258911,
|
||
|
|
"epoch": 1.3618392780403954,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000496657025545052,
|
||
|
|
"loss": 6.3068,
|
||
|
|
"mean_token_accuracy": 0.14519514814019202,
|
||
|
|
"num_tokens": 2872871.0,
|
||
|
|
"step": 1585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5770776748657225,
|
||
|
|
"epoch": 1.36613665663945,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000496599683019233,
|
||
|
|
"loss": 6.4037,
|
||
|
|
"mean_token_accuracy": 0.14221980646252633,
|
||
|
|
"num_tokens": 2881140.0,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7226653575897215,
|
||
|
|
"epoch": 1.3704340352385045,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000496541856608279,
|
||
|
|
"loss": 6.3852,
|
||
|
|
"mean_token_accuracy": 0.14397331327199936,
|
||
|
|
"num_tokens": 2889945.0,
|
||
|
|
"step": 1595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5361980438232425,
|
||
|
|
"epoch": 1.374731413837559,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004964835464384595,
|
||
|
|
"loss": 6.3238,
|
||
|
|
"mean_token_accuracy": 0.145409494638443,
|
||
|
|
"num_tokens": 2898897.0,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.686757373809814,
|
||
|
|
"epoch": 1.3790287924366136,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000496424752637101,
|
||
|
|
"loss": 6.3401,
|
||
|
|
"mean_token_accuracy": 0.14611406177282332,
|
||
|
|
"num_tokens": 2907717.0,
|
||
|
|
"step": 1605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.578691530227661,
|
||
|
|
"epoch": 1.3833261710356681,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004963654753325853,
|
||
|
|
"loss": 6.3297,
|
||
|
|
"mean_token_accuracy": 0.14271921664476395,
|
||
|
|
"num_tokens": 2916213.0,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.683462333679199,
|
||
|
|
"epoch": 1.387623549634723,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004963057146543505,
|
||
|
|
"loss": 6.4949,
|
||
|
|
"mean_token_accuracy": 0.1387751467525959,
|
||
|
|
"num_tokens": 2925706.0,
|
||
|
|
"step": 1615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.599123191833496,
|
||
|
|
"epoch": 1.3919209282337774,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00049624547073289,
|
||
|
|
"loss": 6.4208,
|
||
|
|
"mean_token_accuracy": 0.1372368849813938,
|
||
|
|
"num_tokens": 2934464.0,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.672312545776367,
|
||
|
|
"epoch": 1.396218306832832,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004961847436997526,
|
||
|
|
"loss": 6.3195,
|
||
|
|
"mean_token_accuracy": 0.14415977373719216,
|
||
|
|
"num_tokens": 2944095.0,
|
||
|
|
"step": 1625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.480645990371704,
|
||
|
|
"epoch": 1.4005156854318865,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004961235336875416,
|
||
|
|
"loss": 6.3231,
|
||
|
|
"mean_token_accuracy": 0.14915895387530326,
|
||
|
|
"num_tokens": 2953357.0,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.639774322509766,
|
||
|
|
"epoch": 1.404813064030941,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004960618408299154,
|
||
|
|
"loss": 6.4687,
|
||
|
|
"mean_token_accuracy": 0.13529081642627716,
|
||
|
|
"num_tokens": 2963020.0,
|
||
|
|
"step": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.682909727096558,
|
||
|
|
"epoch": 1.4091104426299956,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004959996652615865,
|
||
|
|
"loss": 6.319,
|
||
|
|
"mean_token_accuracy": 0.14330243095755577,
|
||
|
|
"num_tokens": 2971955.0,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6523435592651365,
|
||
|
|
"epoch": 1.4134078212290504,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004959370071183216,
|
||
|
|
"loss": 6.3766,
|
||
|
|
"mean_token_accuracy": 0.14444040805101394,
|
||
|
|
"num_tokens": 2980662.0,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.675427007675171,
|
||
|
|
"epoch": 1.417705199828105,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004958738665369407,
|
||
|
|
"loss": 6.5051,
|
||
|
|
"mean_token_accuracy": 0.12928852811455727,
|
||
|
|
"num_tokens": 2990038.0,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.632522964477539,
|
||
|
|
"epoch": 1.4220025784271595,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004958102436553179,
|
||
|
|
"loss": 6.4172,
|
||
|
|
"mean_token_accuracy": 0.1390580452978611,
|
||
|
|
"num_tokens": 2999835.0,
|
||
|
|
"step": 1655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.694387483596802,
|
||
|
|
"epoch": 1.426299957026214,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.00049574613861238,
|
||
|
|
"loss": 6.4118,
|
||
|
|
"mean_token_accuracy": 0.13762674629688262,
|
||
|
|
"num_tokens": 3009593.0,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.648862266540528,
|
||
|
|
"epoch": 1.4305973356252686,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004956815515481069,
|
||
|
|
"loss": 6.4348,
|
||
|
|
"mean_token_accuracy": 0.144145817309618,
|
||
|
|
"num_tokens": 3019187.0,
|
||
|
|
"step": 1665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.582254266738891,
|
||
|
|
"epoch": 1.4348947142243231,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004956164826035309,
|
||
|
|
"loss": 6.3495,
|
||
|
|
"mean_token_accuracy": 0.14171260893344878,
|
||
|
|
"num_tokens": 3027875.0,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.569947624206543,
|
||
|
|
"epoch": 1.4391920928233777,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004955509319207363,
|
||
|
|
"loss": 6.3833,
|
||
|
|
"mean_token_accuracy": 0.13855091333389283,
|
||
|
|
"num_tokens": 3036902.0,
|
||
|
|
"step": 1675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.548913908004761,
|
||
|
|
"epoch": 1.4434894714224322,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004954848996428601,
|
||
|
|
"loss": 6.36,
|
||
|
|
"mean_token_accuracy": 0.14765606224536895,
|
||
|
|
"num_tokens": 3046653.0,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6836981773376465,
|
||
|
|
"epoch": 1.4477868500214868,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.00049541838591409,
|
||
|
|
"loss": 6.448,
|
||
|
|
"mean_token_accuracy": 0.13707543835043906,
|
||
|
|
"num_tokens": 3056273.0,
|
||
|
|
"step": 1685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.570832586288452,
|
||
|
|
"epoch": 1.4520842286205415,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004953513908796657,
|
||
|
|
"loss": 6.3562,
|
||
|
|
"mean_token_accuracy": 0.13904846012592315,
|
||
|
|
"num_tokens": 3065662.0,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.719029092788697,
|
||
|
|
"epoch": 1.456381607219596,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004952839146858773,
|
||
|
|
"loss": 6.3883,
|
||
|
|
"mean_token_accuracy": 0.14505013972520828,
|
||
|
|
"num_tokens": 3073970.0,
|
||
|
|
"step": 1695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.546349334716797,
|
||
|
|
"epoch": 1.4606789858186506,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004952159574800658,
|
||
|
|
"loss": 6.3978,
|
||
|
|
"mean_token_accuracy": 0.13897576928138733,
|
||
|
|
"num_tokens": 3082500.0,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.645324468612671,
|
||
|
|
"epoch": 1.4649763644177052,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004951475194106229,
|
||
|
|
"loss": 6.342,
|
||
|
|
"mean_token_accuracy": 0.14458465725183486,
|
||
|
|
"num_tokens": 3091574.0,
|
||
|
|
"step": 1705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.590623474121093,
|
||
|
|
"epoch": 1.4692737430167597,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004950786006269898,
|
||
|
|
"loss": 6.4477,
|
||
|
|
"mean_token_accuracy": 0.1356819100677967,
|
||
|
|
"num_tokens": 3102402.0,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.654024839401245,
|
||
|
|
"epoch": 1.4735711216158143,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004950092012796576,
|
||
|
|
"loss": 6.2738,
|
||
|
|
"mean_token_accuracy": 0.14728236198425293,
|
||
|
|
"num_tokens": 3111347.0,
|
||
|
|
"step": 1715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.553081369400024,
|
||
|
|
"epoch": 1.477868500214869,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004949393215201666,
|
||
|
|
"loss": 6.3455,
|
||
|
|
"mean_token_accuracy": 0.14207591861486435,
|
||
|
|
"num_tokens": 3120018.0,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.595822668075561,
|
||
|
|
"epoch": 1.4821658788139236,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004948689615011065,
|
||
|
|
"loss": 6.4086,
|
||
|
|
"mean_token_accuracy": 0.13704866543412209,
|
||
|
|
"num_tokens": 3129669.0,
|
||
|
|
"step": 1725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.628203105926514,
|
||
|
|
"epoch": 1.4864632574129781,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004947981213761154,
|
||
|
|
"loss": 6.3443,
|
||
|
|
"mean_token_accuracy": 0.14518199041485785,
|
||
|
|
"num_tokens": 3139112.0,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5786394596099855,
|
||
|
|
"epoch": 1.4907606360120327,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004947268012998797,
|
||
|
|
"loss": 6.3058,
|
||
|
|
"mean_token_accuracy": 0.15637002438306807,
|
||
|
|
"num_tokens": 3148437.0,
|
||
|
|
"step": 1735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.570107936859131,
|
||
|
|
"epoch": 1.4950580146110872,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.000494655001428134,
|
||
|
|
"loss": 6.2891,
|
||
|
|
"mean_token_accuracy": 0.14667836502194403,
|
||
|
|
"num_tokens": 3158165.0,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.586823749542236,
|
||
|
|
"epoch": 1.4993553932101418,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004945827219176604,
|
||
|
|
"loss": 6.3587,
|
||
|
|
"mean_token_accuracy": 0.1493491068482399,
|
||
|
|
"num_tokens": 3167262.0,
|
||
|
|
"step": 1745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.514509057998657,
|
||
|
|
"epoch": 1.5036527718091963,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004945099629262888,
|
||
|
|
"loss": 6.3479,
|
||
|
|
"mean_token_accuracy": 0.1436598651111126,
|
||
|
|
"num_tokens": 3176696.0,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.673803234100342,
|
||
|
|
"epoch": 1.5079501504082509,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004944367246128954,
|
||
|
|
"loss": 6.4304,
|
||
|
|
"mean_token_accuracy": 0.13725945726037025,
|
||
|
|
"num_tokens": 3185857.0,
|
||
|
|
"step": 1755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5661591529846195,
|
||
|
|
"epoch": 1.5122475290073054,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004943630071374036,
|
||
|
|
"loss": 6.2677,
|
||
|
|
"mean_token_accuracy": 0.14966750741004944,
|
||
|
|
"num_tokens": 3194687.0,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.554711723327637,
|
||
|
|
"epoch": 1.51654490760636,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004942888106607828,
|
||
|
|
"loss": 6.3291,
|
||
|
|
"mean_token_accuracy": 0.14281144142150878,
|
||
|
|
"num_tokens": 3204913.0,
|
||
|
|
"step": 1765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.641019535064697,
|
||
|
|
"epoch": 1.5208422862054147,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004942141353450486,
|
||
|
|
"loss": 6.3145,
|
||
|
|
"mean_token_accuracy": 0.1485350415110588,
|
||
|
|
"num_tokens": 3213312.0,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.493930768966675,
|
||
|
|
"epoch": 1.5251396648044693,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004941389813532619,
|
||
|
|
"loss": 6.2368,
|
||
|
|
"mean_token_accuracy": 0.15905009657144548,
|
||
|
|
"num_tokens": 3222992.0,
|
||
|
|
"step": 1775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.511264657974243,
|
||
|
|
"epoch": 1.5294370434035238,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.000494063348849529,
|
||
|
|
"loss": 6.2816,
|
||
|
|
"mean_token_accuracy": 0.14892083406448364,
|
||
|
|
"num_tokens": 3232836.0,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.616392660140991,
|
||
|
|
"epoch": 1.5337344220025786,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004939872379990011,
|
||
|
|
"loss": 6.4346,
|
||
|
|
"mean_token_accuracy": 0.1384902000427246,
|
||
|
|
"num_tokens": 3243171.0,
|
||
|
|
"step": 1785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.671454858779907,
|
||
|
|
"epoch": 1.5380318006016331,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004939106489678739,
|
||
|
|
"loss": 6.3565,
|
||
|
|
"mean_token_accuracy": 0.14886578172445297,
|
||
|
|
"num_tokens": 3251995.0,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.483775520324707,
|
||
|
|
"epoch": 1.5423291792006877,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000493833581923387,
|
||
|
|
"loss": 6.2999,
|
||
|
|
"mean_token_accuracy": 0.147441129386425,
|
||
|
|
"num_tokens": 3260841.0,
|
||
|
|
"step": 1795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.614831399917603,
|
||
|
|
"epoch": 1.5466265577997422,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004937560370338244,
|
||
|
|
"loss": 6.4359,
|
||
|
|
"mean_token_accuracy": 0.1328293912112713,
|
||
|
|
"num_tokens": 3270979.0,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.602978515625,
|
||
|
|
"epoch": 1.5509239363987968,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000493678014468513,
|
||
|
|
"loss": 6.3703,
|
||
|
|
"mean_token_accuracy": 0.14689823091030121,
|
||
|
|
"num_tokens": 3279848.0,
|
||
|
|
"step": 1805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.534598064422608,
|
||
|
|
"epoch": 1.5552213149978513,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004935995143978227,
|
||
|
|
"loss": 6.3674,
|
||
|
|
"mean_token_accuracy": 0.14537320658564568,
|
||
|
|
"num_tokens": 3289172.0,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.508708524703979,
|
||
|
|
"epoch": 1.5595186935969059,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004935205369931664,
|
||
|
|
"loss": 6.2677,
|
||
|
|
"mean_token_accuracy": 0.1513919234275818,
|
||
|
|
"num_tokens": 3297432.0,
|
||
|
|
"step": 1815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.684668636322021,
|
||
|
|
"epoch": 1.5638160721959604,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004934410824269992,
|
||
|
|
"loss": 6.2954,
|
||
|
|
"mean_token_accuracy": 0.1454857923090458,
|
||
|
|
"num_tokens": 3307486.0,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.466551637649536,
|
||
|
|
"epoch": 1.568113450795015,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004933611508728182,
|
||
|
|
"loss": 6.2671,
|
||
|
|
"mean_token_accuracy": 0.14967258870601655,
|
||
|
|
"num_tokens": 3316296.0,
|
||
|
|
"step": 1825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.563362693786621,
|
||
|
|
"epoch": 1.5724108293940695,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000493280742505162,
|
||
|
|
"loss": 6.2972,
|
||
|
|
"mean_token_accuracy": 0.14479405283927918,
|
||
|
|
"num_tokens": 3326080.0,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.456173896789551,
|
||
|
|
"epoch": 1.576708207993124,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004931998574996102,
|
||
|
|
"loss": 6.217,
|
||
|
|
"mean_token_accuracy": 0.15072606950998307,
|
||
|
|
"num_tokens": 3334826.0,
|
||
|
|
"step": 1835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.472858524322509,
|
||
|
|
"epoch": 1.5810055865921788,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004931184960327832,
|
||
|
|
"loss": 6.2177,
|
||
|
|
"mean_token_accuracy": 0.1524192661046982,
|
||
|
|
"num_tokens": 3343261.0,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.493236398696899,
|
||
|
|
"epoch": 1.5853029651912334,
|
||
|
|
"grad_norm": 1.640625,
|
||
|
|
"learning_rate": 0.0004930366582823421,
|
||
|
|
"loss": 6.2619,
|
||
|
|
"mean_token_accuracy": 0.14549409449100495,
|
||
|
|
"num_tokens": 3352513.0,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.541861534118652,
|
||
|
|
"epoch": 1.589600343790288,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004929543444269879,
|
||
|
|
"loss": 6.3147,
|
||
|
|
"mean_token_accuracy": 0.15202615782618523,
|
||
|
|
"num_tokens": 3361577.0,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.516072130203247,
|
||
|
|
"epoch": 1.5938977223893425,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.000492871554646461,
|
||
|
|
"loss": 6.3805,
|
||
|
|
"mean_token_accuracy": 0.1442191444337368,
|
||
|
|
"num_tokens": 3370591.0,
|
||
|
|
"step": 1855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.489377784729004,
|
||
|
|
"epoch": 1.5981951009883972,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004927882891215413,
|
||
|
|
"loss": 6.2995,
|
||
|
|
"mean_token_accuracy": 0.1446702793240547,
|
||
|
|
"num_tokens": 3379761.0,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6347997188568115,
|
||
|
|
"epoch": 1.6024924795874518,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0004927045480340475,
|
||
|
|
"loss": 6.3729,
|
||
|
|
"mean_token_accuracy": 0.13809221014380454,
|
||
|
|
"num_tokens": 3388974.0,
|
||
|
|
"step": 1865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.515362644195557,
|
||
|
|
"epoch": 1.6067898581865063,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004926203315668363,
|
||
|
|
"loss": 6.2995,
|
||
|
|
"mean_token_accuracy": 0.14509507045149803,
|
||
|
|
"num_tokens": 3398339.0,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.501726579666138,
|
||
|
|
"epoch": 1.6110872367855609,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004925356399038032,
|
||
|
|
"loss": 6.2645,
|
||
|
|
"mean_token_accuracy": 0.14561111479997635,
|
||
|
|
"num_tokens": 3408292.0,
|
||
|
|
"step": 1875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.528331470489502,
|
||
|
|
"epoch": 1.6153846153846154,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004924504732298808,
|
||
|
|
"loss": 6.2363,
|
||
|
|
"mean_token_accuracy": 0.15578987523913385,
|
||
|
|
"num_tokens": 3417057.0,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.547144651412964,
|
||
|
|
"epoch": 1.61968199398367,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004923648317310391,
|
||
|
|
"loss": 6.3436,
|
||
|
|
"mean_token_accuracy": 0.1472199097275734,
|
||
|
|
"num_tokens": 3425830.0,
|
||
|
|
"step": 1885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.503617954254151,
|
||
|
|
"epoch": 1.6239793725827245,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004922787155942849,
|
||
|
|
"loss": 6.3929,
|
||
|
|
"mean_token_accuracy": 0.13893435150384903,
|
||
|
|
"num_tokens": 3435513.0,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.572265768051148,
|
||
|
|
"epoch": 1.628276751181779,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004921921250076611,
|
||
|
|
"loss": 6.2966,
|
||
|
|
"mean_token_accuracy": 0.14931443706154823,
|
||
|
|
"num_tokens": 3444684.0,
|
||
|
|
"step": 1895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4495138168334964,
|
||
|
|
"epoch": 1.6325741297808336,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004921050601602475,
|
||
|
|
"loss": 6.3435,
|
||
|
|
"mean_token_accuracy": 0.14741323441267012,
|
||
|
|
"num_tokens": 3453454.0,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.556122159957885,
|
||
|
|
"epoch": 1.6368715083798882,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004920175212421587,
|
||
|
|
"loss": 6.2787,
|
||
|
|
"mean_token_accuracy": 0.14662181138992308,
|
||
|
|
"num_tokens": 3463228.0,
|
||
|
|
"step": 1905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.366853141784668,
|
||
|
|
"epoch": 1.6411688869789427,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004919295084445445,
|
||
|
|
"loss": 6.166,
|
||
|
|
"mean_token_accuracy": 0.15177097618579866,
|
||
|
|
"num_tokens": 3472131.0,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.485814142227173,
|
||
|
|
"epoch": 1.6454662655779975,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004918410219595899,
|
||
|
|
"loss": 6.2547,
|
||
|
|
"mean_token_accuracy": 0.1523374244570732,
|
||
|
|
"num_tokens": 3480642.0,
|
||
|
|
"step": 1915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.621995449066162,
|
||
|
|
"epoch": 1.649763644177052,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.000491752061980514,
|
||
|
|
"loss": 6.2277,
|
||
|
|
"mean_token_accuracy": 0.15280286371707916,
|
||
|
|
"num_tokens": 3489346.0,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4284903049469,
|
||
|
|
"epoch": 1.6540610227761066,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004916626287015697,
|
||
|
|
"loss": 6.2756,
|
||
|
|
"mean_token_accuracy": 0.15068823397159575,
|
||
|
|
"num_tokens": 3498473.0,
|
||
|
|
"step": 1925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.515523910522461,
|
||
|
|
"epoch": 1.658358401375161,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004915727223180436,
|
||
|
|
"loss": 6.2738,
|
||
|
|
"mean_token_accuracy": 0.142893535643816,
|
||
|
|
"num_tokens": 3507415.0,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.528269815444946,
|
||
|
|
"epoch": 1.6626557799742159,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004914823430262554,
|
||
|
|
"loss": 6.3984,
|
||
|
|
"mean_token_accuracy": 0.1329946205019951,
|
||
|
|
"num_tokens": 3516873.0,
|
||
|
|
"step": 1935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.484966564178467,
|
||
|
|
"epoch": 1.6669531585732704,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004913914910235573,
|
||
|
|
"loss": 6.2479,
|
||
|
|
"mean_token_accuracy": 0.14868821799755097,
|
||
|
|
"num_tokens": 3525047.0,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.448112821578979,
|
||
|
|
"epoch": 1.671250537172325,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004913001665083337,
|
||
|
|
"loss": 6.2685,
|
||
|
|
"mean_token_accuracy": 0.14392302706837654,
|
||
|
|
"num_tokens": 3534354.0,
|
||
|
|
"step": 1945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.528091144561768,
|
||
|
|
"epoch": 1.6755479157713795,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0004912083696800008,
|
||
|
|
"loss": 6.2926,
|
||
|
|
"mean_token_accuracy": 0.14562170803546906,
|
||
|
|
"num_tokens": 3543830.0,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4218017578125,
|
||
|
|
"epoch": 1.679845294370434,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004911161007390063,
|
||
|
|
"loss": 6.1933,
|
||
|
|
"mean_token_accuracy": 0.14804754853248597,
|
||
|
|
"num_tokens": 3552314.0,
|
||
|
|
"step": 1955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.470229148864746,
|
||
|
|
"epoch": 1.6841426729694886,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004910233598868287,
|
||
|
|
"loss": 6.2765,
|
||
|
|
"mean_token_accuracy": 0.14477257579565048,
|
||
|
|
"num_tokens": 3561656.0,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.467269372940064,
|
||
|
|
"epoch": 1.6884400515685432,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004909301473259769,
|
||
|
|
"loss": 6.2641,
|
||
|
|
"mean_token_accuracy": 0.14551830440759658,
|
||
|
|
"num_tokens": 3571784.0,
|
||
|
|
"step": 1965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.518259859085083,
|
||
|
|
"epoch": 1.6927374301675977,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004908364632599899,
|
||
|
|
"loss": 6.228,
|
||
|
|
"mean_token_accuracy": 0.15220747292041778,
|
||
|
|
"num_tokens": 3580626.0,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.378790664672851,
|
||
|
|
"epoch": 1.6970348087666522,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004907423078934362,
|
||
|
|
"loss": 6.2467,
|
||
|
|
"mean_token_accuracy": 0.14601020216941835,
|
||
|
|
"num_tokens": 3589916.0,
|
||
|
|
"step": 1975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.473833656311035,
|
||
|
|
"epoch": 1.7013321873657068,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004906476814319134,
|
||
|
|
"loss": 6.2572,
|
||
|
|
"mean_token_accuracy": 0.14930620267987252,
|
||
|
|
"num_tokens": 3599128.0,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.429199600219727,
|
||
|
|
"epoch": 1.7056295659647613,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004905525840820481,
|
||
|
|
"loss": 6.2686,
|
||
|
|
"mean_token_accuracy": 0.1471567466855049,
|
||
|
|
"num_tokens": 3608764.0,
|
||
|
|
"step": 1985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.58309121131897,
|
||
|
|
"epoch": 1.709926944563816,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004904570160514948,
|
||
|
|
"loss": 6.3077,
|
||
|
|
"mean_token_accuracy": 0.14043890461325645,
|
||
|
|
"num_tokens": 3619082.0,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.45733323097229,
|
||
|
|
"epoch": 1.7142243231628707,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004903609775489358,
|
||
|
|
"loss": 6.2682,
|
||
|
|
"mean_token_accuracy": 0.14586469754576684,
|
||
|
|
"num_tokens": 3628695.0,
|
||
|
|
"step": 1995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.511290454864502,
|
||
|
|
"epoch": 1.7185217017619252,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004902644687840809,
|
||
|
|
"loss": 6.267,
|
||
|
|
"mean_token_accuracy": 0.14717549681663514,
|
||
|
|
"num_tokens": 3637599.0,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7185217017619252,
|
||
|
|
"eval_entropy": 6.214308420817058,
|
||
|
|
"eval_loss": 6.331518173217773,
|
||
|
|
"eval_mean_token_accuracy": 0.14971260959702032,
|
||
|
|
"eval_num_tokens": 3637599.0,
|
||
|
|
"eval_runtime": 2.0415,
|
||
|
|
"eval_samples_per_second": 1738.466,
|
||
|
|
"eval_steps_per_second": 217.492,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.427486324310303,
|
||
|
|
"epoch": 1.7228190803609797,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004901674899676667,
|
||
|
|
"loss": 6.2449,
|
||
|
|
"mean_token_accuracy": 0.14803531616926194,
|
||
|
|
"num_tokens": 3647406.0,
|
||
|
|
"step": 2005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.416431045532226,
|
||
|
|
"epoch": 1.7271164589600345,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004900700413114561,
|
||
|
|
"loss": 6.1252,
|
||
|
|
"mean_token_accuracy": 0.15068818926811217,
|
||
|
|
"num_tokens": 3656531.0,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.388833618164062,
|
||
|
|
"epoch": 1.731413837559089,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000489972123028238,
|
||
|
|
"loss": 6.2244,
|
||
|
|
"mean_token_accuracy": 0.1465991474688053,
|
||
|
|
"num_tokens": 3664922.0,
|
||
|
|
"step": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.502804613113403,
|
||
|
|
"epoch": 1.7357112161581436,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004898737353318268,
|
||
|
|
"loss": 6.1557,
|
||
|
|
"mean_token_accuracy": 0.1519090563058853,
|
||
|
|
"num_tokens": 3673283.0,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.377015924453735,
|
||
|
|
"epoch": 1.7400085947571982,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000489774878437062,
|
||
|
|
"loss": 6.298,
|
||
|
|
"mean_token_accuracy": 0.15162839442491532,
|
||
|
|
"num_tokens": 3681760.0,
|
||
|
|
"step": 2025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.46599555015564,
|
||
|
|
"epoch": 1.7443059733562527,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004896755525598074,
|
||
|
|
"loss": 6.1178,
|
||
|
|
"mean_token_accuracy": 0.15259039252996445,
|
||
|
|
"num_tokens": 3689408.0,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4247987270355225,
|
||
|
|
"epoch": 1.7486033519553073,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004895757579169511,
|
||
|
|
"loss": 6.234,
|
||
|
|
"mean_token_accuracy": 0.14994207322597503,
|
||
|
|
"num_tokens": 3697904.0,
|
||
|
|
"step": 2035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.579666042327881,
|
||
|
|
"epoch": 1.7529007305543618,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004894754947264047,
|
||
|
|
"loss": 6.2504,
|
||
|
|
"mean_token_accuracy": 0.15150809586048125,
|
||
|
|
"num_tokens": 3706704.0,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.433872127532959,
|
||
|
|
"epoch": 1.7571981091534163,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000489374763207103,
|
||
|
|
"loss": 6.3286,
|
||
|
|
"mean_token_accuracy": 0.14471730291843415,
|
||
|
|
"num_tokens": 3715690.0,
|
||
|
|
"step": 2045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.465651893615723,
|
||
|
|
"epoch": 1.761495487752471,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004892735635790033,
|
||
|
|
"loss": 6.125,
|
||
|
|
"mean_token_accuracy": 0.15927532613277434,
|
||
|
|
"num_tokens": 3724835.0,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.368647861480713,
|
||
|
|
"epoch": 1.7657928663515254,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.000489171896063085,
|
||
|
|
"loss": 6.1498,
|
||
|
|
"mean_token_accuracy": 0.157290717959404,
|
||
|
|
"num_tokens": 3733977.0,
|
||
|
|
"step": 2055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.458992671966553,
|
||
|
|
"epoch": 1.77009024495058,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004890697608813495,
|
||
|
|
"loss": 6.2682,
|
||
|
|
"mean_token_accuracy": 0.14064312726259232,
|
||
|
|
"num_tokens": 3742665.0,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.583484077453614,
|
||
|
|
"epoch": 1.7743876235496348,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004889671582568193,
|
||
|
|
"loss": 6.3367,
|
||
|
|
"mean_token_accuracy": 0.14621492847800255,
|
||
|
|
"num_tokens": 3751647.0,
|
||
|
|
"step": 2065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.387417125701904,
|
||
|
|
"epoch": 1.7786850021486893,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004888640884135374,
|
||
|
|
"loss": 6.2386,
|
||
|
|
"mean_token_accuracy": 0.1474798172712326,
|
||
|
|
"num_tokens": 3760852.0,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3953369617462155,
|
||
|
|
"epoch": 1.7829823807477438,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0004887605515765671,
|
||
|
|
"loss": 6.1913,
|
||
|
|
"mean_token_accuracy": 0.15439595878124238,
|
||
|
|
"num_tokens": 3768640.0,
|
||
|
|
"step": 2075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.503360080718994,
|
||
|
|
"epoch": 1.7872797593467986,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004886565479719914,
|
||
|
|
"loss": 6.2177,
|
||
|
|
"mean_token_accuracy": 0.14689500331878663,
|
||
|
|
"num_tokens": 3776504.0,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.52859411239624,
|
||
|
|
"epoch": 1.7915771379458532,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004885520778269128,
|
||
|
|
"loss": 6.2515,
|
||
|
|
"mean_token_accuracy": 0.1499434307217598,
|
||
|
|
"num_tokens": 3786353.0,
|
||
|
|
"step": 2085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.410916137695312,
|
||
|
|
"epoch": 1.7958745165449077,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004884471413694523,
|
||
|
|
"loss": 6.2783,
|
||
|
|
"mean_token_accuracy": 0.15109124332666396,
|
||
|
|
"num_tokens": 3795902.0,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.470384979248047,
|
||
|
|
"epoch": 1.8001718951439623,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004883417388287491,
|
||
|
|
"loss": 6.194,
|
||
|
|
"mean_token_accuracy": 0.1435760647058487,
|
||
|
|
"num_tokens": 3805986.0,
|
||
|
|
"step": 2095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.400091123580933,
|
||
|
|
"epoch": 1.8044692737430168,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004882358704349603,
|
||
|
|
"loss": 6.3188,
|
||
|
|
"mean_token_accuracy": 0.1500417910516262,
|
||
|
|
"num_tokens": 3814915.0,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.456367015838623,
|
||
|
|
"epoch": 1.8087666523420713,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004881295364192601,
|
||
|
|
"loss": 6.2089,
|
||
|
|
"mean_token_accuracy": 0.15894449651241302,
|
||
|
|
"num_tokens": 3823966.0,
|
||
|
|
"step": 2105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.510165739059448,
|
||
|
|
"epoch": 1.813064030941126,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004880227370138394,
|
||
|
|
"loss": 6.2729,
|
||
|
|
"mean_token_accuracy": 0.142085450142622,
|
||
|
|
"num_tokens": 3832775.0,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3983588218688965,
|
||
|
|
"epoch": 1.8173614095401804,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.0004879154724519057,
|
||
|
|
"loss": 6.1809,
|
||
|
|
"mean_token_accuracy": 0.15120477825403214,
|
||
|
|
"num_tokens": 3842808.0,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.493490934371948,
|
||
|
|
"epoch": 1.821658788139235,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004878077429676816,
|
||
|
|
"loss": 6.3143,
|
||
|
|
"mean_token_accuracy": 0.14699392020702362,
|
||
|
|
"num_tokens": 3853303.0,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4460196018219,
|
||
|
|
"epoch": 1.8259561667382895,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004876995487964054,
|
||
|
|
"loss": 6.2277,
|
||
|
|
"mean_token_accuracy": 0.13867998719215394,
|
||
|
|
"num_tokens": 3862462.0,
|
||
|
|
"step": 2125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.459061241149902,
|
||
|
|
"epoch": 1.830253545337344,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00048759089017432996,
|
||
|
|
"loss": 6.3388,
|
||
|
|
"mean_token_accuracy": 0.14455281794071198,
|
||
|
|
"num_tokens": 3871596.0,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.482069444656372,
|
||
|
|
"epoch": 1.8345509239363988,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004874817673387222,
|
||
|
|
"loss": 6.2427,
|
||
|
|
"mean_token_accuracy": 0.14856942594051362,
|
||
|
|
"num_tokens": 3881276.0,
|
||
|
|
"step": 2135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.43566927909851,
|
||
|
|
"epoch": 1.8388483025354534,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.00048737218052786275,
|
||
|
|
"loss": 6.33,
|
||
|
|
"mean_token_accuracy": 0.14330809488892554,
|
||
|
|
"num_tokens": 3891610.0,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.498207521438599,
|
||
|
|
"epoch": 1.843145681134508,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.00048726212998104554,
|
||
|
|
"loss": 6.2531,
|
||
|
|
"mean_token_accuracy": 0.14796748533844947,
|
||
|
|
"num_tokens": 3900584.0,
|
||
|
|
"step": 2145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.405120611190796,
|
||
|
|
"epoch": 1.8474430597335625,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004871516159385768,
|
||
|
|
"loss": 6.1817,
|
||
|
|
"mean_token_accuracy": 0.1539264902472496,
|
||
|
|
"num_tokens": 3910208.0,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.320563936233521,
|
||
|
|
"epoch": 1.8517404383326173,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004870406386417752,
|
||
|
|
"loss": 6.1061,
|
||
|
|
"mean_token_accuracy": 0.15697987973690034,
|
||
|
|
"num_tokens": 3918424.0,
|
||
|
|
"step": 2155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.313277053833008,
|
||
|
|
"epoch": 1.8560378169316718,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004869291983329707,
|
||
|
|
"loss": 6.047,
|
||
|
|
"mean_token_accuracy": 0.17023974657058716,
|
||
|
|
"num_tokens": 3926206.0,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.473067951202393,
|
||
|
|
"epoch": 1.8603351955307263,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004868172952555044,
|
||
|
|
"loss": 6.1485,
|
||
|
|
"mean_token_accuracy": 0.14482472315430642,
|
||
|
|
"num_tokens": 3935769.0,
|
||
|
|
"step": 2165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.363153123855591,
|
||
|
|
"epoch": 1.864632574129781,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004867049296537278,
|
||
|
|
"loss": 6.1373,
|
||
|
|
"mean_token_accuracy": 0.1534383252263069,
|
||
|
|
"num_tokens": 3945118.0,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.399164772033691,
|
||
|
|
"epoch": 1.8689299527288354,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0004865921017730027,
|
||
|
|
"loss": 6.2358,
|
||
|
|
"mean_token_accuracy": 0.15296792089939118,
|
||
|
|
"num_tokens": 3954012.0,
|
||
|
|
"step": 2175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.471106052398682,
|
||
|
|
"epoch": 1.87322733132789,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.00048647881185969995,
|
||
|
|
"loss": 6.2355,
|
||
|
|
"mean_token_accuracy": 0.15060990452766418,
|
||
|
|
"num_tokens": 3964239.0,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.386410093307495,
|
||
|
|
"epoch": 1.8775247099269445,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004863650601611994,
|
||
|
|
"loss": 6.1502,
|
||
|
|
"mean_token_accuracy": 0.15660223215818406,
|
||
|
|
"num_tokens": 3973694.0,
|
||
|
|
"step": 2185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.372910404205323,
|
||
|
|
"epoch": 1.881822088525999,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00048625084692588937,
|
||
|
|
"loss": 6.185,
|
||
|
|
"mean_token_accuracy": 0.15601919442415238,
|
||
|
|
"num_tokens": 3982706.0,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.401282548904419,
|
||
|
|
"epoch": 1.8861194671250536,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00048613617240316593,
|
||
|
|
"loss": 6.138,
|
||
|
|
"mean_token_accuracy": 0.15665835291147232,
|
||
|
|
"num_tokens": 3990934.0,
|
||
|
|
"step": 2195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4126348972320555,
|
||
|
|
"epoch": 1.8904168457241082,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004860210368434323,
|
||
|
|
"loss": 6.192,
|
||
|
|
"mean_token_accuracy": 0.1556440055370331,
|
||
|
|
"num_tokens": 3999864.0,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.424229860305786,
|
||
|
|
"epoch": 1.8947142243231627,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.00048590544049809857,
|
||
|
|
"loss": 6.1968,
|
||
|
|
"mean_token_accuracy": 0.15178433507680894,
|
||
|
|
"num_tokens": 4008273.0,
|
||
|
|
"step": 2205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.427778577804565,
|
||
|
|
"epoch": 1.8990116029222175,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000485789383619581,
|
||
|
|
"loss": 6.2178,
|
||
|
|
"mean_token_accuracy": 0.1559001922607422,
|
||
|
|
"num_tokens": 4017697.0,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4254296779632565,
|
||
|
|
"epoch": 1.903308981521272,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004856728664613015,
|
||
|
|
"loss": 6.2293,
|
||
|
|
"mean_token_accuracy": 0.14589258283376694,
|
||
|
|
"num_tokens": 4026775.0,
|
||
|
|
"step": 2215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.351989793777466,
|
||
|
|
"epoch": 1.9076063601203266,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00048555588927768674,
|
||
|
|
"loss": 6.1972,
|
||
|
|
"mean_token_accuracy": 0.15271373167634011,
|
||
|
|
"num_tokens": 4036476.0,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.473893165588379,
|
||
|
|
"epoch": 1.9119037387193811,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004854384523241683,
|
||
|
|
"loss": 6.204,
|
||
|
|
"mean_token_accuracy": 0.15081721246242524,
|
||
|
|
"num_tokens": 4045221.0,
|
||
|
|
"step": 2225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.310385704040527,
|
||
|
|
"epoch": 1.916201117318436,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00048532055585718143,
|
||
|
|
"loss": 6.1112,
|
||
|
|
"mean_token_accuracy": 0.15869007259607315,
|
||
|
|
"num_tokens": 4053754.0,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.390126276016235,
|
||
|
|
"epoch": 1.9204984959174904,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00048520220013416505,
|
||
|
|
"loss": 6.1455,
|
||
|
|
"mean_token_accuracy": 0.15594211518764495,
|
||
|
|
"num_tokens": 4061730.0,
|
||
|
|
"step": 2235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3809610366821286,
|
||
|
|
"epoch": 1.924795874516545,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004850833854135607,
|
||
|
|
"loss": 6.197,
|
||
|
|
"mean_token_accuracy": 0.15130506530404092,
|
||
|
|
"num_tokens": 4070501.0,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.420936059951782,
|
||
|
|
"epoch": 1.9290932531155995,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004849641119548122,
|
||
|
|
"loss": 6.2763,
|
||
|
|
"mean_token_accuracy": 0.1485205315053463,
|
||
|
|
"num_tokens": 4079621.0,
|
||
|
|
"step": 2245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4735170841217045,
|
||
|
|
"epoch": 1.933390631714654,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000484844380018365,
|
||
|
|
"loss": 6.2663,
|
||
|
|
"mean_token_accuracy": 0.14868344217538834,
|
||
|
|
"num_tokens": 4090106.0,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.461083984375,
|
||
|
|
"epoch": 1.9376880103137086,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000484724189865666,
|
||
|
|
"loss": 6.1985,
|
||
|
|
"mean_token_accuracy": 0.1501224085688591,
|
||
|
|
"num_tokens": 4099269.0,
|
||
|
|
"step": 2255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.287312364578247,
|
||
|
|
"epoch": 1.9419853889127632,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004846035417591624,
|
||
|
|
"loss": 6.1351,
|
||
|
|
"mean_token_accuracy": 0.1544906511902809,
|
||
|
|
"num_tokens": 4108414.0,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.426730060577393,
|
||
|
|
"epoch": 1.9462827675118177,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004844824359623014,
|
||
|
|
"loss": 6.2629,
|
||
|
|
"mean_token_accuracy": 0.14584496468305588,
|
||
|
|
"num_tokens": 4117731.0,
|
||
|
|
"step": 2265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.451971340179443,
|
||
|
|
"epoch": 1.9505801461108723,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00048436087273952966,
|
||
|
|
"loss": 6.2441,
|
||
|
|
"mean_token_accuracy": 0.14279974550008773,
|
||
|
|
"num_tokens": 4127194.0,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.396147346496582,
|
||
|
|
"epoch": 1.9548775247099268,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00048423885235629265,
|
||
|
|
"loss": 6.193,
|
||
|
|
"mean_token_accuracy": 0.15773467123508453,
|
||
|
|
"num_tokens": 4135594.0,
|
||
|
|
"step": 2275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.39124755859375,
|
||
|
|
"epoch": 1.9591749033089814,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004841163750790342,
|
||
|
|
"loss": 6.2256,
|
||
|
|
"mean_token_accuracy": 0.15189721137285234,
|
||
|
|
"num_tokens": 4145027.0,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.383194398880005,
|
||
|
|
"epoch": 1.9634722819080361,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00048399344117519555,
|
||
|
|
"loss": 6.087,
|
||
|
|
"mean_token_accuracy": 0.15884610414505004,
|
||
|
|
"num_tokens": 4153754.0,
|
||
|
|
"step": 2285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.330159759521484,
|
||
|
|
"epoch": 1.9677696605070907,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00048387005091321544,
|
||
|
|
"loss": 6.1553,
|
||
|
|
"mean_token_accuracy": 0.15946451872587203,
|
||
|
|
"num_tokens": 4162765.0,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.414357376098633,
|
||
|
|
"epoch": 1.9720670391061452,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00048374620456252877,
|
||
|
|
"loss": 6.1748,
|
||
|
|
"mean_token_accuracy": 0.1570574849843979,
|
||
|
|
"num_tokens": 4171589.0,
|
||
|
|
"step": 2295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.360631132125855,
|
||
|
|
"epoch": 1.9763644177052,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00048362190239356644,
|
||
|
|
"loss": 6.1913,
|
||
|
|
"mean_token_accuracy": 0.155552938580513,
|
||
|
|
"num_tokens": 4181817.0,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.352840518951416,
|
||
|
|
"epoch": 1.9806617963042545,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00048349714467775474,
|
||
|
|
"loss": 6.1462,
|
||
|
|
"mean_token_accuracy": 0.1511269122362137,
|
||
|
|
"num_tokens": 4191350.0,
|
||
|
|
"step": 2305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3630085468292235,
|
||
|
|
"epoch": 1.984959174903309,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00048337193168751464,
|
||
|
|
"loss": 6.1935,
|
||
|
|
"mean_token_accuracy": 0.1461350604891777,
|
||
|
|
"num_tokens": 4199888.0,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.447411775588989,
|
||
|
|
"epoch": 1.9892565535023636,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004832462636962613,
|
||
|
|
"loss": 6.1829,
|
||
|
|
"mean_token_accuracy": 0.1507252760231495,
|
||
|
|
"num_tokens": 4209509.0,
|
||
|
|
"step": 2315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.372689247131348,
|
||
|
|
"epoch": 1.9935539321014182,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004831201409784034,
|
||
|
|
"loss": 6.1215,
|
||
|
|
"mean_token_accuracy": 0.15712654441595078,
|
||
|
|
"num_tokens": 4218496.0,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.357889032363891,
|
||
|
|
"epoch": 1.9978513107004727,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004829935638093424,
|
||
|
|
"loss": 6.1463,
|
||
|
|
"mean_token_accuracy": 0.15369027704000474,
|
||
|
|
"num_tokens": 4227504.0,
|
||
|
|
"step": 2325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.373083750406901,
|
||
|
|
"epoch": 2.0017189514396216,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004828665324654724,
|
||
|
|
"loss": 6.0581,
|
||
|
|
"mean_token_accuracy": 0.15794145895375145,
|
||
|
|
"num_tokens": 4235338.0,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4267494678497314,
|
||
|
|
"epoch": 2.006016330038676,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004827390472241791,
|
||
|
|
"loss": 5.8418,
|
||
|
|
"mean_token_accuracy": 0.16316850185394288,
|
||
|
|
"num_tokens": 4244905.0,
|
||
|
|
"step": 2335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.314910984039306,
|
||
|
|
"epoch": 2.010313708637731,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004826111083638392,
|
||
|
|
"loss": 5.9211,
|
||
|
|
"mean_token_accuracy": 0.1677140362560749,
|
||
|
|
"num_tokens": 4254533.0,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.370204210281372,
|
||
|
|
"epoch": 2.0146110872367857,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.00048248271616382,
|
||
|
|
"loss": 5.8961,
|
||
|
|
"mean_token_accuracy": 0.16431671380996704,
|
||
|
|
"num_tokens": 4264023.0,
|
||
|
|
"step": 2345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.326271295547485,
|
||
|
|
"epoch": 2.0189084658358403,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00048235387090447894,
|
||
|
|
"loss": 5.9306,
|
||
|
|
"mean_token_accuracy": 0.1572665750980377,
|
||
|
|
"num_tokens": 4273298.0,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.378605699539184,
|
||
|
|
"epoch": 2.023205844434895,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00048222457286716235,
|
||
|
|
"loss": 5.8756,
|
||
|
|
"mean_token_accuracy": 0.16723261177539825,
|
||
|
|
"num_tokens": 4283244.0,
|
||
|
|
"step": 2355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.322220325469971,
|
||
|
|
"epoch": 2.0275032230339494,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00048209482233420564,
|
||
|
|
"loss": 5.8185,
|
||
|
|
"mean_token_accuracy": 0.1769508183002472,
|
||
|
|
"num_tokens": 4291677.0,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.314945793151855,
|
||
|
|
"epoch": 2.031800601633004,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000481964619588932,
|
||
|
|
"loss": 5.8793,
|
||
|
|
"mean_token_accuracy": 0.16825687736272812,
|
||
|
|
"num_tokens": 4300822.0,
|
||
|
|
"step": 2365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.339528942108155,
|
||
|
|
"epoch": 2.0360979802320585,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004818339649156523,
|
||
|
|
"loss": 5.8876,
|
||
|
|
"mean_token_accuracy": 0.16732898950576783,
|
||
|
|
"num_tokens": 4310149.0,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.19782075881958,
|
||
|
|
"epoch": 2.040395358831113,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00048170285859966395,
|
||
|
|
"loss": 5.7924,
|
||
|
|
"mean_token_accuracy": 0.17466236799955367,
|
||
|
|
"num_tokens": 4319109.0,
|
||
|
|
"step": 2375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3286045551300045,
|
||
|
|
"epoch": 2.0446927374301676,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.00048157130092725087,
|
||
|
|
"loss": 5.7843,
|
||
|
|
"mean_token_accuracy": 0.1704682469367981,
|
||
|
|
"num_tokens": 4327921.0,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.329291915893554,
|
||
|
|
"epoch": 2.048990116029222,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004814392921856824,
|
||
|
|
"loss": 5.9287,
|
||
|
|
"mean_token_accuracy": 0.16586144566535949,
|
||
|
|
"num_tokens": 4338026.0,
|
||
|
|
"step": 2385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2563072681427006,
|
||
|
|
"epoch": 2.0532874946282766,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004813068326632128,
|
||
|
|
"loss": 5.7762,
|
||
|
|
"mean_token_accuracy": 0.17654864937067033,
|
||
|
|
"num_tokens": 4347794.0,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.329816913604736,
|
||
|
|
"epoch": 2.057584873227331,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004811739226490809,
|
||
|
|
"loss": 5.9557,
|
||
|
|
"mean_token_accuracy": 0.16758598685264586,
|
||
|
|
"num_tokens": 4357249.0,
|
||
|
|
"step": 2395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.283816623687744,
|
||
|
|
"epoch": 2.0618822518263857,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00048104056243350896,
|
||
|
|
"loss": 5.9041,
|
||
|
|
"mean_token_accuracy": 0.16363563090562822,
|
||
|
|
"num_tokens": 4366053.0,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.297672891616822,
|
||
|
|
"epoch": 2.0661796304254403,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004809067523077023,
|
||
|
|
"loss": 5.9163,
|
||
|
|
"mean_token_accuracy": 0.16945113092660904,
|
||
|
|
"num_tokens": 4375543.0,
|
||
|
|
"step": 2405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2845330238342285,
|
||
|
|
"epoch": 2.0704770090244953,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00048077249256384884,
|
||
|
|
"loss": 5.8006,
|
||
|
|
"mean_token_accuracy": 0.17305675595998765,
|
||
|
|
"num_tokens": 4384332.0,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.210544061660767,
|
||
|
|
"epoch": 2.07477438762355,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004806377834951182,
|
||
|
|
"loss": 5.8994,
|
||
|
|
"mean_token_accuracy": 0.16216432005167009,
|
||
|
|
"num_tokens": 4393670.0,
|
||
|
|
"step": 2415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.373771142959595,
|
||
|
|
"epoch": 2.0790717662226044,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00048050262539566104,
|
||
|
|
"loss": 5.9012,
|
||
|
|
"mean_token_accuracy": 0.16862600147724152,
|
||
|
|
"num_tokens": 4402763.0,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.269940948486328,
|
||
|
|
"epoch": 2.083369144821659,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004803670185606087,
|
||
|
|
"loss": 5.8086,
|
||
|
|
"mean_token_accuracy": 0.17335692346096038,
|
||
|
|
"num_tokens": 4411863.0,
|
||
|
|
"step": 2425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.265923166275025,
|
||
|
|
"epoch": 2.0876665234207135,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004802309632860724,
|
||
|
|
"loss": 5.9059,
|
||
|
|
"mean_token_accuracy": 0.16651569604873656,
|
||
|
|
"num_tokens": 4421110.0,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.352302503585816,
|
||
|
|
"epoch": 2.091963902019768,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00048009445986914236,
|
||
|
|
"loss": 5.8854,
|
||
|
|
"mean_token_accuracy": 0.16589637845754623,
|
||
|
|
"num_tokens": 4430249.0,
|
||
|
|
"step": 2435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.263960170745849,
|
||
|
|
"epoch": 2.0962612806188226,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00047995750860788756,
|
||
|
|
"loss": 5.8661,
|
||
|
|
"mean_token_accuracy": 0.15910358875989913,
|
||
|
|
"num_tokens": 4439686.0,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.227327108383179,
|
||
|
|
"epoch": 2.100558659217877,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004798201098013547,
|
||
|
|
"loss": 5.8709,
|
||
|
|
"mean_token_accuracy": 0.1692453533411026,
|
||
|
|
"num_tokens": 4448645.0,
|
||
|
|
"step": 2445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.291311168670655,
|
||
|
|
"epoch": 2.1048560378169316,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.00047968226374956797,
|
||
|
|
"loss": 5.8333,
|
||
|
|
"mean_token_accuracy": 0.1675017699599266,
|
||
|
|
"num_tokens": 4456870.0,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.195930767059326,
|
||
|
|
"epoch": 2.109153416415986,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00047954397075352794,
|
||
|
|
"loss": 5.8684,
|
||
|
|
"mean_token_accuracy": 0.17277338951826096,
|
||
|
|
"num_tokens": 4466287.0,
|
||
|
|
"step": 2455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2388382911682125,
|
||
|
|
"epoch": 2.1134507950150407,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00047940523111521136,
|
||
|
|
"loss": 5.7553,
|
||
|
|
"mean_token_accuracy": 0.17395039051771163,
|
||
|
|
"num_tokens": 4474461.0,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.255577421188354,
|
||
|
|
"epoch": 2.1177481736140953,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004792660451375701,
|
||
|
|
"loss": 5.835,
|
||
|
|
"mean_token_accuracy": 0.16953630596399308,
|
||
|
|
"num_tokens": 4483002.0,
|
||
|
|
"step": 2465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.224816513061524,
|
||
|
|
"epoch": 2.12204555221315,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00047912641312453064,
|
||
|
|
"loss": 5.8459,
|
||
|
|
"mean_token_accuracy": 0.1695180580019951,
|
||
|
|
"num_tokens": 4492061.0,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.284405374526978,
|
||
|
|
"epoch": 2.1263429308122044,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.00047898633538099363,
|
||
|
|
"loss": 5.8957,
|
||
|
|
"mean_token_accuracy": 0.16090027987957,
|
||
|
|
"num_tokens": 4501829.0,
|
||
|
|
"step": 2475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.258666229248047,
|
||
|
|
"epoch": 2.130640309411259,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004788458122128327,
|
||
|
|
"loss": 5.9181,
|
||
|
|
"mean_token_accuracy": 0.1656097248196602,
|
||
|
|
"num_tokens": 4511539.0,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.246809720993042,
|
||
|
|
"epoch": 2.134937688010314,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00047870484392689434,
|
||
|
|
"loss": 5.7722,
|
||
|
|
"mean_token_accuracy": 0.1671189084649086,
|
||
|
|
"num_tokens": 4520425.0,
|
||
|
|
"step": 2485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.220279026031494,
|
||
|
|
"epoch": 2.1392350666093685,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000478563430830997,
|
||
|
|
"loss": 5.8751,
|
||
|
|
"mean_token_accuracy": 0.16446918100118638,
|
||
|
|
"num_tokens": 4529474.0,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2571605205535885,
|
||
|
|
"epoch": 2.143532445208423,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00047842157323393035,
|
||
|
|
"loss": 5.8041,
|
||
|
|
"mean_token_accuracy": 0.1694269135594368,
|
||
|
|
"num_tokens": 4538082.0,
|
||
|
|
"step": 2495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.218803596496582,
|
||
|
|
"epoch": 2.1478298238074776,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004782792714454547,
|
||
|
|
"loss": 5.9987,
|
||
|
|
"mean_token_accuracy": 0.16337930560112,
|
||
|
|
"num_tokens": 4547340.0,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1478298238074776,
|
||
|
|
"eval_entropy": 6.073525357890773,
|
||
|
|
"eval_loss": 6.213027477264404,
|
||
|
|
"eval_mean_token_accuracy": 0.15643914548999016,
|
||
|
|
"eval_num_tokens": 4547340.0,
|
||
|
|
"eval_runtime": 2.0452,
|
||
|
|
"eval_samples_per_second": 1735.325,
|
||
|
|
"eval_steps_per_second": 217.099,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.266714763641358,
|
||
|
|
"epoch": 2.152127202406532,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004781365257763002,
|
||
|
|
"loss": 5.8423,
|
||
|
|
"mean_token_accuracy": 0.16869749277830123,
|
||
|
|
"num_tokens": 4556415.0,
|
||
|
|
"step": 2505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1728370666503904,
|
||
|
|
"epoch": 2.1564245810055866,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.00047799333653816633,
|
||
|
|
"loss": 5.7293,
|
||
|
|
"mean_token_accuracy": 0.17461720257997512,
|
||
|
|
"num_tokens": 4565156.0,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.233670806884765,
|
||
|
|
"epoch": 2.160721959604641,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00047784970404372124,
|
||
|
|
"loss": 5.8327,
|
||
|
|
"mean_token_accuracy": 0.16848449259996415,
|
||
|
|
"num_tokens": 4574678.0,
|
||
|
|
"step": 2515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.12764801979065,
|
||
|
|
"epoch": 2.1650193382036957,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00047770562860660083,
|
||
|
|
"loss": 5.854,
|
||
|
|
"mean_token_accuracy": 0.16377500146627427,
|
||
|
|
"num_tokens": 4583253.0,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.273917770385742,
|
||
|
|
"epoch": 2.1693167168027503,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004775611105414083,
|
||
|
|
"loss": 5.9138,
|
||
|
|
"mean_token_accuracy": 0.16056130826473236,
|
||
|
|
"num_tokens": 4594042.0,
|
||
|
|
"step": 2525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.210309171676636,
|
||
|
|
"epoch": 2.173614095401805,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004774161501637133,
|
||
|
|
"loss": 5.8661,
|
||
|
|
"mean_token_accuracy": 0.16690902709960936,
|
||
|
|
"num_tokens": 4603128.0,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.207437753677368,
|
||
|
|
"epoch": 2.1779114740008594,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0004772707477900514,
|
||
|
|
"loss": 5.8489,
|
||
|
|
"mean_token_accuracy": 0.17330004572868346,
|
||
|
|
"num_tokens": 4611537.0,
|
||
|
|
"step": 2535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.316633796691894,
|
||
|
|
"epoch": 2.182208852599914,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004771249037379232,
|
||
|
|
"loss": 5.9518,
|
||
|
|
"mean_token_accuracy": 0.1604529470205307,
|
||
|
|
"num_tokens": 4622481.0,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.174561834335327,
|
||
|
|
"epoch": 2.1865062311989685,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004769786183257939,
|
||
|
|
"loss": 5.8564,
|
||
|
|
"mean_token_accuracy": 0.17447448074817656,
|
||
|
|
"num_tokens": 4631259.0,
|
||
|
|
"step": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.186811542510986,
|
||
|
|
"epoch": 2.190803609798023,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004768318918730924,
|
||
|
|
"loss": 5.7986,
|
||
|
|
"mean_token_accuracy": 0.1752243533730507,
|
||
|
|
"num_tokens": 4640266.0,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.212873888015747,
|
||
|
|
"epoch": 2.195100988397078,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00047668472470021044,
|
||
|
|
"loss": 5.853,
|
||
|
|
"mean_token_accuracy": 0.16329605877399445,
|
||
|
|
"num_tokens": 4649520.0,
|
||
|
|
"step": 2555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.257145929336548,
|
||
|
|
"epoch": 2.1993983669961326,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004765371171285025,
|
||
|
|
"loss": 5.8079,
|
||
|
|
"mean_token_accuracy": 0.1733356922864914,
|
||
|
|
"num_tokens": 4658501.0,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.108858823776245,
|
||
|
|
"epoch": 2.203695745595187,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00047638906948028445,
|
||
|
|
"loss": 5.8536,
|
||
|
|
"mean_token_accuracy": 0.16747843474149704,
|
||
|
|
"num_tokens": 4667567.0,
|
||
|
|
"step": 2565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.222007703781128,
|
||
|
|
"epoch": 2.2079931241942417,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00047624058207883317,
|
||
|
|
"loss": 5.8596,
|
||
|
|
"mean_token_accuracy": 0.16799781173467637,
|
||
|
|
"num_tokens": 4676618.0,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.326595973968506,
|
||
|
|
"epoch": 2.212290502793296,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.00047609165524838576,
|
||
|
|
"loss": 5.921,
|
||
|
|
"mean_token_accuracy": 0.16489885598421097,
|
||
|
|
"num_tokens": 4685967.0,
|
||
|
|
"step": 2575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.112624216079712,
|
||
|
|
"epoch": 2.2165878813923507,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.0004759422893141389,
|
||
|
|
"loss": 5.8098,
|
||
|
|
"mean_token_accuracy": 0.17214897125959397,
|
||
|
|
"num_tokens": 4694568.0,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.23127293586731,
|
||
|
|
"epoch": 2.2208852599914053,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004757924846022482,
|
||
|
|
"loss": 5.8764,
|
||
|
|
"mean_token_accuracy": 0.1683722823858261,
|
||
|
|
"num_tokens": 4703648.0,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2149560928344725,
|
||
|
|
"epoch": 2.22518263859046,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00047564224143982714,
|
||
|
|
"loss": 5.7317,
|
||
|
|
"mean_token_accuracy": 0.18064576983451844,
|
||
|
|
"num_tokens": 4712444.0,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.195422506332397,
|
||
|
|
"epoch": 2.2294800171895144,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.00047549156015494676,
|
||
|
|
"loss": 5.887,
|
||
|
|
"mean_token_accuracy": 0.16564202010631562,
|
||
|
|
"num_tokens": 4722034.0,
|
||
|
|
"step": 2595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.179683208465576,
|
||
|
|
"epoch": 2.233777395788569,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00047534044107663484,
|
||
|
|
"loss": 5.9075,
|
||
|
|
"mean_token_accuracy": 0.16279049664735795,
|
||
|
|
"num_tokens": 4731344.0,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.295088148117065,
|
||
|
|
"epoch": 2.2380747743876235,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00047518888453487496,
|
||
|
|
"loss": 5.809,
|
||
|
|
"mean_token_accuracy": 0.17704246044158936,
|
||
|
|
"num_tokens": 4739302.0,
|
||
|
|
"step": 2605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1531964302062985,
|
||
|
|
"epoch": 2.242372152986678,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004750368908606061,
|
||
|
|
"loss": 5.9282,
|
||
|
|
"mean_token_accuracy": 0.16434444785118102,
|
||
|
|
"num_tokens": 4748848.0,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.262106943130493,
|
||
|
|
"epoch": 2.2466695315857326,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00047488446038572164,
|
||
|
|
"loss": 5.9816,
|
||
|
|
"mean_token_accuracy": 0.16012711673974991,
|
||
|
|
"num_tokens": 4758194.0,
|
||
|
|
"step": 2615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.268323373794556,
|
||
|
|
"epoch": 2.250966910184787,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004747315934430688,
|
||
|
|
"loss": 5.8908,
|
||
|
|
"mean_token_accuracy": 0.164437834918499,
|
||
|
|
"num_tokens": 4768081.0,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.122048091888428,
|
||
|
|
"epoch": 2.2552642887838417,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000474578290366448,
|
||
|
|
"loss": 5.8245,
|
||
|
|
"mean_token_accuracy": 0.1705750197172165,
|
||
|
|
"num_tokens": 4776471.0,
|
||
|
|
"step": 2625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.204921579360962,
|
||
|
|
"epoch": 2.259561667382896,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004744245514906117,
|
||
|
|
"loss": 5.8253,
|
||
|
|
"mean_token_accuracy": 0.1741186946630478,
|
||
|
|
"num_tokens": 4784403.0,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1283422946929935,
|
||
|
|
"epoch": 2.263859045981951,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00047427037715126426,
|
||
|
|
"loss": 5.8029,
|
||
|
|
"mean_token_accuracy": 0.16940733194351196,
|
||
|
|
"num_tokens": 4792779.0,
|
||
|
|
"step": 2635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.132787275314331,
|
||
|
|
"epoch": 2.2681564245810057,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004741157676850608,
|
||
|
|
"loss": 5.7827,
|
||
|
|
"mean_token_accuracy": 0.1744200199842453,
|
||
|
|
"num_tokens": 4801426.0,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2156031131744385,
|
||
|
|
"epoch": 2.2724538031800603,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.00047396072342960663,
|
||
|
|
"loss": 5.8338,
|
||
|
|
"mean_token_accuracy": 0.16472329795360566,
|
||
|
|
"num_tokens": 4810329.0,
|
||
|
|
"step": 2645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1918652057647705,
|
||
|
|
"epoch": 2.276751181779115,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00047380524472345645,
|
||
|
|
"loss": 5.8802,
|
||
|
|
"mean_token_accuracy": 0.16467834115028382,
|
||
|
|
"num_tokens": 4819544.0,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.203462934494018,
|
||
|
|
"epoch": 2.2810485603781694,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004736493319061134,
|
||
|
|
"loss": 5.8876,
|
||
|
|
"mean_token_accuracy": 0.16658470630645753,
|
||
|
|
"num_tokens": 4828113.0,
|
||
|
|
"step": 2655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.154991245269775,
|
||
|
|
"epoch": 2.285345938977224,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004734929853180291,
|
||
|
|
"loss": 5.8764,
|
||
|
|
"mean_token_accuracy": 0.16575339883565904,
|
||
|
|
"num_tokens": 4836989.0,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.258448839187622,
|
||
|
|
"epoch": 2.2896433175762785,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.00047333620530060175,
|
||
|
|
"loss": 5.9117,
|
||
|
|
"mean_token_accuracy": 0.16528864502906798,
|
||
|
|
"num_tokens": 4847103.0,
|
||
|
|
"step": 2665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.181549310684204,
|
||
|
|
"epoch": 2.293940696175333,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004731789921961764,
|
||
|
|
"loss": 5.9289,
|
||
|
|
"mean_token_accuracy": 0.16640040427446365,
|
||
|
|
"num_tokens": 4856238.0,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.227826976776123,
|
||
|
|
"epoch": 2.2982380747743876,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004730213463480434,
|
||
|
|
"loss": 5.8189,
|
||
|
|
"mean_token_accuracy": 0.17475187480449678,
|
||
|
|
"num_tokens": 4864608.0,
|
||
|
|
"step": 2675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.163301944732666,
|
||
|
|
"epoch": 2.302535453373442,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00047286326810043857,
|
||
|
|
"loss": 5.7783,
|
||
|
|
"mean_token_accuracy": 0.17075299024581908,
|
||
|
|
"num_tokens": 4873889.0,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.134186220169068,
|
||
|
|
"epoch": 2.3068328319724967,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.00047270475779854137,
|
||
|
|
"loss": 5.8223,
|
||
|
|
"mean_token_accuracy": 0.1724078834056854,
|
||
|
|
"num_tokens": 4882902.0,
|
||
|
|
"step": 2685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.292477703094482,
|
||
|
|
"epoch": 2.311130210571551,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00047254581578847507,
|
||
|
|
"loss": 5.8426,
|
||
|
|
"mean_token_accuracy": 0.16808903068304062,
|
||
|
|
"num_tokens": 4892390.0,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.170593881607056,
|
||
|
|
"epoch": 2.3154275891706058,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004723864424173055,
|
||
|
|
"loss": 5.9683,
|
||
|
|
"mean_token_accuracy": 0.1666146218776703,
|
||
|
|
"num_tokens": 4901625.0,
|
||
|
|
"step": 2695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.194738912582397,
|
||
|
|
"epoch": 2.3197249677696608,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004722266380330403,
|
||
|
|
"loss": 5.7718,
|
||
|
|
"mean_token_accuracy": 0.17559022307395936,
|
||
|
|
"num_tokens": 4910804.0,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.180141830444336,
|
||
|
|
"epoch": 2.3240223463687153,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00047206640298462857,
|
||
|
|
"loss": 5.8472,
|
||
|
|
"mean_token_accuracy": 0.16781375855207442,
|
||
|
|
"num_tokens": 4920441.0,
|
||
|
|
"step": 2705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.170105838775635,
|
||
|
|
"epoch": 2.32831972496777,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.00047190573762195945,
|
||
|
|
"loss": 5.8928,
|
||
|
|
"mean_token_accuracy": 0.1647154539823532,
|
||
|
|
"num_tokens": 4930204.0,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.171744394302368,
|
||
|
|
"epoch": 2.3326171035668244,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.00047174464229586186,
|
||
|
|
"loss": 5.9868,
|
||
|
|
"mean_token_accuracy": 0.15878558307886123,
|
||
|
|
"num_tokens": 4941191.0,
|
||
|
|
"step": 2715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.294037532806397,
|
||
|
|
"epoch": 2.336914482165879,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0004715831173581036,
|
||
|
|
"loss": 5.9658,
|
||
|
|
"mean_token_accuracy": 0.16081493049860002,
|
||
|
|
"num_tokens": 4951825.0,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.163305330276489,
|
||
|
|
"epoch": 2.3412118607649335,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.00047142116316139073,
|
||
|
|
"loss": 5.9007,
|
||
|
|
"mean_token_accuracy": 0.1701881170272827,
|
||
|
|
"num_tokens": 4960632.0,
|
||
|
|
"step": 2725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.263418245315552,
|
||
|
|
"epoch": 2.345509239363988,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004712587800593663,
|
||
|
|
"loss": 5.9268,
|
||
|
|
"mean_token_accuracy": 0.1628424420952797,
|
||
|
|
"num_tokens": 4969455.0,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.159938859939575,
|
||
|
|
"epoch": 2.3498066179630426,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0004710959684066102,
|
||
|
|
"loss": 5.822,
|
||
|
|
"mean_token_accuracy": 0.1740834206342697,
|
||
|
|
"num_tokens": 4978997.0,
|
||
|
|
"step": 2735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.198467969894409,
|
||
|
|
"epoch": 2.354103996562097,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00047093272855863803,
|
||
|
|
"loss": 5.89,
|
||
|
|
"mean_token_accuracy": 0.16633735448122025,
|
||
|
|
"num_tokens": 4988305.0,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.171191024780273,
|
||
|
|
"epoch": 2.3584013751611517,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004707690608719003,
|
||
|
|
"loss": 5.8201,
|
||
|
|
"mean_token_accuracy": 0.17565433084964752,
|
||
|
|
"num_tokens": 4997022.0,
|
||
|
|
"step": 2745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.182925462722778,
|
||
|
|
"epoch": 2.362698753760206,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004706049657037818,
|
||
|
|
"loss": 5.879,
|
||
|
|
"mean_token_accuracy": 0.16346064060926438,
|
||
|
|
"num_tokens": 5005664.0,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.149474191665649,
|
||
|
|
"epoch": 2.3669961323592608,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004704404434126009,
|
||
|
|
"loss": 5.8502,
|
||
|
|
"mean_token_accuracy": 0.16408389210700988,
|
||
|
|
"num_tokens": 5014769.0,
|
||
|
|
"step": 2755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.255496549606323,
|
||
|
|
"epoch": 2.3712935109583153,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00047027549435760843,
|
||
|
|
"loss": 5.9078,
|
||
|
|
"mean_token_accuracy": 0.16789433360099792,
|
||
|
|
"num_tokens": 5024060.0,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.256794357299805,
|
||
|
|
"epoch": 2.37559088955737,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004701101188989872,
|
||
|
|
"loss": 5.9544,
|
||
|
|
"mean_token_accuracy": 0.1624842867255211,
|
||
|
|
"num_tokens": 5033046.0,
|
||
|
|
"step": 2765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.156686782836914,
|
||
|
|
"epoch": 2.3798882681564244,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00046994431739785114,
|
||
|
|
"loss": 5.7991,
|
||
|
|
"mean_token_accuracy": 0.18271932750940323,
|
||
|
|
"num_tokens": 5040894.0,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.20210337638855,
|
||
|
|
"epoch": 2.384185646755479,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00046977809021624454,
|
||
|
|
"loss": 5.9534,
|
||
|
|
"mean_token_accuracy": 0.17005517482757568,
|
||
|
|
"num_tokens": 5050961.0,
|
||
|
|
"step": 2775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.216541862487793,
|
||
|
|
"epoch": 2.3884830253545335,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004696114377171409,
|
||
|
|
"loss": 5.8757,
|
||
|
|
"mean_token_accuracy": 0.1636977568268776,
|
||
|
|
"num_tokens": 5060226.0,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.160855150222778,
|
||
|
|
"epoch": 2.3927804039535885,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004694443602644429,
|
||
|
|
"loss": 5.8457,
|
||
|
|
"mean_token_accuracy": 0.16862347573041916,
|
||
|
|
"num_tokens": 5069225.0,
|
||
|
|
"step": 2785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.22788553237915,
|
||
|
|
"epoch": 2.397077782552643,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004692768582229808,
|
||
|
|
"loss": 5.8344,
|
||
|
|
"mean_token_accuracy": 0.17104473561048508,
|
||
|
|
"num_tokens": 5078386.0,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.091501474380493,
|
||
|
|
"epoch": 2.4013751611516976,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.00046910893195851213,
|
||
|
|
"loss": 5.765,
|
||
|
|
"mean_token_accuracy": 0.16869171112775802,
|
||
|
|
"num_tokens": 5087161.0,
|
||
|
|
"step": 2795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.183551597595215,
|
||
|
|
"epoch": 2.405672539750752,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00046894058183772074,
|
||
|
|
"loss": 5.9281,
|
||
|
|
"mean_token_accuracy": 0.16594007909297942,
|
||
|
|
"num_tokens": 5096613.0,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.197868537902832,
|
||
|
|
"epoch": 2.4099699183498067,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.000468771808228216,
|
||
|
|
"loss": 5.8912,
|
||
|
|
"mean_token_accuracy": 0.16417519897222518,
|
||
|
|
"num_tokens": 5106534.0,
|
||
|
|
"step": 2805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.143604946136475,
|
||
|
|
"epoch": 2.414267296948861,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00046860261149853197,
|
||
|
|
"loss": 5.9134,
|
||
|
|
"mean_token_accuracy": 0.1646139517426491,
|
||
|
|
"num_tokens": 5115975.0,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.127184104919434,
|
||
|
|
"epoch": 2.4185646755479158,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004684329920181268,
|
||
|
|
"loss": 5.8045,
|
||
|
|
"mean_token_accuracy": 0.16945046484470366,
|
||
|
|
"num_tokens": 5124635.0,
|
||
|
|
"step": 2815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.151847076416016,
|
||
|
|
"epoch": 2.4228620541469703,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00046826295015738154,
|
||
|
|
"loss": 5.7738,
|
||
|
|
"mean_token_accuracy": 0.1773565873503685,
|
||
|
|
"num_tokens": 5133226.0,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0929807186126705,
|
||
|
|
"epoch": 2.427159432746025,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004680924862875996,
|
||
|
|
"loss": 5.8663,
|
||
|
|
"mean_token_accuracy": 0.17087701261043547,
|
||
|
|
"num_tokens": 5142257.0,
|
||
|
|
"step": 2825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.199731492996216,
|
||
|
|
"epoch": 2.4314568113450794,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.00046792160078100605,
|
||
|
|
"loss": 5.8592,
|
||
|
|
"mean_token_accuracy": 0.17053601890802383,
|
||
|
|
"num_tokens": 5150752.0,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.151450777053833,
|
||
|
|
"epoch": 2.435754189944134,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00046775029401074653,
|
||
|
|
"loss": 5.7783,
|
||
|
|
"mean_token_accuracy": 0.17438559532165526,
|
||
|
|
"num_tokens": 5160237.0,
|
||
|
|
"step": 2835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.171485233306885,
|
||
|
|
"epoch": 2.4400515685431885,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.00046757856635088645,
|
||
|
|
"loss": 5.85,
|
||
|
|
"mean_token_accuracy": 0.17521743029356002,
|
||
|
|
"num_tokens": 5169752.0,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1737254619598385,
|
||
|
|
"epoch": 2.444348947142243,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004674064181764105,
|
||
|
|
"loss": 5.8887,
|
||
|
|
"mean_token_accuracy": 0.17213839143514634,
|
||
|
|
"num_tokens": 5178892.0,
|
||
|
|
"step": 2845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.169126319885254,
|
||
|
|
"epoch": 2.448646325741298,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.00046723384986322147,
|
||
|
|
"loss": 5.8736,
|
||
|
|
"mean_token_accuracy": 0.16697555780410767,
|
||
|
|
"num_tokens": 5188468.0,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.121142101287842,
|
||
|
|
"epoch": 2.4529437043403526,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004670608617881395,
|
||
|
|
"loss": 5.7947,
|
||
|
|
"mean_token_accuracy": 0.1755498692393303,
|
||
|
|
"num_tokens": 5197565.0,
|
||
|
|
"step": 2855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.083435106277466,
|
||
|
|
"epoch": 2.457241082939407,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004668874543289014,
|
||
|
|
"loss": 5.7851,
|
||
|
|
"mean_token_accuracy": 0.1805465489625931,
|
||
|
|
"num_tokens": 5205791.0,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.136435890197754,
|
||
|
|
"epoch": 2.4615384615384617,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00046671362786415986,
|
||
|
|
"loss": 5.7872,
|
||
|
|
"mean_token_accuracy": 0.18155153840780258,
|
||
|
|
"num_tokens": 5214773.0,
|
||
|
|
"step": 2865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.082297658920288,
|
||
|
|
"epoch": 2.465835840137516,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.00046653938277348237,
|
||
|
|
"loss": 5.8211,
|
||
|
|
"mean_token_accuracy": 0.1757299304008484,
|
||
|
|
"num_tokens": 5223734.0,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.256624984741211,
|
||
|
|
"epoch": 2.4701332187365708,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004663647194373505,
|
||
|
|
"loss": 5.9026,
|
||
|
|
"mean_token_accuracy": 0.16392517536878587,
|
||
|
|
"num_tokens": 5231742.0,
|
||
|
|
"step": 2875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.135076570510864,
|
||
|
|
"epoch": 2.4744305973356253,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00046618963823715913,
|
||
|
|
"loss": 5.8631,
|
||
|
|
"mean_token_accuracy": 0.17133675366640091,
|
||
|
|
"num_tokens": 5241673.0,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.190168714523315,
|
||
|
|
"epoch": 2.47872797593468,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00046601413955521575,
|
||
|
|
"loss": 5.8246,
|
||
|
|
"mean_token_accuracy": 0.1694057285785675,
|
||
|
|
"num_tokens": 5250082.0,
|
||
|
|
"step": 2885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.136935997009277,
|
||
|
|
"epoch": 2.4830253545337344,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004658382237747393,
|
||
|
|
"loss": 5.8976,
|
||
|
|
"mean_token_accuracy": 0.16706683337688447,
|
||
|
|
"num_tokens": 5259680.0,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.16978874206543,
|
||
|
|
"epoch": 2.487322733132789,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.00046566189127985946,
|
||
|
|
"loss": 5.8769,
|
||
|
|
"mean_token_accuracy": 0.1714440792798996,
|
||
|
|
"num_tokens": 5269561.0,
|
||
|
|
"step": 2895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.182620716094971,
|
||
|
|
"epoch": 2.4916201117318435,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.000465485142455616,
|
||
|
|
"loss": 5.8189,
|
||
|
|
"mean_token_accuracy": 0.17375694811344147,
|
||
|
|
"num_tokens": 5278659.0,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.057879829406739,
|
||
|
|
"epoch": 2.495917490330898,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00046530797768795765,
|
||
|
|
"loss": 5.8103,
|
||
|
|
"mean_token_accuracy": 0.18172994256019592,
|
||
|
|
"num_tokens": 5287619.0,
|
||
|
|
"step": 2905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1459949016571045,
|
||
|
|
"epoch": 2.5002148689299526,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00046513039736374153,
|
||
|
|
"loss": 5.9271,
|
||
|
|
"mean_token_accuracy": 0.16282536834478378,
|
||
|
|
"num_tokens": 5297334.0,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.201943445205688,
|
||
|
|
"epoch": 2.504512247529007,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004649524018707319,
|
||
|
|
"loss": 5.8405,
|
||
|
|
"mean_token_accuracy": 0.1736244261264801,
|
||
|
|
"num_tokens": 5306208.0,
|
||
|
|
"step": 2915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.117348289489746,
|
||
|
|
"epoch": 2.5088096261280617,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00046477399159759996,
|
||
|
|
"loss": 5.7789,
|
||
|
|
"mean_token_accuracy": 0.1744915708899498,
|
||
|
|
"num_tokens": 5314754.0,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.022426891326904,
|
||
|
|
"epoch": 2.5131070047271162,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00046459516693392246,
|
||
|
|
"loss": 5.7951,
|
||
|
|
"mean_token_accuracy": 0.17653965055942536,
|
||
|
|
"num_tokens": 5324000.0,
|
||
|
|
"step": 2925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.192726993560791,
|
||
|
|
"epoch": 2.517404383326171,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004644159282701808,
|
||
|
|
"loss": 5.8412,
|
||
|
|
"mean_token_accuracy": 0.1699216842651367,
|
||
|
|
"num_tokens": 5332478.0,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.193784236907959,
|
||
|
|
"epoch": 2.5217017619252258,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00046423627599776076,
|
||
|
|
"loss": 5.9229,
|
||
|
|
"mean_token_accuracy": 0.1587831899523735,
|
||
|
|
"num_tokens": 5341635.0,
|
||
|
|
"step": 2935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.126192474365235,
|
||
|
|
"epoch": 2.5259991405242803,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000464056210508951,
|
||
|
|
"loss": 5.9125,
|
||
|
|
"mean_token_accuracy": 0.16348374187946318,
|
||
|
|
"num_tokens": 5350144.0,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.17839298248291,
|
||
|
|
"epoch": 2.530296519123335,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004638757321969426,
|
||
|
|
"loss": 5.8251,
|
||
|
|
"mean_token_accuracy": 0.17073310166597366,
|
||
|
|
"num_tokens": 5358788.0,
|
||
|
|
"step": 2945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.144708824157715,
|
||
|
|
"epoch": 2.5345938977223894,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00046369484145582815,
|
||
|
|
"loss": 5.9064,
|
||
|
|
"mean_token_accuracy": 0.16323922872543334,
|
||
|
|
"num_tokens": 5368057.0,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.069336700439453,
|
||
|
|
"epoch": 2.538891276321444,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00046351353868060054,
|
||
|
|
"loss": 5.7586,
|
||
|
|
"mean_token_accuracy": 0.174574413895607,
|
||
|
|
"num_tokens": 5376739.0,
|
||
|
|
"step": 2955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.171047353744507,
|
||
|
|
"epoch": 2.5431886549204985,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00046333182426715273,
|
||
|
|
"loss": 5.8806,
|
||
|
|
"mean_token_accuracy": 0.16850085258483888,
|
||
|
|
"num_tokens": 5385967.0,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.161162233352661,
|
||
|
|
"epoch": 2.547486033519553,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00046314969861227626,
|
||
|
|
"loss": 5.9049,
|
||
|
|
"mean_token_accuracy": 0.15845982432365419,
|
||
|
|
"num_tokens": 5395192.0,
|
||
|
|
"step": 2965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.14454460144043,
|
||
|
|
"epoch": 2.5517834121186076,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004629671621136608,
|
||
|
|
"loss": 5.8588,
|
||
|
|
"mean_token_accuracy": 0.16995412558317186,
|
||
|
|
"num_tokens": 5404694.0,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.158005809783935,
|
||
|
|
"epoch": 2.556080790717662,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004627842151698931,
|
||
|
|
"loss": 5.8623,
|
||
|
|
"mean_token_accuracy": 0.16851141750812532,
|
||
|
|
"num_tokens": 5413102.0,
|
||
|
|
"step": 2975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.134857320785523,
|
||
|
|
"epoch": 2.5603781693167167,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00046260085818045625,
|
||
|
|
"loss": 5.8942,
|
||
|
|
"mean_token_accuracy": 0.16586572974920272,
|
||
|
|
"num_tokens": 5423339.0,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.197592544555664,
|
||
|
|
"epoch": 2.5646755479157712,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004624170915457284,
|
||
|
|
"loss": 5.8504,
|
||
|
|
"mean_token_accuracy": 0.17059714645147322,
|
||
|
|
"num_tokens": 5432377.0,
|
||
|
|
"step": 2985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.128017950057983,
|
||
|
|
"epoch": 2.5689729265148262,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00046223291566698264,
|
||
|
|
"loss": 5.7959,
|
||
|
|
"mean_token_accuracy": 0.17204724699258805,
|
||
|
|
"num_tokens": 5441038.0,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.107345724105835,
|
||
|
|
"epoch": 2.5732703051138808,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004620483309463855,
|
||
|
|
"loss": 5.7918,
|
||
|
|
"mean_token_accuracy": 0.17900732010602952,
|
||
|
|
"num_tokens": 5449557.0,
|
||
|
|
"step": 2995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1927958011627195,
|
||
|
|
"epoch": 2.5775676837129353,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004618633377869961,
|
||
|
|
"loss": 5.9156,
|
||
|
|
"mean_token_accuracy": 0.16568114012479782,
|
||
|
|
"num_tokens": 5458931.0,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5775676837129353,
|
||
|
|
"eval_entropy": 5.998430791201892,
|
||
|
|
"eval_loss": 6.121789455413818,
|
||
|
|
"eval_mean_token_accuracy": 0.16322041645243363,
|
||
|
|
"eval_num_tokens": 5458931.0,
|
||
|
|
"eval_runtime": 2.0487,
|
||
|
|
"eval_samples_per_second": 1732.347,
|
||
|
|
"eval_steps_per_second": 216.726,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.126945543289184,
|
||
|
|
"epoch": 2.58186506231199,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004616779365927656,
|
||
|
|
"loss": 5.7528,
|
||
|
|
"mean_token_accuracy": 0.18461534082889558,
|
||
|
|
"num_tokens": 5468539.0,
|
||
|
|
"step": 3005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.964468240737915,
|
||
|
|
"epoch": 2.5861624409110444,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0004614921277685361,
|
||
|
|
"loss": 5.6994,
|
||
|
|
"mean_token_accuracy": 0.18173616677522658,
|
||
|
|
"num_tokens": 5475710.0,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.099804162979126,
|
||
|
|
"epoch": 2.590459819510099,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00046130591172003976,
|
||
|
|
"loss": 5.845,
|
||
|
|
"mean_token_accuracy": 0.16855668723583223,
|
||
|
|
"num_tokens": 5484597.0,
|
||
|
|
"step": 3015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.216131401062012,
|
||
|
|
"epoch": 2.5947571981091535,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004611192888538981,
|
||
|
|
"loss": 5.9276,
|
||
|
|
"mean_token_accuracy": 0.16257163286209106,
|
||
|
|
"num_tokens": 5493213.0,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1808586597442625,
|
||
|
|
"epoch": 2.599054576708208,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00046093225957762084,
|
||
|
|
"loss": 5.903,
|
||
|
|
"mean_token_accuracy": 0.16862684190273286,
|
||
|
|
"num_tokens": 5502556.0,
|
||
|
|
"step": 3025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1216977596282955,
|
||
|
|
"epoch": 2.6033519553072626,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004607448242996051,
|
||
|
|
"loss": 5.8208,
|
||
|
|
"mean_token_accuracy": 0.1719271272420883,
|
||
|
|
"num_tokens": 5511779.0,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1579231262207035,
|
||
|
|
"epoch": 2.607649333906317,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004605569834291347,
|
||
|
|
"loss": 5.8058,
|
||
|
|
"mean_token_accuracy": 0.18103471398353577,
|
||
|
|
"num_tokens": 5520836.0,
|
||
|
|
"step": 3035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.061151313781738,
|
||
|
|
"epoch": 2.6119467125053717,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00046036873737637904,
|
||
|
|
"loss": 5.8302,
|
||
|
|
"mean_token_accuracy": 0.17482185810804368,
|
||
|
|
"num_tokens": 5529285.0,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.116726493835449,
|
||
|
|
"epoch": 2.6162440911044262,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004601800865523921,
|
||
|
|
"loss": 5.8482,
|
||
|
|
"mean_token_accuracy": 0.1684387966990471,
|
||
|
|
"num_tokens": 5538160.0,
|
||
|
|
"step": 3045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.122728109359741,
|
||
|
|
"epoch": 2.620541469703481,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00045999103136911204,
|
||
|
|
"loss": 5.8517,
|
||
|
|
"mean_token_accuracy": 0.16452286690473555,
|
||
|
|
"num_tokens": 5547355.0,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.120913076400757,
|
||
|
|
"epoch": 2.6248388483025353,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00045980157223935965,
|
||
|
|
"loss": 5.8606,
|
||
|
|
"mean_token_accuracy": 0.16614654809236526,
|
||
|
|
"num_tokens": 5557299.0,
|
||
|
|
"step": 3055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.061937570571899,
|
||
|
|
"epoch": 2.62913622690159,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.00045961170957683806,
|
||
|
|
"loss": 5.7822,
|
||
|
|
"mean_token_accuracy": 0.17485247999429704,
|
||
|
|
"num_tokens": 5565469.0,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.150688505172729,
|
||
|
|
"epoch": 2.6334336055006444,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00045942144379613147,
|
||
|
|
"loss": 5.8945,
|
||
|
|
"mean_token_accuracy": 0.16743394434452058,
|
||
|
|
"num_tokens": 5574740.0,
|
||
|
|
"step": 3065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.152962112426758,
|
||
|
|
"epoch": 2.637730984099699,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00045923077531270426,
|
||
|
|
"loss": 5.8866,
|
||
|
|
"mean_token_accuracy": 0.16888206750154494,
|
||
|
|
"num_tokens": 5583438.0,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.126224088668823,
|
||
|
|
"epoch": 2.6420283626987535,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004590397045429001,
|
||
|
|
"loss": 5.84,
|
||
|
|
"mean_token_accuracy": 0.17367925941944123,
|
||
|
|
"num_tokens": 5592389.0,
|
||
|
|
"step": 3075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.084698152542114,
|
||
|
|
"epoch": 2.646325741297808,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.00045884823190394134,
|
||
|
|
"loss": 5.7589,
|
||
|
|
"mean_token_accuracy": 0.1789909452199936,
|
||
|
|
"num_tokens": 5601598.0,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.075862979888916,
|
||
|
|
"epoch": 2.650623119896863,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004586563578139275,
|
||
|
|
"loss": 5.8461,
|
||
|
|
"mean_token_accuracy": 0.1662924975156784,
|
||
|
|
"num_tokens": 5610498.0,
|
||
|
|
"step": 3085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.096910190582276,
|
||
|
|
"epoch": 2.6549204984959176,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.00045846408269183505,
|
||
|
|
"loss": 5.7512,
|
||
|
|
"mean_token_accuracy": 0.17860534340143203,
|
||
|
|
"num_tokens": 5620082.0,
|
||
|
|
"step": 3090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1647505283355715,
|
||
|
|
"epoch": 2.659217877094972,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00045827140695751603,
|
||
|
|
"loss": 5.8362,
|
||
|
|
"mean_token_accuracy": 0.17174756973981858,
|
||
|
|
"num_tokens": 5630291.0,
|
||
|
|
"step": 3095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.091697454452515,
|
||
|
|
"epoch": 2.6635152556940267,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004580783310316971,
|
||
|
|
"loss": 5.8104,
|
||
|
|
"mean_token_accuracy": 0.17255474478006363,
|
||
|
|
"num_tokens": 5638784.0,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.026739645004272,
|
||
|
|
"epoch": 2.6678126342930812,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00045788485533597895,
|
||
|
|
"loss": 5.6819,
|
||
|
|
"mean_token_accuracy": 0.18163852095603944,
|
||
|
|
"num_tokens": 5647968.0,
|
||
|
|
"step": 3105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.098209285736084,
|
||
|
|
"epoch": 2.672110012892136,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00045769098029283526,
|
||
|
|
"loss": 5.906,
|
||
|
|
"mean_token_accuracy": 0.16296559423208237,
|
||
|
|
"num_tokens": 5657543.0,
|
||
|
|
"step": 3110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.150312328338623,
|
||
|
|
"epoch": 2.6764073914911903,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004574967063256115,
|
||
|
|
"loss": 5.836,
|
||
|
|
"mean_token_accuracy": 0.17701750695705415,
|
||
|
|
"num_tokens": 5666535.0,
|
||
|
|
"step": 3115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1265421390533445,
|
||
|
|
"epoch": 2.680704770090245,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00045730203385852447,
|
||
|
|
"loss": 5.9135,
|
||
|
|
"mean_token_accuracy": 0.16741105765104294,
|
||
|
|
"num_tokens": 5676273.0,
|
||
|
|
"step": 3120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.052946949005127,
|
||
|
|
"epoch": 2.6850021486892994,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000457106963316661,
|
||
|
|
"loss": 5.8151,
|
||
|
|
"mean_token_accuracy": 0.1772770792245865,
|
||
|
|
"num_tokens": 5684888.0,
|
||
|
|
"step": 3125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.088335084915161,
|
||
|
|
"epoch": 2.689299527288354,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00045691149512597717,
|
||
|
|
"loss": 5.8631,
|
||
|
|
"mean_token_accuracy": 0.16669325679540634,
|
||
|
|
"num_tokens": 5693626.0,
|
||
|
|
"step": 3130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.180005502700806,
|
||
|
|
"epoch": 2.6935969058874085,
|
||
|
|
"grad_norm": 1.4453125,
|
||
|
|
"learning_rate": 0.00045671562971329736,
|
||
|
|
"loss": 5.7649,
|
||
|
|
"mean_token_accuracy": 0.18092152327299119,
|
||
|
|
"num_tokens": 5702542.0,
|
||
|
|
"step": 3135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.056423187255859,
|
||
|
|
"epoch": 2.6978942844864635,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00045651936750631337,
|
||
|
|
"loss": 5.8131,
|
||
|
|
"mean_token_accuracy": 0.17378336936235428,
|
||
|
|
"num_tokens": 5711440.0,
|
||
|
|
"step": 3140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.189997816085816,
|
||
|
|
"epoch": 2.702191663085518,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00045632270893358333,
|
||
|
|
"loss": 5.8825,
|
||
|
|
"mean_token_accuracy": 0.17272377163171768,
|
||
|
|
"num_tokens": 5721495.0,
|
||
|
|
"step": 3145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.167654418945313,
|
||
|
|
"epoch": 2.7064890416845726,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004561256544245312,
|
||
|
|
"loss": 5.9067,
|
||
|
|
"mean_token_accuracy": 0.1615714728832245,
|
||
|
|
"num_tokens": 5730664.0,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.04947509765625,
|
||
|
|
"epoch": 2.710786420283627,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000455928204409445,
|
||
|
|
"loss": 5.79,
|
||
|
|
"mean_token_accuracy": 0.17923566401004792,
|
||
|
|
"num_tokens": 5740229.0,
|
||
|
|
"step": 3155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.107324123382568,
|
||
|
|
"epoch": 2.7150837988826817,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00045573035931947684,
|
||
|
|
"loss": 5.7791,
|
||
|
|
"mean_token_accuracy": 0.17757482677698136,
|
||
|
|
"num_tokens": 5748549.0,
|
||
|
|
"step": 3160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.101696872711182,
|
||
|
|
"epoch": 2.7193811774817362,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004555321195866411,
|
||
|
|
"loss": 5.732,
|
||
|
|
"mean_token_accuracy": 0.17644069641828536,
|
||
|
|
"num_tokens": 5757603.0,
|
||
|
|
"step": 3165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.136196327209473,
|
||
|
|
"epoch": 2.723678556080791,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0004553334856438143,
|
||
|
|
"loss": 5.9098,
|
||
|
|
"mean_token_accuracy": 0.16370768547058107,
|
||
|
|
"num_tokens": 5767520.0,
|
||
|
|
"step": 3170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1458038806915285,
|
||
|
|
"epoch": 2.7279759346798453,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.00045513445792473356,
|
||
|
|
"loss": 5.8906,
|
||
|
|
"mean_token_accuracy": 0.16408973336219787,
|
||
|
|
"num_tokens": 5776778.0,
|
||
|
|
"step": 3175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.174926614761352,
|
||
|
|
"epoch": 2.7322733132789,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004549350368639958,
|
||
|
|
"loss": 5.9249,
|
||
|
|
"mean_token_accuracy": 0.16355405300855635,
|
||
|
|
"num_tokens": 5785652.0,
|
||
|
|
"step": 3180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.212893629074097,
|
||
|
|
"epoch": 2.7365706918779544,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00045473522289705693,
|
||
|
|
"loss": 5.8811,
|
||
|
|
"mean_token_accuracy": 0.1690053179860115,
|
||
|
|
"num_tokens": 5795766.0,
|
||
|
|
"step": 3185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0142913341522215,
|
||
|
|
"epoch": 2.740868070477009,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00045453501646023085,
|
||
|
|
"loss": 5.9293,
|
||
|
|
"mean_token_accuracy": 0.16316341012716293,
|
||
|
|
"num_tokens": 5804504.0,
|
||
|
|
"step": 3190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.090119218826294,
|
||
|
|
"epoch": 2.7451654490760635,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.00045433441799068837,
|
||
|
|
"loss": 5.8318,
|
||
|
|
"mean_token_accuracy": 0.17157045751810074,
|
||
|
|
"num_tokens": 5814161.0,
|
||
|
|
"step": 3195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.133489179611206,
|
||
|
|
"epoch": 2.749462827675118,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004541334279264562,
|
||
|
|
"loss": 5.7556,
|
||
|
|
"mean_token_accuracy": 0.17994108349084853,
|
||
|
|
"num_tokens": 5822235.0,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.069830846786499,
|
||
|
|
"epoch": 2.7537602062741726,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00045393204670641656,
|
||
|
|
"loss": 5.7589,
|
||
|
|
"mean_token_accuracy": 0.17203548699617385,
|
||
|
|
"num_tokens": 5831572.0,
|
||
|
|
"step": 3205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9929163455963135,
|
||
|
|
"epoch": 2.758057584873227,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004537302747703055,
|
||
|
|
"loss": 5.7621,
|
||
|
|
"mean_token_accuracy": 0.18025242835283278,
|
||
|
|
"num_tokens": 5839694.0,
|
||
|
|
"step": 3210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.185488748550415,
|
||
|
|
"epoch": 2.7623549634722817,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00045352811255871216,
|
||
|
|
"loss": 5.8899,
|
||
|
|
"mean_token_accuracy": 0.17093945741653443,
|
||
|
|
"num_tokens": 5849131.0,
|
||
|
|
"step": 3215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.186608505249024,
|
||
|
|
"epoch": 2.7666523420713363,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00045332556051307804,
|
||
|
|
"loss": 5.8208,
|
||
|
|
"mean_token_accuracy": 0.16853767782449722,
|
||
|
|
"num_tokens": 5858861.0,
|
||
|
|
"step": 3220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.110893869400025,
|
||
|
|
"epoch": 2.770949720670391,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00045312261907569585,
|
||
|
|
"loss": 5.82,
|
||
|
|
"mean_token_accuracy": 0.17171475738286973,
|
||
|
|
"num_tokens": 5867585.0,
|
||
|
|
"step": 3225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.081268453598023,
|
||
|
|
"epoch": 2.775247099269446,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00045291928868970867,
|
||
|
|
"loss": 5.8317,
|
||
|
|
"mean_token_accuracy": 0.16950544714927673,
|
||
|
|
"num_tokens": 5876256.0,
|
||
|
|
"step": 3230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.064776659011841,
|
||
|
|
"epoch": 2.7795444778685003,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004527155697991087,
|
||
|
|
"loss": 5.8911,
|
||
|
|
"mean_token_accuracy": 0.16254067420959473,
|
||
|
|
"num_tokens": 5885302.0,
|
||
|
|
"step": 3235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.128396034240723,
|
||
|
|
"epoch": 2.783841856467555,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004525114628487365,
|
||
|
|
"loss": 5.9091,
|
||
|
|
"mean_token_accuracy": 0.16473145335912703,
|
||
|
|
"num_tokens": 5895066.0,
|
||
|
|
"step": 3240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1276613712310795,
|
||
|
|
"epoch": 2.7881392350666094,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00045230696828428026,
|
||
|
|
"loss": 5.8938,
|
||
|
|
"mean_token_accuracy": 0.16614799648523332,
|
||
|
|
"num_tokens": 5903258.0,
|
||
|
|
"step": 3245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.09830675125122,
|
||
|
|
"epoch": 2.792436613665664,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004521020865522742,
|
||
|
|
"loss": 5.7738,
|
||
|
|
"mean_token_accuracy": 0.1714928478002548,
|
||
|
|
"num_tokens": 5911714.0,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.070488023757934,
|
||
|
|
"epoch": 2.7967339922647185,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00045189681810009827,
|
||
|
|
"loss": 5.8635,
|
||
|
|
"mean_token_accuracy": 0.16751533150672912,
|
||
|
|
"num_tokens": 5920432.0,
|
||
|
|
"step": 3255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.227630186080932,
|
||
|
|
"epoch": 2.801031370863773,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00045169116337597653,
|
||
|
|
"loss": 5.8701,
|
||
|
|
"mean_token_accuracy": 0.17065902799367905,
|
||
|
|
"num_tokens": 5929202.0,
|
||
|
|
"step": 3260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.189503717422485,
|
||
|
|
"epoch": 2.8053287494628276,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000451485122828977,
|
||
|
|
"loss": 5.9003,
|
||
|
|
"mean_token_accuracy": 0.1647379770874977,
|
||
|
|
"num_tokens": 5938034.0,
|
||
|
|
"step": 3265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.010164356231689,
|
||
|
|
"epoch": 2.809626128061882,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.00045127869690900956,
|
||
|
|
"loss": 5.7485,
|
||
|
|
"mean_token_accuracy": 0.17689475119113923,
|
||
|
|
"num_tokens": 5946944.0,
|
||
|
|
"step": 3270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.029814195632935,
|
||
|
|
"epoch": 2.8139235066609367,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00045107188606682613,
|
||
|
|
"loss": 5.8498,
|
||
|
|
"mean_token_accuracy": 0.17715609222650527,
|
||
|
|
"num_tokens": 5956475.0,
|
||
|
|
"step": 3275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.185597848892212,
|
||
|
|
"epoch": 2.8182208852599913,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004508646907540188,
|
||
|
|
"loss": 5.8236,
|
||
|
|
"mean_token_accuracy": 0.16963610351085662,
|
||
|
|
"num_tokens": 5965814.0,
|
||
|
|
"step": 3280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.105741548538208,
|
||
|
|
"epoch": 2.8225182638590463,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004506571114230195,
|
||
|
|
"loss": 5.8687,
|
||
|
|
"mean_token_accuracy": 0.16442400217056274,
|
||
|
|
"num_tokens": 5973850.0,
|
||
|
|
"step": 3285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0313629627227785,
|
||
|
|
"epoch": 2.826815642458101,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.00045044914852709824,
|
||
|
|
"loss": 5.8113,
|
||
|
|
"mean_token_accuracy": 0.16617825627326965,
|
||
|
|
"num_tokens": 5982987.0,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.152327919006348,
|
||
|
|
"epoch": 2.8311130210571553,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004502408025203631,
|
||
|
|
"loss": 5.7981,
|
||
|
|
"mean_token_accuracy": 0.17620996087789537,
|
||
|
|
"num_tokens": 5992227.0,
|
||
|
|
"step": 3295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.093041801452637,
|
||
|
|
"epoch": 2.83541039965621,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004500320738577584,
|
||
|
|
"loss": 5.7804,
|
||
|
|
"mean_token_accuracy": 0.17178058624267578,
|
||
|
|
"num_tokens": 6000243.0,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.071863269805908,
|
||
|
|
"epoch": 2.8397077782552644,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.00044982296299506407,
|
||
|
|
"loss": 5.7959,
|
||
|
|
"mean_token_accuracy": 0.1757694289088249,
|
||
|
|
"num_tokens": 6009771.0,
|
||
|
|
"step": 3305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.104401445388794,
|
||
|
|
"epoch": 2.844005156854319,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004496134703888948,
|
||
|
|
"loss": 5.8655,
|
||
|
|
"mean_token_accuracy": 0.16886720359325408,
|
||
|
|
"num_tokens": 6018683.0,
|
||
|
|
"step": 3310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.063603019714355,
|
||
|
|
"epoch": 2.8483025354533735,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00044940359649669846,
|
||
|
|
"loss": 5.7182,
|
||
|
|
"mean_token_accuracy": 0.1814822018146515,
|
||
|
|
"num_tokens": 6027422.0,
|
||
|
|
"step": 3315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0563880443573,
|
||
|
|
"epoch": 2.852599914052428,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00044919334177675595,
|
||
|
|
"loss": 5.8185,
|
||
|
|
"mean_token_accuracy": 0.16714439690113067,
|
||
|
|
"num_tokens": 6035670.0,
|
||
|
|
"step": 3320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.098821926116943,
|
||
|
|
"epoch": 2.8568972926514826,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00044898270668817955,
|
||
|
|
"loss": 5.7433,
|
||
|
|
"mean_token_accuracy": 0.17498091757297515,
|
||
|
|
"num_tokens": 6044092.0,
|
||
|
|
"step": 3325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.041405916213989,
|
||
|
|
"epoch": 2.861194671250537,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000448771691690912,
|
||
|
|
"loss": 5.8089,
|
||
|
|
"mean_token_accuracy": 0.17252034097909927,
|
||
|
|
"num_tokens": 6053970.0,
|
||
|
|
"step": 3330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.098532438278198,
|
||
|
|
"epoch": 2.8654920498495917,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004485602972457257,
|
||
|
|
"loss": 5.7875,
|
||
|
|
"mean_token_accuracy": 0.17401470988988876,
|
||
|
|
"num_tokens": 6062965.0,
|
||
|
|
"step": 3335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.10422191619873,
|
||
|
|
"epoch": 2.8697894284486463,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00044834852381422165,
|
||
|
|
"loss": 5.8375,
|
||
|
|
"mean_token_accuracy": 0.17349963784217834,
|
||
|
|
"num_tokens": 6072420.0,
|
||
|
|
"step": 3340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.048533582687378,
|
||
|
|
"epoch": 2.874086807047701,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00044813637185882836,
|
||
|
|
"loss": 5.7604,
|
||
|
|
"mean_token_accuracy": 0.17201080173254013,
|
||
|
|
"num_tokens": 6080915.0,
|
||
|
|
"step": 3345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.129676723480225,
|
||
|
|
"epoch": 2.8783841856467554,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00044792384184280106,
|
||
|
|
"loss": 5.8898,
|
||
|
|
"mean_token_accuracy": 0.16713710129261017,
|
||
|
|
"num_tokens": 6090453.0,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.036713743209839,
|
||
|
|
"epoch": 2.88268156424581,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00044771093423022013,
|
||
|
|
"loss": 5.9178,
|
||
|
|
"mean_token_accuracy": 0.16426213681697846,
|
||
|
|
"num_tokens": 6099390.0,
|
||
|
|
"step": 3355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.090553140640258,
|
||
|
|
"epoch": 2.8869789428448644,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004474976494859909,
|
||
|
|
"loss": 5.8439,
|
||
|
|
"mean_token_accuracy": 0.17439688742160797,
|
||
|
|
"num_tokens": 6108677.0,
|
||
|
|
"step": 3360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.084423589706421,
|
||
|
|
"epoch": 2.891276321443919,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004472839880758419,
|
||
|
|
"loss": 5.7572,
|
||
|
|
"mean_token_accuracy": 0.17288744151592256,
|
||
|
|
"num_tokens": 6117151.0,
|
||
|
|
"step": 3365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.169969892501831,
|
||
|
|
"epoch": 2.8955737000429735,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004470699504663242,
|
||
|
|
"loss": 5.8724,
|
||
|
|
"mean_token_accuracy": 0.1652231350541115,
|
||
|
|
"num_tokens": 6127167.0,
|
||
|
|
"step": 3370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.055519533157349,
|
||
|
|
"epoch": 2.899871078642028,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004468555371248104,
|
||
|
|
"loss": 5.7663,
|
||
|
|
"mean_token_accuracy": 0.17967537939548492,
|
||
|
|
"num_tokens": 6136487.0,
|
||
|
|
"step": 3375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.096647262573242,
|
||
|
|
"epoch": 2.904168457241083,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004466407485194937,
|
||
|
|
"loss": 5.8808,
|
||
|
|
"mean_token_accuracy": 0.16516373604536055,
|
||
|
|
"num_tokens": 6145334.0,
|
||
|
|
"step": 3380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.091698265075683,
|
||
|
|
"epoch": 2.9084658358401376,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004464255851193864,
|
||
|
|
"loss": 5.7913,
|
||
|
|
"mean_token_accuracy": 0.17120025604963302,
|
||
|
|
"num_tokens": 6155062.0,
|
||
|
|
"step": 3385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.080928611755371,
|
||
|
|
"epoch": 2.912763214439192,
|
||
|
|
"grad_norm": 1.7265625,
|
||
|
|
"learning_rate": 0.0004462100473943194,
|
||
|
|
"loss": 5.7627,
|
||
|
|
"mean_token_accuracy": 0.17752974182367326,
|
||
|
|
"num_tokens": 6164313.0,
|
||
|
|
"step": 3390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.061914777755737,
|
||
|
|
"epoch": 2.9170605930382467,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000445994135814941,
|
||
|
|
"loss": 5.8024,
|
||
|
|
"mean_token_accuracy": 0.17023618370294571,
|
||
|
|
"num_tokens": 6173513.0,
|
||
|
|
"step": 3395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.057987403869629,
|
||
|
|
"epoch": 2.9213579716373013,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00044577785085271566,
|
||
|
|
"loss": 5.8041,
|
||
|
|
"mean_token_accuracy": 0.17476166486740113,
|
||
|
|
"num_tokens": 6182000.0,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1352544784545895,
|
||
|
|
"epoch": 2.925655350236356,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004455611929799235,
|
||
|
|
"loss": 5.8516,
|
||
|
|
"mean_token_accuracy": 0.1572086051106453,
|
||
|
|
"num_tokens": 6191887.0,
|
||
|
|
"step": 3405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.025879716873169,
|
||
|
|
"epoch": 2.9299527288354104,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004453441626696585,
|
||
|
|
"loss": 5.885,
|
||
|
|
"mean_token_accuracy": 0.16230087578296662,
|
||
|
|
"num_tokens": 6202897.0,
|
||
|
|
"step": 3410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.132012939453125,
|
||
|
|
"epoch": 2.934250107434465,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00044512676039582823,
|
||
|
|
"loss": 5.7891,
|
||
|
|
"mean_token_accuracy": 0.1754133865237236,
|
||
|
|
"num_tokens": 6211811.0,
|
||
|
|
"step": 3415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.114519882202148,
|
||
|
|
"epoch": 2.9385474860335195,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004449089866331524,
|
||
|
|
"loss": 5.7826,
|
||
|
|
"mean_token_accuracy": 0.18096065670251846,
|
||
|
|
"num_tokens": 6219896.0,
|
||
|
|
"step": 3420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.983143472671509,
|
||
|
|
"epoch": 2.942844864632574,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004446908418571617,
|
||
|
|
"loss": 5.7734,
|
||
|
|
"mean_token_accuracy": 0.1765346944332123,
|
||
|
|
"num_tokens": 6228212.0,
|
||
|
|
"step": 3425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059330701828003,
|
||
|
|
"epoch": 2.9471422432316285,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004444723265441973,
|
||
|
|
"loss": 5.9301,
|
||
|
|
"mean_token_accuracy": 0.1656051605939865,
|
||
|
|
"num_tokens": 6238133.0,
|
||
|
|
"step": 3430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.08131365776062,
|
||
|
|
"epoch": 2.9514396218306835,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004442534411714092,
|
||
|
|
"loss": 5.8366,
|
||
|
|
"mean_token_accuracy": 0.1650673657655716,
|
||
|
|
"num_tokens": 6247331.0,
|
||
|
|
"step": 3435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.160918760299682,
|
||
|
|
"epoch": 2.955737000429738,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00044403418621675555,
|
||
|
|
"loss": 5.8406,
|
||
|
|
"mean_token_accuracy": 0.16983808875083922,
|
||
|
|
"num_tokens": 6255280.0,
|
||
|
|
"step": 3440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.073430061340332,
|
||
|
|
"epoch": 2.9600343790287926,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004438145621590017,
|
||
|
|
"loss": 5.7939,
|
||
|
|
"mean_token_accuracy": 0.17472269237041474,
|
||
|
|
"num_tokens": 6264752.0,
|
||
|
|
"step": 3445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.033823823928833,
|
||
|
|
"epoch": 2.964331757627847,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00044359456947771857,
|
||
|
|
"loss": 5.7495,
|
||
|
|
"mean_token_accuracy": 0.172511225938797,
|
||
|
|
"num_tokens": 6273258.0,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.891212129592896,
|
||
|
|
"epoch": 2.9686291362269017,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004433742086532824,
|
||
|
|
"loss": 5.6668,
|
||
|
|
"mean_token_accuracy": 0.19016601592302323,
|
||
|
|
"num_tokens": 6281584.0,
|
||
|
|
"step": 3455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.076795339584351,
|
||
|
|
"epoch": 2.9729265148259563,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00044315348016687317,
|
||
|
|
"loss": 5.7854,
|
||
|
|
"mean_token_accuracy": 0.17181758135557174,
|
||
|
|
"num_tokens": 6290016.0,
|
||
|
|
"step": 3460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.06014461517334,
|
||
|
|
"epoch": 2.977223893425011,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004429323845004736,
|
||
|
|
"loss": 5.694,
|
||
|
|
"mean_token_accuracy": 0.17798333764076232,
|
||
|
|
"num_tokens": 6298569.0,
|
||
|
|
"step": 3465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.982924079895019,
|
||
|
|
"epoch": 2.9815212720240654,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00044271092213686824,
|
||
|
|
"loss": 5.7296,
|
||
|
|
"mean_token_accuracy": 0.17693220674991608,
|
||
|
|
"num_tokens": 6307684.0,
|
||
|
|
"step": 3470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1649445533752445,
|
||
|
|
"epoch": 2.98581865062312,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.00044248909355964247,
|
||
|
|
"loss": 5.8556,
|
||
|
|
"mean_token_accuracy": 0.1716341868042946,
|
||
|
|
"num_tokens": 6317767.0,
|
||
|
|
"step": 3475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.146809720993042,
|
||
|
|
"epoch": 2.9901160292221745,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00044226689925318117,
|
||
|
|
"loss": 5.8931,
|
||
|
|
"mean_token_accuracy": 0.16468499451875687,
|
||
|
|
"num_tokens": 6327457.0,
|
||
|
|
"step": 3480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.985245990753174,
|
||
|
|
"epoch": 2.994413407821229,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00044204433970266785,
|
||
|
|
"loss": 5.6945,
|
||
|
|
"mean_token_accuracy": 0.18739936202764512,
|
||
|
|
"num_tokens": 6335747.0,
|
||
|
|
"step": 3485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.050507545471191,
|
||
|
|
"epoch": 2.9987107864202835,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004418214153940837,
|
||
|
|
"loss": 5.7846,
|
||
|
|
"mean_token_accuracy": 0.1760311618447304,
|
||
|
|
"num_tokens": 6344750.0,
|
||
|
|
"step": 3490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.092853705088298,
|
||
|
|
"epoch": 3.002578427159433,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.00044159812681420624,
|
||
|
|
"loss": 5.7217,
|
||
|
|
"mean_token_accuracy": 0.17525596585538653,
|
||
|
|
"num_tokens": 6354779.0,
|
||
|
|
"step": 3495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.122584819793701,
|
||
|
|
"epoch": 3.0068758057584875,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004413744744506086,
|
||
|
|
"loss": 5.506,
|
||
|
|
"mean_token_accuracy": 0.1860961213707924,
|
||
|
|
"num_tokens": 6363809.0,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0068758057584875,
|
||
|
|
"eval_entropy": 5.801608745042269,
|
||
|
|
"eval_loss": 6.042037010192871,
|
||
|
|
"eval_mean_token_accuracy": 0.1686659706336958,
|
||
|
|
"eval_num_tokens": 6363809.0,
|
||
|
|
"eval_runtime": 2.0476,
|
||
|
|
"eval_samples_per_second": 1733.255,
|
||
|
|
"eval_steps_per_second": 216.84,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.992935609817505,
|
||
|
|
"epoch": 3.011173184357542,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00044115045879165806,
|
||
|
|
"loss": 5.563,
|
||
|
|
"mean_token_accuracy": 0.18435313254594804,
|
||
|
|
"num_tokens": 6373082.0,
|
||
|
|
"step": 3505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.053584480285645,
|
||
|
|
"epoch": 3.0154705629565965,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00044092608032651515,
|
||
|
|
"loss": 5.5261,
|
||
|
|
"mean_token_accuracy": 0.1837206542491913,
|
||
|
|
"num_tokens": 6381286.0,
|
||
|
|
"step": 3510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.083251333236694,
|
||
|
|
"epoch": 3.019767941555651,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.00044070133954513305,
|
||
|
|
"loss": 5.4729,
|
||
|
|
"mean_token_accuracy": 0.19432286769151688,
|
||
|
|
"num_tokens": 6390217.0,
|
||
|
|
"step": 3515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.058011102676391,
|
||
|
|
"epoch": 3.0240653201547056,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.0004404762369382555,
|
||
|
|
"loss": 5.5036,
|
||
|
|
"mean_token_accuracy": 0.18731357306241989,
|
||
|
|
"num_tokens": 6399276.0,
|
||
|
|
"step": 3520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.000890445709229,
|
||
|
|
"epoch": 3.02836269875376,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00044025077299741683,
|
||
|
|
"loss": 5.4811,
|
||
|
|
"mean_token_accuracy": 0.192198945581913,
|
||
|
|
"num_tokens": 6407981.0,
|
||
|
|
"step": 3525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.988429880142212,
|
||
|
|
"epoch": 3.0326600773528147,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00044002494821494007,
|
||
|
|
"loss": 5.4804,
|
||
|
|
"mean_token_accuracy": 0.18921354711055755,
|
||
|
|
"num_tokens": 6416159.0,
|
||
|
|
"step": 3530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9463738918304445,
|
||
|
|
"epoch": 3.0369574559518693,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00043979876308393635,
|
||
|
|
"loss": 5.531,
|
||
|
|
"mean_token_accuracy": 0.1913963183760643,
|
||
|
|
"num_tokens": 6424564.0,
|
||
|
|
"step": 3535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.106854009628296,
|
||
|
|
"epoch": 3.041254834550924,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004395722180983036,
|
||
|
|
"loss": 5.5823,
|
||
|
|
"mean_token_accuracy": 0.18249945044517518,
|
||
|
|
"num_tokens": 6434163.0,
|
||
|
|
"step": 3540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.950508308410645,
|
||
|
|
"epoch": 3.0455522131499784,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00043934531375272535,
|
||
|
|
"loss": 5.3919,
|
||
|
|
"mean_token_accuracy": 0.20384220778942108,
|
||
|
|
"num_tokens": 6443372.0,
|
||
|
|
"step": 3545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.974466180801391,
|
||
|
|
"epoch": 3.049849591749033,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.00043911805054267015,
|
||
|
|
"loss": 5.4833,
|
||
|
|
"mean_token_accuracy": 0.18905829787254333,
|
||
|
|
"num_tokens": 6452638.0,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.111138391494751,
|
||
|
|
"epoch": 3.0541469703480875,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00043889042896439004,
|
||
|
|
"loss": 5.4924,
|
||
|
|
"mean_token_accuracy": 0.19172994196414947,
|
||
|
|
"num_tokens": 6461319.0,
|
||
|
|
"step": 3555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.002539110183716,
|
||
|
|
"epoch": 3.0584443489471425,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.00043866244951491946,
|
||
|
|
"loss": 5.4305,
|
||
|
|
"mean_token_accuracy": 0.1999826490879059,
|
||
|
|
"num_tokens": 6469506.0,
|
||
|
|
"step": 3560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.020529794692993,
|
||
|
|
"epoch": 3.062741727546197,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00043843411269207445,
|
||
|
|
"loss": 5.4837,
|
||
|
|
"mean_token_accuracy": 0.19121226519346238,
|
||
|
|
"num_tokens": 6478404.0,
|
||
|
|
"step": 3565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9611005783081055,
|
||
|
|
"epoch": 3.0670391061452515,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004382054189944514,
|
||
|
|
"loss": 5.433,
|
||
|
|
"mean_token_accuracy": 0.18942490667104722,
|
||
|
|
"num_tokens": 6487447.0,
|
||
|
|
"step": 3570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9097977638244625,
|
||
|
|
"epoch": 3.071336484744306,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004379763689214259,
|
||
|
|
"loss": 5.469,
|
||
|
|
"mean_token_accuracy": 0.18396330773830413,
|
||
|
|
"num_tokens": 6496738.0,
|
||
|
|
"step": 3575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.013470220565796,
|
||
|
|
"epoch": 3.0756338633433606,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004377469629731518,
|
||
|
|
"loss": 5.4752,
|
||
|
|
"mean_token_accuracy": 0.1895818755030632,
|
||
|
|
"num_tokens": 6505848.0,
|
||
|
|
"step": 3580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.006653928756714,
|
||
|
|
"epoch": 3.079931241942415,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004375172016505599,
|
||
|
|
"loss": 5.4558,
|
||
|
|
"mean_token_accuracy": 0.18824636489152907,
|
||
|
|
"num_tokens": 6515731.0,
|
||
|
|
"step": 3585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.979631328582764,
|
||
|
|
"epoch": 3.0842286205414697,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004372870854553572,
|
||
|
|
"loss": 5.5152,
|
||
|
|
"mean_token_accuracy": 0.18944674283266066,
|
||
|
|
"num_tokens": 6524914.0,
|
||
|
|
"step": 3590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.99342303276062,
|
||
|
|
"epoch": 3.0885259991405243,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004370566148900255,
|
||
|
|
"loss": 5.4967,
|
||
|
|
"mean_token_accuracy": 0.19440635293722153,
|
||
|
|
"num_tokens": 6533712.0,
|
||
|
|
"step": 3595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0267222881317135,
|
||
|
|
"epoch": 3.092823377739579,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00043682579045782024,
|
||
|
|
"loss": 5.5786,
|
||
|
|
"mean_token_accuracy": 0.18650965839624406,
|
||
|
|
"num_tokens": 6543313.0,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.940178155899048,
|
||
|
|
"epoch": 3.0971207563386334,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004365946126627699,
|
||
|
|
"loss": 5.4649,
|
||
|
|
"mean_token_accuracy": 0.19772678166627883,
|
||
|
|
"num_tokens": 6551634.0,
|
||
|
|
"step": 3605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.004144239425659,
|
||
|
|
"epoch": 3.101418134937688,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00043636308200967433,
|
||
|
|
"loss": 5.4821,
|
||
|
|
"mean_token_accuracy": 0.1942768707871437,
|
||
|
|
"num_tokens": 6560695.0,
|
||
|
|
"step": 3610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.857456827163697,
|
||
|
|
"epoch": 3.1057155135367425,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004361311990041039,
|
||
|
|
"loss": 5.3753,
|
||
|
|
"mean_token_accuracy": 0.19874223917722703,
|
||
|
|
"num_tokens": 6569086.0,
|
||
|
|
"step": 3615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.919683027267456,
|
||
|
|
"epoch": 3.110012892135797,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00043589896415239843,
|
||
|
|
"loss": 5.4564,
|
||
|
|
"mean_token_accuracy": 0.1986413672566414,
|
||
|
|
"num_tokens": 6578287.0,
|
||
|
|
"step": 3620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.956605434417725,
|
||
|
|
"epoch": 3.1143102707348516,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00043566637796166595,
|
||
|
|
"loss": 5.5147,
|
||
|
|
"mean_token_accuracy": 0.18752527385950088,
|
||
|
|
"num_tokens": 6587015.0,
|
||
|
|
"step": 3625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9813155174255375,
|
||
|
|
"epoch": 3.118607649333906,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00043543344093978186,
|
||
|
|
"loss": 5.5585,
|
||
|
|
"mean_token_accuracy": 0.18545775562524797,
|
||
|
|
"num_tokens": 6596187.0,
|
||
|
|
"step": 3630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.964481592178345,
|
||
|
|
"epoch": 3.122905027932961,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00043520015359538745,
|
||
|
|
"loss": 5.4268,
|
||
|
|
"mean_token_accuracy": 0.19721884578466414,
|
||
|
|
"num_tokens": 6605226.0,
|
||
|
|
"step": 3635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.862498092651367,
|
||
|
|
"epoch": 3.1272024065320156,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004349665164378891,
|
||
|
|
"loss": 5.475,
|
||
|
|
"mean_token_accuracy": 0.18966546505689622,
|
||
|
|
"num_tokens": 6613232.0,
|
||
|
|
"step": 3640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.976254987716675,
|
||
|
|
"epoch": 3.13149978513107,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00043473252997745684,
|
||
|
|
"loss": 5.4789,
|
||
|
|
"mean_token_accuracy": 0.18647109866142272,
|
||
|
|
"num_tokens": 6622247.0,
|
||
|
|
"step": 3645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.025827789306641,
|
||
|
|
"epoch": 3.1357971637301247,
|
||
|
|
"grad_norm": 1.71875,
|
||
|
|
"learning_rate": 0.00043449819472502366,
|
||
|
|
"loss": 5.4281,
|
||
|
|
"mean_token_accuracy": 0.19298454523086547,
|
||
|
|
"num_tokens": 6630883.0,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.921304559707641,
|
||
|
|
"epoch": 3.1400945423291793,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004342635111922841,
|
||
|
|
"loss": 5.5595,
|
||
|
|
"mean_token_accuracy": 0.18861598372459412,
|
||
|
|
"num_tokens": 6639399.0,
|
||
|
|
"step": 3655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.989827823638916,
|
||
|
|
"epoch": 3.144391920928234,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004340284798916931,
|
||
|
|
"loss": 5.483,
|
||
|
|
"mean_token_accuracy": 0.19412256628274918,
|
||
|
|
"num_tokens": 6649288.0,
|
||
|
|
"step": 3660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.921028423309326,
|
||
|
|
"epoch": 3.1486892995272884,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004337931013364653,
|
||
|
|
"loss": 5.4165,
|
||
|
|
"mean_token_accuracy": 0.19552054554224013,
|
||
|
|
"num_tokens": 6658670.0,
|
||
|
|
"step": 3665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.969826030731201,
|
||
|
|
"epoch": 3.152986678126343,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000433557376040573,
|
||
|
|
"loss": 5.4991,
|
||
|
|
"mean_token_accuracy": 0.1942813739180565,
|
||
|
|
"num_tokens": 6667302.0,
|
||
|
|
"step": 3670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.992925643920898,
|
||
|
|
"epoch": 3.1572840567253975,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00043332130451874645,
|
||
|
|
"loss": 5.5383,
|
||
|
|
"mean_token_accuracy": 0.1936521127820015,
|
||
|
|
"num_tokens": 6677393.0,
|
||
|
|
"step": 3675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.003905582427978,
|
||
|
|
"epoch": 3.161581435324452,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.00043308488728647127,
|
||
|
|
"loss": 5.5183,
|
||
|
|
"mean_token_accuracy": 0.18625610321760178,
|
||
|
|
"num_tokens": 6686727.0,
|
||
|
|
"step": 3680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.899046134948731,
|
||
|
|
"epoch": 3.1658788139235066,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004328481248599882,
|
||
|
|
"loss": 5.4279,
|
||
|
|
"mean_token_accuracy": 0.196131394803524,
|
||
|
|
"num_tokens": 6696116.0,
|
||
|
|
"step": 3685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.968793296813965,
|
||
|
|
"epoch": 3.170176192522561,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004326110177562918,
|
||
|
|
"loss": 5.5429,
|
||
|
|
"mean_token_accuracy": 0.18541710525751115,
|
||
|
|
"num_tokens": 6704640.0,
|
||
|
|
"step": 3690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.916857767105102,
|
||
|
|
"epoch": 3.1744735711216157,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00043237356649312926,
|
||
|
|
"loss": 5.3912,
|
||
|
|
"mean_token_accuracy": 0.20387934297323226,
|
||
|
|
"num_tokens": 6713663.0,
|
||
|
|
"step": 3695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.932327318191528,
|
||
|
|
"epoch": 3.17877094972067,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004321357715889991,
|
||
|
|
"loss": 5.526,
|
||
|
|
"mean_token_accuracy": 0.1858012244105339,
|
||
|
|
"num_tokens": 6722965.0,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9681384563446045,
|
||
|
|
"epoch": 3.1830683283197247,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004318976335631505,
|
||
|
|
"loss": 5.4893,
|
||
|
|
"mean_token_accuracy": 0.19365193992853164,
|
||
|
|
"num_tokens": 6732776.0,
|
||
|
|
"step": 3705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.964018297195435,
|
||
|
|
"epoch": 3.1873657069187797,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00043165915293558155,
|
||
|
|
"loss": 5.4682,
|
||
|
|
"mean_token_accuracy": 0.19091420918703078,
|
||
|
|
"num_tokens": 6741309.0,
|
||
|
|
"step": 3710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.944598436355591,
|
||
|
|
"epoch": 3.1916630855178343,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004314203302270388,
|
||
|
|
"loss": 5.5274,
|
||
|
|
"mean_token_accuracy": 0.18904216587543488,
|
||
|
|
"num_tokens": 6750584.0,
|
||
|
|
"step": 3715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.97039303779602,
|
||
|
|
"epoch": 3.195960464116889,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004311811659590154,
|
||
|
|
"loss": 5.5007,
|
||
|
|
"mean_token_accuracy": 0.1887460470199585,
|
||
|
|
"num_tokens": 6759344.0,
|
||
|
|
"step": 3720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059423017501831,
|
||
|
|
"epoch": 3.2002578427159434,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004309416606537507,
|
||
|
|
"loss": 5.6563,
|
||
|
|
"mean_token_accuracy": 0.18009912818670273,
|
||
|
|
"num_tokens": 6770345.0,
|
||
|
|
"step": 3725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.00485258102417,
|
||
|
|
"epoch": 3.204555221314998,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00043070181483422843,
|
||
|
|
"loss": 5.5411,
|
||
|
|
"mean_token_accuracy": 0.1854734942317009,
|
||
|
|
"num_tokens": 6779991.0,
|
||
|
|
"step": 3730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.88880934715271,
|
||
|
|
"epoch": 3.2088525999140525,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000430461629024176,
|
||
|
|
"loss": 5.4983,
|
||
|
|
"mean_token_accuracy": 0.19071830958127975,
|
||
|
|
"num_tokens": 6788972.0,
|
||
|
|
"step": 3735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.885913467407226,
|
||
|
|
"epoch": 3.213149978513107,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004302211037480634,
|
||
|
|
"loss": 5.4111,
|
||
|
|
"mean_token_accuracy": 0.19531920850276946,
|
||
|
|
"num_tokens": 6796967.0,
|
||
|
|
"step": 3740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.912165975570678,
|
||
|
|
"epoch": 3.2174473571121616,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0004299802395311015,
|
||
|
|
"loss": 5.5182,
|
||
|
|
"mean_token_accuracy": 0.18958668708801268,
|
||
|
|
"num_tokens": 6805961.0,
|
||
|
|
"step": 3745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.875810194015503,
|
||
|
|
"epoch": 3.221744735711216,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0004297390368992414,
|
||
|
|
"loss": 5.4233,
|
||
|
|
"mean_token_accuracy": 0.19228914380073547,
|
||
|
|
"num_tokens": 6814657.0,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.940344333648682,
|
||
|
|
"epoch": 3.2260421143102707,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00042949749637917353,
|
||
|
|
"loss": 5.4718,
|
||
|
|
"mean_token_accuracy": 0.1930217519402504,
|
||
|
|
"num_tokens": 6823095.0,
|
||
|
|
"step": 3755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.956659030914307,
|
||
|
|
"epoch": 3.230339492909325,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004292556184983256,
|
||
|
|
"loss": 5.4872,
|
||
|
|
"mean_token_accuracy": 0.19027772098779677,
|
||
|
|
"num_tokens": 6832195.0,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.009495830535888,
|
||
|
|
"epoch": 3.2346368715083798,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004290134037848623,
|
||
|
|
"loss": 5.6084,
|
||
|
|
"mean_token_accuracy": 0.18570149838924407,
|
||
|
|
"num_tokens": 6840922.0,
|
||
|
|
"step": 3765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.964060831069946,
|
||
|
|
"epoch": 3.2389342501074343,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00042877085276768386,
|
||
|
|
"loss": 5.46,
|
||
|
|
"mean_token_accuracy": 0.19570931494235994,
|
||
|
|
"num_tokens": 6849182.0,
|
||
|
|
"step": 3770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.94105863571167,
|
||
|
|
"epoch": 3.243231628706489,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00042852796597642455,
|
||
|
|
"loss": 5.4551,
|
||
|
|
"mean_token_accuracy": 0.19768441170454026,
|
||
|
|
"num_tokens": 6857932.0,
|
||
|
|
"step": 3775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.997882509231568,
|
||
|
|
"epoch": 3.247529007305544,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004282847439414522,
|
||
|
|
"loss": 5.616,
|
||
|
|
"mean_token_accuracy": 0.17659982144832612,
|
||
|
|
"num_tokens": 6867283.0,
|
||
|
|
"step": 3780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0180786609649655,
|
||
|
|
"epoch": 3.2518263859045984,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004280411871938664,
|
||
|
|
"loss": 5.5648,
|
||
|
|
"mean_token_accuracy": 0.18943356424570085,
|
||
|
|
"num_tokens": 6876123.0,
|
||
|
|
"step": 3785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.006447601318359,
|
||
|
|
"epoch": 3.256123764503653,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004277972962654979,
|
||
|
|
"loss": 5.5082,
|
||
|
|
"mean_token_accuracy": 0.18536664098501204,
|
||
|
|
"num_tokens": 6885239.0,
|
||
|
|
"step": 3790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.930108880996704,
|
||
|
|
"epoch": 3.2604211431027075,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004275530716889069,
|
||
|
|
"loss": 5.5573,
|
||
|
|
"mean_token_accuracy": 0.18274880945682526,
|
||
|
|
"num_tokens": 6895061.0,
|
||
|
|
"step": 3795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.983970260620117,
|
||
|
|
"epoch": 3.264718521701762,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0004273085139973822,
|
||
|
|
"loss": 5.5993,
|
||
|
|
"mean_token_accuracy": 0.177694109082222,
|
||
|
|
"num_tokens": 6903828.0,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.014524221420288,
|
||
|
|
"epoch": 3.2690159003008166,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004270636237249401,
|
||
|
|
"loss": 5.5151,
|
||
|
|
"mean_token_accuracy": 0.18856608420610427,
|
||
|
|
"num_tokens": 6912805.0,
|
||
|
|
"step": 3805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.941100168228149,
|
||
|
|
"epoch": 3.273313278899871,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00042681840140632314,
|
||
|
|
"loss": 5.5616,
|
||
|
|
"mean_token_accuracy": 0.18302462846040726,
|
||
|
|
"num_tokens": 6922165.0,
|
||
|
|
"step": 3810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.997183227539063,
|
||
|
|
"epoch": 3.2776106574989257,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004265728475769989,
|
||
|
|
"loss": 5.5322,
|
||
|
|
"mean_token_accuracy": 0.18632204383611678,
|
||
|
|
"num_tokens": 6931677.0,
|
||
|
|
"step": 3815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.975349044799804,
|
||
|
|
"epoch": 3.28190803609798,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004263269627731586,
|
||
|
|
"loss": 5.4952,
|
||
|
|
"mean_token_accuracy": 0.19264112412929535,
|
||
|
|
"num_tokens": 6940486.0,
|
||
|
|
"step": 3820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.868766260147095,
|
||
|
|
"epoch": 3.2862054146970348,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004260807475317164,
|
||
|
|
"loss": 5.51,
|
||
|
|
"mean_token_accuracy": 0.1856775924563408,
|
||
|
|
"num_tokens": 6948990.0,
|
||
|
|
"step": 3825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.010857200622558,
|
||
|
|
"epoch": 3.2905027932960893,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004258342023903081,
|
||
|
|
"loss": 5.636,
|
||
|
|
"mean_token_accuracy": 0.17837173044681548,
|
||
|
|
"num_tokens": 6959311.0,
|
||
|
|
"step": 3830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.02067198753357,
|
||
|
|
"epoch": 3.294800171895144,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00042558732788728975,
|
||
|
|
"loss": 5.4186,
|
||
|
|
"mean_token_accuracy": 0.19980644732713698,
|
||
|
|
"num_tokens": 6968619.0,
|
||
|
|
"step": 3835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.891939735412597,
|
||
|
|
"epoch": 3.2990975504941984,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00042534012456173643,
|
||
|
|
"loss": 5.4745,
|
||
|
|
"mean_token_accuracy": 0.1930858761072159,
|
||
|
|
"num_tokens": 6977469.0,
|
||
|
|
"step": 3840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.908893871307373,
|
||
|
|
"epoch": 3.303394929093253,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00042509259295344157,
|
||
|
|
"loss": 5.4637,
|
||
|
|
"mean_token_accuracy": 0.18524923622608186,
|
||
|
|
"num_tokens": 6986772.0,
|
||
|
|
"step": 3845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.965682172775269,
|
||
|
|
"epoch": 3.3076923076923075,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.00042484473360291514,
|
||
|
|
"loss": 5.4722,
|
||
|
|
"mean_token_accuracy": 0.1818112000823021,
|
||
|
|
"num_tokens": 6993937.0,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.878727436065674,
|
||
|
|
"epoch": 3.311989686291362,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00042459654705138294,
|
||
|
|
"loss": 5.5336,
|
||
|
|
"mean_token_accuracy": 0.19061464071273804,
|
||
|
|
"num_tokens": 7003222.0,
|
||
|
|
"step": 3855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.907388973236084,
|
||
|
|
"epoch": 3.316287064890417,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004243480338407853,
|
||
|
|
"loss": 5.5021,
|
||
|
|
"mean_token_accuracy": 0.19867320060729982,
|
||
|
|
"num_tokens": 7012055.0,
|
||
|
|
"step": 3860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.968272018432617,
|
||
|
|
"epoch": 3.3205844434894716,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004240991945137755,
|
||
|
|
"loss": 5.4952,
|
||
|
|
"mean_token_accuracy": 0.1932666853070259,
|
||
|
|
"num_tokens": 7021036.0,
|
||
|
|
"step": 3865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.909445858001709,
|
||
|
|
"epoch": 3.324881822088526,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00042385002961371944,
|
||
|
|
"loss": 5.4787,
|
||
|
|
"mean_token_accuracy": 0.194594843685627,
|
||
|
|
"num_tokens": 7030450.0,
|
||
|
|
"step": 3870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.005906677246093,
|
||
|
|
"epoch": 3.3291792006875807,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004236005396846935,
|
||
|
|
"loss": 5.5873,
|
||
|
|
"mean_token_accuracy": 0.18787091970443726,
|
||
|
|
"num_tokens": 7039740.0,
|
||
|
|
"step": 3875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0099263191223145,
|
||
|
|
"epoch": 3.333476579286635,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00042335072527148406,
|
||
|
|
"loss": 5.5642,
|
||
|
|
"mean_token_accuracy": 0.18891336619853974,
|
||
|
|
"num_tokens": 7050430.0,
|
||
|
|
"step": 3880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.886811065673828,
|
||
|
|
"epoch": 3.3377739578856898,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0004231005869195859,
|
||
|
|
"loss": 5.5523,
|
||
|
|
"mean_token_accuracy": 0.18664977699518204,
|
||
|
|
"num_tokens": 7059477.0,
|
||
|
|
"step": 3885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.945472669601441,
|
||
|
|
"epoch": 3.3420713364847443,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 0.0004228501251752011,
|
||
|
|
"loss": 5.4871,
|
||
|
|
"mean_token_accuracy": 0.19109417051076888,
|
||
|
|
"num_tokens": 7067805.0,
|
||
|
|
"step": 3890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.942922163009643,
|
||
|
|
"epoch": 3.346368715083799,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00042259934058523814,
|
||
|
|
"loss": 5.4972,
|
||
|
|
"mean_token_accuracy": 0.18601811528205872,
|
||
|
|
"num_tokens": 7077606.0,
|
||
|
|
"step": 3895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.984446573257446,
|
||
|
|
"epoch": 3.3506660936828534,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00042234823369731027,
|
||
|
|
"loss": 5.448,
|
||
|
|
"mean_token_accuracy": 0.19036031365394593,
|
||
|
|
"num_tokens": 7085647.0,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.861058759689331,
|
||
|
|
"epoch": 3.354963472281908,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00042209680505973465,
|
||
|
|
"loss": 5.4762,
|
||
|
|
"mean_token_accuracy": 0.19057320803403854,
|
||
|
|
"num_tokens": 7095298.0,
|
||
|
|
"step": 3905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.868588638305664,
|
||
|
|
"epoch": 3.3592608508809625,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004218450552215308,
|
||
|
|
"loss": 5.5542,
|
||
|
|
"mean_token_accuracy": 0.19133240431547166,
|
||
|
|
"num_tokens": 7105207.0,
|
||
|
|
"step": 3910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.973352527618408,
|
||
|
|
"epoch": 3.363558229480017,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004215929847324199,
|
||
|
|
"loss": 5.6046,
|
||
|
|
"mean_token_accuracy": 0.18282657265663146,
|
||
|
|
"num_tokens": 7114833.0,
|
||
|
|
"step": 3915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0064185619354244,
|
||
|
|
"epoch": 3.3678556080790716,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.000421340594142823,
|
||
|
|
"loss": 5.4227,
|
||
|
|
"mean_token_accuracy": 0.20140644013881684,
|
||
|
|
"num_tokens": 7123608.0,
|
||
|
|
"step": 3920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.875625896453857,
|
||
|
|
"epoch": 3.3721529866781266,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.00042108788400386035,
|
||
|
|
"loss": 5.4824,
|
||
|
|
"mean_token_accuracy": 0.19125625491142273,
|
||
|
|
"num_tokens": 7132250.0,
|
||
|
|
"step": 3925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.91867356300354,
|
||
|
|
"epoch": 3.376450365277181,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004208348548673498,
|
||
|
|
"loss": 5.5796,
|
||
|
|
"mean_token_accuracy": 0.18955173790454866,
|
||
|
|
"num_tokens": 7142086.0,
|
||
|
|
"step": 3930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.989838075637818,
|
||
|
|
"epoch": 3.3807477438762357,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000420581507285806,
|
||
|
|
"loss": 5.525,
|
||
|
|
"mean_token_accuracy": 0.1797061249613762,
|
||
|
|
"num_tokens": 7152434.0,
|
||
|
|
"step": 3935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.870218181610108,
|
||
|
|
"epoch": 3.38504512247529,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004203278418124386,
|
||
|
|
"loss": 5.4707,
|
||
|
|
"mean_token_accuracy": 0.19644346386194228,
|
||
|
|
"num_tokens": 7163041.0,
|
||
|
|
"step": 3940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.865656518936158,
|
||
|
|
"epoch": 3.3893425010743448,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004200738590011518,
|
||
|
|
"loss": 5.4512,
|
||
|
|
"mean_token_accuracy": 0.19743987321853637,
|
||
|
|
"num_tokens": 7171875.0,
|
||
|
|
"step": 3945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.906575489044189,
|
||
|
|
"epoch": 3.3936398796733993,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00041981955940654245,
|
||
|
|
"loss": 5.5679,
|
||
|
|
"mean_token_accuracy": 0.18974538147449493,
|
||
|
|
"num_tokens": 7180803.0,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.951998472213745,
|
||
|
|
"epoch": 3.397937258272454,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004195649435838992,
|
||
|
|
"loss": 5.5884,
|
||
|
|
"mean_token_accuracy": 0.17947447150945664,
|
||
|
|
"num_tokens": 7190661.0,
|
||
|
|
"step": 3955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.871505403518677,
|
||
|
|
"epoch": 3.4022346368715084,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004193100120892013,
|
||
|
|
"loss": 5.418,
|
||
|
|
"mean_token_accuracy": 0.19889674335718155,
|
||
|
|
"num_tokens": 7199357.0,
|
||
|
|
"step": 3960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.934350156784058,
|
||
|
|
"epoch": 3.406532015470563,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004190547654791172,
|
||
|
|
"loss": 5.597,
|
||
|
|
"mean_token_accuracy": 0.18219801187515258,
|
||
|
|
"num_tokens": 7209856.0,
|
||
|
|
"step": 3965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.969940042495727,
|
||
|
|
"epoch": 3.4108293940696175,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00041879920431100347,
|
||
|
|
"loss": 5.5648,
|
||
|
|
"mean_token_accuracy": 0.17899948358535767,
|
||
|
|
"num_tokens": 7218778.0,
|
||
|
|
"step": 3970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.924646472930908,
|
||
|
|
"epoch": 3.415126772668672,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004185433291429036,
|
||
|
|
"loss": 5.5802,
|
||
|
|
"mean_token_accuracy": 0.18834476321935653,
|
||
|
|
"num_tokens": 7228442.0,
|
||
|
|
"step": 3975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.978606748580932,
|
||
|
|
"epoch": 3.4194241512677266,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00041828714053354665,
|
||
|
|
"loss": 5.5653,
|
||
|
|
"mean_token_accuracy": 0.18292482793331147,
|
||
|
|
"num_tokens": 7238724.0,
|
||
|
|
"step": 3980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.850194692611694,
|
||
|
|
"epoch": 3.423721529866781,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004180306390423462,
|
||
|
|
"loss": 5.5145,
|
||
|
|
"mean_token_accuracy": 0.19443774223327637,
|
||
|
|
"num_tokens": 7247844.0,
|
||
|
|
"step": 3985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.919923639297485,
|
||
|
|
"epoch": 3.4280189084658357,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00041777382522939884,
|
||
|
|
"loss": 5.5776,
|
||
|
|
"mean_token_accuracy": 0.1839929461479187,
|
||
|
|
"num_tokens": 7257260.0,
|
||
|
|
"step": 3990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.963938665390015,
|
||
|
|
"epoch": 3.4323162870648902,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.00041751669965548344,
|
||
|
|
"loss": 5.5802,
|
||
|
|
"mean_token_accuracy": 0.1809097185730934,
|
||
|
|
"num_tokens": 7266890.0,
|
||
|
|
"step": 3995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.974624681472778,
|
||
|
|
"epoch": 3.4366136656639448,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00041725926288205945,
|
||
|
|
"loss": 5.598,
|
||
|
|
"mean_token_accuracy": 0.17821378856897355,
|
||
|
|
"num_tokens": 7276114.0,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4366136656639448,
|
||
|
|
"eval_entropy": 5.73526575543859,
|
||
|
|
"eval_loss": 6.016810417175293,
|
||
|
|
"eval_mean_token_accuracy": 0.17057843910748358,
|
||
|
|
"eval_num_tokens": 7276114.0,
|
||
|
|
"eval_runtime": 2.0499,
|
||
|
|
"eval_samples_per_second": 1731.264,
|
||
|
|
"eval_steps_per_second": 216.591,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9616344451904295,
|
||
|
|
"epoch": 3.4409110442629998,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004170015154712658,
|
||
|
|
"loss": 5.548,
|
||
|
|
"mean_token_accuracy": 0.1874366208910942,
|
||
|
|
"num_tokens": 7284426.0,
|
||
|
|
"step": 4005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.910069179534912,
|
||
|
|
"epoch": 3.4452084228620543,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00041674345798591993,
|
||
|
|
"loss": 5.5843,
|
||
|
|
"mean_token_accuracy": 0.18420783281326295,
|
||
|
|
"num_tokens": 7294813.0,
|
||
|
|
"step": 4010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.961581373214722,
|
||
|
|
"epoch": 3.449505801461109,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004164850909895161,
|
||
|
|
"loss": 5.5619,
|
||
|
|
"mean_token_accuracy": 0.18809896260499953,
|
||
|
|
"num_tokens": 7304655.0,
|
||
|
|
"step": 4015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.849625158309936,
|
||
|
|
"epoch": 3.4538031800601634,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004162264150462247,
|
||
|
|
"loss": 5.5155,
|
||
|
|
"mean_token_accuracy": 0.1865479052066803,
|
||
|
|
"num_tokens": 7313610.0,
|
||
|
|
"step": 4020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.980514192581177,
|
||
|
|
"epoch": 3.458100558659218,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00041596743072089065,
|
||
|
|
"loss": 5.5535,
|
||
|
|
"mean_token_accuracy": 0.19074880033731462,
|
||
|
|
"num_tokens": 7322243.0,
|
||
|
|
"step": 4025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.062830209732056,
|
||
|
|
"epoch": 3.4623979372582725,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000415708138579032,
|
||
|
|
"loss": 5.5229,
|
||
|
|
"mean_token_accuracy": 0.17943777292966842,
|
||
|
|
"num_tokens": 7331040.0,
|
||
|
|
"step": 4030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.886963891983032,
|
||
|
|
"epoch": 3.466695315857327,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00041544853918683923,
|
||
|
|
"loss": 5.5948,
|
||
|
|
"mean_token_accuracy": 0.1817588433623314,
|
||
|
|
"num_tokens": 7340771.0,
|
||
|
|
"step": 4035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9117542743682865,
|
||
|
|
"epoch": 3.4709926944563816,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004151886331111737,
|
||
|
|
"loss": 5.6421,
|
||
|
|
"mean_token_accuracy": 0.18092233091592788,
|
||
|
|
"num_tokens": 7349960.0,
|
||
|
|
"step": 4040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.899527883529663,
|
||
|
|
"epoch": 3.475290073055436,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.00041492842091956646,
|
||
|
|
"loss": 5.4649,
|
||
|
|
"mean_token_accuracy": 0.1919792726635933,
|
||
|
|
"num_tokens": 7357983.0,
|
||
|
|
"step": 4045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.988178062438965,
|
||
|
|
"epoch": 3.4795874516544907,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004146679031802167,
|
||
|
|
"loss": 5.591,
|
||
|
|
"mean_token_accuracy": 0.19019764959812163,
|
||
|
|
"num_tokens": 7366814.0,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9325186252594,
|
||
|
|
"epoch": 3.4838848302535452,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00041440708046199123,
|
||
|
|
"loss": 5.452,
|
||
|
|
"mean_token_accuracy": 0.19600227922201158,
|
||
|
|
"num_tokens": 7374773.0,
|
||
|
|
"step": 4055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.890796184539795,
|
||
|
|
"epoch": 3.4881822088525998,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004141459533344226,
|
||
|
|
"loss": 5.5562,
|
||
|
|
"mean_token_accuracy": 0.1825706109404564,
|
||
|
|
"num_tokens": 7383937.0,
|
||
|
|
"step": 4060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.957454347610474,
|
||
|
|
"epoch": 3.4924795874516543,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00041388452236770795,
|
||
|
|
"loss": 5.5305,
|
||
|
|
"mean_token_accuracy": 0.18163443803787233,
|
||
|
|
"num_tokens": 7392577.0,
|
||
|
|
"step": 4065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.882272720336914,
|
||
|
|
"epoch": 3.4967769660507093,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00041362278813270823,
|
||
|
|
"loss": 5.4193,
|
||
|
|
"mean_token_accuracy": 0.20885447710752486,
|
||
|
|
"num_tokens": 7401473.0,
|
||
|
|
"step": 4070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.992699241638183,
|
||
|
|
"epoch": 3.501074344649764,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00041336075120094616,
|
||
|
|
"loss": 5.6214,
|
||
|
|
"mean_token_accuracy": 0.17333737909793853,
|
||
|
|
"num_tokens": 7410831.0,
|
||
|
|
"step": 4075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0088804244995115,
|
||
|
|
"epoch": 3.5053717232488184,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00041309841214460586,
|
||
|
|
"loss": 5.6193,
|
||
|
|
"mean_token_accuracy": 0.18231521993875505,
|
||
|
|
"num_tokens": 7421563.0,
|
||
|
|
"step": 4080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.887757968902588,
|
||
|
|
"epoch": 3.509669101847873,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004128357715365309,
|
||
|
|
"loss": 5.5266,
|
||
|
|
"mean_token_accuracy": 0.191811466217041,
|
||
|
|
"num_tokens": 7430174.0,
|
||
|
|
"step": 4085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.899808502197265,
|
||
|
|
"epoch": 3.5139664804469275,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00041257282995022345,
|
||
|
|
"loss": 5.4928,
|
||
|
|
"mean_token_accuracy": 0.1953655794262886,
|
||
|
|
"num_tokens": 7439034.0,
|
||
|
|
"step": 4090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.912106704711914,
|
||
|
|
"epoch": 3.518263859045982,
|
||
|
|
"grad_norm": 1.359375,
|
||
|
|
"learning_rate": 0.0004123095879598426,
|
||
|
|
"loss": 5.5195,
|
||
|
|
"mean_token_accuracy": 0.18628203123807907,
|
||
|
|
"num_tokens": 7447663.0,
|
||
|
|
"step": 4095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.960794830322266,
|
||
|
|
"epoch": 3.5225612376450366,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00041204604614020397,
|
||
|
|
"loss": 5.6081,
|
||
|
|
"mean_token_accuracy": 0.17660218775272368,
|
||
|
|
"num_tokens": 7456615.0,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.996097373962402,
|
||
|
|
"epoch": 3.526858616244091,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004117822050667773,
|
||
|
|
"loss": 5.6382,
|
||
|
|
"mean_token_accuracy": 0.18591019809246062,
|
||
|
|
"num_tokens": 7466203.0,
|
||
|
|
"step": 4105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9893563747406,
|
||
|
|
"epoch": 3.5311559948431457,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00041151806531568617,
|
||
|
|
"loss": 5.5802,
|
||
|
|
"mean_token_accuracy": 0.18335504829883575,
|
||
|
|
"num_tokens": 7475411.0,
|
||
|
|
"step": 4110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.906181669235229,
|
||
|
|
"epoch": 3.5354533734422002,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00041125362746370625,
|
||
|
|
"loss": 5.6004,
|
||
|
|
"mean_token_accuracy": 0.18042974472045897,
|
||
|
|
"num_tokens": 7484965.0,
|
||
|
|
"step": 4115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.995426511764526,
|
||
|
|
"epoch": 3.5397507520412548,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004109888920882639,
|
||
|
|
"loss": 5.5249,
|
||
|
|
"mean_token_accuracy": 0.19167679399251938,
|
||
|
|
"num_tokens": 7494240.0,
|
||
|
|
"step": 4120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.949258327484131,
|
||
|
|
"epoch": 3.5440481306403093,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004107238597674356,
|
||
|
|
"loss": 5.5586,
|
||
|
|
"mean_token_accuracy": 0.18614224940538407,
|
||
|
|
"num_tokens": 7503560.0,
|
||
|
|
"step": 4125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.863224458694458,
|
||
|
|
"epoch": 3.548345509239364,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000410458531079946,
|
||
|
|
"loss": 5.4812,
|
||
|
|
"mean_token_accuracy": 0.19368503391742706,
|
||
|
|
"num_tokens": 7512650.0,
|
||
|
|
"step": 4130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9348499298095705,
|
||
|
|
"epoch": 3.5526428878384184,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004101929066051668,
|
||
|
|
"loss": 5.599,
|
||
|
|
"mean_token_accuracy": 0.1838935688138008,
|
||
|
|
"num_tokens": 7521864.0,
|
||
|
|
"step": 4135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.878848266601563,
|
||
|
|
"epoch": 3.556940266437473,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004099269869231157,
|
||
|
|
"loss": 5.496,
|
||
|
|
"mean_token_accuracy": 0.19109761267900466,
|
||
|
|
"num_tokens": 7531013.0,
|
||
|
|
"step": 4140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.948237895965576,
|
||
|
|
"epoch": 3.5612376450365275,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00040966077261445495,
|
||
|
|
"loss": 5.503,
|
||
|
|
"mean_token_accuracy": 0.1837790846824646,
|
||
|
|
"num_tokens": 7539959.0,
|
||
|
|
"step": 4145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.009708642959595,
|
||
|
|
"epoch": 3.565535023635582,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.0004093942642604904,
|
||
|
|
"loss": 5.4789,
|
||
|
|
"mean_token_accuracy": 0.19033878594636916,
|
||
|
|
"num_tokens": 7548354.0,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.921438217163086,
|
||
|
|
"epoch": 3.5698324022346366,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00040912746244316944,
|
||
|
|
"loss": 5.6032,
|
||
|
|
"mean_token_accuracy": 0.18626796901226045,
|
||
|
|
"num_tokens": 7558321.0,
|
||
|
|
"step": 4155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.902405214309693,
|
||
|
|
"epoch": 3.5741297808336916,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00040886036774508095,
|
||
|
|
"loss": 5.4904,
|
||
|
|
"mean_token_accuracy": 0.18896115869283675,
|
||
|
|
"num_tokens": 7567889.0,
|
||
|
|
"step": 4160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9710170269012455,
|
||
|
|
"epoch": 3.578427159432746,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004085929807494527,
|
||
|
|
"loss": 5.5489,
|
||
|
|
"mean_token_accuracy": 0.1867457315325737,
|
||
|
|
"num_tokens": 7576752.0,
|
||
|
|
"step": 4165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.900749206542969,
|
||
|
|
"epoch": 3.5827245380318007,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004083253020401512,
|
||
|
|
"loss": 5.4498,
|
||
|
|
"mean_token_accuracy": 0.19864338636398315,
|
||
|
|
"num_tokens": 7585413.0,
|
||
|
|
"step": 4170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9034223556518555,
|
||
|
|
"epoch": 3.5870219166308552,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0004080573322016797,
|
||
|
|
"loss": 5.4085,
|
||
|
|
"mean_token_accuracy": 0.19775232523679734,
|
||
|
|
"num_tokens": 7593966.0,
|
||
|
|
"step": 4175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.905447053909302,
|
||
|
|
"epoch": 3.59131929522991,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004077890718191773,
|
||
|
|
"loss": 5.4219,
|
||
|
|
"mean_token_accuracy": 0.19463559091091157,
|
||
|
|
"num_tokens": 7602746.0,
|
||
|
|
"step": 4180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.888575172424316,
|
||
|
|
"epoch": 3.5956166738289643,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00040752052147841733,
|
||
|
|
"loss": 5.485,
|
||
|
|
"mean_token_accuracy": 0.18464642763137817,
|
||
|
|
"num_tokens": 7611245.0,
|
||
|
|
"step": 4185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9167564868927,
|
||
|
|
"epoch": 3.599914052428019,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004072516817658065,
|
||
|
|
"loss": 5.5085,
|
||
|
|
"mean_token_accuracy": 0.19180469512939452,
|
||
|
|
"num_tokens": 7620234.0,
|
||
|
|
"step": 4190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9288722515106205,
|
||
|
|
"epoch": 3.6042114310270734,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004069825532683831,
|
||
|
|
"loss": 5.5362,
|
||
|
|
"mean_token_accuracy": 0.19008248895406724,
|
||
|
|
"num_tokens": 7629794.0,
|
||
|
|
"step": 4195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.883164501190185,
|
||
|
|
"epoch": 3.608508809626128,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00040671313657381645,
|
||
|
|
"loss": 5.4768,
|
||
|
|
"mean_token_accuracy": 0.19734710156917573,
|
||
|
|
"num_tokens": 7639497.0,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.833352327346802,
|
||
|
|
"epoch": 3.6128061882251825,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00040644343227040473,
|
||
|
|
"loss": 5.4305,
|
||
|
|
"mean_token_accuracy": 0.192035111784935,
|
||
|
|
"num_tokens": 7647647.0,
|
||
|
|
"step": 4205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.882366132736206,
|
||
|
|
"epoch": 3.617103566824237,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004061734409470745,
|
||
|
|
"loss": 5.6069,
|
||
|
|
"mean_token_accuracy": 0.18727213144302368,
|
||
|
|
"num_tokens": 7657988.0,
|
||
|
|
"step": 4210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.946136331558227,
|
||
|
|
"epoch": 3.621400945423292,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004059031631933788,
|
||
|
|
"loss": 5.5226,
|
||
|
|
"mean_token_accuracy": 0.18810444325208664,
|
||
|
|
"num_tokens": 7667498.0,
|
||
|
|
"step": 4215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.928274488449096,
|
||
|
|
"epoch": 3.6256983240223466,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00040563259959949615,
|
||
|
|
"loss": 5.6612,
|
||
|
|
"mean_token_accuracy": 0.17574882060289382,
|
||
|
|
"num_tokens": 7677386.0,
|
||
|
|
"step": 4220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.023345851898194,
|
||
|
|
"epoch": 3.629995702621401,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004053617507562295,
|
||
|
|
"loss": 5.4993,
|
||
|
|
"mean_token_accuracy": 0.1883416697382927,
|
||
|
|
"num_tokens": 7686643.0,
|
||
|
|
"step": 4225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.927192258834839,
|
||
|
|
"epoch": 3.6342930812204557,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00040509061725500426,
|
||
|
|
"loss": 5.5344,
|
||
|
|
"mean_token_accuracy": 0.18648910969495774,
|
||
|
|
"num_tokens": 7695089.0,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.855798292160034,
|
||
|
|
"epoch": 3.6385904598195102,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004048191996878677,
|
||
|
|
"loss": 5.5169,
|
||
|
|
"mean_token_accuracy": 0.18715409338474273,
|
||
|
|
"num_tokens": 7703854.0,
|
||
|
|
"step": 4235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.873931074142456,
|
||
|
|
"epoch": 3.642887838418565,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00040454749864748734,
|
||
|
|
"loss": 5.4623,
|
||
|
|
"mean_token_accuracy": 0.1924944058060646,
|
||
|
|
"num_tokens": 7712903.0,
|
||
|
|
"step": 4240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9368483543396,
|
||
|
|
"epoch": 3.6471852170176193,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004042755147271496,
|
||
|
|
"loss": 5.4073,
|
||
|
|
"mean_token_accuracy": 0.19578560292720795,
|
||
|
|
"num_tokens": 7721701.0,
|
||
|
|
"step": 4245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.814197635650634,
|
||
|
|
"epoch": 3.651482595616674,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004040032485207587,
|
||
|
|
"loss": 5.5316,
|
||
|
|
"mean_token_accuracy": 0.18780674338340758,
|
||
|
|
"num_tokens": 7731318.0,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.960366725921631,
|
||
|
|
"epoch": 3.6557799742157284,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004037307006228352,
|
||
|
|
"loss": 5.4563,
|
||
|
|
"mean_token_accuracy": 0.19457500725984572,
|
||
|
|
"num_tokens": 7740413.0,
|
||
|
|
"step": 4255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.894597911834717,
|
||
|
|
"epoch": 3.660077352814783,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004034578716285147,
|
||
|
|
"loss": 5.4362,
|
||
|
|
"mean_token_accuracy": 0.19790690541267394,
|
||
|
|
"num_tokens": 7749054.0,
|
||
|
|
"step": 4260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.855839014053345,
|
||
|
|
"epoch": 3.6643747314138375,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.0004031847621335467,
|
||
|
|
"loss": 5.4711,
|
||
|
|
"mean_token_accuracy": 0.19566139876842498,
|
||
|
|
"num_tokens": 7757366.0,
|
||
|
|
"step": 4265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.889632892608643,
|
||
|
|
"epoch": 3.668672110012892,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0004029113727342933,
|
||
|
|
"loss": 5.502,
|
||
|
|
"mean_token_accuracy": 0.19420932680368425,
|
||
|
|
"num_tokens": 7766471.0,
|
||
|
|
"step": 4270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.851235818862915,
|
||
|
|
"epoch": 3.6729694886119466,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00040263770402772746,
|
||
|
|
"loss": 5.4897,
|
||
|
|
"mean_token_accuracy": 0.1871536925435066,
|
||
|
|
"num_tokens": 7775920.0,
|
||
|
|
"step": 4275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.934095287322998,
|
||
|
|
"epoch": 3.677266867211001,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004023637566114325,
|
||
|
|
"loss": 5.5382,
|
||
|
|
"mean_token_accuracy": 0.1889081373810768,
|
||
|
|
"num_tokens": 7784530.0,
|
||
|
|
"step": 4280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.93968391418457,
|
||
|
|
"epoch": 3.6815642458100557,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004020895310835999,
|
||
|
|
"loss": 5.4721,
|
||
|
|
"mean_token_accuracy": 0.1917961835861206,
|
||
|
|
"num_tokens": 7793656.0,
|
||
|
|
"step": 4285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9000050067901615,
|
||
|
|
"epoch": 3.6858616244091102,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00040181502804302865,
|
||
|
|
"loss": 5.496,
|
||
|
|
"mean_token_accuracy": 0.1914617270231247,
|
||
|
|
"num_tokens": 7802185.0,
|
||
|
|
"step": 4290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8633284091949465,
|
||
|
|
"epoch": 3.690159003008165,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00040154024808912377,
|
||
|
|
"loss": 5.483,
|
||
|
|
"mean_token_accuracy": 0.19215791970491408,
|
||
|
|
"num_tokens": 7810345.0,
|
||
|
|
"step": 4295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.897251462936401,
|
||
|
|
"epoch": 3.6944563816072193,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004012651918218947,
|
||
|
|
"loss": 5.5314,
|
||
|
|
"mean_token_accuracy": 0.1837465301156044,
|
||
|
|
"num_tokens": 7818998.0,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.959916353225708,
|
||
|
|
"epoch": 3.6987537602062743,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004009898598419544,
|
||
|
|
"loss": 5.6474,
|
||
|
|
"mean_token_accuracy": 0.17348452657461166,
|
||
|
|
"num_tokens": 7828638.0,
|
||
|
|
"step": 4305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.956097745895386,
|
||
|
|
"epoch": 3.703051138805329,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.000400714252750518,
|
||
|
|
"loss": 5.622,
|
||
|
|
"mean_token_accuracy": 0.1802245110273361,
|
||
|
|
"num_tokens": 7838812.0,
|
||
|
|
"step": 4310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.987325286865234,
|
||
|
|
"epoch": 3.7073485174043834,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004004383711494011,
|
||
|
|
"loss": 5.5288,
|
||
|
|
"mean_token_accuracy": 0.19345352202653884,
|
||
|
|
"num_tokens": 7847458.0,
|
||
|
|
"step": 4315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.95421142578125,
|
||
|
|
"epoch": 3.711645896003438,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004001622156410189,
|
||
|
|
"loss": 5.5496,
|
||
|
|
"mean_token_accuracy": 0.18483526557683944,
|
||
|
|
"num_tokens": 7856553.0,
|
||
|
|
"step": 4320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.850839233398437,
|
||
|
|
"epoch": 3.7159432746024925,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00039988578682838467,
|
||
|
|
"loss": 5.4869,
|
||
|
|
"mean_token_accuracy": 0.18971165865659714,
|
||
|
|
"num_tokens": 7864788.0,
|
||
|
|
"step": 4325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.903116130828858,
|
||
|
|
"epoch": 3.720240653201547,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00039960908531510843,
|
||
|
|
"loss": 5.484,
|
||
|
|
"mean_token_accuracy": 0.19329809993505478,
|
||
|
|
"num_tokens": 7873850.0,
|
||
|
|
"step": 4330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.974154853820801,
|
||
|
|
"epoch": 3.7245380318006016,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0003993321117053956,
|
||
|
|
"loss": 5.6039,
|
||
|
|
"mean_token_accuracy": 0.18225040286779404,
|
||
|
|
"num_tokens": 7882775.0,
|
||
|
|
"step": 4335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.980661678314209,
|
||
|
|
"epoch": 3.728835410399656,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00039905486660404604,
|
||
|
|
"loss": 5.5353,
|
||
|
|
"mean_token_accuracy": 0.18522801846265793,
|
||
|
|
"num_tokens": 7890570.0,
|
||
|
|
"step": 4340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8748914241790775,
|
||
|
|
"epoch": 3.7331327889987107,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00039877735061645206,
|
||
|
|
"loss": 5.5033,
|
||
|
|
"mean_token_accuracy": 0.1971554860472679,
|
||
|
|
"num_tokens": 7900090.0,
|
||
|
|
"step": 4345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.934943914413452,
|
||
|
|
"epoch": 3.7374301675977653,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0003984995643485977,
|
||
|
|
"loss": 5.5358,
|
||
|
|
"mean_token_accuracy": 0.18585693091154099,
|
||
|
|
"num_tokens": 7908077.0,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9528398513793945,
|
||
|
|
"epoch": 3.74172754619682,
|
||
|
|
"grad_norm": 1.421875,
|
||
|
|
"learning_rate": 0.00039822150840705716,
|
||
|
|
"loss": 5.5391,
|
||
|
|
"mean_token_accuracy": 0.19125075042247772,
|
||
|
|
"num_tokens": 7916290.0,
|
||
|
|
"step": 4355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.999798917770386,
|
||
|
|
"epoch": 3.746024924795875,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00039794318339899347,
|
||
|
|
"loss": 5.6233,
|
||
|
|
"mean_token_accuracy": 0.17912040501832963,
|
||
|
|
"num_tokens": 7925835.0,
|
||
|
|
"step": 4360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.929653787612915,
|
||
|
|
"epoch": 3.7503223033949293,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00039766458993215726,
|
||
|
|
"loss": 5.5867,
|
||
|
|
"mean_token_accuracy": 0.18147629946470262,
|
||
|
|
"num_tokens": 7935076.0,
|
||
|
|
"step": 4365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.84507122039795,
|
||
|
|
"epoch": 3.754619681993984,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.00039738572861488527,
|
||
|
|
"loss": 5.4837,
|
||
|
|
"mean_token_accuracy": 0.19409503191709518,
|
||
|
|
"num_tokens": 7943958.0,
|
||
|
|
"step": 4370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.907137012481689,
|
||
|
|
"epoch": 3.7589170605930384,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000397106600056099,
|
||
|
|
"loss": 5.5211,
|
||
|
|
"mean_token_accuracy": 0.18553533554077148,
|
||
|
|
"num_tokens": 7953189.0,
|
||
|
|
"step": 4375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.878173971176148,
|
||
|
|
"epoch": 3.763214439192093,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0003968272048653039,
|
||
|
|
"loss": 5.4441,
|
||
|
|
"mean_token_accuracy": 0.19779548197984695,
|
||
|
|
"num_tokens": 7962927.0,
|
||
|
|
"step": 4380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8026800632476805,
|
||
|
|
"epoch": 3.7675118177911475,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0003965475436525873,
|
||
|
|
"loss": 5.4712,
|
||
|
|
"mean_token_accuracy": 0.197597499191761,
|
||
|
|
"num_tokens": 7973087.0,
|
||
|
|
"step": 4385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8803709030151365,
|
||
|
|
"epoch": 3.771809196390202,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0003962676170286174,
|
||
|
|
"loss": 5.4288,
|
||
|
|
"mean_token_accuracy": 0.1919528603553772,
|
||
|
|
"num_tokens": 7982535.0,
|
||
|
|
"step": 4390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.943622827529907,
|
||
|
|
"epoch": 3.7761065749892566,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.00039598742560464223,
|
||
|
|
"loss": 5.507,
|
||
|
|
"mean_token_accuracy": 0.19596254229545593,
|
||
|
|
"num_tokens": 7990740.0,
|
||
|
|
"step": 4395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.965104579925537,
|
||
|
|
"epoch": 3.780403953588311,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0003957069699924877,
|
||
|
|
"loss": 5.5021,
|
||
|
|
"mean_token_accuracy": 0.1843058630824089,
|
||
|
|
"num_tokens": 7999349.0,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.906688165664673,
|
||
|
|
"epoch": 3.7847013321873657,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000395426250804557,
|
||
|
|
"loss": 5.5119,
|
||
|
|
"mean_token_accuracy": 0.19529375731945037,
|
||
|
|
"num_tokens": 8007615.0,
|
||
|
|
"step": 4405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.893620347976684,
|
||
|
|
"epoch": 3.7889987107864203,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.00039514526865382847,
|
||
|
|
"loss": 5.4918,
|
||
|
|
"mean_token_accuracy": 0.19342261105775832,
|
||
|
|
"num_tokens": 8017545.0,
|
||
|
|
"step": 4410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.898420667648315,
|
||
|
|
"epoch": 3.793296089385475,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0003948640241538548,
|
||
|
|
"loss": 5.4376,
|
||
|
|
"mean_token_accuracy": 0.1940651446580887,
|
||
|
|
"num_tokens": 8026381.0,
|
||
|
|
"step": 4415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.925773334503174,
|
||
|
|
"epoch": 3.7975934679845293,
|
||
|
|
"grad_norm": 1.4921875,
|
||
|
|
"learning_rate": 0.0003945825179187617,
|
||
|
|
"loss": 5.5471,
|
||
|
|
"mean_token_accuracy": 0.1862453892827034,
|
||
|
|
"num_tokens": 8034745.0,
|
||
|
|
"step": 4420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.93576078414917,
|
||
|
|
"epoch": 3.801890846583584,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00039430075056324604,
|
||
|
|
"loss": 5.4864,
|
||
|
|
"mean_token_accuracy": 0.19621551632881165,
|
||
|
|
"num_tokens": 8043995.0,
|
||
|
|
"step": 4425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9152994632720945,
|
||
|
|
"epoch": 3.8061882251826384,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00039401872270257546,
|
||
|
|
"loss": 5.5773,
|
||
|
|
"mean_token_accuracy": 0.18623047918081284,
|
||
|
|
"num_tokens": 8053059.0,
|
||
|
|
"step": 4430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9053184509277346,
|
||
|
|
"epoch": 3.810485603781693,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00039373643495258567,
|
||
|
|
"loss": 5.5995,
|
||
|
|
"mean_token_accuracy": 0.18803995102643967,
|
||
|
|
"num_tokens": 8062160.0,
|
||
|
|
"step": 4435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.876355934143066,
|
||
|
|
"epoch": 3.8147829823807475,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00039345388792968056,
|
||
|
|
"loss": 5.4979,
|
||
|
|
"mean_token_accuracy": 0.1962131142616272,
|
||
|
|
"num_tokens": 8071260.0,
|
||
|
|
"step": 4440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.975628805160523,
|
||
|
|
"epoch": 3.819080360979802,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.00039317108225082984,
|
||
|
|
"loss": 5.6148,
|
||
|
|
"mean_token_accuracy": 0.1825527474284172,
|
||
|
|
"num_tokens": 8081540.0,
|
||
|
|
"step": 4445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8768692970275875,
|
||
|
|
"epoch": 3.8233777395788566,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00039288801853356806,
|
||
|
|
"loss": 5.5798,
|
||
|
|
"mean_token_accuracy": 0.1876271441578865,
|
||
|
|
"num_tokens": 8089785.0,
|
||
|
|
"step": 4450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.926883172988892,
|
||
|
|
"epoch": 3.8276751181779116,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0003926046973959932,
|
||
|
|
"loss": 5.4322,
|
||
|
|
"mean_token_accuracy": 0.1977944403886795,
|
||
|
|
"num_tokens": 8098097.0,
|
||
|
|
"step": 4455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.84870548248291,
|
||
|
|
"epoch": 3.831972496776966,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0003923211194567654,
|
||
|
|
"loss": 5.6562,
|
||
|
|
"mean_token_accuracy": 0.1832739979028702,
|
||
|
|
"num_tokens": 8108693.0,
|
||
|
|
"step": 4460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.936432361602783,
|
||
|
|
"epoch": 3.8362698753760207,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00039203728533510556,
|
||
|
|
"loss": 5.4945,
|
||
|
|
"mean_token_accuracy": 0.19009887129068376,
|
||
|
|
"num_tokens": 8117181.0,
|
||
|
|
"step": 4465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9394755363464355,
|
||
|
|
"epoch": 3.8405672539750753,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000391753195650794,
|
||
|
|
"loss": 5.5152,
|
||
|
|
"mean_token_accuracy": 0.1871207147836685,
|
||
|
|
"num_tokens": 8125398.0,
|
||
|
|
"step": 4470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.89150915145874,
|
||
|
|
"epoch": 3.84486463257413,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00039146885102416895,
|
||
|
|
"loss": 5.519,
|
||
|
|
"mean_token_accuracy": 0.19240910410881043,
|
||
|
|
"num_tokens": 8135320.0,
|
||
|
|
"step": 4475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.932202434539795,
|
||
|
|
"epoch": 3.8491620111731844,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00039118425207612553,
|
||
|
|
"loss": 5.6074,
|
||
|
|
"mean_token_accuracy": 0.18543781340122223,
|
||
|
|
"num_tokens": 8144320.0,
|
||
|
|
"step": 4480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.821663093566895,
|
||
|
|
"epoch": 3.853459389772239,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00039089939942811396,
|
||
|
|
"loss": 5.478,
|
||
|
|
"mean_token_accuracy": 0.19514185637235643,
|
||
|
|
"num_tokens": 8153653.0,
|
||
|
|
"step": 4485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.937240219116211,
|
||
|
|
"epoch": 3.8577567683712934,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00039061429370213863,
|
||
|
|
"loss": 5.513,
|
||
|
|
"mean_token_accuracy": 0.18825586438179015,
|
||
|
|
"num_tokens": 8162741.0,
|
||
|
|
"step": 4490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.856398630142212,
|
||
|
|
"epoch": 3.862054146970348,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00039032893552075646,
|
||
|
|
"loss": 5.4271,
|
||
|
|
"mean_token_accuracy": 0.1990933135151863,
|
||
|
|
"num_tokens": 8171078.0,
|
||
|
|
"step": 4495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.858392572402954,
|
||
|
|
"epoch": 3.8663515255694025,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0003900433255070758,
|
||
|
|
"loss": 5.4881,
|
||
|
|
"mean_token_accuracy": 0.19236364662647248,
|
||
|
|
"num_tokens": 8179968.0,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8663515255694025,
|
||
|
|
"eval_entropy": 5.69006564058699,
|
||
|
|
"eval_loss": 5.968277454376221,
|
||
|
|
"eval_mean_token_accuracy": 0.1735342912006754,
|
||
|
|
"eval_num_tokens": 8179968.0,
|
||
|
|
"eval_runtime": 2.0443,
|
||
|
|
"eval_samples_per_second": 1736.068,
|
||
|
|
"eval_steps_per_second": 217.192,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.894122076034546,
|
||
|
|
"epoch": 3.870648904168457,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00038975746428475454,
|
||
|
|
"loss": 5.4732,
|
||
|
|
"mean_token_accuracy": 0.19004281610250473,
|
||
|
|
"num_tokens": 8189261.0,
|
||
|
|
"step": 4505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.959436702728271,
|
||
|
|
"epoch": 3.874946282767512,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00038947135247799955,
|
||
|
|
"loss": 5.4841,
|
||
|
|
"mean_token_accuracy": 0.19915961623191833,
|
||
|
|
"num_tokens": 8198302.0,
|
||
|
|
"step": 4510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.907156896591187,
|
||
|
|
"epoch": 3.8792436613665666,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00038918499071156443,
|
||
|
|
"loss": 5.4669,
|
||
|
|
"mean_token_accuracy": 0.1965099200606346,
|
||
|
|
"num_tokens": 8207098.0,
|
||
|
|
"step": 4515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.902419233322144,
|
||
|
|
"epoch": 3.883541039965621,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.000388898379610749,
|
||
|
|
"loss": 5.5132,
|
||
|
|
"mean_token_accuracy": 0.18933655470609664,
|
||
|
|
"num_tokens": 8216831.0,
|
||
|
|
"step": 4520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.858121109008789,
|
||
|
|
"epoch": 3.8878384185646757,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0003886115198013973,
|
||
|
|
"loss": 5.5158,
|
||
|
|
"mean_token_accuracy": 0.19693622142076492,
|
||
|
|
"num_tokens": 8225369.0,
|
||
|
|
"step": 4525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.928486585617065,
|
||
|
|
"epoch": 3.8921357971637303,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0003883244119098965,
|
||
|
|
"loss": 5.6449,
|
||
|
|
"mean_token_accuracy": 0.17984056174755098,
|
||
|
|
"num_tokens": 8234440.0,
|
||
|
|
"step": 4530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.944949722290039,
|
||
|
|
"epoch": 3.896433175762785,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0003880370565631754,
|
||
|
|
"loss": 5.4373,
|
||
|
|
"mean_token_accuracy": 0.19602712541818618,
|
||
|
|
"num_tokens": 8243707.0,
|
||
|
|
"step": 4535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.938224267959595,
|
||
|
|
"epoch": 3.9007305543618394,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00038774945438870337,
|
||
|
|
"loss": 5.6105,
|
||
|
|
"mean_token_accuracy": 0.18423481285572052,
|
||
|
|
"num_tokens": 8254223.0,
|
||
|
|
"step": 4540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.871773719787598,
|
||
|
|
"epoch": 3.905027932960894,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00038746160601448845,
|
||
|
|
"loss": 5.465,
|
||
|
|
"mean_token_accuracy": 0.1903871014714241,
|
||
|
|
"num_tokens": 8263105.0,
|
||
|
|
"step": 4545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.857735824584961,
|
||
|
|
"epoch": 3.9093253115599484,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0003871735120690766,
|
||
|
|
"loss": 5.5241,
|
||
|
|
"mean_token_accuracy": 0.18961958587169647,
|
||
|
|
"num_tokens": 8271478.0,
|
||
|
|
"step": 4550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.936745357513428,
|
||
|
|
"epoch": 3.913622690159003,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0003868851731815497,
|
||
|
|
"loss": 5.5649,
|
||
|
|
"mean_token_accuracy": 0.1800309345126152,
|
||
|
|
"num_tokens": 8280396.0,
|
||
|
|
"step": 4555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.948010683059692,
|
||
|
|
"epoch": 3.9179200687580575,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003865965899815247,
|
||
|
|
"loss": 5.5559,
|
||
|
|
"mean_token_accuracy": 0.18653638958930968,
|
||
|
|
"num_tokens": 8290371.0,
|
||
|
|
"step": 4560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.885638093948364,
|
||
|
|
"epoch": 3.922217447357112,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0003863077630991518,
|
||
|
|
"loss": 5.4559,
|
||
|
|
"mean_token_accuracy": 0.1984282374382019,
|
||
|
|
"num_tokens": 8298976.0,
|
||
|
|
"step": 4565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.830101728439331,
|
||
|
|
"epoch": 3.9265148259561666,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003860186931651139,
|
||
|
|
"loss": 5.5129,
|
||
|
|
"mean_token_accuracy": 0.1856519967317581,
|
||
|
|
"num_tokens": 8308752.0,
|
||
|
|
"step": 4570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.904654264450073,
|
||
|
|
"epoch": 3.930812204555221,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0003857293808106238,
|
||
|
|
"loss": 5.5693,
|
||
|
|
"mean_token_accuracy": 0.18588138967752457,
|
||
|
|
"num_tokens": 8317343.0,
|
||
|
|
"step": 4575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.934261655807495,
|
||
|
|
"epoch": 3.9351095831542757,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003854398266674241,
|
||
|
|
"loss": 5.4226,
|
||
|
|
"mean_token_accuracy": 0.19770598262548447,
|
||
|
|
"num_tokens": 8326956.0,
|
||
|
|
"step": 4580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8273883819580075,
|
||
|
|
"epoch": 3.9394069617533303,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.00038515003136778544,
|
||
|
|
"loss": 5.5387,
|
||
|
|
"mean_token_accuracy": 0.18877289444208145,
|
||
|
|
"num_tokens": 8335589.0,
|
||
|
|
"step": 4585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.864310264587402,
|
||
|
|
"epoch": 3.943704340352385,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00038485999554450483,
|
||
|
|
"loss": 5.5134,
|
||
|
|
"mean_token_accuracy": 0.18962926417589188,
|
||
|
|
"num_tokens": 8345517.0,
|
||
|
|
"step": 4590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.81669340133667,
|
||
|
|
"epoch": 3.9480017189514394,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00038456971983090454,
|
||
|
|
"loss": 5.4482,
|
||
|
|
"mean_token_accuracy": 0.19930247962474823,
|
||
|
|
"num_tokens": 8354702.0,
|
||
|
|
"step": 4595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.906301403045655,
|
||
|
|
"epoch": 3.9522990975504944,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0003842792048608309,
|
||
|
|
"loss": 5.4765,
|
||
|
|
"mean_token_accuracy": 0.19456401616334915,
|
||
|
|
"num_tokens": 8362940.0,
|
||
|
|
"step": 4600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.906610107421875,
|
||
|
|
"epoch": 3.956596476149549,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0003839884512686523,
|
||
|
|
"loss": 5.5178,
|
||
|
|
"mean_token_accuracy": 0.19119103550910949,
|
||
|
|
"num_tokens": 8372034.0,
|
||
|
|
"step": 4605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.910079717636108,
|
||
|
|
"epoch": 3.9608938547486034,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00038369745968925846,
|
||
|
|
"loss": 5.5487,
|
||
|
|
"mean_token_accuracy": 0.1872400775551796,
|
||
|
|
"num_tokens": 8381673.0,
|
||
|
|
"step": 4610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.925352668762207,
|
||
|
|
"epoch": 3.965191233347658,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00038340623075805875,
|
||
|
|
"loss": 5.4909,
|
||
|
|
"mean_token_accuracy": 0.1889455035328865,
|
||
|
|
"num_tokens": 8390804.0,
|
||
|
|
"step": 4615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.934152221679687,
|
||
|
|
"epoch": 3.9694886119467125,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00038311476511098053,
|
||
|
|
"loss": 5.5365,
|
||
|
|
"mean_token_accuracy": 0.19448018521070481,
|
||
|
|
"num_tokens": 8399644.0,
|
||
|
|
"step": 4620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.884286642074585,
|
||
|
|
"epoch": 3.973785990545767,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0003828230633844685,
|
||
|
|
"loss": 5.5523,
|
||
|
|
"mean_token_accuracy": 0.19329068064689636,
|
||
|
|
"num_tokens": 8409264.0,
|
||
|
|
"step": 4625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.916780805587768,
|
||
|
|
"epoch": 3.9780833691448216,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00038253112621548243,
|
||
|
|
"loss": 5.496,
|
||
|
|
"mean_token_accuracy": 0.186178120970726,
|
||
|
|
"num_tokens": 8418383.0,
|
||
|
|
"step": 4630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.926163101196289,
|
||
|
|
"epoch": 3.982380747743876,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0003822389542414966,
|
||
|
|
"loss": 5.5232,
|
||
|
|
"mean_token_accuracy": 0.18829717487096786,
|
||
|
|
"num_tokens": 8427411.0,
|
||
|
|
"step": 4635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.882813405990601,
|
||
|
|
"epoch": 3.9866781263429307,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00038194654810049775,
|
||
|
|
"loss": 5.4629,
|
||
|
|
"mean_token_accuracy": 0.18817957490682602,
|
||
|
|
"num_tokens": 8435537.0,
|
||
|
|
"step": 4640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.882016706466675,
|
||
|
|
"epoch": 3.9909755049419853,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000381653908430984,
|
||
|
|
"loss": 5.5432,
|
||
|
|
"mean_token_accuracy": 0.18621994256973268,
|
||
|
|
"num_tokens": 8444400.0,
|
||
|
|
"step": 4645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.930685234069824,
|
||
|
|
"epoch": 3.99527288354104,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0003813610358719634,
|
||
|
|
"loss": 5.5236,
|
||
|
|
"mean_token_accuracy": 0.1859032317996025,
|
||
|
|
"num_tokens": 8453830.0,
|
||
|
|
"step": 4650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.866905212402344,
|
||
|
|
"epoch": 3.999570262140095,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00038106793106295266,
|
||
|
|
"loss": 5.4873,
|
||
|
|
"mean_token_accuracy": 0.20101941972970963,
|
||
|
|
"num_tokens": 8463033.0,
|
||
|
|
"step": 4655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.898269759284125,
|
||
|
|
"epoch": 4.003437902879243,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0003807745946439754,
|
||
|
|
"loss": 5.2703,
|
||
|
|
"mean_token_accuracy": 0.20677175455623203,
|
||
|
|
"num_tokens": 8470740.0,
|
||
|
|
"step": 4660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.857395029067993,
|
||
|
|
"epoch": 4.007735281478298,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0003804810272555612,
|
||
|
|
"loss": 5.2529,
|
||
|
|
"mean_token_accuracy": 0.20413458198308945,
|
||
|
|
"num_tokens": 8480480.0,
|
||
|
|
"step": 4665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.816273021697998,
|
||
|
|
"epoch": 4.012032660077352,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003801872295387439,
|
||
|
|
"loss": 5.2035,
|
||
|
|
"mean_token_accuracy": 0.21528093218803407,
|
||
|
|
"num_tokens": 8489047.0,
|
||
|
|
"step": 4670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.927360010147095,
|
||
|
|
"epoch": 4.016330038676408,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0003798932021350603,
|
||
|
|
"loss": 5.2819,
|
||
|
|
"mean_token_accuracy": 0.20662181824445724,
|
||
|
|
"num_tokens": 8497763.0,
|
||
|
|
"step": 4675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.861963748931885,
|
||
|
|
"epoch": 4.020627417275462,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.00037959894568654864,
|
||
|
|
"loss": 5.2537,
|
||
|
|
"mean_token_accuracy": 0.20978819131851195,
|
||
|
|
"num_tokens": 8506814.0,
|
||
|
|
"step": 4680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.957066392898559,
|
||
|
|
"epoch": 4.024924795874517,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0003793044608357474,
|
||
|
|
"loss": 5.377,
|
||
|
|
"mean_token_accuracy": 0.19830369651317598,
|
||
|
|
"num_tokens": 8516384.0,
|
||
|
|
"step": 4685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.93622350692749,
|
||
|
|
"epoch": 4.0292221744735714,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003790097482256939,
|
||
|
|
"loss": 5.214,
|
||
|
|
"mean_token_accuracy": 0.2048332706093788,
|
||
|
|
"num_tokens": 8524822.0,
|
||
|
|
"step": 4690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.870176839828491,
|
||
|
|
"epoch": 4.033519553072626,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0003787148084999225,
|
||
|
|
"loss": 5.242,
|
||
|
|
"mean_token_accuracy": 0.2090427428483963,
|
||
|
|
"num_tokens": 8534129.0,
|
||
|
|
"step": 4695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8284914016723635,
|
||
|
|
"epoch": 4.0378169316716805,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00037841964230246394,
|
||
|
|
"loss": 5.3055,
|
||
|
|
"mean_token_accuracy": 0.20019746124744414,
|
||
|
|
"num_tokens": 8543235.0,
|
||
|
|
"step": 4700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8483837127685545,
|
||
|
|
"epoch": 4.042114310270735,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003781242502778429,
|
||
|
|
"loss": 5.2003,
|
||
|
|
"mean_token_accuracy": 0.22053535431623458,
|
||
|
|
"num_tokens": 8551903.0,
|
||
|
|
"step": 4705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.880414295196533,
|
||
|
|
"epoch": 4.04641168886979,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00037782863307107785,
|
||
|
|
"loss": 5.287,
|
||
|
|
"mean_token_accuracy": 0.20505535304546357,
|
||
|
|
"num_tokens": 8561173.0,
|
||
|
|
"step": 4710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.899335432052612,
|
||
|
|
"epoch": 4.050709067468844,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00037753279132767833,
|
||
|
|
"loss": 5.1929,
|
||
|
|
"mean_token_accuracy": 0.21593824326992034,
|
||
|
|
"num_tokens": 8569789.0,
|
||
|
|
"step": 4715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.804694700241089,
|
||
|
|
"epoch": 4.055006446067899,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00037723672569364453,
|
||
|
|
"loss": 5.1963,
|
||
|
|
"mean_token_accuracy": 0.20983130037784575,
|
||
|
|
"num_tokens": 8577971.0,
|
||
|
|
"step": 4720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.866218900680542,
|
||
|
|
"epoch": 4.059303824666953,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00037694043681546545,
|
||
|
|
"loss": 5.2858,
|
||
|
|
"mean_token_accuracy": 0.2029922142624855,
|
||
|
|
"num_tokens": 8587299.0,
|
||
|
|
"step": 4725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.831310987472534,
|
||
|
|
"epoch": 4.063601203266008,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0003766439253401177,
|
||
|
|
"loss": 5.2472,
|
||
|
|
"mean_token_accuracy": 0.20737850219011306,
|
||
|
|
"num_tokens": 8595813.0,
|
||
|
|
"step": 4730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.844350147247314,
|
||
|
|
"epoch": 4.067898581865062,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00037634719191506367,
|
||
|
|
"loss": 5.2617,
|
||
|
|
"mean_token_accuracy": 0.21165675073862075,
|
||
|
|
"num_tokens": 8604552.0,
|
||
|
|
"step": 4735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.796354818344116,
|
||
|
|
"epoch": 4.072195960464117,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.00037605023718825065,
|
||
|
|
"loss": 5.2002,
|
||
|
|
"mean_token_accuracy": 0.2150500625371933,
|
||
|
|
"num_tokens": 8612701.0,
|
||
|
|
"step": 4740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.846735095977783,
|
||
|
|
"epoch": 4.0764933390631715,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000375753061808109,
|
||
|
|
"loss": 5.2598,
|
||
|
|
"mean_token_accuracy": 0.20762900859117508,
|
||
|
|
"num_tokens": 8622699.0,
|
||
|
|
"step": 4745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.842225646972656,
|
||
|
|
"epoch": 4.080790717662226,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00037545566642355107,
|
||
|
|
"loss": 5.2295,
|
||
|
|
"mean_token_accuracy": 0.20560641288757325,
|
||
|
|
"num_tokens": 8631821.0,
|
||
|
|
"step": 4750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.840038156509399,
|
||
|
|
"epoch": 4.0850880962612806,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0003751580516839695,
|
||
|
|
"loss": 5.202,
|
||
|
|
"mean_token_accuracy": 0.20931526124477387,
|
||
|
|
"num_tokens": 8641814.0,
|
||
|
|
"step": 4755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.884950733184814,
|
||
|
|
"epoch": 4.089385474860335,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00037486021823923574,
|
||
|
|
"loss": 5.286,
|
||
|
|
"mean_token_accuracy": 0.20766208320856094,
|
||
|
|
"num_tokens": 8649649.0,
|
||
|
|
"step": 4760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.810858106613159,
|
||
|
|
"epoch": 4.09368285345939,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00037456216673969925,
|
||
|
|
"loss": 5.2206,
|
||
|
|
"mean_token_accuracy": 0.21204735338687897,
|
||
|
|
"num_tokens": 8658216.0,
|
||
|
|
"step": 4765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.874101734161377,
|
||
|
|
"epoch": 4.097980232058444,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0003742638978361851,
|
||
|
|
"loss": 5.2958,
|
||
|
|
"mean_token_accuracy": 0.20435795933008194,
|
||
|
|
"num_tokens": 8667725.0,
|
||
|
|
"step": 4770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.781695938110351,
|
||
|
|
"epoch": 4.102277610657499,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00037396541217999367,
|
||
|
|
"loss": 5.1561,
|
||
|
|
"mean_token_accuracy": 0.2138916879892349,
|
||
|
|
"num_tokens": 8675739.0,
|
||
|
|
"step": 4775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.839225959777832,
|
||
|
|
"epoch": 4.106574989256553,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0003736667104228981,
|
||
|
|
"loss": 5.2313,
|
||
|
|
"mean_token_accuracy": 0.21251195222139357,
|
||
|
|
"num_tokens": 8685764.0,
|
||
|
|
"step": 4780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8689206600189205,
|
||
|
|
"epoch": 4.110872367855608,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.00037336779321714376,
|
||
|
|
"loss": 5.2059,
|
||
|
|
"mean_token_accuracy": 0.21196469962596892,
|
||
|
|
"num_tokens": 8695476.0,
|
||
|
|
"step": 4785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.80074520111084,
|
||
|
|
"epoch": 4.115169746454662,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00037306866121544633,
|
||
|
|
"loss": 5.2825,
|
||
|
|
"mean_token_accuracy": 0.20670025944709777,
|
||
|
|
"num_tokens": 8705544.0,
|
||
|
|
"step": 4790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.860075855255127,
|
||
|
|
"epoch": 4.119467125053717,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0003727693150709904,
|
||
|
|
"loss": 5.2645,
|
||
|
|
"mean_token_accuracy": 0.20871647000312804,
|
||
|
|
"num_tokens": 8714883.0,
|
||
|
|
"step": 4795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.886887168884277,
|
||
|
|
"epoch": 4.1237645036527715,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00037246975543742843,
|
||
|
|
"loss": 5.3176,
|
||
|
|
"mean_token_accuracy": 0.20150526314973832,
|
||
|
|
"num_tokens": 8724589.0,
|
||
|
|
"step": 4800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.745695161819458,
|
||
|
|
"epoch": 4.128061882251826,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000372169982968879,
|
||
|
|
"loss": 5.1867,
|
||
|
|
"mean_token_accuracy": 0.20965181291103363,
|
||
|
|
"num_tokens": 8733771.0,
|
||
|
|
"step": 4805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.845971202850341,
|
||
|
|
"epoch": 4.132359260850881,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0003718699983199252,
|
||
|
|
"loss": 5.2624,
|
||
|
|
"mean_token_accuracy": 0.20873973071575164,
|
||
|
|
"num_tokens": 8742348.0,
|
||
|
|
"step": 4810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7872912883758545,
|
||
|
|
"epoch": 4.136656639449935,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0003715698021456137,
|
||
|
|
"loss": 5.2081,
|
||
|
|
"mean_token_accuracy": 0.21571390181779862,
|
||
|
|
"num_tokens": 8751357.0,
|
||
|
|
"step": 4815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7935162544250485,
|
||
|
|
"epoch": 4.1409540180489905,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00037126939510145294,
|
||
|
|
"loss": 5.2631,
|
||
|
|
"mean_token_accuracy": 0.21045506447553636,
|
||
|
|
"num_tokens": 8760813.0,
|
||
|
|
"step": 4820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.919540929794311,
|
||
|
|
"epoch": 4.145251396648045,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003709687778434118,
|
||
|
|
"loss": 5.3088,
|
||
|
|
"mean_token_accuracy": 0.20338443517684937,
|
||
|
|
"num_tokens": 8770228.0,
|
||
|
|
"step": 4825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.766780090332031,
|
||
|
|
"epoch": 4.1495487752471,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.0003706679510279183,
|
||
|
|
"loss": 5.1405,
|
||
|
|
"mean_token_accuracy": 0.2135200873017311,
|
||
|
|
"num_tokens": 8779351.0,
|
||
|
|
"step": 4830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.818261432647705,
|
||
|
|
"epoch": 4.153846153846154,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.0003703669153118578,
|
||
|
|
"loss": 5.3029,
|
||
|
|
"mean_token_accuracy": 0.20108458995819092,
|
||
|
|
"num_tokens": 8789116.0,
|
||
|
|
"step": 4835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.810438871383667,
|
||
|
|
"epoch": 4.158143532445209,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00037006567135257216,
|
||
|
|
"loss": 5.2702,
|
||
|
|
"mean_token_accuracy": 0.20288445353507994,
|
||
|
|
"num_tokens": 8797790.0,
|
||
|
|
"step": 4840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.865516614913941,
|
||
|
|
"epoch": 4.162440911044263,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00036976421980785764,
|
||
|
|
"loss": 5.3081,
|
||
|
|
"mean_token_accuracy": 0.2026110991835594,
|
||
|
|
"num_tokens": 8808067.0,
|
||
|
|
"step": 4845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.80728063583374,
|
||
|
|
"epoch": 4.166738289643318,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003694625613359641,
|
||
|
|
"loss": 5.2167,
|
||
|
|
"mean_token_accuracy": 0.21420625150203704,
|
||
|
|
"num_tokens": 8816587.0,
|
||
|
|
"step": 4850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.843136548995972,
|
||
|
|
"epoch": 4.171035668242372,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.0003691606965955929,
|
||
|
|
"loss": 5.2734,
|
||
|
|
"mean_token_accuracy": 0.20686964243650435,
|
||
|
|
"num_tokens": 8826045.0,
|
||
|
|
"step": 4855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.781480550765991,
|
||
|
|
"epoch": 4.175333046841427,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000368858626245896,
|
||
|
|
"loss": 5.2662,
|
||
|
|
"mean_token_accuracy": 0.21182646304368974,
|
||
|
|
"num_tokens": 8835427.0,
|
||
|
|
"step": 4860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.802968168258667,
|
||
|
|
"epoch": 4.1796304254404815,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0003685563509464744,
|
||
|
|
"loss": 5.2058,
|
||
|
|
"mean_token_accuracy": 0.21191840171813964,
|
||
|
|
"num_tokens": 8845167.0,
|
||
|
|
"step": 4865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.854573917388916,
|
||
|
|
"epoch": 4.183927804039536,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.00036825387135737647,
|
||
|
|
"loss": 5.2076,
|
||
|
|
"mean_token_accuracy": 0.21366898566484452,
|
||
|
|
"num_tokens": 8853591.0,
|
||
|
|
"step": 4870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.830286979675293,
|
||
|
|
"epoch": 4.188225182638591,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00036795118813909674,
|
||
|
|
"loss": 5.3259,
|
||
|
|
"mean_token_accuracy": 0.19266606420278548,
|
||
|
|
"num_tokens": 8863647.0,
|
||
|
|
"step": 4875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.880206489562989,
|
||
|
|
"epoch": 4.192522561237645,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.00036764830195257437,
|
||
|
|
"loss": 5.2531,
|
||
|
|
"mean_token_accuracy": 0.2108171060681343,
|
||
|
|
"num_tokens": 8872911.0,
|
||
|
|
"step": 4880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.866643857955933,
|
||
|
|
"epoch": 4.1968199398367,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0003673452134591918,
|
||
|
|
"loss": 5.2999,
|
||
|
|
"mean_token_accuracy": 0.2029878944158554,
|
||
|
|
"num_tokens": 8881001.0,
|
||
|
|
"step": 4885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.772600555419922,
|
||
|
|
"epoch": 4.201117318435754,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000367041923320773,
|
||
|
|
"loss": 5.2042,
|
||
|
|
"mean_token_accuracy": 0.21341877430677414,
|
||
|
|
"num_tokens": 8890323.0,
|
||
|
|
"step": 4890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.771191835403442,
|
||
|
|
"epoch": 4.205414697034809,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00036673843219958257,
|
||
|
|
"loss": 5.2368,
|
||
|
|
"mean_token_accuracy": 0.21208913624286652,
|
||
|
|
"num_tokens": 8900471.0,
|
||
|
|
"step": 4895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.88256139755249,
|
||
|
|
"epoch": 4.209712075633863,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003664347407583238,
|
||
|
|
"loss": 5.2863,
|
||
|
|
"mean_token_accuracy": 0.20272428095340728,
|
||
|
|
"num_tokens": 8909320.0,
|
||
|
|
"step": 4900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.836409950256348,
|
||
|
|
"epoch": 4.214009454232918,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003661308496601373,
|
||
|
|
"loss": 5.2072,
|
||
|
|
"mean_token_accuracy": 0.2157358020544052,
|
||
|
|
"num_tokens": 8917453.0,
|
||
|
|
"step": 4905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.788828945159912,
|
||
|
|
"epoch": 4.218306832831972,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.00036582675956859983,
|
||
|
|
"loss": 5.2828,
|
||
|
|
"mean_token_accuracy": 0.2104206308722496,
|
||
|
|
"num_tokens": 8925737.0,
|
||
|
|
"step": 4910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.720648384094238,
|
||
|
|
"epoch": 4.222604211431027,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00036552247114772263,
|
||
|
|
"loss": 5.2101,
|
||
|
|
"mean_token_accuracy": 0.2065804719924927,
|
||
|
|
"num_tokens": 8935475.0,
|
||
|
|
"step": 4915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.83034381866455,
|
||
|
|
"epoch": 4.2269015900300815,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00036521798506194996,
|
||
|
|
"loss": 5.2346,
|
||
|
|
"mean_token_accuracy": 0.21483660042285918,
|
||
|
|
"num_tokens": 8944683.0,
|
||
|
|
"step": 4920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.881083297729492,
|
||
|
|
"epoch": 4.231198968629136,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00036491330197615775,
|
||
|
|
"loss": 5.2826,
|
||
|
|
"mean_token_accuracy": 0.199912728369236,
|
||
|
|
"num_tokens": 8953837.0,
|
||
|
|
"step": 4925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.823856353759766,
|
||
|
|
"epoch": 4.235496347228191,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.00036460842255565197,
|
||
|
|
"loss": 5.3172,
|
||
|
|
"mean_token_accuracy": 0.2043285608291626,
|
||
|
|
"num_tokens": 8964822.0,
|
||
|
|
"step": 4930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.869928026199341,
|
||
|
|
"epoch": 4.239793725827245,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.0003643033474661676,
|
||
|
|
"loss": 5.2965,
|
||
|
|
"mean_token_accuracy": 0.20673907697200775,
|
||
|
|
"num_tokens": 8974363.0,
|
||
|
|
"step": 4935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.82686710357666,
|
||
|
|
"epoch": 4.2440911044263,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00036399807737386657,
|
||
|
|
"loss": 5.2074,
|
||
|
|
"mean_token_accuracy": 0.21254496574401854,
|
||
|
|
"num_tokens": 8983122.0,
|
||
|
|
"step": 4940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.857899141311646,
|
||
|
|
"epoch": 4.248388483025354,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0003636926129453368,
|
||
|
|
"loss": 5.3123,
|
||
|
|
"mean_token_accuracy": 0.20272811949253083,
|
||
|
|
"num_tokens": 8991618.0,
|
||
|
|
"step": 4945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.824826383590699,
|
||
|
|
"epoch": 4.252685861624409,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0003633869548475904,
|
||
|
|
"loss": 5.2415,
|
||
|
|
"mean_token_accuracy": 0.21045928597450256,
|
||
|
|
"num_tokens": 9000128.0,
|
||
|
|
"step": 4950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.775493240356445,
|
||
|
|
"epoch": 4.256983240223463,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0003630811037480627,
|
||
|
|
"loss": 5.2319,
|
||
|
|
"mean_token_accuracy": 0.2093399852514267,
|
||
|
|
"num_tokens": 9008951.0,
|
||
|
|
"step": 4955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.842453670501709,
|
||
|
|
"epoch": 4.261280618822518,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0003627750603146101,
|
||
|
|
"loss": 5.2789,
|
||
|
|
"mean_token_accuracy": 0.2030516341328621,
|
||
|
|
"num_tokens": 9018949.0,
|
||
|
|
"step": 4960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.883487272262573,
|
||
|
|
"epoch": 4.265577997421573,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003624688252155091,
|
||
|
|
"loss": 5.2747,
|
||
|
|
"mean_token_accuracy": 0.20714085996150972,
|
||
|
|
"num_tokens": 9028910.0,
|
||
|
|
"step": 4965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.809985780715943,
|
||
|
|
"epoch": 4.269875376020628,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0003621623991194549,
|
||
|
|
"loss": 5.324,
|
||
|
|
"mean_token_accuracy": 0.19819179475307463,
|
||
|
|
"num_tokens": 9039012.0,
|
||
|
|
"step": 4970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9007415771484375,
|
||
|
|
"epoch": 4.274172754619682,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0003618557826955594,
|
||
|
|
"loss": 5.2954,
|
||
|
|
"mean_token_accuracy": 0.20645973831415176,
|
||
|
|
"num_tokens": 9048639.0,
|
||
|
|
"step": 4975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.815454912185669,
|
||
|
|
"epoch": 4.278470133218737,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00036154897661335063,
|
||
|
|
"loss": 5.2517,
|
||
|
|
"mean_token_accuracy": 0.2086031049489975,
|
||
|
|
"num_tokens": 9057453.0,
|
||
|
|
"step": 4980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.792014074325562,
|
||
|
|
"epoch": 4.2827675118177915,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0003612419815427702,
|
||
|
|
"loss": 5.2826,
|
||
|
|
"mean_token_accuracy": 0.20074526816606522,
|
||
|
|
"num_tokens": 9066761.0,
|
||
|
|
"step": 4985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.858555936813355,
|
||
|
|
"epoch": 4.287064890416846,
|
||
|
|
"grad_norm": 1.4140625,
|
||
|
|
"learning_rate": 0.0003609347981541726,
|
||
|
|
"loss": 5.3553,
|
||
|
|
"mean_token_accuracy": 0.1983863353729248,
|
||
|
|
"num_tokens": 9075535.0,
|
||
|
|
"step": 4990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.862577295303344,
|
||
|
|
"epoch": 4.291362269015901,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00036062742711832376,
|
||
|
|
"loss": 5.257,
|
||
|
|
"mean_token_accuracy": 0.2088131219148636,
|
||
|
|
"num_tokens": 9084559.0,
|
||
|
|
"step": 4995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.811804294586182,
|
||
|
|
"epoch": 4.295659647614955,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0003603198691063991,
|
||
|
|
"loss": 5.2313,
|
||
|
|
"mean_token_accuracy": 0.2083360180258751,
|
||
|
|
"num_tokens": 9093069.0,
|
||
|
|
"step": 5000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.295659647614955,
|
||
|
|
"eval_entropy": 5.572498395636275,
|
||
|
|
"eval_loss": 5.972146987915039,
|
||
|
|
"eval_mean_token_accuracy": 0.17474245199480573,
|
||
|
|
"eval_num_tokens": 9093069.0,
|
||
|
|
"eval_runtime": 2.0519,
|
||
|
|
"eval_samples_per_second": 1729.593,
|
||
|
|
"eval_steps_per_second": 216.382,
|
||
|
|
"step": 5000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8002112865447994,
|
||
|
|
"epoch": 4.29995702621401,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.0003600121247899824,
|
||
|
|
"loss": 5.2227,
|
||
|
|
"mean_token_accuracy": 0.2073623850941658,
|
||
|
|
"num_tokens": 9101914.0,
|
||
|
|
"step": 5005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.834455966949463,
|
||
|
|
"epoch": 4.304254404813064,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00035970419484106404,
|
||
|
|
"loss": 5.2887,
|
||
|
|
"mean_token_accuracy": 0.20548986196517943,
|
||
|
|
"num_tokens": 9110967.0,
|
||
|
|
"step": 5010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.891673374176025,
|
||
|
|
"epoch": 4.308551783412119,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0003593960799320402,
|
||
|
|
"loss": 5.3822,
|
||
|
|
"mean_token_accuracy": 0.19926034808158874,
|
||
|
|
"num_tokens": 9119774.0,
|
||
|
|
"step": 5015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.887394714355469,
|
||
|
|
"epoch": 4.312849162011173,
|
||
|
|
"grad_norm": 1.2890625,
|
||
|
|
"learning_rate": 0.0003590877807357107,
|
||
|
|
"loss": 5.2922,
|
||
|
|
"mean_token_accuracy": 0.20317730754613877,
|
||
|
|
"num_tokens": 9127738.0,
|
||
|
|
"step": 5020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.785108280181885,
|
||
|
|
"epoch": 4.317146540610228,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0003587792979252776,
|
||
|
|
"loss": 5.2629,
|
||
|
|
"mean_token_accuracy": 0.20784647464752198,
|
||
|
|
"num_tokens": 9137060.0,
|
||
|
|
"step": 5025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.777895927429199,
|
||
|
|
"epoch": 4.321443919209282,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0003584706321743442,
|
||
|
|
"loss": 5.1962,
|
||
|
|
"mean_token_accuracy": 0.2092631295323372,
|
||
|
|
"num_tokens": 9145169.0,
|
||
|
|
"step": 5030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.796663856506347,
|
||
|
|
"epoch": 4.325741297808337,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000358161784156913,
|
||
|
|
"loss": 5.2276,
|
||
|
|
"mean_token_accuracy": 0.21179858297109605,
|
||
|
|
"num_tokens": 9154092.0,
|
||
|
|
"step": 5035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.85888671875,
|
||
|
|
"epoch": 4.3300386764073915,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00035785275454738456,
|
||
|
|
"loss": 5.286,
|
||
|
|
"mean_token_accuracy": 0.19925448596477507,
|
||
|
|
"num_tokens": 9162824.0,
|
||
|
|
"step": 5040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7883411884307865,
|
||
|
|
"epoch": 4.334336055006446,
|
||
|
|
"grad_norm": 1.3828125,
|
||
|
|
"learning_rate": 0.00035754354402055635,
|
||
|
|
"loss": 5.1959,
|
||
|
|
"mean_token_accuracy": 0.21434530913829802,
|
||
|
|
"num_tokens": 9170977.0,
|
||
|
|
"step": 5045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.730002689361572,
|
||
|
|
"epoch": 4.338633433605501,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003572341532516202,
|
||
|
|
"loss": 5.2367,
|
||
|
|
"mean_token_accuracy": 0.20432866215705872,
|
||
|
|
"num_tokens": 9179539.0,
|
||
|
|
"step": 5050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.77237024307251,
|
||
|
|
"epoch": 4.342930812204555,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0003569245829161622,
|
||
|
|
"loss": 5.3173,
|
||
|
|
"mean_token_accuracy": 0.20617470294237136,
|
||
|
|
"num_tokens": 9188861.0,
|
||
|
|
"step": 5055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.834583187103272,
|
||
|
|
"epoch": 4.34722819080361,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00035661483369016004,
|
||
|
|
"loss": 5.2608,
|
||
|
|
"mean_token_accuracy": 0.20369923412799834,
|
||
|
|
"num_tokens": 9197724.0,
|
||
|
|
"step": 5060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.79484076499939,
|
||
|
|
"epoch": 4.351525569402664,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003563049062499822,
|
||
|
|
"loss": 5.2692,
|
||
|
|
"mean_token_accuracy": 0.2074078604578972,
|
||
|
|
"num_tokens": 9206375.0,
|
||
|
|
"step": 5065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.755507230758667,
|
||
|
|
"epoch": 4.355822948001719,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.0003559948012723865,
|
||
|
|
"loss": 5.2271,
|
||
|
|
"mean_token_accuracy": 0.21173418909311295,
|
||
|
|
"num_tokens": 9214675.0,
|
||
|
|
"step": 5070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.802625036239624,
|
||
|
|
"epoch": 4.360120326600773,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003556845194345181,
|
||
|
|
"loss": 5.2516,
|
||
|
|
"mean_token_accuracy": 0.20623590499162675,
|
||
|
|
"num_tokens": 9224128.0,
|
||
|
|
"step": 5075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.769022130966187,
|
||
|
|
"epoch": 4.364417705199828,
|
||
|
|
"grad_norm": 1.359375,
|
||
|
|
"learning_rate": 0.0003553740614139086,
|
||
|
|
"loss": 5.1773,
|
||
|
|
"mean_token_accuracy": 0.21178028136491775,
|
||
|
|
"num_tokens": 9232568.0,
|
||
|
|
"step": 5080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.831740474700927,
|
||
|
|
"epoch": 4.368715083798882,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0003550634278884742,
|
||
|
|
"loss": 5.2776,
|
||
|
|
"mean_token_accuracy": 0.2081983670592308,
|
||
|
|
"num_tokens": 9241809.0,
|
||
|
|
"step": 5085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.803788042068481,
|
||
|
|
"epoch": 4.373012462397937,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00035475261953651433,
|
||
|
|
"loss": 5.272,
|
||
|
|
"mean_token_accuracy": 0.20985971093177797,
|
||
|
|
"num_tokens": 9250845.0,
|
||
|
|
"step": 5090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7017419815063475,
|
||
|
|
"epoch": 4.3773098409969915,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00035444163703671026,
|
||
|
|
"loss": 5.2316,
|
||
|
|
"mean_token_accuracy": 0.2108854666352272,
|
||
|
|
"num_tokens": 9259465.0,
|
||
|
|
"step": 5095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.795203113555909,
|
||
|
|
"epoch": 4.381607219596046,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00035413048106812357,
|
||
|
|
"loss": 5.2177,
|
||
|
|
"mean_token_accuracy": 0.21499419659376146,
|
||
|
|
"num_tokens": 9267853.0,
|
||
|
|
"step": 5100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.927629661560059,
|
||
|
|
"epoch": 4.385904598195101,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00035381915231019425,
|
||
|
|
"loss": 5.4268,
|
||
|
|
"mean_token_accuracy": 0.19061524271965027,
|
||
|
|
"num_tokens": 9276664.0,
|
||
|
|
"step": 5105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.820791101455688,
|
||
|
|
"epoch": 4.390201976794156,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0003535076514427401,
|
||
|
|
"loss": 5.2285,
|
||
|
|
"mean_token_accuracy": 0.20389644652605057,
|
||
|
|
"num_tokens": 9285482.0,
|
||
|
|
"step": 5110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.833712720870972,
|
||
|
|
"epoch": 4.39449935539321,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00035319597914595436,
|
||
|
|
"loss": 5.3276,
|
||
|
|
"mean_token_accuracy": 0.19536473900079726,
|
||
|
|
"num_tokens": 9293936.0,
|
||
|
|
"step": 5115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.812803554534912,
|
||
|
|
"epoch": 4.398796733992265,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0003528841361004049,
|
||
|
|
"loss": 5.3509,
|
||
|
|
"mean_token_accuracy": 0.19318777322769165,
|
||
|
|
"num_tokens": 9303998.0,
|
||
|
|
"step": 5120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.777164936065674,
|
||
|
|
"epoch": 4.40309411259132,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0003525721229870323,
|
||
|
|
"loss": 5.3018,
|
||
|
|
"mean_token_accuracy": 0.2057452142238617,
|
||
|
|
"num_tokens": 9313117.0,
|
||
|
|
"step": 5125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.843145132064819,
|
||
|
|
"epoch": 4.407391491190374,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00035225994048714823,
|
||
|
|
"loss": 5.2845,
|
||
|
|
"mean_token_accuracy": 0.205299773812294,
|
||
|
|
"num_tokens": 9321446.0,
|
||
|
|
"step": 5130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.799930763244629,
|
||
|
|
"epoch": 4.411688869789429,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0003519475892824348,
|
||
|
|
"loss": 5.2629,
|
||
|
|
"mean_token_accuracy": 0.20662948340177537,
|
||
|
|
"num_tokens": 9330752.0,
|
||
|
|
"step": 5135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.77738208770752,
|
||
|
|
"epoch": 4.415986248388483,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003516350700549419,
|
||
|
|
"loss": 5.3006,
|
||
|
|
"mean_token_accuracy": 0.20330240875482558,
|
||
|
|
"num_tokens": 9339322.0,
|
||
|
|
"step": 5140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.84840669631958,
|
||
|
|
"epoch": 4.420283626987538,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00035132238348708697,
|
||
|
|
"loss": 5.3297,
|
||
|
|
"mean_token_accuracy": 0.19938498139381408,
|
||
|
|
"num_tokens": 9349024.0,
|
||
|
|
"step": 5145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.926564788818359,
|
||
|
|
"epoch": 4.424581005586592,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.00035100953026165224,
|
||
|
|
"loss": 5.4256,
|
||
|
|
"mean_token_accuracy": 0.197027026116848,
|
||
|
|
"num_tokens": 9358833.0,
|
||
|
|
"step": 5150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.868610525131226,
|
||
|
|
"epoch": 4.428878384185647,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0003506965110617841,
|
||
|
|
"loss": 5.2718,
|
||
|
|
"mean_token_accuracy": 0.2099718302488327,
|
||
|
|
"num_tokens": 9368276.0,
|
||
|
|
"step": 5155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.859810876846313,
|
||
|
|
"epoch": 4.4331757627847015,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0003503833265709915,
|
||
|
|
"loss": 5.3479,
|
||
|
|
"mean_token_accuracy": 0.1974034383893013,
|
||
|
|
"num_tokens": 9378501.0,
|
||
|
|
"step": 5160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.875433778762817,
|
||
|
|
"epoch": 4.437473141383756,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.00035006997747314404,
|
||
|
|
"loss": 5.3298,
|
||
|
|
"mean_token_accuracy": 0.19622083157300949,
|
||
|
|
"num_tokens": 9387789.0,
|
||
|
|
"step": 5165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.835582590103149,
|
||
|
|
"epoch": 4.441770519982811,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00034975646445247106,
|
||
|
|
"loss": 5.3721,
|
||
|
|
"mean_token_accuracy": 0.2014732614159584,
|
||
|
|
"num_tokens": 9397041.0,
|
||
|
|
"step": 5170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.775737285614014,
|
||
|
|
"epoch": 4.446067898581865,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003494427881935596,
|
||
|
|
"loss": 5.3059,
|
||
|
|
"mean_token_accuracy": 0.20452196449041365,
|
||
|
|
"num_tokens": 9405393.0,
|
||
|
|
"step": 5175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.779368114471436,
|
||
|
|
"epoch": 4.45036527718092,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00034912894938135325,
|
||
|
|
"loss": 5.2582,
|
||
|
|
"mean_token_accuracy": 0.20273705422878266,
|
||
|
|
"num_tokens": 9415127.0,
|
||
|
|
"step": 5180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.846761655807495,
|
||
|
|
"epoch": 4.454662655779974,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0003488149487011506,
|
||
|
|
"loss": 5.3699,
|
||
|
|
"mean_token_accuracy": 0.20174208134412766,
|
||
|
|
"num_tokens": 9424416.0,
|
||
|
|
"step": 5185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.890099239349365,
|
||
|
|
"epoch": 4.458960034379029,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00034850078683860346,
|
||
|
|
"loss": 5.3262,
|
||
|
|
"mean_token_accuracy": 0.19683828055858613,
|
||
|
|
"num_tokens": 9434523.0,
|
||
|
|
"step": 5190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.831119251251221,
|
||
|
|
"epoch": 4.463257412978083,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0003481864644797159,
|
||
|
|
"loss": 5.3245,
|
||
|
|
"mean_token_accuracy": 0.2093776971101761,
|
||
|
|
"num_tokens": 9443605.0,
|
||
|
|
"step": 5195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.803278684616089,
|
||
|
|
"epoch": 4.467554791577138,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003478719823108424,
|
||
|
|
"loss": 5.3317,
|
||
|
|
"mean_token_accuracy": 0.19572802931070327,
|
||
|
|
"num_tokens": 9453268.0,
|
||
|
|
"step": 5200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8240362167358395,
|
||
|
|
"epoch": 4.471852170176192,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00034755734101868613,
|
||
|
|
"loss": 5.214,
|
||
|
|
"mean_token_accuracy": 0.2097940504550934,
|
||
|
|
"num_tokens": 9461578.0,
|
||
|
|
"step": 5205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.79837703704834,
|
||
|
|
"epoch": 4.476149548775247,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00034724254129029795,
|
||
|
|
"loss": 5.2436,
|
||
|
|
"mean_token_accuracy": 0.2102679118514061,
|
||
|
|
"num_tokens": 9470722.0,
|
||
|
|
"step": 5210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.837281274795532,
|
||
|
|
"epoch": 4.4804469273743015,
|
||
|
|
"grad_norm": 1.2890625,
|
||
|
|
"learning_rate": 0.0003469275838130748,
|
||
|
|
"loss": 5.3607,
|
||
|
|
"mean_token_accuracy": 0.19933488070964814,
|
||
|
|
"num_tokens": 9479695.0,
|
||
|
|
"step": 5215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8430516719818115,
|
||
|
|
"epoch": 4.484744305973356,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003466124692747577,
|
||
|
|
"loss": 5.2646,
|
||
|
|
"mean_token_accuracy": 0.2044244959950447,
|
||
|
|
"num_tokens": 9488444.0,
|
||
|
|
"step": 5220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.742202806472778,
|
||
|
|
"epoch": 4.489041684572411,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00034629719836343106,
|
||
|
|
"loss": 5.2215,
|
||
|
|
"mean_token_accuracy": 0.21403959393501282,
|
||
|
|
"num_tokens": 9497413.0,
|
||
|
|
"step": 5225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7987758159637455,
|
||
|
|
"epoch": 4.493339063171465,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.0003459817717675203,
|
||
|
|
"loss": 5.2598,
|
||
|
|
"mean_token_accuracy": 0.21579257249832154,
|
||
|
|
"num_tokens": 9506135.0,
|
||
|
|
"step": 5230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.835311031341552,
|
||
|
|
"epoch": 4.49763644177052,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0003456661901757913,
|
||
|
|
"loss": 5.3341,
|
||
|
|
"mean_token_accuracy": 0.20138609558343887,
|
||
|
|
"num_tokens": 9516918.0,
|
||
|
|
"step": 5235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.866192770004273,
|
||
|
|
"epoch": 4.501933820369574,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.00034535045427734796,
|
||
|
|
"loss": 5.276,
|
||
|
|
"mean_token_accuracy": 0.2101076439023018,
|
||
|
|
"num_tokens": 9526052.0,
|
||
|
|
"step": 5240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.733947229385376,
|
||
|
|
"epoch": 4.506231198968629,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.0003450345647616313,
|
||
|
|
"loss": 5.3369,
|
||
|
|
"mean_token_accuracy": 0.2056139588356018,
|
||
|
|
"num_tokens": 9535200.0,
|
||
|
|
"step": 5245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.76122088432312,
|
||
|
|
"epoch": 4.510528577567683,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0003447185223184177,
|
||
|
|
"loss": 5.3074,
|
||
|
|
"mean_token_accuracy": 0.20514743030071259,
|
||
|
|
"num_tokens": 9544786.0,
|
||
|
|
"step": 5250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.871483230590821,
|
||
|
|
"epoch": 4.514825956166739,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00034440232763781765,
|
||
|
|
"loss": 5.2522,
|
||
|
|
"mean_token_accuracy": 0.20949897319078445,
|
||
|
|
"num_tokens": 9553694.0,
|
||
|
|
"step": 5255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.753093576431274,
|
||
|
|
"epoch": 4.519123334765792,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000344085981410274,
|
||
|
|
"loss": 5.3192,
|
||
|
|
"mean_token_accuracy": 0.20984772890806197,
|
||
|
|
"num_tokens": 9563332.0,
|
||
|
|
"step": 5260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.711885738372803,
|
||
|
|
"epoch": 4.523420713364848,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00034376948432656036,
|
||
|
|
"loss": 5.2301,
|
||
|
|
"mean_token_accuracy": 0.2115880087018013,
|
||
|
|
"num_tokens": 9572367.0,
|
||
|
|
"step": 5265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.860666131973266,
|
||
|
|
"epoch": 4.527718091963902,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003434528370777798,
|
||
|
|
"loss": 5.3255,
|
||
|
|
"mean_token_accuracy": 0.19527169466018676,
|
||
|
|
"num_tokens": 9582535.0,
|
||
|
|
"step": 5270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.807507610321045,
|
||
|
|
"epoch": 4.532015470562957,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00034313604035536344,
|
||
|
|
"loss": 5.2775,
|
||
|
|
"mean_token_accuracy": 0.21002310365438462,
|
||
|
|
"num_tokens": 9590688.0,
|
||
|
|
"step": 5275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.773982238769531,
|
||
|
|
"epoch": 4.5363128491620115,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0003428190948510687,
|
||
|
|
"loss": 5.3213,
|
||
|
|
"mean_token_accuracy": 0.2039690524339676,
|
||
|
|
"num_tokens": 9599209.0,
|
||
|
|
"step": 5280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.852875804901123,
|
||
|
|
"epoch": 4.540610227761066,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0003425020012569778,
|
||
|
|
"loss": 5.3626,
|
||
|
|
"mean_token_accuracy": 0.20032234340906144,
|
||
|
|
"num_tokens": 9608575.0,
|
||
|
|
"step": 5285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.903119659423828,
|
||
|
|
"epoch": 4.544907606360121,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.00034218476026549665,
|
||
|
|
"loss": 5.3113,
|
||
|
|
"mean_token_accuracy": 0.2009777992963791,
|
||
|
|
"num_tokens": 9617312.0,
|
||
|
|
"step": 5290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.826537036895752,
|
||
|
|
"epoch": 4.549204984959175,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.0003418673725693524,
|
||
|
|
"loss": 5.2895,
|
||
|
|
"mean_token_accuracy": 0.21229007989168167,
|
||
|
|
"num_tokens": 9626398.0,
|
||
|
|
"step": 5295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.797998762130737,
|
||
|
|
"epoch": 4.55350236355823,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003415498388615932,
|
||
|
|
"loss": 5.2692,
|
||
|
|
"mean_token_accuracy": 0.20089106261730194,
|
||
|
|
"num_tokens": 9635470.0,
|
||
|
|
"step": 5300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.809066820144653,
|
||
|
|
"epoch": 4.557799742157284,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003412321598355857,
|
||
|
|
"loss": 5.213,
|
||
|
|
"mean_token_accuracy": 0.21215442568063736,
|
||
|
|
"num_tokens": 9644728.0,
|
||
|
|
"step": 5305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.776236963272095,
|
||
|
|
"epoch": 4.562097120756339,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0003409143361850139,
|
||
|
|
"loss": 5.2752,
|
||
|
|
"mean_token_accuracy": 0.2105761721730232,
|
||
|
|
"num_tokens": 9654129.0,
|
||
|
|
"step": 5310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.822030639648437,
|
||
|
|
"epoch": 4.566394499355393,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0003405963686038775,
|
||
|
|
"loss": 5.3633,
|
||
|
|
"mean_token_accuracy": 0.1967499941587448,
|
||
|
|
"num_tokens": 9662648.0,
|
||
|
|
"step": 5315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.843867492675781,
|
||
|
|
"epoch": 4.570691877954448,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003402782577864908,
|
||
|
|
"loss": 5.3261,
|
||
|
|
"mean_token_accuracy": 0.20646921396255494,
|
||
|
|
"num_tokens": 9672082.0,
|
||
|
|
"step": 5320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.86830587387085,
|
||
|
|
"epoch": 4.574989256553502,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00033996000442748056,
|
||
|
|
"loss": 5.2528,
|
||
|
|
"mean_token_accuracy": 0.21100070625543593,
|
||
|
|
"num_tokens": 9681422.0,
|
||
|
|
"step": 5325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.829919290542603,
|
||
|
|
"epoch": 4.579286635152557,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.00033964160922178495,
|
||
|
|
"loss": 5.2957,
|
||
|
|
"mean_token_accuracy": 0.206342414021492,
|
||
|
|
"num_tokens": 9690675.0,
|
||
|
|
"step": 5330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.813098335266114,
|
||
|
|
"epoch": 4.5835840137516115,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0003393230728646518,
|
||
|
|
"loss": 5.2833,
|
||
|
|
"mean_token_accuracy": 0.2053971081972122,
|
||
|
|
"num_tokens": 9700200.0,
|
||
|
|
"step": 5335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.761319780349732,
|
||
|
|
"epoch": 4.587881392350666,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00033900439605163724,
|
||
|
|
"loss": 5.2785,
|
||
|
|
"mean_token_accuracy": 0.2027950644493103,
|
||
|
|
"num_tokens": 9709533.0,
|
||
|
|
"step": 5340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.774492692947388,
|
||
|
|
"epoch": 4.592178770949721,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.00033868557947860407,
|
||
|
|
"loss": 5.3247,
|
||
|
|
"mean_token_accuracy": 0.20598720461130143,
|
||
|
|
"num_tokens": 9719250.0,
|
||
|
|
"step": 5345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.826806688308716,
|
||
|
|
"epoch": 4.596476149548775,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00033836662384172014,
|
||
|
|
"loss": 5.243,
|
||
|
|
"mean_token_accuracy": 0.20927662551403045,
|
||
|
|
"num_tokens": 9727837.0,
|
||
|
|
"step": 5350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.759864807128906,
|
||
|
|
"epoch": 4.60077352814783,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0003380475298374573,
|
||
|
|
"loss": 5.3326,
|
||
|
|
"mean_token_accuracy": 0.20309751331806183,
|
||
|
|
"num_tokens": 9737125.0,
|
||
|
|
"step": 5355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.813335514068603,
|
||
|
|
"epoch": 4.605070906746884,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000337728298162589,
|
||
|
|
"loss": 5.3499,
|
||
|
|
"mean_token_accuracy": 0.19702604413032532,
|
||
|
|
"num_tokens": 9746309.0,
|
||
|
|
"step": 5360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.838102722167969,
|
||
|
|
"epoch": 4.609368285345939,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00033740892951418993,
|
||
|
|
"loss": 5.232,
|
||
|
|
"mean_token_accuracy": 0.2094883754849434,
|
||
|
|
"num_tokens": 9755633.0,
|
||
|
|
"step": 5365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.877121877670288,
|
||
|
|
"epoch": 4.613665663944993,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0003370894245896333,
|
||
|
|
"loss": 5.2713,
|
||
|
|
"mean_token_accuracy": 0.19735931158065795,
|
||
|
|
"num_tokens": 9765179.0,
|
||
|
|
"step": 5370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.872338008880615,
|
||
|
|
"epoch": 4.617963042544048,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.00033676978408659047,
|
||
|
|
"loss": 5.2987,
|
||
|
|
"mean_token_accuracy": 0.2016567572951317,
|
||
|
|
"num_tokens": 9774085.0,
|
||
|
|
"step": 5375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.845898246765136,
|
||
|
|
"epoch": 4.622260421143102,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0003364500087030283,
|
||
|
|
"loss": 5.4123,
|
||
|
|
"mean_token_accuracy": 0.19296547174453735,
|
||
|
|
"num_tokens": 9784650.0,
|
||
|
|
"step": 5380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.869012546539307,
|
||
|
|
"epoch": 4.626557799742157,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00033613009913720845,
|
||
|
|
"loss": 5.2707,
|
||
|
|
"mean_token_accuracy": 0.20299201905727388,
|
||
|
|
"num_tokens": 9793947.0,
|
||
|
|
"step": 5385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.734190225601196,
|
||
|
|
"epoch": 4.6308551783412115,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.00033581005608768563,
|
||
|
|
"loss": 5.2453,
|
||
|
|
"mean_token_accuracy": 0.2124895542860031,
|
||
|
|
"num_tokens": 9803593.0,
|
||
|
|
"step": 5390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.793021965026855,
|
||
|
|
"epoch": 4.635152556940266,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003354898802533058,
|
||
|
|
"loss": 5.2855,
|
||
|
|
"mean_token_accuracy": 0.20431207865476608,
|
||
|
|
"num_tokens": 9812295.0,
|
||
|
|
"step": 5395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.791452312469483,
|
||
|
|
"epoch": 4.6394499355393215,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0003351695723332051,
|
||
|
|
"loss": 5.2934,
|
||
|
|
"mean_token_accuracy": 0.2097485601902008,
|
||
|
|
"num_tokens": 9820586.0,
|
||
|
|
"step": 5400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.798425960540771,
|
||
|
|
"epoch": 4.643747314138375,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00033484913302680807,
|
||
|
|
"loss": 5.2279,
|
||
|
|
"mean_token_accuracy": 0.21040427088737487,
|
||
|
|
"num_tokens": 9829080.0,
|
||
|
|
"step": 5405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.796739912033081,
|
||
|
|
"epoch": 4.648044692737431,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00033452856303382595,
|
||
|
|
"loss": 5.2475,
|
||
|
|
"mean_token_accuracy": 0.20435117036104203,
|
||
|
|
"num_tokens": 9838421.0,
|
||
|
|
"step": 5410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.759791278839112,
|
||
|
|
"epoch": 4.652342071336484,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.0003342078630542555,
|
||
|
|
"loss": 5.2524,
|
||
|
|
"mean_token_accuracy": 0.21281823366880417,
|
||
|
|
"num_tokens": 9847151.0,
|
||
|
|
"step": 5415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.807016801834107,
|
||
|
|
"epoch": 4.65663944993554,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00033388703378837737,
|
||
|
|
"loss": 5.275,
|
||
|
|
"mean_token_accuracy": 0.20886558741331102,
|
||
|
|
"num_tokens": 9856803.0,
|
||
|
|
"step": 5420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.791787147521973,
|
||
|
|
"epoch": 4.660936828534594,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003335660759367544,
|
||
|
|
"loss": 5.1847,
|
||
|
|
"mean_token_accuracy": 0.22501839995384215,
|
||
|
|
"num_tokens": 9865617.0,
|
||
|
|
"step": 5425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.765948724746704,
|
||
|
|
"epoch": 4.665234207133649,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00033324499020023025,
|
||
|
|
"loss": 5.2534,
|
||
|
|
"mean_token_accuracy": 0.21098006069660186,
|
||
|
|
"num_tokens": 9875454.0,
|
||
|
|
"step": 5430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.817541313171387,
|
||
|
|
"epoch": 4.669531585732703,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003329237772799277,
|
||
|
|
"loss": 5.2502,
|
||
|
|
"mean_token_accuracy": 0.20961165130138398,
|
||
|
|
"num_tokens": 9884770.0,
|
||
|
|
"step": 5435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.783469343185425,
|
||
|
|
"epoch": 4.673828964331758,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.0003326024378772477,
|
||
|
|
"loss": 5.2538,
|
||
|
|
"mean_token_accuracy": 0.2091410353779793,
|
||
|
|
"num_tokens": 9893594.0,
|
||
|
|
"step": 5440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.793620014190674,
|
||
|
|
"epoch": 4.678126342930812,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0003322809726938667,
|
||
|
|
"loss": 5.3607,
|
||
|
|
"mean_token_accuracy": 0.19666333645582199,
|
||
|
|
"num_tokens": 9902260.0,
|
||
|
|
"step": 5445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.804405307769775,
|
||
|
|
"epoch": 4.682423721529867,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00033195938243173645,
|
||
|
|
"loss": 5.2657,
|
||
|
|
"mean_token_accuracy": 0.20829562693834305,
|
||
|
|
"num_tokens": 9911020.0,
|
||
|
|
"step": 5450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8101622581481935,
|
||
|
|
"epoch": 4.6867211001289215,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.0003316376677930814,
|
||
|
|
"loss": 5.277,
|
||
|
|
"mean_token_accuracy": 0.20017611235380173,
|
||
|
|
"num_tokens": 9918696.0,
|
||
|
|
"step": 5455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.745956611633301,
|
||
|
|
"epoch": 4.691018478727976,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0003313158294803977,
|
||
|
|
"loss": 5.3171,
|
||
|
|
"mean_token_accuracy": 0.1995955988764763,
|
||
|
|
"num_tokens": 9927638.0,
|
||
|
|
"step": 5460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.824975442886353,
|
||
|
|
"epoch": 4.695315857327031,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00033099386819645176,
|
||
|
|
"loss": 5.2912,
|
||
|
|
"mean_token_accuracy": 0.20382552444934846,
|
||
|
|
"num_tokens": 9936969.0,
|
||
|
|
"step": 5465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.796650314331055,
|
||
|
|
"epoch": 4.699613235926085,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0003306717846442782,
|
||
|
|
"loss": 5.1993,
|
||
|
|
"mean_token_accuracy": 0.20417630672454834,
|
||
|
|
"num_tokens": 9945229.0,
|
||
|
|
"step": 5470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7901218891143795,
|
||
|
|
"epoch": 4.70391061452514,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0003303495795271788,
|
||
|
|
"loss": 5.1995,
|
||
|
|
"mean_token_accuracy": 0.20233412235975265,
|
||
|
|
"num_tokens": 9953759.0,
|
||
|
|
"step": 5475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.770085334777832,
|
||
|
|
"epoch": 4.708207993124194,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00033002725354872075,
|
||
|
|
"loss": 5.3092,
|
||
|
|
"mean_token_accuracy": 0.2047215849161148,
|
||
|
|
"num_tokens": 9962771.0,
|
||
|
|
"step": 5480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.800899696350098,
|
||
|
|
"epoch": 4.712505371723249,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.00032970480741273514,
|
||
|
|
"loss": 5.3104,
|
||
|
|
"mean_token_accuracy": 0.19106538593769073,
|
||
|
|
"num_tokens": 9972481.0,
|
||
|
|
"step": 5485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8685791015625,
|
||
|
|
"epoch": 4.716802750322303,
|
||
|
|
"grad_norm": 1.390625,
|
||
|
|
"learning_rate": 0.0003293822418233155,
|
||
|
|
"loss": 5.256,
|
||
|
|
"mean_token_accuracy": 0.2051583468914032,
|
||
|
|
"num_tokens": 9980773.0,
|
||
|
|
"step": 5490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8781898021698,
|
||
|
|
"epoch": 4.721100128921358,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003290595574848161,
|
||
|
|
"loss": 5.3453,
|
||
|
|
"mean_token_accuracy": 0.19384868294000626,
|
||
|
|
"num_tokens": 9989830.0,
|
||
|
|
"step": 5495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.756228923797607,
|
||
|
|
"epoch": 4.725397507520412,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0003287367551018505,
|
||
|
|
"loss": 5.272,
|
||
|
|
"mean_token_accuracy": 0.20579312443733216,
|
||
|
|
"num_tokens": 9999234.0,
|
||
|
|
"step": 5500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.725397507520412,
|
||
|
|
"eval_entropy": 5.592626677977072,
|
||
|
|
"eval_loss": 5.931019306182861,
|
||
|
|
"eval_mean_token_accuracy": 0.17753368537235367,
|
||
|
|
"eval_num_tokens": 9999234.0,
|
||
|
|
"eval_runtime": 2.0334,
|
||
|
|
"eval_samples_per_second": 1745.336,
|
||
|
|
"eval_steps_per_second": 218.351,
|
||
|
|
"step": 5500
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 11630,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 2253283385886720.0,
|
||
|
|
"train_batch_size": 16,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|