7074 lines
159 KiB
JSON
7074 lines
159 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 2.2461814914645104,
|
||
|
|
"eval_steps": 1000,
|
||
|
|
"global_step": 5000,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0022461814914645105,
|
||
|
|
"grad_norm": 54.0,
|
||
|
|
"learning_rate": 7.499999999999999e-07,
|
||
|
|
"loss": 10.989,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.004492362982929021,
|
||
|
|
"grad_norm": 52.75,
|
||
|
|
"learning_rate": 1.4999999999999998e-06,
|
||
|
|
"loss": 10.984,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.006738544474393531,
|
||
|
|
"grad_norm": 52.5,
|
||
|
|
"learning_rate": 2.2499999999999996e-06,
|
||
|
|
"loss": 10.9491,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008984725965858042,
|
||
|
|
"grad_norm": 50.25,
|
||
|
|
"learning_rate": 2.9999999999999997e-06,
|
||
|
|
"loss": 10.8608,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.011230907457322551,
|
||
|
|
"grad_norm": 44.75,
|
||
|
|
"learning_rate": 3.7499999999999997e-06,
|
||
|
|
"loss": 10.7375,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.013477088948787063,
|
||
|
|
"grad_norm": 38.0,
|
||
|
|
"learning_rate": 4.499999999999999e-06,
|
||
|
|
"loss": 10.5621,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.015723270440251572,
|
||
|
|
"grad_norm": 25.5,
|
||
|
|
"learning_rate": 5.25e-06,
|
||
|
|
"loss": 10.3304,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.017969451931716084,
|
||
|
|
"grad_norm": 19.25,
|
||
|
|
"learning_rate": 5.999999999999999e-06,
|
||
|
|
"loss": 10.1403,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02021563342318059,
|
||
|
|
"grad_norm": 13.8125,
|
||
|
|
"learning_rate": 6.749999999999999e-06,
|
||
|
|
"loss": 9.9521,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.022461814914645103,
|
||
|
|
"grad_norm": 11.1875,
|
||
|
|
"learning_rate": 7.499999999999999e-06,
|
||
|
|
"loss": 9.843,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024707996406109614,
|
||
|
|
"grad_norm": 10.5,
|
||
|
|
"learning_rate": 8.249999999999999e-06,
|
||
|
|
"loss": 9.7584,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.026954177897574125,
|
||
|
|
"grad_norm": 10.0625,
|
||
|
|
"learning_rate": 8.999999999999999e-06,
|
||
|
|
"loss": 9.7293,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.029200359389038633,
|
||
|
|
"grad_norm": 9.1875,
|
||
|
|
"learning_rate": 9.75e-06,
|
||
|
|
"loss": 9.719,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.031446540880503145,
|
||
|
|
"grad_norm": 8.9375,
|
||
|
|
"learning_rate": 1.05e-05,
|
||
|
|
"loss": 9.6908,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03369272237196765,
|
||
|
|
"grad_norm": 9.125,
|
||
|
|
"learning_rate": 1.1249999999999999e-05,
|
||
|
|
"loss": 9.6617,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03593890386343217,
|
||
|
|
"grad_norm": 9.0625,
|
||
|
|
"learning_rate": 1.1999999999999999e-05,
|
||
|
|
"loss": 9.6228,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.038185085354896675,
|
||
|
|
"grad_norm": 9.125,
|
||
|
|
"learning_rate": 1.275e-05,
|
||
|
|
"loss": 9.6069,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04043126684636118,
|
||
|
|
"grad_norm": 9.3125,
|
||
|
|
"learning_rate": 1.3499999999999998e-05,
|
||
|
|
"loss": 9.5342,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0426774483378257,
|
||
|
|
"grad_norm": 8.9375,
|
||
|
|
"learning_rate": 1.4249999999999999e-05,
|
||
|
|
"loss": 9.5187,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.044923629829290206,
|
||
|
|
"grad_norm": 9.125,
|
||
|
|
"learning_rate": 1.4999999999999999e-05,
|
||
|
|
"loss": 9.4719,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04716981132075472,
|
||
|
|
"grad_norm": 9.0,
|
||
|
|
"learning_rate": 1.5749999999999997e-05,
|
||
|
|
"loss": 9.4167,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04941599281221923,
|
||
|
|
"grad_norm": 8.9375,
|
||
|
|
"learning_rate": 1.6499999999999998e-05,
|
||
|
|
"loss": 9.3825,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.051662174303683736,
|
||
|
|
"grad_norm": 8.75,
|
||
|
|
"learning_rate": 1.725e-05,
|
||
|
|
"loss": 9.3577,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05390835579514825,
|
||
|
|
"grad_norm": 8.625,
|
||
|
|
"learning_rate": 1.7999999999999997e-05,
|
||
|
|
"loss": 9.3387,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05615453728661276,
|
||
|
|
"grad_norm": 9.375,
|
||
|
|
"learning_rate": 1.875e-05,
|
||
|
|
"loss": 9.2947,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05840071877807727,
|
||
|
|
"grad_norm": 8.75,
|
||
|
|
"learning_rate": 1.95e-05,
|
||
|
|
"loss": 9.2177,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06064690026954178,
|
||
|
|
"grad_norm": 8.8125,
|
||
|
|
"learning_rate": 2.025e-05,
|
||
|
|
"loss": 9.1683,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06289308176100629,
|
||
|
|
"grad_norm": 9.875,
|
||
|
|
"learning_rate": 2.1e-05,
|
||
|
|
"loss": 9.1444,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0651392632524708,
|
||
|
|
"grad_norm": 9.625,
|
||
|
|
"learning_rate": 2.1749999999999997e-05,
|
||
|
|
"loss": 9.0632,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0673854447439353,
|
||
|
|
"grad_norm": 8.8125,
|
||
|
|
"learning_rate": 2.2499999999999998e-05,
|
||
|
|
"loss": 9.0828,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06963162623539983,
|
||
|
|
"grad_norm": 9.5625,
|
||
|
|
"learning_rate": 2.325e-05,
|
||
|
|
"loss": 9.0005,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07187780772686433,
|
||
|
|
"grad_norm": 11.1875,
|
||
|
|
"learning_rate": 2.3999999999999997e-05,
|
||
|
|
"loss": 8.9463,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07412398921832884,
|
||
|
|
"grad_norm": 9.3125,
|
||
|
|
"learning_rate": 2.475e-05,
|
||
|
|
"loss": 8.9145,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07637017070979335,
|
||
|
|
"grad_norm": 8.1875,
|
||
|
|
"learning_rate": 2.55e-05,
|
||
|
|
"loss": 8.8803,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07861635220125786,
|
||
|
|
"grad_norm": 7.65625,
|
||
|
|
"learning_rate": 2.6249999999999998e-05,
|
||
|
|
"loss": 8.8266,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08086253369272237,
|
||
|
|
"grad_norm": 7.78125,
|
||
|
|
"learning_rate": 2.6999999999999996e-05,
|
||
|
|
"loss": 8.7826,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08310871518418689,
|
||
|
|
"grad_norm": 8.875,
|
||
|
|
"learning_rate": 2.7749999999999997e-05,
|
||
|
|
"loss": 8.7463,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0853548966756514,
|
||
|
|
"grad_norm": 8.375,
|
||
|
|
"learning_rate": 2.8499999999999998e-05,
|
||
|
|
"loss": 8.6836,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0876010781671159,
|
||
|
|
"grad_norm": 8.5,
|
||
|
|
"learning_rate": 2.925e-05,
|
||
|
|
"loss": 8.6827,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08984725965858041,
|
||
|
|
"grad_norm": 8.25,
|
||
|
|
"learning_rate": 2.9999999999999997e-05,
|
||
|
|
"loss": 8.588,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09209344115004492,
|
||
|
|
"grad_norm": 8.3125,
|
||
|
|
"learning_rate": 3.0749999999999995e-05,
|
||
|
|
"loss": 8.5417,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09433962264150944,
|
||
|
|
"grad_norm": 9.4375,
|
||
|
|
"learning_rate": 3.149999999999999e-05,
|
||
|
|
"loss": 8.5287,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09658580413297395,
|
||
|
|
"grad_norm": 8.125,
|
||
|
|
"learning_rate": 3.225e-05,
|
||
|
|
"loss": 8.49,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09883198562443846,
|
||
|
|
"grad_norm": 7.59375,
|
||
|
|
"learning_rate": 3.2999999999999996e-05,
|
||
|
|
"loss": 8.4025,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10107816711590296,
|
||
|
|
"grad_norm": 8.75,
|
||
|
|
"learning_rate": 3.375e-05,
|
||
|
|
"loss": 8.3121,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10332434860736747,
|
||
|
|
"grad_norm": 7.8125,
|
||
|
|
"learning_rate": 3.45e-05,
|
||
|
|
"loss": 8.2635,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10557053009883198,
|
||
|
|
"grad_norm": 8.3125,
|
||
|
|
"learning_rate": 3.5249999999999996e-05,
|
||
|
|
"loss": 8.2691,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1078167115902965,
|
||
|
|
"grad_norm": 9.4375,
|
||
|
|
"learning_rate": 3.5999999999999994e-05,
|
||
|
|
"loss": 8.1828,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11006289308176101,
|
||
|
|
"grad_norm": 7.0625,
|
||
|
|
"learning_rate": 3.675e-05,
|
||
|
|
"loss": 8.0901,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11230907457322552,
|
||
|
|
"grad_norm": 8.125,
|
||
|
|
"learning_rate": 3.75e-05,
|
||
|
|
"loss": 8.0418,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11455525606469003,
|
||
|
|
"grad_norm": 7.0625,
|
||
|
|
"learning_rate": 3.8249999999999995e-05,
|
||
|
|
"loss": 8.0148,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11680143755615453,
|
||
|
|
"grad_norm": 7.5625,
|
||
|
|
"learning_rate": 3.9e-05,
|
||
|
|
"loss": 7.9943,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11904761904761904,
|
||
|
|
"grad_norm": 7.0625,
|
||
|
|
"learning_rate": 3.975e-05,
|
||
|
|
"loss": 7.852,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12129380053908356,
|
||
|
|
"grad_norm": 6.6875,
|
||
|
|
"learning_rate": 4.05e-05,
|
||
|
|
"loss": 7.8506,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12353998203054807,
|
||
|
|
"grad_norm": 7.46875,
|
||
|
|
"learning_rate": 4.125e-05,
|
||
|
|
"loss": 7.7912,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12578616352201258,
|
||
|
|
"grad_norm": 6.0,
|
||
|
|
"learning_rate": 4.2e-05,
|
||
|
|
"loss": 7.7331,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1280323450134771,
|
||
|
|
"grad_norm": 6.75,
|
||
|
|
"learning_rate": 4.2749999999999996e-05,
|
||
|
|
"loss": 7.6362,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1302785265049416,
|
||
|
|
"grad_norm": 5.9375,
|
||
|
|
"learning_rate": 4.3499999999999993e-05,
|
||
|
|
"loss": 7.5867,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13252470799640612,
|
||
|
|
"grad_norm": 6.40625,
|
||
|
|
"learning_rate": 4.424999999999999e-05,
|
||
|
|
"loss": 7.5268,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1347708894878706,
|
||
|
|
"grad_norm": 5.71875,
|
||
|
|
"learning_rate": 4.4999999999999996e-05,
|
||
|
|
"loss": 7.5554,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13701707097933513,
|
||
|
|
"grad_norm": 5.5,
|
||
|
|
"learning_rate": 4.5749999999999994e-05,
|
||
|
|
"loss": 7.4486,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13926325247079965,
|
||
|
|
"grad_norm": 5.15625,
|
||
|
|
"learning_rate": 4.65e-05,
|
||
|
|
"loss": 7.4554,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14150943396226415,
|
||
|
|
"grad_norm": 4.84375,
|
||
|
|
"learning_rate": 4.7249999999999997e-05,
|
||
|
|
"loss": 7.3681,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14375561545372867,
|
||
|
|
"grad_norm": 7.625,
|
||
|
|
"learning_rate": 4.7999999999999994e-05,
|
||
|
|
"loss": 7.2977,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14600179694519316,
|
||
|
|
"grad_norm": 5.25,
|
||
|
|
"learning_rate": 4.875e-05,
|
||
|
|
"loss": 7.2572,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14824797843665768,
|
||
|
|
"grad_norm": 5.125,
|
||
|
|
"learning_rate": 4.95e-05,
|
||
|
|
"loss": 7.322,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15049415992812218,
|
||
|
|
"grad_norm": 4.96875,
|
||
|
|
"learning_rate": 5.025e-05,
|
||
|
|
"loss": 7.2646,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1527403414195867,
|
||
|
|
"grad_norm": 4.96875,
|
||
|
|
"learning_rate": 5.1e-05,
|
||
|
|
"loss": 7.32,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15498652291105122,
|
||
|
|
"grad_norm": 5.3125,
|
||
|
|
"learning_rate": 5.174999999999999e-05,
|
||
|
|
"loss": 7.209,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15723270440251572,
|
||
|
|
"grad_norm": 5.40625,
|
||
|
|
"learning_rate": 5.2499999999999995e-05,
|
||
|
|
"loss": 7.1961,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15947888589398024,
|
||
|
|
"grad_norm": 4.15625,
|
||
|
|
"learning_rate": 5.324999999999999e-05,
|
||
|
|
"loss": 7.2062,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16172506738544473,
|
||
|
|
"grad_norm": 4.65625,
|
||
|
|
"learning_rate": 5.399999999999999e-05,
|
||
|
|
"loss": 7.1401,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16397124887690925,
|
||
|
|
"grad_norm": 5.71875,
|
||
|
|
"learning_rate": 5.4749999999999996e-05,
|
||
|
|
"loss": 7.1402,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16621743036837378,
|
||
|
|
"grad_norm": 5.34375,
|
||
|
|
"learning_rate": 5.5499999999999994e-05,
|
||
|
|
"loss": 7.073,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16846361185983827,
|
||
|
|
"grad_norm": 5.96875,
|
||
|
|
"learning_rate": 5.625e-05,
|
||
|
|
"loss": 7.115,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1707097933513028,
|
||
|
|
"grad_norm": 4.625,
|
||
|
|
"learning_rate": 5.6999999999999996e-05,
|
||
|
|
"loss": 7.1363,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17295597484276728,
|
||
|
|
"grad_norm": 5.34375,
|
||
|
|
"learning_rate": 5.7749999999999994e-05,
|
||
|
|
"loss": 7.1075,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1752021563342318,
|
||
|
|
"grad_norm": 4.46875,
|
||
|
|
"learning_rate": 5.85e-05,
|
||
|
|
"loss": 7.0746,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17744833782569633,
|
||
|
|
"grad_norm": 4.53125,
|
||
|
|
"learning_rate": 5.925e-05,
|
||
|
|
"loss": 7.0877,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17969451931716082,
|
||
|
|
"grad_norm": 4.6875,
|
||
|
|
"learning_rate": 5.9999999999999995e-05,
|
||
|
|
"loss": 7.033,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18194070080862534,
|
||
|
|
"grad_norm": 4.9375,
|
||
|
|
"learning_rate": 6.075e-05,
|
||
|
|
"loss": 7.0603,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18418688230008984,
|
||
|
|
"grad_norm": 4.8125,
|
||
|
|
"learning_rate": 6.149999999999999e-05,
|
||
|
|
"loss": 7.0149,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18643306379155436,
|
||
|
|
"grad_norm": 4.6875,
|
||
|
|
"learning_rate": 6.225e-05,
|
||
|
|
"loss": 6.9823,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18867924528301888,
|
||
|
|
"grad_norm": 5.65625,
|
||
|
|
"learning_rate": 6.299999999999999e-05,
|
||
|
|
"loss": 7.0107,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19092542677448338,
|
||
|
|
"grad_norm": 4.5625,
|
||
|
|
"learning_rate": 6.374999999999999e-05,
|
||
|
|
"loss": 7.0235,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1931716082659479,
|
||
|
|
"grad_norm": 4.71875,
|
||
|
|
"learning_rate": 6.45e-05,
|
||
|
|
"loss": 6.9444,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1954177897574124,
|
||
|
|
"grad_norm": 4.6875,
|
||
|
|
"learning_rate": 6.525e-05,
|
||
|
|
"loss": 6.9067,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1976639712488769,
|
||
|
|
"grad_norm": 4.375,
|
||
|
|
"learning_rate": 6.599999999999999e-05,
|
||
|
|
"loss": 6.9952,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1999101527403414,
|
||
|
|
"grad_norm": 4.46875,
|
||
|
|
"learning_rate": 6.675e-05,
|
||
|
|
"loss": 6.8992,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20215633423180593,
|
||
|
|
"grad_norm": 4.875,
|
||
|
|
"learning_rate": 6.75e-05,
|
||
|
|
"loss": 6.931,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20440251572327045,
|
||
|
|
"grad_norm": 4.6875,
|
||
|
|
"learning_rate": 6.824999999999999e-05,
|
||
|
|
"loss": 6.9036,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20664869721473494,
|
||
|
|
"grad_norm": 4.75,
|
||
|
|
"learning_rate": 6.9e-05,
|
||
|
|
"loss": 6.9332,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20889487870619947,
|
||
|
|
"grad_norm": 4.25,
|
||
|
|
"learning_rate": 6.975e-05,
|
||
|
|
"loss": 7.0612,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21114106019766396,
|
||
|
|
"grad_norm": 4.59375,
|
||
|
|
"learning_rate": 7.049999999999999e-05,
|
||
|
|
"loss": 6.8777,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21338724168912848,
|
||
|
|
"grad_norm": 4.59375,
|
||
|
|
"learning_rate": 7.125e-05,
|
||
|
|
"loss": 6.8593,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.215633423180593,
|
||
|
|
"grad_norm": 5.1875,
|
||
|
|
"learning_rate": 7.199999999999999e-05,
|
||
|
|
"loss": 6.9541,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2178796046720575,
|
||
|
|
"grad_norm": 4.65625,
|
||
|
|
"learning_rate": 7.274999999999999e-05,
|
||
|
|
"loss": 6.878,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22012578616352202,
|
||
|
|
"grad_norm": 5.1875,
|
||
|
|
"learning_rate": 7.35e-05,
|
||
|
|
"loss": 6.8284,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2223719676549865,
|
||
|
|
"grad_norm": 3.9375,
|
||
|
|
"learning_rate": 7.424999999999999e-05,
|
||
|
|
"loss": 6.8567,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22461814914645103,
|
||
|
|
"grad_norm": 5.1875,
|
||
|
|
"learning_rate": 7.5e-05,
|
||
|
|
"loss": 6.8235,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22686433063791556,
|
||
|
|
"grad_norm": 4.65625,
|
||
|
|
"learning_rate": 7.575e-05,
|
||
|
|
"loss": 6.8903,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22911051212938005,
|
||
|
|
"grad_norm": 5.875,
|
||
|
|
"learning_rate": 7.649999999999999e-05,
|
||
|
|
"loss": 6.8404,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23135669362084457,
|
||
|
|
"grad_norm": 5.0625,
|
||
|
|
"learning_rate": 7.725e-05,
|
||
|
|
"loss": 6.8318,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23360287511230907,
|
||
|
|
"grad_norm": 4.5625,
|
||
|
|
"learning_rate": 7.8e-05,
|
||
|
|
"loss": 6.8522,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2358490566037736,
|
||
|
|
"grad_norm": 5.03125,
|
||
|
|
"learning_rate": 7.874999999999999e-05,
|
||
|
|
"loss": 6.859,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23809523809523808,
|
||
|
|
"grad_norm": 4.71875,
|
||
|
|
"learning_rate": 7.95e-05,
|
||
|
|
"loss": 6.8336,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2403414195867026,
|
||
|
|
"grad_norm": 4.875,
|
||
|
|
"learning_rate": 8.025e-05,
|
||
|
|
"loss": 6.7897,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24258760107816713,
|
||
|
|
"grad_norm": 4.375,
|
||
|
|
"learning_rate": 8.1e-05,
|
||
|
|
"loss": 6.7873,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24483378256963162,
|
||
|
|
"grad_norm": 4.34375,
|
||
|
|
"learning_rate": 8.175e-05,
|
||
|
|
"loss": 6.7691,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24707996406109614,
|
||
|
|
"grad_norm": 4.40625,
|
||
|
|
"learning_rate": 8.25e-05,
|
||
|
|
"loss": 6.8252,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24932614555256064,
|
||
|
|
"grad_norm": 4.6875,
|
||
|
|
"learning_rate": 8.325e-05,
|
||
|
|
"loss": 6.8071,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25157232704402516,
|
||
|
|
"grad_norm": 4.65625,
|
||
|
|
"learning_rate": 8.4e-05,
|
||
|
|
"loss": 6.7156,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25381850853548965,
|
||
|
|
"grad_norm": 4.875,
|
||
|
|
"learning_rate": 8.474999999999999e-05,
|
||
|
|
"loss": 6.8189,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2560646900269542,
|
||
|
|
"grad_norm": 4.53125,
|
||
|
|
"learning_rate": 8.549999999999999e-05,
|
||
|
|
"loss": 6.8159,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2583108715184187,
|
||
|
|
"grad_norm": 3.75,
|
||
|
|
"learning_rate": 8.624999999999998e-05,
|
||
|
|
"loss": 6.847,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2605570530098832,
|
||
|
|
"grad_norm": 4.71875,
|
||
|
|
"learning_rate": 8.699999999999999e-05,
|
||
|
|
"loss": 6.7576,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2628032345013477,
|
||
|
|
"grad_norm": 5.375,
|
||
|
|
"learning_rate": 8.774999999999999e-05,
|
||
|
|
"loss": 6.7211,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26504941599281223,
|
||
|
|
"grad_norm": 4.875,
|
||
|
|
"learning_rate": 8.849999999999998e-05,
|
||
|
|
"loss": 6.7255,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2672955974842767,
|
||
|
|
"grad_norm": 4.71875,
|
||
|
|
"learning_rate": 8.924999999999999e-05,
|
||
|
|
"loss": 6.6598,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2695417789757412,
|
||
|
|
"grad_norm": 4.25,
|
||
|
|
"learning_rate": 8.999999999999999e-05,
|
||
|
|
"loss": 6.7735,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27178796046720577,
|
||
|
|
"grad_norm": 4.3125,
|
||
|
|
"learning_rate": 9.074999999999998e-05,
|
||
|
|
"loss": 6.7253,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27403414195867026,
|
||
|
|
"grad_norm": 4.8125,
|
||
|
|
"learning_rate": 9.149999999999999e-05,
|
||
|
|
"loss": 6.6825,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27628032345013476,
|
||
|
|
"grad_norm": 4.40625,
|
||
|
|
"learning_rate": 9.224999999999999e-05,
|
||
|
|
"loss": 6.7523,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2785265049415993,
|
||
|
|
"grad_norm": 4.46875,
|
||
|
|
"learning_rate": 9.3e-05,
|
||
|
|
"loss": 6.7212,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2807726864330638,
|
||
|
|
"grad_norm": 4.875,
|
||
|
|
"learning_rate": 9.374999999999999e-05,
|
||
|
|
"loss": 6.7052,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2830188679245283,
|
||
|
|
"grad_norm": 4.6875,
|
||
|
|
"learning_rate": 9.449999999999999e-05,
|
||
|
|
"loss": 6.7031,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2852650494159928,
|
||
|
|
"grad_norm": 4.1875,
|
||
|
|
"learning_rate": 9.525e-05,
|
||
|
|
"loss": 6.7163,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28751123090745734,
|
||
|
|
"grad_norm": 4.15625,
|
||
|
|
"learning_rate": 9.599999999999999e-05,
|
||
|
|
"loss": 6.7148,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28975741239892183,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 9.675e-05,
|
||
|
|
"loss": 6.7027,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2920035938903863,
|
||
|
|
"grad_norm": 4.375,
|
||
|
|
"learning_rate": 9.75e-05,
|
||
|
|
"loss": 6.6511,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2942497753818509,
|
||
|
|
"grad_norm": 4.0625,
|
||
|
|
"learning_rate": 9.824999999999999e-05,
|
||
|
|
"loss": 6.704,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29649595687331537,
|
||
|
|
"grad_norm": 4.09375,
|
||
|
|
"learning_rate": 9.9e-05,
|
||
|
|
"loss": 6.689,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29874213836477986,
|
||
|
|
"grad_norm": 3.875,
|
||
|
|
"learning_rate": 9.975e-05,
|
||
|
|
"loss": 6.6784,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30098831985624436,
|
||
|
|
"grad_norm": 4.5,
|
||
|
|
"learning_rate": 0.0001005,
|
||
|
|
"loss": 6.597,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3032345013477089,
|
||
|
|
"grad_norm": 4.3125,
|
||
|
|
"learning_rate": 0.00010125,
|
||
|
|
"loss": 6.6198,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3054806828391734,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.000102,
|
||
|
|
"loss": 6.6226,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3077268643306379,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.00010275,
|
||
|
|
"loss": 6.627,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30997304582210244,
|
||
|
|
"grad_norm": 5.1875,
|
||
|
|
"learning_rate": 0.00010349999999999998,
|
||
|
|
"loss": 6.6003,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31221922731356694,
|
||
|
|
"grad_norm": 3.640625,
|
||
|
|
"learning_rate": 0.00010424999999999999,
|
||
|
|
"loss": 6.5845,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31446540880503143,
|
||
|
|
"grad_norm": 4.4375,
|
||
|
|
"learning_rate": 0.00010499999999999999,
|
||
|
|
"loss": 6.6143,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.316711590296496,
|
||
|
|
"grad_norm": 4.90625,
|
||
|
|
"learning_rate": 0.00010574999999999998,
|
||
|
|
"loss": 6.6305,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3189577717879605,
|
||
|
|
"grad_norm": 4.3125,
|
||
|
|
"learning_rate": 0.00010649999999999999,
|
||
|
|
"loss": 6.5312,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32120395327942497,
|
||
|
|
"grad_norm": 4.15625,
|
||
|
|
"learning_rate": 0.00010724999999999999,
|
||
|
|
"loss": 6.63,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32345013477088946,
|
||
|
|
"grad_norm": 4.53125,
|
||
|
|
"learning_rate": 0.00010799999999999998,
|
||
|
|
"loss": 6.564,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.325696316262354,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.00010874999999999999,
|
||
|
|
"loss": 6.6572,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3279424977538185,
|
||
|
|
"grad_norm": 4.40625,
|
||
|
|
"learning_rate": 0.00010949999999999999,
|
||
|
|
"loss": 6.5728,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.330188679245283,
|
||
|
|
"grad_norm": 4.34375,
|
||
|
|
"learning_rate": 0.00011024999999999998,
|
||
|
|
"loss": 6.5245,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33243486073674755,
|
||
|
|
"grad_norm": 5.5,
|
||
|
|
"learning_rate": 0.00011099999999999999,
|
||
|
|
"loss": 6.5883,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33468104222821204,
|
||
|
|
"grad_norm": 5.53125,
|
||
|
|
"learning_rate": 0.00011174999999999999,
|
||
|
|
"loss": 6.5549,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33692722371967654,
|
||
|
|
"grad_norm": 4.40625,
|
||
|
|
"learning_rate": 0.0001125,
|
||
|
|
"loss": 6.5269,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33917340521114103,
|
||
|
|
"grad_norm": 4.65625,
|
||
|
|
"learning_rate": 0.00011324999999999999,
|
||
|
|
"loss": 6.5262,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3414195867026056,
|
||
|
|
"grad_norm": 4.25,
|
||
|
|
"learning_rate": 0.00011399999999999999,
|
||
|
|
"loss": 6.4958,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3436657681940701,
|
||
|
|
"grad_norm": 4.34375,
|
||
|
|
"learning_rate": 0.00011475,
|
||
|
|
"loss": 6.4719,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34591194968553457,
|
||
|
|
"grad_norm": 3.828125,
|
||
|
|
"learning_rate": 0.00011549999999999999,
|
||
|
|
"loss": 6.4948,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3481581311769991,
|
||
|
|
"grad_norm": 3.890625,
|
||
|
|
"learning_rate": 0.00011624999999999999,
|
||
|
|
"loss": 6.5652,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3504043126684636,
|
||
|
|
"grad_norm": 3.828125,
|
||
|
|
"learning_rate": 0.000117,
|
||
|
|
"loss": 6.633,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3526504941599281,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.00011774999999999999,
|
||
|
|
"loss": 6.4617,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35489667565139266,
|
||
|
|
"grad_norm": 3.9375,
|
||
|
|
"learning_rate": 0.0001185,
|
||
|
|
"loss": 6.524,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35714285714285715,
|
||
|
|
"grad_norm": 7.25,
|
||
|
|
"learning_rate": 0.00011925,
|
||
|
|
"loss": 6.4985,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35938903863432164,
|
||
|
|
"grad_norm": 3.828125,
|
||
|
|
"learning_rate": 0.00011999999999999999,
|
||
|
|
"loss": 6.4988,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36163522012578614,
|
||
|
|
"grad_norm": 5.125,
|
||
|
|
"learning_rate": 0.00012075,
|
||
|
|
"loss": 6.5393,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3638814016172507,
|
||
|
|
"grad_norm": 4.90625,
|
||
|
|
"learning_rate": 0.0001215,
|
||
|
|
"loss": 6.4869,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3661275831087152,
|
||
|
|
"grad_norm": 4.1875,
|
||
|
|
"learning_rate": 0.00012225,
|
||
|
|
"loss": 6.4419,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3683737646001797,
|
||
|
|
"grad_norm": 3.765625,
|
||
|
|
"learning_rate": 0.00012299999999999998,
|
||
|
|
"loss": 6.574,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3706199460916442,
|
||
|
|
"grad_norm": 3.796875,
|
||
|
|
"learning_rate": 0.00012374999999999997,
|
||
|
|
"loss": 6.5063,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3728661275831087,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.0001245,
|
||
|
|
"loss": 6.5404,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3751123090745732,
|
||
|
|
"grad_norm": 3.65625,
|
||
|
|
"learning_rate": 0.00012524999999999998,
|
||
|
|
"loss": 6.4726,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37735849056603776,
|
||
|
|
"grad_norm": 4.0,
|
||
|
|
"learning_rate": 0.00012599999999999997,
|
||
|
|
"loss": 6.4099,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37960467205750226,
|
||
|
|
"grad_norm": 4.25,
|
||
|
|
"learning_rate": 0.00012675,
|
||
|
|
"loss": 6.3966,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38185085354896675,
|
||
|
|
"grad_norm": 3.828125,
|
||
|
|
"learning_rate": 0.00012749999999999998,
|
||
|
|
"loss": 6.4607,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38409703504043125,
|
||
|
|
"grad_norm": 4.28125,
|
||
|
|
"learning_rate": 0.00012824999999999997,
|
||
|
|
"loss": 6.4718,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3863432165318958,
|
||
|
|
"grad_norm": 4.71875,
|
||
|
|
"learning_rate": 0.000129,
|
||
|
|
"loss": 6.4569,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3885893980233603,
|
||
|
|
"grad_norm": 4.375,
|
||
|
|
"learning_rate": 0.00012974999999999998,
|
||
|
|
"loss": 6.3576,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3908355795148248,
|
||
|
|
"grad_norm": 4.65625,
|
||
|
|
"learning_rate": 0.0001305,
|
||
|
|
"loss": 6.4259,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39308176100628933,
|
||
|
|
"grad_norm": 4.96875,
|
||
|
|
"learning_rate": 0.00013125,
|
||
|
|
"loss": 6.3831,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3953279424977538,
|
||
|
|
"grad_norm": 3.90625,
|
||
|
|
"learning_rate": 0.00013199999999999998,
|
||
|
|
"loss": 6.4086,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3975741239892183,
|
||
|
|
"grad_norm": 3.75,
|
||
|
|
"learning_rate": 0.00013275,
|
||
|
|
"loss": 6.3207,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3998203054806828,
|
||
|
|
"grad_norm": 4.28125,
|
||
|
|
"learning_rate": 0.0001335,
|
||
|
|
"loss": 6.4129,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40206648697214736,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 0.00013424999999999998,
|
||
|
|
"loss": 6.4397,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40431266846361186,
|
||
|
|
"grad_norm": 3.921875,
|
||
|
|
"learning_rate": 0.000135,
|
||
|
|
"loss": 6.4104,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40655884995507635,
|
||
|
|
"grad_norm": 3.984375,
|
||
|
|
"learning_rate": 0.00013575,
|
||
|
|
"loss": 6.3327,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4088050314465409,
|
||
|
|
"grad_norm": 3.859375,
|
||
|
|
"learning_rate": 0.00013649999999999998,
|
||
|
|
"loss": 6.3965,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4110512129380054,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.00013725,
|
||
|
|
"loss": 6.3614,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4132973944294699,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.000138,
|
||
|
|
"loss": 6.3743,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41554357592093444,
|
||
|
|
"grad_norm": 3.984375,
|
||
|
|
"learning_rate": 0.00013874999999999998,
|
||
|
|
"loss": 6.4228,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41778975741239893,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.0001395,
|
||
|
|
"loss": 6.4047,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4200359389038634,
|
||
|
|
"grad_norm": 3.984375,
|
||
|
|
"learning_rate": 0.00014025,
|
||
|
|
"loss": 6.3634,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4222821203953279,
|
||
|
|
"grad_norm": 4.0,
|
||
|
|
"learning_rate": 0.00014099999999999998,
|
||
|
|
"loss": 6.3866,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42452830188679247,
|
||
|
|
"grad_norm": 3.796875,
|
||
|
|
"learning_rate": 0.00014174999999999998,
|
||
|
|
"loss": 6.3599,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42677448337825696,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.0001425,
|
||
|
|
"loss": 6.3422,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42902066486972146,
|
||
|
|
"grad_norm": 4.15625,
|
||
|
|
"learning_rate": 0.00014324999999999999,
|
||
|
|
"loss": 6.2791,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.431266846361186,
|
||
|
|
"grad_norm": 3.96875,
|
||
|
|
"learning_rate": 0.00014399999999999998,
|
||
|
|
"loss": 6.3505,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4335130278526505,
|
||
|
|
"grad_norm": 4.5,
|
||
|
|
"learning_rate": 0.00014475,
|
||
|
|
"loss": 6.3671,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.435759209344115,
|
||
|
|
"grad_norm": 3.65625,
|
||
|
|
"learning_rate": 0.00014549999999999999,
|
||
|
|
"loss": 6.318,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4380053908355795,
|
||
|
|
"grad_norm": 4.28125,
|
||
|
|
"learning_rate": 0.00014624999999999998,
|
||
|
|
"loss": 6.3299,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44025157232704404,
|
||
|
|
"grad_norm": 3.578125,
|
||
|
|
"learning_rate": 0.000147,
|
||
|
|
"loss": 6.4073,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44249775381850853,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.00014774999999999999,
|
||
|
|
"loss": 6.4377,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.444743935309973,
|
||
|
|
"grad_norm": 3.765625,
|
||
|
|
"learning_rate": 0.00014849999999999998,
|
||
|
|
"loss": 6.2784,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4469901168014376,
|
||
|
|
"grad_norm": 3.953125,
|
||
|
|
"learning_rate": 0.00014925,
|
||
|
|
"loss": 6.2901,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44923629829290207,
|
||
|
|
"grad_norm": 4.375,
|
||
|
|
"learning_rate": 0.00015,
|
||
|
|
"loss": 6.2973,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44923629829290207,
|
||
|
|
"eval_loss": 6.229096412658691,
|
||
|
|
"eval_runtime": 16.2469,
|
||
|
|
"eval_samples_per_second": 1908.854,
|
||
|
|
"eval_steps_per_second": 238.63,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45148247978436656,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.00015074999999999998,
|
||
|
|
"loss": 6.3253,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4537286612758311,
|
||
|
|
"grad_norm": 3.953125,
|
||
|
|
"learning_rate": 0.0001515,
|
||
|
|
"loss": 6.2906,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4559748427672956,
|
||
|
|
"grad_norm": 3.90625,
|
||
|
|
"learning_rate": 0.00015224999999999996,
|
||
|
|
"loss": 6.3351,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4582210242587601,
|
||
|
|
"grad_norm": 3.6875,
|
||
|
|
"learning_rate": 0.00015299999999999998,
|
||
|
|
"loss": 6.368,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4604672057502246,
|
||
|
|
"grad_norm": 3.796875,
|
||
|
|
"learning_rate": 0.00015374999999999997,
|
||
|
|
"loss": 6.3008,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46271338724168914,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.0001545,
|
||
|
|
"loss": 6.283,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46495956873315364,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.00015524999999999998,
|
||
|
|
"loss": 6.3212,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46720575022461813,
|
||
|
|
"grad_norm": 4.15625,
|
||
|
|
"learning_rate": 0.000156,
|
||
|
|
"loss": 6.2874,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4694519317160827,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00015675,
|
||
|
|
"loss": 6.2944,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4716981132075472,
|
||
|
|
"grad_norm": 4.3125,
|
||
|
|
"learning_rate": 0.00015749999999999998,
|
||
|
|
"loss": 6.3099,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47394429469901167,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.00015824999999999997,
|
||
|
|
"loss": 6.2531,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47619047619047616,
|
||
|
|
"grad_norm": 3.609375,
|
||
|
|
"learning_rate": 0.000159,
|
||
|
|
"loss": 6.2326,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4784366576819407,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 0.00015974999999999998,
|
||
|
|
"loss": 6.2059,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4806828391734052,
|
||
|
|
"grad_norm": 3.625,
|
||
|
|
"learning_rate": 0.0001605,
|
||
|
|
"loss": 6.2798,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4829290206648697,
|
||
|
|
"grad_norm": 3.890625,
|
||
|
|
"learning_rate": 0.00016125,
|
||
|
|
"loss": 6.2814,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48517520215633425,
|
||
|
|
"grad_norm": 3.84375,
|
||
|
|
"learning_rate": 0.000162,
|
||
|
|
"loss": 6.1955,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48742138364779874,
|
||
|
|
"grad_norm": 4.0,
|
||
|
|
"learning_rate": 0.00016274999999999997,
|
||
|
|
"loss": 6.3142,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48966756513926324,
|
||
|
|
"grad_norm": 3.71875,
|
||
|
|
"learning_rate": 0.0001635,
|
||
|
|
"loss": 6.193,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4919137466307278,
|
||
|
|
"grad_norm": 4.0,
|
||
|
|
"learning_rate": 0.00016424999999999998,
|
||
|
|
"loss": 6.26,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4941599281221923,
|
||
|
|
"grad_norm": 4.0625,
|
||
|
|
"learning_rate": 0.000165,
|
||
|
|
"loss": 6.2443,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4964061096136568,
|
||
|
|
"grad_norm": 3.671875,
|
||
|
|
"learning_rate": 0.00016575,
|
||
|
|
"loss": 6.2278,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49865229110512127,
|
||
|
|
"grad_norm": 3.6875,
|
||
|
|
"learning_rate": 0.0001665,
|
||
|
|
"loss": 6.2254,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5008984725965858,
|
||
|
|
"grad_norm": 3.921875,
|
||
|
|
"learning_rate": 0.00016724999999999997,
|
||
|
|
"loss": 6.3325,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5031446540880503,
|
||
|
|
"grad_norm": 3.921875,
|
||
|
|
"learning_rate": 0.000168,
|
||
|
|
"loss": 6.186,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5053908355795148,
|
||
|
|
"grad_norm": 3.859375,
|
||
|
|
"learning_rate": 0.00016874999999999998,
|
||
|
|
"loss": 6.2389,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5076370170709793,
|
||
|
|
"grad_norm": 4.71875,
|
||
|
|
"learning_rate": 0.00016949999999999997,
|
||
|
|
"loss": 6.1268,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5098831985624438,
|
||
|
|
"grad_norm": 3.90625,
|
||
|
|
"learning_rate": 0.00017025,
|
||
|
|
"loss": 6.1445,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5121293800539084,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00017099999999999998,
|
||
|
|
"loss": 6.1658,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5143755615453729,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.00017175,
|
||
|
|
"loss": 6.1832,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5166217430368374,
|
||
|
|
"grad_norm": 3.96875,
|
||
|
|
"learning_rate": 0.00017249999999999996,
|
||
|
|
"loss": 6.1621,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5188679245283019,
|
||
|
|
"grad_norm": 3.765625,
|
||
|
|
"learning_rate": 0.00017324999999999998,
|
||
|
|
"loss": 6.22,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5211141060197664,
|
||
|
|
"grad_norm": 3.890625,
|
||
|
|
"learning_rate": 0.00017399999999999997,
|
||
|
|
"loss": 6.1432,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5233602875112309,
|
||
|
|
"grad_norm": 3.59375,
|
||
|
|
"learning_rate": 0.00017475,
|
||
|
|
"loss": 6.1223,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5256064690026954,
|
||
|
|
"grad_norm": 3.28125,
|
||
|
|
"learning_rate": 0.00017549999999999998,
|
||
|
|
"loss": 6.1839,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.52785265049416,
|
||
|
|
"grad_norm": 3.9375,
|
||
|
|
"learning_rate": 0.00017625,
|
||
|
|
"loss": 6.2021,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5300988319856245,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.00017699999999999997,
|
||
|
|
"loss": 6.1947,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.532345013477089,
|
||
|
|
"grad_norm": 4.5,
|
||
|
|
"learning_rate": 0.00017774999999999998,
|
||
|
|
"loss": 6.1474,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5345911949685535,
|
||
|
|
"grad_norm": 3.671875,
|
||
|
|
"learning_rate": 0.00017849999999999997,
|
||
|
|
"loss": 6.1488,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.536837376460018,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.00017925,
|
||
|
|
"loss": 6.1943,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5390835579514824,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 0.00017999999999999998,
|
||
|
|
"loss": 6.13,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.541329739442947,
|
||
|
|
"grad_norm": 3.828125,
|
||
|
|
"learning_rate": 0.00018075,
|
||
|
|
"loss": 6.0818,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5435759209344115,
|
||
|
|
"grad_norm": 3.546875,
|
||
|
|
"learning_rate": 0.00018149999999999997,
|
||
|
|
"loss": 6.1505,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.545822102425876,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.00018224999999999998,
|
||
|
|
"loss": 6.1578,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5480682839173405,
|
||
|
|
"grad_norm": 3.921875,
|
||
|
|
"learning_rate": 0.00018299999999999998,
|
||
|
|
"loss": 6.0904,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.550314465408805,
|
||
|
|
"grad_norm": 4.1875,
|
||
|
|
"learning_rate": 0.00018375,
|
||
|
|
"loss": 6.0851,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5525606469002695,
|
||
|
|
"grad_norm": 4.21875,
|
||
|
|
"learning_rate": 0.00018449999999999999,
|
||
|
|
"loss": 6.1133,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.554806828391734,
|
||
|
|
"grad_norm": 3.765625,
|
||
|
|
"learning_rate": 0.00018525,
|
||
|
|
"loss": 6.1453,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5570530098831986,
|
||
|
|
"grad_norm": 3.671875,
|
||
|
|
"learning_rate": 0.000186,
|
||
|
|
"loss": 6.1572,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5592991913746631,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 0.00018675,
|
||
|
|
"loss": 6.2205,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5615453728661276,
|
||
|
|
"grad_norm": 4.4375,
|
||
|
|
"learning_rate": 0.00018749999999999998,
|
||
|
|
"loss": 6.1114,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5637915543575921,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.00018824999999999997,
|
||
|
|
"loss": 6.1407,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5660377358490566,
|
||
|
|
"grad_norm": 4.1875,
|
||
|
|
"learning_rate": 0.00018899999999999999,
|
||
|
|
"loss": 6.1272,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5682839173405211,
|
||
|
|
"grad_norm": 4.03125,
|
||
|
|
"learning_rate": 0.00018974999999999998,
|
||
|
|
"loss": 6.1264,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5705300988319856,
|
||
|
|
"grad_norm": 4.09375,
|
||
|
|
"learning_rate": 0.0001905,
|
||
|
|
"loss": 6.0308,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5727762803234502,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00019124999999999996,
|
||
|
|
"loss": 6.1028,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5750224618149147,
|
||
|
|
"grad_norm": 3.953125,
|
||
|
|
"learning_rate": 0.00019199999999999998,
|
||
|
|
"loss": 6.1002,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5772686433063792,
|
||
|
|
"grad_norm": 4.1875,
|
||
|
|
"learning_rate": 0.00019274999999999997,
|
||
|
|
"loss": 6.1451,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5795148247978437,
|
||
|
|
"grad_norm": 4.0625,
|
||
|
|
"learning_rate": 0.0001935,
|
||
|
|
"loss": 6.0798,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5817610062893082,
|
||
|
|
"grad_norm": 3.609375,
|
||
|
|
"learning_rate": 0.00019424999999999998,
|
||
|
|
"loss": 6.0831,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5840071877807727,
|
||
|
|
"grad_norm": 3.671875,
|
||
|
|
"learning_rate": 0.000195,
|
||
|
|
"loss": 6.1054,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5862533692722371,
|
||
|
|
"grad_norm": 3.625,
|
||
|
|
"learning_rate": 0.00019574999999999996,
|
||
|
|
"loss": 6.0122,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5884995507637018,
|
||
|
|
"grad_norm": 4.0625,
|
||
|
|
"learning_rate": 0.00019649999999999998,
|
||
|
|
"loss": 6.0397,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5907457322551662,
|
||
|
|
"grad_norm": 3.59375,
|
||
|
|
"learning_rate": 0.00019724999999999997,
|
||
|
|
"loss": 5.9765,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5929919137466307,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.000198,
|
||
|
|
"loss": 6.0359,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5952380952380952,
|
||
|
|
"grad_norm": 3.828125,
|
||
|
|
"learning_rate": 0.00019874999999999998,
|
||
|
|
"loss": 6.0552,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5974842767295597,
|
||
|
|
"grad_norm": 3.5625,
|
||
|
|
"learning_rate": 0.0001995,
|
||
|
|
"loss": 6.0254,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5997304582210242,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.00020025,
|
||
|
|
"loss": 6.0575,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6019766397124887,
|
||
|
|
"grad_norm": 3.59375,
|
||
|
|
"learning_rate": 0.000201,
|
||
|
|
"loss": 6.004,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6042228212039533,
|
||
|
|
"grad_norm": 3.65625,
|
||
|
|
"learning_rate": 0.00020174999999999997,
|
||
|
|
"loss": 6.0784,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6064690026954178,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.0002025,
|
||
|
|
"loss": 6.1157,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6087151841868823,
|
||
|
|
"grad_norm": 3.65625,
|
||
|
|
"learning_rate": 0.00020324999999999998,
|
||
|
|
"loss": 6.0583,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6109613656783468,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.000204,
|
||
|
|
"loss": 6.0366,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6132075471698113,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00020475,
|
||
|
|
"loss": 6.1213,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6154537286612758,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 0.0002055,
|
||
|
|
"loss": 6.1744,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6176999101527404,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 0.00020624999999999997,
|
||
|
|
"loss": 6.0912,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6199460916442049,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00020699999999999996,
|
||
|
|
"loss": 5.9619,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6221922731356694,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.00020774999999999998,
|
||
|
|
"loss": 5.9658,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6244384546271339,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00020849999999999997,
|
||
|
|
"loss": 6.0913,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6266846361185984,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00020925,
|
||
|
|
"loss": 6.0363,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6289308176100629,
|
||
|
|
"grad_norm": 3.890625,
|
||
|
|
"learning_rate": 0.00020999999999999998,
|
||
|
|
"loss": 5.9513,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6311769991015274,
|
||
|
|
"grad_norm": 4.0625,
|
||
|
|
"learning_rate": 0.00021074999999999997,
|
||
|
|
"loss": 5.9931,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.633423180592992,
|
||
|
|
"grad_norm": 4.0,
|
||
|
|
"learning_rate": 0.00021149999999999996,
|
||
|
|
"loss": 5.9732,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6356693620844565,
|
||
|
|
"grad_norm": 3.671875,
|
||
|
|
"learning_rate": 0.00021224999999999998,
|
||
|
|
"loss": 6.0028,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.637915543575921,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.00021299999999999997,
|
||
|
|
"loss": 6.0171,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6401617250673854,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00021375,
|
||
|
|
"loss": 5.9886,
|
||
|
|
"step": 1425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6424079065588499,
|
||
|
|
"grad_norm": 3.875,
|
||
|
|
"learning_rate": 0.00021449999999999998,
|
||
|
|
"loss": 5.9436,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6446540880503144,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.00021525,
|
||
|
|
"loss": 6.0565,
|
||
|
|
"step": 1435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6469002695417789,
|
||
|
|
"grad_norm": 3.640625,
|
||
|
|
"learning_rate": 0.00021599999999999996,
|
||
|
|
"loss": 6.1117,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6491464510332435,
|
||
|
|
"grad_norm": 3.625,
|
||
|
|
"learning_rate": 0.00021674999999999998,
|
||
|
|
"loss": 5.9778,
|
||
|
|
"step": 1445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.651392632524708,
|
||
|
|
"grad_norm": 4.0625,
|
||
|
|
"learning_rate": 0.00021749999999999997,
|
||
|
|
"loss": 5.9706,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6536388140161725,
|
||
|
|
"grad_norm": 4.15625,
|
||
|
|
"learning_rate": 0.00021825,
|
||
|
|
"loss": 5.9358,
|
||
|
|
"step": 1455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.655884995507637,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.00021899999999999998,
|
||
|
|
"loss": 6.0584,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6581311769991015,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.00021975,
|
||
|
|
"loss": 6.0055,
|
||
|
|
"step": 1465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.660377358490566,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.00022049999999999997,
|
||
|
|
"loss": 5.9678,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6626235399820305,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.00022124999999999998,
|
||
|
|
"loss": 5.9747,
|
||
|
|
"step": 1475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6648697214734951,
|
||
|
|
"grad_norm": 3.46875,
|
||
|
|
"learning_rate": 0.00022199999999999998,
|
||
|
|
"loss": 5.9542,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6671159029649596,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.00022275,
|
||
|
|
"loss": 5.9001,
|
||
|
|
"step": 1485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6693620844564241,
|
||
|
|
"grad_norm": 3.65625,
|
||
|
|
"learning_rate": 0.00022349999999999998,
|
||
|
|
"loss": 5.9689,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6716082659478886,
|
||
|
|
"grad_norm": 3.953125,
|
||
|
|
"learning_rate": 0.00022425,
|
||
|
|
"loss": 5.9823,
|
||
|
|
"step": 1495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6738544474393531,
|
||
|
|
"grad_norm": 3.53125,
|
||
|
|
"learning_rate": 0.000225,
|
||
|
|
"loss": 5.9758,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6761006289308176,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00022574999999999996,
|
||
|
|
"loss": 5.9994,
|
||
|
|
"step": 1505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6783468104222821,
|
||
|
|
"grad_norm": 3.6875,
|
||
|
|
"learning_rate": 0.00022649999999999998,
|
||
|
|
"loss": 5.8979,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6805929919137467,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.00022724999999999997,
|
||
|
|
"loss": 6.0046,
|
||
|
|
"step": 1515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6828391734052112,
|
||
|
|
"grad_norm": 3.75,
|
||
|
|
"learning_rate": 0.00022799999999999999,
|
||
|
|
"loss": 5.9637,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6850853548966757,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.00022874999999999998,
|
||
|
|
"loss": 5.939,
|
||
|
|
"step": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6873315363881402,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.0002295,
|
||
|
|
"loss": 6.0089,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6895777178796046,
|
||
|
|
"grad_norm": 3.46875,
|
||
|
|
"learning_rate": 0.00023024999999999996,
|
||
|
|
"loss": 5.9247,
|
||
|
|
"step": 1535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6918238993710691,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.00023099999999999998,
|
||
|
|
"loss": 5.8969,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6940700808625337,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.00023174999999999997,
|
||
|
|
"loss": 5.8485,
|
||
|
|
"step": 1545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6963162623539982,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.00023249999999999999,
|
||
|
|
"loss": 5.9481,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6985624438454627,
|
||
|
|
"grad_norm": 3.5625,
|
||
|
|
"learning_rate": 0.00023324999999999998,
|
||
|
|
"loss": 5.9145,
|
||
|
|
"step": 1555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7008086253369272,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.000234,
|
||
|
|
"loss": 5.8711,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7030548068283917,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.00023474999999999996,
|
||
|
|
"loss": 5.9697,
|
||
|
|
"step": 1565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7053009883198562,
|
||
|
|
"grad_norm": 3.75,
|
||
|
|
"learning_rate": 0.00023549999999999998,
|
||
|
|
"loss": 5.8905,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7075471698113207,
|
||
|
|
"grad_norm": 3.59375,
|
||
|
|
"learning_rate": 0.00023624999999999997,
|
||
|
|
"loss": 5.9357,
|
||
|
|
"step": 1575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7097933513027853,
|
||
|
|
"grad_norm": 3.453125,
|
||
|
|
"learning_rate": 0.000237,
|
||
|
|
"loss": 5.8548,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7120395327942498,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00023774999999999998,
|
||
|
|
"loss": 5.9498,
|
||
|
|
"step": 1585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7142857142857143,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.0002385,
|
||
|
|
"loss": 5.8457,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7165318957771788,
|
||
|
|
"grad_norm": 3.5625,
|
||
|
|
"learning_rate": 0.00023925,
|
||
|
|
"loss": 5.8717,
|
||
|
|
"step": 1595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7187780772686433,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.00023999999999999998,
|
||
|
|
"loss": 5.8193,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7210242587601078,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.00024074999999999997,
|
||
|
|
"loss": 5.8618,
|
||
|
|
"step": 1605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7232704402515723,
|
||
|
|
"grad_norm": 3.625,
|
||
|
|
"learning_rate": 0.0002415,
|
||
|
|
"loss": 5.8882,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7255166217430369,
|
||
|
|
"grad_norm": 3.28125,
|
||
|
|
"learning_rate": 0.00024224999999999998,
|
||
|
|
"loss": 5.9087,
|
||
|
|
"step": 1615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7277628032345014,
|
||
|
|
"grad_norm": 3.53125,
|
||
|
|
"learning_rate": 0.000243,
|
||
|
|
"loss": 5.8994,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7300089847259659,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.00024375,
|
||
|
|
"loss": 5.9156,
|
||
|
|
"step": 1625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7322551662174304,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.0002445,
|
||
|
|
"loss": 5.889,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7345013477088949,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.00024524999999999997,
|
||
|
|
"loss": 5.8538,
|
||
|
|
"step": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7367475292003594,
|
||
|
|
"grad_norm": 3.53125,
|
||
|
|
"learning_rate": 0.00024599999999999996,
|
||
|
|
"loss": 5.914,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7389937106918238,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00024675,
|
||
|
|
"loss": 5.8628,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7412398921832885,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.00024749999999999994,
|
||
|
|
"loss": 5.8555,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.743486073674753,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00024825,
|
||
|
|
"loss": 5.8846,
|
||
|
|
"step": 1655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7457322551662174,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.000249,
|
||
|
|
"loss": 5.8957,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7479784366576819,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00024974999999999997,
|
||
|
|
"loss": 5.8036,
|
||
|
|
"step": 1665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7502246181491464,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.00025049999999999996,
|
||
|
|
"loss": 5.845,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7524707996406109,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00025125,
|
||
|
|
"loss": 5.8801,
|
||
|
|
"step": 1675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7547169811320755,
|
||
|
|
"grad_norm": 3.53125,
|
||
|
|
"learning_rate": 0.00025199999999999995,
|
||
|
|
"loss": 5.8356,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.75696316262354,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.00025275,
|
||
|
|
"loss": 5.851,
|
||
|
|
"step": 1685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7592093441150045,
|
||
|
|
"grad_norm": 3.546875,
|
||
|
|
"learning_rate": 0.0002535,
|
||
|
|
"loss": 5.8647,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.761455525606469,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00025425,
|
||
|
|
"loss": 5.8168,
|
||
|
|
"step": 1695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7637017070979335,
|
||
|
|
"grad_norm": 3.609375,
|
||
|
|
"learning_rate": 0.00025499999999999996,
|
||
|
|
"loss": 5.8514,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.765947888589398,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.00025575,
|
||
|
|
"loss": 5.7495,
|
||
|
|
"step": 1705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7681940700808625,
|
||
|
|
"grad_norm": 3.515625,
|
||
|
|
"learning_rate": 0.00025649999999999995,
|
||
|
|
"loss": 5.8702,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7704402515723271,
|
||
|
|
"grad_norm": 3.640625,
|
||
|
|
"learning_rate": 0.00025725,
|
||
|
|
"loss": 5.9178,
|
||
|
|
"step": 1715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7726864330637916,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.000258,
|
||
|
|
"loss": 5.82,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7749326145552561,
|
||
|
|
"grad_norm": 3.765625,
|
||
|
|
"learning_rate": 0.00025875,
|
||
|
|
"loss": 5.823,
|
||
|
|
"step": 1725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7771787960467206,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00025949999999999997,
|
||
|
|
"loss": 5.8712,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7794249775381851,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00026025,
|
||
|
|
"loss": 5.8173,
|
||
|
|
"step": 1735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7816711590296496,
|
||
|
|
"grad_norm": 3.28125,
|
||
|
|
"learning_rate": 0.000261,
|
||
|
|
"loss": 5.8169,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7839173405211141,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00026175,
|
||
|
|
"loss": 5.8047,
|
||
|
|
"step": 1745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7861635220125787,
|
||
|
|
"grad_norm": 3.21875,
|
||
|
|
"learning_rate": 0.0002625,
|
||
|
|
"loss": 5.8384,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7884097035040432,
|
||
|
|
"grad_norm": 3.40625,
|
||
|
|
"learning_rate": 0.00026325,
|
||
|
|
"loss": 5.7996,
|
||
|
|
"step": 1755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7906558849955077,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00026399999999999997,
|
||
|
|
"loss": 5.7611,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7929020664869721,
|
||
|
|
"grad_norm": 3.390625,
|
||
|
|
"learning_rate": 0.00026474999999999996,
|
||
|
|
"loss": 5.7925,
|
||
|
|
"step": 1765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7951482479784366,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.0002655,
|
||
|
|
"loss": 5.8187,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7973944294699011,
|
||
|
|
"grad_norm": 3.53125,
|
||
|
|
"learning_rate": 0.00026624999999999994,
|
||
|
|
"loss": 5.7791,
|
||
|
|
"step": 1775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7996406109613656,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 0.000267,
|
||
|
|
"loss": 5.8063,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8018867924528302,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00026775,
|
||
|
|
"loss": 5.8167,
|
||
|
|
"step": 1785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8041329739442947,
|
||
|
|
"grad_norm": 3.46875,
|
||
|
|
"learning_rate": 0.00026849999999999997,
|
||
|
|
"loss": 5.7916,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8063791554357592,
|
||
|
|
"grad_norm": 3.28125,
|
||
|
|
"learning_rate": 0.00026924999999999996,
|
||
|
|
"loss": 5.8446,
|
||
|
|
"step": 1795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8086253369272237,
|
||
|
|
"grad_norm": 3.65625,
|
||
|
|
"learning_rate": 0.00027,
|
||
|
|
"loss": 5.8757,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8108715184186882,
|
||
|
|
"grad_norm": 3.734375,
|
||
|
|
"learning_rate": 0.00027074999999999994,
|
||
|
|
"loss": 5.7271,
|
||
|
|
"step": 1805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8131176999101527,
|
||
|
|
"grad_norm": 3.765625,
|
||
|
|
"learning_rate": 0.0002715,
|
||
|
|
"loss": 5.8397,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8153638814016172,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.00027225,
|
||
|
|
"loss": 5.7838,
|
||
|
|
"step": 1815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8176100628930818,
|
||
|
|
"grad_norm": 3.59375,
|
||
|
|
"learning_rate": 0.00027299999999999997,
|
||
|
|
"loss": 5.7907,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8198562443845463,
|
||
|
|
"grad_norm": 3.921875,
|
||
|
|
"learning_rate": 0.00027374999999999996,
|
||
|
|
"loss": 5.8579,
|
||
|
|
"step": 1825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8221024258760108,
|
||
|
|
"grad_norm": 3.46875,
|
||
|
|
"learning_rate": 0.0002745,
|
||
|
|
"loss": 5.8342,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8243486073674753,
|
||
|
|
"grad_norm": 3.75,
|
||
|
|
"learning_rate": 0.00027525,
|
||
|
|
"loss": 5.7949,
|
||
|
|
"step": 1835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8265947888589398,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.000276,
|
||
|
|
"loss": 5.7715,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8288409703504043,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.00027675,
|
||
|
|
"loss": 5.7804,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8310871518418689,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00027749999999999997,
|
||
|
|
"loss": 5.7288,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8333333333333334,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.00027824999999999996,
|
||
|
|
"loss": 5.7319,
|
||
|
|
"step": 1855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8355795148247979,
|
||
|
|
"grad_norm": 3.21875,
|
||
|
|
"learning_rate": 0.000279,
|
||
|
|
"loss": 5.7636,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8378256963162624,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.00027975,
|
||
|
|
"loss": 5.7395,
|
||
|
|
"step": 1865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8400718778077269,
|
||
|
|
"grad_norm": 3.6875,
|
||
|
|
"learning_rate": 0.0002805,
|
||
|
|
"loss": 5.7519,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8423180592991913,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00028125,
|
||
|
|
"loss": 5.706,
|
||
|
|
"step": 1875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8445642407906558,
|
||
|
|
"grad_norm": 3.390625,
|
||
|
|
"learning_rate": 0.00028199999999999997,
|
||
|
|
"loss": 5.799,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8468104222821204,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00028274999999999996,
|
||
|
|
"loss": 5.7856,
|
||
|
|
"step": 1885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8490566037735849,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00028349999999999995,
|
||
|
|
"loss": 5.8625,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8513027852650494,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.00028425,
|
||
|
|
"loss": 5.7212,
|
||
|
|
"step": 1895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8535489667565139,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.000285,
|
||
|
|
"loss": 5.7326,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8557951482479784,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.00028575,
|
||
|
|
"loss": 5.7664,
|
||
|
|
"step": 1905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8580413297394429,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.00028649999999999997,
|
||
|
|
"loss": 5.7231,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8602875112309074,
|
||
|
|
"grad_norm": 3.40625,
|
||
|
|
"learning_rate": 0.00028724999999999996,
|
||
|
|
"loss": 5.7759,
|
||
|
|
"step": 1915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.862533692722372,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00028799999999999995,
|
||
|
|
"loss": 5.7442,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8647798742138365,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.00028875,
|
||
|
|
"loss": 5.7252,
|
||
|
|
"step": 1925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.867026055705301,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.0002895,
|
||
|
|
"loss": 5.7196,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8692722371967655,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.00029025,
|
||
|
|
"loss": 5.7376,
|
||
|
|
"step": 1935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.87151841868823,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00029099999999999997,
|
||
|
|
"loss": 5.8077,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8737646001796945,
|
||
|
|
"grad_norm": 3.625,
|
||
|
|
"learning_rate": 0.00029174999999999996,
|
||
|
|
"loss": 5.7826,
|
||
|
|
"step": 1945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.876010781671159,
|
||
|
|
"grad_norm": 3.609375,
|
||
|
|
"learning_rate": 0.00029249999999999995,
|
||
|
|
"loss": 5.736,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8782569631626236,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00029325,
|
||
|
|
"loss": 5.7531,
|
||
|
|
"step": 1955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8805031446540881,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.000294,
|
||
|
|
"loss": 5.7246,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8827493261455526,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.00029475,
|
||
|
|
"loss": 5.7786,
|
||
|
|
"step": 1965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8849955076370171,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.00029549999999999997,
|
||
|
|
"loss": 5.7237,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8872416891284816,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029624999999999996,
|
||
|
|
"loss": 5.8053,
|
||
|
|
"step": 1975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.889487870619946,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.00029699999999999996,
|
||
|
|
"loss": 5.6918,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8917340521114105,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029775,
|
||
|
|
"loss": 5.8251,
|
||
|
|
"step": 1985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8939802336028752,
|
||
|
|
"grad_norm": 3.78125,
|
||
|
|
"learning_rate": 0.0002985,
|
||
|
|
"loss": 5.7529,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8962264150943396,
|
||
|
|
"grad_norm": 3.640625,
|
||
|
|
"learning_rate": 0.00029925,
|
||
|
|
"loss": 5.7181,
|
||
|
|
"step": 1995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8984725965858041,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.0003,
|
||
|
|
"loss": 5.7413,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8984725965858041,
|
||
|
|
"eval_loss": 5.639461517333984,
|
||
|
|
"eval_runtime": 16.0491,
|
||
|
|
"eval_samples_per_second": 1932.383,
|
||
|
|
"eval_steps_per_second": 241.571,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9007187780772686,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00029999995942443054,
|
||
|
|
"loss": 5.6436,
|
||
|
|
"step": 2005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9029649595687331,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.00029999983769774674,
|
||
|
|
"loss": 5.7627,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9052111410601976,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.0002999996348200217,
|
||
|
|
"loss": 5.7181,
|
||
|
|
"step": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9074573225516622,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.0002999993507913773,
|
||
|
|
"loss": 5.7097,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9097035040431267,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002999989856119844,
|
||
|
|
"loss": 5.6407,
|
||
|
|
"step": 2025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9119496855345912,
|
||
|
|
"grad_norm": 3.453125,
|
||
|
|
"learning_rate": 0.0002999985392820624,
|
||
|
|
"loss": 5.6532,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9141958670260557,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002999980118018797,
|
||
|
|
"loss": 5.6993,
|
||
|
|
"step": 2035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9164420485175202,
|
||
|
|
"grad_norm": 3.546875,
|
||
|
|
"learning_rate": 0.0002999974031717533,
|
||
|
|
"loss": 5.6507,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9186882300089847,
|
||
|
|
"grad_norm": 3.546875,
|
||
|
|
"learning_rate": 0.0002999967133920491,
|
||
|
|
"loss": 5.6629,
|
||
|
|
"step": 2045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9209344115004492,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002999959424631818,
|
||
|
|
"loss": 5.7172,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9231805929919138,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002999950903856147,
|
||
|
|
"loss": 5.5766,
|
||
|
|
"step": 2055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9254267744833783,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.00029999415715986,
|
||
|
|
"loss": 5.6546,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9276729559748428,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.0002999931427864788,
|
||
|
|
"loss": 5.6317,
|
||
|
|
"step": 2065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9299191374663073,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00029999204726608076,
|
||
|
|
"loss": 5.6605,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9321653189577718,
|
||
|
|
"grad_norm": 3.40625,
|
||
|
|
"learning_rate": 0.0002999908705993245,
|
||
|
|
"loss": 5.6958,
|
||
|
|
"step": 2075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9344115004492363,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029998961278691725,
|
||
|
|
"loss": 5.6498,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9366576819407008,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002999882738296152,
|
||
|
|
"loss": 5.6887,
|
||
|
|
"step": 2085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9389038634321654,
|
||
|
|
"grad_norm": 3.453125,
|
||
|
|
"learning_rate": 0.0002999868537282231,
|
||
|
|
"loss": 5.617,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9411500449236299,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.0002999853524835947,
|
||
|
|
"loss": 5.7708,
|
||
|
|
"step": 2095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9433962264150944,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.0002999837700966324,
|
||
|
|
"loss": 5.6733,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9456424079065588,
|
||
|
|
"grad_norm": 3.359375,
|
||
|
|
"learning_rate": 0.00029998210656828736,
|
||
|
|
"loss": 5.7,
|
||
|
|
"step": 2105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9478885893980233,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.0002999803618995596,
|
||
|
|
"loss": 5.6652,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9501347708894878,
|
||
|
|
"grad_norm": 3.71875,
|
||
|
|
"learning_rate": 0.00029997853609149797,
|
||
|
|
"loss": 5.7413,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9523809523809523,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.00029997662914519983,
|
||
|
|
"loss": 5.7038,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9546271338724169,
|
||
|
|
"grad_norm": 3.546875,
|
||
|
|
"learning_rate": 0.0002999746410618116,
|
||
|
|
"loss": 5.6402,
|
||
|
|
"step": 2125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9568733153638814,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029997257184252827,
|
||
|
|
"loss": 5.5762,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9591194968553459,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00029997042148859374,
|
||
|
|
"loss": 5.7327,
|
||
|
|
"step": 2135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9613656783468104,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.0002999681900013006,
|
||
|
|
"loss": 5.6974,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9636118598382749,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002999658773819903,
|
||
|
|
"loss": 5.7185,
|
||
|
|
"step": 2145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9658580413297394,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.00029996348363205296,
|
||
|
|
"loss": 5.7269,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.968104222821204,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002999610087529275,
|
||
|
|
"loss": 5.6719,
|
||
|
|
"step": 2155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9703504043126685,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.00029995845274610164,
|
||
|
|
"loss": 5.6067,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.972596585804133,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00029995581561311185,
|
||
|
|
"loss": 5.612,
|
||
|
|
"step": 2165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9748427672955975,
|
||
|
|
"grad_norm": 3.390625,
|
||
|
|
"learning_rate": 0.00029995309735554327,
|
||
|
|
"loss": 5.6163,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.977088948787062,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00029995029797503007,
|
||
|
|
"loss": 5.6468,
|
||
|
|
"step": 2175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9793351302785265,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029994741747325487,
|
||
|
|
"loss": 5.6653,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.981581311769991,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00029994445585194925,
|
||
|
|
"loss": 5.6416,
|
||
|
|
"step": 2185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9838274932614556,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029994141311289347,
|
||
|
|
"loss": 5.5982,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9860736747529201,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.00029993828925791664,
|
||
|
|
"loss": 5.6288,
|
||
|
|
"step": 2195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9883198562443846,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002999350842888965,
|
||
|
|
"loss": 5.6725,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9905660377358491,
|
||
|
|
"grad_norm": 3.40625,
|
||
|
|
"learning_rate": 0.0002999317982077596,
|
||
|
|
"loss": 5.6444,
|
||
|
|
"step": 2205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9928122192273136,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029992843101648144,
|
||
|
|
"loss": 5.6642,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.995058400718778,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029992498271708595,
|
||
|
|
"loss": 5.6011,
|
||
|
|
"step": 2215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9973045822102425,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.00029992145331164596,
|
||
|
|
"loss": 5.6432,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9995507637017071,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002999178428022831,
|
||
|
|
"loss": 5.6428,
|
||
|
|
"step": 2225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0017969451931716,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.0002999141511911678,
|
||
|
|
"loss": 5.5542,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0040431266846361,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.000299910378480519,
|
||
|
|
"loss": 5.6403,
|
||
|
|
"step": 2235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0062893081761006,
|
||
|
|
"grad_norm": 3.21875,
|
||
|
|
"learning_rate": 0.0002999065246726047,
|
||
|
|
"loss": 5.5451,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0085354896675651,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002999025897697414,
|
||
|
|
"loss": 5.6575,
|
||
|
|
"step": 2245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0107816711590296,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002998985737742945,
|
||
|
|
"loss": 5.5892,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.013027852650494,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002998944766886781,
|
||
|
|
"loss": 5.6127,
|
||
|
|
"step": 2255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0152740341419586,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.000299890298515355,
|
||
|
|
"loss": 5.5885,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.017520215633423,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.0002998860392568368,
|
||
|
|
"loss": 5.5215,
|
||
|
|
"step": 2265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0197663971248876,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.00029988169891568373,
|
||
|
|
"loss": 5.6074,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0220125786163523,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.00029987727749450506,
|
||
|
|
"loss": 5.6192,
|
||
|
|
"step": 2275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0242587601078168,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.00029987277499595843,
|
||
|
|
"loss": 5.5663,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0265049415992813,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.0002998681914227504,
|
||
|
|
"loss": 5.5862,
|
||
|
|
"step": 2285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0287511230907458,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002998635267776363,
|
||
|
|
"loss": 5.5536,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0309973045822103,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.0002998587810634201,
|
||
|
|
"loss": 5.5818,
|
||
|
|
"step": 2295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0332434860736748,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.0002998539542829546,
|
||
|
|
"loss": 5.6147,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0354896675651393,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029984904643914114,
|
||
|
|
"loss": 5.6629,
|
||
|
|
"step": 2305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0377358490566038,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.00029984405753493006,
|
||
|
|
"loss": 5.5412,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0399820305480683,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.00029983898757332024,
|
||
|
|
"loss": 5.5598,
|
||
|
|
"step": 2315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0422282120395328,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002998338365573593,
|
||
|
|
"loss": 5.6111,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0444743935309972,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.0002998286044901436,
|
||
|
|
"loss": 5.4899,
|
||
|
|
"step": 2325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0467205750224617,
|
||
|
|
"grad_norm": 3.453125,
|
||
|
|
"learning_rate": 0.0002998232913748184,
|
||
|
|
"loss": 5.5567,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0489667565139262,
|
||
|
|
"grad_norm": 3.40625,
|
||
|
|
"learning_rate": 0.0002998178972145773,
|
||
|
|
"loss": 5.4968,
|
||
|
|
"step": 2335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0512129380053907,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.000299812422012663,
|
||
|
|
"loss": 5.6119,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0534591194968554,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002998068657723666,
|
||
|
|
"loss": 5.5563,
|
||
|
|
"step": 2345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.05570530098832,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002998012284970282,
|
||
|
|
"loss": 5.5985,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0579514824797844,
|
||
|
|
"grad_norm": 3.46875,
|
||
|
|
"learning_rate": 0.00029979551019003643,
|
||
|
|
"loss": 5.5002,
|
||
|
|
"step": 2355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.060197663971249,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002997897108548286,
|
||
|
|
"loss": 5.6114,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0624438454627134,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00029978383049489093,
|
||
|
|
"loss": 5.5056,
|
||
|
|
"step": 2365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.064690026954178,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002997778691137582,
|
||
|
|
"loss": 5.515,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0669362084456424,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.00029977182671501383,
|
||
|
|
"loss": 5.5303,
|
||
|
|
"step": 2375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.069182389937107,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00029976570330229006,
|
||
|
|
"loss": 5.5147,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0714285714285714,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.00029975949887926784,
|
||
|
|
"loss": 5.5098,
|
||
|
|
"step": 2385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.073674752920036,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029975321344967676,
|
||
|
|
"loss": 5.5533,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0759209344115004,
|
||
|
|
"grad_norm": 3.28125,
|
||
|
|
"learning_rate": 0.000299746847017295,
|
||
|
|
"loss": 5.5429,
|
||
|
|
"step": 2395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0781671159029649,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00029974039958594967,
|
||
|
|
"loss": 5.508,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0804132973944294,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002997338711595165,
|
||
|
|
"loss": 5.5494,
|
||
|
|
"step": 2405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.082659478885894,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.00029972726174191965,
|
||
|
|
"loss": 5.4273,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0849056603773586,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029972057133713235,
|
||
|
|
"loss": 5.5474,
|
||
|
|
"step": 2415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.087151841868823,
|
||
|
|
"grad_norm": 2.84375,
|
||
|
|
"learning_rate": 0.00029971379994917624,
|
||
|
|
"loss": 5.5008,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0893980233602876,
|
||
|
|
"grad_norm": 3.359375,
|
||
|
|
"learning_rate": 0.00029970694758212177,
|
||
|
|
"loss": 5.4682,
|
||
|
|
"step": 2425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.091644204851752,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.000299700014240088,
|
||
|
|
"loss": 5.4666,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0938903863432166,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.00029969299992724273,
|
||
|
|
"loss": 5.5844,
|
||
|
|
"step": 2435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.096136567834681,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.00029968590464780247,
|
||
|
|
"loss": 5.5141,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0983827493261455,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002996787284060322,
|
||
|
|
"loss": 5.4897,
|
||
|
|
"step": 2445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.10062893081761,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00029967147120624573,
|
||
|
|
"loss": 5.4318,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1028751123090745,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 0.00029966413305280553,
|
||
|
|
"loss": 5.506,
|
||
|
|
"step": 2455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.105121293800539,
|
||
|
|
"grad_norm": 3.390625,
|
||
|
|
"learning_rate": 0.00029965671395012274,
|
||
|
|
"loss": 5.4363,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1073674752920035,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.0002996492139026571,
|
||
|
|
"loss": 5.4077,
|
||
|
|
"step": 2465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.109613656783468,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.000299641632914917,
|
||
|
|
"loss": 5.4435,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1118598382749325,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.0002996339709914596,
|
||
|
|
"loss": 5.4641,
|
||
|
|
"step": 2475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1141060197663972,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002996262281368905,
|
||
|
|
"loss": 5.5053,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1163522012578617,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.0002996184043558642,
|
||
|
|
"loss": 5.3987,
|
||
|
|
"step": 2485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1185983827493262,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002996104996530837,
|
||
|
|
"loss": 5.6063,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1208445642407907,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.0002996025140333006,
|
||
|
|
"loss": 5.4782,
|
||
|
|
"step": 2495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1230907457322552,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00029959444750131533,
|
||
|
|
"loss": 5.4836,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1253369272237197,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002995863000619768,
|
||
|
|
"loss": 5.5181,
|
||
|
|
"step": 2505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1275831087151842,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002995780717201825,
|
||
|
|
"loss": 5.4469,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1298292902066487,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002995697624808788,
|
||
|
|
"loss": 5.4445,
|
||
|
|
"step": 2515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1320754716981132,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00029956137234906044,
|
||
|
|
"loss": 5.4844,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1343216531895777,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.00029955290132977093,
|
||
|
|
"loss": 5.5633,
|
||
|
|
"step": 2525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1365678346810422,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002995443494281024,
|
||
|
|
"loss": 5.4724,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1388140161725067,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.00029953571664919547,
|
||
|
|
"loss": 5.4786,
|
||
|
|
"step": 2535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1410601976639712,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.0002995270029982396,
|
||
|
|
"loss": 5.5004,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1433063791554359,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029951820848047255,
|
||
|
|
"loss": 5.4758,
|
||
|
|
"step": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1455525606469004,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002995093331011811,
|
||
|
|
"loss": 5.4789,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1477987421383649,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029950037686570023,
|
||
|
|
"loss": 5.3991,
|
||
|
|
"step": 2555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1500449236298294,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.0002994913397794138,
|
||
|
|
"loss": 5.5046,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1522911051212938,
|
||
|
|
"grad_norm": 3.46875,
|
||
|
|
"learning_rate": 0.00029948222184775415,
|
||
|
|
"loss": 5.5293,
|
||
|
|
"step": 2565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1545372866127583,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00029947302307620227,
|
||
|
|
"loss": 5.4079,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1567834681042228,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002994637434702877,
|
||
|
|
"loss": 5.425,
|
||
|
|
"step": 2575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1590296495956873,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.0002994543830355886,
|
||
|
|
"loss": 5.4591,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1612758310871518,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.0002994449417777317,
|
||
|
|
"loss": 5.5263,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1635220125786163,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00029943541970239233,
|
||
|
|
"loss": 5.4458,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1657681940700808,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00029942581681529447,
|
||
|
|
"loss": 5.4449,
|
||
|
|
"step": 2595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1680143755615453,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.00029941613312221046,
|
||
|
|
"loss": 5.5558,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1702605570530098,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029940636862896145,
|
||
|
|
"loss": 5.5165,
|
||
|
|
"step": 2605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1725067385444743,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.0002993965233414171,
|
||
|
|
"loss": 5.4624,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1747529200359388,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002993865972654955,
|
||
|
|
"loss": 5.4336,
|
||
|
|
"step": 2615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1769991015274035,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.0002993765904071635,
|
||
|
|
"loss": 5.5293,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.179245283018868,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.00029936650277243633,
|
||
|
|
"loss": 5.5603,
|
||
|
|
"step": 2625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1814914645103325,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002993563343673779,
|
||
|
|
"loss": 5.4785,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.183737646001797,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002993460851981007,
|
||
|
|
"loss": 5.4188,
|
||
|
|
"step": 2635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1859838274932615,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029933575527076565,
|
||
|
|
"loss": 5.5139,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.188230008984726,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002993253445915823,
|
||
|
|
"loss": 5.3998,
|
||
|
|
"step": 2645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1904761904761905,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.0002993148531668087,
|
||
|
|
"loss": 5.5066,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.192722371967655,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002993042810027514,
|
||
|
|
"loss": 5.416,
|
||
|
|
"step": 2655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1949685534591195,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.0002992936281057656,
|
||
|
|
"loss": 5.4367,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.197214734950584,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.000299282894482255,
|
||
|
|
"loss": 5.3912,
|
||
|
|
"step": 2665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1994609164420484,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029927208013867164,
|
||
|
|
"loss": 5.4456,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.201707097933513,
|
||
|
|
"grad_norm": 3.296875,
|
||
|
|
"learning_rate": 0.0002992611850815163,
|
||
|
|
"loss": 5.5036,
|
||
|
|
"step": 2675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2039532794249777,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.0002992502093173383,
|
||
|
|
"loss": 5.4467,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2061994609164421,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.0002992391528527353,
|
||
|
|
"loss": 5.3611,
|
||
|
|
"step": 2685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2084456424079066,
|
||
|
|
"grad_norm": 3.359375,
|
||
|
|
"learning_rate": 0.00029922801569435366,
|
||
|
|
"loss": 5.4635,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2106918238993711,
|
||
|
|
"grad_norm": 3.671875,
|
||
|
|
"learning_rate": 0.00029921679784888797,
|
||
|
|
"loss": 5.4823,
|
||
|
|
"step": 2695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2129380053908356,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.0002992054993230816,
|
||
|
|
"loss": 5.378,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2151841868823001,
|
||
|
|
"grad_norm": 2.765625,
|
||
|
|
"learning_rate": 0.0002991941201237263,
|
||
|
|
"loss": 5.4737,
|
||
|
|
"step": 2705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2174303683737646,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002991826602576624,
|
||
|
|
"loss": 5.4399,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.219676549865229,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029917111973177857,
|
||
|
|
"loss": 5.4663,
|
||
|
|
"step": 2715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2219227313566936,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00029915949855301204,
|
||
|
|
"loss": 5.3946,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.224168912848158,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002991477967283485,
|
||
|
|
"loss": 5.4415,
|
||
|
|
"step": 2725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2264150943396226,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00029913601426482226,
|
||
|
|
"loss": 5.3648,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.228661275831087,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.00029912415116951593,
|
||
|
|
"loss": 5.4543,
|
||
|
|
"step": 2735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2309074573225516,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002991122074495606,
|
||
|
|
"loss": 5.381,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.233153638814016,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002991001831121359,
|
||
|
|
"loss": 5.4367,
|
||
|
|
"step": 2745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2353998203054806,
|
||
|
|
"grad_norm": 3.796875,
|
||
|
|
"learning_rate": 0.00029908807816446994,
|
||
|
|
"loss": 5.5144,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2376460017969453,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002990758926138392,
|
||
|
|
"loss": 5.4193,
|
||
|
|
"step": 2755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2398921832884098,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.0002990636264675687,
|
||
|
|
"loss": 5.4758,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2421383647798743,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00029905127973303176,
|
||
|
|
"loss": 5.4093,
|
||
|
|
"step": 2765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2443845462713388,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029903885241765036,
|
||
|
|
"loss": 5.4189,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2466307277628033,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002990263445288947,
|
||
|
|
"loss": 5.4447,
|
||
|
|
"step": 2775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2488769092542678,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002990137560742836,
|
||
|
|
"loss": 5.3926,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2511230907457322,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.00029900108706138416,
|
||
|
|
"loss": 5.3857,
|
||
|
|
"step": 2785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2533692722371967,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.000298988337497812,
|
||
|
|
"loss": 5.4141,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2556154537286612,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002989755073912311,
|
||
|
|
"loss": 5.422,
|
||
|
|
"step": 2795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2578616352201257,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002989625967493541,
|
||
|
|
"loss": 5.3838,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2601078167115902,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029894960557994146,
|
||
|
|
"loss": 5.5335,
|
||
|
|
"step": 2805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.262353998203055,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029893653389080274,
|
||
|
|
"loss": 5.3528,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2646001796945194,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002989233816897954,
|
||
|
|
"loss": 5.3309,
|
||
|
|
"step": 2815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.266846361185984,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002989101489848256,
|
||
|
|
"loss": 5.4407,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2690925426774484,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.0002988968357838477,
|
||
|
|
"loss": 5.3808,
|
||
|
|
"step": 2825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.271338724168913,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002988834420948647,
|
||
|
|
"loss": 5.4058,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2735849056603774,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002988699679259275,
|
||
|
|
"loss": 5.4674,
|
||
|
|
"step": 2835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.275831087151842,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029885641328513594,
|
||
|
|
"loss": 5.4242,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2780772686433064,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002988427781806379,
|
||
|
|
"loss": 5.4332,
|
||
|
|
"step": 2845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.280323450134771,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002988290626206297,
|
||
|
|
"loss": 5.3583,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2825696316262354,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.000298815266613356,
|
||
|
|
"loss": 5.3448,
|
||
|
|
"step": 2855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2848158131176999,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002988013901671099,
|
||
|
|
"loss": 5.4957,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2870619946091644,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.0002987874332902328,
|
||
|
|
"loss": 5.4692,
|
||
|
|
"step": 2865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2893081761006289,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002987733959911144,
|
||
|
|
"loss": 5.3743,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2915543575920934,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029875927827819286,
|
||
|
|
"loss": 5.368,
|
||
|
|
"step": 2875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2938005390835579,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029874508015995463,
|
||
|
|
"loss": 5.3748,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2960467205750223,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002987308016449344,
|
||
|
|
"loss": 5.3995,
|
||
|
|
"step": 2885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2982929020664868,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00029871644274171534,
|
||
|
|
"loss": 5.3753,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3005390835579516,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.00029870200345892876,
|
||
|
|
"loss": 5.4296,
|
||
|
|
"step": 2895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.302785265049416,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029868748380525444,
|
||
|
|
"loss": 5.315,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3050314465408805,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002986728837894205,
|
||
|
|
"loss": 5.4592,
|
||
|
|
"step": 2905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.307277628032345,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.00029865820342020325,
|
||
|
|
"loss": 5.4735,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3095238095238095,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002986434427064273,
|
||
|
|
"loss": 5.3768,
|
||
|
|
"step": 2915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.311769991015274,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002986286016569657,
|
||
|
|
"loss": 5.381,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3140161725067385,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002986136802807396,
|
||
|
|
"loss": 5.4079,
|
||
|
|
"step": 2925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.316262353998203,
|
||
|
|
"grad_norm": 3.21875,
|
||
|
|
"learning_rate": 0.00029859867858671857,
|
||
|
|
"loss": 5.435,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3185085354896675,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.00029858359658392045,
|
||
|
|
"loss": 5.4919,
|
||
|
|
"step": 2935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.320754716981132,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.00029856843428141127,
|
||
|
|
"loss": 5.3849,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3230008984725967,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.00029855319168830543,
|
||
|
|
"loss": 5.4001,
|
||
|
|
"step": 2945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3252470799640612,
|
||
|
|
"grad_norm": 3.375,
|
||
|
|
"learning_rate": 0.0002985378688137656,
|
||
|
|
"loss": 5.5048,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3274932614555257,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029852246566700253,
|
||
|
|
"loss": 5.367,
|
||
|
|
"step": 2955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3297394429469902,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002985069822572754,
|
||
|
|
"loss": 5.3137,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3319856244384547,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002984914185938916,
|
||
|
|
"loss": 5.3961,
|
||
|
|
"step": 2965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3342318059299192,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002984757746862068,
|
||
|
|
"loss": 5.4488,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3364779874213837,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.00029846005054362474,
|
||
|
|
"loss": 5.4318,
|
||
|
|
"step": 2975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3387241689128482,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002984442461755977,
|
||
|
|
"loss": 5.3834,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3409703504043127,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029842836159162583,
|
||
|
|
"loss": 5.4205,
|
||
|
|
"step": 2985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3432165318957772,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002984123968012577,
|
||
|
|
"loss": 5.4352,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3454627133872417,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002983963518140901,
|
||
|
|
"loss": 5.4451,
|
||
|
|
"step": 2995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3477088948787062,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029838022663976793,
|
||
|
|
"loss": 5.3171,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3477088948787062,
|
||
|
|
"eval_loss": 5.344548225402832,
|
||
|
|
"eval_runtime": 16.0596,
|
||
|
|
"eval_samples_per_second": 1931.124,
|
||
|
|
"eval_steps_per_second": 241.414,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3499550763701706,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002983640212879844,
|
||
|
|
"loss": 5.4371,
|
||
|
|
"step": 3005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3522012578616351,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.0002983477357684809,
|
||
|
|
"loss": 5.3769,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3544474393530996,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.0002983313700910468,
|
||
|
|
"loss": 5.4952,
|
||
|
|
"step": 3015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3566936208445641,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029831492426552,
|
||
|
|
"loss": 5.3494,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3589398023360286,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029829839830178636,
|
||
|
|
"loss": 5.4431,
|
||
|
|
"step": 3025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3611859838274933,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.00029828179220977994,
|
||
|
|
"loss": 5.3644,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3634321653189578,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.000298265105999483,
|
||
|
|
"loss": 5.3982,
|
||
|
|
"step": 3035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3656783468104223,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029824833968092595,
|
||
|
|
"loss": 5.3913,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3679245283018868,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029823149326418735,
|
||
|
|
"loss": 5.3851,
|
||
|
|
"step": 3045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3701707097933513,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002982145667593939,
|
||
|
|
"loss": 5.3206,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3724168912848158,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.00029819756017672043,
|
||
|
|
"loss": 5.3429,
|
||
|
|
"step": 3055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3746630727762803,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00029818047352639,
|
||
|
|
"loss": 5.4596,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3769092542677448,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029816330681867366,
|
||
|
|
"loss": 5.3423,
|
||
|
|
"step": 3065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3791554357592093,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.0002981460600638907,
|
||
|
|
"loss": 5.3283,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3814016172506738,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029812873327240844,
|
||
|
|
"loss": 5.3159,
|
||
|
|
"step": 3075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3836477987421385,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002981113264546424,
|
||
|
|
"loss": 5.3529,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.385893980233603,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002980938396210561,
|
||
|
|
"loss": 5.46,
|
||
|
|
"step": 3085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3881401617250675,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029807627278216126,
|
||
|
|
"loss": 5.4219,
|
||
|
|
"step": 3090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.390386343216532,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002980586259485177,
|
||
|
|
"loss": 5.4519,
|
||
|
|
"step": 3095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3926325247079965,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.00029804089913073315,
|
||
|
|
"loss": 5.4067,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.394878706199461,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002980230923394637,
|
||
|
|
"loss": 5.348,
|
||
|
|
"step": 3105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3971248876909255,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.00029800520558541317,
|
||
|
|
"loss": 5.3693,
|
||
|
|
"step": 3110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.39937106918239,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002979872388793338,
|
||
|
|
"loss": 5.3537,
|
||
|
|
"step": 3115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4016172506738545,
|
||
|
|
"grad_norm": 2.75,
|
||
|
|
"learning_rate": 0.00029796919223202563,
|
||
|
|
"loss": 5.3571,
|
||
|
|
"step": 3120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.403863432165319,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002979510656543369,
|
||
|
|
"loss": 5.3759,
|
||
|
|
"step": 3125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4061096136567834,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002979328591571639,
|
||
|
|
"loss": 5.3222,
|
||
|
|
"step": 3130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.408355795148248,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029791457275145085,
|
||
|
|
"loss": 5.2987,
|
||
|
|
"step": 3135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4106019766397124,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.00029789620644819005,
|
||
|
|
"loss": 5.3843,
|
||
|
|
"step": 3140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.412848158131177,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029787776025842186,
|
||
|
|
"loss": 5.3461,
|
||
|
|
"step": 3145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4150943396226414,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029785923419323467,
|
||
|
|
"loss": 5.3381,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.417340521114106,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002978406282637648,
|
||
|
|
"loss": 5.3985,
|
||
|
|
"step": 3155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4195867026055704,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002978219424811967,
|
||
|
|
"loss": 5.3383,
|
||
|
|
"step": 3160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4218328840970351,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00029780317685676276,
|
||
|
|
"loss": 5.4033,
|
||
|
|
"step": 3165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4240790655884996,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002977843314017433,
|
||
|
|
"loss": 5.4135,
|
||
|
|
"step": 3170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.426325247079964,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002977654061274668,
|
||
|
|
"loss": 5.3461,
|
||
|
|
"step": 3175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4285714285714286,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002977464010453095,
|
||
|
|
"loss": 5.281,
|
||
|
|
"step": 3180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.430817610062893,
|
||
|
|
"grad_norm": 3.359375,
|
||
|
|
"learning_rate": 0.0002977273161666957,
|
||
|
|
"loss": 5.4328,
|
||
|
|
"step": 3185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4330637915543576,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029770815150309787,
|
||
|
|
"loss": 5.3081,
|
||
|
|
"step": 3190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.435309973045822,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002976889070660361,
|
||
|
|
"loss": 5.4198,
|
||
|
|
"step": 3195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4375561545372866,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002976695828670787,
|
||
|
|
"loss": 5.3054,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.439802336028751,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.00029765017891784175,
|
||
|
|
"loss": 5.4182,
|
||
|
|
"step": 3205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4420485175202156,
|
||
|
|
"grad_norm": 2.765625,
|
||
|
|
"learning_rate": 0.00029763069522998936,
|
||
|
|
"loss": 5.3818,
|
||
|
|
"step": 3210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.44429469901168,
|
||
|
|
"grad_norm": 2.78125,
|
||
|
|
"learning_rate": 0.0002976111318152336,
|
||
|
|
"loss": 5.34,
|
||
|
|
"step": 3215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4465408805031448,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002975914886853344,
|
||
|
|
"loss": 5.4218,
|
||
|
|
"step": 3220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4487870619946093,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029757176585209957,
|
||
|
|
"loss": 5.3399,
|
||
|
|
"step": 3225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4510332434860738,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.000297551963327385,
|
||
|
|
"loss": 5.2921,
|
||
|
|
"step": 3230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4532794249775383,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029753208112309423,
|
||
|
|
"loss": 5.3799,
|
||
|
|
"step": 3235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4555256064690028,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029751211925117897,
|
||
|
|
"loss": 5.2984,
|
||
|
|
"step": 3240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4577717879604672,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00029749207772363867,
|
||
|
|
"loss": 5.379,
|
||
|
|
"step": 3245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4600179694519317,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002974719565525207,
|
||
|
|
"loss": 5.3465,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4622641509433962,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002974517557499201,
|
||
|
|
"loss": 5.413,
|
||
|
|
"step": 3255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4645103324348607,
|
||
|
|
"grad_norm": 3.40625,
|
||
|
|
"learning_rate": 0.00029743147532798023,
|
||
|
|
"loss": 5.2814,
|
||
|
|
"step": 3260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4667565139263252,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029741111529889194,
|
||
|
|
"loss": 5.3454,
|
||
|
|
"step": 3265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4690026954177897,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.000297390675674894,
|
||
|
|
"loss": 5.3013,
|
||
|
|
"step": 3270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4712488769092542,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002973701564682731,
|
||
|
|
"loss": 5.2762,
|
||
|
|
"step": 3275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4734950584007187,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029734955769136377,
|
||
|
|
"loss": 5.3686,
|
||
|
|
"step": 3280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4757412398921832,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00029732887935654827,
|
||
|
|
"loss": 5.3697,
|
||
|
|
"step": 3285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4779874213836477,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002973081214762568,
|
||
|
|
"loss": 5.2504,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4802336028751122,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029728728406296735,
|
||
|
|
"loss": 5.3318,
|
||
|
|
"step": 3295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.482479784366577,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029726636712920564,
|
||
|
|
"loss": 5.3078,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4847259658580414,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002972453706875453,
|
||
|
|
"loss": 5.3814,
|
||
|
|
"step": 3305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.486972147349506,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.0002972242947506076,
|
||
|
|
"loss": 5.2753,
|
||
|
|
"step": 3310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4892183288409704,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002972031393310619,
|
||
|
|
"loss": 5.3256,
|
||
|
|
"step": 3315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4914645103324349,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002971819044416249,
|
||
|
|
"loss": 5.3758,
|
||
|
|
"step": 3320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4937106918238994,
|
||
|
|
"grad_norm": 2.75,
|
||
|
|
"learning_rate": 0.00029716059009506145,
|
||
|
|
"loss": 5.3209,
|
||
|
|
"step": 3325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4959568733153639,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.000297139196304184,
|
||
|
|
"loss": 5.3075,
|
||
|
|
"step": 3330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4982030548068284,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.0002971177230818527,
|
||
|
|
"loss": 5.3805,
|
||
|
|
"step": 3335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5004492362982929,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002970961704409756,
|
||
|
|
"loss": 5.3156,
|
||
|
|
"step": 3340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5026954177897576,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002970745383945084,
|
||
|
|
"loss": 5.3465,
|
||
|
|
"step": 3345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.504941599281222,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029705282695545454,
|
||
|
|
"loss": 5.3717,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5071877807726866,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029703103613686527,
|
||
|
|
"loss": 5.2288,
|
||
|
|
"step": 3355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.509433962264151,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002970091659518393,
|
||
|
|
"loss": 5.2978,
|
||
|
|
"step": 3360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5116801437556155,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002969872164135234,
|
||
|
|
"loss": 5.2993,
|
||
|
|
"step": 3365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.51392632524708,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029696518753511173,
|
||
|
|
"loss": 5.3231,
|
||
|
|
"step": 3370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5161725067385445,
|
||
|
|
"grad_norm": 2.796875,
|
||
|
|
"learning_rate": 0.0002969430793298464,
|
||
|
|
"loss": 5.334,
|
||
|
|
"step": 3375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.518418688230009,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029692089181101696,
|
||
|
|
"loss": 5.2514,
|
||
|
|
"step": 3380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5206648697214735,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002968986249919609,
|
||
|
|
"loss": 5.3403,
|
||
|
|
"step": 3385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.522911051212938,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002968762788860631,
|
||
|
|
"loss": 5.3209,
|
||
|
|
"step": 3390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5251572327044025,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002968538535067564,
|
||
|
|
"loss": 5.3657,
|
||
|
|
"step": 3395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.527403414195867,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.000296831348867521,
|
||
|
|
"loss": 5.3167,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5296495956873315,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002968087649818848,
|
||
|
|
"loss": 5.2753,
|
||
|
|
"step": 3405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.531895777178796,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002967861018634237,
|
||
|
|
"loss": 5.3678,
|
||
|
|
"step": 3410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5341419586702605,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00029676335952576074,
|
||
|
|
"loss": 5.3243,
|
||
|
|
"step": 3415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.536388140161725,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002967405379825668,
|
||
|
|
"loss": 5.2466,
|
||
|
|
"step": 3420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5386343216531895,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002967176372475604,
|
||
|
|
"loss": 5.2428,
|
||
|
|
"step": 3425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.540880503144654,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002966946573345076,
|
||
|
|
"loss": 5.2614,
|
||
|
|
"step": 3430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5431266846361185,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029667159825722206,
|
||
|
|
"loss": 5.3399,
|
||
|
|
"step": 3435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.545372866127583,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029664846002956506,
|
||
|
|
"loss": 5.2338,
|
||
|
|
"step": 3440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5476190476190477,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002966252426654454,
|
||
|
|
"loss": 5.3445,
|
||
|
|
"step": 3445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5498652291105122,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.0002966019461788196,
|
||
|
|
"loss": 5.2916,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5521114106019767,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002965785705836915,
|
||
|
|
"loss": 5.3159,
|
||
|
|
"step": 3455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5543575920934412,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002965551158941127,
|
||
|
|
"loss": 5.3027,
|
||
|
|
"step": 3460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5566037735849056,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002965315821241823,
|
||
|
|
"loss": 5.2319,
|
||
|
|
"step": 3465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5588499550763701,
|
||
|
|
"grad_norm": 3.875,
|
||
|
|
"learning_rate": 0.00029650796928804685,
|
||
|
|
"loss": 5.3169,
|
||
|
|
"step": 3470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5610961365678346,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002964842773999005,
|
||
|
|
"loss": 5.2524,
|
||
|
|
"step": 3475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5633423180592994,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002964605064739849,
|
||
|
|
"loss": 5.3455,
|
||
|
|
"step": 3480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5655884995507638,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002964366565245892,
|
||
|
|
"loss": 5.3241,
|
||
|
|
"step": 3485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5678346810422283,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029641272756605023,
|
||
|
|
"loss": 5.301,
|
||
|
|
"step": 3490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5700808625336928,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002963887196127519,
|
||
|
|
"loss": 5.2987,
|
||
|
|
"step": 3495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5723270440251573,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029636463267912607,
|
||
|
|
"loss": 5.2262,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5745732255166218,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.00029634046677965174,
|
||
|
|
"loss": 5.2556,
|
||
|
|
"step": 3505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5768194070080863,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.00029631622192885553,
|
||
|
|
"loss": 5.3328,
|
||
|
|
"step": 3510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5790655884995508,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029629189814131155,
|
||
|
|
"loss": 5.3252,
|
||
|
|
"step": 3515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5813117699910153,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002962674954316413,
|
||
|
|
"loss": 5.2871,
|
||
|
|
"step": 3520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5835579514824798,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002962430138145137,
|
||
|
|
"loss": 5.2723,
|
||
|
|
"step": 3525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5858041329739443,
|
||
|
|
"grad_norm": 2.765625,
|
||
|
|
"learning_rate": 0.000296218453304645,
|
||
|
|
"loss": 5.2836,
|
||
|
|
"step": 3530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5880503144654088,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029619381391679923,
|
||
|
|
"loss": 5.3014,
|
||
|
|
"step": 3535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5902964959568733,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029616909566578746,
|
||
|
|
"loss": 5.2194,
|
||
|
|
"step": 3540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5925426774483378,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.0002961442985664684,
|
||
|
|
"loss": 5.3363,
|
||
|
|
"step": 3545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5947888589398023,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.000296119422633748,
|
||
|
|
"loss": 5.2192,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5970350404312668,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002960944678825797,
|
||
|
|
"loss": 5.2585,
|
||
|
|
"step": 3555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5992812219227313,
|
||
|
|
"grad_norm": 3.40625,
|
||
|
|
"learning_rate": 0.0002960694343279643,
|
||
|
|
"loss": 5.4105,
|
||
|
|
"step": 3560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6015274034141957,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002960443219849499,
|
||
|
|
"loss": 5.2834,
|
||
|
|
"step": 3565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6037735849056602,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002960191308686321,
|
||
|
|
"loss": 5.2917,
|
||
|
|
"step": 3570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6060197663971247,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002959938609941537,
|
||
|
|
"loss": 5.3014,
|
||
|
|
"step": 3575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6082659478885895,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029596851237670494,
|
||
|
|
"loss": 5.2469,
|
||
|
|
"step": 3580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.610512129380054,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029594308503152344,
|
||
|
|
"loss": 5.2651,
|
||
|
|
"step": 3585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6127583108715184,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029591757897389403,
|
||
|
|
"loss": 5.2144,
|
||
|
|
"step": 3590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.615004492362983,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029589199421914885,
|
||
|
|
"loss": 5.2536,
|
||
|
|
"step": 3595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6172506738544474,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002958663307826674,
|
||
|
|
"loss": 5.2291,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.619496855345912,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.00029584058867987656,
|
||
|
|
"loss": 5.2936,
|
||
|
|
"step": 3605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6217430368373764,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.00029581476792625035,
|
||
|
|
"loss": 5.3135,
|
||
|
|
"step": 3610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6239892183288411,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.0002957888685373101,
|
||
|
|
"loss": 5.2395,
|
||
|
|
"step": 3615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6262353998203056,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002957628905286245,
|
||
|
|
"loss": 5.2269,
|
||
|
|
"step": 3620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6284815813117701,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.00029573683391580946,
|
||
|
|
"loss": 5.2192,
|
||
|
|
"step": 3625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6307277628032346,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.000295710698714528,
|
||
|
|
"loss": 5.2539,
|
||
|
|
"step": 3630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.632973944294699,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002956844849404906,
|
||
|
|
"loss": 5.2506,
|
||
|
|
"step": 3635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6352201257861636,
|
||
|
|
"grad_norm": 2.78125,
|
||
|
|
"learning_rate": 0.00029565819260945483,
|
||
|
|
"loss": 5.2739,
|
||
|
|
"step": 3640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.637466307277628,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029563182173722555,
|
||
|
|
"loss": 5.232,
|
||
|
|
"step": 3645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6397124887690926,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002956053723396548,
|
||
|
|
"loss": 5.3054,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.641958670260557,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002955788444326418,
|
||
|
|
"loss": 5.2955,
|
||
|
|
"step": 3655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6442048517520216,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029555223803213305,
|
||
|
|
"loss": 5.2577,
|
||
|
|
"step": 3660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.646451033243486,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029552555315412216,
|
||
|
|
"loss": 5.2796,
|
||
|
|
"step": 3665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6486972147349506,
|
||
|
|
"grad_norm": 3.75,
|
||
|
|
"learning_rate": 0.0002954987898146499,
|
||
|
|
"loss": 5.3159,
|
||
|
|
"step": 3670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.650943396226415,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002954719480298043,
|
||
|
|
"loss": 5.2639,
|
||
|
|
"step": 3675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6531895777178796,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.00029544502781572035,
|
||
|
|
"loss": 5.2906,
|
||
|
|
"step": 3680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.655435759209344,
|
||
|
|
"grad_norm": 4.75,
|
||
|
|
"learning_rate": 0.0002954180291885804,
|
||
|
|
"loss": 5.299,
|
||
|
|
"step": 3685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6576819407008085,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029539095216461395,
|
||
|
|
"loss": 5.2026,
|
||
|
|
"step": 3690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.659928122192273,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.0002953637967600974,
|
||
|
|
"loss": 5.2159,
|
||
|
|
"step": 3695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6621743036837375,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002953365629913544,
|
||
|
|
"loss": 5.22,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.664420485175202,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002953092508747557,
|
||
|
|
"loss": 5.1528,
|
||
|
|
"step": 3705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6666666666666665,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002952818604267193,
|
||
|
|
"loss": 5.234,
|
||
|
|
"step": 3710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.668912848158131,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.0002952543916637099,
|
||
|
|
"loss": 5.263,
|
||
|
|
"step": 3715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6711590296495957,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.00029522684460223965,
|
||
|
|
"loss": 5.2879,
|
||
|
|
"step": 3720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6734052111410602,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002951992192588676,
|
||
|
|
"loss": 5.2081,
|
||
|
|
"step": 3725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6756513926325247,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002951715156501999,
|
||
|
|
"loss": 5.2688,
|
||
|
|
"step": 3730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6778975741239892,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029514373379288967,
|
||
|
|
"loss": 5.2266,
|
||
|
|
"step": 3735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6801437556154537,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.0002951158737036372,
|
||
|
|
"loss": 5.2542,
|
||
|
|
"step": 3740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6823899371069182,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002950879353991897,
|
||
|
|
"loss": 5.2341,
|
||
|
|
"step": 3745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.684636118598383,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.0002950599188963414,
|
||
|
|
"loss": 5.2238,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6868823000898474,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002950318242119337,
|
||
|
|
"loss": 5.3397,
|
||
|
|
"step": 3755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.689128481581312,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002950036513628547,
|
||
|
|
"loss": 5.2441,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6913746630727764,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.0002949754003660397,
|
||
|
|
"loss": 5.3238,
|
||
|
|
"step": 3765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.693620844564241,
|
||
|
|
"grad_norm": 3.390625,
|
||
|
|
"learning_rate": 0.00029494707123847095,
|
||
|
|
"loss": 5.3302,
|
||
|
|
"step": 3770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6958670260557054,
|
||
|
|
"grad_norm": 3.28125,
|
||
|
|
"learning_rate": 0.0002949186639971777,
|
||
|
|
"loss": 5.2831,
|
||
|
|
"step": 3775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6981132075471699,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029489017865923597,
|
||
|
|
"loss": 5.2566,
|
||
|
|
"step": 3780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7003593890386344,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029486161524176893,
|
||
|
|
"loss": 5.2631,
|
||
|
|
"step": 3785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7026055705300989,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002948329737619466,
|
||
|
|
"loss": 5.2597,
|
||
|
|
"step": 3790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7048517520215634,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.0002948042542369859,
|
||
|
|
"loss": 5.2838,
|
||
|
|
"step": 3795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7070979335130279,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002947754566841508,
|
||
|
|
"loss": 5.2681,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7093441150044923,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029474658112075197,
|
||
|
|
"loss": 5.3089,
|
||
|
|
"step": 3805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7115902964959568,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029471762756414703,
|
||
|
|
"loss": 5.2663,
|
||
|
|
"step": 3810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7138364779874213,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.00029468859603174065,
|
||
|
|
"loss": 5.2597,
|
||
|
|
"step": 3815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7160826594788858,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029465948654098427,
|
||
|
|
"loss": 5.2646,
|
||
|
|
"step": 3820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7183288409703503,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002946302991093761,
|
||
|
|
"loss": 5.2662,
|
||
|
|
"step": 3825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7205750224618148,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029460103375446116,
|
||
|
|
"loss": 5.2176,
|
||
|
|
"step": 3830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7228212039532793,
|
||
|
|
"grad_norm": 2.84375,
|
||
|
|
"learning_rate": 0.00029457169049383164,
|
||
|
|
"loss": 5.225,
|
||
|
|
"step": 3835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7250673854447438,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029454226934512624,
|
||
|
|
"loss": 5.2631,
|
||
|
|
"step": 3840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7273135669362083,
|
||
|
|
"grad_norm": 2.8125,
|
||
|
|
"learning_rate": 0.00029451277032603064,
|
||
|
|
"loss": 5.2029,
|
||
|
|
"step": 3845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7295597484276728,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002944831934542772,
|
||
|
|
"loss": 5.2321,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7318059299191375,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029445353874764526,
|
||
|
|
"loss": 5.2173,
|
||
|
|
"step": 3855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.734052111410602,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.00029442380622396073,
|
||
|
|
"loss": 5.2293,
|
||
|
|
"step": 3860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7362982929020665,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.00029439399590109645,
|
||
|
|
"loss": 5.1509,
|
||
|
|
"step": 3865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.738544474393531,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029436410779697206,
|
||
|
|
"loss": 5.2911,
|
||
|
|
"step": 3870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7407906558849955,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029433414192955377,
|
||
|
|
"loss": 5.1782,
|
||
|
|
"step": 3875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.74303683737646,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002943040983168547,
|
||
|
|
"loss": 5.2294,
|
||
|
|
"step": 3880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7452830188679245,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.0002942739769769347,
|
||
|
|
"loss": 5.2567,
|
||
|
|
"step": 3885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7475292003593892,
|
||
|
|
"grad_norm": 3.546875,
|
||
|
|
"learning_rate": 0.00029424377792790023,
|
||
|
|
"loss": 5.2894,
|
||
|
|
"step": 3890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7497753818508537,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002942135011879046,
|
||
|
|
"loss": 5.3933,
|
||
|
|
"step": 3895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7520215633423182,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00029418314677514764,
|
||
|
|
"loss": 5.295,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7542677448337827,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002941527147078761,
|
||
|
|
"loss": 5.1949,
|
||
|
|
"step": 3905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7565139263252472,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029412220500438317,
|
||
|
|
"loss": 5.1329,
|
||
|
|
"step": 3910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7587601078167117,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002940916176830089,
|
||
|
|
"loss": 5.3141,
|
||
|
|
"step": 3915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7610062893081762,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002940609527621399,
|
||
|
|
"loss": 5.2578,
|
||
|
|
"step": 3920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7632524707996406,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029403021026020955,
|
||
|
|
"loss": 5.2614,
|
||
|
|
"step": 3925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7654986522911051,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.00029399939019569767,
|
||
|
|
"loss": 5.2955,
|
||
|
|
"step": 3930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7677448337825696,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029396849258713084,
|
||
|
|
"loss": 5.2972,
|
||
|
|
"step": 3935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7699910152740341,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029393751745308215,
|
||
|
|
"loss": 5.2714,
|
||
|
|
"step": 3940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7722371967654986,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.0002939064648121714,
|
||
|
|
"loss": 5.2846,
|
||
|
|
"step": 3945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7744833782569631,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.00029387533468306504,
|
||
|
|
"loss": 5.263,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7767295597484276,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002938441270844758,
|
||
|
|
"loss": 5.1442,
|
||
|
|
"step": 3955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.778975741239892,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.00029381284203516334,
|
||
|
|
"loss": 5.209,
|
||
|
|
"step": 3960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7812219227313566,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029378147955393363,
|
||
|
|
"loss": 5.2285,
|
||
|
|
"step": 3965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.783468104222821,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.00029375003965963935,
|
||
|
|
"loss": 5.2605,
|
||
|
|
"step": 3970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7857142857142856,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029371852237117957,
|
||
|
|
"loss": 5.2557,
|
||
|
|
"step": 3975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.78796046720575,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029368692770749994,
|
||
|
|
"loss": 5.1953,
|
||
|
|
"step": 3980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7902066486972146,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029365525568759266,
|
||
|
|
"loss": 5.2138,
|
||
|
|
"step": 3985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7924528301886793,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002936235063304964,
|
||
|
|
"loss": 5.2362,
|
||
|
|
"step": 3990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7946990116801438,
|
||
|
|
"grad_norm": 3.703125,
|
||
|
|
"learning_rate": 0.0002935916796552963,
|
||
|
|
"loss": 5.238,
|
||
|
|
"step": 3995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7969451931716083,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029355977568112403,
|
||
|
|
"loss": 5.2092,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7969451931716083,
|
||
|
|
"eval_loss": 5.183039665222168,
|
||
|
|
"eval_runtime": 16.1808,
|
||
|
|
"eval_samples_per_second": 1916.649,
|
||
|
|
"eval_steps_per_second": 239.604,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7991913746630728,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.00029352779442715765,
|
||
|
|
"loss": 5.2075,
|
||
|
|
"step": 4005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8014375561545373,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002934957359126218,
|
||
|
|
"loss": 5.1898,
|
||
|
|
"step": 4010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8036837376460018,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.0002934636001567873,
|
||
|
|
"loss": 5.2844,
|
||
|
|
"step": 4015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8059299191374663,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002934313871789718,
|
||
|
|
"loss": 5.2941,
|
||
|
|
"step": 4020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.808176100628931,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00029339909699853904,
|
||
|
|
"loss": 5.3192,
|
||
|
|
"step": 4025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8104222821203955,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029336672963489925,
|
||
|
|
"loss": 5.1957,
|
||
|
|
"step": 4030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.81266846361186,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002933342851075092,
|
||
|
|
"loss": 5.2322,
|
||
|
|
"step": 4035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8149146451033245,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029330176343587175,
|
||
|
|
"loss": 5.124,
|
||
|
|
"step": 4040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.817160826594789,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029326916463953646,
|
||
|
|
"loss": 5.195,
|
||
|
|
"step": 4045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8194070080862534,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002932364887380991,
|
||
|
|
"loss": 5.2398,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.821653189577718,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029320373575120174,
|
||
|
|
"loss": 5.1243,
|
||
|
|
"step": 4055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8238993710691824,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002931709056985328,
|
||
|
|
"loss": 5.1875,
|
||
|
|
"step": 4060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.826145552560647,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002931379985998272,
|
||
|
|
"loss": 5.2679,
|
||
|
|
"step": 4065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8283917340521114,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002931050144748659,
|
||
|
|
"loss": 5.1371,
|
||
|
|
"step": 4070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.830637915543576,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002930719533434764,
|
||
|
|
"loss": 5.2114,
|
||
|
|
"step": 4075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8328840970350404,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002930388152255323,
|
||
|
|
"loss": 5.2132,
|
||
|
|
"step": 4080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.835130278526505,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002930056001409537,
|
||
|
|
"loss": 5.211,
|
||
|
|
"step": 4085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8373764600179694,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002929723081097067,
|
||
|
|
"loss": 5.1184,
|
||
|
|
"step": 4090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8396226415094339,
|
||
|
|
"grad_norm": 2.796875,
|
||
|
|
"learning_rate": 0.00029293893915180387,
|
||
|
|
"loss": 5.1128,
|
||
|
|
"step": 4095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8418688230008984,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00029290549328730395,
|
||
|
|
"loss": 5.2356,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8441150044923629,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.0002928719705363118,
|
||
|
|
"loss": 5.1903,
|
||
|
|
"step": 4105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8463611859838274,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029283837091897876,
|
||
|
|
"loss": 5.1552,
|
||
|
|
"step": 4110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8486073674752919,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029280469445550213,
|
||
|
|
"loss": 5.1519,
|
||
|
|
"step": 4115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8508535489667564,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002927709411661255,
|
||
|
|
"loss": 5.181,
|
||
|
|
"step": 4120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.853099730458221,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.00029273711107113856,
|
||
|
|
"loss": 5.1855,
|
||
|
|
"step": 4125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8553459119496856,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029270320419087743,
|
||
|
|
"loss": 5.2248,
|
||
|
|
"step": 4130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.85759209344115,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029266922054572395,
|
||
|
|
"loss": 5.1783,
|
||
|
|
"step": 4135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8598382749326146,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029263516015610655,
|
||
|
|
"loss": 5.2069,
|
||
|
|
"step": 4140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.862084456424079,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.0002926010230424995,
|
||
|
|
"loss": 5.1962,
|
||
|
|
"step": 4145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8643306379155435,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00029256680922542334,
|
||
|
|
"loss": 5.1803,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.866576819407008,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.0002925325187254446,
|
||
|
|
"loss": 5.2128,
|
||
|
|
"step": 4155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8688230008984728,
|
||
|
|
"grad_norm": 2.78125,
|
||
|
|
"learning_rate": 0.00029249815156317605,
|
||
|
|
"loss": 5.184,
|
||
|
|
"step": 4160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8710691823899372,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.0002924637077592764,
|
||
|
|
"loss": 5.2263,
|
||
|
|
"step": 4165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8733153638814017,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002924291873344505,
|
||
|
|
"loss": 5.1901,
|
||
|
|
"step": 4170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8755615453728662,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029239459030944935,
|
||
|
|
"loss": 5.2521,
|
||
|
|
"step": 4175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8778077268643307,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002923599167050697,
|
||
|
|
"loss": 5.167,
|
||
|
|
"step": 4180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8800539083557952,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002923251665421547,
|
||
|
|
"loss": 5.1813,
|
||
|
|
"step": 4185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8823000898472597,
|
||
|
|
"grad_norm": 2.8125,
|
||
|
|
"learning_rate": 0.0002922903398415933,
|
||
|
|
"loss": 5.2392,
|
||
|
|
"step": 4190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8845462713387242,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002922554366243205,
|
||
|
|
"loss": 5.2032,
|
||
|
|
"step": 4195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8867924528301887,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00029222045691131737,
|
||
|
|
"loss": 5.1849,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8890386343216532,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.00029218540072361074,
|
||
|
|
"loss": 5.1958,
|
||
|
|
"step": 4205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8912848158131177,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002921502680822738,
|
||
|
|
"loss": 5.174,
|
||
|
|
"step": 4210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8935309973045822,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.0002921150590084252,
|
||
|
|
"loss": 5.2986,
|
||
|
|
"step": 4215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8957771787960467,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00029207977352323005,
|
||
|
|
"loss": 5.1103,
|
||
|
|
"step": 4220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8980233602875112,
|
||
|
|
"grad_norm": 2.796875,
|
||
|
|
"learning_rate": 0.000292044411647899,
|
||
|
|
"loss": 5.2693,
|
||
|
|
"step": 4225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9002695417789757,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029200897340368883,
|
||
|
|
"loss": 5.219,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9025157232704402,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002919734588119021,
|
||
|
|
"loss": 5.1556,
|
||
|
|
"step": 4235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9047619047619047,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002919378678938874,
|
||
|
|
"loss": 5.202,
|
||
|
|
"step": 4240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9070080862533692,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.000291902200671039,
|
||
|
|
"loss": 5.1384,
|
||
|
|
"step": 4245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9092542677448336,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00029186645716479734,
|
||
|
|
"loss": 5.1446,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9115004492362981,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.0002918306373966484,
|
||
|
|
"loss": 5.3229,
|
||
|
|
"step": 4255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9137466307277629,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00029179474138812424,
|
||
|
|
"loss": 5.1863,
|
||
|
|
"step": 4260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9159928122192273,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002917587691608026,
|
||
|
|
"loss": 5.1948,
|
||
|
|
"step": 4265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9182389937106918,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00029172272073630707,
|
||
|
|
"loss": 5.1398,
|
||
|
|
"step": 4270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9204851752021563,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.000291686596136307,
|
||
|
|
"loss": 5.2248,
|
||
|
|
"step": 4275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9227313566936208,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029165039538251786,
|
||
|
|
"loss": 5.2137,
|
||
|
|
"step": 4280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9249775381850853,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00029161411849670034,
|
||
|
|
"loss": 5.2118,
|
||
|
|
"step": 4285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9272237196765498,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.00029157776550066134,
|
||
|
|
"loss": 5.1821,
|
||
|
|
"step": 4290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9294699011680145,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002915413364162533,
|
||
|
|
"loss": 5.1385,
|
||
|
|
"step": 4295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.931716082659479,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029150483126537445,
|
||
|
|
"loss": 5.1265,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9339622641509435,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002914682500699688,
|
||
|
|
"loss": 5.173,
|
||
|
|
"step": 4305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.936208445642408,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00029143159285202597,
|
||
|
|
"loss": 5.175,
|
||
|
|
"step": 4310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9384546271338725,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002913948596335814,
|
||
|
|
"loss": 5.1925,
|
||
|
|
"step": 4315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.940700808625337,
|
||
|
|
"grad_norm": 3.21875,
|
||
|
|
"learning_rate": 0.00029135805043671597,
|
||
|
|
"loss": 5.1982,
|
||
|
|
"step": 4320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9429469901168015,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002913211652835567,
|
||
|
|
"loss": 5.1497,
|
||
|
|
"step": 4325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.945193171608266,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029128420419627566,
|
||
|
|
"loss": 5.151,
|
||
|
|
"step": 4330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9474393530997305,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00029124716719709114,
|
||
|
|
"loss": 5.1051,
|
||
|
|
"step": 4335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.949685534591195,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002912100543082666,
|
||
|
|
"loss": 5.1568,
|
||
|
|
"step": 4340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9519317160826595,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002911728655521115,
|
||
|
|
"loss": 5.1824,
|
||
|
|
"step": 4345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.954177897574124,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029113560095098064,
|
||
|
|
"loss": 5.1908,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9564240790655885,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002910982605272745,
|
||
|
|
"loss": 5.1337,
|
||
|
|
"step": 4355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.958670260557053,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002910608443034391,
|
||
|
|
"loss": 5.2017,
|
||
|
|
"step": 4360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9609164420485174,
|
||
|
|
"grad_norm": 2.84375,
|
||
|
|
"learning_rate": 0.00029102335230196615,
|
||
|
|
"loss": 5.131,
|
||
|
|
"step": 4365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.963162623539982,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.00029098578454539274,
|
||
|
|
"loss": 5.1247,
|
||
|
|
"step": 4370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9654088050314464,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002909481410563017,
|
||
|
|
"loss": 5.1947,
|
||
|
|
"step": 4375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.967654986522911,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002909104218573211,
|
||
|
|
"loss": 5.162,
|
||
|
|
"step": 4380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9699011680143754,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029087262697112494,
|
||
|
|
"loss": 5.1051,
|
||
|
|
"step": 4385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.97214734950584,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029083475642043216,
|
||
|
|
"loss": 5.1855,
|
||
|
|
"step": 4390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9743935309973046,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002907968102280077,
|
||
|
|
"loss": 5.1933,
|
||
|
|
"step": 4395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9766397124887691,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.0002907587884166616,
|
||
|
|
"loss": 5.1138,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9788858939802336,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.0002907206910092498,
|
||
|
|
"loss": 5.1579,
|
||
|
|
"step": 4405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9811320754716981,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.000290682518028673,
|
||
|
|
"loss": 5.1163,
|
||
|
|
"step": 4410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9833782569631626,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00029064426949787807,
|
||
|
|
"loss": 5.1887,
|
||
|
|
"step": 4415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.985624438454627,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002906059454398567,
|
||
|
|
"loss": 5.2164,
|
||
|
|
"step": 4420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9878706199460916,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002905675458776464,
|
||
|
|
"loss": 5.0996,
|
||
|
|
"step": 4425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9901168014375563,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002905290708343298,
|
||
|
|
"loss": 5.1728,
|
||
|
|
"step": 4430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9923629829290208,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00029049052033303514,
|
||
|
|
"loss": 5.1126,
|
||
|
|
"step": 4435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9946091644204853,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.00029045189439693564,
|
||
|
|
"loss": 5.1486,
|
||
|
|
"step": 4440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9968553459119498,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00029041319304925036,
|
||
|
|
"loss": 5.098,
|
||
|
|
"step": 4445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9991015274034143,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002903744163132432,
|
||
|
|
"loss": 5.1236,
|
||
|
|
"step": 4450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.001347708894879,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.00029033556421222383,
|
||
|
|
"loss": 5.1441,
|
||
|
|
"step": 4455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0035938903863433,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002902966367695468,
|
||
|
|
"loss": 5.0451,
|
||
|
|
"step": 4460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0058400718778078,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.00029025763400861236,
|
||
|
|
"loss": 5.104,
|
||
|
|
"step": 4465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0080862533692723,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00029021855595286574,
|
||
|
|
"loss": 5.0897,
|
||
|
|
"step": 4470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0103324348607368,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002901794026257975,
|
||
|
|
"loss": 4.9517,
|
||
|
|
"step": 4475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0125786163522013,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.0002901401740509435,
|
||
|
|
"loss": 4.9774,
|
||
|
|
"step": 4480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0148247978436657,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002901008702518848,
|
||
|
|
"loss": 4.986,
|
||
|
|
"step": 4485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0170709793351302,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002900614912522476,
|
||
|
|
"loss": 5.0134,
|
||
|
|
"step": 4490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0193171608265947,
|
||
|
|
"grad_norm": 3.3125,
|
||
|
|
"learning_rate": 0.0002900220370757035,
|
||
|
|
"loss": 5.0922,
|
||
|
|
"step": 4495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0215633423180592,
|
||
|
|
"grad_norm": 2.8125,
|
||
|
|
"learning_rate": 0.0002899825077459692,
|
||
|
|
"loss": 5.0198,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0238095238095237,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002899429032868064,
|
||
|
|
"loss": 5.1019,
|
||
|
|
"step": 4505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.026055705300988,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.0002899032237220223,
|
||
|
|
"loss": 5.0552,
|
||
|
|
"step": 4510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0283018867924527,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.0002898634690754689,
|
||
|
|
"loss": 5.0344,
|
||
|
|
"step": 4515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.030548068283917,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002898236393710436,
|
||
|
|
"loss": 5.04,
|
||
|
|
"step": 4520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0327942497753817,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.00028978373463268883,
|
||
|
|
"loss": 5.0868,
|
||
|
|
"step": 4525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.035040431266846,
|
||
|
|
"grad_norm": 3.234375,
|
||
|
|
"learning_rate": 0.00028974375488439194,
|
||
|
|
"loss": 5.0977,
|
||
|
|
"step": 4530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0372866127583107,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002897037001501857,
|
||
|
|
"loss": 5.0351,
|
||
|
|
"step": 4535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.039532794249775,
|
||
|
|
"grad_norm": 3.5,
|
||
|
|
"learning_rate": 0.00028966357045414774,
|
||
|
|
"loss": 5.115,
|
||
|
|
"step": 4540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0417789757412397,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.00028962336582040086,
|
||
|
|
"loss": 5.137,
|
||
|
|
"step": 4545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0440251572327046,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002895830862731127,
|
||
|
|
"loss": 5.0389,
|
||
|
|
"step": 4550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.046271338724169,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002895427318364963,
|
||
|
|
"loss": 5.045,
|
||
|
|
"step": 4555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0485175202156336,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00028950230253480935,
|
||
|
|
"loss": 5.0665,
|
||
|
|
"step": 4560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.050763701707098,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00028946179839235475,
|
||
|
|
"loss": 4.9852,
|
||
|
|
"step": 4565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0530098831985626,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002894212194334803,
|
||
|
|
"loss": 5.1119,
|
||
|
|
"step": 4570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.055256064690027,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.00028938056568257874,
|
||
|
|
"loss": 5.0799,
|
||
|
|
"step": 4575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0575022461814916,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.000289339837164088,
|
||
|
|
"loss": 5.0597,
|
||
|
|
"step": 4580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.059748427672956,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.0002892990339024907,
|
||
|
|
"loss": 5.0044,
|
||
|
|
"step": 4585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0619946091644206,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002892581559223144,
|
||
|
|
"loss": 5.0103,
|
||
|
|
"step": 4590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.064240790655885,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00028921720324813185,
|
||
|
|
"loss": 5.0157,
|
||
|
|
"step": 4595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0664869721473496,
|
||
|
|
"grad_norm": 2.875,
|
||
|
|
"learning_rate": 0.0002891761759045603,
|
||
|
|
"loss": 5.0655,
|
||
|
|
"step": 4600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.068733153638814,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002891350739162622,
|
||
|
|
"loss": 5.1106,
|
||
|
|
"step": 4605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0709793351302785,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002890938973079447,
|
||
|
|
"loss": 5.129,
|
||
|
|
"step": 4610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.073225516621743,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.00028905264610436,
|
||
|
|
"loss": 5.031,
|
||
|
|
"step": 4615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0754716981132075,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.00028901132033030475,
|
||
|
|
"loss": 5.0716,
|
||
|
|
"step": 4620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.077717879604672,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.000288969920010621,
|
||
|
|
"loss": 5.0758,
|
||
|
|
"step": 4625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0799640610961365,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.000288928445170195,
|
||
|
|
"loss": 5.0436,
|
||
|
|
"step": 4630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.082210242587601,
|
||
|
|
"grad_norm": 2.84375,
|
||
|
|
"learning_rate": 0.00028888689583395826,
|
||
|
|
"loss": 5.0841,
|
||
|
|
"step": 4635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0844564240790655,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00028884527202688683,
|
||
|
|
"loss": 5.0446,
|
||
|
|
"step": 4640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.08670260557053,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.0002888035737740016,
|
||
|
|
"loss": 4.9765,
|
||
|
|
"step": 4645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0889487870619945,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.00028876180110036823,
|
||
|
|
"loss": 5.1058,
|
||
|
|
"step": 4650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.091194968553459,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002887199540310971,
|
||
|
|
"loss": 5.0546,
|
||
|
|
"step": 4655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0934411500449235,
|
||
|
|
"grad_norm": 2.828125,
|
||
|
|
"learning_rate": 0.00028867803259134326,
|
||
|
|
"loss": 4.9612,
|
||
|
|
"step": 4660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.095687331536388,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.00028863603680630653,
|
||
|
|
"loss": 5.0064,
|
||
|
|
"step": 4665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0979335130278525,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.00028859396670123135,
|
||
|
|
"loss": 5.0299,
|
||
|
|
"step": 4670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.100179694519317,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.000288551822301407,
|
||
|
|
"loss": 5.0889,
|
||
|
|
"step": 4675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1024258760107815,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00028850960363216714,
|
||
|
|
"loss": 5.0944,
|
||
|
|
"step": 4680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1046720575022464,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002884673107188904,
|
||
|
|
"loss": 4.9692,
|
||
|
|
"step": 4685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.106918238993711,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00028842494358699973,
|
||
|
|
"loss": 4.9994,
|
||
|
|
"step": 4690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1091644204851754,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.000288382502261963,
|
||
|
|
"loss": 5.0891,
|
||
|
|
"step": 4695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.11141060197664,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002883399867692924,
|
||
|
|
"loss": 5.0812,
|
||
|
|
"step": 4700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1136567834681044,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 0.00028829739713454483,
|
||
|
|
"loss": 5.0365,
|
||
|
|
"step": 4705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.115902964959569,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002882547333833218,
|
||
|
|
"loss": 5.0654,
|
||
|
|
"step": 4710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1181491464510334,
|
||
|
|
"grad_norm": 2.859375,
|
||
|
|
"learning_rate": 0.00028821199554126934,
|
||
|
|
"loss": 4.9854,
|
||
|
|
"step": 4715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.120395327942498,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002881691836340779,
|
||
|
|
"loss": 5.0865,
|
||
|
|
"step": 4720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1226415094339623,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00028812629768748267,
|
||
|
|
"loss": 5.045,
|
||
|
|
"step": 4725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.124887690925427,
|
||
|
|
"grad_norm": 3.390625,
|
||
|
|
"learning_rate": 0.00028808333772726316,
|
||
|
|
"loss": 5.0897,
|
||
|
|
"step": 4730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1271338724168913,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00028804030377924345,
|
||
|
|
"loss": 5.0187,
|
||
|
|
"step": 4735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.129380053908356,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.0002879971958692921,
|
||
|
|
"loss": 5.0898,
|
||
|
|
"step": 4740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1316262353998203,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.00028795401402332215,
|
||
|
|
"loss": 5.0058,
|
||
|
|
"step": 4745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.133872416891285,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.00028791075826729097,
|
||
|
|
"loss": 5.0468,
|
||
|
|
"step": 4750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1361185983827493,
|
||
|
|
"grad_norm": 3.109375,
|
||
|
|
"learning_rate": 0.00028786742862720055,
|
||
|
|
"loss": 5.0241,
|
||
|
|
"step": 4755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.138364779874214,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.0002878240251290971,
|
||
|
|
"loss": 5.1405,
|
||
|
|
"step": 4760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1406109613656783,
|
||
|
|
"grad_norm": 2.828125,
|
||
|
|
"learning_rate": 0.0002877805477990713,
|
||
|
|
"loss": 5.0095,
|
||
|
|
"step": 4765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.142857142857143,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.00028773699666325835,
|
||
|
|
"loss": 5.0425,
|
||
|
|
"step": 4770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1451033243486073,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00028769337174783754,
|
||
|
|
"loss": 5.0217,
|
||
|
|
"step": 4775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.147349505840072,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.0002876496730790327,
|
||
|
|
"loss": 5.0803,
|
||
|
|
"step": 4780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1495956873315363,
|
||
|
|
"grad_norm": 3.484375,
|
||
|
|
"learning_rate": 0.00028760590068311194,
|
||
|
|
"loss": 5.0487,
|
||
|
|
"step": 4785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1518418688230008,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00028756205458638776,
|
||
|
|
"loss": 5.0174,
|
||
|
|
"step": 4790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1540880503144653,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00028751813481521694,
|
||
|
|
"loss": 5.0855,
|
||
|
|
"step": 4795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1563342318059298,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00028747414139600034,
|
||
|
|
"loss": 5.0706,
|
||
|
|
"step": 4800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1585804132973943,
|
||
|
|
"grad_norm": 2.984375,
|
||
|
|
"learning_rate": 0.0002874300743551835,
|
||
|
|
"loss": 5.1177,
|
||
|
|
"step": 4805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1608265947888587,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.0002873859337192558,
|
||
|
|
"loss": 5.0589,
|
||
|
|
"step": 4810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1630727762803232,
|
||
|
|
"grad_norm": 3.140625,
|
||
|
|
"learning_rate": 0.00028734171951475104,
|
||
|
|
"loss": 5.0959,
|
||
|
|
"step": 4815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.165318957771788,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00028729743176824735,
|
||
|
|
"loss": 5.0754,
|
||
|
|
"step": 4820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1675651392632527,
|
||
|
|
"grad_norm": 3.03125,
|
||
|
|
"learning_rate": 0.0002872530705063669,
|
||
|
|
"loss": 5.0442,
|
||
|
|
"step": 4825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.169811320754717,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 0.00028720863575577615,
|
||
|
|
"loss": 4.9739,
|
||
|
|
"step": 4830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1720575022461817,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002871641275431856,
|
||
|
|
"loss": 5.0175,
|
||
|
|
"step": 4835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.174303683737646,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.0002871195458953501,
|
||
|
|
"loss": 5.0096,
|
||
|
|
"step": 4840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1765498652291106,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002870748908390686,
|
||
|
|
"loss": 5.0525,
|
||
|
|
"step": 4845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.178796046720575,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.0002870301624011839,
|
||
|
|
"loss": 5.0469,
|
||
|
|
"step": 4850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1810422282120396,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.0002869853606085834,
|
||
|
|
"loss": 5.0679,
|
||
|
|
"step": 4855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.183288409703504,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00028694048548819816,
|
||
|
|
"loss": 5.0369,
|
||
|
|
"step": 4860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1855345911949686,
|
||
|
|
"grad_norm": 3.078125,
|
||
|
|
"learning_rate": 0.00028689553706700356,
|
||
|
|
"loss": 5.0443,
|
||
|
|
"step": 4865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.187780772686433,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.000286850515372019,
|
||
|
|
"loss": 4.9984,
|
||
|
|
"step": 4870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1900269541778976,
|
||
|
|
"grad_norm": 2.9375,
|
||
|
|
"learning_rate": 0.00028680542043030787,
|
||
|
|
"loss": 4.9734,
|
||
|
|
"step": 4875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.192273135669362,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.0002867602522689776,
|
||
|
|
"loss": 5.0096,
|
||
|
|
"step": 4880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1945193171608266,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00028671501091517967,
|
||
|
|
"loss": 4.9606,
|
||
|
|
"step": 4885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.196765498652291,
|
||
|
|
"grad_norm": 3.171875,
|
||
|
|
"learning_rate": 0.0002866696963961096,
|
||
|
|
"loss": 5.072,
|
||
|
|
"step": 4890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1990116801437556,
|
||
|
|
"grad_norm": 2.96875,
|
||
|
|
"learning_rate": 0.0002866243087390067,
|
||
|
|
"loss": 5.0319,
|
||
|
|
"step": 4895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.20125786163522,
|
||
|
|
"grad_norm": 3.125,
|
||
|
|
"learning_rate": 0.0002865788479711545,
|
||
|
|
"loss": 5.0198,
|
||
|
|
"step": 4900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2035040431266846,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00028653331411988034,
|
||
|
|
"loss": 5.001,
|
||
|
|
"step": 4905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.205750224618149,
|
||
|
|
"grad_norm": 2.890625,
|
||
|
|
"learning_rate": 0.00028648770721255543,
|
||
|
|
"loss": 5.0652,
|
||
|
|
"step": 4910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2079964061096136,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.000286442027276595,
|
||
|
|
"loss": 4.9551,
|
||
|
|
"step": 4915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.210242587601078,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002863962743394583,
|
||
|
|
"loss": 5.0335,
|
||
|
|
"step": 4920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2124887690925426,
|
||
|
|
"grad_norm": 3.265625,
|
||
|
|
"learning_rate": 0.00028635044842864805,
|
||
|
|
"loss": 5.0267,
|
||
|
|
"step": 4925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.214734950584007,
|
||
|
|
"grad_norm": 3.34375,
|
||
|
|
"learning_rate": 0.0002863045495717113,
|
||
|
|
"loss": 5.0602,
|
||
|
|
"step": 4930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2169811320754715,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002862585777962387,
|
||
|
|
"loss": 5.0753,
|
||
|
|
"step": 4935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.219227313566936,
|
||
|
|
"grad_norm": 3.09375,
|
||
|
|
"learning_rate": 0.0002862125331298648,
|
||
|
|
"loss": 5.0716,
|
||
|
|
"step": 4940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2214734950584005,
|
||
|
|
"grad_norm": 3.328125,
|
||
|
|
"learning_rate": 0.0002861664156002679,
|
||
|
|
"loss": 5.0408,
|
||
|
|
"step": 4945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.223719676549865,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 0.00028612022523517015,
|
||
|
|
"loss": 5.0705,
|
||
|
|
"step": 4950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.22596585804133,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 0.0002860739620623375,
|
||
|
|
"loss": 5.06,
|
||
|
|
"step": 4955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2282120395327945,
|
||
|
|
"grad_norm": 2.953125,
|
||
|
|
"learning_rate": 0.00028602762610957966,
|
||
|
|
"loss": 5.0575,
|
||
|
|
"step": 4960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.230458221024259,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 0.0002859812174047501,
|
||
|
|
"loss": 5.0911,
|
||
|
|
"step": 4965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2327044025157234,
|
||
|
|
"grad_norm": 3.0625,
|
||
|
|
"learning_rate": 0.00028593473597574595,
|
||
|
|
"loss": 5.0714,
|
||
|
|
"step": 4970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.234950584007188,
|
||
|
|
"grad_norm": 3.15625,
|
||
|
|
"learning_rate": 0.00028588818185050816,
|
||
|
|
"loss": 4.9425,
|
||
|
|
"step": 4975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2371967654986524,
|
||
|
|
"grad_norm": 3.359375,
|
||
|
|
"learning_rate": 0.00028584155505702124,
|
||
|
|
"loss": 5.0257,
|
||
|
|
"step": 4980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.239442946990117,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 0.00028579485562331354,
|
||
|
|
"loss": 4.9997,
|
||
|
|
"step": 4985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2416891284815814,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 0.00028574808357745697,
|
||
|
|
"loss": 5.136,
|
||
|
|
"step": 4990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.243935309973046,
|
||
|
|
"grad_norm": 3.046875,
|
||
|
|
"learning_rate": 0.0002857012389475671,
|
||
|
|
"loss": 4.9934,
|
||
|
|
"step": 4995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2461814914645104,
|
||
|
|
"grad_norm": 3.203125,
|
||
|
|
"learning_rate": 0.0002856543217618033,
|
||
|
|
"loss": 5.0804,
|
||
|
|
"step": 5000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2461814914645104,
|
||
|
|
"eval_loss": 5.077792644500732,
|
||
|
|
"eval_runtime": 16.1311,
|
||
|
|
"eval_samples_per_second": 1922.556,
|
||
|
|
"eval_steps_per_second": 240.343,
|
||
|
|
"step": 5000
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 22260,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 5000,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 1.35397447273728e+17,
|
||
|
|
"train_batch_size": 32,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|