1010 lines
22 KiB
JSON
1010 lines
22 KiB
JSON
{
|
|
"best_global_step": 20,
|
|
"best_metric": 3.1484363079071045,
|
|
"best_model_checkpoint": "controlled-food-recipe-generation/checkpoint-20",
|
|
"epoch": 65.0,
|
|
"eval_steps": 1,
|
|
"global_step": 65,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 859997.4375,
|
|
"learning_rate": 5e-05,
|
|
"loss": 3.1008,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_loss": 3.7337515354156494,
|
|
"eval_runtime": 0.0252,
|
|
"eval_samples_per_second": 39.612,
|
|
"eval_steps_per_second": 39.612,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 591288.1875,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 2.6028,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_loss": 3.6040873527526855,
|
|
"eval_runtime": 0.0236,
|
|
"eval_samples_per_second": 42.438,
|
|
"eval_steps_per_second": 42.438,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 600605.1875,
|
|
"learning_rate": 4.9e-05,
|
|
"loss": 2.3589,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"eval_loss": 3.560584783554077,
|
|
"eval_runtime": 0.024,
|
|
"eval_samples_per_second": 41.583,
|
|
"eval_steps_per_second": 41.583,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 601246.5625,
|
|
"learning_rate": 4.85e-05,
|
|
"loss": 2.4875,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"eval_loss": 3.499941349029541,
|
|
"eval_runtime": 0.0229,
|
|
"eval_samples_per_second": 43.614,
|
|
"eval_steps_per_second": 43.614,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 500261.0625,
|
|
"learning_rate": 4.8e-05,
|
|
"loss": 2.1798,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"eval_loss": 3.4445371627807617,
|
|
"eval_runtime": 0.0281,
|
|
"eval_samples_per_second": 35.524,
|
|
"eval_steps_per_second": 35.524,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"grad_norm": 513553.28125,
|
|
"learning_rate": 4.75e-05,
|
|
"loss": 2.3408,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"eval_loss": 3.38865065574646,
|
|
"eval_runtime": 0.0244,
|
|
"eval_samples_per_second": 41.014,
|
|
"eval_steps_per_second": 41.014,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 7.0,
|
|
"grad_norm": 458548.125,
|
|
"learning_rate": 4.7e-05,
|
|
"loss": 2.0578,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 7.0,
|
|
"eval_loss": 3.3251659870147705,
|
|
"eval_runtime": 0.0236,
|
|
"eval_samples_per_second": 42.39,
|
|
"eval_steps_per_second": 42.39,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"grad_norm": 486291.0,
|
|
"learning_rate": 4.6500000000000005e-05,
|
|
"loss": 2.2078,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"eval_loss": 3.272376775741577,
|
|
"eval_runtime": 0.0234,
|
|
"eval_samples_per_second": 42.733,
|
|
"eval_steps_per_second": 42.733,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 9.0,
|
|
"grad_norm": 470378.375,
|
|
"learning_rate": 4.600000000000001e-05,
|
|
"loss": 2.0358,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 9.0,
|
|
"eval_loss": 3.2305002212524414,
|
|
"eval_runtime": 0.0235,
|
|
"eval_samples_per_second": 42.551,
|
|
"eval_steps_per_second": 42.551,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"grad_norm": 474659.625,
|
|
"learning_rate": 4.55e-05,
|
|
"loss": 1.9474,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"eval_loss": 3.2030880451202393,
|
|
"eval_runtime": 0.0304,
|
|
"eval_samples_per_second": 32.87,
|
|
"eval_steps_per_second": 32.87,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 11.0,
|
|
"grad_norm": 522849.3125,
|
|
"learning_rate": 4.5e-05,
|
|
"loss": 1.9579,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 11.0,
|
|
"eval_loss": 3.1824824810028076,
|
|
"eval_runtime": 0.0235,
|
|
"eval_samples_per_second": 42.623,
|
|
"eval_steps_per_second": 42.623,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 12.0,
|
|
"grad_norm": 483942.09375,
|
|
"learning_rate": 4.4500000000000004e-05,
|
|
"loss": 1.8673,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 12.0,
|
|
"eval_loss": 3.1729376316070557,
|
|
"eval_runtime": 0.0248,
|
|
"eval_samples_per_second": 40.289,
|
|
"eval_steps_per_second": 40.289,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 13.0,
|
|
"grad_norm": 503133.78125,
|
|
"learning_rate": 4.4000000000000006e-05,
|
|
"loss": 1.7788,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 13.0,
|
|
"eval_loss": 3.175445556640625,
|
|
"eval_runtime": 0.0288,
|
|
"eval_samples_per_second": 34.737,
|
|
"eval_steps_per_second": 34.737,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 14.0,
|
|
"grad_norm": 484873.28125,
|
|
"learning_rate": 4.35e-05,
|
|
"loss": 1.8805,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 14.0,
|
|
"eval_loss": 3.169180393218994,
|
|
"eval_runtime": 0.0288,
|
|
"eval_samples_per_second": 34.733,
|
|
"eval_steps_per_second": 34.733,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 15.0,
|
|
"grad_norm": 570080.375,
|
|
"learning_rate": 4.3e-05,
|
|
"loss": 1.8447,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 15.0,
|
|
"eval_loss": 3.1667754650115967,
|
|
"eval_runtime": 0.0251,
|
|
"eval_samples_per_second": 39.911,
|
|
"eval_steps_per_second": 39.911,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 16.0,
|
|
"grad_norm": 513446.6875,
|
|
"learning_rate": 4.25e-05,
|
|
"loss": 1.6697,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 16.0,
|
|
"eval_loss": 3.1564900875091553,
|
|
"eval_runtime": 0.023,
|
|
"eval_samples_per_second": 43.424,
|
|
"eval_steps_per_second": 43.424,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 17.0,
|
|
"grad_norm": 497921.71875,
|
|
"learning_rate": 4.2e-05,
|
|
"loss": 1.6465,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 17.0,
|
|
"eval_loss": 3.1547012329101562,
|
|
"eval_runtime": 0.0242,
|
|
"eval_samples_per_second": 41.266,
|
|
"eval_steps_per_second": 41.266,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 18.0,
|
|
"grad_norm": 609789.375,
|
|
"learning_rate": 4.15e-05,
|
|
"loss": 1.7295,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 18.0,
|
|
"eval_loss": 3.1545279026031494,
|
|
"eval_runtime": 0.024,
|
|
"eval_samples_per_second": 41.717,
|
|
"eval_steps_per_second": 41.717,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 19.0,
|
|
"grad_norm": 595338.5,
|
|
"learning_rate": 4.1e-05,
|
|
"loss": 1.6992,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 19.0,
|
|
"eval_loss": 3.152496576309204,
|
|
"eval_runtime": 0.0228,
|
|
"eval_samples_per_second": 43.902,
|
|
"eval_steps_per_second": 43.902,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"grad_norm": 551403.1875,
|
|
"learning_rate": 4.05e-05,
|
|
"loss": 1.6533,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"eval_loss": 3.1484363079071045,
|
|
"eval_runtime": 0.0317,
|
|
"eval_samples_per_second": 31.517,
|
|
"eval_steps_per_second": 31.517,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 21.0,
|
|
"grad_norm": 557413.0625,
|
|
"learning_rate": 4e-05,
|
|
"loss": 1.5188,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 21.0,
|
|
"eval_loss": 3.150416612625122,
|
|
"eval_runtime": 0.0265,
|
|
"eval_samples_per_second": 37.802,
|
|
"eval_steps_per_second": 37.802,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 22.0,
|
|
"grad_norm": 622994.3125,
|
|
"learning_rate": 3.9500000000000005e-05,
|
|
"loss": 1.6559,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 22.0,
|
|
"eval_loss": 3.1489338874816895,
|
|
"eval_runtime": 0.0267,
|
|
"eval_samples_per_second": 37.471,
|
|
"eval_steps_per_second": 37.471,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 23.0,
|
|
"grad_norm": 597280.9375,
|
|
"learning_rate": 3.9000000000000006e-05,
|
|
"loss": 1.4457,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 23.0,
|
|
"eval_loss": 3.164409875869751,
|
|
"eval_runtime": 0.0252,
|
|
"eval_samples_per_second": 39.63,
|
|
"eval_steps_per_second": 39.63,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 24.0,
|
|
"grad_norm": 646545.6875,
|
|
"learning_rate": 3.85e-05,
|
|
"loss": 1.4396,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 24.0,
|
|
"eval_loss": 3.185584545135498,
|
|
"eval_runtime": 0.0242,
|
|
"eval_samples_per_second": 41.333,
|
|
"eval_steps_per_second": 41.333,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"grad_norm": 742538.5,
|
|
"learning_rate": 3.8e-05,
|
|
"loss": 1.5531,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"eval_loss": 3.209012269973755,
|
|
"eval_runtime": 0.0239,
|
|
"eval_samples_per_second": 41.889,
|
|
"eval_steps_per_second": 41.889,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 26.0,
|
|
"grad_norm": 779773.875,
|
|
"learning_rate": 3.7500000000000003e-05,
|
|
"loss": 1.5664,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 26.0,
|
|
"eval_loss": 3.2342324256896973,
|
|
"eval_runtime": 0.0238,
|
|
"eval_samples_per_second": 42.093,
|
|
"eval_steps_per_second": 42.093,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 27.0,
|
|
"grad_norm": 631411.9375,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 1.3529,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 27.0,
|
|
"eval_loss": 3.2571322917938232,
|
|
"eval_runtime": 0.0241,
|
|
"eval_samples_per_second": 41.443,
|
|
"eval_steps_per_second": 41.443,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 28.0,
|
|
"grad_norm": 709697.5625,
|
|
"learning_rate": 3.65e-05,
|
|
"loss": 1.4272,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 28.0,
|
|
"eval_loss": 3.2806529998779297,
|
|
"eval_runtime": 0.0246,
|
|
"eval_samples_per_second": 40.587,
|
|
"eval_steps_per_second": 40.587,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 29.0,
|
|
"grad_norm": 841672.0,
|
|
"learning_rate": 3.6e-05,
|
|
"loss": 1.494,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 29.0,
|
|
"eval_loss": 3.3014163970947266,
|
|
"eval_runtime": 0.0233,
|
|
"eval_samples_per_second": 42.991,
|
|
"eval_steps_per_second": 42.991,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 30.0,
|
|
"grad_norm": 796136.3125,
|
|
"learning_rate": 3.55e-05,
|
|
"loss": 1.4225,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 30.0,
|
|
"eval_loss": 3.317432403564453,
|
|
"eval_runtime": 0.0291,
|
|
"eval_samples_per_second": 34.392,
|
|
"eval_steps_per_second": 34.392,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 31.0,
|
|
"grad_norm": 755379.8125,
|
|
"learning_rate": 3.5e-05,
|
|
"loss": 1.4117,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 31.0,
|
|
"eval_loss": 3.3241231441497803,
|
|
"eval_runtime": 0.0236,
|
|
"eval_samples_per_second": 42.413,
|
|
"eval_steps_per_second": 42.413,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 32.0,
|
|
"grad_norm": 783589.1875,
|
|
"learning_rate": 3.45e-05,
|
|
"loss": 1.3846,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 32.0,
|
|
"eval_loss": 3.3234148025512695,
|
|
"eval_runtime": 0.0321,
|
|
"eval_samples_per_second": 31.157,
|
|
"eval_steps_per_second": 31.157,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 33.0,
|
|
"grad_norm": 809859.5,
|
|
"learning_rate": 3.4000000000000007e-05,
|
|
"loss": 1.4252,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 33.0,
|
|
"eval_loss": 3.3146393299102783,
|
|
"eval_runtime": 0.0347,
|
|
"eval_samples_per_second": 28.812,
|
|
"eval_steps_per_second": 28.812,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 34.0,
|
|
"grad_norm": 829947.125,
|
|
"learning_rate": 3.35e-05,
|
|
"loss": 1.427,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 34.0,
|
|
"eval_loss": 3.305365800857544,
|
|
"eval_runtime": 0.0314,
|
|
"eval_samples_per_second": 31.81,
|
|
"eval_steps_per_second": 31.81,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 35.0,
|
|
"grad_norm": 895208.4375,
|
|
"learning_rate": 3.3e-05,
|
|
"loss": 1.3741,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 35.0,
|
|
"eval_loss": 3.3006556034088135,
|
|
"eval_runtime": 0.031,
|
|
"eval_samples_per_second": 32.225,
|
|
"eval_steps_per_second": 32.225,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 36.0,
|
|
"grad_norm": 836688.25,
|
|
"learning_rate": 3.2500000000000004e-05,
|
|
"loss": 1.2527,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 36.0,
|
|
"eval_loss": 3.3037965297698975,
|
|
"eval_runtime": 0.0247,
|
|
"eval_samples_per_second": 40.468,
|
|
"eval_steps_per_second": 40.468,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 37.0,
|
|
"grad_norm": 944192.625,
|
|
"learning_rate": 3.2000000000000005e-05,
|
|
"loss": 1.38,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 37.0,
|
|
"eval_loss": 3.313577651977539,
|
|
"eval_runtime": 0.0262,
|
|
"eval_samples_per_second": 38.123,
|
|
"eval_steps_per_second": 38.123,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 38.0,
|
|
"grad_norm": 841649.25,
|
|
"learning_rate": 3.15e-05,
|
|
"loss": 1.1685,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 38.0,
|
|
"eval_loss": 3.336230993270874,
|
|
"eval_runtime": 0.0234,
|
|
"eval_samples_per_second": 42.65,
|
|
"eval_steps_per_second": 42.65,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 39.0,
|
|
"grad_norm": 916154.9375,
|
|
"learning_rate": 3.1e-05,
|
|
"loss": 1.2675,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 39.0,
|
|
"eval_loss": 3.369218587875366,
|
|
"eval_runtime": 0.0238,
|
|
"eval_samples_per_second": 41.981,
|
|
"eval_steps_per_second": 41.981,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 40.0,
|
|
"grad_norm": 996200.875,
|
|
"learning_rate": 3.05e-05,
|
|
"loss": 1.2837,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 40.0,
|
|
"eval_loss": 3.4009995460510254,
|
|
"eval_runtime": 0.0254,
|
|
"eval_samples_per_second": 39.446,
|
|
"eval_steps_per_second": 39.446,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 41.0,
|
|
"grad_norm": 1006036.3125,
|
|
"learning_rate": 3e-05,
|
|
"loss": 1.2778,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 41.0,
|
|
"eval_loss": 3.425380229949951,
|
|
"eval_runtime": 0.0239,
|
|
"eval_samples_per_second": 41.871,
|
|
"eval_steps_per_second": 41.871,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 42.0,
|
|
"grad_norm": 1032298.5,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 1.2941,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 42.0,
|
|
"eval_loss": 3.44785737991333,
|
|
"eval_runtime": 0.0237,
|
|
"eval_samples_per_second": 42.237,
|
|
"eval_steps_per_second": 42.237,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 43.0,
|
|
"grad_norm": 1047889.375,
|
|
"learning_rate": 2.9e-05,
|
|
"loss": 1.288,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 43.0,
|
|
"eval_loss": 3.463594436645508,
|
|
"eval_runtime": 0.0242,
|
|
"eval_samples_per_second": 41.312,
|
|
"eval_steps_per_second": 41.312,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 44.0,
|
|
"grad_norm": 1109954.375,
|
|
"learning_rate": 2.8499999999999998e-05,
|
|
"loss": 1.2703,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 44.0,
|
|
"eval_loss": 3.47367787361145,
|
|
"eval_runtime": 0.0228,
|
|
"eval_samples_per_second": 43.941,
|
|
"eval_steps_per_second": 43.941,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 45.0,
|
|
"grad_norm": 1047148.0625,
|
|
"learning_rate": 2.8000000000000003e-05,
|
|
"loss": 1.187,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 45.0,
|
|
"eval_loss": 3.4794631004333496,
|
|
"eval_runtime": 0.0238,
|
|
"eval_samples_per_second": 42.066,
|
|
"eval_steps_per_second": 42.066,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 46.0,
|
|
"grad_norm": 1041700.375,
|
|
"learning_rate": 2.7500000000000004e-05,
|
|
"loss": 1.1858,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 46.0,
|
|
"eval_loss": 3.488736629486084,
|
|
"eval_runtime": 0.028,
|
|
"eval_samples_per_second": 35.764,
|
|
"eval_steps_per_second": 35.764,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 47.0,
|
|
"grad_norm": 1012093.8125,
|
|
"learning_rate": 2.7000000000000002e-05,
|
|
"loss": 1.104,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 47.0,
|
|
"eval_loss": 3.495603322982788,
|
|
"eval_runtime": 0.0232,
|
|
"eval_samples_per_second": 43.112,
|
|
"eval_steps_per_second": 43.112,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 48.0,
|
|
"grad_norm": 987565.0625,
|
|
"learning_rate": 2.6500000000000004e-05,
|
|
"loss": 1.0798,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 48.0,
|
|
"eval_loss": 3.503004789352417,
|
|
"eval_runtime": 0.0239,
|
|
"eval_samples_per_second": 41.831,
|
|
"eval_steps_per_second": 41.831,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 49.0,
|
|
"grad_norm": 1095353.5,
|
|
"learning_rate": 2.6000000000000002e-05,
|
|
"loss": 1.1652,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 49.0,
|
|
"eval_loss": 3.5171303749084473,
|
|
"eval_runtime": 0.0267,
|
|
"eval_samples_per_second": 37.52,
|
|
"eval_steps_per_second": 37.52,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 50.0,
|
|
"grad_norm": 1143315.625,
|
|
"learning_rate": 2.5500000000000003e-05,
|
|
"loss": 1.0958,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 50.0,
|
|
"eval_loss": 3.5315191745758057,
|
|
"eval_runtime": 0.0255,
|
|
"eval_samples_per_second": 39.196,
|
|
"eval_steps_per_second": 39.196,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 51.0,
|
|
"grad_norm": 1020882.0625,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 1.1248,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 51.0,
|
|
"eval_loss": 3.546818256378174,
|
|
"eval_runtime": 0.0242,
|
|
"eval_samples_per_second": 41.267,
|
|
"eval_steps_per_second": 41.267,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 52.0,
|
|
"grad_norm": 1068618.125,
|
|
"learning_rate": 2.45e-05,
|
|
"loss": 1.0666,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 52.0,
|
|
"eval_loss": 3.5597212314605713,
|
|
"eval_runtime": 0.0312,
|
|
"eval_samples_per_second": 32.043,
|
|
"eval_steps_per_second": 32.043,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 53.0,
|
|
"grad_norm": 1039242.8125,
|
|
"learning_rate": 2.4e-05,
|
|
"loss": 1.0405,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 53.0,
|
|
"eval_loss": 3.5634210109710693,
|
|
"eval_runtime": 0.0259,
|
|
"eval_samples_per_second": 38.546,
|
|
"eval_steps_per_second": 38.546,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 54.0,
|
|
"grad_norm": 1053574.375,
|
|
"learning_rate": 2.35e-05,
|
|
"loss": 1.0269,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 54.0,
|
|
"eval_loss": 3.5643200874328613,
|
|
"eval_runtime": 0.034,
|
|
"eval_samples_per_second": 29.405,
|
|
"eval_steps_per_second": 29.405,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 55.0,
|
|
"grad_norm": 1006597.0,
|
|
"learning_rate": 2.3000000000000003e-05,
|
|
"loss": 0.9945,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 55.0,
|
|
"eval_loss": 3.5702857971191406,
|
|
"eval_runtime": 0.025,
|
|
"eval_samples_per_second": 39.947,
|
|
"eval_steps_per_second": 39.947,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 56.0,
|
|
"grad_norm": 1291302.375,
|
|
"learning_rate": 2.25e-05,
|
|
"loss": 1.1558,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 56.0,
|
|
"eval_loss": 3.573396921157837,
|
|
"eval_runtime": 0.0238,
|
|
"eval_samples_per_second": 42.087,
|
|
"eval_steps_per_second": 42.087,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 57.0,
|
|
"grad_norm": 1126977.0,
|
|
"learning_rate": 2.2000000000000003e-05,
|
|
"loss": 1.0463,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 57.0,
|
|
"eval_loss": 3.5831034183502197,
|
|
"eval_runtime": 0.0288,
|
|
"eval_samples_per_second": 34.698,
|
|
"eval_steps_per_second": 34.698,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 58.0,
|
|
"grad_norm": 1271963.625,
|
|
"learning_rate": 2.15e-05,
|
|
"loss": 1.0987,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 58.0,
|
|
"eval_loss": 3.5913143157958984,
|
|
"eval_runtime": 0.0248,
|
|
"eval_samples_per_second": 40.245,
|
|
"eval_steps_per_second": 40.245,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 59.0,
|
|
"grad_norm": 1154121.375,
|
|
"learning_rate": 2.1e-05,
|
|
"loss": 0.9989,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 59.0,
|
|
"eval_loss": 3.6040565967559814,
|
|
"eval_runtime": 0.0238,
|
|
"eval_samples_per_second": 41.975,
|
|
"eval_steps_per_second": 41.975,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 60.0,
|
|
"grad_norm": 1363833.0,
|
|
"learning_rate": 2.05e-05,
|
|
"loss": 1.1108,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 60.0,
|
|
"eval_loss": 3.6183338165283203,
|
|
"eval_runtime": 0.0266,
|
|
"eval_samples_per_second": 37.583,
|
|
"eval_steps_per_second": 37.583,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 61.0,
|
|
"grad_norm": 1005580.5625,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.9088,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 61.0,
|
|
"eval_loss": 3.6330275535583496,
|
|
"eval_runtime": 0.0233,
|
|
"eval_samples_per_second": 42.911,
|
|
"eval_steps_per_second": 42.911,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 62.0,
|
|
"grad_norm": 1331999.0,
|
|
"learning_rate": 1.9500000000000003e-05,
|
|
"loss": 1.0881,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 62.0,
|
|
"eval_loss": 3.6475167274475098,
|
|
"eval_runtime": 0.0236,
|
|
"eval_samples_per_second": 42.307,
|
|
"eval_steps_per_second": 42.307,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 63.0,
|
|
"grad_norm": 1380935.75,
|
|
"learning_rate": 1.9e-05,
|
|
"loss": 1.1348,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 63.0,
|
|
"eval_loss": 3.657670021057129,
|
|
"eval_runtime": 0.031,
|
|
"eval_samples_per_second": 32.292,
|
|
"eval_steps_per_second": 32.292,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 64.0,
|
|
"grad_norm": 1282143.125,
|
|
"learning_rate": 1.85e-05,
|
|
"loss": 1.0682,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 64.0,
|
|
"eval_loss": 3.6665403842926025,
|
|
"eval_runtime": 0.0241,
|
|
"eval_samples_per_second": 41.49,
|
|
"eval_steps_per_second": 41.49,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 65.0,
|
|
"grad_norm": 1100699.625,
|
|
"learning_rate": 1.8e-05,
|
|
"loss": 0.9382,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 65.0,
|
|
"eval_loss": 3.668815851211548,
|
|
"eval_runtime": 0.0251,
|
|
"eval_samples_per_second": 39.888,
|
|
"eval_steps_per_second": 39.888,
|
|
"step": 65
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 100,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 100,
|
|
"save_steps": 1,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 38213959680000.0,
|
|
"train_batch_size": 64,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|