Model: PKU-Alignment/ProgressGym-HistLlama3-8B-C014-pretrain-v0.2 Source: Original Platform
826 lines
21 KiB
JSON
826 lines
21 KiB
JSON
{
|
|
"best_metric": 2.204540491104126,
|
|
"best_model_checkpoint": "./output/training_results/C014_llama3-8b-base_pretrain_20240428_005832/checkpoint-130",
|
|
"epoch": 4.0,
|
|
"eval_steps": 5,
|
|
"global_step": 264,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.015151515151515152,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 0.0,
|
|
"loss": 2.5789,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.015151515151515152,
|
|
"eval_loss": 2.6458332538604736,
|
|
"eval_runtime": 5.9609,
|
|
"eval_samples_per_second": 78.008,
|
|
"eval_steps_per_second": 0.671,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.07575757575757576,
|
|
"grad_norm": 3.5510944022733923,
|
|
"learning_rate": 2.25e-06,
|
|
"loss": 2.5672,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.07575757575757576,
|
|
"eval_loss": 2.628009080886841,
|
|
"eval_runtime": 5.9819,
|
|
"eval_samples_per_second": 77.735,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.15151515151515152,
|
|
"grad_norm": 3.473712979202145,
|
|
"learning_rate": 6e-06,
|
|
"loss": 2.5751,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.15151515151515152,
|
|
"eval_loss": 2.5313849449157715,
|
|
"eval_runtime": 5.9441,
|
|
"eval_samples_per_second": 78.228,
|
|
"eval_steps_per_second": 0.673,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.22727272727272727,
|
|
"grad_norm": 2.7296451314204835,
|
|
"learning_rate": 9.75e-06,
|
|
"loss": 2.418,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.22727272727272727,
|
|
"eval_loss": 2.4634220600128174,
|
|
"eval_runtime": 6.0122,
|
|
"eval_samples_per_second": 77.343,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.30303030303030304,
|
|
"grad_norm": 2.9793222204470453,
|
|
"learning_rate": 1.3500000000000001e-05,
|
|
"loss": 2.4701,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.30303030303030304,
|
|
"eval_loss": 2.4176573753356934,
|
|
"eval_runtime": 5.9735,
|
|
"eval_samples_per_second": 77.844,
|
|
"eval_steps_per_second": 0.67,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.3787878787878788,
|
|
"grad_norm": 2.9152379669367696,
|
|
"learning_rate": 1.3097898548149108e-05,
|
|
"loss": 2.3904,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.3787878787878788,
|
|
"eval_loss": 2.3785245418548584,
|
|
"eval_runtime": 5.9994,
|
|
"eval_samples_per_second": 77.507,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.45454545454545453,
|
|
"grad_norm": 2.475241901636818,
|
|
"learning_rate": 1.041060545673204e-05,
|
|
"loss": 2.3539,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.45454545454545453,
|
|
"eval_loss": 2.337780475616455,
|
|
"eval_runtime": 5.9932,
|
|
"eval_samples_per_second": 77.587,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.5303030303030303,
|
|
"grad_norm": 2.5312379436441272,
|
|
"learning_rate": 8.236247706221891e-06,
|
|
"loss": 2.3101,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.5303030303030303,
|
|
"eval_loss": 2.3082308769226074,
|
|
"eval_runtime": 5.9901,
|
|
"eval_samples_per_second": 77.628,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.6060606060606061,
|
|
"grad_norm": 2.5279244134684804,
|
|
"learning_rate": 6.4849612135310325e-06,
|
|
"loss": 2.3254,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.6060606060606061,
|
|
"eval_loss": 2.2816028594970703,
|
|
"eval_runtime": 5.9798,
|
|
"eval_samples_per_second": 77.762,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.6818181818181818,
|
|
"grad_norm": 2.3848893916874836,
|
|
"learning_rate": 5.081159821297093e-06,
|
|
"loss": 2.2762,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.6818181818181818,
|
|
"eval_loss": 2.2614095211029053,
|
|
"eval_runtime": 5.9833,
|
|
"eval_samples_per_second": 77.716,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.7575757575757576,
|
|
"grad_norm": 2.55130938777181,
|
|
"learning_rate": 3.961509285889694e-06,
|
|
"loss": 2.2525,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.7575757575757576,
|
|
"eval_loss": 2.2457971572875977,
|
|
"eval_runtime": 6.0002,
|
|
"eval_samples_per_second": 77.497,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 2.2003836383976156,
|
|
"learning_rate": 3.073152889221908e-06,
|
|
"loss": 2.2777,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"eval_loss": 2.232052803039551,
|
|
"eval_runtime": 5.9752,
|
|
"eval_samples_per_second": 77.822,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.9090909090909091,
|
|
"grad_norm": 1.938340466235554,
|
|
"learning_rate": 2.372162069694911e-06,
|
|
"loss": 2.2054,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.9090909090909091,
|
|
"eval_loss": 2.2206437587738037,
|
|
"eval_runtime": 5.9984,
|
|
"eval_samples_per_second": 77.52,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.9848484848484849,
|
|
"grad_norm": 2.0152410097081375,
|
|
"learning_rate": 1.8221877676625323e-06,
|
|
"loss": 2.237,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.9848484848484849,
|
|
"eval_loss": 2.2112882137298584,
|
|
"eval_runtime": 6.0081,
|
|
"eval_samples_per_second": 77.396,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 1.0606060606060606,
|
|
"grad_norm": 2.206665008674913,
|
|
"learning_rate": 1.3932903283558643e-06,
|
|
"loss": 1.986,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 1.0606060606060606,
|
|
"eval_loss": 2.2115273475646973,
|
|
"eval_runtime": 6.0121,
|
|
"eval_samples_per_second": 77.344,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 1.1363636363636362,
|
|
"grad_norm": 2.4135250740210816,
|
|
"learning_rate": 1.0609278071546894e-06,
|
|
"loss": 1.9373,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 1.1363636363636362,
|
|
"eval_loss": 2.221705913543701,
|
|
"eval_runtime": 5.966,
|
|
"eval_samples_per_second": 77.942,
|
|
"eval_steps_per_second": 0.67,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 1.2121212121212122,
|
|
"grad_norm": 2.0909203153994995,
|
|
"learning_rate": 8.050843851687484e-07,
|
|
"loss": 1.9228,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 1.2121212121212122,
|
|
"eval_loss": 2.2131617069244385,
|
|
"eval_runtime": 6.0048,
|
|
"eval_samples_per_second": 77.439,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 1.2878787878787878,
|
|
"grad_norm": 2.0701966451221723,
|
|
"learning_rate": 6.095223338761627e-07,
|
|
"loss": 1.9084,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 1.2878787878787878,
|
|
"eval_loss": 2.2117583751678467,
|
|
"eval_runtime": 6.0144,
|
|
"eval_samples_per_second": 77.314,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 1.3636363636363638,
|
|
"grad_norm": 2.0411250764985907,
|
|
"learning_rate": 4.611425724763914e-07,
|
|
"loss": 1.9684,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 1.3636363636363638,
|
|
"eval_loss": 2.212195634841919,
|
|
"eval_runtime": 6.005,
|
|
"eval_samples_per_second": 77.436,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 1.4393939393939394,
|
|
"grad_norm": 2.095523936778245,
|
|
"learning_rate": 3.494403469094348e-07,
|
|
"loss": 1.9126,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 1.4393939393939394,
|
|
"eval_loss": 2.2093794345855713,
|
|
"eval_runtime": 5.9753,
|
|
"eval_samples_per_second": 77.82,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 1.5151515151515151,
|
|
"grad_norm": 1.9491500689498569,
|
|
"learning_rate": 2.660439312704735e-07,
|
|
"loss": 1.9101,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.5151515151515151,
|
|
"eval_loss": 2.2066152095794678,
|
|
"eval_runtime": 5.988,
|
|
"eval_samples_per_second": 77.656,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.5909090909090908,
|
|
"grad_norm": 1.9297056004219728,
|
|
"learning_rate": 2.0432551654866868e-07,
|
|
"loss": 1.8496,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 1.5909090909090908,
|
|
"eval_loss": 2.2057933807373047,
|
|
"eval_runtime": 6.029,
|
|
"eval_samples_per_second": 77.127,
|
|
"eval_steps_per_second": 0.663,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 2.0270269069156095,
|
|
"learning_rate": 1.590746140201269e-07,
|
|
"loss": 1.9154,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"eval_loss": 2.205655097961426,
|
|
"eval_runtime": 5.9865,
|
|
"eval_samples_per_second": 77.675,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 1.7424242424242424,
|
|
"grad_norm": 2.0216556200111957,
|
|
"learning_rate": 1.2622536684767967e-07,
|
|
"loss": 1.9233,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 1.7424242424242424,
|
|
"eval_loss": 2.2055680751800537,
|
|
"eval_runtime": 5.9969,
|
|
"eval_samples_per_second": 77.54,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 1.8181818181818183,
|
|
"grad_norm": 2.121087680196771,
|
|
"learning_rate": 1.0263013894441628e-07,
|
|
"loss": 1.9198,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 1.8181818181818183,
|
|
"eval_loss": 2.205195665359497,
|
|
"eval_runtime": 5.9814,
|
|
"eval_samples_per_second": 77.741,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 1.893939393939394,
|
|
"grad_norm": 1.9887616789335223,
|
|
"learning_rate": 8.587264024428055e-08,
|
|
"loss": 1.9229,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 1.893939393939394,
|
|
"eval_loss": 2.2048099040985107,
|
|
"eval_runtime": 6.0178,
|
|
"eval_samples_per_second": 77.271,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 1.9737440679073275,
|
|
"learning_rate": 7.411465733236604e-08,
|
|
"loss": 1.8913,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"eval_loss": 2.204540491104126,
|
|
"eval_runtime": 6.0056,
|
|
"eval_samples_per_second": 77.428,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 2.0454545454545454,
|
|
"grad_norm": 2.037256369037435,
|
|
"learning_rate": 6.59711929010128e-08,
|
|
"loss": 1.8814,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 2.0454545454545454,
|
|
"eval_loss": 2.204589605331421,
|
|
"eval_runtime": 6.0067,
|
|
"eval_samples_per_second": 77.414,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 2.121212121212121,
|
|
"grad_norm": 2.0713675635613353,
|
|
"learning_rate": 6.040948153695873e-08,
|
|
"loss": 1.8813,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 2.121212121212121,
|
|
"eval_loss": 2.2050745487213135,
|
|
"eval_runtime": 5.9801,
|
|
"eval_samples_per_second": 77.758,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 2.196969696969697,
|
|
"grad_norm": 1.9602573628685647,
|
|
"learning_rate": 5.666794757151726e-08,
|
|
"loss": 1.8912,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 2.196969696969697,
|
|
"eval_loss": 2.2057695388793945,
|
|
"eval_runtime": 6.0053,
|
|
"eval_samples_per_second": 77.431,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 2.2727272727272725,
|
|
"grad_norm": 2.0631081811496133,
|
|
"learning_rate": 5.4191707642277796e-08,
|
|
"loss": 1.9184,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 2.2727272727272725,
|
|
"eval_loss": 2.2065114974975586,
|
|
"eval_runtime": 5.9555,
|
|
"eval_samples_per_second": 78.079,
|
|
"eval_steps_per_second": 0.672,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 2.3484848484848486,
|
|
"grad_norm": 2.0253318844317705,
|
|
"learning_rate": 5.258170056372994e-08,
|
|
"loss": 1.8662,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 2.3484848484848486,
|
|
"eval_loss": 2.207070827484131,
|
|
"eval_runtime": 6.0047,
|
|
"eval_samples_per_second": 77.439,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 2.4242424242424243,
|
|
"grad_norm": 2.073377995618171,
|
|
"learning_rate": 5.1554954268425945e-08,
|
|
"loss": 1.8809,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 2.4242424242424243,
|
|
"eval_loss": 2.2073893547058105,
|
|
"eval_runtime": 6.0042,
|
|
"eval_samples_per_second": 77.446,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 1.9895760567683491,
|
|
"learning_rate": 5.091387798309037e-08,
|
|
"loss": 1.8591,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"eval_loss": 2.2076644897460938,
|
|
"eval_runtime": 5.9917,
|
|
"eval_samples_per_second": 77.608,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 2.5757575757575757,
|
|
"grad_norm": 2.0833747351029372,
|
|
"learning_rate": 5.0522801309078135e-08,
|
|
"loss": 1.8731,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 2.5757575757575757,
|
|
"eval_loss": 2.2079408168792725,
|
|
"eval_runtime": 6.0146,
|
|
"eval_samples_per_second": 77.312,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 2.6515151515151514,
|
|
"grad_norm": 2.0037327741523825,
|
|
"learning_rate": 5.0290274187738543e-08,
|
|
"loss": 1.8948,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 2.6515151515151514,
|
|
"eval_loss": 2.208183765411377,
|
|
"eval_runtime": 6.0305,
|
|
"eval_samples_per_second": 77.108,
|
|
"eval_steps_per_second": 0.663,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 2.7272727272727275,
|
|
"grad_norm": 2.0305444519091917,
|
|
"learning_rate": 5.015589639287439e-08,
|
|
"loss": 1.8876,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 2.7272727272727275,
|
|
"eval_loss": 2.208235740661621,
|
|
"eval_runtime": 5.9955,
|
|
"eval_samples_per_second": 77.558,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 2.8030303030303028,
|
|
"grad_norm": 2.0062076076889266,
|
|
"learning_rate": 5.0080665589248236e-08,
|
|
"loss": 1.8408,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 2.8030303030303028,
|
|
"eval_loss": 2.2083210945129395,
|
|
"eval_runtime": 6.0004,
|
|
"eval_samples_per_second": 77.495,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 2.878787878787879,
|
|
"grad_norm": 2.043895270641295,
|
|
"learning_rate": 5.004002235298783e-08,
|
|
"loss": 1.8931,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 2.878787878787879,
|
|
"eval_loss": 2.208212375640869,
|
|
"eval_runtime": 6.0002,
|
|
"eval_samples_per_second": 77.498,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 2.9545454545454546,
|
|
"grad_norm": 1.9689184449811679,
|
|
"learning_rate": 5.001893193212864e-08,
|
|
"loss": 1.8569,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 2.9545454545454546,
|
|
"eval_loss": 2.2080307006835938,
|
|
"eval_runtime": 5.9791,
|
|
"eval_samples_per_second": 77.772,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 3.0303030303030303,
|
|
"grad_norm": 2.019410946950212,
|
|
"learning_rate": 5.000847883910016e-08,
|
|
"loss": 1.8621,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 3.0303030303030303,
|
|
"eval_loss": 2.207866907119751,
|
|
"eval_runtime": 6.0003,
|
|
"eval_samples_per_second": 77.496,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 3.106060606060606,
|
|
"grad_norm": 2.0283442915048644,
|
|
"learning_rate": 5.000356435775757e-08,
|
|
"loss": 1.8863,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 3.106060606060606,
|
|
"eval_loss": 2.207792043685913,
|
|
"eval_runtime": 5.9858,
|
|
"eval_samples_per_second": 77.684,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 3.1818181818181817,
|
|
"grad_norm": 2.05494798524839,
|
|
"learning_rate": 5.0001391301969795e-08,
|
|
"loss": 1.9021,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 3.1818181818181817,
|
|
"eval_loss": 2.2078535556793213,
|
|
"eval_runtime": 5.9911,
|
|
"eval_samples_per_second": 77.615,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 3.257575757575758,
|
|
"grad_norm": 2.0316955077847023,
|
|
"learning_rate": 5.000049730753554e-08,
|
|
"loss": 1.8648,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 3.257575757575758,
|
|
"eval_loss": 2.2079594135284424,
|
|
"eval_runtime": 5.9692,
|
|
"eval_samples_per_second": 77.9,
|
|
"eval_steps_per_second": 0.67,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 3.3333333333333335,
|
|
"grad_norm": 2.0376583248479174,
|
|
"learning_rate": 5.0000159841391415e-08,
|
|
"loss": 1.8443,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 3.3333333333333335,
|
|
"eval_loss": 2.2080650329589844,
|
|
"eval_runtime": 6.0237,
|
|
"eval_samples_per_second": 77.195,
|
|
"eval_steps_per_second": 0.664,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 3.409090909090909,
|
|
"grad_norm": 2.0309390490282864,
|
|
"learning_rate": 5.0000045079130105e-08,
|
|
"loss": 1.8978,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 3.409090909090909,
|
|
"eval_loss": 2.20800518989563,
|
|
"eval_runtime": 6.0116,
|
|
"eval_samples_per_second": 77.35,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 3.484848484848485,
|
|
"grad_norm": 2.0870431548602495,
|
|
"learning_rate": 5.000001078153535e-08,
|
|
"loss": 1.8658,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 3.484848484848485,
|
|
"eval_loss": 2.208038806915283,
|
|
"eval_runtime": 5.99,
|
|
"eval_samples_per_second": 77.629,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 3.5606060606060606,
|
|
"grad_norm": 2.0234951173098024,
|
|
"learning_rate": 5.0000002081285866e-08,
|
|
"loss": 1.8706,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 3.5606060606060606,
|
|
"eval_loss": 2.207921266555786,
|
|
"eval_runtime": 5.9823,
|
|
"eval_samples_per_second": 77.73,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 3.6363636363636362,
|
|
"grad_norm": 2.039329423643573,
|
|
"learning_rate": 5.0000000300649115e-08,
|
|
"loss": 1.8855,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 3.6363636363636362,
|
|
"eval_loss": 2.2077724933624268,
|
|
"eval_runtime": 6.0052,
|
|
"eval_samples_per_second": 77.433,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 3.712121212121212,
|
|
"grad_norm": 2.048524032150131,
|
|
"learning_rate": 5.00000000286923e-08,
|
|
"loss": 1.8535,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 3.712121212121212,
|
|
"eval_loss": 2.2078235149383545,
|
|
"eval_runtime": 6.0225,
|
|
"eval_samples_per_second": 77.21,
|
|
"eval_steps_per_second": 0.664,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 3.787878787878788,
|
|
"grad_norm": 2.117736519999513,
|
|
"learning_rate": 5.0000000001441026e-08,
|
|
"loss": 1.9062,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 3.787878787878788,
|
|
"eval_loss": 2.207879066467285,
|
|
"eval_runtime": 5.9714,
|
|
"eval_samples_per_second": 77.871,
|
|
"eval_steps_per_second": 0.67,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 3.8636363636363638,
|
|
"grad_norm": 2.0315594950889775,
|
|
"learning_rate": 5.000000000002337e-08,
|
|
"loss": 1.8628,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 3.8636363636363638,
|
|
"eval_loss": 2.2078306674957275,
|
|
"eval_runtime": 6.0145,
|
|
"eval_samples_per_second": 77.314,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 3.9393939393939394,
|
|
"grad_norm": 2.0132729461699994,
|
|
"learning_rate": 5.0000000000000024e-08,
|
|
"loss": 1.8484,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 3.9393939393939394,
|
|
"eval_loss": 2.207735776901245,
|
|
"eval_runtime": 6.0067,
|
|
"eval_samples_per_second": 77.413,
|
|
"eval_steps_per_second": 0.666,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"step": 264,
|
|
"total_flos": 27428734894080.0,
|
|
"train_loss": 2.005241002097274,
|
|
"train_runtime": 8788.5675,
|
|
"train_samples_per_second": 1.904,
|
|
"train_steps_per_second": 0.03
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 264,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 4,
|
|
"save_steps": 5,
|
|
"total_flos": 27428734894080.0,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|