{
  "best_global_step": 71,
  "best_metric": 0.14072927832603455,
  "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_rte_42_1774791065/checkpoint-71",
  "epoch": 5.0,
  "eval_steps": 71,
  "global_step": 1405,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017793594306049824,
      "grad_norm": 100.99552917480469,
      "learning_rate": 1.4184397163120568e-06,
      "loss": 0.6704,
      "num_input_tokens_seen": 7872,
      "step": 5
    },
    {
      "epoch": 0.03558718861209965,
      "grad_norm": 94.50279235839844,
      "learning_rate": 3.1914893617021277e-06,
      "loss": 0.2548,
      "num_input_tokens_seen": 14784,
      "step": 10
    },
    {
      "epoch": 0.05338078291814947,
      "grad_norm": 112.95467376708984,
      "learning_rate": 4.964539007092199e-06,
      "loss": 0.9227,
      "num_input_tokens_seen": 23424,
      "step": 15
    },
    {
      "epoch": 0.0711743772241993,
      "grad_norm": 4.330768585205078,
      "learning_rate": 6.73758865248227e-06,
      "loss": 0.1819,
      "num_input_tokens_seen": 29824,
      "step": 20
    },
    {
      "epoch": 0.08896797153024912,
      "grad_norm": 5.050032138824463,
      "learning_rate": 8.510638297872341e-06,
      "loss": 0.228,
      "num_input_tokens_seen": 37824,
      "step": 25
    },
    {
      "epoch": 0.10676156583629894,
      "grad_norm": 12.993303298950195,
      "learning_rate": 1.0283687943262411e-05,
      "loss": 0.1572,
      "num_input_tokens_seen": 44608,
      "step": 30
    },
    {
      "epoch": 0.12455516014234876,
      "grad_norm": 17.3212947845459,
      "learning_rate": 1.2056737588652483e-05,
      "loss": 0.1609,
      "num_input_tokens_seen": 51968,
      "step": 35
    },
    {
      "epoch": 0.1423487544483986,
      "grad_norm": 15.35787296295166,
      "learning_rate": 1.3829787234042554e-05,
      "loss": 0.2143,
      "num_input_tokens_seen": 59456,
      "step": 40
    },
    {
      "epoch": 0.1601423487544484,
      "grad_norm": 31.886463165283203,
      "learning_rate": 1.5602836879432626e-05,
      "loss": 0.2034,
      "num_input_tokens_seen": 66496,
      "step": 45
    },
    {
      "epoch": 0.17793594306049823,
      "grad_norm": 18.710391998291016,
      "learning_rate": 1.7375886524822697e-05,
      "loss": 0.2702,
      "num_input_tokens_seen": 73408,
      "step": 50
    },
    {
      "epoch": 0.19572953736654805,
      "grad_norm": 23.454370498657227,
      "learning_rate": 1.9148936170212766e-05,
      "loss": 0.1793,
      "num_input_tokens_seen": 80576,
      "step": 55
    },
    {
      "epoch": 0.21352313167259787,
      "grad_norm": 2.1824188232421875,
      "learning_rate": 2.0921985815602837e-05,
      "loss": 0.161,
      "num_input_tokens_seen": 88256,
      "step": 60
    },
    {
      "epoch": 0.2313167259786477,
      "grad_norm": 4.793557167053223,
      "learning_rate": 2.269503546099291e-05,
      "loss": 0.1808,
      "num_input_tokens_seen": 96256,
      "step": 65
    },
    {
      "epoch": 0.2491103202846975,
      "grad_norm": 27.524141311645508,
      "learning_rate": 2.446808510638298e-05,
      "loss": 0.2508,
      "num_input_tokens_seen": 103424,
      "step": 70
    },
    {
      "epoch": 0.2526690391459075,
      "eval_loss": 0.14072927832603455,
      "eval_runtime": 0.6083,
      "eval_samples_per_second": 409.332,
      "eval_steps_per_second": 52.605,
      "num_input_tokens_seen": 105024,
      "step": 71
    },
    {
      "epoch": 0.2669039145907473,
      "grad_norm": 7.07442045211792,
      "learning_rate": 2.624113475177305e-05,
      "loss": 0.143,
      "num_input_tokens_seen": 110528,
      "step": 75
    },
    {
      "epoch": 0.2846975088967972,
      "grad_norm": 16.91026496887207,
      "learning_rate": 2.8014184397163124e-05,
      "loss": 0.2326,
      "num_input_tokens_seen": 117440,
      "step": 80
    },
    {
      "epoch": 0.302491103202847,
      "grad_norm": 15.627188682556152,
      "learning_rate": 2.9787234042553192e-05,
      "loss": 0.2053,
      "num_input_tokens_seen": 125504,
      "step": 85
    },
    {
      "epoch": 0.3202846975088968,
      "grad_norm": 24.881179809570312,
      "learning_rate": 3.156028368794326e-05,
      "loss": 0.2409,
      "num_input_tokens_seen": 132352,
      "step": 90
    },
    {
      "epoch": 0.33807829181494664,
      "grad_norm": 19.5406494140625,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.2063,
      "num_input_tokens_seen": 139200,
      "step": 95
    },
    {
      "epoch": 0.35587188612099646,
      "grad_norm": 23.64601707458496,
      "learning_rate": 3.5106382978723407e-05,
      "loss": 0.244,
      "num_input_tokens_seen": 147904,
      "step": 100
    },
    {
      "epoch": 0.3736654804270463,
      "grad_norm": 13.374123573303223,
      "learning_rate": 3.687943262411347e-05,
      "loss": 0.183,
      "num_input_tokens_seen": 154240,
      "step": 105
    },
    {
      "epoch": 0.3914590747330961,
      "grad_norm": 7.41645622253418,
      "learning_rate": 3.865248226950355e-05,
      "loss": 0.1615,
      "num_input_tokens_seen": 161472,
      "step": 110
    },
    {
      "epoch": 0.4092526690391459,
      "grad_norm": 11.39084243774414,
      "learning_rate": 4.0425531914893614e-05,
      "loss": 0.1703,
      "num_input_tokens_seen": 168192,
      "step": 115
    },
    {
      "epoch": 0.42704626334519574,
      "grad_norm": 11.221735000610352,
      "learning_rate": 4.219858156028369e-05,
      "loss": 0.246,
      "num_input_tokens_seen": 174656,
      "step": 120
    },
    {
      "epoch": 0.44483985765124556,
      "grad_norm": 10.728532791137695,
      "learning_rate": 4.3971631205673764e-05,
      "loss": 0.1665,
      "num_input_tokens_seen": 181632,
      "step": 125
    },
    {
      "epoch": 0.4626334519572954,
      "grad_norm": 14.208120346069336,
      "learning_rate": 4.574468085106383e-05,
      "loss": 0.1695,
      "num_input_tokens_seen": 191488,
      "step": 130
    },
    {
      "epoch": 0.4804270462633452,
      "grad_norm": 1.208547830581665,
      "learning_rate": 4.751773049645391e-05,
      "loss": 0.1764,
      "num_input_tokens_seen": 198848,
      "step": 135
    },
    {
      "epoch": 0.498220640569395,
      "grad_norm": 19.034914016723633,
      "learning_rate": 4.929078014184397e-05,
      "loss": 0.1769,
      "num_input_tokens_seen": 207232,
      "step": 140
    },
    {
      "epoch": 0.505338078291815,
      "eval_loss": 0.15581394731998444,
      "eval_runtime": 0.5742,
      "eval_samples_per_second": 433.643,
      "eval_steps_per_second": 55.729,
      "num_input_tokens_seen": 209536,
      "step": 142
    },
    {
      "epoch": 0.5160142348754448,
      "grad_norm": 15.478862762451172,
      "learning_rate": 4.9999305045921804e-05,
      "loss": 0.2155,
      "num_input_tokens_seen": 213952,
      "step": 145
    },
    {
      "epoch": 0.5338078291814946,
      "grad_norm": 15.320956230163574,
      "learning_rate": 4.9995058244251644e-05,
      "loss": 0.185,
      "num_input_tokens_seen": 221376,
      "step": 150
    },
    {
      "epoch": 0.5516014234875445,
      "grad_norm": 14.336426734924316,
      "learning_rate": 4.998695138156149e-05,
      "loss": 0.2471,
      "num_input_tokens_seen": 228928,
      "step": 155
    },
    {
      "epoch": 0.5693950177935944,
      "grad_norm": 9.861719131469727,
      "learning_rate": 4.997498570981822e-05,
      "loss": 0.2061,
      "num_input_tokens_seen": 236352,
      "step": 160
    },
    {
      "epoch": 0.5871886120996441,
      "grad_norm": 10.547555923461914,
      "learning_rate": 4.995916307691601e-05,
      "loss": 0.1488,
      "num_input_tokens_seen": 244416,
      "step": 165
    },
    {
      "epoch": 0.604982206405694,
      "grad_norm": 6.471895217895508,
      "learning_rate": 4.993948592639104e-05,
      "loss": 0.1625,
      "num_input_tokens_seen": 251456,
      "step": 170
    },
    {
      "epoch": 0.6227758007117438,
      "grad_norm": 10.839587211608887,
      "learning_rate": 4.991595729704405e-05,
      "loss": 0.1635,
      "num_input_tokens_seen": 258880,
      "step": 175
    },
    {
      "epoch": 0.6405693950177936,
      "grad_norm": 14.87012767791748,
      "learning_rate": 4.9888580822471086e-05,
      "loss": 0.163,
      "num_input_tokens_seen": 265152,
      "step": 180
    },
    {
      "epoch": 0.6583629893238434,
      "grad_norm": 12.832857131958008,
      "learning_rate": 4.985736073050237e-05,
      "loss": 0.1599,
      "num_input_tokens_seen": 272576,
      "step": 185
    },
    {
      "epoch": 0.6761565836298933,
      "grad_norm": 9.781329154968262,
      "learning_rate": 4.982230184254933e-05,
      "loss": 0.1669,
      "num_input_tokens_seen": 279744,
      "step": 190
    },
    {
      "epoch": 0.693950177935943,
      "grad_norm": 9.258131980895996,
      "learning_rate": 4.9783409572860105e-05,
      "loss": 0.1659,
      "num_input_tokens_seen": 287680,
      "step": 195
    },
    {
      "epoch": 0.7117437722419929,
      "grad_norm": 6.527733325958252,
      "learning_rate": 4.974068992768331e-05,
      "loss": 0.1729,
      "num_input_tokens_seen": 294592,
      "step": 200
    },
    {
      "epoch": 0.7295373665480427,
      "grad_norm": 17.004568099975586,
      "learning_rate": 4.9694149504340517e-05,
      "loss": 0.2655,
      "num_input_tokens_seen": 301440,
      "step": 205
    },
    {
      "epoch": 0.7473309608540926,
      "grad_norm": 12.954022407531738,
      "learning_rate": 4.964379549020741e-05,
      "loss": 0.1924,
      "num_input_tokens_seen": 308416,
      "step": 210
    },
    {
      "epoch": 0.7580071174377224,
      "eval_loss": 0.1600140929222107,
      "eval_runtime": 0.5686,
      "eval_samples_per_second": 437.954,
      "eval_steps_per_second": 56.283,
      "num_input_tokens_seen": 312576,
      "step": 213
    },
    {
      "epoch": 0.7651245551601423,
      "grad_norm": 2.3744094371795654,
      "learning_rate": 4.958963566160384e-05,
      "loss": 0.1666,
      "num_input_tokens_seen": 315328,
      "step": 215
    },
    {
      "epoch": 0.7829181494661922,
      "grad_norm": 4.415204048156738,
      "learning_rate": 4.953167838259285e-05,
      "loss": 0.1668,
      "num_input_tokens_seen": 322688,
      "step": 220
    },
    {
      "epoch": 0.800711743772242,
      "grad_norm": 4.4129319190979,
      "learning_rate": 4.946993260368904e-05,
      "loss": 0.1826,
      "num_input_tokens_seen": 329280,
      "step": 225
    },
    {
      "epoch": 0.8185053380782918,
      "grad_norm": 1.2767548561096191,
      "learning_rate": 4.940440786047628e-05,
      "loss": 0.1488,
      "num_input_tokens_seen": 336896,
      "step": 230
    },
    {
      "epoch": 0.8362989323843416,
      "grad_norm": 10.839607238769531,
      "learning_rate": 4.933511427213511e-05,
      "loss": 0.2852,
      "num_input_tokens_seen": 344128,
      "step": 235
    },
    {
      "epoch": 0.8540925266903915,
      "grad_norm": 16.616424560546875,
      "learning_rate": 4.926206253988001e-05,
      "loss": 0.1901,
      "num_input_tokens_seen": 350912,
      "step": 240
    },
    {
      "epoch": 0.8718861209964412,
      "grad_norm": 3.9430079460144043,
      "learning_rate": 4.91852639453068e-05,
      "loss": 0.1972,
      "num_input_tokens_seen": 358016,
      "step": 245
    },
    {
      "epoch": 0.8896797153024911,
      "grad_norm": 6.706320762634277,
      "learning_rate": 4.910473034865033e-05,
      "loss": 0.3136,
      "num_input_tokens_seen": 364736,
      "step": 250
    },
    {
      "epoch": 0.9074733096085409,
      "grad_norm": 3.7334418296813965,
      "learning_rate": 4.902047418695292e-05,
      "loss": 0.1648,
      "num_input_tokens_seen": 371648,
      "step": 255
    },
    {
      "epoch": 0.9252669039145908,
      "grad_norm": 11.57023811340332,
      "learning_rate": 4.893250847214369e-05,
      "loss": 0.1706,
      "num_input_tokens_seen": 379200,
      "step": 260
    },
    {
      "epoch": 0.9430604982206405,
      "grad_norm": 1.522990345954895,
      "learning_rate": 4.884084678902898e-05,
      "loss": 0.2379,
      "num_input_tokens_seen": 387200,
      "step": 265
    },
    {
      "epoch": 0.9608540925266904,
      "grad_norm": 6.809507846832275,
      "learning_rate": 4.874550329319457e-05,
      "loss": 0.1618,
      "num_input_tokens_seen": 395264,
      "step": 270
    },
    {
      "epoch": 0.9786476868327402,
      "grad_norm": 9.76811695098877,
      "learning_rate": 4.864649270881944e-05,
      "loss": 0.1637,
      "num_input_tokens_seen": 402176,
      "step": 275
    },
    {
      "epoch": 0.99644128113879,
      "grad_norm": 15.906750679016113,
      "learning_rate": 4.8543830326401954e-05,
      "loss": 0.1956,
      "num_input_tokens_seen": 409984,
      "step": 280
    },
    {
      "epoch": 1.01067615658363,
      "eval_loss": 0.16843144595623016,
      "eval_runtime": 0.6085,
      "eval_samples_per_second": 409.228,
      "eval_steps_per_second": 52.592,
      "num_input_tokens_seen": 414040,
      "step": 284
    },
    {
      "epoch": 1.0142348754448398,
      "grad_norm": 7.785819053649902,
      "learning_rate": 4.843753200039851e-05,
      "loss": 0.1483,
      "num_input_tokens_seen": 415256,
      "step": 285
    },
    {
      "epoch": 1.0320284697508897,
      "grad_norm": 2.8784444332122803,
      "learning_rate": 4.832761414677503e-05,
      "loss": 0.1508,
      "num_input_tokens_seen": 422808,
      "step": 290
    },
    {
      "epoch": 1.0498220640569396,
      "grad_norm": 9.171720504760742,
      "learning_rate": 4.8214093740471836e-05,
      "loss": 0.1599,
      "num_input_tokens_seen": 430104,
      "step": 295
    },
    {
      "epoch": 1.0676156583629894,
      "grad_norm": 0.9587394595146179,
      "learning_rate": 4.8096988312782174e-05,
      "loss": 0.1629,
      "num_input_tokens_seen": 436760,
      "step": 300
    },
    {
      "epoch": 1.085409252669039,
      "grad_norm": 5.9907379150390625,
      "learning_rate": 4.7976315948644745e-05,
      "loss": 0.1729,
      "num_input_tokens_seen": 444952,
      "step": 305
    },
    {
      "epoch": 1.103202846975089,
      "grad_norm": 0.4214398264884949,
      "learning_rate": 4.7852095283850866e-05,
      "loss": 3.0413,
      "num_input_tokens_seen": 452760,
      "step": 310
    },
    {
      "epoch": 1.1209964412811388,
      "grad_norm": 0.5086872577667236,
      "learning_rate": 4.772434550216643e-05,
      "loss": 0.1785,
      "num_input_tokens_seen": 458392,
      "step": 315
    },
    {
      "epoch": 1.1387900355871885,
      "grad_norm": 0.5129872560501099,
      "learning_rate": 4.7593086332369344e-05,
      "loss": 0.1666,
      "num_input_tokens_seen": 465112,
      "step": 320
    },
    {
      "epoch": 1.1565836298932384,
      "grad_norm": 7.883773326873779,
      "learning_rate": 4.74583380452027e-05,
      "loss": 0.2395,
      "num_input_tokens_seen": 472216,
      "step": 325
    },
    {
      "epoch": 1.1743772241992882,
      "grad_norm": 3.8998472690582275,
      "learning_rate": 4.7320121450244394e-05,
      "loss": 0.2229,
      "num_input_tokens_seen": 479576,
      "step": 330
    },
    {
      "epoch": 1.1921708185053381,
      "grad_norm": 11.560748100280762,
      "learning_rate": 4.717845789269333e-05,
      "loss": 0.2531,
      "num_input_tokens_seen": 486552,
      "step": 335
    },
    {
      "epoch": 1.209964412811388,
      "grad_norm": 20.51876449584961,
      "learning_rate": 4.703336925007311e-05,
      "loss": 0.2223,
      "num_input_tokens_seen": 494616,
      "step": 340
    },
    {
      "epoch": 1.2277580071174377,
      "grad_norm": 10.914800643920898,
      "learning_rate": 4.68848779288534e-05,
      "loss": 0.1898,
      "num_input_tokens_seen": 501400,
      "step": 345
    },
    {
      "epoch": 1.2455516014234875,
      "grad_norm": 6.894437789916992,
      "learning_rate": 4.673300686098957e-05,
      "loss": 0.1662,
      "num_input_tokens_seen": 508888,
      "step": 350
    },
    {
      "epoch": 1.2633451957295374,
      "grad_norm": 4.296377658843994,
      "learning_rate": 4.657777950038133e-05,
      "loss": 0.1589,
      "num_input_tokens_seen": 517656,
      "step": 355
    },
    {
      "epoch": 1.2633451957295374,
      "eval_loss": 0.1600693166255951,
      "eval_runtime": 0.607,
      "eval_samples_per_second": 410.202,
      "eval_steps_per_second": 52.717,
      "num_input_tokens_seen": 517656,
      "step": 355
    },
    {
      "epoch": 1.281138790035587,
      "grad_norm": 6.819537162780762,
      "learning_rate": 4.6419219819250636e-05,
      "loss": 0.1538,
      "num_input_tokens_seen": 526232,
      "step": 360
    },
    {
      "epoch": 1.298932384341637,
      "grad_norm": 14.25802230834961,
      "learning_rate": 4.62573523044396e-05,
      "loss": 0.1811,
      "num_input_tokens_seen": 533400,
      "step": 365
    },
    {
      "epoch": 1.3167259786476868,
      "grad_norm": 3.1280250549316406,
      "learning_rate": 4.609220195362886e-05,
      "loss": 0.174,
      "num_input_tokens_seen": 542168,
      "step": 370
    },
    {
      "epoch": 1.3345195729537367,
      "grad_norm": 7.372785568237305,
      "learning_rate": 4.5923794271477217e-05,
      "loss": 0.1571,
      "num_input_tokens_seen": 549976,
      "step": 375
    },
    {
      "epoch": 1.3523131672597866,
      "grad_norm": 7.614220142364502,
      "learning_rate": 4.575215526568278e-05,
      "loss": 0.1641,
      "num_input_tokens_seen": 557016,
      "step": 380
    },
    {
      "epoch": 1.3701067615658362,
      "grad_norm": 38.93210983276367,
      "learning_rate": 4.5577311442966584e-05,
      "loss": 1.4814,
      "num_input_tokens_seen": 564504,
      "step": 385
    },
    {
      "epoch": 1.387900355871886,
      "grad_norm": 5.316745281219482,
      "learning_rate": 4.539928980497903e-05,
      "loss": 0.1601,
      "num_input_tokens_seen": 571864,
      "step": 390
    },
    {
      "epoch": 1.405693950177936,
      "grad_norm": 9.071686744689941,
      "learning_rate": 4.521811784412996e-05,
      "loss": 0.2213,
      "num_input_tokens_seen": 578456,
      "step": 395
    },
    {
      "epoch": 1.4234875444839858,
      "grad_norm": 36.631160736083984,
      "learning_rate": 4.503382353934294e-05,
      "loss": 1.4493,
      "num_input_tokens_seen": 584600,
      "step": 400
    },
    {
      "epoch": 1.4412811387900355,
      "grad_norm": 17.439191818237305,
      "learning_rate": 4.4846435351734376e-05,
      "loss": 0.1729,
      "num_input_tokens_seen": 591128,
      "step": 405
    },
    {
      "epoch": 1.4590747330960854,
      "grad_norm": 4.0148138999938965,
      "learning_rate": 4.4655982220218176e-05,
      "loss": 0.1539,
      "num_input_tokens_seen": 598552,
      "step": 410
    },
    {
      "epoch": 1.4768683274021353,
      "grad_norm": 0.6515812873840332,
      "learning_rate": 4.446249355703661e-05,
      "loss": 0.1612,
      "num_input_tokens_seen": 607320,
      "step": 415
    },
    {
      "epoch": 1.4946619217081851,
      "grad_norm": 4.950193881988525,
      "learning_rate": 4.426599924321815e-05,
      "loss": 0.1594,
      "num_input_tokens_seen": 614744,
      "step": 420
    },
    {
      "epoch": 1.512455516014235,
      "grad_norm": 4.902361869812012,
      "learning_rate": 4.4066529623962784e-05,
      "loss": 0.1947,
      "num_input_tokens_seen": 622808,
      "step": 425
    },
    {
      "epoch": 1.5160142348754448,
      "eval_loss": 0.18150445818901062,
      "eval_runtime": 0.6062,
      "eval_samples_per_second": 410.733,
      "eval_steps_per_second": 52.785,
      "num_input_tokens_seen": 624344,
      "step": 426
    },
    {
      "epoch": 1.5302491103202847,
      "grad_norm": 0.29520076513290405,
      "learning_rate": 4.386411550395576e-05,
      "loss": 0.1523,
      "num_input_tokens_seen": 630488,
      "step": 430
    },
    {
      "epoch": 1.5480427046263345,
      "grad_norm": 1.9226378202438354,
      "learning_rate": 4.365878814261032e-05,
      "loss": 0.1721,
      "num_input_tokens_seen": 638424,
      "step": 435
    },
    {
      "epoch": 1.5658362989323842,
      "grad_norm": 6.8878493309021,
      "learning_rate": 4.34505792492402e-05,
      "loss": 0.1551,
      "num_input_tokens_seen": 645208,
      "step": 440
    },
    {
      "epoch": 1.583629893238434,
      "grad_norm": 9.136181831359863,
      "learning_rate": 4.323952097816269e-05,
      "loss": 0.1499,
      "num_input_tokens_seen": 653016,
      "step": 445
    },
    {
      "epoch": 1.601423487544484,
      "grad_norm": 7.4756178855896,
      "learning_rate": 4.3025645923732926e-05,
      "loss": 0.1843,
      "num_input_tokens_seen": 659992,
      "step": 450
    },
    {
      "epoch": 1.6192170818505338,
      "grad_norm": 7.807384490966797,
      "learning_rate": 4.2808987115310255e-05,
      "loss": 0.1579,
      "num_input_tokens_seen": 667224,
      "step": 455
    },
    {
      "epoch": 1.6370106761565837,
      "grad_norm": 0.17006787657737732,
      "learning_rate": 4.2589578012157426e-05,
      "loss": 0.1563,
      "num_input_tokens_seen": 675160,
      "step": 460
    },
    {
      "epoch": 1.6548042704626336,
      "grad_norm": 0.41114601492881775,
      "learning_rate": 4.236745249827336e-05,
      "loss": 0.1556,
      "num_input_tokens_seen": 683544,
      "step": 465
    },
    {
      "epoch": 1.6725978647686834,
      "grad_norm": 2.4918622970581055,
      "learning_rate": 4.214264487716033e-05,
      "loss": 0.1593,
      "num_input_tokens_seen": 689368,
      "step": 470
    },
    {
      "epoch": 1.690391459074733,
      "grad_norm": 10.712060928344727,
      "learning_rate": 4.191518986652642e-05,
      "loss": 0.1699,
      "num_input_tokens_seen": 695832,
      "step": 475
    },
    {
      "epoch": 1.708185053380783,
      "grad_norm": 0.39044228196144104,
      "learning_rate": 4.168512259292391e-05,
      "loss": 0.1563,
      "num_input_tokens_seen": 703128,
      "step": 480
    },
    {
      "epoch": 1.7259786476868326,
      "grad_norm": 4.815671443939209,
      "learning_rate": 4.1452478586324605e-05,
      "loss": 0.1507,
      "num_input_tokens_seen": 709528,
      "step": 485
    },
    {
      "epoch": 1.7437722419928825,
      "grad_norm": 0.5018470287322998,
      "learning_rate": 4.121729377463285e-05,
      "loss": 0.1558,
      "num_input_tokens_seen": 716312,
      "step": 490
    },
    {
      "epoch": 1.7615658362989324,
      "grad_norm": 10.01478099822998,
      "learning_rate": 4.097960447813705e-05,
      "loss": 0.1825,
      "num_input_tokens_seen": 722776,
      "step": 495
    },
    {
      "epoch": 1.7686832740213523,
      "eval_loss": 0.16469639539718628,
      "eval_runtime": 0.5964,
      "eval_samples_per_second": 417.484,
      "eval_steps_per_second": 53.653,
      "num_input_tokens_seen": 725656,
      "step": 497
    },
    {
      "epoch": 1.7793594306049823,
      "grad_norm": 3.8590610027313232,
      "learning_rate": 4.073944740390061e-05,
      "loss": 0.1798,
      "num_input_tokens_seen": 729944,
      "step": 500
    },
    {
      "epoch": 1.7971530249110321,
      "grad_norm": 4.1739020347595215,
      "learning_rate": 4.049685964009321e-05,
      "loss": 0.1694,
      "num_input_tokens_seen": 737112,
      "step": 505
    },
    {
      "epoch": 1.814946619217082,
      "grad_norm": 10.671394348144531,
      "learning_rate": 4.025187865026311e-05,
      "loss": 0.1605,
      "num_input_tokens_seen": 744408,
      "step": 510
    },
    {
      "epoch": 1.8327402135231317,
      "grad_norm": 0.9396809935569763,
      "learning_rate": 4.000454226755159e-05,
      "loss": 0.1574,
      "num_input_tokens_seen": 750488,
      "step": 515
    },
    {
      "epoch": 1.8505338078291815,
      "grad_norm": 6.7215447425842285,
      "learning_rate": 3.975488868885021e-05,
      "loss": 0.1703,
      "num_input_tokens_seen": 757528,
      "step": 520
    },
    {
      "epoch": 1.8683274021352312,
      "grad_norm": 0.5858572721481323,
      "learning_rate": 3.9502956468902014e-05,
      "loss": 0.1545,
      "num_input_tokens_seen": 763736,
      "step": 525
    },
    {
      "epoch": 1.886120996441281,
      "grad_norm": 2.219594955444336,
      "learning_rate": 3.924878451434735e-05,
      "loss": 0.1534,
      "num_input_tokens_seen": 771864,
      "step": 530
    },
    {
      "epoch": 1.903914590747331,
      "grad_norm": 1.9175541400909424,
      "learning_rate": 3.899241207771546e-05,
      "loss": 0.1537,
      "num_input_tokens_seen": 778712,
      "step": 535
    },
    {
      "epoch": 1.9217081850533808,
      "grad_norm": 12.399153709411621,
      "learning_rate": 3.873387875136252e-05,
      "loss": 0.1917,
      "num_input_tokens_seen": 784280,
      "step": 540
    },
    {
      "epoch": 1.9395017793594307,
      "grad_norm": 7.259119987487793,
      "learning_rate": 3.847322446135736e-05,
      "loss": 0.1743,
      "num_input_tokens_seen": 792280,
      "step": 545
    },
    {
      "epoch": 1.9572953736654806,
      "grad_norm": 7.568546772003174,
      "learning_rate": 3.821048946131549e-05,
      "loss": 0.1752,
      "num_input_tokens_seen": 798488,
      "step": 550
    },
    {
      "epoch": 1.9750889679715302,
      "grad_norm": 6.783497333526611,
      "learning_rate": 3.794571432618267e-05,
      "loss": 0.1578,
      "num_input_tokens_seen": 806104,
      "step": 555
    },
    {
      "epoch": 1.99288256227758,
      "grad_norm": 9.681258201599121,
      "learning_rate": 3.767893994596876e-05,
      "loss": 0.1774,
      "num_input_tokens_seen": 813336,
      "step": 560
    },
    {
      "epoch": 2.0106761565836297,
      "grad_norm": 3.2600245475769043,
      "learning_rate": 3.741020751943297e-05,
      "loss": 0.1568,
      "num_input_tokens_seen": 817576,
      "step": 565
    },
    {
      "epoch": 2.02135231316726,
      "eval_loss": 0.15550938248634338,
      "eval_runtime": 0.6255,
      "eval_samples_per_second": 398.079,
      "eval_steps_per_second": 51.159,
      "num_input_tokens_seen": 821416,
      "step": 568
    },
    {
      "epoch": 2.0284697508896796,
      "grad_norm": 3.0256900787353516,
      "learning_rate": 3.713955854772144e-05,
      "loss": 0.1565,
      "num_input_tokens_seen": 823848,
      "step": 570
    },
    {
      "epoch": 2.0462633451957295,
      "grad_norm": 1.889113187789917,
      "learning_rate": 3.686703482795802e-05,
      "loss": 0.1536,
      "num_input_tokens_seen": 832232,
      "step": 575
    },
    {
      "epoch": 2.0640569395017794,
      "grad_norm": 3.334212303161621,
      "learning_rate": 3.6592678446789516e-05,
      "loss": 0.1624,
      "num_input_tokens_seen": 840424,
      "step": 580
    },
    {
      "epoch": 2.0818505338078293,
      "grad_norm": 3.6044702529907227,
      "learning_rate": 3.631653177388605e-05,
      "loss": 0.1395,
      "num_input_tokens_seen": 846824,
      "step": 585
    },
    {
      "epoch": 2.099644128113879,
      "grad_norm": 8.975861549377441,
      "learning_rate": 3.60386374553978e-05,
      "loss": 0.196,
      "num_input_tokens_seen": 853608,
      "step": 590
    },
    {
      "epoch": 2.117437722419929,
      "grad_norm": 10.559611320495605,
      "learning_rate": 3.5759038407369056e-05,
      "loss": 0.1637,
      "num_input_tokens_seen": 860968,
      "step": 595
    },
    {
      "epoch": 2.135231316725979,
      "grad_norm": 6.914389610290527,
      "learning_rate": 3.547777780911055e-05,
      "loss": 0.194,
      "num_input_tokens_seen": 868904,
      "step": 600
    },
    {
      "epoch": 2.1530249110320283,
      "grad_norm": 8.329413414001465,
      "learning_rate": 3.519489909653113e-05,
      "loss": 0.1592,
      "num_input_tokens_seen": 876072,
      "step": 605
    },
    {
      "epoch": 2.170818505338078,
      "grad_norm": 4.701565742492676,
      "learning_rate": 3.4910445955429854e-05,
      "loss": 0.1549,
      "num_input_tokens_seen": 883752,
      "step": 610
    },
    {
      "epoch": 2.188612099644128,
      "grad_norm": 7.797508716583252,
      "learning_rate": 3.4624462314749443e-05,
      "loss": 0.1533,
      "num_input_tokens_seen": 891304,
      "step": 615
    },
    {
      "epoch": 2.206405693950178,
      "grad_norm": 1.7337656021118164,
      "learning_rate": 3.433699233979222e-05,
      "loss": 0.1483,
      "num_input_tokens_seen": 899176,
      "step": 620
    },
    {
      "epoch": 2.224199288256228,
      "grad_norm": 5.721285343170166,
      "learning_rate": 3.4048080425399505e-05,
      "loss": 0.1436,
      "num_input_tokens_seen": 907560,
      "step": 625
    },
    {
      "epoch": 2.2419928825622777,
      "grad_norm": 3.0777595043182373,
      "learning_rate": 3.375777118909561e-05,
      "loss": 0.1413,
      "num_input_tokens_seen": 915240,
      "step": 630
    },
    {
      "epoch": 2.2597864768683276,
      "grad_norm": 15.890474319458008,
      "learning_rate": 3.3466109464197426e-05,
      "loss": 0.1597,
      "num_input_tokens_seen": 921384,
      "step": 635
    },
    {
      "epoch": 2.2740213523131674,
      "eval_loss": 0.1567462682723999,
      "eval_runtime": 0.6255,
      "eval_samples_per_second": 398.087,
      "eval_steps_per_second": 51.16,
      "num_input_tokens_seen": 926760,
      "step": 639
    },
    {
      "epoch": 2.277580071174377,
      "grad_norm": 1.5718131065368652,
      "learning_rate": 3.317314029289067e-05,
      "loss": 0.1653,
      "num_input_tokens_seen": 927528,
      "step": 640
    },
    {
      "epoch": 2.295373665480427,
      "grad_norm": 3.7291853427886963,
      "learning_rate": 3.287890891927386e-05,
      "loss": 0.1594,
      "num_input_tokens_seen": 934568,
      "step": 645
    },
|
||
|
|
{
|
||
|
|
"epoch": 2.3131672597864767,
|
||
|
|
"grad_norm": 4.549835205078125,
|
||
|
|
"learning_rate": 3.258346078237122e-05,
|
||
|
|
"loss": 0.1402,
|
||
|
|
"num_input_tokens_seen": 942248,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3309608540925266,
|
||
|
|
"grad_norm": 14.683507919311523,
|
||
|
|
"learning_rate": 3.228684150911527e-05,
|
||
|
|
"loss": 0.2418,
|
||
|
|
"num_input_tokens_seen": 949096,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3487544483985765,
|
||
|
|
"grad_norm": 1.7894399166107178,
|
||
|
|
"learning_rate": 3.198909690730063e-05,
|
||
|
|
"loss": 0.1845,
|
||
|
|
"num_input_tokens_seen": 955752,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3665480427046264,
|
||
|
|
"grad_norm": 15.066572189331055,
|
||
|
|
"learning_rate": 3.169027295850977e-05,
|
||
|
|
"loss": 0.1664,
|
||
|
|
"num_input_tokens_seen": 963176,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3843416370106763,
|
||
|
|
"grad_norm": 4.301926136016846,
|
||
|
|
"learning_rate": 3.139041581101187e-05,
|
||
|
|
"loss": 0.1627,
|
||
|
|
"num_input_tokens_seen": 968232,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.402135231316726,
|
||
|
|
"grad_norm": 5.145651340484619,
|
||
|
|
"learning_rate": 3.108957177263608e-05,
|
||
|
|
"loss": 0.1498,
|
||
|
|
"num_input_tokens_seen": 976552,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.419928825622776,
|
||
|
|
"grad_norm": 2.5066633224487305,
|
||
|
|
"learning_rate": 3.078778730362003e-05,
|
||
|
|
"loss": 0.1656,
|
||
|
|
"num_input_tokens_seen": 983720,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4377224199288254,
|
||
|
|
"grad_norm": 3.9444332122802734,
|
||
|
|
"learning_rate": 3.048510900943484e-05,
|
||
|
|
"loss": 0.1567,
|
||
|
|
"num_input_tokens_seen": 991976,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4555160142348753,
|
||
|
|
"grad_norm": 4.341545581817627,
|
||
|
|
"learning_rate": 3.018158363358773e-05,
|
||
|
|
"loss": 0.1807,
|
||
|
|
"num_input_tokens_seen": 998184,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.473309608540925,
|
||
|
|
"grad_norm": 4.363418102264404,
|
||
|
|
"learning_rate": 2.9877258050403212e-05,
|
||
|
|
"loss": 0.1678,
|
||
|
|
"num_input_tokens_seen": 1005672,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.491103202846975,
|
||
|
|
"grad_norm": 3.3406949043273926,
|
||
|
|
"learning_rate": 2.9572179257784215e-05,
|
||
|
|
"loss": 0.1531,
|
||
|
|
"num_input_tokens_seen": 1013096,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.508896797153025,
|
||
|
|
"grad_norm": 2.7513387203216553,
|
||
|
|
"learning_rate": 2.9266394369954052e-05,
|
||
|
|
"loss": 0.1337,
|
||
|
|
"num_input_tokens_seen": 1019304,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.526690391459075,
|
||
|
|
"grad_norm": 7.649652481079102,
|
||
|
|
"learning_rate": 2.8959950610180374e-05,
|
||
|
|
"loss": 0.1431,
|
||
|
|
"num_input_tokens_seen": 1025320,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.526690391459075,
|
||
|
|
"eval_loss": 0.16391661763191223,
|
||
|
|
"eval_runtime": 0.6072,
|
||
|
|
"eval_samples_per_second": 410.078,
|
||
|
|
"eval_steps_per_second": 52.701,
|
||
|
|
"num_input_tokens_seen": 1025320,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5444839857651247,
|
||
|
|
"grad_norm": 15.210580825805664,
|
||
|
|
"learning_rate": 2.865289530348243e-05,
|
||
|
|
"loss": 0.1675,
|
||
|
|
"num_input_tokens_seen": 1032552,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.562277580071174,
|
||
|
|
"grad_norm": 4.497170925140381,
|
||
|
|
"learning_rate": 2.834527586932243e-05,
|
||
|
|
"loss": 2.4615,
|
||
|
|
"num_input_tokens_seen": 1039912,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.580071174377224,
|
||
|
|
"grad_norm": 10.657808303833008,
|
||
|
|
"learning_rate": 2.8037139814282493e-05,
|
||
|
|
"loss": 0.1636,
|
||
|
|
"num_input_tokens_seen": 1047208,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.597864768683274,
|
||
|
|
"grad_norm": 1.3169434070587158,
|
||
|
|
"learning_rate": 2.7728534724728027e-05,
|
||
|
|
"loss": 0.1652,
|
||
|
|
"num_input_tokens_seen": 1053928,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6156583629893237,
|
||
|
|
"grad_norm": 2.855050802230835,
|
||
|
|
"learning_rate": 2.741950825945881e-05,
|
||
|
|
"loss": 0.1482,
|
||
|
|
"num_input_tokens_seen": 1061608,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6334519572953736,
|
||
|
|
"grad_norm": 2.2470901012420654,
|
||
|
|
"learning_rate": 2.711010814234896e-05,
|
||
|
|
"loss": 0.1501,
|
||
|
|
"num_input_tokens_seen": 1067560,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6512455516014235,
|
||
|
|
"grad_norm": 4.065670967102051,
|
||
|
|
"learning_rate": 2.6800382154976732e-05,
|
||
|
|
"loss": 0.1743,
|
||
|
|
"num_input_tokens_seen": 1074152,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6690391459074734,
|
||
|
|
"grad_norm": 5.455725193023682,
|
||
|
|
"learning_rate": 2.6490378129245498e-05,
|
||
|
|
"loss": 0.1441,
|
||
|
|
"num_input_tokens_seen": 1082856,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6868327402135233,
|
||
|
|
"grad_norm": 3.1051108837127686,
|
||
|
|
"learning_rate": 2.6180143939996925e-05,
|
||
|
|
"loss": 0.1495,
|
||
|
|
"num_input_tokens_seen": 1089512,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.704626334519573,
|
||
|
|
"grad_norm": 2.337266206741333,
|
||
|
|
"learning_rate": 2.5869727497617495e-05,
|
||
|
|
"loss": 0.1464,
|
||
|
|
"num_input_tokens_seen": 1096232,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.722419928825623,
|
||
|
|
"grad_norm": 4.207283973693848,
|
||
|
|
"learning_rate": 2.55591767406396e-05,
|
||
|
|
"loss": 0.1572,
|
||
|
|
"num_input_tokens_seen": 1104168,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7402135231316724,
|
||
|
|
"grad_norm": 2.140827178955078,
|
||
|
|
"learning_rate": 2.5248539628338246e-05,
|
||
|
|
"loss": 0.1326,
|
||
|
|
"num_input_tokens_seen": 1112232,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7580071174377223,
|
||
|
|
"grad_norm": 8.35146713256836,
|
||
|
|
"learning_rate": 2.4937864133324516e-05,
|
||
|
|
"loss": 0.1734,
|
||
|
|
"num_input_tokens_seen": 1119016,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.775800711743772,
|
||
|
|
"grad_norm": 18.731395721435547,
|
||
|
|
"learning_rate": 2.462719823413707e-05,
|
||
|
|
"loss": 0.1986,
|
||
|
|
"num_input_tokens_seen": 1126696,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7793594306049823,
|
||
|
|
"eval_loss": 0.15414386987686157,
|
||
|
|
"eval_runtime": 0.6372,
|
||
|
|
"eval_samples_per_second": 390.788,
|
||
|
|
"eval_steps_per_second": 50.222,
|
||
|
|
"num_input_tokens_seen": 1128104,
|
||
|
|
"step": 781
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.793594306049822,
|
||
|
|
"grad_norm": 6.263734817504883,
|
||
|
|
"learning_rate": 2.4316589907832654e-05,
|
||
|
|
"loss": 0.1576,
|
||
|
|
"num_input_tokens_seen": 1134184,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.811387900355872,
|
||
|
|
"grad_norm": 1.7886258363723755,
|
||
|
|
"learning_rate": 2.4006087122576863e-05,
|
||
|
|
"loss": 0.1392,
|
||
|
|
"num_input_tokens_seen": 1140392,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.829181494661922,
|
||
|
|
"grad_norm": 9.585826873779297,
|
||
|
|
"learning_rate": 2.3695737830236266e-05,
|
||
|
|
"loss": 0.2025,
|
||
|
|
"num_input_tokens_seen": 1148328,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8469750889679717,
|
||
|
|
"grad_norm": 3.7239151000976562,
|
||
|
|
"learning_rate": 2.338558995897307e-05,
|
||
|
|
"loss": 0.1781,
|
||
|
|
"num_input_tokens_seen": 1154024,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.864768683274021,
|
||
|
|
"grad_norm": 7.329390525817871,
|
||
|
|
"learning_rate": 2.3075691405843435e-05,
|
||
|
|
"loss": 0.195,
|
||
|
|
"num_input_tokens_seen": 1160808,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.882562277580071,
|
||
|
|
"grad_norm": 5.577742099761963,
|
||
|
|
"learning_rate": 2.2766090029400573e-05,
|
||
|
|
"loss": 0.1597,
|
||
|
|
"num_input_tokens_seen": 1167912,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.900355871886121,
|
||
|
|
"grad_norm": 8.529340744018555,
|
||
|
|
"learning_rate": 2.2456833642303822e-05,
|
||
|
|
"loss": 0.1433,
|
||
|
|
"num_input_tokens_seen": 1174568,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9181494661921707,
|
||
|
|
"grad_norm": 5.017305374145508,
|
||
|
|
"learning_rate": 2.214797000393479e-05,
|
||
|
|
"loss": 0.1553,
|
||
|
|
"num_input_tokens_seen": 1181480,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9359430604982206,
|
||
|
|
"grad_norm": 3.5880136489868164,
|
||
|
|
"learning_rate": 2.183954681302173e-05,
|
||
|
|
"loss": 0.1614,
|
||
|
|
"num_input_tokens_seen": 1189928,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9537366548042705,
|
||
|
|
"grad_norm": 1.7257145643234253,
|
||
|
|
"learning_rate": 2.1531611700273297e-05,
|
||
|
|
"loss": 0.1351,
|
||
|
|
"num_input_tokens_seen": 1197480,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9715302491103204,
|
||
|
|
"grad_norm": 4.875583171844482,
|
||
|
|
"learning_rate": 2.1224212221022777e-05,
|
||
|
|
"loss": 0.1845,
|
||
|
|
"num_input_tokens_seen": 1204584,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9893238434163703,
|
||
|
|
"grad_norm": 5.411481857299805,
|
||
|
|
"learning_rate": 2.0917395847883995e-05,
|
||
|
|
"loss": 0.1616,
|
||
|
|
"num_input_tokens_seen": 1212584,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.00711743772242,
|
||
|
|
"grad_norm": 4.330006122589111,
|
||
|
|
"learning_rate": 2.0611209963419958e-05,
|
||
|
|
"loss": 0.1625,
|
||
|
|
"num_input_tokens_seen": 1217856,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0249110320284696,
|
||
|
|
"grad_norm": 10.39330768585205,
|
||
|
|
"learning_rate": 2.030570185282544e-05,
|
||
|
|
"loss": 0.137,
|
||
|
|
"num_input_tokens_seen": 1226624,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0320284697508897,
|
||
|
|
"eval_loss": 0.1851627230644226,
|
||
|
|
"eval_runtime": 0.6345,
|
||
|
|
"eval_samples_per_second": 392.434,
|
||
|
|
"eval_steps_per_second": 50.433,
|
||
|
|
"num_input_tokens_seen": 1229440,
|
||
|
|
"step": 852
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0427046263345194,
|
||
|
|
"grad_norm": 3.0105044841766357,
|
||
|
|
"learning_rate": 2.0000918696624588e-05,
|
||
|
|
"loss": 0.1453,
|
||
|
|
"num_input_tokens_seen": 1233152,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0604982206405693,
|
||
|
|
"grad_norm": 2.1030280590057373,
|
||
|
|
"learning_rate": 1.9696907563384687e-05,
|
||
|
|
"loss": 0.138,
|
||
|
|
"num_input_tokens_seen": 1240128,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.078291814946619,
|
||
|
|
"grad_norm": 2.1849405765533447,
|
||
|
|
"learning_rate": 1.939371540244723e-05,
|
||
|
|
"loss": 0.1148,
|
||
|
|
"num_input_tokens_seen": 1248064,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.096085409252669,
|
||
|
|
"grad_norm": 6.3520402908325195,
|
||
|
|
"learning_rate": 1.9091389036677382e-05,
|
||
|
|
"loss": 0.1106,
|
||
|
|
"num_input_tokens_seen": 1255232,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.113879003558719,
|
||
|
|
"grad_norm": 3.93772554397583,
|
||
|
|
"learning_rate": 1.878997515523299e-05,
|
||
|
|
"loss": 0.1169,
|
||
|
|
"num_input_tokens_seen": 1262272,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.131672597864769,
|
||
|
|
"grad_norm": 6.558725833892822,
|
||
|
|
"learning_rate": 1.848952030635424e-05,
|
||
|
|
"loss": 0.1161,
|
||
|
|
"num_input_tokens_seen": 1269632,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1494661921708187,
|
||
|
|
"grad_norm": 3.3383939266204834,
|
||
|
|
"learning_rate": 1.819007089017508e-05,
|
||
|
|
"loss": 0.123,
|
||
|
|
"num_input_tokens_seen": 1277312,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.167259786476868,
|
||
|
|
"grad_norm": 15.820018768310547,
|
||
|
|
"learning_rate": 1.789167315155749e-05,
|
||
|
|
"loss": 0.1599,
|
||
|
|
"num_input_tokens_seen": 1284096,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.185053380782918,
|
||
|
|
"grad_norm": 2.621346950531006,
|
||
|
|
"learning_rate": 1.7594373172949784e-05,
|
||
|
|
"loss": 0.1109,
|
||
|
|
"num_input_tokens_seen": 1291648,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.202846975088968,
|
||
|
|
"grad_norm": 6.172404766082764,
|
||
|
|
"learning_rate": 1.7298216867269906e-05,
|
||
|
|
"loss": 0.1569,
|
||
|
|
"num_input_tokens_seen": 1299712,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2206405693950177,
|
||
|
|
"grad_norm": 10.012272834777832,
|
||
|
|
"learning_rate": 1.7003249970815026e-05,
|
||
|
|
"loss": 0.1082,
|
||
|
|
"num_input_tokens_seen": 1306176,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2384341637010676,
|
||
|
|
"grad_norm": 3.6646652221679688,
|
||
|
|
"learning_rate": 1.6709518036198308e-05,
|
||
|
|
"loss": 0.1387,
|
||
|
|
"num_input_tokens_seen": 1314112,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2562277580071175,
|
||
|
|
"grad_norm": 9.655856132507324,
|
||
|
|
"learning_rate": 1.6417066425314087e-05,
|
||
|
|
"loss": 0.1199,
|
||
|
|
"num_input_tokens_seen": 1321088,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2740213523131674,
|
||
|
|
"grad_norm": 7.546687602996826,
|
||
|
|
"learning_rate": 1.612594030233252e-05,
|
||
|
|
"loss": 0.1422,
|
||
|
|
"num_input_tokens_seen": 1328512,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.284697508896797,
|
||
|
|
"eval_loss": 0.16463510692119598,
|
||
|
|
"eval_runtime": 0.6174,
|
||
|
|
"eval_samples_per_second": 403.311,
|
||
|
|
"eval_steps_per_second": 51.831,
|
||
|
|
"num_input_tokens_seen": 1332544,
|
||
|
|
"step": 923
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2918149466192173,
|
||
|
|
"grad_norm": 3.2389485836029053,
|
||
|
|
"learning_rate": 1.583618462672472e-05,
|
||
|
|
"loss": 0.0863,
|
||
|
|
"num_input_tokens_seen": 1336128,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.309608540925267,
|
||
|
|
"grad_norm": 3.8101906776428223,
|
||
|
|
"learning_rate": 1.5547844146319545e-05,
|
||
|
|
"loss": 0.1155,
|
||
|
|
"num_input_tokens_seen": 1343552,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3274021352313166,
|
||
|
|
"grad_norm": 5.337780475616455,
|
||
|
|
"learning_rate": 1.5260963390393075e-05,
|
||
|
|
"loss": 0.1691,
|
||
|
|
"num_input_tokens_seen": 1351552,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3451957295373664,
|
||
|
|
"grad_norm": 4.4513840675354,
|
||
|
|
"learning_rate": 1.4975586662791783e-05,
|
||
|
|
"loss": 0.0983,
|
||
|
|
"num_input_tokens_seen": 1358272,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3629893238434163,
|
||
|
|
"grad_norm": 7.950605392456055,
|
||
|
|
"learning_rate": 1.4691758035090602e-05,
|
||
|
|
"loss": 0.137,
|
||
|
|
"num_input_tokens_seen": 1366784,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.380782918149466,
|
||
|
|
"grad_norm": 2.973015785217285,
|
||
|
|
"learning_rate": 1.4409521339786808e-05,
|
||
|
|
"loss": 0.1389,
|
||
|
|
"num_input_tokens_seen": 1373312,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.398576512455516,
|
||
|
|
"grad_norm": 1.8699113130569458,
|
||
|
|
"learning_rate": 1.41289201635308e-05,
|
||
|
|
"loss": 0.0916,
|
||
|
|
"num_input_tokens_seen": 1380736,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.416370106761566,
|
||
|
|
"grad_norm": 1.629996657371521,
|
||
|
|
"learning_rate": 1.3849997840394943e-05,
|
||
|
|
"loss": 0.096,
|
||
|
|
"num_input_tokens_seen": 1388544,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.434163701067616,
|
||
|
|
"grad_norm": 3.142674446105957,
|
||
|
|
"learning_rate": 1.3572797445181345e-05,
|
||
|
|
"loss": 0.1252,
|
||
|
|
"num_input_tokens_seen": 1396160,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4519572953736652,
|
||
|
|
"grad_norm": 1.9603294134140015,
|
||
|
|
"learning_rate": 1.3297361786769652e-05,
|
||
|
|
"loss": 0.0988,
|
||
|
|
"num_input_tokens_seen": 1404096,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.469750889679715,
|
||
|
|
"grad_norm": 18.924589157104492,
|
||
|
|
"learning_rate": 1.3023733401505981e-05,
|
||
|
|
"loss": 0.1135,
|
||
|
|
"num_input_tokens_seen": 1411008,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.487544483985765,
|
||
|
|
"grad_norm": 4.6644487380981445,
|
||
|
|
"learning_rate": 1.2751954546633871e-05,
|
||
|
|
"loss": 0.155,
|
||
|
|
"num_input_tokens_seen": 1418880,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.505338078291815,
|
||
|
|
"grad_norm": 8.87281608581543,
|
||
|
|
"learning_rate": 1.2482067193768417e-05,
|
||
|
|
"loss": 0.1302,
|
||
|
|
"num_input_tokens_seen": 1426048,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5231316725978647,
|
||
|
|
"grad_norm": 6.374912738800049,
|
||
|
|
"learning_rate": 1.2214113022414448e-05,
|
||
|
|
"loss": 0.0911,
|
||
|
|
"num_input_tokens_seen": 1432064,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5373665480427046,
|
||
|
|
"eval_loss": 0.1803617924451828,
|
||
|
|
"eval_runtime": 0.6287,
|
||
|
|
"eval_samples_per_second": 396.078,
|
||
|
|
"eval_steps_per_second": 50.902,
|
||
|
|
"num_input_tokens_seen": 1438336,
|
||
|
|
"step": 994
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5409252669039146,
|
||
|
|
"grad_norm": 7.5531110763549805,
|
||
|
|
"learning_rate": 1.1948133413529817e-05,
|
||
|
|
"loss": 0.1165,
|
||
|
|
"num_input_tokens_seen": 1439808,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5587188612099645,
|
||
|
|
"grad_norm": 10.984672546386719,
|
||
|
|
"learning_rate": 1.168416944313486e-05,
|
||
|
|
"loss": 0.156,
|
||
|
|
"num_input_tokens_seen": 1447616,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5765124555160144,
|
||
|
|
"grad_norm": 5.665327072143555,
|
||
|
|
"learning_rate": 1.1422261875968845e-05,
|
||
|
|
"loss": 0.0978,
|
||
|
|
"num_input_tokens_seen": 1454208,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5943060498220643,
|
||
|
|
"grad_norm": 5.291867256164551,
|
||
|
|
"learning_rate": 1.1162451159194614e-05,
|
||
|
|
"loss": 0.0784,
|
||
|
|
"num_input_tokens_seen": 1463296,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.612099644128114,
|
||
|
|
"grad_norm": 4.302516937255859,
|
||
|
|
"learning_rate": 1.0904777416152166e-05,
|
||
|
|
"loss": 0.1698,
|
||
|
|
"num_input_tokens_seen": 1469952,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6298932384341636,
|
||
|
|
"grad_norm": 2.612572193145752,
|
||
|
|
"learning_rate": 1.0649280440162326e-05,
|
||
|
|
"loss": 0.1033,
|
||
|
|
"num_input_tokens_seen": 1477184,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6476868327402134,
|
||
|
|
"grad_norm": 7.643741607666016,
|
||
|
|
"learning_rate": 1.0395999688381314e-05,
|
||
|
|
"loss": 0.1025,
|
||
|
|
"num_input_tokens_seen": 1484160,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6654804270462633,
|
||
|
|
"grad_norm": 1.1666496992111206,
|
||
|
|
"learning_rate": 1.0144974275707241e-05,
|
||
|
|
"loss": 0.0885,
|
||
|
|
"num_input_tokens_seen": 1491200,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.683274021352313,
|
||
|
|
"grad_norm": 8.459441184997559,
|
||
|
|
"learning_rate": 9.896242968739539e-06,
|
||
|
|
"loss": 0.1678,
|
||
|
|
"num_input_tokens_seen": 1498368,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.701067615658363,
|
||
|
|
"grad_norm": 7.720543384552002,
|
||
|
|
"learning_rate": 9.649844179792081e-06,
|
||
|
|
"loss": 0.1068,
|
||
|
|
"num_input_tokens_seen": 1505984,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.718861209964413,
|
||
|
|
"grad_norm": 1.8878631591796875,
|
||
|
|
"learning_rate": 9.405815960961054e-06,
|
||
|
|
"loss": 0.0978,
|
||
|
|
"num_input_tokens_seen": 1511680,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7366548042704624,
|
||
|
|
"grad_norm": 2.47867488861084,
|
||
|
|
"learning_rate": 9.16419599824847e-06,
|
||
|
|
"loss": 0.0966,
|
||
|
|
"num_input_tokens_seen": 1517888,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7544483985765122,
|
||
|
|
"grad_norm": 3.3050386905670166,
|
||
|
|
"learning_rate": 8.925021605742211e-06,
|
||
|
|
"loss": 0.1815,
|
||
|
|
"num_input_tokens_seen": 1525568,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.772241992882562,
|
||
|
|
"grad_norm": 6.0262837409973145,
|
||
|
|
"learning_rate": 8.68832971985347e-06,
|
||
|
|
"loss": 0.1028,
|
||
|
|
"num_input_tokens_seen": 1532480,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.790035587188612,
|
||
|
|
"grad_norm": 2.8200912475585938,
|
||
|
|
"learning_rate": 8.454156893612591e-06,
|
||
|
|
"loss": 0.1203,
|
||
|
|
"num_input_tokens_seen": 1539072,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.790035587188612,
|
||
|
|
"eval_loss": 0.17713916301727295,
|
||
|
|
"eval_runtime": 0.6261,
|
||
|
|
"eval_samples_per_second": 397.715,
|
||
|
|
"eval_steps_per_second": 51.112,
|
||
|
|
"num_input_tokens_seen": 1539072,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.807829181494662,
|
||
|
|
"grad_norm": 2.3930211067199707,
|
||
|
|
"learning_rate": 8.222539291024078e-06,
|
||
|
|
"loss": 0.1178,
|
||
|
|
"num_input_tokens_seen": 1547584,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8256227758007118,
|
||
|
|
"grad_norm": 7.24454402923584,
|
||
|
|
"learning_rate": 7.993512681481639e-06,
|
||
|
|
"loss": 0.0999,
|
||
|
|
"num_input_tokens_seen": 1554304,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8434163701067616,
|
||
|
|
"grad_norm": 7.17146110534668,
|
||
|
|
"learning_rate": 7.767112434244253e-06,
|
||
|
|
"loss": 0.145,
|
||
|
|
"num_input_tokens_seen": 1560896,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8612099644128115,
|
||
|
|
"grad_norm": 4.711667060852051,
|
||
|
|
"learning_rate": 7.543373512973947e-06,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"num_input_tokens_seen": 1567744,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8790035587188614,
|
||
|
|
"grad_norm": 12.18324089050293,
|
||
|
|
"learning_rate": 7.3223304703363135e-06,
|
||
|
|
"loss": 0.1558,
|
||
|
|
"num_input_tokens_seen": 1574400,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8967971530249113,
|
||
|
|
"grad_norm": 2.6999011039733887,
|
||
|
|
"learning_rate": 7.104017442664393e-06,
|
||
|
|
"loss": 0.0965,
|
||
|
|
"num_input_tokens_seen": 1581504,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.914590747330961,
|
||
|
|
"grad_norm": 5.639074802398682,
|
||
|
|
"learning_rate": 6.8884681446869105e-06,
|
||
|
|
"loss": 0.0914,
|
||
|
|
"num_input_tokens_seen": 1589504,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9323843416370106,
|
||
|
|
"grad_norm": 6.777685165405273,
|
||
|
|
"learning_rate": 6.67571586432163e-06,
|
||
|
|
"loss": 0.124,
|
||
|
|
"num_input_tokens_seen": 1597696,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9501779359430604,
|
||
|
|
"grad_norm": 5.154758453369141,
|
||
|
|
"learning_rate": 6.465793457534553e-06,
|
||
|
|
"loss": 0.1388,
|
||
|
|
"num_input_tokens_seen": 1605248,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9679715302491103,
|
||
|
|
"grad_norm": 4.713754653930664,
|
||
|
|
"learning_rate": 6.258733343265932e-06,
|
||
|
|
"loss": 0.1646,
|
||
|
|
"num_input_tokens_seen": 1613952,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.98576512455516,
|
||
|
|
"grad_norm": 5.546712875366211,
|
||
|
|
"learning_rate": 6.0545674984236826e-06,
|
||
|
|
"loss": 0.1024,
|
||
|
|
"num_input_tokens_seen": 1620224,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.00355871886121,
|
||
|
|
"grad_norm": 1.0218762159347534,
|
||
|
|
"learning_rate": 5.853327452945115e-06,
|
||
|
|
"loss": 0.0889,
|
||
|
|
"num_input_tokens_seen": 1625800,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0213523131672595,
|
||
|
|
"grad_norm": 7.033966541290283,
|
||
|
|
"learning_rate": 5.655044284927657e-06,
|
||
|
|
"loss": 0.0747,
|
||
|
|
"num_input_tokens_seen": 1633352,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.039145907473309,
|
||
|
|
"grad_norm": 1.1709257364273071,
|
||
|
|
"learning_rate": 5.459748615829355e-06,
|
||
|
|
"loss": 0.0551,
|
||
|
|
"num_input_tokens_seen": 1640840,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.04270462633452,
|
||
|
|
"eval_loss": 0.19830213487148285,
|
||
|
|
"eval_runtime": 0.616,
|
||
|
|
"eval_samples_per_second": 404.216,
|
||
|
|
"eval_steps_per_second": 51.947,
|
||
|
|
"num_input_tokens_seen": 1642696,
|
||
|
|
"step": 1136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.056939501779359,
|
||
|
|
"grad_norm": 2.399528980255127,
|
||
|
|
"learning_rate": 5.267470605739952e-06,
|
||
|
|
"loss": 0.0395,
|
||
|
|
"num_input_tokens_seen": 1648520,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.074733096085409,
|
||
|
|
"grad_norm": 3.8567628860473633,
|
||
|
|
"learning_rate": 5.078239948723154e-06,
|
||
|
|
"loss": 0.0215,
|
||
|
|
"num_input_tokens_seen": 1655752,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.092526690391459,
|
||
|
|
"grad_norm": 2.231137990951538,
|
||
|
|
"learning_rate": 4.892085868230881e-06,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"num_input_tokens_seen": 1662920,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.110320284697509,
|
||
|
|
"grad_norm": 8.699728012084961,
|
||
|
|
"learning_rate": 4.709037112590217e-06,
|
||
|
|
"loss": 0.0348,
|
||
|
|
"num_input_tokens_seen": 1669896,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.128113879003559,
|
||
|
|
"grad_norm": 8.660861015319824,
|
||
|
|
"learning_rate": 4.529121950563716e-06,
|
||
|
|
"loss": 0.076,
|
||
|
|
"num_input_tokens_seen": 1675400,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.145907473309609,
|
||
|
|
"grad_norm": 7.111387252807617,
|
||
|
|
"learning_rate": 4.352368166983753e-06,
|
||
|
|
"loss": 0.0705,
|
||
|
|
"num_input_tokens_seen": 1682952,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1637010676156585,
|
||
|
|
"grad_norm": 6.721922874450684,
|
||
|
|
"learning_rate": 4.178803058461664e-06,
|
||
|
|
"loss": 0.088,
|
||
|
|
"num_input_tokens_seen": 1690248,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.181494661921708,
|
||
|
|
"grad_norm": 1.4173535108566284,
|
||
|
|
"learning_rate": 4.0084534291722376e-06,
|
||
|
|
"loss": 0.05,
|
||
|
|
"num_input_tokens_seen": 1696840,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.199288256227758,
|
||
|
|
"grad_norm": 0.436257928609848,
|
||
|
|
"learning_rate": 3.841345586714251e-06,
|
||
|
|
"loss": 0.0689,
|
||
|
|
"num_input_tokens_seen": 1703624,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.217081850533808,
|
||
|
|
"grad_norm": 0.09257882088422775,
|
||
|
|
"learning_rate": 3.677505338047729e-06,
|
||
|
|
"loss": 0.0218,
|
||
|
|
"num_input_tokens_seen": 1710024,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.234875444839858,
|
||
|
|
"grad_norm": 0.0730605497956276,
|
||
|
|
"learning_rate": 3.516957985508476e-06,
|
||
|
|
"loss": 0.068,
|
||
|
|
"num_input_tokens_seen": 1717768,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.252669039145908,
|
||
|
|
"grad_norm": 0.23621395230293274,
|
||
|
|
"learning_rate": 3.3597283229005877e-06,
|
||
|
|
"loss": 0.021,
|
||
|
|
"num_input_tokens_seen": 1727240,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.270462633451958,
|
||
|
|
"grad_norm": 0.33008334040641785,
|
||
|
|
"learning_rate": 3.205840631667456e-06,
|
||
|
|
"loss": 0.0422,
|
||
|
|
"num_input_tokens_seen": 1734408,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.288256227758007,
|
||
|
|
"grad_norm": 9.555450439453125,
|
||
|
|
"learning_rate": 3.0553186771419162e-06,
|
||
|
|
"loss": 0.0577,
|
||
|
|
"num_input_tokens_seen": 1740936,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.295373665480427,
|
||
|
|
"eval_loss": 0.3402128219604492,
|
||
|
|
"eval_runtime": 0.6132,
|
||
|
|
"eval_samples_per_second": 406.087,
|
||
|
|
"eval_steps_per_second": 52.188,
|
||
|
|
"num_input_tokens_seen": 1743624,
|
||
|
|
"step": 1207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.306049822064057,
|
||
|
|
"grad_norm": 1.463619589805603,
|
||
|
|
"learning_rate": 2.908185704876101e-06,
|
||
|
|
"loss": 0.0397,
|
||
|
|
"num_input_tokens_seen": 1747784,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3238434163701065,
|
||
|
|
"grad_norm": 8.920357704162598,
|
||
|
|
"learning_rate": 2.7644644370515365e-06,
|
||
|
|
"loss": 0.0636,
|
||
|
|
"num_input_tokens_seen": 1754888,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.341637010676156,
|
||
|
|
"grad_norm": 1.068237543106079,
|
||
|
|
"learning_rate": 2.624177068970124e-06,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"num_input_tokens_seen": 1762632,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.359430604982206,
|
||
|
|
"grad_norm": 15.559476852416992,
|
||
|
|
"learning_rate": 2.4873452656264313e-06,
|
||
|
|
"loss": 0.0331,
|
||
|
|
"num_input_tokens_seen": 1769928,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.377224199288256,
|
||
|
|
"grad_norm": 0.16921275854110718,
|
||
|
|
"learning_rate": 2.3539901583619185e-06,
|
||
|
|
"loss": 0.0824,
|
||
|
|
"num_input_tokens_seen": 1777480,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.395017793594306,
|
||
|
|
"grad_norm": 0.30731886625289917,
|
||
|
|
"learning_rate": 2.2241323416015453e-06,
|
||
|
|
"loss": 0.0384,
|
||
|
|
"num_input_tokens_seen": 1784840,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.412811387900356,
|
||
|
|
"grad_norm": 0.8764639496803284,
|
||
|
|
"learning_rate": 2.09779186967331e-06,
|
||
|
|
"loss": 0.0435,
|
||
|
|
"num_input_tokens_seen": 1792584,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.430604982206406,
|
||
|
|
"grad_norm": 10.101332664489746,
|
||
|
|
"learning_rate": 1.9749882537112296e-06,
|
||
|
|
"loss": 0.0525,
|
||
|
|
"num_input_tokens_seen": 1800968,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.448398576512456,
|
||
|
|
"grad_norm": 0.037536390125751495,
|
||
|
|
"learning_rate": 1.8557404586421413e-06,
|
||
|
|
"loss": 0.0777,
|
||
|
|
"num_input_tokens_seen": 1808456,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4661921708185055,
|
||
|
|
"grad_norm": 14.205132484436035,
|
||
|
|
"learning_rate": 1.7400669002569232e-06,
|
||
|
|
"loss": 0.1469,
|
||
|
|
"num_input_tokens_seen": 1816136,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.483985765124555,
|
||
|
|
"grad_norm": 16.095531463623047,
|
||
|
|
"learning_rate": 1.6279854423664697e-06,
|
||
|
|
"loss": 0.0696,
|
||
|
|
"num_input_tokens_seen": 1824136,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.501779359430605,
|
||
|
|
"grad_norm": 0.2532411217689514,
|
||
|
|
"learning_rate": 1.5195133940429345e-06,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"num_input_tokens_seen": 1831304,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.519572953736655,
|
||
|
|
"grad_norm": 5.228630065917969,
|
||
|
|
"learning_rate": 1.4146675069466403e-06,
|
||
|
|
"loss": 0.0259,
|
||
|
|
"num_input_tokens_seen": 1837512,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.537366548042705,
|
||
|
|
"grad_norm": 1.4012762308120728,
|
||
|
|
"learning_rate": 1.313463972739068e-06,
|
||
|
|
"loss": 0.0319,
|
||
|
|
"num_input_tokens_seen": 1844296,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.548042704626335,
|
||
|
|
"eval_loss": 0.3532261848449707,
|
||
|
|
"eval_runtime": 0.6553,
|
||
|
|
"eval_samples_per_second": 379.991,
|
||
|
|
"eval_steps_per_second": 48.834,
|
||
|
|
"num_input_tokens_seen": 1849416,
|
||
|
|
"step": 1278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.555160142348754,
|
||
|
|
"grad_norm": 0.7564399838447571,
|
||
|
|
"learning_rate": 1.2159184205823432e-06,
|
||
|
|
"loss": 0.0338,
|
||
|
|
"num_input_tokens_seen": 1851720,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.572953736654805,
|
||
|
|
"grad_norm": 0.5453316569328308,
|
||
|
|
"learning_rate": 1.122045914725564e-06,
|
||
|
|
"loss": 0.0457,
|
||
|
|
"num_input_tokens_seen": 1858120,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.590747330960854,
|
||
|
|
"grad_norm": 9.23385238647461,
|
||
|
|
"learning_rate": 1.0318609521783818e-06,
|
||
|
|
"loss": 0.0645,
|
||
|
|
"num_input_tokens_seen": 1865928,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.608540925266904,
|
||
|
|
"grad_norm": 6.625101566314697,
|
||
|
|
"learning_rate": 9.453774604721938e-07,
|
||
|
|
"loss": 0.0261,
|
||
|
|
"num_input_tokens_seen": 1873800,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6263345195729535,
|
||
|
|
"grad_norm": 11.140019416809082,
|
||
|
|
"learning_rate": 8.62608795509276e-07,
|
||
|
|
"loss": 0.054,
|
||
|
|
"num_input_tokens_seen": 1881800,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.644128113879003,
|
||
|
|
"grad_norm": 1.8604605197906494,
|
||
|
|
"learning_rate": 7.835677395001795e-07,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"num_input_tokens_seen": 1888648,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.661921708185053,
|
||
|
|
"grad_norm": 10.582422256469727,
|
||
|
|
"learning_rate": 7.082664989897487e-07,
|
||
|
|
"loss": 0.1115,
|
||
|
|
"num_input_tokens_seen": 1895432,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.679715302491103,
|
||
|
|
"grad_norm": 5.248901844024658,
|
||
|
|
"learning_rate": 6.367167029720234e-07,
|
||
|
|
"loss": 0.0608,
|
||
|
|
"num_input_tokens_seen": 1902408,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.697508896797153,
|
||
|
|
"grad_norm": 0.2927665412425995,
|
||
|
|
"learning_rate": 5.68929401094323e-07,
|
||
|
|
"loss": 0.0289,
|
||
|
|
"num_input_tokens_seen": 1910344,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.715302491103203,
|
||
|
|
"grad_norm": 0.10143531113862991,
|
||
|
|
"learning_rate": 5.049150619508502e-07,
|
||
|
|
"loss": 0.0309,
|
||
|
|
"num_input_tokens_seen": 1918472,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.733096085409253,
|
||
|
|
"grad_norm": 1.0533421039581299,
|
||
|
|
"learning_rate": 4.4468357146596475e-07,
|
||
|
|
"loss": 0.0078,
|
||
|
|
"num_input_tokens_seen": 1924744,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.750889679715303,
|
||
|
|
"grad_norm": 0.11136168986558914,
|
||
|
|
"learning_rate": 3.8824423136748777e-07,
|
||
|
|
"loss": 0.0676,
|
||
|
|
"num_input_tokens_seen": 1932872,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7686832740213525,
|
||
|
|
"grad_norm": 7.673605442047119,
|
||
|
|
"learning_rate": 3.3560575775019864e-07,
|
||
|
|
"loss": 0.0673,
|
||
|
|
"num_input_tokens_seen": 1940040,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.786476868327402,
|
||
|
|
"grad_norm": 9.804219245910645,
|
||
|
|
"learning_rate": 2.8677627972978906e-07,
|
||
|
|
"loss": 0.0846,
|
||
|
|
"num_input_tokens_seen": 1948936,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.800711743772242,
|
||
|
|
"eval_loss": 0.34229812026023865,
|
||
|
|
"eval_runtime": 0.623,
|
||
|
|
"eval_samples_per_second": 399.663,
|
||
|
|
"eval_steps_per_second": 51.362,
|
||
|
|
"num_input_tokens_seen": 1954568,
|
||
|
|
"step": 1349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.804270462633452,
|
||
|
|
"grad_norm": 0.05879069119691849,
|
||
|
|
"learning_rate": 2.417633381874534e-07,
|
||
|
|
"loss": 0.001,
|
||
|
|
"num_input_tokens_seen": 1955912,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.822064056939502,
|
||
|
|
"grad_norm": 5.924869537353516,
|
||
|
|
"learning_rate": 2.0057388460533732e-07,
|
||
|
|
"loss": 0.0243,
|
||
|
|
"num_input_tokens_seen": 1962760,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.839857651245552,
|
||
|
|
"grad_norm": 5.939824104309082,
|
||
|
|
"learning_rate": 1.6321427999298755e-07,
|
||
|
|
"loss": 0.0594,
|
||
|
|
"num_input_tokens_seen": 1969160,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.857651245551601,
|
||
|
|
"grad_norm": 1.1909672021865845,
|
||
|
|
"learning_rate": 1.2969029390501597e-07,
|
||
|
|
"loss": 0.0329,
|
||
|
|
"num_input_tokens_seen": 1975752,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.875444839857651,
|
||
|
|
"grad_norm": 2.516611099243164,
|
||
|
|
"learning_rate": 1.0000710355008159e-07,
|
||
|
|
"loss": 0.0349,
|
||
|
|
"num_input_tokens_seen": 1983240,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.893238434163701,
|
||
|
|
"grad_norm": 2.7162880897521973,
|
||
|
|
"learning_rate": 7.416929299135511e-08,
|
||
|
|
"loss": 0.004,
|
||
|
|
"num_input_tokens_seen": 1990792,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.911032028469751,
|
||
|
|
"grad_norm": 0.11341112107038498,
|
||
|
|
"learning_rate": 5.218085243859638e-08,
|
||
|
|
"loss": 0.028,
|
||
|
|
"num_input_tokens_seen": 1998728,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9288256227758005,
|
||
|
|
"grad_norm": 12.289280891418457,
|
||
|
|
"learning_rate": 3.4045177631936155e-08,
|
||
|
|
"loss": 0.046,
|
||
|
|
"num_input_tokens_seen": 2006920,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.94661921708185,
|
||
|
|
"grad_norm": 5.766960144042969,
|
||
|
|
"learning_rate": 1.976506931745392e-08,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"num_input_tokens_seen": 2013128,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9644128113879,
|
||
|
|
"grad_norm": 0.9045501351356506,
|
||
|
|
"learning_rate": 9.3427328146517e-09,
|
||
|
|
"loss": 0.0718,
|
||
|
|
"num_input_tokens_seen": 2021704,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.98220640569395,
|
||
|
|
"grad_norm": 1.2223786115646362,
|
||
|
|
"learning_rate": 2.779777675890327e-09,
|
||
|
|
"loss": 0.1224,
|
||
|
|
"num_input_tokens_seen": 2028872,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 0.01060063298791647,
|
||
|
|
"learning_rate": 7.72174378022017e-11,
|
||
|
|
"loss": 0.0499,
|
||
|
|
"num_input_tokens_seen": 2035272,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"num_input_tokens_seen": 2035272,
|
||
|
|
"step": 1405,
|
||
|
|
"total_flos": 1.1883702201974784e+16,
|
||
|
|
"train_loss": 0.17133487164477065,
|
||
|
|
"train_runtime": 699.7603,
|
||
|
|
"train_samples_per_second": 16.013,
|
||
|
|
"train_steps_per_second": 2.008
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 1405,
|
||
|
|
"num_input_tokens_seen": 2035272,
|
||
|
|
"num_train_epochs": 5,
|
||
|
|
"save_steps": 71,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 1.1883702201974784e+16,
|
||
|
|
"train_batch_size": 8,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|