3520 lines
92 KiB
JSON
3520 lines
92 KiB
JSON
{
|
|
"best_global_step": 104,
|
|
"best_metric": 0.17402823269367218,
|
|
"best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_mrpc_42_1774791061/checkpoint-104",
|
|
"epoch": 5.0,
|
|
"eval_steps": 104,
|
|
"global_step": 2065,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.012106537530266344,
|
|
"grad_norm": 262.4778747558594,
|
|
"learning_rate": 9.66183574879227e-07,
|
|
"loss": 0.7681,
|
|
"num_input_tokens_seen": 4352,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.024213075060532687,
|
|
"grad_norm": 26.363384246826172,
|
|
"learning_rate": 2.173913043478261e-06,
|
|
"loss": 0.3056,
|
|
"num_input_tokens_seen": 8768,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.03631961259079903,
|
|
"grad_norm": 10.327119827270508,
|
|
"learning_rate": 3.3816425120772947e-06,
|
|
"loss": 0.183,
|
|
"num_input_tokens_seen": 12992,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.048426150121065374,
|
|
"grad_norm": 36.403324127197266,
|
|
"learning_rate": 4.589371980676329e-06,
|
|
"loss": 0.4041,
|
|
"num_input_tokens_seen": 17344,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.06053268765133172,
|
|
"grad_norm": 8.729621887207031,
|
|
"learning_rate": 5.797101449275362e-06,
|
|
"loss": 0.4147,
|
|
"num_input_tokens_seen": 21696,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.07263922518159806,
|
|
"grad_norm": 4.769359111785889,
|
|
"learning_rate": 7.004830917874397e-06,
|
|
"loss": 0.2132,
|
|
"num_input_tokens_seen": 26112,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.0847457627118644,
|
|
"grad_norm": 4.588466644287109,
|
|
"learning_rate": 8.212560386473431e-06,
|
|
"loss": 0.2587,
|
|
"num_input_tokens_seen": 30208,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.09685230024213075,
|
|
"grad_norm": 21.823162078857422,
|
|
"learning_rate": 9.420289855072464e-06,
|
|
"loss": 0.2076,
|
|
"num_input_tokens_seen": 34688,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.1089588377723971,
|
|
"grad_norm": 18.038860321044922,
|
|
"learning_rate": 1.0628019323671499e-05,
|
|
"loss": 0.1842,
|
|
"num_input_tokens_seen": 38784,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.12106537530266344,
|
|
"grad_norm": 12.918279647827148,
|
|
"learning_rate": 1.1835748792270531e-05,
|
|
"loss": 0.3012,
|
|
"num_input_tokens_seen": 43200,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.13317191283292978,
|
|
"grad_norm": 24.635744094848633,
|
|
"learning_rate": 1.3043478260869566e-05,
|
|
"loss": 0.1951,
|
|
"num_input_tokens_seen": 47296,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.14527845036319612,
|
|
"grad_norm": 14.053600311279297,
|
|
"learning_rate": 1.4251207729468599e-05,
|
|
"loss": 0.2332,
|
|
"num_input_tokens_seen": 51712,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.15738498789346247,
|
|
"grad_norm": 8.166345596313477,
|
|
"learning_rate": 1.5458937198067633e-05,
|
|
"loss": 0.2049,
|
|
"num_input_tokens_seen": 55872,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.1694915254237288,
|
|
"grad_norm": 27.84511947631836,
|
|
"learning_rate": 1.6666666666666667e-05,
|
|
"loss": 0.2103,
|
|
"num_input_tokens_seen": 59840,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.18159806295399517,
|
|
"grad_norm": 55.02257537841797,
|
|
"learning_rate": 1.78743961352657e-05,
|
|
"loss": 0.3072,
|
|
"num_input_tokens_seen": 64000,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.1937046004842615,
|
|
"grad_norm": 11.449199676513672,
|
|
"learning_rate": 1.9082125603864733e-05,
|
|
"loss": 0.3841,
|
|
"num_input_tokens_seen": 68352,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.20581113801452786,
|
|
"grad_norm": 11.381855964660645,
|
|
"learning_rate": 2.028985507246377e-05,
|
|
"loss": 0.232,
|
|
"num_input_tokens_seen": 72768,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.2179176755447942,
|
|
"grad_norm": 42.495670318603516,
|
|
"learning_rate": 2.1497584541062805e-05,
|
|
"loss": 0.2474,
|
|
"num_input_tokens_seen": 77120,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.23002421307506055,
|
|
"grad_norm": 21.28970718383789,
|
|
"learning_rate": 2.2705314009661836e-05,
|
|
"loss": 0.1841,
|
|
"num_input_tokens_seen": 81664,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.24213075060532688,
|
|
"grad_norm": 17.023759841918945,
|
|
"learning_rate": 2.391304347826087e-05,
|
|
"loss": 0.1681,
|
|
"num_input_tokens_seen": 86080,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.25181598062953997,
|
|
"eval_loss": 0.17402823269367218,
|
|
"eval_runtime": 0.639,
|
|
"eval_samples_per_second": 574.368,
|
|
"eval_steps_per_second": 71.992,
|
|
"num_input_tokens_seen": 89600,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.2542372881355932,
|
|
"grad_norm": 16.713178634643555,
|
|
"learning_rate": 2.5120772946859905e-05,
|
|
"loss": 0.1488,
|
|
"num_input_tokens_seen": 90432,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.26634382566585957,
|
|
"grad_norm": 6.363961219787598,
|
|
"learning_rate": 2.632850241545894e-05,
|
|
"loss": 0.2051,
|
|
"num_input_tokens_seen": 94528,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.2784503631961259,
|
|
"grad_norm": 7.700758934020996,
|
|
"learning_rate": 2.753623188405797e-05,
|
|
"loss": 0.16,
|
|
"num_input_tokens_seen": 98816,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.29055690072639223,
|
|
"grad_norm": 8.657270431518555,
|
|
"learning_rate": 2.8743961352657005e-05,
|
|
"loss": 0.205,
|
|
"num_input_tokens_seen": 103104,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.3026634382566586,
|
|
"grad_norm": 7.297232151031494,
|
|
"learning_rate": 2.995169082125604e-05,
|
|
"loss": 0.1846,
|
|
"num_input_tokens_seen": 107328,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.31476997578692495,
|
|
"grad_norm": 13.21757984161377,
|
|
"learning_rate": 3.1159420289855074e-05,
|
|
"loss": 0.2243,
|
|
"num_input_tokens_seen": 111488,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.3268765133171913,
|
|
"grad_norm": 6.457214832305908,
|
|
"learning_rate": 3.236714975845411e-05,
|
|
"loss": 0.2013,
|
|
"num_input_tokens_seen": 115968,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.3389830508474576,
|
|
"grad_norm": 29.321474075317383,
|
|
"learning_rate": 3.357487922705314e-05,
|
|
"loss": 0.2278,
|
|
"num_input_tokens_seen": 120192,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.35108958837772397,
|
|
"grad_norm": 10.676529884338379,
|
|
"learning_rate": 3.478260869565218e-05,
|
|
"loss": 0.1886,
|
|
"num_input_tokens_seen": 124416,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.36319612590799033,
|
|
"grad_norm": 11.802507400512695,
|
|
"learning_rate": 3.5990338164251205e-05,
|
|
"loss": 0.1635,
|
|
"num_input_tokens_seen": 128832,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.37530266343825663,
|
|
"grad_norm": 9.175806999206543,
|
|
"learning_rate": 3.719806763285024e-05,
|
|
"loss": 0.2118,
|
|
"num_input_tokens_seen": 132992,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.387409200968523,
|
|
"grad_norm": 17.557262420654297,
|
|
"learning_rate": 3.8405797101449274e-05,
|
|
"loss": 0.3186,
|
|
"num_input_tokens_seen": 137280,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.39951573849878935,
|
|
"grad_norm": 31.175756454467773,
|
|
"learning_rate": 3.961352657004831e-05,
|
|
"loss": 0.2002,
|
|
"num_input_tokens_seen": 141568,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.4116222760290557,
|
|
"grad_norm": 12.988505363464355,
|
|
"learning_rate": 4.082125603864734e-05,
|
|
"loss": 0.1792,
|
|
"num_input_tokens_seen": 145984,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.423728813559322,
|
|
"grad_norm": 43.43312454223633,
|
|
"learning_rate": 4.202898550724638e-05,
|
|
"loss": 0.3197,
|
|
"num_input_tokens_seen": 150144,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.4358353510895884,
|
|
"grad_norm": 10.99770736694336,
|
|
"learning_rate": 4.323671497584541e-05,
|
|
"loss": 0.3561,
|
|
"num_input_tokens_seen": 154624,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.44794188861985473,
|
|
"grad_norm": 8.507532119750977,
|
|
"learning_rate": 4.4444444444444447e-05,
|
|
"loss": 0.373,
|
|
"num_input_tokens_seen": 158784,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.4600484261501211,
|
|
"grad_norm": 129.54592895507812,
|
|
"learning_rate": 4.565217391304348e-05,
|
|
"loss": 0.3924,
|
|
"num_input_tokens_seen": 163072,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.4721549636803874,
|
|
"grad_norm": 15.108623504638672,
|
|
"learning_rate": 4.6859903381642516e-05,
|
|
"loss": 0.2368,
|
|
"num_input_tokens_seen": 167104,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.48426150121065376,
|
|
"grad_norm": 9.902148246765137,
|
|
"learning_rate": 4.806763285024155e-05,
|
|
"loss": 0.4497,
|
|
"num_input_tokens_seen": 171456,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.4963680387409201,
|
|
"grad_norm": 16.188369750976562,
|
|
"learning_rate": 4.9275362318840584e-05,
|
|
"loss": 0.2715,
|
|
"num_input_tokens_seen": 175808,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.5036319612590799,
|
|
"eval_loss": 0.23122040927410126,
|
|
"eval_runtime": 0.6326,
|
|
"eval_samples_per_second": 580.165,
|
|
"eval_steps_per_second": 72.718,
|
|
"num_input_tokens_seen": 178688,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.5084745762711864,
|
|
"grad_norm": 0.7912328839302063,
|
|
"learning_rate": 4.9999857052054956e-05,
|
|
"loss": 0.1981,
|
|
"num_input_tokens_seen": 180224,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.5205811138014528,
|
|
"grad_norm": 4.983211040496826,
|
|
"learning_rate": 4.999824890644693e-05,
|
|
"loss": 0.1989,
|
|
"num_input_tokens_seen": 184704,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.5326876513317191,
|
|
"grad_norm": 16.626827239990234,
|
|
"learning_rate": 4.9994854045622684e-05,
|
|
"loss": 0.2336,
|
|
"num_input_tokens_seen": 189184,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.5447941888619855,
|
|
"grad_norm": 5.18185567855835,
|
|
"learning_rate": 4.9989672712225204e-05,
|
|
"loss": 0.1595,
|
|
"num_input_tokens_seen": 193536,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.5569007263922519,
|
|
"grad_norm": 8.547920227050781,
|
|
"learning_rate": 4.998270527658311e-05,
|
|
"loss": 0.2147,
|
|
"num_input_tokens_seen": 197888,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.5690072639225182,
|
|
"grad_norm": 1.19011652469635,
|
|
"learning_rate": 4.9973952236684216e-05,
|
|
"loss": 0.1959,
|
|
"num_input_tokens_seen": 202112,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.5811138014527845,
|
|
"grad_norm": 12.658636093139648,
|
|
"learning_rate": 4.996341421813993e-05,
|
|
"loss": 0.2085,
|
|
"num_input_tokens_seen": 206528,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.5932203389830508,
|
|
"grad_norm": 20.122756958007812,
|
|
"learning_rate": 4.9951091974140506e-05,
|
|
"loss": 0.2304,
|
|
"num_input_tokens_seen": 210944,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.6053268765133172,
|
|
"grad_norm": 10.99802303314209,
|
|
"learning_rate": 4.99369863854013e-05,
|
|
"loss": 0.2171,
|
|
"num_input_tokens_seen": 215104,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.6174334140435835,
|
|
"grad_norm": 7.956684112548828,
|
|
"learning_rate": 4.992109846009972e-05,
|
|
"loss": 0.2458,
|
|
"num_input_tokens_seen": 219328,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.6295399515738499,
|
|
"grad_norm": 19.862939834594727,
|
|
"learning_rate": 4.990342933380321e-05,
|
|
"loss": 0.219,
|
|
"num_input_tokens_seen": 223680,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.6416464891041163,
|
|
"grad_norm": 7.302405834197998,
|
|
"learning_rate": 4.9883980269388106e-05,
|
|
"loss": 0.3803,
|
|
"num_input_tokens_seen": 227904,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.6537530266343826,
|
|
"grad_norm": 9.361984252929688,
|
|
"learning_rate": 4.986275265694935e-05,
|
|
"loss": 0.3005,
|
|
"num_input_tokens_seen": 231936,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.6658595641646489,
|
|
"grad_norm": 16.678607940673828,
|
|
"learning_rate": 4.9839748013701145e-05,
|
|
"loss": 0.2954,
|
|
"num_input_tokens_seen": 236160,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.6779661016949152,
|
|
"grad_norm": 9.596780776977539,
|
|
"learning_rate": 4.981496798386849e-05,
|
|
"loss": 0.2924,
|
|
"num_input_tokens_seen": 240320,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.6900726392251816,
|
|
"grad_norm": 6.522184371948242,
|
|
"learning_rate": 4.978841433856971e-05,
|
|
"loss": 0.1771,
|
|
"num_input_tokens_seen": 244800,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.7021791767554479,
|
|
"grad_norm": 8.720867156982422,
|
|
"learning_rate": 4.976008897568981e-05,
|
|
"loss": 0.194,
|
|
"num_input_tokens_seen": 249152,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.7142857142857143,
|
|
"grad_norm": 11.178607940673828,
|
|
"learning_rate": 4.972999391974488e-05,
|
|
"loss": 0.2064,
|
|
"num_input_tokens_seen": 253376,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.7263922518159807,
|
|
"grad_norm": 12.191368103027344,
|
|
"learning_rate": 4.969813132173735e-05,
|
|
"loss": 0.2096,
|
|
"num_input_tokens_seen": 257664,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.738498789346247,
|
|
"grad_norm": 5.037217617034912,
|
|
"learning_rate": 4.966450345900229e-05,
|
|
"loss": 0.1712,
|
|
"num_input_tokens_seen": 262016,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.7506053268765133,
|
|
"grad_norm": 10.153473854064941,
|
|
"learning_rate": 4.962911273504461e-05,
|
|
"loss": 0.2276,
|
|
"num_input_tokens_seen": 266432,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.7554479418886199,
|
|
"eval_loss": 0.22853781282901764,
|
|
"eval_runtime": 2.3445,
|
|
"eval_samples_per_second": 156.536,
|
|
"eval_steps_per_second": 19.62,
|
|
"num_input_tokens_seen": 267968,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 0.7627118644067796,
|
|
"grad_norm": 12.040881156921387,
|
|
"learning_rate": 4.9591961679367284e-05,
|
|
"loss": 0.2349,
|
|
"num_input_tokens_seen": 270464,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.774818401937046,
|
|
"grad_norm": 12.473306655883789,
|
|
"learning_rate": 4.955305294729056e-05,
|
|
"loss": 0.2824,
|
|
"num_input_tokens_seen": 274688,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.7869249394673123,
|
|
"grad_norm": 21.77474594116211,
|
|
"learning_rate": 4.951238931976216e-05,
|
|
"loss": 0.3105,
|
|
"num_input_tokens_seen": 278848,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.7990314769975787,
|
|
"grad_norm": 17.280487060546875,
|
|
"learning_rate": 4.9469973703158565e-05,
|
|
"loss": 0.2667,
|
|
"num_input_tokens_seen": 283136,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.8111380145278451,
|
|
"grad_norm": 6.448112487792969,
|
|
"learning_rate": 4.9425809129077204e-05,
|
|
"loss": 0.2213,
|
|
"num_input_tokens_seen": 287680,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.8232445520581114,
|
|
"grad_norm": 1.0759979486465454,
|
|
"learning_rate": 4.937989875411985e-05,
|
|
"loss": 0.1887,
|
|
"num_input_tokens_seen": 292224,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.8353510895883777,
|
|
"grad_norm": 8.703038215637207,
|
|
"learning_rate": 4.933224585966696e-05,
|
|
"loss": 0.2499,
|
|
"num_input_tokens_seen": 296448,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.847457627118644,
|
|
"grad_norm": 16.416717529296875,
|
|
"learning_rate": 4.928285385164315e-05,
|
|
"loss": 0.2431,
|
|
"num_input_tokens_seen": 300736,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.8595641646489104,
|
|
"grad_norm": 6.670568943023682,
|
|
"learning_rate": 4.923172626027379e-05,
|
|
"loss": 0.2588,
|
|
"num_input_tokens_seen": 304960,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.8716707021791767,
|
|
"grad_norm": 3.8800857067108154,
|
|
"learning_rate": 4.917886673983267e-05,
|
|
"loss": 0.2322,
|
|
"num_input_tokens_seen": 309184,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.8837772397094431,
|
|
"grad_norm": 8.991925239562988,
|
|
"learning_rate": 4.912427906838078e-05,
|
|
"loss": 0.2314,
|
|
"num_input_tokens_seen": 313408,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.8958837772397095,
|
|
"grad_norm": 9.208677291870117,
|
|
"learning_rate": 4.906796714749635e-05,
|
|
"loss": 0.1782,
|
|
"num_input_tokens_seen": 317888,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.9079903147699758,
|
|
"grad_norm": 6.636046886444092,
|
|
"learning_rate": 4.900993500199591e-05,
|
|
"loss": 0.1873,
|
|
"num_input_tokens_seen": 322048,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.9200968523002422,
|
|
"grad_norm": 10.718189239501953,
|
|
"learning_rate": 4.895018677964669e-05,
|
|
"loss": 0.1985,
|
|
"num_input_tokens_seen": 326592,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.9322033898305084,
|
|
"grad_norm": 22.99626922607422,
|
|
"learning_rate": 4.8888726750870126e-05,
|
|
"loss": 0.3036,
|
|
"num_input_tokens_seen": 330880,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.9443099273607748,
|
|
"grad_norm": 3.320899486541748,
|
|
"learning_rate": 4.882555930843664e-05,
|
|
"loss": 0.2224,
|
|
"num_input_tokens_seen": 335104,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.9564164648910412,
|
|
"grad_norm": 5.677978038787842,
|
|
"learning_rate": 4.87606889671517e-05,
|
|
"loss": 0.1898,
|
|
"num_input_tokens_seen": 339392,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.9685230024213075,
|
|
"grad_norm": 11.17044448852539,
|
|
"learning_rate": 4.8694120363533104e-05,
|
|
"loss": 0.1663,
|
|
"num_input_tokens_seen": 343744,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.9806295399515739,
|
|
"grad_norm": 9.493459701538086,
|
|
"learning_rate": 4.8625858255479574e-05,
|
|
"loss": 0.1954,
|
|
"num_input_tokens_seen": 348160,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.9927360774818402,
|
|
"grad_norm": 13.322687149047852,
|
|
"learning_rate": 4.855590752193076e-05,
|
|
"loss": 0.2606,
|
|
"num_input_tokens_seen": 352448,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.0048426150121066,
|
|
"grad_norm": 13.647954940795898,
|
|
"learning_rate": 4.848427316251842e-05,
|
|
"loss": 0.5572,
|
|
"num_input_tokens_seen": 356656,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 1.0072639225181599,
|
|
"eval_loss": 0.2624819278717041,
|
|
"eval_runtime": 0.8628,
|
|
"eval_samples_per_second": 425.363,
|
|
"eval_steps_per_second": 53.315,
|
|
"num_input_tokens_seen": 357488,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 1.0169491525423728,
|
|
"grad_norm": 43.02584457397461,
|
|
"learning_rate": 4.841096029720921e-05,
|
|
"loss": 0.2346,
|
|
"num_input_tokens_seen": 360880,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.0290556900726393,
|
|
"grad_norm": 8.104162216186523,
|
|
"learning_rate": 4.8335974165938615e-05,
|
|
"loss": 0.1819,
|
|
"num_input_tokens_seen": 365104,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 1.0411622276029056,
|
|
"grad_norm": 5.002182483673096,
|
|
"learning_rate": 4.825932012823652e-05,
|
|
"loss": 0.1495,
|
|
"num_input_tokens_seen": 369776,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 1.053268765133172,
|
|
"grad_norm": 27.77912139892578,
|
|
"learning_rate": 4.8181003662844074e-05,
|
|
"loss": 0.2583,
|
|
"num_input_tokens_seen": 374000,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 1.0653753026634383,
|
|
"grad_norm": 9.262914657592773,
|
|
"learning_rate": 4.8101030367322195e-05,
|
|
"loss": 0.2093,
|
|
"num_input_tokens_seen": 378096,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.0774818401937045,
|
|
"grad_norm": 5.5975823402404785,
|
|
"learning_rate": 4.8019405957651395e-05,
|
|
"loss": 0.1806,
|
|
"num_input_tokens_seen": 382256,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 1.089588377723971,
|
|
"grad_norm": 10.306631088256836,
|
|
"learning_rate": 4.793613626782331e-05,
|
|
"loss": 0.3307,
|
|
"num_input_tokens_seen": 386672,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 1.1016949152542372,
|
|
"grad_norm": 4.157079696655273,
|
|
"learning_rate": 4.785122724942367e-05,
|
|
"loss": 0.2208,
|
|
"num_input_tokens_seen": 390960,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 1.1138014527845037,
|
|
"grad_norm": 0.7576245069503784,
|
|
"learning_rate": 4.776468497120698e-05,
|
|
"loss": 0.2978,
|
|
"num_input_tokens_seen": 395440,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.12590799031477,
|
|
"grad_norm": 6.9619035720825195,
|
|
"learning_rate": 4.7676515618662684e-05,
|
|
"loss": 0.2315,
|
|
"num_input_tokens_seen": 399600,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 1.1380145278450362,
|
|
"grad_norm": 1.4395357370376587,
|
|
"learning_rate": 4.758672549357316e-05,
|
|
"loss": 0.2236,
|
|
"num_input_tokens_seen": 403888,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 1.1501210653753027,
|
|
"grad_norm": 18.561601638793945,
|
|
"learning_rate": 4.749532101356322e-05,
|
|
"loss": 0.1689,
|
|
"num_input_tokens_seen": 408176,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 1.162227602905569,
|
|
"grad_norm": 16.139604568481445,
|
|
"learning_rate": 4.740230871164147e-05,
|
|
"loss": 0.2012,
|
|
"num_input_tokens_seen": 412208,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 1.1743341404358354,
|
|
"grad_norm": 1.962085247039795,
|
|
"learning_rate": 4.730769523573337e-05,
|
|
"loss": 0.1816,
|
|
"num_input_tokens_seen": 416624,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 1.1864406779661016,
|
|
"grad_norm": 3.118806838989258,
|
|
"learning_rate": 4.7211487348206054e-05,
|
|
"loss": 0.2491,
|
|
"num_input_tokens_seen": 421040,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 1.1985472154963681,
|
|
"grad_norm": 3.9620296955108643,
|
|
"learning_rate": 4.711369192538503e-05,
|
|
"loss": 0.203,
|
|
"num_input_tokens_seen": 425136,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 1.2106537530266344,
|
|
"grad_norm": 4.469512462615967,
|
|
"learning_rate": 4.7014315957062685e-05,
|
|
"loss": 0.4102,
|
|
"num_input_tokens_seen": 429680,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 1.2227602905569008,
|
|
"grad_norm": 8.607080459594727,
|
|
"learning_rate": 4.691336654599873e-05,
|
|
"loss": 0.2409,
|
|
"num_input_tokens_seen": 434224,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 1.234866828087167,
|
|
"grad_norm": 9.237229347229004,
|
|
"learning_rate": 4.6810850907412484e-05,
|
|
"loss": 0.2191,
|
|
"num_input_tokens_seen": 438320,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 1.2469733656174333,
|
|
"grad_norm": 5.81946325302124,
|
|
"learning_rate": 4.670677636846723e-05,
|
|
"loss": 0.1975,
|
|
"num_input_tokens_seen": 442672,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 1.2590799031476998,
|
|
"grad_norm": 2.934025764465332,
|
|
"learning_rate": 4.660115036774648e-05,
|
|
"loss": 0.1881,
|
|
"num_input_tokens_seen": 446896,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.2590799031476998,
|
|
"eval_loss": 0.1976936012506485,
|
|
"eval_runtime": 0.6676,
|
|
"eval_samples_per_second": 549.73,
|
|
"eval_steps_per_second": 68.904,
|
|
"num_input_tokens_seen": 446896,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.271186440677966,
|
|
"grad_norm": 2.785706043243408,
|
|
"learning_rate": 4.6493980454722344e-05,
|
|
"loss": 0.2485,
|
|
"num_input_tokens_seen": 451312,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 1.2832929782082325,
|
|
"grad_norm": 9.8702392578125,
|
|
"learning_rate": 4.638527428921592e-05,
|
|
"loss": 0.2053,
|
|
"num_input_tokens_seen": 455408,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 1.2953995157384988,
|
|
"grad_norm": 7.424989223480225,
|
|
"learning_rate": 4.627503964084981e-05,
|
|
"loss": 0.1867,
|
|
"num_input_tokens_seen": 460080,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 1.307506053268765,
|
|
"grad_norm": 4.052550792694092,
|
|
"learning_rate": 4.6163284388492835e-05,
|
|
"loss": 0.1674,
|
|
"num_input_tokens_seen": 464496,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.3196125907990315,
|
|
"grad_norm": 2.9404428005218506,
|
|
"learning_rate": 4.605001651969686e-05,
|
|
"loss": 0.2045,
|
|
"num_input_tokens_seen": 468720,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 1.331719128329298,
|
|
"grad_norm": 6.4158148765563965,
|
|
"learning_rate": 4.593524413012592e-05,
|
|
"loss": 0.191,
|
|
"num_input_tokens_seen": 473264,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 1.3438256658595642,
|
|
"grad_norm": 2.213015556335449,
|
|
"learning_rate": 4.5818975422977606e-05,
|
|
"loss": 0.1828,
|
|
"num_input_tokens_seen": 477552,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 1.3559322033898304,
|
|
"grad_norm": 5.9616804122924805,
|
|
"learning_rate": 4.570121870839671e-05,
|
|
"loss": 0.1546,
|
|
"num_input_tokens_seen": 482032,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.368038740920097,
|
|
"grad_norm": 0.6267197132110596,
|
|
"learning_rate": 4.558198240288131e-05,
|
|
"loss": 0.2025,
|
|
"num_input_tokens_seen": 486384,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 1.3801452784503632,
|
|
"grad_norm": 9.450618743896484,
|
|
"learning_rate": 4.546127502868118e-05,
|
|
"loss": 0.2413,
|
|
"num_input_tokens_seen": 490672,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 1.3922518159806296,
|
|
"grad_norm": 5.918724536895752,
|
|
"learning_rate": 4.5339105213188714e-05,
|
|
"loss": 0.2163,
|
|
"num_input_tokens_seen": 494960,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 1.4043583535108959,
|
|
"grad_norm": 2.0229716300964355,
|
|
"learning_rate": 4.521548168832227e-05,
|
|
"loss": 0.3013,
|
|
"num_input_tokens_seen": 499120,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.4164648910411621,
|
|
"grad_norm": 4.871718406677246,
|
|
"learning_rate": 4.509041328990204e-05,
|
|
"loss": 0.2324,
|
|
"num_input_tokens_seen": 503408,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 1.4285714285714286,
|
|
"grad_norm": 4.264101028442383,
|
|
"learning_rate": 4.4963908957018576e-05,
|
|
"loss": 0.1956,
|
|
"num_input_tokens_seen": 507312,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 1.4406779661016949,
|
|
"grad_norm": 0.7742087841033936,
|
|
"learning_rate": 4.483597773139386e-05,
|
|
"loss": 0.2206,
|
|
"num_input_tokens_seen": 511600,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 1.4527845036319613,
|
|
"grad_norm": 1.387762427330017,
|
|
"learning_rate": 4.470662875673506e-05,
|
|
"loss": 0.1973,
|
|
"num_input_tokens_seen": 515888,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.4648910411622276,
|
|
"grad_norm": 8.138726234436035,
|
|
"learning_rate": 4.457587127808096e-05,
|
|
"loss": 0.1848,
|
|
"num_input_tokens_seen": 519920,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 1.4769975786924938,
|
|
"grad_norm": 3.1052446365356445,
|
|
"learning_rate": 4.4443714641141255e-05,
|
|
"loss": 0.1922,
|
|
"num_input_tokens_seen": 524336,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 1.4891041162227603,
|
|
"grad_norm": 1.7755212783813477,
|
|
"learning_rate": 4.4310168291628504e-05,
|
|
"loss": 0.1922,
|
|
"num_input_tokens_seen": 528496,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 1.5012106537530268,
|
|
"grad_norm": 8.44454288482666,
|
|
"learning_rate": 4.4175241774583084e-05,
|
|
"loss": 0.1809,
|
|
"num_input_tokens_seen": 532784,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 1.5108958837772397,
|
|
"eval_loss": 0.19258780777454376,
|
|
"eval_runtime": 0.6591,
|
|
"eval_samples_per_second": 556.848,
|
|
"eval_steps_per_second": 69.796,
|
|
"num_input_tokens_seen": 536176,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 1.513317191283293,
|
|
"grad_norm": 6.506056785583496,
|
|
"learning_rate": 4.403894473369092e-05,
|
|
"loss": 0.2205,
|
|
"num_input_tokens_seen": 537136,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 1.5254237288135593,
|
|
"grad_norm": 15.012322425842285,
|
|
"learning_rate": 4.390128691059423e-05,
|
|
"loss": 0.26,
|
|
"num_input_tokens_seen": 541552,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 1.5375302663438255,
|
|
"grad_norm": 2.567143440246582,
|
|
"learning_rate": 4.3762278144195236e-05,
|
|
"loss": 0.2678,
|
|
"num_input_tokens_seen": 545648,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 1.549636803874092,
|
|
"grad_norm": 9.604016304016113,
|
|
"learning_rate": 4.362192836995299e-05,
|
|
"loss": 0.2246,
|
|
"num_input_tokens_seen": 550256,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 1.5617433414043584,
|
|
"grad_norm": 6.7328104972839355,
|
|
"learning_rate": 4.348024761917321e-05,
|
|
"loss": 0.2397,
|
|
"num_input_tokens_seen": 554928,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 1.5738498789346247,
|
|
"grad_norm": 13.930996894836426,
|
|
"learning_rate": 4.333724601829132e-05,
|
|
"loss": 0.2303,
|
|
"num_input_tokens_seen": 559344,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 1.585956416464891,
|
|
"grad_norm": 7.173315048217773,
|
|
"learning_rate": 4.319293378814868e-05,
|
|
"loss": 0.2178,
|
|
"num_input_tokens_seen": 563760,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 1.5980629539951574,
|
|
"grad_norm": 1.3246958255767822,
|
|
"learning_rate": 4.304732124326206e-05,
|
|
"loss": 0.1945,
|
|
"num_input_tokens_seen": 568112,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 1.6101694915254239,
|
|
"grad_norm": 10.188156127929688,
|
|
"learning_rate": 4.2900418791086403e-05,
|
|
"loss": 0.1908,
|
|
"num_input_tokens_seen": 572464,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 1.6222760290556901,
|
|
"grad_norm": 7.808104515075684,
|
|
"learning_rate": 4.275223693127103e-05,
|
|
"loss": 0.2026,
|
|
"num_input_tokens_seen": 576752,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 1.6343825665859564,
|
|
"grad_norm": 0.8921657204627991,
|
|
"learning_rate": 4.260278625490911e-05,
|
|
"loss": 0.1959,
|
|
"num_input_tokens_seen": 580976,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 1.6464891041162226,
|
|
"grad_norm": 6.147708892822266,
|
|
"learning_rate": 4.2452077443780744e-05,
|
|
"loss": 0.2025,
|
|
"num_input_tokens_seen": 585264,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 1.658595641646489,
|
|
"grad_norm": 5.73768424987793,
|
|
"learning_rate": 4.2300121269589475e-05,
|
|
"loss": 0.1777,
|
|
"num_input_tokens_seen": 589744,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 1.6707021791767556,
|
|
"grad_norm": 5.188973426818848,
|
|
"learning_rate": 4.214692859319237e-05,
|
|
"loss": 0.2142,
|
|
"num_input_tokens_seen": 593968,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 1.6828087167070218,
|
|
"grad_norm": 20.29938316345215,
|
|
"learning_rate": 4.19925103638238e-05,
|
|
"loss": 0.2096,
|
|
"num_input_tokens_seen": 598256,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 1.694915254237288,
|
|
"grad_norm": 3.481995105743408,
|
|
"learning_rate": 4.183687761831281e-05,
|
|
"loss": 0.1881,
|
|
"num_input_tokens_seen": 602608,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.7070217917675545,
|
|
"grad_norm": 2.9380016326904297,
|
|
"learning_rate": 4.168004148029435e-05,
|
|
"loss": 0.1678,
|
|
"num_input_tokens_seen": 607088,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 1.7191283292978208,
|
|
"grad_norm": 6.645642280578613,
|
|
"learning_rate": 4.1522013159414144e-05,
|
|
"loss": 0.243,
|
|
"num_input_tokens_seen": 611248,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 1.7312348668280872,
|
|
"grad_norm": 5.701453685760498,
|
|
"learning_rate": 4.136280395052754e-05,
|
|
"loss": 0.2024,
|
|
"num_input_tokens_seen": 615536,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 1.7433414043583535,
|
|
"grad_norm": 4.573903560638428,
|
|
"learning_rate": 4.120242523289223e-05,
|
|
"loss": 0.1803,
|
|
"num_input_tokens_seen": 619952,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 1.7554479418886197,
|
|
"grad_norm": 3.025674819946289,
|
|
"learning_rate": 4.1040888469354925e-05,
|
|
"loss": 0.1949,
|
|
"num_input_tokens_seen": 624368,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 1.7627118644067796,
|
|
"eval_loss": 0.19822187721729279,
|
|
"eval_runtime": 1.1195,
|
|
"eval_samples_per_second": 327.835,
|
|
"eval_steps_per_second": 41.091,
|
|
"num_input_tokens_seen": 626992,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 1.7675544794188862,
|
|
"grad_norm": 5.934816360473633,
|
|
"learning_rate": 4.087820520553205e-05,
|
|
"loss": 0.1935,
|
|
"num_input_tokens_seen": 628720,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 1.7796610169491527,
|
|
"grad_norm": 1.3624376058578491,
|
|
"learning_rate": 4.0714387068984574e-05,
|
|
"loss": 0.1884,
|
|
"num_input_tokens_seen": 633008,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 1.791767554479419,
|
|
"grad_norm": 2.1475796699523926,
|
|
"learning_rate": 4.05494457683869e-05,
|
|
"loss": 0.2014,
|
|
"num_input_tokens_seen": 637360,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 1.8038740920096852,
|
|
"grad_norm": 10.264263153076172,
|
|
"learning_rate": 4.038339309269002e-05,
|
|
"loss": 0.2152,
|
|
"num_input_tokens_seen": 641648,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 1.8159806295399514,
|
|
"grad_norm": 4.37279748916626,
|
|
"learning_rate": 4.021624091027895e-05,
|
|
"loss": 0.192,
|
|
"num_input_tokens_seen": 645552,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 1.828087167070218,
|
|
"grad_norm": 10.11119270324707,
|
|
"learning_rate": 4.004800116812441e-05,
|
|
"loss": 0.3049,
|
|
"num_input_tokens_seen": 649904,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 1.8401937046004844,
|
|
"grad_norm": 0.4716910719871521,
|
|
"learning_rate": 3.987868589092893e-05,
|
|
"loss": 0.184,
|
|
"num_input_tokens_seen": 654128,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 1.8523002421307506,
|
|
"grad_norm": 8.259904861450195,
|
|
"learning_rate": 3.9708307180267456e-05,
|
|
"loss": 0.1914,
|
|
"num_input_tokens_seen": 658672,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 1.8644067796610169,
|
|
"grad_norm": 14.706856727600098,
|
|
"learning_rate": 3.953687721372233e-05,
|
|
"loss": 0.4553,
|
|
"num_input_tokens_seen": 663088,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 1.8765133171912833,
|
|
"grad_norm": 9.08963394165039,
|
|
"learning_rate": 3.936440824401299e-05,
|
|
"loss": 0.1709,
|
|
"num_input_tokens_seen": 667440,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 1.8886198547215496,
|
|
"grad_norm": 4.246565818786621,
|
|
"learning_rate": 3.919091259812013e-05,
|
|
"loss": 0.1831,
|
|
"num_input_tokens_seen": 671792,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 1.900726392251816,
|
|
"grad_norm": 11.860783576965332,
|
|
"learning_rate": 3.9016402676404753e-05,
|
|
"loss": 0.2175,
|
|
"num_input_tokens_seen": 676336,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 1.9128329297820823,
|
|
"grad_norm": 5.474867820739746,
|
|
"learning_rate": 3.884089095172181e-05,
|
|
"loss": 0.18,
|
|
"num_input_tokens_seen": 680624,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 1.9249394673123486,
|
|
"grad_norm": 2.7666966915130615,
|
|
"learning_rate": 3.866438996852872e-05,
|
|
"loss": 0.1914,
|
|
"num_input_tokens_seen": 685040,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 1.937046004842615,
|
|
"grad_norm": 10.039326667785645,
|
|
"learning_rate": 3.848691234198879e-05,
|
|
"loss": 0.1935,
|
|
"num_input_tokens_seen": 689392,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.9491525423728815,
|
|
"grad_norm": 3.919206142425537,
|
|
"learning_rate": 3.830847075706956e-05,
|
|
"loss": 0.2046,
|
|
"num_input_tokens_seen": 693552,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 1.9612590799031477,
|
|
"grad_norm": 16.429906845092773,
|
|
"learning_rate": 3.812907796763616e-05,
|
|
"loss": 0.2291,
|
|
"num_input_tokens_seen": 698032,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 1.973365617433414,
|
|
"grad_norm": 6.558701992034912,
|
|
"learning_rate": 3.7948746795539745e-05,
|
|
"loss": 0.1751,
|
|
"num_input_tokens_seen": 702000,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 1.9854721549636802,
|
|
"grad_norm": 8.950061798095703,
|
|
"learning_rate": 3.776749012970105e-05,
|
|
"loss": 0.1795,
|
|
"num_input_tokens_seen": 706160,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.9975786924939467,
|
|
"grad_norm": 3.701720714569092,
|
|
"learning_rate": 3.758532092518924e-05,
|
|
"loss": 0.1852,
|
|
"num_input_tokens_seen": 710768,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 2.009685230024213,
|
|
"grad_norm": 6.777426719665527,
|
|
"learning_rate": 3.740225220229587e-05,
|
|
"loss": 0.256,
|
|
"num_input_tokens_seen": 714744,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 2.0145278450363198,
|
|
"eval_loss": 0.1934857964515686,
|
|
"eval_runtime": 0.6627,
|
|
"eval_samples_per_second": 553.776,
|
|
"eval_steps_per_second": 69.411,
|
|
"num_input_tokens_seen": 716344,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 2.0217917675544794,
|
|
"grad_norm": 7.20669412612915,
|
|
"learning_rate": 3.721829704560436e-05,
|
|
"loss": 0.1878,
|
|
"num_input_tokens_seen": 718776,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 2.0338983050847457,
|
|
"grad_norm": 6.232179164886475,
|
|
"learning_rate": 3.7033468603054725e-05,
|
|
"loss": 0.2215,
|
|
"num_input_tokens_seen": 722744,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 2.046004842615012,
|
|
"grad_norm": 8.393187522888184,
|
|
"learning_rate": 3.6847780085003905e-05,
|
|
"loss": 0.1657,
|
|
"num_input_tokens_seen": 727160,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 2.0581113801452786,
|
|
"grad_norm": 9.579306602478027,
|
|
"learning_rate": 3.666124476328155e-05,
|
|
"loss": 0.1957,
|
|
"num_input_tokens_seen": 731576,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 2.070217917675545,
|
|
"grad_norm": 8.12859058380127,
|
|
"learning_rate": 3.647387597024139e-05,
|
|
"loss": 0.1881,
|
|
"num_input_tokens_seen": 736184,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 2.082324455205811,
|
|
"grad_norm": 11.758556365966797,
|
|
"learning_rate": 3.6285687097808394e-05,
|
|
"loss": 0.2041,
|
|
"num_input_tokens_seen": 740472,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 2.0944309927360774,
|
|
"grad_norm": 1.7637454271316528,
|
|
"learning_rate": 3.609669159652158e-05,
|
|
"loss": 0.213,
|
|
"num_input_tokens_seen": 744760,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 2.106537530266344,
|
|
"grad_norm": 5.633957386016846,
|
|
"learning_rate": 3.590690297457262e-05,
|
|
"loss": 0.1913,
|
|
"num_input_tokens_seen": 749176,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 2.1186440677966103,
|
|
"grad_norm": 4.531621932983398,
|
|
"learning_rate": 3.57163347968404e-05,
|
|
"loss": 0.1961,
|
|
"num_input_tokens_seen": 753528,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 2.1307506053268765,
|
|
"grad_norm": 6.524752140045166,
|
|
"learning_rate": 3.552500068392147e-05,
|
|
"loss": 0.1981,
|
|
"num_input_tokens_seen": 757688,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 2.142857142857143,
|
|
"grad_norm": 5.924046516418457,
|
|
"learning_rate": 3.533291431115653e-05,
|
|
"loss": 0.2002,
|
|
"num_input_tokens_seen": 762040,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 2.154963680387409,
|
|
"grad_norm": 4.7628068923950195,
|
|
"learning_rate": 3.514008940765304e-05,
|
|
"loss": 0.1856,
|
|
"num_input_tokens_seen": 766200,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 2.1670702179176757,
|
|
"grad_norm": 9.14155101776123,
|
|
"learning_rate": 3.494653975530388e-05,
|
|
"loss": 0.2107,
|
|
"num_input_tokens_seen": 770680,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 2.179176755447942,
|
|
"grad_norm": 7.742560386657715,
|
|
"learning_rate": 3.475227918780239e-05,
|
|
"loss": 0.1771,
|
|
"num_input_tokens_seen": 774840,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 2.1912832929782082,
|
|
"grad_norm": 1.2218825817108154,
|
|
"learning_rate": 3.4557321589653556e-05,
|
|
"loss": 0.1924,
|
|
"num_input_tokens_seen": 779192,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 2.2033898305084745,
|
|
"grad_norm": 10.382070541381836,
|
|
"learning_rate": 3.436168089518168e-05,
|
|
"loss": 0.1687,
|
|
"num_input_tokens_seen": 783608,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 2.2154963680387407,
|
|
"grad_norm": 2.07893967628479,
|
|
"learning_rate": 3.416537108753443e-05,
|
|
"loss": 0.1922,
|
|
"num_input_tokens_seen": 788088,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 2.2276029055690074,
|
|
"grad_norm": 14.99792194366455,
|
|
"learning_rate": 3.3968406197683376e-05,
|
|
"loss": 0.1721,
|
|
"num_input_tokens_seen": 792568,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 2.2397094430992737,
|
|
"grad_norm": 4.237668037414551,
|
|
"learning_rate": 3.3770800303421254e-05,
|
|
"loss": 0.2058,
|
|
"num_input_tokens_seen": 797176,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 2.25181598062954,
|
|
"grad_norm": 2.3142411708831787,
|
|
"learning_rate": 3.357256752835561e-05,
|
|
"loss": 0.1925,
|
|
"num_input_tokens_seen": 801400,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 2.263922518159806,
|
|
"grad_norm": 3.0918896198272705,
|
|
"learning_rate": 3.3373722040899517e-05,
|
|
"loss": 0.1601,
|
|
"num_input_tokens_seen": 805944,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 2.2663438256658597,
|
|
"eval_loss": 0.38670673966407776,
|
|
"eval_runtime": 2.26,
|
|
"eval_samples_per_second": 162.386,
|
|
"eval_steps_per_second": 20.354,
|
|
"num_input_tokens_seen": 806712,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 2.2760290556900724,
|
|
"grad_norm": 3.9013168811798096,
|
|
"learning_rate": 3.317427805325875e-05,
|
|
"loss": 0.9421,
|
|
"num_input_tokens_seen": 810040,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 2.288135593220339,
|
|
"grad_norm": 1.7496920824050903,
|
|
"learning_rate": 3.297424982041609e-05,
|
|
"loss": 0.191,
|
|
"num_input_tokens_seen": 814392,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 2.3002421307506054,
|
|
"grad_norm": 6.5397491455078125,
|
|
"learning_rate": 3.277365163911243e-05,
|
|
"loss": 0.1962,
|
|
"num_input_tokens_seen": 818872,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 2.3123486682808716,
|
|
"grad_norm": 2.407987594604492,
|
|
"learning_rate": 3.257249784682492e-05,
|
|
"loss": 0.2261,
|
|
"num_input_tokens_seen": 823096,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 2.324455205811138,
|
|
"grad_norm": 3.1127803325653076,
|
|
"learning_rate": 3.2370802820742275e-05,
|
|
"loss": 0.1945,
|
|
"num_input_tokens_seen": 827128,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 2.3365617433414045,
|
|
"grad_norm": 10.151595115661621,
|
|
"learning_rate": 3.2168580976737104e-05,
|
|
"loss": 0.2272,
|
|
"num_input_tokens_seen": 831288,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 2.348668280871671,
|
|
"grad_norm": 1.2875597476959229,
|
|
"learning_rate": 3.196584676833562e-05,
|
|
"loss": 0.1824,
|
|
"num_input_tokens_seen": 835640,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 2.360774818401937,
|
|
"grad_norm": 0.8216660022735596,
|
|
"learning_rate": 3.1762614685684567e-05,
|
|
"loss": 0.156,
|
|
"num_input_tokens_seen": 839736,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 2.3728813559322033,
|
|
"grad_norm": 7.343863010406494,
|
|
"learning_rate": 3.155889925451557e-05,
|
|
"loss": 0.2199,
|
|
"num_input_tokens_seen": 844024,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 2.38498789346247,
|
|
"grad_norm": 2.2787206172943115,
|
|
"learning_rate": 3.1354715035106894e-05,
|
|
"loss": 0.1885,
|
|
"num_input_tokens_seen": 848248,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 2.3970944309927362,
|
|
"grad_norm": 6.654670238494873,
|
|
"learning_rate": 3.1150076621242816e-05,
|
|
"loss": 0.1645,
|
|
"num_input_tokens_seen": 852472,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 2.4092009685230025,
|
|
"grad_norm": 3.4156064987182617,
|
|
"learning_rate": 3.0944998639170544e-05,
|
|
"loss": 0.1747,
|
|
"num_input_tokens_seen": 856824,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 2.4213075060532687,
|
|
"grad_norm": 0.4972361624240875,
|
|
"learning_rate": 3.073949574655479e-05,
|
|
"loss": 0.1751,
|
|
"num_input_tokens_seen": 860984,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 2.433414043583535,
|
|
"grad_norm": 0.7988845705986023,
|
|
"learning_rate": 3.053358263143015e-05,
|
|
"loss": 0.1975,
|
|
"num_input_tokens_seen": 865272,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 2.4455205811138017,
|
|
"grad_norm": 5.293003082275391,
|
|
"learning_rate": 3.032727401115135e-05,
|
|
"loss": 0.1765,
|
|
"num_input_tokens_seen": 869560,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 2.457627118644068,
|
|
"grad_norm": 3.4668216705322266,
|
|
"learning_rate": 3.012058463134126e-05,
|
|
"loss": 0.1624,
|
|
"num_input_tokens_seen": 873976,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 2.469733656174334,
|
|
"grad_norm": 1.981259822845459,
|
|
"learning_rate": 2.991352926483702e-05,
|
|
"loss": 0.2237,
|
|
"num_input_tokens_seen": 878200,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 2.4818401937046004,
|
|
"grad_norm": 15.534086227416992,
|
|
"learning_rate": 2.9706122710634165e-05,
|
|
"loss": 0.2024,
|
|
"num_input_tokens_seen": 882872,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 2.4939467312348667,
|
|
"grad_norm": 2.0866310596466064,
|
|
"learning_rate": 2.949837979282889e-05,
|
|
"loss": 0.2673,
|
|
"num_input_tokens_seen": 887096,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 2.5060532687651333,
|
|
"grad_norm": 1.296164870262146,
|
|
"learning_rate": 2.92903153595585e-05,
|
|
"loss": 0.2168,
|
|
"num_input_tokens_seen": 891576,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 2.5181598062953996,
|
|
"grad_norm": 3.0610435009002686,
|
|
"learning_rate": 2.908194428194019e-05,
|
|
"loss": 0.1768,
|
|
"num_input_tokens_seen": 895736,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 2.5181598062953996,
|
|
"eval_loss": 0.1943914145231247,
|
|
"eval_runtime": 0.6714,
|
|
"eval_samples_per_second": 546.608,
|
|
"eval_steps_per_second": 68.512,
|
|
"num_input_tokens_seen": 895736,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 2.530266343825666,
|
|
"grad_norm": 13.436739921569824,
|
|
"learning_rate": 2.88732814530081e-05,
|
|
"loss": 0.1555,
|
|
"num_input_tokens_seen": 900024,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 2.542372881355932,
|
|
"grad_norm": 9.469161987304688,
|
|
"learning_rate": 2.866434178664893e-05,
|
|
"loss": 0.1744,
|
|
"num_input_tokens_seen": 904440,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 2.5544794188861983,
|
|
"grad_norm": 6.683951377868652,
|
|
"learning_rate": 2.8455140216535947e-05,
|
|
"loss": 0.1842,
|
|
"num_input_tokens_seen": 908728,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 2.566585956416465,
|
|
"grad_norm": 4.156672954559326,
|
|
"learning_rate": 2.8245691695061604e-05,
|
|
"loss": 0.2018,
|
|
"num_input_tokens_seen": 913016,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 2.5786924939467313,
|
|
"grad_norm": 2.5280745029449463,
|
|
"learning_rate": 2.8036011192268863e-05,
|
|
"loss": 0.2027,
|
|
"num_input_tokens_seen": 917304,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 2.5907990314769975,
|
|
"grad_norm": 3.3346853256225586,
|
|
"learning_rate": 2.7826113694781252e-05,
|
|
"loss": 0.1984,
|
|
"num_input_tokens_seen": 921528,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 2.6029055690072638,
|
|
"grad_norm": 6.732588768005371,
|
|
"learning_rate": 2.761601420473168e-05,
|
|
"loss": 0.1674,
|
|
"num_input_tokens_seen": 925944,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 2.61501210653753,
|
|
"grad_norm": 5.7978363037109375,
|
|
"learning_rate": 2.740572773869019e-05,
|
|
"loss": 0.1523,
|
|
"num_input_tokens_seen": 930744,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 2.6271186440677967,
|
|
"grad_norm": 4.692154884338379,
|
|
"learning_rate": 2.7195269326590682e-05,
|
|
"loss": 0.1263,
|
|
"num_input_tokens_seen": 935352,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 2.639225181598063,
|
|
"grad_norm": 8.889333724975586,
|
|
"learning_rate": 2.6984654010656667e-05,
|
|
"loss": 0.1656,
|
|
"num_input_tokens_seen": 939640,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 2.651331719128329,
|
|
"grad_norm": 4.259967803955078,
|
|
"learning_rate": 2.6773896844326125e-05,
|
|
"loss": 0.2926,
|
|
"num_input_tokens_seen": 943672,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 2.663438256658596,
|
|
"grad_norm": 3.0391273498535156,
|
|
"learning_rate": 2.656301289117561e-05,
|
|
"loss": 0.1547,
|
|
"num_input_tokens_seen": 947704,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 2.6755447941888617,
|
|
"grad_norm": 9.067920684814453,
|
|
"learning_rate": 2.6352017223843585e-05,
|
|
"loss": 0.2428,
|
|
"num_input_tokens_seen": 951928,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 2.6876513317191284,
|
|
"grad_norm": 7.765347957611084,
|
|
"learning_rate": 2.6140924922953125e-05,
|
|
"loss": 0.1649,
|
|
"num_input_tokens_seen": 956216,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 2.6997578692493946,
|
|
"grad_norm": 1.6490931510925293,
|
|
"learning_rate": 2.5929751076034058e-05,
|
|
"loss": 0.1597,
|
|
"num_input_tokens_seen": 960504,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 2.711864406779661,
|
|
"grad_norm": 1.5548573732376099,
|
|
"learning_rate": 2.571851077644461e-05,
|
|
"loss": 0.1407,
|
|
"num_input_tokens_seen": 965048,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 2.7239709443099276,
|
|
"grad_norm": 5.526769161224365,
|
|
"learning_rate": 2.5507219122292598e-05,
|
|
"loss": 0.1667,
|
|
"num_input_tokens_seen": 969208,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 2.736077481840194,
|
|
"grad_norm": 5.792220115661621,
|
|
"learning_rate": 2.529589121535636e-05,
|
|
"loss": 0.1438,
|
|
"num_input_tokens_seen": 973624,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 2.74818401937046,
|
|
"grad_norm": 6.361023902893066,
|
|
"learning_rate": 2.5084542160005335e-05,
|
|
"loss": 0.2294,
|
|
"num_input_tokens_seen": 977976,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 2.7602905569007263,
|
|
"grad_norm": 1.0617471933364868,
|
|
"learning_rate": 2.487318706212051e-05,
|
|
"loss": 0.1964,
|
|
"num_input_tokens_seen": 982200,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 2.7699757869249395,
|
|
"eval_loss": 0.19318054616451263,
|
|
"eval_runtime": 0.6508,
|
|
"eval_samples_per_second": 563.894,
|
|
"eval_steps_per_second": 70.679,
|
|
"num_input_tokens_seen": 985592,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"epoch": 2.7723970944309926,
|
|
"grad_norm": 7.693630695343018,
|
|
"learning_rate": 2.4661841028014785e-05,
|
|
"loss": 0.203,
|
|
"num_input_tokens_seen": 986488,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 2.7845036319612593,
|
|
"grad_norm": 4.296042442321777,
|
|
"learning_rate": 2.445051916335321e-05,
|
|
"loss": 0.1983,
|
|
"num_input_tokens_seen": 990456,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 2.7966101694915255,
|
|
"grad_norm": 2.928414821624756,
|
|
"learning_rate": 2.4239236572073352e-05,
|
|
"loss": 0.1825,
|
|
"num_input_tokens_seen": 994744,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 2.8087167070217918,
|
|
"grad_norm": 2.411320686340332,
|
|
"learning_rate": 2.4028008355305815e-05,
|
|
"loss": 0.178,
|
|
"num_input_tokens_seen": 999160,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 2.820823244552058,
|
|
"grad_norm": 6.881911754608154,
|
|
"learning_rate": 2.3816849610294783e-05,
|
|
"loss": 0.1709,
|
|
"num_input_tokens_seen": 1003256,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 2.8329297820823243,
|
|
"grad_norm": 4.286351680755615,
|
|
"learning_rate": 2.3605775429319115e-05,
|
|
"loss": 0.1853,
|
|
"num_input_tokens_seen": 1007480,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 2.845036319612591,
|
|
"grad_norm": 3.7688863277435303,
|
|
"learning_rate": 2.3394800898613535e-05,
|
|
"loss": 0.1431,
|
|
"num_input_tokens_seen": 1011896,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 2.857142857142857,
|
|
"grad_norm": 3.717094898223877,
|
|
"learning_rate": 2.318394109729041e-05,
|
|
"loss": 0.2253,
|
|
"num_input_tokens_seen": 1015992,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 2.8692493946731235,
|
|
"grad_norm": 7.443727493286133,
|
|
"learning_rate": 2.297321109626198e-05,
|
|
"loss": 0.1686,
|
|
"num_input_tokens_seen": 1020408,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 2.8813559322033897,
|
|
"grad_norm": 12.574480056762695,
|
|
"learning_rate": 2.27626259571632e-05,
|
|
"loss": 0.1988,
|
|
"num_input_tokens_seen": 1025016,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 2.893462469733656,
|
|
"grad_norm": 9.311829566955566,
|
|
"learning_rate": 2.2552200731275213e-05,
|
|
"loss": 0.1682,
|
|
"num_input_tokens_seen": 1029368,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 2.9055690072639226,
|
|
"grad_norm": 4.659236431121826,
|
|
"learning_rate": 2.2341950458449576e-05,
|
|
"loss": 0.1918,
|
|
"num_input_tokens_seen": 1033592,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 2.917675544794189,
|
|
"grad_norm": 1.1926063299179077,
|
|
"learning_rate": 2.213189016603333e-05,
|
|
"loss": 0.2047,
|
|
"num_input_tokens_seen": 1037688,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 2.929782082324455,
|
|
"grad_norm": 1.54401433467865,
|
|
"learning_rate": 2.1922034867794925e-05,
|
|
"loss": 0.1686,
|
|
"num_input_tokens_seen": 1041912,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 2.9418886198547214,
|
|
"grad_norm": 6.956883430480957,
|
|
"learning_rate": 2.1712399562851147e-05,
|
|
"loss": 0.1663,
|
|
"num_input_tokens_seen": 1046392,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 2.9539951573849876,
|
|
"grad_norm": 6.875396728515625,
|
|
"learning_rate": 2.150299923459505e-05,
|
|
"loss": 0.1158,
|
|
"num_input_tokens_seen": 1050616,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 2.9661016949152543,
|
|
"grad_norm": 4.653652191162109,
|
|
"learning_rate": 2.1293848849625065e-05,
|
|
"loss": 0.1857,
|
|
"num_input_tokens_seen": 1054840,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 2.9782082324455206,
|
|
"grad_norm": 4.641164302825928,
|
|
"learning_rate": 2.108496335667527e-05,
|
|
"loss": 0.2051,
|
|
"num_input_tokens_seen": 1058936,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 2.990314769975787,
|
|
"grad_norm": 4.4205002784729,
|
|
"learning_rate": 2.0876357685546944e-05,
|
|
"loss": 0.137,
|
|
"num_input_tokens_seen": 1063288,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 3.002421307506053,
|
|
"grad_norm": 9.87366771697998,
|
|
"learning_rate": 2.06680467460415e-05,
|
|
"loss": 0.294,
|
|
"num_input_tokens_seen": 1067392,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 3.0145278450363198,
|
|
"grad_norm": 1.3809499740600586,
|
|
"learning_rate": 2.0460045426894817e-05,
|
|
"loss": 0.1436,
|
|
"num_input_tokens_seen": 1071872,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 3.0217917675544794,
|
|
"eval_loss": 0.20527909696102142,
|
|
"eval_runtime": 0.667,
|
|
"eval_samples_per_second": 550.187,
|
|
"eval_steps_per_second": 68.961,
|
|
"num_input_tokens_seen": 1074624,
|
|
"step": 1248
|
|
},
|
|
{
|
|
"epoch": 3.026634382566586,
|
|
"grad_norm": 1.1184051036834717,
|
|
"learning_rate": 2.0252368594713083e-05,
|
|
"loss": 0.1503,
|
|
"num_input_tokens_seen": 1076416,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 3.0387409200968523,
|
|
"grad_norm": 3.941237211227417,
|
|
"learning_rate": 2.004503109291023e-05,
|
|
"loss": 0.156,
|
|
"num_input_tokens_seen": 1080512,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 3.0508474576271185,
|
|
"grad_norm": 2.0000264644622803,
|
|
"learning_rate": 1.9838047740647026e-05,
|
|
"loss": 0.1971,
|
|
"num_input_tokens_seen": 1084608,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 3.062953995157385,
|
|
"grad_norm": 11.35123062133789,
|
|
"learning_rate": 1.9631433331771886e-05,
|
|
"loss": 0.1813,
|
|
"num_input_tokens_seen": 1089024,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 3.0750605326876514,
|
|
"grad_norm": 2.1008217334747314,
|
|
"learning_rate": 1.9425202633763513e-05,
|
|
"loss": 0.133,
|
|
"num_input_tokens_seen": 1093376,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 3.0871670702179177,
|
|
"grad_norm": 5.499813556671143,
|
|
"learning_rate": 1.9219370386675388e-05,
|
|
"loss": 0.089,
|
|
"num_input_tokens_seen": 1097728,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 3.099273607748184,
|
|
"grad_norm": 8.502225875854492,
|
|
"learning_rate": 1.901395130208229e-05,
|
|
"loss": 0.2836,
|
|
"num_input_tokens_seen": 1101888,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 3.11138014527845,
|
|
"grad_norm": 14.45283031463623,
|
|
"learning_rate": 1.880896006202876e-05,
|
|
"loss": 0.1116,
|
|
"num_input_tokens_seen": 1106176,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 3.123486682808717,
|
|
"grad_norm": 3.364891767501831,
|
|
"learning_rate": 1.860441131797977e-05,
|
|
"loss": 0.1027,
|
|
"num_input_tokens_seen": 1110272,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 3.135593220338983,
|
|
"grad_norm": 8.516124725341797,
|
|
"learning_rate": 1.8400319689773474e-05,
|
|
"loss": 0.1582,
|
|
"num_input_tokens_seen": 1114496,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 3.1476997578692494,
|
|
"grad_norm": 11.724932670593262,
|
|
"learning_rate": 1.8196699764576318e-05,
|
|
"loss": 0.0408,
|
|
"num_input_tokens_seen": 1118784,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 3.1598062953995156,
|
|
"grad_norm": 8.753253936767578,
|
|
"learning_rate": 1.7993566095840443e-05,
|
|
"loss": 0.1234,
|
|
"num_input_tokens_seen": 1123008,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 3.171912832929782,
|
|
"grad_norm": 8.221136093139648,
|
|
"learning_rate": 1.7790933202263434e-05,
|
|
"loss": 0.2236,
|
|
"num_input_tokens_seen": 1127424,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 3.1840193704600486,
|
|
"grad_norm": 17.435853958129883,
|
|
"learning_rate": 1.758881556675073e-05,
|
|
"loss": 0.1958,
|
|
"num_input_tokens_seen": 1131840,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 3.196125907990315,
|
|
"grad_norm": 5.691689491271973,
|
|
"learning_rate": 1.738722763538036e-05,
|
|
"loss": 0.1238,
|
|
"num_input_tokens_seen": 1136192,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 3.208232445520581,
|
|
"grad_norm": 2.6163206100463867,
|
|
"learning_rate": 1.7186183816370522e-05,
|
|
"loss": 0.1027,
|
|
"num_input_tokens_seen": 1140544,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 3.2203389830508473,
|
|
"grad_norm": 5.7949724197387695,
|
|
"learning_rate": 1.6985698479049702e-05,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 1145280,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 3.232445520581114,
|
|
"grad_norm": 5.007083892822266,
|
|
"learning_rate": 1.6785785952829717e-05,
|
|
"loss": 0.1037,
|
|
"num_input_tokens_seen": 1149888,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 3.2445520581113803,
|
|
"grad_norm": 12.367361068725586,
|
|
"learning_rate": 1.6586460526181473e-05,
|
|
"loss": 0.1776,
|
|
"num_input_tokens_seen": 1153920,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 3.2566585956416465,
|
|
"grad_norm": 16.06878089904785,
|
|
"learning_rate": 1.6387736445613772e-05,
|
|
"loss": 0.2125,
|
|
"num_input_tokens_seen": 1158592,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 3.2687651331719128,
|
|
"grad_norm": 7.7484588623046875,
|
|
"learning_rate": 1.6189627914655008e-05,
|
|
"loss": 0.2252,
|
|
"num_input_tokens_seen": 1162816,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 3.2736077481840193,
|
|
"eval_loss": 0.2091810256242752,
|
|
"eval_runtime": 0.6785,
|
|
"eval_samples_per_second": 540.886,
|
|
"eval_steps_per_second": 67.795,
|
|
"num_input_tokens_seen": 1164544,
|
|
"step": 1352
|
|
},
|
|
{
|
|
"epoch": 3.280871670702179,
|
|
"grad_norm": 9.04961109161377,
|
|
"learning_rate": 1.599214909283805e-05,
|
|
"loss": 0.1163,
|
|
"num_input_tokens_seen": 1167232,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 3.2929782082324457,
|
|
"grad_norm": 3.317920446395874,
|
|
"learning_rate": 1.579531409468815e-05,
|
|
"loss": 0.1094,
|
|
"num_input_tokens_seen": 1171648,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 3.305084745762712,
|
|
"grad_norm": 8.250765800476074,
|
|
"learning_rate": 1.5599136988714186e-05,
|
|
"loss": 0.141,
|
|
"num_input_tokens_seen": 1175808,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 3.317191283292978,
|
|
"grad_norm": 5.985897541046143,
|
|
"learning_rate": 1.5403631796403085e-05,
|
|
"loss": 0.1296,
|
|
"num_input_tokens_seen": 1180224,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 3.3292978208232444,
|
|
"grad_norm": 4.8227314949035645,
|
|
"learning_rate": 1.520881249121767e-05,
|
|
"loss": 0.1375,
|
|
"num_input_tokens_seen": 1184704,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 3.341404358353511,
|
|
"grad_norm": 2.318727970123291,
|
|
"learning_rate": 1.5014692997597962e-05,
|
|
"loss": 0.1459,
|
|
"num_input_tokens_seen": 1188992,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 3.3535108958837774,
|
|
"grad_norm": 13.753244400024414,
|
|
"learning_rate": 1.4821287189965866e-05,
|
|
"loss": 0.1535,
|
|
"num_input_tokens_seen": 1193408,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 3.3656174334140436,
|
|
"grad_norm": 1.9978270530700684,
|
|
"learning_rate": 1.4628608891733625e-05,
|
|
"loss": 0.1246,
|
|
"num_input_tokens_seen": 1197760,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 3.37772397094431,
|
|
"grad_norm": 6.705835819244385,
|
|
"learning_rate": 1.4436671874315722e-05,
|
|
"loss": 0.0863,
|
|
"num_input_tokens_seen": 1201792,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 3.389830508474576,
|
|
"grad_norm": 7.748871326446533,
|
|
"learning_rate": 1.4245489856144634e-05,
|
|
"loss": 0.0968,
|
|
"num_input_tokens_seen": 1205824,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 3.401937046004843,
|
|
"grad_norm": 4.018503189086914,
|
|
"learning_rate": 1.4055076501690311e-05,
|
|
"loss": 0.0749,
|
|
"num_input_tokens_seen": 1210240,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 3.414043583535109,
|
|
"grad_norm": 4.750000953674316,
|
|
"learning_rate": 1.3865445420483526e-05,
|
|
"loss": 0.09,
|
|
"num_input_tokens_seen": 1214464,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 3.4261501210653753,
|
|
"grad_norm": 9.335100173950195,
|
|
"learning_rate": 1.367661016614315e-05,
|
|
"loss": 0.1746,
|
|
"num_input_tokens_seen": 1218752,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 3.4382566585956416,
|
|
"grad_norm": 4.242533206939697,
|
|
"learning_rate": 1.3488584235407439e-05,
|
|
"loss": 0.0826,
|
|
"num_input_tokens_seen": 1223168,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 3.450363196125908,
|
|
"grad_norm": 1.9875125885009766,
|
|
"learning_rate": 1.3301381067169366e-05,
|
|
"loss": 0.1469,
|
|
"num_input_tokens_seen": 1227328,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 3.4624697336561745,
|
|
"grad_norm": 10.304492950439453,
|
|
"learning_rate": 1.3115014041516089e-05,
|
|
"loss": 0.1454,
|
|
"num_input_tokens_seen": 1231360,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 3.4745762711864407,
|
|
"grad_norm": 2.467794418334961,
|
|
"learning_rate": 1.2929496478772635e-05,
|
|
"loss": 0.0455,
|
|
"num_input_tokens_seen": 1235456,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 3.486682808716707,
|
|
"grad_norm": 5.000001907348633,
|
|
"learning_rate": 1.2744841638549842e-05,
|
|
"loss": 0.106,
|
|
"num_input_tokens_seen": 1239616,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 3.4987893462469732,
|
|
"grad_norm": 0.32030388712882996,
|
|
"learning_rate": 1.2561062718796662e-05,
|
|
"loss": 0.0763,
|
|
"num_input_tokens_seen": 1243968,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 3.5108958837772395,
|
|
"grad_norm": 1.8182225227355957,
|
|
"learning_rate": 1.2378172854856831e-05,
|
|
"loss": 0.0978,
|
|
"num_input_tokens_seen": 1248128,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 3.523002421307506,
|
|
"grad_norm": 5.48933219909668,
|
|
"learning_rate": 1.2196185118530063e-05,
|
|
"loss": 0.1328,
|
|
"num_input_tokens_seen": 1252288,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 3.5254237288135593,
|
|
"eval_loss": 0.3491859436035156,
|
|
"eval_runtime": 0.6747,
|
|
"eval_samples_per_second": 543.942,
|
|
"eval_steps_per_second": 68.178,
|
|
"num_input_tokens_seen": 1253248,
|
|
"step": 1456
|
|
},
|
|
{
|
|
"epoch": 3.5351089588377724,
|
|
"grad_norm": 1.86709725856781,
|
|
"learning_rate": 1.2015112517137744e-05,
|
|
"loss": 0.1139,
|
|
"num_input_tokens_seen": 1256640,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 3.5472154963680387,
|
|
"grad_norm": 10.584001541137695,
|
|
"learning_rate": 1.183496799259326e-05,
|
|
"loss": 0.1247,
|
|
"num_input_tokens_seen": 1261440,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 3.559322033898305,
|
|
"grad_norm": 0.81782066822052,
|
|
"learning_rate": 1.1655764420476988e-05,
|
|
"loss": 0.0777,
|
|
"num_input_tokens_seen": 1265664,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 3.571428571428571,
|
|
"grad_norm": 4.23323917388916,
|
|
"learning_rate": 1.1477514609116039e-05,
|
|
"loss": 0.0848,
|
|
"num_input_tokens_seen": 1270016,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 3.583535108958838,
|
|
"grad_norm": 4.22898006439209,
|
|
"learning_rate": 1.1300231298668786e-05,
|
|
"loss": 0.1263,
|
|
"num_input_tokens_seen": 1274560,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 3.595641646489104,
|
|
"grad_norm": 7.585851669311523,
|
|
"learning_rate": 1.1123927160214289e-05,
|
|
"loss": 0.1362,
|
|
"num_input_tokens_seen": 1278976,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 3.6077481840193704,
|
|
"grad_norm": 2.0685174465179443,
|
|
"learning_rate": 1.0948614794846668e-05,
|
|
"loss": 0.1068,
|
|
"num_input_tokens_seen": 1283200,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 3.619854721549637,
|
|
"grad_norm": 4.345080852508545,
|
|
"learning_rate": 1.0774306732774414e-05,
|
|
"loss": 0.2069,
|
|
"num_input_tokens_seen": 1287296,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 3.6319612590799033,
|
|
"grad_norm": 15.997807502746582,
|
|
"learning_rate": 1.0601015432424819e-05,
|
|
"loss": 0.1368,
|
|
"num_input_tokens_seen": 1291712,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 3.6440677966101696,
|
|
"grad_norm": 6.712691783905029,
|
|
"learning_rate": 1.042875327955356e-05,
|
|
"loss": 0.1959,
|
|
"num_input_tokens_seen": 1295936,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 3.656174334140436,
|
|
"grad_norm": 5.0442962646484375,
|
|
"learning_rate": 1.0257532586359422e-05,
|
|
"loss": 0.0932,
|
|
"num_input_tokens_seen": 1300608,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 3.668280871670702,
|
|
"grad_norm": 5.707069396972656,
|
|
"learning_rate": 1.0087365590604289e-05,
|
|
"loss": 0.1347,
|
|
"num_input_tokens_seen": 1305024,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 3.6803874092009687,
|
|
"grad_norm": 2.964393138885498,
|
|
"learning_rate": 9.918264454738504e-06,
|
|
"loss": 0.1287,
|
|
"num_input_tokens_seen": 1309376,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 3.692493946731235,
|
|
"grad_norm": 10.144442558288574,
|
|
"learning_rate": 9.75024126503153e-06,
|
|
"loss": 0.0818,
|
|
"num_input_tokens_seen": 1313664,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 3.7046004842615012,
|
|
"grad_norm": 8.710615158081055,
|
|
"learning_rate": 9.583308030708135e-06,
|
|
"loss": 0.0869,
|
|
"num_input_tokens_seen": 1318080,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 3.7167070217917675,
|
|
"grad_norm": 2.1846084594726562,
|
|
"learning_rate": 9.417476683090007e-06,
|
|
"loss": 0.0893,
|
|
"num_input_tokens_seen": 1322432,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 3.7288135593220337,
|
|
"grad_norm": 3.826754570007324,
|
|
"learning_rate": 9.252759074743034e-06,
|
|
"loss": 0.1556,
|
|
"num_input_tokens_seen": 1326848,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 3.7409200968523004,
|
|
"grad_norm": 10.382698059082031,
|
|
"learning_rate": 9.08916697863014e-06,
|
|
"loss": 0.0774,
|
|
"num_input_tokens_seen": 1331328,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 3.7530266343825667,
|
|
"grad_norm": 7.099722862243652,
|
|
"learning_rate": 8.926712087269801e-06,
|
|
"loss": 0.1253,
|
|
"num_input_tokens_seen": 1335424,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 3.765133171912833,
|
|
"grad_norm": 5.015311241149902,
|
|
"learning_rate": 8.765406011900368e-06,
|
|
"loss": 0.1276,
|
|
"num_input_tokens_seen": 1339712,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 3.777239709443099,
|
|
"grad_norm": 4.82669734954834,
|
|
"learning_rate": 8.605260281650152e-06,
|
|
"loss": 0.1842,
|
|
"num_input_tokens_seen": 1344000,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 3.777239709443099,
|
|
"eval_loss": 0.21899566054344177,
|
|
"eval_runtime": 0.6796,
|
|
"eval_samples_per_second": 539.994,
|
|
"eval_steps_per_second": 67.683,
|
|
"num_input_tokens_seen": 1344000,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 3.7893462469733654,
|
|
"grad_norm": 3.010295867919922,
|
|
"learning_rate": 8.446286342713419e-06,
|
|
"loss": 0.0881,
|
|
"num_input_tokens_seen": 1348224,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 3.801452784503632,
|
|
"grad_norm": 2.3779475688934326,
|
|
"learning_rate": 8.288495557532241e-06,
|
|
"loss": 0.1348,
|
|
"num_input_tokens_seen": 1352576,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 3.8135593220338984,
|
|
"grad_norm": 6.911816120147705,
|
|
"learning_rate": 8.131899203984463e-06,
|
|
"loss": 0.134,
|
|
"num_input_tokens_seen": 1356864,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 3.8256658595641646,
|
|
"grad_norm": 9.250137329101562,
|
|
"learning_rate": 7.976508474577548e-06,
|
|
"loss": 0.1141,
|
|
"num_input_tokens_seen": 1361152,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 3.837772397094431,
|
|
"grad_norm": 4.86985445022583,
|
|
"learning_rate": 7.822334475648654e-06,
|
|
"loss": 0.0705,
|
|
"num_input_tokens_seen": 1365376,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 3.849878934624697,
|
|
"grad_norm": 0.7732688188552856,
|
|
"learning_rate": 7.669388226570809e-06,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 1369728,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 3.861985472154964,
|
|
"grad_norm": 5.062341213226318,
|
|
"learning_rate": 7.517680658965329e-06,
|
|
"loss": 0.1261,
|
|
"num_input_tokens_seen": 1374144,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 3.87409200968523,
|
|
"grad_norm": 8.762838363647461,
|
|
"learning_rate": 7.367222615920477e-06,
|
|
"loss": 0.1084,
|
|
"num_input_tokens_seen": 1378368,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 3.8861985472154963,
|
|
"grad_norm": 8.905739784240723,
|
|
"learning_rate": 7.2180248512164896e-06,
|
|
"loss": 0.0813,
|
|
"num_input_tokens_seen": 1382464,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 3.898305084745763,
|
|
"grad_norm": 0.5714547038078308,
|
|
"learning_rate": 7.070098028556948e-06,
|
|
"loss": 0.0805,
|
|
"num_input_tokens_seen": 1386880,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 3.910411622276029,
|
|
"grad_norm": 8.167064666748047,
|
|
"learning_rate": 6.923452720806611e-06,
|
|
"loss": 0.1924,
|
|
"num_input_tokens_seen": 1391296,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 3.9225181598062955,
|
|
"grad_norm": 3.438431739807129,
|
|
"learning_rate": 6.778099409235739e-06,
|
|
"loss": 0.0609,
|
|
"num_input_tokens_seen": 1395456,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 3.9346246973365617,
|
|
"grad_norm": 7.784511089324951,
|
|
"learning_rate": 6.634048482770946e-06,
|
|
"loss": 0.0932,
|
|
"num_input_tokens_seen": 1399616,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 3.946731234866828,
|
|
"grad_norm": 13.272894859313965,
|
|
"learning_rate": 6.491310237252679e-06,
|
|
"loss": 0.1241,
|
|
"num_input_tokens_seen": 1403712,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 3.9588377723970947,
|
|
"grad_norm": 12.38925838470459,
|
|
"learning_rate": 6.349894874699344e-06,
|
|
"loss": 0.1232,
|
|
"num_input_tokens_seen": 1408128,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 3.970944309927361,
|
|
"grad_norm": 5.343148231506348,
|
|
"learning_rate": 6.209812502578114e-06,
|
|
"loss": 0.0787,
|
|
"num_input_tokens_seen": 1412480,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 3.983050847457627,
|
|
"grad_norm": 1.2886254787445068,
|
|
"learning_rate": 6.071073133082492e-06,
|
|
"loss": 0.0494,
|
|
"num_input_tokens_seen": 1416704,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 3.9951573849878934,
|
|
"grad_norm": 10.778816223144531,
|
|
"learning_rate": 5.933686682416758e-06,
|
|
"loss": 0.0969,
|
|
"num_input_tokens_seen": 1421120,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 4.00726392251816,
|
|
"grad_norm": 0.2529144883155823,
|
|
"learning_rate": 5.797662970087184e-06,
|
|
"loss": 0.09,
|
|
"num_input_tokens_seen": 1424944,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 4.019370460048426,
|
|
"grad_norm": 6.2160162925720215,
|
|
"learning_rate": 5.663011718200201e-06,
|
|
"loss": 0.0897,
|
|
"num_input_tokens_seen": 1429296,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 4.0290556900726395,
|
|
"eval_loss": 0.2532218098640442,
|
|
"eval_runtime": 0.672,
|
|
"eval_samples_per_second": 546.104,
|
|
"eval_steps_per_second": 68.449,
|
|
"num_input_tokens_seen": 1432880,
|
|
"step": 1664
|
|
},
|
|
{
|
|
"epoch": 4.031476997578692,
|
|
"grad_norm": 0.9374585747718811,
|
|
"learning_rate": 5.529742550767544e-06,
|
|
"loss": 0.0316,
|
|
"num_input_tokens_seen": 1433776,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 4.043583535108959,
|
|
"grad_norm": 1.9009536504745483,
|
|
"learning_rate": 5.397864993018367e-06,
|
|
"loss": 0.0492,
|
|
"num_input_tokens_seen": 1438000,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 4.0556900726392255,
|
|
"grad_norm": 7.239864349365234,
|
|
"learning_rate": 5.267388470718449e-06,
|
|
"loss": 0.029,
|
|
"num_input_tokens_seen": 1442352,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 4.067796610169491,
|
|
"grad_norm": 2.098872661590576,
|
|
"learning_rate": 5.138322309496504e-06,
|
|
"loss": 0.052,
|
|
"num_input_tokens_seen": 1446704,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 4.079903147699758,
|
|
"grad_norm": 1.4036399126052856,
|
|
"learning_rate": 5.010675734177631e-06,
|
|
"loss": 0.0469,
|
|
"num_input_tokens_seen": 1450864,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 4.092009685230024,
|
|
"grad_norm": 11.33265495300293,
|
|
"learning_rate": 4.884457868124001e-06,
|
|
"loss": 0.0316,
|
|
"num_input_tokens_seen": 1455088,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 4.1041162227602905,
|
|
"grad_norm": 1.9709900617599487,
|
|
"learning_rate": 4.759677732582782e-06,
|
|
"loss": 0.0228,
|
|
"num_input_tokens_seen": 1459376,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 4.116222760290557,
|
|
"grad_norm": 0.01155536063015461,
|
|
"learning_rate": 4.636344246041321e-06,
|
|
"loss": 0.0529,
|
|
"num_input_tokens_seen": 1463600,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 4.128329297820823,
|
|
"grad_norm": 19.08058738708496,
|
|
"learning_rate": 4.514466223589753e-06,
|
|
"loss": 0.0565,
|
|
"num_input_tokens_seen": 1468080,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 4.14043583535109,
|
|
"grad_norm": 1.3092641830444336,
|
|
"learning_rate": 4.3940523762909135e-06,
|
|
"loss": 0.0695,
|
|
"num_input_tokens_seen": 1472624,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 4.1525423728813555,
|
|
"grad_norm": 0.055544547736644745,
|
|
"learning_rate": 4.275111310557758e-06,
|
|
"loss": 0.0511,
|
|
"num_input_tokens_seen": 1477040,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 4.164648910411622,
|
|
"grad_norm": 0.16590368747711182,
|
|
"learning_rate": 4.1576515275382226e-06,
|
|
"loss": 0.0311,
|
|
"num_input_tokens_seen": 1481328,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 4.176755447941889,
|
|
"grad_norm": 0.1331050992012024,
|
|
"learning_rate": 4.0416814225076035e-06,
|
|
"loss": 0.0394,
|
|
"num_input_tokens_seen": 1485808,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 4.188861985472155,
|
|
"grad_norm": 1.6521071195602417,
|
|
"learning_rate": 3.9272092842685345e-06,
|
|
"loss": 0.0255,
|
|
"num_input_tokens_seen": 1490160,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 4.200968523002421,
|
|
"grad_norm": 0.42354145646095276,
|
|
"learning_rate": 3.814243294558542e-06,
|
|
"loss": 0.0073,
|
|
"num_input_tokens_seen": 1494512,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 4.213075060532688,
|
|
"grad_norm": 2.2178032398223877,
|
|
"learning_rate": 3.702791527465274e-06,
|
|
"loss": 0.0562,
|
|
"num_input_tokens_seen": 1498480,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 4.225181598062954,
|
|
"grad_norm": 13.911809921264648,
|
|
"learning_rate": 3.592861948849416e-06,
|
|
"loss": 0.0463,
|
|
"num_input_tokens_seen": 1502768,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 4.237288135593221,
|
|
"grad_norm": 0.01323059480637312,
|
|
"learning_rate": 3.484462415775333e-06,
|
|
"loss": 0.0429,
|
|
"num_input_tokens_seen": 1506992,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 4.249394673123486,
|
|
"grad_norm": 0.1997198611497879,
|
|
"learning_rate": 3.377600675949527e-06,
|
|
"loss": 0.0035,
|
|
"num_input_tokens_seen": 1511472,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 4.261501210653753,
|
|
"grad_norm": 9.309453010559082,
|
|
"learning_rate": 3.272284367166825e-06,
|
|
"loss": 0.0395,
|
|
"num_input_tokens_seen": 1515824,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 4.27360774818402,
|
|
"grad_norm": 1.514168620109558,
|
|
"learning_rate": 3.1685210167645335e-06,
|
|
"loss": 0.0337,
|
|
"num_input_tokens_seen": 1520176,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 4.280871670702179,
|
|
"eval_loss": 0.4314914643764496,
|
|
"eval_runtime": 0.8115,
|
|
"eval_samples_per_second": 452.254,
|
|
"eval_steps_per_second": 56.686,
|
|
"num_input_tokens_seen": 1522544,
|
|
"step": 1768
|
|
},
|
|
{
|
|
"epoch": 4.285714285714286,
|
|
"grad_norm": 0.23039455711841583,
|
|
"learning_rate": 3.0663180410843982e-06,
|
|
"loss": 0.008,
|
|
"num_input_tokens_seen": 1524336,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 4.297820823244552,
|
|
"grad_norm": 0.17007200419902802,
|
|
"learning_rate": 2.9656827449425494e-06,
|
|
"loss": 0.1379,
|
|
"num_input_tokens_seen": 1528560,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 4.309927360774818,
|
|
"grad_norm": 5.092523097991943,
|
|
"learning_rate": 2.86662232110739e-06,
|
|
"loss": 0.0391,
|
|
"num_input_tokens_seen": 1532720,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 4.322033898305085,
|
|
"grad_norm": 8.858246803283691,
|
|
"learning_rate": 2.7691438497855134e-06,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 1536944,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 4.3341404358353515,
|
|
"grad_norm": 0.16653333604335785,
|
|
"learning_rate": 2.673254298115646e-06,
|
|
"loss": 0.0365,
|
|
"num_input_tokens_seen": 1541168,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 4.346246973365617,
|
|
"grad_norm": 0.057360630482435226,
|
|
"learning_rate": 2.5789605196706674e-06,
|
|
"loss": 0.0094,
|
|
"num_input_tokens_seen": 1545456,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 4.358353510895884,
|
|
"grad_norm": 18.321725845336914,
|
|
"learning_rate": 2.4862692539677906e-06,
|
|
"loss": 0.0798,
|
|
"num_input_tokens_seen": 1549872,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 4.37046004842615,
|
|
"grad_norm": 0.05611402168869972,
|
|
"learning_rate": 2.3951871259868503e-06,
|
|
"loss": 0.113,
|
|
"num_input_tokens_seen": 1554288,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 4.3825665859564165,
|
|
"grad_norm": 7.665430068969727,
|
|
"learning_rate": 2.3057206456967905e-06,
|
|
"loss": 0.1113,
|
|
"num_input_tokens_seen": 1558384,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 4.394673123486683,
|
|
"grad_norm": 9.430697441101074,
|
|
"learning_rate": 2.217876207590375e-06,
|
|
"loss": 0.0523,
|
|
"num_input_tokens_seen": 1562544,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 4.406779661016949,
|
|
"grad_norm": 0.0549406073987484,
|
|
"learning_rate": 2.131660090227139e-06,
|
|
"loss": 0.0659,
|
|
"num_input_tokens_seen": 1567216,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 4.418886198547216,
|
|
"grad_norm": 0.08962647616863251,
|
|
"learning_rate": 2.0470784557846652e-06,
|
|
"loss": 0.0756,
|
|
"num_input_tokens_seen": 1571568,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 4.4309927360774815,
|
|
"grad_norm": 0.09955435991287231,
|
|
"learning_rate": 1.964137349618114e-06,
|
|
"loss": 0.0018,
|
|
"num_input_tokens_seen": 1575792,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 4.443099273607748,
|
|
"grad_norm": 0.7829030156135559,
|
|
"learning_rate": 1.8828426998281689e-06,
|
|
"loss": 0.0419,
|
|
"num_input_tokens_seen": 1580080,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 4.455205811138015,
|
|
"grad_norm": 3.134791851043701,
|
|
"learning_rate": 1.8032003168373306e-06,
|
|
"loss": 0.0692,
|
|
"num_input_tokens_seen": 1584112,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 4.467312348668281,
|
|
"grad_norm": 1.7574918270111084,
|
|
"learning_rate": 1.7252158929746131e-06,
|
|
"loss": 0.0456,
|
|
"num_input_tokens_seen": 1588400,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 4.479418886198547,
|
|
"grad_norm": 27.999475479125977,
|
|
"learning_rate": 1.6488950020686955e-06,
|
|
"loss": 0.0504,
|
|
"num_input_tokens_seen": 1592816,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 4.491525423728813,
|
|
"grad_norm": 0.16101866960525513,
|
|
"learning_rate": 1.5742430990495466e-06,
|
|
"loss": 0.0573,
|
|
"num_input_tokens_seen": 1597296,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 4.50363196125908,
|
|
"grad_norm": 0.11599753797054291,
|
|
"learning_rate": 1.5012655195585368e-06,
|
|
"loss": 0.0293,
|
|
"num_input_tokens_seen": 1601648,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 4.5157384987893465,
|
|
"grad_norm": 6.17954683303833,
|
|
"learning_rate": 1.4299674795670764e-06,
|
|
"loss": 0.1156,
|
|
"num_input_tokens_seen": 1605936,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 4.527845036319612,
|
|
"grad_norm": 1.049548625946045,
|
|
"learning_rate": 1.360354075003828e-06,
|
|
"loss": 0.126,
|
|
"num_input_tokens_seen": 1610096,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 4.532687651331719,
|
|
"eval_loss": 0.42201921343803406,
|
|
"eval_runtime": 0.693,
|
|
"eval_samples_per_second": 529.558,
|
|
"eval_steps_per_second": 66.375,
|
|
"num_input_tokens_seen": 1611760,
|
|
"step": 1872
|
|
},
|
|
{
|
|
"epoch": 4.539951573849879,
|
|
"grad_norm": 13.409820556640625,
|
|
"learning_rate": 1.2924302813904582e-06,
|
|
"loss": 0.0436,
|
|
"num_input_tokens_seen": 1614384,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 4.552058111380145,
|
|
"grad_norm": 3.9212989807128906,
|
|
"learning_rate": 1.226200953486037e-06,
|
|
"loss": 0.0591,
|
|
"num_input_tokens_seen": 1618800,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 4.5641646489104115,
|
|
"grad_norm": 0.7789947986602783,
|
|
"learning_rate": 1.1616708249400449e-06,
|
|
"loss": 0.0027,
|
|
"num_input_tokens_seen": 1622960,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 4.576271186440678,
|
|
"grad_norm": 16.51002311706543,
|
|
"learning_rate": 1.0988445079540388e-06,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 1627056,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 4.588377723970944,
|
|
"grad_norm": 0.03825072944164276,
|
|
"learning_rate": 1.0377264929520125e-06,
|
|
"loss": 0.0205,
|
|
"num_input_tokens_seen": 1631408,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 4.600484261501211,
|
|
"grad_norm": 13.03893756866455,
|
|
"learning_rate": 9.783211482594285e-07,
|
|
"loss": 0.0687,
|
|
"num_input_tokens_seen": 1635888,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 4.6125907990314765,
|
|
"grad_norm": 0.19233529269695282,
|
|
"learning_rate": 9.206327197910203e-07,
|
|
"loss": 0.0049,
|
|
"num_input_tokens_seen": 1640176,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 4.624697336561743,
|
|
"grad_norm": 9.149880409240723,
|
|
"learning_rate": 8.646653307473079e-07,
|
|
"loss": 0.056,
|
|
"num_input_tokens_seen": 1644528,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 4.63680387409201,
|
|
"grad_norm": 0.09057964384555817,
|
|
"learning_rate": 8.10422981319911e-07,
|
|
"loss": 0.002,
|
|
"num_input_tokens_seen": 1649264,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 4.648910411622276,
|
|
"grad_norm": 0.645796537399292,
|
|
"learning_rate": 7.579095484056192e-07,
|
|
"loss": 0.0111,
|
|
"num_input_tokens_seen": 1653808,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 4.661016949152542,
|
|
"grad_norm": 0.02393440343439579,
|
|
"learning_rate": 7.07128785329314e-07,
|
|
"loss": 0.0023,
|
|
"num_input_tokens_seen": 1658288,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 4.673123486682809,
|
|
"grad_norm": 0.03354793041944504,
|
|
"learning_rate": 6.580843215757082e-07,
|
|
"loss": 0.0228,
|
|
"num_input_tokens_seen": 1662576,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 4.685230024213075,
|
|
"grad_norm": 1.0874401330947876,
|
|
"learning_rate": 6.107796625299117e-07,
|
|
"loss": 0.0221,
|
|
"num_input_tokens_seen": 1667056,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 4.697336561743342,
|
|
"grad_norm": 0.94743412733078,
|
|
"learning_rate": 5.652181892269181e-07,
|
|
"loss": 0.0733,
|
|
"num_input_tokens_seen": 1671536,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 4.709443099273607,
|
|
"grad_norm": 0.04832937568426132,
|
|
"learning_rate": 5.214031581099149e-07,
|
|
"loss": 0.0023,
|
|
"num_input_tokens_seen": 1675888,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 4.721549636803874,
|
|
"grad_norm": 12.764242172241211,
|
|
"learning_rate": 4.793377007975719e-07,
|
|
"loss": 0.0341,
|
|
"num_input_tokens_seen": 1680176,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 4.733656174334141,
|
|
"grad_norm": 6.990570068359375,
|
|
"learning_rate": 4.3902482386018186e-07,
|
|
"loss": 0.0568,
|
|
"num_input_tokens_seen": 1684400,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 4.745762711864407,
|
|
"grad_norm": 26.958158493041992,
|
|
"learning_rate": 4.004674086047905e-07,
|
|
"loss": 0.1211,
|
|
"num_input_tokens_seen": 1688816,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 4.757869249394673,
|
|
"grad_norm": 1.086872935295105,
|
|
"learning_rate": 3.636682108692502e-07,
|
|
"loss": 0.0408,
|
|
"num_input_tokens_seen": 1693360,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 4.76997578692494,
|
|
"grad_norm": 15.128409385681152,
|
|
"learning_rate": 3.2862986082524416e-07,
|
|
"loss": 0.0647,
|
|
"num_input_tokens_seen": 1697584,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 4.782082324455206,
|
|
"grad_norm": 7.18263053894043,
|
|
"learning_rate": 2.953548627903202e-07,
|
|
"loss": 0.0336,
|
|
"num_input_tokens_seen": 1702000,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 4.784503631961259,
|
|
"eval_loss": 0.4348176121711731,
|
|
"eval_runtime": 0.6821,
|
|
"eval_samples_per_second": 538.039,
|
|
"eval_steps_per_second": 67.438,
|
|
"num_input_tokens_seen": 1702832,
|
|
"step": 1976
|
|
},
|
|
{
|
|
"epoch": 4.7941888619854724,
|
|
"grad_norm": 0.357972115278244,
|
|
"learning_rate": 2.6384559504886166e-07,
|
|
"loss": 0.1448,
|
|
"num_input_tokens_seen": 1706416,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 4.806295399515738,
|
|
"grad_norm": 5.933152198791504,
|
|
"learning_rate": 2.3410430968214824e-07,
|
|
"loss": 0.0163,
|
|
"num_input_tokens_seen": 1710960,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 4.818401937046005,
|
|
"grad_norm": 21.378908157348633,
|
|
"learning_rate": 2.0613313240735454e-07,
|
|
"loss": 0.1048,
|
|
"num_input_tokens_seen": 1715440,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 4.830508474576272,
|
|
"grad_norm": 0.03769972547888756,
|
|
"learning_rate": 1.7993406242563238e-07,
|
|
"loss": 0.0295,
|
|
"num_input_tokens_seen": 1719728,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 4.842615012106537,
|
|
"grad_norm": 0.04536456614732742,
|
|
"learning_rate": 1.5550897227922523e-07,
|
|
"loss": 0.0007,
|
|
"num_input_tokens_seen": 1724272,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 4.854721549636804,
|
|
"grad_norm": 12.351763725280762,
|
|
"learning_rate": 1.3285960771761697e-07,
|
|
"loss": 0.064,
|
|
"num_input_tokens_seen": 1728560,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 4.86682808716707,
|
|
"grad_norm": 11.032571792602539,
|
|
"learning_rate": 1.119875875727705e-07,
|
|
"loss": 0.0289,
|
|
"num_input_tokens_seen": 1733104,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 4.878934624697337,
|
|
"grad_norm": 21.032617568969727,
|
|
"learning_rate": 9.289440364341485e-08,
|
|
"loss": 0.0127,
|
|
"num_input_tokens_seen": 1737264,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 4.891041162227603,
|
|
"grad_norm": 3.019296169281006,
|
|
"learning_rate": 7.558142058842754e-08,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 1741424,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 4.903147699757869,
|
|
"grad_norm": 0.06446848809719086,
|
|
"learning_rate": 6.004987582929055e-08,
|
|
"loss": 0.0657,
|
|
"num_input_tokens_seen": 1745648,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 4.915254237288136,
|
|
"grad_norm": 15.37187385559082,
|
|
"learning_rate": 4.63008794616554e-08,
|
|
"loss": 0.045,
|
|
"num_input_tokens_seen": 1749872,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 4.927360774818402,
|
|
"grad_norm": 0.0873086079955101,
|
|
"learning_rate": 3.433541417599551e-08,
|
|
"loss": 0.0431,
|
|
"num_input_tokens_seen": 1754288,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 4.939467312348668,
|
|
"grad_norm": 0.19230781495571136,
|
|
"learning_rate": 2.4154335187365207e-08,
|
|
"loss": 0.0332,
|
|
"num_input_tokens_seen": 1758640,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 4.951573849878935,
|
|
"grad_norm": 0.0936799943447113,
|
|
"learning_rate": 1.5758370174284722e-08,
|
|
"loss": 0.0602,
|
|
"num_input_tokens_seen": 1762928,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 4.963680387409201,
|
|
"grad_norm": 0.07743958383798599,
|
|
"learning_rate": 9.14811922672898e-09,
|
|
"loss": 0.0118,
|
|
"num_input_tokens_seen": 1767344,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 4.9757869249394675,
|
|
"grad_norm": 0.36676138639450073,
|
|
"learning_rate": 4.324054803223065e-09,
|
|
"loss": 0.0392,
|
|
"num_input_tokens_seen": 1771632,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 4.987893462469733,
|
|
"grad_norm": 11.666873931884766,
|
|
"learning_rate": 1.286521697091425e-09,
|
|
"loss": 0.0333,
|
|
"num_input_tokens_seen": 1775728,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 0.10359911620616913,
|
|
"learning_rate": 3.5737011805370145e-11,
|
|
"loss": 0.0653,
|
|
"num_input_tokens_seen": 1780000,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"num_input_tokens_seen": 1780000,
|
|
"step": 2065,
|
|
"total_flos": 1.039320047616e+16,
|
|
"train_loss": 0.16683834154997698,
|
|
"train_runtime": 1017.6301,
|
|
"train_samples_per_second": 16.219,
|
|
"train_steps_per_second": 2.029
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2065,
|
|
"num_input_tokens_seen": 1780000,
|
|
"num_train_epochs": 5,
|
|
"save_steps": 104,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.039320047616e+16,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|