{
"best_global_step": 104,
"best_metric": 0.17402823269367218,
"best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_mrpc_42_1774791061/checkpoint-104",
"epoch": 5.0,
"eval_steps": 104,
"global_step": 2065,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012106537530266344,
"grad_norm": 262.4778747558594,
"learning_rate": 9.66183574879227e-07,
"loss": 0.7681,
"num_input_tokens_seen": 4352,
"step": 5
},
{
"epoch": 0.024213075060532687,
"grad_norm": 26.363384246826172,
"learning_rate": 2.173913043478261e-06,
"loss": 0.3056,
"num_input_tokens_seen": 8768,
"step": 10
},
{
"epoch": 0.03631961259079903,
"grad_norm": 10.327119827270508,
"learning_rate": 3.3816425120772947e-06,
"loss": 0.183,
"num_input_tokens_seen": 12992,
"step": 15
},
{
"epoch": 0.048426150121065374,
"grad_norm": 36.403324127197266,
"learning_rate": 4.589371980676329e-06,
"loss": 0.4041,
"num_input_tokens_seen": 17344,
"step": 20
},
{
"epoch": 0.06053268765133172,
"grad_norm": 8.729621887207031,
"learning_rate": 5.797101449275362e-06,
"loss": 0.4147,
"num_input_tokens_seen": 21696,
"step": 25
},
{
"epoch": 0.07263922518159806,
"grad_norm": 4.769359111785889,
"learning_rate": 7.004830917874397e-06,
"loss": 0.2132,
"num_input_tokens_seen": 26112,
"step": 30
},
{
"epoch": 0.0847457627118644,
"grad_norm": 4.588466644287109,
"learning_rate": 8.212560386473431e-06,
"loss": 0.2587,
"num_input_tokens_seen": 30208,
"step": 35
},
{
"epoch": 0.09685230024213075,
"grad_norm": 21.823162078857422,
"learning_rate": 9.420289855072464e-06,
"loss": 0.2076,
"num_input_tokens_seen": 34688,
"step": 40
},
{
"epoch": 0.1089588377723971,
"grad_norm": 18.038860321044922,
"learning_rate": 1.0628019323671499e-05,
"loss": 0.1842,
"num_input_tokens_seen": 38784,
"step": 45
},
{
"epoch": 0.12106537530266344,
"grad_norm": 12.918279647827148,
"learning_rate": 1.1835748792270531e-05,
"loss": 0.3012,
"num_input_tokens_seen": 43200,
"step": 50
},
{
"epoch": 0.13317191283292978,
"grad_norm": 24.635744094848633,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.1951,
"num_input_tokens_seen": 47296,
"step": 55
},
{
"epoch": 0.14527845036319612,
"grad_norm": 14.053600311279297,
"learning_rate": 1.4251207729468599e-05,
"loss": 0.2332,
"num_input_tokens_seen": 51712,
"step": 60
},
{
"epoch": 0.15738498789346247,
"grad_norm": 8.166345596313477,
"learning_rate": 1.5458937198067633e-05,
"loss": 0.2049,
"num_input_tokens_seen": 55872,
"step": 65
},
{
"epoch": 0.1694915254237288,
"grad_norm": 27.84511947631836,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2103,
"num_input_tokens_seen": 59840,
"step": 70
},
{
"epoch": 0.18159806295399517,
"grad_norm": 55.02257537841797,
"learning_rate": 1.78743961352657e-05,
"loss": 0.3072,
"num_input_tokens_seen": 64000,
"step": 75
},
{
"epoch": 0.1937046004842615,
"grad_norm": 11.449199676513672,
"learning_rate": 1.9082125603864733e-05,
"loss": 0.3841,
"num_input_tokens_seen": 68352,
"step": 80
},
{
"epoch": 0.20581113801452786,
"grad_norm": 11.381855964660645,
"learning_rate": 2.028985507246377e-05,
"loss": 0.232,
"num_input_tokens_seen": 72768,
"step": 85
},
{
"epoch": 0.2179176755447942,
"grad_norm": 42.495670318603516,
"learning_rate": 2.1497584541062805e-05,
"loss": 0.2474,
"num_input_tokens_seen": 77120,
"step": 90
},
{
"epoch": 0.23002421307506055,
"grad_norm": 21.28970718383789,
"learning_rate": 2.2705314009661836e-05,
"loss": 0.1841,
"num_input_tokens_seen": 81664,
"step": 95
},
{
"epoch": 0.24213075060532688,
"grad_norm": 17.023759841918945,
"learning_rate": 2.391304347826087e-05,
"loss": 0.1681,
"num_input_tokens_seen": 86080,
"step": 100
},
{
"epoch": 0.25181598062953997,
"eval_loss": 0.17402823269367218,
"eval_runtime": 0.639,
"eval_samples_per_second": 574.368,
"eval_steps_per_second": 71.992,
"num_input_tokens_seen": 89600,
"step": 104
},
{
"epoch": 0.2542372881355932,
"grad_norm": 16.713178634643555,
"learning_rate": 2.5120772946859905e-05,
"loss": 0.1488,
"num_input_tokens_seen": 90432,
"step": 105
},
{
"epoch": 0.26634382566585957,
"grad_norm": 6.363961219787598,
"learning_rate": 2.632850241545894e-05,
"loss": 0.2051,
"num_input_tokens_seen": 94528,
"step": 110
},
{
"epoch": 0.2784503631961259,
"grad_norm": 7.700758934020996,
"learning_rate": 2.753623188405797e-05,
"loss": 0.16,
"num_input_tokens_seen": 98816,
"step": 115
},
{
"epoch": 0.29055690072639223,
"grad_norm": 8.657270431518555,
"learning_rate": 2.8743961352657005e-05,
"loss": 0.205,
"num_input_tokens_seen": 103104,
"step": 120
},
{
"epoch": 0.3026634382566586,
"grad_norm": 7.297232151031494,
"learning_rate": 2.995169082125604e-05,
"loss": 0.1846,
"num_input_tokens_seen": 107328,
"step": 125
},
{
"epoch": 0.31476997578692495,
"grad_norm": 13.21757984161377,
"learning_rate": 3.1159420289855074e-05,
"loss": 0.2243,
"num_input_tokens_seen": 111488,
"step": 130
},
{
"epoch": 0.3268765133171913,
"grad_norm": 6.457214832305908,
"learning_rate": 3.236714975845411e-05,
"loss": 0.2013,
"num_input_tokens_seen": 115968,
"step": 135
},
{
"epoch": 0.3389830508474576,
"grad_norm": 29.321474075317383,
"learning_rate": 3.357487922705314e-05,
"loss": 0.2278,
"num_input_tokens_seen": 120192,
"step": 140
},
{
"epoch": 0.35108958837772397,
"grad_norm": 10.676529884338379,
"learning_rate": 3.478260869565218e-05,
"loss": 0.1886,
"num_input_tokens_seen": 124416,
"step": 145
},
{
"epoch": 0.36319612590799033,
"grad_norm": 11.802507400512695,
"learning_rate": 3.5990338164251205e-05,
"loss": 0.1635,
"num_input_tokens_seen": 128832,
"step": 150
},
{
"epoch": 0.37530266343825663,
"grad_norm": 9.175806999206543,
"learning_rate": 3.719806763285024e-05,
"loss": 0.2118,
"num_input_tokens_seen": 132992,
"step": 155
},
{
"epoch": 0.387409200968523,
"grad_norm": 17.557262420654297,
"learning_rate": 3.8405797101449274e-05,
"loss": 0.3186,
"num_input_tokens_seen": 137280,
"step": 160
},
{
"epoch": 0.39951573849878935,
"grad_norm": 31.175756454467773,
"learning_rate": 3.961352657004831e-05,
"loss": 0.2002,
"num_input_tokens_seen": 141568,
"step": 165
},
{
"epoch": 0.4116222760290557,
"grad_norm": 12.988505363464355,
"learning_rate": 4.082125603864734e-05,
"loss": 0.1792,
"num_input_tokens_seen": 145984,
"step": 170
},
{
"epoch": 0.423728813559322,
"grad_norm": 43.43312454223633,
"learning_rate": 4.202898550724638e-05,
"loss": 0.3197,
"num_input_tokens_seen": 150144,
"step": 175
},
{
"epoch": 0.4358353510895884,
"grad_norm": 10.99770736694336,
"learning_rate": 4.323671497584541e-05,
"loss": 0.3561,
"num_input_tokens_seen": 154624,
"step": 180
},
{
"epoch": 0.44794188861985473,
"grad_norm": 8.507532119750977,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.373,
"num_input_tokens_seen": 158784,
"step": 185
},
{
"epoch": 0.4600484261501211,
"grad_norm": 129.54592895507812,
"learning_rate": 4.565217391304348e-05,
"loss": 0.3924,
"num_input_tokens_seen": 163072,
"step": 190
},
{
"epoch": 0.4721549636803874,
"grad_norm": 15.108623504638672,
"learning_rate": 4.6859903381642516e-05,
"loss": 0.2368,
"num_input_tokens_seen": 167104,
"step": 195
},
{
"epoch": 0.48426150121065376,
"grad_norm": 9.902148246765137,
"learning_rate": 4.806763285024155e-05,
"loss": 0.4497,
"num_input_tokens_seen": 171456,
"step": 200
},
{
"epoch": 0.4963680387409201,
"grad_norm": 16.188369750976562,
"learning_rate": 4.9275362318840584e-05,
"loss": 0.2715,
"num_input_tokens_seen": 175808,
"step": 205
},
{
"epoch": 0.5036319612590799,
"eval_loss": 0.23122040927410126,
"eval_runtime": 0.6326,
"eval_samples_per_second": 580.165,
"eval_steps_per_second": 72.718,
"num_input_tokens_seen": 178688,
"step": 208
},
{
"epoch": 0.5084745762711864,
"grad_norm": 0.7912328839302063,
"learning_rate": 4.9999857052054956e-05,
"loss": 0.1981,
"num_input_tokens_seen": 180224,
"step": 210
},
{
"epoch": 0.5205811138014528,
"grad_norm": 4.983211040496826,
"learning_rate": 4.999824890644693e-05,
"loss": 0.1989,
"num_input_tokens_seen": 184704,
"step": 215
},
{
"epoch": 0.5326876513317191,
"grad_norm": 16.626827239990234,
"learning_rate": 4.9994854045622684e-05,
"loss": 0.2336,
"num_input_tokens_seen": 189184,
"step": 220
},
{
"epoch": 0.5447941888619855,
"grad_norm": 5.18185567855835,
"learning_rate": 4.9989672712225204e-05,
"loss": 0.1595,
"num_input_tokens_seen": 193536,
"step": 225
},
{
"epoch": 0.5569007263922519,
"grad_norm": 8.547920227050781,
"learning_rate": 4.998270527658311e-05,
"loss": 0.2147,
"num_input_tokens_seen": 197888,
"step": 230
},
{
"epoch": 0.5690072639225182,
"grad_norm": 1.19011652469635,
"learning_rate": 4.9973952236684216e-05,
"loss": 0.1959,
"num_input_tokens_seen": 202112,
"step": 235
},
{
"epoch": 0.5811138014527845,
"grad_norm": 12.658636093139648,
"learning_rate": 4.996341421813993e-05,
"loss": 0.2085,
"num_input_tokens_seen": 206528,
"step": 240
},
{
"epoch": 0.5932203389830508,
"grad_norm": 20.122756958007812,
"learning_rate": 4.9951091974140506e-05,
"loss": 0.2304,
"num_input_tokens_seen": 210944,
"step": 245
},
{
"epoch": 0.6053268765133172,
"grad_norm": 10.99802303314209,
"learning_rate": 4.99369863854013e-05,
"loss": 0.2171,
"num_input_tokens_seen": 215104,
"step": 250
},
{
"epoch": 0.6174334140435835,
"grad_norm": 7.956684112548828,
"learning_rate": 4.992109846009972e-05,
"loss": 0.2458,
"num_input_tokens_seen": 219328,
"step": 255
},
{
"epoch": 0.6295399515738499,
"grad_norm": 19.862939834594727,
"learning_rate": 4.990342933380321e-05,
"loss": 0.219,
"num_input_tokens_seen": 223680,
"step": 260
},
{
"epoch": 0.6416464891041163,
"grad_norm": 7.302405834197998,
"learning_rate": 4.9883980269388106e-05,
"loss": 0.3803,
"num_input_tokens_seen": 227904,
"step": 265
},
{
"epoch": 0.6537530266343826,
"grad_norm": 9.361984252929688,
"learning_rate": 4.986275265694935e-05,
"loss": 0.3005,
"num_input_tokens_seen": 231936,
"step": 270
},
{
"epoch": 0.6658595641646489,
"grad_norm": 16.678607940673828,
"learning_rate": 4.9839748013701145e-05,
"loss": 0.2954,
"num_input_tokens_seen": 236160,
"step": 275
},
{
"epoch": 0.6779661016949152,
"grad_norm": 9.596780776977539,
"learning_rate": 4.981496798386849e-05,
"loss": 0.2924,
"num_input_tokens_seen": 240320,
"step": 280
},
{
"epoch": 0.6900726392251816,
"grad_norm": 6.522184371948242,
"learning_rate": 4.978841433856971e-05,
"loss": 0.1771,
"num_input_tokens_seen": 244800,
"step": 285
},
{
"epoch": 0.7021791767554479,
"grad_norm": 8.720867156982422,
"learning_rate": 4.976008897568981e-05,
"loss": 0.194,
"num_input_tokens_seen": 249152,
"step": 290
},
{
"epoch": 0.7142857142857143,
"grad_norm": 11.178607940673828,
"learning_rate": 4.972999391974488e-05,
"loss": 0.2064,
"num_input_tokens_seen": 253376,
"step": 295
},
{
"epoch": 0.7263922518159807,
"grad_norm": 12.191368103027344,
"learning_rate": 4.969813132173735e-05,
"loss": 0.2096,
"num_input_tokens_seen": 257664,
"step": 300
},
{
"epoch": 0.738498789346247,
"grad_norm": 5.037217617034912,
"learning_rate": 4.966450345900229e-05,
"loss": 0.1712,
"num_input_tokens_seen": 262016,
"step": 305
},
{
"epoch": 0.7506053268765133,
"grad_norm": 10.153473854064941,
"learning_rate": 4.962911273504461e-05,
"loss": 0.2276,
"num_input_tokens_seen": 266432,
"step": 310
},
{
"epoch": 0.7554479418886199,
"eval_loss": 0.22853781282901764,
"eval_runtime": 2.3445,
"eval_samples_per_second": 156.536,
"eval_steps_per_second": 19.62,
"num_input_tokens_seen": 267968,
"step": 312
},
{
"epoch": 0.7627118644067796,
"grad_norm": 12.040881156921387,
"learning_rate": 4.9591961679367284e-05,
"loss": 0.2349,
"num_input_tokens_seen": 270464,
"step": 315
},
{
"epoch": 0.774818401937046,
"grad_norm": 12.473306655883789,
"learning_rate": 4.955305294729056e-05,
"loss": 0.2824,
"num_input_tokens_seen": 274688,
"step": 320
},
{
"epoch": 0.7869249394673123,
"grad_norm": 21.77474594116211,
"learning_rate": 4.951238931976216e-05,
"loss": 0.3105,
"num_input_tokens_seen": 278848,
"step": 325
},
{
"epoch": 0.7990314769975787,
"grad_norm": 17.280487060546875,
"learning_rate": 4.9469973703158565e-05,
"loss": 0.2667,
"num_input_tokens_seen": 283136,
"step": 330
},
{
"epoch": 0.8111380145278451,
"grad_norm": 6.448112487792969,
"learning_rate": 4.9425809129077204e-05,
"loss": 0.2213,
"num_input_tokens_seen": 287680,
"step": 335
},
{
"epoch": 0.8232445520581114,
"grad_norm": 1.0759979486465454,
"learning_rate": 4.937989875411985e-05,
"loss": 0.1887,
"num_input_tokens_seen": 292224,
"step": 340
},
{
"epoch": 0.8353510895883777,
"grad_norm": 8.703038215637207,
"learning_rate": 4.933224585966696e-05,
"loss": 0.2499,
"num_input_tokens_seen": 296448,
"step": 345
},
{
"epoch": 0.847457627118644,
"grad_norm": 16.416717529296875,
"learning_rate": 4.928285385164315e-05,
"loss": 0.2431,
"num_input_tokens_seen": 300736,
"step": 350
},
{
"epoch": 0.8595641646489104,
"grad_norm": 6.670568943023682,
"learning_rate": 4.923172626027379e-05,
"loss": 0.2588,
"num_input_tokens_seen": 304960,
"step": 355
},
{
"epoch": 0.8716707021791767,
"grad_norm": 3.8800857067108154,
"learning_rate": 4.917886673983267e-05,
"loss": 0.2322,
"num_input_tokens_seen": 309184,
"step": 360
},
{
"epoch": 0.8837772397094431,
"grad_norm": 8.991925239562988,
"learning_rate": 4.912427906838078e-05,
"loss": 0.2314,
"num_input_tokens_seen": 313408,
"step": 365
},
{
"epoch": 0.8958837772397095,
"grad_norm": 9.208677291870117,
"learning_rate": 4.906796714749635e-05,
"loss": 0.1782,
"num_input_tokens_seen": 317888,
"step": 370
},
{
"epoch": 0.9079903147699758,
"grad_norm": 6.636046886444092,
"learning_rate": 4.900993500199591e-05,
"loss": 0.1873,
"num_input_tokens_seen": 322048,
"step": 375
},
{
"epoch": 0.9200968523002422,
"grad_norm": 10.718189239501953,
"learning_rate": 4.895018677964669e-05,
"loss": 0.1985,
"num_input_tokens_seen": 326592,
"step": 380
},
{
"epoch": 0.9322033898305084,
"grad_norm": 22.99626922607422,
"learning_rate": 4.8888726750870126e-05,
"loss": 0.3036,
"num_input_tokens_seen": 330880,
"step": 385
},
{
"epoch": 0.9443099273607748,
"grad_norm": 3.320899486541748,
"learning_rate": 4.882555930843664e-05,
"loss": 0.2224,
"num_input_tokens_seen": 335104,
"step": 390
},
{
"epoch": 0.9564164648910412,
"grad_norm": 5.677978038787842,
"learning_rate": 4.87606889671517e-05,
"loss": 0.1898,
"num_input_tokens_seen": 339392,
"step": 395
},
{
"epoch": 0.9685230024213075,
"grad_norm": 11.17044448852539,
"learning_rate": 4.8694120363533104e-05,
"loss": 0.1663,
"num_input_tokens_seen": 343744,
"step": 400
},
{
"epoch": 0.9806295399515739,
"grad_norm": 9.493459701538086,
"learning_rate": 4.8625858255479574e-05,
"loss": 0.1954,
"num_input_tokens_seen": 348160,
"step": 405
},
{
"epoch": 0.9927360774818402,
"grad_norm": 13.322687149047852,
"learning_rate": 4.855590752193076e-05,
"loss": 0.2606,
"num_input_tokens_seen": 352448,
"step": 410
},
{
"epoch": 1.0048426150121066,
"grad_norm": 13.647954940795898,
"learning_rate": 4.848427316251842e-05,
"loss": 0.5572,
"num_input_tokens_seen": 356656,
"step": 415
},
{
"epoch": 1.0072639225181599,
"eval_loss": 0.2624819278717041,
"eval_runtime": 0.8628,
"eval_samples_per_second": 425.363,
"eval_steps_per_second": 53.315,
"num_input_tokens_seen": 357488,
"step": 416
},
{
"epoch": 1.0169491525423728,
"grad_norm": 43.02584457397461,
"learning_rate": 4.841096029720921e-05,
"loss": 0.2346,
"num_input_tokens_seen": 360880,
"step": 420
},
{
"epoch": 1.0290556900726393,
"grad_norm": 8.104162216186523,
"learning_rate": 4.8335974165938615e-05,
"loss": 0.1819,
"num_input_tokens_seen": 365104,
"step": 425
},
{
"epoch": 1.0411622276029056,
"grad_norm": 5.002182483673096,
"learning_rate": 4.825932012823652e-05,
"loss": 0.1495,
"num_input_tokens_seen": 369776,
"step": 430
},
{
"epoch": 1.053268765133172,
"grad_norm": 27.77912139892578,
"learning_rate": 4.8181003662844074e-05,
"loss": 0.2583,
"num_input_tokens_seen": 374000,
"step": 435
},
{
"epoch": 1.0653753026634383,
"grad_norm": 9.262914657592773,
"learning_rate": 4.8101030367322195e-05,
"loss": 0.2093,
"num_input_tokens_seen": 378096,
"step": 440
},
{
"epoch": 1.0774818401937045,
"grad_norm": 5.5975823402404785,
"learning_rate": 4.8019405957651395e-05,
"loss": 0.1806,
"num_input_tokens_seen": 382256,
"step": 445
},
{
"epoch": 1.089588377723971,
"grad_norm": 10.306631088256836,
"learning_rate": 4.793613626782331e-05,
"loss": 0.3307,
"num_input_tokens_seen": 386672,
"step": 450
},
{
"epoch": 1.1016949152542372,
"grad_norm": 4.157079696655273,
"learning_rate": 4.785122724942367e-05,
"loss": 0.2208,
"num_input_tokens_seen": 390960,
"step": 455
},
{
"epoch": 1.1138014527845037,
"grad_norm": 0.7576245069503784,
"learning_rate": 4.776468497120698e-05,
"loss": 0.2978,
"num_input_tokens_seen": 395440,
"step": 460
},
{
"epoch": 1.12590799031477,
"grad_norm": 6.9619035720825195,
"learning_rate": 4.7676515618662684e-05,
"loss": 0.2315,
"num_input_tokens_seen": 399600,
"step": 465
},
{
"epoch": 1.1380145278450362,
"grad_norm": 1.4395357370376587,
"learning_rate": 4.758672549357316e-05,
"loss": 0.2236,
"num_input_tokens_seen": 403888,
"step": 470
},
{
"epoch": 1.1501210653753027,
"grad_norm": 18.561601638793945,
"learning_rate": 4.749532101356322e-05,
"loss": 0.1689,
"num_input_tokens_seen": 408176,
"step": 475
},
{
"epoch": 1.162227602905569,
"grad_norm": 16.139604568481445,
"learning_rate": 4.740230871164147e-05,
"loss": 0.2012,
"num_input_tokens_seen": 412208,
"step": 480
},
{
"epoch": 1.1743341404358354,
"grad_norm": 1.962085247039795,
"learning_rate": 4.730769523573337e-05,
"loss": 0.1816,
"num_input_tokens_seen": 416624,
"step": 485
},
{
"epoch": 1.1864406779661016,
"grad_norm": 3.118806838989258,
"learning_rate": 4.7211487348206054e-05,
"loss": 0.2491,
"num_input_tokens_seen": 421040,
"step": 490
},
{
"epoch": 1.1985472154963681,
"grad_norm": 3.9620296955108643,
"learning_rate": 4.711369192538503e-05,
"loss": 0.203,
"num_input_tokens_seen": 425136,
"step": 495
},
{
"epoch": 1.2106537530266344,
"grad_norm": 4.469512462615967,
"learning_rate": 4.7014315957062685e-05,
"loss": 0.4102,
"num_input_tokens_seen": 429680,
"step": 500
},
{
"epoch": 1.2227602905569008,
"grad_norm": 8.607080459594727,
"learning_rate": 4.691336654599873e-05,
"loss": 0.2409,
"num_input_tokens_seen": 434224,
"step": 505
},
{
"epoch": 1.234866828087167,
"grad_norm": 9.237229347229004,
"learning_rate": 4.6810850907412484e-05,
"loss": 0.2191,
"num_input_tokens_seen": 438320,
"step": 510
},
{
"epoch": 1.2469733656174333,
"grad_norm": 5.81946325302124,
"learning_rate": 4.670677636846723e-05,
"loss": 0.1975,
"num_input_tokens_seen": 442672,
"step": 515
},
{
"epoch": 1.2590799031476998,
"grad_norm": 2.934025764465332,
"learning_rate": 4.660115036774648e-05,
"loss": 0.1881,
"num_input_tokens_seen": 446896,
"step": 520
},
{
"epoch": 1.2590799031476998,
"eval_loss": 0.1976936012506485,
"eval_runtime": 0.6676,
"eval_samples_per_second": 549.73,
"eval_steps_per_second": 68.904,
"num_input_tokens_seen": 446896,
"step": 520
},
{
"epoch": 1.271186440677966,
"grad_norm": 2.785706043243408,
"learning_rate": 4.6493980454722344e-05,
"loss": 0.2485,
"num_input_tokens_seen": 451312,
"step": 525
},
{
"epoch": 1.2832929782082325,
"grad_norm": 9.8702392578125,
"learning_rate": 4.638527428921592e-05,
"loss": 0.2053,
"num_input_tokens_seen": 455408,
"step": 530
},
{
"epoch": 1.2953995157384988,
"grad_norm": 7.424989223480225,
"learning_rate": 4.627503964084981e-05,
"loss": 0.1867,
"num_input_tokens_seen": 460080,
"step": 535
},
{
"epoch": 1.307506053268765,
"grad_norm": 4.052550792694092,
"learning_rate": 4.6163284388492835e-05,
"loss": 0.1674,
"num_input_tokens_seen": 464496,
"step": 540
},
{
"epoch": 1.3196125907990315,
"grad_norm": 2.9404428005218506,
"learning_rate": 4.605001651969686e-05,
"loss": 0.2045,
"num_input_tokens_seen": 468720,
"step": 545
},
{
"epoch": 1.331719128329298,
"grad_norm": 6.4158148765563965,
"learning_rate": 4.593524413012592e-05,
"loss": 0.191,
"num_input_tokens_seen": 473264,
"step": 550
},
{
"epoch": 1.3438256658595642,
"grad_norm": 2.213015556335449,
"learning_rate": 4.5818975422977606e-05,
"loss": 0.1828,
"num_input_tokens_seen": 477552,
"step": 555
},
{
"epoch": 1.3559322033898304,
"grad_norm": 5.9616804122924805,
"learning_rate": 4.570121870839671e-05,
"loss": 0.1546,
"num_input_tokens_seen": 482032,
"step": 560
},
{
"epoch": 1.368038740920097,
"grad_norm": 0.6267197132110596,
"learning_rate": 4.558198240288131e-05,
"loss": 0.2025,
"num_input_tokens_seen": 486384,
"step": 565
},
{
"epoch": 1.3801452784503632,
"grad_norm": 9.450618743896484,
"learning_rate": 4.546127502868118e-05,
"loss": 0.2413,
"num_input_tokens_seen": 490672,
"step": 570
},
{
"epoch": 1.3922518159806296,
"grad_norm": 5.918724536895752,
"learning_rate": 4.5339105213188714e-05,
"loss": 0.2163,
"num_input_tokens_seen": 494960,
"step": 575
},
{
"epoch": 1.4043583535108959,
"grad_norm": 2.0229716300964355,
"learning_rate": 4.521548168832227e-05,
"loss": 0.3013,
"num_input_tokens_seen": 499120,
"step": 580
},
{
"epoch": 1.4164648910411621,
"grad_norm": 4.871718406677246,
"learning_rate": 4.509041328990204e-05,
"loss": 0.2324,
"num_input_tokens_seen": 503408,
"step": 585
},
{
"epoch": 1.4285714285714286,
"grad_norm": 4.264101028442383,
"learning_rate": 4.4963908957018576e-05,
"loss": 0.1956,
"num_input_tokens_seen": 507312,
"step": 590
},
{
"epoch": 1.4406779661016949,
"grad_norm": 0.7742087841033936,
"learning_rate": 4.483597773139386e-05,
"loss": 0.2206,
"num_input_tokens_seen": 511600,
"step": 595
},
{
"epoch": 1.4527845036319613,
"grad_norm": 1.387762427330017,
"learning_rate": 4.470662875673506e-05,
"loss": 0.1973,
"num_input_tokens_seen": 515888,
"step": 600
},
{
"epoch": 1.4648910411622276,
"grad_norm": 8.138726234436035,
"learning_rate": 4.457587127808096e-05,
"loss": 0.1848,
"num_input_tokens_seen": 519920,
"step": 605
},
{
"epoch": 1.4769975786924938,
"grad_norm": 3.1052446365356445,
"learning_rate": 4.4443714641141255e-05,
"loss": 0.1922,
"num_input_tokens_seen": 524336,
"step": 610
},
{
"epoch": 1.4891041162227603,
"grad_norm": 1.7755212783813477,
"learning_rate": 4.4310168291628504e-05,
"loss": 0.1922,
"num_input_tokens_seen": 528496,
"step": 615
},
{
"epoch": 1.5012106537530268,
"grad_norm": 8.44454288482666,
"learning_rate": 4.4175241774583084e-05,
"loss": 0.1809,
"num_input_tokens_seen": 532784,
"step": 620
},
{
"epoch": 1.5108958837772397,
"eval_loss": 0.19258780777454376,
"eval_runtime": 0.6591,
"eval_samples_per_second": 556.848,
"eval_steps_per_second": 69.796,
"num_input_tokens_seen": 536176,
"step": 624
},
{
"epoch": 1.513317191283293,
"grad_norm": 6.506056785583496,
"learning_rate": 4.403894473369092e-05,
"loss": 0.2205,
"num_input_tokens_seen": 537136,
"step": 625
},
{
"epoch": 1.5254237288135593,
"grad_norm": 15.012322425842285,
"learning_rate": 4.390128691059423e-05,
"loss": 0.26,
"num_input_tokens_seen": 541552,
"step": 630
},
{
"epoch": 1.5375302663438255,
"grad_norm": 2.567143440246582,
"learning_rate": 4.3762278144195236e-05,
"loss": 0.2678,
"num_input_tokens_seen": 545648,
"step": 635
},
{
"epoch": 1.549636803874092,
"grad_norm": 9.604016304016113,
"learning_rate": 4.362192836995299e-05,
"loss": 0.2246,
"num_input_tokens_seen": 550256,
"step": 640
},
{
"epoch": 1.5617433414043584,
"grad_norm": 6.7328104972839355,
"learning_rate": 4.348024761917321e-05,
"loss": 0.2397,
"num_input_tokens_seen": 554928,
"step": 645
},
{
"epoch": 1.5738498789346247,
"grad_norm": 13.930996894836426,
"learning_rate": 4.333724601829132e-05,
"loss": 0.2303,
"num_input_tokens_seen": 559344,
"step": 650
},
{
"epoch": 1.585956416464891,
"grad_norm": 7.173315048217773,
"learning_rate": 4.319293378814868e-05,
"loss": 0.2178,
"num_input_tokens_seen": 563760,
"step": 655
},
{
"epoch": 1.5980629539951574,
"grad_norm": 1.3246958255767822,
"learning_rate": 4.304732124326206e-05,
"loss": 0.1945,
"num_input_tokens_seen": 568112,
"step": 660
},
{
"epoch": 1.6101694915254239,
"grad_norm": 10.188156127929688,
"learning_rate": 4.2900418791086403e-05,
"loss": 0.1908,
"num_input_tokens_seen": 572464,
"step": 665
},
{
"epoch": 1.6222760290556901,
"grad_norm": 7.808104515075684,
"learning_rate": 4.275223693127103e-05,
"loss": 0.2026,
"num_input_tokens_seen": 576752,
"step": 670
},
{
"epoch": 1.6343825665859564,
"grad_norm": 0.8921657204627991,
"learning_rate": 4.260278625490911e-05,
"loss": 0.1959,
"num_input_tokens_seen": 580976,
"step": 675
},
{
"epoch": 1.6464891041162226,
"grad_norm": 6.147708892822266,
"learning_rate": 4.2452077443780744e-05,
"loss": 0.2025,
"num_input_tokens_seen": 585264,
"step": 680
},
{
"epoch": 1.658595641646489,
"grad_norm": 5.73768424987793,
"learning_rate": 4.2300121269589475e-05,
"loss": 0.1777,
"num_input_tokens_seen": 589744,
"step": 685
},
{
"epoch": 1.6707021791767556,
"grad_norm": 5.188973426818848,
"learning_rate": 4.214692859319237e-05,
"loss": 0.2142,
"num_input_tokens_seen": 593968,
"step": 690
},
{
"epoch": 1.6828087167070218,
"grad_norm": 20.29938316345215,
"learning_rate": 4.19925103638238e-05,
"loss": 0.2096,
"num_input_tokens_seen": 598256,
"step": 695
},
{
"epoch": 1.694915254237288,
"grad_norm": 3.481995105743408,
"learning_rate": 4.183687761831281e-05,
"loss": 0.1881,
"num_input_tokens_seen": 602608,
"step": 700
},
{
"epoch": 1.7070217917675545,
"grad_norm": 2.9380016326904297,
"learning_rate": 4.168004148029435e-05,
"loss": 0.1678,
"num_input_tokens_seen": 607088,
"step": 705
},
{
"epoch": 1.7191283292978208,
"grad_norm": 6.645642280578613,
"learning_rate": 4.1522013159414144e-05,
"loss": 0.243,
"num_input_tokens_seen": 611248,
"step": 710
},
{
"epoch": 1.7312348668280872,
"grad_norm": 5.701453685760498,
"learning_rate": 4.136280395052754e-05,
"loss": 0.2024,
"num_input_tokens_seen": 615536,
"step": 715
},
{
"epoch": 1.7433414043583535,
"grad_norm": 4.573903560638428,
"learning_rate": 4.120242523289223e-05,
"loss": 0.1803,
"num_input_tokens_seen": 619952,
"step": 720
},
{
"epoch": 1.7554479418886197,
"grad_norm": 3.025674819946289,
"learning_rate": 4.1040888469354925e-05,
"loss": 0.1949,
"num_input_tokens_seen": 624368,
"step": 725
},
{
"epoch": 1.7627118644067796,
"eval_loss": 0.19822187721729279,
"eval_runtime": 1.1195,
"eval_samples_per_second": 327.835,
"eval_steps_per_second": 41.091,
"num_input_tokens_seen": 626992,
"step": 728
},
{
"epoch": 1.7675544794188862,
"grad_norm": 5.934816360473633,
"learning_rate": 4.087820520553205e-05,
"loss": 0.1935,
"num_input_tokens_seen": 628720,
"step": 730
},
{
"epoch": 1.7796610169491527,
"grad_norm": 1.3624376058578491,
"learning_rate": 4.0714387068984574e-05,
"loss": 0.1884,
"num_input_tokens_seen": 633008,
"step": 735
},
{
"epoch": 1.791767554479419,
"grad_norm": 2.1475796699523926,
"learning_rate": 4.05494457683869e-05,
"loss": 0.2014,
"num_input_tokens_seen": 637360,
"step": 740
},
{
"epoch": 1.8038740920096852,
"grad_norm": 10.264263153076172,
"learning_rate": 4.038339309269002e-05,
"loss": 0.2152,
"num_input_tokens_seen": 641648,
"step": 745
},
{
"epoch": 1.8159806295399514,
"grad_norm": 4.37279748916626,
"learning_rate": 4.021624091027895e-05,
"loss": 0.192,
"num_input_tokens_seen": 645552,
"step": 750
},
{
"epoch": 1.828087167070218,
"grad_norm": 10.11119270324707,
"learning_rate": 4.004800116812441e-05,
"loss": 0.3049,
"num_input_tokens_seen": 649904,
"step": 755
},
{
"epoch": 1.8401937046004844,
"grad_norm": 0.4716910719871521,
"learning_rate": 3.987868589092893e-05,
"loss": 0.184,
"num_input_tokens_seen": 654128,
"step": 760
},
{
"epoch": 1.8523002421307506,
"grad_norm": 8.259904861450195,
"learning_rate": 3.9708307180267456e-05,
"loss": 0.1914,
"num_input_tokens_seen": 658672,
"step": 765
},
{
"epoch": 1.8644067796610169,
"grad_norm": 14.706856727600098,
"learning_rate": 3.953687721372233e-05,
"loss": 0.4553,
"num_input_tokens_seen": 663088,
"step": 770
},
{
"epoch": 1.8765133171912833,
"grad_norm": 9.08963394165039,
"learning_rate": 3.936440824401299e-05,
"loss": 0.1709,
"num_input_tokens_seen": 667440,
"step": 775
},
{
"epoch": 1.8886198547215496,
"grad_norm": 4.246565818786621,
"learning_rate": 3.919091259812013e-05,
"loss": 0.1831,
"num_input_tokens_seen": 671792,
"step": 780
},
{
"epoch": 1.900726392251816,
"grad_norm": 11.860783576965332,
"learning_rate": 3.9016402676404753e-05,
"loss": 0.2175,
"num_input_tokens_seen": 676336,
"step": 785
},
{
"epoch": 1.9128329297820823,
"grad_norm": 5.474867820739746,
"learning_rate": 3.884089095172181e-05,
"loss": 0.18,
"num_input_tokens_seen": 680624,
"step": 790
},
{
"epoch": 1.9249394673123486,
"grad_norm": 2.7666966915130615,
"learning_rate": 3.866438996852872e-05,
"loss": 0.1914,
"num_input_tokens_seen": 685040,
"step": 795
},
{
"epoch": 1.937046004842615,
"grad_norm": 10.039326667785645,
"learning_rate": 3.848691234198879e-05,
"loss": 0.1935,
"num_input_tokens_seen": 689392,
"step": 800
},
{
"epoch": 1.9491525423728815,
"grad_norm": 3.919206142425537,
"learning_rate": 3.830847075706956e-05,
"loss": 0.2046,
"num_input_tokens_seen": 693552,
"step": 805
},
{
"epoch": 1.9612590799031477,
"grad_norm": 16.429906845092773,
"learning_rate": 3.812907796763616e-05,
"loss": 0.2291,
"num_input_tokens_seen": 698032,
"step": 810
},
{
"epoch": 1.973365617433414,
"grad_norm": 6.558701992034912,
"learning_rate": 3.7948746795539745e-05,
"loss": 0.1751,
"num_input_tokens_seen": 702000,
"step": 815
},
{
"epoch": 1.9854721549636802,
"grad_norm": 8.950061798095703,
"learning_rate": 3.776749012970105e-05,
"loss": 0.1795,
"num_input_tokens_seen": 706160,
"step": 820
},
{
"epoch": 1.9975786924939467,
"grad_norm": 3.701720714569092,
"learning_rate": 3.758532092518924e-05,
"loss": 0.1852,
"num_input_tokens_seen": 710768,
"step": 825
},
{
"epoch": 2.009685230024213,
"grad_norm": 6.777426719665527,
"learning_rate": 3.740225220229587e-05,
"loss": 0.256,
"num_input_tokens_seen": 714744,
"step": 830
},
{
"epoch": 2.0145278450363198,
"eval_loss": 0.1934857964515686,
"eval_runtime": 0.6627,
"eval_samples_per_second": 553.776,
"eval_steps_per_second": 69.411,
"num_input_tokens_seen": 716344,
"step": 832
},
{
"epoch": 2.0217917675544794,
"grad_norm": 7.20669412612915,
"learning_rate": 3.721829704560436e-05,
"loss": 0.1878,
"num_input_tokens_seen": 718776,
"step": 835
},
{
"epoch": 2.0338983050847457,
"grad_norm": 6.232179164886475,
"learning_rate": 3.7033468603054725e-05,
"loss": 0.2215,
"num_input_tokens_seen": 722744,
"step": 840
},
{
"epoch": 2.046004842615012,
"grad_norm": 8.393187522888184,
"learning_rate": 3.6847780085003905e-05,
"loss": 0.1657,
"num_input_tokens_seen": 727160,
"step": 845
},
{
"epoch": 2.0581113801452786,
"grad_norm": 9.579306602478027,
"learning_rate": 3.666124476328155e-05,
"loss": 0.1957,
"num_input_tokens_seen": 731576,
"step": 850
},
{
"epoch": 2.070217917675545,
"grad_norm": 8.12859058380127,
"learning_rate": 3.647387597024139e-05,
"loss": 0.1881,
"num_input_tokens_seen": 736184,
"step": 855
},
{
"epoch": 2.082324455205811,
"grad_norm": 11.758556365966797,
"learning_rate": 3.6285687097808394e-05,
"loss": 0.2041,
"num_input_tokens_seen": 740472,
"step": 860
},
{
"epoch": 2.0944309927360774,
"grad_norm": 1.7637454271316528,
"learning_rate": 3.609669159652158e-05,
"loss": 0.213,
"num_input_tokens_seen": 744760,
"step": 865
},
{
"epoch": 2.106537530266344,
"grad_norm": 5.633957386016846,
"learning_rate": 3.590690297457262e-05,
"loss": 0.1913,
"num_input_tokens_seen": 749176,
"step": 870
},
{
"epoch": 2.1186440677966103,
"grad_norm": 4.531621932983398,
"learning_rate": 3.57163347968404e-05,
"loss": 0.1961,
"num_input_tokens_seen": 753528,
"step": 875
},
{
"epoch": 2.1307506053268765,
"grad_norm": 6.524752140045166,
"learning_rate": 3.552500068392147e-05,
"loss": 0.1981,
"num_input_tokens_seen": 757688,
"step": 880
},
{
"epoch": 2.142857142857143,
"grad_norm": 5.924046516418457,
"learning_rate": 3.533291431115653e-05,
"loss": 0.2002,
"num_input_tokens_seen": 762040,
"step": 885
},
{
"epoch": 2.154963680387409,
"grad_norm": 4.7628068923950195,
"learning_rate": 3.514008940765304e-05,
"loss": 0.1856,
"num_input_tokens_seen": 766200,
"step": 890
},
{
"epoch": 2.1670702179176757,
"grad_norm": 9.14155101776123,
"learning_rate": 3.494653975530388e-05,
"loss": 0.2107,
"num_input_tokens_seen": 770680,
"step": 895
},
{
"epoch": 2.179176755447942,
"grad_norm": 7.742560386657715,
"learning_rate": 3.475227918780239e-05,
"loss": 0.1771,
"num_input_tokens_seen": 774840,
"step": 900
},
{
"epoch": 2.1912832929782082,
"grad_norm": 1.2218825817108154,
"learning_rate": 3.4557321589653556e-05,
"loss": 0.1924,
"num_input_tokens_seen": 779192,
"step": 905
},
{
"epoch": 2.2033898305084745,
"grad_norm": 10.382070541381836,
"learning_rate": 3.436168089518168e-05,
"loss": 0.1687,
"num_input_tokens_seen": 783608,
"step": 910
},
{
"epoch": 2.2154963680387407,
"grad_norm": 2.07893967628479,
"learning_rate": 3.416537108753443e-05,
"loss": 0.1922,
"num_input_tokens_seen": 788088,
"step": 915
},
{
"epoch": 2.2276029055690074,
"grad_norm": 14.99792194366455,
"learning_rate": 3.3968406197683376e-05,
"loss": 0.1721,
"num_input_tokens_seen": 792568,
"step": 920
},
{
"epoch": 2.2397094430992737,
"grad_norm": 4.237668037414551,
"learning_rate": 3.3770800303421254e-05,
"loss": 0.2058,
"num_input_tokens_seen": 797176,
"step": 925
},
{
"epoch": 2.25181598062954,
"grad_norm": 2.3142411708831787,
"learning_rate": 3.357256752835561e-05,
"loss": 0.1925,
"num_input_tokens_seen": 801400,
"step": 930
},
{
"epoch": 2.263922518159806,
"grad_norm": 3.0918896198272705,
"learning_rate": 3.3373722040899517e-05,
"loss": 0.1601,
"num_input_tokens_seen": 805944,
"step": 935
},
{
"epoch": 2.2663438256658597,
"eval_loss": 0.38670673966407776,
"eval_runtime": 2.26,
"eval_samples_per_second": 162.386,
"eval_steps_per_second": 20.354,
"num_input_tokens_seen": 806712,
"step": 936
},
{
"epoch": 2.2760290556900724,
"grad_norm": 3.9013168811798096,
"learning_rate": 3.317427805325875e-05,
"loss": 0.9421,
"num_input_tokens_seen": 810040,
"step": 940
},
{
"epoch": 2.288135593220339,
"grad_norm": 1.7496920824050903,
"learning_rate": 3.297424982041609e-05,
"loss": 0.191,
"num_input_tokens_seen": 814392,
"step": 945
},
{
"epoch": 2.3002421307506054,
"grad_norm": 6.5397491455078125,
"learning_rate": 3.277365163911243e-05,
"loss": 0.1962,
"num_input_tokens_seen": 818872,
"step": 950
},
{
"epoch": 2.3123486682808716,
"grad_norm": 2.407987594604492,
"learning_rate": 3.257249784682492e-05,
"loss": 0.2261,
"num_input_tokens_seen": 823096,
"step": 955
},
{
"epoch": 2.324455205811138,
"grad_norm": 3.1127803325653076,
"learning_rate": 3.2370802820742275e-05,
"loss": 0.1945,
"num_input_tokens_seen": 827128,
"step": 960
},
{
"epoch": 2.3365617433414045,
"grad_norm": 10.151595115661621,
"learning_rate": 3.2168580976737104e-05,
"loss": 0.2272,
"num_input_tokens_seen": 831288,
"step": 965
},
{
"epoch": 2.348668280871671,
"grad_norm": 1.2875597476959229,
"learning_rate": 3.196584676833562e-05,
"loss": 0.1824,
"num_input_tokens_seen": 835640,
"step": 970
},
{
"epoch": 2.360774818401937,
"grad_norm": 0.8216660022735596,
"learning_rate": 3.1762614685684567e-05,
"loss": 0.156,
"num_input_tokens_seen": 839736,
"step": 975
},
{
"epoch": 2.3728813559322033,
"grad_norm": 7.343863010406494,
"learning_rate": 3.155889925451557e-05,
"loss": 0.2199,
"num_input_tokens_seen": 844024,
"step": 980
},
{
"epoch": 2.38498789346247,
"grad_norm": 2.2787206172943115,
"learning_rate": 3.1354715035106894e-05,
"loss": 0.1885,
"num_input_tokens_seen": 848248,
"step": 985
},
{
"epoch": 2.3970944309927362,
"grad_norm": 6.654670238494873,
"learning_rate": 3.1150076621242816e-05,
"loss": 0.1645,
"num_input_tokens_seen": 852472,
"step": 990
},
{
"epoch": 2.4092009685230025,
"grad_norm": 3.4156064987182617,
"learning_rate": 3.0944998639170544e-05,
"loss": 0.1747,
"num_input_tokens_seen": 856824,
"step": 995
},
{
"epoch": 2.4213075060532687,
"grad_norm": 0.4972361624240875,
"learning_rate": 3.073949574655479e-05,
"loss": 0.1751,
"num_input_tokens_seen": 860984,
"step": 1000
},
{
"epoch": 2.433414043583535,
"grad_norm": 0.7988845705986023,
"learning_rate": 3.053358263143015e-05,
"loss": 0.1975,
"num_input_tokens_seen": 865272,
"step": 1005
},
{
"epoch": 2.4455205811138017,
"grad_norm": 5.293003082275391,
"learning_rate": 3.032727401115135e-05,
"loss": 0.1765,
"num_input_tokens_seen": 869560,
"step": 1010
},
{
"epoch": 2.457627118644068,
"grad_norm": 3.4668216705322266,
"learning_rate": 3.012058463134126e-05,
"loss": 0.1624,
"num_input_tokens_seen": 873976,
"step": 1015
},
{
"epoch": 2.469733656174334,
"grad_norm": 1.981259822845459,
"learning_rate": 2.991352926483702e-05,
"loss": 0.2237,
"num_input_tokens_seen": 878200,
"step": 1020
},
{
"epoch": 2.4818401937046004,
"grad_norm": 15.534086227416992,
"learning_rate": 2.9706122710634165e-05,
"loss": 0.2024,
"num_input_tokens_seen": 882872,
"step": 1025
},
{
"epoch": 2.4939467312348667,
"grad_norm": 2.0866310596466064,
"learning_rate": 2.949837979282889e-05,
"loss": 0.2673,
"num_input_tokens_seen": 887096,
"step": 1030
},
{
"epoch": 2.5060532687651333,
"grad_norm": 1.296164870262146,
"learning_rate": 2.92903153595585e-05,
"loss": 0.2168,
"num_input_tokens_seen": 891576,
"step": 1035
},
{
"epoch": 2.5181598062953996,
"grad_norm": 3.0610435009002686,
"learning_rate": 2.908194428194019e-05,
"loss": 0.1768,
"num_input_tokens_seen": 895736,
"step": 1040
},
{
"epoch": 2.5181598062953996,
"eval_loss": 0.1943914145231247,
"eval_runtime": 0.6714,
"eval_samples_per_second": 546.608,
"eval_steps_per_second": 68.512,
"num_input_tokens_seen": 895736,
"step": 1040
},
{
"epoch": 2.530266343825666,
"grad_norm": 13.436739921569824,
"learning_rate": 2.88732814530081e-05,
"loss": 0.1555,
"num_input_tokens_seen": 900024,
"step": 1045
},
{
"epoch": 2.542372881355932,
"grad_norm": 9.469161987304688,
"learning_rate": 2.866434178664893e-05,
"loss": 0.1744,
"num_input_tokens_seen": 904440,
"step": 1050
},
{
"epoch": 2.5544794188861983,
"grad_norm": 6.683951377868652,
"learning_rate": 2.8455140216535947e-05,
"loss": 0.1842,
"num_input_tokens_seen": 908728,
"step": 1055
},
{
"epoch": 2.566585956416465,
"grad_norm": 4.156672954559326,
"learning_rate": 2.8245691695061604e-05,
"loss": 0.2018,
"num_input_tokens_seen": 913016,
"step": 1060
},
{
"epoch": 2.5786924939467313,
"grad_norm": 2.5280745029449463,
"learning_rate": 2.8036011192268863e-05,
"loss": 0.2027,
"num_input_tokens_seen": 917304,
"step": 1065
},
{
"epoch": 2.5907990314769975,
"grad_norm": 3.3346853256225586,
"learning_rate": 2.7826113694781252e-05,
"loss": 0.1984,
"num_input_tokens_seen": 921528,
"step": 1070
},
{
"epoch": 2.6029055690072638,
"grad_norm": 6.732588768005371,
"learning_rate": 2.761601420473168e-05,
"loss": 0.1674,
"num_input_tokens_seen": 925944,
"step": 1075
},
{
"epoch": 2.61501210653753,
"grad_norm": 5.7978363037109375,
"learning_rate": 2.740572773869019e-05,
"loss": 0.1523,
"num_input_tokens_seen": 930744,
"step": 1080
},
{
"epoch": 2.6271186440677967,
"grad_norm": 4.692154884338379,
"learning_rate": 2.7195269326590682e-05,
"loss": 0.1263,
"num_input_tokens_seen": 935352,
"step": 1085
},
{
"epoch": 2.639225181598063,
"grad_norm": 8.889333724975586,
"learning_rate": 2.6984654010656667e-05,
"loss": 0.1656,
"num_input_tokens_seen": 939640,
"step": 1090
},
{
"epoch": 2.651331719128329,
"grad_norm": 4.259967803955078,
"learning_rate": 2.6773896844326125e-05,
"loss": 0.2926,
"num_input_tokens_seen": 943672,
"step": 1095
},
{
"epoch": 2.663438256658596,
"grad_norm": 3.0391273498535156,
"learning_rate": 2.656301289117561e-05,
"loss": 0.1547,
"num_input_tokens_seen": 947704,
"step": 1100
},
{
"epoch": 2.6755447941888617,
"grad_norm": 9.067920684814453,
"learning_rate": 2.6352017223843585e-05,
"loss": 0.2428,
"num_input_tokens_seen": 951928,
"step": 1105
},
{
"epoch": 2.6876513317191284,
"grad_norm": 7.765347957611084,
"learning_rate": 2.6140924922953125e-05,
"loss": 0.1649,
"num_input_tokens_seen": 956216,
"step": 1110
},
{
"epoch": 2.6997578692493946,
"grad_norm": 1.6490931510925293,
"learning_rate": 2.5929751076034058e-05,
"loss": 0.1597,
"num_input_tokens_seen": 960504,
"step": 1115
},
{
"epoch": 2.711864406779661,
"grad_norm": 1.5548573732376099,
"learning_rate": 2.571851077644461e-05,
"loss": 0.1407,
"num_input_tokens_seen": 965048,
"step": 1120
},
{
"epoch": 2.7239709443099276,
"grad_norm": 5.526769161224365,
"learning_rate": 2.5507219122292598e-05,
"loss": 0.1667,
"num_input_tokens_seen": 969208,
"step": 1125
},
{
"epoch": 2.736077481840194,
"grad_norm": 5.792220115661621,
"learning_rate": 2.529589121535636e-05,
"loss": 0.1438,
"num_input_tokens_seen": 973624,
"step": 1130
},
{
"epoch": 2.74818401937046,
"grad_norm": 6.361023902893066,
"learning_rate": 2.5084542160005335e-05,
"loss": 0.2294,
"num_input_tokens_seen": 977976,
"step": 1135
},
{
"epoch": 2.7602905569007263,
"grad_norm": 1.0617471933364868,
"learning_rate": 2.487318706212051e-05,
"loss": 0.1964,
"num_input_tokens_seen": 982200,
"step": 1140
},
{
"epoch": 2.7699757869249395,
"eval_loss": 0.19318054616451263,
"eval_runtime": 0.6508,
"eval_samples_per_second": 563.894,
"eval_steps_per_second": 70.679,
"num_input_tokens_seen": 985592,
"step": 1144
},
{
"epoch": 2.7723970944309926,
"grad_norm": 7.693630695343018,
"learning_rate": 2.4661841028014785e-05,
"loss": 0.203,
"num_input_tokens_seen": 986488,
"step": 1145
},
{
"epoch": 2.7845036319612593,
"grad_norm": 4.296042442321777,
"learning_rate": 2.445051916335321e-05,
"loss": 0.1983,
"num_input_tokens_seen": 990456,
"step": 1150
},
{
"epoch": 2.7966101694915255,
"grad_norm": 2.928414821624756,
"learning_rate": 2.4239236572073352e-05,
"loss": 0.1825,
"num_input_tokens_seen": 994744,
"step": 1155
},
{
"epoch": 2.8087167070217918,
"grad_norm": 2.411320686340332,
"learning_rate": 2.4028008355305815e-05,
"loss": 0.178,
"num_input_tokens_seen": 999160,
"step": 1160
},
{
"epoch": 2.820823244552058,
"grad_norm": 6.881911754608154,
"learning_rate": 2.3816849610294783e-05,
"loss": 0.1709,
"num_input_tokens_seen": 1003256,
"step": 1165
},
{
"epoch": 2.8329297820823243,
"grad_norm": 4.286351680755615,
"learning_rate": 2.3605775429319115e-05,
"loss": 0.1853,
"num_input_tokens_seen": 1007480,
"step": 1170
},
{
"epoch": 2.845036319612591,
"grad_norm": 3.7688863277435303,
"learning_rate": 2.3394800898613535e-05,
"loss": 0.1431,
"num_input_tokens_seen": 1011896,
"step": 1175
},
{
"epoch": 2.857142857142857,
"grad_norm": 3.717094898223877,
"learning_rate": 2.318394109729041e-05,
"loss": 0.2253,
"num_input_tokens_seen": 1015992,
"step": 1180
},
{
"epoch": 2.8692493946731235,
"grad_norm": 7.443727493286133,
"learning_rate": 2.297321109626198e-05,
"loss": 0.1686,
"num_input_tokens_seen": 1020408,
"step": 1185
},
{
"epoch": 2.8813559322033897,
"grad_norm": 12.574480056762695,
"learning_rate": 2.27626259571632e-05,
"loss": 0.1988,
"num_input_tokens_seen": 1025016,
"step": 1190
},
{
"epoch": 2.893462469733656,
"grad_norm": 9.311829566955566,
"learning_rate": 2.2552200731275213e-05,
"loss": 0.1682,
"num_input_tokens_seen": 1029368,
"step": 1195
},
{
"epoch": 2.9055690072639226,
"grad_norm": 4.659236431121826,
"learning_rate": 2.2341950458449576e-05,
"loss": 0.1918,
"num_input_tokens_seen": 1033592,
"step": 1200
},
{
"epoch": 2.917675544794189,
"grad_norm": 1.1926063299179077,
"learning_rate": 2.213189016603333e-05,
"loss": 0.2047,
"num_input_tokens_seen": 1037688,
"step": 1205
},
{
"epoch": 2.929782082324455,
"grad_norm": 1.54401433467865,
"learning_rate": 2.1922034867794925e-05,
"loss": 0.1686,
"num_input_tokens_seen": 1041912,
"step": 1210
},
{
"epoch": 2.9418886198547214,
"grad_norm": 6.956883430480957,
"learning_rate": 2.1712399562851147e-05,
"loss": 0.1663,
"num_input_tokens_seen": 1046392,
"step": 1215
},
{
"epoch": 2.9539951573849876,
"grad_norm": 6.875396728515625,
"learning_rate": 2.150299923459505e-05,
"loss": 0.1158,
"num_input_tokens_seen": 1050616,
"step": 1220
},
{
"epoch": 2.9661016949152543,
"grad_norm": 4.653652191162109,
"learning_rate": 2.1293848849625065e-05,
"loss": 0.1857,
"num_input_tokens_seen": 1054840,
"step": 1225
},
{
"epoch": 2.9782082324455206,
"grad_norm": 4.641164302825928,
"learning_rate": 2.108496335667527e-05,
"loss": 0.2051,
"num_input_tokens_seen": 1058936,
"step": 1230
},
{
"epoch": 2.990314769975787,
"grad_norm": 4.4205002784729,
"learning_rate": 2.0876357685546944e-05,
"loss": 0.137,
"num_input_tokens_seen": 1063288,
"step": 1235
},
{
"epoch": 3.002421307506053,
"grad_norm": 9.87366771697998,
"learning_rate": 2.06680467460415e-05,
"loss": 0.294,
"num_input_tokens_seen": 1067392,
"step": 1240
},
{
"epoch": 3.0145278450363198,
"grad_norm": 1.3809499740600586,
"learning_rate": 2.0460045426894817e-05,
"loss": 0.1436,
"num_input_tokens_seen": 1071872,
"step": 1245
},
{
"epoch": 3.0217917675544794,
"eval_loss": 0.20527909696102142,
"eval_runtime": 0.667,
"eval_samples_per_second": 550.187,
"eval_steps_per_second": 68.961,
"num_input_tokens_seen": 1074624,
"step": 1248
},
{
"epoch": 3.026634382566586,
"grad_norm": 1.1184051036834717,
"learning_rate": 2.0252368594713083e-05,
"loss": 0.1503,
"num_input_tokens_seen": 1076416,
"step": 1250
},
{
"epoch": 3.0387409200968523,
"grad_norm": 3.941237211227417,
"learning_rate": 2.004503109291023e-05,
"loss": 0.156,
"num_input_tokens_seen": 1080512,
"step": 1255
},
{
"epoch": 3.0508474576271185,
"grad_norm": 2.0000264644622803,
"learning_rate": 1.9838047740647026e-05,
"loss": 0.1971,
"num_input_tokens_seen": 1084608,
"step": 1260
},
{
"epoch": 3.062953995157385,
"grad_norm": 11.35123062133789,
"learning_rate": 1.9631433331771886e-05,
"loss": 0.1813,
"num_input_tokens_seen": 1089024,
"step": 1265
},
{
"epoch": 3.0750605326876514,
"grad_norm": 2.1008217334747314,
"learning_rate": 1.9425202633763513e-05,
"loss": 0.133,
"num_input_tokens_seen": 1093376,
"step": 1270
},
{
"epoch": 3.0871670702179177,
"grad_norm": 5.499813556671143,
"learning_rate": 1.9219370386675388e-05,
"loss": 0.089,
"num_input_tokens_seen": 1097728,
"step": 1275
},
{
"epoch": 3.099273607748184,
"grad_norm": 8.502225875854492,
"learning_rate": 1.901395130208229e-05,
"loss": 0.2836,
"num_input_tokens_seen": 1101888,
"step": 1280
},
{
"epoch": 3.11138014527845,
"grad_norm": 14.45283031463623,
"learning_rate": 1.880896006202876e-05,
"loss": 0.1116,
"num_input_tokens_seen": 1106176,
"step": 1285
},
{
"epoch": 3.123486682808717,
"grad_norm": 3.364891767501831,
"learning_rate": 1.860441131797977e-05,
"loss": 0.1027,
"num_input_tokens_seen": 1110272,
"step": 1290
},
{
"epoch": 3.135593220338983,
"grad_norm": 8.516124725341797,
"learning_rate": 1.8400319689773474e-05,
"loss": 0.1582,
"num_input_tokens_seen": 1114496,
"step": 1295
},
{
"epoch": 3.1476997578692494,
"grad_norm": 11.724932670593262,
"learning_rate": 1.8196699764576318e-05,
"loss": 0.0408,
"num_input_tokens_seen": 1118784,
"step": 1300
},
{
"epoch": 3.1598062953995156,
"grad_norm": 8.753253936767578,
"learning_rate": 1.7993566095840443e-05,
"loss": 0.1234,
"num_input_tokens_seen": 1123008,
"step": 1305
},
{
"epoch": 3.171912832929782,
"grad_norm": 8.221136093139648,
"learning_rate": 1.7790933202263434e-05,
"loss": 0.2236,
"num_input_tokens_seen": 1127424,
"step": 1310
},
{
"epoch": 3.1840193704600486,
"grad_norm": 17.435853958129883,
"learning_rate": 1.758881556675073e-05,
"loss": 0.1958,
"num_input_tokens_seen": 1131840,
"step": 1315
},
{
"epoch": 3.196125907990315,
"grad_norm": 5.691689491271973,
"learning_rate": 1.738722763538036e-05,
"loss": 0.1238,
"num_input_tokens_seen": 1136192,
"step": 1320
},
{
"epoch": 3.208232445520581,
"grad_norm": 2.6163206100463867,
"learning_rate": 1.7186183816370522e-05,
"loss": 0.1027,
"num_input_tokens_seen": 1140544,
"step": 1325
},
{
"epoch": 3.2203389830508473,
"grad_norm": 5.7949724197387695,
"learning_rate": 1.6985698479049702e-05,
"loss": 0.0907,
"num_input_tokens_seen": 1145280,
"step": 1330
},
{
"epoch": 3.232445520581114,
"grad_norm": 5.007083892822266,
"learning_rate": 1.6785785952829717e-05,
"loss": 0.1037,
"num_input_tokens_seen": 1149888,
"step": 1335
},
{
"epoch": 3.2445520581113803,
"grad_norm": 12.367361068725586,
"learning_rate": 1.6586460526181473e-05,
"loss": 0.1776,
"num_input_tokens_seen": 1153920,
"step": 1340
},
{
"epoch": 3.2566585956416465,
"grad_norm": 16.06878089904785,
"learning_rate": 1.6387736445613772e-05,
"loss": 0.2125,
"num_input_tokens_seen": 1158592,
"step": 1345
},
{
"epoch": 3.2687651331719128,
"grad_norm": 7.7484588623046875,
"learning_rate": 1.6189627914655008e-05,
"loss": 0.2252,
"num_input_tokens_seen": 1162816,
"step": 1350
},
{
"epoch": 3.2736077481840193,
"eval_loss": 0.2091810256242752,
"eval_runtime": 0.6785,
"eval_samples_per_second": 540.886,
"eval_steps_per_second": 67.795,
"num_input_tokens_seen": 1164544,
"step": 1352
},
{
"epoch": 3.280871670702179,
"grad_norm": 9.04961109161377,
"learning_rate": 1.599214909283805e-05,
"loss": 0.1163,
"num_input_tokens_seen": 1167232,
"step": 1355
},
{
"epoch": 3.2929782082324457,
"grad_norm": 3.317920446395874,
"learning_rate": 1.579531409468815e-05,
"loss": 0.1094,
"num_input_tokens_seen": 1171648,
"step": 1360
},
{
"epoch": 3.305084745762712,
"grad_norm": 8.250765800476074,
"learning_rate": 1.5599136988714186e-05,
"loss": 0.141,
"num_input_tokens_seen": 1175808,
"step": 1365
},
{
"epoch": 3.317191283292978,
"grad_norm": 5.985897541046143,
"learning_rate": 1.5403631796403085e-05,
"loss": 0.1296,
"num_input_tokens_seen": 1180224,
"step": 1370
},
{
"epoch": 3.3292978208232444,
"grad_norm": 4.8227314949035645,
"learning_rate": 1.520881249121767e-05,
"loss": 0.1375,
"num_input_tokens_seen": 1184704,
"step": 1375
},
{
"epoch": 3.341404358353511,
"grad_norm": 2.318727970123291,
"learning_rate": 1.5014692997597962e-05,
"loss": 0.1459,
"num_input_tokens_seen": 1188992,
"step": 1380
},
{
"epoch": 3.3535108958837774,
"grad_norm": 13.753244400024414,
"learning_rate": 1.4821287189965866e-05,
"loss": 0.1535,
"num_input_tokens_seen": 1193408,
"step": 1385
},
{
"epoch": 3.3656174334140436,
"grad_norm": 1.9978270530700684,
"learning_rate": 1.4628608891733625e-05,
"loss": 0.1246,
"num_input_tokens_seen": 1197760,
"step": 1390
},
{
"epoch": 3.37772397094431,
"grad_norm": 6.705835819244385,
"learning_rate": 1.4436671874315722e-05,
"loss": 0.0863,
"num_input_tokens_seen": 1201792,
"step": 1395
},
{
"epoch": 3.389830508474576,
"grad_norm": 7.748871326446533,
"learning_rate": 1.4245489856144634e-05,
"loss": 0.0968,
"num_input_tokens_seen": 1205824,
"step": 1400
},
{
"epoch": 3.401937046004843,
"grad_norm": 4.018503189086914,
"learning_rate": 1.4055076501690311e-05,
"loss": 0.0749,
"num_input_tokens_seen": 1210240,
"step": 1405
},
{
"epoch": 3.414043583535109,
"grad_norm": 4.750000953674316,
"learning_rate": 1.3865445420483526e-05,
"loss": 0.09,
"num_input_tokens_seen": 1214464,
"step": 1410
},
{
"epoch": 3.4261501210653753,
"grad_norm": 9.335100173950195,
"learning_rate": 1.367661016614315e-05,
"loss": 0.1746,
"num_input_tokens_seen": 1218752,
"step": 1415
},
{
"epoch": 3.4382566585956416,
"grad_norm": 4.242533206939697,
"learning_rate": 1.3488584235407439e-05,
"loss": 0.0826,
"num_input_tokens_seen": 1223168,
"step": 1420
},
{
"epoch": 3.450363196125908,
"grad_norm": 1.9875125885009766,
"learning_rate": 1.3301381067169366e-05,
"loss": 0.1469,
"num_input_tokens_seen": 1227328,
"step": 1425
},
{
"epoch": 3.4624697336561745,
"grad_norm": 10.304492950439453,
"learning_rate": 1.3115014041516089e-05,
"loss": 0.1454,
"num_input_tokens_seen": 1231360,
"step": 1430
},
{
"epoch": 3.4745762711864407,
"grad_norm": 2.467794418334961,
"learning_rate": 1.2929496478772635e-05,
"loss": 0.0455,
"num_input_tokens_seen": 1235456,
"step": 1435
},
{
"epoch": 3.486682808716707,
"grad_norm": 5.000001907348633,
"learning_rate": 1.2744841638549842e-05,
"loss": 0.106,
"num_input_tokens_seen": 1239616,
"step": 1440
},
{
"epoch": 3.4987893462469732,
"grad_norm": 0.32030388712882996,
"learning_rate": 1.2561062718796662e-05,
"loss": 0.0763,
"num_input_tokens_seen": 1243968,
"step": 1445
},
{
"epoch": 3.5108958837772395,
"grad_norm": 1.8182225227355957,
"learning_rate": 1.2378172854856831e-05,
"loss": 0.0978,
"num_input_tokens_seen": 1248128,
"step": 1450
},
{
"epoch": 3.523002421307506,
"grad_norm": 5.48933219909668,
"learning_rate": 1.2196185118530063e-05,
"loss": 0.1328,
"num_input_tokens_seen": 1252288,
"step": 1455
},
{
"epoch": 3.5254237288135593,
"eval_loss": 0.3491859436035156,
"eval_runtime": 0.6747,
"eval_samples_per_second": 543.942,
"eval_steps_per_second": 68.178,
"num_input_tokens_seen": 1253248,
"step": 1456
},
{
"epoch": 3.5351089588377724,
"grad_norm": 1.86709725856781,
"learning_rate": 1.2015112517137744e-05,
"loss": 0.1139,
"num_input_tokens_seen": 1256640,
"step": 1460
},
{
"epoch": 3.5472154963680387,
"grad_norm": 10.584001541137695,
"learning_rate": 1.183496799259326e-05,
"loss": 0.1247,
"num_input_tokens_seen": 1261440,
"step": 1465
},
{
"epoch": 3.559322033898305,
"grad_norm": 0.81782066822052,
"learning_rate": 1.1655764420476988e-05,
"loss": 0.0777,
"num_input_tokens_seen": 1265664,
"step": 1470
},
{
"epoch": 3.571428571428571,
"grad_norm": 4.23323917388916,
"learning_rate": 1.1477514609116039e-05,
"loss": 0.0848,
"num_input_tokens_seen": 1270016,
"step": 1475
},
{
"epoch": 3.583535108958838,
"grad_norm": 4.22898006439209,
"learning_rate": 1.1300231298668786e-05,
"loss": 0.1263,
"num_input_tokens_seen": 1274560,
"step": 1480
},
{
"epoch": 3.595641646489104,
"grad_norm": 7.585851669311523,
"learning_rate": 1.1123927160214289e-05,
"loss": 0.1362,
"num_input_tokens_seen": 1278976,
"step": 1485
},
{
"epoch": 3.6077481840193704,
"grad_norm": 2.0685174465179443,
"learning_rate": 1.0948614794846668e-05,
"loss": 0.1068,
"num_input_tokens_seen": 1283200,
"step": 1490
},
{
"epoch": 3.619854721549637,
"grad_norm": 4.345080852508545,
"learning_rate": 1.0774306732774414e-05,
"loss": 0.2069,
"num_input_tokens_seen": 1287296,
"step": 1495
},
{
"epoch": 3.6319612590799033,
"grad_norm": 15.997807502746582,
"learning_rate": 1.0601015432424819e-05,
"loss": 0.1368,
"num_input_tokens_seen": 1291712,
"step": 1500
},
{
"epoch": 3.6440677966101696,
"grad_norm": 6.712691783905029,
"learning_rate": 1.042875327955356e-05,
"loss": 0.1959,
"num_input_tokens_seen": 1295936,
"step": 1505
},
{
"epoch": 3.656174334140436,
"grad_norm": 5.0442962646484375,
"learning_rate": 1.0257532586359422e-05,
"loss": 0.0932,
"num_input_tokens_seen": 1300608,
"step": 1510
},
{
"epoch": 3.668280871670702,
"grad_norm": 5.707069396972656,
"learning_rate": 1.0087365590604289e-05,
"loss": 0.1347,
"num_input_tokens_seen": 1305024,
"step": 1515
},
{
"epoch": 3.6803874092009687,
"grad_norm": 2.964393138885498,
"learning_rate": 9.918264454738504e-06,
"loss": 0.1287,
"num_input_tokens_seen": 1309376,
"step": 1520
},
{
"epoch": 3.692493946731235,
"grad_norm": 10.144442558288574,
"learning_rate": 9.75024126503153e-06,
"loss": 0.0818,
"num_input_tokens_seen": 1313664,
"step": 1525
},
{
"epoch": 3.7046004842615012,
"grad_norm": 8.710615158081055,
"learning_rate": 9.583308030708135e-06,
"loss": 0.0869,
"num_input_tokens_seen": 1318080,
"step": 1530
},
{
"epoch": 3.7167070217917675,
"grad_norm": 2.1846084594726562,
"learning_rate": 9.417476683090007e-06,
"loss": 0.0893,
"num_input_tokens_seen": 1322432,
"step": 1535
},
{
"epoch": 3.7288135593220337,
"grad_norm": 3.826754570007324,
"learning_rate": 9.252759074743034e-06,
"loss": 0.1556,
"num_input_tokens_seen": 1326848,
"step": 1540
},
{
"epoch": 3.7409200968523004,
"grad_norm": 10.382698059082031,
"learning_rate": 9.08916697863014e-06,
"loss": 0.0774,
"num_input_tokens_seen": 1331328,
"step": 1545
},
{
"epoch": 3.7530266343825667,
"grad_norm": 7.099722862243652,
"learning_rate": 8.926712087269801e-06,
"loss": 0.1253,
"num_input_tokens_seen": 1335424,
"step": 1550
},
{
"epoch": 3.765133171912833,
"grad_norm": 5.015311241149902,
"learning_rate": 8.765406011900368e-06,
"loss": 0.1276,
"num_input_tokens_seen": 1339712,
"step": 1555
},
{
"epoch": 3.777239709443099,
"grad_norm": 4.82669734954834,
"learning_rate": 8.605260281650152e-06,
"loss": 0.1842,
"num_input_tokens_seen": 1344000,
"step": 1560
},
{
"epoch": 3.777239709443099,
"eval_loss": 0.21899566054344177,
"eval_runtime": 0.6796,
"eval_samples_per_second": 539.994,
"eval_steps_per_second": 67.683,
"num_input_tokens_seen": 1344000,
"step": 1560
},
{
"epoch": 3.7893462469733654,
"grad_norm": 3.010295867919922,
"learning_rate": 8.446286342713419e-06,
"loss": 0.0881,
"num_input_tokens_seen": 1348224,
"step": 1565
},
{
"epoch": 3.801452784503632,
"grad_norm": 2.3779475688934326,
"learning_rate": 8.288495557532241e-06,
"loss": 0.1348,
"num_input_tokens_seen": 1352576,
"step": 1570
},
{
"epoch": 3.8135593220338984,
"grad_norm": 6.911816120147705,
"learning_rate": 8.131899203984463e-06,
"loss": 0.134,
"num_input_tokens_seen": 1356864,
"step": 1575
},
{
"epoch": 3.8256658595641646,
"grad_norm": 9.250137329101562,
"learning_rate": 7.976508474577548e-06,
"loss": 0.1141,
"num_input_tokens_seen": 1361152,
"step": 1580
},
{
"epoch": 3.837772397094431,
"grad_norm": 4.86985445022583,
"learning_rate": 7.822334475648654e-06,
"loss": 0.0705,
"num_input_tokens_seen": 1365376,
"step": 1585
},
{
"epoch": 3.849878934624697,
"grad_norm": 0.7732688188552856,
"learning_rate": 7.669388226570809e-06,
"loss": 0.0907,
"num_input_tokens_seen": 1369728,
"step": 1590
},
{
"epoch": 3.861985472154964,
"grad_norm": 5.062341213226318,
"learning_rate": 7.517680658965329e-06,
"loss": 0.1261,
"num_input_tokens_seen": 1374144,
"step": 1595
},
{
"epoch": 3.87409200968523,
"grad_norm": 8.762838363647461,
"learning_rate": 7.367222615920477e-06,
"loss": 0.1084,
"num_input_tokens_seen": 1378368,
"step": 1600
},
{
"epoch": 3.8861985472154963,
"grad_norm": 8.905739784240723,
"learning_rate": 7.2180248512164896e-06,
"loss": 0.0813,
"num_input_tokens_seen": 1382464,
"step": 1605
},
{
"epoch": 3.898305084745763,
"grad_norm": 0.5714547038078308,
"learning_rate": 7.070098028556948e-06,
"loss": 0.0805,
"num_input_tokens_seen": 1386880,
"step": 1610
},
{
"epoch": 3.910411622276029,
"grad_norm": 8.167064666748047,
"learning_rate": 6.923452720806611e-06,
"loss": 0.1924,
"num_input_tokens_seen": 1391296,
"step": 1615
},
{
"epoch": 3.9225181598062955,
"grad_norm": 3.438431739807129,
"learning_rate": 6.778099409235739e-06,
"loss": 0.0609,
"num_input_tokens_seen": 1395456,
"step": 1620
},
{
"epoch": 3.9346246973365617,
"grad_norm": 7.784511089324951,
"learning_rate": 6.634048482770946e-06,
"loss": 0.0932,
"num_input_tokens_seen": 1399616,
"step": 1625
},
{
"epoch": 3.946731234866828,
"grad_norm": 13.272894859313965,
"learning_rate": 6.491310237252679e-06,
"loss": 0.1241,
"num_input_tokens_seen": 1403712,
"step": 1630
},
{
"epoch": 3.9588377723970947,
"grad_norm": 12.38925838470459,
"learning_rate": 6.349894874699344e-06,
"loss": 0.1232,
"num_input_tokens_seen": 1408128,
"step": 1635
},
{
"epoch": 3.970944309927361,
"grad_norm": 5.343148231506348,
"learning_rate": 6.209812502578114e-06,
"loss": 0.0787,
"num_input_tokens_seen": 1412480,
"step": 1640
},
{
"epoch": 3.983050847457627,
"grad_norm": 1.2886254787445068,
"learning_rate": 6.071073133082492e-06,
"loss": 0.0494,
"num_input_tokens_seen": 1416704,
"step": 1645
},
{
"epoch": 3.9951573849878934,
"grad_norm": 10.778816223144531,
"learning_rate": 5.933686682416758e-06,
"loss": 0.0969,
"num_input_tokens_seen": 1421120,
"step": 1650
},
{
"epoch": 4.00726392251816,
"grad_norm": 0.2529144883155823,
"learning_rate": 5.797662970087184e-06,
"loss": 0.09,
"num_input_tokens_seen": 1424944,
"step": 1655
},
{
"epoch": 4.019370460048426,
"grad_norm": 6.2160162925720215,
"learning_rate": 5.663011718200201e-06,
"loss": 0.0897,
"num_input_tokens_seen": 1429296,
"step": 1660
},
{
"epoch": 4.0290556900726395,
"eval_loss": 0.2532218098640442,
"eval_runtime": 0.672,
"eval_samples_per_second": 546.104,
"eval_steps_per_second": 68.449,
"num_input_tokens_seen": 1432880,
"step": 1664
},
{
"epoch": 4.031476997578692,
"grad_norm": 0.9374585747718811,
"learning_rate": 5.529742550767544e-06,
"loss": 0.0316,
"num_input_tokens_seen": 1433776,
"step": 1665
},
{
"epoch": 4.043583535108959,
"grad_norm": 1.9009536504745483,
"learning_rate": 5.397864993018367e-06,
"loss": 0.0492,
"num_input_tokens_seen": 1438000,
"step": 1670
},
{
"epoch": 4.0556900726392255,
"grad_norm": 7.239864349365234,
"learning_rate": 5.267388470718449e-06,
"loss": 0.029,
"num_input_tokens_seen": 1442352,
"step": 1675
},
{
"epoch": 4.067796610169491,
"grad_norm": 2.098872661590576,
"learning_rate": 5.138322309496504e-06,
"loss": 0.052,
"num_input_tokens_seen": 1446704,
"step": 1680
},
{
"epoch": 4.079903147699758,
"grad_norm": 1.4036399126052856,
"learning_rate": 5.010675734177631e-06,
"loss": 0.0469,
"num_input_tokens_seen": 1450864,
"step": 1685
},
{
"epoch": 4.092009685230024,
"grad_norm": 11.33265495300293,
"learning_rate": 4.884457868124001e-06,
"loss": 0.0316,
"num_input_tokens_seen": 1455088,
"step": 1690
},
{
"epoch": 4.1041162227602905,
"grad_norm": 1.9709900617599487,
"learning_rate": 4.759677732582782e-06,
"loss": 0.0228,
"num_input_tokens_seen": 1459376,
"step": 1695
},
{
"epoch": 4.116222760290557,
"grad_norm": 0.01155536063015461,
"learning_rate": 4.636344246041321e-06,
"loss": 0.0529,
"num_input_tokens_seen": 1463600,
"step": 1700
},
{
"epoch": 4.128329297820823,
"grad_norm": 19.08058738708496,
"learning_rate": 4.514466223589753e-06,
"loss": 0.0565,
"num_input_tokens_seen": 1468080,
"step": 1705
},
{
"epoch": 4.14043583535109,
"grad_norm": 1.3092641830444336,
"learning_rate": 4.3940523762909135e-06,
"loss": 0.0695,
"num_input_tokens_seen": 1472624,
"step": 1710
},
{
"epoch": 4.1525423728813555,
"grad_norm": 0.055544547736644745,
"learning_rate": 4.275111310557758e-06,
"loss": 0.0511,
"num_input_tokens_seen": 1477040,
"step": 1715
},
{
"epoch": 4.164648910411622,
"grad_norm": 0.16590368747711182,
"learning_rate": 4.1576515275382226e-06,
"loss": 0.0311,
"num_input_tokens_seen": 1481328,
"step": 1720
},
{
"epoch": 4.176755447941889,
"grad_norm": 0.1331050992012024,
"learning_rate": 4.0416814225076035e-06,
"loss": 0.0394,
"num_input_tokens_seen": 1485808,
"step": 1725
},
{
"epoch": 4.188861985472155,
"grad_norm": 1.6521071195602417,
"learning_rate": 3.9272092842685345e-06,
"loss": 0.0255,
"num_input_tokens_seen": 1490160,
"step": 1730
},
{
"epoch": 4.200968523002421,
"grad_norm": 0.42354145646095276,
"learning_rate": 3.814243294558542e-06,
"loss": 0.0073,
"num_input_tokens_seen": 1494512,
"step": 1735
},
{
"epoch": 4.213075060532688,
"grad_norm": 2.2178032398223877,
"learning_rate": 3.702791527465274e-06,
"loss": 0.0562,
"num_input_tokens_seen": 1498480,
"step": 1740
},
{
"epoch": 4.225181598062954,
"grad_norm": 13.911809921264648,
"learning_rate": 3.592861948849416e-06,
"loss": 0.0463,
"num_input_tokens_seen": 1502768,
"step": 1745
},
{
"epoch": 4.237288135593221,
"grad_norm": 0.01323059480637312,
"learning_rate": 3.484462415775333e-06,
"loss": 0.0429,
"num_input_tokens_seen": 1506992,
"step": 1750
},
{
"epoch": 4.249394673123486,
"grad_norm": 0.1997198611497879,
"learning_rate": 3.377600675949527e-06,
"loss": 0.0035,
"num_input_tokens_seen": 1511472,
"step": 1755
},
{
"epoch": 4.261501210653753,
"grad_norm": 9.309453010559082,
"learning_rate": 3.272284367166825e-06,
"loss": 0.0395,
"num_input_tokens_seen": 1515824,
"step": 1760
},
{
"epoch": 4.27360774818402,
"grad_norm": 1.514168620109558,
"learning_rate": 3.1685210167645335e-06,
"loss": 0.0337,
"num_input_tokens_seen": 1520176,
"step": 1765
},
{
"epoch": 4.280871670702179,
"eval_loss": 0.4314914643764496,
"eval_runtime": 0.8115,
"eval_samples_per_second": 452.254,
"eval_steps_per_second": 56.686,
"num_input_tokens_seen": 1522544,
"step": 1768
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.23039455711841583,
"learning_rate": 3.0663180410843982e-06,
"loss": 0.008,
"num_input_tokens_seen": 1524336,
"step": 1770
},
{
"epoch": 4.297820823244552,
"grad_norm": 0.17007200419902802,
"learning_rate": 2.9656827449425494e-06,
"loss": 0.1379,
"num_input_tokens_seen": 1528560,
"step": 1775
},
{
"epoch": 4.309927360774818,
"grad_norm": 5.092523097991943,
"learning_rate": 2.86662232110739e-06,
"loss": 0.0391,
"num_input_tokens_seen": 1532720,
"step": 1780
},
{
"epoch": 4.322033898305085,
"grad_norm": 8.858246803283691,
"learning_rate": 2.7691438497855134e-06,
"loss": 0.0481,
"num_input_tokens_seen": 1536944,
"step": 1785
},
{
"epoch": 4.3341404358353515,
"grad_norm": 0.16653333604335785,
"learning_rate": 2.673254298115646e-06,
"loss": 0.0365,
"num_input_tokens_seen": 1541168,
"step": 1790
},
{
"epoch": 4.346246973365617,
"grad_norm": 0.057360630482435226,
"learning_rate": 2.5789605196706674e-06,
"loss": 0.0094,
"num_input_tokens_seen": 1545456,
"step": 1795
},
{
"epoch": 4.358353510895884,
"grad_norm": 18.321725845336914,
"learning_rate": 2.4862692539677906e-06,
"loss": 0.0798,
"num_input_tokens_seen": 1549872,
"step": 1800
},
{
"epoch": 4.37046004842615,
"grad_norm": 0.05611402168869972,
"learning_rate": 2.3951871259868503e-06,
"loss": 0.113,
"num_input_tokens_seen": 1554288,
"step": 1805
},
{
"epoch": 4.3825665859564165,
"grad_norm": 7.665430068969727,
"learning_rate": 2.3057206456967905e-06,
"loss": 0.1113,
"num_input_tokens_seen": 1558384,
"step": 1810
},
{
"epoch": 4.394673123486683,
"grad_norm": 9.430697441101074,
"learning_rate": 2.217876207590375e-06,
"loss": 0.0523,
"num_input_tokens_seen": 1562544,
"step": 1815
},
{
"epoch": 4.406779661016949,
"grad_norm": 0.0549406073987484,
"learning_rate": 2.131660090227139e-06,
"loss": 0.0659,
"num_input_tokens_seen": 1567216,
"step": 1820
},
{
"epoch": 4.418886198547216,
"grad_norm": 0.08962647616863251,
"learning_rate": 2.0470784557846652e-06,
"loss": 0.0756,
"num_input_tokens_seen": 1571568,
"step": 1825
},
{
"epoch": 4.4309927360774815,
"grad_norm": 0.09955435991287231,
"learning_rate": 1.964137349618114e-06,
"loss": 0.0018,
"num_input_tokens_seen": 1575792,
"step": 1830
},
{
"epoch": 4.443099273607748,
"grad_norm": 0.7829030156135559,
"learning_rate": 1.8828426998281689e-06,
"loss": 0.0419,
"num_input_tokens_seen": 1580080,
"step": 1835
},
{
"epoch": 4.455205811138015,
"grad_norm": 3.134791851043701,
"learning_rate": 1.8032003168373306e-06,
"loss": 0.0692,
"num_input_tokens_seen": 1584112,
"step": 1840
},
{
"epoch": 4.467312348668281,
"grad_norm": 1.7574918270111084,
"learning_rate": 1.7252158929746131e-06,
"loss": 0.0456,
"num_input_tokens_seen": 1588400,
"step": 1845
},
{
"epoch": 4.479418886198547,
"grad_norm": 27.999475479125977,
"learning_rate": 1.6488950020686955e-06,
"loss": 0.0504,
"num_input_tokens_seen": 1592816,
"step": 1850
},
{
"epoch": 4.491525423728813,
"grad_norm": 0.16101866960525513,
"learning_rate": 1.5742430990495466e-06,
"loss": 0.0573,
"num_input_tokens_seen": 1597296,
"step": 1855
},
{
"epoch": 4.50363196125908,
"grad_norm": 0.11599753797054291,
"learning_rate": 1.5012655195585368e-06,
"loss": 0.0293,
"num_input_tokens_seen": 1601648,
"step": 1860
},
{
"epoch": 4.5157384987893465,
"grad_norm": 6.17954683303833,
"learning_rate": 1.4299674795670764e-06,
"loss": 0.1156,
"num_input_tokens_seen": 1605936,
"step": 1865
},
{
"epoch": 4.527845036319612,
"grad_norm": 1.049548625946045,
"learning_rate": 1.360354075003828e-06,
"loss": 0.126,
"num_input_tokens_seen": 1610096,
"step": 1870
},
{
"epoch": 4.532687651331719,
"eval_loss": 0.42201921343803406,
"eval_runtime": 0.693,
"eval_samples_per_second": 529.558,
"eval_steps_per_second": 66.375,
"num_input_tokens_seen": 1611760,
"step": 1872
},
{
"epoch": 4.539951573849879,
"grad_norm": 13.409820556640625,
"learning_rate": 1.2924302813904582e-06,
"loss": 0.0436,
"num_input_tokens_seen": 1614384,
"step": 1875
},
{
"epoch": 4.552058111380145,
"grad_norm": 3.9212989807128906,
"learning_rate": 1.226200953486037e-06,
"loss": 0.0591,
"num_input_tokens_seen": 1618800,
"step": 1880
},
{
"epoch": 4.5641646489104115,
"grad_norm": 0.7789947986602783,
"learning_rate": 1.1616708249400449e-06,
"loss": 0.0027,
"num_input_tokens_seen": 1622960,
"step": 1885
},
{
"epoch": 4.576271186440678,
"grad_norm": 16.51002311706543,
"learning_rate": 1.0988445079540388e-06,
"loss": 0.037,
"num_input_tokens_seen": 1627056,
"step": 1890
},
{
"epoch": 4.588377723970944,
"grad_norm": 0.03825072944164276,
"learning_rate": 1.0377264929520125e-06,
"loss": 0.0205,
"num_input_tokens_seen": 1631408,
"step": 1895
},
{
"epoch": 4.600484261501211,
"grad_norm": 13.03893756866455,
"learning_rate": 9.783211482594285e-07,
"loss": 0.0687,
"num_input_tokens_seen": 1635888,
"step": 1900
},
{
"epoch": 4.6125907990314765,
"grad_norm": 0.19233529269695282,
"learning_rate": 9.206327197910203e-07,
"loss": 0.0049,
"num_input_tokens_seen": 1640176,
"step": 1905
},
{
"epoch": 4.624697336561743,
"grad_norm": 9.149880409240723,
"learning_rate": 8.646653307473079e-07,
"loss": 0.056,
"num_input_tokens_seen": 1644528,
"step": 1910
},
{
"epoch": 4.63680387409201,
"grad_norm": 0.09057964384555817,
"learning_rate": 8.10422981319911e-07,
"loss": 0.002,
"num_input_tokens_seen": 1649264,
"step": 1915
},
{
"epoch": 4.648910411622276,
"grad_norm": 0.645796537399292,
"learning_rate": 7.579095484056192e-07,
"loss": 0.0111,
"num_input_tokens_seen": 1653808,
"step": 1920
},
{
"epoch": 4.661016949152542,
"grad_norm": 0.02393440343439579,
"learning_rate": 7.07128785329314e-07,
"loss": 0.0023,
"num_input_tokens_seen": 1658288,
"step": 1925
},
{
"epoch": 4.673123486682809,
"grad_norm": 0.03354793041944504,
"learning_rate": 6.580843215757082e-07,
"loss": 0.0228,
"num_input_tokens_seen": 1662576,
"step": 1930
},
{
"epoch": 4.685230024213075,
"grad_norm": 1.0874401330947876,
"learning_rate": 6.107796625299117e-07,
"loss": 0.0221,
"num_input_tokens_seen": 1667056,
"step": 1935
},
{
"epoch": 4.697336561743342,
"grad_norm": 0.94743412733078,
"learning_rate": 5.652181892269181e-07,
"loss": 0.0733,
"num_input_tokens_seen": 1671536,
"step": 1940
},
{
"epoch": 4.709443099273607,
"grad_norm": 0.04832937568426132,
"learning_rate": 5.214031581099149e-07,
"loss": 0.0023,
"num_input_tokens_seen": 1675888,
"step": 1945
},
{
"epoch": 4.721549636803874,
"grad_norm": 12.764242172241211,
"learning_rate": 4.793377007975719e-07,
"loss": 0.0341,
"num_input_tokens_seen": 1680176,
"step": 1950
},
{
"epoch": 4.733656174334141,
"grad_norm": 6.990570068359375,
"learning_rate": 4.3902482386018186e-07,
"loss": 0.0568,
"num_input_tokens_seen": 1684400,
"step": 1955
},
{
"epoch": 4.745762711864407,
"grad_norm": 26.958158493041992,
"learning_rate": 4.004674086047905e-07,
"loss": 0.1211,
"num_input_tokens_seen": 1688816,
"step": 1960
},
{
"epoch": 4.757869249394673,
"grad_norm": 1.086872935295105,
"learning_rate": 3.636682108692502e-07,
"loss": 0.0408,
"num_input_tokens_seen": 1693360,
"step": 1965
},
{
"epoch": 4.76997578692494,
"grad_norm": 15.128409385681152,
"learning_rate": 3.2862986082524416e-07,
"loss": 0.0647,
"num_input_tokens_seen": 1697584,
"step": 1970
},
{
"epoch": 4.782082324455206,
"grad_norm": 7.18263053894043,
"learning_rate": 2.953548627903202e-07,
"loss": 0.0336,
"num_input_tokens_seen": 1702000,
"step": 1975
},
{
"epoch": 4.784503631961259,
"eval_loss": 0.4348176121711731,
"eval_runtime": 0.6821,
"eval_samples_per_second": 538.039,
"eval_steps_per_second": 67.438,
"num_input_tokens_seen": 1702832,
"step": 1976
},
{
"epoch": 4.7941888619854724,
"grad_norm": 0.357972115278244,
"learning_rate": 2.6384559504886166e-07,
"loss": 0.1448,
"num_input_tokens_seen": 1706416,
"step": 1980
},
{
"epoch": 4.806295399515738,
"grad_norm": 5.933152198791504,
"learning_rate": 2.3410430968214824e-07,
"loss": 0.0163,
"num_input_tokens_seen": 1710960,
"step": 1985
},
{
"epoch": 4.818401937046005,
"grad_norm": 21.378908157348633,
"learning_rate": 2.0613313240735454e-07,
"loss": 0.1048,
"num_input_tokens_seen": 1715440,
"step": 1990
},
{
"epoch": 4.830508474576272,
"grad_norm": 0.03769972547888756,
"learning_rate": 1.7993406242563238e-07,
"loss": 0.0295,
"num_input_tokens_seen": 1719728,
"step": 1995
},
{
"epoch": 4.842615012106537,
"grad_norm": 0.04536456614732742,
"learning_rate": 1.5550897227922523e-07,
"loss": 0.0007,
"num_input_tokens_seen": 1724272,
"step": 2000
},
{
"epoch": 4.854721549636804,
"grad_norm": 12.351763725280762,
"learning_rate": 1.3285960771761697e-07,
"loss": 0.064,
"num_input_tokens_seen": 1728560,
"step": 2005
},
{
"epoch": 4.86682808716707,
"grad_norm": 11.032571792602539,
"learning_rate": 1.119875875727705e-07,
"loss": 0.0289,
"num_input_tokens_seen": 1733104,
"step": 2010
},
{
"epoch": 4.878934624697337,
"grad_norm": 21.032617568969727,
"learning_rate": 9.289440364341485e-08,
"loss": 0.0127,
"num_input_tokens_seen": 1737264,
"step": 2015
},
{
"epoch": 4.891041162227603,
"grad_norm": 3.019296169281006,
"learning_rate": 7.558142058842754e-08,
"loss": 0.0664,
"num_input_tokens_seen": 1741424,
"step": 2020
},
{
"epoch": 4.903147699757869,
"grad_norm": 0.06446848809719086,
"learning_rate": 6.004987582929055e-08,
"loss": 0.0657,
"num_input_tokens_seen": 1745648,
"step": 2025
},
{
"epoch": 4.915254237288136,
"grad_norm": 15.37187385559082,
"learning_rate": 4.63008794616554e-08,
"loss": 0.045,
"num_input_tokens_seen": 1749872,
"step": 2030
},
{
"epoch": 4.927360774818402,
"grad_norm": 0.0873086079955101,
"learning_rate": 3.433541417599551e-08,
"loss": 0.0431,
"num_input_tokens_seen": 1754288,
"step": 2035
},
{
"epoch": 4.939467312348668,
"grad_norm": 0.19230781495571136,
"learning_rate": 2.4154335187365207e-08,
"loss": 0.0332,
"num_input_tokens_seen": 1758640,
"step": 2040
},
{
"epoch": 4.951573849878935,
"grad_norm": 0.0936799943447113,
"learning_rate": 1.5758370174284722e-08,
"loss": 0.0602,
"num_input_tokens_seen": 1762928,
"step": 2045
},
{
"epoch": 4.963680387409201,
"grad_norm": 0.07743958383798599,
"learning_rate": 9.14811922672898e-09,
"loss": 0.0118,
"num_input_tokens_seen": 1767344,
"step": 2050
},
{
"epoch": 4.9757869249394675,
"grad_norm": 0.36676138639450073,
"learning_rate": 4.324054803223065e-09,
"loss": 0.0392,
"num_input_tokens_seen": 1771632,
"step": 2055
},
{
"epoch": 4.987893462469733,
"grad_norm": 11.666873931884766,
"learning_rate": 1.286521697091425e-09,
"loss": 0.0333,
"num_input_tokens_seen": 1775728,
"step": 2060
},
{
"epoch": 5.0,
"grad_norm": 0.10359911620616913,
"learning_rate": 3.5737011805370145e-11,
"loss": 0.0653,
"num_input_tokens_seen": 1780000,
"step": 2065
},
{
"epoch": 5.0,
"num_input_tokens_seen": 1780000,
"step": 2065,
"total_flos": 1.039320047616e+16,
"train_loss": 0.16683834154997698,
"train_runtime": 1017.6301,
"train_samples_per_second": 16.219,
"train_steps_per_second": 2.029
}
],
"logging_steps": 5,
"max_steps": 2065,
"num_input_tokens_seen": 1780000,
"num_train_epochs": 5,
"save_steps": 104,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.039320047616e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}