Model: phanviethoang1512/llama3.2-1b-deita-dpo-student_sft_init
trainer_state.json: 2462 lines, 60 KiB, JSON
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1713,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008764241893076249,
      "grad_norm": 5.367548942565918,
      "learning_rate": 4.651162790697675e-07,
      "loss": 1.491,
      "step": 5
    },
    {
      "epoch": 0.017528483786152498,
      "grad_norm": 3.861912488937378,
      "learning_rate": 1.0465116279069768e-06,
      "loss": 1.4607,
      "step": 10
    },
    {
      "epoch": 0.026292725679228746,
      "grad_norm": 2.542222738265991,
      "learning_rate": 1.6279069767441862e-06,
      "loss": 1.4704,
      "step": 15
    },
    {
      "epoch": 0.035056967572304996,
      "grad_norm": 2.1328587532043457,
      "learning_rate": 2.2093023255813954e-06,
      "loss": 1.4085,
      "step": 20
    },
    {
      "epoch": 0.04382120946538125,
      "grad_norm": 1.5579408407211304,
      "learning_rate": 2.790697674418605e-06,
      "loss": 1.3603,
      "step": 25
    },
    {
      "epoch": 0.05258545135845749,
      "grad_norm": 1.6026604175567627,
      "learning_rate": 3.372093023255814e-06,
      "loss": 1.3568,
      "step": 30
    },
    {
      "epoch": 0.06134969325153374,
      "grad_norm": 1.5183006525039673,
      "learning_rate": 3.953488372093024e-06,
      "loss": 1.3702,
      "step": 35
    },
    {
      "epoch": 0.07011393514460999,
      "grad_norm": 1.416035532951355,
      "learning_rate": 4.5348837209302326e-06,
      "loss": 1.3288,
      "step": 40
    },
    {
      "epoch": 0.07887817703768624,
      "grad_norm": 1.4895626306533813,
      "learning_rate": 5.116279069767442e-06,
      "loss": 1.3292,
      "step": 45
    },
    {
      "epoch": 0.0876424189307625,
      "grad_norm": 1.3430354595184326,
      "learning_rate": 5.697674418604652e-06,
      "loss": 1.3227,
      "step": 50
    },
    {
      "epoch": 0.09640666082383874,
      "grad_norm": 1.4117517471313477,
      "learning_rate": 6.279069767441861e-06,
      "loss": 1.2902,
      "step": 55
    },
    {
      "epoch": 0.10517090271691498,
      "grad_norm": 1.3359665870666504,
      "learning_rate": 6.86046511627907e-06,
      "loss": 1.3327,
      "step": 60
    },
    {
      "epoch": 0.11393514460999124,
      "grad_norm": 1.4718199968338013,
      "learning_rate": 7.44186046511628e-06,
      "loss": 1.2973,
      "step": 65
    },
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 1.2470380067825317,
      "learning_rate": 8.023255813953488e-06,
      "loss": 1.2706,
      "step": 70
    },
    {
      "epoch": 0.13146362839614373,
      "grad_norm": 1.324803352355957,
      "learning_rate": 8.604651162790698e-06,
      "loss": 1.2178,
      "step": 75
    },
    {
      "epoch": 0.14022787028921999,
      "grad_norm": 1.3574628829956055,
      "learning_rate": 9.186046511627908e-06,
      "loss": 1.2316,
      "step": 80
    },
    {
      "epoch": 0.14899211218229624,
      "grad_norm": 1.3636841773986816,
      "learning_rate": 9.767441860465117e-06,
      "loss": 1.283,
      "step": 85
    },
    {
      "epoch": 0.15775635407537247,
      "grad_norm": 1.7021708488464355,
      "learning_rate": 1.0348837209302327e-05,
      "loss": 1.2635,
      "step": 90
    },
    {
      "epoch": 0.16652059596844873,
      "grad_norm": 1.243608832359314,
      "learning_rate": 1.0930232558139535e-05,
      "loss": 1.2079,
      "step": 95
    },
    {
      "epoch": 0.175284837861525,
      "grad_norm": 1.8144162893295288,
      "learning_rate": 1.1511627906976746e-05,
      "loss": 1.2186,
      "step": 100
    },
    {
      "epoch": 0.18404907975460122,
      "grad_norm": 1.1823457479476929,
      "learning_rate": 1.2093023255813954e-05,
      "loss": 1.2103,
      "step": 105
    },
    {
      "epoch": 0.19281332164767748,
      "grad_norm": 1.198132872581482,
      "learning_rate": 1.2674418604651164e-05,
      "loss": 1.2044,
      "step": 110
    },
    {
      "epoch": 0.20157756354075373,
      "grad_norm": 11.093875885009766,
      "learning_rate": 1.3255813953488372e-05,
      "loss": 1.1683,
      "step": 115
    },
    {
      "epoch": 0.21034180543382996,
      "grad_norm": 1.0984971523284912,
      "learning_rate": 1.3837209302325583e-05,
      "loss": 1.2289,
      "step": 120
    },
    {
      "epoch": 0.21910604732690622,
      "grad_norm": 1.2427825927734375,
      "learning_rate": 1.441860465116279e-05,
      "loss": 1.1449,
      "step": 125
    },
    {
      "epoch": 0.22787028921998248,
      "grad_norm": 1.2608261108398438,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.1524,
      "step": 130
    },
    {
      "epoch": 0.2366345311130587,
      "grad_norm": 1.114823818206787,
      "learning_rate": 1.558139534883721e-05,
      "loss": 1.1769,
      "step": 135
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 1.1239306926727295,
      "learning_rate": 1.616279069767442e-05,
      "loss": 1.1516,
      "step": 140
    },
    {
      "epoch": 0.2541630148992112,
      "grad_norm": 1.1203042268753052,
      "learning_rate": 1.674418604651163e-05,
      "loss": 1.1243,
      "step": 145
    },
    {
      "epoch": 0.26292725679228746,
      "grad_norm": 1.0693674087524414,
      "learning_rate": 1.7325581395348837e-05,
      "loss": 1.1574,
      "step": 150
    },
    {
      "epoch": 0.27169149868536374,
      "grad_norm": 1.1013996601104736,
      "learning_rate": 1.790697674418605e-05,
      "loss": 1.1621,
      "step": 155
    },
    {
      "epoch": 0.28045574057843997,
      "grad_norm": 1.1914992332458496,
      "learning_rate": 1.8488372093023256e-05,
      "loss": 1.1468,
      "step": 160
    },
    {
      "epoch": 0.2892199824715162,
      "grad_norm": 1.1144826412200928,
      "learning_rate": 1.9069767441860468e-05,
      "loss": 1.153,
      "step": 165
    },
    {
      "epoch": 0.2979842243645925,
      "grad_norm": 1.1576107740402222,
      "learning_rate": 1.9651162790697676e-05,
      "loss": 1.151,
      "step": 170
    },
    {
      "epoch": 0.3067484662576687,
      "grad_norm": 1.037223219871521,
      "learning_rate": 1.999991687649223e-05,
      "loss": 1.1386,
      "step": 175
    },
    {
      "epoch": 0.31551270815074495,
      "grad_norm": 1.1823593378067017,
      "learning_rate": 1.999898175290004e-05,
      "loss": 1.1368,
      "step": 180
    },
    {
      "epoch": 0.32427695004382123,
      "grad_norm": 1.0528627634048462,
      "learning_rate": 1.9997007698817558e-05,
      "loss": 1.183,
      "step": 185
    },
    {
      "epoch": 0.33304119193689746,
      "grad_norm": 1.1595643758773804,
      "learning_rate": 1.9993994919356167e-05,
      "loss": 1.1687,
      "step": 190
    },
    {
      "epoch": 0.3418054338299737,
      "grad_norm": 1.0715525150299072,
      "learning_rate": 1.9989943727554597e-05,
      "loss": 1.1648,
      "step": 195
    },
    {
      "epoch": 0.35056967572305,
      "grad_norm": 1.0216760635375977,
      "learning_rate": 1.9984854544346367e-05,
      "loss": 1.1587,
      "step": 200
    },
    {
      "epoch": 0.3593339176161262,
      "grad_norm": 1.0617367029190063,
      "learning_rate": 1.9978727898516087e-05,
      "loss": 1.145,
      "step": 205
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 1.0882047414779663,
      "learning_rate": 1.997156442664449e-05,
      "loss": 1.1652,
      "step": 210
    },
    {
      "epoch": 0.3768624014022787,
      "grad_norm": 1.154795527458191,
      "learning_rate": 1.9963364873042298e-05,
      "loss": 1.135,
      "step": 215
    },
    {
      "epoch": 0.38562664329535495,
      "grad_norm": 1.0904324054718018,
      "learning_rate": 1.9954130089672893e-05,
      "loss": 1.1262,
      "step": 220
    },
    {
      "epoch": 0.3943908851884312,
      "grad_norm": 1.0021111965179443,
      "learning_rate": 1.994386103606377e-05,
      "loss": 1.1422,
      "step": 225
    },
    {
      "epoch": 0.40315512708150747,
      "grad_norm": 1.0234806537628174,
      "learning_rate": 1.9932558779206873e-05,
      "loss": 1.1315,
      "step": 230
    },
    {
      "epoch": 0.4119193689745837,
      "grad_norm": 1.10641610622406,
      "learning_rate": 1.9920224493447702e-05,
      "loss": 1.1824,
      "step": 235
    },
    {
      "epoch": 0.42068361086765993,
      "grad_norm": 1.0238338708877563,
      "learning_rate": 1.9906859460363307e-05,
      "loss": 1.1442,
      "step": 240
    },
    {
      "epoch": 0.4294478527607362,
      "grad_norm": 0.990487813949585,
      "learning_rate": 1.989246506862913e-05,
      "loss": 1.1276,
      "step": 245
    },
    {
      "epoch": 0.43821209465381245,
      "grad_norm": 20.3802490234375,
      "learning_rate": 1.9877042813874712e-05,
      "loss": 1.1744,
      "step": 250
    },
    {
      "epoch": 0.4469763365468887,
      "grad_norm": 1.1273601055145264,
      "learning_rate": 1.9860594298528283e-05,
      "loss": 1.1774,
      "step": 255
    },
    {
      "epoch": 0.45574057843996496,
      "grad_norm": 1.014085292816162,
      "learning_rate": 1.984312123165028e-05,
      "loss": 1.162,
      "step": 260
    },
    {
      "epoch": 0.4645048203330412,
      "grad_norm": 1.0875205993652344,
      "learning_rate": 1.982462542875576e-05,
      "loss": 1.1485,
      "step": 265
    },
    {
      "epoch": 0.4732690622261174,
      "grad_norm": 1.0361530780792236,
      "learning_rate": 1.9805108811625774e-05,
      "loss": 1.1422,
      "step": 270
    },
    {
      "epoch": 0.4820333041191937,
      "grad_norm": 1.0539902448654175,
      "learning_rate": 1.9784573408107657e-05,
      "loss": 1.0915,
      "step": 275
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 1.05149245262146,
      "learning_rate": 1.976302135190436e-05,
      "loss": 1.1372,
      "step": 280
    },
    {
      "epoch": 0.49956178790534617,
      "grad_norm": 1.0928102731704712,
      "learning_rate": 1.9740454882352733e-05,
      "loss": 1.1239,
      "step": 285
    },
    {
      "epoch": 0.5083260297984225,
      "grad_norm": 1.0785322189331055,
      "learning_rate": 1.971687634419086e-05,
      "loss": 1.1429,
      "step": 290
    },
    {
      "epoch": 0.5170902716914987,
      "grad_norm": 1.020357370376587,
      "learning_rate": 1.9692288187314423e-05,
      "loss": 1.1195,
      "step": 295
    },
    {
      "epoch": 0.5258545135845749,
      "grad_norm": 0.9896298050880432,
      "learning_rate": 1.9666692966522144e-05,
      "loss": 1.1217,
      "step": 300
    },
    {
      "epoch": 0.5346187554776511,
      "grad_norm": 0.9637587070465088,
      "learning_rate": 1.9640093341250356e-05,
      "loss": 1.1082,
      "step": 305
    },
    {
      "epoch": 0.5433829973707275,
      "grad_norm": 1.2339686155319214,
      "learning_rate": 1.961249207529665e-05,
      "loss": 1.1459,
      "step": 310
    },
    {
      "epoch": 0.5521472392638037,
      "grad_norm": 1.0626837015151978,
      "learning_rate": 1.9583892036532726e-05,
      "loss": 1.1257,
      "step": 315
    },
    {
      "epoch": 0.5609114811568799,
      "grad_norm": 1.0179359912872314,
      "learning_rate": 1.9554296196606395e-05,
      "loss": 1.1111,
      "step": 320
    },
    {
      "epoch": 0.5696757230499562,
      "grad_norm": 1.0226428508758545,
      "learning_rate": 1.9523707630632834e-05,
      "loss": 1.1673,
      "step": 325
    },
    {
      "epoch": 0.5784399649430324,
      "grad_norm": 1.0737133026123047,
      "learning_rate": 1.9492129516875055e-05,
      "loss": 1.1325,
      "step": 330
    },
    {
      "epoch": 0.5872042068361086,
      "grad_norm": 1.0531032085418701,
      "learning_rate": 1.9459565136413667e-05,
      "loss": 1.1478,
      "step": 335
    },
    {
      "epoch": 0.595968448729185,
      "grad_norm": 1.0400668382644653,
      "learning_rate": 1.942601787280598e-05,
      "loss": 1.1403,
      "step": 340
    },
    {
      "epoch": 0.6047326906222612,
      "grad_norm": 0.9359525442123413,
      "learning_rate": 1.9391491211734426e-05,
      "loss": 1.1298,
      "step": 345
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 3.9531524181365967,
      "learning_rate": 1.935598874064438e-05,
      "loss": 1.1923,
      "step": 350
    },
    {
      "epoch": 0.6222611744084137,
      "grad_norm": 1.0364443063735962,
      "learning_rate": 1.9319514148371436e-05,
      "loss": 1.1096,
      "step": 355
    },
    {
      "epoch": 0.6310254163014899,
      "grad_norm": 1.0656158924102783,
      "learning_rate": 1.9282071224758092e-05,
      "loss": 1.1282,
      "step": 360
    },
    {
      "epoch": 0.6397896581945661,
      "grad_norm": 1.0614289045333862,
      "learning_rate": 1.9243663860259992e-05,
      "loss": 1.1137,
      "step": 365
    },
    {
      "epoch": 0.6485539000876425,
      "grad_norm": 1.002898931503296,
      "learning_rate": 1.9204296045541686e-05,
      "loss": 1.1091,
      "step": 370
    },
    {
      "epoch": 0.6573181419807187,
      "grad_norm": 1.0451066493988037,
      "learning_rate": 1.916397187106199e-05,
      "loss": 1.0919,
      "step": 375
    },
    {
      "epoch": 0.6660823838737949,
      "grad_norm": 1.192143201828003,
      "learning_rate": 1.9122695526648968e-05,
      "loss": 1.1581,
      "step": 380
    },
    {
      "epoch": 0.6748466257668712,
      "grad_norm": 1.0061026811599731,
      "learning_rate": 1.90804713010646e-05,
      "loss": 1.116,
      "step": 385
    },
    {
      "epoch": 0.6836108676599474,
      "grad_norm": 2.3462023735046387,
      "learning_rate": 1.9037303581559143e-05,
      "loss": 1.1323,
      "step": 390
    },
    {
      "epoch": 0.6923751095530236,
      "grad_norm": 0.9700145125389099,
      "learning_rate": 1.899319685341532e-05,
      "loss": 1.1075,
      "step": 395
    },
    {
      "epoch": 0.7011393514461,
      "grad_norm": 0.9761490821838379,
      "learning_rate": 1.8948155699482243e-05,
      "loss": 1.1291,
      "step": 400
    },
    {
      "epoch": 0.7099035933391762,
      "grad_norm": 1.0112907886505127,
      "learning_rate": 1.8902184799699265e-05,
      "loss": 1.1087,
      "step": 405
    },
    {
      "epoch": 0.7186678352322524,
      "grad_norm": 0.9741994738578796,
      "learning_rate": 1.885528893060969e-05,
      "loss": 1.1181,
      "step": 410
    },
    {
      "epoch": 0.7274320771253286,
      "grad_norm": 0.9536153078079224,
      "learning_rate": 1.8807472964864516e-05,
      "loss": 1.114,
      "step": 415
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 0.9664406180381775,
      "learning_rate": 1.8758741870716093e-05,
      "loss": 1.1474,
      "step": 420
    },
    {
      "epoch": 0.7449605609114811,
      "grad_norm": 0.999437689781189,
      "learning_rate": 1.8709100711501957e-05,
      "loss": 1.1067,
      "step": 425
    },
    {
      "epoch": 0.7537248028045574,
      "grad_norm": 1.0034998655319214,
      "learning_rate": 1.865855464511869e-05,
      "loss": 1.1409,
      "step": 430
    },
    {
      "epoch": 0.7624890446976337,
      "grad_norm": 1.0300318002700806,
      "learning_rate": 1.8607108923486025e-05,
      "loss": 1.1289,
      "step": 435
    },
    {
      "epoch": 0.7712532865907099,
      "grad_norm": 0.9994638562202454,
      "learning_rate": 1.8554768892001137e-05,
      "loss": 1.1093,
      "step": 440
    },
    {
      "epoch": 0.7800175284837861,
      "grad_norm": 0.9193391799926758,
      "learning_rate": 1.8501539988983234e-05,
      "loss": 1.1377,
      "step": 445
    },
    {
      "epoch": 0.7887817703768624,
      "grad_norm": 1.0165811777114868,
      "learning_rate": 1.844742774510851e-05,
      "loss": 1.1204,
      "step": 450
    },
    {
      "epoch": 0.7975460122699386,
      "grad_norm": 0.9755986928939819,
      "learning_rate": 1.8392437782835475e-05,
      "loss": 1.0935,
      "step": 455
    },
    {
      "epoch": 0.8063102541630149,
      "grad_norm": 0.977584183216095,
      "learning_rate": 1.8336575815820764e-05,
      "loss": 1.1064,
      "step": 460
    },
    {
      "epoch": 0.8150744960560912,
      "grad_norm": 0.9432125687599182,
      "learning_rate": 1.8279847648325478e-05,
      "loss": 1.099,
      "step": 465
    },
    {
      "epoch": 0.8238387379491674,
      "grad_norm": 1.0756127834320068,
      "learning_rate": 1.822225917461208e-05,
      "loss": 1.0926,
      "step": 470
    },
    {
      "epoch": 0.8326029798422436,
      "grad_norm": 1.0018426179885864,
      "learning_rate": 1.8163816378331983e-05,
      "loss": 1.1292,
      "step": 475
    },
    {
      "epoch": 0.8413672217353199,
      "grad_norm": 1.0097193717956543,
      "learning_rate": 1.81045253319038e-05,
      "loss": 1.0738,
      "step": 480
    },
    {
      "epoch": 0.8501314636283961,
      "grad_norm": 0.9783721566200256,
      "learning_rate": 1.8044392195882428e-05,
      "loss": 1.1059,
      "step": 485
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 0.9834737181663513,
      "learning_rate": 1.7983423218318918e-05,
      "loss": 1.1063,
      "step": 490
    },
    {
      "epoch": 0.8676599474145487,
      "grad_norm": 1.0263484716415405,
      "learning_rate": 1.7921624734111292e-05,
      "loss": 1.1325,
      "step": 495
    },
    {
      "epoch": 0.8764241893076249,
      "grad_norm": 0.9454247951507568,
      "learning_rate": 1.7859003164346334e-05,
      "loss": 1.0937,
      "step": 500
    },
    {
      "epoch": 0.8851884312007011,
      "grad_norm": 1.006463646888733,
      "learning_rate": 1.779556501563239e-05,
      "loss": 1.0511,
      "step": 505
    },
    {
      "epoch": 0.8939526730937774,
      "grad_norm": 6.430685043334961,
      "learning_rate": 1.773131687942333e-05,
      "loss": 1.0899,
      "step": 510
    },
    {
      "epoch": 0.9027169149868537,
      "grad_norm": 1.3062087297439575,
      "learning_rate": 1.7666265431333654e-05,
      "loss": 1.1047,
      "step": 515
    },
    {
      "epoch": 0.9114811568799299,
      "grad_norm": 1.0522316694259644,
      "learning_rate": 1.76004174304449e-05,
      "loss": 1.1009,
      "step": 520
    },
    {
      "epoch": 0.9202453987730062,
      "grad_norm": 0.9894193410873413,
      "learning_rate": 1.7533779718603315e-05,
      "loss": 1.0761,
      "step": 525
    },
    {
      "epoch": 0.9290096406660824,
      "grad_norm": 1.0116757154464722,
      "learning_rate": 1.7466359219708987e-05,
      "loss": 1.1305,
      "step": 530
    },
    {
      "epoch": 0.9377738825591586,
      "grad_norm": 0.962745726108551,
      "learning_rate": 1.739816293899642e-05,
      "loss": 1.0758,
      "step": 535
    },
    {
      "epoch": 0.9465381244522348,
      "grad_norm": 0.9733975529670715,
      "learning_rate": 1.7329197962306666e-05,
      "loss": 1.0752,
      "step": 540
    },
    {
      "epoch": 0.9553023663453112,
      "grad_norm": 1.026983618736267,
      "learning_rate": 1.7259471455351072e-05,
      "loss": 1.0576,
      "step": 545
    },
    {
      "epoch": 0.9640666082383874,
      "grad_norm": 0.9675541520118713,
      "learning_rate": 1.718899066296675e-05,
      "loss": 1.0759,
      "step": 550
    },
    {
      "epoch": 0.9728308501314636,
      "grad_norm": 0.9842016100883484,
      "learning_rate": 1.71177629083638e-05,
      "loss": 1.0704,
      "step": 555
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 0.9556295871734619,
      "learning_rate": 1.7045795592364413e-05,
      "loss": 1.1343,
      "step": 560
    },
    {
      "epoch": 0.9903593339176161,
      "grad_norm": 1.033588171005249,
      "learning_rate": 1.6973096192633884e-05,
      "loss": 1.0947,
      "step": 565
    },
    {
      "epoch": 0.9991235758106923,
      "grad_norm": 1.971771240234375,
      "learning_rate": 1.6899672262903675e-05,
      "loss": 1.1293,
      "step": 570
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.1597641706466675,
      "eval_runtime": 199.4031,
      "eval_samples_per_second": 9.162,
      "eval_steps_per_second": 2.292,
      "step": 571
    },
    {
      "epoch": 1.007011393514461,
      "grad_norm": 1.0958155393600464,
      "learning_rate": 1.6825531432186545e-05,
      "loss": 1.0193,
      "step": 575
    },
    {
      "epoch": 1.0157756354075373,
      "grad_norm": 1.108912467956543,
      "learning_rate": 1.6750681403983847e-05,
      "loss": 0.9767,
      "step": 580
    },
    {
      "epoch": 1.0245398773006136,
      "grad_norm": 1.0128231048583984,
      "learning_rate": 1.6675129955485154e-05,
      "loss": 0.9534,
      "step": 585
    },
    {
      "epoch": 1.0333041191936898,
      "grad_norm": 0.9520951509475708,
      "learning_rate": 1.659888493676013e-05,
      "loss": 0.9388,
      "step": 590
    },
    {
      "epoch": 1.042068361086766,
      "grad_norm": 0.98039710521698,
      "learning_rate": 1.652195426994292e-05,
      "loss": 0.97,
      "step": 595
    },
    {
      "epoch": 1.0508326029798423,
      "grad_norm": 1.0683668851852417,
      "learning_rate": 1.6444345948408985e-05,
      "loss": 0.9539,
      "step": 600
    },
    {
      "epoch": 1.0595968448729185,
      "grad_norm": 1.0525304079055786,
      "learning_rate": 1.636606803594457e-05,
      "loss": 0.9534,
      "step": 605
    },
    {
      "epoch": 1.0683610867659947,
      "grad_norm": 0.999165415763855,
      "learning_rate": 1.628712866590885e-05,
      "loss": 0.9634,
      "step": 610
    },
    {
      "epoch": 1.077125328659071,
      "grad_norm": 0.9724875688552856,
      "learning_rate": 1.6207536040388844e-05,
      "loss": 0.9559,
      "step": 615
    },
    {
      "epoch": 1.0858895705521472,
      "grad_norm": 0.9775224924087524,
      "learning_rate": 1.612729842934718e-05,
      "loss": 0.9771,
      "step": 620
    },
    {
      "epoch": 1.0946538124452234,
      "grad_norm": 1.2882146835327148,
      "learning_rate": 1.604642416976283e-05,
      "loss": 0.9027,
      "step": 625
    },
    {
      "epoch": 1.1034180543382996,
      "grad_norm": 1.0088601112365723,
      "learning_rate": 1.596492166476485e-05,
      "loss": 0.9494,
      "step": 630
    },
    {
      "epoch": 1.112182296231376,
      "grad_norm": 1.0667091608047485,
      "learning_rate": 1.588279938275929e-05,
      "loss": 0.9493,
      "step": 635
    },
    {
      "epoch": 1.1209465381244523,
      "grad_norm": 1.0000181198120117,
      "learning_rate": 1.580006585654927e-05,
      "loss": 0.9609,
      "step": 640
    },
    {
      "epoch": 1.1297107800175286,
      "grad_norm": 0.9999110102653503,
      "learning_rate": 1.5716729682448392e-05,
      "loss": 1.0068,
      "step": 645
    },
    {
      "epoch": 1.1384750219106048,
      "grad_norm": 1.0657200813293457,
      "learning_rate": 1.563279951938758e-05,
      "loss": 0.9676,
      "step": 650
    },
    {
      "epoch": 1.147239263803681,
      "grad_norm": 1.029891848564148,
      "learning_rate": 1.5548284088015354e-05,
      "loss": 0.9623,
      "step": 655
    },
    {
      "epoch": 1.1560035056967572,
      "grad_norm": 1.015758752822876,
      "learning_rate": 1.546319216979174e-05,
      "loss": 0.9897,
      "step": 660
    },
    {
      "epoch": 1.1647677475898335,
      "grad_norm": 0.9652720093727112,
      "learning_rate": 1.537753260607584e-05,
      "loss": 0.9607,
      "step": 665
    },
    {
      "epoch": 1.1735319894829097,
      "grad_norm": 1.0845791101455688,
      "learning_rate": 1.5291314297207177e-05,
      "loss": 0.9783,
      "step": 670
    },
    {
      "epoch": 1.182296231375986,
      "grad_norm": 1.0521138906478882,
      "learning_rate": 1.520454620158093e-05,
      "loss": 0.9836,
      "step": 675
    },
    {
      "epoch": 1.1910604732690622,
      "grad_norm": 0.9807194471359253,
      "learning_rate": 1.5117237334717117e-05,
      "loss": 0.9443,
      "step": 680
    },
    {
      "epoch": 1.1998247151621384,
      "grad_norm": 1.0408189296722412,
      "learning_rate": 1.5029396768323847e-05,
      "loss": 0.9755,
      "step": 685
    },
    {
      "epoch": 1.2085889570552146,
      "grad_norm": 1.0140528678894043,
      "learning_rate": 1.4941033629354735e-05,
      "loss": 0.942,
      "step": 690
    },
    {
      "epoch": 1.2173531989482909,
      "grad_norm": 1.028287649154663,
      "learning_rate": 1.4852157099060595e-05,
      "loss": 0.9362,
      "step": 695
    },
    {
      "epoch": 1.2261174408413673,
      "grad_norm": 0.9807888269424438,
      "learning_rate": 1.4762776412035455e-05,
      "loss": 0.9752,
      "step": 700
    },
    {
      "epoch": 1.2348816827344435,
      "grad_norm": 1.0794785022735596,
      "learning_rate": 1.4672900855257056e-05,
      "loss": 0.9508,
      "step": 705
    },
    {
      "epoch": 1.2436459246275198,
      "grad_norm": 1.0464166402816772,
      "learning_rate": 1.4582539767121904e-05,
      "loss": 0.9519,
      "step": 710
    },
    {
      "epoch": 1.252410166520596,
      "grad_norm": 0.9949556589126587,
      "learning_rate": 1.449170253647498e-05,
      "loss": 0.9188,
      "step": 715
    },
    {
      "epoch": 1.2611744084136722,
      "grad_norm": 0.9590442180633545,
      "learning_rate": 1.4400398601634189e-05,
      "loss": 0.9686,
      "step": 720
    },
    {
      "epoch": 1.2699386503067485,
      "grad_norm": 1.0098439455032349,
      "learning_rate": 1.4308637449409705e-05,
      "loss": 0.9848,
      "step": 725
    },
    {
      "epoch": 1.2787028921998247,
      "grad_norm": 1.026219367980957,
      "learning_rate": 1.4216428614118245e-05,
      "loss": 0.9595,
      "step": 730
    },
    {
      "epoch": 1.287467134092901,
      "grad_norm": 1.0277692079544067,
      "learning_rate": 1.4123781676592418e-05,
      "loss": 0.9773,
      "step": 735
    },
    {
      "epoch": 1.2962313759859772,
      "grad_norm": 1.0222140550613403,
      "learning_rate": 1.4030706263185248e-05,
      "loss": 0.9399,
      "step": 740
    },
    {
      "epoch": 1.3049956178790534,
      "grad_norm": 0.9892441630363464,
      "learning_rate": 1.3937212044769957e-05,
      "loss": 0.985,
      "step": 745
    },
    {
      "epoch": 1.3137598597721296,
      "grad_norm": 1.0329406261444092,
      "learning_rate": 1.384330873573513e-05,
      "loss": 0.9369,
      "step": 750
    },
    {
      "epoch": 1.322524101665206,
      "grad_norm": 0.9816661477088928,
      "learning_rate": 1.3749006092975347e-05,
      "loss": 0.9457,
      "step": 755
    },
    {
      "epoch": 1.331288343558282,
      "grad_norm": 1.0054512023925781,
      "learning_rate": 1.3654313914877414e-05,
      "loss": 0.9087,
      "step": 760
    },
    {
      "epoch": 1.3400525854513585,
      "grad_norm": 17.338027954101562,
      "learning_rate": 1.3559242040302274e-05,
      "loss": 0.9808,
      "step": 765
    },
    {
      "epoch": 1.3488168273444348,
      "grad_norm": 1.0706207752227783,
      "learning_rate": 1.3463800347562705e-05,
      "loss": 0.9679,
      "step": 770
    },
    {
      "epoch": 1.357581069237511,
      "grad_norm": 1.040747046470642,
      "learning_rate": 1.3367998753396944e-05,
      "loss": 0.9974,
      "step": 775
    },
    {
      "epoch": 1.3663453111305872,
      "grad_norm": 0.9935981631278992,
      "learning_rate": 1.3271847211938286e-05,
      "loss": 0.9428,
      "step": 780
    },
    {
      "epoch": 1.3751095530236634,
      "grad_norm": 1.0025993585586548,
      "learning_rate": 1.317535571368082e-05,
      "loss": 0.9462,
      "step": 785
    },
    {
      "epoch": 1.3838737949167397,
      "grad_norm": 0.9988533854484558,
      "learning_rate": 1.3078534284441382e-05,
      "loss": 0.9734,
      "step": 790
    },
    {
      "epoch": 1.392638036809816,
      "grad_norm": 1.0070812702178955,
      "learning_rate": 1.2981392984317835e-05,
      "loss": 0.9622,
      "step": 795
    },
    {
      "epoch": 1.4014022787028921,
      "grad_norm": 1.0259467363357544,
      "learning_rate": 1.2883941906643786e-05,
      "loss": 0.9671,
      "step": 800
    },
    {
      "epoch": 1.4101665205959684,
      "grad_norm": 1.0248597860336304,
      "learning_rate": 1.2786191176939848e-05,
      "loss": 0.9402,
      "step": 805
    },
    {
      "epoch": 1.4189307624890448,
      "grad_norm": 1.008159875869751,
      "learning_rate": 1.2688150951861582e-05,
      "loss": 1.0111,
      "step": 810
    },
    {
      "epoch": 1.4276950043821208,
      "grad_norm": 1.024697184562683,
      "learning_rate": 1.2589831418144156e-05,
      "loss": 0.9354,
      "step": 815
    },
    {
      "epoch": 1.4364592462751973,
      "grad_norm": 0.9872326254844666,
      "learning_rate": 1.2491242791543922e-05,
      "loss": 0.9424,
      "step": 820
    },
    {
      "epoch": 1.4452234881682735,
      "grad_norm": 1.0979632139205933,
      "learning_rate": 1.2392395315776964e-05,
      "loss": 0.9594,
      "step": 825
    },
    {
      "epoch": 1.4539877300613497,
      "grad_norm": 0.9879066944122314,
      "learning_rate": 1.2293299261454726e-05,
      "loss": 0.9762,
      "step": 830
    },
    {
      "epoch": 1.462751971954426,
      "grad_norm": 1.278245210647583,
      "learning_rate": 1.2193964925016872e-05,
      "loss": 0.9458,
      "step": 835
    },
    {
      "epoch": 1.4715162138475022,
      "grad_norm": 1.4075230360031128,
      "learning_rate": 1.2094402627661447e-05,
      "loss": 0.9496,
      "step": 840
    },
    {
      "epoch": 1.4802804557405784,
      "grad_norm": 1.059368371963501,
      "learning_rate": 1.1994622714272448e-05,
      "loss": 0.965,
      "step": 845
    },
    {
      "epoch": 1.4890446976336547,
      "grad_norm": 0.9740917086601257,
      "learning_rate": 1.1894635552344976e-05,
      "loss": 0.939,
      "step": 850
    },
    {
      "epoch": 1.4978089395267309,
      "grad_norm": 0.9713614583015442,
      "learning_rate": 1.1794451530908011e-05,
      "loss": 0.9256,
      "step": 855
    },
    {
      "epoch": 1.5065731814198071,
      "grad_norm": 1.023720145225525,
      "learning_rate": 1.1694081059444947e-05,
      "loss": 0.9548,
      "step": 860
    },
    {
      "epoch": 1.5153374233128836,
      "grad_norm": 1.1291546821594238,
      "learning_rate": 1.159353456681201e-05,
      "loss": 0.9512,
      "step": 865
    },
    {
      "epoch": 1.5241016652059596,
      "grad_norm": 0.9696962833404541,
      "learning_rate": 1.1492822500154668e-05,
      "loss": 0.9715,
      "step": 870
    },
    {
      "epoch": 1.532865907099036,
      "grad_norm": 1.0114858150482178,
      "learning_rate": 1.1391955323822126e-05,
      "loss": 0.9355,
      "step": 875
    },
    {
      "epoch": 1.541630148992112,
      "grad_norm": 1.0963616371154785,
      "learning_rate": 1.1290943518280058e-05,
      "loss": 0.9779,
      "step": 880
    },
    {
      "epoch": 1.5503943908851885,
      "grad_norm": 0.9969412684440613,
      "learning_rate": 1.118979757902162e-05,
      "loss": 0.9589,
      "step": 885
    },
    {
      "epoch": 1.5591586327782647,
      "grad_norm": 1.022300362586975,
      "learning_rate": 1.1088528015476965e-05,
      "loss": 0.9656,
      "step": 890
    },
    {
      "epoch": 1.567922874671341,
      "grad_norm": 0.974607527256012,
      "learning_rate": 1.098714534992125e-05,
      "loss": 0.9622,
      "step": 895
    },
    {
      "epoch": 1.5766871165644172,
      "grad_norm": 1.0116885900497437,
      "learning_rate": 1.088566011638134e-05,
      "loss": 0.9343,
      "step": 900
    },
    {
      "epoch": 1.5854513584574934,
      "grad_norm": 1.0116138458251953,
      "learning_rate": 1.0784082859541291e-05,
      "loss": 0.9383,
      "step": 905
    },
    {
      "epoch": 1.5942156003505696,
      "grad_norm": 1.2108194828033447,
      "learning_rate": 1.0682424133646712e-05,
      "loss": 0.9171,
      "step": 910
    },
    {
      "epoch": 1.6029798422436459,
      "grad_norm": 1.0214340686798096,
      "learning_rate": 1.0580694501408138e-05,
      "loss": 0.9675,
      "step": 915
    },
    {
      "epoch": 1.6117440841367223,
      "grad_norm": 1.0362666845321655,
      "learning_rate": 1.0478904532903535e-05,
      "loss": 1.0028,
      "step": 920
    },
    {
      "epoch": 1.6205083260297983,
      "grad_norm": 0.9954794049263,
      "learning_rate": 1.0377064804480025e-05,
      "loss": 0.9624,
      "step": 925
    },
    {
      "epoch": 1.6292725679228748,
      "grad_norm": 1.0109649896621704,
      "learning_rate": 1.0275185897654972e-05,
      "loss": 0.9501,
      "step": 930
    },
    {
      "epoch": 1.6380368098159508,
      "grad_norm": 1.0172914266586304,
      "learning_rate": 1.0173278398016502e-05,
      "loss": 0.9354,
      "step": 935
    },
    {
      "epoch": 1.6468010517090272,
      "grad_norm": 0.9905017614364624,
      "learning_rate": 1.0071352894123654e-05,
      "loss": 0.9758,
      "step": 940
    },
    {
      "epoch": 1.6555652936021035,
      "grad_norm": 0.9832938313484192,
      "learning_rate": 9.969419976406166e-06,
      "loss": 0.9737,
      "step": 945
    },
    {
      "epoch": 1.6643295354951797,
      "grad_norm": 0.9569029808044434,
      "learning_rate": 9.867490236064109e-06,
      "loss": 0.9212,
      "step": 950
    },
    {
      "epoch": 1.673093777388256,
      "grad_norm": 1.0192569494247437,
      "learning_rate": 9.765574263967397e-06,
      "loss": 0.9472,
      "step": 955
    },
    {
      "epoch": 1.6818580192813322,
      "grad_norm": 0.9713300466537476,
      "learning_rate": 9.663682649555389e-06,
      "loss": 0.9644,
      "step": 960
    },
    {
      "epoch": 1.6906222611744084,
      "grad_norm": 0.9462825655937195,
      "learning_rate": 9.56182597973658e-06,
      "loss": 0.9576,
      "step": 965
    },
    {
      "epoch": 1.6993865030674846,
      "grad_norm": 0.9868680834770203,
      "learning_rate": 9.460014837788605e-06,
      "loss": 0.9667,
      "step": 970
    },
    {
      "epoch": 1.708150744960561,
      "grad_norm": 1.0376570224761963,
      "learning_rate": 9.358259802258582e-06,
      "loss": 0.9452,
      "step": 975
    },
    {
      "epoch": 1.716914986853637,
      "grad_norm": 1.0066869258880615,
      "learning_rate": 9.256571445863972e-06,
      "loss": 0.9534,
      "step": 980
    },
    {
      "epoch": 1.7256792287467135,
      "grad_norm": 1.0090934038162231,
      "learning_rate": 9.154960334394027e-06,
      "loss": 0.955,
      "step": 985
    },
    {
      "epoch": 1.7344434706397895,
      "grad_norm": 0.9518396854400635,
      "learning_rate": 9.053437025611974e-06,
      "loss": 0.9342,
      "step": 990
    },
    {
      "epoch": 1.743207712532866,
      "grad_norm": 0.9992371797561646,
      "learning_rate": 8.952012068158027e-06,
      "loss": 0.9722,
      "step": 995
    },
    {
      "epoch": 1.751971954425942,
      "grad_norm": 0.9554047584533691,
      "learning_rate": 8.850696000453327e-06,
      "loss": 0.9357,
      "step": 1000
    },
    {
      "epoch": 1.7607361963190185,
      "grad_norm": 0.9989141225814819,
      "learning_rate": 8.749499349604992e-06,
      "loss": 0.9821,
      "step": 1005
    },
    {
      "epoch": 1.7695004382120947,
      "grad_norm": 0.9504846334457397,
      "learning_rate": 8.64843263031228e-06,
      "loss": 0.9537,
      "step": 1010
    },
    {
      "epoch": 1.778264680105171,
      "grad_norm": 0.9558340907096863,
      "learning_rate": 8.547506343774097e-06,
      "loss": 0.9289,
      "step": 1015
    },
    {
      "epoch": 1.7870289219982471,
      "grad_norm": 1.0170910358428955,
      "learning_rate": 8.446730976597877e-06,
      "loss": 0.9501,
      "step": 1020
    },
    {
      "epoch": 1.7957931638913234,
      "grad_norm": 0.9939414262771606,
      "learning_rate": 8.346116999709975e-06,
      "loss": 0.9472,
      "step": 1025
    },
    {
      "epoch": 1.8045574057843996,
      "grad_norm": 0.9810356497764587,
      "learning_rate": 8.245674867267724e-06,
      "loss": 0.9491,
      "step": 1030
    },
    {
      "epoch": 1.8133216476774758,
      "grad_norm": 0.9643825888633728,
      "learning_rate": 8.145415015573183e-06,
      "loss": 0.947,
      "step": 1035
    },
    {
      "epoch": 1.8220858895705523,
      "grad_norm": 0.9195330739021301,
      "learning_rate": 8.045347861988789e-06,
      "loss": 0.876,
      "step": 1040
    },
    {
      "epoch": 1.8308501314636283,
      "grad_norm": 0.9632524847984314,
      "learning_rate": 7.945483803854937e-06,
      "loss": 0.9173,
      "step": 1045
    },
    {
      "epoch": 1.8396143733567047,
      "grad_norm": 0.9642296433448792,
      "learning_rate": 7.845833217409677e-06,
      "loss": 0.9233,
      "step": 1050
    },
    {
      "epoch": 1.8483786152497808,
      "grad_norm": 0.9421396851539612,
      "learning_rate": 7.746406456710564e-06,
      "loss": 0.9187,
      "step": 1055
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.9888685345649719,
      "learning_rate": 7.64721385255886e-06,
      "loss": 0.9289,
      "step": 1060
    },
    {
      "epoch": 1.8659070990359334,
      "grad_norm": 0.9585088491439819,
      "learning_rate": 7.548265711426105e-06,
      "loss": 0.9291,
      "step": 1065
    },
    {
      "epoch": 1.8746713409290097,
      "grad_norm": 0.9842194318771362,
      "learning_rate": 7.449572314383237e-06,
      "loss": 0.9521,
      "step": 1070
    },
    {
      "epoch": 1.883435582822086,
      "grad_norm": 0.9899460077285767,
      "learning_rate": 7.351143916032375e-06,
      "loss": 0.9238,
      "step": 1075
    },
    {
      "epoch": 1.8921998247151621,
      "grad_norm": 1.044392466545105,
      "learning_rate": 7.252990743441293e-06,
      "loss": 0.9354,
      "step": 1080
    },
    {
      "epoch": 1.9009640666082384,
      "grad_norm": 0.9790138006210327,
      "learning_rate": 7.155122995080826e-06,
      "loss": 0.9527,
      "step": 1085
    },
    {
      "epoch": 1.9097283085013146,
      "grad_norm": 1.002880334854126,
      "learning_rate": 7.0575508397651885e-06,
      "loss": 0.9471,
      "step": 1090
    },
    {
      "epoch": 1.918492550394391,
      "grad_norm": 0.9698459506034851,
      "learning_rate": 6.960284415595407e-06,
      "loss": 0.9402,
      "step": 1095
    },
    {
      "epoch": 1.927256792287467,
      "grad_norm": 0.9467353224754333,
      "learning_rate": 6.863333828905929e-06,
      "loss": 0.9409,
      "step": 1100
    },
    {
      "epoch": 1.9360210341805435,
      "grad_norm": 0.965829074382782,
      "learning_rate": 6.766709153214541e-06,
      "loss": 0.9425,
      "step": 1105
    },
    {
      "epoch": 1.9447852760736195,
      "grad_norm": 0.9571474194526672,
      "learning_rate": 6.670420428175706e-06,
      "loss": 0.9405,
      "step": 1110
    },
    {
      "epoch": 1.953549517966696,
      "grad_norm": 0.9341493248939514,
      "learning_rate": 6.574477658537375e-06,
      "loss": 0.9145,
      "step": 1115
    },
    {
      "epoch": 1.962313759859772,
      "grad_norm": 0.9990600943565369,
      "learning_rate": 6.4788908131014995e-06,
      "loss": 0.952,
      "step": 1120
    },
    {
      "epoch": 1.9710780017528484,
      "grad_norm": 0.917290210723877,
      "learning_rate": 6.383669823688191e-06,
      "loss": 0.951,
      "step": 1125
    },
    {
      "epoch": 1.9798422436459246,
      "grad_norm": 0.9599776864051819,
      "learning_rate": 6.288824584103815e-06,
      "loss": 0.936,
      "step": 1130
    },
    {
      "epoch": 1.9886064855390009,
      "grad_norm": 0.9636255502700806,
      "learning_rate": 6.194364949112952e-06,
      "loss": 0.9582,
      "step": 1135
    },
    {
      "epoch": 1.997370727432077,
      "grad_norm": 1.1487308740615845,
      "learning_rate": 6.100300733414473e-06,
      "loss": 0.9276,
      "step": 1140
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.151129961013794,
      "eval_runtime": 199.4284,
      "eval_samples_per_second": 9.161,
      "eval_steps_per_second": 2.292,
      "step": 1142
    },
    {
      "epoch": 2.005258545135846,
      "grad_norm": 1.1605374813079834,
      "learning_rate": 6.006641710621746e-06,
      "loss": 0.8479,
      "step": 1145
    },
    {
      "epoch": 2.014022787028922,
      "grad_norm": 1.0491231679916382,
      "learning_rate": 5.913397612247121e-06,
      "loss": 0.8032,
      "step": 1150
    },
    {
      "epoch": 2.0227870289219982,
      "grad_norm": 1.0855581760406494,
      "learning_rate": 5.82057812669081e-06,
      "loss": 0.8839,
      "step": 1155
    },
    {
      "epoch": 2.0315512708150747,
      "grad_norm": 0.9942172169685364,
      "learning_rate": 5.728192898234195e-06,
      "loss": 0.7986,
      "step": 1160
    },
    {
      "epoch": 2.0403155127081507,
      "grad_norm": 1.0435779094696045,
      "learning_rate": 5.636251526037784e-06,
      "loss": 0.8263,
      "step": 1165
    },
    {
      "epoch": 2.049079754601227,
      "grad_norm": 1.0303524732589722,
      "learning_rate": 5.544763563143794e-06,
      "loss": 0.8188,
      "step": 1170
    },
    {
      "epoch": 2.057843996494303,
      "grad_norm": 0.9739100933074951,
      "learning_rate": 5.453738515483586e-06,
      "loss": 0.8488,
      "step": 1175
    },
    {
      "epoch": 2.0666082383873796,
      "grad_norm": 1.021791696548462,
      "learning_rate": 5.363185840889935e-06,
      "loss": 0.8666,
      "step": 1180
    },
    {
      "epoch": 2.0753724802804556,
      "grad_norm": 0.9683573842048645,
      "learning_rate": 5.273114948114346e-06,
      "loss": 0.8276,
      "step": 1185
    },
    {
      "epoch": 2.084136722173532,
      "grad_norm": 1.0052560567855835,
      "learning_rate": 5.1835351958494515e-06,
      "loss": 0.8089,
      "step": 1190
    },
    {
      "epoch": 2.092900964066608,
      "grad_norm": 0.9584820866584778,
      "learning_rate": 5.094455891756587e-06,
      "loss": 0.8276,
      "step": 1195
    },
    {
      "epoch": 2.1016652059596845,
      "grad_norm": 0.9803566932678223,
      "learning_rate": 5.0058862914987204e-06,
      "loss": 0.8256,
      "step": 1200
    },
    {
      "epoch": 2.1104294478527605,
      "grad_norm": 0.9923965334892273,
      "learning_rate": 4.917835597778731e-06,
      "loss": 0.8241,
      "step": 1205
    },
    {
      "epoch": 2.119193689745837,
      "grad_norm": 1.022495985031128,
      "learning_rate": 4.830312959383238e-06,
      "loss": 0.8074,
      "step": 1210
    },
    {
      "epoch": 2.127957931638913,
      "grad_norm": 0.9760512709617615,
      "learning_rate": 4.743327470231982e-06,
      "loss": 0.8058,
      "step": 1215
    },
    {
      "epoch": 2.1367221735319895,
      "grad_norm": 0.9603386521339417,
      "learning_rate": 4.656888168432962e-06,
      "loss": 0.8133,
      "step": 1220
    },
    {
      "epoch": 2.145486415425066,
      "grad_norm": 1.034776210784912,
      "learning_rate": 4.571004035343315e-06,
      "loss": 0.818,
      "step": 1225
    },
    {
      "epoch": 2.154250657318142,
      "grad_norm": 0.9763988256454468,
      "learning_rate": 4.485683994636144e-06,
      "loss": 0.8165,
      "step": 1230
    },
    {
      "epoch": 2.1630148992112184,
      "grad_norm": 0.9729757905006409,
      "learning_rate": 4.400936911373308e-06,
      "loss": 0.808,
      "step": 1235
    },
    {
      "epoch": 2.1717791411042944,
      "grad_norm": 1.0068873167037964,
      "learning_rate": 4.316771591084297e-06,
      "loss": 0.8038,
      "step": 1240
    },
    {
      "epoch": 2.180543382997371,
      "grad_norm": 0.9344819188117981,
      "learning_rate": 4.2331967788513295e-06,
      "loss": 0.8335,
      "step": 1245
    },
    {
      "epoch": 2.189307624890447,
      "grad_norm": 1.0315194129943848,
      "learning_rate": 4.150221158400683e-06,
      "loss": 0.8154,
      "step": 1250
    },
    {
      "epoch": 2.1980718667835233,
      "grad_norm": 0.9959366321563721,
      "learning_rate": 4.067853351200446e-06,
      "loss": 0.8317,
      "step": 1255
    },
    {
      "epoch": 2.2068361086765993,
      "grad_norm": 1.0919640064239502,
      "learning_rate": 3.986101915564695e-06,
      "loss": 0.8236,
      "step": 1260
    },
    {
      "epoch": 2.2156003505696757,
      "grad_norm": 0.9548513293266296,
      "learning_rate": 3.904975345764262e-06,
      "loss": 0.849,
      "step": 1265
    },
    {
      "epoch": 2.224364592462752,
      "grad_norm": 0.9864785075187683,
      "learning_rate": 3.824482071144164e-06,
      "loss": 0.8259,
      "step": 1270
    },
    {
      "epoch": 2.233128834355828,
      "grad_norm": 1.014013648033142,
      "learning_rate": 3.7446304552477387e-06,
      "loss": 0.7696,
      "step": 1275
    },
    {
      "epoch": 2.2418930762489047,
      "grad_norm": 0.95964115858078,
      "learning_rate": 3.665428794947663e-06,
      "loss": 0.7758,
      "step": 1280
    },
    {
      "epoch": 2.2506573181419807,
      "grad_norm": 0.9974411725997925,
      "learning_rate": 3.5868853195838582e-06,
      "loss": 0.8512,
      "step": 1285
    },
    {
      "epoch": 2.259421560035057,
      "grad_norm": 0.990260124206543,
      "learning_rate": 3.509008190108453e-06,
      "loss": 0.8096,
      "step": 1290
    },
    {
      "epoch": 2.268185801928133,
      "grad_norm": 0.982060968875885,
      "learning_rate": 3.431805498237808e-06,
      "loss": 0.8259,
      "step": 1295
    },
    {
      "epoch": 2.2769500438212096,
      "grad_norm": 0.9737572073936462,
      "learning_rate": 3.355285265611784e-06,
      "loss": 0.8368,
      "step": 1300
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.9657383561134338,
      "learning_rate": 3.2794554429602377e-06,
      "loss": 0.8129,
      "step": 1305
    },
    {
      "epoch": 2.294478527607362,
      "grad_norm": 0.9947619438171387,
      "learning_rate": 3.204323909276924e-06,
      "loss": 0.8034,
      "step": 1310
    },
    {
      "epoch": 2.303242769500438,
      "grad_norm": 1.0247445106506348,
      "learning_rate": 3.1298984710008483e-06,
      "loss": 0.8267,
      "step": 1315
    },
    {
      "epoch": 2.3120070113935145,
      "grad_norm": 0.9986540079116821,
      "learning_rate": 3.056186861205136e-06,
      "loss": 0.8233,
      "step": 1320
    },
    {
      "epoch": 2.3207712532865905,
      "grad_norm": 0.9882351160049438,
      "learning_rate": 2.983196738793547e-06,
      "loss": 0.8097,
      "step": 1325
    },
    {
      "epoch": 2.329535495179667,
      "grad_norm": 0.9737289547920227,
      "learning_rate": 2.910935687704671e-06,
      "loss": 0.8285,
      "step": 1330
    },
    {
      "epoch": 2.3382997370727434,
      "grad_norm": 0.9512819647789001,
      "learning_rate": 2.8394112161239606e-06,
      "loss": 0.7998,
      "step": 1335
    },
    {
      "epoch": 2.3470639789658194,
      "grad_norm": 0.980267345905304,
      "learning_rate": 2.7686307557035684e-06,
      "loss": 0.8364,
      "step": 1340
    },
    {
      "epoch": 2.355828220858896,
      "grad_norm": 0.9798904061317444,
      "learning_rate": 2.698601660790191e-06,
      "loss": 0.8288,
      "step": 1345
    },
    {
      "epoch": 2.364592462751972,
      "grad_norm": 0.9910169839859009,
      "learning_rate": 2.629331207660931e-06,
      "loss": 0.8182,
      "step": 1350
    },
    {
      "epoch": 2.3733567046450483,
      "grad_norm": 1.0095982551574707,
      "learning_rate": 2.560826593767244e-06,
      "loss": 0.8651,
      "step": 1355
    },
    {
      "epoch": 2.3821209465381243,
      "grad_norm": 1.0415823459625244,
      "learning_rate": 2.4930949369871205e-06,
      "loss": 0.7934,
      "step": 1360
    },
    {
      "epoch": 2.390885188431201,
      "grad_norm": 0.9959484934806824,
      "learning_rate": 2.426143274885493e-06,
      "loss": 0.8131,
      "step": 1365
    },
    {
      "epoch": 2.399649430324277,
      "grad_norm": 0.9777078628540039,
      "learning_rate": 2.359978563983022e-06,
      "loss": 0.8125,
      "step": 1370
    },
    {
      "epoch": 2.4084136722173533,
      "grad_norm": 1.0206762552261353,
      "learning_rate": 2.294607679033283e-06,
      "loss": 0.7912,
      "step": 1375
    },
    {
      "epoch": 2.4171779141104293,
      "grad_norm": 0.9738752245903015,
      "learning_rate": 2.230037412308452e-06,
      "loss": 0.8411,
      "step": 1380
    },
    {
      "epoch": 2.4259421560035057,
      "grad_norm": 0.9954826831817627,
      "learning_rate": 2.166274472893567e-06,
      "loss": 0.8052,
      "step": 1385
    },
    {
      "epoch": 2.4347063978965817,
      "grad_norm": 0.9861373901367188,
      "learning_rate": 2.1033254859894224e-06,
      "loss": 0.8041,
      "step": 1390
    },
    {
      "epoch": 2.443470639789658,
      "grad_norm": 0.9600276947021484,
      "learning_rate": 2.041196992224206e-06,
      "loss": 0.8326,
      "step": 1395
    },
    {
      "epoch": 2.4522348816827346,
      "grad_norm": 1.127557396888733,
      "learning_rate": 1.9798954469738762e-06,
      "loss": 0.8355,
      "step": 1400
    },
    {
      "epoch": 2.4609991235758106,
      "grad_norm": 0.9988298416137695,
      "learning_rate": 1.9194272196914533e-06,
      "loss": 0.8473,
      "step": 1405
    },
    {
      "epoch": 2.469763365468887,
      "grad_norm": 0.972212553024292,
      "learning_rate": 1.8597985932451856e-06,
      "loss": 0.816,
      "step": 1410
    },
    {
      "epoch": 2.478527607361963,
      "grad_norm": 0.9716165065765381,
      "learning_rate": 1.8010157632657544e-06,
      "loss": 0.8157,
      "step": 1415
    },
    {
      "epoch": 2.4872918492550395,
      "grad_norm": 0.9722920656204224,
      "learning_rate": 1.7430848375025178e-06,
      "loss": 0.8238,
      "step": 1420
    },
    {
      "epoch": 2.4960560911481156,
      "grad_norm": 1.0044946670532227,
      "learning_rate": 1.686011835188891e-06,
      "loss": 0.8473,
      "step": 1425
    },
    {
      "epoch": 2.504820333041192,
      "grad_norm": 0.9682095050811768,
      "learning_rate": 1.6298026864169336e-06,
      "loss": 0.8132,
      "step": 1430
    },
    {
      "epoch": 2.513584574934268,
      "grad_norm": 0.9928619861602783,
      "learning_rate": 1.5744632315211815e-06,
      "loss": 0.837,
      "step": 1435
    },
    {
      "epoch": 2.5223488168273445,
      "grad_norm": 0.9613544344902039,
      "learning_rate": 1.5199992204718295e-06,
      "loss": 0.8209,
      "step": 1440
    },
    {
      "epoch": 2.531113058720421,
      "grad_norm": 1.0032097101211548,
      "learning_rate": 1.466416312277269e-06,
      "loss": 0.8303,
      "step": 1445
    },
    {
      "epoch": 2.539877300613497,
      "grad_norm": 0.9630649089813232,
      "learning_rate": 1.4137200743961189e-06,
      "loss": 0.825,
      "step": 1450
    },
    {
      "epoch": 2.548641542506573,
      "grad_norm": 0.9702491164207458,
      "learning_rate": 1.3619159821587236e-06,
      "loss": 0.8148,
      "step": 1455
    },
    {
      "epoch": 2.5574057843996494,
      "grad_norm": 0.9509206414222717,
      "learning_rate": 1.3110094181982657e-06,
      "loss": 0.7695,
      "step": 1460
    },
    {
      "epoch": 2.566170026292726,
      "grad_norm": 0.9589338302612305,
      "learning_rate": 1.261005671891482e-06,
      "loss": 0.8532,
      "step": 1465
    },
    {
      "epoch": 2.574934268185802,
      "grad_norm": 0.9704285264015198,
      "learning_rate": 1.2119099388090715e-06,
      "loss": 0.797,
      "step": 1470
    },
    {
      "epoch": 2.5836985100788783,
      "grad_norm": 1.0093833208084106,
      "learning_rate": 1.1637273201758747e-06,
      "loss": 0.8233,
      "step": 1475
    },
    {
      "epoch": 2.5924627519719543,
      "grad_norm": 0.9612089991569519,
      "learning_rate": 1.1164628223408169e-06,
      "loss": 0.8489,
      "step": 1480
    },
    {
      "epoch": 2.6012269938650308,
      "grad_norm": 0.9347235560417175,
      "learning_rate": 1.0701213562567491e-06,
      "loss": 0.7855,
      "step": 1485
    },
    {
      "epoch": 2.6099912357581068,
      "grad_norm": 1.00240957736969,
      "learning_rate": 1.0247077369701653e-06,
      "loss": 0.8322,
      "step": 1490
    },
    {
      "epoch": 2.618755477651183,
      "grad_norm": 0.99866783618927,
      "learning_rate": 9.802266831209206e-07,
      "loss": 0.8133,
      "step": 1495
    },
    {
      "epoch": 2.6275197195442592,
      "grad_norm": 1.0041725635528564,
      "learning_rate": 9.36682816451926e-07,
      "loss": 0.8715,
      "step": 1500
    },
    {
      "epoch": 2.6362839614373357,
      "grad_norm": 0.9615875482559204,
      "learning_rate": 8.940806613289499e-07,
      "loss": 0.8075,
      "step": 1505
    },
    {
      "epoch": 2.645048203330412,
      "grad_norm": 0.9449265003204346,
      "learning_rate": 8.524246442705153e-07,
      "loss": 0.7974,
      "step": 1510
    },
    {
      "epoch": 2.653812445223488,
      "grad_norm": 0.9578828811645508,
      "learning_rate": 8.117190934879593e-07,
      "loss": 0.8175,
      "step": 1515
    },
    {
      "epoch": 2.662576687116564,
      "grad_norm": 0.9990285038948059,
      "learning_rate": 7.719682384357308e-07,
      "loss": 0.8147,
      "step": 1520
    },
    {
      "epoch": 2.6713409290096406,
      "grad_norm": 0.9652912616729736,
      "learning_rate": 7.33176209371923e-07,
      "loss": 0.8429,
      "step": 1525
    },
    {
      "epoch": 2.680105170902717,
      "grad_norm": 0.9373207092285156,
      "learning_rate": 6.953470369291349e-07,
      "loss": 0.825,
      "step": 1530
    },
    {
      "epoch": 2.688869412795793,
      "grad_norm": 0.9682218432426453,
      "learning_rate": 6.5848465169566e-07,
      "loss": 0.7916,
      "step": 1535
    },
    {
      "epoch": 2.6976336546888695,
      "grad_norm": 0.995035707950592,
      "learning_rate": 6.225928838071016e-07,
      "loss": 0.829,
      "step": 1540
    },
    {
      "epoch": 2.7063978965819455,
      "grad_norm": 0.9676108956336975,
      "learning_rate": 5.876754625483904e-07,
      "loss": 0.8497,
      "step": 1545
    },
    {
      "epoch": 2.715162138475022,
      "grad_norm": 0.9674281477928162,
      "learning_rate": 5.537360159663107e-07,
      "loss": 0.8126,
      "step": 1550
    },
    {
      "epoch": 2.7239263803680984,
      "grad_norm": 0.9768509864807129,
      "learning_rate": 5.207780704925314e-07,
      "loss": 0.8432,
      "step": 1555
    },
    {
      "epoch": 2.7326906222611744,
      "grad_norm": 0.9932735562324524,
      "learning_rate": 4.888050505771869e-07,
      "loss": 0.8293,
      "step": 1560
    },
    {
      "epoch": 2.7414548641542504,
      "grad_norm": 0.9800174832344055,
      "learning_rate": 4.5782027833307983e-07,
      "loss": 0.7843,
      "step": 1565
    },
    {
      "epoch": 2.750219106047327,
      "grad_norm": 0.9393450021743774,
      "learning_rate": 4.2782697319048603e-07,
      "loss": 0.8016,
      "step": 1570
    },
    {
      "epoch": 2.7589833479404033,
      "grad_norm": 0.9714465737342834,
      "learning_rate": 3.9882825156265846e-07,
      "loss": 0.8264,
      "step": 1575
    },
    {
      "epoch": 2.7677475898334793,
      "grad_norm": 0.975568950176239,
      "learning_rate": 3.708271265220087e-07,
      "loss": 0.802,
      "step": 1580
    },
    {
      "epoch": 2.776511831726556,
      "grad_norm": 0.9788158535957336,
      "learning_rate": 3.4382650748704173e-07,
      "loss": 0.8374,
      "step": 1585
    },
    {
      "epoch": 2.785276073619632,
      "grad_norm": 0.9417116641998291,
      "learning_rate": 3.178291999200633e-07,
      "loss": 0.8181,
      "step": 1590
    },
    {
      "epoch": 2.7940403155127083,
      "grad_norm": 0.9802819490432739,
      "learning_rate": 2.928379050356722e-07,
      "loss": 0.8208,
      "step": 1595
    },
    {
      "epoch": 2.8028045574057843,
      "grad_norm": 0.9727985858917236,
      "learning_rate": 2.6885521952010105e-07,
      "loss": 0.7862,
      "step": 1600
    },
    {
      "epoch": 2.8115687992988607,
      "grad_norm": 0.9225666522979736,
      "learning_rate": 2.458836352614069e-07,
      "loss": 0.7791,
      "step": 1605
    },
    {
      "epoch": 2.8203330411919367,
      "grad_norm": 1.038718342781067,
      "learning_rate": 2.2392553909055813e-07,
      "loss": 0.8164,
      "step": 1610
    },
    {
      "epoch": 2.829097283085013,
      "grad_norm": 0.945773184299469,
      "learning_rate": 2.029832125334319e-07,
      "loss": 0.8277,
      "step": 1615
    },
    {
      "epoch": 2.8378615249780896,
      "grad_norm": 0.9560094475746155,
      "learning_rate": 1.8305883157375804e-07,
      "loss": 0.7974,
      "step": 1620
    },
    {
      "epoch": 2.8466257668711656,
      "grad_norm": 0.9896951913833618,
      "learning_rate": 1.6415446642702337e-07,
      "loss": 0.8084,
      "step": 1625
    },
    {
      "epoch": 2.8553900087642416,
      "grad_norm": 0.9845879077911377,
      "learning_rate": 1.4627208132536818e-07,
      "loss": 0.8216,
      "step": 1630
    },
    {
      "epoch": 2.864154250657318,
      "grad_norm": 0.9730380177497864,
      "learning_rate": 1.2941353431350058e-07,
      "loss": 0.7997,
      "step": 1635
    },
    {
      "epoch": 2.8729184925503946,
      "grad_norm": 0.9810739159584045,
      "learning_rate": 1.1358057705563641e-07,
      "loss": 0.8212,
      "step": 1640
    },
    {
      "epoch": 2.8816827344434706,
      "grad_norm": 0.9314019083976746,
      "learning_rate": 9.877485465349057e-08,
      "loss": 0.7794,
      "step": 1645
    },
    {
      "epoch": 2.890446976336547,
      "grad_norm": 0.9651502966880798,
      "learning_rate": 8.499790547535025e-08,
      "loss": 0.8138,
      "step": 1650
    },
    {
      "epoch": 2.899211218229623,
      "grad_norm": 0.966038167476654,
      "learning_rate": 7.225116099623287e-08,
      "loss": 0.8212,
      "step": 1655
    },
    {
      "epoch": 2.9079754601226995,
      "grad_norm": 0.9493021965026855,
      "learning_rate": 6.053594564914611e-08,
      "loss": 0.832,
      "step": 1660
    },
    {
      "epoch": 2.9167397020157755,
      "grad_norm": 0.9688047766685486,
      "learning_rate": 4.985347668747809e-08,
      "loss": 0.8239,
      "step": 1665
    },
    {
      "epoch": 2.925503943908852,
      "grad_norm": 0.9778699278831482,
      "learning_rate": 4.020486405852286e-08,
      "loss": 0.7976,
      "step": 1670
    },
    {
      "epoch": 2.934268185801928,
      "grad_norm": 0.9479379653930664,
      "learning_rate": 3.15911102881461e-08,
      "loss": 0.8375,
      "step": 1675
    },
    {
      "epoch": 2.9430324276950044,
      "grad_norm": 1.0030702352523804,
      "learning_rate": 2.4013110376623906e-08,
      "loss": 0.8225,
      "step": 1680
    },
    {
      "epoch": 2.951796669588081,
      "grad_norm": 1.0119658708572388,
      "learning_rate": 1.747165170564724e-08,
      "loss": 0.8276,
      "step": 1685
    },
    {
      "epoch": 2.960560911481157,
      "grad_norm": 0.9981706738471985,
      "learning_rate": 1.1967413956510687e-08,
      "loss": 0.8661,
      "step": 1690
    },
    {
      "epoch": 2.969325153374233,
      "grad_norm": 0.9298052787780762,
      "learning_rate": 7.500969039491156e-09,
      "loss": 0.8439,
      "step": 1695
    },
    {
      "epoch": 2.9780893952673093,
      "grad_norm": 0.9936395287513733,
      "learning_rate": 4.072781034425432e-09,
      "loss": 0.8221,
      "step": 1700
    },
    {
      "epoch": 2.9868536371603858,
      "grad_norm": 1.0040860176086426,
      "learning_rate": 1.6832061424865155e-09,
      "loss": 0.818,
      "step": 1705
    },
    {
      "epoch": 2.9956178790534618,
      "grad_norm": 0.9950876235961914,
      "learning_rate": 3.324926491787839e-10,
      "loss": 0.8279,
      "step": 1710
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.1766911745071411,
      "eval_runtime": 199.3045,
      "eval_samples_per_second": 9.167,
      "eval_steps_per_second": 2.293,
      "step": 1713
    },
    {
      "epoch": 3.0,
      "step": 1713,
      "total_flos": 90953314467840.0,
      "train_loss": 0.9790556504583052,
      "train_runtime": 12705.6497,
      "train_samples_per_second": 8.618,
      "train_steps_per_second": 0.135
    }
  ],
  "logging_steps": 5,
  "max_steps": 1713,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 90953314467840.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
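
A minimal sketch of reading this state programmatically, assuming the JSON above is saved locally as trainer_state.json (the path is an assumption; the field names follow the log_history schema shown above). Train, eval, and summary entries carry disjoint loss keys, so filtering on key presence is enough to separate them:

```python
import json

# Load the trainer state shown above (local path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training-log entries carry "loss"; the per-epoch eval entries carry
# "eval_loss"; the final summary entry carries "train_loss" and matches neither.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train)} training points, last train loss {train[-1][1]}")
for step, loss in evals:
    print(f"eval @ step {step}: loss {loss:.4f}")
```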