1992 lines
48 KiB
JSON
1992 lines
48 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 1384,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.003614022406938923,
|
|
"grad_norm": 65.45185089111328,
|
|
"learning_rate": 5.755395683453238e-07,
|
|
"loss": 1.1416,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.007228044813877846,
|
|
"grad_norm": 13.528879165649414,
|
|
"learning_rate": 1.2949640287769785e-06,
|
|
"loss": 0.7177,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.01084206722081677,
|
|
"grad_norm": 6.224380016326904,
|
|
"learning_rate": 2.0143884892086333e-06,
|
|
"loss": 0.322,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.014456089627755691,
|
|
"grad_norm": 0.6761742234230042,
|
|
"learning_rate": 2.733812949640288e-06,
|
|
"loss": 0.0856,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.018070112034694615,
|
|
"grad_norm": 1.0489720106124878,
|
|
"learning_rate": 3.453237410071943e-06,
|
|
"loss": 0.0707,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.02168413444163354,
|
|
"grad_norm": 0.18744826316833496,
|
|
"learning_rate": 4.172661870503597e-06,
|
|
"loss": 0.0709,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.02529815684857246,
|
|
"grad_norm": 0.28395524621009827,
|
|
"learning_rate": 4.892086330935253e-06,
|
|
"loss": 0.072,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.028912179255511383,
|
|
"grad_norm": 0.3135656416416168,
|
|
"learning_rate": 5.611510791366906e-06,
|
|
"loss": 0.0709,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.03252620166245031,
|
|
"grad_norm": 0.2624616324901581,
|
|
"learning_rate": 6.330935251798561e-06,
|
|
"loss": 0.0728,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.03614022406938923,
|
|
"grad_norm": 0.06319738179445267,
|
|
"learning_rate": 7.050359712230216e-06,
|
|
"loss": 0.0724,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.03975424647632815,
|
|
"grad_norm": 1.5401397943496704,
|
|
"learning_rate": 7.769784172661872e-06,
|
|
"loss": 0.0725,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.04336826888326708,
|
|
"grad_norm": 0.6666511297225952,
|
|
"learning_rate": 8.489208633093526e-06,
|
|
"loss": 0.0681,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.046982291290206,
|
|
"grad_norm": 0.36439529061317444,
|
|
"learning_rate": 9.20863309352518e-06,
|
|
"loss": 0.0656,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.05059631369714492,
|
|
"grad_norm": 1.2289910316467285,
|
|
"learning_rate": 9.928057553956835e-06,
|
|
"loss": 0.0658,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.054210336104083844,
|
|
"grad_norm": 0.6084970235824585,
|
|
"learning_rate": 1.0647482014388491e-05,
|
|
"loss": 0.0629,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.057824358511022765,
|
|
"grad_norm": 0.249479740858078,
|
|
"learning_rate": 1.1366906474820146e-05,
|
|
"loss": 0.0608,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.061438380917961694,
|
|
"grad_norm": 0.4229901134967804,
|
|
"learning_rate": 1.20863309352518e-05,
|
|
"loss": 0.0609,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.06505240332490062,
|
|
"grad_norm": 0.31077179312705994,
|
|
"learning_rate": 1.2805755395683454e-05,
|
|
"loss": 0.0587,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.06866642573183954,
|
|
"grad_norm": 0.6159167885780334,
|
|
"learning_rate": 1.3525179856115109e-05,
|
|
"loss": 0.0562,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.07228044813877846,
|
|
"grad_norm": 0.6450925469398499,
|
|
"learning_rate": 1.4244604316546765e-05,
|
|
"loss": 0.0553,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.07589447054571738,
|
|
"grad_norm": 0.4479381740093231,
|
|
"learning_rate": 1.496402877697842e-05,
|
|
"loss": 0.0627,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.0795084929526563,
|
|
"grad_norm": 0.13854101300239563,
|
|
"learning_rate": 1.5683453237410072e-05,
|
|
"loss": 0.0608,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.08312251535959522,
|
|
"grad_norm": 0.5709484219551086,
|
|
"learning_rate": 1.640287769784173e-05,
|
|
"loss": 0.0645,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.08673653776653416,
|
|
"grad_norm": 0.13701078295707703,
|
|
"learning_rate": 1.7122302158273384e-05,
|
|
"loss": 0.0595,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.09035056017347308,
|
|
"grad_norm": 0.4921853244304657,
|
|
"learning_rate": 1.784172661870504e-05,
|
|
"loss": 0.0595,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.093964582580412,
|
|
"grad_norm": 1.0717633962631226,
|
|
"learning_rate": 1.8561151079136693e-05,
|
|
"loss": 0.0604,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.09757860498735092,
|
|
"grad_norm": 0.6324800848960876,
|
|
"learning_rate": 1.9280575539568347e-05,
|
|
"loss": 0.0587,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.10119262739428984,
|
|
"grad_norm": 0.2942172586917877,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0572,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.10480664980122877,
|
|
"grad_norm": 0.44763851165771484,
|
|
"learning_rate": 1.999920408755684e-05,
|
|
"loss": 0.0532,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.10842067220816769,
|
|
"grad_norm": 0.4999435842037201,
|
|
"learning_rate": 1.999681647692268e-05,
|
|
"loss": 0.0548,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.11203469461510661,
|
|
"grad_norm": 0.9629915952682495,
|
|
"learning_rate": 1.9992837548163315e-05,
|
|
"loss": 0.0611,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.11564871702204553,
|
|
"grad_norm": 0.36340564489364624,
|
|
"learning_rate": 1.998726793465454e-05,
|
|
"loss": 0.0534,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.11926273942898447,
|
|
"grad_norm": 0.4698989987373352,
|
|
"learning_rate": 1.9980108522981287e-05,
|
|
"loss": 0.0557,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.12287676183592339,
|
|
"grad_norm": 0.29845741391181946,
|
|
"learning_rate": 1.9971360452796523e-05,
|
|
"loss": 0.0587,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.1264907842428623,
|
|
"grad_norm": 0.11843477934598923,
|
|
"learning_rate": 1.996102511663983e-05,
|
|
"loss": 0.057,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.13010480664980123,
|
|
"grad_norm": 0.10305560380220413,
|
|
"learning_rate": 1.9949104159715746e-05,
|
|
"loss": 0.0532,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.13371882905674015,
|
|
"grad_norm": 0.25948402285575867,
|
|
"learning_rate": 1.993559947963185e-05,
|
|
"loss": 0.0497,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.13733285146367907,
|
|
"grad_norm": 0.12778767943382263,
|
|
"learning_rate": 1.9920513226096735e-05,
|
|
"loss": 0.053,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.140946873870618,
|
|
"grad_norm": 0.29483047127723694,
|
|
"learning_rate": 1.9903847800577777e-05,
|
|
"loss": 0.0558,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.14456089627755692,
|
|
"grad_norm": 0.20306497812271118,
|
|
"learning_rate": 1.9885605855918887e-05,
|
|
"loss": 0.0578,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.14817491868449584,
|
|
"grad_norm": 0.41377827525138855,
|
|
"learning_rate": 1.9865790295918212e-05,
|
|
"loss": 0.0573,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.15178894109143476,
|
|
"grad_norm": 0.37599676847457886,
|
|
"learning_rate": 1.984440427486591e-05,
|
|
"loss": 0.0508,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.15540296349837368,
|
|
"grad_norm": 0.18772058188915253,
|
|
"learning_rate": 1.9821451197042028e-05,
|
|
"loss": 0.0579,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.1590169859053126,
|
|
"grad_norm": 0.1844455748796463,
|
|
"learning_rate": 1.979693471617462e-05,
|
|
"loss": 0.0548,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.16263100831225152,
|
|
"grad_norm": 0.3249725103378296,
|
|
"learning_rate": 1.9770858734858123e-05,
|
|
"loss": 0.059,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.16624503071919045,
|
|
"grad_norm": 0.21680741012096405,
|
|
"learning_rate": 1.9743227403932135e-05,
|
|
"loss": 0.0508,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.1698590531261294,
|
|
"grad_norm": 0.17999523878097534,
|
|
"learning_rate": 1.9714045121820676e-05,
|
|
"loss": 0.0504,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.17347307553306832,
|
|
"grad_norm": 0.19252051413059235,
|
|
"learning_rate": 1.968331653383204e-05,
|
|
"loss": 0.0524,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.17708709794000724,
|
|
"grad_norm": 0.39402344822883606,
|
|
"learning_rate": 1.9651046531419335e-05,
|
|
"loss": 0.0536,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.18070112034694616,
|
|
"grad_norm": 0.38865312933921814,
|
|
"learning_rate": 1.961724025140185e-05,
|
|
"loss": 0.0517,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.18431514275388508,
|
|
"grad_norm": 0.7893776297569275,
|
|
"learning_rate": 1.9581903075147372e-05,
|
|
"loss": 0.0577,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.187929165160824,
|
|
"grad_norm": 0.8502449989318848,
|
|
"learning_rate": 1.9545040627715554e-05,
|
|
"loss": 0.052,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.19154318756776292,
|
|
"grad_norm": 0.33574968576431274,
|
|
"learning_rate": 1.9506658776962522e-05,
|
|
"loss": 0.0548,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.19515720997470185,
|
|
"grad_norm": 0.39989784359931946,
|
|
"learning_rate": 1.946676363260679e-05,
|
|
"loss": 0.0513,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.19877123238164077,
|
|
"grad_norm": 0.26637327671051025,
|
|
"learning_rate": 1.942536154525673e-05,
|
|
"loss": 0.0515,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.2023852547885797,
|
|
"grad_norm": 0.20739665627479553,
|
|
"learning_rate": 1.9382459105399634e-05,
|
|
"loss": 0.0468,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.2059992771955186,
|
|
"grad_norm": 0.12362375110387802,
|
|
"learning_rate": 1.9338063142352644e-05,
|
|
"loss": 0.0455,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.20961329960245753,
|
|
"grad_norm": 0.20232897996902466,
|
|
"learning_rate": 1.9292180723175656e-05,
|
|
"loss": 0.0497,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.21322732200939645,
|
|
"grad_norm": 0.1344902068376541,
|
|
"learning_rate": 1.9244819151546325e-05,
|
|
"loss": 0.0526,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.21684134441633537,
|
|
"grad_norm": 0.33682653307914734,
|
|
"learning_rate": 1.9195985966597495e-05,
|
|
"loss": 0.0529,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.2204553668232743,
|
|
"grad_norm": 0.2771964371204376,
|
|
"learning_rate": 1.9145688941717074e-05,
|
|
"loss": 0.0435,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.22406938923021322,
|
|
"grad_norm": 0.6373592615127563,
|
|
"learning_rate": 1.9093936083310653e-05,
|
|
"loss": 0.0531,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.22768341163715214,
|
|
"grad_norm": 0.23717528581619263,
|
|
"learning_rate": 1.9040735629527027e-05,
|
|
"loss": 0.0525,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.23129743404409106,
|
|
"grad_norm": 0.23559607565402985,
|
|
"learning_rate": 1.8986096048946826e-05,
|
|
"loss": 0.0476,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.23491145645103,
|
|
"grad_norm": 0.19278410077095032,
|
|
"learning_rate": 1.893002603923446e-05,
|
|
"loss": 0.0515,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.23852547885796893,
|
|
"grad_norm": 0.10223805904388428,
|
|
"learning_rate": 1.8872534525753617e-05,
|
|
"loss": 0.0516,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.24213950126490785,
|
|
"grad_norm": 0.13931454718112946,
|
|
"learning_rate": 1.881363066014649e-05,
|
|
"loss": 0.0518,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.24575352367184677,
|
|
"grad_norm": 0.26712024211883545,
|
|
"learning_rate": 1.875332381887699e-05,
|
|
"loss": 0.0475,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.2493675460787857,
|
|
"grad_norm": 0.1877209097146988,
|
|
"learning_rate": 1.86916236017382e-05,
|
|
"loss": 0.0491,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.2529815684857246,
|
|
"grad_norm": 0.13105887174606323,
|
|
"learning_rate": 1.862853983032423e-05,
|
|
"loss": 0.0473,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.25659559089266354,
|
|
"grad_norm": 0.15709713101387024,
|
|
"learning_rate": 1.8564082546466804e-05,
|
|
"loss": 0.0496,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.26020961329960246,
|
|
"grad_norm": 0.14228186011314392,
|
|
"learning_rate": 1.8498262010636777e-05,
|
|
"loss": 0.0486,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.2638236357065414,
|
|
"grad_norm": 0.45368531346321106,
|
|
"learning_rate": 1.8431088700310846e-05,
|
|
"loss": 0.0513,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.2674376581134803,
|
|
"grad_norm": 0.20928220450878143,
|
|
"learning_rate": 1.836257330830372e-05,
|
|
"loss": 0.0542,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.2710516805204192,
|
|
"grad_norm": 0.18949194252490997,
|
|
"learning_rate": 1.8292726741066008e-05,
|
|
"loss": 0.0466,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.27466570292735815,
|
|
"grad_norm": 0.26113438606262207,
|
|
"learning_rate": 1.8221560116948103e-05,
|
|
"loss": 0.0426,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.27827972533429707,
|
|
"grad_norm": 0.13533490896224976,
|
|
"learning_rate": 1.814908476443034e-05,
|
|
"loss": 0.0454,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.281893747741236,
|
|
"grad_norm": 0.15402187407016754,
|
|
"learning_rate": 1.80753122203197e-05,
|
|
"loss": 0.0486,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.2855077701481749,
|
|
"grad_norm": 0.4744708240032196,
|
|
"learning_rate": 1.8000254227913346e-05,
|
|
"loss": 0.0509,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.28912179255511383,
|
|
"grad_norm": 0.9314901232719421,
|
|
"learning_rate": 1.7923922735129303e-05,
|
|
"loss": 0.0542,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.29273581496205275,
|
|
"grad_norm": 0.31646886467933655,
|
|
"learning_rate": 1.7846329892604548e-05,
|
|
"loss": 0.0422,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.2963498373689917,
|
|
"grad_norm": 0.36957696080207825,
|
|
"learning_rate": 1.7767488051760858e-05,
|
|
"loss": 0.0479,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.2999638597759306,
|
|
"grad_norm": 0.13091270625591278,
|
|
"learning_rate": 1.7687409762838666e-05,
|
|
"loss": 0.0454,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.3035778821828695,
|
|
"grad_norm": 0.11943885684013367,
|
|
"learning_rate": 1.760610777289929e-05,
|
|
"loss": 0.0483,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.30719190458980844,
|
|
"grad_norm": 0.1768166720867157,
|
|
"learning_rate": 1.7523595023795814e-05,
|
|
"loss": 0.045,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.31080592699674736,
|
|
"grad_norm": 0.3109482228755951,
|
|
"learning_rate": 1.743988465011299e-05,
|
|
"loss": 0.0449,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.3144199494036863,
|
|
"grad_norm": 0.33029767870903015,
|
|
"learning_rate": 1.735498997707642e-05,
|
|
"loss": 0.0469,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.3180339718106252,
|
|
"grad_norm": 0.09848926961421967,
|
|
"learning_rate": 1.7268924518431437e-05,
|
|
"loss": 0.0436,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.3216479942175641,
|
|
"grad_norm": 0.25710049271583557,
|
|
"learning_rate": 1.7181701974291927e-05,
|
|
"loss": 0.0457,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.32526201662450305,
|
|
"grad_norm": 0.2981012165546417,
|
|
"learning_rate": 1.7093336228959538e-05,
|
|
"loss": 0.0437,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.32887603903144197,
|
|
"grad_norm": 0.2785588204860687,
|
|
"learning_rate": 1.700384134871351e-05,
|
|
"loss": 0.0499,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.3324900614383809,
|
|
"grad_norm": 0.34511667490005493,
|
|
"learning_rate": 1.691323157957161e-05,
|
|
"loss": 0.047,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.33610408384531987,
|
|
"grad_norm": 0.4485796391963959,
|
|
"learning_rate": 1.6821521345022377e-05,
|
|
"loss": 0.0453,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.3397181062522588,
|
|
"grad_norm": 0.22290338575839996,
|
|
"learning_rate": 1.672872524372919e-05,
|
|
"loss": 0.0455,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.3433321286591977,
|
|
"grad_norm": 0.11010613292455673,
|
|
"learning_rate": 1.663485804720638e-05,
|
|
"loss": 0.041,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.34694615106613663,
|
|
"grad_norm": 0.17054611444473267,
|
|
"learning_rate": 1.6539934697467895e-05,
|
|
"loss": 0.0423,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.35056017347307555,
|
|
"grad_norm": 0.31248804926872253,
|
|
"learning_rate": 1.644397030464877e-05,
|
|
"loss": 0.0458,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.3541741958800145,
|
|
"grad_norm": 0.2744062542915344,
|
|
"learning_rate": 1.634698014459988e-05,
|
|
"loss": 0.0475,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.3577882182869534,
|
|
"grad_norm": 0.23750263452529907,
|
|
"learning_rate": 1.6248979656456273e-05,
|
|
"loss": 0.0442,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.3614022406938923,
|
|
"grad_norm": 0.46631160378456116,
|
|
"learning_rate": 1.614998444017954e-05,
|
|
"loss": 0.0499,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.3614022406938923,
|
|
"eval_loss": 0.06283282488584518,
|
|
"eval_runtime": 1684.1491,
|
|
"eval_samples_per_second": 38.124,
|
|
"eval_steps_per_second": 1.192,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.36501626310083124,
|
|
"grad_norm": 0.3171086013317108,
|
|
"learning_rate": 1.6050010254074564e-05,
|
|
"loss": 0.0427,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.36863028550777016,
|
|
"grad_norm": 0.14167694747447968,
|
|
"learning_rate": 1.5949073012281092e-05,
|
|
"loss": 0.0434,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.3722443079147091,
|
|
"grad_norm": 0.21627700328826904,
|
|
"learning_rate": 1.5847188782240473e-05,
|
|
"loss": 0.0399,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.375858330321648,
|
|
"grad_norm": 0.17993256449699402,
|
|
"learning_rate": 1.5744373782137993e-05,
|
|
"loss": 0.0397,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.3794723527285869,
|
|
"grad_norm": 0.1952645629644394,
|
|
"learning_rate": 1.5640644378321236e-05,
|
|
"loss": 0.0439,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.38308637513552585,
|
|
"grad_norm": 0.36679714918136597,
|
|
"learning_rate": 1.5536017082694846e-05,
|
|
"loss": 0.0445,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.38670039754246477,
|
|
"grad_norm": 0.23930436372756958,
|
|
"learning_rate": 1.5430508550092123e-05,
|
|
"loss": 0.0459,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.3903144199494037,
|
|
"grad_norm": 0.2509578466415405,
|
|
"learning_rate": 1.532413557562386e-05,
|
|
"loss": 0.0436,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.3939284423563426,
|
|
"grad_norm": 0.34942692518234253,
|
|
"learning_rate": 1.5216915092004847e-05,
|
|
"loss": 0.0417,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.39754246476328153,
|
|
"grad_norm": 0.1713494062423706,
|
|
"learning_rate": 1.5108864166858506e-05,
|
|
"loss": 0.0399,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.40115648717022045,
|
|
"grad_norm": 0.14295130968093872,
|
|
"learning_rate": 1.5000000000000002e-05,
|
|
"loss": 0.0402,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.4047705095771594,
|
|
"grad_norm": 0.13941536843776703,
|
|
"learning_rate": 1.4890339920698334e-05,
|
|
"loss": 0.0433,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.4083845319840983,
|
|
"grad_norm": 0.46250632405281067,
|
|
"learning_rate": 1.4779901384917833e-05,
|
|
"loss": 0.0403,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.4119985543910372,
|
|
"grad_norm": 0.43441110849380493,
|
|
"learning_rate": 1.4668701972539459e-05,
|
|
"loss": 0.0429,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.41561257679797614,
|
|
"grad_norm": 0.26317739486694336,
|
|
"learning_rate": 1.4556759384562418e-05,
|
|
"loss": 0.0441,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.41922659920491506,
|
|
"grad_norm": 0.42390304803848267,
|
|
"learning_rate": 1.444409144028644e-05,
|
|
"loss": 0.0436,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.422840621611854,
|
|
"grad_norm": 0.145647332072258,
|
|
"learning_rate": 1.4330716074475287e-05,
|
|
"loss": 0.0401,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.4264546440187929,
|
|
"grad_norm": 0.2961156964302063,
|
|
"learning_rate": 1.421665133450184e-05,
|
|
"loss": 0.0431,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.4300686664257318,
|
|
"grad_norm": 0.16565440595149994,
|
|
"learning_rate": 1.4101915377475275e-05,
|
|
"loss": 0.0372,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.43368268883267075,
|
|
"grad_norm": 0.16353151202201843,
|
|
"learning_rate": 1.398652646735076e-05,
|
|
"loss": 0.0368,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.43729671123960967,
|
|
"grad_norm": 0.22869880497455597,
|
|
"learning_rate": 1.3870502972022175e-05,
|
|
"loss": 0.0402,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.4409107336465486,
|
|
"grad_norm": 0.17049099504947662,
|
|
"learning_rate": 1.3753863360398243e-05,
|
|
"loss": 0.0413,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.4445247560534875,
|
|
"grad_norm": 0.4078167676925659,
|
|
"learning_rate": 1.3636626199462615e-05,
|
|
"loss": 0.0357,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.44813877846042643,
|
|
"grad_norm": 0.2695305347442627,
|
|
"learning_rate": 1.351881015131833e-05,
|
|
"loss": 0.0416,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.45175280086736536,
|
|
"grad_norm": 0.2164461761713028,
|
|
"learning_rate": 1.3400433970217137e-05,
|
|
"loss": 0.0361,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.4553668232743043,
|
|
"grad_norm": 0.2737310528755188,
|
|
"learning_rate": 1.3281516499574134e-05,
|
|
"loss": 0.0426,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.4589808456812432,
|
|
"grad_norm": 0.27422964572906494,
|
|
"learning_rate": 1.316207666896824e-05,
|
|
"loss": 0.0381,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.4625948680881821,
|
|
"grad_norm": 0.2095690369606018,
|
|
"learning_rate": 1.3042133491128934e-05,
|
|
"loss": 0.0433,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.46620889049512104,
|
|
"grad_norm": 0.19285966455936432,
|
|
"learning_rate": 1.2921706058909757e-05,
|
|
"loss": 0.0376,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.46982291290206,
|
|
"grad_norm": 0.2235649824142456,
|
|
"learning_rate": 1.2800813542249073e-05,
|
|
"loss": 0.034,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.47343693530899894,
|
|
"grad_norm": 0.21383994817733765,
|
|
"learning_rate": 1.2679475185118535e-05,
|
|
"loss": 0.034,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.47705095771593786,
|
|
"grad_norm": 0.12778101861476898,
|
|
"learning_rate": 1.2557710302459803e-05,
|
|
"loss": 0.0422,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.4806649801228768,
|
|
"grad_norm": 0.13164684176445007,
|
|
"learning_rate": 1.2435538277109919e-05,
|
|
"loss": 0.0458,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.4842790025298157,
|
|
"grad_norm": 0.10109174251556396,
|
|
"learning_rate": 1.2312978556715934e-05,
|
|
"loss": 0.0444,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.4878930249367546,
|
|
"grad_norm": 0.12084756791591644,
|
|
"learning_rate": 1.2190050650639131e-05,
|
|
"loss": 0.0382,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.49150704734369355,
|
|
"grad_norm": 0.27953436970710754,
|
|
"learning_rate": 1.206677412684953e-05,
|
|
"loss": 0.0398,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.49512106975063247,
|
|
"grad_norm": 0.24097155034542084,
|
|
"learning_rate": 1.1943168608810977e-05,
|
|
"loss": 0.0396,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.4987350921575714,
|
|
"grad_norm": 0.17015314102172852,
|
|
"learning_rate": 1.1819253772357442e-05,
|
|
"loss": 0.0374,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.5023491145645103,
|
|
"grad_norm": 0.2601104974746704,
|
|
"learning_rate": 1.1695049342560969e-05,
|
|
"loss": 0.0461,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.5059631369714492,
|
|
"grad_norm": 0.09341022372245789,
|
|
"learning_rate": 1.157057509059179e-05,
|
|
"loss": 0.0355,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.5095771593783881,
|
|
"grad_norm": 0.2533698081970215,
|
|
"learning_rate": 1.144585083057111e-05,
|
|
"loss": 0.0389,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.5131911817853271,
|
|
"grad_norm": 0.3184564411640167,
|
|
"learning_rate": 1.1320896416417026e-05,
|
|
"loss": 0.0442,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.5168052041922659,
|
|
"grad_norm": 0.3521571755409241,
|
|
"learning_rate": 1.119573173868415e-05,
|
|
"loss": 0.0473,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.5204192265992049,
|
|
"grad_norm": 0.1924971640110016,
|
|
"learning_rate": 1.1070376721397374e-05,
|
|
"loss": 0.0411,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.5240332490061438,
|
|
"grad_norm": 0.13801982998847961,
|
|
"learning_rate": 1.0944851318880314e-05,
|
|
"loss": 0.0416,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.5276472714130828,
|
|
"grad_norm": 0.4150485694408417,
|
|
"learning_rate": 1.0819175512578925e-05,
|
|
"loss": 0.0388,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.5312612938200216,
|
|
"grad_norm": 0.17930372059345245,
|
|
"learning_rate": 1.0693369307880817e-05,
|
|
"loss": 0.04,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.5348753162269606,
|
|
"grad_norm": 0.281807005405426,
|
|
"learning_rate": 1.0567452730930743e-05,
|
|
"loss": 0.0349,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.5384893386338996,
|
|
"grad_norm": 0.18618857860565186,
|
|
"learning_rate": 1.0441445825442773e-05,
|
|
"loss": 0.0387,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.5421033610408384,
|
|
"grad_norm": 0.18845702707767487,
|
|
"learning_rate": 1.0315368649509716e-05,
|
|
"loss": 0.0379,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.5457173834477774,
|
|
"grad_norm": 0.3343959152698517,
|
|
"learning_rate": 1.0189241272410191e-05,
|
|
"loss": 0.0385,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.5493314058547163,
|
|
"grad_norm": 0.22052714228630066,
|
|
"learning_rate": 1.0063083771413975e-05,
|
|
"loss": 0.0561,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.5529454282616553,
|
|
"grad_norm": 0.11884549260139465,
|
|
"learning_rate": 9.936916228586027e-06,
|
|
"loss": 0.0383,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.5565594506685941,
|
|
"grad_norm": 0.1994011104106903,
|
|
"learning_rate": 9.810758727589814e-06,
|
|
"loss": 0.0391,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.5601734730755331,
|
|
"grad_norm": 0.264037162065506,
|
|
"learning_rate": 9.684631350490287e-06,
|
|
"loss": 0.0324,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.563787495482472,
|
|
"grad_norm": 0.22638504207134247,
|
|
"learning_rate": 9.55855417455723e-06,
|
|
"loss": 0.0352,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.567401517889411,
|
|
"grad_norm": 0.1375981718301773,
|
|
"learning_rate": 9.43254726906926e-06,
|
|
"loss": 0.0358,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.5710155402963498,
|
|
"grad_norm": 0.1931852251291275,
|
|
"learning_rate": 9.306630692119183e-06,
|
|
"loss": 0.0375,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.5746295627032888,
|
|
"grad_norm": 0.3065533936023712,
|
|
"learning_rate": 9.180824487421077e-06,
|
|
"loss": 0.0386,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.5782435851102277,
|
|
"grad_norm": 0.13863135874271393,
|
|
"learning_rate": 9.055148681119688e-06,
|
|
"loss": 0.034,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.5818576075171666,
|
|
"grad_norm": 0.1802493929862976,
|
|
"learning_rate": 8.929623278602627e-06,
|
|
"loss": 0.0368,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.5854716299241055,
|
|
"grad_norm": 0.12986677885055542,
|
|
"learning_rate": 8.80426826131585e-06,
|
|
"loss": 0.0308,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.5890856523310445,
|
|
"grad_norm": 0.2099331021308899,
|
|
"learning_rate": 8.67910358358298e-06,
|
|
"loss": 0.0319,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.5926996747379834,
|
|
"grad_norm": 0.15507160127162933,
|
|
"learning_rate": 8.554149169428894e-06,
|
|
"loss": 0.0296,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.5963136971449223,
|
|
"grad_norm": 0.2265109270811081,
|
|
"learning_rate": 8.429424909408215e-06,
|
|
"loss": 0.0359,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.5999277195518612,
|
|
"grad_norm": 0.16594430804252625,
|
|
"learning_rate": 8.304950657439034e-06,
|
|
"loss": 0.0394,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.6035417419588002,
|
|
"grad_norm": 0.175150528550148,
|
|
"learning_rate": 8.180746227642561e-06,
|
|
"loss": 0.0348,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.607155764365739,
|
|
"grad_norm": 0.15162833034992218,
|
|
"learning_rate": 8.056831391189024e-06,
|
|
"loss": 0.0349,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.610769786772678,
|
|
"grad_norm": 0.22084273397922516,
|
|
"learning_rate": 7.93322587315047e-06,
|
|
"loss": 0.0363,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.6143838091796169,
|
|
"grad_norm": 0.2815225124359131,
|
|
"learning_rate": 7.809949349360872e-06,
|
|
"loss": 0.0369,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.6179978315865559,
|
|
"grad_norm": 0.38362768292427063,
|
|
"learning_rate": 7.687021443284071e-06,
|
|
"loss": 0.0315,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.6216118539934947,
|
|
"grad_norm": 0.44302356243133545,
|
|
"learning_rate": 7.564461722890082e-06,
|
|
"loss": 0.037,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.6252258764004337,
|
|
"grad_norm": 0.16820432245731354,
|
|
"learning_rate": 7.4422896975402004e-06,
|
|
"loss": 0.0364,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.6288398988073726,
|
|
"grad_norm": 0.17497654259204865,
|
|
"learning_rate": 7.320524814881471e-06,
|
|
"loss": 0.0335,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.6324539212143115,
|
|
"grad_norm": 0.23959729075431824,
|
|
"learning_rate": 7.199186457750931e-06,
|
|
"loss": 0.0331,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.6360679436212504,
|
|
"grad_norm": 0.216622993350029,
|
|
"learning_rate": 7.078293941090248e-06,
|
|
"loss": 0.0359,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.6396819660281894,
|
|
"grad_norm": 0.21011534333229065,
|
|
"learning_rate": 6.957866508871068e-06,
|
|
"loss": 0.041,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.6432959884351283,
|
|
"grad_norm": 0.13897277414798737,
|
|
"learning_rate": 6.837923331031761e-06,
|
|
"loss": 0.0326,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.6469100108420672,
|
|
"grad_norm": 0.15225118398666382,
|
|
"learning_rate": 6.718483500425868e-06,
|
|
"loss": 0.0356,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.6505240332490061,
|
|
"grad_norm": 0.16399000585079193,
|
|
"learning_rate": 6.599566029782863e-06,
|
|
"loss": 0.0394,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.6541380556559451,
|
|
"grad_norm": 0.15435738861560822,
|
|
"learning_rate": 6.48118984868167e-06,
|
|
"loss": 0.0361,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.6577520780628839,
|
|
"grad_norm": 0.23756234347820282,
|
|
"learning_rate": 6.363373800537388e-06,
|
|
"loss": 0.0349,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.6613661004698229,
|
|
"grad_norm": 0.17288844287395477,
|
|
"learning_rate": 6.246136639601763e-06,
|
|
"loss": 0.0351,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.6649801228767618,
|
|
"grad_norm": 0.1758948117494583,
|
|
"learning_rate": 6.129497027977829e-06,
|
|
"loss": 0.034,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.6685941452837008,
|
|
"grad_norm": 0.1595619171857834,
|
|
"learning_rate": 6.013473532649246e-06,
|
|
"loss": 0.0346,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.6722081676906397,
|
|
"grad_norm": 0.24954086542129517,
|
|
"learning_rate": 5.898084622524729e-06,
|
|
"loss": 0.0323,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.6758221900975786,
|
|
"grad_norm": 0.19846731424331665,
|
|
"learning_rate": 5.78334866549816e-06,
|
|
"loss": 0.0337,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.6794362125045176,
|
|
"grad_norm": 0.42544224858283997,
|
|
"learning_rate": 5.669283925524716e-06,
|
|
"loss": 0.0327,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.6830502349114564,
|
|
"grad_norm": 0.17253951728343964,
|
|
"learning_rate": 5.555908559713561e-06,
|
|
"loss": 0.04,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.6866642573183954,
|
|
"grad_norm": 0.18640030920505524,
|
|
"learning_rate": 5.443240615437586e-06,
|
|
"loss": 0.0328,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.6902782797253343,
|
|
"grad_norm": 0.1302676647901535,
|
|
"learning_rate": 5.33129802746054e-06,
|
|
"loss": 0.0361,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.6938923021322733,
|
|
"grad_norm": 0.1659248024225235,
|
|
"learning_rate": 5.22009861508217e-06,
|
|
"loss": 0.0332,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.6975063245392121,
|
|
"grad_norm": 0.19474443793296814,
|
|
"learning_rate": 5.109660079301668e-06,
|
|
"loss": 0.0327,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.7011203469461511,
|
|
"grad_norm": 0.2394312471151352,
|
|
"learning_rate": 5.000000000000003e-06,
|
|
"loss": 0.0305,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.70473436935309,
|
|
"grad_norm": 0.24105896055698395,
|
|
"learning_rate": 4.891135833141495e-06,
|
|
"loss": 0.0316,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.708348391760029,
|
|
"grad_norm": 0.15345264971256256,
|
|
"learning_rate": 4.783084907995156e-06,
|
|
"loss": 0.0341,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.7119624141669678,
|
|
"grad_norm": 0.23472526669502258,
|
|
"learning_rate": 4.675864424376146e-06,
|
|
"loss": 0.0352,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.7155764365739068,
|
|
"grad_norm": 0.12148404121398926,
|
|
"learning_rate": 4.569491449907878e-06,
|
|
"loss": 0.0339,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.7191904589808457,
|
|
"grad_norm": 0.2683773338794708,
|
|
"learning_rate": 4.463982917305155e-06,
|
|
"loss": 0.0434,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.7228044813877846,
|
|
"grad_norm": 0.1990807205438614,
|
|
"learning_rate": 4.359355621678765e-06,
|
|
"loss": 0.0333,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.7228044813877846,
|
|
"eval_loss": 0.06226345896720886,
|
|
"eval_runtime": 1684.5006,
|
|
"eval_samples_per_second": 38.116,
|
|
"eval_steps_per_second": 1.191,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.7264185037947235,
|
|
"grad_norm": 0.16073299944400787,
|
|
"learning_rate": 4.255626217862014e-06,
|
|
"loss": 0.0281,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.7300325262016625,
|
|
"grad_norm": 0.17389674484729767,
|
|
"learning_rate": 4.152811217759529e-06,
|
|
"loss": 0.0333,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.7336465486086013,
|
|
"grad_norm": 0.23013567924499512,
|
|
"learning_rate": 4.050926987718911e-06,
|
|
"loss": 0.0359,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.7372605710155403,
|
|
"grad_norm": 0.2871190905570984,
|
|
"learning_rate": 3.9499897459254375e-06,
|
|
"loss": 0.033,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.7408745934224792,
|
|
"grad_norm": 0.1907360553741455,
|
|
"learning_rate": 3.850015559820465e-06,
|
|
"loss": 0.0324,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.7444886158294182,
|
|
"grad_norm": 0.3101818561553955,
|
|
"learning_rate": 3.75102034354373e-06,
|
|
"loss": 0.037,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.748102638236357,
|
|
"grad_norm": 0.20055055618286133,
|
|
"learning_rate": 3.653019855400123e-06,
|
|
"loss": 0.0296,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.751716660643296,
|
|
"grad_norm": 0.23536404967308044,
|
|
"learning_rate": 3.5560296953512296e-06,
|
|
"loss": 0.0355,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.7553306830502349,
|
|
"grad_norm": 0.28307974338531494,
|
|
"learning_rate": 3.4600653025321085e-06,
|
|
"loss": 0.0281,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.7589447054571739,
|
|
"grad_norm": 0.32817333936691284,
|
|
"learning_rate": 3.3651419527936223e-06,
|
|
"loss": 0.029,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.7625587278641127,
|
|
"grad_norm": 0.2622580826282501,
|
|
"learning_rate": 3.2712747562708115e-06,
|
|
"loss": 0.0346,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.7661727502710517,
|
|
"grad_norm": 0.16355015337467194,
|
|
"learning_rate": 3.178478654977624e-06,
|
|
"loss": 0.0308,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.7697867726779906,
|
|
"grad_norm": 0.13598884642124176,
|
|
"learning_rate": 3.086768420428392e-06,
|
|
"loss": 0.0342,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.7734007950849295,
|
|
"grad_norm": 0.2859932482242584,
|
|
"learning_rate": 2.9961586512864947e-06,
|
|
"loss": 0.0321,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.7770148174918684,
|
|
"grad_norm": 0.20649324357509613,
|
|
"learning_rate": 2.906663771040468e-06,
|
|
"loss": 0.0321,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.7806288398988074,
|
|
"grad_norm": 0.17001411318778992,
|
|
"learning_rate": 2.8182980257080748e-06,
|
|
"loss": 0.0318,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.7842428623057462,
|
|
"grad_norm": 0.1272343248128891,
|
|
"learning_rate": 2.7310754815685627e-06,
|
|
"loss": 0.0319,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.7878568847126852,
|
|
"grad_norm": 0.18668726086616516,
|
|
"learning_rate": 2.64501002292358e-06,
|
|
"loss": 0.0337,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.7914709071196241,
|
|
"grad_norm": 0.22532936930656433,
|
|
"learning_rate": 2.5601153498870137e-06,
|
|
"loss": 0.0292,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.7950849295265631,
|
|
"grad_norm": 0.2742968797683716,
|
|
"learning_rate": 2.4764049762041874e-06,
|
|
"loss": 0.0345,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.7986989519335019,
|
|
"grad_norm": 0.262674480676651,
|
|
"learning_rate": 2.3938922271007147e-06,
|
|
"loss": 0.0282,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.8023129743404409,
|
|
"grad_norm": 0.1856016367673874,
|
|
"learning_rate": 2.312590237161335e-06,
|
|
"loss": 0.026,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.8059269967473799,
|
|
"grad_norm": 0.1422245353460312,
|
|
"learning_rate": 2.2325119482391466e-06,
|
|
"loss": 0.0323,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.8095410191543188,
|
|
"grad_norm": 0.17140090465545654,
|
|
"learning_rate": 2.153670107395456e-06,
|
|
"loss": 0.0266,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.8131550415612577,
|
|
"grad_norm": 0.3133852481842041,
|
|
"learning_rate": 2.0760772648707016e-06,
|
|
"loss": 0.0314,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.8167690639681966,
|
|
"grad_norm": 0.2690429091453552,
|
|
"learning_rate": 1.9997457720866554e-06,
|
|
"loss": 0.0259,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.8203830863751356,
|
|
"grad_norm": 0.2718668282032013,
|
|
"learning_rate": 1.924687779680302e-06,
|
|
"loss": 0.0269,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.8239971087820744,
|
|
"grad_norm": 0.25396299362182617,
|
|
"learning_rate": 1.8509152355696625e-06,
|
|
"loss": 0.0317,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.8276111311890134,
|
|
"grad_norm": 0.2634499669075012,
|
|
"learning_rate": 1.7784398830519002e-06,
|
|
"loss": 0.0283,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.8312251535959523,
|
|
"grad_norm": 0.28554320335388184,
|
|
"learning_rate": 1.7072732589339958e-06,
|
|
"loss": 0.0294,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.8348391760028913,
|
|
"grad_norm": 0.16729383170604706,
|
|
"learning_rate": 1.6374266916962832e-06,
|
|
"loss": 0.0276,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.8384531984098301,
|
|
"grad_norm": 0.3412957787513733,
|
|
"learning_rate": 1.5689112996891576e-06,
|
|
"loss": 0.0307,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.8420672208167691,
|
|
"grad_norm": 0.19754613935947418,
|
|
"learning_rate": 1.5017379893632255e-06,
|
|
"loss": 0.03,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.845681243223708,
|
|
"grad_norm": 0.24068793654441833,
|
|
"learning_rate": 1.4359174535331998e-06,
|
|
"loss": 0.0306,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.849295265630647,
|
|
"grad_norm": 0.14576299488544464,
|
|
"learning_rate": 1.3714601696757713e-06,
|
|
"loss": 0.0302,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.8529092880375858,
|
|
"grad_norm": 0.15564730763435364,
|
|
"learning_rate": 1.3083763982618026e-06,
|
|
"loss": 0.0284,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.8565233104445248,
|
|
"grad_norm": 0.25240421295166016,
|
|
"learning_rate": 1.2466761811230099e-06,
|
|
"loss": 0.0279,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.8601373328514637,
|
|
"grad_norm": 0.2509186267852783,
|
|
"learning_rate": 1.1863693398535115e-06,
|
|
"loss": 0.0313,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.8637513552584026,
|
|
"grad_norm": 0.22277849912643433,
|
|
"learning_rate": 1.1274654742463842e-06,
|
|
"loss": 0.0277,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.8673653776653415,
|
|
"grad_norm": 0.15197935700416565,
|
|
"learning_rate": 1.0699739607655434e-06,
|
|
"loss": 0.0348,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.8709794000722805,
|
|
"grad_norm": 0.27582135796546936,
|
|
"learning_rate": 1.01390395105318e-06,
|
|
"loss": 0.0317,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.8745934224792193,
|
|
"grad_norm": 0.2296678125858307,
|
|
"learning_rate": 9.592643704729754e-07,
|
|
"loss": 0.028,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.8782074448861583,
|
|
"grad_norm": 0.3100360631942749,
|
|
"learning_rate": 9.060639166893493e-07,
|
|
"loss": 0.0296,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.8818214672930972,
|
|
"grad_norm": 0.1880117952823639,
|
|
"learning_rate": 8.543110582829272e-07,
|
|
"loss": 0.0298,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.8854354897000362,
|
|
"grad_norm": 0.1505495011806488,
|
|
"learning_rate": 8.040140334025082e-07,
|
|
"loss": 0.0251,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.889049512106975,
|
|
"grad_norm": 0.1834176778793335,
|
|
"learning_rate": 7.551808484536782e-07,
|
|
"loss": 0.0289,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.892663534513914,
|
|
"grad_norm": 0.2518479824066162,
|
|
"learning_rate": 7.078192768243486e-07,
|
|
"loss": 0.034,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.8962775569208529,
|
|
"grad_norm": 0.2181636542081833,
|
|
"learning_rate": 6.61936857647355e-07,
|
|
"loss": 0.0308,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.8998915793277918,
|
|
"grad_norm": 0.28248122334480286,
|
|
"learning_rate": 6.175408946003703e-07,
|
|
"loss": 0.0338,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.9035056017347307,
|
|
"grad_norm": 0.18422535061836243,
|
|
"learning_rate": 5.746384547432738e-07,
|
|
"loss": 0.0267,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.9071196241416697,
|
|
"grad_norm": 0.17286597192287445,
|
|
"learning_rate": 5.332363673932106e-07,
|
|
"loss": 0.0318,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.9107336465486086,
|
|
"grad_norm": 0.23957087099552155,
|
|
"learning_rate": 4.933412230374812e-07,
|
|
"loss": 0.035,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.9143476689555475,
|
|
"grad_norm": 0.1773596704006195,
|
|
"learning_rate": 4.549593722844492e-07,
|
|
"loss": 0.0243,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.9179616913624864,
|
|
"grad_norm": 0.25387516617774963,
|
|
"learning_rate": 4.180969248526334e-07,
|
|
"loss": 0.0272,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.9215757137694254,
|
|
"grad_norm": 0.23314779996871948,
|
|
"learning_rate": 3.827597485981527e-07,
|
|
"loss": 0.0288,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.9251897361763642,
|
|
"grad_norm": 0.16649064421653748,
|
|
"learning_rate": 3.4895346858066723e-07,
|
|
"loss": 0.0295,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.9288037585833032,
|
|
"grad_norm": 0.1977885514497757,
|
|
"learning_rate": 3.166834661679596e-07,
|
|
"loss": 0.029,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.9324177809902421,
|
|
"grad_norm": 0.1530725359916687,
|
|
"learning_rate": 2.8595487817932424e-07,
|
|
"loss": 0.0268,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.9360318033971811,
|
|
"grad_norm": 0.2034778892993927,
|
|
"learning_rate": 2.5677259606786686e-07,
|
|
"loss": 0.0364,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.93964582580412,
|
|
"grad_norm": 0.2887728214263916,
|
|
"learning_rate": 2.2914126514187784e-07,
|
|
"loss": 0.0372,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.9432598482110589,
|
|
"grad_norm": 0.19726163148880005,
|
|
"learning_rate": 2.0306528382538103e-07,
|
|
"loss": 0.0283,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.9468738706179979,
|
|
"grad_norm": 0.19561438262462616,
|
|
"learning_rate": 1.7854880295797406e-07,
|
|
"loss": 0.0311,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.9504878930249367,
|
|
"grad_norm": 0.31874212622642517,
|
|
"learning_rate": 1.5559572513409338e-07,
|
|
"loss": 0.031,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.9541019154318757,
|
|
"grad_norm": 0.19174811244010925,
|
|
"learning_rate": 1.3420970408178912e-07,
|
|
"loss": 0.0285,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.9577159378388146,
|
|
"grad_norm": 0.17756476998329163,
|
|
"learning_rate": 1.1439414408111471e-07,
|
|
"loss": 0.0237,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.9613299602457536,
|
|
"grad_norm": 0.2853069007396698,
|
|
"learning_rate": 9.615219942222476e-08,
|
|
"loss": 0.0324,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.9649439826526924,
|
|
"grad_norm": 0.20107683539390564,
|
|
"learning_rate": 7.948677390326786e-08,
|
|
"loss": 0.029,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.9685580050596314,
|
|
"grad_norm": 0.32500964403152466,
|
|
"learning_rate": 6.440052036815081e-08,
|
|
"loss": 0.0307,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.9721720274665703,
|
|
"grad_norm": 0.37760159373283386,
|
|
"learning_rate": 5.0895840284257424e-08,
|
|
"loss": 0.0349,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.9757860498735093,
|
|
"grad_norm": 0.1795365810394287,
|
|
"learning_rate": 3.8974883360169966e-08,
|
|
"loss": 0.0269,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.9794000722804481,
|
|
"grad_norm": 0.1918817162513733,
|
|
"learning_rate": 2.86395472034795e-08,
|
|
"loss": 0.0322,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.9830140946873871,
|
|
"grad_norm": 0.16373227536678314,
|
|
"learning_rate": 1.989147701871641e-08,
|
|
"loss": 0.0317,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.986628117094326,
|
|
"grad_norm": 0.12552815675735474,
|
|
"learning_rate": 1.2732065345462118e-08,
|
|
"loss": 0.0223,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.9902421395012649,
|
|
"grad_norm": 0.16605937480926514,
|
|
"learning_rate": 7.162451836685291e-09,
|
|
"loss": 0.0283,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.9938561619082038,
|
|
"grad_norm": 0.18890981376171112,
|
|
"learning_rate": 3.183523077324724e-09,
|
|
"loss": 0.0281,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.9974701843151428,
|
|
"grad_norm": 0.14776916801929474,
|
|
"learning_rate": 7.959124431622389e-10,
|
|
"loss": 0.0287,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 1384,
|
|
"total_flos": 2.850613244928721e+18,
|
|
"train_loss": 0.048995194472157194,
|
|
"train_runtime": 23358.4539,
|
|
"train_samples_per_second": 3.789,
|
|
"train_steps_per_second": 0.059
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 1384,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.850613244928721e+18,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|