3749 lines
82 KiB
JSON
3749 lines
82 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 5265,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.005698005698005698,
|
|
"grad_norm": 1.248605852290593,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.7464,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.011396011396011397,
|
|
"grad_norm": 1.0452895054741067,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.7185,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.017094017094017096,
|
|
"grad_norm": 0.8153324886981814,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6852,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.022792022792022793,
|
|
"grad_norm": 1.0831283785058512,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6966,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.02849002849002849,
|
|
"grad_norm": 0.8504265623978658,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6617,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.03418803418803419,
|
|
"grad_norm": 0.9237045733670409,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6694,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.039886039886039885,
|
|
"grad_norm": 0.7642515560623869,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6675,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.045584045584045586,
|
|
"grad_norm": 0.627995094432306,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6698,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.05128205128205128,
|
|
"grad_norm": 0.5944277816882634,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6493,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.05698005698005698,
|
|
"grad_norm": 0.545138744190239,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6493,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.06267806267806268,
|
|
"grad_norm": 0.5758914966539062,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6593,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.06837606837606838,
|
|
"grad_norm": 0.6137211171560547,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6464,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.07407407407407407,
|
|
"grad_norm": 0.6226547108170967,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6618,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.07977207977207977,
|
|
"grad_norm": 0.603249856221309,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6589,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.08547008547008547,
|
|
"grad_norm": 0.5036390357219359,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6529,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.09116809116809117,
|
|
"grad_norm": 0.5977465496475399,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.655,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.09686609686609686,
|
|
"grad_norm": 0.5961195181863996,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6712,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.10256410256410256,
|
|
"grad_norm": 0.6066789044063328,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.653,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.10826210826210826,
|
|
"grad_norm": 0.5768094248224014,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6571,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.11396011396011396,
|
|
"grad_norm": 0.5562364789120665,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.655,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.11965811965811966,
|
|
"grad_norm": 0.5491574417916478,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6499,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.12535612535612536,
|
|
"grad_norm": 0.5975656942284756,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6521,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.13105413105413105,
|
|
"grad_norm": 0.5783672727047425,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6339,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.13675213675213677,
|
|
"grad_norm": 0.5949371595243242,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6359,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.14245014245014245,
|
|
"grad_norm": 0.6034430748838411,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.632,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.14814814814814814,
|
|
"grad_norm": 0.5821788725046587,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6308,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.15384615384615385,
|
|
"grad_norm": 0.5678471771976431,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6503,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.15954415954415954,
|
|
"grad_norm": 0.5602675467648068,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6599,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.16524216524216523,
|
|
"grad_norm": 0.5941388781564068,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6355,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.17094017094017094,
|
|
"grad_norm": 0.5858722107720983,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6508,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.17663817663817663,
|
|
"grad_norm": 0.6035463226302998,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6553,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.18233618233618235,
|
|
"grad_norm": 0.6342249591951397,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6534,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.18803418803418803,
|
|
"grad_norm": 0.5798733093393231,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6238,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.19373219373219372,
|
|
"grad_norm": 0.6315524208231319,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6287,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.19943019943019943,
|
|
"grad_norm": 0.6118349674877842,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6439,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.20512820512820512,
|
|
"grad_norm": 0.5613968317306219,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6422,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.21082621082621084,
|
|
"grad_norm": 0.650029101740181,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6352,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.21652421652421652,
|
|
"grad_norm": 0.6034494108497298,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6419,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 0.5686444051570657,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6403,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.22792022792022792,
|
|
"grad_norm": 0.5897904881774643,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6277,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.2336182336182336,
|
|
"grad_norm": 0.5969120088165002,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6343,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.23931623931623933,
|
|
"grad_norm": 0.6066999477029202,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6223,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.245014245014245,
|
|
"grad_norm": 0.6019250090914218,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6227,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.25071225071225073,
|
|
"grad_norm": 0.635752248265818,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6301,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.2564102564102564,
|
|
"grad_norm": 0.5334914544076793,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6277,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.2621082621082621,
|
|
"grad_norm": 0.5614109457622548,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6367,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.2678062678062678,
|
|
"grad_norm": 0.5716858172898615,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6433,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.27350427350427353,
|
|
"grad_norm": 0.569385420792123,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6358,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.2792022792022792,
|
|
"grad_norm": 0.5396490903906098,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6483,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.2849002849002849,
|
|
"grad_norm": 0.5712372988205724,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6247,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.2905982905982906,
|
|
"grad_norm": 0.5616938335269104,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6397,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.2962962962962963,
|
|
"grad_norm": 0.5586395315743943,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.637,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.301994301994302,
|
|
"grad_norm": 0.593890274267505,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6425,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.3076923076923077,
|
|
"grad_norm": 0.5691720448900264,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6395,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.31339031339031337,
|
|
"grad_norm": 0.5825957418827326,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6404,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.3190883190883191,
|
|
"grad_norm": 0.5712581943035149,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6363,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.3247863247863248,
|
|
"grad_norm": 0.5611359179186056,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6247,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.33048433048433046,
|
|
"grad_norm": 0.5263861318528564,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6397,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.33618233618233617,
|
|
"grad_norm": 0.6008086879223974,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6366,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.3418803418803419,
|
|
"grad_norm": 0.5359029223247334,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6279,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.3475783475783476,
|
|
"grad_norm": 0.5476117829162411,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6472,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.35327635327635326,
|
|
"grad_norm": 0.548345413171009,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6236,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.358974358974359,
|
|
"grad_norm": 0.549515203524155,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6399,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.3646723646723647,
|
|
"grad_norm": 0.6247502479073669,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.638,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.37037037037037035,
|
|
"grad_norm": 0.6367708386541642,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6303,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.37606837606837606,
|
|
"grad_norm": 0.6035098886365409,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.618,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.3817663817663818,
|
|
"grad_norm": 0.5843073759359196,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6433,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.38746438746438744,
|
|
"grad_norm": 0.6208173453592988,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6265,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.39316239316239315,
|
|
"grad_norm": 0.6005049567591871,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6303,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.39886039886039887,
|
|
"grad_norm": 0.569831828840819,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6456,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.4045584045584046,
|
|
"grad_norm": 0.582087320229329,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6388,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.41025641025641024,
|
|
"grad_norm": 0.5537164119876729,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6265,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.41595441595441596,
|
|
"grad_norm": 0.5993733531397847,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6431,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.42165242165242167,
|
|
"grad_norm": 0.5878911525467738,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6231,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.42735042735042733,
|
|
"grad_norm": 0.5703047756344629,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6221,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.43304843304843305,
|
|
"grad_norm": 0.577768195476661,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6208,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.43874643874643876,
|
|
"grad_norm": 0.5854764641336874,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6286,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 0.5387219205137932,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6295,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.45014245014245013,
|
|
"grad_norm": 0.5918701715838999,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6293,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.45584045584045585,
|
|
"grad_norm": 0.610827318398544,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6548,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.46153846153846156,
|
|
"grad_norm": 0.5581493937641772,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6357,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.4672364672364672,
|
|
"grad_norm": 0.5526185327228441,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6255,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.47293447293447294,
|
|
"grad_norm": 0.5604472711614089,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6318,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.47863247863247865,
|
|
"grad_norm": 0.5876335201995966,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6314,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.4843304843304843,
|
|
"grad_norm": 0.5674159078017988,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6438,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.49002849002849,
|
|
"grad_norm": 0.5707009756648699,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6186,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.49572649572649574,
|
|
"grad_norm": 0.6126275501973454,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6298,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.5014245014245015,
|
|
"grad_norm": 0.5973891881380156,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6482,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.5071225071225072,
|
|
"grad_norm": 0.5440973024632595,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6287,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.5128205128205128,
|
|
"grad_norm": 0.5719103793581991,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.63,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.5185185185185185,
|
|
"grad_norm": 0.5927304335774137,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6364,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.5242165242165242,
|
|
"grad_norm": 0.5468702165759647,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6297,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.5299145299145299,
|
|
"grad_norm": 0.5689062473394013,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6256,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.5356125356125356,
|
|
"grad_norm": 0.580087974342758,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6179,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.5413105413105413,
|
|
"grad_norm": 0.6278973426700435,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6349,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.5470085470085471,
|
|
"grad_norm": 0.5749182904288472,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6312,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.5527065527065527,
|
|
"grad_norm": 0.5755058045692314,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6464,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.5584045584045584,
|
|
"grad_norm": 0.564209988292775,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6204,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.5641025641025641,
|
|
"grad_norm": 0.6064650017065378,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6302,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.5698005698005698,
|
|
"grad_norm": 0.5981562518766129,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6278,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.5754985754985755,
|
|
"grad_norm": 0.5985428419859516,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6278,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.5811965811965812,
|
|
"grad_norm": 0.613528620026823,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6358,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.5868945868945868,
|
|
"grad_norm": 0.5785257508799594,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6367,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.5925925925925926,
|
|
"grad_norm": 0.6325574889479847,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6214,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.5982905982905983,
|
|
"grad_norm": 0.5798171618499341,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6318,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.603988603988604,
|
|
"grad_norm": 0.5917058378245685,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6239,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.6096866096866097,
|
|
"grad_norm": 0.6462363857504108,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6253,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.6153846153846154,
|
|
"grad_norm": 0.5964629820058396,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6301,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.6210826210826211,
|
|
"grad_norm": 0.5697422137314763,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6318,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.6267806267806267,
|
|
"grad_norm": 0.6246288555076868,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6498,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.6324786324786325,
|
|
"grad_norm": 0.5736841164436457,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6256,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.6381766381766382,
|
|
"grad_norm": 0.6073053058703407,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6435,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.6438746438746439,
|
|
"grad_norm": 0.5682386523303715,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6118,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.6495726495726496,
|
|
"grad_norm": 0.5789250463385112,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6438,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.6552706552706553,
|
|
"grad_norm": 0.5677226033216014,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6171,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.6609686609686609,
|
|
"grad_norm": 0.570510089133293,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6333,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 0.5534962109817962,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6311,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.6723646723646723,
|
|
"grad_norm": 0.6028706947112492,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6335,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.6780626780626781,
|
|
"grad_norm": 0.5435510233432874,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6266,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.6837606837606838,
|
|
"grad_norm": 0.5781607598620716,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6081,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.6894586894586895,
|
|
"grad_norm": 0.5387883641010746,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6312,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.6951566951566952,
|
|
"grad_norm": 0.6108820344681773,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6228,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.7008547008547008,
|
|
"grad_norm": 0.5547515213703605,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6184,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.7065527065527065,
|
|
"grad_norm": 0.5703753317669427,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6283,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.7122507122507122,
|
|
"grad_norm": 0.5718536532526751,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6331,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.717948717948718,
|
|
"grad_norm": 0.5559094696648192,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.624,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.7236467236467237,
|
|
"grad_norm": 0.55959365727665,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6253,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.7293447293447294,
|
|
"grad_norm": 0.6031181458566351,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6192,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.7350427350427351,
|
|
"grad_norm": 0.5994092568067709,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6192,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.7407407407407407,
|
|
"grad_norm": 0.6365099348102852,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6355,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.7464387464387464,
|
|
"grad_norm": 0.6054035693063426,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6187,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.7521367521367521,
|
|
"grad_norm": 0.5812878969763033,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6134,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.7578347578347578,
|
|
"grad_norm": 0.6221634431669929,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6317,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.7635327635327636,
|
|
"grad_norm": 0.5837064991966057,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6186,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.7692307692307693,
|
|
"grad_norm": 0.5519308410148134,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6282,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.7749287749287749,
|
|
"grad_norm": 0.5884579917268693,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6301,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.7806267806267806,
|
|
"grad_norm": 0.5863408060758529,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6332,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.7863247863247863,
|
|
"grad_norm": 0.606462849435967,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6444,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.792022792022792,
|
|
"grad_norm": 0.609745642222076,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6188,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.7977207977207977,
|
|
"grad_norm": 0.6278637624500826,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6438,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.8034188034188035,
|
|
"grad_norm": 0.5964004351905415,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6227,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.8091168091168092,
|
|
"grad_norm": 0.5695342658619863,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6357,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.8148148148148148,
|
|
"grad_norm": 0.5859213800511389,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6193,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.8205128205128205,
|
|
"grad_norm": 0.5752147052829165,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.627,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.8262108262108262,
|
|
"grad_norm": 0.6177624749983104,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.624,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.8319088319088319,
|
|
"grad_norm": 0.608719985454889,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6191,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.8376068376068376,
|
|
"grad_norm": 0.5667215459680056,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6157,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.8433048433048433,
|
|
"grad_norm": 0.5672924637566275,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.623,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.8490028490028491,
|
|
"grad_norm": 0.6493622667391157,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6359,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.8547008547008547,
|
|
"grad_norm": 0.5623923532248208,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6289,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.8603988603988604,
|
|
"grad_norm": 0.5978019810160363,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6286,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.8660968660968661,
|
|
"grad_norm": 0.5455769144299073,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6355,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.8717948717948718,
|
|
"grad_norm": 0.5694355383197235,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.646,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.8774928774928775,
|
|
"grad_norm": 0.5755078976127412,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6119,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.8831908831908832,
|
|
"grad_norm": 0.5678832987577136,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6253,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 0.6059681652536819,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6423,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.8945868945868946,
|
|
"grad_norm": 0.5683013444396426,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6423,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.9002849002849003,
|
|
"grad_norm": 0.5683186943846946,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6219,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.905982905982906,
|
|
"grad_norm": 0.5863192427228783,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.642,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.9116809116809117,
|
|
"grad_norm": 0.5595388030839678,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6327,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.9173789173789174,
|
|
"grad_norm": 0.5676065816242905,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6362,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.9230769230769231,
|
|
"grad_norm": 0.5762087240408192,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6226,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.9287749287749287,
|
|
"grad_norm": 0.6707590903760927,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6305,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.9344729344729344,
|
|
"grad_norm": 0.6103119559173308,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6319,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.9401709401709402,
|
|
"grad_norm": 0.5633994724981068,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6068,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.9458689458689459,
|
|
"grad_norm": 0.6187682445922644,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6131,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.9515669515669516,
|
|
"grad_norm": 0.5525323634268904,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.632,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.9572649572649573,
|
|
"grad_norm": 0.5564758529357481,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6159,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.9629629629629629,
|
|
"grad_norm": 0.6349335167662418,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6365,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.9686609686609686,
|
|
"grad_norm": 0.5650086755333475,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6281,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.9743589743589743,
|
|
"grad_norm": 0.5521116095578306,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6344,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.98005698005698,
|
|
"grad_norm": 0.6644841541531104,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6234,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.9857549857549858,
|
|
"grad_norm": 0.5223237563049048,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6194,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.9914529914529915,
|
|
"grad_norm": 0.583506791545265,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6042,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.9971509971509972,
|
|
"grad_norm": 0.5524358612737751,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6165,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_loss": 0.6207965016365051,
|
|
"eval_runtime": 445.7759,
|
|
"eval_samples_per_second": 26.522,
|
|
"eval_steps_per_second": 0.415,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 1.002849002849003,
|
|
"grad_norm": 0.6193051669442373,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5925,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 1.0085470085470085,
|
|
"grad_norm": 0.5538188542900693,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5831,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 1.0142450142450143,
|
|
"grad_norm": 0.5437973437283771,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5574,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 1.01994301994302,
|
|
"grad_norm": 0.5402398297961842,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5807,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 1.0256410256410255,
|
|
"grad_norm": 0.5457114785479574,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5786,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 1.0313390313390314,
|
|
"grad_norm": 0.5835431121788174,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5873,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 1.037037037037037,
|
|
"grad_norm": 0.549315178206307,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5637,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 1.0427350427350428,
|
|
"grad_norm": 0.551965760141286,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.574,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 1.0484330484330484,
|
|
"grad_norm": 0.5558874848824243,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5641,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 1.0541310541310542,
|
|
"grad_norm": 0.5817186570163869,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5778,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 1.0598290598290598,
|
|
"grad_norm": 0.4980497695823232,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5705,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 1.0655270655270654,
|
|
"grad_norm": 0.6034343866560189,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5705,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 1.0712250712250713,
|
|
"grad_norm": 0.5404707262066166,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5642,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 1.0769230769230769,
|
|
"grad_norm": 0.5172417418761445,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5817,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 1.0826210826210827,
|
|
"grad_norm": 0.5444976667004858,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5795,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 1.0883190883190883,
|
|
"grad_norm": 0.5808327796447509,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5838,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 1.0940170940170941,
|
|
"grad_norm": 0.5553080452734186,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5729,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 1.0997150997150997,
|
|
"grad_norm": 0.6252535797926512,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5888,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 1.1054131054131053,
|
|
"grad_norm": 0.5418701052068917,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5749,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 0.5427412867505934,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5802,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 1.1168091168091168,
|
|
"grad_norm": 0.5838842247306398,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5919,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 1.1225071225071226,
|
|
"grad_norm": 0.5659489766269445,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5679,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 1.1282051282051282,
|
|
"grad_norm": 0.5710950036482688,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.588,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 1.133903133903134,
|
|
"grad_norm": 0.5563097510452688,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5747,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 1.1396011396011396,
|
|
"grad_norm": 0.5413759858943353,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5679,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 1.1452991452991452,
|
|
"grad_norm": 0.5610725075898626,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5795,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 1.150997150997151,
|
|
"grad_norm": 0.5317980893898213,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.565,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 1.1566951566951567,
|
|
"grad_norm": 0.5402604242832053,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5671,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 1.1623931623931625,
|
|
"grad_norm": 0.5628406736489239,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5785,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 1.168091168091168,
|
|
"grad_norm": 0.5598060055556051,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5687,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 1.173789173789174,
|
|
"grad_norm": 0.5812067996328552,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5739,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 1.1794871794871795,
|
|
"grad_norm": 0.5804815720213962,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.58,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 1.1851851851851851,
|
|
"grad_norm": 0.6009435615613525,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5883,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 1.190883190883191,
|
|
"grad_norm": 0.54252794895387,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5777,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 1.1965811965811965,
|
|
"grad_norm": 0.5996787413433816,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5671,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 1.2022792022792024,
|
|
"grad_norm": 0.5536047256778152,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5745,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 1.207977207977208,
|
|
"grad_norm": 0.561711893430855,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5748,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 1.2136752136752136,
|
|
"grad_norm": 0.5283929440389717,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5915,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 1.2193732193732194,
|
|
"grad_norm": 0.5301857105389954,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5737,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 1.225071225071225,
|
|
"grad_norm": 0.5563444083260252,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5642,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 1.2307692307692308,
|
|
"grad_norm": 0.5430715224319318,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5895,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 1.2364672364672364,
|
|
"grad_norm": 0.5629400205999858,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5827,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 1.242165242165242,
|
|
"grad_norm": 0.5751709064271272,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5835,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 1.2478632478632479,
|
|
"grad_norm": 0.5741003758226062,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5865,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 1.2535612535612537,
|
|
"grad_norm": 0.5791349713821747,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5699,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 1.2592592592592593,
|
|
"grad_norm": 0.5538967837131181,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5888,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 1.264957264957265,
|
|
"grad_norm": 0.6047679094918335,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5852,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 1.2706552706552707,
|
|
"grad_norm": 0.5513200031887904,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.581,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 1.2763532763532763,
|
|
"grad_norm": 0.5416098244594392,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5608,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 1.282051282051282,
|
|
"grad_norm": 0.6042872751469346,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5753,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 1.2877492877492878,
|
|
"grad_norm": 0.5529496445289886,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5869,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 1.2934472934472934,
|
|
"grad_norm": 0.5061156686160359,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5784,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 1.2991452991452992,
|
|
"grad_norm": 0.5340704963602597,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5591,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 1.3048433048433048,
|
|
"grad_norm": 0.5138792740114064,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5687,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 1.3105413105413106,
|
|
"grad_norm": 0.5804911265669914,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5808,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 1.3162393162393162,
|
|
"grad_norm": 0.6117190706702494,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5867,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 1.3219373219373218,
|
|
"grad_norm": 0.5374452206535677,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5674,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 1.3276353276353277,
|
|
"grad_norm": 0.5510977367381295,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5781,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 0.5721844682107659,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5714,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 1.339031339031339,
|
|
"grad_norm": 0.6182161591629188,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5751,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 1.3447293447293447,
|
|
"grad_norm": 0.5662236658011203,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5647,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 1.3504273504273505,
|
|
"grad_norm": 0.5538407330840261,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.587,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 1.3561253561253561,
|
|
"grad_norm": 0.5707317602292841,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5922,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 1.3618233618233617,
|
|
"grad_norm": 0.5641874789639416,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5641,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 1.3675213675213675,
|
|
"grad_norm": 0.5484949550764525,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5751,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 1.3732193732193732,
|
|
"grad_norm": 0.5681990922872379,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5794,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 1.378917378917379,
|
|
"grad_norm": 0.5369480638512492,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5722,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 1.3846153846153846,
|
|
"grad_norm": 0.5401250464395447,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5786,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 1.3903133903133904,
|
|
"grad_norm": 0.5143005599419228,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5795,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 1.396011396011396,
|
|
"grad_norm": 0.571280225689413,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5836,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 1.4017094017094016,
|
|
"grad_norm": 0.5302500110416417,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5778,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 1.4074074074074074,
|
|
"grad_norm": 0.6184863756024197,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5932,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 1.413105413105413,
|
|
"grad_norm": 0.5500463967692277,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5763,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 1.4188034188034189,
|
|
"grad_norm": 0.6265908750634392,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5829,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 1.4245014245014245,
|
|
"grad_norm": 0.5888611093710416,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6017,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 1.4301994301994303,
|
|
"grad_norm": 0.5682311211543241,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5719,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 1.435897435897436,
|
|
"grad_norm": 0.572149006000227,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5712,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 1.4415954415954415,
|
|
"grad_norm": 0.5928100499577562,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5781,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 1.4472934472934473,
|
|
"grad_norm": 0.6074436695553413,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5681,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 1.452991452991453,
|
|
"grad_norm": 0.5499482375209497,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.578,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 1.4586894586894588,
|
|
"grad_norm": 0.6303664310772396,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5688,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 1.4643874643874644,
|
|
"grad_norm": 0.510009725228524,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5678,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 1.4700854700854702,
|
|
"grad_norm": 0.5748311852491685,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5714,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 1.4757834757834758,
|
|
"grad_norm": 0.6184990291175282,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5743,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 1.4814814814814814,
|
|
"grad_norm": 0.555767002073107,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5817,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 1.4871794871794872,
|
|
"grad_norm": 0.5903444647981344,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5793,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 1.4928774928774928,
|
|
"grad_norm": 0.5576429393071742,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5647,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 1.4985754985754987,
|
|
"grad_norm": 0.5520440692451319,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5716,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 1.5042735042735043,
|
|
"grad_norm": 0.5643022408516812,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5786,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 1.50997150997151,
|
|
"grad_norm": 0.6330193140871835,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5836,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 1.5156695156695157,
|
|
"grad_norm": 0.625250641713771,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5718,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 1.5213675213675213,
|
|
"grad_norm": 0.5418501880171682,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5814,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 1.5270655270655271,
|
|
"grad_norm": 0.6064856119796758,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5859,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 1.5327635327635327,
|
|
"grad_norm": 0.5672868138655305,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5542,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 1.5384615384615383,
|
|
"grad_norm": 0.5139650665849255,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5765,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 1.5441595441595442,
|
|
"grad_norm": 0.5940578181125139,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5881,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 1.54985754985755,
|
|
"grad_norm": 0.5756432474296378,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5651,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 1.5555555555555556,
|
|
"grad_norm": 0.545530119077096,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5764,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 1.5612535612535612,
|
|
"grad_norm": 0.5756357865222977,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5612,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 1.566951566951567,
|
|
"grad_norm": 0.5876180011106431,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5794,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 1.5726495726495726,
|
|
"grad_norm": 0.5147868956095404,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5707,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 1.5783475783475782,
|
|
"grad_norm": 0.5591733948940201,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5884,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 1.584045584045584,
|
|
"grad_norm": 0.5627964904713231,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6041,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 1.5897435897435899,
|
|
"grad_norm": 0.6297308754027342,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5923,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 1.5954415954415955,
|
|
"grad_norm": 0.5936598449915134,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5761,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 1.601139601139601,
|
|
"grad_norm": 0.5515458557481886,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5833,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 1.606837606837607,
|
|
"grad_norm": 0.5257725141562714,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5765,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 1.6125356125356125,
|
|
"grad_norm": 0.5730936759045179,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5649,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 1.618233618233618,
|
|
"grad_norm": 0.5354221551624224,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5776,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 1.623931623931624,
|
|
"grad_norm": 0.5824066939857122,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5769,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 1.6296296296296298,
|
|
"grad_norm": 0.5554767609605542,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5964,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 1.6353276353276354,
|
|
"grad_norm": 0.5526840997991586,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5727,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 1.641025641025641,
|
|
"grad_norm": 0.5465241482682912,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5686,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 1.6467236467236468,
|
|
"grad_norm": 0.6190071105580203,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5823,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 1.6524216524216524,
|
|
"grad_norm": 0.5356564377870386,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5755,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 1.658119658119658,
|
|
"grad_norm": 0.5979037263652105,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5772,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 1.6638176638176638,
|
|
"grad_norm": 0.5872422786253441,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5761,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 1.6695156695156697,
|
|
"grad_norm": 0.5667349984301929,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.565,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 1.6752136752136753,
|
|
"grad_norm": 0.5553088977731174,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5737,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 1.6809116809116809,
|
|
"grad_norm": 0.5615666295169086,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5793,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 1.6866096866096867,
|
|
"grad_norm": 0.553145275478625,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5742,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 1.6923076923076923,
|
|
"grad_norm": 0.5341696725770836,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5614,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 1.698005698005698,
|
|
"grad_norm": 0.5463223359456406,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5786,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 1.7037037037037037,
|
|
"grad_norm": 0.5777813141939787,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5841,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 1.7094017094017095,
|
|
"grad_norm": 0.5300785322610779,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5778,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 1.7150997150997151,
|
|
"grad_norm": 0.563500423103054,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5755,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 1.7207977207977208,
|
|
"grad_norm": 0.5634105059434202,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5643,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 1.7264957264957266,
|
|
"grad_norm": 0.5570811501478863,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.581,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 1.7321937321937322,
|
|
"grad_norm": 0.5739299531519032,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5748,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 1.7378917378917378,
|
|
"grad_norm": 0.5701909466155488,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5708,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 1.7435897435897436,
|
|
"grad_norm": 0.5308366598042483,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5815,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 1.7492877492877494,
|
|
"grad_norm": 0.5528144482172378,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5728,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 1.7549857549857548,
|
|
"grad_norm": 0.6087932311563443,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5707,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 1.7606837606837606,
|
|
"grad_norm": 0.5516607270748759,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5896,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 1.7663817663817665,
|
|
"grad_norm": 0.5843004310212261,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5767,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 1.772079772079772,
|
|
"grad_norm": 0.5852609740047551,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5845,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 1.7777777777777777,
|
|
"grad_norm": 0.552703665463295,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5703,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 1.7834757834757835,
|
|
"grad_norm": 0.6753220707242441,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5758,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 1.7891737891737893,
|
|
"grad_norm": 0.616525110015123,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5884,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 1.7948717948717947,
|
|
"grad_norm": 0.5654708131985335,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5758,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 1.8005698005698005,
|
|
"grad_norm": 0.5323851407450543,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5686,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 1.8062678062678064,
|
|
"grad_norm": 0.5799884631320908,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5733,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 1.811965811965812,
|
|
"grad_norm": 0.6251760605506265,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5766,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 1.8176638176638176,
|
|
"grad_norm": 0.5779038699801032,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5624,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 1.8233618233618234,
|
|
"grad_norm": 0.5088787245145318,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5596,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 1.8290598290598292,
|
|
"grad_norm": 0.4965401461152827,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5698,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 1.8347578347578346,
|
|
"grad_norm": 0.5353414236210642,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5683,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 1.8404558404558404,
|
|
"grad_norm": 0.5409699336409666,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.569,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 1.8461538461538463,
|
|
"grad_norm": 0.574913493745111,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5789,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 1.8518518518518519,
|
|
"grad_norm": 0.5521006517580501,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5715,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 1.8575498575498575,
|
|
"grad_norm": 0.5805482573176093,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5585,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 1.8632478632478633,
|
|
"grad_norm": 0.573426960061509,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.578,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 1.868945868945869,
|
|
"grad_norm": 0.5565980515165863,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5729,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 1.8746438746438745,
|
|
"grad_norm": 0.5586859824206695,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5761,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 1.8803418803418803,
|
|
"grad_norm": 0.5496999183024133,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5561,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 1.8860398860398861,
|
|
"grad_norm": 0.5997656668355231,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5859,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 1.8917378917378918,
|
|
"grad_norm": 0.6206812095844995,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5845,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 1.8974358974358974,
|
|
"grad_norm": 0.5963021360576573,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5839,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 1.9031339031339032,
|
|
"grad_norm": 0.5743434396116716,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5816,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 1.9088319088319088,
|
|
"grad_norm": 0.5858788059851108,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5716,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 1.9145299145299144,
|
|
"grad_norm": 0.5648325254524372,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5789,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 1.9202279202279202,
|
|
"grad_norm": 0.5444165984165006,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5775,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 1.925925925925926,
|
|
"grad_norm": 0.6059359662104461,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5802,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 1.9316239316239316,
|
|
"grad_norm": 0.5755161543958413,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5784,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 1.9373219373219372,
|
|
"grad_norm": 0.5368654626324941,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5742,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 1.943019943019943,
|
|
"grad_norm": 0.5582700416390507,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5676,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 1.9487179487179487,
|
|
"grad_norm": 0.5625374338203465,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5741,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 1.9544159544159543,
|
|
"grad_norm": 0.6326198974505973,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5874,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 1.96011396011396,
|
|
"grad_norm": 0.5826625931834889,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5875,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 1.965811965811966,
|
|
"grad_norm": 0.6063426488985952,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5802,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 1.9715099715099715,
|
|
"grad_norm": 0.5589982670093196,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5813,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 1.9772079772079771,
|
|
"grad_norm": 0.5197556180794218,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5763,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 1.982905982905983,
|
|
"grad_norm": 0.5840640270883682,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5701,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 1.9886039886039886,
|
|
"grad_norm": 0.5335458990414118,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5701,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 1.9943019943019942,
|
|
"grad_norm": 0.5707109550752067,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5753,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.5883827598003003,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.581,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_loss": 0.6180241703987122,
|
|
"eval_runtime": 447.562,
|
|
"eval_samples_per_second": 26.416,
|
|
"eval_steps_per_second": 0.413,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 2.005698005698006,
|
|
"grad_norm": 0.5776212119618224,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5254,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 2.011396011396011,
|
|
"grad_norm": 0.5810335608051052,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5094,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 2.017094017094017,
|
|
"grad_norm": 0.5432713405979779,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5032,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 2.022792022792023,
|
|
"grad_norm": 0.5416321025053085,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.515,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 2.0284900284900287,
|
|
"grad_norm": 0.5775523857428819,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.518,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 2.034188034188034,
|
|
"grad_norm": 0.5592263586107213,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5234,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 2.03988603988604,
|
|
"grad_norm": 0.5476768718918923,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5289,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 2.0455840455840457,
|
|
"grad_norm": 0.6102527285044014,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5208,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 2.051282051282051,
|
|
"grad_norm": 0.589225412655405,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5298,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 2.056980056980057,
|
|
"grad_norm": 0.57808060135501,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5208,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 2.0626780626780628,
|
|
"grad_norm": 0.5634395489010126,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5212,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 2.0683760683760686,
|
|
"grad_norm": 0.5526570622014573,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5297,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 2.074074074074074,
|
|
"grad_norm": 0.5810750660810072,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.525,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 2.07977207977208,
|
|
"grad_norm": 0.5614577275900066,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5259,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 2.0854700854700856,
|
|
"grad_norm": 0.5486462905219032,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5213,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 2.091168091168091,
|
|
"grad_norm": 0.5307563733817223,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.529,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 2.096866096866097,
|
|
"grad_norm": 0.5389945236629596,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5348,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 2.1025641025641026,
|
|
"grad_norm": 0.5527322408012718,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5116,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 2.1082621082621085,
|
|
"grad_norm": 0.5328079584501793,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5282,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 2.113960113960114,
|
|
"grad_norm": 0.5686915040528058,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5261,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 2.1196581196581197,
|
|
"grad_norm": 0.5501606190305495,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5365,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 2.1253561253561255,
|
|
"grad_norm": 0.5536761094008102,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5263,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 2.131054131054131,
|
|
"grad_norm": 0.5345031800564628,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5206,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 2.1367521367521367,
|
|
"grad_norm": 0.6046490261900991,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5275,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 2.1424501424501425,
|
|
"grad_norm": 0.5840211791187765,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5201,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 2.148148148148148,
|
|
"grad_norm": 0.5529533135143219,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5115,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 2.1538461538461537,
|
|
"grad_norm": 0.5680751070257097,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5294,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 2.1595441595441596,
|
|
"grad_norm": 0.5245141535052799,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5281,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 2.1652421652421654,
|
|
"grad_norm": 0.5648362949355089,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5147,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 2.1709401709401708,
|
|
"grad_norm": 0.5254847337067438,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5313,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 2.1766381766381766,
|
|
"grad_norm": 0.5976261665941772,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5198,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 2.1823361823361824,
|
|
"grad_norm": 0.5864445373756276,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5336,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 2.1880341880341883,
|
|
"grad_norm": 0.5537617774511332,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5239,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 2.1937321937321936,
|
|
"grad_norm": 0.5790262967504055,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5387,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 2.1994301994301995,
|
|
"grad_norm": 0.5448893578337308,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5158,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 2.2051282051282053,
|
|
"grad_norm": 0.5224956999156651,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5319,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 2.2108262108262107,
|
|
"grad_norm": 0.5452041541066649,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5283,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 2.2165242165242165,
|
|
"grad_norm": 0.5188463908276534,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5111,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 2.2222222222222223,
|
|
"grad_norm": 0.6153310194594807,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5358,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 2.2279202279202277,
|
|
"grad_norm": 0.5926494217956065,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5154,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 2.2336182336182335,
|
|
"grad_norm": 0.5109574356125176,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.518,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 2.2393162393162394,
|
|
"grad_norm": 0.5289253041831274,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5246,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 2.245014245014245,
|
|
"grad_norm": 0.5628951778576998,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.526,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 2.2507122507122506,
|
|
"grad_norm": 0.551449654946418,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5374,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 2.2564102564102564,
|
|
"grad_norm": 0.5466152136858086,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5169,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 2.262108262108262,
|
|
"grad_norm": 0.5146969054690042,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5281,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 2.267806267806268,
|
|
"grad_norm": 0.5293060782125808,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5191,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 2.2735042735042734,
|
|
"grad_norm": 0.5473420088344219,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.531,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 2.2792022792022792,
|
|
"grad_norm": 0.5512443710837232,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5256,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 2.284900284900285,
|
|
"grad_norm": 0.5442787627600018,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5222,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 2.2905982905982905,
|
|
"grad_norm": 0.5545916348777593,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.535,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 2.2962962962962963,
|
|
"grad_norm": 0.59632132003208,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5317,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 2.301994301994302,
|
|
"grad_norm": 0.5408157566248561,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5146,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 2.3076923076923075,
|
|
"grad_norm": 0.5820724583290839,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5349,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 2.3133903133903133,
|
|
"grad_norm": 0.5687662322666911,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5282,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 2.319088319088319,
|
|
"grad_norm": 0.573552994416881,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5336,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 2.324786324786325,
|
|
"grad_norm": 0.5677912645112424,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5214,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 2.3304843304843303,
|
|
"grad_norm": 0.5274433334199329,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5244,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 2.336182336182336,
|
|
"grad_norm": 0.5658209536678374,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5286,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 2.341880341880342,
|
|
"grad_norm": 0.5780434495697487,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5341,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 2.347578347578348,
|
|
"grad_norm": 0.5818657983745251,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5338,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 2.353276353276353,
|
|
"grad_norm": 0.5389779504746351,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5291,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 2.358974358974359,
|
|
"grad_norm": 0.5610403895418081,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5225,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 2.364672364672365,
|
|
"grad_norm": 0.5209098217965255,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5334,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 2.3703703703703702,
|
|
"grad_norm": 0.5744294920867676,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.5204,
|
|
      "step": 4160
    },
    {
      "epoch": 2.376068376068376,
      "grad_norm": 0.598425566675419,
      "learning_rate": 5e-06,
      "loss": 0.52,
      "step": 4170
    },
    {
      "epoch": 2.381766381766382,
      "grad_norm": 0.5493923391327106,
      "learning_rate": 5e-06,
      "loss": 0.5359,
      "step": 4180
    },
    {
      "epoch": 2.3874643874643873,
      "grad_norm": 0.5533392246170049,
      "learning_rate": 5e-06,
      "loss": 0.521,
      "step": 4190
    },
    {
      "epoch": 2.393162393162393,
      "grad_norm": 0.5731160307080695,
      "learning_rate": 5e-06,
      "loss": 0.5329,
      "step": 4200
    },
    {
      "epoch": 2.398860398860399,
      "grad_norm": 0.5775023991320096,
      "learning_rate": 5e-06,
      "loss": 0.5359,
      "step": 4210
    },
    {
      "epoch": 2.4045584045584047,
      "grad_norm": 0.5901628223866878,
      "learning_rate": 5e-06,
      "loss": 0.5493,
      "step": 4220
    },
    {
      "epoch": 2.41025641025641,
      "grad_norm": 0.5542817321146499,
      "learning_rate": 5e-06,
      "loss": 0.526,
      "step": 4230
    },
    {
      "epoch": 2.415954415954416,
      "grad_norm": 0.5524566146364747,
      "learning_rate": 5e-06,
      "loss": 0.5307,
      "step": 4240
    },
    {
      "epoch": 2.421652421652422,
      "grad_norm": 0.5244228024377005,
      "learning_rate": 5e-06,
      "loss": 0.5278,
      "step": 4250
    },
    {
      "epoch": 2.427350427350427,
      "grad_norm": 0.5786633243903677,
      "learning_rate": 5e-06,
      "loss": 0.5482,
      "step": 4260
    },
    {
      "epoch": 2.433048433048433,
      "grad_norm": 0.5858650466682291,
      "learning_rate": 5e-06,
      "loss": 0.5394,
      "step": 4270
    },
    {
      "epoch": 2.438746438746439,
      "grad_norm": 0.5858885917781449,
      "learning_rate": 5e-06,
      "loss": 0.5371,
      "step": 4280
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.5339546065735147,
      "learning_rate": 5e-06,
      "loss": 0.5373,
      "step": 4290
    },
    {
      "epoch": 2.45014245014245,
      "grad_norm": 0.5984049498497251,
      "learning_rate": 5e-06,
      "loss": 0.5474,
      "step": 4300
    },
    {
      "epoch": 2.455840455840456,
      "grad_norm": 0.5807043848022856,
      "learning_rate": 5e-06,
      "loss": 0.5309,
      "step": 4310
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.5709610370467612,
      "learning_rate": 5e-06,
      "loss": 0.5246,
      "step": 4320
    },
    {
      "epoch": 2.467236467236467,
      "grad_norm": 0.5499687224770995,
      "learning_rate": 5e-06,
      "loss": 0.531,
      "step": 4330
    },
    {
      "epoch": 2.472934472934473,
      "grad_norm": 0.5722356598944494,
      "learning_rate": 5e-06,
      "loss": 0.5286,
      "step": 4340
    },
    {
      "epoch": 2.4786324786324787,
      "grad_norm": 0.5486032250328287,
      "learning_rate": 5e-06,
      "loss": 0.5358,
      "step": 4350
    },
    {
      "epoch": 2.484330484330484,
      "grad_norm": 0.5142530295671646,
      "learning_rate": 5e-06,
      "loss": 0.5324,
      "step": 4360
    },
    {
      "epoch": 2.49002849002849,
      "grad_norm": 0.6364539965127788,
      "learning_rate": 5e-06,
      "loss": 0.5325,
      "step": 4370
    },
    {
      "epoch": 2.4957264957264957,
      "grad_norm": 0.5822908149062661,
      "learning_rate": 5e-06,
      "loss": 0.5378,
      "step": 4380
    },
    {
      "epoch": 2.5014245014245016,
      "grad_norm": 0.5660579125585127,
      "learning_rate": 5e-06,
      "loss": 0.539,
      "step": 4390
    },
    {
      "epoch": 2.5071225071225074,
      "grad_norm": 0.6015416980494055,
      "learning_rate": 5e-06,
      "loss": 0.543,
      "step": 4400
    },
    {
      "epoch": 2.5128205128205128,
      "grad_norm": 0.544050303995212,
      "learning_rate": 5e-06,
      "loss": 0.5186,
      "step": 4410
    },
    {
      "epoch": 2.5185185185185186,
      "grad_norm": 0.5489445408860626,
      "learning_rate": 5e-06,
      "loss": 0.5293,
      "step": 4420
    },
    {
      "epoch": 2.5242165242165244,
      "grad_norm": 0.5804195388596164,
      "learning_rate": 5e-06,
      "loss": 0.5368,
      "step": 4430
    },
    {
      "epoch": 2.52991452991453,
      "grad_norm": 0.5465444916928103,
      "learning_rate": 5e-06,
      "loss": 0.5395,
      "step": 4440
    },
    {
      "epoch": 2.5356125356125356,
      "grad_norm": 0.5679778769321939,
      "learning_rate": 5e-06,
      "loss": 0.5358,
      "step": 4450
    },
    {
      "epoch": 2.5413105413105415,
      "grad_norm": 0.5726465912316608,
      "learning_rate": 5e-06,
      "loss": 0.5253,
      "step": 4460
    },
    {
      "epoch": 2.547008547008547,
      "grad_norm": 0.5387152868301355,
      "learning_rate": 5e-06,
      "loss": 0.5268,
      "step": 4470
    },
    {
      "epoch": 2.5527065527065527,
      "grad_norm": 0.5559047427422275,
      "learning_rate": 5e-06,
      "loss": 0.5305,
      "step": 4480
    },
    {
      "epoch": 2.5584045584045585,
      "grad_norm": 0.5428769349897132,
      "learning_rate": 5e-06,
      "loss": 0.5209,
      "step": 4490
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 0.5407361307856526,
      "learning_rate": 5e-06,
      "loss": 0.5351,
      "step": 4500
    },
    {
      "epoch": 2.5698005698005697,
      "grad_norm": 0.5595203034409101,
      "learning_rate": 5e-06,
      "loss": 0.5312,
      "step": 4510
    },
    {
      "epoch": 2.5754985754985755,
      "grad_norm": 0.5752885902852435,
      "learning_rate": 5e-06,
      "loss": 0.5328,
      "step": 4520
    },
    {
      "epoch": 2.5811965811965814,
      "grad_norm": 0.5448007027240791,
      "learning_rate": 5e-06,
      "loss": 0.5295,
      "step": 4530
    },
    {
      "epoch": 2.5868945868945867,
      "grad_norm": 0.5494957146695392,
      "learning_rate": 5e-06,
      "loss": 0.5327,
      "step": 4540
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 0.5743882596085497,
      "learning_rate": 5e-06,
      "loss": 0.5295,
      "step": 4550
    },
    {
      "epoch": 2.5982905982905984,
      "grad_norm": 0.5481581540445639,
      "learning_rate": 5e-06,
      "loss": 0.5305,
      "step": 4560
    },
    {
      "epoch": 2.603988603988604,
      "grad_norm": 0.5834328837619958,
      "learning_rate": 5e-06,
      "loss": 0.5376,
      "step": 4570
    },
    {
      "epoch": 2.6096866096866096,
      "grad_norm": 0.5536117193354623,
      "learning_rate": 5e-06,
      "loss": 0.5341,
      "step": 4580
    },
    {
      "epoch": 2.6153846153846154,
      "grad_norm": 0.545383573085851,
      "learning_rate": 5e-06,
      "loss": 0.5233,
      "step": 4590
    },
    {
      "epoch": 2.6210826210826212,
      "grad_norm": 0.5204672857822074,
      "learning_rate": 5e-06,
      "loss": 0.5228,
      "step": 4600
    },
    {
      "epoch": 2.6267806267806266,
      "grad_norm": 0.5139161169258046,
      "learning_rate": 5e-06,
      "loss": 0.5328,
      "step": 4610
    },
    {
      "epoch": 2.6324786324786325,
      "grad_norm": 0.6028262892085369,
      "learning_rate": 5e-06,
      "loss": 0.5332,
      "step": 4620
    },
    {
      "epoch": 2.6381766381766383,
      "grad_norm": 0.5559617493532288,
      "learning_rate": 5e-06,
      "loss": 0.5232,
      "step": 4630
    },
    {
      "epoch": 2.6438746438746437,
      "grad_norm": 0.5435028142224008,
      "learning_rate": 5e-06,
      "loss": 0.5415,
      "step": 4640
    },
    {
      "epoch": 2.6495726495726495,
      "grad_norm": 0.604873621040108,
      "learning_rate": 5e-06,
      "loss": 0.5303,
      "step": 4650
    },
    {
      "epoch": 2.6552706552706553,
      "grad_norm": 0.5697598259817795,
      "learning_rate": 5e-06,
      "loss": 0.5373,
      "step": 4660
    },
    {
      "epoch": 2.6609686609686607,
      "grad_norm": 0.5511420813626869,
      "learning_rate": 5e-06,
      "loss": 0.5434,
      "step": 4670
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.5394695044798543,
      "learning_rate": 5e-06,
      "loss": 0.5238,
      "step": 4680
    },
    {
      "epoch": 2.6723646723646723,
      "grad_norm": 0.5330927779679859,
      "learning_rate": 5e-06,
      "loss": 0.5245,
      "step": 4690
    },
    {
      "epoch": 2.678062678062678,
      "grad_norm": 0.5736642108384618,
      "learning_rate": 5e-06,
      "loss": 0.5305,
      "step": 4700
    },
    {
      "epoch": 2.683760683760684,
      "grad_norm": 0.6197551413034075,
      "learning_rate": 5e-06,
      "loss": 0.5408,
      "step": 4710
    },
    {
      "epoch": 2.6894586894586894,
      "grad_norm": 0.5791951412024915,
      "learning_rate": 5e-06,
      "loss": 0.5306,
      "step": 4720
    },
    {
      "epoch": 2.695156695156695,
      "grad_norm": 0.5631274263966353,
      "learning_rate": 5e-06,
      "loss": 0.5228,
      "step": 4730
    },
    {
      "epoch": 2.700854700854701,
      "grad_norm": 0.5605562545980405,
      "learning_rate": 5e-06,
      "loss": 0.5266,
      "step": 4740
    },
    {
      "epoch": 2.7065527065527064,
      "grad_norm": 0.5321827825743034,
      "learning_rate": 5e-06,
      "loss": 0.5338,
      "step": 4750
    },
    {
      "epoch": 2.7122507122507122,
      "grad_norm": 0.5644337354264807,
      "learning_rate": 5e-06,
      "loss": 0.5376,
      "step": 4760
    },
    {
      "epoch": 2.717948717948718,
      "grad_norm": 0.5719762386839188,
      "learning_rate": 5e-06,
      "loss": 0.5298,
      "step": 4770
    },
    {
      "epoch": 2.7236467236467234,
      "grad_norm": 0.5870644859394915,
      "learning_rate": 5e-06,
      "loss": 0.5408,
      "step": 4780
    },
    {
      "epoch": 2.7293447293447293,
      "grad_norm": 0.5161759448699083,
      "learning_rate": 5e-06,
      "loss": 0.5385,
      "step": 4790
    },
    {
      "epoch": 2.735042735042735,
      "grad_norm": 0.5685973523356822,
      "learning_rate": 5e-06,
      "loss": 0.5295,
      "step": 4800
    },
    {
      "epoch": 2.7407407407407405,
      "grad_norm": 0.5955189388351516,
      "learning_rate": 5e-06,
      "loss": 0.5357,
      "step": 4810
    },
    {
      "epoch": 2.7464387464387463,
      "grad_norm": 0.5927243869455354,
      "learning_rate": 5e-06,
      "loss": 0.5397,
      "step": 4820
    },
    {
      "epoch": 2.752136752136752,
      "grad_norm": 0.5892611711545225,
      "learning_rate": 5e-06,
      "loss": 0.5427,
      "step": 4830
    },
    {
      "epoch": 2.757834757834758,
      "grad_norm": 0.5320349130904972,
      "learning_rate": 5e-06,
      "loss": 0.5322,
      "step": 4840
    },
    {
      "epoch": 2.763532763532764,
      "grad_norm": 0.5215197760783008,
      "learning_rate": 5e-06,
      "loss": 0.5196,
      "step": 4850
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 0.5967746123628929,
      "learning_rate": 5e-06,
      "loss": 0.5349,
      "step": 4860
    },
    {
      "epoch": 2.774928774928775,
      "grad_norm": 0.5303530858087516,
      "learning_rate": 5e-06,
      "loss": 0.5288,
      "step": 4870
    },
    {
      "epoch": 2.780626780626781,
      "grad_norm": 0.5294938033518871,
      "learning_rate": 5e-06,
      "loss": 0.5254,
      "step": 4880
    },
    {
      "epoch": 2.786324786324786,
      "grad_norm": 0.6085557642175643,
      "learning_rate": 5e-06,
      "loss": 0.5362,
      "step": 4890
    },
    {
      "epoch": 2.792022792022792,
      "grad_norm": 0.5563638209032657,
      "learning_rate": 5e-06,
      "loss": 0.5243,
      "step": 4900
    },
    {
      "epoch": 2.797720797720798,
      "grad_norm": 0.5426535982775469,
      "learning_rate": 5e-06,
      "loss": 0.5302,
      "step": 4910
    },
    {
      "epoch": 2.8034188034188032,
      "grad_norm": 0.5606166025371381,
      "learning_rate": 5e-06,
      "loss": 0.5195,
      "step": 4920
    },
    {
      "epoch": 2.809116809116809,
      "grad_norm": 0.5600176374437925,
      "learning_rate": 5e-06,
      "loss": 0.5339,
      "step": 4930
    },
    {
      "epoch": 2.814814814814815,
      "grad_norm": 0.5735203266072578,
      "learning_rate": 5e-06,
      "loss": 0.5463,
      "step": 4940
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 0.5647627688966846,
      "learning_rate": 5e-06,
      "loss": 0.5342,
      "step": 4950
    },
    {
      "epoch": 2.826210826210826,
      "grad_norm": 0.6181052514822875,
      "learning_rate": 5e-06,
      "loss": 0.5502,
      "step": 4960
    },
    {
      "epoch": 2.831908831908832,
      "grad_norm": 0.5487589815910356,
      "learning_rate": 5e-06,
      "loss": 0.5332,
      "step": 4970
    },
    {
      "epoch": 2.8376068376068377,
      "grad_norm": 0.5519304274572768,
      "learning_rate": 5e-06,
      "loss": 0.5342,
      "step": 4980
    },
    {
      "epoch": 2.8433048433048436,
      "grad_norm": 0.5710774380484754,
      "learning_rate": 5e-06,
      "loss": 0.5468,
      "step": 4990
    },
    {
      "epoch": 2.849002849002849,
      "grad_norm": 0.5396253108717034,
      "learning_rate": 5e-06,
      "loss": 0.536,
      "step": 5000
    },
    {
      "epoch": 2.8547008547008548,
      "grad_norm": 0.5481621751659937,
      "learning_rate": 5e-06,
      "loss": 0.5361,
      "step": 5010
    },
    {
      "epoch": 2.8603988603988606,
      "grad_norm": 0.5815133705980525,
      "learning_rate": 5e-06,
      "loss": 0.5321,
      "step": 5020
    },
    {
      "epoch": 2.866096866096866,
      "grad_norm": 0.5408578285161547,
      "learning_rate": 5e-06,
      "loss": 0.5367,
      "step": 5030
    },
    {
      "epoch": 2.871794871794872,
      "grad_norm": 0.5405279703831611,
      "learning_rate": 5e-06,
      "loss": 0.532,
      "step": 5040
    },
    {
      "epoch": 2.8774928774928776,
      "grad_norm": 0.5566749988018465,
      "learning_rate": 5e-06,
      "loss": 0.5374,
      "step": 5050
    },
    {
      "epoch": 2.883190883190883,
      "grad_norm": 0.5806758592562609,
      "learning_rate": 5e-06,
      "loss": 0.5425,
      "step": 5060
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.5820389002607862,
      "learning_rate": 5e-06,
      "loss": 0.5329,
      "step": 5070
    },
    {
      "epoch": 2.8945868945868947,
      "grad_norm": 0.5375342327708015,
      "learning_rate": 5e-06,
      "loss": 0.5404,
      "step": 5080
    },
    {
      "epoch": 2.9002849002849,
      "grad_norm": 0.5641024886824925,
      "learning_rate": 5e-06,
      "loss": 0.5206,
      "step": 5090
    },
    {
      "epoch": 2.905982905982906,
      "grad_norm": 0.5595993132067282,
      "learning_rate": 5e-06,
      "loss": 0.5456,
      "step": 5100
    },
    {
      "epoch": 2.9116809116809117,
      "grad_norm": 0.5729657514196825,
      "learning_rate": 5e-06,
      "loss": 0.5303,
      "step": 5110
    },
    {
      "epoch": 2.9173789173789175,
      "grad_norm": 0.5592258039441389,
      "learning_rate": 5e-06,
      "loss": 0.5137,
      "step": 5120
    },
    {
      "epoch": 2.9230769230769234,
      "grad_norm": 0.5482964902412071,
      "learning_rate": 5e-06,
      "loss": 0.5379,
      "step": 5130
    },
    {
      "epoch": 2.9287749287749287,
      "grad_norm": 0.5336701580376303,
      "learning_rate": 5e-06,
      "loss": 0.516,
      "step": 5140
    },
    {
      "epoch": 2.9344729344729346,
      "grad_norm": 0.573991652444628,
      "learning_rate": 5e-06,
      "loss": 0.5197,
      "step": 5150
    },
    {
      "epoch": 2.9401709401709404,
      "grad_norm": 0.5656512132917955,
      "learning_rate": 5e-06,
      "loss": 0.5299,
      "step": 5160
    },
    {
      "epoch": 2.9458689458689458,
      "grad_norm": 0.5637897139695605,
      "learning_rate": 5e-06,
      "loss": 0.5318,
      "step": 5170
    },
    {
      "epoch": 2.9515669515669516,
      "grad_norm": 0.5805647906397857,
      "learning_rate": 5e-06,
      "loss": 0.5348,
      "step": 5180
    },
    {
      "epoch": 2.9572649572649574,
      "grad_norm": 0.5629404743153653,
      "learning_rate": 5e-06,
      "loss": 0.5239,
      "step": 5190
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.5482910577257104,
      "learning_rate": 5e-06,
      "loss": 0.5248,
      "step": 5200
    },
    {
      "epoch": 2.9686609686609686,
      "grad_norm": 0.5428900145420302,
      "learning_rate": 5e-06,
      "loss": 0.5286,
      "step": 5210
    },
    {
      "epoch": 2.9743589743589745,
      "grad_norm": 0.5426923796436356,
      "learning_rate": 5e-06,
      "loss": 0.5439,
      "step": 5220
    },
    {
      "epoch": 2.98005698005698,
      "grad_norm": 0.5421746187267816,
      "learning_rate": 5e-06,
      "loss": 0.5394,
      "step": 5230
    },
    {
      "epoch": 2.9857549857549857,
      "grad_norm": 0.5703778871540313,
      "learning_rate": 5e-06,
      "loss": 0.5359,
      "step": 5240
    },
    {
      "epoch": 2.9914529914529915,
      "grad_norm": 0.5488503690583575,
      "learning_rate": 5e-06,
      "loss": 0.5253,
      "step": 5250
    },
    {
      "epoch": 2.9971509971509973,
      "grad_norm": 0.6053670974984723,
      "learning_rate": 5e-06,
      "loss": 0.5255,
      "step": 5260
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6272810697555542,
      "eval_runtime": 446.6687,
      "eval_samples_per_second": 26.469,
      "eval_steps_per_second": 0.414,
      "step": 5265
    },
    {
      "epoch": 3.0,
      "step": 5265,
      "total_flos": 2759937528692736.0,
      "train_loss": 0.5805289232719545,
      "train_runtime": 71924.3439,
      "train_samples_per_second": 9.369,
      "train_steps_per_second": 0.073
    }
  ],
  "logging_steps": 10,
  "max_steps": 5265,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2759937528692736.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}