Files
oh-dcft-v3.1-gpt-4o-mini-qwen/trainer_state.json
ModelHub XC 285056620b 初始化项目,由ModelHub XC社区提供模型
Model: mlfoundations-dev/oh-dcft-v3.1-gpt-4o-mini-qwen
Source: Original Platform
2026-04-26 21:01:02 +08:00

3749 lines
82 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 5265,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005698005698005698,
"grad_norm": 1.248605852290593,
"learning_rate": 5e-06,
"loss": 0.7464,
"step": 10
},
{
"epoch": 0.011396011396011397,
"grad_norm": 1.0452895054741067,
"learning_rate": 5e-06,
"loss": 0.7185,
"step": 20
},
{
"epoch": 0.017094017094017096,
"grad_norm": 0.8153324886981814,
"learning_rate": 5e-06,
"loss": 0.6852,
"step": 30
},
{
"epoch": 0.022792022792022793,
"grad_norm": 1.0831283785058512,
"learning_rate": 5e-06,
"loss": 0.6966,
"step": 40
},
{
"epoch": 0.02849002849002849,
"grad_norm": 0.8504265623978658,
"learning_rate": 5e-06,
"loss": 0.6617,
"step": 50
},
{
"epoch": 0.03418803418803419,
"grad_norm": 0.9237045733670409,
"learning_rate": 5e-06,
"loss": 0.6694,
"step": 60
},
{
"epoch": 0.039886039886039885,
"grad_norm": 0.7642515560623869,
"learning_rate": 5e-06,
"loss": 0.6675,
"step": 70
},
{
"epoch": 0.045584045584045586,
"grad_norm": 0.627995094432306,
"learning_rate": 5e-06,
"loss": 0.6698,
"step": 80
},
{
"epoch": 0.05128205128205128,
"grad_norm": 0.5944277816882634,
"learning_rate": 5e-06,
"loss": 0.6493,
"step": 90
},
{
"epoch": 0.05698005698005698,
"grad_norm": 0.545138744190239,
"learning_rate": 5e-06,
"loss": 0.6493,
"step": 100
},
{
"epoch": 0.06267806267806268,
"grad_norm": 0.5758914966539062,
"learning_rate": 5e-06,
"loss": 0.6593,
"step": 110
},
{
"epoch": 0.06837606837606838,
"grad_norm": 0.6137211171560547,
"learning_rate": 5e-06,
"loss": 0.6464,
"step": 120
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.6226547108170967,
"learning_rate": 5e-06,
"loss": 0.6618,
"step": 130
},
{
"epoch": 0.07977207977207977,
"grad_norm": 0.603249856221309,
"learning_rate": 5e-06,
"loss": 0.6589,
"step": 140
},
{
"epoch": 0.08547008547008547,
"grad_norm": 0.5036390357219359,
"learning_rate": 5e-06,
"loss": 0.6529,
"step": 150
},
{
"epoch": 0.09116809116809117,
"grad_norm": 0.5977465496475399,
"learning_rate": 5e-06,
"loss": 0.655,
"step": 160
},
{
"epoch": 0.09686609686609686,
"grad_norm": 0.5961195181863996,
"learning_rate": 5e-06,
"loss": 0.6712,
"step": 170
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.6066789044063328,
"learning_rate": 5e-06,
"loss": 0.653,
"step": 180
},
{
"epoch": 0.10826210826210826,
"grad_norm": 0.5768094248224014,
"learning_rate": 5e-06,
"loss": 0.6571,
"step": 190
},
{
"epoch": 0.11396011396011396,
"grad_norm": 0.5562364789120665,
"learning_rate": 5e-06,
"loss": 0.655,
"step": 200
},
{
"epoch": 0.11965811965811966,
"grad_norm": 0.5491574417916478,
"learning_rate": 5e-06,
"loss": 0.6499,
"step": 210
},
{
"epoch": 0.12535612535612536,
"grad_norm": 0.5975656942284756,
"learning_rate": 5e-06,
"loss": 0.6521,
"step": 220
},
{
"epoch": 0.13105413105413105,
"grad_norm": 0.5783672727047425,
"learning_rate": 5e-06,
"loss": 0.6339,
"step": 230
},
{
"epoch": 0.13675213675213677,
"grad_norm": 0.5949371595243242,
"learning_rate": 5e-06,
"loss": 0.6359,
"step": 240
},
{
"epoch": 0.14245014245014245,
"grad_norm": 0.6034430748838411,
"learning_rate": 5e-06,
"loss": 0.632,
"step": 250
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.5821788725046587,
"learning_rate": 5e-06,
"loss": 0.6308,
"step": 260
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.5678471771976431,
"learning_rate": 5e-06,
"loss": 0.6503,
"step": 270
},
{
"epoch": 0.15954415954415954,
"grad_norm": 0.5602675467648068,
"learning_rate": 5e-06,
"loss": 0.6599,
"step": 280
},
{
"epoch": 0.16524216524216523,
"grad_norm": 0.5941388781564068,
"learning_rate": 5e-06,
"loss": 0.6355,
"step": 290
},
{
"epoch": 0.17094017094017094,
"grad_norm": 0.5858722107720983,
"learning_rate": 5e-06,
"loss": 0.6508,
"step": 300
},
{
"epoch": 0.17663817663817663,
"grad_norm": 0.6035463226302998,
"learning_rate": 5e-06,
"loss": 0.6553,
"step": 310
},
{
"epoch": 0.18233618233618235,
"grad_norm": 0.6342249591951397,
"learning_rate": 5e-06,
"loss": 0.6534,
"step": 320
},
{
"epoch": 0.18803418803418803,
"grad_norm": 0.5798733093393231,
"learning_rate": 5e-06,
"loss": 0.6238,
"step": 330
},
{
"epoch": 0.19373219373219372,
"grad_norm": 0.6315524208231319,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 340
},
{
"epoch": 0.19943019943019943,
"grad_norm": 0.6118349674877842,
"learning_rate": 5e-06,
"loss": 0.6439,
"step": 350
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.5613968317306219,
"learning_rate": 5e-06,
"loss": 0.6422,
"step": 360
},
{
"epoch": 0.21082621082621084,
"grad_norm": 0.650029101740181,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 370
},
{
"epoch": 0.21652421652421652,
"grad_norm": 0.6034494108497298,
"learning_rate": 5e-06,
"loss": 0.6419,
"step": 380
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.5686444051570657,
"learning_rate": 5e-06,
"loss": 0.6403,
"step": 390
},
{
"epoch": 0.22792022792022792,
"grad_norm": 0.5897904881774643,
"learning_rate": 5e-06,
"loss": 0.6277,
"step": 400
},
{
"epoch": 0.2336182336182336,
"grad_norm": 0.5969120088165002,
"learning_rate": 5e-06,
"loss": 0.6343,
"step": 410
},
{
"epoch": 0.23931623931623933,
"grad_norm": 0.6066999477029202,
"learning_rate": 5e-06,
"loss": 0.6223,
"step": 420
},
{
"epoch": 0.245014245014245,
"grad_norm": 0.6019250090914218,
"learning_rate": 5e-06,
"loss": 0.6227,
"step": 430
},
{
"epoch": 0.25071225071225073,
"grad_norm": 0.635752248265818,
"learning_rate": 5e-06,
"loss": 0.6301,
"step": 440
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.5334914544076793,
"learning_rate": 5e-06,
"loss": 0.6277,
"step": 450
},
{
"epoch": 0.2621082621082621,
"grad_norm": 0.5614109457622548,
"learning_rate": 5e-06,
"loss": 0.6367,
"step": 460
},
{
"epoch": 0.2678062678062678,
"grad_norm": 0.5716858172898615,
"learning_rate": 5e-06,
"loss": 0.6433,
"step": 470
},
{
"epoch": 0.27350427350427353,
"grad_norm": 0.569385420792123,
"learning_rate": 5e-06,
"loss": 0.6358,
"step": 480
},
{
"epoch": 0.2792022792022792,
"grad_norm": 0.5396490903906098,
"learning_rate": 5e-06,
"loss": 0.6483,
"step": 490
},
{
"epoch": 0.2849002849002849,
"grad_norm": 0.5712372988205724,
"learning_rate": 5e-06,
"loss": 0.6247,
"step": 500
},
{
"epoch": 0.2905982905982906,
"grad_norm": 0.5616938335269104,
"learning_rate": 5e-06,
"loss": 0.6397,
"step": 510
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.5586395315743943,
"learning_rate": 5e-06,
"loss": 0.637,
"step": 520
},
{
"epoch": 0.301994301994302,
"grad_norm": 0.593890274267505,
"learning_rate": 5e-06,
"loss": 0.6425,
"step": 530
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.5691720448900264,
"learning_rate": 5e-06,
"loss": 0.6395,
"step": 540
},
{
"epoch": 0.31339031339031337,
"grad_norm": 0.5825957418827326,
"learning_rate": 5e-06,
"loss": 0.6404,
"step": 550
},
{
"epoch": 0.3190883190883191,
"grad_norm": 0.5712581943035149,
"learning_rate": 5e-06,
"loss": 0.6363,
"step": 560
},
{
"epoch": 0.3247863247863248,
"grad_norm": 0.5611359179186056,
"learning_rate": 5e-06,
"loss": 0.6247,
"step": 570
},
{
"epoch": 0.33048433048433046,
"grad_norm": 0.5263861318528564,
"learning_rate": 5e-06,
"loss": 0.6397,
"step": 580
},
{
"epoch": 0.33618233618233617,
"grad_norm": 0.6008086879223974,
"learning_rate": 5e-06,
"loss": 0.6366,
"step": 590
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.5359029223247334,
"learning_rate": 5e-06,
"loss": 0.6279,
"step": 600
},
{
"epoch": 0.3475783475783476,
"grad_norm": 0.5476117829162411,
"learning_rate": 5e-06,
"loss": 0.6472,
"step": 610
},
{
"epoch": 0.35327635327635326,
"grad_norm": 0.548345413171009,
"learning_rate": 5e-06,
"loss": 0.6236,
"step": 620
},
{
"epoch": 0.358974358974359,
"grad_norm": 0.549515203524155,
"learning_rate": 5e-06,
"loss": 0.6399,
"step": 630
},
{
"epoch": 0.3646723646723647,
"grad_norm": 0.6247502479073669,
"learning_rate": 5e-06,
"loss": 0.638,
"step": 640
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.6367708386541642,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 650
},
{
"epoch": 0.37606837606837606,
"grad_norm": 0.6035098886365409,
"learning_rate": 5e-06,
"loss": 0.618,
"step": 660
},
{
"epoch": 0.3817663817663818,
"grad_norm": 0.5843073759359196,
"learning_rate": 5e-06,
"loss": 0.6433,
"step": 670
},
{
"epoch": 0.38746438746438744,
"grad_norm": 0.6208173453592988,
"learning_rate": 5e-06,
"loss": 0.6265,
"step": 680
},
{
"epoch": 0.39316239316239315,
"grad_norm": 0.6005049567591871,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 690
},
{
"epoch": 0.39886039886039887,
"grad_norm": 0.569831828840819,
"learning_rate": 5e-06,
"loss": 0.6456,
"step": 700
},
{
"epoch": 0.4045584045584046,
"grad_norm": 0.582087320229329,
"learning_rate": 5e-06,
"loss": 0.6388,
"step": 710
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.5537164119876729,
"learning_rate": 5e-06,
"loss": 0.6265,
"step": 720
},
{
"epoch": 0.41595441595441596,
"grad_norm": 0.5993733531397847,
"learning_rate": 5e-06,
"loss": 0.6431,
"step": 730
},
{
"epoch": 0.42165242165242167,
"grad_norm": 0.5878911525467738,
"learning_rate": 5e-06,
"loss": 0.6231,
"step": 740
},
{
"epoch": 0.42735042735042733,
"grad_norm": 0.5703047756344629,
"learning_rate": 5e-06,
"loss": 0.6221,
"step": 750
},
{
"epoch": 0.43304843304843305,
"grad_norm": 0.577768195476661,
"learning_rate": 5e-06,
"loss": 0.6208,
"step": 760
},
{
"epoch": 0.43874643874643876,
"grad_norm": 0.5854764641336874,
"learning_rate": 5e-06,
"loss": 0.6286,
"step": 770
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.5387219205137932,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 780
},
{
"epoch": 0.45014245014245013,
"grad_norm": 0.5918701715838999,
"learning_rate": 5e-06,
"loss": 0.6293,
"step": 790
},
{
"epoch": 0.45584045584045585,
"grad_norm": 0.610827318398544,
"learning_rate": 5e-06,
"loss": 0.6548,
"step": 800
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.5581493937641772,
"learning_rate": 5e-06,
"loss": 0.6357,
"step": 810
},
{
"epoch": 0.4672364672364672,
"grad_norm": 0.5526185327228441,
"learning_rate": 5e-06,
"loss": 0.6255,
"step": 820
},
{
"epoch": 0.47293447293447294,
"grad_norm": 0.5604472711614089,
"learning_rate": 5e-06,
"loss": 0.6318,
"step": 830
},
{
"epoch": 0.47863247863247865,
"grad_norm": 0.5876335201995966,
"learning_rate": 5e-06,
"loss": 0.6314,
"step": 840
},
{
"epoch": 0.4843304843304843,
"grad_norm": 0.5674159078017988,
"learning_rate": 5e-06,
"loss": 0.6438,
"step": 850
},
{
"epoch": 0.49002849002849,
"grad_norm": 0.5707009756648699,
"learning_rate": 5e-06,
"loss": 0.6186,
"step": 860
},
{
"epoch": 0.49572649572649574,
"grad_norm": 0.6126275501973454,
"learning_rate": 5e-06,
"loss": 0.6298,
"step": 870
},
{
"epoch": 0.5014245014245015,
"grad_norm": 0.5973891881380156,
"learning_rate": 5e-06,
"loss": 0.6482,
"step": 880
},
{
"epoch": 0.5071225071225072,
"grad_norm": 0.5440973024632595,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 890
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.5719103793581991,
"learning_rate": 5e-06,
"loss": 0.63,
"step": 900
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.5927304335774137,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 910
},
{
"epoch": 0.5242165242165242,
"grad_norm": 0.5468702165759647,
"learning_rate": 5e-06,
"loss": 0.6297,
"step": 920
},
{
"epoch": 0.5299145299145299,
"grad_norm": 0.5689062473394013,
"learning_rate": 5e-06,
"loss": 0.6256,
"step": 930
},
{
"epoch": 0.5356125356125356,
"grad_norm": 0.580087974342758,
"learning_rate": 5e-06,
"loss": 0.6179,
"step": 940
},
{
"epoch": 0.5413105413105413,
"grad_norm": 0.6278973426700435,
"learning_rate": 5e-06,
"loss": 0.6349,
"step": 950
},
{
"epoch": 0.5470085470085471,
"grad_norm": 0.5749182904288472,
"learning_rate": 5e-06,
"loss": 0.6312,
"step": 960
},
{
"epoch": 0.5527065527065527,
"grad_norm": 0.5755058045692314,
"learning_rate": 5e-06,
"loss": 0.6464,
"step": 970
},
{
"epoch": 0.5584045584045584,
"grad_norm": 0.564209988292775,
"learning_rate": 5e-06,
"loss": 0.6204,
"step": 980
},
{
"epoch": 0.5641025641025641,
"grad_norm": 0.6064650017065378,
"learning_rate": 5e-06,
"loss": 0.6302,
"step": 990
},
{
"epoch": 0.5698005698005698,
"grad_norm": 0.5981562518766129,
"learning_rate": 5e-06,
"loss": 0.6278,
"step": 1000
},
{
"epoch": 0.5754985754985755,
"grad_norm": 0.5985428419859516,
"learning_rate": 5e-06,
"loss": 0.6278,
"step": 1010
},
{
"epoch": 0.5811965811965812,
"grad_norm": 0.613528620026823,
"learning_rate": 5e-06,
"loss": 0.6358,
"step": 1020
},
{
"epoch": 0.5868945868945868,
"grad_norm": 0.5785257508799594,
"learning_rate": 5e-06,
"loss": 0.6367,
"step": 1030
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.6325574889479847,
"learning_rate": 5e-06,
"loss": 0.6214,
"step": 1040
},
{
"epoch": 0.5982905982905983,
"grad_norm": 0.5798171618499341,
"learning_rate": 5e-06,
"loss": 0.6318,
"step": 1050
},
{
"epoch": 0.603988603988604,
"grad_norm": 0.5917058378245685,
"learning_rate": 5e-06,
"loss": 0.6239,
"step": 1060
},
{
"epoch": 0.6096866096866097,
"grad_norm": 0.6462363857504108,
"learning_rate": 5e-06,
"loss": 0.6253,
"step": 1070
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.5964629820058396,
"learning_rate": 5e-06,
"loss": 0.6301,
"step": 1080
},
{
"epoch": 0.6210826210826211,
"grad_norm": 0.5697422137314763,
"learning_rate": 5e-06,
"loss": 0.6318,
"step": 1090
},
{
"epoch": 0.6267806267806267,
"grad_norm": 0.6246288555076868,
"learning_rate": 5e-06,
"loss": 0.6498,
"step": 1100
},
{
"epoch": 0.6324786324786325,
"grad_norm": 0.5736841164436457,
"learning_rate": 5e-06,
"loss": 0.6256,
"step": 1110
},
{
"epoch": 0.6381766381766382,
"grad_norm": 0.6073053058703407,
"learning_rate": 5e-06,
"loss": 0.6435,
"step": 1120
},
{
"epoch": 0.6438746438746439,
"grad_norm": 0.5682386523303715,
"learning_rate": 5e-06,
"loss": 0.6118,
"step": 1130
},
{
"epoch": 0.6495726495726496,
"grad_norm": 0.5789250463385112,
"learning_rate": 5e-06,
"loss": 0.6438,
"step": 1140
},
{
"epoch": 0.6552706552706553,
"grad_norm": 0.5677226033216014,
"learning_rate": 5e-06,
"loss": 0.6171,
"step": 1150
},
{
"epoch": 0.6609686609686609,
"grad_norm": 0.570510089133293,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 1160
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.5534962109817962,
"learning_rate": 5e-06,
"loss": 0.6311,
"step": 1170
},
{
"epoch": 0.6723646723646723,
"grad_norm": 0.6028706947112492,
"learning_rate": 5e-06,
"loss": 0.6335,
"step": 1180
},
{
"epoch": 0.6780626780626781,
"grad_norm": 0.5435510233432874,
"learning_rate": 5e-06,
"loss": 0.6266,
"step": 1190
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.5781607598620716,
"learning_rate": 5e-06,
"loss": 0.6081,
"step": 1200
},
{
"epoch": 0.6894586894586895,
"grad_norm": 0.5387883641010746,
"learning_rate": 5e-06,
"loss": 0.6312,
"step": 1210
},
{
"epoch": 0.6951566951566952,
"grad_norm": 0.6108820344681773,
"learning_rate": 5e-06,
"loss": 0.6228,
"step": 1220
},
{
"epoch": 0.7008547008547008,
"grad_norm": 0.5547515213703605,
"learning_rate": 5e-06,
"loss": 0.6184,
"step": 1230
},
{
"epoch": 0.7065527065527065,
"grad_norm": 0.5703753317669427,
"learning_rate": 5e-06,
"loss": 0.6283,
"step": 1240
},
{
"epoch": 0.7122507122507122,
"grad_norm": 0.5718536532526751,
"learning_rate": 5e-06,
"loss": 0.6331,
"step": 1250
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.5559094696648192,
"learning_rate": 5e-06,
"loss": 0.624,
"step": 1260
},
{
"epoch": 0.7236467236467237,
"grad_norm": 0.55959365727665,
"learning_rate": 5e-06,
"loss": 0.6253,
"step": 1270
},
{
"epoch": 0.7293447293447294,
"grad_norm": 0.6031181458566351,
"learning_rate": 5e-06,
"loss": 0.6192,
"step": 1280
},
{
"epoch": 0.7350427350427351,
"grad_norm": 0.5994092568067709,
"learning_rate": 5e-06,
"loss": 0.6192,
"step": 1290
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.6365099348102852,
"learning_rate": 5e-06,
"loss": 0.6355,
"step": 1300
},
{
"epoch": 0.7464387464387464,
"grad_norm": 0.6054035693063426,
"learning_rate": 5e-06,
"loss": 0.6187,
"step": 1310
},
{
"epoch": 0.7521367521367521,
"grad_norm": 0.5812878969763033,
"learning_rate": 5e-06,
"loss": 0.6134,
"step": 1320
},
{
"epoch": 0.7578347578347578,
"grad_norm": 0.6221634431669929,
"learning_rate": 5e-06,
"loss": 0.6317,
"step": 1330
},
{
"epoch": 0.7635327635327636,
"grad_norm": 0.5837064991966057,
"learning_rate": 5e-06,
"loss": 0.6186,
"step": 1340
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.5519308410148134,
"learning_rate": 5e-06,
"loss": 0.6282,
"step": 1350
},
{
"epoch": 0.7749287749287749,
"grad_norm": 0.5884579917268693,
"learning_rate": 5e-06,
"loss": 0.6301,
"step": 1360
},
{
"epoch": 0.7806267806267806,
"grad_norm": 0.5863408060758529,
"learning_rate": 5e-06,
"loss": 0.6332,
"step": 1370
},
{
"epoch": 0.7863247863247863,
"grad_norm": 0.606462849435967,
"learning_rate": 5e-06,
"loss": 0.6444,
"step": 1380
},
{
"epoch": 0.792022792022792,
"grad_norm": 0.609745642222076,
"learning_rate": 5e-06,
"loss": 0.6188,
"step": 1390
},
{
"epoch": 0.7977207977207977,
"grad_norm": 0.6278637624500826,
"learning_rate": 5e-06,
"loss": 0.6438,
"step": 1400
},
{
"epoch": 0.8034188034188035,
"grad_norm": 0.5964004351905415,
"learning_rate": 5e-06,
"loss": 0.6227,
"step": 1410
},
{
"epoch": 0.8091168091168092,
"grad_norm": 0.5695342658619863,
"learning_rate": 5e-06,
"loss": 0.6357,
"step": 1420
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.5859213800511389,
"learning_rate": 5e-06,
"loss": 0.6193,
"step": 1430
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.5752147052829165,
"learning_rate": 5e-06,
"loss": 0.627,
"step": 1440
},
{
"epoch": 0.8262108262108262,
"grad_norm": 0.6177624749983104,
"learning_rate": 5e-06,
"loss": 0.624,
"step": 1450
},
{
"epoch": 0.8319088319088319,
"grad_norm": 0.608719985454889,
"learning_rate": 5e-06,
"loss": 0.6191,
"step": 1460
},
{
"epoch": 0.8376068376068376,
"grad_norm": 0.5667215459680056,
"learning_rate": 5e-06,
"loss": 0.6157,
"step": 1470
},
{
"epoch": 0.8433048433048433,
"grad_norm": 0.5672924637566275,
"learning_rate": 5e-06,
"loss": 0.623,
"step": 1480
},
{
"epoch": 0.8490028490028491,
"grad_norm": 0.6493622667391157,
"learning_rate": 5e-06,
"loss": 0.6359,
"step": 1490
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.5623923532248208,
"learning_rate": 5e-06,
"loss": 0.6289,
"step": 1500
},
{
"epoch": 0.8603988603988604,
"grad_norm": 0.5978019810160363,
"learning_rate": 5e-06,
"loss": 0.6286,
"step": 1510
},
{
"epoch": 0.8660968660968661,
"grad_norm": 0.5455769144299073,
"learning_rate": 5e-06,
"loss": 0.6355,
"step": 1520
},
{
"epoch": 0.8717948717948718,
"grad_norm": 0.5694355383197235,
"learning_rate": 5e-06,
"loss": 0.646,
"step": 1530
},
{
"epoch": 0.8774928774928775,
"grad_norm": 0.5755078976127412,
"learning_rate": 5e-06,
"loss": 0.6119,
"step": 1540
},
{
"epoch": 0.8831908831908832,
"grad_norm": 0.5678832987577136,
"learning_rate": 5e-06,
"loss": 0.6253,
"step": 1550
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.6059681652536819,
"learning_rate": 5e-06,
"loss": 0.6423,
"step": 1560
},
{
"epoch": 0.8945868945868946,
"grad_norm": 0.5683013444396426,
"learning_rate": 5e-06,
"loss": 0.6423,
"step": 1570
},
{
"epoch": 0.9002849002849003,
"grad_norm": 0.5683186943846946,
"learning_rate": 5e-06,
"loss": 0.6219,
"step": 1580
},
{
"epoch": 0.905982905982906,
"grad_norm": 0.5863192427228783,
"learning_rate": 5e-06,
"loss": 0.642,
"step": 1590
},
{
"epoch": 0.9116809116809117,
"grad_norm": 0.5595388030839678,
"learning_rate": 5e-06,
"loss": 0.6327,
"step": 1600
},
{
"epoch": 0.9173789173789174,
"grad_norm": 0.5676065816242905,
"learning_rate": 5e-06,
"loss": 0.6362,
"step": 1610
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.5762087240408192,
"learning_rate": 5e-06,
"loss": 0.6226,
"step": 1620
},
{
"epoch": 0.9287749287749287,
"grad_norm": 0.6707590903760927,
"learning_rate": 5e-06,
"loss": 0.6305,
"step": 1630
},
{
"epoch": 0.9344729344729344,
"grad_norm": 0.6103119559173308,
"learning_rate": 5e-06,
"loss": 0.6319,
"step": 1640
},
{
"epoch": 0.9401709401709402,
"grad_norm": 0.5633994724981068,
"learning_rate": 5e-06,
"loss": 0.6068,
"step": 1650
},
{
"epoch": 0.9458689458689459,
"grad_norm": 0.6187682445922644,
"learning_rate": 5e-06,
"loss": 0.6131,
"step": 1660
},
{
"epoch": 0.9515669515669516,
"grad_norm": 0.5525323634268904,
"learning_rate": 5e-06,
"loss": 0.632,
"step": 1670
},
{
"epoch": 0.9572649572649573,
"grad_norm": 0.5564758529357481,
"learning_rate": 5e-06,
"loss": 0.6159,
"step": 1680
},
{
"epoch": 0.9629629629629629,
"grad_norm": 0.6349335167662418,
"learning_rate": 5e-06,
"loss": 0.6365,
"step": 1690
},
{
"epoch": 0.9686609686609686,
"grad_norm": 0.5650086755333475,
"learning_rate": 5e-06,
"loss": 0.6281,
"step": 1700
},
{
"epoch": 0.9743589743589743,
"grad_norm": 0.5521116095578306,
"learning_rate": 5e-06,
"loss": 0.6344,
"step": 1710
},
{
"epoch": 0.98005698005698,
"grad_norm": 0.6644841541531104,
"learning_rate": 5e-06,
"loss": 0.6234,
"step": 1720
},
{
"epoch": 0.9857549857549858,
"grad_norm": 0.5223237563049048,
"learning_rate": 5e-06,
"loss": 0.6194,
"step": 1730
},
{
"epoch": 0.9914529914529915,
"grad_norm": 0.583506791545265,
"learning_rate": 5e-06,
"loss": 0.6042,
"step": 1740
},
{
"epoch": 0.9971509971509972,
"grad_norm": 0.5524358612737751,
"learning_rate": 5e-06,
"loss": 0.6165,
"step": 1750
},
{
"epoch": 1.0,
"eval_loss": 0.6207965016365051,
"eval_runtime": 445.7759,
"eval_samples_per_second": 26.522,
"eval_steps_per_second": 0.415,
"step": 1755
},
{
"epoch": 1.002849002849003,
"grad_norm": 0.6193051669442373,
"learning_rate": 5e-06,
"loss": 0.5925,
"step": 1760
},
{
"epoch": 1.0085470085470085,
"grad_norm": 0.5538188542900693,
"learning_rate": 5e-06,
"loss": 0.5831,
"step": 1770
},
{
"epoch": 1.0142450142450143,
"grad_norm": 0.5437973437283771,
"learning_rate": 5e-06,
"loss": 0.5574,
"step": 1780
},
{
"epoch": 1.01994301994302,
"grad_norm": 0.5402398297961842,
"learning_rate": 5e-06,
"loss": 0.5807,
"step": 1790
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.5457114785479574,
"learning_rate": 5e-06,
"loss": 0.5786,
"step": 1800
},
{
"epoch": 1.0313390313390314,
"grad_norm": 0.5835431121788174,
"learning_rate": 5e-06,
"loss": 0.5873,
"step": 1810
},
{
"epoch": 1.037037037037037,
"grad_norm": 0.549315178206307,
"learning_rate": 5e-06,
"loss": 0.5637,
"step": 1820
},
{
"epoch": 1.0427350427350428,
"grad_norm": 0.551965760141286,
"learning_rate": 5e-06,
"loss": 0.574,
"step": 1830
},
{
"epoch": 1.0484330484330484,
"grad_norm": 0.5558874848824243,
"learning_rate": 5e-06,
"loss": 0.5641,
"step": 1840
},
{
"epoch": 1.0541310541310542,
"grad_norm": 0.5817186570163869,
"learning_rate": 5e-06,
"loss": 0.5778,
"step": 1850
},
{
"epoch": 1.0598290598290598,
"grad_norm": 0.4980497695823232,
"learning_rate": 5e-06,
"loss": 0.5705,
"step": 1860
},
{
"epoch": 1.0655270655270654,
"grad_norm": 0.6034343866560189,
"learning_rate": 5e-06,
"loss": 0.5705,
"step": 1870
},
{
"epoch": 1.0712250712250713,
"grad_norm": 0.5404707262066166,
"learning_rate": 5e-06,
"loss": 0.5642,
"step": 1880
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.5172417418761445,
"learning_rate": 5e-06,
"loss": 0.5817,
"step": 1890
},
{
"epoch": 1.0826210826210827,
"grad_norm": 0.5444976667004858,
"learning_rate": 5e-06,
"loss": 0.5795,
"step": 1900
},
{
"epoch": 1.0883190883190883,
"grad_norm": 0.5808327796447509,
"learning_rate": 5e-06,
"loss": 0.5838,
"step": 1910
},
{
"epoch": 1.0940170940170941,
"grad_norm": 0.5553080452734186,
"learning_rate": 5e-06,
"loss": 0.5729,
"step": 1920
},
{
"epoch": 1.0997150997150997,
"grad_norm": 0.6252535797926512,
"learning_rate": 5e-06,
"loss": 0.5888,
"step": 1930
},
{
"epoch": 1.1054131054131053,
"grad_norm": 0.5418701052068917,
"learning_rate": 5e-06,
"loss": 0.5749,
"step": 1940
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.5427412867505934,
"learning_rate": 5e-06,
"loss": 0.5802,
"step": 1950
},
{
"epoch": 1.1168091168091168,
"grad_norm": 0.5838842247306398,
"learning_rate": 5e-06,
"loss": 0.5919,
"step": 1960
},
{
"epoch": 1.1225071225071226,
"grad_norm": 0.5659489766269445,
"learning_rate": 5e-06,
"loss": 0.5679,
"step": 1970
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.5710950036482688,
"learning_rate": 5e-06,
"loss": 0.588,
"step": 1980
},
{
"epoch": 1.133903133903134,
"grad_norm": 0.5563097510452688,
"learning_rate": 5e-06,
"loss": 0.5747,
"step": 1990
},
{
"epoch": 1.1396011396011396,
"grad_norm": 0.5413759858943353,
"learning_rate": 5e-06,
"loss": 0.5679,
"step": 2000
},
{
"epoch": 1.1452991452991452,
"grad_norm": 0.5610725075898626,
"learning_rate": 5e-06,
"loss": 0.5795,
"step": 2010
},
{
"epoch": 1.150997150997151,
"grad_norm": 0.5317980893898213,
"learning_rate": 5e-06,
"loss": 0.565,
"step": 2020
},
{
"epoch": 1.1566951566951567,
"grad_norm": 0.5402604242832053,
"learning_rate": 5e-06,
"loss": 0.5671,
"step": 2030
},
{
"epoch": 1.1623931623931625,
"grad_norm": 0.5628406736489239,
"learning_rate": 5e-06,
"loss": 0.5785,
"step": 2040
},
{
"epoch": 1.168091168091168,
"grad_norm": 0.5598060055556051,
"learning_rate": 5e-06,
"loss": 0.5687,
"step": 2050
},
{
"epoch": 1.173789173789174,
"grad_norm": 0.5812067996328552,
"learning_rate": 5e-06,
"loss": 0.5739,
"step": 2060
},
{
"epoch": 1.1794871794871795,
"grad_norm": 0.5804815720213962,
"learning_rate": 5e-06,
"loss": 0.58,
"step": 2070
},
{
"epoch": 1.1851851851851851,
"grad_norm": 0.6009435615613525,
"learning_rate": 5e-06,
"loss": 0.5883,
"step": 2080
},
{
"epoch": 1.190883190883191,
"grad_norm": 0.54252794895387,
"learning_rate": 5e-06,
"loss": 0.5777,
"step": 2090
},
{
"epoch": 1.1965811965811965,
"grad_norm": 0.5996787413433816,
"learning_rate": 5e-06,
"loss": 0.5671,
"step": 2100
},
{
"epoch": 1.2022792022792024,
"grad_norm": 0.5536047256778152,
"learning_rate": 5e-06,
"loss": 0.5745,
"step": 2110
},
{
"epoch": 1.207977207977208,
"grad_norm": 0.561711893430855,
"learning_rate": 5e-06,
"loss": 0.5748,
"step": 2120
},
{
"epoch": 1.2136752136752136,
"grad_norm": 0.5283929440389717,
"learning_rate": 5e-06,
"loss": 0.5915,
"step": 2130
},
{
"epoch": 1.2193732193732194,
"grad_norm": 0.5301857105389954,
"learning_rate": 5e-06,
"loss": 0.5737,
"step": 2140
},
{
"epoch": 1.225071225071225,
"grad_norm": 0.5563444083260252,
"learning_rate": 5e-06,
"loss": 0.5642,
"step": 2150
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.5430715224319318,
"learning_rate": 5e-06,
"loss": 0.5895,
"step": 2160
},
{
"epoch": 1.2364672364672364,
"grad_norm": 0.5629400205999858,
"learning_rate": 5e-06,
"loss": 0.5827,
"step": 2170
},
{
"epoch": 1.242165242165242,
"grad_norm": 0.5751709064271272,
"learning_rate": 5e-06,
"loss": 0.5835,
"step": 2180
},
{
"epoch": 1.2478632478632479,
"grad_norm": 0.5741003758226062,
"learning_rate": 5e-06,
"loss": 0.5865,
"step": 2190
},
{
"epoch": 1.2535612535612537,
"grad_norm": 0.5791349713821747,
"learning_rate": 5e-06,
"loss": 0.5699,
"step": 2200
},
{
"epoch": 1.2592592592592593,
"grad_norm": 0.5538967837131181,
"learning_rate": 5e-06,
"loss": 0.5888,
"step": 2210
},
{
"epoch": 1.264957264957265,
"grad_norm": 0.6047679094918335,
"learning_rate": 5e-06,
"loss": 0.5852,
"step": 2220
},
{
"epoch": 1.2706552706552707,
"grad_norm": 0.5513200031887904,
"learning_rate": 5e-06,
"loss": 0.581,
"step": 2230
},
{
"epoch": 1.2763532763532763,
"grad_norm": 0.5416098244594392,
"learning_rate": 5e-06,
"loss": 0.5608,
"step": 2240
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.6042872751469346,
"learning_rate": 5e-06,
"loss": 0.5753,
"step": 2250
},
{
"epoch": 1.2877492877492878,
"grad_norm": 0.5529496445289886,
"learning_rate": 5e-06,
"loss": 0.5869,
"step": 2260
},
{
"epoch": 1.2934472934472934,
"grad_norm": 0.5061156686160359,
"learning_rate": 5e-06,
"loss": 0.5784,
"step": 2270
},
{
"epoch": 1.2991452991452992,
"grad_norm": 0.5340704963602597,
"learning_rate": 5e-06,
"loss": 0.5591,
"step": 2280
},
{
"epoch": 1.3048433048433048,
"grad_norm": 0.5138792740114064,
"learning_rate": 5e-06,
"loss": 0.5687,
"step": 2290
},
{
"epoch": 1.3105413105413106,
"grad_norm": 0.5804911265669914,
"learning_rate": 5e-06,
"loss": 0.5808,
"step": 2300
},
{
"epoch": 1.3162393162393162,
"grad_norm": 0.6117190706702494,
"learning_rate": 5e-06,
"loss": 0.5867,
"step": 2310
},
{
"epoch": 1.3219373219373218,
"grad_norm": 0.5374452206535677,
"learning_rate": 5e-06,
"loss": 0.5674,
"step": 2320
},
{
"epoch": 1.3276353276353277,
"grad_norm": 0.5510977367381295,
"learning_rate": 5e-06,
"loss": 0.5781,
"step": 2330
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.5721844682107659,
"learning_rate": 5e-06,
"loss": 0.5714,
"step": 2340
},
{
"epoch": 1.339031339031339,
"grad_norm": 0.6182161591629188,
"learning_rate": 5e-06,
"loss": 0.5751,
"step": 2350
},
{
"epoch": 1.3447293447293447,
"grad_norm": 0.5662236658011203,
"learning_rate": 5e-06,
"loss": 0.5647,
"step": 2360
},
{
"epoch": 1.3504273504273505,
"grad_norm": 0.5538407330840261,
"learning_rate": 5e-06,
"loss": 0.587,
"step": 2370
},
{
"epoch": 1.3561253561253561,
"grad_norm": 0.5707317602292841,
"learning_rate": 5e-06,
"loss": 0.5922,
"step": 2380
},
{
"epoch": 1.3618233618233617,
"grad_norm": 0.5641874789639416,
"learning_rate": 5e-06,
"loss": 0.5641,
"step": 2390
},
{
"epoch": 1.3675213675213675,
"grad_norm": 0.5484949550764525,
"learning_rate": 5e-06,
"loss": 0.5751,
"step": 2400
},
{
"epoch": 1.3732193732193732,
"grad_norm": 0.5681990922872379,
"learning_rate": 5e-06,
"loss": 0.5794,
"step": 2410
},
{
"epoch": 1.378917378917379,
"grad_norm": 0.5369480638512492,
"learning_rate": 5e-06,
"loss": 0.5722,
"step": 2420
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.5401250464395447,
"learning_rate": 5e-06,
"loss": 0.5786,
"step": 2430
},
{
"epoch": 1.3903133903133904,
"grad_norm": 0.5143005599419228,
"learning_rate": 5e-06,
"loss": 0.5795,
"step": 2440
},
{
"epoch": 1.396011396011396,
"grad_norm": 0.571280225689413,
"learning_rate": 5e-06,
"loss": 0.5836,
"step": 2450
},
{
"epoch": 1.4017094017094016,
"grad_norm": 0.5302500110416417,
"learning_rate": 5e-06,
"loss": 0.5778,
"step": 2460
},
{
"epoch": 1.4074074074074074,
"grad_norm": 0.6184863756024197,
"learning_rate": 5e-06,
"loss": 0.5932,
"step": 2470
},
{
"epoch": 1.413105413105413,
"grad_norm": 0.5500463967692277,
"learning_rate": 5e-06,
"loss": 0.5763,
"step": 2480
},
{
"epoch": 1.4188034188034189,
"grad_norm": 0.6265908750634392,
"learning_rate": 5e-06,
"loss": 0.5829,
"step": 2490
},
{
"epoch": 1.4245014245014245,
"grad_norm": 0.5888611093710416,
"learning_rate": 5e-06,
"loss": 0.6017,
"step": 2500
},
{
"epoch": 1.4301994301994303,
"grad_norm": 0.5682311211543241,
"learning_rate": 5e-06,
"loss": 0.5719,
"step": 2510
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.572149006000227,
"learning_rate": 5e-06,
"loss": 0.5712,
"step": 2520
},
{
"epoch": 1.4415954415954415,
"grad_norm": 0.5928100499577562,
"learning_rate": 5e-06,
"loss": 0.5781,
"step": 2530
},
{
"epoch": 1.4472934472934473,
"grad_norm": 0.6074436695553413,
"learning_rate": 5e-06,
"loss": 0.5681,
"step": 2540
},
{
"epoch": 1.452991452991453,
"grad_norm": 0.5499482375209497,
"learning_rate": 5e-06,
"loss": 0.578,
"step": 2550
},
{
"epoch": 1.4586894586894588,
"grad_norm": 0.6303664310772396,
"learning_rate": 5e-06,
"loss": 0.5688,
"step": 2560
},
{
"epoch": 1.4643874643874644,
"grad_norm": 0.510009725228524,
"learning_rate": 5e-06,
"loss": 0.5678,
"step": 2570
},
{
"epoch": 1.4700854700854702,
"grad_norm": 0.5748311852491685,
"learning_rate": 5e-06,
"loss": 0.5714,
"step": 2580
},
{
"epoch": 1.4757834757834758,
"grad_norm": 0.6184990291175282,
"learning_rate": 5e-06,
"loss": 0.5743,
"step": 2590
},
{
"epoch": 1.4814814814814814,
"grad_norm": 0.555767002073107,
"learning_rate": 5e-06,
"loss": 0.5817,
"step": 2600
},
{
"epoch": 1.4871794871794872,
"grad_norm": 0.5903444647981344,
"learning_rate": 5e-06,
"loss": 0.5793,
"step": 2610
},
{
"epoch": 1.4928774928774928,
"grad_norm": 0.5576429393071742,
"learning_rate": 5e-06,
"loss": 0.5647,
"step": 2620
},
{
"epoch": 1.4985754985754987,
"grad_norm": 0.5520440692451319,
"learning_rate": 5e-06,
"loss": 0.5716,
"step": 2630
},
{
"epoch": 1.5042735042735043,
"grad_norm": 0.5643022408516812,
"learning_rate": 5e-06,
"loss": 0.5786,
"step": 2640
},
{
"epoch": 1.50997150997151,
"grad_norm": 0.6330193140871835,
"learning_rate": 5e-06,
"loss": 0.5836,
"step": 2650
},
{
"epoch": 1.5156695156695157,
"grad_norm": 0.625250641713771,
"learning_rate": 5e-06,
"loss": 0.5718,
"step": 2660
},
{
"epoch": 1.5213675213675213,
"grad_norm": 0.5418501880171682,
"learning_rate": 5e-06,
"loss": 0.5814,
"step": 2670
},
{
"epoch": 1.5270655270655271,
"grad_norm": 0.6064856119796758,
"learning_rate": 5e-06,
"loss": 0.5859,
"step": 2680
},
{
"epoch": 1.5327635327635327,
"grad_norm": 0.5672868138655305,
"learning_rate": 5e-06,
"loss": 0.5542,
"step": 2690
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.5139650665849255,
"learning_rate": 5e-06,
"loss": 0.5765,
"step": 2700
},
{
"epoch": 1.5441595441595442,
"grad_norm": 0.5940578181125139,
"learning_rate": 5e-06,
"loss": 0.5881,
"step": 2710
},
{
"epoch": 1.54985754985755,
"grad_norm": 0.5756432474296378,
"learning_rate": 5e-06,
"loss": 0.5651,
"step": 2720
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.545530119077096,
"learning_rate": 5e-06,
"loss": 0.5764,
"step": 2730
},
{
"epoch": 1.5612535612535612,
"grad_norm": 0.5756357865222977,
"learning_rate": 5e-06,
"loss": 0.5612,
"step": 2740
},
{
"epoch": 1.566951566951567,
"grad_norm": 0.5876180011106431,
"learning_rate": 5e-06,
"loss": 0.5794,
"step": 2750
},
{
"epoch": 1.5726495726495726,
"grad_norm": 0.5147868956095404,
"learning_rate": 5e-06,
"loss": 0.5707,
"step": 2760
},
{
"epoch": 1.5783475783475782,
"grad_norm": 0.5591733948940201,
"learning_rate": 5e-06,
"loss": 0.5884,
"step": 2770
},
{
"epoch": 1.584045584045584,
"grad_norm": 0.5627964904713231,
"learning_rate": 5e-06,
"loss": 0.6041,
"step": 2780
},
{
"epoch": 1.5897435897435899,
"grad_norm": 0.6297308754027342,
"learning_rate": 5e-06,
"loss": 0.5923,
"step": 2790
},
{
"epoch": 1.5954415954415955,
"grad_norm": 0.5936598449915134,
"learning_rate": 5e-06,
"loss": 0.5761,
"step": 2800
},
{
"epoch": 1.601139601139601,
"grad_norm": 0.5515458557481886,
"learning_rate": 5e-06,
"loss": 0.5833,
"step": 2810
},
{
"epoch": 1.606837606837607,
"grad_norm": 0.5257725141562714,
"learning_rate": 5e-06,
"loss": 0.5765,
"step": 2820
},
{
"epoch": 1.6125356125356125,
"grad_norm": 0.5730936759045179,
"learning_rate": 5e-06,
"loss": 0.5649,
"step": 2830
},
{
"epoch": 1.618233618233618,
"grad_norm": 0.5354221551624224,
"learning_rate": 5e-06,
"loss": 0.5776,
"step": 2840
},
{
"epoch": 1.623931623931624,
"grad_norm": 0.5824066939857122,
"learning_rate": 5e-06,
"loss": 0.5769,
"step": 2850
},
{
"epoch": 1.6296296296296298,
"grad_norm": 0.5554767609605542,
"learning_rate": 5e-06,
"loss": 0.5964,
"step": 2860
},
{
"epoch": 1.6353276353276354,
"grad_norm": 0.5526840997991586,
"learning_rate": 5e-06,
"loss": 0.5727,
"step": 2870
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.5465241482682912,
"learning_rate": 5e-06,
"loss": 0.5686,
"step": 2880
},
{
"epoch": 1.6467236467236468,
"grad_norm": 0.6190071105580203,
"learning_rate": 5e-06,
"loss": 0.5823,
"step": 2890
},
{
"epoch": 1.6524216524216524,
"grad_norm": 0.5356564377870386,
"learning_rate": 5e-06,
"loss": 0.5755,
"step": 2900
},
{
"epoch": 1.658119658119658,
"grad_norm": 0.5979037263652105,
"learning_rate": 5e-06,
"loss": 0.5772,
"step": 2910
},
{
"epoch": 1.6638176638176638,
"grad_norm": 0.5872422786253441,
"learning_rate": 5e-06,
"loss": 0.5761,
"step": 2920
},
{
"epoch": 1.6695156695156697,
"grad_norm": 0.5667349984301929,
"learning_rate": 5e-06,
"loss": 0.565,
"step": 2930
},
{
"epoch": 1.6752136752136753,
"grad_norm": 0.5553088977731174,
"learning_rate": 5e-06,
"loss": 0.5737,
"step": 2940
},
{
"epoch": 1.6809116809116809,
"grad_norm": 0.5615666295169086,
"learning_rate": 5e-06,
"loss": 0.5793,
"step": 2950
},
{
"epoch": 1.6866096866096867,
"grad_norm": 0.553145275478625,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 2960
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.5341696725770836,
"learning_rate": 5e-06,
"loss": 0.5614,
"step": 2970
},
{
"epoch": 1.698005698005698,
"grad_norm": 0.5463223359456406,
"learning_rate": 5e-06,
"loss": 0.5786,
"step": 2980
},
{
"epoch": 1.7037037037037037,
"grad_norm": 0.5777813141939787,
"learning_rate": 5e-06,
"loss": 0.5841,
"step": 2990
},
{
"epoch": 1.7094017094017095,
"grad_norm": 0.5300785322610779,
"learning_rate": 5e-06,
"loss": 0.5778,
"step": 3000
},
{
"epoch": 1.7150997150997151,
"grad_norm": 0.563500423103054,
"learning_rate": 5e-06,
"loss": 0.5755,
"step": 3010
},
{
"epoch": 1.7207977207977208,
"grad_norm": 0.5634105059434202,
"learning_rate": 5e-06,
"loss": 0.5643,
"step": 3020
},
{
"epoch": 1.7264957264957266,
"grad_norm": 0.5570811501478863,
"learning_rate": 5e-06,
"loss": 0.581,
"step": 3030
},
{
"epoch": 1.7321937321937322,
"grad_norm": 0.5739299531519032,
"learning_rate": 5e-06,
"loss": 0.5748,
"step": 3040
},
{
"epoch": 1.7378917378917378,
"grad_norm": 0.5701909466155488,
"learning_rate": 5e-06,
"loss": 0.5708,
"step": 3050
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.5308366598042483,
"learning_rate": 5e-06,
"loss": 0.5815,
"step": 3060
},
{
"epoch": 1.7492877492877494,
"grad_norm": 0.5528144482172378,
"learning_rate": 5e-06,
"loss": 0.5728,
"step": 3070
},
{
"epoch": 1.7549857549857548,
"grad_norm": 0.6087932311563443,
"learning_rate": 5e-06,
"loss": 0.5707,
"step": 3080
},
{
"epoch": 1.7606837606837606,
"grad_norm": 0.5516607270748759,
"learning_rate": 5e-06,
"loss": 0.5896,
"step": 3090
},
{
"epoch": 1.7663817663817665,
"grad_norm": 0.5843004310212261,
"learning_rate": 5e-06,
"loss": 0.5767,
"step": 3100
},
{
"epoch": 1.772079772079772,
"grad_norm": 0.5852609740047551,
"learning_rate": 5e-06,
"loss": 0.5845,
"step": 3110
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.552703665463295,
"learning_rate": 5e-06,
"loss": 0.5703,
"step": 3120
},
{
"epoch": 1.7834757834757835,
"grad_norm": 0.6753220707242441,
"learning_rate": 5e-06,
"loss": 0.5758,
"step": 3130
},
{
"epoch": 1.7891737891737893,
"grad_norm": 0.616525110015123,
"learning_rate": 5e-06,
"loss": 0.5884,
"step": 3140
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.5654708131985335,
"learning_rate": 5e-06,
"loss": 0.5758,
"step": 3150
},
{
"epoch": 1.8005698005698005,
"grad_norm": 0.5323851407450543,
"learning_rate": 5e-06,
"loss": 0.5686,
"step": 3160
},
{
"epoch": 1.8062678062678064,
"grad_norm": 0.5799884631320908,
"learning_rate": 5e-06,
"loss": 0.5733,
"step": 3170
},
{
"epoch": 1.811965811965812,
"grad_norm": 0.6251760605506265,
"learning_rate": 5e-06,
"loss": 0.5766,
"step": 3180
},
{
"epoch": 1.8176638176638176,
"grad_norm": 0.5779038699801032,
"learning_rate": 5e-06,
"loss": 0.5624,
"step": 3190
},
{
"epoch": 1.8233618233618234,
"grad_norm": 0.5088787245145318,
"learning_rate": 5e-06,
"loss": 0.5596,
"step": 3200
},
{
"epoch": 1.8290598290598292,
"grad_norm": 0.4965401461152827,
"learning_rate": 5e-06,
"loss": 0.5698,
"step": 3210
},
{
"epoch": 1.8347578347578346,
"grad_norm": 0.5353414236210642,
"learning_rate": 5e-06,
"loss": 0.5683,
"step": 3220
},
{
"epoch": 1.8404558404558404,
"grad_norm": 0.5409699336409666,
"learning_rate": 5e-06,
"loss": 0.569,
"step": 3230
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.574913493745111,
"learning_rate": 5e-06,
"loss": 0.5789,
"step": 3240
},
{
"epoch": 1.8518518518518519,
"grad_norm": 0.5521006517580501,
"learning_rate": 5e-06,
"loss": 0.5715,
"step": 3250
},
{
"epoch": 1.8575498575498575,
"grad_norm": 0.5805482573176093,
"learning_rate": 5e-06,
"loss": 0.5585,
"step": 3260
},
{
"epoch": 1.8632478632478633,
"grad_norm": 0.573426960061509,
"learning_rate": 5e-06,
"loss": 0.578,
"step": 3270
},
{
"epoch": 1.868945868945869,
"grad_norm": 0.5565980515165863,
"learning_rate": 5e-06,
"loss": 0.5729,
"step": 3280
},
{
"epoch": 1.8746438746438745,
"grad_norm": 0.5586859824206695,
"learning_rate": 5e-06,
"loss": 0.5761,
"step": 3290
},
{
"epoch": 1.8803418803418803,
"grad_norm": 0.5496999183024133,
"learning_rate": 5e-06,
"loss": 0.5561,
"step": 3300
},
{
"epoch": 1.8860398860398861,
"grad_norm": 0.5997656668355231,
"learning_rate": 5e-06,
"loss": 0.5859,
"step": 3310
},
{
"epoch": 1.8917378917378918,
"grad_norm": 0.6206812095844995,
"learning_rate": 5e-06,
"loss": 0.5845,
"step": 3320
},
{
"epoch": 1.8974358974358974,
"grad_norm": 0.5963021360576573,
"learning_rate": 5e-06,
"loss": 0.5839,
"step": 3330
},
{
"epoch": 1.9031339031339032,
"grad_norm": 0.5743434396116716,
"learning_rate": 5e-06,
"loss": 0.5816,
"step": 3340
},
{
"epoch": 1.9088319088319088,
"grad_norm": 0.5858788059851108,
"learning_rate": 5e-06,
"loss": 0.5716,
"step": 3350
},
{
"epoch": 1.9145299145299144,
"grad_norm": 0.5648325254524372,
"learning_rate": 5e-06,
"loss": 0.5789,
"step": 3360
},
{
"epoch": 1.9202279202279202,
"grad_norm": 0.5444165984165006,
"learning_rate": 5e-06,
"loss": 0.5775,
"step": 3370
},
{
"epoch": 1.925925925925926,
"grad_norm": 0.6059359662104461,
"learning_rate": 5e-06,
"loss": 0.5802,
"step": 3380
},
{
"epoch": 1.9316239316239316,
"grad_norm": 0.5755161543958413,
"learning_rate": 5e-06,
"loss": 0.5784,
"step": 3390
},
{
"epoch": 1.9373219373219372,
"grad_norm": 0.5368654626324941,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 3400
},
{
"epoch": 1.943019943019943,
"grad_norm": 0.5582700416390507,
"learning_rate": 5e-06,
"loss": 0.5676,
"step": 3410
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.5625374338203465,
"learning_rate": 5e-06,
"loss": 0.5741,
"step": 3420
},
{
"epoch": 1.9544159544159543,
"grad_norm": 0.6326198974505973,
"learning_rate": 5e-06,
"loss": 0.5874,
"step": 3430
},
{
"epoch": 1.96011396011396,
"grad_norm": 0.5826625931834889,
"learning_rate": 5e-06,
"loss": 0.5875,
"step": 3440
},
{
"epoch": 1.965811965811966,
"grad_norm": 0.6063426488985952,
"learning_rate": 5e-06,
"loss": 0.5802,
"step": 3450
},
{
"epoch": 1.9715099715099715,
"grad_norm": 0.5589982670093196,
"learning_rate": 5e-06,
"loss": 0.5813,
"step": 3460
},
{
"epoch": 1.9772079772079771,
"grad_norm": 0.5197556180794218,
"learning_rate": 5e-06,
"loss": 0.5763,
"step": 3470
},
{
"epoch": 1.982905982905983,
"grad_norm": 0.5840640270883682,
"learning_rate": 5e-06,
"loss": 0.5701,
"step": 3480
},
{
"epoch": 1.9886039886039886,
"grad_norm": 0.5335458990414118,
"learning_rate": 5e-06,
"loss": 0.5701,
"step": 3490
},
{
"epoch": 1.9943019943019942,
"grad_norm": 0.5707109550752067,
"learning_rate": 5e-06,
"loss": 0.5753,
"step": 3500
},
{
"epoch": 2.0,
"grad_norm": 0.5883827598003003,
"learning_rate": 5e-06,
"loss": 0.581,
"step": 3510
},
{
"epoch": 2.0,
"eval_loss": 0.6180241703987122,
"eval_runtime": 447.562,
"eval_samples_per_second": 26.416,
"eval_steps_per_second": 0.413,
"step": 3510
},
{
"epoch": 2.005698005698006,
"grad_norm": 0.5776212119618224,
"learning_rate": 5e-06,
"loss": 0.5254,
"step": 3520
},
{
"epoch": 2.011396011396011,
"grad_norm": 0.5810335608051052,
"learning_rate": 5e-06,
"loss": 0.5094,
"step": 3530
},
{
"epoch": 2.017094017094017,
"grad_norm": 0.5432713405979779,
"learning_rate": 5e-06,
"loss": 0.5032,
"step": 3540
},
{
"epoch": 2.022792022792023,
"grad_norm": 0.5416321025053085,
"learning_rate": 5e-06,
"loss": 0.515,
"step": 3550
},
{
"epoch": 2.0284900284900287,
"grad_norm": 0.5775523857428819,
"learning_rate": 5e-06,
"loss": 0.518,
"step": 3560
},
{
"epoch": 2.034188034188034,
"grad_norm": 0.5592263586107213,
"learning_rate": 5e-06,
"loss": 0.5234,
"step": 3570
},
{
"epoch": 2.03988603988604,
"grad_norm": 0.5476768718918923,
"learning_rate": 5e-06,
"loss": 0.5289,
"step": 3580
},
{
"epoch": 2.0455840455840457,
"grad_norm": 0.6102527285044014,
"learning_rate": 5e-06,
"loss": 0.5208,
"step": 3590
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.589225412655405,
"learning_rate": 5e-06,
"loss": 0.5298,
"step": 3600
},
{
"epoch": 2.056980056980057,
"grad_norm": 0.57808060135501,
"learning_rate": 5e-06,
"loss": 0.5208,
"step": 3610
},
{
"epoch": 2.0626780626780628,
"grad_norm": 0.5634395489010126,
"learning_rate": 5e-06,
"loss": 0.5212,
"step": 3620
},
{
"epoch": 2.0683760683760686,
"grad_norm": 0.5526570622014573,
"learning_rate": 5e-06,
"loss": 0.5297,
"step": 3630
},
{
"epoch": 2.074074074074074,
"grad_norm": 0.5810750660810072,
"learning_rate": 5e-06,
"loss": 0.525,
"step": 3640
},
{
"epoch": 2.07977207977208,
"grad_norm": 0.5614577275900066,
"learning_rate": 5e-06,
"loss": 0.5259,
"step": 3650
},
{
"epoch": 2.0854700854700856,
"grad_norm": 0.5486462905219032,
"learning_rate": 5e-06,
"loss": 0.5213,
"step": 3660
},
{
"epoch": 2.091168091168091,
"grad_norm": 0.5307563733817223,
"learning_rate": 5e-06,
"loss": 0.529,
"step": 3670
},
{
"epoch": 2.096866096866097,
"grad_norm": 0.5389945236629596,
"learning_rate": 5e-06,
"loss": 0.5348,
"step": 3680
},
{
"epoch": 2.1025641025641026,
"grad_norm": 0.5527322408012718,
"learning_rate": 5e-06,
"loss": 0.5116,
"step": 3690
},
{
"epoch": 2.1082621082621085,
"grad_norm": 0.5328079584501793,
"learning_rate": 5e-06,
"loss": 0.5282,
"step": 3700
},
{
"epoch": 2.113960113960114,
"grad_norm": 0.5686915040528058,
"learning_rate": 5e-06,
"loss": 0.5261,
"step": 3710
},
{
"epoch": 2.1196581196581197,
"grad_norm": 0.5501606190305495,
"learning_rate": 5e-06,
"loss": 0.5365,
"step": 3720
},
{
"epoch": 2.1253561253561255,
"grad_norm": 0.5536761094008102,
"learning_rate": 5e-06,
"loss": 0.5263,
"step": 3730
},
{
"epoch": 2.131054131054131,
"grad_norm": 0.5345031800564628,
"learning_rate": 5e-06,
"loss": 0.5206,
"step": 3740
},
{
"epoch": 2.1367521367521367,
"grad_norm": 0.6046490261900991,
"learning_rate": 5e-06,
"loss": 0.5275,
"step": 3750
},
{
"epoch": 2.1424501424501425,
"grad_norm": 0.5840211791187765,
"learning_rate": 5e-06,
"loss": 0.5201,
"step": 3760
},
{
"epoch": 2.148148148148148,
"grad_norm": 0.5529533135143219,
"learning_rate": 5e-06,
"loss": 0.5115,
"step": 3770
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.5680751070257097,
"learning_rate": 5e-06,
"loss": 0.5294,
"step": 3780
},
{
"epoch": 2.1595441595441596,
"grad_norm": 0.5245141535052799,
"learning_rate": 5e-06,
"loss": 0.5281,
"step": 3790
},
{
"epoch": 2.1652421652421654,
"grad_norm": 0.5648362949355089,
"learning_rate": 5e-06,
"loss": 0.5147,
"step": 3800
},
{
"epoch": 2.1709401709401708,
"grad_norm": 0.5254847337067438,
"learning_rate": 5e-06,
"loss": 0.5313,
"step": 3810
},
{
"epoch": 2.1766381766381766,
"grad_norm": 0.5976261665941772,
"learning_rate": 5e-06,
"loss": 0.5198,
"step": 3820
},
{
"epoch": 2.1823361823361824,
"grad_norm": 0.5864445373756276,
"learning_rate": 5e-06,
"loss": 0.5336,
"step": 3830
},
{
"epoch": 2.1880341880341883,
"grad_norm": 0.5537617774511332,
"learning_rate": 5e-06,
"loss": 0.5239,
"step": 3840
},
{
"epoch": 2.1937321937321936,
"grad_norm": 0.5790262967504055,
"learning_rate": 5e-06,
"loss": 0.5387,
"step": 3850
},
{
"epoch": 2.1994301994301995,
"grad_norm": 0.5448893578337308,
"learning_rate": 5e-06,
"loss": 0.5158,
"step": 3860
},
{
"epoch": 2.2051282051282053,
"grad_norm": 0.5224956999156651,
"learning_rate": 5e-06,
"loss": 0.5319,
"step": 3870
},
{
"epoch": 2.2108262108262107,
"grad_norm": 0.5452041541066649,
"learning_rate": 5e-06,
"loss": 0.5283,
"step": 3880
},
{
"epoch": 2.2165242165242165,
"grad_norm": 0.5188463908276534,
"learning_rate": 5e-06,
"loss": 0.5111,
"step": 3890
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.6153310194594807,
"learning_rate": 5e-06,
"loss": 0.5358,
"step": 3900
},
{
"epoch": 2.2279202279202277,
"grad_norm": 0.5926494217956065,
"learning_rate": 5e-06,
"loss": 0.5154,
"step": 3910
},
{
"epoch": 2.2336182336182335,
"grad_norm": 0.5109574356125176,
"learning_rate": 5e-06,
"loss": 0.518,
"step": 3920
},
{
"epoch": 2.2393162393162394,
"grad_norm": 0.5289253041831274,
"learning_rate": 5e-06,
"loss": 0.5246,
"step": 3930
},
{
"epoch": 2.245014245014245,
"grad_norm": 0.5628951778576998,
"learning_rate": 5e-06,
"loss": 0.526,
"step": 3940
},
{
"epoch": 2.2507122507122506,
"grad_norm": 0.551449654946418,
"learning_rate": 5e-06,
"loss": 0.5374,
"step": 3950
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.5466152136858086,
"learning_rate": 5e-06,
"loss": 0.5169,
"step": 3960
},
{
"epoch": 2.262108262108262,
"grad_norm": 0.5146969054690042,
"learning_rate": 5e-06,
"loss": 0.5281,
"step": 3970
},
{
"epoch": 2.267806267806268,
"grad_norm": 0.5293060782125808,
"learning_rate": 5e-06,
"loss": 0.5191,
"step": 3980
},
{
"epoch": 2.2735042735042734,
"grad_norm": 0.5473420088344219,
"learning_rate": 5e-06,
"loss": 0.531,
"step": 3990
},
{
"epoch": 2.2792022792022792,
"grad_norm": 0.5512443710837232,
"learning_rate": 5e-06,
"loss": 0.5256,
"step": 4000
},
{
"epoch": 2.284900284900285,
"grad_norm": 0.5442787627600018,
"learning_rate": 5e-06,
"loss": 0.5222,
"step": 4010
},
{
"epoch": 2.2905982905982905,
"grad_norm": 0.5545916348777593,
"learning_rate": 5e-06,
"loss": 0.535,
"step": 4020
},
{
"epoch": 2.2962962962962963,
"grad_norm": 0.59632132003208,
"learning_rate": 5e-06,
"loss": 0.5317,
"step": 4030
},
{
"epoch": 2.301994301994302,
"grad_norm": 0.5408157566248561,
"learning_rate": 5e-06,
"loss": 0.5146,
"step": 4040
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.5820724583290839,
"learning_rate": 5e-06,
"loss": 0.5349,
"step": 4050
},
{
"epoch": 2.3133903133903133,
"grad_norm": 0.5687662322666911,
"learning_rate": 5e-06,
"loss": 0.5282,
"step": 4060
},
{
"epoch": 2.319088319088319,
"grad_norm": 0.573552994416881,
"learning_rate": 5e-06,
"loss": 0.5336,
"step": 4070
},
{
"epoch": 2.324786324786325,
"grad_norm": 0.5677912645112424,
"learning_rate": 5e-06,
"loss": 0.5214,
"step": 4080
},
{
"epoch": 2.3304843304843303,
"grad_norm": 0.5274433334199329,
"learning_rate": 5e-06,
"loss": 0.5244,
"step": 4090
},
{
"epoch": 2.336182336182336,
"grad_norm": 0.5658209536678374,
"learning_rate": 5e-06,
"loss": 0.5286,
"step": 4100
},
{
"epoch": 2.341880341880342,
"grad_norm": 0.5780434495697487,
"learning_rate": 5e-06,
"loss": 0.5341,
"step": 4110
},
{
"epoch": 2.347578347578348,
"grad_norm": 0.5818657983745251,
"learning_rate": 5e-06,
"loss": 0.5338,
"step": 4120
},
{
"epoch": 2.353276353276353,
"grad_norm": 0.5389779504746351,
"learning_rate": 5e-06,
"loss": 0.5291,
"step": 4130
},
{
"epoch": 2.358974358974359,
"grad_norm": 0.5610403895418081,
"learning_rate": 5e-06,
"loss": 0.5225,
"step": 4140
},
{
"epoch": 2.364672364672365,
"grad_norm": 0.5209098217965255,
"learning_rate": 5e-06,
"loss": 0.5334,
"step": 4150
},
{
"epoch": 2.3703703703703702,
"grad_norm": 0.5744294920867676,
"learning_rate": 5e-06,
"loss": 0.5204,
"step": 4160
},
{
"epoch": 2.376068376068376,
"grad_norm": 0.598425566675419,
"learning_rate": 5e-06,
"loss": 0.52,
"step": 4170
},
{
"epoch": 2.381766381766382,
"grad_norm": 0.5493923391327106,
"learning_rate": 5e-06,
"loss": 0.5359,
"step": 4180
},
{
"epoch": 2.3874643874643873,
"grad_norm": 0.5533392246170049,
"learning_rate": 5e-06,
"loss": 0.521,
"step": 4190
},
{
"epoch": 2.393162393162393,
"grad_norm": 0.5731160307080695,
"learning_rate": 5e-06,
"loss": 0.5329,
"step": 4200
},
{
"epoch": 2.398860398860399,
"grad_norm": 0.5775023991320096,
"learning_rate": 5e-06,
"loss": 0.5359,
"step": 4210
},
{
"epoch": 2.4045584045584047,
"grad_norm": 0.5901628223866878,
"learning_rate": 5e-06,
"loss": 0.5493,
"step": 4220
},
{
"epoch": 2.41025641025641,
"grad_norm": 0.5542817321146499,
"learning_rate": 5e-06,
"loss": 0.526,
"step": 4230
},
{
"epoch": 2.415954415954416,
"grad_norm": 0.5524566146364747,
"learning_rate": 5e-06,
"loss": 0.5307,
"step": 4240
},
{
"epoch": 2.421652421652422,
"grad_norm": 0.5244228024377005,
"learning_rate": 5e-06,
"loss": 0.5278,
"step": 4250
},
{
"epoch": 2.427350427350427,
"grad_norm": 0.5786633243903677,
"learning_rate": 5e-06,
"loss": 0.5482,
"step": 4260
},
{
"epoch": 2.433048433048433,
"grad_norm": 0.5858650466682291,
"learning_rate": 5e-06,
"loss": 0.5394,
"step": 4270
},
{
"epoch": 2.438746438746439,
"grad_norm": 0.5858885917781449,
"learning_rate": 5e-06,
"loss": 0.5371,
"step": 4280
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.5339546065735147,
"learning_rate": 5e-06,
"loss": 0.5373,
"step": 4290
},
{
"epoch": 2.45014245014245,
"grad_norm": 0.5984049498497251,
"learning_rate": 5e-06,
"loss": 0.5474,
"step": 4300
},
{
"epoch": 2.455840455840456,
"grad_norm": 0.5807043848022856,
"learning_rate": 5e-06,
"loss": 0.5309,
"step": 4310
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.5709610370467612,
"learning_rate": 5e-06,
"loss": 0.5246,
"step": 4320
},
{
"epoch": 2.467236467236467,
"grad_norm": 0.5499687224770995,
"learning_rate": 5e-06,
"loss": 0.531,
"step": 4330
},
{
"epoch": 2.472934472934473,
"grad_norm": 0.5722356598944494,
"learning_rate": 5e-06,
"loss": 0.5286,
"step": 4340
},
{
"epoch": 2.4786324786324787,
"grad_norm": 0.5486032250328287,
"learning_rate": 5e-06,
"loss": 0.5358,
"step": 4350
},
{
"epoch": 2.484330484330484,
"grad_norm": 0.5142530295671646,
"learning_rate": 5e-06,
"loss": 0.5324,
"step": 4360
},
{
"epoch": 2.49002849002849,
"grad_norm": 0.6364539965127788,
"learning_rate": 5e-06,
"loss": 0.5325,
"step": 4370
},
{
"epoch": 2.4957264957264957,
"grad_norm": 0.5822908149062661,
"learning_rate": 5e-06,
"loss": 0.5378,
"step": 4380
},
{
"epoch": 2.5014245014245016,
"grad_norm": 0.5660579125585127,
"learning_rate": 5e-06,
"loss": 0.539,
"step": 4390
},
{
"epoch": 2.5071225071225074,
"grad_norm": 0.6015416980494055,
"learning_rate": 5e-06,
"loss": 0.543,
"step": 4400
},
{
"epoch": 2.5128205128205128,
"grad_norm": 0.544050303995212,
"learning_rate": 5e-06,
"loss": 0.5186,
"step": 4410
},
{
"epoch": 2.5185185185185186,
"grad_norm": 0.5489445408860626,
"learning_rate": 5e-06,
"loss": 0.5293,
"step": 4420
},
{
"epoch": 2.5242165242165244,
"grad_norm": 0.5804195388596164,
"learning_rate": 5e-06,
"loss": 0.5368,
"step": 4430
},
{
"epoch": 2.52991452991453,
"grad_norm": 0.5465444916928103,
"learning_rate": 5e-06,
"loss": 0.5395,
"step": 4440
},
{
"epoch": 2.5356125356125356,
"grad_norm": 0.5679778769321939,
"learning_rate": 5e-06,
"loss": 0.5358,
"step": 4450
},
{
"epoch": 2.5413105413105415,
"grad_norm": 0.5726465912316608,
"learning_rate": 5e-06,
"loss": 0.5253,
"step": 4460
},
{
"epoch": 2.547008547008547,
"grad_norm": 0.5387152868301355,
"learning_rate": 5e-06,
"loss": 0.5268,
"step": 4470
},
{
"epoch": 2.5527065527065527,
"grad_norm": 0.5559047427422275,
"learning_rate": 5e-06,
"loss": 0.5305,
"step": 4480
},
{
"epoch": 2.5584045584045585,
"grad_norm": 0.5428769349897132,
"learning_rate": 5e-06,
"loss": 0.5209,
"step": 4490
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.5407361307856526,
"learning_rate": 5e-06,
"loss": 0.5351,
"step": 4500
},
{
"epoch": 2.5698005698005697,
"grad_norm": 0.5595203034409101,
"learning_rate": 5e-06,
"loss": 0.5312,
"step": 4510
},
{
"epoch": 2.5754985754985755,
"grad_norm": 0.5752885902852435,
"learning_rate": 5e-06,
"loss": 0.5328,
"step": 4520
},
{
"epoch": 2.5811965811965814,
"grad_norm": 0.5448007027240791,
"learning_rate": 5e-06,
"loss": 0.5295,
"step": 4530
},
{
"epoch": 2.5868945868945867,
"grad_norm": 0.5494957146695392,
"learning_rate": 5e-06,
"loss": 0.5327,
"step": 4540
},
{
"epoch": 2.5925925925925926,
"grad_norm": 0.5743882596085497,
"learning_rate": 5e-06,
"loss": 0.5295,
"step": 4550
},
{
"epoch": 2.5982905982905984,
"grad_norm": 0.5481581540445639,
"learning_rate": 5e-06,
"loss": 0.5305,
"step": 4560
},
{
"epoch": 2.603988603988604,
"grad_norm": 0.5834328837619958,
"learning_rate": 5e-06,
"loss": 0.5376,
"step": 4570
},
{
"epoch": 2.6096866096866096,
"grad_norm": 0.5536117193354623,
"learning_rate": 5e-06,
"loss": 0.5341,
"step": 4580
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.545383573085851,
"learning_rate": 5e-06,
"loss": 0.5233,
"step": 4590
},
{
"epoch": 2.6210826210826212,
"grad_norm": 0.5204672857822074,
"learning_rate": 5e-06,
"loss": 0.5228,
"step": 4600
},
{
"epoch": 2.6267806267806266,
"grad_norm": 0.5139161169258046,
"learning_rate": 5e-06,
"loss": 0.5328,
"step": 4610
},
{
"epoch": 2.6324786324786325,
"grad_norm": 0.6028262892085369,
"learning_rate": 5e-06,
"loss": 0.5332,
"step": 4620
},
{
"epoch": 2.6381766381766383,
"grad_norm": 0.5559617493532288,
"learning_rate": 5e-06,
"loss": 0.5232,
"step": 4630
},
{
"epoch": 2.6438746438746437,
"grad_norm": 0.5435028142224008,
"learning_rate": 5e-06,
"loss": 0.5415,
"step": 4640
},
{
"epoch": 2.6495726495726495,
"grad_norm": 0.604873621040108,
"learning_rate": 5e-06,
"loss": 0.5303,
"step": 4650
},
{
"epoch": 2.6552706552706553,
"grad_norm": 0.5697598259817795,
"learning_rate": 5e-06,
"loss": 0.5373,
"step": 4660
},
{
"epoch": 2.6609686609686607,
"grad_norm": 0.5511420813626869,
"learning_rate": 5e-06,
"loss": 0.5434,
"step": 4670
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.5394695044798543,
"learning_rate": 5e-06,
"loss": 0.5238,
"step": 4680
},
{
"epoch": 2.6723646723646723,
"grad_norm": 0.5330927779679859,
"learning_rate": 5e-06,
"loss": 0.5245,
"step": 4690
},
{
"epoch": 2.678062678062678,
"grad_norm": 0.5736642108384618,
"learning_rate": 5e-06,
"loss": 0.5305,
"step": 4700
},
{
"epoch": 2.683760683760684,
"grad_norm": 0.6197551413034075,
"learning_rate": 5e-06,
"loss": 0.5408,
"step": 4710
},
{
"epoch": 2.6894586894586894,
"grad_norm": 0.5791951412024915,
"learning_rate": 5e-06,
"loss": 0.5306,
"step": 4720
},
{
"epoch": 2.695156695156695,
"grad_norm": 0.5631274263966353,
"learning_rate": 5e-06,
"loss": 0.5228,
"step": 4730
},
{
"epoch": 2.700854700854701,
"grad_norm": 0.5605562545980405,
"learning_rate": 5e-06,
"loss": 0.5266,
"step": 4740
},
{
"epoch": 2.7065527065527064,
"grad_norm": 0.5321827825743034,
"learning_rate": 5e-06,
"loss": 0.5338,
"step": 4750
},
{
"epoch": 2.7122507122507122,
"grad_norm": 0.5644337354264807,
"learning_rate": 5e-06,
"loss": 0.5376,
"step": 4760
},
{
"epoch": 2.717948717948718,
"grad_norm": 0.5719762386839188,
"learning_rate": 5e-06,
"loss": 0.5298,
"step": 4770
},
{
"epoch": 2.7236467236467234,
"grad_norm": 0.5870644859394915,
"learning_rate": 5e-06,
"loss": 0.5408,
"step": 4780
},
{
"epoch": 2.7293447293447293,
"grad_norm": 0.5161759448699083,
"learning_rate": 5e-06,
"loss": 0.5385,
"step": 4790
},
{
"epoch": 2.735042735042735,
"grad_norm": 0.5685973523356822,
"learning_rate": 5e-06,
"loss": 0.5295,
"step": 4800
},
{
"epoch": 2.7407407407407405,
"grad_norm": 0.5955189388351516,
"learning_rate": 5e-06,
"loss": 0.5357,
"step": 4810
},
{
"epoch": 2.7464387464387463,
"grad_norm": 0.5927243869455354,
"learning_rate": 5e-06,
"loss": 0.5397,
"step": 4820
},
{
"epoch": 2.752136752136752,
"grad_norm": 0.5892611711545225,
"learning_rate": 5e-06,
"loss": 0.5427,
"step": 4830
},
{
"epoch": 2.757834757834758,
"grad_norm": 0.5320349130904972,
"learning_rate": 5e-06,
"loss": 0.5322,
"step": 4840
},
{
"epoch": 2.763532763532764,
"grad_norm": 0.5215197760783008,
"learning_rate": 5e-06,
"loss": 0.5196,
"step": 4850
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.5967746123628929,
"learning_rate": 5e-06,
"loss": 0.5349,
"step": 4860
},
{
"epoch": 2.774928774928775,
"grad_norm": 0.5303530858087516,
"learning_rate": 5e-06,
"loss": 0.5288,
"step": 4870
},
{
"epoch": 2.780626780626781,
"grad_norm": 0.5294938033518871,
"learning_rate": 5e-06,
"loss": 0.5254,
"step": 4880
},
{
"epoch": 2.786324786324786,
"grad_norm": 0.6085557642175643,
"learning_rate": 5e-06,
"loss": 0.5362,
"step": 4890
},
{
"epoch": 2.792022792022792,
"grad_norm": 0.5563638209032657,
"learning_rate": 5e-06,
"loss": 0.5243,
"step": 4900
},
{
"epoch": 2.797720797720798,
"grad_norm": 0.5426535982775469,
"learning_rate": 5e-06,
"loss": 0.5302,
"step": 4910
},
{
"epoch": 2.8034188034188032,
"grad_norm": 0.5606166025371381,
"learning_rate": 5e-06,
"loss": 0.5195,
"step": 4920
},
{
"epoch": 2.809116809116809,
"grad_norm": 0.5600176374437925,
"learning_rate": 5e-06,
"loss": 0.5339,
"step": 4930
},
{
"epoch": 2.814814814814815,
"grad_norm": 0.5735203266072578,
"learning_rate": 5e-06,
"loss": 0.5463,
"step": 4940
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.5647627688966846,
"learning_rate": 5e-06,
"loss": 0.5342,
"step": 4950
},
{
"epoch": 2.826210826210826,
"grad_norm": 0.6181052514822875,
"learning_rate": 5e-06,
"loss": 0.5502,
"step": 4960
},
{
"epoch": 2.831908831908832,
"grad_norm": 0.5487589815910356,
"learning_rate": 5e-06,
"loss": 0.5332,
"step": 4970
},
{
"epoch": 2.8376068376068377,
"grad_norm": 0.5519304274572768,
"learning_rate": 5e-06,
"loss": 0.5342,
"step": 4980
},
{
"epoch": 2.8433048433048436,
"grad_norm": 0.5710774380484754,
"learning_rate": 5e-06,
"loss": 0.5468,
"step": 4990
},
{
"epoch": 2.849002849002849,
"grad_norm": 0.5396253108717034,
"learning_rate": 5e-06,
"loss": 0.536,
"step": 5000
},
{
"epoch": 2.8547008547008548,
"grad_norm": 0.5481621751659937,
"learning_rate": 5e-06,
"loss": 0.5361,
"step": 5010
},
{
"epoch": 2.8603988603988606,
"grad_norm": 0.5815133705980525,
"learning_rate": 5e-06,
"loss": 0.5321,
"step": 5020
},
{
"epoch": 2.866096866096866,
"grad_norm": 0.5408578285161547,
"learning_rate": 5e-06,
"loss": 0.5367,
"step": 5030
},
{
"epoch": 2.871794871794872,
"grad_norm": 0.5405279703831611,
"learning_rate": 5e-06,
"loss": 0.532,
"step": 5040
},
{
"epoch": 2.8774928774928776,
"grad_norm": 0.5566749988018465,
"learning_rate": 5e-06,
"loss": 0.5374,
"step": 5050
},
{
"epoch": 2.883190883190883,
"grad_norm": 0.5806758592562609,
"learning_rate": 5e-06,
"loss": 0.5425,
"step": 5060
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.5820389002607862,
"learning_rate": 5e-06,
"loss": 0.5329,
"step": 5070
},
{
"epoch": 2.8945868945868947,
"grad_norm": 0.5375342327708015,
"learning_rate": 5e-06,
"loss": 0.5404,
"step": 5080
},
{
"epoch": 2.9002849002849,
"grad_norm": 0.5641024886824925,
"learning_rate": 5e-06,
"loss": 0.5206,
"step": 5090
},
{
"epoch": 2.905982905982906,
"grad_norm": 0.5595993132067282,
"learning_rate": 5e-06,
"loss": 0.5456,
"step": 5100
},
{
"epoch": 2.9116809116809117,
"grad_norm": 0.5729657514196825,
"learning_rate": 5e-06,
"loss": 0.5303,
"step": 5110
},
{
"epoch": 2.9173789173789175,
"grad_norm": 0.5592258039441389,
"learning_rate": 5e-06,
"loss": 0.5137,
"step": 5120
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.5482964902412071,
"learning_rate": 5e-06,
"loss": 0.5379,
"step": 5130
},
{
"epoch": 2.9287749287749287,
"grad_norm": 0.5336701580376303,
"learning_rate": 5e-06,
"loss": 0.516,
"step": 5140
},
{
"epoch": 2.9344729344729346,
"grad_norm": 0.573991652444628,
"learning_rate": 5e-06,
"loss": 0.5197,
"step": 5150
},
{
"epoch": 2.9401709401709404,
"grad_norm": 0.5656512132917955,
"learning_rate": 5e-06,
"loss": 0.5299,
"step": 5160
},
{
"epoch": 2.9458689458689458,
"grad_norm": 0.5637897139695605,
"learning_rate": 5e-06,
"loss": 0.5318,
"step": 5170
},
{
"epoch": 2.9515669515669516,
"grad_norm": 0.5805647906397857,
"learning_rate": 5e-06,
"loss": 0.5348,
"step": 5180
},
{
"epoch": 2.9572649572649574,
"grad_norm": 0.5629404743153653,
"learning_rate": 5e-06,
"loss": 0.5239,
"step": 5190
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.5482910577257104,
"learning_rate": 5e-06,
"loss": 0.5248,
"step": 5200
},
{
"epoch": 2.9686609686609686,
"grad_norm": 0.5428900145420302,
"learning_rate": 5e-06,
"loss": 0.5286,
"step": 5210
},
{
"epoch": 2.9743589743589745,
"grad_norm": 0.5426923796436356,
"learning_rate": 5e-06,
"loss": 0.5439,
"step": 5220
},
{
"epoch": 2.98005698005698,
"grad_norm": 0.5421746187267816,
"learning_rate": 5e-06,
"loss": 0.5394,
"step": 5230
},
{
"epoch": 2.9857549857549857,
"grad_norm": 0.5703778871540313,
"learning_rate": 5e-06,
"loss": 0.5359,
"step": 5240
},
{
"epoch": 2.9914529914529915,
"grad_norm": 0.5488503690583575,
"learning_rate": 5e-06,
"loss": 0.5253,
"step": 5250
},
{
"epoch": 2.9971509971509973,
"grad_norm": 0.6053670974984723,
"learning_rate": 5e-06,
"loss": 0.5255,
"step": 5260
},
{
"epoch": 3.0,
"eval_loss": 0.6272810697555542,
"eval_runtime": 446.6687,
"eval_samples_per_second": 26.469,
"eval_steps_per_second": 0.414,
"step": 5265
},
{
"epoch": 3.0,
"step": 5265,
"total_flos": 2759937528692736.0,
"train_loss": 0.5805289232719545,
"train_runtime": 71924.3439,
"train_samples_per_second": 9.369,
"train_steps_per_second": 0.073
}
],
"logging_steps": 10,
"max_steps": 5265,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2759937528692736.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}