Files
daft-qwen2.5-coder-3b-instr…/trainer_state.json
ModelHub XC c00b0b999e 初始化项目,由ModelHub XC社区提供模型
Model: aasim-m/daft-qwen2.5-coder-3b-instruct-full-loss-0.02
Source: Original Platform
2026-04-25 13:33:36 +08:00

1500 lines
39 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1044,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014406627048442283,
"grad_norm": 2.235235207512626,
"learning_rate": 3.8095238095238102e-06,
"loss": 0.7567409992218017,
"step": 5
},
{
"epoch": 0.028813254096884566,
"grad_norm": 3.1220907369972015,
"learning_rate": 8.571428571428573e-06,
"loss": 0.6131507873535156,
"step": 10
},
{
"epoch": 0.04321988114532685,
"grad_norm": 1.26071593847269,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.4174152374267578,
"step": 15
},
{
"epoch": 0.05762650819376913,
"grad_norm": 0.7001329999687771,
"learning_rate": 1.8095238095238094e-05,
"loss": 0.3269367218017578,
"step": 20
},
{
"epoch": 0.07203313524221142,
"grad_norm": 0.45760881472851334,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.2579814910888672,
"step": 25
},
{
"epoch": 0.0864397622906537,
"grad_norm": 0.3195758164531423,
"learning_rate": 2.7619047619047622e-05,
"loss": 0.22403466701507568,
"step": 30
},
{
"epoch": 0.10084638933909598,
"grad_norm": 0.3937202043873978,
"learning_rate": 3.2380952380952386e-05,
"loss": 0.19430849552154542,
"step": 35
},
{
"epoch": 0.11525301638753827,
"grad_norm": 0.4622992203768602,
"learning_rate": 3.7142857142857143e-05,
"loss": 0.17950987815856934,
"step": 40
},
{
"epoch": 0.12965964343598055,
"grad_norm": 0.6153699707363115,
"learning_rate": 4.190476190476191e-05,
"loss": 0.16267883777618408,
"step": 45
},
{
"epoch": 0.14406627048442283,
"grad_norm": 0.44308873981945024,
"learning_rate": 4.666666666666667e-05,
"loss": 0.15465893745422363,
"step": 50
},
{
"epoch": 0.15847289753286511,
"grad_norm": 0.2667361601014973,
"learning_rate": 5.142857142857143e-05,
"loss": 0.1339721441268921,
"step": 55
},
{
"epoch": 0.1728795245813074,
"grad_norm": 0.2856336328456399,
"learning_rate": 5.619047619047619e-05,
"loss": 0.12487690448760987,
"step": 60
},
{
"epoch": 0.18728615162974968,
"grad_norm": 0.25342260570898284,
"learning_rate": 6.0952380952380964e-05,
"loss": 0.12374393939971924,
"step": 65
},
{
"epoch": 0.20169277867819196,
"grad_norm": 0.30856178034874804,
"learning_rate": 6.571428571428571e-05,
"loss": 0.11958651542663574,
"step": 70
},
{
"epoch": 0.21609940572663425,
"grad_norm": 0.37526519775047024,
"learning_rate": 7.047619047619048e-05,
"loss": 0.1187552571296692,
"step": 75
},
{
"epoch": 0.23050603277507653,
"grad_norm": 0.23950000695755433,
"learning_rate": 7.523809523809524e-05,
"loss": 0.11327266693115234,
"step": 80
},
{
"epoch": 0.2449126598235188,
"grad_norm": 0.2205068195876155,
"learning_rate": 8e-05,
"loss": 0.10409235954284668,
"step": 85
},
{
"epoch": 0.2593192868719611,
"grad_norm": 0.38702416724389993,
"learning_rate": 8.476190476190477e-05,
"loss": 0.10459071397781372,
"step": 90
},
{
"epoch": 0.2737259139204034,
"grad_norm": 1.5049886602582672,
"learning_rate": 8.952380952380953e-05,
"loss": 0.10260200500488281,
"step": 95
},
{
"epoch": 0.28813254096884566,
"grad_norm": 0.7639386034441172,
"learning_rate": 9.428571428571429e-05,
"loss": 0.15102967023849487,
"step": 100
},
{
"epoch": 0.302539168017288,
"grad_norm": 1.2918337466488035,
"learning_rate": 9.904761904761905e-05,
"loss": 0.21126885414123536,
"step": 105
},
{
"epoch": 0.31694579506573023,
"grad_norm": 0.3993035259049567,
"learning_rate": 9.99955226394288e-05,
"loss": 0.1489308714866638,
"step": 110
},
{
"epoch": 0.33135242211417254,
"grad_norm": 0.4337702538884852,
"learning_rate": 9.997733473639876e-05,
"loss": 0.1149595022201538,
"step": 115
},
{
"epoch": 0.3457590491626148,
"grad_norm": 0.19718286454816564,
"learning_rate": 9.994516154152849e-05,
"loss": 0.10596739053726197,
"step": 120
},
{
"epoch": 0.3601656762110571,
"grad_norm": 0.20133933569463766,
"learning_rate": 9.989901205792952e-05,
"loss": 0.09920316338539123,
"step": 125
},
{
"epoch": 0.37457230325949936,
"grad_norm": 0.1627871886908846,
"learning_rate": 9.983889919973586e-05,
"loss": 0.08917503952980041,
"step": 130
},
{
"epoch": 0.3889789303079417,
"grad_norm": 0.1732609558706655,
"learning_rate": 9.976483978849007e-05,
"loss": 0.08957574367523194,
"step": 135
},
{
"epoch": 0.40338555735638393,
"grad_norm": 0.17068559939024425,
"learning_rate": 9.967685454843618e-05,
"loss": 0.08561774492263793,
"step": 140
},
{
"epoch": 0.41779218440482624,
"grad_norm": 0.1627969165227149,
"learning_rate": 9.957496810072027e-05,
"loss": 0.083004629611969,
"step": 145
},
{
"epoch": 0.4321988114532685,
"grad_norm": 0.1467656284113813,
"learning_rate": 9.945920895650071e-05,
"loss": 0.0808147668838501,
"step": 150
},
{
"epoch": 0.4466054385017108,
"grad_norm": 0.13322475158297778,
"learning_rate": 9.932960950896981e-05,
"loss": 0.07847496271133422,
"step": 155
},
{
"epoch": 0.46101206555015306,
"grad_norm": 0.12265959792803287,
"learning_rate": 9.918620602428915e-05,
"loss": 0.07879123687744141,
"step": 160
},
{
"epoch": 0.47541869259859537,
"grad_norm": 0.11616840330675544,
"learning_rate": 9.902903863144107e-05,
"loss": 0.07581273913383484,
"step": 165
},
{
"epoch": 0.4898253196470376,
"grad_norm": 0.12203330893770489,
"learning_rate": 9.885815131099934e-05,
"loss": 0.07368603944778443,
"step": 170
},
{
"epoch": 0.5042319466954799,
"grad_norm": 0.1546308315564488,
"learning_rate": 9.867359188282192e-05,
"loss": 0.06976621150970459,
"step": 175
},
{
"epoch": 0.5186385737439222,
"grad_norm": 0.2238419658216969,
"learning_rate": 9.847541199266941e-05,
"loss": 0.07270271778106689,
"step": 180
},
{
"epoch": 0.5330452007923645,
"grad_norm": 0.1778940838233776,
"learning_rate": 9.826366709775286e-05,
"loss": 0.06899308562278747,
"step": 185
},
{
"epoch": 0.5474518278408068,
"grad_norm": 0.1236118996782577,
"learning_rate": 9.803841645121504e-05,
"loss": 0.06641653776168824,
"step": 190
},
{
"epoch": 0.561858454889249,
"grad_norm": 0.20583115161000629,
"learning_rate": 9.779972308554952e-05,
"loss": 0.06647136211395263,
"step": 195
},
{
"epoch": 0.5762650819376913,
"grad_norm": 0.13075349423709637,
"learning_rate": 9.754765379496202e-05,
"loss": 0.06856078505516053,
"step": 200
},
{
"epoch": 0.5906717089861336,
"grad_norm": 0.17339943774041067,
"learning_rate": 9.728227911667934e-05,
"loss": 0.06724534034729004,
"step": 205
},
{
"epoch": 0.605078336034576,
"grad_norm": 0.14182602625367816,
"learning_rate": 9.700367331121054e-05,
"loss": 0.06738802194595336,
"step": 210
},
{
"epoch": 0.6194849630830181,
"grad_norm": 0.16393619983216387,
"learning_rate": 9.67119143415667e-05,
"loss": 0.07073606252670288,
"step": 215
},
{
"epoch": 0.6338915901314605,
"grad_norm": 0.1328841830481596,
"learning_rate": 9.640708385144403e-05,
"loss": 0.06382153034210206,
"step": 220
},
{
"epoch": 0.6482982171799028,
"grad_norm": 0.13571266215544603,
"learning_rate": 9.608926714237754e-05,
"loss": 0.06776301860809326,
"step": 225
},
{
"epoch": 0.6627048442283451,
"grad_norm": 0.12351937744356929,
"learning_rate": 9.575855314987068e-05,
"loss": 0.06309446096420288,
"step": 230
},
{
"epoch": 0.6771114712767873,
"grad_norm": 0.12290269065618897,
"learning_rate": 9.541503441850843e-05,
"loss": 0.06422497630119324,
"step": 235
},
{
"epoch": 0.6915180983252296,
"grad_norm": 0.12486344744276894,
"learning_rate": 9.505880707606024e-05,
"loss": 0.06324135661125183,
"step": 240
},
{
"epoch": 0.7059247253736719,
"grad_norm": 0.1371721876913286,
"learning_rate": 9.468997080658031e-05,
"loss": 0.06205494403839111,
"step": 245
},
{
"epoch": 0.7203313524221142,
"grad_norm": 0.12440196659006258,
"learning_rate": 9.430862882251278e-05,
"loss": 0.057729125022888184,
"step": 250
},
{
"epoch": 0.7347379794705564,
"grad_norm": 0.11085006544539791,
"learning_rate": 9.391488783580955e-05,
"loss": 0.059876751899719236,
"step": 255
},
{
"epoch": 0.7491446065189987,
"grad_norm": 0.11611256342361528,
"learning_rate": 9.350885802806863e-05,
"loss": 0.05882802605628967,
"step": 260
},
{
"epoch": 0.763551233567441,
"grad_norm": 0.1279259846460798,
"learning_rate": 9.309065301970193e-05,
"loss": 0.06077917814254761,
"step": 265
},
{
"epoch": 0.7779578606158833,
"grad_norm": 0.11105876561542377,
"learning_rate": 9.266038983814039e-05,
"loss": 0.05303559303283691,
"step": 270
},
{
"epoch": 0.7923644876643255,
"grad_norm": 0.11671310410423168,
"learning_rate": 9.221818888508602e-05,
"loss": 0.06124954223632813,
"step": 275
},
{
"epoch": 0.8067711147127679,
"grad_norm": 0.11537085211038406,
"learning_rate": 9.176417390281944e-05,
"loss": 0.055888807773590087,
"step": 280
},
{
"epoch": 0.8211777417612102,
"grad_norm": 0.1480823536245831,
"learning_rate": 9.129847193957282e-05,
"loss": 0.056972581148147586,
"step": 285
},
{
"epoch": 0.8355843688096525,
"grad_norm": 0.15744268133880865,
"learning_rate": 9.08212133139776e-05,
"loss": 0.05824898481369019,
"step": 290
},
{
"epoch": 0.8499909958580947,
"grad_norm": 0.1397867333597395,
"learning_rate": 9.033253157859714e-05,
"loss": 0.05415785312652588,
"step": 295
},
{
"epoch": 0.864397622906537,
"grad_norm": 0.12034022108734013,
"learning_rate": 8.983256348255423e-05,
"loss": 0.05467197895050049,
"step": 300
},
{
"epoch": 0.8788042499549793,
"grad_norm": 0.12682573622924756,
"learning_rate": 8.932144893326432e-05,
"loss": 0.06181464791297912,
"step": 305
},
{
"epoch": 0.8932108770034216,
"grad_norm": 0.11321366531316682,
"learning_rate": 8.879933095728485e-05,
"loss": 0.05511963367462158,
"step": 310
},
{
"epoch": 0.9076175040518638,
"grad_norm": 0.1076394497380973,
"learning_rate": 8.826635566029166e-05,
"loss": 0.05229709148406982,
"step": 315
},
{
"epoch": 0.9220241311003061,
"grad_norm": 0.11249447920151531,
"learning_rate": 8.772267218619388e-05,
"loss": 0.05275582075119019,
"step": 320
},
{
"epoch": 0.9364307581487484,
"grad_norm": 0.11401150417345533,
"learning_rate": 8.716843267539869e-05,
"loss": 0.05470834374427795,
"step": 325
},
{
"epoch": 0.9508373851971907,
"grad_norm": 0.13321527980254963,
"learning_rate": 8.660379222223727e-05,
"loss": 0.05563476085662842,
"step": 330
},
{
"epoch": 0.9652440122456329,
"grad_norm": 0.10771804020098895,
"learning_rate": 8.602890883156454e-05,
"loss": 0.054843342304229735,
"step": 335
},
{
"epoch": 0.9796506392940753,
"grad_norm": 0.12601833333178913,
"learning_rate": 8.544394337454409e-05,
"loss": 0.05721263885498047,
"step": 340
},
{
"epoch": 0.9940572663425176,
"grad_norm": 0.12322820499048608,
"learning_rate": 8.484905954363123e-05,
"loss": 0.05096786618232727,
"step": 345
},
{
"epoch": 1.0057626508193769,
"grad_norm": 0.14089468629356533,
"learning_rate": 8.424442380676647e-05,
"loss": 0.05167339444160461,
"step": 350
},
{
"epoch": 1.0201692778678193,
"grad_norm": 0.1705872004915626,
"learning_rate": 8.363020536079239e-05,
"loss": 0.05249757170677185,
"step": 355
},
{
"epoch": 1.0345759049162615,
"grad_norm": 0.15358181481824462,
"learning_rate": 8.300657608410678e-05,
"loss": 0.05038872957229614,
"step": 360
},
{
"epoch": 1.0489825319647037,
"grad_norm": 0.13895400680332037,
"learning_rate": 8.237371048856546e-05,
"loss": 0.050058400630950926,
"step": 365
},
{
"epoch": 1.0633891590131461,
"grad_norm": 0.09560889181658183,
"learning_rate": 8.17317856706482e-05,
"loss": 0.04919912219047547,
"step": 370
},
{
"epoch": 1.0777957860615883,
"grad_norm": 0.10951811698505555,
"learning_rate": 8.108098126190129e-05,
"loss": 0.04963598847389221,
"step": 375
},
{
"epoch": 1.0922024131100305,
"grad_norm": 0.09853927812254934,
"learning_rate": 8.042147937867079e-05,
"loss": 0.046415746212005615,
"step": 380
},
{
"epoch": 1.106609040158473,
"grad_norm": 0.09238299590671381,
"learning_rate": 7.975346457114034e-05,
"loss": 0.04439312219619751,
"step": 385
},
{
"epoch": 1.1210156672069151,
"grad_norm": 0.10940030307745394,
"learning_rate": 7.907712377168817e-05,
"loss": 0.051634716987609866,
"step": 390
},
{
"epoch": 1.1354222942553576,
"grad_norm": 0.09338579936215781,
"learning_rate": 7.839264624257712e-05,
"loss": 0.04415662288665771,
"step": 395
},
{
"epoch": 1.1498289213037998,
"grad_norm": 0.10999587309136662,
"learning_rate": 7.770022352299293e-05,
"loss": 0.047378170490264895,
"step": 400
},
{
"epoch": 1.164235548352242,
"grad_norm": 0.10109309983264758,
"learning_rate": 7.700004937544542e-05,
"loss": 0.04249417781829834,
"step": 405
},
{
"epoch": 1.1786421754006844,
"grad_norm": 0.10231496239314469,
"learning_rate": 7.629231973154725e-05,
"loss": 0.04593285918235779,
"step": 410
},
{
"epoch": 1.1930488024491266,
"grad_norm": 0.1000912342655061,
"learning_rate": 7.557723263718596e-05,
"loss": 0.05370241403579712,
"step": 415
},
{
"epoch": 1.2074554294975688,
"grad_norm": 0.08355578823714238,
"learning_rate": 7.485498819710417e-05,
"loss": 0.04612640142440796,
"step": 420
},
{
"epoch": 1.2218620565460112,
"grad_norm": 0.087036754767847,
"learning_rate": 7.412578851890384e-05,
"loss": 0.043773263692855835,
"step": 425
},
{
"epoch": 1.2362686835944534,
"grad_norm": 0.09341830589519805,
"learning_rate": 7.338983765648985e-05,
"loss": 0.046638333797454835,
"step": 430
},
{
"epoch": 1.2506753106428956,
"grad_norm": 0.09163918271970233,
"learning_rate": 7.264734155296912e-05,
"loss": 0.045640939474105836,
"step": 435
},
{
"epoch": 1.265081937691338,
"grad_norm": 0.09623135416486957,
"learning_rate": 7.189850798302099e-05,
"loss": 0.04710923135280609,
"step": 440
},
{
"epoch": 1.2794885647397802,
"grad_norm": 0.09010925699278292,
"learning_rate": 7.114354649475499e-05,
"loss": 0.04437531530857086,
"step": 445
},
{
"epoch": 1.2938951917882227,
"grad_norm": 0.09828854110045074,
"learning_rate": 7.038266835107257e-05,
"loss": 0.04155453443527222,
"step": 450
},
{
"epoch": 1.3083018188366649,
"grad_norm": 0.09261388252893078,
"learning_rate": 6.961608647054873e-05,
"loss": 0.04477185308933258,
"step": 455
},
{
"epoch": 1.322708445885107,
"grad_norm": 0.09199618999958105,
"learning_rate": 6.884401536785045e-05,
"loss": 0.045587533712387086,
"step": 460
},
{
"epoch": 1.3371150729335495,
"grad_norm": 0.10296954226773448,
"learning_rate": 6.806667109370853e-05,
"loss": 0.04496743679046631,
"step": 465
},
{
"epoch": 1.3515216999819917,
"grad_norm": 0.0991741419475408,
"learning_rate": 6.728427117445948e-05,
"loss": 0.04124987423419953,
"step": 470
},
{
"epoch": 1.365928327030434,
"grad_norm": 0.08767468242608127,
"learning_rate": 6.649703455117458e-05,
"loss": 0.044256627559661865,
"step": 475
},
{
"epoch": 1.3803349540788763,
"grad_norm": 0.08419233546507805,
"learning_rate": 6.5705181518393e-05,
"loss": 0.047923988103866576,
"step": 480
},
{
"epoch": 1.3947415811273185,
"grad_norm": 0.15529323580619178,
"learning_rate": 6.490893366247612e-05,
"loss": 0.040982422232627866,
"step": 485
},
{
"epoch": 1.409148208175761,
"grad_norm": 0.08719252163236856,
"learning_rate": 6.41085137996006e-05,
"loss": 0.0431306004524231,
"step": 490
},
{
"epoch": 1.4235548352242031,
"grad_norm": 0.09381117178448978,
"learning_rate": 6.330414591340689e-05,
"loss": 0.039784133434295654,
"step": 495
},
{
"epoch": 1.4379614622726455,
"grad_norm": 0.08334433128110437,
"learning_rate": 6.249605509232149e-05,
"loss": 0.04327746033668518,
"step": 500
},
{
"epoch": 1.4523680893210877,
"grad_norm": 0.09141409005562276,
"learning_rate": 6.168446746656973e-05,
"loss": 0.04065501093864441,
"step": 505
},
{
"epoch": 1.46677471636953,
"grad_norm": 0.10836927533553822,
"learning_rate": 6.0869610144897215e-05,
"loss": 0.040621763467788695,
"step": 510
},
{
"epoch": 1.4811813434179721,
"grad_norm": 0.11429670482454558,
"learning_rate": 6.005171115101735e-05,
"loss": 0.042708945274353025,
"step": 515
},
{
"epoch": 1.4955879704664146,
"grad_norm": 0.10265027708777795,
"learning_rate": 5.9230999359802784e-05,
"loss": 0.03845831751823425,
"step": 520
},
{
"epoch": 1.509994597514857,
"grad_norm": 0.0937825232136341,
"learning_rate": 5.84077044332389e-05,
"loss": 0.04369714856147766,
"step": 525
},
{
"epoch": 1.5244012245632992,
"grad_norm": 0.14710934296521627,
"learning_rate": 5.7582056756156665e-05,
"loss": 0.04057990908622742,
"step": 530
},
{
"epoch": 1.5388078516117414,
"grad_norm": 0.08557873748617338,
"learning_rate": 5.675428737176367e-05,
"loss": 0.03988811373710632,
"step": 535
},
{
"epoch": 1.5532144786601836,
"grad_norm": 0.08304731519894865,
"learning_rate": 5.5924627916990446e-05,
"loss": 0.040156081318855286,
"step": 540
},
{
"epoch": 1.567621105708626,
"grad_norm": 0.09009100140646863,
"learning_rate": 5.5093310557671074e-05,
"loss": 0.04313129186630249,
"step": 545
},
{
"epoch": 1.5820277327570682,
"grad_norm": 0.09229023810015868,
"learning_rate": 5.426056792357551e-05,
"loss": 0.04041691720485687,
"step": 550
},
{
"epoch": 1.5964343598055106,
"grad_norm": 0.08400211717158966,
"learning_rate": 5.342663304331211e-05,
"loss": 0.04093085825443268,
"step": 555
},
{
"epoch": 1.6108409868539528,
"grad_norm": 0.09614326424875454,
"learning_rate": 5.25917392791188e-05,
"loss": 0.039686673879623414,
"step": 560
},
{
"epoch": 1.625247613902395,
"grad_norm": 0.1067845470194038,
"learning_rate": 5.1756120261560446e-05,
"loss": 0.039973828196525577,
"step": 565
},
{
"epoch": 1.6396542409508372,
"grad_norm": 0.08943621090417164,
"learning_rate": 5.092000982415162e-05,
"loss": 0.03885244131088257,
"step": 570
},
{
"epoch": 1.6540608679992796,
"grad_norm": 0.08753082979407804,
"learning_rate": 5.0083641937922145e-05,
"loss": 0.03913732171058655,
"step": 575
},
{
"epoch": 1.668467495047722,
"grad_norm": 0.09803669811995008,
"learning_rate": 4.924725064594447e-05,
"loss": 0.038859084248542786,
"step": 580
},
{
"epoch": 1.6828741220961643,
"grad_norm": 0.08541143736458823,
"learning_rate": 4.8411069997840756e-05,
"loss": 0.037244629859924314,
"step": 585
},
{
"epoch": 1.6972807491446065,
"grad_norm": 0.08650694144802851,
"learning_rate": 4.757533398428812e-05,
"loss": 0.04225952625274658,
"step": 590
},
{
"epoch": 1.7116873761930487,
"grad_norm": 0.09490787276668022,
"learning_rate": 4.674027647154037e-05,
"loss": 0.03874731659889221,
"step": 595
},
{
"epoch": 1.726094003241491,
"grad_norm": 0.07772058542302925,
"learning_rate": 4.590613113598461e-05,
"loss": 0.03750569224357605,
"step": 600
},
{
"epoch": 1.7405006302899335,
"grad_norm": 0.07856101825582532,
"learning_rate": 4.507313139875102e-05,
"loss": 0.03765683174133301,
"step": 605
},
{
"epoch": 1.7549072573383757,
"grad_norm": 0.07088260858693515,
"learning_rate": 4.4241510360393804e-05,
"loss": 0.03841148316860199,
"step": 610
},
{
"epoch": 1.769313884386818,
"grad_norm": 0.08315598782355023,
"learning_rate": 4.341150073566227e-05,
"loss": 0.03978689610958099,
"step": 615
},
{
"epoch": 1.7837205114352601,
"grad_norm": 0.08933153255691949,
"learning_rate": 4.258333478837947e-05,
"loss": 0.038895291090011594,
"step": 620
},
{
"epoch": 1.7981271384837025,
"grad_norm": 0.08396668543385523,
"learning_rate": 4.1757244266447245e-05,
"loss": 0.04072596728801727,
"step": 625
},
{
"epoch": 1.8125337655321447,
"grad_norm": 0.07957802106126194,
"learning_rate": 4.093346033699557e-05,
"loss": 0.03865320086479187,
"step": 630
},
{
"epoch": 1.8269403925805872,
"grad_norm": 0.08958406118221353,
"learning_rate": 4.011221352169447e-05,
"loss": 0.04185936748981476,
"step": 635
},
{
"epoch": 1.8413470196290294,
"grad_norm": 0.08961676019198377,
"learning_rate": 3.9293733632246544e-05,
"loss": 0.04408974051475525,
"step": 640
},
{
"epoch": 1.8557536466774716,
"grad_norm": 0.07858278806552751,
"learning_rate": 3.847824970607797e-05,
"loss": 0.04014042019844055,
"step": 645
},
{
"epoch": 1.8701602737259138,
"grad_norm": 0.07419667584622487,
"learning_rate": 3.7665989942246625e-05,
"loss": 0.03581300973892212,
"step": 650
},
{
"epoch": 1.8845669007743562,
"grad_norm": 0.08037951897237189,
"learning_rate": 3.685718163758427e-05,
"loss": 0.04189331531524658,
"step": 655
},
{
"epoch": 1.8989735278227986,
"grad_norm": 0.08133067284522653,
"learning_rate": 3.6052051123091634e-05,
"loss": 0.03912949562072754,
"step": 660
},
{
"epoch": 1.9133801548712408,
"grad_norm": 0.08974888658045152,
"learning_rate": 3.5250823700603496e-05,
"loss": 0.03808005452156067,
"step": 665
},
{
"epoch": 1.927786781919683,
"grad_norm": 0.07193212698550007,
"learning_rate": 3.445372357974194e-05,
"loss": 0.03524368405342102,
"step": 670
},
{
"epoch": 1.9421934089681252,
"grad_norm": 0.07439568567213939,
"learning_rate": 3.3660973815175165e-05,
"loss": 0.03650209903717041,
"step": 675
},
{
"epoch": 1.9566000360165676,
"grad_norm": 0.07586041788325688,
"learning_rate": 3.287279624419945e-05,
"loss": 0.036546701192855836,
"step": 680
},
{
"epoch": 1.97100666306501,
"grad_norm": 0.08294122441026296,
"learning_rate": 3.208941142466187e-05,
"loss": 0.03591431975364685,
"step": 685
},
{
"epoch": 1.9854132901134522,
"grad_norm": 0.08528763303850583,
"learning_rate": 3.1311038573240975e-05,
"loss": 0.03485568761825562,
"step": 690
},
{
"epoch": 1.9998199171618944,
"grad_norm": 0.0756456466151007,
"learning_rate": 3.0537895504102874e-05,
"loss": 0.037538421154022214,
"step": 695
},
{
"epoch": 2.0115253016387538,
"grad_norm": 0.0987258257656567,
"learning_rate": 2.9770198567949546e-05,
"loss": 0.027647560834884642,
"step": 700
},
{
"epoch": 2.025931928687196,
"grad_norm": 0.10342059226496335,
"learning_rate": 2.900816259147705e-05,
"loss": 0.03239924311637878,
"step": 705
},
{
"epoch": 2.0403385557356386,
"grad_norm": 0.08947622183974005,
"learning_rate": 2.8252000817259837e-05,
"loss": 0.02974867820739746,
"step": 710
},
{
"epoch": 2.054745182784081,
"grad_norm": 0.07819720124564082,
"learning_rate": 2.7501924844078534e-05,
"loss": 0.027856966853141783,
"step": 715
},
{
"epoch": 2.069151809832523,
"grad_norm": 0.07255651027166257,
"learning_rate": 2.6758144567707754e-05,
"loss": 0.028209209442138672,
"step": 720
},
{
"epoch": 2.083558436880965,
"grad_norm": 0.0777676865315773,
"learning_rate": 2.6020868122180385e-05,
"loss": 0.02793322205543518,
"step": 725
},
{
"epoch": 2.0979650639294074,
"grad_norm": 0.08664972293238134,
"learning_rate": 2.5290301821544825e-05,
"loss": 0.02801375389099121,
"step": 730
},
{
"epoch": 2.1123716909778496,
"grad_norm": 0.08559466896073407,
"learning_rate": 2.4566650102131573e-05,
"loss": 0.02737850546836853,
"step": 735
},
{
"epoch": 2.1267783180262922,
"grad_norm": 0.07852535239386964,
"learning_rate": 2.3850115465345324e-05,
"loss": 0.030919501185417177,
"step": 740
},
{
"epoch": 2.1411849450747344,
"grad_norm": 0.08182892636530964,
"learning_rate": 2.3140898420998426e-05,
"loss": 0.028718733787536622,
"step": 745
},
{
"epoch": 2.1555915721231766,
"grad_norm": 0.07295529971805709,
"learning_rate": 2.2439197431201646e-05,
"loss": 0.028903046250343324,
"step": 750
},
{
"epoch": 2.169998199171619,
"grad_norm": 0.07624400365106067,
"learning_rate": 2.1745208854828058e-05,
"loss": 0.024923816323280334,
"step": 755
},
{
"epoch": 2.184404826220061,
"grad_norm": 0.07567603422035397,
"learning_rate": 2.105912689256533e-05,
"loss": 0.026013752818107604,
"step": 760
},
{
"epoch": 2.1988114532685037,
"grad_norm": 0.07427613549699529,
"learning_rate": 2.0381143532572082e-05,
"loss": 0.026708921790122984,
"step": 765
},
{
"epoch": 2.213218080316946,
"grad_norm": 0.0721068508797536,
"learning_rate": 1.9711448496753297e-05,
"loss": 0.02909781038761139,
"step": 770
},
{
"epoch": 2.227624707365388,
"grad_norm": 0.09841381262275949,
"learning_rate": 1.905022918766995e-05,
"loss": 0.027940624952316286,
"step": 775
},
{
"epoch": 2.2420313344138303,
"grad_norm": 0.0816958462956758,
"learning_rate": 1.8397670636097636e-05,
"loss": 0.026423072814941405,
"step": 780
},
{
"epoch": 2.2564379614622725,
"grad_norm": 0.07936813973695164,
"learning_rate": 1.775395544924885e-05,
"loss": 0.028386065363883974,
"step": 785
},
{
"epoch": 2.270844588510715,
"grad_norm": 0.07710097062295308,
"learning_rate": 1.7119263759673675e-05,
"loss": 0.02769894599914551,
"step": 790
},
{
"epoch": 2.2852512155591573,
"grad_norm": 0.08498281330072474,
"learning_rate": 1.6493773174852673e-05,
"loss": 0.02839537858963013,
"step": 795
},
{
"epoch": 2.2996578426075995,
"grad_norm": 0.07674813377075432,
"learning_rate": 1.587765872749649e-05,
"loss": 0.02569463849067688,
"step": 800
},
{
"epoch": 2.3140644696560417,
"grad_norm": 0.06662948325098497,
"learning_rate": 1.527109282656611e-05,
"loss": 0.028371796011924744,
"step": 805
},
{
"epoch": 2.328471096704484,
"grad_norm": 0.08015839069477317,
"learning_rate": 1.4674245209027066e-05,
"loss": 0.026229003071784975,
"step": 810
},
{
"epoch": 2.3428777237529266,
"grad_norm": 0.08019588118318016,
"learning_rate": 1.4087282892351623e-05,
"loss": 0.029995208978652953,
"step": 815
},
{
"epoch": 2.3572843508013688,
"grad_norm": 0.08221863155956374,
"learning_rate": 1.3510370127781635e-05,
"loss": 0.029001206159591675,
"step": 820
},
{
"epoch": 2.371690977849811,
"grad_norm": 0.07480678399512465,
"learning_rate": 1.2943668354365878e-05,
"loss": 0.02766028940677643,
"step": 825
},
{
"epoch": 2.386097604898253,
"grad_norm": 0.07477452302806815,
"learning_rate": 1.2387336153784018e-05,
"loss": 0.02593517005443573,
"step": 830
},
{
"epoch": 2.4005042319466954,
"grad_norm": 0.07081183958851973,
"learning_rate": 1.184152920597028e-05,
"loss": 0.026943469047546388,
"step": 835
},
{
"epoch": 2.4149108589951376,
"grad_norm": 0.07536754957279856,
"learning_rate": 1.1306400245549158e-05,
"loss": 0.024954386055469513,
"step": 840
},
{
"epoch": 2.42931748604358,
"grad_norm": 0.06344152496317775,
"learning_rate": 1.0782099019095238e-05,
"loss": 0.028272977471351622,
"step": 845
},
{
"epoch": 2.4437241130920224,
"grad_norm": 0.0644553682371491,
"learning_rate": 1.026877224322923e-05,
"loss": 0.02370927333831787,
"step": 850
},
{
"epoch": 2.4581307401404646,
"grad_norm": 0.07529675849595874,
"learning_rate": 9.766563563561799e-06,
"loss": 0.025498074293136597,
"step": 855
},
{
"epoch": 2.472537367188907,
"grad_norm": 0.08420954265091966,
"learning_rate": 9.275613514496977e-06,
"loss": 0.02770912051200867,
"step": 860
},
{
"epoch": 2.486943994237349,
"grad_norm": 0.0744332415489311,
"learning_rate": 8.7960594799059e-06,
"loss": 0.027615338563919067,
"step": 865
},
{
"epoch": 2.501350621285791,
"grad_norm": 0.07212967627396147,
"learning_rate": 8.328035654682325e-06,
"loss": 0.027428582310676575,
"step": 870
},
{
"epoch": 2.515757248334234,
"grad_norm": 0.08246547759863139,
"learning_rate": 7.871673007190599e-06,
"loss": 0.026888126134872438,
"step": 875
},
{
"epoch": 2.530163875382676,
"grad_norm": 0.06863337011207567,
"learning_rate": 7.427099242616348e-06,
"loss": 0.025411182641983034,
"step": 880
},
{
"epoch": 2.5445705024311183,
"grad_norm": 0.06777467806972155,
"learning_rate": 6.994438767230466e-06,
"loss": 0.024811127781867982,
"step": 885
},
{
"epoch": 2.5589771294795605,
"grad_norm": 0.07029495896606512,
"learning_rate": 6.573812653576062e-06,
"loss": 0.02613699436187744,
"step": 890
},
{
"epoch": 2.5733837565280027,
"grad_norm": 0.07134936463967867,
"learning_rate": 6.1653386065885165e-06,
"loss": 0.026964515447616577,
"step": 895
},
{
"epoch": 2.5877903835764453,
"grad_norm": 0.07711841632882044,
"learning_rate": 5.769130930657734e-06,
"loss": 0.028112486004829407,
"step": 900
},
{
"epoch": 2.6021970106248875,
"grad_norm": 0.08360128959008864,
"learning_rate": 5.38530049764206e-06,
"loss": 0.02626214623451233,
"step": 905
},
{
"epoch": 2.6166036376733297,
"grad_norm": 0.07456201121764428,
"learning_rate": 5.0139547158427e-06,
"loss": 0.02669944763183594,
"step": 910
},
{
"epoch": 2.631010264721772,
"grad_norm": 0.07740576081667884,
"learning_rate": 4.655197499947378e-06,
"loss": 0.029006192088127138,
"step": 915
},
{
"epoch": 2.645416891770214,
"grad_norm": 0.06845350619031464,
"learning_rate": 4.309129241951587e-06,
"loss": 0.02491077184677124,
"step": 920
},
{
"epoch": 2.6598235188186568,
"grad_norm": 0.07501903308333313,
"learning_rate": 3.975846783065662e-06,
"loss": 0.026326572895050047,
"step": 925
},
{
"epoch": 2.674230145867099,
"grad_norm": 0.07580375293031513,
"learning_rate": 3.6554433866154036e-06,
"loss": 0.026823589205741884,
"step": 930
},
{
"epoch": 2.688636772915541,
"grad_norm": 0.06969116474563261,
"learning_rate": 3.3480087119440063e-06,
"loss": 0.025913709402084352,
"step": 935
},
{
"epoch": 2.7030433999639834,
"grad_norm": 0.0714630826160477,
"learning_rate": 3.0536287893223604e-06,
"loss": 0.026928871870040894,
"step": 940
},
{
"epoch": 2.7174500270124256,
"grad_norm": 0.07358152299227637,
"learning_rate": 2.7723859958750486e-06,
"loss": 0.02748822569847107,
"step": 945
},
{
"epoch": 2.731856654060868,
"grad_norm": 0.06838564316740577,
"learning_rate": 2.5043590325285195e-06,
"loss": 0.025952500104904175,
"step": 950
},
{
"epoch": 2.7462632811093104,
"grad_norm": 0.07787109185214655,
"learning_rate": 2.249622901987963e-06,
"loss": 0.02589995265007019,
"step": 955
},
{
"epoch": 2.7606699081577526,
"grad_norm": 0.07156945963749864,
"learning_rate": 2.0082488877491033e-06,
"loss": 0.027577921748161316,
"step": 960
},
{
"epoch": 2.775076535206195,
"grad_norm": 0.06514188446012159,
"learning_rate": 1.7803045341507952e-06,
"loss": 0.025488072633743288,
"step": 965
},
{
"epoch": 2.789483162254637,
"grad_norm": 0.0712195602884753,
"learning_rate": 1.5658536274738621e-06,
"loss": 0.02348570078611374,
"step": 970
},
{
"epoch": 2.8038897893030796,
"grad_norm": 0.0680133235009968,
"learning_rate": 1.3649561780916199e-06,
"loss": 0.02316732406616211,
"step": 975
},
{
"epoch": 2.818296416351522,
"grad_norm": 0.0824565977146897,
"learning_rate": 1.1776684036770347e-06,
"loss": 0.02901957035064697,
"step": 980
},
{
"epoch": 2.832703043399964,
"grad_norm": 0.08111572063117606,
"learning_rate": 1.004042713471165e-06,
"loss": 0.02710677683353424,
"step": 985
},
{
"epoch": 2.8471096704484062,
"grad_norm": 0.07416113908713114,
"learning_rate": 8.441276936173193e-07,
"loss": 0.024537976086139678,
"step": 990
},
{
"epoch": 2.8615162974968484,
"grad_norm": 0.06645937685734804,
"learning_rate": 6.9796809356511e-07,
"loss": 0.025470972061157227,
"step": 995
},
{
"epoch": 2.875922924545291,
"grad_norm": 0.07056688302520532,
"learning_rate": 5.656048135480763e-07,
"loss": 0.025230163335800172,
"step": 1000
},
{
"epoch": 2.8903295515937333,
"grad_norm": 0.07480029198072068,
"learning_rate": 4.470748931384494e-07,
"loss": 0.026770299673080443,
"step": 1005
},
{
"epoch": 2.9047361786421755,
"grad_norm": 0.06476290220031579,
"learning_rate": 3.424115008822726e-07,
"loss": 0.026645660400390625,
"step": 1010
},
{
"epoch": 2.9191428056906177,
"grad_norm": 0.07374044092567203,
"learning_rate": 2.5164392501777487e-07,
"loss": 0.025820019841194152,
"step": 1015
},
{
"epoch": 2.93354943273906,
"grad_norm": 0.07098709082144111,
"learning_rate": 1.7479756527955527e-07,
"loss": 0.025720816850662232,
"step": 1020
},
{
"epoch": 2.9479560597875025,
"grad_norm": 0.07593395611493338,
"learning_rate": 1.1189392579090129e-07,
"loss": 0.024733534455299376,
"step": 1025
},
{
"epoch": 2.9623626868359443,
"grad_norm": 0.07179585283776127,
"learning_rate": 6.295060904623617e-08,
"loss": 0.02832019031047821,
"step": 1030
},
{
"epoch": 2.976769313884387,
"grad_norm": 0.06802635060193646,
"learning_rate": 2.7981310985369935e-08,
"loss": 0.025465887784957886,
"step": 1035
},
{
"epoch": 2.991175940932829,
"grad_norm": 0.0759224455019542,
"learning_rate": 6.995817160920792e-09,
"loss": 0.0264853298664093,
"step": 1040
},
{
"epoch": 3.0,
"step": 1044,
"total_flos": 1577088536150016.0,
"train_loss": 0.06165807318099385,
"train_runtime": 23128.4215,
"train_samples_per_second": 2.881,
"train_steps_per_second": 0.045
}
],
"logging_steps": 5,
"max_steps": 1044,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 207,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1577088536150016.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}