Files
open-sci-ref-v0.02-1.7b-nem…/trainer_state.json
ModelHub XC 42efe026c5 初始化项目,由ModelHub XC社区提供模型
Model: ali-elganzory/open-sci-ref-v0.02-1.7b-nemotron-hq-300B-16384-rope_theta-1M-long_sft_16k
Source: Original Platform
2026-05-14 20:28:26 +08:00

2312 lines
56 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1623,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030807147258163892,
"grad_norm": 0.5412173867225647,
"learning_rate": 9.756097560975611e-06,
"loss": 1.1518,
"step": 5
},
{
"epoch": 0.0061614294516327784,
"grad_norm": 0.4266219735145569,
"learning_rate": 2.1951219512195124e-05,
"loss": 1.2475,
"step": 10
},
{
"epoch": 0.009242144177449169,
"grad_norm": 0.28109025955200195,
"learning_rate": 3.414634146341464e-05,
"loss": 1.2543,
"step": 15
},
{
"epoch": 0.012322858903265557,
"grad_norm": 0.4827917814254761,
"learning_rate": 4.634146341463415e-05,
"loss": 1.2007,
"step": 20
},
{
"epoch": 0.015403573629081947,
"grad_norm": 1.1757270097732544,
"learning_rate": 5.853658536585366e-05,
"loss": 1.0335,
"step": 25
},
{
"epoch": 0.018484288354898338,
"grad_norm": 0.5378533601760864,
"learning_rate": 7.073170731707317e-05,
"loss": 1.0887,
"step": 30
},
{
"epoch": 0.021565003080714726,
"grad_norm": 0.32407647371292114,
"learning_rate": 8.292682926829268e-05,
"loss": 1.1421,
"step": 35
},
{
"epoch": 0.024645717806531114,
"grad_norm": 0.310761034488678,
"learning_rate": 9.51219512195122e-05,
"loss": 1.1532,
"step": 40
},
{
"epoch": 0.027726432532347505,
"grad_norm": 0.770730197429657,
"learning_rate": 0.00010731707317073172,
"loss": 1.1098,
"step": 45
},
{
"epoch": 0.030807147258163893,
"grad_norm": 0.7011957764625549,
"learning_rate": 0.00011951219512195122,
"loss": 0.9678,
"step": 50
},
{
"epoch": 0.033887861983980284,
"grad_norm": 0.37764063477516174,
"learning_rate": 0.00013170731707317076,
"loss": 1.0811,
"step": 55
},
{
"epoch": 0.036968576709796676,
"grad_norm": 0.36831024289131165,
"learning_rate": 0.00014390243902439025,
"loss": 1.1093,
"step": 60
},
{
"epoch": 0.04004929143561306,
"grad_norm": 0.26355060935020447,
"learning_rate": 0.00015609756097560978,
"loss": 1.0733,
"step": 65
},
{
"epoch": 0.04313000616142945,
"grad_norm": 0.656220018863678,
"learning_rate": 0.00016829268292682927,
"loss": 1.1062,
"step": 70
},
{
"epoch": 0.04621072088724584,
"grad_norm": 0.7080119252204895,
"learning_rate": 0.0001804878048780488,
"loss": 0.9404,
"step": 75
},
{
"epoch": 0.04929143561306223,
"grad_norm": 0.3640540838241577,
"learning_rate": 0.0001926829268292683,
"loss": 0.9997,
"step": 80
},
{
"epoch": 0.05237215033887862,
"grad_norm": 0.327878475189209,
"learning_rate": 0.0001999991687649223,
"loss": 1.038,
"step": 85
},
{
"epoch": 0.05545286506469501,
"grad_norm": 0.2545090317726135,
"learning_rate": 0.00019998981752900036,
"loss": 1.1127,
"step": 90
},
{
"epoch": 0.0585335797905114,
"grad_norm": 0.729415237903595,
"learning_rate": 0.00019997007698817557,
"loss": 1.1212,
"step": 95
},
{
"epoch": 0.061614294516327786,
"grad_norm": 0.5619292259216309,
"learning_rate": 0.00019993994919356167,
"loss": 0.8807,
"step": 100
},
{
"epoch": 0.06469500924214418,
"grad_norm": 0.7847259640693665,
"learning_rate": 0.00019989943727554598,
"loss": 0.9791,
"step": 105
},
{
"epoch": 0.06777572396796057,
"grad_norm": 0.2865511178970337,
"learning_rate": 0.00019984854544346367,
"loss": 1.0313,
"step": 110
},
{
"epoch": 0.07085643869377696,
"grad_norm": 0.27198705077171326,
"learning_rate": 0.00019978727898516086,
"loss": 1.073,
"step": 115
},
{
"epoch": 0.07393715341959335,
"grad_norm": 0.7417078018188477,
"learning_rate": 0.0001997156442664449,
"loss": 0.9766,
"step": 120
},
{
"epoch": 0.07701786814540973,
"grad_norm": 0.5739508271217346,
"learning_rate": 0.00019963364873042298,
"loss": 0.8606,
"step": 125
},
{
"epoch": 0.08009858287122612,
"grad_norm": 0.28192320466041565,
"learning_rate": 0.0001995413008967289,
"loss": 1.0665,
"step": 130
},
{
"epoch": 0.08317929759704251,
"grad_norm": 0.28555136919021606,
"learning_rate": 0.00019943861036063768,
"loss": 1.0262,
"step": 135
},
{
"epoch": 0.0862600123228589,
"grad_norm": 0.2717398405075073,
"learning_rate": 0.00019932558779206874,
"loss": 1.0675,
"step": 140
},
{
"epoch": 0.0893407270486753,
"grad_norm": 0.6025347113609314,
"learning_rate": 0.00019920224493447702,
"loss": 1.069,
"step": 145
},
{
"epoch": 0.09242144177449169,
"grad_norm": 0.5110194683074951,
"learning_rate": 0.00019906859460363307,
"loss": 0.8611,
"step": 150
},
{
"epoch": 0.09550215650030808,
"grad_norm": 0.3318012058734894,
"learning_rate": 0.00019892465068629131,
"loss": 0.9906,
"step": 155
},
{
"epoch": 0.09858287122612445,
"grad_norm": 0.25904572010040283,
"learning_rate": 0.0001987704281387471,
"loss": 1.1415,
"step": 160
},
{
"epoch": 0.10166358595194085,
"grad_norm": 0.24928759038448334,
"learning_rate": 0.00019860594298528282,
"loss": 1.1192,
"step": 165
},
{
"epoch": 0.10474430067775724,
"grad_norm": 0.724487841129303,
"learning_rate": 0.0001984312123165028,
"loss": 1.1348,
"step": 170
},
{
"epoch": 0.10782501540357363,
"grad_norm": 0.4745875597000122,
"learning_rate": 0.0001982462542875576,
"loss": 0.8411,
"step": 175
},
{
"epoch": 0.11090573012939002,
"grad_norm": 0.2991827726364136,
"learning_rate": 0.00019805108811625773,
"loss": 1.0013,
"step": 180
},
{
"epoch": 0.11398644485520641,
"grad_norm": 0.2740406394004822,
"learning_rate": 0.00019784573408107657,
"loss": 0.9905,
"step": 185
},
{
"epoch": 0.1170671595810228,
"grad_norm": 0.2254003882408142,
"learning_rate": 0.00019763021351904358,
"loss": 0.9899,
"step": 190
},
{
"epoch": 0.12014787430683918,
"grad_norm": 0.4834403991699219,
"learning_rate": 0.00019740454882352732,
"loss": 1.0253,
"step": 195
},
{
"epoch": 0.12322858903265557,
"grad_norm": 0.5589045882225037,
"learning_rate": 0.0001971687634419086,
"loss": 0.8803,
"step": 200
},
{
"epoch": 0.12630930375847196,
"grad_norm": 0.36558014154434204,
"learning_rate": 0.0001969228818731442,
"loss": 0.9944,
"step": 205
},
{
"epoch": 0.12939001848428835,
"grad_norm": 0.2908712327480316,
"learning_rate": 0.00019666692966522145,
"loss": 1.0408,
"step": 210
},
{
"epoch": 0.13247073321010475,
"grad_norm": 0.260887086391449,
"learning_rate": 0.00019640093341250357,
"loss": 0.9837,
"step": 215
},
{
"epoch": 0.13555144793592114,
"grad_norm": 0.5799285769462585,
"learning_rate": 0.0001961249207529665,
"loss": 1.0473,
"step": 220
},
{
"epoch": 0.13863216266173753,
"grad_norm": 0.6132811903953552,
"learning_rate": 0.00019583892036532726,
"loss": 0.872,
"step": 225
},
{
"epoch": 0.14171287738755392,
"grad_norm": 0.28405216336250305,
"learning_rate": 0.00019554296196606395,
"loss": 0.9982,
"step": 230
},
{
"epoch": 0.1447935921133703,
"grad_norm": 0.2680262327194214,
"learning_rate": 0.00019523707630632835,
"loss": 0.9808,
"step": 235
},
{
"epoch": 0.1478743068391867,
"grad_norm": 0.22932539880275726,
"learning_rate": 0.00019492129516875055,
"loss": 1.045,
"step": 240
},
{
"epoch": 0.15095502156500307,
"grad_norm": 0.5834919810295105,
"learning_rate": 0.00019459565136413666,
"loss": 1.0869,
"step": 245
},
{
"epoch": 0.15403573629081946,
"grad_norm": 0.5460705161094666,
"learning_rate": 0.0001942601787280598,
"loss": 0.8503,
"step": 250
},
{
"epoch": 0.15711645101663585,
"grad_norm": 0.3478216230869293,
"learning_rate": 0.00019391491211734425,
"loss": 0.9952,
"step": 255
},
{
"epoch": 0.16019716574245224,
"grad_norm": 0.2494744062423706,
"learning_rate": 0.0001935598874064438,
"loss": 0.9793,
"step": 260
},
{
"epoch": 0.16327788046826863,
"grad_norm": 0.23242679238319397,
"learning_rate": 0.00019319514148371435,
"loss": 0.9427,
"step": 265
},
{
"epoch": 0.16635859519408502,
"grad_norm": 0.5289666056632996,
"learning_rate": 0.00019282071224758091,
"loss": 1.0259,
"step": 270
},
{
"epoch": 0.16943930991990142,
"grad_norm": 0.5127719640731812,
"learning_rate": 0.00019243663860259993,
"loss": 0.8559,
"step": 275
},
{
"epoch": 0.1725200246457178,
"grad_norm": 0.3006535768508911,
"learning_rate": 0.00019204296045541685,
"loss": 0.9995,
"step": 280
},
{
"epoch": 0.1756007393715342,
"grad_norm": 0.2744085192680359,
"learning_rate": 0.0001916397187106199,
"loss": 0.9409,
"step": 285
},
{
"epoch": 0.1786814540973506,
"grad_norm": 0.2169232964515686,
"learning_rate": 0.00019122695526648968,
"loss": 0.9847,
"step": 290
},
{
"epoch": 0.18176216882316698,
"grad_norm": 0.5880517959594727,
"learning_rate": 0.00019080471301064598,
"loss": 1.0982,
"step": 295
},
{
"epoch": 0.18484288354898337,
"grad_norm": 0.5316351056098938,
"learning_rate": 0.00019037303581559143,
"loss": 0.8299,
"step": 300
},
{
"epoch": 0.18792359827479976,
"grad_norm": 0.31043657660484314,
"learning_rate": 0.00018993196853415317,
"loss": 0.9737,
"step": 305
},
{
"epoch": 0.19100431300061615,
"grad_norm": 0.3109757602214813,
"learning_rate": 0.00018948155699482244,
"loss": 0.9551,
"step": 310
},
{
"epoch": 0.19408502772643252,
"grad_norm": 0.24996116757392883,
"learning_rate": 0.00018902184799699263,
"loss": 1.057,
"step": 315
},
{
"epoch": 0.1971657424522489,
"grad_norm": 0.4340403974056244,
"learning_rate": 0.00018855288930609692,
"loss": 1.0065,
"step": 320
},
{
"epoch": 0.2002464571780653,
"grad_norm": 0.47125184535980225,
"learning_rate": 0.00018807472964864515,
"loss": 0.8492,
"step": 325
},
{
"epoch": 0.2033271719038817,
"grad_norm": 0.335504949092865,
"learning_rate": 0.00018758741870716092,
"loss": 1.0248,
"step": 330
},
{
"epoch": 0.20640788662969808,
"grad_norm": 0.24601280689239502,
"learning_rate": 0.00018709100711501955,
"loss": 1.0095,
"step": 335
},
{
"epoch": 0.20948860135551448,
"grad_norm": 0.2395162731409073,
"learning_rate": 0.0001865855464511869,
"loss": 0.9469,
"step": 340
},
{
"epoch": 0.21256931608133087,
"grad_norm": 0.4389413893222809,
"learning_rate": 0.00018607108923486025,
"loss": 0.8772,
"step": 345
},
{
"epoch": 0.21565003080714726,
"grad_norm": 0.5434815287590027,
"learning_rate": 0.00018554768892001136,
"loss": 0.8309,
"step": 350
},
{
"epoch": 0.21873074553296365,
"grad_norm": 0.3031887114048004,
"learning_rate": 0.00018501539988983234,
"loss": 0.8526,
"step": 355
},
{
"epoch": 0.22181146025878004,
"grad_norm": 0.2740094065666199,
"learning_rate": 0.0001844742774510851,
"loss": 0.9808,
"step": 360
},
{
"epoch": 0.22489217498459643,
"grad_norm": 0.20324410498142242,
"learning_rate": 0.00018392437782835475,
"loss": 0.9952,
"step": 365
},
{
"epoch": 0.22797288971041282,
"grad_norm": 0.5303752422332764,
"learning_rate": 0.00018336575815820766,
"loss": 1.011,
"step": 370
},
{
"epoch": 0.23105360443622922,
"grad_norm": 0.5991610884666443,
"learning_rate": 0.00018279847648325478,
"loss": 0.8487,
"step": 375
},
{
"epoch": 0.2341343191620456,
"grad_norm": 0.3495015501976013,
"learning_rate": 0.0001822225917461208,
"loss": 0.9038,
"step": 380
},
{
"epoch": 0.23721503388786197,
"grad_norm": 0.35118335485458374,
"learning_rate": 0.0001816381637833198,
"loss": 0.9601,
"step": 385
},
{
"epoch": 0.24029574861367836,
"grad_norm": 0.2514159679412842,
"learning_rate": 0.00018104525331903799,
"loss": 1.0495,
"step": 390
},
{
"epoch": 0.24337646333949475,
"grad_norm": 0.5237391591072083,
"learning_rate": 0.00018044392195882427,
"loss": 1.0792,
"step": 395
},
{
"epoch": 0.24645717806531114,
"grad_norm": 0.5198184251785278,
"learning_rate": 0.00017983423218318918,
"loss": 0.8639,
"step": 400
},
{
"epoch": 0.24953789279112754,
"grad_norm": 0.31769484281539917,
"learning_rate": 0.00017921624734111292,
"loss": 0.9426,
"step": 405
},
{
"epoch": 0.2526186075169439,
"grad_norm": 0.27527257800102234,
"learning_rate": 0.00017859003164346336,
"loss": 0.9744,
"step": 410
},
{
"epoch": 0.2556993222427603,
"grad_norm": 0.22283877432346344,
"learning_rate": 0.0001779556501563239,
"loss": 0.9612,
"step": 415
},
{
"epoch": 0.2587800369685767,
"grad_norm": 0.4636116623878479,
"learning_rate": 0.00017731316879423327,
"loss": 1.034,
"step": 420
},
{
"epoch": 0.2618607516943931,
"grad_norm": 0.47125929594039917,
"learning_rate": 0.00017666265431333654,
"loss": 0.8632,
"step": 425
},
{
"epoch": 0.2649414664202095,
"grad_norm": 0.28012779355049133,
"learning_rate": 0.000176004174304449,
"loss": 0.9842,
"step": 430
},
{
"epoch": 0.2680221811460259,
"grad_norm": 0.2671544551849365,
"learning_rate": 0.00017533779718603313,
"loss": 0.9874,
"step": 435
},
{
"epoch": 0.2711028958718423,
"grad_norm": 0.22700628638267517,
"learning_rate": 0.00017466359219708985,
"loss": 0.9457,
"step": 440
},
{
"epoch": 0.27418361059765867,
"grad_norm": 0.41735100746154785,
"learning_rate": 0.00017398162938996422,
"loss": 0.9501,
"step": 445
},
{
"epoch": 0.27726432532347506,
"grad_norm": 0.37333425879478455,
"learning_rate": 0.00017329197962306664,
"loss": 0.8123,
"step": 450
},
{
"epoch": 0.28034504004929145,
"grad_norm": 0.2996601164340973,
"learning_rate": 0.00017259471455351072,
"loss": 0.9048,
"step": 455
},
{
"epoch": 0.28342575477510784,
"grad_norm": 0.25390616059303284,
"learning_rate": 0.0001718899066296675,
"loss": 0.9711,
"step": 460
},
{
"epoch": 0.28650646950092423,
"grad_norm": 0.25309455394744873,
"learning_rate": 0.000171177629083638,
"loss": 0.9762,
"step": 465
},
{
"epoch": 0.2895871842267406,
"grad_norm": 0.46593979001045227,
"learning_rate": 0.0001704579559236441,
"loss": 1.0148,
"step": 470
},
{
"epoch": 0.292667898952557,
"grad_norm": 0.5357691645622253,
"learning_rate": 0.00016973096192633884,
"loss": 0.786,
"step": 475
},
{
"epoch": 0.2957486136783734,
"grad_norm": 0.3320036828517914,
"learning_rate": 0.00016899672262903677,
"loss": 0.9034,
"step": 480
},
{
"epoch": 0.2988293284041898,
"grad_norm": 0.2919875681400299,
"learning_rate": 0.00016825531432186543,
"loss": 0.9694,
"step": 485
},
{
"epoch": 0.30191004313000613,
"grad_norm": 0.2110517919063568,
"learning_rate": 0.00016750681403983846,
"loss": 1.0684,
"step": 490
},
{
"epoch": 0.3049907578558225,
"grad_norm": 0.8895492553710938,
"learning_rate": 0.00016675129955485152,
"loss": 0.9935,
"step": 495
},
{
"epoch": 0.3080714725816389,
"grad_norm": 0.4201313555240631,
"learning_rate": 0.00016598884936760131,
"loss": 0.8232,
"step": 500
},
{
"epoch": 0.3111521873074553,
"grad_norm": 0.2772030234336853,
"learning_rate": 0.00016521954269942918,
"loss": 0.989,
"step": 505
},
{
"epoch": 0.3142329020332717,
"grad_norm": 0.24938176572322845,
"learning_rate": 0.00016444345948408984,
"loss": 0.9521,
"step": 510
},
{
"epoch": 0.3173136167590881,
"grad_norm": 0.23586159944534302,
"learning_rate": 0.0001636606803594457,
"loss": 1.0013,
"step": 515
},
{
"epoch": 0.3203943314849045,
"grad_norm": 0.6195285320281982,
"learning_rate": 0.0001628712866590885,
"loss": 0.9773,
"step": 520
},
{
"epoch": 0.3234750462107209,
"grad_norm": 0.5146563053131104,
"learning_rate": 0.00016207536040388845,
"loss": 0.8414,
"step": 525
},
{
"epoch": 0.32655576093653726,
"grad_norm": 0.30340778827667236,
"learning_rate": 0.0001612729842934718,
"loss": 0.9793,
"step": 530
},
{
"epoch": 0.32963647566235366,
"grad_norm": 0.2670872211456299,
"learning_rate": 0.00016046424169762827,
"loss": 1.0042,
"step": 535
},
{
"epoch": 0.33271719038817005,
"grad_norm": 0.2140674591064453,
"learning_rate": 0.0001596492166476485,
"loss": 1.0067,
"step": 540
},
{
"epoch": 0.33579790511398644,
"grad_norm": 0.42335960268974304,
"learning_rate": 0.0001588279938275929,
"loss": 0.9971,
"step": 545
},
{
"epoch": 0.33887861983980283,
"grad_norm": 0.5136492252349854,
"learning_rate": 0.00015800065856549269,
"loss": 0.7794,
"step": 550
},
{
"epoch": 0.3419593345656192,
"grad_norm": 0.28528350591659546,
"learning_rate": 0.00015716729682448393,
"loss": 0.9553,
"step": 555
},
{
"epoch": 0.3450400492914356,
"grad_norm": 0.23635320365428925,
"learning_rate": 0.0001563279951938758,
"loss": 0.9601,
"step": 560
},
{
"epoch": 0.348120764017252,
"grad_norm": 0.21445675194263458,
"learning_rate": 0.00015548284088015354,
"loss": 1.0177,
"step": 565
},
{
"epoch": 0.3512014787430684,
"grad_norm": 0.494815856218338,
"learning_rate": 0.00015463192169791741,
"loss": 0.9958,
"step": 570
},
{
"epoch": 0.3542821934688848,
"grad_norm": 0.4995960295200348,
"learning_rate": 0.0001537753260607584,
"loss": 0.8352,
"step": 575
},
{
"epoch": 0.3573629081947012,
"grad_norm": 0.2728192210197449,
"learning_rate": 0.00015291314297207175,
"loss": 0.9472,
"step": 580
},
{
"epoch": 0.36044362292051757,
"grad_norm": 0.21085520088672638,
"learning_rate": 0.0001520454620158093,
"loss": 0.9853,
"step": 585
},
{
"epoch": 0.36352433764633396,
"grad_norm": 0.21773026883602142,
"learning_rate": 0.00015117237334717117,
"loss": 0.9141,
"step": 590
},
{
"epoch": 0.36660505237215035,
"grad_norm": 0.4270155131816864,
"learning_rate": 0.00015029396768323846,
"loss": 1.0516,
"step": 595
},
{
"epoch": 0.36968576709796674,
"grad_norm": 0.444807767868042,
"learning_rate": 0.00014941033629354734,
"loss": 0.8681,
"step": 600
},
{
"epoch": 0.37276648182378314,
"grad_norm": 0.29151108860969543,
"learning_rate": 0.00014852157099060596,
"loss": 0.9942,
"step": 605
},
{
"epoch": 0.3758471965495995,
"grad_norm": 0.2614330053329468,
"learning_rate": 0.00014762776412035456,
"loss": 1.0202,
"step": 610
},
{
"epoch": 0.3789279112754159,
"grad_norm": 0.22319279611110687,
"learning_rate": 0.00014672900855257056,
"loss": 0.9941,
"step": 615
},
{
"epoch": 0.3820086260012323,
"grad_norm": 0.5060321688652039,
"learning_rate": 0.00014582539767121904,
"loss": 0.9866,
"step": 620
},
{
"epoch": 0.3850893407270487,
"grad_norm": 0.46248483657836914,
"learning_rate": 0.0001449170253647498,
"loss": 0.741,
"step": 625
},
{
"epoch": 0.38817005545286504,
"grad_norm": 0.31994864344596863,
"learning_rate": 0.0001440039860163419,
"loss": 0.9465,
"step": 630
},
{
"epoch": 0.39125077017868143,
"grad_norm": 0.2810644209384918,
"learning_rate": 0.00014308637449409706,
"loss": 0.9403,
"step": 635
},
{
"epoch": 0.3943314849044978,
"grad_norm": 0.22498297691345215,
"learning_rate": 0.00014216428614118243,
"loss": 1.0146,
"step": 640
},
{
"epoch": 0.3974121996303142,
"grad_norm": 0.4635995924472809,
"learning_rate": 0.00014123781676592418,
"loss": 0.9778,
"step": 645
},
{
"epoch": 0.4004929143561306,
"grad_norm": 0.4592895805835724,
"learning_rate": 0.00014030706263185247,
"loss": 0.8311,
"step": 650
},
{
"epoch": 0.403573629081947,
"grad_norm": 0.30809083580970764,
"learning_rate": 0.00013937212044769955,
"loss": 0.9141,
"step": 655
},
{
"epoch": 0.4066543438077634,
"grad_norm": 0.32495659589767456,
"learning_rate": 0.0001384330873573513,
"loss": 0.9867,
"step": 660
},
{
"epoch": 0.4097350585335798,
"grad_norm": 0.22196783125400543,
"learning_rate": 0.00013749006092975347,
"loss": 1.0004,
"step": 665
},
{
"epoch": 0.41281577325939617,
"grad_norm": 0.5153515934944153,
"learning_rate": 0.00013654313914877414,
"loss": 0.9771,
"step": 670
},
{
"epoch": 0.41589648798521256,
"grad_norm": 0.43011271953582764,
"learning_rate": 0.00013559242040302272,
"loss": 0.7806,
"step": 675
},
{
"epoch": 0.41897720271102895,
"grad_norm": 0.2914900779724121,
"learning_rate": 0.00013463800347562706,
"loss": 0.9531,
"step": 680
},
{
"epoch": 0.42205791743684534,
"grad_norm": 0.27283361554145813,
"learning_rate": 0.00013367998753396944,
"loss": 0.8862,
"step": 685
},
{
"epoch": 0.42513863216266173,
"grad_norm": 0.22260256111621857,
"learning_rate": 0.00013271847211938285,
"loss": 0.978,
"step": 690
},
{
"epoch": 0.4282193468884781,
"grad_norm": 0.49871596693992615,
"learning_rate": 0.0001317535571368082,
"loss": 1.0035,
"step": 695
},
{
"epoch": 0.4313000616142945,
"grad_norm": 0.4641573429107666,
"learning_rate": 0.00013078534284441382,
"loss": 0.8737,
"step": 700
},
{
"epoch": 0.4343807763401109,
"grad_norm": 0.2758205235004425,
"learning_rate": 0.00012981392984317834,
"loss": 0.9117,
"step": 705
},
{
"epoch": 0.4374614910659273,
"grad_norm": 0.266637921333313,
"learning_rate": 0.00012883941906643786,
"loss": 0.9657,
"step": 710
},
{
"epoch": 0.4405422057917437,
"grad_norm": 0.23960596323013306,
"learning_rate": 0.00012786191176939848,
"loss": 0.9081,
"step": 715
},
{
"epoch": 0.4436229205175601,
"grad_norm": 0.4470706880092621,
"learning_rate": 0.00012688150951861582,
"loss": 0.9299,
"step": 720
},
{
"epoch": 0.4467036352433765,
"grad_norm": 0.5093996524810791,
"learning_rate": 0.00012589831418144154,
"loss": 0.8259,
"step": 725
},
{
"epoch": 0.44978434996919286,
"grad_norm": 0.2514059841632843,
"learning_rate": 0.00012491242791543922,
"loss": 0.9407,
"step": 730
},
{
"epoch": 0.45286506469500926,
"grad_norm": 0.25785940885543823,
"learning_rate": 0.00012392395315776963,
"loss": 0.9092,
"step": 735
},
{
"epoch": 0.45594577942082565,
"grad_norm": 0.2544153034687042,
"learning_rate": 0.00012293299261454725,
"loss": 0.9285,
"step": 740
},
{
"epoch": 0.45902649414664204,
"grad_norm": 0.4648970663547516,
"learning_rate": 0.00012193964925016872,
"loss": 0.9379,
"step": 745
},
{
"epoch": 0.46210720887245843,
"grad_norm": 0.5296097993850708,
"learning_rate": 0.00012094402627661447,
"loss": 0.7754,
"step": 750
},
{
"epoch": 0.4651879235982748,
"grad_norm": 0.2834993898868561,
"learning_rate": 0.00011994622714272448,
"loss": 0.9358,
"step": 755
},
{
"epoch": 0.4682686383240912,
"grad_norm": 0.329428493976593,
"learning_rate": 0.00011894635552344975,
"loss": 0.9574,
"step": 760
},
{
"epoch": 0.4713493530499076,
"grad_norm": 0.20489312708377838,
"learning_rate": 0.00011794451530908011,
"loss": 0.9345,
"step": 765
},
{
"epoch": 0.47443006777572394,
"grad_norm": 0.5085513591766357,
"learning_rate": 0.00011694081059444946,
"loss": 0.9837,
"step": 770
},
{
"epoch": 0.47751078250154033,
"grad_norm": 0.4909701645374298,
"learning_rate": 0.0001159353456681201,
"loss": 0.816,
"step": 775
},
{
"epoch": 0.4805914972273567,
"grad_norm": 0.28249093890190125,
"learning_rate": 0.00011492822500154667,
"loss": 0.9001,
"step": 780
},
{
"epoch": 0.4836722119531731,
"grad_norm": 0.24982449412345886,
"learning_rate": 0.00011391955323822126,
"loss": 0.8926,
"step": 785
},
{
"epoch": 0.4867529266789895,
"grad_norm": 0.22139038145542145,
"learning_rate": 0.00011290943518280057,
"loss": 1.0207,
"step": 790
},
{
"epoch": 0.4898336414048059,
"grad_norm": 0.4508483111858368,
"learning_rate": 0.0001118979757902162,
"loss": 0.9285,
"step": 795
},
{
"epoch": 0.4929143561306223,
"grad_norm": 0.4797590672969818,
"learning_rate": 0.00011088528015476964,
"loss": 0.8541,
"step": 800
},
{
"epoch": 0.4959950708564387,
"grad_norm": 0.25662368535995483,
"learning_rate": 0.00010987145349921251,
"loss": 0.9033,
"step": 805
},
{
"epoch": 0.49907578558225507,
"grad_norm": 0.26000267267227173,
"learning_rate": 0.0001088566011638134,
"loss": 0.9413,
"step": 810
},
{
"epoch": 0.5021565003080715,
"grad_norm": 0.21961303055286407,
"learning_rate": 0.00010784082859541292,
"loss": 0.9315,
"step": 815
},
{
"epoch": 0.5052372150338879,
"grad_norm": 0.4173499047756195,
"learning_rate": 0.0001068242413364671,
"loss": 0.9527,
"step": 820
},
{
"epoch": 0.5083179297597042,
"grad_norm": 0.4871540069580078,
"learning_rate": 0.00010580694501408138,
"loss": 0.8284,
"step": 825
},
{
"epoch": 0.5113986444855206,
"grad_norm": 0.28846967220306396,
"learning_rate": 0.00010478904532903535,
"loss": 0.8648,
"step": 830
},
{
"epoch": 0.514479359211337,
"grad_norm": 0.24096441268920898,
"learning_rate": 0.00010377064804480025,
"loss": 1.0178,
"step": 835
},
{
"epoch": 0.5175600739371534,
"grad_norm": 0.2081213891506195,
"learning_rate": 0.00010275185897654971,
"loss": 0.8944,
"step": 840
},
{
"epoch": 0.5206407886629698,
"grad_norm": 0.46252796053886414,
"learning_rate": 0.00010173278398016501,
"loss": 0.922,
"step": 845
},
{
"epoch": 0.5237215033887862,
"grad_norm": 0.44197043776512146,
"learning_rate": 0.00010071352894123654,
"loss": 0.7921,
"step": 850
},
{
"epoch": 0.5268022181146026,
"grad_norm": 0.3035351037979126,
"learning_rate": 9.969419976406165e-05,
"loss": 0.9301,
"step": 855
},
{
"epoch": 0.529882932840419,
"grad_norm": 0.2718713879585266,
"learning_rate": 9.867490236064108e-05,
"loss": 0.9367,
"step": 860
},
{
"epoch": 0.5329636475662354,
"grad_norm": 0.25972554087638855,
"learning_rate": 9.765574263967396e-05,
"loss": 1.0116,
"step": 865
},
{
"epoch": 0.5360443622920518,
"grad_norm": 0.3816847503185272,
"learning_rate": 9.66368264955539e-05,
"loss": 0.915,
"step": 870
},
{
"epoch": 0.5391250770178682,
"grad_norm": 0.6426383852958679,
"learning_rate": 9.56182597973658e-05,
"loss": 0.8123,
"step": 875
},
{
"epoch": 0.5422057917436846,
"grad_norm": 0.24743957817554474,
"learning_rate": 9.460014837788605e-05,
"loss": 0.9215,
"step": 880
},
{
"epoch": 0.5452865064695009,
"grad_norm": 0.24885661900043488,
"learning_rate": 9.358259802258581e-05,
"loss": 0.9195,
"step": 885
},
{
"epoch": 0.5483672211953173,
"grad_norm": 0.21944816410541534,
"learning_rate": 9.256571445863972e-05,
"loss": 0.9105,
"step": 890
},
{
"epoch": 0.5514479359211337,
"grad_norm": 0.4631386399269104,
"learning_rate": 9.154960334394027e-05,
"loss": 0.965,
"step": 895
},
{
"epoch": 0.5545286506469501,
"grad_norm": 0.47829023003578186,
"learning_rate": 9.053437025611973e-05,
"loss": 0.7986,
"step": 900
},
{
"epoch": 0.5576093653727665,
"grad_norm": 0.29296985268592834,
"learning_rate": 8.952012068158027e-05,
"loss": 0.9545,
"step": 905
},
{
"epoch": 0.5606900800985829,
"grad_norm": 0.23259030282497406,
"learning_rate": 8.850696000453326e-05,
"loss": 0.9846,
"step": 910
},
{
"epoch": 0.5637707948243993,
"grad_norm": 0.2143285572528839,
"learning_rate": 8.749499349604993e-05,
"loss": 0.9375,
"step": 915
},
{
"epoch": 0.5668515095502157,
"grad_norm": 0.4443269670009613,
"learning_rate": 8.64843263031228e-05,
"loss": 0.8851,
"step": 920
},
{
"epoch": 0.5699322242760321,
"grad_norm": 0.49172407388687134,
"learning_rate": 8.547506343774097e-05,
"loss": 0.7475,
"step": 925
},
{
"epoch": 0.5730129390018485,
"grad_norm": 0.3034185469150543,
"learning_rate": 8.446730976597878e-05,
"loss": 1.0023,
"step": 930
},
{
"epoch": 0.5760936537276649,
"grad_norm": 0.27486246824264526,
"learning_rate": 8.346116999709975e-05,
"loss": 0.9047,
"step": 935
},
{
"epoch": 0.5791743684534812,
"grad_norm": 0.2196229249238968,
"learning_rate": 8.245674867267724e-05,
"loss": 0.9262,
"step": 940
},
{
"epoch": 0.5822550831792976,
"grad_norm": 0.4424618184566498,
"learning_rate": 8.145415015573183e-05,
"loss": 0.9537,
"step": 945
},
{
"epoch": 0.585335797905114,
"grad_norm": 0.4673041105270386,
"learning_rate": 8.045347861988789e-05,
"loss": 0.7926,
"step": 950
},
{
"epoch": 0.5884165126309304,
"grad_norm": 0.320578396320343,
"learning_rate": 7.945483803854936e-05,
"loss": 0.9144,
"step": 955
},
{
"epoch": 0.5914972273567468,
"grad_norm": 0.2610718905925751,
"learning_rate": 7.845833217409675e-05,
"loss": 1.0055,
"step": 960
},
{
"epoch": 0.5945779420825632,
"grad_norm": 0.20770247280597687,
"learning_rate": 7.746406456710564e-05,
"loss": 0.9012,
"step": 965
},
{
"epoch": 0.5976586568083796,
"grad_norm": 0.43519556522369385,
"learning_rate": 7.64721385255886e-05,
"loss": 0.9128,
"step": 970
},
{
"epoch": 0.600739371534196,
"grad_norm": 0.495310515165329,
"learning_rate": 7.548265711426104e-05,
"loss": 0.7712,
"step": 975
},
{
"epoch": 0.6038200862600123,
"grad_norm": 0.2972421646118164,
"learning_rate": 7.449572314383237e-05,
"loss": 0.9942,
"step": 980
},
{
"epoch": 0.6069008009858287,
"grad_norm": 0.25806179642677307,
"learning_rate": 7.351143916032374e-05,
"loss": 0.9889,
"step": 985
},
{
"epoch": 0.609981515711645,
"grad_norm": 0.22174964845180511,
"learning_rate": 7.252990743441293e-05,
"loss": 0.9398,
"step": 990
},
{
"epoch": 0.6130622304374614,
"grad_norm": 0.5080896019935608,
"learning_rate": 7.155122995080827e-05,
"loss": 1.0196,
"step": 995
},
{
"epoch": 0.6161429451632778,
"grad_norm": 0.5036611557006836,
"learning_rate": 7.057550839765188e-05,
"loss": 0.803,
"step": 1000
},
{
"epoch": 0.6192236598890942,
"grad_norm": 0.3295843303203583,
"learning_rate": 6.960284415595407e-05,
"loss": 0.9066,
"step": 1005
},
{
"epoch": 0.6223043746149106,
"grad_norm": 0.24868136644363403,
"learning_rate": 6.863333828905929e-05,
"loss": 1.0486,
"step": 1010
},
{
"epoch": 0.625385089340727,
"grad_norm": 0.2273484766483307,
"learning_rate": 6.766709153214542e-05,
"loss": 0.9454,
"step": 1015
},
{
"epoch": 0.6284658040665434,
"grad_norm": 0.4086507558822632,
"learning_rate": 6.670420428175705e-05,
"loss": 0.9561,
"step": 1020
},
{
"epoch": 0.6315465187923598,
"grad_norm": 0.4194948077201843,
"learning_rate": 6.574477658537375e-05,
"loss": 0.7882,
"step": 1025
},
{
"epoch": 0.6346272335181762,
"grad_norm": 0.3146796226501465,
"learning_rate": 6.4788908131015e-05,
"loss": 0.8443,
"step": 1030
},
{
"epoch": 0.6377079482439926,
"grad_norm": 0.24021373689174652,
"learning_rate": 6.38366982368819e-05,
"loss": 0.8491,
"step": 1035
},
{
"epoch": 0.640788662969809,
"grad_norm": 0.21864767372608185,
"learning_rate": 6.288824584103816e-05,
"loss": 0.9222,
"step": 1040
},
{
"epoch": 0.6438693776956254,
"grad_norm": 0.6802520155906677,
"learning_rate": 6.194364949112953e-05,
"loss": 0.9085,
"step": 1045
},
{
"epoch": 0.6469500924214417,
"grad_norm": 0.49471819400787354,
"learning_rate": 6.100300733414474e-05,
"loss": 0.8007,
"step": 1050
},
{
"epoch": 0.6500308071472581,
"grad_norm": 0.2820141017436981,
"learning_rate": 6.0066417106217455e-05,
"loss": 0.8945,
"step": 1055
},
{
"epoch": 0.6531115218730745,
"grad_norm": 0.24521881341934204,
"learning_rate": 5.9133976122471214e-05,
"loss": 0.9188,
"step": 1060
},
{
"epoch": 0.6561922365988909,
"grad_norm": 0.21457761526107788,
"learning_rate": 5.82057812669081e-05,
"loss": 0.9509,
"step": 1065
},
{
"epoch": 0.6592729513247073,
"grad_norm": 0.3352929651737213,
"learning_rate": 5.728192898234195e-05,
"loss": 0.851,
"step": 1070
},
{
"epoch": 0.6623536660505237,
"grad_norm": 0.48959973454475403,
"learning_rate": 5.6362515260377835e-05,
"loss": 0.7561,
"step": 1075
},
{
"epoch": 0.6654343807763401,
"grad_norm": 0.27886924147605896,
"learning_rate": 5.544763563143793e-05,
"loss": 0.9267,
"step": 1080
},
{
"epoch": 0.6685150955021565,
"grad_norm": 0.2557944059371948,
"learning_rate": 5.4537385154835864e-05,
"loss": 0.9299,
"step": 1085
},
{
"epoch": 0.6715958102279729,
"grad_norm": 0.2110741287469864,
"learning_rate": 5.363185840889935e-05,
"loss": 0.8646,
"step": 1090
},
{
"epoch": 0.6746765249537893,
"grad_norm": 0.44468095898628235,
"learning_rate": 5.273114948114346e-05,
"loss": 0.9427,
"step": 1095
},
{
"epoch": 0.6777572396796057,
"grad_norm": 0.45007064938545227,
"learning_rate": 5.1835351958494515e-05,
"loss": 0.7519,
"step": 1100
},
{
"epoch": 0.680837954405422,
"grad_norm": 0.2795059084892273,
"learning_rate": 5.094455891756587e-05,
"loss": 0.9132,
"step": 1105
},
{
"epoch": 0.6839186691312384,
"grad_norm": 0.24423202872276306,
"learning_rate": 5.00588629149872e-05,
"loss": 0.9795,
"step": 1110
},
{
"epoch": 0.6869993838570548,
"grad_norm": 0.2284388393163681,
"learning_rate": 4.91783559777873e-05,
"loss": 0.905,
"step": 1115
},
{
"epoch": 0.6900800985828712,
"grad_norm": 0.49596309661865234,
"learning_rate": 4.830312959383238e-05,
"loss": 0.909,
"step": 1120
},
{
"epoch": 0.6931608133086876,
"grad_norm": 0.41242459416389465,
"learning_rate": 4.7433274702319815e-05,
"loss": 0.7293,
"step": 1125
},
{
"epoch": 0.696241528034504,
"grad_norm": 0.2715208828449249,
"learning_rate": 4.656888168432962e-05,
"loss": 0.8847,
"step": 1130
},
{
"epoch": 0.6993222427603204,
"grad_norm": 0.25266537070274353,
"learning_rate": 4.571004035343315e-05,
"loss": 0.9697,
"step": 1135
},
{
"epoch": 0.7024029574861368,
"grad_norm": 0.2048375904560089,
"learning_rate": 4.485683994636144e-05,
"loss": 0.8963,
"step": 1140
},
{
"epoch": 0.7054836722119532,
"grad_norm": 0.44298356771469116,
"learning_rate": 4.400936911373308e-05,
"loss": 0.9756,
"step": 1145
},
{
"epoch": 0.7085643869377696,
"grad_norm": 0.4284767508506775,
"learning_rate": 4.3167715910842966e-05,
"loss": 0.7932,
"step": 1150
},
{
"epoch": 0.711645101663586,
"grad_norm": 0.29664915800094604,
"learning_rate": 4.2331967788513295e-05,
"loss": 0.9168,
"step": 1155
},
{
"epoch": 0.7147258163894024,
"grad_norm": 0.24990494549274445,
"learning_rate": 4.1502211584006836e-05,
"loss": 0.9272,
"step": 1160
},
{
"epoch": 0.7178065311152187,
"grad_norm": 0.20492446422576904,
"learning_rate": 4.067853351200446e-05,
"loss": 0.9724,
"step": 1165
},
{
"epoch": 0.7208872458410351,
"grad_norm": 0.38028204441070557,
"learning_rate": 3.986101915564695e-05,
"loss": 0.9153,
"step": 1170
},
{
"epoch": 0.7239679605668515,
"grad_norm": 0.5438628196716309,
"learning_rate": 3.904975345764262e-05,
"loss": 0.7897,
"step": 1175
},
{
"epoch": 0.7270486752926679,
"grad_norm": 0.28773704171180725,
"learning_rate": 3.824482071144163e-05,
"loss": 0.931,
"step": 1180
},
{
"epoch": 0.7301293900184843,
"grad_norm": 0.27623042464256287,
"learning_rate": 3.744630455247739e-05,
"loss": 0.905,
"step": 1185
},
{
"epoch": 0.7332101047443007,
"grad_norm": 0.20309558510780334,
"learning_rate": 3.6654287949476626e-05,
"loss": 0.927,
"step": 1190
},
{
"epoch": 0.7362908194701171,
"grad_norm": 0.40813419222831726,
"learning_rate": 3.586885319583858e-05,
"loss": 0.9488,
"step": 1195
},
{
"epoch": 0.7393715341959335,
"grad_norm": 0.5010459423065186,
"learning_rate": 3.5090081901084525e-05,
"loss": 0.8075,
"step": 1200
},
{
"epoch": 0.7424522489217499,
"grad_norm": 0.30515843629837036,
"learning_rate": 3.431805498237808e-05,
"loss": 0.9658,
"step": 1205
},
{
"epoch": 0.7455329636475663,
"grad_norm": 0.24745185673236847,
"learning_rate": 3.355285265611784e-05,
"loss": 0.953,
"step": 1210
},
{
"epoch": 0.7486136783733827,
"grad_norm": 0.20221352577209473,
"learning_rate": 3.279455442960238e-05,
"loss": 0.9542,
"step": 1215
},
{
"epoch": 0.751694393099199,
"grad_norm": 0.4061656594276428,
"learning_rate": 3.204323909276924e-05,
"loss": 0.9838,
"step": 1220
},
{
"epoch": 0.7547751078250154,
"grad_norm": 0.39202672243118286,
"learning_rate": 3.1298984710008484e-05,
"loss": 0.7694,
"step": 1225
},
{
"epoch": 0.7578558225508318,
"grad_norm": 0.34482622146606445,
"learning_rate": 3.056186861205136e-05,
"loss": 0.8751,
"step": 1230
},
{
"epoch": 0.7609365372766482,
"grad_norm": 0.24178501963615417,
"learning_rate": 2.9831967387935467e-05,
"loss": 0.9526,
"step": 1235
},
{
"epoch": 0.7640172520024646,
"grad_norm": 0.2215549796819687,
"learning_rate": 2.9109356877046712e-05,
"loss": 0.8726,
"step": 1240
},
{
"epoch": 0.767097966728281,
"grad_norm": 0.45621606707572937,
"learning_rate": 2.8394112161239605e-05,
"loss": 0.943,
"step": 1245
},
{
"epoch": 0.7701786814540974,
"grad_norm": 0.47603940963745117,
"learning_rate": 2.7686307557035685e-05,
"loss": 0.7294,
"step": 1250
},
{
"epoch": 0.7732593961799138,
"grad_norm": 0.2534734308719635,
"learning_rate": 2.6986016607901908e-05,
"loss": 0.8862,
"step": 1255
},
{
"epoch": 0.7763401109057301,
"grad_norm": 0.26066556572914124,
"learning_rate": 2.629331207660931e-05,
"loss": 0.9054,
"step": 1260
},
{
"epoch": 0.7794208256315465,
"grad_norm": 0.2252478003501892,
"learning_rate": 2.5608265937672436e-05,
"loss": 0.8883,
"step": 1265
},
{
"epoch": 0.7825015403573629,
"grad_norm": 0.4677968919277191,
"learning_rate": 2.4930949369871203e-05,
"loss": 0.9571,
"step": 1270
},
{
"epoch": 0.7855822550831792,
"grad_norm": 0.48786357045173645,
"learning_rate": 2.426143274885493e-05,
"loss": 0.7375,
"step": 1275
},
{
"epoch": 0.7886629698089956,
"grad_norm": 0.3174852430820465,
"learning_rate": 2.359978563983022e-05,
"loss": 0.8827,
"step": 1280
},
{
"epoch": 0.791743684534812,
"grad_norm": 0.23994563519954681,
"learning_rate": 2.2946076790332827e-05,
"loss": 0.8892,
"step": 1285
},
{
"epoch": 0.7948243992606284,
"grad_norm": 0.21942304074764252,
"learning_rate": 2.2300374123084522e-05,
"loss": 0.8561,
"step": 1290
},
{
"epoch": 0.7979051139864448,
"grad_norm": 0.5058274865150452,
"learning_rate": 2.166274472893567e-05,
"loss": 0.9178,
"step": 1295
},
{
"epoch": 0.8009858287122612,
"grad_norm": 0.43461018800735474,
"learning_rate": 2.1033254859894226e-05,
"loss": 0.7465,
"step": 1300
},
{
"epoch": 0.8040665434380776,
"grad_norm": 0.26012492179870605,
"learning_rate": 2.041196992224206e-05,
"loss": 0.8865,
"step": 1305
},
{
"epoch": 0.807147258163894,
"grad_norm": 0.2593576908111572,
"learning_rate": 1.9798954469738762e-05,
"loss": 0.8778,
"step": 1310
},
{
"epoch": 0.8102279728897104,
"grad_norm": 0.21213385462760925,
"learning_rate": 1.919427219691453e-05,
"loss": 0.9287,
"step": 1315
},
{
"epoch": 0.8133086876155268,
"grad_norm": 0.4115472435951233,
"learning_rate": 1.8597985932451856e-05,
"loss": 0.8981,
"step": 1320
},
{
"epoch": 0.8163894023413432,
"grad_norm": 0.4511643648147583,
"learning_rate": 1.8010157632657543e-05,
"loss": 0.7387,
"step": 1325
},
{
"epoch": 0.8194701170671596,
"grad_norm": 0.2990265488624573,
"learning_rate": 1.7430848375025176e-05,
"loss": 0.9106,
"step": 1330
},
{
"epoch": 0.822550831792976,
"grad_norm": 0.28121325373649597,
"learning_rate": 1.686011835188891e-05,
"loss": 0.9232,
"step": 1335
},
{
"epoch": 0.8256315465187923,
"grad_norm": 0.19917987287044525,
"learning_rate": 1.6298026864169335e-05,
"loss": 0.9458,
"step": 1340
},
{
"epoch": 0.8287122612446087,
"grad_norm": 0.4129948914051056,
"learning_rate": 1.5744632315211815e-05,
"loss": 0.9359,
"step": 1345
},
{
"epoch": 0.8317929759704251,
"grad_norm": 0.47105616331100464,
"learning_rate": 1.5199992204718294e-05,
"loss": 0.7866,
"step": 1350
},
{
"epoch": 0.8348736906962415,
"grad_norm": 0.2840569317340851,
"learning_rate": 1.4664163122772689e-05,
"loss": 0.9127,
"step": 1355
},
{
"epoch": 0.8379544054220579,
"grad_norm": 0.27385058999061584,
"learning_rate": 1.4137200743961188e-05,
"loss": 0.9092,
"step": 1360
},
{
"epoch": 0.8410351201478743,
"grad_norm": 0.21615581214427948,
"learning_rate": 1.3619159821587235e-05,
"loss": 0.9071,
"step": 1365
},
{
"epoch": 0.8441158348736907,
"grad_norm": 0.4315554201602936,
"learning_rate": 1.3110094181982657e-05,
"loss": 0.901,
"step": 1370
},
{
"epoch": 0.8471965495995071,
"grad_norm": 0.4600566029548645,
"learning_rate": 1.261005671891482e-05,
"loss": 0.7692,
"step": 1375
},
{
"epoch": 0.8502772643253235,
"grad_norm": 0.27289214730262756,
"learning_rate": 1.2119099388090716e-05,
"loss": 0.9479,
"step": 1380
},
{
"epoch": 0.8533579790511399,
"grad_norm": 0.26042231917381287,
"learning_rate": 1.1637273201758748e-05,
"loss": 0.8972,
"step": 1385
},
{
"epoch": 0.8564386937769563,
"grad_norm": 0.21819062530994415,
"learning_rate": 1.1164628223408168e-05,
"loss": 0.8494,
"step": 1390
},
{
"epoch": 0.8595194085027726,
"grad_norm": 0.5444476008415222,
"learning_rate": 1.0701213562567492e-05,
"loss": 0.9043,
"step": 1395
},
{
"epoch": 0.862600123228589,
"grad_norm": 0.5517734289169312,
"learning_rate": 1.0247077369701653e-05,
"loss": 0.7521,
"step": 1400
},
{
"epoch": 0.8656808379544054,
"grad_norm": 0.27313733100891113,
"learning_rate": 9.802266831209206e-06,
"loss": 0.8408,
"step": 1405
},
{
"epoch": 0.8687615526802218,
"grad_norm": 0.23924760520458221,
"learning_rate": 9.366828164519258e-06,
"loss": 0.8577,
"step": 1410
},
{
"epoch": 0.8718422674060382,
"grad_norm": 0.2202882021665573,
"learning_rate": 8.940806613289498e-06,
"loss": 0.9402,
"step": 1415
},
{
"epoch": 0.8749229821318546,
"grad_norm": 0.4714129865169525,
"learning_rate": 8.524246442705153e-06,
"loss": 0.8714,
"step": 1420
},
{
"epoch": 0.878003696857671,
"grad_norm": 0.5381476283073425,
"learning_rate": 8.117190934879593e-06,
"loss": 0.7554,
"step": 1425
},
{
"epoch": 0.8810844115834874,
"grad_norm": 0.30594927072525024,
"learning_rate": 7.719682384357308e-06,
"loss": 0.9058,
"step": 1430
},
{
"epoch": 0.8841651263093038,
"grad_norm": 0.28632140159606934,
"learning_rate": 7.33176209371923e-06,
"loss": 0.9048,
"step": 1435
},
{
"epoch": 0.8872458410351202,
"grad_norm": 0.23244526982307434,
"learning_rate": 6.953470369291348e-06,
"loss": 0.9097,
"step": 1440
},
{
"epoch": 0.8903265557609366,
"grad_norm": 0.4457685351371765,
"learning_rate": 6.5848465169566e-06,
"loss": 0.9375,
"step": 1445
},
{
"epoch": 0.893407270486753,
"grad_norm": 0.4593055546283722,
"learning_rate": 6.225928838071016e-06,
"loss": 0.7327,
"step": 1450
},
{
"epoch": 0.8964879852125693,
"grad_norm": 0.3131862282752991,
"learning_rate": 5.876754625483904e-06,
"loss": 0.829,
"step": 1455
},
{
"epoch": 0.8995686999383857,
"grad_norm": 0.23787960410118103,
"learning_rate": 5.537360159663108e-06,
"loss": 0.893,
"step": 1460
},
{
"epoch": 0.9026494146642021,
"grad_norm": 0.22966954112052917,
"learning_rate": 5.207780704925314e-06,
"loss": 0.8752,
"step": 1465
},
{
"epoch": 0.9057301293900185,
"grad_norm": 0.43406957387924194,
"learning_rate": 4.888050505771868e-06,
"loss": 0.9341,
"step": 1470
},
{
"epoch": 0.9088108441158349,
"grad_norm": 0.451045960187912,
"learning_rate": 4.578202783330799e-06,
"loss": 0.7766,
"step": 1475
},
{
"epoch": 0.9118915588416513,
"grad_norm": 0.28430673480033875,
"learning_rate": 4.2782697319048605e-06,
"loss": 0.8861,
"step": 1480
},
{
"epoch": 0.9149722735674677,
"grad_norm": 0.24296101927757263,
"learning_rate": 3.988282515626585e-06,
"loss": 0.8434,
"step": 1485
},
{
"epoch": 0.9180529882932841,
"grad_norm": 0.2274406999349594,
"learning_rate": 3.7082712652200867e-06,
"loss": 0.8912,
"step": 1490
},
{
"epoch": 0.9211337030191005,
"grad_norm": 0.45456749200820923,
"learning_rate": 3.438265074870417e-06,
"loss": 0.9744,
"step": 1495
},
{
"epoch": 0.9242144177449169,
"grad_norm": 0.5641310811042786,
"learning_rate": 3.1782919992006333e-06,
"loss": 0.7479,
"step": 1500
},
{
"epoch": 0.9272951324707333,
"grad_norm": 0.2640092670917511,
"learning_rate": 2.9283790503567222e-06,
"loss": 0.9081,
"step": 1505
},
{
"epoch": 0.9303758471965496,
"grad_norm": 0.24997375905513763,
"learning_rate": 2.6885521952010105e-06,
"loss": 0.9355,
"step": 1510
},
{
"epoch": 0.933456561922366,
"grad_norm": 0.2296486645936966,
"learning_rate": 2.458836352614069e-06,
"loss": 0.8545,
"step": 1515
},
{
"epoch": 0.9365372766481824,
"grad_norm": 0.4982737898826599,
"learning_rate": 2.239255390905581e-06,
"loss": 0.9361,
"step": 1520
},
{
"epoch": 0.9396179913739988,
"grad_norm": 0.5252935290336609,
"learning_rate": 2.029832125334319e-06,
"loss": 0.7706,
"step": 1525
},
{
"epoch": 0.9426987060998152,
"grad_norm": 0.271879106760025,
"learning_rate": 1.8305883157375804e-06,
"loss": 0.842,
"step": 1530
},
{
"epoch": 0.9457794208256316,
"grad_norm": 0.24854250252246857,
"learning_rate": 1.6415446642702337e-06,
"loss": 0.9651,
"step": 1535
},
{
"epoch": 0.9488601355514479,
"grad_norm": 0.21977169811725616,
"learning_rate": 1.462720813253682e-06,
"loss": 0.902,
"step": 1540
},
{
"epoch": 0.9519408502772643,
"grad_norm": 0.5121393203735352,
"learning_rate": 1.2941353431350056e-06,
"loss": 0.9256,
"step": 1545
},
{
"epoch": 0.9550215650030807,
"grad_norm": 0.5130953788757324,
"learning_rate": 1.135805770556364e-06,
"loss": 0.7639,
"step": 1550
},
{
"epoch": 0.958102279728897,
"grad_norm": 0.27663251757621765,
"learning_rate": 9.877485465349058e-07,
"loss": 0.931,
"step": 1555
},
{
"epoch": 0.9611829944547134,
"grad_norm": 0.24967016279697418,
"learning_rate": 8.499790547535025e-07,
"loss": 0.8409,
"step": 1560
},
{
"epoch": 0.9642637091805298,
"grad_norm": 0.2005191147327423,
"learning_rate": 7.225116099623286e-07,
"loss": 0.867,
"step": 1565
},
{
"epoch": 0.9673444239063462,
"grad_norm": 0.43106022477149963,
"learning_rate": 6.053594564914611e-07,
"loss": 0.9427,
"step": 1570
},
{
"epoch": 0.9704251386321626,
"grad_norm": 0.4947551190853119,
"learning_rate": 4.985347668747809e-07,
"loss": 0.7485,
"step": 1575
},
{
"epoch": 0.973505853357979,
"grad_norm": 0.26899272203445435,
"learning_rate": 4.0204864058522864e-07,
"loss": 0.9249,
"step": 1580
},
{
"epoch": 0.9765865680837954,
"grad_norm": 0.20328934490680695,
"learning_rate": 3.15911102881461e-07,
"loss": 0.9969,
"step": 1585
},
{
"epoch": 0.9796672828096118,
"grad_norm": 0.22392931580543518,
"learning_rate": 2.40131103766239e-07,
"loss": 0.8852,
"step": 1590
},
{
"epoch": 0.9827479975354282,
"grad_norm": 0.4746832251548767,
"learning_rate": 1.747165170564724e-07,
"loss": 0.9672,
"step": 1595
},
{
"epoch": 0.9858287122612446,
"grad_norm": 0.45710742473602295,
"learning_rate": 1.1967413956510686e-07,
"loss": 0.7987,
"step": 1600
},
{
"epoch": 0.988909426987061,
"grad_norm": 0.3375983238220215,
"learning_rate": 7.500969039491157e-08,
"loss": 0.8614,
"step": 1605
},
{
"epoch": 0.9919901417128774,
"grad_norm": 0.2787322998046875,
"learning_rate": 4.0727810344254325e-08,
"loss": 0.9483,
"step": 1610
},
{
"epoch": 0.9950708564386938,
"grad_norm": 0.2125030755996704,
"learning_rate": 1.6832061424865153e-08,
"loss": 0.884,
"step": 1615
},
{
"epoch": 0.9981515711645101,
"grad_norm": 0.39220768213272095,
"learning_rate": 3.3249264917878387e-09,
"loss": 0.8332,
"step": 1620
},
{
"epoch": 1.0,
"step": 1623,
"total_flos": 889480391426048.0,
"train_loss": 0.9303844255645997,
"train_runtime": 18614.4314,
"train_samples_per_second": 2.79,
"train_steps_per_second": 0.087
}
],
"logging_steps": 5,
"max_steps": 1623,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 889480391426048.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}