Files
Qwen3-8B-Drama-Thinking/trainer_state.json
ModelHub XC 8f1663e427 初始化项目,由ModelHub XC社区提供模型
Model: FutureMa/Qwen3-8B-Drama-Thinking
Source: Original Platform
2026-05-14 05:33:56 +08:00

1701 lines
44 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 50.0,
"global_step": 1185,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025324469768914213,
"grad_norm": 6.541903357166615,
"learning_rate": 1.6666666666666668e-07,
"loss": 1.6022449731826782,
"step": 1
},
{
"epoch": 0.012662234884457106,
"grad_norm": 6.812644002932686,
"learning_rate": 8.333333333333333e-07,
"loss": 1.5844556093215942,
"step": 5
},
{
"epoch": 0.025324469768914212,
"grad_norm": 5.463309643291768,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.5758234024047852,
"step": 10
},
{
"epoch": 0.03798670465337132,
"grad_norm": 3.5947186062437324,
"learning_rate": 2.5e-06,
"loss": 1.5148856163024902,
"step": 15
},
{
"epoch": 0.050648939537828425,
"grad_norm": 2.022183241828244,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.4563226699829102,
"step": 20
},
{
"epoch": 0.06331117442228554,
"grad_norm": 2.568206156447999,
"learning_rate": 4.166666666666667e-06,
"loss": 1.4071508407592774,
"step": 25
},
{
"epoch": 0.07597340930674264,
"grad_norm": 1.3172558883152181,
"learning_rate": 5e-06,
"loss": 1.361931037902832,
"step": 30
},
{
"epoch": 0.08863564419119975,
"grad_norm": 1.289611427468001,
"learning_rate": 5.833333333333334e-06,
"loss": 1.3146369934082032,
"step": 35
},
{
"epoch": 0.10129787907565685,
"grad_norm": 1.0096819520657572,
"learning_rate": 6.666666666666667e-06,
"loss": 1.2890718460083008,
"step": 40
},
{
"epoch": 0.11396011396011396,
"grad_norm": 0.9312368002064222,
"learning_rate": 7.500000000000001e-06,
"loss": 1.262472152709961,
"step": 45
},
{
"epoch": 0.12662234884457108,
"grad_norm": 0.9372677018897868,
"learning_rate": 8.333333333333334e-06,
"loss": 1.2449541091918945,
"step": 50
},
{
"epoch": 0.13928458372902816,
"grad_norm": 1.156336182386965,
"learning_rate": 9.166666666666666e-06,
"loss": 1.23805570602417,
"step": 55
},
{
"epoch": 0.15194681861348527,
"grad_norm": 1.2881982404625736,
"learning_rate": 1e-05,
"loss": 1.2132294654846192,
"step": 60
},
{
"epoch": 0.1646090534979424,
"grad_norm": 1.0254220781070695,
"learning_rate": 9.999512620046523e-06,
"loss": 1.220973587036133,
"step": 65
},
{
"epoch": 0.1772712883823995,
"grad_norm": 0.9489950684909466,
"learning_rate": 9.998050575201772e-06,
"loss": 1.2019853591918945,
"step": 70
},
{
"epoch": 0.1899335232668566,
"grad_norm": 0.9316575478826806,
"learning_rate": 9.995614150494293e-06,
"loss": 1.2073640823364258,
"step": 75
},
{
"epoch": 0.2025957581513137,
"grad_norm": 1.0042469325935621,
"learning_rate": 9.992203820909906e-06,
"loss": 1.1844447135925293,
"step": 80
},
{
"epoch": 0.2152579930357708,
"grad_norm": 0.9710381702713043,
"learning_rate": 9.987820251299121e-06,
"loss": 1.1868626594543457,
"step": 85
},
{
"epoch": 0.22792022792022792,
"grad_norm": 0.9015713419278082,
"learning_rate": 9.982464296247523e-06,
"loss": 1.16792631149292,
"step": 90
},
{
"epoch": 0.24058246280468504,
"grad_norm": 0.9242205484551671,
"learning_rate": 9.976136999909156e-06,
"loss": 1.1806648254394532,
"step": 95
},
{
"epoch": 0.25324469768914215,
"grad_norm": 0.8421714973436404,
"learning_rate": 9.968839595802982e-06,
"loss": 1.1688653945922851,
"step": 100
},
{
"epoch": 0.26590693257359926,
"grad_norm": 0.9053511703432988,
"learning_rate": 9.960573506572391e-06,
"loss": 1.1603254318237304,
"step": 105
},
{
"epoch": 0.2785691674580563,
"grad_norm": 0.8815755237366663,
"learning_rate": 9.951340343707852e-06,
"loss": 1.1436431884765625,
"step": 110
},
{
"epoch": 0.29123140234251343,
"grad_norm": 0.9133167544949871,
"learning_rate": 9.941141907232766e-06,
"loss": 1.1711238861083983,
"step": 115
},
{
"epoch": 0.30389363722697055,
"grad_norm": 0.9280708661664501,
"learning_rate": 9.929980185352525e-06,
"loss": 1.1607641220092773,
"step": 120
},
{
"epoch": 0.31655587211142766,
"grad_norm": 0.8789051540869617,
"learning_rate": 9.91785735406693e-06,
"loss": 1.1655372619628905,
"step": 125
},
{
"epoch": 0.3292181069958848,
"grad_norm": 0.9387606971380588,
"learning_rate": 9.904775776745959e-06,
"loss": 1.1415754318237306,
"step": 130
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.8962535961715238,
"learning_rate": 9.890738003669029e-06,
"loss": 1.141004753112793,
"step": 135
},
{
"epoch": 0.354542576764799,
"grad_norm": 0.8628618510513137,
"learning_rate": 9.875746771527817e-06,
"loss": 1.1703954696655274,
"step": 140
},
{
"epoch": 0.3672048116492561,
"grad_norm": 0.901181222022341,
"learning_rate": 9.859805002892733e-06,
"loss": 1.1528019905090332,
"step": 145
},
{
"epoch": 0.3798670465337132,
"grad_norm": 0.8630918009712893,
"learning_rate": 9.842915805643156e-06,
"loss": 1.1367189407348632,
"step": 150
},
{
"epoch": 0.3925292814181703,
"grad_norm": 0.8702195806012554,
"learning_rate": 9.825082472361558e-06,
"loss": 1.1533798217773437,
"step": 155
},
{
"epoch": 0.4051915163026274,
"grad_norm": 0.8708614692916694,
"learning_rate": 9.806308479691595e-06,
"loss": 1.158640480041504,
"step": 160
},
{
"epoch": 0.4178537511870845,
"grad_norm": 0.8793848769376316,
"learning_rate": 9.786597487660336e-06,
"loss": 1.1480545043945312,
"step": 165
},
{
"epoch": 0.4305159860715416,
"grad_norm": 0.8658001687150836,
"learning_rate": 9.765953338964736e-06,
"loss": 1.1336278915405273,
"step": 170
},
{
"epoch": 0.44317822095599874,
"grad_norm": 0.8569493052828222,
"learning_rate": 9.744380058222483e-06,
"loss": 1.1366922378540039,
"step": 175
},
{
"epoch": 0.45584045584045585,
"grad_norm": 0.8658238768368638,
"learning_rate": 9.721881851187406e-06,
"loss": 1.1221330642700196,
"step": 180
},
{
"epoch": 0.46850269072491296,
"grad_norm": 0.8315025062463812,
"learning_rate": 9.698463103929542e-06,
"loss": 1.137201690673828,
"step": 185
},
{
"epoch": 0.4811649256093701,
"grad_norm": 0.8646733066379476,
"learning_rate": 9.674128381980073e-06,
"loss": 1.1246437072753905,
"step": 190
},
{
"epoch": 0.49382716049382713,
"grad_norm": 0.9329613102085004,
"learning_rate": 9.648882429441258e-06,
"loss": 1.1196226119995116,
"step": 195
},
{
"epoch": 0.5064893953782843,
"grad_norm": 0.8893896484251661,
"learning_rate": 9.622730168061568e-06,
"loss": 1.1334550857543946,
"step": 200
},
{
"epoch": 0.5191516302627414,
"grad_norm": 0.912333387639604,
"learning_rate": 9.595676696276173e-06,
"loss": 1.1253994941711425,
"step": 205
},
{
"epoch": 0.5318138651471985,
"grad_norm": 0.982396926968246,
"learning_rate": 9.567727288213005e-06,
"loss": 1.1222535133361817,
"step": 210
},
{
"epoch": 0.5444761000316556,
"grad_norm": 0.885141191565451,
"learning_rate": 9.538887392664544e-06,
"loss": 1.1143704414367677,
"step": 215
},
{
"epoch": 0.5571383349161126,
"grad_norm": 0.840306231211871,
"learning_rate": 9.50916263202557e-06,
"loss": 1.1145578384399415,
"step": 220
},
{
"epoch": 0.5698005698005698,
"grad_norm": 0.873418768799577,
"learning_rate": 9.478558801197065e-06,
"loss": 1.1184951782226562,
"step": 225
},
{
"epoch": 0.5824628046850269,
"grad_norm": 0.8644731775393019,
"learning_rate": 9.44708186645649e-06,
"loss": 1.1118096351623534,
"step": 230
},
{
"epoch": 0.595125039569484,
"grad_norm": 0.8383543019686877,
"learning_rate": 9.414737964294636e-06,
"loss": 1.1120855331420898,
"step": 235
},
{
"epoch": 0.6077872744539411,
"grad_norm": 0.8339381439594867,
"learning_rate": 9.381533400219319e-06,
"loss": 1.0976166725158691,
"step": 240
},
{
"epoch": 0.6204495093383983,
"grad_norm": 1.2527861157729694,
"learning_rate": 9.347474647526095e-06,
"loss": 1.1195283889770509,
"step": 245
},
{
"epoch": 0.6331117442228553,
"grad_norm": 0.896892265554505,
"learning_rate": 9.312568346036288e-06,
"loss": 1.1280832290649414,
"step": 250
},
{
"epoch": 0.6457739791073125,
"grad_norm": 0.863217024084411,
"learning_rate": 9.276821300802535e-06,
"loss": 1.1169985771179198,
"step": 255
},
{
"epoch": 0.6584362139917695,
"grad_norm": 0.8613522109819245,
"learning_rate": 9.24024048078213e-06,
"loss": 1.110457420349121,
"step": 260
},
{
"epoch": 0.6710984488762266,
"grad_norm": 0.8269568651408957,
"learning_rate": 9.202833017478421e-06,
"loss": 1.1079233169555665,
"step": 265
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.9106153459573166,
"learning_rate": 9.164606203550498e-06,
"loss": 1.115132713317871,
"step": 270
},
{
"epoch": 0.6964229186451408,
"grad_norm": 0.8475270076896408,
"learning_rate": 9.125567491391476e-06,
"loss": 1.114927101135254,
"step": 275
},
{
"epoch": 0.709085153529598,
"grad_norm": 0.8419303390301319,
"learning_rate": 9.085724491675642e-06,
"loss": 1.1053291320800782,
"step": 280
},
{
"epoch": 0.7217473884140551,
"grad_norm": 0.8793202963091465,
"learning_rate": 9.045084971874738e-06,
"loss": 1.1043977737426758,
"step": 285
},
{
"epoch": 0.7344096232985122,
"grad_norm": 0.8844837887961337,
"learning_rate": 9.003656854743667e-06,
"loss": 1.0930152893066407,
"step": 290
},
{
"epoch": 0.7470718581829693,
"grad_norm": 0.8243068254935355,
"learning_rate": 8.961448216775955e-06,
"loss": 1.1083423614501953,
"step": 295
},
{
"epoch": 0.7597340930674265,
"grad_norm": 0.8231480643363308,
"learning_rate": 8.9184672866292e-06,
"loss": 1.093316650390625,
"step": 300
},
{
"epoch": 0.7723963279518835,
"grad_norm": 0.856487105562053,
"learning_rate": 8.874722443520898e-06,
"loss": 1.0935728073120117,
"step": 305
},
{
"epoch": 0.7850585628363406,
"grad_norm": 0.9069069891533103,
"learning_rate": 8.83022221559489e-06,
"loss": 1.085923957824707,
"step": 310
},
{
"epoch": 0.7977207977207977,
"grad_norm": 0.8415924258567633,
"learning_rate": 8.784975278258783e-06,
"loss": 1.1055352210998535,
"step": 315
},
{
"epoch": 0.8103830326052548,
"grad_norm": 0.8547842295217172,
"learning_rate": 8.73899045249266e-06,
"loss": 1.1053098678588866,
"step": 320
},
{
"epoch": 0.823045267489712,
"grad_norm": 0.9042040663099864,
"learning_rate": 8.692276703129421e-06,
"loss": 1.100543212890625,
"step": 325
},
{
"epoch": 0.835707502374169,
"grad_norm": 0.840156677605529,
"learning_rate": 8.644843137107058e-06,
"loss": 1.1007650375366211,
"step": 330
},
{
"epoch": 0.8483697372586262,
"grad_norm": 0.8554168041829401,
"learning_rate": 8.596699001693257e-06,
"loss": 1.095210647583008,
"step": 335
},
{
"epoch": 0.8610319721430832,
"grad_norm": 0.8378136162576828,
"learning_rate": 8.547853682682605e-06,
"loss": 1.0945035934448242,
"step": 340
},
{
"epoch": 0.8736942070275404,
"grad_norm": 0.8300982370825878,
"learning_rate": 8.498316702566828e-06,
"loss": 1.0824993133544922,
"step": 345
},
{
"epoch": 0.8863564419119975,
"grad_norm": 0.8879949006435145,
"learning_rate": 8.44809771867835e-06,
"loss": 1.0910042762756347,
"step": 350
},
{
"epoch": 0.8990186767964545,
"grad_norm": 0.8363110809635331,
"learning_rate": 8.397206521307584e-06,
"loss": 1.085635280609131,
"step": 355
},
{
"epoch": 0.9116809116809117,
"grad_norm": 0.8250978511317656,
"learning_rate": 8.345653031794292e-06,
"loss": 1.0832603454589844,
"step": 360
},
{
"epoch": 0.9243431465653688,
"grad_norm": 0.8250625494950978,
"learning_rate": 8.293447300593402e-06,
"loss": 1.0881545066833496,
"step": 365
},
{
"epoch": 0.9370053814498259,
"grad_norm": 0.9637417812174898,
"learning_rate": 8.240599505315656e-06,
"loss": 1.077590274810791,
"step": 370
},
{
"epoch": 0.949667616334283,
"grad_norm": 0.938188486575515,
"learning_rate": 8.18711994874345e-06,
"loss": 1.0923616409301757,
"step": 375
},
{
"epoch": 0.9623298512187402,
"grad_norm": 0.829053167214024,
"learning_rate": 8.133019056822303e-06,
"loss": 1.0790325164794923,
"step": 380
},
{
"epoch": 0.9749920861031972,
"grad_norm": 0.8296874845053457,
"learning_rate": 8.078307376628292e-06,
"loss": 1.0690267562866211,
"step": 385
},
{
"epoch": 0.9876543209876543,
"grad_norm": 0.8248755231512207,
"learning_rate": 8.022995574311876e-06,
"loss": 1.0922147750854492,
"step": 390
},
{
"epoch": 1.0,
"grad_norm": 0.9123714875418006,
"learning_rate": 7.967094433018508e-06,
"loss": 1.0716293334960938,
"step": 395
},
{
"epoch": 1.0126622348844572,
"grad_norm": 0.8825626316822892,
"learning_rate": 7.910614850786448e-06,
"loss": 0.9421855926513671,
"step": 400
},
{
"epoch": 1.0253244697689141,
"grad_norm": 0.981129259243819,
"learning_rate": 7.85356783842216e-06,
"loss": 0.9680027008056641,
"step": 405
},
{
"epoch": 1.0379867046533713,
"grad_norm": 0.9490494582638624,
"learning_rate": 7.795964517353734e-06,
"loss": 0.9392026901245117,
"step": 410
},
{
"epoch": 1.0506489395378285,
"grad_norm": 1.0436527309713077,
"learning_rate": 7.737816117462752e-06,
"loss": 0.9481110572814941,
"step": 415
},
{
"epoch": 1.0633111744222856,
"grad_norm": 0.9193717140597131,
"learning_rate": 7.679133974894984e-06,
"loss": 0.9479268074035645,
"step": 420
},
{
"epoch": 1.0759734093067426,
"grad_norm": 0.9176846478769476,
"learning_rate": 7.619929529850397e-06,
"loss": 0.9510162353515625,
"step": 425
},
{
"epoch": 1.0886356441911997,
"grad_norm": 0.9263690784461404,
"learning_rate": 7.560214324352858e-06,
"loss": 0.9560428619384765,
"step": 430
},
{
"epoch": 1.101297879075657,
"grad_norm": 0.8985018721390384,
"learning_rate": 7.500000000000001e-06,
"loss": 0.9549171447753906,
"step": 435
},
{
"epoch": 1.1139601139601139,
"grad_norm": 0.8383045792822509,
"learning_rate": 7.4392982956936644e-06,
"loss": 0.9572299957275391,
"step": 440
},
{
"epoch": 1.126622348844571,
"grad_norm": 0.8693402459631241,
"learning_rate": 7.378121045351378e-06,
"loss": 0.9538370132446289,
"step": 445
},
{
"epoch": 1.1392845837290282,
"grad_norm": 0.8465948151936904,
"learning_rate": 7.31648017559931e-06,
"loss": 0.9445423126220703,
"step": 450
},
{
"epoch": 1.1519468186134854,
"grad_norm": 0.8993258971886791,
"learning_rate": 7.254387703447154e-06,
"loss": 0.9402847290039062,
"step": 455
},
{
"epoch": 1.1646090534979423,
"grad_norm": 0.8973654441260622,
"learning_rate": 7.191855733945388e-06,
"loss": 0.9458431243896485,
"step": 460
},
{
"epoch": 1.1772712883823995,
"grad_norm": 0.8975789539843146,
"learning_rate": 7.128896457825364e-06,
"loss": 0.9456979751586914,
"step": 465
},
{
"epoch": 1.1899335232668566,
"grad_norm": 0.9025883974896288,
"learning_rate": 7.06552214912271e-06,
"loss": 0.958702278137207,
"step": 470
},
{
"epoch": 1.2025957581513138,
"grad_norm": 0.8943619241590697,
"learning_rate": 7.0017451627844765e-06,
"loss": 0.9409778594970704,
"step": 475
},
{
"epoch": 1.2152579930357708,
"grad_norm": 0.8987697465779751,
"learning_rate": 6.9375779322605154e-06,
"loss": 0.952575397491455,
"step": 480
},
{
"epoch": 1.227920227920228,
"grad_norm": 0.8957262384243423,
"learning_rate": 6.873032967079562e-06,
"loss": 0.9412460327148438,
"step": 485
},
{
"epoch": 1.240582462804685,
"grad_norm": 0.9191287064439484,
"learning_rate": 6.808122850410461e-06,
"loss": 0.9442897796630859,
"step": 490
},
{
"epoch": 1.253244697689142,
"grad_norm": 0.9120111224616239,
"learning_rate": 6.7428602366090764e-06,
"loss": 0.9721967697143554,
"step": 495
},
{
"epoch": 1.2659069325735992,
"grad_norm": 0.9297557344562997,
"learning_rate": 6.677257848751276e-06,
"loss": 0.9427990913391113,
"step": 500
},
{
"epoch": 1.2785691674580564,
"grad_norm": 0.9256360350131605,
"learning_rate": 6.611328476152557e-06,
"loss": 0.9448193550109864,
"step": 505
},
{
"epoch": 1.2912314023425133,
"grad_norm": 0.9178166712574457,
"learning_rate": 6.545084971874738e-06,
"loss": 0.9285225868225098,
"step": 510
},
{
"epoch": 1.3038936372269705,
"grad_norm": 0.8824737418151191,
"learning_rate": 6.4785402502202345e-06,
"loss": 0.9465466499328613,
"step": 515
},
{
"epoch": 1.3165558721114277,
"grad_norm": 0.8714305178817582,
"learning_rate": 6.411707284214384e-06,
"loss": 0.9558137893676758,
"step": 520
},
{
"epoch": 1.3292181069958848,
"grad_norm": 1.6420471551581535,
"learning_rate": 6.344599103076329e-06,
"loss": 0.9441043853759765,
"step": 525
},
{
"epoch": 1.341880341880342,
"grad_norm": 0.8940534993249484,
"learning_rate": 6.277228789678953e-06,
"loss": 0.9406339645385742,
"step": 530
},
{
"epoch": 1.354542576764799,
"grad_norm": 0.8657105103377609,
"learning_rate": 6.209609477998339e-06,
"loss": 0.9400988578796386,
"step": 535
},
{
"epoch": 1.3672048116492561,
"grad_norm": 0.8795303497602281,
"learning_rate": 6.141754350553279e-06,
"loss": 0.9375904083251954,
"step": 540
},
{
"epoch": 1.3798670465337133,
"grad_norm": 0.8778881000839949,
"learning_rate": 6.073676635835317e-06,
"loss": 0.9534420013427735,
"step": 545
},
{
"epoch": 1.3925292814181702,
"grad_norm": 0.8609329406866304,
"learning_rate": 6.005389605729824e-06,
"loss": 0.9435734748840332,
"step": 550
},
{
"epoch": 1.4051915163026274,
"grad_norm": 0.901450340070586,
"learning_rate": 5.936906572928625e-06,
"loss": 0.9454706192016602,
"step": 555
},
{
"epoch": 1.4178537511870846,
"grad_norm": 0.9056724009579911,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.9358626365661621,
"step": 560
},
{
"epoch": 1.4305159860715415,
"grad_norm": 0.8767791922734569,
"learning_rate": 5.799405938459175e-06,
"loss": 0.9384665489196777,
"step": 565
},
{
"epoch": 1.4431782209559987,
"grad_norm": 0.9226108292554362,
"learning_rate": 5.730415142812059e-06,
"loss": 0.9389400482177734,
"step": 570
},
{
"epoch": 1.4558404558404558,
"grad_norm": 0.8635227126945888,
"learning_rate": 5.661281951285613e-06,
"loss": 0.9539518356323242,
"step": 575
},
{
"epoch": 1.468502690724913,
"grad_norm": 0.8840260265705664,
"learning_rate": 5.592019841532507e-06,
"loss": 0.9480253219604492,
"step": 580
},
{
"epoch": 1.4811649256093702,
"grad_norm": 0.9151680057009149,
"learning_rate": 5.522642316338268e-06,
"loss": 0.9404661178588867,
"step": 585
},
{
"epoch": 1.4938271604938271,
"grad_norm": 0.9450262697016882,
"learning_rate": 5.453162900988902e-06,
"loss": 0.9321787834167481,
"step": 590
},
{
"epoch": 1.5064893953782843,
"grad_norm": 0.8402436559360018,
"learning_rate": 5.383595140634093e-06,
"loss": 0.9440553665161133,
"step": 595
},
{
"epoch": 1.5191516302627415,
"grad_norm": 0.8778976142471068,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.9511254310607911,
"step": 600
},
{
"epoch": 1.5318138651471984,
"grad_norm": 0.8781843644707367,
"learning_rate": 5.244248848978067e-06,
"loss": 0.9387626647949219,
"step": 605
},
{
"epoch": 1.5444761000316556,
"grad_norm": 0.8642449781808372,
"learning_rate": 5.174497483512506e-06,
"loss": 0.956205177307129,
"step": 610
},
{
"epoch": 1.5571383349161128,
"grad_norm": 0.8846802147972775,
"learning_rate": 5.1047120994167855e-06,
"loss": 0.9363911628723145,
"step": 615
},
{
"epoch": 1.5698005698005697,
"grad_norm": 0.8739137758439613,
"learning_rate": 5.034906301489808e-06,
"loss": 0.9367790222167969,
"step": 620
},
{
"epoch": 1.5824628046850269,
"grad_norm": 0.8953494651595788,
"learning_rate": 4.965093698510192e-06,
"loss": 0.9425483703613281,
"step": 625
},
{
"epoch": 1.595125039569484,
"grad_norm": 0.8615421639128288,
"learning_rate": 4.895287900583216e-06,
"loss": 0.9341062545776367,
"step": 630
},
{
"epoch": 1.607787274453941,
"grad_norm": 0.8353360832306662,
"learning_rate": 4.825502516487497e-06,
"loss": 0.949849796295166,
"step": 635
},
{
"epoch": 1.6204495093383984,
"grad_norm": 0.8563998366304418,
"learning_rate": 4.755751151021934e-06,
"loss": 0.9409940719604493,
"step": 640
},
{
"epoch": 1.6331117442228553,
"grad_norm": 0.9360183967885729,
"learning_rate": 4.686047402353433e-06,
"loss": 0.939891242980957,
"step": 645
},
{
"epoch": 1.6457739791073125,
"grad_norm": 0.8806457976411894,
"learning_rate": 4.6164048593659076e-06,
"loss": 0.952726173400879,
"step": 650
},
{
"epoch": 1.6584362139917697,
"grad_norm": 0.8871650293826654,
"learning_rate": 4.546837099011101e-06,
"loss": 0.9440122604370117,
"step": 655
},
{
"epoch": 1.6710984488762266,
"grad_norm": 0.8543495337665787,
"learning_rate": 4.477357683661734e-06,
"loss": 0.9277559280395508,
"step": 660
},
{
"epoch": 1.6837606837606838,
"grad_norm": 0.8754310619944701,
"learning_rate": 4.4079801584674955e-06,
"loss": 0.9328133583068847,
"step": 665
},
{
"epoch": 1.696422918645141,
"grad_norm": 0.846881206379322,
"learning_rate": 4.3387180487143875e-06,
"loss": 0.9440486907958985,
"step": 670
},
{
"epoch": 1.709085153529598,
"grad_norm": 0.8123484252146217,
"learning_rate": 4.269584857187942e-06,
"loss": 0.9334369659423828,
"step": 675
},
{
"epoch": 1.721747388414055,
"grad_norm": 0.8860941606484654,
"learning_rate": 4.200594061540827e-06,
"loss": 0.9386373519897461,
"step": 680
},
{
"epoch": 1.7344096232985122,
"grad_norm": 0.8710977899292981,
"learning_rate": 4.131759111665349e-06,
"loss": 0.9379000663757324,
"step": 685
},
{
"epoch": 1.7470718581829692,
"grad_norm": 0.8989668390644706,
"learning_rate": 4.063093427071376e-06,
"loss": 0.9351366043090821,
"step": 690
},
{
"epoch": 1.7597340930674266,
"grad_norm": 0.8426262295188102,
"learning_rate": 3.994610394270178e-06,
"loss": 0.9458501815795899,
"step": 695
},
{
"epoch": 1.7723963279518835,
"grad_norm": 0.8490556601435445,
"learning_rate": 3.926323364164684e-06,
"loss": 0.9344646453857421,
"step": 700
},
{
"epoch": 1.7850585628363405,
"grad_norm": 0.857013306646358,
"learning_rate": 3.8582456494467214e-06,
"loss": 0.9324585914611816,
"step": 705
},
{
"epoch": 1.7977207977207978,
"grad_norm": 0.8442075171060656,
"learning_rate": 3.790390522001662e-06,
"loss": 0.9345897674560547,
"step": 710
},
{
"epoch": 1.8103830326052548,
"grad_norm": 0.8635838902214552,
"learning_rate": 3.7227712103210485e-06,
"loss": 0.9480118751525879,
"step": 715
},
{
"epoch": 1.823045267489712,
"grad_norm": 0.8701785787205291,
"learning_rate": 3.655400896923672e-06,
"loss": 0.9411863327026367,
"step": 720
},
{
"epoch": 1.8357075023741691,
"grad_norm": 0.9278897279843371,
"learning_rate": 3.5882927157856175e-06,
"loss": 0.9384016036987305,
"step": 725
},
{
"epoch": 1.848369737258626,
"grad_norm": 0.8675201640437896,
"learning_rate": 3.521459749779769e-06,
"loss": 0.9388191223144531,
"step": 730
},
{
"epoch": 1.8610319721430832,
"grad_norm": 0.9047480946293855,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.943515396118164,
"step": 735
},
{
"epoch": 1.8736942070275404,
"grad_norm": 0.9100256206799584,
"learning_rate": 3.3886715238474454e-06,
"loss": 0.9317167282104493,
"step": 740
},
{
"epoch": 1.8863564419119974,
"grad_norm": 0.9121240599713055,
"learning_rate": 3.322742151248726e-06,
"loss": 0.9298182487487793,
"step": 745
},
{
"epoch": 1.8990186767964545,
"grad_norm": 0.8360632961222116,
"learning_rate": 3.2571397633909252e-06,
"loss": 0.9383123397827149,
"step": 750
},
{
"epoch": 1.9116809116809117,
"grad_norm": 0.8449980062948027,
"learning_rate": 3.1918771495895395e-06,
"loss": 0.9380681991577149,
"step": 755
},
{
"epoch": 1.9243431465653686,
"grad_norm": 0.8358057866853585,
"learning_rate": 3.12696703292044e-06,
"loss": 0.9311031341552735,
"step": 760
},
{
"epoch": 1.937005381449826,
"grad_norm": 0.8261214369483678,
"learning_rate": 3.0624220677394854e-06,
"loss": 0.9335260391235352,
"step": 765
},
{
"epoch": 1.949667616334283,
"grad_norm": 0.8746978630306859,
"learning_rate": 2.9982548372155264e-06,
"loss": 0.9282594680786133,
"step": 770
},
{
"epoch": 1.9623298512187402,
"grad_norm": 0.8914685495920053,
"learning_rate": 2.934477850877292e-06,
"loss": 0.9267834663391114,
"step": 775
},
{
"epoch": 1.9749920861031973,
"grad_norm": 0.8730909900000534,
"learning_rate": 2.871103542174637e-06,
"loss": 0.9400104522705078,
"step": 780
},
{
"epoch": 1.9876543209876543,
"grad_norm": 0.9195388866817068,
"learning_rate": 2.8081442660546126e-06,
"loss": 0.9355339050292969,
"step": 785
},
{
"epoch": 2.0,
"grad_norm": 0.8941990040688051,
"learning_rate": 2.7456122965528475e-06,
"loss": 0.9464699745178222,
"step": 790
},
{
"epoch": 2.012662234884457,
"grad_norm": 0.9551299167570609,
"learning_rate": 2.683519824400693e-06,
"loss": 0.8369241714477539,
"step": 795
},
{
"epoch": 2.0253244697689143,
"grad_norm": 0.9503417747763285,
"learning_rate": 2.6218789546486235e-06,
"loss": 0.8305461883544922,
"step": 800
},
{
"epoch": 2.0379867046533713,
"grad_norm": 0.9428708677587196,
"learning_rate": 2.560701704306336e-06,
"loss": 0.8380617141723633,
"step": 805
},
{
"epoch": 2.0506489395378282,
"grad_norm": 0.9141118129164282,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.8350756645202637,
"step": 810
},
{
"epoch": 2.0633111744222856,
"grad_norm": 0.8925703133521584,
"learning_rate": 2.4397856756471435e-06,
"loss": 0.8253829956054688,
"step": 815
},
{
"epoch": 2.0759734093067426,
"grad_norm": 0.9010434641718755,
"learning_rate": 2.380070470149605e-06,
"loss": 0.8296566009521484,
"step": 820
},
{
"epoch": 2.0886356441912,
"grad_norm": 0.9108639104627096,
"learning_rate": 2.320866025105016e-06,
"loss": 0.8311027526855469,
"step": 825
},
{
"epoch": 2.101297879075657,
"grad_norm": 0.8776111588691332,
"learning_rate": 2.2621838825372496e-06,
"loss": 0.8341006278991699,
"step": 830
},
{
"epoch": 2.113960113960114,
"grad_norm": 0.9815553215946904,
"learning_rate": 2.204035482646267e-06,
"loss": 0.8500799179077149,
"step": 835
},
{
"epoch": 2.1266223488445712,
"grad_norm": 0.9142581836968815,
"learning_rate": 2.146432161577842e-06,
"loss": 0.8405605316162109,
"step": 840
},
{
"epoch": 2.139284583729028,
"grad_norm": 1.145797436281295,
"learning_rate": 2.0893851492135536e-06,
"loss": 0.8333783149719238,
"step": 845
},
{
"epoch": 2.151946818613485,
"grad_norm": 1.0132915822668673,
"learning_rate": 2.0329055669814936e-06,
"loss": 0.8394683837890625,
"step": 850
},
{
"epoch": 2.1646090534979425,
"grad_norm": 1.054834919380244,
"learning_rate": 1.977004425688126e-06,
"loss": 0.8199810028076172,
"step": 855
},
{
"epoch": 2.1772712883823995,
"grad_norm": 0.8659196804614238,
"learning_rate": 1.9216926233717087e-06,
"loss": 0.8200090408325196,
"step": 860
},
{
"epoch": 2.1899335232668564,
"grad_norm": 0.8936160842705467,
"learning_rate": 1.8669809431776991e-06,
"loss": 0.819823932647705,
"step": 865
},
{
"epoch": 2.202595758151314,
"grad_norm": 0.8936829197489651,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.8329672813415527,
"step": 870
},
{
"epoch": 2.2152579930357708,
"grad_norm": 0.865124596235692,
"learning_rate": 1.7594004946843458e-06,
"loss": 0.830903434753418,
"step": 875
},
{
"epoch": 2.2279202279202277,
"grad_norm": 1.0004908923945968,
"learning_rate": 1.7065526994065973e-06,
"loss": 0.8222661972045898,
"step": 880
},
{
"epoch": 2.240582462804685,
"grad_norm": 0.9974019476784539,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.8375696182250977,
"step": 885
},
{
"epoch": 2.253244697689142,
"grad_norm": 0.9489943802555122,
"learning_rate": 1.6027934786924187e-06,
"loss": 0.8297539710998535,
"step": 890
},
{
"epoch": 2.2659069325735994,
"grad_norm": 0.8526558052313017,
"learning_rate": 1.551902281321651e-06,
"loss": 0.8450464248657227,
"step": 895
},
{
"epoch": 2.2785691674580564,
"grad_norm": 0.9095006101158244,
"learning_rate": 1.5016832974331725e-06,
"loss": 0.8434087753295898,
"step": 900
},
{
"epoch": 2.2912314023425133,
"grad_norm": 0.8941646461803728,
"learning_rate": 1.4521463173173966e-06,
"loss": 0.8199748992919922,
"step": 905
},
{
"epoch": 2.3038936372269707,
"grad_norm": 1.0151629908802393,
"learning_rate": 1.4033009983067454e-06,
"loss": 0.8257926940917969,
"step": 910
},
{
"epoch": 2.3165558721114277,
"grad_norm": 0.8688789281578927,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.8222599029541016,
"step": 915
},
{
"epoch": 2.3292181069958846,
"grad_norm": 0.9256160248355862,
"learning_rate": 1.3077232968705805e-06,
"loss": 0.8179254531860352,
"step": 920
},
{
"epoch": 2.341880341880342,
"grad_norm": 0.8992368646832662,
"learning_rate": 1.2610095475073415e-06,
"loss": 0.8351934432983399,
"step": 925
},
{
"epoch": 2.354542576764799,
"grad_norm": 0.8950940935118609,
"learning_rate": 1.2150247217412186e-06,
"loss": 0.8317380905151367,
"step": 930
},
{
"epoch": 2.3672048116492563,
"grad_norm": 0.8832980241323902,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.8315254211425781,
"step": 935
},
{
"epoch": 2.3798670465337133,
"grad_norm": 0.9169712891318174,
"learning_rate": 1.1252775564791023e-06,
"loss": 0.8270421981811523,
"step": 940
},
{
"epoch": 2.3925292814181702,
"grad_norm": 0.8820889479067028,
"learning_rate": 1.0815327133708015e-06,
"loss": 0.8373619079589844,
"step": 945
},
{
"epoch": 2.4051915163026276,
"grad_norm": 0.8633797543130399,
"learning_rate": 1.0385517832240472e-06,
"loss": 0.822084617614746,
"step": 950
},
{
"epoch": 2.4178537511870846,
"grad_norm": 0.883673978191503,
"learning_rate": 9.963431452563331e-07,
"loss": 0.8369743347167968,
"step": 955
},
{
"epoch": 2.4305159860715415,
"grad_norm": 0.8729883350584554,
"learning_rate": 9.549150281252633e-07,
"loss": 0.8232148170471192,
"step": 960
},
{
"epoch": 2.443178220955999,
"grad_norm": 0.9022268286398805,
"learning_rate": 9.142755083243577e-07,
"loss": 0.8386312484741211,
"step": 965
},
{
"epoch": 2.455840455840456,
"grad_norm": 0.886835830092829,
"learning_rate": 8.744325086085248e-07,
"loss": 0.8283025741577148,
"step": 970
},
{
"epoch": 2.468502690724913,
"grad_norm": 0.8893818705075751,
"learning_rate": 8.353937964495029e-07,
"loss": 0.8303911209106445,
"step": 975
},
{
"epoch": 2.48116492560937,
"grad_norm": 0.922324297214932,
"learning_rate": 7.971669825215789e-07,
"loss": 0.836126708984375,
"step": 980
},
{
"epoch": 2.493827160493827,
"grad_norm": 0.904255523759268,
"learning_rate": 7.597595192178702e-07,
"loss": 0.8196451187133789,
"step": 985
},
{
"epoch": 2.506489395378284,
"grad_norm": 0.8636934543130641,
"learning_rate": 7.23178699197467e-07,
"loss": 0.8335494995117188,
"step": 990
},
{
"epoch": 2.5191516302627415,
"grad_norm": 0.897039019918768,
"learning_rate": 6.874316539637127e-07,
"loss": 0.8088079452514648,
"step": 995
},
{
"epoch": 2.5318138651471984,
"grad_norm": 0.8864019874531589,
"learning_rate": 6.52525352473905e-07,
"loss": 0.8233877182006836,
"step": 1000
},
{
"epoch": 2.5444761000316554,
"grad_norm": 0.9088661368290617,
"learning_rate": 6.184665997806832e-07,
"loss": 0.8182021141052246,
"step": 1005
},
{
"epoch": 2.5571383349161128,
"grad_norm": 0.9072624944819723,
"learning_rate": 5.852620357053651e-07,
"loss": 0.835714054107666,
"step": 1010
},
{
"epoch": 2.5698005698005697,
"grad_norm": 0.8731742132357438,
"learning_rate": 5.529181335435124e-07,
"loss": 0.8283638000488281,
"step": 1015
},
{
"epoch": 2.5824628046850266,
"grad_norm": 0.8761704900525855,
"learning_rate": 5.214411988029355e-07,
"loss": 0.828251838684082,
"step": 1020
},
{
"epoch": 2.595125039569484,
"grad_norm": 0.8629998756845685,
"learning_rate": 4.908373679744316e-07,
"loss": 0.8239392280578614,
"step": 1025
},
{
"epoch": 2.607787274453941,
"grad_norm": 0.9025713028894049,
"learning_rate": 4.6111260733545714e-07,
"loss": 0.8368805885314942,
"step": 1030
},
{
"epoch": 2.6204495093383984,
"grad_norm": 0.8791508721949534,
"learning_rate": 4.322727117869951e-07,
"loss": 0.8214786529541016,
"step": 1035
},
{
"epoch": 2.6331117442228553,
"grad_norm": 0.8747271828916487,
"learning_rate": 4.043233037238281e-07,
"loss": 0.8331809997558594,
"step": 1040
},
{
"epoch": 2.6457739791073127,
"grad_norm": 0.9189023842289675,
"learning_rate": 3.772698319384349e-07,
"loss": 0.8299878120422364,
"step": 1045
},
{
"epoch": 2.6584362139917697,
"grad_norm": 0.9012554611673713,
"learning_rate": 3.511175705587433e-07,
"loss": 0.8398582458496093,
"step": 1050
},
{
"epoch": 2.6710984488762266,
"grad_norm": 0.8857980961838654,
"learning_rate": 3.258716180199278e-07,
"loss": 0.818387794494629,
"step": 1055
},
{
"epoch": 2.683760683760684,
"grad_norm": 0.8627721513188427,
"learning_rate": 3.015368960704584e-07,
"loss": 0.8408231735229492,
"step": 1060
},
{
"epoch": 2.696422918645141,
"grad_norm": 1.0338964144567573,
"learning_rate": 2.7811814881259503e-07,
"loss": 0.8292581558227539,
"step": 1065
},
{
"epoch": 2.709085153529598,
"grad_norm": 0.8858535803633547,
"learning_rate": 2.556199417775174e-07,
"loss": 0.8229169845581055,
"step": 1070
},
{
"epoch": 2.7217473884140553,
"grad_norm": 0.8487724578022646,
"learning_rate": 2.3404666103526542e-07,
"loss": 0.8243260383605957,
"step": 1075
},
{
"epoch": 2.7344096232985122,
"grad_norm": 0.9142856277892702,
"learning_rate": 2.134025123396638e-07,
"loss": 0.8411771774291992,
"step": 1080
},
{
"epoch": 2.747071858182969,
"grad_norm": 0.9044836277079623,
"learning_rate": 1.9369152030840553e-07,
"loss": 0.8182785034179687,
"step": 1085
},
{
"epoch": 2.7597340930674266,
"grad_norm": 0.8753811947010942,
"learning_rate": 1.7491752763844294e-07,
"loss": 0.8330059051513672,
"step": 1090
},
{
"epoch": 2.7723963279518835,
"grad_norm": 0.9127704993164228,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.8270849227905274,
"step": 1095
},
{
"epoch": 2.7850585628363405,
"grad_norm": 0.8885180442416001,
"learning_rate": 1.4019499710726913e-07,
"loss": 0.8345333099365234,
"step": 1100
},
{
"epoch": 2.797720797720798,
"grad_norm": 0.9121582122101339,
"learning_rate": 1.2425322847218368e-07,
"loss": 0.8229399681091308,
"step": 1105
},
{
"epoch": 2.810383032605255,
"grad_norm": 0.9127878209504691,
"learning_rate": 1.0926199633097156e-07,
"loss": 0.82467041015625,
"step": 1110
},
{
"epoch": 2.8230452674897117,
"grad_norm": 1.744776544604879,
"learning_rate": 9.522422325404234e-08,
"loss": 0.8274450302124023,
"step": 1115
},
{
"epoch": 2.835707502374169,
"grad_norm": 0.8982447409078338,
"learning_rate": 8.214264593307097e-08,
"loss": 0.8290293693542481,
"step": 1120
},
{
"epoch": 2.848369737258626,
"grad_norm": 0.8866891213730514,
"learning_rate": 7.001981464747565e-08,
"loss": 0.8212656021118164,
"step": 1125
},
{
"epoch": 2.861031972143083,
"grad_norm": 0.9114969683085143,
"learning_rate": 5.8858092767236084e-08,
"loss": 0.8231026649475097,
"step": 1130
},
{
"epoch": 2.8736942070275404,
"grad_norm": 0.8832450189167574,
"learning_rate": 4.865965629214819e-08,
"loss": 0.830931282043457,
"step": 1135
},
{
"epoch": 2.8863564419119974,
"grad_norm": 0.8487481576996411,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.8255987167358398,
"step": 1140
},
{
"epoch": 2.8990186767964543,
"grad_norm": 0.879731012515658,
"learning_rate": 3.1160404197018155e-08,
"loss": 0.8359064102172852,
"step": 1145
},
{
"epoch": 2.9116809116809117,
"grad_norm": 0.9058610366933287,
"learning_rate": 2.386300009084408e-08,
"loss": 0.8246042251586914,
"step": 1150
},
{
"epoch": 2.9243431465653686,
"grad_norm": 0.898984738606933,
"learning_rate": 1.753570375247815e-08,
"loss": 0.8313743591308593,
"step": 1155
},
{
"epoch": 2.937005381449826,
"grad_norm": 0.8885810042812119,
"learning_rate": 1.2179748700879013e-08,
"loss": 0.829072380065918,
"step": 1160
},
{
"epoch": 2.949667616334283,
"grad_norm": 0.903649383117039,
"learning_rate": 7.796179090094891e-09,
"loss": 0.8449357986450196,
"step": 1165
},
{
"epoch": 2.9623298512187404,
"grad_norm": 0.9753427783203841,
"learning_rate": 4.385849505708084e-09,
"loss": 0.8176769256591797,
"step": 1170
},
{
"epoch": 2.9749920861031973,
"grad_norm": 0.8748130538451198,
"learning_rate": 1.9494247982282386e-09,
"loss": 0.8217670440673828,
"step": 1175
},
{
"epoch": 2.9876543209876543,
"grad_norm": 0.9286601621176515,
"learning_rate": 4.87379953478806e-10,
"loss": 0.8410984992980957,
"step": 1180
},
{
"epoch": 3.0,
"grad_norm": 0.9389103063767028,
"learning_rate": 0.0,
"loss": 0.8439925193786622,
"step": 1185
}
],
"logging_steps": 5,
"max_steps": 1185,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.9001299950934426e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}