Files
llama-3-8b-base-sft-ultrachat-8xh200/trainer_state.json
ModelHub XC e9b9bc6186 初始化项目,由ModelHub XC社区提供模型
Model: W-61/llama-3-8b-base-sft-ultrachat-8xh200
Source: Original Platform
2026-05-09 12:48:29 +08:00

1392 lines
34 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 936,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010683760683760685,
"grad_norm": 4.745241496830083e+18,
"learning_rate": 0.0,
"loss": 1.5719,
"step": 1
},
{
"epoch": 0.005341880341880342,
"grad_norm": 3565.890869140625,
"learning_rate": 8.510638297872341e-07,
"loss": 1.5783,
"step": 5
},
{
"epoch": 0.010683760683760684,
"grad_norm": 37.561458587646484,
"learning_rate": 1.9148936170212767e-06,
"loss": 1.5379,
"step": 10
},
{
"epoch": 0.016025641025641024,
"grad_norm": 5.307524681091309,
"learning_rate": 2.978723404255319e-06,
"loss": 1.439,
"step": 15
},
{
"epoch": 0.021367521367521368,
"grad_norm": 2.2638120651245117,
"learning_rate": 4.042553191489362e-06,
"loss": 1.3521,
"step": 20
},
{
"epoch": 0.026709401709401708,
"grad_norm": 3.3778412342071533,
"learning_rate": 5.106382978723404e-06,
"loss": 1.2871,
"step": 25
},
{
"epoch": 0.03205128205128205,
"grad_norm": 1.7132008075714111,
"learning_rate": 6.170212765957447e-06,
"loss": 1.2607,
"step": 30
},
{
"epoch": 0.03739316239316239,
"grad_norm": 1.8523601293563843,
"learning_rate": 7.234042553191491e-06,
"loss": 1.2357,
"step": 35
},
{
"epoch": 0.042735042735042736,
"grad_norm": 1.5859768390655518,
"learning_rate": 8.297872340425532e-06,
"loss": 1.2121,
"step": 40
},
{
"epoch": 0.04807692307692308,
"grad_norm": 1.557377815246582,
"learning_rate": 9.361702127659576e-06,
"loss": 1.1803,
"step": 45
},
{
"epoch": 0.053418803418803416,
"grad_norm": 1.4963977336883545,
"learning_rate": 1.0425531914893619e-05,
"loss": 1.1915,
"step": 50
},
{
"epoch": 0.05876068376068376,
"grad_norm": 1.3628768920898438,
"learning_rate": 1.1489361702127662e-05,
"loss": 1.1701,
"step": 55
},
{
"epoch": 0.0641025641025641,
"grad_norm": 2.2901501655578613,
"learning_rate": 1.2553191489361702e-05,
"loss": 1.1598,
"step": 60
},
{
"epoch": 0.06944444444444445,
"grad_norm": 1.3150509595870972,
"learning_rate": 1.3617021276595745e-05,
"loss": 1.1611,
"step": 65
},
{
"epoch": 0.07478632478632478,
"grad_norm": 1.1793471574783325,
"learning_rate": 1.4680851063829789e-05,
"loss": 1.1656,
"step": 70
},
{
"epoch": 0.08012820512820513,
"grad_norm": 1.3063637018203735,
"learning_rate": 1.5744680851063832e-05,
"loss": 1.1599,
"step": 75
},
{
"epoch": 0.08547008547008547,
"grad_norm": 1.6266546249389648,
"learning_rate": 1.6808510638297873e-05,
"loss": 1.1404,
"step": 80
},
{
"epoch": 0.09081196581196581,
"grad_norm": 1.2766536474227905,
"learning_rate": 1.7872340425531915e-05,
"loss": 1.1515,
"step": 85
},
{
"epoch": 0.09615384615384616,
"grad_norm": 1.1077594757080078,
"learning_rate": 1.893617021276596e-05,
"loss": 1.145,
"step": 90
},
{
"epoch": 0.1014957264957265,
"grad_norm": 1.000414252281189,
"learning_rate": 2e-05,
"loss": 1.148,
"step": 95
},
{
"epoch": 0.10683760683760683,
"grad_norm": 1.1821677684783936,
"learning_rate": 1.9998259904917257e-05,
"loss": 1.1424,
"step": 100
},
{
"epoch": 0.11217948717948718,
"grad_norm": 1.3549473285675049,
"learning_rate": 1.9993040225255205e-05,
"loss": 1.1624,
"step": 105
},
{
"epoch": 0.11752136752136752,
"grad_norm": 0.9064257144927979,
"learning_rate": 1.998434277756163e-05,
"loss": 1.1321,
"step": 110
},
{
"epoch": 0.12286324786324786,
"grad_norm": 1.0382879972457886,
"learning_rate": 1.9972170588713715e-05,
"loss": 1.1526,
"step": 115
},
{
"epoch": 0.1282051282051282,
"grad_norm": 1.089872121810913,
"learning_rate": 1.9956527894864662e-05,
"loss": 1.1479,
"step": 120
},
{
"epoch": 0.13354700854700854,
"grad_norm": 1.2133479118347168,
"learning_rate": 1.9937420139969397e-05,
"loss": 1.1415,
"step": 125
},
{
"epoch": 0.1388888888888889,
"grad_norm": 0.9869184494018555,
"learning_rate": 1.9914853973889988e-05,
"loss": 1.1373,
"step": 130
},
{
"epoch": 0.14423076923076922,
"grad_norm": 1.0042775869369507,
"learning_rate": 1.988883725008136e-05,
"loss": 1.1368,
"step": 135
},
{
"epoch": 0.14957264957264957,
"grad_norm": 1.358765959739685,
"learning_rate": 1.985937902285815e-05,
"loss": 1.1299,
"step": 140
},
{
"epoch": 0.15491452991452992,
"grad_norm": 1.0506223440170288,
"learning_rate": 1.9826489544243623e-05,
"loss": 1.1424,
"step": 145
},
{
"epoch": 0.16025641025641027,
"grad_norm": 0.8935067057609558,
"learning_rate": 1.9790180260401778e-05,
"loss": 1.1409,
"step": 150
},
{
"epoch": 0.1655982905982906,
"grad_norm": 0.910125195980072,
"learning_rate": 1.9750463807653873e-05,
"loss": 1.1436,
"step": 155
},
{
"epoch": 0.17094017094017094,
"grad_norm": 0.9480465650558472,
"learning_rate": 1.9707354008080736e-05,
"loss": 1.1289,
"step": 160
},
{
"epoch": 0.1762820512820513,
"grad_norm": 1.0701904296875,
"learning_rate": 1.9660865864712413e-05,
"loss": 1.1308,
"step": 165
},
{
"epoch": 0.18162393162393162,
"grad_norm": 0.8851361870765686,
"learning_rate": 1.9611015556306845e-05,
"loss": 1.1171,
"step": 170
},
{
"epoch": 0.18696581196581197,
"grad_norm": 0.9653987884521484,
"learning_rate": 1.9557820431719333e-05,
"loss": 1.128,
"step": 175
},
{
"epoch": 0.19230769230769232,
"grad_norm": 0.8165105581283569,
"learning_rate": 1.9501299003864828e-05,
"loss": 1.1329,
"step": 180
},
{
"epoch": 0.19764957264957264,
"grad_norm": 0.7411690354347229,
"learning_rate": 1.944147094327506e-05,
"loss": 1.1363,
"step": 185
},
{
"epoch": 0.202991452991453,
"grad_norm": 0.8322554230690002,
"learning_rate": 1.937835707125284e-05,
"loss": 1.1302,
"step": 190
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.7390133738517761,
"learning_rate": 1.9311979352625837e-05,
"loss": 1.1375,
"step": 195
},
{
"epoch": 0.21367521367521367,
"grad_norm": 0.7468808889389038,
"learning_rate": 1.924236088810241e-05,
"loss": 1.1453,
"step": 200
},
{
"epoch": 0.21367521367521367,
"eval_loss": 1.1343013048171997,
"eval_runtime": 121.3407,
"eval_samples_per_second": 109.164,
"eval_steps_per_second": 0.857,
"step": 200
},
{
"epoch": 0.21901709401709402,
"grad_norm": 0.8430936932563782,
"learning_rate": 1.916952590623212e-05,
"loss": 1.1097,
"step": 205
},
{
"epoch": 0.22435897435897437,
"grad_norm": 0.8264766335487366,
"learning_rate": 1.909349975497372e-05,
"loss": 1.1243,
"step": 210
},
{
"epoch": 0.2297008547008547,
"grad_norm": 0.7218984961509705,
"learning_rate": 1.9014308892873612e-05,
"loss": 1.1454,
"step": 215
},
{
"epoch": 0.23504273504273504,
"grad_norm": 0.950661301612854,
"learning_rate": 1.8931980879857737e-05,
"loss": 1.113,
"step": 220
},
{
"epoch": 0.2403846153846154,
"grad_norm": 0.8158000707626343,
"learning_rate": 1.8846544367640218e-05,
"loss": 1.125,
"step": 225
},
{
"epoch": 0.24572649572649571,
"grad_norm": 0.8023744225502014,
"learning_rate": 1.8758029089752023e-05,
"loss": 1.1271,
"step": 230
},
{
"epoch": 0.25106837606837606,
"grad_norm": 0.7522373795509338,
"learning_rate": 1.86664658511931e-05,
"loss": 1.1027,
"step": 235
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.7865638136863708,
"learning_rate": 1.85718865177117e-05,
"loss": 1.1091,
"step": 240
},
{
"epoch": 0.26175213675213677,
"grad_norm": 0.7381576895713806,
"learning_rate": 1.847432400471443e-05,
"loss": 1.1202,
"step": 245
},
{
"epoch": 0.2670940170940171,
"grad_norm": 0.8119399547576904,
"learning_rate": 1.8373812265811126e-05,
"loss": 1.1246,
"step": 250
},
{
"epoch": 0.2724358974358974,
"grad_norm": 0.8148014545440674,
"learning_rate": 1.827038628099831e-05,
"loss": 1.1305,
"step": 255
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.7722187042236328,
"learning_rate": 1.81640820444855e-05,
"loss": 1.1103,
"step": 260
},
{
"epoch": 0.2831196581196581,
"grad_norm": 0.7955113053321838,
"learning_rate": 1.8054936552168548e-05,
"loss": 1.1134,
"step": 265
},
{
"epoch": 0.28846153846153844,
"grad_norm": 0.8004475235939026,
"learning_rate": 1.7942987788754348e-05,
"loss": 1.1135,
"step": 270
},
{
"epoch": 0.2938034188034188,
"grad_norm": 0.786965012550354,
"learning_rate": 1.7828274714541445e-05,
"loss": 1.1247,
"step": 275
},
{
"epoch": 0.29914529914529914,
"grad_norm": 0.7405045032501221,
"learning_rate": 1.771083725186111e-05,
"loss": 1.1107,
"step": 280
},
{
"epoch": 0.30448717948717946,
"grad_norm": 0.7558034062385559,
"learning_rate": 1.759071627118362e-05,
"loss": 1.1081,
"step": 285
},
{
"epoch": 0.30982905982905984,
"grad_norm": 0.7463499903678894,
"learning_rate": 1.746795357689453e-05,
"loss": 1.1193,
"step": 290
},
{
"epoch": 0.31517094017094016,
"grad_norm": 0.7769708633422852,
"learning_rate": 1.7342591892745978e-05,
"loss": 1.1026,
"step": 295
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.7373056411743164,
"learning_rate": 1.7214674846987992e-05,
"loss": 1.1107,
"step": 300
},
{
"epoch": 0.32585470085470086,
"grad_norm": 0.7885181307792664,
"learning_rate": 1.7084246957185036e-05,
"loss": 1.1282,
"step": 305
},
{
"epoch": 0.3311965811965812,
"grad_norm": 0.7831401824951172,
"learning_rate": 1.695135361472305e-05,
"loss": 1.1166,
"step": 310
},
{
"epoch": 0.33653846153846156,
"grad_norm": 0.7612439393997192,
"learning_rate": 1.681604106901239e-05,
"loss": 1.1096,
"step": 315
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.8480131030082703,
"learning_rate": 1.6678356411392135e-05,
"loss": 1.1128,
"step": 320
},
{
"epoch": 0.3472222222222222,
"grad_norm": 0.7431190609931946,
"learning_rate": 1.6538347558741424e-05,
"loss": 1.1052,
"step": 325
},
{
"epoch": 0.3525641025641026,
"grad_norm": 0.8087280988693237,
"learning_rate": 1.6396063236803465e-05,
"loss": 1.1119,
"step": 330
},
{
"epoch": 0.3579059829059829,
"grad_norm": 0.7224681973457336,
"learning_rate": 1.625155296322805e-05,
"loss": 1.1182,
"step": 335
},
{
"epoch": 0.36324786324786323,
"grad_norm": 0.8140762448310852,
"learning_rate": 1.610486703033847e-05,
"loss": 1.093,
"step": 340
},
{
"epoch": 0.3685897435897436,
"grad_norm": 0.7991831302642822,
"learning_rate": 1.5956056487628832e-05,
"loss": 1.1214,
"step": 345
},
{
"epoch": 0.37393162393162394,
"grad_norm": 0.7950995564460754,
"learning_rate": 1.5805173123997856e-05,
"loss": 1.1171,
"step": 350
},
{
"epoch": 0.37927350427350426,
"grad_norm": 0.7558692097663879,
"learning_rate": 1.5652269449725375e-05,
"loss": 1.0817,
"step": 355
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.8574461936950684,
"learning_rate": 1.549739867819773e-05,
"loss": 1.1228,
"step": 360
},
{
"epoch": 0.38995726495726496,
"grad_norm": 0.7260861992835999,
"learning_rate": 1.534061470738852e-05,
"loss": 1.0933,
"step": 365
},
{
"epoch": 0.3952991452991453,
"grad_norm": 0.7083914875984192,
"learning_rate": 1.5181972101101083e-05,
"loss": 1.0952,
"step": 370
},
{
"epoch": 0.40064102564102566,
"grad_norm": 0.6901055574417114,
"learning_rate": 1.5021526069979232e-05,
"loss": 1.1023,
"step": 375
},
{
"epoch": 0.405982905982906,
"grad_norm": 0.8109995722770691,
"learning_rate": 1.4859332452292937e-05,
"loss": 1.1167,
"step": 380
},
{
"epoch": 0.4113247863247863,
"grad_norm": 0.7325941324234009,
"learning_rate": 1.4695447694505512e-05,
"loss": 1.11,
"step": 385
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.7975173592567444,
"learning_rate": 1.4529928831629185e-05,
"loss": 1.0997,
"step": 390
},
{
"epoch": 0.422008547008547,
"grad_norm": 0.7777805328369141,
"learning_rate": 1.4362833467375839e-05,
"loss": 1.1172,
"step": 395
},
{
"epoch": 0.42735042735042733,
"grad_norm": 0.7026362419128418,
"learning_rate": 1.4194219754109812e-05,
"loss": 1.0929,
"step": 400
},
{
"epoch": 0.42735042735042733,
"eval_loss": 1.1095771789550781,
"eval_runtime": 121.5104,
"eval_samples_per_second": 109.011,
"eval_steps_per_second": 0.856,
"step": 400
},
{
"epoch": 0.4326923076923077,
"grad_norm": 0.690746545791626,
"learning_rate": 1.402414637260977e-05,
"loss": 1.1014,
"step": 405
},
{
"epoch": 0.43803418803418803,
"grad_norm": 0.7299688458442688,
"learning_rate": 1.3852672511646683e-05,
"loss": 1.1059,
"step": 410
},
{
"epoch": 0.44337606837606836,
"grad_norm": 0.7541385889053345,
"learning_rate": 1.367985784738501e-05,
"loss": 1.0929,
"step": 415
},
{
"epoch": 0.44871794871794873,
"grad_norm": 0.7011000514030457,
"learning_rate": 1.350576252261425e-05,
"loss": 1.069,
"step": 420
},
{
"epoch": 0.45405982905982906,
"grad_norm": 0.7681401371955872,
"learning_rate": 1.3330447125818115e-05,
"loss": 1.1181,
"step": 425
},
{
"epoch": 0.4594017094017094,
"grad_norm": 0.7124375104904175,
"learning_rate": 1.3153972670088584e-05,
"loss": 1.084,
"step": 430
},
{
"epoch": 0.46474358974358976,
"grad_norm": 0.7539063096046448,
"learning_rate": 1.2976400571892189e-05,
"loss": 1.0933,
"step": 435
},
{
"epoch": 0.4700854700854701,
"grad_norm": 0.7165161967277527,
"learning_rate": 1.2797792629695909e-05,
"loss": 1.1012,
"step": 440
},
{
"epoch": 0.4754273504273504,
"grad_norm": 0.714011549949646,
"learning_rate": 1.2618211002460135e-05,
"loss": 1.093,
"step": 445
},
{
"epoch": 0.4807692307692308,
"grad_norm": 0.7356306910514832,
"learning_rate": 1.2437718188006165e-05,
"loss": 1.0975,
"step": 450
},
{
"epoch": 0.4861111111111111,
"grad_norm": 0.9547709822654724,
"learning_rate": 1.2256377001265785e-05,
"loss": 1.0849,
"step": 455
},
{
"epoch": 0.49145299145299143,
"grad_norm": 0.7575970888137817,
"learning_rate": 1.2074250552420459e-05,
"loss": 1.1048,
"step": 460
},
{
"epoch": 0.4967948717948718,
"grad_norm": 0.7571170926094055,
"learning_rate": 1.1891402224937805e-05,
"loss": 1.0833,
"step": 465
},
{
"epoch": 0.5021367521367521,
"grad_norm": 0.7408652901649475,
"learning_rate": 1.170789565351293e-05,
"loss": 1.091,
"step": 470
},
{
"epoch": 0.5074786324786325,
"grad_norm": 0.7377583384513855,
"learning_rate": 1.1523794701922351e-05,
"loss": 1.0995,
"step": 475
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.7226018309593201,
"learning_rate": 1.1339163440798187e-05,
"loss": 1.0883,
"step": 480
},
{
"epoch": 0.5181623931623932,
"grad_norm": 0.6740500330924988,
"learning_rate": 1.1154066125330358e-05,
"loss": 1.0853,
"step": 485
},
{
"epoch": 0.5235042735042735,
"grad_norm": 0.7039321660995483,
"learning_rate": 1.0968567172904558e-05,
"loss": 1.0793,
"step": 490
},
{
"epoch": 0.5288461538461539,
"grad_norm": 0.6822574734687805,
"learning_rate": 1.0782731140683786e-05,
"loss": 1.084,
"step": 495
},
{
"epoch": 0.5341880341880342,
"grad_norm": 0.6553980112075806,
"learning_rate": 1.0596622703141209e-05,
"loss": 1.0718,
"step": 500
},
{
"epoch": 0.5395299145299145,
"grad_norm": 0.70383620262146,
"learning_rate": 1.0410306629552231e-05,
"loss": 1.064,
"step": 505
},
{
"epoch": 0.5448717948717948,
"grad_norm": 0.6900469064712524,
"learning_rate": 1.0223847761453558e-05,
"loss": 1.0768,
"step": 510
},
{
"epoch": 0.5502136752136753,
"grad_norm": 0.689424455165863,
"learning_rate": 1.0037310990077083e-05,
"loss": 1.0839,
"step": 515
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.6523409485816956,
"learning_rate": 9.850761233766537e-06,
"loss": 1.103,
"step": 520
},
{
"epoch": 0.5608974358974359,
"grad_norm": 0.718777596950531,
"learning_rate": 9.664263415384644e-06,
"loss": 1.0881,
"step": 525
},
{
"epoch": 0.5662393162393162,
"grad_norm": 0.7004057168960571,
"learning_rate": 9.47788243971875e-06,
"loss": 1.0919,
"step": 530
},
{
"epoch": 0.5715811965811965,
"grad_norm": 0.6746854186058044,
"learning_rate": 9.291683170892712e-06,
"loss": 1.0911,
"step": 535
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.716135561466217,
"learning_rate": 9.10573040979294e-06,
"loss": 1.0774,
"step": 540
},
{
"epoch": 0.5822649572649573,
"grad_norm": 0.6947251558303833,
"learning_rate": 8.920088871516482e-06,
"loss": 1.0855,
"step": 545
},
{
"epoch": 0.5876068376068376,
"grad_norm": 0.6928468346595764,
"learning_rate": 8.734823162848919e-06,
"loss": 1.0743,
"step": 550
},
{
"epoch": 0.592948717948718,
"grad_norm": 0.6887853145599365,
"learning_rate": 8.549997759779981e-06,
"loss": 1.0879,
"step": 555
},
{
"epoch": 0.5982905982905983,
"grad_norm": 0.6782488822937012,
"learning_rate": 8.365676985064684e-06,
"loss": 1.078,
"step": 560
},
{
"epoch": 0.6036324786324786,
"grad_norm": 0.6661266088485718,
"learning_rate": 8.181924985837762e-06,
"loss": 1.0582,
"step": 565
},
{
"epoch": 0.6089743589743589,
"grad_norm": 0.7845523953437805,
"learning_rate": 7.998805711289281e-06,
"loss": 1.0851,
"step": 570
},
{
"epoch": 0.6143162393162394,
"grad_norm": 0.6643468141555786,
"learning_rate": 7.81638289040908e-06,
"loss": 1.0725,
"step": 575
},
{
"epoch": 0.6196581196581197,
"grad_norm": 0.6565442085266113,
"learning_rate": 7.634720009807879e-06,
"loss": 1.0822,
"step": 580
},
{
"epoch": 0.625,
"grad_norm": 0.6695041656494141,
"learning_rate": 7.453880291622726e-06,
"loss": 1.0824,
"step": 585
},
{
"epoch": 0.6303418803418803,
"grad_norm": 0.6693868041038513,
"learning_rate": 7.273926671514503e-06,
"loss": 1.0872,
"step": 590
},
{
"epoch": 0.6356837606837606,
"grad_norm": 0.6269449591636658,
"learning_rate": 7.094921776765095e-06,
"loss": 1.077,
"step": 595
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.6727346181869507,
"learning_rate": 6.916927904481934e-06,
"loss": 1.0808,
"step": 600
},
{
"epoch": 0.6410256410256411,
"eval_loss": 1.0848172903060913,
"eval_runtime": 121.4332,
"eval_samples_per_second": 109.081,
"eval_steps_per_second": 0.856,
"step": 600
},
{
"epoch": 0.6463675213675214,
"grad_norm": 0.6319820880889893,
"learning_rate": 6.740006999917406e-06,
"loss": 1.0898,
"step": 605
},
{
"epoch": 0.6517094017094017,
"grad_norm": 0.6709337830543518,
"learning_rate": 6.56422063491072e-06,
"loss": 1.0778,
"step": 610
},
{
"epoch": 0.657051282051282,
"grad_norm": 0.6739898920059204,
"learning_rate": 6.389629986459756e-06,
"loss": 1.0536,
"step": 615
},
{
"epoch": 0.6623931623931624,
"grad_norm": 0.6539934873580933,
"learning_rate": 6.216295815430277e-06,
"loss": 1.0579,
"step": 620
},
{
"epoch": 0.6677350427350427,
"grad_norm": 0.6796319484710693,
"learning_rate": 6.044278445410025e-06,
"loss": 1.0788,
"step": 625
},
{
"epoch": 0.6730769230769231,
"grad_norm": 0.6338521242141724,
"learning_rate": 5.873637741714941e-06,
"loss": 1.0553,
"step": 630
},
{
"epoch": 0.6784188034188035,
"grad_norm": 0.6625267267227173,
"learning_rate": 5.704433090554912e-06,
"loss": 1.0889,
"step": 635
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.6456872820854187,
"learning_rate": 5.536723378366226e-06,
"loss": 1.0694,
"step": 640
},
{
"epoch": 0.6891025641025641,
"grad_norm": 0.6204415559768677,
"learning_rate": 5.37056697131799e-06,
"loss": 1.0883,
"step": 645
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.6364960670471191,
"learning_rate": 5.206021694999571e-06,
"loss": 1.0727,
"step": 650
},
{
"epoch": 0.6997863247863247,
"grad_norm": 0.6018571853637695,
"learning_rate": 5.043144814296214e-06,
"loss": 1.0509,
"step": 655
},
{
"epoch": 0.7051282051282052,
"grad_norm": 0.648232102394104,
"learning_rate": 4.881993013459762e-06,
"loss": 1.0766,
"step": 660
},
{
"epoch": 0.7104700854700855,
"grad_norm": 0.6507673263549805,
"learning_rate": 4.722622376381455e-06,
"loss": 1.0764,
"step": 665
},
{
"epoch": 0.7158119658119658,
"grad_norm": 0.62617427110672,
"learning_rate": 4.565088367073675e-06,
"loss": 1.0583,
"step": 670
},
{
"epoch": 0.7211538461538461,
"grad_norm": 0.642150342464447,
"learning_rate": 4.409445810367421e-06,
"loss": 1.0596,
"step": 675
},
{
"epoch": 0.7264957264957265,
"grad_norm": 0.651789665222168,
"learning_rate": 4.255748872832201e-06,
"loss": 1.064,
"step": 680
},
{
"epoch": 0.7318376068376068,
"grad_norm": 0.6348583698272705,
"learning_rate": 4.104051043925068e-06,
"loss": 1.085,
"step": 685
},
{
"epoch": 0.7371794871794872,
"grad_norm": 0.6330149173736572,
"learning_rate": 3.9544051173752504e-06,
"loss": 1.0775,
"step": 690
},
{
"epoch": 0.7425213675213675,
"grad_norm": 0.6291777491569519,
"learning_rate": 3.8068631728109364e-06,
"loss": 1.081,
"step": 695
},
{
"epoch": 0.7478632478632479,
"grad_norm": 0.6168299913406372,
"learning_rate": 3.6614765576345755e-06,
"loss": 1.0543,
"step": 700
},
{
"epoch": 0.7532051282051282,
"grad_norm": 0.6146724820137024,
"learning_rate": 3.5182958691529945e-06,
"loss": 1.0612,
"step": 705
},
{
"epoch": 0.7585470085470085,
"grad_norm": 0.6585379838943481,
"learning_rate": 3.3773709369685924e-06,
"loss": 1.0629,
"step": 710
},
{
"epoch": 0.7638888888888888,
"grad_norm": 0.6729740500450134,
"learning_rate": 3.2387508056376726e-06,
"loss": 1.0638,
"step": 715
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.6358993053436279,
"learning_rate": 3.1024837176020173e-06,
"loss": 1.0638,
"step": 720
},
{
"epoch": 0.7745726495726496,
"grad_norm": 0.6569262146949768,
"learning_rate": 2.968617096399592e-06,
"loss": 1.0589,
"step": 725
},
{
"epoch": 0.7799145299145299,
"grad_norm": 0.608600378036499,
"learning_rate": 2.8371975301602572e-06,
"loss": 1.0911,
"step": 730
},
{
"epoch": 0.7852564102564102,
"grad_norm": 0.6112855076789856,
"learning_rate": 2.708270755392207e-06,
"loss": 1.0705,
"step": 735
},
{
"epoch": 0.7905982905982906,
"grad_norm": 0.6621512174606323,
"learning_rate": 2.581881641064806e-06,
"loss": 1.0614,
"step": 740
},
{
"epoch": 0.7959401709401709,
"grad_norm": 0.6552026271820068,
"learning_rate": 2.4580741729933246e-06,
"loss": 1.0683,
"step": 745
},
{
"epoch": 0.8012820512820513,
"grad_norm": 0.6412573456764221,
"learning_rate": 2.3368914385310415e-06,
"loss": 1.0498,
"step": 750
},
{
"epoch": 0.8066239316239316,
"grad_norm": 0.6315770745277405,
"learning_rate": 2.2183756115740274e-06,
"loss": 1.0629,
"step": 755
},
{
"epoch": 0.811965811965812,
"grad_norm": 0.6639691591262817,
"learning_rate": 2.1025679378838247e-06,
"loss": 1.0596,
"step": 760
},
{
"epoch": 0.8173076923076923,
"grad_norm": 0.6195425391197205,
"learning_rate": 1.9895087207331422e-06,
"loss": 1.0579,
"step": 765
},
{
"epoch": 0.8226495726495726,
"grad_norm": 0.630287766456604,
"learning_rate": 1.8792373068795422e-06,
"loss": 1.0606,
"step": 770
},
{
"epoch": 0.8279914529914529,
"grad_norm": 0.6329755187034607,
"learning_rate": 1.7717920728720284e-06,
"loss": 1.055,
"step": 775
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.6025441884994507,
"learning_rate": 1.6672104116952748e-06,
"loss": 1.0719,
"step": 780
},
{
"epoch": 0.8386752136752137,
"grad_norm": 0.6310837268829346,
"learning_rate": 1.5655287197561497e-06,
"loss": 1.0692,
"step": 785
},
{
"epoch": 0.844017094017094,
"grad_norm": 0.6214393377304077,
"learning_rate": 1.4667823842170837e-06,
"loss": 1.0737,
"step": 790
},
{
"epoch": 0.8493589743589743,
"grad_norm": 0.653542160987854,
"learning_rate": 1.371005770680659e-06,
"loss": 1.0525,
"step": 795
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.6011979579925537,
"learning_rate": 1.2782322112297274e-06,
"loss": 1.0529,
"step": 800
},
{
"epoch": 0.8547008547008547,
"eval_loss": 1.0704631805419922,
"eval_runtime": 121.3074,
"eval_samples_per_second": 109.194,
"eval_steps_per_second": 0.857,
"step": 800
},
{
"epoch": 0.8600427350427351,
"grad_norm": 0.6574413776397705,
"learning_rate": 1.188493992827211e-06,
"loss": 1.0474,
"step": 805
},
{
"epoch": 0.8653846153846154,
"grad_norm": 0.5898586511611938,
"learning_rate": 1.101822346079625e-06,
"loss": 1.0446,
"step": 810
},
{
"epoch": 0.8707264957264957,
"grad_norm": 0.6238599419593811,
"learning_rate": 1.0182474343682346e-06,
"loss": 1.0539,
"step": 815
},
{
"epoch": 0.8760683760683761,
"grad_norm": 0.6406736373901367,
"learning_rate": 9.377983433516181e-07,
"loss": 1.0657,
"step": 820
},
{
"epoch": 0.8814102564102564,
"grad_norm": 0.6025185585021973,
"learning_rate": 8.605030708433149e-07,
"loss": 1.0585,
"step": 825
},
{
"epoch": 0.8867521367521367,
"grad_norm": 0.6066309213638306,
"learning_rate": 7.863885170680486e-07,
"loss": 1.0596,
"step": 830
},
{
"epoch": 0.8920940170940171,
"grad_norm": 0.6598129272460938,
"learning_rate": 7.154804752999344e-07,
"loss": 1.0407,
"step": 835
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.6132450699806213,
"learning_rate": 6.478036228859363e-07,
"loss": 1.0487,
"step": 840
},
{
"epoch": 0.9027777777777778,
"grad_norm": 0.6001634001731873,
"learning_rate": 5.833815126576714e-07,
"loss": 1.0485,
"step": 845
},
{
"epoch": 0.9081196581196581,
"grad_norm": 0.6118801236152649,
"learning_rate": 5.222365647345862e-07,
"loss": 1.0666,
"step": 850
},
{
"epoch": 0.9134615384615384,
"grad_norm": 0.6073253750801086,
"learning_rate": 4.6439005872132457e-07,
"loss": 1.0574,
"step": 855
},
{
"epoch": 0.9188034188034188,
"grad_norm": 0.610341489315033,
"learning_rate": 4.0986212630201974e-07,
"loss": 1.0599,
"step": 860
},
{
"epoch": 0.9241452991452992,
"grad_norm": 0.589881420135498,
"learning_rate": 3.58671744234087e-07,
"loss": 1.073,
"step": 865
},
{
"epoch": 0.9294871794871795,
"grad_norm": 0.6028986573219299,
"learning_rate": 3.1083672774395055e-07,
"loss": 1.0768,
"step": 870
},
{
"epoch": 0.9348290598290598,
"grad_norm": 0.5990201830863953,
"learning_rate": 2.6637372432700483e-07,
"loss": 1.058,
"step": 875
},
{
"epoch": 0.9401709401709402,
"grad_norm": 0.621417760848999,
"learning_rate": 2.2529820795397228e-07,
"loss": 1.066,
"step": 880
},
{
"epoch": 0.9455128205128205,
"grad_norm": 0.5974612236022949,
"learning_rate": 1.8762447368566582e-07,
"loss": 1.0543,
"step": 885
},
{
"epoch": 0.9508547008547008,
"grad_norm": 0.60768061876297,
"learning_rate": 1.5336563269803372e-07,
"loss": 1.0598,
"step": 890
},
{
"epoch": 0.9561965811965812,
"grad_norm": 0.605929434299469,
"learning_rate": 1.225336077192274e-07,
"loss": 1.0558,
"step": 895
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.6178026795387268,
"learning_rate": 9.513912888025611e-08,
"loss": 1.0661,
"step": 900
},
{
"epoch": 0.9668803418803419,
"grad_norm": 0.603589653968811,
"learning_rate": 7.119172998070412e-08,
"loss": 1.0557,
"step": 905
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.6099143028259277,
"learning_rate": 5.0699745170785796e-08,
"loss": 1.0732,
"step": 910
},
{
"epoch": 0.9775641025641025,
"grad_norm": 0.6262799501419067,
"learning_rate": 3.367030605090249e-08,
"loss": 1.0671,
"step": 915
},
{
"epoch": 0.9829059829059829,
"grad_norm": 0.6268653273582458,
"learning_rate": 2.010933918970781e-08,
"loss": 1.0524,
"step": 920
},
{
"epoch": 0.9882478632478633,
"grad_norm": 0.5946672558784485,
"learning_rate": 1.0021564061554189e-08,
"loss": 1.0477,
"step": 925
},
{
"epoch": 0.9935897435897436,
"grad_norm": 0.6294344067573547,
"learning_rate": 3.410491404017835e-09,
"loss": 1.0792,
"step": 930
},
{
"epoch": 0.9989316239316239,
"grad_norm": 0.5966866612434387,
"learning_rate": 2.784219961060597e-10,
"loss": 1.0449,
"step": 935
},
{
"epoch": 1.0,
"step": 936,
"total_flos": 1.381094473127166e+18,
"train_loss": 1.105873676828849,
"train_runtime": 5784.2377,
"train_samples_per_second": 20.701,
"train_steps_per_second": 0.162
}
],
"logging_steps": 5,
"max_steps": 936,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.381094473127166e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}