Files
qwen2.5vl-3b-sampled_10000_…/trainer_state.json
ModelHub XC 07a53acfa4 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-sampled_10000_qwen2.5vl32b
Source: Original Platform
2026-05-22 17:04:14 +08:00

1181 lines
33 KiB
JSON

{
"best_global_step": 300,
"best_metric": 0.22917783,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v27-20250507-113338/checkpoint-300",
"epoch": 2.9826262626262627,
"eval_steps": 20,
"global_step": 462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006464646464646465,
"grad_norm": 2.5242867469787598,
"learning_rate": 9.999884400986087e-06,
"loss": 0.39474862813949585,
"memory(GiB)": 28.84,
"step": 1,
"token_acc": 0.8908829863603733,
"train_speed(iter/s)": 0.064293
},
{
"epoch": 0.03232323232323232,
"grad_norm": 1.5711474418640137,
"learning_rate": 9.997110291906109e-06,
"loss": 0.3434034585952759,
"memory(GiB)": 30.62,
"step": 5,
"token_acc": 0.8817614172656647,
"train_speed(iter/s)": 0.119566
},
{
"epoch": 0.06464646464646465,
"grad_norm": 0.9338254928588867,
"learning_rate": 9.988444507789584e-06,
"loss": 0.2892385244369507,
"memory(GiB)": 30.62,
"step": 10,
"token_acc": 0.9103395025620628,
"train_speed(iter/s)": 0.1364
},
{
"epoch": 0.09696969696969697,
"grad_norm": 0.9935480952262878,
"learning_rate": 9.97401266428502e-06,
"loss": 0.30152087211608886,
"memory(GiB)": 32.44,
"step": 15,
"token_acc": 0.9016964442328413,
"train_speed(iter/s)": 0.141481
},
{
"epoch": 0.1292929292929293,
"grad_norm": 0.8869792819023132,
"learning_rate": 9.953831442918418e-06,
"loss": 0.2820048570632935,
"memory(GiB)": 32.44,
"step": 20,
"token_acc": 0.9094200925673837,
"train_speed(iter/s)": 0.144923
},
{
"epoch": 0.1292929292929293,
"eval_loss": 0.2714148759841919,
"eval_runtime": 4.8691,
"eval_samples_per_second": 20.538,
"eval_steps_per_second": 5.134,
"eval_token_acc": 0.9161811841070817,
"step": 20
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.9623355865478516,
"learning_rate": 9.927924170825266e-06,
"loss": 0.28305883407592775,
"memory(GiB)": 32.44,
"step": 25,
"token_acc": 0.8970619818736308,
"train_speed(iter/s)": 0.133319
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.806711733341217,
"learning_rate": 9.896320793787106e-06,
"loss": 0.2747792720794678,
"memory(GiB)": 32.44,
"step": 30,
"token_acc": 0.9068442528293171,
"train_speed(iter/s)": 0.138408
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.8571366667747498,
"learning_rate": 9.859057841617709e-06,
"loss": 0.2719248294830322,
"memory(GiB)": 32.44,
"step": 35,
"token_acc": 0.9210450095580143,
"train_speed(iter/s)": 0.14047
},
{
"epoch": 0.2585858585858586,
"grad_norm": 0.8261837363243103,
"learning_rate": 9.816178385938867e-06,
"loss": 0.2738617420196533,
"memory(GiB)": 32.45,
"step": 40,
"token_acc": 0.9066808952792833,
"train_speed(iter/s)": 0.142017
},
{
"epoch": 0.2585858585858586,
"eval_loss": 0.2558521330356598,
"eval_runtime": 4.8838,
"eval_samples_per_second": 20.476,
"eval_steps_per_second": 5.119,
"eval_token_acc": 0.9202935069647429,
"step": 40
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.7319702506065369,
"learning_rate": 9.767731990394638e-06,
"loss": 0.2567479133605957,
"memory(GiB)": 32.45,
"step": 45,
"token_acc": 0.9167146310579026,
"train_speed(iter/s)": 0.136503
},
{
"epoch": 0.32323232323232326,
"grad_norm": 0.8507645130157471,
"learning_rate": 9.71377465336155e-06,
"loss": 0.261569881439209,
"memory(GiB)": 32.45,
"step": 50,
"token_acc": 0.9125896733273807,
"train_speed(iter/s)": 0.138728
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.8376278877258301,
"learning_rate": 9.654368743221022e-06,
"loss": 0.24444923400878907,
"memory(GiB)": 32.45,
"step": 55,
"token_acc": 0.9274953450318795,
"train_speed(iter/s)": 0.139921
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.8305687308311462,
"learning_rate": 9.589582926268798e-06,
"loss": 0.26804823875427247,
"memory(GiB)": 34.77,
"step": 60,
"token_acc": 0.9223425512494758,
"train_speed(iter/s)": 0.14136
},
{
"epoch": 0.3878787878787879,
"eval_loss": 0.2471594363451004,
"eval_runtime": 4.836,
"eval_samples_per_second": 20.678,
"eval_steps_per_second": 5.17,
"eval_token_acc": 0.9224706190658576,
"step": 60
},
{
"epoch": 0.4202020202020202,
"grad_norm": 0.6730025410652161,
"learning_rate": 9.519492087344724e-06,
"loss": 0.24588844776153565,
"memory(GiB)": 34.77,
"step": 65,
"token_acc": 0.9117032737506321,
"train_speed(iter/s)": 0.136683
},
{
"epoch": 0.45252525252525255,
"grad_norm": 0.7964938282966614,
"learning_rate": 9.444177243274619e-06,
"loss": 0.2592522859573364,
"memory(GiB)": 34.77,
"step": 70,
"token_acc": 0.9183477688849907,
"train_speed(iter/s)": 0.138829
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.6614187359809875,
"learning_rate": 9.363725449224281e-06,
"loss": 0.2513019561767578,
"memory(GiB)": 34.77,
"step": 75,
"token_acc": 0.9213027816690014,
"train_speed(iter/s)": 0.140268
},
{
"epoch": 0.5171717171717172,
"grad_norm": 0.7636239528656006,
"learning_rate": 9.278229698073889e-06,
"loss": 0.2455005168914795,
"memory(GiB)": 34.77,
"step": 80,
"token_acc": 0.9128787878787878,
"train_speed(iter/s)": 0.140852
},
{
"epoch": 0.5171717171717172,
"eval_loss": 0.24173545837402344,
"eval_runtime": 4.8448,
"eval_samples_per_second": 20.641,
"eval_steps_per_second": 5.16,
"eval_token_acc": 0.9233374322172274,
"step": 80
},
{
"epoch": 0.5494949494949495,
"grad_norm": 0.8166657090187073,
"learning_rate": 9.187788812929074e-06,
"loss": 0.2590561628341675,
"memory(GiB)": 34.77,
"step": 85,
"token_acc": 0.9130182349905385,
"train_speed(iter/s)": 0.137729
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.8091973066329956,
"learning_rate": 9.092507332892968e-06,
"loss": 0.2490919589996338,
"memory(GiB)": 34.77,
"step": 90,
"token_acc": 0.910330508950115,
"train_speed(iter/s)": 0.13867
},
{
"epoch": 0.6141414141414141,
"grad_norm": 0.8563810586929321,
"learning_rate": 8.992495392231195e-06,
"loss": 0.2534335613250732,
"memory(GiB)": 34.77,
"step": 95,
"token_acc": 0.9122233688797157,
"train_speed(iter/s)": 0.139651
},
{
"epoch": 0.6464646464646465,
"grad_norm": 0.7787972092628479,
"learning_rate": 8.88786859306952e-06,
"loss": 0.24485716819763184,
"memory(GiB)": 34.77,
"step": 100,
"token_acc": 0.9189008559751176,
"train_speed(iter/s)": 0.140544
},
{
"epoch": 0.6464646464646465,
"eval_loss": 0.23870104551315308,
"eval_runtime": 4.8543,
"eval_samples_per_second": 20.6,
"eval_steps_per_second": 5.15,
"eval_token_acc": 0.9255750196544843,
"step": 100
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.7570570707321167,
"learning_rate": 8.778747871771293e-06,
"loss": 0.25060880184173584,
"memory(GiB)": 34.77,
"step": 105,
"token_acc": 0.9076021696824148,
"train_speed(iter/s)": 0.138513
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.7018725872039795,
"learning_rate": 8.665259359149132e-06,
"loss": 0.2399357795715332,
"memory(GiB)": 34.77,
"step": 110,
"token_acc": 0.927743086529884,
"train_speed(iter/s)": 0.139328
},
{
"epoch": 0.7434343434343434,
"grad_norm": 0.7108844518661499,
"learning_rate": 8.547534234672435e-06,
"loss": 0.23419642448425293,
"memory(GiB)": 34.77,
"step": 115,
"token_acc": 0.9277124928693667,
"train_speed(iter/s)": 0.140095
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.8351752161979675,
"learning_rate": 8.425708574839221e-06,
"loss": 0.24019112586975097,
"memory(GiB)": 34.77,
"step": 120,
"token_acc": 0.9238991888760139,
"train_speed(iter/s)": 0.140762
},
{
"epoch": 0.7757575757575758,
"eval_loss": 0.23568643629550934,
"eval_runtime": 4.8562,
"eval_samples_per_second": 20.592,
"eval_steps_per_second": 5.148,
"eval_token_acc": 0.9252323260830124,
"step": 120
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.7820432186126709,
"learning_rate": 8.299923195887599e-06,
"loss": 0.23381190299987792,
"memory(GiB)": 34.77,
"step": 125,
"token_acc": 0.9166274910577414,
"train_speed(iter/s)": 0.138826
},
{
"epoch": 0.8404040404040404,
"grad_norm": 0.8153200149536133,
"learning_rate": 8.170323491028625e-06,
"loss": 0.25104479789733886,
"memory(GiB)": 34.77,
"step": 130,
"token_acc": 0.9234165067178502,
"train_speed(iter/s)": 0.139183
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.8061110973358154,
"learning_rate": 8.03705926238874e-06,
"loss": 0.24108409881591797,
"memory(GiB)": 34.77,
"step": 135,
"token_acc": 0.9225523279137268,
"train_speed(iter/s)": 0.139787
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.7487571835517883,
"learning_rate": 7.900284547855992e-06,
"loss": 0.23796701431274414,
"memory(GiB)": 34.77,
"step": 140,
"token_acc": 0.9192467460537247,
"train_speed(iter/s)": 0.140229
},
{
"epoch": 0.9050505050505051,
"eval_loss": 0.2325511872768402,
"eval_runtime": 4.8347,
"eval_samples_per_second": 20.684,
"eval_steps_per_second": 5.171,
"eval_token_acc": 0.9265224665873768,
"step": 140
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.7421184182167053,
"learning_rate": 7.760157443030234e-06,
"loss": 0.22932517528533936,
"memory(GiB)": 34.77,
"step": 145,
"token_acc": 0.9112203397203071,
"train_speed(iter/s)": 0.13847
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.7295236587524414,
"learning_rate": 7.616839918483061e-06,
"loss": 0.233046817779541,
"memory(GiB)": 34.77,
"step": 150,
"token_acc": 0.9231597652253514,
"train_speed(iter/s)": 0.138991
},
{
"epoch": 1.0,
"grad_norm": 0.7331147789955139,
"learning_rate": 7.470497632538743e-06,
"loss": 0.23622214794158936,
"memory(GiB)": 34.77,
"step": 155,
"token_acc": 0.923393272448806,
"train_speed(iter/s)": 0.139619
},
{
"epoch": 1.0323232323232323,
"grad_norm": 0.6679208874702454,
"learning_rate": 7.321299739792553e-06,
"loss": 0.17297937870025634,
"memory(GiB)": 34.77,
"step": 160,
"token_acc": 0.9432674199623352,
"train_speed(iter/s)": 0.140142
},
{
"epoch": 1.0323232323232323,
"eval_loss": 0.23158761858940125,
"eval_runtime": 4.8528,
"eval_samples_per_second": 20.607,
"eval_steps_per_second": 5.152,
"eval_token_acc": 0.9269256354949906,
"step": 160
},
{
"epoch": 1.0646464646464646,
"grad_norm": 0.8160377144813538,
"learning_rate": 7.169418695587791e-06,
"loss": 0.16782424449920655,
"memory(GiB)": 34.77,
"step": 165,
"token_acc": 0.9344088433847973,
"train_speed(iter/s)": 0.138724
},
{
"epoch": 1.096969696969697,
"grad_norm": 0.8471182584762573,
"learning_rate": 7.015030056677559e-06,
"loss": 0.16909420490264893,
"memory(GiB)": 34.77,
"step": 170,
"token_acc": 0.9429876289177185,
"train_speed(iter/s)": 0.139504
},
{
"epoch": 1.1292929292929292,
"grad_norm": 0.7286836504936218,
"learning_rate": 6.858312278301638e-06,
"loss": 0.1667182445526123,
"memory(GiB)": 34.77,
"step": 175,
"token_acc": 0.9418443002780352,
"train_speed(iter/s)": 0.139935
},
{
"epoch": 1.1616161616161615,
"grad_norm": 0.7752698063850403,
"learning_rate": 6.699446507913083e-06,
"loss": 0.15690959692001344,
"memory(GiB)": 34.77,
"step": 180,
"token_acc": 0.9501004865665327,
"train_speed(iter/s)": 0.14024
},
{
"epoch": 1.1616161616161615,
"eval_loss": 0.23583181202411652,
"eval_runtime": 4.8506,
"eval_samples_per_second": 20.616,
"eval_steps_per_second": 5.154,
"eval_token_acc": 0.9267442094865644,
"step": 180
},
{
"epoch": 1.1939393939393939,
"grad_norm": 0.7071201205253601,
"learning_rate": 6.53861637579291e-06,
"loss": 0.15962274074554444,
"memory(GiB)": 34.77,
"step": 185,
"token_acc": 0.9369357151160538,
"train_speed(iter/s)": 0.138875
},
{
"epoch": 1.2262626262626264,
"grad_norm": 0.7520214319229126,
"learning_rate": 6.376007782794926e-06,
"loss": 0.15966968536376952,
"memory(GiB)": 34.77,
"step": 190,
"token_acc": 0.9476890003582945,
"train_speed(iter/s)": 0.139235
},
{
"epoch": 1.2585858585858585,
"grad_norm": 0.7770646214485168,
"learning_rate": 6.211808685466063e-06,
"loss": 0.17346657514572145,
"memory(GiB)": 34.77,
"step": 195,
"token_acc": 0.937822677420255,
"train_speed(iter/s)": 0.139941
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.7723908424377441,
"learning_rate": 6.046208878790543e-06,
"loss": 0.1594362735748291,
"memory(GiB)": 34.77,
"step": 200,
"token_acc": 0.9459411057384808,
"train_speed(iter/s)": 0.140377
},
{
"epoch": 1.290909090909091,
"eval_loss": 0.23655511438846588,
"eval_runtime": 4.8375,
"eval_samples_per_second": 20.672,
"eval_steps_per_second": 5.168,
"eval_token_acc": 0.9270062692765134,
"step": 200
},
{
"epoch": 1.3232323232323233,
"grad_norm": 0.7364096641540527,
"learning_rate": 5.879399776809047e-06,
"loss": 0.16425321102142335,
"memory(GiB)": 34.77,
"step": 205,
"token_acc": 0.9377912867274569,
"train_speed(iter/s)": 0.139168
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.772078275680542,
"learning_rate": 5.711574191366427e-06,
"loss": 0.16698684692382812,
"memory(GiB)": 34.77,
"step": 210,
"token_acc": 0.9447016139121731,
"train_speed(iter/s)": 0.139538
},
{
"epoch": 1.387878787878788,
"grad_norm": 0.6973662376403809,
"learning_rate": 5.542926109243727e-06,
"loss": 0.15273804664611818,
"memory(GiB)": 34.77,
"step": 215,
"token_acc": 0.9457073269738178,
"train_speed(iter/s)": 0.140027
},
{
"epoch": 1.4202020202020202,
"grad_norm": 0.7803521156311035,
"learning_rate": 5.373650467932122e-06,
"loss": 0.17012779712677,
"memory(GiB)": 34.77,
"step": 220,
"token_acc": 0.9377423694832089,
"train_speed(iter/s)": 0.14035
},
{
"epoch": 1.4202020202020202,
"eval_loss": 0.2345450520515442,
"eval_runtime": 4.8264,
"eval_samples_per_second": 20.719,
"eval_steps_per_second": 5.18,
"eval_token_acc": 0.9277722902009797,
"step": 220
},
{
"epoch": 1.4525252525252526,
"grad_norm": 0.694709300994873,
"learning_rate": 5.2039429303079294e-06,
"loss": 0.16966450214385986,
"memory(GiB)": 34.77,
"step": 225,
"token_acc": 0.9322694965253717,
"train_speed(iter/s)": 0.139252
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.7459155917167664,
"learning_rate": 5.033999658469174e-06,
"loss": 0.17206931114196777,
"memory(GiB)": 34.77,
"step": 230,
"token_acc": 0.9417377303357386,
"train_speed(iter/s)": 0.13963
},
{
"epoch": 1.5171717171717172,
"grad_norm": 0.7466315627098083,
"learning_rate": 4.864017086995112e-06,
"loss": 0.15746488571166992,
"memory(GiB)": 34.77,
"step": 235,
"token_acc": 0.9459481252519822,
"train_speed(iter/s)": 0.139941
},
{
"epoch": 1.5494949494949495,
"grad_norm": 0.7919719815254211,
"learning_rate": 4.694191695890788e-06,
"loss": 0.1569303512573242,
"memory(GiB)": 34.77,
"step": 240,
"token_acc": 0.941917082024835,
"train_speed(iter/s)": 0.140297
},
{
"epoch": 1.5494949494949495,
"eval_loss": 0.23328742384910583,
"eval_runtime": 4.8307,
"eval_samples_per_second": 20.701,
"eval_steps_per_second": 5.175,
"eval_token_acc": 0.9284576773439233,
"step": 240
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.8104657530784607,
"learning_rate": 4.524719783479088e-06,
"loss": 0.15921467542648315,
"memory(GiB)": 34.77,
"step": 245,
"token_acc": 0.9333036905291241,
"train_speed(iter/s)": 0.139163
},
{
"epoch": 1.614141414141414,
"grad_norm": 0.8245537877082825,
"learning_rate": 4.355797239502807e-06,
"loss": 0.16331541538238525,
"memory(GiB)": 34.77,
"step": 250,
"token_acc": 0.9451396561913816,
"train_speed(iter/s)": 0.139254
},
{
"epoch": 1.6464646464646466,
"grad_norm": 0.7749842405319214,
"learning_rate": 4.187619318698971e-06,
"loss": 0.15697014331817627,
"memory(GiB)": 34.77,
"step": 255,
"token_acc": 0.9451475779917865,
"train_speed(iter/s)": 0.139673
},
{
"epoch": 1.6787878787878787,
"grad_norm": 0.8173830509185791,
"learning_rate": 4.020380415107167e-06,
"loss": 0.16766272783279418,
"memory(GiB)": 34.77,
"step": 260,
"token_acc": 0.9469431879605132,
"train_speed(iter/s)": 0.139915
},
{
"epoch": 1.6787878787878787,
"eval_loss": 0.2292548418045044,
"eval_runtime": 4.8388,
"eval_samples_per_second": 20.666,
"eval_steps_per_second": 5.167,
"eval_token_acc": 0.9289213215876791,
"step": 260
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.7277234196662903,
"learning_rate": 3.854273837372724e-06,
"loss": 0.16253018379211426,
"memory(GiB)": 34.77,
"step": 265,
"token_acc": 0.9426872469635628,
"train_speed(iter/s)": 0.138974
},
{
"epoch": 1.7434343434343433,
"grad_norm": 0.8412746787071228,
"learning_rate": 3.689491585304491e-06,
"loss": 0.16776057481765747,
"memory(GiB)": 34.77,
"step": 270,
"token_acc": 0.9380598276153456,
"train_speed(iter/s)": 0.139245
},
{
"epoch": 1.7757575757575759,
"grad_norm": 0.7006183862686157,
"learning_rate": 3.526224127945479e-06,
"loss": 0.15875219106674193,
"memory(GiB)": 34.77,
"step": 275,
"token_acc": 0.9433733748578773,
"train_speed(iter/s)": 0.13972
},
{
"epoch": 1.808080808080808,
"grad_norm": 0.7234155535697937,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.159059476852417,
"memory(GiB)": 34.77,
"step": 280,
"token_acc": 0.9385077213505401,
"train_speed(iter/s)": 0.139973
},
{
"epoch": 1.808080808080808,
"eval_loss": 0.22943958640098572,
"eval_runtime": 4.8423,
"eval_samples_per_second": 20.652,
"eval_steps_per_second": 5.163,
"eval_token_acc": 0.9282157759993549,
"step": 280
},
{
"epoch": 1.8404040404040405,
"grad_norm": 0.7152555584907532,
"learning_rate": 3.204986500762006e-06,
"loss": 0.1508580207824707,
"memory(GiB)": 34.77,
"step": 285,
"token_acc": 0.9341365308729054,
"train_speed(iter/s)": 0.138995
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.7166336178779602,
"learning_rate": 3.0473876441260786e-06,
"loss": 0.16788345575332642,
"memory(GiB)": 34.77,
"step": 290,
"token_acc": 0.9407490363579539,
"train_speed(iter/s)": 0.139247
},
{
"epoch": 1.905050505050505,
"grad_norm": 0.7887818813323975,
"learning_rate": 2.8920457793817507e-06,
"loss": 0.15700163841247558,
"memory(GiB)": 34.77,
"step": 295,
"token_acc": 0.9426423803879983,
"train_speed(iter/s)": 0.139471
},
{
"epoch": 1.9373737373737374,
"grad_norm": 0.8042952418327332,
"learning_rate": 2.7391404635865725e-06,
"loss": 0.16809990406036376,
"memory(GiB)": 34.77,
"step": 300,
"token_acc": 0.944771353933029,
"train_speed(iter/s)": 0.139742
},
{
"epoch": 1.9373737373737374,
"eval_loss": 0.2291778326034546,
"eval_runtime": 4.8337,
"eval_samples_per_second": 20.688,
"eval_steps_per_second": 5.172,
"eval_token_acc": 0.9292035398230089,
"step": 300
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.6989196538925171,
"learning_rate": 2.5888484374320033e-06,
"loss": 0.1462658405303955,
"memory(GiB)": 34.77,
"step": 305,
"token_acc": 0.9413777899090852,
"train_speed(iter/s)": 0.138859
},
{
"epoch": 2.0,
"grad_norm": 0.9214490056037903,
"learning_rate": 2.4413434209518137e-06,
"loss": 0.17060282230377197,
"memory(GiB)": 34.77,
"step": 310,
"token_acc": 0.9452530120481928,
"train_speed(iter/s)": 0.139266
},
{
"epoch": 2.0323232323232325,
"grad_norm": 0.6455035209655762,
"learning_rate": 2.296795912722014e-06,
"loss": 0.12579550743103027,
"memory(GiB)": 34.77,
"step": 315,
"token_acc": 0.9598925994294345,
"train_speed(iter/s)": 0.139482
},
{
"epoch": 2.0646464646464646,
"grad_norm": 0.6761350631713867,
"learning_rate": 2.1553729927843894e-06,
"loss": 0.10464283227920532,
"memory(GiB)": 34.77,
"step": 320,
"token_acc": 0.9628479377702958,
"train_speed(iter/s)": 0.139728
},
{
"epoch": 2.0646464646464646,
"eval_loss": 0.23895612359046936,
"eval_runtime": 4.8116,
"eval_samples_per_second": 20.783,
"eval_steps_per_second": 5.196,
"eval_token_acc": 0.9286391033523494,
"step": 320
},
{
"epoch": 2.096969696969697,
"grad_norm": 0.7690797448158264,
"learning_rate": 2.017238129521506e-06,
"loss": 0.10620735883712769,
"memory(GiB)": 34.77,
"step": 325,
"token_acc": 0.951885791453651,
"train_speed(iter/s)": 0.138943
},
{
"epoch": 2.1292929292929292,
"grad_norm": 0.8825842142105103,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.11970834732055664,
"memory(GiB)": 34.77,
"step": 330,
"token_acc": 0.9611402417348027,
"train_speed(iter/s)": 0.139288
},
{
"epoch": 2.1616161616161618,
"grad_norm": 0.7779159545898438,
"learning_rate": 1.7514672589449378e-06,
"loss": 0.1070137619972229,
"memory(GiB)": 34.77,
"step": 335,
"token_acc": 0.9595128097438052,
"train_speed(iter/s)": 0.139418
},
{
"epoch": 2.193939393939394,
"grad_norm": 0.7191023826599121,
"learning_rate": 1.6241384517255854e-06,
"loss": 0.11621193885803223,
"memory(GiB)": 34.77,
"step": 340,
"token_acc": 0.9595157410042165,
"train_speed(iter/s)": 0.139652
},
{
"epoch": 2.193939393939394,
"eval_loss": 0.2503757178783417,
"eval_runtime": 4.8011,
"eval_samples_per_second": 20.829,
"eval_steps_per_second": 5.207,
"eval_token_acc": 0.928538311125446,
"step": 340
},
{
"epoch": 2.2262626262626264,
"grad_norm": 0.6950616240501404,
"learning_rate": 1.500711746282192e-06,
"loss": 0.11175984144210815,
"memory(GiB)": 34.77,
"step": 345,
"token_acc": 0.9465600862223819,
"train_speed(iter/s)": 0.138949
},
{
"epoch": 2.2585858585858585,
"grad_norm": 0.7209317088127136,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.11107317209243775,
"memory(GiB)": 34.77,
"step": 350,
"token_acc": 0.9588838612368024,
"train_speed(iter/s)": 0.139108
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.7023849487304688,
"learning_rate": 1.2661306328825818e-06,
"loss": 0.1061722993850708,
"memory(GiB)": 34.77,
"step": 355,
"token_acc": 0.9638245595692404,
"train_speed(iter/s)": 0.139405
},
{
"epoch": 2.323232323232323,
"grad_norm": 0.6932234764099121,
"learning_rate": 1.1552473733031893e-06,
"loss": 0.11119704246520996,
"memory(GiB)": 34.77,
"step": 360,
"token_acc": 0.9628568099732029,
"train_speed(iter/s)": 0.139754
},
{
"epoch": 2.323232323232323,
"eval_loss": 0.2477826327085495,
"eval_runtime": 4.8417,
"eval_samples_per_second": 20.654,
"eval_steps_per_second": 5.163,
"eval_token_acc": 0.9284173604531618,
"step": 360
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.7261531352996826,
"learning_rate": 1.0488081988375493e-06,
"loss": 0.11782848834991455,
"memory(GiB)": 34.77,
"step": 365,
"token_acc": 0.9477818154288743,
"train_speed(iter/s)": 0.139242
},
{
"epoch": 2.3878787878787877,
"grad_norm": 0.7011246681213379,
"learning_rate": 9.469361407432431e-07,
"loss": 0.10731152296066285,
"memory(GiB)": 34.77,
"step": 370,
"token_acc": 0.9617250245182495,
"train_speed(iter/s)": 0.139376
},
{
"epoch": 2.4202020202020202,
"grad_norm": 0.7193347811698914,
"learning_rate": 8.497489512245971e-07,
"loss": 0.11734654903411865,
"memory(GiB)": 34.77,
"step": 375,
"token_acc": 0.9643068481359944,
"train_speed(iter/s)": 0.139563
},
{
"epoch": 2.4525252525252528,
"grad_norm": 0.8027297258377075,
"learning_rate": 7.573589673248833e-07,
"loss": 0.1112905502319336,
"memory(GiB)": 34.77,
"step": 380,
"token_acc": 0.964820230517805,
"train_speed(iter/s)": 0.139778
},
{
"epoch": 2.4525252525252528,
"eval_loss": 0.24963097274303436,
"eval_runtime": 4.8179,
"eval_samples_per_second": 20.756,
"eval_steps_per_second": 5.189,
"eval_token_acc": 0.9288810046969178,
"step": 380
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.7203919291496277,
"learning_rate": 6.698729810778065e-07,
"loss": 0.10394268035888672,
"memory(GiB)": 34.77,
"step": 385,
"token_acc": 0.9475781400629834,
"train_speed(iter/s)": 0.139104
},
{
"epoch": 2.517171717171717,
"grad_norm": 0.71819007396698,
"learning_rate": 5.873921160683943e-07,
"loss": 0.10946273803710938,
"memory(GiB)": 34.77,
"step": 390,
"token_acc": 0.9592940980604345,
"train_speed(iter/s)": 0.139407
},
{
"epoch": 2.5494949494949495,
"grad_norm": 0.7178409695625305,
"learning_rate": 5.100117105459279e-07,
"loss": 0.1130871295928955,
"memory(GiB)": 34.77,
"step": 395,
"token_acc": 0.9668103880477289,
"train_speed(iter/s)": 0.139538
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.67863929271698,
"learning_rate": 4.3782120722406565e-07,
"loss": 0.10182794332504272,
"memory(GiB)": 34.77,
"step": 400,
"token_acc": 0.9624655998369177,
"train_speed(iter/s)": 0.139808
},
{
"epoch": 2.581818181818182,
"eval_loss": 0.24975040555000305,
"eval_runtime": 4.8388,
"eval_samples_per_second": 20.666,
"eval_steps_per_second": 5.167,
"eval_token_acc": 0.9285786280162074,
"step": 400
},
{
"epoch": 2.614141414141414,
"grad_norm": 0.652603805065155,
"learning_rate": 3.709040498955102e-07,
"loss": 0.1031190037727356,
"memory(GiB)": 34.77,
"step": 405,
"token_acc": 0.9463408184402476,
"train_speed(iter/s)": 0.139139
},
{
"epoch": 2.6464646464646466,
"grad_norm": 1.0421696901321411,
"learning_rate": 3.0933758698072023e-07,
"loss": 0.121562659740448,
"memory(GiB)": 34.77,
"step": 410,
"token_acc": 0.9582254445019851,
"train_speed(iter/s)": 0.139341
},
{
"epoch": 2.6787878787878787,
"grad_norm": 0.7723399996757507,
"learning_rate": 2.531929821221768e-07,
"loss": 0.11757031679153443,
"memory(GiB)": 34.77,
"step": 415,
"token_acc": 0.9626831890454366,
"train_speed(iter/s)": 0.139573
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.6900373101234436,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.11266238689422607,
"memory(GiB)": 34.77,
"step": 420,
"token_acc": 0.9586481947942905,
"train_speed(iter/s)": 0.139767
},
{
"epoch": 2.7111111111111112,
"eval_loss": 0.2500038743019104,
"eval_runtime": 4.8315,
"eval_samples_per_second": 20.697,
"eval_steps_per_second": 5.174,
"eval_token_acc": 0.9289616384784405,
"step": 420
},
{
"epoch": 2.7434343434343433,
"grad_norm": 0.7542772889137268,
"learning_rate": 1.5742259095662126e-07,
"loss": 0.11297458410263062,
"memory(GiB)": 34.77,
"step": 425,
"token_acc": 0.9452227294191721,
"train_speed(iter/s)": 0.139122
},
{
"epoch": 2.775757575757576,
"grad_norm": 0.7564345598220825,
"learning_rate": 1.1790750403941231e-07,
"loss": 0.1145315408706665,
"memory(GiB)": 34.77,
"step": 430,
"token_acc": 0.9606825351304846,
"train_speed(iter/s)": 0.139315
},
{
"epoch": 2.808080808080808,
"grad_norm": 0.7472628355026245,
"learning_rate": 8.403554600248498e-08,
"loss": 0.10037808418273926,
"memory(GiB)": 34.77,
"step": 435,
"token_acc": 0.968122471719594,
"train_speed(iter/s)": 0.139508
},
{
"epoch": 2.8404040404040405,
"grad_norm": 0.7140054702758789,
"learning_rate": 5.584586887435739e-08,
"loss": 0.1066713809967041,
"memory(GiB)": 34.77,
"step": 440,
"token_acc": 0.9640665162880974,
"train_speed(iter/s)": 0.139714
},
{
"epoch": 2.8404040404040405,
"eval_loss": 0.2495991587638855,
"eval_runtime": 4.8171,
"eval_samples_per_second": 20.76,
"eval_steps_per_second": 5.19,
"eval_token_acc": 0.9289414800330599,
"step": 440
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.6888604164123535,
"learning_rate": 3.337105663029361e-08,
"loss": 0.11188592910766601,
"memory(GiB)": 34.77,
"step": 445,
"token_acc": 0.9492009251349155,
"train_speed(iter/s)": 0.13915
},
{
"epoch": 2.905050505050505,
"grad_norm": 0.7185996770858765,
"learning_rate": 1.6637087529033925e-08,
"loss": 0.11582531929016113,
"memory(GiB)": 34.77,
"step": 450,
"token_acc": 0.9591944327288064,
"train_speed(iter/s)": 0.139392
},
{
"epoch": 2.937373737373737,
"grad_norm": 0.6511121988296509,
"learning_rate": 5.6633040849601865e-09,
"loss": 0.10450353622436523,
"memory(GiB)": 34.77,
"step": 455,
"token_acc": 0.9614834408486059,
"train_speed(iter/s)": 0.139543
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.6926988363265991,
"learning_rate": 4.623907104084335e-10,
"loss": 0.11017694473266601,
"memory(GiB)": 34.77,
"step": 460,
"token_acc": 0.9601264597715982,
"train_speed(iter/s)": 0.139788
},
{
"epoch": 2.9696969696969697,
"eval_loss": 0.24949033558368683,
"eval_runtime": 4.8501,
"eval_samples_per_second": 20.618,
"eval_steps_per_second": 5.155,
"eval_token_acc": 0.9285786280162074,
"step": 460
},
{
"epoch": 2.9826262626262627,
"eval_loss": 0.2496582567691803,
"eval_runtime": 4.861,
"eval_samples_per_second": 20.572,
"eval_steps_per_second": 5.143,
"eval_token_acc": 0.929062430705344,
"step": 462
}
],
"logging_steps": 5,
"max_steps": 462,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.5801089900517786e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}