1181 lines
33 KiB
JSON
1181 lines
33 KiB
JSON
{
|
|
"best_global_step": 300,
|
|
"best_metric": 0.22917783,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v27-20250507-113338/checkpoint-300",
|
|
"epoch": 2.9826262626262627,
|
|
"eval_steps": 20,
|
|
"global_step": 462,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006464646464646465,
|
|
"grad_norm": 2.5242867469787598,
|
|
"learning_rate": 9.999884400986087e-06,
|
|
"loss": 0.39474862813949585,
|
|
"memory(GiB)": 28.84,
|
|
"step": 1,
|
|
"token_acc": 0.8908829863603733,
|
|
"train_speed(iter/s)": 0.064293
|
|
},
|
|
{
|
|
"epoch": 0.03232323232323232,
|
|
"grad_norm": 1.5711474418640137,
|
|
"learning_rate": 9.997110291906109e-06,
|
|
"loss": 0.3434034585952759,
|
|
"memory(GiB)": 30.62,
|
|
"step": 5,
|
|
"token_acc": 0.8817614172656647,
|
|
"train_speed(iter/s)": 0.119566
|
|
},
|
|
{
|
|
"epoch": 0.06464646464646465,
|
|
"grad_norm": 0.9338254928588867,
|
|
"learning_rate": 9.988444507789584e-06,
|
|
"loss": 0.2892385244369507,
|
|
"memory(GiB)": 30.62,
|
|
"step": 10,
|
|
"token_acc": 0.9103395025620628,
|
|
"train_speed(iter/s)": 0.1364
|
|
},
|
|
{
|
|
"epoch": 0.09696969696969697,
|
|
"grad_norm": 0.9935480952262878,
|
|
"learning_rate": 9.97401266428502e-06,
|
|
"loss": 0.30152087211608886,
|
|
"memory(GiB)": 32.44,
|
|
"step": 15,
|
|
"token_acc": 0.9016964442328413,
|
|
"train_speed(iter/s)": 0.141481
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"grad_norm": 0.8869792819023132,
|
|
"learning_rate": 9.953831442918418e-06,
|
|
"loss": 0.2820048570632935,
|
|
"memory(GiB)": 32.44,
|
|
"step": 20,
|
|
"token_acc": 0.9094200925673837,
|
|
"train_speed(iter/s)": 0.144923
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"eval_loss": 0.2714148759841919,
|
|
"eval_runtime": 4.8691,
|
|
"eval_samples_per_second": 20.538,
|
|
"eval_steps_per_second": 5.134,
|
|
"eval_token_acc": 0.9161811841070817,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 0.9623355865478516,
|
|
"learning_rate": 9.927924170825266e-06,
|
|
"loss": 0.28305883407592775,
|
|
"memory(GiB)": 32.44,
|
|
"step": 25,
|
|
"token_acc": 0.8970619818736308,
|
|
"train_speed(iter/s)": 0.133319
|
|
},
|
|
{
|
|
"epoch": 0.19393939393939394,
|
|
"grad_norm": 0.806711733341217,
|
|
"learning_rate": 9.896320793787106e-06,
|
|
"loss": 0.2747792720794678,
|
|
"memory(GiB)": 32.44,
|
|
"step": 30,
|
|
"token_acc": 0.9068442528293171,
|
|
"train_speed(iter/s)": 0.138408
|
|
},
|
|
{
|
|
"epoch": 0.22626262626262628,
|
|
"grad_norm": 0.8571366667747498,
|
|
"learning_rate": 9.859057841617709e-06,
|
|
"loss": 0.2719248294830322,
|
|
"memory(GiB)": 32.44,
|
|
"step": 35,
|
|
"token_acc": 0.9210450095580143,
|
|
"train_speed(iter/s)": 0.14047
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"grad_norm": 0.8261837363243103,
|
|
"learning_rate": 9.816178385938867e-06,
|
|
"loss": 0.2738617420196533,
|
|
"memory(GiB)": 32.45,
|
|
"step": 40,
|
|
"token_acc": 0.9066808952792833,
|
|
"train_speed(iter/s)": 0.142017
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"eval_loss": 0.2558521330356598,
|
|
"eval_runtime": 4.8838,
|
|
"eval_samples_per_second": 20.476,
|
|
"eval_steps_per_second": 5.119,
|
|
"eval_token_acc": 0.9202935069647429,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 0.7319702506065369,
|
|
"learning_rate": 9.767731990394638e-06,
|
|
"loss": 0.2567479133605957,
|
|
"memory(GiB)": 32.45,
|
|
"step": 45,
|
|
"token_acc": 0.9167146310579026,
|
|
"train_speed(iter/s)": 0.136503
|
|
},
|
|
{
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 0.8507645130157471,
|
|
"learning_rate": 9.71377465336155e-06,
|
|
"loss": 0.261569881439209,
|
|
"memory(GiB)": 32.45,
|
|
"step": 50,
|
|
"token_acc": 0.9125896733273807,
|
|
"train_speed(iter/s)": 0.138728
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.8376278877258301,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.24444923400878907,
|
|
"memory(GiB)": 32.45,
|
|
"step": 55,
|
|
"token_acc": 0.9274953450318795,
|
|
"train_speed(iter/s)": 0.139921
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"grad_norm": 0.8305687308311462,
|
|
"learning_rate": 9.589582926268798e-06,
|
|
"loss": 0.26804823875427247,
|
|
"memory(GiB)": 34.77,
|
|
"step": 60,
|
|
"token_acc": 0.9223425512494758,
|
|
"train_speed(iter/s)": 0.14136
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"eval_loss": 0.2471594363451004,
|
|
"eval_runtime": 4.836,
|
|
"eval_samples_per_second": 20.678,
|
|
"eval_steps_per_second": 5.17,
|
|
"eval_token_acc": 0.9224706190658576,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4202020202020202,
|
|
"grad_norm": 0.6730025410652161,
|
|
"learning_rate": 9.519492087344724e-06,
|
|
"loss": 0.24588844776153565,
|
|
"memory(GiB)": 34.77,
|
|
"step": 65,
|
|
"token_acc": 0.9117032737506321,
|
|
"train_speed(iter/s)": 0.136683
|
|
},
|
|
{
|
|
"epoch": 0.45252525252525255,
|
|
"grad_norm": 0.7964938282966614,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.2592522859573364,
|
|
"memory(GiB)": 34.77,
|
|
"step": 70,
|
|
"token_acc": 0.9183477688849907,
|
|
"train_speed(iter/s)": 0.138829
|
|
},
|
|
{
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 0.6614187359809875,
|
|
"learning_rate": 9.363725449224281e-06,
|
|
"loss": 0.2513019561767578,
|
|
"memory(GiB)": 34.77,
|
|
"step": 75,
|
|
"token_acc": 0.9213027816690014,
|
|
"train_speed(iter/s)": 0.140268
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"grad_norm": 0.7636239528656006,
|
|
"learning_rate": 9.278229698073889e-06,
|
|
"loss": 0.2455005168914795,
|
|
"memory(GiB)": 34.77,
|
|
"step": 80,
|
|
"token_acc": 0.9128787878787878,
|
|
"train_speed(iter/s)": 0.140852
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"eval_loss": 0.24173545837402344,
|
|
"eval_runtime": 4.8448,
|
|
"eval_samples_per_second": 20.641,
|
|
"eval_steps_per_second": 5.16,
|
|
"eval_token_acc": 0.9233374322172274,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5494949494949495,
|
|
"grad_norm": 0.8166657090187073,
|
|
"learning_rate": 9.187788812929074e-06,
|
|
"loss": 0.2590561628341675,
|
|
"memory(GiB)": 34.77,
|
|
"step": 85,
|
|
"token_acc": 0.9130182349905385,
|
|
"train_speed(iter/s)": 0.137729
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 0.8091973066329956,
|
|
"learning_rate": 9.092507332892968e-06,
|
|
"loss": 0.2490919589996338,
|
|
"memory(GiB)": 34.77,
|
|
"step": 90,
|
|
"token_acc": 0.910330508950115,
|
|
"train_speed(iter/s)": 0.13867
|
|
},
|
|
{
|
|
"epoch": 0.6141414141414141,
|
|
"grad_norm": 0.8563810586929321,
|
|
"learning_rate": 8.992495392231195e-06,
|
|
"loss": 0.2534335613250732,
|
|
"memory(GiB)": 34.77,
|
|
"step": 95,
|
|
"token_acc": 0.9122233688797157,
|
|
"train_speed(iter/s)": 0.139651
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 0.7787972092628479,
|
|
"learning_rate": 8.88786859306952e-06,
|
|
"loss": 0.24485716819763184,
|
|
"memory(GiB)": 34.77,
|
|
"step": 100,
|
|
"token_acc": 0.9189008559751176,
|
|
"train_speed(iter/s)": 0.140544
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"eval_loss": 0.23870104551315308,
|
|
"eval_runtime": 4.8543,
|
|
"eval_samples_per_second": 20.6,
|
|
"eval_steps_per_second": 5.15,
|
|
"eval_token_acc": 0.9255750196544843,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6787878787878788,
|
|
"grad_norm": 0.7570570707321167,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 0.25060880184173584,
|
|
"memory(GiB)": 34.77,
|
|
"step": 105,
|
|
"token_acc": 0.9076021696824148,
|
|
"train_speed(iter/s)": 0.138513
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.7018725872039795,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.2399357795715332,
|
|
"memory(GiB)": 34.77,
|
|
"step": 110,
|
|
"token_acc": 0.927743086529884,
|
|
"train_speed(iter/s)": 0.139328
|
|
},
|
|
{
|
|
"epoch": 0.7434343434343434,
|
|
"grad_norm": 0.7108844518661499,
|
|
"learning_rate": 8.547534234672435e-06,
|
|
"loss": 0.23419642448425293,
|
|
"memory(GiB)": 34.77,
|
|
"step": 115,
|
|
"token_acc": 0.9277124928693667,
|
|
"train_speed(iter/s)": 0.140095
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"grad_norm": 0.8351752161979675,
|
|
"learning_rate": 8.425708574839221e-06,
|
|
"loss": 0.24019112586975097,
|
|
"memory(GiB)": 34.77,
|
|
"step": 120,
|
|
"token_acc": 0.9238991888760139,
|
|
"train_speed(iter/s)": 0.140762
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"eval_loss": 0.23568643629550934,
|
|
"eval_runtime": 4.8562,
|
|
"eval_samples_per_second": 20.592,
|
|
"eval_steps_per_second": 5.148,
|
|
"eval_token_acc": 0.9252323260830124,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 0.7820432186126709,
|
|
"learning_rate": 8.299923195887599e-06,
|
|
"loss": 0.23381190299987792,
|
|
"memory(GiB)": 34.77,
|
|
"step": 125,
|
|
"token_acc": 0.9166274910577414,
|
|
"train_speed(iter/s)": 0.138826
|
|
},
|
|
{
|
|
"epoch": 0.8404040404040404,
|
|
"grad_norm": 0.8153200149536133,
|
|
"learning_rate": 8.170323491028625e-06,
|
|
"loss": 0.25104479789733886,
|
|
"memory(GiB)": 34.77,
|
|
"step": 130,
|
|
"token_acc": 0.9234165067178502,
|
|
"train_speed(iter/s)": 0.139183
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 0.8061110973358154,
|
|
"learning_rate": 8.03705926238874e-06,
|
|
"loss": 0.24108409881591797,
|
|
"memory(GiB)": 34.77,
|
|
"step": 135,
|
|
"token_acc": 0.9225523279137268,
|
|
"train_speed(iter/s)": 0.139787
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"grad_norm": 0.7487571835517883,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.23796701431274414,
|
|
"memory(GiB)": 34.77,
|
|
"step": 140,
|
|
"token_acc": 0.9192467460537247,
|
|
"train_speed(iter/s)": 0.140229
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"eval_loss": 0.2325511872768402,
|
|
"eval_runtime": 4.8347,
|
|
"eval_samples_per_second": 20.684,
|
|
"eval_steps_per_second": 5.171,
|
|
"eval_token_acc": 0.9265224665873768,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9373737373737374,
|
|
"grad_norm": 0.7421184182167053,
|
|
"learning_rate": 7.760157443030234e-06,
|
|
"loss": 0.22932517528533936,
|
|
"memory(GiB)": 34.77,
|
|
"step": 145,
|
|
"token_acc": 0.9112203397203071,
|
|
"train_speed(iter/s)": 0.13847
|
|
},
|
|
{
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 0.7295236587524414,
|
|
"learning_rate": 7.616839918483061e-06,
|
|
"loss": 0.233046817779541,
|
|
"memory(GiB)": 34.77,
|
|
"step": 150,
|
|
"token_acc": 0.9231597652253514,
|
|
"train_speed(iter/s)": 0.138991
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.7331147789955139,
|
|
"learning_rate": 7.470497632538743e-06,
|
|
"loss": 0.23622214794158936,
|
|
"memory(GiB)": 34.77,
|
|
"step": 155,
|
|
"token_acc": 0.923393272448806,
|
|
"train_speed(iter/s)": 0.139619
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"grad_norm": 0.6679208874702454,
|
|
"learning_rate": 7.321299739792553e-06,
|
|
"loss": 0.17297937870025634,
|
|
"memory(GiB)": 34.77,
|
|
"step": 160,
|
|
"token_acc": 0.9432674199623352,
|
|
"train_speed(iter/s)": 0.140142
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"eval_loss": 0.23158761858940125,
|
|
"eval_runtime": 4.8528,
|
|
"eval_samples_per_second": 20.607,
|
|
"eval_steps_per_second": 5.152,
|
|
"eval_token_acc": 0.9269256354949906,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0646464646464646,
|
|
"grad_norm": 0.8160377144813538,
|
|
"learning_rate": 7.169418695587791e-06,
|
|
"loss": 0.16782424449920655,
|
|
"memory(GiB)": 34.77,
|
|
"step": 165,
|
|
"token_acc": 0.9344088433847973,
|
|
"train_speed(iter/s)": 0.138724
|
|
},
|
|
{
|
|
"epoch": 1.096969696969697,
|
|
"grad_norm": 0.8471182584762573,
|
|
"learning_rate": 7.015030056677559e-06,
|
|
"loss": 0.16909420490264893,
|
|
"memory(GiB)": 34.77,
|
|
"step": 170,
|
|
"token_acc": 0.9429876289177185,
|
|
"train_speed(iter/s)": 0.139504
|
|
},
|
|
{
|
|
"epoch": 1.1292929292929292,
|
|
"grad_norm": 0.7286836504936218,
|
|
"learning_rate": 6.858312278301638e-06,
|
|
"loss": 0.1667182445526123,
|
|
"memory(GiB)": 34.77,
|
|
"step": 175,
|
|
"token_acc": 0.9418443002780352,
|
|
"train_speed(iter/s)": 0.139935
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 0.7752698063850403,
|
|
"learning_rate": 6.699446507913083e-06,
|
|
"loss": 0.15690959692001344,
|
|
"memory(GiB)": 34.77,
|
|
"step": 180,
|
|
"token_acc": 0.9501004865665327,
|
|
"train_speed(iter/s)": 0.14024
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"eval_loss": 0.23583181202411652,
|
|
"eval_runtime": 4.8506,
|
|
"eval_samples_per_second": 20.616,
|
|
"eval_steps_per_second": 5.154,
|
|
"eval_token_acc": 0.9267442094865644,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.1939393939393939,
|
|
"grad_norm": 0.7071201205253601,
|
|
"learning_rate": 6.53861637579291e-06,
|
|
"loss": 0.15962274074554444,
|
|
"memory(GiB)": 34.77,
|
|
"step": 185,
|
|
"token_acc": 0.9369357151160538,
|
|
"train_speed(iter/s)": 0.138875
|
|
},
|
|
{
|
|
"epoch": 1.2262626262626264,
|
|
"grad_norm": 0.7520214319229126,
|
|
"learning_rate": 6.376007782794926e-06,
|
|
"loss": 0.15966968536376952,
|
|
"memory(GiB)": 34.77,
|
|
"step": 190,
|
|
"token_acc": 0.9476890003582945,
|
|
"train_speed(iter/s)": 0.139235
|
|
},
|
|
{
|
|
"epoch": 1.2585858585858585,
|
|
"grad_norm": 0.7770646214485168,
|
|
"learning_rate": 6.211808685466063e-06,
|
|
"loss": 0.17346657514572145,
|
|
"memory(GiB)": 34.77,
|
|
"step": 195,
|
|
"token_acc": 0.937822677420255,
|
|
"train_speed(iter/s)": 0.139941
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"grad_norm": 0.7723908424377441,
|
|
"learning_rate": 6.046208878790543e-06,
|
|
"loss": 0.1594362735748291,
|
|
"memory(GiB)": 34.77,
|
|
"step": 200,
|
|
"token_acc": 0.9459411057384808,
|
|
"train_speed(iter/s)": 0.140377
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"eval_loss": 0.23655511438846588,
|
|
"eval_runtime": 4.8375,
|
|
"eval_samples_per_second": 20.672,
|
|
"eval_steps_per_second": 5.168,
|
|
"eval_token_acc": 0.9270062692765134,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 0.7364096641540527,
|
|
"learning_rate": 5.879399776809047e-06,
|
|
"loss": 0.16425321102142335,
|
|
"memory(GiB)": 34.77,
|
|
"step": 205,
|
|
"token_acc": 0.9377912867274569,
|
|
"train_speed(iter/s)": 0.139168
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.772078275680542,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.16698684692382812,
|
|
"memory(GiB)": 34.77,
|
|
"step": 210,
|
|
"token_acc": 0.9447016139121731,
|
|
"train_speed(iter/s)": 0.139538
|
|
},
|
|
{
|
|
"epoch": 1.387878787878788,
|
|
"grad_norm": 0.6973662376403809,
|
|
"learning_rate": 5.542926109243727e-06,
|
|
"loss": 0.15273804664611818,
|
|
"memory(GiB)": 34.77,
|
|
"step": 215,
|
|
"token_acc": 0.9457073269738178,
|
|
"train_speed(iter/s)": 0.140027
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"grad_norm": 0.7803521156311035,
|
|
"learning_rate": 5.373650467932122e-06,
|
|
"loss": 0.17012779712677,
|
|
"memory(GiB)": 34.77,
|
|
"step": 220,
|
|
"token_acc": 0.9377423694832089,
|
|
"train_speed(iter/s)": 0.14035
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"eval_loss": 0.2345450520515442,
|
|
"eval_runtime": 4.8264,
|
|
"eval_samples_per_second": 20.719,
|
|
"eval_steps_per_second": 5.18,
|
|
"eval_token_acc": 0.9277722902009797,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4525252525252526,
|
|
"grad_norm": 0.694709300994873,
|
|
"learning_rate": 5.2039429303079294e-06,
|
|
"loss": 0.16966450214385986,
|
|
"memory(GiB)": 34.77,
|
|
"step": 225,
|
|
"token_acc": 0.9322694965253717,
|
|
"train_speed(iter/s)": 0.139252
|
|
},
|
|
{
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 0.7459155917167664,
|
|
"learning_rate": 5.033999658469174e-06,
|
|
"loss": 0.17206931114196777,
|
|
"memory(GiB)": 34.77,
|
|
"step": 230,
|
|
"token_acc": 0.9417377303357386,
|
|
"train_speed(iter/s)": 0.13963
|
|
},
|
|
{
|
|
"epoch": 1.5171717171717172,
|
|
"grad_norm": 0.7466315627098083,
|
|
"learning_rate": 4.864017086995112e-06,
|
|
"loss": 0.15746488571166992,
|
|
"memory(GiB)": 34.77,
|
|
"step": 235,
|
|
"token_acc": 0.9459481252519822,
|
|
"train_speed(iter/s)": 0.139941
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"grad_norm": 0.7919719815254211,
|
|
"learning_rate": 4.694191695890788e-06,
|
|
"loss": 0.1569303512573242,
|
|
"memory(GiB)": 34.77,
|
|
"step": 240,
|
|
"token_acc": 0.941917082024835,
|
|
"train_speed(iter/s)": 0.140297
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"eval_loss": 0.23328742384910583,
|
|
"eval_runtime": 4.8307,
|
|
"eval_samples_per_second": 20.701,
|
|
"eval_steps_per_second": 5.175,
|
|
"eval_token_acc": 0.9284576773439233,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.5818181818181818,
|
|
"grad_norm": 0.8104657530784607,
|
|
"learning_rate": 4.524719783479088e-06,
|
|
"loss": 0.15921467542648315,
|
|
"memory(GiB)": 34.77,
|
|
"step": 245,
|
|
"token_acc": 0.9333036905291241,
|
|
"train_speed(iter/s)": 0.139163
|
|
},
|
|
{
|
|
"epoch": 1.614141414141414,
|
|
"grad_norm": 0.8245537877082825,
|
|
"learning_rate": 4.355797239502807e-06,
|
|
"loss": 0.16331541538238525,
|
|
"memory(GiB)": 34.77,
|
|
"step": 250,
|
|
"token_acc": 0.9451396561913816,
|
|
"train_speed(iter/s)": 0.139254
|
|
},
|
|
{
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 0.7749842405319214,
|
|
"learning_rate": 4.187619318698971e-06,
|
|
"loss": 0.15697014331817627,
|
|
"memory(GiB)": 34.77,
|
|
"step": 255,
|
|
"token_acc": 0.9451475779917865,
|
|
"train_speed(iter/s)": 0.139673
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"grad_norm": 0.8173830509185791,
|
|
"learning_rate": 4.020380415107167e-06,
|
|
"loss": 0.16766272783279418,
|
|
"memory(GiB)": 34.77,
|
|
"step": 260,
|
|
"token_acc": 0.9469431879605132,
|
|
"train_speed(iter/s)": 0.139915
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"eval_loss": 0.2292548418045044,
|
|
"eval_runtime": 4.8388,
|
|
"eval_samples_per_second": 20.666,
|
|
"eval_steps_per_second": 5.167,
|
|
"eval_token_acc": 0.9289213215876791,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.7277234196662903,
|
|
"learning_rate": 3.854273837372724e-06,
|
|
"loss": 0.16253018379211426,
|
|
"memory(GiB)": 34.77,
|
|
"step": 265,
|
|
"token_acc": 0.9426872469635628,
|
|
"train_speed(iter/s)": 0.138974
|
|
},
|
|
{
|
|
"epoch": 1.7434343434343433,
|
|
"grad_norm": 0.8412746787071228,
|
|
"learning_rate": 3.689491585304491e-06,
|
|
"loss": 0.16776057481765747,
|
|
"memory(GiB)": 34.77,
|
|
"step": 270,
|
|
"token_acc": 0.9380598276153456,
|
|
"train_speed(iter/s)": 0.139245
|
|
},
|
|
{
|
|
"epoch": 1.7757575757575759,
|
|
"grad_norm": 0.7006183862686157,
|
|
"learning_rate": 3.526224127945479e-06,
|
|
"loss": 0.15875219106674193,
|
|
"memory(GiB)": 34.77,
|
|
"step": 275,
|
|
"token_acc": 0.9433733748578773,
|
|
"train_speed(iter/s)": 0.13972
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 0.7234155535697937,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.159059476852417,
|
|
"memory(GiB)": 34.77,
|
|
"step": 280,
|
|
"token_acc": 0.9385077213505401,
|
|
"train_speed(iter/s)": 0.139973
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"eval_loss": 0.22943958640098572,
|
|
"eval_runtime": 4.8423,
|
|
"eval_samples_per_second": 20.652,
|
|
"eval_steps_per_second": 5.163,
|
|
"eval_token_acc": 0.9282157759993549,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.8404040404040405,
|
|
"grad_norm": 0.7152555584907532,
|
|
"learning_rate": 3.204986500762006e-06,
|
|
"loss": 0.1508580207824707,
|
|
"memory(GiB)": 34.77,
|
|
"step": 285,
|
|
"token_acc": 0.9341365308729054,
|
|
"train_speed(iter/s)": 0.138995
|
|
},
|
|
{
|
|
"epoch": 1.8727272727272726,
|
|
"grad_norm": 0.7166336178779602,
|
|
"learning_rate": 3.0473876441260786e-06,
|
|
"loss": 0.16788345575332642,
|
|
"memory(GiB)": 34.77,
|
|
"step": 290,
|
|
"token_acc": 0.9407490363579539,
|
|
"train_speed(iter/s)": 0.139247
|
|
},
|
|
{
|
|
"epoch": 1.905050505050505,
|
|
"grad_norm": 0.7887818813323975,
|
|
"learning_rate": 2.8920457793817507e-06,
|
|
"loss": 0.15700163841247558,
|
|
"memory(GiB)": 34.77,
|
|
"step": 295,
|
|
"token_acc": 0.9426423803879983,
|
|
"train_speed(iter/s)": 0.139471
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"grad_norm": 0.8042952418327332,
|
|
"learning_rate": 2.7391404635865725e-06,
|
|
"loss": 0.16809990406036376,
|
|
"memory(GiB)": 34.77,
|
|
"step": 300,
|
|
"token_acc": 0.944771353933029,
|
|
"train_speed(iter/s)": 0.139742
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"eval_loss": 0.2291778326034546,
|
|
"eval_runtime": 4.8337,
|
|
"eval_samples_per_second": 20.688,
|
|
"eval_steps_per_second": 5.172,
|
|
"eval_token_acc": 0.9292035398230089,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 0.6989196538925171,
|
|
"learning_rate": 2.5888484374320033e-06,
|
|
"loss": 0.1462658405303955,
|
|
"memory(GiB)": 34.77,
|
|
"step": 305,
|
|
"token_acc": 0.9413777899090852,
|
|
"train_speed(iter/s)": 0.138859
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.9214490056037903,
|
|
"learning_rate": 2.4413434209518137e-06,
|
|
"loss": 0.17060282230377197,
|
|
"memory(GiB)": 34.77,
|
|
"step": 310,
|
|
"token_acc": 0.9452530120481928,
|
|
"train_speed(iter/s)": 0.139266
|
|
},
|
|
{
|
|
"epoch": 2.0323232323232325,
|
|
"grad_norm": 0.6455035209655762,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 0.12579550743103027,
|
|
"memory(GiB)": 34.77,
|
|
"step": 315,
|
|
"token_acc": 0.9598925994294345,
|
|
"train_speed(iter/s)": 0.139482
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"grad_norm": 0.6761350631713867,
|
|
"learning_rate": 2.1553729927843894e-06,
|
|
"loss": 0.10464283227920532,
|
|
"memory(GiB)": 34.77,
|
|
"step": 320,
|
|
"token_acc": 0.9628479377702958,
|
|
"train_speed(iter/s)": 0.139728
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"eval_loss": 0.23895612359046936,
|
|
"eval_runtime": 4.8116,
|
|
"eval_samples_per_second": 20.783,
|
|
"eval_steps_per_second": 5.196,
|
|
"eval_token_acc": 0.9286391033523494,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.096969696969697,
|
|
"grad_norm": 0.7690797448158264,
|
|
"learning_rate": 2.017238129521506e-06,
|
|
"loss": 0.10620735883712769,
|
|
"memory(GiB)": 34.77,
|
|
"step": 325,
|
|
"token_acc": 0.951885791453651,
|
|
"train_speed(iter/s)": 0.138943
|
|
},
|
|
{
|
|
"epoch": 2.1292929292929292,
|
|
"grad_norm": 0.8825842142105103,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.11970834732055664,
|
|
"memory(GiB)": 34.77,
|
|
"step": 330,
|
|
"token_acc": 0.9611402417348027,
|
|
"train_speed(iter/s)": 0.139288
|
|
},
|
|
{
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 0.7779159545898438,
|
|
"learning_rate": 1.7514672589449378e-06,
|
|
"loss": 0.1070137619972229,
|
|
"memory(GiB)": 34.77,
|
|
"step": 335,
|
|
"token_acc": 0.9595128097438052,
|
|
"train_speed(iter/s)": 0.139418
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"grad_norm": 0.7191023826599121,
|
|
"learning_rate": 1.6241384517255854e-06,
|
|
"loss": 0.11621193885803223,
|
|
"memory(GiB)": 34.77,
|
|
"step": 340,
|
|
"token_acc": 0.9595157410042165,
|
|
"train_speed(iter/s)": 0.139652
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"eval_loss": 0.2503757178783417,
|
|
"eval_runtime": 4.8011,
|
|
"eval_samples_per_second": 20.829,
|
|
"eval_steps_per_second": 5.207,
|
|
"eval_token_acc": 0.928538311125446,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.2262626262626264,
|
|
"grad_norm": 0.6950616240501404,
|
|
"learning_rate": 1.500711746282192e-06,
|
|
"loss": 0.11175984144210815,
|
|
"memory(GiB)": 34.77,
|
|
"step": 345,
|
|
"token_acc": 0.9465600862223819,
|
|
"train_speed(iter/s)": 0.138949
|
|
},
|
|
{
|
|
"epoch": 2.2585858585858585,
|
|
"grad_norm": 0.7209317088127136,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.11107317209243775,
|
|
"memory(GiB)": 34.77,
|
|
"step": 350,
|
|
"token_acc": 0.9588838612368024,
|
|
"train_speed(iter/s)": 0.139108
|
|
},
|
|
{
|
|
"epoch": 2.290909090909091,
|
|
"grad_norm": 0.7023849487304688,
|
|
"learning_rate": 1.2661306328825818e-06,
|
|
"loss": 0.1061722993850708,
|
|
"memory(GiB)": 34.77,
|
|
"step": 355,
|
|
"token_acc": 0.9638245595692404,
|
|
"train_speed(iter/s)": 0.139405
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 0.6932234764099121,
|
|
"learning_rate": 1.1552473733031893e-06,
|
|
"loss": 0.11119704246520996,
|
|
"memory(GiB)": 34.77,
|
|
"step": 360,
|
|
"token_acc": 0.9628568099732029,
|
|
"train_speed(iter/s)": 0.139754
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"eval_loss": 0.2477826327085495,
|
|
"eval_runtime": 4.8417,
|
|
"eval_samples_per_second": 20.654,
|
|
"eval_steps_per_second": 5.163,
|
|
"eval_token_acc": 0.9284173604531618,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.7261531352996826,
|
|
"learning_rate": 1.0488081988375493e-06,
|
|
"loss": 0.11782848834991455,
|
|
"memory(GiB)": 34.77,
|
|
"step": 365,
|
|
"token_acc": 0.9477818154288743,
|
|
"train_speed(iter/s)": 0.139242
|
|
},
|
|
{
|
|
"epoch": 2.3878787878787877,
|
|
"grad_norm": 0.7011246681213379,
|
|
"learning_rate": 9.469361407432431e-07,
|
|
"loss": 0.10731152296066285,
|
|
"memory(GiB)": 34.77,
|
|
"step": 370,
|
|
"token_acc": 0.9617250245182495,
|
|
"train_speed(iter/s)": 0.139376
|
|
},
|
|
{
|
|
"epoch": 2.4202020202020202,
|
|
"grad_norm": 0.7193347811698914,
|
|
"learning_rate": 8.497489512245971e-07,
|
|
"loss": 0.11734654903411865,
|
|
"memory(GiB)": 34.77,
|
|
"step": 375,
|
|
"token_acc": 0.9643068481359944,
|
|
"train_speed(iter/s)": 0.139563
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"grad_norm": 0.8027297258377075,
|
|
"learning_rate": 7.573589673248833e-07,
|
|
"loss": 0.1112905502319336,
|
|
"memory(GiB)": 34.77,
|
|
"step": 380,
|
|
"token_acc": 0.964820230517805,
|
|
"train_speed(iter/s)": 0.139778
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"eval_loss": 0.24963097274303436,
|
|
"eval_runtime": 4.8179,
|
|
"eval_samples_per_second": 20.756,
|
|
"eval_steps_per_second": 5.189,
|
|
"eval_token_acc": 0.9288810046969178,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 0.7203919291496277,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.10394268035888672,
|
|
"memory(GiB)": 34.77,
|
|
"step": 385,
|
|
"token_acc": 0.9475781400629834,
|
|
"train_speed(iter/s)": 0.139104
|
|
},
|
|
{
|
|
"epoch": 2.517171717171717,
|
|
"grad_norm": 0.71819007396698,
|
|
"learning_rate": 5.873921160683943e-07,
|
|
"loss": 0.10946273803710938,
|
|
"memory(GiB)": 34.77,
|
|
"step": 390,
|
|
"token_acc": 0.9592940980604345,
|
|
"train_speed(iter/s)": 0.139407
|
|
},
|
|
{
|
|
"epoch": 2.5494949494949495,
|
|
"grad_norm": 0.7178409695625305,
|
|
"learning_rate": 5.100117105459279e-07,
|
|
"loss": 0.1130871295928955,
|
|
"memory(GiB)": 34.77,
|
|
"step": 395,
|
|
"token_acc": 0.9668103880477289,
|
|
"train_speed(iter/s)": 0.139538
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"grad_norm": 0.67863929271698,
|
|
"learning_rate": 4.3782120722406565e-07,
|
|
"loss": 0.10182794332504272,
|
|
"memory(GiB)": 34.77,
|
|
"step": 400,
|
|
"token_acc": 0.9624655998369177,
|
|
"train_speed(iter/s)": 0.139808
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"eval_loss": 0.24975040555000305,
|
|
"eval_runtime": 4.8388,
|
|
"eval_samples_per_second": 20.666,
|
|
"eval_steps_per_second": 5.167,
|
|
"eval_token_acc": 0.9285786280162074,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.614141414141414,
|
|
"grad_norm": 0.652603805065155,
|
|
"learning_rate": 3.709040498955102e-07,
|
|
"loss": 0.1031190037727356,
|
|
"memory(GiB)": 34.77,
|
|
"step": 405,
|
|
"token_acc": 0.9463408184402476,
|
|
"train_speed(iter/s)": 0.139139
|
|
},
|
|
{
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 1.0421696901321411,
|
|
"learning_rate": 3.0933758698072023e-07,
|
|
"loss": 0.121562659740448,
|
|
"memory(GiB)": 34.77,
|
|
"step": 410,
|
|
"token_acc": 0.9582254445019851,
|
|
"train_speed(iter/s)": 0.139341
|
|
},
|
|
{
|
|
"epoch": 2.6787878787878787,
|
|
"grad_norm": 0.7723399996757507,
|
|
"learning_rate": 2.531929821221768e-07,
|
|
"loss": 0.11757031679153443,
|
|
"memory(GiB)": 34.77,
|
|
"step": 415,
|
|
"token_acc": 0.9626831890454366,
|
|
"train_speed(iter/s)": 0.139573
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.6900373101234436,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.11266238689422607,
|
|
"memory(GiB)": 34.77,
|
|
"step": 420,
|
|
"token_acc": 0.9586481947942905,
|
|
"train_speed(iter/s)": 0.139767
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"eval_loss": 0.2500038743019104,
|
|
"eval_runtime": 4.8315,
|
|
"eval_samples_per_second": 20.697,
|
|
"eval_steps_per_second": 5.174,
|
|
"eval_token_acc": 0.9289616384784405,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.7434343434343433,
|
|
"grad_norm": 0.7542772889137268,
|
|
"learning_rate": 1.5742259095662126e-07,
|
|
"loss": 0.11297458410263062,
|
|
"memory(GiB)": 34.77,
|
|
"step": 425,
|
|
"token_acc": 0.9452227294191721,
|
|
"train_speed(iter/s)": 0.139122
|
|
},
|
|
{
|
|
"epoch": 2.775757575757576,
|
|
"grad_norm": 0.7564345598220825,
|
|
"learning_rate": 1.1790750403941231e-07,
|
|
"loss": 0.1145315408706665,
|
|
"memory(GiB)": 34.77,
|
|
"step": 430,
|
|
"token_acc": 0.9606825351304846,
|
|
"train_speed(iter/s)": 0.139315
|
|
},
|
|
{
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 0.7472628355026245,
|
|
"learning_rate": 8.403554600248498e-08,
|
|
"loss": 0.10037808418273926,
|
|
"memory(GiB)": 34.77,
|
|
"step": 435,
|
|
"token_acc": 0.968122471719594,
|
|
"train_speed(iter/s)": 0.139508
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"grad_norm": 0.7140054702758789,
|
|
"learning_rate": 5.584586887435739e-08,
|
|
"loss": 0.1066713809967041,
|
|
"memory(GiB)": 34.77,
|
|
"step": 440,
|
|
"token_acc": 0.9640665162880974,
|
|
"train_speed(iter/s)": 0.139714
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"eval_loss": 0.2495991587638855,
|
|
"eval_runtime": 4.8171,
|
|
"eval_samples_per_second": 20.76,
|
|
"eval_steps_per_second": 5.19,
|
|
"eval_token_acc": 0.9289414800330599,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.8727272727272726,
|
|
"grad_norm": 0.6888604164123535,
|
|
"learning_rate": 3.337105663029361e-08,
|
|
"loss": 0.11188592910766601,
|
|
"memory(GiB)": 34.77,
|
|
"step": 445,
|
|
"token_acc": 0.9492009251349155,
|
|
"train_speed(iter/s)": 0.13915
|
|
},
|
|
{
|
|
"epoch": 2.905050505050505,
|
|
"grad_norm": 0.7185996770858765,
|
|
"learning_rate": 1.6637087529033925e-08,
|
|
"loss": 0.11582531929016113,
|
|
"memory(GiB)": 34.77,
|
|
"step": 450,
|
|
"token_acc": 0.9591944327288064,
|
|
"train_speed(iter/s)": 0.139392
|
|
},
|
|
{
|
|
"epoch": 2.937373737373737,
|
|
"grad_norm": 0.6511121988296509,
|
|
"learning_rate": 5.6633040849601865e-09,
|
|
"loss": 0.10450353622436523,
|
|
"memory(GiB)": 34.77,
|
|
"step": 455,
|
|
"token_acc": 0.9614834408486059,
|
|
"train_speed(iter/s)": 0.139543
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 0.6926988363265991,
|
|
"learning_rate": 4.623907104084335e-10,
|
|
"loss": 0.11017694473266601,
|
|
"memory(GiB)": 34.77,
|
|
"step": 460,
|
|
"token_acc": 0.9601264597715982,
|
|
"train_speed(iter/s)": 0.139788
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"eval_loss": 0.24949033558368683,
|
|
"eval_runtime": 4.8501,
|
|
"eval_samples_per_second": 20.618,
|
|
"eval_steps_per_second": 5.155,
|
|
"eval_token_acc": 0.9285786280162074,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.9826262626262627,
|
|
"eval_loss": 0.2496582567691803,
|
|
"eval_runtime": 4.861,
|
|
"eval_samples_per_second": 20.572,
|
|
"eval_steps_per_second": 5.143,
|
|
"eval_token_acc": 0.929062430705344,
|
|
"step": 462
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 462,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 5.5801089900517786e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|