Files
qwen2.5vl-3b-sampled_10000_…/trainer_state.json
ModelHub XC e7eb2db55c 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-sampled_10000_caption-cot-7b
Source: Original Platform
2026-05-22 12:53:13 +08:00

1181 lines
33 KiB
JSON

{
"best_global_step": 300,
"best_metric": 0.40253255,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v8-20250507-004645/checkpoint-300",
"epoch": 2.9826262626262627,
"eval_steps": 20,
"global_step": 462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006464646464646465,
"grad_norm": 4.932199478149414,
"learning_rate": 9.999884400986087e-06,
"loss": 0.7780591249465942,
"memory(GiB)": 27.73,
"step": 1,
"token_acc": 0.782099343955014,
"train_speed(iter/s)": 0.064891
},
{
"epoch": 0.03232323232323232,
"grad_norm": 2.3600621223449707,
"learning_rate": 9.997110291906109e-06,
"loss": 0.6091042757034302,
"memory(GiB)": 27.73,
"step": 5,
"token_acc": 0.8179287124866458,
"train_speed(iter/s)": 0.118621
},
{
"epoch": 0.06464646464646465,
"grad_norm": 1.088510274887085,
"learning_rate": 9.988444507789584e-06,
"loss": 0.4719734191894531,
"memory(GiB)": 27.73,
"step": 10,
"token_acc": 0.8583190394511149,
"train_speed(iter/s)": 0.135341
},
{
"epoch": 0.09696969696969697,
"grad_norm": 1.0002374649047852,
"learning_rate": 9.97401266428502e-06,
"loss": 0.47036895751953123,
"memory(GiB)": 27.73,
"step": 15,
"token_acc": 0.8504078264405482,
"train_speed(iter/s)": 0.137962
},
{
"epoch": 0.1292929292929293,
"grad_norm": 0.9563055038452148,
"learning_rate": 9.953831442918418e-06,
"loss": 0.42792816162109376,
"memory(GiB)": 27.73,
"step": 20,
"token_acc": 0.863187115610118,
"train_speed(iter/s)": 0.141664
},
{
"epoch": 0.1292929292929293,
"eval_loss": 0.4551742970943451,
"eval_runtime": 5.4465,
"eval_samples_per_second": 18.36,
"eval_steps_per_second": 4.59,
"eval_token_acc": 0.8559020470633251,
"step": 20
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.8774688243865967,
"learning_rate": 9.927924170825266e-06,
"loss": 0.41231346130371094,
"memory(GiB)": 27.73,
"step": 25,
"token_acc": 0.8627429786160803,
"train_speed(iter/s)": 0.130766
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.7832381129264832,
"learning_rate": 9.896320793787106e-06,
"loss": 0.4305295467376709,
"memory(GiB)": 27.73,
"step": 30,
"token_acc": 0.8626980747248807,
"train_speed(iter/s)": 0.135526
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.7605119943618774,
"learning_rate": 9.859057841617709e-06,
"loss": 0.40700688362121584,
"memory(GiB)": 27.77,
"step": 35,
"token_acc": 0.8702584217812644,
"train_speed(iter/s)": 0.137556
},
{
"epoch": 0.2585858585858586,
"grad_norm": 0.7823914289474487,
"learning_rate": 9.816178385938867e-06,
"loss": 0.40500674247741697,
"memory(GiB)": 27.77,
"step": 40,
"token_acc": 0.8749971213412246,
"train_speed(iter/s)": 0.139743
},
{
"epoch": 0.2585858585858586,
"eval_loss": 0.4355735778808594,
"eval_runtime": 5.436,
"eval_samples_per_second": 18.396,
"eval_steps_per_second": 4.599,
"eval_token_acc": 0.8621580256361201,
"step": 40
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.8007071018218994,
"learning_rate": 9.767731990394638e-06,
"loss": 0.41349210739135744,
"memory(GiB)": 27.77,
"step": 45,
"token_acc": 0.8671768894761958,
"train_speed(iter/s)": 0.134577
},
{
"epoch": 0.32323232323232326,
"grad_norm": 0.8084492683410645,
"learning_rate": 9.71377465336155e-06,
"loss": 0.41720309257507326,
"memory(GiB)": 27.77,
"step": 50,
"token_acc": 0.8574223526534605,
"train_speed(iter/s)": 0.136609
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.757235050201416,
"learning_rate": 9.654368743221022e-06,
"loss": 0.41148929595947265,
"memory(GiB)": 27.77,
"step": 55,
"token_acc": 0.8688445445767622,
"train_speed(iter/s)": 0.138016
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.787464439868927,
"learning_rate": 9.589582926268798e-06,
"loss": 0.40866241455078123,
"memory(GiB)": 30.15,
"step": 60,
"token_acc": 0.8828734404289198,
"train_speed(iter/s)": 0.139782
},
{
"epoch": 0.3878787878787879,
"eval_loss": 0.4253135919570923,
"eval_runtime": 5.4547,
"eval_samples_per_second": 18.333,
"eval_steps_per_second": 4.583,
"eval_token_acc": 0.8650277405777693,
"step": 60
},
{
"epoch": 0.4202020202020202,
"grad_norm": 0.8023079633712769,
"learning_rate": 9.519492087344724e-06,
"loss": 0.3891183137893677,
"memory(GiB)": 30.15,
"step": 65,
"token_acc": 0.8755363232975173,
"train_speed(iter/s)": 0.135184
},
{
"epoch": 0.45252525252525255,
"grad_norm": 0.762795090675354,
"learning_rate": 9.444177243274619e-06,
"loss": 0.4177716255187988,
"memory(GiB)": 30.15,
"step": 70,
"token_acc": 0.8719830172135309,
"train_speed(iter/s)": 0.137146
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.7374395728111267,
"learning_rate": 9.363725449224281e-06,
"loss": 0.3992452621459961,
"memory(GiB)": 30.15,
"step": 75,
"token_acc": 0.872343302756429,
"train_speed(iter/s)": 0.138292
},
{
"epoch": 0.5171717171717172,
"grad_norm": 0.7449392080307007,
"learning_rate": 9.278229698073889e-06,
"loss": 0.39937677383422854,
"memory(GiB)": 30.15,
"step": 80,
"token_acc": 0.8710408988995696,
"train_speed(iter/s)": 0.138969
},
{
"epoch": 0.5171717171717172,
"eval_loss": 0.4182729721069336,
"eval_runtime": 5.4623,
"eval_samples_per_second": 18.307,
"eval_steps_per_second": 4.577,
"eval_token_acc": 0.8654868949684331,
"step": 80
},
{
"epoch": 0.5494949494949495,
"grad_norm": 0.7495951652526855,
"learning_rate": 9.187788812929074e-06,
"loss": 0.39512038230895996,
"memory(GiB)": 30.15,
"step": 85,
"token_acc": 0.870334291390984,
"train_speed(iter/s)": 0.136059
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.8733298182487488,
"learning_rate": 9.092507332892968e-06,
"loss": 0.4132417678833008,
"memory(GiB)": 30.15,
"step": 90,
"token_acc": 0.8729202391435325,
"train_speed(iter/s)": 0.137339
},
{
"epoch": 0.6141414141414141,
"grad_norm": 0.8373153805732727,
"learning_rate": 8.992495392231195e-06,
"loss": 0.40344934463500975,
"memory(GiB)": 30.15,
"step": 95,
"token_acc": 0.8812556053811659,
"train_speed(iter/s)": 0.138502
},
{
"epoch": 0.6464646464646465,
"grad_norm": 0.8512130379676819,
"learning_rate": 8.88786859306952e-06,
"loss": 0.3863351821899414,
"memory(GiB)": 30.16,
"step": 100,
"token_acc": 0.8727646779553727,
"train_speed(iter/s)": 0.139249
},
{
"epoch": 0.6464646464646465,
"eval_loss": 0.4132169485092163,
"eval_runtime": 5.4403,
"eval_samples_per_second": 18.381,
"eval_steps_per_second": 4.595,
"eval_token_acc": 0.8673235125310885,
"step": 100
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.8247293829917908,
"learning_rate": 8.778747871771293e-06,
"loss": 0.40260896682739256,
"memory(GiB)": 30.16,
"step": 105,
"token_acc": 0.8619686556852478,
"train_speed(iter/s)": 0.137282
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.7707865834236145,
"learning_rate": 8.665259359149132e-06,
"loss": 0.3850682020187378,
"memory(GiB)": 30.16,
"step": 110,
"token_acc": 0.8824787229538045,
"train_speed(iter/s)": 0.138008
},
{
"epoch": 0.7434343434343434,
"grad_norm": 0.7147298455238342,
"learning_rate": 8.547534234672435e-06,
"loss": 0.37834107875823975,
"memory(GiB)": 30.16,
"step": 115,
"token_acc": 0.8782488780852655,
"train_speed(iter/s)": 0.138792
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.7929354906082153,
"learning_rate": 8.425708574839221e-06,
"loss": 0.40454673767089844,
"memory(GiB)": 30.16,
"step": 120,
"token_acc": 0.8664209147790658,
"train_speed(iter/s)": 0.139398
},
{
"epoch": 0.7757575757575758,
"eval_loss": 0.40885430574417114,
"eval_runtime": 5.4542,
"eval_samples_per_second": 18.334,
"eval_steps_per_second": 4.584,
"eval_token_acc": 0.8693323129902429,
"step": 120
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.7925952076911926,
"learning_rate": 8.299923195887599e-06,
"loss": 0.39439709186553956,
"memory(GiB)": 30.16,
"step": 125,
"token_acc": 0.8721447484554281,
"train_speed(iter/s)": 0.137539
},
{
"epoch": 0.8404040404040404,
"grad_norm": 0.7893044352531433,
"learning_rate": 8.170323491028625e-06,
"loss": 0.39348788261413575,
"memory(GiB)": 30.16,
"step": 130,
"token_acc": 0.872299544278852,
"train_speed(iter/s)": 0.137807
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.7297965884208679,
"learning_rate": 8.03705926238874e-06,
"loss": 0.390042781829834,
"memory(GiB)": 30.16,
"step": 135,
"token_acc": 0.872897976215314,
"train_speed(iter/s)": 0.138413
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.7339494824409485,
"learning_rate": 7.900284547855992e-06,
"loss": 0.3968710660934448,
"memory(GiB)": 30.16,
"step": 140,
"token_acc": 0.8762483817273904,
"train_speed(iter/s)": 0.138796
},
{
"epoch": 0.9050505050505051,
"eval_loss": 0.40561485290527344,
"eval_runtime": 5.454,
"eval_samples_per_second": 18.335,
"eval_steps_per_second": 4.584,
"eval_token_acc": 0.8705184618327912,
"step": 140
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.7347233891487122,
"learning_rate": 7.760157443030234e-06,
"loss": 0.3751286506652832,
"memory(GiB)": 30.16,
"step": 145,
"token_acc": 0.8804492278895648,
"train_speed(iter/s)": 0.137132
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.7385802268981934,
"learning_rate": 7.616839918483061e-06,
"loss": 0.38750033378601073,
"memory(GiB)": 30.16,
"step": 150,
"token_acc": 0.8667628785284477,
"train_speed(iter/s)": 0.137631
},
{
"epoch": 1.0,
"grad_norm": 0.7280504107475281,
"learning_rate": 7.470497632538743e-06,
"loss": 0.38422205448150637,
"memory(GiB)": 30.16,
"step": 155,
"token_acc": 0.8743071565213126,
"train_speed(iter/s)": 0.138389
},
{
"epoch": 1.0323232323232323,
"grad_norm": 0.7759126424789429,
"learning_rate": 7.321299739792553e-06,
"loss": 0.33709375858306884,
"memory(GiB)": 30.16,
"step": 160,
"token_acc": 0.8903214253738025,
"train_speed(iter/s)": 0.138965
},
{
"epoch": 1.0323232323232323,
"eval_loss": 0.41121506690979004,
"eval_runtime": 5.4481,
"eval_samples_per_second": 18.355,
"eval_steps_per_second": 4.589,
"eval_token_acc": 0.8699062559785727,
"step": 160
},
{
"epoch": 1.0646464646464646,
"grad_norm": 0.7367027997970581,
"learning_rate": 7.169418695587791e-06,
"loss": 0.3059047222137451,
"memory(GiB)": 30.16,
"step": 165,
"token_acc": 0.893117110476366,
"train_speed(iter/s)": 0.13755
},
{
"epoch": 1.096969696969697,
"grad_norm": 0.7874158024787903,
"learning_rate": 7.015030056677559e-06,
"loss": 0.3194535255432129,
"memory(GiB)": 30.16,
"step": 170,
"token_acc": 0.8963855982498197,
"train_speed(iter/s)": 0.13837
},
{
"epoch": 1.1292929292929292,
"grad_norm": 0.8298231959342957,
"learning_rate": 6.858312278301638e-06,
"loss": 0.32886972427368166,
"memory(GiB)": 30.16,
"step": 175,
"token_acc": 0.8890347381744879,
"train_speed(iter/s)": 0.138796
},
{
"epoch": 1.1616161616161615,
"grad_norm": 0.7421779632568359,
"learning_rate": 6.699446507913083e-06,
"loss": 0.3223016977310181,
"memory(GiB)": 30.16,
"step": 180,
"token_acc": 0.8996364289240989,
"train_speed(iter/s)": 0.139126
},
{
"epoch": 1.1616161616161615,
"eval_loss": 0.4112629294395447,
"eval_runtime": 5.4548,
"eval_samples_per_second": 18.333,
"eval_steps_per_second": 4.583,
"eval_token_acc": 0.8689114214654677,
"step": 180
},
{
"epoch": 1.1939393939393939,
"grad_norm": 0.6949385404586792,
"learning_rate": 6.53861637579291e-06,
"loss": 0.3096341609954834,
"memory(GiB)": 30.16,
"step": 185,
"token_acc": 0.8867440022985204,
"train_speed(iter/s)": 0.137864
},
{
"epoch": 1.2262626262626264,
"grad_norm": 0.7675971984863281,
"learning_rate": 6.376007782794926e-06,
"loss": 0.3296669483184814,
"memory(GiB)": 30.16,
"step": 190,
"token_acc": 0.8872481430414091,
"train_speed(iter/s)": 0.138534
},
{
"epoch": 1.2585858585858585,
"grad_norm": 0.6753478646278381,
"learning_rate": 6.211808685466063e-06,
"loss": 0.31036269664764404,
"memory(GiB)": 30.16,
"step": 195,
"token_acc": 0.8989660334986432,
"train_speed(iter/s)": 0.139226
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.7082095742225647,
"learning_rate": 6.046208878790543e-06,
"loss": 0.3189213752746582,
"memory(GiB)": 30.16,
"step": 200,
"token_acc": 0.893559169826382,
"train_speed(iter/s)": 0.139505
},
{
"epoch": 1.290909090909091,
"eval_loss": 0.4101768136024475,
"eval_runtime": 5.4502,
"eval_samples_per_second": 18.348,
"eval_steps_per_second": 4.587,
"eval_token_acc": 0.8691792615266883,
"step": 200
},
{
"epoch": 1.3232323232323233,
"grad_norm": 0.7023712992668152,
"learning_rate": 5.879399776809047e-06,
"loss": 0.3078160285949707,
"memory(GiB)": 30.16,
"step": 205,
"token_acc": 0.8920373624341071,
"train_speed(iter/s)": 0.138308
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.7248120307922363,
"learning_rate": 5.711574191366427e-06,
"loss": 0.326322340965271,
"memory(GiB)": 30.16,
"step": 210,
"token_acc": 0.888576901881544,
"train_speed(iter/s)": 0.138642
},
{
"epoch": 1.387878787878788,
"grad_norm": 0.7424785494804382,
"learning_rate": 5.542926109243727e-06,
"loss": 0.3178426504135132,
"memory(GiB)": 30.16,
"step": 215,
"token_acc": 0.8996045025859446,
"train_speed(iter/s)": 0.138982
},
{
"epoch": 1.4202020202020202,
"grad_norm": 0.7585700154304504,
"learning_rate": 5.373650467932122e-06,
"loss": 0.31358323097229,
"memory(GiB)": 30.16,
"step": 220,
"token_acc": 0.8893666839273251,
"train_speed(iter/s)": 0.139322
},
{
"epoch": 1.4202020202020202,
"eval_loss": 0.4098529815673828,
"eval_runtime": 5.4526,
"eval_samples_per_second": 18.34,
"eval_steps_per_second": 4.585,
"eval_token_acc": 0.8699062559785727,
"step": 220
},
{
"epoch": 1.4525252525252526,
"grad_norm": 0.7577831149101257,
"learning_rate": 5.2039429303079294e-06,
"loss": 0.3181041717529297,
"memory(GiB)": 30.16,
"step": 225,
"token_acc": 0.8940511833475905,
"train_speed(iter/s)": 0.138553
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.8157357573509216,
"learning_rate": 5.033999658469174e-06,
"loss": 0.3100062370300293,
"memory(GiB)": 30.16,
"step": 230,
"token_acc": 0.8942779905384095,
"train_speed(iter/s)": 0.138826
},
{
"epoch": 1.5171717171717172,
"grad_norm": 0.7473869919776917,
"learning_rate": 4.864017086995112e-06,
"loss": 0.3215769290924072,
"memory(GiB)": 30.16,
"step": 235,
"token_acc": 0.8864230396902226,
"train_speed(iter/s)": 0.139172
},
{
"epoch": 1.5494949494949495,
"grad_norm": 0.7379017472267151,
"learning_rate": 4.694191695890788e-06,
"loss": 0.32453505992889403,
"memory(GiB)": 30.16,
"step": 240,
"token_acc": 0.9024658286970259,
"train_speed(iter/s)": 0.139554
},
{
"epoch": 1.5494949494949495,
"eval_loss": 0.406717449426651,
"eval_runtime": 5.4667,
"eval_samples_per_second": 18.293,
"eval_steps_per_second": 4.573,
"eval_token_acc": 0.8711880619858428,
"step": 240
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.7182732224464417,
"learning_rate": 4.524719783479088e-06,
"loss": 0.3010763645172119,
"memory(GiB)": 30.16,
"step": 245,
"token_acc": 0.8939800233960227,
"train_speed(iter/s)": 0.138563
},
{
"epoch": 1.614141414141414,
"grad_norm": 0.7385874390602112,
"learning_rate": 4.355797239502807e-06,
"loss": 0.30601317882537843,
"memory(GiB)": 30.16,
"step": 250,
"token_acc": 0.9005994116476079,
"train_speed(iter/s)": 0.138773
},
{
"epoch": 1.6464646464646466,
"grad_norm": 0.7460725903511047,
"learning_rate": 4.187619318698971e-06,
"loss": 0.32054686546325684,
"memory(GiB)": 30.16,
"step": 255,
"token_acc": 0.8981558249490219,
"train_speed(iter/s)": 0.139197
},
{
"epoch": 1.6787878787878787,
"grad_norm": 0.7663230299949646,
"learning_rate": 4.020380415107167e-06,
"loss": 0.32004489898681643,
"memory(GiB)": 30.16,
"step": 260,
"token_acc": 0.899984937490586,
"train_speed(iter/s)": 0.139402
},
{
"epoch": 1.6787878787878787,
"eval_loss": 0.406484454870224,
"eval_runtime": 5.4651,
"eval_samples_per_second": 18.298,
"eval_steps_per_second": 4.574,
"eval_token_acc": 0.8711306676870098,
"step": 260
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.7412447929382324,
"learning_rate": 3.854273837372724e-06,
"loss": 0.3273331642150879,
"memory(GiB)": 30.16,
"step": 265,
"token_acc": 0.8869161986402602,
"train_speed(iter/s)": 0.138596
},
{
"epoch": 1.7434343434343433,
"grad_norm": 0.773398756980896,
"learning_rate": 3.689491585304491e-06,
"loss": 0.3207144498825073,
"memory(GiB)": 30.16,
"step": 270,
"token_acc": 0.8838720231835285,
"train_speed(iter/s)": 0.138842
},
{
"epoch": 1.7757575757575759,
"grad_norm": 0.737702488899231,
"learning_rate": 3.526224127945479e-06,
"loss": 0.32349045276641847,
"memory(GiB)": 30.16,
"step": 275,
"token_acc": 0.899477893067213,
"train_speed(iter/s)": 0.139328
},
{
"epoch": 1.808080808080808,
"grad_norm": 0.7224950194358826,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.30070719718933103,
"memory(GiB)": 30.16,
"step": 280,
"token_acc": 0.8971495671394364,
"train_speed(iter/s)": 0.139562
},
{
"epoch": 1.808080808080808,
"eval_loss": 0.40459758043289185,
"eval_runtime": 5.4432,
"eval_samples_per_second": 18.371,
"eval_steps_per_second": 4.593,
"eval_token_acc": 0.8716854792423953,
"step": 280
},
{
"epoch": 1.8404040404040405,
"grad_norm": 0.6978333592414856,
"learning_rate": 3.204986500762006e-06,
"loss": 0.31296162605285643,
"memory(GiB)": 30.16,
"step": 285,
"token_acc": 0.8931119696495075,
"train_speed(iter/s)": 0.138664
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.7149765491485596,
"learning_rate": 3.0473876441260786e-06,
"loss": 0.2978228569030762,
"memory(GiB)": 30.16,
"step": 290,
"token_acc": 0.9099453551912569,
"train_speed(iter/s)": 0.138912
},
{
"epoch": 1.905050505050505,
"grad_norm": 0.7401219010353088,
"learning_rate": 2.8920457793817507e-06,
"loss": 0.3145498752593994,
"memory(GiB)": 30.16,
"step": 295,
"token_acc": 0.8971085419769723,
"train_speed(iter/s)": 0.139171
},
{
"epoch": 1.9373737373737374,
"grad_norm": 0.7960948348045349,
"learning_rate": 2.7391404635865725e-06,
"loss": 0.31858437061309813,
"memory(GiB)": 30.16,
"step": 300,
"token_acc": 0.8927697189483228,
"train_speed(iter/s)": 0.139487
},
{
"epoch": 1.9373737373737374,
"eval_loss": 0.40253254771232605,
"eval_runtime": 5.4598,
"eval_samples_per_second": 18.316,
"eval_steps_per_second": 4.579,
"eval_token_acc": 0.871608953510618,
"step": 300
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.7060583233833313,
"learning_rate": 2.5888484374320033e-06,
"loss": 0.3089438438415527,
"memory(GiB)": 30.16,
"step": 305,
"token_acc": 0.8951569409988135,
"train_speed(iter/s)": 0.138796
},
{
"epoch": 2.0,
"grad_norm": 0.8663066625595093,
"learning_rate": 2.4413434209518137e-06,
"loss": 0.30643525123596194,
"memory(GiB)": 30.16,
"step": 310,
"token_acc": 0.9002104614752836,
"train_speed(iter/s)": 0.139084
},
{
"epoch": 2.0323232323232325,
"grad_norm": 0.6777763366699219,
"learning_rate": 2.296795912722014e-06,
"loss": 0.2622525691986084,
"memory(GiB)": 30.16,
"step": 315,
"token_acc": 0.9182915057915058,
"train_speed(iter/s)": 0.139164
},
{
"epoch": 2.0646464646464646,
"grad_norm": 0.7604569792747498,
"learning_rate": 2.1553729927843894e-06,
"loss": 0.2744235277175903,
"memory(GiB)": 30.16,
"step": 320,
"token_acc": 0.9147708067912951,
"train_speed(iter/s)": 0.139363
},
{
"epoch": 2.0646464646464646,
"eval_loss": 0.41404902935028076,
"eval_runtime": 5.4464,
"eval_samples_per_second": 18.361,
"eval_steps_per_second": 4.59,
"eval_token_acc": 0.8715898220776737,
"step": 320
},
{
"epoch": 2.096969696969697,
"grad_norm": 0.7027745246887207,
"learning_rate": 2.017238129521506e-06,
"loss": 0.2601346492767334,
"memory(GiB)": 30.16,
"step": 325,
"token_acc": 0.9096833050957904,
"train_speed(iter/s)": 0.138685
},
{
"epoch": 2.1292929292929292,
"grad_norm": 0.7389653325080872,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.26451470851898196,
"memory(GiB)": 30.16,
"step": 330,
"token_acc": 0.9258255445505091,
"train_speed(iter/s)": 0.138988
},
{
"epoch": 2.1616161616161618,
"grad_norm": 0.750593364238739,
"learning_rate": 1.7514672589449378e-06,
"loss": 0.283371901512146,
"memory(GiB)": 30.16,
"step": 335,
"token_acc": 0.904814352497736,
"train_speed(iter/s)": 0.139218
},
{
"epoch": 2.193939393939394,
"grad_norm": 0.6902281641960144,
"learning_rate": 1.6241384517255854e-06,
"loss": 0.2589299440383911,
"memory(GiB)": 30.16,
"step": 340,
"token_acc": 0.9170253055603375,
"train_speed(iter/s)": 0.139418
},
{
"epoch": 2.193939393939394,
"eval_loss": 0.4144185781478882,
"eval_runtime": 5.4505,
"eval_samples_per_second": 18.347,
"eval_steps_per_second": 4.587,
"eval_token_acc": 0.8716472163765066,
"step": 340
},
{
"epoch": 2.2262626262626264,
"grad_norm": 0.6674037575721741,
"learning_rate": 1.500711746282192e-06,
"loss": 0.2723775148391724,
"memory(GiB)": 30.16,
"step": 345,
"token_acc": 0.9019237534484993,
"train_speed(iter/s)": 0.138831
},
{
"epoch": 2.2585858585858585,
"grad_norm": 0.7093120217323303,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.2645721912384033,
"memory(GiB)": 30.16,
"step": 350,
"token_acc": 0.9119130680746748,
"train_speed(iter/s)": 0.138979
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.6906498074531555,
"learning_rate": 1.2661306328825818e-06,
"loss": 0.259444522857666,
"memory(GiB)": 30.16,
"step": 355,
"token_acc": 0.9145142038672714,
"train_speed(iter/s)": 0.139182
},
{
"epoch": 2.323232323232323,
"grad_norm": 0.7055203318595886,
"learning_rate": 1.1552473733031893e-06,
"loss": 0.25058302879333494,
"memory(GiB)": 30.16,
"step": 360,
"token_acc": 0.9134637201070926,
"train_speed(iter/s)": 0.139508
},
{
"epoch": 2.323232323232323,
"eval_loss": 0.41360363364219666,
"eval_runtime": 5.4444,
"eval_samples_per_second": 18.367,
"eval_steps_per_second": 4.592,
"eval_token_acc": 0.8721828964989478,
"step": 360
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.7174535393714905,
"learning_rate": 1.0488081988375493e-06,
"loss": 0.26172120571136476,
"memory(GiB)": 30.16,
"step": 365,
"token_acc": 0.9057994175722708,
"train_speed(iter/s)": 0.139028
},
{
"epoch": 2.3878787878787877,
"grad_norm": 0.7143301367759705,
"learning_rate": 9.469361407432431e-07,
"loss": 0.2703177213668823,
"memory(GiB)": 30.16,
"step": 370,
"token_acc": 0.9194112781795432,
"train_speed(iter/s)": 0.139208
},
{
"epoch": 2.4202020202020202,
"grad_norm": 0.7012506127357483,
"learning_rate": 8.497489512245971e-07,
"loss": 0.27690658569335935,
"memory(GiB)": 30.16,
"step": 375,
"token_acc": 0.9169483450919897,
"train_speed(iter/s)": 0.139394
},
{
"epoch": 2.4525252525252528,
"grad_norm": 0.7648996114730835,
"learning_rate": 7.573589673248833e-07,
"loss": 0.26938657760620116,
"memory(GiB)": 30.16,
"step": 380,
"token_acc": 0.9042763382008948,
"train_speed(iter/s)": 0.139668
},
{
"epoch": 2.4525252525252528,
"eval_loss": 0.41534754633903503,
"eval_runtime": 5.5436,
"eval_samples_per_second": 18.039,
"eval_steps_per_second": 4.51,
"eval_token_acc": 0.871436770614119,
"step": 380
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.6768928170204163,
"learning_rate": 6.698729810778065e-07,
"loss": 0.2653059482574463,
"memory(GiB)": 30.16,
"step": 385,
"token_acc": 0.9062684911242603,
"train_speed(iter/s)": 0.139087
},
{
"epoch": 2.517171717171717,
"grad_norm": 0.6828300952911377,
"learning_rate": 5.873921160683943e-07,
"loss": 0.27868268489837644,
"memory(GiB)": 30.16,
"step": 390,
"token_acc": 0.9041146306155998,
"train_speed(iter/s)": 0.139361
},
{
"epoch": 2.5494949494949495,
"grad_norm": 0.6979082822799683,
"learning_rate": 5.100117105459279e-07,
"loss": 0.24405245780944823,
"memory(GiB)": 30.16,
"step": 395,
"token_acc": 0.9200107009095773,
"train_speed(iter/s)": 0.139546
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.6200575828552246,
"learning_rate": 4.3782120722406565e-07,
"loss": 0.2625063419342041,
"memory(GiB)": 30.16,
"step": 400,
"token_acc": 0.9186572124972302,
"train_speed(iter/s)": 0.139799
},
{
"epoch": 2.581818181818182,
"eval_loss": 0.41504132747650146,
"eval_runtime": 5.4443,
"eval_samples_per_second": 18.368,
"eval_steps_per_second": 4.592,
"eval_token_acc": 0.871723742108284,
"step": 400
},
{
"epoch": 2.614141414141414,
"grad_norm": 0.7500612139701843,
"learning_rate": 3.709040498955102e-07,
"loss": 0.26823058128356936,
"memory(GiB)": 30.16,
"step": 405,
"token_acc": 0.8981710236522072,
"train_speed(iter/s)": 0.139221
},
{
"epoch": 2.6464646464646466,
"grad_norm": 0.8057283163070679,
"learning_rate": 3.0933758698072023e-07,
"loss": 0.27291839122772216,
"memory(GiB)": 30.16,
"step": 410,
"token_acc": 0.9183253730661121,
"train_speed(iter/s)": 0.139402
},
{
"epoch": 2.6787878787878787,
"grad_norm": 0.7092121243476868,
"learning_rate": 2.531929821221768e-07,
"loss": 0.28043303489685056,
"memory(GiB)": 30.16,
"step": 415,
"token_acc": 0.9004696220894495,
"train_speed(iter/s)": 0.13956
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.7463679909706116,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.26267204284667967,
"memory(GiB)": 30.16,
"step": 420,
"token_acc": 0.9165323480546532,
"train_speed(iter/s)": 0.139835
},
{
"epoch": 2.7111111111111112,
"eval_loss": 0.41523492336273193,
"eval_runtime": 5.4494,
"eval_samples_per_second": 18.351,
"eval_steps_per_second": 4.588,
"eval_token_acc": 0.8714559020470634,
"step": 420
},
{
"epoch": 2.7434343434343433,
"grad_norm": 0.6929903626441956,
"learning_rate": 1.5742259095662126e-07,
"loss": 0.26644191741943357,
"memory(GiB)": 30.16,
"step": 425,
"token_acc": 0.8992937483651583,
"train_speed(iter/s)": 0.139333
},
{
"epoch": 2.775757575757576,
"grad_norm": 0.707722008228302,
"learning_rate": 1.1790750403941231e-07,
"loss": 0.266437292098999,
"memory(GiB)": 30.16,
"step": 430,
"token_acc": 0.9212307137056753,
"train_speed(iter/s)": 0.139491
},
{
"epoch": 2.808080808080808,
"grad_norm": 0.7085736393928528,
"learning_rate": 8.403554600248498e-08,
"loss": 0.25338120460510255,
"memory(GiB)": 30.16,
"step": 435,
"token_acc": 0.9183108895950982,
"train_speed(iter/s)": 0.139672
},
{
"epoch": 2.8404040404040405,
"grad_norm": 0.7079160213470459,
"learning_rate": 5.584586887435739e-08,
"loss": 0.26110315322875977,
"memory(GiB)": 30.16,
"step": 440,
"token_acc": 0.9123924065558306,
"train_speed(iter/s)": 0.139879
},
{
"epoch": 2.8404040404040405,
"eval_loss": 0.4152638614177704,
"eval_runtime": 5.4468,
"eval_samples_per_second": 18.359,
"eval_steps_per_second": 4.59,
"eval_token_acc": 0.8718959250047829,
"step": 440
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.7511946558952332,
"learning_rate": 3.337105663029361e-08,
"loss": 0.2701514482498169,
"memory(GiB)": 30.16,
"step": 445,
"token_acc": 0.9035763569457221,
"train_speed(iter/s)": 0.13938
},
{
"epoch": 2.905050505050505,
"grad_norm": 0.7157277464866638,
"learning_rate": 1.6637087529033925e-08,
"loss": 0.25613832473754883,
"memory(GiB)": 30.16,
"step": 450,
"token_acc": 0.9136720727064674,
"train_speed(iter/s)": 0.139627
},
{
"epoch": 2.937373737373737,
"grad_norm": 0.7009196281433105,
"learning_rate": 5.6633040849601865e-09,
"loss": 0.25980963706970217,
"memory(GiB)": 30.16,
"step": 455,
"token_acc": 0.9089919103920349,
"train_speed(iter/s)": 0.139745
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.7183138728141785,
"learning_rate": 4.623907104084335e-10,
"loss": 0.27978599071502686,
"memory(GiB)": 30.16,
"step": 460,
"token_acc": 0.9131394658753709,
"train_speed(iter/s)": 0.140016
},
{
"epoch": 2.9696969696969697,
"eval_loss": 0.41502535343170166,
"eval_runtime": 5.4392,
"eval_samples_per_second": 18.385,
"eval_steps_per_second": 4.596,
"eval_token_acc": 0.8717046106753396,
"step": 460
},
{
"epoch": 2.9826262626262627,
"eval_loss": 0.41519150137901306,
"eval_runtime": 5.4401,
"eval_samples_per_second": 18.382,
"eval_steps_per_second": 4.596,
"eval_token_acc": 0.871608953510618,
"step": 462
}
],
"logging_steps": 5,
"max_steps": 462,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.033815544329667e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}