Files
qwen2.5vl-3b-sampled_10000_…/trainer_state.json
ModelHub XC 6ab8246323 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-sampled_10000_qwen2.5vl7b
Source: Original Platform
2026-05-22 04:44:13 +08:00

1181 lines
33 KiB
JSON

{
"best_global_step": 120,
"best_metric": 0.33126009,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v7-20250507-004227/checkpoint-120",
"epoch": 2.9826262626262627,
"eval_steps": 20,
"global_step": 462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006464646464646465,
"grad_norm": 2.4505362510681152,
"learning_rate": 9.999884400986087e-06,
"loss": 0.4081788659095764,
"memory(GiB)": 27.77,
"step": 1,
"token_acc": 0.8560397131825703,
"train_speed(iter/s)": 0.065308
},
{
"epoch": 0.03232323232323232,
"grad_norm": 1.3398605585098267,
"learning_rate": 9.997110291906109e-06,
"loss": 0.3790343999862671,
"memory(GiB)": 27.77,
"step": 5,
"token_acc": 0.8759903354497949,
"train_speed(iter/s)": 0.120195
},
{
"epoch": 0.06464646464646465,
"grad_norm": 1.0211882591247559,
"learning_rate": 9.988444507789584e-06,
"loss": 0.3569159030914307,
"memory(GiB)": 27.77,
"step": 10,
"token_acc": 0.8904844941361507,
"train_speed(iter/s)": 0.137392
},
{
"epoch": 0.09696969696969697,
"grad_norm": 1.0586270093917847,
"learning_rate": 9.97401266428502e-06,
"loss": 0.36738641262054444,
"memory(GiB)": 27.77,
"step": 15,
"token_acc": 0.8821489760952925,
"train_speed(iter/s)": 0.140251
},
{
"epoch": 0.1292929292929293,
"grad_norm": 1.1482552289962769,
"learning_rate": 9.953831442918418e-06,
"loss": 0.3260908842086792,
"memory(GiB)": 27.77,
"step": 20,
"token_acc": 0.8923615160349854,
"train_speed(iter/s)": 0.144117
},
{
"epoch": 0.1292929292929293,
"eval_loss": 0.3649641275405884,
"eval_runtime": 5.3926,
"eval_samples_per_second": 18.544,
"eval_steps_per_second": 4.636,
"eval_token_acc": 0.8829422873787651,
"step": 20
},
{
"epoch": 0.16161616161616163,
"grad_norm": 1.0274701118469238,
"learning_rate": 9.927924170825266e-06,
"loss": 0.32098817825317383,
"memory(GiB)": 27.77,
"step": 25,
"token_acc": 0.8865429663420047,
"train_speed(iter/s)": 0.132926
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.9113245010375977,
"learning_rate": 9.896320793787106e-06,
"loss": 0.35467684268951416,
"memory(GiB)": 27.77,
"step": 30,
"token_acc": 0.8852310260970564,
"train_speed(iter/s)": 0.137888
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.9023920893669128,
"learning_rate": 9.859057841617709e-06,
"loss": 0.3223384380340576,
"memory(GiB)": 27.77,
"step": 35,
"token_acc": 0.8949514563106796,
"train_speed(iter/s)": 0.140106
},
{
"epoch": 0.2585858585858586,
"grad_norm": 0.9127278923988342,
"learning_rate": 9.816178385938867e-06,
"loss": 0.3180943489074707,
"memory(GiB)": 27.77,
"step": 40,
"token_acc": 0.90066669149689,
"train_speed(iter/s)": 0.142488
},
{
"epoch": 0.2585858585858586,
"eval_loss": 0.3500390648841858,
"eval_runtime": 5.3821,
"eval_samples_per_second": 18.58,
"eval_steps_per_second": 4.645,
"eval_token_acc": 0.8875516148650812,
"step": 40
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.947066068649292,
"learning_rate": 9.767731990394638e-06,
"loss": 0.33401944637298586,
"memory(GiB)": 27.77,
"step": 45,
"token_acc": 0.8919813402256824,
"train_speed(iter/s)": 0.136919
},
{
"epoch": 0.32323232323232326,
"grad_norm": 2.0620720386505127,
"learning_rate": 9.71377465336155e-06,
"loss": 0.3351354837417603,
"memory(GiB)": 27.77,
"step": 50,
"token_acc": 0.8765755647073505,
"train_speed(iter/s)": 0.139127
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.863703191280365,
"learning_rate": 9.654368743221022e-06,
"loss": 0.3273132801055908,
"memory(GiB)": 27.77,
"step": 55,
"token_acc": 0.8912860949877706,
"train_speed(iter/s)": 0.140614
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.890646755695343,
"learning_rate": 9.589582926268798e-06,
"loss": 0.3155367374420166,
"memory(GiB)": 30.08,
"step": 60,
"token_acc": 0.9143575243480992,
"train_speed(iter/s)": 0.142452
},
{
"epoch": 0.3878787878787879,
"eval_loss": 0.3454614281654358,
"eval_runtime": 5.3711,
"eval_samples_per_second": 18.618,
"eval_steps_per_second": 4.655,
"eval_token_acc": 0.8889920297045549,
"step": 60
},
{
"epoch": 0.4202020202020202,
"grad_norm": 0.8996425271034241,
"learning_rate": 9.519492087344724e-06,
"loss": 0.2981250762939453,
"memory(GiB)": 30.08,
"step": 65,
"token_acc": 0.9051183738056113,
"train_speed(iter/s)": 0.137618
},
{
"epoch": 0.45252525252525255,
"grad_norm": 0.9426372647285461,
"learning_rate": 9.444177243274619e-06,
"loss": 0.3414067029953003,
"memory(GiB)": 30.08,
"step": 70,
"token_acc": 0.8901309721453606,
"train_speed(iter/s)": 0.139706
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.8184367418289185,
"learning_rate": 9.363725449224281e-06,
"loss": 0.32115802764892576,
"memory(GiB)": 30.08,
"step": 75,
"token_acc": 0.8985269424515341,
"train_speed(iter/s)": 0.14097
},
{
"epoch": 0.5171717171717172,
"grad_norm": 0.9321162104606628,
"learning_rate": 9.278229698073889e-06,
"loss": 0.31497313976287844,
"memory(GiB)": 30.08,
"step": 80,
"token_acc": 0.9010191988622896,
"train_speed(iter/s)": 0.141741
},
{
"epoch": 0.5171717171717172,
"eval_loss": 0.33993807435035706,
"eval_runtime": 5.3957,
"eval_samples_per_second": 18.533,
"eval_steps_per_second": 4.633,
"eval_token_acc": 0.8895361864216894,
"step": 80
},
{
"epoch": 0.5494949494949495,
"grad_norm": 0.8877705931663513,
"learning_rate": 9.187788812929074e-06,
"loss": 0.32355318069458006,
"memory(GiB)": 30.08,
"step": 85,
"token_acc": 0.8971333885666943,
"train_speed(iter/s)": 0.138684
},
{
"epoch": 0.5818181818181818,
"grad_norm": 1.019900918006897,
"learning_rate": 9.092507332892968e-06,
"loss": 0.33187189102172854,
"memory(GiB)": 30.08,
"step": 90,
"token_acc": 0.8973049754299754,
"train_speed(iter/s)": 0.140038
},
{
"epoch": 0.6141414141414141,
"grad_norm": 1.0134016275405884,
"learning_rate": 8.992495392231195e-06,
"loss": 0.3340008020401001,
"memory(GiB)": 30.08,
"step": 95,
"token_acc": 0.9059227157818707,
"train_speed(iter/s)": 0.141282
},
{
"epoch": 0.6464646464646465,
"grad_norm": 0.9215405583381653,
"learning_rate": 8.88786859306952e-06,
"loss": 0.306801438331604,
"memory(GiB)": 30.08,
"step": 100,
"token_acc": 0.8903882234088613,
"train_speed(iter/s)": 0.142097
},
{
"epoch": 0.6464646464646465,
"eval_loss": 0.3363126516342163,
"eval_runtime": 5.4132,
"eval_samples_per_second": 18.473,
"eval_steps_per_second": 4.618,
"eval_token_acc": 0.8896002048589994,
"step": 100
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.9498901963233948,
"learning_rate": 8.778747871771293e-06,
"loss": 0.31042842864990233,
"memory(GiB)": 30.08,
"step": 105,
"token_acc": 0.889660103071286,
"train_speed(iter/s)": 0.139949
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.8483244180679321,
"learning_rate": 8.665259359149132e-06,
"loss": 0.3191797733306885,
"memory(GiB)": 30.08,
"step": 110,
"token_acc": 0.909381808278867,
"train_speed(iter/s)": 0.140731
},
{
"epoch": 0.7434343434343434,
"grad_norm": 0.7640553116798401,
"learning_rate": 8.547534234672435e-06,
"loss": 0.2995746374130249,
"memory(GiB)": 30.08,
"step": 115,
"token_acc": 0.8994715117849015,
"train_speed(iter/s)": 0.141551
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.9591003656387329,
"learning_rate": 8.425708574839221e-06,
"loss": 0.32628965377807617,
"memory(GiB)": 30.08,
"step": 120,
"token_acc": 0.8891077731264877,
"train_speed(iter/s)": 0.142186
},
{
"epoch": 0.7757575757575758,
"eval_loss": 0.331260085105896,
"eval_runtime": 5.3939,
"eval_samples_per_second": 18.539,
"eval_steps_per_second": 4.635,
"eval_token_acc": 0.8917128132902276,
"step": 120
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.9353374242782593,
"learning_rate": 8.299923195887599e-06,
"loss": 0.3271709680557251,
"memory(GiB)": 30.08,
"step": 125,
"token_acc": 0.8903564002694234,
"train_speed(iter/s)": 0.140229
},
{
"epoch": 0.8404040404040404,
"grad_norm": 0.9060182571411133,
"learning_rate": 8.170323491028625e-06,
"loss": 0.3163918018341064,
"memory(GiB)": 30.08,
"step": 130,
"token_acc": 0.8912760416666666,
"train_speed(iter/s)": 0.140553
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.8269082903862,
"learning_rate": 8.03705926238874e-06,
"loss": 0.3141618251800537,
"memory(GiB)": 30.08,
"step": 135,
"token_acc": 0.8898337308583083,
"train_speed(iter/s)": 0.141211
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.8577111959457397,
"learning_rate": 7.900284547855992e-06,
"loss": 0.3134615898132324,
"memory(GiB)": 30.08,
"step": 140,
"token_acc": 0.9006930194742344,
"train_speed(iter/s)": 0.141613
},
{
"epoch": 0.9050505050505051,
"eval_loss": 0.3313320279121399,
"eval_runtime": 5.3884,
"eval_samples_per_second": 18.558,
"eval_steps_per_second": 4.64,
"eval_token_acc": 0.8926410806312218,
"step": 140
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.8400760293006897,
"learning_rate": 7.760157443030234e-06,
"loss": 0.2992702007293701,
"memory(GiB)": 30.08,
"step": 145,
"token_acc": 0.9021267154765301,
"train_speed(iter/s)": 0.139829
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.8579837679862976,
"learning_rate": 7.616839918483061e-06,
"loss": 0.32117404937744143,
"memory(GiB)": 30.08,
"step": 150,
"token_acc": 0.8848251385041551,
"train_speed(iter/s)": 0.140369
},
{
"epoch": 1.0,
"grad_norm": 0.8645791411399841,
"learning_rate": 7.470497632538743e-06,
"loss": 0.3043407440185547,
"memory(GiB)": 30.08,
"step": 155,
"token_acc": 0.903212915601023,
"train_speed(iter/s)": 0.141142
},
{
"epoch": 1.0323232323232323,
"grad_norm": 0.8486159443855286,
"learning_rate": 7.321299739792553e-06,
"loss": 0.2472972869873047,
"memory(GiB)": 30.08,
"step": 160,
"token_acc": 0.9189923065319052,
"train_speed(iter/s)": 0.14175
},
{
"epoch": 1.0323232323232323,
"eval_loss": 0.33611831068992615,
"eval_runtime": 5.3694,
"eval_samples_per_second": 18.624,
"eval_steps_per_second": 4.656,
"eval_token_acc": 0.8918088409461925,
"step": 160
},
{
"epoch": 1.0646464646464646,
"grad_norm": 0.8444439768791199,
"learning_rate": 7.169418695587791e-06,
"loss": 0.22124772071838378,
"memory(GiB)": 30.08,
"step": 165,
"token_acc": 0.92039636166496,
"train_speed(iter/s)": 0.14026
},
{
"epoch": 1.096969696969697,
"grad_norm": 0.8758794069290161,
"learning_rate": 7.015030056677559e-06,
"loss": 0.231048059463501,
"memory(GiB)": 30.08,
"step": 170,
"token_acc": 0.927355278093076,
"train_speed(iter/s)": 0.141102
},
{
"epoch": 1.1292929292929292,
"grad_norm": 0.9414038062095642,
"learning_rate": 6.858312278301638e-06,
"loss": 0.2431964874267578,
"memory(GiB)": 30.08,
"step": 175,
"token_acc": 0.914981199287552,
"train_speed(iter/s)": 0.141563
},
{
"epoch": 1.1616161616161615,
"grad_norm": 0.8615570664405823,
"learning_rate": 6.699446507913083e-06,
"loss": 0.22901198863983155,
"memory(GiB)": 30.08,
"step": 180,
"token_acc": 0.9300724249884048,
"train_speed(iter/s)": 0.141935
},
{
"epoch": 1.1616161616161615,
"eval_loss": 0.34168365597724915,
"eval_runtime": 5.3971,
"eval_samples_per_second": 18.528,
"eval_steps_per_second": 4.632,
"eval_token_acc": 0.8907205275119234,
"step": 180
},
{
"epoch": 1.1939393939393939,
"grad_norm": 0.8244655132293701,
"learning_rate": 6.53861637579291e-06,
"loss": 0.2308629035949707,
"memory(GiB)": 30.08,
"step": 185,
"token_acc": 0.9109640722038423,
"train_speed(iter/s)": 0.140603
},
{
"epoch": 1.2262626262626264,
"grad_norm": 0.9139054417610168,
"learning_rate": 6.376007782794926e-06,
"loss": 0.2585730791091919,
"memory(GiB)": 30.08,
"step": 190,
"token_acc": 0.9028991841491841,
"train_speed(iter/s)": 0.141319
},
{
"epoch": 1.2585858585858585,
"grad_norm": 0.7501769065856934,
"learning_rate": 6.211808685466063e-06,
"loss": 0.2274195671081543,
"memory(GiB)": 30.08,
"step": 195,
"token_acc": 0.9299425265767627,
"train_speed(iter/s)": 0.142053
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.8027601838111877,
"learning_rate": 6.046208878790543e-06,
"loss": 0.2291938304901123,
"memory(GiB)": 30.08,
"step": 200,
"token_acc": 0.9253513490971267,
"train_speed(iter/s)": 0.142337
},
{
"epoch": 1.290909090909091,
"eval_loss": 0.34100720286369324,
"eval_runtime": 5.3678,
"eval_samples_per_second": 18.63,
"eval_steps_per_second": 4.657,
"eval_token_acc": 0.8911046381357831,
"step": 200
},
{
"epoch": 1.3232323232323233,
"grad_norm": 0.8584316372871399,
"learning_rate": 5.879399776809047e-06,
"loss": 0.21250443458557128,
"memory(GiB)": 30.08,
"step": 205,
"token_acc": 0.9207754541291406,
"train_speed(iter/s)": 0.141123
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.8164386749267578,
"learning_rate": 5.711574191366427e-06,
"loss": 0.23753111362457274,
"memory(GiB)": 30.08,
"step": 210,
"token_acc": 0.9153866525423728,
"train_speed(iter/s)": 0.141492
},
{
"epoch": 1.387878787878788,
"grad_norm": 0.8197464346885681,
"learning_rate": 5.542926109243727e-06,
"loss": 0.2262495279312134,
"memory(GiB)": 30.08,
"step": 215,
"token_acc": 0.9298754093424173,
"train_speed(iter/s)": 0.141871
},
{
"epoch": 1.4202020202020202,
"grad_norm": 0.8861284255981445,
"learning_rate": 5.373650467932122e-06,
"loss": 0.23235108852386474,
"memory(GiB)": 30.08,
"step": 220,
"token_acc": 0.916040434865535,
"train_speed(iter/s)": 0.142257
},
{
"epoch": 1.4202020202020202,
"eval_loss": 0.34229058027267456,
"eval_runtime": 5.3952,
"eval_samples_per_second": 18.535,
"eval_steps_per_second": 4.634,
"eval_token_acc": 0.8919368778208124,
"step": 220
},
{
"epoch": 1.4525252525252526,
"grad_norm": 0.8957362771034241,
"learning_rate": 5.2039429303079294e-06,
"loss": 0.2363661289215088,
"memory(GiB)": 30.08,
"step": 225,
"token_acc": 0.9184887277670782,
"train_speed(iter/s)": 0.141447
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.8692964911460876,
"learning_rate": 5.033999658469174e-06,
"loss": 0.22849671840667723,
"memory(GiB)": 30.08,
"step": 230,
"token_acc": 0.9200207931085698,
"train_speed(iter/s)": 0.141745
},
{
"epoch": 1.5171717171717172,
"grad_norm": 0.8732675909996033,
"learning_rate": 4.864017086995112e-06,
"loss": 0.22888550758361817,
"memory(GiB)": 30.08,
"step": 235,
"token_acc": 0.9242218099360956,
"train_speed(iter/s)": 0.142114
},
{
"epoch": 1.5494949494949495,
"grad_norm": 0.8548147082328796,
"learning_rate": 4.694191695890788e-06,
"loss": 0.24225883483886718,
"memory(GiB)": 30.08,
"step": 240,
"token_acc": 0.9354398726983405,
"train_speed(iter/s)": 0.142537
},
{
"epoch": 1.5494949494949495,
"eval_loss": 0.3384065330028534,
"eval_runtime": 5.3801,
"eval_samples_per_second": 18.587,
"eval_steps_per_second": 4.647,
"eval_token_acc": 0.8922569700073621,
"step": 240
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.8377289772033691,
"learning_rate": 4.524719783479088e-06,
"loss": 0.20645420551300048,
"memory(GiB)": 30.08,
"step": 245,
"token_acc": 0.9174505252870755,
"train_speed(iter/s)": 0.14148
},
{
"epoch": 1.614141414141414,
"grad_norm": 0.779121994972229,
"learning_rate": 4.355797239502807e-06,
"loss": 0.2250507354736328,
"memory(GiB)": 30.08,
"step": 250,
"token_acc": 0.9264376661536309,
"train_speed(iter/s)": 0.141711
},
{
"epoch": 1.6464646464646466,
"grad_norm": 0.8410191535949707,
"learning_rate": 4.187619318698971e-06,
"loss": 0.2303227186203003,
"memory(GiB)": 30.08,
"step": 255,
"token_acc": 0.9288690903865497,
"train_speed(iter/s)": 0.142159
},
{
"epoch": 1.6787878787878787,
"grad_norm": 0.8751044273376465,
"learning_rate": 4.020380415107167e-06,
"loss": 0.24010176658630372,
"memory(GiB)": 30.08,
"step": 260,
"token_acc": 0.9276785345930113,
"train_speed(iter/s)": 0.142379
},
{
"epoch": 1.6787878787878787,
"eval_loss": 0.33762192726135254,
"eval_runtime": 5.3885,
"eval_samples_per_second": 18.558,
"eval_steps_per_second": 4.639,
"eval_token_acc": 0.8926090714125668,
"step": 260
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.7685917019844055,
"learning_rate": 3.854273837372724e-06,
"loss": 0.25291612148284914,
"memory(GiB)": 30.08,
"step": 265,
"token_acc": 0.9064826915478832,
"train_speed(iter/s)": 0.141534
},
{
"epoch": 1.7434343434343433,
"grad_norm": 0.8771150708198547,
"learning_rate": 3.689491585304491e-06,
"loss": 0.23574538230895997,
"memory(GiB)": 30.08,
"step": 270,
"token_acc": 0.9073804876022761,
"train_speed(iter/s)": 0.141801
},
{
"epoch": 1.7757575757575759,
"grad_norm": 0.8586969375610352,
"learning_rate": 3.526224127945479e-06,
"loss": 0.24325270652770997,
"memory(GiB)": 30.08,
"step": 275,
"token_acc": 0.9250533832744076,
"train_speed(iter/s)": 0.142307
},
{
"epoch": 1.808080808080808,
"grad_norm": 0.8120052814483643,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.2067141056060791,
"memory(GiB)": 30.08,
"step": 280,
"token_acc": 0.9247845178077736,
"train_speed(iter/s)": 0.142568
},
{
"epoch": 1.808080808080808,
"eval_loss": 0.3372032642364502,
"eval_runtime": 5.3852,
"eval_samples_per_second": 18.569,
"eval_steps_per_second": 4.642,
"eval_token_acc": 0.8925130437566019,
"step": 280
},
{
"epoch": 1.8404040404040405,
"grad_norm": 0.7563571929931641,
"learning_rate": 3.204986500762006e-06,
"loss": 0.22141036987304688,
"memory(GiB)": 30.08,
"step": 285,
"token_acc": 0.9158564914393874,
"train_speed(iter/s)": 0.141564
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.840555727481842,
"learning_rate": 3.0473876441260786e-06,
"loss": 0.22226524353027344,
"memory(GiB)": 30.08,
"step": 290,
"token_acc": 0.9322453534191164,
"train_speed(iter/s)": 0.14182
},
{
"epoch": 1.905050505050505,
"grad_norm": 0.8599358797073364,
"learning_rate": 2.8920457793817507e-06,
"loss": 0.22878422737121581,
"memory(GiB)": 30.08,
"step": 295,
"token_acc": 0.9275855327468231,
"train_speed(iter/s)": 0.142089
},
{
"epoch": 1.9373737373737374,
"grad_norm": 0.9196203947067261,
"learning_rate": 2.7391404635865725e-06,
"loss": 0.23831405639648437,
"memory(GiB)": 30.08,
"step": 300,
"token_acc": 0.9162388743213797,
"train_speed(iter/s)": 0.142402
},
{
"epoch": 1.9373737373737374,
"eval_loss": 0.33528250455856323,
"eval_runtime": 5.3837,
"eval_samples_per_second": 18.575,
"eval_steps_per_second": 4.644,
"eval_token_acc": 0.8929931820364265,
"step": 300
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.757847785949707,
"learning_rate": 2.5888484374320033e-06,
"loss": 0.2106797695159912,
"memory(GiB)": 30.08,
"step": 305,
"token_acc": 0.9235401079083078,
"train_speed(iter/s)": 0.141615
},
{
"epoch": 2.0,
"grad_norm": 0.967450737953186,
"learning_rate": 2.4413434209518137e-06,
"loss": 0.21637356281280518,
"memory(GiB)": 30.08,
"step": 310,
"token_acc": 0.9329970868298622,
"train_speed(iter/s)": 0.141903
},
{
"epoch": 2.0323232323232325,
"grad_norm": 0.7503668665885925,
"learning_rate": 2.296795912722014e-06,
"loss": 0.16243449449539185,
"memory(GiB)": 30.08,
"step": 315,
"token_acc": 0.9508763656370353,
"train_speed(iter/s)": 0.141997
},
{
"epoch": 2.0646464646464646,
"grad_norm": 0.8131990432739258,
"learning_rate": 2.1553729927843894e-06,
"loss": 0.17449368238449098,
"memory(GiB)": 30.08,
"step": 320,
"token_acc": 0.9495816440955749,
"train_speed(iter/s)": 0.142202
},
{
"epoch": 2.0646464646464646,
"eval_loss": 0.3504800796508789,
"eval_runtime": 5.4562,
"eval_samples_per_second": 18.328,
"eval_steps_per_second": 4.582,
"eval_token_acc": 0.8927691175058416,
"step": 320
},
{
"epoch": 2.096969696969697,
"grad_norm": 0.8142232894897461,
"learning_rate": 2.017238129521506e-06,
"loss": 0.16946163177490234,
"memory(GiB)": 30.08,
"step": 325,
"token_acc": 0.9349265764468759,
"train_speed(iter/s)": 0.141472
},
{
"epoch": 2.1292929292929292,
"grad_norm": 0.8298311829566956,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.1755598545074463,
"memory(GiB)": 30.08,
"step": 330,
"token_acc": 0.9531752999707346,
"train_speed(iter/s)": 0.141802
},
{
"epoch": 2.1616161616161618,
"grad_norm": 0.7940059304237366,
"learning_rate": 1.7514672589449378e-06,
"loss": 0.1952407479286194,
"memory(GiB)": 30.08,
"step": 335,
"token_acc": 0.9343925770825635,
"train_speed(iter/s)": 0.142047
},
{
"epoch": 2.193939393939394,
"grad_norm": 0.7858513593673706,
"learning_rate": 1.6241384517255854e-06,
"loss": 0.16918621063232422,
"memory(GiB)": 30.08,
"step": 340,
"token_acc": 0.9412830735773831,
"train_speed(iter/s)": 0.142253
},
{
"epoch": 2.193939393939394,
"eval_loss": 0.3576539158821106,
"eval_runtime": 5.351,
"eval_samples_per_second": 18.688,
"eval_steps_per_second": 4.672,
"eval_token_acc": 0.8910406196984731,
"step": 340
},
{
"epoch": 2.2262626262626264,
"grad_norm": 0.7290251851081848,
"learning_rate": 1.500711746282192e-06,
"loss": 0.1872728943824768,
"memory(GiB)": 30.08,
"step": 345,
"token_acc": 0.9292867611138251,
"train_speed(iter/s)": 0.141644
},
{
"epoch": 2.2585858585858585,
"grad_norm": 0.7997108101844788,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.16540231704711914,
"memory(GiB)": 30.08,
"step": 350,
"token_acc": 0.9447741310403294,
"train_speed(iter/s)": 0.141801
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.7840582728385925,
"learning_rate": 1.2661306328825818e-06,
"loss": 0.17242782115936278,
"memory(GiB)": 30.08,
"step": 355,
"token_acc": 0.9399465492847037,
"train_speed(iter/s)": 0.142023
},
{
"epoch": 2.323232323232323,
"grad_norm": 0.7512005567550659,
"learning_rate": 1.1552473733031893e-06,
"loss": 0.1620992064476013,
"memory(GiB)": 30.08,
"step": 360,
"token_acc": 0.9435792877983619,
"train_speed(iter/s)": 0.142359
},
{
"epoch": 2.323232323232323,
"eval_loss": 0.36009594798088074,
"eval_runtime": 5.3832,
"eval_samples_per_second": 18.576,
"eval_steps_per_second": 4.644,
"eval_token_acc": 0.8920008962581223,
"step": 360
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.7732217311859131,
"learning_rate": 1.0488081988375493e-06,
"loss": 0.16843740940093993,
"memory(GiB)": 30.08,
"step": 365,
"token_acc": 0.9334714548802947,
"train_speed(iter/s)": 0.141824
},
{
"epoch": 2.3878787878787877,
"grad_norm": 0.7981094121932983,
"learning_rate": 9.469361407432431e-07,
"loss": 0.1794123411178589,
"memory(GiB)": 30.08,
"step": 370,
"token_acc": 0.9482818106541541,
"train_speed(iter/s)": 0.142017
},
{
"epoch": 2.4202020202020202,
"grad_norm": 0.7665418982505798,
"learning_rate": 8.497489512245971e-07,
"loss": 0.1843852996826172,
"memory(GiB)": 30.08,
"step": 375,
"token_acc": 0.9573796089286348,
"train_speed(iter/s)": 0.142235
},
{
"epoch": 2.4525252525252528,
"grad_norm": 0.9355995655059814,
"learning_rate": 7.573589673248833e-07,
"loss": 0.17202303409576417,
"memory(GiB)": 30.08,
"step": 380,
"token_acc": 0.9362966839881864,
"train_speed(iter/s)": 0.142515
},
{
"epoch": 2.4525252525252528,
"eval_loss": 0.36116844415664673,
"eval_runtime": 5.3783,
"eval_samples_per_second": 18.593,
"eval_steps_per_second": 4.648,
"eval_token_acc": 0.8914567395409878,
"step": 380
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.7312317490577698,
"learning_rate": 6.698729810778065e-07,
"loss": 0.17411458492279053,
"memory(GiB)": 30.08,
"step": 385,
"token_acc": 0.9338178444410082,
"train_speed(iter/s)": 0.141893
},
{
"epoch": 2.517171717171717,
"grad_norm": 0.7563744187355042,
"learning_rate": 5.873921160683943e-07,
"loss": 0.1915157437324524,
"memory(GiB)": 30.08,
"step": 390,
"token_acc": 0.9295483460559797,
"train_speed(iter/s)": 0.142171
},
{
"epoch": 2.5494949494949495,
"grad_norm": 0.7823712229728699,
"learning_rate": 5.100117105459279e-07,
"loss": 0.15321061611175538,
"memory(GiB)": 30.08,
"step": 395,
"token_acc": 0.9472502392696753,
"train_speed(iter/s)": 0.142376
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.6383055448532104,
"learning_rate": 4.3782120722406565e-07,
"loss": 0.16857578754425048,
"memory(GiB)": 30.08,
"step": 400,
"token_acc": 0.9525385172164202,
"train_speed(iter/s)": 0.142658
},
{
"epoch": 2.581818181818182,
"eval_loss": 0.36100760102272034,
"eval_runtime": 5.3454,
"eval_samples_per_second": 18.708,
"eval_steps_per_second": 4.677,
"eval_token_acc": 0.891264684229058,
"step": 400
},
{
"epoch": 2.614141414141414,
"grad_norm": 0.8574426174163818,
"learning_rate": 3.709040498955102e-07,
"loss": 0.18224529027938843,
"memory(GiB)": 30.08,
"step": 405,
"token_acc": 0.9255623050402233,
"train_speed(iter/s)": 0.142038
},
{
"epoch": 2.6464646464646466,
"grad_norm": 0.9696727395057678,
"learning_rate": 3.0933758698072023e-07,
"loss": 0.18939828872680664,
"memory(GiB)": 30.08,
"step": 410,
"token_acc": 0.9416907375312922,
"train_speed(iter/s)": 0.142235
},
{
"epoch": 2.6787878787878787,
"grad_norm": 0.7818398475646973,
"learning_rate": 2.531929821221768e-07,
"loss": 0.19069280624389648,
"memory(GiB)": 30.08,
"step": 415,
"token_acc": 0.9258034817542685,
"train_speed(iter/s)": 0.142409
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.8981226086616516,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.17302310466766357,
"memory(GiB)": 30.08,
"step": 420,
"token_acc": 0.950883135736753,
"train_speed(iter/s)": 0.142691
},
{
"epoch": 2.7111111111111112,
"eval_loss": 0.3606036305427551,
"eval_runtime": 5.3784,
"eval_samples_per_second": 18.593,
"eval_steps_per_second": 4.648,
"eval_token_acc": 0.8917128132902276,
"step": 420
},
{
"epoch": 2.7434343434343433,
"grad_norm": 0.754592776298523,
"learning_rate": 1.5742259095662126e-07,
"loss": 0.16562799215316773,
"memory(GiB)": 30.08,
"step": 425,
"token_acc": 0.9294097342078012,
"train_speed(iter/s)": 0.14213
},
{
"epoch": 2.775757575757576,
"grad_norm": 0.811010479927063,
"learning_rate": 1.1790750403941231e-07,
"loss": 0.17516304254531861,
"memory(GiB)": 30.08,
"step": 430,
"token_acc": 0.953036002149382,
"train_speed(iter/s)": 0.142302
},
{
"epoch": 2.808080808080808,
"grad_norm": 0.8035722374916077,
"learning_rate": 8.403554600248498e-08,
"loss": 0.16143158674240113,
"memory(GiB)": 30.08,
"step": 435,
"token_acc": 0.9470889436753271,
"train_speed(iter/s)": 0.142493
},
{
"epoch": 2.8404040404040405,
"grad_norm": 0.7885386347770691,
"learning_rate": 5.584586887435739e-08,
"loss": 0.16893348693847657,
"memory(GiB)": 30.08,
"step": 440,
"token_acc": 0.946100607044813,
"train_speed(iter/s)": 0.142701
},
{
"epoch": 2.8404040404040405,
"eval_loss": 0.36063292622566223,
"eval_runtime": 5.3864,
"eval_samples_per_second": 18.565,
"eval_steps_per_second": 4.641,
"eval_token_acc": 0.8917448225088825,
"step": 440
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.8363362550735474,
"learning_rate": 3.337105663029361e-08,
"loss": 0.166959547996521,
"memory(GiB)": 30.08,
"step": 445,
"token_acc": 0.9339094103124109,
"train_speed(iter/s)": 0.142143
},
{
"epoch": 2.905050505050505,
"grad_norm": 0.817148745059967,
"learning_rate": 1.6637087529033925e-08,
"loss": 0.16920559406280516,
"memory(GiB)": 30.08,
"step": 450,
"token_acc": 0.9441476444876153,
"train_speed(iter/s)": 0.142396
},
{
"epoch": 2.937373737373737,
"grad_norm": 0.7608515620231628,
"learning_rate": 5.6633040849601865e-09,
"loss": 0.16781603097915648,
"memory(GiB)": 30.08,
"step": 455,
"token_acc": 0.9337727971874313,
"train_speed(iter/s)": 0.142524
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.8431264162063599,
"learning_rate": 4.623907104084335e-10,
"loss": 0.2008026123046875,
"memory(GiB)": 30.08,
"step": 460,
"token_acc": 0.9438367531683766,
"train_speed(iter/s)": 0.142801
},
{
"epoch": 2.9696969696969697,
"eval_loss": 0.36077243089675903,
"eval_runtime": 5.3777,
"eval_samples_per_second": 18.595,
"eval_steps_per_second": 4.649,
"eval_token_acc": 0.8913927211036778,
"step": 460
},
{
"epoch": 2.9826262626262627,
"eval_loss": 0.3604045808315277,
"eval_runtime": 5.3956,
"eval_samples_per_second": 18.534,
"eval_steps_per_second": 4.633,
"eval_token_acc": 0.8918728593835025,
"step": 462
}
],
"logging_steps": 5,
"max_steps": 462,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.754364855085957e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}