Model: waltonfuture/qwen2.5vl-3b-sampled_10000_mixed-reflection-cot-32b Source: Original Platform
1181 lines
33 KiB
JSON
1181 lines
33 KiB
JSON
{
|
|
"best_global_step": 300,
|
|
"best_metric": 0.22214438,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v34-20250511-160020/checkpoint-300",
|
|
"epoch": 2.9826262626262627,
|
|
"eval_steps": 20,
|
|
"global_step": 462,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006464646464646465,
|
|
"grad_norm": 2.782066822052002,
|
|
"learning_rate": 9.999884400986087e-06,
|
|
"loss": 0.4351097345352173,
|
|
"memory(GiB)": 29.0,
|
|
"step": 1,
|
|
"token_acc": 0.8841401366477686,
|
|
"train_speed(iter/s)": 0.064982
|
|
},
|
|
{
|
|
"epoch": 0.03232323232323232,
|
|
"grad_norm": 1.5506770610809326,
|
|
"learning_rate": 9.997110291906109e-06,
|
|
"loss": 0.3617793917655945,
|
|
"memory(GiB)": 29.0,
|
|
"step": 5,
|
|
"token_acc": 0.884986440225848,
|
|
"train_speed(iter/s)": 0.119813
|
|
},
|
|
{
|
|
"epoch": 0.06464646464646465,
|
|
"grad_norm": 1.051324486732483,
|
|
"learning_rate": 9.988444507789584e-06,
|
|
"loss": 0.2978231906890869,
|
|
"memory(GiB)": 29.01,
|
|
"step": 10,
|
|
"token_acc": 0.9110293908036147,
|
|
"train_speed(iter/s)": 0.136021
|
|
},
|
|
{
|
|
"epoch": 0.09696969696969697,
|
|
"grad_norm": 1.0532618761062622,
|
|
"learning_rate": 9.97401266428502e-06,
|
|
"loss": 0.2773271083831787,
|
|
"memory(GiB)": 29.01,
|
|
"step": 15,
|
|
"token_acc": 0.90916535639413,
|
|
"train_speed(iter/s)": 0.140683
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"grad_norm": 1.028316617012024,
|
|
"learning_rate": 9.953831442918418e-06,
|
|
"loss": 0.26010329723358155,
|
|
"memory(GiB)": 29.01,
|
|
"step": 20,
|
|
"token_acc": 0.9205904810384322,
|
|
"train_speed(iter/s)": 0.14413
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"eval_loss": 0.2733669579029083,
|
|
"eval_runtime": 4.7919,
|
|
"eval_samples_per_second": 20.868,
|
|
"eval_steps_per_second": 5.217,
|
|
"eval_token_acc": 0.9177626754021415,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 0.9095243811607361,
|
|
"learning_rate": 9.927924170825266e-06,
|
|
"loss": 0.255949592590332,
|
|
"memory(GiB)": 29.01,
|
|
"step": 25,
|
|
"token_acc": 0.9019129441690243,
|
|
"train_speed(iter/s)": 0.132575
|
|
},
|
|
{
|
|
"epoch": 0.19393939393939394,
|
|
"grad_norm": 0.8502461314201355,
|
|
"learning_rate": 9.896320793787106e-06,
|
|
"loss": 0.254239821434021,
|
|
"memory(GiB)": 29.01,
|
|
"step": 30,
|
|
"token_acc": 0.9101830846407004,
|
|
"train_speed(iter/s)": 0.137917
|
|
},
|
|
{
|
|
"epoch": 0.22626262626262628,
|
|
"grad_norm": 0.952416181564331,
|
|
"learning_rate": 9.859057841617709e-06,
|
|
"loss": 0.25095329284667967,
|
|
"memory(GiB)": 29.01,
|
|
"step": 35,
|
|
"token_acc": 0.9192982456140351,
|
|
"train_speed(iter/s)": 0.139932
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"grad_norm": 0.9308900237083435,
|
|
"learning_rate": 9.816178385938867e-06,
|
|
"loss": 0.2500969886779785,
|
|
"memory(GiB)": 29.01,
|
|
"step": 40,
|
|
"token_acc": 0.9180038460325967,
|
|
"train_speed(iter/s)": 0.141458
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"eval_loss": 0.2549287676811218,
|
|
"eval_runtime": 4.7618,
|
|
"eval_samples_per_second": 21.0,
|
|
"eval_steps_per_second": 5.25,
|
|
"eval_token_acc": 0.9216007431672615,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 0.8023675084114075,
|
|
"learning_rate": 9.767731990394638e-06,
|
|
"loss": 0.24509100914001464,
|
|
"memory(GiB)": 29.01,
|
|
"step": 45,
|
|
"token_acc": 0.9136364761503998,
|
|
"train_speed(iter/s)": 0.136162
|
|
},
|
|
{
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 0.8799707293510437,
|
|
"learning_rate": 9.71377465336155e-06,
|
|
"loss": 0.2456353187561035,
|
|
"memory(GiB)": 29.01,
|
|
"step": 50,
|
|
"token_acc": 0.9201317882299478,
|
|
"train_speed(iter/s)": 0.138458
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.8880809545516968,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.22914605140686034,
|
|
"memory(GiB)": 29.01,
|
|
"step": 55,
|
|
"token_acc": 0.9370015671251959,
|
|
"train_speed(iter/s)": 0.139699
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"grad_norm": 0.9086585640907288,
|
|
"learning_rate": 9.589582926268798e-06,
|
|
"loss": 0.25642530918121337,
|
|
"memory(GiB)": 29.01,
|
|
"step": 60,
|
|
"token_acc": 0.9190494127315687,
|
|
"train_speed(iter/s)": 0.141257
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"eval_loss": 0.24375928938388824,
|
|
"eval_runtime": 4.8065,
|
|
"eval_samples_per_second": 20.805,
|
|
"eval_steps_per_second": 5.201,
|
|
"eval_token_acc": 0.9235564464870679,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4202020202020202,
|
|
"grad_norm": 0.6710211634635925,
|
|
"learning_rate": 9.519492087344724e-06,
|
|
"loss": 0.2273104190826416,
|
|
"memory(GiB)": 29.01,
|
|
"step": 65,
|
|
"token_acc": 0.9177157538245941,
|
|
"train_speed(iter/s)": 0.136434
|
|
},
|
|
{
|
|
"epoch": 0.45252525252525255,
|
|
"grad_norm": 0.8824236392974854,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.24116811752319336,
|
|
"memory(GiB)": 29.01,
|
|
"step": 70,
|
|
"token_acc": 0.9235306363194782,
|
|
"train_speed(iter/s)": 0.138651
|
|
},
|
|
{
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 0.7853980660438538,
|
|
"learning_rate": 9.363725449224281e-06,
|
|
"loss": 0.2349745512008667,
|
|
"memory(GiB)": 29.01,
|
|
"step": 75,
|
|
"token_acc": 0.9231191335740072,
|
|
"train_speed(iter/s)": 0.140002
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"grad_norm": 0.7967098951339722,
|
|
"learning_rate": 9.278229698073889e-06,
|
|
"loss": 0.22397637367248535,
|
|
"memory(GiB)": 29.01,
|
|
"step": 80,
|
|
"token_acc": 0.9202410885963614,
|
|
"train_speed(iter/s)": 0.140609
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"eval_loss": 0.235035702586174,
|
|
"eval_runtime": 4.8643,
|
|
"eval_samples_per_second": 20.558,
|
|
"eval_steps_per_second": 5.139,
|
|
"eval_token_acc": 0.9263922163007872,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5494949494949495,
|
|
"grad_norm": 0.867683470249176,
|
|
"learning_rate": 9.187788812929074e-06,
|
|
"loss": 0.22470014095306395,
|
|
"memory(GiB)": 29.01,
|
|
"step": 85,
|
|
"token_acc": 0.9176256656708663,
|
|
"train_speed(iter/s)": 0.137464
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 0.9094910621643066,
|
|
"learning_rate": 9.092507332892968e-06,
|
|
"loss": 0.2336829423904419,
|
|
"memory(GiB)": 29.01,
|
|
"step": 90,
|
|
"token_acc": 0.9196234077902898,
|
|
"train_speed(iter/s)": 0.13842
|
|
},
|
|
{
|
|
"epoch": 0.6141414141414141,
|
|
"grad_norm": 0.8729245662689209,
|
|
"learning_rate": 8.992495392231195e-06,
|
|
"loss": 0.22902388572692872,
|
|
"memory(GiB)": 29.01,
|
|
"step": 95,
|
|
"token_acc": 0.9179311961946873,
|
|
"train_speed(iter/s)": 0.139322
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 0.7870821356773376,
|
|
"learning_rate": 8.88786859306952e-06,
|
|
"loss": 0.22554943561553956,
|
|
"memory(GiB)": 29.01,
|
|
"step": 100,
|
|
"token_acc": 0.9258398976637716,
|
|
"train_speed(iter/s)": 0.140224
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"eval_loss": 0.23361265659332275,
|
|
"eval_runtime": 4.7619,
|
|
"eval_samples_per_second": 21.0,
|
|
"eval_steps_per_second": 5.25,
|
|
"eval_token_acc": 0.9272967290861976,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6787878787878788,
|
|
"grad_norm": 0.8173748850822449,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 0.2284949541091919,
|
|
"memory(GiB)": 29.01,
|
|
"step": 105,
|
|
"token_acc": 0.9134603158456883,
|
|
"train_speed(iter/s)": 0.138167
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.8524425625801086,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.220612096786499,
|
|
"memory(GiB)": 29.01,
|
|
"step": 110,
|
|
"token_acc": 0.9307908237343916,
|
|
"train_speed(iter/s)": 0.138956
|
|
},
|
|
{
|
|
"epoch": 0.7434343434343434,
|
|
"grad_norm": 0.7814698219299316,
|
|
"learning_rate": 8.547534234672435e-06,
|
|
"loss": 0.21316018104553222,
|
|
"memory(GiB)": 29.01,
|
|
"step": 115,
|
|
"token_acc": 0.9195374535793002,
|
|
"train_speed(iter/s)": 0.139738
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"grad_norm": 0.8820337057113647,
|
|
"learning_rate": 8.425708574839221e-06,
|
|
"loss": 0.2112647533416748,
|
|
"memory(GiB)": 29.01,
|
|
"step": 120,
|
|
"token_acc": 0.9269834018577737,
|
|
"train_speed(iter/s)": 0.140382
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"eval_loss": 0.2290322184562683,
|
|
"eval_runtime": 4.7553,
|
|
"eval_samples_per_second": 21.029,
|
|
"eval_steps_per_second": 5.257,
|
|
"eval_token_acc": 0.9274189605436856,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 0.897212564945221,
|
|
"learning_rate": 8.299923195887599e-06,
|
|
"loss": 0.21470861434936522,
|
|
"memory(GiB)": 29.01,
|
|
"step": 125,
|
|
"token_acc": 0.9227193492155723,
|
|
"train_speed(iter/s)": 0.138516
|
|
},
|
|
{
|
|
"epoch": 0.8404040404040404,
|
|
"grad_norm": 0.8352209329605103,
|
|
"learning_rate": 8.170323491028625e-06,
|
|
"loss": 0.2259157657623291,
|
|
"memory(GiB)": 29.01,
|
|
"step": 130,
|
|
"token_acc": 0.9266622410118445,
|
|
"train_speed(iter/s)": 0.138949
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 0.8725154399871826,
|
|
"learning_rate": 8.03705926238874e-06,
|
|
"loss": 0.22986299991607667,
|
|
"memory(GiB)": 29.01,
|
|
"step": 135,
|
|
"token_acc": 0.9203431372549019,
|
|
"train_speed(iter/s)": 0.139597
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"grad_norm": 0.7126485109329224,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.21029348373413087,
|
|
"memory(GiB)": 29.01,
|
|
"step": 140,
|
|
"token_acc": 0.9250847116449156,
|
|
"train_speed(iter/s)": 0.140087
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"eval_loss": 0.22619588673114777,
|
|
"eval_runtime": 4.7708,
|
|
"eval_samples_per_second": 20.961,
|
|
"eval_steps_per_second": 5.24,
|
|
"eval_token_acc": 0.9281034567056178,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9373737373737374,
|
|
"grad_norm": 0.855863094329834,
|
|
"learning_rate": 7.760157443030234e-06,
|
|
"loss": 0.22026867866516114,
|
|
"memory(GiB)": 29.01,
|
|
"step": 145,
|
|
"token_acc": 0.9182781919850885,
|
|
"train_speed(iter/s)": 0.1384
|
|
},
|
|
{
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 0.757979154586792,
|
|
"learning_rate": 7.616839918483061e-06,
|
|
"loss": 0.21316237449645997,
|
|
"memory(GiB)": 29.01,
|
|
"step": 150,
|
|
"token_acc": 0.9336977031687163,
|
|
"train_speed(iter/s)": 0.13894
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.7608435750007629,
|
|
"learning_rate": 7.470497632538743e-06,
|
|
"loss": 0.2081432580947876,
|
|
"memory(GiB)": 29.01,
|
|
"step": 155,
|
|
"token_acc": 0.9317403683281076,
|
|
"train_speed(iter/s)": 0.139608
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"grad_norm": 0.6974760293960571,
|
|
"learning_rate": 7.321299739792553e-06,
|
|
"loss": 0.15149842500686644,
|
|
"memory(GiB)": 29.01,
|
|
"step": 160,
|
|
"token_acc": 0.9457131222002945,
|
|
"train_speed(iter/s)": 0.140097
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"eval_loss": 0.22719089686870575,
|
|
"eval_runtime": 4.7699,
|
|
"eval_samples_per_second": 20.965,
|
|
"eval_steps_per_second": 5.241,
|
|
"eval_token_acc": 0.9280056715396274,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0646464646464646,
|
|
"grad_norm": 0.872110903263092,
|
|
"learning_rate": 7.169418695587791e-06,
|
|
"loss": 0.14826536178588867,
|
|
"memory(GiB)": 29.01,
|
|
"step": 165,
|
|
"token_acc": 0.9360687748839804,
|
|
"train_speed(iter/s)": 0.138838
|
|
},
|
|
{
|
|
"epoch": 1.096969696969697,
|
|
"grad_norm": 1.0611625909805298,
|
|
"learning_rate": 7.015030056677559e-06,
|
|
"loss": 0.15616699457168579,
|
|
"memory(GiB)": 29.01,
|
|
"step": 170,
|
|
"token_acc": 0.9440032655484637,
|
|
"train_speed(iter/s)": 0.13965
|
|
},
|
|
{
|
|
"epoch": 1.1292929292929292,
|
|
"grad_norm": 0.7496768236160278,
|
|
"learning_rate": 6.858312278301638e-06,
|
|
"loss": 0.1349432110786438,
|
|
"memory(GiB)": 29.01,
|
|
"step": 175,
|
|
"token_acc": 0.9524397395142126,
|
|
"train_speed(iter/s)": 0.140043
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 0.8625680208206177,
|
|
"learning_rate": 6.699446507913083e-06,
|
|
"loss": 0.13590528964996337,
|
|
"memory(GiB)": 29.01,
|
|
"step": 180,
|
|
"token_acc": 0.9526945902370416,
|
|
"train_speed(iter/s)": 0.140334
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"eval_loss": 0.22827181220054626,
|
|
"eval_runtime": 4.7724,
|
|
"eval_samples_per_second": 20.954,
|
|
"eval_steps_per_second": 5.238,
|
|
"eval_token_acc": 0.9284212584950863,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.1939393939393939,
|
|
"grad_norm": 0.7404259443283081,
|
|
"learning_rate": 6.53861637579291e-06,
|
|
"loss": 0.13479866981506347,
|
|
"memory(GiB)": 29.01,
|
|
"step": 185,
|
|
"token_acc": 0.9389863201441587,
|
|
"train_speed(iter/s)": 0.139057
|
|
},
|
|
{
|
|
"epoch": 1.2262626262626264,
|
|
"grad_norm": 0.7891139388084412,
|
|
"learning_rate": 6.376007782794926e-06,
|
|
"loss": 0.14628617763519286,
|
|
"memory(GiB)": 29.01,
|
|
"step": 190,
|
|
"token_acc": 0.9526434549141698,
|
|
"train_speed(iter/s)": 0.139363
|
|
},
|
|
{
|
|
"epoch": 1.2585858585858585,
|
|
"grad_norm": 0.8704652190208435,
|
|
"learning_rate": 6.211808685466063e-06,
|
|
"loss": 0.15462675094604492,
|
|
"memory(GiB)": 29.01,
|
|
"step": 195,
|
|
"token_acc": 0.9470661110485415,
|
|
"train_speed(iter/s)": 0.14002
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"grad_norm": 0.8007650971412659,
|
|
"learning_rate": 6.046208878790543e-06,
|
|
"loss": 0.1429288625717163,
|
|
"memory(GiB)": 29.01,
|
|
"step": 200,
|
|
"token_acc": 0.9471947194719472,
|
|
"train_speed(iter/s)": 0.140432
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"eval_loss": 0.22773738205432892,
|
|
"eval_runtime": 4.935,
|
|
"eval_samples_per_second": 20.263,
|
|
"eval_steps_per_second": 5.066,
|
|
"eval_token_acc": 0.9292768786975016,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 0.7445653676986694,
|
|
"learning_rate": 5.879399776809047e-06,
|
|
"loss": 0.1418352484703064,
|
|
"memory(GiB)": 29.01,
|
|
"step": 205,
|
|
"token_acc": 0.9445697474871292,
|
|
"train_speed(iter/s)": 0.139341
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.7971433401107788,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.14476661682128905,
|
|
"memory(GiB)": 29.01,
|
|
"step": 210,
|
|
"token_acc": 0.9523753004361888,
|
|
"train_speed(iter/s)": 0.139673
|
|
},
|
|
{
|
|
"epoch": 1.387878787878788,
|
|
"grad_norm": 0.7370967268943787,
|
|
"learning_rate": 5.542926109243727e-06,
|
|
"loss": 0.13473730087280272,
|
|
"memory(GiB)": 29.01,
|
|
"step": 215,
|
|
"token_acc": 0.955563595697793,
|
|
"train_speed(iter/s)": 0.140102
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"grad_norm": 0.8203840851783752,
|
|
"learning_rate": 5.373650467932122e-06,
|
|
"loss": 0.15003204345703125,
|
|
"memory(GiB)": 29.01,
|
|
"step": 220,
|
|
"token_acc": 0.9451295603024297,
|
|
"train_speed(iter/s)": 0.140456
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"eval_loss": 0.2280927300453186,
|
|
"eval_runtime": 4.7892,
|
|
"eval_samples_per_second": 20.88,
|
|
"eval_steps_per_second": 5.22,
|
|
"eval_token_acc": 0.9299124822764386,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4525252525252526,
|
|
"grad_norm": 0.7488767504692078,
|
|
"learning_rate": 5.2039429303079294e-06,
|
|
"loss": 0.1540588140487671,
|
|
"memory(GiB)": 29.01,
|
|
"step": 225,
|
|
"token_acc": 0.9373329232807363,
|
|
"train_speed(iter/s)": 0.139519
|
|
},
|
|
{
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 0.7238897085189819,
|
|
"learning_rate": 5.033999658469174e-06,
|
|
"loss": 0.1424393892288208,
|
|
"memory(GiB)": 29.01,
|
|
"step": 230,
|
|
"token_acc": 0.9550141601917073,
|
|
"train_speed(iter/s)": 0.139888
|
|
},
|
|
{
|
|
"epoch": 1.5171717171717172,
|
|
"grad_norm": 0.7469899654388428,
|
|
"learning_rate": 4.864017086995112e-06,
|
|
"loss": 0.13640257120132446,
|
|
"memory(GiB)": 29.01,
|
|
"step": 235,
|
|
"token_acc": 0.9527641822422231,
|
|
"train_speed(iter/s)": 0.140199
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"grad_norm": 0.8132858276367188,
|
|
"learning_rate": 4.694191695890788e-06,
|
|
"loss": 0.1473687171936035,
|
|
"memory(GiB)": 29.01,
|
|
"step": 240,
|
|
"token_acc": 0.9434306569343066,
|
|
"train_speed(iter/s)": 0.14057
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"eval_loss": 0.22689995169639587,
|
|
"eval_runtime": 4.819,
|
|
"eval_samples_per_second": 20.751,
|
|
"eval_steps_per_second": 5.188,
|
|
"eval_token_acc": 0.9290813083655209,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.5818181818181818,
|
|
"grad_norm": 0.8139073848724365,
|
|
"learning_rate": 4.524719783479088e-06,
|
|
"loss": 0.14017653465270996,
|
|
"memory(GiB)": 29.01,
|
|
"step": 245,
|
|
"token_acc": 0.9365190094175095,
|
|
"train_speed(iter/s)": 0.139627
|
|
},
|
|
{
|
|
"epoch": 1.614141414141414,
|
|
"grad_norm": 0.8636942505836487,
|
|
"learning_rate": 4.355797239502807e-06,
|
|
"loss": 0.13665199279785156,
|
|
"memory(GiB)": 29.01,
|
|
"step": 250,
|
|
"token_acc": 0.9555003459337065,
|
|
"train_speed(iter/s)": 0.139706
|
|
},
|
|
{
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 0.8042647838592529,
|
|
"learning_rate": 4.187619318698971e-06,
|
|
"loss": 0.14160826206207275,
|
|
"memory(GiB)": 29.01,
|
|
"step": 255,
|
|
"token_acc": 0.9507644993762149,
|
|
"train_speed(iter/s)": 0.140117
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"grad_norm": 0.7862452268600464,
|
|
"learning_rate": 4.020380415107167e-06,
|
|
"loss": 0.1396080732345581,
|
|
"memory(GiB)": 29.01,
|
|
"step": 260,
|
|
"token_acc": 0.953097139313125,
|
|
"train_speed(iter/s)": 0.14034
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"eval_loss": 0.22532083094120026,
|
|
"eval_runtime": 4.797,
|
|
"eval_samples_per_second": 20.846,
|
|
"eval_steps_per_second": 5.212,
|
|
"eval_token_acc": 0.9303036229403999,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.8368792533874512,
|
|
"learning_rate": 3.854273837372724e-06,
|
|
"loss": 0.14725687503814697,
|
|
"memory(GiB)": 29.01,
|
|
"step": 265,
|
|
"token_acc": 0.9422004865193594,
|
|
"train_speed(iter/s)": 0.139548
|
|
},
|
|
{
|
|
"epoch": 1.7434343434343433,
|
|
"grad_norm": 0.7835624814033508,
|
|
"learning_rate": 3.689491585304491e-06,
|
|
"loss": 0.14394346475601197,
|
|
"memory(GiB)": 29.01,
|
|
"step": 270,
|
|
"token_acc": 0.9471722928540114,
|
|
"train_speed(iter/s)": 0.139821
|
|
},
|
|
{
|
|
"epoch": 1.7757575757575759,
|
|
"grad_norm": 0.7341930866241455,
|
|
"learning_rate": 3.526224127945479e-06,
|
|
"loss": 0.14667458534240724,
|
|
"memory(GiB)": 29.01,
|
|
"step": 275,
|
|
"token_acc": 0.9464715744551135,
|
|
"train_speed(iter/s)": 0.140255
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 0.7114227414131165,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.132787024974823,
|
|
"memory(GiB)": 29.01,
|
|
"step": 280,
|
|
"token_acc": 0.9507424168754909,
|
|
"train_speed(iter/s)": 0.140468
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"eval_loss": 0.22350256145000458,
|
|
"eval_runtime": 4.7736,
|
|
"eval_samples_per_second": 20.949,
|
|
"eval_steps_per_second": 5.237,
|
|
"eval_token_acc": 0.9308658876448442,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.8404040404040405,
|
|
"grad_norm": 0.7501634359359741,
|
|
"learning_rate": 3.204986500762006e-06,
|
|
"loss": 0.13066763877868653,
|
|
"memory(GiB)": 29.01,
|
|
"step": 285,
|
|
"token_acc": 0.9368674340146854,
|
|
"train_speed(iter/s)": 0.139644
|
|
},
|
|
{
|
|
"epoch": 1.8727272727272726,
|
|
"grad_norm": 0.7324647307395935,
|
|
"learning_rate": 3.0473876441260786e-06,
|
|
"loss": 0.1387048363685608,
|
|
"memory(GiB)": 29.01,
|
|
"step": 290,
|
|
"token_acc": 0.9532809871003926,
|
|
"train_speed(iter/s)": 0.139906
|
|
},
|
|
{
|
|
"epoch": 1.905050505050505,
|
|
"grad_norm": 0.8160727024078369,
|
|
"learning_rate": 2.8920457793817507e-06,
|
|
"loss": 0.13977317810058593,
|
|
"memory(GiB)": 29.01,
|
|
"step": 295,
|
|
"token_acc": 0.9515957446808511,
|
|
"train_speed(iter/s)": 0.14009
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"grad_norm": 0.8773327469825745,
|
|
"learning_rate": 2.7391404635865725e-06,
|
|
"loss": 0.15099945068359374,
|
|
"memory(GiB)": 29.01,
|
|
"step": 300,
|
|
"token_acc": 0.9496292015795049,
|
|
"train_speed(iter/s)": 0.140321
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"eval_loss": 0.22214438021183014,
|
|
"eval_runtime": 4.7787,
|
|
"eval_samples_per_second": 20.926,
|
|
"eval_steps_per_second": 5.232,
|
|
"eval_token_acc": 0.930939226519337,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 0.7400924563407898,
|
|
"learning_rate": 2.5888484374320033e-06,
|
|
"loss": 0.1311182498931885,
|
|
"memory(GiB)": 29.01,
|
|
"step": 305,
|
|
"token_acc": 0.944199668256272,
|
|
"train_speed(iter/s)": 0.139576
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.9168914556503296,
|
|
"learning_rate": 2.4413434209518137e-06,
|
|
"loss": 0.14706777334213256,
|
|
"memory(GiB)": 29.01,
|
|
"step": 310,
|
|
"token_acc": 0.9542029226862068,
|
|
"train_speed(iter/s)": 0.139976
|
|
},
|
|
{
|
|
"epoch": 2.0323232323232325,
|
|
"grad_norm": 0.6356860399246216,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 0.09936747550964356,
|
|
"memory(GiB)": 29.01,
|
|
"step": 315,
|
|
"token_acc": 0.9699422322449106,
|
|
"train_speed(iter/s)": 0.140169
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"grad_norm": 0.7481921911239624,
|
|
"learning_rate": 2.1553729927843894e-06,
|
|
"loss": 0.09395751953125,
|
|
"memory(GiB)": 29.01,
|
|
"step": 320,
|
|
"token_acc": 0.9698759156824638,
|
|
"train_speed(iter/s)": 0.1404
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"eval_loss": 0.23354972898960114,
|
|
"eval_runtime": 4.7345,
|
|
"eval_samples_per_second": 21.122,
|
|
"eval_steps_per_second": 5.28,
|
|
"eval_token_acc": 0.9305480858553757,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.096969696969697,
|
|
"grad_norm": 0.7663989663124084,
|
|
"learning_rate": 2.017238129521506e-06,
|
|
"loss": 0.09232854843139648,
|
|
"memory(GiB)": 29.01,
|
|
"step": 325,
|
|
"token_acc": 0.9550909621122053,
|
|
"train_speed(iter/s)": 0.139732
|
|
},
|
|
{
|
|
"epoch": 2.1292929292929292,
|
|
"grad_norm": 0.8547663688659668,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.09894357919692993,
|
|
"memory(GiB)": 29.01,
|
|
"step": 330,
|
|
"token_acc": 0.9668018320254383,
|
|
"train_speed(iter/s)": 0.140046
|
|
},
|
|
{
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 0.7185168266296387,
|
|
"learning_rate": 1.7514672589449378e-06,
|
|
"loss": 0.08718444108963012,
|
|
"memory(GiB)": 29.01,
|
|
"step": 335,
|
|
"token_acc": 0.9681554248986598,
|
|
"train_speed(iter/s)": 0.140153
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"grad_norm": 0.7746614813804626,
|
|
"learning_rate": 1.6241384517255854e-06,
|
|
"loss": 0.09638407826423645,
|
|
"memory(GiB)": 29.01,
|
|
"step": 340,
|
|
"token_acc": 0.9620331882302884,
|
|
"train_speed(iter/s)": 0.14039
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"eval_loss": 0.24958540499210358,
|
|
"eval_runtime": 4.75,
|
|
"eval_samples_per_second": 21.052,
|
|
"eval_steps_per_second": 5.263,
|
|
"eval_token_acc": 0.9302791766489024,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.2262626262626264,
|
|
"grad_norm": 0.742405354976654,
|
|
"learning_rate": 1.500711746282192e-06,
|
|
"loss": 0.09411965012550354,
|
|
"memory(GiB)": 29.01,
|
|
"step": 345,
|
|
"token_acc": 0.9519363664569994,
|
|
"train_speed(iter/s)": 0.139752
|
|
},
|
|
{
|
|
"epoch": 2.2585858585858585,
|
|
"grad_norm": 0.764021098613739,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.08621931076049805,
|
|
"memory(GiB)": 29.01,
|
|
"step": 350,
|
|
"token_acc": 0.9720944103612815,
|
|
"train_speed(iter/s)": 0.139887
|
|
},
|
|
{
|
|
"epoch": 2.290909090909091,
|
|
"grad_norm": 0.773617684841156,
|
|
"learning_rate": 1.2661306328825818e-06,
|
|
"loss": 0.09195576310157776,
|
|
"memory(GiB)": 29.01,
|
|
"step": 355,
|
|
"token_acc": 0.9652025497230834,
|
|
"train_speed(iter/s)": 0.140164
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 0.6916205286979675,
|
|
"learning_rate": 1.1552473733031893e-06,
|
|
"loss": 0.09625710248947143,
|
|
"memory(GiB)": 29.01,
|
|
"step": 360,
|
|
"token_acc": 0.964334548769371,
|
|
"train_speed(iter/s)": 0.140497
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"eval_loss": 0.24684520065784454,
|
|
"eval_runtime": 4.7402,
|
|
"eval_samples_per_second": 21.096,
|
|
"eval_steps_per_second": 5.274,
|
|
"eval_token_acc": 0.9305236395638782,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.7022070288658142,
|
|
"learning_rate": 1.0488081988375493e-06,
|
|
"loss": 0.09287334084510804,
|
|
"memory(GiB)": 29.01,
|
|
"step": 365,
|
|
"token_acc": 0.9523647112323783,
|
|
"train_speed(iter/s)": 0.140054
|
|
},
|
|
{
|
|
"epoch": 2.3878787878787877,
|
|
"grad_norm": 0.7242087721824646,
|
|
"learning_rate": 9.469361407432431e-07,
|
|
"loss": 0.08903356790542602,
|
|
"memory(GiB)": 29.01,
|
|
"step": 370,
|
|
"token_acc": 0.9695336787564767,
|
|
"train_speed(iter/s)": 0.140188
|
|
},
|
|
{
|
|
"epoch": 2.4202020202020202,
|
|
"grad_norm": 0.6964623332023621,
|
|
"learning_rate": 8.497489512245971e-07,
|
|
"loss": 0.0938454508781433,
|
|
"memory(GiB)": 29.01,
|
|
"step": 375,
|
|
"token_acc": 0.9731585150265174,
|
|
"train_speed(iter/s)": 0.140328
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"grad_norm": 0.7716543078422546,
|
|
"learning_rate": 7.573589673248833e-07,
|
|
"loss": 0.09390033483505249,
|
|
"memory(GiB)": 29.01,
|
|
"step": 380,
|
|
"token_acc": 0.9665334135210669,
|
|
"train_speed(iter/s)": 0.140549
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"eval_loss": 0.24702604115009308,
|
|
"eval_runtime": 4.7799,
|
|
"eval_samples_per_second": 20.921,
|
|
"eval_steps_per_second": 5.23,
|
|
"eval_token_acc": 0.9308169950618491,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 0.6270375847816467,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.0894782304763794,
|
|
"memory(GiB)": 29.01,
|
|
"step": 385,
|
|
"token_acc": 0.9545825054765907,
|
|
"train_speed(iter/s)": 0.139956
|
|
},
|
|
{
|
|
"epoch": 2.517171717171717,
|
|
"grad_norm": 0.7530998587608337,
|
|
"learning_rate": 5.873921160683943e-07,
|
|
"loss": 0.09273716211318969,
|
|
"memory(GiB)": 29.01,
|
|
"step": 390,
|
|
"token_acc": 0.9644321902464242,
|
|
"train_speed(iter/s)": 0.140246
|
|
},
|
|
{
|
|
"epoch": 2.5494949494949495,
|
|
"grad_norm": 0.7483599781990051,
|
|
"learning_rate": 5.100117105459279e-07,
|
|
"loss": 0.09490547776222229,
|
|
"memory(GiB)": 29.01,
|
|
"step": 395,
|
|
"token_acc": 0.9702084609035968,
|
|
"train_speed(iter/s)": 0.140356
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"grad_norm": 0.6409457921981812,
|
|
"learning_rate": 4.3782120722406565e-07,
|
|
"loss": 0.08456376791000367,
|
|
"memory(GiB)": 29.01,
|
|
"step": 400,
|
|
"token_acc": 0.967419212040726,
|
|
"train_speed(iter/s)": 0.14058
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"eval_loss": 0.24691322445869446,
|
|
"eval_runtime": 4.7767,
|
|
"eval_samples_per_second": 20.935,
|
|
"eval_steps_per_second": 5.234,
|
|
"eval_token_acc": 0.9304991932723806,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.614141414141414,
|
|
"grad_norm": 0.7296220064163208,
|
|
"learning_rate": 3.709040498955102e-07,
|
|
"loss": 0.09365044832229615,
|
|
"memory(GiB)": 29.01,
|
|
"step": 405,
|
|
"token_acc": 0.9518691709509908,
|
|
"train_speed(iter/s)": 0.139998
|
|
},
|
|
{
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 0.7950789928436279,
|
|
"learning_rate": 3.0933758698072023e-07,
|
|
"loss": 0.09456123113632202,
|
|
"memory(GiB)": 29.01,
|
|
"step": 410,
|
|
"token_acc": 0.967745104460017,
|
|
"train_speed(iter/s)": 0.140168
|
|
},
|
|
{
|
|
"epoch": 2.6787878787878787,
|
|
"grad_norm": 0.7504149079322815,
|
|
"learning_rate": 2.531929821221768e-07,
|
|
"loss": 0.09618629813194275,
|
|
"memory(GiB)": 29.01,
|
|
"step": 415,
|
|
"token_acc": 0.9672515016798453,
|
|
"train_speed(iter/s)": 0.140395
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.7684112191200256,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.09071210622787476,
|
|
"memory(GiB)": 29.01,
|
|
"step": 420,
|
|
"token_acc": 0.9696820512820513,
|
|
"train_speed(iter/s)": 0.140583
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"eval_loss": 0.24677424132823944,
|
|
"eval_runtime": 4.7699,
|
|
"eval_samples_per_second": 20.965,
|
|
"eval_steps_per_second": 5.241,
|
|
"eval_token_acc": 0.930181391482912,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.7434343434343433,
|
|
"grad_norm": 0.8230168223381042,
|
|
"learning_rate": 1.5742259095662126e-07,
|
|
"loss": 0.09300805330276489,
|
|
"memory(GiB)": 29.01,
|
|
"step": 425,
|
|
"token_acc": 0.9522797263783283,
|
|
"train_speed(iter/s)": 0.140016
|
|
},
|
|
{
|
|
"epoch": 2.775757575757576,
|
|
"grad_norm": 0.7461301684379578,
|
|
"learning_rate": 1.1790750403941231e-07,
|
|
"loss": 0.09267510175704956,
|
|
"memory(GiB)": 29.01,
|
|
"step": 430,
|
|
"token_acc": 0.9682438869482375,
|
|
"train_speed(iter/s)": 0.140189
|
|
},
|
|
{
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 0.7107937932014465,
|
|
"learning_rate": 8.403554600248498e-08,
|
|
"loss": 0.08705815076828002,
|
|
"memory(GiB)": 29.01,
|
|
"step": 435,
|
|
"token_acc": 0.9719344842850819,
|
|
"train_speed(iter/s)": 0.140367
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"grad_norm": 0.6642769575119019,
|
|
"learning_rate": 5.584586887435739e-08,
|
|
"loss": 0.09030424356460572,
|
|
"memory(GiB)": 29.01,
|
|
"step": 440,
|
|
"token_acc": 0.9698875973943032,
|
|
"train_speed(iter/s)": 0.140575
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"eval_loss": 0.2466077357530594,
|
|
"eval_runtime": 4.7557,
|
|
"eval_samples_per_second": 21.027,
|
|
"eval_steps_per_second": 5.257,
|
|
"eval_token_acc": 0.9308658876448442,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.8727272727272726,
|
|
"grad_norm": 0.6828747987747192,
|
|
"learning_rate": 3.337105663029361e-08,
|
|
"loss": 0.0859929859638214,
|
|
"memory(GiB)": 29.01,
|
|
"step": 445,
|
|
"token_acc": 0.9535586561225003,
|
|
"train_speed(iter/s)": 0.140068
|
|
},
|
|
{
|
|
"epoch": 2.905050505050505,
|
|
"grad_norm": 0.6743197441101074,
|
|
"learning_rate": 1.6637087529033925e-08,
|
|
"loss": 0.09535614252090455,
|
|
"memory(GiB)": 29.01,
|
|
"step": 450,
|
|
"token_acc": 0.9667527211833659,
|
|
"train_speed(iter/s)": 0.140292
|
|
},
|
|
{
|
|
"epoch": 2.937373737373737,
|
|
"grad_norm": 0.6410036087036133,
|
|
"learning_rate": 5.6633040849601865e-09,
|
|
"loss": 0.08608411550521851,
|
|
"memory(GiB)": 29.01,
|
|
"step": 455,
|
|
"token_acc": 0.9691890107471665,
|
|
"train_speed(iter/s)": 0.140436
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 0.7376388311386108,
|
|
"learning_rate": 4.623907104084335e-10,
|
|
"loss": 0.0936282753944397,
|
|
"memory(GiB)": 29.01,
|
|
"step": 460,
|
|
"token_acc": 0.9650562139167427,
|
|
"train_speed(iter/s)": 0.140678
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"eval_loss": 0.24663545191287994,
|
|
"eval_runtime": 4.7524,
|
|
"eval_samples_per_second": 21.042,
|
|
"eval_steps_per_second": 5.261,
|
|
"eval_token_acc": 0.9304991932723806,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.9826262626262627,
|
|
"eval_loss": 0.24689918756484985,
|
|
"eval_runtime": 4.7514,
|
|
"eval_samples_per_second": 21.046,
|
|
"eval_steps_per_second": 5.262,
|
|
"eval_token_acc": 0.9306703173128636,
|
|
"step": 462
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 462,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 5.038524706450309e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|