Model: waltonfuture/qwen2.5vl-3b-sampled_10000_reflection-cot-32b Source: Original Platform
1181 lines
33 KiB
JSON
1181 lines
33 KiB
JSON
{
|
|
"best_global_step": 300,
|
|
"best_metric": 0.21289518,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v32-20250511-155741/checkpoint-300",
|
|
"epoch": 2.9826262626262627,
|
|
"eval_steps": 20,
|
|
"global_step": 462,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006464646464646465,
|
|
"grad_norm": 2.4983460903167725,
|
|
"learning_rate": 9.999884400986087e-06,
|
|
"loss": 0.4511958062648773,
|
|
"memory(GiB)": 29.0,
|
|
"step": 1,
|
|
"token_acc": 0.8814259881167656,
|
|
"train_speed(iter/s)": 0.064483
|
|
},
|
|
{
|
|
"epoch": 0.03232323232323232,
|
|
"grad_norm": 1.3926482200622559,
|
|
"learning_rate": 9.997110291906109e-06,
|
|
"loss": 0.36116155982017517,
|
|
"memory(GiB)": 29.0,
|
|
"step": 5,
|
|
"token_acc": 0.8813427587993634,
|
|
"train_speed(iter/s)": 0.117455
|
|
},
|
|
{
|
|
"epoch": 0.06464646464646465,
|
|
"grad_norm": 0.9170165657997131,
|
|
"learning_rate": 9.988444507789584e-06,
|
|
"loss": 0.2866232395172119,
|
|
"memory(GiB)": 29.01,
|
|
"step": 10,
|
|
"token_acc": 0.9120523855610239,
|
|
"train_speed(iter/s)": 0.133259
|
|
},
|
|
{
|
|
"epoch": 0.09696969696969697,
|
|
"grad_norm": 1.0082355737686157,
|
|
"learning_rate": 9.97401266428502e-06,
|
|
"loss": 0.28241963386535646,
|
|
"memory(GiB)": 29.01,
|
|
"step": 15,
|
|
"token_acc": 0.9087824080358354,
|
|
"train_speed(iter/s)": 0.138372
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"grad_norm": 0.8785488605499268,
|
|
"learning_rate": 9.953831442918418e-06,
|
|
"loss": 0.2537196159362793,
|
|
"memory(GiB)": 29.01,
|
|
"step": 20,
|
|
"token_acc": 0.9221852133546644,
|
|
"train_speed(iter/s)": 0.141695
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"eval_loss": 0.2639790177345276,
|
|
"eval_runtime": 4.917,
|
|
"eval_samples_per_second": 20.337,
|
|
"eval_steps_per_second": 5.084,
|
|
"eval_token_acc": 0.9243485416476181,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 0.8217573761940002,
|
|
"learning_rate": 9.927924170825266e-06,
|
|
"loss": 0.25768203735351564,
|
|
"memory(GiB)": 35.4,
|
|
"step": 25,
|
|
"token_acc": 0.9051673457803726,
|
|
"train_speed(iter/s)": 0.130395
|
|
},
|
|
{
|
|
"epoch": 0.19393939393939394,
|
|
"grad_norm": 0.7097320556640625,
|
|
"learning_rate": 9.896320793787106e-06,
|
|
"loss": 0.2483672618865967,
|
|
"memory(GiB)": 35.4,
|
|
"step": 30,
|
|
"token_acc": 0.909970182164424,
|
|
"train_speed(iter/s)": 0.135436
|
|
},
|
|
{
|
|
"epoch": 0.22626262626262628,
|
|
"grad_norm": 0.836563229560852,
|
|
"learning_rate": 9.859057841617709e-06,
|
|
"loss": 0.2459421157836914,
|
|
"memory(GiB)": 35.4,
|
|
"step": 35,
|
|
"token_acc": 0.9186036076460559,
|
|
"train_speed(iter/s)": 0.137299
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"grad_norm": 0.8180928230285645,
|
|
"learning_rate": 9.816178385938867e-06,
|
|
"loss": 0.24172163009643555,
|
|
"memory(GiB)": 35.4,
|
|
"step": 40,
|
|
"token_acc": 0.9263462681936868,
|
|
"train_speed(iter/s)": 0.138678
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"eval_loss": 0.24495410919189453,
|
|
"eval_runtime": 4.9047,
|
|
"eval_samples_per_second": 20.389,
|
|
"eval_steps_per_second": 5.097,
|
|
"eval_token_acc": 0.9283167230501966,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 0.7037041187286377,
|
|
"learning_rate": 9.767731990394638e-06,
|
|
"loss": 0.23539552688598633,
|
|
"memory(GiB)": 35.4,
|
|
"step": 45,
|
|
"token_acc": 0.923866090712743,
|
|
"train_speed(iter/s)": 0.133364
|
|
},
|
|
{
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 0.7791242003440857,
|
|
"learning_rate": 9.71377465336155e-06,
|
|
"loss": 0.24135751724243165,
|
|
"memory(GiB)": 35.4,
|
|
"step": 50,
|
|
"token_acc": 0.9219366605869673,
|
|
"train_speed(iter/s)": 0.13559
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.8093725442886353,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.22368321418762208,
|
|
"memory(GiB)": 35.4,
|
|
"step": 55,
|
|
"token_acc": 0.9367819177493826,
|
|
"train_speed(iter/s)": 0.136789
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"grad_norm": 0.793940007686615,
|
|
"learning_rate": 9.589582926268798e-06,
|
|
"loss": 0.253676700592041,
|
|
"memory(GiB)": 35.4,
|
|
"step": 60,
|
|
"token_acc": 0.9199327545541226,
|
|
"train_speed(iter/s)": 0.138407
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"eval_loss": 0.23592451214790344,
|
|
"eval_runtime": 4.9128,
|
|
"eval_samples_per_second": 20.355,
|
|
"eval_steps_per_second": 5.089,
|
|
"eval_token_acc": 0.9300722318734571,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4202020202020202,
|
|
"grad_norm": 0.6124956011772156,
|
|
"learning_rate": 9.519492087344724e-06,
|
|
"loss": 0.22318036556243898,
|
|
"memory(GiB)": 35.4,
|
|
"step": 65,
|
|
"token_acc": 0.9210595135801547,
|
|
"train_speed(iter/s)": 0.133844
|
|
},
|
|
{
|
|
"epoch": 0.45252525252525255,
|
|
"grad_norm": 0.7477027773857117,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.2359461307525635,
|
|
"memory(GiB)": 35.4,
|
|
"step": 70,
|
|
"token_acc": 0.9285559703511335,
|
|
"train_speed(iter/s)": 0.13606
|
|
},
|
|
{
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 0.6800923347473145,
|
|
"learning_rate": 9.363725449224281e-06,
|
|
"loss": 0.2285386562347412,
|
|
"memory(GiB)": 35.4,
|
|
"step": 75,
|
|
"token_acc": 0.9188793644156387,
|
|
"train_speed(iter/s)": 0.137382
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"grad_norm": 0.6734771132469177,
|
|
"learning_rate": 9.278229698073889e-06,
|
|
"loss": 0.21797473430633546,
|
|
"memory(GiB)": 35.4,
|
|
"step": 80,
|
|
"token_acc": 0.9257270821968936,
|
|
"train_speed(iter/s)": 0.137993
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"eval_loss": 0.22674760222434998,
|
|
"eval_runtime": 4.9057,
|
|
"eval_samples_per_second": 20.384,
|
|
"eval_steps_per_second": 5.096,
|
|
"eval_token_acc": 0.9334003840175551,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5494949494949495,
|
|
"grad_norm": 0.7561268210411072,
|
|
"learning_rate": 9.187788812929074e-06,
|
|
"loss": 0.21897151470184326,
|
|
"memory(GiB)": 35.4,
|
|
"step": 85,
|
|
"token_acc": 0.9210582145281738,
|
|
"train_speed(iter/s)": 0.134965
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 0.8261750936508179,
|
|
"learning_rate": 9.092507332892968e-06,
|
|
"loss": 0.22996132373809813,
|
|
"memory(GiB)": 35.4,
|
|
"step": 90,
|
|
"token_acc": 0.9239816972180894,
|
|
"train_speed(iter/s)": 0.135853
|
|
},
|
|
{
|
|
"epoch": 0.6141414141414141,
|
|
"grad_norm": 0.7424522638320923,
|
|
"learning_rate": 8.992495392231195e-06,
|
|
"loss": 0.2230750799179077,
|
|
"memory(GiB)": 35.4,
|
|
"step": 95,
|
|
"token_acc": 0.9186349499873918,
|
|
"train_speed(iter/s)": 0.13676
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 0.7229611873626709,
|
|
"learning_rate": 8.88786859306952e-06,
|
|
"loss": 0.22269039154052733,
|
|
"memory(GiB)": 35.4,
|
|
"step": 100,
|
|
"token_acc": 0.9253269004084542,
|
|
"train_speed(iter/s)": 0.13764
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"eval_loss": 0.22388949990272522,
|
|
"eval_runtime": 4.9194,
|
|
"eval_samples_per_second": 20.328,
|
|
"eval_steps_per_second": 5.082,
|
|
"eval_token_acc": 0.9335466764194935,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6787878787878788,
|
|
"grad_norm": 0.7450099587440491,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 0.2219161033630371,
|
|
"memory(GiB)": 35.4,
|
|
"step": 105,
|
|
"token_acc": 0.9172185430463576,
|
|
"train_speed(iter/s)": 0.135528
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.7880620360374451,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.2219111680984497,
|
|
"memory(GiB)": 35.4,
|
|
"step": 110,
|
|
"token_acc": 0.9298043728423475,
|
|
"train_speed(iter/s)": 0.136321
|
|
},
|
|
{
|
|
"epoch": 0.7434343434343434,
|
|
"grad_norm": 0.6852765679359436,
|
|
"learning_rate": 8.547534234672435e-06,
|
|
"loss": 0.21125171184539795,
|
|
"memory(GiB)": 35.4,
|
|
"step": 115,
|
|
"token_acc": 0.9242475103502293,
|
|
"train_speed(iter/s)": 0.137112
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"grad_norm": 0.7938790917396545,
|
|
"learning_rate": 8.425708574839221e-06,
|
|
"loss": 0.20452361106872557,
|
|
"memory(GiB)": 35.4,
|
|
"step": 120,
|
|
"token_acc": 0.9356995325578867,
|
|
"train_speed(iter/s)": 0.137738
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"eval_loss": 0.22211778163909912,
|
|
"eval_runtime": 4.9147,
|
|
"eval_samples_per_second": 20.347,
|
|
"eval_steps_per_second": 5.087,
|
|
"eval_token_acc": 0.9335649629697358,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 0.8188007473945618,
|
|
"learning_rate": 8.299923195887599e-06,
|
|
"loss": 0.2093752384185791,
|
|
"memory(GiB)": 35.4,
|
|
"step": 125,
|
|
"token_acc": 0.9252643368325139,
|
|
"train_speed(iter/s)": 0.135892
|
|
},
|
|
{
|
|
"epoch": 0.8404040404040404,
|
|
"grad_norm": 0.7376716732978821,
|
|
"learning_rate": 8.170323491028625e-06,
|
|
"loss": 0.22786922454833985,
|
|
"memory(GiB)": 35.4,
|
|
"step": 130,
|
|
"token_acc": 0.9265276699567484,
|
|
"train_speed(iter/s)": 0.136311
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 0.7271324396133423,
|
|
"learning_rate": 8.03705926238874e-06,
|
|
"loss": 0.22320261001586914,
|
|
"memory(GiB)": 35.4,
|
|
"step": 135,
|
|
"token_acc": 0.9232753388630386,
|
|
"train_speed(iter/s)": 0.136955
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"grad_norm": 0.6261648535728455,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.20459423065185547,
|
|
"memory(GiB)": 35.4,
|
|
"step": 140,
|
|
"token_acc": 0.9249030499947595,
|
|
"train_speed(iter/s)": 0.137413
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"eval_loss": 0.2182023972272873,
|
|
"eval_runtime": 4.9305,
|
|
"eval_samples_per_second": 20.282,
|
|
"eval_steps_per_second": 5.07,
|
|
"eval_token_acc": 0.9343512846301545,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9373737373737374,
|
|
"grad_norm": 0.7701309323310852,
|
|
"learning_rate": 7.760157443030234e-06,
|
|
"loss": 0.21836166381835936,
|
|
"memory(GiB)": 35.4,
|
|
"step": 145,
|
|
"token_acc": 0.9190240230822304,
|
|
"train_speed(iter/s)": 0.13576
|
|
},
|
|
{
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 0.7041613459587097,
|
|
"learning_rate": 7.616839918483061e-06,
|
|
"loss": 0.20981380939483643,
|
|
"memory(GiB)": 35.4,
|
|
"step": 150,
|
|
"token_acc": 0.936579955636217,
|
|
"train_speed(iter/s)": 0.136283
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.7106350660324097,
|
|
"learning_rate": 7.470497632538743e-06,
|
|
"loss": 0.21157641410827638,
|
|
"memory(GiB)": 35.4,
|
|
"step": 155,
|
|
"token_acc": 0.9277830690795003,
|
|
"train_speed(iter/s)": 0.13695
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"grad_norm": 0.6274145245552063,
|
|
"learning_rate": 7.321299739792553e-06,
|
|
"loss": 0.15708084106445314,
|
|
"memory(GiB)": 35.4,
|
|
"step": 160,
|
|
"token_acc": 0.9426217376288915,
|
|
"train_speed(iter/s)": 0.137434
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"eval_loss": 0.21975626051425934,
|
|
"eval_runtime": 4.9158,
|
|
"eval_samples_per_second": 20.343,
|
|
"eval_steps_per_second": 5.086,
|
|
"eval_token_acc": 0.9343878577306391,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0646464646464646,
|
|
"grad_norm": 0.774198591709137,
|
|
"learning_rate": 7.169418695587791e-06,
|
|
"loss": 0.15303026437759398,
|
|
"memory(GiB)": 35.4,
|
|
"step": 165,
|
|
"token_acc": 0.9383086204390173,
|
|
"train_speed(iter/s)": 0.136042
|
|
},
|
|
{
|
|
"epoch": 1.096969696969697,
|
|
"grad_norm": 0.9729277491569519,
|
|
"learning_rate": 7.015030056677559e-06,
|
|
"loss": 0.16362838745117186,
|
|
"memory(GiB)": 35.4,
|
|
"step": 170,
|
|
"token_acc": 0.9423035053342043,
|
|
"train_speed(iter/s)": 0.136853
|
|
},
|
|
{
|
|
"epoch": 1.1292929292929292,
|
|
"grad_norm": 0.6922506093978882,
|
|
"learning_rate": 6.858312278301638e-06,
|
|
"loss": 0.14241609573364258,
|
|
"memory(GiB)": 35.4,
|
|
"step": 175,
|
|
"token_acc": 0.9497638260185003,
|
|
"train_speed(iter/s)": 0.137229
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 0.7330082058906555,
|
|
"learning_rate": 6.699446507913083e-06,
|
|
"loss": 0.14002810716629027,
|
|
"memory(GiB)": 35.4,
|
|
"step": 180,
|
|
"token_acc": 0.9511152364076167,
|
|
"train_speed(iter/s)": 0.137547
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"eval_loss": 0.2188844084739685,
|
|
"eval_runtime": 4.9164,
|
|
"eval_samples_per_second": 20.34,
|
|
"eval_steps_per_second": 5.085,
|
|
"eval_token_acc": 0.9342232787784585,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.1939393939393939,
|
|
"grad_norm": 0.6539108157157898,
|
|
"learning_rate": 6.53861637579291e-06,
|
|
"loss": 0.14178130626678467,
|
|
"memory(GiB)": 35.4,
|
|
"step": 185,
|
|
"token_acc": 0.9393139351120201,
|
|
"train_speed(iter/s)": 0.13623
|
|
},
|
|
{
|
|
"epoch": 1.2262626262626264,
|
|
"grad_norm": 0.6944511532783508,
|
|
"learning_rate": 6.376007782794926e-06,
|
|
"loss": 0.14827605485916137,
|
|
"memory(GiB)": 35.4,
|
|
"step": 190,
|
|
"token_acc": 0.953679121068877,
|
|
"train_speed(iter/s)": 0.136568
|
|
},
|
|
{
|
|
"epoch": 1.2585858585858585,
|
|
"grad_norm": 0.771194577217102,
|
|
"learning_rate": 6.211808685466063e-06,
|
|
"loss": 0.15898674726486206,
|
|
"memory(GiB)": 35.4,
|
|
"step": 195,
|
|
"token_acc": 0.943248080364875,
|
|
"train_speed(iter/s)": 0.137259
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"grad_norm": 0.6921746134757996,
|
|
"learning_rate": 6.046208878790543e-06,
|
|
"loss": 0.15377380847930908,
|
|
"memory(GiB)": 35.4,
|
|
"step": 200,
|
|
"token_acc": 0.9407217751767861,
|
|
"train_speed(iter/s)": 0.137707
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"eval_loss": 0.21775808930397034,
|
|
"eval_runtime": 4.8976,
|
|
"eval_samples_per_second": 20.418,
|
|
"eval_steps_per_second": 5.105,
|
|
"eval_token_acc": 0.93543019109445,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 0.6446962952613831,
|
|
"learning_rate": 5.879399776809047e-06,
|
|
"loss": 0.14692131280899048,
|
|
"memory(GiB)": 35.4,
|
|
"step": 205,
|
|
"token_acc": 0.9425448637471383,
|
|
"train_speed(iter/s)": 0.136659
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.7306222319602966,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.15312260389328003,
|
|
"memory(GiB)": 35.4,
|
|
"step": 210,
|
|
"token_acc": 0.9499927420525476,
|
|
"train_speed(iter/s)": 0.137029
|
|
},
|
|
{
|
|
"epoch": 1.387878787878788,
|
|
"grad_norm": 0.6448764801025391,
|
|
"learning_rate": 5.542926109243727e-06,
|
|
"loss": 0.13940632343292236,
|
|
"memory(GiB)": 35.4,
|
|
"step": 215,
|
|
"token_acc": 0.9516881492881878,
|
|
"train_speed(iter/s)": 0.137465
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"grad_norm": 0.6932432055473328,
|
|
"learning_rate": 5.373650467932122e-06,
|
|
"loss": 0.15438802242279054,
|
|
"memory(GiB)": 35.4,
|
|
"step": 220,
|
|
"token_acc": 0.9442491210447012,
|
|
"train_speed(iter/s)": 0.137794
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"eval_loss": 0.21947798132896423,
|
|
"eval_runtime": 4.9113,
|
|
"eval_samples_per_second": 20.361,
|
|
"eval_steps_per_second": 5.09,
|
|
"eval_token_acc": 0.9346804425345159,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4525252525252526,
|
|
"grad_norm": 0.6570079922676086,
|
|
"learning_rate": 5.2039429303079294e-06,
|
|
"loss": 0.15822865962982177,
|
|
"memory(GiB)": 35.4,
|
|
"step": 225,
|
|
"token_acc": 0.9367170212395239,
|
|
"train_speed(iter/s)": 0.136817
|
|
},
|
|
{
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 0.6642510294914246,
|
|
"learning_rate": 5.033999658469174e-06,
|
|
"loss": 0.15136797428131105,
|
|
"memory(GiB)": 35.4,
|
|
"step": 230,
|
|
"token_acc": 0.9503748661192432,
|
|
"train_speed(iter/s)": 0.137201
|
|
},
|
|
{
|
|
"epoch": 1.5171717171717172,
|
|
"grad_norm": 0.7496922016143799,
|
|
"learning_rate": 4.864017086995112e-06,
|
|
"loss": 0.14471328258514404,
|
|
"memory(GiB)": 35.4,
|
|
"step": 235,
|
|
"token_acc": 0.9505234475835365,
|
|
"train_speed(iter/s)": 0.137524
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"grad_norm": 0.7266538739204407,
|
|
"learning_rate": 4.694191695890788e-06,
|
|
"loss": 0.1494928002357483,
|
|
"memory(GiB)": 35.4,
|
|
"step": 240,
|
|
"token_acc": 0.9422356427892412,
|
|
"train_speed(iter/s)": 0.137886
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"eval_loss": 0.21711412072181702,
|
|
"eval_runtime": 4.9366,
|
|
"eval_samples_per_second": 20.257,
|
|
"eval_steps_per_second": 5.064,
|
|
"eval_token_acc": 0.9350644600896041,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.5818181818181818,
|
|
"grad_norm": 0.7479135394096375,
|
|
"learning_rate": 4.524719783479088e-06,
|
|
"loss": 0.1447455883026123,
|
|
"memory(GiB)": 35.4,
|
|
"step": 245,
|
|
"token_acc": 0.939549201495775,
|
|
"train_speed(iter/s)": 0.136926
|
|
},
|
|
{
|
|
"epoch": 1.614141414141414,
|
|
"grad_norm": 0.749000072479248,
|
|
"learning_rate": 4.355797239502807e-06,
|
|
"loss": 0.14387867450714112,
|
|
"memory(GiB)": 35.4,
|
|
"step": 250,
|
|
"token_acc": 0.9542463385992511,
|
|
"train_speed(iter/s)": 0.136999
|
|
},
|
|
{
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 0.7617182731628418,
|
|
"learning_rate": 4.187619318698971e-06,
|
|
"loss": 0.14646867513656617,
|
|
"memory(GiB)": 35.4,
|
|
"step": 255,
|
|
"token_acc": 0.9474541561947315,
|
|
"train_speed(iter/s)": 0.137388
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"grad_norm": 0.6883800029754639,
|
|
"learning_rate": 4.020380415107167e-06,
|
|
"loss": 0.1463113784790039,
|
|
"memory(GiB)": 35.4,
|
|
"step": 260,
|
|
"token_acc": 0.9495336172965978,
|
|
"train_speed(iter/s)": 0.137618
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"eval_loss": 0.21431542932987213,
|
|
"eval_runtime": 4.9045,
|
|
"eval_samples_per_second": 20.39,
|
|
"eval_steps_per_second": 5.097,
|
|
"eval_token_acc": 0.93678339581238,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.6667433977127075,
|
|
"learning_rate": 3.854273837372724e-06,
|
|
"loss": 0.1535871744155884,
|
|
"memory(GiB)": 35.4,
|
|
"step": 265,
|
|
"token_acc": 0.9473451928299838,
|
|
"train_speed(iter/s)": 0.136826
|
|
},
|
|
{
|
|
"epoch": 1.7434343434343433,
|
|
"grad_norm": 0.6924517154693604,
|
|
"learning_rate": 3.689491585304491e-06,
|
|
"loss": 0.14744930267333983,
|
|
"memory(GiB)": 35.4,
|
|
"step": 270,
|
|
"token_acc": 0.9449676204236436,
|
|
"train_speed(iter/s)": 0.137097
|
|
},
|
|
{
|
|
"epoch": 1.7757575757575759,
|
|
"grad_norm": 0.6516327857971191,
|
|
"learning_rate": 3.526224127945479e-06,
|
|
"loss": 0.1559753894805908,
|
|
"memory(GiB)": 35.4,
|
|
"step": 275,
|
|
"token_acc": 0.9424614287896532,
|
|
"train_speed(iter/s)": 0.137539
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 0.6028838753700256,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.1387632369995117,
|
|
"memory(GiB)": 35.4,
|
|
"step": 280,
|
|
"token_acc": 0.94836721764672,
|
|
"train_speed(iter/s)": 0.137773
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"eval_loss": 0.2140393853187561,
|
|
"eval_runtime": 4.9027,
|
|
"eval_samples_per_second": 20.397,
|
|
"eval_steps_per_second": 5.099,
|
|
"eval_token_acc": 0.9365639572094724,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.8404040404040405,
|
|
"grad_norm": 0.6998237371444702,
|
|
"learning_rate": 3.204986500762006e-06,
|
|
"loss": 0.1372049331665039,
|
|
"memory(GiB)": 35.4,
|
|
"step": 285,
|
|
"token_acc": 0.9367344291458962,
|
|
"train_speed(iter/s)": 0.136971
|
|
},
|
|
{
|
|
"epoch": 1.8727272727272726,
|
|
"grad_norm": 0.6459011435508728,
|
|
"learning_rate": 3.0473876441260786e-06,
|
|
"loss": 0.14626307487487794,
|
|
"memory(GiB)": 35.4,
|
|
"step": 290,
|
|
"token_acc": 0.9516988818583687,
|
|
"train_speed(iter/s)": 0.137206
|
|
},
|
|
{
|
|
"epoch": 1.905050505050505,
|
|
"grad_norm": 0.7556573748588562,
|
|
"learning_rate": 2.8920457793817507e-06,
|
|
"loss": 0.14524000883102417,
|
|
"memory(GiB)": 35.4,
|
|
"step": 295,
|
|
"token_acc": 0.9526069161871997,
|
|
"train_speed(iter/s)": 0.137397
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"grad_norm": 0.7367635369300842,
|
|
"learning_rate": 2.7391404635865725e-06,
|
|
"loss": 0.15229568481445313,
|
|
"memory(GiB)": 35.4,
|
|
"step": 300,
|
|
"token_acc": 0.9469880723162711,
|
|
"train_speed(iter/s)": 0.137634
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"eval_loss": 0.21289518475532532,
|
|
"eval_runtime": 4.928,
|
|
"eval_samples_per_second": 20.292,
|
|
"eval_steps_per_second": 5.073,
|
|
"eval_token_acc": 0.9364908110085033,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 0.6437100172042847,
|
|
"learning_rate": 2.5888484374320033e-06,
|
|
"loss": 0.1357938528060913,
|
|
"memory(GiB)": 35.4,
|
|
"step": 305,
|
|
"token_acc": 0.9442405000496081,
|
|
"train_speed(iter/s)": 0.136908
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.8284677863121033,
|
|
"learning_rate": 2.4413434209518137e-06,
|
|
"loss": 0.15702880620956422,
|
|
"memory(GiB)": 35.4,
|
|
"step": 310,
|
|
"token_acc": 0.9496246977167803,
|
|
"train_speed(iter/s)": 0.137317
|
|
},
|
|
{
|
|
"epoch": 2.0323232323232325,
|
|
"grad_norm": 0.5962865948677063,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 0.10595057010650635,
|
|
"memory(GiB)": 35.4,
|
|
"step": 315,
|
|
"token_acc": 0.9668544137877333,
|
|
"train_speed(iter/s)": 0.137506
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"grad_norm": 0.659416675567627,
|
|
"learning_rate": 2.1553729927843894e-06,
|
|
"loss": 0.10077614784240722,
|
|
"memory(GiB)": 35.4,
|
|
"step": 320,
|
|
"token_acc": 0.9683417974178226,
|
|
"train_speed(iter/s)": 0.13772
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"eval_loss": 0.22252832353115082,
|
|
"eval_runtime": 4.9207,
|
|
"eval_samples_per_second": 20.322,
|
|
"eval_steps_per_second": 5.081,
|
|
"eval_token_acc": 0.9355033372954192,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.096969696969697,
|
|
"grad_norm": 0.7271497249603271,
|
|
"learning_rate": 2.017238129521506e-06,
|
|
"loss": 0.10290584564208985,
|
|
"memory(GiB)": 35.4,
|
|
"step": 325,
|
|
"token_acc": 0.9533099941945359,
|
|
"train_speed(iter/s)": 0.137044
|
|
},
|
|
{
|
|
"epoch": 2.1292929292929292,
|
|
"grad_norm": 0.8144314885139465,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.10785359144210815,
|
|
"memory(GiB)": 35.4,
|
|
"step": 330,
|
|
"token_acc": 0.9629136358979613,
|
|
"train_speed(iter/s)": 0.137367
|
|
},
|
|
{
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 0.6300661563873291,
|
|
"learning_rate": 1.7514672589449378e-06,
|
|
"loss": 0.0975375771522522,
|
|
"memory(GiB)": 35.4,
|
|
"step": 335,
|
|
"token_acc": 0.9635617067708034,
|
|
"train_speed(iter/s)": 0.137455
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"grad_norm": 0.7295346856117249,
|
|
"learning_rate": 1.6241384517255854e-06,
|
|
"loss": 0.10815587043762206,
|
|
"memory(GiB)": 35.4,
|
|
"step": 340,
|
|
"token_acc": 0.9572717202088938,
|
|
"train_speed(iter/s)": 0.137684
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"eval_loss": 0.2339620590209961,
|
|
"eval_runtime": 4.9038,
|
|
"eval_samples_per_second": 20.392,
|
|
"eval_steps_per_second": 5.098,
|
|
"eval_token_acc": 0.9357044893480845,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.2262626262626264,
|
|
"grad_norm": 0.641219437122345,
|
|
"learning_rate": 1.500711746282192e-06,
|
|
"loss": 0.10359236001968383,
|
|
"memory(GiB)": 35.4,
|
|
"step": 345,
|
|
"token_acc": 0.9507107042642255,
|
|
"train_speed(iter/s)": 0.137115
|
|
},
|
|
{
|
|
"epoch": 2.2585858585858585,
|
|
"grad_norm": 0.671101450920105,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.10023324489593506,
|
|
"memory(GiB)": 35.4,
|
|
"step": 350,
|
|
"token_acc": 0.9665440259416314,
|
|
"train_speed(iter/s)": 0.137251
|
|
},
|
|
{
|
|
"epoch": 2.290909090909091,
|
|
"grad_norm": 0.6462791562080383,
|
|
"learning_rate": 1.2661306328825818e-06,
|
|
"loss": 0.10422945022583008,
|
|
"memory(GiB)": 35.4,
|
|
"step": 355,
|
|
"token_acc": 0.9596824570536179,
|
|
"train_speed(iter/s)": 0.137541
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 0.6095359921455383,
|
|
"learning_rate": 1.1552473733031893e-06,
|
|
"loss": 0.10793395042419433,
|
|
"memory(GiB)": 35.4,
|
|
"step": 360,
|
|
"token_acc": 0.9589668637246134,
|
|
"train_speed(iter/s)": 0.137864
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"eval_loss": 0.2318117916584015,
|
|
"eval_runtime": 4.9037,
|
|
"eval_samples_per_second": 20.393,
|
|
"eval_steps_per_second": 5.098,
|
|
"eval_token_acc": 0.9356496296973575,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.6510639786720276,
|
|
"learning_rate": 1.0488081988375493e-06,
|
|
"loss": 0.10193748474121093,
|
|
"memory(GiB)": 35.4,
|
|
"step": 365,
|
|
"token_acc": 0.9518641538926091,
|
|
"train_speed(iter/s)": 0.137469
|
|
},
|
|
{
|
|
"epoch": 2.3878787878787877,
|
|
"grad_norm": 0.6623317003250122,
|
|
"learning_rate": 9.469361407432431e-07,
|
|
"loss": 0.10095088481903076,
|
|
"memory(GiB)": 35.4,
|
|
"step": 370,
|
|
"token_acc": 0.9654876015836633,
|
|
"train_speed(iter/s)": 0.137592
|
|
},
|
|
{
|
|
"epoch": 2.4202020202020202,
|
|
"grad_norm": 4.409001350402832,
|
|
"learning_rate": 8.497489512245971e-07,
|
|
"loss": 0.11350960731506347,
|
|
"memory(GiB)": 35.4,
|
|
"step": 375,
|
|
"token_acc": 0.9622961334066337,
|
|
"train_speed(iter/s)": 0.137743
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"grad_norm": 0.712914228439331,
|
|
"learning_rate": 7.573589673248833e-07,
|
|
"loss": 0.10257253646850586,
|
|
"memory(GiB)": 35.4,
|
|
"step": 380,
|
|
"token_acc": 0.9649572411487786,
|
|
"train_speed(iter/s)": 0.137953
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"eval_loss": 0.2319175750017166,
|
|
"eval_runtime": 4.9058,
|
|
"eval_samples_per_second": 20.384,
|
|
"eval_steps_per_second": 5.096,
|
|
"eval_token_acc": 0.935923927950992,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 0.5886191129684448,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.09679580926895141,
|
|
"memory(GiB)": 35.4,
|
|
"step": 385,
|
|
"token_acc": 0.9530538972153801,
|
|
"train_speed(iter/s)": 0.137382
|
|
},
|
|
{
|
|
"epoch": 2.517171717171717,
|
|
"grad_norm": 0.648043692111969,
|
|
"learning_rate": 5.873921160683943e-07,
|
|
"loss": 0.10484771728515625,
|
|
"memory(GiB)": 35.4,
|
|
"step": 390,
|
|
"token_acc": 0.9592218765019858,
|
|
"train_speed(iter/s)": 0.137666
|
|
},
|
|
{
|
|
"epoch": 2.5494949494949495,
|
|
"grad_norm": 0.6488398909568787,
|
|
"learning_rate": 5.100117105459279e-07,
|
|
"loss": 0.10683284997940064,
|
|
"memory(GiB)": 35.4,
|
|
"step": 395,
|
|
"token_acc": 0.9655297765422859,
|
|
"train_speed(iter/s)": 0.137785
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"grad_norm": 0.6164813041687012,
|
|
"learning_rate": 4.3782120722406565e-07,
|
|
"loss": 0.09545568823814392,
|
|
"memory(GiB)": 35.4,
|
|
"step": 400,
|
|
"token_acc": 0.964151079309434,
|
|
"train_speed(iter/s)": 0.138017
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"eval_loss": 0.2318730354309082,
|
|
"eval_runtime": 4.9271,
|
|
"eval_samples_per_second": 20.296,
|
|
"eval_steps_per_second": 5.074,
|
|
"eval_token_acc": 0.935795922099296,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.614141414141414,
|
|
"grad_norm": 0.6586740612983704,
|
|
"learning_rate": 3.709040498955102e-07,
|
|
"loss": 0.09898868799209595,
|
|
"memory(GiB)": 35.4,
|
|
"step": 405,
|
|
"token_acc": 0.9504973446851556,
|
|
"train_speed(iter/s)": 0.137464
|
|
},
|
|
{
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 0.7148135304450989,
|
|
"learning_rate": 3.0933758698072023e-07,
|
|
"loss": 0.10967177152633667,
|
|
"memory(GiB)": 35.4,
|
|
"step": 410,
|
|
"token_acc": 0.9612623965810093,
|
|
"train_speed(iter/s)": 0.137658
|
|
},
|
|
{
|
|
"epoch": 2.6787878787878787,
|
|
"grad_norm": 0.694380521774292,
|
|
"learning_rate": 2.531929821221768e-07,
|
|
"loss": 0.11133409738540649,
|
|
"memory(GiB)": 35.4,
|
|
"step": 415,
|
|
"token_acc": 0.9621429914828448,
|
|
"train_speed(iter/s)": 0.137903
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.6905390620231628,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.09970238208770751,
|
|
"memory(GiB)": 35.4,
|
|
"step": 420,
|
|
"token_acc": 0.9661263829919112,
|
|
"train_speed(iter/s)": 0.138101
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"eval_loss": 0.23147569596767426,
|
|
"eval_runtime": 4.9087,
|
|
"eval_samples_per_second": 20.372,
|
|
"eval_steps_per_second": 5.093,
|
|
"eval_token_acc": 0.9359605010514767,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.7434343434343433,
|
|
"grad_norm": 0.7609266042709351,
|
|
"learning_rate": 1.5742259095662126e-07,
|
|
"loss": 0.10753922462463379,
|
|
"memory(GiB)": 35.4,
|
|
"step": 425,
|
|
"token_acc": 0.9499499081649692,
|
|
"train_speed(iter/s)": 0.13755
|
|
},
|
|
{
|
|
"epoch": 2.775757575757576,
|
|
"grad_norm": 0.6554747223854065,
|
|
"learning_rate": 1.1790750403941231e-07,
|
|
"loss": 0.10583784580230712,
|
|
"memory(GiB)": 35.4,
|
|
"step": 430,
|
|
"token_acc": 0.9655550844287033,
|
|
"train_speed(iter/s)": 0.137741
|
|
},
|
|
{
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 0.6224953532218933,
|
|
"learning_rate": 8.403554600248498e-08,
|
|
"loss": 0.09819064140319825,
|
|
"memory(GiB)": 35.4,
|
|
"step": 435,
|
|
"token_acc": 0.9677975086138352,
|
|
"train_speed(iter/s)": 0.137915
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"grad_norm": 0.6320119500160217,
|
|
"learning_rate": 5.584586887435739e-08,
|
|
"loss": 0.10492353439331055,
|
|
"memory(GiB)": 35.4,
|
|
"step": 440,
|
|
"token_acc": 0.964147038991839,
|
|
"train_speed(iter/s)": 0.138113
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"eval_loss": 0.23168671131134033,
|
|
"eval_runtime": 4.9097,
|
|
"eval_samples_per_second": 20.368,
|
|
"eval_steps_per_second": 5.092,
|
|
"eval_token_acc": 0.9358507817500229,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.8727272727272726,
|
|
"grad_norm": 0.6035221219062805,
|
|
"learning_rate": 3.337105663029361e-08,
|
|
"loss": 0.09671036005020142,
|
|
"memory(GiB)": 35.4,
|
|
"step": 445,
|
|
"token_acc": 0.950904653367489,
|
|
"train_speed(iter/s)": 0.137622
|
|
},
|
|
{
|
|
"epoch": 2.905050505050505,
|
|
"grad_norm": 0.640915036201477,
|
|
"learning_rate": 1.6637087529033925e-08,
|
|
"loss": 0.10660991668701172,
|
|
"memory(GiB)": 35.4,
|
|
"step": 450,
|
|
"token_acc": 0.9602548048129798,
|
|
"train_speed(iter/s)": 0.137845
|
|
},
|
|
{
|
|
"epoch": 2.937373737373737,
|
|
"grad_norm": 0.6130684614181519,
|
|
"learning_rate": 5.6633040849601865e-09,
|
|
"loss": 0.09656901955604554,
|
|
"memory(GiB)": 35.4,
|
|
"step": 455,
|
|
"token_acc": 0.9638375350140056,
|
|
"train_speed(iter/s)": 0.137992
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 0.6519441604614258,
|
|
"learning_rate": 4.623907104084335e-10,
|
|
"loss": 0.1028173565864563,
|
|
"memory(GiB)": 35.4,
|
|
"step": 460,
|
|
"token_acc": 0.9615221849669585,
|
|
"train_speed(iter/s)": 0.138235
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"eval_loss": 0.23168207705020905,
|
|
"eval_runtime": 4.9089,
|
|
"eval_samples_per_second": 20.371,
|
|
"eval_steps_per_second": 5.093,
|
|
"eval_token_acc": 0.9358690683002652,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.9826262626262627,
|
|
"eval_loss": 0.23158639669418335,
|
|
"eval_runtime": 4.9243,
|
|
"eval_samples_per_second": 20.307,
|
|
"eval_steps_per_second": 5.077,
|
|
"eval_token_acc": 0.935923927950992,
|
|
"step": 462
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 462,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 5.7147979377043046e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|