Model: waltonfuture/qwen2.5vl-3b-sampled_15000_reflection-cot-32b Source: Original Platform
1750 lines
50 KiB
JSON
1750 lines
50 KiB
JSON
{
|
|
"best_global_step": 460,
|
|
"best_metric": 0.21278653,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v31-20250511-155600/checkpoint-460",
|
|
"epoch": 2.9911123081066524,
|
|
"eval_steps": 20,
|
|
"global_step": 696,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0043091839482897925,
|
|
"grad_norm": 2.5187692642211914,
|
|
"learning_rate": 9.99994906450425e-06,
|
|
"loss": 0.42822888493537903,
|
|
"memory(GiB)": 30.87,
|
|
"step": 1,
|
|
"token_acc": 0.8878183069511356,
|
|
"train_speed(iter/s)": 0.066565
|
|
},
|
|
{
|
|
"epoch": 0.02154591974144896,
|
|
"grad_norm": 1.426959753036499,
|
|
"learning_rate": 9.99872666449397e-06,
|
|
"loss": 0.3501852750778198,
|
|
"memory(GiB)": 30.87,
|
|
"step": 5,
|
|
"token_acc": 0.8919414461038001,
|
|
"train_speed(iter/s)": 0.123514
|
|
},
|
|
{
|
|
"epoch": 0.04309183948289792,
|
|
"grad_norm": 0.964651882648468,
|
|
"learning_rate": 9.994907306529203e-06,
|
|
"loss": 0.2793572902679443,
|
|
"memory(GiB)": 30.87,
|
|
"step": 10,
|
|
"token_acc": 0.9118709677419354,
|
|
"train_speed(iter/s)": 0.140044
|
|
},
|
|
{
|
|
"epoch": 0.06463775922434689,
|
|
"grad_norm": 0.8671960830688477,
|
|
"learning_rate": 9.988543871435342e-06,
|
|
"loss": 0.2740795612335205,
|
|
"memory(GiB)": 30.87,
|
|
"step": 15,
|
|
"token_acc": 0.9092932217932218,
|
|
"train_speed(iter/s)": 0.143802
|
|
},
|
|
{
|
|
"epoch": 0.08618367896579585,
|
|
"grad_norm": 0.8037099838256836,
|
|
"learning_rate": 9.979639600327522e-06,
|
|
"loss": 0.2603166103363037,
|
|
"memory(GiB)": 30.87,
|
|
"step": 20,
|
|
"token_acc": 0.909269693956469,
|
|
"train_speed(iter/s)": 0.146069
|
|
},
|
|
{
|
|
"epoch": 0.08618367896579585,
|
|
"eval_loss": 0.28445571660995483,
|
|
"eval_runtime": 9.1338,
|
|
"eval_samples_per_second": 16.422,
|
|
"eval_steps_per_second": 4.16,
|
|
"eval_token_acc": 0.9113986985450026,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.10772959870724481,
|
|
"grad_norm": 0.7577056884765625,
|
|
"learning_rate": 9.96819902845557e-06,
|
|
"loss": 0.25346100330352783,
|
|
"memory(GiB)": 30.87,
|
|
"step": 25,
|
|
"token_acc": 0.9155251620482054,
|
|
"train_speed(iter/s)": 0.132315
|
|
},
|
|
{
|
|
"epoch": 0.12927551844869378,
|
|
"grad_norm": 0.8333344459533691,
|
|
"learning_rate": 9.954227982894034e-06,
|
|
"loss": 0.2552709341049194,
|
|
"memory(GiB)": 30.87,
|
|
"step": 30,
|
|
"token_acc": 0.9222959574861856,
|
|
"train_speed(iter/s)": 0.136002
|
|
},
|
|
{
|
|
"epoch": 0.15082143819014274,
|
|
"grad_norm": 0.7584787607192993,
|
|
"learning_rate": 9.937733579574263e-06,
|
|
"loss": 0.23784613609313965,
|
|
"memory(GiB)": 30.87,
|
|
"step": 35,
|
|
"token_acc": 0.9168941654498127,
|
|
"train_speed(iter/s)": 0.137854
|
|
},
|
|
{
|
|
"epoch": 0.1723673579315917,
|
|
"grad_norm": 0.771274745464325,
|
|
"learning_rate": 9.918724219660013e-06,
|
|
"loss": 0.23706231117248536,
|
|
"memory(GiB)": 30.87,
|
|
"step": 40,
|
|
"token_acc": 0.9237318428492509,
|
|
"train_speed(iter/s)": 0.14013
|
|
},
|
|
{
|
|
"epoch": 0.1723673579315917,
|
|
"eval_loss": 0.26220738887786865,
|
|
"eval_runtime": 9.0555,
|
|
"eval_samples_per_second": 16.564,
|
|
"eval_steps_per_second": 4.196,
|
|
"eval_token_acc": 0.916019594940411,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.19391327767304067,
|
|
"grad_norm": 0.7550503611564636,
|
|
"learning_rate": 9.897209585268459e-06,
|
|
"loss": 0.2579146146774292,
|
|
"memory(GiB)": 30.87,
|
|
"step": 45,
|
|
"token_acc": 0.9144340126707885,
|
|
"train_speed(iter/s)": 0.132998
|
|
},
|
|
{
|
|
"epoch": 0.21545919741448963,
|
|
"grad_norm": 0.7265065312385559,
|
|
"learning_rate": 9.873200634538746e-06,
|
|
"loss": 0.24367237091064453,
|
|
"memory(GiB)": 30.87,
|
|
"step": 50,
|
|
"token_acc": 0.9259022989915843,
|
|
"train_speed(iter/s)": 0.134517
|
|
},
|
|
{
|
|
"epoch": 0.23700511715593858,
|
|
"grad_norm": 0.7292296290397644,
|
|
"learning_rate": 9.846709596050646e-06,
|
|
"loss": 0.23889431953430176,
|
|
"memory(GiB)": 30.87,
|
|
"step": 55,
|
|
"token_acc": 0.9210317334265112,
|
|
"train_speed(iter/s)": 0.136145
|
|
},
|
|
{
|
|
"epoch": 0.25855103689738757,
|
|
"grad_norm": 0.7953284382820129,
|
|
"learning_rate": 9.817749962596115e-06,
|
|
"loss": 0.23232686519622803,
|
|
"memory(GiB)": 30.87,
|
|
"step": 60,
|
|
"token_acc": 0.9184958572339069,
|
|
"train_speed(iter/s)": 0.137867
|
|
},
|
|
{
|
|
"epoch": 0.25855103689738757,
|
|
"eval_loss": 0.251347154378891,
|
|
"eval_runtime": 9.0788,
|
|
"eval_samples_per_second": 16.522,
|
|
"eval_steps_per_second": 4.186,
|
|
"eval_token_acc": 0.9184031585874095,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.28009695663883655,
|
|
"grad_norm": 0.6928980350494385,
|
|
"learning_rate": 9.786336484306966e-06,
|
|
"loss": 0.24375591278076172,
|
|
"memory(GiB)": 33.45,
|
|
"step": 65,
|
|
"token_acc": 0.9260038800834584,
|
|
"train_speed(iter/s)": 0.134041
|
|
},
|
|
{
|
|
"epoch": 0.3016428763802855,
|
|
"grad_norm": 0.7734121084213257,
|
|
"learning_rate": 9.752485161142103e-06,
|
|
"loss": 0.22954516410827636,
|
|
"memory(GiB)": 33.45,
|
|
"step": 70,
|
|
"token_acc": 0.9301495884406615,
|
|
"train_speed(iter/s)": 0.13516
|
|
},
|
|
{
|
|
"epoch": 0.32318879612173446,
|
|
"grad_norm": 0.6648094654083252,
|
|
"learning_rate": 9.716213234738216e-06,
|
|
"loss": 0.21929600238800048,
|
|
"memory(GiB)": 33.45,
|
|
"step": 75,
|
|
"token_acc": 0.9217300294184204,
|
|
"train_speed(iter/s)": 0.13628
|
|
},
|
|
{
|
|
"epoch": 0.3447347158631834,
|
|
"grad_norm": 0.7500734925270081,
|
|
"learning_rate": 9.677539179628005e-06,
|
|
"loss": 0.2358041524887085,
|
|
"memory(GiB)": 33.45,
|
|
"step": 80,
|
|
"token_acc": 0.9286979011240426,
|
|
"train_speed(iter/s)": 0.137406
|
|
},
|
|
{
|
|
"epoch": 0.3447347158631834,
|
|
"eval_loss": 0.24365545809268951,
|
|
"eval_runtime": 9.052,
|
|
"eval_samples_per_second": 16.571,
|
|
"eval_steps_per_second": 4.198,
|
|
"eval_token_acc": 0.9190319514513416,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.36628063560463237,
|
|
"grad_norm": 0.7007490992546082,
|
|
"learning_rate": 9.636482693830488e-06,
|
|
"loss": 0.23542990684509277,
|
|
"memory(GiB)": 33.45,
|
|
"step": 85,
|
|
"token_acc": 0.9126046231489751,
|
|
"train_speed(iter/s)": 0.133883
|
|
},
|
|
{
|
|
"epoch": 0.38782655534608135,
|
|
"grad_norm": 0.7181985378265381,
|
|
"learning_rate": 9.59306468881811e-06,
|
|
"loss": 0.23636837005615235,
|
|
"memory(GiB)": 33.45,
|
|
"step": 90,
|
|
"token_acc": 0.9158058806435421,
|
|
"train_speed(iter/s)": 0.135756
|
|
},
|
|
{
|
|
"epoch": 0.4093724750875303,
|
|
"grad_norm": 0.7669239044189453,
|
|
"learning_rate": 9.547307278865823e-06,
|
|
"loss": 0.22127339839935303,
|
|
"memory(GiB)": 33.45,
|
|
"step": 95,
|
|
"token_acc": 0.9236630450119938,
|
|
"train_speed(iter/s)": 0.136626
|
|
},
|
|
{
|
|
"epoch": 0.43091839482897926,
|
|
"grad_norm": 0.7955787777900696,
|
|
"learning_rate": 9.499233769787534e-06,
|
|
"loss": 0.2230149030685425,
|
|
"memory(GiB)": 33.45,
|
|
"step": 100,
|
|
"token_acc": 0.9264181091877497,
|
|
"train_speed(iter/s)": 0.137829
|
|
},
|
|
{
|
|
"epoch": 0.43091839482897926,
|
|
"eval_loss": 0.23765011131763458,
|
|
"eval_runtime": 9.0746,
|
|
"eval_samples_per_second": 16.53,
|
|
"eval_steps_per_second": 4.188,
|
|
"eval_token_acc": 0.9208744607735615,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.45246431457042824,
|
|
"grad_norm": 0.7825611233711243,
|
|
"learning_rate": 9.448868647065644e-06,
|
|
"loss": 0.23993771076202391,
|
|
"memory(GiB)": 33.45,
|
|
"step": 105,
|
|
"token_acc": 0.9233408703240906,
|
|
"train_speed(iter/s)": 0.135278
|
|
},
|
|
{
|
|
"epoch": 0.47401023431187717,
|
|
"grad_norm": 0.6205978989601135,
|
|
"learning_rate": 9.396237563379761e-06,
|
|
"loss": 0.2033458471298218,
|
|
"memory(GiB)": 36.07,
|
|
"step": 110,
|
|
"token_acc": 0.9351195748449955,
|
|
"train_speed(iter/s)": 0.135983
|
|
},
|
|
{
|
|
"epoch": 0.49555615405332615,
|
|
"grad_norm": 0.7755696773529053,
|
|
"learning_rate": 9.341367325540921e-06,
|
|
"loss": 0.20325517654418945,
|
|
"memory(GiB)": 36.07,
|
|
"step": 115,
|
|
"token_acc": 0.9325856886666162,
|
|
"train_speed(iter/s)": 0.136589
|
|
},
|
|
{
|
|
"epoch": 0.5171020737947751,
|
|
"grad_norm": 0.6994781494140625,
|
|
"learning_rate": 9.284285880837947e-06,
|
|
"loss": 0.20680899620056153,
|
|
"memory(GiB)": 36.07,
|
|
"step": 120,
|
|
"token_acc": 0.9287609114612856,
|
|
"train_speed(iter/s)": 0.137204
|
|
},
|
|
{
|
|
"epoch": 0.5171020737947751,
|
|
"eval_loss": 0.233298197388649,
|
|
"eval_runtime": 9.1026,
|
|
"eval_samples_per_second": 16.479,
|
|
"eval_steps_per_second": 4.175,
|
|
"eval_token_acc": 0.9223221466695913,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.5386479935362241,
|
|
"grad_norm": 0.586891233921051,
|
|
"learning_rate": 9.225022302802951e-06,
|
|
"loss": 0.20470066070556642,
|
|
"memory(GiB)": 36.07,
|
|
"step": 125,
|
|
"token_acc": 0.923740110451327,
|
|
"train_speed(iter/s)": 0.134553
|
|
},
|
|
{
|
|
"epoch": 0.5601939132776731,
|
|
"grad_norm": 0.7442881464958191,
|
|
"learning_rate": 9.163606776403182e-06,
|
|
"loss": 0.21566917896270751,
|
|
"memory(GiB)": 36.07,
|
|
"step": 130,
|
|
"token_acc": 0.9216363255911278,
|
|
"train_speed(iter/s)": 0.13532
|
|
},
|
|
{
|
|
"epoch": 0.581739833019122,
|
|
"grad_norm": 0.8242325782775879,
|
|
"learning_rate": 9.100070582666796e-06,
|
|
"loss": 0.2127697229385376,
|
|
"memory(GiB)": 36.07,
|
|
"step": 135,
|
|
"token_acc": 0.9278588316706072,
|
|
"train_speed(iter/s)": 0.135954
|
|
},
|
|
{
|
|
"epoch": 0.603285752760571,
|
|
"grad_norm": 0.7717390656471252,
|
|
"learning_rate": 9.034446082750352e-06,
|
|
"loss": 0.22162201404571533,
|
|
"memory(GiB)": 38.74,
|
|
"step": 140,
|
|
"token_acc": 0.9336324292479551,
|
|
"train_speed(iter/s)": 0.136388
|
|
},
|
|
{
|
|
"epoch": 0.603285752760571,
|
|
"eval_loss": 0.2318713515996933,
|
|
"eval_runtime": 9.0797,
|
|
"eval_samples_per_second": 16.52,
|
|
"eval_steps_per_second": 4.185,
|
|
"eval_token_acc": 0.922234408130438,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.6248316725020199,
|
|
"grad_norm": 0.6726309657096863,
|
|
"learning_rate": 8.966766701456177e-06,
|
|
"loss": 0.21126816272735596,
|
|
"memory(GiB)": 38.74,
|
|
"step": 145,
|
|
"token_acc": 0.9254626566202053,
|
|
"train_speed(iter/s)": 0.134385
|
|
},
|
|
{
|
|
"epoch": 0.6463775922434689,
|
|
"grad_norm": 0.6902908086776733,
|
|
"learning_rate": 8.897066910207958e-06,
|
|
"loss": 0.21008939743041993,
|
|
"memory(GiB)": 38.74,
|
|
"step": 150,
|
|
"token_acc": 0.9295461033399943,
|
|
"train_speed(iter/s)": 0.135048
|
|
},
|
|
{
|
|
"epoch": 0.6679235119849178,
|
|
"grad_norm": 0.7515047788619995,
|
|
"learning_rate": 8.825382209493284e-06,
|
|
"loss": 0.22056446075439454,
|
|
"memory(GiB)": 38.74,
|
|
"step": 155,
|
|
"token_acc": 0.925468949189583,
|
|
"train_speed(iter/s)": 0.135565
|
|
},
|
|
{
|
|
"epoch": 0.6894694317263668,
|
|
"grad_norm": 0.831814169883728,
|
|
"learning_rate": 8.751749110782013e-06,
|
|
"loss": 0.20998082160949708,
|
|
"memory(GiB)": 38.74,
|
|
"step": 160,
|
|
"token_acc": 0.9225184377421299,
|
|
"train_speed(iter/s)": 0.136084
|
|
},
|
|
{
|
|
"epoch": 0.6894694317263668,
|
|
"eval_loss": 0.22983159124851227,
|
|
"eval_runtime": 9.0689,
|
|
"eval_samples_per_second": 16.54,
|
|
"eval_steps_per_second": 4.19,
|
|
"eval_token_acc": 0.9240184250932222,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.7110153514678158,
|
|
"grad_norm": 0.7213571667671204,
|
|
"learning_rate": 8.676205117929752e-06,
|
|
"loss": 0.20911731719970703,
|
|
"memory(GiB)": 38.74,
|
|
"step": 165,
|
|
"token_acc": 0.9290636077587924,
|
|
"train_speed(iter/s)": 0.134458
|
|
},
|
|
{
|
|
"epoch": 0.7325612712092647,
|
|
"grad_norm": 0.7172518968582153,
|
|
"learning_rate": 8.598788708075844e-06,
|
|
"loss": 0.20972037315368652,
|
|
"memory(GiB)": 38.74,
|
|
"step": 170,
|
|
"token_acc": 0.920637162143079,
|
|
"train_speed(iter/s)": 0.134813
|
|
},
|
|
{
|
|
"epoch": 0.7541071909507137,
|
|
"grad_norm": 0.7580899596214294,
|
|
"learning_rate": 8.51953931204566e-06,
|
|
"loss": 0.20927505493164061,
|
|
"memory(GiB)": 38.74,
|
|
"step": 175,
|
|
"token_acc": 0.9197037614500098,
|
|
"train_speed(iter/s)": 0.135368
|
|
},
|
|
{
|
|
"epoch": 0.7756531106921627,
|
|
"grad_norm": 0.6354929208755493,
|
|
"learning_rate": 8.438497294267117e-06,
|
|
"loss": 0.19174000024795532,
|
|
"memory(GiB)": 38.74,
|
|
"step": 180,
|
|
"token_acc": 0.9338436037441498,
|
|
"train_speed(iter/s)": 0.13579
|
|
},
|
|
{
|
|
"epoch": 0.7756531106921627,
|
|
"eval_loss": 0.2258211374282837,
|
|
"eval_runtime": 9.0856,
|
|
"eval_samples_per_second": 16.51,
|
|
"eval_steps_per_second": 4.182,
|
|
"eval_token_acc": 0.9245302332382832,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.7971990304336116,
|
|
"grad_norm": 0.6916645169258118,
|
|
"learning_rate": 8.3557039322117e-06,
|
|
"loss": 0.20972118377685547,
|
|
"memory(GiB)": 38.74,
|
|
"step": 185,
|
|
"token_acc": 0.9328983816882928,
|
|
"train_speed(iter/s)": 0.134176
|
|
},
|
|
{
|
|
"epoch": 0.8187449501750605,
|
|
"grad_norm": 0.7086811661720276,
|
|
"learning_rate": 8.27120139537044e-06,
|
|
"loss": 0.20003724098205566,
|
|
"memory(GiB)": 38.74,
|
|
"step": 190,
|
|
"token_acc": 0.9426539918077131,
|
|
"train_speed(iter/s)": 0.13459
|
|
},
|
|
{
|
|
"epoch": 0.8402908699165096,
|
|
"grad_norm": 0.7445757389068604,
|
|
"learning_rate": 8.18503272377554e-06,
|
|
"loss": 0.2096252918243408,
|
|
"memory(GiB)": 38.74,
|
|
"step": 195,
|
|
"token_acc": 0.9330636846696804,
|
|
"train_speed(iter/s)": 0.135216
|
|
},
|
|
{
|
|
"epoch": 0.8618367896579585,
|
|
"grad_norm": 0.679315984249115,
|
|
"learning_rate": 8.097241806078616e-06,
|
|
"loss": 0.20919806957244874,
|
|
"memory(GiB)": 38.74,
|
|
"step": 200,
|
|
"token_acc": 0.9273927392739274,
|
|
"train_speed(iter/s)": 0.135566
|
|
},
|
|
{
|
|
"epoch": 0.8618367896579585,
|
|
"eval_loss": 0.22177766263484955,
|
|
"eval_runtime": 9.08,
|
|
"eval_samples_per_second": 16.52,
|
|
"eval_steps_per_second": 4.185,
|
|
"eval_token_acc": 0.925202895371792,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.8833827093994074,
|
|
"grad_norm": 0.7430039048194885,
|
|
"learning_rate": 8.007873357196716e-06,
|
|
"loss": 0.21405186653137206,
|
|
"memory(GiB)": 38.74,
|
|
"step": 205,
|
|
"token_acc": 0.9314106184693589,
|
|
"train_speed(iter/s)": 0.134435
|
|
},
|
|
{
|
|
"epoch": 0.9049286291408565,
|
|
"grad_norm": 0.781891405582428,
|
|
"learning_rate": 7.916972895537471e-06,
|
|
"loss": 0.21267032623291016,
|
|
"memory(GiB)": 38.74,
|
|
"step": 210,
|
|
"token_acc": 0.9286528119588005,
|
|
"train_speed(iter/s)": 0.13485
|
|
},
|
|
{
|
|
"epoch": 0.9264745488823054,
|
|
"grad_norm": 0.7031259536743164,
|
|
"learning_rate": 7.824586719815019e-06,
|
|
"loss": 0.19673454761505127,
|
|
"memory(GiB)": 38.74,
|
|
"step": 215,
|
|
"token_acc": 0.9280998651727184,
|
|
"train_speed(iter/s)": 0.135252
|
|
},
|
|
{
|
|
"epoch": 0.9480204686237543,
|
|
"grad_norm": 0.6894703507423401,
|
|
"learning_rate": 7.730761885468486e-06,
|
|
"loss": 0.20636224746704102,
|
|
"memory(GiB)": 38.74,
|
|
"step": 220,
|
|
"token_acc": 0.9305924848241609,
|
|
"train_speed(iter/s)": 0.135851
|
|
},
|
|
{
|
|
"epoch": 0.9480204686237543,
|
|
"eval_loss": 0.2209300696849823,
|
|
"eval_runtime": 9.0739,
|
|
"eval_samples_per_second": 16.531,
|
|
"eval_steps_per_second": 4.188,
|
|
"eval_token_acc": 0.9254076186298165,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.9695663883652034,
|
|
"grad_norm": 0.7678598761558533,
|
|
"learning_rate": 7.635546180695039e-06,
|
|
"loss": 0.2145383834838867,
|
|
"memory(GiB)": 38.74,
|
|
"step": 225,
|
|
"token_acc": 0.9279970535878707,
|
|
"train_speed(iter/s)": 0.134797
|
|
},
|
|
{
|
|
"epoch": 0.9911123081066523,
|
|
"grad_norm": 0.6631984710693359,
|
|
"learning_rate": 7.538988102109728e-06,
|
|
"loss": 0.20897607803344725,
|
|
"memory(GiB)": 38.74,
|
|
"step": 230,
|
|
"token_acc": 0.9305944343816894,
|
|
"train_speed(iter/s)": 0.135388
|
|
},
|
|
{
|
|
"epoch": 1.0086183678965797,
|
|
"grad_norm": 0.5616968870162964,
|
|
"learning_rate": 7.441136830044495e-06,
|
|
"loss": 0.1698223114013672,
|
|
"memory(GiB)": 38.74,
|
|
"step": 235,
|
|
"token_acc": 0.9450659366692111,
|
|
"train_speed(iter/s)": 0.136043
|
|
},
|
|
{
|
|
"epoch": 1.0301642876380286,
|
|
"grad_norm": 0.7747679352760315,
|
|
"learning_rate": 7.342042203498952e-06,
|
|
"loss": 0.1522472620010376,
|
|
"memory(GiB)": 38.74,
|
|
"step": 240,
|
|
"token_acc": 0.9446748506967485,
|
|
"train_speed(iter/s)": 0.136458
|
|
},
|
|
{
|
|
"epoch": 1.0301642876380286,
|
|
"eval_loss": 0.22592027485370636,
|
|
"eval_runtime": 9.0855,
|
|
"eval_samples_per_second": 16.51,
|
|
"eval_steps_per_second": 4.183,
|
|
"eval_token_acc": 0.9261680193024786,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.0517102073794775,
|
|
"grad_norm": 0.7509682178497314,
|
|
"learning_rate": 7.241754694755674e-06,
|
|
"loss": 0.14929369688034058,
|
|
"memory(GiB)": 38.74,
|
|
"step": 245,
|
|
"token_acc": 0.9417605130483494,
|
|
"train_speed(iter/s)": 0.135402
|
|
},
|
|
{
|
|
"epoch": 1.0732561271209264,
|
|
"grad_norm": 0.7504904270172119,
|
|
"learning_rate": 7.140325383672938e-06,
|
|
"loss": 0.1446376323699951,
|
|
"memory(GiB)": 38.74,
|
|
"step": 250,
|
|
"token_acc": 0.9489984514711024,
|
|
"train_speed(iter/s)": 0.135808
|
|
},
|
|
{
|
|
"epoch": 1.0948020468623754,
|
|
"grad_norm": 0.6855825781822205,
|
|
"learning_rate": 7.037805931668006e-06,
|
|
"loss": 0.14796760082244872,
|
|
"memory(GiB)": 38.74,
|
|
"step": 255,
|
|
"token_acc": 0.9450119000396668,
|
|
"train_speed(iter/s)": 0.136065
|
|
},
|
|
{
|
|
"epoch": 1.1163479666038243,
|
|
"grad_norm": 0.6018539071083069,
|
|
"learning_rate": 6.934248555404197e-06,
|
|
"loss": 0.14233092069625855,
|
|
"memory(GiB)": 38.74,
|
|
"step": 260,
|
|
"token_acc": 0.9503198086030955,
|
|
"train_speed(iter/s)": 0.13639
|
|
},
|
|
{
|
|
"epoch": 1.1163479666038243,
|
|
"eval_loss": 0.22484588623046875,
|
|
"eval_runtime": 9.0958,
|
|
"eval_samples_per_second": 16.491,
|
|
"eval_steps_per_second": 4.178,
|
|
"eval_token_acc": 0.9261680193024786,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.1378938863452734,
|
|
"grad_norm": 0.7431089878082275,
|
|
"learning_rate": 6.8297060001951545e-06,
|
|
"loss": 0.1508152961730957,
|
|
"memory(GiB)": 38.74,
|
|
"step": 265,
|
|
"token_acc": 0.9396280295818994,
|
|
"train_speed(iter/s)": 0.135576
|
|
},
|
|
{
|
|
"epoch": 1.1594398060867224,
|
|
"grad_norm": 0.6764137148857117,
|
|
"learning_rate": 6.724231513139853e-06,
|
|
"loss": 0.1467280149459839,
|
|
"memory(GiB)": 38.74,
|
|
"step": 270,
|
|
"token_acc": 0.9448425970165101,
|
|
"train_speed(iter/s)": 0.135807
|
|
},
|
|
{
|
|
"epoch": 1.1809857258281713,
|
|
"grad_norm": 0.7039455771446228,
|
|
"learning_rate": 6.617878816002032e-06,
|
|
"loss": 0.14175877571105958,
|
|
"memory(GiB)": 38.74,
|
|
"step": 275,
|
|
"token_acc": 0.955692078562785,
|
|
"train_speed(iter/s)": 0.136098
|
|
},
|
|
{
|
|
"epoch": 1.2025316455696202,
|
|
"grad_norm": 0.742364227771759,
|
|
"learning_rate": 6.510702077847864e-06,
|
|
"loss": 0.14723964929580688,
|
|
"memory(GiB)": 38.74,
|
|
"step": 280,
|
|
"token_acc": 0.947806605915655,
|
|
"train_speed(iter/s)": 0.136459
|
|
},
|
|
{
|
|
"epoch": 1.2025316455696202,
|
|
"eval_loss": 0.22613751888275146,
|
|
"eval_runtime": 9.0885,
|
|
"eval_samples_per_second": 16.504,
|
|
"eval_steps_per_second": 4.181,
|
|
"eval_token_acc": 0.9258609344154419,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.2240775653110691,
|
|
"grad_norm": 0.5859317183494568,
|
|
"learning_rate": 6.402755887455792e-06,
|
|
"loss": 0.14544841051101684,
|
|
"memory(GiB)": 38.74,
|
|
"step": 285,
|
|
"token_acc": 0.9466956804824975,
|
|
"train_speed(iter/s)": 0.135483
|
|
},
|
|
{
|
|
"epoch": 1.2456234850525183,
|
|
"grad_norm": 0.7057396769523621,
|
|
"learning_rate": 6.294095225512604e-06,
|
|
"loss": 0.1524769902229309,
|
|
"memory(GiB)": 38.74,
|
|
"step": 290,
|
|
"token_acc": 0.9396096068249258,
|
|
"train_speed(iter/s)": 0.13576
|
|
},
|
|
{
|
|
"epoch": 1.2671694047939672,
|
|
"grad_norm": 0.782086193561554,
|
|
"learning_rate": 6.184775436609885e-06,
|
|
"loss": 0.14989967346191407,
|
|
"memory(GiB)": 38.74,
|
|
"step": 295,
|
|
"token_acc": 0.9496643404887627,
|
|
"train_speed(iter/s)": 0.136136
|
|
},
|
|
{
|
|
"epoch": 1.2887153245354162,
|
|
"grad_norm": 0.8077779412269592,
|
|
"learning_rate": 6.074852201055121e-06,
|
|
"loss": 0.1529999017715454,
|
|
"memory(GiB)": 38.74,
|
|
"step": 300,
|
|
"token_acc": 0.9429197198712922,
|
|
"train_speed(iter/s)": 0.136527
|
|
},
|
|
{
|
|
"epoch": 1.2887153245354162,
|
|
"eval_loss": 0.22358979284763336,
|
|
"eval_runtime": 9.0841,
|
|
"eval_samples_per_second": 16.512,
|
|
"eval_steps_per_second": 4.183,
|
|
"eval_token_acc": 0.9260217884038897,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.310261244276865,
|
|
"grad_norm": 0.7263866066932678,
|
|
"learning_rate": 5.964381506511823e-06,
|
|
"loss": 0.1550525903701782,
|
|
"memory(GiB)": 38.74,
|
|
"step": 305,
|
|
"token_acc": 0.9437107599047413,
|
|
"train_speed(iter/s)": 0.135779
|
|
},
|
|
{
|
|
"epoch": 1.331807164018314,
|
|
"grad_norm": 0.667982816696167,
|
|
"learning_rate": 5.853419619483083e-06,
|
|
"loss": 0.1465543746948242,
|
|
"memory(GiB)": 38.74,
|
|
"step": 310,
|
|
"token_acc": 0.9492003131640756,
|
|
"train_speed(iter/s)": 0.13602
|
|
},
|
|
{
|
|
"epoch": 1.353353083759763,
|
|
"grad_norm": 0.6755979061126709,
|
|
"learning_rate": 5.742023056653131e-06,
|
|
"loss": 0.1486139178276062,
|
|
"memory(GiB)": 38.74,
|
|
"step": 315,
|
|
"token_acc": 0.953510881680061,
|
|
"train_speed(iter/s)": 0.136273
|
|
},
|
|
{
|
|
"epoch": 1.3748990035012119,
|
|
"grad_norm": 0.7418352365493774,
|
|
"learning_rate": 5.630248556101448e-06,
|
|
"loss": 0.14667509794235228,
|
|
"memory(GiB)": 38.74,
|
|
"step": 320,
|
|
"token_acc": 0.9451512649929588,
|
|
"train_speed(iter/s)": 0.136561
|
|
},
|
|
{
|
|
"epoch": 1.3748990035012119,
|
|
"eval_loss": 0.2228708267211914,
|
|
"eval_runtime": 9.0781,
|
|
"eval_samples_per_second": 16.523,
|
|
"eval_steps_per_second": 4.186,
|
|
"eval_token_acc": 0.926899173795423,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.3964449232426608,
|
|
"grad_norm": 0.7073500752449036,
|
|
"learning_rate": 5.51815304840412e-06,
|
|
"loss": 0.14506160020828246,
|
|
"memory(GiB)": 38.74,
|
|
"step": 325,
|
|
"token_acc": 0.9425411230856494,
|
|
"train_speed(iter/s)": 0.13582
|
|
},
|
|
{
|
|
"epoch": 1.41799084298411,
|
|
"grad_norm": 0.6103145480155945,
|
|
"learning_rate": 5.405793627637157e-06,
|
|
"loss": 0.1493854284286499,
|
|
"memory(GiB)": 38.74,
|
|
"step": 330,
|
|
"token_acc": 0.9529187644577494,
|
|
"train_speed(iter/s)": 0.13604
|
|
},
|
|
{
|
|
"epoch": 1.4395367627255589,
|
|
"grad_norm": 3.6716339588165283,
|
|
"learning_rate": 5.293227522296517e-06,
|
|
"loss": 0.15280224084854127,
|
|
"memory(GiB)": 38.74,
|
|
"step": 335,
|
|
"token_acc": 0.9547047104353202,
|
|
"train_speed(iter/s)": 0.136231
|
|
},
|
|
{
|
|
"epoch": 1.4610826824670078,
|
|
"grad_norm": 0.6979946494102478,
|
|
"learning_rate": 5.180512066149682e-06,
|
|
"loss": 0.1544776201248169,
|
|
"memory(GiB)": 38.74,
|
|
"step": 340,
|
|
"token_acc": 0.939196952288807,
|
|
"train_speed(iter/s)": 0.136511
|
|
},
|
|
{
|
|
"epoch": 1.4610826824670078,
|
|
"eval_loss": 0.2203603982925415,
|
|
"eval_runtime": 9.0826,
|
|
"eval_samples_per_second": 16.515,
|
|
"eval_steps_per_second": 4.184,
|
|
"eval_token_acc": 0.9274694742999195,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.4826286022084567,
|
|
"grad_norm": 0.7076512575149536,
|
|
"learning_rate": 5.06770466903361e-06,
|
|
"loss": 0.14294663667678834,
|
|
"memory(GiB)": 38.74,
|
|
"step": 345,
|
|
"token_acc": 0.9427494854259345,
|
|
"train_speed(iter/s)": 0.135716
|
|
},
|
|
{
|
|
"epoch": 1.5041745219499059,
|
|
"grad_norm": 0.7397769093513489,
|
|
"learning_rate": 4.954862787613937e-06,
|
|
"loss": 0.1430816411972046,
|
|
"memory(GiB)": 38.74,
|
|
"step": 350,
|
|
"token_acc": 0.9418471128608924,
|
|
"train_speed(iter/s)": 0.135954
|
|
},
|
|
{
|
|
"epoch": 1.5257204416913548,
|
|
"grad_norm": 0.7260046005249023,
|
|
"learning_rate": 4.842043896120332e-06,
|
|
"loss": 0.14312554597854615,
|
|
"memory(GiB)": 38.74,
|
|
"step": 355,
|
|
"token_acc": 0.9532552240608769,
|
|
"train_speed(iter/s)": 0.136211
|
|
},
|
|
{
|
|
"epoch": 1.5472663614328037,
|
|
"grad_norm": 0.7071496248245239,
|
|
"learning_rate": 4.729305457072913e-06,
|
|
"loss": 0.15508384704589845,
|
|
"memory(GiB)": 38.74,
|
|
"step": 360,
|
|
"token_acc": 0.9442986367690152,
|
|
"train_speed(iter/s)": 0.136516
|
|
},
|
|
{
|
|
"epoch": 1.5472663614328037,
|
|
"eval_loss": 0.21864531934261322,
|
|
"eval_runtime": 9.0916,
|
|
"eval_samples_per_second": 16.499,
|
|
"eval_steps_per_second": 4.18,
|
|
"eval_token_acc": 0.9273086203114718,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.5688122811742526,
|
|
"grad_norm": 0.7608581781387329,
|
|
"learning_rate": 4.616704892014613e-06,
|
|
"loss": 0.14767109155654906,
|
|
"memory(GiB)": 38.74,
|
|
"step": 365,
|
|
"token_acc": 0.9417986303251507,
|
|
"train_speed(iter/s)": 0.135816
|
|
},
|
|
{
|
|
"epoch": 1.5903582009157016,
|
|
"grad_norm": 0.6775366067886353,
|
|
"learning_rate": 4.504299552264428e-06,
|
|
"loss": 0.14293992519378662,
|
|
"memory(GiB)": 38.74,
|
|
"step": 370,
|
|
"token_acc": 0.9508605933815139,
|
|
"train_speed(iter/s)": 0.13597
|
|
},
|
|
{
|
|
"epoch": 1.6119041206571505,
|
|
"grad_norm": 0.7780856490135193,
|
|
"learning_rate": 4.392146689706426e-06,
|
|
"loss": 0.14917342662811278,
|
|
"memory(GiB)": 38.74,
|
|
"step": 375,
|
|
"token_acc": 0.9454742841633872,
|
|
"train_speed(iter/s)": 0.136293
|
|
},
|
|
{
|
|
"epoch": 1.6334500403985994,
|
|
"grad_norm": 0.7278069853782654,
|
|
"learning_rate": 4.280303427629404e-06,
|
|
"loss": 0.15140265226364136,
|
|
"memory(GiB)": 38.74,
|
|
"step": 380,
|
|
"token_acc": 0.948611652106171,
|
|
"train_speed(iter/s)": 0.136576
|
|
},
|
|
{
|
|
"epoch": 1.6334500403985994,
|
|
"eval_loss": 0.21836893260478973,
|
|
"eval_runtime": 9.0809,
|
|
"eval_samples_per_second": 16.518,
|
|
"eval_steps_per_second": 4.185,
|
|
"eval_token_acc": 0.9278204284565329,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.6549959601400483,
|
|
"grad_norm": 0.7161378860473633,
|
|
"learning_rate": 4.168826731632052e-06,
|
|
"loss": 0.13667253255844117,
|
|
"memory(GiB)": 38.74,
|
|
"step": 385,
|
|
"token_acc": 0.9480358075613754,
|
|
"train_speed(iter/s)": 0.135909
|
|
},
|
|
{
|
|
"epoch": 1.6765418798814973,
|
|
"grad_norm": 0.7765911221504211,
|
|
"learning_rate": 4.057773380608411e-06,
|
|
"loss": 0.1545323610305786,
|
|
"memory(GiB)": 38.74,
|
|
"step": 390,
|
|
"token_acc": 0.9548359404807325,
|
|
"train_speed(iter/s)": 0.136245
|
|
},
|
|
{
|
|
"epoch": 1.6980877996229464,
|
|
"grad_norm": 0.7643768787384033,
|
|
"learning_rate": 3.947199937828447e-06,
|
|
"loss": 0.15748288631439208,
|
|
"memory(GiB)": 38.74,
|
|
"step": 395,
|
|
"token_acc": 0.9471460397941154,
|
|
"train_speed(iter/s)": 0.136566
|
|
},
|
|
{
|
|
"epoch": 1.7196337193643954,
|
|
"grad_norm": 0.709528386592865,
|
|
"learning_rate": 3.8371627221284495e-06,
|
|
"loss": 0.1410720705986023,
|
|
"memory(GiB)": 38.74,
|
|
"step": 400,
|
|
"token_acc": 0.9531430538141947,
|
|
"train_speed(iter/s)": 0.136763
|
|
},
|
|
{
|
|
"epoch": 1.7196337193643954,
|
|
"eval_loss": 0.21615047752857208,
|
|
"eval_runtime": 9.087,
|
|
"eval_samples_per_second": 16.507,
|
|
"eval_steps_per_second": 4.182,
|
|
"eval_token_acc": 0.9283322366015939,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.7411796391058443,
|
|
"grad_norm": 0.6649565696716309,
|
|
"learning_rate": 3.727717779225912e-06,
|
|
"loss": 0.14372719526290895,
|
|
"memory(GiB)": 38.74,
|
|
"step": 405,
|
|
"token_acc": 0.9476192800091097,
|
|
"train_speed(iter/s)": 0.136115
|
|
},
|
|
{
|
|
"epoch": 1.7627255588472934,
|
|
"grad_norm": 0.7958775162696838,
|
|
"learning_rate": 3.6189208531735354e-06,
|
|
"loss": 0.15733466148376465,
|
|
"memory(GiB)": 38.74,
|
|
"step": 410,
|
|
"token_acc": 0.9438738194422361,
|
|
"train_speed(iter/s)": 0.136439
|
|
},
|
|
{
|
|
"epoch": 1.7842714785887424,
|
|
"grad_norm": 0.6333225965499878,
|
|
"learning_rate": 3.510827357966876e-06,
|
|
"loss": 0.13851017951965333,
|
|
"memory(GiB)": 38.74,
|
|
"step": 415,
|
|
"token_acc": 0.957043632295357,
|
|
"train_speed(iter/s)": 0.136694
|
|
},
|
|
{
|
|
"epoch": 1.8058173983301913,
|
|
"grad_norm": 0.633067786693573,
|
|
"learning_rate": 3.403492349320101e-06,
|
|
"loss": 0.13664473295211793,
|
|
"memory(GiB)": 38.74,
|
|
"step": 420,
|
|
"token_acc": 0.9495724621793467,
|
|
"train_speed(iter/s)": 0.136827
|
|
},
|
|
{
|
|
"epoch": 1.8058173983301913,
|
|
"eval_loss": 0.21541745960712433,
|
|
"eval_runtime": 9.085,
|
|
"eval_samples_per_second": 16.511,
|
|
"eval_steps_per_second": 4.183,
|
|
"eval_token_acc": 0.9283614827813117,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.8273633180716402,
|
|
"grad_norm": 0.678266704082489,
|
|
"learning_rate": 3.29697049662423e-06,
|
|
"loss": 0.13364578485488893,
|
|
"memory(GiB)": 38.74,
|
|
"step": 425,
|
|
"token_acc": 0.9482191561046045,
|
|
"train_speed(iter/s)": 0.136105
|
|
},
|
|
{
|
|
"epoch": 1.8489092378130891,
|
|
"grad_norm": 0.65605229139328,
|
|
"learning_rate": 3.191316055102146e-06,
|
|
"loss": 0.14047093391418458,
|
|
"memory(GiB)": 38.74,
|
|
"step": 430,
|
|
"token_acc": 0.9487227952692121,
|
|
"train_speed(iter/s)": 0.1363
|
|
},
|
|
{
|
|
"epoch": 1.870455157554538,
|
|
"grad_norm": 0.7330144643783569,
|
|
"learning_rate": 3.0865828381745515e-06,
|
|
"loss": 0.14013464450836183,
|
|
"memory(GiB)": 38.74,
|
|
"step": 435,
|
|
"token_acc": 0.950079521447053,
|
|
"train_speed(iter/s)": 0.136475
|
|
},
|
|
{
|
|
"epoch": 1.892001077295987,
|
|
"grad_norm": 0.6185954213142395,
|
|
"learning_rate": 2.982824190050958e-06,
|
|
"loss": 0.15014538764953614,
|
|
"memory(GiB)": 38.74,
|
|
"step": 440,
|
|
"token_acc": 0.9505330365510778,
|
|
"train_speed(iter/s)": 0.136687
|
|
},
|
|
{
|
|
"epoch": 1.892001077295987,
|
|
"eval_loss": 0.2138843536376953,
|
|
"eval_runtime": 9.078,
|
|
"eval_samples_per_second": 16.523,
|
|
"eval_steps_per_second": 4.186,
|
|
"eval_token_acc": 0.9287709292973605,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.913546997037436,
|
|
"grad_norm": 0.6207900643348694,
|
|
"learning_rate": 2.8800929585596506e-06,
|
|
"loss": 0.13897337913513183,
|
|
"memory(GiB)": 38.74,
|
|
"step": 445,
|
|
"token_acc": 0.9472243016429612,
|
|
"train_speed(iter/s)": 0.136077
|
|
},
|
|
{
|
|
"epoch": 1.9350929167788848,
|
|
"grad_norm": 0.6200534701347351,
|
|
"learning_rate": 2.778441468230483e-06,
|
|
"loss": 0.13360581398010254,
|
|
"memory(GiB)": 38.74,
|
|
"step": 450,
|
|
"token_acc": 0.9528330952581016,
|
|
"train_speed(iter/s)": 0.136238
|
|
},
|
|
{
|
|
"epoch": 1.956638836520334,
|
|
"grad_norm": 0.6887915134429932,
|
|
"learning_rate": 2.6779214936442056e-06,
|
|
"loss": 0.14459173679351806,
|
|
"memory(GiB)": 38.74,
|
|
"step": 455,
|
|
"token_acc": 0.9435832116561559,
|
|
"train_speed(iter/s)": 0.136419
|
|
},
|
|
{
|
|
"epoch": 1.978184756261783,
|
|
"grad_norm": 0.6248582005500793,
|
|
"learning_rate": 2.5785842330619038e-06,
|
|
"loss": 0.13852910995483397,
|
|
"memory(GiB)": 38.74,
|
|
"step": 460,
|
|
"token_acc": 0.9469382100304794,
|
|
"train_speed(iter/s)": 0.136568
|
|
},
|
|
{
|
|
"epoch": 1.978184756261783,
|
|
"eval_loss": 0.21278652548789978,
|
|
"eval_runtime": 9.077,
|
|
"eval_samples_per_second": 16.525,
|
|
"eval_steps_per_second": 4.186,
|
|
"eval_token_acc": 0.9287855523872194,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.9997306760032318,
|
|
"grad_norm": 0.659483790397644,
|
|
"learning_rate": 2.480480282347961e-06,
|
|
"loss": 0.14010127782821655,
|
|
"memory(GiB)": 38.74,
|
|
"step": 465,
|
|
"token_acc": 0.9450760066893402,
|
|
"train_speed(iter/s)": 0.136067
|
|
},
|
|
{
|
|
"epoch": 2.0172367357931593,
|
|
"grad_norm": 0.629996120929718,
|
|
"learning_rate": 2.383659609199873e-06,
|
|
"loss": 0.12061877250671386,
|
|
"memory(GiB)": 38.74,
|
|
"step": 470,
|
|
"token_acc": 0.954493670886076,
|
|
"train_speed(iter/s)": 0.136512
|
|
},
|
|
{
|
|
"epoch": 2.0387826555346082,
|
|
"grad_norm": 0.6116938591003418,
|
|
"learning_rate": 2.2881715276979705e-06,
|
|
"loss": 0.10172913074493409,
|
|
"memory(GiB)": 38.74,
|
|
"step": 475,
|
|
"token_acc": 0.9618814806855424,
|
|
"train_speed(iter/s)": 0.136622
|
|
},
|
|
{
|
|
"epoch": 2.060328575276057,
|
|
"grad_norm": 0.6483604311943054,
|
|
"learning_rate": 2.1940646731880887e-06,
|
|
"loss": 0.11099107265472412,
|
|
"memory(GiB)": 38.74,
|
|
"step": 480,
|
|
"token_acc": 0.9685138845023201,
|
|
"train_speed(iter/s)": 0.136775
|
|
},
|
|
{
|
|
"epoch": 2.060328575276057,
|
|
"eval_loss": 0.2272883951663971,
|
|
"eval_runtime": 9.0828,
|
|
"eval_samples_per_second": 16.515,
|
|
"eval_steps_per_second": 4.184,
|
|
"eval_token_acc": 0.928054397894275,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.081874495017506,
|
|
"grad_norm": 0.6483248472213745,
|
|
"learning_rate": 2.101386977509907e-06,
|
|
"loss": 0.11337897777557374,
|
|
"memory(GiB)": 38.74,
|
|
"step": 485,
|
|
"token_acc": 0.9550344662708691,
|
|
"train_speed(iter/s)": 0.136304
|
|
},
|
|
{
|
|
"epoch": 2.103420414758955,
|
|
"grad_norm": 0.7392867803573608,
|
|
"learning_rate": 2.010185644583641e-06,
|
|
"loss": 0.10771691799163818,
|
|
"memory(GiB)": 38.74,
|
|
"step": 490,
|
|
"token_acc": 0.9670560877027851,
|
|
"train_speed(iter/s)": 0.13646
|
|
},
|
|
{
|
|
"epoch": 2.124966334500404,
|
|
"grad_norm": 0.682255744934082,
|
|
"learning_rate": 1.920507126367448e-06,
|
|
"loss": 0.10148389339447021,
|
|
"memory(GiB)": 38.74,
|
|
"step": 495,
|
|
"token_acc": 0.9608224142005619,
|
|
"train_speed(iter/s)": 0.136614
|
|
},
|
|
{
|
|
"epoch": 2.146512254241853,
|
|
"grad_norm": 0.6227522492408752,
|
|
"learning_rate": 1.8323970991978823e-06,
|
|
"loss": 0.09187655448913574,
|
|
"memory(GiB)": 38.74,
|
|
"step": 500,
|
|
"token_acc": 0.9645073073190814,
|
|
"train_speed(iter/s)": 0.136742
|
|
},
|
|
{
|
|
"epoch": 2.146512254241853,
|
|
"eval_loss": 0.22719649970531464,
|
|
"eval_runtime": 9.0834,
|
|
"eval_samples_per_second": 16.514,
|
|
"eval_steps_per_second": 4.183,
|
|
"eval_token_acc": 0.9283029904218761,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.168058173983302,
|
|
"grad_norm": 0.6224367022514343,
|
|
"learning_rate": 1.7459004405253544e-06,
|
|
"loss": 0.09940274357795716,
|
|
"memory(GiB)": 38.74,
|
|
"step": 505,
|
|
"token_acc": 0.960183380261353,
|
|
"train_speed(iter/s)": 0.136199
|
|
},
|
|
{
|
|
"epoch": 2.1896040937247507,
|
|
"grad_norm": 0.6162160038948059,
|
|
"learning_rate": 1.6610612060565235e-06,
|
|
"loss": 0.09446293711662293,
|
|
"memory(GiB)": 38.74,
|
|
"step": 510,
|
|
"token_acc": 0.9673866334954201,
|
|
"train_speed(iter/s)": 0.136387
|
|
},
|
|
{
|
|
"epoch": 2.2111500134661997,
|
|
"grad_norm": 0.7308635115623474,
|
|
"learning_rate": 1.5779226073152071e-06,
|
|
"loss": 0.10239348411560059,
|
|
"memory(GiB)": 38.74,
|
|
"step": 515,
|
|
"token_acc": 0.9644728473210771,
|
|
"train_speed(iter/s)": 0.136648
|
|
},
|
|
{
|
|
"epoch": 2.2326959332076486,
|
|
"grad_norm": 0.63414067029953,
|
|
"learning_rate": 1.4965269896332884e-06,
|
|
"loss": 0.10488066673278809,
|
|
"memory(GiB)": 38.74,
|
|
"step": 520,
|
|
"token_acc": 0.9668579943705502,
|
|
"train_speed(iter/s)": 0.136842
|
|
},
|
|
{
|
|
"epoch": 2.2326959332076486,
|
|
"eval_loss": 0.22872595489025116,
|
|
"eval_runtime": 9.0808,
|
|
"eval_samples_per_second": 16.518,
|
|
"eval_steps_per_second": 4.185,
|
|
"eval_token_acc": 0.9286393214886306,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.254241852949098,
|
|
"grad_norm": 0.680652916431427,
|
|
"learning_rate": 1.4169158105827768e-06,
|
|
"loss": 0.10521303415298462,
|
|
"memory(GiB)": 38.74,
|
|
"step": 525,
|
|
"token_acc": 0.9544280515008022,
|
|
"train_speed(iter/s)": 0.136411
|
|
},
|
|
{
|
|
"epoch": 2.275787772690547,
|
|
"grad_norm": 0.6307674050331116,
|
|
"learning_rate": 1.3391296188600594e-06,
|
|
"loss": 0.09917184710502625,
|
|
"memory(GiB)": 38.74,
|
|
"step": 530,
|
|
"token_acc": 0.9651942551516304,
|
|
"train_speed(iter/s)": 0.136524
|
|
},
|
|
{
|
|
"epoch": 2.297333692431996,
|
|
"grad_norm": 0.6577898859977722,
|
|
"learning_rate": 1.2632080336330532e-06,
|
|
"loss": 0.10673871040344238,
|
|
"memory(GiB)": 38.74,
|
|
"step": 535,
|
|
"token_acc": 0.9639854466787945,
|
|
"train_speed(iter/s)": 0.136691
|
|
},
|
|
{
|
|
"epoch": 2.3188796121734447,
|
|
"grad_norm": 0.5732426643371582,
|
|
"learning_rate": 1.1891897243618184e-06,
|
|
"loss": 0.1005368709564209,
|
|
"memory(GiB)": 38.74,
|
|
"step": 540,
|
|
"token_acc": 0.961928605343808,
|
|
"train_speed(iter/s)": 0.136843
|
|
},
|
|
{
|
|
"epoch": 2.3188796121734447,
|
|
"eval_loss": 0.22874796390533447,
|
|
"eval_runtime": 9.0783,
|
|
"eval_samples_per_second": 16.523,
|
|
"eval_steps_per_second": 4.186,
|
|
"eval_token_acc": 0.9285223367697595,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.3404255319148937,
|
|
"grad_norm": 0.6907266974449158,
|
|
"learning_rate": 1.1171123911028692e-06,
|
|
"loss": 0.10030966997146606,
|
|
"memory(GiB)": 38.74,
|
|
"step": 545,
|
|
"token_acc": 0.9541099053336412,
|
|
"train_speed(iter/s)": 0.136383
|
|
},
|
|
{
|
|
"epoch": 2.3619714516563426,
|
|
"grad_norm": 0.7398902773857117,
|
|
"learning_rate": 1.047012745307255e-06,
|
|
"loss": 0.10119664669036865,
|
|
"memory(GiB)": 38.74,
|
|
"step": 550,
|
|
"token_acc": 0.9620670073821692,
|
|
"train_speed(iter/s)": 0.136525
|
|
},
|
|
{
|
|
"epoch": 2.3835173713977915,
|
|
"grad_norm": 0.7038416862487793,
|
|
"learning_rate": 9.789264911221546e-07,
|
|
"loss": 0.10580694675445557,
|
|
"memory(GiB)": 38.74,
|
|
"step": 555,
|
|
"token_acc": 0.9612262521588947,
|
|
"train_speed(iter/s)": 0.136738
|
|
},
|
|
{
|
|
"epoch": 2.4050632911392404,
|
|
"grad_norm": 0.567625105381012,
|
|
"learning_rate": 9.128883072055411e-07,
|
|
"loss": 0.10453232526779174,
|
|
"memory(GiB)": 38.74,
|
|
"step": 560,
|
|
"token_acc": 0.9599795204368974,
|
|
"train_speed(iter/s)": 0.136996
|
|
},
|
|
{
|
|
"epoch": 2.4050632911392404,
|
|
"eval_loss": 0.22811782360076904,
|
|
"eval_runtime": 9.0761,
|
|
"eval_samples_per_second": 16.527,
|
|
"eval_steps_per_second": 4.187,
|
|
"eval_token_acc": 0.9284638444103239,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.4266092108806894,
|
|
"grad_norm": 0.6157174706459045,
|
|
"learning_rate": 8.489318290631454e-07,
|
|
"loss": 0.10170652866363525,
|
|
"memory(GiB)": 38.74,
|
|
"step": 565,
|
|
"token_acc": 0.9582807980419787,
|
|
"train_speed(iter/s)": 0.136587
|
|
},
|
|
{
|
|
"epoch": 2.4481551306221383,
|
|
"grad_norm": 0.6362963914871216,
|
|
"learning_rate": 7.870896319167548e-07,
|
|
"loss": 0.09682157635688782,
|
|
"memory(GiB)": 38.74,
|
|
"step": 570,
|
|
"token_acc": 0.9683932052353105,
|
|
"train_speed(iter/s)": 0.136707
|
|
},
|
|
{
|
|
"epoch": 2.4697010503635872,
|
|
"grad_norm": 0.723169207572937,
|
|
"learning_rate": 7.273932141125256e-07,
|
|
"loss": 0.10265512466430664,
|
|
"memory(GiB)": 38.74,
|
|
"step": 575,
|
|
"token_acc": 0.9611537977075199,
|
|
"train_speed(iter/s)": 0.136865
|
|
},
|
|
{
|
|
"epoch": 2.4912469701050366,
|
|
"grad_norm": 0.729612410068512,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.10810785293579102,
|
|
"memory(GiB)": 38.74,
|
|
"step": 580,
|
|
"token_acc": 0.9597445638994423,
|
|
"train_speed(iter/s)": 0.136977
|
|
},
|
|
{
|
|
"epoch": 2.4912469701050366,
|
|
"eval_loss": 0.23019996285438538,
|
|
"eval_runtime": 9.0825,
|
|
"eval_samples_per_second": 16.515,
|
|
"eval_steps_per_second": 4.184,
|
|
"eval_token_acc": 0.9287124369379249,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.5127928898464855,
|
|
"grad_norm": 0.5513563752174377,
|
|
"learning_rate": 6.145582298346153e-07,
|
|
"loss": 0.09552640914916992,
|
|
"memory(GiB)": 38.74,
|
|
"step": 585,
|
|
"token_acc": 0.9564781160078752,
|
|
"train_speed(iter/s)": 0.136466
|
|
},
|
|
{
|
|
"epoch": 2.5343388095879344,
|
|
"grad_norm": 0.6641238331794739,
|
|
"learning_rate": 5.614771340776559e-07,
|
|
"loss": 0.09575198888778687,
|
|
"memory(GiB)": 38.74,
|
|
"step": 590,
|
|
"token_acc": 0.9661430780499715,
|
|
"train_speed(iter/s)": 0.1366
|
|
},
|
|
{
|
|
"epoch": 2.5558847293293834,
|
|
"grad_norm": 0.7826724052429199,
|
|
"learning_rate": 5.106567298245008e-07,
|
|
"loss": 0.10562150478363037,
|
|
"memory(GiB)": 38.74,
|
|
"step": 595,
|
|
"token_acc": 0.9642348331948237,
|
|
"train_speed(iter/s)": 0.136721
|
|
},
|
|
{
|
|
"epoch": 2.5774306490708323,
|
|
"grad_norm": 0.7083848714828491,
|
|
"learning_rate": 4.6212290164521554e-07,
|
|
"loss": 0.09948662519454957,
|
|
"memory(GiB)": 38.74,
|
|
"step": 600,
|
|
"token_acc": 0.9634162853042324,
|
|
"train_speed(iter/s)": 0.13684
|
|
},
|
|
{
|
|
"epoch": 2.5774306490708323,
|
|
"eval_loss": 0.22936248779296875,
|
|
"eval_runtime": 9.0836,
|
|
"eval_samples_per_second": 16.513,
|
|
"eval_steps_per_second": 4.183,
|
|
"eval_token_acc": 0.9285515829494773,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.5989765688122812,
|
|
"grad_norm": 0.6187947392463684,
|
|
"learning_rate": 4.159003694784647e-07,
|
|
"loss": 0.09668093919754028,
|
|
"memory(GiB)": 38.74,
|
|
"step": 605,
|
|
"token_acc": 0.9560884152358398,
|
|
"train_speed(iter/s)": 0.136435
|
|
},
|
|
{
|
|
"epoch": 2.62052248855373,
|
|
"grad_norm": 0.5855912566184998,
|
|
"learning_rate": 3.7201267604080436e-07,
|
|
"loss": 0.09887575507164001,
|
|
"memory(GiB)": 38.74,
|
|
"step": 610,
|
|
"token_acc": 0.9638565571839216,
|
|
"train_speed(iter/s)": 0.136553
|
|
},
|
|
{
|
|
"epoch": 2.642068408295179,
|
|
"grad_norm": 0.6379809379577637,
|
|
"learning_rate": 3.3048217483556743e-07,
|
|
"loss": 0.09761322140693665,
|
|
"memory(GiB)": 38.74,
|
|
"step": 615,
|
|
"token_acc": 0.9634018456375839,
|
|
"train_speed(iter/s)": 0.136656
|
|
},
|
|
{
|
|
"epoch": 2.663614328036628,
|
|
"grad_norm": 0.7473537921905518,
|
|
"learning_rate": 2.9133001876746004e-07,
|
|
"loss": 0.10321993827819824,
|
|
"memory(GiB)": 38.74,
|
|
"step": 620,
|
|
"token_acc": 0.9682616630546367,
|
|
"train_speed(iter/s)": 0.136808
|
|
},
|
|
{
|
|
"epoch": 2.663614328036628,
|
|
"eval_loss": 0.2293192744255066,
|
|
"eval_runtime": 9.083,
|
|
"eval_samples_per_second": 16.514,
|
|
"eval_steps_per_second": 4.184,
|
|
"eval_token_acc": 0.9288586678365138,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.685160247778077,
|
|
"grad_norm": 0.6255384087562561,
|
|
"learning_rate": 2.545761493686666e-07,
|
|
"loss": 0.09618874192237854,
|
|
"memory(GiB)": 38.74,
|
|
"step": 625,
|
|
"token_acc": 0.9535718866230601,
|
|
"train_speed(iter/s)": 0.136315
|
|
},
|
|
{
|
|
"epoch": 2.706706167519526,
|
|
"grad_norm": 0.7249470353126526,
|
|
"learning_rate": 2.2023928664194229e-07,
|
|
"loss": 0.09542186260223388,
|
|
"memory(GiB)": 38.74,
|
|
"step": 630,
|
|
"token_acc": 0.9657436142277394,
|
|
"train_speed(iter/s)": 0.136473
|
|
},
|
|
{
|
|
"epoch": 2.728252087260975,
|
|
"grad_norm": 0.6389328241348267,
|
|
"learning_rate": 1.8833691952587829e-07,
|
|
"loss": 0.09868041276931763,
|
|
"memory(GiB)": 38.74,
|
|
"step": 635,
|
|
"token_acc": 0.9635655520475652,
|
|
"train_speed(iter/s)": 0.136586
|
|
},
|
|
{
|
|
"epoch": 2.7497980070024237,
|
|
"grad_norm": 0.6394509673118591,
|
|
"learning_rate": 1.5888529698718347e-07,
|
|
"loss": 0.09870019555091858,
|
|
"memory(GiB)": 38.74,
|
|
"step": 640,
|
|
"token_acc": 0.9696296296296296,
|
|
"train_speed(iter/s)": 0.136694
|
|
},
|
|
{
|
|
"epoch": 2.7497980070024237,
|
|
"eval_loss": 0.2294546216726303,
|
|
"eval_runtime": 9.0801,
|
|
"eval_samples_per_second": 16.52,
|
|
"eval_steps_per_second": 4.185,
|
|
"eval_token_acc": 0.928580829129195,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.7713439267438726,
|
|
"grad_norm": 0.6763883233070374,
|
|
"learning_rate": 1.3189941974453502e-07,
|
|
"loss": 0.11155039072036743,
|
|
"memory(GiB)": 38.75,
|
|
"step": 645,
|
|
"token_acc": 0.9546169649632604,
|
|
"train_speed(iter/s)": 0.136302
|
|
},
|
|
{
|
|
"epoch": 2.7928898464853216,
|
|
"grad_norm": 0.672804057598114,
|
|
"learning_rate": 1.0739303262819301e-07,
|
|
"loss": 0.10463042259216308,
|
|
"memory(GiB)": 38.75,
|
|
"step": 650,
|
|
"token_acc": 0.9642040954859147,
|
|
"train_speed(iter/s)": 0.136412
|
|
},
|
|
{
|
|
"epoch": 2.814435766226771,
|
|
"grad_norm": 0.6254743933677673,
|
|
"learning_rate": 8.537861757929422e-08,
|
|
"loss": 0.09878579974174499,
|
|
"memory(GiB)": 48.34,
|
|
"step": 655,
|
|
"token_acc": 0.9661926605504587,
|
|
"train_speed(iter/s)": 0.136557
|
|
},
|
|
{
|
|
"epoch": 2.83598168596822,
|
|
"grad_norm": 0.645412027835846,
|
|
"learning_rate": 6.58673872923693e-08,
|
|
"loss": 0.09899102449417115,
|
|
"memory(GiB)": 48.34,
|
|
"step": 660,
|
|
"token_acc": 0.9720326171606578,
|
|
"train_speed(iter/s)": 0.136719
|
|
},
|
|
{
|
|
"epoch": 2.83598168596822,
|
|
"eval_loss": 0.22916720807552338,
|
|
"eval_runtime": 9.0857,
|
|
"eval_samples_per_second": 16.51,
|
|
"eval_steps_per_second": 4.182,
|
|
"eval_token_acc": 0.9288001754770783,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 2.857527605709669,
|
|
"grad_norm": 0.6514653563499451,
|
|
"learning_rate": 4.88692795043344e-08,
|
|
"loss": 0.09887722730636597,
|
|
"memory(GiB)": 48.34,
|
|
"step": 665,
|
|
"token_acc": 0.9558207029990325,
|
|
"train_speed(iter/s)": 0.136328
|
|
},
|
|
{
|
|
"epoch": 2.8790735254511177,
|
|
"grad_norm": 0.7409882545471191,
|
|
"learning_rate": 3.439295193286174e-08,
|
|
"loss": 0.10554132461547852,
|
|
"memory(GiB)": 48.34,
|
|
"step": 670,
|
|
"token_acc": 0.9599883432901064,
|
|
"train_speed(iter/s)": 0.136496
|
|
},
|
|
{
|
|
"epoch": 2.9006194451925666,
|
|
"grad_norm": 0.6542146801948547,
|
|
"learning_rate": 2.2445777866709208e-08,
|
|
"loss": 0.09306983947753907,
|
|
"memory(GiB)": 48.34,
|
|
"step": 675,
|
|
"token_acc": 0.9684243112165927,
|
|
"train_speed(iter/s)": 0.136588
|
|
},
|
|
{
|
|
"epoch": 2.9221653649340156,
|
|
"grad_norm": 0.6311559081077576,
|
|
"learning_rate": 1.3033842410251074e-08,
|
|
"loss": 0.10304062366485596,
|
|
"memory(GiB)": 48.34,
|
|
"step": 680,
|
|
"token_acc": 0.9661386389557723,
|
|
"train_speed(iter/s)": 0.136699
|
|
},
|
|
{
|
|
"epoch": 2.9221653649340156,
|
|
"eval_loss": 0.22871780395507812,
|
|
"eval_runtime": 9.0702,
|
|
"eval_samples_per_second": 16.538,
|
|
"eval_steps_per_second": 4.19,
|
|
"eval_token_acc": 0.9288732909263727,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 2.9437112846754645,
|
|
"grad_norm": 0.664340078830719,
|
|
"learning_rate": 6.16193938412557e-09,
|
|
"loss": 0.10301237106323242,
|
|
"memory(GiB)": 48.34,
|
|
"step": 685,
|
|
"token_acc": 0.9599046221570066,
|
|
"train_speed(iter/s)": 0.136287
|
|
},
|
|
{
|
|
"epoch": 2.9652572044169134,
|
|
"grad_norm": 0.6333754062652588,
|
|
"learning_rate": 1.8335688835802169e-09,
|
|
"loss": 0.09610807299613952,
|
|
"memory(GiB)": 48.34,
|
|
"step": 690,
|
|
"token_acc": 0.9651297625621204,
|
|
"train_speed(iter/s)": 0.136426
|
|
},
|
|
{
|
|
"epoch": 2.9868031241583624,
|
|
"grad_norm": 0.7277814745903015,
|
|
"learning_rate": 5.093549575119205e-11,
|
|
"loss": 0.0946582555770874,
|
|
"memory(GiB)": 48.34,
|
|
"step": 695,
|
|
"token_acc": 0.9677898215836204,
|
|
"train_speed(iter/s)": 0.136543
|
|
},
|
|
{
|
|
"epoch": 2.9911123081066524,
|
|
"eval_loss": 0.22913037240505219,
|
|
"eval_runtime": 9.0719,
|
|
"eval_samples_per_second": 16.535,
|
|
"eval_steps_per_second": 4.189,
|
|
"eval_token_acc": 0.9288147985669372,
|
|
"step": 696
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 696,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 8.629985511001293e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|