Files
qwen2.5vl-3b-sampled_15000_…/trainer_state.json
ModelHub XC 97d59f69fd 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-sampled_15000_qwen2.5vl32b
Source: Original Platform
2026-05-22 15:52:13 +08:00

1750 lines
50 KiB
JSON

{
"best_global_step": 460,
"best_metric": 0.24467714,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v29-20250507-134003/checkpoint-460",
"epoch": 2.9911123081066524,
"eval_steps": 20,
"global_step": 696,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0043091839482897925,
"grad_norm": 2.422614812850952,
"learning_rate": 9.99994906450425e-06,
"loss": 0.3783050775527954,
"memory(GiB)": 29.06,
"step": 1,
"token_acc": 0.8827366746221161,
"train_speed(iter/s)": 0.066016
},
{
"epoch": 0.02154591974144896,
"grad_norm": 1.8293063640594482,
"learning_rate": 9.99872666449397e-06,
"loss": 0.304108202457428,
"memory(GiB)": 29.06,
"step": 5,
"token_acc": 0.9021085311428756,
"train_speed(iter/s)": 0.123613
},
{
"epoch": 0.04309183948289792,
"grad_norm": 1.0181312561035156,
"learning_rate": 9.994907306529203e-06,
"loss": 0.30131869316101073,
"memory(GiB)": 29.06,
"step": 10,
"token_acc": 0.9080832657474993,
"train_speed(iter/s)": 0.141649
},
{
"epoch": 0.06463775922434689,
"grad_norm": 0.8247085213661194,
"learning_rate": 9.988543871435342e-06,
"loss": 0.29039506912231444,
"memory(GiB)": 29.07,
"step": 15,
"token_acc": 0.9026237111961776,
"train_speed(iter/s)": 0.145623
},
{
"epoch": 0.08618367896579585,
"grad_norm": 0.8313817977905273,
"learning_rate": 9.979639600327522e-06,
"loss": 0.2841599941253662,
"memory(GiB)": 29.07,
"step": 20,
"token_acc": 0.9051915284043123,
"train_speed(iter/s)": 0.148055
},
{
"epoch": 0.08618367896579585,
"eval_loss": 0.30637428164482117,
"eval_runtime": 9.2534,
"eval_samples_per_second": 16.21,
"eval_steps_per_second": 4.107,
"eval_token_acc": 0.8990793096302436,
"step": 20
},
{
"epoch": 0.10772959870724481,
"grad_norm": 0.8578795790672302,
"learning_rate": 9.96819902845557e-06,
"loss": 0.26584060192108155,
"memory(GiB)": 29.07,
"step": 25,
"token_acc": 0.9061398699976505,
"train_speed(iter/s)": 0.133467
},
{
"epoch": 0.12927551844869378,
"grad_norm": 0.8536065220832825,
"learning_rate": 9.954227982894034e-06,
"loss": 0.2721074342727661,
"memory(GiB)": 29.07,
"step": 30,
"token_acc": 0.9100939031401429,
"train_speed(iter/s)": 0.137239
},
{
"epoch": 0.15082143819014274,
"grad_norm": 0.8567067384719849,
"learning_rate": 9.937733579574263e-06,
"loss": 0.26833133697509765,
"memory(GiB)": 29.07,
"step": 35,
"token_acc": 0.9014427903508017,
"train_speed(iter/s)": 0.139069
},
{
"epoch": 0.1723673579315917,
"grad_norm": 0.8759805560112,
"learning_rate": 9.918724219660013e-06,
"loss": 0.2696810483932495,
"memory(GiB)": 29.07,
"step": 40,
"token_acc": 0.9073056300268096,
"train_speed(iter/s)": 0.141475
},
{
"epoch": 0.1723673579315917,
"eval_loss": 0.28540194034576416,
"eval_runtime": 9.1496,
"eval_samples_per_second": 16.394,
"eval_steps_per_second": 4.153,
"eval_token_acc": 0.9062731282749638,
"step": 40
},
{
"epoch": 0.19391327767304067,
"grad_norm": 0.8179587125778198,
"learning_rate": 9.897209585268459e-06,
"loss": 0.26938886642456056,
"memory(GiB)": 29.07,
"step": 45,
"token_acc": 0.9068100358422939,
"train_speed(iter/s)": 0.134153
},
{
"epoch": 0.21545919741448963,
"grad_norm": 0.7597087621688843,
"learning_rate": 9.873200634538746e-06,
"loss": 0.2661460876464844,
"memory(GiB)": 29.07,
"step": 50,
"token_acc": 0.9265829903627394,
"train_speed(iter/s)": 0.135778
},
{
"epoch": 0.23700511715593858,
"grad_norm": 0.9001930952072144,
"learning_rate": 9.846709596050646e-06,
"loss": 0.2637378692626953,
"memory(GiB)": 29.07,
"step": 55,
"token_acc": 0.9097354466352211,
"train_speed(iter/s)": 0.137425
},
{
"epoch": 0.25855103689738757,
"grad_norm": 0.894802451133728,
"learning_rate": 9.817749962596115e-06,
"loss": 0.26340594291687014,
"memory(GiB)": 29.07,
"step": 60,
"token_acc": 0.9115004961612785,
"train_speed(iter/s)": 0.139172
},
{
"epoch": 0.25855103689738757,
"eval_loss": 0.2748047709465027,
"eval_runtime": 9.1341,
"eval_samples_per_second": 16.422,
"eval_steps_per_second": 4.16,
"eval_token_acc": 0.9068652121140354,
"step": 60
},
{
"epoch": 0.28009695663883655,
"grad_norm": 0.7363941669464111,
"learning_rate": 9.786336484306966e-06,
"loss": 0.27098889350891114,
"memory(GiB)": 29.07,
"step": 65,
"token_acc": 0.9083590733590734,
"train_speed(iter/s)": 0.135246
},
{
"epoch": 0.3016428763802855,
"grad_norm": 0.8381310105323792,
"learning_rate": 9.752485161142103e-06,
"loss": 0.2478638172149658,
"memory(GiB)": 29.07,
"step": 70,
"token_acc": 0.9232435033686237,
"train_speed(iter/s)": 0.136407
},
{
"epoch": 0.32318879612173446,
"grad_norm": 0.7714306116104126,
"learning_rate": 9.716213234738216e-06,
"loss": 0.2461942672729492,
"memory(GiB)": 29.07,
"step": 75,
"token_acc": 0.9149177216982468,
"train_speed(iter/s)": 0.137517
},
{
"epoch": 0.3447347158631834,
"grad_norm": 0.9231055974960327,
"learning_rate": 9.677539179628005e-06,
"loss": 0.24781365394592286,
"memory(GiB)": 29.07,
"step": 80,
"token_acc": 0.9261125903385318,
"train_speed(iter/s)": 0.138742
},
{
"epoch": 0.3447347158631834,
"eval_loss": 0.26845118403434753,
"eval_runtime": 9.1426,
"eval_samples_per_second": 16.407,
"eval_steps_per_second": 4.156,
"eval_token_acc": 0.9084194321915984,
"step": 80
},
{
"epoch": 0.36628063560463237,
"grad_norm": 0.7188398241996765,
"learning_rate": 9.636482693830488e-06,
"loss": 0.26428771018981934,
"memory(GiB)": 29.07,
"step": 85,
"token_acc": 0.9042584492261823,
"train_speed(iter/s)": 0.135179
},
{
"epoch": 0.38782655534608135,
"grad_norm": 0.8020254969596863,
"learning_rate": 9.59306468881811e-06,
"loss": 0.2622120141983032,
"memory(GiB)": 29.07,
"step": 90,
"token_acc": 0.9098044980155814,
"train_speed(iter/s)": 0.13707
},
{
"epoch": 0.4093724750875303,
"grad_norm": 0.8326025605201721,
"learning_rate": 9.547307278865823e-06,
"loss": 0.2394162893295288,
"memory(GiB)": 29.07,
"step": 95,
"token_acc": 0.9164833305127771,
"train_speed(iter/s)": 0.138039
},
{
"epoch": 0.43091839482897926,
"grad_norm": 0.7730870842933655,
"learning_rate": 9.499233769787534e-06,
"loss": 0.24491536617279053,
"memory(GiB)": 29.07,
"step": 100,
"token_acc": 0.9174321989744152,
"train_speed(iter/s)": 0.139304
},
{
"epoch": 0.43091839482897926,
"eval_loss": 0.26416531205177307,
"eval_runtime": 9.1324,
"eval_samples_per_second": 16.425,
"eval_steps_per_second": 4.161,
"eval_token_acc": 0.9095295893898576,
"step": 100
},
{
"epoch": 0.45246431457042824,
"grad_norm": 0.860744833946228,
"learning_rate": 9.448868647065644e-06,
"loss": 0.25905332565307615,
"memory(GiB)": 29.07,
"step": 105,
"token_acc": 0.9042013222435488,
"train_speed(iter/s)": 0.136744
},
{
"epoch": 0.47401023431187717,
"grad_norm": 0.7342467904090881,
"learning_rate": 9.396237563379761e-06,
"loss": 0.23780291080474852,
"memory(GiB)": 29.07,
"step": 110,
"token_acc": 0.9195750302763405,
"train_speed(iter/s)": 0.137488
},
{
"epoch": 0.49555615405332615,
"grad_norm": 0.8625522255897522,
"learning_rate": 9.341367325540921e-06,
"loss": 0.23777966499328612,
"memory(GiB)": 29.07,
"step": 115,
"token_acc": 0.9236284378674467,
"train_speed(iter/s)": 0.138134
},
{
"epoch": 0.5171020737947751,
"grad_norm": 0.7858663201332092,
"learning_rate": 9.284285880837947e-06,
"loss": 0.23805389404296876,
"memory(GiB)": 29.07,
"step": 120,
"token_acc": 0.9197558545092177,
"train_speed(iter/s)": 0.138839
},
{
"epoch": 0.5171020737947751,
"eval_loss": 0.26037347316741943,
"eval_runtime": 9.1916,
"eval_samples_per_second": 16.319,
"eval_steps_per_second": 4.134,
"eval_token_acc": 0.9109209864116758,
"step": 120
},
{
"epoch": 0.5386479935362241,
"grad_norm": 0.69041508436203,
"learning_rate": 9.225022302802951e-06,
"loss": 0.23021812438964845,
"memory(GiB)": 29.07,
"step": 125,
"token_acc": 0.9175774697625143,
"train_speed(iter/s)": 0.136057
},
{
"epoch": 0.5601939132776731,
"grad_norm": 0.8726988434791565,
"learning_rate": 9.163606776403182e-06,
"loss": 0.24287738800048828,
"memory(GiB)": 29.07,
"step": 130,
"token_acc": 0.9174245368571058,
"train_speed(iter/s)": 0.136884
},
{
"epoch": 0.581739833019122,
"grad_norm": 0.782781720161438,
"learning_rate": 9.100070582666796e-06,
"loss": 0.2355494499206543,
"memory(GiB)": 29.07,
"step": 135,
"token_acc": 0.9218754381537254,
"train_speed(iter/s)": 0.137519
},
{
"epoch": 0.603285752760571,
"grad_norm": 0.8618416786193848,
"learning_rate": 9.034446082750352e-06,
"loss": 0.26097152233123777,
"memory(GiB)": 29.07,
"step": 140,
"token_acc": 0.9200457337339467,
"train_speed(iter/s)": 0.137992
},
{
"epoch": 0.603285752760571,
"eval_loss": 0.2570641040802002,
"eval_runtime": 9.1422,
"eval_samples_per_second": 16.407,
"eval_steps_per_second": 4.157,
"eval_token_acc": 0.9119867373220047,
"step": 140
},
{
"epoch": 0.6248316725020199,
"grad_norm": 0.7354313135147095,
"learning_rate": 8.966766701456177e-06,
"loss": 0.2450572967529297,
"memory(GiB)": 29.07,
"step": 145,
"token_acc": 0.9104867095521301,
"train_speed(iter/s)": 0.135924
},
{
"epoch": 0.6463775922434689,
"grad_norm": 0.7455778121948242,
"learning_rate": 8.897066910207958e-06,
"loss": 0.24456512928009033,
"memory(GiB)": 29.07,
"step": 150,
"token_acc": 0.9091944119638131,
"train_speed(iter/s)": 0.136598
},
{
"epoch": 0.6679235119849178,
"grad_norm": 0.6894455552101135,
"learning_rate": 8.825382209493284e-06,
"loss": 0.22335777282714844,
"memory(GiB)": 29.07,
"step": 155,
"token_acc": 0.9254186825455899,
"train_speed(iter/s)": 0.137109
},
{
"epoch": 0.6894694317263668,
"grad_norm": 0.8090242743492126,
"learning_rate": 8.751749110782013e-06,
"loss": 0.22939071655273438,
"memory(GiB)": 29.07,
"step": 160,
"token_acc": 0.917912822144448,
"train_speed(iter/s)": 0.137664
},
{
"epoch": 0.6894694317263668,
"eval_loss": 0.2559308111667633,
"eval_runtime": 9.1265,
"eval_samples_per_second": 16.436,
"eval_steps_per_second": 4.164,
"eval_token_acc": 0.9122235708576334,
"step": 160
},
{
"epoch": 0.7110153514678158,
"grad_norm": 0.6877977252006531,
"learning_rate": 8.676205117929752e-06,
"loss": 0.2350329875946045,
"memory(GiB)": 29.07,
"step": 165,
"token_acc": 0.9184605472599106,
"train_speed(iter/s)": 0.13599
},
{
"epoch": 0.7325612712092647,
"grad_norm": 0.7360235452651978,
"learning_rate": 8.598788708075844e-06,
"loss": 0.23023662567138672,
"memory(GiB)": 29.07,
"step": 170,
"token_acc": 0.9058568071626164,
"train_speed(iter/s)": 0.136418
},
{
"epoch": 0.7541071909507137,
"grad_norm": 0.8410085439682007,
"learning_rate": 8.51953931204566e-06,
"loss": 0.23642911911010742,
"memory(GiB)": 29.07,
"step": 175,
"token_acc": 0.9052932094269817,
"train_speed(iter/s)": 0.137007
},
{
"epoch": 0.7756531106921627,
"grad_norm": 0.7055257558822632,
"learning_rate": 8.438497294267117e-06,
"loss": 0.21671390533447266,
"memory(GiB)": 29.07,
"step": 180,
"token_acc": 0.9202059202059202,
"train_speed(iter/s)": 0.137438
},
{
"epoch": 0.7756531106921627,
"eval_loss": 0.251537024974823,
"eval_runtime": 9.1508,
"eval_samples_per_second": 16.392,
"eval_steps_per_second": 4.153,
"eval_token_acc": 0.9133337280558927,
"step": 180
},
{
"epoch": 0.7971990304336116,
"grad_norm": 0.7327454686164856,
"learning_rate": 8.3557039322117e-06,
"loss": 0.23624320030212403,
"memory(GiB)": 29.07,
"step": 185,
"token_acc": 0.9212678936605317,
"train_speed(iter/s)": 0.135745
},
{
"epoch": 0.8187449501750605,
"grad_norm": 0.7439467906951904,
"learning_rate": 8.27120139537044e-06,
"loss": 0.226143741607666,
"memory(GiB)": 29.07,
"step": 190,
"token_acc": 0.9260711777101381,
"train_speed(iter/s)": 0.136185
},
{
"epoch": 0.8402908699165096,
"grad_norm": 0.7658076286315918,
"learning_rate": 8.18503272377554e-06,
"loss": 0.23765263557434083,
"memory(GiB)": 29.07,
"step": 195,
"token_acc": 0.9223176899233237,
"train_speed(iter/s)": 0.136807
},
{
"epoch": 0.8618367896579585,
"grad_norm": 0.739122211933136,
"learning_rate": 8.097241806078616e-06,
"loss": 0.2310422420501709,
"memory(GiB)": 29.07,
"step": 200,
"token_acc": 0.9296302733841532,
"train_speed(iter/s)": 0.13714
},
{
"epoch": 0.8618367896579585,
"eval_loss": 0.24832946062088013,
"eval_runtime": 9.1389,
"eval_samples_per_second": 16.413,
"eval_steps_per_second": 4.158,
"eval_token_acc": 0.9134077385357766,
"step": 200
},
{
"epoch": 0.8833827093994074,
"grad_norm": 0.7651655077934265,
"learning_rate": 8.007873357196716e-06,
"loss": 0.24373788833618165,
"memory(GiB)": 29.07,
"step": 205,
"token_acc": 0.9188066080938974,
"train_speed(iter/s)": 0.135947
},
{
"epoch": 0.9049286291408565,
"grad_norm": 0.7528461217880249,
"learning_rate": 7.916972895537471e-06,
"loss": 0.23199746608734131,
"memory(GiB)": 29.07,
"step": 210,
"token_acc": 0.922004793261512,
"train_speed(iter/s)": 0.136394
},
{
"epoch": 0.9264745488823054,
"grad_norm": 0.8405919075012207,
"learning_rate": 7.824586719815019e-06,
"loss": 0.2173825740814209,
"memory(GiB)": 29.07,
"step": 215,
"token_acc": 0.9274689356403538,
"train_speed(iter/s)": 0.136811
},
{
"epoch": 0.9480204686237543,
"grad_norm": 0.7239152193069458,
"learning_rate": 7.730761885468486e-06,
"loss": 0.22583813667297364,
"memory(GiB)": 29.07,
"step": 220,
"token_acc": 0.9232335730673059,
"train_speed(iter/s)": 0.137394
},
{
"epoch": 0.9480204686237543,
"eval_loss": 0.2480185180902481,
"eval_runtime": 9.1514,
"eval_samples_per_second": 16.391,
"eval_steps_per_second": 4.152,
"eval_token_acc": 0.9135261553035909,
"step": 220
},
{
"epoch": 0.9695663883652034,
"grad_norm": 0.7927827835083008,
"learning_rate": 7.635546180695039e-06,
"loss": 0.24525394439697265,
"memory(GiB)": 31.51,
"step": 225,
"token_acc": 0.9190034762456547,
"train_speed(iter/s)": 0.136277
},
{
"epoch": 0.9911123081066523,
"grad_norm": 0.7584970593452454,
"learning_rate": 7.538988102109728e-06,
"loss": 0.24703338146209716,
"memory(GiB)": 31.51,
"step": 230,
"token_acc": 0.9166930084197822,
"train_speed(iter/s)": 0.136884
},
{
"epoch": 1.0086183678965797,
"grad_norm": 0.6898382306098938,
"learning_rate": 7.441136830044495e-06,
"loss": 0.19301869869232177,
"memory(GiB)": 31.51,
"step": 235,
"token_acc": 0.9391968052558693,
"train_speed(iter/s)": 0.137591
},
{
"epoch": 1.0301642876380286,
"grad_norm": 0.8064629435539246,
"learning_rate": 7.342042203498952e-06,
"loss": 0.16187149286270142,
"memory(GiB)": 31.51,
"step": 240,
"token_acc": 0.9393010954707055,
"train_speed(iter/s)": 0.137991
},
{
"epoch": 1.0301642876380286,
"eval_loss": 0.2533319890499115,
"eval_runtime": 9.1306,
"eval_samples_per_second": 16.428,
"eval_steps_per_second": 4.162,
"eval_token_acc": 0.914014624470825,
"step": 240
},
{
"epoch": 1.0517102073794775,
"grad_norm": 0.8042640089988708,
"learning_rate": 7.241754694755674e-06,
"loss": 0.16961312294006348,
"memory(GiB)": 31.51,
"step": 245,
"token_acc": 0.9325969259837942,
"train_speed(iter/s)": 0.13689
},
{
"epoch": 1.0732561271209264,
"grad_norm": 0.8370431661605835,
"learning_rate": 7.140325383672938e-06,
"loss": 0.1677647829055786,
"memory(GiB)": 31.51,
"step": 250,
"token_acc": 0.9420376456528234,
"train_speed(iter/s)": 0.13733
},
{
"epoch": 1.0948020468623754,
"grad_norm": 0.7853599190711975,
"learning_rate": 7.037805931668006e-06,
"loss": 0.16614892482757568,
"memory(GiB)": 31.51,
"step": 255,
"token_acc": 0.9391651270517043,
"train_speed(iter/s)": 0.137609
},
{
"epoch": 1.1163479666038243,
"grad_norm": 0.6807184815406799,
"learning_rate": 6.934248555404197e-06,
"loss": 0.1581436276435852,
"memory(GiB)": 31.51,
"step": 260,
"token_acc": 0.9458589779605179,
"train_speed(iter/s)": 0.137947
},
{
"epoch": 1.1163479666038243,
"eval_loss": 0.2524171471595764,
"eval_runtime": 9.156,
"eval_samples_per_second": 16.383,
"eval_steps_per_second": 4.15,
"eval_token_acc": 0.9150063649012701,
"step": 260
},
{
"epoch": 1.1378938863452734,
"grad_norm": 0.7507938146591187,
"learning_rate": 6.8297060001951545e-06,
"loss": 0.16150083541870117,
"memory(GiB)": 31.51,
"step": 265,
"token_acc": 0.9309385421629347,
"train_speed(iter/s)": 0.137099
},
{
"epoch": 1.1594398060867224,
"grad_norm": 0.7291717529296875,
"learning_rate": 6.724231513139853e-06,
"loss": 0.16564717292785644,
"memory(GiB)": 31.51,
"step": 270,
"token_acc": 0.9417148494231771,
"train_speed(iter/s)": 0.137339
},
{
"epoch": 1.1809857258281713,
"grad_norm": 0.7378965616226196,
"learning_rate": 6.617878816002032e-06,
"loss": 0.1618717670440674,
"memory(GiB)": 31.51,
"step": 275,
"token_acc": 0.9485524256651017,
"train_speed(iter/s)": 0.137622
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.8035087585449219,
"learning_rate": 6.510702077847864e-06,
"loss": 0.1574448823928833,
"memory(GiB)": 31.51,
"step": 280,
"token_acc": 0.9411747078213965,
"train_speed(iter/s)": 0.138001
},
{
"epoch": 1.2025316455696202,
"eval_loss": 0.24995924532413483,
"eval_runtime": 9.1411,
"eval_samples_per_second": 16.409,
"eval_steps_per_second": 4.157,
"eval_token_acc": 0.9145474999259895,
"step": 280
},
{
"epoch": 1.2240775653110691,
"grad_norm": 0.7014583349227905,
"learning_rate": 6.402755887455792e-06,
"loss": 0.1643718123435974,
"memory(GiB)": 31.51,
"step": 285,
"token_acc": 0.9339765241569784,
"train_speed(iter/s)": 0.137005
},
{
"epoch": 1.2456234850525183,
"grad_norm": 0.7766486406326294,
"learning_rate": 6.294095225512604e-06,
"loss": 0.16814930438995362,
"memory(GiB)": 31.51,
"step": 290,
"token_acc": 0.9360814298463542,
"train_speed(iter/s)": 0.137309
},
{
"epoch": 1.2671694047939672,
"grad_norm": 0.7775722146034241,
"learning_rate": 6.184775436609885e-06,
"loss": 0.1682277202606201,
"memory(GiB)": 31.51,
"step": 295,
"token_acc": 0.9411593528110813,
"train_speed(iter/s)": 0.137689
},
{
"epoch": 1.2887153245354162,
"grad_norm": 0.7489521503448486,
"learning_rate": 6.074852201055121e-06,
"loss": 0.16042615175247193,
"memory(GiB)": 31.51,
"step": 300,
"token_acc": 0.9415187229598687,
"train_speed(iter/s)": 0.138077
},
{
"epoch": 1.2887153245354162,
"eval_loss": 0.25172922015190125,
"eval_runtime": 9.1377,
"eval_samples_per_second": 16.415,
"eval_steps_per_second": 4.159,
"eval_token_acc": 0.9150211669972468,
"step": 300
},
{
"epoch": 1.310261244276865,
"grad_norm": 0.792445182800293,
"learning_rate": 5.964381506511823e-06,
"loss": 0.16529514789581298,
"memory(GiB)": 31.51,
"step": 305,
"token_acc": 0.9390098282355103,
"train_speed(iter/s)": 0.137303
},
{
"epoch": 1.331807164018314,
"grad_norm": 0.7657850980758667,
"learning_rate": 5.853419619483083e-06,
"loss": 0.16101518869400025,
"memory(GiB)": 31.51,
"step": 310,
"token_acc": 0.9423482091553342,
"train_speed(iter/s)": 0.137569
},
{
"epoch": 1.353353083759763,
"grad_norm": 0.7221185564994812,
"learning_rate": 5.742023056653131e-06,
"loss": 0.16527080535888672,
"memory(GiB)": 31.51,
"step": 315,
"token_acc": 0.9436092441929018,
"train_speed(iter/s)": 0.13783
},
{
"epoch": 1.3748990035012119,
"grad_norm": 0.7651124596595764,
"learning_rate": 5.630248556101448e-06,
"loss": 0.16076445579528809,
"memory(GiB)": 31.51,
"step": 320,
"token_acc": 0.941539852342926,
"train_speed(iter/s)": 0.138138
},
{
"epoch": 1.3748990035012119,
"eval_loss": 0.2519991993904114,
"eval_runtime": 9.1625,
"eval_samples_per_second": 16.371,
"eval_steps_per_second": 4.147,
"eval_token_acc": 0.9146955208857575,
"step": 320
},
{
"epoch": 1.3964449232426608,
"grad_norm": 0.6687130331993103,
"learning_rate": 5.51815304840412e-06,
"loss": 0.16071187257766723,
"memory(GiB)": 31.51,
"step": 325,
"token_acc": 0.93119978263401,
"train_speed(iter/s)": 0.137342
},
{
"epoch": 1.41799084298411,
"grad_norm": 0.7091411352157593,
"learning_rate": 5.405793627637157e-06,
"loss": 0.15800976753234863,
"memory(GiB)": 31.51,
"step": 330,
"token_acc": 0.9493767600253226,
"train_speed(iter/s)": 0.137567
},
{
"epoch": 1.4395367627255589,
"grad_norm": 0.7872418761253357,
"learning_rate": 5.293227522296517e-06,
"loss": 0.16303786039352416,
"memory(GiB)": 31.51,
"step": 335,
"token_acc": 0.9474813818783616,
"train_speed(iter/s)": 0.137773
},
{
"epoch": 1.4610826824670078,
"grad_norm": 0.696894109249115,
"learning_rate": 5.180512066149682e-06,
"loss": 0.1651884913444519,
"memory(GiB)": 31.51,
"step": 340,
"token_acc": 0.9437182487584908,
"train_speed(iter/s)": 0.138053
},
{
"epoch": 1.4610826824670078,
"eval_loss": 0.2488545924425125,
"eval_runtime": 9.1671,
"eval_samples_per_second": 16.363,
"eval_steps_per_second": 4.145,
"eval_token_acc": 0.9151395837650611,
"step": 340
},
{
"epoch": 1.4826286022084567,
"grad_norm": 0.7297951579093933,
"learning_rate": 5.06770466903361e-06,
"loss": 0.15690993070602416,
"memory(GiB)": 31.51,
"step": 345,
"token_acc": 0.9340376019152534,
"train_speed(iter/s)": 0.137212
},
{
"epoch": 1.5041745219499059,
"grad_norm": 0.7707265019416809,
"learning_rate": 4.954862787613937e-06,
"loss": 0.15354007482528687,
"memory(GiB)": 31.51,
"step": 350,
"token_acc": 0.9396699063799393,
"train_speed(iter/s)": 0.13744
},
{
"epoch": 1.5257204416913548,
"grad_norm": 0.7526496052742004,
"learning_rate": 4.842043896120332e-06,
"loss": 0.16020708084106444,
"memory(GiB)": 31.51,
"step": 355,
"token_acc": 0.9479154768703598,
"train_speed(iter/s)": 0.137715
},
{
"epoch": 1.5472663614328037,
"grad_norm": 0.7758511900901794,
"learning_rate": 4.729305457072913e-06,
"loss": 0.16963763236999513,
"memory(GiB)": 31.51,
"step": 360,
"token_acc": 0.9411747908278363,
"train_speed(iter/s)": 0.138029
},
{
"epoch": 1.5472663614328037,
"eval_loss": 0.24962776899337769,
"eval_runtime": 9.1377,
"eval_samples_per_second": 16.415,
"eval_steps_per_second": 4.159,
"eval_token_acc": 0.9158204801799935,
"step": 360
},
{
"epoch": 1.5688122811742526,
"grad_norm": 0.7562235593795776,
"learning_rate": 4.616704892014613e-06,
"loss": 0.1591555118560791,
"memory(GiB)": 31.51,
"step": 365,
"token_acc": 0.9352809509107749,
"train_speed(iter/s)": 0.137302
},
{
"epoch": 1.5903582009157016,
"grad_norm": 0.7587376236915588,
"learning_rate": 4.504299552264428e-06,
"loss": 0.15684648752212524,
"memory(GiB)": 31.51,
"step": 370,
"token_acc": 0.9416271651313239,
"train_speed(iter/s)": 0.137471
},
{
"epoch": 1.6119041206571505,
"grad_norm": 0.8137562870979309,
"learning_rate": 4.392146689706426e-06,
"loss": 0.1647357702255249,
"memory(GiB)": 31.51,
"step": 375,
"token_acc": 0.9458877409154104,
"train_speed(iter/s)": 0.137806
},
{
"epoch": 1.6334500403985994,
"grad_norm": 0.7551019191741943,
"learning_rate": 4.280303427629404e-06,
"loss": 0.15853278636932372,
"memory(GiB)": 31.51,
"step": 380,
"token_acc": 0.9448746907604604,
"train_speed(iter/s)": 0.138095
},
{
"epoch": 1.6334500403985994,
"eval_loss": 0.25000789761543274,
"eval_runtime": 9.1322,
"eval_samples_per_second": 16.425,
"eval_steps_per_second": 4.161,
"eval_token_acc": 0.9155096361644809,
"step": 380
},
{
"epoch": 1.6549959601400483,
"grad_norm": 0.7977014780044556,
"learning_rate": 4.168826731632052e-06,
"loss": 0.15717003345489503,
"memory(GiB)": 31.51,
"step": 385,
"token_acc": 0.9312666413084824,
"train_speed(iter/s)": 0.137411
},
{
"epoch": 1.6765418798814973,
"grad_norm": 0.7832633852958679,
"learning_rate": 4.057773380608411e-06,
"loss": 0.1634294271469116,
"memory(GiB)": 31.51,
"step": 390,
"token_acc": 0.9465973781793492,
"train_speed(iter/s)": 0.137768
},
{
"epoch": 1.6980877996229464,
"grad_norm": 0.8293562531471252,
"learning_rate": 3.947199937828447e-06,
"loss": 0.16505708694458007,
"memory(GiB)": 31.51,
"step": 395,
"token_acc": 0.9415725074644342,
"train_speed(iter/s)": 0.138088
},
{
"epoch": 1.7196337193643954,
"grad_norm": 0.7886548042297363,
"learning_rate": 3.8371627221284495e-06,
"loss": 0.1561971426010132,
"memory(GiB)": 31.51,
"step": 400,
"token_acc": 0.9470925236321971,
"train_speed(iter/s)": 0.138285
},
{
"epoch": 1.7196337193643954,
"eval_loss": 0.24771690368652344,
"eval_runtime": 9.1277,
"eval_samples_per_second": 16.434,
"eval_steps_per_second": 4.163,
"eval_token_acc": 0.915642855028272,
"step": 400
},
{
"epoch": 1.7411796391058443,
"grad_norm": 0.7254658937454224,
"learning_rate": 3.727717779225912e-06,
"loss": 0.1556318521499634,
"memory(GiB)": 31.51,
"step": 405,
"token_acc": 0.9359159282917783,
"train_speed(iter/s)": 0.137626
},
{
"epoch": 1.7627255588472934,
"grad_norm": 0.7896953225135803,
"learning_rate": 3.6189208531735354e-06,
"loss": 0.16613179445266724,
"memory(GiB)": 31.51,
"step": 410,
"token_acc": 0.9352341759749168,
"train_speed(iter/s)": 0.137957
},
{
"epoch": 1.7842714785887424,
"grad_norm": 0.6848239898681641,
"learning_rate": 3.510827357966876e-06,
"loss": 0.1551806092262268,
"memory(GiB)": 31.51,
"step": 415,
"token_acc": 0.9506668360218469,
"train_speed(iter/s)": 0.138211
},
{
"epoch": 1.8058173983301913,
"grad_norm": 0.7046887874603271,
"learning_rate": 3.403492349320101e-06,
"loss": 0.15121963024139404,
"memory(GiB)": 31.51,
"step": 420,
"token_acc": 0.9443215339233039,
"train_speed(iter/s)": 0.138347
},
{
"epoch": 1.8058173983301913,
"eval_loss": 0.24637845158576965,
"eval_runtime": 9.144,
"eval_samples_per_second": 16.404,
"eval_steps_per_second": 4.156,
"eval_token_acc": 0.9157020634121792,
"step": 420
},
{
"epoch": 1.8273633180716402,
"grad_norm": 0.7264747619628906,
"learning_rate": 3.29697049662423e-06,
"loss": 0.1486160159111023,
"memory(GiB)": 31.51,
"step": 425,
"token_acc": 0.9378997513712539,
"train_speed(iter/s)": 0.137609
},
{
"epoch": 1.8489092378130891,
"grad_norm": 0.6881827116012573,
"learning_rate": 3.191316055102146e-06,
"loss": 0.14999903440475465,
"memory(GiB)": 31.51,
"step": 430,
"token_acc": 0.9458710676835081,
"train_speed(iter/s)": 0.13782
},
{
"epoch": 1.870455157554538,
"grad_norm": 0.7096033096313477,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.15066919326782227,
"memory(GiB)": 31.51,
"step": 435,
"token_acc": 0.9486315094650982,
"train_speed(iter/s)": 0.137979
},
{
"epoch": 1.892001077295987,
"grad_norm": 0.7479064464569092,
"learning_rate": 2.982824190050958e-06,
"loss": 0.165749990940094,
"memory(GiB)": 31.51,
"step": 440,
"token_acc": 0.9481958622195534,
"train_speed(iter/s)": 0.138187
},
{
"epoch": 1.892001077295987,
"eval_loss": 0.24523746967315674,
"eval_runtime": 9.1328,
"eval_samples_per_second": 16.424,
"eval_steps_per_second": 4.161,
"eval_token_acc": 0.9161757304834365,
"step": 440
},
{
"epoch": 1.913546997037436,
"grad_norm": 0.7331624031066895,
"learning_rate": 2.8800929585596506e-06,
"loss": 0.15496289730072021,
"memory(GiB)": 31.51,
"step": 445,
"token_acc": 0.9352829677768751,
"train_speed(iter/s)": 0.137542
},
{
"epoch": 1.9350929167788848,
"grad_norm": 0.6734929084777832,
"learning_rate": 2.778441468230483e-06,
"loss": 0.1523799180984497,
"memory(GiB)": 31.51,
"step": 450,
"token_acc": 0.9479633806554332,
"train_speed(iter/s)": 0.1377
},
{
"epoch": 1.956638836520334,
"grad_norm": 0.7542054057121277,
"learning_rate": 2.6779214936442056e-06,
"loss": 0.16172744035720826,
"memory(GiB)": 31.51,
"step": 455,
"token_acc": 0.935499950154521,
"train_speed(iter/s)": 0.137884
},
{
"epoch": 1.978184756261783,
"grad_norm": 0.7129687070846558,
"learning_rate": 2.5785842330619038e-06,
"loss": 0.15356701612472534,
"memory(GiB)": 31.51,
"step": 460,
"token_acc": 0.941601546088564,
"train_speed(iter/s)": 0.13804
},
{
"epoch": 1.978184756261783,
"eval_loss": 0.24467714130878448,
"eval_runtime": 9.1316,
"eval_samples_per_second": 16.426,
"eval_steps_per_second": 4.161,
"eval_token_acc": 0.9170934604339974,
"step": 460
},
{
"epoch": 1.9997306760032318,
"grad_norm": 0.7861402630805969,
"learning_rate": 2.480480282347961e-06,
"loss": 0.15792056322097778,
"memory(GiB)": 31.51,
"step": 465,
"token_acc": 0.9337420552337027,
"train_speed(iter/s)": 0.137533
},
{
"epoch": 2.0172367357931593,
"grad_norm": 0.6826748847961426,
"learning_rate": 2.383659609199873e-06,
"loss": 0.14240689277648927,
"memory(GiB)": 31.51,
"step": 470,
"token_acc": 0.9578195371952166,
"train_speed(iter/s)": 0.137988
},
{
"epoch": 2.0387826555346082,
"grad_norm": 0.6501537561416626,
"learning_rate": 2.2881715276979705e-06,
"loss": 0.10814023017883301,
"memory(GiB)": 31.51,
"step": 475,
"token_acc": 0.9586633663366336,
"train_speed(iter/s)": 0.138103
},
{
"epoch": 2.060328575276057,
"grad_norm": 0.6575304269790649,
"learning_rate": 2.1940646731880887e-06,
"loss": 0.1118842363357544,
"memory(GiB)": 31.51,
"step": 480,
"token_acc": 0.9698543524895563,
"train_speed(iter/s)": 0.138265
},
{
"epoch": 2.060328575276057,
"eval_loss": 0.26591211557388306,
"eval_runtime": 9.1643,
"eval_samples_per_second": 16.368,
"eval_steps_per_second": 4.147,
"eval_token_acc": 0.9162201367713668,
"step": 480
},
{
"epoch": 2.081874495017506,
"grad_norm": 0.765779972076416,
"learning_rate": 2.101386977509907e-06,
"loss": 0.12155743837356567,
"memory(GiB)": 31.51,
"step": 485,
"token_acc": 0.946289860026969,
"train_speed(iter/s)": 0.137773
},
{
"epoch": 2.103420414758955,
"grad_norm": 0.7146125435829163,
"learning_rate": 2.010185644583641e-06,
"loss": 0.11463183164596558,
"memory(GiB)": 31.51,
"step": 490,
"token_acc": 0.9624759934997784,
"train_speed(iter/s)": 0.137939
},
{
"epoch": 2.124966334500404,
"grad_norm": 0.6777431964874268,
"learning_rate": 1.920507126367448e-06,
"loss": 0.10685477256774903,
"memory(GiB)": 31.51,
"step": 495,
"token_acc": 0.9612281857095818,
"train_speed(iter/s)": 0.138102
},
{
"epoch": 2.146512254241853,
"grad_norm": 0.7272450923919678,
"learning_rate": 1.8323970991978823e-06,
"loss": 0.10419889688491821,
"memory(GiB)": 31.51,
"step": 500,
"token_acc": 0.9610325296357052,
"train_speed(iter/s)": 0.138212
},
{
"epoch": 2.146512254241853,
"eval_loss": 0.26627564430236816,
"eval_runtime": 9.1679,
"eval_samples_per_second": 16.361,
"eval_steps_per_second": 4.145,
"eval_token_acc": 0.9159833032357382,
"step": 500
},
{
"epoch": 2.168058173983302,
"grad_norm": 0.7035279273986816,
"learning_rate": 1.7459004405253544e-06,
"loss": 0.1082868218421936,
"memory(GiB)": 31.51,
"step": 505,
"token_acc": 0.9475257941268758,
"train_speed(iter/s)": 0.137652
},
{
"epoch": 2.1896040937247507,
"grad_norm": 0.601047158241272,
"learning_rate": 1.6610612060565235e-06,
"loss": 0.09674398303031921,
"memory(GiB)": 31.51,
"step": 510,
"token_acc": 0.9646692233940556,
"train_speed(iter/s)": 0.137835
},
{
"epoch": 2.2111500134661997,
"grad_norm": 0.7340168356895447,
"learning_rate": 1.5779226073152071e-06,
"loss": 0.1145021677017212,
"memory(GiB)": 31.51,
"step": 515,
"token_acc": 0.9572068592615479,
"train_speed(iter/s)": 0.138109
},
{
"epoch": 2.2326959332076486,
"grad_norm": 0.7059099078178406,
"learning_rate": 1.4965269896332884e-06,
"loss": 0.1138340711593628,
"memory(GiB)": 31.51,
"step": 520,
"token_acc": 0.9643348939686037,
"train_speed(iter/s)": 0.138301
},
{
"epoch": 2.2326959332076486,
"eval_loss": 0.26528117060661316,
"eval_runtime": 9.1705,
"eval_samples_per_second": 16.357,
"eval_steps_per_second": 4.144,
"eval_token_acc": 0.9159536990437847,
"step": 520
},
{
"epoch": 2.254241852949098,
"grad_norm": 0.7410480976104736,
"learning_rate": 1.4169158105827768e-06,
"loss": 0.11105086803436279,
"memory(GiB)": 31.51,
"step": 525,
"token_acc": 0.9514605435256503,
"train_speed(iter/s)": 0.137827
},
{
"epoch": 2.275787772690547,
"grad_norm": 0.6698100566864014,
"learning_rate": 1.3391296188600594e-06,
"loss": 0.10843292474746705,
"memory(GiB)": 31.51,
"step": 530,
"token_acc": 0.9629225092250923,
"train_speed(iter/s)": 0.137941
},
{
"epoch": 2.297333692431996,
"grad_norm": 0.6693587303161621,
"learning_rate": 1.2632080336330532e-06,
"loss": 0.11362366676330567,
"memory(GiB)": 31.51,
"step": 535,
"token_acc": 0.9621976353183642,
"train_speed(iter/s)": 0.138116
},
{
"epoch": 2.3188796121734447,
"grad_norm": 0.6858277320861816,
"learning_rate": 1.1891897243618184e-06,
"loss": 0.10754673480987549,
"memory(GiB)": 31.51,
"step": 540,
"token_acc": 0.9638930030070464,
"train_speed(iter/s)": 0.138279
},
{
"epoch": 2.3188796121734447,
"eval_loss": 0.2651301622390747,
"eval_runtime": 9.1447,
"eval_samples_per_second": 16.403,
"eval_steps_per_second": 4.155,
"eval_token_acc": 0.91620533467539,
"step": 540
},
{
"epoch": 2.3404255319148937,
"grad_norm": 0.7329075932502747,
"learning_rate": 1.1171123911028692e-06,
"loss": 0.10752699375152588,
"memory(GiB)": 31.51,
"step": 545,
"token_acc": 0.946654961925566,
"train_speed(iter/s)": 0.137811
},
{
"epoch": 2.3619714516563426,
"grad_norm": 0.7588092684745789,
"learning_rate": 1.047012745307255e-06,
"loss": 0.10413261651992797,
"memory(GiB)": 31.51,
"step": 550,
"token_acc": 0.9638513608403786,
"train_speed(iter/s)": 0.137959
},
{
"epoch": 2.3835173713977915,
"grad_norm": 0.6776463389396667,
"learning_rate": 9.789264911221546e-07,
"loss": 0.11203373670578003,
"memory(GiB)": 31.51,
"step": 555,
"token_acc": 0.9539418840061927,
"train_speed(iter/s)": 0.138182
},
{
"epoch": 2.4050632911392404,
"grad_norm": 0.6233177185058594,
"learning_rate": 9.128883072055411e-07,
"loss": 0.10640518665313721,
"memory(GiB)": 31.51,
"step": 560,
"token_acc": 0.9605239362389232,
"train_speed(iter/s)": 0.138441
},
{
"epoch": 2.4050632911392404,
"eval_loss": 0.26431551575660706,
"eval_runtime": 9.1559,
"eval_samples_per_second": 16.383,
"eval_steps_per_second": 4.15,
"eval_token_acc": 0.9158056780840167,
"step": 560
},
{
"epoch": 2.4266092108806894,
"grad_norm": 0.6891351938247681,
"learning_rate": 8.489318290631454e-07,
"loss": 0.11017493009567261,
"memory(GiB)": 31.51,
"step": 565,
"token_acc": 0.9501483222252186,
"train_speed(iter/s)": 0.137996
},
{
"epoch": 2.4481551306221383,
"grad_norm": 0.685417890548706,
"learning_rate": 7.870896319167548e-07,
"loss": 0.10502817630767822,
"memory(GiB)": 31.51,
"step": 570,
"token_acc": 0.9675666865866247,
"train_speed(iter/s)": 0.138123
},
{
"epoch": 2.4697010503635872,
"grad_norm": 0.8273110389709473,
"learning_rate": 7.273932141125256e-07,
"loss": 0.11376097202301025,
"memory(GiB)": 31.51,
"step": 575,
"token_acc": 0.9588286984389538,
"train_speed(iter/s)": 0.138286
},
{
"epoch": 2.4912469701050366,
"grad_norm": 0.7995973825454712,
"learning_rate": 6.698729810778065e-07,
"loss": 0.1191399335861206,
"memory(GiB)": 31.51,
"step": 580,
"token_acc": 0.9582757592998997,
"train_speed(iter/s)": 0.138394
},
{
"epoch": 2.4912469701050366,
"eval_loss": 0.26525548100471497,
"eval_runtime": 9.1367,
"eval_samples_per_second": 16.417,
"eval_steps_per_second": 4.159,
"eval_token_acc": 0.9161461262914828,
"step": 580
},
{
"epoch": 2.5127928898464855,
"grad_norm": 0.6976614594459534,
"learning_rate": 6.145582298346153e-07,
"loss": 0.10850718021392822,
"memory(GiB)": 31.51,
"step": 585,
"token_acc": 0.9468696569536905,
"train_speed(iter/s)": 0.137863
},
{
"epoch": 2.5343388095879344,
"grad_norm": 0.7127689123153687,
"learning_rate": 5.614771340776559e-07,
"loss": 0.1049992561340332,
"memory(GiB)": 31.51,
"step": 590,
"token_acc": 0.9636775106082037,
"train_speed(iter/s)": 0.137998
},
{
"epoch": 2.5558847293293834,
"grad_norm": 0.7365370392799377,
"learning_rate": 5.106567298245008e-07,
"loss": 0.11682652235031128,
"memory(GiB)": 31.51,
"step": 595,
"token_acc": 0.9585528403681371,
"train_speed(iter/s)": 0.138108
},
{
"epoch": 2.5774306490708323,
"grad_norm": 0.6995398998260498,
"learning_rate": 4.6212290164521554e-07,
"loss": 0.10941903591156006,
"memory(GiB)": 31.51,
"step": 600,
"token_acc": 0.9588730068630993,
"train_speed(iter/s)": 0.138228
},
{
"epoch": 2.5774306490708323,
"eval_loss": 0.2648448944091797,
"eval_runtime": 9.1555,
"eval_samples_per_second": 16.384,
"eval_steps_per_second": 4.15,
"eval_token_acc": 0.91620533467539,
"step": 600
},
{
"epoch": 2.5989765688122812,
"grad_norm": 0.6558308601379395,
"learning_rate": 4.159003694784647e-07,
"loss": 0.09994454979896546,
"memory(GiB)": 31.51,
"step": 605,
"token_acc": 0.9488222044057573,
"train_speed(iter/s)": 0.137823
},
{
"epoch": 2.62052248855373,
"grad_norm": 0.5623044371604919,
"learning_rate": 3.7201267604080436e-07,
"loss": 0.10503623485565186,
"memory(GiB)": 31.51,
"step": 610,
"token_acc": 0.9605492530908896,
"train_speed(iter/s)": 0.137935
},
{
"epoch": 2.642068408295179,
"grad_norm": 0.7411386966705322,
"learning_rate": 3.3048217483556743e-07,
"loss": 0.10335917472839355,
"memory(GiB)": 31.51,
"step": 615,
"token_acc": 0.9596229517824632,
"train_speed(iter/s)": 0.138049
},
{
"epoch": 2.663614328036628,
"grad_norm": 0.7703331112861633,
"learning_rate": 2.9133001876746004e-07,
"loss": 0.11330341100692749,
"memory(GiB)": 31.51,
"step": 620,
"token_acc": 0.9627358888545153,
"train_speed(iter/s)": 0.138195
},
{
"epoch": 2.663614328036628,
"eval_loss": 0.2645653188228607,
"eval_runtime": 9.1574,
"eval_samples_per_second": 16.38,
"eval_steps_per_second": 4.15,
"eval_token_acc": 0.9161461262914828,
"step": 620
},
{
"epoch": 2.685160247778077,
"grad_norm": 0.7208820581436157,
"learning_rate": 2.545761493686666e-07,
"loss": 0.10512195825576783,
"memory(GiB)": 31.51,
"step": 625,
"token_acc": 0.9453959214438257,
"train_speed(iter/s)": 0.137691
},
{
"epoch": 2.706706167519526,
"grad_norm": 0.792917013168335,
"learning_rate": 2.2023928664194229e-07,
"loss": 0.10448248386383056,
"memory(GiB)": 31.51,
"step": 630,
"token_acc": 0.9634454263743831,
"train_speed(iter/s)": 0.137865
},
{
"epoch": 2.728252087260975,
"grad_norm": 0.7074964046478271,
"learning_rate": 1.8833691952587829e-07,
"loss": 0.10274065732955932,
"memory(GiB)": 31.51,
"step": 635,
"token_acc": 0.9614498168320434,
"train_speed(iter/s)": 0.137973
},
{
"epoch": 2.7497980070024237,
"grad_norm": 0.695501446723938,
"learning_rate": 1.5888529698718347e-07,
"loss": 0.111275053024292,
"memory(GiB)": 31.51,
"step": 640,
"token_acc": 0.9627026215729437,
"train_speed(iter/s)": 0.13808
},
{
"epoch": 2.7497980070024237,
"eval_loss": 0.26474642753601074,
"eval_runtime": 9.1444,
"eval_samples_per_second": 16.403,
"eval_steps_per_second": 4.156,
"eval_token_acc": 0.9162349388673436,
"step": 640
},
{
"epoch": 2.7713439267438726,
"grad_norm": 0.7010114192962646,
"learning_rate": 1.3189941974453502e-07,
"loss": 0.11862779855728149,
"memory(GiB)": 31.51,
"step": 645,
"token_acc": 0.9463942439720986,
"train_speed(iter/s)": 0.13766
},
{
"epoch": 2.7928898464853216,
"grad_norm": 0.7053817510604858,
"learning_rate": 1.0739303262819301e-07,
"loss": 0.10773177146911621,
"memory(GiB)": 31.51,
"step": 650,
"token_acc": 0.9672267425750056,
"train_speed(iter/s)": 0.137765
},
{
"epoch": 2.814435766226771,
"grad_norm": 0.7381494641304016,
"learning_rate": 8.537861757929422e-08,
"loss": 0.10787509679794312,
"memory(GiB)": 31.51,
"step": 655,
"token_acc": 0.9632690990902866,
"train_speed(iter/s)": 0.137917
},
{
"epoch": 2.83598168596822,
"grad_norm": 0.7262890934944153,
"learning_rate": 6.58673872923693e-08,
"loss": 0.11206209659576416,
"memory(GiB)": 31.51,
"step": 660,
"token_acc": 0.965990990990991,
"train_speed(iter/s)": 0.138079
},
{
"epoch": 2.83598168596822,
"eval_loss": 0.2648203372955322,
"eval_runtime": 9.1399,
"eval_samples_per_second": 16.412,
"eval_steps_per_second": 4.158,
"eval_token_acc": 0.916279345155274,
"step": 660
},
{
"epoch": 2.857527605709669,
"grad_norm": 0.7241911888122559,
"learning_rate": 4.88692795043344e-08,
"loss": 0.10918653011322021,
"memory(GiB)": 31.51,
"step": 665,
"token_acc": 0.9504457917261055,
"train_speed(iter/s)": 0.137668
},
{
"epoch": 2.8790735254511177,
"grad_norm": 0.7945267558097839,
"learning_rate": 3.439295193286174e-08,
"loss": 0.11153676509857177,
"memory(GiB)": 31.51,
"step": 670,
"token_acc": 0.9583095218657305,
"train_speed(iter/s)": 0.13784
},
{
"epoch": 2.9006194451925666,
"grad_norm": 0.7374799847602844,
"learning_rate": 2.2445777866709208e-08,
"loss": 0.10855717658996582,
"memory(GiB)": 31.51,
"step": 675,
"token_acc": 0.9629715143294179,
"train_speed(iter/s)": 0.13794
},
{
"epoch": 2.9221653649340156,
"grad_norm": 0.6748504042625427,
"learning_rate": 1.3033842410251074e-08,
"loss": 0.11381592750549316,
"memory(GiB)": 31.51,
"step": 680,
"token_acc": 0.9596367864459332,
"train_speed(iter/s)": 0.138055
},
{
"epoch": 2.9221653649340156,
"eval_loss": 0.26483333110809326,
"eval_runtime": 9.1492,
"eval_samples_per_second": 16.395,
"eval_steps_per_second": 4.153,
"eval_token_acc": 0.9160425116196453,
"step": 680
},
{
"epoch": 2.9437112846754645,
"grad_norm": 0.7286980748176575,
"learning_rate": 6.16193938412557e-09,
"loss": 0.10510704517364503,
"memory(GiB)": 31.51,
"step": 685,
"token_acc": 0.9509096674461929,
"train_speed(iter/s)": 0.137639
},
{
"epoch": 2.9652572044169134,
"grad_norm": 0.6892649531364441,
"learning_rate": 1.8335688835802169e-09,
"loss": 0.105083167552948,
"memory(GiB)": 31.51,
"step": 690,
"token_acc": 0.9589703497799398,
"train_speed(iter/s)": 0.137777
},
{
"epoch": 2.9868031241583624,
"grad_norm": 0.7908564209938049,
"learning_rate": 5.093549575119205e-11,
"loss": 0.10409483909606934,
"memory(GiB)": 31.51,
"step": 695,
"token_acc": 0.9638513775207209,
"train_speed(iter/s)": 0.137894
},
{
"epoch": 2.9911123081066524,
"eval_loss": 0.26468953490257263,
"eval_runtime": 9.1426,
"eval_samples_per_second": 16.407,
"eval_steps_per_second": 4.156,
"eval_token_acc": 0.9161757304834365,
"step": 696
}
],
"logging_steps": 5,
"max_steps": 696,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.392166013990339e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}