Files
openthaigpt-thaillm-8b-inst…/trainer_state.json

2485 lines
70 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100.0,
"global_step": 1224,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024554941682013503,
"grad_norm": 1.7764384203008543,
"learning_rate": 4.065040650406504e-09,
"loss": 0.6458772420883179,
"memory(GiB)": 26.71,
"step": 1,
"token_acc": 0.8,
"train_speed(iter/s)": 0.059991
},
{
"epoch": 0.012277470841006752,
"grad_norm": 1.5484946901004182,
"learning_rate": 2.032520325203252e-08,
"loss": 0.5884945392608643,
"memory(GiB)": 52.53,
"step": 5,
"token_acc": 0.8317063859955282,
"train_speed(iter/s)": 0.107624
},
{
"epoch": 0.024554941682013505,
"grad_norm": 1.4563640400430145,
"learning_rate": 4.065040650406504e-08,
"loss": 0.5743718624114991,
"memory(GiB)": 52.53,
"step": 10,
"token_acc": 0.8498605726722438,
"train_speed(iter/s)": 0.112877
},
{
"epoch": 0.03683241252302026,
"grad_norm": 1.4140197884092542,
"learning_rate": 6.097560975609756e-08,
"loss": 0.590625,
"memory(GiB)": 52.53,
"step": 15,
"token_acc": 0.8347938144329897,
"train_speed(iter/s)": 0.117218
},
{
"epoch": 0.04910988336402701,
"grad_norm": 1.6622234178073612,
"learning_rate": 8.130081300813008e-08,
"loss": 0.5686985969543457,
"memory(GiB)": 52.53,
"step": 20,
"token_acc": 0.8420736417474894,
"train_speed(iter/s)": 0.116503
},
{
"epoch": 0.061387354205033766,
"grad_norm": 1.5449639237910855,
"learning_rate": 1.016260162601626e-07,
"loss": 0.5720160961151123,
"memory(GiB)": 52.53,
"step": 25,
"token_acc": 0.8329814975785421,
"train_speed(iter/s)": 0.121226
},
{
"epoch": 0.07366482504604052,
"grad_norm": 1.4350414462950623,
"learning_rate": 1.219512195121951e-07,
"loss": 0.5950272083282471,
"memory(GiB)": 56.59,
"step": 30,
"token_acc": 0.8376042390548992,
"train_speed(iter/s)": 0.1248
},
{
"epoch": 0.08594229588704727,
"grad_norm": 1.6390155212777497,
"learning_rate": 1.4227642276422763e-07,
"loss": 0.5760597229003906,
"memory(GiB)": 56.59,
"step": 35,
"token_acc": 0.8459589739981,
"train_speed(iter/s)": 0.12788
},
{
"epoch": 0.09821976672805402,
"grad_norm": 1.3096137406039592,
"learning_rate": 1.6260162601626016e-07,
"loss": 0.5555302143096924,
"memory(GiB)": 56.59,
"step": 40,
"token_acc": 0.8554021379485874,
"train_speed(iter/s)": 0.12593
},
{
"epoch": 0.11049723756906077,
"grad_norm": 1.422446577039075,
"learning_rate": 1.8292682926829268e-07,
"loss": 0.5970460891723632,
"memory(GiB)": 57.67,
"step": 45,
"token_acc": 0.8417625280973159,
"train_speed(iter/s)": 0.126605
},
{
"epoch": 0.12277470841006753,
"grad_norm": 1.2819764069439123,
"learning_rate": 2.032520325203252e-07,
"loss": 0.5884549140930175,
"memory(GiB)": 57.67,
"step": 50,
"token_acc": 0.8406445511421504,
"train_speed(iter/s)": 0.129975
},
{
"epoch": 0.13505217925107427,
"grad_norm": 1.41910630937099,
"learning_rate": 2.235772357723577e-07,
"loss": 0.6231883049011231,
"memory(GiB)": 57.67,
"step": 55,
"token_acc": 0.829693936306861,
"train_speed(iter/s)": 0.132967
},
{
"epoch": 0.14732965009208104,
"grad_norm": 1.2531126371792276,
"learning_rate": 2.439024390243902e-07,
"loss": 0.5942145347595215,
"memory(GiB)": 57.67,
"step": 60,
"token_acc": 0.8500459277403551,
"train_speed(iter/s)": 0.133823
},
{
"epoch": 0.1596071209330878,
"grad_norm": 1.1301268691981066,
"learning_rate": 2.6422764227642274e-07,
"loss": 0.5744296073913574,
"memory(GiB)": 57.67,
"step": 65,
"token_acc": 0.8480531499966947,
"train_speed(iter/s)": 0.134364
},
{
"epoch": 0.17188459177409454,
"grad_norm": 1.0132293794887546,
"learning_rate": 2.8455284552845527e-07,
"loss": 0.5759846210479737,
"memory(GiB)": 57.67,
"step": 70,
"token_acc": 0.8393322326178683,
"train_speed(iter/s)": 0.137087
},
{
"epoch": 0.1841620626151013,
"grad_norm": 1.1217457637602928,
"learning_rate": 3.048780487804878e-07,
"loss": 0.56055588722229,
"memory(GiB)": 57.67,
"step": 75,
"token_acc": 0.8425447316103379,
"train_speed(iter/s)": 0.138299
},
{
"epoch": 0.19643953345610804,
"grad_norm": 0.8818994062981204,
"learning_rate": 3.252032520325203e-07,
"loss": 0.5674080848693848,
"memory(GiB)": 57.67,
"step": 80,
"token_acc": 0.8303922425089543,
"train_speed(iter/s)": 0.137434
},
{
"epoch": 0.2087170042971148,
"grad_norm": 0.8918960386763577,
"learning_rate": 3.4552845528455284e-07,
"loss": 0.559955358505249,
"memory(GiB)": 57.67,
"step": 85,
"token_acc": 0.8446934551423881,
"train_speed(iter/s)": 0.137418
},
{
"epoch": 0.22099447513812154,
"grad_norm": 1.0522794863647833,
"learning_rate": 3.6585365853658536e-07,
"loss": 0.5732513904571533,
"memory(GiB)": 57.67,
"step": 90,
"token_acc": 0.8489562749254482,
"train_speed(iter/s)": 0.137088
},
{
"epoch": 0.2332719459791283,
"grad_norm": 0.785438670663371,
"learning_rate": 3.861788617886179e-07,
"loss": 0.5366201877593995,
"memory(GiB)": 57.67,
"step": 95,
"token_acc": 0.8626246438746439,
"train_speed(iter/s)": 0.136376
},
{
"epoch": 0.24554941682013506,
"grad_norm": 1.0039103796923265,
"learning_rate": 4.065040650406504e-07,
"loss": 0.5319199085235595,
"memory(GiB)": 57.67,
"step": 100,
"token_acc": 0.8360549786720669,
"train_speed(iter/s)": 0.136236
},
{
"epoch": 0.2578268876611418,
"grad_norm": 0.8843990831220632,
"learning_rate": 4.268292682926829e-07,
"loss": 0.5205207347869873,
"memory(GiB)": 57.67,
"step": 105,
"token_acc": 0.8555977051133336,
"train_speed(iter/s)": 0.127767
},
{
"epoch": 0.27010435850214853,
"grad_norm": 0.7775914671814245,
"learning_rate": 4.471544715447154e-07,
"loss": 0.5021872520446777,
"memory(GiB)": 57.67,
"step": 110,
"token_acc": 0.8675163294567468,
"train_speed(iter/s)": 0.129151
},
{
"epoch": 0.2823818293431553,
"grad_norm": 0.8595659824667936,
"learning_rate": 4.674796747967479e-07,
"loss": 0.5380425930023194,
"memory(GiB)": 57.67,
"step": 115,
"token_acc": 0.8555190611506809,
"train_speed(iter/s)": 0.130605
},
{
"epoch": 0.2946593001841621,
"grad_norm": 0.9109047806477464,
"learning_rate": 4.878048780487804e-07,
"loss": 0.5206707000732422,
"memory(GiB)": 57.67,
"step": 120,
"token_acc": 0.8441454698339289,
"train_speed(iter/s)": 0.131631
},
{
"epoch": 0.3069367710251688,
"grad_norm": 0.8591216474047668,
"learning_rate": 4.999959290672028e-07,
"loss": 0.5150551795959473,
"memory(GiB)": 57.67,
"step": 125,
"token_acc": 0.8495000393669789,
"train_speed(iter/s)": 0.132508
},
{
"epoch": 0.3192142418661756,
"grad_norm": 0.8236784585400246,
"learning_rate": 4.999501325958186e-07,
"loss": 0.5071953773498535,
"memory(GiB)": 57.67,
"step": 130,
"token_acc": 0.8506227570192104,
"train_speed(iter/s)": 0.132915
},
{
"epoch": 0.3314917127071823,
"grad_norm": 0.954649941162109,
"learning_rate": 4.998534603397122e-07,
"loss": 0.49468369483947755,
"memory(GiB)": 57.67,
"step": 135,
"token_acc": 0.850890297573384,
"train_speed(iter/s)": 0.133509
},
{
"epoch": 0.3437691835481891,
"grad_norm": 0.7775368395525796,
"learning_rate": 4.997059319759163e-07,
"loss": 0.49853315353393557,
"memory(GiB)": 57.67,
"step": 140,
"token_acc": 0.8663300877509316,
"train_speed(iter/s)": 0.134979
},
{
"epoch": 0.3560466543891958,
"grad_norm": 0.8879804549594494,
"learning_rate": 4.995075775329056e-07,
"loss": 0.49777793884277344,
"memory(GiB)": 57.67,
"step": 145,
"token_acc": 0.8515488018702513,
"train_speed(iter/s)": 0.13598
},
{
"epoch": 0.3683241252302026,
"grad_norm": 0.8614455268040374,
"learning_rate": 4.992584373844852e-07,
"loss": 0.46718130111694334,
"memory(GiB)": 57.67,
"step": 150,
"token_acc": 0.870577384246449,
"train_speed(iter/s)": 0.137174
},
{
"epoch": 0.38060159607120936,
"grad_norm": 0.9466769688310654,
"learning_rate": 4.989585622415729e-07,
"loss": 0.46044120788574217,
"memory(GiB)": 57.67,
"step": 155,
"token_acc": 0.8734130199891951,
"train_speed(iter/s)": 0.137843
},
{
"epoch": 0.3928790669122161,
"grad_norm": 0.756694079775602,
"learning_rate": 4.986080131418763e-07,
"loss": 0.4397891044616699,
"memory(GiB)": 57.67,
"step": 160,
"token_acc": 0.8813499680102367,
"train_speed(iter/s)": 0.137964
},
{
"epoch": 0.40515653775322286,
"grad_norm": 0.7766777226622378,
"learning_rate": 4.982068614374703e-07,
"loss": 0.4335052490234375,
"memory(GiB)": 57.67,
"step": 165,
"token_acc": 0.8773235563703025,
"train_speed(iter/s)": 0.137421
},
{
"epoch": 0.4174340085942296,
"grad_norm": 0.7739408885073995,
"learning_rate": 4.977551887802731e-07,
"loss": 0.44889039993286134,
"memory(GiB)": 57.67,
"step": 170,
"token_acc": 0.8657095569839498,
"train_speed(iter/s)": 0.13711
},
{
"epoch": 0.42971147943523635,
"grad_norm": 0.9033077880962782,
"learning_rate": 4.972530871054263e-07,
"loss": 0.42515549659729,
"memory(GiB)": 57.67,
"step": 175,
"token_acc": 0.8809412679891093,
"train_speed(iter/s)": 0.136969
},
{
"epoch": 0.4419889502762431,
"grad_norm": 0.7409058426766315,
"learning_rate": 4.967006586125826e-07,
"loss": 0.4390419960021973,
"memory(GiB)": 57.67,
"step": 180,
"token_acc": 0.8779247640798035,
"train_speed(iter/s)": 0.1378
},
{
"epoch": 0.45426642111724985,
"grad_norm": 0.7709335380436733,
"learning_rate": 4.960980157451032e-07,
"loss": 0.4336841583251953,
"memory(GiB)": 57.67,
"step": 185,
"token_acc": 0.8787090057261843,
"train_speed(iter/s)": 0.138248
},
{
"epoch": 0.4665438919582566,
"grad_norm": 0.8002910130967711,
"learning_rate": 4.954452811671713e-07,
"loss": 0.4231499195098877,
"memory(GiB)": 57.67,
"step": 190,
"token_acc": 0.8717664903865863,
"train_speed(iter/s)": 0.138123
},
{
"epoch": 0.47882136279926335,
"grad_norm": 0.7671937721932319,
"learning_rate": 4.947425877388237e-07,
"loss": 0.4115544319152832,
"memory(GiB)": 57.67,
"step": 195,
"token_acc": 0.8822761322245893,
"train_speed(iter/s)": 0.138556
},
{
"epoch": 0.4910988336402701,
"grad_norm": 0.8718441420761184,
"learning_rate": 4.939900784889085e-07,
"loss": 0.4212639331817627,
"memory(GiB)": 57.67,
"step": 200,
"token_acc": 0.8792764857881137,
"train_speed(iter/s)": 0.13898
},
{
"epoch": 0.5033763044812769,
"grad_norm": 0.8688003112334609,
"learning_rate": 4.931879065859729e-07,
"loss": 0.3883807182312012,
"memory(GiB)": 57.67,
"step": 205,
"token_acc": 0.8808318569138615,
"train_speed(iter/s)": 0.133423
},
{
"epoch": 0.5156537753222836,
"grad_norm": 0.9144969667406203,
"learning_rate": 4.923362353070858e-07,
"loss": 0.3918790817260742,
"memory(GiB)": 57.67,
"step": 210,
"token_acc": 0.8826902804132406,
"train_speed(iter/s)": 0.133713
},
{
"epoch": 0.5279312461632903,
"grad_norm": 0.8097366879590759,
"learning_rate": 4.914352380046041e-07,
"loss": 0.3884381055831909,
"memory(GiB)": 57.67,
"step": 215,
"token_acc": 0.8870171589751625,
"train_speed(iter/s)": 0.133996
},
{
"epoch": 0.5402087170042971,
"grad_norm": 0.7538862741531127,
"learning_rate": 4.904850980708886e-07,
"loss": 0.3775317668914795,
"memory(GiB)": 57.67,
"step": 220,
"token_acc": 0.8937805494690713,
"train_speed(iter/s)": 0.133724
},
{
"epoch": 0.5524861878453039,
"grad_norm": 1.146276827778673,
"learning_rate": 4.894860089009741e-07,
"loss": 0.3711127519607544,
"memory(GiB)": 57.67,
"step": 225,
"token_acc": 0.8895364441547486,
"train_speed(iter/s)": 0.134362
},
{
"epoch": 0.5647636586863106,
"grad_norm": 0.8283210772807095,
"learning_rate": 4.884381738532069e-07,
"loss": 0.3519309043884277,
"memory(GiB)": 57.67,
"step": 230,
"token_acc": 0.8935041822388325,
"train_speed(iter/s)": 0.134198
},
{
"epoch": 0.5770411295273173,
"grad_norm": 1.004769745418558,
"learning_rate": 4.87341806207851e-07,
"loss": 0.3587208271026611,
"memory(GiB)": 57.67,
"step": 235,
"token_acc": 0.9019401835119423,
"train_speed(iter/s)": 0.134025
},
{
"epoch": 0.5893186003683242,
"grad_norm": 0.8182191455973747,
"learning_rate": 4.861971291236771e-07,
"loss": 0.3467154026031494,
"memory(GiB)": 57.67,
"step": 240,
"token_acc": 0.8995148555664775,
"train_speed(iter/s)": 0.134306
},
{
"epoch": 0.6015960712093309,
"grad_norm": 1.5271995294368055,
"learning_rate": 4.850043755925397e-07,
"loss": 0.33415584564208983,
"memory(GiB)": 57.67,
"step": 245,
"token_acc": 0.8970737022336368,
"train_speed(iter/s)": 0.134727
},
{
"epoch": 0.6138735420503376,
"grad_norm": 0.875242766730362,
"learning_rate": 4.837637883919528e-07,
"loss": 0.3291849374771118,
"memory(GiB)": 57.67,
"step": 250,
"token_acc": 0.9039310639510041,
"train_speed(iter/s)": 0.13542
},
{
"epoch": 0.6261510128913443,
"grad_norm": 0.8014571513700163,
"learning_rate": 4.824756200356748e-07,
"loss": 0.32892580032348634,
"memory(GiB)": 57.67,
"step": 255,
"token_acc": 0.9007355946056396,
"train_speed(iter/s)": 0.135787
},
{
"epoch": 0.6384284837323512,
"grad_norm": 0.9358345370809253,
"learning_rate": 4.811401327223103e-07,
"loss": 0.3249573469161987,
"memory(GiB)": 57.67,
"step": 260,
"token_acc": 0.9081637062967285,
"train_speed(iter/s)": 0.13572
},
{
"epoch": 0.6507059545733579,
"grad_norm": 0.8184870146688139,
"learning_rate": 4.797575982819412e-07,
"loss": 0.31554522514343264,
"memory(GiB)": 57.67,
"step": 265,
"token_acc": 0.9049698848226551,
"train_speed(iter/s)": 0.135725
},
{
"epoch": 0.6629834254143646,
"grad_norm": 0.8945211680671034,
"learning_rate": 4.783282981207979e-07,
"loss": 0.3033627510070801,
"memory(GiB)": 57.67,
"step": 270,
"token_acc": 0.9165617767672256,
"train_speed(iter/s)": 0.135173
},
{
"epoch": 0.6752608962553714,
"grad_norm": 1.4362312397197503,
"learning_rate": 4.768525231639802e-07,
"loss": 0.2961107730865479,
"memory(GiB)": 57.67,
"step": 275,
"token_acc": 0.9266585849680405,
"train_speed(iter/s)": 0.135236
},
{
"epoch": 0.6875383670963782,
"grad_norm": 0.7600567933058701,
"learning_rate": 4.753305737962418e-07,
"loss": 0.2920267581939697,
"memory(GiB)": 57.67,
"step": 280,
"token_acc": 0.9217091715507683,
"train_speed(iter/s)": 0.135403
},
{
"epoch": 0.6998158379373849,
"grad_norm": 0.8650286740134839,
"learning_rate": 4.7376275980084856e-07,
"loss": 0.2840526819229126,
"memory(GiB)": 57.67,
"step": 285,
"token_acc": 0.9261834939254294,
"train_speed(iter/s)": 0.135709
},
{
"epoch": 0.7120933087783916,
"grad_norm": 0.9178664296409083,
"learning_rate": 4.721494002965243e-07,
"loss": 0.2752720355987549,
"memory(GiB)": 57.67,
"step": 290,
"token_acc": 0.9174185126886014,
"train_speed(iter/s)": 0.136139
},
{
"epoch": 0.7243707796193984,
"grad_norm": 0.9722910316739173,
"learning_rate": 4.70490823672496e-07,
"loss": 0.26680717468261717,
"memory(GiB)": 57.67,
"step": 295,
"token_acc": 0.9065665385958784,
"train_speed(iter/s)": 0.136569
},
{
"epoch": 0.7366482504604052,
"grad_norm": 0.8535970757124566,
"learning_rate": 4.6878736752165216e-07,
"loss": 0.26862516403198244,
"memory(GiB)": 57.67,
"step": 300,
"token_acc": 0.9236588470631024,
"train_speed(iter/s)": 0.136851
},
{
"epoch": 0.7489257213014119,
"grad_norm": 0.8086539218227536,
"learning_rate": 4.670393785718281e-07,
"loss": 0.26166937351226804,
"memory(GiB)": 57.67,
"step": 305,
"token_acc": 0.9295115530856976,
"train_speed(iter/s)": 0.132999
},
{
"epoch": 0.7612031921424187,
"grad_norm": 0.7359005917653992,
"learning_rate": 4.652472126152316e-07,
"loss": 0.27553093433380127,
"memory(GiB)": 57.67,
"step": 310,
"token_acc": 0.923441422964037,
"train_speed(iter/s)": 0.133358
},
{
"epoch": 0.7734806629834254,
"grad_norm": 0.7800276116485089,
"learning_rate": 4.634112344360237e-07,
"loss": 0.25064496994018554,
"memory(GiB)": 57.67,
"step": 315,
"token_acc": 0.9140353723835795,
"train_speed(iter/s)": 0.133677
},
{
"epoch": 0.7857581338244322,
"grad_norm": 0.7718671135037027,
"learning_rate": 4.615318177360689e-07,
"loss": 0.24835121631622314,
"memory(GiB)": 57.67,
"step": 320,
"token_acc": 0.9273986758008343,
"train_speed(iter/s)": 0.134016
},
{
"epoch": 0.7980356046654389,
"grad_norm": 0.9264394104378584,
"learning_rate": 4.596093450588707e-07,
"loss": 0.23845996856689453,
"memory(GiB)": 57.67,
"step": 325,
"token_acc": 0.9188875580176673,
"train_speed(iter/s)": 0.134392
},
{
"epoch": 0.8103130755064457,
"grad_norm": 0.7295759857048888,
"learning_rate": 4.5764420771170723e-07,
"loss": 0.2278268575668335,
"memory(GiB)": 57.67,
"step": 330,
"token_acc": 0.940601686668829,
"train_speed(iter/s)": 0.134767
},
{
"epoch": 0.8225905463474524,
"grad_norm": 0.7266088554592744,
"learning_rate": 4.556368056859832e-07,
"loss": 0.21920721530914306,
"memory(GiB)": 57.67,
"step": 335,
"token_acc": 0.9349947057933746,
"train_speed(iter/s)": 0.134994
},
{
"epoch": 0.8348680171884592,
"grad_norm": 0.7475457504296112,
"learning_rate": 4.5358754757581397e-07,
"loss": 0.2169396162033081,
"memory(GiB)": 57.67,
"step": 340,
"token_acc": 0.9349587340046944,
"train_speed(iter/s)": 0.135495
},
{
"epoch": 0.8471454880294659,
"grad_norm": 0.7207459452665155,
"learning_rate": 4.5149685049485877e-07,
"loss": 0.22667970657348632,
"memory(GiB)": 57.67,
"step": 345,
"token_acc": 0.9089569551995486,
"train_speed(iter/s)": 0.136023
},
{
"epoch": 0.8594229588704727,
"grad_norm": 0.7912001296106889,
"learning_rate": 4.4936513999142e-07,
"loss": 0.2154712438583374,
"memory(GiB)": 57.67,
"step": 350,
"token_acc": 0.9488812673526049,
"train_speed(iter/s)": 0.136042
},
{
"epoch": 0.8717004297114794,
"grad_norm": 1.7147841573520497,
"learning_rate": 4.471928499618255e-07,
"loss": 0.21075584888458251,
"memory(GiB)": 57.67,
"step": 355,
"token_acc": 0.9382304479442082,
"train_speed(iter/s)": 0.13587
},
{
"epoch": 0.8839779005524862,
"grad_norm": 0.7658381358366665,
"learning_rate": 4.449804225621116e-07,
"loss": 0.19444403648376465,
"memory(GiB)": 57.67,
"step": 360,
"token_acc": 0.9425833467547109,
"train_speed(iter/s)": 0.135745
},
{
"epoch": 0.896255371393493,
"grad_norm": 0.6912930219104488,
"learning_rate": 4.427283081180249e-07,
"loss": 0.1945898175239563,
"memory(GiB)": 57.67,
"step": 365,
"token_acc": 0.9307141169986616,
"train_speed(iter/s)": 0.136429
},
{
"epoch": 0.9085328422344997,
"grad_norm": 0.6755021225392699,
"learning_rate": 4.404369650333616e-07,
"loss": 0.1876620650291443,
"memory(GiB)": 57.67,
"step": 370,
"token_acc": 0.9437858236320268,
"train_speed(iter/s)": 0.136643
},
{
"epoch": 0.9208103130755064,
"grad_norm": 0.7098782582131411,
"learning_rate": 4.3810685969666203e-07,
"loss": 0.2034088134765625,
"memory(GiB)": 57.67,
"step": 375,
"token_acc": 0.9409318390075421,
"train_speed(iter/s)": 0.136522
},
{
"epoch": 0.9330877839165131,
"grad_norm": 0.6938412383830229,
"learning_rate": 4.357384663862803e-07,
"loss": 0.1925197124481201,
"memory(GiB)": 57.67,
"step": 380,
"token_acc": 0.934276273372018,
"train_speed(iter/s)": 0.136664
},
{
"epoch": 0.94536525475752,
"grad_norm": 0.5516763311313012,
"learning_rate": 4.3333226717384784e-07,
"loss": 0.18835780620574952,
"memory(GiB)": 57.67,
"step": 385,
"token_acc": 0.952319409185322,
"train_speed(iter/s)": 0.136364
},
{
"epoch": 0.9576427255985267,
"grad_norm": 0.5887901021890946,
"learning_rate": 4.308887518261507e-07,
"loss": 0.18153078556060792,
"memory(GiB)": 57.67,
"step": 390,
"token_acc": 0.9437745469578999,
"train_speed(iter/s)": 0.136159
},
{
"epoch": 0.9699201964395334,
"grad_norm": 0.6208318604275079,
"learning_rate": 4.2840841770544073e-07,
"loss": 0.16547969579696656,
"memory(GiB)": 57.67,
"step": 395,
"token_acc": 0.9472743181040058,
"train_speed(iter/s)": 0.136212
},
{
"epoch": 0.9821976672805403,
"grad_norm": 1.389928819023539,
"learning_rate": 4.258917696682006e-07,
"loss": 0.17939815521240235,
"memory(GiB)": 57.67,
"step": 400,
"token_acc": 0.9349780954576896,
"train_speed(iter/s)": 0.136576
},
{
"epoch": 0.994475138121547,
"grad_norm": 0.5148203046629826,
"learning_rate": 4.2333931996238316e-07,
"loss": 0.19017149209976197,
"memory(GiB)": 57.67,
"step": 405,
"token_acc": 0.9227554596926395,
"train_speed(iter/s)": 0.133387
},
{
"epoch": 1.0049109883364027,
"grad_norm": 0.4950490943119214,
"learning_rate": 4.2075158812314694e-07,
"loss": 0.16587586402893068,
"memory(GiB)": 57.67,
"step": 410,
"token_acc": 0.935228905768836,
"train_speed(iter/s)": 0.133808
},
{
"epoch": 1.0171884591774094,
"grad_norm": 0.5453509149451119,
"learning_rate": 4.1812910086710786e-07,
"loss": 0.17764878273010254,
"memory(GiB)": 57.67,
"step": 415,
"token_acc": 0.941226073024707,
"train_speed(iter/s)": 0.133857
},
{
"epoch": 1.0294659300184161,
"grad_norm": 0.6099408421859539,
"learning_rate": 4.1547239198512906e-07,
"loss": 0.17024999856948853,
"memory(GiB)": 57.67,
"step": 420,
"token_acc": 0.9455940130963517,
"train_speed(iter/s)": 0.133546
},
{
"epoch": 1.0417434008594229,
"grad_norm": 0.5544678384480342,
"learning_rate": 4.1278200223367186e-07,
"loss": 0.1932210922241211,
"memory(GiB)": 57.67,
"step": 425,
"token_acc": 0.941819772528434,
"train_speed(iter/s)": 0.133645
},
{
"epoch": 1.0540208717004298,
"grad_norm": 0.5826701192243031,
"learning_rate": 4.1005847922472737e-07,
"loss": 0.19101818799972534,
"memory(GiB)": 57.67,
"step": 430,
"token_acc": 0.9367067743530575,
"train_speed(iter/s)": 0.133833
},
{
"epoch": 1.0662983425414365,
"grad_norm": 0.5532014271248828,
"learning_rate": 4.0730237731435377e-07,
"loss": 0.1754150390625,
"memory(GiB)": 57.67,
"step": 435,
"token_acc": 0.9651971029990765,
"train_speed(iter/s)": 0.133965
},
{
"epoch": 1.0785758133824432,
"grad_norm": 0.5950560890667523,
"learning_rate": 4.0451425748984127e-07,
"loss": 0.17856969833374023,
"memory(GiB)": 57.67,
"step": 440,
"token_acc": 0.9385016513123587,
"train_speed(iter/s)": 0.133907
},
{
"epoch": 1.09085328422345,
"grad_norm": 0.5376654555480931,
"learning_rate": 4.016946872555251e-07,
"loss": 0.1833416700363159,
"memory(GiB)": 57.67,
"step": 445,
"token_acc": 0.959684329199549,
"train_speed(iter/s)": 0.134067
},
{
"epoch": 1.1031307550644567,
"grad_norm": 0.44444520282357075,
"learning_rate": 3.988442405172755e-07,
"loss": 0.17330591678619384,
"memory(GiB)": 57.67,
"step": 450,
"token_acc": 0.925875966441849,
"train_speed(iter/s)": 0.133897
},
{
"epoch": 1.1154082259054634,
"grad_norm": 0.5467733830966146,
"learning_rate": 3.9596349746568097e-07,
"loss": 0.187214457988739,
"memory(GiB)": 57.67,
"step": 455,
"token_acc": 0.9102694260054666,
"train_speed(iter/s)": 0.133805
},
{
"epoch": 1.1276856967464703,
"grad_norm": 0.5644574909855569,
"learning_rate": 3.930530444579556e-07,
"loss": 0.18247673511505128,
"memory(GiB)": 57.67,
"step": 460,
"token_acc": 0.9407660594101273,
"train_speed(iter/s)": 0.133717
},
{
"epoch": 1.139963167587477,
"grad_norm": 0.6185677880856536,
"learning_rate": 3.901134738985885e-07,
"loss": 0.19450093507766725,
"memory(GiB)": 57.67,
"step": 465,
"token_acc": 0.9278699743370402,
"train_speed(iter/s)": 0.133408
},
{
"epoch": 1.1522406384284838,
"grad_norm": 0.5271730386275215,
"learning_rate": 3.871453841187645e-07,
"loss": 0.1889647960662842,
"memory(GiB)": 57.67,
"step": 470,
"token_acc": 0.9411243259215484,
"train_speed(iter/s)": 0.133399
},
{
"epoch": 1.1645181092694905,
"grad_norm": 0.7131823190088369,
"learning_rate": 3.8414937925457706e-07,
"loss": 0.17877411842346191,
"memory(GiB)": 57.67,
"step": 475,
"token_acc": 0.9540759574129986,
"train_speed(iter/s)": 0.133589
},
{
"epoch": 1.1767955801104972,
"grad_norm": 0.5353529040202072,
"learning_rate": 3.8112606912406037e-07,
"loss": 0.17376744747161865,
"memory(GiB)": 57.67,
"step": 480,
"token_acc": 0.9406874176214134,
"train_speed(iter/s)": 0.13372
},
{
"epoch": 1.189073050951504,
"grad_norm": 0.4982526719045866,
"learning_rate": 3.780760691030646e-07,
"loss": 0.16717066764831542,
"memory(GiB)": 57.67,
"step": 485,
"token_acc": 0.9538624787775891,
"train_speed(iter/s)": 0.134087
},
{
"epoch": 1.2013505217925107,
"grad_norm": 0.46216176496672484,
"learning_rate": 3.75e-07,
"loss": 0.19427452087402344,
"memory(GiB)": 57.67,
"step": 490,
"token_acc": 0.93522816539313,
"train_speed(iter/s)": 0.134444
},
{
"epoch": 1.2136279926335174,
"grad_norm": 0.7604094532944367,
"learning_rate": 3.7189848792947536e-07,
"loss": 0.1537397861480713,
"memory(GiB)": 57.67,
"step": 495,
"token_acc": 0.9573288642516437,
"train_speed(iter/s)": 0.134597
},
{
"epoch": 1.2259054634745243,
"grad_norm": 0.549241195148398,
"learning_rate": 3.687721641848562e-07,
"loss": 0.1440601110458374,
"memory(GiB)": 57.67,
"step": 500,
"token_acc": 0.953042040212377,
"train_speed(iter/s)": 0.134604
},
{
"epoch": 1.238182934315531,
"grad_norm": 0.563515575244595,
"learning_rate": 3.6562166510976887e-07,
"loss": 0.1917360782623291,
"memory(GiB)": 57.67,
"step": 505,
"token_acc": 0.9542143600416233,
"train_speed(iter/s)": 0.13233
},
{
"epoch": 1.2504604051565378,
"grad_norm": 0.8281020221114317,
"learning_rate": 3.624476319685771e-07,
"loss": 0.189109206199646,
"memory(GiB)": 57.67,
"step": 510,
"token_acc": 0.9374545982856313,
"train_speed(iter/s)": 0.132765
},
{
"epoch": 1.2627378759975445,
"grad_norm": 0.5241233864753891,
"learning_rate": 3.592507108158563e-07,
"loss": 0.15444846153259278,
"memory(GiB)": 57.67,
"step": 515,
"token_acc": 0.9661171743001964,
"train_speed(iter/s)": 0.13301
},
{
"epoch": 1.2750153468385512,
"grad_norm": 0.5573427156559804,
"learning_rate": 3.560315523648932e-07,
"loss": 0.18322609663009642,
"memory(GiB)": 57.67,
"step": 520,
"token_acc": 0.9517766497461929,
"train_speed(iter/s)": 0.133378
},
{
"epoch": 1.287292817679558,
"grad_norm": 0.537977575567055,
"learning_rate": 3.5279081185523763e-07,
"loss": 0.18208487033843995,
"memory(GiB)": 57.67,
"step": 525,
"token_acc": 0.9459727287141272,
"train_speed(iter/s)": 0.133578
},
{
"epoch": 1.2995702885205649,
"grad_norm": 0.4953187670124203,
"learning_rate": 3.4952914891933225e-07,
"loss": 0.17269195318222047,
"memory(GiB)": 57.67,
"step": 530,
"token_acc": 0.942077971960822,
"train_speed(iter/s)": 0.133685
},
{
"epoch": 1.3118477593615716,
"grad_norm": 0.4701054790036289,
"learning_rate": 3.4624722744824874e-07,
"loss": 0.1993415355682373,
"memory(GiB)": 57.67,
"step": 535,
"token_acc": 0.942649839836363,
"train_speed(iter/s)": 0.133424
},
{
"epoch": 1.3241252302025783,
"grad_norm": 0.5249799204021143,
"learning_rate": 3.429457154565565e-07,
"loss": 0.18299152851104736,
"memory(GiB)": 57.67,
"step": 540,
"token_acc": 0.9312291707508332,
"train_speed(iter/s)": 0.133551
},
{
"epoch": 1.336402701043585,
"grad_norm": 0.5358390030505376,
"learning_rate": 3.396252849463529e-07,
"loss": 0.1694674849510193,
"memory(GiB)": 57.67,
"step": 545,
"token_acc": 0.9403699099709948,
"train_speed(iter/s)": 0.13334
},
{
"epoch": 1.3486801718845918,
"grad_norm": 0.47175661832523136,
"learning_rate": 3.362866117704815e-07,
"loss": 0.16650619506835937,
"memory(GiB)": 57.67,
"step": 550,
"token_acc": 0.966151256036507,
"train_speed(iter/s)": 0.132885
},
{
"epoch": 1.3609576427255985,
"grad_norm": 0.5566911020042059,
"learning_rate": 3.3293037549496597e-07,
"loss": 0.18229317665100098,
"memory(GiB)": 57.67,
"step": 555,
"token_acc": 0.9332179930795848,
"train_speed(iter/s)": 0.132956
},
{
"epoch": 1.3732351135666052,
"grad_norm": 0.8552156642127553,
"learning_rate": 3.295572592606891e-07,
"loss": 0.17464141845703124,
"memory(GiB)": 57.67,
"step": 560,
"token_acc": 0.9428267315441344,
"train_speed(iter/s)": 0.133102
},
{
"epoch": 1.385512584407612,
"grad_norm": 0.4054170592537424,
"learning_rate": 3.2616794964434356e-07,
"loss": 0.169390869140625,
"memory(GiB)": 57.67,
"step": 565,
"token_acc": 0.9302136041022855,
"train_speed(iter/s)": 0.133051
},
{
"epoch": 1.3977900552486187,
"grad_norm": 0.4737608855605009,
"learning_rate": 3.227631365186836e-07,
"loss": 0.16181081533432007,
"memory(GiB)": 57.67,
"step": 570,
"token_acc": 0.9568342208944884,
"train_speed(iter/s)": 0.132964
},
{
"epoch": 1.4100675260896256,
"grad_norm": 0.4918393822553509,
"learning_rate": 3.193435129121058e-07,
"loss": 0.1819918632507324,
"memory(GiB)": 57.67,
"step": 575,
"token_acc": 0.9572955270188744,
"train_speed(iter/s)": 0.133093
},
{
"epoch": 1.4223449969306323,
"grad_norm": 0.616287863577658,
"learning_rate": 3.159097748675873e-07,
"loss": 0.1604529619216919,
"memory(GiB)": 57.67,
"step": 580,
"token_acc": 0.9625401355690332,
"train_speed(iter/s)": 0.133223
},
{
"epoch": 1.434622467771639,
"grad_norm": 0.5677509895303791,
"learning_rate": 3.124626213010108e-07,
"loss": 0.1569218635559082,
"memory(GiB)": 57.67,
"step": 585,
"token_acc": 0.9455586360854067,
"train_speed(iter/s)": 0.133671
},
{
"epoch": 1.4468999386126458,
"grad_norm": 0.5032051690478044,
"learning_rate": 3.090027538589044e-07,
"loss": 0.169755220413208,
"memory(GiB)": 57.67,
"step": 590,
"token_acc": 0.9418753193663771,
"train_speed(iter/s)": 0.134002
},
{
"epoch": 1.4591774094536525,
"grad_norm": 0.5432224909432672,
"learning_rate": 3.055308767756261e-07,
"loss": 0.18236881494522095,
"memory(GiB)": 57.67,
"step": 595,
"token_acc": 0.9640907181856363,
"train_speed(iter/s)": 0.133845
},
{
"epoch": 1.4714548802946594,
"grad_norm": 0.5191917254492633,
"learning_rate": 3.0204769673002116e-07,
"loss": 0.16606335639953612,
"memory(GiB)": 57.67,
"step": 600,
"token_acc": 0.9707240443661637,
"train_speed(iter/s)": 0.134004
},
{
"epoch": 1.4837323511356661,
"grad_norm": 0.5703809476287016,
"learning_rate": 2.9855392270158206e-07,
"loss": 0.17542767524719238,
"memory(GiB)": 57.67,
"step": 605,
"token_acc": 0.953876582278481,
"train_speed(iter/s)": 0.132384
},
{
"epoch": 1.4960098219766729,
"grad_norm": 0.42427507061236897,
"learning_rate": 2.9505026582614024e-07,
"loss": 0.19279547929763793,
"memory(GiB)": 57.67,
"step": 610,
"token_acc": 0.9392731620710896,
"train_speed(iter/s)": 0.132484
},
{
"epoch": 1.5082872928176796,
"grad_norm": 0.5816813455731817,
"learning_rate": 2.915374392511184e-07,
"loss": 0.18373801708221435,
"memory(GiB)": 57.67,
"step": 615,
"token_acc": 0.9288226144586461,
"train_speed(iter/s)": 0.132603
},
{
"epoch": 1.5205647636586863,
"grad_norm": 0.4407507522526475,
"learning_rate": 2.8801615799037484e-07,
"loss": 0.16642086505889891,
"memory(GiB)": 57.67,
"step": 620,
"token_acc": 0.9481449252432561,
"train_speed(iter/s)": 0.132928
},
{
"epoch": 1.532842234499693,
"grad_norm": 0.482240995820186,
"learning_rate": 2.844871387786655e-07,
"loss": 0.16589756011962892,
"memory(GiB)": 57.67,
"step": 625,
"token_acc": 0.9455185772142983,
"train_speed(iter/s)": 0.132983
},
{
"epoch": 1.5451197053406998,
"grad_norm": 0.5533567457008911,
"learning_rate": 2.809510999257582e-07,
"loss": 0.19375090599060057,
"memory(GiB)": 57.67,
"step": 630,
"token_acc": 0.9351315128162171,
"train_speed(iter/s)": 0.132918
},
{
"epoch": 1.5573971761817065,
"grad_norm": 0.42397946477312154,
"learning_rate": 2.7740876117022493e-07,
"loss": 0.16124327182769777,
"memory(GiB)": 57.67,
"step": 635,
"token_acc": 0.9352771876927294,
"train_speed(iter/s)": 0.132949
},
{
"epoch": 1.5696746470227132,
"grad_norm": 0.46916218507444407,
"learning_rate": 2.7386084353294305e-07,
"loss": 0.1779846429824829,
"memory(GiB)": 57.67,
"step": 640,
"token_acc": 0.9482183060321404,
"train_speed(iter/s)": 0.132929
},
{
"epoch": 1.58195211786372,
"grad_norm": 0.4494446665243088,
"learning_rate": 2.703080691703365e-07,
"loss": 0.16666009426116943,
"memory(GiB)": 57.67,
"step": 645,
"token_acc": 0.958400417736391,
"train_speed(iter/s)": 0.133055
},
{
"epoch": 1.5942295887047269,
"grad_norm": 0.5003421798456534,
"learning_rate": 2.667511612273853e-07,
"loss": 0.17405877113342286,
"memory(GiB)": 57.67,
"step": 650,
"token_acc": 0.9476805681474766,
"train_speed(iter/s)": 0.133315
},
{
"epoch": 1.6065070595457336,
"grad_norm": 0.5798622931057178,
"learning_rate": 2.6319084369043403e-07,
"loss": 0.14733604192733765,
"memory(GiB)": 57.67,
"step": 655,
"token_acc": 0.985560657322378,
"train_speed(iter/s)": 0.133499
},
{
"epoch": 1.6187845303867403,
"grad_norm": 0.5265921861617535,
"learning_rate": 2.596278412398284e-07,
"loss": 0.16961886882781982,
"memory(GiB)": 57.67,
"step": 660,
"token_acc": 0.9309208573294544,
"train_speed(iter/s)": 0.133748
},
{
"epoch": 1.6310620012277472,
"grad_norm": 0.5302133218327372,
"learning_rate": 2.560628791024118e-07,
"loss": 0.17056363821029663,
"memory(GiB)": 57.67,
"step": 665,
"token_acc": 0.9468977792846245,
"train_speed(iter/s)": 0.133937
},
{
"epoch": 1.643339472068754,
"grad_norm": 0.5423907736532357,
"learning_rate": 2.5249668290390936e-07,
"loss": 0.16655545234680175,
"memory(GiB)": 57.67,
"step": 670,
"token_acc": 0.9424981219695417,
"train_speed(iter/s)": 0.134082
},
{
"epoch": 1.6556169429097607,
"grad_norm": 0.515033840244812,
"learning_rate": 2.489299785212319e-07,
"loss": 0.17368289232254028,
"memory(GiB)": 57.67,
"step": 675,
"token_acc": 0.9435146443514645,
"train_speed(iter/s)": 0.134347
},
{
"epoch": 1.6678944137507674,
"grad_norm": 0.48163213229590374,
"learning_rate": 2.4536349193472773e-07,
"loss": 0.16638292074203492,
"memory(GiB)": 57.67,
"step": 680,
"token_acc": 0.9301941049604601,
"train_speed(iter/s)": 0.134535
},
{
"epoch": 1.6801718845917741,
"grad_norm": 0.4779642715711826,
"learning_rate": 2.417979490804143e-07,
"loss": 0.14599368572235108,
"memory(GiB)": 57.67,
"step": 685,
"token_acc": 0.9522398399014779,
"train_speed(iter/s)": 0.134598
},
{
"epoch": 1.6924493554327809,
"grad_norm": 0.498486709572586,
"learning_rate": 2.382340757022181e-07,
"loss": 0.1666867971420288,
"memory(GiB)": 57.67,
"step": 690,
"token_acc": 0.9422738067877117,
"train_speed(iter/s)": 0.134551
},
{
"epoch": 1.7047268262737876,
"grad_norm": 0.5039713682487171,
"learning_rate": 2.3467259720425429e-07,
"loss": 0.17596899271011351,
"memory(GiB)": 57.67,
"step": 695,
"token_acc": 0.9558689717925387,
"train_speed(iter/s)": 0.134726
},
{
"epoch": 1.7170042971147943,
"grad_norm": 0.4391373433161239,
"learning_rate": 2.3111423850317508e-07,
"loss": 0.17754709720611572,
"memory(GiB)": 57.67,
"step": 700,
"token_acc": 0.957532017854908,
"train_speed(iter/s)": 0.134883
},
{
"epoch": 1.729281767955801,
"grad_norm": 0.49039848892922233,
"learning_rate": 2.2755972388061755e-07,
"loss": 0.16874098777770996,
"memory(GiB)": 57.67,
"step": 705,
"token_acc": 0.9599765892310462,
"train_speed(iter/s)": 0.133296
},
{
"epoch": 1.7415592387968077,
"grad_norm": 0.4774575002594326,
"learning_rate": 2.2400977683578092e-07,
"loss": 0.16588878631591797,
"memory(GiB)": 57.67,
"step": 710,
"token_acc": 0.9488679320361306,
"train_speed(iter/s)": 0.133523
},
{
"epoch": 1.7538367096378145,
"grad_norm": 0.5147564388228866,
"learning_rate": 2.204651199381623e-07,
"loss": 0.15819010734558106,
"memory(GiB)": 57.67,
"step": 715,
"token_acc": 0.9515534491837809,
"train_speed(iter/s)": 0.133748
},
{
"epoch": 1.7661141804788214,
"grad_norm": 0.5103960664604678,
"learning_rate": 2.1692647468048233e-07,
"loss": 0.17287697792053222,
"memory(GiB)": 57.67,
"step": 720,
"token_acc": 0.9395699944668405,
"train_speed(iter/s)": 0.133786
},
{
"epoch": 1.7783916513198281,
"grad_norm": 0.4774493976896547,
"learning_rate": 2.1339456133183043e-07,
"loss": 0.1602993369102478,
"memory(GiB)": 57.67,
"step": 725,
"token_acc": 0.9661698051492921,
"train_speed(iter/s)": 0.133866
},
{
"epoch": 1.7906691221608348,
"grad_norm": 0.5075630225471248,
"learning_rate": 2.0987009879105762e-07,
"loss": 0.18199481964111328,
"memory(GiB)": 57.67,
"step": 730,
"token_acc": 0.9596290705197326,
"train_speed(iter/s)": 0.133996
},
{
"epoch": 1.8029465930018416,
"grad_norm": 0.4811594763625998,
"learning_rate": 2.0635380444044999e-07,
"loss": 0.17754099369049073,
"memory(GiB)": 57.67,
"step": 735,
"token_acc": 0.9363772728935719,
"train_speed(iter/s)": 0.134141
},
{
"epoch": 1.8152240638428485,
"grad_norm": 0.5223831838925675,
"learning_rate": 2.028463939997093e-07,
"loss": 0.16622164249420165,
"memory(GiB)": 57.67,
"step": 740,
"token_acc": 0.9566500118962645,
"train_speed(iter/s)": 0.134232
},
{
"epoch": 1.8275015346838552,
"grad_norm": 0.4729423140315417,
"learning_rate": 1.9934858138027323e-07,
"loss": 0.1844787120819092,
"memory(GiB)": 57.67,
"step": 745,
"token_acc": 0.9339959225280327,
"train_speed(iter/s)": 0.134303
},
{
"epoch": 1.839779005524862,
"grad_norm": 0.4875394529058418,
"learning_rate": 1.9586107854000325e-07,
"loss": 0.1973895788192749,
"memory(GiB)": 57.67,
"step": 750,
"token_acc": 0.9558183961305532,
"train_speed(iter/s)": 0.13412
},
{
"epoch": 1.8520564763658687,
"grad_norm": 0.40978646980156164,
"learning_rate": 1.9238459533826938e-07,
"loss": 0.19609053134918214,
"memory(GiB)": 57.67,
"step": 755,
"token_acc": 0.934996003197442,
"train_speed(iter/s)": 0.134194
},
{
"epoch": 1.8643339472068754,
"grad_norm": 0.5408849293574589,
"learning_rate": 1.8891983939146369e-07,
"loss": 0.1738824725151062,
"memory(GiB)": 57.67,
"step": 760,
"token_acc": 0.9426776599921476,
"train_speed(iter/s)": 0.134248
},
{
"epoch": 1.8766114180478821,
"grad_norm": 0.471076880754224,
"learning_rate": 1.8546751592896853e-07,
"loss": 0.18387995958328246,
"memory(GiB)": 57.67,
"step": 765,
"token_acc": 0.9433991482771971,
"train_speed(iter/s)": 0.13421
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.504836493093865,
"learning_rate": 1.8202832764961198e-07,
"loss": 0.18205785751342773,
"memory(GiB)": 57.67,
"step": 770,
"token_acc": 0.9527401477832512,
"train_speed(iter/s)": 0.13428
},
{
"epoch": 1.9011663597298956,
"grad_norm": 0.4319777645328756,
"learning_rate": 1.7860297457863802e-07,
"loss": 0.16953612565994264,
"memory(GiB)": 57.67,
"step": 775,
"token_acc": 0.947322033898305,
"train_speed(iter/s)": 0.134434
},
{
"epoch": 1.9134438305709023,
"grad_norm": 0.4590442017829868,
"learning_rate": 1.7519215392522025e-07,
"loss": 0.15192935466766358,
"memory(GiB)": 57.67,
"step": 780,
"token_acc": 0.9487110114791482,
"train_speed(iter/s)": 0.134411
},
{
"epoch": 1.925721301411909,
"grad_norm": 0.46331279083008303,
"learning_rate": 1.717965599405501e-07,
"loss": 0.17549625635147095,
"memory(GiB)": 57.67,
"step": 785,
"token_acc": 0.9622470689421031,
"train_speed(iter/s)": 0.134696
},
{
"epoch": 1.937998772252916,
"grad_norm": 0.4358941046923003,
"learning_rate": 1.6841688377652552e-07,
"loss": 0.1650502562522888,
"memory(GiB)": 57.67,
"step": 790,
"token_acc": 0.9644233133863431,
"train_speed(iter/s)": 0.134849
},
{
"epoch": 1.9502762430939227,
"grad_norm": 0.36413602356289787,
"learning_rate": 1.6505381334507175e-07,
"loss": 0.16262125968933105,
"memory(GiB)": 57.67,
"step": 795,
"token_acc": 0.9436930827359039,
"train_speed(iter/s)": 0.135052
},
{
"epoch": 1.9625537139349294,
"grad_norm": 0.5119930810029533,
"learning_rate": 1.6170803317812136e-07,
"loss": 0.17920398712158203,
"memory(GiB)": 57.67,
"step": 800,
"token_acc": 0.9322690781581618,
"train_speed(iter/s)": 0.135111
},
{
"epoch": 1.974831184775936,
"grad_norm": 0.4825725506640307,
"learning_rate": 1.583802242882816e-07,
"loss": 0.1905304193496704,
"memory(GiB)": 57.67,
"step": 805,
"token_acc": 0.9357515085024685,
"train_speed(iter/s)": 0.13373
},
{
"epoch": 1.987108655616943,
"grad_norm": 0.4976072969687368,
"learning_rate": 1.5507106403021895e-07,
"loss": 0.1734859824180603,
"memory(GiB)": 57.67,
"step": 810,
"token_acc": 0.9405059337913804,
"train_speed(iter/s)": 0.133765
},
{
"epoch": 1.9993861264579498,
"grad_norm": 0.5349651167180693,
"learning_rate": 1.517812259627874e-07,
"loss": 0.17344932556152343,
"memory(GiB)": 57.67,
"step": 815,
"token_acc": 0.9323653962492437,
"train_speed(iter/s)": 0.133696
},
{
"epoch": 2.0098219766728054,
"grad_norm": 0.6057966476171509,
"learning_rate": 1.4851137971193018e-07,
"loss": 0.16537351608276368,
"memory(GiB)": 57.67,
"step": 820,
"token_acc": 0.9556328651806039,
"train_speed(iter/s)": 0.133772
},
{
"epoch": 2.022099447513812,
"grad_norm": 1.0192898362372844,
"learning_rate": 1.4526219083438153e-07,
"loss": 0.17631728649139405,
"memory(GiB)": 57.67,
"step": 825,
"token_acc": 0.9437896645512239,
"train_speed(iter/s)": 0.133754
},
{
"epoch": 2.034376918354819,
"grad_norm": 0.43605668597687935,
"learning_rate": 1.4203432068219616e-07,
"loss": 0.1445701837539673,
"memory(GiB)": 57.67,
"step": 830,
"token_acc": 0.9750953344946095,
"train_speed(iter/s)": 0.133755
},
{
"epoch": 2.0466543891958255,
"grad_norm": 0.4517333876137591,
"learning_rate": 1.3882842626813645e-07,
"loss": 0.15836387872695923,
"memory(GiB)": 57.67,
"step": 835,
"token_acc": 0.96793536040825,
"train_speed(iter/s)": 0.133783
},
{
"epoch": 2.0589318600368323,
"grad_norm": 0.5383308109264513,
"learning_rate": 1.3564516013194022e-07,
"loss": 0.18179185390472413,
"memory(GiB)": 57.67,
"step": 840,
"token_acc": 0.9378444703705193,
"train_speed(iter/s)": 0.133821
},
{
"epoch": 2.071209330877839,
"grad_norm": 0.4522142212035384,
"learning_rate": 1.3248517020750123e-07,
"loss": 0.16212983131408693,
"memory(GiB)": 57.67,
"step": 845,
"token_acc": 0.9407257155735858,
"train_speed(iter/s)": 0.13394
},
{
"epoch": 2.0834868017188457,
"grad_norm": 0.45710576973364,
"learning_rate": 1.2934909969098612e-07,
"loss": 0.1782787561416626,
"memory(GiB)": 57.67,
"step": 850,
"token_acc": 0.9515674953476165,
"train_speed(iter/s)": 0.133905
},
{
"epoch": 2.095764272559853,
"grad_norm": 0.492740078600037,
"learning_rate": 1.2623758690991567e-07,
"loss": 0.1520832061767578,
"memory(GiB)": 57.67,
"step": 855,
"token_acc": 0.9375085324232082,
"train_speed(iter/s)": 0.13385
},
{
"epoch": 2.1080417434008596,
"grad_norm": 0.4433975430592583,
"learning_rate": 1.2315126519323751e-07,
"loss": 0.18507776260375977,
"memory(GiB)": 57.67,
"step": 860,
"token_acc": 0.9329054289056818,
"train_speed(iter/s)": 0.133924
},
{
"epoch": 2.1203192142418663,
"grad_norm": 0.5301183452451113,
"learning_rate": 1.2009076274241567e-07,
"loss": 0.15922095775604247,
"memory(GiB)": 57.67,
"step": 865,
"token_acc": 0.948112669631657,
"train_speed(iter/s)": 0.134001
},
{
"epoch": 2.132596685082873,
"grad_norm": 0.5377164487351925,
"learning_rate": 1.1705670250356414e-07,
"loss": 0.17927762269973754,
"memory(GiB)": 57.67,
"step": 870,
"token_acc": 0.9462852794687328,
"train_speed(iter/s)": 0.13419
},
{
"epoch": 2.1448741559238798,
"grad_norm": 0.5286410675937543,
"learning_rate": 1.1404970204065056e-07,
"loss": 0.17566382884979248,
"memory(GiB)": 57.67,
"step": 875,
"token_acc": 0.9370597142669037,
"train_speed(iter/s)": 0.134182
},
{
"epoch": 2.1571516267648865,
"grad_norm": 0.40268668050343476,
"learning_rate": 1.110703734097942e-07,
"loss": 0.15375242233276368,
"memory(GiB)": 57.67,
"step": 880,
"token_acc": 0.9555922520753369,
"train_speed(iter/s)": 0.134177
},
{
"epoch": 2.169429097605893,
"grad_norm": 0.5959804911771396,
"learning_rate": 1.0811932303468649e-07,
"loss": 0.18795297145843506,
"memory(GiB)": 57.67,
"step": 885,
"token_acc": 0.93544177741149,
"train_speed(iter/s)": 0.134122
},
{
"epoch": 2.1817065684469,
"grad_norm": 0.4413780946022899,
"learning_rate": 1.0519715158315667e-07,
"loss": 0.16619727611541749,
"memory(GiB)": 57.67,
"step": 890,
"token_acc": 0.9495682081573142,
"train_speed(iter/s)": 0.134035
},
{
"epoch": 2.1939840392879066,
"grad_norm": 0.41173432463685106,
"learning_rate": 1.0230445384491002e-07,
"loss": 0.15455365180969238,
"memory(GiB)": 57.67,
"step": 895,
"token_acc": 0.9446943730362753,
"train_speed(iter/s)": 0.134118
},
{
"epoch": 2.2062615101289134,
"grad_norm": 0.48208299751179823,
"learning_rate": 9.944181861046186e-08,
"loss": 0.18413586616516114,
"memory(GiB)": 57.67,
"step": 900,
"token_acc": 0.9379831280223078,
"train_speed(iter/s)": 0.134153
},
{
"epoch": 2.21853898096992,
"grad_norm": 0.3966707114512702,
"learning_rate": 9.660982855129313e-08,
"loss": 0.15873076915740966,
"memory(GiB)": 57.67,
"step": 905,
"token_acc": 0.9590186155792517,
"train_speed(iter/s)": 0.132978
},
{
"epoch": 2.230816451810927,
"grad_norm": 0.4381866262273687,
"learning_rate": 9.380906010125136e-08,
"loss": 0.15625982284545897,
"memory(GiB)": 57.67,
"step": 910,
"token_acc": 0.9386312965272267,
"train_speed(iter/s)": 0.133045
},
{
"epoch": 2.2430939226519335,
"grad_norm": 0.5223822620159875,
"learning_rate": 9.104008333922076e-08,
"loss": 0.16865816116333007,
"memory(GiB)": 57.67,
"step": 915,
"token_acc": 0.9422254974207811,
"train_speed(iter/s)": 0.133004
},
{
"epoch": 2.2553713934929407,
"grad_norm": 0.38440187816263854,
"learning_rate": 8.830346187308649e-08,
"loss": 0.1423816680908203,
"memory(GiB)": 57.67,
"step": 920,
"token_acc": 0.9562720848056537,
"train_speed(iter/s)": 0.133076
},
{
"epoch": 2.267648864333947,
"grad_norm": 0.4592624600791522,
"learning_rate": 8.559975272501601e-08,
"loss": 0.16395586729049683,
"memory(GiB)": 57.67,
"step": 925,
"token_acc": 0.9564571607254534,
"train_speed(iter/s)": 0.133105
},
{
"epoch": 2.279926335174954,
"grad_norm": 0.43806629453653373,
"learning_rate": 8.29295062180802e-08,
"loss": 0.16589367389678955,
"memory(GiB)": 57.67,
"step": 930,
"token_acc": 0.9659481977902554,
"train_speed(iter/s)": 0.133066
},
{
"epoch": 2.292203806015961,
"grad_norm": 0.464370245760019,
"learning_rate": 8.029326586423907e-08,
"loss": 0.1606292724609375,
"memory(GiB)": 57.67,
"step": 935,
"token_acc": 0.9405248868778281,
"train_speed(iter/s)": 0.133054
},
{
"epoch": 2.3044812768569676,
"grad_norm": 0.492043728887271,
"learning_rate": 7.769156825371286e-08,
"loss": 0.16174919605255128,
"memory(GiB)": 57.67,
"step": 940,
"token_acc": 0.9457228709444296,
"train_speed(iter/s)": 0.133172
},
{
"epoch": 2.3167587476979743,
"grad_norm": 0.4103343451876262,
"learning_rate": 7.512494294576269e-08,
"loss": 0.1728949785232544,
"memory(GiB)": 57.67,
"step": 945,
"token_acc": 0.9441858719315367,
"train_speed(iter/s)": 0.133227
},
{
"epoch": 2.329036218538981,
"grad_norm": 0.4809983010327667,
"learning_rate": 7.25939123609022e-08,
"loss": 0.15887634754180907,
"memory(GiB)": 57.67,
"step": 950,
"token_acc": 0.9603744280737103,
"train_speed(iter/s)": 0.133368
},
{
"epoch": 2.3413136893799877,
"grad_norm": 0.37621027938026347,
"learning_rate": 7.009899167456185e-08,
"loss": 0.14414477348327637,
"memory(GiB)": 57.67,
"step": 955,
"token_acc": 0.9454514068703608,
"train_speed(iter/s)": 0.133409
},
{
"epoch": 2.3535911602209945,
"grad_norm": 0.47025720298842205,
"learning_rate": 6.764068871222825e-08,
"loss": 0.18474191427230835,
"memory(GiB)": 57.67,
"step": 960,
"token_acc": 0.933993399339934,
"train_speed(iter/s)": 0.133486
},
{
"epoch": 2.365868631062001,
"grad_norm": 0.48612512291843357,
"learning_rate": 6.521950384607974e-08,
"loss": 0.18921175003051757,
"memory(GiB)": 57.67,
"step": 965,
"token_acc": 0.9431688588154794,
"train_speed(iter/s)": 0.133588
},
{
"epoch": 2.378146101903008,
"grad_norm": 0.3806957752805889,
"learning_rate": 6.283592989313841e-08,
"loss": 0.1681033730506897,
"memory(GiB)": 57.67,
"step": 970,
"token_acc": 0.9367773677736777,
"train_speed(iter/s)": 0.13364
},
{
"epoch": 2.3904235727440146,
"grad_norm": 0.45577439700594125,
"learning_rate": 6.049045201496042e-08,
"loss": 0.15442556142807007,
"memory(GiB)": 57.67,
"step": 975,
"token_acc": 0.948759111419646,
"train_speed(iter/s)": 0.13374
},
{
"epoch": 2.4027010435850213,
"grad_norm": 0.4860898688565903,
"learning_rate": 5.818354761888444e-08,
"loss": 0.15901718139648438,
"memory(GiB)": 57.67,
"step": 980,
"token_acc": 0.9584408255401483,
"train_speed(iter/s)": 0.13388
},
{
"epoch": 2.414978514426028,
"grad_norm": 0.514344692664739,
"learning_rate": 5.5915686260858244e-08,
"loss": 0.17236262559890747,
"memory(GiB)": 57.67,
"step": 985,
"token_acc": 0.9471186187308468,
"train_speed(iter/s)": 0.133984
},
{
"epoch": 2.427255985267035,
"grad_norm": 0.5543092645451274,
"learning_rate": 5.368732954986388e-08,
"loss": 0.1594996929168701,
"memory(GiB)": 57.67,
"step": 990,
"token_acc": 0.9353687315634218,
"train_speed(iter/s)": 0.134028
},
{
"epoch": 2.439533456108042,
"grad_norm": 0.47905205598317496,
"learning_rate": 5.14989310539595e-08,
"loss": 0.17013013362884521,
"memory(GiB)": 57.67,
"step": 995,
"token_acc": 0.9312570646677375,
"train_speed(iter/s)": 0.134104
},
{
"epoch": 2.4518109269490487,
"grad_norm": 0.45866756604400005,
"learning_rate": 4.935093620795902e-08,
"loss": 0.16783492565155028,
"memory(GiB)": 57.67,
"step": 1000,
"token_acc": 0.9406372313396965,
"train_speed(iter/s)": 0.134234
},
{
"epoch": 2.4640883977900554,
"grad_norm": 0.45591647725979945,
"learning_rate": 4.7243782222766124e-08,
"loss": 0.15385560989379882,
"memory(GiB)": 57.67,
"step": 1005,
"token_acc": 0.9611999210578251,
"train_speed(iter/s)": 0.132972
},
{
"epoch": 2.476365868631062,
"grad_norm": 0.4717972783404294,
"learning_rate": 4.517789799638297e-08,
"loss": 0.1716939926147461,
"memory(GiB)": 57.67,
"step": 1010,
"token_acc": 0.9752226720647773,
"train_speed(iter/s)": 0.133203
},
{
"epoch": 2.488643339472069,
"grad_norm": 0.4386963670049322,
"learning_rate": 4.315370402661092e-08,
"loss": 0.16546686887741088,
"memory(GiB)": 57.67,
"step": 1015,
"token_acc": 0.9399607392866403,
"train_speed(iter/s)": 0.133178
},
{
"epoch": 2.5009208103130756,
"grad_norm": 0.5231430157385392,
"learning_rate": 4.1171612325460236e-08,
"loss": 0.18315892219543456,
"memory(GiB)": 57.67,
"step": 1020,
"token_acc": 0.9306804077180577,
"train_speed(iter/s)": 0.133198
},
{
"epoch": 2.5131982811540823,
"grad_norm": 0.5125599782547446,
"learning_rate": 3.9232026335288296e-08,
"loss": 0.168873929977417,
"memory(GiB)": 57.67,
"step": 1025,
"token_acc": 0.9482202118470701,
"train_speed(iter/s)": 0.133093
},
{
"epoch": 2.525475751995089,
"grad_norm": 0.4146726330452646,
"learning_rate": 3.733534084668091e-08,
"loss": 0.1465557336807251,
"memory(GiB)": 57.67,
"step": 1030,
"token_acc": 0.9542228126779275,
"train_speed(iter/s)": 0.133256
},
{
"epoch": 2.5377532228360957,
"grad_norm": 0.4928965284851222,
"learning_rate": 3.5481941918095396e-08,
"loss": 0.1535036325454712,
"memory(GiB)": 57.67,
"step": 1035,
"token_acc": 0.98864726574992,
"train_speed(iter/s)": 0.133212
},
{
"epoch": 2.5500306936771024,
"grad_norm": 0.46167944188273663,
"learning_rate": 3.367220679728089e-08,
"loss": 0.17861878871917725,
"memory(GiB)": 57.67,
"step": 1040,
"token_acc": 0.9435955137744546,
"train_speed(iter/s)": 0.133172
},
{
"epoch": 2.562308164518109,
"grad_norm": 0.5398332391068397,
"learning_rate": 3.190650384449167e-08,
"loss": 0.1772806763648987,
"memory(GiB)": 57.67,
"step": 1045,
"token_acc": 0.9553827261563651,
"train_speed(iter/s)": 0.133219
},
{
"epoch": 2.574585635359116,
"grad_norm": 0.5539788244071175,
"learning_rate": 3.018519245750989e-08,
"loss": 0.14356986284255982,
"memory(GiB)": 57.67,
"step": 1050,
"token_acc": 0.968132854578097,
"train_speed(iter/s)": 0.133317
},
{
"epoch": 2.5868631062001226,
"grad_norm": 0.4402330635590166,
"learning_rate": 2.850862299849241e-08,
"loss": 0.16220954656600953,
"memory(GiB)": 57.67,
"step": 1055,
"token_acc": 0.9482638506948264,
"train_speed(iter/s)": 0.133252
},
{
"epoch": 2.5991405770411298,
"grad_norm": 0.48620425405940176,
"learning_rate": 2.6877136722656734e-08,
"loss": 0.18278908729553223,
"memory(GiB)": 57.67,
"step": 1060,
"token_acc": 0.9325091336116911,
"train_speed(iter/s)": 0.133237
},
{
"epoch": 2.611418047882136,
"grad_norm": 0.43457144570186146,
"learning_rate": 2.5291065708820754e-08,
"loss": 0.17628798484802247,
"memory(GiB)": 57.67,
"step": 1065,
"token_acc": 0.9281233833419555,
"train_speed(iter/s)": 0.13325
},
{
"epoch": 2.623695518723143,
"grad_norm": 0.4143344415028038,
"learning_rate": 2.375073279180992e-08,
"loss": 0.15394517183303832,
"memory(GiB)": 57.67,
"step": 1070,
"token_acc": 0.9541889329425961,
"train_speed(iter/s)": 0.133392
},
{
"epoch": 2.63597298956415,
"grad_norm": 0.4688198080933725,
"learning_rate": 2.2256451496746653e-08,
"loss": 0.1712632417678833,
"memory(GiB)": 57.67,
"step": 1075,
"token_acc": 0.9350767303476355,
"train_speed(iter/s)": 0.133374
},
{
"epoch": 2.6482504604051567,
"grad_norm": 0.5198918873737135,
"learning_rate": 2.0808525975233805e-08,
"loss": 0.19560701847076417,
"memory(GiB)": 57.67,
"step": 1080,
"token_acc": 0.9272029474976973,
"train_speed(iter/s)": 0.133426
},
{
"epoch": 2.6605279312461634,
"grad_norm": 0.4846401676955354,
"learning_rate": 1.940725094344675e-08,
"loss": 0.17480210065841675,
"memory(GiB)": 57.67,
"step": 1085,
"token_acc": 0.9508643542545645,
"train_speed(iter/s)": 0.133534
},
{
"epoch": 2.67280540208717,
"grad_norm": 0.4514013424363714,
"learning_rate": 1.8052911622145866e-08,
"loss": 0.1648250102996826,
"memory(GiB)": 57.67,
"step": 1090,
"token_acc": 0.9454948354122735,
"train_speed(iter/s)": 0.133539
},
{
"epoch": 2.685082872928177,
"grad_norm": 0.4989521104653192,
"learning_rate": 1.6745783678621367e-08,
"loss": 0.16439478397369384,
"memory(GiB)": 57.67,
"step": 1095,
"token_acc": 0.9524301583201092,
"train_speed(iter/s)": 0.133571
},
{
"epoch": 2.6973603437691835,
"grad_norm": 0.49299845547327686,
"learning_rate": 1.5486133170583145e-08,
"loss": 0.16308257579803467,
"memory(GiB)": 57.67,
"step": 1100,
"token_acc": 0.9602804837874723,
"train_speed(iter/s)": 0.133601
},
{
"epoch": 2.7096378146101903,
"grad_norm": 0.38800040402764896,
"learning_rate": 1.4274216492006302e-08,
"loss": 0.20491249561309816,
"memory(GiB)": 57.67,
"step": 1105,
"token_acc": 0.9337728751954694,
"train_speed(iter/s)": 0.132264
},
{
"epoch": 2.721915285451197,
"grad_norm": 0.46254709159908636,
"learning_rate": 1.311028032094369e-08,
"loss": 0.1842280149459839,
"memory(GiB)": 57.67,
"step": 1110,
"token_acc": 0.9332923076923076,
"train_speed(iter/s)": 0.132279
},
{
"epoch": 2.7341927562922037,
"grad_norm": 0.4700426518953757,
"learning_rate": 1.1994561569316442e-08,
"loss": 0.15502922534942626,
"memory(GiB)": 57.67,
"step": 1115,
"token_acc": 0.9172778194837443,
"train_speed(iter/s)": 0.132416
},
{
"epoch": 2.7464702271332104,
"grad_norm": 0.4570084263374054,
"learning_rate": 1.0927287334691616e-08,
"loss": 0.17944493293762206,
"memory(GiB)": 57.67,
"step": 1120,
"token_acc": 0.9620748077674964,
"train_speed(iter/s)": 0.132397
},
{
"epoch": 2.758747697974217,
"grad_norm": 0.5565607166042048,
"learning_rate": 9.908674854058219e-09,
"loss": 0.16596771478652955,
"memory(GiB)": 57.67,
"step": 1125,
"token_acc": 0.946463347418703,
"train_speed(iter/s)": 0.132429
},
{
"epoch": 2.771025168815224,
"grad_norm": 0.43235518649936683,
"learning_rate": 8.938931459609806e-09,
"loss": 0.16782586574554442,
"memory(GiB)": 57.67,
"step": 1130,
"token_acc": 0.9357140571501712,
"train_speed(iter/s)": 0.132529
},
{
"epoch": 2.783302639656231,
"grad_norm": 0.5041786160999633,
"learning_rate": 8.018254536543451e-09,
"loss": 0.18278899192810058,
"memory(GiB)": 57.67,
"step": 1135,
"token_acc": 0.9538431826960216,
"train_speed(iter/s)": 0.132595
},
{
"epoch": 2.7955801104972373,
"grad_norm": 0.3988655252448663,
"learning_rate": 7.146831482883115e-09,
"loss": 0.16640629768371581,
"memory(GiB)": 57.67,
"step": 1140,
"token_acc": 0.9387376446378571,
"train_speed(iter/s)": 0.132687
},
{
"epoch": 2.8078575813382445,
"grad_norm": 0.5585703278010272,
"learning_rate": 6.32483967133593e-09,
"loss": 0.16575145721435547,
"memory(GiB)": 57.67,
"step": 1145,
"token_acc": 0.9446856625961103,
"train_speed(iter/s)": 0.132784
},
{
"epoch": 2.820135052179251,
"grad_norm": 0.5408177805631927,
"learning_rate": 5.5524464131893046e-09,
"loss": 0.16870219707489015,
"memory(GiB)": 57.67,
"step": 1150,
"token_acc": 0.9385546004457179,
"train_speed(iter/s)": 0.132784
},
{
"epoch": 2.832412523020258,
"grad_norm": 0.47090675834257434,
"learning_rate": 4.829808924255441e-09,
"loss": 0.1629176139831543,
"memory(GiB)": 57.67,
"step": 1155,
"token_acc": 0.9479655438055886,
"train_speed(iter/s)": 0.132829
},
{
"epoch": 2.8446899938612646,
"grad_norm": 0.482099713549635,
"learning_rate": 4.157074292871238e-09,
"loss": 0.15441689491271973,
"memory(GiB)": 57.67,
"step": 1160,
"token_acc": 0.9639929344626116,
"train_speed(iter/s)": 0.132784
},
{
"epoch": 2.8569674647022714,
"grad_norm": 0.4698231217223272,
"learning_rate": 3.5343794499594625e-09,
"loss": 0.18142955303192138,
"memory(GiB)": 57.67,
"step": 1165,
"token_acc": 0.9412326017570991,
"train_speed(iter/s)": 0.132762
},
{
"epoch": 2.869244935543278,
"grad_norm": 0.43804204402927915,
"learning_rate": 2.9618511411570455e-09,
"loss": 0.18446786403656007,
"memory(GiB)": 57.67,
"step": 1170,
"token_acc": 0.9350367684435257,
"train_speed(iter/s)": 0.132754
},
{
"epoch": 2.881522406384285,
"grad_norm": 0.5571992556185795,
"learning_rate": 2.4396059010170777e-09,
"loss": 0.1577387809753418,
"memory(GiB)": 57.67,
"step": 1175,
"token_acc": 0.9629287863590772,
"train_speed(iter/s)": 0.132933
},
{
"epoch": 2.8937998772252915,
"grad_norm": 0.5267456210442775,
"learning_rate": 1.967750029288756e-09,
"loss": 0.15231819152832032,
"memory(GiB)": 57.67,
"step": 1180,
"token_acc": 0.9535112359550562,
"train_speed(iter/s)": 0.133115
},
{
"epoch": 2.9060773480662982,
"grad_norm": 0.3910293202558737,
"learning_rate": 1.5463795692808034e-09,
"loss": 0.162775456905365,
"memory(GiB)": 57.67,
"step": 1185,
"token_acc": 0.931954924708307,
"train_speed(iter/s)": 0.133131
},
{
"epoch": 2.918354818907305,
"grad_norm": 0.4093866355363869,
"learning_rate": 1.1755802883124389e-09,
"loss": 0.16861215829849244,
"memory(GiB)": 57.67,
"step": 1190,
"token_acc": 0.9492351730924053,
"train_speed(iter/s)": 0.133244
},
{
"epoch": 2.9306322897483117,
"grad_norm": 0.5603512977054463,
"learning_rate": 8.554276602559807e-10,
"loss": 0.20310664176940918,
"memory(GiB)": 57.67,
"step": 1195,
"token_acc": 0.9434275032624414,
"train_speed(iter/s)": 0.133257
},
{
"epoch": 2.942909760589319,
"grad_norm": 0.5047283156733317,
"learning_rate": 5.859868501746079e-10,
"loss": 0.1582653284072876,
"memory(GiB)": 57.67,
"step": 1200,
"token_acc": 0.9450664007614559,
"train_speed(iter/s)": 0.133326
},
{
"epoch": 2.955187231430325,
"grad_norm": 0.3732689118202031,
"learning_rate": 3.6731270105844204e-10,
"loss": 0.17182209491729736,
"memory(GiB)": 57.67,
"step": 1205,
"token_acc": 0.9346191946856244,
"train_speed(iter/s)": 0.132213
},
{
"epoch": 2.9674647022713323,
"grad_norm": 0.513720762261806,
"learning_rate": 1.9944972266153214e-10,
"loss": 0.17291358709335328,
"memory(GiB)": 57.67,
"step": 1210,
"token_acc": 0.9477955520873976,
"train_speed(iter/s)": 0.132351
},
{
"epoch": 2.979742173112339,
"grad_norm": 0.4516334969279687,
"learning_rate": 8.243208244229637e-11,
"loss": 0.156210994720459,
"memory(GiB)": 57.67,
"step": 1215,
"token_acc": 0.9544438002153566,
"train_speed(iter/s)": 0.132457
},
{
"epoch": 2.9920196439533457,
"grad_norm": 0.4764485403289777,
"learning_rate": 1.628359860883499e-11,
"loss": 0.1759173631668091,
"memory(GiB)": 57.67,
"step": 1220,
"token_acc": 0.9492864815098971,
"train_speed(iter/s)": 0.132386
}
],
"logging_steps": 5,
"max_steps": 1224,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 169385433251840.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}