Model: openthaigpt/openthaigpt-thaillm-8b-instruct-v0.7.2-research-preview Source: Original Platform
2485 lines
70 KiB
JSON
2485 lines
70 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 100.0,
|
|
"global_step": 1224,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0024554941682013503,
|
|
"grad_norm": 1.7764384203008543,
|
|
"learning_rate": 4.065040650406504e-09,
|
|
"loss": 0.6458772420883179,
|
|
"memory(GiB)": 26.71,
|
|
"step": 1,
|
|
"token_acc": 0.8,
|
|
"train_speed(iter/s)": 0.059991
|
|
},
|
|
{
|
|
"epoch": 0.012277470841006752,
|
|
"grad_norm": 1.5484946901004182,
|
|
"learning_rate": 2.032520325203252e-08,
|
|
"loss": 0.5884945392608643,
|
|
"memory(GiB)": 52.53,
|
|
"step": 5,
|
|
"token_acc": 0.8317063859955282,
|
|
"train_speed(iter/s)": 0.107624
|
|
},
|
|
{
|
|
"epoch": 0.024554941682013505,
|
|
"grad_norm": 1.4563640400430145,
|
|
"learning_rate": 4.065040650406504e-08,
|
|
"loss": 0.5743718624114991,
|
|
"memory(GiB)": 52.53,
|
|
"step": 10,
|
|
"token_acc": 0.8498605726722438,
|
|
"train_speed(iter/s)": 0.112877
|
|
},
|
|
{
|
|
"epoch": 0.03683241252302026,
|
|
"grad_norm": 1.4140197884092542,
|
|
"learning_rate": 6.097560975609756e-08,
|
|
"loss": 0.590625,
|
|
"memory(GiB)": 52.53,
|
|
"step": 15,
|
|
"token_acc": 0.8347938144329897,
|
|
"train_speed(iter/s)": 0.117218
|
|
},
|
|
{
|
|
"epoch": 0.04910988336402701,
|
|
"grad_norm": 1.6622234178073612,
|
|
"learning_rate": 8.130081300813008e-08,
|
|
"loss": 0.5686985969543457,
|
|
"memory(GiB)": 52.53,
|
|
"step": 20,
|
|
"token_acc": 0.8420736417474894,
|
|
"train_speed(iter/s)": 0.116503
|
|
},
|
|
{
|
|
"epoch": 0.061387354205033766,
|
|
"grad_norm": 1.5449639237910855,
|
|
"learning_rate": 1.016260162601626e-07,
|
|
"loss": 0.5720160961151123,
|
|
"memory(GiB)": 52.53,
|
|
"step": 25,
|
|
"token_acc": 0.8329814975785421,
|
|
"train_speed(iter/s)": 0.121226
|
|
},
|
|
{
|
|
"epoch": 0.07366482504604052,
|
|
"grad_norm": 1.4350414462950623,
|
|
"learning_rate": 1.219512195121951e-07,
|
|
"loss": 0.5950272083282471,
|
|
"memory(GiB)": 56.59,
|
|
"step": 30,
|
|
"token_acc": 0.8376042390548992,
|
|
"train_speed(iter/s)": 0.1248
|
|
},
|
|
{
|
|
"epoch": 0.08594229588704727,
|
|
"grad_norm": 1.6390155212777497,
|
|
"learning_rate": 1.4227642276422763e-07,
|
|
"loss": 0.5760597229003906,
|
|
"memory(GiB)": 56.59,
|
|
"step": 35,
|
|
"token_acc": 0.8459589739981,
|
|
"train_speed(iter/s)": 0.12788
|
|
},
|
|
{
|
|
"epoch": 0.09821976672805402,
|
|
"grad_norm": 1.3096137406039592,
|
|
"learning_rate": 1.6260162601626016e-07,
|
|
"loss": 0.5555302143096924,
|
|
"memory(GiB)": 56.59,
|
|
"step": 40,
|
|
"token_acc": 0.8554021379485874,
|
|
"train_speed(iter/s)": 0.12593
|
|
},
|
|
{
|
|
"epoch": 0.11049723756906077,
|
|
"grad_norm": 1.422446577039075,
|
|
"learning_rate": 1.8292682926829268e-07,
|
|
"loss": 0.5970460891723632,
|
|
"memory(GiB)": 57.67,
|
|
"step": 45,
|
|
"token_acc": 0.8417625280973159,
|
|
"train_speed(iter/s)": 0.126605
|
|
},
|
|
{
|
|
"epoch": 0.12277470841006753,
|
|
"grad_norm": 1.2819764069439123,
|
|
"learning_rate": 2.032520325203252e-07,
|
|
"loss": 0.5884549140930175,
|
|
"memory(GiB)": 57.67,
|
|
"step": 50,
|
|
"token_acc": 0.8406445511421504,
|
|
"train_speed(iter/s)": 0.129975
|
|
},
|
|
{
|
|
"epoch": 0.13505217925107427,
|
|
"grad_norm": 1.41910630937099,
|
|
"learning_rate": 2.235772357723577e-07,
|
|
"loss": 0.6231883049011231,
|
|
"memory(GiB)": 57.67,
|
|
"step": 55,
|
|
"token_acc": 0.829693936306861,
|
|
"train_speed(iter/s)": 0.132967
|
|
},
|
|
{
|
|
"epoch": 0.14732965009208104,
|
|
"grad_norm": 1.2531126371792276,
|
|
"learning_rate": 2.439024390243902e-07,
|
|
"loss": 0.5942145347595215,
|
|
"memory(GiB)": 57.67,
|
|
"step": 60,
|
|
"token_acc": 0.8500459277403551,
|
|
"train_speed(iter/s)": 0.133823
|
|
},
|
|
{
|
|
"epoch": 0.1596071209330878,
|
|
"grad_norm": 1.1301268691981066,
|
|
"learning_rate": 2.6422764227642274e-07,
|
|
"loss": 0.5744296073913574,
|
|
"memory(GiB)": 57.67,
|
|
"step": 65,
|
|
"token_acc": 0.8480531499966947,
|
|
"train_speed(iter/s)": 0.134364
|
|
},
|
|
{
|
|
"epoch": 0.17188459177409454,
|
|
"grad_norm": 1.0132293794887546,
|
|
"learning_rate": 2.8455284552845527e-07,
|
|
"loss": 0.5759846210479737,
|
|
"memory(GiB)": 57.67,
|
|
"step": 70,
|
|
"token_acc": 0.8393322326178683,
|
|
"train_speed(iter/s)": 0.137087
|
|
},
|
|
{
|
|
"epoch": 0.1841620626151013,
|
|
"grad_norm": 1.1217457637602928,
|
|
"learning_rate": 3.048780487804878e-07,
|
|
"loss": 0.56055588722229,
|
|
"memory(GiB)": 57.67,
|
|
"step": 75,
|
|
"token_acc": 0.8425447316103379,
|
|
"train_speed(iter/s)": 0.138299
|
|
},
|
|
{
|
|
"epoch": 0.19643953345610804,
|
|
"grad_norm": 0.8818994062981204,
|
|
"learning_rate": 3.252032520325203e-07,
|
|
"loss": 0.5674080848693848,
|
|
"memory(GiB)": 57.67,
|
|
"step": 80,
|
|
"token_acc": 0.8303922425089543,
|
|
"train_speed(iter/s)": 0.137434
|
|
},
|
|
{
|
|
"epoch": 0.2087170042971148,
|
|
"grad_norm": 0.8918960386763577,
|
|
"learning_rate": 3.4552845528455284e-07,
|
|
"loss": 0.559955358505249,
|
|
"memory(GiB)": 57.67,
|
|
"step": 85,
|
|
"token_acc": 0.8446934551423881,
|
|
"train_speed(iter/s)": 0.137418
|
|
},
|
|
{
|
|
"epoch": 0.22099447513812154,
|
|
"grad_norm": 1.0522794863647833,
|
|
"learning_rate": 3.6585365853658536e-07,
|
|
"loss": 0.5732513904571533,
|
|
"memory(GiB)": 57.67,
|
|
"step": 90,
|
|
"token_acc": 0.8489562749254482,
|
|
"train_speed(iter/s)": 0.137088
|
|
},
|
|
{
|
|
"epoch": 0.2332719459791283,
|
|
"grad_norm": 0.785438670663371,
|
|
"learning_rate": 3.861788617886179e-07,
|
|
"loss": 0.5366201877593995,
|
|
"memory(GiB)": 57.67,
|
|
"step": 95,
|
|
"token_acc": 0.8626246438746439,
|
|
"train_speed(iter/s)": 0.136376
|
|
},
|
|
{
|
|
"epoch": 0.24554941682013506,
|
|
"grad_norm": 1.0039103796923265,
|
|
"learning_rate": 4.065040650406504e-07,
|
|
"loss": 0.5319199085235595,
|
|
"memory(GiB)": 57.67,
|
|
"step": 100,
|
|
"token_acc": 0.8360549786720669,
|
|
"train_speed(iter/s)": 0.136236
|
|
},
|
|
{
|
|
"epoch": 0.2578268876611418,
|
|
"grad_norm": 0.8843990831220632,
|
|
"learning_rate": 4.268292682926829e-07,
|
|
"loss": 0.5205207347869873,
|
|
"memory(GiB)": 57.67,
|
|
"step": 105,
|
|
"token_acc": 0.8555977051133336,
|
|
"train_speed(iter/s)": 0.127767
|
|
},
|
|
{
|
|
"epoch": 0.27010435850214853,
|
|
"grad_norm": 0.7775914671814245,
|
|
"learning_rate": 4.471544715447154e-07,
|
|
"loss": 0.5021872520446777,
|
|
"memory(GiB)": 57.67,
|
|
"step": 110,
|
|
"token_acc": 0.8675163294567468,
|
|
"train_speed(iter/s)": 0.129151
|
|
},
|
|
{
|
|
"epoch": 0.2823818293431553,
|
|
"grad_norm": 0.8595659824667936,
|
|
"learning_rate": 4.674796747967479e-07,
|
|
"loss": 0.5380425930023194,
|
|
"memory(GiB)": 57.67,
|
|
"step": 115,
|
|
"token_acc": 0.8555190611506809,
|
|
"train_speed(iter/s)": 0.130605
|
|
},
|
|
{
|
|
"epoch": 0.2946593001841621,
|
|
"grad_norm": 0.9109047806477464,
|
|
"learning_rate": 4.878048780487804e-07,
|
|
"loss": 0.5206707000732422,
|
|
"memory(GiB)": 57.67,
|
|
"step": 120,
|
|
"token_acc": 0.8441454698339289,
|
|
"train_speed(iter/s)": 0.131631
|
|
},
|
|
{
|
|
"epoch": 0.3069367710251688,
|
|
"grad_norm": 0.8591216474047668,
|
|
"learning_rate": 4.999959290672028e-07,
|
|
"loss": 0.5150551795959473,
|
|
"memory(GiB)": 57.67,
|
|
"step": 125,
|
|
"token_acc": 0.8495000393669789,
|
|
"train_speed(iter/s)": 0.132508
|
|
},
|
|
{
|
|
"epoch": 0.3192142418661756,
|
|
"grad_norm": 0.8236784585400246,
|
|
"learning_rate": 4.999501325958186e-07,
|
|
"loss": 0.5071953773498535,
|
|
"memory(GiB)": 57.67,
|
|
"step": 130,
|
|
"token_acc": 0.8506227570192104,
|
|
"train_speed(iter/s)": 0.132915
|
|
},
|
|
{
|
|
"epoch": 0.3314917127071823,
|
|
"grad_norm": 0.954649941162109,
|
|
"learning_rate": 4.998534603397122e-07,
|
|
"loss": 0.49468369483947755,
|
|
"memory(GiB)": 57.67,
|
|
"step": 135,
|
|
"token_acc": 0.850890297573384,
|
|
"train_speed(iter/s)": 0.133509
|
|
},
|
|
{
|
|
"epoch": 0.3437691835481891,
|
|
"grad_norm": 0.7775368395525796,
|
|
"learning_rate": 4.997059319759163e-07,
|
|
"loss": 0.49853315353393557,
|
|
"memory(GiB)": 57.67,
|
|
"step": 140,
|
|
"token_acc": 0.8663300877509316,
|
|
"train_speed(iter/s)": 0.134979
|
|
},
|
|
{
|
|
"epoch": 0.3560466543891958,
|
|
"grad_norm": 0.8879804549594494,
|
|
"learning_rate": 4.995075775329056e-07,
|
|
"loss": 0.49777793884277344,
|
|
"memory(GiB)": 57.67,
|
|
"step": 145,
|
|
"token_acc": 0.8515488018702513,
|
|
"train_speed(iter/s)": 0.13598
|
|
},
|
|
{
|
|
"epoch": 0.3683241252302026,
|
|
"grad_norm": 0.8614455268040374,
|
|
"learning_rate": 4.992584373844852e-07,
|
|
"loss": 0.46718130111694334,
|
|
"memory(GiB)": 57.67,
|
|
"step": 150,
|
|
"token_acc": 0.870577384246449,
|
|
"train_speed(iter/s)": 0.137174
|
|
},
|
|
{
|
|
"epoch": 0.38060159607120936,
|
|
"grad_norm": 0.9466769688310654,
|
|
"learning_rate": 4.989585622415729e-07,
|
|
"loss": 0.46044120788574217,
|
|
"memory(GiB)": 57.67,
|
|
"step": 155,
|
|
"token_acc": 0.8734130199891951,
|
|
"train_speed(iter/s)": 0.137843
|
|
},
|
|
{
|
|
"epoch": 0.3928790669122161,
|
|
"grad_norm": 0.756694079775602,
|
|
"learning_rate": 4.986080131418763e-07,
|
|
"loss": 0.4397891044616699,
|
|
"memory(GiB)": 57.67,
|
|
"step": 160,
|
|
"token_acc": 0.8813499680102367,
|
|
"train_speed(iter/s)": 0.137964
|
|
},
|
|
{
|
|
"epoch": 0.40515653775322286,
|
|
"grad_norm": 0.7766777226622378,
|
|
"learning_rate": 4.982068614374703e-07,
|
|
"loss": 0.4335052490234375,
|
|
"memory(GiB)": 57.67,
|
|
"step": 165,
|
|
"token_acc": 0.8773235563703025,
|
|
"train_speed(iter/s)": 0.137421
|
|
},
|
|
{
|
|
"epoch": 0.4174340085942296,
|
|
"grad_norm": 0.7739408885073995,
|
|
"learning_rate": 4.977551887802731e-07,
|
|
"loss": 0.44889039993286134,
|
|
"memory(GiB)": 57.67,
|
|
"step": 170,
|
|
"token_acc": 0.8657095569839498,
|
|
"train_speed(iter/s)": 0.13711
|
|
},
|
|
{
|
|
"epoch": 0.42971147943523635,
|
|
"grad_norm": 0.9033077880962782,
|
|
"learning_rate": 4.972530871054263e-07,
|
|
"loss": 0.42515549659729,
|
|
"memory(GiB)": 57.67,
|
|
"step": 175,
|
|
"token_acc": 0.8809412679891093,
|
|
"train_speed(iter/s)": 0.136969
|
|
},
|
|
{
|
|
"epoch": 0.4419889502762431,
|
|
"grad_norm": 0.7409058426766315,
|
|
"learning_rate": 4.967006586125826e-07,
|
|
"loss": 0.4390419960021973,
|
|
"memory(GiB)": 57.67,
|
|
"step": 180,
|
|
"token_acc": 0.8779247640798035,
|
|
"train_speed(iter/s)": 0.1378
|
|
},
|
|
{
|
|
"epoch": 0.45426642111724985,
|
|
"grad_norm": 0.7709335380436733,
|
|
"learning_rate": 4.960980157451032e-07,
|
|
"loss": 0.4336841583251953,
|
|
"memory(GiB)": 57.67,
|
|
"step": 185,
|
|
"token_acc": 0.8787090057261843,
|
|
"train_speed(iter/s)": 0.138248
|
|
},
|
|
{
|
|
"epoch": 0.4665438919582566,
|
|
"grad_norm": 0.8002910130967711,
|
|
"learning_rate": 4.954452811671713e-07,
|
|
"loss": 0.4231499195098877,
|
|
"memory(GiB)": 57.67,
|
|
"step": 190,
|
|
"token_acc": 0.8717664903865863,
|
|
"train_speed(iter/s)": 0.138123
|
|
},
|
|
{
|
|
"epoch": 0.47882136279926335,
|
|
"grad_norm": 0.7671937721932319,
|
|
"learning_rate": 4.947425877388237e-07,
|
|
"loss": 0.4115544319152832,
|
|
"memory(GiB)": 57.67,
|
|
"step": 195,
|
|
"token_acc": 0.8822761322245893,
|
|
"train_speed(iter/s)": 0.138556
|
|
},
|
|
{
|
|
"epoch": 0.4910988336402701,
|
|
"grad_norm": 0.8718441420761184,
|
|
"learning_rate": 4.939900784889085e-07,
|
|
"loss": 0.4212639331817627,
|
|
"memory(GiB)": 57.67,
|
|
"step": 200,
|
|
"token_acc": 0.8792764857881137,
|
|
"train_speed(iter/s)": 0.13898
|
|
},
|
|
{
|
|
"epoch": 0.5033763044812769,
|
|
"grad_norm": 0.8688003112334609,
|
|
"learning_rate": 4.931879065859729e-07,
|
|
"loss": 0.3883807182312012,
|
|
"memory(GiB)": 57.67,
|
|
"step": 205,
|
|
"token_acc": 0.8808318569138615,
|
|
"train_speed(iter/s)": 0.133423
|
|
},
|
|
{
|
|
"epoch": 0.5156537753222836,
|
|
"grad_norm": 0.9144969667406203,
|
|
"learning_rate": 4.923362353070858e-07,
|
|
"loss": 0.3918790817260742,
|
|
"memory(GiB)": 57.67,
|
|
"step": 210,
|
|
"token_acc": 0.8826902804132406,
|
|
"train_speed(iter/s)": 0.133713
|
|
},
|
|
{
|
|
"epoch": 0.5279312461632903,
|
|
"grad_norm": 0.8097366879590759,
|
|
"learning_rate": 4.914352380046041e-07,
|
|
"loss": 0.3884381055831909,
|
|
"memory(GiB)": 57.67,
|
|
"step": 215,
|
|
"token_acc": 0.8870171589751625,
|
|
"train_speed(iter/s)": 0.133996
|
|
},
|
|
{
|
|
"epoch": 0.5402087170042971,
|
|
"grad_norm": 0.7538862741531127,
|
|
"learning_rate": 4.904850980708886e-07,
|
|
"loss": 0.3775317668914795,
|
|
"memory(GiB)": 57.67,
|
|
"step": 220,
|
|
"token_acc": 0.8937805494690713,
|
|
"train_speed(iter/s)": 0.133724
|
|
},
|
|
{
|
|
"epoch": 0.5524861878453039,
|
|
"grad_norm": 1.146276827778673,
|
|
"learning_rate": 4.894860089009741e-07,
|
|
"loss": 0.3711127519607544,
|
|
"memory(GiB)": 57.67,
|
|
"step": 225,
|
|
"token_acc": 0.8895364441547486,
|
|
"train_speed(iter/s)": 0.134362
|
|
},
|
|
{
|
|
"epoch": 0.5647636586863106,
|
|
"grad_norm": 0.8283210772807095,
|
|
"learning_rate": 4.884381738532069e-07,
|
|
"loss": 0.3519309043884277,
|
|
"memory(GiB)": 57.67,
|
|
"step": 230,
|
|
"token_acc": 0.8935041822388325,
|
|
"train_speed(iter/s)": 0.134198
|
|
},
|
|
{
|
|
"epoch": 0.5770411295273173,
|
|
"grad_norm": 1.004769745418558,
|
|
"learning_rate": 4.87341806207851e-07,
|
|
"loss": 0.3587208271026611,
|
|
"memory(GiB)": 57.67,
|
|
"step": 235,
|
|
"token_acc": 0.9019401835119423,
|
|
"train_speed(iter/s)": 0.134025
|
|
},
|
|
{
|
|
"epoch": 0.5893186003683242,
|
|
"grad_norm": 0.8182191455973747,
|
|
"learning_rate": 4.861971291236771e-07,
|
|
"loss": 0.3467154026031494,
|
|
"memory(GiB)": 57.67,
|
|
"step": 240,
|
|
"token_acc": 0.8995148555664775,
|
|
"train_speed(iter/s)": 0.134306
|
|
},
|
|
{
|
|
"epoch": 0.6015960712093309,
|
|
"grad_norm": 1.5271995294368055,
|
|
"learning_rate": 4.850043755925397e-07,
|
|
"loss": 0.33415584564208983,
|
|
"memory(GiB)": 57.67,
|
|
"step": 245,
|
|
"token_acc": 0.8970737022336368,
|
|
"train_speed(iter/s)": 0.134727
|
|
},
|
|
{
|
|
"epoch": 0.6138735420503376,
|
|
"grad_norm": 0.875242766730362,
|
|
"learning_rate": 4.837637883919528e-07,
|
|
"loss": 0.3291849374771118,
|
|
"memory(GiB)": 57.67,
|
|
"step": 250,
|
|
"token_acc": 0.9039310639510041,
|
|
"train_speed(iter/s)": 0.13542
|
|
},
|
|
{
|
|
"epoch": 0.6261510128913443,
|
|
"grad_norm": 0.8014571513700163,
|
|
"learning_rate": 4.824756200356748e-07,
|
|
"loss": 0.32892580032348634,
|
|
"memory(GiB)": 57.67,
|
|
"step": 255,
|
|
"token_acc": 0.9007355946056396,
|
|
"train_speed(iter/s)": 0.135787
|
|
},
|
|
{
|
|
"epoch": 0.6384284837323512,
|
|
"grad_norm": 0.9358345370809253,
|
|
"learning_rate": 4.811401327223103e-07,
|
|
"loss": 0.3249573469161987,
|
|
"memory(GiB)": 57.67,
|
|
"step": 260,
|
|
"token_acc": 0.9081637062967285,
|
|
"train_speed(iter/s)": 0.13572
|
|
},
|
|
{
|
|
"epoch": 0.6507059545733579,
|
|
"grad_norm": 0.8184870146688139,
|
|
"learning_rate": 4.797575982819412e-07,
|
|
"loss": 0.31554522514343264,
|
|
"memory(GiB)": 57.67,
|
|
"step": 265,
|
|
"token_acc": 0.9049698848226551,
|
|
"train_speed(iter/s)": 0.135725
|
|
},
|
|
{
|
|
"epoch": 0.6629834254143646,
|
|
"grad_norm": 0.8945211680671034,
|
|
"learning_rate": 4.783282981207979e-07,
|
|
"loss": 0.3033627510070801,
|
|
"memory(GiB)": 57.67,
|
|
"step": 270,
|
|
"token_acc": 0.9165617767672256,
|
|
"train_speed(iter/s)": 0.135173
|
|
},
|
|
{
|
|
"epoch": 0.6752608962553714,
|
|
"grad_norm": 1.4362312397197503,
|
|
"learning_rate": 4.768525231639802e-07,
|
|
"loss": 0.2961107730865479,
|
|
"memory(GiB)": 57.67,
|
|
"step": 275,
|
|
"token_acc": 0.9266585849680405,
|
|
"train_speed(iter/s)": 0.135236
|
|
},
|
|
{
|
|
"epoch": 0.6875383670963782,
|
|
"grad_norm": 0.7600567933058701,
|
|
"learning_rate": 4.753305737962418e-07,
|
|
"loss": 0.2920267581939697,
|
|
"memory(GiB)": 57.67,
|
|
"step": 280,
|
|
"token_acc": 0.9217091715507683,
|
|
"train_speed(iter/s)": 0.135403
|
|
},
|
|
{
|
|
"epoch": 0.6998158379373849,
|
|
"grad_norm": 0.8650286740134839,
|
|
"learning_rate": 4.7376275980084856e-07,
|
|
"loss": 0.2840526819229126,
|
|
"memory(GiB)": 57.67,
|
|
"step": 285,
|
|
"token_acc": 0.9261834939254294,
|
|
"train_speed(iter/s)": 0.135709
|
|
},
|
|
{
|
|
"epoch": 0.7120933087783916,
|
|
"grad_norm": 0.9178664296409083,
|
|
"learning_rate": 4.721494002965243e-07,
|
|
"loss": 0.2752720355987549,
|
|
"memory(GiB)": 57.67,
|
|
"step": 290,
|
|
"token_acc": 0.9174185126886014,
|
|
"train_speed(iter/s)": 0.136139
|
|
},
|
|
{
|
|
"epoch": 0.7243707796193984,
|
|
"grad_norm": 0.9722910316739173,
|
|
"learning_rate": 4.70490823672496e-07,
|
|
"loss": 0.26680717468261717,
|
|
"memory(GiB)": 57.67,
|
|
"step": 295,
|
|
"token_acc": 0.9065665385958784,
|
|
"train_speed(iter/s)": 0.136569
|
|
},
|
|
{
|
|
"epoch": 0.7366482504604052,
|
|
"grad_norm": 0.8535970757124566,
|
|
"learning_rate": 4.6878736752165216e-07,
|
|
"loss": 0.26862516403198244,
|
|
"memory(GiB)": 57.67,
|
|
"step": 300,
|
|
"token_acc": 0.9236588470631024,
|
|
"train_speed(iter/s)": 0.136851
|
|
},
|
|
{
|
|
"epoch": 0.7489257213014119,
|
|
"grad_norm": 0.8086539218227536,
|
|
"learning_rate": 4.670393785718281e-07,
|
|
"loss": 0.26166937351226804,
|
|
"memory(GiB)": 57.67,
|
|
"step": 305,
|
|
"token_acc": 0.9295115530856976,
|
|
"train_speed(iter/s)": 0.132999
|
|
},
|
|
{
|
|
"epoch": 0.7612031921424187,
|
|
"grad_norm": 0.7359005917653992,
|
|
"learning_rate": 4.652472126152316e-07,
|
|
"loss": 0.27553093433380127,
|
|
"memory(GiB)": 57.67,
|
|
"step": 310,
|
|
"token_acc": 0.923441422964037,
|
|
"train_speed(iter/s)": 0.133358
|
|
},
|
|
{
|
|
"epoch": 0.7734806629834254,
|
|
"grad_norm": 0.7800276116485089,
|
|
"learning_rate": 4.634112344360237e-07,
|
|
"loss": 0.25064496994018554,
|
|
"memory(GiB)": 57.67,
|
|
"step": 315,
|
|
"token_acc": 0.9140353723835795,
|
|
"train_speed(iter/s)": 0.133677
|
|
},
|
|
{
|
|
"epoch": 0.7857581338244322,
|
|
"grad_norm": 0.7718671135037027,
|
|
"learning_rate": 4.615318177360689e-07,
|
|
"loss": 0.24835121631622314,
|
|
"memory(GiB)": 57.67,
|
|
"step": 320,
|
|
"token_acc": 0.9273986758008343,
|
|
"train_speed(iter/s)": 0.134016
|
|
},
|
|
{
|
|
"epoch": 0.7980356046654389,
|
|
"grad_norm": 0.9264394104378584,
|
|
"learning_rate": 4.596093450588707e-07,
|
|
"loss": 0.23845996856689453,
|
|
"memory(GiB)": 57.67,
|
|
"step": 325,
|
|
"token_acc": 0.9188875580176673,
|
|
"train_speed(iter/s)": 0.134392
|
|
},
|
|
{
|
|
"epoch": 0.8103130755064457,
|
|
"grad_norm": 0.7295759857048888,
|
|
"learning_rate": 4.5764420771170723e-07,
|
|
"loss": 0.2278268575668335,
|
|
"memory(GiB)": 57.67,
|
|
"step": 330,
|
|
"token_acc": 0.940601686668829,
|
|
"train_speed(iter/s)": 0.134767
|
|
},
|
|
{
|
|
"epoch": 0.8225905463474524,
|
|
"grad_norm": 0.7266088554592744,
|
|
"learning_rate": 4.556368056859832e-07,
|
|
"loss": 0.21920721530914306,
|
|
"memory(GiB)": 57.67,
|
|
"step": 335,
|
|
"token_acc": 0.9349947057933746,
|
|
"train_speed(iter/s)": 0.134994
|
|
},
|
|
{
|
|
"epoch": 0.8348680171884592,
|
|
"grad_norm": 0.7475457504296112,
|
|
"learning_rate": 4.5358754757581397e-07,
|
|
"loss": 0.2169396162033081,
|
|
"memory(GiB)": 57.67,
|
|
"step": 340,
|
|
"token_acc": 0.9349587340046944,
|
|
"train_speed(iter/s)": 0.135495
|
|
},
|
|
{
|
|
"epoch": 0.8471454880294659,
|
|
"grad_norm": 0.7207459452665155,
|
|
"learning_rate": 4.5149685049485877e-07,
|
|
"loss": 0.22667970657348632,
|
|
"memory(GiB)": 57.67,
|
|
"step": 345,
|
|
"token_acc": 0.9089569551995486,
|
|
"train_speed(iter/s)": 0.136023
|
|
},
|
|
{
|
|
"epoch": 0.8594229588704727,
|
|
"grad_norm": 0.7912001296106889,
|
|
"learning_rate": 4.4936513999142e-07,
|
|
"loss": 0.2154712438583374,
|
|
"memory(GiB)": 57.67,
|
|
"step": 350,
|
|
"token_acc": 0.9488812673526049,
|
|
"train_speed(iter/s)": 0.136042
|
|
},
|
|
{
|
|
"epoch": 0.8717004297114794,
|
|
"grad_norm": 1.7147841573520497,
|
|
"learning_rate": 4.471928499618255e-07,
|
|
"loss": 0.21075584888458251,
|
|
"memory(GiB)": 57.67,
|
|
"step": 355,
|
|
"token_acc": 0.9382304479442082,
|
|
"train_speed(iter/s)": 0.13587
|
|
},
|
|
{
|
|
"epoch": 0.8839779005524862,
|
|
"grad_norm": 0.7658381358366665,
|
|
"learning_rate": 4.449804225621116e-07,
|
|
"loss": 0.19444403648376465,
|
|
"memory(GiB)": 57.67,
|
|
"step": 360,
|
|
"token_acc": 0.9425833467547109,
|
|
"train_speed(iter/s)": 0.135745
|
|
},
|
|
{
|
|
"epoch": 0.896255371393493,
|
|
"grad_norm": 0.6912930219104488,
|
|
"learning_rate": 4.427283081180249e-07,
|
|
"loss": 0.1945898175239563,
|
|
"memory(GiB)": 57.67,
|
|
"step": 365,
|
|
"token_acc": 0.9307141169986616,
|
|
"train_speed(iter/s)": 0.136429
|
|
},
|
|
{
|
|
"epoch": 0.9085328422344997,
|
|
"grad_norm": 0.6755021225392699,
|
|
"learning_rate": 4.404369650333616e-07,
|
|
"loss": 0.1876620650291443,
|
|
"memory(GiB)": 57.67,
|
|
"step": 370,
|
|
"token_acc": 0.9437858236320268,
|
|
"train_speed(iter/s)": 0.136643
|
|
},
|
|
{
|
|
"epoch": 0.9208103130755064,
|
|
"grad_norm": 0.7098782582131411,
|
|
"learning_rate": 4.3810685969666203e-07,
|
|
"loss": 0.2034088134765625,
|
|
"memory(GiB)": 57.67,
|
|
"step": 375,
|
|
"token_acc": 0.9409318390075421,
|
|
"train_speed(iter/s)": 0.136522
|
|
},
|
|
{
|
|
"epoch": 0.9330877839165131,
|
|
"grad_norm": 0.6938412383830229,
|
|
"learning_rate": 4.357384663862803e-07,
|
|
"loss": 0.1925197124481201,
|
|
"memory(GiB)": 57.67,
|
|
"step": 380,
|
|
"token_acc": 0.934276273372018,
|
|
"train_speed(iter/s)": 0.136664
|
|
},
|
|
{
|
|
"epoch": 0.94536525475752,
|
|
"grad_norm": 0.5516763311313012,
|
|
"learning_rate": 4.3333226717384784e-07,
|
|
"loss": 0.18835780620574952,
|
|
"memory(GiB)": 57.67,
|
|
"step": 385,
|
|
"token_acc": 0.952319409185322,
|
|
"train_speed(iter/s)": 0.136364
|
|
},
|
|
{
|
|
"epoch": 0.9576427255985267,
|
|
"grad_norm": 0.5887901021890946,
|
|
"learning_rate": 4.308887518261507e-07,
|
|
"loss": 0.18153078556060792,
|
|
"memory(GiB)": 57.67,
|
|
"step": 390,
|
|
"token_acc": 0.9437745469578999,
|
|
"train_speed(iter/s)": 0.136159
|
|
},
|
|
{
|
|
"epoch": 0.9699201964395334,
|
|
"grad_norm": 0.6208318604275079,
|
|
"learning_rate": 4.2840841770544073e-07,
|
|
"loss": 0.16547969579696656,
|
|
"memory(GiB)": 57.67,
|
|
"step": 395,
|
|
"token_acc": 0.9472743181040058,
|
|
"train_speed(iter/s)": 0.136212
|
|
},
|
|
{
|
|
"epoch": 0.9821976672805403,
|
|
"grad_norm": 1.389928819023539,
|
|
"learning_rate": 4.258917696682006e-07,
|
|
"loss": 0.17939815521240235,
|
|
"memory(GiB)": 57.67,
|
|
"step": 400,
|
|
"token_acc": 0.9349780954576896,
|
|
"train_speed(iter/s)": 0.136576
|
|
},
|
|
{
|
|
"epoch": 0.994475138121547,
|
|
"grad_norm": 0.5148203046629826,
|
|
"learning_rate": 4.2333931996238316e-07,
|
|
"loss": 0.19017149209976197,
|
|
"memory(GiB)": 57.67,
|
|
"step": 405,
|
|
"token_acc": 0.9227554596926395,
|
|
"train_speed(iter/s)": 0.133387
|
|
},
|
|
{
|
|
"epoch": 1.0049109883364027,
|
|
"grad_norm": 0.4950490943119214,
|
|
"learning_rate": 4.2075158812314694e-07,
|
|
"loss": 0.16587586402893068,
|
|
"memory(GiB)": 57.67,
|
|
"step": 410,
|
|
"token_acc": 0.935228905768836,
|
|
"train_speed(iter/s)": 0.133808
|
|
},
|
|
{
|
|
"epoch": 1.0171884591774094,
|
|
"grad_norm": 0.5453509149451119,
|
|
"learning_rate": 4.1812910086710786e-07,
|
|
"loss": 0.17764878273010254,
|
|
"memory(GiB)": 57.67,
|
|
"step": 415,
|
|
"token_acc": 0.941226073024707,
|
|
"train_speed(iter/s)": 0.133857
|
|
},
|
|
{
|
|
"epoch": 1.0294659300184161,
|
|
"grad_norm": 0.6099408421859539,
|
|
"learning_rate": 4.1547239198512906e-07,
|
|
"loss": 0.17024999856948853,
|
|
"memory(GiB)": 57.67,
|
|
"step": 420,
|
|
"token_acc": 0.9455940130963517,
|
|
"train_speed(iter/s)": 0.133546
|
|
},
|
|
{
|
|
"epoch": 1.0417434008594229,
|
|
"grad_norm": 0.5544678384480342,
|
|
"learning_rate": 4.1278200223367186e-07,
|
|
"loss": 0.1932210922241211,
|
|
"memory(GiB)": 57.67,
|
|
"step": 425,
|
|
"token_acc": 0.941819772528434,
|
|
"train_speed(iter/s)": 0.133645
|
|
},
|
|
{
|
|
"epoch": 1.0540208717004298,
|
|
"grad_norm": 0.5826701192243031,
|
|
"learning_rate": 4.1005847922472737e-07,
|
|
"loss": 0.19101818799972534,
|
|
"memory(GiB)": 57.67,
|
|
"step": 430,
|
|
"token_acc": 0.9367067743530575,
|
|
"train_speed(iter/s)": 0.133833
|
|
},
|
|
{
|
|
"epoch": 1.0662983425414365,
|
|
"grad_norm": 0.5532014271248828,
|
|
"learning_rate": 4.0730237731435377e-07,
|
|
"loss": 0.1754150390625,
|
|
"memory(GiB)": 57.67,
|
|
"step": 435,
|
|
"token_acc": 0.9651971029990765,
|
|
"train_speed(iter/s)": 0.133965
|
|
},
|
|
{
|
|
"epoch": 1.0785758133824432,
|
|
"grad_norm": 0.5950560890667523,
|
|
"learning_rate": 4.0451425748984127e-07,
|
|
"loss": 0.17856969833374023,
|
|
"memory(GiB)": 57.67,
|
|
"step": 440,
|
|
"token_acc": 0.9385016513123587,
|
|
"train_speed(iter/s)": 0.133907
|
|
},
|
|
{
|
|
"epoch": 1.09085328422345,
|
|
"grad_norm": 0.5376654555480931,
|
|
"learning_rate": 4.016946872555251e-07,
|
|
"loss": 0.1833416700363159,
|
|
"memory(GiB)": 57.67,
|
|
"step": 445,
|
|
"token_acc": 0.959684329199549,
|
|
"train_speed(iter/s)": 0.134067
|
|
},
|
|
{
|
|
"epoch": 1.1031307550644567,
|
|
"grad_norm": 0.44444520282357075,
|
|
"learning_rate": 3.988442405172755e-07,
|
|
"loss": 0.17330591678619384,
|
|
"memory(GiB)": 57.67,
|
|
"step": 450,
|
|
"token_acc": 0.925875966441849,
|
|
"train_speed(iter/s)": 0.133897
|
|
},
|
|
{
|
|
"epoch": 1.1154082259054634,
|
|
"grad_norm": 0.5467733830966146,
|
|
"learning_rate": 3.9596349746568097e-07,
|
|
"loss": 0.187214457988739,
|
|
"memory(GiB)": 57.67,
|
|
"step": 455,
|
|
"token_acc": 0.9102694260054666,
|
|
"train_speed(iter/s)": 0.133805
|
|
},
|
|
{
|
|
"epoch": 1.1276856967464703,
|
|
"grad_norm": 0.5644574909855569,
|
|
"learning_rate": 3.930530444579556e-07,
|
|
"loss": 0.18247673511505128,
|
|
"memory(GiB)": 57.67,
|
|
"step": 460,
|
|
"token_acc": 0.9407660594101273,
|
|
"train_speed(iter/s)": 0.133717
|
|
},
|
|
{
|
|
"epoch": 1.139963167587477,
|
|
"grad_norm": 0.6185677880856536,
|
|
"learning_rate": 3.901134738985885e-07,
|
|
"loss": 0.19450093507766725,
|
|
"memory(GiB)": 57.67,
|
|
"step": 465,
|
|
"token_acc": 0.9278699743370402,
|
|
"train_speed(iter/s)": 0.133408
|
|
},
|
|
{
|
|
"epoch": 1.1522406384284838,
|
|
"grad_norm": 0.5271730386275215,
|
|
"learning_rate": 3.871453841187645e-07,
|
|
"loss": 0.1889647960662842,
|
|
"memory(GiB)": 57.67,
|
|
"step": 470,
|
|
"token_acc": 0.9411243259215484,
|
|
"train_speed(iter/s)": 0.133399
|
|
},
|
|
{
|
|
"epoch": 1.1645181092694905,
|
|
"grad_norm": 0.7131823190088369,
|
|
"learning_rate": 3.8414937925457706e-07,
|
|
"loss": 0.17877411842346191,
|
|
"memory(GiB)": 57.67,
|
|
"step": 475,
|
|
"token_acc": 0.9540759574129986,
|
|
"train_speed(iter/s)": 0.133589
|
|
},
|
|
{
|
|
"epoch": 1.1767955801104972,
|
|
"grad_norm": 0.5353529040202072,
|
|
"learning_rate": 3.8112606912406037e-07,
|
|
"loss": 0.17376744747161865,
|
|
"memory(GiB)": 57.67,
|
|
"step": 480,
|
|
"token_acc": 0.9406874176214134,
|
|
"train_speed(iter/s)": 0.13372
|
|
},
|
|
{
|
|
"epoch": 1.189073050951504,
|
|
"grad_norm": 0.4982526719045866,
|
|
"learning_rate": 3.780760691030646e-07,
|
|
"loss": 0.16717066764831542,
|
|
"memory(GiB)": 57.67,
|
|
"step": 485,
|
|
"token_acc": 0.9538624787775891,
|
|
"train_speed(iter/s)": 0.134087
|
|
},
|
|
{
|
|
"epoch": 1.2013505217925107,
|
|
"grad_norm": 0.46216176496672484,
|
|
"learning_rate": 3.75e-07,
|
|
"loss": 0.19427452087402344,
|
|
"memory(GiB)": 57.67,
|
|
"step": 490,
|
|
"token_acc": 0.93522816539313,
|
|
"train_speed(iter/s)": 0.134444
|
|
},
|
|
{
|
|
"epoch": 1.2136279926335174,
|
|
"grad_norm": 0.7604094532944367,
|
|
"learning_rate": 3.7189848792947536e-07,
|
|
"loss": 0.1537397861480713,
|
|
"memory(GiB)": 57.67,
|
|
"step": 495,
|
|
"token_acc": 0.9573288642516437,
|
|
"train_speed(iter/s)": 0.134597
|
|
},
|
|
{
|
|
"epoch": 1.2259054634745243,
|
|
"grad_norm": 0.549241195148398,
|
|
"learning_rate": 3.687721641848562e-07,
|
|
"loss": 0.1440601110458374,
|
|
"memory(GiB)": 57.67,
|
|
"step": 500,
|
|
"token_acc": 0.953042040212377,
|
|
"train_speed(iter/s)": 0.134604
|
|
},
|
|
{
|
|
"epoch": 1.238182934315531,
|
|
"grad_norm": 0.563515575244595,
|
|
"learning_rate": 3.6562166510976887e-07,
|
|
"loss": 0.1917360782623291,
|
|
"memory(GiB)": 57.67,
|
|
"step": 505,
|
|
"token_acc": 0.9542143600416233,
|
|
"train_speed(iter/s)": 0.13233
|
|
},
|
|
{
|
|
"epoch": 1.2504604051565378,
|
|
"grad_norm": 0.8281020221114317,
|
|
"learning_rate": 3.624476319685771e-07,
|
|
"loss": 0.189109206199646,
|
|
"memory(GiB)": 57.67,
|
|
"step": 510,
|
|
"token_acc": 0.9374545982856313,
|
|
"train_speed(iter/s)": 0.132765
|
|
},
|
|
{
|
|
"epoch": 1.2627378759975445,
|
|
"grad_norm": 0.5241233864753891,
|
|
"learning_rate": 3.592507108158563e-07,
|
|
"loss": 0.15444846153259278,
|
|
"memory(GiB)": 57.67,
|
|
"step": 515,
|
|
"token_acc": 0.9661171743001964,
|
|
"train_speed(iter/s)": 0.13301
|
|
},
|
|
{
|
|
"epoch": 1.2750153468385512,
|
|
"grad_norm": 0.5573427156559804,
|
|
"learning_rate": 3.560315523648932e-07,
|
|
"loss": 0.18322609663009642,
|
|
"memory(GiB)": 57.67,
|
|
"step": 520,
|
|
"token_acc": 0.9517766497461929,
|
|
"train_speed(iter/s)": 0.133378
|
|
},
|
|
{
|
|
"epoch": 1.287292817679558,
|
|
"grad_norm": 0.537977575567055,
|
|
"learning_rate": 3.5279081185523763e-07,
|
|
"loss": 0.18208487033843995,
|
|
"memory(GiB)": 57.67,
|
|
"step": 525,
|
|
"token_acc": 0.9459727287141272,
|
|
"train_speed(iter/s)": 0.133578
|
|
},
|
|
{
|
|
"epoch": 1.2995702885205649,
|
|
"grad_norm": 0.4953187670124203,
|
|
"learning_rate": 3.4952914891933225e-07,
|
|
"loss": 0.17269195318222047,
|
|
"memory(GiB)": 57.67,
|
|
"step": 530,
|
|
"token_acc": 0.942077971960822,
|
|
"train_speed(iter/s)": 0.133685
|
|
},
|
|
{
|
|
"epoch": 1.3118477593615716,
|
|
"grad_norm": 0.4701054790036289,
|
|
"learning_rate": 3.4624722744824874e-07,
|
|
"loss": 0.1993415355682373,
|
|
"memory(GiB)": 57.67,
|
|
"step": 535,
|
|
"token_acc": 0.942649839836363,
|
|
"train_speed(iter/s)": 0.133424
|
|
},
|
|
{
|
|
"epoch": 1.3241252302025783,
|
|
"grad_norm": 0.5249799204021143,
|
|
"learning_rate": 3.429457154565565e-07,
|
|
"loss": 0.18299152851104736,
|
|
"memory(GiB)": 57.67,
|
|
"step": 540,
|
|
"token_acc": 0.9312291707508332,
|
|
"train_speed(iter/s)": 0.133551
|
|
},
|
|
{
|
|
"epoch": 1.336402701043585,
|
|
"grad_norm": 0.5358390030505376,
|
|
"learning_rate": 3.396252849463529e-07,
|
|
"loss": 0.1694674849510193,
|
|
"memory(GiB)": 57.67,
|
|
"step": 545,
|
|
"token_acc": 0.9403699099709948,
|
|
"train_speed(iter/s)": 0.13334
|
|
},
|
|
{
|
|
"epoch": 1.3486801718845918,
|
|
"grad_norm": 0.47175661832523136,
|
|
"learning_rate": 3.362866117704815e-07,
|
|
"loss": 0.16650619506835937,
|
|
"memory(GiB)": 57.67,
|
|
"step": 550,
|
|
"token_acc": 0.966151256036507,
|
|
"train_speed(iter/s)": 0.132885
|
|
},
|
|
{
|
|
"epoch": 1.3609576427255985,
|
|
"grad_norm": 0.5566911020042059,
|
|
"learning_rate": 3.3293037549496597e-07,
|
|
"loss": 0.18229317665100098,
|
|
"memory(GiB)": 57.67,
|
|
"step": 555,
|
|
"token_acc": 0.9332179930795848,
|
|
"train_speed(iter/s)": 0.132956
|
|
},
|
|
{
|
|
"epoch": 1.3732351135666052,
|
|
"grad_norm": 0.8552156642127553,
|
|
"learning_rate": 3.295572592606891e-07,
|
|
"loss": 0.17464141845703124,
|
|
"memory(GiB)": 57.67,
|
|
"step": 560,
|
|
"token_acc": 0.9428267315441344,
|
|
"train_speed(iter/s)": 0.133102
|
|
},
|
|
{
|
|
"epoch": 1.385512584407612,
|
|
"grad_norm": 0.4054170592537424,
|
|
"learning_rate": 3.2616794964434356e-07,
|
|
"loss": 0.169390869140625,
|
|
"memory(GiB)": 57.67,
|
|
"step": 565,
|
|
"token_acc": 0.9302136041022855,
|
|
"train_speed(iter/s)": 0.133051
|
|
},
|
|
{
|
|
"epoch": 1.3977900552486187,
|
|
"grad_norm": 0.4737608855605009,
|
|
"learning_rate": 3.227631365186836e-07,
|
|
"loss": 0.16181081533432007,
|
|
"memory(GiB)": 57.67,
|
|
"step": 570,
|
|
"token_acc": 0.9568342208944884,
|
|
"train_speed(iter/s)": 0.132964
|
|
},
|
|
{
|
|
"epoch": 1.4100675260896256,
|
|
"grad_norm": 0.4918393822553509,
|
|
"learning_rate": 3.193435129121058e-07,
|
|
"loss": 0.1819918632507324,
|
|
"memory(GiB)": 57.67,
|
|
"step": 575,
|
|
"token_acc": 0.9572955270188744,
|
|
"train_speed(iter/s)": 0.133093
|
|
},
|
|
{
|
|
"epoch": 1.4223449969306323,
|
|
"grad_norm": 0.616287863577658,
|
|
"learning_rate": 3.159097748675873e-07,
|
|
"loss": 0.1604529619216919,
|
|
"memory(GiB)": 57.67,
|
|
"step": 580,
|
|
"token_acc": 0.9625401355690332,
|
|
"train_speed(iter/s)": 0.133223
|
|
},
|
|
{
|
|
"epoch": 1.434622467771639,
|
|
"grad_norm": 0.5677509895303791,
|
|
"learning_rate": 3.124626213010108e-07,
|
|
"loss": 0.1569218635559082,
|
|
"memory(GiB)": 57.67,
|
|
"step": 585,
|
|
"token_acc": 0.9455586360854067,
|
|
"train_speed(iter/s)": 0.133671
|
|
},
|
|
{
|
|
"epoch": 1.4468999386126458,
|
|
"grad_norm": 0.5032051690478044,
|
|
"learning_rate": 3.090027538589044e-07,
|
|
"loss": 0.169755220413208,
|
|
"memory(GiB)": 57.67,
|
|
"step": 590,
|
|
"token_acc": 0.9418753193663771,
|
|
"train_speed(iter/s)": 0.134002
|
|
},
|
|
{
|
|
"epoch": 1.4591774094536525,
|
|
"grad_norm": 0.5432224909432672,
|
|
"learning_rate": 3.055308767756261e-07,
|
|
"loss": 0.18236881494522095,
|
|
"memory(GiB)": 57.67,
|
|
"step": 595,
|
|
"token_acc": 0.9640907181856363,
|
|
"train_speed(iter/s)": 0.133845
|
|
},
|
|
{
|
|
"epoch": 1.4714548802946594,
|
|
"grad_norm": 0.5191917254492633,
|
|
"learning_rate": 3.0204769673002116e-07,
|
|
"loss": 0.16606335639953612,
|
|
"memory(GiB)": 57.67,
|
|
"step": 600,
|
|
"token_acc": 0.9707240443661637,
|
|
"train_speed(iter/s)": 0.134004
|
|
},
|
|
{
|
|
"epoch": 1.4837323511356661,
|
|
"grad_norm": 0.5703809476287016,
|
|
"learning_rate": 2.9855392270158206e-07,
|
|
"loss": 0.17542767524719238,
|
|
"memory(GiB)": 57.67,
|
|
"step": 605,
|
|
"token_acc": 0.953876582278481,
|
|
"train_speed(iter/s)": 0.132384
|
|
},
|
|
{
|
|
"epoch": 1.4960098219766729,
|
|
"grad_norm": 0.42427507061236897,
|
|
"learning_rate": 2.9505026582614024e-07,
|
|
"loss": 0.19279547929763793,
|
|
"memory(GiB)": 57.67,
|
|
"step": 610,
|
|
"token_acc": 0.9392731620710896,
|
|
"train_speed(iter/s)": 0.132484
|
|
},
|
|
{
|
|
"epoch": 1.5082872928176796,
|
|
"grad_norm": 0.5816813455731817,
|
|
"learning_rate": 2.915374392511184e-07,
|
|
"loss": 0.18373801708221435,
|
|
"memory(GiB)": 57.67,
|
|
"step": 615,
|
|
"token_acc": 0.9288226144586461,
|
|
"train_speed(iter/s)": 0.132603
|
|
},
|
|
{
|
|
"epoch": 1.5205647636586863,
|
|
"grad_norm": 0.4407507522526475,
|
|
"learning_rate": 2.8801615799037484e-07,
|
|
"loss": 0.16642086505889891,
|
|
"memory(GiB)": 57.67,
|
|
"step": 620,
|
|
"token_acc": 0.9481449252432561,
|
|
"train_speed(iter/s)": 0.132928
|
|
},
|
|
{
|
|
"epoch": 1.532842234499693,
|
|
"grad_norm": 0.482240995820186,
|
|
"learning_rate": 2.844871387786655e-07,
|
|
"loss": 0.16589756011962892,
|
|
"memory(GiB)": 57.67,
|
|
"step": 625,
|
|
"token_acc": 0.9455185772142983,
|
|
"train_speed(iter/s)": 0.132983
|
|
},
|
|
{
|
|
"epoch": 1.5451197053406998,
|
|
"grad_norm": 0.5533567457008911,
|
|
"learning_rate": 2.809510999257582e-07,
|
|
"loss": 0.19375090599060057,
|
|
"memory(GiB)": 57.67,
|
|
"step": 630,
|
|
"token_acc": 0.9351315128162171,
|
|
"train_speed(iter/s)": 0.132918
|
|
},
|
|
{
|
|
"epoch": 1.5573971761817065,
|
|
"grad_norm": 0.42397946477312154,
|
|
"learning_rate": 2.7740876117022493e-07,
|
|
"loss": 0.16124327182769777,
|
|
"memory(GiB)": 57.67,
|
|
"step": 635,
|
|
"token_acc": 0.9352771876927294,
|
|
"train_speed(iter/s)": 0.132949
|
|
},
|
|
{
|
|
"epoch": 1.5696746470227132,
|
|
"grad_norm": 0.46916218507444407,
|
|
"learning_rate": 2.7386084353294305e-07,
|
|
"loss": 0.1779846429824829,
|
|
"memory(GiB)": 57.67,
|
|
"step": 640,
|
|
"token_acc": 0.9482183060321404,
|
|
"train_speed(iter/s)": 0.132929
|
|
},
|
|
{
|
|
"epoch": 1.58195211786372,
|
|
"grad_norm": 0.4494446665243088,
|
|
"learning_rate": 2.703080691703365e-07,
|
|
"loss": 0.16666009426116943,
|
|
"memory(GiB)": 57.67,
|
|
"step": 645,
|
|
"token_acc": 0.958400417736391,
|
|
"train_speed(iter/s)": 0.133055
|
|
},
|
|
{
|
|
"epoch": 1.5942295887047269,
|
|
"grad_norm": 0.5003421798456534,
|
|
"learning_rate": 2.667511612273853e-07,
|
|
"loss": 0.17405877113342286,
|
|
"memory(GiB)": 57.67,
|
|
"step": 650,
|
|
"token_acc": 0.9476805681474766,
|
|
"train_speed(iter/s)": 0.133315
|
|
},
|
|
{
|
|
"epoch": 1.6065070595457336,
|
|
"grad_norm": 0.5798622931057178,
|
|
"learning_rate": 2.6319084369043403e-07,
|
|
"loss": 0.14733604192733765,
|
|
"memory(GiB)": 57.67,
|
|
"step": 655,
|
|
"token_acc": 0.985560657322378,
|
|
"train_speed(iter/s)": 0.133499
|
|
},
|
|
{
|
|
"epoch": 1.6187845303867403,
|
|
"grad_norm": 0.5265921861617535,
|
|
"learning_rate": 2.596278412398284e-07,
|
|
"loss": 0.16961886882781982,
|
|
"memory(GiB)": 57.67,
|
|
"step": 660,
|
|
"token_acc": 0.9309208573294544,
|
|
"train_speed(iter/s)": 0.133748
|
|
},
|
|
{
|
|
"epoch": 1.6310620012277472,
|
|
"grad_norm": 0.5302133218327372,
|
|
"learning_rate": 2.560628791024118e-07,
|
|
"loss": 0.17056363821029663,
|
|
"memory(GiB)": 57.67,
|
|
"step": 665,
|
|
"token_acc": 0.9468977792846245,
|
|
"train_speed(iter/s)": 0.133937
|
|
},
|
|
{
|
|
"epoch": 1.643339472068754,
|
|
"grad_norm": 0.5423907736532357,
|
|
"learning_rate": 2.5249668290390936e-07,
|
|
"loss": 0.16655545234680175,
|
|
"memory(GiB)": 57.67,
|
|
"step": 670,
|
|
"token_acc": 0.9424981219695417,
|
|
"train_speed(iter/s)": 0.134082
|
|
},
|
|
{
|
|
"epoch": 1.6556169429097607,
|
|
"grad_norm": 0.515033840244812,
|
|
"learning_rate": 2.489299785212319e-07,
|
|
"loss": 0.17368289232254028,
|
|
"memory(GiB)": 57.67,
|
|
"step": 675,
|
|
"token_acc": 0.9435146443514645,
|
|
"train_speed(iter/s)": 0.134347
|
|
},
|
|
{
|
|
"epoch": 1.6678944137507674,
|
|
"grad_norm": 0.48163213229590374,
|
|
"learning_rate": 2.4536349193472773e-07,
|
|
"loss": 0.16638292074203492,
|
|
"memory(GiB)": 57.67,
|
|
"step": 680,
|
|
"token_acc": 0.9301941049604601,
|
|
"train_speed(iter/s)": 0.134535
|
|
},
|
|
{
|
|
"epoch": 1.6801718845917741,
|
|
"grad_norm": 0.4779642715711826,
|
|
"learning_rate": 2.417979490804143e-07,
|
|
"loss": 0.14599368572235108,
|
|
"memory(GiB)": 57.67,
|
|
"step": 685,
|
|
"token_acc": 0.9522398399014779,
|
|
"train_speed(iter/s)": 0.134598
|
|
},
|
|
{
|
|
"epoch": 1.6924493554327809,
|
|
"grad_norm": 0.498486709572586,
|
|
"learning_rate": 2.382340757022181e-07,
|
|
"loss": 0.1666867971420288,
|
|
"memory(GiB)": 57.67,
|
|
"step": 690,
|
|
"token_acc": 0.9422738067877117,
|
|
"train_speed(iter/s)": 0.134551
|
|
},
|
|
{
|
|
"epoch": 1.7047268262737876,
|
|
"grad_norm": 0.5039713682487171,
|
|
"learning_rate": 2.3467259720425429e-07,
|
|
"loss": 0.17596899271011351,
|
|
"memory(GiB)": 57.67,
|
|
"step": 695,
|
|
"token_acc": 0.9558689717925387,
|
|
"train_speed(iter/s)": 0.134726
|
|
},
|
|
{
|
|
"epoch": 1.7170042971147943,
|
|
"grad_norm": 0.4391373433161239,
|
|
"learning_rate": 2.3111423850317508e-07,
|
|
"loss": 0.17754709720611572,
|
|
"memory(GiB)": 57.67,
|
|
"step": 700,
|
|
"token_acc": 0.957532017854908,
|
|
"train_speed(iter/s)": 0.134883
|
|
},
|
|
{
|
|
"epoch": 1.729281767955801,
|
|
"grad_norm": 0.49039848892922233,
|
|
"learning_rate": 2.2755972388061755e-07,
|
|
"loss": 0.16874098777770996,
|
|
"memory(GiB)": 57.67,
|
|
"step": 705,
|
|
"token_acc": 0.9599765892310462,
|
|
"train_speed(iter/s)": 0.133296
|
|
},
|
|
{
|
|
"epoch": 1.7415592387968077,
|
|
"grad_norm": 0.4774575002594326,
|
|
"learning_rate": 2.2400977683578092e-07,
|
|
"loss": 0.16588878631591797,
|
|
"memory(GiB)": 57.67,
|
|
"step": 710,
|
|
"token_acc": 0.9488679320361306,
|
|
"train_speed(iter/s)": 0.133523
|
|
},
|
|
{
|
|
"epoch": 1.7538367096378145,
|
|
"grad_norm": 0.5147564388228866,
|
|
"learning_rate": 2.204651199381623e-07,
|
|
"loss": 0.15819010734558106,
|
|
"memory(GiB)": 57.67,
|
|
"step": 715,
|
|
"token_acc": 0.9515534491837809,
|
|
"train_speed(iter/s)": 0.133748
|
|
},
|
|
{
|
|
"epoch": 1.7661141804788214,
|
|
"grad_norm": 0.5103960664604678,
|
|
"learning_rate": 2.1692647468048233e-07,
|
|
"loss": 0.17287697792053222,
|
|
"memory(GiB)": 57.67,
|
|
"step": 720,
|
|
"token_acc": 0.9395699944668405,
|
|
"train_speed(iter/s)": 0.133786
|
|
},
|
|
{
|
|
"epoch": 1.7783916513198281,
|
|
"grad_norm": 0.4774493976896547,
|
|
"learning_rate": 2.1339456133183043e-07,
|
|
"loss": 0.1602993369102478,
|
|
"memory(GiB)": 57.67,
|
|
"step": 725,
|
|
"token_acc": 0.9661698051492921,
|
|
"train_speed(iter/s)": 0.133866
|
|
},
|
|
{
|
|
"epoch": 1.7906691221608348,
|
|
"grad_norm": 0.5075630225471248,
|
|
"learning_rate": 2.0987009879105762e-07,
|
|
"loss": 0.18199481964111328,
|
|
"memory(GiB)": 57.67,
|
|
"step": 730,
|
|
"token_acc": 0.9596290705197326,
|
|
"train_speed(iter/s)": 0.133996
|
|
},
|
|
{
|
|
"epoch": 1.8029465930018416,
|
|
"grad_norm": 0.4811594763625998,
|
|
"learning_rate": 2.0635380444044999e-07,
|
|
"loss": 0.17754099369049073,
|
|
"memory(GiB)": 57.67,
|
|
"step": 735,
|
|
"token_acc": 0.9363772728935719,
|
|
"train_speed(iter/s)": 0.134141
|
|
},
|
|
{
|
|
"epoch": 1.8152240638428485,
|
|
"grad_norm": 0.5223831838925675,
|
|
"learning_rate": 2.028463939997093e-07,
|
|
"loss": 0.16622164249420165,
|
|
"memory(GiB)": 57.67,
|
|
"step": 740,
|
|
"token_acc": 0.9566500118962645,
|
|
"train_speed(iter/s)": 0.134232
|
|
},
|
|
{
|
|
"epoch": 1.8275015346838552,
|
|
"grad_norm": 0.4729423140315417,
|
|
"learning_rate": 1.9934858138027323e-07,
|
|
"loss": 0.1844787120819092,
|
|
"memory(GiB)": 57.67,
|
|
"step": 745,
|
|
"token_acc": 0.9339959225280327,
|
|
"train_speed(iter/s)": 0.134303
|
|
},
|
|
{
|
|
"epoch": 1.839779005524862,
|
|
"grad_norm": 0.4875394529058418,
|
|
"learning_rate": 1.9586107854000325e-07,
|
|
"loss": 0.1973895788192749,
|
|
"memory(GiB)": 57.67,
|
|
"step": 750,
|
|
"token_acc": 0.9558183961305532,
|
|
"train_speed(iter/s)": 0.13412
|
|
},
|
|
{
|
|
"epoch": 1.8520564763658687,
|
|
"grad_norm": 0.40978646980156164,
|
|
"learning_rate": 1.9238459533826938e-07,
|
|
"loss": 0.19609053134918214,
|
|
"memory(GiB)": 57.67,
|
|
"step": 755,
|
|
"token_acc": 0.934996003197442,
|
|
"train_speed(iter/s)": 0.134194
|
|
},
|
|
{
|
|
"epoch": 1.8643339472068754,
|
|
"grad_norm": 0.5408849293574589,
|
|
"learning_rate": 1.8891983939146369e-07,
|
|
"loss": 0.1738824725151062,
|
|
"memory(GiB)": 57.67,
|
|
"step": 760,
|
|
"token_acc": 0.9426776599921476,
|
|
"train_speed(iter/s)": 0.134248
|
|
},
|
|
{
|
|
"epoch": 1.8766114180478821,
|
|
"grad_norm": 0.471076880754224,
|
|
"learning_rate": 1.8546751592896853e-07,
|
|
"loss": 0.18387995958328246,
|
|
"memory(GiB)": 57.67,
|
|
"step": 765,
|
|
"token_acc": 0.9433991482771971,
|
|
"train_speed(iter/s)": 0.13421
|
|
},
|
|
{
|
|
"epoch": 1.8888888888888888,
|
|
"grad_norm": 0.504836493093865,
|
|
"learning_rate": 1.8202832764961198e-07,
|
|
"loss": 0.18205785751342773,
|
|
"memory(GiB)": 57.67,
|
|
"step": 770,
|
|
"token_acc": 0.9527401477832512,
|
|
"train_speed(iter/s)": 0.13428
|
|
},
|
|
{
|
|
"epoch": 1.9011663597298956,
|
|
"grad_norm": 0.4319777645328756,
|
|
"learning_rate": 1.7860297457863802e-07,
|
|
"loss": 0.16953612565994264,
|
|
"memory(GiB)": 57.67,
|
|
"step": 775,
|
|
"token_acc": 0.947322033898305,
|
|
"train_speed(iter/s)": 0.134434
|
|
},
|
|
{
|
|
"epoch": 1.9134438305709023,
|
|
"grad_norm": 0.4590442017829868,
|
|
"learning_rate": 1.7519215392522025e-07,
|
|
"loss": 0.15192935466766358,
|
|
"memory(GiB)": 57.67,
|
|
"step": 780,
|
|
"token_acc": 0.9487110114791482,
|
|
"train_speed(iter/s)": 0.134411
|
|
},
|
|
{
|
|
"epoch": 1.925721301411909,
|
|
"grad_norm": 0.46331279083008303,
|
|
"learning_rate": 1.717965599405501e-07,
|
|
"loss": 0.17549625635147095,
|
|
"memory(GiB)": 57.67,
|
|
"step": 785,
|
|
"token_acc": 0.9622470689421031,
|
|
"train_speed(iter/s)": 0.134696
|
|
},
|
|
{
|
|
"epoch": 1.937998772252916,
|
|
"grad_norm": 0.4358941046923003,
|
|
"learning_rate": 1.6841688377652552e-07,
|
|
"loss": 0.1650502562522888,
|
|
"memory(GiB)": 57.67,
|
|
"step": 790,
|
|
"token_acc": 0.9644233133863431,
|
|
"train_speed(iter/s)": 0.134849
|
|
},
|
|
{
|
|
"epoch": 1.9502762430939227,
|
|
"grad_norm": 0.36413602356289787,
|
|
"learning_rate": 1.6505381334507175e-07,
|
|
"loss": 0.16262125968933105,
|
|
"memory(GiB)": 57.67,
|
|
"step": 795,
|
|
"token_acc": 0.9436930827359039,
|
|
"train_speed(iter/s)": 0.135052
|
|
},
|
|
{
|
|
"epoch": 1.9625537139349294,
|
|
"grad_norm": 0.5119930810029533,
|
|
"learning_rate": 1.6170803317812136e-07,
|
|
"loss": 0.17920398712158203,
|
|
"memory(GiB)": 57.67,
|
|
"step": 800,
|
|
"token_acc": 0.9322690781581618,
|
|
"train_speed(iter/s)": 0.135111
|
|
},
|
|
{
|
|
"epoch": 1.974831184775936,
|
|
"grad_norm": 0.4825725506640307,
|
|
"learning_rate": 1.583802242882816e-07,
|
|
"loss": 0.1905304193496704,
|
|
"memory(GiB)": 57.67,
|
|
"step": 805,
|
|
"token_acc": 0.9357515085024685,
|
|
"train_speed(iter/s)": 0.13373
|
|
},
|
|
{
|
|
"epoch": 1.987108655616943,
|
|
"grad_norm": 0.4976072969687368,
|
|
"learning_rate": 1.5507106403021895e-07,
|
|
"loss": 0.1734859824180603,
|
|
"memory(GiB)": 57.67,
|
|
"step": 810,
|
|
"token_acc": 0.9405059337913804,
|
|
"train_speed(iter/s)": 0.133765
|
|
},
|
|
{
|
|
"epoch": 1.9993861264579498,
|
|
"grad_norm": 0.5349651167180693,
|
|
"learning_rate": 1.517812259627874e-07,
|
|
"loss": 0.17344932556152343,
|
|
"memory(GiB)": 57.67,
|
|
"step": 815,
|
|
"token_acc": 0.9323653962492437,
|
|
"train_speed(iter/s)": 0.133696
|
|
},
|
|
{
|
|
"epoch": 2.0098219766728054,
|
|
"grad_norm": 0.6057966476171509,
|
|
"learning_rate": 1.4851137971193018e-07,
|
|
"loss": 0.16537351608276368,
|
|
"memory(GiB)": 57.67,
|
|
"step": 820,
|
|
"token_acc": 0.9556328651806039,
|
|
"train_speed(iter/s)": 0.133772
|
|
},
|
|
{
|
|
"epoch": 2.022099447513812,
|
|
"grad_norm": 1.0192898362372844,
|
|
"learning_rate": 1.4526219083438153e-07,
|
|
"loss": 0.17631728649139405,
|
|
"memory(GiB)": 57.67,
|
|
"step": 825,
|
|
"token_acc": 0.9437896645512239,
|
|
"train_speed(iter/s)": 0.133754
|
|
},
|
|
{
|
|
"epoch": 2.034376918354819,
|
|
"grad_norm": 0.43605668597687935,
|
|
"learning_rate": 1.4203432068219616e-07,
|
|
"loss": 0.1445701837539673,
|
|
"memory(GiB)": 57.67,
|
|
"step": 830,
|
|
"token_acc": 0.9750953344946095,
|
|
"train_speed(iter/s)": 0.133755
|
|
},
|
|
{
|
|
"epoch": 2.0466543891958255,
|
|
"grad_norm": 0.4517333876137591,
|
|
"learning_rate": 1.3882842626813645e-07,
|
|
"loss": 0.15836387872695923,
|
|
"memory(GiB)": 57.67,
|
|
"step": 835,
|
|
"token_acc": 0.96793536040825,
|
|
"train_speed(iter/s)": 0.133783
|
|
},
|
|
{
|
|
"epoch": 2.0589318600368323,
|
|
"grad_norm": 0.5383308109264513,
|
|
"learning_rate": 1.3564516013194022e-07,
|
|
"loss": 0.18179185390472413,
|
|
"memory(GiB)": 57.67,
|
|
"step": 840,
|
|
"token_acc": 0.9378444703705193,
|
|
"train_speed(iter/s)": 0.133821
|
|
},
|
|
{
|
|
"epoch": 2.071209330877839,
|
|
"grad_norm": 0.4522142212035384,
|
|
"learning_rate": 1.3248517020750123e-07,
|
|
"loss": 0.16212983131408693,
|
|
"memory(GiB)": 57.67,
|
|
"step": 845,
|
|
"token_acc": 0.9407257155735858,
|
|
"train_speed(iter/s)": 0.13394
|
|
},
|
|
{
|
|
"epoch": 2.0834868017188457,
|
|
"grad_norm": 0.45710576973364,
|
|
"learning_rate": 1.2934909969098612e-07,
|
|
"loss": 0.1782787561416626,
|
|
"memory(GiB)": 57.67,
|
|
"step": 850,
|
|
"token_acc": 0.9515674953476165,
|
|
"train_speed(iter/s)": 0.133905
|
|
},
|
|
{
|
|
"epoch": 2.095764272559853,
|
|
"grad_norm": 0.492740078600037,
|
|
"learning_rate": 1.2623758690991567e-07,
|
|
"loss": 0.1520832061767578,
|
|
"memory(GiB)": 57.67,
|
|
"step": 855,
|
|
"token_acc": 0.9375085324232082,
|
|
"train_speed(iter/s)": 0.13385
|
|
},
|
|
{
|
|
"epoch": 2.1080417434008596,
|
|
"grad_norm": 0.4433975430592583,
|
|
"learning_rate": 1.2315126519323751e-07,
|
|
"loss": 0.18507776260375977,
|
|
"memory(GiB)": 57.67,
|
|
"step": 860,
|
|
"token_acc": 0.9329054289056818,
|
|
"train_speed(iter/s)": 0.133924
|
|
},
|
|
{
|
|
"epoch": 2.1203192142418663,
|
|
"grad_norm": 0.5301183452451113,
|
|
"learning_rate": 1.2009076274241567e-07,
|
|
"loss": 0.15922095775604247,
|
|
"memory(GiB)": 57.67,
|
|
"step": 865,
|
|
"token_acc": 0.948112669631657,
|
|
"train_speed(iter/s)": 0.134001
|
|
},
|
|
{
|
|
"epoch": 2.132596685082873,
|
|
"grad_norm": 0.5377164487351925,
|
|
"learning_rate": 1.1705670250356414e-07,
|
|
"loss": 0.17927762269973754,
|
|
"memory(GiB)": 57.67,
|
|
"step": 870,
|
|
"token_acc": 0.9462852794687328,
|
|
"train_speed(iter/s)": 0.13419
|
|
},
|
|
{
|
|
"epoch": 2.1448741559238798,
|
|
"grad_norm": 0.5286410675937543,
|
|
"learning_rate": 1.1404970204065056e-07,
|
|
"loss": 0.17566382884979248,
|
|
"memory(GiB)": 57.67,
|
|
"step": 875,
|
|
"token_acc": 0.9370597142669037,
|
|
"train_speed(iter/s)": 0.134182
|
|
},
|
|
{
|
|
"epoch": 2.1571516267648865,
|
|
"grad_norm": 0.40268668050343476,
|
|
"learning_rate": 1.110703734097942e-07,
|
|
"loss": 0.15375242233276368,
|
|
"memory(GiB)": 57.67,
|
|
"step": 880,
|
|
"token_acc": 0.9555922520753369,
|
|
"train_speed(iter/s)": 0.134177
|
|
},
|
|
{
|
|
"epoch": 2.169429097605893,
|
|
"grad_norm": 0.5959804911771396,
|
|
"learning_rate": 1.0811932303468649e-07,
|
|
"loss": 0.18795297145843506,
|
|
"memory(GiB)": 57.67,
|
|
"step": 885,
|
|
"token_acc": 0.93544177741149,
|
|
"train_speed(iter/s)": 0.134122
|
|
},
|
|
{
|
|
"epoch": 2.1817065684469,
|
|
"grad_norm": 0.4413780946022899,
|
|
"learning_rate": 1.0519715158315667e-07,
|
|
"loss": 0.16619727611541749,
|
|
"memory(GiB)": 57.67,
|
|
"step": 890,
|
|
"token_acc": 0.9495682081573142,
|
|
"train_speed(iter/s)": 0.134035
|
|
},
|
|
{
|
|
"epoch": 2.1939840392879066,
|
|
"grad_norm": 0.41173432463685106,
|
|
"learning_rate": 1.0230445384491002e-07,
|
|
"loss": 0.15455365180969238,
|
|
"memory(GiB)": 57.67,
|
|
"step": 895,
|
|
"token_acc": 0.9446943730362753,
|
|
"train_speed(iter/s)": 0.134118
|
|
},
|
|
{
|
|
"epoch": 2.2062615101289134,
|
|
"grad_norm": 0.48208299751179823,
|
|
"learning_rate": 9.944181861046186e-08,
|
|
"loss": 0.18413586616516114,
|
|
"memory(GiB)": 57.67,
|
|
"step": 900,
|
|
"token_acc": 0.9379831280223078,
|
|
"train_speed(iter/s)": 0.134153
|
|
},
|
|
{
|
|
"epoch": 2.21853898096992,
|
|
"grad_norm": 0.3966707114512702,
|
|
"learning_rate": 9.660982855129313e-08,
|
|
"loss": 0.15873076915740966,
|
|
"memory(GiB)": 57.67,
|
|
"step": 905,
|
|
"token_acc": 0.9590186155792517,
|
|
"train_speed(iter/s)": 0.132978
|
|
},
|
|
{
|
|
"epoch": 2.230816451810927,
|
|
"grad_norm": 0.4381866262273687,
|
|
"learning_rate": 9.380906010125136e-08,
|
|
"loss": 0.15625982284545897,
|
|
"memory(GiB)": 57.67,
|
|
"step": 910,
|
|
"token_acc": 0.9386312965272267,
|
|
"train_speed(iter/s)": 0.133045
|
|
},
|
|
{
|
|
"epoch": 2.2430939226519335,
|
|
"grad_norm": 0.5223822620159875,
|
|
"learning_rate": 9.104008333922076e-08,
|
|
"loss": 0.16865816116333007,
|
|
"memory(GiB)": 57.67,
|
|
"step": 915,
|
|
"token_acc": 0.9422254974207811,
|
|
"train_speed(iter/s)": 0.133004
|
|
},
|
|
{
|
|
"epoch": 2.2553713934929407,
|
|
"grad_norm": 0.38440187816263854,
|
|
"learning_rate": 8.830346187308649e-08,
|
|
"loss": 0.1423816680908203,
|
|
"memory(GiB)": 57.67,
|
|
"step": 920,
|
|
"token_acc": 0.9562720848056537,
|
|
"train_speed(iter/s)": 0.133076
|
|
},
|
|
{
|
|
"epoch": 2.267648864333947,
|
|
"grad_norm": 0.4592624600791522,
|
|
"learning_rate": 8.559975272501601e-08,
|
|
"loss": 0.16395586729049683,
|
|
"memory(GiB)": 57.67,
|
|
"step": 925,
|
|
"token_acc": 0.9564571607254534,
|
|
"train_speed(iter/s)": 0.133105
|
|
},
|
|
{
|
|
"epoch": 2.279926335174954,
|
|
"grad_norm": 0.43806629453653373,
|
|
"learning_rate": 8.29295062180802e-08,
|
|
"loss": 0.16589367389678955,
|
|
"memory(GiB)": 57.67,
|
|
"step": 930,
|
|
"token_acc": 0.9659481977902554,
|
|
"train_speed(iter/s)": 0.133066
|
|
},
|
|
{
|
|
"epoch": 2.292203806015961,
|
|
"grad_norm": 0.464370245760019,
|
|
"learning_rate": 8.029326586423907e-08,
|
|
"loss": 0.1606292724609375,
|
|
"memory(GiB)": 57.67,
|
|
"step": 935,
|
|
"token_acc": 0.9405248868778281,
|
|
"train_speed(iter/s)": 0.133054
|
|
},
|
|
{
|
|
"epoch": 2.3044812768569676,
|
|
"grad_norm": 0.492043728887271,
|
|
"learning_rate": 7.769156825371286e-08,
|
|
"loss": 0.16174919605255128,
|
|
"memory(GiB)": 57.67,
|
|
"step": 940,
|
|
"token_acc": 0.9457228709444296,
|
|
"train_speed(iter/s)": 0.133172
|
|
},
|
|
{
|
|
"epoch": 2.3167587476979743,
|
|
"grad_norm": 0.4103343451876262,
|
|
"learning_rate": 7.512494294576269e-08,
|
|
"loss": 0.1728949785232544,
|
|
"memory(GiB)": 57.67,
|
|
"step": 945,
|
|
"token_acc": 0.9441858719315367,
|
|
"train_speed(iter/s)": 0.133227
|
|
},
|
|
{
|
|
"epoch": 2.329036218538981,
|
|
"grad_norm": 0.4809983010327667,
|
|
"learning_rate": 7.25939123609022e-08,
|
|
"loss": 0.15887634754180907,
|
|
"memory(GiB)": 57.67,
|
|
"step": 950,
|
|
"token_acc": 0.9603744280737103,
|
|
"train_speed(iter/s)": 0.133368
|
|
},
|
|
{
|
|
"epoch": 2.3413136893799877,
|
|
"grad_norm": 0.37621027938026347,
|
|
"learning_rate": 7.009899167456185e-08,
|
|
"loss": 0.14414477348327637,
|
|
"memory(GiB)": 57.67,
|
|
"step": 955,
|
|
"token_acc": 0.9454514068703608,
|
|
"train_speed(iter/s)": 0.133409
|
|
},
|
|
{
|
|
"epoch": 2.3535911602209945,
|
|
"grad_norm": 0.47025720298842205,
|
|
"learning_rate": 6.764068871222825e-08,
|
|
"loss": 0.18474191427230835,
|
|
"memory(GiB)": 57.67,
|
|
"step": 960,
|
|
"token_acc": 0.933993399339934,
|
|
"train_speed(iter/s)": 0.133486
|
|
},
|
|
{
|
|
"epoch": 2.365868631062001,
|
|
"grad_norm": 0.48612512291843357,
|
|
"learning_rate": 6.521950384607974e-08,
|
|
"loss": 0.18921175003051757,
|
|
"memory(GiB)": 57.67,
|
|
"step": 965,
|
|
"token_acc": 0.9431688588154794,
|
|
"train_speed(iter/s)": 0.133588
|
|
},
|
|
{
|
|
"epoch": 2.378146101903008,
|
|
"grad_norm": 0.3806957752805889,
|
|
"learning_rate": 6.283592989313841e-08,
|
|
"loss": 0.1681033730506897,
|
|
"memory(GiB)": 57.67,
|
|
"step": 970,
|
|
"token_acc": 0.9367773677736777,
|
|
"train_speed(iter/s)": 0.13364
|
|
},
|
|
{
|
|
"epoch": 2.3904235727440146,
|
|
"grad_norm": 0.45577439700594125,
|
|
"learning_rate": 6.049045201496042e-08,
|
|
"loss": 0.15442556142807007,
|
|
"memory(GiB)": 57.67,
|
|
"step": 975,
|
|
"token_acc": 0.948759111419646,
|
|
"train_speed(iter/s)": 0.13374
|
|
},
|
|
{
|
|
"epoch": 2.4027010435850213,
|
|
"grad_norm": 0.4860898688565903,
|
|
"learning_rate": 5.818354761888444e-08,
|
|
"loss": 0.15901718139648438,
|
|
"memory(GiB)": 57.67,
|
|
"step": 980,
|
|
"token_acc": 0.9584408255401483,
|
|
"train_speed(iter/s)": 0.13388
|
|
},
|
|
{
|
|
"epoch": 2.414978514426028,
|
|
"grad_norm": 0.514344692664739,
|
|
"learning_rate": 5.5915686260858244e-08,
|
|
"loss": 0.17236262559890747,
|
|
"memory(GiB)": 57.67,
|
|
"step": 985,
|
|
"token_acc": 0.9471186187308468,
|
|
"train_speed(iter/s)": 0.133984
|
|
},
|
|
{
|
|
"epoch": 2.427255985267035,
|
|
"grad_norm": 0.5543092645451274,
|
|
"learning_rate": 5.368732954986388e-08,
|
|
"loss": 0.1594996929168701,
|
|
"memory(GiB)": 57.67,
|
|
"step": 990,
|
|
"token_acc": 0.9353687315634218,
|
|
"train_speed(iter/s)": 0.134028
|
|
},
|
|
{
|
|
"epoch": 2.439533456108042,
|
|
"grad_norm": 0.47905205598317496,
|
|
"learning_rate": 5.14989310539595e-08,
|
|
"loss": 0.17013013362884521,
|
|
"memory(GiB)": 57.67,
|
|
"step": 995,
|
|
"token_acc": 0.9312570646677375,
|
|
"train_speed(iter/s)": 0.134104
|
|
},
|
|
{
|
|
"epoch": 2.4518109269490487,
|
|
"grad_norm": 0.45866756604400005,
|
|
"learning_rate": 4.935093620795902e-08,
|
|
"loss": 0.16783492565155028,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1000,
|
|
"token_acc": 0.9406372313396965,
|
|
"train_speed(iter/s)": 0.134234
|
|
},
|
|
{
|
|
"epoch": 2.4640883977900554,
|
|
"grad_norm": 0.45591647725979945,
|
|
"learning_rate": 4.7243782222766124e-08,
|
|
"loss": 0.15385560989379882,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1005,
|
|
"token_acc": 0.9611999210578251,
|
|
"train_speed(iter/s)": 0.132972
|
|
},
|
|
{
|
|
"epoch": 2.476365868631062,
|
|
"grad_norm": 0.4717972783404294,
|
|
"learning_rate": 4.517789799638297e-08,
|
|
"loss": 0.1716939926147461,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1010,
|
|
"token_acc": 0.9752226720647773,
|
|
"train_speed(iter/s)": 0.133203
|
|
},
|
|
{
|
|
"epoch": 2.488643339472069,
|
|
"grad_norm": 0.4386963670049322,
|
|
"learning_rate": 4.315370402661092e-08,
|
|
"loss": 0.16546686887741088,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1015,
|
|
"token_acc": 0.9399607392866403,
|
|
"train_speed(iter/s)": 0.133178
|
|
},
|
|
{
|
|
"epoch": 2.5009208103130756,
|
|
"grad_norm": 0.5231430157385392,
|
|
"learning_rate": 4.1171612325460236e-08,
|
|
"loss": 0.18315892219543456,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1020,
|
|
"token_acc": 0.9306804077180577,
|
|
"train_speed(iter/s)": 0.133198
|
|
},
|
|
{
|
|
"epoch": 2.5131982811540823,
|
|
"grad_norm": 0.5125599782547446,
|
|
"learning_rate": 3.9232026335288296e-08,
|
|
"loss": 0.168873929977417,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1025,
|
|
"token_acc": 0.9482202118470701,
|
|
"train_speed(iter/s)": 0.133093
|
|
},
|
|
{
|
|
"epoch": 2.525475751995089,
|
|
"grad_norm": 0.4146726330452646,
|
|
"learning_rate": 3.733534084668091e-08,
|
|
"loss": 0.1465557336807251,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1030,
|
|
"token_acc": 0.9542228126779275,
|
|
"train_speed(iter/s)": 0.133256
|
|
},
|
|
{
|
|
"epoch": 2.5377532228360957,
|
|
"grad_norm": 0.4928965284851222,
|
|
"learning_rate": 3.5481941918095396e-08,
|
|
"loss": 0.1535036325454712,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1035,
|
|
"token_acc": 0.98864726574992,
|
|
"train_speed(iter/s)": 0.133212
|
|
},
|
|
{
|
|
"epoch": 2.5500306936771024,
|
|
"grad_norm": 0.46167944188273663,
|
|
"learning_rate": 3.367220679728089e-08,
|
|
"loss": 0.17861878871917725,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1040,
|
|
"token_acc": 0.9435955137744546,
|
|
"train_speed(iter/s)": 0.133172
|
|
},
|
|
{
|
|
"epoch": 2.562308164518109,
|
|
"grad_norm": 0.5398332391068397,
|
|
"learning_rate": 3.190650384449167e-08,
|
|
"loss": 0.1772806763648987,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1045,
|
|
"token_acc": 0.9553827261563651,
|
|
"train_speed(iter/s)": 0.133219
|
|
},
|
|
{
|
|
"epoch": 2.574585635359116,
|
|
"grad_norm": 0.5539788244071175,
|
|
"learning_rate": 3.018519245750989e-08,
|
|
"loss": 0.14356986284255982,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1050,
|
|
"token_acc": 0.968132854578097,
|
|
"train_speed(iter/s)": 0.133317
|
|
},
|
|
{
|
|
"epoch": 2.5868631062001226,
|
|
"grad_norm": 0.4402330635590166,
|
|
"learning_rate": 2.850862299849241e-08,
|
|
"loss": 0.16220954656600953,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1055,
|
|
"token_acc": 0.9482638506948264,
|
|
"train_speed(iter/s)": 0.133252
|
|
},
|
|
{
|
|
"epoch": 2.5991405770411298,
|
|
"grad_norm": 0.48620425405940176,
|
|
"learning_rate": 2.6877136722656734e-08,
|
|
"loss": 0.18278908729553223,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1060,
|
|
"token_acc": 0.9325091336116911,
|
|
"train_speed(iter/s)": 0.133237
|
|
},
|
|
{
|
|
"epoch": 2.611418047882136,
|
|
"grad_norm": 0.43457144570186146,
|
|
"learning_rate": 2.5291065708820754e-08,
|
|
"loss": 0.17628798484802247,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1065,
|
|
"token_acc": 0.9281233833419555,
|
|
"train_speed(iter/s)": 0.13325
|
|
},
|
|
{
|
|
"epoch": 2.623695518723143,
|
|
"grad_norm": 0.4143344415028038,
|
|
"learning_rate": 2.375073279180992e-08,
|
|
"loss": 0.15394517183303832,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1070,
|
|
"token_acc": 0.9541889329425961,
|
|
"train_speed(iter/s)": 0.133392
|
|
},
|
|
{
|
|
"epoch": 2.63597298956415,
|
|
"grad_norm": 0.4688198080933725,
|
|
"learning_rate": 2.2256451496746653e-08,
|
|
"loss": 0.1712632417678833,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1075,
|
|
"token_acc": 0.9350767303476355,
|
|
"train_speed(iter/s)": 0.133374
|
|
},
|
|
{
|
|
"epoch": 2.6482504604051567,
|
|
"grad_norm": 0.5198918873737135,
|
|
"learning_rate": 2.0808525975233805e-08,
|
|
"loss": 0.19560701847076417,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1080,
|
|
"token_acc": 0.9272029474976973,
|
|
"train_speed(iter/s)": 0.133426
|
|
},
|
|
{
|
|
"epoch": 2.6605279312461634,
|
|
"grad_norm": 0.4846401676955354,
|
|
"learning_rate": 1.940725094344675e-08,
|
|
"loss": 0.17480210065841675,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1085,
|
|
"token_acc": 0.9508643542545645,
|
|
"train_speed(iter/s)": 0.133534
|
|
},
|
|
{
|
|
"epoch": 2.67280540208717,
|
|
"grad_norm": 0.4514013424363714,
|
|
"learning_rate": 1.8052911622145866e-08,
|
|
"loss": 0.1648250102996826,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1090,
|
|
"token_acc": 0.9454948354122735,
|
|
"train_speed(iter/s)": 0.133539
|
|
},
|
|
{
|
|
"epoch": 2.685082872928177,
|
|
"grad_norm": 0.4989521104653192,
|
|
"learning_rate": 1.6745783678621367e-08,
|
|
"loss": 0.16439478397369384,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1095,
|
|
"token_acc": 0.9524301583201092,
|
|
"train_speed(iter/s)": 0.133571
|
|
},
|
|
{
|
|
"epoch": 2.6973603437691835,
|
|
"grad_norm": 0.49299845547327686,
|
|
"learning_rate": 1.5486133170583145e-08,
|
|
"loss": 0.16308257579803467,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1100,
|
|
"token_acc": 0.9602804837874723,
|
|
"train_speed(iter/s)": 0.133601
|
|
},
|
|
{
|
|
"epoch": 2.7096378146101903,
|
|
"grad_norm": 0.38800040402764896,
|
|
"learning_rate": 1.4274216492006302e-08,
|
|
"loss": 0.20491249561309816,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1105,
|
|
"token_acc": 0.9337728751954694,
|
|
"train_speed(iter/s)": 0.132264
|
|
},
|
|
{
|
|
"epoch": 2.721915285451197,
|
|
"grad_norm": 0.46254709159908636,
|
|
"learning_rate": 1.311028032094369e-08,
|
|
"loss": 0.1842280149459839,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1110,
|
|
"token_acc": 0.9332923076923076,
|
|
"train_speed(iter/s)": 0.132279
|
|
},
|
|
{
|
|
"epoch": 2.7341927562922037,
|
|
"grad_norm": 0.4700426518953757,
|
|
"learning_rate": 1.1994561569316442e-08,
|
|
"loss": 0.15502922534942626,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1115,
|
|
"token_acc": 0.9172778194837443,
|
|
"train_speed(iter/s)": 0.132416
|
|
},
|
|
{
|
|
"epoch": 2.7464702271332104,
|
|
"grad_norm": 0.4570084263374054,
|
|
"learning_rate": 1.0927287334691616e-08,
|
|
"loss": 0.17944493293762206,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1120,
|
|
"token_acc": 0.9620748077674964,
|
|
"train_speed(iter/s)": 0.132397
|
|
},
|
|
{
|
|
"epoch": 2.758747697974217,
|
|
"grad_norm": 0.5565607166042048,
|
|
"learning_rate": 9.908674854058219e-09,
|
|
"loss": 0.16596771478652955,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1125,
|
|
"token_acc": 0.946463347418703,
|
|
"train_speed(iter/s)": 0.132429
|
|
},
|
|
{
|
|
"epoch": 2.771025168815224,
|
|
"grad_norm": 0.43235518649936683,
|
|
"learning_rate": 8.938931459609806e-09,
|
|
"loss": 0.16782586574554442,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1130,
|
|
"token_acc": 0.9357140571501712,
|
|
"train_speed(iter/s)": 0.132529
|
|
},
|
|
{
|
|
"epoch": 2.783302639656231,
|
|
"grad_norm": 0.5041786160999633,
|
|
"learning_rate": 8.018254536543451e-09,
|
|
"loss": 0.18278899192810058,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1135,
|
|
"token_acc": 0.9538431826960216,
|
|
"train_speed(iter/s)": 0.132595
|
|
},
|
|
{
|
|
"epoch": 2.7955801104972373,
|
|
"grad_norm": 0.3988655252448663,
|
|
"learning_rate": 7.146831482883115e-09,
|
|
"loss": 0.16640629768371581,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1140,
|
|
"token_acc": 0.9387376446378571,
|
|
"train_speed(iter/s)": 0.132687
|
|
},
|
|
{
|
|
"epoch": 2.8078575813382445,
|
|
"grad_norm": 0.5585703278010272,
|
|
"learning_rate": 6.32483967133593e-09,
|
|
"loss": 0.16575145721435547,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1145,
|
|
"token_acc": 0.9446856625961103,
|
|
"train_speed(iter/s)": 0.132784
|
|
},
|
|
{
|
|
"epoch": 2.820135052179251,
|
|
"grad_norm": 0.5408177805631927,
|
|
"learning_rate": 5.5524464131893046e-09,
|
|
"loss": 0.16870219707489015,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1150,
|
|
"token_acc": 0.9385546004457179,
|
|
"train_speed(iter/s)": 0.132784
|
|
},
|
|
{
|
|
"epoch": 2.832412523020258,
|
|
"grad_norm": 0.47090675834257434,
|
|
"learning_rate": 4.829808924255441e-09,
|
|
"loss": 0.1629176139831543,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1155,
|
|
"token_acc": 0.9479655438055886,
|
|
"train_speed(iter/s)": 0.132829
|
|
},
|
|
{
|
|
"epoch": 2.8446899938612646,
|
|
"grad_norm": 0.482099713549635,
|
|
"learning_rate": 4.157074292871238e-09,
|
|
"loss": 0.15441689491271973,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1160,
|
|
"token_acc": 0.9639929344626116,
|
|
"train_speed(iter/s)": 0.132784
|
|
},
|
|
{
|
|
"epoch": 2.8569674647022714,
|
|
"grad_norm": 0.4698231217223272,
|
|
"learning_rate": 3.5343794499594625e-09,
|
|
"loss": 0.18142955303192138,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1165,
|
|
"token_acc": 0.9412326017570991,
|
|
"train_speed(iter/s)": 0.132762
|
|
},
|
|
{
|
|
"epoch": 2.869244935543278,
|
|
"grad_norm": 0.43804204402927915,
|
|
"learning_rate": 2.9618511411570455e-09,
|
|
"loss": 0.18446786403656007,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1170,
|
|
"token_acc": 0.9350367684435257,
|
|
"train_speed(iter/s)": 0.132754
|
|
},
|
|
{
|
|
"epoch": 2.881522406384285,
|
|
"grad_norm": 0.5571992556185795,
|
|
"learning_rate": 2.4396059010170777e-09,
|
|
"loss": 0.1577387809753418,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1175,
|
|
"token_acc": 0.9629287863590772,
|
|
"train_speed(iter/s)": 0.132933
|
|
},
|
|
{
|
|
"epoch": 2.8937998772252915,
|
|
"grad_norm": 0.5267456210442775,
|
|
"learning_rate": 1.967750029288756e-09,
|
|
"loss": 0.15231819152832032,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1180,
|
|
"token_acc": 0.9535112359550562,
|
|
"train_speed(iter/s)": 0.133115
|
|
},
|
|
{
|
|
"epoch": 2.9060773480662982,
|
|
"grad_norm": 0.3910293202558737,
|
|
"learning_rate": 1.5463795692808034e-09,
|
|
"loss": 0.162775456905365,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1185,
|
|
"token_acc": 0.931954924708307,
|
|
"train_speed(iter/s)": 0.133131
|
|
},
|
|
{
|
|
"epoch": 2.918354818907305,
|
|
"grad_norm": 0.4093866355363869,
|
|
"learning_rate": 1.1755802883124389e-09,
|
|
"loss": 0.16861215829849244,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1190,
|
|
"token_acc": 0.9492351730924053,
|
|
"train_speed(iter/s)": 0.133244
|
|
},
|
|
{
|
|
"epoch": 2.9306322897483117,
|
|
"grad_norm": 0.5603512977054463,
|
|
"learning_rate": 8.554276602559807e-10,
|
|
"loss": 0.20310664176940918,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1195,
|
|
"token_acc": 0.9434275032624414,
|
|
"train_speed(iter/s)": 0.133257
|
|
},
|
|
{
|
|
"epoch": 2.942909760589319,
|
|
"grad_norm": 0.5047283156733317,
|
|
"learning_rate": 5.859868501746079e-10,
|
|
"loss": 0.1582653284072876,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1200,
|
|
"token_acc": 0.9450664007614559,
|
|
"train_speed(iter/s)": 0.133326
|
|
},
|
|
{
|
|
"epoch": 2.955187231430325,
|
|
"grad_norm": 0.3732689118202031,
|
|
"learning_rate": 3.6731270105844204e-10,
|
|
"loss": 0.17182209491729736,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1205,
|
|
"token_acc": 0.9346191946856244,
|
|
"train_speed(iter/s)": 0.132213
|
|
},
|
|
{
|
|
"epoch": 2.9674647022713323,
|
|
"grad_norm": 0.513720762261806,
|
|
"learning_rate": 1.9944972266153214e-10,
|
|
"loss": 0.17291358709335328,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1210,
|
|
"token_acc": 0.9477955520873976,
|
|
"train_speed(iter/s)": 0.132351
|
|
},
|
|
{
|
|
"epoch": 2.979742173112339,
|
|
"grad_norm": 0.4516334969279687,
|
|
"learning_rate": 8.243208244229637e-11,
|
|
"loss": 0.156210994720459,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1215,
|
|
"token_acc": 0.9544438002153566,
|
|
"train_speed(iter/s)": 0.132457
|
|
},
|
|
{
|
|
"epoch": 2.9920196439533457,
|
|
"grad_norm": 0.4764485403289777,
|
|
"learning_rate": 1.628359860883499e-11,
|
|
"loss": 0.1759173631668091,
|
|
"memory(GiB)": 57.67,
|
|
"step": 1220,
|
|
"token_acc": 0.9492864815098971,
|
|
"train_speed(iter/s)": 0.132386
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 1224,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 100,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 169385433251840.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|