{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100.0, "global_step": 1224, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024554941682013503, "grad_norm": 1.7764384203008543, "learning_rate": 4.065040650406504e-09, "loss": 0.6458772420883179, "memory(GiB)": 26.71, "step": 1, "token_acc": 0.8, "train_speed(iter/s)": 0.059991 }, { "epoch": 0.012277470841006752, "grad_norm": 1.5484946901004182, "learning_rate": 2.032520325203252e-08, "loss": 0.5884945392608643, "memory(GiB)": 52.53, "step": 5, "token_acc": 0.8317063859955282, "train_speed(iter/s)": 0.107624 }, { "epoch": 0.024554941682013505, "grad_norm": 1.4563640400430145, "learning_rate": 4.065040650406504e-08, "loss": 0.5743718624114991, "memory(GiB)": 52.53, "step": 10, "token_acc": 0.8498605726722438, "train_speed(iter/s)": 0.112877 }, { "epoch": 0.03683241252302026, "grad_norm": 1.4140197884092542, "learning_rate": 6.097560975609756e-08, "loss": 0.590625, "memory(GiB)": 52.53, "step": 15, "token_acc": 0.8347938144329897, "train_speed(iter/s)": 0.117218 }, { "epoch": 0.04910988336402701, "grad_norm": 1.6622234178073612, "learning_rate": 8.130081300813008e-08, "loss": 0.5686985969543457, "memory(GiB)": 52.53, "step": 20, "token_acc": 0.8420736417474894, "train_speed(iter/s)": 0.116503 }, { "epoch": 0.061387354205033766, "grad_norm": 1.5449639237910855, "learning_rate": 1.016260162601626e-07, "loss": 0.5720160961151123, "memory(GiB)": 52.53, "step": 25, "token_acc": 0.8329814975785421, "train_speed(iter/s)": 0.121226 }, { "epoch": 0.07366482504604052, "grad_norm": 1.4350414462950623, "learning_rate": 1.219512195121951e-07, "loss": 0.5950272083282471, "memory(GiB)": 56.59, "step": 30, "token_acc": 0.8376042390548992, "train_speed(iter/s)": 0.1248 }, { "epoch": 0.08594229588704727, "grad_norm": 1.6390155212777497, "learning_rate": 1.4227642276422763e-07, "loss": 0.5760597229003906, "memory(GiB)": 56.59, "step": 35, "token_acc": 0.8459589739981, "train_speed(iter/s)": 0.12788 }, { "epoch": 0.09821976672805402, "grad_norm": 1.3096137406039592, "learning_rate": 1.6260162601626016e-07, "loss": 0.5555302143096924, "memory(GiB)": 56.59, "step": 40, "token_acc": 0.8554021379485874, "train_speed(iter/s)": 0.12593 }, { "epoch": 0.11049723756906077, "grad_norm": 1.422446577039075, "learning_rate": 1.8292682926829268e-07, "loss": 0.5970460891723632, "memory(GiB)": 57.67, "step": 45, "token_acc": 0.8417625280973159, "train_speed(iter/s)": 0.126605 }, { "epoch": 0.12277470841006753, "grad_norm": 1.2819764069439123, "learning_rate": 2.032520325203252e-07, "loss": 0.5884549140930175, "memory(GiB)": 57.67, "step": 50, "token_acc": 0.8406445511421504, "train_speed(iter/s)": 0.129975 }, { "epoch": 0.13505217925107427, "grad_norm": 1.41910630937099, "learning_rate": 2.235772357723577e-07, "loss": 0.6231883049011231, "memory(GiB)": 57.67, "step": 55, "token_acc": 0.829693936306861, "train_speed(iter/s)": 0.132967 }, { "epoch": 0.14732965009208104, "grad_norm": 1.2531126371792276, "learning_rate": 2.439024390243902e-07, "loss": 0.5942145347595215, "memory(GiB)": 57.67, "step": 60, "token_acc": 0.8500459277403551, "train_speed(iter/s)": 0.133823 }, { "epoch": 0.1596071209330878, "grad_norm": 1.1301268691981066, "learning_rate": 2.6422764227642274e-07, "loss": 0.5744296073913574, "memory(GiB)": 57.67, "step": 65, "token_acc": 0.8480531499966947, "train_speed(iter/s)": 0.134364 }, { "epoch": 0.17188459177409454, "grad_norm": 1.0132293794887546, "learning_rate": 2.8455284552845527e-07, "loss": 0.5759846210479737, "memory(GiB)": 57.67, "step": 70, "token_acc": 0.8393322326178683, "train_speed(iter/s)": 0.137087 }, { "epoch": 0.1841620626151013, "grad_norm": 1.1217457637602928, "learning_rate": 3.048780487804878e-07, "loss": 0.56055588722229, "memory(GiB)": 57.67, "step": 75, "token_acc": 0.8425447316103379, "train_speed(iter/s)": 0.138299 }, { "epoch": 0.19643953345610804, "grad_norm": 0.8818994062981204, "learning_rate": 3.252032520325203e-07, "loss": 0.5674080848693848, "memory(GiB)": 57.67, "step": 80, "token_acc": 0.8303922425089543, "train_speed(iter/s)": 0.137434 }, { "epoch": 0.2087170042971148, "grad_norm": 0.8918960386763577, "learning_rate": 3.4552845528455284e-07, "loss": 0.559955358505249, "memory(GiB)": 57.67, "step": 85, "token_acc": 0.8446934551423881, "train_speed(iter/s)": 0.137418 }, { "epoch": 0.22099447513812154, "grad_norm": 1.0522794863647833, "learning_rate": 3.6585365853658536e-07, "loss": 0.5732513904571533, "memory(GiB)": 57.67, "step": 90, "token_acc": 0.8489562749254482, "train_speed(iter/s)": 0.137088 }, { "epoch": 0.2332719459791283, "grad_norm": 0.785438670663371, "learning_rate": 3.861788617886179e-07, "loss": 0.5366201877593995, "memory(GiB)": 57.67, "step": 95, "token_acc": 0.8626246438746439, "train_speed(iter/s)": 0.136376 }, { "epoch": 0.24554941682013506, "grad_norm": 1.0039103796923265, "learning_rate": 4.065040650406504e-07, "loss": 0.5319199085235595, "memory(GiB)": 57.67, "step": 100, "token_acc": 0.8360549786720669, "train_speed(iter/s)": 0.136236 }, { "epoch": 0.2578268876611418, "grad_norm": 0.8843990831220632, "learning_rate": 4.268292682926829e-07, "loss": 0.5205207347869873, "memory(GiB)": 57.67, "step": 105, "token_acc": 0.8555977051133336, "train_speed(iter/s)": 0.127767 }, { "epoch": 0.27010435850214853, "grad_norm": 0.7775914671814245, "learning_rate": 4.471544715447154e-07, "loss": 0.5021872520446777, "memory(GiB)": 57.67, "step": 110, "token_acc": 0.8675163294567468, "train_speed(iter/s)": 0.129151 }, { "epoch": 0.2823818293431553, "grad_norm": 0.8595659824667936, "learning_rate": 4.674796747967479e-07, "loss": 0.5380425930023194, "memory(GiB)": 57.67, "step": 115, "token_acc": 0.8555190611506809, "train_speed(iter/s)": 0.130605 }, { "epoch": 0.2946593001841621, "grad_norm": 0.9109047806477464, "learning_rate": 4.878048780487804e-07, "loss": 0.5206707000732422, "memory(GiB)": 57.67, "step": 120, "token_acc": 0.8441454698339289, "train_speed(iter/s)": 0.131631 }, { "epoch": 0.3069367710251688, "grad_norm": 0.8591216474047668, "learning_rate": 4.999959290672028e-07, "loss": 0.5150551795959473, "memory(GiB)": 57.67, "step": 125, "token_acc": 0.8495000393669789, "train_speed(iter/s)": 0.132508 }, { "epoch": 0.3192142418661756, "grad_norm": 0.8236784585400246, "learning_rate": 4.999501325958186e-07, "loss": 0.5071953773498535, "memory(GiB)": 57.67, "step": 130, "token_acc": 0.8506227570192104, "train_speed(iter/s)": 0.132915 }, { "epoch": 0.3314917127071823, "grad_norm": 0.954649941162109, "learning_rate": 4.998534603397122e-07, "loss": 0.49468369483947755, "memory(GiB)": 57.67, "step": 135, "token_acc": 0.850890297573384, "train_speed(iter/s)": 0.133509 }, { "epoch": 0.3437691835481891, "grad_norm": 0.7775368395525796, "learning_rate": 4.997059319759163e-07, "loss": 0.49853315353393557, "memory(GiB)": 57.67, "step": 140, "token_acc": 0.8663300877509316, "train_speed(iter/s)": 0.134979 }, { "epoch": 0.3560466543891958, "grad_norm": 0.8879804549594494, "learning_rate": 4.995075775329056e-07, "loss": 0.49777793884277344, "memory(GiB)": 57.67, "step": 145, "token_acc": 0.8515488018702513, "train_speed(iter/s)": 0.13598 }, { "epoch": 0.3683241252302026, "grad_norm": 0.8614455268040374, "learning_rate": 4.992584373844852e-07, "loss": 0.46718130111694334, "memory(GiB)": 57.67, "step": 150, "token_acc": 0.870577384246449, "train_speed(iter/s)": 0.137174 }, { "epoch": 0.38060159607120936, "grad_norm": 0.9466769688310654, "learning_rate": 4.989585622415729e-07, "loss": 0.46044120788574217, "memory(GiB)": 57.67, "step": 155, "token_acc": 0.8734130199891951, "train_speed(iter/s)": 0.137843 }, { "epoch": 0.3928790669122161, "grad_norm": 0.756694079775602, "learning_rate": 4.986080131418763e-07, "loss": 0.4397891044616699, "memory(GiB)": 57.67, "step": 160, "token_acc": 0.8813499680102367, "train_speed(iter/s)": 0.137964 }, { "epoch": 0.40515653775322286, "grad_norm": 0.7766777226622378, "learning_rate": 4.982068614374703e-07, "loss": 0.4335052490234375, "memory(GiB)": 57.67, "step": 165, "token_acc": 0.8773235563703025, "train_speed(iter/s)": 0.137421 }, { "epoch": 0.4174340085942296, "grad_norm": 0.7739408885073995, "learning_rate": 4.977551887802731e-07, "loss": 0.44889039993286134, "memory(GiB)": 57.67, "step": 170, "token_acc": 0.8657095569839498, "train_speed(iter/s)": 0.13711 }, { "epoch": 0.42971147943523635, "grad_norm": 0.9033077880962782, "learning_rate": 4.972530871054263e-07, "loss": 0.42515549659729, "memory(GiB)": 57.67, "step": 175, "token_acc": 0.8809412679891093, "train_speed(iter/s)": 0.136969 }, { "epoch": 0.4419889502762431, "grad_norm": 0.7409058426766315, "learning_rate": 4.967006586125826e-07, "loss": 0.4390419960021973, "memory(GiB)": 57.67, "step": 180, "token_acc": 0.8779247640798035, "train_speed(iter/s)": 0.1378 }, { "epoch": 0.45426642111724985, "grad_norm": 0.7709335380436733, "learning_rate": 4.960980157451032e-07, "loss": 0.4336841583251953, "memory(GiB)": 57.67, "step": 185, "token_acc": 0.8787090057261843, "train_speed(iter/s)": 0.138248 }, { "epoch": 0.4665438919582566, "grad_norm": 0.8002910130967711, "learning_rate": 4.954452811671713e-07, "loss": 0.4231499195098877, "memory(GiB)": 57.67, "step": 190, "token_acc": 0.8717664903865863, "train_speed(iter/s)": 0.138123 }, { "epoch": 0.47882136279926335, "grad_norm": 0.7671937721932319, "learning_rate": 4.947425877388237e-07, "loss": 0.4115544319152832, "memory(GiB)": 57.67, "step": 195, "token_acc": 0.8822761322245893, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.4910988336402701, "grad_norm": 0.8718441420761184, "learning_rate": 4.939900784889085e-07, "loss": 0.4212639331817627, "memory(GiB)": 57.67, "step": 200, "token_acc": 0.8792764857881137, "train_speed(iter/s)": 0.13898 }, { "epoch": 0.5033763044812769, "grad_norm": 0.8688003112334609, "learning_rate": 4.931879065859729e-07, "loss": 0.3883807182312012, "memory(GiB)": 57.67, "step": 205, "token_acc": 0.8808318569138615, "train_speed(iter/s)": 0.133423 }, { "epoch": 0.5156537753222836, "grad_norm": 0.9144969667406203, "learning_rate": 4.923362353070858e-07, "loss": 0.3918790817260742, "memory(GiB)": 57.67, "step": 210, "token_acc": 0.8826902804132406, "train_speed(iter/s)": 0.133713 }, { "epoch": 0.5279312461632903, "grad_norm": 0.8097366879590759, "learning_rate": 4.914352380046041e-07, "loss": 0.3884381055831909, "memory(GiB)": 57.67, "step": 215, "token_acc": 0.8870171589751625, "train_speed(iter/s)": 0.133996 }, { "epoch": 0.5402087170042971, "grad_norm": 0.7538862741531127, "learning_rate": 4.904850980708886e-07, "loss": 0.3775317668914795, "memory(GiB)": 57.67, "step": 220, "token_acc": 0.8937805494690713, "train_speed(iter/s)": 0.133724 }, { "epoch": 0.5524861878453039, "grad_norm": 1.146276827778673, "learning_rate": 4.894860089009741e-07, "loss": 0.3711127519607544, "memory(GiB)": 57.67, "step": 225, "token_acc": 0.8895364441547486, "train_speed(iter/s)": 0.134362 }, { "epoch": 0.5647636586863106, "grad_norm": 0.8283210772807095, "learning_rate": 4.884381738532069e-07, "loss": 0.3519309043884277, "memory(GiB)": 57.67, "step": 230, "token_acc": 0.8935041822388325, "train_speed(iter/s)": 0.134198 }, { "epoch": 0.5770411295273173, "grad_norm": 1.004769745418558, "learning_rate": 4.87341806207851e-07, "loss": 0.3587208271026611, "memory(GiB)": 57.67, "step": 235, "token_acc": 0.9019401835119423, "train_speed(iter/s)": 0.134025 }, { "epoch": 0.5893186003683242, "grad_norm": 0.8182191455973747, "learning_rate": 4.861971291236771e-07, "loss": 0.3467154026031494, "memory(GiB)": 57.67, "step": 240, "token_acc": 0.8995148555664775, "train_speed(iter/s)": 0.134306 }, { "epoch": 0.6015960712093309, "grad_norm": 1.5271995294368055, "learning_rate": 4.850043755925397e-07, "loss": 0.33415584564208983, "memory(GiB)": 57.67, "step": 245, "token_acc": 0.8970737022336368, "train_speed(iter/s)": 0.134727 }, { "epoch": 0.6138735420503376, "grad_norm": 0.875242766730362, "learning_rate": 4.837637883919528e-07, "loss": 0.3291849374771118, "memory(GiB)": 57.67, "step": 250, "token_acc": 0.9039310639510041, "train_speed(iter/s)": 0.13542 }, { "epoch": 0.6261510128913443, "grad_norm": 0.8014571513700163, "learning_rate": 4.824756200356748e-07, "loss": 0.32892580032348634, "memory(GiB)": 57.67, "step": 255, "token_acc": 0.9007355946056396, "train_speed(iter/s)": 0.135787 }, { "epoch": 0.6384284837323512, "grad_norm": 0.9358345370809253, "learning_rate": 4.811401327223103e-07, "loss": 0.3249573469161987, "memory(GiB)": 57.67, "step": 260, "token_acc": 0.9081637062967285, "train_speed(iter/s)": 0.13572 }, { "epoch": 0.6507059545733579, "grad_norm": 0.8184870146688139, "learning_rate": 4.797575982819412e-07, "loss": 0.31554522514343264, "memory(GiB)": 57.67, "step": 265, "token_acc": 0.9049698848226551, "train_speed(iter/s)": 0.135725 }, { "epoch": 0.6629834254143646, "grad_norm": 0.8945211680671034, "learning_rate": 4.783282981207979e-07, "loss": 0.3033627510070801, "memory(GiB)": 57.67, "step": 270, "token_acc": 0.9165617767672256, "train_speed(iter/s)": 0.135173 }, { "epoch": 0.6752608962553714, "grad_norm": 1.4362312397197503, "learning_rate": 4.768525231639802e-07, "loss": 0.2961107730865479, "memory(GiB)": 57.67, "step": 275, "token_acc": 0.9266585849680405, "train_speed(iter/s)": 0.135236 }, { "epoch": 0.6875383670963782, "grad_norm": 0.7600567933058701, "learning_rate": 4.753305737962418e-07, "loss": 0.2920267581939697, "memory(GiB)": 57.67, "step": 280, "token_acc": 0.9217091715507683, "train_speed(iter/s)": 0.135403 }, { "epoch": 0.6998158379373849, "grad_norm": 0.8650286740134839, "learning_rate": 4.7376275980084856e-07, "loss": 0.2840526819229126, "memory(GiB)": 57.67, "step": 285, "token_acc": 0.9261834939254294, "train_speed(iter/s)": 0.135709 }, { "epoch": 0.7120933087783916, "grad_norm": 0.9178664296409083, "learning_rate": 4.721494002965243e-07, "loss": 0.2752720355987549, "memory(GiB)": 57.67, "step": 290, "token_acc": 0.9174185126886014, "train_speed(iter/s)": 0.136139 }, { "epoch": 0.7243707796193984, "grad_norm": 0.9722910316739173, "learning_rate": 4.70490823672496e-07, "loss": 0.26680717468261717, "memory(GiB)": 57.67, "step": 295, "token_acc": 0.9065665385958784, "train_speed(iter/s)": 0.136569 }, { "epoch": 0.7366482504604052, "grad_norm": 0.8535970757124566, "learning_rate": 4.6878736752165216e-07, "loss": 0.26862516403198244, "memory(GiB)": 57.67, "step": 300, "token_acc": 0.9236588470631024, "train_speed(iter/s)": 0.136851 }, { "epoch": 0.7489257213014119, "grad_norm": 0.8086539218227536, "learning_rate": 4.670393785718281e-07, "loss": 0.26166937351226804, "memory(GiB)": 57.67, "step": 305, "token_acc": 0.9295115530856976, "train_speed(iter/s)": 0.132999 }, { "epoch": 0.7612031921424187, "grad_norm": 0.7359005917653992, "learning_rate": 4.652472126152316e-07, "loss": 0.27553093433380127, "memory(GiB)": 57.67, "step": 310, "token_acc": 0.923441422964037, "train_speed(iter/s)": 0.133358 }, { "epoch": 0.7734806629834254, "grad_norm": 0.7800276116485089, "learning_rate": 4.634112344360237e-07, "loss": 0.25064496994018554, "memory(GiB)": 57.67, "step": 315, "token_acc": 0.9140353723835795, "train_speed(iter/s)": 0.133677 }, { "epoch": 0.7857581338244322, "grad_norm": 0.7718671135037027, "learning_rate": 4.615318177360689e-07, "loss": 0.24835121631622314, "memory(GiB)": 57.67, "step": 320, "token_acc": 0.9273986758008343, "train_speed(iter/s)": 0.134016 }, { "epoch": 0.7980356046654389, "grad_norm": 0.9264394104378584, "learning_rate": 4.596093450588707e-07, "loss": 0.23845996856689453, "memory(GiB)": 57.67, "step": 325, "token_acc": 0.9188875580176673, "train_speed(iter/s)": 0.134392 }, { "epoch": 0.8103130755064457, "grad_norm": 0.7295759857048888, "learning_rate": 4.5764420771170723e-07, "loss": 0.2278268575668335, "memory(GiB)": 57.67, "step": 330, "token_acc": 0.940601686668829, "train_speed(iter/s)": 0.134767 }, { "epoch": 0.8225905463474524, "grad_norm": 0.7266088554592744, "learning_rate": 4.556368056859832e-07, "loss": 0.21920721530914306, "memory(GiB)": 57.67, "step": 335, "token_acc": 0.9349947057933746, "train_speed(iter/s)": 0.134994 }, { "epoch": 0.8348680171884592, "grad_norm": 0.7475457504296112, "learning_rate": 4.5358754757581397e-07, "loss": 0.2169396162033081, "memory(GiB)": 57.67, "step": 340, "token_acc": 0.9349587340046944, "train_speed(iter/s)": 0.135495 }, { "epoch": 0.8471454880294659, "grad_norm": 0.7207459452665155, "learning_rate": 4.5149685049485877e-07, "loss": 0.22667970657348632, "memory(GiB)": 57.67, "step": 345, "token_acc": 0.9089569551995486, "train_speed(iter/s)": 0.136023 }, { "epoch": 0.8594229588704727, "grad_norm": 0.7912001296106889, "learning_rate": 4.4936513999142e-07, "loss": 0.2154712438583374, "memory(GiB)": 57.67, "step": 350, "token_acc": 0.9488812673526049, "train_speed(iter/s)": 0.136042 }, { "epoch": 0.8717004297114794, "grad_norm": 1.7147841573520497, "learning_rate": 4.471928499618255e-07, "loss": 0.21075584888458251, "memory(GiB)": 57.67, "step": 355, "token_acc": 0.9382304479442082, "train_speed(iter/s)": 0.13587 }, { "epoch": 0.8839779005524862, "grad_norm": 0.7658381358366665, "learning_rate": 4.449804225621116e-07, "loss": 0.19444403648376465, "memory(GiB)": 57.67, "step": 360, "token_acc": 0.9425833467547109, "train_speed(iter/s)": 0.135745 }, { "epoch": 0.896255371393493, "grad_norm": 0.6912930219104488, "learning_rate": 4.427283081180249e-07, "loss": 0.1945898175239563, "memory(GiB)": 57.67, "step": 365, "token_acc": 0.9307141169986616, "train_speed(iter/s)": 0.136429 }, { "epoch": 0.9085328422344997, "grad_norm": 0.6755021225392699, "learning_rate": 4.404369650333616e-07, "loss": 0.1876620650291443, "memory(GiB)": 57.67, "step": 370, "token_acc": 0.9437858236320268, "train_speed(iter/s)": 0.136643 }, { "epoch": 0.9208103130755064, "grad_norm": 0.7098782582131411, "learning_rate": 4.3810685969666203e-07, "loss": 0.2034088134765625, "memory(GiB)": 57.67, "step": 375, "token_acc": 0.9409318390075421, "train_speed(iter/s)": 0.136522 }, { "epoch": 0.9330877839165131, "grad_norm": 0.6938412383830229, "learning_rate": 4.357384663862803e-07, "loss": 0.1925197124481201, "memory(GiB)": 57.67, "step": 380, "token_acc": 0.934276273372018, "train_speed(iter/s)": 0.136664 }, { "epoch": 0.94536525475752, "grad_norm": 0.5516763311313012, "learning_rate": 4.3333226717384784e-07, "loss": 0.18835780620574952, "memory(GiB)": 57.67, "step": 385, "token_acc": 0.952319409185322, "train_speed(iter/s)": 0.136364 }, { "epoch": 0.9576427255985267, "grad_norm": 0.5887901021890946, "learning_rate": 4.308887518261507e-07, "loss": 0.18153078556060792, "memory(GiB)": 57.67, "step": 390, "token_acc": 0.9437745469578999, "train_speed(iter/s)": 0.136159 }, { "epoch": 0.9699201964395334, "grad_norm": 0.6208318604275079, "learning_rate": 4.2840841770544073e-07, "loss": 0.16547969579696656, "memory(GiB)": 57.67, "step": 395, "token_acc": 0.9472743181040058, "train_speed(iter/s)": 0.136212 }, { "epoch": 0.9821976672805403, "grad_norm": 1.389928819023539, "learning_rate": 4.258917696682006e-07, "loss": 0.17939815521240235, "memory(GiB)": 57.67, "step": 400, "token_acc": 0.9349780954576896, "train_speed(iter/s)": 0.136576 }, { "epoch": 0.994475138121547, "grad_norm": 0.5148203046629826, "learning_rate": 4.2333931996238316e-07, "loss": 0.19017149209976197, "memory(GiB)": 57.67, "step": 405, "token_acc": 0.9227554596926395, "train_speed(iter/s)": 0.133387 }, { "epoch": 1.0049109883364027, "grad_norm": 0.4950490943119214, "learning_rate": 4.2075158812314694e-07, "loss": 0.16587586402893068, "memory(GiB)": 57.67, "step": 410, "token_acc": 0.935228905768836, "train_speed(iter/s)": 0.133808 }, { "epoch": 1.0171884591774094, "grad_norm": 0.5453509149451119, "learning_rate": 4.1812910086710786e-07, "loss": 0.17764878273010254, "memory(GiB)": 57.67, "step": 415, "token_acc": 0.941226073024707, "train_speed(iter/s)": 0.133857 }, { "epoch": 1.0294659300184161, "grad_norm": 0.6099408421859539, "learning_rate": 4.1547239198512906e-07, "loss": 0.17024999856948853, "memory(GiB)": 57.67, "step": 420, "token_acc": 0.9455940130963517, "train_speed(iter/s)": 0.133546 }, { "epoch": 1.0417434008594229, "grad_norm": 0.5544678384480342, "learning_rate": 4.1278200223367186e-07, "loss": 0.1932210922241211, "memory(GiB)": 57.67, "step": 425, "token_acc": 0.941819772528434, "train_speed(iter/s)": 0.133645 }, { "epoch": 1.0540208717004298, "grad_norm": 0.5826701192243031, "learning_rate": 4.1005847922472737e-07, "loss": 0.19101818799972534, "memory(GiB)": 57.67, "step": 430, "token_acc": 0.9367067743530575, "train_speed(iter/s)": 0.133833 }, { "epoch": 1.0662983425414365, "grad_norm": 0.5532014271248828, "learning_rate": 4.0730237731435377e-07, "loss": 0.1754150390625, "memory(GiB)": 57.67, "step": 435, "token_acc": 0.9651971029990765, "train_speed(iter/s)": 0.133965 }, { "epoch": 1.0785758133824432, "grad_norm": 0.5950560890667523, "learning_rate": 4.0451425748984127e-07, "loss": 0.17856969833374023, "memory(GiB)": 57.67, "step": 440, "token_acc": 0.9385016513123587, "train_speed(iter/s)": 0.133907 }, { "epoch": 1.09085328422345, "grad_norm": 0.5376654555480931, "learning_rate": 4.016946872555251e-07, "loss": 0.1833416700363159, "memory(GiB)": 57.67, "step": 445, "token_acc": 0.959684329199549, "train_speed(iter/s)": 0.134067 }, { "epoch": 1.1031307550644567, "grad_norm": 0.44444520282357075, "learning_rate": 3.988442405172755e-07, "loss": 0.17330591678619384, "memory(GiB)": 57.67, "step": 450, "token_acc": 0.925875966441849, "train_speed(iter/s)": 0.133897 }, { "epoch": 1.1154082259054634, "grad_norm": 0.5467733830966146, "learning_rate": 3.9596349746568097e-07, "loss": 0.187214457988739, "memory(GiB)": 57.67, "step": 455, "token_acc": 0.9102694260054666, "train_speed(iter/s)": 0.133805 }, { "epoch": 1.1276856967464703, "grad_norm": 0.5644574909855569, "learning_rate": 3.930530444579556e-07, "loss": 0.18247673511505128, "memory(GiB)": 57.67, "step": 460, "token_acc": 0.9407660594101273, "train_speed(iter/s)": 0.133717 }, { "epoch": 1.139963167587477, "grad_norm": 0.6185677880856536, "learning_rate": 3.901134738985885e-07, "loss": 0.19450093507766725, "memory(GiB)": 57.67, "step": 465, "token_acc": 0.9278699743370402, "train_speed(iter/s)": 0.133408 }, { "epoch": 1.1522406384284838, "grad_norm": 0.5271730386275215, "learning_rate": 3.871453841187645e-07, "loss": 0.1889647960662842, "memory(GiB)": 57.67, "step": 470, "token_acc": 0.9411243259215484, "train_speed(iter/s)": 0.133399 }, { "epoch": 1.1645181092694905, "grad_norm": 0.7131823190088369, "learning_rate": 3.8414937925457706e-07, "loss": 0.17877411842346191, "memory(GiB)": 57.67, "step": 475, "token_acc": 0.9540759574129986, "train_speed(iter/s)": 0.133589 }, { "epoch": 1.1767955801104972, "grad_norm": 0.5353529040202072, "learning_rate": 3.8112606912406037e-07, "loss": 0.17376744747161865, "memory(GiB)": 57.67, "step": 480, "token_acc": 0.9406874176214134, "train_speed(iter/s)": 0.13372 }, { "epoch": 1.189073050951504, "grad_norm": 0.4982526719045866, "learning_rate": 3.780760691030646e-07, "loss": 0.16717066764831542, "memory(GiB)": 57.67, "step": 485, "token_acc": 0.9538624787775891, "train_speed(iter/s)": 0.134087 }, { "epoch": 1.2013505217925107, "grad_norm": 0.46216176496672484, "learning_rate": 3.75e-07, "loss": 0.19427452087402344, "memory(GiB)": 57.67, "step": 490, "token_acc": 0.93522816539313, "train_speed(iter/s)": 0.134444 }, { "epoch": 1.2136279926335174, "grad_norm": 0.7604094532944367, "learning_rate": 3.7189848792947536e-07, "loss": 0.1537397861480713, "memory(GiB)": 57.67, "step": 495, "token_acc": 0.9573288642516437, "train_speed(iter/s)": 0.134597 }, { "epoch": 1.2259054634745243, "grad_norm": 0.549241195148398, "learning_rate": 3.687721641848562e-07, "loss": 0.1440601110458374, "memory(GiB)": 57.67, "step": 500, "token_acc": 0.953042040212377, "train_speed(iter/s)": 0.134604 }, { "epoch": 1.238182934315531, "grad_norm": 0.563515575244595, "learning_rate": 3.6562166510976887e-07, "loss": 0.1917360782623291, "memory(GiB)": 57.67, "step": 505, "token_acc": 0.9542143600416233, "train_speed(iter/s)": 0.13233 }, { "epoch": 1.2504604051565378, "grad_norm": 0.8281020221114317, "learning_rate": 3.624476319685771e-07, "loss": 0.189109206199646, "memory(GiB)": 57.67, "step": 510, "token_acc": 0.9374545982856313, "train_speed(iter/s)": 0.132765 }, { "epoch": 1.2627378759975445, "grad_norm": 0.5241233864753891, "learning_rate": 3.592507108158563e-07, "loss": 0.15444846153259278, "memory(GiB)": 57.67, "step": 515, "token_acc": 0.9661171743001964, "train_speed(iter/s)": 0.13301 }, { "epoch": 1.2750153468385512, "grad_norm": 0.5573427156559804, "learning_rate": 3.560315523648932e-07, "loss": 0.18322609663009642, "memory(GiB)": 57.67, "step": 520, "token_acc": 0.9517766497461929, "train_speed(iter/s)": 0.133378 }, { "epoch": 1.287292817679558, "grad_norm": 0.537977575567055, "learning_rate": 3.5279081185523763e-07, "loss": 0.18208487033843995, "memory(GiB)": 57.67, "step": 525, "token_acc": 0.9459727287141272, "train_speed(iter/s)": 0.133578 }, { "epoch": 1.2995702885205649, "grad_norm": 0.4953187670124203, "learning_rate": 3.4952914891933225e-07, "loss": 0.17269195318222047, "memory(GiB)": 57.67, "step": 530, "token_acc": 0.942077971960822, "train_speed(iter/s)": 0.133685 }, { "epoch": 1.3118477593615716, "grad_norm": 0.4701054790036289, "learning_rate": 3.4624722744824874e-07, "loss": 0.1993415355682373, "memory(GiB)": 57.67, "step": 535, "token_acc": 0.942649839836363, "train_speed(iter/s)": 0.133424 }, { "epoch": 1.3241252302025783, "grad_norm": 0.5249799204021143, "learning_rate": 3.429457154565565e-07, "loss": 0.18299152851104736, "memory(GiB)": 57.67, "step": 540, "token_acc": 0.9312291707508332, "train_speed(iter/s)": 0.133551 }, { "epoch": 1.336402701043585, "grad_norm": 0.5358390030505376, "learning_rate": 3.396252849463529e-07, "loss": 0.1694674849510193, "memory(GiB)": 57.67, "step": 545, "token_acc": 0.9403699099709948, "train_speed(iter/s)": 0.13334 }, { "epoch": 1.3486801718845918, "grad_norm": 0.47175661832523136, "learning_rate": 3.362866117704815e-07, "loss": 0.16650619506835937, "memory(GiB)": 57.67, "step": 550, "token_acc": 0.966151256036507, "train_speed(iter/s)": 0.132885 }, { "epoch": 1.3609576427255985, "grad_norm": 0.5566911020042059, "learning_rate": 3.3293037549496597e-07, "loss": 0.18229317665100098, "memory(GiB)": 57.67, "step": 555, "token_acc": 0.9332179930795848, "train_speed(iter/s)": 0.132956 }, { "epoch": 1.3732351135666052, "grad_norm": 0.8552156642127553, "learning_rate": 3.295572592606891e-07, "loss": 0.17464141845703124, "memory(GiB)": 57.67, "step": 560, "token_acc": 0.9428267315441344, "train_speed(iter/s)": 0.133102 }, { "epoch": 1.385512584407612, "grad_norm": 0.4054170592537424, "learning_rate": 3.2616794964434356e-07, "loss": 0.169390869140625, "memory(GiB)": 57.67, "step": 565, "token_acc": 0.9302136041022855, "train_speed(iter/s)": 0.133051 }, { "epoch": 1.3977900552486187, "grad_norm": 0.4737608855605009, "learning_rate": 3.227631365186836e-07, "loss": 0.16181081533432007, "memory(GiB)": 57.67, "step": 570, "token_acc": 0.9568342208944884, "train_speed(iter/s)": 0.132964 }, { "epoch": 1.4100675260896256, "grad_norm": 0.4918393822553509, "learning_rate": 3.193435129121058e-07, "loss": 0.1819918632507324, "memory(GiB)": 57.67, "step": 575, "token_acc": 0.9572955270188744, "train_speed(iter/s)": 0.133093 }, { "epoch": 1.4223449969306323, "grad_norm": 0.616287863577658, "learning_rate": 3.159097748675873e-07, "loss": 0.1604529619216919, "memory(GiB)": 57.67, "step": 580, "token_acc": 0.9625401355690332, "train_speed(iter/s)": 0.133223 }, { "epoch": 1.434622467771639, "grad_norm": 0.5677509895303791, "learning_rate": 3.124626213010108e-07, "loss": 0.1569218635559082, "memory(GiB)": 57.67, "step": 585, "token_acc": 0.9455586360854067, "train_speed(iter/s)": 0.133671 }, { "epoch": 1.4468999386126458, "grad_norm": 0.5032051690478044, "learning_rate": 3.090027538589044e-07, "loss": 0.169755220413208, "memory(GiB)": 57.67, "step": 590, "token_acc": 0.9418753193663771, "train_speed(iter/s)": 0.134002 }, { "epoch": 1.4591774094536525, "grad_norm": 0.5432224909432672, "learning_rate": 3.055308767756261e-07, "loss": 0.18236881494522095, "memory(GiB)": 57.67, "step": 595, "token_acc": 0.9640907181856363, "train_speed(iter/s)": 0.133845 }, { "epoch": 1.4714548802946594, "grad_norm": 0.5191917254492633, "learning_rate": 3.0204769673002116e-07, "loss": 0.16606335639953612, "memory(GiB)": 57.67, "step": 600, "token_acc": 0.9707240443661637, "train_speed(iter/s)": 0.134004 }, { "epoch": 1.4837323511356661, "grad_norm": 0.5703809476287016, "learning_rate": 2.9855392270158206e-07, "loss": 0.17542767524719238, "memory(GiB)": 57.67, "step": 605, "token_acc": 0.953876582278481, "train_speed(iter/s)": 0.132384 }, { "epoch": 1.4960098219766729, "grad_norm": 0.42427507061236897, "learning_rate": 2.9505026582614024e-07, "loss": 0.19279547929763793, "memory(GiB)": 57.67, "step": 610, "token_acc": 0.9392731620710896, "train_speed(iter/s)": 0.132484 }, { "epoch": 1.5082872928176796, "grad_norm": 0.5816813455731817, "learning_rate": 2.915374392511184e-07, "loss": 0.18373801708221435, "memory(GiB)": 57.67, "step": 615, "token_acc": 0.9288226144586461, "train_speed(iter/s)": 0.132603 }, { "epoch": 1.5205647636586863, "grad_norm": 0.4407507522526475, "learning_rate": 2.8801615799037484e-07, "loss": 0.16642086505889891, "memory(GiB)": 57.67, "step": 620, "token_acc": 0.9481449252432561, "train_speed(iter/s)": 0.132928 }, { "epoch": 1.532842234499693, "grad_norm": 0.482240995820186, "learning_rate": 2.844871387786655e-07, "loss": 0.16589756011962892, "memory(GiB)": 57.67, "step": 625, "token_acc": 0.9455185772142983, "train_speed(iter/s)": 0.132983 }, { "epoch": 1.5451197053406998, "grad_norm": 0.5533567457008911, "learning_rate": 2.809510999257582e-07, "loss": 0.19375090599060057, "memory(GiB)": 57.67, "step": 630, "token_acc": 0.9351315128162171, "train_speed(iter/s)": 0.132918 }, { "epoch": 1.5573971761817065, "grad_norm": 0.42397946477312154, "learning_rate": 2.7740876117022493e-07, "loss": 0.16124327182769777, "memory(GiB)": 57.67, "step": 635, "token_acc": 0.9352771876927294, "train_speed(iter/s)": 0.132949 }, { "epoch": 1.5696746470227132, "grad_norm": 0.46916218507444407, "learning_rate": 2.7386084353294305e-07, "loss": 0.1779846429824829, "memory(GiB)": 57.67, "step": 640, "token_acc": 0.9482183060321404, "train_speed(iter/s)": 0.132929 }, { "epoch": 1.58195211786372, "grad_norm": 0.4494446665243088, "learning_rate": 2.703080691703365e-07, "loss": 0.16666009426116943, "memory(GiB)": 57.67, "step": 645, "token_acc": 0.958400417736391, "train_speed(iter/s)": 0.133055 }, { "epoch": 1.5942295887047269, "grad_norm": 0.5003421798456534, "learning_rate": 2.667511612273853e-07, "loss": 0.17405877113342286, "memory(GiB)": 57.67, "step": 650, "token_acc": 0.9476805681474766, "train_speed(iter/s)": 0.133315 }, { "epoch": 1.6065070595457336, "grad_norm": 0.5798622931057178, "learning_rate": 2.6319084369043403e-07, "loss": 0.14733604192733765, "memory(GiB)": 57.67, "step": 655, "token_acc": 0.985560657322378, "train_speed(iter/s)": 0.133499 }, { "epoch": 1.6187845303867403, "grad_norm": 0.5265921861617535, "learning_rate": 2.596278412398284e-07, "loss": 0.16961886882781982, "memory(GiB)": 57.67, "step": 660, "token_acc": 0.9309208573294544, "train_speed(iter/s)": 0.133748 }, { "epoch": 1.6310620012277472, "grad_norm": 0.5302133218327372, "learning_rate": 2.560628791024118e-07, "loss": 0.17056363821029663, "memory(GiB)": 57.67, "step": 665, "token_acc": 0.9468977792846245, "train_speed(iter/s)": 0.133937 }, { "epoch": 1.643339472068754, "grad_norm": 0.5423907736532357, "learning_rate": 2.5249668290390936e-07, "loss": 0.16655545234680175, "memory(GiB)": 57.67, "step": 670, "token_acc": 0.9424981219695417, "train_speed(iter/s)": 0.134082 }, { "epoch": 1.6556169429097607, "grad_norm": 0.515033840244812, "learning_rate": 2.489299785212319e-07, "loss": 0.17368289232254028, "memory(GiB)": 57.67, "step": 675, "token_acc": 0.9435146443514645, "train_speed(iter/s)": 0.134347 }, { "epoch": 1.6678944137507674, "grad_norm": 0.48163213229590374, "learning_rate": 2.4536349193472773e-07, "loss": 0.16638292074203492, "memory(GiB)": 57.67, "step": 680, "token_acc": 0.9301941049604601, "train_speed(iter/s)": 0.134535 }, { "epoch": 1.6801718845917741, "grad_norm": 0.4779642715711826, "learning_rate": 2.417979490804143e-07, "loss": 0.14599368572235108, "memory(GiB)": 57.67, "step": 685, "token_acc": 0.9522398399014779, "train_speed(iter/s)": 0.134598 }, { "epoch": 1.6924493554327809, "grad_norm": 0.498486709572586, "learning_rate": 2.382340757022181e-07, "loss": 0.1666867971420288, "memory(GiB)": 57.67, "step": 690, "token_acc": 0.9422738067877117, "train_speed(iter/s)": 0.134551 }, { "epoch": 1.7047268262737876, "grad_norm": 0.5039713682487171, "learning_rate": 2.3467259720425429e-07, "loss": 0.17596899271011351, "memory(GiB)": 57.67, "step": 695, "token_acc": 0.9558689717925387, "train_speed(iter/s)": 0.134726 }, { "epoch": 1.7170042971147943, "grad_norm": 0.4391373433161239, "learning_rate": 2.3111423850317508e-07, "loss": 0.17754709720611572, "memory(GiB)": 57.67, "step": 700, "token_acc": 0.957532017854908, "train_speed(iter/s)": 0.134883 }, { "epoch": 1.729281767955801, "grad_norm": 0.49039848892922233, "learning_rate": 2.2755972388061755e-07, "loss": 0.16874098777770996, "memory(GiB)": 57.67, "step": 705, "token_acc": 0.9599765892310462, "train_speed(iter/s)": 0.133296 }, { "epoch": 1.7415592387968077, "grad_norm": 0.4774575002594326, "learning_rate": 2.2400977683578092e-07, "loss": 0.16588878631591797, "memory(GiB)": 57.67, "step": 710, "token_acc": 0.9488679320361306, "train_speed(iter/s)": 0.133523 }, { "epoch": 1.7538367096378145, "grad_norm": 0.5147564388228866, "learning_rate": 2.204651199381623e-07, "loss": 0.15819010734558106, "memory(GiB)": 57.67, "step": 715, "token_acc": 0.9515534491837809, "train_speed(iter/s)": 0.133748 }, { "epoch": 1.7661141804788214, "grad_norm": 0.5103960664604678, "learning_rate": 2.1692647468048233e-07, "loss": 0.17287697792053222, "memory(GiB)": 57.67, "step": 720, "token_acc": 0.9395699944668405, "train_speed(iter/s)": 0.133786 }, { "epoch": 1.7783916513198281, "grad_norm": 0.4774493976896547, "learning_rate": 2.1339456133183043e-07, "loss": 0.1602993369102478, "memory(GiB)": 57.67, "step": 725, "token_acc": 0.9661698051492921, "train_speed(iter/s)": 0.133866 }, { "epoch": 1.7906691221608348, "grad_norm": 0.5075630225471248, "learning_rate": 2.0987009879105762e-07, "loss": 0.18199481964111328, "memory(GiB)": 57.67, "step": 730, "token_acc": 0.9596290705197326, "train_speed(iter/s)": 0.133996 }, { "epoch": 1.8029465930018416, "grad_norm": 0.4811594763625998, "learning_rate": 2.0635380444044999e-07, "loss": 0.17754099369049073, "memory(GiB)": 57.67, "step": 735, "token_acc": 0.9363772728935719, "train_speed(iter/s)": 0.134141 }, { "epoch": 1.8152240638428485, "grad_norm": 0.5223831838925675, "learning_rate": 2.028463939997093e-07, "loss": 0.16622164249420165, "memory(GiB)": 57.67, "step": 740, "token_acc": 0.9566500118962645, "train_speed(iter/s)": 0.134232 }, { "epoch": 1.8275015346838552, "grad_norm": 0.4729423140315417, "learning_rate": 1.9934858138027323e-07, "loss": 0.1844787120819092, "memory(GiB)": 57.67, "step": 745, "token_acc": 0.9339959225280327, "train_speed(iter/s)": 0.134303 }, { "epoch": 1.839779005524862, "grad_norm": 0.4875394529058418, "learning_rate": 1.9586107854000325e-07, "loss": 0.1973895788192749, "memory(GiB)": 57.67, "step": 750, "token_acc": 0.9558183961305532, "train_speed(iter/s)": 0.13412 }, { "epoch": 1.8520564763658687, "grad_norm": 0.40978646980156164, "learning_rate": 1.9238459533826938e-07, "loss": 0.19609053134918214, "memory(GiB)": 57.67, "step": 755, "token_acc": 0.934996003197442, "train_speed(iter/s)": 0.134194 }, { "epoch": 1.8643339472068754, "grad_norm": 0.5408849293574589, "learning_rate": 1.8891983939146369e-07, "loss": 0.1738824725151062, "memory(GiB)": 57.67, "step": 760, "token_acc": 0.9426776599921476, "train_speed(iter/s)": 0.134248 }, { "epoch": 1.8766114180478821, "grad_norm": 0.471076880754224, "learning_rate": 1.8546751592896853e-07, "loss": 0.18387995958328246, "memory(GiB)": 57.67, "step": 765, "token_acc": 0.9433991482771971, "train_speed(iter/s)": 0.13421 }, { "epoch": 1.8888888888888888, "grad_norm": 0.504836493093865, "learning_rate": 1.8202832764961198e-07, "loss": 0.18205785751342773, "memory(GiB)": 57.67, "step": 770, "token_acc": 0.9527401477832512, "train_speed(iter/s)": 0.13428 }, { "epoch": 1.9011663597298956, "grad_norm": 0.4319777645328756, "learning_rate": 1.7860297457863802e-07, "loss": 0.16953612565994264, "memory(GiB)": 57.67, "step": 775, "token_acc": 0.947322033898305, "train_speed(iter/s)": 0.134434 }, { "epoch": 1.9134438305709023, "grad_norm": 0.4590442017829868, "learning_rate": 1.7519215392522025e-07, "loss": 0.15192935466766358, "memory(GiB)": 57.67, "step": 780, "token_acc": 0.9487110114791482, "train_speed(iter/s)": 0.134411 }, { "epoch": 1.925721301411909, "grad_norm": 0.46331279083008303, "learning_rate": 1.717965599405501e-07, "loss": 0.17549625635147095, "memory(GiB)": 57.67, "step": 785, "token_acc": 0.9622470689421031, "train_speed(iter/s)": 0.134696 }, { "epoch": 1.937998772252916, "grad_norm": 0.4358941046923003, "learning_rate": 1.6841688377652552e-07, "loss": 0.1650502562522888, "memory(GiB)": 57.67, "step": 790, "token_acc": 0.9644233133863431, "train_speed(iter/s)": 0.134849 }, { "epoch": 1.9502762430939227, "grad_norm": 0.36413602356289787, "learning_rate": 1.6505381334507175e-07, "loss": 0.16262125968933105, "memory(GiB)": 57.67, "step": 795, "token_acc": 0.9436930827359039, "train_speed(iter/s)": 0.135052 }, { "epoch": 1.9625537139349294, "grad_norm": 0.5119930810029533, "learning_rate": 1.6170803317812136e-07, "loss": 0.17920398712158203, "memory(GiB)": 57.67, "step": 800, "token_acc": 0.9322690781581618, "train_speed(iter/s)": 0.135111 }, { "epoch": 1.974831184775936, "grad_norm": 0.4825725506640307, "learning_rate": 1.583802242882816e-07, "loss": 0.1905304193496704, "memory(GiB)": 57.67, "step": 805, "token_acc": 0.9357515085024685, "train_speed(iter/s)": 0.13373 }, { "epoch": 1.987108655616943, "grad_norm": 0.4976072969687368, "learning_rate": 1.5507106403021895e-07, "loss": 0.1734859824180603, "memory(GiB)": 57.67, "step": 810, "token_acc": 0.9405059337913804, "train_speed(iter/s)": 0.133765 }, { "epoch": 1.9993861264579498, "grad_norm": 0.5349651167180693, "learning_rate": 1.517812259627874e-07, "loss": 0.17344932556152343, "memory(GiB)": 57.67, "step": 815, "token_acc": 0.9323653962492437, "train_speed(iter/s)": 0.133696 }, { "epoch": 2.0098219766728054, "grad_norm": 0.6057966476171509, "learning_rate": 1.4851137971193018e-07, "loss": 0.16537351608276368, "memory(GiB)": 57.67, "step": 820, "token_acc": 0.9556328651806039, "train_speed(iter/s)": 0.133772 }, { "epoch": 2.022099447513812, "grad_norm": 1.0192898362372844, "learning_rate": 1.4526219083438153e-07, "loss": 0.17631728649139405, "memory(GiB)": 57.67, "step": 825, "token_acc": 0.9437896645512239, "train_speed(iter/s)": 0.133754 }, { "epoch": 2.034376918354819, "grad_norm": 0.43605668597687935, "learning_rate": 1.4203432068219616e-07, "loss": 0.1445701837539673, "memory(GiB)": 57.67, "step": 830, "token_acc": 0.9750953344946095, "train_speed(iter/s)": 0.133755 }, { "epoch": 2.0466543891958255, "grad_norm": 0.4517333876137591, "learning_rate": 1.3882842626813645e-07, "loss": 0.15836387872695923, "memory(GiB)": 57.67, "step": 835, "token_acc": 0.96793536040825, "train_speed(iter/s)": 0.133783 }, { "epoch": 2.0589318600368323, "grad_norm": 0.5383308109264513, "learning_rate": 1.3564516013194022e-07, "loss": 0.18179185390472413, "memory(GiB)": 57.67, "step": 840, "token_acc": 0.9378444703705193, "train_speed(iter/s)": 0.133821 }, { "epoch": 2.071209330877839, "grad_norm": 0.4522142212035384, "learning_rate": 1.3248517020750123e-07, "loss": 0.16212983131408693, "memory(GiB)": 57.67, "step": 845, "token_acc": 0.9407257155735858, "train_speed(iter/s)": 0.13394 }, { "epoch": 2.0834868017188457, "grad_norm": 0.45710576973364, "learning_rate": 1.2934909969098612e-07, "loss": 0.1782787561416626, "memory(GiB)": 57.67, "step": 850, "token_acc": 0.9515674953476165, "train_speed(iter/s)": 0.133905 }, { "epoch": 2.095764272559853, "grad_norm": 0.492740078600037, "learning_rate": 1.2623758690991567e-07, "loss": 0.1520832061767578, "memory(GiB)": 57.67, "step": 855, "token_acc": 0.9375085324232082, "train_speed(iter/s)": 0.13385 }, { "epoch": 2.1080417434008596, "grad_norm": 0.4433975430592583, "learning_rate": 1.2315126519323751e-07, "loss": 0.18507776260375977, "memory(GiB)": 57.67, "step": 860, "token_acc": 0.9329054289056818, "train_speed(iter/s)": 0.133924 }, { "epoch": 2.1203192142418663, "grad_norm": 0.5301183452451113, "learning_rate": 1.2009076274241567e-07, "loss": 0.15922095775604247, "memory(GiB)": 57.67, "step": 865, "token_acc": 0.948112669631657, "train_speed(iter/s)": 0.134001 }, { "epoch": 2.132596685082873, "grad_norm": 0.5377164487351925, "learning_rate": 1.1705670250356414e-07, "loss": 0.17927762269973754, "memory(GiB)": 57.67, "step": 870, "token_acc": 0.9462852794687328, "train_speed(iter/s)": 0.13419 }, { "epoch": 2.1448741559238798, "grad_norm": 0.5286410675937543, "learning_rate": 1.1404970204065056e-07, "loss": 0.17566382884979248, "memory(GiB)": 57.67, "step": 875, "token_acc": 0.9370597142669037, "train_speed(iter/s)": 0.134182 }, { "epoch": 2.1571516267648865, "grad_norm": 0.40268668050343476, "learning_rate": 1.110703734097942e-07, "loss": 0.15375242233276368, "memory(GiB)": 57.67, "step": 880, "token_acc": 0.9555922520753369, "train_speed(iter/s)": 0.134177 }, { "epoch": 2.169429097605893, "grad_norm": 0.5959804911771396, "learning_rate": 1.0811932303468649e-07, "loss": 0.18795297145843506, "memory(GiB)": 57.67, "step": 885, "token_acc": 0.93544177741149, "train_speed(iter/s)": 0.134122 }, { "epoch": 2.1817065684469, "grad_norm": 0.4413780946022899, "learning_rate": 1.0519715158315667e-07, "loss": 0.16619727611541749, "memory(GiB)": 57.67, "step": 890, "token_acc": 0.9495682081573142, "train_speed(iter/s)": 0.134035 }, { "epoch": 2.1939840392879066, "grad_norm": 0.41173432463685106, "learning_rate": 1.0230445384491002e-07, "loss": 0.15455365180969238, "memory(GiB)": 57.67, "step": 895, "token_acc": 0.9446943730362753, "train_speed(iter/s)": 0.134118 }, { "epoch": 2.2062615101289134, "grad_norm": 0.48208299751179823, "learning_rate": 9.944181861046186e-08, "loss": 0.18413586616516114, "memory(GiB)": 57.67, "step": 900, "token_acc": 0.9379831280223078, "train_speed(iter/s)": 0.134153 }, { "epoch": 2.21853898096992, "grad_norm": 0.3966707114512702, "learning_rate": 9.660982855129313e-08, "loss": 0.15873076915740966, "memory(GiB)": 57.67, "step": 905, "token_acc": 0.9590186155792517, "train_speed(iter/s)": 0.132978 }, { "epoch": 2.230816451810927, "grad_norm": 0.4381866262273687, "learning_rate": 9.380906010125136e-08, "loss": 0.15625982284545897, "memory(GiB)": 57.67, "step": 910, "token_acc": 0.9386312965272267, "train_speed(iter/s)": 0.133045 }, { "epoch": 2.2430939226519335, "grad_norm": 0.5223822620159875, "learning_rate": 9.104008333922076e-08, "loss": 0.16865816116333007, "memory(GiB)": 57.67, "step": 915, "token_acc": 0.9422254974207811, "train_speed(iter/s)": 0.133004 }, { "epoch": 2.2553713934929407, "grad_norm": 0.38440187816263854, "learning_rate": 8.830346187308649e-08, "loss": 0.1423816680908203, "memory(GiB)": 57.67, "step": 920, "token_acc": 0.9562720848056537, "train_speed(iter/s)": 0.133076 }, { "epoch": 2.267648864333947, "grad_norm": 0.4592624600791522, "learning_rate": 8.559975272501601e-08, "loss": 0.16395586729049683, "memory(GiB)": 57.67, "step": 925, "token_acc": 0.9564571607254534, "train_speed(iter/s)": 0.133105 }, { "epoch": 2.279926335174954, "grad_norm": 0.43806629453653373, "learning_rate": 8.29295062180802e-08, "loss": 0.16589367389678955, "memory(GiB)": 57.67, "step": 930, "token_acc": 0.9659481977902554, "train_speed(iter/s)": 0.133066 }, { "epoch": 2.292203806015961, "grad_norm": 0.464370245760019, "learning_rate": 8.029326586423907e-08, "loss": 0.1606292724609375, "memory(GiB)": 57.67, "step": 935, "token_acc": 0.9405248868778281, "train_speed(iter/s)": 0.133054 }, { "epoch": 2.3044812768569676, "grad_norm": 0.492043728887271, "learning_rate": 7.769156825371286e-08, "loss": 0.16174919605255128, "memory(GiB)": 57.67, "step": 940, "token_acc": 0.9457228709444296, "train_speed(iter/s)": 0.133172 }, { "epoch": 2.3167587476979743, "grad_norm": 0.4103343451876262, "learning_rate": 7.512494294576269e-08, "loss": 0.1728949785232544, "memory(GiB)": 57.67, "step": 945, "token_acc": 0.9441858719315367, "train_speed(iter/s)": 0.133227 }, { "epoch": 2.329036218538981, "grad_norm": 0.4809983010327667, "learning_rate": 7.25939123609022e-08, "loss": 0.15887634754180907, "memory(GiB)": 57.67, "step": 950, "token_acc": 0.9603744280737103, "train_speed(iter/s)": 0.133368 }, { "epoch": 2.3413136893799877, "grad_norm": 0.37621027938026347, "learning_rate": 7.009899167456185e-08, "loss": 0.14414477348327637, "memory(GiB)": 57.67, "step": 955, "token_acc": 0.9454514068703608, "train_speed(iter/s)": 0.133409 }, { "epoch": 2.3535911602209945, "grad_norm": 0.47025720298842205, "learning_rate": 6.764068871222825e-08, "loss": 0.18474191427230835, "memory(GiB)": 57.67, "step": 960, "token_acc": 0.933993399339934, "train_speed(iter/s)": 0.133486 }, { "epoch": 2.365868631062001, "grad_norm": 0.48612512291843357, "learning_rate": 6.521950384607974e-08, "loss": 0.18921175003051757, "memory(GiB)": 57.67, "step": 965, "token_acc": 0.9431688588154794, "train_speed(iter/s)": 0.133588 }, { "epoch": 2.378146101903008, "grad_norm": 0.3806957752805889, "learning_rate": 6.283592989313841e-08, "loss": 0.1681033730506897, "memory(GiB)": 57.67, "step": 970, "token_acc": 0.9367773677736777, "train_speed(iter/s)": 0.13364 }, { "epoch": 2.3904235727440146, "grad_norm": 0.45577439700594125, "learning_rate": 6.049045201496042e-08, "loss": 0.15442556142807007, "memory(GiB)": 57.67, "step": 975, "token_acc": 0.948759111419646, "train_speed(iter/s)": 0.13374 }, { "epoch": 2.4027010435850213, "grad_norm": 0.4860898688565903, "learning_rate": 5.818354761888444e-08, "loss": 0.15901718139648438, "memory(GiB)": 57.67, "step": 980, "token_acc": 0.9584408255401483, "train_speed(iter/s)": 0.13388 }, { "epoch": 2.414978514426028, "grad_norm": 0.514344692664739, "learning_rate": 5.5915686260858244e-08, "loss": 0.17236262559890747, "memory(GiB)": 57.67, "step": 985, "token_acc": 0.9471186187308468, "train_speed(iter/s)": 0.133984 }, { "epoch": 2.427255985267035, "grad_norm": 0.5543092645451274, "learning_rate": 5.368732954986388e-08, "loss": 0.1594996929168701, "memory(GiB)": 57.67, "step": 990, "token_acc": 0.9353687315634218, "train_speed(iter/s)": 0.134028 }, { "epoch": 2.439533456108042, "grad_norm": 0.47905205598317496, "learning_rate": 5.14989310539595e-08, "loss": 0.17013013362884521, "memory(GiB)": 57.67, "step": 995, "token_acc": 0.9312570646677375, "train_speed(iter/s)": 0.134104 }, { "epoch": 2.4518109269490487, "grad_norm": 0.45866756604400005, "learning_rate": 4.935093620795902e-08, "loss": 0.16783492565155028, "memory(GiB)": 57.67, "step": 1000, "token_acc": 0.9406372313396965, "train_speed(iter/s)": 0.134234 }, { "epoch": 2.4640883977900554, "grad_norm": 0.45591647725979945, "learning_rate": 4.7243782222766124e-08, "loss": 0.15385560989379882, "memory(GiB)": 57.67, "step": 1005, "token_acc": 0.9611999210578251, "train_speed(iter/s)": 0.132972 }, { "epoch": 2.476365868631062, "grad_norm": 0.4717972783404294, "learning_rate": 4.517789799638297e-08, "loss": 0.1716939926147461, "memory(GiB)": 57.67, "step": 1010, "token_acc": 0.9752226720647773, "train_speed(iter/s)": 0.133203 }, { "epoch": 2.488643339472069, "grad_norm": 0.4386963670049322, "learning_rate": 4.315370402661092e-08, "loss": 0.16546686887741088, "memory(GiB)": 57.67, "step": 1015, "token_acc": 0.9399607392866403, "train_speed(iter/s)": 0.133178 }, { "epoch": 2.5009208103130756, "grad_norm": 0.5231430157385392, "learning_rate": 4.1171612325460236e-08, "loss": 0.18315892219543456, "memory(GiB)": 57.67, "step": 1020, "token_acc": 0.9306804077180577, "train_speed(iter/s)": 0.133198 }, { "epoch": 2.5131982811540823, "grad_norm": 0.5125599782547446, "learning_rate": 3.9232026335288296e-08, "loss": 0.168873929977417, "memory(GiB)": 57.67, "step": 1025, "token_acc": 0.9482202118470701, "train_speed(iter/s)": 0.133093 }, { "epoch": 2.525475751995089, "grad_norm": 0.4146726330452646, "learning_rate": 3.733534084668091e-08, "loss": 0.1465557336807251, "memory(GiB)": 57.67, "step": 1030, "token_acc": 0.9542228126779275, "train_speed(iter/s)": 0.133256 }, { "epoch": 2.5377532228360957, "grad_norm": 0.4928965284851222, "learning_rate": 3.5481941918095396e-08, "loss": 0.1535036325454712, "memory(GiB)": 57.67, "step": 1035, "token_acc": 0.98864726574992, "train_speed(iter/s)": 0.133212 }, { "epoch": 2.5500306936771024, "grad_norm": 0.46167944188273663, "learning_rate": 3.367220679728089e-08, "loss": 0.17861878871917725, "memory(GiB)": 57.67, "step": 1040, "token_acc": 0.9435955137744546, "train_speed(iter/s)": 0.133172 }, { "epoch": 2.562308164518109, "grad_norm": 0.5398332391068397, "learning_rate": 3.190650384449167e-08, "loss": 0.1772806763648987, "memory(GiB)": 57.67, "step": 1045, "token_acc": 0.9553827261563651, "train_speed(iter/s)": 0.133219 }, { "epoch": 2.574585635359116, "grad_norm": 0.5539788244071175, "learning_rate": 3.018519245750989e-08, "loss": 0.14356986284255982, "memory(GiB)": 57.67, "step": 1050, "token_acc": 0.968132854578097, "train_speed(iter/s)": 0.133317 }, { "epoch": 2.5868631062001226, "grad_norm": 0.4402330635590166, "learning_rate": 2.850862299849241e-08, "loss": 0.16220954656600953, "memory(GiB)": 57.67, "step": 1055, "token_acc": 0.9482638506948264, "train_speed(iter/s)": 0.133252 }, { "epoch": 2.5991405770411298, "grad_norm": 0.48620425405940176, "learning_rate": 2.6877136722656734e-08, "loss": 0.18278908729553223, "memory(GiB)": 57.67, "step": 1060, "token_acc": 0.9325091336116911, "train_speed(iter/s)": 0.133237 }, { "epoch": 2.611418047882136, "grad_norm": 0.43457144570186146, "learning_rate": 2.5291065708820754e-08, "loss": 0.17628798484802247, "memory(GiB)": 57.67, "step": 1065, "token_acc": 0.9281233833419555, "train_speed(iter/s)": 0.13325 }, { "epoch": 2.623695518723143, "grad_norm": 0.4143344415028038, "learning_rate": 2.375073279180992e-08, "loss": 0.15394517183303832, "memory(GiB)": 57.67, "step": 1070, "token_acc": 0.9541889329425961, "train_speed(iter/s)": 0.133392 }, { "epoch": 2.63597298956415, "grad_norm": 0.4688198080933725, "learning_rate": 2.2256451496746653e-08, "loss": 0.1712632417678833, "memory(GiB)": 57.67, "step": 1075, "token_acc": 0.9350767303476355, "train_speed(iter/s)": 0.133374 }, { "epoch": 2.6482504604051567, "grad_norm": 0.5198918873737135, "learning_rate": 2.0808525975233805e-08, "loss": 0.19560701847076417, "memory(GiB)": 57.67, "step": 1080, "token_acc": 0.9272029474976973, "train_speed(iter/s)": 0.133426 }, { "epoch": 2.6605279312461634, "grad_norm": 0.4846401676955354, "learning_rate": 1.940725094344675e-08, "loss": 0.17480210065841675, "memory(GiB)": 57.67, "step": 1085, "token_acc": 0.9508643542545645, "train_speed(iter/s)": 0.133534 }, { "epoch": 2.67280540208717, "grad_norm": 0.4514013424363714, "learning_rate": 1.8052911622145866e-08, "loss": 0.1648250102996826, "memory(GiB)": 57.67, "step": 1090, "token_acc": 0.9454948354122735, "train_speed(iter/s)": 0.133539 }, { "epoch": 2.685082872928177, "grad_norm": 0.4989521104653192, "learning_rate": 1.6745783678621367e-08, "loss": 0.16439478397369384, "memory(GiB)": 57.67, "step": 1095, "token_acc": 0.9524301583201092, "train_speed(iter/s)": 0.133571 }, { "epoch": 2.6973603437691835, "grad_norm": 0.49299845547327686, "learning_rate": 1.5486133170583145e-08, "loss": 0.16308257579803467, "memory(GiB)": 57.67, "step": 1100, "token_acc": 0.9602804837874723, "train_speed(iter/s)": 0.133601 }, { "epoch": 2.7096378146101903, "grad_norm": 0.38800040402764896, "learning_rate": 1.4274216492006302e-08, "loss": 0.20491249561309816, "memory(GiB)": 57.67, "step": 1105, "token_acc": 0.9337728751954694, "train_speed(iter/s)": 0.132264 }, { "epoch": 2.721915285451197, "grad_norm": 0.46254709159908636, "learning_rate": 1.311028032094369e-08, "loss": 0.1842280149459839, "memory(GiB)": 57.67, "step": 1110, "token_acc": 0.9332923076923076, "train_speed(iter/s)": 0.132279 }, { "epoch": 2.7341927562922037, "grad_norm": 0.4700426518953757, "learning_rate": 1.1994561569316442e-08, "loss": 0.15502922534942626, "memory(GiB)": 57.67, "step": 1115, "token_acc": 0.9172778194837443, "train_speed(iter/s)": 0.132416 }, { "epoch": 2.7464702271332104, "grad_norm": 0.4570084263374054, "learning_rate": 1.0927287334691616e-08, "loss": 0.17944493293762206, "memory(GiB)": 57.67, "step": 1120, "token_acc": 0.9620748077674964, "train_speed(iter/s)": 0.132397 }, { "epoch": 2.758747697974217, "grad_norm": 0.5565607166042048, "learning_rate": 9.908674854058219e-09, "loss": 0.16596771478652955, "memory(GiB)": 57.67, "step": 1125, "token_acc": 0.946463347418703, "train_speed(iter/s)": 0.132429 }, { "epoch": 2.771025168815224, "grad_norm": 0.43235518649936683, "learning_rate": 8.938931459609806e-09, "loss": 0.16782586574554442, "memory(GiB)": 57.67, "step": 1130, "token_acc": 0.9357140571501712, "train_speed(iter/s)": 0.132529 }, { "epoch": 2.783302639656231, "grad_norm": 0.5041786160999633, "learning_rate": 8.018254536543451e-09, "loss": 0.18278899192810058, "memory(GiB)": 57.67, "step": 1135, "token_acc": 0.9538431826960216, "train_speed(iter/s)": 0.132595 }, { "epoch": 2.7955801104972373, "grad_norm": 0.3988655252448663, "learning_rate": 7.146831482883115e-09, "loss": 0.16640629768371581, "memory(GiB)": 57.67, "step": 1140, "token_acc": 0.9387376446378571, "train_speed(iter/s)": 0.132687 }, { "epoch": 2.8078575813382445, "grad_norm": 0.5585703278010272, "learning_rate": 6.32483967133593e-09, "loss": 0.16575145721435547, "memory(GiB)": 57.67, "step": 1145, "token_acc": 0.9446856625961103, "train_speed(iter/s)": 0.132784 }, { "epoch": 2.820135052179251, "grad_norm": 0.5408177805631927, "learning_rate": 5.5524464131893046e-09, "loss": 0.16870219707489015, "memory(GiB)": 57.67, "step": 1150, "token_acc": 0.9385546004457179, "train_speed(iter/s)": 0.132784 }, { "epoch": 2.832412523020258, "grad_norm": 0.47090675834257434, "learning_rate": 4.829808924255441e-09, "loss": 0.1629176139831543, "memory(GiB)": 57.67, "step": 1155, "token_acc": 0.9479655438055886, "train_speed(iter/s)": 0.132829 }, { "epoch": 2.8446899938612646, "grad_norm": 0.482099713549635, "learning_rate": 4.157074292871238e-09, "loss": 0.15441689491271973, "memory(GiB)": 57.67, "step": 1160, "token_acc": 0.9639929344626116, "train_speed(iter/s)": 0.132784 }, { "epoch": 2.8569674647022714, "grad_norm": 0.4698231217223272, "learning_rate": 3.5343794499594625e-09, "loss": 0.18142955303192138, "memory(GiB)": 57.67, "step": 1165, "token_acc": 0.9412326017570991, "train_speed(iter/s)": 0.132762 }, { "epoch": 2.869244935543278, "grad_norm": 0.43804204402927915, "learning_rate": 2.9618511411570455e-09, "loss": 0.18446786403656007, "memory(GiB)": 57.67, "step": 1170, "token_acc": 0.9350367684435257, "train_speed(iter/s)": 0.132754 }, { "epoch": 2.881522406384285, "grad_norm": 0.5571992556185795, "learning_rate": 2.4396059010170777e-09, "loss": 0.1577387809753418, "memory(GiB)": 57.67, "step": 1175, "token_acc": 0.9629287863590772, "train_speed(iter/s)": 0.132933 }, { "epoch": 2.8937998772252915, "grad_norm": 0.5267456210442775, "learning_rate": 1.967750029288756e-09, "loss": 0.15231819152832032, "memory(GiB)": 57.67, "step": 1180, "token_acc": 0.9535112359550562, "train_speed(iter/s)": 0.133115 }, { "epoch": 2.9060773480662982, "grad_norm": 0.3910293202558737, "learning_rate": 1.5463795692808034e-09, "loss": 0.162775456905365, "memory(GiB)": 57.67, "step": 1185, "token_acc": 0.931954924708307, "train_speed(iter/s)": 0.133131 }, { "epoch": 2.918354818907305, "grad_norm": 0.4093866355363869, "learning_rate": 1.1755802883124389e-09, "loss": 0.16861215829849244, "memory(GiB)": 57.67, "step": 1190, "token_acc": 0.9492351730924053, "train_speed(iter/s)": 0.133244 }, { "epoch": 2.9306322897483117, "grad_norm": 0.5603512977054463, "learning_rate": 8.554276602559807e-10, "loss": 0.20310664176940918, "memory(GiB)": 57.67, "step": 1195, "token_acc": 0.9434275032624414, "train_speed(iter/s)": 0.133257 }, { "epoch": 2.942909760589319, "grad_norm": 0.5047283156733317, "learning_rate": 5.859868501746079e-10, "loss": 0.1582653284072876, "memory(GiB)": 57.67, "step": 1200, "token_acc": 0.9450664007614559, "train_speed(iter/s)": 0.133326 }, { "epoch": 2.955187231430325, "grad_norm": 0.3732689118202031, "learning_rate": 3.6731270105844204e-10, "loss": 0.17182209491729736, "memory(GiB)": 57.67, "step": 1205, "token_acc": 0.9346191946856244, "train_speed(iter/s)": 0.132213 }, { "epoch": 2.9674647022713323, "grad_norm": 0.513720762261806, "learning_rate": 1.9944972266153214e-10, "loss": 0.17291358709335328, "memory(GiB)": 57.67, "step": 1210, "token_acc": 0.9477955520873976, "train_speed(iter/s)": 0.132351 }, { "epoch": 2.979742173112339, "grad_norm": 0.4516334969279687, "learning_rate": 8.243208244229637e-11, "loss": 0.156210994720459, "memory(GiB)": 57.67, "step": 1215, "token_acc": 0.9544438002153566, "train_speed(iter/s)": 0.132457 }, { "epoch": 2.9920196439533457, "grad_norm": 0.4764485403289777, "learning_rate": 1.628359860883499e-11, "loss": 0.1759173631668091, "memory(GiB)": 57.67, "step": 1220, "token_acc": 0.9492864815098971, "train_speed(iter/s)": 0.132386 } ], "logging_steps": 5, "max_steps": 1224, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 169385433251840.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }