{ "best_global_step": 460, "best_metric": 0.21278653, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v31-20250511-155600/checkpoint-460", "epoch": 2.9911123081066524, "eval_steps": 20, "global_step": 696, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0043091839482897925, "grad_norm": 2.5187692642211914, "learning_rate": 9.99994906450425e-06, "loss": 0.42822888493537903, "memory(GiB)": 30.87, "step": 1, "token_acc": 0.8878183069511356, "train_speed(iter/s)": 0.066565 }, { "epoch": 0.02154591974144896, "grad_norm": 1.426959753036499, "learning_rate": 9.99872666449397e-06, "loss": 0.3501852750778198, "memory(GiB)": 30.87, "step": 5, "token_acc": 0.8919414461038001, "train_speed(iter/s)": 0.123514 }, { "epoch": 0.04309183948289792, "grad_norm": 0.964651882648468, "learning_rate": 9.994907306529203e-06, "loss": 0.2793572902679443, "memory(GiB)": 30.87, "step": 10, "token_acc": 0.9118709677419354, "train_speed(iter/s)": 0.140044 }, { "epoch": 0.06463775922434689, "grad_norm": 0.8671960830688477, "learning_rate": 9.988543871435342e-06, "loss": 0.2740795612335205, "memory(GiB)": 30.87, "step": 15, "token_acc": 0.9092932217932218, "train_speed(iter/s)": 0.143802 }, { "epoch": 0.08618367896579585, "grad_norm": 0.8037099838256836, "learning_rate": 9.979639600327522e-06, "loss": 0.2603166103363037, "memory(GiB)": 30.87, "step": 20, "token_acc": 0.909269693956469, "train_speed(iter/s)": 0.146069 }, { "epoch": 0.08618367896579585, "eval_loss": 0.28445571660995483, "eval_runtime": 9.1338, "eval_samples_per_second": 16.422, "eval_steps_per_second": 4.16, "eval_token_acc": 0.9113986985450026, "step": 20 }, { "epoch": 0.10772959870724481, "grad_norm": 0.7577056884765625, "learning_rate": 9.96819902845557e-06, "loss": 0.25346100330352783, "memory(GiB)": 30.87, "step": 25, "token_acc": 0.9155251620482054, "train_speed(iter/s)": 0.132315 }, { "epoch": 0.12927551844869378, "grad_norm": 0.8333344459533691, "learning_rate": 9.954227982894034e-06, "loss": 0.2552709341049194, "memory(GiB)": 30.87, "step": 30, "token_acc": 0.9222959574861856, "train_speed(iter/s)": 0.136002 }, { "epoch": 0.15082143819014274, "grad_norm": 0.7584787607192993, "learning_rate": 9.937733579574263e-06, "loss": 0.23784613609313965, "memory(GiB)": 30.87, "step": 35, "token_acc": 0.9168941654498127, "train_speed(iter/s)": 0.137854 }, { "epoch": 0.1723673579315917, "grad_norm": 0.771274745464325, "learning_rate": 9.918724219660013e-06, "loss": 0.23706231117248536, "memory(GiB)": 30.87, "step": 40, "token_acc": 0.9237318428492509, "train_speed(iter/s)": 0.14013 }, { "epoch": 0.1723673579315917, "eval_loss": 0.26220738887786865, "eval_runtime": 9.0555, "eval_samples_per_second": 16.564, "eval_steps_per_second": 4.196, "eval_token_acc": 0.916019594940411, "step": 40 }, { "epoch": 0.19391327767304067, "grad_norm": 0.7550503611564636, "learning_rate": 9.897209585268459e-06, "loss": 0.2579146146774292, "memory(GiB)": 30.87, "step": 45, "token_acc": 0.9144340126707885, "train_speed(iter/s)": 0.132998 }, { "epoch": 0.21545919741448963, "grad_norm": 0.7265065312385559, "learning_rate": 9.873200634538746e-06, "loss": 0.24367237091064453, "memory(GiB)": 30.87, "step": 50, "token_acc": 0.9259022989915843, "train_speed(iter/s)": 0.134517 }, { "epoch": 0.23700511715593858, "grad_norm": 0.7292296290397644, "learning_rate": 9.846709596050646e-06, "loss": 0.23889431953430176, "memory(GiB)": 30.87, "step": 55, "token_acc": 0.9210317334265112, "train_speed(iter/s)": 0.136145 }, { "epoch": 0.25855103689738757, "grad_norm": 0.7953284382820129, "learning_rate": 9.817749962596115e-06, "loss": 0.23232686519622803, "memory(GiB)": 30.87, "step": 60, "token_acc": 0.9184958572339069, "train_speed(iter/s)": 0.137867 }, { "epoch": 0.25855103689738757, "eval_loss": 0.251347154378891, "eval_runtime": 9.0788, "eval_samples_per_second": 16.522, "eval_steps_per_second": 4.186, "eval_token_acc": 0.9184031585874095, "step": 60 }, { "epoch": 0.28009695663883655, "grad_norm": 0.6928980350494385, "learning_rate": 9.786336484306966e-06, "loss": 0.24375591278076172, "memory(GiB)": 33.45, "step": 65, "token_acc": 0.9260038800834584, "train_speed(iter/s)": 0.134041 }, { "epoch": 0.3016428763802855, "grad_norm": 0.7734121084213257, "learning_rate": 9.752485161142103e-06, "loss": 0.22954516410827636, "memory(GiB)": 33.45, "step": 70, "token_acc": 0.9301495884406615, "train_speed(iter/s)": 0.13516 }, { "epoch": 0.32318879612173446, "grad_norm": 0.6648094654083252, "learning_rate": 9.716213234738216e-06, "loss": 0.21929600238800048, "memory(GiB)": 33.45, "step": 75, "token_acc": 0.9217300294184204, "train_speed(iter/s)": 0.13628 }, { "epoch": 0.3447347158631834, "grad_norm": 0.7500734925270081, "learning_rate": 9.677539179628005e-06, "loss": 0.2358041524887085, "memory(GiB)": 33.45, "step": 80, "token_acc": 0.9286979011240426, "train_speed(iter/s)": 0.137406 }, { "epoch": 0.3447347158631834, "eval_loss": 0.24365545809268951, "eval_runtime": 9.052, "eval_samples_per_second": 16.571, "eval_steps_per_second": 4.198, "eval_token_acc": 0.9190319514513416, "step": 80 }, { "epoch": 0.36628063560463237, "grad_norm": 0.7007490992546082, "learning_rate": 9.636482693830488e-06, "loss": 0.23542990684509277, "memory(GiB)": 33.45, "step": 85, "token_acc": 0.9126046231489751, "train_speed(iter/s)": 0.133883 }, { "epoch": 0.38782655534608135, "grad_norm": 0.7181985378265381, "learning_rate": 9.59306468881811e-06, "loss": 0.23636837005615235, "memory(GiB)": 33.45, "step": 90, "token_acc": 0.9158058806435421, "train_speed(iter/s)": 0.135756 }, { "epoch": 0.4093724750875303, "grad_norm": 0.7669239044189453, "learning_rate": 9.547307278865823e-06, "loss": 0.22127339839935303, "memory(GiB)": 33.45, "step": 95, "token_acc": 0.9236630450119938, "train_speed(iter/s)": 0.136626 }, { "epoch": 0.43091839482897926, "grad_norm": 0.7955787777900696, "learning_rate": 9.499233769787534e-06, "loss": 0.2230149030685425, "memory(GiB)": 33.45, "step": 100, "token_acc": 0.9264181091877497, "train_speed(iter/s)": 0.137829 }, { "epoch": 0.43091839482897926, "eval_loss": 0.23765011131763458, "eval_runtime": 9.0746, "eval_samples_per_second": 16.53, "eval_steps_per_second": 4.188, "eval_token_acc": 0.9208744607735615, "step": 100 }, { "epoch": 0.45246431457042824, "grad_norm": 0.7825611233711243, "learning_rate": 9.448868647065644e-06, "loss": 0.23993771076202391, "memory(GiB)": 33.45, "step": 105, "token_acc": 0.9233408703240906, "train_speed(iter/s)": 0.135278 }, { "epoch": 0.47401023431187717, "grad_norm": 0.6205978989601135, "learning_rate": 9.396237563379761e-06, "loss": 0.2033458471298218, "memory(GiB)": 36.07, "step": 110, "token_acc": 0.9351195748449955, "train_speed(iter/s)": 0.135983 }, { "epoch": 0.49555615405332615, "grad_norm": 0.7755696773529053, "learning_rate": 9.341367325540921e-06, "loss": 0.20325517654418945, "memory(GiB)": 36.07, "step": 115, "token_acc": 0.9325856886666162, "train_speed(iter/s)": 0.136589 }, { "epoch": 0.5171020737947751, "grad_norm": 0.6994781494140625, "learning_rate": 9.284285880837947e-06, "loss": 0.20680899620056153, "memory(GiB)": 36.07, "step": 120, "token_acc": 0.9287609114612856, "train_speed(iter/s)": 0.137204 }, { "epoch": 0.5171020737947751, "eval_loss": 0.233298197388649, "eval_runtime": 9.1026, "eval_samples_per_second": 16.479, "eval_steps_per_second": 4.175, "eval_token_acc": 0.9223221466695913, "step": 120 }, { "epoch": 0.5386479935362241, "grad_norm": 0.586891233921051, "learning_rate": 9.225022302802951e-06, "loss": 0.20470066070556642, "memory(GiB)": 36.07, "step": 125, "token_acc": 0.923740110451327, "train_speed(iter/s)": 0.134553 }, { "epoch": 0.5601939132776731, "grad_norm": 0.7442881464958191, "learning_rate": 9.163606776403182e-06, "loss": 0.21566917896270751, "memory(GiB)": 36.07, "step": 130, "token_acc": 0.9216363255911278, "train_speed(iter/s)": 0.13532 }, { "epoch": 0.581739833019122, "grad_norm": 0.8242325782775879, "learning_rate": 9.100070582666796e-06, "loss": 0.2127697229385376, "memory(GiB)": 36.07, "step": 135, "token_acc": 0.9278588316706072, "train_speed(iter/s)": 0.135954 }, { "epoch": 0.603285752760571, "grad_norm": 0.7717390656471252, "learning_rate": 9.034446082750352e-06, "loss": 0.22162201404571533, "memory(GiB)": 38.74, "step": 140, "token_acc": 0.9336324292479551, "train_speed(iter/s)": 0.136388 }, { "epoch": 0.603285752760571, "eval_loss": 0.2318713515996933, "eval_runtime": 9.0797, "eval_samples_per_second": 16.52, "eval_steps_per_second": 4.185, "eval_token_acc": 0.922234408130438, "step": 140 }, { "epoch": 0.6248316725020199, "grad_norm": 0.6726309657096863, "learning_rate": 8.966766701456177e-06, "loss": 0.21126816272735596, "memory(GiB)": 38.74, "step": 145, "token_acc": 0.9254626566202053, "train_speed(iter/s)": 0.134385 }, { "epoch": 0.6463775922434689, "grad_norm": 0.6902908086776733, "learning_rate": 8.897066910207958e-06, "loss": 0.21008939743041993, "memory(GiB)": 38.74, "step": 150, "token_acc": 0.9295461033399943, "train_speed(iter/s)": 0.135048 }, { "epoch": 0.6679235119849178, "grad_norm": 0.7515047788619995, "learning_rate": 8.825382209493284e-06, "loss": 0.22056446075439454, "memory(GiB)": 38.74, "step": 155, "token_acc": 0.925468949189583, "train_speed(iter/s)": 0.135565 }, { "epoch": 0.6894694317263668, "grad_norm": 0.831814169883728, "learning_rate": 8.751749110782013e-06, "loss": 0.20998082160949708, "memory(GiB)": 38.74, "step": 160, "token_acc": 0.9225184377421299, "train_speed(iter/s)": 0.136084 }, { "epoch": 0.6894694317263668, "eval_loss": 0.22983159124851227, "eval_runtime": 9.0689, "eval_samples_per_second": 16.54, "eval_steps_per_second": 4.19, "eval_token_acc": 0.9240184250932222, "step": 160 }, { "epoch": 0.7110153514678158, "grad_norm": 0.7213571667671204, "learning_rate": 8.676205117929752e-06, "loss": 0.20911731719970703, "memory(GiB)": 38.74, "step": 165, "token_acc": 0.9290636077587924, "train_speed(iter/s)": 0.134458 }, { "epoch": 0.7325612712092647, "grad_norm": 0.7172518968582153, "learning_rate": 8.598788708075844e-06, "loss": 0.20972037315368652, "memory(GiB)": 38.74, "step": 170, "token_acc": 0.920637162143079, "train_speed(iter/s)": 0.134813 }, { "epoch": 0.7541071909507137, "grad_norm": 0.7580899596214294, "learning_rate": 8.51953931204566e-06, "loss": 0.20927505493164061, "memory(GiB)": 38.74, "step": 175, "token_acc": 0.9197037614500098, "train_speed(iter/s)": 0.135368 }, { "epoch": 0.7756531106921627, "grad_norm": 0.6354929208755493, "learning_rate": 8.438497294267117e-06, "loss": 0.19174000024795532, "memory(GiB)": 38.74, "step": 180, "token_acc": 0.9338436037441498, "train_speed(iter/s)": 0.13579 }, { "epoch": 0.7756531106921627, "eval_loss": 0.2258211374282837, "eval_runtime": 9.0856, "eval_samples_per_second": 16.51, "eval_steps_per_second": 4.182, "eval_token_acc": 0.9245302332382832, "step": 180 }, { "epoch": 0.7971990304336116, "grad_norm": 0.6916645169258118, "learning_rate": 8.3557039322117e-06, "loss": 0.20972118377685547, "memory(GiB)": 38.74, "step": 185, "token_acc": 0.9328983816882928, "train_speed(iter/s)": 0.134176 }, { "epoch": 0.8187449501750605, "grad_norm": 0.7086811661720276, "learning_rate": 8.27120139537044e-06, "loss": 0.20003724098205566, "memory(GiB)": 38.74, "step": 190, "token_acc": 0.9426539918077131, "train_speed(iter/s)": 0.13459 }, { "epoch": 0.8402908699165096, "grad_norm": 0.7445757389068604, "learning_rate": 8.18503272377554e-06, "loss": 0.2096252918243408, "memory(GiB)": 38.74, "step": 195, "token_acc": 0.9330636846696804, "train_speed(iter/s)": 0.135216 }, { "epoch": 0.8618367896579585, "grad_norm": 0.679315984249115, "learning_rate": 8.097241806078616e-06, "loss": 0.20919806957244874, "memory(GiB)": 38.74, "step": 200, "token_acc": 0.9273927392739274, "train_speed(iter/s)": 0.135566 }, { "epoch": 0.8618367896579585, "eval_loss": 0.22177766263484955, "eval_runtime": 9.08, "eval_samples_per_second": 16.52, "eval_steps_per_second": 4.185, "eval_token_acc": 0.925202895371792, "step": 200 }, { "epoch": 0.8833827093994074, "grad_norm": 0.7430039048194885, "learning_rate": 8.007873357196716e-06, "loss": 0.21405186653137206, "memory(GiB)": 38.74, "step": 205, "token_acc": 0.9314106184693589, "train_speed(iter/s)": 0.134435 }, { "epoch": 0.9049286291408565, "grad_norm": 0.781891405582428, "learning_rate": 7.916972895537471e-06, "loss": 0.21267032623291016, "memory(GiB)": 38.74, "step": 210, "token_acc": 0.9286528119588005, "train_speed(iter/s)": 0.13485 }, { "epoch": 0.9264745488823054, "grad_norm": 0.7031259536743164, "learning_rate": 7.824586719815019e-06, "loss": 0.19673454761505127, "memory(GiB)": 38.74, "step": 215, "token_acc": 0.9280998651727184, "train_speed(iter/s)": 0.135252 }, { "epoch": 0.9480204686237543, "grad_norm": 0.6894703507423401, "learning_rate": 7.730761885468486e-06, "loss": 0.20636224746704102, "memory(GiB)": 38.74, "step": 220, "token_acc": 0.9305924848241609, "train_speed(iter/s)": 0.135851 }, { "epoch": 0.9480204686237543, "eval_loss": 0.2209300696849823, "eval_runtime": 9.0739, "eval_samples_per_second": 16.531, "eval_steps_per_second": 4.188, "eval_token_acc": 0.9254076186298165, "step": 220 }, { "epoch": 0.9695663883652034, "grad_norm": 0.7678598761558533, "learning_rate": 7.635546180695039e-06, "loss": 0.2145383834838867, "memory(GiB)": 38.74, "step": 225, "token_acc": 0.9279970535878707, "train_speed(iter/s)": 0.134797 }, { "epoch": 0.9911123081066523, "grad_norm": 0.6631984710693359, "learning_rate": 7.538988102109728e-06, "loss": 0.20897607803344725, "memory(GiB)": 38.74, "step": 230, "token_acc": 0.9305944343816894, "train_speed(iter/s)": 0.135388 }, { "epoch": 1.0086183678965797, "grad_norm": 0.5616968870162964, "learning_rate": 7.441136830044495e-06, "loss": 0.1698223114013672, "memory(GiB)": 38.74, "step": 235, "token_acc": 0.9450659366692111, "train_speed(iter/s)": 0.136043 }, { "epoch": 1.0301642876380286, "grad_norm": 0.7747679352760315, "learning_rate": 7.342042203498952e-06, "loss": 0.1522472620010376, "memory(GiB)": 38.74, "step": 240, "token_acc": 0.9446748506967485, "train_speed(iter/s)": 0.136458 }, { "epoch": 1.0301642876380286, "eval_loss": 0.22592027485370636, "eval_runtime": 9.0855, "eval_samples_per_second": 16.51, "eval_steps_per_second": 4.183, "eval_token_acc": 0.9261680193024786, "step": 240 }, { "epoch": 1.0517102073794775, "grad_norm": 0.7509682178497314, "learning_rate": 7.241754694755674e-06, "loss": 0.14929369688034058, "memory(GiB)": 38.74, "step": 245, "token_acc": 0.9417605130483494, "train_speed(iter/s)": 0.135402 }, { "epoch": 1.0732561271209264, "grad_norm": 0.7504904270172119, "learning_rate": 7.140325383672938e-06, "loss": 0.1446376323699951, "memory(GiB)": 38.74, "step": 250, "token_acc": 0.9489984514711024, "train_speed(iter/s)": 0.135808 }, { "epoch": 1.0948020468623754, "grad_norm": 0.6855825781822205, "learning_rate": 7.037805931668006e-06, "loss": 0.14796760082244872, "memory(GiB)": 38.74, "step": 255, "token_acc": 0.9450119000396668, "train_speed(iter/s)": 0.136065 }, { "epoch": 1.1163479666038243, "grad_norm": 0.6018539071083069, "learning_rate": 6.934248555404197e-06, "loss": 0.14233092069625855, "memory(GiB)": 38.74, "step": 260, "token_acc": 0.9503198086030955, "train_speed(iter/s)": 0.13639 }, { "epoch": 1.1163479666038243, "eval_loss": 0.22484588623046875, "eval_runtime": 9.0958, "eval_samples_per_second": 16.491, "eval_steps_per_second": 4.178, "eval_token_acc": 0.9261680193024786, "step": 260 }, { "epoch": 1.1378938863452734, "grad_norm": 0.7431089878082275, "learning_rate": 6.8297060001951545e-06, "loss": 0.1508152961730957, "memory(GiB)": 38.74, "step": 265, "token_acc": 0.9396280295818994, "train_speed(iter/s)": 0.135576 }, { "epoch": 1.1594398060867224, "grad_norm": 0.6764137148857117, "learning_rate": 6.724231513139853e-06, "loss": 0.1467280149459839, "memory(GiB)": 38.74, "step": 270, "token_acc": 0.9448425970165101, "train_speed(iter/s)": 0.135807 }, { "epoch": 1.1809857258281713, "grad_norm": 0.7039455771446228, "learning_rate": 6.617878816002032e-06, "loss": 0.14175877571105958, "memory(GiB)": 38.74, "step": 275, "token_acc": 0.955692078562785, "train_speed(iter/s)": 0.136098 }, { "epoch": 1.2025316455696202, "grad_norm": 0.742364227771759, "learning_rate": 6.510702077847864e-06, "loss": 0.14723964929580688, "memory(GiB)": 38.74, "step": 280, "token_acc": 0.947806605915655, "train_speed(iter/s)": 0.136459 }, { "epoch": 1.2025316455696202, "eval_loss": 0.22613751888275146, "eval_runtime": 9.0885, "eval_samples_per_second": 16.504, "eval_steps_per_second": 4.181, "eval_token_acc": 0.9258609344154419, "step": 280 }, { "epoch": 1.2240775653110691, "grad_norm": 0.5859317183494568, "learning_rate": 6.402755887455792e-06, "loss": 0.14544841051101684, "memory(GiB)": 38.74, "step": 285, "token_acc": 0.9466956804824975, "train_speed(iter/s)": 0.135483 }, { "epoch": 1.2456234850525183, "grad_norm": 0.7057396769523621, "learning_rate": 6.294095225512604e-06, "loss": 0.1524769902229309, "memory(GiB)": 38.74, "step": 290, "token_acc": 0.9396096068249258, "train_speed(iter/s)": 0.13576 }, { "epoch": 1.2671694047939672, "grad_norm": 0.782086193561554, "learning_rate": 6.184775436609885e-06, "loss": 0.14989967346191407, "memory(GiB)": 38.74, "step": 295, "token_acc": 0.9496643404887627, "train_speed(iter/s)": 0.136136 }, { "epoch": 1.2887153245354162, "grad_norm": 0.8077779412269592, "learning_rate": 6.074852201055121e-06, "loss": 0.1529999017715454, "memory(GiB)": 38.74, "step": 300, "token_acc": 0.9429197198712922, "train_speed(iter/s)": 0.136527 }, { "epoch": 1.2887153245354162, "eval_loss": 0.22358979284763336, "eval_runtime": 9.0841, "eval_samples_per_second": 16.512, "eval_steps_per_second": 4.183, "eval_token_acc": 0.9260217884038897, "step": 300 }, { "epoch": 1.310261244276865, "grad_norm": 0.7263866066932678, "learning_rate": 5.964381506511823e-06, "loss": 0.1550525903701782, "memory(GiB)": 38.74, "step": 305, "token_acc": 0.9437107599047413, "train_speed(iter/s)": 0.135779 }, { "epoch": 1.331807164018314, "grad_norm": 0.667982816696167, "learning_rate": 5.853419619483083e-06, "loss": 0.1465543746948242, "memory(GiB)": 38.74, "step": 310, "token_acc": 0.9492003131640756, "train_speed(iter/s)": 0.13602 }, { "epoch": 1.353353083759763, "grad_norm": 0.6755979061126709, "learning_rate": 5.742023056653131e-06, "loss": 0.1486139178276062, "memory(GiB)": 38.74, "step": 315, "token_acc": 0.953510881680061, "train_speed(iter/s)": 0.136273 }, { "epoch": 1.3748990035012119, "grad_norm": 0.7418352365493774, "learning_rate": 5.630248556101448e-06, "loss": 0.14667509794235228, "memory(GiB)": 38.74, "step": 320, "token_acc": 0.9451512649929588, "train_speed(iter/s)": 0.136561 }, { "epoch": 1.3748990035012119, "eval_loss": 0.2228708267211914, "eval_runtime": 9.0781, "eval_samples_per_second": 16.523, "eval_steps_per_second": 4.186, "eval_token_acc": 0.926899173795423, "step": 320 }, { "epoch": 1.3964449232426608, "grad_norm": 0.7073500752449036, "learning_rate": 5.51815304840412e-06, "loss": 0.14506160020828246, "memory(GiB)": 38.74, "step": 325, "token_acc": 0.9425411230856494, "train_speed(iter/s)": 0.13582 }, { "epoch": 1.41799084298411, "grad_norm": 0.6103145480155945, "learning_rate": 5.405793627637157e-06, "loss": 0.1493854284286499, "memory(GiB)": 38.74, "step": 330, "token_acc": 0.9529187644577494, "train_speed(iter/s)": 0.13604 }, { "epoch": 1.4395367627255589, "grad_norm": 3.6716339588165283, "learning_rate": 5.293227522296517e-06, "loss": 0.15280224084854127, "memory(GiB)": 38.74, "step": 335, "token_acc": 0.9547047104353202, "train_speed(iter/s)": 0.136231 }, { "epoch": 1.4610826824670078, "grad_norm": 0.6979946494102478, "learning_rate": 5.180512066149682e-06, "loss": 0.1544776201248169, "memory(GiB)": 38.74, "step": 340, "token_acc": 0.939196952288807, "train_speed(iter/s)": 0.136511 }, { "epoch": 1.4610826824670078, "eval_loss": 0.2203603982925415, "eval_runtime": 9.0826, "eval_samples_per_second": 16.515, "eval_steps_per_second": 4.184, "eval_token_acc": 0.9274694742999195, "step": 340 }, { "epoch": 1.4826286022084567, "grad_norm": 0.7076512575149536, "learning_rate": 5.06770466903361e-06, "loss": 0.14294663667678834, "memory(GiB)": 38.74, "step": 345, "token_acc": 0.9427494854259345, "train_speed(iter/s)": 0.135716 }, { "epoch": 1.5041745219499059, "grad_norm": 0.7397769093513489, "learning_rate": 4.954862787613937e-06, "loss": 0.1430816411972046, "memory(GiB)": 38.74, "step": 350, "token_acc": 0.9418471128608924, "train_speed(iter/s)": 0.135954 }, { "epoch": 1.5257204416913548, "grad_norm": 0.7260046005249023, "learning_rate": 4.842043896120332e-06, "loss": 0.14312554597854615, "memory(GiB)": 38.74, "step": 355, "token_acc": 0.9532552240608769, "train_speed(iter/s)": 0.136211 }, { "epoch": 1.5472663614328037, "grad_norm": 0.7071496248245239, "learning_rate": 4.729305457072913e-06, "loss": 0.15508384704589845, "memory(GiB)": 38.74, "step": 360, "token_acc": 0.9442986367690152, "train_speed(iter/s)": 0.136516 }, { "epoch": 1.5472663614328037, "eval_loss": 0.21864531934261322, "eval_runtime": 9.0916, "eval_samples_per_second": 16.499, "eval_steps_per_second": 4.18, "eval_token_acc": 0.9273086203114718, "step": 360 }, { "epoch": 1.5688122811742526, "grad_norm": 0.7608581781387329, "learning_rate": 4.616704892014613e-06, "loss": 0.14767109155654906, "memory(GiB)": 38.74, "step": 365, "token_acc": 0.9417986303251507, "train_speed(iter/s)": 0.135816 }, { "epoch": 1.5903582009157016, "grad_norm": 0.6775366067886353, "learning_rate": 4.504299552264428e-06, "loss": 0.14293992519378662, "memory(GiB)": 38.74, "step": 370, "token_acc": 0.9508605933815139, "train_speed(iter/s)": 0.13597 }, { "epoch": 1.6119041206571505, "grad_norm": 0.7780856490135193, "learning_rate": 4.392146689706426e-06, "loss": 0.14917342662811278, "memory(GiB)": 38.74, "step": 375, "token_acc": 0.9454742841633872, "train_speed(iter/s)": 0.136293 }, { "epoch": 1.6334500403985994, "grad_norm": 0.7278069853782654, "learning_rate": 4.280303427629404e-06, "loss": 0.15140265226364136, "memory(GiB)": 38.74, "step": 380, "token_acc": 0.948611652106171, "train_speed(iter/s)": 0.136576 }, { "epoch": 1.6334500403985994, "eval_loss": 0.21836893260478973, "eval_runtime": 9.0809, "eval_samples_per_second": 16.518, "eval_steps_per_second": 4.185, "eval_token_acc": 0.9278204284565329, "step": 380 }, { "epoch": 1.6549959601400483, "grad_norm": 0.7161378860473633, "learning_rate": 4.168826731632052e-06, "loss": 0.13667253255844117, "memory(GiB)": 38.74, "step": 385, "token_acc": 0.9480358075613754, "train_speed(iter/s)": 0.135909 }, { "epoch": 1.6765418798814973, "grad_norm": 0.7765911221504211, "learning_rate": 4.057773380608411e-06, "loss": 0.1545323610305786, "memory(GiB)": 38.74, "step": 390, "token_acc": 0.9548359404807325, "train_speed(iter/s)": 0.136245 }, { "epoch": 1.6980877996229464, "grad_norm": 0.7643768787384033, "learning_rate": 3.947199937828447e-06, "loss": 0.15748288631439208, "memory(GiB)": 38.74, "step": 395, "token_acc": 0.9471460397941154, "train_speed(iter/s)": 0.136566 }, { "epoch": 1.7196337193643954, "grad_norm": 0.709528386592865, "learning_rate": 3.8371627221284495e-06, "loss": 0.1410720705986023, "memory(GiB)": 38.74, "step": 400, "token_acc": 0.9531430538141947, "train_speed(iter/s)": 0.136763 }, { "epoch": 1.7196337193643954, "eval_loss": 0.21615047752857208, "eval_runtime": 9.087, "eval_samples_per_second": 16.507, "eval_steps_per_second": 4.182, "eval_token_acc": 0.9283322366015939, "step": 400 }, { "epoch": 1.7411796391058443, "grad_norm": 0.6649565696716309, "learning_rate": 3.727717779225912e-06, "loss": 0.14372719526290895, "memory(GiB)": 38.74, "step": 405, "token_acc": 0.9476192800091097, "train_speed(iter/s)": 0.136115 }, { "epoch": 1.7627255588472934, "grad_norm": 0.7958775162696838, "learning_rate": 3.6189208531735354e-06, "loss": 0.15733466148376465, "memory(GiB)": 38.74, "step": 410, "token_acc": 0.9438738194422361, "train_speed(iter/s)": 0.136439 }, { "epoch": 1.7842714785887424, "grad_norm": 0.6333225965499878, "learning_rate": 3.510827357966876e-06, "loss": 0.13851017951965333, "memory(GiB)": 38.74, "step": 415, "token_acc": 0.957043632295357, "train_speed(iter/s)": 0.136694 }, { "epoch": 1.8058173983301913, "grad_norm": 0.633067786693573, "learning_rate": 3.403492349320101e-06, "loss": 0.13664473295211793, "memory(GiB)": 38.74, "step": 420, "token_acc": 0.9495724621793467, "train_speed(iter/s)": 0.136827 }, { "epoch": 1.8058173983301913, "eval_loss": 0.21541745960712433, "eval_runtime": 9.085, "eval_samples_per_second": 16.511, "eval_steps_per_second": 4.183, "eval_token_acc": 0.9283614827813117, "step": 420 }, { "epoch": 1.8273633180716402, "grad_norm": 0.678266704082489, "learning_rate": 3.29697049662423e-06, "loss": 0.13364578485488893, "memory(GiB)": 38.74, "step": 425, "token_acc": 0.9482191561046045, "train_speed(iter/s)": 0.136105 }, { "epoch": 1.8489092378130891, "grad_norm": 0.65605229139328, "learning_rate": 3.191316055102146e-06, "loss": 0.14047093391418458, "memory(GiB)": 38.74, "step": 430, "token_acc": 0.9487227952692121, "train_speed(iter/s)": 0.1363 }, { "epoch": 1.870455157554538, "grad_norm": 0.7330144643783569, "learning_rate": 3.0865828381745515e-06, "loss": 0.14013464450836183, "memory(GiB)": 38.74, "step": 435, "token_acc": 0.950079521447053, "train_speed(iter/s)": 0.136475 }, { "epoch": 1.892001077295987, "grad_norm": 0.6185954213142395, "learning_rate": 2.982824190050958e-06, "loss": 0.15014538764953614, "memory(GiB)": 38.74, "step": 440, "token_acc": 0.9505330365510778, "train_speed(iter/s)": 0.136687 }, { "epoch": 1.892001077295987, "eval_loss": 0.2138843536376953, "eval_runtime": 9.078, "eval_samples_per_second": 16.523, "eval_steps_per_second": 4.186, "eval_token_acc": 0.9287709292973605, "step": 440 }, { "epoch": 1.913546997037436, "grad_norm": 0.6207900643348694, "learning_rate": 2.8800929585596506e-06, "loss": 0.13897337913513183, "memory(GiB)": 38.74, "step": 445, "token_acc": 0.9472243016429612, "train_speed(iter/s)": 0.136077 }, { "epoch": 1.9350929167788848, "grad_norm": 0.6200534701347351, "learning_rate": 2.778441468230483e-06, "loss": 0.13360581398010254, "memory(GiB)": 38.74, "step": 450, "token_acc": 0.9528330952581016, "train_speed(iter/s)": 0.136238 }, { "epoch": 1.956638836520334, "grad_norm": 0.6887915134429932, "learning_rate": 2.6779214936442056e-06, "loss": 0.14459173679351806, "memory(GiB)": 38.74, "step": 455, "token_acc": 0.9435832116561559, "train_speed(iter/s)": 0.136419 }, { "epoch": 1.978184756261783, "grad_norm": 0.6248582005500793, "learning_rate": 2.5785842330619038e-06, "loss": 0.13852910995483397, "memory(GiB)": 38.74, "step": 460, "token_acc": 0.9469382100304794, "train_speed(iter/s)": 0.136568 }, { "epoch": 1.978184756261783, "eval_loss": 0.21278652548789978, "eval_runtime": 9.077, "eval_samples_per_second": 16.525, "eval_steps_per_second": 4.186, "eval_token_acc": 0.9287855523872194, "step": 460 }, { "epoch": 1.9997306760032318, "grad_norm": 0.659483790397644, "learning_rate": 2.480480282347961e-06, "loss": 0.14010127782821655, "memory(GiB)": 38.74, "step": 465, "token_acc": 0.9450760066893402, "train_speed(iter/s)": 0.136067 }, { "epoch": 2.0172367357931593, "grad_norm": 0.629996120929718, "learning_rate": 2.383659609199873e-06, "loss": 0.12061877250671386, "memory(GiB)": 38.74, "step": 470, "token_acc": 0.954493670886076, "train_speed(iter/s)": 0.136512 }, { "epoch": 2.0387826555346082, "grad_norm": 0.6116938591003418, "learning_rate": 2.2881715276979705e-06, "loss": 0.10172913074493409, "memory(GiB)": 38.74, "step": 475, "token_acc": 0.9618814806855424, "train_speed(iter/s)": 0.136622 }, { "epoch": 2.060328575276057, "grad_norm": 0.6483604311943054, "learning_rate": 2.1940646731880887e-06, "loss": 0.11099107265472412, "memory(GiB)": 38.74, "step": 480, "token_acc": 0.9685138845023201, "train_speed(iter/s)": 0.136775 }, { "epoch": 2.060328575276057, "eval_loss": 0.2272883951663971, "eval_runtime": 9.0828, "eval_samples_per_second": 16.515, "eval_steps_per_second": 4.184, "eval_token_acc": 0.928054397894275, "step": 480 }, { "epoch": 2.081874495017506, "grad_norm": 0.6483248472213745, "learning_rate": 2.101386977509907e-06, "loss": 0.11337897777557374, "memory(GiB)": 38.74, "step": 485, "token_acc": 0.9550344662708691, "train_speed(iter/s)": 0.136304 }, { "epoch": 2.103420414758955, "grad_norm": 0.7392867803573608, "learning_rate": 2.010185644583641e-06, "loss": 0.10771691799163818, "memory(GiB)": 38.74, "step": 490, "token_acc": 0.9670560877027851, "train_speed(iter/s)": 0.13646 }, { "epoch": 2.124966334500404, "grad_norm": 0.682255744934082, "learning_rate": 1.920507126367448e-06, "loss": 0.10148389339447021, "memory(GiB)": 38.74, "step": 495, "token_acc": 0.9608224142005619, "train_speed(iter/s)": 0.136614 }, { "epoch": 2.146512254241853, "grad_norm": 0.6227522492408752, "learning_rate": 1.8323970991978823e-06, "loss": 0.09187655448913574, "memory(GiB)": 38.74, "step": 500, "token_acc": 0.9645073073190814, "train_speed(iter/s)": 0.136742 }, { "epoch": 2.146512254241853, "eval_loss": 0.22719649970531464, "eval_runtime": 9.0834, "eval_samples_per_second": 16.514, "eval_steps_per_second": 4.183, "eval_token_acc": 0.9283029904218761, "step": 500 }, { "epoch": 2.168058173983302, "grad_norm": 0.6224367022514343, "learning_rate": 1.7459004405253544e-06, "loss": 0.09940274357795716, "memory(GiB)": 38.74, "step": 505, "token_acc": 0.960183380261353, "train_speed(iter/s)": 0.136199 }, { "epoch": 2.1896040937247507, "grad_norm": 0.6162160038948059, "learning_rate": 1.6610612060565235e-06, "loss": 0.09446293711662293, "memory(GiB)": 38.74, "step": 510, "token_acc": 0.9673866334954201, "train_speed(iter/s)": 0.136387 }, { "epoch": 2.2111500134661997, "grad_norm": 0.7308635115623474, "learning_rate": 1.5779226073152071e-06, "loss": 0.10239348411560059, "memory(GiB)": 38.74, "step": 515, "token_acc": 0.9644728473210771, "train_speed(iter/s)": 0.136648 }, { "epoch": 2.2326959332076486, "grad_norm": 0.63414067029953, "learning_rate": 1.4965269896332884e-06, "loss": 0.10488066673278809, "memory(GiB)": 38.74, "step": 520, "token_acc": 0.9668579943705502, "train_speed(iter/s)": 0.136842 }, { "epoch": 2.2326959332076486, "eval_loss": 0.22872595489025116, "eval_runtime": 9.0808, "eval_samples_per_second": 16.518, "eval_steps_per_second": 4.185, "eval_token_acc": 0.9286393214886306, "step": 520 }, { "epoch": 2.254241852949098, "grad_norm": 0.680652916431427, "learning_rate": 1.4169158105827768e-06, "loss": 0.10521303415298462, "memory(GiB)": 38.74, "step": 525, "token_acc": 0.9544280515008022, "train_speed(iter/s)": 0.136411 }, { "epoch": 2.275787772690547, "grad_norm": 0.6307674050331116, "learning_rate": 1.3391296188600594e-06, "loss": 0.09917184710502625, "memory(GiB)": 38.74, "step": 530, "token_acc": 0.9651942551516304, "train_speed(iter/s)": 0.136524 }, { "epoch": 2.297333692431996, "grad_norm": 0.6577898859977722, "learning_rate": 1.2632080336330532e-06, "loss": 0.10673871040344238, "memory(GiB)": 38.74, "step": 535, "token_acc": 0.9639854466787945, "train_speed(iter/s)": 0.136691 }, { "epoch": 2.3188796121734447, "grad_norm": 0.5732426643371582, "learning_rate": 1.1891897243618184e-06, "loss": 0.1005368709564209, "memory(GiB)": 38.74, "step": 540, "token_acc": 0.961928605343808, "train_speed(iter/s)": 0.136843 }, { "epoch": 2.3188796121734447, "eval_loss": 0.22874796390533447, "eval_runtime": 9.0783, "eval_samples_per_second": 16.523, "eval_steps_per_second": 4.186, "eval_token_acc": 0.9285223367697595, "step": 540 }, { "epoch": 2.3404255319148937, "grad_norm": 0.6907266974449158, "learning_rate": 1.1171123911028692e-06, "loss": 0.10030966997146606, "memory(GiB)": 38.74, "step": 545, "token_acc": 0.9541099053336412, "train_speed(iter/s)": 0.136383 }, { "epoch": 2.3619714516563426, "grad_norm": 0.7398902773857117, "learning_rate": 1.047012745307255e-06, "loss": 0.10119664669036865, "memory(GiB)": 38.74, "step": 550, "token_acc": 0.9620670073821692, "train_speed(iter/s)": 0.136525 }, { "epoch": 2.3835173713977915, "grad_norm": 0.7038416862487793, "learning_rate": 9.789264911221546e-07, "loss": 0.10580694675445557, "memory(GiB)": 38.74, "step": 555, "token_acc": 0.9612262521588947, "train_speed(iter/s)": 0.136738 }, { "epoch": 2.4050632911392404, "grad_norm": 0.567625105381012, "learning_rate": 9.128883072055411e-07, "loss": 0.10453232526779174, "memory(GiB)": 38.74, "step": 560, "token_acc": 0.9599795204368974, "train_speed(iter/s)": 0.136996 }, { "epoch": 2.4050632911392404, "eval_loss": 0.22811782360076904, "eval_runtime": 9.0761, "eval_samples_per_second": 16.527, "eval_steps_per_second": 4.187, "eval_token_acc": 0.9284638444103239, "step": 560 }, { "epoch": 2.4266092108806894, "grad_norm": 0.6157174706459045, "learning_rate": 8.489318290631454e-07, "loss": 0.10170652866363525, "memory(GiB)": 38.74, "step": 565, "token_acc": 0.9582807980419787, "train_speed(iter/s)": 0.136587 }, { "epoch": 2.4481551306221383, "grad_norm": 0.6362963914871216, "learning_rate": 7.870896319167548e-07, "loss": 0.09682157635688782, "memory(GiB)": 38.74, "step": 570, "token_acc": 0.9683932052353105, "train_speed(iter/s)": 0.136707 }, { "epoch": 2.4697010503635872, "grad_norm": 0.723169207572937, "learning_rate": 7.273932141125256e-07, "loss": 0.10265512466430664, "memory(GiB)": 38.74, "step": 575, "token_acc": 0.9611537977075199, "train_speed(iter/s)": 0.136865 }, { "epoch": 2.4912469701050366, "grad_norm": 0.729612410068512, "learning_rate": 6.698729810778065e-07, "loss": 0.10810785293579102, "memory(GiB)": 38.74, "step": 580, "token_acc": 0.9597445638994423, "train_speed(iter/s)": 0.136977 }, { "epoch": 2.4912469701050366, "eval_loss": 0.23019996285438538, "eval_runtime": 9.0825, "eval_samples_per_second": 16.515, "eval_steps_per_second": 4.184, "eval_token_acc": 0.9287124369379249, "step": 580 }, { "epoch": 2.5127928898464855, "grad_norm": 0.5513563752174377, "learning_rate": 6.145582298346153e-07, "loss": 0.09552640914916992, "memory(GiB)": 38.74, "step": 585, "token_acc": 0.9564781160078752, "train_speed(iter/s)": 0.136466 }, { "epoch": 2.5343388095879344, "grad_norm": 0.6641238331794739, "learning_rate": 5.614771340776559e-07, "loss": 0.09575198888778687, "memory(GiB)": 38.74, "step": 590, "token_acc": 0.9661430780499715, "train_speed(iter/s)": 0.1366 }, { "epoch": 2.5558847293293834, "grad_norm": 0.7826724052429199, "learning_rate": 5.106567298245008e-07, "loss": 0.10562150478363037, "memory(GiB)": 38.74, "step": 595, "token_acc": 0.9642348331948237, "train_speed(iter/s)": 0.136721 }, { "epoch": 2.5774306490708323, "grad_norm": 0.7083848714828491, "learning_rate": 4.6212290164521554e-07, "loss": 0.09948662519454957, "memory(GiB)": 38.74, "step": 600, "token_acc": 0.9634162853042324, "train_speed(iter/s)": 0.13684 }, { "epoch": 2.5774306490708323, "eval_loss": 0.22936248779296875, "eval_runtime": 9.0836, "eval_samples_per_second": 16.513, "eval_steps_per_second": 4.183, "eval_token_acc": 0.9285515829494773, "step": 600 }, { "epoch": 2.5989765688122812, "grad_norm": 0.6187947392463684, "learning_rate": 4.159003694784647e-07, "loss": 0.09668093919754028, "memory(GiB)": 38.74, "step": 605, "token_acc": 0.9560884152358398, "train_speed(iter/s)": 0.136435 }, { "epoch": 2.62052248855373, "grad_norm": 0.5855912566184998, "learning_rate": 3.7201267604080436e-07, "loss": 0.09887575507164001, "memory(GiB)": 38.74, "step": 610, "token_acc": 0.9638565571839216, "train_speed(iter/s)": 0.136553 }, { "epoch": 2.642068408295179, "grad_norm": 0.6379809379577637, "learning_rate": 3.3048217483556743e-07, "loss": 0.09761322140693665, "memory(GiB)": 38.74, "step": 615, "token_acc": 0.9634018456375839, "train_speed(iter/s)": 0.136656 }, { "epoch": 2.663614328036628, "grad_norm": 0.7473537921905518, "learning_rate": 2.9133001876746004e-07, "loss": 0.10321993827819824, "memory(GiB)": 38.74, "step": 620, "token_acc": 0.9682616630546367, "train_speed(iter/s)": 0.136808 }, { "epoch": 2.663614328036628, "eval_loss": 0.2293192744255066, "eval_runtime": 9.083, "eval_samples_per_second": 16.514, "eval_steps_per_second": 4.184, "eval_token_acc": 0.9288586678365138, "step": 620 }, { "epoch": 2.685160247778077, "grad_norm": 0.6255384087562561, "learning_rate": 2.545761493686666e-07, "loss": 0.09618874192237854, "memory(GiB)": 38.74, "step": 625, "token_acc": 0.9535718866230601, "train_speed(iter/s)": 0.136315 }, { "epoch": 2.706706167519526, "grad_norm": 0.7249470353126526, "learning_rate": 2.2023928664194229e-07, "loss": 0.09542186260223388, "memory(GiB)": 38.74, "step": 630, "token_acc": 0.9657436142277394, "train_speed(iter/s)": 0.136473 }, { "epoch": 2.728252087260975, "grad_norm": 0.6389328241348267, "learning_rate": 1.8833691952587829e-07, "loss": 0.09868041276931763, "memory(GiB)": 38.74, "step": 635, "token_acc": 0.9635655520475652, "train_speed(iter/s)": 0.136586 }, { "epoch": 2.7497980070024237, "grad_norm": 0.6394509673118591, "learning_rate": 1.5888529698718347e-07, "loss": 0.09870019555091858, "memory(GiB)": 38.74, "step": 640, "token_acc": 0.9696296296296296, "train_speed(iter/s)": 0.136694 }, { "epoch": 2.7497980070024237, "eval_loss": 0.2294546216726303, "eval_runtime": 9.0801, "eval_samples_per_second": 16.52, "eval_steps_per_second": 4.185, "eval_token_acc": 0.928580829129195, "step": 640 }, { "epoch": 2.7713439267438726, "grad_norm": 0.6763883233070374, "learning_rate": 1.3189941974453502e-07, "loss": 0.11155039072036743, "memory(GiB)": 38.75, "step": 645, "token_acc": 0.9546169649632604, "train_speed(iter/s)": 0.136302 }, { "epoch": 2.7928898464853216, "grad_norm": 0.672804057598114, "learning_rate": 1.0739303262819301e-07, "loss": 0.10463042259216308, "memory(GiB)": 38.75, "step": 650, "token_acc": 0.9642040954859147, "train_speed(iter/s)": 0.136412 }, { "epoch": 2.814435766226771, "grad_norm": 0.6254743933677673, "learning_rate": 8.537861757929422e-08, "loss": 0.09878579974174499, "memory(GiB)": 48.34, "step": 655, "token_acc": 0.9661926605504587, "train_speed(iter/s)": 0.136557 }, { "epoch": 2.83598168596822, "grad_norm": 0.645412027835846, "learning_rate": 6.58673872923693e-08, "loss": 0.09899102449417115, "memory(GiB)": 48.34, "step": 660, "token_acc": 0.9720326171606578, "train_speed(iter/s)": 0.136719 }, { "epoch": 2.83598168596822, "eval_loss": 0.22916720807552338, "eval_runtime": 9.0857, "eval_samples_per_second": 16.51, "eval_steps_per_second": 4.182, "eval_token_acc": 0.9288001754770783, "step": 660 }, { "epoch": 2.857527605709669, "grad_norm": 0.6514653563499451, "learning_rate": 4.88692795043344e-08, "loss": 0.09887722730636597, "memory(GiB)": 48.34, "step": 665, "token_acc": 0.9558207029990325, "train_speed(iter/s)": 0.136328 }, { "epoch": 2.8790735254511177, "grad_norm": 0.7409882545471191, "learning_rate": 3.439295193286174e-08, "loss": 0.10554132461547852, "memory(GiB)": 48.34, "step": 670, "token_acc": 0.9599883432901064, "train_speed(iter/s)": 0.136496 }, { "epoch": 2.9006194451925666, "grad_norm": 0.6542146801948547, "learning_rate": 2.2445777866709208e-08, "loss": 0.09306983947753907, "memory(GiB)": 48.34, "step": 675, "token_acc": 0.9684243112165927, "train_speed(iter/s)": 0.136588 }, { "epoch": 2.9221653649340156, "grad_norm": 0.6311559081077576, "learning_rate": 1.3033842410251074e-08, "loss": 0.10304062366485596, "memory(GiB)": 48.34, "step": 680, "token_acc": 0.9661386389557723, "train_speed(iter/s)": 0.136699 }, { "epoch": 2.9221653649340156, "eval_loss": 0.22871780395507812, "eval_runtime": 9.0702, "eval_samples_per_second": 16.538, "eval_steps_per_second": 4.19, "eval_token_acc": 0.9288732909263727, "step": 680 }, { "epoch": 2.9437112846754645, "grad_norm": 0.664340078830719, "learning_rate": 6.16193938412557e-09, "loss": 0.10301237106323242, "memory(GiB)": 48.34, "step": 685, "token_acc": 0.9599046221570066, "train_speed(iter/s)": 0.136287 }, { "epoch": 2.9652572044169134, "grad_norm": 0.6333754062652588, "learning_rate": 1.8335688835802169e-09, "loss": 0.09610807299613952, "memory(GiB)": 48.34, "step": 690, "token_acc": 0.9651297625621204, "train_speed(iter/s)": 0.136426 }, { "epoch": 2.9868031241583624, "grad_norm": 0.7277814745903015, "learning_rate": 5.093549575119205e-11, "loss": 0.0946582555770874, "memory(GiB)": 48.34, "step": 695, "token_acc": 0.9677898215836204, "train_speed(iter/s)": 0.136543 }, { "epoch": 2.9911123081066524, "eval_loss": 0.22913037240505219, "eval_runtime": 9.0719, "eval_samples_per_second": 16.535, "eval_steps_per_second": 4.189, "eval_token_acc": 0.9288147985669372, "step": 696 } ], "logging_steps": 5, "max_steps": 696, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.629985511001293e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }