{ "best_global_step": 460, "best_metric": 0.21736681, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v33-20250511-155921/checkpoint-460", "epoch": 2.9911123081066524, "eval_steps": 20, "global_step": 696, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0043091839482897925, "grad_norm": 2.5237181186676025, "learning_rate": 9.99994906450425e-06, "loss": 0.4181586503982544, "memory(GiB)": 30.52, "step": 1, "token_acc": 0.8749789668517584, "train_speed(iter/s)": 0.065716 }, { "epoch": 0.02154591974144896, "grad_norm": 1.6377122402191162, "learning_rate": 9.99872666449397e-06, "loss": 0.346223920583725, "memory(GiB)": 30.53, "step": 5, "token_acc": 0.892663520418113, "train_speed(iter/s)": 0.123965 }, { "epoch": 0.04309183948289792, "grad_norm": 1.0654209852218628, "learning_rate": 9.994907306529203e-06, "loss": 0.28658442497253417, "memory(GiB)": 30.53, "step": 10, "token_acc": 0.9107923074276671, "train_speed(iter/s)": 0.141394 }, { "epoch": 0.06463775922434689, "grad_norm": 0.9550829529762268, "learning_rate": 9.988543871435342e-06, "loss": 0.27949891090393064, "memory(GiB)": 30.53, "step": 15, "token_acc": 0.9063064246085158, "train_speed(iter/s)": 0.145526 }, { "epoch": 0.08618367896579585, "grad_norm": 0.9428064227104187, "learning_rate": 9.979639600327522e-06, "loss": 0.26371135711669924, "memory(GiB)": 30.53, "step": 20, "token_acc": 0.9098609104557944, "train_speed(iter/s)": 0.147486 }, { "epoch": 0.08618367896579585, "eval_loss": 0.28782597184181213, "eval_runtime": 9.1011, "eval_samples_per_second": 16.482, "eval_steps_per_second": 4.175, "eval_token_acc": 0.9104138732457316, "step": 20 }, { "epoch": 0.10772959870724481, "grad_norm": 0.8753194808959961, "learning_rate": 9.96819902845557e-06, "loss": 0.2613688468933105, "memory(GiB)": 30.53, "step": 25, "token_acc": 0.9133286666148143, "train_speed(iter/s)": 0.132937 }, { "epoch": 0.12927551844869378, "grad_norm": 0.8821267485618591, "learning_rate": 9.954227982894034e-06, "loss": 0.2568368434906006, "memory(GiB)": 30.53, "step": 30, "token_acc": 0.9194433893857257, "train_speed(iter/s)": 0.136714 }, { "epoch": 0.15082143819014274, "grad_norm": 0.8757939338684082, "learning_rate": 9.937733579574263e-06, "loss": 0.2422783136367798, "memory(GiB)": 30.53, "step": 35, "token_acc": 0.9147548274711834, "train_speed(iter/s)": 0.138761 }, { "epoch": 0.1723673579315917, "grad_norm": 0.849438488483429, "learning_rate": 9.918724219660013e-06, "loss": 0.24745543003082277, "memory(GiB)": 30.53, "step": 40, "token_acc": 0.9217827626918536, "train_speed(iter/s)": 0.141087 }, { "epoch": 0.1723673579315917, "eval_loss": 0.26439228653907776, "eval_runtime": 9.0902, "eval_samples_per_second": 16.501, "eval_steps_per_second": 4.18, "eval_token_acc": 0.9148833467417539, "step": 40 }, { "epoch": 0.19391327767304067, "grad_norm": 0.8241924047470093, "learning_rate": 9.897209585268459e-06, "loss": 0.2613823413848877, "memory(GiB)": 30.53, "step": 45, "token_acc": 0.9152877021436631, "train_speed(iter/s)": 0.13376 }, { "epoch": 0.21545919741448963, "grad_norm": 0.8378185033798218, "learning_rate": 9.873200634538746e-06, "loss": 0.2541953086853027, "memory(GiB)": 30.53, "step": 50, "token_acc": 0.9222211442062029, "train_speed(iter/s)": 0.135296 }, { "epoch": 0.23700511715593858, "grad_norm": 0.8501411080360413, "learning_rate": 9.846709596050646e-06, "loss": 0.24187664985656737, "memory(GiB)": 30.53, "step": 55, "token_acc": 0.9192013726407736, "train_speed(iter/s)": 0.136824 }, { "epoch": 0.25855103689738757, "grad_norm": 0.9697138071060181, "learning_rate": 9.817749962596115e-06, "loss": 0.2432713031768799, "memory(GiB)": 30.53, "step": 60, "token_acc": 0.9142042416681518, "train_speed(iter/s)": 0.138638 }, { "epoch": 0.25855103689738757, "eval_loss": 0.25323811173439026, "eval_runtime": 9.0482, "eval_samples_per_second": 16.578, "eval_steps_per_second": 4.2, "eval_token_acc": 0.9181013676588898, "step": 60 }, { "epoch": 0.28009695663883655, "grad_norm": 0.7712034583091736, "learning_rate": 9.786336484306966e-06, "loss": 0.25183610916137694, "memory(GiB)": 30.53, "step": 65, "token_acc": 0.9200226351032726, "train_speed(iter/s)": 0.134844 }, { "epoch": 0.3016428763802855, "grad_norm": 0.822388231754303, "learning_rate": 9.752485161142103e-06, "loss": 0.23003783226013183, "memory(GiB)": 30.53, "step": 70, "token_acc": 0.9277887742117913, "train_speed(iter/s)": 0.136026 }, { "epoch": 0.32318879612173446, "grad_norm": 0.792780339717865, "learning_rate": 9.716213234738216e-06, "loss": 0.2195737838745117, "memory(GiB)": 30.53, "step": 75, "token_acc": 0.9205170406647666, "train_speed(iter/s)": 0.137139 }, { "epoch": 0.3447347158631834, "grad_norm": 0.8536691069602966, "learning_rate": 9.677539179628005e-06, "loss": 0.2305692672729492, "memory(GiB)": 30.53, "step": 80, "token_acc": 0.9304421364200957, "train_speed(iter/s)": 0.138275 }, { "epoch": 0.3447347158631834, "eval_loss": 0.24655859172344208, "eval_runtime": 9.1028, "eval_samples_per_second": 16.479, "eval_steps_per_second": 4.175, "eval_token_acc": 0.9181907571288103, "step": 80 }, { "epoch": 0.36628063560463237, "grad_norm": 0.8240283727645874, "learning_rate": 9.636482693830488e-06, "loss": 0.2409297466278076, "memory(GiB)": 30.53, "step": 85, "token_acc": 0.9139544242668143, "train_speed(iter/s)": 0.134608 }, { "epoch": 0.38782655534608135, "grad_norm": 0.793641984462738, "learning_rate": 9.59306468881811e-06, "loss": 0.24126851558685303, "memory(GiB)": 30.53, "step": 90, "token_acc": 0.9150926617579056, "train_speed(iter/s)": 0.136497 }, { "epoch": 0.4093724750875303, "grad_norm": 0.9017167091369629, "learning_rate": 9.547307278865823e-06, "loss": 0.22098026275634766, "memory(GiB)": 30.53, "step": 95, "token_acc": 0.9268885204180002, "train_speed(iter/s)": 0.137464 }, { "epoch": 0.43091839482897926, "grad_norm": 0.9111009240150452, "learning_rate": 9.499233769787534e-06, "loss": 0.23211708068847656, "memory(GiB)": 30.53, "step": 100, "token_acc": 0.9252183406113537, "train_speed(iter/s)": 0.138704 }, { "epoch": 0.43091839482897926, "eval_loss": 0.24070490896701813, "eval_runtime": 9.1064, "eval_samples_per_second": 16.472, "eval_steps_per_second": 4.173, "eval_token_acc": 0.9203897380888532, "step": 100 }, { "epoch": 0.45246431457042824, "grad_norm": 0.9242920875549316, "learning_rate": 9.448868647065644e-06, "loss": 0.24608321189880372, "memory(GiB)": 30.53, "step": 105, "token_acc": 0.9158947454709402, "train_speed(iter/s)": 0.136144 }, { "epoch": 0.47401023431187717, "grad_norm": 0.7441526055335999, "learning_rate": 9.396237563379761e-06, "loss": 0.21680865287780762, "memory(GiB)": 30.53, "step": 110, "token_acc": 0.9275543984829493, "train_speed(iter/s)": 0.136882 }, { "epoch": 0.49555615405332615, "grad_norm": 0.8983182311058044, "learning_rate": 9.341367325540921e-06, "loss": 0.21554439067840575, "memory(GiB)": 30.53, "step": 115, "token_acc": 0.9245874763890374, "train_speed(iter/s)": 0.137509 }, { "epoch": 0.5171020737947751, "grad_norm": 0.7459524273872375, "learning_rate": 9.284285880837947e-06, "loss": 0.20305709838867186, "memory(GiB)": 30.53, "step": 120, "token_acc": 0.9276550249465432, "train_speed(iter/s)": 0.138213 }, { "epoch": 0.5171020737947751, "eval_loss": 0.23513969779014587, "eval_runtime": 9.1131, "eval_samples_per_second": 16.46, "eval_steps_per_second": 4.17, "eval_token_acc": 0.9224635737910074, "step": 120 }, { "epoch": 0.5386479935362241, "grad_norm": 0.6458992958068848, "learning_rate": 9.225022302802951e-06, "loss": 0.20988364219665528, "memory(GiB)": 30.53, "step": 125, "token_acc": 0.923570767028711, "train_speed(iter/s)": 0.135446 }, { "epoch": 0.5601939132776731, "grad_norm": 0.81890469789505, "learning_rate": 9.163606776403182e-06, "loss": 0.217695951461792, "memory(GiB)": 30.53, "step": 130, "token_acc": 0.9229289737601318, "train_speed(iter/s)": 0.13625 }, { "epoch": 0.581739833019122, "grad_norm": 0.9051012992858887, "learning_rate": 9.100070582666796e-06, "loss": 0.21849727630615234, "memory(GiB)": 30.53, "step": 135, "token_acc": 0.9253172858079698, "train_speed(iter/s)": 0.136893 }, { "epoch": 0.603285752760571, "grad_norm": 0.9010105729103088, "learning_rate": 9.034446082750352e-06, "loss": 0.22707552909851075, "memory(GiB)": 30.53, "step": 140, "token_acc": 0.9273702731546916, "train_speed(iter/s)": 0.137345 }, { "epoch": 0.603285752760571, "eval_loss": 0.23331375420093536, "eval_runtime": 9.1166, "eval_samples_per_second": 16.454, "eval_steps_per_second": 4.168, "eval_token_acc": 0.9220881380173416, "step": 140 }, { "epoch": 0.6248316725020199, "grad_norm": 0.7436938881874084, "learning_rate": 8.966766701456177e-06, "loss": 0.22292051315307618, "memory(GiB)": 30.53, "step": 145, "token_acc": 0.9230008118530546, "train_speed(iter/s)": 0.135251 }, { "epoch": 0.6463775922434689, "grad_norm": 0.7819850444793701, "learning_rate": 8.897066910207958e-06, "loss": 0.21327242851257325, "memory(GiB)": 30.53, "step": 150, "token_acc": 0.9308567501527184, "train_speed(iter/s)": 0.135932 }, { "epoch": 0.6679235119849178, "grad_norm": 0.8297721147537231, "learning_rate": 8.825382209493284e-06, "loss": 0.2241668701171875, "memory(GiB)": 30.53, "step": 155, "token_acc": 0.924225955715661, "train_speed(iter/s)": 0.136464 }, { "epoch": 0.6894694317263668, "grad_norm": 0.8930188417434692, "learning_rate": 8.751749110782013e-06, "loss": 0.21810550689697267, "memory(GiB)": 30.53, "step": 160, "token_acc": 0.9215799614643545, "train_speed(iter/s)": 0.137033 }, { "epoch": 0.6894694317263668, "eval_loss": 0.23200780153274536, "eval_runtime": 9.1055, "eval_samples_per_second": 16.474, "eval_steps_per_second": 4.173, "eval_token_acc": 0.9217484580316438, "step": 160 }, { "epoch": 0.7110153514678158, "grad_norm": 0.7813264727592468, "learning_rate": 8.676205117929752e-06, "loss": 0.21541600227355956, "memory(GiB)": 30.53, "step": 165, "token_acc": 0.9275054882711329, "train_speed(iter/s)": 0.135327 }, { "epoch": 0.7325612712092647, "grad_norm": 0.8045797348022461, "learning_rate": 8.598788708075844e-06, "loss": 0.20851638317108154, "memory(GiB)": 30.53, "step": 170, "token_acc": 0.9192872543834669, "train_speed(iter/s)": 0.135759 }, { "epoch": 0.7541071909507137, "grad_norm": 0.8655018210411072, "learning_rate": 8.51953931204566e-06, "loss": 0.20699663162231446, "memory(GiB)": 30.53, "step": 175, "token_acc": 0.9224663747263059, "train_speed(iter/s)": 0.136385 }, { "epoch": 0.7756531106921627, "grad_norm": 0.6923186779022217, "learning_rate": 8.438497294267117e-06, "loss": 0.19253411293029785, "memory(GiB)": 30.53, "step": 180, "token_acc": 0.9316482201615316, "train_speed(iter/s)": 0.13683 }, { "epoch": 0.7756531106921627, "eval_loss": 0.22997288405895233, "eval_runtime": 9.1153, "eval_samples_per_second": 16.456, "eval_steps_per_second": 4.169, "eval_token_acc": 0.9232859569142755, "step": 180 }, { "epoch": 0.7971990304336116, "grad_norm": 0.7510209083557129, "learning_rate": 8.3557039322117e-06, "loss": 0.2128201961517334, "memory(GiB)": 30.53, "step": 185, "token_acc": 0.9327214400285154, "train_speed(iter/s)": 0.1351 }, { "epoch": 0.8187449501750605, "grad_norm": 0.7761275768280029, "learning_rate": 8.27120139537044e-06, "loss": 0.20115509033203124, "memory(GiB)": 30.53, "step": 190, "token_acc": 0.9393949424069906, "train_speed(iter/s)": 0.135548 }, { "epoch": 0.8402908699165096, "grad_norm": 0.8497465252876282, "learning_rate": 8.18503272377554e-06, "loss": 0.2121565341949463, "memory(GiB)": 30.53, "step": 195, "token_acc": 0.9327115256495669, "train_speed(iter/s)": 0.136171 }, { "epoch": 0.8618367896579585, "grad_norm": 0.7839154601097107, "learning_rate": 8.097241806078616e-06, "loss": 0.21248257160186768, "memory(GiB)": 30.53, "step": 200, "token_acc": 0.9275539014373717, "train_speed(iter/s)": 0.136519 }, { "epoch": 0.8618367896579585, "eval_loss": 0.22366267442703247, "eval_runtime": 9.0776, "eval_samples_per_second": 16.524, "eval_steps_per_second": 4.186, "eval_token_acc": 0.9240189505676232, "step": 200 }, { "epoch": 0.8833827093994074, "grad_norm": 0.8640104532241821, "learning_rate": 8.007873357196716e-06, "loss": 0.2174234390258789, "memory(GiB)": 30.53, "step": 205, "token_acc": 0.9313344114690327, "train_speed(iter/s)": 0.135298 }, { "epoch": 0.9049286291408565, "grad_norm": 0.8847671151161194, "learning_rate": 7.916972895537471e-06, "loss": 0.21736545562744142, "memory(GiB)": 30.53, "step": 210, "token_acc": 0.9281964485498724, "train_speed(iter/s)": 0.135752 }, { "epoch": 0.9264745488823054, "grad_norm": 0.9129384160041809, "learning_rate": 7.824586719815019e-06, "loss": 0.19911231994628906, "memory(GiB)": 30.53, "step": 215, "token_acc": 0.933074239549544, "train_speed(iter/s)": 0.136175 }, { "epoch": 0.9480204686237543, "grad_norm": 0.746377170085907, "learning_rate": 7.730761885468486e-06, "loss": 0.2088994264602661, "memory(GiB)": 30.53, "step": 220, "token_acc": 0.929672384883271, "train_speed(iter/s)": 0.136769 }, { "epoch": 0.9480204686237543, "eval_loss": 0.22297517955303192, "eval_runtime": 9.1184, "eval_samples_per_second": 16.45, "eval_steps_per_second": 4.167, "eval_token_acc": 0.9244480200232412, "step": 220 }, { "epoch": 0.9695663883652034, "grad_norm": 0.911310076713562, "learning_rate": 7.635546180695039e-06, "loss": 0.21921830177307128, "memory(GiB)": 30.53, "step": 225, "token_acc": 0.928082019245302, "train_speed(iter/s)": 0.135629 }, { "epoch": 0.9911123081066523, "grad_norm": 0.7769207954406738, "learning_rate": 7.538988102109728e-06, "loss": 0.2161275863647461, "memory(GiB)": 30.53, "step": 230, "token_acc": 0.9270204342784021, "train_speed(iter/s)": 0.136241 }, { "epoch": 1.0086183678965797, "grad_norm": 0.6726126670837402, "learning_rate": 7.441136830044495e-06, "loss": 0.17683182954788207, "memory(GiB)": 30.53, "step": 235, "token_acc": 0.943002946884209, "train_speed(iter/s)": 0.136928 }, { "epoch": 1.0301642876380286, "grad_norm": 0.815794050693512, "learning_rate": 7.342042203498952e-06, "loss": 0.14148125648498536, "memory(GiB)": 30.53, "step": 240, "token_acc": 0.9480684873355031, "train_speed(iter/s)": 0.137322 }, { "epoch": 1.0301642876380286, "eval_loss": 0.2275795191526413, "eval_runtime": 9.1077, "eval_samples_per_second": 16.47, "eval_steps_per_second": 4.172, "eval_token_acc": 0.9249307231608117, "step": 240 }, { "epoch": 1.0517102073794775, "grad_norm": 0.8187179565429688, "learning_rate": 7.241754694755674e-06, "loss": 0.1443116307258606, "memory(GiB)": 30.53, "step": 245, "token_acc": 0.9402755009664554, "train_speed(iter/s)": 0.13621 }, { "epoch": 1.0732561271209264, "grad_norm": 0.8635661005973816, "learning_rate": 7.140325383672938e-06, "loss": 0.1403177261352539, "memory(GiB)": 30.53, "step": 250, "token_acc": 0.9511112435202288, "train_speed(iter/s)": 0.136646 }, { "epoch": 1.0948020468623754, "grad_norm": 0.7241800427436829, "learning_rate": 7.037805931668006e-06, "loss": 0.14431071281433105, "memory(GiB)": 30.53, "step": 255, "token_acc": 0.9472287558048369, "train_speed(iter/s)": 0.136936 }, { "epoch": 1.1163479666038243, "grad_norm": 0.7335503101348877, "learning_rate": 6.934248555404197e-06, "loss": 0.14122509956359863, "memory(GiB)": 30.53, "step": 260, "token_acc": 0.9511055540931824, "train_speed(iter/s)": 0.137293 }, { "epoch": 1.1163479666038243, "eval_loss": 0.22707919776439667, "eval_runtime": 9.0673, "eval_samples_per_second": 16.543, "eval_steps_per_second": 4.191, "eval_token_acc": 0.9248234557969072, "step": 260 }, { "epoch": 1.1378938863452734, "grad_norm": 0.8322599530220032, "learning_rate": 6.8297060001951545e-06, "loss": 0.14616423845291138, "memory(GiB)": 30.53, "step": 265, "token_acc": 0.9405642484589853, "train_speed(iter/s)": 0.136437 }, { "epoch": 1.1594398060867224, "grad_norm": 0.746890127658844, "learning_rate": 6.724231513139853e-06, "loss": 0.13995609283447266, "memory(GiB)": 30.53, "step": 270, "token_acc": 0.9483776303064662, "train_speed(iter/s)": 0.1367 }, { "epoch": 1.1809857258281713, "grad_norm": 0.8125350475311279, "learning_rate": 6.617878816002032e-06, "loss": 0.1372074842453003, "memory(GiB)": 30.53, "step": 275, "token_acc": 0.9589013747076607, "train_speed(iter/s)": 0.136989 }, { "epoch": 1.2025316455696202, "grad_norm": 0.7824741005897522, "learning_rate": 6.510702077847864e-06, "loss": 0.14701566696166993, "memory(GiB)": 30.53, "step": 280, "token_acc": 0.9469290828625913, "train_speed(iter/s)": 0.137369 }, { "epoch": 1.2025316455696202, "eval_loss": 0.22802643477916718, "eval_runtime": 9.0755, "eval_samples_per_second": 16.528, "eval_steps_per_second": 4.187, "eval_token_acc": 0.9248592115848753, "step": 280 }, { "epoch": 1.2240775653110691, "grad_norm": 0.6673188209533691, "learning_rate": 6.402755887455792e-06, "loss": 0.14538809061050414, "memory(GiB)": 30.53, "step": 285, "token_acc": 0.9437563495666083, "train_speed(iter/s)": 0.136354 }, { "epoch": 1.2456234850525183, "grad_norm": 0.7520856261253357, "learning_rate": 6.294095225512604e-06, "loss": 0.14223116636276245, "memory(GiB)": 30.53, "step": 290, "token_acc": 0.9431994362226921, "train_speed(iter/s)": 0.136648 }, { "epoch": 1.2671694047939672, "grad_norm": 0.8446849584579468, "learning_rate": 6.184775436609885e-06, "loss": 0.14606384038925171, "memory(GiB)": 30.53, "step": 295, "token_acc": 0.9505753500623874, "train_speed(iter/s)": 0.137043 }, { "epoch": 1.2887153245354162, "grad_norm": 0.9321854710578918, "learning_rate": 6.074852201055121e-06, "loss": 0.14932063817977906, "memory(GiB)": 30.53, "step": 300, "token_acc": 0.9450343760123826, "train_speed(iter/s)": 0.137433 }, { "epoch": 1.2887153245354162, "eval_loss": 0.22745147347450256, "eval_runtime": 9.1349, "eval_samples_per_second": 16.421, "eval_steps_per_second": 4.16, "eval_token_acc": 0.924591043175114, "step": 300 }, { "epoch": 1.310261244276865, "grad_norm": 0.8243806958198547, "learning_rate": 5.964381506511823e-06, "loss": 0.15078881978988648, "memory(GiB)": 30.53, "step": 305, "token_acc": 0.9428590810419681, "train_speed(iter/s)": 0.136632 }, { "epoch": 1.331807164018314, "grad_norm": 0.7823331356048584, "learning_rate": 5.853419619483083e-06, "loss": 0.14328973293304442, "memory(GiB)": 30.53, "step": 310, "token_acc": 0.9486281555467584, "train_speed(iter/s)": 0.136888 }, { "epoch": 1.353353083759763, "grad_norm": 0.7617143392562866, "learning_rate": 5.742023056653131e-06, "loss": 0.14642927646636963, "memory(GiB)": 30.53, "step": 315, "token_acc": 0.9535409058393886, "train_speed(iter/s)": 0.137158 }, { "epoch": 1.3748990035012119, "grad_norm": 0.8405448794364929, "learning_rate": 5.630248556101448e-06, "loss": 0.14387913942337036, "memory(GiB)": 30.53, "step": 320, "token_acc": 0.9477073920984331, "train_speed(iter/s)": 0.137447 }, { "epoch": 1.3748990035012119, "eval_loss": 0.2264009267091751, "eval_runtime": 9.1222, "eval_samples_per_second": 16.443, "eval_steps_per_second": 4.166, "eval_token_acc": 0.9254670599803343, "step": 320 }, { "epoch": 1.3964449232426608, "grad_norm": 0.7621841430664062, "learning_rate": 5.51815304840412e-06, "loss": 0.13728692531585693, "memory(GiB)": 30.53, "step": 325, "token_acc": 0.9431869420146091, "train_speed(iter/s)": 0.136636 }, { "epoch": 1.41799084298411, "grad_norm": 0.7314430475234985, "learning_rate": 5.405793627637157e-06, "loss": 0.14474726915359498, "memory(GiB)": 32.82, "step": 330, "token_acc": 0.9499411071849234, "train_speed(iter/s)": 0.136864 }, { "epoch": 1.4395367627255589, "grad_norm": 0.8704355359077454, "learning_rate": 5.293227522296517e-06, "loss": 0.14055614471435546, "memory(GiB)": 32.82, "step": 335, "token_acc": 0.9562322700167892, "train_speed(iter/s)": 0.13705 }, { "epoch": 1.4610826824670078, "grad_norm": 0.7439765930175781, "learning_rate": 5.180512066149682e-06, "loss": 0.14476253986358642, "memory(GiB)": 32.82, "step": 340, "token_acc": 0.9437502448196169, "train_speed(iter/s)": 0.137336 }, { "epoch": 1.4610826824670078, "eval_loss": 0.22488656640052795, "eval_runtime": 9.1135, "eval_samples_per_second": 16.459, "eval_steps_per_second": 4.17, "eval_token_acc": 0.9261464199517296, "step": 340 }, { "epoch": 1.4826286022084567, "grad_norm": 0.806131899356842, "learning_rate": 5.06770466903361e-06, "loss": 0.13870317935943605, "memory(GiB)": 32.82, "step": 345, "token_acc": 0.942543247613352, "train_speed(iter/s)": 0.136478 }, { "epoch": 1.5041745219499059, "grad_norm": 0.8332167267799377, "learning_rate": 4.954862787613937e-06, "loss": 0.13994078636169432, "memory(GiB)": 32.82, "step": 350, "token_acc": 0.9441090757701915, "train_speed(iter/s)": 0.136722 }, { "epoch": 1.5257204416913548, "grad_norm": 0.8114423751831055, "learning_rate": 4.842043896120332e-06, "loss": 0.1382569432258606, "memory(GiB)": 32.82, "step": 355, "token_acc": 0.9571762441572724, "train_speed(iter/s)": 0.137 }, { "epoch": 1.5472663614328037, "grad_norm": 0.76919025182724, "learning_rate": 4.729305457072913e-06, "loss": 0.14944992065429688, "memory(GiB)": 32.82, "step": 360, "token_acc": 0.9474090514329827, "train_speed(iter/s)": 0.137309 }, { "epoch": 1.5472663614328037, "eval_loss": 0.22209370136260986, "eval_runtime": 9.1201, "eval_samples_per_second": 16.447, "eval_steps_per_second": 4.167, "eval_token_acc": 0.9268257799231251, "step": 360 }, { "epoch": 1.5688122811742526, "grad_norm": 1.0794566869735718, "learning_rate": 4.616704892014613e-06, "loss": 0.14552514553070067, "memory(GiB)": 32.82, "step": 365, "token_acc": 0.9390662094434187, "train_speed(iter/s)": 0.136567 }, { "epoch": 1.5903582009157016, "grad_norm": 0.7622527480125427, "learning_rate": 4.504299552264428e-06, "loss": 0.134457790851593, "memory(GiB)": 32.82, "step": 370, "token_acc": 0.9490503358128114, "train_speed(iter/s)": 0.136729 }, { "epoch": 1.6119041206571505, "grad_norm": 0.8797208666801453, "learning_rate": 4.392146689706426e-06, "loss": 0.14710538387298583, "memory(GiB)": 32.82, "step": 375, "token_acc": 0.9465866995942847, "train_speed(iter/s)": 0.137072 }, { "epoch": 1.6334500403985994, "grad_norm": 0.8271977305412292, "learning_rate": 4.280303427629404e-06, "loss": 0.1435370683670044, "memory(GiB)": 32.82, "step": 380, "token_acc": 0.9512855685695192, "train_speed(iter/s)": 0.137365 }, { "epoch": 1.6334500403985994, "eval_loss": 0.22211501002311707, "eval_runtime": 9.1527, "eval_samples_per_second": 16.389, "eval_steps_per_second": 4.152, "eval_token_acc": 0.9266470009832841, "step": 380 }, { "epoch": 1.6549959601400483, "grad_norm": 0.7813441157341003, "learning_rate": 4.168826731632052e-06, "loss": 0.13174430131912232, "memory(GiB)": 32.82, "step": 385, "token_acc": 0.9456577563647924, "train_speed(iter/s)": 0.136683 }, { "epoch": 1.6765418798814973, "grad_norm": 0.8885687589645386, "learning_rate": 4.057773380608411e-06, "loss": 0.15410563945770264, "memory(GiB)": 32.82, "step": 390, "token_acc": 0.9529198577680525, "train_speed(iter/s)": 0.137045 }, { "epoch": 1.6980877996229464, "grad_norm": 0.8711886405944824, "learning_rate": 3.947199937828447e-06, "loss": 0.15066791772842408, "memory(GiB)": 32.82, "step": 395, "token_acc": 0.9490923301005364, "train_speed(iter/s)": 0.137378 }, { "epoch": 1.7196337193643954, "grad_norm": 0.7740168571472168, "learning_rate": 3.8371627221284495e-06, "loss": 0.14116008281707765, "memory(GiB)": 32.82, "step": 400, "token_acc": 0.9508052422246854, "train_speed(iter/s)": 0.137567 }, { "epoch": 1.7196337193643954, "eval_loss": 0.2199936956167221, "eval_runtime": 9.2066, "eval_samples_per_second": 16.293, "eval_steps_per_second": 4.127, "eval_token_acc": 0.9274157504246, "step": 400 }, { "epoch": 1.7411796391058443, "grad_norm": 0.7679008841514587, "learning_rate": 3.727717779225912e-06, "loss": 0.14058753252029418, "memory(GiB)": 32.82, "step": 405, "token_acc": 0.944121915820029, "train_speed(iter/s)": 0.136891 }, { "epoch": 1.7627255588472934, "grad_norm": 0.8533144593238831, "learning_rate": 3.6189208531735354e-06, "loss": 0.14932174682617189, "memory(GiB)": 32.82, "step": 410, "token_acc": 0.9472646822204345, "train_speed(iter/s)": 0.137222 }, { "epoch": 1.7842714785887424, "grad_norm": 0.6813901662826538, "learning_rate": 3.510827357966876e-06, "loss": 0.12951855659484862, "memory(GiB)": 32.82, "step": 415, "token_acc": 0.9607936037903465, "train_speed(iter/s)": 0.137481 }, { "epoch": 1.8058173983301913, "grad_norm": 0.6819447875022888, "learning_rate": 3.403492349320101e-06, "loss": 0.12727973461151124, "memory(GiB)": 32.82, "step": 420, "token_acc": 0.9527016395506184, "train_speed(iter/s)": 0.13761 }, { "epoch": 1.8058173983301913, "eval_loss": 0.21894590556621552, "eval_runtime": 9.2314, "eval_samples_per_second": 16.249, "eval_steps_per_second": 4.116, "eval_token_acc": 0.9276660409403772, "step": 420 }, { "epoch": 1.8273633180716402, "grad_norm": 0.7320638298988342, "learning_rate": 3.29697049662423e-06, "loss": 0.12717063426971437, "memory(GiB)": 32.82, "step": 425, "token_acc": 0.948704977741805, "train_speed(iter/s)": 0.136847 }, { "epoch": 1.8489092378130891, "grad_norm": 0.779474139213562, "learning_rate": 3.191316055102146e-06, "loss": 0.13753225803375244, "memory(GiB)": 32.82, "step": 430, "token_acc": 0.9508869722421001, "train_speed(iter/s)": 0.137066 }, { "epoch": 1.870455157554538, "grad_norm": 0.8016546368598938, "learning_rate": 3.0865828381745515e-06, "loss": 0.1339845299720764, "memory(GiB)": 32.82, "step": 435, "token_acc": 0.9550213879844219, "train_speed(iter/s)": 0.137237 }, { "epoch": 1.892001077295987, "grad_norm": 0.7220640778541565, "learning_rate": 2.982824190050958e-06, "loss": 0.14519211053848266, "memory(GiB)": 32.82, "step": 440, "token_acc": 0.9519722211384651, "train_speed(iter/s)": 0.137445 }, { "epoch": 1.892001077295987, "eval_loss": 0.21764494478702545, "eval_runtime": 9.1305, "eval_samples_per_second": 16.428, "eval_steps_per_second": 4.162, "eval_token_acc": 0.9279342093501386, "step": 440 }, { "epoch": 1.913546997037436, "grad_norm": 0.696753203868866, "learning_rate": 2.8800929585596506e-06, "loss": 0.13140536546707154, "memory(GiB)": 32.82, "step": 445, "token_acc": 0.9466040818443505, "train_speed(iter/s)": 0.136801 }, { "epoch": 1.9350929167788848, "grad_norm": 0.7188719511032104, "learning_rate": 2.778441468230483e-06, "loss": 0.1310037851333618, "memory(GiB)": 32.82, "step": 450, "token_acc": 0.9548904329235702, "train_speed(iter/s)": 0.136955 }, { "epoch": 1.956638836520334, "grad_norm": 0.777593731880188, "learning_rate": 2.6779214936442056e-06, "loss": 0.1402130603790283, "memory(GiB)": 32.82, "step": 455, "token_acc": 0.9508758882829285, "train_speed(iter/s)": 0.137143 }, { "epoch": 1.978184756261783, "grad_norm": 0.6985222697257996, "learning_rate": 2.5785842330619038e-06, "loss": 0.13579378128051758, "memory(GiB)": 32.82, "step": 460, "token_acc": 0.949956619816068, "train_speed(iter/s)": 0.137296 }, { "epoch": 1.978184756261783, "eval_loss": 0.21736681461334229, "eval_runtime": 9.1209, "eval_samples_per_second": 16.446, "eval_steps_per_second": 4.166, "eval_token_acc": 0.9278984535621704, "step": 460 }, { "epoch": 1.9997306760032318, "grad_norm": 0.7867130637168884, "learning_rate": 2.480480282347961e-06, "loss": 0.13830192089080812, "memory(GiB)": 32.82, "step": 465, "token_acc": 0.9452121426518384, "train_speed(iter/s)": 0.136755 }, { "epoch": 2.0172367357931593, "grad_norm": 0.6745245456695557, "learning_rate": 2.383659609199873e-06, "loss": 0.11236014366149902, "memory(GiB)": 32.82, "step": 470, "token_acc": 0.9635579777931427, "train_speed(iter/s)": 0.137215 }, { "epoch": 2.0387826555346082, "grad_norm": 0.6941452026367188, "learning_rate": 2.2881715276979705e-06, "loss": 0.09268745183944702, "memory(GiB)": 32.82, "step": 475, "token_acc": 0.9666506095527476, "train_speed(iter/s)": 0.13734 }, { "epoch": 2.060328575276057, "grad_norm": 0.7112278342247009, "learning_rate": 2.1940646731880887e-06, "loss": 0.09282677173614502, "memory(GiB)": 32.82, "step": 480, "token_acc": 0.97280563727167, "train_speed(iter/s)": 0.137502 }, { "epoch": 2.060328575276057, "eval_loss": 0.23180951178073883, "eval_runtime": 9.1229, "eval_samples_per_second": 16.442, "eval_steps_per_second": 4.165, "eval_token_acc": 0.9271118262268705, "step": 480 }, { "epoch": 2.081874495017506, "grad_norm": 0.7650997638702393, "learning_rate": 2.101386977509907e-06, "loss": 0.10469940900802613, "memory(GiB)": 32.82, "step": 485, "token_acc": 0.9529728553554915, "train_speed(iter/s)": 0.137 }, { "epoch": 2.103420414758955, "grad_norm": 0.8435229659080505, "learning_rate": 2.010185644583641e-06, "loss": 0.0978783905506134, "memory(GiB)": 32.82, "step": 490, "token_acc": 0.9673804425410422, "train_speed(iter/s)": 0.137178 }, { "epoch": 2.124966334500404, "grad_norm": 0.7170566916465759, "learning_rate": 1.920507126367448e-06, "loss": 0.0900570273399353, "memory(GiB)": 32.82, "step": 495, "token_acc": 0.9669074241266755, "train_speed(iter/s)": 0.137328 }, { "epoch": 2.146512254241853, "grad_norm": 0.7567837834358215, "learning_rate": 1.8323970991978823e-06, "loss": 0.08487753868103028, "memory(GiB)": 32.82, "step": 500, "token_acc": 0.9661423064902595, "train_speed(iter/s)": 0.137435 }, { "epoch": 2.146512254241853, "eval_loss": 0.23648440837860107, "eval_runtime": 9.1362, "eval_samples_per_second": 16.418, "eval_steps_per_second": 4.159, "eval_token_acc": 0.9269151693930455, "step": 500 }, { "epoch": 2.168058173983302, "grad_norm": 0.679600715637207, "learning_rate": 1.7459004405253544e-06, "loss": 0.08969470262527465, "memory(GiB)": 32.82, "step": 505, "token_acc": 0.9617286751361162, "train_speed(iter/s)": 0.136847 }, { "epoch": 2.1896040937247507, "grad_norm": 0.6843417882919312, "learning_rate": 1.6610612060565235e-06, "loss": 0.08585838079452515, "memory(GiB)": 32.82, "step": 510, "token_acc": 0.9688960464822534, "train_speed(iter/s)": 0.137034 }, { "epoch": 2.2111500134661997, "grad_norm": 0.7679196000099182, "learning_rate": 1.5779226073152071e-06, "loss": 0.09446129202842712, "memory(GiB)": 32.82, "step": 515, "token_acc": 0.9684342952994548, "train_speed(iter/s)": 0.137304 }, { "epoch": 2.2326959332076486, "grad_norm": 0.7014828324317932, "learning_rate": 1.4965269896332884e-06, "loss": 0.09327901601791382, "memory(GiB)": 32.82, "step": 520, "token_acc": 0.969137028942314, "train_speed(iter/s)": 0.137493 }, { "epoch": 2.2326959332076486, "eval_loss": 0.23686107993125916, "eval_runtime": 9.0931, "eval_samples_per_second": 16.496, "eval_steps_per_second": 4.179, "eval_token_acc": 0.9274157504246, "step": 520 }, { "epoch": 2.254241852949098, "grad_norm": 0.8045688271522522, "learning_rate": 1.4169158105827768e-06, "loss": 0.09479656219482421, "memory(GiB)": 32.82, "step": 525, "token_acc": 0.95495518631706, "train_speed(iter/s)": 0.137017 }, { "epoch": 2.275787772690547, "grad_norm": 0.6673620939254761, "learning_rate": 1.3391296188600594e-06, "loss": 0.09322519898414612, "memory(GiB)": 32.82, "step": 530, "token_acc": 0.9673923560716013, "train_speed(iter/s)": 0.137133 }, { "epoch": 2.297333692431996, "grad_norm": 0.7161964774131775, "learning_rate": 1.2632080336330532e-06, "loss": 0.10077807903289795, "memory(GiB)": 32.82, "step": 535, "token_acc": 0.9664416709118696, "train_speed(iter/s)": 0.137298 }, { "epoch": 2.3188796121734447, "grad_norm": 0.6956729888916016, "learning_rate": 1.1891897243618184e-06, "loss": 0.09205458760261535, "memory(GiB)": 32.82, "step": 540, "token_acc": 0.9661544035506174, "train_speed(iter/s)": 0.137456 }, { "epoch": 2.3188796121734447, "eval_loss": 0.23653477430343628, "eval_runtime": 9.1444, "eval_samples_per_second": 16.404, "eval_steps_per_second": 4.156, "eval_token_acc": 0.9273621167426477, "step": 540 }, { "epoch": 2.3404255319148937, "grad_norm": 0.7258738279342651, "learning_rate": 1.1171123911028692e-06, "loss": 0.08955551385879516, "memory(GiB)": 32.82, "step": 545, "token_acc": 0.9569588438579956, "train_speed(iter/s)": 0.13699 }, { "epoch": 2.3619714516563426, "grad_norm": 0.8112572431564331, "learning_rate": 1.047012745307255e-06, "loss": 0.08942890167236328, "memory(GiB)": 32.82, "step": 550, "token_acc": 0.9669670722726794, "train_speed(iter/s)": 0.13713 }, { "epoch": 2.3835173713977915, "grad_norm": 0.7578905820846558, "learning_rate": 9.789264911221546e-07, "loss": 0.0977406620979309, "memory(GiB)": 32.82, "step": 555, "token_acc": 0.9592447966031011, "train_speed(iter/s)": 0.137349 }, { "epoch": 2.4050632911392404, "grad_norm": 0.6269740462303162, "learning_rate": 9.128883072055411e-07, "loss": 0.09372045993804931, "memory(GiB)": 32.82, "step": 560, "token_acc": 0.9644342269853612, "train_speed(iter/s)": 0.137606 }, { "epoch": 2.4050632911392404, "eval_loss": 0.23624224960803986, "eval_runtime": 9.1143, "eval_samples_per_second": 16.458, "eval_steps_per_second": 4.169, "eval_token_acc": 0.92780906409225, "step": 560 }, { "epoch": 2.4266092108806894, "grad_norm": 0.7092756032943726, "learning_rate": 8.489318290631454e-07, "loss": 0.09312183260917664, "memory(GiB)": 32.82, "step": 565, "token_acc": 0.9584454627003176, "train_speed(iter/s)": 0.137151 }, { "epoch": 2.4481551306221383, "grad_norm": 0.6616098880767822, "learning_rate": 7.870896319167548e-07, "loss": 0.0871224045753479, "memory(GiB)": 32.82, "step": 570, "token_acc": 0.9698027314112291, "train_speed(iter/s)": 0.137275 }, { "epoch": 2.4697010503635872, "grad_norm": 0.7446200251579285, "learning_rate": 7.273932141125256e-07, "loss": 0.09404770135879517, "memory(GiB)": 32.82, "step": 575, "token_acc": 0.9641330382318164, "train_speed(iter/s)": 0.137433 }, { "epoch": 2.4912469701050366, "grad_norm": 0.7769956588745117, "learning_rate": 6.698729810778065e-07, "loss": 0.09552533030509949, "memory(GiB)": 32.82, "step": 580, "token_acc": 0.9669471799462847, "train_speed(iter/s)": 0.137553 }, { "epoch": 2.4912469701050366, "eval_loss": 0.2381029576063156, "eval_runtime": 9.122, "eval_samples_per_second": 16.444, "eval_steps_per_second": 4.166, "eval_token_acc": 0.9273084830606955, "step": 580 }, { "epoch": 2.5127928898464855, "grad_norm": 0.6479668021202087, "learning_rate": 6.145582298346153e-07, "loss": 0.08736968636512757, "memory(GiB)": 32.82, "step": 585, "token_acc": 0.955453039981074, "train_speed(iter/s)": 0.137018 }, { "epoch": 2.5343388095879344, "grad_norm": 0.7301574349403381, "learning_rate": 5.614771340776559e-07, "loss": 0.08440666198730469, "memory(GiB)": 32.82, "step": 590, "token_acc": 0.9689230967409507, "train_speed(iter/s)": 0.13714 }, { "epoch": 2.5558847293293834, "grad_norm": 0.850368082523346, "learning_rate": 5.106567298245008e-07, "loss": 0.09577755331993103, "memory(GiB)": 32.82, "step": 595, "token_acc": 0.9682460066363145, "train_speed(iter/s)": 0.137251 }, { "epoch": 2.5774306490708323, "grad_norm": 0.7679170966148376, "learning_rate": 4.6212290164521554e-07, "loss": 0.0867478609085083, "memory(GiB)": 32.82, "step": 600, "token_acc": 0.9678065479442077, "train_speed(iter/s)": 0.137365 }, { "epoch": 2.5774306490708323, "eval_loss": 0.23731745779514313, "eval_runtime": 9.1263, "eval_samples_per_second": 16.436, "eval_steps_per_second": 4.164, "eval_token_acc": 0.9279342093501386, "step": 600 }, { "epoch": 2.5989765688122812, "grad_norm": 0.7298761010169983, "learning_rate": 4.159003694784647e-07, "loss": 0.08937226533889771, "memory(GiB)": 32.82, "step": 605, "token_acc": 0.9554988592596227, "train_speed(iter/s)": 0.136957 }, { "epoch": 2.62052248855373, "grad_norm": 0.6361852288246155, "learning_rate": 3.7201267604080436e-07, "loss": 0.0883003294467926, "memory(GiB)": 32.82, "step": 610, "token_acc": 0.9694297662657633, "train_speed(iter/s)": 0.137058 }, { "epoch": 2.642068408295179, "grad_norm": 0.6873012781143188, "learning_rate": 3.3048217483556743e-07, "loss": 0.08733320236206055, "memory(GiB)": 32.82, "step": 615, "token_acc": 0.9654447816950735, "train_speed(iter/s)": 0.137163 }, { "epoch": 2.663614328036628, "grad_norm": 0.819065511226654, "learning_rate": 2.9133001876746004e-07, "loss": 0.09572435617446899, "memory(GiB)": 32.82, "step": 620, "token_acc": 0.9705064194008559, "train_speed(iter/s)": 0.137301 }, { "epoch": 2.663614328036628, "eval_loss": 0.23720206320285797, "eval_runtime": 9.1052, "eval_samples_per_second": 16.474, "eval_steps_per_second": 4.173, "eval_token_acc": 0.9275945293644409, "step": 620 }, { "epoch": 2.685160247778077, "grad_norm": 0.6891322731971741, "learning_rate": 2.545761493686666e-07, "loss": 0.0880233645439148, "memory(GiB)": 32.82, "step": 625, "token_acc": 0.953374825625067, "train_speed(iter/s)": 0.136783 }, { "epoch": 2.706706167519526, "grad_norm": 0.8032283782958984, "learning_rate": 2.2023928664194229e-07, "loss": 0.08428794145584106, "memory(GiB)": 32.82, "step": 630, "token_acc": 0.9709327045726875, "train_speed(iter/s)": 0.136943 }, { "epoch": 2.728252087260975, "grad_norm": 0.7355332970619202, "learning_rate": 1.8833691952587829e-07, "loss": 0.09050858616828919, "memory(GiB)": 32.82, "step": 635, "token_acc": 0.965556864209589, "train_speed(iter/s)": 0.13705 }, { "epoch": 2.7497980070024237, "grad_norm": 0.6962365508079529, "learning_rate": 1.5888529698718347e-07, "loss": 0.08718093633651733, "memory(GiB)": 32.82, "step": 640, "token_acc": 0.9714373207872832, "train_speed(iter/s)": 0.137146 }, { "epoch": 2.7497980070024237, "eval_loss": 0.23689626157283783, "eval_runtime": 9.1808, "eval_samples_per_second": 16.338, "eval_steps_per_second": 4.139, "eval_token_acc": 0.9280593546080271, "step": 640 }, { "epoch": 2.7713439267438726, "grad_norm": 0.6887531280517578, "learning_rate": 1.3189941974453502e-07, "loss": 0.09717867970466613, "memory(GiB)": 32.82, "step": 645, "token_acc": 0.9555589965933725, "train_speed(iter/s)": 0.136719 }, { "epoch": 2.7928898464853216, "grad_norm": 0.7766687273979187, "learning_rate": 1.0739303262819301e-07, "loss": 0.09383597373962402, "memory(GiB)": 32.82, "step": 650, "token_acc": 0.9672413793103448, "train_speed(iter/s)": 0.136821 }, { "epoch": 2.814435766226771, "grad_norm": 0.6897131204605103, "learning_rate": 8.537861757929422e-08, "loss": 0.09179171323776245, "memory(GiB)": 32.82, "step": 655, "token_acc": 0.9684824536645028, "train_speed(iter/s)": 0.136983 }, { "epoch": 2.83598168596822, "grad_norm": 0.7220735549926758, "learning_rate": 6.58673872923693e-08, "loss": 0.08836306929588318, "memory(GiB)": 35.18, "step": 660, "token_acc": 0.9739326289291511, "train_speed(iter/s)": 0.137137 }, { "epoch": 2.83598168596822, "eval_loss": 0.23682241141796112, "eval_runtime": 9.1257, "eval_samples_per_second": 16.437, "eval_steps_per_second": 4.164, "eval_token_acc": 0.9278805756681863, "step": 660 }, { "epoch": 2.857527605709669, "grad_norm": 0.7643243670463562, "learning_rate": 4.88692795043344e-08, "loss": 0.09028244614601136, "memory(GiB)": 35.18, "step": 665, "token_acc": 0.9558236887466142, "train_speed(iter/s)": 0.13671 }, { "epoch": 2.8790735254511177, "grad_norm": 0.8298976421356201, "learning_rate": 3.439295193286174e-08, "loss": 0.0949346661567688, "memory(GiB)": 35.18, "step": 670, "token_acc": 0.9662216181643748, "train_speed(iter/s)": 0.136882 }, { "epoch": 2.9006194451925666, "grad_norm": 0.7280375361442566, "learning_rate": 2.2445777866709208e-08, "loss": 0.08288905620574952, "memory(GiB)": 35.18, "step": 675, "token_acc": 0.9704877806147827, "train_speed(iter/s)": 0.136978 }, { "epoch": 2.9221653649340156, "grad_norm": 0.7563872337341309, "learning_rate": 1.3033842410251074e-08, "loss": 0.0895566999912262, "memory(GiB)": 35.18, "step": 680, "token_acc": 0.9717285366203239, "train_speed(iter/s)": 0.137088 }, { "epoch": 2.9221653649340156, "eval_loss": 0.23678408563137054, "eval_runtime": 9.1203, "eval_samples_per_second": 16.447, "eval_steps_per_second": 4.167, "eval_token_acc": 0.928023598820059, "step": 680 }, { "epoch": 2.9437112846754645, "grad_norm": 0.7505417466163635, "learning_rate": 6.16193938412557e-09, "loss": 0.09423564076423645, "memory(GiB)": 35.18, "step": 685, "token_acc": 0.9571300622110334, "train_speed(iter/s)": 0.136664 }, { "epoch": 2.9652572044169134, "grad_norm": 0.7332241535186768, "learning_rate": 1.8335688835802169e-09, "loss": 0.08703168034553528, "memory(GiB)": 35.18, "step": 690, "token_acc": 0.9672596800717596, "train_speed(iter/s)": 0.1368 }, { "epoch": 2.9868031241583624, "grad_norm": 0.7780314087867737, "learning_rate": 5.093549575119205e-11, "loss": 0.08567211627960206, "memory(GiB)": 35.18, "step": 695, "token_acc": 0.9732243229432805, "train_speed(iter/s)": 0.136919 }, { "epoch": 2.9911123081066524, "eval_loss": 0.23683081567287445, "eval_runtime": 9.116, "eval_samples_per_second": 16.455, "eval_steps_per_second": 4.168, "eval_token_acc": 0.9277911861982658, "step": 696 } ], "logging_steps": 5, "max_steps": 696, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.596747044568433e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }