{ "best_global_step": 300, "best_metric": 0.22214438, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v34-20250511-160020/checkpoint-300", "epoch": 2.9826262626262627, "eval_steps": 20, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006464646464646465, "grad_norm": 2.782066822052002, "learning_rate": 9.999884400986087e-06, "loss": 0.4351097345352173, "memory(GiB)": 29.0, "step": 1, "token_acc": 0.8841401366477686, "train_speed(iter/s)": 0.064982 }, { "epoch": 0.03232323232323232, "grad_norm": 1.5506770610809326, "learning_rate": 9.997110291906109e-06, "loss": 0.3617793917655945, "memory(GiB)": 29.0, "step": 5, "token_acc": 0.884986440225848, "train_speed(iter/s)": 0.119813 }, { "epoch": 0.06464646464646465, "grad_norm": 1.051324486732483, "learning_rate": 9.988444507789584e-06, "loss": 0.2978231906890869, "memory(GiB)": 29.01, "step": 10, "token_acc": 0.9110293908036147, "train_speed(iter/s)": 0.136021 }, { "epoch": 0.09696969696969697, "grad_norm": 1.0532618761062622, "learning_rate": 9.97401266428502e-06, "loss": 0.2773271083831787, "memory(GiB)": 29.01, "step": 15, "token_acc": 0.90916535639413, "train_speed(iter/s)": 0.140683 }, { "epoch": 0.1292929292929293, "grad_norm": 1.028316617012024, "learning_rate": 9.953831442918418e-06, "loss": 0.26010329723358155, "memory(GiB)": 29.01, "step": 20, "token_acc": 0.9205904810384322, "train_speed(iter/s)": 0.14413 }, { "epoch": 0.1292929292929293, "eval_loss": 0.2733669579029083, "eval_runtime": 4.7919, "eval_samples_per_second": 20.868, "eval_steps_per_second": 5.217, "eval_token_acc": 0.9177626754021415, "step": 20 }, { "epoch": 0.16161616161616163, "grad_norm": 0.9095243811607361, "learning_rate": 9.927924170825266e-06, "loss": 0.255949592590332, "memory(GiB)": 29.01, "step": 25, "token_acc": 0.9019129441690243, "train_speed(iter/s)": 0.132575 }, { "epoch": 0.19393939393939394, "grad_norm": 0.8502461314201355, "learning_rate": 9.896320793787106e-06, "loss": 0.254239821434021, "memory(GiB)": 29.01, "step": 30, "token_acc": 0.9101830846407004, "train_speed(iter/s)": 0.137917 }, { "epoch": 0.22626262626262628, "grad_norm": 0.952416181564331, "learning_rate": 9.859057841617709e-06, "loss": 0.25095329284667967, "memory(GiB)": 29.01, "step": 35, "token_acc": 0.9192982456140351, "train_speed(iter/s)": 0.139932 }, { "epoch": 0.2585858585858586, "grad_norm": 0.9308900237083435, "learning_rate": 9.816178385938867e-06, "loss": 0.2500969886779785, "memory(GiB)": 29.01, "step": 40, "token_acc": 0.9180038460325967, "train_speed(iter/s)": 0.141458 }, { "epoch": 0.2585858585858586, "eval_loss": 0.2549287676811218, "eval_runtime": 4.7618, "eval_samples_per_second": 21.0, "eval_steps_per_second": 5.25, "eval_token_acc": 0.9216007431672615, "step": 40 }, { "epoch": 0.2909090909090909, "grad_norm": 0.8023675084114075, "learning_rate": 9.767731990394638e-06, "loss": 0.24509100914001464, "memory(GiB)": 29.01, "step": 45, "token_acc": 0.9136364761503998, "train_speed(iter/s)": 0.136162 }, { "epoch": 0.32323232323232326, "grad_norm": 0.8799707293510437, "learning_rate": 9.71377465336155e-06, "loss": 0.2456353187561035, "memory(GiB)": 29.01, "step": 50, "token_acc": 0.9201317882299478, "train_speed(iter/s)": 0.138458 }, { "epoch": 0.35555555555555557, "grad_norm": 0.8880809545516968, "learning_rate": 9.654368743221022e-06, "loss": 0.22914605140686034, "memory(GiB)": 29.01, "step": 55, "token_acc": 0.9370015671251959, "train_speed(iter/s)": 0.139699 }, { "epoch": 0.3878787878787879, "grad_norm": 0.9086585640907288, "learning_rate": 9.589582926268798e-06, "loss": 0.25642530918121337, "memory(GiB)": 29.01, "step": 60, "token_acc": 0.9190494127315687, "train_speed(iter/s)": 0.141257 }, { "epoch": 0.3878787878787879, "eval_loss": 0.24375928938388824, "eval_runtime": 4.8065, "eval_samples_per_second": 20.805, "eval_steps_per_second": 5.201, "eval_token_acc": 0.9235564464870679, "step": 60 }, { "epoch": 0.4202020202020202, "grad_norm": 0.6710211634635925, "learning_rate": 9.519492087344724e-06, "loss": 0.2273104190826416, "memory(GiB)": 29.01, "step": 65, "token_acc": 0.9177157538245941, "train_speed(iter/s)": 0.136434 }, { "epoch": 0.45252525252525255, "grad_norm": 0.8824236392974854, "learning_rate": 9.444177243274619e-06, "loss": 0.24116811752319336, "memory(GiB)": 29.01, "step": 70, "token_acc": 0.9235306363194782, "train_speed(iter/s)": 0.138651 }, { "epoch": 0.48484848484848486, "grad_norm": 0.7853980660438538, "learning_rate": 9.363725449224281e-06, "loss": 0.2349745512008667, "memory(GiB)": 29.01, "step": 75, "token_acc": 0.9231191335740072, "train_speed(iter/s)": 0.140002 }, { "epoch": 0.5171717171717172, "grad_norm": 0.7967098951339722, "learning_rate": 9.278229698073889e-06, "loss": 0.22397637367248535, "memory(GiB)": 29.01, "step": 80, "token_acc": 0.9202410885963614, "train_speed(iter/s)": 0.140609 }, { "epoch": 0.5171717171717172, "eval_loss": 0.235035702586174, "eval_runtime": 4.8643, "eval_samples_per_second": 20.558, "eval_steps_per_second": 5.139, "eval_token_acc": 0.9263922163007872, "step": 80 }, { "epoch": 0.5494949494949495, "grad_norm": 0.867683470249176, "learning_rate": 9.187788812929074e-06, "loss": 0.22470014095306395, "memory(GiB)": 29.01, "step": 85, "token_acc": 0.9176256656708663, "train_speed(iter/s)": 0.137464 }, { "epoch": 0.5818181818181818, "grad_norm": 0.9094910621643066, "learning_rate": 9.092507332892968e-06, "loss": 0.2336829423904419, "memory(GiB)": 29.01, "step": 90, "token_acc": 0.9196234077902898, "train_speed(iter/s)": 0.13842 }, { "epoch": 0.6141414141414141, "grad_norm": 0.8729245662689209, "learning_rate": 8.992495392231195e-06, "loss": 0.22902388572692872, "memory(GiB)": 29.01, "step": 95, "token_acc": 0.9179311961946873, "train_speed(iter/s)": 0.139322 }, { "epoch": 0.6464646464646465, "grad_norm": 0.7870821356773376, "learning_rate": 8.88786859306952e-06, "loss": 0.22554943561553956, "memory(GiB)": 29.01, "step": 100, "token_acc": 0.9258398976637716, "train_speed(iter/s)": 0.140224 }, { "epoch": 0.6464646464646465, "eval_loss": 0.23361265659332275, "eval_runtime": 4.7619, "eval_samples_per_second": 21.0, "eval_steps_per_second": 5.25, "eval_token_acc": 0.9272967290861976, "step": 100 }, { "epoch": 0.6787878787878788, "grad_norm": 0.8173748850822449, "learning_rate": 8.778747871771293e-06, "loss": 0.2284949541091919, "memory(GiB)": 29.01, "step": 105, "token_acc": 0.9134603158456883, "train_speed(iter/s)": 0.138167 }, { "epoch": 0.7111111111111111, "grad_norm": 0.8524425625801086, "learning_rate": 8.665259359149132e-06, "loss": 0.220612096786499, "memory(GiB)": 29.01, "step": 110, "token_acc": 0.9307908237343916, "train_speed(iter/s)": 0.138956 }, { "epoch": 0.7434343434343434, "grad_norm": 0.7814698219299316, "learning_rate": 8.547534234672435e-06, "loss": 0.21316018104553222, "memory(GiB)": 29.01, "step": 115, "token_acc": 0.9195374535793002, "train_speed(iter/s)": 0.139738 }, { "epoch": 0.7757575757575758, "grad_norm": 0.8820337057113647, "learning_rate": 8.425708574839221e-06, "loss": 0.2112647533416748, "memory(GiB)": 29.01, "step": 120, "token_acc": 0.9269834018577737, "train_speed(iter/s)": 0.140382 }, { "epoch": 0.7757575757575758, "eval_loss": 0.2290322184562683, "eval_runtime": 4.7553, "eval_samples_per_second": 21.029, "eval_steps_per_second": 5.257, "eval_token_acc": 0.9274189605436856, "step": 120 }, { "epoch": 0.8080808080808081, "grad_norm": 0.897212564945221, "learning_rate": 8.299923195887599e-06, "loss": 0.21470861434936522, "memory(GiB)": 29.01, "step": 125, "token_acc": 0.9227193492155723, "train_speed(iter/s)": 0.138516 }, { "epoch": 0.8404040404040404, "grad_norm": 0.8352209329605103, "learning_rate": 8.170323491028625e-06, "loss": 0.2259157657623291, "memory(GiB)": 29.01, "step": 130, "token_acc": 0.9266622410118445, "train_speed(iter/s)": 0.138949 }, { "epoch": 0.8727272727272727, "grad_norm": 0.8725154399871826, "learning_rate": 8.03705926238874e-06, "loss": 0.22986299991607667, "memory(GiB)": 29.01, "step": 135, "token_acc": 0.9203431372549019, "train_speed(iter/s)": 0.139597 }, { "epoch": 0.9050505050505051, "grad_norm": 0.7126485109329224, "learning_rate": 7.900284547855992e-06, "loss": 0.21029348373413087, "memory(GiB)": 29.01, "step": 140, "token_acc": 0.9250847116449156, "train_speed(iter/s)": 0.140087 }, { "epoch": 0.9050505050505051, "eval_loss": 0.22619588673114777, "eval_runtime": 4.7708, "eval_samples_per_second": 20.961, "eval_steps_per_second": 5.24, "eval_token_acc": 0.9281034567056178, "step": 140 }, { "epoch": 0.9373737373737374, "grad_norm": 0.855863094329834, "learning_rate": 7.760157443030234e-06, "loss": 0.22026867866516114, "memory(GiB)": 29.01, "step": 145, "token_acc": 0.9182781919850885, "train_speed(iter/s)": 0.1384 }, { "epoch": 0.9696969696969697, "grad_norm": 0.757979154586792, "learning_rate": 7.616839918483061e-06, "loss": 0.21316237449645997, "memory(GiB)": 29.01, "step": 150, "token_acc": 0.9336977031687163, "train_speed(iter/s)": 0.13894 }, { "epoch": 1.0, "grad_norm": 0.7608435750007629, "learning_rate": 7.470497632538743e-06, "loss": 0.2081432580947876, "memory(GiB)": 29.01, "step": 155, "token_acc": 0.9317403683281076, "train_speed(iter/s)": 0.139608 }, { "epoch": 1.0323232323232323, "grad_norm": 0.6974760293960571, "learning_rate": 7.321299739792553e-06, "loss": 0.15149842500686644, "memory(GiB)": 29.01, "step": 160, "token_acc": 0.9457131222002945, "train_speed(iter/s)": 0.140097 }, { "epoch": 1.0323232323232323, "eval_loss": 0.22719089686870575, "eval_runtime": 4.7699, "eval_samples_per_second": 20.965, "eval_steps_per_second": 5.241, "eval_token_acc": 0.9280056715396274, "step": 160 }, { "epoch": 1.0646464646464646, "grad_norm": 0.872110903263092, "learning_rate": 7.169418695587791e-06, "loss": 0.14826536178588867, "memory(GiB)": 29.01, "step": 165, "token_acc": 0.9360687748839804, "train_speed(iter/s)": 0.138838 }, { "epoch": 1.096969696969697, "grad_norm": 1.0611625909805298, "learning_rate": 7.015030056677559e-06, "loss": 0.15616699457168579, "memory(GiB)": 29.01, "step": 170, "token_acc": 0.9440032655484637, "train_speed(iter/s)": 0.13965 }, { "epoch": 1.1292929292929292, "grad_norm": 0.7496768236160278, "learning_rate": 6.858312278301638e-06, "loss": 0.1349432110786438, "memory(GiB)": 29.01, "step": 175, "token_acc": 0.9524397395142126, "train_speed(iter/s)": 0.140043 }, { "epoch": 1.1616161616161615, "grad_norm": 0.8625680208206177, "learning_rate": 6.699446507913083e-06, "loss": 0.13590528964996337, "memory(GiB)": 29.01, "step": 180, "token_acc": 0.9526945902370416, "train_speed(iter/s)": 0.140334 }, { "epoch": 1.1616161616161615, "eval_loss": 0.22827181220054626, "eval_runtime": 4.7724, "eval_samples_per_second": 20.954, "eval_steps_per_second": 5.238, "eval_token_acc": 0.9284212584950863, "step": 180 }, { "epoch": 1.1939393939393939, "grad_norm": 0.7404259443283081, "learning_rate": 6.53861637579291e-06, "loss": 0.13479866981506347, "memory(GiB)": 29.01, "step": 185, "token_acc": 0.9389863201441587, "train_speed(iter/s)": 0.139057 }, { "epoch": 1.2262626262626264, "grad_norm": 0.7891139388084412, "learning_rate": 6.376007782794926e-06, "loss": 0.14628617763519286, "memory(GiB)": 29.01, "step": 190, "token_acc": 0.9526434549141698, "train_speed(iter/s)": 0.139363 }, { "epoch": 1.2585858585858585, "grad_norm": 0.8704652190208435, "learning_rate": 6.211808685466063e-06, "loss": 0.15462675094604492, "memory(GiB)": 29.01, "step": 195, "token_acc": 0.9470661110485415, "train_speed(iter/s)": 0.14002 }, { "epoch": 1.290909090909091, "grad_norm": 0.8007650971412659, "learning_rate": 6.046208878790543e-06, "loss": 0.1429288625717163, "memory(GiB)": 29.01, "step": 200, "token_acc": 0.9471947194719472, "train_speed(iter/s)": 0.140432 }, { "epoch": 1.290909090909091, "eval_loss": 0.22773738205432892, "eval_runtime": 4.935, "eval_samples_per_second": 20.263, "eval_steps_per_second": 5.066, "eval_token_acc": 0.9292768786975016, "step": 200 }, { "epoch": 1.3232323232323233, "grad_norm": 0.7445653676986694, "learning_rate": 5.879399776809047e-06, "loss": 0.1418352484703064, "memory(GiB)": 29.01, "step": 205, "token_acc": 0.9445697474871292, "train_speed(iter/s)": 0.139341 }, { "epoch": 1.3555555555555556, "grad_norm": 0.7971433401107788, "learning_rate": 5.711574191366427e-06, "loss": 0.14476661682128905, "memory(GiB)": 29.01, "step": 210, "token_acc": 0.9523753004361888, "train_speed(iter/s)": 0.139673 }, { "epoch": 1.387878787878788, "grad_norm": 0.7370967268943787, "learning_rate": 5.542926109243727e-06, "loss": 0.13473730087280272, "memory(GiB)": 29.01, "step": 215, "token_acc": 0.955563595697793, "train_speed(iter/s)": 0.140102 }, { "epoch": 1.4202020202020202, "grad_norm": 0.8203840851783752, "learning_rate": 5.373650467932122e-06, "loss": 0.15003204345703125, "memory(GiB)": 29.01, "step": 220, "token_acc": 0.9451295603024297, "train_speed(iter/s)": 0.140456 }, { "epoch": 1.4202020202020202, "eval_loss": 0.2280927300453186, "eval_runtime": 4.7892, "eval_samples_per_second": 20.88, "eval_steps_per_second": 5.22, "eval_token_acc": 0.9299124822764386, "step": 220 }, { "epoch": 1.4525252525252526, "grad_norm": 0.7488767504692078, "learning_rate": 5.2039429303079294e-06, "loss": 0.1540588140487671, "memory(GiB)": 29.01, "step": 225, "token_acc": 0.9373329232807363, "train_speed(iter/s)": 0.139519 }, { "epoch": 1.4848484848484849, "grad_norm": 0.7238897085189819, "learning_rate": 5.033999658469174e-06, "loss": 0.1424393892288208, "memory(GiB)": 29.01, "step": 230, "token_acc": 0.9550141601917073, "train_speed(iter/s)": 0.139888 }, { "epoch": 1.5171717171717172, "grad_norm": 0.7469899654388428, "learning_rate": 4.864017086995112e-06, "loss": 0.13640257120132446, "memory(GiB)": 29.01, "step": 235, "token_acc": 0.9527641822422231, "train_speed(iter/s)": 0.140199 }, { "epoch": 1.5494949494949495, "grad_norm": 0.8132858276367188, "learning_rate": 4.694191695890788e-06, "loss": 0.1473687171936035, "memory(GiB)": 29.01, "step": 240, "token_acc": 0.9434306569343066, "train_speed(iter/s)": 0.14057 }, { "epoch": 1.5494949494949495, "eval_loss": 0.22689995169639587, "eval_runtime": 4.819, "eval_samples_per_second": 20.751, "eval_steps_per_second": 5.188, "eval_token_acc": 0.9290813083655209, "step": 240 }, { "epoch": 1.5818181818181818, "grad_norm": 0.8139073848724365, "learning_rate": 4.524719783479088e-06, "loss": 0.14017653465270996, "memory(GiB)": 29.01, "step": 245, "token_acc": 0.9365190094175095, "train_speed(iter/s)": 0.139627 }, { "epoch": 1.614141414141414, "grad_norm": 0.8636942505836487, "learning_rate": 4.355797239502807e-06, "loss": 0.13665199279785156, "memory(GiB)": 29.01, "step": 250, "token_acc": 0.9555003459337065, "train_speed(iter/s)": 0.139706 }, { "epoch": 1.6464646464646466, "grad_norm": 0.8042647838592529, "learning_rate": 4.187619318698971e-06, "loss": 0.14160826206207275, "memory(GiB)": 29.01, "step": 255, "token_acc": 0.9507644993762149, "train_speed(iter/s)": 0.140117 }, { "epoch": 1.6787878787878787, "grad_norm": 0.7862452268600464, "learning_rate": 4.020380415107167e-06, "loss": 0.1396080732345581, "memory(GiB)": 29.01, "step": 260, "token_acc": 0.953097139313125, "train_speed(iter/s)": 0.14034 }, { "epoch": 1.6787878787878787, "eval_loss": 0.22532083094120026, "eval_runtime": 4.797, "eval_samples_per_second": 20.846, "eval_steps_per_second": 5.212, "eval_token_acc": 0.9303036229403999, "step": 260 }, { "epoch": 1.7111111111111112, "grad_norm": 0.8368792533874512, "learning_rate": 3.854273837372724e-06, "loss": 0.14725687503814697, "memory(GiB)": 29.01, "step": 265, "token_acc": 0.9422004865193594, "train_speed(iter/s)": 0.139548 }, { "epoch": 1.7434343434343433, "grad_norm": 0.7835624814033508, "learning_rate": 3.689491585304491e-06, "loss": 0.14394346475601197, "memory(GiB)": 29.01, "step": 270, "token_acc": 0.9471722928540114, "train_speed(iter/s)": 0.139821 }, { "epoch": 1.7757575757575759, "grad_norm": 0.7341930866241455, "learning_rate": 3.526224127945479e-06, "loss": 0.14667458534240724, "memory(GiB)": 29.01, "step": 275, "token_acc": 0.9464715744551135, "train_speed(iter/s)": 0.140255 }, { "epoch": 1.808080808080808, "grad_norm": 0.7114227414131165, "learning_rate": 3.3646601834128924e-06, "loss": 0.132787024974823, "memory(GiB)": 29.01, "step": 280, "token_acc": 0.9507424168754909, "train_speed(iter/s)": 0.140468 }, { "epoch": 1.808080808080808, "eval_loss": 0.22350256145000458, "eval_runtime": 4.7736, "eval_samples_per_second": 20.949, "eval_steps_per_second": 5.237, "eval_token_acc": 0.9308658876448442, "step": 280 }, { "epoch": 1.8404040404040405, "grad_norm": 0.7501634359359741, "learning_rate": 3.204986500762006e-06, "loss": 0.13066763877868653, "memory(GiB)": 29.01, "step": 285, "token_acc": 0.9368674340146854, "train_speed(iter/s)": 0.139644 }, { "epoch": 1.8727272727272726, "grad_norm": 0.7324647307395935, "learning_rate": 3.0473876441260786e-06, "loss": 0.1387048363685608, "memory(GiB)": 29.01, "step": 290, "token_acc": 0.9532809871003926, "train_speed(iter/s)": 0.139906 }, { "epoch": 1.905050505050505, "grad_norm": 0.8160727024078369, "learning_rate": 2.8920457793817507e-06, "loss": 0.13977317810058593, "memory(GiB)": 29.01, "step": 295, "token_acc": 0.9515957446808511, "train_speed(iter/s)": 0.14009 }, { "epoch": 1.9373737373737374, "grad_norm": 0.8773327469825745, "learning_rate": 2.7391404635865725e-06, "loss": 0.15099945068359374, "memory(GiB)": 29.01, "step": 300, "token_acc": 0.9496292015795049, "train_speed(iter/s)": 0.140321 }, { "epoch": 1.9373737373737374, "eval_loss": 0.22214438021183014, "eval_runtime": 4.7787, "eval_samples_per_second": 20.926, "eval_steps_per_second": 5.232, "eval_token_acc": 0.930939226519337, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 0.7400924563407898, "learning_rate": 2.5888484374320033e-06, "loss": 0.1311182498931885, "memory(GiB)": 29.01, "step": 305, "token_acc": 0.944199668256272, "train_speed(iter/s)": 0.139576 }, { "epoch": 2.0, "grad_norm": 0.9168914556503296, "learning_rate": 2.4413434209518137e-06, "loss": 0.14706777334213256, "memory(GiB)": 29.01, "step": 310, "token_acc": 0.9542029226862068, "train_speed(iter/s)": 0.139976 }, { "epoch": 2.0323232323232325, "grad_norm": 0.6356860399246216, "learning_rate": 2.296795912722014e-06, "loss": 0.09936747550964356, "memory(GiB)": 29.01, "step": 315, "token_acc": 0.9699422322449106, "train_speed(iter/s)": 0.140169 }, { "epoch": 2.0646464646464646, "grad_norm": 0.7481921911239624, "learning_rate": 2.1553729927843894e-06, "loss": 0.09395751953125, "memory(GiB)": 29.01, "step": 320, "token_acc": 0.9698759156824638, "train_speed(iter/s)": 0.1404 }, { "epoch": 2.0646464646464646, "eval_loss": 0.23354972898960114, "eval_runtime": 4.7345, "eval_samples_per_second": 21.122, "eval_steps_per_second": 5.28, "eval_token_acc": 0.9305480858553757, "step": 320 }, { "epoch": 2.096969696969697, "grad_norm": 0.7663989663124084, "learning_rate": 2.017238129521506e-06, "loss": 0.09232854843139648, "memory(GiB)": 29.01, "step": 325, "token_acc": 0.9550909621122053, "train_speed(iter/s)": 0.139732 }, { "epoch": 2.1292929292929292, "grad_norm": 0.8547663688659668, "learning_rate": 1.8825509907063328e-06, "loss": 0.09894357919692993, "memory(GiB)": 29.01, "step": 330, "token_acc": 0.9668018320254383, "train_speed(iter/s)": 0.140046 }, { "epoch": 2.1616161616161618, "grad_norm": 0.7185168266296387, "learning_rate": 1.7514672589449378e-06, "loss": 0.08718444108963012, "memory(GiB)": 29.01, "step": 335, "token_acc": 0.9681554248986598, "train_speed(iter/s)": 0.140153 }, { "epoch": 2.193939393939394, "grad_norm": 0.7746614813804626, "learning_rate": 1.6241384517255854e-06, "loss": 0.09638407826423645, "memory(GiB)": 29.01, "step": 340, "token_acc": 0.9620331882302884, "train_speed(iter/s)": 0.14039 }, { "epoch": 2.193939393939394, "eval_loss": 0.24958540499210358, "eval_runtime": 4.75, "eval_samples_per_second": 21.052, "eval_steps_per_second": 5.263, "eval_token_acc": 0.9302791766489024, "step": 340 }, { "epoch": 2.2262626262626264, "grad_norm": 0.742405354976654, "learning_rate": 1.500711746282192e-06, "loss": 0.09411965012550354, "memory(GiB)": 29.01, "step": 345, "token_acc": 0.9519363664569994, "train_speed(iter/s)": 0.139752 }, { "epoch": 2.2585858585858585, "grad_norm": 0.764021098613739, "learning_rate": 1.3813298094746491e-06, "loss": 0.08621931076049805, "memory(GiB)": 29.01, "step": 350, "token_acc": 0.9720944103612815, "train_speed(iter/s)": 0.139887 }, { "epoch": 2.290909090909091, "grad_norm": 0.773617684841156, "learning_rate": 1.2661306328825818e-06, "loss": 0.09195576310157776, "memory(GiB)": 29.01, "step": 355, "token_acc": 0.9652025497230834, "train_speed(iter/s)": 0.140164 }, { "epoch": 2.323232323232323, "grad_norm": 0.6916205286979675, "learning_rate": 1.1552473733031893e-06, "loss": 0.09625710248947143, "memory(GiB)": 29.01, "step": 360, "token_acc": 0.964334548769371, "train_speed(iter/s)": 0.140497 }, { "epoch": 2.323232323232323, "eval_loss": 0.24684520065784454, "eval_runtime": 4.7402, "eval_samples_per_second": 21.096, "eval_steps_per_second": 5.274, "eval_token_acc": 0.9305236395638782, "step": 360 }, { "epoch": 2.3555555555555556, "grad_norm": 0.7022070288658142, "learning_rate": 1.0488081988375493e-06, "loss": 0.09287334084510804, "memory(GiB)": 29.01, "step": 365, "token_acc": 0.9523647112323783, "train_speed(iter/s)": 0.140054 }, { "epoch": 2.3878787878787877, "grad_norm": 0.7242087721824646, "learning_rate": 9.469361407432431e-07, "loss": 0.08903356790542602, "memory(GiB)": 29.01, "step": 370, "token_acc": 0.9695336787564767, "train_speed(iter/s)": 0.140188 }, { "epoch": 2.4202020202020202, "grad_norm": 0.6964623332023621, "learning_rate": 8.497489512245971e-07, "loss": 0.0938454508781433, "memory(GiB)": 29.01, "step": 375, "token_acc": 0.9731585150265174, "train_speed(iter/s)": 0.140328 }, { "epoch": 2.4525252525252528, "grad_norm": 0.7716543078422546, "learning_rate": 7.573589673248833e-07, "loss": 0.09390033483505249, "memory(GiB)": 29.01, "step": 380, "token_acc": 0.9665334135210669, "train_speed(iter/s)": 0.140549 }, { "epoch": 2.4525252525252528, "eval_loss": 0.24702604115009308, "eval_runtime": 4.7799, "eval_samples_per_second": 20.921, "eval_steps_per_second": 5.23, "eval_token_acc": 0.9308169950618491, "step": 380 }, { "epoch": 2.484848484848485, "grad_norm": 0.6270375847816467, "learning_rate": 6.698729810778065e-07, "loss": 0.0894782304763794, "memory(GiB)": 29.01, "step": 385, "token_acc": 0.9545825054765907, "train_speed(iter/s)": 0.139956 }, { "epoch": 2.517171717171717, "grad_norm": 0.7530998587608337, "learning_rate": 5.873921160683943e-07, "loss": 0.09273716211318969, "memory(GiB)": 29.01, "step": 390, "token_acc": 0.9644321902464242, "train_speed(iter/s)": 0.140246 }, { "epoch": 2.5494949494949495, "grad_norm": 0.7483599781990051, "learning_rate": 5.100117105459279e-07, "loss": 0.09490547776222229, "memory(GiB)": 29.01, "step": 395, "token_acc": 0.9702084609035968, "train_speed(iter/s)": 0.140356 }, { "epoch": 2.581818181818182, "grad_norm": 0.6409457921981812, "learning_rate": 4.3782120722406565e-07, "loss": 0.08456376791000367, "memory(GiB)": 29.01, "step": 400, "token_acc": 0.967419212040726, "train_speed(iter/s)": 0.14058 }, { "epoch": 2.581818181818182, "eval_loss": 0.24691322445869446, "eval_runtime": 4.7767, "eval_samples_per_second": 20.935, "eval_steps_per_second": 5.234, "eval_token_acc": 0.9304991932723806, "step": 400 }, { "epoch": 2.614141414141414, "grad_norm": 0.7296220064163208, "learning_rate": 3.709040498955102e-07, "loss": 0.09365044832229615, "memory(GiB)": 29.01, "step": 405, "token_acc": 0.9518691709509908, "train_speed(iter/s)": 0.139998 }, { "epoch": 2.6464646464646466, "grad_norm": 0.7950789928436279, "learning_rate": 3.0933758698072023e-07, "loss": 0.09456123113632202, "memory(GiB)": 29.01, "step": 410, "token_acc": 0.967745104460017, "train_speed(iter/s)": 0.140168 }, { "epoch": 2.6787878787878787, "grad_norm": 0.7504149079322815, "learning_rate": 2.531929821221768e-07, "loss": 0.09618629813194275, "memory(GiB)": 29.01, "step": 415, "token_acc": 0.9672515016798453, "train_speed(iter/s)": 0.140395 }, { "epoch": 2.7111111111111112, "grad_norm": 0.7684112191200256, "learning_rate": 2.0253513192751374e-07, "loss": 0.09071210622787476, "memory(GiB)": 29.01, "step": 420, "token_acc": 0.9696820512820513, "train_speed(iter/s)": 0.140583 }, { "epoch": 2.7111111111111112, "eval_loss": 0.24677424132823944, "eval_runtime": 4.7699, "eval_samples_per_second": 20.965, "eval_steps_per_second": 5.241, "eval_token_acc": 0.930181391482912, "step": 420 }, { "epoch": 2.7434343434343433, "grad_norm": 0.8230168223381042, "learning_rate": 1.5742259095662126e-07, "loss": 0.09300805330276489, "memory(GiB)": 29.01, "step": 425, "token_acc": 0.9522797263783283, "train_speed(iter/s)": 0.140016 }, { "epoch": 2.775757575757576, "grad_norm": 0.7461301684379578, "learning_rate": 1.1790750403941231e-07, "loss": 0.09267510175704956, "memory(GiB)": 29.01, "step": 430, "token_acc": 0.9682438869482375, "train_speed(iter/s)": 0.140189 }, { "epoch": 2.808080808080808, "grad_norm": 0.7107937932014465, "learning_rate": 8.403554600248498e-08, "loss": 0.08705815076828002, "memory(GiB)": 29.01, "step": 435, "token_acc": 0.9719344842850819, "train_speed(iter/s)": 0.140367 }, { "epoch": 2.8404040404040405, "grad_norm": 0.6642769575119019, "learning_rate": 5.584586887435739e-08, "loss": 0.09030424356460572, "memory(GiB)": 29.01, "step": 440, "token_acc": 0.9698875973943032, "train_speed(iter/s)": 0.140575 }, { "epoch": 2.8404040404040405, "eval_loss": 0.2466077357530594, "eval_runtime": 4.7557, "eval_samples_per_second": 21.027, "eval_steps_per_second": 5.257, "eval_token_acc": 0.9308658876448442, "step": 440 }, { "epoch": 2.8727272727272726, "grad_norm": 0.6828747987747192, "learning_rate": 3.337105663029361e-08, "loss": 0.0859929859638214, "memory(GiB)": 29.01, "step": 445, "token_acc": 0.9535586561225003, "train_speed(iter/s)": 0.140068 }, { "epoch": 2.905050505050505, "grad_norm": 0.6743197441101074, "learning_rate": 1.6637087529033925e-08, "loss": 0.09535614252090455, "memory(GiB)": 29.01, "step": 450, "token_acc": 0.9667527211833659, "train_speed(iter/s)": 0.140292 }, { "epoch": 2.937373737373737, "grad_norm": 0.6410036087036133, "learning_rate": 5.6633040849601865e-09, "loss": 0.08608411550521851, "memory(GiB)": 29.01, "step": 455, "token_acc": 0.9691890107471665, "train_speed(iter/s)": 0.140436 }, { "epoch": 2.9696969696969697, "grad_norm": 0.7376388311386108, "learning_rate": 4.623907104084335e-10, "loss": 0.0936282753944397, "memory(GiB)": 29.01, "step": 460, "token_acc": 0.9650562139167427, "train_speed(iter/s)": 0.140678 }, { "epoch": 2.9696969696969697, "eval_loss": 0.24663545191287994, "eval_runtime": 4.7524, "eval_samples_per_second": 21.042, "eval_steps_per_second": 5.261, "eval_token_acc": 0.9304991932723806, "step": 460 }, { "epoch": 2.9826262626262627, "eval_loss": 0.24689918756484985, "eval_runtime": 4.7514, "eval_samples_per_second": 21.046, "eval_steps_per_second": 5.262, "eval_token_acc": 0.9306703173128636, "step": 462 } ], "logging_steps": 5, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.038524706450309e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }