{ "best_global_step": 750, "best_metric": 0.48618392545595784, "best_model_checkpoint": "results/finetuned/ML-ENG-LUG-FULL-A40/checkpoint-750", "epoch": 3.613048978854148, "eval_steps": 250, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.036146755828664376, "grad_norm": 12.125, "learning_rate": 1.6000000000000001e-06, "loss": 1.2254, "step": 25 }, { "epoch": 0.07229351165732875, "grad_norm": 9.5625, "learning_rate": 3.266666666666667e-06, "loss": 1.1396, "step": 50 }, { "epoch": 0.10844026748599313, "grad_norm": 9.0, "learning_rate": 4.933333333333334e-06, "loss": 1.0373, "step": 75 }, { "epoch": 0.1445870233146575, "grad_norm": 8.125, "learning_rate": 6.600000000000001e-06, "loss": 0.8837, "step": 100 }, { "epoch": 0.1807337791433219, "grad_norm": 8.25, "learning_rate": 8.266666666666667e-06, "loss": 0.8438, "step": 125 }, { "epoch": 0.21688053497198626, "grad_norm": 6.65625, "learning_rate": 9.933333333333334e-06, "loss": 0.8197, "step": 150 }, { "epoch": 0.2530272908006506, "grad_norm": 6.71875, "learning_rate": 9.897872340425532e-06, "loss": 0.6942, "step": 175 }, { "epoch": 0.289174046629315, "grad_norm": 6.75, "learning_rate": 9.79148936170213e-06, "loss": 0.8162, "step": 200 }, { "epoch": 0.3253208024579794, "grad_norm": 7.25, "learning_rate": 9.685106382978723e-06, "loss": 0.7682, "step": 225 }, { "epoch": 0.3614675582866438, "grad_norm": 4.75, "learning_rate": 9.57872340425532e-06, "loss": 0.7765, "step": 250 }, { "epoch": 0.3614675582866438, "eval_cer": 0.3216240827923205, "eval_loss": 0.8622527718544006, "eval_runtime": 1448.6289, "eval_samples_per_second": 0.957, "eval_steps_per_second": 0.24, "eval_wer": 0.5111065130432537, "step": 250 }, { "epoch": 0.3976143141153082, "grad_norm": 7.65625, "learning_rate": 9.472340425531916e-06, "loss": 0.646, "step": 275 }, { "epoch": 0.4337610699439725, "grad_norm": 5.5625, "learning_rate": 9.365957446808511e-06, "loss": 0.6401, "step": 300 }, { "epoch": 0.4699078257726369, "grad_norm": 6.78125, "learning_rate": 9.259574468085107e-06, "loss": 0.7117, "step": 325 }, { "epoch": 0.5060545816013012, "grad_norm": 7.25, "learning_rate": 9.153191489361702e-06, "loss": 0.7722, "step": 350 }, { "epoch": 0.5422013374299657, "grad_norm": 5.875, "learning_rate": 9.046808510638298e-06, "loss": 0.6399, "step": 375 }, { "epoch": 0.57834809325863, "grad_norm": 5.125, "learning_rate": 8.940425531914895e-06, "loss": 0.695, "step": 400 }, { "epoch": 0.6144948490872945, "grad_norm": 4.8125, "learning_rate": 8.83404255319149e-06, "loss": 0.6709, "step": 425 }, { "epoch": 0.6506416049159588, "grad_norm": 6.28125, "learning_rate": 8.727659574468086e-06, "loss": 0.7144, "step": 450 }, { "epoch": 0.6867883607446231, "grad_norm": 5.875, "learning_rate": 8.621276595744682e-06, "loss": 0.7185, "step": 475 }, { "epoch": 0.7229351165732876, "grad_norm": 6.84375, "learning_rate": 8.514893617021277e-06, "loss": 0.6632, "step": 500 }, { "epoch": 0.7229351165732876, "eval_cer": 0.32359849188245976, "eval_loss": 0.8226236701011658, "eval_runtime": 1430.6804, "eval_samples_per_second": 0.969, "eval_steps_per_second": 0.243, "eval_wer": 0.5051111231477887, "step": 500 }, { "epoch": 0.7590818724019519, "grad_norm": 5.5, "learning_rate": 8.408510638297873e-06, "loss": 0.6509, "step": 525 }, { "epoch": 0.7952286282306164, "grad_norm": 5.3125, "learning_rate": 8.30212765957447e-06, "loss": 0.6506, "step": 550 }, { "epoch": 0.8313753840592807, "grad_norm": 6.9375, "learning_rate": 8.195744680851064e-06, "loss": 0.6635, "step": 575 }, { "epoch": 0.867522139887945, "grad_norm": 7.90625, "learning_rate": 8.08936170212766e-06, "loss": 0.6494, "step": 600 }, { "epoch": 0.9036688957166095, "grad_norm": 5.1875, "learning_rate": 7.982978723404256e-06, "loss": 0.6319, "step": 625 }, { "epoch": 0.9398156515452738, "grad_norm": 4.125, "learning_rate": 7.876595744680852e-06, "loss": 0.6252, "step": 650 }, { "epoch": 0.9759624073739381, "grad_norm": 3.90625, "learning_rate": 7.770212765957447e-06, "loss": 0.6612, "step": 675 }, { "epoch": 1.0115669618651726, "grad_norm": 5.21875, "learning_rate": 7.663829787234043e-06, "loss": 0.5836, "step": 700 }, { "epoch": 1.047713717693837, "grad_norm": 5.1875, "learning_rate": 7.557446808510639e-06, "loss": 0.6302, "step": 725 }, { "epoch": 1.0838604735225013, "grad_norm": 5.3125, "learning_rate": 7.4510638297872355e-06, "loss": 0.5672, "step": 750 }, { "epoch": 1.0838604735225013, "eval_cer": 0.3044346468112434, "eval_loss": 0.8067195415496826, "eval_runtime": 1358.1527, "eval_samples_per_second": 1.021, "eval_steps_per_second": 0.255, "eval_wer": 0.48618392545595784, "step": 750 }, { "epoch": 1.1200072293511658, "grad_norm": 5.65625, "learning_rate": 7.34468085106383e-06, "loss": 0.5476, "step": 775 }, { "epoch": 1.1561539851798301, "grad_norm": 6.625, "learning_rate": 7.2382978723404265e-06, "loss": 0.6132, "step": 800 }, { "epoch": 1.1923007410084945, "grad_norm": 4.03125, "learning_rate": 7.131914893617022e-06, "loss": 0.6052, "step": 825 }, { "epoch": 1.2284474968371588, "grad_norm": 3.84375, "learning_rate": 7.0255319148936175e-06, "loss": 0.5723, "step": 850 }, { "epoch": 1.2645942526658231, "grad_norm": 5.375, "learning_rate": 6.919148936170214e-06, "loss": 0.5791, "step": 875 }, { "epoch": 1.3007410084944877, "grad_norm": 5.25, "learning_rate": 6.8127659574468085e-06, "loss": 0.5981, "step": 900 }, { "epoch": 1.336887764323152, "grad_norm": 5.28125, "learning_rate": 6.706382978723405e-06, "loss": 0.5917, "step": 925 }, { "epoch": 1.3730345201518164, "grad_norm": 5.65625, "learning_rate": 6.600000000000001e-06, "loss": 0.633, "step": 950 }, { "epoch": 1.4091812759804807, "grad_norm": 4.78125, "learning_rate": 6.493617021276596e-06, "loss": 0.617, "step": 975 }, { "epoch": 1.445328031809145, "grad_norm": 4.875, "learning_rate": 6.387234042553192e-06, "loss": 0.6058, "step": 1000 }, { "epoch": 1.445328031809145, "eval_cer": 0.3142144715579215, "eval_loss": 0.7990756034851074, "eval_runtime": 1373.1379, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.253, "eval_wer": 0.49487802577000256, "step": 1000 }, { "epoch": 1.4814747876378096, "grad_norm": 5.09375, "learning_rate": 6.2808510638297885e-06, "loss": 0.6011, "step": 1025 }, { "epoch": 1.517621543466474, "grad_norm": 4.625, "learning_rate": 6.174468085106383e-06, "loss": 0.5811, "step": 1050 }, { "epoch": 1.5537682992951383, "grad_norm": 5.625, "learning_rate": 6.0680851063829795e-06, "loss": 0.6012, "step": 1075 }, { "epoch": 1.5899150551238026, "grad_norm": 4.5, "learning_rate": 5.961702127659575e-06, "loss": 0.5822, "step": 1100 }, { "epoch": 1.626061810952467, "grad_norm": 6.65625, "learning_rate": 5.8553191489361705e-06, "loss": 0.6002, "step": 1125 }, { "epoch": 1.6622085667811315, "grad_norm": 4.78125, "learning_rate": 5.748936170212767e-06, "loss": 0.5965, "step": 1150 }, { "epoch": 1.6983553226097958, "grad_norm": 5.34375, "learning_rate": 5.6425531914893615e-06, "loss": 0.6277, "step": 1175 }, { "epoch": 1.7345020784384602, "grad_norm": 4.40625, "learning_rate": 5.536170212765958e-06, "loss": 0.5448, "step": 1200 }, { "epoch": 1.7706488342671245, "grad_norm": 5.0625, "learning_rate": 5.429787234042554e-06, "loss": 0.5645, "step": 1225 }, { "epoch": 1.8067955900957888, "grad_norm": 4.71875, "learning_rate": 5.323404255319149e-06, "loss": 0.6589, "step": 1250 }, { "epoch": 1.8067955900957888, "eval_cer": 0.31060312404039775, "eval_loss": 0.7972104549407959, "eval_runtime": 1367.9273, "eval_samples_per_second": 1.014, "eval_steps_per_second": 0.254, "eval_wer": 0.49013358950339214, "step": 1250 }, { "epoch": 1.8429423459244534, "grad_norm": 4.0625, "learning_rate": 5.217021276595745e-06, "loss": 0.6027, "step": 1275 }, { "epoch": 1.8790891017531175, "grad_norm": 5.46875, "learning_rate": 5.110638297872342e-06, "loss": 0.6039, "step": 1300 }, { "epoch": 1.915235857581782, "grad_norm": 4.59375, "learning_rate": 5.004255319148936e-06, "loss": 0.6226, "step": 1325 }, { "epoch": 1.9513826134104464, "grad_norm": 5.59375, "learning_rate": 4.897872340425533e-06, "loss": 0.5832, "step": 1350 }, { "epoch": 1.9875293692391107, "grad_norm": 5.65625, "learning_rate": 4.791489361702128e-06, "loss": 0.5714, "step": 1375 }, { "epoch": 2.023133923730345, "grad_norm": 5.25, "learning_rate": 4.685106382978724e-06, "loss": 0.553, "step": 1400 }, { "epoch": 2.0592806795590097, "grad_norm": 4.3125, "learning_rate": 4.57872340425532e-06, "loss": 0.58, "step": 1425 }, { "epoch": 2.095427435387674, "grad_norm": 5.34375, "learning_rate": 4.4723404255319155e-06, "loss": 0.5211, "step": 1450 }, { "epoch": 2.1315741912163384, "grad_norm": 5.3125, "learning_rate": 4.365957446808511e-06, "loss": 0.5708, "step": 1475 }, { "epoch": 2.1677209470450025, "grad_norm": 3.984375, "learning_rate": 4.259574468085107e-06, "loss": 0.5959, "step": 1500 }, { "epoch": 2.1677209470450025, "eval_cer": 0.3117614241560084, "eval_loss": 0.7976789474487305, "eval_runtime": 1424.2468, "eval_samples_per_second": 0.974, "eval_steps_per_second": 0.244, "eval_wer": 0.4926164455085106, "step": 1500 }, { "epoch": 2.203867702873667, "grad_norm": 5.125, "learning_rate": 4.153191489361703e-06, "loss": 0.5694, "step": 1525 }, { "epoch": 2.2400144587023316, "grad_norm": 4.53125, "learning_rate": 4.046808510638298e-06, "loss": 0.5767, "step": 1550 }, { "epoch": 2.2761612145309957, "grad_norm": 5.3125, "learning_rate": 3.940425531914894e-06, "loss": 0.5827, "step": 1575 }, { "epoch": 2.3123079703596603, "grad_norm": 4.75, "learning_rate": 3.83404255319149e-06, "loss": 0.5597, "step": 1600 }, { "epoch": 2.3484547261883244, "grad_norm": 4.96875, "learning_rate": 3.7276595744680857e-06, "loss": 0.5605, "step": 1625 }, { "epoch": 2.384601482016989, "grad_norm": 4.75, "learning_rate": 3.621276595744681e-06, "loss": 0.5523, "step": 1650 }, { "epoch": 2.4207482378456535, "grad_norm": 4.84375, "learning_rate": 3.5148936170212767e-06, "loss": 0.5674, "step": 1675 }, { "epoch": 2.4568949936743176, "grad_norm": 4.21875, "learning_rate": 3.4085106382978726e-06, "loss": 0.5801, "step": 1700 }, { "epoch": 2.493041749502982, "grad_norm": 5.1875, "learning_rate": 3.3021276595744685e-06, "loss": 0.5634, "step": 1725 }, { "epoch": 2.5291885053316463, "grad_norm": 3.53125, "learning_rate": 3.195744680851064e-06, "loss": 0.5402, "step": 1750 }, { "epoch": 2.5291885053316463, "eval_cer": 0.3114490059221394, "eval_loss": 0.7963515520095825, "eval_runtime": 1412.53, "eval_samples_per_second": 0.982, "eval_steps_per_second": 0.246, "eval_wer": 0.49261625004199694, "step": 1750 }, { "epoch": 2.565335261160311, "grad_norm": 5.84375, "learning_rate": 3.08936170212766e-06, "loss": 0.5874, "step": 1775 }, { "epoch": 2.6014820169889754, "grad_norm": 5.0, "learning_rate": 2.9829787234042554e-06, "loss": 0.5083, "step": 1800 }, { "epoch": 2.6376287728176395, "grad_norm": 4.9375, "learning_rate": 2.8765957446808514e-06, "loss": 0.5587, "step": 1825 }, { "epoch": 2.673775528646304, "grad_norm": 4.4375, "learning_rate": 2.770212765957447e-06, "loss": 0.5781, "step": 1850 }, { "epoch": 2.7099222844749686, "grad_norm": 5.0, "learning_rate": 2.663829787234043e-06, "loss": 0.5224, "step": 1875 }, { "epoch": 2.7460690403036327, "grad_norm": 5.0, "learning_rate": 2.5574468085106387e-06, "loss": 0.5491, "step": 1900 }, { "epoch": 2.782215796132297, "grad_norm": 4.625, "learning_rate": 2.4510638297872342e-06, "loss": 0.5653, "step": 1925 }, { "epoch": 2.8183625519609614, "grad_norm": 6.625, "learning_rate": 2.34468085106383e-06, "loss": 0.5236, "step": 1950 }, { "epoch": 2.854509307789626, "grad_norm": 4.75, "learning_rate": 2.2382978723404256e-06, "loss": 0.5827, "step": 1975 }, { "epoch": 2.89065606361829, "grad_norm": 5.15625, "learning_rate": 2.1319148936170216e-06, "loss": 0.5934, "step": 2000 }, { "epoch": 2.89065606361829, "eval_cer": 0.3117611880607615, "eval_loss": 0.7964433431625366, "eval_runtime": 1409.3242, "eval_samples_per_second": 0.984, "eval_steps_per_second": 0.246, "eval_wer": 0.4920525130602013, "step": 2000 }, { "epoch": 2.9268028194469546, "grad_norm": 5.25, "learning_rate": 2.025531914893617e-06, "loss": 0.5579, "step": 2025 }, { "epoch": 2.962949575275619, "grad_norm": 5.21875, "learning_rate": 1.919148936170213e-06, "loss": 0.5838, "step": 2050 }, { "epoch": 2.9990963311042833, "grad_norm": 4.5625, "learning_rate": 1.8127659574468087e-06, "loss": 0.5663, "step": 2075 }, { "epoch": 3.0347008855955178, "grad_norm": 5.6875, "learning_rate": 1.7063829787234042e-06, "loss": 0.5703, "step": 2100 }, { "epoch": 3.0708476414241823, "grad_norm": 4.875, "learning_rate": 1.6000000000000001e-06, "loss": 0.5703, "step": 2125 }, { "epoch": 3.1069943972528464, "grad_norm": 4.875, "learning_rate": 1.4936170212765956e-06, "loss": 0.5774, "step": 2150 }, { "epoch": 3.143141153081511, "grad_norm": 4.84375, "learning_rate": 1.3872340425531916e-06, "loss": 0.5686, "step": 2175 }, { "epoch": 3.179287908910175, "grad_norm": 3.96875, "learning_rate": 1.2808510638297875e-06, "loss": 0.4906, "step": 2200 }, { "epoch": 3.2154346647388397, "grad_norm": 4.78125, "learning_rate": 1.174468085106383e-06, "loss": 0.5839, "step": 2225 }, { "epoch": 3.251581420567504, "grad_norm": 5.09375, "learning_rate": 1.0680851063829787e-06, "loss": 0.5464, "step": 2250 }, { "epoch": 3.251581420567504, "eval_cer": 0.3112543561914034, "eval_loss": 0.7959883213043213, "eval_runtime": 1614.2453, "eval_samples_per_second": 0.859, "eval_steps_per_second": 0.215, "eval_wer": 0.4931348188600785, "step": 2250 }, { "epoch": 3.2877281763961683, "grad_norm": 4.40625, "learning_rate": 9.617021276595744e-07, "loss": 0.5668, "step": 2275 }, { "epoch": 3.323874932224833, "grad_norm": 4.3125, "learning_rate": 8.553191489361703e-07, "loss": 0.5077, "step": 2300 }, { "epoch": 3.3600216880534974, "grad_norm": 5.78125, "learning_rate": 7.489361702127661e-07, "loss": 0.5428, "step": 2325 }, { "epoch": 3.3961684438821615, "grad_norm": 5.625, "learning_rate": 6.425531914893618e-07, "loss": 0.5428, "step": 2350 }, { "epoch": 3.432315199710826, "grad_norm": 5.25, "learning_rate": 5.361702127659575e-07, "loss": 0.5395, "step": 2375 }, { "epoch": 3.46846195553949, "grad_norm": 5.1875, "learning_rate": 4.2978723404255325e-07, "loss": 0.5461, "step": 2400 }, { "epoch": 3.5046087113681548, "grad_norm": 5.03125, "learning_rate": 3.2340425531914897e-07, "loss": 0.5538, "step": 2425 }, { "epoch": 3.540755467196819, "grad_norm": 5.4375, "learning_rate": 2.170212765957447e-07, "loss": 0.5298, "step": 2450 }, { "epoch": 3.5769022230254834, "grad_norm": 5.21875, "learning_rate": 1.1063829787234043e-07, "loss": 0.5959, "step": 2475 }, { "epoch": 3.613048978854148, "grad_norm": 5.5, "learning_rate": 4.25531914893617e-09, "loss": 0.5497, "step": 2500 }, { "epoch": 3.613048978854148, "eval_cer": 0.31241486923400996, "eval_loss": 0.7960155010223389, "eval_runtime": 1533.3812, "eval_samples_per_second": 0.905, "eval_steps_per_second": 0.226, "eval_wer": 0.49320595826893987, "step": 2500 } ], "logging_steps": 25, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3582858479796224e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }