{ "best_global_step": 2500, "best_metric": 0.4086505663851241, "best_model_checkpoint": "results/finetuned/ML-ENG-LUG-FULL-A40-5e-5/checkpoint-2500", "epoch": 11.563787495482472, "eval_steps": 250, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07228044813877846, "grad_norm": 5.96875, "learning_rate": 8.000000000000001e-06, "loss": 1.1364, "step": 25 }, { "epoch": 0.14456089627755692, "grad_norm": 8.75, "learning_rate": 1.6333333333333335e-05, "loss": 0.8278, "step": 50 }, { "epoch": 0.21684134441633537, "grad_norm": 4.3125, "learning_rate": 2.466666666666667e-05, "loss": 0.7449, "step": 75 }, { "epoch": 0.28912179255511383, "grad_norm": 4.15625, "learning_rate": 3.3e-05, "loss": 0.6846, "step": 100 }, { "epoch": 0.3614022406938923, "grad_norm": 3.09375, "learning_rate": 4.133333333333333e-05, "loss": 0.7195, "step": 125 }, { "epoch": 0.43368268883267075, "grad_norm": 3.15625, "learning_rate": 4.966666666666667e-05, "loss": 0.5968, "step": 150 }, { "epoch": 0.5059631369714492, "grad_norm": 3.8125, "learning_rate": 4.948936170212766e-05, "loss": 0.6925, "step": 175 }, { "epoch": 0.5782435851102277, "grad_norm": 3.84375, "learning_rate": 4.895744680851064e-05, "loss": 0.6004, "step": 200 }, { "epoch": 0.6505240332490061, "grad_norm": 3.171875, "learning_rate": 4.842553191489362e-05, "loss": 0.6391, "step": 225 }, { "epoch": 0.7228044813877846, "grad_norm": 3.515625, "learning_rate": 4.78936170212766e-05, "loss": 0.6253, "step": 250 }, { "epoch": 0.7228044813877846, "eval_cer": 0.2721768277530176, "eval_loss": 0.8155665397644043, "eval_runtime": 1120.749, "eval_samples_per_second": 1.238, "eval_steps_per_second": 0.31, "eval_wer": 0.4660412216168416, "step": 250 }, { "epoch": 0.7950849295265631, "grad_norm": 2.59375, "learning_rate": 4.736170212765957e-05, "loss": 0.5837, "step": 275 }, { "epoch": 0.8673653776653415, "grad_norm": 3.078125, "learning_rate": 4.682978723404256e-05, "loss": 0.5739, "step": 300 }, { "epoch": 0.93964582580412, "grad_norm": 2.765625, "learning_rate": 4.6297872340425536e-05, "loss": 0.5402, "step": 325 }, { "epoch": 1.0115648717022045, "grad_norm": 2.5625, "learning_rate": 4.576595744680851e-05, "loss": 0.5285, "step": 350 }, { "epoch": 1.083845319840983, "grad_norm": 2.734375, "learning_rate": 4.5234042553191494e-05, "loss": 0.4131, "step": 375 }, { "epoch": 1.1561257679797614, "grad_norm": 3.3125, "learning_rate": 4.4702127659574474e-05, "loss": 0.3972, "step": 400 }, { "epoch": 1.2284062161185398, "grad_norm": 2.171875, "learning_rate": 4.4170212765957446e-05, "loss": 0.4073, "step": 425 }, { "epoch": 1.3006866642573183, "grad_norm": 2.671875, "learning_rate": 4.3638297872340425e-05, "loss": 0.3999, "step": 450 }, { "epoch": 1.372967112396097, "grad_norm": 2.34375, "learning_rate": 4.310638297872341e-05, "loss": 0.4226, "step": 475 }, { "epoch": 1.4452475605348754, "grad_norm": 2.6875, "learning_rate": 4.2574468085106384e-05, "loss": 0.4188, "step": 500 }, { "epoch": 1.4452475605348754, "eval_cer": 0.2362257256765857, "eval_loss": 0.8118799924850464, "eval_runtime": 991.3995, "eval_samples_per_second": 1.399, "eval_steps_per_second": 0.35, "eval_wer": 0.4247106653572799, "step": 500 }, { "epoch": 1.5175280086736538, "grad_norm": 2.953125, "learning_rate": 4.204255319148936e-05, "loss": 0.3959, "step": 525 }, { "epoch": 1.5898084568124322, "grad_norm": 2.421875, "learning_rate": 4.151063829787234e-05, "loss": 0.4188, "step": 550 }, { "epoch": 1.6620889049512106, "grad_norm": 2.953125, "learning_rate": 4.097872340425532e-05, "loss": 0.4241, "step": 575 }, { "epoch": 1.7343693530899893, "grad_norm": 2.53125, "learning_rate": 4.04468085106383e-05, "loss": 0.4018, "step": 600 }, { "epoch": 1.8066498012287675, "grad_norm": 2.421875, "learning_rate": 3.991489361702127e-05, "loss": 0.4323, "step": 625 }, { "epoch": 1.8789302493675462, "grad_norm": 2.625, "learning_rate": 3.938297872340426e-05, "loss": 0.4012, "step": 650 }, { "epoch": 1.9512106975063246, "grad_norm": 2.328125, "learning_rate": 3.885106382978724e-05, "loss": 0.4, "step": 675 }, { "epoch": 2.023129743404409, "grad_norm": 2.109375, "learning_rate": 3.831914893617021e-05, "loss": 0.3423, "step": 700 }, { "epoch": 2.0954101915431878, "grad_norm": 2.09375, "learning_rate": 3.7787234042553196e-05, "loss": 0.2476, "step": 725 }, { "epoch": 2.167690639681966, "grad_norm": 1.8984375, "learning_rate": 3.7255319148936176e-05, "loss": 0.2709, "step": 750 }, { "epoch": 2.167690639681966, "eval_cer": 0.23517525369611372, "eval_loss": 0.8228520750999451, "eval_runtime": 1015.9667, "eval_samples_per_second": 1.365, "eval_steps_per_second": 0.342, "eval_wer": 0.42064168739296354, "step": 750 }, { "epoch": 2.2399710878207446, "grad_norm": 2.3125, "learning_rate": 3.672340425531915e-05, "loss": 0.2533, "step": 775 }, { "epoch": 2.312251535959523, "grad_norm": 2.015625, "learning_rate": 3.619148936170213e-05, "loss": 0.2579, "step": 800 }, { "epoch": 2.3845319840983015, "grad_norm": 2.203125, "learning_rate": 3.565957446808511e-05, "loss": 0.2544, "step": 825 }, { "epoch": 2.4568124322370797, "grad_norm": 2.421875, "learning_rate": 3.5127659574468086e-05, "loss": 0.2634, "step": 850 }, { "epoch": 2.5290928803758583, "grad_norm": 2.359375, "learning_rate": 3.4595744680851065e-05, "loss": 0.2584, "step": 875 }, { "epoch": 2.6013733285146365, "grad_norm": 2.15625, "learning_rate": 3.4063829787234044e-05, "loss": 0.2578, "step": 900 }, { "epoch": 2.673653776653415, "grad_norm": 1.75, "learning_rate": 3.353191489361702e-05, "loss": 0.2491, "step": 925 }, { "epoch": 2.745934224792194, "grad_norm": 2.703125, "learning_rate": 3.3e-05, "loss": 0.2488, "step": 950 }, { "epoch": 2.818214672930972, "grad_norm": 2.0, "learning_rate": 3.2468085106382975e-05, "loss": 0.2489, "step": 975 }, { "epoch": 2.8904951210697507, "grad_norm": 2.5625, "learning_rate": 3.193617021276596e-05, "loss": 0.2571, "step": 1000 }, { "epoch": 2.8904951210697507, "eval_cer": 0.22612252442822162, "eval_loss": 0.814146876335144, "eval_runtime": 916.4096, "eval_samples_per_second": 1.514, "eval_steps_per_second": 0.379, "eval_wer": 0.4153135913858677, "step": 1000 }, { "epoch": 2.962775569208529, "grad_norm": 2.578125, "learning_rate": 3.140425531914894e-05, "loss": 0.2534, "step": 1025 }, { "epoch": 3.0346946151066136, "grad_norm": 1.6796875, "learning_rate": 3.087234042553191e-05, "loss": 0.2146, "step": 1050 }, { "epoch": 3.1069750632453923, "grad_norm": 1.9140625, "learning_rate": 3.0340425531914895e-05, "loss": 0.1515, "step": 1075 }, { "epoch": 3.1792555113841705, "grad_norm": 1.8828125, "learning_rate": 2.9808510638297878e-05, "loss": 0.1359, "step": 1100 }, { "epoch": 3.251535959522949, "grad_norm": 1.9765625, "learning_rate": 2.927659574468085e-05, "loss": 0.1575, "step": 1125 }, { "epoch": 3.3238164076617274, "grad_norm": 2.453125, "learning_rate": 2.8744680851063833e-05, "loss": 0.1416, "step": 1150 }, { "epoch": 3.396096855800506, "grad_norm": 1.9296875, "learning_rate": 2.821276595744681e-05, "loss": 0.1545, "step": 1175 }, { "epoch": 3.4683773039392847, "grad_norm": 1.9296875, "learning_rate": 2.7680851063829788e-05, "loss": 0.1562, "step": 1200 }, { "epoch": 3.540657752078063, "grad_norm": 1.828125, "learning_rate": 2.714893617021277e-05, "loss": 0.1489, "step": 1225 }, { "epoch": 3.612938200216841, "grad_norm": 1.8515625, "learning_rate": 2.6617021276595743e-05, "loss": 0.1581, "step": 1250 }, { "epoch": 3.612938200216841, "eval_cer": 0.2291752867000159, "eval_loss": 0.9097059369087219, "eval_runtime": 904.9745, "eval_samples_per_second": 1.533, "eval_steps_per_second": 0.383, "eval_wer": 0.41673846986102747, "step": 1250 }, { "epoch": 3.6852186483556197, "grad_norm": 1.859375, "learning_rate": 2.6085106382978725e-05, "loss": 0.1481, "step": 1275 }, { "epoch": 3.7574990964943984, "grad_norm": 2.40625, "learning_rate": 2.5553191489361705e-05, "loss": 0.1498, "step": 1300 }, { "epoch": 3.8297795446331766, "grad_norm": 1.984375, "learning_rate": 2.502127659574468e-05, "loss": 0.1629, "step": 1325 }, { "epoch": 3.9020599927719553, "grad_norm": 2.171875, "learning_rate": 2.448936170212766e-05, "loss": 0.1493, "step": 1350 }, { "epoch": 3.9743404409107335, "grad_norm": 1.7421875, "learning_rate": 2.395744680851064e-05, "loss": 0.15, "step": 1375 }, { "epoch": 4.046259486808818, "grad_norm": 2.0, "learning_rate": 2.3425531914893618e-05, "loss": 0.1028, "step": 1400 }, { "epoch": 4.118539934947597, "grad_norm": 1.8203125, "learning_rate": 2.2893617021276597e-05, "loss": 0.0812, "step": 1425 }, { "epoch": 4.1908203830863755, "grad_norm": 1.90625, "learning_rate": 2.2361702127659576e-05, "loss": 0.0859, "step": 1450 }, { "epoch": 4.263100831225153, "grad_norm": 1.5078125, "learning_rate": 2.1829787234042552e-05, "loss": 0.0862, "step": 1475 }, { "epoch": 4.335381279363932, "grad_norm": 1.5, "learning_rate": 2.1297872340425535e-05, "loss": 0.083, "step": 1500 }, { "epoch": 4.335381279363932, "eval_cer": 0.2270746892469672, "eval_loss": 0.9749350547790527, "eval_runtime": 962.2618, "eval_samples_per_second": 1.441, "eval_steps_per_second": 0.361, "eval_wer": 0.41771518722192486, "step": 1500 }, { "epoch": 4.407661727502711, "grad_norm": 1.8515625, "learning_rate": 2.076595744680851e-05, "loss": 0.0907, "step": 1525 }, { "epoch": 4.479942175641489, "grad_norm": 1.5546875, "learning_rate": 2.023404255319149e-05, "loss": 0.0829, "step": 1550 }, { "epoch": 4.552222623780267, "grad_norm": 2.03125, "learning_rate": 1.970212765957447e-05, "loss": 0.0893, "step": 1575 }, { "epoch": 4.624503071919046, "grad_norm": 1.46875, "learning_rate": 1.9170212765957448e-05, "loss": 0.0947, "step": 1600 }, { "epoch": 4.696783520057824, "grad_norm": 1.96875, "learning_rate": 1.8638297872340427e-05, "loss": 0.0844, "step": 1625 }, { "epoch": 4.769063968196603, "grad_norm": 1.5234375, "learning_rate": 1.8106382978723403e-05, "loss": 0.0872, "step": 1650 }, { "epoch": 4.841344416335382, "grad_norm": 1.8984375, "learning_rate": 1.7574468085106382e-05, "loss": 0.0854, "step": 1675 }, { "epoch": 4.913624864474159, "grad_norm": 1.6015625, "learning_rate": 1.704255319148936e-05, "loss": 0.0822, "step": 1700 }, { "epoch": 4.985905312612938, "grad_norm": 1.4453125, "learning_rate": 1.651063829787234e-05, "loss": 0.0841, "step": 1725 }, { "epoch": 5.057824358511023, "grad_norm": 1.3984375, "learning_rate": 1.597872340425532e-05, "loss": 0.0593, "step": 1750 }, { "epoch": 5.057824358511023, "eval_cer": 0.2265957095050262, "eval_loss": 1.0612818002700806, "eval_runtime": 882.62, "eval_samples_per_second": 1.571, "eval_steps_per_second": 0.393, "eval_wer": 0.41074975953764603, "step": 1750 }, { "epoch": 5.130104806649801, "grad_norm": 1.3515625, "learning_rate": 1.54468085106383e-05, "loss": 0.0517, "step": 1775 }, { "epoch": 5.20238525478858, "grad_norm": 1.6796875, "learning_rate": 1.4914893617021278e-05, "loss": 0.0462, "step": 1800 }, { "epoch": 5.274665702927358, "grad_norm": 1.0703125, "learning_rate": 1.4382978723404256e-05, "loss": 0.0576, "step": 1825 }, { "epoch": 5.3469461510661365, "grad_norm": 1.6484375, "learning_rate": 1.3851063829787233e-05, "loss": 0.0528, "step": 1850 }, { "epoch": 5.419226599204915, "grad_norm": 1.5078125, "learning_rate": 1.3319148936170214e-05, "loss": 0.0537, "step": 1875 }, { "epoch": 5.491507047343694, "grad_norm": 2.0625, "learning_rate": 1.2787234042553192e-05, "loss": 0.0546, "step": 1900 }, { "epoch": 5.563787495482472, "grad_norm": 2.0, "learning_rate": 1.225531914893617e-05, "loss": 0.0536, "step": 1925 }, { "epoch": 5.63606794362125, "grad_norm": 1.8046875, "learning_rate": 1.172340425531915e-05, "loss": 0.0531, "step": 1950 }, { "epoch": 5.708348391760029, "grad_norm": 2.0625, "learning_rate": 1.119148936170213e-05, "loss": 0.0517, "step": 1975 }, { "epoch": 5.7806288398988075, "grad_norm": 1.7734375, "learning_rate": 1.0659574468085107e-05, "loss": 0.0518, "step": 2000 }, { "epoch": 5.7806288398988075, "eval_cer": 0.22345740881745507, "eval_loss": 1.054749608039856, "eval_runtime": 879.5621, "eval_samples_per_second": 1.577, "eval_steps_per_second": 0.395, "eval_wer": 0.4107791260867282, "step": 2000 }, { "epoch": 5.852909288037586, "grad_norm": 1.7890625, "learning_rate": 1.0127659574468086e-05, "loss": 0.0497, "step": 2025 }, { "epoch": 5.925189736176364, "grad_norm": 1.5078125, "learning_rate": 9.595744680851064e-06, "loss": 0.0507, "step": 2050 }, { "epoch": 5.997470184315143, "grad_norm": 1.7734375, "learning_rate": 9.063829787234043e-06, "loss": 0.0509, "step": 2075 }, { "epoch": 6.069389230213227, "grad_norm": 1.203125, "learning_rate": 8.53191489361702e-06, "loss": 0.0415, "step": 2100 }, { "epoch": 6.141669678352006, "grad_norm": 2.03125, "learning_rate": 8.000000000000001e-06, "loss": 0.04, "step": 2125 }, { "epoch": 6.213950126490785, "grad_norm": 1.7421875, "learning_rate": 7.468085106382979e-06, "loss": 0.0394, "step": 2150 }, { "epoch": 6.286230574629562, "grad_norm": 1.765625, "learning_rate": 6.936170212765958e-06, "loss": 0.042, "step": 2175 }, { "epoch": 6.358511022768341, "grad_norm": 1.3125, "learning_rate": 6.404255319148937e-06, "loss": 0.0361, "step": 2200 }, { "epoch": 6.43079147090712, "grad_norm": 1.5234375, "learning_rate": 5.872340425531915e-06, "loss": 0.0383, "step": 2225 }, { "epoch": 6.503071919045898, "grad_norm": 1.390625, "learning_rate": 5.340425531914894e-06, "loss": 0.0382, "step": 2250 }, { "epoch": 6.503071919045898, "eval_cer": 0.22488223459611492, "eval_loss": 1.1097996234893799, "eval_runtime": 877.748, "eval_samples_per_second": 1.58, "eval_steps_per_second": 0.395, "eval_wer": 0.4095101068765971, "step": 2250 }, { "epoch": 6.575352367184676, "grad_norm": 1.4921875, "learning_rate": 4.808510638297872e-06, "loss": 0.0398, "step": 2275 }, { "epoch": 6.647632815323455, "grad_norm": 1.46875, "learning_rate": 4.2765957446808515e-06, "loss": 0.0356, "step": 2300 }, { "epoch": 6.719913263462233, "grad_norm": 0.97265625, "learning_rate": 3.7446808510638303e-06, "loss": 0.0373, "step": 2325 }, { "epoch": 6.792193711601012, "grad_norm": 1.3984375, "learning_rate": 3.2127659574468086e-06, "loss": 0.0347, "step": 2350 }, { "epoch": 6.864474159739791, "grad_norm": 1.0703125, "learning_rate": 2.6808510638297874e-06, "loss": 0.0369, "step": 2375 }, { "epoch": 6.936754607878569, "grad_norm": 0.9375, "learning_rate": 2.148936170212766e-06, "loss": 0.039, "step": 2400 }, { "epoch": 7.008673653776653, "grad_norm": 1.84375, "learning_rate": 1.6170212765957448e-06, "loss": 0.0376, "step": 2425 }, { "epoch": 7.080954101915432, "grad_norm": 2.0625, "learning_rate": 1.0851063829787236e-06, "loss": 0.04, "step": 2450 }, { "epoch": 7.1532345500542105, "grad_norm": 1.171875, "learning_rate": 5.531914893617021e-07, "loss": 0.038, "step": 2475 }, { "epoch": 7.225514998192989, "grad_norm": 1.8046875, "learning_rate": 2.1276595744680853e-08, "loss": 0.0356, "step": 2500 }, { "epoch": 7.225514998192989, "eval_cer": 0.223833001387121, "eval_loss": 1.1148525476455688, "eval_runtime": 877.5866, "eval_samples_per_second": 1.58, "eval_steps_per_second": 0.395, "eval_wer": 0.4086505663851241, "step": 2500 }, { "epoch": 7.297795446331767, "grad_norm": 1.1328125, "learning_rate": 1.916883116883117e-05, "loss": 0.0364, "step": 2525 }, { "epoch": 7.370075894470546, "grad_norm": 1.4375, "learning_rate": 1.8844155844155846e-05, "loss": 0.0418, "step": 2550 }, { "epoch": 7.442356342609324, "grad_norm": 1.546875, "learning_rate": 1.851948051948052e-05, "loss": 0.0377, "step": 2575 }, { "epoch": 7.514636790748103, "grad_norm": 1.4453125, "learning_rate": 1.8194805194805195e-05, "loss": 0.0429, "step": 2600 }, { "epoch": 7.5869172388868815, "grad_norm": 1.6484375, "learning_rate": 1.7870129870129872e-05, "loss": 0.0387, "step": 2625 }, { "epoch": 7.659197687025659, "grad_norm": 1.765625, "learning_rate": 1.7545454545454545e-05, "loss": 0.0463, "step": 2650 }, { "epoch": 7.731478135164438, "grad_norm": 1.5234375, "learning_rate": 1.722077922077922e-05, "loss": 0.0421, "step": 2675 }, { "epoch": 7.803758583303217, "grad_norm": 0.91015625, "learning_rate": 1.6896103896103898e-05, "loss": 0.0394, "step": 2700 }, { "epoch": 7.876039031441995, "grad_norm": 1.59375, "learning_rate": 1.657142857142857e-05, "loss": 0.0415, "step": 2725 }, { "epoch": 7.948319479580773, "grad_norm": 1.1796875, "learning_rate": 1.6246753246753247e-05, "loss": 0.0408, "step": 2750 }, { "epoch": 7.948319479580773, "eval_cer": 0.22607830826054334, "eval_loss": 1.116784930229187, "eval_runtime": 899.4546, "eval_samples_per_second": 1.542, "eval_steps_per_second": 0.386, "eval_wer": 0.41392684859068163, "step": 2750 }, { "epoch": 8.023129743404409, "grad_norm": 2.171875, "learning_rate": 1.592207792207792e-05, "loss": 0.0422, "step": 2775 }, { "epoch": 8.095410191543188, "grad_norm": 1.40625, "learning_rate": 1.55974025974026e-05, "loss": 0.0335, "step": 2800 }, { "epoch": 8.167690639681966, "grad_norm": 1.1953125, "learning_rate": 1.5272727272727276e-05, "loss": 0.0312, "step": 2825 }, { "epoch": 8.239971087820745, "grad_norm": 1.421875, "learning_rate": 1.494805194805195e-05, "loss": 0.0341, "step": 2850 }, { "epoch": 8.312251535959524, "grad_norm": 1.21875, "learning_rate": 1.4623376623376626e-05, "loss": 0.0336, "step": 2875 }, { "epoch": 8.3845319840983, "grad_norm": 1.2109375, "learning_rate": 1.42987012987013e-05, "loss": 0.0357, "step": 2900 }, { "epoch": 8.45681243223708, "grad_norm": 1.7578125, "learning_rate": 1.3974025974025975e-05, "loss": 0.0343, "step": 2925 }, { "epoch": 8.529092880375858, "grad_norm": 1.3984375, "learning_rate": 1.3649350649350651e-05, "loss": 0.0293, "step": 2950 }, { "epoch": 8.601373328514637, "grad_norm": 1.15625, "learning_rate": 1.3324675324675326e-05, "loss": 0.0371, "step": 2975 }, { "epoch": 8.673653776653415, "grad_norm": 2.09375, "learning_rate": 1.3000000000000001e-05, "loss": 0.0368, "step": 3000 }, { "epoch": 8.673653776653415, "eval_cer": 0.2278660614765563, "eval_loss": 1.1499484777450562, "eval_runtime": 895.0529, "eval_samples_per_second": 1.55, "eval_steps_per_second": 0.388, "eval_wer": 0.4171725128409374, "step": 3000 }, { "epoch": 8.745934224792194, "grad_norm": 2.40625, "learning_rate": 1.2675324675324676e-05, "loss": 0.0344, "step": 3025 }, { "epoch": 8.818214672930972, "grad_norm": 1.109375, "learning_rate": 1.2350649350649352e-05, "loss": 0.0338, "step": 3050 }, { "epoch": 8.890495121069751, "grad_norm": 1.890625, "learning_rate": 1.2025974025974027e-05, "loss": 0.0323, "step": 3075 }, { "epoch": 8.96277556920853, "grad_norm": 1.390625, "learning_rate": 1.1701298701298701e-05, "loss": 0.0342, "step": 3100 }, { "epoch": 9.034694615106615, "grad_norm": 1.1015625, "learning_rate": 1.1376623376623378e-05, "loss": 0.0291, "step": 3125 }, { "epoch": 9.106975063245391, "grad_norm": 2.453125, "learning_rate": 1.1051948051948053e-05, "loss": 0.0275, "step": 3150 }, { "epoch": 9.17925551138417, "grad_norm": 0.9453125, "learning_rate": 1.0727272727272727e-05, "loss": 0.0245, "step": 3175 }, { "epoch": 9.251535959522949, "grad_norm": 1.0546875, "learning_rate": 1.0402597402597402e-05, "loss": 0.0251, "step": 3200 }, { "epoch": 9.323816407661727, "grad_norm": 1.25, "learning_rate": 1.0077922077922078e-05, "loss": 0.0214, "step": 3225 }, { "epoch": 9.396096855800506, "grad_norm": 1.5390625, "learning_rate": 9.753246753246753e-06, "loss": 0.0271, "step": 3250 }, { "epoch": 9.396096855800506, "eval_cer": 0.22706083911883618, "eval_loss": 1.205234169960022, "eval_runtime": 893.8344, "eval_samples_per_second": 1.552, "eval_steps_per_second": 0.388, "eval_wer": 0.41319892817631626, "step": 3250 }, { "epoch": 9.468377303939285, "grad_norm": 1.5078125, "learning_rate": 9.42857142857143e-06, "loss": 0.0272, "step": 3275 }, { "epoch": 9.540657752078063, "grad_norm": 0.734375, "learning_rate": 9.103896103896104e-06, "loss": 0.0268, "step": 3300 }, { "epoch": 9.612938200216842, "grad_norm": 1.65625, "learning_rate": 8.77922077922078e-06, "loss": 0.027, "step": 3325 }, { "epoch": 9.68521864835562, "grad_norm": 1.359375, "learning_rate": 8.454545454545455e-06, "loss": 0.0249, "step": 3350 }, { "epoch": 9.757499096494398, "grad_norm": 1.3671875, "learning_rate": 8.12987012987013e-06, "loss": 0.0249, "step": 3375 }, { "epoch": 9.829779544633176, "grad_norm": 1.5390625, "learning_rate": 7.805194805194806e-06, "loss": 0.0257, "step": 3400 }, { "epoch": 9.902059992771955, "grad_norm": 1.6015625, "learning_rate": 7.480519480519481e-06, "loss": 0.0245, "step": 3425 }, { "epoch": 9.974340440910733, "grad_norm": 1.1484375, "learning_rate": 7.155844155844156e-06, "loss": 0.0251, "step": 3450 }, { "epoch": 10.046259486808818, "grad_norm": 0.88671875, "learning_rate": 6.8311688311688315e-06, "loss": 0.0238, "step": 3475 }, { "epoch": 10.118539934947597, "grad_norm": 1.1328125, "learning_rate": 6.506493506493506e-06, "loss": 0.0237, "step": 3500 }, { "epoch": 10.118539934947597, "eval_cer": 0.2262828635227838, "eval_loss": 1.2106597423553467, "eval_runtime": 916.7169, "eval_samples_per_second": 1.513, "eval_steps_per_second": 0.379, "eval_wer": 0.41136634469731537, "step": 3500 }, { "epoch": 10.190820383086376, "grad_norm": 1.3515625, "learning_rate": 6.181818181818183e-06, "loss": 0.0239, "step": 3525 }, { "epoch": 10.263100831225154, "grad_norm": 2.078125, "learning_rate": 5.857142857142857e-06, "loss": 0.0232, "step": 3550 }, { "epoch": 10.335381279363933, "grad_norm": 1.5234375, "learning_rate": 5.532467532467533e-06, "loss": 0.021, "step": 3575 }, { "epoch": 10.40766172750271, "grad_norm": 0.94921875, "learning_rate": 5.207792207792208e-06, "loss": 0.0239, "step": 3600 }, { "epoch": 10.479942175641488, "grad_norm": 0.84375, "learning_rate": 4.883116883116883e-06, "loss": 0.0217, "step": 3625 }, { "epoch": 10.552222623780267, "grad_norm": 1.1875, "learning_rate": 4.558441558441559e-06, "loss": 0.0226, "step": 3650 }, { "epoch": 10.624503071919046, "grad_norm": 2.703125, "learning_rate": 4.233766233766234e-06, "loss": 0.0238, "step": 3675 }, { "epoch": 10.696783520057824, "grad_norm": 1.6640625, "learning_rate": 3.90909090909091e-06, "loss": 0.0255, "step": 3700 }, { "epoch": 10.769063968196603, "grad_norm": 0.84375, "learning_rate": 3.5844155844155846e-06, "loss": 0.0229, "step": 3725 }, { "epoch": 10.841344416335382, "grad_norm": 1.0625, "learning_rate": 3.2597402597402597e-06, "loss": 0.0212, "step": 3750 }, { "epoch": 10.841344416335382, "eval_cer": 0.22498029349070886, "eval_loss": 1.2275042533874512, "eval_runtime": 921.5359, "eval_samples_per_second": 1.505, "eval_steps_per_second": 0.377, "eval_wer": 0.41112972866628295, "step": 3750 }, { "epoch": 10.91362486447416, "grad_norm": 0.765625, "learning_rate": 2.9350649350649353e-06, "loss": 0.0229, "step": 3775 }, { "epoch": 10.985905312612939, "grad_norm": 1.8515625, "learning_rate": 2.6103896103896104e-06, "loss": 0.0235, "step": 3800 }, { "epoch": 11.057824358511024, "grad_norm": 0.5703125, "learning_rate": 2.285714285714286e-06, "loss": 0.0226, "step": 3825 }, { "epoch": 11.1301048066498, "grad_norm": 1.0234375, "learning_rate": 1.961038961038961e-06, "loss": 0.0227, "step": 3850 }, { "epoch": 11.20238525478858, "grad_norm": 1.75, "learning_rate": 1.6363636363636367e-06, "loss": 0.0215, "step": 3875 }, { "epoch": 11.274665702927358, "grad_norm": 0.953125, "learning_rate": 1.3116883116883118e-06, "loss": 0.0246, "step": 3900 }, { "epoch": 11.346946151066136, "grad_norm": 0.9375, "learning_rate": 9.870129870129872e-07, "loss": 0.0202, "step": 3925 }, { "epoch": 11.419226599204915, "grad_norm": 1.015625, "learning_rate": 6.623376623376623e-07, "loss": 0.0233, "step": 3950 }, { "epoch": 11.491507047343694, "grad_norm": 1.0078125, "learning_rate": 3.3766233766233765e-07, "loss": 0.0214, "step": 3975 }, { "epoch": 11.563787495482472, "grad_norm": 1.390625, "learning_rate": 1.2987012987012988e-08, "loss": 0.0221, "step": 4000 }, { "epoch": 11.563787495482472, "eval_cer": 0.2271312877577234, "eval_loss": 1.228308916091919, "eval_runtime": 912.0398, "eval_samples_per_second": 1.521, "eval_steps_per_second": 0.38, "eval_wer": 0.4136988597675676, "step": 4000 } ], "logging_steps": 25, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.347268958158848e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }