{ "best_global_step": 7500, "best_metric": 3.910543441772461, "best_model_checkpoint": "./gpt2-poems-finetuned\\checkpoint-7500", "epoch": 5.0, "eval_steps": 500, "global_step": 7835, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006385186367627105, "grad_norm": null, "learning_rate": 0.0, "loss": 4.7595, "step": 1 }, { "epoch": 0.015962965919067762, "grad_norm": 9.886136054992676, "learning_rate": 6.122448979591837e-07, "loss": 4.7981, "step": 25 }, { "epoch": 0.031925931838135524, "grad_norm": 5.848156452178955, "learning_rate": 1.25e-06, "loss": 4.7085, "step": 50 }, { "epoch": 0.047888897757203286, "grad_norm": 7.19024133682251, "learning_rate": 1.8877551020408163e-06, "loss": 4.5164, "step": 75 }, { "epoch": 0.06385186367627105, "grad_norm": 4.210382461547852, "learning_rate": 2.5255102040816328e-06, "loss": 4.4614, "step": 100 }, { "epoch": 0.07981482959533881, "grad_norm": 5.846686363220215, "learning_rate": 3.1632653061224496e-06, "loss": 4.3782, "step": 125 }, { "epoch": 0.09577779551440657, "grad_norm": 4.44492769241333, "learning_rate": 3.8010204081632656e-06, "loss": 4.3687, "step": 150 }, { "epoch": 0.11174076143347433, "grad_norm": 3.783334493637085, "learning_rate": 4.438775510204082e-06, "loss": 4.3127, "step": 175 }, { "epoch": 0.1277037273525421, "grad_norm": 4.701773166656494, "learning_rate": 5.0765306122448985e-06, "loss": 4.3091, "step": 200 }, { "epoch": 0.14366669327160986, "grad_norm": 5.687783241271973, "learning_rate": 5.7142857142857145e-06, "loss": 4.3117, "step": 225 }, { "epoch": 0.15962965919067762, "grad_norm": 3.9771533012390137, "learning_rate": 6.352040816326531e-06, "loss": 4.2744, "step": 250 }, { "epoch": 0.17559262510974538, "grad_norm": 4.070949077606201, "learning_rate": 6.989795918367348e-06, "loss": 4.3154, "step": 275 }, { "epoch": 0.19155559102881314, "grad_norm": 4.269972801208496, "learning_rate": 7.627551020408163e-06, "loss": 4.2603, 
"step": 300 }, { "epoch": 0.2075185569478809, "grad_norm": 3.923750400543213, "learning_rate": 8.26530612244898e-06, "loss": 4.2725, "step": 325 }, { "epoch": 0.22348152286694867, "grad_norm": 3.631035327911377, "learning_rate": 8.903061224489795e-06, "loss": 4.2377, "step": 350 }, { "epoch": 0.23944448878601643, "grad_norm": 3.9385716915130615, "learning_rate": 9.540816326530612e-06, "loss": 4.2813, "step": 375 }, { "epoch": 0.2554074547050842, "grad_norm": 4.328210353851318, "learning_rate": 1.0178571428571429e-05, "loss": 4.273, "step": 400 }, { "epoch": 0.27137042062415195, "grad_norm": 3.800662040710449, "learning_rate": 1.0816326530612246e-05, "loss": 4.2285, "step": 425 }, { "epoch": 0.2873333865432197, "grad_norm": 4.066190719604492, "learning_rate": 1.1454081632653063e-05, "loss": 4.2081, "step": 450 }, { "epoch": 0.3032963524622875, "grad_norm": 3.563178300857544, "learning_rate": 1.2091836734693878e-05, "loss": 4.2176, "step": 475 }, { "epoch": 0.31925931838135524, "grad_norm": 3.9187610149383545, "learning_rate": 1.2729591836734697e-05, "loss": 4.2281, "step": 500 }, { "epoch": 0.31925931838135524, "eval_loss": 4.076858043670654, "eval_runtime": 58.2059, "eval_samples_per_second": 53.809, "eval_steps_per_second": 26.904, "step": 500 }, { "epoch": 0.335222284300423, "grad_norm": 3.895627498626709, "learning_rate": 1.3367346938775512e-05, "loss": 4.2277, "step": 525 }, { "epoch": 0.35118525021949076, "grad_norm": 4.739663600921631, "learning_rate": 1.4005102040816327e-05, "loss": 4.216, "step": 550 }, { "epoch": 0.3671482161385585, "grad_norm": 4.143375396728516, "learning_rate": 1.4642857142857144e-05, "loss": 4.1842, "step": 575 }, { "epoch": 0.3831111820576263, "grad_norm": 4.17396879196167, "learning_rate": 1.528061224489796e-05, "loss": 4.2185, "step": 600 }, { "epoch": 0.39907414797669405, "grad_norm": 4.158074855804443, "learning_rate": 1.5918367346938776e-05, "loss": 4.1617, "step": 625 }, { "epoch": 0.4150371138957618, "grad_norm": 
3.739445924758911, "learning_rate": 1.655612244897959e-05, "loss": 4.2013, "step": 650 }, { "epoch": 0.4310000798148296, "grad_norm": 3.669919729232788, "learning_rate": 1.719387755102041e-05, "loss": 4.1807, "step": 675 }, { "epoch": 0.44696304573389734, "grad_norm": 3.9435887336730957, "learning_rate": 1.7831632653061225e-05, "loss": 4.168, "step": 700 }, { "epoch": 0.4629260116529651, "grad_norm": 4.32051420211792, "learning_rate": 1.8469387755102043e-05, "loss": 4.1481, "step": 725 }, { "epoch": 0.47888897757203286, "grad_norm": 3.661527633666992, "learning_rate": 1.910714285714286e-05, "loss": 4.1624, "step": 750 }, { "epoch": 0.4948519434911006, "grad_norm": 3.5897233486175537, "learning_rate": 1.9744897959183677e-05, "loss": 4.1445, "step": 775 }, { "epoch": 0.5108149094101684, "grad_norm": 4.591588973999023, "learning_rate": 1.9999776668891292e-05, "loss": 4.1601, "step": 800 }, { "epoch": 0.5267778753292361, "grad_norm": 4.352611541748047, "learning_rate": 1.9998411903795984e-05, "loss": 4.1446, "step": 825 }, { "epoch": 0.5427408412483039, "grad_norm": 3.704148292541504, "learning_rate": 1.9995806615567444e-05, "loss": 4.1216, "step": 850 }, { "epoch": 0.5587038071673717, "grad_norm": 3.941983699798584, "learning_rate": 1.999196112744904e-05, "loss": 4.1799, "step": 875 }, { "epoch": 0.5746667730864394, "grad_norm": 4.044922828674316, "learning_rate": 1.9986875916558237e-05, "loss": 4.11, "step": 900 }, { "epoch": 0.5906297390055072, "grad_norm": 3.791494131088257, "learning_rate": 1.9980551613827405e-05, "loss": 4.1668, "step": 925 }, { "epoch": 0.606592704924575, "grad_norm": 3.999377965927124, "learning_rate": 1.9972989003925544e-05, "loss": 4.1645, "step": 950 }, { "epoch": 0.6225556708436427, "grad_norm": 3.6708574295043945, "learning_rate": 1.996418902516092e-05, "loss": 4.1543, "step": 975 }, { "epoch": 0.6385186367627105, "grad_norm": 3.741013526916504, "learning_rate": 1.995415276936465e-05, "loss": 4.1077, "step": 1000 }, { "epoch": 
0.6385186367627105, "eval_loss": 4.022159099578857, "eval_runtime": 57.4323, "eval_samples_per_second": 54.534, "eval_steps_per_second": 27.267, "step": 1000 }, { "epoch": 0.6544816026817782, "grad_norm": 3.789487838745117, "learning_rate": 1.9942881481755233e-05, "loss": 4.1637, "step": 1025 }, { "epoch": 0.670444568600846, "grad_norm": 3.9883713722229004, "learning_rate": 1.9930376560784057e-05, "loss": 4.1303, "step": 1050 }, { "epoch": 0.6864075345199138, "grad_norm": 3.6000473499298096, "learning_rate": 1.9916639557961895e-05, "loss": 4.0924, "step": 1075 }, { "epoch": 0.7023705004389815, "grad_norm": 3.537168264389038, "learning_rate": 1.99016721776664e-05, "loss": 4.1139, "step": 1100 }, { "epoch": 0.7183334663580493, "grad_norm": 3.460998773574829, "learning_rate": 1.9885476276930628e-05, "loss": 4.0959, "step": 1125 }, { "epoch": 0.734296432277117, "grad_norm": 3.44901180267334, "learning_rate": 1.9868053865212658e-05, "loss": 4.1265, "step": 1150 }, { "epoch": 0.7502593981961848, "grad_norm": 3.4085631370544434, "learning_rate": 1.9849407104146254e-05, "loss": 4.1237, "step": 1175 }, { "epoch": 0.7662223641152526, "grad_norm": 3.4791786670684814, "learning_rate": 1.982953830727268e-05, "loss": 4.0928, "step": 1200 }, { "epoch": 0.7821853300343203, "grad_norm": 3.5405592918395996, "learning_rate": 1.9808449939753635e-05, "loss": 4.1219, "step": 1225 }, { "epoch": 0.7981482959533881, "grad_norm": 3.8546345233917236, "learning_rate": 1.9786144618065414e-05, "loss": 4.112, "step": 1250 }, { "epoch": 0.8141112618724559, "grad_norm": 3.500869035720825, "learning_rate": 1.976262510967428e-05, "loss": 4.1272, "step": 1275 }, { "epoch": 0.8300742277915236, "grad_norm": 3.4084880352020264, "learning_rate": 1.973789433269308e-05, "loss": 4.1496, "step": 1300 }, { "epoch": 0.8460371937105914, "grad_norm": 3.3561558723449707, "learning_rate": 1.97119553555192e-05, "loss": 4.0731, "step": 1325 }, { "epoch": 0.8620001596296591, "grad_norm": 3.4307825565338135, 
"learning_rate": 1.9684811396453857e-05, "loss": 4.1499, "step": 1350 }, { "epoch": 0.8779631255487269, "grad_norm": 3.2400906085968018, "learning_rate": 1.9656465823302806e-05, "loss": 4.1142, "step": 1375 }, { "epoch": 0.8939260914677947, "grad_norm": 3.5277132987976074, "learning_rate": 1.962692215295849e-05, "loss": 4.138, "step": 1400 }, { "epoch": 0.9098890573868624, "grad_norm": 2.917336940765381, "learning_rate": 1.959618405096368e-05, "loss": 4.1087, "step": 1425 }, { "epoch": 0.9258520233059302, "grad_norm": 3.4995276927948, "learning_rate": 1.956425533105669e-05, "loss": 4.129, "step": 1450 }, { "epoch": 0.941814989224998, "grad_norm": 3.203911304473877, "learning_rate": 1.953113995469821e-05, "loss": 4.0769, "step": 1475 }, { "epoch": 0.9577779551440657, "grad_norm": 3.1379663944244385, "learning_rate": 1.949684203057978e-05, "loss": 4.0667, "step": 1500 }, { "epoch": 0.9577779551440657, "eval_loss": 3.9897916316986084, "eval_runtime": 57.0506, "eval_samples_per_second": 54.899, "eval_steps_per_second": 27.449, "step": 1500 }, { "epoch": 0.9737409210631335, "grad_norm": 3.1570990085601807, "learning_rate": 1.9461365814114032e-05, "loss": 4.08, "step": 1525 }, { "epoch": 0.9897038869822012, "grad_norm": 3.5567002296447754, "learning_rate": 1.9424715706906703e-05, "loss": 4.0888, "step": 1550 }, { "epoch": 1.0051081490941016, "grad_norm": 2.8667120933532715, "learning_rate": 1.938689625621052e-05, "loss": 4.1116, "step": 1575 }, { "epoch": 1.0210711150131695, "grad_norm": 2.864108085632324, "learning_rate": 1.9347912154361022e-05, "loss": 4.1135, "step": 1600 }, { "epoch": 1.0370340809322371, "grad_norm": 3.1839237213134766, "learning_rate": 1.9307768238194363e-05, "loss": 4.0433, "step": 1625 }, { "epoch": 1.052997046851305, "grad_norm": 3.019927740097046, "learning_rate": 1.9266469488447198e-05, "loss": 4.094, "step": 1650 }, { "epoch": 1.0689600127703727, "grad_norm": 3.2604196071624756, "learning_rate": 1.9224021029138714e-05, "loss": 4.0495, "step": 
1675 }, { "epoch": 1.0849229786894405, "grad_norm": 3.479423761367798, "learning_rate": 1.9180428126934877e-05, "loss": 4.0804, "step": 1700 }, { "epoch": 1.1008859446085082, "grad_norm": 3.250142812728882, "learning_rate": 1.9135696190495002e-05, "loss": 4.0409, "step": 1725 }, { "epoch": 1.116848910527576, "grad_norm": 3.3738601207733154, "learning_rate": 1.9089830769800673e-05, "loss": 4.0571, "step": 1750 }, { "epoch": 1.1328118764466437, "grad_norm": 3.4340453147888184, "learning_rate": 1.904283755546716e-05, "loss": 4.0253, "step": 1775 }, { "epoch": 1.1487748423657116, "grad_norm": 3.3150882720947266, "learning_rate": 1.8994722378037343e-05, "loss": 4.0714, "step": 1800 }, { "epoch": 1.1647378082847792, "grad_norm": 3.114799737930298, "learning_rate": 1.8945491207258356e-05, "loss": 4.07, "step": 1825 }, { "epoch": 1.1807007742038471, "grad_norm": 2.940286636352539, "learning_rate": 1.8895150151340855e-05, "loss": 4.0395, "step": 1850 }, { "epoch": 1.1966637401229148, "grad_norm": 3.1590492725372314, "learning_rate": 1.88437054562012e-05, "loss": 4.0437, "step": 1875 }, { "epoch": 1.2126267060419826, "grad_norm": 3.4203438758850098, "learning_rate": 1.879116350468648e-05, "loss": 4.0774, "step": 1900 }, { "epoch": 1.2285896719610503, "grad_norm": 3.203162908554077, "learning_rate": 1.8737530815782615e-05, "loss": 4.0435, "step": 1925 }, { "epoch": 1.2445526378801182, "grad_norm": 2.8159914016723633, "learning_rate": 1.8682814043805496e-05, "loss": 4.0611, "step": 1950 }, { "epoch": 1.2605156037991858, "grad_norm": 3.0390403270721436, "learning_rate": 1.8627019977575397e-05, "loss": 4.0507, "step": 1975 }, { "epoch": 1.2764785697182537, "grad_norm": 2.961411476135254, "learning_rate": 1.857015553957466e-05, "loss": 4.0905, "step": 2000 }, { "epoch": 1.2764785697182537, "eval_loss": 3.9738128185272217, "eval_runtime": 56.7214, "eval_samples_per_second": 55.217, "eval_steps_per_second": 27.609, "step": 2000 }, { "epoch": 1.2924415356373213, "grad_norm": 
2.742133378982544, "learning_rate": 1.851222778508881e-05, "loss": 4.0503, "step": 2025 }, { "epoch": 1.3084045015563892, "grad_norm": 2.9378607273101807, "learning_rate": 1.8453243901331194e-05, "loss": 4.0667, "step": 2050 }, { "epoch": 1.3243674674754569, "grad_norm": 2.7720441818237305, "learning_rate": 1.8393211206551256e-05, "loss": 4.0483, "step": 2075 }, { "epoch": 1.3403304333945247, "grad_norm": 2.855170726776123, "learning_rate": 1.8332137149126525e-05, "loss": 4.0447, "step": 2100 }, { "epoch": 1.3562933993135924, "grad_norm": 3.0880863666534424, "learning_rate": 1.827002930663851e-05, "loss": 4.0056, "step": 2125 }, { "epoch": 1.3722563652326603, "grad_norm": 3.391598701477051, "learning_rate": 1.82068953849325e-05, "loss": 4.0299, "step": 2150 }, { "epoch": 1.388219331151728, "grad_norm": 2.805473566055298, "learning_rate": 1.8142743217161517e-05, "loss": 4.0247, "step": 2175 }, { "epoch": 1.4041822970707958, "grad_norm": 3.1002068519592285, "learning_rate": 1.807758076281442e-05, "loss": 4.023, "step": 2200 }, { "epoch": 1.4201452629898634, "grad_norm": 3.025351047515869, "learning_rate": 1.8011416106728363e-05, "loss": 4.04, "step": 2225 }, { "epoch": 1.4361082289089313, "grad_norm": 2.6007015705108643, "learning_rate": 1.7944257458085693e-05, "loss": 4.0736, "step": 2250 }, { "epoch": 1.4520711948279992, "grad_norm": 2.8499741554260254, "learning_rate": 1.787611314939541e-05, "loss": 4.0532, "step": 2275 }, { "epoch": 1.4680341607470668, "grad_norm": 2.8128528594970703, "learning_rate": 1.780699163545936e-05, "loss": 4.0392, "step": 2300 }, { "epoch": 1.4839971266661345, "grad_norm": 3.1223385334014893, "learning_rate": 1.7736901492323195e-05, "loss": 4.0352, "step": 2325 }, { "epoch": 1.4999600925852024, "grad_norm": 3.0269346237182617, "learning_rate": 1.7665851416212365e-05, "loss": 4.0201, "step": 2350 }, { "epoch": 1.5159230585042702, "grad_norm": 2.9801249504089355, "learning_rate": 1.759385022245313e-05, "loss": 4.0303, "step": 2375 }, { 
"epoch": 1.5318860244233379, "grad_norm": 2.950418710708618, "learning_rate": 1.7520906844378834e-05, "loss": 3.9942, "step": 2400 }, { "epoch": 1.5478489903424055, "grad_norm": 2.954284191131592, "learning_rate": 1.7447030332221534e-05, "loss": 4.0157, "step": 2425 }, { "epoch": 1.5638119562614734, "grad_norm": 2.7167422771453857, "learning_rate": 1.7372229851989115e-05, "loss": 4.0339, "step": 2450 }, { "epoch": 1.5797749221805413, "grad_norm": 2.749498128890991, "learning_rate": 1.7296514684328043e-05, "loss": 4.0505, "step": 2475 }, { "epoch": 1.595737888099609, "grad_norm": 3.0270655155181885, "learning_rate": 1.7219894223371897e-05, "loss": 4.008, "step": 2500 }, { "epoch": 1.595737888099609, "eval_loss": 3.956145763397217, "eval_runtime": 56.6738, "eval_samples_per_second": 55.264, "eval_steps_per_second": 27.632, "step": 2500 }, { "epoch": 1.6117008540186766, "grad_norm": 2.99558162689209, "learning_rate": 1.7142377975575826e-05, "loss": 4.0452, "step": 2525 }, { "epoch": 1.6276638199377444, "grad_norm": 3.2427620887756348, "learning_rate": 1.706397555853706e-05, "loss": 3.9951, "step": 2550 }, { "epoch": 1.6436267858568123, "grad_norm": 2.987531900405884, "learning_rate": 1.698469669980162e-05, "loss": 4.0321, "step": 2575 }, { "epoch": 1.65958975177588, "grad_norm": 2.827646255493164, "learning_rate": 1.690455123565743e-05, "loss": 4.0141, "step": 2600 }, { "epoch": 1.6755527176949476, "grad_norm": 2.6261708736419678, "learning_rate": 1.68235491099139e-05, "loss": 4.021, "step": 2625 }, { "epoch": 1.6915156836140155, "grad_norm": 2.822793960571289, "learning_rate": 1.6741700372668153e-05, "loss": 4.0724, "step": 2650 }, { "epoch": 1.7074786495330834, "grad_norm": 2.6139183044433594, "learning_rate": 1.6659015179058132e-05, "loss": 4.0169, "step": 2675 }, { "epoch": 1.723441615452151, "grad_norm": 2.580967426300049, "learning_rate": 1.657550378800259e-05, "loss": 4.0331, "step": 2700 }, { "epoch": 1.7394045813712187, "grad_norm": 2.6083126068115234, 
"learning_rate": 1.6491176560928267e-05, "loss": 3.9969, "step": 2725 }, { "epoch": 1.7553675472902865, "grad_norm": 2.944607734680176, "learning_rate": 1.640604396048434e-05, "loss": 4.0018, "step": 2750 }, { "epoch": 1.7713305132093544, "grad_norm": 2.804600715637207, "learning_rate": 1.632011654924426e-05, "loss": 4.0159, "step": 2775 }, { "epoch": 1.787293479128422, "grad_norm": 2.5616648197174072, "learning_rate": 1.6233404988395272e-05, "loss": 4.0208, "step": 2800 }, { "epoch": 1.8032564450474897, "grad_norm": 2.834343671798706, "learning_rate": 1.6145920036415643e-05, "loss": 4.0495, "step": 2825 }, { "epoch": 1.8192194109665576, "grad_norm": 2.5377800464630127, "learning_rate": 1.6057672547739833e-05, "loss": 4.0262, "step": 2850 }, { "epoch": 1.8351823768856255, "grad_norm": 2.576991558074951, "learning_rate": 1.596867347141177e-05, "loss": 4.0126, "step": 2875 }, { "epoch": 1.8511453428046931, "grad_norm": 2.5933356285095215, "learning_rate": 1.587893384972638e-05, "loss": 4.0411, "step": 2900 }, { "epoch": 1.8671083087237608, "grad_norm": 2.572754144668579, "learning_rate": 1.5788464816859544e-05, "loss": 4.042, "step": 2925 }, { "epoch": 1.8830712746428286, "grad_norm": 2.584364175796509, "learning_rate": 1.5697277597486663e-05, "loss": 4.0151, "step": 2950 }, { "epoch": 1.8990342405618965, "grad_norm": 2.7844433784484863, "learning_rate": 1.560538350538998e-05, "loss": 4.0261, "step": 2975 }, { "epoch": 1.9149972064809642, "grad_norm": 2.6380529403686523, "learning_rate": 1.551279394205486e-05, "loss": 4.0071, "step": 3000 }, { "epoch": 1.9149972064809642, "eval_loss": 3.944791078567505, "eval_runtime": 56.7672, "eval_samples_per_second": 55.173, "eval_steps_per_second": 27.586, "step": 3000 }, { "epoch": 1.9309601724000318, "grad_norm": 2.678741216659546, "learning_rate": 1.5419520395255204e-05, "loss": 3.9901, "step": 3025 }, { "epoch": 1.9469231383190997, "grad_norm": 2.701401472091675, "learning_rate": 1.5325574437628107e-05, "loss": 4.0162, 
"step": 3050 }, { "epoch": 1.9628861042381676, "grad_norm": 2.727498769760132, "learning_rate": 1.5230967725238036e-05, "loss": 3.9851, "step": 3075 }, { "epoch": 1.9788490701572352, "grad_norm": 2.747472047805786, "learning_rate": 1.5135711996130624e-05, "loss": 4.0346, "step": 3100 }, { "epoch": 1.9948120360763029, "grad_norm": 2.5219974517822266, "learning_rate": 1.503981906887634e-05, "loss": 3.993, "step": 3125 }, { "epoch": 2.0102162981882032, "grad_norm": 2.9478681087493896, "learning_rate": 1.4943300841104094e-05, "loss": 4.024, "step": 3150 }, { "epoch": 2.026179264107271, "grad_norm": 2.684865713119507, "learning_rate": 1.4846169288025092e-05, "loss": 4.0307, "step": 3175 }, { "epoch": 2.042142230026339, "grad_norm": 2.6168887615203857, "learning_rate": 1.4748436460947064e-05, "loss": 3.953, "step": 3200 }, { "epoch": 2.058105195945407, "grad_norm": 2.68648099899292, "learning_rate": 1.4650114485779e-05, "loss": 4.0059, "step": 3225 }, { "epoch": 2.0740681618644743, "grad_norm": 3.017493963241577, "learning_rate": 1.4551215561526692e-05, "loss": 3.931, "step": 3250 }, { "epoch": 2.090031127783542, "grad_norm": 2.6104273796081543, "learning_rate": 1.4451751958779165e-05, "loss": 3.9746, "step": 3275 }, { "epoch": 2.10599409370261, "grad_norm": 2.6534171104431152, "learning_rate": 1.435173601818625e-05, "loss": 3.9614, "step": 3300 }, { "epoch": 2.121957059621678, "grad_norm": 2.456097364425659, "learning_rate": 1.4251180148927439e-05, "loss": 3.9736, "step": 3325 }, { "epoch": 2.1379200255407453, "grad_norm": 2.6180832386016846, "learning_rate": 1.4150096827172269e-05, "loss": 3.9916, "step": 3350 }, { "epoch": 2.153882991459813, "grad_norm": 2.7971746921539307, "learning_rate": 1.4048498594532369e-05, "loss": 3.9433, "step": 3375 }, { "epoch": 2.169845957378881, "grad_norm": 3.2284584045410156, "learning_rate": 1.3946398056505407e-05, "loss": 3.9567, "step": 3400 }, { "epoch": 2.185808923297949, "grad_norm": 2.5488057136535645, "learning_rate": 
1.3843807880911082e-05, "loss": 3.9326, "step": 3425 }, { "epoch": 2.2017718892170164, "grad_norm": 2.8564059734344482, "learning_rate": 1.3740740796319424e-05, "loss": 3.9668, "step": 3450 }, { "epoch": 2.2177348551360843, "grad_norm": 2.912123441696167, "learning_rate": 1.3637209590471521e-05, "loss": 3.98, "step": 3475 }, { "epoch": 2.233697821055152, "grad_norm": 2.7654449939727783, "learning_rate": 1.3533227108692916e-05, "loss": 3.9696, "step": 3500 }, { "epoch": 2.233697821055152, "eval_loss": 3.9376115798950195, "eval_runtime": 56.7367, "eval_samples_per_second": 55.202, "eval_steps_per_second": 27.601, "step": 3500 }, { "epoch": 2.24966078697422, "grad_norm": 2.5010123252868652, "learning_rate": 1.3428806252299877e-05, "loss": 3.9693, "step": 3525 }, { "epoch": 2.2656237528932874, "grad_norm": 2.5059149265289307, "learning_rate": 1.3323959976998689e-05, "loss": 3.9357, "step": 3550 }, { "epoch": 2.2815867188123553, "grad_norm": 2.5838544368743896, "learning_rate": 1.3218701291278215e-05, "loss": 3.9711, "step": 3575 }, { "epoch": 2.297549684731423, "grad_norm": 2.6254265308380127, "learning_rate": 1.3113043254795922e-05, "loss": 3.9931, "step": 3600 }, { "epoch": 2.313512650650491, "grad_norm": 2.6003894805908203, "learning_rate": 1.300699897675752e-05, "loss": 3.988, "step": 3625 }, { "epoch": 2.3294756165695585, "grad_norm": 2.9631710052490234, "learning_rate": 1.2900581614290495e-05, "loss": 3.9856, "step": 3650 }, { "epoch": 2.3454385824886264, "grad_norm": 2.8343353271484375, "learning_rate": 1.2793804370811667e-05, "loss": 3.9814, "step": 3675 }, { "epoch": 2.3614015484076942, "grad_norm": 2.731628179550171, "learning_rate": 1.2686680494389018e-05, "loss": 3.9898, "step": 3700 }, { "epoch": 2.377364514326762, "grad_norm": 2.481356382369995, "learning_rate": 1.2579223276097986e-05, "loss": 4.0092, "step": 3725 }, { "epoch": 2.3933274802458295, "grad_norm": 2.6943631172180176, "learning_rate": 1.2471446048372401e-05, "loss": 3.9694, "step": 3750 }, { 
"epoch": 2.4092904461648974, "grad_norm": 2.6562039852142334, "learning_rate": 1.2363362183350309e-05, "loss": 3.9859, "step": 3775 }, { "epoch": 2.4252534120839653, "grad_norm": 2.579770565032959, "learning_rate": 1.2254985091214867e-05, "loss": 3.9835, "step": 3800 }, { "epoch": 2.441216378003033, "grad_norm": 2.8043432235717773, "learning_rate": 1.2146328218530503e-05, "loss": 3.9558, "step": 3825 }, { "epoch": 2.4571793439221006, "grad_norm": 3.0811917781829834, "learning_rate": 1.2037405046574598e-05, "loss": 4.0002, "step": 3850 }, { "epoch": 2.4731423098411685, "grad_norm": 2.6078948974609375, "learning_rate": 1.1928229089664802e-05, "loss": 3.9368, "step": 3875 }, { "epoch": 2.4891052757602363, "grad_norm": 2.4601407051086426, "learning_rate": 1.1818813893482321e-05, "loss": 3.9381, "step": 3900 }, { "epoch": 2.5050682416793038, "grad_norm": 2.945693016052246, "learning_rate": 1.1709173033391247e-05, "loss": 3.9915, "step": 3925 }, { "epoch": 2.5210312075983716, "grad_norm": 2.4278717041015625, "learning_rate": 1.1599320112754258e-05, "loss": 3.9684, "step": 3950 }, { "epoch": 2.5369941735174395, "grad_norm": 2.7524194717407227, "learning_rate": 1.1489268761244804e-05, "loss": 3.9619, "step": 3975 }, { "epoch": 2.5529571394365074, "grad_norm": 2.7142021656036377, "learning_rate": 1.1379032633156062e-05, "loss": 3.9984, "step": 4000 }, { "epoch": 2.5529571394365074, "eval_loss": 3.928858757019043, "eval_runtime": 56.5867, "eval_samples_per_second": 55.349, "eval_steps_per_second": 27.674, "step": 4000 }, { "epoch": 2.5689201053555752, "grad_norm": 2.842582941055298, "learning_rate": 1.1268625405706804e-05, "loss": 3.9872, "step": 4025 }, { "epoch": 2.5848830712746427, "grad_norm": 2.6133430004119873, "learning_rate": 1.1158060777344448e-05, "loss": 3.9276, "step": 4050 }, { "epoch": 2.6008460371937105, "grad_norm": 2.502631664276123, "learning_rate": 1.1047352466045458e-05, "loss": 3.9486, "step": 4075 }, { "epoch": 2.6168090031127784, "grad_norm": 
2.6683413982391357, "learning_rate": 1.0936514207613336e-05, "loss": 3.9574, "step": 4100 }, { "epoch": 2.632771969031846, "grad_norm": 2.6480071544647217, "learning_rate": 1.0825559753974385e-05, "loss": 3.9646, "step": 4125 }, { "epoch": 2.6487349349509137, "grad_norm": 2.713996171951294, "learning_rate": 1.0714502871471475e-05, "loss": 3.9898, "step": 4150 }, { "epoch": 2.6646979008699816, "grad_norm": 2.5960659980773926, "learning_rate": 1.0603357339156044e-05, "loss": 3.9893, "step": 4175 }, { "epoch": 2.6806608667890495, "grad_norm": 2.748525381088257, "learning_rate": 1.0492136947078474e-05, "loss": 3.931, "step": 4200 }, { "epoch": 2.6966238327081173, "grad_norm": 2.875739574432373, "learning_rate": 1.038085549457717e-05, "loss": 3.9918, "step": 4225 }, { "epoch": 2.7125867986271848, "grad_norm": 2.7640604972839355, "learning_rate": 1.0269526788566408e-05, "loss": 3.9917, "step": 4250 }, { "epoch": 2.7285497645462526, "grad_norm": 2.540008068084717, "learning_rate": 1.0158164641823312e-05, "loss": 3.9956, "step": 4275 }, { "epoch": 2.7445127304653205, "grad_norm": 3.028472661972046, "learning_rate": 1.004678287127406e-05, "loss": 3.9598, "step": 4300 }, { "epoch": 2.760475696384388, "grad_norm": 2.5849356651306152, "learning_rate": 9.935395296279605e-06, "loss": 3.905, "step": 4325 }, { "epoch": 2.776438662303456, "grad_norm": 2.5396087169647217, "learning_rate": 9.824015736921058e-06, "loss": 3.9343, "step": 4350 }, { "epoch": 2.7924016282225237, "grad_norm": 2.9541361331939697, "learning_rate": 9.712658012285015e-06, "loss": 3.9106, "step": 4375 }, { "epoch": 2.8083645941415916, "grad_norm": 2.7522456645965576, "learning_rate": 9.601335938749002e-06, "loss": 3.9761, "step": 4400 }, { "epoch": 2.8243275600606594, "grad_norm": 2.7509615421295166, "learning_rate": 9.490063328267235e-06, "loss": 3.9299, "step": 4425 }, { "epoch": 2.840290525979727, "grad_norm": 2.6110477447509766, "learning_rate": 9.378853986656951e-06, "loss": 3.9856, "step": 4450 }, { 
"epoch": 2.8562534918987947, "grad_norm": 2.560284376144409, "learning_rate": 9.267721711885486e-06, "loss": 4.0167, "step": 4475 }, { "epoch": 2.8722164578178626, "grad_norm": 2.499309539794922, "learning_rate": 9.15668029235835e-06, "loss": 3.9279, "step": 4500 }, { "epoch": 2.8722164578178626, "eval_loss": 3.921231746673584, "eval_runtime": 56.6026, "eval_samples_per_second": 55.333, "eval_steps_per_second": 27.667, "step": 4500 }, { "epoch": 2.8881794237369305, "grad_norm": 2.4418981075286865, "learning_rate": 9.045743505208442e-06, "loss": 3.9964, "step": 4525 }, { "epoch": 2.9041423896559984, "grad_norm": 2.7410218715667725, "learning_rate": 8.934925114586729e-06, "loss": 3.9709, "step": 4550 }, { "epoch": 2.920105355575066, "grad_norm": 2.3856561183929443, "learning_rate": 8.824238869954462e-06, "loss": 3.9915, "step": 4575 }, { "epoch": 2.9360683214941337, "grad_norm": 2.4345028400421143, "learning_rate": 8.713698504377294e-06, "loss": 3.9775, "step": 4600 }, { "epoch": 2.9520312874132015, "grad_norm": 2.6661508083343506, "learning_rate": 8.603317732821355e-06, "loss": 3.9766, "step": 4625 }, { "epoch": 2.967994253332269, "grad_norm": 2.3309764862060547, "learning_rate": 8.493110250451628e-06, "loss": 3.9815, "step": 4650 }, { "epoch": 2.983957219251337, "grad_norm": 2.508256196975708, "learning_rate": 8.38308973093275e-06, "loss": 4.0027, "step": 4675 }, { "epoch": 2.9999201851704047, "grad_norm": 2.6874983310699463, "learning_rate": 8.273269824732516e-06, "loss": 3.9101, "step": 4700 }, { "epoch": 3.015324447282305, "grad_norm": 2.7417962551116943, "learning_rate": 8.163664157428205e-06, "loss": 3.9155, "step": 4725 }, { "epoch": 3.031287413201373, "grad_norm": 2.679556131362915, "learning_rate": 8.054286328016055e-06, "loss": 3.9281, "step": 4750 }, { "epoch": 3.0472503791204404, "grad_norm": 2.6242339611053467, "learning_rate": 7.945149907223985e-06, "loss": 3.8998, "step": 4775 }, { "epoch": 3.0632133450395083, "grad_norm": 2.893059015274048, 
"learning_rate": 7.836268435827875e-06, "loss": 3.9077, "step": 4800 }, { "epoch": 3.079176310958576, "grad_norm": 2.602938652038574, "learning_rate": 7.727655422971514e-06, "loss": 3.936, "step": 4825 }, { "epoch": 3.095139276877644, "grad_norm": 2.4572951793670654, "learning_rate": 7.619324344490488e-06, "loss": 3.9821, "step": 4850 }, { "epoch": 3.1111022427967114, "grad_norm": 2.6016576290130615, "learning_rate": 7.511288641240227e-06, "loss": 3.967, "step": 4875 }, { "epoch": 3.1270652087157793, "grad_norm": 2.4637320041656494, "learning_rate": 7.4035617174283646e-06, "loss": 3.8937, "step": 4900 }, { "epoch": 3.143028174634847, "grad_norm": 2.449878215789795, "learning_rate": 7.2961569389516305e-06, "loss": 3.8913, "step": 4925 }, { "epoch": 3.158991140553915, "grad_norm": 2.3982160091400146, "learning_rate": 7.189087631737551e-06, "loss": 3.9394, "step": 4950 }, { "epoch": 3.1749541064729825, "grad_norm": 2.531035900115967, "learning_rate": 7.082367080091037e-06, "loss": 3.9346, "step": 4975 }, { "epoch": 3.1909170723920504, "grad_norm": 2.7257885932922363, "learning_rate": 6.976008525046211e-06, "loss": 3.9569, "step": 5000 }, { "epoch": 3.1909170723920504, "eval_loss": 3.918074369430542, "eval_runtime": 56.7976, "eval_samples_per_second": 55.143, "eval_steps_per_second": 27.572, "step": 5000 }, { "epoch": 3.2068800383111182, "grad_norm": 2.6535191535949707, "learning_rate": 6.870025162723538e-06, "loss": 3.9732, "step": 5025 }, { "epoch": 3.222843004230186, "grad_norm": 2.4435272216796875, "learning_rate": 6.764430142692564e-06, "loss": 3.946, "step": 5050 }, { "epoch": 3.2388059701492535, "grad_norm": 2.4859092235565186, "learning_rate": 6.659236566340422e-06, "loss": 3.9508, "step": 5075 }, { "epoch": 3.2547689360683214, "grad_norm": 2.4849720001220703, "learning_rate": 6.554457485246332e-06, "loss": 3.9247, "step": 5100 }, { "epoch": 3.2707319019873893, "grad_norm": 2.3787262439727783, "learning_rate": 6.4501058995622315e-06, "loss": 3.9676, "step": 
5125 }, { "epoch": 3.286694867906457, "grad_norm": 2.625030279159546, "learning_rate": 6.346194756399855e-06, "loss": 3.948, "step": 5150 }, { "epoch": 3.302657833825525, "grad_norm": 2.892484426498413, "learning_rate": 6.242736948224333e-06, "loss": 3.9499, "step": 5175 }, { "epoch": 3.3186207997445925, "grad_norm": 2.6197938919067383, "learning_rate": 6.139745311254621e-06, "loss": 3.8792, "step": 5200 }, { "epoch": 3.3345837656636603, "grad_norm": 2.62258243560791, "learning_rate": 6.037232623870869e-06, "loss": 3.9258, "step": 5225 }, { "epoch": 3.350546731582728, "grad_norm": 2.7659857273101807, "learning_rate": 5.9352116050289795e-06, "loss": 3.9113, "step": 5250 }, { "epoch": 3.3665096975017956, "grad_norm": 2.5385048389434814, "learning_rate": 5.833694912682553e-06, "loss": 3.9307, "step": 5275 }, { "epoch": 3.3824726634208635, "grad_norm": 2.4550514221191406, "learning_rate": 5.732695142212392e-06, "loss": 3.9651, "step": 5300 }, { "epoch": 3.3984356293399314, "grad_norm": 2.566981792449951, "learning_rate": 5.632224824863741e-06, "loss": 3.9423, "step": 5325 }, { "epoch": 3.4143985952589992, "grad_norm": 2.4866788387298584, "learning_rate": 5.5322964261915395e-06, "loss": 3.959, "step": 5350 }, { "epoch": 3.430361561178067, "grad_norm": 2.6524391174316406, "learning_rate": 5.432922344513785e-06, "loss": 3.932, "step": 5375 }, { "epoch": 3.4463245270971345, "grad_norm": 2.34717059135437, "learning_rate": 5.33411490937324e-06, "loss": 3.932, "step": 5400 }, { "epoch": 3.4622874930162024, "grad_norm": 2.664670705795288, "learning_rate": 5.2358863800076956e-06, "loss": 3.9499, "step": 5425 }, { "epoch": 3.4782504589352703, "grad_norm": 2.7321012020111084, "learning_rate": 5.13824894382893e-06, "loss": 3.9093, "step": 5450 }, { "epoch": 3.4942134248543377, "grad_norm": 2.481905221939087, "learning_rate": 5.041214714910599e-06, "loss": 3.8889, "step": 5475 }, { "epoch": 3.5101763907734056, "grad_norm": 2.6542272567749023, "learning_rate": 4.94479573248522e-06, 
"loss": 3.9598, "step": 5500 }, { "epoch": 3.5101763907734056, "eval_loss": 3.9154069423675537, "eval_runtime": 56.6262, "eval_samples_per_second": 55.31, "eval_steps_per_second": 27.655, "step": 5500 }, { "epoch": 3.5261393566924735, "grad_norm": 2.672531843185425, "learning_rate": 4.849003959450432e-06, "loss": 3.9798, "step": 5525 }, { "epoch": 3.5421023226115413, "grad_norm": 2.5921006202697754, "learning_rate": 4.753851280884745e-06, "loss": 3.9279, "step": 5550 }, { "epoch": 3.558065288530609, "grad_norm": 2.5902962684631348, "learning_rate": 4.659349502572923e-06, "loss": 3.9163, "step": 5575 }, { "epoch": 3.5740282544496766, "grad_norm": 2.5092830657958984, "learning_rate": 4.565510349541227e-06, "loss": 3.9075, "step": 5600 }, { "epoch": 3.5899912203687445, "grad_norm": 2.8172504901885986, "learning_rate": 4.472345464602664e-06, "loss": 3.8996, "step": 5625 }, { "epoch": 3.6059541862878124, "grad_norm": 2.6843042373657227, "learning_rate": 4.379866406912429e-06, "loss": 3.9726, "step": 5650 }, { "epoch": 3.62191715220688, "grad_norm": 2.3186111450195312, "learning_rate": 4.28808465053376e-06, "loss": 3.9418, "step": 5675 }, { "epoch": 3.6378801181259477, "grad_norm": 2.6534008979797363, "learning_rate": 4.197011583014312e-06, "loss": 3.9505, "step": 5700 }, { "epoch": 3.6538430840450156, "grad_norm": 2.683316230773926, "learning_rate": 4.106658503973273e-06, "loss": 3.9261, "step": 5725 }, { "epoch": 3.6698060499640834, "grad_norm": 2.754112958908081, "learning_rate": 4.017036623699415e-06, "loss": 3.8915, "step": 5750 }, { "epoch": 3.6857690158831513, "grad_norm": 2.441436767578125, "learning_rate": 3.9281570617602145e-06, "loss": 3.9543, "step": 5775 }, { "epoch": 3.7017319818022187, "grad_norm": 2.5779573917388916, "learning_rate": 3.840030845622196e-06, "loss": 3.921, "step": 5800 }, { "epoch": 3.7176949477212866, "grad_norm": 2.3267993927001953, "learning_rate": 3.752668909282762e-06, "loss": 3.9401, "step": 5825 }, { "epoch": 3.7336579136403545, 
"grad_norm": 3.016676902770996, "learning_rate": 3.6660820919135774e-06, "loss": 3.9456, "step": 5850 }, { "epoch": 3.749620879559422, "grad_norm": 2.6449170112609863, "learning_rate": 3.580281136515732e-06, "loss": 3.9071, "step": 5875 }, { "epoch": 3.76558384547849, "grad_norm": 2.576220750808716, "learning_rate": 3.495276688586835e-06, "loss": 3.9559, "step": 5900 }, { "epoch": 3.7815468113975577, "grad_norm": 2.5338728427886963, "learning_rate": 3.4110792948002093e-06, "loss": 3.9402, "step": 5925 }, { "epoch": 3.7975097773166255, "grad_norm": 2.4438517093658447, "learning_rate": 3.327699401696339e-06, "loss": 3.9362, "step": 5950 }, { "epoch": 3.8134727432356934, "grad_norm": 2.9886441230773926, "learning_rate": 3.245147354386753e-06, "loss": 3.9201, "step": 5975 }, { "epoch": 3.829435709154761, "grad_norm": 2.4011476039886475, "learning_rate": 3.163433395270481e-06, "loss": 3.9403, "step": 6000 }, { "epoch": 3.829435709154761, "eval_loss": 3.9126319885253906, "eval_runtime": 95.1462, "eval_samples_per_second": 32.918, "eval_steps_per_second": 16.459, "step": 6000 }, { "epoch": 3.8453986750738287, "grad_norm": 2.6228878498077393, "learning_rate": 3.082567662763264e-06, "loss": 3.9585, "step": 6025 }, { "epoch": 3.8613616409928966, "grad_norm": 2.369616985321045, "learning_rate": 3.0025601900396408e-06, "loss": 3.8973, "step": 6050 }, { "epoch": 3.877324606911964, "grad_norm": 2.92931866645813, "learning_rate": 2.923420903788151e-06, "loss": 3.9259, "step": 6075 }, { "epoch": 3.893287572831032, "grad_norm": 2.371095657348633, "learning_rate": 2.8451596229796763e-06, "loss": 3.9598, "step": 6100 }, { "epoch": 3.9092505387500998, "grad_norm": 2.415902853012085, "learning_rate": 2.767786057649183e-06, "loss": 3.9118, "step": 6125 }, { "epoch": 3.9252135046691676, "grad_norm": 2.60103440284729, "learning_rate": 2.6913098076909994e-06, "loss": 3.9444, "step": 6150 }, { "epoch": 3.9411764705882355, "grad_norm": 2.6399271488189697, "learning_rate": 
2.615740361667728e-06, "loss": 3.9276, "step": 6175 }, { "epoch": 3.957139436507303, "grad_norm": 2.461277723312378, "learning_rate": 2.541087095632965e-06, "loss": 3.9576, "step": 6200 }, { "epoch": 3.973102402426371, "grad_norm": 2.5564517974853516, "learning_rate": 2.467359271968016e-06, "loss": 3.9056, "step": 6225 }, { "epoch": 3.9890653683454387, "grad_norm": 2.6077992916107178, "learning_rate": 2.394566038232682e-06, "loss": 3.9513, "step": 6250 }, { "epoch": 4.004469630457339, "grad_norm": 2.6047403812408447, "learning_rate": 2.3227164260303148e-06, "loss": 3.9438, "step": 6275 }, { "epoch": 4.0204325963764065, "grad_norm": 2.5610029697418213, "learning_rate": 2.251819349887224e-06, "loss": 3.9488, "step": 6300 }, { "epoch": 4.036395562295475, "grad_norm": 2.5663247108459473, "learning_rate": 2.181883606146662e-06, "loss": 3.9083, "step": 6325 }, { "epoch": 4.052358528214542, "grad_norm": 2.7286441326141357, "learning_rate": 2.1129178718774222e-06, "loss": 3.9205, "step": 6350 }, { "epoch": 4.06832149413361, "grad_norm": 2.6049561500549316, "learning_rate": 2.044930703797272e-06, "loss": 3.9414, "step": 6375 }, { "epoch": 4.084284460052678, "grad_norm": 2.407318353652954, "learning_rate": 1.9779305372112943e-06, "loss": 3.9332, "step": 6400 }, { "epoch": 4.100247425971745, "grad_norm": 2.5431180000305176, "learning_rate": 1.911925684965309e-06, "loss": 3.9233, "step": 6425 }, { "epoch": 4.116210391890814, "grad_norm": 2.905609607696533, "learning_rate": 1.846924336414474e-06, "loss": 3.9074, "step": 6450 }, { "epoch": 4.132173357809881, "grad_norm": 2.6817057132720947, "learning_rate": 1.782934556407223e-06, "loss": 3.8939, "step": 6475 }, { "epoch": 4.148136323728949, "grad_norm": 2.507965564727783, "learning_rate": 1.7199642842846387e-06, "loss": 3.908, "step": 6500 }, { "epoch": 4.148136323728949, "eval_loss": 3.9117023944854736, "eval_runtime": 87.5275, "eval_samples_per_second": 35.783, "eval_steps_per_second": 17.892, "step": 6500 }, { "epoch": 
4.164099289648017, "grad_norm": 2.4522206783294678, "learning_rate": 1.6580213328954054e-06, "loss": 3.9176, "step": 6525 }, { "epoch": 4.180062255567084, "grad_norm": 2.804759979248047, "learning_rate": 1.5971133876264445e-06, "loss": 3.8951, "step": 6550 }, { "epoch": 4.196025221486152, "grad_norm": 2.3992278575897217, "learning_rate": 1.5372480054493921e-06, "loss": 3.8637, "step": 6575 }, { "epoch": 4.21198818740522, "grad_norm": 2.68453049659729, "learning_rate": 1.478432613982973e-06, "loss": 3.9023, "step": 6600 }, { "epoch": 4.2279511533242875, "grad_norm": 2.6714088916778564, "learning_rate": 1.4206745105714415e-06, "loss": 3.9102, "step": 6625 }, { "epoch": 4.243914119243356, "grad_norm": 2.663738965988159, "learning_rate": 1.363980861379196e-06, "loss": 3.939, "step": 6650 }, { "epoch": 4.259877085162423, "grad_norm": 2.4409677982330322, "learning_rate": 1.3083587005016563e-06, "loss": 3.8866, "step": 6675 }, { "epoch": 4.275840051081491, "grad_norm": 2.3599727153778076, "learning_rate": 1.253814929092515e-06, "loss": 3.9531, "step": 6700 }, { "epoch": 4.291803017000559, "grad_norm": 2.31123948097229, "learning_rate": 1.200356314507517e-06, "loss": 3.9207, "step": 6725 }, { "epoch": 4.307765982919626, "grad_norm": 2.5220460891723633, "learning_rate": 1.147989489464807e-06, "loss": 3.9423, "step": 6750 }, { "epoch": 4.323728948838694, "grad_norm": 2.528290033340454, "learning_rate": 1.096720951222e-06, "loss": 3.9337, "step": 6775 }, { "epoch": 4.339691914757762, "grad_norm": 2.48093843460083, "learning_rate": 1.0465570607700526e-06, "loss": 3.9179, "step": 6800 }, { "epoch": 4.35565488067683, "grad_norm": 2.3196213245391846, "learning_rate": 9.97504042044042e-07, "loss": 3.867, "step": 6825 }, { "epoch": 4.371617846595898, "grad_norm": 2.3795883655548096, "learning_rate": 9.495679811509483e-07, "loss": 3.9262, "step": 6850 }, { "epoch": 4.387580812514965, "grad_norm": 2.6998538970947266, "learning_rate": 9.027548256145402e-07, "loss": 3.9615, "step": 
6875 }, { "epoch": 4.403543778434033, "grad_norm": 2.639824867248535, "learning_rate": 8.570703836374561e-07, "loss": 3.9086, "step": 6900 }, { "epoch": 4.419506744353101, "grad_norm": 2.8238446712493896, "learning_rate": 8.125203233805634e-07, "loss": 3.9274, "step": 6925 }, { "epoch": 4.4354697102721685, "grad_norm": 2.5089192390441895, "learning_rate": 7.691101722597038e-07, "loss": 3.9387, "step": 6950 }, { "epoch": 4.451432676191236, "grad_norm": 2.6850743293762207, "learning_rate": 7.268453162598899e-07, "loss": 3.9135, "step": 6975 }, { "epoch": 4.467395642110304, "grad_norm": 2.6673882007598877, "learning_rate": 6.857309992670625e-07, "loss": 3.9344, "step": 7000 }, { "epoch": 4.467395642110304, "eval_loss": 3.9108645915985107, "eval_runtime": 88.1557, "eval_samples_per_second": 35.528, "eval_steps_per_second": 17.764, "step": 7000 }, { "epoch": 4.483358608029372, "grad_norm": 2.8991503715515137, "learning_rate": 6.457723224174606e-07, "loss": 3.9679, "step": 7025 }, { "epoch": 4.49932157394844, "grad_norm": 2.431222915649414, "learning_rate": 6.069742434647286e-07, "loss": 3.8838, "step": 7050 }, { "epoch": 4.515284539867507, "grad_norm": 2.5432920455932617, "learning_rate": 5.693415761647825e-07, "loss": 3.899, "step": 7075 }, { "epoch": 4.531247505786575, "grad_norm": 2.439650535583496, "learning_rate": 5.328789896785635e-07, "loss": 3.916, "step": 7100 }, { "epoch": 4.547210471705643, "grad_norm": 2.2136306762695312, "learning_rate": 4.97591007992726e-07, "loss": 3.8504, "step": 7125 }, { "epoch": 4.563173437624711, "grad_norm": 2.457735061645508, "learning_rate": 4.6348200935834586e-07, "loss": 3.9182, "step": 7150 }, { "epoch": 4.579136403543778, "grad_norm": 2.562631368637085, "learning_rate": 4.305562257476792e-07, "loss": 3.9066, "step": 7175 }, { "epoch": 4.595099369462846, "grad_norm": 2.6716058254241943, "learning_rate": 3.988177423291195e-07, "loss": 3.9478, "step": 7200 }, { "epoch": 4.611062335381914, "grad_norm": 2.4732768535614014, 
"learning_rate": 3.6827049696032233e-07, "loss": 3.8824, "step": 7225 }, { "epoch": 4.627025301300982, "grad_norm": 2.6977033615112305, "learning_rate": 3.3891827969964373e-07, "loss": 3.9434, "step": 7250 }, { "epoch": 4.6429882672200495, "grad_norm": 2.5639913082122803, "learning_rate": 3.107647323358842e-07, "loss": 3.9115, "step": 7275 }, { "epoch": 4.658951233139117, "grad_norm": 2.6874756813049316, "learning_rate": 2.8381334793645466e-07, "loss": 3.9292, "step": 7300 }, { "epoch": 4.674914199058185, "grad_norm": 2.7557287216186523, "learning_rate": 2.5806747041398403e-07, "loss": 3.9431, "step": 7325 }, { "epoch": 4.690877164977253, "grad_norm": 2.393786907196045, "learning_rate": 2.3353029411142926e-07, "loss": 3.8994, "step": 7350 }, { "epoch": 4.706840130896321, "grad_norm": 2.4166479110717773, "learning_rate": 2.1020486340574964e-07, "loss": 3.9305, "step": 7375 }, { "epoch": 4.7228030968153885, "grad_norm": 2.5869390964508057, "learning_rate": 1.8809407233018272e-07, "loss": 3.8979, "step": 7400 }, { "epoch": 4.738766062734456, "grad_norm": 2.8491010665893555, "learning_rate": 1.672006642151802e-07, "loss": 3.9278, "step": 7425 }, { "epoch": 4.754729028653524, "grad_norm": 2.427485704421997, "learning_rate": 1.4752723134803137e-07, "loss": 3.9174, "step": 7450 }, { "epoch": 4.770691994572592, "grad_norm": 2.5240070819854736, "learning_rate": 1.2907621465123587e-07, "loss": 3.9097, "step": 7475 }, { "epoch": 4.786654960491659, "grad_norm": 2.8449230194091797, "learning_rate": 1.1184990337965384e-07, "loss": 3.9645, "step": 7500 }, { "epoch": 4.786654960491659, "eval_loss": 3.910543441772461, "eval_runtime": 81.6041, "eval_samples_per_second": 38.38, "eval_steps_per_second": 19.19, "step": 7500 }, { "epoch": 4.802617926410727, "grad_norm": 2.9600746631622314, "learning_rate": 9.585043483647194e-08, "loss": 3.9351, "step": 7525 }, { "epoch": 4.818580892329795, "grad_norm": 2.489225149154663, "learning_rate": 8.107979410802769e-08, "loss": 3.9168, "step": 
7550 }, { "epoch": 4.834543858248862, "grad_norm": 2.658569574356079, "learning_rate": 6.753981381751096e-08, "loss": 3.9121, "step": 7575 }, { "epoch": 4.8505068241679306, "grad_norm": 2.549792766571045, "learning_rate": 5.523217389758695e-08, "loss": 3.9393, "step": 7600 }, { "epoch": 4.866469790086998, "grad_norm": 2.651370048522949, "learning_rate": 4.4158401381966255e-08, "loss": 3.9882, "step": 7625 }, { "epoch": 4.882432756006066, "grad_norm": 2.443312644958496, "learning_rate": 3.4319870215945297e-08, "loss": 3.9325, "step": 7650 }, { "epoch": 4.898395721925134, "grad_norm": 2.498542070388794, "learning_rate": 2.571780108592936e-08, "loss": 3.8985, "step": 7675 }, { "epoch": 4.914358687844201, "grad_norm": 2.87577748298645, "learning_rate": 1.8353261267988198e-08, "loss": 3.9177, "step": 7700 }, { "epoch": 4.9303216537632695, "grad_norm": 2.4187259674072266, "learning_rate": 1.2227164495431932e-08, "loss": 3.8866, "step": 7725 }, { "epoch": 4.946284619682337, "grad_norm": 2.483729839324951, "learning_rate": 7.3402708454450855e-09, "loss": 3.9082, "step": 7750 }, { "epoch": 4.962247585601405, "grad_norm": 2.356332778930664, "learning_rate": 3.6931866447798004e-09, "loss": 3.8904, "step": 7775 }, { "epoch": 4.978210551520473, "grad_norm": 2.570075273513794, "learning_rate": 1.2863643945282278e-09, "loss": 3.9291, "step": 7800 }, { "epoch": 4.99417351743954, "grad_norm": 2.4640989303588867, "learning_rate": 1.201027139852151e-10, "loss": 3.9131, "step": 7825 } ], "logging_steps": 25, "max_steps": 7835, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 
1.636798611456e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }