{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": 64.79862964893701, "learning_rate": 4.792332268370606e-06, "loss": 3.7986, "step": 5 }, { "epoch": 0.0032, "grad_norm": 29.290033796462424, "learning_rate": 9.584664536741212e-06, "loss": 3.304, "step": 10 }, { "epoch": 0.0048, "grad_norm": 10.895694829851312, "learning_rate": 1.4376996805111818e-05, "loss": 2.3345, "step": 15 }, { "epoch": 0.0064, "grad_norm": 4.22950366323865, "learning_rate": 1.9169329073482425e-05, "loss": 1.6672, "step": 20 }, { "epoch": 0.008, "grad_norm": 1.6360137415435663, "learning_rate": 2.3961661341853032e-05, "loss": 1.281, "step": 25 }, { "epoch": 0.0096, "grad_norm": 0.8534883766043228, "learning_rate": 2.8753993610223637e-05, "loss": 1.1186, "step": 30 }, { "epoch": 0.0112, "grad_norm": 0.42541063066843565, "learning_rate": 3.3546325878594245e-05, "loss": 1.0106, "step": 35 }, { "epoch": 0.0128, "grad_norm": 0.33403768880643325, "learning_rate": 3.833865814696485e-05, "loss": 1.0092, "step": 40 }, { "epoch": 0.0144, "grad_norm": 0.2385548037475669, "learning_rate": 4.313099041533546e-05, "loss": 0.9504, "step": 45 }, { "epoch": 0.016, "grad_norm": 0.21711175160204815, "learning_rate": 4.7923322683706065e-05, "loss": 0.9146, "step": 50 }, { "epoch": 0.0176, "grad_norm": 0.19324897304220315, "learning_rate": 5.2715654952076676e-05, "loss": 0.9051, "step": 55 }, { "epoch": 0.0192, "grad_norm": 0.16941355387097437, "learning_rate": 5.7507987220447274e-05, "loss": 0.9291, "step": 60 }, { "epoch": 0.0208, "grad_norm": 0.1520711327338434, "learning_rate": 6.230031948881788e-05, "loss": 0.8873, "step": 65 }, { "epoch": 0.0224, "grad_norm": 0.15291874280710463, "learning_rate": 6.709265175718849e-05, "loss": 0.8896, "step": 70 }, { "epoch": 0.024, "grad_norm": 0.14977669003307942, "learning_rate": 7.18849840255591e-05, "loss": 0.8982, "step": 75 }, { "epoch": 0.0256, "grad_norm": 0.14407867303158503, "learning_rate": 7.66773162939297e-05, "loss": 0.8826, "step": 80 }, { "epoch": 0.0272, "grad_norm": 0.127640372929375, "learning_rate": 8.14696485623003e-05, "loss": 0.8738, "step": 85 }, { "epoch": 0.0288, "grad_norm": 0.12916578728919045, "learning_rate": 8.626198083067092e-05, "loss": 0.8659, "step": 90 }, { "epoch": 0.0304, "grad_norm": 0.09674048749471525, "learning_rate": 9.105431309904153e-05, "loss": 0.8423, "step": 95 }, { "epoch": 0.032, "grad_norm": 0.09282410939453126, "learning_rate": 9.584664536741213e-05, "loss": 0.8458, "step": 100 }, { "epoch": 0.0336, "grad_norm": 0.10134540252174387, "learning_rate": 0.00010063897763578275, "loss": 0.8346, "step": 105 }, { "epoch": 0.0352, "grad_norm": 0.07389448471081606, "learning_rate": 0.00010543130990415335, "loss": 0.8338, "step": 110 }, { "epoch": 0.0368, "grad_norm": 0.08447006505296106, "learning_rate": 0.00011022364217252396, "loss": 0.8563, "step": 115 }, { "epoch": 0.0384, "grad_norm": 0.06958293644640014, "learning_rate": 0.00011501597444089455, "loss": 0.8441, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.0688195545868785, "learning_rate": 0.00011980830670926518, "loss": 0.8388, "step": 125 }, { "epoch": 0.0416, "grad_norm": 0.06613022535500636, "learning_rate": 0.00012460063897763577, "loss": 0.8388, "step": 130 }, { "epoch": 0.0432, "grad_norm": 0.05884003981608602, "learning_rate": 0.00012939297124600637, "loss": 0.849, "step": 135 }, { "epoch": 0.0448, "grad_norm": 0.0675868331017935, "learning_rate": 0.00013418530351437698, "loss": 0.8157, "step": 140 }, { "epoch": 0.0464, "grad_norm": 0.06520369147774627, "learning_rate": 0.0001389776357827476, "loss": 0.8377, "step": 145 }, { "epoch": 0.048, "grad_norm": 0.05591417905435732, "learning_rate": 0.0001437699680511182, "loss": 0.8346, "step": 150 }, { "epoch": 0.0496, "grad_norm": 0.06885094447020612, "learning_rate": 0.0001485623003194888, "loss": 0.8325, "step": 155 }, { "epoch": 0.0512, "grad_norm": 0.07693195506697653, "learning_rate": 0.0001533546325878594, "loss": 0.8261, "step": 160 }, { "epoch": 0.0528, "grad_norm": 0.06096756973358526, "learning_rate": 0.00015814696485623, "loss": 0.8248, "step": 165 }, { "epoch": 0.0544, "grad_norm": 0.06475575830743056, "learning_rate": 0.0001629392971246006, "loss": 0.855, "step": 170 }, { "epoch": 0.056, "grad_norm": 0.056612739195628536, "learning_rate": 0.00016773162939297124, "loss": 0.821, "step": 175 }, { "epoch": 0.0576, "grad_norm": 0.06071836821069472, "learning_rate": 0.00017252396166134184, "loss": 0.8293, "step": 180 }, { "epoch": 0.0592, "grad_norm": 0.06751182828614498, "learning_rate": 0.00017731629392971245, "loss": 0.8155, "step": 185 }, { "epoch": 0.0608, "grad_norm": 0.05955983363606016, "learning_rate": 0.00018210862619808305, "loss": 0.7971, "step": 190 }, { "epoch": 0.0624, "grad_norm": 0.07052819972972657, "learning_rate": 0.00018690095846645365, "loss": 0.8288, "step": 195 }, { "epoch": 0.064, "grad_norm": 0.05923622995995164, "learning_rate": 0.00019169329073482426, "loss": 0.8184, "step": 200 }, { "epoch": 0.0656, "grad_norm": 0.06963646035027679, "learning_rate": 0.00019648562300319486, "loss": 0.8449, "step": 205 }, { "epoch": 0.0672, "grad_norm": 0.06860014085323075, "learning_rate": 0.0002012779552715655, "loss": 0.8341, "step": 210 }, { "epoch": 0.0688, "grad_norm": 0.06941082488577938, "learning_rate": 0.0002060702875399361, "loss": 0.8383, "step": 215 }, { "epoch": 0.0704, "grad_norm": 0.06616271548096953, "learning_rate": 0.0002108626198083067, "loss": 0.804, "step": 220 }, { "epoch": 0.072, "grad_norm": 0.07107556899925888, "learning_rate": 0.0002156549520766773, "loss": 0.8156, "step": 225 }, { "epoch": 0.0736, "grad_norm": 0.06995785055398655, "learning_rate": 0.0002204472843450479, "loss": 0.8049, "step": 230 }, { "epoch": 0.0752, "grad_norm": 0.059688190721088996, "learning_rate": 0.00022523961661341852, "loss": 0.8209, "step": 235 }, { "epoch": 0.0768, "grad_norm": 0.06833286026364968, "learning_rate": 0.0002300319488817891, "loss": 0.8233, "step": 240 }, { "epoch": 0.0784, "grad_norm": 0.08297292157685465, "learning_rate": 0.0002348242811501597, "loss": 0.8359, "step": 245 }, { "epoch": 0.08, "grad_norm": 0.06316799845757914, "learning_rate": 0.00023961661341853036, "loss": 0.831, "step": 250 }, { "epoch": 0.0816, "grad_norm": 0.07179318857392636, "learning_rate": 0.00024440894568690096, "loss": 0.8359, "step": 255 }, { "epoch": 0.0832, "grad_norm": 0.06293057603954971, "learning_rate": 0.00024920127795527154, "loss": 0.8242, "step": 260 }, { "epoch": 0.0848, "grad_norm": 0.06494498769023198, "learning_rate": 0.00025399361022364217, "loss": 0.8131, "step": 265 }, { "epoch": 0.0864, "grad_norm": 0.06254880531963027, "learning_rate": 0.00025878594249201275, "loss": 0.8133, "step": 270 }, { "epoch": 0.088, "grad_norm": 0.05791128690870581, "learning_rate": 0.0002635782747603834, "loss": 0.7985, "step": 275 }, { "epoch": 0.0896, "grad_norm": 0.05968336480331785, "learning_rate": 0.00026837060702875396, "loss": 0.8259, "step": 280 }, { "epoch": 0.0912, "grad_norm": 0.05507468875480531, "learning_rate": 0.00027316293929712453, "loss": 0.8044, "step": 285 }, { "epoch": 0.0928, "grad_norm": 0.07304020634733059, "learning_rate": 0.0002779552715654952, "loss": 0.8182, "step": 290 }, { "epoch": 0.0944, "grad_norm": 0.06792046663527374, "learning_rate": 0.0002827476038338658, "loss": 0.8342, "step": 295 }, { "epoch": 0.096, "grad_norm": 0.060175207345708116, "learning_rate": 0.0002875399361022364, "loss": 0.8122, "step": 300 }, { "epoch": 0.0976, "grad_norm": 0.07531435295140049, "learning_rate": 0.000292332268370607, "loss": 0.822, "step": 305 }, { "epoch": 0.0992, "grad_norm": 0.05753958870761837, "learning_rate": 0.0002971246006389776, "loss": 0.8364, "step": 310 }, { "epoch": 0.1008, "grad_norm": 0.05265101912795107, "learning_rate": 0.00029999962555314543, "loss": 0.8159, "step": 315 }, { "epoch": 0.1024, "grad_norm": 0.04957141033828283, "learning_rate": 0.00029999541304750175, "loss": 0.8005, "step": 320 }, { "epoch": 0.104, "grad_norm": 0.0509283015803962, "learning_rate": 0.00029998652010953066, "loss": 0.8268, "step": 325 }, { "epoch": 0.1056, "grad_norm": 0.06390145591389865, "learning_rate": 0.000299972947016726, "loss": 0.8203, "step": 330 }, { "epoch": 0.1072, "grad_norm": 0.05898681941656638, "learning_rate": 0.00029995469419262033, "loss": 0.8276, "step": 335 }, { "epoch": 0.1088, "grad_norm": 0.06512724796221968, "learning_rate": 0.00029993176220677174, "loss": 0.8122, "step": 340 }, { "epoch": 0.1104, "grad_norm": 0.05551305751170922, "learning_rate": 0.0002999041517747462, "loss": 0.8018, "step": 345 }, { "epoch": 0.112, "grad_norm": 0.04874158533755733, "learning_rate": 0.0002998718637580951, "loss": 0.8192, "step": 350 }, { "epoch": 0.1136, "grad_norm": 0.05538536507227949, "learning_rate": 0.00029983489916432846, "loss": 0.8135, "step": 355 }, { "epoch": 0.1152, "grad_norm": 0.06949832683320262, "learning_rate": 0.00029979325914688344, "loss": 0.8094, "step": 360 }, { "epoch": 0.1168, "grad_norm": 0.06096787363526232, "learning_rate": 0.0002997469450050883, "loss": 0.8196, "step": 365 }, { "epoch": 0.1184, "grad_norm": 0.05591490066156486, "learning_rate": 0.00029969595818412183, "loss": 0.8093, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.0551286129224898, "learning_rate": 0.0002996403002749686, "loss": 0.8057, "step": 375 }, { "epoch": 0.1216, "grad_norm": 0.050504199679135255, "learning_rate": 0.0002995799730143689, "loss": 0.8253, "step": 380 }, { "epoch": 0.1232, "grad_norm": 0.054113844131886295, "learning_rate": 0.0002995149782847646, "loss": 0.7952, "step": 385 }, { "epoch": 0.1248, "grad_norm": 0.059361929440663486, "learning_rate": 0.0002994453181142407, "loss": 0.8148, "step": 390 }, { "epoch": 0.1264, "grad_norm": 0.05484005641635372, "learning_rate": 0.00029937099467646167, "loss": 0.8025, "step": 395 }, { "epoch": 0.128, "grad_norm": 0.046303100449141, "learning_rate": 0.000299292010290604, "loss": 0.8043, "step": 400 }, { "epoch": 0.1296, "grad_norm": 0.0527166773177604, "learning_rate": 0.0002992083674212835, "loss": 0.8115, "step": 405 }, { "epoch": 0.1312, "grad_norm": 0.04940671723136873, "learning_rate": 0.0002991200686784785, "loss": 0.7842, "step": 410 }, { "epoch": 0.1328, "grad_norm": 0.05682145984208437, "learning_rate": 0.0002990271168174487, "loss": 0.8118, "step": 415 }, { "epoch": 0.1344, "grad_norm": 0.04926281134667431, "learning_rate": 0.00029892951473864865, "loss": 0.7986, "step": 420 }, { "epoch": 0.136, "grad_norm": 0.05053759860453711, "learning_rate": 0.0002988272654876376, "loss": 0.8055, "step": 425 }, { "epoch": 0.1376, "grad_norm": 0.05587322290343626, "learning_rate": 0.00029872037225498453, "loss": 0.8104, "step": 430 }, { "epoch": 0.1392, "grad_norm": 0.04968986730979809, "learning_rate": 0.0002986088383761684, "loss": 0.8106, "step": 435 }, { "epoch": 0.1408, "grad_norm": 0.05709843408250463, "learning_rate": 0.0002984926673314741, "loss": 0.7998, "step": 440 }, { "epoch": 0.1424, "grad_norm": 0.05115846157311485, "learning_rate": 0.0002983718627458838, "loss": 0.7888, "step": 445 }, { "epoch": 0.144, "grad_norm": 0.04934224261471247, "learning_rate": 0.0002982464283889642, "loss": 0.7981, "step": 450 }, { "epoch": 0.1456, "grad_norm": 0.046214903923842345, "learning_rate": 0.0002981163681747483, "loss": 0.7939, "step": 455 }, { "epoch": 0.1472, "grad_norm": 0.04829372890505013, "learning_rate": 0.0002979816861616138, "loss": 0.8021, "step": 460 }, { "epoch": 0.1488, "grad_norm": 0.052778074611499734, "learning_rate": 0.00029784238655215626, "loss": 0.8089, "step": 465 }, { "epoch": 0.1504, "grad_norm": 0.05413941613875908, "learning_rate": 0.0002976984736930578, "loss": 0.8022, "step": 470 }, { "epoch": 0.152, "grad_norm": 0.05129352790614527, "learning_rate": 0.0002975499520749518, "loss": 0.8, "step": 475 }, { "epoch": 0.1536, "grad_norm": 0.05179202869475361, "learning_rate": 0.00029739682633228245, "loss": 0.809, "step": 480 }, { "epoch": 0.1552, "grad_norm": 0.04635356891083487, "learning_rate": 0.0002972391012431605, "loss": 0.807, "step": 485 }, { "epoch": 0.1568, "grad_norm": 0.04999190557107593, "learning_rate": 0.00029707678172921366, "loss": 0.7911, "step": 490 }, { "epoch": 0.1584, "grad_norm": 0.05456560092214985, "learning_rate": 0.0002969098728554336, "loss": 0.7817, "step": 495 }, { "epoch": 0.16, "grad_norm": 0.05317881843455394, "learning_rate": 0.00029673837983001754, "loss": 0.7886, "step": 500 }, { "epoch": 0.1616, "grad_norm": 0.054628625112347194, "learning_rate": 0.0002965623080042057, "loss": 0.7797, "step": 505 }, { "epoch": 0.1632, "grad_norm": 0.0454818393425224, "learning_rate": 0.00029638166287211453, "loss": 0.7794, "step": 510 }, { "epoch": 0.1648, "grad_norm": 0.045903405195861856, "learning_rate": 0.00029619645007056527, "loss": 0.7823, "step": 515 }, { "epoch": 0.1664, "grad_norm": 0.05778313013653165, "learning_rate": 0.0002960066753789077, "loss": 0.7938, "step": 520 }, { "epoch": 0.168, "grad_norm": 0.04794213698716533, "learning_rate": 0.00029581234471884045, "loss": 0.7921, "step": 525 }, { "epoch": 0.1696, "grad_norm": 0.04682149018802875, "learning_rate": 0.00029561346415422554, "loss": 0.7728, "step": 530 }, { "epoch": 0.1712, "grad_norm": 0.047089674218094714, "learning_rate": 0.0002954100398908995, "loss": 0.7966, "step": 535 }, { "epoch": 0.1728, "grad_norm": 0.044859883579268366, "learning_rate": 0.00029520207827647997, "loss": 0.788, "step": 540 }, { "epoch": 0.1744, "grad_norm": 0.05495628492120463, "learning_rate": 0.0002949895858001669, "loss": 0.7903, "step": 545 }, { "epoch": 0.176, "grad_norm": 0.06013853184360737, "learning_rate": 0.0002947725690925409, "loss": 0.7977, "step": 550 }, { "epoch": 0.1776, "grad_norm": 0.048624933325734164, "learning_rate": 0.00029455103492535573, "loss": 0.7928, "step": 555 }, { "epoch": 0.1792, "grad_norm": 0.047300655523156064, "learning_rate": 0.00029432499021132737, "loss": 0.7906, "step": 560 }, { "epoch": 0.1808, "grad_norm": 0.056115287002102256, "learning_rate": 0.00029409444200391807, "loss": 0.786, "step": 565 }, { "epoch": 0.1824, "grad_norm": 0.049117514914560706, "learning_rate": 0.0002938593974971163, "loss": 0.7908, "step": 570 }, { "epoch": 0.184, "grad_norm": 0.04940006503376019, "learning_rate": 0.00029361986402521255, "loss": 0.7874, "step": 575 }, { "epoch": 0.1856, "grad_norm": 0.05373210996094955, "learning_rate": 0.00029337584906256994, "loss": 0.7984, "step": 580 }, { "epoch": 0.1872, "grad_norm": 0.05474306098461321, "learning_rate": 0.0002931273602233915, "loss": 0.7884, "step": 585 }, { "epoch": 0.1888, "grad_norm": 0.14465646121684841, "learning_rate": 0.0002928744052614823, "loss": 0.7983, "step": 590 }, { "epoch": 0.1904, "grad_norm": 0.5255096935790569, "learning_rate": 0.00029261699207000757, "loss": 0.8383, "step": 595 }, { "epoch": 0.192, "grad_norm": 3.0948754803086005, "learning_rate": 0.0002923551286812465, "loss": 1.0175, "step": 600 }, { "epoch": 0.1936, "grad_norm": 3.615101942230243, "learning_rate": 0.00029208882326634125, "loss": 1.4932, "step": 605 }, { "epoch": 0.1952, "grad_norm": 0.14089470273338237, "learning_rate": 0.0002918180841350427, "loss": 1.0401, "step": 610 }, { "epoch": 0.1968, "grad_norm": 0.3373287784802123, "learning_rate": 0.00029154291973545007, "loss": 0.9607, "step": 615 }, { "epoch": 0.1984, "grad_norm": 0.1033440858245417, "learning_rate": 0.0002912633386537485, "loss": 0.9787, "step": 620 }, { "epoch": 0.2, "grad_norm": 0.06639475073647456, "learning_rate": 0.00029097934961394025, "loss": 0.9303, "step": 625 }, { "epoch": 0.2016, "grad_norm": 0.05862768804117062, "learning_rate": 0.0002906909614775729, "loss": 0.8954, "step": 630 }, { "epoch": 0.2032, "grad_norm": 0.05119833574003903, "learning_rate": 0.00029039818324346254, "loss": 0.8801, "step": 635 }, { "epoch": 0.2048, "grad_norm": 0.04790825972385907, "learning_rate": 0.00029010102404741344, "loss": 0.8483, "step": 640 }, { "epoch": 0.2064, "grad_norm": 0.04898413486706179, "learning_rate": 0.00028979949316193244, "loss": 0.8365, "step": 645 }, { "epoch": 0.208, "grad_norm": 0.044373887795511005, "learning_rate": 0.00028949359999593996, "loss": 0.8366, "step": 650 }, { "epoch": 0.2096, "grad_norm": 0.04575881648687471, "learning_rate": 0.0002891833540944764, "loss": 0.8458, "step": 655 }, { "epoch": 0.2112, "grad_norm": 0.04638167285807861, "learning_rate": 0.000288868765138404, "loss": 0.8321, "step": 660 }, { "epoch": 0.2128, "grad_norm": 0.07135297419164925, "learning_rate": 0.00028854984294410507, "loss": 0.8134, "step": 665 }, { "epoch": 0.2144, "grad_norm": 0.04281324262990822, "learning_rate": 0.00028822659746317566, "loss": 0.799, "step": 670 }, { "epoch": 0.216, "grad_norm": 0.04481987085327319, "learning_rate": 0.00028789903878211477, "loss": 0.8338, "step": 675 }, { "epoch": 0.2176, "grad_norm": 0.048657304997143574, "learning_rate": 0.00028756717712201, "loss": 0.8061, "step": 680 }, { "epoch": 0.2192, "grad_norm": 0.05132910171873023, "learning_rate": 0.00028723102283821823, "loss": 0.8016, "step": 685 }, { "epoch": 0.2208, "grad_norm": 0.043704814768955186, "learning_rate": 0.0002868905864200428, "loss": 0.7902, "step": 690 }, { "epoch": 0.2224, "grad_norm": 0.051516499601557166, "learning_rate": 0.0002865458784904059, "loss": 0.8192, "step": 695 }, { "epoch": 0.224, "grad_norm": 0.05476128143933315, "learning_rate": 0.0002861969098055174, "loss": 0.7807, "step": 700 }, { "epoch": 0.2256, "grad_norm": 0.04701129985299958, "learning_rate": 0.0002858436912545391, "loss": 0.787, "step": 705 }, { "epoch": 0.2272, "grad_norm": 0.045976584602508, "learning_rate": 0.0002854862338592448, "loss": 0.8211, "step": 710 }, { "epoch": 0.2288, "grad_norm": 0.04421088788080479, "learning_rate": 0.0002851245487736766, "loss": 0.8096, "step": 715 }, { "epoch": 0.2304, "grad_norm": 0.04437240484817772, "learning_rate": 0.0002847586472837968, "loss": 0.8009, "step": 720 }, { "epoch": 0.232, "grad_norm": 0.04595607719220013, "learning_rate": 0.00028438854080713557, "loss": 0.8053, "step": 725 }, { "epoch": 0.2336, "grad_norm": 0.043767356252426635, "learning_rate": 0.0002840142408924348, "loss": 0.8005, "step": 730 }, { "epoch": 0.2352, "grad_norm": 0.041648966315012526, "learning_rate": 0.00028363575921928793, "loss": 0.813, "step": 735 }, { "epoch": 0.2368, "grad_norm": 0.04396054654683414, "learning_rate": 0.000283253107597775, "loss": 0.8083, "step": 740 }, { "epoch": 0.2384, "grad_norm": 0.04481454329880548, "learning_rate": 0.0002828662979680947, "loss": 0.7762, "step": 745 }, { "epoch": 0.24, "grad_norm": 0.04442483114968266, "learning_rate": 0.0002824753424001914, "loss": 0.8076, "step": 750 }, { "epoch": 0.2416, "grad_norm": 0.04511753997848536, "learning_rate": 0.00028208025309337865, "loss": 0.7927, "step": 755 }, { "epoch": 0.2432, "grad_norm": 0.046354138137126584, "learning_rate": 0.00028168104237595863, "loss": 0.7683, "step": 760 }, { "epoch": 0.2448, "grad_norm": 0.04082380158391817, "learning_rate": 0.0002812777227048371, "loss": 0.7677, "step": 765 }, { "epoch": 0.2464, "grad_norm": 0.049309971949502214, "learning_rate": 0.00028087030666513525, "loss": 0.7988, "step": 770 }, { "epoch": 0.248, "grad_norm": 0.045440684885208, "learning_rate": 0.0002804588069697964, "loss": 0.7662, "step": 775 }, { "epoch": 0.2496, "grad_norm": 0.0444532237833008, "learning_rate": 0.00028004323645918974, "loss": 0.7842, "step": 780 }, { "epoch": 0.2512, "grad_norm": 0.040657171713024576, "learning_rate": 0.00027962360810070955, "loss": 0.7839, "step": 785 }, { "epoch": 0.2528, "grad_norm": 0.0540110179307627, "learning_rate": 0.00027919993498837047, "loss": 0.7896, "step": 790 }, { "epoch": 0.2544, "grad_norm": 0.04259787691620654, "learning_rate": 0.00027877223034239896, "loss": 0.7793, "step": 795 }, { "epoch": 0.256, "grad_norm": 0.04457294144966481, "learning_rate": 0.00027834050750882074, "loss": 0.7855, "step": 800 }, { "epoch": 0.2576, "grad_norm": 0.04410681383697115, "learning_rate": 0.0002779047799590445, "loss": 0.7695, "step": 805 }, { "epoch": 0.2592, "grad_norm": 0.053791843672986804, "learning_rate": 0.00027746506128944147, "loss": 0.7757, "step": 810 }, { "epoch": 0.2608, "grad_norm": 0.0486575710394306, "learning_rate": 0.00027702136522092107, "loss": 0.7818, "step": 815 }, { "epoch": 0.2624, "grad_norm": 0.04125809722919823, "learning_rate": 0.0002765737055985028, "loss": 0.8047, "step": 820 }, { "epoch": 0.264, "grad_norm": 0.047411942700633564, "learning_rate": 0.00027612209639088427, "loss": 0.7578, "step": 825 }, { "epoch": 0.2656, "grad_norm": 0.050940775491989096, "learning_rate": 0.0002756665516900053, "loss": 0.7681, "step": 830 }, { "epoch": 0.2672, "grad_norm": 0.04082369288507536, "learning_rate": 0.00027520708571060823, "loss": 0.7556, "step": 835 }, { "epoch": 0.2688, "grad_norm": 0.03911502648937811, "learning_rate": 0.0002747437127897943, "loss": 0.7508, "step": 840 }, { "epoch": 0.2704, "grad_norm": 0.044224505916356134, "learning_rate": 0.0002742764473865763, "loss": 0.7954, "step": 845 }, { "epoch": 0.272, "grad_norm": 0.04247036421357291, "learning_rate": 0.0002738053040814274, "loss": 0.7607, "step": 850 }, { "epoch": 0.2736, "grad_norm": 0.041899587734798266, "learning_rate": 0.00027333029757582624, "loss": 0.7707, "step": 855 }, { "epoch": 0.2752, "grad_norm": 0.041672158877511176, "learning_rate": 0.00027285144269179816, "loss": 0.7702, "step": 860 }, { "epoch": 0.2768, "grad_norm": 0.04155845099801199, "learning_rate": 0.0002723687543714525, "loss": 0.7893, "step": 865 }, { "epoch": 0.2784, "grad_norm": 0.04108348014143511, "learning_rate": 0.0002718822476765167, "loss": 0.7614, "step": 870 }, { "epoch": 0.28, "grad_norm": 0.045256463749299106, "learning_rate": 0.00027139193778786603, "loss": 0.7833, "step": 875 }, { "epoch": 0.2816, "grad_norm": 0.04253172171776479, "learning_rate": 0.0002708978400050501, "loss": 0.7706, "step": 880 }, { "epoch": 0.2832, "grad_norm": 0.0500050312669766, "learning_rate": 0.0002703999697458152, "loss": 0.7865, "step": 885 }, { "epoch": 0.2848, "grad_norm": 0.04149213458472267, "learning_rate": 0.0002698983425456235, "loss": 0.7554, "step": 890 }, { "epoch": 0.2864, "grad_norm": 0.04595218793063625, "learning_rate": 0.00026939297405716807, "loss": 0.7734, "step": 895 }, { "epoch": 0.288, "grad_norm": 0.04481528857577919, "learning_rate": 0.00026888388004988456, "loss": 0.746, "step": 900 }, { "epoch": 0.2896, "grad_norm": 0.043477674071316814, "learning_rate": 0.00026837107640945905, "loss": 0.7704, "step": 905 }, { "epoch": 0.2912, "grad_norm": 0.04814185600999023, "learning_rate": 0.00026785457913733237, "loss": 0.7662, "step": 910 }, { "epoch": 0.2928, "grad_norm": 0.0468699434302569, "learning_rate": 0.000267334404350201, "loss": 0.772, "step": 915 }, { "epoch": 0.2944, "grad_norm": 0.03923205014558229, "learning_rate": 0.00026681056827951386, "loss": 0.7396, "step": 920 }, { "epoch": 0.296, "grad_norm": 0.03969515280288691, "learning_rate": 0.000266283087270966, "loss": 0.7742, "step": 925 }, { "epoch": 0.2976, "grad_norm": 0.04612291434466239, "learning_rate": 0.00026575197778398843, "loss": 0.7713, "step": 930 }, { "epoch": 0.2992, "grad_norm": 0.04710609947249134, "learning_rate": 0.0002652172563912348, "loss": 0.774, "step": 935 }, { "epoch": 0.3008, "grad_norm": 0.04627823939147109, "learning_rate": 0.00026467893977806387, "loss": 0.759, "step": 940 }, { "epoch": 0.3024, "grad_norm": 0.04215962416625985, "learning_rate": 0.0002641370447420192, "loss": 0.766, "step": 945 }, { "epoch": 0.304, "grad_norm": 0.04876220916142212, "learning_rate": 0.00026359158819230484, "loss": 0.7699, "step": 950 }, { "epoch": 0.3056, "grad_norm": 0.0465570007489398, "learning_rate": 0.00026304258714925766, "loss": 0.7792, "step": 955 }, { "epoch": 0.3072, "grad_norm": 0.0437908623508308, "learning_rate": 0.00026249005874381655, "loss": 0.7648, "step": 960 }, { "epoch": 0.3088, "grad_norm": 0.04319239474141502, "learning_rate": 0.0002619340202169873, "loss": 0.7651, "step": 965 }, { "epoch": 0.3104, "grad_norm": 0.039590732151859404, "learning_rate": 0.00026137448891930514, "loss": 0.7563, "step": 970 }, { "epoch": 0.312, "grad_norm": 0.04478476030600327, "learning_rate": 0.0002608114823102932, "loss": 0.7739, "step": 975 }, { "epoch": 0.3136, "grad_norm": 0.044680126096864815, "learning_rate": 0.0002602450179579176, "loss": 0.7384, "step": 980 }, { "epoch": 0.3152, "grad_norm": 0.04607582062714833, "learning_rate": 0.0002596751135380392, "loss": 0.7589, "step": 985 }, { "epoch": 0.3168, "grad_norm": 0.04652451296184953, "learning_rate": 0.00025910178683386247, "loss": 0.7667, "step": 990 }, { "epoch": 0.3184, "grad_norm": 0.038899699811397556, "learning_rate": 0.0002585250557353799, "loss": 0.7372, "step": 995 }, { "epoch": 0.32, "grad_norm": 0.042642226425794946, "learning_rate": 0.0002579449382388144, "loss": 0.7485, "step": 1000 }, { "epoch": 0.3216, "grad_norm": 0.039833966566896535, "learning_rate": 0.00025736145244605746, "loss": 0.7707, "step": 1005 }, { "epoch": 0.3232, "grad_norm": 0.04531367629526789, "learning_rate": 0.00025677461656410417, "loss": 0.7553, "step": 1010 }, { "epoch": 0.3248, "grad_norm": 0.04349819611665461, "learning_rate": 0.00025618444890448543, "loss": 0.7738, "step": 1015 }, { "epoch": 0.3264, "grad_norm": 0.05023483178830991, "learning_rate": 0.0002555909678826964, "loss": 0.7796, "step": 1020 }, { "epoch": 0.328, "grad_norm": 0.046817340356191714, "learning_rate": 0.0002549941920176217, "loss": 0.7387, "step": 1025 }, { "epoch": 0.3296, "grad_norm": 6.598378786173098, "learning_rate": 0.00025439413993095784, "loss": 0.7527, "step": 1030 }, { "epoch": 0.3312, "grad_norm": 0.06539168468511167, "learning_rate": 0.00025379083034663194, "loss": 0.7542, "step": 1035 }, { "epoch": 0.3328, "grad_norm": 0.0411591198615963, "learning_rate": 0.0002531842820902175, "loss": 0.748, "step": 1040 }, { "epoch": 0.3344, "grad_norm": 0.04659529885950376, "learning_rate": 0.0002525745140883471, "loss": 0.7702, "step": 1045 }, { "epoch": 0.336, "grad_norm": 0.0505797168335258, "learning_rate": 0.0002519615453681218, "loss": 0.7619, "step": 1050 }, { "epoch": 0.3376, "grad_norm": 0.04830566943349081, "learning_rate": 0.00025134539505651716, "loss": 0.762, "step": 1055 }, { "epoch": 0.3392, "grad_norm": 0.04952600716886585, "learning_rate": 0.00025072608237978664, "loss": 0.7534, "step": 1060 }, { "epoch": 0.3408, "grad_norm": 0.044007033813835646, "learning_rate": 0.00025010362666286185, "loss": 0.7692, "step": 1065 }, { "epoch": 0.3424, "grad_norm": 0.04527316097454219, "learning_rate": 0.00024947804732874896, "loss": 0.7636, "step": 1070 }, { "epoch": 0.344, "grad_norm": 0.045960440573293695, "learning_rate": 0.0002488493638979233, "loss": 0.7522, "step": 1075 }, { "epoch": 0.3456, "grad_norm": 0.04427468398129268, "learning_rate": 0.00024821759598771964, "loss": 0.7541, "step": 1080 }, { "epoch": 0.3472, "grad_norm": 0.04994994624897907, "learning_rate": 0.0002475827633117206, "loss": 0.7604, "step": 1085 }, { "epoch": 0.3488, "grad_norm": 0.04346019358697389, "learning_rate": 0.00024694488567914106, "loss": 0.7466, "step": 1090 }, { "epoch": 0.3504, "grad_norm": 0.045596836164609016, "learning_rate": 0.0002463039829942104, "loss": 0.7568, "step": 1095 }, { "epoch": 0.352, "grad_norm": 0.04088907171766439, "learning_rate": 0.0002456600752555511, "loss": 0.7616, "step": 1100 }, { "epoch": 0.3536, "grad_norm": 0.041758722037058514, "learning_rate": 0.0002450131825555548, "loss": 0.7501, "step": 1105 }, { "epoch": 0.3552, "grad_norm": 0.050631334358167975, "learning_rate": 0.00024436332507975553, "loss": 0.7438, "step": 1110 }, { "epoch": 0.3568, "grad_norm": 0.07414558397207001, "learning_rate": 0.00024371052310619967, "loss": 0.7289, "step": 1115 }, { "epoch": 0.3584, "grad_norm": 0.043628197381851856, "learning_rate": 0.00024305479700481317, "loss": 0.7394, "step": 1120 }, { "epoch": 0.36, "grad_norm": 0.0446052211482763, "learning_rate": 0.000242396167236766, "loss": 0.759, "step": 1125 }, { "epoch": 0.3616, "grad_norm": 0.04481759941780531, "learning_rate": 0.0002417346543538337, "loss": 0.7491, "step": 1130 }, { "epoch": 0.3632, "grad_norm": 0.0413461889977132, "learning_rate": 0.00024107027899775596, "loss": 0.7632, "step": 1135 }, { "epoch": 0.3648, "grad_norm": 0.038375490339053624, "learning_rate": 0.00024040306189959283, "loss": 0.7524, "step": 1140 }, { "epoch": 0.3664, "grad_norm": 0.04671975131634136, "learning_rate": 0.0002397330238790774, "loss": 0.746, "step": 1145 }, { "epoch": 0.368, "grad_norm": 0.048330776029605456, "learning_rate": 0.00023906018584396641, "loss": 0.7619, "step": 1150 }, { "epoch": 0.3696, "grad_norm": 0.04143069989457122, "learning_rate": 0.00023838456878938786, "loss": 0.7612, "step": 1155 }, { "epoch": 0.3712, "grad_norm": 0.04377555777582751, "learning_rate": 0.00023770619379718573, "loss": 0.7658, "step": 1160 }, { "epoch": 0.3728, "grad_norm": 0.049064434439728785, "learning_rate": 0.00023702508203526235, "loss": 0.7527, "step": 1165 }, { "epoch": 0.3744, "grad_norm": 0.03784065311529772, "learning_rate": 0.0002363412547569177, "loss": 0.7452, "step": 1170 }, { "epoch": 0.376, "grad_norm": 0.03980251791841271, "learning_rate": 0.00023565473330018622, "loss": 0.7385, "step": 1175 }, { "epoch": 0.3776, "grad_norm": 0.04493262099167099, "learning_rate": 0.00023496553908717115, "loss": 0.7625, "step": 1180 }, { "epoch": 0.3792, "grad_norm": 0.04314266993057437, "learning_rate": 0.00023427369362337598, "loss": 0.7492, "step": 1185 }, { "epoch": 0.3808, "grad_norm": 0.04248357043924128, "learning_rate": 0.00023357921849703335, "loss": 0.7411, "step": 1190 }, { "epoch": 0.3824, "grad_norm": 0.040379817198323655, "learning_rate": 0.0002328821353784315, "loss": 0.7321, "step": 1195 }, { "epoch": 0.384, "grad_norm": 0.0360033809207297, "learning_rate": 0.000232182466019238, "loss": 0.7212, "step": 1200 }, { "epoch": 0.3856, "grad_norm": 0.04200778920904238, "learning_rate": 0.00023148023225182102, "loss": 0.7527, "step": 1205 }, { "epoch": 0.3872, "grad_norm": 0.046085792309150775, "learning_rate": 0.00023077545598856815, "loss": 0.7442, "step": 1210 }, { "epoch": 0.3888, "grad_norm": 0.04277270461232854, "learning_rate": 0.00023006815922120262, "loss": 0.7189, "step": 1215 }, { "epoch": 0.3904, "grad_norm": 0.0405134146379952, "learning_rate": 0.00022935836402009706, "loss": 0.7714, "step": 1220 }, { "epoch": 0.392, "grad_norm": 0.04173101382827032, "learning_rate": 0.00022864609253358474, "loss": 0.7415, "step": 1225 }, { "epoch": 0.3936, "grad_norm": 0.03865851171630709, "learning_rate": 0.00022793136698726863, "loss": 0.7428, "step": 1230 }, { "epoch": 0.3952, "grad_norm": 0.03910224183500891, "learning_rate": 0.00022721420968332766, "loss": 0.7493, "step": 1235 }, { "epoch": 0.3968, "grad_norm": 0.040305577555079186, "learning_rate": 0.00022649464299982116, "loss": 0.752, "step": 1240 }, { "epoch": 0.3984, "grad_norm": 0.03909306971976583, "learning_rate": 0.00022577268938999006, "loss": 0.7216, "step": 1245 }, { "epoch": 0.4, "grad_norm": 0.043954857050380644, "learning_rate": 0.00022504837138155666, "loss": 0.7335, "step": 1250 }, { "epoch": 0.4016, "grad_norm": 0.04130228180090928, "learning_rate": 0.0002243217115760217, "loss": 0.7484, "step": 1255 }, { "epoch": 0.4032, "grad_norm": 0.051231296902632124, "learning_rate": 0.0002235927326479588, "loss": 0.7453, "step": 1260 }, { "epoch": 0.4048, "grad_norm": 0.048719573133536864, "learning_rate": 0.00022286145734430713, "loss": 0.7223, "step": 1265 }, { "epoch": 0.4064, "grad_norm": 0.04017818677971695, "learning_rate": 0.00022212790848366164, "loss": 0.7282, "step": 1270 }, { "epoch": 0.408, "grad_norm": 0.040992304875421104, "learning_rate": 0.00022139210895556104, "loss": 0.7559, "step": 1275 }, { "epoch": 0.4096, "grad_norm": 0.04633597226041717, "learning_rate": 0.00022065408171977326, "loss": 0.7519, "step": 1280 }, { "epoch": 0.4112, "grad_norm": 0.04261151707795407, "learning_rate": 0.00021991384980557958, "loss": 0.7467, "step": 1285 }, { "epoch": 0.4128, "grad_norm": 0.04197091400228777, "learning_rate": 0.0002191714363110554, "loss": 0.7589, "step": 1290 }, { "epoch": 0.4144, "grad_norm": 0.043035522590453124, "learning_rate": 0.00021842686440235002, "loss": 0.7537, "step": 1295 }, { "epoch": 0.416, "grad_norm": 0.04395896783293326, "learning_rate": 0.00021768015731296345, "loss": 0.7481, "step": 1300 }, { "epoch": 0.4176, "grad_norm": 0.043457994496385745, "learning_rate": 0.00021693133834302145, "loss": 0.7316, "step": 1305 }, { "epoch": 0.4192, "grad_norm": 0.04478552802534991, "learning_rate": 0.00021618043085854872, "loss": 0.7404, "step": 1310 }, { "epoch": 0.4208, "grad_norm": 0.0387795040439355, "learning_rate": 0.00021542745829073958, "loss": 0.7389, "step": 1315 }, { "epoch": 0.4224, "grad_norm": 0.041546417532758506, "learning_rate": 0.00021467244413522673, "loss": 0.7416, "step": 1320 }, { "epoch": 0.424, "grad_norm": 0.044665087388519445, "learning_rate": 0.00021391541195134843, "loss": 0.7459, "step": 1325 }, { "epoch": 0.4256, "grad_norm": 0.04379214190156243, "learning_rate": 0.0002131563853614131, "loss": 0.7452, "step": 1330 }, { "epoch": 0.4272, "grad_norm": 0.04238125297996064, "learning_rate": 0.00021239538804996216, "loss": 0.7539, "step": 1335 }, { "epoch": 0.4288, "grad_norm": 0.038993894170660494, "learning_rate": 0.00021163244376303132, "loss": 0.7628, "step": 1340 }, { "epoch": 0.4304, "grad_norm": 0.043983536037091105, "learning_rate": 0.0002108675763074092, "loss": 0.7467, "step": 1345 }, { "epoch": 0.432, "grad_norm": 0.040167490443743736, "learning_rate": 0.00021010080954989482, "loss": 0.7438, "step": 1350 }, { "epoch": 0.4336, "grad_norm": 0.03826596993091824, "learning_rate": 0.00020933216741655258, "loss": 0.7376, "step": 1355 }, { "epoch": 0.4352, "grad_norm": 0.04237309509280299, "learning_rate": 0.00020856167389196584, "loss": 0.7297, "step": 1360 }, { "epoch": 0.4368, "grad_norm": 0.03933319137399159, "learning_rate": 0.0002077893530184885, "loss": 0.7301, "step": 1365 }, { "epoch": 0.4384, "grad_norm": 0.0412391306717837, "learning_rate": 0.0002070152288954947, "loss": 0.7404, "step": 1370 }, { "epoch": 0.44, "grad_norm": 0.040127156770831986, "learning_rate": 0.00020623932567862693, "loss": 0.7268, "step": 1375 }, { "epoch": 0.4416, "grad_norm": 0.044175415048005075, "learning_rate": 0.0002054616675790423, "loss": 0.7528, "step": 1380 }, { "epoch": 0.4432, "grad_norm": 0.0403339831078633, "learning_rate": 0.0002046822788626568, "loss": 0.7392, "step": 1385 }, { "epoch": 0.4448, "grad_norm": 0.03981323331482141, "learning_rate": 0.00020390118384938842, "loss": 0.7355, "step": 1390 }, { "epoch": 0.4464, "grad_norm": 0.03936124576111004, "learning_rate": 0.00020311840691239822, "loss": 0.7415, "step": 1395 }, { "epoch": 0.448, "grad_norm": 0.0474856170739699, "learning_rate": 0.0002023339724773297, "loss": 0.7372, "step": 1400 }, { "epoch": 0.4496, "grad_norm": 0.0457480090375028, "learning_rate": 0.00020154790502154653, "loss": 0.7306, "step": 1405 }, { "epoch": 0.4512, "grad_norm": 0.043907783218970334, "learning_rate": 0.00020076022907336902, "loss": 0.747, "step": 1410 }, { "epoch": 0.4528, "grad_norm": 0.04296393850125629, "learning_rate": 0.00019997096921130862, "loss": 0.723, "step": 1415 }, { "epoch": 0.4544, "grad_norm": 0.04697046178845861, "learning_rate": 0.00019918015006330087, "loss": 0.7437, "step": 1420 }, { "epoch": 0.456, "grad_norm": 0.040119672728064056, "learning_rate": 0.0001983877963059372, "loss": 0.7397, "step": 1425 }, { "epoch": 0.4576, "grad_norm": 0.046656663085348056, "learning_rate": 0.00019759393266369443, "loss": 0.7462, "step": 1430 }, { "epoch": 0.4592, "grad_norm": 0.047948921068881094, "learning_rate": 0.00019679858390816385, "loss": 0.7375, "step": 1435 }, { "epoch": 0.4608, "grad_norm": 0.04612317860998999, "learning_rate": 0.000196001774857278, "loss": 0.7282, "step": 1440 }, { "epoch": 0.4624, "grad_norm": 0.041194594175399085, "learning_rate": 0.00019520353037453598, "loss": 0.7276, "step": 1445 }, { "epoch": 0.464, "grad_norm": 0.048425503869563465, "learning_rate": 0.00019440387536822807, "loss": 0.7572, "step": 1450 }, { "epoch": 0.4656, "grad_norm": 0.04213193734970952, "learning_rate": 0.00019360283479065833, "loss": 0.7349, "step": 1455 }, { "epoch": 0.4672, "grad_norm": 0.04899128871136098, "learning_rate": 0.00019280043363736579, "loss": 0.734, "step": 1460 }, { "epoch": 0.4688, "grad_norm": 0.04326757561876585, "learning_rate": 0.0001919966969463448, "loss": 0.7297, "step": 1465 }, { "epoch": 0.4704, "grad_norm": 0.04024630860939001, "learning_rate": 0.0001911916497972635, "loss": 0.7204, "step": 1470 }, { "epoch": 0.472, "grad_norm": 0.04008125998287669, "learning_rate": 0.0001903853173106815, "loss": 0.7317, "step": 1475 }, { "epoch": 0.4736, "grad_norm": 0.03777929299122506, "learning_rate": 0.00018957772464726574, "loss": 0.732, "step": 1480 }, { "epoch": 0.4752, "grad_norm": 0.04340084226893865, "learning_rate": 0.00018876889700700556, "loss": 0.7211, "step": 1485 }, { "epoch": 0.4768, "grad_norm": 0.0420179308735609, "learning_rate": 0.00018795885962842628, "loss": 0.7399, "step": 1490 }, { "epoch": 0.4784, "grad_norm": 0.044123021371490996, "learning_rate": 0.0001871476377878018, "loss": 0.7383, "step": 1495 }, { "epoch": 0.48, "grad_norm": 0.040049404005910644, "learning_rate": 0.00018633525679836568, "loss": 0.7517, "step": 1500 }, { "epoch": 0.4816, "grad_norm": 0.039647217074159906, "learning_rate": 0.00018552174200952133, "loss": 0.7312, "step": 1505 }, { "epoch": 0.4832, "grad_norm": 0.04013028022885121, "learning_rate": 0.00018470711880605122, "loss": 0.729, "step": 1510 }, { "epoch": 0.4848, "grad_norm": 0.04367909174578893, "learning_rate": 0.00018389141260732444, "loss": 0.7304, "step": 1515 }, { "epoch": 0.4864, "grad_norm": 0.04061130668006248, "learning_rate": 0.00018307464886650377, "loss": 0.7197, "step": 1520 }, { "epoch": 0.488, "grad_norm": 0.03900764609449006, "learning_rate": 0.00018225685306975134, "loss": 0.7274, "step": 1525 }, { "epoch": 0.4896, "grad_norm": 0.044192934458913945, "learning_rate": 0.00018143805073543343, "loss": 0.7257, "step": 1530 }, { "epoch": 0.4912, "grad_norm": 0.03866868780491527, "learning_rate": 0.00018061826741332405, "loss": 0.7209, "step": 1535 }, { "epoch": 0.4928, "grad_norm": 0.03872685480769255, "learning_rate": 0.0001797975286838079, "loss": 0.7224, "step": 1540 }, { "epoch": 0.4944, "grad_norm": 0.04005309906829608, "learning_rate": 0.00017897586015708198, "loss": 0.7302, "step": 1545 }, { "epoch": 0.496, "grad_norm": 0.04231012970632499, "learning_rate": 0.0001781532874723566, "loss": 0.7443, "step": 1550 }, { "epoch": 0.4976, "grad_norm": 0.04312701702411617, "learning_rate": 0.0001773298362970552, "loss": 0.7231, "step": 1555 }, { "epoch": 0.4992, "grad_norm": 0.041854731258554205, "learning_rate": 0.00017650553232601354, "loss": 0.738, "step": 1560 }, { "epoch": 0.5008, "grad_norm": 0.0423243123135773, "learning_rate": 0.00017568040128067782, "loss": 0.7481, "step": 1565 }, { "epoch": 0.5024, "grad_norm": 0.041282692211946664, "learning_rate": 0.00017485446890830225, "loss": 0.7263, "step": 1570 }, { "epoch": 0.504, "grad_norm": 0.037317198782344456, "learning_rate": 0.00017402776098114539, "loss": 0.7184, "step": 1575 }, { "epoch": 0.5056, "grad_norm": 0.040461410318691195, "learning_rate": 0.00017320030329566615, "loss": 0.7199, "step": 1580 }, { "epoch": 0.5072, "grad_norm": 0.03915242982171056, "learning_rate": 0.0001723721216717188, "loss": 0.7238, "step": 1585 }, { "epoch": 0.5088, "grad_norm": 0.035306430185746605, "learning_rate": 0.00017154324195174718, "loss": 0.7186, "step": 1590 }, { "epoch": 0.5104, "grad_norm": 0.04328294482977798, "learning_rate": 0.00017071368999997848, "loss": 0.7187, "step": 1595 }, { "epoch": 0.512, "grad_norm": 0.040552285825681135, "learning_rate": 0.00016988349170161607, "loss": 0.7271, "step": 1600 }, { "epoch": 0.5136, "grad_norm": 0.03792304405548647, "learning_rate": 0.0001690526729620318, "loss": 0.7401, "step": 1605 }, { "epoch": 0.5152, "grad_norm": 0.03739907062711767, "learning_rate": 0.00016822125970595772, "loss": 0.7298, "step": 1610 }, { "epoch": 0.5168, "grad_norm": 0.04045585582372931, "learning_rate": 0.00016738927787667706, "loss": 0.7357, "step": 1615 }, { "epoch": 0.5184, "grad_norm": 0.039086636300895435, "learning_rate": 0.00016655675343521463, "loss": 0.7216, "step": 1620 }, { "epoch": 0.52, "grad_norm": 0.0378622792604208, "learning_rate": 0.00016572371235952697, "loss": 0.725, "step": 1625 }, { "epoch": 0.5216, "grad_norm": 0.03849206417818264, "learning_rate": 0.0001648901806436914, "loss": 0.7315, "step": 1630 }, { "epoch": 0.5232, "grad_norm": 0.03949752544512463, "learning_rate": 0.00016405618429709531, "loss": 0.7093, "step": 1635 }, { "epoch": 0.5248, "grad_norm": 0.044062583731413654, "learning_rate": 0.00016322174934362422, "loss": 0.7385, "step": 1640 }, { "epoch": 0.5264, "grad_norm": 0.04089556591693187, "learning_rate": 0.00016238690182084986, "loss": 0.7046, "step": 1645 }, { "epoch": 0.528, "grad_norm": 0.04222802674498491, "learning_rate": 0.0001615516677792177, "loss": 0.7289, "step": 1650 }, { "epoch": 0.5296, "grad_norm": 0.03761221440074904, "learning_rate": 0.00016071607328123425, "loss": 0.7258, "step": 1655 }, { "epoch": 0.5312, "grad_norm": 0.04005046134119886, "learning_rate": 0.0001598801444006534, "loss": 0.7424, "step": 1660 }, { "epoch": 0.5328, "grad_norm": 0.040509948741415, "learning_rate": 0.0001590439072216632, "loss": 0.7041, "step": 1665 }, { "epoch": 0.5344, "grad_norm": 0.03810512707022326, "learning_rate": 0.0001582073878380718, "loss": 0.7185, "step": 1670 }, { "epoch": 0.536, "grad_norm": 0.04280979793390455, "learning_rate": 0.00015737061235249309, "loss": 0.7029, "step": 1675 }, { "epoch": 0.5376, "grad_norm": 0.04248308027092899, "learning_rate": 0.0001565336068755325, "loss": 0.735, "step": 1680 }, { "epoch": 0.5392, "grad_norm": 0.03928332103104811, "learning_rate": 0.00015569639752497196, "loss": 0.7102, "step": 1685 }, { "epoch": 0.5408, "grad_norm": 0.03919558675753705, "learning_rate": 0.00015485901042495504, "loss": 0.7291, "step": 1690 }, { "epoch": 0.5424, "grad_norm": 0.0391348168167258, "learning_rate": 0.00015402147170517184, "loss": 0.7166, "step": 1695 }, { "epoch": 0.544, "grad_norm": 0.03714108757450143, "learning_rate": 0.00015318380750004352, "loss": 0.7272, "step": 1700 }, { "epoch": 0.5456, "grad_norm": 0.03568601370768205, "learning_rate": 0.00015234604394790687, "loss": 0.7361, "step": 1705 }, { "epoch": 0.5472, "grad_norm": 0.0386630719265334, "learning_rate": 0.00015150820719019874, "loss": 0.725, "step": 1710 }, { "epoch": 0.5488, "grad_norm": 0.03993919583787361, "learning_rate": 0.00015067032337064018, "loss": 0.7225, "step": 1715 }, { "epoch": 0.5504, "grad_norm": 0.04091268785555554, "learning_rate": 0.00014983241863442086, "loss": 0.7355, "step": 1720 }, { "epoch": 0.552, "grad_norm": 0.03823212279742222, "learning_rate": 0.00014899451912738307, "loss": 0.7124, "step": 1725 }, { "epoch": 0.5536, "grad_norm": 0.03775761218114033, "learning_rate": 0.00014815665099520588, "loss": 0.6985, "step": 1730 }, { "epoch": 0.5552, "grad_norm": 0.04149923045436972, "learning_rate": 0.00014731884038258946, "loss": 0.7099, "step": 1735 }, { "epoch": 0.5568, "grad_norm": 0.04014086331686014, "learning_rate": 0.00014648111343243907, "loss": 0.7364, "step": 1740 }, { "epoch": 0.5584, "grad_norm": 0.039575959854944824, "learning_rate": 0.00014564349628504937, "loss": 0.7114, "step": 1745 }, { "epoch": 0.56, "grad_norm": 0.036851853297479614, "learning_rate": 0.00014480601507728885, "loss": 0.6964, "step": 1750 }, { "epoch": 0.5616, "grad_norm": 0.041213192635326586, "learning_rate": 0.00014396869594178403, "loss": 0.7139, "step": 1755 }, { "epoch": 0.5632, "grad_norm": 0.04352972854945724, "learning_rate": 0.0001431315650061042, "loss": 0.7267, "step": 1760 }, { "epoch": 0.5648, "grad_norm": 0.04497516779493022, "learning_rate": 0.00014229464839194618, "loss": 0.7189, "step": 1765 }, { "epoch": 0.5664, "grad_norm": 0.039398507397311704, "learning_rate": 0.00014145797221431903, "loss": 0.7236, "step": 1770 }, { "epoch": 0.568, "grad_norm": 0.040805259594060624, "learning_rate": 0.0001406215625807293, "loss": 0.7167, "step": 1775 }, { "epoch": 0.5696, "grad_norm": 0.04047421525047546, "learning_rate": 0.00013978544559036638, "loss": 0.7137, "step": 1780 }, { "epoch": 0.5712, "grad_norm": 0.03919630804845936, "learning_rate": 0.00013894964733328797, "loss": 0.719, "step": 1785 }, { "epoch": 0.5728, "grad_norm": 0.03685077424222423, "learning_rate": 0.00013811419388960622, "loss": 0.7278, "step": 1790 }, { "epoch": 0.5744, "grad_norm": 0.044753528752375836, "learning_rate": 0.00013727911132867365, "loss": 0.7095, "step": 1795 }, { "epoch": 0.576, "grad_norm": 0.03991640163138795, "learning_rate": 0.00013644442570826982, "loss": 0.723, "step": 1800 }, { "epoch": 0.5776, "grad_norm": 0.040087229919929294, "learning_rate": 0.0001356101630737883, "loss": 0.7127, "step": 1805 }, { "epoch": 0.5792, "grad_norm": 0.04266162698298314, "learning_rate": 0.0001347763494574239, "loss": 0.7279, "step": 1810 }, { "epoch": 0.5808, "grad_norm": 0.037795236144364724, "learning_rate": 0.0001339430108773602, "loss": 0.7191, "step": 1815 }, { "epoch": 0.5824, "grad_norm": 0.0387693849128071, "learning_rate": 0.00013311017333695796, "loss": 0.7243, "step": 1820 }, { "epoch": 0.584, "grad_norm": 0.03774693738753151, "learning_rate": 0.00013227786282394349, "loss": 0.7038, "step": 1825 }, { "epoch": 0.5856, "grad_norm": 0.03812021420863255, "learning_rate": 0.00013144610530959784, "loss": 0.7161, "step": 1830 }, { "epoch": 0.5872, "grad_norm": 0.03834520692315359, "learning_rate": 0.00013061492674794646, "loss": 0.7187, "step": 1835 }, { "epoch": 0.5888, "grad_norm": 0.03678653982482293, "learning_rate": 0.00012978435307494916, "loss": 0.7103, "step": 1840 }, { "epoch": 0.5904, "grad_norm": 0.039024408400889526, "learning_rate": 0.00012895441020769095, "loss": 0.7081, "step": 1845 }, { "epoch": 0.592, "grad_norm": 0.03946176567463419, "learning_rate": 0.00012812512404357328, "loss": 0.7297, "step": 1850 }, { "epoch": 0.5936, "grad_norm": 0.03966986239112185, "learning_rate": 0.0001272965204595059, "loss": 0.7224, "step": 1855 }, { "epoch": 0.5952, "grad_norm": 0.04045671877501302, "learning_rate": 0.00012646862531109945, "loss": 0.7138, "step": 1860 }, { "epoch": 0.5968, "grad_norm": 0.04200376635921014, "learning_rate": 0.0001256414644318588, "loss": 0.7178, "step": 1865 }, { "epoch": 0.5984, "grad_norm": 0.03978046841085274, "learning_rate": 0.00012481506363237654, "loss": 0.7067, "step": 1870 }, { "epoch": 0.6, "grad_norm": 0.04238382651336426, "learning_rate": 0.00012398944869952816, "loss": 0.7274, "step": 1875 }, { "epoch": 0.6016, "grad_norm": 0.038817886095841726, "learning_rate": 0.00012316464539566683, "loss": 0.7122, "step": 1880 }, { "epoch": 0.6032, "grad_norm": 0.03594793213572876, "learning_rate": 0.0001223406794578199, "loss": 0.6961, "step": 1885 }, { "epoch": 0.6048, "grad_norm": 0.03697364543172882, "learning_rate": 0.00012151757659688571, "loss": 0.7113, "step": 1890 }, { "epoch": 0.6064, "grad_norm": 0.037004779496744064, "learning_rate": 0.00012069536249683134, "loss": 0.7131, "step": 1895 }, { "epoch": 0.608, "grad_norm": 0.0396435179197239, "learning_rate": 0.00011987406281389094, "loss": 0.6998, "step": 1900 }, { "epoch": 0.6096, "grad_norm": 0.03869430143018022, "learning_rate": 0.00011905370317576543, "loss": 0.703, "step": 1905 }, { "epoch": 0.6112, "grad_norm": 0.040681377456567816, "learning_rate": 0.00011823430918082282, "loss": 0.7126, "step": 1910 }, { "epoch": 0.6128, "grad_norm": 0.03873128265195665, "learning_rate": 0.00011741590639729918, "loss": 0.7133, "step": 1915 }, { "epoch": 0.6144, "grad_norm": 0.042933631355136996, "learning_rate": 0.00011659852036250113, "loss": 0.7118, "step": 1920 }, { "epoch": 0.616, "grad_norm": 0.04491368497490653, "learning_rate": 0.00011578217658200874, "loss": 0.7094, "step": 1925 }, { "epoch": 0.6176, "grad_norm": 0.03787882428314268, "learning_rate": 0.00011496690052887973, "loss": 0.7213, "step": 1930 }, { "epoch": 0.6192, "grad_norm": 0.037724023271429444, "learning_rate": 0.00011415271764285474, "loss": 0.7118, "step": 1935 }, { "epoch": 0.6208, "grad_norm": 0.04085791852018764, "learning_rate": 0.00011333965332956332, "loss": 0.7241, "step": 1940 }, { "epoch": 0.6224, "grad_norm": 0.03841379840100555, "learning_rate": 0.00011252773295973121, "loss": 0.6915, "step": 1945 }, { "epoch": 0.624, "grad_norm": 0.039180179570884835, "learning_rate": 0.00011171698186838887, "loss": 0.6933, "step": 1950 }, { "epoch": 0.6256, "grad_norm": 0.04020685900513363, "learning_rate": 0.00011090742535408063, "loss": 0.7099, "step": 1955 }, { "epoch": 0.6272, "grad_norm": 0.0415898031303519, "learning_rate": 0.00011009908867807556, "loss": 0.7092, "step": 1960 }, { "epoch": 0.6288, "grad_norm": 0.04650092478952174, "learning_rate": 0.00010929199706357905, "loss": 0.6972, "step": 1965 }, { "epoch": 0.6304, "grad_norm": 0.03819491431751356, "learning_rate": 0.0001084861756949457, "loss": 0.6933, "step": 1970 }, { "epoch": 0.632, "grad_norm": 0.03674646898808985, "learning_rate": 0.00010768164971689374, "loss": 0.6946, "step": 1975 }, { "epoch": 0.6336, "grad_norm": 0.04079648142913826, "learning_rate": 0.00010687844423372018, "loss": 0.7057, "step": 1980 }, { "epoch": 0.6352, "grad_norm": 0.04442964097385929, "learning_rate": 0.00010607658430851744, "loss": 0.7079, "step": 1985 }, { "epoch": 0.6368, "grad_norm": 0.041439999468769535, "learning_rate": 0.00010527609496239142, "loss": 0.7094, "step": 1990 }, { "epoch": 0.6384, "grad_norm": 0.03612721628167792, "learning_rate": 0.00010447700117368078, "loss": 0.7088, "step": 1995 }, { "epoch": 0.64, "grad_norm": 0.04369717146674075, "learning_rate": 0.00010367932787717727, "loss": 0.7434, "step": 2000 }, { "epoch": 0.6416, "grad_norm": 0.041592772074316386, "learning_rate": 0.00010288309996334801, "loss": 0.7133, "step": 2005 }, { "epoch": 0.6432, "grad_norm": 0.041785169745661155, "learning_rate": 0.00010208834227755847, "loss": 0.7039, "step": 2010 }, { "epoch": 0.6448, "grad_norm": 0.0382736079703658, "learning_rate": 0.00010129507961929748, "loss": 0.7041, "step": 2015 }, { "epoch": 0.6464, "grad_norm": 0.03803427582508488, "learning_rate": 0.00010050333674140326, "loss": 0.7349, "step": 2020 }, { "epoch": 0.648, "grad_norm": 0.04429830134367503, "learning_rate": 9.971313834929099e-05, "loss": 0.7127, "step": 2025 }, { "epoch": 0.6496, "grad_norm": 0.04092289365426379, "learning_rate": 9.892450910018203e-05, "loss": 0.709, "step": 2030 }, { "epoch": 0.6512, "grad_norm": 0.038535908469184546, "learning_rate": 9.813747360233443e-05, "loss": 0.6825, "step": 2035 }, { "epoch": 0.6528, "grad_norm": 0.03773550734191166, "learning_rate": 9.73520564142751e-05, "loss": 0.6937, "step": 2040 }, { "epoch": 0.6544, "grad_norm": 0.03897370588757223, "learning_rate": 9.656828204403352e-05, "loss": 0.7035, "step": 2045 }, { "epoch": 0.656, "grad_norm": 0.03993122832767393, "learning_rate": 9.578617494837682e-05, "loss": 0.7098, "step": 2050 }, { "epoch": 0.6576, "grad_norm": 0.04113301544799768, "learning_rate": 9.500575953204684e-05, "loss": 0.7067, "step": 2055 }, { "epoch": 0.6592, "grad_norm": 0.04267204755150456, "learning_rate": 9.422706014699863e-05, "loss": 0.7192, "step": 2060 }, { "epoch": 0.6608, "grad_norm": 0.038540803442012495, "learning_rate": 9.345010109164037e-05, "loss": 0.6948, "step": 2065 }, { "epoch": 0.6624, "grad_norm": 0.03639271405028558, "learning_rate": 9.267490661007529e-05, "loss": 0.6982, "step": 2070 }, { "epoch": 0.664, "grad_norm": 0.03764258257256097, "learning_rate": 9.190150089134512e-05, "loss": 0.7011, "step": 2075 }, { "epoch": 0.6656, "grad_norm": 0.03741160593359946, "learning_rate": 9.112990806867543e-05, "loss": 0.7077, "step": 2080 }, { "epoch": 0.6672, "grad_norm": 0.04138034247014772, "learning_rate": 9.036015221872238e-05, "loss": 0.7348, "step": 2085 }, { "epoch": 0.6688, "grad_norm": 0.03629720795501264, "learning_rate": 8.959225736082158e-05, "loss": 0.6773, "step": 2090 }, { "epoch": 0.6704, "grad_norm": 0.03804004946419038, "learning_rate": 8.882624745623856e-05, "loss": 0.7019, "step": 2095 }, { "epoch": 0.672, "grad_norm": 0.035721267232502185, "learning_rate": 8.806214640742099e-05, "loss": 0.7145, "step": 2100 }, { "epoch": 0.6736, "grad_norm": 0.03632799434171817, "learning_rate": 8.729997805725303e-05, "loss": 0.6927, "step": 2105 }, { "epoch": 0.6752, "grad_norm": 0.036651796521166076, "learning_rate": 8.653976618831118e-05, "loss": 0.7045, "step": 2110 }, { "epoch": 0.6768, "grad_norm": 0.037066902859228194, "learning_rate": 8.578153452212221e-05, "loss": 0.7155, "step": 2115 }, { "epoch": 0.6784, "grad_norm": 0.03767995581990963, "learning_rate": 8.502530671842288e-05, "loss": 0.7043, "step": 2120 }, { "epoch": 0.68, "grad_norm": 0.03561868304758191, "learning_rate": 8.427110637442201e-05, "loss": 0.7129, "step": 2125 }, { "epoch": 0.6816, "grad_norm": 0.038918395104413535, "learning_rate": 8.351895702406366e-05, "loss": 0.7143, "step": 2130 }, { "epoch": 0.6832, "grad_norm": 0.035964229362269014, "learning_rate": 8.276888213729307e-05, "loss": 0.6861, "step": 2135 }, { "epoch": 0.6848, "grad_norm": 0.03733779735545552, "learning_rate": 8.202090511932428e-05, "loss": 0.7153, "step": 2140 }, { "epoch": 0.6864, "grad_norm": 0.03770594702466583, "learning_rate": 8.127504930990986e-05, "loss": 0.7121, "step": 2145 }, { "epoch": 0.688, "grad_norm": 0.03768921861691119, "learning_rate": 8.053133798261233e-05, "loss": 0.7034, "step": 2150 }, { "epoch": 0.6896, "grad_norm": 0.03512675419142099, "learning_rate": 7.978979434407842e-05, "loss": 0.6788, "step": 2155 }, { "epoch": 0.6912, "grad_norm": 0.033678086620005154, "learning_rate": 7.905044153331441e-05, "loss": 0.711, "step": 2160 }, { "epoch": 0.6928, "grad_norm": 0.04310639146017272, "learning_rate": 7.831330262096444e-05, "loss": 0.7007, "step": 2165 }, { "epoch": 0.6944, "grad_norm": 0.035740468353472145, "learning_rate": 7.757840060859053e-05, "loss": 0.6907, "step": 2170 }, { "epoch": 0.696, "grad_norm": 0.0388185330369263, "learning_rate": 7.684575842795485e-05, "loss": 0.7066, "step": 2175 }, { "epoch": 0.6976, "grad_norm": 0.035326733296463714, "learning_rate": 7.611539894030405e-05, "loss": 0.6856, "step": 2180 }, { "epoch": 0.6992, "grad_norm": 0.03673006246399958, "learning_rate": 7.53873449356562e-05, "loss": 0.7035, "step": 2185 }, { "epoch": 0.7008, "grad_norm": 0.04032795219690735, "learning_rate": 7.466161913208932e-05, "loss": 0.713, "step": 2190 }, { "epoch": 0.7024, "grad_norm": 0.04007204362489801, "learning_rate": 7.39382441750326e-05, "loss": 0.6855, "step": 2195 }, { "epoch": 0.704, "grad_norm": 0.0349153417362628, "learning_rate": 7.321724263655988e-05, "loss": 0.6931, "step": 2200 }, { "epoch": 0.7056, "grad_norm": 0.039088044797178166, "learning_rate": 7.249863701468514e-05, "loss": 0.7003, "step": 2205 }, { "epoch": 0.7072, "grad_norm": 0.036879283255029253, "learning_rate": 7.178244973266072e-05, "loss": 0.7259, "step": 2210 }, { "epoch": 0.7088, "grad_norm": 0.03653281890039639, "learning_rate": 7.106870313827734e-05, "loss": 0.7083, "step": 2215 }, { "epoch": 0.7104, "grad_norm": 0.0378300680680633, "learning_rate": 7.035741950316698e-05, "loss": 0.688, "step": 2220 }, { "epoch": 0.712, "grad_norm": 0.03938909267628237, "learning_rate": 6.964862102210773e-05, "loss": 0.6902, "step": 2225 }, { "epoch": 0.7136, "grad_norm": 0.043292695863837646, "learning_rate": 6.894232981233154e-05, "loss": 0.707, "step": 2230 }, { "epoch": 0.7152, "grad_norm": 0.03873590569931821, "learning_rate": 6.823856791283366e-05, "loss": 0.7172, "step": 2235 }, { "epoch": 0.7168, "grad_norm": 0.040179163618301025, "learning_rate": 6.753735728368534e-05, "loss": 0.7001, "step": 2240 }, { "epoch": 0.7184, "grad_norm": 0.03665089260769722, "learning_rate": 6.683871980534833e-05, "loss": 0.7119, "step": 2245 }, { "epoch": 0.72, "grad_norm": 0.04017293034485019, "learning_rate": 6.614267727799212e-05, "loss": 0.6988, "step": 2250 }, { "epoch": 0.7216, "grad_norm": 0.03813762526767263, "learning_rate": 6.544925142081391e-05, "loss": 0.7121, "step": 2255 }, { "epoch": 0.7232, "grad_norm": 0.03854828556279748, "learning_rate": 6.475846387136067e-05, "loss": 0.6962, "step": 2260 }, { "epoch": 0.7248, "grad_norm": 0.03719435244618414, "learning_rate": 6.4070336184854e-05, "loss": 0.6927, "step": 2265 }, { "epoch": 0.7264, "grad_norm": 0.04241730525764197, "learning_rate": 6.338488983351777e-05, "loss": 0.7053, "step": 2270 }, { "epoch": 0.728, "grad_norm": 0.03572759759362592, "learning_rate": 6.270214620590773e-05, "loss": 0.7022, "step": 2275 }, { "epoch": 0.7296, "grad_norm": 0.03946181148151894, "learning_rate": 6.202212660624432e-05, "loss": 0.7106, "step": 2280 }, { "epoch": 0.7312, "grad_norm": 0.03882270135457663, "learning_rate": 6.134485225374787e-05, "loss": 0.685, "step": 2285 }, { "epoch": 0.7328, "grad_norm": 0.0379435383069947, "learning_rate": 6.06703442819764e-05, "loss": 0.6952, "step": 2290 }, { "epoch": 0.7344, "grad_norm": 0.03820113955011566, "learning_rate": 5.999862373816643e-05, "loss": 0.6873, "step": 2295 }, { "epoch": 0.736, "grad_norm": 0.04381507714614669, "learning_rate": 5.932971158257583e-05, "loss": 0.6982, "step": 2300 }, { "epoch": 0.7376, "grad_norm": 0.045255086563538174, "learning_rate": 5.866362868783008e-05, "loss": 0.711, "step": 2305 }, { "epoch": 0.7392, "grad_norm": 0.03831786593553919, "learning_rate": 5.800039583827083e-05, "loss": 0.6905, "step": 2310 }, { "epoch": 0.7408, "grad_norm": 0.038176364964366265, "learning_rate": 5.7340033729307355e-05, "loss": 0.6952, "step": 2315 }, { "epoch": 0.7424, "grad_norm": 0.03557728343089575, "learning_rate": 5.668256296677093e-05, "loss": 0.6942, "step": 2320 }, { "epoch": 0.744, "grad_norm": 0.037453237191514, "learning_rate": 5.602800406627168e-05, "loss": 0.704, "step": 2325 }, { "epoch": 0.7456, "grad_norm": 0.04201408146495244, "learning_rate": 5.5376377452558405e-05, "loss": 0.6901, "step": 2330 }, { "epoch": 0.7472, "grad_norm": 0.03459644417240455, "learning_rate": 5.472770345888134e-05, "loss": 0.6893, "step": 2335 }, { "epoch": 0.7488, "grad_norm": 0.03576418609943801, "learning_rate": 5.4082002326357655e-05, "loss": 0.6953, "step": 2340 }, { "epoch": 0.7504, "grad_norm": 0.03654844796403911, "learning_rate": 5.343929420333984e-05, "loss": 0.6902, "step": 2345 }, { "epoch": 0.752, "grad_norm": 0.043819614388394754, "learning_rate": 5.2799599144786966e-05, "loss": 0.7114, "step": 2350 }, { "epoch": 0.7536, "grad_norm": 0.04040722566989979, "learning_rate": 5.216293711163908e-05, "loss": 0.6901, "step": 2355 }, { "epoch": 0.7552, "grad_norm": 0.03649315688342495, "learning_rate": 5.152932797019403e-05, "loss": 0.691, "step": 2360 }, { "epoch": 0.7568, "grad_norm": 0.03671257352284008, "learning_rate": 5.089879149148781e-05, "loss": 0.6973, "step": 2365 }, { "epoch": 0.7584, "grad_norm": 0.03720399162630074, "learning_rate": 5.027134735067753e-05, "loss": 0.669, "step": 2370 }, { "epoch": 0.76, "grad_norm": 0.03563182343583573, "learning_rate": 4.964701512642746e-05, "loss": 0.689, "step": 2375 }, { "epoch": 0.7616, "grad_norm": 0.035955548417391445, "learning_rate": 4.9025814300298265e-05, "loss": 0.6944, "step": 2380 }, { "epoch": 0.7632, "grad_norm": 0.036492436660829214, "learning_rate": 4.840776425613886e-05, "loss": 0.6876, "step": 2385 }, { "epoch": 0.7648, "grad_norm": 0.04191984666054339, "learning_rate": 4.779288427948168e-05, "loss": 0.7135, "step": 2390 }, { "epoch": 0.7664, "grad_norm": 0.0389152552227016, "learning_rate": 4.718119355694091e-05, "loss": 0.7025, "step": 2395 }, { "epoch": 0.768, "grad_norm": 0.03552598244342018, "learning_rate": 4.657271117561374e-05, "loss": 0.6891, "step": 2400 }, { "epoch": 0.7696, "grad_norm": 0.037991199731399375, "learning_rate": 4.596745612248488e-05, "loss": 0.7018, "step": 2405 }, { "epoch": 0.7712, "grad_norm": 0.04009265600800646, "learning_rate": 4.536544728383403e-05, "loss": 0.6975, "step": 2410 }, { "epoch": 0.7728, "grad_norm": 0.034383530574825634, "learning_rate": 4.476670344464645e-05, "loss": 0.6718, "step": 2415 }, { "epoch": 0.7744, "grad_norm": 0.034993269435869126, "learning_rate": 4.417124328802694e-05, "loss": 0.684, "step": 2420 }, { "epoch": 0.776, "grad_norm": 0.03425009459589239, "learning_rate": 4.357908539461678e-05, "loss": 0.6858, "step": 2425 }, { "epoch": 0.7776, "grad_norm": 0.03919492090647449, "learning_rate": 4.299024824201402e-05, "loss": 0.6988, "step": 2430 }, { "epoch": 0.7792, "grad_norm": 0.034476194930932354, "learning_rate": 4.240475020419675e-05, "loss": 0.6733, "step": 2435 }, { "epoch": 0.7808, "grad_norm": 0.03498517451850298, "learning_rate": 4.1822609550950066e-05, "loss": 0.6847, "step": 2440 }, { "epoch": 0.7824, "grad_norm": 0.03513837580346364, "learning_rate": 4.124384444729561e-05, "loss": 0.7061, "step": 2445 }, { "epoch": 0.784, "grad_norm": 0.03568352623910555, "learning_rate": 4.0668472952924977e-05, "loss": 0.7133, "step": 2450 }, { "epoch": 0.7856, "grad_norm": 0.03452583660811545, "learning_rate": 4.009651302163614e-05, "loss": 0.6906, "step": 2455 }, { "epoch": 0.7872, "grad_norm": 0.037618350024330255, "learning_rate": 3.952798250077317e-05, "loss": 0.6919, "step": 2460 }, { "epoch": 0.7888, "grad_norm": 0.03692436380717943, "learning_rate": 3.896289913066952e-05, "loss": 0.7076, "step": 2465 }, { "epoch": 0.7904, "grad_norm": 0.03430203099467551, "learning_rate": 3.840128054409414e-05, "loss": 0.682, "step": 2470 }, { "epoch": 0.792, "grad_norm": 0.03461351398958901, "learning_rate": 3.784314426570156e-05, "loss": 0.6773, "step": 2475 }, { "epoch": 0.7936, "grad_norm": 0.03921640450921252, "learning_rate": 3.728850771148486e-05, "loss": 0.6973, "step": 2480 }, { "epoch": 0.7952, "grad_norm": 0.03544120496297954, "learning_rate": 3.67373881882323e-05, "loss": 0.71, "step": 2485 }, { "epoch": 0.7968, "grad_norm": 0.036332234631180356, "learning_rate": 3.618980289298736e-05, "loss": 0.6869, "step": 2490 }, { "epoch": 0.7984, "grad_norm": 0.0350795962674337, "learning_rate": 3.5645768912512046e-05, "loss": 0.6763, "step": 2495 }, { "epoch": 0.8, "grad_norm": 0.03620429105944647, "learning_rate": 3.510530322275362e-05, "loss": 0.7201, "step": 2500 }, { "epoch": 0.8016, "grad_norm": 0.034305049981790806, "learning_rate": 3.4568422688315025e-05, "loss": 0.6943, "step": 2505 }, { "epoch": 0.8032, "grad_norm": 0.036585621965710045, "learning_rate": 3.4035144061928635e-05, "loss": 0.6916, "step": 2510 }, { "epoch": 0.8048, "grad_norm": 0.037124612583122524, "learning_rate": 3.350548398393345e-05, "loss": 0.719, "step": 2515 }, { "epoch": 0.8064, "grad_norm": 0.035350388626476145, "learning_rate": 3.297945898175584e-05, "loss": 0.6869, "step": 2520 }, { "epoch": 0.808, "grad_norm": 0.0368119047385571, "learning_rate": 3.245708546939401e-05, "loss": 0.6981, "step": 2525 }, { "epoch": 0.8096, "grad_norm": 0.038393586777655066, "learning_rate": 3.19383797469055e-05, "loss": 0.6906, "step": 2530 }, { "epoch": 0.8112, "grad_norm": 0.034070531981797145, "learning_rate": 3.142335799989883e-05, "loss": 0.7026, "step": 2535 }, { "epoch": 0.8128, "grad_norm": 0.03656747081077928, "learning_rate": 3.091203629902832e-05, "loss": 0.6936, "step": 2540 }, { "epoch": 0.8144, "grad_norm": 0.03591367118250926, "learning_rate": 3.0404430599492636e-05, "loss": 0.6964, "step": 2545 }, { "epoch": 0.816, "grad_norm": 0.03224723089209564, "learning_rate": 2.9900556740537026e-05, "loss": 0.6967, "step": 2550 }, { "epoch": 0.8176, "grad_norm": 0.03592379714034871, "learning_rate": 2.9400430444958932e-05, "loss": 0.6837, "step": 2555 }, { "epoch": 0.8192, "grad_norm": 0.03564531082517651, "learning_rate": 2.8904067318617453e-05, "loss": 0.6863, "step": 2560 }, { "epoch": 0.8208, "grad_norm": 0.036226568505578746, "learning_rate": 2.841148284994635e-05, "loss": 0.6803, "step": 2565 }, { "epoch": 0.8224, "grad_norm": 0.036427168710013086, "learning_rate": 2.792269240947076e-05, "loss": 0.6884, "step": 2570 }, { "epoch": 0.824, "grad_norm": 0.03873968606981723, "learning_rate": 2.7437711249327655e-05, "loss": 0.6861, "step": 2575 }, { "epoch": 0.8256, "grad_norm": 0.040206628177924045, "learning_rate": 2.6956554502789867e-05, "loss": 0.678, "step": 2580 }, { "epoch": 0.8272, "grad_norm": 0.035584398042952534, "learning_rate": 2.6479237183793712e-05, "loss": 0.693, "step": 2585 }, { "epoch": 0.8288, "grad_norm": 0.03504043384204677, "learning_rate": 2.6005774186470746e-05, "loss": 0.6899, "step": 2590 }, { "epoch": 0.8304, "grad_norm": 0.03585999832889161, "learning_rate": 2.5536180284682857e-05, "loss": 0.6822, "step": 2595 }, { "epoch": 0.832, "grad_norm": 0.03582872557001751, "learning_rate": 2.507047013156133e-05, "loss": 0.7006, "step": 2600 }, { "epoch": 0.8336, "grad_norm": 0.03521207047344509, "learning_rate": 2.4608658259049542e-05, "loss": 0.6968, "step": 2605 }, { "epoch": 0.8352, "grad_norm": 0.03638818276349012, "learning_rate": 2.4150759077449698e-05, "loss": 0.6858, "step": 2610 }, { "epoch": 0.8368, "grad_norm": 0.03448652695031895, "learning_rate": 2.369678687497289e-05, "loss": 0.6923, "step": 2615 }, { "epoch": 0.8384, "grad_norm": 0.03585778645343454, "learning_rate": 2.3246755817293444e-05, "loss": 0.6921, "step": 2620 }, { "epoch": 0.84, "grad_norm": 0.035878782315928406, "learning_rate": 2.2800679947106853e-05, "loss": 0.6868, "step": 2625 }, { "epoch": 0.8416, "grad_norm": 0.03413705214140856, "learning_rate": 2.2358573183691565e-05, "loss": 0.6745, "step": 2630 }, { "epoch": 0.8432, "grad_norm": 0.03741900054647843, "learning_rate": 2.1920449322474742e-05, "loss": 0.6944, "step": 2635 }, { "epoch": 0.8448, "grad_norm": 0.033945500957096725, "learning_rate": 2.1486322034601657e-05, "loss": 0.6764, "step": 2640 }, { "epoch": 0.8464, "grad_norm": 0.03693693672153618, "learning_rate": 2.105620486650914e-05, "loss": 0.6861, "step": 2645 }, { "epoch": 0.848, "grad_norm": 0.035959214488486574, "learning_rate": 2.063011123950295e-05, "loss": 0.6907, "step": 2650 }, { "epoch": 0.8496, "grad_norm": 0.03606750368452151, "learning_rate": 2.0208054449338878e-05, "loss": 0.6757, "step": 2655 }, { "epoch": 0.8512, "grad_norm": 0.039459520303168856, "learning_rate": 1.979004766580797e-05, "loss": 0.6802, "step": 2660 }, { "epoch": 0.8528, "grad_norm": 0.035442956185441674, "learning_rate": 1.9376103932325547e-05, "loss": 0.6933, "step": 2665 }, { "epoch": 0.8544, "grad_norm": 0.035197279761445745, "learning_rate": 1.8966236165524102e-05, "loss": 0.6948, "step": 2670 }, { "epoch": 0.856, "grad_norm": 0.03580007213986297, "learning_rate": 1.8560457154850395e-05, "loss": 0.682, "step": 2675 }, { "epoch": 0.8576, "grad_norm": 0.0339719466539305, "learning_rate": 1.8158779562166252e-05, "loss": 0.6871, "step": 2680 }, { "epoch": 0.8592, "grad_norm": 0.0342844166181569, "learning_rate": 1.7761215921353576e-05, "loss": 0.6871, "step": 2685 }, { "epoch": 0.8608, "grad_norm": 0.03654992692878969, "learning_rate": 1.7367778637923135e-05, "loss": 0.6735, "step": 2690 }, { "epoch": 0.8624, "grad_norm": 0.03205243106560114, "learning_rate": 1.6978479988627592e-05, "loss": 0.6778, "step": 2695 }, { "epoch": 0.864, "grad_norm": 0.0356464042393281, "learning_rate": 1.6593332121078295e-05, "loss": 0.6893, "step": 2700 }, { "epoch": 0.8656, "grad_norm": 0.03475488641190361, "learning_rate": 1.6212347053366287e-05, "loss": 0.6969, "step": 2705 }, { "epoch": 0.8672, "grad_norm": 0.03571595410829047, "learning_rate": 1.5835536673687278e-05, "loss": 0.6561, "step": 2710 }, { "epoch": 0.8688, "grad_norm": 0.037110415954442255, "learning_rate": 1.5462912739970663e-05, "loss": 0.6739, "step": 2715 }, { "epoch": 0.8704, "grad_norm": 0.03705588213171844, "learning_rate": 1.5094486879512786e-05, "loss": 0.6724, "step": 2720 }, { "epoch": 0.872, "grad_norm": 0.0359981143961099, "learning_rate": 1.4730270588613874e-05, "loss": 0.6899, "step": 2725 }, { "epoch": 0.8736, "grad_norm": 0.03586953931395446, "learning_rate": 1.437027523221948e-05, "loss": 0.68, "step": 2730 }, { "epoch": 0.8752, "grad_norm": 0.03601668331584326, "learning_rate": 1.4014512043565812e-05, "loss": 0.6782, "step": 2735 }, { "epoch": 0.8768, "grad_norm": 0.03533498098436524, "learning_rate": 1.3662992123829164e-05, "loss": 0.7159, "step": 2740 }, { "epoch": 0.8784, "grad_norm": 0.037640124507066094, "learning_rate": 1.3315726441779629e-05, "loss": 0.6846, "step": 2745 }, { "epoch": 0.88, "grad_norm": 0.03482454818070702, "learning_rate": 1.2972725833438752e-05, "loss": 0.6819, "step": 2750 }, { "epoch": 0.8816, "grad_norm": 0.0367271221138329, "learning_rate": 1.2634001001741373e-05, "loss": 0.6755, "step": 2755 }, { "epoch": 0.8832, "grad_norm": 0.03841584868617722, "learning_rate": 1.2299562516201739e-05, "loss": 0.6935, "step": 2760 }, { "epoch": 0.8848, "grad_norm": 0.040854540776410125, "learning_rate": 1.1969420812583603e-05, "loss": 0.7012, "step": 2765 }, { "epoch": 0.8864, "grad_norm": 0.0341221051853296, "learning_rate": 1.1643586192574684e-05, "loss": 0.6706, "step": 2770 }, { "epoch": 0.888, "grad_norm": 0.03757875689795596, "learning_rate": 1.1322068823465125e-05, "loss": 0.6771, "step": 2775 }, { "epoch": 0.8896, "grad_norm": 0.034492555749470376, "learning_rate": 1.1004878737830363e-05, "loss": 0.6678, "step": 2780 }, { "epoch": 0.8912, "grad_norm": 0.03331925332418725, "learning_rate": 1.06920258332179e-05, "loss": 0.6849, "step": 2785 }, { "epoch": 0.8928, "grad_norm": 0.03367504222803977, "learning_rate": 1.0383519871838587e-05, "loss": 0.671, "step": 2790 }, { "epoch": 0.8944, "grad_norm": 0.03373539399249363, "learning_rate": 1.0079370480261984e-05, "loss": 0.6742, "step": 2795 }, { "epoch": 0.896, "grad_norm": 0.03247618752201015, "learning_rate": 9.779587149115891e-06, "loss": 0.6856, "step": 2800 }, { "epoch": 0.8976, "grad_norm": 0.03584180038198852, "learning_rate": 9.484179232790389e-06, "loss": 0.6664, "step": 2805 }, { "epoch": 0.8992, "grad_norm": 0.036302861284426946, "learning_rate": 9.193155949145736e-06, "loss": 0.672, "step": 2810 }, { "epoch": 0.9008, "grad_norm": 0.03503030446991528, "learning_rate": 8.906526379224865e-06, "loss": 0.7019, "step": 2815 }, { "epoch": 0.9024, "grad_norm": 0.03560018408938535, "learning_rate": 8.624299466969964e-06, "loss": 0.6819, "step": 2820 }, { "epoch": 0.904, "grad_norm": 0.03286452854193144, "learning_rate": 8.346484018943428e-06, "loss": 0.6785, "step": 2825 }, { "epoch": 0.9056, "grad_norm": 0.03712658893214972, "learning_rate": 8.073088704053082e-06, "loss": 0.6868, "step": 2830 }, { "epoch": 0.9072, "grad_norm": 0.035362561023794904, "learning_rate": 7.80412205328153e-06, "loss": 0.6847, "step": 2835 }, { "epoch": 0.9088, "grad_norm": 0.03413740311243108, "learning_rate": 7.539592459420219e-06, "loss": 0.6694, "step": 2840 }, { "epoch": 0.9104, "grad_norm": 0.03460380103631612, "learning_rate": 7.279508176807258e-06, "loss": 0.6814, "step": 2845 }, { "epoch": 0.912, "grad_norm": 0.03345889140853353, "learning_rate": 7.023877321070071e-06, "loss": 0.6826, "step": 2850 }, { "epoch": 0.9136, "grad_norm": 0.03658572315985044, "learning_rate": 6.7727078688720515e-06, "loss": 0.6942, "step": 2855 }, { "epoch": 0.9152, "grad_norm": 0.033525356039550876, "learning_rate": 6.526007657663656e-06, "loss": 0.6743, "step": 2860 }, { "epoch": 0.9168, "grad_norm": 0.03569978757002032, "learning_rate": 6.283784385437962e-06, "loss": 0.6824, "step": 2865 }, { "epoch": 0.9184, "grad_norm": 0.034788249129439716, "learning_rate": 6.046045610490324e-06, "loss": 0.6843, "step": 2870 }, { "epoch": 0.92, "grad_norm": 0.03856999362984195, "learning_rate": 5.8127987511825516e-06, "loss": 0.6919, "step": 2875 }, { "epoch": 0.9216, "grad_norm": 0.03363147811222781, "learning_rate": 5.584051085711488e-06, "loss": 0.6676, "step": 2880 }, { "epoch": 0.9232, "grad_norm": 0.03337401697063256, "learning_rate": 5.359809751881833e-06, "loss": 0.668, "step": 2885 }, { "epoch": 0.9248, "grad_norm": 0.03391950164002368, "learning_rate": 5.140081746883546e-06, "loss": 0.6709, "step": 2890 }, { "epoch": 0.9264, "grad_norm": 0.03628260571840196, "learning_rate": 4.924873927073292e-06, "loss": 0.6759, "step": 2895 }, { "epoch": 0.928, "grad_norm": 0.03719727694206389, "learning_rate": 4.714193007760681e-06, "loss": 0.6841, "step": 2900 }, { "epoch": 0.9296, "grad_norm": 0.035072599886154625, "learning_rate": 4.5080455629986465e-06, "loss": 0.687, "step": 2905 }, { "epoch": 0.9312, "grad_norm": 0.03402712931516279, "learning_rate": 4.306438025378317e-06, "loss": 0.6887, "step": 2910 }, { "epoch": 0.9328, "grad_norm": 0.033629393596595857, "learning_rate": 4.109376685828336e-06, "loss": 0.6938, "step": 2915 }, { "epoch": 0.9344, "grad_norm": 0.03430870214738047, "learning_rate": 3.916867693418474e-06, "loss": 0.6779, "step": 2920 }, { "epoch": 0.936, "grad_norm": 0.03428926816458531, "learning_rate": 3.7289170551678626e-06, "loss": 0.683, "step": 2925 }, { "epoch": 0.9376, "grad_norm": 0.03636527753521949, "learning_rate": 3.545530635857463e-06, "loss": 0.6849, "step": 2930 }, { "epoch": 0.9392, "grad_norm": 0.037074968354287835, "learning_rate": 3.366714157847078e-06, "loss": 0.6787, "step": 2935 }, { "epoch": 0.9408, "grad_norm": 0.03690326258068921, "learning_rate": 3.192473200896828e-06, "loss": 0.6912, "step": 2940 }, { "epoch": 0.9424, "grad_norm": 0.03676314823395921, "learning_rate": 3.0228132019930407e-06, "loss": 0.6982, "step": 2945 }, { "epoch": 0.944, "grad_norm": 0.03326096113948387, "learning_rate": 2.8577394551785705e-06, "loss": 0.6808, "step": 2950 }, { "epoch": 0.9456, "grad_norm": 0.03315391474215544, "learning_rate": 2.6972571113875798e-06, "loss": 0.6828, "step": 2955 }, { "epoch": 0.9472, "grad_norm": 0.034162229284439904, "learning_rate": 2.5413711782848845e-06, "loss": 0.6769, "step": 2960 }, { "epoch": 0.9488, "grad_norm": 0.03557968800367496, "learning_rate": 2.39008652010963e-06, "loss": 0.684, "step": 2965 }, { "epoch": 0.9504, "grad_norm": 0.037053943949424364, "learning_rate": 2.243407857523544e-06, "loss": 0.7004, "step": 2970 }, { "epoch": 0.952, "grad_norm": 0.03521116100655522, "learning_rate": 2.1013397674636057e-06, "loss": 0.6724, "step": 2975 }, { "epoch": 0.9536, "grad_norm": 0.037461763983045196, "learning_rate": 1.9638866829992606e-06, "loss": 0.6813, "step": 2980 }, { "epoch": 0.9552, "grad_norm": 0.0337899195203273, "learning_rate": 1.8310528931940627e-06, "loss": 0.6876, "step": 2985 }, { "epoch": 0.9568, "grad_norm": 0.0333450608692831, "learning_rate": 1.702842542971866e-06, "loss": 0.679, "step": 2990 }, { "epoch": 0.9584, "grad_norm": 0.03259070901476999, "learning_rate": 1.5792596329874451e-06, "loss": 0.6946, "step": 2995 }, { "epoch": 0.96, "grad_norm": 0.0355419164421536, "learning_rate": 1.4603080195017103e-06, "loss": 0.687, "step": 3000 }, { "epoch": 0.9616, "grad_norm": 0.03486772045983402, "learning_rate": 1.3459914142613381e-06, "loss": 0.6793, "step": 3005 }, { "epoch": 0.9632, "grad_norm": 0.03758596013480105, "learning_rate": 1.2363133843829976e-06, "loss": 0.6928, "step": 3010 }, { "epoch": 0.9648, "grad_norm": 0.03419367915456267, "learning_rate": 1.1312773522419716e-06, "loss": 0.6823, "step": 3015 }, { "epoch": 0.9664, "grad_norm": 0.035663872743848725, "learning_rate": 1.0308865953654267e-06, "loss": 0.6989, "step": 3020 }, { "epoch": 0.968, "grad_norm": 0.03616981749097929, "learning_rate": 9.351442463301273e-07, "loss": 0.6731, "step": 3025 }, { "epoch": 0.9696, "grad_norm": 0.03762974647472138, "learning_rate": 8.440532926646315e-07, "loss": 0.6924, "step": 3030 }, { "epoch": 0.9712, "grad_norm": 0.03399772759360819, "learning_rate": 7.576165767561815e-07, "loss": 0.6958, "step": 3035 }, { "epoch": 0.9728, "grad_norm": 0.03359200040600764, "learning_rate": 6.758367957619082e-07, "loss": 0.6755, "step": 3040 }, { "epoch": 0.9744, "grad_norm": 0.03408737827328263, "learning_rate": 5.98716501524732e-07, "loss": 0.6727, "step": 3045 }, { "epoch": 0.976, "grad_norm": 0.033986659157017304, "learning_rate": 5.262581004936927e-07, "loss": 0.6825, "step": 3050 }, { "epoch": 0.9776, "grad_norm": 0.036097505040925654, "learning_rate": 4.5846385364886053e-07, "loss": 0.6807, "step": 3055 }, { "epoch": 0.9792, "grad_norm": 0.03186557437798362, "learning_rate": 3.953358764308412e-07, "loss": 0.6746, "step": 3060 }, { "epoch": 0.9808, "grad_norm": 0.033878030478239944, "learning_rate": 3.3687613867469653e-07, "loss": 0.6913, "step": 3065 }, { "epoch": 0.9824, "grad_norm": 0.03510026870536183, "learning_rate": 2.830864645485265e-07, "loss": 0.6869, "step": 3070 }, { "epoch": 0.984, "grad_norm": 0.03417621913984012, "learning_rate": 2.3396853249653124e-07, "loss": 0.688, "step": 3075 }, { "epoch": 0.9856, "grad_norm": 0.03312402040298033, "learning_rate": 1.895238751866035e-07, "loss": 0.6843, "step": 3080 }, { "epoch": 0.9872, "grad_norm": 0.03360622380173029, "learning_rate": 1.4975387946256635e-07, "loss": 0.6783, "step": 3085 }, { "epoch": 0.9888, "grad_norm": 0.0354700691322541, "learning_rate": 1.1465978630085803e-07, "loss": 0.6868, "step": 3090 }, { "epoch": 0.9904, "grad_norm": 0.03295568524608156, "learning_rate": 8.424269077182965e-08, "loss": 0.6738, "step": 3095 }, { "epoch": 0.992, "grad_norm": 0.032788816463157545, "learning_rate": 5.8503542005555734e-08, "loss": 0.6812, "step": 3100 }, { "epoch": 0.9936, "grad_norm": 0.035024900613057144, "learning_rate": 3.7443143162207976e-08, "loss": 0.704, "step": 3105 }, { "epoch": 0.9952, "grad_norm": 0.036688178061528604, "learning_rate": 2.1062151407025274e-08, "loss": 0.6963, "step": 3110 }, { "epoch": 0.9968, "grad_norm": 0.03618314272574878, "learning_rate": 9.36107788981344e-09, "loss": 0.689, "step": 3115 }, { "epoch": 0.9984, "grad_norm": 0.03425916181867982, "learning_rate": 2.340287728941348e-09, "loss": 0.6649, "step": 3120 }, { "epoch": 1.0, "grad_norm": 0.03464178557265085, "learning_rate": 0.0, "loss": 0.6967, "step": 3125 }, { "epoch": 1.0, "eval_loss": 0.6901005506515503, "eval_runtime": 259.4269, "eval_samples_per_second": 119.849, "eval_steps_per_second": 3.747, "step": 3125 }, { "epoch": 1.0, "step": 3125, "total_flos": 514082196226048.0, "train_loss": 0.7583255160522461, "train_runtime": 13626.6135, "train_samples_per_second": 29.354, "train_steps_per_second": 0.229 } ], "logging_steps": 5, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 514082196226048.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }