{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998579209093061, "eval_steps": 500, "global_step": 5278, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000947193937958797, "grad_norm": 52.55722061842073, "learning_rate": 2.8409090909090907e-06, "loss": 3.7816, "step": 5 }, { "epoch": 0.001894387875917594, "grad_norm": 38.95340149423018, "learning_rate": 5.6818181818181815e-06, "loss": 3.5056, "step": 10 }, { "epoch": 0.0028415818138763913, "grad_norm": 21.712424534014453, "learning_rate": 8.522727272727271e-06, "loss": 2.882, "step": 15 }, { "epoch": 0.003788775751835188, "grad_norm": 9.040565708034416, "learning_rate": 1.1363636363636363e-05, "loss": 2.173, "step": 20 }, { "epoch": 0.004735969689793985, "grad_norm": 4.220912189865154, "learning_rate": 1.4204545454545453e-05, "loss": 1.6834, "step": 25 }, { "epoch": 0.005683163627752783, "grad_norm": 1.9714256696877792, "learning_rate": 1.7045454545454543e-05, "loss": 1.3769, "step": 30 }, { "epoch": 0.006630357565711579, "grad_norm": 1.0423474521511698, "learning_rate": 1.9886363636363634e-05, "loss": 1.2239, "step": 35 }, { "epoch": 0.007577551503670376, "grad_norm": 1.2391396539653674, "learning_rate": 2.2727272727272726e-05, "loss": 1.1367, "step": 40 }, { "epoch": 0.008524745441629173, "grad_norm": 0.46298881388281793, "learning_rate": 2.5568181818181814e-05, "loss": 1.0701, "step": 45 }, { "epoch": 0.00947193937958797, "grad_norm": 0.3539086469394409, "learning_rate": 2.8409090909090906e-05, "loss": 1.0452, "step": 50 }, { "epoch": 0.010419133317546767, "grad_norm": 0.303699683898037, "learning_rate": 3.125e-05, "loss": 1.0418, "step": 55 }, { "epoch": 0.011366327255505565, "grad_norm": 0.2133546406490209, "learning_rate": 3.4090909090909085e-05, "loss": 0.9768, "step": 60 }, { "epoch": 0.012313521193464362, "grad_norm": 0.2101107113029947, "learning_rate": 3.693181818181818e-05, "loss": 0.9808, "step": 65 }, { "epoch": 0.013260715131423158, "grad_norm": 0.2001787181890806, "learning_rate": 3.977272727272727e-05, "loss": 0.9558, "step": 70 }, { "epoch": 0.014207909069381956, "grad_norm": 0.2060240507616133, "learning_rate": 4.261363636363637e-05, "loss": 0.961, "step": 75 }, { "epoch": 0.015155103007340753, "grad_norm": 0.1998791188434844, "learning_rate": 4.545454545454545e-05, "loss": 0.9515, "step": 80 }, { "epoch": 0.01610229694529955, "grad_norm": 0.15833294762038227, "learning_rate": 4.8295454545454537e-05, "loss": 0.9414, "step": 85 }, { "epoch": 0.017049490883258345, "grad_norm": 0.1520760900480208, "learning_rate": 5.113636363636363e-05, "loss": 0.8963, "step": 90 }, { "epoch": 0.017996684821217145, "grad_norm": 0.1388506087010938, "learning_rate": 5.3977272727272727e-05, "loss": 0.9501, "step": 95 }, { "epoch": 0.01894387875917594, "grad_norm": 0.1410344131187388, "learning_rate": 5.681818181818181e-05, "loss": 0.909, "step": 100 }, { "epoch": 0.019891072697134738, "grad_norm": 0.12318568339151473, "learning_rate": 5.96590909090909e-05, "loss": 0.9107, "step": 105 }, { "epoch": 0.020838266635093534, "grad_norm": 0.1279058146057783, "learning_rate": 6.25e-05, "loss": 0.8978, "step": 110 }, { "epoch": 0.02178546057305233, "grad_norm": 0.11032351265233929, "learning_rate": 6.534090909090909e-05, "loss": 0.931, "step": 115 }, { "epoch": 0.02273265451101113, "grad_norm": 0.1119942887320865, "learning_rate": 6.818181818181817e-05, "loss": 0.8893, "step": 120 }, { "epoch": 0.023679848448969927, "grad_norm": 0.10935745636043816, "learning_rate": 7.102272727272727e-05, "loss": 0.9006, "step": 125 }, { "epoch": 0.024627042386928723, "grad_norm": 0.1083392524866063, "learning_rate": 7.386363636363635e-05, "loss": 0.8987, "step": 130 }, { "epoch": 0.02557423632488752, "grad_norm": 0.09975528745630019, "learning_rate": 7.670454545454545e-05, "loss": 0.8831, "step": 135 }, { "epoch": 0.026521430262846316, "grad_norm": 0.08603398438897839, "learning_rate": 7.954545454545454e-05, "loss": 0.8836, "step": 140 }, { "epoch": 0.027468624200805116, "grad_norm": 0.08005996371772706, "learning_rate": 8.238636363636362e-05, "loss": 0.8932, "step": 145 }, { "epoch": 0.028415818138763912, "grad_norm": 0.09105132634043626, "learning_rate": 8.522727272727273e-05, "loss": 0.8953, "step": 150 }, { "epoch": 0.02936301207672271, "grad_norm": 0.08350347397936869, "learning_rate": 8.806818181818182e-05, "loss": 0.8901, "step": 155 }, { "epoch": 0.030310206014681505, "grad_norm": 0.06553921290329484, "learning_rate": 9.09090909090909e-05, "loss": 0.84, "step": 160 }, { "epoch": 0.031257399952640305, "grad_norm": 0.07002312647287917, "learning_rate": 9.374999999999999e-05, "loss": 0.9037, "step": 165 }, { "epoch": 0.0322045938905991, "grad_norm": 0.07442774560302336, "learning_rate": 9.659090909090907e-05, "loss": 0.8961, "step": 170 }, { "epoch": 0.0331517878285579, "grad_norm": 0.07978205419119488, "learning_rate": 9.943181818181817e-05, "loss": 0.8472, "step": 175 }, { "epoch": 0.03409898176651669, "grad_norm": 0.08212508538706279, "learning_rate": 0.00010227272727272726, "loss": 0.8828, "step": 180 }, { "epoch": 0.03504617570447549, "grad_norm": 0.06703812335472381, "learning_rate": 0.00010511363636363635, "loss": 0.8952, "step": 185 }, { "epoch": 0.03599336964243429, "grad_norm": 0.07274197409833637, "learning_rate": 0.00010795454545454545, "loss": 0.8884, "step": 190 }, { "epoch": 0.03694056358039308, "grad_norm": 0.06566094578471512, "learning_rate": 0.00011079545454545454, "loss": 0.8457, "step": 195 }, { "epoch": 0.03788775751835188, "grad_norm": 0.08307007406251704, "learning_rate": 0.00011363636363636362, "loss": 0.8723, "step": 200 }, { "epoch": 0.038834951456310676, "grad_norm": 0.08535904339597694, "learning_rate": 0.00011647727272727271, "loss": 0.9134, "step": 205 }, { "epoch": 0.039782145394269476, "grad_norm": 0.06790982585326019, "learning_rate": 0.0001193181818181818, "loss": 0.888, "step": 210 }, { "epoch": 0.040729339332228276, "grad_norm": 0.07237225374916947, "learning_rate": 0.0001221590909090909, "loss": 0.8629, "step": 215 }, { "epoch": 0.04167653327018707, "grad_norm": 0.07663584947407301, "learning_rate": 0.000125, "loss": 0.8979, "step": 220 }, { "epoch": 0.04262372720814587, "grad_norm": 0.06530101929213837, "learning_rate": 0.00012784090909090907, "loss": 0.893, "step": 225 }, { "epoch": 0.04357092114610466, "grad_norm": 0.0750675120954495, "learning_rate": 0.00013068181818181817, "loss": 0.8826, "step": 230 }, { "epoch": 0.04451811508406346, "grad_norm": 0.06833860955837964, "learning_rate": 0.00013352272727272727, "loss": 0.9042, "step": 235 }, { "epoch": 0.04546530902202226, "grad_norm": 0.06776526573633952, "learning_rate": 0.00013636363636363634, "loss": 0.8771, "step": 240 }, { "epoch": 0.046412502959981054, "grad_norm": 0.06177830383391277, "learning_rate": 0.00013920454545454544, "loss": 0.8735, "step": 245 }, { "epoch": 0.047359696897939854, "grad_norm": 0.06679265204109729, "learning_rate": 0.00014204545454545454, "loss": 0.8705, "step": 250 }, { "epoch": 0.04830689083589865, "grad_norm": 0.06611716647725416, "learning_rate": 0.00014488636363636364, "loss": 0.8716, "step": 255 }, { "epoch": 0.04925408477385745, "grad_norm": 0.06159220932412366, "learning_rate": 0.0001477272727272727, "loss": 0.8713, "step": 260 }, { "epoch": 0.05020127871181625, "grad_norm": 0.06823008150239136, "learning_rate": 0.00015056818181818183, "loss": 0.8969, "step": 265 }, { "epoch": 0.05114847264977504, "grad_norm": 0.0703778333367766, "learning_rate": 0.0001534090909090909, "loss": 0.8687, "step": 270 }, { "epoch": 0.05209566658773384, "grad_norm": 0.06489353446808296, "learning_rate": 0.00015625, "loss": 0.8639, "step": 275 }, { "epoch": 0.05304286052569263, "grad_norm": 0.06986694992701606, "learning_rate": 0.00015909090909090907, "loss": 0.8666, "step": 280 }, { "epoch": 0.05399005446365143, "grad_norm": 0.06723519153451205, "learning_rate": 0.00016193181818181817, "loss": 0.8764, "step": 285 }, { "epoch": 0.05493724840161023, "grad_norm": 0.07077673680927957, "learning_rate": 0.00016477272727272724, "loss": 0.8618, "step": 290 }, { "epoch": 0.055884442339569025, "grad_norm": 0.06891807376211805, "learning_rate": 0.00016761363636363634, "loss": 0.8745, "step": 295 }, { "epoch": 0.056831636277527825, "grad_norm": 0.06841751882175513, "learning_rate": 0.00017045454545454547, "loss": 0.8461, "step": 300 }, { "epoch": 0.05777883021548662, "grad_norm": 0.07176128138475886, "learning_rate": 0.00017329545454545454, "loss": 0.8667, "step": 305 }, { "epoch": 0.05872602415344542, "grad_norm": 0.07290498390552146, "learning_rate": 0.00017613636363636364, "loss": 0.8739, "step": 310 }, { "epoch": 0.05967321809140422, "grad_norm": 0.06773178575121884, "learning_rate": 0.0001789772727272727, "loss": 0.8871, "step": 315 }, { "epoch": 0.06062041202936301, "grad_norm": 0.0678345156257685, "learning_rate": 0.0001818181818181818, "loss": 0.8677, "step": 320 }, { "epoch": 0.06156760596732181, "grad_norm": 0.07520267059018662, "learning_rate": 0.00018465909090909088, "loss": 0.8762, "step": 325 }, { "epoch": 0.06251479990528061, "grad_norm": 0.06412335985278948, "learning_rate": 0.00018749999999999998, "loss": 0.8935, "step": 330 }, { "epoch": 0.0634619938432394, "grad_norm": 0.09455831472598197, "learning_rate": 0.00019034090909090908, "loss": 0.8799, "step": 335 }, { "epoch": 0.0644091877811982, "grad_norm": 0.08784917380610333, "learning_rate": 0.00019318181818181815, "loss": 0.8622, "step": 340 }, { "epoch": 0.065356381719157, "grad_norm": 0.0706205634228429, "learning_rate": 0.00019602272727272727, "loss": 0.8574, "step": 345 }, { "epoch": 0.0663035756571158, "grad_norm": 0.0647307909886003, "learning_rate": 0.00019886363636363634, "loss": 0.8542, "step": 350 }, { "epoch": 0.06725076959507459, "grad_norm": 0.06150113901124715, "learning_rate": 0.00020170454545454544, "loss": 0.8407, "step": 355 }, { "epoch": 0.06819796353303338, "grad_norm": 0.06257476247931755, "learning_rate": 0.0002045454545454545, "loss": 0.8564, "step": 360 }, { "epoch": 0.06914515747099219, "grad_norm": 0.06213175384030445, "learning_rate": 0.0002073863636363636, "loss": 0.8481, "step": 365 }, { "epoch": 0.07009235140895098, "grad_norm": 0.07243770022823882, "learning_rate": 0.0002102272727272727, "loss": 0.8742, "step": 370 }, { "epoch": 0.07103954534690977, "grad_norm": 0.06094029025046222, "learning_rate": 0.00021306818181818178, "loss": 0.8686, "step": 375 }, { "epoch": 0.07198673928486858, "grad_norm": 0.07211978873607085, "learning_rate": 0.0002159090909090909, "loss": 0.8404, "step": 380 }, { "epoch": 0.07293393322282737, "grad_norm": 0.0664949706014046, "learning_rate": 0.00021874999999999998, "loss": 0.8767, "step": 385 }, { "epoch": 0.07388112716078617, "grad_norm": 0.07048102073887626, "learning_rate": 0.00022159090909090908, "loss": 0.8662, "step": 390 }, { "epoch": 0.07482832109874497, "grad_norm": 0.06550092590742955, "learning_rate": 0.00022443181818181815, "loss": 0.8546, "step": 395 }, { "epoch": 0.07577551503670377, "grad_norm": 0.07566899187849191, "learning_rate": 0.00022727272727272725, "loss": 0.8551, "step": 400 }, { "epoch": 0.07672270897466256, "grad_norm": 0.06303208036750815, "learning_rate": 0.00023011363636363634, "loss": 0.8699, "step": 405 }, { "epoch": 0.07766990291262135, "grad_norm": 0.06875716346372687, "learning_rate": 0.00023295454545454542, "loss": 0.8627, "step": 410 }, { "epoch": 0.07861709685058016, "grad_norm": 0.08595111194659674, "learning_rate": 0.00023579545454545454, "loss": 0.8834, "step": 415 }, { "epoch": 0.07956429078853895, "grad_norm": 0.061252177097668066, "learning_rate": 0.0002386363636363636, "loss": 0.8589, "step": 420 }, { "epoch": 0.08051148472649775, "grad_norm": 0.06674992779765852, "learning_rate": 0.0002414772727272727, "loss": 0.8807, "step": 425 }, { "epoch": 0.08145867866445655, "grad_norm": 0.06735600365269058, "learning_rate": 0.0002443181818181818, "loss": 0.8632, "step": 430 }, { "epoch": 0.08240587260241534, "grad_norm": 0.06263588681026308, "learning_rate": 0.0002471590909090909, "loss": 0.9034, "step": 435 }, { "epoch": 0.08335306654037414, "grad_norm": 0.064198353107829, "learning_rate": 0.00025, "loss": 0.9008, "step": 440 }, { "epoch": 0.08430026047833294, "grad_norm": 0.06187143454881962, "learning_rate": 0.00025284090909090905, "loss": 0.8421, "step": 445 }, { "epoch": 0.08524745441629174, "grad_norm": 0.05826341458036729, "learning_rate": 0.00025568181818181815, "loss": 0.8654, "step": 450 }, { "epoch": 0.08619464835425053, "grad_norm": 0.06153983398074908, "learning_rate": 0.00025852272727272725, "loss": 0.8345, "step": 455 }, { "epoch": 0.08714184229220932, "grad_norm": 0.057544439891252096, "learning_rate": 0.00026136363636363634, "loss": 0.8474, "step": 460 }, { "epoch": 0.08808903623016813, "grad_norm": 0.05386176310877567, "learning_rate": 0.00026420454545454544, "loss": 0.8449, "step": 465 }, { "epoch": 0.08903623016812692, "grad_norm": 0.05733131738608226, "learning_rate": 0.00026704545454545454, "loss": 0.8557, "step": 470 }, { "epoch": 0.08998342410608572, "grad_norm": 0.05468199774083347, "learning_rate": 0.00026988636363636364, "loss": 0.8738, "step": 475 }, { "epoch": 0.09093061804404452, "grad_norm": 0.0732652071369859, "learning_rate": 0.0002727272727272727, "loss": 0.8764, "step": 480 }, { "epoch": 0.09187781198200332, "grad_norm": 0.06609967510300549, "learning_rate": 0.0002755681818181818, "loss": 0.8695, "step": 485 }, { "epoch": 0.09282500591996211, "grad_norm": 0.06205800779765995, "learning_rate": 0.0002784090909090909, "loss": 0.8616, "step": 490 }, { "epoch": 0.09377219985792092, "grad_norm": 0.05861483996354783, "learning_rate": 0.00028125, "loss": 0.8701, "step": 495 }, { "epoch": 0.09471939379587971, "grad_norm": 0.060736665872329516, "learning_rate": 0.0002840909090909091, "loss": 0.8947, "step": 500 }, { "epoch": 0.0956665877338385, "grad_norm": 0.06142593731376609, "learning_rate": 0.0002869318181818182, "loss": 0.8526, "step": 505 }, { "epoch": 0.0966137816717973, "grad_norm": 0.054392069599614346, "learning_rate": 0.0002897727272727273, "loss": 0.8466, "step": 510 }, { "epoch": 0.0975609756097561, "grad_norm": 0.06371556067004604, "learning_rate": 0.0002926136363636363, "loss": 0.8121, "step": 515 }, { "epoch": 0.0985081695477149, "grad_norm": 0.06255544482298064, "learning_rate": 0.0002954545454545454, "loss": 0.8398, "step": 520 }, { "epoch": 0.09945536348567369, "grad_norm": 0.05906918417826802, "learning_rate": 0.0002982954545454545, "loss": 0.8763, "step": 525 }, { "epoch": 0.1004025574236325, "grad_norm": 0.05701807064591476, "learning_rate": 0.0002999998687698221, "loss": 0.8712, "step": 530 }, { "epoch": 0.10134975136159129, "grad_norm": 0.061038500218446035, "learning_rate": 0.00029999839243295787, "loss": 0.8712, "step": 535 }, { "epoch": 0.10229694529955008, "grad_norm": 0.056276485767051056, "learning_rate": 0.0002999952757377059, "loss": 0.8761, "step": 540 }, { "epoch": 0.10324413923750889, "grad_norm": 0.06244892878431511, "learning_rate": 0.00029999051871814974, "loss": 0.8711, "step": 545 }, { "epoch": 0.10419133317546768, "grad_norm": 0.05743365231512095, "learning_rate": 0.0002999841214263116, "loss": 0.8457, "step": 550 }, { "epoch": 0.10513852711342647, "grad_norm": 0.06440086459939007, "learning_rate": 0.000299976083932151, "loss": 0.8994, "step": 555 }, { "epoch": 0.10608572105138526, "grad_norm": 0.056927632875731916, "learning_rate": 0.0002999664063235649, "loss": 0.841, "step": 560 }, { "epoch": 0.10703291498934407, "grad_norm": 0.0583247436509342, "learning_rate": 0.00029995508870638596, "loss": 0.8765, "step": 565 }, { "epoch": 0.10798010892730286, "grad_norm": 0.04974164281813745, "learning_rate": 0.00029994213120438187, "loss": 0.8429, "step": 570 }, { "epoch": 0.10892730286526166, "grad_norm": 0.06125862188074254, "learning_rate": 0.0002999275339592538, "loss": 0.8935, "step": 575 }, { "epoch": 0.10987449680322046, "grad_norm": 0.059759664506794964, "learning_rate": 0.0002999112971306348, "loss": 0.869, "step": 580 }, { "epoch": 0.11082169074117926, "grad_norm": 0.06328973302665653, "learning_rate": 0.00029989342089608835, "loss": 0.852, "step": 585 }, { "epoch": 0.11176888467913805, "grad_norm": 0.060243737818932684, "learning_rate": 0.00029987390545110605, "loss": 0.857, "step": 590 }, { "epoch": 0.11271607861709686, "grad_norm": 0.055808252373465206, "learning_rate": 0.0002998527510091056, "loss": 0.8774, "step": 595 }, { "epoch": 0.11366327255505565, "grad_norm": 0.06324311731152125, "learning_rate": 0.0002998299578014287, "loss": 0.8726, "step": 600 }, { "epoch": 0.11461046649301444, "grad_norm": 0.05610986410453152, "learning_rate": 0.0002998055260773381, "loss": 0.8589, "step": 605 }, { "epoch": 0.11555766043097324, "grad_norm": 0.04965479001175545, "learning_rate": 0.0002997794561040153, "loss": 0.8383, "step": 610 }, { "epoch": 0.11650485436893204, "grad_norm": 0.05064790996842586, "learning_rate": 0.00029975174816655736, "loss": 0.8524, "step": 615 }, { "epoch": 0.11745204830689084, "grad_norm": 0.052969432132795244, "learning_rate": 0.00029972240256797384, "loss": 0.8848, "step": 620 }, { "epoch": 0.11839924224484963, "grad_norm": 0.04987789018051327, "learning_rate": 0.0002996914196291835, "loss": 0.8579, "step": 625 }, { "epoch": 0.11934643618280844, "grad_norm": 0.05418008688918587, "learning_rate": 0.0002996587996890107, "loss": 0.9321, "step": 630 }, { "epoch": 0.12029363012076723, "grad_norm": 0.057606406769674844, "learning_rate": 0.000299624543104182, "loss": 0.864, "step": 635 }, { "epoch": 0.12124082405872602, "grad_norm": 0.050896712551105165, "learning_rate": 0.0002995886502493219, "loss": 0.8508, "step": 640 }, { "epoch": 0.12218801799668483, "grad_norm": 0.05856562263921288, "learning_rate": 0.00029955112151694885, "loss": 0.8557, "step": 645 }, { "epoch": 0.12313521193464362, "grad_norm": 0.056397198481637226, "learning_rate": 0.00029951195731747114, "loss": 0.8763, "step": 650 }, { "epoch": 0.12408240587260241, "grad_norm": 0.06021365518577683, "learning_rate": 0.00029947115807918217, "loss": 0.8691, "step": 655 }, { "epoch": 0.12502959981056122, "grad_norm": 0.049632655250353507, "learning_rate": 0.0002994287242482558, "loss": 0.8593, "step": 660 }, { "epoch": 0.12597679374852, "grad_norm": 0.04958168029549867, "learning_rate": 0.00029938465628874165, "loss": 0.8591, "step": 665 }, { "epoch": 0.1269239876864788, "grad_norm": 0.05757210103501373, "learning_rate": 0.00029933895468255985, "loss": 0.8402, "step": 670 }, { "epoch": 0.1278711816244376, "grad_norm": 0.06892314154494911, "learning_rate": 0.0002992916199294959, "loss": 0.8689, "step": 675 }, { "epoch": 0.1288183755623964, "grad_norm": 0.049748765290627474, "learning_rate": 0.000299242652547195, "loss": 0.8486, "step": 680 }, { "epoch": 0.1297655695003552, "grad_norm": 0.054839939836728246, "learning_rate": 0.0002991920530711566, "loss": 0.8673, "step": 685 }, { "epoch": 0.130712763438314, "grad_norm": 0.05978086373502768, "learning_rate": 0.00029913982205472857, "loss": 0.8608, "step": 690 }, { "epoch": 0.13165995737627278, "grad_norm": 0.052502322799696084, "learning_rate": 0.0002990859600691008, "loss": 0.8613, "step": 695 }, { "epoch": 0.1326071513142316, "grad_norm": 0.05054364592204903, "learning_rate": 0.0002990304677032994, "loss": 0.8746, "step": 700 }, { "epoch": 0.1335543452521904, "grad_norm": 0.05172804041468556, "learning_rate": 0.00029897334556418004, "loss": 0.8256, "step": 705 }, { "epoch": 0.13450153919014918, "grad_norm": 0.05101864907200138, "learning_rate": 0.0002989145942764212, "loss": 0.8655, "step": 710 }, { "epoch": 0.13544873312810798, "grad_norm": 0.05509456096234295, "learning_rate": 0.0002988542144825176, "loss": 0.8692, "step": 715 }, { "epoch": 0.13639592706606676, "grad_norm": 0.049333296450028125, "learning_rate": 0.000298792206842773, "loss": 0.8572, "step": 720 }, { "epoch": 0.13734312100402557, "grad_norm": 0.059122961216738656, "learning_rate": 0.0002987285720352929, "loss": 0.8735, "step": 725 }, { "epoch": 0.13829031494198438, "grad_norm": 0.050375990420733686, "learning_rate": 0.0002986633107559775, "loss": 0.82, "step": 730 }, { "epoch": 0.13923750887994316, "grad_norm": 0.0496346403585563, "learning_rate": 0.0002985964237185136, "loss": 0.8467, "step": 735 }, { "epoch": 0.14018470281790196, "grad_norm": 0.053630745330135815, "learning_rate": 0.00029852791165436716, "loss": 0.8858, "step": 740 }, { "epoch": 0.14113189675586077, "grad_norm": 0.055544816267542034, "learning_rate": 0.0002984577753127752, "loss": 0.8707, "step": 745 }, { "epoch": 0.14207909069381955, "grad_norm": 0.04983399501220757, "learning_rate": 0.00029838601546073744, "loss": 0.846, "step": 750 }, { "epoch": 0.14302628463177836, "grad_norm": 0.05531741725439223, "learning_rate": 0.00029831263288300817, "loss": 0.8716, "step": 755 }, { "epoch": 0.14397347856973716, "grad_norm": 0.05305914413910715, "learning_rate": 0.00029823762838208744, "loss": 0.8694, "step": 760 }, { "epoch": 0.14492067250769594, "grad_norm": 0.063066365795915, "learning_rate": 0.00029816100277821247, "loss": 0.8575, "step": 765 }, { "epoch": 0.14586786644565475, "grad_norm": 0.052014222449902975, "learning_rate": 0.00029808275690934864, "loss": 0.8553, "step": 770 }, { "epoch": 0.14681506038361355, "grad_norm": 0.05627981583655042, "learning_rate": 0.00029800289163118014, "loss": 0.8491, "step": 775 }, { "epoch": 0.14776225432157233, "grad_norm": 0.048744553655055825, "learning_rate": 0.00029792140781710103, "loss": 0.8597, "step": 780 }, { "epoch": 0.14870944825953114, "grad_norm": 0.07103922836346321, "learning_rate": 0.00029783830635820506, "loss": 0.8685, "step": 785 }, { "epoch": 0.14965664219748995, "grad_norm": 0.05572062793930259, "learning_rate": 0.0002977535881632766, "loss": 0.8144, "step": 790 }, { "epoch": 0.15060383613544873, "grad_norm": 0.047330412499616226, "learning_rate": 0.00029766725415878017, "loss": 0.8353, "step": 795 }, { "epoch": 0.15155103007340753, "grad_norm": 0.05324033535511291, "learning_rate": 0.00029757930528885064, "loss": 0.8411, "step": 800 }, { "epoch": 0.15249822401136634, "grad_norm": 0.05513304510602078, "learning_rate": 0.0002974897425152828, "loss": 0.8809, "step": 805 }, { "epoch": 0.15344541794932512, "grad_norm": 0.052990990094527124, "learning_rate": 0.0002973985668175207, "loss": 0.8608, "step": 810 }, { "epoch": 0.15439261188728393, "grad_norm": 0.05324896337561592, "learning_rate": 0.0002973057791926473, "loss": 0.8458, "step": 815 }, { "epoch": 0.1553398058252427, "grad_norm": 0.05276687776977392, "learning_rate": 0.000297211380655373, "loss": 0.8697, "step": 820 }, { "epoch": 0.1562869997632015, "grad_norm": 0.052354949797073405, "learning_rate": 0.0002971153722380253, "loss": 0.8507, "step": 825 }, { "epoch": 0.15723419370116032, "grad_norm": 0.049368244945149506, "learning_rate": 0.0002970177549905368, "loss": 0.8403, "step": 830 }, { "epoch": 0.1581813876391191, "grad_norm": 0.046532042464774784, "learning_rate": 0.00029691852998043396, "loss": 0.8552, "step": 835 }, { "epoch": 0.1591285815770779, "grad_norm": 0.04876609183561892, "learning_rate": 0.00029681769829282574, "loss": 0.8479, "step": 840 }, { "epoch": 0.1600757755150367, "grad_norm": 0.059730813463699885, "learning_rate": 0.0002967152610303913, "loss": 0.8545, "step": 845 }, { "epoch": 0.1610229694529955, "grad_norm": 0.055750160324234604, "learning_rate": 0.00029661121931336804, "loss": 0.8504, "step": 850 }, { "epoch": 0.1619701633909543, "grad_norm": 0.0528593038524647, "learning_rate": 0.0002965055742795395, "loss": 0.8814, "step": 855 }, { "epoch": 0.1629173573289131, "grad_norm": 0.05558247953451502, "learning_rate": 0.000296398327084223, "loss": 0.85, "step": 860 }, { "epoch": 0.16386455126687188, "grad_norm": 0.06143605437889347, "learning_rate": 0.00029628947890025656, "loss": 0.8561, "step": 865 }, { "epoch": 0.1648117452048307, "grad_norm": 0.05186425147430888, "learning_rate": 0.0002961790309179866, "loss": 0.8393, "step": 870 }, { "epoch": 0.1657589391427895, "grad_norm": 0.047149331325217335, "learning_rate": 0.00029606698434525434, "loss": 0.8668, "step": 875 }, { "epoch": 0.16670613308074828, "grad_norm": 0.048689384932807, "learning_rate": 0.00029595334040738333, "loss": 0.8374, "step": 880 }, { "epoch": 0.16765332701870708, "grad_norm": 0.053510975406836386, "learning_rate": 0.00029583810034716545, "loss": 0.8491, "step": 885 }, { "epoch": 0.1686005209566659, "grad_norm": 0.05595964353451741, "learning_rate": 0.00029572126542484745, "loss": 0.8727, "step": 890 }, { "epoch": 0.16954771489462467, "grad_norm": 0.055885278431375376, "learning_rate": 0.0002956028369181174, "loss": 0.882, "step": 895 }, { "epoch": 0.17049490883258347, "grad_norm": 0.047842403175001005, "learning_rate": 0.00029548281612209044, "loss": 0.8682, "step": 900 }, { "epoch": 0.17144210277054228, "grad_norm": 0.058823537208354766, "learning_rate": 0.00029536120434929476, "loss": 0.8373, "step": 905 }, { "epoch": 0.17238929670850106, "grad_norm": 0.05444610376517603, "learning_rate": 0.00029523800292965724, "loss": 0.8783, "step": 910 }, { "epoch": 0.17333649064645987, "grad_norm": 0.054957105759307595, "learning_rate": 0.00029511321321048893, "loss": 0.843, "step": 915 }, { "epoch": 0.17428368458441865, "grad_norm": 0.06583345091917218, "learning_rate": 0.0002949868365564701, "loss": 0.8504, "step": 920 }, { "epoch": 0.17523087852237745, "grad_norm": 0.04777073198426105, "learning_rate": 0.00029485887434963566, "loss": 0.8298, "step": 925 }, { "epoch": 0.17617807246033626, "grad_norm": 0.05562673540582162, "learning_rate": 0.00029472932798935977, "loss": 0.8418, "step": 930 }, { "epoch": 0.17712526639829504, "grad_norm": 0.04785779459273509, "learning_rate": 0.0002945981988923406, "loss": 0.8328, "step": 935 }, { "epoch": 0.17807246033625385, "grad_norm": 0.05554074332095169, "learning_rate": 0.00029446548849258513, "loss": 0.8279, "step": 940 }, { "epoch": 0.17901965427421265, "grad_norm": 0.046624447216736774, "learning_rate": 0.00029433119824139286, "loss": 0.8494, "step": 945 }, { "epoch": 0.17996684821217143, "grad_norm": 0.051194260228541774, "learning_rate": 0.0002941953296073405, "loss": 0.8594, "step": 950 }, { "epoch": 0.18091404215013024, "grad_norm": 0.05090727374729561, "learning_rate": 0.0002940578840762658, "loss": 0.8422, "step": 955 }, { "epoch": 0.18186123608808905, "grad_norm": 0.04639853886400584, "learning_rate": 0.00029391886315125083, "loss": 0.8344, "step": 960 }, { "epoch": 0.18280843002604782, "grad_norm": 0.056061389000908554, "learning_rate": 0.0002937782683526063, "loss": 0.8131, "step": 965 }, { "epoch": 0.18375562396400663, "grad_norm": 0.04702100386110801, "learning_rate": 0.00029363610121785447, "loss": 0.8141, "step": 970 }, { "epoch": 0.18470281790196544, "grad_norm": 0.04844408584935392, "learning_rate": 0.00029349236330171224, "loss": 0.8149, "step": 975 }, { "epoch": 0.18565001183992422, "grad_norm": 0.048586646278994214, "learning_rate": 0.0002933470561760744, "loss": 0.8723, "step": 980 }, { "epoch": 0.18659720577788302, "grad_norm": 0.0510188898302652, "learning_rate": 0.00029320018142999643, "loss": 0.8319, "step": 985 }, { "epoch": 0.18754439971584183, "grad_norm": 0.04664479134380501, "learning_rate": 0.0002930517406696771, "loss": 0.8425, "step": 990 }, { "epoch": 0.1884915936538006, "grad_norm": 0.04274961179387165, "learning_rate": 0.0002929017355184407, "loss": 0.8252, "step": 995 }, { "epoch": 0.18943878759175942, "grad_norm": 0.05336071927501094, "learning_rate": 0.00029275016761671954, "loss": 0.8343, "step": 1000 }, { "epoch": 0.19038598152971822, "grad_norm": 0.0732142750483461, "learning_rate": 0.00029259703862203587, "loss": 0.8305, "step": 1005 }, { "epoch": 0.191333175467677, "grad_norm": 0.059075683989297266, "learning_rate": 0.00029244235020898395, "loss": 0.8487, "step": 1010 }, { "epoch": 0.1922803694056358, "grad_norm": 0.04650016716241106, "learning_rate": 0.0002922861040692115, "loss": 0.8583, "step": 1015 }, { "epoch": 0.1932275633435946, "grad_norm": 0.04767602957865441, "learning_rate": 0.0002921283019114011, "loss": 0.8496, "step": 1020 }, { "epoch": 0.1941747572815534, "grad_norm": 0.05644985578762857, "learning_rate": 0.00029196894546125197, "loss": 0.8429, "step": 1025 }, { "epoch": 0.1951219512195122, "grad_norm": 0.049699082298498856, "learning_rate": 0.0002918080364614607, "loss": 0.8121, "step": 1030 }, { "epoch": 0.19606914515747098, "grad_norm": 0.05678361846269368, "learning_rate": 0.0002916455766717024, "loss": 0.831, "step": 1035 }, { "epoch": 0.1970163390954298, "grad_norm": 0.05634582924907757, "learning_rate": 0.00029148156786861125, "loss": 0.8411, "step": 1040 }, { "epoch": 0.1979635330333886, "grad_norm": 0.044578510424992744, "learning_rate": 0.0002913160118457612, "loss": 0.8163, "step": 1045 }, { "epoch": 0.19891072697134737, "grad_norm": 0.049243336108268164, "learning_rate": 0.00029114891041364646, "loss": 0.8651, "step": 1050 }, { "epoch": 0.19985792090930618, "grad_norm": 0.05392438131527857, "learning_rate": 0.00029098026539966143, "loss": 0.8304, "step": 1055 }, { "epoch": 0.200805114847265, "grad_norm": 0.05004934480263781, "learning_rate": 0.0002908100786480811, "loss": 0.8686, "step": 1060 }, { "epoch": 0.20175230878522377, "grad_norm": 0.04733082301433883, "learning_rate": 0.00029063835202004036, "loss": 0.8346, "step": 1065 }, { "epoch": 0.20269950272318257, "grad_norm": 0.05783680843141235, "learning_rate": 0.0002904650873935143, "loss": 0.8312, "step": 1070 }, { "epoch": 0.20364669666114138, "grad_norm": 0.04565338494016318, "learning_rate": 0.0002902902866632969, "loss": 0.8595, "step": 1075 }, { "epoch": 0.20459389059910016, "grad_norm": 0.050858909948530574, "learning_rate": 0.0002901139517409811, "loss": 0.8642, "step": 1080 }, { "epoch": 0.20554108453705897, "grad_norm": 0.05066970549535419, "learning_rate": 0.0002899360845549373, "loss": 0.8342, "step": 1085 }, { "epoch": 0.20648827847501777, "grad_norm": 0.04679512262177614, "learning_rate": 0.0002897566870502925, "loss": 0.8306, "step": 1090 }, { "epoch": 0.20743547241297655, "grad_norm": 0.05217676490462024, "learning_rate": 0.00028957576118890914, "loss": 0.8225, "step": 1095 }, { "epoch": 0.20838266635093536, "grad_norm": 0.05437239079138474, "learning_rate": 0.0002893933089493635, "loss": 0.8553, "step": 1100 }, { "epoch": 0.20932986028889414, "grad_norm": 0.0509435046538677, "learning_rate": 0.00028920933232692386, "loss": 0.8086, "step": 1105 }, { "epoch": 0.21027705422685294, "grad_norm": 0.044222391512642414, "learning_rate": 0.00028902383333352926, "loss": 0.8412, "step": 1110 }, { "epoch": 0.21122424816481175, "grad_norm": 0.04967664422872056, "learning_rate": 0.0002888368139977669, "loss": 0.8506, "step": 1115 }, { "epoch": 0.21217144210277053, "grad_norm": 0.0465634897763295, "learning_rate": 0.0002886482763648503, "loss": 0.8217, "step": 1120 }, { "epoch": 0.21311863604072934, "grad_norm": 0.05052365351929177, "learning_rate": 0.0002884582224965968, "loss": 0.8332, "step": 1125 }, { "epoch": 0.21406582997868814, "grad_norm": 0.05180493973251838, "learning_rate": 0.000288266654471405, "loss": 0.8347, "step": 1130 }, { "epoch": 0.21501302391664692, "grad_norm": 0.0471780634040616, "learning_rate": 0.0002880735743842322, "loss": 0.8366, "step": 1135 }, { "epoch": 0.21596021785460573, "grad_norm": 0.049108577250984026, "learning_rate": 0.0002878789843465713, "loss": 0.8362, "step": 1140 }, { "epoch": 0.21690741179256454, "grad_norm": 0.05564424373715666, "learning_rate": 0.0002876828864864277, "loss": 0.8514, "step": 1145 }, { "epoch": 0.21785460573052332, "grad_norm": 0.0564159881319112, "learning_rate": 0.0002874852829482963, "loss": 0.8723, "step": 1150 }, { "epoch": 0.21880179966848212, "grad_norm": 0.051069627553698455, "learning_rate": 0.0002872861758931376, "loss": 0.851, "step": 1155 }, { "epoch": 0.21974899360644093, "grad_norm": 0.05117455703332741, "learning_rate": 0.00028708556749835454, "loss": 0.8434, "step": 1160 }, { "epoch": 0.2206961875443997, "grad_norm": 0.04934525970498773, "learning_rate": 0.0002868834599577684, "loss": 0.841, "step": 1165 }, { "epoch": 0.22164338148235851, "grad_norm": 0.05894722590210803, "learning_rate": 0.0002866798554815948, "loss": 0.8458, "step": 1170 }, { "epoch": 0.22259057542031732, "grad_norm": 0.05630861952888565, "learning_rate": 0.0002864747562964197, "loss": 0.8343, "step": 1175 }, { "epoch": 0.2235377693582761, "grad_norm": 0.045749561983576756, "learning_rate": 0.000286268164645175, "loss": 0.8421, "step": 1180 }, { "epoch": 0.2244849632962349, "grad_norm": 0.04499091282837436, "learning_rate": 0.00028606008278711373, "loss": 0.8397, "step": 1185 }, { "epoch": 0.22543215723419371, "grad_norm": 0.042402412945632365, "learning_rate": 0.00028585051299778594, "loss": 0.8061, "step": 1190 }, { "epoch": 0.2263793511721525, "grad_norm": 0.046372304084499535, "learning_rate": 0.00028563945756901314, "loss": 0.8514, "step": 1195 }, { "epoch": 0.2273265451101113, "grad_norm": 0.050066239277336104, "learning_rate": 0.00028542691880886376, "loss": 0.8473, "step": 1200 }, { "epoch": 0.22827373904807008, "grad_norm": 0.04750268427944095, "learning_rate": 0.0002852128990416275, "loss": 0.8155, "step": 1205 }, { "epoch": 0.22922093298602889, "grad_norm": 0.04448032581142824, "learning_rate": 0.0002849974006077904, "loss": 0.8462, "step": 1210 }, { "epoch": 0.2301681269239877, "grad_norm": 0.045988358494773375, "learning_rate": 0.00028478042586400876, "loss": 0.8139, "step": 1215 }, { "epoch": 0.23111532086194647, "grad_norm": 0.05150874461710223, "learning_rate": 0.00028456197718308365, "loss": 0.8511, "step": 1220 }, { "epoch": 0.23206251479990528, "grad_norm": 0.04407477102954397, "learning_rate": 0.00028434205695393477, "loss": 0.8374, "step": 1225 }, { "epoch": 0.23300970873786409, "grad_norm": 0.04739621311473698, "learning_rate": 0.0002841206675815745, "loss": 0.8126, "step": 1230 }, { "epoch": 0.23395690267582286, "grad_norm": 0.04187717815582618, "learning_rate": 0.0002838978114870816, "loss": 0.8274, "step": 1235 }, { "epoch": 0.23490409661378167, "grad_norm": 0.04177075006556251, "learning_rate": 0.0002836734911075746, "loss": 0.8168, "step": 1240 }, { "epoch": 0.23585129055174048, "grad_norm": 0.045942488951301354, "learning_rate": 0.0002834477088961853, "loss": 0.8054, "step": 1245 }, { "epoch": 0.23679848448969926, "grad_norm": 0.04359836043504769, "learning_rate": 0.00028322046732203165, "loss": 0.8538, "step": 1250 }, { "epoch": 0.23774567842765806, "grad_norm": 0.04439950975585717, "learning_rate": 0.0002829917688701912, "loss": 0.8352, "step": 1255 }, { "epoch": 0.23869287236561687, "grad_norm": 0.06555071271570542, "learning_rate": 0.00028276161604167354, "loss": 0.8395, "step": 1260 }, { "epoch": 0.23964006630357565, "grad_norm": 0.06429653649306202, "learning_rate": 0.0002825300113533932, "loss": 0.8639, "step": 1265 }, { "epoch": 0.24058726024153446, "grad_norm": 0.050999285390159056, "learning_rate": 0.0002822969573381418, "loss": 0.8265, "step": 1270 }, { "epoch": 0.24153445417949326, "grad_norm": 0.06883273146747126, "learning_rate": 0.0002820624565445608, "loss": 0.8505, "step": 1275 }, { "epoch": 0.24248164811745204, "grad_norm": 0.08792312386360097, "learning_rate": 0.00028182651153711334, "loss": 0.8393, "step": 1280 }, { "epoch": 0.24342884205541085, "grad_norm": 0.05834297565610207, "learning_rate": 0.0002815891248960562, "loss": 0.8198, "step": 1285 }, { "epoch": 0.24437603599336966, "grad_norm": 0.05226579523827474, "learning_rate": 0.0002813502992174116, "loss": 0.8127, "step": 1290 }, { "epoch": 0.24532322993132843, "grad_norm": 0.040640751080577776, "learning_rate": 0.00028111003711293897, "loss": 0.8068, "step": 1295 }, { "epoch": 0.24627042386928724, "grad_norm": 0.04679354987985417, "learning_rate": 0.00028086834121010616, "loss": 0.8368, "step": 1300 }, { "epoch": 0.24721761780724602, "grad_norm": 0.04499421671023844, "learning_rate": 0.0002806252141520608, "loss": 0.8492, "step": 1305 }, { "epoch": 0.24816481174520483, "grad_norm": 0.5315442282070766, "learning_rate": 0.00028038065859760147, "loss": 0.8775, "step": 1310 }, { "epoch": 0.24911200568316363, "grad_norm": 49.60479029682665, "learning_rate": 0.0002801346772211486, "loss": 14.7667, "step": 1315 }, { "epoch": 0.25005919962112244, "grad_norm": 74.14171657249427, "learning_rate": 0.000279887272712715, "loss": 7.0642, "step": 1320 }, { "epoch": 0.2510063935590812, "grad_norm": 0.9679325181066564, "learning_rate": 0.00027963844777787687, "loss": 2.8211, "step": 1325 }, { "epoch": 0.25195358749704, "grad_norm": 1.8750749955853339, "learning_rate": 0.0002793882051377437, "loss": 2.5509, "step": 1330 }, { "epoch": 0.25290078143499883, "grad_norm": 1.256663906662595, "learning_rate": 0.00027913654752892897, "loss": 1.6113, "step": 1335 }, { "epoch": 0.2538479753729576, "grad_norm": 0.4517499508288906, "learning_rate": 0.00027888347770352, "loss": 1.3621, "step": 1340 }, { "epoch": 0.2547951693109164, "grad_norm": 0.16403760364117914, "learning_rate": 0.00027862899842904783, "loss": 1.1522, "step": 1345 }, { "epoch": 0.2557423632488752, "grad_norm": 0.15782986879922667, "learning_rate": 0.00027837311248845697, "loss": 1.0121, "step": 1350 }, { "epoch": 0.256689557186834, "grad_norm": 0.07939626217767812, "learning_rate": 0.00027811582268007516, "loss": 0.9976, "step": 1355 }, { "epoch": 0.2576367511247928, "grad_norm": 0.08824920573476226, "learning_rate": 0.0002778571318175825, "loss": 0.937, "step": 1360 }, { "epoch": 0.2585839450627516, "grad_norm": 0.05529848573271428, "learning_rate": 0.0002775970427299808, "loss": 0.9259, "step": 1365 }, { "epoch": 0.2595311390007104, "grad_norm": 0.05160154895612758, "learning_rate": 0.00027733555826156266, "loss": 0.932, "step": 1370 }, { "epoch": 0.2604783329386692, "grad_norm": 0.6967363411222061, "learning_rate": 0.00027707268127188033, "loss": 0.941, "step": 1375 }, { "epoch": 0.261425526876628, "grad_norm": 0.05931503868977922, "learning_rate": 0.00027680841463571446, "loss": 0.8775, "step": 1380 }, { "epoch": 0.2623727208145868, "grad_norm": 0.07337997434432302, "learning_rate": 0.0002765427612430426, "loss": 0.887, "step": 1385 }, { "epoch": 0.26331991475254557, "grad_norm": 0.053104858087786935, "learning_rate": 0.00027627572399900775, "loss": 0.8484, "step": 1390 }, { "epoch": 0.2642671086905044, "grad_norm": 0.06163074713437679, "learning_rate": 0.00027600730582388644, "loss": 0.8812, "step": 1395 }, { "epoch": 0.2652143026284632, "grad_norm": 0.05016009889342552, "learning_rate": 0.00027573750965305676, "loss": 0.8678, "step": 1400 }, { "epoch": 0.26616149656642196, "grad_norm": 0.053793608113706266, "learning_rate": 0.0002754663384369664, "loss": 0.8421, "step": 1405 }, { "epoch": 0.2671086905043808, "grad_norm": 0.054043702396785645, "learning_rate": 0.0002751937951411005, "loss": 0.8374, "step": 1410 }, { "epoch": 0.2680558844423396, "grad_norm": 0.05993998463791112, "learning_rate": 0.00027491988274594865, "loss": 0.8521, "step": 1415 }, { "epoch": 0.26900307838029835, "grad_norm": 0.05214973518061344, "learning_rate": 0.00027464460424697304, "loss": 0.8563, "step": 1420 }, { "epoch": 0.2699502723182572, "grad_norm": 0.04937709748344037, "learning_rate": 0.0002743679626545753, "loss": 0.8611, "step": 1425 }, { "epoch": 0.27089746625621597, "grad_norm": 0.04690788256444743, "learning_rate": 0.0002740899609940634, "loss": 0.8737, "step": 1430 }, { "epoch": 0.27184466019417475, "grad_norm": 0.04458440143618464, "learning_rate": 0.00027381060230561904, "loss": 0.8393, "step": 1435 }, { "epoch": 0.2727918541321335, "grad_norm": 0.044640210601826116, "learning_rate": 0.0002735298896442641, "loss": 0.8569, "step": 1440 }, { "epoch": 0.27373904807009236, "grad_norm": 0.048448777510886915, "learning_rate": 0.00027324782607982727, "loss": 0.8348, "step": 1445 }, { "epoch": 0.27468624200805114, "grad_norm": 0.0500357269022256, "learning_rate": 0.0002729644146969104, "loss": 0.8676, "step": 1450 }, { "epoch": 0.2756334359460099, "grad_norm": 0.053604059700079344, "learning_rate": 0.0002726796585948551, "loss": 0.8495, "step": 1455 }, { "epoch": 0.27658062988396875, "grad_norm": 0.061024238560798645, "learning_rate": 0.00027239356088770846, "loss": 0.84, "step": 1460 }, { "epoch": 0.27752782382192753, "grad_norm": 0.05542695537862334, "learning_rate": 0.0002721061247041891, "loss": 0.8445, "step": 1465 }, { "epoch": 0.2784750177598863, "grad_norm": 0.05174226319950172, "learning_rate": 0.00027181735318765305, "loss": 0.8239, "step": 1470 }, { "epoch": 0.27942221169784515, "grad_norm": 0.0524720390301642, "learning_rate": 0.0002715272494960594, "loss": 0.8717, "step": 1475 }, { "epoch": 0.2803694056358039, "grad_norm": 0.05078251806460792, "learning_rate": 0.00027123581680193575, "loss": 0.8776, "step": 1480 }, { "epoch": 0.2813165995737627, "grad_norm": 0.0508208511214285, "learning_rate": 0.0002709430582923432, "loss": 0.8337, "step": 1485 }, { "epoch": 0.28226379351172154, "grad_norm": 0.05428792173232585, "learning_rate": 0.00027064897716884195, "loss": 0.8331, "step": 1490 }, { "epoch": 0.2832109874496803, "grad_norm": 0.04339480572676772, "learning_rate": 0.0002703535766474561, "loss": 0.8474, "step": 1495 }, { "epoch": 0.2841581813876391, "grad_norm": 0.04801286233761752, "learning_rate": 0.00027005685995863833, "loss": 0.8538, "step": 1500 }, { "epoch": 0.28510537532559793, "grad_norm": 0.05126840483010234, "learning_rate": 0.00026975883034723486, "loss": 0.8508, "step": 1505 }, { "epoch": 0.2860525692635567, "grad_norm": 0.05632538740067954, "learning_rate": 0.00026945949107244984, "loss": 0.8239, "step": 1510 }, { "epoch": 0.2869997632015155, "grad_norm": 0.05321785003972056, "learning_rate": 0.0002691588454078095, "loss": 0.809, "step": 1515 }, { "epoch": 0.2879469571394743, "grad_norm": 0.05267212261846467, "learning_rate": 0.00026885689664112673, "loss": 0.8235, "step": 1520 }, { "epoch": 0.2888941510774331, "grad_norm": 0.06239400176564442, "learning_rate": 0.0002685536480744648, "loss": 0.8336, "step": 1525 }, { "epoch": 0.2898413450153919, "grad_norm": 0.05392065914950409, "learning_rate": 0.0002682491030241016, "loss": 0.8227, "step": 1530 }, { "epoch": 0.2907885389533507, "grad_norm": 0.042067002075187244, "learning_rate": 0.0002679432648204928, "loss": 0.8336, "step": 1535 }, { "epoch": 0.2917357328913095, "grad_norm": 0.04983258400363465, "learning_rate": 0.0002676361368082362, "loss": 0.7947, "step": 1540 }, { "epoch": 0.2926829268292683, "grad_norm": 0.04869904064999139, "learning_rate": 0.00026732772234603437, "loss": 0.8127, "step": 1545 }, { "epoch": 0.2936301207672271, "grad_norm": 0.05851898617300107, "learning_rate": 0.00026701802480665857, "loss": 0.8313, "step": 1550 }, { "epoch": 0.2945773147051859, "grad_norm": 0.0552687029482635, "learning_rate": 0.0002667070475769114, "loss": 0.8049, "step": 1555 }, { "epoch": 0.29552450864314467, "grad_norm": 0.06477348854060364, "learning_rate": 0.00026639479405759006, "loss": 0.83, "step": 1560 }, { "epoch": 0.2964717025811035, "grad_norm": 0.04555333394215088, "learning_rate": 0.000266081267663449, "loss": 0.84, "step": 1565 }, { "epoch": 0.2974188965190623, "grad_norm": 0.04197167864122965, "learning_rate": 0.00026576647182316264, "loss": 0.8192, "step": 1570 }, { "epoch": 0.29836609045702106, "grad_norm": 0.15360351138708875, "learning_rate": 0.00026545040997928785, "loss": 0.8756, "step": 1575 }, { "epoch": 0.2993132843949799, "grad_norm": 0.07002447455001899, "learning_rate": 0.00026513308558822636, "loss": 0.8182, "step": 1580 }, { "epoch": 0.3002604783329387, "grad_norm": 0.05720839543286207, "learning_rate": 0.0002648145021201868, "loss": 0.8334, "step": 1585 }, { "epoch": 0.30120767227089745, "grad_norm": 0.05201720217949488, "learning_rate": 0.0002644946630591469, "loss": 0.8494, "step": 1590 }, { "epoch": 0.3021548662088563, "grad_norm": 0.04850167766978546, "learning_rate": 0.0002641735719028155, "loss": 0.8285, "step": 1595 }, { "epoch": 0.30310206014681507, "grad_norm": 0.044133764645286594, "learning_rate": 0.000263851232162594, "loss": 0.8225, "step": 1600 }, { "epoch": 0.30404925408477385, "grad_norm": 0.0488939501306738, "learning_rate": 0.00026352764736353815, "loss": 0.8395, "step": 1605 }, { "epoch": 0.3049964480227327, "grad_norm": 0.044482887206502425, "learning_rate": 0.0002632028210443194, "loss": 0.8199, "step": 1610 }, { "epoch": 0.30594364196069146, "grad_norm": 0.054188307478421044, "learning_rate": 0.00026287675675718653, "loss": 0.833, "step": 1615 }, { "epoch": 0.30689083589865024, "grad_norm": 0.048205088442678685, "learning_rate": 0.00026254945806792614, "loss": 0.8287, "step": 1620 }, { "epoch": 0.3078380298366091, "grad_norm": 0.0457249850341829, "learning_rate": 0.0002622209285558244, "loss": 0.8104, "step": 1625 }, { "epoch": 0.30878522377456785, "grad_norm": 0.04764496488482527, "learning_rate": 0.00026189117181362733, "loss": 0.807, "step": 1630 }, { "epoch": 0.30973241771252663, "grad_norm": 0.04577871219106504, "learning_rate": 0.0002615601914475018, "loss": 0.8387, "step": 1635 }, { "epoch": 0.3106796116504854, "grad_norm": 0.0560487648361042, "learning_rate": 0.0002612279910769962, "loss": 0.8209, "step": 1640 }, { "epoch": 0.31162680558844424, "grad_norm": 0.051106440022587435, "learning_rate": 0.0002608945743350004, "loss": 0.8066, "step": 1645 }, { "epoch": 0.312573999526403, "grad_norm": 0.060741538352692886, "learning_rate": 0.0002605599448677066, "loss": 0.8258, "step": 1650 }, { "epoch": 0.3135211934643618, "grad_norm": 0.0600167744118878, "learning_rate": 0.000260224106334569, "loss": 0.8174, "step": 1655 }, { "epoch": 0.31446838740232064, "grad_norm": 0.05182728019691824, "learning_rate": 0.000259887062408264, "loss": 0.8379, "step": 1660 }, { "epoch": 0.3154155813402794, "grad_norm": 0.048406806604266626, "learning_rate": 0.00025954881677464994, "loss": 0.8239, "step": 1665 }, { "epoch": 0.3163627752782382, "grad_norm": 0.04614485855762265, "learning_rate": 0.0002592093731327269, "loss": 0.8328, "step": 1670 }, { "epoch": 0.31730996921619703, "grad_norm": 0.04097208134051075, "learning_rate": 0.0002588687351945962, "loss": 0.8054, "step": 1675 }, { "epoch": 0.3182571631541558, "grad_norm": 0.04859899532989667, "learning_rate": 0.0002585269066854197, "loss": 0.828, "step": 1680 }, { "epoch": 0.3192043570921146, "grad_norm": 0.0514636005763012, "learning_rate": 0.00025818389134337925, "loss": 0.805, "step": 1685 }, { "epoch": 0.3201515510300734, "grad_norm": 0.04510538821375225, "learning_rate": 0.0002578396929196356, "loss": 0.8296, "step": 1690 }, { "epoch": 0.3210987449680322, "grad_norm": 0.04625807399119475, "learning_rate": 0.00025749431517828775, "loss": 0.8085, "step": 1695 }, { "epoch": 0.322045938905991, "grad_norm": 0.04353062203420096, "learning_rate": 0.0002571477618963311, "loss": 0.8169, "step": 1700 }, { "epoch": 0.3229931328439498, "grad_norm": 0.045940703703086845, "learning_rate": 0.00025680003686361704, "loss": 0.8337, "step": 1705 }, { "epoch": 0.3239403267819086, "grad_norm": 0.047707507442658635, "learning_rate": 0.00025645114388281066, "loss": 0.8097, "step": 1710 }, { "epoch": 0.3248875207198674, "grad_norm": 0.04488951827418563, "learning_rate": 0.00025610108676934974, "loss": 0.8296, "step": 1715 }, { "epoch": 0.3258347146578262, "grad_norm": 0.05157676249590623, "learning_rate": 0.00025574986935140287, "loss": 0.832, "step": 1720 }, { "epoch": 0.326781908595785, "grad_norm": 0.045481015370253376, "learning_rate": 0.00025539749546982736, "loss": 0.812, "step": 1725 }, { "epoch": 0.32772910253374377, "grad_norm": 0.04671492664808977, "learning_rate": 0.0002550439689781276, "loss": 0.783, "step": 1730 }, { "epoch": 0.3286762964717026, "grad_norm": 0.056706366227313135, "learning_rate": 0.00025468929374241256, "loss": 0.829, "step": 1735 }, { "epoch": 0.3296234904096614, "grad_norm": 0.04717465337329956, "learning_rate": 0.0002543334736413539, "loss": 0.8482, "step": 1740 }, { "epoch": 0.33057068434762016, "grad_norm": 0.048757000553929425, "learning_rate": 0.0002539765125661432, "loss": 0.807, "step": 1745 }, { "epoch": 0.331517878285579, "grad_norm": 0.04444091877456076, "learning_rate": 0.00025361841442044956, "loss": 0.8321, "step": 1750 }, { "epoch": 0.33246507222353777, "grad_norm": 0.04389186380472923, "learning_rate": 0.00025325918312037697, "loss": 0.806, "step": 1755 }, { "epoch": 0.33341226616149655, "grad_norm": 0.0451220202476477, "learning_rate": 0.0002528988225944214, "loss": 0.8239, "step": 1760 }, { "epoch": 0.3343594600994554, "grad_norm": 0.04940473146055937, "learning_rate": 0.00025253733678342775, "loss": 0.7978, "step": 1765 }, { "epoch": 0.33530665403741416, "grad_norm": 0.04360662632042812, "learning_rate": 0.000252174729640547, "loss": 0.7936, "step": 1770 }, { "epoch": 0.33625384797537294, "grad_norm": 0.04508266007873255, "learning_rate": 0.0002518110051311927, "loss": 0.8354, "step": 1775 }, { "epoch": 0.3372010419133318, "grad_norm": 0.0447597919709376, "learning_rate": 0.00025144616723299785, "loss": 0.8128, "step": 1780 }, { "epoch": 0.33814823585129056, "grad_norm": 0.04068239709765713, "learning_rate": 0.0002510802199357713, "loss": 0.8173, "step": 1785 }, { "epoch": 0.33909542978924934, "grad_norm": 0.04478708527154376, "learning_rate": 0.000250713167241454, "loss": 0.8192, "step": 1790 }, { "epoch": 0.34004262372720817, "grad_norm": 0.04547217242470498, "learning_rate": 0.00025034501316407537, "loss": 0.8418, "step": 1795 }, { "epoch": 0.34098981766516695, "grad_norm": 0.04674703701128052, "learning_rate": 0.0002499757617297095, "loss": 0.7595, "step": 1800 }, { "epoch": 0.34193701160312573, "grad_norm": 0.0456363229285353, "learning_rate": 0.00024960541697643094, "loss": 0.8125, "step": 1805 }, { "epoch": 0.34288420554108456, "grad_norm": 0.04312589291081109, "learning_rate": 0.00024923398295427046, "loss": 0.7931, "step": 1810 }, { "epoch": 0.34383139947904334, "grad_norm": 0.04696710814032231, "learning_rate": 0.00024886146372517107, "loss": 0.8062, "step": 1815 }, { "epoch": 0.3447785934170021, "grad_norm": 0.043291328088353766, "learning_rate": 0.00024848786336294346, "loss": 0.7962, "step": 1820 }, { "epoch": 0.3457257873549609, "grad_norm": 0.04284288738527321, "learning_rate": 0.0002481131859532212, "loss": 0.8031, "step": 1825 }, { "epoch": 0.34667298129291974, "grad_norm": 0.04649906122780062, "learning_rate": 0.0002477374355934165, "loss": 0.7931, "step": 1830 }, { "epoch": 0.3476201752308785, "grad_norm": 0.05184640676841213, "learning_rate": 0.0002473606163926751, "loss": 0.833, "step": 1835 }, { "epoch": 0.3485673691688373, "grad_norm": 0.04860441277746714, "learning_rate": 0.00024698273247183137, "loss": 0.8212, "step": 1840 }, { "epoch": 0.34951456310679613, "grad_norm": 0.05114423505303399, "learning_rate": 0.0002466037879633633, "loss": 0.7971, "step": 1845 }, { "epoch": 0.3504617570447549, "grad_norm": 0.047498936911412486, "learning_rate": 0.00024622378701134737, "loss": 0.8274, "step": 1850 }, { "epoch": 0.3514089509827137, "grad_norm": 0.04549364277100318, "learning_rate": 0.00024584273377141306, "loss": 0.7948, "step": 1855 }, { "epoch": 0.3523561449206725, "grad_norm": 0.05177910151205359, "learning_rate": 0.0002454606324106977, "loss": 0.8036, "step": 1860 }, { "epoch": 0.3533033388586313, "grad_norm": 0.04603732482580829, "learning_rate": 0.00024507748710780034, "loss": 0.8062, "step": 1865 }, { "epoch": 0.3542505327965901, "grad_norm": 0.05348425306250115, "learning_rate": 0.00024469330205273676, "loss": 0.7993, "step": 1870 }, { "epoch": 0.3551977267345489, "grad_norm": 0.04956827787982081, "learning_rate": 0.0002443080814468931, "loss": 0.8036, "step": 1875 }, { "epoch": 0.3561449206725077, "grad_norm": 0.05048490898899849, "learning_rate": 0.00024392182950298033, "loss": 0.8339, "step": 1880 }, { "epoch": 0.35709211461046647, "grad_norm": 0.04546839597515788, "learning_rate": 0.0002435345504449877, "loss": 0.8127, "step": 1885 }, { "epoch": 0.3580393085484253, "grad_norm": 0.04742371580337266, "learning_rate": 0.00024314624850813689, "loss": 0.8226, "step": 1890 }, { "epoch": 0.3589865024863841, "grad_norm": 0.04631643296750854, "learning_rate": 0.00024275692793883577, "loss": 0.8133, "step": 1895 }, { "epoch": 0.35993369642434286, "grad_norm": 0.04058836177118087, "learning_rate": 0.00024236659299463171, "loss": 0.7976, "step": 1900 }, { "epoch": 0.3608808903623017, "grad_norm": 0.049761880526735185, "learning_rate": 0.00024197524794416508, "loss": 0.8144, "step": 1905 }, { "epoch": 0.3618280843002605, "grad_norm": 0.0434144853404768, "learning_rate": 0.00024158289706712266, "loss": 0.7961, "step": 1910 }, { "epoch": 0.36277527823821926, "grad_norm": 0.04729858009338802, "learning_rate": 0.0002411895446541908, "loss": 0.8092, "step": 1915 }, { "epoch": 0.3637224721761781, "grad_norm": 0.04391468404782121, "learning_rate": 0.00024079519500700848, "loss": 0.7873, "step": 1920 }, { "epoch": 0.36466966611413687, "grad_norm": 0.046136629260664565, "learning_rate": 0.00024039985243812017, "loss": 0.8358, "step": 1925 }, { "epoch": 0.36561686005209565, "grad_norm": 0.04922463227417696, "learning_rate": 0.000240003521270929, "loss": 0.7982, "step": 1930 }, { "epoch": 0.3665640539900545, "grad_norm": 0.050099522270096786, "learning_rate": 0.00023960620583964905, "loss": 0.8119, "step": 1935 }, { "epoch": 0.36751124792801326, "grad_norm": 0.04673234998587366, "learning_rate": 0.00023920791048925817, "loss": 0.7916, "step": 1940 }, { "epoch": 0.36845844186597204, "grad_norm": 0.044736938963155615, "learning_rate": 0.00023880863957545065, "loss": 0.8092, "step": 1945 }, { "epoch": 0.3694056358039309, "grad_norm": 0.04403494677156711, "learning_rate": 0.00023840839746458906, "loss": 0.8007, "step": 1950 }, { "epoch": 0.37035282974188966, "grad_norm": 0.04207278974539967, "learning_rate": 0.00023800718853365707, "loss": 0.8079, "step": 1955 }, { "epoch": 0.37130002367984843, "grad_norm": 0.04441047413325019, "learning_rate": 0.00023760501717021127, "loss": 0.7981, "step": 1960 }, { "epoch": 0.37224721761780727, "grad_norm": 0.04392750904458252, "learning_rate": 0.00023720188777233328, "loss": 0.8189, "step": 1965 }, { "epoch": 0.37319441155576605, "grad_norm": 0.0409355126519413, "learning_rate": 0.0002367978047485816, "loss": 0.8065, "step": 1970 }, { "epoch": 0.3741416054937248, "grad_norm": 0.044747845633113445, "learning_rate": 0.00023639277251794342, "loss": 0.8152, "step": 1975 }, { "epoch": 0.37508879943168366, "grad_norm": 0.04360255348673726, "learning_rate": 0.0002359867955097863, "loss": 0.797, "step": 1980 }, { "epoch": 0.37603599336964244, "grad_norm": 0.04405719390782307, "learning_rate": 0.00023557987816380985, "loss": 0.8058, "step": 1985 }, { "epoch": 0.3769831873076012, "grad_norm": 0.04839154568146625, "learning_rate": 0.00023517202492999686, "loss": 0.8114, "step": 1990 }, { "epoch": 0.37793038124556005, "grad_norm": 0.04779278427510571, "learning_rate": 0.00023476324026856503, "loss": 0.7969, "step": 1995 }, { "epoch": 0.37887757518351883, "grad_norm": 0.05404705371255034, "learning_rate": 0.00023435352864991787, "loss": 0.8054, "step": 2000 }, { "epoch": 0.3798247691214776, "grad_norm": 0.04669907966605441, "learning_rate": 0.000233942894554596, "loss": 0.8018, "step": 2005 }, { "epoch": 0.38077196305943645, "grad_norm": 0.043137725132093525, "learning_rate": 0.0002335313424732282, "loss": 0.7924, "step": 2010 }, { "epoch": 0.3817191569973952, "grad_norm": 0.04354182985885191, "learning_rate": 0.00023311887690648196, "loss": 0.7958, "step": 2015 }, { "epoch": 0.382666350935354, "grad_norm": 0.042644363380025696, "learning_rate": 0.00023270550236501467, "loss": 0.8399, "step": 2020 }, { "epoch": 0.3836135448733128, "grad_norm": 0.04539230083821745, "learning_rate": 0.00023229122336942417, "loss": 0.8038, "step": 2025 }, { "epoch": 0.3845607388112716, "grad_norm": 0.04633684365426799, "learning_rate": 0.0002318760444501991, "loss": 0.7918, "step": 2030 }, { "epoch": 0.3855079327492304, "grad_norm": 0.04726122562269177, "learning_rate": 0.0002314599701476696, "loss": 0.8095, "step": 2035 }, { "epoch": 0.3864551266871892, "grad_norm": 0.04700120263284989, "learning_rate": 0.00023104300501195765, "loss": 0.7986, "step": 2040 }, { "epoch": 0.387402320625148, "grad_norm": 0.04621030089117987, "learning_rate": 0.0002306251536029271, "loss": 0.803, "step": 2045 }, { "epoch": 0.3883495145631068, "grad_norm": 0.04183970144784169, "learning_rate": 0.00023020642049013403, "loss": 0.785, "step": 2050 }, { "epoch": 0.38929670850106557, "grad_norm": 0.0469484416435775, "learning_rate": 0.0002297868102527767, "loss": 0.7991, "step": 2055 }, { "epoch": 0.3902439024390244, "grad_norm": 0.04737826498167925, "learning_rate": 0.0002293663274796454, "loss": 0.8004, "step": 2060 }, { "epoch": 0.3911910963769832, "grad_norm": 0.04713629528918563, "learning_rate": 0.00022894497676907244, "loss": 0.7856, "step": 2065 }, { "epoch": 0.39213829031494196, "grad_norm": 0.0539311516444963, "learning_rate": 0.0002285227627288816, "loss": 0.8007, "step": 2070 }, { "epoch": 0.3930854842529008, "grad_norm": 0.04352919743448704, "learning_rate": 0.00022809968997633803, "loss": 0.7976, "step": 2075 }, { "epoch": 0.3940326781908596, "grad_norm": 0.0438878761761322, "learning_rate": 0.00022767576313809757, "loss": 0.8084, "step": 2080 }, { "epoch": 0.39497987212881835, "grad_norm": 0.04356708774126194, "learning_rate": 0.0002272509868501561, "loss": 0.8018, "step": 2085 }, { "epoch": 0.3959270660667772, "grad_norm": 0.044799245818807745, "learning_rate": 0.00022682536575779926, "loss": 0.8185, "step": 2090 }, { "epoch": 0.39687426000473597, "grad_norm": 0.04366762752086422, "learning_rate": 0.00022639890451555094, "loss": 0.8082, "step": 2095 }, { "epoch": 0.39782145394269475, "grad_norm": 0.04970721314212078, "learning_rate": 0.00022597160778712303, "loss": 0.8163, "step": 2100 }, { "epoch": 0.3987686478806536, "grad_norm": 0.04264183467808777, "learning_rate": 0.00022554348024536413, "loss": 0.7765, "step": 2105 }, { "epoch": 0.39971584181861236, "grad_norm": 0.046000942745726546, "learning_rate": 0.00022511452657220836, "loss": 0.7767, "step": 2110 }, { "epoch": 0.40066303575657114, "grad_norm": 0.04416028306809829, "learning_rate": 0.0002246847514586244, "loss": 0.7756, "step": 2115 }, { "epoch": 0.40161022969453, "grad_norm": 0.04652032350471012, "learning_rate": 0.00022425415960456406, "loss": 0.785, "step": 2120 }, { "epoch": 0.40255742363248875, "grad_norm": 0.039319004016494394, "learning_rate": 0.00022382275571891088, "loss": 0.8171, "step": 2125 }, { "epoch": 0.40350461757044753, "grad_norm": 0.043191035340779275, "learning_rate": 0.00022339054451942853, "loss": 0.7888, "step": 2130 }, { "epoch": 0.40445181150840637, "grad_norm": 0.04850980353591229, "learning_rate": 0.00022295753073270957, "loss": 0.8024, "step": 2135 }, { "epoch": 0.40539900544636515, "grad_norm": 0.05346932816843321, "learning_rate": 0.00022252371909412338, "loss": 0.7943, "step": 2140 }, { "epoch": 0.4063461993843239, "grad_norm": 0.0504819764321655, "learning_rate": 0.00022208911434776446, "loss": 0.8113, "step": 2145 }, { "epoch": 0.40729339332228276, "grad_norm": 0.04358873484038266, "learning_rate": 0.00022165372124640075, "loss": 0.7792, "step": 2150 }, { "epoch": 0.40824058726024154, "grad_norm": 0.048136044036710554, "learning_rate": 0.0002212175445514214, "loss": 0.8271, "step": 2155 }, { "epoch": 0.4091877811982003, "grad_norm": 0.0482886839866423, "learning_rate": 0.00022078058903278493, "loss": 0.8082, "step": 2160 }, { "epoch": 0.41013497513615915, "grad_norm": 0.04932102621776615, "learning_rate": 0.00022034285946896683, "loss": 0.8157, "step": 2165 }, { "epoch": 0.41108216907411793, "grad_norm": 0.04558271327582903, "learning_rate": 0.0002199043606469075, "loss": 0.8205, "step": 2170 }, { "epoch": 0.4120293630120767, "grad_norm": 0.04637987890505172, "learning_rate": 0.00021946509736195982, "loss": 0.8104, "step": 2175 }, { "epoch": 0.41297655695003554, "grad_norm": 0.04402627600967093, "learning_rate": 0.00021902507441783666, "loss": 0.7735, "step": 2180 }, { "epoch": 0.4139237508879943, "grad_norm": 0.04343306123899328, "learning_rate": 0.0002185842966265585, "loss": 0.8137, "step": 2185 }, { "epoch": 0.4148709448259531, "grad_norm": 0.04049175499820178, "learning_rate": 0.00021814276880840057, "loss": 0.7666, "step": 2190 }, { "epoch": 0.41581813876391194, "grad_norm": 0.04190114466718279, "learning_rate": 0.0002177004957918404, "loss": 0.7941, "step": 2195 }, { "epoch": 0.4167653327018707, "grad_norm": 0.04167140065120627, "learning_rate": 0.00021725748241350486, "loss": 0.8049, "step": 2200 }, { "epoch": 0.4177125266398295, "grad_norm": 0.04030091050865229, "learning_rate": 0.00021681373351811715, "loss": 0.7765, "step": 2205 }, { "epoch": 0.4186597205777883, "grad_norm": 0.043928276470467834, "learning_rate": 0.00021636925395844425, "loss": 0.8004, "step": 2210 }, { "epoch": 0.4196069145157471, "grad_norm": 0.041711739434037935, "learning_rate": 0.00021592404859524338, "loss": 0.8014, "step": 2215 }, { "epoch": 0.4205541084537059, "grad_norm": 0.08515140585912362, "learning_rate": 0.00021547812229720905, "loss": 0.7925, "step": 2220 }, { "epoch": 0.42150130239166467, "grad_norm": 0.046571948096373585, "learning_rate": 0.0002150314799409198, "loss": 0.7995, "step": 2225 }, { "epoch": 0.4224484963296235, "grad_norm": 0.04291996713823751, "learning_rate": 0.00021458412641078484, "loss": 0.7833, "step": 2230 }, { "epoch": 0.4233956902675823, "grad_norm": 0.04800959860061716, "learning_rate": 0.00021413606659899075, "loss": 0.8056, "step": 2235 }, { "epoch": 0.42434288420554106, "grad_norm": 0.04165302702567769, "learning_rate": 0.00021368730540544784, "loss": 0.8031, "step": 2240 }, { "epoch": 0.4252900781434999, "grad_norm": 0.042443453575190054, "learning_rate": 0.0002132378477377366, "loss": 0.8342, "step": 2245 }, { "epoch": 0.4262372720814587, "grad_norm": 0.04364197763796348, "learning_rate": 0.00021278769851105413, "loss": 0.8069, "step": 2250 }, { "epoch": 0.42718446601941745, "grad_norm": 0.04424427184165216, "learning_rate": 0.00021233686264816024, "loss": 0.8093, "step": 2255 }, { "epoch": 0.4281316599573763, "grad_norm": 0.04438652665500913, "learning_rate": 0.00021188534507932369, "loss": 0.812, "step": 2260 }, { "epoch": 0.42907885389533507, "grad_norm": 0.04194291606682371, "learning_rate": 0.0002114331507422682, "loss": 0.7999, "step": 2265 }, { "epoch": 0.43002604783329385, "grad_norm": 0.040114143951823826, "learning_rate": 0.0002109802845821187, "loss": 0.776, "step": 2270 }, { "epoch": 0.4309732417712527, "grad_norm": 0.049855477781271945, "learning_rate": 0.0002105267515513469, "loss": 0.7898, "step": 2275 }, { "epoch": 0.43192043570921146, "grad_norm": 0.04832409909314252, "learning_rate": 0.00021007255660971736, "loss": 0.7705, "step": 2280 }, { "epoch": 0.43286762964717024, "grad_norm": 0.04448804574954983, "learning_rate": 0.00020961770472423323, "loss": 0.7856, "step": 2285 }, { "epoch": 0.4338148235851291, "grad_norm": 0.04056107663630711, "learning_rate": 0.00020916220086908185, "loss": 0.8386, "step": 2290 }, { "epoch": 0.43476201752308785, "grad_norm": 0.04562375800507603, "learning_rate": 0.00020870605002558038, "loss": 0.7919, "step": 2295 }, { "epoch": 0.43570921146104663, "grad_norm": 0.04990170126587913, "learning_rate": 0.00020824925718212133, "loss": 0.7812, "step": 2300 }, { "epoch": 0.43665640539900547, "grad_norm": 0.04782898096811606, "learning_rate": 0.00020779182733411813, "loss": 0.8204, "step": 2305 }, { "epoch": 0.43760359933696424, "grad_norm": 0.05977747562829158, "learning_rate": 0.00020733376548395026, "loss": 0.7674, "step": 2310 }, { "epoch": 0.438550793274923, "grad_norm": 0.050186919112014665, "learning_rate": 0.00020687507664090873, "loss": 0.7842, "step": 2315 }, { "epoch": 0.43949798721288186, "grad_norm": 0.06774152080561698, "learning_rate": 0.0002064157658211413, "loss": 0.7863, "step": 2320 }, { "epoch": 0.44044518115084064, "grad_norm": 0.05251562387148029, "learning_rate": 0.0002059558380475974, "loss": 0.7803, "step": 2325 }, { "epoch": 0.4413923750887994, "grad_norm": 0.11110903402386875, "learning_rate": 0.00020549529834997356, "loss": 0.8211, "step": 2330 }, { "epoch": 0.44233956902675825, "grad_norm": 0.05088713402476131, "learning_rate": 0.0002050341517646581, "loss": 0.8229, "step": 2335 }, { "epoch": 0.44328676296471703, "grad_norm": 0.048748937702802024, "learning_rate": 0.00020457240333467618, "loss": 0.8308, "step": 2340 }, { "epoch": 0.4442339569026758, "grad_norm": 0.04414994149656158, "learning_rate": 0.00020411005810963467, "loss": 0.7783, "step": 2345 }, { "epoch": 0.44518115084063464, "grad_norm": 0.04297554142966596, "learning_rate": 0.00020364712114566682, "loss": 0.7994, "step": 2350 }, { "epoch": 0.4461283447785934, "grad_norm": 0.0466918891507033, "learning_rate": 0.00020318359750537722, "loss": 0.7766, "step": 2355 }, { "epoch": 0.4470755387165522, "grad_norm": 0.0426774503592908, "learning_rate": 0.00020271949225778604, "loss": 0.7689, "step": 2360 }, { "epoch": 0.44802273265451104, "grad_norm": 0.04339210841225298, "learning_rate": 0.00020225481047827395, "loss": 0.7629, "step": 2365 }, { "epoch": 0.4489699265924698, "grad_norm": 0.04157539901744288, "learning_rate": 0.0002017895572485264, "loss": 0.7993, "step": 2370 }, { "epoch": 0.4499171205304286, "grad_norm": 0.04747869551715979, "learning_rate": 0.00020132373765647824, "loss": 0.7831, "step": 2375 }, { "epoch": 0.45086431446838743, "grad_norm": 0.041036503272024266, "learning_rate": 0.00020085735679625785, "loss": 0.7938, "step": 2380 }, { "epoch": 0.4518115084063462, "grad_norm": 0.046349793056969124, "learning_rate": 0.00020039041976813155, "loss": 0.8213, "step": 2385 }, { "epoch": 0.452758702344305, "grad_norm": 0.04266113778547003, "learning_rate": 0.000199922931678448, "loss": 0.7797, "step": 2390 }, { "epoch": 0.4537058962822638, "grad_norm": 0.04138658311957195, "learning_rate": 0.00019945489763958192, "loss": 0.7855, "step": 2395 }, { "epoch": 0.4546530902202226, "grad_norm": 0.03957822721175349, "learning_rate": 0.00019898632276987865, "loss": 0.7802, "step": 2400 }, { "epoch": 0.4556002841581814, "grad_norm": 0.043723247514117734, "learning_rate": 0.00019851721219359787, "loss": 0.7914, "step": 2405 }, { "epoch": 0.45654747809614016, "grad_norm": 0.040483397534663346, "learning_rate": 0.0001980475710408577, "loss": 0.7784, "step": 2410 }, { "epoch": 0.457494672034099, "grad_norm": 0.047435378340581154, "learning_rate": 0.00019757740444757856, "loss": 0.8099, "step": 2415 }, { "epoch": 0.45844186597205777, "grad_norm": 0.04167097525052772, "learning_rate": 0.00019710671755542684, "loss": 0.8004, "step": 2420 }, { "epoch": 0.45938905991001655, "grad_norm": 0.045347168717410416, "learning_rate": 0.0001966355155117592, "loss": 0.7503, "step": 2425 }, { "epoch": 0.4603362538479754, "grad_norm": 0.04793512210403035, "learning_rate": 0.00019616380346956555, "loss": 0.8034, "step": 2430 }, { "epoch": 0.46128344778593416, "grad_norm": 0.044750662926773134, "learning_rate": 0.00019569158658741325, "loss": 0.8036, "step": 2435 }, { "epoch": 0.46223064172389294, "grad_norm": 0.0408279434029639, "learning_rate": 0.0001952188700293905, "loss": 0.7744, "step": 2440 }, { "epoch": 0.4631778356618518, "grad_norm": 0.04475763705667161, "learning_rate": 0.0001947456589650498, "loss": 0.7831, "step": 2445 }, { "epoch": 0.46412502959981056, "grad_norm": 0.048943569028928394, "learning_rate": 0.00019427195856935156, "loss": 0.7584, "step": 2450 }, { "epoch": 0.46507222353776934, "grad_norm": 0.0470737958639738, "learning_rate": 0.00019379777402260735, "loss": 0.8045, "step": 2455 }, { "epoch": 0.46601941747572817, "grad_norm": 0.04333259402866556, "learning_rate": 0.0001933231105104235, "loss": 0.8252, "step": 2460 }, { "epoch": 0.46696661141368695, "grad_norm": 0.04500393126058614, "learning_rate": 0.00019284797322364412, "loss": 0.7963, "step": 2465 }, { "epoch": 0.46791380535164573, "grad_norm": 0.04617455518775394, "learning_rate": 0.00019237236735829434, "loss": 0.7905, "step": 2470 }, { "epoch": 0.46886099928960456, "grad_norm": 0.04568226599386409, "learning_rate": 0.0001918962981155238, "loss": 0.7878, "step": 2475 }, { "epoch": 0.46980819322756334, "grad_norm": 0.04384781746645944, "learning_rate": 0.00019141977070154945, "loss": 0.8155, "step": 2480 }, { "epoch": 0.4707553871655221, "grad_norm": 0.04128165433631373, "learning_rate": 0.0001909427903275988, "loss": 0.8024, "step": 2485 }, { "epoch": 0.47170258110348096, "grad_norm": 0.04842154647206823, "learning_rate": 0.00019046536220985267, "loss": 0.7762, "step": 2490 }, { "epoch": 0.47264977504143973, "grad_norm": 0.042673491173335736, "learning_rate": 0.00018998749156938854, "loss": 0.7709, "step": 2495 }, { "epoch": 0.4735969689793985, "grad_norm": 0.04785439305575584, "learning_rate": 0.00018950918363212324, "loss": 0.7804, "step": 2500 }, { "epoch": 0.47454416291735735, "grad_norm": 0.0461863217228199, "learning_rate": 0.00018903044362875558, "loss": 0.7925, "step": 2505 }, { "epoch": 0.4754913568553161, "grad_norm": 0.044243673212673404, "learning_rate": 0.0001885512767947097, "loss": 0.7941, "step": 2510 }, { "epoch": 0.4764385507932749, "grad_norm": 0.040611884775210036, "learning_rate": 0.0001880716883700772, "loss": 0.7562, "step": 2515 }, { "epoch": 0.47738574473123374, "grad_norm": 0.041073708433040416, "learning_rate": 0.00018759168359956034, "loss": 0.7856, "step": 2520 }, { "epoch": 0.4783329386691925, "grad_norm": 0.040555781455264096, "learning_rate": 0.00018711126773241434, "loss": 0.7808, "step": 2525 }, { "epoch": 0.4792801326071513, "grad_norm": 0.04412592641007611, "learning_rate": 0.00018663044602239016, "loss": 0.7527, "step": 2530 }, { "epoch": 0.48022732654511013, "grad_norm": 0.04484068242737891, "learning_rate": 0.00018614922372767705, "loss": 0.8026, "step": 2535 }, { "epoch": 0.4811745204830689, "grad_norm": 0.04494181539453932, "learning_rate": 0.00018566760611084482, "loss": 0.7884, "step": 2540 }, { "epoch": 0.4821217144210277, "grad_norm": 0.04177209726984591, "learning_rate": 0.00018518559843878663, "loss": 0.7944, "step": 2545 }, { "epoch": 0.4830689083589865, "grad_norm": 0.0418891994027945, "learning_rate": 0.00018470320598266114, "loss": 0.7876, "step": 2550 }, { "epoch": 0.4840161022969453, "grad_norm": 0.04509400985562226, "learning_rate": 0.00018422043401783499, "loss": 0.7906, "step": 2555 }, { "epoch": 0.4849632962349041, "grad_norm": 0.03886301281064671, "learning_rate": 0.00018373728782382497, "loss": 0.7658, "step": 2560 }, { "epoch": 0.4859104901728629, "grad_norm": 0.04186589657661449, "learning_rate": 0.00018325377268424054, "loss": 0.7921, "step": 2565 }, { "epoch": 0.4868576841108217, "grad_norm": 0.04025799467986521, "learning_rate": 0.00018276989388672573, "loss": 0.8143, "step": 2570 }, { "epoch": 0.4878048780487805, "grad_norm": 0.04058499655814065, "learning_rate": 0.0001822856567229016, "loss": 0.7819, "step": 2575 }, { "epoch": 0.4887520719867393, "grad_norm": 0.04628173800240841, "learning_rate": 0.0001818010664883082, "loss": 0.7944, "step": 2580 }, { "epoch": 0.4896992659246981, "grad_norm": 0.042223328062815924, "learning_rate": 0.0001813161284823466, "loss": 0.7975, "step": 2585 }, { "epoch": 0.49064645986265687, "grad_norm": 0.041313006294521955, "learning_rate": 0.00018083084800822128, "loss": 0.7954, "step": 2590 }, { "epoch": 0.49159365380061565, "grad_norm": 0.042569155544217974, "learning_rate": 0.0001803452303728816, "loss": 0.7628, "step": 2595 }, { "epoch": 0.4925408477385745, "grad_norm": 0.043867104326580246, "learning_rate": 0.00017985928088696434, "loss": 0.7558, "step": 2600 }, { "epoch": 0.49348804167653326, "grad_norm": 0.04371521207560914, "learning_rate": 0.0001793730048647352, "loss": 0.7686, "step": 2605 }, { "epoch": 0.49443523561449204, "grad_norm": 0.04369821893875824, "learning_rate": 0.00017888640762403078, "loss": 0.7961, "step": 2610 }, { "epoch": 0.4953824295524509, "grad_norm": 0.043782837772372955, "learning_rate": 0.00017839949448620064, "loss": 0.8211, "step": 2615 }, { "epoch": 0.49632962349040965, "grad_norm": 0.04427063556293102, "learning_rate": 0.00017791227077604876, "loss": 0.7961, "step": 2620 }, { "epoch": 0.49727681742836843, "grad_norm": 0.04245267391622513, "learning_rate": 0.00017742474182177567, "loss": 0.7556, "step": 2625 }, { "epoch": 0.49822401136632727, "grad_norm": 0.04105426628219163, "learning_rate": 0.00017693691295491982, "loss": 0.7994, "step": 2630 }, { "epoch": 0.49917120530428605, "grad_norm": 0.03984608708960842, "learning_rate": 0.0001764487895102995, "loss": 0.7818, "step": 2635 }, { "epoch": 0.5001183992422449, "grad_norm": 0.04428069948873579, "learning_rate": 0.00017596037682595465, "loss": 0.7862, "step": 2640 }, { "epoch": 0.5010655931802036, "grad_norm": 0.04490329464859081, "learning_rate": 0.00017547168024308806, "loss": 0.7975, "step": 2645 }, { "epoch": 0.5020127871181624, "grad_norm": 0.0413111891075842, "learning_rate": 0.0001749827051060072, "loss": 0.7678, "step": 2650 }, { "epoch": 0.5029599810561213, "grad_norm": 0.04240726873268668, "learning_rate": 0.00017449345676206595, "loss": 0.796, "step": 2655 }, { "epoch": 0.50390717499408, "grad_norm": 0.041665058788828147, "learning_rate": 0.0001740039405616057, "loss": 0.7769, "step": 2660 }, { "epoch": 0.5048543689320388, "grad_norm": 0.044320910745836974, "learning_rate": 0.00017351416185789725, "loss": 0.7805, "step": 2665 }, { "epoch": 0.5058015628699977, "grad_norm": 0.04059793612300337, "learning_rate": 0.00017302412600708202, "loss": 0.7585, "step": 2670 }, { "epoch": 0.5067487568079564, "grad_norm": 0.04002124902466378, "learning_rate": 0.00017253383836811356, "loss": 0.7902, "step": 2675 }, { "epoch": 0.5076959507459152, "grad_norm": 0.03946188221690789, "learning_rate": 0.00017204330430269896, "loss": 0.7883, "step": 2680 }, { "epoch": 0.5086431446838741, "grad_norm": 0.04168808355706374, "learning_rate": 0.00017155252917524014, "loss": 0.7623, "step": 2685 }, { "epoch": 0.5095903386218328, "grad_norm": 0.0403800938955808, "learning_rate": 0.0001710615183527753, "loss": 0.7837, "step": 2690 }, { "epoch": 0.5105375325597916, "grad_norm": 0.04265285946105665, "learning_rate": 0.0001705702772049201, "loss": 0.782, "step": 2695 }, { "epoch": 0.5114847264977505, "grad_norm": 0.03909716815369796, "learning_rate": 0.00017007881110380903, "loss": 0.7992, "step": 2700 }, { "epoch": 0.5124319204357092, "grad_norm": 0.04390278736912629, "learning_rate": 0.00016958712542403665, "loss": 0.7925, "step": 2705 }, { "epoch": 0.513379114373668, "grad_norm": 0.040328289968045196, "learning_rate": 0.00016909522554259875, "loss": 0.7888, "step": 2710 }, { "epoch": 0.5143263083116268, "grad_norm": 0.042972772541450946, "learning_rate": 0.00016860311683883366, "loss": 0.7522, "step": 2715 }, { "epoch": 0.5152735022495856, "grad_norm": 0.040623359273199364, "learning_rate": 0.0001681108046943633, "loss": 0.7673, "step": 2720 }, { "epoch": 0.5162206961875444, "grad_norm": 0.0451021298457456, "learning_rate": 0.00016761829449303442, "loss": 0.7803, "step": 2725 }, { "epoch": 0.5171678901255032, "grad_norm": 0.041924502748264085, "learning_rate": 0.00016712559162085963, "loss": 0.7691, "step": 2730 }, { "epoch": 0.518115084063462, "grad_norm": 0.04149735002775417, "learning_rate": 0.0001666327014659587, "loss": 0.7889, "step": 2735 }, { "epoch": 0.5190622780014208, "grad_norm": 0.04415559586614686, "learning_rate": 0.00016613962941849924, "loss": 0.7808, "step": 2740 }, { "epoch": 0.5200094719393796, "grad_norm": 0.03925128460208889, "learning_rate": 0.00016564638087063834, "loss": 0.7773, "step": 2745 }, { "epoch": 0.5209566658773384, "grad_norm": 0.04368831843596003, "learning_rate": 0.00016515296121646299, "loss": 0.7882, "step": 2750 }, { "epoch": 0.5219038598152972, "grad_norm": 0.0422781127201221, "learning_rate": 0.00016465937585193144, "loss": 0.764, "step": 2755 }, { "epoch": 0.522851053753256, "grad_norm": 0.040472037162999186, "learning_rate": 0.0001641656301748143, "loss": 0.7667, "step": 2760 }, { "epoch": 0.5237982476912147, "grad_norm": 0.04379401207313861, "learning_rate": 0.00016367172958463503, "loss": 0.7792, "step": 2765 }, { "epoch": 0.5247454416291736, "grad_norm": 0.04114160448623085, "learning_rate": 0.00016317767948261148, "loss": 0.7812, "step": 2770 }, { "epoch": 0.5256926355671324, "grad_norm": 0.03953583092530725, "learning_rate": 0.00016268348527159632, "loss": 0.751, "step": 2775 }, { "epoch": 0.5266398295050911, "grad_norm": 0.046057484794039295, "learning_rate": 0.0001621891523560183, "loss": 0.8031, "step": 2780 }, { "epoch": 0.52758702344305, "grad_norm": 0.044096492253313226, "learning_rate": 0.00016169468614182306, "loss": 0.768, "step": 2785 }, { "epoch": 0.5285342173810088, "grad_norm": 0.04028812660467143, "learning_rate": 0.00016120009203641374, "loss": 0.7417, "step": 2790 }, { "epoch": 0.5294814113189675, "grad_norm": 0.0421254976442694, "learning_rate": 0.00016070537544859238, "loss": 0.7525, "step": 2795 }, { "epoch": 0.5304286052569264, "grad_norm": 0.041297980776658014, "learning_rate": 0.00016021054178850025, "loss": 0.7555, "step": 2800 }, { "epoch": 0.5313757991948852, "grad_norm": 0.04218044701907684, "learning_rate": 0.000159715596467559, "loss": 0.7636, "step": 2805 }, { "epoch": 0.5323229931328439, "grad_norm": 0.04257545791321493, "learning_rate": 0.00015922054489841134, "loss": 0.7877, "step": 2810 }, { "epoch": 0.5332701870708028, "grad_norm": 0.04037611852463485, "learning_rate": 0.0001587253924948619, "loss": 0.7967, "step": 2815 }, { "epoch": 0.5342173810087616, "grad_norm": 0.04724884702232222, "learning_rate": 0.00015823014467181813, "loss": 0.778, "step": 2820 }, { "epoch": 0.5351645749467203, "grad_norm": 0.041552815375501054, "learning_rate": 0.00015773480684523082, "loss": 0.7644, "step": 2825 }, { "epoch": 0.5361117688846792, "grad_norm": 0.04060976824870222, "learning_rate": 0.00015723938443203505, "loss": 0.7568, "step": 2830 }, { "epoch": 0.537058962822638, "grad_norm": 0.039968469240745465, "learning_rate": 0.000156743882850091, "loss": 0.7641, "step": 2835 }, { "epoch": 0.5380061567605967, "grad_norm": 0.04550185985191349, "learning_rate": 0.00015624830751812452, "loss": 0.7631, "step": 2840 }, { "epoch": 0.5389533506985555, "grad_norm": 0.04397422552677812, "learning_rate": 0.0001557526638556681, "loss": 0.7898, "step": 2845 }, { "epoch": 0.5399005446365144, "grad_norm": 0.04128096672528751, "learning_rate": 0.00015525695728300142, "loss": 0.8049, "step": 2850 }, { "epoch": 0.5408477385744731, "grad_norm": 0.043926555415344445, "learning_rate": 0.00015476119322109215, "loss": 0.7856, "step": 2855 }, { "epoch": 0.5417949325124319, "grad_norm": 0.04191285035326946, "learning_rate": 0.00015426537709153665, "loss": 0.7811, "step": 2860 }, { "epoch": 0.5427421264503908, "grad_norm": 0.04398915485090648, "learning_rate": 0.00015376951431650063, "loss": 0.7642, "step": 2865 }, { "epoch": 0.5436893203883495, "grad_norm": 0.043931690655324165, "learning_rate": 0.00015327361031865994, "loss": 0.7453, "step": 2870 }, { "epoch": 0.5446365143263083, "grad_norm": 0.04098372350907064, "learning_rate": 0.00015277767052114134, "loss": 0.791, "step": 2875 }, { "epoch": 0.545583708264267, "grad_norm": 0.044827794864255185, "learning_rate": 0.00015228170034746287, "loss": 0.7742, "step": 2880 }, { "epoch": 0.5465309022022259, "grad_norm": 0.043361929504781155, "learning_rate": 0.00015178570522147503, "loss": 0.7721, "step": 2885 }, { "epoch": 0.5474780961401847, "grad_norm": 0.04140360142754399, "learning_rate": 0.00015128969056730094, "loss": 0.7638, "step": 2890 }, { "epoch": 0.5484252900781434, "grad_norm": 0.044541139967598814, "learning_rate": 0.00015079366180927747, "loss": 0.7648, "step": 2895 }, { "epoch": 0.5493724840161023, "grad_norm": 0.04373950378809721, "learning_rate": 0.00015029762437189555, "loss": 0.764, "step": 2900 }, { "epoch": 0.5503196779540611, "grad_norm": 0.044464277569613665, "learning_rate": 0.00014980158367974123, "loss": 0.7584, "step": 2905 }, { "epoch": 0.5512668718920198, "grad_norm": 0.041529612627100455, "learning_rate": 0.000149305545157436, "loss": 0.784, "step": 2910 }, { "epoch": 0.5522140658299787, "grad_norm": 0.04387046595628468, "learning_rate": 0.00014880951422957764, "loss": 0.7829, "step": 2915 }, { "epoch": 0.5531612597679375, "grad_norm": 0.03928205510955695, "learning_rate": 0.00014831349632068097, "loss": 0.7838, "step": 2920 }, { "epoch": 0.5541084537058962, "grad_norm": 0.03792796655680272, "learning_rate": 0.0001478174968551183, "loss": 0.7585, "step": 2925 }, { "epoch": 0.5550556476438551, "grad_norm": 0.0418350076429959, "learning_rate": 0.00014732152125706042, "loss": 0.7892, "step": 2930 }, { "epoch": 0.5560028415818139, "grad_norm": 0.0459671460831212, "learning_rate": 0.00014682557495041684, "loss": 0.733, "step": 2935 }, { "epoch": 0.5569500355197726, "grad_norm": 0.04058456377794918, "learning_rate": 0.00014632966335877706, "loss": 0.7686, "step": 2940 }, { "epoch": 0.5578972294577315, "grad_norm": 0.040486030304535854, "learning_rate": 0.00014583379190535075, "loss": 0.7396, "step": 2945 }, { "epoch": 0.5588444233956903, "grad_norm": 0.04178283807566538, "learning_rate": 0.00014533796601290868, "loss": 0.7982, "step": 2950 }, { "epoch": 0.559791617333649, "grad_norm": 0.040277272924603875, "learning_rate": 0.0001448421911037234, "loss": 0.7607, "step": 2955 }, { "epoch": 0.5607388112716079, "grad_norm": 0.041258343955637974, "learning_rate": 0.0001443464725995098, "loss": 0.7443, "step": 2960 }, { "epoch": 0.5616860052095667, "grad_norm": 0.04032233073174216, "learning_rate": 0.00014385081592136614, "loss": 0.7993, "step": 2965 }, { "epoch": 0.5626331991475254, "grad_norm": 0.03958174726228383, "learning_rate": 0.0001433552264897143, "loss": 0.7897, "step": 2970 }, { "epoch": 0.5635803930854842, "grad_norm": 0.03858023826870275, "learning_rate": 0.000142859709724241, "loss": 0.7531, "step": 2975 }, { "epoch": 0.5645275870234431, "grad_norm": 0.03863381061427778, "learning_rate": 0.00014236427104383827, "loss": 0.7683, "step": 2980 }, { "epoch": 0.5654747809614018, "grad_norm": 0.04304392912853754, "learning_rate": 0.00014186891586654395, "loss": 0.7611, "step": 2985 }, { "epoch": 0.5664219748993606, "grad_norm": 0.040517793123203284, "learning_rate": 0.00014137364960948307, "loss": 0.7597, "step": 2990 }, { "epoch": 0.5673691688373195, "grad_norm": 0.04010964103438508, "learning_rate": 0.0001408784776888079, "loss": 0.7886, "step": 2995 }, { "epoch": 0.5683163627752782, "grad_norm": 0.16933225878617628, "learning_rate": 0.00014038340551963946, "loss": 0.7754, "step": 3000 }, { "epoch": 0.569263556713237, "grad_norm": 0.049628098964115824, "learning_rate": 0.0001398884385160074, "loss": 0.7557, "step": 3005 }, { "epoch": 0.5702107506511959, "grad_norm": 0.04117704234522045, "learning_rate": 0.00013939358209079177, "loss": 0.7662, "step": 3010 }, { "epoch": 0.5711579445891546, "grad_norm": 0.04404870833994479, "learning_rate": 0.00013889884165566317, "loss": 0.7802, "step": 3015 }, { "epoch": 0.5721051385271134, "grad_norm": 0.0400783837624163, "learning_rate": 0.00013840422262102357, "loss": 0.7772, "step": 3020 }, { "epoch": 0.5730523324650723, "grad_norm": 0.04445272029297132, "learning_rate": 0.00013790973039594766, "loss": 0.7403, "step": 3025 }, { "epoch": 0.573999526403031, "grad_norm": 0.04189631829289255, "learning_rate": 0.000137415370388123, "loss": 0.7811, "step": 3030 }, { "epoch": 0.5749467203409898, "grad_norm": 0.04116724359754346, "learning_rate": 0.00013692114800379165, "loss": 0.7696, "step": 3035 }, { "epoch": 0.5758939142789486, "grad_norm": 0.03811498143887281, "learning_rate": 0.00013642706864769023, "loss": 0.7974, "step": 3040 }, { "epoch": 0.5768411082169074, "grad_norm": 0.03813260068428526, "learning_rate": 0.00013593313772299151, "loss": 0.7491, "step": 3045 }, { "epoch": 0.5777883021548662, "grad_norm": 0.04258997197360236, "learning_rate": 0.00013543936063124503, "loss": 0.7611, "step": 3050 }, { "epoch": 0.578735496092825, "grad_norm": 0.044652230022576414, "learning_rate": 0.00013494574277231772, "loss": 0.7639, "step": 3055 }, { "epoch": 0.5796826900307838, "grad_norm": 0.040644166117172274, "learning_rate": 0.00013445228954433568, "loss": 0.7871, "step": 3060 }, { "epoch": 0.5806298839687426, "grad_norm": 0.03834957657226225, "learning_rate": 0.00013395900634362418, "loss": 0.7516, "step": 3065 }, { "epoch": 0.5815770779067014, "grad_norm": 0.0425924434925859, "learning_rate": 0.0001334658985646493, "loss": 0.767, "step": 3070 }, { "epoch": 0.5825242718446602, "grad_norm": 0.04603157938094156, "learning_rate": 0.00013297297159995872, "loss": 0.7642, "step": 3075 }, { "epoch": 0.583471465782619, "grad_norm": 0.04251195805977364, "learning_rate": 0.00013248023084012268, "loss": 0.7695, "step": 3080 }, { "epoch": 0.5844186597205778, "grad_norm": 0.04025338997238214, "learning_rate": 0.0001319876816736754, "loss": 0.7428, "step": 3085 }, { "epoch": 0.5853658536585366, "grad_norm": 0.04211750103789674, "learning_rate": 0.00013149532948705542, "loss": 0.7621, "step": 3090 }, { "epoch": 0.5863130475964954, "grad_norm": 0.04455741942115236, "learning_rate": 0.0001310031796645475, "loss": 0.8198, "step": 3095 }, { "epoch": 0.5872602415344542, "grad_norm": 0.042122208500608355, "learning_rate": 0.00013051123758822317, "loss": 0.7902, "step": 3100 }, { "epoch": 0.5882074354724129, "grad_norm": 0.04026734370639814, "learning_rate": 0.0001300195086378822, "loss": 0.743, "step": 3105 }, { "epoch": 0.5891546294103718, "grad_norm": 0.04263858640190378, "learning_rate": 0.00012952799819099362, "loss": 0.7611, "step": 3110 }, { "epoch": 0.5901018233483306, "grad_norm": 0.042020491856201506, "learning_rate": 0.00012903671162263692, "loss": 0.7638, "step": 3115 }, { "epoch": 0.5910490172862893, "grad_norm": 0.0411120885986108, "learning_rate": 0.0001285456543054433, "loss": 0.7779, "step": 3120 }, { "epoch": 0.5919962112242482, "grad_norm": 0.0406466480831228, "learning_rate": 0.0001280548316095369, "loss": 0.7648, "step": 3125 }, { "epoch": 0.592943405162207, "grad_norm": 0.039876223182510155, "learning_rate": 0.00012756424890247612, "loss": 0.7465, "step": 3130 }, { "epoch": 0.5938905991001657, "grad_norm": 0.03968060112874898, "learning_rate": 0.00012707391154919478, "loss": 0.7788, "step": 3135 }, { "epoch": 0.5948377930381246, "grad_norm": 0.04109563591743239, "learning_rate": 0.00012658382491194368, "loss": 0.7629, "step": 3140 }, { "epoch": 0.5957849869760834, "grad_norm": 0.04068329826647931, "learning_rate": 0.0001260939943502317, "loss": 0.7652, "step": 3145 }, { "epoch": 0.5967321809140421, "grad_norm": 0.039457301232553726, "learning_rate": 0.00012560442522076745, "loss": 0.771, "step": 3150 }, { "epoch": 0.597679374852001, "grad_norm": 0.039313871399632806, "learning_rate": 0.0001251151228774005, "loss": 0.7665, "step": 3155 }, { "epoch": 0.5986265687899598, "grad_norm": 0.04012444827177292, "learning_rate": 0.0001246260926710628, "loss": 0.7672, "step": 3160 }, { "epoch": 0.5995737627279185, "grad_norm": 0.04199451923312326, "learning_rate": 0.00012413733994971044, "loss": 0.7767, "step": 3165 }, { "epoch": 0.6005209566658773, "grad_norm": 0.043536826355634925, "learning_rate": 0.0001236488700582648, "loss": 0.7447, "step": 3170 }, { "epoch": 0.6014681506038362, "grad_norm": 0.04225224126474484, "learning_rate": 0.00012316068833855438, "loss": 0.7705, "step": 3175 }, { "epoch": 0.6024153445417949, "grad_norm": 0.04122031488132078, "learning_rate": 0.00012267280012925622, "loss": 0.7553, "step": 3180 }, { "epoch": 0.6033625384797537, "grad_norm": 0.04052877747206926, "learning_rate": 0.00012218521076583767, "loss": 0.7395, "step": 3185 }, { "epoch": 0.6043097324177126, "grad_norm": 0.03976724566633056, "learning_rate": 0.00012169792558049789, "loss": 0.7902, "step": 3190 }, { "epoch": 0.6052569263556713, "grad_norm": 0.03994928147888954, "learning_rate": 0.00012121094990210951, "loss": 0.7492, "step": 3195 }, { "epoch": 0.6062041202936301, "grad_norm": 0.0410556694914752, "learning_rate": 0.00012072428905616064, "loss": 0.7513, "step": 3200 }, { "epoch": 0.607151314231589, "grad_norm": 0.04147107567296144, "learning_rate": 0.00012023794836469624, "loss": 0.7321, "step": 3205 }, { "epoch": 0.6080985081695477, "grad_norm": 0.042191697935109296, "learning_rate": 0.00011975193314626025, "loss": 0.7553, "step": 3210 }, { "epoch": 0.6090457021075065, "grad_norm": 0.04045134486807547, "learning_rate": 0.00011926624871583717, "loss": 0.7352, "step": 3215 }, { "epoch": 0.6099928960454654, "grad_norm": 0.04292542392593404, "learning_rate": 0.00011878090038479416, "loss": 0.771, "step": 3220 }, { "epoch": 0.6109400899834241, "grad_norm": 0.03947257757285626, "learning_rate": 0.00011829589346082281, "loss": 0.7555, "step": 3225 }, { "epoch": 0.6118872839213829, "grad_norm": 0.03798498100177421, "learning_rate": 0.00011781123324788111, "loss": 0.7717, "step": 3230 }, { "epoch": 0.6128344778593418, "grad_norm": 0.040633643895124465, "learning_rate": 0.00011732692504613554, "loss": 0.7412, "step": 3235 }, { "epoch": 0.6137816717973005, "grad_norm": 0.038994248859382026, "learning_rate": 0.00011684297415190295, "loss": 0.7626, "step": 3240 }, { "epoch": 0.6147288657352593, "grad_norm": 0.037087473994001245, "learning_rate": 0.00011635938585759284, "loss": 0.7485, "step": 3245 }, { "epoch": 0.6156760596732181, "grad_norm": 0.04283707658057122, "learning_rate": 0.00011587616545164923, "loss": 0.76, "step": 3250 }, { "epoch": 0.6166232536111769, "grad_norm": 0.03994982225913288, "learning_rate": 0.00011539331821849317, "loss": 0.7867, "step": 3255 }, { "epoch": 0.6175704475491357, "grad_norm": 0.045291416938963186, "learning_rate": 0.00011491084943846459, "loss": 0.7909, "step": 3260 }, { "epoch": 0.6185176414870944, "grad_norm": 0.04461380336072227, "learning_rate": 0.00011442876438776475, "loss": 0.7501, "step": 3265 }, { "epoch": 0.6194648354250533, "grad_norm": 0.04724285794795771, "learning_rate": 0.00011394706833839858, "loss": 0.7663, "step": 3270 }, { "epoch": 0.6204120293630121, "grad_norm": 0.04043233809259392, "learning_rate": 0.00011346576655811683, "loss": 0.7573, "step": 3275 }, { "epoch": 0.6213592233009708, "grad_norm": 0.04083620126750758, "learning_rate": 0.00011298486431035874, "loss": 0.796, "step": 3280 }, { "epoch": 0.6223064172389297, "grad_norm": 0.04080275832439418, "learning_rate": 0.00011250436685419418, "loss": 0.7631, "step": 3285 }, { "epoch": 0.6232536111768885, "grad_norm": 0.03886958426209183, "learning_rate": 0.00011202427944426636, "loss": 0.75, "step": 3290 }, { "epoch": 0.6242008051148472, "grad_norm": 0.03941484340988021, "learning_rate": 0.00011154460733073433, "loss": 0.7562, "step": 3295 }, { "epoch": 0.625147999052806, "grad_norm": 0.04351471227388938, "learning_rate": 0.00011106535575921536, "loss": 0.7714, "step": 3300 }, { "epoch": 0.6260951929907649, "grad_norm": 0.04704801990878809, "learning_rate": 0.00011058652997072802, "loss": 0.7793, "step": 3305 }, { "epoch": 0.6270423869287236, "grad_norm": 0.04584785902650524, "learning_rate": 0.00011010813520163427, "loss": 0.7626, "step": 3310 }, { "epoch": 0.6279895808666824, "grad_norm": 0.04629280784526772, "learning_rate": 0.00010963017668358273, "loss": 0.7418, "step": 3315 }, { "epoch": 0.6289367748046413, "grad_norm": 0.04407747166586352, "learning_rate": 0.00010915265964345114, "loss": 0.7459, "step": 3320 }, { "epoch": 0.6298839687426, "grad_norm": 0.039913077554486434, "learning_rate": 0.00010867558930328934, "loss": 0.7504, "step": 3325 }, { "epoch": 0.6308311626805588, "grad_norm": 0.04379451323559438, "learning_rate": 0.00010819897088026224, "loss": 0.7633, "step": 3330 }, { "epoch": 0.6317783566185177, "grad_norm": 0.04536301609111961, "learning_rate": 0.00010772280958659241, "loss": 0.7657, "step": 3335 }, { "epoch": 0.6327255505564764, "grad_norm": 0.041972499038774445, "learning_rate": 0.00010724711062950358, "loss": 0.774, "step": 3340 }, { "epoch": 0.6336727444944352, "grad_norm": 0.04243182090390366, "learning_rate": 0.00010677187921116325, "loss": 0.7593, "step": 3345 }, { "epoch": 0.6346199384323941, "grad_norm": 0.041997095162117505, "learning_rate": 0.00010629712052862619, "loss": 0.7525, "step": 3350 }, { "epoch": 0.6355671323703528, "grad_norm": 0.043340122902892075, "learning_rate": 0.00010582283977377709, "loss": 0.7554, "step": 3355 }, { "epoch": 0.6365143263083116, "grad_norm": 0.041977926659558386, "learning_rate": 0.00010534904213327447, "loss": 0.7503, "step": 3360 }, { "epoch": 0.6374615202462705, "grad_norm": 0.04155046135436731, "learning_rate": 0.00010487573278849338, "loss": 0.7555, "step": 3365 }, { "epoch": 0.6384087141842292, "grad_norm": 0.04049957089110068, "learning_rate": 0.00010440291691546895, "loss": 0.7701, "step": 3370 }, { "epoch": 0.639355908122188, "grad_norm": 0.042538488276278326, "learning_rate": 0.00010393059968483989, "loss": 0.765, "step": 3375 }, { "epoch": 0.6403031020601468, "grad_norm": 0.03712808828646828, "learning_rate": 0.00010345878626179162, "loss": 0.7492, "step": 3380 }, { "epoch": 0.6412502959981056, "grad_norm": 0.04192587249684641, "learning_rate": 0.00010298748180600031, "loss": 0.7644, "step": 3385 }, { "epoch": 0.6421974899360644, "grad_norm": 0.037429843394214256, "learning_rate": 0.00010251669147157582, "loss": 0.7484, "step": 3390 }, { "epoch": 0.6431446838740232, "grad_norm": 0.0454827756794171, "learning_rate": 0.00010204642040700593, "loss": 0.7432, "step": 3395 }, { "epoch": 0.644091877811982, "grad_norm": 0.03998857520778792, "learning_rate": 0.00010157667375509966, "loss": 0.7767, "step": 3400 }, { "epoch": 0.6450390717499408, "grad_norm": 0.03874189126294134, "learning_rate": 0.00010110745665293102, "loss": 0.7613, "step": 3405 }, { "epoch": 0.6459862656878996, "grad_norm": 0.04315072772814885, "learning_rate": 0.00010063877423178327, "loss": 0.7615, "step": 3410 }, { "epoch": 0.6469334596258584, "grad_norm": 0.039915598481917704, "learning_rate": 0.00010017063161709203, "loss": 0.7368, "step": 3415 }, { "epoch": 0.6478806535638172, "grad_norm": 0.043320616700844417, "learning_rate": 9.970303392839016e-05, "loss": 0.7643, "step": 3420 }, { "epoch": 0.648827847501776, "grad_norm": 0.038608629273838305, "learning_rate": 9.923598627925085e-05, "loss": 0.7647, "step": 3425 }, { "epoch": 0.6497750414397347, "grad_norm": 0.0403187718238657, "learning_rate": 9.876949377723254e-05, "loss": 0.7583, "step": 3430 }, { "epoch": 0.6507222353776936, "grad_norm": 0.04071452984696662, "learning_rate": 9.830356152382245e-05, "loss": 0.7543, "step": 3435 }, { "epoch": 0.6516694293156524, "grad_norm": 0.04123560216855826, "learning_rate": 9.783819461438097e-05, "loss": 0.7503, "step": 3440 }, { "epoch": 0.6526166232536111, "grad_norm": 0.04169247579017777, "learning_rate": 9.737339813808621e-05, "loss": 0.7633, "step": 3445 }, { "epoch": 0.65356381719157, "grad_norm": 0.03950148386345426, "learning_rate": 9.69091771778778e-05, "loss": 0.7797, "step": 3450 }, { "epoch": 0.6545110111295288, "grad_norm": 0.041505476095881656, "learning_rate": 9.644553681040196e-05, "loss": 0.7464, "step": 3455 }, { "epoch": 0.6554582050674875, "grad_norm": 0.04278317549795013, "learning_rate": 9.598248210595531e-05, "loss": 0.7758, "step": 3460 }, { "epoch": 0.6564053990054464, "grad_norm": 0.04182779902664369, "learning_rate": 9.552001812842996e-05, "loss": 0.7786, "step": 3465 }, { "epoch": 0.6573525929434052, "grad_norm": 0.03765966562832586, "learning_rate": 9.505814993525797e-05, "loss": 0.748, "step": 3470 }, { "epoch": 0.6582997868813639, "grad_norm": 0.03856379305262854, "learning_rate": 9.459688257735575e-05, "loss": 0.7265, "step": 3475 }, { "epoch": 0.6592469808193228, "grad_norm": 0.042262933505673055, "learning_rate": 9.413622109906937e-05, "loss": 0.7608, "step": 3480 }, { "epoch": 0.6601941747572816, "grad_norm": 0.03934049459172944, "learning_rate": 9.367617053811885e-05, "loss": 0.7355, "step": 3485 }, { "epoch": 0.6611413686952403, "grad_norm": 0.040505520508700994, "learning_rate": 9.321673592554346e-05, "loss": 0.7285, "step": 3490 }, { "epoch": 0.6620885626331992, "grad_norm": 0.0443739357214151, "learning_rate": 9.275792228564647e-05, "loss": 0.7465, "step": 3495 }, { "epoch": 0.663035756571158, "grad_norm": 0.042948832615535365, "learning_rate": 9.229973463594036e-05, "loss": 0.7415, "step": 3500 }, { "epoch": 0.6639829505091167, "grad_norm": 0.037284235954941944, "learning_rate": 9.184217798709195e-05, "loss": 0.7624, "step": 3505 }, { "epoch": 0.6649301444470755, "grad_norm": 0.041373070240537976, "learning_rate": 9.13852573428673e-05, "loss": 0.76, "step": 3510 }, { "epoch": 0.6658773383850344, "grad_norm": 0.04078003790880156, "learning_rate": 9.092897770007748e-05, "loss": 0.7696, "step": 3515 }, { "epoch": 0.6668245323229931, "grad_norm": 0.04227188813941138, "learning_rate": 9.047334404852349e-05, "loss": 0.7385, "step": 3520 }, { "epoch": 0.6677717262609519, "grad_norm": 0.04055813753473042, "learning_rate": 9.001836137094199e-05, "loss": 0.7411, "step": 3525 }, { "epoch": 0.6687189201989108, "grad_norm": 0.03979213857043945, "learning_rate": 8.95640346429506e-05, "loss": 0.7419, "step": 3530 }, { "epoch": 0.6696661141368695, "grad_norm": 0.04277127849333733, "learning_rate": 8.911036883299367e-05, "loss": 0.7459, "step": 3535 }, { "epoch": 0.6706133080748283, "grad_norm": 0.0414740211347812, "learning_rate": 8.865736890228782e-05, "loss": 0.7663, "step": 3540 }, { "epoch": 0.6715605020127872, "grad_norm": 0.03968770903759535, "learning_rate": 8.820503980476766e-05, "loss": 0.7397, "step": 3545 }, { "epoch": 0.6725076959507459, "grad_norm": 0.04028225688842527, "learning_rate": 8.775338648703182e-05, "loss": 0.7359, "step": 3550 }, { "epoch": 0.6734548898887047, "grad_norm": 0.03957291045780008, "learning_rate": 8.730241388828852e-05, "loss": 0.7458, "step": 3555 }, { "epoch": 0.6744020838266636, "grad_norm": 0.04075648253621573, "learning_rate": 8.685212694030197e-05, "loss": 0.7334, "step": 3560 }, { "epoch": 0.6753492777646223, "grad_norm": 0.03417551964299854, "learning_rate": 8.640253056733788e-05, "loss": 0.7105, "step": 3565 }, { "epoch": 0.6762964717025811, "grad_norm": 0.041401776497170924, "learning_rate": 8.595362968611036e-05, "loss": 0.714, "step": 3570 }, { "epoch": 0.67724366564054, "grad_norm": 0.04079313998460472, "learning_rate": 8.550542920572751e-05, "loss": 0.7426, "step": 3575 }, { "epoch": 0.6781908595784987, "grad_norm": 0.03987821946868629, "learning_rate": 8.505793402763786e-05, "loss": 0.763, "step": 3580 }, { "epoch": 0.6791380535164575, "grad_norm": 0.04091134166237045, "learning_rate": 8.461114904557712e-05, "loss": 0.751, "step": 3585 }, { "epoch": 0.6800852474544163, "grad_norm": 0.04177068343214537, "learning_rate": 8.416507914551405e-05, "loss": 0.78, "step": 3590 }, { "epoch": 0.6810324413923751, "grad_norm": 0.03961181610952186, "learning_rate": 8.371972920559791e-05, "loss": 0.7335, "step": 3595 }, { "epoch": 0.6819796353303339, "grad_norm": 0.039417654158232895, "learning_rate": 8.327510409610408e-05, "loss": 0.7642, "step": 3600 }, { "epoch": 0.6829268292682927, "grad_norm": 0.04073914119422375, "learning_rate": 8.283120867938156e-05, "loss": 0.7468, "step": 3605 }, { "epoch": 0.6838740232062515, "grad_norm": 0.03905373593771108, "learning_rate": 8.23880478097996e-05, "loss": 0.7377, "step": 3610 }, { "epoch": 0.6848212171442103, "grad_norm": 0.04307956957119377, "learning_rate": 8.194562633369428e-05, "loss": 0.7536, "step": 3615 }, { "epoch": 0.6857684110821691, "grad_norm": 0.04210126693770651, "learning_rate": 8.150394908931622e-05, "loss": 0.7554, "step": 3620 }, { "epoch": 0.6867156050201279, "grad_norm": 0.0421490482386118, "learning_rate": 8.106302090677682e-05, "loss": 0.7936, "step": 3625 }, { "epoch": 0.6876627989580867, "grad_norm": 0.03588928471279015, "learning_rate": 8.062284660799617e-05, "loss": 0.7287, "step": 3630 }, { "epoch": 0.6886099928960455, "grad_norm": 0.03952924524135366, "learning_rate": 8.018343100664975e-05, "loss": 0.7527, "step": 3635 }, { "epoch": 0.6895571868340042, "grad_norm": 0.03984238587807663, "learning_rate": 7.974477890811622e-05, "loss": 0.7528, "step": 3640 }, { "epoch": 0.6905043807719631, "grad_norm": 0.04306536866980951, "learning_rate": 7.930689510942467e-05, "loss": 0.7263, "step": 3645 }, { "epoch": 0.6914515747099218, "grad_norm": 0.03823091396622685, "learning_rate": 7.886978439920219e-05, "loss": 0.7262, "step": 3650 }, { "epoch": 0.6923987686478806, "grad_norm": 0.040037669190792775, "learning_rate": 7.84334515576215e-05, "loss": 0.761, "step": 3655 }, { "epoch": 0.6933459625858395, "grad_norm": 0.04316597192554324, "learning_rate": 7.799790135634848e-05, "loss": 0.7654, "step": 3660 }, { "epoch": 0.6942931565237982, "grad_norm": 0.040090557871707184, "learning_rate": 7.756313855849061e-05, "loss": 0.7576, "step": 3665 }, { "epoch": 0.695240350461757, "grad_norm": 0.03885025639567031, "learning_rate": 7.712916791854398e-05, "loss": 0.7337, "step": 3670 }, { "epoch": 0.6961875443997159, "grad_norm": 0.039817406008376306, "learning_rate": 7.669599418234209e-05, "loss": 0.7827, "step": 3675 }, { "epoch": 0.6971347383376746, "grad_norm": 0.03796969889976453, "learning_rate": 7.626362208700345e-05, "loss": 0.7401, "step": 3680 }, { "epoch": 0.6980819322756334, "grad_norm": 0.04250157953569218, "learning_rate": 7.583205636087998e-05, "loss": 0.7849, "step": 3685 }, { "epoch": 0.6990291262135923, "grad_norm": 0.039617912653361516, "learning_rate": 7.540130172350553e-05, "loss": 0.7299, "step": 3690 }, { "epoch": 0.699976320151551, "grad_norm": 0.03816192045516508, "learning_rate": 7.497136288554358e-05, "loss": 0.7514, "step": 3695 }, { "epoch": 0.7009235140895098, "grad_norm": 0.03983670858460556, "learning_rate": 7.454224454873653e-05, "loss": 0.726, "step": 3700 }, { "epoch": 0.7018707080274686, "grad_norm": 0.03934247221089433, "learning_rate": 7.411395140585366e-05, "loss": 0.755, "step": 3705 }, { "epoch": 0.7028179019654274, "grad_norm": 0.041762302180065505, "learning_rate": 7.368648814064017e-05, "loss": 0.7731, "step": 3710 }, { "epoch": 0.7037650959033862, "grad_norm": 0.03718549677733008, "learning_rate": 7.325985942776586e-05, "loss": 0.7245, "step": 3715 }, { "epoch": 0.704712289841345, "grad_norm": 0.03979809659047382, "learning_rate": 7.283406993277401e-05, "loss": 0.7493, "step": 3720 }, { "epoch": 0.7056594837793038, "grad_norm": 0.03891391378511992, "learning_rate": 7.240912431203036e-05, "loss": 0.7372, "step": 3725 }, { "epoch": 0.7066066777172626, "grad_norm": 0.03760484285201233, "learning_rate": 7.198502721267201e-05, "loss": 0.7319, "step": 3730 }, { "epoch": 0.7075538716552214, "grad_norm": 0.039788628547401186, "learning_rate": 7.156178327255696e-05, "loss": 0.7107, "step": 3735 }, { "epoch": 0.7085010655931802, "grad_norm": 0.03854056853156896, "learning_rate": 7.113939712021312e-05, "loss": 0.7195, "step": 3740 }, { "epoch": 0.709448259531139, "grad_norm": 0.04238606817116954, "learning_rate": 7.071787337478785e-05, "loss": 0.7448, "step": 3745 }, { "epoch": 0.7103954534690978, "grad_norm": 0.03913911448999711, "learning_rate": 7.029721664599718e-05, "loss": 0.7553, "step": 3750 }, { "epoch": 0.7113426474070565, "grad_norm": 0.04214126618606677, "learning_rate": 6.987743153407576e-05, "loss": 0.7263, "step": 3755 }, { "epoch": 0.7122898413450154, "grad_norm": 0.044083161857802124, "learning_rate": 6.94585226297263e-05, "loss": 0.7366, "step": 3760 }, { "epoch": 0.7132370352829742, "grad_norm": 0.04073891396330348, "learning_rate": 6.90404945140695e-05, "loss": 0.7389, "step": 3765 }, { "epoch": 0.7141842292209329, "grad_norm": 0.03825315556026045, "learning_rate": 6.862335175859387e-05, "loss": 0.7347, "step": 3770 }, { "epoch": 0.7151314231588918, "grad_norm": 0.03504164436950897, "learning_rate": 6.820709892510566e-05, "loss": 0.7563, "step": 3775 }, { "epoch": 0.7160786170968506, "grad_norm": 0.040944422736284514, "learning_rate": 6.779174056567923e-05, "loss": 0.7324, "step": 3780 }, { "epoch": 0.7170258110348093, "grad_norm": 0.042252601374869914, "learning_rate": 6.737728122260705e-05, "loss": 0.7428, "step": 3785 }, { "epoch": 0.7179730049727682, "grad_norm": 0.04198448843744255, "learning_rate": 6.696372542835007e-05, "loss": 0.7563, "step": 3790 }, { "epoch": 0.718920198910727, "grad_norm": 0.03891146859575185, "learning_rate": 6.655107770548829e-05, "loss": 0.7653, "step": 3795 }, { "epoch": 0.7198673928486857, "grad_norm": 0.047863846470899335, "learning_rate": 6.613934256667098e-05, "loss": 0.7443, "step": 3800 }, { "epoch": 0.7208145867866446, "grad_norm": 0.04392069667011626, "learning_rate": 6.572852451456766e-05, "loss": 0.7506, "step": 3805 }, { "epoch": 0.7217617807246034, "grad_norm": 0.04054436230218552, "learning_rate": 6.53186280418188e-05, "loss": 0.7472, "step": 3810 }, { "epoch": 0.7227089746625621, "grad_norm": 0.03885365408522593, "learning_rate": 6.490965763098654e-05, "loss": 0.719, "step": 3815 }, { "epoch": 0.723656168600521, "grad_norm": 0.04194579275932475, "learning_rate": 6.450161775450572e-05, "loss": 0.7125, "step": 3820 }, { "epoch": 0.7246033625384798, "grad_norm": 0.04052378750052248, "learning_rate": 6.409451287463508e-05, "loss": 0.7766, "step": 3825 }, { "epoch": 0.7255505564764385, "grad_norm": 0.039568496333215346, "learning_rate": 6.368834744340837e-05, "loss": 0.7278, "step": 3830 }, { "epoch": 0.7264977504143973, "grad_norm": 0.036717341021427846, "learning_rate": 6.328312590258568e-05, "loss": 0.7389, "step": 3835 }, { "epoch": 0.7274449443523562, "grad_norm": 0.04020458964083549, "learning_rate": 6.28788526836049e-05, "loss": 0.7484, "step": 3840 }, { "epoch": 0.7283921382903149, "grad_norm": 0.04041885111843109, "learning_rate": 6.247553220753305e-05, "loss": 0.7286, "step": 3845 }, { "epoch": 0.7293393322282737, "grad_norm": 0.03932262216800061, "learning_rate": 6.207316888501833e-05, "loss": 0.7211, "step": 3850 }, { "epoch": 0.7302865261662326, "grad_norm": 0.03945586913466572, "learning_rate": 6.167176711624157e-05, "loss": 0.7343, "step": 3855 }, { "epoch": 0.7312337201041913, "grad_norm": 0.03793066654633331, "learning_rate": 6.127133129086818e-05, "loss": 0.7283, "step": 3860 }, { "epoch": 0.7321809140421501, "grad_norm": 0.03754478471345365, "learning_rate": 6.087186578800027e-05, "loss": 0.7537, "step": 3865 }, { "epoch": 0.733128107980109, "grad_norm": 0.04134693277562893, "learning_rate": 6.0473374976128444e-05, "loss": 0.7279, "step": 3870 }, { "epoch": 0.7340753019180677, "grad_norm": 0.03826648717909671, "learning_rate": 6.007586321308445e-05, "loss": 0.722, "step": 3875 }, { "epoch": 0.7350224958560265, "grad_norm": 0.041876625013751154, "learning_rate": 5.967933484599324e-05, "loss": 0.7488, "step": 3880 }, { "epoch": 0.7359696897939854, "grad_norm": 0.04361394297294752, "learning_rate": 5.928379421122557e-05, "loss": 0.751, "step": 3885 }, { "epoch": 0.7369168837319441, "grad_norm": 0.040558338035631775, "learning_rate": 5.888924563435032e-05, "loss": 0.7359, "step": 3890 }, { "epoch": 0.7378640776699029, "grad_norm": 0.041973741302183905, "learning_rate": 5.849569343008758e-05, "loss": 0.746, "step": 3895 }, { "epoch": 0.7388112716078618, "grad_norm": 0.043105183047686346, "learning_rate": 5.8103141902261205e-05, "loss": 0.7403, "step": 3900 }, { "epoch": 0.7397584655458205, "grad_norm": 0.038539690413186195, "learning_rate": 5.7711595343751806e-05, "loss": 0.7467, "step": 3905 }, { "epoch": 0.7407056594837793, "grad_norm": 0.040297657563285356, "learning_rate": 5.732105803644986e-05, "loss": 0.7256, "step": 3910 }, { "epoch": 0.7416528534217381, "grad_norm": 0.040590581894694416, "learning_rate": 5.693153425120872e-05, "loss": 0.7301, "step": 3915 }, { "epoch": 0.7426000473596969, "grad_norm": 0.039579719033313615, "learning_rate": 5.654302824779815e-05, "loss": 0.7343, "step": 3920 }, { "epoch": 0.7435472412976557, "grad_norm": 0.03807616540727068, "learning_rate": 5.6155544274857436e-05, "loss": 0.7219, "step": 3925 }, { "epoch": 0.7444944352356145, "grad_norm": 0.03907724233269258, "learning_rate": 5.576908656984938e-05, "loss": 0.7359, "step": 3930 }, { "epoch": 0.7454416291735733, "grad_norm": 0.0411669903197421, "learning_rate": 5.5383659359013516e-05, "loss": 0.7606, "step": 3935 }, { "epoch": 0.7463888231115321, "grad_norm": 0.03660616286712515, "learning_rate": 5.499926685731999e-05, "loss": 0.7144, "step": 3940 }, { "epoch": 0.7473360170494909, "grad_norm": 0.04196508435156298, "learning_rate": 5.461591326842368e-05, "loss": 0.7268, "step": 3945 }, { "epoch": 0.7482832109874497, "grad_norm": 0.03782538627658778, "learning_rate": 5.4233602784617875e-05, "loss": 0.7538, "step": 3950 }, { "epoch": 0.7492304049254085, "grad_norm": 0.039917199754721966, "learning_rate": 5.385233958678899e-05, "loss": 0.7471, "step": 3955 }, { "epoch": 0.7501775988633673, "grad_norm": 0.041462493245785374, "learning_rate": 5.347212784437014e-05, "loss": 0.7335, "step": 3960 }, { "epoch": 0.751124792801326, "grad_norm": 0.04262717035805544, "learning_rate": 5.3092971715296036e-05, "loss": 0.7517, "step": 3965 }, { "epoch": 0.7520719867392849, "grad_norm": 0.03912615098583992, "learning_rate": 5.2714875345957364e-05, "loss": 0.7505, "step": 3970 }, { "epoch": 0.7530191806772437, "grad_norm": 0.03843483966970995, "learning_rate": 5.2337842871155464e-05, "loss": 0.729, "step": 3975 }, { "epoch": 0.7539663746152024, "grad_norm": 0.04001769351762854, "learning_rate": 5.1961878414057116e-05, "loss": 0.743, "step": 3980 }, { "epoch": 0.7549135685531613, "grad_norm": 0.03903544150596959, "learning_rate": 5.158698608614928e-05, "loss": 0.7231, "step": 3985 }, { "epoch": 0.7558607624911201, "grad_norm": 0.04246943536811293, "learning_rate": 5.1213169987194506e-05, "loss": 0.7376, "step": 3990 }, { "epoch": 0.7568079564290788, "grad_norm": 0.04255248262700945, "learning_rate": 5.08404342051856e-05, "loss": 0.769, "step": 3995 }, { "epoch": 0.7577551503670377, "grad_norm": 0.038324457218321194, "learning_rate": 5.04687828163015e-05, "loss": 0.7171, "step": 4000 }, { "epoch": 0.7587023443049965, "grad_norm": 0.04478588785492951, "learning_rate": 5.0098219884862265e-05, "loss": 0.764, "step": 4005 }, { "epoch": 0.7596495382429552, "grad_norm": 0.040448163186758916, "learning_rate": 4.9728749463284634e-05, "loss": 0.7416, "step": 4010 }, { "epoch": 0.7605967321809141, "grad_norm": 0.04321236917872768, "learning_rate": 4.936037559203806e-05, "loss": 0.754, "step": 4015 }, { "epoch": 0.7615439261188729, "grad_norm": 0.041244116901498824, "learning_rate": 4.899310229960002e-05, "loss": 0.745, "step": 4020 }, { "epoch": 0.7624911200568316, "grad_norm": 0.04064461271404752, "learning_rate": 4.862693360241259e-05, "loss": 0.7351, "step": 4025 }, { "epoch": 0.7634383139947905, "grad_norm": 0.03781319026471734, "learning_rate": 4.826187350483783e-05, "loss": 0.7307, "step": 4030 }, { "epoch": 0.7643855079327492, "grad_norm": 0.03974939076217195, "learning_rate": 4.789792599911453e-05, "loss": 0.7438, "step": 4035 }, { "epoch": 0.765332701870708, "grad_norm": 0.04065565436383743, "learning_rate": 4.753509506531436e-05, "loss": 0.7636, "step": 4040 }, { "epoch": 0.7662798958086668, "grad_norm": 0.04062956705746941, "learning_rate": 4.717338467129813e-05, "loss": 0.7569, "step": 4045 }, { "epoch": 0.7672270897466256, "grad_norm": 0.041349177857734835, "learning_rate": 4.6812798772672936e-05, "loss": 0.7026, "step": 4050 }, { "epoch": 0.7681742836845844, "grad_norm": 0.04354632478814716, "learning_rate": 4.645334131274828e-05, "loss": 0.7145, "step": 4055 }, { "epoch": 0.7691214776225432, "grad_norm": 0.03936898140216215, "learning_rate": 4.609501622249343e-05, "loss": 0.7286, "step": 4060 }, { "epoch": 0.770068671560502, "grad_norm": 0.03929846725341178, "learning_rate": 4.573782742049407e-05, "loss": 0.7304, "step": 4065 }, { "epoch": 0.7710158654984608, "grad_norm": 0.04124206397422393, "learning_rate": 4.538177881290973e-05, "loss": 0.7306, "step": 4070 }, { "epoch": 0.7719630594364196, "grad_norm": 0.042221939835918945, "learning_rate": 4.502687429343106e-05, "loss": 0.7519, "step": 4075 }, { "epoch": 0.7729102533743784, "grad_norm": 0.04130656943974972, "learning_rate": 4.4673117743236884e-05, "loss": 0.7245, "step": 4080 }, { "epoch": 0.7738574473123372, "grad_norm": 0.03913493498829275, "learning_rate": 4.432051303095225e-05, "loss": 0.7487, "step": 4085 }, { "epoch": 0.774804641250296, "grad_norm": 0.037086715638808006, "learning_rate": 4.396906401260573e-05, "loss": 0.7308, "step": 4090 }, { "epoch": 0.7757518351882547, "grad_norm": 0.041005709394066454, "learning_rate": 4.361877453158749e-05, "loss": 0.7222, "step": 4095 }, { "epoch": 0.7766990291262136, "grad_norm": 0.0390323446969787, "learning_rate": 4.3269648418607194e-05, "loss": 0.7187, "step": 4100 }, { "epoch": 0.7776462230641724, "grad_norm": 0.04019349411682957, "learning_rate": 4.29216894916521e-05, "loss": 0.7089, "step": 4105 }, { "epoch": 0.7785934170021311, "grad_norm": 0.04095873534287484, "learning_rate": 4.257490155594528e-05, "loss": 0.7546, "step": 4110 }, { "epoch": 0.77954061094009, "grad_norm": 0.040023174171211935, "learning_rate": 4.2229288403903994e-05, "loss": 0.7151, "step": 4115 }, { "epoch": 0.7804878048780488, "grad_norm": 0.04008455709185164, "learning_rate": 4.188485381509833e-05, "loss": 0.7317, "step": 4120 }, { "epoch": 0.7814349988160075, "grad_norm": 0.041977277262445226, "learning_rate": 4.154160155620977e-05, "loss": 0.73, "step": 4125 }, { "epoch": 0.7823821927539664, "grad_norm": 0.04088401185241922, "learning_rate": 4.119953538099006e-05, "loss": 0.7639, "step": 4130 }, { "epoch": 0.7833293866919252, "grad_norm": 0.037283132614974145, "learning_rate": 4.085865903021999e-05, "loss": 0.7456, "step": 4135 }, { "epoch": 0.7842765806298839, "grad_norm": 0.046399869448183154, "learning_rate": 4.051897623166879e-05, "loss": 0.748, "step": 4140 }, { "epoch": 0.7852237745678428, "grad_norm": 0.03912459186813119, "learning_rate": 4.0180490700053105e-05, "loss": 0.7518, "step": 4145 }, { "epoch": 0.7861709685058016, "grad_norm": 0.04059706834387849, "learning_rate": 3.984320613699648e-05, "loss": 0.7174, "step": 4150 }, { "epoch": 0.7871181624437603, "grad_norm": 0.04279168262844896, "learning_rate": 3.950712623098892e-05, "loss": 0.717, "step": 4155 }, { "epoch": 0.7880653563817192, "grad_norm": 0.04328417157580087, "learning_rate": 3.917225465734632e-05, "loss": 0.7402, "step": 4160 }, { "epoch": 0.789012550319678, "grad_norm": 0.03907178607474956, "learning_rate": 3.883859507817061e-05, "loss": 0.7109, "step": 4165 }, { "epoch": 0.7899597442576367, "grad_norm": 0.03842633012724885, "learning_rate": 3.850615114230949e-05, "loss": 0.7565, "step": 4170 }, { "epoch": 0.7909069381955955, "grad_norm": 0.04237253188344206, "learning_rate": 3.81749264853166e-05, "loss": 0.7489, "step": 4175 }, { "epoch": 0.7918541321335544, "grad_norm": 0.04212676511762711, "learning_rate": 3.784492472941173e-05, "loss": 0.7506, "step": 4180 }, { "epoch": 0.7928013260715131, "grad_norm": 0.040564694587832524, "learning_rate": 3.751614948344116e-05, "loss": 0.7594, "step": 4185 }, { "epoch": 0.7937485200094719, "grad_norm": 0.03846320035915876, "learning_rate": 3.718860434283832e-05, "loss": 0.7416, "step": 4190 }, { "epoch": 0.7946957139474308, "grad_norm": 0.04058426964727867, "learning_rate": 3.686229288958442e-05, "loss": 0.7703, "step": 4195 }, { "epoch": 0.7956429078853895, "grad_norm": 0.03767236001284344, "learning_rate": 3.653721869216926e-05, "loss": 0.7344, "step": 4200 }, { "epoch": 0.7965901018233483, "grad_norm": 0.03990592712721354, "learning_rate": 3.621338530555207e-05, "loss": 0.7329, "step": 4205 }, { "epoch": 0.7975372957613072, "grad_norm": 0.037505327559176606, "learning_rate": 3.589079627112298e-05, "loss": 0.7033, "step": 4210 }, { "epoch": 0.7984844896992659, "grad_norm": 0.03592112011887372, "learning_rate": 3.5569455116663944e-05, "loss": 0.75, "step": 4215 }, { "epoch": 0.7994316836372247, "grad_norm": 0.037737720756345204, "learning_rate": 3.524936535631036e-05, "loss": 0.7178, "step": 4220 }, { "epoch": 0.8003788775751836, "grad_norm": 0.041074083535645345, "learning_rate": 3.49305304905126e-05, "loss": 0.7296, "step": 4225 }, { "epoch": 0.8013260715131423, "grad_norm": 0.037206587197529284, "learning_rate": 3.461295400599759e-05, "loss": 0.7318, "step": 4230 }, { "epoch": 0.8022732654511011, "grad_norm": 0.04155270440503186, "learning_rate": 3.429663937573095e-05, "loss": 0.7643, "step": 4235 }, { "epoch": 0.80322045938906, "grad_norm": 0.04200718034951944, "learning_rate": 3.3981590058878764e-05, "loss": 0.7303, "step": 4240 }, { "epoch": 0.8041676533270187, "grad_norm": 0.0391722359803136, "learning_rate": 3.36678095007699e-05, "loss": 0.7551, "step": 4245 }, { "epoch": 0.8051148472649775, "grad_norm": 0.04162052982671362, "learning_rate": 3.335530113285832e-05, "loss": 0.7429, "step": 4250 }, { "epoch": 0.8060620412029363, "grad_norm": 0.038860916165607134, "learning_rate": 3.304406837268538e-05, "loss": 0.7304, "step": 4255 }, { "epoch": 0.8070092351408951, "grad_norm": 0.03765860345115411, "learning_rate": 3.2734114623842714e-05, "loss": 0.7541, "step": 4260 }, { "epoch": 0.8079564290788539, "grad_norm": 0.03985364166742812, "learning_rate": 3.242544327593487e-05, "loss": 0.7159, "step": 4265 }, { "epoch": 0.8089036230168127, "grad_norm": 0.04096122915397548, "learning_rate": 3.211805770454229e-05, "loss": 0.7494, "step": 4270 }, { "epoch": 0.8098508169547715, "grad_norm": 0.03718014396684494, "learning_rate": 3.181196127118425e-05, "loss": 0.7228, "step": 4275 }, { "epoch": 0.8107980108927303, "grad_norm": 0.037725347863418725, "learning_rate": 3.150715732328235e-05, "loss": 0.7507, "step": 4280 }, { "epoch": 0.8117452048306891, "grad_norm": 0.04354928220465528, "learning_rate": 3.120364919412374e-05, "loss": 0.744, "step": 4285 }, { "epoch": 0.8126923987686479, "grad_norm": 0.03829035376988075, "learning_rate": 3.090144020282469e-05, "loss": 0.7497, "step": 4290 }, { "epoch": 0.8136395927066067, "grad_norm": 0.04107370767206898, "learning_rate": 3.060053365429433e-05, "loss": 0.7087, "step": 4295 }, { "epoch": 0.8145867866445655, "grad_norm": 0.04270910227284078, "learning_rate": 3.030093283919841e-05, "loss": 0.7301, "step": 4300 }, { "epoch": 0.8155339805825242, "grad_norm": 0.040080109853962084, "learning_rate": 3.000264103392348e-05, "loss": 0.7113, "step": 4305 }, { "epoch": 0.8164811745204831, "grad_norm": 0.03923392398314568, "learning_rate": 2.9705661500540916e-05, "loss": 0.7235, "step": 4310 }, { "epoch": 0.8174283684584419, "grad_norm": 0.04092925596400112, "learning_rate": 2.9409997486771332e-05, "loss": 0.7086, "step": 4315 }, { "epoch": 0.8183755623964006, "grad_norm": 0.03723866823377466, "learning_rate": 2.911565222594904e-05, "loss": 0.7154, "step": 4320 }, { "epoch": 0.8193227563343595, "grad_norm": 0.04026831218954151, "learning_rate": 2.8822628936986576e-05, "loss": 0.7166, "step": 4325 }, { "epoch": 0.8202699502723183, "grad_norm": 0.03924390038270272, "learning_rate": 2.8530930824339725e-05, "loss": 0.7114, "step": 4330 }, { "epoch": 0.821217144210277, "grad_norm": 0.03857672055842463, "learning_rate": 2.8240561077972336e-05, "loss": 0.7275, "step": 4335 }, { "epoch": 0.8221643381482359, "grad_norm": 0.03813437644170459, "learning_rate": 2.795152287332143e-05, "loss": 0.7407, "step": 4340 }, { "epoch": 0.8231115320861947, "grad_norm": 0.039511538349970426, "learning_rate": 2.766381937126246e-05, "loss": 0.7224, "step": 4345 }, { "epoch": 0.8240587260241534, "grad_norm": 0.04035022629840493, "learning_rate": 2.737745371807484e-05, "loss": 0.7226, "step": 4350 }, { "epoch": 0.8250059199621123, "grad_norm": 0.041748400769491566, "learning_rate": 2.7092429045407493e-05, "loss": 0.7076, "step": 4355 }, { "epoch": 0.8259531139000711, "grad_norm": 0.04074616252719918, "learning_rate": 2.6808748470244596e-05, "loss": 0.733, "step": 4360 }, { "epoch": 0.8269003078380298, "grad_norm": 0.040304189290435735, "learning_rate": 2.6526415094871456e-05, "loss": 0.7275, "step": 4365 }, { "epoch": 0.8278475017759886, "grad_norm": 0.04056642286405685, "learning_rate": 2.624543200684059e-05, "loss": 0.7419, "step": 4370 }, { "epoch": 0.8287946957139475, "grad_norm": 0.044474637104153496, "learning_rate": 2.5965802278938104e-05, "loss": 0.7029, "step": 4375 }, { "epoch": 0.8297418896519062, "grad_norm": 0.040065168017582885, "learning_rate": 2.5687528969149797e-05, "loss": 0.7375, "step": 4380 }, { "epoch": 0.830689083589865, "grad_norm": 0.039500986446779594, "learning_rate": 2.541061512062808e-05, "loss": 0.7475, "step": 4385 }, { "epoch": 0.8316362775278239, "grad_norm": 0.04336888291320547, "learning_rate": 2.5135063761658465e-05, "loss": 0.7506, "step": 4390 }, { "epoch": 0.8325834714657826, "grad_norm": 0.04238948284006484, "learning_rate": 2.4860877905626385e-05, "loss": 0.7072, "step": 4395 }, { "epoch": 0.8335306654037414, "grad_norm": 0.03937330992282642, "learning_rate": 2.4588060550984517e-05, "loss": 0.7271, "step": 4400 }, { "epoch": 0.8344778593417003, "grad_norm": 0.04329748465343663, "learning_rate": 2.4316614681219616e-05, "loss": 0.7726, "step": 4405 }, { "epoch": 0.835425053279659, "grad_norm": 0.04153391947066848, "learning_rate": 2.4046543264820367e-05, "loss": 0.7623, "step": 4410 }, { "epoch": 0.8363722472176178, "grad_norm": 0.03864210650963097, "learning_rate": 2.3777849255244402e-05, "loss": 0.7335, "step": 4415 }, { "epoch": 0.8373194411555765, "grad_norm": 0.03605009140484424, "learning_rate": 2.3510535590886464e-05, "loss": 0.7185, "step": 4420 }, { "epoch": 0.8382666350935354, "grad_norm": 0.04343264310904382, "learning_rate": 2.324460519504584e-05, "loss": 0.7278, "step": 4425 }, { "epoch": 0.8392138290314942, "grad_norm": 0.04222856281924883, "learning_rate": 2.298006097589478e-05, "loss": 0.7272, "step": 4430 }, { "epoch": 0.8401610229694529, "grad_norm": 0.03693060964307286, "learning_rate": 2.2716905826446553e-05, "loss": 0.728, "step": 4435 }, { "epoch": 0.8411082169074118, "grad_norm": 0.03882562220045244, "learning_rate": 2.2455142624523632e-05, "loss": 0.7228, "step": 4440 }, { "epoch": 0.8420554108453706, "grad_norm": 0.04087387623558309, "learning_rate": 2.2194774232726492e-05, "loss": 0.7155, "step": 4445 }, { "epoch": 0.8430026047833293, "grad_norm": 0.04058338081432673, "learning_rate": 2.193580349840211e-05, "loss": 0.7023, "step": 4450 }, { "epoch": 0.8439497987212882, "grad_norm": 0.04404654162614751, "learning_rate": 2.167823325361297e-05, "loss": 0.6959, "step": 4455 }, { "epoch": 0.844896992659247, "grad_norm": 0.03855188004859097, "learning_rate": 2.1422066315106007e-05, "loss": 0.7258, "step": 4460 }, { "epoch": 0.8458441865972057, "grad_norm": 0.03928313344333361, "learning_rate": 2.1167305484281814e-05, "loss": 0.7372, "step": 4465 }, { "epoch": 0.8467913805351646, "grad_norm": 0.03940286973106492, "learning_rate": 2.0913953547164058e-05, "loss": 0.7163, "step": 4470 }, { "epoch": 0.8477385744731234, "grad_norm": 0.04323326221582775, "learning_rate": 2.0662013274368854e-05, "loss": 0.7378, "step": 4475 }, { "epoch": 0.8486857684110821, "grad_norm": 0.03858572389634398, "learning_rate": 2.041148742107471e-05, "loss": 0.7397, "step": 4480 }, { "epoch": 0.849632962349041, "grad_norm": 0.040751992473504785, "learning_rate": 2.0162378726992222e-05, "loss": 0.7581, "step": 4485 }, { "epoch": 0.8505801562869998, "grad_norm": 0.03829683763062958, "learning_rate": 1.9914689916334175e-05, "loss": 0.6946, "step": 4490 }, { "epoch": 0.8515273502249585, "grad_norm": 0.03695727697699672, "learning_rate": 1.9668423697785656e-05, "loss": 0.7331, "step": 4495 }, { "epoch": 0.8524745441629173, "grad_norm": 0.03752543633546998, "learning_rate": 1.942358276447462e-05, "loss": 0.7281, "step": 4500 }, { "epoch": 0.8534217381008762, "grad_norm": 0.04068189729909485, "learning_rate": 1.9180169793942272e-05, "loss": 0.7639, "step": 4505 }, { "epoch": 0.8543689320388349, "grad_norm": 0.04048730316852887, "learning_rate": 1.893818744811388e-05, "loss": 0.748, "step": 4510 }, { "epoch": 0.8553161259767937, "grad_norm": 0.04595934306302146, "learning_rate": 1.869763837326963e-05, "loss": 0.7799, "step": 4515 }, { "epoch": 0.8562633199147526, "grad_norm": 0.04070824155336302, "learning_rate": 1.8458525200015593e-05, "loss": 0.7525, "step": 4520 }, { "epoch": 0.8572105138527113, "grad_norm": 0.03956623366079343, "learning_rate": 1.822085054325515e-05, "loss": 0.7159, "step": 4525 }, { "epoch": 0.8581577077906701, "grad_norm": 0.03807785792934746, "learning_rate": 1.798461700216029e-05, "loss": 0.7562, "step": 4530 }, { "epoch": 0.859104901728629, "grad_norm": 0.03756680254096828, "learning_rate": 1.7749827160143164e-05, "loss": 0.7292, "step": 4535 }, { "epoch": 0.8600520956665877, "grad_norm": 0.03976826946310067, "learning_rate": 1.751648358482789e-05, "loss": 0.7282, "step": 4540 }, { "epoch": 0.8609992896045465, "grad_norm": 0.043044198125544504, "learning_rate": 1.7284588828022378e-05, "loss": 0.7152, "step": 4545 }, { "epoch": 0.8619464835425054, "grad_norm": 0.04230346678410999, "learning_rate": 1.7054145425690536e-05, "loss": 0.7297, "step": 4550 }, { "epoch": 0.8628936774804641, "grad_norm": 0.03893557149385341, "learning_rate": 1.6825155897924513e-05, "loss": 0.7239, "step": 4555 }, { "epoch": 0.8638408714184229, "grad_norm": 0.037068694217922235, "learning_rate": 1.6597622748917132e-05, "loss": 0.7142, "step": 4560 }, { "epoch": 0.8647880653563818, "grad_norm": 0.03991740786553955, "learning_rate": 1.6371548466934385e-05, "loss": 0.7308, "step": 4565 }, { "epoch": 0.8657352592943405, "grad_norm": 0.04471672391377206, "learning_rate": 1.6146935524288446e-05, "loss": 0.7301, "step": 4570 }, { "epoch": 0.8666824532322993, "grad_norm": 0.037180999778756545, "learning_rate": 1.5923786377310433e-05, "loss": 0.7203, "step": 4575 }, { "epoch": 0.8676296471702581, "grad_norm": 0.04146480156497774, "learning_rate": 1.5702103466323708e-05, "loss": 0.7119, "step": 4580 }, { "epoch": 0.8685768411082169, "grad_norm": 0.036947710794122034, "learning_rate": 1.5481889215617073e-05, "loss": 0.7196, "step": 4585 }, { "epoch": 0.8695240350461757, "grad_norm": 0.03548597961029977, "learning_rate": 1.5263146033418227e-05, "loss": 0.7051, "step": 4590 }, { "epoch": 0.8704712289841345, "grad_norm": 0.03955821537818952, "learning_rate": 1.5045876311867628e-05, "loss": 0.7206, "step": 4595 }, { "epoch": 0.8714184229220933, "grad_norm": 0.038962735470389774, "learning_rate": 1.4830082426992112e-05, "loss": 0.7266, "step": 4600 }, { "epoch": 0.8723656168600521, "grad_norm": 0.03843209348318749, "learning_rate": 1.4615766738679036e-05, "loss": 0.7236, "step": 4605 }, { "epoch": 0.8733128107980109, "grad_norm": 0.03583871656951685, "learning_rate": 1.4402931590650462e-05, "loss": 0.7037, "step": 4610 }, { "epoch": 0.8742600047359697, "grad_norm": 0.03814447238860827, "learning_rate": 1.4191579310437412e-05, "loss": 0.7142, "step": 4615 }, { "epoch": 0.8752071986739285, "grad_norm": 0.04131298607004148, "learning_rate": 1.398171220935459e-05, "loss": 0.7472, "step": 4620 }, { "epoch": 0.8761543926118873, "grad_norm": 0.04027601558619238, "learning_rate": 1.3773332582474995e-05, "loss": 0.7222, "step": 4625 }, { "epoch": 0.877101586549846, "grad_norm": 0.03807892715571328, "learning_rate": 1.356644270860487e-05, "loss": 0.724, "step": 4630 }, { "epoch": 0.8780487804878049, "grad_norm": 0.037140756994390095, "learning_rate": 1.3361044850258657e-05, "loss": 0.7313, "step": 4635 }, { "epoch": 0.8789959744257637, "grad_norm": 0.040598401756478456, "learning_rate": 1.3157141253634469e-05, "loss": 0.7418, "step": 4640 }, { "epoch": 0.8799431683637224, "grad_norm": 0.03636159365334358, "learning_rate": 1.2954734148589369e-05, "loss": 0.733, "step": 4645 }, { "epoch": 0.8808903623016813, "grad_norm": 0.04100817868151093, "learning_rate": 1.2753825748615032e-05, "loss": 0.715, "step": 4650 }, { "epoch": 0.8818375562396401, "grad_norm": 0.040249623789949056, "learning_rate": 1.255441825081354e-05, "loss": 0.7177, "step": 4655 }, { "epoch": 0.8827847501775988, "grad_norm": 0.038473649906406296, "learning_rate": 1.235651383587331e-05, "loss": 0.735, "step": 4660 }, { "epoch": 0.8837319441155577, "grad_norm": 0.03716433351702097, "learning_rate": 1.2160114668045335e-05, "loss": 0.7109, "step": 4665 }, { "epoch": 0.8846791380535165, "grad_norm": 0.0366065266385142, "learning_rate": 1.1965222895119442e-05, "loss": 0.7098, "step": 4670 }, { "epoch": 0.8856263319914752, "grad_norm": 0.041156901959537195, "learning_rate": 1.1771840648400849e-05, "loss": 0.7422, "step": 4675 }, { "epoch": 0.8865735259294341, "grad_norm": 0.03951261248655999, "learning_rate": 1.1579970042686843e-05, "loss": 0.7434, "step": 4680 }, { "epoch": 0.8875207198673929, "grad_norm": 0.042505748782389406, "learning_rate": 1.1389613176243567e-05, "loss": 0.7422, "step": 4685 }, { "epoch": 0.8884679138053516, "grad_norm": 0.036559177807256635, "learning_rate": 1.1200772130783259e-05, "loss": 0.6995, "step": 4690 }, { "epoch": 0.8894151077433105, "grad_norm": 0.038540183149753306, "learning_rate": 1.1013448971441313e-05, "loss": 0.7386, "step": 4695 }, { "epoch": 0.8903623016812693, "grad_norm": 0.03778323009855525, "learning_rate": 1.0827645746753837e-05, "loss": 0.7293, "step": 4700 }, { "epoch": 0.891309495619228, "grad_norm": 0.037960112685593676, "learning_rate": 1.064336448863507e-05, "loss": 0.7132, "step": 4705 }, { "epoch": 0.8922566895571868, "grad_norm": 0.038143343588638426, "learning_rate": 1.0460607212355343e-05, "loss": 0.7157, "step": 4710 }, { "epoch": 0.8932038834951457, "grad_norm": 0.03885540215502875, "learning_rate": 1.0279375916518956e-05, "loss": 0.7329, "step": 4715 }, { "epoch": 0.8941510774331044, "grad_norm": 0.036527462926812734, "learning_rate": 1.0099672583042306e-05, "loss": 0.706, "step": 4720 }, { "epoch": 0.8950982713710632, "grad_norm": 0.04170086636332326, "learning_rate": 9.921499177132325e-06, "loss": 0.7159, "step": 4725 }, { "epoch": 0.8960454653090221, "grad_norm": 0.03917784068162278, "learning_rate": 9.744857647264743e-06, "loss": 0.7151, "step": 4730 }, { "epoch": 0.8969926592469808, "grad_norm": 0.0368623059179063, "learning_rate": 9.56974992516309e-06, "loss": 0.7175, "step": 4735 }, { "epoch": 0.8979398531849396, "grad_norm": 0.03761895854461036, "learning_rate": 9.396177925777315e-06, "loss": 0.7376, "step": 4740 }, { "epoch": 0.8988870471228985, "grad_norm": 0.03943918919221873, "learning_rate": 9.224143547263018e-06, "loss": 0.727, "step": 4745 }, { "epoch": 0.8998342410608572, "grad_norm": 0.041135228489504315, "learning_rate": 9.053648670960634e-06, "loss": 0.7079, "step": 4750 }, { "epoch": 0.900781434998816, "grad_norm": 0.037659487176961826, "learning_rate": 8.88469516137476e-06, "loss": 0.719, "step": 4755 }, { "epoch": 0.9017286289367749, "grad_norm": 0.03937100685930776, "learning_rate": 8.717284866153967e-06, "loss": 0.704, "step": 4760 }, { "epoch": 0.9026758228747336, "grad_norm": 0.038814959136852366, "learning_rate": 8.551419616070322e-06, "loss": 0.7329, "step": 4765 }, { "epoch": 0.9036230168126924, "grad_norm": 0.03791616596156232, "learning_rate": 8.387101224999738e-06, "loss": 0.7544, "step": 4770 }, { "epoch": 0.9045702107506512, "grad_norm": 0.036920295113180866, "learning_rate": 8.224331489901747e-06, "loss": 0.7353, "step": 4775 }, { "epoch": 0.90551740468861, "grad_norm": 0.04102354744430632, "learning_rate": 8.063112190800114e-06, "loss": 0.743, "step": 4780 }, { "epoch": 0.9064645986265688, "grad_norm": 0.039647668406205074, "learning_rate": 7.903445090763278e-06, "loss": 0.7288, "step": 4785 }, { "epoch": 0.9074117925645276, "grad_norm": 0.037692364175638995, "learning_rate": 7.745331935885008e-06, "loss": 0.7185, "step": 4790 }, { "epoch": 0.9083589865024864, "grad_norm": 0.036226337301176685, "learning_rate": 7.588774455265517e-06, "loss": 0.7396, "step": 4795 }, { "epoch": 0.9093061804404452, "grad_norm": 0.03741364399731542, "learning_rate": 7.433774360992279e-06, "loss": 0.7226, "step": 4800 }, { "epoch": 0.9102533743784039, "grad_norm": 0.03775216663355994, "learning_rate": 7.280333348121503e-06, "loss": 0.716, "step": 4805 }, { "epoch": 0.9112005683163628, "grad_norm": 0.03958684677935284, "learning_rate": 7.128453094659508e-06, "loss": 0.7364, "step": 4810 }, { "epoch": 0.9121477622543216, "grad_norm": 0.03816974922462119, "learning_rate": 6.978135261544398e-06, "loss": 0.726, "step": 4815 }, { "epoch": 0.9130949561922803, "grad_norm": 0.03916663794581213, "learning_rate": 6.829381492627978e-06, "loss": 0.7091, "step": 4820 }, { "epoch": 0.9140421501302392, "grad_norm": 0.03819017711079349, "learning_rate": 6.682193414657583e-06, "loss": 0.7225, "step": 4825 }, { "epoch": 0.914989344068198, "grad_norm": 0.03825337879044159, "learning_rate": 6.5365726372584805e-06, "loss": 0.7167, "step": 4830 }, { "epoch": 0.9159365380061567, "grad_norm": 0.03783436042390378, "learning_rate": 6.392520752916097e-06, "loss": 0.7425, "step": 4835 }, { "epoch": 0.9168837319441155, "grad_norm": 0.03893124825143902, "learning_rate": 6.2500393369588505e-06, "loss": 0.7272, "step": 4840 }, { "epoch": 0.9178309258820744, "grad_norm": 0.03729369559567071, "learning_rate": 6.109129947540631e-06, "loss": 0.741, "step": 4845 }, { "epoch": 0.9187781198200331, "grad_norm": 0.03912193568036812, "learning_rate": 5.969794125623928e-06, "loss": 0.7276, "step": 4850 }, { "epoch": 0.9197253137579919, "grad_norm": 0.03810873022721955, "learning_rate": 5.832033394963015e-06, "loss": 0.7231, "step": 4855 }, { "epoch": 0.9206725076959508, "grad_norm": 0.03730402627420899, "learning_rate": 5.69584926208711e-06, "loss": 0.7047, "step": 4860 }, { "epoch": 0.9216197016339095, "grad_norm": 0.03667813336307107, "learning_rate": 5.561243216284139e-06, "loss": 0.7152, "step": 4865 }, { "epoch": 0.9225668955718683, "grad_norm": 0.03964877444312505, "learning_rate": 5.4282167295842e-06, "loss": 0.7151, "step": 4870 }, { "epoch": 0.9235140895098272, "grad_norm": 0.04352873821823642, "learning_rate": 5.296771256743676e-06, "loss": 0.7148, "step": 4875 }, { "epoch": 0.9244612834477859, "grad_norm": 0.04158839264995075, "learning_rate": 5.166908235229178e-06, "loss": 0.699, "step": 4880 }, { "epoch": 0.9254084773857447, "grad_norm": 0.03955083160304519, "learning_rate": 5.038629085201878e-06, "loss": 0.727, "step": 4885 }, { "epoch": 0.9263556713237036, "grad_norm": 0.044593469584970485, "learning_rate": 4.911935209502072e-06, "loss": 0.7399, "step": 4890 }, { "epoch": 0.9273028652616623, "grad_norm": 0.03845070002008913, "learning_rate": 4.786827993633635e-06, "loss": 0.7197, "step": 4895 }, { "epoch": 0.9282500591996211, "grad_norm": 0.037419281999128515, "learning_rate": 4.663308805749061e-06, "loss": 0.7318, "step": 4900 }, { "epoch": 0.92919725313758, "grad_norm": 0.03903737099095032, "learning_rate": 4.541378996634382e-06, "loss": 0.7339, "step": 4905 }, { "epoch": 0.9301444470755387, "grad_norm": 0.03913683799560393, "learning_rate": 4.421039899694468e-06, "loss": 0.7229, "step": 4910 }, { "epoch": 0.9310916410134975, "grad_norm": 0.03740496457952301, "learning_rate": 4.302292830938403e-06, "loss": 0.7138, "step": 4915 }, { "epoch": 0.9320388349514563, "grad_norm": 0.03949065539672211, "learning_rate": 4.185139088965083e-06, "loss": 0.7036, "step": 4920 }, { "epoch": 0.9329860288894151, "grad_norm": 0.03718724530040875, "learning_rate": 4.06957995494911e-06, "loss": 0.7241, "step": 4925 }, { "epoch": 0.9339332228273739, "grad_norm": 0.038984046127228444, "learning_rate": 3.955616692626612e-06, "loss": 0.7132, "step": 4930 }, { "epoch": 0.9348804167653327, "grad_norm": 0.03705946169516573, "learning_rate": 3.843250548281584e-06, "loss": 0.7205, "step": 4935 }, { "epoch": 0.9358276107032915, "grad_norm": 0.04056507468322176, "learning_rate": 3.7324827507321907e-06, "loss": 0.7095, "step": 4940 }, { "epoch": 0.9367748046412503, "grad_norm": 0.03838686948370597, "learning_rate": 3.62331451131731e-06, "loss": 0.7402, "step": 4945 }, { "epoch": 0.9377219985792091, "grad_norm": 0.03735322865661384, "learning_rate": 3.5157470238832975e-06, "loss": 0.7113, "step": 4950 }, { "epoch": 0.9386691925171678, "grad_norm": 0.03865946802077697, "learning_rate": 3.4097814647709775e-06, "loss": 0.7327, "step": 4955 }, { "epoch": 0.9396163864551267, "grad_norm": 0.03750616620805965, "learning_rate": 3.3054189928027386e-06, "loss": 0.7078, "step": 4960 }, { "epoch": 0.9405635803930855, "grad_norm": 0.040179517311287265, "learning_rate": 3.202660749269842e-06, "loss": 0.7168, "step": 4965 }, { "epoch": 0.9415107743310442, "grad_norm": 0.038488343198023446, "learning_rate": 3.1015078579199992e-06, "loss": 0.7263, "step": 4970 }, { "epoch": 0.9424579682690031, "grad_norm": 0.03846115414340801, "learning_rate": 3.0019614249449818e-06, "loss": 0.7396, "step": 4975 }, { "epoch": 0.9434051622069619, "grad_norm": 0.03891780338800125, "learning_rate": 2.9040225389686477e-06, "loss": 0.7197, "step": 4980 }, { "epoch": 0.9443523561449206, "grad_norm": 0.038745010189377524, "learning_rate": 2.8076922710349836e-06, "loss": 0.6982, "step": 4985 }, { "epoch": 0.9452995500828795, "grad_norm": 0.03993490506698604, "learning_rate": 2.7129716745963316e-06, "loss": 0.6958, "step": 4990 }, { "epoch": 0.9462467440208383, "grad_norm": 0.03974439268947578, "learning_rate": 2.6198617855020143e-06, "loss": 0.7312, "step": 4995 }, { "epoch": 0.947193937958797, "grad_norm": 0.04146912204153186, "learning_rate": 2.5283636219867954e-06, "loss": 0.7385, "step": 5000 }, { "epoch": 0.9481411318967559, "grad_norm": 0.03707229834469002, "learning_rate": 2.43847818465997e-06, "loss": 0.7345, "step": 5005 }, { "epoch": 0.9490883258347147, "grad_norm": 0.039133364688565486, "learning_rate": 2.3502064564942578e-06, "loss": 0.7075, "step": 5010 }, { "epoch": 0.9500355197726734, "grad_norm": 0.03622200599308819, "learning_rate": 2.263549402815179e-06, "loss": 0.6983, "step": 5015 }, { "epoch": 0.9509827137106323, "grad_norm": 0.038295665871847934, "learning_rate": 2.1785079712903275e-06, "loss": 0.7334, "step": 5020 }, { "epoch": 0.9519299076485911, "grad_norm": 0.03915418927573118, "learning_rate": 2.095083091919214e-06, "loss": 0.7372, "step": 5025 }, { "epoch": 0.9528771015865498, "grad_norm": 0.03728899024964666, "learning_rate": 2.0132756770229576e-06, "loss": 0.7046, "step": 5030 }, { "epoch": 0.9538242955245086, "grad_norm": 0.03775405052284275, "learning_rate": 1.9330866212343086e-06, "loss": 0.7143, "step": 5035 }, { "epoch": 0.9547714894624675, "grad_norm": 0.038310827788718325, "learning_rate": 1.8545168014879764e-06, "loss": 0.7111, "step": 5040 }, { "epoch": 0.9557186834004262, "grad_norm": 0.03835494831636439, "learning_rate": 1.777567077010883e-06, "loss": 0.7398, "step": 5045 }, { "epoch": 0.956665877338385, "grad_norm": 0.038374737236019835, "learning_rate": 1.7022382893129072e-06, "loss": 0.7149, "step": 5050 }, { "epoch": 0.9576130712763439, "grad_norm": 0.039365530128439526, "learning_rate": 1.6285312621775903e-06, "loss": 0.7074, "step": 5055 }, { "epoch": 0.9585602652143026, "grad_norm": 0.038447131703807806, "learning_rate": 1.5564468016531773e-06, "loss": 0.7531, "step": 5060 }, { "epoch": 0.9595074591522614, "grad_norm": 0.03768227162346853, "learning_rate": 1.48598569604379e-06, "loss": 0.7215, "step": 5065 }, { "epoch": 0.9604546530902203, "grad_norm": 0.03691120803905496, "learning_rate": 1.4171487159007843e-06, "loss": 0.7037, "step": 5070 }, { "epoch": 0.961401847028179, "grad_norm": 0.03847682888310551, "learning_rate": 1.349936614014341e-06, "loss": 0.7386, "step": 5075 }, { "epoch": 0.9623490409661378, "grad_norm": 0.038562458996163625, "learning_rate": 1.2843501254052368e-06, "loss": 0.7145, "step": 5080 }, { "epoch": 0.9632962349040967, "grad_norm": 0.039915946654852284, "learning_rate": 1.2203899673168205e-06, "loss": 0.723, "step": 5085 }, { "epoch": 0.9642434288420554, "grad_norm": 0.03842715857163489, "learning_rate": 1.1580568392071e-06, "loss": 0.7092, "step": 5090 }, { "epoch": 0.9651906227800142, "grad_norm": 0.040087737408694465, "learning_rate": 1.0973514227412161e-06, "loss": 0.7467, "step": 5095 }, { "epoch": 0.966137816717973, "grad_norm": 0.03868018943060032, "learning_rate": 1.038274381783849e-06, "loss": 0.7034, "step": 5100 }, { "epoch": 0.9670850106559318, "grad_norm": 0.03888690917055077, "learning_rate": 9.80826362392073e-07, "loss": 0.724, "step": 5105 }, { "epoch": 0.9680322045938906, "grad_norm": 0.0361796924871166, "learning_rate": 9.250079928082132e-07, "loss": 0.736, "step": 5110 }, { "epoch": 0.9689793985318494, "grad_norm": 0.03965669973295369, "learning_rate": 8.708198834530166e-07, "loss": 0.7261, "step": 5115 }, { "epoch": 0.9699265924698082, "grad_norm": 0.039430492673631544, "learning_rate": 8.182626269189752e-07, "loss": 0.7242, "step": 5120 }, { "epoch": 0.970873786407767, "grad_norm": 0.0348287855119034, "learning_rate": 7.673367979637968e-07, "loss": 0.7082, "step": 5125 }, { "epoch": 0.9718209803457258, "grad_norm": 0.03706016619862877, "learning_rate": 7.180429535042276e-07, "loss": 0.7518, "step": 5130 }, { "epoch": 0.9727681742836846, "grad_norm": 0.04056580951745248, "learning_rate": 6.703816326098399e-07, "loss": 0.7401, "step": 5135 }, { "epoch": 0.9737153682216434, "grad_norm": 0.04032169255601853, "learning_rate": 6.24353356497187e-07, "loss": 0.7491, "step": 5140 }, { "epoch": 0.9746625621596022, "grad_norm": 0.04118720081247295, "learning_rate": 5.799586285241242e-07, "loss": 0.7569, "step": 5145 }, { "epoch": 0.975609756097561, "grad_norm": 0.039992603944723155, "learning_rate": 5.371979341843136e-07, "loss": 0.719, "step": 5150 }, { "epoch": 0.9765569500355198, "grad_norm": 0.038975715445144526, "learning_rate": 4.960717411018277e-07, "loss": 0.7183, "step": 5155 }, { "epoch": 0.9775041439734786, "grad_norm": 0.03558255003932097, "learning_rate": 4.565804990261379e-07, "loss": 0.72, "step": 5160 }, { "epoch": 0.9784513379114373, "grad_norm": 0.041836063593222644, "learning_rate": 4.187246398271171e-07, "loss": 0.7227, "step": 5165 }, { "epoch": 0.9793985318493962, "grad_norm": 0.0401155869593642, "learning_rate": 3.825045774904112e-07, "loss": 0.723, "step": 5170 }, { "epoch": 0.980345725787355, "grad_norm": 0.04241569076307143, "learning_rate": 3.4792070811280884e-07, "loss": 0.7329, "step": 5175 }, { "epoch": 0.9812929197253137, "grad_norm": 0.04084926695436618, "learning_rate": 3.149734098979617e-07, "loss": 0.7126, "step": 5180 }, { "epoch": 0.9822401136632726, "grad_norm": 0.03939583534788028, "learning_rate": 2.83663043152238e-07, "loss": 0.7426, "step": 5185 }, { "epoch": 0.9831873076012313, "grad_norm": 0.039426635966572685, "learning_rate": 2.5398995028079184e-07, "loss": 0.7086, "step": 5190 }, { "epoch": 0.9841345015391901, "grad_norm": 0.03761530845106783, "learning_rate": 2.2595445578381665e-07, "loss": 0.7132, "step": 5195 }, { "epoch": 0.985081695477149, "grad_norm": 0.03983372031881901, "learning_rate": 1.9955686625299782e-07, "loss": 0.7041, "step": 5200 }, { "epoch": 0.9860288894151077, "grad_norm": 0.038745755637894036, "learning_rate": 1.7479747036813207e-07, "loss": 0.7565, "step": 5205 }, { "epoch": 0.9869760833530665, "grad_norm": 0.03451565749276735, "learning_rate": 1.5167653889401332e-07, "loss": 0.7053, "step": 5210 }, { "epoch": 0.9879232772910254, "grad_norm": 0.0377767563169325, "learning_rate": 1.3019432467743508e-07, "loss": 0.7165, "step": 5215 }, { "epoch": 0.9888704712289841, "grad_norm": 0.03919747343677102, "learning_rate": 1.1035106264445925e-07, "loss": 0.7135, "step": 5220 }, { "epoch": 0.9898176651669429, "grad_norm": 0.035742270061366835, "learning_rate": 9.214696979781833e-08, "loss": 0.7422, "step": 5225 }, { "epoch": 0.9907648591049018, "grad_norm": 0.03988070344903895, "learning_rate": 7.558224521455048e-08, "loss": 0.7176, "step": 5230 }, { "epoch": 0.9917120530428605, "grad_norm": 0.03844068057098456, "learning_rate": 6.065707004383468e-08, "loss": 0.7275, "step": 5235 }, { "epoch": 0.9926592469808193, "grad_norm": 0.037145775071304184, "learning_rate": 4.737160750500901e-08, "loss": 0.7158, "step": 5240 }, { "epoch": 0.9936064409187781, "grad_norm": 0.03801708244408533, "learning_rate": 3.572600288572203e-08, "loss": 0.7286, "step": 5245 }, { "epoch": 0.9945536348567369, "grad_norm": 0.03898719180494596, "learning_rate": 2.5720383540484002e-08, "loss": 0.7208, "step": 5250 }, { "epoch": 0.9955008287946957, "grad_norm": 0.03854350834493005, "learning_rate": 1.7354858889134793e-08, "loss": 0.7347, "step": 5255 }, { "epoch": 0.9964480227326545, "grad_norm": 0.039328700531041635, "learning_rate": 1.0629520415694759e-08, "loss": 0.7603, "step": 5260 }, { "epoch": 0.9973952166706133, "grad_norm": 0.03842843734298471, "learning_rate": 5.544441667398869e-09, "loss": 0.7204, "step": 5265 }, { "epoch": 0.9983424106085721, "grad_norm": 0.03655798359456603, "learning_rate": 2.099678253847381e-09, "loss": 0.7179, "step": 5270 }, { "epoch": 0.9992896045465309, "grad_norm": 0.03754524737942178, "learning_rate": 2.952678464229752e-10, "loss": 0.6995, "step": 5275 }, { "epoch": 0.9998579209093061, "eval_loss": 1.1249996423721313, "eval_runtime": 1040.1086, "eval_samples_per_second": 188.598, "eval_steps_per_second": 5.895, "step": 5278 }, { "epoch": 0.9998579209093061, "step": 5278, "total_flos": 768453779423232.0, "train_loss": 0.8248233945137535, "train_runtime": 21690.489, "train_samples_per_second": 31.151, "train_steps_per_second": 0.243 } ], "logging_steps": 5, "max_steps": 5278, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 768453779423232.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }