{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004058853373921867, "grad_norm": 1.6877160845613697, "learning_rate": 6.756756756756757e-08, "loss": 0.3778, "step": 2 }, { "epoch": 0.008117706747843734, "grad_norm": 1.7651022477491063, "learning_rate": 2.0270270270270273e-07, "loss": 0.3539, "step": 4 }, { "epoch": 0.0121765601217656, "grad_norm": 1.7346867297246225, "learning_rate": 3.378378378378379e-07, "loss": 0.3621, "step": 6 }, { "epoch": 0.016235413495687467, "grad_norm": 1.572133422531505, "learning_rate": 4.7297297297297305e-07, "loss": 0.3345, "step": 8 }, { "epoch": 0.020294266869609334, "grad_norm": 1.619733499528023, "learning_rate": 6.081081081081082e-07, "loss": 0.3408, "step": 10 }, { "epoch": 0.0243531202435312, "grad_norm": 1.658160676313538, "learning_rate": 7.432432432432434e-07, "loss": 0.351, "step": 12 }, { "epoch": 0.028411973617453068, "grad_norm": 1.5596628408438766, "learning_rate": 8.783783783783785e-07, "loss": 0.3319, "step": 14 }, { "epoch": 0.032470826991374935, "grad_norm": 1.582923343783565, "learning_rate": 1.0135135135135136e-06, "loss": 0.3631, "step": 16 }, { "epoch": 0.0365296803652968, "grad_norm": 1.6210099160307392, "learning_rate": 1.148648648648649e-06, "loss": 0.3369, "step": 18 }, { "epoch": 0.04058853373921867, "grad_norm": 1.4490925925277196, "learning_rate": 1.2837837837837838e-06, "loss": 0.3401, "step": 20 }, { "epoch": 0.044647387113140535, "grad_norm": 1.5981937222734808, "learning_rate": 1.418918918918919e-06, "loss": 0.4064, "step": 22 }, { "epoch": 0.0487062404870624, "grad_norm": 1.4745472664304582, "learning_rate": 1.5540540540540541e-06, "loss": 0.3511, "step": 24 }, { "epoch": 0.05276509386098427, "grad_norm": 1.5094086215074392, "learning_rate": 1.6891891891891894e-06, "loss": 0.3303, "step": 26 }, { "epoch": 0.056823947234906136, "grad_norm": 1.585436570063055, "learning_rate": 1.8243243243243245e-06, "loss": 0.3458, "step": 28 }, { "epoch": 0.060882800608828, "grad_norm": 1.738359897683998, "learning_rate": 1.9594594594594595e-06, "loss": 0.3401, "step": 30 }, { "epoch": 0.06494165398274987, "grad_norm": 1.5432250105335408, "learning_rate": 2.0945945945945946e-06, "loss": 0.341, "step": 32 }, { "epoch": 0.06900050735667174, "grad_norm": 1.4879837682144732, "learning_rate": 2.22972972972973e-06, "loss": 0.333, "step": 34 }, { "epoch": 0.0730593607305936, "grad_norm": 1.5249793712374056, "learning_rate": 2.364864864864865e-06, "loss": 0.3389, "step": 36 }, { "epoch": 0.07711821410451547, "grad_norm": 1.5591368574163726, "learning_rate": 2.5e-06, "loss": 0.3422, "step": 38 }, { "epoch": 0.08117706747843734, "grad_norm": 1.6074325994660499, "learning_rate": 2.6351351351351353e-06, "loss": 0.348, "step": 40 }, { "epoch": 0.0852359208523592, "grad_norm": 1.517968202412236, "learning_rate": 2.7702702702702703e-06, "loss": 0.3376, "step": 42 }, { "epoch": 0.08929477422628107, "grad_norm": 1.5371668709250539, "learning_rate": 2.9054054054054054e-06, "loss": 0.3556, "step": 44 }, { "epoch": 0.09335362760020294, "grad_norm": 1.4812012090460671, "learning_rate": 3.040540540540541e-06, "loss": 0.3524, "step": 46 }, { "epoch": 0.0974124809741248, "grad_norm": 1.5363097735419804, "learning_rate": 3.1756756756756755e-06, "loss": 0.3279, "step": 48 }, { "epoch": 0.10147133434804667, "grad_norm": 1.4293275584526721, "learning_rate": 3.310810810810811e-06, "loss": 0.3063, "step": 50 }, { "epoch": 0.10553018772196854, "grad_norm": 1.5566072184509325, "learning_rate": 3.445945945945946e-06, "loss": 0.3453, "step": 52 }, { "epoch": 0.1095890410958904, "grad_norm": 1.411331335157791, "learning_rate": 3.5810810810810816e-06, "loss": 0.3151, "step": 54 }, { "epoch": 0.11364789446981227, "grad_norm": 1.559975757164133, "learning_rate": 3.7162162162162162e-06, "loss": 0.3267, "step": 56 }, { "epoch": 0.11770674784373414, "grad_norm": 1.3575124310454543, "learning_rate": 3.851351351351352e-06, "loss": 0.3426, "step": 58 }, { "epoch": 0.121765601217656, "grad_norm": 1.566074257576769, "learning_rate": 3.986486486486487e-06, "loss": 0.3431, "step": 60 }, { "epoch": 0.12582445459157787, "grad_norm": 1.4663548045652957, "learning_rate": 4.121621621621622e-06, "loss": 0.3328, "step": 62 }, { "epoch": 0.12988330796549974, "grad_norm": 1.4786594077137154, "learning_rate": 4.256756756756757e-06, "loss": 0.318, "step": 64 }, { "epoch": 0.1339421613394216, "grad_norm": 1.4943486857884478, "learning_rate": 4.391891891891892e-06, "loss": 0.3457, "step": 66 }, { "epoch": 0.13800101471334347, "grad_norm": 1.4843598946184238, "learning_rate": 4.527027027027027e-06, "loss": 0.329, "step": 68 }, { "epoch": 0.14205986808726534, "grad_norm": 1.4234589903170214, "learning_rate": 4.6621621621621625e-06, "loss": 0.3346, "step": 70 }, { "epoch": 0.1461187214611872, "grad_norm": 1.363410197784669, "learning_rate": 4.797297297297297e-06, "loss": 0.33, "step": 72 }, { "epoch": 0.15017757483510907, "grad_norm": 1.3795399226185014, "learning_rate": 4.932432432432433e-06, "loss": 0.3111, "step": 74 }, { "epoch": 0.15423642820903094, "grad_norm": 1.4630782525423722, "learning_rate": 5.067567567567568e-06, "loss": 0.3164, "step": 76 }, { "epoch": 0.1582952815829528, "grad_norm": 1.535896727076301, "learning_rate": 5.202702702702704e-06, "loss": 0.3462, "step": 78 }, { "epoch": 0.16235413495687467, "grad_norm": 1.3384695460866296, "learning_rate": 5.337837837837838e-06, "loss": 0.328, "step": 80 }, { "epoch": 0.16641298833079654, "grad_norm": 1.564060215758407, "learning_rate": 5.472972972972973e-06, "loss": 0.3356, "step": 82 }, { "epoch": 0.1704718417047184, "grad_norm": 1.5875357299856645, "learning_rate": 5.608108108108109e-06, "loss": 0.3317, "step": 84 }, { "epoch": 0.17453069507864027, "grad_norm": 1.3150986310423165, "learning_rate": 5.743243243243244e-06, "loss": 0.3197, "step": 86 }, { "epoch": 0.17858954845256214, "grad_norm": 1.4780350536596663, "learning_rate": 5.8783783783783786e-06, "loss": 0.3328, "step": 88 }, { "epoch": 0.182648401826484, "grad_norm": 1.3565098348860962, "learning_rate": 6.013513513513514e-06, "loss": 0.3081, "step": 90 }, { "epoch": 0.18670725520040587, "grad_norm": 1.5123567866175038, "learning_rate": 6.1486486486486495e-06, "loss": 0.3276, "step": 92 }, { "epoch": 0.19076610857432774, "grad_norm": 1.4884022484673987, "learning_rate": 6.283783783783784e-06, "loss": 0.3523, "step": 94 }, { "epoch": 0.1948249619482496, "grad_norm": 1.601046629607006, "learning_rate": 6.41891891891892e-06, "loss": 0.3612, "step": 96 }, { "epoch": 0.19888381532217148, "grad_norm": 1.479212704609007, "learning_rate": 6.554054054054054e-06, "loss": 0.3106, "step": 98 }, { "epoch": 0.20294266869609334, "grad_norm": 1.4220008074976263, "learning_rate": 6.689189189189191e-06, "loss": 0.3441, "step": 100 }, { "epoch": 0.2070015220700152, "grad_norm": 1.3514096746438489, "learning_rate": 6.824324324324325e-06, "loss": 0.334, "step": 102 }, { "epoch": 0.21106037544393708, "grad_norm": 1.4495742654024877, "learning_rate": 6.95945945945946e-06, "loss": 0.3365, "step": 104 }, { "epoch": 0.21511922881785894, "grad_norm": 1.4742675513138555, "learning_rate": 7.0945945945945946e-06, "loss": 0.3306, "step": 106 }, { "epoch": 0.2191780821917808, "grad_norm": 1.5180836151687533, "learning_rate": 7.229729729729731e-06, "loss": 0.3435, "step": 108 }, { "epoch": 0.22323693556570268, "grad_norm": 1.54712083151063, "learning_rate": 7.3648648648648655e-06, "loss": 0.3698, "step": 110 }, { "epoch": 0.22729578893962454, "grad_norm": 1.5589085128625726, "learning_rate": 7.500000000000001e-06, "loss": 0.3209, "step": 112 }, { "epoch": 0.2313546423135464, "grad_norm": 1.5013917554759992, "learning_rate": 7.635135135135135e-06, "loss": 0.3488, "step": 114 }, { "epoch": 0.23541349568746828, "grad_norm": 1.4511795332039656, "learning_rate": 7.77027027027027e-06, "loss": 0.3506, "step": 116 }, { "epoch": 0.23947234906139014, "grad_norm": 1.4069378206541412, "learning_rate": 7.905405405405406e-06, "loss": 0.3481, "step": 118 }, { "epoch": 0.243531202435312, "grad_norm": 1.582455691561815, "learning_rate": 8.040540540540541e-06, "loss": 0.3603, "step": 120 }, { "epoch": 0.24759005580923388, "grad_norm": 1.434757897777904, "learning_rate": 8.175675675675677e-06, "loss": 0.3351, "step": 122 }, { "epoch": 0.25164890918315574, "grad_norm": 1.4655011777161737, "learning_rate": 8.31081081081081e-06, "loss": 0.3579, "step": 124 }, { "epoch": 0.2557077625570776, "grad_norm": 1.2773884939900029, "learning_rate": 8.445945945945948e-06, "loss": 0.3344, "step": 126 }, { "epoch": 0.2597666159309995, "grad_norm": 1.566232859360701, "learning_rate": 8.581081081081082e-06, "loss": 0.3671, "step": 128 }, { "epoch": 0.26382546930492135, "grad_norm": 1.600278240352609, "learning_rate": 8.716216216216217e-06, "loss": 0.3668, "step": 130 }, { "epoch": 0.2678843226788432, "grad_norm": 1.4285015615854608, "learning_rate": 8.851351351351351e-06, "loss": 0.3701, "step": 132 }, { "epoch": 0.2719431760527651, "grad_norm": 1.567165427429299, "learning_rate": 8.986486486486488e-06, "loss": 0.3718, "step": 134 }, { "epoch": 0.27600202942668695, "grad_norm": 1.4657335810014254, "learning_rate": 9.121621621621622e-06, "loss": 0.3576, "step": 136 }, { "epoch": 0.2800608828006088, "grad_norm": 1.5291620734959124, "learning_rate": 9.256756756756757e-06, "loss": 0.3838, "step": 138 }, { "epoch": 0.2841197361745307, "grad_norm": 1.4553153344151037, "learning_rate": 9.391891891891893e-06, "loss": 0.3782, "step": 140 }, { "epoch": 0.28817858954845255, "grad_norm": 1.406160463172771, "learning_rate": 9.527027027027028e-06, "loss": 0.3666, "step": 142 }, { "epoch": 0.2922374429223744, "grad_norm": 1.5917073052853832, "learning_rate": 9.662162162162164e-06, "loss": 0.392, "step": 144 }, { "epoch": 0.2962962962962963, "grad_norm": 1.511564904546133, "learning_rate": 9.797297297297298e-06, "loss": 0.3852, "step": 146 }, { "epoch": 0.30035514967021815, "grad_norm": 1.4461988102226726, "learning_rate": 9.932432432432433e-06, "loss": 0.3947, "step": 148 }, { "epoch": 0.30441400304414, "grad_norm": 1.5436908100507405, "learning_rate": 9.999986072170506e-06, "loss": 0.3778, "step": 150 }, { "epoch": 0.3084728564180619, "grad_norm": 1.413238952587016, "learning_rate": 9.99987465000011e-06, "loss": 0.3634, "step": 152 }, { "epoch": 0.31253170979198375, "grad_norm": 1.1989032000359305, "learning_rate": 9.999651808142305e-06, "loss": 0.3629, "step": 154 }, { "epoch": 0.3165905631659056, "grad_norm": 1.2868027887610292, "learning_rate": 9.999317551563011e-06, "loss": 0.3674, "step": 156 }, { "epoch": 0.3206494165398275, "grad_norm": 1.3871216563915811, "learning_rate": 9.998871887710965e-06, "loss": 0.3844, "step": 158 }, { "epoch": 0.32470826991374935, "grad_norm": 1.532088928705796, "learning_rate": 9.998314826517564e-06, "loss": 0.3986, "step": 160 }, { "epoch": 0.3287671232876712, "grad_norm": 1.5418249912864774, "learning_rate": 9.997646380396633e-06, "loss": 0.3934, "step": 162 }, { "epoch": 0.3328259766615931, "grad_norm": 1.4957436906571129, "learning_rate": 9.996866564244158e-06, "loss": 0.3958, "step": 164 }, { "epoch": 0.33688483003551495, "grad_norm": 1.4005270008180681, "learning_rate": 9.995975395437952e-06, "loss": 0.3697, "step": 166 }, { "epoch": 0.3409436834094368, "grad_norm": 1.3477487401644073, "learning_rate": 9.994972893837259e-06, "loss": 0.382, "step": 168 }, { "epoch": 0.3450025367833587, "grad_norm": 1.4090292300223908, "learning_rate": 9.993859081782322e-06, "loss": 0.3989, "step": 170 }, { "epoch": 0.34906139015728055, "grad_norm": 1.2787360296779213, "learning_rate": 9.992633984093886e-06, "loss": 0.3746, "step": 172 }, { "epoch": 0.3531202435312024, "grad_norm": 1.410889902419896, "learning_rate": 9.991297628072632e-06, "loss": 0.3965, "step": 174 }, { "epoch": 0.3571790969051243, "grad_norm": 1.5306391383373583, "learning_rate": 9.98985004349858e-06, "loss": 0.418, "step": 176 }, { "epoch": 0.36123795027904615, "grad_norm": 1.412306543323218, "learning_rate": 9.988291262630425e-06, "loss": 0.3954, "step": 178 }, { "epoch": 0.365296803652968, "grad_norm": 1.376967456626685, "learning_rate": 9.986621320204813e-06, "loss": 0.3944, "step": 180 }, { "epoch": 0.3693556570268899, "grad_norm": 1.325650682628611, "learning_rate": 9.984840253435569e-06, "loss": 0.396, "step": 182 }, { "epoch": 0.37341451040081175, "grad_norm": 1.248201726411196, "learning_rate": 9.982948102012866e-06, "loss": 0.3783, "step": 184 }, { "epoch": 0.3774733637747336, "grad_norm": 1.3492739121359127, "learning_rate": 9.98094490810235e-06, "loss": 0.4078, "step": 186 }, { "epoch": 0.3815322171486555, "grad_norm": 1.4387342464186235, "learning_rate": 9.978830716344185e-06, "loss": 0.3892, "step": 188 }, { "epoch": 0.38559107052257735, "grad_norm": 1.2509510227667138, "learning_rate": 9.976605573852071e-06, "loss": 0.3696, "step": 190 }, { "epoch": 0.3896499238964992, "grad_norm": 1.3154238912694323, "learning_rate": 9.974269530212185e-06, "loss": 0.405, "step": 192 }, { "epoch": 0.3937087772704211, "grad_norm": 1.5250960877423454, "learning_rate": 9.971822637482085e-06, "loss": 0.4135, "step": 194 }, { "epoch": 0.39776763064434295, "grad_norm": 1.3908097291583248, "learning_rate": 9.969264950189539e-06, "loss": 0.4006, "step": 196 }, { "epoch": 0.4018264840182648, "grad_norm": 1.4790280124932251, "learning_rate": 9.966596525331324e-06, "loss": 0.4188, "step": 198 }, { "epoch": 0.4058853373921867, "grad_norm": 1.4233060958120591, "learning_rate": 9.96381742237194e-06, "loss": 0.4042, "step": 200 }, { "epoch": 0.40994419076610855, "grad_norm": 1.3044712016817912, "learning_rate": 9.960927703242298e-06, "loss": 0.3956, "step": 202 }, { "epoch": 0.4140030441400304, "grad_norm": 1.2296559721817601, "learning_rate": 9.957927432338332e-06, "loss": 0.3813, "step": 204 }, { "epoch": 0.4180618975139523, "grad_norm": 1.247811043333453, "learning_rate": 9.954816676519569e-06, "loss": 0.3846, "step": 206 }, { "epoch": 0.42212075088787415, "grad_norm": 1.4552540186289, "learning_rate": 9.951595505107633e-06, "loss": 0.3826, "step": 208 }, { "epoch": 0.426179604261796, "grad_norm": 1.3877999146640874, "learning_rate": 9.948263989884708e-06, "loss": 0.4118, "step": 210 }, { "epoch": 0.4302384576357179, "grad_norm": 1.3634788475367725, "learning_rate": 9.944822205091929e-06, "loss": 0.3974, "step": 212 }, { "epoch": 0.43429731100963975, "grad_norm": 1.2479391778044153, "learning_rate": 9.94127022742774e-06, "loss": 0.3784, "step": 214 }, { "epoch": 0.4383561643835616, "grad_norm": 1.2120926044150644, "learning_rate": 9.937608136046171e-06, "loss": 0.3857, "step": 216 }, { "epoch": 0.4424150177574835, "grad_norm": 1.363599562180868, "learning_rate": 9.933836012555083e-06, "loss": 0.4089, "step": 218 }, { "epoch": 0.44647387113140535, "grad_norm": 1.169807478788221, "learning_rate": 9.929953941014349e-06, "loss": 0.3649, "step": 220 }, { "epoch": 0.4505327245053272, "grad_norm": 1.3307716150293825, "learning_rate": 9.925962007933975e-06, "loss": 0.4093, "step": 222 }, { "epoch": 0.4545915778792491, "grad_norm": 1.2092559857310445, "learning_rate": 9.921860302272184e-06, "loss": 0.3959, "step": 224 }, { "epoch": 0.45865043125317095, "grad_norm": 1.40047176469619, "learning_rate": 9.917648915433413e-06, "loss": 0.4271, "step": 226 }, { "epoch": 0.4627092846270928, "grad_norm": 1.2607295390446736, "learning_rate": 9.9133279412663e-06, "loss": 0.3963, "step": 228 }, { "epoch": 0.4667681380010147, "grad_norm": 1.2887307875981555, "learning_rate": 9.908897476061576e-06, "loss": 0.4128, "step": 230 }, { "epoch": 0.47082699137493655, "grad_norm": 1.3538240839163793, "learning_rate": 9.904357618549925e-06, "loss": 0.4032, "step": 232 }, { "epoch": 0.4748858447488584, "grad_norm": 1.2106198378461424, "learning_rate": 9.899708469899786e-06, "loss": 0.402, "step": 234 }, { "epoch": 0.4789446981227803, "grad_norm": 1.214556237570029, "learning_rate": 9.894950133715094e-06, "loss": 0.4079, "step": 236 }, { "epoch": 0.48300355149670215, "grad_norm": 1.3315305311943295, "learning_rate": 9.89008271603297e-06, "loss": 0.3908, "step": 238 }, { "epoch": 0.487062404870624, "grad_norm": 1.2949635168468638, "learning_rate": 9.885106325321371e-06, "loss": 0.418, "step": 240 }, { "epoch": 0.4911212582445459, "grad_norm": 1.3622823844909377, "learning_rate": 9.880021072476651e-06, "loss": 0.4136, "step": 242 }, { "epoch": 0.49518011161846776, "grad_norm": 1.298645600673932, "learning_rate": 9.874827070821112e-06, "loss": 0.4037, "step": 244 }, { "epoch": 0.4992389649923896, "grad_norm": 1.1869885168664382, "learning_rate": 9.869524436100458e-06, "loss": 0.3723, "step": 246 }, { "epoch": 0.5032978183663115, "grad_norm": 1.1960510107751574, "learning_rate": 9.864113286481237e-06, "loss": 0.3665, "step": 248 }, { "epoch": 0.5073566717402334, "grad_norm": 1.4192185941613773, "learning_rate": 9.85859374254819e-06, "loss": 0.4243, "step": 250 }, { "epoch": 0.5114155251141552, "grad_norm": 1.2602840725571196, "learning_rate": 9.852965927301573e-06, "loss": 0.3945, "step": 252 }, { "epoch": 0.5154743784880771, "grad_norm": 1.3076466906647164, "learning_rate": 9.847229966154415e-06, "loss": 0.4303, "step": 254 }, { "epoch": 0.519533231861999, "grad_norm": 1.2994672624028094, "learning_rate": 9.841385986929716e-06, "loss": 0.4223, "step": 256 }, { "epoch": 0.5235920852359208, "grad_norm": 1.2718506455560323, "learning_rate": 9.835434119857612e-06, "loss": 0.4124, "step": 258 }, { "epoch": 0.5276509386098427, "grad_norm": 1.233515929409816, "learning_rate": 9.829374497572461e-06, "loss": 0.4156, "step": 260 }, { "epoch": 0.5317097919837646, "grad_norm": 1.192750940555937, "learning_rate": 9.823207255109891e-06, "loss": 0.3865, "step": 262 }, { "epoch": 0.5357686453576864, "grad_norm": 1.2146025999616998, "learning_rate": 9.816932529903795e-06, "loss": 0.381, "step": 264 }, { "epoch": 0.5398274987316083, "grad_norm": 1.254427394175359, "learning_rate": 9.810550461783261e-06, "loss": 0.4209, "step": 266 }, { "epoch": 0.5438863521055302, "grad_norm": 1.2007483465859314, "learning_rate": 9.804061192969465e-06, "loss": 0.3935, "step": 268 }, { "epoch": 0.547945205479452, "grad_norm": 1.1962317702272547, "learning_rate": 9.797464868072489e-06, "loss": 0.4055, "step": 270 }, { "epoch": 0.5520040588533739, "grad_norm": 1.2388775998656307, "learning_rate": 9.790761634088108e-06, "loss": 0.4016, "step": 272 }, { "epoch": 0.5560629122272958, "grad_norm": 1.1565724419815788, "learning_rate": 9.78395164039452e-06, "loss": 0.4066, "step": 274 }, { "epoch": 0.5601217656012176, "grad_norm": 1.287795256739133, "learning_rate": 9.777035038749002e-06, "loss": 0.4072, "step": 276 }, { "epoch": 0.5641806189751395, "grad_norm": 1.1990919314621633, "learning_rate": 9.77001198328453e-06, "loss": 0.385, "step": 278 }, { "epoch": 0.5682394723490614, "grad_norm": 1.1155810438092542, "learning_rate": 9.762882630506366e-06, "loss": 0.4138, "step": 280 }, { "epoch": 0.5722983257229832, "grad_norm": 1.270188734252511, "learning_rate": 9.75564713928854e-06, "loss": 0.4108, "step": 282 }, { "epoch": 0.5763571790969051, "grad_norm": 1.2854062183745893, "learning_rate": 9.748305670870326e-06, "loss": 0.4105, "step": 284 }, { "epoch": 0.580416032470827, "grad_norm": 1.2822199195202089, "learning_rate": 9.740858388852652e-06, "loss": 0.4187, "step": 286 }, { "epoch": 0.5844748858447488, "grad_norm": 1.1789452806981648, "learning_rate": 9.733305459194444e-06, "loss": 0.4026, "step": 288 }, { "epoch": 0.5885337392186707, "grad_norm": 1.2063791823863752, "learning_rate": 9.725647050208936e-06, "loss": 0.4194, "step": 290 }, { "epoch": 0.5925925925925926, "grad_norm": 1.1212621894773256, "learning_rate": 9.717883332559911e-06, "loss": 0.4043, "step": 292 }, { "epoch": 0.5966514459665144, "grad_norm": 1.236354295472038, "learning_rate": 9.710014479257906e-06, "loss": 0.4279, "step": 294 }, { "epoch": 0.6007102993404363, "grad_norm": 1.230960872966148, "learning_rate": 9.702040665656353e-06, "loss": 0.417, "step": 296 }, { "epoch": 0.6047691527143582, "grad_norm": 1.302936449552778, "learning_rate": 9.693962069447669e-06, "loss": 0.4399, "step": 298 }, { "epoch": 0.60882800608828, "grad_norm": 1.1296630845707911, "learning_rate": 9.685778870659301e-06, "loss": 0.4024, "step": 300 }, { "epoch": 0.6128868594622019, "grad_norm": 1.1746225535864936, "learning_rate": 9.677491251649711e-06, "loss": 0.3912, "step": 302 }, { "epoch": 0.6169457128361238, "grad_norm": 1.241320530846212, "learning_rate": 9.669099397104314e-06, "loss": 0.4174, "step": 304 }, { "epoch": 0.6210045662100456, "grad_norm": 1.2219507615770004, "learning_rate": 9.660603494031358e-06, "loss": 0.3918, "step": 306 }, { "epoch": 0.6250634195839675, "grad_norm": 1.1589702070871013, "learning_rate": 9.652003731757763e-06, "loss": 0.4157, "step": 308 }, { "epoch": 0.6291222729578894, "grad_norm": 1.1220293339629992, "learning_rate": 9.643300301924902e-06, "loss": 0.4015, "step": 310 }, { "epoch": 0.6331811263318112, "grad_norm": 1.2563582002979947, "learning_rate": 9.634493398484319e-06, "loss": 0.4128, "step": 312 }, { "epoch": 0.6372399797057331, "grad_norm": 1.1888367524986483, "learning_rate": 9.625583217693419e-06, "loss": 0.3874, "step": 314 }, { "epoch": 0.641298833079655, "grad_norm": 1.1925360068598152, "learning_rate": 9.616569958111097e-06, "loss": 0.4219, "step": 316 }, { "epoch": 0.6453576864535768, "grad_norm": 1.2776062593085378, "learning_rate": 9.607453820593297e-06, "loss": 0.4138, "step": 318 }, { "epoch": 0.6494165398274987, "grad_norm": 1.157480079096016, "learning_rate": 9.598235008288551e-06, "loss": 0.4075, "step": 320 }, { "epoch": 0.6534753932014206, "grad_norm": 1.2352282756489477, "learning_rate": 9.58891372663345e-06, "loss": 0.4111, "step": 322 }, { "epoch": 0.6575342465753424, "grad_norm": 1.2837461432435215, "learning_rate": 9.579490183348052e-06, "loss": 0.4358, "step": 324 }, { "epoch": 0.6615930999492643, "grad_norm": 1.172789813592292, "learning_rate": 9.56996458843128e-06, "loss": 0.3986, "step": 326 }, { "epoch": 0.6656519533231862, "grad_norm": 1.194020795966964, "learning_rate": 9.56033715415621e-06, "loss": 0.4075, "step": 328 }, { "epoch": 0.669710806697108, "grad_norm": 1.0964374769088712, "learning_rate": 9.550608095065367e-06, "loss": 0.4071, "step": 330 }, { "epoch": 0.6737696600710299, "grad_norm": 1.089373021702181, "learning_rate": 9.540777627965933e-06, "loss": 0.3957, "step": 332 }, { "epoch": 0.6778285134449518, "grad_norm": 1.1992667011972529, "learning_rate": 9.53084597192491e-06, "loss": 0.4158, "step": 334 }, { "epoch": 0.6818873668188736, "grad_norm": 1.2172349749770106, "learning_rate": 9.520813348264252e-06, "loss": 0.4277, "step": 336 }, { "epoch": 0.6859462201927955, "grad_norm": 1.2574902319962946, "learning_rate": 9.510679980555922e-06, "loss": 0.3995, "step": 338 }, { "epoch": 0.6900050735667174, "grad_norm": 1.131615777672815, "learning_rate": 9.500446094616911e-06, "loss": 0.4005, "step": 340 }, { "epoch": 0.6940639269406392, "grad_norm": 1.246895890559624, "learning_rate": 9.490111918504213e-06, "loss": 0.4169, "step": 342 }, { "epoch": 0.6981227803145611, "grad_norm": 1.181624286199365, "learning_rate": 9.479677682509737e-06, "loss": 0.3986, "step": 344 }, { "epoch": 0.702181633688483, "grad_norm": 1.132690259540531, "learning_rate": 9.469143619155172e-06, "loss": 0.3923, "step": 346 }, { "epoch": 0.7062404870624048, "grad_norm": 1.047890655047983, "learning_rate": 9.458509963186815e-06, "loss": 0.4043, "step": 348 }, { "epoch": 0.7102993404363267, "grad_norm": 1.147246283887197, "learning_rate": 9.44777695157033e-06, "loss": 0.4066, "step": 350 }, { "epoch": 0.7143581938102486, "grad_norm": 1.215824000969317, "learning_rate": 9.436944823485475e-06, "loss": 0.4146, "step": 352 }, { "epoch": 0.7184170471841704, "grad_norm": 1.1437849010452408, "learning_rate": 9.426013820320764e-06, "loss": 0.4206, "step": 354 }, { "epoch": 0.7224759005580923, "grad_norm": 1.1210034262809383, "learning_rate": 9.414984185668097e-06, "loss": 0.3991, "step": 356 }, { "epoch": 0.7265347539320142, "grad_norm": 1.3628388125490938, "learning_rate": 9.403856165317322e-06, "loss": 0.4359, "step": 358 }, { "epoch": 0.730593607305936, "grad_norm": 1.236630161906545, "learning_rate": 9.392630007250769e-06, "loss": 0.4415, "step": 360 }, { "epoch": 0.7346524606798579, "grad_norm": 1.1213292521942286, "learning_rate": 9.381305961637713e-06, "loss": 0.4219, "step": 362 }, { "epoch": 0.7387113140537798, "grad_norm": 1.209577588106072, "learning_rate": 9.369884280828806e-06, "loss": 0.4308, "step": 364 }, { "epoch": 0.7427701674277016, "grad_norm": 1.3143877988319919, "learning_rate": 9.358365219350448e-06, "loss": 0.4376, "step": 366 }, { "epoch": 0.7468290208016235, "grad_norm": 1.1794072785475278, "learning_rate": 9.346749033899121e-06, "loss": 0.4331, "step": 368 }, { "epoch": 0.7508878741755454, "grad_norm": 1.1813808165518036, "learning_rate": 9.335035983335667e-06, "loss": 0.3992, "step": 370 }, { "epoch": 0.7549467275494672, "grad_norm": 1.1568780935799914, "learning_rate": 9.323226328679512e-06, "loss": 0.4044, "step": 372 }, { "epoch": 0.7590055809233891, "grad_norm": 1.0660018515522698, "learning_rate": 9.311320333102864e-06, "loss": 0.3954, "step": 374 }, { "epoch": 0.763064434297311, "grad_norm": 1.108920689047685, "learning_rate": 9.299318261924834e-06, "loss": 0.3998, "step": 376 }, { "epoch": 0.7671232876712328, "grad_norm": 1.136789158664533, "learning_rate": 9.287220382605532e-06, "loss": 0.4042, "step": 378 }, { "epoch": 0.7711821410451547, "grad_norm": 1.2496770566654822, "learning_rate": 9.275026964740101e-06, "loss": 0.4067, "step": 380 }, { "epoch": 0.7752409944190766, "grad_norm": 1.1996061958131852, "learning_rate": 9.262738280052715e-06, "loss": 0.4183, "step": 382 }, { "epoch": 0.7792998477929984, "grad_norm": 1.2149866273575285, "learning_rate": 9.250354602390523e-06, "loss": 0.4409, "step": 384 }, { "epoch": 0.7833587011669203, "grad_norm": 1.1177682668450932, "learning_rate": 9.237876207717538e-06, "loss": 0.4029, "step": 386 }, { "epoch": 0.7874175545408422, "grad_norm": 1.1677530585024758, "learning_rate": 9.225303374108503e-06, "loss": 0.4178, "step": 388 }, { "epoch": 0.791476407914764, "grad_norm": 1.3678602117567324, "learning_rate": 9.212636381742676e-06, "loss": 0.4197, "step": 390 }, { "epoch": 0.7955352612886859, "grad_norm": 1.189997795752436, "learning_rate": 9.199875512897602e-06, "loss": 0.4173, "step": 392 }, { "epoch": 0.7995941146626078, "grad_norm": 1.186213329132832, "learning_rate": 9.187021051942814e-06, "loss": 0.4145, "step": 394 }, { "epoch": 0.8036529680365296, "grad_norm": 1.203493805658719, "learning_rate": 9.174073285333498e-06, "loss": 0.4181, "step": 396 }, { "epoch": 0.8077118214104515, "grad_norm": 1.175802247814532, "learning_rate": 9.161032501604106e-06, "loss": 0.3949, "step": 398 }, { "epoch": 0.8117706747843734, "grad_norm": 1.212190555266731, "learning_rate": 9.147898991361936e-06, "loss": 0.4076, "step": 400 }, { "epoch": 0.8158295281582952, "grad_norm": 1.191973289112244, "learning_rate": 9.134673047280644e-06, "loss": 0.4233, "step": 402 }, { "epoch": 0.8198883815322171, "grad_norm": 1.2279490044480763, "learning_rate": 9.121354964093732e-06, "loss": 0.4127, "step": 404 }, { "epoch": 0.823947234906139, "grad_norm": 1.151451234197627, "learning_rate": 9.107945038587974e-06, "loss": 0.4226, "step": 406 }, { "epoch": 0.8280060882800608, "grad_norm": 1.119728494545527, "learning_rate": 9.094443569596802e-06, "loss": 0.4033, "step": 408 }, { "epoch": 0.8320649416539827, "grad_norm": 1.13012343405543, "learning_rate": 9.08085085799365e-06, "loss": 0.4088, "step": 410 }, { "epoch": 0.8361237950279046, "grad_norm": 1.159098363094475, "learning_rate": 9.067167206685248e-06, "loss": 0.4124, "step": 412 }, { "epoch": 0.8401826484018264, "grad_norm": 1.2347110694455659, "learning_rate": 9.05339292060487e-06, "loss": 0.434, "step": 414 }, { "epoch": 0.8442415017757483, "grad_norm": 1.2402415983547357, "learning_rate": 9.039528306705543e-06, "loss": 0.425, "step": 416 }, { "epoch": 0.8483003551496702, "grad_norm": 1.1683847145500172, "learning_rate": 9.025573673953201e-06, "loss": 0.4423, "step": 418 }, { "epoch": 0.852359208523592, "grad_norm": 0.9967973069250277, "learning_rate": 9.011529333319804e-06, "loss": 0.3987, "step": 420 }, { "epoch": 0.8564180618975139, "grad_norm": 1.0663571132874041, "learning_rate": 8.997395597776404e-06, "loss": 0.3908, "step": 422 }, { "epoch": 0.8604769152714358, "grad_norm": 1.1585376506062766, "learning_rate": 8.98317278228618e-06, "loss": 0.4055, "step": 424 }, { "epoch": 0.8645357686453576, "grad_norm": 1.3272038292907982, "learning_rate": 8.96886120379741e-06, "loss": 0.4241, "step": 426 }, { "epoch": 0.8685946220192795, "grad_norm": 1.1134457404863736, "learning_rate": 8.954461181236406e-06, "loss": 0.4343, "step": 428 }, { "epoch": 0.8726534753932014, "grad_norm": 1.152191927893708, "learning_rate": 8.939973035500418e-06, "loss": 0.4012, "step": 430 }, { "epoch": 0.8767123287671232, "grad_norm": 1.157272959329367, "learning_rate": 8.925397089450473e-06, "loss": 0.4116, "step": 432 }, { "epoch": 0.8807711821410451, "grad_norm": 1.1617646168179858, "learning_rate": 8.910733667904186e-06, "loss": 0.4128, "step": 434 }, { "epoch": 0.884830035514967, "grad_norm": 1.2116957635700267, "learning_rate": 8.895983097628515e-06, "loss": 0.4332, "step": 436 }, { "epoch": 0.8888888888888888, "grad_norm": 1.1950006191376203, "learning_rate": 8.88114570733249e-06, "loss": 0.4005, "step": 438 }, { "epoch": 0.8929477422628107, "grad_norm": 1.1470604791719932, "learning_rate": 8.866221827659876e-06, "loss": 0.4233, "step": 440 }, { "epoch": 0.8970065956367326, "grad_norm": 1.2230557240685258, "learning_rate": 8.851211791181813e-06, "loss": 0.4133, "step": 442 }, { "epoch": 0.9010654490106544, "grad_norm": 1.2680481957203948, "learning_rate": 8.8361159323894e-06, "loss": 0.447, "step": 444 }, { "epoch": 0.9051243023845763, "grad_norm": 1.095625464598396, "learning_rate": 8.820934587686247e-06, "loss": 0.3884, "step": 446 }, { "epoch": 0.9091831557584982, "grad_norm": 1.1796975421785947, "learning_rate": 8.805668095380969e-06, "loss": 0.4139, "step": 448 }, { "epoch": 0.91324200913242, "grad_norm": 1.2337117620848044, "learning_rate": 8.790316795679654e-06, "loss": 0.4258, "step": 450 }, { "epoch": 0.9173008625063419, "grad_norm": 1.060326310057752, "learning_rate": 8.774881030678284e-06, "loss": 0.4039, "step": 452 }, { "epoch": 0.9213597158802638, "grad_norm": 1.0818729217545202, "learning_rate": 8.759361144355103e-06, "loss": 0.4186, "step": 454 }, { "epoch": 0.9254185692541856, "grad_norm": 1.2188647624805096, "learning_rate": 8.74375748256296e-06, "loss": 0.43, "step": 456 }, { "epoch": 0.9294774226281075, "grad_norm": 1.1517012313266344, "learning_rate": 8.728070393021595e-06, "loss": 0.3952, "step": 458 }, { "epoch": 0.9335362760020294, "grad_norm": 1.146308857460623, "learning_rate": 8.712300225309894e-06, "loss": 0.419, "step": 460 }, { "epoch": 0.9375951293759512, "grad_norm": 1.1733285567505642, "learning_rate": 8.6964473308581e-06, "loss": 0.4295, "step": 462 }, { "epoch": 0.9416539827498731, "grad_norm": 1.2133986892308575, "learning_rate": 8.680512062939976e-06, "loss": 0.3994, "step": 464 }, { "epoch": 0.945712836123795, "grad_norm": 1.2860321510839698, "learning_rate": 8.664494776664942e-06, "loss": 0.4305, "step": 466 }, { "epoch": 0.9497716894977168, "grad_norm": 1.222015638603744, "learning_rate": 8.64839582897015e-06, "loss": 0.4247, "step": 468 }, { "epoch": 0.9538305428716387, "grad_norm": 1.1303294072263912, "learning_rate": 8.63221557861254e-06, "loss": 0.414, "step": 470 }, { "epoch": 0.9578893962455606, "grad_norm": 1.1883694044651687, "learning_rate": 8.615954386160836e-06, "loss": 0.3944, "step": 472 }, { "epoch": 0.9619482496194824, "grad_norm": 1.0686371570768038, "learning_rate": 8.599612613987522e-06, "loss": 0.4138, "step": 474 }, { "epoch": 0.9660071029934043, "grad_norm": 1.1520582178885161, "learning_rate": 8.583190626260754e-06, "loss": 0.408, "step": 476 }, { "epoch": 0.9700659563673262, "grad_norm": 1.2111448095961146, "learning_rate": 8.566688788936254e-06, "loss": 0.4326, "step": 478 }, { "epoch": 0.974124809741248, "grad_norm": 1.0960113471423047, "learning_rate": 8.550107469749159e-06, "loss": 0.4095, "step": 480 }, { "epoch": 0.9781836631151699, "grad_norm": 1.2031487005930193, "learning_rate": 8.533447038205805e-06, "loss": 0.4019, "step": 482 }, { "epoch": 0.9822425164890918, "grad_norm": 1.0541006609815473, "learning_rate": 8.516707865575515e-06, "loss": 0.4301, "step": 484 }, { "epoch": 0.9863013698630136, "grad_norm": 1.1625544941021624, "learning_rate": 8.499890324882323e-06, "loss": 0.3998, "step": 486 }, { "epoch": 0.9903602232369355, "grad_norm": 1.3066175946513412, "learning_rate": 8.482994790896645e-06, "loss": 0.4422, "step": 488 }, { "epoch": 0.9944190766108574, "grad_norm": 1.0701571633478897, "learning_rate": 8.466021640126946e-06, "loss": 0.4122, "step": 490 }, { "epoch": 0.9984779299847792, "grad_norm": 1.167166516481942, "learning_rate": 8.448971250811337e-06, "loss": 0.4137, "step": 492 }, { "epoch": 1.002029426686961, "grad_norm": 1.2941849102817737, "learning_rate": 8.431844002909153e-06, "loss": 0.3068, "step": 494 }, { "epoch": 1.0060882800608828, "grad_norm": 1.111084517678234, "learning_rate": 8.414640278092485e-06, "loss": 0.2196, "step": 496 }, { "epoch": 1.0101471334348047, "grad_norm": 1.0562648719597976, "learning_rate": 8.397360459737673e-06, "loss": 0.214, "step": 498 }, { "epoch": 1.0142059868087265, "grad_norm": 1.1997363186960575, "learning_rate": 8.38000493291676e-06, "loss": 0.1968, "step": 500 }, { "epoch": 1.0182648401826484, "grad_norm": 1.173481743208335, "learning_rate": 8.362574084388921e-06, "loss": 0.2037, "step": 502 }, { "epoch": 1.0223236935565703, "grad_norm": 1.0170814335662817, "learning_rate": 8.34506830259183e-06, "loss": 0.1732, "step": 504 }, { "epoch": 1.0263825469304921, "grad_norm": 0.997455021719729, "learning_rate": 8.327487977633013e-06, "loss": 0.198, "step": 506 }, { "epoch": 1.030441400304414, "grad_norm": 1.0596243435147559, "learning_rate": 8.309833501281159e-06, "loss": 0.1968, "step": 508 }, { "epoch": 1.0345002536783359, "grad_norm": 1.1329929736584996, "learning_rate": 8.292105266957372e-06, "loss": 0.2058, "step": 510 }, { "epoch": 1.0385591070522577, "grad_norm": 1.0617871718782863, "learning_rate": 8.274303669726427e-06, "loss": 0.1837, "step": 512 }, { "epoch": 1.0426179604261796, "grad_norm": 1.0158373026810432, "learning_rate": 8.256429106287944e-06, "loss": 0.1937, "step": 514 }, { "epoch": 1.0466768138001015, "grad_norm": 1.1124406488041407, "learning_rate": 8.238481974967567e-06, "loss": 0.2044, "step": 516 }, { "epoch": 1.0507356671740233, "grad_norm": 1.0954828147640017, "learning_rate": 8.220462675708075e-06, "loss": 0.2025, "step": 518 }, { "epoch": 1.0547945205479452, "grad_norm": 1.0243819070320326, "learning_rate": 8.202371610060471e-06, "loss": 0.1944, "step": 520 }, { "epoch": 1.058853373921867, "grad_norm": 1.0760742610687821, "learning_rate": 8.184209181175038e-06, "loss": 0.1949, "step": 522 }, { "epoch": 1.062912227295789, "grad_norm": 1.020386464750481, "learning_rate": 8.165975793792355e-06, "loss": 0.1923, "step": 524 }, { "epoch": 1.0669710806697108, "grad_norm": 1.1029549076667262, "learning_rate": 8.14767185423427e-06, "loss": 0.18, "step": 526 }, { "epoch": 1.0710299340436327, "grad_norm": 1.1869108297914424, "learning_rate": 8.129297770394855e-06, "loss": 0.199, "step": 528 }, { "epoch": 1.0750887874175545, "grad_norm": 1.042008597384453, "learning_rate": 8.11085395173131e-06, "loss": 0.1758, "step": 530 }, { "epoch": 1.0791476407914764, "grad_norm": 1.0824678674361556, "learning_rate": 8.092340809254844e-06, "loss": 0.183, "step": 532 }, { "epoch": 1.0832064941653983, "grad_norm": 1.1733425605990007, "learning_rate": 8.073758755521506e-06, "loss": 0.2001, "step": 534 }, { "epoch": 1.0872653475393201, "grad_norm": 0.989241357527303, "learning_rate": 8.055108204623001e-06, "loss": 0.1854, "step": 536 }, { "epoch": 1.091324200913242, "grad_norm": 1.105028315498873, "learning_rate": 8.03638957217746e-06, "loss": 0.1887, "step": 538 }, { "epoch": 1.0953830542871639, "grad_norm": 1.1215793382714723, "learning_rate": 8.017603275320176e-06, "loss": 0.206, "step": 540 }, { "epoch": 1.0994419076610857, "grad_norm": 1.0430299772389053, "learning_rate": 7.998749732694308e-06, "loss": 0.1852, "step": 542 }, { "epoch": 1.1035007610350076, "grad_norm": 1.0539243906524998, "learning_rate": 7.979829364441555e-06, "loss": 0.1792, "step": 544 }, { "epoch": 1.1075596144089295, "grad_norm": 1.056850242292317, "learning_rate": 7.960842592192792e-06, "loss": 0.1914, "step": 546 }, { "epoch": 1.1116184677828513, "grad_norm": 1.0273529138082944, "learning_rate": 7.94178983905867e-06, "loss": 0.1947, "step": 548 }, { "epoch": 1.1156773211567732, "grad_norm": 1.0677655866471754, "learning_rate": 7.922671529620192e-06, "loss": 0.1901, "step": 550 }, { "epoch": 1.119736174530695, "grad_norm": 1.012213849308213, "learning_rate": 7.903488089919253e-06, "loss": 0.1732, "step": 552 }, { "epoch": 1.123795027904617, "grad_norm": 1.1676973953753516, "learning_rate": 7.88423994744914e-06, "loss": 0.2106, "step": 554 }, { "epoch": 1.1278538812785388, "grad_norm": 1.0599154280202072, "learning_rate": 7.864927531145012e-06, "loss": 0.1868, "step": 556 }, { "epoch": 1.1319127346524607, "grad_norm": 1.0897459769656754, "learning_rate": 7.845551271374333e-06, "loss": 0.1814, "step": 558 }, { "epoch": 1.1359715880263825, "grad_norm": 1.055142014684741, "learning_rate": 7.82611159992729e-06, "loss": 0.1851, "step": 560 }, { "epoch": 1.1400304414003044, "grad_norm": 1.1116965314079303, "learning_rate": 7.80660895000717e-06, "loss": 0.196, "step": 562 }, { "epoch": 1.1440892947742263, "grad_norm": 1.039510707609459, "learning_rate": 7.787043756220698e-06, "loss": 0.1721, "step": 564 }, { "epoch": 1.1481481481481481, "grad_norm": 1.128546678780832, "learning_rate": 7.767416454568358e-06, "loss": 0.1848, "step": 566 }, { "epoch": 1.15220700152207, "grad_norm": 1.1259620179696028, "learning_rate": 7.747727482434679e-06, "loss": 0.2007, "step": 568 }, { "epoch": 1.1562658548959919, "grad_norm": 1.1097229809191436, "learning_rate": 7.727977278578484e-06, "loss": 0.1881, "step": 570 }, { "epoch": 1.1603247082699137, "grad_norm": 1.0616464097857343, "learning_rate": 7.708166283123118e-06, "loss": 0.1945, "step": 572 }, { "epoch": 1.1643835616438356, "grad_norm": 1.073617478066992, "learning_rate": 7.68829493754663e-06, "loss": 0.1858, "step": 574 }, { "epoch": 1.1684424150177575, "grad_norm": 1.0887445235919726, "learning_rate": 7.668363684671947e-06, "loss": 0.1857, "step": 576 }, { "epoch": 1.1725012683916793, "grad_norm": 1.0401775398806878, "learning_rate": 7.648372968656995e-06, "loss": 0.1786, "step": 578 }, { "epoch": 1.1765601217656012, "grad_norm": 1.072786873168531, "learning_rate": 7.628323234984806e-06, "loss": 0.1848, "step": 580 }, { "epoch": 1.180618975139523, "grad_norm": 1.183804677665548, "learning_rate": 7.608214930453597e-06, "loss": 0.2032, "step": 582 }, { "epoch": 1.184677828513445, "grad_norm": 1.1546921624510742, "learning_rate": 7.588048503166801e-06, "loss": 0.1933, "step": 584 }, { "epoch": 1.1887366818873668, "grad_norm": 1.0646260835850125, "learning_rate": 7.5678244025230894e-06, "loss": 0.1842, "step": 586 }, { "epoch": 1.1927955352612887, "grad_norm": 0.9351171981377732, "learning_rate": 7.547543079206355e-06, "loss": 0.1711, "step": 588 }, { "epoch": 1.1968543886352105, "grad_norm": 1.1893988642652746, "learning_rate": 7.5272049851756716e-06, "loss": 0.2027, "step": 590 }, { "epoch": 1.2009132420091324, "grad_norm": 1.0632981222064524, "learning_rate": 7.506810573655215e-06, "loss": 0.1852, "step": 592 }, { "epoch": 1.2049720953830543, "grad_norm": 1.0836002498537225, "learning_rate": 7.486360299124169e-06, "loss": 0.1887, "step": 594 }, { "epoch": 1.2090309487569761, "grad_norm": 1.0213871056780877, "learning_rate": 7.4658546173066005e-06, "loss": 0.1826, "step": 596 }, { "epoch": 1.213089802130898, "grad_norm": 0.9528373737399318, "learning_rate": 7.445293985161296e-06, "loss": 0.1722, "step": 598 }, { "epoch": 1.2171486555048199, "grad_norm": 0.9267772302163672, "learning_rate": 7.424678860871584e-06, "loss": 0.1754, "step": 600 }, { "epoch": 1.2212075088787417, "grad_norm": 1.0580239859843474, "learning_rate": 7.404009703835121e-06, "loss": 0.1828, "step": 602 }, { "epoch": 1.2252663622526636, "grad_norm": 1.1609412830600723, "learning_rate": 7.383286974653659e-06, "loss": 0.2043, "step": 604 }, { "epoch": 1.2293252156265855, "grad_norm": 1.2441637701281891, "learning_rate": 7.362511135122779e-06, "loss": 0.2, "step": 606 }, { "epoch": 1.2333840690005073, "grad_norm": 1.0712250042029285, "learning_rate": 7.341682648221591e-06, "loss": 0.1823, "step": 608 }, { "epoch": 1.2374429223744292, "grad_norm": 0.9995902931065666, "learning_rate": 7.320801978102434e-06, "loss": 0.1826, "step": 610 }, { "epoch": 1.241501775748351, "grad_norm": 1.1066959207293212, "learning_rate": 7.299869590080524e-06, "loss": 0.1916, "step": 612 }, { "epoch": 1.245560629122273, "grad_norm": 1.102809387398261, "learning_rate": 7.278885950623578e-06, "loss": 0.2034, "step": 614 }, { "epoch": 1.2496194824961948, "grad_norm": 1.2015908572580698, "learning_rate": 7.257851527341429e-06, "loss": 0.2007, "step": 616 }, { "epoch": 1.2536783358701167, "grad_norm": 1.0215395009781163, "learning_rate": 7.236766788975603e-06, "loss": 0.1926, "step": 618 }, { "epoch": 1.2577371892440385, "grad_norm": 0.9684806459895816, "learning_rate": 7.215632205388872e-06, "loss": 0.1738, "step": 620 }, { "epoch": 1.2617960426179604, "grad_norm": 1.1014687188825973, "learning_rate": 7.19444824755478e-06, "loss": 0.1895, "step": 622 }, { "epoch": 1.2658548959918823, "grad_norm": 1.0685183294149176, "learning_rate": 7.173215387547155e-06, "loss": 0.1798, "step": 624 }, { "epoch": 1.2699137493658041, "grad_norm": 1.100155763152369, "learning_rate": 7.151934098529583e-06, "loss": 0.1876, "step": 626 }, { "epoch": 1.273972602739726, "grad_norm": 1.1041782673624663, "learning_rate": 7.130604854744871e-06, "loss": 0.1959, "step": 628 }, { "epoch": 1.2780314561136479, "grad_norm": 1.1670239082453848, "learning_rate": 7.109228131504465e-06, "loss": 0.2055, "step": 630 }, { "epoch": 1.2820903094875697, "grad_norm": 1.0829456391928898, "learning_rate": 7.087804405177876e-06, "loss": 0.1866, "step": 632 }, { "epoch": 1.2861491628614916, "grad_norm": 1.0004546162300938, "learning_rate": 7.066334153182049e-06, "loss": 0.1805, "step": 634 }, { "epoch": 1.2902080162354135, "grad_norm": 1.032451555529876, "learning_rate": 7.044817853970732e-06, "loss": 0.1866, "step": 636 }, { "epoch": 1.2942668696093353, "grad_norm": 1.0864837314090992, "learning_rate": 7.023255987023813e-06, "loss": 0.182, "step": 638 }, { "epoch": 1.2983257229832572, "grad_norm": 1.0459756892568486, "learning_rate": 7.001649032836631e-06, "loss": 0.1863, "step": 640 }, { "epoch": 1.302384576357179, "grad_norm": 1.0601944062820794, "learning_rate": 6.9799974729092765e-06, "loss": 0.1732, "step": 642 }, { "epoch": 1.306443429731101, "grad_norm": 1.018697234233559, "learning_rate": 6.958301789735853e-06, "loss": 0.1763, "step": 644 }, { "epoch": 1.3105022831050228, "grad_norm": 1.2251116288030364, "learning_rate": 6.936562466793724e-06, "loss": 0.21, "step": 646 }, { "epoch": 1.3145611364789447, "grad_norm": 1.0662349795156443, "learning_rate": 6.914779988532755e-06, "loss": 0.1889, "step": 648 }, { "epoch": 1.3186199898528665, "grad_norm": 1.1235644124407285, "learning_rate": 6.892954840364493e-06, "loss": 0.2028, "step": 650 }, { "epoch": 1.3226788432267884, "grad_norm": 1.074570291741179, "learning_rate": 6.871087508651373e-06, "loss": 0.1884, "step": 652 }, { "epoch": 1.3267376966007103, "grad_norm": 1.0531061994655868, "learning_rate": 6.8491784806958616e-06, "loss": 0.2021, "step": 654 }, { "epoch": 1.3307965499746321, "grad_norm": 1.1819152398440131, "learning_rate": 6.827228244729609e-06, "loss": 0.1932, "step": 656 }, { "epoch": 1.334855403348554, "grad_norm": 1.045483939181271, "learning_rate": 6.805237289902565e-06, "loss": 0.1965, "step": 658 }, { "epoch": 1.3389142567224759, "grad_norm": 1.1758637342179898, "learning_rate": 6.783206106272076e-06, "loss": 0.198, "step": 660 }, { "epoch": 1.3429731100963977, "grad_norm": 1.0914064475419278, "learning_rate": 6.761135184791969e-06, "loss": 0.1846, "step": 662 }, { "epoch": 1.3470319634703196, "grad_norm": 1.0211966623620905, "learning_rate": 6.7390250173016104e-06, "loss": 0.181, "step": 664 }, { "epoch": 1.3510908168442415, "grad_norm": 1.193565468654654, "learning_rate": 6.716876096514944e-06, "loss": 0.2095, "step": 666 }, { "epoch": 1.3551496702181633, "grad_norm": 1.1271722792849745, "learning_rate": 6.694688916009505e-06, "loss": 0.1848, "step": 668 }, { "epoch": 1.3592085235920852, "grad_norm": 1.1098782394361217, "learning_rate": 6.672463970215436e-06, "loss": 0.1961, "step": 670 }, { "epoch": 1.363267376966007, "grad_norm": 1.1374613038031431, "learning_rate": 6.650201754404455e-06, "loss": 0.1836, "step": 672 }, { "epoch": 1.367326230339929, "grad_norm": 1.0341949368176346, "learning_rate": 6.627902764678824e-06, "loss": 0.1881, "step": 674 }, { "epoch": 1.3713850837138508, "grad_norm": 1.109962989096539, "learning_rate": 6.605567497960295e-06, "loss": 0.1803, "step": 676 }, { "epoch": 1.3754439370877727, "grad_norm": 1.107735146712493, "learning_rate": 6.583196451979031e-06, "loss": 0.1917, "step": 678 }, { "epoch": 1.3795027904616946, "grad_norm": 1.1579886280607274, "learning_rate": 6.560790125262524e-06, "loss": 0.1979, "step": 680 }, { "epoch": 1.3835616438356164, "grad_norm": 0.878075470155148, "learning_rate": 6.538349017124472e-06, "loss": 0.1631, "step": 682 }, { "epoch": 1.3876204972095383, "grad_norm": 1.062317827656781, "learning_rate": 6.515873627653663e-06, "loss": 0.1808, "step": 684 }, { "epoch": 1.3916793505834602, "grad_norm": 1.0327124357250628, "learning_rate": 6.493364457702831e-06, "loss": 0.1799, "step": 686 }, { "epoch": 1.395738203957382, "grad_norm": 1.132131135132688, "learning_rate": 6.470822008877482e-06, "loss": 0.1822, "step": 688 }, { "epoch": 1.3997970573313039, "grad_norm": 1.0462034604519863, "learning_rate": 6.448246783524734e-06, "loss": 0.1919, "step": 690 }, { "epoch": 1.4038559107052258, "grad_norm": 1.1044742895931243, "learning_rate": 6.42563928472211e-06, "loss": 0.1851, "step": 692 }, { "epoch": 1.4079147640791476, "grad_norm": 1.1674244148460076, "learning_rate": 6.403000016266326e-06, "loss": 0.1866, "step": 694 }, { "epoch": 1.4119736174530695, "grad_norm": 1.1296402785524131, "learning_rate": 6.380329482662078e-06, "loss": 0.2035, "step": 696 }, { "epoch": 1.4160324708269914, "grad_norm": 1.0524131174312268, "learning_rate": 6.35762818911078e-06, "loss": 0.1717, "step": 698 }, { "epoch": 1.4200913242009132, "grad_norm": 1.1191774371111942, "learning_rate": 6.334896641499324e-06, "loss": 0.178, "step": 700 }, { "epoch": 1.424150177574835, "grad_norm": 1.047752132697504, "learning_rate": 6.312135346388793e-06, "loss": 0.1881, "step": 702 }, { "epoch": 1.428209030948757, "grad_norm": 1.1743382728694667, "learning_rate": 6.289344811003184e-06, "loss": 0.2033, "step": 704 }, { "epoch": 1.4322678843226788, "grad_norm": 1.1804585608436726, "learning_rate": 6.2665255432180916e-06, "loss": 0.1931, "step": 706 }, { "epoch": 1.4363267376966007, "grad_norm": 1.0677096903056138, "learning_rate": 6.2436780515494035e-06, "loss": 0.1837, "step": 708 }, { "epoch": 1.4403855910705226, "grad_norm": 1.2099132702699213, "learning_rate": 6.2208028451419575e-06, "loss": 0.2112, "step": 710 }, { "epoch": 1.4444444444444444, "grad_norm": 1.1837036949871973, "learning_rate": 6.197900433758205e-06, "loss": 0.2021, "step": 712 }, { "epoch": 1.4485032978183663, "grad_norm": 1.167969784888959, "learning_rate": 6.174971327766842e-06, "loss": 0.1958, "step": 714 }, { "epoch": 1.4525621511922882, "grad_norm": 1.0584127834879178, "learning_rate": 6.1520160381314465e-06, "loss": 0.1854, "step": 716 }, { "epoch": 1.45662100456621, "grad_norm": 1.0674665549424147, "learning_rate": 6.129035076399077e-06, "loss": 0.1896, "step": 718 }, { "epoch": 1.4606798579401319, "grad_norm": 1.1061278306008033, "learning_rate": 6.106028954688892e-06, "loss": 0.1903, "step": 720 }, { "epoch": 1.4647387113140538, "grad_norm": 1.0435415673333681, "learning_rate": 6.082998185680718e-06, "loss": 0.1872, "step": 722 }, { "epoch": 1.4687975646879756, "grad_norm": 1.058891200904623, "learning_rate": 6.059943282603642e-06, "loss": 0.1983, "step": 724 }, { "epoch": 1.4728564180618975, "grad_norm": 1.0795684120382831, "learning_rate": 6.03686475922456e-06, "loss": 0.178, "step": 726 }, { "epoch": 1.4769152714358194, "grad_norm": 1.1384310108536333, "learning_rate": 6.013763129836739e-06, "loss": 0.1874, "step": 728 }, { "epoch": 1.4809741248097412, "grad_norm": 1.1039777651990725, "learning_rate": 5.990638909248352e-06, "loss": 0.1941, "step": 730 }, { "epoch": 1.485032978183663, "grad_norm": 1.0762677800080636, "learning_rate": 5.967492612770999e-06, "loss": 0.1869, "step": 732 }, { "epoch": 1.489091831557585, "grad_norm": 1.0660512751481621, "learning_rate": 5.944324756208238e-06, "loss": 0.1807, "step": 734 }, { "epoch": 1.4931506849315068, "grad_norm": 1.0935996132707635, "learning_rate": 5.92113585584408e-06, "loss": 0.1945, "step": 736 }, { "epoch": 1.4972095383054287, "grad_norm": 1.1460959395776262, "learning_rate": 5.897926428431485e-06, "loss": 0.193, "step": 738 }, { "epoch": 1.5012683916793506, "grad_norm": 1.1965912612959004, "learning_rate": 5.87469699118085e-06, "loss": 0.1941, "step": 740 }, { "epoch": 1.5053272450532724, "grad_norm": 1.1926255825530645, "learning_rate": 5.851448061748477e-06, "loss": 0.1954, "step": 742 }, { "epoch": 1.5093860984271943, "grad_norm": 1.0050356960035571, "learning_rate": 5.828180158225047e-06, "loss": 0.1812, "step": 744 }, { "epoch": 1.5134449518011162, "grad_norm": 1.0259967073861873, "learning_rate": 5.804893799124068e-06, "loss": 0.1892, "step": 746 }, { "epoch": 1.517503805175038, "grad_norm": 1.1502053032831951, "learning_rate": 5.7815895033703164e-06, "loss": 0.1965, "step": 748 }, { "epoch": 1.52156265854896, "grad_norm": 1.2448018070646114, "learning_rate": 5.758267790288282e-06, "loss": 0.2082, "step": 750 }, { "epoch": 1.5256215119228818, "grad_norm": 1.0310538568503425, "learning_rate": 5.734929179590593e-06, "loss": 0.1801, "step": 752 }, { "epoch": 1.5296803652968036, "grad_norm": 1.0835597986581949, "learning_rate": 5.711574191366427e-06, "loss": 0.1807, "step": 754 }, { "epoch": 1.5337392186707255, "grad_norm": 1.1039579795978836, "learning_rate": 5.6882033460699294e-06, "loss": 0.1934, "step": 756 }, { "epoch": 1.5377980720446474, "grad_norm": 1.1564614770433477, "learning_rate": 5.664817164508614e-06, "loss": 0.183, "step": 758 }, { "epoch": 1.5418569254185692, "grad_norm": 1.1933306807050015, "learning_rate": 5.641416167831752e-06, "loss": 0.1983, "step": 760 }, { "epoch": 1.545915778792491, "grad_norm": 1.1642784370439, "learning_rate": 5.618000877518767e-06, "loss": 0.205, "step": 762 }, { "epoch": 1.549974632166413, "grad_norm": 1.096750111637783, "learning_rate": 5.594571815367602e-06, "loss": 0.1871, "step": 764 }, { "epoch": 1.5540334855403348, "grad_norm": 0.8896315598965203, "learning_rate": 5.5711295034831034e-06, "loss": 0.1588, "step": 766 }, { "epoch": 1.5580923389142567, "grad_norm": 1.021696941589894, "learning_rate": 5.547674464265384e-06, "loss": 0.1885, "step": 768 }, { "epoch": 1.5621511922881786, "grad_norm": 1.0760610238279678, "learning_rate": 5.524207220398169e-06, "loss": 0.1844, "step": 770 }, { "epoch": 1.5662100456621004, "grad_norm": 1.0146299892564568, "learning_rate": 5.500728294837168e-06, "loss": 0.1717, "step": 772 }, { "epoch": 1.5702688990360223, "grad_norm": 1.1203690278420046, "learning_rate": 5.477238210798406e-06, "loss": 0.1816, "step": 774 }, { "epoch": 1.5743277524099442, "grad_norm": 1.213922753663776, "learning_rate": 5.453737491746572e-06, "loss": 0.1956, "step": 776 }, { "epoch": 1.578386605783866, "grad_norm": 1.0789723847536306, "learning_rate": 5.430226661383348e-06, "loss": 0.1831, "step": 778 }, { "epoch": 1.582445459157788, "grad_norm": 1.0165965114615476, "learning_rate": 5.406706243635742e-06, "loss": 0.1859, "step": 780 }, { "epoch": 1.5865043125317098, "grad_norm": 0.9244907929973665, "learning_rate": 5.383176762644416e-06, "loss": 0.1799, "step": 782 }, { "epoch": 1.5905631659056316, "grad_norm": 1.015459762165936, "learning_rate": 5.359638742751994e-06, "loss": 0.1859, "step": 784 }, { "epoch": 1.5946220192795535, "grad_norm": 1.119032836602815, "learning_rate": 5.3360927084913925e-06, "loss": 0.1949, "step": 786 }, { "epoch": 1.5986808726534754, "grad_norm": 0.9834799242339374, "learning_rate": 5.312539184574123e-06, "loss": 0.1795, "step": 788 }, { "epoch": 1.6027397260273972, "grad_norm": 1.1257597437225455, "learning_rate": 5.288978695878596e-06, "loss": 0.1842, "step": 790 }, { "epoch": 1.606798579401319, "grad_norm": 1.1490293546501014, "learning_rate": 5.265411767438432e-06, "loss": 0.1892, "step": 792 }, { "epoch": 1.610857432775241, "grad_norm": 1.096272375625098, "learning_rate": 5.241838924430757e-06, "loss": 0.1857, "step": 794 }, { "epoch": 1.6149162861491628, "grad_norm": 0.9881695704441573, "learning_rate": 5.2182606921645e-06, "loss": 0.1839, "step": 796 }, { "epoch": 1.6189751395230847, "grad_norm": 1.0055701217382587, "learning_rate": 5.194677596068689e-06, "loss": 0.1974, "step": 798 }, { "epoch": 1.6230339928970066, "grad_norm": 1.0445320687597668, "learning_rate": 5.171090161680736e-06, "loss": 0.186, "step": 800 }, { "epoch": 1.6270928462709284, "grad_norm": 1.010532137401968, "learning_rate": 5.1474989146347355e-06, "loss": 0.1818, "step": 802 }, { "epoch": 1.6311516996448503, "grad_norm": 1.059728950180328, "learning_rate": 5.1239043806497365e-06, "loss": 0.1878, "step": 804 }, { "epoch": 1.6352105530187722, "grad_norm": 1.02492281938429, "learning_rate": 5.100307085518046e-06, "loss": 0.1792, "step": 806 }, { "epoch": 1.639269406392694, "grad_norm": 1.1092364608711534, "learning_rate": 5.076707555093491e-06, "loss": 0.1816, "step": 808 }, { "epoch": 1.643328259766616, "grad_norm": 0.9816745515421457, "learning_rate": 5.053106315279721e-06, "loss": 0.2025, "step": 810 }, { "epoch": 1.6473871131405378, "grad_norm": 1.0000356438781097, "learning_rate": 5.029503892018472e-06, "loss": 0.1669, "step": 812 }, { "epoch": 1.6514459665144596, "grad_norm": 1.0450858430582273, "learning_rate": 5.005900811277856e-06, "loss": 0.1802, "step": 814 }, { "epoch": 1.6555048198883815, "grad_norm": 0.9390205529074375, "learning_rate": 4.982297599040633e-06, "loss": 0.1636, "step": 816 }, { "epoch": 1.6595636732623034, "grad_norm": 1.154620857603639, "learning_rate": 4.958694781292496e-06, "loss": 0.1923, "step": 818 }, { "epoch": 1.6636225266362252, "grad_norm": 1.1872938869192748, "learning_rate": 4.935092884010347e-06, "loss": 0.1873, "step": 820 }, { "epoch": 1.667681380010147, "grad_norm": 1.0599792451386443, "learning_rate": 4.911492433150573e-06, "loss": 0.1809, "step": 822 }, { "epoch": 1.671740233384069, "grad_norm": 1.1108344863089323, "learning_rate": 4.887893954637335e-06, "loss": 0.1864, "step": 824 }, { "epoch": 1.6757990867579908, "grad_norm": 1.058121443964045, "learning_rate": 4.86429797435083e-06, "loss": 0.1766, "step": 826 }, { "epoch": 1.6798579401319127, "grad_norm": 1.1323101168080565, "learning_rate": 4.840705018115595e-06, "loss": 0.1808, "step": 828 }, { "epoch": 1.6839167935058346, "grad_norm": 1.128025551519256, "learning_rate": 4.8171156116887725e-06, "loss": 0.1757, "step": 830 }, { "epoch": 1.6879756468797564, "grad_norm": 1.0934679444423028, "learning_rate": 4.7935302807483965e-06, "loss": 0.1924, "step": 832 }, { "epoch": 1.6920345002536783, "grad_norm": 1.0760501447523048, "learning_rate": 4.769949550881687e-06, "loss": 0.1902, "step": 834 }, { "epoch": 1.6960933536276002, "grad_norm": 1.0550473463955812, "learning_rate": 4.746373947573325e-06, "loss": 0.1787, "step": 836 }, { "epoch": 1.700152207001522, "grad_norm": 1.2275660092618677, "learning_rate": 4.722803996193753e-06, "loss": 0.197, "step": 838 }, { "epoch": 1.704211060375444, "grad_norm": 1.0505064696078903, "learning_rate": 4.699240221987461e-06, "loss": 0.1819, "step": 840 }, { "epoch": 1.7082699137493658, "grad_norm": 1.1523046961319277, "learning_rate": 4.6756831500612846e-06, "loss": 0.1888, "step": 842 }, { "epoch": 1.7123287671232876, "grad_norm": 0.9989431343495883, "learning_rate": 4.652133305372705e-06, "loss": 0.1727, "step": 844 }, { "epoch": 1.7163876204972095, "grad_norm": 1.0740718139978316, "learning_rate": 4.628591212718144e-06, "loss": 0.1756, "step": 846 }, { "epoch": 1.7204464738711314, "grad_norm": 1.1041425692480016, "learning_rate": 4.605057396721275e-06, "loss": 0.1741, "step": 848 }, { "epoch": 1.7245053272450532, "grad_norm": 1.212666044737014, "learning_rate": 4.58153238182133e-06, "loss": 0.1841, "step": 850 }, { "epoch": 1.728564180618975, "grad_norm": 1.0783964750466963, "learning_rate": 4.558016692261412e-06, "loss": 0.1698, "step": 852 }, { "epoch": 1.732623033992897, "grad_norm": 1.0683744846668402, "learning_rate": 4.534510852076817e-06, "loss": 0.1886, "step": 854 }, { "epoch": 1.7366818873668188, "grad_norm": 1.1011269960255068, "learning_rate": 4.511015385083345e-06, "loss": 0.1945, "step": 856 }, { "epoch": 1.7407407407407407, "grad_norm": 0.9991671395240459, "learning_rate": 4.487530814865646e-06, "loss": 0.1824, "step": 858 }, { "epoch": 1.7447995941146626, "grad_norm": 1.0566808884558716, "learning_rate": 4.464057664765532e-06, "loss": 0.1823, "step": 860 }, { "epoch": 1.7488584474885844, "grad_norm": 1.0940203228626781, "learning_rate": 4.440596457870327e-06, "loss": 0.1834, "step": 862 }, { "epoch": 1.7529173008625063, "grad_norm": 1.0234332049105062, "learning_rate": 4.417147717001205e-06, "loss": 0.1746, "step": 864 }, { "epoch": 1.7569761542364282, "grad_norm": 0.9623367599486023, "learning_rate": 4.393711964701541e-06, "loss": 0.1682, "step": 866 }, { "epoch": 1.76103500761035, "grad_norm": 1.0516978200243972, "learning_rate": 4.37028972322527e-06, "loss": 0.1786, "step": 868 }, { "epoch": 1.765093860984272, "grad_norm": 1.1391069464012384, "learning_rate": 4.346881514525236e-06, "loss": 0.1791, "step": 870 }, { "epoch": 1.7691527143581938, "grad_norm": 0.971499677774941, "learning_rate": 4.323487860241582e-06, "loss": 0.1672, "step": 872 }, { "epoch": 1.7732115677321156, "grad_norm": 1.1577835890912351, "learning_rate": 4.3001092816901055e-06, "loss": 0.1854, "step": 874 }, { "epoch": 1.7772704211060375, "grad_norm": 1.1217645675230743, "learning_rate": 4.2767462998506485e-06, "loss": 0.1823, "step": 876 }, { "epoch": 1.7813292744799594, "grad_norm": 1.1190282031824559, "learning_rate": 4.253399435355492e-06, "loss": 0.1895, "step": 878 }, { "epoch": 1.7853881278538812, "grad_norm": 1.0134907069750374, "learning_rate": 4.230069208477745e-06, "loss": 0.175, "step": 880 }, { "epoch": 1.789446981227803, "grad_norm": 1.1494619938574746, "learning_rate": 4.206756139119762e-06, "loss": 0.1953, "step": 882 }, { "epoch": 1.793505834601725, "grad_norm": 0.9248356141218419, "learning_rate": 4.183460746801546e-06, "loss": 0.1702, "step": 884 }, { "epoch": 1.7975646879756468, "grad_norm": 1.0725930962377248, "learning_rate": 4.160183550649176e-06, "loss": 0.1778, "step": 886 }, { "epoch": 1.8016235413495687, "grad_norm": 1.0788894008577279, "learning_rate": 4.136925069383243e-06, "loss": 0.1917, "step": 888 }, { "epoch": 1.8056823947234906, "grad_norm": 1.0122516476461982, "learning_rate": 4.113685821307282e-06, "loss": 0.1898, "step": 890 }, { "epoch": 1.8097412480974124, "grad_norm": 1.027424121449119, "learning_rate": 4.090466324296228e-06, "loss": 0.1822, "step": 892 }, { "epoch": 1.8138001014713343, "grad_norm": 1.1269393961404834, "learning_rate": 4.067267095784871e-06, "loss": 0.1841, "step": 894 }, { "epoch": 1.8178589548452562, "grad_norm": 1.0052196803723334, "learning_rate": 4.044088652756332e-06, "loss": 0.1629, "step": 896 }, { "epoch": 1.821917808219178, "grad_norm": 1.079578577258494, "learning_rate": 4.020931511730533e-06, "loss": 0.1774, "step": 898 }, { "epoch": 1.8259766615931, "grad_norm": 0.9587436391914074, "learning_rate": 3.997796188752695e-06, "loss": 0.1733, "step": 900 }, { "epoch": 1.8300355149670218, "grad_norm": 0.9992614549374934, "learning_rate": 3.974683199381836e-06, "loss": 0.1685, "step": 902 }, { "epoch": 1.8340943683409436, "grad_norm": 0.9418897276184947, "learning_rate": 3.951593058679276e-06, "loss": 0.1672, "step": 904 }, { "epoch": 1.8381532217148655, "grad_norm": 1.1397268358795776, "learning_rate": 3.928526281197169e-06, "loss": 0.1749, "step": 906 }, { "epoch": 1.8422120750887874, "grad_norm": 1.0440206163216095, "learning_rate": 3.905483380967027e-06, "loss": 0.1722, "step": 908 }, { "epoch": 1.8462709284627092, "grad_norm": 1.048561547401053, "learning_rate": 3.882464871488273e-06, "loss": 0.1693, "step": 910 }, { "epoch": 1.850329781836631, "grad_norm": 1.0284223905418497, "learning_rate": 3.859471265716791e-06, "loss": 0.1691, "step": 912 }, { "epoch": 1.854388635210553, "grad_norm": 1.004974372673609, "learning_rate": 3.836503076053501e-06, "loss": 0.1751, "step": 914 }, { "epoch": 1.8584474885844748, "grad_norm": 1.1435852033856233, "learning_rate": 3.8135608143329404e-06, "loss": 0.1809, "step": 916 }, { "epoch": 1.8625063419583967, "grad_norm": 0.9996592509283232, "learning_rate": 3.7906449918118493e-06, "loss": 0.1696, "step": 918 }, { "epoch": 1.8665651953323186, "grad_norm": 1.0632454270482863, "learning_rate": 3.7677561191577873e-06, "loss": 0.17, "step": 920 }, { "epoch": 1.8706240487062404, "grad_norm": 1.026248873958979, "learning_rate": 3.7448947064377496e-06, "loss": 0.1768, "step": 922 }, { "epoch": 1.8746829020801623, "grad_norm": 1.0006033609765281, "learning_rate": 3.722061263106797e-06, "loss": 0.1712, "step": 924 }, { "epoch": 1.8787417554540842, "grad_norm": 1.0394166532597735, "learning_rate": 3.699256297996714e-06, "loss": 0.1802, "step": 926 }, { "epoch": 1.882800608828006, "grad_norm": 1.1087386752635604, "learning_rate": 3.6764803193046538e-06, "loss": 0.1787, "step": 928 }, { "epoch": 1.886859462201928, "grad_norm": 1.1539792806225302, "learning_rate": 3.6537338345818273e-06, "loss": 0.177, "step": 930 }, { "epoch": 1.8909183155758498, "grad_norm": 1.1358496184900775, "learning_rate": 3.6310173507221884e-06, "loss": 0.1784, "step": 932 }, { "epoch": 1.8949771689497716, "grad_norm": 0.9648197271266891, "learning_rate": 3.6083313739511316e-06, "loss": 0.1613, "step": 934 }, { "epoch": 1.8990360223236935, "grad_norm": 1.069217067686545, "learning_rate": 3.5856764098142207e-06, "loss": 0.1722, "step": 936 }, { "epoch": 1.9030948756976154, "grad_norm": 0.9827567009351711, "learning_rate": 3.563052963165915e-06, "loss": 0.1619, "step": 938 }, { "epoch": 1.9071537290715372, "grad_norm": 1.0416626747952469, "learning_rate": 3.5404615381583264e-06, "loss": 0.1786, "step": 940 }, { "epoch": 1.911212582445459, "grad_norm": 0.9796952362181767, "learning_rate": 3.5179026382299752e-06, "loss": 0.1635, "step": 942 }, { "epoch": 1.915271435819381, "grad_norm": 1.0913636067798673, "learning_rate": 3.4953767660945825e-06, "loss": 0.1849, "step": 944 }, { "epoch": 1.9193302891933028, "grad_norm": 1.054155532699976, "learning_rate": 3.472884423729861e-06, "loss": 0.1824, "step": 946 }, { "epoch": 1.9233891425672247, "grad_norm": 1.1299730992487989, "learning_rate": 3.4504261123663243e-06, "loss": 0.1741, "step": 948 }, { "epoch": 1.9274479959411466, "grad_norm": 0.9544541000662791, "learning_rate": 3.4280023324761287e-06, "loss": 0.1622, "step": 950 }, { "epoch": 1.9315068493150684, "grad_norm": 1.0633189960260987, "learning_rate": 3.4056135837619077e-06, "loss": 0.1714, "step": 952 }, { "epoch": 1.9355657026889903, "grad_norm": 0.9744187925381573, "learning_rate": 3.3832603651456486e-06, "loss": 0.1704, "step": 954 }, { "epoch": 1.9396245560629122, "grad_norm": 1.0876850028674756, "learning_rate": 3.360943174757564e-06, "loss": 0.1835, "step": 956 }, { "epoch": 1.943683409436834, "grad_norm": 0.9933052886092801, "learning_rate": 3.3386625099249957e-06, "loss": 0.1722, "step": 958 }, { "epoch": 1.947742262810756, "grad_norm": 1.0507655110716982, "learning_rate": 3.3164188671613382e-06, "loss": 0.1799, "step": 960 }, { "epoch": 1.9518011161846778, "grad_norm": 0.9635760320459535, "learning_rate": 3.29421274215496e-06, "loss": 0.1665, "step": 962 }, { "epoch": 1.9558599695585996, "grad_norm": 1.0098097588789372, "learning_rate": 3.2720446297581696e-06, "loss": 0.1756, "step": 964 }, { "epoch": 1.9599188229325215, "grad_norm": 0.9487863077375068, "learning_rate": 3.2499150239761813e-06, "loss": 0.1674, "step": 966 }, { "epoch": 1.9639776763064434, "grad_norm": 1.0886147748414823, "learning_rate": 3.2278244179561107e-06, "loss": 0.176, "step": 968 }, { "epoch": 1.9680365296803652, "grad_norm": 1.0933879192223048, "learning_rate": 3.205773303975982e-06, "loss": 0.1649, "step": 970 }, { "epoch": 1.972095383054287, "grad_norm": 1.032499198840103, "learning_rate": 3.1837621734337607e-06, "loss": 0.1712, "step": 972 }, { "epoch": 1.976154236428209, "grad_norm": 1.0255453322259884, "learning_rate": 3.1617915168363994e-06, "loss": 0.1835, "step": 974 }, { "epoch": 1.9802130898021308, "grad_norm": 0.9986750671875287, "learning_rate": 3.1398618237889124e-06, "loss": 0.1685, "step": 976 }, { "epoch": 1.9842719431760527, "grad_norm": 1.018414001751852, "learning_rate": 3.11797358298346e-06, "loss": 0.1707, "step": 978 }, { "epoch": 1.9883307965499746, "grad_norm": 1.0770634669309533, "learning_rate": 3.096127282188458e-06, "loss": 0.1687, "step": 980 }, { "epoch": 1.9923896499238964, "grad_norm": 1.1276389976988863, "learning_rate": 3.074323408237716e-06, "loss": 0.1788, "step": 982 }, { "epoch": 1.9964485032978183, "grad_norm": 1.1430608051198146, "learning_rate": 3.0525624470195746e-06, "loss": 0.1878, "step": 984 }, { "epoch": 2.0, "grad_norm": 1.1495443627078172, "learning_rate": 3.0308448834660953e-06, "loss": 0.1664, "step": 986 }, { "epoch": 2.004058853373922, "grad_norm": 0.7595072680880963, "learning_rate": 3.009171201542235e-06, "loss": 0.073, "step": 988 }, { "epoch": 2.0081177067478437, "grad_norm": 0.6553749481487482, "learning_rate": 2.987541884235078e-06, "loss": 0.0666, "step": 990 }, { "epoch": 2.0121765601217656, "grad_norm": 0.6758730321723564, "learning_rate": 2.965957413543063e-06, "loss": 0.068, "step": 992 }, { "epoch": 2.0162354134956875, "grad_norm": 0.7705357959194599, "learning_rate": 2.944418270465243e-06, "loss": 0.0722, "step": 994 }, { "epoch": 2.0202942668696093, "grad_norm": 0.7447986269637112, "learning_rate": 2.9229249349905686e-06, "loss": 0.0636, "step": 996 }, { "epoch": 2.024353120243531, "grad_norm": 0.7070522483364615, "learning_rate": 2.9014778860871916e-06, "loss": 0.056, "step": 998 }, { "epoch": 2.028411973617453, "grad_norm": 0.8137822848077799, "learning_rate": 2.880077601691793e-06, "loss": 0.0711, "step": 1000 }, { "epoch": 2.032470826991375, "grad_norm": 0.6930740331498185, "learning_rate": 2.8587245586989265e-06, "loss": 0.0619, "step": 1002 }, { "epoch": 2.036529680365297, "grad_norm": 0.6609838916960153, "learning_rate": 2.8374192329503934e-06, "loss": 0.0604, "step": 1004 }, { "epoch": 2.0405885337392187, "grad_norm": 0.6985284878939515, "learning_rate": 2.8161620992246497e-06, "loss": 0.0616, "step": 1006 }, { "epoch": 2.0446473871131405, "grad_norm": 0.697512001428481, "learning_rate": 2.7949536312262048e-06, "loss": 0.0649, "step": 1008 }, { "epoch": 2.0487062404870624, "grad_norm": 0.6960549223161825, "learning_rate": 2.7737943015750862e-06, "loss": 0.0699, "step": 1010 }, { "epoch": 2.0527650938609843, "grad_norm": 0.7147137635432833, "learning_rate": 2.752684581796292e-06, "loss": 0.0626, "step": 1012 }, { "epoch": 2.056823947234906, "grad_norm": 0.6478212242747857, "learning_rate": 2.7316249423092923e-06, "loss": 0.0594, "step": 1014 }, { "epoch": 2.060882800608828, "grad_norm": 0.7050061564241327, "learning_rate": 2.7106158524175396e-06, "loss": 0.0646, "step": 1016 }, { "epoch": 2.06494165398275, "grad_norm": 0.5836812360722653, "learning_rate": 2.689657780298019e-06, "loss": 0.0552, "step": 1018 }, { "epoch": 2.0690005073566717, "grad_norm": 0.6963206280767881, "learning_rate": 2.6687511929908093e-06, "loss": 0.0633, "step": 1020 }, { "epoch": 2.0730593607305936, "grad_norm": 0.6223916195319845, "learning_rate": 2.6478965563886745e-06, "loss": 0.0567, "step": 1022 }, { "epoch": 2.0771182141045155, "grad_norm": 0.7658362298283596, "learning_rate": 2.627094335226682e-06, "loss": 0.059, "step": 1024 }, { "epoch": 2.0811770674784373, "grad_norm": 0.6242034540359834, "learning_rate": 2.6063449930718487e-06, "loss": 0.0566, "step": 1026 }, { "epoch": 2.085235920852359, "grad_norm": 0.6057344953235689, "learning_rate": 2.5856489923128136e-06, "loss": 0.0573, "step": 1028 }, { "epoch": 2.089294774226281, "grad_norm": 0.6209117592060627, "learning_rate": 2.5650067941495236e-06, "loss": 0.0543, "step": 1030 }, { "epoch": 2.093353627600203, "grad_norm": 0.626145712835931, "learning_rate": 2.5444188585829634e-06, "loss": 0.0573, "step": 1032 }, { "epoch": 2.097412480974125, "grad_norm": 0.7019968254783542, "learning_rate": 2.523885644404906e-06, "loss": 0.0629, "step": 1034 }, { "epoch": 2.1014713343480467, "grad_norm": 0.672737803900476, "learning_rate": 2.5034076091876813e-06, "loss": 0.0599, "step": 1036 }, { "epoch": 2.1055301877219685, "grad_norm": 0.7666550066371539, "learning_rate": 2.48298520927399e-06, "loss": 0.0685, "step": 1038 }, { "epoch": 2.1095890410958904, "grad_norm": 0.6186035266270663, "learning_rate": 2.4626188997667224e-06, "loss": 0.0528, "step": 1040 }, { "epoch": 2.1136478944698123, "grad_norm": 0.6362086867459513, "learning_rate": 2.4423091345188244e-06, "loss": 0.0609, "step": 1042 }, { "epoch": 2.117706747843734, "grad_norm": 0.7043829575983082, "learning_rate": 2.4220563661231793e-06, "loss": 0.0607, "step": 1044 }, { "epoch": 2.121765601217656, "grad_norm": 0.6342780093035242, "learning_rate": 2.4018610459025317e-06, "loss": 0.0614, "step": 1046 }, { "epoch": 2.125824454591578, "grad_norm": 0.6371937965298768, "learning_rate": 2.381723623899412e-06, "loss": 0.0576, "step": 1048 }, { "epoch": 2.1298833079654997, "grad_norm": 0.7261372327288902, "learning_rate": 2.361644548866127e-06, "loss": 0.0612, "step": 1050 }, { "epoch": 2.1339421613394216, "grad_norm": 0.6921022790180155, "learning_rate": 2.341624268254747e-06, "loss": 0.0637, "step": 1052 }, { "epoch": 2.1380010147133435, "grad_norm": 0.7180722302503428, "learning_rate": 2.3216632282071345e-06, "loss": 0.0653, "step": 1054 }, { "epoch": 2.1420598680872653, "grad_norm": 0.6048159926460217, "learning_rate": 2.3017618735450142e-06, "loss": 0.055, "step": 1056 }, { "epoch": 2.146118721461187, "grad_norm": 0.6876882976918033, "learning_rate": 2.2819206477600462e-06, "loss": 0.0593, "step": 1058 }, { "epoch": 2.150177574835109, "grad_norm": 0.6830162087163217, "learning_rate": 2.2621399930039493e-06, "loss": 0.0576, "step": 1060 }, { "epoch": 2.154236428209031, "grad_norm": 0.572916709401609, "learning_rate": 2.2424203500786473e-06, "loss": 0.0565, "step": 1062 }, { "epoch": 2.158295281582953, "grad_norm": 0.6064104088259805, "learning_rate": 2.2227621584264505e-06, "loss": 0.0609, "step": 1064 }, { "epoch": 2.1623541349568747, "grad_norm": 0.5345159650560205, "learning_rate": 2.203165856120251e-06, "loss": 0.0486, "step": 1066 }, { "epoch": 2.1664129883307965, "grad_norm": 0.708241954697193, "learning_rate": 2.183631879853776e-06, "loss": 0.0592, "step": 1068 }, { "epoch": 2.1704718417047184, "grad_norm": 0.6237247767204619, "learning_rate": 2.164160664931843e-06, "loss": 0.0564, "step": 1070 }, { "epoch": 2.1745306950786403, "grad_norm": 0.6913274477398279, "learning_rate": 2.1447526452606658e-06, "loss": 0.0608, "step": 1072 }, { "epoch": 2.178589548452562, "grad_norm": 0.615699302647019, "learning_rate": 2.125408253338183e-06, "loss": 0.0572, "step": 1074 }, { "epoch": 2.182648401826484, "grad_norm": 0.7114790842641555, "learning_rate": 2.106127920244423e-06, "loss": 0.056, "step": 1076 }, { "epoch": 2.186707255200406, "grad_norm": 0.6498843232992042, "learning_rate": 2.086912075631896e-06, "loss": 0.0579, "step": 1078 }, { "epoch": 2.1907661085743277, "grad_norm": 0.6342534301664807, "learning_rate": 2.067761147716017e-06, "loss": 0.0573, "step": 1080 }, { "epoch": 2.1948249619482496, "grad_norm": 0.7027558078351507, "learning_rate": 2.0486755632655643e-06, "loss": 0.0593, "step": 1082 }, { "epoch": 2.1988838153221715, "grad_norm": 0.6558346648533067, "learning_rate": 2.029655747593169e-06, "loss": 0.0605, "step": 1084 }, { "epoch": 2.2029426686960933, "grad_norm": 0.6450513139758751, "learning_rate": 2.010702124545845e-06, "loss": 0.0598, "step": 1086 }, { "epoch": 2.207001522070015, "grad_norm": 0.7322704077213636, "learning_rate": 1.9918151164955303e-06, "loss": 0.0617, "step": 1088 }, { "epoch": 2.211060375443937, "grad_norm": 0.7103114005030767, "learning_rate": 1.9729951443296823e-06, "loss": 0.0564, "step": 1090 }, { "epoch": 2.215119228817859, "grad_norm": 0.6912085691591306, "learning_rate": 1.9542426274418975e-06, "loss": 0.0628, "step": 1092 }, { "epoch": 2.219178082191781, "grad_norm": 0.7492868491245555, "learning_rate": 1.9355579837225673e-06, "loss": 0.0601, "step": 1094 }, { "epoch": 2.2232369355657027, "grad_norm": 0.6846296844726598, "learning_rate": 1.916941629549565e-06, "loss": 0.0562, "step": 1096 }, { "epoch": 2.2272957889396245, "grad_norm": 0.6860703433669731, "learning_rate": 1.8983939797789624e-06, "loss": 0.0604, "step": 1098 }, { "epoch": 2.2313546423135464, "grad_norm": 0.6583737331854461, "learning_rate": 1.8799154477357883e-06, "loss": 0.057, "step": 1100 }, { "epoch": 2.2354134956874683, "grad_norm": 0.6015963283689161, "learning_rate": 1.8615064452048181e-06, "loss": 0.0529, "step": 1102 }, { "epoch": 2.23947234906139, "grad_norm": 0.6596224589385736, "learning_rate": 1.8431673824214013e-06, "loss": 0.0607, "step": 1104 }, { "epoch": 2.243531202435312, "grad_norm": 0.6295377331681089, "learning_rate": 1.8248986680623077e-06, "loss": 0.0524, "step": 1106 }, { "epoch": 2.247590055809234, "grad_norm": 0.7623992953499044, "learning_rate": 1.8067007092366368e-06, "loss": 0.0633, "step": 1108 }, { "epoch": 2.2516489091831557, "grad_norm": 0.6747010480441555, "learning_rate": 1.7885739114767292e-06, "loss": 0.0575, "step": 1110 }, { "epoch": 2.2557077625570776, "grad_norm": 0.6640676472618579, "learning_rate": 1.770518678729139e-06, "loss": 0.0532, "step": 1112 }, { "epoch": 2.2597666159309995, "grad_norm": 0.6588950267013456, "learning_rate": 1.752535413345634e-06, "loss": 0.0572, "step": 1114 }, { "epoch": 2.2638254693049213, "grad_norm": 0.6957155982625279, "learning_rate": 1.734624516074221e-06, "loss": 0.0591, "step": 1116 }, { "epoch": 2.267884322678843, "grad_norm": 0.7568659886745189, "learning_rate": 1.716786386050221e-06, "loss": 0.0619, "step": 1118 }, { "epoch": 2.271943176052765, "grad_norm": 0.7149883674184384, "learning_rate": 1.6990214207873723e-06, "loss": 0.0603, "step": 1120 }, { "epoch": 2.276002029426687, "grad_norm": 0.6375454124392296, "learning_rate": 1.681330016168977e-06, "loss": 0.0583, "step": 1122 }, { "epoch": 2.280060882800609, "grad_norm": 0.7001731734101665, "learning_rate": 1.6637125664390747e-06, "loss": 0.06, "step": 1124 }, { "epoch": 2.2841197361745307, "grad_norm": 0.5744052821303995, "learning_rate": 1.6461694641936544e-06, "loss": 0.0532, "step": 1126 }, { "epoch": 2.2881785895484525, "grad_norm": 0.6379817597651213, "learning_rate": 1.6287011003719105e-06, "loss": 0.0581, "step": 1128 }, { "epoch": 2.2922374429223744, "grad_norm": 0.6936132311539229, "learning_rate": 1.61130786424753e-06, "loss": 0.0578, "step": 1130 }, { "epoch": 2.2962962962962963, "grad_norm": 0.7643680282316857, "learning_rate": 1.5939901434200145e-06, "loss": 0.0587, "step": 1132 }, { "epoch": 2.300355149670218, "grad_norm": 0.676227187971244, "learning_rate": 1.5767483238060498e-06, "loss": 0.0568, "step": 1134 }, { "epoch": 2.30441400304414, "grad_norm": 0.6485083388679013, "learning_rate": 1.5595827896308968e-06, "loss": 0.0615, "step": 1136 }, { "epoch": 2.308472856418062, "grad_norm": 0.6513490211287265, "learning_rate": 1.5424939234198377e-06, "loss": 0.0558, "step": 1138 }, { "epoch": 2.3125317097919837, "grad_norm": 0.6470779888157924, "learning_rate": 1.5254821059896452e-06, "loss": 0.0569, "step": 1140 }, { "epoch": 2.3165905631659056, "grad_norm": 0.6690972634285901, "learning_rate": 1.5085477164400975e-06, "loss": 0.0564, "step": 1142 }, { "epoch": 2.3206494165398275, "grad_norm": 0.6192010726163617, "learning_rate": 1.4916911321455362e-06, "loss": 0.0566, "step": 1144 }, { "epoch": 2.3247082699137493, "grad_norm": 0.6718507373057219, "learning_rate": 1.4749127287464483e-06, "loss": 0.0566, "step": 1146 }, { "epoch": 2.328767123287671, "grad_norm": 0.6291964856953526, "learning_rate": 1.458212880141099e-06, "loss": 0.0568, "step": 1148 }, { "epoch": 2.332825976661593, "grad_norm": 0.6090771051076672, "learning_rate": 1.4415919584771999e-06, "loss": 0.0547, "step": 1150 }, { "epoch": 2.336884830035515, "grad_norm": 0.6315155634950337, "learning_rate": 1.425050334143616e-06, "loss": 0.0586, "step": 1152 }, { "epoch": 2.340943683409437, "grad_norm": 0.682944731395333, "learning_rate": 1.408588375762114e-06, "loss": 0.0575, "step": 1154 }, { "epoch": 2.3450025367833587, "grad_norm": 0.6828351505916127, "learning_rate": 1.39220645017914e-06, "loss": 0.0575, "step": 1156 }, { "epoch": 2.3490613901572805, "grad_norm": 0.6343438475682116, "learning_rate": 1.3759049224576516e-06, "loss": 0.054, "step": 1158 }, { "epoch": 2.3531202435312024, "grad_norm": 0.6608391141452298, "learning_rate": 1.3596841558689788e-06, "loss": 0.0611, "step": 1160 }, { "epoch": 2.3571790969051243, "grad_norm": 0.6847038775101427, "learning_rate": 1.3435445118847362e-06, "loss": 0.0597, "step": 1162 }, { "epoch": 2.361237950279046, "grad_norm": 0.6758006038700237, "learning_rate": 1.3274863501687546e-06, "loss": 0.0582, "step": 1164 }, { "epoch": 2.365296803652968, "grad_norm": 0.679273586750369, "learning_rate": 1.3115100285690795e-06, "loss": 0.0586, "step": 1166 }, { "epoch": 2.36935565702689, "grad_norm": 0.6455382817232485, "learning_rate": 1.2956159031099874e-06, "loss": 0.0572, "step": 1168 }, { "epoch": 2.3734145104008117, "grad_norm": 0.6494368906651962, "learning_rate": 1.2798043279840544e-06, "loss": 0.0573, "step": 1170 }, { "epoch": 2.3774733637747336, "grad_norm": 0.6652797010014586, "learning_rate": 1.2640756555442684e-06, "loss": 0.0585, "step": 1172 }, { "epoch": 2.3815322171486555, "grad_norm": 0.7085143214543068, "learning_rate": 1.248430236296168e-06, "loss": 0.0531, "step": 1174 }, { "epoch": 2.3855910705225774, "grad_norm": 0.7017172691247079, "learning_rate": 1.2328684188900392e-06, "loss": 0.0562, "step": 1176 }, { "epoch": 2.389649923896499, "grad_norm": 0.5911653357727898, "learning_rate": 1.2173905501131395e-06, "loss": 0.0555, "step": 1178 }, { "epoch": 2.393708777270421, "grad_norm": 0.7052853605271917, "learning_rate": 1.2019969748819783e-06, "loss": 0.0633, "step": 1180 }, { "epoch": 2.397767630644343, "grad_norm": 0.5468703307014926, "learning_rate": 1.186688036234625e-06, "loss": 0.0512, "step": 1182 }, { "epoch": 2.401826484018265, "grad_norm": 0.6298551223639655, "learning_rate": 1.1714640753230628e-06, "loss": 0.0523, "step": 1184 }, { "epoch": 2.4058853373921867, "grad_norm": 0.7357211211763364, "learning_rate": 1.1563254314055893e-06, "loss": 0.0553, "step": 1186 }, { "epoch": 2.4099441907661086, "grad_norm": 0.6645237994069922, "learning_rate": 1.1412724418392562e-06, "loss": 0.0544, "step": 1188 }, { "epoch": 2.4140030441400304, "grad_norm": 0.661565716961166, "learning_rate": 1.126305442072354e-06, "loss": 0.055, "step": 1190 }, { "epoch": 2.4180618975139523, "grad_norm": 0.6206331269413049, "learning_rate": 1.1114247656369305e-06, "loss": 0.0545, "step": 1192 }, { "epoch": 2.422120750887874, "grad_norm": 0.7180840889505126, "learning_rate": 1.0966307441413598e-06, "loss": 0.0581, "step": 1194 }, { "epoch": 2.426179604261796, "grad_norm": 0.7354347742714406, "learning_rate": 1.0819237072629606e-06, "loss": 0.0597, "step": 1196 }, { "epoch": 2.430238457635718, "grad_norm": 0.664963118331295, "learning_rate": 1.0673039827406373e-06, "loss": 0.0592, "step": 1198 }, { "epoch": 2.4342973110096398, "grad_norm": 0.6596757154643482, "learning_rate": 1.0527718963675871e-06, "loss": 0.0543, "step": 1200 }, { "epoch": 2.4383561643835616, "grad_norm": 0.614862575729698, "learning_rate": 1.0383277719840318e-06, "loss": 0.051, "step": 1202 }, { "epoch": 2.4424150177574835, "grad_norm": 0.6239737484592334, "learning_rate": 1.0239719314700052e-06, "loss": 0.0569, "step": 1204 }, { "epoch": 2.4464738711314054, "grad_norm": 0.7008527413286773, "learning_rate": 1.0097046947381805e-06, "loss": 0.0622, "step": 1206 }, { "epoch": 2.450532724505327, "grad_norm": 0.646447221626618, "learning_rate": 9.955263797267379e-07, "loss": 0.0593, "step": 1208 }, { "epoch": 2.454591577879249, "grad_norm": 0.6736248842428098, "learning_rate": 9.814373023922851e-07, "loss": 0.0573, "step": 1210 }, { "epoch": 2.458650431253171, "grad_norm": 0.7520656749859748, "learning_rate": 9.674377767028142e-07, "loss": 0.0595, "step": 1212 }, { "epoch": 2.462709284627093, "grad_norm": 0.6256286530852058, "learning_rate": 9.53528114630699e-07, "loss": 0.0539, "step": 1214 }, { "epoch": 2.4667681380010147, "grad_norm": 0.7163476466314366, "learning_rate": 9.397086261457511e-07, "loss": 0.0587, "step": 1216 }, { "epoch": 2.4708269913749366, "grad_norm": 0.6810504627251797, "learning_rate": 9.259796192083071e-07, "loss": 0.0576, "step": 1218 }, { "epoch": 2.4748858447488584, "grad_norm": 0.6288859512257164, "learning_rate": 9.123413997623714e-07, "loss": 0.0543, "step": 1220 }, { "epoch": 2.4789446981227803, "grad_norm": 0.6740418425171263, "learning_rate": 8.987942717287923e-07, "loss": 0.0578, "step": 1222 }, { "epoch": 2.483003551496702, "grad_norm": 0.6224042862768536, "learning_rate": 8.853385369984901e-07, "loss": 0.0537, "step": 1224 }, { "epoch": 2.487062404870624, "grad_norm": 0.6485173083194978, "learning_rate": 8.719744954257375e-07, "loss": 0.056, "step": 1226 }, { "epoch": 2.491121258244546, "grad_norm": 0.6582025110825541, "learning_rate": 8.587024448214637e-07, "loss": 0.0541, "step": 1228 }, { "epoch": 2.4951801116184678, "grad_norm": 0.7255418838785723, "learning_rate": 8.455226809466327e-07, "loss": 0.0592, "step": 1230 }, { "epoch": 2.4992389649923896, "grad_norm": 0.6507900300493289, "learning_rate": 8.324354975056403e-07, "loss": 0.0539, "step": 1232 }, { "epoch": 2.5032978183663115, "grad_norm": 0.6681824734246471, "learning_rate": 8.19441186139776e-07, "loss": 0.0591, "step": 1234 }, { "epoch": 2.5073566717402334, "grad_norm": 0.6936354997773724, "learning_rate": 8.065400364207194e-07, "loss": 0.0584, "step": 1236 }, { "epoch": 2.5114155251141552, "grad_norm": 0.6718221656136698, "learning_rate": 7.937323358440935e-07, "loss": 0.0543, "step": 1238 }, { "epoch": 2.515474378488077, "grad_norm": 0.6632718887777156, "learning_rate": 7.810183698230539e-07, "loss": 0.0572, "step": 1240 }, { "epoch": 2.519533231861999, "grad_norm": 0.6239004674471934, "learning_rate": 7.683984216819262e-07, "loss": 0.0545, "step": 1242 }, { "epoch": 2.523592085235921, "grad_norm": 0.5944403471674328, "learning_rate": 7.55872772649896e-07, "loss": 0.0535, "step": 1244 }, { "epoch": 2.5276509386098427, "grad_norm": 0.6165723170085607, "learning_rate": 7.434417018547396e-07, "loss": 0.0514, "step": 1246 }, { "epoch": 2.5317097919837646, "grad_norm": 0.7183547419188132, "learning_rate": 7.311054863166095e-07, "loss": 0.0588, "step": 1248 }, { "epoch": 2.5357686453576864, "grad_norm": 0.7242691023134634, "learning_rate": 7.188644009418517e-07, "loss": 0.0603, "step": 1250 }, { "epoch": 2.5398274987316083, "grad_norm": 0.5791350405500479, "learning_rate": 7.067187185168862e-07, "loss": 0.0531, "step": 1252 }, { "epoch": 2.54388635210553, "grad_norm": 0.6776524885992443, "learning_rate": 6.946687097021249e-07, "loss": 0.0544, "step": 1254 }, { "epoch": 2.547945205479452, "grad_norm": 0.6068439005721586, "learning_rate": 6.827146430259446e-07, "loss": 0.0504, "step": 1256 }, { "epoch": 2.552004058853374, "grad_norm": 0.6517780554148217, "learning_rate": 6.70856784878699e-07, "loss": 0.0576, "step": 1258 }, { "epoch": 2.5560629122272958, "grad_norm": 0.6551482566155284, "learning_rate": 6.590953995067812e-07, "loss": 0.0585, "step": 1260 }, { "epoch": 2.5601217656012176, "grad_norm": 0.7195939705815774, "learning_rate": 6.474307490067383e-07, "loss": 0.0591, "step": 1262 }, { "epoch": 2.5641806189751395, "grad_norm": 0.7410151414847665, "learning_rate": 6.358630933194282e-07, "loss": 0.0618, "step": 1264 }, { "epoch": 2.5682394723490614, "grad_norm": 0.6972961543769066, "learning_rate": 6.24392690224232e-07, "loss": 0.0607, "step": 1266 }, { "epoch": 2.5722983257229832, "grad_norm": 0.6627967555045137, "learning_rate": 6.130197953333017e-07, "loss": 0.0602, "step": 1268 }, { "epoch": 2.576357179096905, "grad_norm": 0.6185680861600283, "learning_rate": 6.017446620858708e-07, "loss": 0.0565, "step": 1270 }, { "epoch": 2.580416032470827, "grad_norm": 0.6250339920749016, "learning_rate": 5.905675417426027e-07, "loss": 0.0572, "step": 1272 }, { "epoch": 2.584474885844749, "grad_norm": 0.6417143051901513, "learning_rate": 5.794886833799923e-07, "loss": 0.0514, "step": 1274 }, { "epoch": 2.5885337392186707, "grad_norm": 0.5742514445618982, "learning_rate": 5.685083338848152e-07, "loss": 0.0509, "step": 1276 }, { "epoch": 2.5925925925925926, "grad_norm": 0.6842845774267343, "learning_rate": 5.576267379486294e-07, "loss": 0.0608, "step": 1278 }, { "epoch": 2.5966514459665144, "grad_norm": 0.7682701087480387, "learning_rate": 5.468441380623169e-07, "loss": 0.0619, "step": 1280 }, { "epoch": 2.6007102993404363, "grad_norm": 0.6379342121635503, "learning_rate": 5.361607745106817e-07, "loss": 0.0534, "step": 1282 }, { "epoch": 2.604769152714358, "grad_norm": 0.6746813956871355, "learning_rate": 5.255768853671011e-07, "loss": 0.0568, "step": 1284 }, { "epoch": 2.60882800608828, "grad_norm": 0.5662682410250746, "learning_rate": 5.150927064882089e-07, "loss": 0.0488, "step": 1286 }, { "epoch": 2.612886859462202, "grad_norm": 0.7495492021339842, "learning_rate": 5.047084715086515e-07, "loss": 0.0627, "step": 1288 }, { "epoch": 2.6169457128361238, "grad_norm": 0.5821244808744749, "learning_rate": 4.944244118358721e-07, "loss": 0.0496, "step": 1290 }, { "epoch": 2.6210045662100456, "grad_norm": 0.6716113774136223, "learning_rate": 4.842407566449591e-07, "loss": 0.0527, "step": 1292 }, { "epoch": 2.6250634195839675, "grad_norm": 0.6675478536309039, "learning_rate": 4.741577328735364e-07, "loss": 0.0562, "step": 1294 }, { "epoch": 2.6291222729578894, "grad_norm": 0.6215223340536244, "learning_rate": 4.641755652167107e-07, "loss": 0.0557, "step": 1296 }, { "epoch": 2.6331811263318112, "grad_norm": 0.6998801247830552, "learning_rate": 4.5429447612205635e-07, "loss": 0.0559, "step": 1298 }, { "epoch": 2.637239979705733, "grad_norm": 0.7251216921883125, "learning_rate": 4.445146857846672e-07, "loss": 0.0505, "step": 1300 }, { "epoch": 2.641298833079655, "grad_norm": 0.674778502737252, "learning_rate": 4.3483641214224325e-07, "loss": 0.0536, "step": 1302 }, { "epoch": 2.645357686453577, "grad_norm": 0.603226222531606, "learning_rate": 4.2525987087023433e-07, "loss": 0.0492, "step": 1304 }, { "epoch": 2.6494165398274987, "grad_norm": 0.7870111250427357, "learning_rate": 4.1578527537703973e-07, "loss": 0.061, "step": 1306 }, { "epoch": 2.6534753932014206, "grad_norm": 0.6451056158523719, "learning_rate": 4.064128367992459e-07, "loss": 0.0556, "step": 1308 }, { "epoch": 2.6575342465753424, "grad_norm": 0.7137232587300326, "learning_rate": 3.971427639969233e-07, "loss": 0.0557, "step": 1310 }, { "epoch": 2.6615930999492643, "grad_norm": 0.6325976622831373, "learning_rate": 3.879752635489736e-07, "loss": 0.0525, "step": 1312 }, { "epoch": 2.665651953323186, "grad_norm": 0.670332593338025, "learning_rate": 3.7891053974852597e-07, "loss": 0.0524, "step": 1314 }, { "epoch": 2.669710806697108, "grad_norm": 0.6627524615579112, "learning_rate": 3.6994879459838375e-07, "loss": 0.0557, "step": 1316 }, { "epoch": 2.67376966007103, "grad_norm": 0.6143115984862325, "learning_rate": 3.6109022780652147e-07, "loss": 0.0569, "step": 1318 }, { "epoch": 2.6778285134449518, "grad_norm": 0.645959952937934, "learning_rate": 3.5233503678163696e-07, "loss": 0.0571, "step": 1320 }, { "epoch": 2.6818873668188736, "grad_norm": 0.6681068621470589, "learning_rate": 3.4368341662875004e-07, "loss": 0.0535, "step": 1322 }, { "epoch": 2.6859462201927955, "grad_norm": 0.6686418775510267, "learning_rate": 3.3513556014485805e-07, "loss": 0.0615, "step": 1324 }, { "epoch": 2.6900050735667174, "grad_norm": 0.7424511108861405, "learning_rate": 3.26691657814634e-07, "loss": 0.0592, "step": 1326 }, { "epoch": 2.6940639269406392, "grad_norm": 0.659620559037728, "learning_rate": 3.183518978061895e-07, "loss": 0.0555, "step": 1328 }, { "epoch": 2.698122780314561, "grad_norm": 0.6744968743539785, "learning_rate": 3.101164659668732e-07, "loss": 0.0557, "step": 1330 }, { "epoch": 2.702181633688483, "grad_norm": 0.6611821221163022, "learning_rate": 3.0198554581913343e-07, "loss": 0.0572, "step": 1332 }, { "epoch": 2.706240487062405, "grad_norm": 0.6422102786010568, "learning_rate": 2.9395931855643043e-07, "loss": 0.0529, "step": 1334 }, { "epoch": 2.7102993404363267, "grad_norm": 0.5990751150942317, "learning_rate": 2.860379630391935e-07, "loss": 0.0522, "step": 1336 }, { "epoch": 2.7143581938102486, "grad_norm": 0.6397947037965095, "learning_rate": 2.7822165579084013e-07, "loss": 0.0516, "step": 1338 }, { "epoch": 2.7184170471841704, "grad_norm": 0.6033731180075299, "learning_rate": 2.705105709938388e-07, "loss": 0.0522, "step": 1340 }, { "epoch": 2.7224759005580923, "grad_norm": 0.5850767244585305, "learning_rate": 2.629048804858275e-07, "loss": 0.0571, "step": 1342 }, { "epoch": 2.726534753932014, "grad_norm": 0.6793421437999266, "learning_rate": 2.5540475375578967e-07, "loss": 0.0579, "step": 1344 }, { "epoch": 2.730593607305936, "grad_norm": 0.6089471972028935, "learning_rate": 2.4801035794026987e-07, "loss": 0.0537, "step": 1346 }, { "epoch": 2.734652460679858, "grad_norm": 0.5978644646078147, "learning_rate": 2.407218578196524e-07, "loss": 0.0521, "step": 1348 }, { "epoch": 2.7387113140537798, "grad_norm": 0.6521169806760104, "learning_rate": 2.3353941581449048e-07, "loss": 0.0584, "step": 1350 }, { "epoch": 2.7427701674277016, "grad_norm": 0.6427655880180726, "learning_rate": 2.2646319198188495e-07, "loss": 0.0531, "step": 1352 }, { "epoch": 2.7468290208016235, "grad_norm": 0.7400104362172217, "learning_rate": 2.1949334401192013e-07, "loss": 0.0597, "step": 1354 }, { "epoch": 2.7508878741755454, "grad_norm": 0.7047967548664266, "learning_rate": 2.1263002722414383e-07, "loss": 0.0593, "step": 1356 }, { "epoch": 2.7549467275494672, "grad_norm": 0.6950676115493591, "learning_rate": 2.0587339456411503e-07, "loss": 0.0558, "step": 1358 }, { "epoch": 2.759005580923389, "grad_norm": 0.6247096449549893, "learning_rate": 1.9922359659998724e-07, "loss": 0.0535, "step": 1360 }, { "epoch": 2.763064434297311, "grad_norm": 0.6324280760328311, "learning_rate": 1.9268078151915724e-07, "loss": 0.0572, "step": 1362 }, { "epoch": 2.767123287671233, "grad_norm": 0.6887667157732265, "learning_rate": 1.8624509512496336e-07, "loss": 0.0567, "step": 1364 }, { "epoch": 2.7711821410451547, "grad_norm": 0.6432677564664775, "learning_rate": 1.799166808334335e-07, "loss": 0.0561, "step": 1366 }, { "epoch": 2.7752409944190766, "grad_norm": 0.6374676188470956, "learning_rate": 1.7369567967009226e-07, "loss": 0.052, "step": 1368 }, { "epoch": 2.7792998477929984, "grad_norm": 0.6411778692609862, "learning_rate": 1.6758223026681507e-07, "loss": 0.056, "step": 1370 }, { "epoch": 2.7833587011669203, "grad_norm": 0.5631134199833117, "learning_rate": 1.615764688587429e-07, "loss": 0.0508, "step": 1372 }, { "epoch": 2.787417554540842, "grad_norm": 0.6819140357771947, "learning_rate": 1.5567852928124237e-07, "loss": 0.0571, "step": 1374 }, { "epoch": 2.791476407914764, "grad_norm": 0.5799311609819854, "learning_rate": 1.4988854296692557e-07, "loss": 0.0503, "step": 1376 }, { "epoch": 2.795535261288686, "grad_norm": 0.6901834332410363, "learning_rate": 1.442066389427199e-07, "loss": 0.0599, "step": 1378 }, { "epoch": 2.7995941146626078, "grad_norm": 0.6242349895835719, "learning_rate": 1.386329438269929e-07, "loss": 0.0563, "step": 1380 }, { "epoch": 2.8036529680365296, "grad_norm": 0.6747442124346058, "learning_rate": 1.3316758182673307e-07, "loss": 0.0559, "step": 1382 }, { "epoch": 2.8077118214104515, "grad_norm": 0.6806732183113077, "learning_rate": 1.2781067473477905e-07, "loss": 0.0553, "step": 1384 }, { "epoch": 2.8117706747843734, "grad_norm": 0.5453489129772455, "learning_rate": 1.225623419271055e-07, "loss": 0.0492, "step": 1386 }, { "epoch": 2.8158295281582952, "grad_norm": 0.6628204710761366, "learning_rate": 1.1742270036016523e-07, "loss": 0.0542, "step": 1388 }, { "epoch": 2.819888381532217, "grad_norm": 0.6122682091369429, "learning_rate": 1.1239186456828033e-07, "loss": 0.0551, "step": 1390 }, { "epoch": 2.823947234906139, "grad_norm": 0.6936962374041317, "learning_rate": 1.0746994666109234e-07, "loss": 0.0573, "step": 1392 }, { "epoch": 2.828006088280061, "grad_norm": 0.5643200467907136, "learning_rate": 1.0265705632106216e-07, "loss": 0.0546, "step": 1394 }, { "epoch": 2.8320649416539827, "grad_norm": 0.6353165412067484, "learning_rate": 9.795330080102527e-08, "loss": 0.0541, "step": 1396 }, { "epoch": 2.8361237950279046, "grad_norm": 0.6131605918166376, "learning_rate": 9.335878492180373e-08, "loss": 0.0519, "step": 1398 }, { "epoch": 2.8401826484018264, "grad_norm": 0.6838700561564602, "learning_rate": 8.887361106986848e-08, "loss": 0.0557, "step": 1400 }, { "epoch": 2.8442415017757483, "grad_norm": 0.6903440686599182, "learning_rate": 8.44978791950607e-08, "loss": 0.0594, "step": 1402 }, { "epoch": 2.84830035514967, "grad_norm": 0.6727563667189779, "learning_rate": 8.023168680835913e-08, "loss": 0.0599, "step": 1404 }, { "epoch": 2.852359208523592, "grad_norm": 0.6714864747461081, "learning_rate": 7.60751289797118e-08, "loss": 0.0596, "step": 1406 }, { "epoch": 2.856418061897514, "grad_norm": 0.6100055956786066, "learning_rate": 7.202829833591496e-08, "loss": 0.056, "step": 1408 }, { "epoch": 2.8604769152714358, "grad_norm": 0.6561132306996892, "learning_rate": 6.809128505855189e-08, "loss": 0.0556, "step": 1410 }, { "epoch": 2.8645357686453576, "grad_norm": 0.6332585194044941, "learning_rate": 6.426417688197961e-08, "loss": 0.0532, "step": 1412 }, { "epoch": 2.8685946220192795, "grad_norm": 0.6170375673674818, "learning_rate": 6.054705909137426e-08, "loss": 0.0496, "step": 1414 }, { "epoch": 2.8726534753932014, "grad_norm": 0.60220768047444, "learning_rate": 5.6940014520834865e-08, "loss": 0.0539, "step": 1416 }, { "epoch": 2.8767123287671232, "grad_norm": 0.5816521708453392, "learning_rate": 5.344312355153036e-08, "loss": 0.0523, "step": 1418 }, { "epoch": 2.880771182141045, "grad_norm": 0.6233216693560418, "learning_rate": 5.005646410991549e-08, "loss": 0.0547, "step": 1420 }, { "epoch": 2.884830035514967, "grad_norm": 0.6419014793947228, "learning_rate": 4.678011166598884e-08, "loss": 0.0619, "step": 1422 }, { "epoch": 2.888888888888889, "grad_norm": 0.6187502232959146, "learning_rate": 4.3614139231614725e-08, "loss": 0.0506, "step": 1424 }, { "epoch": 2.8929477422628107, "grad_norm": 0.6573679267165627, "learning_rate": 4.0558617358892326e-08, "loss": 0.054, "step": 1426 }, { "epoch": 2.8970065956367326, "grad_norm": 0.7332565562788992, "learning_rate": 3.7613614138587995e-08, "loss": 0.0587, "step": 1428 }, { "epoch": 2.9010654490106544, "grad_norm": 0.6415121216139319, "learning_rate": 3.477919519861428e-08, "loss": 0.0537, "step": 1430 }, { "epoch": 2.9051243023845763, "grad_norm": 0.7104972559533141, "learning_rate": 3.205542370256997e-08, "loss": 0.0549, "step": 1432 }, { "epoch": 2.909183155758498, "grad_norm": 0.6659234362952994, "learning_rate": 2.944236034832959e-08, "loss": 0.059, "step": 1434 }, { "epoch": 2.91324200913242, "grad_norm": 0.6504771749865272, "learning_rate": 2.6940063366693303e-08, "loss": 0.0545, "step": 1436 }, { "epoch": 2.917300862506342, "grad_norm": 0.6035759484889454, "learning_rate": 2.4548588520089123e-08, "loss": 0.0544, "step": 1438 }, { "epoch": 2.9213597158802638, "grad_norm": 0.6114721671996689, "learning_rate": 2.2267989101328878e-08, "loss": 0.0531, "step": 1440 }, { "epoch": 2.9254185692541856, "grad_norm": 0.6776141271918054, "learning_rate": 2.0098315932421952e-08, "loss": 0.0548, "step": 1442 }, { "epoch": 2.9294774226281075, "grad_norm": 0.5785157075548263, "learning_rate": 1.803961736344062e-08, "loss": 0.0489, "step": 1444 }, { "epoch": 2.9335362760020294, "grad_norm": 0.6257664598177896, "learning_rate": 1.6091939271446478e-08, "loss": 0.0541, "step": 1446 }, { "epoch": 2.9375951293759512, "grad_norm": 0.6581827205937377, "learning_rate": 1.4255325059463477e-08, "loss": 0.057, "step": 1448 }, { "epoch": 2.941653982749873, "grad_norm": 0.5842892930844981, "learning_rate": 1.252981565551481e-08, "loss": 0.0524, "step": 1450 }, { "epoch": 2.945712836123795, "grad_norm": 0.660064548851826, "learning_rate": 1.0915449511708088e-08, "loss": 0.0546, "step": 1452 }, { "epoch": 2.949771689497717, "grad_norm": 0.6419857691002046, "learning_rate": 9.412262603378797e-09, "loss": 0.0544, "step": 1454 }, { "epoch": 2.9538305428716387, "grad_norm": 0.5794278961094809, "learning_rate": 8.020288428289836e-09, "loss": 0.0532, "step": 1456 }, { "epoch": 2.9578893962455606, "grad_norm": 0.7121133884438979, "learning_rate": 6.739558005884883e-09, "loss": 0.0577, "step": 1458 }, { "epoch": 2.9619482496194824, "grad_norm": 0.6610137998761876, "learning_rate": 5.570099876595625e-09, "loss": 0.0582, "step": 1460 }, { "epoch": 2.9660071029934043, "grad_norm": 0.5782156107576852, "learning_rate": 4.511940101207812e-09, "loss": 0.0517, "step": 1462 }, { "epoch": 2.970065956367326, "grad_norm": 0.6497750292771783, "learning_rate": 3.565102260278397e-09, "loss": 0.0566, "step": 1464 }, { "epoch": 2.974124809741248, "grad_norm": 0.6075980392613102, "learning_rate": 2.72960745361206e-09, "loss": 0.0521, "step": 1466 }, { "epoch": 2.97818366311517, "grad_norm": 0.6593396919810925, "learning_rate": 2.0054742997893674e-09, "loss": 0.0561, "step": 1468 }, { "epoch": 2.9822425164890918, "grad_norm": 0.6349479858496213, "learning_rate": 1.392718935752102e-09, "loss": 0.0527, "step": 1470 }, { "epoch": 2.9863013698630136, "grad_norm": 0.6624916331368633, "learning_rate": 8.913550164463269e-10, "loss": 0.053, "step": 1472 }, { "epoch": 2.9903602232369355, "grad_norm": 0.6290549360832741, "learning_rate": 5.013937145131875e-10, "loss": 0.0563, "step": 1474 }, { "epoch": 2.9944190766108574, "grad_norm": 0.625615525565498, "learning_rate": 2.2284372004410804e-10, "loss": 0.0562, "step": 1476 }, { "epoch": 2.9984779299847792, "grad_norm": 0.5809460613562152, "learning_rate": 5.5711240385392106e-11, "loss": 0.0501, "step": 1478 }, { "epoch": 3.0, "step": 1479, "total_flos": 4127658346151936.0, "train_loss": 0.21015015432889433, "train_runtime": 144374.0796, "train_samples_per_second": 1.311, "train_steps_per_second": 0.01 } ], "logging_steps": 2, "max_steps": 1479, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4127658346151936.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }